(index) Clean up result domain deduplicator
This commit is contained in:
parent
1e6800565a
commit
56eb83319d
@ -11,7 +11,8 @@ import java.util.List;
|
|||||||
/** Represents a document matching a search query */
|
/** Represents a document matching a search query */
|
||||||
@AllArgsConstructor @Getter
|
@AllArgsConstructor @Getter
|
||||||
public class SearchResultItem implements Comparable<SearchResultItem> {
|
public class SearchResultItem implements Comparable<SearchResultItem> {
|
||||||
/** Encoded ID that contains both the URL id and its ranking */
|
/** Encoded ID that contains both the URL id and its ranking. This is
|
||||||
|
* probably not what you want, use getDocumentId() instead */
|
||||||
public final long combinedId;
|
public final long combinedId;
|
||||||
|
|
||||||
/** How did the subqueries match against the document ? */
|
/** How did the subqueries match against the document ? */
|
||||||
@ -20,8 +21,8 @@ public class SearchResultItem implements Comparable<SearchResultItem> {
|
|||||||
/** How many other potential results existed in the same domain */
|
/** How many other potential results existed in the same domain */
|
||||||
public int resultsFromDomain;
|
public int resultsFromDomain;
|
||||||
|
|
||||||
public SearchResultItem(long val) {
|
public SearchResultItem(long combinedId) {
|
||||||
this.combinedId = val;
|
this.combinedId = combinedId;
|
||||||
this.keywordScores = new ArrayList<>(16);
|
this.keywordScores = new ArrayList<>(16);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -66,16 +67,6 @@ public class SearchResultItem implements Comparable<SearchResultItem> {
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
public long deduplicationKey() {
|
|
||||||
final int domainId = getDomainId();
|
|
||||||
|
|
||||||
if (domainId == Integer.MAX_VALUE || domainId == Integer.MIN_VALUE) {
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
return domainId;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public int compareTo(@NotNull SearchResultItem o) {
|
public int compareTo(@NotNull SearchResultItem o) {
|
||||||
// this looks like a bug, but we actually want this in a reversed order
|
// this looks like a bug, but we actually want this in a reversed order
|
||||||
|
@ -5,7 +5,7 @@ import gnu.trove.map.hash.TLongIntHashMap;
|
|||||||
import nu.marginalia.index.client.model.results.SearchResultItem;
|
import nu.marginalia.index.client.model.results.SearchResultItem;
|
||||||
|
|
||||||
public class IndexResultDomainDeduplicator {
|
public class IndexResultDomainDeduplicator {
|
||||||
final TLongIntMap resultsByRankingId = CachedObjects.getMap();
|
final TLongIntMap resultsByDomainId = CachedObjects.getMap();
|
||||||
final int limitByDomain;
|
final int limitByDomain;
|
||||||
|
|
||||||
public IndexResultDomainDeduplicator(int limitByDomain) {
|
public IndexResultDomainDeduplicator(int limitByDomain) {
|
||||||
@ -13,19 +13,15 @@ public class IndexResultDomainDeduplicator {
|
|||||||
}
|
}
|
||||||
|
|
||||||
public boolean test(SearchResultItem item) {
|
public boolean test(SearchResultItem item) {
|
||||||
final long key = item.deduplicationKey();
|
final long key = item.getDomainId();
|
||||||
if (key == 0)
|
|
||||||
return true;
|
|
||||||
|
|
||||||
return resultsByRankingId.adjustOrPutValue(key, 1, 1) <= limitByDomain;
|
return resultsByDomainId.adjustOrPutValue(key, 1, 1) <= limitByDomain;
|
||||||
}
|
}
|
||||||
|
|
||||||
public int getCount(SearchResultItem item) {
|
public int getCount(SearchResultItem item) {
|
||||||
final long key = item.deduplicationKey();
|
final long key = item.getDomainId();
|
||||||
if (key == 0)
|
|
||||||
return 1;
|
|
||||||
|
|
||||||
return resultsByRankingId.get(key);
|
return resultsByDomainId.get(key);
|
||||||
}
|
}
|
||||||
|
|
||||||
private static class CachedObjects {
|
private static class CachedObjects {
|
||||||
@ -38,7 +34,14 @@ public class IndexResultDomainDeduplicator {
|
|||||||
ret.clear();
|
ret.clear();
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static void clear() {
|
||||||
|
mapCache.remove();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void clearCachedObjects() {
|
||||||
|
CachedObjects.clear();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -0,0 +1,35 @@
|
|||||||
|
package nu.marginalia.index.results;
|
||||||
|
|
||||||
|
import nu.marginalia.index.client.model.results.SearchResultItem;
|
||||||
|
import nu.marginalia.model.id.UrlIdCodec;
|
||||||
|
import org.junit.jupiter.api.AfterEach;
|
||||||
|
import org.junit.jupiter.api.Test;
|
||||||
|
|
||||||
|
import static org.junit.jupiter.api.Assertions.*;
|
||||||
|
|
||||||
|
class IndexResultDomainDeduplicatorTest {
|
||||||
|
|
||||||
|
@AfterEach
|
||||||
|
public void clear() {
|
||||||
|
IndexResultDomainDeduplicator.clearCachedObjects();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testDeduplicator() {
|
||||||
|
|
||||||
|
IndexResultDomainDeduplicator deduplicator = new IndexResultDomainDeduplicator(3);
|
||||||
|
|
||||||
|
assertTrue(deduplicator.test(forId(3, 0)));
|
||||||
|
assertTrue(deduplicator.test(forId(3, 1)));
|
||||||
|
assertTrue(deduplicator.test(forId(3, 2)));
|
||||||
|
assertFalse(deduplicator.test(forId(3, 3)));
|
||||||
|
assertFalse(deduplicator.test(forId(3, 4)));
|
||||||
|
|
||||||
|
assertEquals(5, deduplicator.getCount(forId(3, 3)));
|
||||||
|
}
|
||||||
|
|
||||||
|
SearchResultItem forId(int domain, int ordinal) {
|
||||||
|
return new SearchResultItem(UrlIdCodec.encodeId(domain, ordinal));
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
Loading…
Reference in New Issue
Block a user