(index) Clean up result domain deduplicator

This commit is contained in:
Viktor Lofgren 2023-08-24 18:24:55 +02:00
parent 1e6800565a
commit 56eb83319d
3 changed files with 51 additions and 22 deletions

View File

@ -11,7 +11,8 @@ import java.util.List;
/** Represents a document matching a search query */
@AllArgsConstructor @Getter
public class SearchResultItem implements Comparable<SearchResultItem> {
/** Encoded ID that contains both the URL id and its ranking */
/** Encoded ID that contains both the URL id and its ranking. This is
* probably not what you want, use getDocumentId() instead */
public final long combinedId;
/** How did the subqueries match against the document ? */
@ -20,8 +21,8 @@ public class SearchResultItem implements Comparable<SearchResultItem> {
/** How many other potential results existed in the same domain */
public int resultsFromDomain;
public SearchResultItem(long val) {
this.combinedId = val;
public SearchResultItem(long combinedId) {
this.combinedId = combinedId;
this.keywordScores = new ArrayList<>(16);
}
@ -66,16 +67,6 @@ public class SearchResultItem implements Comparable<SearchResultItem> {
return false;
}
/**
 * Computes the key used to group results for deduplication.
 *
 * @return the value of {@code getDomainId()}, or {@code 0} when that value is
 *         {@code Integer.MAX_VALUE} or {@code Integer.MIN_VALUE}
 */
public long deduplicationKey() {
    final int id = getDomainId();
    // Both int extremes collapse to the key 0.
    final boolean atIntBound = (id == Integer.MAX_VALUE) || (id == Integer.MIN_VALUE);
    return atIntBound ? 0L : id;
}
@Override
public int compareTo(@NotNull SearchResultItem o) {
// this looks like a bug, but we actually want this in a reversed order

View File

@ -5,7 +5,7 @@ import gnu.trove.map.hash.TLongIntHashMap;
import nu.marginalia.index.client.model.results.SearchResultItem;
public class IndexResultDomainDeduplicator {
final TLongIntMap resultsByRankingId = CachedObjects.getMap();
final TLongIntMap resultsByDomainId = CachedObjects.getMap();
final int limitByDomain;
public IndexResultDomainDeduplicator(int limitByDomain) {
@ -13,19 +13,15 @@ public class IndexResultDomainDeduplicator {
}
public boolean test(SearchResultItem item) {
final long key = item.deduplicationKey();
if (key == 0)
return true;
final long key = item.getDomainId();
return resultsByRankingId.adjustOrPutValue(key, 1, 1) <= limitByDomain;
return resultsByDomainId.adjustOrPutValue(key, 1, 1) <= limitByDomain;
}
public int getCount(SearchResultItem item) {
final long key = item.deduplicationKey();
if (key == 0)
return 1;
final long key = item.getDomainId();
return resultsByRankingId.get(key);
return resultsByDomainId.get(key);
}
private static class CachedObjects {
@ -38,7 +34,14 @@ public class IndexResultDomainDeduplicator {
ret.clear();
return ret;
}
/**
 * Drops the cached map by calling {@code remove()} on {@code mapCache}.
 * NOTE(review): mapCache is declared outside this hunk; presumably a
 * ThreadLocal, in which case only the calling thread's entry is removed — confirm.
 */
public static void clear() {
mapCache.remove();
}
}
/**
 * Package-private hook that delegates to {@code CachedObjects.clear()} to
 * discard the cached map; the unit test calls it after each test case.
 */
static void clearCachedObjects() {
CachedObjects.clear();
}
}

View File

@ -0,0 +1,35 @@
package nu.marginalia.index.results;
import nu.marginalia.index.client.model.results.SearchResultItem;
import nu.marginalia.model.id.UrlIdCodec;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.Test;
import static org.junit.jupiter.api.Assertions.*;
/** Unit test for {@link IndexResultDomainDeduplicator}. */
class IndexResultDomainDeduplicatorTest {

    /** Builds a result item whose combined id encodes the given domain and ordinal. */
    SearchResultItem forId(int domain, int ordinal) {
        final long combinedId = UrlIdCodec.encodeId(domain, ordinal);
        return new SearchResultItem(combinedId);
    }

    /** Discards the deduplicator's cached objects after every test case. */
    @AfterEach
    public void clear() {
        IndexResultDomainDeduplicator.clearCachedObjects();
    }

    /**
     * With a per-domain limit of 3, the first three results from one domain pass,
     * later ones are rejected, and getCount reports every result that was tested.
     */
    @Test
    public void testDeduplicator() {
        final IndexResultDomainDeduplicator dedup = new IndexResultDomainDeduplicator(3);

        // Ordinals 0..2 are within the limit.
        for (int ordinal = 0; ordinal < 3; ordinal++) {
            assertTrue(dedup.test(forId(3, ordinal)));
        }

        // Ordinals 3..4 exceed the limit.
        for (int ordinal = 3; ordinal < 5; ordinal++) {
            assertFalse(dedup.test(forId(3, ordinal)));
        }

        // All five attempts were counted, including the rejected ones.
        assertEquals(5, dedup.getCount(forId(3, 3)));
    }
}