(search) Don't run LSH deduplication on details with zero lsh to support not calculating this hash.

This commit is contained in:
Viktor Lofgren 2023-09-21 12:47:02 +02:00
parent 5b0a6d7ec1
commit f8050816ac

View File

@ -22,17 +22,17 @@ public class UrlDeduplicator {
this.resultsPerKey = resultsPerKey;
}
/**
 * Reports whether the given entry should be dropped.
 *
 * @param details the entry to evaluate
 * @return the logical negation of {@code filter(details)}
 */
public boolean shouldRemove(UrlDetails details) {
    final boolean keep = filter(details);
    return !keep;
}
// NOTE(review): this span looks like a rendered diff hunk whose +/- markers were
// stripped: an if-chain body under a second shouldRemove header, a filter(...)
// header opening inside it, and a trailing "return false;" after filter's own
// return. It does not compile as written; resolve it against the real file
// before changing any code here.
public synchronized boolean shouldRemove(UrlDetails details) {
// If-chain form: a false result from any check means "remove this entry".
if (!deduplicateOnSuperficialHash(details))
return true;
if (!deduplicateOnLSH(details))
return true;
if (!limitResultsPerDomain(details))
return true;
// Conjunction form: true only when all three checks return true. The &&
// operator short-circuits, so later checks are not invoked once one fails.
public synchronized boolean filter(UrlDetails details) {
return deduplicateOnSuperficialHash(details)
&& deduplicateOnLSH(details)
&& limitResultsPerDomain(details);
// Tail of the if-chain form: no check rejected the entry, so do not remove it.
return false;
}
/**
 * Adds the superficial hash of {@code details} to {@code seenSuperficialhashes}.
 *
 * @param details the entry whose superficial hash is recorded
 * @return the result of the {@code add} call; presumably {@code true} only when
 *         the hash had not been seen before (the collection type is not visible
 *         here, confirm against its declaration)
 */
private boolean deduplicateOnSuperficialHash(UrlDetails details) {
    final boolean added = seenSuperficialhashes.add(details.getSuperficialHash());
    return added;
}
@ -40,11 +40,15 @@ public class UrlDeduplicator {
/**
 * Checks {@code details.dataHash} against the hashes collected in {@code seehLSHList}.
 *
 * <p>A hash of zero is accepted immediately and is never added to the list.
 * Any other hash is added and accepted only when the {@code forEach} check
 * below succeeds; otherwise it is rejected and the list is left unchanged.
 *
 * @param details the entry whose {@code dataHash} is examined
 * @return {@code true} if the entry passes this check, {@code false} otherwise
 */
private boolean deduplicateOnLSH(UrlDetails details) {
    final long candidate = details.dataHash;

    // Zero hashes bypass the comparison entirely.
    if (candidate == 0) {
        return true;
    }

    // NOTE(review): presumably forEach returns true only if the predicate held
    // for every stored hash, i.e. the candidate is at least the threshold
    // distance from all of them; confirm against the list type's contract.
    final boolean farFromAllSeen = seehLSHList.forEach(
            seen -> EasyLSH.hammingDistance(candidate, seen) >= LSH_SIMILARITY_THRESHOLD);
    if (!farFromAllSeen) {
        return false;
    }

    seehLSHList.add(candidate);
    return true;
}