(search) Skip LSH deduplication for details whose LSH is zero, so that calculating this hash can be left out.
This commit is contained in:
parent
5b0a6d7ec1
commit
f8050816ac
@ -22,17 +22,17 @@ public class UrlDeduplicator {
|
||||
this.resultsPerKey = resultsPerKey;
|
||||
}
|
||||
|
||||
/**
 * Reports whether {@code details} should be dropped from the results.
 * This is the logical negation of {@code filter(details)}.
 *
 * @param details the URL details to check
 * @return {@code true} when {@code filter(details)} returns {@code false}
 */
public boolean shouldRemove(UrlDetails details) {
|
||||
return !filter(details);
|
||||
}
|
||||
public synchronized boolean shouldRemove(UrlDetails details) {
|
||||
if (!deduplicateOnSuperficialHash(details))
|
||||
return true;
|
||||
if (!deduplicateOnLSH(details))
|
||||
return true;
|
||||
if (!limitResultsPerDomain(details))
|
||||
return true;
|
||||
|
||||
public synchronized boolean filter(UrlDetails details) {
|
||||
return deduplicateOnSuperficialHash(details)
|
||||
&& deduplicateOnLSH(details)
|
||||
&& limitResultsPerDomain(details);
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
/**
 * Adds the superficial hash of {@code details} to {@code seenSuperficialhashes}
 * and returns the result of that {@code add} call.
 *
 * NOTE(review): presumably {@code add} returns {@code true} only the first time a
 * given hash is inserted (set-like semantics), which would make this return
 * {@code false} for repeats; the collection's type is not visible here, so confirm.
 *
 * @param details the URL details whose superficial hash is recorded
 * @return the value returned by {@code seenSuperficialhashes.add(...)}
 */
private boolean deduplicateOnSuperficialHash(UrlDetails details) {
|
||||
return seenSuperficialhashes.add(details.getSuperficialHash());
|
||||
}
|
||||
@ -40,11 +40,15 @@ public class UrlDeduplicator {
|
||||
/**
 * Checks the data hash of {@code details} against the hashes already stored in
 * {@code seehLSHList}, using {@code EasyLSH.hammingDistance} and
 * {@code LSH_SIMILARITY_THRESHOLD}.
 *
 * @param details the URL details whose {@code dataHash} is examined
 * @return {@code true} if the hash is zero or if the {@code forEach} check passes
 *         (in which case the hash is also stored); {@code false} otherwise
 */
private boolean deduplicateOnLSH(UrlDetails details) {
|
||||
long thisHash = details.dataHash;
|
||||
|
||||
// A zero hash skips the LSH check entirely: the entry is accepted and
// nothing is added to seehLSHList.
if (0 == thisHash)
|
||||
return true;
|
||||
|
||||
// NOTE(review): forEach returns a boolean here — presumably true only when the
// predicate (distance >= threshold) held for every stored hash, as with a
// Trove-style primitive list; confirm against the declared type of seehLSHList.
if (seehLSHList.forEach(otherHash -> EasyLSH.hammingDistance(thisHash, otherHash) >= LSH_SIMILARITY_THRESHOLD))
|
||||
{
|
||||
// Remember this hash so that later calls compare against it as well.
seehLSHList.add(thisHash);
|
||||
return true;
|
||||
}
|
||||
|
||||
// The forEach check did not pass: the hash is not stored and false is returned.
return false;
|
||||
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user