(crawler) Switch hash function in crawler

Guava's hashers are somewhat allocation-hungry and a major driver of GC churn in the crawler. This commit switches the crawler to the modified Murmur hash function used throughout Marginalia.
This commit is contained in:
Viktor Lofgren 2023-12-27 13:29:00 +01:00
parent 3ea1ddae22
commit 9e5fe71f5b
3 changed files with 7 additions and 9 deletions

View File

@ -42,6 +42,7 @@ dependencies {
implementation project(':code:features-crawl:crawl-blocklist')
implementation project(':code:features-crawl:link-parser')
implementation project(':code:features-crawl:content-type')
implementation project(':third-party:commons-codec')
implementation libs.bundles.slf4j

View File

@ -1,7 +1,5 @@
package nu.marginalia.crawl.retreival;
import com.google.common.hash.HashFunction;
import com.google.common.hash.Hashing;
import crawlercommons.robots.SimpleRobotRules;
import nu.marginalia.atags.model.DomainLinks;
import nu.marginalia.contenttype.ContentType;
@ -19,7 +17,6 @@ import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawlspec.CrawlSpecRecord;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

View File

@ -1,7 +1,7 @@
package nu.marginalia.crawl.retreival;
import com.google.common.hash.HashFunction;
import it.unimi.dsi.fastutil.longs.LongOpenHashSet;
import nu.marginalia.hash.MurmurHash3_128;
import nu.marginalia.ip_blocklist.UrlBlocklist;
import nu.marginalia.link_parser.LinkParser;
import nu.marginalia.model.EdgeDomain;
@ -16,6 +16,7 @@ public class DomainCrawlFrontier {
private static final LinkParser linkParser = new LinkParser();
private static final MurmurHash3_128 hasher = new MurmurHash3_128();
private final ArrayDeque<String> queue;
// To save the number of strings kept in memory,
@ -27,7 +28,6 @@ public class DomainCrawlFrontier {
// territory
private final LongOpenHashSet visited;
private final LongOpenHashSet known;
private final HashFunction hasher = com.google.common.hash.Hashing.murmur3_128();
private final EdgeDomain thisDomain;
private final UrlBlocklist urlBlocklist;
@ -98,17 +98,17 @@ public class DomainCrawlFrontier {
}
public boolean addVisited(EdgeUrl url) {
long hashCode = hasher.hashUnencodedChars(url.toString()).padToLong();
long hashCode = hasher.hashNearlyASCII(url.toString());
return visited.add(hashCode);
}
public boolean addKnown(EdgeUrl url) {
long hashCode = hasher.hashUnencodedChars(url.toString()).padToLong();
long hashCode = hasher.hashNearlyASCII(url.toString());
return known.add(hashCode);
}
boolean isVisited(EdgeUrl url) {
long hashCode = hasher.hashUnencodedChars(url.toString()).padToLong();
public boolean isVisited(EdgeUrl url) {
long hashCode = hasher.hashNearlyASCII(url.toString());
return visited.contains(hashCode);
}