diff --git a/code/processes/crawling-process/build.gradle b/code/processes/crawling-process/build.gradle
index baa02906..2e99b8a7 100644
--- a/code/processes/crawling-process/build.gradle
+++ b/code/processes/crawling-process/build.gradle
@@ -42,6 +42,7 @@ dependencies {
     implementation project(':code:features-crawl:crawl-blocklist')
     implementation project(':code:features-crawl:link-parser')
     implementation project(':code:features-crawl:content-type')
+    implementation project(':third-party:commons-codec')
 
     implementation libs.bundles.slf4j
 
diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java
index ea1ccfe4..49760046 100644
--- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java
+++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java
@@ -1,7 +1,5 @@
 package nu.marginalia.crawl.retreival;
 
-import com.google.common.hash.HashFunction;
-import com.google.common.hash.Hashing;
 import crawlercommons.robots.SimpleRobotRules;
 import nu.marginalia.atags.model.DomainLinks;
 import nu.marginalia.contenttype.ContentType;
@@ -19,7 +17,6 @@ import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.crawlspec.CrawlSpecRecord;
 import org.jsoup.Jsoup;
-import org.jsoup.nodes.Document;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/DomainCrawlFrontier.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/DomainCrawlFrontier.java
index 46446fee..4b501826 100644
--- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/DomainCrawlFrontier.java
+++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/DomainCrawlFrontier.java
@@ -1,7 +1,7 @@
 package nu.marginalia.crawl.retreival;
 
-import com.google.common.hash.HashFunction;
 import it.unimi.dsi.fastutil.longs.LongOpenHashSet;
+import nu.marginalia.hash.MurmurHash3_128;
 import nu.marginalia.ip_blocklist.UrlBlocklist;
 import nu.marginalia.link_parser.LinkParser;
 import nu.marginalia.model.EdgeDomain;
@@ -16,6 +16,7 @@
 public class DomainCrawlFrontier {
 
     private static final LinkParser linkParser = new LinkParser();
 
+    private static final MurmurHash3_128 hasher = new MurmurHash3_128();
     private final ArrayDeque<String> queue;
     // To save the number of strings kept in memory,
@@ -27,7 +28,6 @@
     // territory
     private final LongOpenHashSet visited;
     private final LongOpenHashSet known;
-    private final HashFunction hasher = com.google.common.hash.Hashing.murmur3_128();
 
     private final EdgeDomain thisDomain;
     private final UrlBlocklist urlBlocklist;
@@ -98,17 +98,17 @@
     }
 
     public boolean addVisited(EdgeUrl url) {
-        long hashCode = hasher.hashUnencodedChars(url.toString()).padToLong();
+        long hashCode = hasher.hashNearlyASCII(url.toString());
         return visited.add(hashCode);
     }
 
     public boolean addKnown(EdgeUrl url) {
-        long hashCode = hasher.hashUnencodedChars(url.toString()).padToLong();
+        long hashCode = hasher.hashNearlyASCII(url.toString());
         return known.add(hashCode);
     }
 
-    boolean isVisited(EdgeUrl url) {
-        long hashCode = hasher.hashUnencodedChars(url.toString()).padToLong();
+    public boolean isVisited(EdgeUrl url) {
+        long hashCode = hasher.hashNearlyASCII(url.toString());
         return visited.contains(hashCode);
     }
 
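
For context, a minimal sketch of the API swap this patch performs: Guava's Hashing.murmur3_128().hashUnencodedChars(s).padToLong() is replaced by the in-repo MurmurHash3_128.hashNearlyASCII(s), which likewise folds a URL string down to a 64-bit key for the visited/known LongOpenHashSets. The class and method names are taken from the diff itself; the demo harness around them is hypothetical.

import nu.marginalia.hash.MurmurHash3_128;

// Illustrative only: shows the call pattern the patch introduces.
class FrontierHashDemo {
    // Same construction as in DomainCrawlFrontier after the patch.
    private static final MurmurHash3_128 hasher = new MurmurHash3_128();

    public static void main(String[] args) {
        String url = "https://www.example.com/some/path";

        // Maps the URL to a 64-bit hash, as done in addVisited/addKnown/isVisited.
        long hashCode = hasher.hashNearlyASCII(url);

        System.out.println(url + " -> " + hashCode);
    }
}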