From 2afbdc2269de0fc360b5ac3dcbdac0b2cca667b6 Mon Sep 17 00:00:00 2001
From: Viktor Lofgren
Date: Wed, 7 Jun 2023 22:01:35 +0200
Subject: [PATCH] Adjust the logic for the crawl job extractor to set a
 relatively low visit limit for websites that are new in the index or have
 not yielded many good documents previously.

---
 .../crawl/CrawlJobDomainExtractor.java | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/code/tools/crawl-job-extractor/src/main/java/nu/marginalia/crawl/CrawlJobDomainExtractor.java b/code/tools/crawl-job-extractor/src/main/java/nu/marginalia/crawl/CrawlJobDomainExtractor.java
index 9e1ad2de..c7909ac7 100644
--- a/code/tools/crawl-job-extractor/src/main/java/nu/marginalia/crawl/CrawlJobDomainExtractor.java
+++ b/code/tools/crawl-job-extractor/src/main/java/nu/marginalia/crawl/CrawlJobDomainExtractor.java
@@ -16,7 +16,7 @@ import java.util.Set;
 import java.util.stream.Stream;
 
 public class CrawlJobDomainExtractor {
-    private static final int MIN_VISIT_COUNT = 1000;
+    private static final int MIN_VISIT_COUNT = 200;
     private static final int MAX_VISIT_COUNT = 100000;
 
     private static final String specificDomainSql =
@@ -198,11 +198,20 @@ public class CrawlJobDomainExtractor {
     }
 
     private int calculateCrawlDepthFromVisitedCount(int count) {
-        count = count + 1000 + count / 4;
-
-        if (count < MIN_VISIT_COUNT) {
+        if (count < MIN_VISIT_COUNT / 2) {
+            /* If we aren't getting very many good documents
+               out of this webpage on previous attempts, we
+               won't dig very deeply this time. This saves time
+               and resources for both the crawler and the server,
+               and also prevents deep crawls on foreign websites we aren't
+               interested in crawling at this point. */
             count = MIN_VISIT_COUNT;
         }
+        else {
+            /* If we got many results previously, we'll
+               dig deeper with each successive crawl. */
+            count = count + 1000 + count / 4;
+        }
 
         if (count > MAX_VISIT_COUNT) {
             count = MAX_VISIT_COUNT;
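
For reference, a minimal standalone sketch of calculateCrawlDepthFromVisitedCount()
as it reads after this patch follows. The constants, method name, and branch logic
are taken directly from the hunks above; the class wrapper, the trailing return
(implied by the int return type, since the hunk ends before it), and the main()
harness are illustrative additions, not part of the commit.

public class CrawlDepthSketch {
    // Constants as set by this patch.
    private static final int MIN_VISIT_COUNT = 200;
    private static final int MAX_VISIT_COUNT = 100000;

    /* Reconstruction of the patched method; behavior should match
       the + side of the second hunk above. */
    static int calculateCrawlDepthFromVisitedCount(int count) {
        if (count < MIN_VISIT_COUNT / 2) {
            // Fewer than 100 good documents on previous attempts:
            // pin the visit budget to the (now lower) minimum.
            count = MIN_VISIT_COUNT;
        }
        else {
            // Productive domains get a budget that grows by roughly
            // 25% plus a flat 1000 on each successive crawl.
            count = count + 1000 + count / 4;
        }

        if (count > MAX_VISIT_COUNT) {
            count = MAX_VISIT_COUNT;
        }

        return count;
    }

    public static void main(String[] args) {
        // Illustrative inputs spanning both branches and the upper cap.
        for (int prev : new int[] { 0, 50, 99, 100, 1000, 10000, 90000 }) {
            System.out.printf("%6d previously visited -> depth %6d%n",
                    prev, calculateCrawlDepthFromVisitedCount(prev));
        }
    }
}

Net effect of the two changes taken together: domains that previously yielded
fewer than 100 good documents are now capped at 200 visits (down from the old
effective floor of 1000), while productive domains keep the old growth formula,
bounded above by MAX_VISIT_COUNT.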