Adjust the logic for the crawl job extractor to set a relatively low visit limit for websites that are new in the index or have not yielded many good documents in previous crawls.

This commit is contained in:
Viktor Lofgren 2023-06-07 22:01:35 +02:00
parent d82a858491
commit 2afbdc2269

View File

@ -16,7 +16,7 @@ import java.util.Set;
import java.util.stream.Stream;
public class CrawlJobDomainExtractor {
private static final int MIN_VISIT_COUNT = 1000;
private static final int MIN_VISIT_COUNT = 200;
private static final int MAX_VISIT_COUNT = 100000;
private static final String specificDomainSql =
@ -198,11 +198,20 @@ public class CrawlJobDomainExtractor {
}
private int calculateCrawlDepthFromVisitedCount(int count) {
count = count + 1000 + count / 4;
if (count < MIN_VISIT_COUNT) {
if (count < MIN_VISIT_COUNT / 2) {
/* If we aren't getting very many good documents
out of this webpage on previous attempts, we
won't dig very deeply this time. This saves time
and resources for both the crawler and the server,
and also prevents deep crawls on foreign websites we aren't
interested in crawling at this point. */
count = MIN_VISIT_COUNT;
}
else {
/* If we got many results previously, we'll
dig deeper with each successive crawl. */
count = count + 1000 + count / 4;
}
if (count > MAX_VISIT_COUNT) {
count = MAX_VISIT_COUNT;