From 2afbdc2269de0fc360b5ac3dcbdac0b2cca667b6 Mon Sep 17 00:00:00 2001
From: Viktor Lofgren
Date: Wed, 7 Jun 2023 22:01:35 +0200
Subject: [PATCH] Adjust the logic for the crawl job extractor to set a
 relatively low visit limit for websites that are new in the index or have
 not yielded many good documents previously.

---
 .../crawl/CrawlJobDomainExtractor.java | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/code/tools/crawl-job-extractor/src/main/java/nu/marginalia/crawl/CrawlJobDomainExtractor.java b/code/tools/crawl-job-extractor/src/main/java/nu/marginalia/crawl/CrawlJobDomainExtractor.java
index 9e1ad2de..c7909ac7 100644
--- a/code/tools/crawl-job-extractor/src/main/java/nu/marginalia/crawl/CrawlJobDomainExtractor.java
+++ b/code/tools/crawl-job-extractor/src/main/java/nu/marginalia/crawl/CrawlJobDomainExtractor.java
@@ -16,7 +16,7 @@ import java.util.Set;
 import java.util.stream.Stream;
 
 public class CrawlJobDomainExtractor {
-    private static final int MIN_VISIT_COUNT = 1000;
+    private static final int MIN_VISIT_COUNT = 200;
     private static final int MAX_VISIT_COUNT = 100000;
 
     private static final String specificDomainSql =
@@ -198,11 +198,20 @@ public class CrawlJobDomainExtractor {
     }
 
     private int calculateCrawlDepthFromVisitedCount(int count) {
-        count = count + 1000 + count / 4;
-
-        if (count < MIN_VISIT_COUNT) {
+        if (count < MIN_VISIT_COUNT / 2) {
+            /* If we aren't getting very many good documents
+               out of this webpage on previous attempts, we
+               won't dig very deeply this time. This saves time
+               and resources for both the crawler and the server,
+               and also prevents deep crawls on foreign websites we aren't
+               interested in crawling at this point. */
             count = MIN_VISIT_COUNT;
         }
+        else {
+            /* If we got many results previously, we'll
+               dig deeper with each successive crawl. */
+            count = count + 1000 + count / 4;
+        }
 
         if (count > MAX_VISIT_COUNT) {
             count = MAX_VISIT_COUNT;
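
For reference, a minimal standalone sketch of calculateCrawlDepthFromVisitedCount()
as it reads after this patch follows. The constants, method name, and branch logic
are taken directly from the hunks above; the class wrapper, the trailing return
(implied by the int return type, since the hunk ends before it), and the main()
harness are illustrative additions, not part of the commit.

public class CrawlDepthSketch {
    // Constants as set by this patch.
    private static final int MIN_VISIT_COUNT = 200;
    private static final int MAX_VISIT_COUNT = 100000;

    /* Reconstruction of the patched method; behavior should match
       the + side of the second hunk above. */
    static int calculateCrawlDepthFromVisitedCount(int count) {
        if (count < MIN_VISIT_COUNT / 2) {
            // Fewer than 100 good documents on previous attempts:
            // pin the visit budget to the (now lower) minimum.
            count = MIN_VISIT_COUNT;
        }
        else {
            // Productive domains get a budget that grows by roughly
            // 25% plus a flat 1000 on each successive crawl.
            count = count + 1000 + count / 4;
        }

        if (count > MAX_VISIT_COUNT) {
            count = MAX_VISIT_COUNT;
        }

        return count;
    }

    public static void main(String[] args) {
        // Illustrative inputs spanning both branches and the upper cap.
        for (int prev : new int[] { 0, 50, 99, 100, 1000, 10000, 90000 }) {
            System.out.printf("%6d previously visited -> depth %6d%n",
                    prev, calculateCrawlDepthFromVisitedCount(prev));
        }
    }
}

Net effect of the two changes taken together: domains that previously yielded
fewer than 100 good documents are now capped at 200 visits (down from the old
effective floor of 1000), while productive domains keep the old growth formula,
bounded above by MAX_VISIT_COUNT.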