From e1a155a9c8324552f9cf64b981e38d39facf7bc4 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Sat, 23 Dec 2023 13:22:10 +0100 Subject: [PATCH] (crawler) Increase growth of crawl jobs A number of crawl jobs get stuck at about 300 documents, or just under. This seems to be because we fail to increase the crawl limit, which is based on MAX(200, 1.25 x GOOD_URLS) with a 1.5x modifier applied upon a recrawl. GOOD_URLS is based on how many documents successfully process, which is typically fairly small. Switching to KNOWN_URLS should let this grow faster. The SQL query in the DbCrawlSpecProvider class has been updated; 'GOOD_URLS' has been replaced with 'KNOWN_URLS'. This update ensures the correct data is selected from the DOMAIN_METADATA table. The floor is also increased to 250 from 200. --- .../java/nu/marginalia/crawl/spec/DbCrawlSpecProvider.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/spec/DbCrawlSpecProvider.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/spec/DbCrawlSpecProvider.java index 79b08117..c0dfda01 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/spec/DbCrawlSpecProvider.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/spec/DbCrawlSpecProvider.java @@ -36,7 +36,7 @@ public class DbCrawlSpecProvider implements CrawlSpecProvider { try (var conn = dataSource.getConnection(); var query = conn.prepareStatement(""" - SELECT DOMAIN_NAME, COALESCE(GOOD_URLS, 0) + SELECT DOMAIN_NAME, COALESCE(KNOWN_URLS, 0) FROM EC_DOMAIN LEFT JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID WHERE NODE_AFFINITY=? @@ -48,7 +48,7 @@ public class DbCrawlSpecProvider implements CrawlSpecProvider { while (rs.next()) { domains.add(new CrawlSpecRecord( rs.getString(1), - Math.clamp((int) (1.25 * rs.getInt(2)), 200, 10_000), + Math.clamp((int) (1.25 * rs.getInt(2)), 250, 10_000), List.of() )); }
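Editor's note, for illustration only: a minimal, self-contained sketch of the crawl-limit arithmetic the commit message describes. The CrawlLimitSketch class and the sample counts are made up for this note; the real computation is the single Math.clamp line in the patch, and the 1.5x recrawl modifier mentioned above is applied elsewhere and not shown here. Assumes Java 21+ for Math.clamp.

// Hypothetical sketch, not part of the patch: compares the limit formula
// before and after the change. Requires Java 21+ (Math.clamp).
class CrawlLimitSketch {
    // Old formula: 1.25 x GOOD_URLS, clamped to [200, 10_000]
    static int oldLimit(int goodUrls) {
        return Math.clamp((int) (1.25 * goodUrls), 200, 10_000);
    }

    // New formula: 1.25 x KNOWN_URLS, clamped to [250, 10_000]
    static int newLimit(int knownUrls) {
        return Math.clamp((int) (1.25 * knownUrls), 250, 10_000);
    }

    public static void main(String[] args) {
        // A domain with few successfully processed documents used to sit at
        // the old floor: 1.25 * 100 = 125, clamped up to 200. Keyed on
        // KNOWN_URLS instead (say 2_000 known URLs), the same domain gets
        // 1.25 * 2_000 = 2_500, so the crawl can keep growing.
        System.out.println(oldLimit(100));   // 200
        System.out.println(newLimit(2_000)); // 2500
    }
}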