From cee707abd8319a5b227867d01f2b20f5b3755859 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Sat, 17 Feb 2024 17:47:38 +0100 Subject: [PATCH] (crawler) Implement domain shuffling in DbCrawlSpecProvider Modified the DbCrawlSpecProvider to shuffle domains after loading to ensure a good mix for each crawl. This makes it less likely that the same server is crawled in parallel via different subdomains, or that all the big domains are crawled at once. --- .../java/nu/marginalia/crawl/spec/DbCrawlSpecProvider.java | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/spec/DbCrawlSpecProvider.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/spec/DbCrawlSpecProvider.java index 756b4dd8..755cec43 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/spec/DbCrawlSpecProvider.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/spec/DbCrawlSpecProvider.java @@ -10,6 +10,7 @@ import org.slf4j.LoggerFactory; import java.sql.SQLException; import java.util.ArrayList; +import java.util.Collections; import java.util.List; import java.util.stream.Stream; @@ -60,6 +61,11 @@ public class DbCrawlSpecProvider implements CrawlSpecProvider { logger.info("Loaded {} domains", domains.size()); + // Shuffle the domains to ensure we get a good mix of domains in each crawl, + // so that e.g. the big domains don't get all crawled at once, or we end up + // crawling the same server in parallel from different subdomains... + Collections.shuffle(domains); + return domains; }