(crawler) Implement domain shuffling in DbCrawlSpecProvider

Modified DbCrawlSpecProvider to shuffle the list of domains after loading it, so that each crawl gets a good mix of domains. This keeps the crawler from hitting the same server in parallel through several of its subdomains, and from crawling all the big domains at once.
This commit is contained in:
Viktor Lofgren 2024-02-17 17:47:38 +01:00
parent 92717a4832
commit cee707abd8

View File

@ -10,6 +10,7 @@ import org.slf4j.LoggerFactory;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.stream.Stream;
@ -60,6 +61,11 @@ public class DbCrawlSpecProvider implements CrawlSpecProvider {
logger.info("Loaded {} domains", domains.size());
// Shuffle the domains to ensure we get a good mix of domains in each crawl,
// so that e.g. the big domains don't all get crawled at once, and we don't
// end up crawling the same server in parallel via different subdomains.
Collections.shuffle(domains);
return domains;
}