(crawler) Implement domain shuffling in DbCrawlSpecProvider
Modified DbCrawlSpecProvider to shuffle the list of domains after loading it, so that each crawl gets a good mix of domains. This reduces the risk of overloading a single server by crawling several of its subdomains in parallel, and of crawling all the big domains at once.
This commit is contained in:
parent
92717a4832
commit
cee707abd8
@ -10,6 +10,7 @@ import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.sql.SQLException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
@ -60,6 +61,11 @@ public class DbCrawlSpecProvider implements CrawlSpecProvider {
|
||||
|
||||
logger.info("Loaded {} domains", domains.size());
|
||||
|
||||
// Shuffle the domains to ensure we get a good mix of domains in each crawl,
|
||||
// so that e.g. the big domains don't get all crawled at once, or we end up
|
||||
// crawling the same server in parallel from different subdomains...
|
||||
Collections.shuffle(domains);
|
||||
|
||||
return domains;
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user