Merge pull request 'Don't try to fetch text/css and text/javascript files. Refactor fetcher to separate content type sniffing logic. Clean up crawler a smidge.' (#91) from master into release
Reviewed-on: https://git.marginalia.nu/marginalia/marginalia.nu/pulls/91
This commit is contained in:
commit 014068ace5
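For context, the PR description says the fetcher now skips text/css and text/javascript documents and moves content type sniffing into its own logic. Below is a minimal sketch of what such a content-type check might look like; the class name ContentTypeLogic, the method isAllowedContentType, and the exact set of blocked MIME types are illustrative assumptions, not the actual fetcher API introduced by this commit.

import java.util.Set;

public class ContentTypeLogic {
    // MIME types the crawler has no use for and should not fetch (assumed list)
    private static final Set<String> BLOCKED_CONTENT_TYPES =
            Set.of("text/css", "text/javascript", "application/javascript");

    /** Returns true if a document with this Content-Type header is worth fetching. */
    public boolean isAllowedContentType(String contentTypeHeader) {
        if (contentTypeHeader == null) {
            return true; // unknown type; let a later sniffing step decide
        }
        // Strip any parameters, e.g. "text/css; charset=utf-8" -> "text/css"
        String mimeType = contentTypeHeader.split(";", 2)[0].trim().toLowerCase();
        return !BLOCKED_CONTENT_TYPES.contains(mimeType);
    }
}

A fetcher could consult this check against the response's Content-Type header (or a HEAD request) before downloading the body, which is one plausible reading of "separate content type sniffing logic" in the PR title.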
@@ -20,7 +20,7 @@ import java.net.InetAddress;
 import java.net.URISyntaxException;
 import java.net.UnknownHostException;
 import java.time.LocalDateTime;
-import java.util.Collections;
+import java.util.ArrayList;
 import java.util.HashSet;
 import java.util.LinkedList;
 import java.util.Optional;
@@ -129,7 +129,7 @@ public class CrawlerRetreiver {
         var robotsRules = fetcher.fetchRobotRules(queue.peek().domain);
         long crawlDelay = robotsRules.getCrawlDelay();
 
-        CrawledDomain ret = new CrawledDomain(id, domain, null, CrawlerDomainStatus.OK.name(), null, ip, Collections.emptyList(), null);
+        CrawledDomain ret = new CrawledDomain(id, domain, null, CrawlerDomainStatus.OK.name(), null, ip, new ArrayList<>(), null);
 
         int fetchedCount = 0;
 
@@ -137,7 +137,7 @@ public class CrawlerRetreiver {
             var top = queue.removeFirst();
 
             if (!robotsRules.isAllowed(top.toString())) {
-                ret.doc.add(createRobotsError(top));
+                crawledDomainWriter.accept(createRobotsError(top));
                 continue;
             }
 