Merge pull request 'Don't try to fetch text/css and text/javascript-files. Refactor fetcher to separate content type sniffing logic. Clean up crawler a smidge.' (#91) from master into release
Reviewed-on: https://git.marginalia.nu/marginalia/marginalia.nu/pulls/91
This commit is contained in:
commit
014068ace5
@@ -20,7 +20,7 @@ import java.net.InetAddress;
|
|||||||
import java.net.URISyntaxException;
|
import java.net.URISyntaxException;
|
||||||
import java.net.UnknownHostException;
|
import java.net.UnknownHostException;
|
||||||
import java.time.LocalDateTime;
|
import java.time.LocalDateTime;
|
||||||
import java.util.Collections;
|
import java.util.ArrayList;
|
||||||
import java.util.HashSet;
|
import java.util.HashSet;
|
||||||
import java.util.LinkedList;
|
import java.util.LinkedList;
|
||||||
import java.util.Optional;
|
import java.util.Optional;
|
||||||
@@ -129,7 +129,7 @@ public class CrawlerRetreiver {
|
|||||||
var robotsRules = fetcher.fetchRobotRules(queue.peek().domain);
|
var robotsRules = fetcher.fetchRobotRules(queue.peek().domain);
|
||||||
long crawlDelay = robotsRules.getCrawlDelay();
|
long crawlDelay = robotsRules.getCrawlDelay();
|
||||||
|
|
||||||
CrawledDomain ret = new CrawledDomain(id, domain, null, CrawlerDomainStatus.OK.name(), null, ip, Collections.emptyList(), null);
|
CrawledDomain ret = new CrawledDomain(id, domain, null, CrawlerDomainStatus.OK.name(), null, ip, new ArrayList<>(), null);
|
||||||
|
|
||||||
int fetchedCount = 0;
|
int fetchedCount = 0;
|
||||||
|
|
||||||
@@ -137,7 +137,7 @@ public class CrawlerRetreiver {
|
|||||||
var top = queue.removeFirst();
|
var top = queue.removeFirst();
|
||||||
|
|
||||||
if (!robotsRules.isAllowed(top.toString())) {
|
if (!robotsRules.isAllowed(top.toString())) {
|
||||||
ret.doc.add(createRobotsError(top));
|
crawledDomainWriter.accept(createRobotsError(top));
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user