From 340d80f6c7cc4d44fc4d3f2d7856b3be40929c40 Mon Sep 17 00:00:00 2001 From: vlofgren Date: Thu, 18 Aug 2022 18:40:34 +0200 Subject: [PATCH] Don't try to fetch text/css and text/javascript-files. Refactor fetcher to separate content type sniffing logic. Clean up crawler a smidge. --- .../wmsa/edge/crawling/retreival/CrawlerRetreiver.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/CrawlerRetreiver.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/CrawlerRetreiver.java index 44281f9f..786df800 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/CrawlerRetreiver.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/CrawlerRetreiver.java @@ -20,7 +20,7 @@ import java.net.InetAddress; import java.net.URISyntaxException; import java.net.UnknownHostException; import java.time.LocalDateTime; -import java.util.Collections; +import java.util.ArrayList; import java.util.HashSet; import java.util.LinkedList; import java.util.Optional; @@ -129,7 +129,7 @@ public class CrawlerRetreiver { var robotsRules = fetcher.fetchRobotRules(queue.peek().domain); long crawlDelay = robotsRules.getCrawlDelay(); - CrawledDomain ret = new CrawledDomain(id, domain, null, CrawlerDomainStatus.OK.name(), null, ip, Collections.emptyList(), null); + CrawledDomain ret = new CrawledDomain(id, domain, null, CrawlerDomainStatus.OK.name(), null, ip, new ArrayList<>(), null); int fetchedCount = 0; @@ -137,7 +137,7 @@ public class CrawlerRetreiver { var top = queue.removeFirst(); if (!robotsRules.isAllowed(top.toString())) { - ret.doc.add(createRobotsError(top)); + crawledDomainWriter.accept(createRobotsError(top)); continue; }