From 4afccdc536262e1b98e74d4271afb81ae26b5f3e Mon Sep 17 00:00:00 2001
From: vlofgren
Date: Thu, 18 Aug 2022 17:25:19 +0200
Subject: [PATCH 1/2] Don't try to fetch ftp://, webcal://, etc.

---
 .../wmsa/edge/crawling/retreival/CrawlerRetreiver.java    | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/CrawlerRetreiver.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/CrawlerRetreiver.java
index a8793a36..e6abe52c 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/CrawlerRetreiver.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/CrawlerRetreiver.java
@@ -151,6 +151,8 @@ public class CrawlerRetreiver {
 
             if (urlBlocklist.isUrlBlocked(top))
                 continue;
+            if (!isAllowedProtocol(top.proto))
+                continue;
             if (top.toString().length() > 255)
                 continue;
 
@@ -189,6 +191,11 @@ public class CrawlerRetreiver {
         return fetchedCount;
     }
 
+    private boolean isAllowedProtocol(String proto) {
+        return proto.equalsIgnoreCase("http")
+                || proto.equalsIgnoreCase("https");
+    }
+
     private Optional<CrawledDocument> fetchUrl(EdgeUrl top) {
         try {

From 6b6cd56e3aad5648e10ba51864c8f897871f37b3 Mon Sep 17 00:00:00 2001
From: vlofgren
Date: Thu, 18 Aug 2022 18:25:09 +0200
Subject: [PATCH 2/2] Don't try to fetch text/css and text/javascript files.
 Refactor fetcher to separate content type sniffing logic. Clean up crawler a
 smidge.

---
 .../crawling/retreival/CrawlerRetreiver.java  | 96 +++++++++----------
 .../edge/crawling/retreival/HttpFetcher.java  | 42 ++------
 .../retreival/logic/ContentTypeLogic.java     | 53 ++++++++++
 .../{ => logic}/ContentTypeParser.java        |  2 +-
 .../marginalia/wmsa/edge/model/EdgeUrl.java   |  9 ++
 .../wmsa/edge/crawling/HttpFetcherTest.java   | 17 ++--
 6 files changed, 127 insertions(+), 92 deletions(-)
 create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/logic/ContentTypeLogic.java
 rename marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/{ => logic}/ContentTypeParser.java (98%)

diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/CrawlerRetreiver.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/CrawlerRetreiver.java
index e6abe52c..44281f9f 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/CrawlerRetreiver.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/CrawlerRetreiver.java
@@ -20,7 +20,10 @@ import java.net.InetAddress;
 import java.net.URISyntaxException;
 import java.net.UnknownHostException;
 import java.time.LocalDateTime;
-import java.util.*;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.LinkedList;
+import java.util.Optional;
 
 public class CrawlerRetreiver {
     private static final long DEFAULT_CRAWL_DELAY_MS = Long.getLong("defaultCrawlDelay", 1000);
@@ -49,7 +52,7 @@ public class CrawlerRetreiver {
         }
     }
 
-    public CrawlerRetreiver(HttpFetcher fetcher, CrawlingSpecification specs, CrawledDomainWriter crawledDomainWriter) {
+    public CrawlerRetreiver(HttpFetcher fetcher, CrawlingSpecification specs, CrawledDomainWriter writer) {
         this.fetcher = fetcher;
         visited = new HashSet<>((int)(specs.urls.size() * 1.5));
         known = new HashSet<>(specs.urls.size() * 10);
@@ -57,14 +60,14 @@
         depth = specs.crawlDepth;
         id = specs.id;
         domain = specs.domain;
-        this.crawledDomainWriter = crawledDomainWriter;
-        specs.urls.stream()
-                .map(this::parseUrl)
-                .filter(Optional::isPresent)
-                .map(Optional::get)
-                .filter(known::add)
-                .forEach(queue::addLast);
+        crawledDomainWriter = writer;
+
+        for (String urlStr : specs.urls) {
+            EdgeUrl.parse(urlStr)
+                    .filter(known::add)
+                    .ifPresent(queue::addLast);
+        }
 
         if (queue.peek() != null) {
             var fst = queue.peek();
@@ -74,15 +77,6 @@
         }
     }
 
-    private Optional<EdgeUrl> parseUrl(String str) {
-        try {
-            return Optional.of(new EdgeUrl(str));
-        }
-        catch (Exception ex) {
-            return Optional.empty();
-        }
-    }
-
     public int fetch() throws IOException {
         Optional<CrawledDomain> probeResult = probeDomainForProblems(domain);
@@ -135,13 +129,11 @@
         var robotsRules = fetcher.fetchRobotRules(queue.peek().domain);
         long crawlDelay = robotsRules.getCrawlDelay();
 
-        List<CrawledDocument> docs = new ArrayList<>(depth);
-        CrawledDomain ret = new CrawledDomain(id, domain, null, CrawlerDomainStatus.OK.name(), null, ip, docs, null);
+        CrawledDomain ret = new CrawledDomain(id, domain, null, CrawlerDomainStatus.OK.name(), null, ip, Collections.emptyList(), null);
 
-        int visitedCount = 0;
         int fetchedCount = 0;
 
-        while (!queue.isEmpty() && visitedCount < depth) {
+        while (!queue.isEmpty() && visited.size() < depth) {
             var top = queue.removeFirst();
 
             if (!robotsRules.isAllowed(top.toString())) {
@@ -155,33 +147,12 @@
                 continue;
             if (top.toString().length() > 255)
                 continue;
-
-            if (!visited.add(top)) {
+            if (!visited.add(top))
                 continue;
-            }
-
-            logger.debug("Fetching {}", top);
-            long startTime = System.currentTimeMillis();
-
-            var doc = fetchUrl(top);
-            if (doc.isPresent()) {
+            if (fetchDocument(top, crawlDelay)) {
                 fetchedCount++;
-
-                var d = doc.get();
-                crawledDomainWriter.accept(d);
-
-                if (d.url != null) {
-                    try {
-                        visited.add(new EdgeUrl(d.url));
-                    } catch (URISyntaxException ex) {}
-                }
-            }
-
-            long crawledTime = System.currentTimeMillis() - startTime;
-            delay(crawlDelay, crawledTime);
-
-            visitedCount ++;
             }
 
         ret.cookies = fetcher.getCookies();
@@ -191,6 +162,29 @@
         return fetchedCount;
     }
 
+    private boolean fetchDocument(EdgeUrl top, long crawlDelay) throws IOException {
+        logger.debug("Fetching {}", top);
+        long startTime = System.currentTimeMillis();
+
+        var doc = fetchUrl(top);
+        if (doc.isPresent()) {
+            var d = doc.get();
+            crawledDomainWriter.accept(d);
+
+            if (d.url != null) {
+                try {
+                    visited.add(new EdgeUrl(d.url));
+                } catch (URISyntaxException ex) {}
+            }
+
+        }
+
+        long crawledTime = System.currentTimeMillis() - startTime;
+        delay(crawlDelay, crawledTime);
+
+        return doc.isPresent();
+    }
+
     private boolean isAllowedProtocol(String proto) {
         return proto.equalsIgnoreCase("http")
                 || proto.equalsIgnoreCase("https");
@@ -285,18 +279,18 @@
     }
 
     @SneakyThrows
-    private void delay(long crawlDelay, long timeParsed) {
-        if (crawlDelay >= 1) {
-            if (timeParsed > crawlDelay)
+    private void delay(long sleepTime, long spentTime) {
+        if (sleepTime >= 1) {
+            if (spentTime > sleepTime)
                 return;
 
-            Thread.sleep(Math.min(crawlDelay-timeParsed, 5000));
+            Thread.sleep(Math.min(sleepTime-spentTime, 5000));
         }
         else {
-            if (timeParsed > DEFAULT_CRAWL_DELAY_MS)
+            if (spentTime > DEFAULT_CRAWL_DELAY_MS)
                 return;
 
-            Thread.sleep(DEFAULT_CRAWL_DELAY_MS - timeParsed);
+            Thread.sleep(DEFAULT_CRAWL_DELAY_MS - spentTime);
         }
     }

diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/HttpFetcher.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/HttpFetcher.java
index 76a2e247..cb198e21 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/HttpFetcher.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/HttpFetcher.java
@@ -9,6 +9,8 @@ import lombok.SneakyThrows;
 import lombok.ToString;
 import nu.marginalia.wmsa.edge.crawling.model.CrawledDocument;
 import nu.marginalia.wmsa.edge.crawling.model.CrawlerDocumentStatus;
+import nu.marginalia.wmsa.edge.crawling.retreival.logic.ContentTypeLogic;
+import nu.marginalia.wmsa.edge.crawling.retreival.logic.ContentTypeParser;
 import nu.marginalia.wmsa.edge.model.EdgeDomain;
 import nu.marginalia.wmsa.edge.model.EdgeUrl;
 import okhttp3.Dispatcher;
@@ -29,8 +31,6 @@ import java.util.List;
 import java.util.Objects;
 import java.util.Optional;
 import java.util.concurrent.TimeUnit;
-import java.util.function.Predicate;
-import java.util.regex.Pattern;
 import java.util.zip.GZIPInputStream;
 
 public class HttpFetcher {
@@ -42,11 +42,11 @@
     private static final SimpleRobotRulesParser robotsParser = new SimpleRobotRulesParser();
 
-    public void setAllowAllContentTypes(boolean allowAllContentTypes) {
-        this.allowAllContentTypes = allowAllContentTypes;
-    }
+    private final ContentTypeLogic contentTypeLogic = new ContentTypeLogic();
 
-    private boolean allowAllContentTypes = false;
+    public void setAllowAllContentTypes(boolean allowAllContentTypes) {
+        contentTypeLogic.setAllowAllContentTypes(allowAllContentTypes);
+    }
 
     private final OkHttpClient client;
@@ -142,8 +142,8 @@
 
     @SneakyThrows
     public CrawledDocument fetchContent(EdgeUrl url) {
-        if (isUrlLikeBinary(url)) {
+        if (contentTypeLogic.isUrlLikeBinary(url)) {
             logger.debug("Probing suspected binary {}", url);
 
             var head = createHeadRequest(url);
@@ -151,7 +151,7 @@
             try (var rsp = call.execute()) {
                 var contentTypeHeader = rsp.header("Content-type");
-                if (contentTypeHeader != null && !isAllowableContentType(contentTypeHeader)) {
+                if (contentTypeHeader != null && !contentTypeLogic.isAllowableContentType(contentTypeHeader)) {
                     return createErrorResponse(url, rsp, CrawlerDocumentStatus.BAD_CONTENT_TYPE, "Early probe failed");
                 }
             }
@@ -163,9 +163,6 @@
         var get = createGetRequest(url);
         var call = client.newCall(get);
-
-
-
         try (var rsp = call.execute()) {
             return extractBody(url, rsp);
         }
@@ -217,14 +214,14 @@
             byteStream = new BOMInputStream(byteStream);
 
         var contentTypeHeader = rsp.header("Content-type");
-        if (contentTypeHeader != null && !isAllowableContentType(contentTypeHeader)) {
+        if (contentTypeHeader != null && !contentTypeLogic.isAllowableContentType(contentTypeHeader)) {
            return createErrorResponse(url, rsp, CrawlerDocumentStatus.BAD_CONTENT_TYPE, "");
        }
 
         byte[] data = byteStream.readNBytes(maxFetchSize);
 
         var contentType = ContentTypeParser.parse(contentTypeHeader, data);
-        if (!isAllowableContentType(contentType.contentType)) {
+        if (!contentTypeLogic.isAllowableContentType(contentType.contentType)) {
             return createErrorResponse(url, rsp, CrawlerDocumentStatus.BAD_CONTENT_TYPE, "");
         }
@@ -262,25 +259,6 @@
     }
 
-    private final Predicate<String> probableHtmlPattern = Pattern.compile("^.*\\.(htm|html|php|txt)(\\?.*)?$").asPredicate();
-    private final Predicate<String> probableBinaryPattern = Pattern.compile("^.*\\.[a-z]+$").asPredicate();
-
-    public boolean isUrlLikeBinary(EdgeUrl url) {
-        String urlString = url.toString().toLowerCase();
-
-        return (!probableHtmlPattern.test(urlString) &&
-                probableBinaryPattern.test(urlString));
-    }
-
-    private boolean isAllowableContentType(String contentType) {
-        return allowAllContentTypes || contentType.startsWith("text")
-                || contentType.startsWith("application/xhtml")
-                || contentType.startsWith("application/xml")
-                || contentType.startsWith("application/atom+xml")
-                || contentType.startsWith("application/rss+xml")
-                || contentType.startsWith("application/x-rss+xml")
-                || contentType.startsWith("application/rdf+xml")
-                || contentType.startsWith("x-rss+xml");
-    }
 
     public SimpleRobotRules fetchRobotRules(EdgeDomain domain) {
         return fetchRobotsForProto("https", domain)

diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/logic/ContentTypeLogic.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/logic/ContentTypeLogic.java
new file mode 100644
index 00000000..9d05026c
--- /dev/null
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/logic/ContentTypeLogic.java
@@ -0,0 +1,53 @@
+package nu.marginalia.wmsa.edge.crawling.retreival.logic;
+
+import nu.marginalia.wmsa.edge.model.EdgeUrl;
+
+import java.util.List;
+import java.util.Set;
+import java.util.function.Predicate;
+import java.util.regex.Pattern;
+
+public class ContentTypeLogic {
+
+    private static final Predicate<String> probableHtmlPattern = Pattern.compile("^.*\\.(htm|html|php|txt)$").asMatchPredicate();
+    private static final Predicate<String> probableBinaryPattern = Pattern.compile("^.*\\.[a-z]+$").asMatchPredicate();
+    private static final Set<String> blockedContentTypes = Set.of("text/css", "text/javascript");
+    private static final List<String> acceptedContentTypePrefixes = List.of(
+            "text/",
+            "application/xhtml",
+            "application/xml",
+            "application/atom+xml",
+            "application/rss+xml",
+            "application/x-rss+xml",
+            "application/rdf+xml",
+            "x-rss+xml"
+    );
+    private boolean allowAllContentTypes = false;
+
+    public void setAllowAllContentTypes(boolean allowAllContentTypes) {
+        this.allowAllContentTypes = allowAllContentTypes;
+    }
+
+    public boolean isUrlLikeBinary(EdgeUrl url) {
+        String pathLowerCase = url.path.toLowerCase();
+
+        if (probableHtmlPattern.test(pathLowerCase))
+            return false;
+
+        return probableBinaryPattern.test(pathLowerCase);
+    }
+
+    public boolean isAllowableContentType(String contentType) {
+        if (allowAllContentTypes)
+            return true;
+        if (blockedContentTypes.contains(contentType)) {
+            return false;
+        }
+        for (var prefix : acceptedContentTypePrefixes) {
+            if (contentType.startsWith(prefix))
+                return true;
+        }
+        return false;
+    }
+
+}

diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/ContentTypeParser.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/logic/ContentTypeParser.java
similarity index 98%
rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/ContentTypeParser.java
rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/logic/ContentTypeParser.java
index ec09b974..2f3359f3 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/ContentTypeParser.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/logic/ContentTypeParser.java
@@ -1,4 +1,4 @@
-package nu.marginalia.wmsa.edge.crawling.retreival;
+package nu.marginalia.wmsa.edge.crawling.retreival.logic;
 
 import crawlercommons.mimetypes.MimeTypeDetector;
 import nu.marginalia.wmsa.edge.model.crawl.EdgeContentType;

diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeUrl.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeUrl.java
index 2e451ba1..2ba9234c 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeUrl.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeUrl.java
@@ -8,6 +8,7 @@ import nu.marginalia.wmsa.edge.converting.processor.logic.QueryParams;
 
 import java.net.URI;
 import java.net.URISyntaxException;
+import java.util.Optional;
 import java.util.regex.Pattern;
 
 @Getter @Setter @Builder @EqualsAndHashCode
@@ -30,6 +31,14 @@ public class EdgeUrl implements WideHashable {
         this(new URI(urlencodeFixer(url)));
     }
 
+    public static Optional<EdgeUrl> parse(String url) {
+        try {
+            return Optional.of(new EdgeUrl(url));
+        } catch (URISyntaxException e) {
+            return Optional.empty();
+        }
+    }
+
     private static Pattern badCharPattern = Pattern.compile("[ \t\n\"<>\\[\\]()',|]");
 
     public static String urlencodeFixer(String url) throws URISyntaxException {

diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/HttpFetcherTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/HttpFetcherTest.java
index ecf818ff..1fba5fe3 100644
--- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/HttpFetcherTest.java
+++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/HttpFetcherTest.java
@@ -3,6 +3,7 @@ package nu.marginalia.wmsa.edge.crawling;
 import lombok.SneakyThrows;
 import nu.marginalia.wmsa.edge.crawling.retreival.HttpFetcher;
 import nu.marginalia.wmsa.edge.crawling.retreival.HttpRedirectResolver;
+import nu.marginalia.wmsa.edge.crawling.retreival.logic.ContentTypeLogic;
 import nu.marginalia.wmsa.edge.model.EdgeUrl;
 import org.junit.jupiter.api.Assertions;
 import org.junit.jupiter.api.Test;
@@ -14,15 +15,15 @@ class HttpFetcherTest {
     @SneakyThrows
     @Test
     void testUrlPattern() {
-        var fetcher = new HttpFetcher("nu.marginalia.edge-crawler");
+        ContentTypeLogic contentTypeLogic = new ContentTypeLogic();
 
-        Assertions.assertFalse(fetcher.isUrlLikeBinary(new EdgeUrl("https://marginalia.nu/log.txt")));
-        Assertions.assertTrue(fetcher.isUrlLikeBinary(new EdgeUrl("https://marginalia.nu/log.bin")));
-        Assertions.assertTrue(fetcher.isUrlLikeBinary(new EdgeUrl("https://marginalia.nu/log.tar.gz")));
-        Assertions.assertFalse(fetcher.isUrlLikeBinary(new EdgeUrl("https://marginalia.nu/log.htm")));
-        Assertions.assertFalse(fetcher.isUrlLikeBinary(new EdgeUrl("https://marginalia.nu/log.html")));
-        Assertions.assertFalse(fetcher.isUrlLikeBinary(new EdgeUrl("https://marginalia.nu/log")));
-        Assertions.assertFalse(fetcher.isUrlLikeBinary(new EdgeUrl("https://marginalia.nu/log.php?id=1")));
+        Assertions.assertFalse(contentTypeLogic.isUrlLikeBinary(new EdgeUrl("https://marginalia.nu/log.txt")));
+        Assertions.assertTrue(contentTypeLogic.isUrlLikeBinary(new EdgeUrl("https://marginalia.nu/log.bin")));
+        Assertions.assertTrue(contentTypeLogic.isUrlLikeBinary(new EdgeUrl("https://marginalia.nu/log.tar.gz")));
+        Assertions.assertFalse(contentTypeLogic.isUrlLikeBinary(new EdgeUrl("https://marginalia.nu/log.htm")));
+        Assertions.assertFalse(contentTypeLogic.isUrlLikeBinary(new EdgeUrl("https://marginalia.nu/log.html")));
+        Assertions.assertFalse(contentTypeLogic.isUrlLikeBinary(new EdgeUrl("https://marginalia.nu/log")));
+        Assertions.assertFalse(contentTypeLogic.isUrlLikeBinary(new EdgeUrl("https://marginalia.nu/log.php?id=1")));
     }
 
     @Test
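---

Taken together, the two patches narrow what the crawler will even attempt to fetch: patch 1 skips any URL whose scheme is not http or https before a request is made, and patch 2 moves the URL and content-type heuristics out of HttpFetcher into ContentTypeLogic, adding an explicit blocklist for text/css and text/javascript. Those two types previously slipped through because the old check accepted anything starting with "text". Patch 2 also replaces the private, exception-swallowing parseUrl helper with a non-throwing EdgeUrl.parse factory.

Below is a small usage sketch of the new public pieces. It is not part of the patch set: the class name ContentTypeLogicSketch is invented for illustration, but EdgeUrl.parse, ContentTypeLogic.isUrlLikeBinary, and ContentTypeLogic.isAllowableContentType are exactly the methods introduced in the diffs above, and the expected outputs follow the assertions in HttpFetcherTest.

import nu.marginalia.wmsa.edge.crawling.retreival.logic.ContentTypeLogic;
import nu.marginalia.wmsa.edge.model.EdgeUrl;

// Illustrative only; not part of the patches.
class ContentTypeLogicSketch {
    public static void main(String[] args) {
        var logic = new ContentTypeLogic();

        // EdgeUrl.parse (patch 2) returns Optional.empty() on malformed input
        // instead of throwing. isUrlLikeBinary tests url.path, so a query
        // string no longer makes a .php URL look like a binary.
        EdgeUrl.parse("https://marginalia.nu/log.php?id=1")
                .map(logic::isUrlLikeBinary)
                .ifPresent(System.out::println);   // false -> fetched normally

        EdgeUrl.parse("https://marginalia.nu/log.tar.gz")
                .map(logic::isUrlLikeBinary)
                .ifPresent(System.out::println);   // true -> probed with HEAD first

        // The new blocklist rejects these two types outright...
        System.out.println(logic.isAllowableContentType("text/css"));        // false
        System.out.println(logic.isAllowableContentType("text/javascript")); // false
        // ...while text/html is still accepted via the "text/" prefix.
        System.out.println(logic.isAllowableContentType("text/html"));       // true
    }
}

One design consequence worth noting: because the blocklist is consulted before the accepted-prefix scan, narrowing the old startsWith("text") check to the "text/" prefix plus a blocklist keeps HTML, plain text, and the feed formats fetchable while stopping stylesheets and scripts at the HEAD probe, before their bodies are downloaded.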