Merge pull request 'Don't try to fetch ftp://, webcal://, etc.' (#90) from master into release

Reviewed-on: https://git.marginalia.nu/marginalia/marginalia.nu/pulls/90
Viktor Lofgren 2022-08-18 18:27:15 +02:00
commit a915b2d37a
6 changed files with 134 additions and 92 deletions
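In short: before a discovered URL is queued for fetching, its scheme is now checked, and anything other than plain http or https (ftp://, webcal://, and so on) is skipped. A condensed sketch of the new check, taken from the CrawlerRetreiver diff below (the surrounding loop is abbreviated for illustration):

    // Only http and https URLs are ever fetched after this change.
    private boolean isAllowedProtocol(String proto) {
        return proto.equalsIgnoreCase("http")
                || proto.equalsIgnoreCase("https");
    }

    // Inside the crawl loop, alongside the existing blocklist and length checks:
    //     if (!isAllowedProtocol(top.proto))
    //         continue;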

CrawlerRetreiver.java

@@ -20,7 +20,10 @@ import java.net.InetAddress;
 import java.net.URISyntaxException;
 import java.net.UnknownHostException;
 import java.time.LocalDateTime;
-import java.util.*;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.LinkedList;
+import java.util.Optional;
 
 public class CrawlerRetreiver {
     private static final long DEFAULT_CRAWL_DELAY_MS = Long.getLong("defaultCrawlDelay", 1000);
@@ -49,7 +52,7 @@
         }
     }
 
-    public CrawlerRetreiver(HttpFetcher fetcher, CrawlingSpecification specs, CrawledDomainWriter crawledDomainWriter) {
+    public CrawlerRetreiver(HttpFetcher fetcher, CrawlingSpecification specs, CrawledDomainWriter writer) {
         this.fetcher = fetcher;
         visited = new HashSet<>((int)(specs.urls.size() * 1.5));
         known = new HashSet<>(specs.urls.size() * 10);
@@ -57,14 +60,14 @@
         depth = specs.crawlDepth;
         id = specs.id;
         domain = specs.domain;
-        this.crawledDomainWriter = crawledDomainWriter;
+        crawledDomainWriter = writer;
 
-        specs.urls.stream()
-                .map(this::parseUrl)
-                .filter(Optional::isPresent)
-                .map(Optional::get)
-                .filter(known::add)
-                .forEach(queue::addLast);
+        for (String urlStr : specs.urls) {
+            EdgeUrl.parse(urlStr)
+                    .filter(known::add)
+                    .ifPresent(queue::addLast);
+        }
 
         if (queue.peek() != null) {
             var fst = queue.peek();
@@ -74,15 +77,6 @@
         }
     }
 
-    private Optional<EdgeUrl> parseUrl(String str) {
-        try {
-            return Optional.of(new EdgeUrl(str));
-        }
-        catch (Exception ex) {
-            return Optional.empty();
-        }
-    }
-
     public int fetch() throws IOException {
         Optional<CrawledDomain> probeResult = probeDomainForProblems(domain);
@@ -135,13 +129,11 @@
         var robotsRules = fetcher.fetchRobotRules(queue.peek().domain);
         long crawlDelay = robotsRules.getCrawlDelay();
 
-        List<CrawledDocument> docs = new ArrayList<>(depth);
-        CrawledDomain ret = new CrawledDomain(id, domain, null, CrawlerDomainStatus.OK.name(), null, ip, docs, null);
+        CrawledDomain ret = new CrawledDomain(id, domain, null, CrawlerDomainStatus.OK.name(), null, ip, Collections.emptyList(), null);
 
-        int visitedCount = 0;
        int fetchedCount = 0;
 
-        while (!queue.isEmpty() && visitedCount < depth) {
+        while (!queue.isEmpty() && visited.size() < depth) {
            var top = queue.removeFirst();
 
            if (!robotsRules.isAllowed(top.toString())) {
@@ -151,20 +143,31 @@
             if (urlBlocklist.isUrlBlocked(top))
                 continue;
 
+            if (!isAllowedProtocol(top.proto))
+                continue;
+
             if (top.toString().length() > 255)
                 continue;
 
-            if (!visited.add(top))
+            if (!visited.add(top)) {
                 continue;
+            }
 
+            if (fetchDocument(top, crawlDelay)) {
+                fetchedCount++;
+            }
+        }
+
+        ret.cookies = fetcher.getCookies();
+
+        crawledDomainWriter.accept(ret);
+
+        return fetchedCount;
+    }
+
+    private boolean fetchDocument(EdgeUrl top, long crawlDelay) throws IOException {
         logger.debug("Fetching {}", top);
         long startTime = System.currentTimeMillis();
 
         var doc = fetchUrl(top);
         if (doc.isPresent()) {
-            fetchedCount++;
-
             var d = doc.get();
             crawledDomainWriter.accept(d);
@@ -179,14 +182,12 @@
         long crawledTime = System.currentTimeMillis() - startTime;
         delay(crawlDelay, crawledTime);
 
-        visitedCount ++;
-
-        ret.cookies = fetcher.getCookies();
-
-        crawledDomainWriter.accept(ret);
-
-        return fetchedCount;
+        return doc.isPresent();
     }
 
+    private boolean isAllowedProtocol(String proto) {
+        return proto.equalsIgnoreCase("http")
+                || proto.equalsIgnoreCase("https");
+    }
 
     private Optional<CrawledDocument> fetchUrl(EdgeUrl top) {
@@ -278,18 +279,18 @@
     }
 
     @SneakyThrows
-    private void delay(long crawlDelay, long timeParsed) {
-        if (crawlDelay >= 1) {
-            if (timeParsed > crawlDelay)
+    private void delay(long sleepTime, long spentTime) {
+        if (sleepTime >= 1) {
+            if (spentTime > sleepTime)
                 return;
-            Thread.sleep(Math.min(crawlDelay-timeParsed, 5000));
+            Thread.sleep(Math.min(sleepTime-spentTime, 5000));
         }
         else {
-            if (timeParsed > DEFAULT_CRAWL_DELAY_MS)
+            if (spentTime > DEFAULT_CRAWL_DELAY_MS)
                 return;
-            Thread.sleep(DEFAULT_CRAWL_DELAY_MS - timeParsed);
+            Thread.sleep(DEFAULT_CRAWL_DELAY_MS - spentTime);
        }
    }

HttpFetcher.java

@@ -9,6 +9,8 @@ import lombok.SneakyThrows;
 import lombok.ToString;
 import nu.marginalia.wmsa.edge.crawling.model.CrawledDocument;
 import nu.marginalia.wmsa.edge.crawling.model.CrawlerDocumentStatus;
+import nu.marginalia.wmsa.edge.crawling.retreival.logic.ContentTypeLogic;
+import nu.marginalia.wmsa.edge.crawling.retreival.logic.ContentTypeParser;
 import nu.marginalia.wmsa.edge.model.EdgeDomain;
 import nu.marginalia.wmsa.edge.model.EdgeUrl;
 import okhttp3.Dispatcher;
@@ -29,8 +31,6 @@ import java.util.List;
 import java.util.Objects;
 import java.util.Optional;
 import java.util.concurrent.TimeUnit;
-import java.util.function.Predicate;
-import java.util.regex.Pattern;
 import java.util.zip.GZIPInputStream;
 
 public class HttpFetcher {
@@ -42,11 +42,11 @@
     private static final SimpleRobotRulesParser robotsParser = new SimpleRobotRulesParser();
 
-    public void setAllowAllContentTypes(boolean allowAllContentTypes) {
-        this.allowAllContentTypes = allowAllContentTypes;
-    }
+    private final ContentTypeLogic contentTypeLogic = new ContentTypeLogic();
 
-    private boolean allowAllContentTypes = false;
+    public void setAllowAllContentTypes(boolean allowAllContentTypes) {
+        contentTypeLogic.setAllowAllContentTypes(allowAllContentTypes);
+    }
 
     private final OkHttpClient client;
@@ -142,8 +142,8 @@
     @SneakyThrows
     public CrawledDocument fetchContent(EdgeUrl url) {
 
-        if (isUrlLikeBinary(url)) {
+        if (contentTypeLogic.isUrlLikeBinary(url)) {
 
             logger.debug("Probing suspected binary {}", url);
             var head = createHeadRequest(url);
@@ -151,7 +151,7 @@
             try (var rsp = call.execute()) {
                 var contentTypeHeader = rsp.header("Content-type");
-                if (contentTypeHeader != null && !isAllowableContentType(contentTypeHeader)) {
+                if (contentTypeHeader != null && !contentTypeLogic.isAllowableContentType(contentTypeHeader)) {
                     return createErrorResponse(url, rsp, CrawlerDocumentStatus.BAD_CONTENT_TYPE, "Early probe failed");
                 }
             }
@@ -163,9 +163,6 @@
         var get = createGetRequest(url);
         var call = client.newCall(get);
 
         try (var rsp = call.execute()) {
             return extractBody(url, rsp);
         }
@@ -217,14 +214,14 @@
             byteStream = new BOMInputStream(byteStream);
 
             var contentTypeHeader = rsp.header("Content-type");
-            if (contentTypeHeader != null && !isAllowableContentType(contentTypeHeader)) {
+            if (contentTypeHeader != null && !contentTypeLogic.isAllowableContentType(contentTypeHeader)) {
                 return createErrorResponse(url, rsp, CrawlerDocumentStatus.BAD_CONTENT_TYPE, "");
             }
 
             byte[] data = byteStream.readNBytes(maxFetchSize);
 
             var contentType = ContentTypeParser.parse(contentTypeHeader, data);
-            if (!isAllowableContentType(contentType.contentType)) {
+            if (!contentTypeLogic.isAllowableContentType(contentType.contentType)) {
                 return createErrorResponse(url, rsp, CrawlerDocumentStatus.BAD_CONTENT_TYPE, "");
             }
@@ -262,25 +259,6 @@
         }
     }
 
-    private final Predicate<String> probableHtmlPattern = Pattern.compile("^.*\\.(htm|html|php|txt)(\\?.*)?$").asPredicate();
-    private final Predicate<String> probableBinaryPattern = Pattern.compile("^.*\\.[a-z]+$").asPredicate();
-
-    public boolean isUrlLikeBinary(EdgeUrl url) {
-        String urlString = url.toString().toLowerCase();
-
-        return (!probableHtmlPattern.test(urlString) && probableBinaryPattern.test(urlString));
-    }
-
-    private boolean isAllowableContentType(String contentType) {
-        return allowAllContentTypes || contentType.startsWith("text")
-                || contentType.startsWith("application/xhtml")
-                || contentType.startsWith("application/xml")
-                || contentType.startsWith("application/atom+xml")
-                || contentType.startsWith("application/rss+xml")
-                || contentType.startsWith("application/x-rss+xml")
-                || contentType.startsWith("application/rdf+xml")
-                || contentType.startsWith("x-rss+xml");
-    }
-
     public SimpleRobotRules fetchRobotRules(EdgeDomain domain) {
         return fetchRobotsForProto("https", domain)

ContentTypeLogic.java (new file)

@@ -0,0 +1,53 @@
+package nu.marginalia.wmsa.edge.crawling.retreival.logic;
+
+import nu.marginalia.wmsa.edge.model.EdgeUrl;
+
+import java.util.List;
+import java.util.Set;
+import java.util.function.Predicate;
+import java.util.regex.Pattern;
+
+public class ContentTypeLogic {
+
+    private static final Predicate<String> probableHtmlPattern = Pattern.compile("^.*\\.(htm|html|php|txt)$").asMatchPredicate();
+    private static final Predicate<String> probableBinaryPattern = Pattern.compile("^.*\\.[a-z]+$").asMatchPredicate();
+    private static final Set<String> blockedContentTypes = Set.of("text/css", "text/javascript");
+    private static final List<String> acceptedContentTypePrefixes = List.of(
+            "text/",
+            "application/xhtml",
+            "application/xml",
+            "application/atom+xml",
+            "application/rss+xml",
+            "application/x-rss+xml",
+            "application/rdf+xml",
+            "x-rss+xml"
+    );
+
+    private boolean allowAllContentTypes = false;
+
+    public void setAllowAllContentTypes(boolean allowAllContentTypes) {
+        this.allowAllContentTypes = allowAllContentTypes;
+    }
+
+    public boolean isUrlLikeBinary(EdgeUrl url) {
+        String pathLowerCase = url.path.toLowerCase();
+
+        if (probableHtmlPattern.test(pathLowerCase))
+            return false;
+
+        return probableBinaryPattern.test(pathLowerCase);
+    }
+
+    public boolean isAllowableContentType(String contentType) {
+        if (allowAllContentTypes)
+            return true;
+        if (blockedContentTypes.contains(contentType)) {
+            return false;
+        }
+        for (var prefix : acceptedContentTypePrefixes) {
+            if (contentType.startsWith(prefix))
+                return true;
+        }
+        return false;
+    }
+}
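For orientation, a small usage sketch of the new class (illustrative only; the expected results follow from the code above, and EdgeUrl.parse is the helper added further down in this commit):

    var contentTypeLogic = new ContentTypeLogic();

    // Content types: an accepted prefix wins, unless the exact type is blocked.
    contentTypeLogic.isAllowableContentType("text/html");        // true  ("text/" prefix)
    contentTypeLogic.isAllowableContentType("text/css");         // false (explicitly blocked)
    contentTypeLogic.isAllowableContentType("application/pdf");  // false (no accepted prefix)

    // URL shape: paths ending in .htm/.html/.php/.txt look like documents;
    // any other "dot plus letters" suffix is treated as probably binary.
    contentTypeLogic.isUrlLikeBinary(EdgeUrl.parse("https://marginalia.nu/log.html").orElseThrow());   // false
    contentTypeLogic.isUrlLikeBinary(EdgeUrl.parse("https://marginalia.nu/log.tar.gz").orElseThrow()); // true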

ContentTypeParser.java

@@ -1,4 +1,4 @@
-package nu.marginalia.wmsa.edge.crawling.retreival;
+package nu.marginalia.wmsa.edge.crawling.retreival.logic;
 
 import crawlercommons.mimetypes.MimeTypeDetector;
 import nu.marginalia.wmsa.edge.model.crawl.EdgeContentType;

EdgeUrl.java

@@ -8,6 +8,7 @@ import nu.marginalia.wmsa.edge.converting.processor.logic.QueryParams;
 
 import java.net.URI;
 import java.net.URISyntaxException;
+import java.util.Optional;
 import java.util.regex.Pattern;
 
 @Getter @Setter @Builder @EqualsAndHashCode
@@ -30,6 +31,14 @@ public class EdgeUrl implements WideHashable {
         this(new URI(urlencodeFixer(url)));
     }
 
+    public static Optional<EdgeUrl> parse(String url) {
+        try {
+            return Optional.of(new EdgeUrl(url));
+        } catch (URISyntaxException e) {
+            return Optional.empty();
+        }
+    }
+
     private static Pattern badCharPattern = Pattern.compile("[ \t\n\"<>\\[\\]()',|]");
 
     public static String urlencodeFixer(String url) throws URISyntaxException {

HttpFetcherTest.java

@@ -3,6 +3,7 @@ package nu.marginalia.wmsa.edge.crawling;
 import lombok.SneakyThrows;
 import nu.marginalia.wmsa.edge.crawling.retreival.HttpFetcher;
 import nu.marginalia.wmsa.edge.crawling.retreival.HttpRedirectResolver;
+import nu.marginalia.wmsa.edge.crawling.retreival.logic.ContentTypeLogic;
 import nu.marginalia.wmsa.edge.model.EdgeUrl;
 import org.junit.jupiter.api.Assertions;
 import org.junit.jupiter.api.Test;
@@ -14,15 +15,15 @@ class HttpFetcherTest {
     @SneakyThrows
     @Test
     void testUrlPattern() {
-        var fetcher = new HttpFetcher("nu.marginalia.edge-crawler");
+        ContentTypeLogic contentTypeLogic = new ContentTypeLogic();
 
-        Assertions.assertFalse(fetcher.isUrlLikeBinary(new EdgeUrl("https://marginalia.nu/log.txt")));
-        Assertions.assertTrue(fetcher.isUrlLikeBinary(new EdgeUrl("https://marginalia.nu/log.bin")));
-        Assertions.assertTrue(fetcher.isUrlLikeBinary(new EdgeUrl("https://marginalia.nu/log.tar.gz")));
-        Assertions.assertFalse(fetcher.isUrlLikeBinary(new EdgeUrl("https://marginalia.nu/log.htm")));
-        Assertions.assertFalse(fetcher.isUrlLikeBinary(new EdgeUrl("https://marginalia.nu/log.html")));
-        Assertions.assertFalse(fetcher.isUrlLikeBinary(new EdgeUrl("https://marginalia.nu/log")));
-        Assertions.assertFalse(fetcher.isUrlLikeBinary(new EdgeUrl("https://marginalia.nu/log.php?id=1")));
+        Assertions.assertFalse(contentTypeLogic.isUrlLikeBinary(new EdgeUrl("https://marginalia.nu/log.txt")));
+        Assertions.assertTrue(contentTypeLogic.isUrlLikeBinary(new EdgeUrl("https://marginalia.nu/log.bin")));
+        Assertions.assertTrue(contentTypeLogic.isUrlLikeBinary(new EdgeUrl("https://marginalia.nu/log.tar.gz")));
+        Assertions.assertFalse(contentTypeLogic.isUrlLikeBinary(new EdgeUrl("https://marginalia.nu/log.htm")));
+        Assertions.assertFalse(contentTypeLogic.isUrlLikeBinary(new EdgeUrl("https://marginalia.nu/log.html")));
+        Assertions.assertFalse(contentTypeLogic.isUrlLikeBinary(new EdgeUrl("https://marginalia.nu/log")));
+        Assertions.assertFalse(contentTypeLogic.isUrlLikeBinary(new EdgeUrl("https://marginalia.nu/log.php?id=1")));
     }
 
     @Test