Merge pull request 'Don't try to fetch ftp://, webcal://, etc.' (#90) from master into release
Reviewed-on: https://git.marginalia.nu/marginalia/marginalia.nu/pulls/90
This commit is contained in:
commit
a915b2d37a
@ -20,7 +20,10 @@ import java.net.InetAddress;
|
|||||||
import java.net.URISyntaxException;
|
import java.net.URISyntaxException;
|
||||||
import java.net.UnknownHostException;
|
import java.net.UnknownHostException;
|
||||||
import java.time.LocalDateTime;
|
import java.time.LocalDateTime;
|
||||||
import java.util.*;
|
import java.util.Collections;
|
||||||
|
import java.util.HashSet;
|
||||||
|
import java.util.LinkedList;
|
||||||
|
import java.util.Optional;
|
||||||
|
|
||||||
public class CrawlerRetreiver {
|
public class CrawlerRetreiver {
|
||||||
private static final long DEFAULT_CRAWL_DELAY_MS = Long.getLong("defaultCrawlDelay", 1000);
|
private static final long DEFAULT_CRAWL_DELAY_MS = Long.getLong("defaultCrawlDelay", 1000);
|
||||||
@ -49,7 +52,7 @@ public class CrawlerRetreiver {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public CrawlerRetreiver(HttpFetcher fetcher, CrawlingSpecification specs, CrawledDomainWriter crawledDomainWriter) {
|
public CrawlerRetreiver(HttpFetcher fetcher, CrawlingSpecification specs, CrawledDomainWriter writer) {
|
||||||
this.fetcher = fetcher;
|
this.fetcher = fetcher;
|
||||||
visited = new HashSet<>((int)(specs.urls.size() * 1.5));
|
visited = new HashSet<>((int)(specs.urls.size() * 1.5));
|
||||||
known = new HashSet<>(specs.urls.size() * 10);
|
known = new HashSet<>(specs.urls.size() * 10);
|
||||||
@ -57,14 +60,14 @@ public class CrawlerRetreiver {
|
|||||||
depth = specs.crawlDepth;
|
depth = specs.crawlDepth;
|
||||||
id = specs.id;
|
id = specs.id;
|
||||||
domain = specs.domain;
|
domain = specs.domain;
|
||||||
this.crawledDomainWriter = crawledDomainWriter;
|
|
||||||
|
|
||||||
specs.urls.stream()
|
crawledDomainWriter = writer;
|
||||||
.map(this::parseUrl)
|
|
||||||
.filter(Optional::isPresent)
|
for (String urlStr : specs.urls) {
|
||||||
.map(Optional::get)
|
EdgeUrl.parse(urlStr)
|
||||||
.filter(known::add)
|
.filter(known::add)
|
||||||
.forEach(queue::addLast);
|
.ifPresent(queue::addLast);
|
||||||
|
}
|
||||||
|
|
||||||
if (queue.peek() != null) {
|
if (queue.peek() != null) {
|
||||||
var fst = queue.peek();
|
var fst = queue.peek();
|
||||||
@ -74,15 +77,6 @@ public class CrawlerRetreiver {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private Optional<EdgeUrl> parseUrl(String str) {
|
|
||||||
try {
|
|
||||||
return Optional.of(new EdgeUrl(str));
|
|
||||||
}
|
|
||||||
catch (Exception ex) {
|
|
||||||
return Optional.empty();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
public int fetch() throws IOException {
|
public int fetch() throws IOException {
|
||||||
Optional<CrawledDomain> probeResult = probeDomainForProblems(domain);
|
Optional<CrawledDomain> probeResult = probeDomainForProblems(domain);
|
||||||
|
|
||||||
@ -135,13 +129,11 @@ public class CrawlerRetreiver {
|
|||||||
var robotsRules = fetcher.fetchRobotRules(queue.peek().domain);
|
var robotsRules = fetcher.fetchRobotRules(queue.peek().domain);
|
||||||
long crawlDelay = robotsRules.getCrawlDelay();
|
long crawlDelay = robotsRules.getCrawlDelay();
|
||||||
|
|
||||||
List<CrawledDocument> docs = new ArrayList<>(depth);
|
CrawledDomain ret = new CrawledDomain(id, domain, null, CrawlerDomainStatus.OK.name(), null, ip, Collections.emptyList(), null);
|
||||||
CrawledDomain ret = new CrawledDomain(id, domain, null, CrawlerDomainStatus.OK.name(), null, ip, docs, null);
|
|
||||||
|
|
||||||
int visitedCount = 0;
|
|
||||||
int fetchedCount = 0;
|
int fetchedCount = 0;
|
||||||
|
|
||||||
while (!queue.isEmpty() && visitedCount < depth) {
|
while (!queue.isEmpty() && visited.size() < depth) {
|
||||||
var top = queue.removeFirst();
|
var top = queue.removeFirst();
|
||||||
|
|
||||||
if (!robotsRules.isAllowed(top.toString())) {
|
if (!robotsRules.isAllowed(top.toString())) {
|
||||||
@ -151,35 +143,16 @@ public class CrawlerRetreiver {
|
|||||||
|
|
||||||
if (urlBlocklist.isUrlBlocked(top))
|
if (urlBlocklist.isUrlBlocked(top))
|
||||||
continue;
|
continue;
|
||||||
|
if (!isAllowedProtocol(top.proto))
|
||||||
|
continue;
|
||||||
if (top.toString().length() > 255)
|
if (top.toString().length() > 255)
|
||||||
continue;
|
continue;
|
||||||
|
if (!visited.add(top))
|
||||||
if (!visited.add(top)) {
|
|
||||||
continue;
|
continue;
|
||||||
}
|
|
||||||
|
|
||||||
logger.debug("Fetching {}", top);
|
if (fetchDocument(top, crawlDelay)) {
|
||||||
long startTime = System.currentTimeMillis();
|
|
||||||
|
|
||||||
var doc = fetchUrl(top);
|
|
||||||
if (doc.isPresent()) {
|
|
||||||
fetchedCount++;
|
fetchedCount++;
|
||||||
|
|
||||||
var d = doc.get();
|
|
||||||
crawledDomainWriter.accept(d);
|
|
||||||
|
|
||||||
if (d.url != null) {
|
|
||||||
try {
|
|
||||||
visited.add(new EdgeUrl(d.url));
|
|
||||||
} catch (URISyntaxException ex) {}
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
long crawledTime = System.currentTimeMillis() - startTime;
|
|
||||||
delay(crawlDelay, crawledTime);
|
|
||||||
|
|
||||||
visitedCount ++;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
ret.cookies = fetcher.getCookies();
|
ret.cookies = fetcher.getCookies();
|
||||||
@ -189,6 +162,34 @@ public class CrawlerRetreiver {
|
|||||||
return fetchedCount;
|
return fetchedCount;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private boolean fetchDocument(EdgeUrl top, long crawlDelay) throws IOException {
|
||||||
|
logger.debug("Fetching {}", top);
|
||||||
|
long startTime = System.currentTimeMillis();
|
||||||
|
|
||||||
|
var doc = fetchUrl(top);
|
||||||
|
if (doc.isPresent()) {
|
||||||
|
var d = doc.get();
|
||||||
|
crawledDomainWriter.accept(d);
|
||||||
|
|
||||||
|
if (d.url != null) {
|
||||||
|
try {
|
||||||
|
visited.add(new EdgeUrl(d.url));
|
||||||
|
} catch (URISyntaxException ex) {}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
long crawledTime = System.currentTimeMillis() - startTime;
|
||||||
|
delay(crawlDelay, crawledTime);
|
||||||
|
|
||||||
|
return doc.isPresent();
|
||||||
|
}
|
||||||
|
|
||||||
|
private boolean isAllowedProtocol(String proto) {
|
||||||
|
return proto.equalsIgnoreCase("http")
|
||||||
|
|| proto.equalsIgnoreCase("https");
|
||||||
|
}
|
||||||
|
|
||||||
private Optional<CrawledDocument> fetchUrl(EdgeUrl top) {
|
private Optional<CrawledDocument> fetchUrl(EdgeUrl top) {
|
||||||
try {
|
try {
|
||||||
|
|
||||||
@ -278,18 +279,18 @@ public class CrawlerRetreiver {
|
|||||||
}
|
}
|
||||||
|
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
private void delay(long crawlDelay, long timeParsed) {
|
private void delay(long sleepTime, long spentTime) {
|
||||||
if (crawlDelay >= 1) {
|
if (sleepTime >= 1) {
|
||||||
if (timeParsed > crawlDelay)
|
if (spentTime > sleepTime)
|
||||||
return;
|
return;
|
||||||
|
|
||||||
Thread.sleep(Math.min(crawlDelay-timeParsed, 5000));
|
Thread.sleep(Math.min(sleepTime-spentTime, 5000));
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
if (timeParsed > DEFAULT_CRAWL_DELAY_MS)
|
if (spentTime > DEFAULT_CRAWL_DELAY_MS)
|
||||||
return;
|
return;
|
||||||
|
|
||||||
Thread.sleep(DEFAULT_CRAWL_DELAY_MS - timeParsed);
|
Thread.sleep(DEFAULT_CRAWL_DELAY_MS - spentTime);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -9,6 +9,8 @@ import lombok.SneakyThrows;
|
|||||||
import lombok.ToString;
|
import lombok.ToString;
|
||||||
import nu.marginalia.wmsa.edge.crawling.model.CrawledDocument;
|
import nu.marginalia.wmsa.edge.crawling.model.CrawledDocument;
|
||||||
import nu.marginalia.wmsa.edge.crawling.model.CrawlerDocumentStatus;
|
import nu.marginalia.wmsa.edge.crawling.model.CrawlerDocumentStatus;
|
||||||
|
import nu.marginalia.wmsa.edge.crawling.retreival.logic.ContentTypeLogic;
|
||||||
|
import nu.marginalia.wmsa.edge.crawling.retreival.logic.ContentTypeParser;
|
||||||
import nu.marginalia.wmsa.edge.model.EdgeDomain;
|
import nu.marginalia.wmsa.edge.model.EdgeDomain;
|
||||||
import nu.marginalia.wmsa.edge.model.EdgeUrl;
|
import nu.marginalia.wmsa.edge.model.EdgeUrl;
|
||||||
import okhttp3.Dispatcher;
|
import okhttp3.Dispatcher;
|
||||||
@ -29,8 +31,6 @@ import java.util.List;
|
|||||||
import java.util.Objects;
|
import java.util.Objects;
|
||||||
import java.util.Optional;
|
import java.util.Optional;
|
||||||
import java.util.concurrent.TimeUnit;
|
import java.util.concurrent.TimeUnit;
|
||||||
import java.util.function.Predicate;
|
|
||||||
import java.util.regex.Pattern;
|
|
||||||
import java.util.zip.GZIPInputStream;
|
import java.util.zip.GZIPInputStream;
|
||||||
|
|
||||||
public class HttpFetcher {
|
public class HttpFetcher {
|
||||||
@ -42,11 +42,11 @@ public class HttpFetcher {
|
|||||||
|
|
||||||
private static final SimpleRobotRulesParser robotsParser = new SimpleRobotRulesParser();
|
private static final SimpleRobotRulesParser robotsParser = new SimpleRobotRulesParser();
|
||||||
|
|
||||||
public void setAllowAllContentTypes(boolean allowAllContentTypes) {
|
private final ContentTypeLogic contentTypeLogic = new ContentTypeLogic();
|
||||||
this.allowAllContentTypes = allowAllContentTypes;
|
|
||||||
}
|
|
||||||
|
|
||||||
private boolean allowAllContentTypes = false;
|
public void setAllowAllContentTypes(boolean allowAllContentTypes) {
|
||||||
|
contentTypeLogic.setAllowAllContentTypes(allowAllContentTypes);
|
||||||
|
}
|
||||||
|
|
||||||
private final OkHttpClient client;
|
private final OkHttpClient client;
|
||||||
|
|
||||||
@ -142,8 +142,8 @@ public class HttpFetcher {
|
|||||||
|
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
public CrawledDocument fetchContent(EdgeUrl url) {
|
public CrawledDocument fetchContent(EdgeUrl url) {
|
||||||
if (isUrlLikeBinary(url)) {
|
|
||||||
|
|
||||||
|
if (contentTypeLogic.isUrlLikeBinary(url)) {
|
||||||
logger.debug("Probing suspected binary {}", url);
|
logger.debug("Probing suspected binary {}", url);
|
||||||
|
|
||||||
var head = createHeadRequest(url);
|
var head = createHeadRequest(url);
|
||||||
@ -151,7 +151,7 @@ public class HttpFetcher {
|
|||||||
|
|
||||||
try (var rsp = call.execute()) {
|
try (var rsp = call.execute()) {
|
||||||
var contentTypeHeader = rsp.header("Content-type");
|
var contentTypeHeader = rsp.header("Content-type");
|
||||||
if (contentTypeHeader != null && !isAllowableContentType(contentTypeHeader)) {
|
if (contentTypeHeader != null && !contentTypeLogic.isAllowableContentType(contentTypeHeader)) {
|
||||||
return createErrorResponse(url, rsp, CrawlerDocumentStatus.BAD_CONTENT_TYPE, "Early probe failed");
|
return createErrorResponse(url, rsp, CrawlerDocumentStatus.BAD_CONTENT_TYPE, "Early probe failed");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -163,9 +163,6 @@ public class HttpFetcher {
|
|||||||
var get = createGetRequest(url);
|
var get = createGetRequest(url);
|
||||||
var call = client.newCall(get);
|
var call = client.newCall(get);
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
try (var rsp = call.execute()) {
|
try (var rsp = call.execute()) {
|
||||||
return extractBody(url, rsp);
|
return extractBody(url, rsp);
|
||||||
}
|
}
|
||||||
@ -217,14 +214,14 @@ public class HttpFetcher {
|
|||||||
byteStream = new BOMInputStream(byteStream);
|
byteStream = new BOMInputStream(byteStream);
|
||||||
|
|
||||||
var contentTypeHeader = rsp.header("Content-type");
|
var contentTypeHeader = rsp.header("Content-type");
|
||||||
if (contentTypeHeader != null && !isAllowableContentType(contentTypeHeader)) {
|
if (contentTypeHeader != null && !contentTypeLogic.isAllowableContentType(contentTypeHeader)) {
|
||||||
return createErrorResponse(url, rsp, CrawlerDocumentStatus.BAD_CONTENT_TYPE, "");
|
return createErrorResponse(url, rsp, CrawlerDocumentStatus.BAD_CONTENT_TYPE, "");
|
||||||
}
|
}
|
||||||
|
|
||||||
byte[] data = byteStream.readNBytes(maxFetchSize);
|
byte[] data = byteStream.readNBytes(maxFetchSize);
|
||||||
|
|
||||||
var contentType = ContentTypeParser.parse(contentTypeHeader, data);
|
var contentType = ContentTypeParser.parse(contentTypeHeader, data);
|
||||||
if (!isAllowableContentType(contentType.contentType)) {
|
if (!contentTypeLogic.isAllowableContentType(contentType.contentType)) {
|
||||||
return createErrorResponse(url, rsp, CrawlerDocumentStatus.BAD_CONTENT_TYPE, "");
|
return createErrorResponse(url, rsp, CrawlerDocumentStatus.BAD_CONTENT_TYPE, "");
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -262,25 +259,6 @@ public class HttpFetcher {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private final Predicate<String> probableHtmlPattern = Pattern.compile("^.*\\.(htm|html|php|txt)(\\?.*)?$").asPredicate();
|
|
||||||
private final Predicate<String> probableBinaryPattern = Pattern.compile("^.*\\.[a-z]+$").asPredicate();
|
|
||||||
|
|
||||||
public boolean isUrlLikeBinary(EdgeUrl url) {
|
|
||||||
String urlString = url.toString().toLowerCase();
|
|
||||||
|
|
||||||
return (!probableHtmlPattern.test(urlString) && probableBinaryPattern.test(urlString));
|
|
||||||
}
|
|
||||||
|
|
||||||
private boolean isAllowableContentType(String contentType) {
|
|
||||||
return allowAllContentTypes || contentType.startsWith("text")
|
|
||||||
|| contentType.startsWith("application/xhtml")
|
|
||||||
|| contentType.startsWith("application/xml")
|
|
||||||
|| contentType.startsWith("application/atom+xml")
|
|
||||||
|| contentType.startsWith("application/rss+xml")
|
|
||||||
|| contentType.startsWith("application/x-rss+xml")
|
|
||||||
|| contentType.startsWith("application/rdf+xml")
|
|
||||||
|| contentType.startsWith("x-rss+xml");
|
|
||||||
}
|
|
||||||
|
|
||||||
public SimpleRobotRules fetchRobotRules(EdgeDomain domain) {
|
public SimpleRobotRules fetchRobotRules(EdgeDomain domain) {
|
||||||
return fetchRobotsForProto("https", domain)
|
return fetchRobotsForProto("https", domain)
|
||||||
|
@ -0,0 +1,53 @@
|
|||||||
|
package nu.marginalia.wmsa.edge.crawling.retreival.logic;
|
||||||
|
|
||||||
|
import nu.marginalia.wmsa.edge.model.EdgeUrl;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Set;
|
||||||
|
import java.util.function.Predicate;
|
||||||
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
|
public class ContentTypeLogic {
|
||||||
|
|
||||||
|
private static final Predicate<String> probableHtmlPattern = Pattern.compile("^.*\\.(htm|html|php|txt)$").asMatchPredicate();
|
||||||
|
private static final Predicate<String> probableBinaryPattern = Pattern.compile("^.*\\.[a-z]+$").asMatchPredicate();
|
||||||
|
private static final Set<String> blockedContentTypes = Set.of("text/css", "text/javascript");
|
||||||
|
private static final List<String> acceptedContentTypePrefixes = List.of(
|
||||||
|
"text/",
|
||||||
|
"application/xhtml",
|
||||||
|
"application/xml",
|
||||||
|
"application/atom+xml",
|
||||||
|
"application/rss+xml",
|
||||||
|
"application/x-rss+xml",
|
||||||
|
"application/rdf+xml",
|
||||||
|
"x-rss+xml"
|
||||||
|
);
|
||||||
|
private boolean allowAllContentTypes = false;
|
||||||
|
|
||||||
|
public void setAllowAllContentTypes(boolean allowAllContentTypes) {
|
||||||
|
this.allowAllContentTypes = allowAllContentTypes;
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean isUrlLikeBinary(EdgeUrl url) {
|
||||||
|
String pathLowerCase = url.path.toLowerCase();
|
||||||
|
|
||||||
|
if (probableHtmlPattern.test(pathLowerCase))
|
||||||
|
return false;
|
||||||
|
|
||||||
|
return probableBinaryPattern.test(pathLowerCase);
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean isAllowableContentType(String contentType) {
|
||||||
|
if (allowAllContentTypes)
|
||||||
|
return true;
|
||||||
|
if (blockedContentTypes.contains(contentType)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
for (var prefix : acceptedContentTypePrefixes) {
|
||||||
|
if (contentType.startsWith(prefix))
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
@ -1,4 +1,4 @@
|
|||||||
package nu.marginalia.wmsa.edge.crawling.retreival;
|
package nu.marginalia.wmsa.edge.crawling.retreival.logic;
|
||||||
|
|
||||||
import crawlercommons.mimetypes.MimeTypeDetector;
|
import crawlercommons.mimetypes.MimeTypeDetector;
|
||||||
import nu.marginalia.wmsa.edge.model.crawl.EdgeContentType;
|
import nu.marginalia.wmsa.edge.model.crawl.EdgeContentType;
|
@ -8,6 +8,7 @@ import nu.marginalia.wmsa.edge.converting.processor.logic.QueryParams;
|
|||||||
|
|
||||||
import java.net.URI;
|
import java.net.URI;
|
||||||
import java.net.URISyntaxException;
|
import java.net.URISyntaxException;
|
||||||
|
import java.util.Optional;
|
||||||
import java.util.regex.Pattern;
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
@Getter @Setter @Builder @EqualsAndHashCode
|
@Getter @Setter @Builder @EqualsAndHashCode
|
||||||
@ -30,6 +31,14 @@ public class EdgeUrl implements WideHashable {
|
|||||||
this(new URI(urlencodeFixer(url)));
|
this(new URI(urlencodeFixer(url)));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static Optional<EdgeUrl> parse(String url) {
|
||||||
|
try {
|
||||||
|
return Optional.of(new EdgeUrl(url));
|
||||||
|
} catch (URISyntaxException e) {
|
||||||
|
return Optional.empty();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
private static Pattern badCharPattern = Pattern.compile("[ \t\n\"<>\\[\\]()',|]");
|
private static Pattern badCharPattern = Pattern.compile("[ \t\n\"<>\\[\\]()',|]");
|
||||||
|
|
||||||
public static String urlencodeFixer(String url) throws URISyntaxException {
|
public static String urlencodeFixer(String url) throws URISyntaxException {
|
||||||
|
@ -3,6 +3,7 @@ package nu.marginalia.wmsa.edge.crawling;
|
|||||||
import lombok.SneakyThrows;
|
import lombok.SneakyThrows;
|
||||||
import nu.marginalia.wmsa.edge.crawling.retreival.HttpFetcher;
|
import nu.marginalia.wmsa.edge.crawling.retreival.HttpFetcher;
|
||||||
import nu.marginalia.wmsa.edge.crawling.retreival.HttpRedirectResolver;
|
import nu.marginalia.wmsa.edge.crawling.retreival.HttpRedirectResolver;
|
||||||
|
import nu.marginalia.wmsa.edge.crawling.retreival.logic.ContentTypeLogic;
|
||||||
import nu.marginalia.wmsa.edge.model.EdgeUrl;
|
import nu.marginalia.wmsa.edge.model.EdgeUrl;
|
||||||
import org.junit.jupiter.api.Assertions;
|
import org.junit.jupiter.api.Assertions;
|
||||||
import org.junit.jupiter.api.Test;
|
import org.junit.jupiter.api.Test;
|
||||||
@ -14,15 +15,15 @@ class HttpFetcherTest {
|
|||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
@Test
|
@Test
|
||||||
void testUrlPattern() {
|
void testUrlPattern() {
|
||||||
var fetcher = new HttpFetcher("nu.marginalia.edge-crawler");
|
ContentTypeLogic contentTypeLogic = new ContentTypeLogic();
|
||||||
|
|
||||||
Assertions.assertFalse(fetcher.isUrlLikeBinary(new EdgeUrl("https://marginalia.nu/log.txt")));
|
Assertions.assertFalse(contentTypeLogic.isUrlLikeBinary(new EdgeUrl("https://marginalia.nu/log.txt")));
|
||||||
Assertions.assertTrue(fetcher.isUrlLikeBinary(new EdgeUrl("https://marginalia.nu/log.bin")));
|
Assertions.assertTrue(contentTypeLogic.isUrlLikeBinary(new EdgeUrl("https://marginalia.nu/log.bin")));
|
||||||
Assertions.assertTrue(fetcher.isUrlLikeBinary(new EdgeUrl("https://marginalia.nu/log.tar.gz")));
|
Assertions.assertTrue(contentTypeLogic.isUrlLikeBinary(new EdgeUrl("https://marginalia.nu/log.tar.gz")));
|
||||||
Assertions.assertFalse(fetcher.isUrlLikeBinary(new EdgeUrl("https://marginalia.nu/log.htm")));
|
Assertions.assertFalse(contentTypeLogic.isUrlLikeBinary(new EdgeUrl("https://marginalia.nu/log.htm")));
|
||||||
Assertions.assertFalse(fetcher.isUrlLikeBinary(new EdgeUrl("https://marginalia.nu/log.html")));
|
Assertions.assertFalse(contentTypeLogic.isUrlLikeBinary(new EdgeUrl("https://marginalia.nu/log.html")));
|
||||||
Assertions.assertFalse(fetcher.isUrlLikeBinary(new EdgeUrl("https://marginalia.nu/log")));
|
Assertions.assertFalse(contentTypeLogic.isUrlLikeBinary(new EdgeUrl("https://marginalia.nu/log")));
|
||||||
Assertions.assertFalse(fetcher.isUrlLikeBinary(new EdgeUrl("https://marginalia.nu/log.php?id=1")));
|
Assertions.assertFalse(contentTypeLogic.isUrlLikeBinary(new EdgeUrl("https://marginalia.nu/log.php?id=1")));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
Loading…
Reference in New Issue
Block a user