Merge pull request #62 from MarginaliaSearch/warc

(WIP) Use WARCs in the crawler

commit 8bbb533c9a
@@ -4,5 +4,6 @@ public enum ConvertAction {
     ConvertCrawlData,
     SideloadEncyclopedia,
     SideloadDirtree,
+    SideloadWarc,
     SideloadStackexchange
 }
@@ -38,6 +38,13 @@ public class ConvertRequest {
                 destId,
                 null);
     }

+    public static ConvertRequest forWarc(Path sourcePath, FileStorageId destId) {
+        return new ConvertRequest(ConvertAction.SideloadWarc,
+                sourcePath.toString(),
+                null,
+                destId,
+                null);
+    }
+
     public static ConvertRequest forStackexchange(Path sourcePath, FileStorageId destId) {
         return new ConvertRequest(ConvertAction.SideloadStackexchange,
@@ -224,12 +224,19 @@ public class EdgeUrl implements Serializable {
     }

     public URL asURL() throws MalformedURLException {
-        int port = this.port != null ? this.port : switch(proto) {
-            case "http" -> 80;
-            case "https" -> 443;
-            default -> 0;
-        };
-
-        return new URL(this.proto, this.domain.toString(), port, this.path);
+        try {
+            return asURI().toURL();
+        }
+        catch (URISyntaxException e) {
+            throw new MalformedURLException(e.getMessage());
+        }
+    }
+
+    public URI asURI() throws URISyntaxException {
+        if (port != null) {
+            return new URI(this.proto, null, this.domain.toString(), this.port, this.path, this.param, null);
+        }
+
+        return new URI(this.proto, this.domain.toString(), this.path, this.param, null);
     }
 }
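The rewritten asURL() now delegates to asURI(), so an explicit port and the query component (param) survive the conversion instead of being dropped by the old hand-built java.net.URL. A minimal caller sketch (hypothetical, not part of this changeset; EdgeUrl(String) throws URISyntaxException, as it is used elsewhere in this PR):

    // assumes imports of java.net.URI, java.net.URISyntaxException and nu.marginalia.model.EdgeUrl
    static URI toUri(String rawUrl) throws URISyntaxException {
        EdgeUrl url = new EdgeUrl(rawUrl);  // e.g. "https://www.example.com:8443/search?q=warc"
        return url.asURI();                 // keeps the explicit port and the query string
    }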
code/features-crawl/content-type/build.gradle (new file, 29 lines)
@@ -0,0 +1,29 @@
+plugins {
+    id 'java'
+
+
+    id 'jvm-test-suite'
+}
+
+java {
+    toolchain {
+        languageVersion.set(JavaLanguageVersion.of(21))
+    }
+}
+
+dependencies {
+    implementation project(':code:common:model')
+    implementation libs.crawlercommons
+    implementation libs.notnull
+
+    implementation libs.bundles.gson
+    implementation libs.bundles.slf4j
+    testImplementation libs.bundles.slf4j.test
+
+    implementation libs.jsoup
+    implementation libs.commons.lang3
+
+    testImplementation libs.bundles.slf4j.test
+    testImplementation libs.bundles.junit
+    testImplementation libs.mockito
+}
@@ -0,0 +1,28 @@
+package nu.marginalia.contenttype;
+
+import org.apache.commons.lang3.StringUtils;
+
+/** Content type and charset of a document
+ * @param contentType The content type, e.g. "text/html"
+ * @param charset The charset, e.g. "UTF-8"
+ */
+public record ContentType(String contentType, String charset) {
+    public static ContentType parse(String contentTypeHeader) {
+        String[] parts = StringUtils.split(contentTypeHeader, ";", 2);
+        String contentType = parts[0].trim();
+        String charset = parts.length > 1 ? parts[1].trim() : "UTF-8";
+
+        return new ContentType(contentType, charset);
+    }
+
+    public boolean is(String contentType) {
+        return this.contentType.equalsIgnoreCase(contentType);
+    }
+
+    public String toString() {
+        if (charset == null || charset.isBlank())
+            return contentType;
+
+        return STR."\{contentType}; charset=\{charset}";
+    }
+}
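A small usage sketch of the new record (illustrative, not part of the diff):

    var html = ContentType.parse("text/html");
    // html.contentType()   -> "text/html"
    // html.charset()       -> "UTF-8" (the default when the header carries no parameters)
    // html.is("TEXT/HTML") -> true; the comparison is case-insensitive
    // html.toString()      -> "text/html; charset=UTF-8"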
@@ -1,7 +1,8 @@
-package nu.marginalia.crawl.retreival.logic;
+package nu.marginalia.contenttype;

 import crawlercommons.mimetypes.MimeTypeDetector;
-import nu.marginalia.crawling.model.ContentType;
+import org.jetbrains.annotations.NotNull;
+import org.jetbrains.annotations.Nullable;
 import org.jsoup.Jsoup;

 import java.util.Arrays;
@@ -11,28 +12,40 @@ public class ContentTypeParser {

     static final MimeTypeDetector mimeTypeDetector = new MimeTypeDetector();

-    public static ContentType parse(String contentType, byte[] data) {
-        return getContentTypeFromContentTypeString(contentType)
-                .or(() -> getContentTypeStringFromTag(data))
+    /** Parse the content type and charset from a content type header and/or the body of a document,
+     * best effort
+     */
+    public static ContentType parseContentType(
+            @Nullable String contentTypeHeader,
+            @NotNull byte[] body)
+    {
+        return getContentTypeFromContentTypeString(contentTypeHeader)
+                .or(() -> getContentTypeStringFromTag(body))
                 .orElseGet(() -> {
-                    Optional<String> charset = getCharsetFromTag(data);
+                    Optional<String> charset = getCharsetFromTag(body);
                     return new ContentType(
-                            Optional.ofNullable(contentType)
-                                    .or(() -> Optional.ofNullable(mimeTypeDetector.detect(data)))
-                                    .orElseGet(() -> ContentTypeParser.shittyMimeSniffer(data)), charset.orElse("ISO_8859_1"));
+                            Optional.ofNullable(contentTypeHeader)
+                                    .or(() -> Optional.ofNullable(mimeTypeDetector.detect(body)))
+                                    .orElseGet(() -> ContentTypeParser.shittyMimeSniffer(body)), charset.orElse("ISO_8859_1"));
                 });
     }

-    private static Optional<ContentType> getContentTypeFromContentTypeString(String contentType) {
-        if (contentType != null && contentType.contains(";")) {
-            var parts = contentType.split(";");
-            var content = parts[0].trim();
-            var extra = parts[1].trim();
-            if (extra.startsWith("charset=")) {
-                return Optional.of(new ContentType(content, extra.substring("charset=".length())));
-            }
-        }
-        return Optional.empty();
+    /** Parse the charset from a content type string. */
+    private static Optional<ContentType> getContentTypeFromContentTypeString(@Nullable String contentType) {
+        if (contentType == null)
+            return Optional.empty();
+
+        if (!contentType.contains(";"))
+            return Optional.empty();
+
+        var parts = contentType.split(";");
+        var content = parts[0].trim();
+        var extra = parts[1].trim();
+
+        if (!extra.startsWith("charset="))
+            return Optional.empty();
+
+        return Optional.of(new ContentType(content, extra.substring("charset=".length())));
     }

     private static String shittyMimeSniffer(byte[] data) {
@@ -45,6 +58,7 @@ public class ContentTypeParser {

         String startStr = new String(Arrays.copyOf(data, Math.min(128, data.length))).trim().toLowerCase();
         if (startStr.contains("<!doctype html") || startStr.contains("<html")) {
+            // note we use contains here, since xhtml may be served with a <?xml-style header first
             return "text/html";
         }
         else {
@@ -0,0 +1,27 @@
+package nu.marginalia.contenttype;
+
+import java.nio.charset.*;
+
+public class DocumentBodyToString {
+
+    /** Get the string data from a document body, given the content type and charset */
+    public static String getStringData(ContentType type, byte[] data) {
+        Charset charset;
+        try {
+            charset = Charset.forName(type.charset());
+        }
+        catch (IllegalCharsetNameException ex) {
+            // Fall back to UTF-8 if we don't understand what this is. It's *probably* fine? Maybe?
+            charset = StandardCharsets.UTF_8;
+        }
+        catch (UnsupportedCharsetException ex) {
+            // This is usually like Macintosh Latin
+            // (https://en.wikipedia.org/wiki/Macintosh_Latin_encoding)
+            //
+            // It's close enough to 8859-1 to serve
+            charset = StandardCharsets.ISO_8859_1;
+        }
+
+        return new String(data, charset);
+    }
+}
@@ -0,0 +1,50 @@
+package nu.marginalia.contenttype;
+
+import org.junit.jupiter.api.Test;
+
+import java.nio.charset.StandardCharsets;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
+
+public class ContentTypeParserTest {
+
+    @Test
+    public void testParseContentTypeWithHeader() {
+        byte[] body = "<!DOCTYPE html><html><head><title>Title</title></head><body></body></html>".getBytes(StandardCharsets.UTF_8);
+        String contentTypeHeader = "text/html; charset=UTF-8";
+        ContentType result = ContentTypeParser.parseContentType(contentTypeHeader, body);
+        assertNotNull(result);
+        assertEquals("text/html", result.contentType());
+        assertEquals("UTF-8", result.charset());
+    }
+
+    @Test
+    public void testParseContentTypeWithMetaCharset() {
+        byte[] body = "<!DOCTYPE html><html><head><meta charset=\"UTF-8\"><title>Title</title></head><body></body></html>".getBytes(StandardCharsets.UTF_8);
+        ContentType result = ContentTypeParser.parseContentType(null, body);
+        assertNotNull(result);
+        assertEquals("text/html", result.contentType());
+        assertEquals("UTF-8", result.charset());
+    }
+
+    @Test
+    public void testParseContentTypeWithHeaderValueAbsent() {
+        byte[] body = "Some random text.".getBytes(StandardCharsets.UTF_8);
+        String contentTypeHeader = "text/plain";
+        ContentType result = ContentTypeParser.parseContentType(contentTypeHeader, body);
+        assertNotNull(result);
+        assertEquals("text/plain", result.contentType());
+        assertEquals("ISO_8859_1", result.charset());
+    }
+
+    @Test
+    public void testParseContentTypeWithBinaryData() {
+        byte[] body = new byte[128];
+        body[0] = 31; // ascii value less than 32
+        ContentType result = ContentTypeParser.parseContentType(null, body);
+        assertNotNull(result);
+        assertEquals("application/binary", result.contentType());
+        assertEquals("ISO_8859_1", result.charset());
+    }
+}
@@ -0,0 +1,48 @@
+package nu.marginalia.contenttype;
+
+import org.junit.jupiter.api.Test;
+import static org.junit.jupiter.api.Assertions.*;
+
+import java.nio.charset.StandardCharsets;
+
+public class DocumentBodyToStringTest {
+    @Test
+    public void testGetStringData_onUTF8(){
+
+        ContentType type = new ContentType("text/html", "UTF-8");
+
+        String expected = "Hello, World!";
+        byte[] data = expected.getBytes(StandardCharsets.UTF_8);
+
+        String result = DocumentBodyToString.getStringData(type, data);
+
+        assertEquals(expected, result, "Result should match the expected string");
+    }
+
+    @Test
+    public void testGetStringData_onIllegalCharsetName(){
+
+        ContentType type = new ContentType("text/html", "unsupportedname");
+
+        String expected = "Hello, World!";
+        byte[] data = expected.getBytes(StandardCharsets.UTF_8);
+
+        String result = DocumentBodyToString.getStringData(type, data);
+
+        assertEquals(expected, result, "Result should match the expected string if charset is illegal name");
+    }
+
+    @Test
+    public void testGetStringData_onUnsupportedCharset(){
+
+        ContentType type = new ContentType("text/html", "Macintosh");
+
+        String expected = "Hello, World!";
+        byte[] data = expected.getBytes(StandardCharsets.UTF_8);
+
+        String result = DocumentBodyToString.getStringData(type, data);
+
+        assertEquals(expected, result, "Result should fall back to UTF-8 parsing if charset is unsupported");
+    }
+
+}
@@ -37,7 +37,9 @@ public class GeoIpDictionary {
                 throw new RuntimeException(e);
             }
             finally {
-                this.notifyAll();
+                synchronized (this) {
+                    this.notifyAll();
+                }
             }
         });
     }
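The point of this fix is that Object.notifyAll() may only be called by a thread that holds the object's monitor; calling it outside a synchronized block throws IllegalMonitorStateException at runtime. A minimal illustration of the rule (hypothetical class, not related to GeoIpDictionary's actual fields):

    class Loader {
        private boolean ready = false;

        void finishLoading() {
            synchronized (this) {   // the monitor must be held before notifyAll()
                ready = true;
                this.notifyAll();
            }
        }

        void awaitReady() throws InterruptedException {
            synchronized (this) {
                while (!ready) {
                    this.wait();    // wait() has the same requirement
                }
            }
        }
    }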
@@ -15,18 +15,28 @@ java {
 dependencies {
     implementation project(':code:common:model')
     implementation project(':code:common:db')
+    implementation project(':code:common:config')
     implementation project(':code:common:process')
     implementation project(':code:libraries:big-string')
     implementation project(':code:api:index-api')
     implementation project(':code:common:service-discovery')
     implementation project(':code:common:service-client')
+    implementation project(':code:features-crawl:content-type')
     implementation project(':code:libraries:language-processing')
+    implementation project(':third-party:parquet-floor')
+    implementation project(':third-party:commons-codec')

     implementation libs.bundles.slf4j

     implementation libs.notnull
+    implementation libs.bundles.parquet

+    implementation libs.jwarc
     implementation libs.gson
+    implementation libs.commons.io
+    implementation libs.commons.lang3
+    implementation libs.okhttp3
+    implementation libs.jsoup
     implementation libs.snakeyaml
     implementation libs.zstd

@@ -1,5 +1,6 @@
-package nu.marginalia.crawl.retreival.logic;
+package nu.marginalia.crawling.body;

+import nu.marginalia.contenttype.ContentType;
 import nu.marginalia.model.EdgeUrl;

 import java.util.List;
@@ -37,6 +38,9 @@ public class ContentTypeLogic {
         return probableBinaryPattern.test(pathLowerCase);
     }

+    public boolean isAllowableContentType(ContentType contentType) {
+        return isAllowableContentType(contentType.contentType());
+    }
     public boolean isAllowableContentType(String contentType) {
         if (allowAllContentTypes)
             return true;
@@ -0,0 +1,76 @@
+package nu.marginalia.crawling.body;
+
+import nu.marginalia.contenttype.ContentType;
+import nu.marginalia.contenttype.ContentTypeParser;
+import nu.marginalia.contenttype.DocumentBodyToString;
+import nu.marginalia.crawling.model.CrawlerDocumentStatus;
+import org.apache.commons.io.input.BOMInputStream;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.util.zip.GZIPInputStream;
+
+public class DocumentBodyExtractor {
+    private static final ContentTypeLogic contentTypeLogic = new ContentTypeLogic();
+
+    private static final Logger logger = LoggerFactory.getLogger(DocumentBodyExtractor.class);
+
+    /** Extract the body from a fetch result as a byte array. */
+    public static DocumentBodyResult<byte[]> asBytes(HttpFetchResult result) {
+        if (result instanceof HttpFetchResult.ResultOk fetchOk) {
+            return asBytes(fetchOk);
+        }
+        else if (result instanceof HttpFetchResult.Result304ReplacedWithReference retained) {
+            return new DocumentBodyResult.Ok<>(retained.contentType(), retained.body().getBytes());
+        }
+
+        return new DocumentBodyResult.Error<>(CrawlerDocumentStatus.ERROR, "Fetch Result Not Ok");
+    }
+
+    /** Extract the body from a fetch result as a string. This function performs
+     * content-type checks to ensure that the content-type is such that this operation
+     * makes sense.
+     *
+     * @see ContentTypeLogic#isAllowableContentType(String)
+     * */
+    public static DocumentBodyResult<String> asString(HttpFetchResult result) {
+        return asBytes(result).flatMap(DocumentBodyExtractor::toStringResult);
+    }
+
+    private static DocumentBodyResult<String> toStringResult(ContentType contentType, byte[] bytes) {
+        if (contentTypeLogic.isAllowableContentType(contentType)) {
+            try {
+                return new DocumentBodyResult.Ok<>(contentType, DocumentBodyToString.getStringData(contentType, bytes));
+            }
+            catch (Exception ex) {
+                return new DocumentBodyResult.Error<>(CrawlerDocumentStatus.BAD_CONTENT_TYPE, "");
+            }
+        }
+        else {
+            return new DocumentBodyResult.Error<>(CrawlerDocumentStatus.BAD_CONTENT_TYPE, "");
+        }
+    }
+
+    /** Extract the body from a fetch result as a byte array. */
+    public static DocumentBodyResult<byte[]> asBytes(HttpFetchResult.ResultOk rsp) {
+        try {
+            var byteStream = rsp.getInputStream();
+
+            if ("gzip".equals(rsp.header("Content-Encoding"))) {
+                byteStream = new GZIPInputStream(byteStream);
+            }
+            byteStream = new BOMInputStream(byteStream);
+
+            var contentTypeHeader = rsp.header("Content-Type");
+
+            byte[] data = byteStream.readAllBytes(); // size is limited by WarcRecorder
+            var contentType = ContentTypeParser.parseContentType(contentTypeHeader, data);
+
+            return new DocumentBodyResult.Ok<>(contentType, data);
+        } catch (Exception ex) {
+            logger.error("Failed to extract body", ex);
+            return new DocumentBodyResult.Error<>(CrawlerDocumentStatus.ERROR, "");
+        }
+    }
+
+}
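A sketch of how a caller might consume the extractor, pattern-matching on the sealed result type the same way WarcSerializableCrawlDataStream does further down in this diff (hypothetical helper, not part of the changeset):

    static void printBodySize(HttpFetchResult result) {
        // asString() rejects disallowed content types and undecodable bodies with an Error result
        if (DocumentBodyExtractor.asString(result) instanceof DocumentBodyResult.Ok<String> ok) {
            System.out.println(ok.contentType() + ": " + ok.body().length() + " characters");
        }
    }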
@@ -0,0 +1,58 @@
+package nu.marginalia.crawling.body;
+
+import nu.marginalia.contenttype.ContentType;
+import nu.marginalia.crawling.model.CrawlerDocumentStatus;
+
+import java.util.Optional;
+import java.util.function.BiFunction;
+
+public sealed interface DocumentBodyResult<T> {
+    record Ok<T>(ContentType contentType, T body) implements DocumentBodyResult<T> {
+
+        @Override
+        public <T2> Optional<T2> mapOpt(BiFunction<ContentType, T, T2> mapper) {
+            return Optional.of(mapper.apply(contentType, body));
+        }
+        @Override
+        public <T2> Optional<T2> flatMapOpt(BiFunction<ContentType, T, Optional<T2>> mapper) {
+            return mapper.apply(contentType, body);
+        }
+
+        @Override
+        public <T2> DocumentBodyResult<T2> flatMap(BiFunction<ContentType, T, DocumentBodyResult<T2>> mapper) {
+            return mapper.apply(contentType, body);
+        }
+
+        @Override
+        public void ifPresent(ExConsumer<T, Exception> consumer) throws Exception {
+            consumer.accept(contentType, body);
+        }
+    }
+    record Error<T>(CrawlerDocumentStatus status, String why) implements DocumentBodyResult<T> {
+        @Override
+        public <T2> Optional<T2> mapOpt(BiFunction<ContentType, T, T2> mapper) {
+            return Optional.empty();
+        }
+        public <T2> Optional<T2> flatMapOpt(BiFunction<ContentType, T, Optional<T2>> mapper) { return Optional.empty(); }
+
+        @Override
+        @SuppressWarnings("unchecked")
+        public <T2> DocumentBodyResult<T2> flatMap(BiFunction<ContentType, T, DocumentBodyResult<T2>> mapper) {
+            return (DocumentBodyResult<T2>) this;
+        }
+
+        @Override
+        public void ifPresent(ExConsumer<T, Exception> consumer) throws Exception {
+        }
+    }
+
+    <T2> Optional<T2> mapOpt(BiFunction<ContentType, T, T2> mapper);
+    <T2> Optional<T2> flatMapOpt(BiFunction<ContentType, T, Optional<T2>> mapper);
+    <T2> DocumentBodyResult<T2> flatMap(BiFunction<ContentType, T, DocumentBodyResult<T2>> mapper);
+
+    void ifPresent(ExConsumer<T,Exception> consumer) throws Exception;
+
+    interface ExConsumer<T,E extends Exception> {
+        void accept(ContentType contentType, T t) throws E;
+    }
+}
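The result type doubles as a small Ok/Error either-type; a hedged sketch of the mapOpt/ifPresent style it enables (illustrative values only, not part of the changeset):

    static void demo() throws Exception {
        DocumentBodyResult<String> result =
                new DocumentBodyResult.Ok<>(new ContentType("text/html", "UTF-8"), "<html></html>");

        // mapOpt collapses the result to an Optional, discarding the Error branch
        Optional<Integer> length = result.mapOpt((contentType, body) -> body.length());

        // ifPresent runs a side effect on the Ok branch only
        result.ifPresent((contentType, body) -> System.out.println(contentType + " -> " + body.length()));
    }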
@@ -0,0 +1,160 @@
+package nu.marginalia.crawling.body;
+
+import nu.marginalia.contenttype.ContentType;
+import okhttp3.Headers;
+import org.jsoup.Jsoup;
+import org.netpreserve.jwarc.MessageHeaders;
+import org.netpreserve.jwarc.WarcResponse;
+import org.jsoup.nodes.Document;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.InetAddress;
+import java.net.URI;
+import java.util.Optional;
+
+/* FIXME: This interface has a very unfortunate name that is not very descriptive.
+ */
+public sealed interface HttpFetchResult {
+
+    boolean isOk();
+
+    /** Convert a WarcResponse to a HttpFetchResult */
+    static HttpFetchResult importWarc(WarcResponse response) {
+        try {
+            var http = response.http();
+
+            try (var body = http.body()) {
+                byte[] bytes = body.stream().readAllBytes();
+
+                String ipAddress = response
+                        .ipAddress()
+                        .map(InetAddress::getHostAddress)
+                        .orElse("");
+
+                return new ResultOk(
+                        response.targetURI(),
+                        http.status(),
+                        http.headers(),
+                        ipAddress,
+                        bytes,
+                        0,
+                        bytes.length
+                );
+            }
+        }
+        catch (Exception ex) {
+            return new ResultException(ex);
+        }
+    }
+
+
+    /** Corresponds to a successful retrieval of a document
+     * from the remote server. Note that byte[] is only borrowed
+     * and subsequent calls may overwrite the contents of this buffer.
+     */
+    record ResultOk(URI uri,
+                    int statusCode,
+                    Headers headers,
+                    String ipAddress,
+                    byte[] bytesRaw,
+                    int bytesStart,
+                    int bytesLength
+    ) implements HttpFetchResult {
+
+        public boolean isOk() {
+            return statusCode >= 200 && statusCode < 300;
+        }
+
+        public ResultOk(URI uri,
+                        int statusCode,
+                        MessageHeaders headers,
+                        String ipAddress,
+                        byte[] bytesRaw,
+                        int bytesStart,
+                        int bytesLength) {
+            this(uri, statusCode, convertHeaders(headers), ipAddress, bytesRaw, bytesStart, bytesLength);
+        }
+
+        private static Headers convertHeaders(MessageHeaders headers) {
+            var ret = new Headers.Builder();
+            for (var header : headers.map().entrySet()) {
+                for (var value : header.getValue()) {
+                    ret.add(header.getKey(), value);
+                }
+            }
+            return ret.build();
+        }
+
+        public InputStream getInputStream() {
+            return new ByteArrayInputStream(bytesRaw, bytesStart, bytesLength);
+        }
+
+        public Optional<Document> parseDocument() throws IOException {
+            return DocumentBodyExtractor.asString(this).flatMapOpt((contentType, body) -> {
+                if (contentType.is("text/html")) {
+                    return Optional.of(Jsoup.parse(body));
+                }
+                else {
+                    return Optional.empty();
+                }
+            });
+        }
+
+        public String header(String name) {
+            return headers.get(name);
+        }
+
+    };
+
+    /** This is a special case where the document was not fetched
+     * because it was already in the database. In this case, we
+     * replace the original data.
+     *
+     * @see Result304Raw for the case where the document has not yet been replaced with the reference data.
+     */
+    record Result304ReplacedWithReference(String url, ContentType contentType, String body) implements HttpFetchResult {
+
+        public boolean isOk() {
+            return true;
+        }
+
+        public Optional<Document> parseDocument() {
+            try {
+                return Optional.of(Jsoup.parse(body));
+            }
+            catch (Exception ex) {
+                return Optional.empty();
+            }
+        }
+    };
+
+    /** Fetching resulted in an exception */
+    record ResultException(Exception ex) implements HttpFetchResult {
+        public boolean isOk() {
+            return false;
+        }
+    };
+
+    /** Fetching resulted in a HTTP 304, the remote content is identical to
+     * our reference copy. This will be replaced with a Result304ReplacedWithReference
+     * at a later stage.
+     *
+     * @see Result304ReplacedWithReference
+     */
+    record Result304Raw() implements HttpFetchResult {
+        public boolean isOk() {
+            return false;
+        }
+    };
+
+    /** No result. This is typically injected at a later stage
+     * of processing, e.g. after filtering out irrelevant responses.
+     */
+    record ResultNone() implements HttpFetchResult {
+        public boolean isOk() {
+            return false;
+        }
+    };
+}
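For illustration, this is roughly how the sealed hierarchy is meant to be consumed when replaying a WARC file with jwarc, mirroring what WarcSerializableCrawlDataStream does later in this diff (hypothetical helper, not part of the changeset):

    // assumes imports of java.io.IOException, java.nio.file.Path and org.netpreserve.jwarc.*
    static void printTitles(Path warcFile) throws IOException {
        try (WarcReader warcReader = new WarcReader(warcFile)) {
            for (WarcRecord record : warcReader) {
                if (record instanceof WarcResponse response) {
                    HttpFetchResult result = HttpFetchResult.importWarc(response);
                    if (result instanceof HttpFetchResult.ResultOk ok && ok.isOk()) {
                        ok.parseDocument().ifPresent(doc -> System.out.println(doc.title()));
                    }
                }
            }
        }
    }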
@@ -1,156 +1,52 @@
 package nu.marginalia.crawling.io;

-import com.github.luben.zstd.RecyclingBufferPool;
-import com.github.luben.zstd.ZstdInputStream;
 import com.google.gson.Gson;
-import nu.marginalia.crawling.model.CrawledDocument;
-import nu.marginalia.crawling.model.CrawledDomain;
-import nu.marginalia.crawling.model.SerializableCrawlData;
+import nu.marginalia.crawling.io.format.LegacySerializableCrawlDataStream;
+import nu.marginalia.crawling.io.format.ParquetSerializableCrawlDataStream;
+import nu.marginalia.crawling.io.format.WarcSerializableCrawlDataStream;
 import nu.marginalia.model.gson.GsonFactory;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;

 import java.io.*;
+import java.nio.file.Files;
 import java.nio.file.Path;
-import java.util.ArrayList;
-import java.util.List;
-import java.util.Optional;
-import java.util.concurrent.ForkJoinPool;
-import java.util.concurrent.TimeUnit;

 public class CrawledDomainReader {
-    private final Gson gson = GsonFactory.get();
-    private final Logger logger = LoggerFactory.getLogger(getClass());
-    private final ForkJoinPool pool = new ForkJoinPool(6);
+    private static final Gson gson = GsonFactory.get();

     public CrawledDomainReader() {
     }

     /** An iterator-like access to domain data This must be closed otherwise it will leak off-heap memory! */
-    public SerializableCrawlDataStream createDataStream(Path fullPath) throws IOException {
-        return new FileReadingSerializableCrawlDataStream(gson, fullPath.toFile());
+    public static SerializableCrawlDataStream createDataStream(Path fullPath) throws IOException {
+        String fileName = fullPath.getFileName().toString();
+        if (fileName.endsWith(".zstd")) {
+            return new LegacySerializableCrawlDataStream(gson, fullPath.toFile());
+        }
+        else if (fileName.endsWith(".warc") || fileName.endsWith(".warc.gz")) {
+            return new WarcSerializableCrawlDataStream(fullPath);
+        }
+        else if (fileName.endsWith(".parquet")) {
+            return new ParquetSerializableCrawlDataStream(fullPath);
+        }
+        else {
+            throw new IllegalArgumentException("Unknown file type: " + fullPath);
+        }
     }

     /** An iterator-like access to domain data. This must be closed otherwise it will leak off-heap memory! */
-    public SerializableCrawlDataStream createDataStream(Path basePath, String domain, String id) throws IOException {
-        return createDataStream(CrawlerOutputFile.getOutputFile(basePath, id, domain));
-    }
-
-    /** Read the entirety of the domain data into memory. This uses a lot of RAM */
-    public CrawledDomain read(Path path) throws IOException {
-        DomainDataAssembler domainData = new DomainDataAssembler();
-
-        try (var br = new BufferedReader(new InputStreamReader(new ZstdInputStream(new FileInputStream(path.toFile()), RecyclingBufferPool.INSTANCE)))) {
-            String line;
-            while ((line = br.readLine()) != null) {
-                if (line.startsWith("//")) {
-                    String identifier = line;
-                    String data = br.readLine();
-
-                    pool.execute(() -> deserializeLine(identifier, data, domainData));
-                }
-            }
-        }
-
-        while (!pool.awaitQuiescence(1, TimeUnit.SECONDS));
-
-        return domainData.assemble();
-    }
-
-
-    private void deserializeLine(String identifier, String data, DomainDataAssembler assembler) {
-        if (null == data) {
-            return;
-        }
-        if (identifier.equals(CrawledDomain.SERIAL_IDENTIFIER)) {
-            assembler.acceptDomain(gson.fromJson(data, CrawledDomain.class));
-        } else if (identifier.equals(CrawledDocument.SERIAL_IDENTIFIER)) {
-            assembler.acceptDoc(gson.fromJson(data, CrawledDocument.class));
-        }
-    }
-
-    public Optional<CrawledDomain> readOptionally(Path path) {
-        try {
-            return Optional.of(read(path));
-        }
-        catch (Exception ex) {
-            return Optional.empty();
-        }
-    }
-
-    private static class DomainDataAssembler {
-        private CrawledDomain domainPrototype;
-        private final List<CrawledDocument> docs = new ArrayList<>();
-
-        public synchronized void acceptDomain(CrawledDomain domain) {
-            this.domainPrototype = domain;
-        }
-
-        public synchronized void acceptDoc(CrawledDocument doc) {
-            docs.add(doc);
-        }
-
-        public synchronized CrawledDomain assemble() {
-            if (!docs.isEmpty()) {
-                if (domainPrototype.doc == null)
-                    domainPrototype.doc = new ArrayList<>();
-
-                domainPrototype.doc.addAll(docs);
-            }
-            return domainPrototype;
-        }
-    }
-
-    private static class FileReadingSerializableCrawlDataStream implements AutoCloseable, SerializableCrawlDataStream {
-        private final Gson gson;
-        private final BufferedReader bufferedReader;
-        private SerializableCrawlData next = null;
-
-        public FileReadingSerializableCrawlDataStream(Gson gson, File file) throws IOException {
-            this.gson = gson;
-            bufferedReader = new BufferedReader(new InputStreamReader(new ZstdInputStream(new FileInputStream(file), RecyclingBufferPool.INSTANCE)));
-        }
-
-        @Override
-        public SerializableCrawlData next() throws IOException {
-            if (hasNext()) {
-                var ret = next;
-                next = null;
-                return ret;
-            }
-            throw new IllegalStateException("No more data");
-        }
-
-        @Override
-        public boolean hasNext() throws IOException {
-            if (next != null)
-                return true;
-
-            String identifier = bufferedReader.readLine();
-            if (identifier == null) {
-                bufferedReader.close();
-                return false;
-            }
-            String data = bufferedReader.readLine();
-            if (data == null) {
-                bufferedReader.close();
-                return false;
-            }
-
-            if (identifier.equals(CrawledDomain.SERIAL_IDENTIFIER)) {
-                next = gson.fromJson(data, CrawledDomain.class);
-            } else if (identifier.equals(CrawledDocument.SERIAL_IDENTIFIER)) {
-                next = gson.fromJson(data, CrawledDocument.class);
-            }
-            else {
-                throw new IllegalStateException("Unknown identifier: " + identifier);
-            }
-            return true;
-        }
-
-        @Override
-        public void close() throws Exception {
-            bufferedReader.close();
-        }
+    public static SerializableCrawlDataStream createDataStream(Path basePath, String domain, String id) throws IOException {
+        Path parquetPath = CrawlerOutputFile.getParquetPath(basePath, id, domain);
+        Path warcPath = CrawlerOutputFile.getWarcPath(basePath, id, domain, CrawlerOutputFile.WarcFileVersion.FINAL);
+
+        if (Files.exists(parquetPath)) {
+            return createDataStream(parquetPath);
+        }
+        if (Files.exists(warcPath)) {
+            return createDataStream(warcPath);
+        }
+        else {
+            return createDataStream(CrawlerOutputFile.getLegacyOutputFile(basePath, id, domain));
+        }
     }
+
 }
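The reader now dispatches on the file extension (.zstd for the legacy format, .warc/.warc.gz, .parquet). A consumer loop might look like this (hypothetical sketch built only on the APIs shown in this diff):

    static void dump(Path crawlDataFile) throws Exception {
        try (SerializableCrawlDataStream stream = CrawledDomainReader.createDataStream(crawlDataFile)) {
            while (stream.hasNext()) {
                SerializableCrawlData data = stream.next();
                // prints "// DOMAIN" for CrawledDomain records and "// DOCUMENT" for CrawledDocument records
                System.out.println(data.getSerialIdentifier());
            }
        }
    }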
@@ -55,7 +55,7 @@ public class CrawledDomainWriter implements AutoCloseable {
     }

     private Path getOutputFile(String id, String name) throws IOException {
-        return CrawlerOutputFile.createOutputPath(outputDir, id, name);
+        return CrawlerOutputFile.createLegacyOutputPath(outputDir, id, name);
     }

     @Override
@@ -9,20 +9,20 @@ import java.nio.file.Path;
 public class CrawlerOutputFile {

     /** Return the Path to a file for the given id and name */
-    public static Path getOutputFile(Path base, String id, String name) {
+    public static Path getLegacyOutputFile(Path base, String id, String name) {
+        id = padId(id);
+
         String first = id.substring(0, 2);
         String second = id.substring(2, 4);

         Path destDir = base.resolve(first).resolve(second);
-        return destDir.resolve(id + "-" + filesystemSafeName(name) + ".zstd");
+        return destDir.resolve(STR."\{id}-\{filesystemSafeName(name)}.zstd");
     }

     /** Return the Path to a file for the given id and name, creating the prerequisite
      * directory structure as necessary. */
-    public static Path createOutputPath(Path base, String id, String name) throws IOException {
-        if (id.length() < 4) {
-            id = Strings.repeat("0", 4 - id.length()) + id;
-        }
+    public static Path createLegacyOutputPath(Path base, String id, String name) throws IOException {
+        id = padId(id);

         String first = id.substring(0, 2);
         String second = id.substring(2, 4);
@@ -31,7 +31,7 @@ public class CrawlerOutputFile {
         if (!Files.exists(destDir)) {
             Files.createDirectories(destDir);
         }
-        return destDir.resolve(id + "-" + filesystemSafeName(name) + ".zstd");
+        return destDir.resolve(STR."\{id}-\{filesystemSafeName(name)}.zstd");
     }

@@ -49,4 +49,71 @@ public class CrawlerOutputFile {

     }

+    public static Path createWarcPath(Path basePath, String id, String domain, WarcFileVersion version) throws IOException {
+        id = padId(id);
+
+        String first = id.substring(0, 2);
+        String second = id.substring(2, 4);
+
+        Path destDir = basePath.resolve(first).resolve(second);
+        if (!Files.exists(destDir)) {
+            Files.createDirectories(destDir);
+        }
+        return destDir.resolve(STR."\{id}-\{filesystemSafeName(domain)}-\{version.suffix}.warc.gz");
+    }
+
+    public static Path createParquetPath(Path basePath, String id, String domain) throws IOException {
+        id = padId(id);
+
+        String first = id.substring(0, 2);
+        String second = id.substring(2, 4);
+
+        Path destDir = basePath.resolve(first).resolve(second);
+        if (!Files.exists(destDir)) {
+            Files.createDirectories(destDir);
+        }
+        return destDir.resolve(STR."\{id}-\{filesystemSafeName(domain)}.parquet");
+    }
+    public static Path getParquetPath(Path basePath, String id, String domain) {
+        id = padId(id);
+
+        String first = id.substring(0, 2);
+        String second = id.substring(2, 4);
+
+        Path destDir = basePath.resolve(first).resolve(second);
+        return destDir.resolve(STR."\{id}-\{filesystemSafeName(domain)}.parquet");
+    }
+    public static Path getWarcPath(Path basePath, String id, String domain, WarcFileVersion version) {
+        id = padId(id);
+
+        String first = id.substring(0, 2);
+        String second = id.substring(2, 4);
+
+        Path destDir = basePath.resolve(first).resolve(second);
+        return destDir.resolve(STR."\{id}-\{filesystemSafeName(domain)}.warc\{version.suffix}");
+    }
+
+    /**
+     * Pads the given ID with leading zeros to ensure it has a length of 4 characters.
+     */
+    private static String padId(String id) {
+        if (id.length() < 4) {
+            id = Strings.repeat("0", 4 - id.length()) + id;
+        }
+
+        return id;
+    }
+
+
+    public enum WarcFileVersion {
+        LIVE("open"),
+        TEMP("tmp"),
+        FINAL("final");
+
+        public final String suffix;
+
+        WarcFileVersion(String suffix) {
+            this.suffix = suffix;
+        }
+    }
 }
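The id is zero-padded to four characters and split into two directory levels, so files belonging to the same crawl job land next to each other. A hedged sketch of the resulting layout (illustrative only; the exact file name also depends on filesystemSafeName, which is not shown in this diff):

    static void layoutExample(Path base) throws IOException {
        // e.g. base/00/07/0007-<safe-domain>-open.warc.gz while the crawl is in progress
        Path live = CrawlerOutputFile.createWarcPath(base, "7", "www.example.com",
                CrawlerOutputFile.WarcFileVersion.LIVE);

        // e.g. base/00/07/0007-<safe-domain>.parquet after conversion
        Path parquet = CrawlerOutputFile.createParquetPath(base, "7", "www.example.com");
    }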
@@ -1,11 +1,13 @@
 package nu.marginalia.crawling.io;

 import nu.marginalia.crawling.model.SerializableCrawlData;
+import org.jetbrains.annotations.Nullable;

 import java.io.IOException;
+import java.nio.file.Path;
 import java.util.Iterator;

-/** Closable iterator over serialized crawl data
+/** Closable iterator exceptional over serialized crawl data
  * The data may appear in any order, and the iterator must be closed.
  *
  * @see CrawledDomainReader
@@ -17,6 +19,8 @@ public interface SerializableCrawlDataStream extends AutoCloseable {

     boolean hasNext() throws IOException;

+    @Nullable
+    default Path path() { return null; }
+
     // Dummy iterator over nothing
     static SerializableCrawlDataStream empty() {
@@ -0,0 +1,73 @@
+package nu.marginalia.crawling.io.format;
+
+import com.github.luben.zstd.RecyclingBufferPool;
+import com.github.luben.zstd.ZstdInputStream;
+import com.google.gson.Gson;
+import nu.marginalia.crawling.io.SerializableCrawlDataStream;
+import nu.marginalia.crawling.model.CrawledDocument;
+import nu.marginalia.crawling.model.CrawledDomain;
+import nu.marginalia.crawling.model.SerializableCrawlData;
+
+import java.io.*;
+import java.nio.file.Path;
+
+/** This class is used to read the old format of crawl data, which was zstd-compressed JSON
+ * with type delimiters between records.
+ */
+public class LegacySerializableCrawlDataStream implements AutoCloseable, SerializableCrawlDataStream {
+    private final Gson gson;
+    private final BufferedReader bufferedReader;
+    private SerializableCrawlData next = null;
+
+    private final Path path;
+    public LegacySerializableCrawlDataStream(Gson gson, File file) throws IOException {
+        this.gson = gson;
+        bufferedReader = new BufferedReader(new InputStreamReader(new ZstdInputStream(new FileInputStream(file), RecyclingBufferPool.INSTANCE)));
+        path = file.toPath();
+    }
+
+    @Override
+    public Path path() {
+        return path;
+    }
+    @Override
+    public SerializableCrawlData next() throws IOException {
+        if (hasNext()) {
+            var ret = next;
+            next = null;
+            return ret;
+        }
+        throw new IllegalStateException("No more data");
+    }
+
+    @Override
+    public boolean hasNext() throws IOException {
+        if (next != null)
+            return true;
+
+        String identifier = bufferedReader.readLine();
+        if (identifier == null) {
+            bufferedReader.close();
+            return false;
+        }
+        String data = bufferedReader.readLine();
+        if (data == null) {
+            bufferedReader.close();
+            return false;
+        }
+
+        if (identifier.equals(CrawledDomain.SERIAL_IDENTIFIER)) {
+            next = gson.fromJson(data, CrawledDomain.class);
+        } else if (identifier.equals(CrawledDocument.SERIAL_IDENTIFIER)) {
+            next = gson.fromJson(data, CrawledDocument.class);
+        } else {
+            throw new IllegalStateException("Unknown identifier: " + identifier);
+        }
+        return true;
+    }
+
+    @Override
+    public void close() throws Exception {
+        bufferedReader.close();
+    }
+}
@@ -0,0 +1,135 @@
+package nu.marginalia.crawling.io.format;
+
+import lombok.SneakyThrows;
+import nu.marginalia.contenttype.ContentType;
+import nu.marginalia.contenttype.DocumentBodyToString;
+import nu.marginalia.crawling.io.SerializableCrawlDataStream;
+import nu.marginalia.crawling.model.*;
+import nu.marginalia.crawling.parquet.CrawledDocumentParquetRecord;
+import nu.marginalia.crawling.parquet.CrawledDocumentParquetRecordFileReader;
+import nu.marginalia.hash.MurmurHash3_128;
+import nu.marginalia.model.EdgeUrl;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.net.URISyntaxException;
+import java.nio.file.Path;
+import java.util.*;
+
+public class ParquetSerializableCrawlDataStream implements AutoCloseable, SerializableCrawlDataStream {
+    private static final Logger logger = LoggerFactory.getLogger(ParquetSerializableCrawlDataStream.class);
+
+    private final MurmurHash3_128 hash = new MurmurHash3_128();
+    private final Iterator<CrawledDocumentParquetRecord> backingIterator;
+    private final Deque<SerializableCrawlData> nextQ = new ArrayDeque<>();
+    private boolean wroteDomainRecord = false;
+    private final Path path;
+
+    public ParquetSerializableCrawlDataStream(Path file) throws IOException {
+        path = file;
+
+        backingIterator = CrawledDocumentParquetRecordFileReader.stream(file).iterator();
+    }
+
+    @Override
+    public Path path() {
+        return path;
+    }
+
+    @Override
+    @SneakyThrows
+    public boolean hasNext() {
+        while (backingIterator.hasNext() && nextQ.isEmpty()) {
+            var nextRecord = backingIterator.next();
+            if (!wroteDomainRecord) {
+                createDomainRecord(nextRecord);
+                wroteDomainRecord = true;
+            }
+            createDocumentRecord(nextRecord);
+        }
+        return !nextQ.isEmpty();
+    }
+
+    private void createDomainRecord(CrawledDocumentParquetRecord parquetRecord) throws URISyntaxException {
+
+        CrawlerDomainStatus status = CrawlerDomainStatus.OK;
+        String statusReason = "";
+
+        String redirectDomain = null;
+        if (parquetRecord.contentType.equals("x-marginalia/advisory;state=redirect")) {
+            EdgeUrl crawledUrl = new EdgeUrl(parquetRecord.url);
+            redirectDomain = crawledUrl.getDomain().toString();
+            status = CrawlerDomainStatus.REDIRECT;
+        }
+        else if (parquetRecord.contentType.equals("x-marginalia/advisory;state=blocked")) {
+            status = CrawlerDomainStatus.BLOCKED;
+        }
+        else if (parquetRecord.contentType.equals("x-marginalia/advisory;state=error")) {
+            status = CrawlerDomainStatus.ERROR;
+            statusReason = new String(parquetRecord.body);
+        }
+
+        nextQ.add(new CrawledDomain(
+                parquetRecord.domain,
+                redirectDomain,
+                status.toString(),
+                statusReason,
+                parquetRecord.ip,
+                new ArrayList<>(),
+                new ArrayList<>()
+        ));
+    }
+
+    private void createDocumentRecord(CrawledDocumentParquetRecord nextRecord) {
+        String bodyString = "";
+        CrawlerDocumentStatus status = CrawlerDocumentStatus.OK;
+
+        if (nextRecord.contentType.startsWith("x-marginalia/advisory;state=content-type-failed-probe")) {
+            status = CrawlerDocumentStatus.BAD_CONTENT_TYPE;
+        }
+        else if (nextRecord.contentType.startsWith("x-marginalia/advisory;state=robots-txt-skipped")) {
+            status = CrawlerDocumentStatus.ROBOTS_TXT;
+        }
+        else if (nextRecord.contentType.startsWith("x-marginalia/advisory")) { // other advisory stuff we don't want
+            return;
+        }
+        else {
+            try {
+                bodyString = DocumentBodyToString.getStringData(
+                        ContentType.parse(nextRecord.contentType),
+                        nextRecord.body);
+            } catch (Exception ex) {
+                logger.error("Failed to convert body to string", ex);
+                status = CrawlerDocumentStatus.BAD_CHARSET;
+            }
+        }
+
+        nextQ.add(new CrawledDocument("",
+                nextRecord.url,
+                nextRecord.contentType,
+                nextRecord.timestamp.toString(),
+                nextRecord.httpStatus,
+                status.toString(),
+                "",
+                "",
+                bodyString,
+                Long.toHexString(hash.hashNearlyASCII(bodyString)), // this field isn't actually used, maybe we can skip calculating it?
+                nextRecord.url,
+                null,
+                "",
+                nextRecord.cookies));
+    }
+
+    public void close() throws IOException {
+    }
+
+    @Override
+    public SerializableCrawlData next() throws IOException {
+        if (!hasNext())
+            throw new NoSuchElementException();
+
+        return nextQ.poll();
+    }
+
+}
@@ -0,0 +1,151 @@
+package nu.marginalia.crawling.io.format;
+
+import lombok.SneakyThrows;
+import nu.marginalia.crawling.body.DocumentBodyExtractor;
+import nu.marginalia.crawling.body.DocumentBodyResult;
+import nu.marginalia.crawling.body.HttpFetchResult;
+import nu.marginalia.crawling.io.SerializableCrawlDataStream;
+import nu.marginalia.crawling.model.CrawledDocument;
+import nu.marginalia.crawling.model.CrawledDomain;
+import nu.marginalia.crawling.model.SerializableCrawlData;
+import org.netpreserve.jwarc.*;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.nio.file.Path;
+import java.util.*;
+
+public class WarcSerializableCrawlDataStream implements AutoCloseable, SerializableCrawlDataStream {
+    private static final Logger logger = LoggerFactory.getLogger(WarcSerializableCrawlDataStream.class);
+
+    private final WarcReader reader;
+    private final Iterator<WarcRecord> backingIterator;
+    private SerializableCrawlData next = null;
+    private final Path path;
+
+    public WarcSerializableCrawlDataStream(Path file) throws IOException {
+        path = file;
+        reader = new WarcReader(file);
+        WarcXResponseReference.register(reader);
+        WarcXEntityRefused.register(reader);
+
+        backingIterator = reader.iterator();
+    }
+
+    @Override
+    public Path path() {
+        return path;
+    }
+
+    @Override
+    @SneakyThrows
+    public boolean hasNext() {
+        while (backingIterator.hasNext() && next == null) {
+            var nextRecord = backingIterator.next();
+            if (nextRecord instanceof WarcResponse response) { // this also includes WarcXResponseReference
+                convertResponse(response);
+            }
+            else if (nextRecord instanceof Warcinfo warcinfo) {
+                convertWarcinfo(warcinfo);
+            }
+        }
+        return next != null;
+    }
+
+    private void convertWarcinfo(Warcinfo warcinfo) throws IOException {
+        var headers = warcinfo.fields();
+        String probeStatus = headers.first("X-WARC-Probe-Status").orElse("");
+        String[] parts = probeStatus.split(" ", 2);
+
+
+        String domain = headers.first("domain").orElseThrow(() -> new IllegalStateException("Missing domain header"));
+        String status = parts[0];
+        String statusReason = parts.length > 1 ? parts[1] : "";
+        String ip = headers.first("ip").orElse("");
+
+        String redirectDomain = null;
+        if ("REDIRECT".equalsIgnoreCase(status)) {
+            redirectDomain = statusReason;
+        }
+
+        next = new CrawledDomain(domain, redirectDomain, status, statusReason, ip,
+                new ArrayList<>(),
+                new ArrayList<>()
+        );
+    }
+
+    private void convertResponse(WarcResponse response) throws IOException {
+        var http = response.http();
+
+        if (http.status() != 200) {
+            return;
+        }
+
+        var parsedBody = DocumentBodyExtractor.asString(HttpFetchResult.importWarc(response));
+        if (parsedBody instanceof DocumentBodyResult.Error<String> error) {
+            next = new CrawledDocument(
+                    "",
+                    response.targetURI().toString(),
+                    http.contentType().raw(),
+                    response.date().toString(),
+                    http.status(),
+                    error.status().toString(),
+                    error.why(),
+                    headers(http.headers()),
+                    null,
+                    response.payloadDigest().map(WarcDigest::base64).orElse(""),
+                    "",
+                    "",
+                    "",
+                    WarcXCookieInformationHeader.hasCookies(response)
+            );
+        } else if (parsedBody instanceof DocumentBodyResult.Ok<String> ok) {
+            next = new CrawledDocument(
+                    "",
+                    response.targetURI().toString(),
+                    ok.contentType().toString(),
+                    response.date().toString(),
+                    http.status(),
+                    "OK",
+                    "",
+                    headers(http.headers()),
+                    ok.body(),
+                    response.payloadDigest().map(WarcDigest::base64).orElse(""),
+                    "",
+                    "",
+                    "",
+                    WarcXCookieInformationHeader.hasCookies(response));
+        } else {
+            // unreachable
+            throw new IllegalStateException("Unknown body type: " + parsedBody);
+        }
+    }
+
+    public String headers(MessageHeaders headers) {
+        StringJoiner ret = new StringJoiner("\n");
+        for (var header : headers.map().entrySet()) {
+            for (var value : header.getValue()) {
+                ret.add(STR."\{header.getKey()}: \{value}");
+            }
+        }
+        return ret.toString();
+    }
+
+    public void close() throws IOException {
+        reader.close();
+    }
+
+    @Override
+    public SerializableCrawlData next() throws IOException {
+        if (!hasNext())
+            throw new NoSuchElementException();
+        try {
+            return next;
+        }
+        finally {
+            next = null;
+        }
+    }
+
+}
@@ -1,5 +0,0 @@
-package nu.marginalia.crawling.model;
-
-
-public record ContentType(String contentType, String charset) {
-}
@@ -23,13 +23,21 @@ public class CrawledDocument implements SerializableCrawlData {

     public String headers;
     public String documentBody;
+
+    @Deprecated
     public String documentBodyHash;
+
+    @Deprecated
     public String canonicalUrl;
     public String redirectUrl;
+
+    @Deprecated
     public String recrawlState;

+    /** This is not guaranteed to be set in all versions of the format,
+     * information may come in CrawledDomain instead */
+    public Boolean hasCookies = false;
+
     public static final String SERIAL_IDENTIFIER = "// DOCUMENT";
     @Override
     public String getSerialIdentifier() {
@ -17,6 +17,9 @@ public class CrawledDomain implements SerializableCrawlData {
    public String ip;

    public List<CrawledDocument> doc;

+   /** This is not guaranteed to be set in all versions of the format,
+    * information may come in CrawledDocument instead */
    public List<String> cookies;

    public int size() {
@ -24,6 +27,10 @@ public class CrawledDomain implements SerializableCrawlData {
        return doc.size();
    }

+   public boolean hasCookies() {
+       return cookies != null && !cookies.isEmpty();
+   }
+
    public static final String SERIAL_IDENTIFIER = "// DOMAIN";
    @Override
    public String getSerialIdentifier() {
@ -0,0 +1,97 @@
package nu.marginalia.crawling.parquet;

import blue.strategic.parquet.Dehydrator;
import blue.strategic.parquet.Hydrator;
import blue.strategic.parquet.ValueWriter;
import lombok.AllArgsConstructor;
import lombok.EqualsAndHashCode;
import lombok.NoArgsConstructor;
import lombok.ToString;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.Types;

import java.time.Instant;

import static org.apache.parquet.schema.LogicalTypeAnnotation.*;
import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.*;

@AllArgsConstructor
@NoArgsConstructor
@EqualsAndHashCode
@ToString
public class CrawledDocumentParquetRecord {
    public String domain;
    public String url;
    public String ip;
    public boolean cookies;
    public int httpStatus;
    public Instant timestamp;
    public String contentType;
    public byte[] body;

    public static Hydrator<CrawledDocumentParquetRecord, CrawledDocumentParquetRecord> newHydrator() {
        return new CrawledDocumentParquetRecordHydrator();
    }

    public static Dehydrator<CrawledDocumentParquetRecord> newDehydrator() {
        return CrawledDocumentParquetRecord::dehydrate;
    }

    public static MessageType schema = new MessageType(
            CrawledDocumentParquetRecord.class.getSimpleName(),
            Types.required(BINARY).as(stringType()).named("domain"),
            Types.required(BINARY).as(stringType()).named("url"),
            Types.required(BINARY).as(stringType()).named("ip"),
            Types.required(BOOLEAN).named("cookies"),
            Types.required(INT32).named("httpStatus"),
            Types.required(INT64).named("epochSeconds"),
            Types.required(BINARY).as(stringType()).named("contentType"),
            Types.required(BINARY).named("body")
    );

    public CrawledDocumentParquetRecord add(String heading, Object value) {
        switch (heading) {
            case "domain" -> domain = (String) value;
            case "url" -> url = (String) value;
            case "ip" -> ip = (String) value;
            case "httpStatus" -> httpStatus = (Integer) value;
            case "cookies" -> cookies = (Boolean) value;
            case "contentType" -> contentType = (String) value;
            case "body" -> body = (byte[]) value;
            case "epochSeconds" -> timestamp = Instant.ofEpochSecond((Long) value);
            default -> throw new UnsupportedOperationException("Unknown heading '" + heading + '"');
        }
        return this;
    }

    public void dehydrate(ValueWriter valueWriter) {
        valueWriter.write("domain", domain);
        valueWriter.write("url", url);
        valueWriter.write("ip", ip);
        valueWriter.write("epochSeconds", timestamp.getEpochSecond());
        valueWriter.write("httpStatus", httpStatus);
        valueWriter.write("cookies", cookies);
        valueWriter.write("contentType", contentType);
        valueWriter.write("body", body);
    }
}

class CrawledDocumentParquetRecordHydrator implements Hydrator<CrawledDocumentParquetRecord, CrawledDocumentParquetRecord> {

    @Override
    public CrawledDocumentParquetRecord start() {
        return new CrawledDocumentParquetRecord();
    }

    @Override
    public CrawledDocumentParquetRecord add(CrawledDocumentParquetRecord target, String heading, Object value) {
        return target.add(heading, value);
    }

    @Override
    public CrawledDocumentParquetRecord finish(CrawledDocumentParquetRecord target) {
        return target;
    }
}
@ -0,0 +1,19 @@
package nu.marginalia.crawling.parquet;

import blue.strategic.parquet.HydratorSupplier;
import blue.strategic.parquet.ParquetReader;
import org.jetbrains.annotations.NotNull;

import java.io.IOException;
import java.nio.file.Path;
import java.util.stream.Stream;

public class CrawledDocumentParquetRecordFileReader {

    @NotNull
    public static Stream<CrawledDocumentParquetRecord> stream(Path path) throws IOException {
        return ParquetReader.streamContent(path.toFile(),
                HydratorSupplier.constantly(CrawledDocumentParquetRecord.newHydrator()));
    }
}
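Taken together, the record, reader and writer above form a small round-trip API for the new parquet crawl data format. A minimal sketch of how they might be combined (the file path and field values here are placeholders, not taken from this PR):

    // Round-trip sketch: write one record, stream it back.
    Path file = Files.createTempFile("crawl-data", ".parquet");

    var record = new CrawledDocumentParquetRecord(
            "www.example.com",                  // domain
            "https://www.example.com/",         // url
            "127.0.0.1",                        // ip
            false,                              // cookies
            200,                                // httpStatus
            Instant.now(),                      // timestamp, persisted as epochSeconds
            "text/html",
            "<html>hello</html>".getBytes(StandardCharsets.UTF_8));

    try (var writer = new CrawledDocumentParquetRecordFileWriter(file)) {
        writer.write(record);
    }

    try (var records = CrawledDocumentParquetRecordFileReader.stream(file)) {
        records.forEach(r -> System.out.println(r.url + " " + r.httpStatus));
    }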
@ -0,0 +1,247 @@
package nu.marginalia.crawling.parquet;

import blue.strategic.parquet.ParquetWriter;
import nu.marginalia.UserAgent;
import nu.marginalia.crawling.body.DocumentBodyExtractor;
import nu.marginalia.crawling.body.DocumentBodyResult;
import nu.marginalia.crawling.body.HttpFetchResult;
import org.apache.commons.lang3.StringUtils;
import org.netpreserve.jwarc.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.net.URI;
import java.nio.file.Path;
import java.time.Instant;
import java.util.List;
import java.util.Objects;

public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable {
    private final ParquetWriter<CrawledDocumentParquetRecord> writer;
    private static final Logger logger = LoggerFactory.getLogger(CrawledDocumentParquetRecordFileWriter.class);

    public static void convertWarc(String domain,
                                   UserAgent userAgent,
                                   Path warcInputFile,
                                   Path parquetOutputFile) {
        try (var warcReader = new WarcReader(warcInputFile);
             var parquetWriter = new CrawledDocumentParquetRecordFileWriter(parquetOutputFile)
        ) {
            WarcXResponseReference.register(warcReader);
            WarcXEntityRefused.register(warcReader);

            String uaString = userAgent.uaString();

            for (var record : warcReader) {
                if (record instanceof WarcResponse response) {
                    // this also captures WarcXResponseReference, which inherits from WarcResponse
                    // and is used to store old responses from previous crawls; in this part of the logic
                    // we treat them the same as a normal response

                    if (!filterResponse(uaString, response)) {
                        continue;
                    }

                    parquetWriter.write(domain, response);
                }
                else if (record instanceof WarcXEntityRefused refused) {
                    parquetWriter.write(domain, refused);
                }
                else if (record instanceof Warcinfo warcinfo) {
                    parquetWriter.write(warcinfo);
                }
            }
        }
        catch (Exception ex) {
            logger.error("Failed to convert WARC file to Parquet", ex);
        }
    }

    /** Return true if the WarcResponse should be included in conversion */
    private static boolean filterResponse(String uaString, WarcResponse response) throws IOException {

        // We don't want to store robots.txt files, as they are not
        // interesting for the analysis we want to do.  This is important
        // since txt-files in general are interesting, and we don't want to
        // exclude them as a class.

        if (response.targetURI().getPath().equals("/robots.txt")) {
            return false;
        }

        var robotsTags = response.http().headers().all("X-Robots-Tag");
        if (!isXRobotsTagsPermitted(robotsTags, uaString)) {
            return false;
        }

        return true;
    }

    private void write(String domain, WarcXEntityRefused refused) throws IOException {
        URI profile = refused.profile();

        String meta;
        if (profile.equals(WarcXEntityRefused.documentRobotsTxtSkippedURN)) {
            meta = "x-marginalia/advisory;state=robots-txt-skipped";
        }
        else if (profile.equals(WarcXEntityRefused.documentBadContentTypeURN)) {
            meta = "x-marginalia/advisory;state=content-type-failed-probe";
        }
        else if (profile.equals(WarcXEntityRefused.documentProbeTimeout)) {
            meta = "x-marginalia/advisory;state=timeout-probe";
        }
        else if (profile.equals(WarcXEntityRefused.documentUnspecifiedError)) {
            meta = "x-marginalia/advisory;state=doc-error";
        }
        else {
            meta = "x-marginalia/advisory;state=unknown";
        }

        write(forDocError(domain, refused.date(), refused.target(), meta));
    }

    private void write(Warcinfo warcinfo) throws IOException {
        String selfDomain = warcinfo.fields().first("domain").orElse("");
        String ip = warcinfo.fields().first("ip").orElse("");
        String probeStatus = warcinfo.fields().first("X-WARC-Probe-Status").orElse("");

        if (probeStatus.startsWith("REDIRECT")) {
            String redirectDomain = probeStatus.substring("REDIRECT;".length());
            write(forDomainRedirect(selfDomain, warcinfo.date(), redirectDomain));
        }
        else if (!"OK".equals(probeStatus)) {
            write(forDomainError(selfDomain, warcinfo.date(), ip, probeStatus));
        }
    }

    public CrawledDocumentParquetRecordFileWriter(Path file) throws IOException {
        writer = ParquetWriter.writeFile(CrawledDocumentParquetRecord.schema,
                file.toFile(), CrawledDocumentParquetRecord.newDehydrator());
    }

    public void write(CrawledDocumentParquetRecord domainData) throws IOException {
        writer.write(domainData);
    }

    public void write(String domain, WarcResponse response) throws IOException {

        HttpFetchResult result = HttpFetchResult.importWarc(response);
        if (!(result instanceof HttpFetchResult.ResultOk fetchOk)) {
            return;
        }

        byte[] bodyBytes;
        String contentType;

        var body = DocumentBodyExtractor.asBytes(result);

        if (body instanceof DocumentBodyResult.Ok<byte[]> bodyOk) {
            bodyBytes = bodyOk.body();
            contentType = bodyOk.contentType().toString();
        }
        else {
            bodyBytes = new byte[0];
            contentType = "";
        }

        write(new CrawledDocumentParquetRecord(
                domain,
                response.target(),
                fetchOk.ipAddress(),
                WarcXCookieInformationHeader.hasCookies(response),
                fetchOk.statusCode(),
                response.date(),
                contentType,
                bodyBytes)
        );
    }

    public void close() throws IOException {
        writer.close();
    }

    private CrawledDocumentParquetRecord forDomainRedirect(String domain, Instant date, String redirectDomain) {
        return new CrawledDocumentParquetRecord(domain,
                STR."https://\{redirectDomain}/",
                "",
                false,
                0,
                date,
                "x-marginalia/advisory;state=redirect",
                new byte[0]
        );
    }

    private CrawledDocumentParquetRecord forDomainError(String domain, Instant date, String ip, String errorStatus) {
        return new CrawledDocumentParquetRecord(domain,
                STR."https://\{domain}/",
                ip,
                false,
                0,
                date,
                "x-marginalia/advisory;state=error",
                errorStatus.getBytes()
        );
    }

    private CrawledDocumentParquetRecord forDocError(String domain, Instant date, String url, String errorStatus) {
        return new CrawledDocumentParquetRecord(domain,
                url,
                "",
                false,
                0,
                date,
                errorStatus,
                new byte[0]
        );
    }

    /** Check X-Robots-Tag header tag to see if we are allowed to index this page.
     * <p>
     * Reference: <a href="https://developers.google.com/search/docs/crawling-indexing/robots-meta-tag">https://developers.google.com/search/docs/crawling-indexing/robots-meta-tag</a>
     *
     * @param xRobotsHeaderTags List of X-Robots-Tag values
     * @param userAgent User agent string
     * @return true if we are allowed to index this page
     */
    // Visible for tests
    public static boolean isXRobotsTagsPermitted(List<String> xRobotsHeaderTags, String userAgent) {
        boolean isPermittedGeneral = true;
        boolean isPermittedMarginalia = false;
        boolean isForbiddenMarginalia = false;

        for (String header : xRobotsHeaderTags) {
            if (header.indexOf(':') >= 0) {
                String[] parts = StringUtils.split(header, ":", 2);

                if (parts.length < 2)
                    continue;

                // Is this relevant to us?
                if (!Objects.equals(parts[0].trim(), userAgent))
                    continue;

                if (parts[1].contains("noindex"))
                    isForbiddenMarginalia = true;
                else if (parts[1].contains("none"))
                    isForbiddenMarginalia = true;
                else if (parts[1].contains("all"))
                    isPermittedMarginalia = true;
            }
            else {
                if (header.contains("noindex"))
                    isPermittedGeneral = false;
                if (header.contains("none"))
                    isPermittedGeneral = false;
            }
        }

        if (isPermittedMarginalia)
            return true;
        if (isForbiddenMarginalia)
            return false;
        return isPermittedGeneral;
    }
}
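The X-Robots-Tag handling gives agent-specific directives precedence over blanket ones. A few illustrative calls, tracing the logic above (the user agent token "search.marginalia.nu" is just an example; these are not test cases from the PR):

    isXRobotsTagsPermitted(List.of(), "search.marginalia.nu");                                       // true: no tags at all
    isXRobotsTagsPermitted(List.of("noindex"), "search.marginalia.nu");                              // false: blanket noindex
    isXRobotsTagsPermitted(List.of("googlebot: noindex"), "search.marginalia.nu");                   // true: directive aimed at another agent
    isXRobotsTagsPermitted(List.of("search.marginalia.nu: noindex"), "search.marginalia.nu");        // false: directive aimed at us
    isXRobotsTagsPermitted(List.of("noindex", "search.marginalia.nu: all"), "search.marginalia.nu"); // true: agent-specific "all" wins over the blanket noindex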
@ -0,0 +1,35 @@
package org.netpreserve.jwarc;

import okhttp3.HttpUrl;
import okhttp3.OkHttpClient;

/** Encapsulates out-of-band information about whether a website uses cookies,
 * using a non-standard WARC header "X-Has-Cookies".
 */
public class WarcXCookieInformationHeader {
    private boolean hasCookies = false;
    private static final String headerName = "X-Has-Cookies";

    public void update(OkHttpClient client, HttpUrl url) {
        if (!hasCookies) {
            hasCookies = !client.cookieJar().loadForRequest(url).isEmpty();
        }
    }

    public boolean hasCookies() {
        return hasCookies;
    }

    public void paint(WarcResponse.Builder builder) {
        builder.addHeader(headerName, hasCookies ? "1" : "0");
    }
    public void paint(WarcXResponseReference.Builder builder) {
        builder.addHeader(headerName, hasCookies ? "1" : "0");
    }

    public static boolean hasCookies(WarcRecord record) {
        return record.headers().contains(headerName, "1");
    }

}
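A rough sketch of the intended flow, assuming a WarcResponse.Builder and an OkHttpClient with a cookie jar (the client and URL here are placeholders; the real call sites live in the crawler's WARC recorder, outside this hunk): the fetcher updates the flag from its cookie jar after a request, paints it onto the record being written, and downstream code reads it back with the static check.

    var cookieInfo = new WarcXCookieInformationHeader();
    cookieInfo.update(client, HttpUrl.get("https://www.example.com/")); // client: some OkHttpClient with a cookie jar

    var builder = new WarcResponse.Builder(URI.create("https://www.example.com/"));
    cookieInfo.paint(builder); // adds "X-Has-Cookies: 0" or "X-Has-Cookies: 1"

    // ... later, on the reading side, for any WarcRecord 'record':
    boolean usedCookies = WarcXCookieInformationHeader.hasCookies(record);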
@ -0,0 +1,45 @@
package org.netpreserve.jwarc;

import java.io.IOException;
import java.net.URI;

/** This defines a non-standard extension to WARC for flagging entities the crawler
 * refused to fetch or store, with the reason identified by the WARC-Profile header.
 */
public class WarcXEntityRefused extends WarcRevisit {
    private static final String TYPE_NAME = "x-entity-refused";

    public static final URI documentRobotsTxtSkippedURN = URI.create("urn:marginalia/meta/doc/robots-txt-skipped");
    public static final URI documentBadContentTypeURN = URI.create("urn:marginalia/meta/doc/content-type-failed-probe");
    public static final URI documentProbeTimeout = URI.create("urn:marginalia/meta/doc/timeout-probe");
    public static final URI documentUnspecifiedError = URI.create("urn:marginalia/meta/doc/error");

    WarcXEntityRefused(MessageVersion version, MessageHeaders headers, MessageBody body) {
        super(version, headers, body);
    }

    public static void register(WarcReader reader) {
        reader.registerType(TYPE_NAME, WarcXEntityRefused::new);
    }

    public static class Builder extends AbstractBuilder<WarcXEntityRefused, Builder> {
        public Builder(URI targetURI, URI profile) {
            this(targetURI.toString(), profile.toString());
        }

        public Builder(String targetURI, String profileURI) {
            super(TYPE_NAME);
            setHeader("WARC-Target-URI", targetURI);
            setHeader("WARC-Profile", profileURI);
        }

        public Builder body(HttpResponse httpResponse) throws IOException {
            return body(MediaType.HTTP_RESPONSE, httpResponse);
        }

        @Override
        public WarcXEntityRefused build() {
            return build(WarcXEntityRefused::new);
        }
    }
}
@ -0,0 +1,42 @@
package org.netpreserve.jwarc;

import java.io.IOException;
import java.net.URI;

/** This defines a non-standard extension to WARC for storing old HTTP responses,
 * essentially a 'response' with different semantics.
 * <p>
 * An x-response-reference record is a response record with a full body, where
 * the data is a reconstructed HTTP response from a previous crawl.
 */
public class WarcXResponseReference extends WarcResponse {
    private static final String TYPE_NAME = "x-response-reference";

    WarcXResponseReference(MessageVersion version, MessageHeaders headers, MessageBody body) {
        super(version, headers, body);
    }

    public static void register(WarcReader reader) {
        reader.registerType(TYPE_NAME, WarcXResponseReference::new);
    }

    public static class Builder extends AbstractBuilder<WarcXResponseReference, Builder> {
        public Builder(URI targetURI) {
            this(targetURI.toString());
        }

        public Builder(String targetURI) {
            super(TYPE_NAME);
            setHeader("WARC-Target-URI", targetURI);
        }

        public Builder body(HttpResponse httpResponse) throws IOException {
            return body(MediaType.HTTP_RESPONSE, httpResponse);
        }

        @Override
        public WarcXResponseReference build() {
            return build(WarcXResponseReference::new);
        }
    }
}
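Because both record types use non-standard WARC-Type values, a WarcReader will not materialize them unless they are registered first; this is why convertWarc above registers both before iterating. A minimal reading loop under that assumption:

    try (var reader = new WarcReader(warcFile)) {
        WarcXResponseReference.register(reader);
        WarcXEntityRefused.register(reader);

        for (var record : reader) {
            if (record instanceof WarcXEntityRefused refused) {
                System.out.println("refused: " + refused.target() + " " + refused.profile());
            }
            else if (record instanceof WarcResponse response) { // includes WarcXResponseReference
                System.out.println("response: " + response.target());
            }
        }
    }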
@ -74,23 +74,13 @@ public class CrawlPlan {
        return count;
    }

+   @Deprecated
    public Iterable<CrawledDomain> domainsIterable() {
-       final CrawledDomainReader reader = new CrawledDomainReader();
-
-       return WorkLog.iterableMap(crawl.getLogFile(),
-               entry -> {
-                   var path = getCrawledFilePath(entry.path());
-                   if (!Files.exists(path)) {
-                       logger.warn("File not found: {}", path);
-                       return Optional.empty();
-                   }
-                   return reader.readOptionally(path);
-               });
+       // This is no longer supported
+       throw new UnsupportedOperationException();
    }

    public Iterable<SerializableCrawlDataStream> crawlDataIterable(Predicate<String> idPredicate) {
-       final CrawledDomainReader reader = new CrawledDomainReader();
-
        return WorkLog.iterableMap(crawl.getLogFile(),
                entry -> {
                    if (!idPredicate.test(entry.id())) {
@ -105,7 +95,7 @@ public class CrawlPlan {
                    }

                    try {
-                       return Optional.of(reader.createDataStream(path));
+                       return Optional.of(CrawledDomainReader.createDataStream(path));
                    }
                    catch (IOException ex) {
                        return Optional.empty();
@ -0,0 +1,78 @@
package nu.marginalia.crawling.parquet;

import nu.marginalia.crawling.io.format.ParquetSerializableCrawlDataStream;
import nu.marginalia.crawling.model.CrawledDocument;
import nu.marginalia.crawling.model.CrawledDomain;
import nu.marginalia.crawling.model.SerializableCrawlData;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.time.Instant;
import java.util.ArrayList;

import static org.junit.jupiter.api.Assertions.*;

class CrawledDocumentParquetRecordFileWriterTest {
    Path tempFile;

    @BeforeEach
    public void setUp() throws IOException {
        tempFile = Files.createTempFile("test", ".parquet");
    }

    @AfterEach
    public void tearDown() throws IOException {
        Files.delete(tempFile);
    }

    @Test
    void testWriteRead() throws IOException {
        var original = new CrawledDocumentParquetRecord("www.marginalia.nu",
                "https://www.marginalia.nu/",
                "127.0.0.1",
                false,
                200,
                Instant.now(),
                "text/html",
                "hello world".getBytes());

        try (var writer = new CrawledDocumentParquetRecordFileWriter(tempFile)) {
            writer.write(original);
        }

        var items = new ArrayList<SerializableCrawlData>();

        try (var stream = new ParquetSerializableCrawlDataStream(tempFile)) {
            while (stream.hasNext()) {
                items.add(stream.next());
            }
        }

        assertEquals(2, items.size());

        var firstItem = items.get(0);
        assertInstanceOf(CrawledDomain.class, firstItem);
        var domain = (CrawledDomain) firstItem;
        assertEquals("www.marginalia.nu", domain.domain);
        assertNull(domain.redirectDomain);
        assertEquals("OK", domain.crawlerStatus);
        assertEquals("", domain.crawlerStatusDesc);
        assertEquals(new ArrayList<>(), domain.doc);
        assertEquals(new ArrayList<>(), domain.cookies);

        var secondItem = items.get(1);
        assertInstanceOf(CrawledDocument.class, secondItem);

        var document = (CrawledDocument) secondItem;
        assertEquals("https://www.marginalia.nu/", document.url);
        assertEquals("text/html", document.contentType);
        assertEquals("hello world", document.documentBody);
        assertEquals(200, document.httpStatus);
    }
}
@ -59,6 +59,7 @@ dependencies {

    implementation project(':code:features-crawl:crawl-blocklist')
    implementation project(':code:features-crawl:link-parser')
+   implementation project(':code:features-crawl:content-type')

    testImplementation project(':code:libraries:term-frequency-dict')
    testImplementation project(':code:process-models:crawl-spec')
@ -66,6 +67,7 @@ dependencies {
    implementation libs.bundles.slf4j

    implementation libs.notnull
+   implementation libs.jwarc

    implementation libs.jsoup
@ -268,6 +268,14 @@ public class ConverterMain {
                        processData.asPath(),
                        msg, inbox);
            }
+           case SideloadWarc -> {
+               var processData = fileStorageService.getStorage(request.processedDataStorage);
+
+               yield new SideloadAction(
+                       sideloadSourceFactory.sideloadWarc(Path.of(request.inputSource)),
+                       processData.asPath(),
+                       msg, inbox);
+           }
            case SideloadStackexchange -> {
                var processData = fileStorageService.getStorage(request.processedDataStorage);
@ -105,13 +105,6 @@ public class DocumentProcessor {
    private EdgeUrl getDocumentUrl(CrawledDocument crawledDocument)
            throws URISyntaxException
    {
-       if (crawledDocument.canonicalUrl != null) {
-           try {
-               return new EdgeUrl(crawledDocument.canonicalUrl);
-           }
-           catch (URISyntaxException ex) { /* fallthrough */ }
-       }
-
        return new EdgeUrl(crawledDocument.url);
    }
@ -18,6 +18,7 @@ import nu.marginalia.model.EdgeUrl;
import nu.marginalia.converting.processor.logic.links.TopKeywords;
import nu.marginalia.converting.processor.logic.LshDocumentDeduplicator;
import nu.marginalia.model.crawl.HtmlFeature;
+import org.jetbrains.annotations.Nullable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

@ -53,9 +54,15 @@ public class DomainProcessor {
    }

    @SneakyThrows
+   @Nullable
    public ProcessedDomain process(SerializableCrawlDataStream dataStream) {
+       if (!dataStream.hasNext()) {
+           return null;
+       }
+
        var ret = new ProcessedDomain();
        List<ProcessedDocument> docs = new ArrayList<>();
+       Set<String> processedUrls = new HashSet<>();

        boolean cookies = false;
        String ip = "";
@ -79,7 +86,7 @@ public class DomainProcessor {
                ret.domain = new EdgeDomain(crawledDomain.domain);
                ret.ip = crawledDomain.ip;

-               cookies = Objects.requireNonNullElse(crawledDomain.cookies, Collections.emptyList()).size() > 0;
+               cookies = crawledDomain.hasCookies();
                ip = crawledDomain.ip;

                if (crawledDomain.redirectDomain != null) {
@ -90,10 +97,12 @@ public class DomainProcessor {
            }
            else if (data instanceof CrawledDocument doc) {
                try {
-                   if (doc.url == null)
+                   if (doc.url == null || !processedUrls.add(doc.url))
                        continue;

-                   fixBadCanonicalTag(doc);
+                   if (Boolean.TRUE.equals(doc.hasCookies)) {
+                       cookies = true;
+                   }

                    // This case should never be reachable, as we should have initiated
                    // the externalDomainLinks variable above if we made it past the
@ -161,25 +170,6 @@ public class DomainProcessor {
        return false;
    }

-   private void fixBadCanonicalTag(CrawledDocument doc) {
-       // Some sites have a canonical tag that points to a different domain,
-       // but our loader can not support this, so we point these back to the
-       // original url.
-
-       var canonicalOpt = EdgeUrl.parse(doc.canonicalUrl);
-       if (canonicalOpt.isEmpty()) return;
-
-       var urlOpt = EdgeUrl.parse(doc.url);
-       if (urlOpt.isEmpty()) return;
-
-       var urlActual = urlOpt.get();
-       var canonicalActual = canonicalOpt.get();
-
-       if (!Objects.equals(urlActual.domain, canonicalActual.domain)) {
-           doc.canonicalUrl = doc.url;
-       }
-   }
-
    private void calculateStatistics(ProcessedDomain ret, DomainLinks externalDomainLinks) {
        LinkGraph linkGraph = new LinkGraph();
        TopKeywords topKeywords = new TopKeywords();
@ -7,6 +7,7 @@ import nu.marginalia.atags.source.AnchorTagsSourceFactory;
import nu.marginalia.converting.sideload.dirtree.DirtreeSideloaderFactory;
import nu.marginalia.converting.sideload.encyclopedia.EncyclopediaMarginaliaNuSideloader;
import nu.marginalia.converting.sideload.stackexchange.StackexchangeSideloader;
+import nu.marginalia.converting.sideload.warc.WarcSideloadFactory;
import nu.marginalia.keyword.DocumentKeywordExtractor;
import nu.marginalia.language.sentence.ThreadLocalSentenceExtractorProvider;

@ -24,6 +25,7 @@ public class SideloadSourceFactory {
    private final AnchorTextKeywords anchorTextKeywords;
    private final AnchorTagsSourceFactory anchorTagsSourceFactory;
    private final DirtreeSideloaderFactory dirtreeSideloaderFactory;
+   private final WarcSideloadFactory warcSideloadFactory;

    @Inject
    public SideloadSourceFactory(Gson gson,
@ -31,7 +33,8 @@ public class SideloadSourceFactory {
                                 ThreadLocalSentenceExtractorProvider sentenceExtractorProvider,
                                 DocumentKeywordExtractor documentKeywordExtractor, AnchorTextKeywords anchorTextKeywords,
                                 AnchorTagsSourceFactory anchorTagsSourceFactory,
-                                DirtreeSideloaderFactory dirtreeSideloaderFactory) {
+                                DirtreeSideloaderFactory dirtreeSideloaderFactory,
+                                WarcSideloadFactory warcSideloadFactory) {
        this.gson = gson;
        this.sideloaderProcessing = sideloaderProcessing;
        this.sentenceExtractorProvider = sentenceExtractorProvider;
@ -39,6 +42,7 @@ public class SideloadSourceFactory {
        this.anchorTextKeywords = anchorTextKeywords;
        this.anchorTagsSourceFactory = anchorTagsSourceFactory;
        this.dirtreeSideloaderFactory = dirtreeSideloaderFactory;
+       this.warcSideloadFactory = warcSideloadFactory;
    }

    public SideloadSource sideloadEncyclopediaMarginaliaNu(Path pathToDbFile, String baseUrl) throws SQLException {
@ -49,6 +53,10 @@ public class SideloadSourceFactory {
        return dirtreeSideloaderFactory.createSideloaders(pathToYamlFile);
    }

+   public Collection<? extends SideloadSource> sideloadWarc(Path pathToWarcFiles) throws IOException {
+       return warcSideloadFactory.createSideloaders(pathToWarcFiles);
+   }
+
    /** Do not use, this code isn't finished */
    public Collection<? extends SideloadSource> sideloadStackexchange(Path pathToDbFileRoot) throws IOException {
        try (var dirs = Files.walk(pathToDbFileRoot)) {
@ -50,7 +50,8 @@ public class SideloaderProcessing {
                Integer.toHexString(url.hashCode()),
                url,
                "",
-               "SIDELOAD"
+               "SIDELOAD",
+               false
        );

        var ret = new ProcessedDocument();
@ -0,0 +1,32 @@
package nu.marginalia.converting.sideload.warc;

import nu.marginalia.converting.sideload.SideloadSource;

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;

public class WarcSideloadFactory {

    public Collection<? extends SideloadSource> createSideloaders(Path pathToWarcFiles) throws IOException {
        final List<Path> files = new ArrayList<>();

        try (var stream = Files.list(pathToWarcFiles)) {
            stream
                .filter(Files::isRegularFile)
                .filter(this::isWarcFile)
                .forEach(files::add);

        }
        // stub
        return null;
    }

    private boolean isWarcFile(Path path) {
        return path.toString().endsWith(".warc")
            || path.toString().endsWith(".warc.gz");
    }
}
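The factory is committed as a stub: createSideloaders collects the file list but still returns null. A hypothetical completion, not part of this PR, might hand each matching file to the WarcSideloader introduced below, roughly like this:

    // Hypothetical sketch only; the WarcSideloadFactory in this commit does not yet do this.
    private final SideloaderProcessing sideloaderProcessing; // assumed to be injected

    public Collection<? extends SideloadSource> createSideloaders(Path pathToWarcFiles) throws IOException {
        List<SideloadSource> sources = new ArrayList<>();
        try (var stream = Files.list(pathToWarcFiles)) {
            for (Path file : stream.filter(Files::isRegularFile).filter(this::isWarcFile).toList()) {
                sources.add(new WarcSideloader(file, sideloaderProcessing));
            }
        }
        return sources;
    }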
@ -0,0 +1,160 @@
package nu.marginalia.converting.sideload.warc;

import lombok.SneakyThrows;
import nu.marginalia.atags.model.DomainLinks;
import nu.marginalia.contenttype.ContentTypeParser;
import nu.marginalia.contenttype.DocumentBodyToString;
import nu.marginalia.converting.model.GeneratorType;
import nu.marginalia.converting.model.ProcessedDocument;
import nu.marginalia.converting.model.ProcessedDomain;
import nu.marginalia.converting.sideload.SideloadSource;
import nu.marginalia.converting.sideload.SideloaderProcessing;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawl.DomainIndexingState;
import org.netpreserve.jwarc.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.net.URISyntaxException;
import java.nio.file.Path;
import java.util.Iterator;
import java.util.List;
import java.util.Objects;
import java.util.Optional;

public class WarcSideloader implements SideloadSource, AutoCloseable {

    private static final Logger logger = LoggerFactory.getLogger(WarcSideloader.class);

    private final SideloaderProcessing sideloaderProcessing;

    private final WarcReader reader;

    private final EdgeDomain domain;


    public WarcSideloader(Path warcFile,
                          SideloaderProcessing sideloaderProcessing)
            throws IOException
    {
        this.sideloaderProcessing = sideloaderProcessing;
        this.reader = new WarcReader(warcFile);
        this.domain = sniffDomainFromWarc()
                .orElseThrow(() -> new IOException("Could not identify domain from warc file"));
    }

    @SneakyThrows
    @Override
    public ProcessedDomain getDomain() {
        var ret = new ProcessedDomain();

        ret.domain = domain;
        ret.ip = "0.0.0.0";
        ret.state = DomainIndexingState.ACTIVE;

        return ret;
    }

    private Optional<EdgeDomain> sniffDomainFromWarc() throws IOException {
        try {
            for (var record : reader) {
                if (!(record instanceof WarcRequest request)) {
                    continue;
                }

                String target = request.target();
                if (target.startsWith("http://") || target.startsWith("https://")) {
                    return Optional.of(new EdgeUrl(target).getDomain());
                }
            }
        } catch (URISyntaxException e) {
            return Optional.empty();
        } finally {
            reader.position(0);
        }
        return Optional.empty();
    }

    @SneakyThrows
    @Override
    public Iterator<ProcessedDocument> getDocumentsStream() {
        return reader.records()
                .filter(record -> record instanceof WarcResponse)
                .map(WarcResponse.class::cast)
                .filter(this::isRelevantResponse)
                .map(this::process)
                .filter(Optional::isPresent)
                .map(Optional::get)
                .iterator();
    }

    private boolean isRelevantResponse(WarcResponse warcResponse) {
        try {
            HttpResponse httpResponse = warcResponse.http();
            if (httpResponse == null)
                return false;
            if (httpResponse.status() != 200)
                return false;
            if (!Objects.equals(httpResponse.contentType(), MediaType.HTML))
                return false;

            var url = new EdgeUrl(warcResponse.target());
            if (!Objects.equals(url.getDomain(), domain)) {
                return false;
            }

            return true;
        } catch (Exception e) {
            e.printStackTrace();
        }

        return false;
    }

    @SneakyThrows
    private Optional<ProcessedDocument> process(WarcResponse response) {
        Optional<String> body = getBody(response);
        String url = response.target();

        // We trim "/index.html" suffixes from the URL when they are present,
        // since these are typically an artifact of document retrieval
        if (url.endsWith("/index.html")) {
            url = url.substring(0, url.length() - "index.html".length());
        }

        if (body.isEmpty()) {
            return Optional.empty();
        }

        return Optional.of(sideloaderProcessing
                .processDocument(url, body.get(), List.of(), new DomainLinks(),
                        GeneratorType.DOCS,
                        10_000));
    }

    @SneakyThrows
    private Optional<String> getBody(WarcResponse response) {
        var http = response.http();

        // TODO: We should support additional encodings here
        try (var body = http.body()) {
            String contentType = http.headers().first("Content-Type").orElse(null);
            byte[] bytes = body.stream().readAllBytes();

            var ct = ContentTypeParser.parseContentType(contentType, bytes);
            return Optional.of(DocumentBodyToString.getStringData(ct, bytes));
        }
        catch (Exception ex) {
            logger.info("Failed to parse body", ex);
        }
        return Optional.empty();
    }

    @Override
    public void close() throws Exception {
        reader.close();
    }
}
@ -3,6 +3,7 @@ package nu.marginalia.converting.writer;
import lombok.SneakyThrows;
import nu.marginalia.converting.model.ProcessedDomain;
import nu.marginalia.worklog.BatchingWorkLog;
+import org.jetbrains.annotations.Nullable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

@ -41,7 +42,10 @@ public class ConverterWriter implements AutoCloseable {
    }

    @SneakyThrows
-   public void accept(ProcessedDomain domain) {
+   public void accept(@Nullable ProcessedDomain domain) {
+       if (null == domain)
+           return;
+
        domainData.put(domain);
    }
@ -65,6 +65,7 @@ public class ConvertingIntegrationTest {
    @Test
    public void testMemexMarginaliaNu() throws IOException {
        var ret = domainProcessor.process(asSerializableCrawlData(readMarginaliaWorkingSet()));
+       assertNotNull(ret);
        assertEquals(ret.state, DomainIndexingState.ACTIVE);
        assertEquals(ret.domain, new EdgeDomain("memex.marginalia.nu"));

@ -114,7 +115,8 @@ public class ConvertingIntegrationTest {
                    Double.toString(Math.random()),
                    "https://memex.marginalia.nu/" + file,
                    null,
-                   ""
+                   "",
+                   false
            );
            docs.add(doc);
        }
@ -3,31 +3,51 @@ package nu.marginalia.converting;
import com.google.inject.Guice;
import com.google.inject.Injector;
import lombok.SneakyThrows;
+import nu.marginalia.UserAgent;
import nu.marginalia.WmsaHome;
+import nu.marginalia.converting.model.ProcessedDomain;
import nu.marginalia.converting.processor.DomainProcessor;
import nu.marginalia.crawl.retreival.CrawlerRetreiver;
import nu.marginalia.crawl.retreival.DomainProber;
import nu.marginalia.crawl.retreival.fetcher.HttpFetcher;
import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl;
-import nu.marginalia.crawling.io.SerializableCrawlDataStream;
+import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
+import nu.marginalia.crawling.io.format.ParquetSerializableCrawlDataStream;
import nu.marginalia.crawling.model.CrawledDocument;
import nu.marginalia.crawling.model.CrawledDomain;
import nu.marginalia.crawling.model.SerializableCrawlData;
+import nu.marginalia.crawling.parquet.CrawledDocumentParquetRecordFileWriter;
+import nu.marginalia.model.EdgeDomain;
+import nu.marginalia.model.crawl.DomainIndexingState;
import nu.marginalia.model.crawlspec.CrawlSpecRecord;
-import org.junit.jupiter.api.BeforeAll;
-import org.junit.jupiter.api.BeforeEach;
-import org.junit.jupiter.api.Tag;
-import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.*;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;

+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;
+import java.util.Set;
+import java.util.function.Predicate;
+import java.util.stream.Collectors;

-/* This is mostly a debugging utility */
+import static org.junit.jupiter.api.Assertions.*;
+
+/** Tests for the crawler and converter integration. These are pretty slow and potentially
+ * a bit flaky, since they attempt to fetch real websites.
+ */
@Tag("slow")
public class CrawlingThenConvertingIntegrationTest {
    private DomainProcessor domainProcessor;
    private HttpFetcher httpFetcher;

+   private static final Logger logger = LoggerFactory.getLogger(CrawlingThenConvertingIntegrationTest.class);
+
+   private Path fileName;
+   private Path fileName2;
+
    @SneakyThrows
    @BeforeAll
    public static void setUpAll() {
@ -44,10 +64,80 @@ public class CrawlingThenConvertingIntegrationTest {

        domainProcessor = injector.getInstance(DomainProcessor.class);
        httpFetcher = new HttpFetcherImpl(WmsaHome.getUserAgent().uaString());
+       this.fileName = Files.createTempFile("crawling-then-converting", ".warc.gz");
+       this.fileName2 = Files.createTempFile("crawling-then-converting", ".warc.gz");
+   }
+
+   @AfterEach
+   public void tearDown() throws IOException {
+       Files.deleteIfExists(fileName);
+       Files.deleteIfExists(fileName2);
    }

    @Test
-   public void crawlThenProcess() {
+   public void testInvalidDomain() throws IOException {
+       // Attempt to fetch an invalid domain
+       var specs = CrawlSpecRecord.builder()
+               .domain("invalid.invalid.invalid")
+               .crawlDepth(10)
+               .urls(List.of()) // add specific URLs to crawl here
+               .build();
+
+       CrawledDomain crawlData = crawl(specs);
+
+       assertEquals("ERROR", crawlData.crawlerStatus);
+       assertTrue(crawlData.doc.isEmpty());
+
+       var processedData = process();
+
+       assertNotNull(processedData);
+       assertTrue(processedData.documents.isEmpty());
+   }
+
+   @Test
+   public void testRedirectingDomain() throws IOException {
+       // Attempt to fetch a domain that redirects
+       var specs = CrawlSpecRecord.builder()
+               .domain("memex.marginalia.nu")
+               .crawlDepth(10)
+               .urls(List.of()) // add specific URLs to crawl here
+               .build();
+
+       CrawledDomain crawlData = crawl(specs);
+
+       assertEquals("REDIRECT", crawlData.crawlerStatus);
+       assertEquals("www.marginalia.nu", crawlData.redirectDomain);
+       assertTrue(crawlData.doc.isEmpty());
+
+       var processedData = process();
+
+       assertNotNull(processedData);
+       assertTrue(processedData.documents.isEmpty());
+   }
+
+   @Test
+   public void testBlockedDomain() throws IOException {
+       // Attempt to fetch a domain that is blocked
+       var specs = CrawlSpecRecord.builder()
+               .domain("search.marginalia.nu")
+               .crawlDepth(10)
+               .urls(List.of()) // add specific URLs to crawl here
+               .build();
+
+       CrawledDomain crawlData = crawl(specs, d->false); // simulate blocking by blacklisting everything
+
+       assertEquals("ERROR", crawlData.crawlerStatus);
+       assertEquals("BLOCKED;IP not allowed", crawlData.crawlerStatusDesc);
+       assertTrue(crawlData.doc.isEmpty());
+
+       var processedData = process();
+
+       assertNotNull(processedData);
+       assertTrue(processedData.documents.isEmpty());
+   }
+
+   @Test
+   public void crawlSunnyDay() throws IOException {
        var specs = CrawlSpecRecord.builder()
                .domain("www.marginalia.nu")
                .crawlDepth(10)
@ -55,12 +145,20 @@ public class CrawlingThenConvertingIntegrationTest {
                .build();

        CrawledDomain domain = crawl(specs);
+       assertFalse(domain.doc.isEmpty());
+       assertEquals("OK", domain.crawlerStatus);
+       assertEquals("www.marginalia.nu", domain.domain);

-       List<SerializableCrawlData> data = new ArrayList<>();
-       data.add(domain);
-       data.addAll(domain.doc);
+       boolean hasRobotsTxt = domain.doc.stream().map(doc -> doc.url).anyMatch(url -> url.endsWith("/robots.txt"));
+       assertFalse(hasRobotsTxt, "Robots.txt should not leave the crawler");

-       var output = domainProcessor.process(SerializableCrawlDataStream.fromIterator(data.iterator()));
+       var output = process();
+
+       assertNotNull(output);
+       assertFalse(output.documents.isEmpty());
+       assertEquals(new EdgeDomain("www.marginalia.nu"), output.domain);
+       assertEquals(DomainIndexingState.ACTIVE, output.state);

        for (var doc : output.documents) {
            if (doc.isOk()) {
@ -73,12 +171,122 @@ public class CrawlingThenConvertingIntegrationTest {

    }

-   private CrawledDomain crawl(CrawlSpecRecord specs) {
+   @Test
+   public void crawlContentTypes() throws IOException {
+       var specs = CrawlSpecRecord.builder()
+               .domain("www.marginalia.nu")
+               .crawlDepth(5)
+               .urls(List.of(
+                       "https://www.marginalia.nu/sanic.png",
+                       "https://www.marginalia.nu/invalid"
+               ))
+               .build();
+
+       CrawledDomain domain = crawl(specs);
+       assertFalse(domain.doc.isEmpty());
+       assertEquals("OK", domain.crawlerStatus);
+       assertEquals("www.marginalia.nu", domain.domain);
+
+       Set<String> allUrls = domain.doc.stream().map(doc -> doc.url).collect(Collectors.toSet());
+       assertTrue(allUrls.contains("https://www.marginalia.nu/sanic.png"), "Should have record for image despite blocked content type");
+       assertTrue(allUrls.contains("https://www.marginalia.nu/invalid"), "Should have record for invalid URL");
+
+       var output = process();
+
+       assertNotNull(output);
+       assertFalse(output.documents.isEmpty());
+       assertEquals(new EdgeDomain("www.marginalia.nu"), output.domain);
+       assertEquals(DomainIndexingState.ACTIVE, output.state);
+
+       for (var doc : output.documents) {
+           if (doc.isOk()) {
+               System.out.println(doc.url + "\t" + doc.state + "\t" + doc.details.title);
+           }
+           else {
+               System.out.println(doc.url + "\t" + doc.state + "\t" + doc.stateReason);
+           }
+       }
+   }
+
+   @Test
+   public void crawlRobotsTxt() throws IOException {
+       var specs = CrawlSpecRecord.builder()
+               .domain("search.marginalia.nu")
+               .crawlDepth(5)
+               .urls(List.of(
+                       "https://search.marginalia.nu/search?q=hello+world"
+               ))
+               .build();
+
+       CrawledDomain domain = crawl(specs);
+       assertFalse(domain.doc.isEmpty());
+       assertEquals("OK", domain.crawlerStatus);
+       assertEquals("search.marginalia.nu", domain.domain);
+
+       Set<String> allUrls = domain.doc.stream().map(doc -> doc.url).collect(Collectors.toSet());
+       assertTrue(allUrls.contains("https://search.marginalia.nu/search"), "We expect a record for entities that are forbidden");
+
+       var output = process();
+
+       assertNotNull(output);
+       assertFalse(output.documents.isEmpty());
+       assertEquals(new EdgeDomain("search.marginalia.nu"), output.domain);
+       assertEquals(DomainIndexingState.ACTIVE, output.state);
+
+       for (var doc : output.documents) {
+           if (doc.isOk()) {
+               System.out.println(doc.url + "\t" + doc.state + "\t" + doc.details.title);
+           }
+           else {
+               System.out.println(doc.url + "\t" + doc.state + "\t" + doc.stateReason);
+           }
+       }
+   }
+
+   private ProcessedDomain process() {
+       try (var stream = new ParquetSerializableCrawlDataStream(fileName2)) {
+           return domainProcessor.process(stream);
+       }
+       catch (Exception e) {
+           Assertions.fail(e);
+           return null; // unreachable
+       }
+   }
+
+   private CrawledDomain crawl(CrawlSpecRecord specs) throws IOException {
+       return crawl(specs, domain -> true);
+   }
+
+   private CrawledDomain crawl(CrawlSpecRecord specs, Predicate<EdgeDomain> domainBlacklist) throws IOException {
        List<SerializableCrawlData> data = new ArrayList<>();

-       new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, data::add).fetch();
+       try (var recorder = new WarcRecorder(fileName)) {
+           new CrawlerRetreiver(httpFetcher, new DomainProber(domainBlacklist), specs, recorder).fetch();
+       }
+
+       CrawledDocumentParquetRecordFileWriter.convertWarc(specs.domain,
+               new UserAgent("test"),
+               fileName, fileName2);
+
+       try (var reader = new ParquetSerializableCrawlDataStream(fileName2)) {
+           while (reader.hasNext()) {
+               var next = reader.next();
+               logger.info("{}", next);
+               data.add(next);
+           }
+       }

-       CrawledDomain domain = data.stream().filter(CrawledDomain.class::isInstance).map(CrawledDomain.class::cast).findFirst().get();
+       CrawledDomain domain = data.stream()
+               .filter(CrawledDomain.class::isInstance)
+               .map(CrawledDomain.class::cast)
+               .findFirst()
+               .get();
+
        data.stream().filter(CrawledDocument.class::isInstance).map(CrawledDocument.class::cast).forEach(domain.doc::add);
        return domain;
    }
@@ -0,0 +1,81 @@
+package nu.marginalia.converting.sideload.warc;
+
+import com.google.inject.AbstractModule;
+import com.google.inject.Guice;
+import nu.marginalia.converting.ConverterModule;
+import nu.marginalia.converting.model.ProcessedDocument;
+import nu.marginalia.converting.model.ProcessedDomain;
+import nu.marginalia.converting.processor.ConverterDomainTypes;
+import nu.marginalia.converting.sideload.SideloaderProcessing;
+import org.junit.jupiter.api.AfterEach;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+import org.mockito.Mockito;
+import org.netpreserve.jwarc.WarcWriter;
+
+import java.io.IOException;
+import java.net.URI;
+import java.net.URISyntaxException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.ArrayList;
+import java.util.List;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
+import static org.mockito.Mockito.when;
+
+class WarcSideloaderTest extends AbstractModule {
+    SideloaderProcessing processing;
+
+    Path warcFile;
+
+    @BeforeEach
+    public void setUp() throws IOException {
+        processing = Guice.createInjector(new ConverterModule(), this)
+                .getInstance(SideloaderProcessing.class);
+        warcFile = Files.createTempFile(getClass().getSimpleName(), ".warc.gz");
+    }
+
+    @AfterEach
+    public void tearDown() throws IOException {
+        Files.deleteIfExists(warcFile);
+    }
+
+    public void configure() {
+        var domainTypesMock = Mockito.mock(ConverterDomainTypes.class);
+        when(domainTypesMock.isBlog(Mockito.any())).thenReturn(false);
+
+        bind(ConverterDomainTypes.class).toInstance(domainTypesMock);
+    }
+
+    @Test
+    public void test() throws IOException {
+        try (var writer = new WarcWriter(Files.newOutputStream(warcFile))) {
+            writer.fetch(new URI("https://www.marginalia.nu/"));
+            writer.fetch(new URI("https://www.marginalia.nu/log/93_atags/"));
+            writer.fetch(new URI("https://www.marginalia.nu/links/"));
+        } catch (URISyntaxException e) {
+            throw new RuntimeException(e);
+        }
+
+        ProcessedDomain domain;
+        List<ProcessedDocument> docs = new ArrayList<>();
+
+        try (var sideloader = new WarcSideloader(warcFile, processing)) {
+            domain = sideloader.getDomain();
+            sideloader.getDocumentsStream().forEachRemaining(docs::add);
+        } catch (Exception e) {
+            throw new RuntimeException(e);
+        }
+
+        assertNotNull(domain);
+        assertEquals(3, docs.size());
+
+        List<String> fetchedUrls = docs.stream().map(doc -> doc.url).map(Object::toString).toList();
+        assertEquals(List.of(
+                        "https://www.marginalia.nu/",
+                        "https://www.marginalia.nu/log/93_atags/",
+                        "https://www.marginalia.nu/links/"),
+                fetchedUrls);
+    }
+}
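The test above drives jwarc's WarcWriter directly against the live site and then replays the file through the new WarcSideloader. As a rough sketch of the same flow outside the test harness (the file path and the `processing` wiring are illustrative assumptions, not part of this change; the API calls mirror the test above):

    // Sketch only: path and wiring are hypothetical; calls follow the test above.
    Path warc = Path.of("/tmp/example.warc.gz");
    try (var writer = new WarcWriter(Files.newOutputStream(warc))) {
        writer.fetch(new URI("https://www.marginalia.nu/"));   // records request + response pairs
    }
    try (var sideloader = new WarcSideloader(warc, processing)) {
        var domain = sideloader.getDomain();
        sideloader.getDocumentsStream().forEachRemaining(doc -> {
            // hand each ProcessedDocument to the converter pipeline
        });
    }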
@@ -41,6 +41,7 @@ dependencies {
     implementation project(':code:features-convert:anchor-keywords')
     implementation project(':code:features-crawl:crawl-blocklist')
     implementation project(':code:features-crawl:link-parser')
+    implementation project(':code:features-crawl:content-type')
 
     implementation libs.bundles.slf4j
 
@@ -48,6 +49,7 @@ dependencies {
     implementation libs.guice
     implementation libs.gson
     implementation libs.zstd
+    implementation libs.jwarc
     implementation libs.crawlercommons
     implementation libs.okhttp3
     implementation libs.jsoup
@@ -1,83 +0,0 @@
-package nu.marginalia.crawl;
-
-import lombok.SneakyThrows;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.util.concurrent.Semaphore;
-import java.util.concurrent.TimeUnit;
-import java.util.concurrent.atomic.AtomicBoolean;
-
-public class CrawlLimiter {
-    public static final int maxPoolSize = Integer.getInteger("crawler.pool-size", 256);
-
-    // Thresholds for throttling task-spawning. Note there's a bit of hysteresis to this
-    private final long THROTTLE_TRIGGER_FREE_RAM = Runtime.getRuntime().maxMemory() / 4;
-    private final long THROTTLE_RELEASE_FREE_RAM = Runtime.getRuntime().maxMemory() / 2;
-
-    private final Semaphore taskSemCount = new Semaphore(maxPoolSize);
-
-    // When set to true, the crawler will wait before starting additional tasks
-    private final AtomicBoolean throttle = new AtomicBoolean(false);
-    private static final Logger logger = LoggerFactory.getLogger(CrawlLimiter.class);
-
-    public CrawlLimiter() {
-        Thread monitorThread = new Thread(this::monitor, "Memory Monitor");
-        monitorThread.setDaemon(true);
-        monitorThread.start();
-    }
-
-    @SneakyThrows
-    public void monitor() {
-        for (;;) {
-            synchronized (throttle) {
-                boolean oldThrottle = throttle.get();
-                boolean newThrottle = oldThrottle;
-
-                if (Runtime.getRuntime().maxMemory() == Long.MAX_VALUE) {
-                    // According to the spec this may happen, although it seems to rarely
-                    // be the case in practice
-                    logger.warn("Memory based throttling disabled (set Xmx)");
-                    return;
-                }
-
-                final long freeMemory = Runtime.getRuntime().maxMemory() - Runtime.getRuntime().totalMemory();
-
-                if (oldThrottle && freeMemory > THROTTLE_RELEASE_FREE_RAM) {
-                    newThrottle = false;
-                    logger.warn("Memory based throttling released");
-                }
-                else if (!oldThrottle && freeMemory < THROTTLE_TRIGGER_FREE_RAM) {
-                    newThrottle = true;
-                    logger.warn("Memory based throttling triggered");
-
-                    // Try to GC
-                    System.gc();
-                }
-
-                throttle.set(newThrottle);
-
-                if (!newThrottle) {
-                    throttle.notifyAll();
-                }
-                if (newThrottle != oldThrottle) {
-                    logger.warn("Memory based throttling set to {}", newThrottle);
-                }
-            }
-
-            TimeUnit.SECONDS.sleep(1);
-        }
-    }
-
-    @SneakyThrows
-    public void waitForEnoughRAM() {
-        while (throttle.get()) {
-            synchronized (throttle) {
-                throttle.wait(30000);
-            }
-        }
-    }
-
-}
@@ -13,10 +13,13 @@ import nu.marginalia.atags.source.AnchorTagsSourceFactory;
 import nu.marginalia.crawl.retreival.CrawlDataReference;
 import nu.marginalia.crawl.retreival.DomainProber;
 import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl;
+import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
 import nu.marginalia.crawl.spec.CrawlSpecProvider;
 import nu.marginalia.crawl.spec.DbCrawlSpecProvider;
 import nu.marginalia.crawl.spec.ParquetCrawlSpecProvider;
 import nu.marginalia.crawling.io.CrawledDomainReader;
+import nu.marginalia.crawling.io.CrawlerOutputFile;
+import nu.marginalia.crawling.parquet.CrawledDocumentParquetRecordFileWriter;
 import nu.marginalia.crawlspec.CrawlSpecFileNames;
 import nu.marginalia.storage.FileStorageService;
 import nu.marginalia.model.crawlspec.CrawlSpecRecord;
@@ -27,18 +30,17 @@ import nu.marginalia.mq.inbox.MqSingleShotInbox;
 import nu.marginalia.process.control.ProcessHeartbeatImpl;
 import nu.marginalia.process.log.WorkLog;
 import nu.marginalia.service.module.DatabaseModule;
-import nu.marginalia.crawling.io.CrawledDomainWriter;
 import nu.marginalia.crawl.retreival.CrawlerRetreiver;
-import nu.marginalia.crawl.retreival.fetcher.HttpFetcher;
 import nu.marginalia.util.SimpleBlockingThreadPool;
 import okhttp3.ConnectionPool;
 import okhttp3.Dispatcher;
-import okhttp3.internal.Util;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 import java.io.IOException;
+import java.nio.file.Files;
 import java.nio.file.Path;
+import java.nio.file.StandardCopyOption;
 import java.sql.SQLException;
 import java.util.*;
 import java.util.concurrent.*;
@@ -49,13 +51,8 @@ import static nu.marginalia.mqapi.ProcessInboxNames.CRAWLER_INBOX;
 public class CrawlerMain {
     private final static Logger logger = LoggerFactory.getLogger(CrawlerMain.class);
 
-    private final ProcessHeartbeatImpl heartbeat;
-    private final ConnectionPool connectionPool = new ConnectionPool(5, 10, TimeUnit.SECONDS);
-
-    private final Dispatcher dispatcher = new Dispatcher(new ThreadPoolExecutor(0, Integer.MAX_VALUE, 5, TimeUnit.SECONDS,
-            new SynchronousQueue<>(), Util.threadFactory("OkHttp Dispatcher", true)));
-
     private final UserAgent userAgent;
+    private final ProcessHeartbeatImpl heartbeat;
     private final MessageQueueFactory messageQueueFactory;
     private final DomainProber domainProber;
     private final FileStorageService fileStorageService;
@@ -66,13 +63,12 @@ public class CrawlerMain {
     private final SimpleBlockingThreadPool pool;
 
     private final Map<String, String> processingIds = new ConcurrentHashMap<>();
-    private final CrawledDomainReader reader = new CrawledDomainReader();
 
     final AbortMonitor abortMonitor = AbortMonitor.getInstance();
 
     volatile int totalTasks;
     final AtomicInteger tasksDone = new AtomicInteger(0);
-    private final CrawlLimiter limiter = new CrawlLimiter();
+    private HttpFetcherImpl fetcher;
 
     @Inject
     public CrawlerMain(UserAgent userAgent,
@@ -83,8 +79,8 @@ public class CrawlerMain {
                        DbCrawlSpecProvider dbCrawlSpecProvider,
                        AnchorTagsSourceFactory anchorTagsSourceFactory,
                        Gson gson) {
-        this.heartbeat = heartbeat;
         this.userAgent = userAgent;
+        this.heartbeat = heartbeat;
        this.messageQueueFactory = messageQueueFactory;
        this.domainProber = domainProber;
        this.fileStorageService = fileStorageService;
@@ -93,8 +89,14 @@ public class CrawlerMain {
        this.gson = gson;
        this.node = processConfiguration.node();
 
-        // maybe need to set -Xss for JVM to deal with this?
-        pool = new SimpleBlockingThreadPool("CrawlerPool", CrawlLimiter.maxPoolSize, 1);
+        pool = new SimpleBlockingThreadPool("CrawlerPool",
+                Integer.getInteger("crawler.pool-size", 256),
+                1);
+
+        fetcher = new HttpFetcherImpl(userAgent.uaString(),
+                new Dispatcher(Executors.newVirtualThreadPerTaskExecutor()),
+                new ConnectionPool(5, 10, TimeUnit.SECONDS)
+        );
    }
 
    public static void main(String... args) throws Exception {
@@ -141,6 +143,7 @@ public class CrawlerMain {
    public void run(CrawlSpecProvider specProvider, Path outputDir) throws InterruptedException, IOException {
 
        heartbeat.start();
+
        try (WorkLog workLog = new WorkLog(outputDir.resolve("crawler.log"));
             AnchorTagsSource anchorTagsSource = anchorTagsSourceFactory.create(specProvider.getDomains())
        ) {
@@ -175,6 +178,7 @@ public class CrawlerMain {
                    activePoolCount = newActivePoolCount;
                }
            }
+
        }
        catch (Exception ex) {
            logger.warn("Exception in crawler", ex);
@@ -211,27 +215,48 @@ public class CrawlerMain {
        @Override
        public void run() throws Exception {
 
-            limiter.waitForEnoughRAM();
+            Path newWarcFile = CrawlerOutputFile.createWarcPath(outputDir, id, domain, CrawlerOutputFile.WarcFileVersion.LIVE);
+            Path tempFile = CrawlerOutputFile.createWarcPath(outputDir, id, domain, CrawlerOutputFile.WarcFileVersion.TEMP);
+            Path finalWarcFile = CrawlerOutputFile.createWarcPath(outputDir, id, domain, CrawlerOutputFile.WarcFileVersion.FINAL);
+            Path parquetFile = CrawlerOutputFile.createParquetPath(outputDir, id, domain);
 
-            HttpFetcher fetcher = new HttpFetcherImpl(userAgent.uaString(), dispatcher, connectionPool);
+            if (Files.exists(newWarcFile)) {
+                Files.move(newWarcFile, tempFile, StandardCopyOption.REPLACE_EXISTING);
+            }
+            else {
+                Files.deleteIfExists(tempFile);
+            }
 
-            try (CrawledDomainWriter writer = new CrawledDomainWriter(outputDir, domain, id);
+            try (var warcRecorder = new WarcRecorder(newWarcFile); // write to a temp file for now
+                 var retriever = new CrawlerRetreiver(fetcher, domainProber, specification, warcRecorder);
                 CrawlDataReference reference = getReference())
            {
                Thread.currentThread().setName("crawling:" + domain);
 
                var domainLinks = anchorTagsSource.getAnchorTags(domain);
 
-                var retreiver = new CrawlerRetreiver(fetcher, domainProber, specification, writer::accept);
-                int size = retreiver.fetch(domainLinks, reference);
+                if (Files.exists(tempFile)) {
+                    retriever.syncAbortedRun(tempFile);
+                    Files.delete(tempFile);
+                }
 
-                workLog.setJobToFinished(domain, writer.getOutputFile().toString(), size);
+                int size = retriever.fetch(domainLinks, reference);
+
+                // Delete the reference crawl data if it's not the same as the new one
+                // (mostly a case when migrating from legacy->warc)
+                reference.delete();
+
+                CrawledDocumentParquetRecordFileWriter
+                        .convertWarc(domain, userAgent, newWarcFile, parquetFile);
+
+                workLog.setJobToFinished(domain, parquetFile.toString(), size);
                heartbeat.setProgress(tasksDone.incrementAndGet() / (double) totalTasks);
 
                logger.info("Fetched {}", domain);
 
            } catch (Exception e) {
                logger.error("Error fetching domain " + domain, e);
+                Files.deleteIfExists(newWarcFile);
+                Files.deleteIfExists(tempFile);
            }
            finally {
                // We don't need to double-count these; it's also kept int he workLog
@@ -242,8 +267,7 @@ public class CrawlerMain {
 
        private CrawlDataReference getReference() {
            try {
-                var dataStream = reader.createDataStream(outputDir, domain, id);
-                return new CrawlDataReference(dataStream);
+                return new CrawlDataReference(CrawledDomainReader.createDataStream(outputDir, domain, id));
            } catch (IOException e) {
                logger.debug("Failed to read previous crawl data for {}", specification.domain);
                return new CrawlDataReference();
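The rewritten CrawlTask.run() above replaces the old CrawledDomainWriter pipeline with a WARC-first lifecycle. Reduced to its essentials (names taken from the hunk above; error handling and the final-WARC path omitted), the per-domain flow is roughly:

    // Condensed sketch of the flow in CrawlTask.run() above; not a drop-in implementation.
    Path liveWarc = CrawlerOutputFile.createWarcPath(outputDir, id, domain, CrawlerOutputFile.WarcFileVersion.LIVE);
    Path tempWarc = CrawlerOutputFile.createWarcPath(outputDir, id, domain, CrawlerOutputFile.WarcFileVersion.TEMP);
    Path parquet  = CrawlerOutputFile.createParquetPath(outputDir, id, domain);

    if (Files.exists(liveWarc)) {                      // a previous run died mid-crawl
        Files.move(liveWarc, tempWarc, StandardCopyOption.REPLACE_EXISTING);
    }

    try (var recorder  = new WarcRecorder(liveWarc);
         var retriever = new CrawlerRetreiver(fetcher, domainProber, specification, recorder)) {
        if (Files.exists(tempWarc)) {
            retriever.syncAbortedRun(tempWarc);        // replay the aborted WARC into the frontier
            Files.delete(tempWarc);
        }
        int size = retriever.fetch(domainLinks, reference);
        CrawledDocumentParquetRecordFileWriter.convertWarc(domain, userAgent, liveWarc, parquet);
        workLog.setJobToFinished(domain, parquet.toString(), size);
    }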
@@ -5,14 +5,19 @@ import com.google.common.hash.Hashing;
 import nu.marginalia.crawling.io.SerializableCrawlDataStream;
 import nu.marginalia.crawling.model.CrawledDocument;
 import nu.marginalia.lsh.EasyLSH;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 import javax.annotation.Nullable;
 import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
 
 /** A reference to a domain that has been crawled before. */
 public class CrawlDataReference implements AutoCloseable {
 
     private final SerializableCrawlDataStream data;
+    private static final Logger logger = LoggerFactory.getLogger(CrawlDataReference.class);
 
     public CrawlDataReference(SerializableCrawlDataStream data) {
         this.data = data;
@@ -22,6 +27,15 @@ public class CrawlDataReference implements AutoCloseable {
         this(SerializableCrawlDataStream.empty());
     }
 
+    /** Delete the associated data from disk, if it exists */
+    public void delete() throws IOException {
+        Path filePath = data.path();
+
+        if (filePath != null) {
+            Files.deleteIfExists(filePath);
+        }
+    }
+
     @Nullable
     public CrawledDocument nextDocument() {
         try {
@@ -32,17 +46,16 @@ public class CrawlDataReference implements AutoCloseable {
             }
         }
         catch (IOException ex) {
-            ex.printStackTrace();
+            logger.error("Failed to read next document", ex);
         }
 
         return null;
     }
 
-    public boolean isContentBodySame(CrawledDocument one, CrawledDocument other) {
-        assert one.documentBody != null;
-        assert other.documentBody != null;
+    public boolean isContentBodySame(String one, String other) {
 
-        final long contentHashOne = contentHash(one.documentBody);
-        final long contentHashOther = contentHash(other.documentBody);
+        final long contentHashOne = contentHash(one);
+        final long contentHashOther = contentHash(other);
 
         return EasyLSH.hammingDistance(contentHashOne, contentHashOther) < 4;
     }
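CrawlDataReference now owns both the lifecycle of the previous crawl's data (delete()) and the body comparison used to decide whether a page has meaningfully changed, with isContentBodySame() taking plain strings instead of CrawledDocument pairs. A hypothetical caller, sketched under the assumption that it has the old record and the freshly fetched body as a string (the real caller, CrawlerRevisitor, is not shown in this hunk):

    // Hypothetical revisit check; illustrative only.
    static boolean unchanged(CrawlDataReference reference, CrawledDocument oldDoc, String newBody) {
        if (oldDoc == null || oldDoc.documentBody == null || newBody == null)
            return false;
        // LSH-based comparison: hamming distance of the content hashes must be < 4
        return reference.isContentBodySame(oldDoc.documentBody, newBody);
    }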
@@ -20,8 +20,18 @@ public class CrawlDelayTimer {
         this.delayTime = delayTime;
     }
 
+    /** Call when we've gotten an HTTP 429 response.  This will wait a moment, and then
+     * set a flag that slows down the main crawl delay as well. */
+    public void waitRetryDelay(RateLimitException ex) throws InterruptedException {
+        slowDown = true;
+
+        int delay = ex.retryAfter();
+
+        Thread.sleep(Math.clamp(delay, 100, 5000));
+    }
+
     @SneakyThrows
-    public void delay(long spentTime) {
+    public void waitFetchDelay(long spentTime) {
         long sleepTime = delayTime;
 
         if (sleepTime >= 1) {
@@ -30,10 +40,6 @@ public class CrawlDelayTimer {
 
             Thread.sleep(min(sleepTime - spentTime, 5000));
         }
-        else if (slowDown) {
-            // Additional delay when the server is signalling it wants slower requests
-            Thread.sleep( DEFAULT_CRAWL_DELAY_MIN_MS);
-        }
         else {
             // When no crawl delay is specified, lean toward twice the fetch+process time,
             // within sane limits. This means slower servers get slower crawling, and faster
@@ -48,10 +54,10 @@ public class CrawlDelayTimer {
 
             Thread.sleep(sleepTime - spentTime);
         }
-    }
 
-    /** Increase the delay between requests if the server is signalling it wants slower requests with HTTP 429 */
-    public void slowDown() {
-        slowDown = true;
+        if (slowDown) {
+            // Additional delay when the server is signalling it wants slower requests
+            Thread.sleep( DEFAULT_CRAWL_DELAY_MIN_MS);
+        }
     }
 }
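The timer's two entry points split the old delay()/slowDown() pair: waitRetryDelay() reacts to an HTTP 429 and flags the domain for slower crawling, while waitFetchDelay() applies the regular per-request pacing (now including the extra 429 penalty). The retry loop in CrawlerRetreiver.fetchWriteAndSleep() further down in this diff uses them roughly like this (simplified; the real loop also records the result in the WARC):

    // Simplified from fetchWriteAndSleep() below; not a standalone implementation.
    long startTime = System.currentTimeMillis();
    for (int i = 0; i <= HTTP_429_RETRY_LIMIT; i++) {
        try {
            fetchedDoc = fetcher.fetchContent(top, warcRecorder, contentTags);
            break;
        }
        catch (RateLimitException ex) {
            timer.waitRetryDelay(ex);   // sleep 100..5000 ms and slow down subsequent fetches
        }
    }
    timer.waitFetchDelay(System.currentTimeMillis() - startTime);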
@@ -0,0 +1,91 @@
+package nu.marginalia.crawl.retreival;
+
+import nu.marginalia.crawling.body.HttpFetchResult;
+import nu.marginalia.crawling.model.CrawledDocument;
+import nu.marginalia.crawling.model.CrawlerDocumentStatus;
+import nu.marginalia.model.EdgeUrl;
+
+import java.time.LocalDateTime;
+import java.util.Objects;
+
+public class CrawledDocumentFactory {
+
+    public static CrawledDocument createHardErrorRsp(EdgeUrl url, Exception why) {
+        return CrawledDocument.builder()
+                .crawlerStatus(CrawlerDocumentStatus.ERROR.toString())
+                .crawlerStatusDesc(why.getClass().getSimpleName() + ": " + why.getMessage())
+                .timestamp(LocalDateTime.now().toString())
+                .url(url.toString())
+                .build();
+    }
+
+    public static CrawledDocument createUnknownHostError(EdgeUrl url) {
+        return CrawledDocument.builder()
+                .crawlerStatus(CrawlerDocumentStatus.ERROR.toString())
+                .crawlerStatusDesc("Unknown Host")
+                .timestamp(LocalDateTime.now().toString())
+                .url(url.toString())
+                .build();
+    }
+
+    public static CrawledDocument createTimeoutErrorRsp(EdgeUrl url) {
+        return CrawledDocument.builder()
+                .crawlerStatus("Timeout")
+                .timestamp(LocalDateTime.now().toString())
+                .url(url.toString())
+                .build();
+    }
+
+    public static CrawledDocument createErrorResponse(EdgeUrl url, HttpFetchResult.ResultOk rsp, CrawlerDocumentStatus status, String why) {
+        return CrawledDocument.builder()
+                .crawlerStatus(status.toString())
+                .crawlerStatusDesc(why)
+                .headers(rsp.headers().toString())
+                .contentType(Objects.requireNonNullElse(rsp.headers().get("Content-Type"), ""))
+                .timestamp(LocalDateTime.now().toString())
+                .httpStatus(rsp.statusCode())
+                .url(url.toString())
+                .build();
+    }
+    public static CrawledDocument createErrorResponse(EdgeUrl url, String contentType, int statusCode, CrawlerDocumentStatus status, String why) {
+        return CrawledDocument.builder()
+                .crawlerStatus(status.toString())
+                .crawlerStatusDesc(why)
+                .headers("")
+                .contentType(contentType)
+                .timestamp(LocalDateTime.now().toString())
+                .httpStatus(statusCode)
+                .url(url.toString())
+                .build();
+    }
+
+    public static CrawledDocument createRedirectResponse(EdgeUrl url, HttpFetchResult.ResultOk rsp, EdgeUrl responseUrl) {
+
+        return CrawledDocument.builder()
+                .crawlerStatus(CrawlerDocumentStatus.REDIRECT.name())
+                .redirectUrl(responseUrl.toString())
+                .headers(rsp.headers().toString())
+                .contentType(Objects.requireNonNullElse(rsp.headers().get("Content-Type"), ""))
+                .timestamp(LocalDateTime.now().toString())
+                .httpStatus(rsp.statusCode())
+                .url(url.toString())
+                .build();
+    }
+
+    public static CrawledDocument createRobotsError(EdgeUrl url) {
+        return CrawledDocument.builder()
+                .url(url.toString())
+                .timestamp(LocalDateTime.now().toString())
+                .httpStatus(-1)
+                .crawlerStatus(CrawlerDocumentStatus.ROBOTS_TXT.name())
+                .build();
+    }
+    public static CrawledDocument createRetryError(EdgeUrl url) {
+        return CrawledDocument.builder()
+                .url(url.toString())
+                .timestamp(LocalDateTime.now().toString())
+                .httpStatus(429)
+                .crawlerStatus(CrawlerDocumentStatus.ERROR.name())
+                .build();
+    }
+}
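CrawledDocumentFactory collects the error/redirect record builders that used to live inline in CrawlerRetreiver (the createRobotsError/createRetryError methods removed further down in this diff). A hedged usage sketch, with the enclosing method and the exception mapping invented for illustration:

    // Illustrative only: the enclosing method and exception mapping are hypothetical.
    CrawledDocument recordFailure(EdgeUrl url, Exception ex) {
        if (ex instanceof java.net.UnknownHostException)
            return CrawledDocumentFactory.createUnknownHostError(url);
        if (ex instanceof java.net.SocketTimeoutException)
            return CrawledDocumentFactory.createTimeoutErrorRsp(url);
        return CrawledDocumentFactory.createHardErrorRsp(url, ex);
    }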
@@ -3,11 +3,15 @@ package nu.marginalia.crawl.retreival;
 import com.google.common.hash.HashFunction;
 import com.google.common.hash.Hashing;
 import crawlercommons.robots.SimpleRobotRules;
-import lombok.SneakyThrows;
 import nu.marginalia.atags.model.DomainLinks;
+import nu.marginalia.contenttype.ContentType;
 import nu.marginalia.crawl.retreival.fetcher.ContentTags;
 import nu.marginalia.crawl.retreival.fetcher.HttpFetcher;
-import nu.marginalia.crawl.retreival.fetcher.SitemapRetriever;
+import nu.marginalia.crawling.body.HttpFetchResult;
+import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
+import nu.marginalia.crawl.retreival.revisit.CrawlerRevisitor;
+import nu.marginalia.crawl.retreival.revisit.DocumentWithReference;
+import nu.marginalia.crawl.retreival.sitemap.SitemapFetcher;
 import nu.marginalia.link_parser.LinkParser;
 import nu.marginalia.crawling.model.*;
 import nu.marginalia.ip_blocklist.UrlBlocklist;
@@ -19,54 +23,49 @@ import org.jsoup.nodes.Document;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import javax.annotation.Nullable;
+import java.io.IOException;
 import java.net.InetAddress;
 import java.net.UnknownHostException;
-import java.time.LocalDateTime;
+import java.nio.file.Path;
 import java.util.*;
-import java.util.function.Consumer;
 
-public class CrawlerRetreiver {
+public class CrawlerRetreiver implements AutoCloseable {
 
     private static final int MAX_ERRORS = 20;
+    private static final int HTTP_429_RETRY_LIMIT = 1; // Retry 429s once
 
     private final HttpFetcher fetcher;
 
     private final String domain;
-    private final Consumer<SerializableCrawlData> crawledDomainWriter;
 
     private static final LinkParser linkParser = new LinkParser();
     private static final Logger logger = LoggerFactory.getLogger(CrawlerRetreiver.class);
 
-    private static final HashFunction hashMethod = Hashing.murmur3_128(0);
     private static final UrlBlocklist urlBlocklist = new UrlBlocklist();
     private static final LinkFilterSelector linkFilterSelector = new LinkFilterSelector();
 
     private final DomainProber domainProber;
-    private final SitemapRetriever sitemapRetriever;
     private final DomainCrawlFrontier crawlFrontier;
+    private final WarcRecorder warcRecorder;
+    private final CrawlerRevisitor crawlerRevisitor;
 
+    private final SitemapFetcher sitemapFetcher;
     int errorCount = 0;
 
-    /** recrawlState tag for documents that had a HTTP status 304 */
-    private static final String documentWasRetainedTag = "RETAINED/304";
-
-    /** recrawlState tag for documents that had a 200 status but were identical to a previous version */
-    private static final String documentWasSameTag = "SAME-BY-COMPARISON";
-
     public CrawlerRetreiver(HttpFetcher fetcher,
                             DomainProber domainProber,
                             CrawlSpecRecord specs,
-                            Consumer<SerializableCrawlData> writer) {
+                            WarcRecorder warcRecorder)
+    {
+        this.warcRecorder = warcRecorder;
         this.fetcher = fetcher;
         this.domainProber = domainProber;
 
         domain = specs.domain;
 
-        crawledDomainWriter = writer;
-        this.crawlFrontier = new DomainCrawlFrontier(new EdgeDomain(domain), Objects.requireNonNullElse(specs.urls, List.of()), specs.crawlDepth);
-        sitemapRetriever = fetcher.createSitemapRetriever();
+        crawlFrontier = new DomainCrawlFrontier(new EdgeDomain(domain), Objects.requireNonNullElse(specs.urls, List.of()), specs.crawlDepth);
+        crawlerRevisitor = new CrawlerRevisitor(crawlFrontier, this, warcRecorder);
+        sitemapFetcher = new SitemapFetcher(crawlFrontier, fetcher.createSitemapRetriever());
 
         // We must always crawl the index page first, this is assumed when fingerprinting the server
         var fst = crawlFrontier.peek();
@@ -90,43 +89,42 @@ public class CrawlerRetreiver {
     public int fetch(DomainLinks domainLinks, CrawlDataReference oldCrawlData) {
         final DomainProber.ProbeResult probeResult = domainProber.probeDomain(fetcher, domain, crawlFrontier.peek());
 
-        return switch (probeResult) {
-            case DomainProber.ProbeResultOk(EdgeUrl probedUrl) -> crawlDomain(oldCrawlData, probedUrl, domainLinks);
-            case DomainProber.ProbeResultError(CrawlerDomainStatus status, String desc) -> {
-                crawledDomainWriter.accept(
-                        CrawledDomain.builder()
-                                .crawlerStatus(status.name())
-                                .crawlerStatusDesc(desc)
-                                .domain(domain)
-                                .ip(findIp(domain))
-                                .build()
-                );
-                yield 1;
-            }
-            case DomainProber.ProbeResultRedirect(EdgeDomain redirectDomain) -> {
-                crawledDomainWriter.accept(
-                        CrawledDomain.builder()
-                                .crawlerStatus(CrawlerDomainStatus.REDIRECT.name())
-                                .crawlerStatusDesc("Redirected to different domain")
-                                .redirectDomain(redirectDomain.toString())
-                                .domain(domain)
-                                .ip(findIp(domain))
-                                .build()
-                );
-                yield 1;
-            }
-        };
+        try {
+            return crawlDomain(oldCrawlData, probeResult, domainLinks);
+        }
+        catch (Exception ex) {
+            logger.error("Error crawling domain {}", domain, ex);
+            return 0;
+        }
     }
 
-    private int crawlDomain(CrawlDataReference oldCrawlData, EdgeUrl rootUrl, DomainLinks domainLinks) {
+    public void syncAbortedRun(Path warcFile) {
+        var resync = new CrawlerWarcResynchronizer(crawlFrontier, warcRecorder);
+
+        resync.run(warcFile);
+    }
+
+    private int crawlDomain(CrawlDataReference oldCrawlData, DomainProber.ProbeResult probeResult, DomainLinks domainLinks) throws IOException, InterruptedException {
         String ip = findIp(domain);
+
+        EdgeUrl rootUrl;
+
+        warcRecorder.writeWarcinfoHeader(ip, new EdgeDomain(domain), probeResult);
+
+        if (!(probeResult instanceof DomainProber.ProbeResultOk ok)) {
+            return 1;
+        }
+        else {
+            rootUrl = ok.probedUrl();
+        }
 
         assert !crawlFrontier.isEmpty();
 
-        final SimpleRobotRules robotsRules = fetcher.fetchRobotRules(crawlFrontier.peek().domain);
+        final SimpleRobotRules robotsRules = fetcher.fetchRobotRules(crawlFrontier.peek().domain, warcRecorder);
         final CrawlDelayTimer delayTimer = new CrawlDelayTimer(robotsRules.getCrawlDelay());
 
-        sniffRootDocument(delayTimer, rootUrl);
+        sniffRootDocument(rootUrl);
 
         // Play back the old crawl data (if present) and fetch the documents comparing etags and last-modified
         int recrawled = recrawl(oldCrawlData, robotsRules, delayTimer);
@@ -140,9 +138,15 @@ public class CrawlerRetreiver {
         crawlFrontier.addAllToQueue(domainLinks.getUrls(rootUrl.proto));
 
         // Add links from the sitemap to the crawl frontier
-        downloadSitemaps(robotsRules, rootUrl);
+        sitemapFetcher.downloadSitemaps(robotsRules, rootUrl);
 
-        CrawledDomain ret = new CrawledDomain(domain, null, CrawlerDomainStatus.OK.name(), null, ip, new ArrayList<>(), null);
+        CrawledDomain ret = new CrawledDomain(domain,
+                null,
+                CrawlerDomainStatus.OK.name(),
+                null,
+                ip,
+                new ArrayList<>(),
+                null);
 
         int fetchedCount = recrawled;
 
@@ -154,7 +158,7 @@ public class CrawlerRetreiver {
             var top = crawlFrontier.takeNextUrl();
 
             if (!robotsRules.isAllowed(top.toString())) {
-                crawledDomainWriter.accept(createRobotsError(top));
+                warcRecorder.flagAsRobotsTxtError(top);
                 continue;
             }
 
@@ -177,149 +181,43 @@ public class CrawlerRetreiver {
                 continue;
 
-            if (fetchWriteAndSleep(top, delayTimer, DocumentWithReference.empty()).isPresent()) {
-                fetchedCount++;
+            try {
+                if (fetchWriteAndSleep(top, delayTimer, DocumentWithReference.empty()).isOk()) {
+                    fetchedCount++;
+                }
+            }
+            catch (InterruptedException ex) {
+                Thread.currentThread().interrupt();
+                break;
             }
         }
 
         ret.cookies = fetcher.getCookies();
 
-        crawledDomainWriter.accept(ret);
-
         return fetchedCount;
     }
 
-    /** Performs a re-crawl of old documents, comparing etags and last-modified */
-    private int recrawl(CrawlDataReference oldCrawlData,
-                        SimpleRobotRules robotsRules,
-                        CrawlDelayTimer delayTimer) {
-        int recrawled = 0;
-        int retained = 0;
-
-        for (;;) {
-            CrawledDocument doc = oldCrawlData.nextDocument();
-
-            if (doc == null) {
-                break;
-            }
-
-            // This Shouldn't Happen (TM)
-            var urlMaybe = EdgeUrl.parse(doc.url);
-            if (urlMaybe.isEmpty()) continue;
-            var url = urlMaybe.get();
-
-            // If we've previously 404:d on this URL, we'll refrain from trying to fetch it again
-            if (doc.httpStatus == 404) {
-                crawlFrontier.addVisited(url);
-                continue;
-            }
-
-            if (doc.httpStatus != 200) continue;
-
-            if (!robotsRules.isAllowed(url.toString())) {
-                crawledDomainWriter.accept(createRobotsError(url));
-                continue;
-            }
-            if (!crawlFrontier.filterLink(url))
-                continue;
-            if (!crawlFrontier.addVisited(url))
-                continue;
-
-            if (recrawled > 5
-                    && retained > 0.9 * recrawled
-                    && Math.random() < 0.9)
-            {
-                // Since it looks like most of these documents haven't changed,
-                // we'll load the documents directly; but we do this in a random
-                // fashion to make sure we eventually catch changes over time
-
-                crawledDomainWriter.accept(doc);
-                crawlFrontier.addVisited(url);
-                continue;
-            }
-
-            // GET the document with the stored document as a reference
-            // providing etag and last-modified headers, so we can recycle the
-            // document if it hasn't changed without actually downloading it
-
-            var fetchedDocOpt = fetchWriteAndSleep(url,
-                    delayTimer,
-                    new DocumentWithReference(doc, oldCrawlData));
-            if (fetchedDocOpt.isEmpty()) continue;
-
-            if (documentWasRetainedTag.equals(fetchedDocOpt.get().recrawlState)) retained ++;
-            else if (documentWasSameTag.equals(fetchedDocOpt.get().recrawlState)) retained ++;
-
-            recrawled ++;
-        }
-
-        return recrawled;
+    /** Using the old crawl data, fetch the documents comparing etags and last-modified */
+    private int recrawl(CrawlDataReference oldCrawlData, SimpleRobotRules robotsRules, CrawlDelayTimer delayTimer) throws InterruptedException {
+        return crawlerRevisitor.recrawl(oldCrawlData, robotsRules, delayTimer);
     }
 
-    private void downloadSitemaps(SimpleRobotRules robotsRules, EdgeUrl rootUrl) {
-        List<String> sitemaps = robotsRules.getSitemaps();
-
-        List<EdgeUrl> urls = new ArrayList<>(sitemaps.size());
-        if (!sitemaps.isEmpty()) {
-            for (var url : sitemaps) {
-                EdgeUrl.parse(url).ifPresent(urls::add);
-            }
-        }
-        else {
-            urls.add(rootUrl.withPathAndParam("/sitemap.xml", null));
-        }
-
-        downloadSitemaps(urls);
-    }
-
-    private void downloadSitemaps(List<EdgeUrl> urls) {
-
-        Set<String> checkedSitemaps = new HashSet<>();
-
-        for (var url : urls) {
-            // Let's not download sitemaps from other domains for now
-            if (!crawlFrontier.isSameDomain(url)) {
-                continue;
-            }
-
-            if (checkedSitemaps.contains(url.path))
-                continue;
-
-            var sitemap = sitemapRetriever.fetchSitemap(url);
-            if (sitemap.isEmpty()) {
-                continue;
-            }
-
-            // ensure we don't try to download this sitemap again
-            // (don't move this up, as we may want to check the same
-            // path with different protocols until we find one that works)
-
-            checkedSitemaps.add(url.path);
-
-            crawlFrontier.addAllToQueue(sitemap);
-        }
-
-        logger.debug("Queue is now {}", crawlFrontier.queueSize());
-    }
-
-    private void sniffRootDocument(CrawlDelayTimer delayTimer, EdgeUrl rootUrl) {
+    private void sniffRootDocument(EdgeUrl rootUrl) {
         try {
             logger.debug("Configuring link filter");
 
             var url = rootUrl.withPathAndParam("/", null);
 
-            var maybeSample = fetchUrl(url, delayTimer, DocumentWithReference.empty()).filter(sample -> sample.httpStatus == 200);
-            if (maybeSample.isEmpty())
+            var result = fetcher.fetchContent(url, warcRecorder, ContentTags.empty());
+            if (!(result instanceof HttpFetchResult.ResultOk ok))
                 return;
-            var sample = maybeSample.get();
 
-            if (sample.documentBody == null)
+            var optDoc = ok.parseDocument();
+            if (optDoc.isEmpty())
                 return;
 
             // Sniff the software based on the sample document
-            var doc = Jsoup.parse(sample.documentBody);
+            var doc = optDoc.get();
             crawlFrontier.setLinkFilter(linkFilterSelector.selectFilter(doc));
 
             for (var link : doc.getElementsByTag("link")) {
@@ -338,7 +236,7 @@ public class CrawlerRetreiver {
                 linkParser.parseLink(url, href)
                         .filter(crawlFrontier::isSameDomain)
                         .map(List::of)
-                        .ifPresent(this::downloadSitemaps);
+                        .ifPresent(sitemapFetcher::downloadSitemaps);
             }
         }
         catch (Exception ex) {
@@ -346,41 +244,67 @@ public class CrawlerRetreiver {
         }
     }
 
-    private Optional<CrawledDocument> fetchWriteAndSleep(EdgeUrl top,
-                                                         CrawlDelayTimer timer,
-                                                         DocumentWithReference reference) {
+    public HttpFetchResult fetchWriteAndSleep(EdgeUrl top,
+                                              CrawlDelayTimer timer,
+                                              DocumentWithReference reference) throws InterruptedException
+    {
         logger.debug("Fetching {}", top);
 
+        HttpFetchResult fetchedDoc = new HttpFetchResult.ResultNone();
+
         long startTime = System.currentTimeMillis();
+        var contentTags = reference.getContentTags();
 
-        var docOpt = fetchUrl(top, timer, reference);
-
-        if (docOpt.isPresent()) {
-            var doc = docOpt.get();
-
-            if (!Objects.equals(doc.recrawlState, documentWasRetainedTag)
-                    && reference.isContentBodySame(doc))
-            {
-                // The document didn't change since the last time
-                doc.recrawlState = documentWasSameTag;
-            }
-
-            crawledDomainWriter.accept(doc);
-
-            if (doc.url != null) {
-                // We may have redirected to a different path
-                EdgeUrl.parse(doc.url).ifPresent(crawlFrontier::addVisited);
-            }
-
-            if ("ERROR".equals(doc.crawlerStatus) && doc.httpStatus != 404) {
-                errorCount++;
-            }
-
+        // Fetch the document, retrying if we get a rate limit exception
+        for (int i = 0; i <= HTTP_429_RETRY_LIMIT; i++) {
+            try {
+                fetchedDoc = fetcher.fetchContent(top, warcRecorder, contentTags);
+                break;
+            }
+            catch (RateLimitException ex) {
+                timer.waitRetryDelay(ex);
+            }
+            catch (Exception ex) {
+                logger.warn("Failed to fetch {}", top, ex);
+                fetchedDoc = new HttpFetchResult.ResultException(ex);
+            }
         }
 
-        timer.delay(System.currentTimeMillis() - startTime);
-
-        return docOpt;
+        try {
+            if (fetchedDoc instanceof HttpFetchResult.ResultOk ok) {
+                var docOpt = ok.parseDocument();
+                if (docOpt.isPresent()) {
+                    var doc = docOpt.get();
+
+                    crawlFrontier.enqueueLinksFromDocument(top, doc);
+                    crawlFrontier.addVisited(new EdgeUrl(ok.uri()));
+                }
+            }
+            else if (fetchedDoc instanceof HttpFetchResult.Result304Raw && reference.doc() != null) {
+                var doc = reference.doc();
+
+                warcRecorder.writeReferenceCopy(top, doc.contentType, doc.httpStatus, doc.documentBody);
+
+                fetchedDoc = new HttpFetchResult.Result304ReplacedWithReference(doc.url,
+                        new ContentType(doc.contentType, "UTF-8"),
+                        doc.documentBody);
+
+                var parsed = Jsoup.parse(doc.documentBody);
+
+                crawlFrontier.enqueueLinksFromDocument(top, parsed);
+                crawlFrontier.addVisited(top);
+            }
+            else if (fetchedDoc instanceof HttpFetchResult.ResultException ex) {
+                errorCount ++;
+            }
+        }
+        catch (Exception ex) {
+            logger.error("Error parsing document {}", top, ex);
+        }
+
+        timer.waitFetchDelay(System.currentTimeMillis() - startTime);
+
+        return fetchedDoc;
     }
 
     private boolean isAllowedProtocol(String proto) {
@@ -388,91 +312,6 @@ public class CrawlerRetreiver {
                 || proto.equalsIgnoreCase("https");
     }
 
-    private Optional<CrawledDocument> fetchUrl(EdgeUrl top, CrawlDelayTimer timer, DocumentWithReference reference) {
-        try {
-            var contentTags = reference.getContentTags();
-            var fetchedDoc = tryDownload(top, timer, contentTags);
-
-            CrawledDocument doc = reference.replaceOn304(fetchedDoc);
-
-            if (doc.documentBody != null) {
-                doc.documentBodyHash = createHash(doc.documentBody);
-
-                var parsedDoc = Jsoup.parse(doc.documentBody);
-                EdgeUrl url = new EdgeUrl(doc.url);
-
-                findLinks(url, parsedDoc);
-                findCanonicalUrl(url, parsedDoc)
-                        .ifPresent(canonicalLink -> doc.canonicalUrl = canonicalLink.toString());
-            }
-
-            return Optional.of(doc);
-        }
-        catch (Exception ex) {
-            logger.warn("Failed to process document {}", top);
-        }
-
-        return Optional.empty();
-
-    }
-
-    @SneakyThrows
-    private CrawledDocument tryDownload(EdgeUrl top, CrawlDelayTimer timer, ContentTags tags) {
-        for (int i = 0; i < 2; i++) {
-            try {
-                var doc = fetcher.fetchContent(top, tags);
-                doc.recrawlState = "NEW";
-                return doc;
-            }
-            catch (RateLimitException ex) {
-                timer.slowDown();
-
-                int delay = ex.retryAfter();
-                if (delay > 0 && delay < 5000) {
-                    Thread.sleep(delay);
-                }
-            }
-        }
-
-        return createRetryError(top);
-    }
-
-    private String createHash(String documentBodyHash) {
-        return hashMethod.hashUnencodedChars(documentBodyHash).toString();
-    }
-
-    private void findLinks(EdgeUrl baseUrl, Document parsed) {
-        baseUrl = linkParser.getBaseLink(parsed, baseUrl);
-
-        for (var link : parsed.getElementsByTag("a")) {
-            linkParser.parseLink(baseUrl, link).ifPresent(crawlFrontier::addToQueue);
-        }
-        for (var link : parsed.getElementsByTag("frame")) {
-            linkParser.parseFrame(baseUrl, link).ifPresent(crawlFrontier::addToQueue);
-        }
-        for (var link : parsed.getElementsByTag("iframe")) {
-            linkParser.parseFrame(baseUrl, link).ifPresent(crawlFrontier::addToQueue);
-        }
-        for (var link : parsed.getElementsByTag("link")) {
-            String rel = link.attr("rel");
-
-            if (rel.equalsIgnoreCase("next") || rel.equalsIgnoreCase("prev")) {
-                linkParser.parseLink(baseUrl, link).ifPresent(crawlFrontier::addToQueue);
-            }
-        }
-    }
-
-    private Optional<EdgeUrl> findCanonicalUrl(EdgeUrl baseUrl, Document parsed) {
-        baseUrl = baseUrl.domain.toRootUrl();
-
-        for (var link : parsed.select("link[rel=canonical]")) {
-            return linkParser.parseLink(baseUrl, link);
-        }
-
-        return Optional.empty();
-    }
-
     private String findIp(String domain) {
         try {
             return InetAddress.getByName(domain).getHostAddress();
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private CrawledDocument createRobotsError(EdgeUrl url) {
|
@Override
|
||||||
return CrawledDocument.builder()
|
public void close() throws Exception {
|
||||||
.url(url.toString())
|
warcRecorder.close();
|
||||||
.timestamp(LocalDateTime.now().toString())
|
|
||||||
.httpStatus(-1)
|
|
||||||
.crawlerStatus(CrawlerDocumentStatus.ROBOTS_TXT.name())
|
|
||||||
.build();
|
|
||||||
}
|
|
||||||
private CrawledDocument createRetryError(EdgeUrl url) {
|
|
||||||
return CrawledDocument.builder()
|
|
||||||
.url(url.toString())
|
|
||||||
.timestamp(LocalDateTime.now().toString())
|
|
||||||
.httpStatus(429)
|
|
||||||
.crawlerStatus(CrawlerDocumentStatus.ERROR.name())
|
|
||||||
.build();
|
|
||||||
}
|
|
||||||
|
|
||||||
private record DocumentWithReference(
|
|
||||||
@Nullable CrawledDocument doc,
|
|
||||||
@Nullable CrawlDataReference reference) {
|
|
||||||
|
|
||||||
private static final DocumentWithReference emptyInstance = new DocumentWithReference(null, null);
|
|
||||||
public static DocumentWithReference empty() {
|
|
||||||
return emptyInstance;
|
|
||||||
}
|
|
||||||
|
|
||||||
public boolean isContentBodySame(CrawledDocument newDoc) {
|
|
||||||
if (reference == null)
|
|
||||||
return false;
|
|
||||||
if (doc == null)
|
|
||||||
return false;
|
|
||||||
if (doc.documentBody == null)
|
|
||||||
return false;
|
|
||||||
if (newDoc.documentBody == null)
|
|
||||||
return false;
|
|
||||||
|
|
||||||
return reference.isContentBodySame(doc, newDoc);
|
|
||||||
}
|
|
||||||
|
|
||||||
private ContentTags getContentTags() {
|
|
||||||
if (null == doc)
|
|
||||||
return ContentTags.empty();
|
|
||||||
|
|
||||||
String headers = doc.headers;
|
|
||||||
if (headers == null)
|
|
||||||
return ContentTags.empty();
|
|
||||||
|
|
||||||
String[] headersLines = headers.split("\n");
|
|
||||||
|
|
||||||
String lastmod = null;
|
|
||||||
String etag = null;
|
|
||||||
|
|
||||||
for (String line : headersLines) {
|
|
||||||
if (line.toLowerCase().startsWith("etag:")) {
|
|
||||||
etag = line.substring(5).trim();
|
|
||||||
}
|
|
||||||
if (line.toLowerCase().startsWith("last-modified:")) {
|
|
||||||
lastmod = line.substring(14).trim();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return new ContentTags(etag, lastmod);
|
|
||||||
}
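The etag and last-modified values recovered here are later painted onto the revisit request via ContentTags.paint. As a rough illustration of what that amounts to, here is a hedged sketch using OkHttp's Request.Builder and the standard conditional-request headers; it is not the actual ContentTags implementation.

import okhttp3.Request;

class ConditionalRequestSketch {
    // Turn a stored ETag / Last-Modified pair into a conditional GET.
    // The server may then answer 304 Not Modified instead of resending the body.
    static Request conditional(String url, String etag, String lastModified) {
        var builder = new Request.Builder().get().url(url);

        if (etag != null)
            builder.addHeader("If-None-Match", etag);              // matches a stored ETag
        if (lastModified != null)
            builder.addHeader("If-Modified-Since", lastModified);  // matches a stored Last-Modified

        return builder.build();
    }
}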
|
|
||||||
|
|
||||||
public boolean isEmpty() {
|
|
||||||
return doc == null || reference == null;
|
|
||||||
}
|
|
||||||
|
|
||||||
/** If the provided document has HTTP status 304, and the reference document is provided,
|
|
||||||
* return the reference document; otherwise return the provided document.
|
|
||||||
*/
|
|
||||||
public CrawledDocument replaceOn304(CrawledDocument fetchedDoc) {
|
|
||||||
|
|
||||||
if (doc == null)
|
|
||||||
return fetchedDoc;
|
|
||||||
|
|
||||||
// HTTP status 304 is NOT MODIFIED, which means the document is the same as it was when
|
|
||||||
// we fetched it last time. We can recycle the reference document.
|
|
||||||
if (fetchedDoc.httpStatus != 304)
|
|
||||||
return fetchedDoc;
|
|
||||||
|
|
||||||
var ret = doc;
|
|
||||||
ret.recrawlState = documentWasRetainedTag;
|
|
||||||
ret.timestamp = LocalDateTime.now().toString();
|
|
||||||
return ret;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -0,0 +1,107 @@
package nu.marginalia.crawl.retreival;

import nu.marginalia.crawling.body.DocumentBodyExtractor;
import nu.marginalia.crawling.body.DocumentBodyResult;
import nu.marginalia.crawling.body.HttpFetchResult;
import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
import nu.marginalia.model.EdgeUrl;
import org.jsoup.Jsoup;
import org.netpreserve.jwarc.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.nio.file.Path;

/**
 * This class is responsible for resynchronizing the crawl frontier with a partially written
 * warc file. This may happen if the crawl is interrupted or crashes.
 * <p>
 * This is best-effort and not guaranteed to recover all data, but it should limit
 * the amount of data that is lost and needs to be re-crawled in the event of an unexpected
 * shutdown.
 */
public class CrawlerWarcResynchronizer {
    private final DomainCrawlFrontier crawlFrontier;
    private final WarcRecorder recorder;
    private static final Logger logger = LoggerFactory.getLogger(CrawlerWarcResynchronizer.class);

    public CrawlerWarcResynchronizer(DomainCrawlFrontier crawlFrontier, WarcRecorder recorder) {
        this.crawlFrontier = crawlFrontier;
        this.recorder = recorder;
    }

    public void run(Path tempFile) {
        // First pass, enqueue links
        try (var reader = new WarcReader(tempFile)) {
            WarcXResponseReference.register(reader);
            WarcXEntityRefused.register(reader);

            for (var item : reader) {
                accept(item);
            }
        } catch (IOException e) {
            logger.info(STR."Failed to read full warc file \{tempFile}", e);
        }

        // Second pass, copy records to the new warc file
        try (var reader = new WarcReader(tempFile)) {
            for (var item : reader) {
                recorder.resync(item);
            }
        } catch (IOException e) {
            logger.info(STR."Failed to read full warc file \{tempFile}", e);
        }
    }

    public void accept(WarcRecord item) {
        try {
            if (item instanceof WarcResponse rsp) {
                response(rsp);
            } else if (item instanceof WarcRequest req) {
                request(req);
            } else if (item instanceof WarcXEntityRefused refused) {
                refused(refused);
            }

        }
        catch (Exception ex) {
            logger.info(STR."Failed to process warc record \{item}", ex);
        }
    }

    private void refused(WarcXEntityRefused refused) {
        // In general, we don't want to re-crawl urls that were refused,
        // but to permit circumstances to change over time, we'll
        // allow for a small chance of re-probing these entries

        if (Math.random() > 0.1) {
            crawlFrontier.addVisited(new EdgeUrl(refused.targetURI()));
        }
    }

    private void request(WarcRequest request) {
        EdgeUrl.parse(request.target()).ifPresent(crawlFrontier::addVisited);
    }

    private void response(WarcResponse rsp) {
        var url = new EdgeUrl(rsp.targetURI());

        crawlFrontier.addVisited(url);

        try {
            var response = HttpFetchResult.importWarc(rsp);
            DocumentBodyExtractor
                    .asString(response)
                    .ifPresent((ct, body) ->
                    {
                        var doc = Jsoup.parse(body);
                        crawlFrontier.enqueueLinksFromDocument(url, doc);
                    });
        }
        catch (Exception e) {
            logger.info(STR."Failed to parse response body for \{url}", e);
        }
    }

}
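The resynchronizer above leans entirely on jwarc's WarcReader. A self-contained sketch of the same read loop, useful for inspecting a partially written crawl WARC by hand; the file path is illustrative, and only record types used above are shown:

import org.netpreserve.jwarc.WarcReader;
import org.netpreserve.jwarc.WarcRequest;
import org.netpreserve.jwarc.WarcResponse;

import java.io.IOException;
import java.nio.file.Path;

class WarcInspectionSketch {
    public static void main(String[] args) throws IOException {
        Path warcFile = Path.of("crawl-data.warc.gz"); // hypothetical path

        try (var reader = new WarcReader(warcFile)) {
            for (var record : reader) {
                if (record instanceof WarcResponse rsp) {
                    System.out.println("response: " + rsp.targetURI());
                } else if (record instanceof WarcRequest req) {
                    System.out.println("request:  " + req.target());
                }
            }
        }
    }
}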
@ -3,14 +3,19 @@ package nu.marginalia.crawl.retreival;
|
|||||||
import com.google.common.hash.HashFunction;
|
import com.google.common.hash.HashFunction;
|
||||||
import it.unimi.dsi.fastutil.longs.LongOpenHashSet;
|
import it.unimi.dsi.fastutil.longs.LongOpenHashSet;
|
||||||
import nu.marginalia.ip_blocklist.UrlBlocklist;
|
import nu.marginalia.ip_blocklist.UrlBlocklist;
|
||||||
|
import nu.marginalia.link_parser.LinkParser;
|
||||||
import nu.marginalia.model.EdgeDomain;
|
import nu.marginalia.model.EdgeDomain;
|
||||||
import nu.marginalia.model.EdgeUrl;
|
import nu.marginalia.model.EdgeUrl;
|
||||||
|
import org.jsoup.nodes.Document;
|
||||||
|
|
||||||
import java.net.URISyntaxException;
|
import java.net.URISyntaxException;
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
import java.util.function.Predicate;
|
import java.util.function.Predicate;
|
||||||
|
|
||||||
public class DomainCrawlFrontier {
|
public class DomainCrawlFrontier {
|
||||||
|
|
||||||
|
private static final LinkParser linkParser = new LinkParser();
|
||||||
|
|
||||||
private final ArrayDeque<String> queue;
|
private final ArrayDeque<String> queue;
|
||||||
|
|
||||||
// To save the number of strings kept in memory,
|
// To save the number of strings kept in memory,
|
||||||
@ -45,9 +50,14 @@ public class DomainCrawlFrontier {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Increase the depth of the crawl by a factor. If the current depth is smaller
|
||||||
|
* than the number of already visited documents, the base depth will be adjusted
|
||||||
|
* to the visited count first.
|
||||||
|
*/
|
||||||
public void increaseDepth(double depthIncreaseFactor) {
|
public void increaseDepth(double depthIncreaseFactor) {
|
||||||
depth = (int)(depth * depthIncreaseFactor);
|
depth = (int)(Math.max(visited.size(), depth) * depthIncreaseFactor);
|
||||||
}
|
}
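Worked example of the adjusted formula: with depth = 100, visited.size() = 150 and depthIncreaseFactor = 1.5, the new depth is (int)(Math.max(150, 100) * 1.5) = 225, whereas the old formula would have produced 150 and effectively stalled a crawl that had already overshot its nominal depth.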
|
||||||
|
|
||||||
public void setLinkFilter(Predicate<EdgeUrl> linkFilter) {
|
public void setLinkFilter(Predicate<EdgeUrl> linkFilter) {
|
||||||
this.linkFilter = linkFilter;
|
this.linkFilter = linkFilter;
|
||||||
}
|
}
|
||||||
@ -141,4 +151,27 @@ public class DomainCrawlFrontier {
|
|||||||
public int queueSize() {
|
public int queueSize() {
|
||||||
return queue.size();
|
return queue.size();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void enqueueLinksFromDocument(EdgeUrl baseUrl, Document parsed) {
|
||||||
|
baseUrl = linkParser.getBaseLink(parsed, baseUrl);
|
||||||
|
|
||||||
|
for (var link : parsed.getElementsByTag("a")) {
|
||||||
|
linkParser.parseLink(baseUrl, link).ifPresent(this::addToQueue);
|
||||||
|
}
|
||||||
|
for (var link : parsed.getElementsByTag("frame")) {
|
||||||
|
linkParser.parseFrame(baseUrl, link).ifPresent(this::addToQueue);
|
||||||
|
}
|
||||||
|
for (var link : parsed.getElementsByTag("iframe")) {
|
||||||
|
linkParser.parseFrame(baseUrl, link).ifPresent(this::addToQueue);
|
||||||
|
}
|
||||||
|
for (var link : parsed.getElementsByTag("link")) {
|
||||||
|
String rel = link.attr("rel");
|
||||||
|
|
||||||
|
if (rel.equalsIgnoreCase("next") || rel.equalsIgnoreCase("prev")) {
|
||||||
|
linkParser.parseLink(baseUrl, link).ifPresent(this::addToQueue);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
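enqueueLinksFromDocument delegates URL resolution to LinkParser, which is not shown in this diff. A minimal stand-alone sketch of the same harvesting idea, using only Jsoup's built-in absolute-URL resolution; LinkParser's filtering and normalization are assumed and not reproduced here:

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import java.util.ArrayList;
import java.util.List;

class LinkHarvestSketch {
    // Collect absolute link targets from anchors and (i)frames, roughly what
    // enqueueLinksFromDocument feeds into the crawl frontier.
    static List<String> harvest(String html, String baseUrl) {
        Document doc = Jsoup.parse(html, baseUrl);
        List<String> out = new ArrayList<>();

        for (var a : doc.getElementsByTag("a"))
            out.add(a.absUrl("href"));        // resolved against the base URL
        for (var frame : doc.getElementsByTag("frame"))
            out.add(frame.absUrl("src"));
        for (var iframe : doc.getElementsByTag("iframe"))
            out.add(iframe.absUrl("src"));

        out.removeIf(String::isBlank);        // absUrl() returns "" when unresolvable
        return out;
    }
}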
|
||||||
|
@ -0,0 +1,86 @@
package nu.marginalia.crawl.retreival.fetcher;

import nu.marginalia.crawling.body.ContentTypeLogic;
import nu.marginalia.model.EdgeUrl;
import okhttp3.OkHttpClient;
import okhttp3.Request;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.net.SocketTimeoutException;
import java.util.Objects;

public class ContentTypeProber {

    private static final Logger logger = LoggerFactory.getLogger(ContentTypeProber.class);
    private final String userAgent;
    private final OkHttpClient client;
    private final ContentTypeLogic contentTypeLogic = new ContentTypeLogic();

    public ContentTypeProber(String userAgent, OkHttpClient httpClient) {
        this.userAgent = userAgent;
        this.client = httpClient;
    }

    /** Probe the content type of the given URL with a HEAD request.
     * This is used to detect binary files, which we don't want to crawl.
     * <p>
     * If the URL redirects, the final URL is returned, to avoid redundant
     * requests.
     *
     * @param url The URL to probe
     * @return A ContentTypeProbeResult
     */
    public ContentTypeProbeResult probeContentType(EdgeUrl url) {
        logger.debug("Probing suspected binary {}", url);

        var headBuilder = new Request.Builder().head()
                .addHeader("User-agent", userAgent)
                .addHeader("Accept-Encoding", "gzip")
                .url(url.toString());

        var head = headBuilder.build();
        var call = client.newCall(head);

        try (var rsp = call.execute()) {
            var contentTypeHeader = rsp.header("Content-type");

            if (contentTypeHeader != null && !contentTypeLogic.isAllowableContentType(contentTypeHeader)) {
                return new ContentTypeProbeResult.BadContentType(contentTypeHeader, rsp.code());
            }

            // Update the URL to the final URL of the HEAD request, otherwise we might end up doing

            // HEAD 301 url1 -> url2
            // HEAD 200 url2
            // GET 301 url1 -> url2
            // GET 200 url2

            // which is not what we want. We want to do as few requests as possible so as not to raise
            // too many eyebrows when looking at the logs on the target server; it's probably desirable
            // that the traffic looks like it makes sense, as opposed to looking like a broken bot.

            var redirectUrl = new EdgeUrl(rsp.request().url().toString());
            EdgeUrl ret;

            if (Objects.equals(redirectUrl.domain, url.domain)) ret = redirectUrl;
            else ret = url;

            return new ContentTypeProbeResult.Ok(ret);

        } catch (SocketTimeoutException ex) {
            return new ContentTypeProbeResult.Timeout();
        } catch (Exception ex) {
            logger.error("Error during fetching {}[{}]", ex.getClass().getSimpleName(), ex.getMessage());

            return new ContentTypeProbeResult.Exception(ex);
        }
    }

    public sealed interface ContentTypeProbeResult {
        record Ok(EdgeUrl resolvedUrl) implements ContentTypeProbeResult { }
        record BadContentType(String contentType, int statusCode) implements ContentTypeProbeResult { }
        record Timeout() implements ContentTypeProbeResult { }
        record Exception(java.lang.Exception ex) implements ContentTypeProbeResult { }
    }
}
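Since ContentTypeProbeResult is a sealed interface over small records, callers can branch on the outcome directly. A hedged usage sketch; the user agent string and URL are illustrative:

import nu.marginalia.crawl.retreival.fetcher.ContentTypeProber;
import nu.marginalia.crawl.retreival.fetcher.ContentTypeProber.ContentTypeProbeResult;
import nu.marginalia.model.EdgeUrl;
import okhttp3.OkHttpClient;

class ContentTypeProbeSketch {
    public static void main(String[] args) {
        var prober = new ContentTypeProber("search.marginalia.nu", new OkHttpClient());
        var url = EdgeUrl.parse("https://example.com/file.bin").orElseThrow(); // illustrative URL

        ContentTypeProbeResult result = prober.probeContentType(url);

        if (result instanceof ContentTypeProbeResult.Ok ok) {
            System.out.println("fetch " + ok.resolvedUrl());   // follow up with a GET against the resolved URL
        } else if (result instanceof ContentTypeProbeResult.BadContentType bad) {
            System.out.println("skip: " + bad.contentType() + " (status " + bad.statusCode() + ")");
        } else if (result instanceof ContentTypeProbeResult.Timeout) {
            System.out.println("skip: HEAD request timed out");
        } else if (result instanceof ContentTypeProbeResult.Exception err) {
            System.out.println("skip: probe failed: " + err.ex());
        }
    }
}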
@ -3,7 +3,8 @@ package nu.marginalia.crawl.retreival.fetcher;
|
|||||||
import com.google.inject.ImplementedBy;
|
import com.google.inject.ImplementedBy;
|
||||||
import crawlercommons.robots.SimpleRobotRules;
|
import crawlercommons.robots.SimpleRobotRules;
|
||||||
import nu.marginalia.crawl.retreival.RateLimitException;
|
import nu.marginalia.crawl.retreival.RateLimitException;
|
||||||
import nu.marginalia.crawling.model.CrawledDocument;
|
import nu.marginalia.crawling.body.HttpFetchResult;
|
||||||
|
import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
|
||||||
import nu.marginalia.model.EdgeDomain;
|
import nu.marginalia.model.EdgeDomain;
|
||||||
import nu.marginalia.model.EdgeUrl;
|
import nu.marginalia.model.EdgeUrl;
|
||||||
|
|
||||||
@ -18,9 +19,9 @@ public interface HttpFetcher {
|
|||||||
|
|
||||||
FetchResult probeDomain(EdgeUrl url);
|
FetchResult probeDomain(EdgeUrl url);
|
||||||
|
|
||||||
CrawledDocument fetchContent(EdgeUrl url, ContentTags tags) throws RateLimitException;
|
HttpFetchResult fetchContent(EdgeUrl url, WarcRecorder recorder, ContentTags tags) throws RateLimitException;
|
||||||
|
|
||||||
SimpleRobotRules fetchRobotRules(EdgeDomain domain);
|
SimpleRobotRules fetchRobotRules(EdgeDomain domain, WarcRecorder recorder);
|
||||||
|
|
||||||
SitemapRetriever createSitemapRetriever();
|
SitemapRetriever createSitemapRetriever();
|
||||||
}
|
}
|
||||||
|
@ -7,43 +7,41 @@ import crawlercommons.robots.SimpleRobotRulesParser;
|
|||||||
import lombok.SneakyThrows;
|
import lombok.SneakyThrows;
|
||||||
import nu.marginalia.crawl.retreival.Cookies;
|
import nu.marginalia.crawl.retreival.Cookies;
|
||||||
import nu.marginalia.crawl.retreival.RateLimitException;
|
import nu.marginalia.crawl.retreival.RateLimitException;
|
||||||
import nu.marginalia.crawling.model.CrawledDocument;
|
import nu.marginalia.crawl.retreival.fetcher.ContentTypeProber.ContentTypeProbeResult;
|
||||||
import nu.marginalia.crawling.model.CrawlerDocumentStatus;
|
import nu.marginalia.crawl.retreival.fetcher.socket.FastTerminatingSocketFactory;
|
||||||
import nu.marginalia.crawling.model.ContentType;
|
import nu.marginalia.crawl.retreival.fetcher.socket.IpInterceptingNetworkInterceptor;
|
||||||
|
import nu.marginalia.crawl.retreival.fetcher.socket.NoSecuritySSL;
|
||||||
|
import nu.marginalia.crawling.body.DocumentBodyExtractor;
|
||||||
|
import nu.marginalia.crawling.body.HttpFetchResult;
|
||||||
|
import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
|
||||||
|
import nu.marginalia.crawling.body.ContentTypeLogic;
|
||||||
import nu.marginalia.model.EdgeDomain;
|
import nu.marginalia.model.EdgeDomain;
|
||||||
import nu.marginalia.model.EdgeUrl;
|
import nu.marginalia.model.EdgeUrl;
|
||||||
import nu.marginalia.crawl.retreival.logic.ContentTypeLogic;
|
import okhttp3.ConnectionPool;
|
||||||
import nu.marginalia.crawl.retreival.logic.ContentTypeParser;
|
import okhttp3.Dispatcher;
|
||||||
import okhttp3.*;
|
import okhttp3.OkHttpClient;
|
||||||
import org.apache.commons.io.input.BOMInputStream;
|
import okhttp3.Request;
|
||||||
import org.apache.commons.lang3.StringUtils;
|
import org.apache.commons.lang3.StringUtils;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
import javax.net.ssl.SSLException;
|
|
||||||
import javax.net.ssl.X509TrustManager;
|
import javax.net.ssl.X509TrustManager;
|
||||||
import java.io.EOFException;
|
import java.util.List;
|
||||||
import java.io.IOException;
|
import java.util.Objects;
|
||||||
import java.net.*;
|
import java.util.Optional;
|
||||||
import java.nio.charset.Charset;
|
|
||||||
import java.nio.charset.IllegalCharsetNameException;
|
|
||||||
import java.nio.charset.StandardCharsets;
|
|
||||||
import java.nio.charset.UnsupportedCharsetException;
|
|
||||||
import java.time.LocalDateTime;
|
|
||||||
import java.util.*;
|
|
||||||
import java.util.concurrent.TimeUnit;
|
import java.util.concurrent.TimeUnit;
|
||||||
import java.util.zip.GZIPInputStream;
|
|
||||||
|
|
||||||
public class HttpFetcherImpl implements HttpFetcher {
|
public class HttpFetcherImpl implements HttpFetcher {
|
||||||
|
|
||||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||||
private final String userAgent;
|
private final String userAgent;
|
||||||
private final int maxFetchSize = 1024*512;
|
|
||||||
private final Cookies cookies = new Cookies();
|
private final Cookies cookies = new Cookies();
|
||||||
|
|
||||||
private static final SimpleRobotRulesParser robotsParser = new SimpleRobotRulesParser();
|
private static final SimpleRobotRulesParser robotsParser = new SimpleRobotRulesParser();
|
||||||
|
|
||||||
private final ContentTypeLogic contentTypeLogic = new ContentTypeLogic();
|
private static final ContentTypeLogic contentTypeLogic = new ContentTypeLogic();
|
||||||
|
private final ContentTypeProber contentTypeProber;
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void setAllowAllContentTypes(boolean allowAllContentTypes) {
|
public void setAllowAllContentTypes(boolean allowAllContentTypes) {
|
||||||
@ -64,6 +62,7 @@ public class HttpFetcherImpl implements HttpFetcher {
|
|||||||
return builder.sslSocketFactory(NoSecuritySSL.buildSocketFactory(), (X509TrustManager) NoSecuritySSL.trustAllCerts[0])
|
return builder.sslSocketFactory(NoSecuritySSL.buildSocketFactory(), (X509TrustManager) NoSecuritySSL.trustAllCerts[0])
|
||||||
.socketFactory(ftSocketFactory)
|
.socketFactory(ftSocketFactory)
|
||||||
.hostnameVerifier(NoSecuritySSL.buildHostnameVerifyer())
|
.hostnameVerifier(NoSecuritySSL.buildHostnameVerifyer())
|
||||||
|
.addNetworkInterceptor(new IpInterceptingNetworkInterceptor())
|
||||||
.connectionPool(pool)
|
.connectionPool(pool)
|
||||||
.cookieJar(cookies.getJar())
|
.cookieJar(cookies.getJar())
|
||||||
.followRedirects(true)
|
.followRedirects(true)
|
||||||
@ -92,13 +91,22 @@ public class HttpFetcherImpl implements HttpFetcher {
|
|||||||
{
|
{
|
||||||
this.client = createClient(dispatcher, connectionPool);
|
this.client = createClient(dispatcher, connectionPool);
|
||||||
this.userAgent = userAgent;
|
this.userAgent = userAgent;
|
||||||
|
this.contentTypeProber = new ContentTypeProber(userAgent, client);
|
||||||
}
|
}
|
||||||
|
|
||||||
public HttpFetcherImpl(@Named("user-agent") String userAgent) {
|
public HttpFetcherImpl(@Named("user-agent") String userAgent) {
|
||||||
this.client = createClient(null, new ConnectionPool());
|
this.client = createClient(null, new ConnectionPool());
|
||||||
this.userAgent = userAgent;
|
this.userAgent = userAgent;
|
||||||
|
this.contentTypeProber = new ContentTypeProber(userAgent, client);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Probe the domain to see if it is reachable, attempting to identify which schema to use,
|
||||||
|
* and if there are any redirects. This is done by one or more HEAD requests.
|
||||||
|
*
|
||||||
|
* @param url The URL to probe.
|
||||||
|
* @return The result of the probe, indicating the state and the URL.
|
||||||
|
*/
|
||||||
@Override
|
@Override
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
public FetchResult probeDomain(EdgeUrl url) {
|
public FetchResult probeDomain(EdgeUrl url) {
|
||||||
@ -130,8 +138,9 @@ public class HttpFetcherImpl implements HttpFetcher {
|
|||||||
|
|
||||||
@Override
|
@Override
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
public CrawledDocument fetchContent(EdgeUrl url,
|
public HttpFetchResult fetchContent(EdgeUrl url,
|
||||||
ContentTags contentTags)
|
WarcRecorder warcRecorder,
|
||||||
|
ContentTags contentTags)
|
||||||
throws RateLimitException
|
throws RateLimitException
|
||||||
{
|
{
|
||||||
|
|
||||||
@ -139,268 +148,54 @@ public class HttpFetcherImpl implements HttpFetcher {
|
|||||||
// looks like it might be something else, we perform a HEAD first to check the content type
|
// looks like it might be something else, we perform a HEAD first to check the content type
|
||||||
if (contentTags.isEmpty() && contentTypeLogic.isUrlLikeBinary(url))
|
if (contentTags.isEmpty() && contentTypeLogic.isUrlLikeBinary(url))
|
||||||
{
|
{
|
||||||
logger.debug("Probing suspected binary {}", url);
|
ContentTypeProbeResult probeResult = contentTypeProber.probeContentType(url);
|
||||||
|
if (probeResult instanceof ContentTypeProbeResult.Ok ok) {
|
||||||
var headBuilder = new Request.Builder().head()
|
url = ok.resolvedUrl();
|
||||||
.addHeader("User-agent", userAgent)
|
|
||||||
.url(url.toString())
|
|
||||||
.addHeader("Accept-Encoding", "gzip");
|
|
||||||
|
|
||||||
var head = headBuilder.build();
|
|
||||||
var call = client.newCall(head);
|
|
||||||
|
|
||||||
try (var rsp = call.execute()) {
|
|
||||||
var contentTypeHeader = rsp.header("Content-type");
|
|
||||||
if (contentTypeHeader != null && !contentTypeLogic.isAllowableContentType(contentTypeHeader)) {
|
|
||||||
return createErrorResponse(url, rsp, CrawlerDocumentStatus.BAD_CONTENT_TYPE, "Early probe failed");
|
|
||||||
}
|
|
||||||
|
|
||||||
// Update the URL to the final URL of the HEAD request, otherwise we might end up doing
|
|
||||||
|
|
||||||
// HEAD 301 url1 -> url2
|
|
||||||
// HEAD 200 url2
|
|
||||||
// GET 301 url1 -> url2
|
|
||||||
// GET 200 url2
|
|
||||||
|
|
||||||
// which is not what we want. Overall we want to do as few requests as possible to not raise
|
|
||||||
// too many eyebrows when looking at the logs on the target server. Overall it's probably desirable
|
|
||||||
// that it looks like the traffic makes sense, as opposed to looking like a broken bot.
|
|
||||||
|
|
||||||
var redirectUrl = new EdgeUrl(rsp.request().url().toString());
|
|
||||||
if (Objects.equals(redirectUrl.domain, url.domain))
|
|
||||||
url = redirectUrl;
|
|
||||||
}
|
}
|
||||||
catch (SocketTimeoutException ex) {
|
else if (probeResult instanceof ContentTypeProbeResult.BadContentType badContentType) {
|
||||||
return createTimeoutErrorRsp(url, ex);
|
warcRecorder.flagAsFailedContentTypeProbe(url, badContentType.contentType(), badContentType.statusCode());
|
||||||
|
return new HttpFetchResult.ResultNone();
|
||||||
}
|
}
|
||||||
catch (Exception ex) {
|
else if (probeResult instanceof ContentTypeProbeResult.BadContentType.Timeout timeout) {
|
||||||
logger.error("Error during fetching {}[{}]", ex.getClass().getSimpleName(), ex.getMessage());
|
warcRecorder.flagAsTimeout(url);
|
||||||
return createHardErrorRsp(url, ex);
|
return new HttpFetchResult.ResultNone();
|
||||||
|
}
|
||||||
|
else if (probeResult instanceof ContentTypeProbeResult.Exception exception) {
|
||||||
|
warcRecorder.flagAsError(url, exception.ex());
|
||||||
|
return new HttpFetchResult.ResultNone();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
var getBuilder = new Request.Builder().get();
|
var getBuilder = new Request.Builder().get();
|
||||||
|
|
||||||
getBuilder.addHeader("User-agent", userAgent)
|
getBuilder.url(url.toString())
|
||||||
.url(url.toString())
|
.addHeader("Accept-Encoding", "gzip")
|
||||||
.addHeader("Accept-Encoding", "gzip");
|
.addHeader("User-agent", userAgent);
|
||||||
|
|
||||||
contentTags.paint(getBuilder);
|
contentTags.paint(getBuilder);
|
||||||
|
|
||||||
var get = getBuilder.build();
|
HttpFetchResult result = warcRecorder.fetch(client, getBuilder.build());
|
||||||
var call = client.newCall(get);
|
|
||||||
|
|
||||||
try (var rsp = call.execute()) {
|
if (result instanceof HttpFetchResult.ResultOk ok) {
|
||||||
return extractBody(url, rsp);
|
if (ok.statusCode() == 429) {
|
||||||
}
|
String retryAfter = Objects.requireNonNullElse(ok.header("Retry-After"), "1000");
|
||||||
catch (RateLimitException rle) {
|
throw new RateLimitException(retryAfter);
|
||||||
throw rle;
|
|
||||||
}
|
|
||||||
catch (SocketTimeoutException ex) {
|
|
||||||
return createTimeoutErrorRsp(url, ex);
|
|
||||||
}
|
|
||||||
catch (UnknownHostException ex) {
|
|
||||||
return createUnknownHostError(url, ex);
|
|
||||||
}
|
|
||||||
catch (SocketException | ProtocolException | IllegalCharsetNameException | SSLException | EOFException ex) {
|
|
||||||
// This is a bit of a grab-bag of errors that crop up
|
|
||||||
// IllegalCharsetName is egg on our face,
|
|
||||||
// but SSLException and EOFException are probably the server's fault
|
|
||||||
|
|
||||||
return createHardErrorRsp(url, ex);
|
|
||||||
}
|
|
||||||
catch (Exception ex) {
|
|
||||||
logger.error("Error during fetching", ex);
|
|
||||||
return createHardErrorRsp(url, ex);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private CrawledDocument createHardErrorRsp(EdgeUrl url, Exception why) {
|
|
||||||
return CrawledDocument.builder()
|
|
||||||
.crawlerStatus(CrawlerDocumentStatus.ERROR.toString())
|
|
||||||
.crawlerStatusDesc(why.getClass().getSimpleName() + ": " + why.getMessage())
|
|
||||||
.timestamp(LocalDateTime.now().toString())
|
|
||||||
.url(url.toString())
|
|
||||||
.build();
|
|
||||||
}
|
|
||||||
|
|
||||||
private CrawledDocument createUnknownHostError(EdgeUrl url, Exception why) {
|
|
||||||
return CrawledDocument.builder()
|
|
||||||
.crawlerStatus(CrawlerDocumentStatus.ERROR.toString())
|
|
||||||
.crawlerStatusDesc("Unknown Host")
|
|
||||||
.timestamp(LocalDateTime.now().toString())
|
|
||||||
.url(url.toString())
|
|
||||||
.build();
|
|
||||||
}
|
|
||||||
|
|
||||||
private CrawledDocument createTimeoutErrorRsp(EdgeUrl url, Exception why) {
|
|
||||||
return CrawledDocument.builder()
|
|
||||||
.crawlerStatus("Timeout")
|
|
||||||
.crawlerStatusDesc(why.getMessage())
|
|
||||||
.timestamp(LocalDateTime.now().toString())
|
|
||||||
.url(url.toString())
|
|
||||||
.build();
|
|
||||||
}
|
|
||||||
private CrawledDocument createErrorResponse(EdgeUrl url, Response rsp, CrawlerDocumentStatus status, String why) {
|
|
||||||
return CrawledDocument.builder()
|
|
||||||
.crawlerStatus(status.toString())
|
|
||||||
.crawlerStatusDesc(why)
|
|
||||||
.headers(rsp.headers().toString())
|
|
||||||
.contentType(rsp.header("Content-type"))
|
|
||||||
.timestamp(LocalDateTime.now().toString())
|
|
||||||
.httpStatus(rsp.code())
|
|
||||||
.url(url.toString())
|
|
||||||
.build();
|
|
||||||
}
|
|
||||||
|
|
||||||
private CrawledDocument extractBody(EdgeUrl url, Response rsp) throws IOException, URISyntaxException, RateLimitException {
|
|
||||||
|
|
||||||
var responseUrl = new EdgeUrl(rsp.request().url().toString());
|
|
||||||
if (!Objects.equals(responseUrl.domain, url.domain)) {
|
|
||||||
return createRedirectResponse(url, rsp, responseUrl);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (rsp.code() == 429) {
|
|
||||||
throw new RateLimitException(rsp.header("Retry-After", "1000"));
|
|
||||||
}
|
|
||||||
|
|
||||||
var body = rsp.body();
|
|
||||||
if (null == body) {
|
|
||||||
return createErrorResponse(url, rsp, CrawlerDocumentStatus.ERROR, "No body");
|
|
||||||
}
|
|
||||||
|
|
||||||
var byteStream = body.byteStream();
|
|
||||||
|
|
||||||
if ("gzip".equals(rsp.header("Content-encoding"))) {
|
|
||||||
byteStream = new GZIPInputStream(byteStream);
|
|
||||||
}
|
|
||||||
byteStream = new BOMInputStream(byteStream);
|
|
||||||
|
|
||||||
var contentTypeHeader = rsp.header("Content-type");
|
|
||||||
if (contentTypeHeader != null && !contentTypeLogic.isAllowableContentType(contentTypeHeader)) {
|
|
||||||
return createErrorResponse(url, rsp, CrawlerDocumentStatus.BAD_CONTENT_TYPE, "");
|
|
||||||
}
|
|
||||||
|
|
||||||
byte[] data = byteStream.readNBytes(maxFetchSize);
|
|
||||||
|
|
||||||
var contentType = ContentTypeParser.parse(contentTypeHeader, data);
|
|
||||||
if (!contentTypeLogic.isAllowableContentType(contentType.contentType())) {
|
|
||||||
return createErrorResponse(url, rsp, CrawlerDocumentStatus.BAD_CONTENT_TYPE, "");
|
|
||||||
}
|
|
||||||
|
|
||||||
if ("Shift_JIS".equalsIgnoreCase(contentType.charset())) {
|
|
||||||
return createErrorResponse(url, rsp, CrawlerDocumentStatus.BAD_CHARSET, "");
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!isXRobotsTagsPermitted(rsp.headers("X-Robots-Tag"), userAgent)) {
|
|
||||||
return CrawledDocument.builder()
|
|
||||||
.crawlerStatus(CrawlerDocumentStatus.ROBOTS_TXT.name())
|
|
||||||
.crawlerStatusDesc("X-Robots-Tag")
|
|
||||||
.url(responseUrl.toString())
|
|
||||||
.httpStatus(-1)
|
|
||||||
.timestamp(LocalDateTime.now().toString())
|
|
||||||
.headers(rsp.headers().toString())
|
|
||||||
.build();
|
|
||||||
}
|
|
||||||
|
|
||||||
var strData = getStringData(data, contentType);
|
|
||||||
var canonical = rsp.header("rel=canonical", "");
|
|
||||||
|
|
||||||
return CrawledDocument.builder()
|
|
||||||
.crawlerStatus(CrawlerDocumentStatus.OK.name())
|
|
||||||
.headers(rsp.headers().toString())
|
|
||||||
.contentType(rsp.header("Content-type"))
|
|
||||||
.timestamp(LocalDateTime.now().toString())
|
|
||||||
.canonicalUrl(canonical)
|
|
||||||
.httpStatus(rsp.code())
|
|
||||||
.url(responseUrl.toString())
|
|
||||||
.documentBody(strData)
|
|
||||||
.build();
|
|
||||||
}
|
|
||||||
|
|
||||||
/** Check X-Robots-Tag header tag to see if we are allowed to index this page.
|
|
||||||
* <p>
|
|
||||||
* Reference: <a href="https://developers.google.com/search/docs/crawling-indexing/robots-meta-tag">https://developers.google.com/search/docs/crawling-indexing/robots-meta-tag</a>
|
|
||||||
*
|
|
||||||
* @param xRobotsHeaderTags List of X-Robots-Tag values
|
|
||||||
* @param userAgent User agent string
|
|
||||||
* @return true if we are allowed to index this page
|
|
||||||
*/
|
|
||||||
// Visible for tests
|
|
||||||
public static boolean isXRobotsTagsPermitted(List<String> xRobotsHeaderTags, String userAgent) {
|
|
||||||
boolean isPermittedGeneral = true;
|
|
||||||
boolean isPermittedMarginalia = false;
|
|
||||||
boolean isForbiddenMarginalia = false;
|
|
||||||
|
|
||||||
for (String header : xRobotsHeaderTags) {
|
|
||||||
if (header.indexOf(':') >= 0) {
|
|
||||||
String[] parts = StringUtils.split(header, ":", 2);
|
|
||||||
|
|
||||||
if (parts.length < 2)
|
|
||||||
continue;
|
|
||||||
|
|
||||||
// Is this relevant to us?
|
|
||||||
if (!Objects.equals(parts[0].trim(), userAgent))
|
|
||||||
continue;
|
|
||||||
|
|
||||||
if (parts[1].contains("noindex"))
|
|
||||||
isForbiddenMarginalia = true;
|
|
||||||
else if (parts[1].contains("none"))
|
|
||||||
isForbiddenMarginalia = true;
|
|
||||||
else if (parts[1].contains("all"))
|
|
||||||
isPermittedMarginalia = true;
|
|
||||||
}
|
}
|
||||||
else {
|
if (ok.statusCode() == 304) {
|
||||||
if (header.contains("noindex"))
|
return new HttpFetchResult.Result304Raw();
|
||||||
isPermittedGeneral = false;
|
}
|
||||||
if (header.contains("none"))
|
if (ok.statusCode() == 200) {
|
||||||
isPermittedGeneral = false;
|
return ok;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (isPermittedMarginalia)
|
return new HttpFetchResult.ResultNone();
|
||||||
return true;
|
|
||||||
if (isForbiddenMarginalia)
|
|
||||||
return false;
|
|
||||||
return isPermittedGeneral;
|
|
||||||
}
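A few illustrative spot-checks of the X-Robots-Tag logic above, written roughly as they might appear in a test; the user agent token and the assumption that the method is called on HttpFetcherImpl from outside its package are illustrative:

import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl;

import java.util.List;

class XRobotsTagSketch {
    public static void main(String[] args) {
        String ua = "search.marginalia.nu";  // assumed user agent token

        // No X-Robots-Tag headers at all: indexing is permitted.
        System.out.println(HttpFetcherImpl.isXRobotsTagsPermitted(List.of(), ua));                   // true

        // A blanket "noindex" forbids everyone, including us.
        System.out.println(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("noindex"), ua));           // false

        // A directive scoped to another bot is ignored.
        System.out.println(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("otherbot: noindex"), ua)); // true

        // A directive scoped to our user agent token applies.
        System.out.println(HttpFetcherImpl.isXRobotsTagsPermitted(List.of(ua + ": noindex"), ua));    // false
    }
}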
|
|
||||||
|
|
||||||
private String getStringData(byte[] data, ContentType contentType) {
|
|
||||||
Charset charset;
|
|
||||||
try {
|
|
||||||
charset = Charset.forName(contentType.charset());
|
|
||||||
}
|
|
||||||
catch (IllegalCharsetNameException ex) {
|
|
||||||
charset = StandardCharsets.UTF_8;
|
|
||||||
}
|
|
||||||
catch (UnsupportedCharsetException ex) {
|
|
||||||
// This is usually like Macintosh Latin
|
|
||||||
// (https://en.wikipedia.org/wiki/Macintosh_Latin_encoding)
|
|
||||||
//
|
|
||||||
// It's close enough to 8859-1 to serve
|
|
||||||
charset = StandardCharsets.ISO_8859_1;
|
|
||||||
}
|
|
||||||
return new String(data, charset);
|
|
||||||
}
|
|
||||||
|
|
||||||
private CrawledDocument createRedirectResponse(EdgeUrl url, Response rsp, EdgeUrl responseUrl) {
|
|
||||||
|
|
||||||
return CrawledDocument.builder()
|
|
||||||
.crawlerStatus(CrawlerDocumentStatus.REDIRECT.name())
|
|
||||||
.redirectUrl(responseUrl.toString())
|
|
||||||
.headers(rsp.headers().toString())
|
|
||||||
.contentType(rsp.header("Content-type"))
|
|
||||||
.timestamp(LocalDateTime.now().toString())
|
|
||||||
.httpStatus(rsp.code())
|
|
||||||
.url(url.toString())
|
|
||||||
.build();
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public SimpleRobotRules fetchRobotRules(EdgeDomain domain) {
|
public SimpleRobotRules fetchRobotRules(EdgeDomain domain, WarcRecorder recorder) {
|
||||||
return fetchRobotsForProto("https", domain)
|
return fetchRobotsForProto("https", recorder, domain)
|
||||||
.or(() -> fetchRobotsForProto("http", domain))
|
.or(() -> fetchRobotsForProto("http", recorder, domain))
|
||||||
.orElseGet(() -> new SimpleRobotRules(SimpleRobotRules.RobotRulesMode.ALLOW_ALL));
|
.orElseGet(() -> new SimpleRobotRules(SimpleRobotRules.RobotRulesMode.ALLOW_ALL));
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -409,21 +204,31 @@ public class HttpFetcherImpl implements HttpFetcher {
|
|||||||
return new SitemapRetriever();
|
return new SitemapRetriever();
|
||||||
}
|
}
|
||||||
|
|
||||||
private Optional<SimpleRobotRules> fetchRobotsForProto(String proto, EdgeDomain domain) {
|
private Optional<SimpleRobotRules> fetchRobotsForProto(String proto, WarcRecorder recorder, EdgeDomain domain) {
|
||||||
try {
|
try {
|
||||||
var url = new EdgeUrl(proto, domain, null, "/robots.txt", null);
|
var url = new EdgeUrl(proto, domain, null, "/robots.txt", null);
|
||||||
return Optional.of(parseRobotsTxt(fetchContent(url, ContentTags.empty())));
|
|
||||||
|
var getBuilder = new Request.Builder().get();
|
||||||
|
|
||||||
|
getBuilder.url(url.toString())
|
||||||
|
.addHeader("Accept-Encoding", "gzip")
|
||||||
|
.addHeader("User-agent", userAgent);
|
||||||
|
|
||||||
|
HttpFetchResult result = recorder.fetch(client, getBuilder.build());
|
||||||
|
|
||||||
|
return DocumentBodyExtractor.asBytes(result).mapOpt((contentType, body) ->
|
||||||
|
robotsParser.parseContent(url.toString(),
|
||||||
|
body,
|
||||||
|
contentType.toString(),
|
||||||
|
userAgent)
|
||||||
|
);
|
||||||
|
|
||||||
}
|
}
|
||||||
catch (Exception ex) {
|
catch (Exception ex) {
|
||||||
return Optional.empty();
|
return Optional.empty();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private SimpleRobotRules parseRobotsTxt(CrawledDocument doc) {
|
|
||||||
return robotsParser.parseContent(doc.url,
|
|
||||||
doc.documentBody.getBytes(),
|
|
||||||
doc.contentType,
|
|
||||||
userAgent);
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
}
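The robots.txt bytes fetched above are handed to crawler-commons' SimpleRobotRulesParser. A self-contained sketch of that parsing step and of querying the resulting rules; the robots.txt content is made up:

import crawlercommons.robots.SimpleRobotRules;
import crawlercommons.robots.SimpleRobotRulesParser;

import java.nio.charset.StandardCharsets;

class RobotsParseSketch {
    public static void main(String[] args) {
        var parser = new SimpleRobotRulesParser();

        byte[] robotsTxt = """
                User-agent: *
                Disallow: /private/
                """.getBytes(StandardCharsets.UTF_8);

        SimpleRobotRules rules = parser.parseContent(
                "https://example.com/robots.txt",   // where the file came from
                robotsTxt,
                "text/plain",
                "search.marginalia.nu");            // the robot name we match against

        // Disallowed by the wildcard group above
        System.out.println(rules.isAllowed("https://example.com/private/page.html")); // false
        System.out.println(rules.isAllowed("https://example.com/index.html"));        // true
    }
}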
|
||||||
|
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
package nu.marginalia.crawl.retreival.fetcher;
|
package nu.marginalia.crawl.retreival.fetcher.socket;
|
||||||
|
|
||||||
import javax.net.SocketFactory;
|
import javax.net.SocketFactory;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
@ -0,0 +1,31 @@
package nu.marginalia.crawl.retreival.fetcher.socket;

import okhttp3.Interceptor;
import okhttp3.Response;
import org.jetbrains.annotations.NotNull;

import java.io.IOException;


/** An interceptor that intercepts network requests and adds the remote IP address as
 * a header in the response. This is used to pass the remote IP address to the Warc
 * writer, as this information is not otherwise available in the response.
 */
public class IpInterceptingNetworkInterceptor implements Interceptor {
    private static final String pseudoHeaderName = "X-Marginalia-Remote-IP";

    @NotNull
    @Override
    public Response intercept(@NotNull Interceptor.Chain chain) throws IOException {
        String IP = chain.connection().socket().getInetAddress().getHostAddress();

        return chain.proceed(chain.request())
                .newBuilder()
                .addHeader(pseudoHeaderName, IP)
                .build();
    }

    public static String getIpFromResponse(Response response) {
        return response.header(pseudoHeaderName);
    }
}
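The interceptor only does its job once it is registered on the client, as HttpFetcherImpl does with addNetworkInterceptor. A minimal sketch of wiring it up and reading the pseudo-header back out; the URL is illustrative:

import nu.marginalia.crawl.retreival.fetcher.socket.IpInterceptingNetworkInterceptor;
import okhttp3.OkHttpClient;
import okhttp3.Request;

class IpInterceptorSketch {
    public static void main(String[] args) throws Exception {
        var client = new OkHttpClient.Builder()
                .addNetworkInterceptor(new IpInterceptingNetworkInterceptor())
                .build();

        var request = new Request.Builder().get().url("https://example.com/").build(); // illustrative URL

        try (var response = client.newCall(request).execute()) {
            // The remote IP travels alongside the response as X-Marginalia-Remote-IP
            System.out.println(IpInterceptingNetworkInterceptor.getIpFromResponse(response));
        }
    }
}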
@ -1,4 +1,4 @@
|
|||||||
package nu.marginalia.crawl.retreival.fetcher;
|
package nu.marginalia.crawl.retreival.fetcher.socket;
|
||||||
|
|
||||||
import lombok.SneakyThrows;
|
import lombok.SneakyThrows;
|
||||||
|
|
||||||
@ -8,6 +8,8 @@ import java.security.cert.X509Certificate;
|
|||||||
public class NoSecuritySSL {
|
public class NoSecuritySSL {
|
||||||
|
|
||||||
// Create a trust manager that does not validate certificate chains
|
// Create a trust manager that does not validate certificate chains
|
||||||
// We want to accept e.g. self-signed certificates and certificates
// that are not signed by a CA that is generally trusted by the system.
public static final TrustManager[] trustAllCerts = new TrustManager[]{
|
public static final TrustManager[] trustAllCerts = new TrustManager[]{
|
||||||
new X509TrustManager() {
|
new X509TrustManager() {
|
||||||
@Override
|
@Override
|
||||||
@ -27,7 +29,6 @@ public class NoSecuritySSL {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
public static SSLSocketFactory buildSocketFactory() {
|
public static SSLSocketFactory buildSocketFactory() {
|
||||||
// Install the all-trusting trust manager
|
// Install the all-trusting trust manager
|
@ -0,0 +1,33 @@
package nu.marginalia.crawl.retreival.fetcher.warc;

import org.netpreserve.jwarc.WarcDigest;

import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;

class WarcDigestBuilder {
    private final MessageDigest digest;

    private static final String digestAlgorithm = "SHA-1";

    public WarcDigestBuilder() throws NoSuchAlgorithmException {
        this.digest = MessageDigest.getInstance(digestAlgorithm);
    }

    public void update(String s) {
        byte[] bytes = s.getBytes();
        update(bytes, bytes.length);
    }

    public void update(byte[] buffer, int n) {
        update(buffer, 0, n);
    }

    public void update(byte[] buffer, int s, int n) {
        digest.update(buffer, s, n);
    }

    public WarcDigest build() {
        return new WarcDigest(digest);
    }
}
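A short sketch of how this builder is used to produce the block and payload digests recorded further down. Note that WarcDigestBuilder is package-private, so this would live in the same package; the header string and payload bytes are made up:

package nu.marginalia.crawl.retreival.fetcher.warc;

import org.netpreserve.jwarc.WarcDigest;

import java.nio.charset.StandardCharsets;
import java.security.NoSuchAlgorithmException;

class DigestSketch {
    // Digest a reconstructed header block followed by the payload bytes,
    // mirroring how WarcRecorder feeds the builder below.
    static WarcDigest digestOf(String headers, byte[] payload) throws NoSuchAlgorithmException {
        var builder = new WarcDigestBuilder();    // SHA-1 under the hood
        builder.update(headers);                  // header block first...
        builder.update(payload, payload.length);  // ...then the payload
        return builder.build();                   // a jwarc WarcDigest wrapping the SHA-1
    }

    public static void main(String[] args) throws NoSuchAlgorithmException {
        byte[] body = "<html></html>".getBytes(StandardCharsets.UTF_8);
        System.out.println(digestOf("HTTP/1.1 200 OK\r\n\r\n", body));
    }
}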
@ -0,0 +1,170 @@
|
|||||||
|
package nu.marginalia.crawl.retreival.fetcher.warc;
|
||||||
|
|
||||||
|
import okhttp3.Protocol;
|
||||||
|
import okhttp3.Request;
|
||||||
|
import okhttp3.Response;
|
||||||
|
import org.apache.commons.lang3.StringUtils;
|
||||||
|
|
||||||
|
import java.net.URI;
|
||||||
|
import java.net.URLEncoder;
|
||||||
|
import java.nio.charset.StandardCharsets;
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.StringJoiner;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
|
/** We don't have access to the raw HTTP request and response, so we need to reconstruct them
 * as best as possible from the data we have available.
 */
|
||||||
|
public class WarcProtocolReconstructor {
|
||||||
|
|
||||||
|
static String getHttpRequestString(Request request, URI uri) {
|
||||||
|
StringBuilder requestStringBuilder = new StringBuilder();
|
||||||
|
|
||||||
|
final String encodedURL = encodeURLKeepSlashes(uri.getPath());
|
||||||
|
|
||||||
|
requestStringBuilder.append(request.method()).append(" ").append(encodedURL);
|
||||||
|
|
||||||
|
if (uri.getQuery() != null) {
|
||||||
|
requestStringBuilder.append("?").append(uri.getQuery());
|
||||||
|
}
|
||||||
|
requestStringBuilder.append(" HTTP/1.1\r\n");
|
||||||
|
requestStringBuilder.append("Host: ").append(uri.getHost()).append("\r\n");
|
||||||
|
|
||||||
|
request.headers().toMultimap().forEach((k, values) -> {
|
||||||
|
for (var value : values) {
|
||||||
|
requestStringBuilder.append(capitalizeHeader(k)).append(": ").append(value).append("\r\n");
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
return requestStringBuilder.toString();
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Java's URLEncoder will URLEncode slashes, which is not desirable
|
||||||
|
* when sanitizing a URL for HTTP protocol purposes
|
||||||
|
*/
|
||||||
|
|
||||||
|
private static String encodeURLKeepSlashes(String URL) {
|
||||||
|
String[] parts = StringUtils.split(URL,"/");
|
||||||
|
StringJoiner joiner = new StringJoiner("/");
|
||||||
|
for (String part : parts) {
|
||||||
|
joiner.add(URLEncoder.encode(part, StandardCharsets.UTF_8));
|
||||||
|
}
|
||||||
|
return joiner.toString();
|
||||||
|
}
|
||||||
|
|
||||||
|
static String getResponseHeader(String headersAsString, int code) {
|
||||||
|
String version = "1.1";
|
||||||
|
|
||||||
|
String statusCode = String.valueOf(code);
|
||||||
|
String statusMessage = STATUS_CODE_MAP.getOrDefault(code, "Unknown");
|
||||||
|
|
||||||
|
String headerString = getHeadersAsString(headersAsString);
|
||||||
|
|
||||||
|
return STR."HTTP/\{version} \{statusCode} \{statusMessage}\r\n\{headerString}\r\n\r\n";
|
||||||
|
}
|
||||||
|
|
||||||
|
static String getResponseHeader(Response response) {
|
||||||
|
String version = response.protocol() == Protocol.HTTP_1_1 ? "1.1" : "2.0";
|
||||||
|
|
||||||
|
String statusCode = String.valueOf(response.code());
|
||||||
|
String statusMessage = STATUS_CODE_MAP.getOrDefault(response.code(), "Unknown");
|
||||||
|
|
||||||
|
String headerString = getHeadersAsString(response);
|
||||||
|
|
||||||
|
return STR."HTTP/\{version} \{statusCode} \{statusMessage}\r\n\{headerString}\r\n\r\n";
|
||||||
|
}
|
||||||
|
|
||||||
|
private static final Map<Integer, String> STATUS_CODE_MAP = Map.ofEntries(
|
||||||
|
Map.entry(200, "OK"),
|
||||||
|
Map.entry(201, "Created"),
|
||||||
|
Map.entry(202, "Accepted"),
|
||||||
|
Map.entry(203, "Non-Authoritative Information"),
|
||||||
|
Map.entry(204, "No Content"),
|
||||||
|
Map.entry(205, "Reset Content"),
|
||||||
|
Map.entry(206, "Partial Content"),
|
||||||
|
Map.entry(207, "Multi-Status"),
|
||||||
|
Map.entry(208, "Already Reported"),
|
||||||
|
Map.entry(226, "IM Used"),
|
||||||
|
Map.entry(300, "Multiple Choices"),
|
||||||
|
Map.entry(301, "Moved Permanently"),
|
||||||
|
Map.entry(302, "Found"),
|
||||||
|
Map.entry(303, "See Other"),
|
||||||
|
Map.entry(304, "Not Modified"),
|
||||||
|
Map.entry(307, "Temporary Redirect"),
|
||||||
|
Map.entry(308, "Permanent Redirect"),
|
||||||
|
Map.entry(400, "Bad Request"),
|
||||||
|
Map.entry(401, "Unauthorized"),
|
||||||
|
Map.entry(403, "Forbidden"),
|
||||||
|
Map.entry(404, "Not Found"),
|
||||||
|
Map.entry(405, "Method Not Allowed"),
|
||||||
|
Map.entry(406, "Not Acceptable"),
|
||||||
|
Map.entry(408, "Request Timeout"),
|
||||||
|
Map.entry(409, "Conflict"),
|
||||||
|
Map.entry(410, "Gone"),
|
||||||
|
Map.entry(411, "Length Required"),
|
||||||
|
Map.entry(412, "Precondition Failed"),
|
||||||
|
Map.entry(413, "Payload Too Large"),
|
||||||
|
Map.entry(414, "URI Too Long"),
|
||||||
|
Map.entry(415, "Unsupported Media Type"),
|
||||||
|
Map.entry(416, "Range Not Satisfiable"),
|
||||||
|
Map.entry(417, "Expectation Failed"),
|
||||||
|
Map.entry(418, "I'm a teapot"),
|
||||||
|
Map.entry(421, "Misdirected Request"),
|
||||||
|
Map.entry(426, "Upgrade Required"),
|
||||||
|
Map.entry(428, "Precondition Required"),
|
||||||
|
Map.entry(429, "Too Many Requests"),
|
||||||
|
Map.entry(431, "Request Header Fields Too Large"),
|
||||||
|
Map.entry(451, "Unavailable For Legal Reasons"),
|
||||||
|
Map.entry(500, "Internal Server Error"),
|
||||||
|
Map.entry(501, "Not Implemented"),
|
||||||
|
Map.entry(502, "Bad Gateway"),
|
||||||
|
Map.entry(503, "Service Unavailable"),
|
||||||
|
Map.entry(504, "Gateway Timeout"),
|
||||||
|
Map.entry(505, "HTTP Version Not Supported"),
|
||||||
|
Map.entry(506, "Variant Also Negotiates"),
|
||||||
|
Map.entry(507, "Insufficient Storage"),
|
||||||
|
Map.entry(508, "Loop Detected"),
|
||||||
|
Map.entry(510, "Not Extended"),
|
||||||
|
Map.entry(511, "Network Authentication Required")
|
||||||
|
);
|
||||||
|
|
||||||
|
static private String getHeadersAsString(String headersBlob) {
|
||||||
|
StringJoiner joiner = new StringJoiner("\r\n");
|
||||||
|
|
||||||
|
Arrays.stream(headersBlob.split("\n")).forEach(joiner::add);
|
||||||
|
|
||||||
|
return joiner.toString();
|
||||||
|
}
|
||||||
|
|
||||||
|
static private String getHeadersAsString(Response response) {
|
||||||
|
StringJoiner joiner = new StringJoiner("\r\n");
|
||||||
|
|
||||||
|
response.headers().toMultimap().forEach((k, values) -> {
|
||||||
|
String headerCapitalized = capitalizeHeader(k);
|
||||||
|
|
||||||
|
// Omit pseudoheaders injected by the crawler itself
|
||||||
|
if (headerCapitalized.startsWith("X-Marginalia"))
|
||||||
|
return;
|
||||||
|
|
||||||
|
// Omit Transfer-Encoding header, as we'll be using Content-Length
|
||||||
|
// instead in the warc file, despite what the server says
|
||||||
|
if (headerCapitalized.startsWith("Transfer-Encoding"))
|
||||||
|
return;
|
||||||
|
|
||||||
|
for (var value : values) {
|
||||||
|
joiner.add(headerCapitalized + ": " + value);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
return joiner.toString();
|
||||||
|
}
|
||||||
|
|
||||||
|
// okhttp gives us flattened headers, so we need to reconstruct Camel-Kebab-Case style
|
||||||
|
// for the WARC parser's sake...
|
||||||
|
static private String capitalizeHeader(String k) {
|
||||||
|
return Arrays.stream(StringUtils.split(k, '-'))
|
||||||
|
.map(StringUtils::capitalize)
|
||||||
|
.collect(Collectors.joining("-"));
|
||||||
|
}
|
||||||
|
|
||||||
|
}
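The header-capitalization step is easy to sanity-check in isolation; the sketch below mirrors the StringUtils-based code above and shows the expected input/output:

import org.apache.commons.lang3.StringUtils;

import java.util.Arrays;
import java.util.stream.Collectors;

class HeaderCaseSketch {
    // "content-type" -> "Content-Type", "x-marginalia-remote-ip" -> "X-Marginalia-Remote-Ip"
    static String capitalize(String headerName) {
        return Arrays.stream(StringUtils.split(headerName, '-'))
                .map(StringUtils::capitalize)
                .collect(Collectors.joining("-"));
    }

    public static void main(String[] args) {
        System.out.println(capitalize("content-type"));  // Content-Type
        System.out.println(capitalize("etag"));          // Etag
    }
}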
|
@ -0,0 +1,402 @@
|
|||||||
|
package nu.marginalia.crawl.retreival.fetcher.warc;
|
||||||
|
|
||||||
|
import nu.marginalia.crawl.retreival.DomainProber;
|
||||||
|
import nu.marginalia.crawling.body.HttpFetchResult;
|
||||||
|
import nu.marginalia.crawl.retreival.fetcher.socket.IpInterceptingNetworkInterceptor;
|
||||||
|
import nu.marginalia.model.EdgeDomain;
|
||||||
|
import nu.marginalia.model.EdgeUrl;
|
||||||
|
import okhttp3.OkHttpClient;
|
||||||
|
import okhttp3.Request;
|
||||||
|
import org.netpreserve.jwarc.*;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.InputStream;
|
||||||
|
import java.net.InetAddress;
|
||||||
|
import java.net.URI;
|
||||||
|
import java.net.URISyntaxException;
|
||||||
|
import java.nio.file.Files;
|
||||||
|
import java.nio.file.Path;
|
||||||
|
import java.security.NoSuchAlgorithmException;
|
||||||
|
import java.time.Instant;
|
||||||
|
import java.util.*;
|
||||||
|
|
||||||
|
/** Based on JWarc's fetch method, APL 2.0 license
 * <p></p>
 * This class wraps OkHttp's OkHttpClient and records the HTTP request and response in a WARC file,
 * as best as possible given that not all of the data is available at the same time and some of it
 * needs to be reconstructed.
 */
|
||||||
|
public class WarcRecorder implements AutoCloseable {
|
||||||
|
private static final int MAX_TIME = 30_000;
|
||||||
|
private static final int MAX_SIZE = 1024 * 1024 * 10;
|
||||||
|
private final WarcWriter writer;
|
||||||
|
private final Path warcFile;
|
||||||
|
private static final Logger logger = LoggerFactory.getLogger(WarcRecorder.class);
|
||||||
|
|
||||||
|
private final ThreadLocal<byte[]> bufferThreadLocal = ThreadLocal.withInitial(() -> new byte[MAX_SIZE]);
|
||||||
|
|
||||||
|
private boolean temporaryFile = false;
|
||||||
|
|
||||||
|
// Affix a version string in case we need to change the format in the future
|
||||||
|
// in some way
|
||||||
|
private final String warcRecorderVersion = "1.0";
|
||||||
|
|
||||||
|
// We need to know if the site uses cookies so this can be reported among the search results
|
||||||
|
// -- flip this to true if we see any cookies. This information will also be painted on any
|
||||||
|
// revisited pages. It's not 100% perfect and a bit order dependent, but it's good enough.
|
||||||
|
private final WarcXCookieInformationHeader cookieInformation = new WarcXCookieInformationHeader();
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create a new WarcRecorder that will write to the given file
|
||||||
|
*
|
||||||
|
* @param warcFile The file to write to
|
||||||
|
*/
|
||||||
|
public WarcRecorder(Path warcFile) throws IOException {
|
||||||
|
this.warcFile = warcFile;
|
||||||
|
this.writer = new WarcWriter(warcFile);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create a new WarcRecorder that will write to a temporary file
|
||||||
|
* and delete it when close() is called.
|
||||||
|
*/
|
||||||
|
public WarcRecorder() throws IOException {
|
||||||
|
this.warcFile = Files.createTempFile("warc", ".warc.gz");
|
||||||
|
this.writer = new WarcWriter(this.warcFile);
|
||||||
|
|
||||||
|
temporaryFile = true;
|
||||||
|
}
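Typical use of the recorder, roughly as HttpFetcherImpl does it below: build the GET with OkHttp, let the recorder execute it and write the request/response record pair, then branch on the result type. The file name and URL here are illustrative, and the client would normally be configured as in HttpFetcherImpl:

import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
import nu.marginalia.crawling.body.HttpFetchResult;
import okhttp3.OkHttpClient;
import okhttp3.Request;

import java.nio.file.Path;

class WarcRecorderSketch {
    public static void main(String[] args) throws Exception {
        var client = new OkHttpClient();

        try (var recorder = new WarcRecorder(Path.of("example.warc.gz"))) {  // illustrative file name
            var request = new Request.Builder()
                    .get()
                    .url("https://example.com/")                             // illustrative URL
                    .addHeader("User-agent", "search.marginalia.nu")
                    .build();

            // Executes the request and writes a response + request record pair to the WARC
            HttpFetchResult result = recorder.fetch(client, request);

            if (result instanceof HttpFetchResult.ResultOk ok) {
                System.out.println("HTTP " + ok.statusCode());
            }
        }
    }
}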
|
||||||
|
|
||||||
|
public HttpFetchResult fetch(OkHttpClient client, Request request) throws NoSuchAlgorithmException,
|
||||||
|
IOException,
|
||||||
|
URISyntaxException,
|
||||||
|
InterruptedException
|
||||||
|
{
|
||||||
|
URI requestUri = request.url().uri();
|
||||||
|
|
||||||
|
WarcDigestBuilder responseDigestBuilder = new WarcDigestBuilder();
|
||||||
|
WarcDigestBuilder payloadDigestBuilder = new WarcDigestBuilder();
|
||||||
|
|
||||||
|
String ip;
|
||||||
|
Instant date = Instant.now();
|
||||||
|
long startMillis = date.toEpochMilli();
|
||||||
|
|
||||||
|
var call = client.newCall(request);
|
||||||
|
|
||||||
|
int totalLength = 0;
|
||||||
|
|
||||||
|
WarcTruncationReason truncationReason = null;
|
||||||
|
|
||||||
|
ResponseDataBuffer responseDataBuffer = new ResponseDataBuffer();
|
||||||
|
|
||||||
|
cookieInformation.update(client, request.url());
|
||||||
|
|
||||||
|
try (var response = call.execute()) {
|
||||||
|
var body = response.body();
|
||||||
|
InputStream inputStream;
|
||||||
|
|
||||||
|
if (body == null) {
|
||||||
|
inputStream = null;
|
||||||
|
truncationReason = WarcTruncationReason.DISCONNECT;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
inputStream = body.byteStream();
|
||||||
|
}
|
||||||
|
|
||||||
|
ip = IpInterceptingNetworkInterceptor.getIpFromResponse(response);
|
||||||
|
|
||||||
|
String responseHeaders = WarcProtocolReconstructor.getResponseHeader(response);
|
||||||
|
|
||||||
|
responseDataBuffer.put(responseHeaders);
|
||||||
|
responseDataBuffer.updateDigest(responseDigestBuilder, 0, responseDataBuffer.length());
|
||||||
|
|
||||||
|
int dataStart = responseDataBuffer.pos();
|
||||||
|
|
||||||
|
while (inputStream != null) {
|
||||||
|
int remainingLength = responseDataBuffer.remaining();
|
||||||
|
if (remainingLength == 0)
|
||||||
|
break;
|
||||||
|
|
||||||
|
int startPos = responseDataBuffer.pos();
|
||||||
|
|
||||||
|
int n = responseDataBuffer.readFrom(inputStream, remainingLength);
|
||||||
|
if (n < 0)
|
||||||
|
break;
|
||||||
|
|
||||||
|
responseDataBuffer.updateDigest(responseDigestBuilder, startPos, n);
|
||||||
|
responseDataBuffer.updateDigest(payloadDigestBuilder, startPos, n);
|
||||||
|
totalLength += n;
|
||||||
|
|
||||||
|
if (MAX_TIME > 0 && System.currentTimeMillis() - startMillis > MAX_TIME) {
|
||||||
|
truncationReason = WarcTruncationReason.TIME;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
if (MAX_SIZE > 0 && totalLength >= MAX_SIZE) {
|
||||||
|
truncationReason = WarcTruncationReason.LENGTH;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// It looks like this might be the same as requestUri, but it's not;
|
||||||
|
// it's the URI after resolving redirects.
|
||||||
|
final URI responseUri = response.request().url().uri();
|
||||||
|
|
||||||
|
            WarcResponse.Builder responseBuilder = new WarcResponse.Builder(responseUri)
                    .blockDigest(responseDigestBuilder.build())
                    .date(date)
                    .body(MediaType.HTTP_RESPONSE, responseDataBuffer.copyBytes());

            cookieInformation.paint(responseBuilder);

            if (ip != null) responseBuilder.ipAddress(InetAddress.getByName(ip));

            responseBuilder.payloadDigest(payloadDigestBuilder.build());

            if (truncationReason != null)
                responseBuilder.truncated(truncationReason);

            // Build and write the response

            var warcResponse = responseBuilder.build();
            warcResponse.http(); // force HTTP header to be parsed before body is consumed so that caller can use it
            writer.write(warcResponse);

            // Build and write the request

            WarcDigestBuilder requestDigestBuilder = new WarcDigestBuilder();

            String httpRequestString = WarcProtocolReconstructor.getHttpRequestString(response.request(), requestUri);

            requestDigestBuilder.update(httpRequestString);

            WarcRequest warcRequest = new WarcRequest.Builder(requestUri)
                    .blockDigest(requestDigestBuilder.build())
                    .date(date)
                    .body(MediaType.HTTP_REQUEST, httpRequestString.getBytes())
                    .concurrentTo(warcResponse.id())
                    .build();
            warcRequest.http(); // force HTTP header to be parsed before body is consumed so that caller can use it
            writer.write(warcRequest);

            return new HttpFetchResult.ResultOk(responseUri,
                    response.code(),
                    response.headers(),
                    ip,
                    responseDataBuffer.data,
                    dataStart,
                    responseDataBuffer.length() - dataStart);
        }
        catch (Exception ex) {
            logger.warn("Failed to fetch URL {}", requestUri, ex);
            return new HttpFetchResult.ResultException(ex);
        }
    }

    public void resync(WarcRecord item) throws IOException {
        writer.write(item);
    }

    private void saveOldResponse(EdgeUrl url, String contentType, int statusCode, String documentBody) {
        try {
            WarcDigestBuilder responseDigestBuilder = new WarcDigestBuilder();
            WarcDigestBuilder payloadDigestBuilder = new WarcDigestBuilder();

            byte[] bytes = documentBody.getBytes();

            String fakeHeaders = STR."""
                    Content-Type: \{contentType}
                    Content-Length: \{bytes.length}
                    Content-Encoding: UTF-8
                    """;

            String header = WarcProtocolReconstructor.getResponseHeader(fakeHeaders, statusCode);
            ResponseDataBuffer responseDataBuffer = new ResponseDataBuffer();
            responseDataBuffer.put(header);

            responseDigestBuilder.update(header);

            responseDigestBuilder.update(bytes, bytes.length);
            payloadDigestBuilder.update(bytes, bytes.length);
            responseDataBuffer.put(bytes, 0, bytes.length);

            WarcXResponseReference.Builder builder = new WarcXResponseReference.Builder(url.asURI())
                    .blockDigest(responseDigestBuilder.build())
                    .payloadDigest(payloadDigestBuilder.build())
                    .date(Instant.now())
                    .body(MediaType.HTTP_RESPONSE, responseDataBuffer.copyBytes());

            cookieInformation.paint(builder);

            var reference = builder.build();

            reference.http(); // force HTTP header to be parsed before body is consumed so that caller can use it

            writer.write(reference);

        } catch (URISyntaxException | IOException | NoSuchAlgorithmException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Flag the given URL as skipped by the crawler, so that it will not be retried.
     * Which URLs were skipped is still important when resynchronizing on the WARC file,
     * so that the crawler can avoid re-fetching them.
     */
    public void flagAsSkipped(EdgeUrl url, String contentType, int statusCode, String documentBody) {
        saveOldResponse(url, contentType, statusCode, documentBody);
    }

    /**
     * Write a reference copy of the given document data. This is used when the crawler provides
     * an E-Tag or Last-Modified header, and the server responds with a 304 Not Modified. In this
     * scenario we want to record the data as it was in the previous crawl, but not re-fetch it.
     */
    public void writeReferenceCopy(EdgeUrl url, String contentType, int statusCode, String documentBody) {
        saveOldResponse(url, contentType, statusCode, documentBody);
    }

    public void writeWarcinfoHeader(String ip, EdgeDomain domain, DomainProber.ProbeResult result) throws IOException {

        Map<String, List<String>> fields = new HashMap<>();
        fields.put("ip", List.of(ip));
        fields.put("software", List.of(STR."search.marginalia.nu/\{warcRecorderVersion}"));
        fields.put("domain", List.of(domain.toString()));

        switch (result) {
            case DomainProber.ProbeResultRedirect redirectDomain:
                fields.put("X-WARC-Probe-Status", List.of(STR."REDIRECT;\{redirectDomain.domain()}"));
                break;
            case DomainProber.ProbeResultError error:
                fields.put("X-WARC-Probe-Status", List.of(STR."\{error.status().toString()};\{error.desc()}"));
                break;
            case DomainProber.ProbeResultOk ok:
                fields.put("X-WARC-Probe-Status", List.of("OK"));
                break;
        }

        var warcinfo = new Warcinfo.Builder()
                .date(Instant.now())
                .fields(fields)
                .recordId(UUID.randomUUID())
                .build();

        writer.write(warcinfo);
    }

    public void flagAsRobotsTxtError(EdgeUrl top) {
        try {
            WarcXEntityRefused refusal = new WarcXEntityRefused.Builder(top.asURI(), WarcXEntityRefused.documentRobotsTxtSkippedURN)
                    .date(Instant.now())
                    .build();

            writer.write(refusal);
        } catch (URISyntaxException | IOException e) {
            throw new RuntimeException(e);
        }
    }

    public void flagAsFailedContentTypeProbe(EdgeUrl url, String contentType, int status) {
        try {
            WarcXEntityRefused refusal = new WarcXEntityRefused.Builder(url.asURI(), WarcXEntityRefused.documentBadContentTypeURN)
                    .date(Instant.now())
                    .addHeader("Rejected-Content-Type", contentType)
                    .addHeader("Http-Status", Integer.toString(status))
                    .build();

            writer.write(refusal);
        } catch (URISyntaxException | IOException e) {
            throw new RuntimeException(e);
        }
    }

    public void flagAsError(EdgeUrl url, Exception ex) {
        try {
            WarcXEntityRefused refusal = new WarcXEntityRefused.Builder(url.asURI(), WarcXEntityRefused.documentUnspecifiedError)
                    .date(Instant.now())
                    .addHeader("Exception", ex.getClass().getSimpleName())
                    .addHeader("ErrorMessage", Objects.requireNonNullElse(ex.getMessage(), ""))
                    .build();

            writer.write(refusal);
        } catch (URISyntaxException | IOException e) {
            throw new RuntimeException(e);
        }
    }

    public void flagAsTimeout(EdgeUrl url) {
        try {
            WarcXEntityRefused refusal = new WarcXEntityRefused.Builder(url.asURI(), WarcXEntityRefused.documentProbeTimeout)
                    .date(Instant.now())
                    .build();

            writer.write(refusal);
        } catch (URISyntaxException | IOException e) {
            throw new RuntimeException(e);
        }
    }

    private class ResponseDataBuffer {
        private final byte[] data;
        private int length = 0;
        private int pos = 0;

        public ResponseDataBuffer() {
            data = bufferThreadLocal.get();
        }

        public int pos() {
            return pos;
        }
        public int length() {
            return length;
        }

        public void put(String s) {
            byte[] bytes = s.getBytes();
            put(bytes, 0, bytes.length);
        }

        private void put(byte[] bytes, int i, int n) {
            System.arraycopy(bytes, i, data, pos, n);
            pos += n;
            length += n;
        }

        public int readFrom(InputStream inputStream, int remainingLength) throws IOException {
            int n = inputStream.read(data, pos, remainingLength);
            if (n > 0) {
                pos += n;
                length += n;
            }
            return n;
        }

        public int remaining() {
            return MAX_SIZE - pos;
        }

        public void updateDigest(WarcDigestBuilder digestBuilder, int startPos, int n) {
            digestBuilder.update(data, startPos, n);
        }

        public byte[] copyBytes() {
            byte[] copy = new byte[length];
            System.arraycopy(data, 0, copy, 0, length);
            return copy;
        }

    }

    public void close() {
        try {
            writer.close();
            if (temporaryFile)
                Files.deleteIfExists(warcFile);
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }
}
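A minimal usage sketch of the recorder above, assembled from how the tests further down in this diff drive it; the temp file, URLs and header values are placeholders, and checked exceptions are left to the caller:

    OkHttpClient httpClient = new OkHttpClient.Builder()
            .addNetworkInterceptor(new IpInterceptingNetworkInterceptor())
            .build();

    Path warcFile = Files.createTempFile("example", ".warc.gz");  // placeholder output path

    try (var recorder = new WarcRecorder(warcFile)) {
        var request = new Request.Builder().url("https://www.marginalia.nu/")
                .addHeader("User-agent", "test.marginalia.nu")
                .addHeader("Accept-Encoding", "gzip")
                .get().build();

        // writes a paired response/request record to the WARC file and returns an HttpFetchResult
        var result = recorder.fetch(httpClient, request);

        // a previously crawled document we decide not to re-fetch is written as a reference record
        recorder.flagAsSkipped(new EdgeUrl("https://www.marginalia.nu/old/"),
                "text/html", 200, "<!doctype html><html><body>cached</body></html>");
    }

Closing the recorder flushes the underlying WARC writer; per close() above, a recorder backed by a temporary file also deletes that file on close.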
@ -0,0 +1,108 @@
package nu.marginalia.crawl.retreival.revisit;

import crawlercommons.robots.SimpleRobotRules;
import nu.marginalia.crawl.retreival.CrawlDataReference;
import nu.marginalia.crawl.retreival.CrawlDelayTimer;
import nu.marginalia.crawl.retreival.CrawlerRetreiver;
import nu.marginalia.crawl.retreival.DomainCrawlFrontier;
import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
import nu.marginalia.crawling.model.CrawledDocument;
import nu.marginalia.model.EdgeUrl;
import org.jsoup.Jsoup;

/** This class encapsulates the logic for re-visiting a domain that has already been crawled.
 * We may use information from the previous crawl to inform the next crawl, specifically the
 * E-Tag and Last-Modified headers.
 */
public class CrawlerRevisitor {
    private final DomainCrawlFrontier crawlFrontier;
    private final CrawlerRetreiver crawlerRetreiver;
    private final WarcRecorder warcRecorder;

    public CrawlerRevisitor(DomainCrawlFrontier crawlFrontier,
                            CrawlerRetreiver crawlerRetreiver,
                            WarcRecorder warcRecorder) {
        this.crawlFrontier = crawlFrontier;
        this.crawlerRetreiver = crawlerRetreiver;
        this.warcRecorder = warcRecorder;
    }

    /** Performs a re-crawl of old documents, comparing etags and last-modified */
    public int recrawl(CrawlDataReference oldCrawlData,
                       SimpleRobotRules robotsRules,
                       CrawlDelayTimer delayTimer)
            throws InterruptedException {
        int recrawled = 0;
        int retained = 0;

        for (;;) {
            CrawledDocument doc = oldCrawlData.nextDocument();

            if (doc == null) {
                break;
            }

            // This Shouldn't Happen (TM)
            var urlMaybe = EdgeUrl.parse(doc.url);
            if (urlMaybe.isEmpty()) continue;
            var url = urlMaybe.get();

            // If we've previously 404:d on this URL, we'll refrain from trying to fetch it again
            if (doc.httpStatus == 404) {
                crawlFrontier.addVisited(url);
                continue;
            }

            if (doc.httpStatus != 200) continue;

            if (!robotsRules.isAllowed(url.toString())) {
                warcRecorder.flagAsRobotsTxtError(url);
                continue;
            }
            if (!crawlFrontier.filterLink(url))
                continue;
            if (!crawlFrontier.addVisited(url))
                continue;


            if (recrawled > 5
                    && retained > 0.9 * recrawled
                    && Math.random() < 0.9)
            {
                // Since it looks like most of these documents haven't changed,
                // we'll load the documents directly; but we do this in a random
                // fashion to make sure we eventually catch changes over time
                // and ensure we discover new links

                crawlFrontier.addVisited(url);

                // Hoover up any links from the document
                if (doc.httpStatus == 200 && doc.documentBody != null) {
                    var parsedDoc = Jsoup.parse(doc.documentBody);
                    crawlFrontier.enqueueLinksFromDocument(url, parsedDoc);
                }

                // Add a WARC record so we don't repeat this
                warcRecorder.flagAsSkipped(url, doc.contentType, doc.httpStatus, doc.documentBody);

                continue;
            }


            // GET the document with the stored document as a reference
            // providing etag and last-modified headers, so we can recycle the
            // document if it hasn't changed without actually downloading it

            var reference = new DocumentWithReference(doc, oldCrawlData);
            var result = crawlerRetreiver.fetchWriteAndSleep(url, delayTimer, reference);

            if (reference.isSame(result)) {
                retained++;
            }

            recrawled++;
        }

        return recrawled;
    }
}
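A sketch of how the revisitor might be wired into the crawl, assuming the frontier, retriever, recorder, robots rules and delay timer come from the surrounding crawl setup, and that oldCrawlStream is a data stream over the previous crawl of the same domain:

    var revisitor = new CrawlerRevisitor(crawlFrontier, crawlerRetreiver, warcRecorder);

    // recrawled counts the previously seen documents the revisitor attempted to refresh;
    // when most documents look unchanged, the bulk of them are flagged as skipped in the
    // WARC instead of being re-fetched
    int recrawled = revisitor.recrawl(new CrawlDataReference(oldCrawlStream), robotsRules, delayTimer);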
@ -0,0 +1,77 @@
package nu.marginalia.crawl.retreival.revisit;

import nu.marginalia.crawl.retreival.CrawlDataReference;
import nu.marginalia.crawl.retreival.fetcher.ContentTags;
import nu.marginalia.crawling.body.DocumentBodyExtractor;
import nu.marginalia.crawling.body.DocumentBodyResult;
import nu.marginalia.crawling.body.HttpFetchResult;
import nu.marginalia.crawling.model.CrawledDocument;

import javax.annotation.Nullable;

public record DocumentWithReference(
        @Nullable CrawledDocument doc,
        @Nullable CrawlDataReference reference) {

    private static final DocumentWithReference emptyInstance = new DocumentWithReference(null, null);

    public static DocumentWithReference empty() {
        return emptyInstance;
    }

    /** Returns true if the provided document is the same as the reference document,
     * or if the result was retained via HTTP 304.
     */
    public boolean isSame(HttpFetchResult result) {
        if (result instanceof HttpFetchResult.Result304Raw)
            return true;
        if (result instanceof HttpFetchResult.Result304ReplacedWithReference)
            return true;

        if (!(result instanceof HttpFetchResult.ResultOk resultOk))
            return false;

        if (reference == null)
            return false;
        if (doc == null)
            return false;
        if (doc.documentBody == null)
            return false;

        if (!(DocumentBodyExtractor.asString(resultOk) instanceof DocumentBodyResult.Ok<String> bodyOk)) {
            return false;
        }

        return reference.isContentBodySame(doc.documentBody, bodyOk.body());
    }

    public ContentTags getContentTags() {
        if (null == doc)
            return ContentTags.empty();

        String headers = doc.headers;
        if (headers == null)
            return ContentTags.empty();

        String[] headersLines = headers.split("\n");

        String lastmod = null;
        String etag = null;

        for (String line : headersLines) {
            if (line.toLowerCase().startsWith("etag:")) {
                etag = line.substring(5).trim();
            }
            if (line.toLowerCase().startsWith("last-modified:")) {
                lastmod = line.substring(14).trim();
            }
        }

        return new ContentTags(etag, lastmod);
    }

    public boolean isEmpty() {
        return doc == null || reference == null;
    }

}
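To make the validator handling concrete, a small illustration of getContentTags(); the header values are invented, the builder call is trimmed to what the sketch needs, and oldCrawlData stands in for a CrawlDataReference over the previous crawl:

    var doc = CrawledDocument.builder()
            .url("https://www.marginalia.nu/some/page")   // placeholder URL
            .httpStatus(200)
            .build();
    doc.headers = "ETag: \"4f3a\"\nLast-Modified: Wed, 01 Nov 2023 10:00:00 GMT";

    ContentTags tags = new DocumentWithReference(doc, oldCrawlData).getContentTags();
    // tags now carries the etag "\"4f3a\"" and the last-modified date, which the fetcher
    // can send back to the server so it may answer 304 Not Modified instead of resending
    // an unchanged body.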
@ -0,0 +1,71 @@
package nu.marginalia.crawl.retreival.sitemap;

import crawlercommons.robots.SimpleRobotRules;
import nu.marginalia.crawl.retreival.DomainCrawlFrontier;
import nu.marginalia.crawl.retreival.fetcher.SitemapRetriever;
import nu.marginalia.model.EdgeUrl;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

public class SitemapFetcher {

    private final DomainCrawlFrontier crawlFrontier;
    private final SitemapRetriever sitemapRetriever;
    private static final Logger logger = LoggerFactory.getLogger(SitemapFetcher.class);

    public SitemapFetcher(DomainCrawlFrontier crawlFrontier, SitemapRetriever sitemapRetriever) {
        this.crawlFrontier = crawlFrontier;
        this.sitemapRetriever = sitemapRetriever;
    }

    public void downloadSitemaps(SimpleRobotRules robotsRules, EdgeUrl rootUrl) {
        List<String> sitemaps = robotsRules.getSitemaps();

        List<EdgeUrl> urls = new ArrayList<>(sitemaps.size());
        if (!sitemaps.isEmpty()) {
            for (var url : sitemaps) {
                EdgeUrl.parse(url).ifPresent(urls::add);
            }
        }
        else {
            urls.add(rootUrl.withPathAndParam("/sitemap.xml", null));
        }

        downloadSitemaps(urls);
    }

    public void downloadSitemaps(List<EdgeUrl> urls) {

        Set<String> checkedSitemaps = new HashSet<>();

        for (var url : urls) {
            // Let's not download sitemaps from other domains for now
            if (!crawlFrontier.isSameDomain(url)) {
                continue;
            }

            if (checkedSitemaps.contains(url.path))
                continue;

            var sitemap = sitemapRetriever.fetchSitemap(url);
            if (sitemap.isEmpty()) {
                continue;
            }

            // ensure we don't try to download this sitemap again
            // (don't move this up, as we may want to check the same
            // path with different protocols until we find one that works)

            checkedSitemaps.add(url.path);

            crawlFrontier.addAllToQueue(sitemap);
        }

        logger.debug("Queue is now {}", crawlFrontier.queueSize());
    }
}
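A short usage sketch for the fetcher above; the frontier, sitemap retriever and robots rules are assumed to come from the crawler's per-domain setup, the URL is a placeholder, and exception handling is elided:

    var sitemapFetcher = new SitemapFetcher(crawlFrontier, sitemapRetriever);

    // uses the sitemaps advertised in robots.txt, or falls back to /sitemap.xml,
    // and feeds any same-domain URLs it discovers into the crawl frontier's queue
    sitemapFetcher.downloadSitemaps(robotsRules, new EdgeUrl("https://www.marginalia.nu/"));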
@ -0,0 +1,88 @@
package nu.marginalia.crawl.retreival;

import nu.marginalia.crawl.retreival.fetcher.socket.IpInterceptingNetworkInterceptor;
import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
import okhttp3.OkHttpClient;
import okhttp3.Request;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.netpreserve.jwarc.WarcReader;
import org.netpreserve.jwarc.WarcRequest;
import org.netpreserve.jwarc.WarcResponse;

import java.io.IOException;
import java.net.URISyntaxException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.security.NoSuchAlgorithmException;
import java.util.List;
import java.util.zip.GZIPInputStream;

import static org.junit.jupiter.api.Assertions.*;

class CrawlerWarcResynchronizerTest {
    Path fileName;
    Path outputFile;
    OkHttpClient httpClient;
    @BeforeEach
    public void setUp() throws Exception {
        httpClient = new OkHttpClient.Builder()
                .addNetworkInterceptor(new IpInterceptingNetworkInterceptor())
                .build();

        fileName = Files.createTempFile("test", ".warc.gz");
        outputFile = Files.createTempFile("test", ".warc.gz");
    }

    @AfterEach
    public void tearDown() throws Exception {
        Files.deleteIfExists(fileName);
        Files.deleteIfExists(outputFile);
    }

    @Test
    void run() throws IOException, URISyntaxException {
        try (var oldRecorder = new WarcRecorder(fileName)) {
            fetchUrl(oldRecorder, "https://www.marginalia.nu/");
            fetchUrl(oldRecorder, "https://www.marginalia.nu/log/");
            fetchUrl(oldRecorder, "https://www.marginalia.nu/feed/");
        } catch (Exception e) {
            fail(e);
        }

        var crawlFrontier = new DomainCrawlFrontier(new EdgeDomain("www.marginalia.nu"), List.of(), 100);

        try (var newRecorder = new WarcRecorder(outputFile)) {
            new CrawlerWarcResynchronizer(crawlFrontier, newRecorder).run(fileName);
        }

        assertTrue(crawlFrontier.isVisited(new EdgeUrl("https://www.marginalia.nu/")));
        assertTrue(crawlFrontier.isVisited(new EdgeUrl("https://www.marginalia.nu/log/")));
        assertTrue(crawlFrontier.isVisited(new EdgeUrl("https://www.marginalia.nu/feed/")));

        try (var warcReader = new WarcReader(outputFile)) {
            for (var item : warcReader) {
                if (item instanceof WarcRequest req) {
                    System.out.println("req:" + req.target());
                }
                if (item instanceof WarcResponse rsp) {
                    System.out.println("req:" + rsp.target());
                }
            }
        }

        new GZIPInputStream(Files.newInputStream(outputFile)).transferTo(System.out);
    }

    void fetchUrl(WarcRecorder recorder, String url) throws NoSuchAlgorithmException, IOException, URISyntaxException, InterruptedException {
        var req = new Request.Builder().url(url)
                .addHeader("User-agent", "test.marginalia.nu")
                .addHeader("Accept-Encoding", "gzip")
                .get().build();
        recorder.fetch(httpClient, req);
    }
}
@ -0,0 +1,59 @@
package nu.marginalia.crawl.retreival.fetcher;

import nu.marginalia.crawl.retreival.fetcher.ContentTypeProber.ContentTypeProbeResult.BadContentType;
import nu.marginalia.crawl.retreival.fetcher.ContentTypeProber.ContentTypeProbeResult.Ok;
import nu.marginalia.model.EdgeUrl;
import okhttp3.ConnectionPool;
import okhttp3.Dispatcher;
import okhttp3.OkHttpClient;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;

import java.net.URISyntaxException;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;

import static org.junit.jupiter.api.Assertions.*;

class ContentTypeProberTest {

    ContentTypeProber prober;

    @BeforeEach
    void setUp() {
        OkHttpClient client = new OkHttpClient.Builder()
                .dispatcher(new Dispatcher(Executors.newVirtualThreadPerTaskExecutor()))
                .connectionPool(new ConnectionPool(0, 1, TimeUnit.NANOSECONDS))
                .build();

        prober = new ContentTypeProber("test.marginalia.nu", client);
    }

    @Test
    void probeContentType() throws URISyntaxException {
        assertEquals(
                new Ok(new EdgeUrl("https://www.marginalia.nu/robots.txt")),
                prober.probeContentType(new EdgeUrl("https://www.marginalia.nu/robots.txt")),
                "robots.txt is expected to pass the probing test since it's text/plain"
        );

        assertEquals(
                new BadContentType("image/png", 200),
                prober.probeContentType(new EdgeUrl("https://www.marginalia.nu/sanic.png")),
                "sanic.png is expected to pass the probing test since it's image/png"
        );

        assertEquals(
                new Ok(new EdgeUrl("https://www.marginalia.nu/dev/null")),
                prober.probeContentType(new EdgeUrl("https://www.marginalia.nu/dev/null")),
                "Despite being a 404, we expect this to be passed as OK as it's NotMyJob(TM) to verify response codes"
        );

        assertEquals(
                new Ok(new EdgeUrl("https://www.marginalia.nu/projects/edge/about.gmi/")),
                prober.probeContentType(new EdgeUrl("https://www.marginalia.nu/projects/edge/about.gmi")),
                "about.gmi is expected to give a redirect to about.gmi/ which is served as text/html"
        );

    }
}
@ -1,5 +1,6 @@
 package nu.marginalia.crawl.retreival.fetcher;
 
+import nu.marginalia.crawling.parquet.CrawledDocumentParquetRecordFileWriter;
 import org.junit.jupiter.api.Test;
 
 import java.util.List;
@ -7,30 +8,30 @@ import java.util.List;
 import static org.junit.jupiter.api.Assertions.assertFalse;
 import static org.junit.jupiter.api.Assertions.assertTrue;
 
-class HttpFetcherImplTest {
+class CrawledDocumentParquetRecordFileWriterTest {
 
     @Test
     public void testXRobotsTag() {
-        assertTrue(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("foo:"), "search.marginalia.nu"));
-        assertTrue(HttpFetcherImpl.isXRobotsTagsPermitted(List.of(":bar"), "search.marginalia.nu"));
-        assertTrue(HttpFetcherImpl.isXRobotsTagsPermitted(List.of(":"), "search.marginalia.nu"));
-        assertTrue(HttpFetcherImpl.isXRobotsTagsPermitted(List.of(""), "search.marginalia.nu"));
+        assertTrue(CrawledDocumentParquetRecordFileWriter.isXRobotsTagsPermitted(List.of("foo:"), "search.marginalia.nu"));
+        assertTrue(CrawledDocumentParquetRecordFileWriter.isXRobotsTagsPermitted(List.of(":bar"), "search.marginalia.nu"));
+        assertTrue(CrawledDocumentParquetRecordFileWriter.isXRobotsTagsPermitted(List.of(":"), "search.marginalia.nu"));
+        assertTrue(CrawledDocumentParquetRecordFileWriter.isXRobotsTagsPermitted(List.of(""), "search.marginalia.nu"));
 
-        assertTrue(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("doindex"), "search.marginalia.nu"));
-        assertFalse(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("noindex"), "search.marginalia.nu"));
-        assertFalse(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("none"), "search.marginalia.nu"));
-        assertFalse(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("search.marginalia.nu: noindex"), "search.marginalia.nu"));
-        assertFalse(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("search.marginalia.nu: none"), "search.marginalia.nu"));
-        assertTrue(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("googlebot: noindex"), "search.marginalia.nu"));
+        assertTrue(CrawledDocumentParquetRecordFileWriter.isXRobotsTagsPermitted(List.of("doindex"), "search.marginalia.nu"));
+        assertFalse(CrawledDocumentParquetRecordFileWriter.isXRobotsTagsPermitted(List.of("noindex"), "search.marginalia.nu"));
+        assertFalse(CrawledDocumentParquetRecordFileWriter.isXRobotsTagsPermitted(List.of("none"), "search.marginalia.nu"));
+        assertFalse(CrawledDocumentParquetRecordFileWriter.isXRobotsTagsPermitted(List.of("search.marginalia.nu: noindex"), "search.marginalia.nu"));
+        assertFalse(CrawledDocumentParquetRecordFileWriter.isXRobotsTagsPermitted(List.of("search.marginalia.nu: none"), "search.marginalia.nu"));
+        assertTrue(CrawledDocumentParquetRecordFileWriter.isXRobotsTagsPermitted(List.of("googlebot: noindex"), "search.marginalia.nu"));
 
-        assertTrue(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("noindex", "search.marginalia.nu: all"), "search.marginalia.nu"));
-        assertTrue(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("none", "search.marginalia.nu: all"), "search.marginalia.nu"));
-        assertFalse(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("none", "search.marginalia.nu: none"), "search.marginalia.nu"));
-        assertFalse(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("all", "search.marginalia.nu: none"), "search.marginalia.nu"));
-        assertTrue(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("search.marginalia.nu: all", "noindex"), "search.marginalia.nu"));
-        assertTrue(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("search.marginalia.nu: all", "none"), "search.marginalia.nu"));
-        assertFalse(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("search.marginalia.nu: none", "none"), "search.marginalia.nu"));
-        assertFalse(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("search.marginalia.nu: none", "all"), "search.marginalia.nu"));
+        assertTrue(CrawledDocumentParquetRecordFileWriter.isXRobotsTagsPermitted(List.of("noindex", "search.marginalia.nu: all"), "search.marginalia.nu"));
+        assertTrue(CrawledDocumentParquetRecordFileWriter.isXRobotsTagsPermitted(List.of("none", "search.marginalia.nu: all"), "search.marginalia.nu"));
+        assertFalse(CrawledDocumentParquetRecordFileWriter.isXRobotsTagsPermitted(List.of("none", "search.marginalia.nu: none"), "search.marginalia.nu"));
+        assertFalse(CrawledDocumentParquetRecordFileWriter.isXRobotsTagsPermitted(List.of("all", "search.marginalia.nu: none"), "search.marginalia.nu"));
+        assertTrue(CrawledDocumentParquetRecordFileWriter.isXRobotsTagsPermitted(List.of("search.marginalia.nu: all", "noindex"), "search.marginalia.nu"));
+        assertTrue(CrawledDocumentParquetRecordFileWriter.isXRobotsTagsPermitted(List.of("search.marginalia.nu: all", "none"), "search.marginalia.nu"));
+        assertFalse(CrawledDocumentParquetRecordFileWriter.isXRobotsTagsPermitted(List.of("search.marginalia.nu: none", "none"), "search.marginalia.nu"));
+        assertFalse(CrawledDocumentParquetRecordFileWriter.isXRobotsTagsPermitted(List.of("search.marginalia.nu: none", "all"), "search.marginalia.nu"));
     }
 
 }
@ -0,0 +1,147 @@
package nu.marginalia.crawl.retreival.fetcher;

import nu.marginalia.UserAgent;
import nu.marginalia.crawl.retreival.fetcher.socket.IpInterceptingNetworkInterceptor;
import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
import nu.marginalia.crawling.parquet.CrawledDocumentParquetRecordFileReader;
import nu.marginalia.crawling.parquet.CrawledDocumentParquetRecordFileWriter;
import nu.marginalia.model.EdgeUrl;
import okhttp3.OkHttpClient;
import okhttp3.Request;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.netpreserve.jwarc.*;

import java.io.IOException;
import java.net.URISyntaxException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.security.NoSuchAlgorithmException;
import java.util.HashMap;
import java.util.Map;

import static org.junit.jupiter.api.Assertions.assertEquals;

class WarcRecorderTest {
    Path fileNameWarc;
    Path fileNameParquet;
    WarcRecorder client;
    OkHttpClient httpClient;
    @BeforeEach
    public void setUp() throws Exception {
        httpClient = new OkHttpClient.Builder()
                .addNetworkInterceptor(new IpInterceptingNetworkInterceptor())
                .build();

        fileNameWarc = Files.createTempFile("test", ".warc");
        fileNameParquet = Files.createTempFile("test", ".parquet");

        client = new WarcRecorder(fileNameWarc);
    }

    @AfterEach
    public void tearDown() throws Exception {
        client.close();
        Files.delete(fileNameWarc);
    }

    @Test
    void fetch() throws NoSuchAlgorithmException, IOException, URISyntaxException, InterruptedException {
        client.fetch(httpClient, new Request.Builder().url("https://www.marginalia.nu/")
                .addHeader("User-agent", "test.marginalia.nu")
                .addHeader("Accept-Encoding", "gzip")
                .get().build());

        Map<String, String> sampleData = new HashMap<>();
        try (var warcReader = new WarcReader(fileNameWarc)) {
            warcReader.forEach(record -> {
                if (record instanceof WarcRequest req) {
                    sampleData.put(record.type(), req.target());
                }
                if (record instanceof WarcResponse rsp) {
                    sampleData.put(record.type(), rsp.target());
                }
            });
        }

        assertEquals("https://www.marginalia.nu/", sampleData.get("request"));
        assertEquals("https://www.marginalia.nu/", sampleData.get("response"));
    }

    @Test
    public void flagAsSkipped() throws IOException, URISyntaxException {

        try (var recorder = new WarcRecorder(fileNameWarc)) {
            recorder.flagAsSkipped(new EdgeUrl("https://www.marginalia.nu/"),
                    "text/html",
                    200,
                    "<?doctype html><html><body>test</body></html>");
        }

        try (var reader = new WarcReader(fileNameWarc)) {
            for (var record : reader) {
                if (record instanceof WarcResponse rsp) {
                    assertEquals("https://www.marginalia.nu/", rsp.target());
                    assertEquals("text/html", rsp.contentType().type());
                    assertEquals(200, rsp.http().status());
                    assertEquals("1", rsp.http().headers().first("X-Cookies").orElse(null));
                }
            }
        }
    }

    @Test
    public void testSaveImport() throws URISyntaxException, IOException {
        try (var recorder = new WarcRecorder(fileNameWarc)) {
            recorder.flagAsSkipped(new EdgeUrl("https://www.marginalia.nu/"),
                    "text/html",
                    200,
                    "<?doctype html><html><body>test</body></html>");
        }

        try (var reader = new WarcReader(fileNameWarc)) {
            WarcXResponseReference.register(reader);

            for (var record : reader) {
                System.out.println(record.type());
                System.out.println(record.getClass().getSimpleName());
                if (record instanceof WarcXResponseReference rsp) {
                    assertEquals("https://www.marginalia.nu/", rsp.target());
                }
            }
        }

    }

    @Test
    public void testConvertToParquet() throws NoSuchAlgorithmException, IOException, URISyntaxException, InterruptedException {
        client.fetch(httpClient, new Request.Builder().url("https://www.marginalia.nu/")
                .addHeader("User-agent", "test.marginalia.nu")
                .addHeader("Accept-Encoding", "gzip")
                .get().build());
        client.fetch(httpClient, new Request.Builder().url("https://www.marginalia.nu/log/")
                .addHeader("User-agent", "test.marginalia.nu")
                .addHeader("Accept-Encoding", "gzip")
                .get().build());
        client.fetch(httpClient, new Request.Builder().url("https://www.marginalia.nu/sanic.png")
                .addHeader("User-agent", "test.marginalia.nu")
                .addHeader("Accept-Encoding", "gzip")
                .get().build());
        client.close();

        CrawledDocumentParquetRecordFileWriter.convertWarc(
                "www.marginalia.nu",
                new UserAgent("test"),
                fileNameWarc,
                fileNameParquet);

        var urls = CrawledDocumentParquetRecordFileReader.stream(fileNameParquet).map(doc -> doc.url).toList();
        assertEquals(3, urls.size());
        assertEquals("https://www.marginalia.nu/", urls.get(0));
        assertEquals("https://www.marginalia.nu/log/", urls.get(1));
        assertEquals("https://www.marginalia.nu/sanic.png", urls.get(2));

    }

}
@ -4,11 +4,15 @@ import lombok.SneakyThrows;
 import nu.marginalia.crawl.retreival.RateLimitException;
 import nu.marginalia.crawl.retreival.fetcher.ContentTags;
 import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl;
-import nu.marginalia.crawl.retreival.logic.ContentTypeLogic;
+import nu.marginalia.crawling.body.DocumentBodyExtractor;
+import nu.marginalia.crawling.body.DocumentBodyResult;
+import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
+import nu.marginalia.crawling.body.ContentTypeLogic;
 import nu.marginalia.model.EdgeUrl;
 import org.junit.jupiter.api.Assertions;
 import org.junit.jupiter.api.Test;
 
+import java.io.IOException;
 import java.net.URISyntaxException;
 
 class HttpFetcherTest {
@ -28,16 +32,25 @@ class HttpFetcherTest {
     }
 
     @Test
-    void fetchUTF8() throws URISyntaxException, RateLimitException {
+    void fetchUTF8() throws URISyntaxException, RateLimitException, IOException {
         var fetcher = new HttpFetcherImpl("nu.marginalia.edge-crawler");
-        var str = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu"), ContentTags.empty());
-        System.out.println(str.contentType);
+        try (var recorder = new WarcRecorder()) {
+            var result = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu"), recorder, ContentTags.empty());
+            if (DocumentBodyExtractor.asString(result) instanceof DocumentBodyResult.Ok bodyOk) {
+                System.out.println(bodyOk.contentType());
+            }
+        }
     }
 
     @Test
-    void fetchText() throws URISyntaxException, RateLimitException {
+    void fetchText() throws URISyntaxException, RateLimitException, IOException {
         var fetcher = new HttpFetcherImpl("nu.marginalia.edge-crawler");
-        var str = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu/robots.txt"), ContentTags.empty());
-        System.out.println(str);
+        try (var recorder = new WarcRecorder()) {
+            var result = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu/robots.txt"), recorder, ContentTags.empty());
+            if (DocumentBodyExtractor.asString(result) instanceof DocumentBodyResult.Ok bodyOk) {
+                System.out.println(bodyOk.contentType());
            }
+        }
     }
 }
@ -5,6 +5,8 @@ import lombok.SneakyThrows;
 import nu.marginalia.crawl.retreival.CrawlerRetreiver;
 import nu.marginalia.crawl.retreival.DomainProber;
 import nu.marginalia.crawl.retreival.fetcher.*;
+import nu.marginalia.crawling.body.HttpFetchResult;
+import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
 import nu.marginalia.crawling.model.CrawledDocument;
 import nu.marginalia.crawling.model.CrawlerDocumentStatus;
 import nu.marginalia.crawling.model.SerializableCrawlData;
@ -12,17 +14,16 @@ import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.crawlspec.CrawlSpecRecord;
 import nu.marginalia.test.CommonTestData;
+import okhttp3.Headers;
 import org.junit.jupiter.api.AfterEach;
 import org.junit.jupiter.api.Test;
 import org.mockito.Mockito;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import java.io.IOException;
 import java.net.URISyntaxException;
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
+import java.util.*;
 
 public class CrawlerMockFetcherTest {
 
@ -61,44 +62,42 @@ public class CrawlerMockFetcherTest {
 
     }
 
+    void crawl(CrawlSpecRecord spec) throws IOException {
+        try (var recorder = new WarcRecorder()) {
+            new CrawlerRetreiver(fetcherMock, new DomainProber(d -> true), spec, recorder)
+                    .fetch();
+        }
+    }

     @Test
-    public void testLemmy() throws URISyntaxException {
+    public void testLemmy() throws URISyntaxException, IOException {
         List<SerializableCrawlData> out = new ArrayList<>();
 
         registerUrlClasspathData(new EdgeUrl("https://startrek.website/"), "mock-crawl-data/lemmy/index.html");
         registerUrlClasspathData(new EdgeUrl("https://startrek.website/c/startrek"), "mock-crawl-data/lemmy/c_startrek.html");
         registerUrlClasspathData(new EdgeUrl("https://startrek.website/post/108995"), "mock-crawl-data/lemmy/108995.html");
 
-        new CrawlerRetreiver(fetcherMock, new DomainProber(d -> true), new CrawlSpecRecord("startrek.website", 10, new ArrayList<>()), out::add)
-                .fetch();
-
-        out.forEach(System.out::println);
+        crawl(new CrawlSpecRecord("startrek.website", 10, new ArrayList<>()));
     }
 
     @Test
-    public void testMediawiki() throws URISyntaxException {
+    public void testMediawiki() throws URISyntaxException, IOException {
         List<SerializableCrawlData> out = new ArrayList<>();
 
         registerUrlClasspathData(new EdgeUrl("https://en.wikipedia.org/"), "mock-crawl-data/mediawiki/index.html");
 
-        new CrawlerRetreiver(fetcherMock, new DomainProber(d -> true), new CrawlSpecRecord("en.wikipedia.org", 10, new ArrayList<>()), out::add)
-                .fetch();
-
-        out.forEach(System.out::println);
+        crawl(new CrawlSpecRecord("en.wikipedia.org", 10, new ArrayList<>()));
     }
 
     @Test
-    public void testDiscourse() throws URISyntaxException {
+    public void testDiscourse() throws URISyntaxException, IOException {
         List<SerializableCrawlData> out = new ArrayList<>();
 
         registerUrlClasspathData(new EdgeUrl("https://community.tt-rss.org/"), "mock-crawl-data/discourse/index.html");
         registerUrlClasspathData(new EdgeUrl("https://community.tt-rss.org/t/telegram-channel-to-idle-on/3501"), "mock-crawl-data/discourse/telegram.html");
         registerUrlClasspathData(new EdgeUrl("https://community.tt-rss.org/t/combined-mode-but-grid/4489"), "mock-crawl-data/discourse/grid.html");
 
-        new CrawlerRetreiver(fetcherMock, new DomainProber(d -> true), new CrawlSpecRecord("community.tt-rss.org", 100, new ArrayList<>()), out::add)
-                .fetch();
-
-        out.forEach(System.out::println);
+        crawl(new CrawlSpecRecord("community.tt-rss.org", 10, new ArrayList<>()));
     }
 
     class MockFetcher implements HttpFetcher {
 
@ -118,25 +117,28 @@ public class CrawlerMockFetcherTest {
             return new FetchResult(FetchResultState.OK, url);
         }
 
+        @SneakyThrows
         @Override
-        public CrawledDocument fetchContent(EdgeUrl url, ContentTags tags) {
+        public HttpFetchResult fetchContent(EdgeUrl url, WarcRecorder recorder, ContentTags tags) {
             logger.info("Fetching {}", url);
             if (mockData.containsKey(url)) {
-                return mockData.get(url);
-            }
-            else {
-                return CrawledDocument.builder()
-                        .crawlId("1")
-                        .url(url.toString())
-                        .contentType("text/html")
-                        .httpStatus(404)
-                        .crawlerStatus(CrawlerDocumentStatus.ERROR.name())
-                        .build();
+                byte[] bodyBytes = mockData.get(url).documentBody.getBytes();
+                return new HttpFetchResult.ResultOk(
+                        url.asURI(),
+                        200,
+                        new Headers.Builder().build(),
+                        "127.0.0.1",
+                        bodyBytes,
+                        0,
+                        bodyBytes.length
+                );
             }
+
+            return new HttpFetchResult.ResultNone();
         }
 
         @Override
-        public SimpleRobotRules fetchRobotRules(EdgeDomain domain) {
+        public SimpleRobotRules fetchRobotRules(EdgeDomain domain, WarcRecorder recorder) {
             return new SimpleRobotRules();
         }
 
@ -144,5 +146,6 @@ public class CrawlerMockFetcherTest {
         public SitemapRetriever createSitemapRetriever() {
             return Mockito.mock(SitemapRetriever.class);
         }
+
     }
 }
@ -8,6 +8,7 @@ import nu.marginalia.crawl.retreival.CrawlerRetreiver;
|
|||||||
import nu.marginalia.crawl.retreival.DomainProber;
|
import nu.marginalia.crawl.retreival.DomainProber;
|
||||||
import nu.marginalia.crawl.retreival.fetcher.HttpFetcher;
|
import nu.marginalia.crawl.retreival.fetcher.HttpFetcher;
|
||||||
import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl;
|
import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl;
|
||||||
|
import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
|
||||||
import nu.marginalia.crawling.io.CrawledDomainReader;
|
import nu.marginalia.crawling.io.CrawledDomainReader;
|
||||||
import nu.marginalia.crawling.io.CrawledDomainWriter;
|
import nu.marginalia.crawling.io.CrawledDomainWriter;
|
||||||
import nu.marginalia.crawling.model.CrawledDocument;
|
import nu.marginalia.crawling.model.CrawledDocument;
|
||||||
@ -15,22 +16,24 @@ import nu.marginalia.crawling.model.CrawledDomain;
|
|||||||
import nu.marginalia.crawling.model.SerializableCrawlData;
|
import nu.marginalia.crawling.model.SerializableCrawlData;
|
||||||
import nu.marginalia.model.crawlspec.CrawlSpecRecord;
|
import nu.marginalia.model.crawlspec.CrawlSpecRecord;
|
||||||
import org.junit.jupiter.api.*;
|
import org.junit.jupiter.api.*;
|
||||||
|
import org.netpreserve.jwarc.*;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.nio.file.Files;
|
import java.nio.file.Files;
|
||||||
import java.nio.file.Path;
|
import java.nio.file.Path;
|
||||||
import java.util.ArrayList;
|
import java.util.*;
|
||||||
import java.util.HashMap;
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.Map;
|
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
import java.util.zip.GZIPInputStream;
|
||||||
|
|
||||||
|
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||||
|
|
||||||
@Tag("slow")
|
@Tag("slow")
|
||||||
class CrawlerRetreiverTest {
|
class CrawlerRetreiverTest {
|
||||||
private HttpFetcher httpFetcher;
|
private HttpFetcher httpFetcher;
|
||||||
|
|
||||||
|
Path tempFile;
|
||||||
|
Path tempFile2;
|
||||||
@BeforeEach
|
@BeforeEach
|
||||||
public void setUp() {
|
public void setUp() {
|
||||||
httpFetcher = new HttpFetcherImpl("search.marginalia.nu; testing a bit :D");
|
httpFetcher = new HttpFetcherImpl("search.marginalia.nu; testing a bit :D");
|
||||||
@ -43,8 +46,62 @@ class CrawlerRetreiverTest {
|
|||||||
System.setProperty("http.agent", WmsaHome.getUserAgent().uaString());
|
System.setProperty("http.agent", WmsaHome.getUserAgent().uaString());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@AfterEach
|
||||||
|
public void tearDown() throws IOException {
|
||||||
|
if (tempFile != null) {
|
||||||
|
Files.deleteIfExists(tempFile);
|
||||||
|
}
|
||||||
|
if (tempFile2 != null) {
|
||||||
|
Files.deleteIfExists(tempFile2);
|
||||||
|
}
|
||||||
|
}
|
||||||
@Test
|
@Test
|
||||||
public void testWithKnownDomains() {
|
public void testWarcOutput() throws IOException {
|
||||||
|
var specs = CrawlSpecRecord
|
||||||
|
.builder()
|
||||||
|
.crawlDepth(5)
|
||||||
|
.domain("www.marginalia.nu")
|
||||||
|
.urls(List.of("https://www.marginalia.nu/misc/debian-laptop-install-log/"))
|
||||||
|
.build();
|
||||||
|
Path tempFile = null;
|
||||||
|
try {
|
||||||
|
tempFile = Files.createTempFile("crawling-process", "warc");
|
||||||
|
|
||||||
|
try (var recorder = new WarcRecorder(tempFile)) {
|
||||||
|
new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder).fetch();
|
||||||
|
} catch (IOException ex) {
|
||||||
|
Assertions.fail(ex);
|
||||||
|
}
|
||||||
|
|
||||||
|
Set<String> requests = new HashSet<>();
|
||||||
|
Set<String> responses = new HashSet<>();
|
||||||
|
|
||||||
|
try (var reader = new WarcReader(tempFile)) {
|
||||||
|
reader.forEach(record -> {
|
||||||
|
if (record instanceof WarcRequest req) {
|
||||||
|
requests.add(req.target());
|
||||||
|
System.out.println(req.type() + ":" + req.target());
|
||||||
|
}
|
||||||
|
else if (record instanceof WarcResponse rsp) {
|
||||||
|
responses.add(rsp.target());
|
||||||
|
System.out.println(rsp.type() + ":" + rsp.target());
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
System.out.println(record.type());
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
assertTrue(requests.contains("https://www.marginalia.nu/misc/debian-laptop-install-log/"));
|
||||||
|
assertEquals(requests, responses);
|
||||||
|
}
|
||||||
|
finally {
|
||||||
|
if (tempFile != null)
|
||||||
|
Files.deleteIfExists(tempFile);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
@Test
|
||||||
|
public void testWithKnownDomains() throws IOException {
|
||||||
var specs = CrawlSpecRecord
|
var specs = CrawlSpecRecord
|
||||||
.builder()
|
.builder()
|
||||||
.crawlDepth(5)
|
.crawlDepth(5)
|
||||||
@ -54,10 +111,30 @@ class CrawlerRetreiverTest {
|
|||||||
|
|
||||||
List<SerializableCrawlData> data = new ArrayList<>();
|
List<SerializableCrawlData> data = new ArrayList<>();
|
||||||
|
|
||||||
new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, data::add).fetch();
|
tempFile = Files.createTempFile("crawling-process", ".warc");
|
||||||
|
|
||||||
|
try (var recorder = new WarcRecorder(tempFile)) {
|
||||||
|
new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder).fetch();
|
||||||
|
}
|
||||||
|
catch (IOException ex) {
|
||||||
|
Assertions.fail(ex);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
try (var stream = CrawledDomainReader.createDataStream(tempFile)) {
|
||||||
|
while (stream.hasNext()) {
|
||||||
|
if (stream.next() instanceof CrawledDocument doc) {
|
||||||
|
data.add(doc);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} catch (Exception e) {
|
||||||
|
throw new RuntimeException(e);
|
||||||
|
}
|
||||||
|
|
||||||
var fetchedUrls =
|
var fetchedUrls =
|
||||||
data.stream().filter(CrawledDocument.class::isInstance)
|
data.stream()
|
||||||
|
.peek(System.out::println)
|
||||||
|
.filter(CrawledDocument.class::isInstance)
|
||||||
.map(CrawledDocument.class::cast)
|
.map(CrawledDocument.class::cast)
|
||||||
.map(doc -> doc.url)
|
.map(doc -> doc.url)
|
||||||
.collect(Collectors.toSet());
|
.collect(Collectors.toSet());
|
||||||
@ -72,7 +149,7 @@ class CrawlerRetreiverTest {
|
|||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testEmptySet() {
|
public void testEmptySet() throws IOException {
|
||||||
|
|
||||||
var specs = CrawlSpecRecord
|
var specs = CrawlSpecRecord
|
||||||
.builder()
|
.builder()
|
||||||
@ -81,9 +158,29 @@ class CrawlerRetreiverTest {
|
|||||||
.urls(List.of())
|
.urls(List.of())
|
||||||
.build();
|
.build();
|
||||||
|
|
||||||
|
|
||||||
List<SerializableCrawlData> data = new ArrayList<>();
|
List<SerializableCrawlData> data = new ArrayList<>();
|
||||||
|
|
||||||
new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, data::add).fetch();
|
tempFile = Files.createTempFile("crawling-process", ".warc");
|
||||||
|
|
||||||
|
try (var recorder = new WarcRecorder(tempFile)) {
|
||||||
|
new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder).fetch();
|
||||||
|
}
|
||||||
|
catch (IOException ex) {
|
||||||
|
Assertions.fail(ex);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
try (var stream = CrawledDomainReader.createDataStream(tempFile)) {
|
||||||
|
while (stream.hasNext()) {
|
||||||
|
if (stream.next() instanceof CrawledDocument doc) {
|
||||||
|
data.add(doc);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} catch (Exception e) {
|
||||||
|
throw new RuntimeException(e);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
data.stream().filter(CrawledDocument.class::isInstance)
|
data.stream().filter(CrawledDocument.class::isInstance)
|
||||||
.map(CrawledDocument.class::cast)
|
.map(CrawledDocument.class::cast)
|
||||||
@ -115,33 +212,70 @@ class CrawlerRetreiverTest {
                .build();

-        Path out = Files.createTempDirectory("crawling-process");
-        var writer = new CrawledDomainWriter(out, specs.domain, "idid");
+        tempFile = Files.createTempFile("crawling-process", ".warc.gz");
+        tempFile2 = Files.createTempFile("crawling-process", ".warc.gz");

        Map<Class<? extends SerializableCrawlData>, List<SerializableCrawlData>> data = new HashMap<>();

-        new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, d -> {
-            data.computeIfAbsent(d.getClass(), k->new ArrayList<>()).add(d);
-            if (d instanceof CrawledDocument doc) {
-                System.out.println(doc.url + ": " + doc.recrawlState + "\t" + doc.httpStatus);
-                if (Math.random() > 0.5) {
-                    doc.headers = "";
-                }
-            }
-            writer.accept(d);
-        }).fetch();
-        writer.close();
+        try (var recorder = new WarcRecorder(tempFile)) {
+            new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder).fetch();
+        }
+        catch (IOException ex) {
+            Assertions.fail(ex);
+        }

-        var reader = new CrawledDomainReader();
-        var stream = reader.createDataStream(out, specs.domain, "idid");
+        try (var stream = CrawledDomainReader.createDataStream(tempFile)) {
+            while (stream.hasNext()) {
+                var doc = stream.next();
+                data.computeIfAbsent(doc.getClass(), c -> new ArrayList<>()).add(doc);
+            }
+        } catch (Exception e) {
+            throw new RuntimeException(e);
+        }
+        var stream = CrawledDomainReader.createDataStream(tempFile);

        CrawledDomain domain = (CrawledDomain) data.get(CrawledDomain.class).get(0);
        domain.doc = data.get(CrawledDocument.class).stream().map(CrawledDocument.class::cast).collect(Collectors.toList());
+        try (var recorder = new WarcRecorder(tempFile2)) {
+            new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder).fetch(new DomainLinks(),
+                    new CrawlDataReference(stream));
+        }
+        catch (IOException ex) {
+            Assertions.fail(ex);
+        }

-        new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, d -> {
-            if (d instanceof CrawledDocument doc) {
-                System.out.println(doc.url + ": " + doc.recrawlState + "\t" + doc.httpStatus);
-            }
-        }).fetch(new DomainLinks(), new CrawlDataReference(stream));
+        new GZIPInputStream(Files.newInputStream(tempFile2)).transferTo(System.out);
+
+        try (var reader = new WarcReader(tempFile2)) {
+            WarcXResponseReference.register(reader);
+
+            reader.forEach(record -> {
+                if (record instanceof WarcResponse rsp) {
+                    try {
+                        System.out.println(rsp.type() + ":" + rsp.target() + "/" + rsp.http().status());
+                    } catch (IOException e) {
+                        throw new RuntimeException(e);
+                    }
+                }
+                if (record instanceof WarcMetadata rsp) {
+                    System.out.println("meta:" + rsp.target());
+                }
+            });
+        }
+
+        try (var ds = CrawledDomainReader.createDataStream(tempFile2)) {
+            while (ds.hasNext()) {
+                var doc = ds.next();
+                if (doc instanceof CrawledDomain dr) {
+                    System.out.println(dr.domain + "/" + dr.crawlerStatus);
+                }
+                else if (doc instanceof CrawledDocument dc) {
+                    System.out.println(dc.url + "/" + dc.crawlerStatus + "/" + dc.httpStatus);
+                }
+            }
+        } catch (Exception e) {
+            throw new RuntimeException(e);
+        }
    }
}
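
Reviewer note: the test above exercises the new WARC-backed flow end to end: crawl output is recorded through a WarcRecorder, read back as a crawl-data stream, reused as a CrawlDataReference for the recrawl pass, and finally inspected record by record with jwarc's WarcReader. As a point of reference, a minimal, self-contained sketch of that last inspection step might look like the following; the input path is hypothetical and only jwarc classes already used in the test are assumed to be on the classpath:

    import org.netpreserve.jwarc.WarcReader;
    import org.netpreserve.jwarc.WarcResponse;

    import java.io.IOException;
    import java.nio.file.Path;

    class WarcDump {
        public static void main(String[] args) throws IOException {
            Path warcFile = Path.of("crawl-data.warc.gz");  // hypothetical input file

            // WarcReader iterates over every record in the WARC file; response records
            // carry the original target URL and the fetched HTTP status.
            try (var reader = new WarcReader(warcFile)) {
                for (var record : reader) {
                    if (record instanceof WarcResponse rsp) {
                        System.out.println(rsp.target() + " -> " + rsp.http().status());
                    }
                }
            }
        }
    }

WarcReader transparently handles gzip-compressed WARCs, which is why the test can point it directly at the .warc.gz temp file.
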
@ -32,6 +32,7 @@ public class ConvertActor extends RecordActorPrototype {
    public record Convert(FileStorageId fid) implements ActorStep {};
    public record ConvertEncyclopedia(String source, String baseUrl) implements ActorStep {};
    public record ConvertDirtree(String source) implements ActorStep {};
+    public record ConvertWarc(String source) implements ActorStep {};
    public record ConvertStackexchange(String source) implements ActorStep {};
    @Resume(behavior = ActorResumeBehavior.RETRY)
    public record ConvertWait(FileStorageId destFid,
@ -74,6 +75,25 @@ public class ConvertActor extends RecordActorPrototype {
                        mqConverterOutbox.sendAsync(ConvertRequest.forDirtree(sourcePath, processedArea.id()))
                );
            }
+            case ConvertWarc(String source) -> {
+                Path sourcePath = Path.of(source);
+                if (!Files.exists(sourcePath))
+                    yield new Error("Source path does not exist: " + sourcePath);
+
+                String fileName = sourcePath.toFile().getName();
+
+                var base = storageService.getStorageBase(FileStorageBaseType.STORAGE);
+                var processedArea = storageService.allocateTemporaryStorage(base,
+                        FileStorageType.PROCESSED_DATA, "processed-data",
+                        "Processed Warc Data; " + fileName);
+
+                storageService.setFileStorageState(processedArea.id(), FileStorageState.NEW);
+
+                yield new ConvertWait(
+                        processedArea.id(),
+                        mqConverterOutbox.sendAsync(ConvertRequest.forWarc(sourcePath, processedArea.id()))
+                );
+            }
            case ConvertEncyclopedia(String source, String baseUrl) -> {

                Path sourcePath = Path.of(source);
@ -63,8 +63,6 @@ public class ExportAtagsActor extends RecordActorPrototype {

                Path inputDir = storageService.getStorage(crawlId).asPath();

-                var reader = new CrawledDomainReader();
-
                try (var bw = new BufferedWriter(new OutputStreamWriter(new GZIPOutputStream(Files.newOutputStream(tmpFile, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING))));
                )
                {
@ -78,7 +76,7 @@ public class ExportAtagsActor extends RecordActorPrototype {
                    }

                    Path crawlDataPath = inputDir.resolve(item.relPath());
-                    try (var stream = reader.createDataStream(crawlDataPath)) {
+                    try (var stream = CrawledDomainReader.createDataStream(crawlDataPath)) {
                        exportLinks(tagWriter, stream);
                    }
                    catch (Exception ex) {
@ -170,6 +170,7 @@ public class IndexQueryService extends IndexApiImplBase {
        }
    }
+

    // GRPC endpoint
    @SneakyThrows
    public void query(nu.marginalia.index.api.RpcIndexQuery request,
@ -29,13 +29,11 @@ public class CrawlDataUnfcker {
            return;
        }

-        var reader = new CrawledDomainReader();
-
        try (var wl = new WorkLog(output.resolve("crawler.log"))) {
            for (var inputItem : WorkLog.iterable(input.resolve("crawler.log"))) {
                Path inputPath = input.resolve(inputItem.relPath());

-                var domainMaybe = readDomain(reader, inputPath).map(CrawledDomain::getDomain);
+                var domainMaybe = readDomain(inputPath).map(CrawledDomain::getDomain);
                if (domainMaybe.isEmpty())
                    continue;
                var domain = domainMaybe.get();
@ -43,7 +41,7 @@ public class CrawlDataUnfcker {
                // Generate conformant ID
                String newId = Integer.toHexString(domain.hashCode());

-                var outputPath = CrawlerOutputFile.createOutputPath(output, newId, domain);
+                var outputPath = CrawlerOutputFile.createLegacyOutputPath(output, newId, domain);
                var outputFileName = outputPath.toFile().getName();

                System.out.println(inputPath + " -> " + outputPath);
@ -56,13 +54,13 @@ public class CrawlDataUnfcker {
        }
    }

-    static Optional<CrawledDomain> readDomain(CrawledDomainReader reader, Path file) {
+    static Optional<CrawledDomain> readDomain(Path file) {
        if (!Files.exists(file)) {
            System.out.println("Missing file " + file);
            return Optional.empty();
        }

-        try (var stream = reader.createDataStream(file)) {
+        try (var stream = CrawledDomainReader.createDataStream(file)) {
            while (stream.hasNext()) {
                if (stream.next() instanceof CrawledDomain domain) {
                    return Optional.of(domain);
@ -50,10 +50,9 @@ public class ExperimentRunnerMain {
        experiment.args(Arrays.copyOfRange(args, 2, args.length));

        Path basePath = Path.of(args[0]);
-        var reader = new CrawledDomainReader();
        for (var item : WorkLog.iterable(basePath.resolve("crawler.log"))) {
            Path crawlDataPath = basePath.resolve(item.relPath());
-            try (var stream = reader.createDataStream(crawlDataPath)) {
+            try (var stream = CrawledDomainReader.createDataStream(crawlDataPath)) {
                experiment.process(stream);
            }
            catch (Exception ex) {
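
The same refactor pattern repeats in ExportAtagsActor, CrawlDataUnfcker and ExperimentRunnerMain above: CrawledDomainReader is no longer instantiated, and crawl data is opened through a static createDataStream(Path) factory keyed only on the file path. A minimal sketch of the resulting call pattern; the reader's package is an assumption (it is not visible in this diff) and the input path is hypothetical:

    import nu.marginalia.crawling.io.CrawledDomainReader;   // assumed package
    import nu.marginalia.crawling.model.CrawledDocument;
    import nu.marginalia.crawling.model.CrawledDomain;

    import java.nio.file.Path;

    class ReadCrawlData {
        public static void main(String[] args) {
            Path crawlDataPath = Path.of("crawl-data.warc.gz");  // hypothetical input file

            // Each record in the stream is either the domain summary or an individual document.
            try (var stream = CrawledDomainReader.createDataStream(crawlDataPath)) {
                while (stream.hasNext()) {
                    var next = stream.next();
                    if (next instanceof CrawledDomain domain) {
                        System.out.println("domain: " + domain.getDomain());
                    }
                    else if (next instanceof CrawledDocument doc) {
                        System.out.println("doc: " + doc.url + " (" + doc.httpStatus + ")");
                    }
                }
            }
            catch (Exception ex) {
                ex.printStackTrace();
            }
        }
    }
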
@ -5,12 +5,12 @@ import nu.marginalia.crawling.model.CrawledDocument;
import nu.marginalia.crawling.model.CrawledDomain;

import java.io.IOException;
-import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.List;

public abstract class LegacyExperiment extends Experiment {
    public abstract boolean process(CrawledDomain domain);

    @Override
    public boolean process(SerializableCrawlDataStream dataStream) throws IOException {
        List<CrawledDocument> documentList = new ArrayList<>();
@ -41,6 +41,7 @@ include 'code:features-convert:topic-detection'

include 'code:features-crawl:crawl-blocklist'
include 'code:features-crawl:link-parser'
+include 'code:features-crawl:content-type'

include 'code:features-index:index-journal'
include 'code:features-index:index-query'
@ -154,6 +155,8 @@ dependencyResolutionManagement {
        library('duckdb', 'org.duckdb', 'duckdb_jdbc').version('0.9.1')
        library('okhttp3','com.squareup.okhttp3','okhttp').version('4.11.0')

+        library('jwarc', 'org.netpreserve', 'jwarc').version('0.28.5')
+
        library('httpcomponents.core','org.apache.httpcomponents','httpcore').version('4.4.15')
        library('httpcomponents.client','org.apache.httpcomponents','httpclient').version('4.5.13')
        library('commons.net', 'commons-net','commons-net').version('3.9.0')
@ -13,6 +13,7 @@ import org.apache.parquet.io.DelegatingSeekableInputStream;
import org.apache.parquet.io.InputFile;
import org.apache.parquet.io.SeekableInputStream;
import org.apache.parquet.io.api.GroupConverter;
+import org.apache.parquet.schema.LogicalTypeAnnotation;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.PrimitiveType;

@ -144,7 +145,11 @@ public final class ParquetReader<U, S> implements Spliterator<S>, Closeable {
            case BINARY:
            case FIXED_LEN_BYTE_ARRAY:
            case INT96:
-                return primitiveType.stringifier().stringify(columnReader.getBinary());
+                if (primitiveType.getLogicalTypeAnnotation() == null) {
+                    return columnReader.getBinary().getBytes();
+                } else {
+                    return primitiveType.stringifier().stringify(columnReader.getBinary());
+                }
            case BOOLEAN:
                return columnReader.getBoolean();
            case DOUBLE:
@ -242,7 +242,7 @@ public final class ParquetWriter<T> implements Closeable {
                if (type.getLogicalTypeAnnotation() == LogicalTypeAnnotation.stringType()) {
                    recordConsumer.addBinary(Binary.fromString((String)value));
                } else {
-                    throw new UnsupportedOperationException("We don't support writing logical annotation type " + type.getLogicalTypeAnnotation());
+                    recordConsumer.addBinary(Binary.fromConstantByteArray((byte[])value));
                }
                break;
            default:
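
The last two hunks loosen the parquet layer so that BINARY values without a logical type annotation round-trip as raw byte[]: the reader hands the bytes back instead of stringifying them, and the writer wraps them instead of throwing. The diff does not state the motivation, but presumably this lets binary payloads be stored verbatim. A minimal sketch of the intended round-trip, using only org.apache.parquet.io.api.Binary:

    import org.apache.parquet.io.api.Binary;

    import java.util.Arrays;

    class BinaryRoundTrip {
        public static void main(String[] args) {
            byte[] raw = { 0x1f, (byte) 0x8b, 0x08 };            // arbitrary bytes, e.g. a gzip header

            Binary written = Binary.fromConstantByteArray(raw);   // what the writer now emits for byte[]
            byte[] readBack = written.getBytes();                 // what the reader now returns when no
                                                                  // logical type annotation is present

            System.out.println(Arrays.equals(raw, readBack));     // true - the bytes survive unchanged
        }
    }
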