Merge pull request #62 from MarginaliaSearch/warc

(WIP) Use WARCs in the crawler
Viktor 2023-12-16 16:02:46 +01:00 committed by GitHub
commit 8bbb533c9a
82 changed files with 4209 additions and 1107 deletions

View File

@ -4,5 +4,6 @@ public enum ConvertAction {
ConvertCrawlData,
SideloadEncyclopedia,
SideloadDirtree,
SideloadWarc,
SideloadStackexchange
}

View File

@ -38,6 +38,13 @@ public class ConvertRequest {
destId,
null);
}
public static ConvertRequest forWarc(Path sourcePath, FileStorageId destId) {
return new ConvertRequest(ConvertAction.SideloadWarc,
sourcePath.toString(),
null,
destId,
null);
}
public static ConvertRequest forStackexchange(Path sourcePath, FileStorageId destId) {
return new ConvertRequest(ConvertAction.SideloadStackexchange,

View File

@ -224,12 +224,19 @@ public class EdgeUrl implements Serializable {
}
public URL asURL() throws MalformedURLException {
int port = this.port != null ? this.port : switch(proto) {
case "http" -> 80;
case "https" -> 443;
default -> 0;
};
try {
return asURI().toURL();
}
catch (URISyntaxException e) {
throw new MalformedURLException(e.getMessage());
}
}
return new URL(this.proto, this.domain.toString(), port, this.path);
public URI asURI() throws URISyntaxException {
if (port != null) {
return new URI(this.proto, null, this.domain.toString(), this.port, this.path, this.param, null);
}
return new URI(this.proto, this.domain.toString(), this.path, this.param, null);
}
}
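The new asURL() now delegates to asURI(), so the URL is built via java.net.URI rather than the deprecated multi-argument URL constructor; the multi-argument URI constructors also percent-encode characters that are illegal in their component. A self-contained sketch of the same mechanism in plain JDK terms (the host and query values are made up for illustration):

import java.net.MalformedURLException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;

class UriToUrlSketch {
    public static void main(String[] args) throws URISyntaxException, MalformedURLException {
        // Five-argument form: (scheme, authority, path, query, fragment).
        // Illegal characters such as the space in the query are quoted.
        URI uri = new URI("https", "www.example.com", "/search", "q=hello world", null);
        URL url = uri.toURL();
        System.out.println(url); // https://www.example.com/search?q=hello%20world
    }
}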

View File

@ -0,0 +1,29 @@
plugins {
id 'java'
id 'jvm-test-suite'
}
java {
toolchain {
languageVersion.set(JavaLanguageVersion.of(21))
}
}
dependencies {
implementation project(':code:common:model')
implementation libs.crawlercommons
implementation libs.notnull
implementation libs.bundles.gson
implementation libs.bundles.slf4j
testImplementation libs.bundles.slf4j.test
implementation libs.jsoup
implementation libs.commons.lang3
testImplementation libs.bundles.slf4j.test
testImplementation libs.bundles.junit
testImplementation libs.mockito
}

View File

@ -0,0 +1,28 @@
package nu.marginalia.contenttype;
import org.apache.commons.lang3.StringUtils;
/** Content type and charset of a document
* @param contentType The content type, e.g. "text/html"
* @param charset The charset, e.g. "UTF-8"
*/
public record ContentType(String contentType, String charset) {
public static ContentType parse(String contentTypeHeader) {
String[] parts = StringUtils.split(contentTypeHeader, ";", 2);
String contentType = parts[0].trim();
String charset = parts.length > 1 ? parts[1].trim() : "UTF-8";
return new ContentType(contentType, charset);
}
public boolean is(String contentType) {
return this.contentType.equalsIgnoreCase(contentType);
}
public String toString() {
if (charset == null || charset.isBlank())
return contentType;
return STR."\{contentType}; charset=\{charset}";
}
}
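A quick illustration of how the record behaves (a sketch; the header values are made up):

ContentType html = ContentType.parse("text/html; charset=ISO-8859-1"); // contentType() -> "text/html"
ContentType json = ContentType.parse("application/json");              // no parameter: charset defaults to "UTF-8"

System.out.println(json);                 // application/json; charset=UTF-8
System.out.println(html.is("TEXT/HTML")); // true, the comparison is case-insensitive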

View File

@ -1,7 +1,8 @@
package nu.marginalia.crawl.retreival.logic;
package nu.marginalia.contenttype;
import crawlercommons.mimetypes.MimeTypeDetector;
import nu.marginalia.crawling.model.ContentType;
import org.jetbrains.annotations.NotNull;
import org.jetbrains.annotations.Nullable;
import org.jsoup.Jsoup;
import java.util.Arrays;
@ -11,28 +12,40 @@ public class ContentTypeParser {
static final MimeTypeDetector mimeTypeDetector = new MimeTypeDetector();
public static ContentType parse(String contentType, byte[] data) {
return getContentTypeFromContentTypeString(contentType)
.or(() -> getContentTypeStringFromTag(data))
/** Parse the content type and charset from a content type header and/or the body of a document,
* on a best-effort basis.
*/
public static ContentType parseContentType(
@Nullable String contentTypeHeader,
@NotNull byte[] body)
{
return getContentTypeFromContentTypeString(contentTypeHeader)
.or(() -> getContentTypeStringFromTag(body))
.orElseGet(() -> {
Optional<String> charset = getCharsetFromTag(data);
Optional<String> charset = getCharsetFromTag(body);
return new ContentType(
Optional.ofNullable(contentType)
.or(() -> Optional.ofNullable(mimeTypeDetector.detect(data)))
.orElseGet(() -> ContentTypeParser.shittyMimeSniffer(data)), charset.orElse("ISO_8859_1"));
Optional.ofNullable(contentTypeHeader)
.or(() -> Optional.ofNullable(mimeTypeDetector.detect(body)))
.orElseGet(() -> ContentTypeParser.shittyMimeSniffer(body)), charset.orElse("ISO_8859_1"));
});
}
private static Optional<ContentType> getContentTypeFromContentTypeString(String contentType) {
if (contentType != null && contentType.contains(";")) {
var parts = contentType.split(";");
var content = parts[0].trim();
var extra = parts[1].trim();
if (extra.startsWith("charset=")) {
return Optional.of(new ContentType(content, extra.substring("charset=".length())));
}
}
return Optional.empty();
/** Parse the content type and charset from a content type string; returns empty unless a charset is explicitly specified. */
private static Optional<ContentType> getContentTypeFromContentTypeString(@Nullable String contentType) {
if (contentType == null)
return Optional.empty();
if (!contentType.contains(";"))
return Optional.empty();
var parts = contentType.split(";");
var content = parts[0].trim();
var extra = parts[1].trim();
if (!extra.startsWith("charset="))
return Optional.empty();
return Optional.of(new ContentType(content, extra.substring("charset=".length())));
}
private static String shittyMimeSniffer(byte[] data) {
@ -45,6 +58,7 @@ public class ContentTypeParser {
String startStr = new String(Arrays.copyOf(data, Math.min(128, data.length))).trim().toLowerCase();
if (startStr.contains("<!doctype html") || startStr.contains("<html")) {
// note we use contains here, since xhtml may be served with a <?xml-style header first
return "text/html";
}
else {

View File

@ -0,0 +1,27 @@
package nu.marginalia.contenttype;
import java.nio.charset.*;
public class DocumentBodyToString {
/** Get the string data from a document body, given the content type and charset */
public static String getStringData(ContentType type, byte[] data) {
Charset charset;
try {
charset = Charset.forName(type.charset());
}
catch (IllegalCharsetNameException ex) {
// Fall back to UTF-8 if we don't understand what this is. It's *probably* fine? Maybe?
charset = StandardCharsets.UTF_8;
}
catch (UnsupportedCharsetException ex) {
// This is usually like Macintosh Latin
// (https://en.wikipedia.org/wiki/Macintosh_Latin_encoding)
//
// It's close enough to 8859-1 to serve
charset = StandardCharsets.ISO_8859_1;
}
return new String(data, charset);
}
}

View File

@ -0,0 +1,50 @@
package nu.marginalia.contenttype;
import org.junit.jupiter.api.Test;
import java.nio.charset.StandardCharsets;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNotNull;
public class ContentTypeParserTest {
@Test
public void testParseContentTypeWithHeader() {
byte[] body = "<!DOCTYPE html><html><head><title>Title</title></head><body></body></html>".getBytes(StandardCharsets.UTF_8);
String contentTypeHeader = "text/html; charset=UTF-8";
ContentType result = ContentTypeParser.parseContentType(contentTypeHeader, body);
assertNotNull(result);
assertEquals("text/html", result.contentType());
assertEquals("UTF-8", result.charset());
}
@Test
public void testParseContentTypeWithMetaCharset() {
byte[] body = "<!DOCTYPE html><html><head><meta charset=\"UTF-8\"><title>Title</title></head><body></body></html>".getBytes(StandardCharsets.UTF_8);
ContentType result = ContentTypeParser.parseContentType(null, body);
assertNotNull(result);
assertEquals("text/html", result.contentType());
assertEquals("UTF-8", result.charset());
}
@Test
public void testParseContentTypeWithHeaderValueAbsent() {
byte[] body = "Some random text.".getBytes(StandardCharsets.UTF_8);
String contentTypeHeader = "text/plain";
ContentType result = ContentTypeParser.parseContentType(contentTypeHeader, body);
assertNotNull(result);
assertEquals("text/plain", result.contentType());
assertEquals("ISO_8859_1", result.charset());
}
@Test
public void testParseContentTypeWithBinaryData() {
byte[] body = new byte[128];
body[0] = 31; // ascii value less than 32
ContentType result = ContentTypeParser.parseContentType(null, body);
assertNotNull(result);
assertEquals("application/binary", result.contentType());
assertEquals("ISO_8859_1", result.charset());
}
}

View File

@ -0,0 +1,48 @@
package nu.marginalia.contenttype;
import org.junit.jupiter.api.Test;
import static org.junit.jupiter.api.Assertions.*;
import java.nio.charset.StandardCharsets;
public class DocumentBodyToStringTest {
@Test
public void testGetStringData_onUTF8(){
ContentType type = new ContentType("text/html", "UTF-8");
String expected = "Hello, World!";
byte[] data = expected.getBytes(StandardCharsets.UTF_8);
String result = DocumentBodyToString.getStringData(type, data);
assertEquals(expected, result, "Result should match the expected string");
}
@Test
public void testGetStringData_onIllegalCharsetName(){
ContentType type = new ContentType("text/html", "unsupportedname");
String expected = "Hello, World!";
byte[] data = expected.getBytes(StandardCharsets.UTF_8);
String result = DocumentBodyToString.getStringData(type, data);
assertEquals(expected, result, "Result should match the expected string if charset is illegal name");
}
@Test
public void testGetStringData_onUnsupportedCharset(){
ContentType type = new ContentType("text/html", "Macintosh");
String expected = "Hello, World!";
byte[] data = expected.getBytes(StandardCharsets.UTF_8);
String result = DocumentBodyToString.getStringData(type, data);
assertEquals(expected, result, "Result should fall back to UTF-8 parsing if charset is unsupported");
}
}

View File

@ -37,7 +37,9 @@ public class GeoIpDictionary {
throw new RuntimeException(e);
}
finally {
this.notifyAll();
synchronized (this) {
this.notifyAll();
}
}
});
}
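The change wraps notifyAll() in a synchronized block because Object.notifyAll() may only be called by a thread that holds the object's monitor; called outside a synchronized block it throws IllegalMonitorStateException. A minimal sketch of the pattern, with illustrative names rather than the actual GeoIpDictionary internals:

class Loader {
    private volatile boolean ready = false;

    void finishLoading() {
        ready = true;
        synchronized (this) {
            this.notifyAll();   // must hold the monitor, otherwise IllegalMonitorStateException
        }
    }

    void awaitReady() throws InterruptedException {
        synchronized (this) {
            while (!ready) {    // loop guards against spurious wakeups
                this.wait();
            }
        }
    }
}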

View File

@ -15,18 +15,28 @@ java {
dependencies {
implementation project(':code:common:model')
implementation project(':code:common:db')
implementation project(':code:common:config')
implementation project(':code:common:process')
implementation project(':code:libraries:big-string')
implementation project(':code:api:index-api')
implementation project(':code:common:service-discovery')
implementation project(':code:common:service-client')
implementation project(':code:features-crawl:content-type')
implementation project(':code:libraries:language-processing')
implementation project(':third-party:parquet-floor')
implementation project(':third-party:commons-codec')
implementation libs.bundles.slf4j
implementation libs.notnull
implementation libs.bundles.parquet
implementation libs.jwarc
implementation libs.gson
implementation libs.commons.io
implementation libs.commons.lang3
implementation libs.okhttp3
implementation libs.jsoup
implementation libs.snakeyaml
implementation libs.zstd

View File

@ -1,5 +1,6 @@
package nu.marginalia.crawl.retreival.logic;
package nu.marginalia.crawling.body;
import nu.marginalia.contenttype.ContentType;
import nu.marginalia.model.EdgeUrl;
import java.util.List;
@ -37,6 +38,9 @@ public class ContentTypeLogic {
return probableBinaryPattern.test(pathLowerCase);
}
public boolean isAllowableContentType(ContentType contentType) {
return isAllowableContentType(contentType.contentType());
}
public boolean isAllowableContentType(String contentType) {
if (allowAllContentTypes)
return true;

View File

@ -0,0 +1,76 @@
package nu.marginalia.crawling.body;
import nu.marginalia.contenttype.ContentType;
import nu.marginalia.contenttype.ContentTypeParser;
import nu.marginalia.contenttype.DocumentBodyToString;
import nu.marginalia.crawling.model.CrawlerDocumentStatus;
import org.apache.commons.io.input.BOMInputStream;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.zip.GZIPInputStream;
public class DocumentBodyExtractor {
private static final ContentTypeLogic contentTypeLogic = new ContentTypeLogic();
private static final Logger logger = LoggerFactory.getLogger(DocumentBodyExtractor.class);
/** Extract the body from a fetch result as a byte array. */
public static DocumentBodyResult<byte[]> asBytes(HttpFetchResult result) {
if (result instanceof HttpFetchResult.ResultOk fetchOk) {
return asBytes(fetchOk);
}
else if (result instanceof HttpFetchResult.Result304ReplacedWithReference retained) {
return new DocumentBodyResult.Ok<>(retained.contentType(), retained.body().getBytes());
}
return new DocumentBodyResult.Error<>(CrawlerDocumentStatus.ERROR, "Fetch Result Not Ok");
}
/** Extract the body from a fetch result as a string. This function performs
* content-type checks to ensure that the content-type is such that this operation
* makes sense.
*
* @see ContentTypeLogic#isAllowableContentType(String)
* */
public static DocumentBodyResult<String> asString(HttpFetchResult result) {
return asBytes(result).flatMap(DocumentBodyExtractor::toStringResult);
}
private static DocumentBodyResult<String> toStringResult(ContentType contentType, byte[] bytes) {
if (contentTypeLogic.isAllowableContentType(contentType)) {
try {
return new DocumentBodyResult.Ok<>(contentType, DocumentBodyToString.getStringData(contentType, bytes));
}
catch (Exception ex) {
return new DocumentBodyResult.Error<>(CrawlerDocumentStatus.BAD_CONTENT_TYPE, "");
}
}
else {
return new DocumentBodyResult.Error<>(CrawlerDocumentStatus.BAD_CONTENT_TYPE, "");
}
}
/** Extract the body from a successful fetch result as a byte array, decoding gzip Content-Encoding and stripping any byte order mark. */
public static DocumentBodyResult<byte[]> asBytes(HttpFetchResult.ResultOk rsp) {
try {
var byteStream = rsp.getInputStream();
if ("gzip".equals(rsp.header("Content-Encoding"))) {
byteStream = new GZIPInputStream(byteStream);
}
byteStream = new BOMInputStream(byteStream);
var contentTypeHeader = rsp.header("Content-Type");
byte[] data = byteStream.readAllBytes(); // size is limited by WarcRecorder
var contentType = ContentTypeParser.parseContentType(contentTypeHeader, data);
return new DocumentBodyResult.Ok<>(contentType, data);
} catch (Exception ex) {
logger.error("Failed to extract body", ex);
return new DocumentBodyResult.Error<>(CrawlerDocumentStatus.ERROR, "");
}
}
}
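A sketch of how a caller might consume the extractor, branching on the sealed DocumentBodyResult in the same instanceof style used elsewhere in this change (the 'response' variable is a jwarc WarcResponse assumed to be in scope):

HttpFetchResult result = HttpFetchResult.importWarc(response);

var parsed = DocumentBodyExtractor.asString(result);
if (parsed instanceof DocumentBodyResult.Ok<String> ok) {
    System.out.println(ok.contentType() + ": " + ok.body().length() + " chars");
}
else if (parsed instanceof DocumentBodyResult.Error<String> error) {
    System.out.println("skipped: " + error.status() + " " + error.why());
}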

View File

@ -0,0 +1,58 @@
package nu.marginalia.crawling.body;
import nu.marginalia.contenttype.ContentType;
import nu.marginalia.crawling.model.CrawlerDocumentStatus;
import java.util.Optional;
import java.util.function.BiFunction;
public sealed interface DocumentBodyResult<T> {
record Ok<T>(ContentType contentType, T body) implements DocumentBodyResult<T> {
@Override
public <T2> Optional<T2> mapOpt(BiFunction<ContentType, T, T2> mapper) {
return Optional.of(mapper.apply(contentType, body));
}
@Override
public <T2> Optional<T2> flatMapOpt(BiFunction<ContentType, T, Optional<T2>> mapper) {
return mapper.apply(contentType, body);
}
@Override
public <T2> DocumentBodyResult<T2> flatMap(BiFunction<ContentType, T, DocumentBodyResult<T2>> mapper) {
return mapper.apply(contentType, body);
}
@Override
public void ifPresent(ExConsumer<T, Exception> consumer) throws Exception {
consumer.accept(contentType, body);
}
}
record Error<T>(CrawlerDocumentStatus status, String why) implements DocumentBodyResult<T> {
@Override
public <T2> Optional<T2> mapOpt(BiFunction<ContentType, T, T2> mapper) {
return Optional.empty();
}
public <T2> Optional<T2> flatMapOpt(BiFunction<ContentType, T, Optional<T2>> mapper) { return Optional.empty(); }
@Override
@SuppressWarnings("unchecked")
public <T2> DocumentBodyResult<T2> flatMap(BiFunction<ContentType, T, DocumentBodyResult<T2>> mapper) {
return (DocumentBodyResult<T2>) this;
}
@Override
public void ifPresent(ExConsumer<T, Exception> consumer) throws Exception {
}
}
<T2> Optional<T2> mapOpt(BiFunction<ContentType, T, T2> mapper);
<T2> Optional<T2> flatMapOpt(BiFunction<ContentType, T, Optional<T2>> mapper);
<T2> DocumentBodyResult<T2> flatMap(BiFunction<ContentType, T, DocumentBodyResult<T2>> mapper);
void ifPresent(ExConsumer<T,Exception> consumer) throws Exception;
interface ExConsumer<T,E extends Exception> {
void accept(ContentType contentType, T t) throws E;
}
}
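For callers that only care about the success case, mapOpt collapses the result into an Optional; a minimal sketch, assuming an HttpFetchResult named 'result' is in scope. Note that ifPresent propagates checked exceptions thrown by the consumer.

Optional<Integer> bodyLength = DocumentBodyExtractor.asString(result)
        .mapOpt((contentType, body) -> body.length());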

View File

@ -0,0 +1,160 @@
package nu.marginalia.crawling.body;
import nu.marginalia.contenttype.ContentType;
import okhttp3.Headers;
import org.jsoup.Jsoup;
import org.netpreserve.jwarc.MessageHeaders;
import org.netpreserve.jwarc.WarcResponse;
import org.jsoup.nodes.Document;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.InetAddress;
import java.net.URI;
import java.util.Optional;
/* FIXME: This interface has a very unfortunate name that is not very descriptive.
*/
public sealed interface HttpFetchResult {
boolean isOk();
/** Convert a WarcResponse to an HttpFetchResult */
static HttpFetchResult importWarc(WarcResponse response) {
try {
var http = response.http();
try (var body = http.body()) {
byte[] bytes = body.stream().readAllBytes();
String ipAddress = response
.ipAddress()
.map(InetAddress::getHostAddress)
.orElse("");
return new ResultOk(
response.targetURI(),
http.status(),
http.headers(),
ipAddress,
bytes,
0,
bytes.length
);
}
}
catch (Exception ex) {
return new ResultException(ex);
}
}
/** Corresponds to a successful retrieval of a document
* from the remote server. Note that byte[] is only borrowed
* and subsequent calls may overwrite the contents of this buffer.
*/
record ResultOk(URI uri,
int statusCode,
Headers headers,
String ipAddress,
byte[] bytesRaw,
int bytesStart,
int bytesLength
) implements HttpFetchResult {
public boolean isOk() {
return statusCode >= 200 && statusCode < 300;
}
public ResultOk(URI uri,
int statusCode,
MessageHeaders headers,
String ipAddress,
byte[] bytesRaw,
int bytesStart,
int bytesLength) {
this(uri, statusCode, convertHeaders(headers), ipAddress, bytesRaw, bytesStart, bytesLength);
}
private static Headers convertHeaders(MessageHeaders headers) {
var ret = new Headers.Builder();
for (var header : headers.map().entrySet()) {
for (var value : header.getValue()) {
ret.add(header.getKey(), value);
}
}
return ret.build();
}
public InputStream getInputStream() {
return new ByteArrayInputStream(bytesRaw, bytesStart, bytesLength);
}
public Optional<Document> parseDocument() throws IOException {
return DocumentBodyExtractor.asString(this).flatMapOpt((contentType, body) -> {
if (contentType.is("text/html")) {
return Optional.of(Jsoup.parse(body));
}
else {
return Optional.empty();
}
});
}
public String header(String name) {
return headers.get(name);
}
};
/** This is a special case where the document was not re-fetched
* because a copy was already in the database; the response has been
* replaced with that reference data.
*
* @see Result304Raw for the case where the document has not yet been replaced with the reference data.
*/
record Result304ReplacedWithReference(String url, ContentType contentType, String body) implements HttpFetchResult {
public boolean isOk() {
return true;
}
public Optional<Document> parseDocument() {
try {
return Optional.of(Jsoup.parse(body));
}
catch (Exception ex) {
return Optional.empty();
}
}
};
/** Fetching resulted in an exception */
record ResultException(Exception ex) implements HttpFetchResult {
public boolean isOk() {
return false;
}
};
/** Fetching resulted in an HTTP 304: the remote content is identical to
* our reference copy. This will be replaced with a Result304ReplacedWithReference
* at a later stage.
*
* @see Result304ReplacedWithReference
*/
record Result304Raw() implements HttpFetchResult {
public boolean isOk() {
return false;
}
};
/** No result. This is typically injected at a later stage
* of processing, e.g. after filtering out irrelevant responses.
*/
record ResultNone() implements HttpFetchResult {
public boolean isOk() {
return false;
}
};
}
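importWarc bridges jwarc's record model and this result model; a sketch of scanning a WARC file and converting its responses (the file name is made up; WarcReader, WarcRecord and WarcResponse are the jwarc types already used in this change):

try (var reader = new WarcReader(Path.of("crawl-data.warc.gz"))) {
    WarcXResponseReference.register(reader); // so reference responses from earlier crawls hydrate correctly

    for (WarcRecord record : reader) {
        if (record instanceof WarcResponse response) {
            HttpFetchResult result = HttpFetchResult.importWarc(response);
            if (result instanceof HttpFetchResult.ResultOk ok && ok.isOk()) {
                System.out.println(ok.statusCode() + " " + ok.uri());
            }
        }
    }
}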

View File

@ -1,156 +1,52 @@
package nu.marginalia.crawling.io;
import com.github.luben.zstd.RecyclingBufferPool;
import com.github.luben.zstd.ZstdInputStream;
import com.google.gson.Gson;
import nu.marginalia.crawling.model.CrawledDocument;
import nu.marginalia.crawling.model.CrawledDomain;
import nu.marginalia.crawling.model.SerializableCrawlData;
import nu.marginalia.crawling.io.format.LegacySerializableCrawlDataStream;
import nu.marginalia.crawling.io.format.ParquetSerializableCrawlDataStream;
import nu.marginalia.crawling.io.format.WarcSerializableCrawlDataStream;
import nu.marginalia.model.gson.GsonFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.*;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;
import java.util.Optional;
import java.util.concurrent.ForkJoinPool;
import java.util.concurrent.TimeUnit;
public class CrawledDomainReader {
private final Gson gson = GsonFactory.get();
private final Logger logger = LoggerFactory.getLogger(getClass());
private final ForkJoinPool pool = new ForkJoinPool(6);
private static final Gson gson = GsonFactory.get();
public CrawledDomainReader() {
}
/** An iterator-like access to domain data This must be closed otherwise it will leak off-heap memory! */
public SerializableCrawlDataStream createDataStream(Path fullPath) throws IOException {
return new FileReadingSerializableCrawlDataStream(gson, fullPath.toFile());
public static SerializableCrawlDataStream createDataStream(Path fullPath) throws IOException {
String fileName = fullPath.getFileName().toString();
if (fileName.endsWith(".zstd")) {
return new LegacySerializableCrawlDataStream(gson, fullPath.toFile());
}
else if (fileName.endsWith(".warc") || fileName.endsWith(".warc.gz")) {
return new WarcSerializableCrawlDataStream(fullPath);
}
else if (fileName.endsWith(".parquet")) {
return new ParquetSerializableCrawlDataStream(fullPath);
}
else {
throw new IllegalArgumentException("Unknown file type: " + fullPath);
}
}
/** An iterator-like access to domain data. This must be closed otherwise it will leak off-heap memory! */
public SerializableCrawlDataStream createDataStream(Path basePath, String domain, String id) throws IOException {
return createDataStream(CrawlerOutputFile.getOutputFile(basePath, id, domain));
}
public static SerializableCrawlDataStream createDataStream(Path basePath, String domain, String id) throws IOException {
Path parquetPath = CrawlerOutputFile.getParquetPath(basePath, id, domain);
Path warcPath = CrawlerOutputFile.getWarcPath(basePath, id, domain, CrawlerOutputFile.WarcFileVersion.FINAL);
/** Read the entirety of the domain data into memory. This uses a lot of RAM */
public CrawledDomain read(Path path) throws IOException {
DomainDataAssembler domainData = new DomainDataAssembler();
try (var br = new BufferedReader(new InputStreamReader(new ZstdInputStream(new FileInputStream(path.toFile()), RecyclingBufferPool.INSTANCE)))) {
String line;
while ((line = br.readLine()) != null) {
if (line.startsWith("//")) {
String identifier = line;
String data = br.readLine();
pool.execute(() -> deserializeLine(identifier, data, domainData));
}
}
if (Files.exists(parquetPath)) {
return createDataStream(parquetPath);
}
while (!pool.awaitQuiescence(1, TimeUnit.SECONDS));
return domainData.assemble();
}
private void deserializeLine(String identifier, String data, DomainDataAssembler assembler) {
if (null == data) {
return;
if (Files.exists(warcPath)) {
return createDataStream(warcPath);
}
if (identifier.equals(CrawledDomain.SERIAL_IDENTIFIER)) {
assembler.acceptDomain(gson.fromJson(data, CrawledDomain.class));
} else if (identifier.equals(CrawledDocument.SERIAL_IDENTIFIER)) {
assembler.acceptDoc(gson.fromJson(data, CrawledDocument.class));
else {
return createDataStream(CrawlerOutputFile.getLegacyOutputFile(basePath, id, domain));
}
}
public Optional<CrawledDomain> readOptionally(Path path) {
try {
return Optional.of(read(path));
}
catch (Exception ex) {
return Optional.empty();
}
}
private static class DomainDataAssembler {
private CrawledDomain domainPrototype;
private final List<CrawledDocument> docs = new ArrayList<>();
public synchronized void acceptDomain(CrawledDomain domain) {
this.domainPrototype = domain;
}
public synchronized void acceptDoc(CrawledDocument doc) {
docs.add(doc);
}
public synchronized CrawledDomain assemble() {
if (!docs.isEmpty()) {
if (domainPrototype.doc == null)
domainPrototype.doc = new ArrayList<>();
domainPrototype.doc.addAll(docs);
}
return domainPrototype;
}
}
private static class FileReadingSerializableCrawlDataStream implements AutoCloseable, SerializableCrawlDataStream {
private final Gson gson;
private final BufferedReader bufferedReader;
private SerializableCrawlData next = null;
public FileReadingSerializableCrawlDataStream(Gson gson, File file) throws IOException {
this.gson = gson;
bufferedReader = new BufferedReader(new InputStreamReader(new ZstdInputStream(new FileInputStream(file), RecyclingBufferPool.INSTANCE)));
}
@Override
public SerializableCrawlData next() throws IOException {
if (hasNext()) {
var ret = next;
next = null;
return ret;
}
throw new IllegalStateException("No more data");
}
@Override
public boolean hasNext() throws IOException {
if (next != null)
return true;
String identifier = bufferedReader.readLine();
if (identifier == null) {
bufferedReader.close();
return false;
}
String data = bufferedReader.readLine();
if (data == null) {
bufferedReader.close();
return false;
}
if (identifier.equals(CrawledDomain.SERIAL_IDENTIFIER)) {
next = gson.fromJson(data, CrawledDomain.class);
} else if (identifier.equals(CrawledDocument.SERIAL_IDENTIFIER)) {
next = gson.fromJson(data, CrawledDocument.class);
}
else {
throw new IllegalStateException("Unknown identifier: " + identifier);
}
return true;
}
@Override
public void close() throws Exception {
bufferedReader.close();
}
}
}
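Since the reader now dispatches on the file extension, consumers can stay format-agnostic; a sketch of iterating a crawl data file in any of the three formats (the path is made up):

Path crawlData = Path.of("/crawl-data/00/12/0012-www.example.com.parquet"); // .zstd and .warc.gz work the same way

try (var stream = CrawledDomainReader.createDataStream(crawlData)) {
    while (stream.hasNext()) {
        SerializableCrawlData item = stream.next();
        if (item instanceof CrawledDocument doc) {
            System.out.println(doc.url);
        }
    }
}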

View File

@ -55,7 +55,7 @@ public class CrawledDomainWriter implements AutoCloseable {
}
private Path getOutputFile(String id, String name) throws IOException {
return CrawlerOutputFile.createOutputPath(outputDir, id, name);
return CrawlerOutputFile.createLegacyOutputPath(outputDir, id, name);
}
@Override

View File

@ -9,20 +9,20 @@ import java.nio.file.Path;
public class CrawlerOutputFile {
/** Return the Path to a file for the given id and name */
public static Path getOutputFile(Path base, String id, String name) {
public static Path getLegacyOutputFile(Path base, String id, String name) {
id = padId(id);
String first = id.substring(0, 2);
String second = id.substring(2, 4);
Path destDir = base.resolve(first).resolve(second);
return destDir.resolve(id + "-" + filesystemSafeName(name) + ".zstd");
return destDir.resolve(STR."\{id}-\{filesystemSafeName(name)}.zstd");
}
/** Return the Path to a file for the given id and name, creating the prerequisite
* directory structure as necessary. */
public static Path createOutputPath(Path base, String id, String name) throws IOException {
if (id.length() < 4) {
id = Strings.repeat("0", 4 - id.length()) + id;
}
public static Path createLegacyOutputPath(Path base, String id, String name) throws IOException {
id = padId(id);
String first = id.substring(0, 2);
String second = id.substring(2, 4);
@ -31,7 +31,7 @@ public class CrawlerOutputFile {
if (!Files.exists(destDir)) {
Files.createDirectories(destDir);
}
return destDir.resolve(id + "-" + filesystemSafeName(name) + ".zstd");
return destDir.resolve(STR."\{id}-\{filesystemSafeName(name)}.zstd");
}
@ -49,4 +49,71 @@ public class CrawlerOutputFile {
}
public static Path createWarcPath(Path basePath, String id, String domain, WarcFileVersion version) throws IOException {
id = padId(id);
String first = id.substring(0, 2);
String second = id.substring(2, 4);
Path destDir = basePath.resolve(first).resolve(second);
if (!Files.exists(destDir)) {
Files.createDirectories(destDir);
}
return destDir.resolve(STR."\{id}-\{filesystemSafeName(domain)}-\{version.suffix}.warc.gz");
}
public static Path createParquetPath(Path basePath, String id, String domain) throws IOException {
id = padId(id);
String first = id.substring(0, 2);
String second = id.substring(2, 4);
Path destDir = basePath.resolve(first).resolve(second);
if (!Files.exists(destDir)) {
Files.createDirectories(destDir);
}
return destDir.resolve(STR."\{id}-\{filesystemSafeName(domain)}.parquet");
}
public static Path getParquetPath(Path basePath, String id, String domain) {
id = padId(id);
String first = id.substring(0, 2);
String second = id.substring(2, 4);
Path destDir = basePath.resolve(first).resolve(second);
return destDir.resolve(STR."\{id}-\{filesystemSafeName(domain)}.parquet");
}
public static Path getWarcPath(Path basePath, String id, String domain, WarcFileVersion version) {
id = padId(id);
String first = id.substring(0, 2);
String second = id.substring(2, 4);
Path destDir = basePath.resolve(first).resolve(second);
return destDir.resolve(STR."\{id}-\{filesystemSafeName(domain)}.warc\{version.suffix}");
}
/**
* Pads the given ID with leading zeros to ensure it has a length of 4 characters.
*/
private static String padId(String id) {
if (id.length() < 4) {
id = Strings.repeat("0", 4 - id.length()) + id;
}
return id;
}
public enum WarcFileVersion {
LIVE("open"),
TEMP("tmp"),
FINAL("final");
public final String suffix;
WarcFileVersion(String suffix) {
this.suffix = suffix;
}
}
}
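The id is zero-padded to four characters and the first two character pairs become the directory fan-out. Illustrative values, assuming filesystemSafeName leaves a plain domain name unchanged:

// id "7" pads to "0007" -> directories "00" / "07"
CrawlerOutputFile.getParquetPath(Path.of("/crawl-data"), "7", "www.example.com");
// -> /crawl-data/00/07/0007-www.example.com.parquet

CrawlerOutputFile.createWarcPath(Path.of("/crawl-data"), "7", "www.example.com", CrawlerOutputFile.WarcFileVersion.LIVE);
// -> /crawl-data/00/07/0007-www.example.com-open.warc.gz (parent directories are created if missing)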

View File

@ -1,11 +1,13 @@
package nu.marginalia.crawling.io;
import nu.marginalia.crawling.model.SerializableCrawlData;
import org.jetbrains.annotations.Nullable;
import java.io.IOException;
import java.nio.file.Path;
import java.util.Iterator;
/** Closable iterator over serialized crawl data
/** Closable, exception-throwing iterator over serialized crawl data
* The data may appear in any order, and the iterator must be closed.
*
* @see CrawledDomainReader
@ -17,6 +19,8 @@ public interface SerializableCrawlDataStream extends AutoCloseable {
boolean hasNext() throws IOException;
@Nullable
default Path path() { return null; }
// Dummy iterator over nothing
static SerializableCrawlDataStream empty() {

View File

@ -0,0 +1,73 @@
package nu.marginalia.crawling.io.format;
import com.github.luben.zstd.RecyclingBufferPool;
import com.github.luben.zstd.ZstdInputStream;
import com.google.gson.Gson;
import nu.marginalia.crawling.io.SerializableCrawlDataStream;
import nu.marginalia.crawling.model.CrawledDocument;
import nu.marginalia.crawling.model.CrawledDomain;
import nu.marginalia.crawling.model.SerializableCrawlData;
import java.io.*;
import java.nio.file.Path;
/** This class is used to read the old format of crawl data, which was zstd-compressed JSON
* with type delimiters between records.
*/
public class LegacySerializableCrawlDataStream implements AutoCloseable, SerializableCrawlDataStream {
private final Gson gson;
private final BufferedReader bufferedReader;
private SerializableCrawlData next = null;
private final Path path;
public LegacySerializableCrawlDataStream(Gson gson, File file) throws IOException {
this.gson = gson;
bufferedReader = new BufferedReader(new InputStreamReader(new ZstdInputStream(new FileInputStream(file), RecyclingBufferPool.INSTANCE)));
path = file.toPath();
}
@Override
public Path path() {
return path;
}
@Override
public SerializableCrawlData next() throws IOException {
if (hasNext()) {
var ret = next;
next = null;
return ret;
}
throw new IllegalStateException("No more data");
}
@Override
public boolean hasNext() throws IOException {
if (next != null)
return true;
String identifier = bufferedReader.readLine();
if (identifier == null) {
bufferedReader.close();
return false;
}
String data = bufferedReader.readLine();
if (data == null) {
bufferedReader.close();
return false;
}
if (identifier.equals(CrawledDomain.SERIAL_IDENTIFIER)) {
next = gson.fromJson(data, CrawledDomain.class);
} else if (identifier.equals(CrawledDocument.SERIAL_IDENTIFIER)) {
next = gson.fromJson(data, CrawledDocument.class);
} else {
throw new IllegalStateException("Unknown identifier: " + identifier);
}
return true;
}
@Override
public void close() throws Exception {
bufferedReader.close();
}
}

View File

@ -0,0 +1,135 @@
package nu.marginalia.crawling.io.format;
import lombok.SneakyThrows;
import nu.marginalia.contenttype.ContentType;
import nu.marginalia.contenttype.DocumentBodyToString;
import nu.marginalia.crawling.io.SerializableCrawlDataStream;
import nu.marginalia.crawling.model.*;
import nu.marginalia.crawling.parquet.CrawledDocumentParquetRecord;
import nu.marginalia.crawling.parquet.CrawledDocumentParquetRecordFileReader;
import nu.marginalia.hash.MurmurHash3_128;
import nu.marginalia.model.EdgeUrl;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.net.URISyntaxException;
import java.nio.file.Path;
import java.util.*;
public class ParquetSerializableCrawlDataStream implements AutoCloseable, SerializableCrawlDataStream {
private static final Logger logger = LoggerFactory.getLogger(ParquetSerializableCrawlDataStream.class);
private final MurmurHash3_128 hash = new MurmurHash3_128();
private final Iterator<CrawledDocumentParquetRecord> backingIterator;
private final Deque<SerializableCrawlData> nextQ = new ArrayDeque<>();
private boolean wroteDomainRecord = false;
private final Path path;
public ParquetSerializableCrawlDataStream(Path file) throws IOException {
path = file;
backingIterator = CrawledDocumentParquetRecordFileReader.stream(file).iterator();
}
@Override
public Path path() {
return path;
}
@Override
@SneakyThrows
public boolean hasNext() {
while (backingIterator.hasNext() && nextQ.isEmpty()) {
var nextRecord = backingIterator.next();
if (!wroteDomainRecord) {
createDomainRecord(nextRecord);
wroteDomainRecord = true;
}
createDocumentRecord(nextRecord);
}
return !nextQ.isEmpty();
}
private void createDomainRecord(CrawledDocumentParquetRecord parquetRecord) throws URISyntaxException {
CrawlerDomainStatus status = CrawlerDomainStatus.OK;
String statusReason = "";
String redirectDomain = null;
if (parquetRecord.contentType.equals("x-marginalia/advisory;state=redirect")) {
EdgeUrl crawledUrl = new EdgeUrl(parquetRecord.url);
redirectDomain = crawledUrl.getDomain().toString();
status = CrawlerDomainStatus.REDIRECT;
}
else if (parquetRecord.contentType.equals("x-marginalia/advisory;state=blocked")) {
status = CrawlerDomainStatus.BLOCKED;
}
else if (parquetRecord.contentType.equals("x-marginalia/advisory;state=error")) {
status = CrawlerDomainStatus.ERROR;
statusReason = new String(parquetRecord.body);
}
nextQ.add(new CrawledDomain(
parquetRecord.domain,
redirectDomain,
status.toString(),
statusReason,
parquetRecord.ip,
new ArrayList<>(),
new ArrayList<>()
));
}
private void createDocumentRecord(CrawledDocumentParquetRecord nextRecord) {
String bodyString = "";
CrawlerDocumentStatus status = CrawlerDocumentStatus.OK;
if (nextRecord.contentType.startsWith("x-marginalia/advisory;state=content-type-failed-probe")) {
status = CrawlerDocumentStatus.BAD_CONTENT_TYPE;
}
else if (nextRecord.contentType.startsWith("x-marginalia/advisory;state=robots-txt-skipped")) {
status = CrawlerDocumentStatus.ROBOTS_TXT;
}
else if (nextRecord.contentType.startsWith("x-marginalia/advisory")) { // other advisory stuff we don't want
return;
}
else {
try {
bodyString = DocumentBodyToString.getStringData(
ContentType.parse(nextRecord.contentType),
nextRecord.body);
} catch (Exception ex) {
logger.error("Failed to convert body to string", ex);
status = CrawlerDocumentStatus.BAD_CHARSET;
}
}
nextQ.add(new CrawledDocument("",
nextRecord.url,
nextRecord.contentType,
nextRecord.timestamp.toString(),
nextRecord.httpStatus,
status.toString(),
"",
"",
bodyString,
Long.toHexString(hash.hashNearlyASCII(bodyString)), // this field isn't actually used, maybe we can skip calculating it?
nextRecord.url,
null,
"",
nextRecord.cookies));
}
public void close() throws IOException {
}
@Override
public SerializableCrawlData next() throws IOException {
if (!hasNext())
throw new NoSuchElementException();
return nextQ.poll();
}
}

View File

@ -0,0 +1,151 @@
package nu.marginalia.crawling.io.format;
import lombok.SneakyThrows;
import nu.marginalia.crawling.body.DocumentBodyExtractor;
import nu.marginalia.crawling.body.DocumentBodyResult;
import nu.marginalia.crawling.body.HttpFetchResult;
import nu.marginalia.crawling.io.SerializableCrawlDataStream;
import nu.marginalia.crawling.model.CrawledDocument;
import nu.marginalia.crawling.model.CrawledDomain;
import nu.marginalia.crawling.model.SerializableCrawlData;
import org.netpreserve.jwarc.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.nio.file.Path;
import java.util.*;
public class WarcSerializableCrawlDataStream implements AutoCloseable, SerializableCrawlDataStream {
private static final Logger logger = LoggerFactory.getLogger(WarcSerializableCrawlDataStream.class);
private final WarcReader reader;
private final Iterator<WarcRecord> backingIterator;
private SerializableCrawlData next = null;
private final Path path;
public WarcSerializableCrawlDataStream(Path file) throws IOException {
path = file;
reader = new WarcReader(file);
WarcXResponseReference.register(reader);
WarcXEntityRefused.register(reader);
backingIterator = reader.iterator();
}
@Override
public Path path() {
return path;
}
@Override
@SneakyThrows
public boolean hasNext() {
while (backingIterator.hasNext() && next == null) {
var nextRecord = backingIterator.next();
if (nextRecord instanceof WarcResponse response) { // this also includes WarcXResponseReference
convertResponse(response);
}
else if (nextRecord instanceof Warcinfo warcinfo) {
convertWarcinfo(warcinfo);
}
}
return next != null;
}
private void convertWarcinfo(Warcinfo warcinfo) throws IOException {
var headers = warcinfo.fields();
String probeStatus = headers.first("X-WARC-Probe-Status").orElse("");
String[] parts = probeStatus.split(" ", 2);
String domain = headers.first("domain").orElseThrow(() -> new IllegalStateException("Missing domain header"));
String status = parts[0];
String statusReason = parts.length > 1 ? parts[1] : "";
String ip = headers.first("ip").orElse("");
String redirectDomain = null;
if ("REDIRECT".equalsIgnoreCase(status)) {
redirectDomain = statusReason;
}
next = new CrawledDomain(domain, redirectDomain, status, statusReason, ip,
new ArrayList<>(),
new ArrayList<>()
);
}
private void convertResponse(WarcResponse response) throws IOException {
var http = response.http();
if (http.status() != 200) {
return;
}
var parsedBody = DocumentBodyExtractor.asString(HttpFetchResult.importWarc(response));
if (parsedBody instanceof DocumentBodyResult.Error<String> error) {
next = new CrawledDocument(
"",
response.targetURI().toString(),
http.contentType().raw(),
response.date().toString(),
http.status(),
error.status().toString(),
error.why(),
headers(http.headers()),
null,
response.payloadDigest().map(WarcDigest::base64).orElse(""),
"",
"",
"",
WarcXCookieInformationHeader.hasCookies(response)
);
} else if (parsedBody instanceof DocumentBodyResult.Ok<String> ok) {
next = new CrawledDocument(
"",
response.targetURI().toString(),
ok.contentType().toString(),
response.date().toString(),
http.status(),
"OK",
"",
headers(http.headers()),
ok.body(),
response.payloadDigest().map(WarcDigest::base64).orElse(""),
"",
"",
"",
WarcXCookieInformationHeader.hasCookies(response));
} else {
// unreachable
throw new IllegalStateException("Unknown body type: " + parsedBody);
}
}
public String headers(MessageHeaders headers) {
StringJoiner ret = new StringJoiner("\n");
for (var header : headers.map().entrySet()) {
for (var value : header.getValue()) {
ret.add(STR."\{header.getKey()}: \{value}");
}
}
return ret.toString();
}
public void close() throws IOException {
reader.close();
}
@Override
public SerializableCrawlData next() throws IOException {
if (!hasNext())
throw new NoSuchElementException();
try {
return next;
}
finally {
next = null;
}
}
}

View File

@ -1,5 +0,0 @@
package nu.marginalia.crawling.model;
public record ContentType(String contentType, String charset) {
}

View File

@ -23,13 +23,21 @@ public class CrawledDocument implements SerializableCrawlData {
public String headers;
public String documentBody;
@Deprecated
public String documentBodyHash;
@Deprecated
public String canonicalUrl;
public String redirectUrl;
@Deprecated
public String recrawlState;
/** This is not guaranteed to be set in all versions of the format;
* the information may come in CrawledDomain instead */
public Boolean hasCookies = false;
public static final String SERIAL_IDENTIFIER = "// DOCUMENT";
@Override
public String getSerialIdentifier() {

View File

@ -17,6 +17,9 @@ public class CrawledDomain implements SerializableCrawlData {
public String ip;
public List<CrawledDocument> doc;
/** This is not guaranteed to be set in all versions of the format;
* the information may come in CrawledDocument instead */
public List<String> cookies;
public int size() {
@ -24,6 +27,10 @@ public class CrawledDomain implements SerializableCrawlData {
return doc.size();
}
public boolean hasCookies() {
return cookies != null && !cookies.isEmpty();
}
public static final String SERIAL_IDENTIFIER = "// DOMAIN";
@Override
public String getSerialIdentifier() {

View File

@ -0,0 +1,97 @@
package nu.marginalia.crawling.parquet;
import blue.strategic.parquet.Dehydrator;
import blue.strategic.parquet.Hydrator;
import blue.strategic.parquet.ValueWriter;
import lombok.AllArgsConstructor;
import lombok.EqualsAndHashCode;
import lombok.NoArgsConstructor;
import lombok.ToString;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.Types;
import java.time.Instant;
import static org.apache.parquet.schema.LogicalTypeAnnotation.*;
import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.*;
@AllArgsConstructor
@NoArgsConstructor
@EqualsAndHashCode
@ToString
public class CrawledDocumentParquetRecord {
public String domain;
public String url;
public String ip;
public boolean cookies;
public int httpStatus;
public Instant timestamp;
public String contentType;
public byte[] body;
public static Hydrator<CrawledDocumentParquetRecord, CrawledDocumentParquetRecord> newHydrator() {
return new CrawledDocumentParquetRecordHydrator();
}
public static Dehydrator<CrawledDocumentParquetRecord> newDehydrator() {
return CrawledDocumentParquetRecord::dehydrate;
}
public static MessageType schema = new MessageType(
CrawledDocumentParquetRecord.class.getSimpleName(),
Types.required(BINARY).as(stringType()).named("domain"),
Types.required(BINARY).as(stringType()).named("url"),
Types.required(BINARY).as(stringType()).named("ip"),
Types.required(BOOLEAN).named("cookies"),
Types.required(INT32).named("httpStatus"),
Types.required(INT64).named("epochSeconds"),
Types.required(BINARY).as(stringType()).named("contentType"),
Types.required(BINARY).named("body")
);
public CrawledDocumentParquetRecord add(String heading, Object value) {
switch (heading) {
case "domain" -> domain = (String) value;
case "url" -> url = (String) value;
case "ip" -> ip = (String) value;
case "httpStatus" -> httpStatus = (Integer) value;
case "cookies" -> cookies = (Boolean) value;
case "contentType" -> contentType = (String) value;
case "body" -> body = (byte[]) value;
case "epochSeconds" -> timestamp = Instant.ofEpochSecond((Long) value);
default -> throw new UnsupportedOperationException("Unknown heading '" + heading + "'");
}
return this;
}
public void dehydrate(ValueWriter valueWriter) {
valueWriter.write("domain", domain);
valueWriter.write("url", url);
valueWriter.write("ip", ip);
valueWriter.write("epochSeconds", timestamp.getEpochSecond());
valueWriter.write("httpStatus", httpStatus);
valueWriter.write("cookies", cookies);
valueWriter.write("contentType", contentType);
valueWriter.write("body", body);
}
}
class CrawledDocumentParquetRecordHydrator implements Hydrator<CrawledDocumentParquetRecord, CrawledDocumentParquetRecord> {
@Override
public CrawledDocumentParquetRecord start() {
return new CrawledDocumentParquetRecord();
}
@Override
public CrawledDocumentParquetRecord add(CrawledDocumentParquetRecord target, String heading, Object value) {
return target.add(heading, value);
}
@Override
public CrawledDocumentParquetRecord finish(CrawledDocumentParquetRecord target) {
return target;
}
}

View File

@ -0,0 +1,19 @@
package nu.marginalia.crawling.parquet;
import blue.strategic.parquet.HydratorSupplier;
import blue.strategic.parquet.ParquetReader;
import org.jetbrains.annotations.NotNull;
import java.io.IOException;
import java.nio.file.Path;
import java.util.stream.Stream;
public class CrawledDocumentParquetRecordFileReader {
@NotNull
public static Stream<CrawledDocumentParquetRecord> stream(Path path) throws IOException {
return ParquetReader.streamContent(path.toFile(),
HydratorSupplier.constantly(CrawledDocumentParquetRecord.newHydrator()));
}
}
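A sketch of streaming records straight from a parquet file (the file name is made up; the returned Stream is closed via try-with-resources since it is backed by an open file):

try (Stream<CrawledDocumentParquetRecord> records =
             CrawledDocumentParquetRecordFileReader.stream(Path.of("0012-www.example.com.parquet"))) {
    records.filter(r -> r.httpStatus == 200)
           .forEach(r -> System.out.println(r.url + " " + r.contentType));
}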

View File

@ -0,0 +1,247 @@
package nu.marginalia.crawling.parquet;
import blue.strategic.parquet.ParquetWriter;
import nu.marginalia.UserAgent;
import nu.marginalia.crawling.body.DocumentBodyExtractor;
import nu.marginalia.crawling.body.DocumentBodyResult;
import nu.marginalia.crawling.body.HttpFetchResult;
import org.apache.commons.lang3.StringUtils;
import org.netpreserve.jwarc.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.net.URI;
import java.nio.file.Path;
import java.time.Instant;
import java.util.List;
import java.util.Objects;
public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable {
private final ParquetWriter<CrawledDocumentParquetRecord> writer;
private static final Logger logger = LoggerFactory.getLogger(CrawledDocumentParquetRecordFileWriter.class);
public static void convertWarc(String domain,
UserAgent userAgent,
Path warcInputFile,
Path parquetOutputFile) {
try (var warcReader = new WarcReader(warcInputFile);
var parquetWriter = new CrawledDocumentParquetRecordFileWriter(parquetOutputFile)
) {
WarcXResponseReference.register(warcReader);
WarcXEntityRefused.register(warcReader);
String uaString = userAgent.uaString();
for (var record : warcReader) {
if (record instanceof WarcResponse response) {
// this also captures WarcXResponseReference, which inherits from WarcResponse
// and is used to store old responses from previous crawls; in this part of the logic
// we treat them the same as a normal response
if (!filterResponse(uaString, response)) {
continue;
}
parquetWriter.write(domain, response);
}
else if (record instanceof WarcXEntityRefused refused) {
parquetWriter.write(domain, refused);
}
else if (record instanceof Warcinfo warcinfo) {
parquetWriter.write(warcinfo);
}
}
}
catch (Exception ex) {
logger.error("Failed to convert WARC file to Parquet", ex);
}
}
/** Return true if the WarcResponse should be kept and converted to parquet; false if it should be excluded */
private static boolean filterResponse(String uaString, WarcResponse response) throws IOException {
// We don't want to store robots.txt files, as they are not
// interesting for the analysis we want to do. They are filtered out
// here by path rather than by content type, since txt files in
// general are interesting and we don't want to exclude them as a class.
if (response.targetURI().getPath().equals("/robots.txt")) {
return false;
}
var robotsTags = response.http().headers().all("X-Robots-Tag");
if (!isXRobotsTagsPermitted(robotsTags, uaString)) {
return false;
}
return true;
}
private void write(String domain, WarcXEntityRefused refused) throws IOException {
URI profile = refused.profile();
String meta;
if (profile.equals(WarcXEntityRefused.documentRobotsTxtSkippedURN)) {
meta = "x-marginalia/advisory;state=robots-txt-skipped";
}
else if (profile.equals(WarcXEntityRefused.documentBadContentTypeURN)) {
meta = "x-marginalia/advisory;state=content-type-failed-probe";
}
else if (profile.equals(WarcXEntityRefused.documentProbeTimeout)) {
meta = "x-marginalia/advisory;state=timeout-probe";
}
else if (profile.equals(WarcXEntityRefused.documentUnspecifiedError)) {
meta = "x-marginalia/advisory;state=doc-error";
}
else {
meta = "x-marginalia/advisory;state=unknown";
}
write(forDocError(domain, refused.date(), refused.target(), meta));
}
private void write(Warcinfo warcinfo) throws IOException {
String selfDomain = warcinfo.fields().first("domain").orElse("");
String ip = warcinfo.fields().first("ip").orElse("");
String probeStatus = warcinfo.fields().first("X-WARC-Probe-Status").orElse("");
if (probeStatus.startsWith("REDIRECT")) {
String redirectDomain = probeStatus.substring("REDIRECT;".length());
write(forDomainRedirect(selfDomain, warcinfo.date(), redirectDomain));
}
else if (!"OK".equals(probeStatus)) {
write(forDomainError(selfDomain, warcinfo.date(), ip, probeStatus));
}
}
public CrawledDocumentParquetRecordFileWriter(Path file) throws IOException {
writer = ParquetWriter.writeFile(CrawledDocumentParquetRecord.schema,
file.toFile(), CrawledDocumentParquetRecord.newDehydrator());
}
public void write(CrawledDocumentParquetRecord domainData) throws IOException {
writer.write(domainData);
}
public void write(String domain, WarcResponse response) throws IOException {
HttpFetchResult result = HttpFetchResult.importWarc(response);
if (!(result instanceof HttpFetchResult.ResultOk fetchOk)) {
return;
}
byte[] bodyBytes;
String contentType;
var body = DocumentBodyExtractor.asBytes(result);
if (body instanceof DocumentBodyResult.Ok<byte[]> bodyOk) {
bodyBytes = bodyOk.body();
contentType = bodyOk.contentType().toString();
}
else {
bodyBytes = new byte[0];
contentType = "";
}
write(new CrawledDocumentParquetRecord(
domain,
response.target(),
fetchOk.ipAddress(),
WarcXCookieInformationHeader.hasCookies(response),
fetchOk.statusCode(),
response.date(),
contentType,
bodyBytes)
);
}
public void close() throws IOException {
writer.close();
}
private CrawledDocumentParquetRecord forDomainRedirect(String domain, Instant date, String redirectDomain) {
return new CrawledDocumentParquetRecord(domain,
STR."https://\{redirectDomain}/",
"",
false,
0,
date,
"x-marginalia/advisory;state=redirect",
new byte[0]
);
}
private CrawledDocumentParquetRecord forDomainError(String domain, Instant date, String ip, String errorStatus) {
return new CrawledDocumentParquetRecord(domain,
STR."https://\{domain}/",
ip,
false,
0,
date,
"x-marginalia/advisory;state=error",
errorStatus.getBytes()
);
}
private CrawledDocumentParquetRecord forDocError(String domain, Instant date, String url, String errorStatus) {
return new CrawledDocumentParquetRecord(domain,
url,
"",
false,
0,
date,
errorStatus,
new byte[0]
);
}
/** Check X-Robots-Tag header tag to see if we are allowed to index this page.
* <p>
* Reference: <a href="https://developers.google.com/search/docs/crawling-indexing/robots-meta-tag">https://developers.google.com/search/docs/crawling-indexing/robots-meta-tag</a>
*
* @param xRobotsHeaderTags List of X-Robots-Tag values
* @param userAgent User agent string
* @return true if we are allowed to index this page
*/
// Visible for tests
public static boolean isXRobotsTagsPermitted(List<String> xRobotsHeaderTags, String userAgent) {
boolean isPermittedGeneral = true;
boolean isPermittedMarginalia = false;
boolean isForbiddenMarginalia = false;
for (String header : xRobotsHeaderTags) {
if (header.indexOf(':') >= 0) {
String[] parts = StringUtils.split(header, ":", 2);
if (parts.length < 2)
continue;
// Is this relevant to us?
if (!Objects.equals(parts[0].trim(), userAgent))
continue;
if (parts[1].contains("noindex"))
isForbiddenMarginalia = true;
else if (parts[1].contains("none"))
isForbiddenMarginalia = true;
else if (parts[1].contains("all"))
isPermittedMarginalia = true;
}
else {
if (header.contains("noindex"))
isPermittedGeneral = false;
if (header.contains("none"))
isPermittedGeneral = false;
}
}
if (isPermittedMarginalia)
return true;
if (isForbiddenMarginalia)
return false;
return isPermittedGeneral;
}
}
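The X-Robots-Tag handling lets an agent-specific directive override the general one; a few illustrative cases, assuming the crawler identifies itself as "search.marginalia.nu":

isXRobotsTagsPermitted(List.of("noindex"), "search.marginalia.nu");                               // false: general noindex
isXRobotsTagsPermitted(List.of("googlebot: noindex"), "search.marginalia.nu");                    // true: directive targets another crawler
isXRobotsTagsPermitted(List.of("noindex", "search.marginalia.nu: all"), "search.marginalia.nu");  // true: explicitly allowed for us
isXRobotsTagsPermitted(List.of("search.marginalia.nu: noindex"), "search.marginalia.nu");         // false: explicitly forbidden for us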

View File

@ -0,0 +1,35 @@
package org.netpreserve.jwarc;
import okhttp3.HttpUrl;
import okhttp3.OkHttpClient;
/** Encapsulates out-of-band information about whether a website uses cookies,
* using a non-standard WARC header "X-Has-Cookies".
*/
public class WarcXCookieInformationHeader {
private boolean hasCookies = false;
private static final String headerName = "X-Has-Cookies";
public void update(OkHttpClient client, HttpUrl url) {
if (!hasCookies) {
hasCookies = !client.cookieJar().loadForRequest(url).isEmpty();
}
}
public boolean hasCookies() {
return hasCookies;
}
public void paint(WarcResponse.Builder builder) {
builder.addHeader(headerName, hasCookies ? "1" : "0");
}
public void paint(WarcXResponseReference.Builder builder) {
builder.addHeader(headerName, hasCookies ? "1" : "0");
}
public static boolean hasCookies(WarcRecord record) {
return record.headers().contains(headerName, "1");
}
}

View File

@ -0,0 +1,45 @@
package org.netpreserve.jwarc;
import java.io.IOException;
import java.net.URI;
/** This defines a non-standard extension to WARC for recording documents that the
* crawler refused to fetch or process, e.g. because of robots.txt or a failed
* content type probe; essentially a 'revisit' with different semantics.
*/
public class WarcXEntityRefused extends WarcRevisit {
private static final String TYPE_NAME = "x-entity-refused";
public static final URI documentRobotsTxtSkippedURN = URI.create("urn:marginalia/meta/doc/robots-txt-skipped");
public static final URI documentBadContentTypeURN = URI.create("urn:marginalia/meta/doc/content-type-failed-probe");
public static final URI documentProbeTimeout = URI.create("urn:marginalia/meta/doc/timeout-probe");
public static final URI documentUnspecifiedError = URI.create("urn:marginalia/meta/doc/error");
WarcXEntityRefused(MessageVersion version, MessageHeaders headers, MessageBody body) {
super(version, headers, body);
}
public static void register(WarcReader reader) {
reader.registerType(TYPE_NAME, WarcXEntityRefused::new);
}
public static class Builder extends AbstractBuilder<WarcXEntityRefused, Builder> {
public Builder(URI targetURI, URI profile) {
this(targetURI.toString(), profile.toString());
}
public Builder(String targetURI, String profileURI) {
super(TYPE_NAME);
setHeader("WARC-Target-URI", targetURI);
setHeader("WARC-Profile", profileURI);
}
public Builder body(HttpResponse httpResponse) throws IOException {
return body(MediaType.HTTP_RESPONSE, httpResponse);
}
@Override
public WarcXEntityRefused build() {
return build(WarcXEntityRefused::new);
}
}
}

View File

@ -0,0 +1,42 @@
package org.netpreserve.jwarc;
import java.io.IOException;
import java.net.URI;
/** This defines a non-standard extension to WARC for storing old HTTP responses,
* essentially a 'response' with different semantics.
* <p>
* An x-response-reference record is a response record with a full body, where
* the data is a reconstructed HTTP response from a previous crawl.
*/
public class WarcXResponseReference extends WarcResponse {
private static final String TYPE_NAME = "x-response-reference";
WarcXResponseReference(MessageVersion version, MessageHeaders headers, MessageBody body) {
super(version, headers, body);
}
public static void register(WarcReader reader) {
reader.registerType(TYPE_NAME, WarcXResponseReference::new);
}
public static class Builder extends AbstractBuilder<WarcXResponseReference, Builder> {
public Builder(URI targetURI) {
this(targetURI.toString());
}
public Builder(String targetURI) {
super(TYPE_NAME);
setHeader("WARC-Target-URI", targetURI);
}
public Builder body(HttpResponse httpResponse) throws IOException {
return body(MediaType.HTTP_RESPONSE, httpResponse);
}
@Override
public WarcXResponseReference build() {
return build(WarcXResponseReference::new);
}
}
}

View File

@ -74,23 +74,13 @@ public class CrawlPlan {
return count;
}
@Deprecated
public Iterable<CrawledDomain> domainsIterable() {
final CrawledDomainReader reader = new CrawledDomainReader();
return WorkLog.iterableMap(crawl.getLogFile(),
entry -> {
var path = getCrawledFilePath(entry.path());
if (!Files.exists(path)) {
logger.warn("File not found: {}", path);
return Optional.empty();
}
return reader.readOptionally(path);
});
// This is no longer supported
throw new UnsupportedOperationException();
}
public Iterable<SerializableCrawlDataStream> crawlDataIterable(Predicate<String> idPredicate) {
final CrawledDomainReader reader = new CrawledDomainReader();
return WorkLog.iterableMap(crawl.getLogFile(),
entry -> {
if (!idPredicate.test(entry.id())) {
@ -105,7 +95,7 @@ public class CrawlPlan {
}
try {
return Optional.of(reader.createDataStream(path));
return Optional.of(CrawledDomainReader.createDataStream(path));
}
catch (IOException ex) {
return Optional.empty();

View File

@ -0,0 +1,78 @@
package nu.marginalia.crawling.parquet;
import nu.marginalia.crawling.io.format.ParquetSerializableCrawlDataStream;
import nu.marginalia.crawling.model.CrawledDocument;
import nu.marginalia.crawling.model.CrawledDomain;
import nu.marginalia.crawling.model.SerializableCrawlData;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.time.Instant;
import java.util.ArrayList;
import static org.junit.jupiter.api.Assertions.*;
class CrawledDocumentParquetRecordFileWriterTest {
Path tempFile;
@BeforeEach
public void setUp() throws IOException {
tempFile = Files.createTempFile("test", ".parquet");
}
@AfterEach
public void tearDown() throws IOException {
Files.delete(tempFile);
}
@Test
void testWriteRead() throws IOException {
var original = new CrawledDocumentParquetRecord("www.marginalia.nu",
"https://www.marginalia.nu/",
"127.0.0.1",
false,
200,
Instant.now(),
"text/html",
"hello world".getBytes());
try (var writer = new CrawledDocumentParquetRecordFileWriter(tempFile)) {
writer.write(original);
}
var items = new ArrayList<SerializableCrawlData>();
try (var stream = new ParquetSerializableCrawlDataStream(tempFile)) {
while (stream.hasNext()) {
items.add(stream.next());
}
}
assertEquals(2, items.size());
var firstItem = items.get(0);
assertInstanceOf(CrawledDomain.class, firstItem);
var domain = (CrawledDomain) firstItem;
assertEquals("www.marginalia.nu", domain.domain);
assertNull(domain.redirectDomain);
assertEquals("OK", domain.crawlerStatus);
assertEquals("", domain.crawlerStatusDesc);
assertEquals(new ArrayList<>(), domain.doc);
assertEquals(new ArrayList<>(), domain.cookies);
var secondItem = items.get(1);
assertInstanceOf(CrawledDocument.class, secondItem);
var document = (CrawledDocument) secondItem;
assertEquals("https://www.marginalia.nu/", document.url);
assertEquals("text/html", document.contentType);
assertEquals("hello world", document.documentBody);
assertEquals(200, document.httpStatus);
}
}

View File

@ -59,6 +59,7 @@ dependencies {
implementation project(':code:features-crawl:crawl-blocklist')
implementation project(':code:features-crawl:link-parser')
implementation project(':code:features-crawl:content-type')
testImplementation project(':code:libraries:term-frequency-dict')
testImplementation project(':code:process-models:crawl-spec')
@ -66,6 +67,7 @@ dependencies {
implementation libs.bundles.slf4j
implementation libs.notnull
implementation libs.jwarc
implementation libs.jsoup

View File

@ -268,6 +268,14 @@ public class ConverterMain {
processData.asPath(),
msg, inbox);
}
case SideloadWarc -> {
var processData = fileStorageService.getStorage(request.processedDataStorage);
yield new SideloadAction(
sideloadSourceFactory.sideloadWarc(Path.of(request.inputSource)),
processData.asPath(),
msg, inbox);
}
case SideloadStackexchange -> {
var processData = fileStorageService.getStorage(request.processedDataStorage);

View File

@ -105,13 +105,6 @@ public class DocumentProcessor {
private EdgeUrl getDocumentUrl(CrawledDocument crawledDocument)
throws URISyntaxException
{
if (crawledDocument.canonicalUrl != null) {
try {
return new EdgeUrl(crawledDocument.canonicalUrl);
}
catch (URISyntaxException ex) { /* fallthrough */ }
}
return new EdgeUrl(crawledDocument.url);
}

View File

@ -18,6 +18,7 @@ import nu.marginalia.model.EdgeUrl;
import nu.marginalia.converting.processor.logic.links.TopKeywords;
import nu.marginalia.converting.processor.logic.LshDocumentDeduplicator;
import nu.marginalia.model.crawl.HtmlFeature;
import org.jetbrains.annotations.Nullable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -53,9 +54,15 @@ public class DomainProcessor {
}
@SneakyThrows
@Nullable
public ProcessedDomain process(SerializableCrawlDataStream dataStream) {
if (!dataStream.hasNext()) {
return null;
}
var ret = new ProcessedDomain();
List<ProcessedDocument> docs = new ArrayList<>();
Set<String> processedUrls = new HashSet<>();
boolean cookies = false;
String ip = "";
@ -79,7 +86,7 @@ public class DomainProcessor {
ret.domain = new EdgeDomain(crawledDomain.domain);
ret.ip = crawledDomain.ip;
cookies = Objects.requireNonNullElse(crawledDomain.cookies, Collections.emptyList()).size() > 0;
cookies = crawledDomain.hasCookies();
ip = crawledDomain.ip;
if (crawledDomain.redirectDomain != null) {
@ -90,10 +97,12 @@ public class DomainProcessor {
}
else if (data instanceof CrawledDocument doc) {
try {
if (doc.url == null)
if (doc.url == null || !processedUrls.add(doc.url))
continue;
fixBadCanonicalTag(doc);
if (Boolean.TRUE.equals(doc.hasCookies)) {
cookies = true;
}
// This case should never be reachable, as we should have initiated
// the externalDomainLinks variable above if we made it past the
@ -161,25 +170,6 @@ public class DomainProcessor {
return false;
}
private void fixBadCanonicalTag(CrawledDocument doc) {
// Some sites have a canonical tag that points to a different domain,
// but our loader can not support this, so we point these back to the
// original url.
var canonicalOpt = EdgeUrl.parse(doc.canonicalUrl);
if (canonicalOpt.isEmpty()) return;
var urlOpt = EdgeUrl.parse(doc.url);
if (urlOpt.isEmpty()) return;
var urlActual = urlOpt.get();
var canonicalActual = canonicalOpt.get();
if (!Objects.equals(urlActual.domain, canonicalActual.domain)) {
doc.canonicalUrl = doc.url;
}
}
private void calculateStatistics(ProcessedDomain ret, DomainLinks externalDomainLinks) {
LinkGraph linkGraph = new LinkGraph();
TopKeywords topKeywords = new TopKeywords();

View File

@ -7,6 +7,7 @@ import nu.marginalia.atags.source.AnchorTagsSourceFactory;
import nu.marginalia.converting.sideload.dirtree.DirtreeSideloaderFactory;
import nu.marginalia.converting.sideload.encyclopedia.EncyclopediaMarginaliaNuSideloader;
import nu.marginalia.converting.sideload.stackexchange.StackexchangeSideloader;
import nu.marginalia.converting.sideload.warc.WarcSideloadFactory;
import nu.marginalia.keyword.DocumentKeywordExtractor;
import nu.marginalia.language.sentence.ThreadLocalSentenceExtractorProvider;
@ -24,6 +25,7 @@ public class SideloadSourceFactory {
private final AnchorTextKeywords anchorTextKeywords;
private final AnchorTagsSourceFactory anchorTagsSourceFactory;
private final DirtreeSideloaderFactory dirtreeSideloaderFactory;
private final WarcSideloadFactory warcSideloadFactory;
@Inject
public SideloadSourceFactory(Gson gson,
@ -31,7 +33,8 @@ public class SideloadSourceFactory {
ThreadLocalSentenceExtractorProvider sentenceExtractorProvider,
DocumentKeywordExtractor documentKeywordExtractor, AnchorTextKeywords anchorTextKeywords,
AnchorTagsSourceFactory anchorTagsSourceFactory,
DirtreeSideloaderFactory dirtreeSideloaderFactory) {
DirtreeSideloaderFactory dirtreeSideloaderFactory,
WarcSideloadFactory warcSideloadFactory) {
this.gson = gson;
this.sideloaderProcessing = sideloaderProcessing;
this.sentenceExtractorProvider = sentenceExtractorProvider;
@ -39,6 +42,7 @@ public class SideloadSourceFactory {
this.anchorTextKeywords = anchorTextKeywords;
this.anchorTagsSourceFactory = anchorTagsSourceFactory;
this.dirtreeSideloaderFactory = dirtreeSideloaderFactory;
this.warcSideloadFactory = warcSideloadFactory;
}
public SideloadSource sideloadEncyclopediaMarginaliaNu(Path pathToDbFile, String baseUrl) throws SQLException {
@ -49,6 +53,10 @@ public class SideloadSourceFactory {
return dirtreeSideloaderFactory.createSideloaders(pathToYamlFile);
}
public Collection<? extends SideloadSource> sideloadWarc(Path pathToWarcFiles) throws IOException {
return warcSideloadFactory.createSideloaders(pathToWarcFiles);
}
/** Do not use, this code isn't finished */
public Collection<? extends SideloadSource> sideloadStackexchange(Path pathToDbFileRoot) throws IOException {
try (var dirs = Files.walk(pathToDbFileRoot)) {

View File

@ -50,7 +50,8 @@ public class SideloaderProcessing {
Integer.toHexString(url.hashCode()),
url,
"",
"SIDELOAD"
"SIDELOAD",
false
);
var ret = new ProcessedDocument();

View File

@ -0,0 +1,32 @@
package nu.marginalia.converting.sideload.warc;
import nu.marginalia.converting.sideload.SideloadSource;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
public class WarcSideloadFactory {
public Collection<? extends SideloadSource> createSideloaders(Path pathToWarcFiles) throws IOException {
final List<Path> files = new ArrayList<>();
try (var stream = Files.list(pathToWarcFiles)) {
stream
.filter(Files::isRegularFile)
.filter(this::isWarcFile)
.forEach(files::add);
}
// stub
return null;
}
private boolean isWarcFile(Path path) {
return path.toString().endsWith(".warc")
|| path.toString().endsWith(".warc.gz");
}
}
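As committed, the factory is still a stub: it lists the matching files and then returns null. A plausible completion, assuming SideloaderProcessing can be injected here the same way WarcSideloader receives it, might look like the following sketch (not part of this patch):
package nu.marginalia.converting.sideload.warc;
import com.google.inject.Inject;
import nu.marginalia.converting.sideload.SideloadSource;
import nu.marginalia.converting.sideload.SideloaderProcessing;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
public class WarcSideloadFactory {
    private final SideloaderProcessing processing; // assumed injectable, as in WarcSideloader
    @Inject
    public WarcSideloadFactory(SideloaderProcessing processing) {
        this.processing = processing;
    }
    public Collection<? extends SideloadSource> createSideloaders(Path pathToWarcFiles) throws IOException {
        final List<WarcSideloader> sideloaders = new ArrayList<>();
        try (var stream = Files.list(pathToWarcFiles)) {
            for (Path file : stream.filter(Files::isRegularFile).filter(this::isWarcFile).toList()) {
                // One sideloader per WARC file, each sniffing its own domain
                sideloaders.add(new WarcSideloader(file, processing));
            }
        }
        return sideloaders;
    }
    private boolean isWarcFile(Path path) {
        return path.toString().endsWith(".warc")
            || path.toString().endsWith(".warc.gz");
    }
}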

View File

@ -0,0 +1,160 @@
package nu.marginalia.converting.sideload.warc;
import lombok.SneakyThrows;
import nu.marginalia.atags.model.DomainLinks;
import nu.marginalia.contenttype.ContentTypeParser;
import nu.marginalia.contenttype.DocumentBodyToString;
import nu.marginalia.converting.model.GeneratorType;
import nu.marginalia.converting.model.ProcessedDocument;
import nu.marginalia.converting.model.ProcessedDomain;
import nu.marginalia.converting.sideload.SideloadSource;
import nu.marginalia.converting.sideload.SideloaderProcessing;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawl.DomainIndexingState;
import org.netpreserve.jwarc.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.net.URISyntaxException;
import java.nio.file.Path;
import java.util.Iterator;
import java.util.List;
import java.util.Objects;
import java.util.Optional;
public class WarcSideloader implements SideloadSource, AutoCloseable {
private static final Logger logger = LoggerFactory.getLogger(WarcSideloader.class);
private final SideloaderProcessing sideloaderProcessing;
private final WarcReader reader;
private final EdgeDomain domain;
public WarcSideloader(Path warcFile,
SideloaderProcessing sideloaderProcessing)
throws IOException
{
this.sideloaderProcessing = sideloaderProcessing;
this.reader = new WarcReader(warcFile);
this.domain = sniffDomainFromWarc()
.orElseThrow(() -> new IOException("Could not identify domain from warc file"));
}
@SneakyThrows
@Override
public ProcessedDomain getDomain() {
var ret = new ProcessedDomain();
ret.domain = domain;
ret.ip = "0.0.0.0";
ret.state = DomainIndexingState.ACTIVE;
return ret;
}
private Optional<EdgeDomain> sniffDomainFromWarc() throws IOException {
try {
for (var record : reader) {
if (!(record instanceof WarcRequest request)) {
continue;
}
String target = request.target();
if (target.startsWith("http://") || target.startsWith("https://")) {
return Optional.of(new EdgeUrl(target).getDomain());
}
}
} catch (URISyntaxException e) {
return Optional.empty();
} finally {
reader.position(0);
}
return Optional.empty();
}
@SneakyThrows
@Override
public Iterator<ProcessedDocument> getDocumentsStream() {
return reader.records()
.filter(record -> record instanceof WarcResponse)
.map(WarcResponse.class::cast)
.filter(this::isRelevantResponse)
.map(this::process)
.filter(Optional::isPresent)
.map(Optional::get)
.iterator();
}
private boolean isRelevantResponse(WarcResponse warcResponse) {
try {
HttpResponse httpResponse = warcResponse.http();
if (httpResponse == null)
return false;
if (httpResponse.status() != 200)
return false;
if (!Objects.equals(httpResponse.contentType(), MediaType.HTML))
return false;
var url = new EdgeUrl(warcResponse.target());
if (!Objects.equals(url.getDomain(), domain)) {
return false;
}
return true;
} catch (Exception e) {
e.printStackTrace();
}
return false;
}
@SneakyThrows
private Optional<ProcessedDocument> process(WarcResponse response) {
Optional<String> body = getBody(response);
String url = response.target();
// We trim "/index.html" suffixes from the URL if they are present,
// since these are typically an artifact of document retrieval
if (url.endsWith("/index.html")) {
url = url.substring(0, url.length() - "index.html".length());
}
if (body.isEmpty()) {
return Optional.empty();
}
return Optional.of(sideloaderProcessing
.processDocument(url, body.get(), List.of(), new DomainLinks(),
GeneratorType.DOCS,
10_000));
}
@SneakyThrows
private Optional<String> getBody(WarcResponse response) {
var http = response.http();
// TODO: We should support additional encodings here
try (var body = http.body()) {
String contentType = http.headers().first("Content-Type").orElse(null);
byte[] bytes = body.stream().readAllBytes();
var ct = ContentTypeParser.parseContentType(contentType, bytes);
return Optional.of(DocumentBodyToString.getStringData(ct, bytes));
}
catch (Exception ex) {
logger.info("Failed to parse body", ex);
}
return Optional.empty();
}
@Override
public void close() throws Exception {
reader.close();
}
}

View File

@ -3,6 +3,7 @@ package nu.marginalia.converting.writer;
import lombok.SneakyThrows;
import nu.marginalia.converting.model.ProcessedDomain;
import nu.marginalia.worklog.BatchingWorkLog;
import org.jetbrains.annotations.Nullable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -41,7 +42,10 @@ public class ConverterWriter implements AutoCloseable {
}
@SneakyThrows
public void accept(ProcessedDomain domain) {
public void accept(@Nullable ProcessedDomain domain) {
if (null == domain)
return;
domainData.put(domain);
}

View File

@ -65,6 +65,7 @@ public class ConvertingIntegrationTest {
@Test
public void testMemexMarginaliaNu() throws IOException {
var ret = domainProcessor.process(asSerializableCrawlData(readMarginaliaWorkingSet()));
assertNotNull(ret);
assertEquals(ret.state, DomainIndexingState.ACTIVE);
assertEquals(ret.domain, new EdgeDomain("memex.marginalia.nu"));
@ -114,7 +115,8 @@ public class ConvertingIntegrationTest {
Double.toString(Math.random()),
"https://memex.marginalia.nu/" + file,
null,
""
"",
false
);
docs.add(doc);
}

View File

@ -3,31 +3,51 @@ package nu.marginalia.converting;
import com.google.inject.Guice;
import com.google.inject.Injector;
import lombok.SneakyThrows;
import nu.marginalia.UserAgent;
import nu.marginalia.WmsaHome;
import nu.marginalia.converting.model.ProcessedDomain;
import nu.marginalia.converting.processor.DomainProcessor;
import nu.marginalia.crawl.retreival.CrawlerRetreiver;
import nu.marginalia.crawl.retreival.DomainProber;
import nu.marginalia.crawl.retreival.fetcher.HttpFetcher;
import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl;
import nu.marginalia.crawling.io.SerializableCrawlDataStream;
import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
import nu.marginalia.crawling.io.format.ParquetSerializableCrawlDataStream;
import nu.marginalia.crawling.model.CrawledDocument;
import nu.marginalia.crawling.model.CrawledDomain;
import nu.marginalia.crawling.model.SerializableCrawlData;
import nu.marginalia.crawling.parquet.CrawledDocumentParquetRecordFileWriter;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.crawl.DomainIndexingState;
import nu.marginalia.model.crawlspec.CrawlSpecRecord;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Tag;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;
import java.util.function.Predicate;
import java.util.stream.Collectors;
/* This is mostly a debugging utility */
import static org.junit.jupiter.api.Assertions.*;
/** Tests for the crawler and converter integration. These are pretty slow and potentially
* a bit flaky, since they attempt to fetch real websites.
*/
@Tag("slow")
public class CrawlingThenConvertingIntegrationTest {
private DomainProcessor domainProcessor;
private HttpFetcher httpFetcher;
private static final Logger logger = LoggerFactory.getLogger(CrawlingThenConvertingIntegrationTest.class);
private Path fileName;
private Path fileName2;
@SneakyThrows
@BeforeAll
public static void setUpAll() {
@ -44,10 +64,80 @@ public class CrawlingThenConvertingIntegrationTest {
domainProcessor = injector.getInstance(DomainProcessor.class);
httpFetcher = new HttpFetcherImpl(WmsaHome.getUserAgent().uaString());
this.fileName = Files.createTempFile("crawling-then-converting", ".warc.gz");
this.fileName2 = Files.createTempFile("crawling-then-converting", ".warc.gz");
}
@AfterEach
public void tearDown() throws IOException {
Files.deleteIfExists(fileName);
Files.deleteIfExists(fileName2);
}
@Test
public void crawlThenProcess() {
public void testInvalidDomain() throws IOException {
// Attempt to fetch an invalid domain
var specs = CrawlSpecRecord.builder()
.domain("invalid.invalid.invalid")
.crawlDepth(10)
.urls(List.of()) // add specific URLs to crawl here
.build();
CrawledDomain crawlData = crawl(specs);
assertEquals("ERROR", crawlData.crawlerStatus);
assertTrue(crawlData.doc.isEmpty());
var processedData = process();
assertNotNull(processedData);
assertTrue(processedData.documents.isEmpty());
}
@Test
public void testRedirectingDomain() throws IOException {
// Attempt to fetch a domain that redirects to another domain
var specs = CrawlSpecRecord.builder()
.domain("memex.marginalia.nu")
.crawlDepth(10)
.urls(List.of()) // add specific URLs to crawl here
.build();
CrawledDomain crawlData = crawl(specs);
assertEquals("REDIRECT", crawlData.crawlerStatus);
assertEquals("www.marginalia.nu", crawlData.redirectDomain);
assertTrue(crawlData.doc.isEmpty());
var processedData = process();
assertNotNull(processedData);
assertTrue(processedData.documents.isEmpty());
}
@Test
public void testBlockedDomain() throws IOException {
// Attempt to fetch a domain that is blocked by the domain blacklist
var specs = CrawlSpecRecord.builder()
.domain("search.marginalia.nu")
.crawlDepth(10)
.urls(List.of()) // add specific URLs to crawl here
.build();
CrawledDomain crawlData = crawl(specs, d->false); // simulate blocking by blacklisting everything
assertEquals("ERROR", crawlData.crawlerStatus);
assertEquals("BLOCKED;IP not allowed", crawlData.crawlerStatusDesc);
assertTrue(crawlData.doc.isEmpty());
var processedData = process();
assertNotNull(processedData);
assertTrue(processedData.documents.isEmpty());
}
@Test
public void crawlSunnyDay() throws IOException {
var specs = CrawlSpecRecord.builder()
.domain("www.marginalia.nu")
.crawlDepth(10)
@ -55,12 +145,20 @@ public class CrawlingThenConvertingIntegrationTest {
.build();
CrawledDomain domain = crawl(specs);
assertFalse(domain.doc.isEmpty());
assertEquals("OK", domain.crawlerStatus);
assertEquals("www.marginalia.nu", domain.domain);
List<SerializableCrawlData> data = new ArrayList<>();
data.add(domain);
data.addAll(domain.doc);
boolean hasRobotsTxt = domain.doc.stream().map(doc -> doc.url).anyMatch(url -> url.endsWith("/robots.txt"));
assertFalse(hasRobotsTxt, "Robots.txt should not leave the crawler");
var output = process();
assertNotNull(output);
assertFalse(output.documents.isEmpty());
assertEquals(new EdgeDomain("www.marginalia.nu"), output.domain);
assertEquals(DomainIndexingState.ACTIVE, output.state);
var output = domainProcessor.process(SerializableCrawlDataStream.fromIterator(data.iterator()));
for (var doc : output.documents) {
if (doc.isOk()) {
@ -73,12 +171,122 @@ public class CrawlingThenConvertingIntegrationTest {
}
private CrawledDomain crawl(CrawlSpecRecord specs) {
@Test
public void crawlContentTypes() throws IOException {
var specs = CrawlSpecRecord.builder()
.domain("www.marginalia.nu")
.crawlDepth(5)
.urls(List.of(
"https://www.marginalia.nu/sanic.png",
"https://www.marginalia.nu/invalid"
))
.build();
CrawledDomain domain = crawl(specs);
assertFalse(domain.doc.isEmpty());
assertEquals("OK", domain.crawlerStatus);
assertEquals("www.marginalia.nu", domain.domain);
Set<String> allUrls = domain.doc.stream().map(doc -> doc.url).collect(Collectors.toSet());
assertTrue(allUrls.contains("https://www.marginalia.nu/sanic.png"), "Should have a record for the image despite its blocked content type");
assertTrue(allUrls.contains("https://www.marginalia.nu/invalid"), "Should have a record for the invalid URL");
var output = process();
assertNotNull(output);
assertFalse(output.documents.isEmpty());
assertEquals(new EdgeDomain("www.marginalia.nu"), output.domain);
assertEquals(DomainIndexingState.ACTIVE, output.state);
for (var doc : output.documents) {
if (doc.isOk()) {
System.out.println(doc.url + "\t" + doc.state + "\t" + doc.details.title);
}
else {
System.out.println(doc.url + "\t" + doc.state + "\t" + doc.stateReason);
}
}
}
@Test
public void crawlRobotsTxt() throws IOException {
var specs = CrawlSpecRecord.builder()
.domain("search.marginalia.nu")
.crawlDepth(5)
.urls(List.of(
"https://search.marginalia.nu/search?q=hello+world"
))
.build();
CrawledDomain domain = crawl(specs);
assertFalse(domain.doc.isEmpty());
assertEquals("OK", domain.crawlerStatus);
assertEquals("search.marginalia.nu", domain.domain);
Set<String> allUrls = domain.doc.stream().map(doc -> doc.url).collect(Collectors.toSet());
assertTrue(allUrls.contains("https://search.marginalia.nu/search"), "We expect a record even for URLs that robots.txt forbids");
var output = process();
assertNotNull(output);
assertFalse(output.documents.isEmpty());
assertEquals(new EdgeDomain("search.marginalia.nu"), output.domain);
assertEquals(DomainIndexingState.ACTIVE, output.state);
for (var doc : output.documents) {
if (doc.isOk()) {
System.out.println(doc.url + "\t" + doc.state + "\t" + doc.details.title);
}
else {
System.out.println(doc.url + "\t" + doc.state + "\t" + doc.stateReason);
}
}
}
private ProcessedDomain process() {
try (var stream = new ParquetSerializableCrawlDataStream(fileName2)) {
return domainProcessor.process(stream);
}
catch (Exception e) {
Assertions.fail(e);
return null; // unreachable
}
}
private CrawledDomain crawl(CrawlSpecRecord specs) throws IOException {
return crawl(specs, domain -> true);
}
private CrawledDomain crawl(CrawlSpecRecord specs, Predicate<EdgeDomain> domainBlacklist) throws IOException {
List<SerializableCrawlData> data = new ArrayList<>();
new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, data::add).fetch();
try (var recorder = new WarcRecorder(fileName)) {
new CrawlerRetreiver(httpFetcher, new DomainProber(domainBlacklist), specs, recorder).fetch();
}
CrawledDocumentParquetRecordFileWriter.convertWarc(specs.domain,
new UserAgent("test"),
fileName, fileName2);
try (var reader = new ParquetSerializableCrawlDataStream(fileName2)) {
while (reader.hasNext()) {
var next = reader.next();
logger.info("{}", next);
data.add(next);
}
}
CrawledDomain domain = data.stream()
.filter(CrawledDomain.class::isInstance)
.map(CrawledDomain.class::cast)
.findFirst()
.get();
CrawledDomain domain = data.stream().filter(CrawledDomain.class::isInstance).map(CrawledDomain.class::cast).findFirst().get();
data.stream().filter(CrawledDocument.class::isInstance).map(CrawledDocument.class::cast).forEach(domain.doc::add);
return domain;
}

View File

@ -0,0 +1,81 @@
package nu.marginalia.converting.sideload.warc;
import com.google.inject.AbstractModule;
import com.google.inject.Guice;
import nu.marginalia.converting.ConverterModule;
import nu.marginalia.converting.model.ProcessedDocument;
import nu.marginalia.converting.model.ProcessedDomain;
import nu.marginalia.converting.processor.ConverterDomainTypes;
import nu.marginalia.converting.sideload.SideloaderProcessing;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.mockito.Mockito;
import org.netpreserve.jwarc.WarcWriter;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.mockito.Mockito.when;
class WarcSideloaderTest extends AbstractModule {
SideloaderProcessing processing;
Path warcFile;
@BeforeEach
public void setUp() throws IOException {
processing = Guice.createInjector(new ConverterModule(), this)
.getInstance(SideloaderProcessing.class);
warcFile = Files.createTempFile(getClass().getSimpleName(), ".warc.gz");
}
@AfterEach
public void tearDown() throws IOException {
Files.deleteIfExists(warcFile);
}
public void configure() {
var domainTypesMock = Mockito.mock(ConverterDomainTypes.class);
when(domainTypesMock.isBlog(Mockito.any())).thenReturn(false);
bind(ConverterDomainTypes.class).toInstance(domainTypesMock);
}
@Test
public void test() throws IOException {
try (var writer = new WarcWriter(Files.newOutputStream(warcFile))) {
writer.fetch(new URI("https://www.marginalia.nu/"));
writer.fetch(new URI("https://www.marginalia.nu/log/93_atags/"));
writer.fetch(new URI("https://www.marginalia.nu/links/"));
} catch (URISyntaxException e) {
throw new RuntimeException(e);
}
ProcessedDomain domain;
List<ProcessedDocument> docs = new ArrayList<>();
try (var sideloader = new WarcSideloader(warcFile, processing)) {
domain = sideloader.getDomain();
sideloader.getDocumentsStream().forEachRemaining(docs::add);
} catch (Exception e) {
throw new RuntimeException(e);
}
assertNotNull(domain);
assertEquals(3, docs.size());
List<String> fetchedUrls = docs.stream().map(doc -> doc.url).map(Object::toString).toList();
assertEquals(List.of(
"https://www.marginalia.nu/",
"https://www.marginalia.nu/log/93_atags/",
"https://www.marginalia.nu/links/"),
fetchedUrls);
}
}

View File

@ -41,6 +41,7 @@ dependencies {
implementation project(':code:features-convert:anchor-keywords')
implementation project(':code:features-crawl:crawl-blocklist')
implementation project(':code:features-crawl:link-parser')
implementation project(':code:features-crawl:content-type')
implementation libs.bundles.slf4j
@ -48,6 +49,7 @@ dependencies {
implementation libs.guice
implementation libs.gson
implementation libs.zstd
implementation libs.jwarc
implementation libs.crawlercommons
implementation libs.okhttp3
implementation libs.jsoup

View File

@ -1,83 +0,0 @@
package nu.marginalia.crawl;
import lombok.SneakyThrows;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.concurrent.Semaphore;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;
public class CrawlLimiter {
public static final int maxPoolSize = Integer.getInteger("crawler.pool-size", 256);
// Thresholds for throttling task-spawning. Note there's a bit of hysteresis to this
private final long THROTTLE_TRIGGER_FREE_RAM = Runtime.getRuntime().maxMemory() / 4;
private final long THROTTLE_RELEASE_FREE_RAM = Runtime.getRuntime().maxMemory() / 2;
private final Semaphore taskSemCount = new Semaphore(maxPoolSize);
// When set to true, the crawler will wait before starting additional tasks
private final AtomicBoolean throttle = new AtomicBoolean(false);
private static final Logger logger = LoggerFactory.getLogger(CrawlLimiter.class);
public CrawlLimiter() {
Thread monitorThread = new Thread(this::monitor, "Memory Monitor");
monitorThread.setDaemon(true);
monitorThread.start();
}
@SneakyThrows
public void monitor() {
for (;;) {
synchronized (throttle) {
boolean oldThrottle = throttle.get();
boolean newThrottle = oldThrottle;
if (Runtime.getRuntime().maxMemory() == Long.MAX_VALUE) {
// According to the spec this may happen, although it seems to rarely
// be the case in practice
logger.warn("Memory based throttling disabled (set Xmx)");
return;
}
final long freeMemory = Runtime.getRuntime().maxMemory() - Runtime.getRuntime().totalMemory();
if (oldThrottle && freeMemory > THROTTLE_RELEASE_FREE_RAM) {
newThrottle = false;
logger.warn("Memory based throttling released");
}
else if (!oldThrottle && freeMemory < THROTTLE_TRIGGER_FREE_RAM) {
newThrottle = true;
logger.warn("Memory based throttling triggered");
// Try to GC
System.gc();
}
throttle.set(newThrottle);
if (!newThrottle) {
throttle.notifyAll();
}
if (newThrottle != oldThrottle) {
logger.warn("Memory based throttling set to {}", newThrottle);
}
}
TimeUnit.SECONDS.sleep(1);
}
}
@SneakyThrows
public void waitForEnoughRAM() {
while (throttle.get()) {
synchronized (throttle) {
throttle.wait(30000);
}
}
}
}

View File

@ -13,10 +13,13 @@ import nu.marginalia.atags.source.AnchorTagsSourceFactory;
import nu.marginalia.crawl.retreival.CrawlDataReference;
import nu.marginalia.crawl.retreival.DomainProber;
import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl;
import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
import nu.marginalia.crawl.spec.CrawlSpecProvider;
import nu.marginalia.crawl.spec.DbCrawlSpecProvider;
import nu.marginalia.crawl.spec.ParquetCrawlSpecProvider;
import nu.marginalia.crawling.io.CrawledDomainReader;
import nu.marginalia.crawling.io.CrawlerOutputFile;
import nu.marginalia.crawling.parquet.CrawledDocumentParquetRecordFileWriter;
import nu.marginalia.crawlspec.CrawlSpecFileNames;
import nu.marginalia.storage.FileStorageService;
import nu.marginalia.model.crawlspec.CrawlSpecRecord;
@ -27,18 +30,17 @@ import nu.marginalia.mq.inbox.MqSingleShotInbox;
import nu.marginalia.process.control.ProcessHeartbeatImpl;
import nu.marginalia.process.log.WorkLog;
import nu.marginalia.service.module.DatabaseModule;
import nu.marginalia.crawling.io.CrawledDomainWriter;
import nu.marginalia.crawl.retreival.CrawlerRetreiver;
import nu.marginalia.crawl.retreival.fetcher.HttpFetcher;
import nu.marginalia.util.SimpleBlockingThreadPool;
import okhttp3.ConnectionPool;
import okhttp3.Dispatcher;
import okhttp3.internal.Util;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardCopyOption;
import java.sql.SQLException;
import java.util.*;
import java.util.concurrent.*;
@ -49,13 +51,8 @@ import static nu.marginalia.mqapi.ProcessInboxNames.CRAWLER_INBOX;
public class CrawlerMain {
private final static Logger logger = LoggerFactory.getLogger(CrawlerMain.class);
private final ProcessHeartbeatImpl heartbeat;
private final ConnectionPool connectionPool = new ConnectionPool(5, 10, TimeUnit.SECONDS);
private final Dispatcher dispatcher = new Dispatcher(new ThreadPoolExecutor(0, Integer.MAX_VALUE, 5, TimeUnit.SECONDS,
new SynchronousQueue<>(), Util.threadFactory("OkHttp Dispatcher", true)));
private final UserAgent userAgent;
private final ProcessHeartbeatImpl heartbeat;
private final MessageQueueFactory messageQueueFactory;
private final DomainProber domainProber;
private final FileStorageService fileStorageService;
@ -66,13 +63,12 @@ public class CrawlerMain {
private final SimpleBlockingThreadPool pool;
private final Map<String, String> processingIds = new ConcurrentHashMap<>();
private final CrawledDomainReader reader = new CrawledDomainReader();
final AbortMonitor abortMonitor = AbortMonitor.getInstance();
volatile int totalTasks;
final AtomicInteger tasksDone = new AtomicInteger(0);
private final CrawlLimiter limiter = new CrawlLimiter();
private HttpFetcherImpl fetcher;
@Inject
public CrawlerMain(UserAgent userAgent,
@ -83,8 +79,8 @@ public class CrawlerMain {
DbCrawlSpecProvider dbCrawlSpecProvider,
AnchorTagsSourceFactory anchorTagsSourceFactory,
Gson gson) {
this.heartbeat = heartbeat;
this.userAgent = userAgent;
this.heartbeat = heartbeat;
this.messageQueueFactory = messageQueueFactory;
this.domainProber = domainProber;
this.fileStorageService = fileStorageService;
@ -93,8 +89,14 @@ public class CrawlerMain {
this.gson = gson;
this.node = processConfiguration.node();
// maybe need to set -Xss for JVM to deal with this?
pool = new SimpleBlockingThreadPool("CrawlerPool", CrawlLimiter.maxPoolSize, 1);
pool = new SimpleBlockingThreadPool("CrawlerPool",
Integer.getInteger("crawler.pool-size", 256),
1);
fetcher = new HttpFetcherImpl(userAgent.uaString(),
new Dispatcher(Executors.newVirtualThreadPerTaskExecutor()),
new ConnectionPool(5, 10, TimeUnit.SECONDS)
);
}
public static void main(String... args) throws Exception {
@ -141,6 +143,7 @@ public class CrawlerMain {
public void run(CrawlSpecProvider specProvider, Path outputDir) throws InterruptedException, IOException {
heartbeat.start();
try (WorkLog workLog = new WorkLog(outputDir.resolve("crawler.log"));
AnchorTagsSource anchorTagsSource = anchorTagsSourceFactory.create(specProvider.getDomains())
) {
@ -175,6 +178,7 @@ public class CrawlerMain {
activePoolCount = newActivePoolCount;
}
}
}
catch (Exception ex) {
logger.warn("Exception in crawler", ex);
@ -211,27 +215,48 @@ public class CrawlerMain {
@Override
public void run() throws Exception {
limiter.waitForEnoughRAM();
Path newWarcFile = CrawlerOutputFile.createWarcPath(outputDir, id, domain, CrawlerOutputFile.WarcFileVersion.LIVE);
Path tempFile = CrawlerOutputFile.createWarcPath(outputDir, id, domain, CrawlerOutputFile.WarcFileVersion.TEMP);
Path finalWarcFile = CrawlerOutputFile.createWarcPath(outputDir, id, domain, CrawlerOutputFile.WarcFileVersion.FINAL);
Path parquetFile = CrawlerOutputFile.createParquetPath(outputDir, id, domain);
HttpFetcher fetcher = new HttpFetcherImpl(userAgent.uaString(), dispatcher, connectionPool);
if (Files.exists(newWarcFile)) {
Files.move(newWarcFile, tempFile, StandardCopyOption.REPLACE_EXISTING);
}
else {
Files.deleteIfExists(tempFile);
}
try (CrawledDomainWriter writer = new CrawledDomainWriter(outputDir, domain, id);
try (var warcRecorder = new WarcRecorder(newWarcFile); // write to a temp file for now
var retriever = new CrawlerRetreiver(fetcher, domainProber, specification, warcRecorder);
CrawlDataReference reference = getReference())
{
Thread.currentThread().setName("crawling:" + domain);
var domainLinks = anchorTagsSource.getAnchorTags(domain);
var retreiver = new CrawlerRetreiver(fetcher, domainProber, specification, writer::accept);
int size = retreiver.fetch(domainLinks, reference);
if (Files.exists(tempFile)) {
retriever.syncAbortedRun(tempFile);
Files.delete(tempFile);
}
workLog.setJobToFinished(domain, writer.getOutputFile().toString(), size);
int size = retriever.fetch(domainLinks, reference);
// Delete the reference crawl data if it's not the same as the new one
// (mostly a case when migrating from legacy->warc)
reference.delete();
CrawledDocumentParquetRecordFileWriter
.convertWarc(domain, userAgent, newWarcFile, parquetFile);
workLog.setJobToFinished(domain, parquetFile.toString(), size);
heartbeat.setProgress(tasksDone.incrementAndGet() / (double) totalTasks);
logger.info("Fetched {}", domain);
} catch (Exception e) {
logger.error("Error fetching domain " + domain, e);
Files.deleteIfExists(newWarcFile);
Files.deleteIfExists(tempFile);
}
finally {
// We don't need to double-count these; it's also kept in the workLog
@ -242,8 +267,7 @@ public class CrawlerMain {
private CrawlDataReference getReference() {
try {
var dataStream = reader.createDataStream(outputDir, domain, id);
return new CrawlDataReference(dataStream);
return new CrawlDataReference(CrawledDomainReader.createDataStream(outputDir, domain, id));
} catch (IOException e) {
logger.debug("Failed to read previous crawl data for {}", specification.domain);
return new CrawlDataReference();

View File

@ -5,14 +5,19 @@ import com.google.common.hash.Hashing;
import nu.marginalia.crawling.io.SerializableCrawlDataStream;
import nu.marginalia.crawling.model.CrawledDocument;
import nu.marginalia.lsh.EasyLSH;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import javax.annotation.Nullable;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
/** A reference to a domain that has been crawled before. */
public class CrawlDataReference implements AutoCloseable {
private final SerializableCrawlDataStream data;
private static final Logger logger = LoggerFactory.getLogger(CrawlDataReference.class);
public CrawlDataReference(SerializableCrawlDataStream data) {
this.data = data;
@ -22,6 +27,15 @@ public class CrawlDataReference implements AutoCloseable {
this(SerializableCrawlDataStream.empty());
}
/** Delete the associated data from disk, if it exists */
public void delete() throws IOException {
Path filePath = data.path();
if (filePath != null) {
Files.deleteIfExists(filePath);
}
}
@Nullable
public CrawledDocument nextDocument() {
try {
@ -32,17 +46,16 @@ public class CrawlDataReference implements AutoCloseable {
}
}
catch (IOException ex) {
ex.printStackTrace();
logger.error("Failed to read next document", ex);
}
return null;
}
public boolean isContentBodySame(CrawledDocument one, CrawledDocument other) {
assert one.documentBody != null;
assert other.documentBody != null;
public boolean isContentBodySame(String one, String other) {
final long contentHashOne = contentHash(one.documentBody);
final long contentHashOther = contentHash(other.documentBody);
final long contentHashOne = contentHash(one);
final long contentHashOther = contentHash(other);
return EasyLSH.hammingDistance(contentHashOne, contentHashOther) < 4;
}
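For context, a rough sketch of how the revisit logic is expected to consult this fuzzy comparison; the surrounding method and names are illustrative only, not part of this change:
import nu.marginalia.crawl.retreival.CrawlDataReference;
import nu.marginalia.crawling.io.SerializableCrawlDataStream;
import nu.marginalia.crawling.model.CrawledDocument;
class RevisitSketch {
    // Returns true if the freshly fetched body is a near-duplicate of the first
    // document in the previous crawl's data (fewer than 4 differing LSH bits).
    static boolean unchangedSinceLastCrawl(SerializableCrawlDataStream oldData, String freshBody) {
        var reference = new CrawlDataReference(oldData);
        CrawledDocument oldDoc = reference.nextDocument();
        return oldDoc != null
            && oldDoc.documentBody != null
            && reference.isContentBodySame(oldDoc.documentBody, freshBody);
    }
}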

View File

@ -20,8 +20,18 @@ public class CrawlDelayTimer {
this.delayTime = delayTime;
}
/** Call when we've gotten an HTTP 429 response. This will wait a moment, and then
* set a flag that slows down the main crawl delay as well. */
public void waitRetryDelay(RateLimitException ex) throws InterruptedException {
slowDown = true;
int delay = ex.retryAfter();
Thread.sleep(Math.clamp(delay, 100, 5000));
}
@SneakyThrows
public void delay(long spentTime) {
public void waitFetchDelay(long spentTime) {
long sleepTime = delayTime;
if (sleepTime >= 1) {
@ -30,10 +40,6 @@ public class CrawlDelayTimer {
Thread.sleep(min(sleepTime - spentTime, 5000));
}
else if (slowDown) {
// Additional delay when the server is signalling it wants slower requests
Thread.sleep( DEFAULT_CRAWL_DELAY_MIN_MS);
}
else {
// When no crawl delay is specified, lean toward twice the fetch+process time,
// within sane limits. This means slower servers get slower crawling, and faster
@ -48,10 +54,10 @@ public class CrawlDelayTimer {
Thread.sleep(sleepTime - spentTime);
}
}
/** Increase the delay between requests if the server is signalling it wants slower requests with HTTP 429 */
public void slowDown() {
slowDown = true;
if (slowDown) {
// Additional delay when the server is signalling it wants slower requests
Thread.sleep( DEFAULT_CRAWL_DELAY_MIN_MS);
}
}
}
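A condensed sketch of the intended call pattern around a single page fetch follows; the fetch itself is a placeholder and the import path for CrawlDelayTimer is assumed, so read it as an illustration rather than code from this change:
import nu.marginalia.crawl.retreival.CrawlDelayTimer; // package assumed
class PacingSketch {
    // Pace requests in proportion to how long each fetch took, as the timer does above.
    static void crawlPolitely(CrawlDelayTimer timer, Iterable<Runnable> fetches) {
        for (Runnable fetchOnePage : fetches) {
            long start = System.currentTimeMillis();
            fetchOnePage.run(); // placeholder for the actual HTTP fetch
            // On an HTTP 429 the real code additionally calls timer.waitRetryDelay(ex),
            // which also flips the slow-down flag consulted by waitFetchDelay.
            timer.waitFetchDelay(System.currentTimeMillis() - start);
        }
    }
}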

View File

@ -0,0 +1,91 @@
package nu.marginalia.crawl.retreival;
import nu.marginalia.crawling.body.HttpFetchResult;
import nu.marginalia.crawling.model.CrawledDocument;
import nu.marginalia.crawling.model.CrawlerDocumentStatus;
import nu.marginalia.model.EdgeUrl;
import java.time.LocalDateTime;
import java.util.Objects;
public class CrawledDocumentFactory {
public static CrawledDocument createHardErrorRsp(EdgeUrl url, Exception why) {
return CrawledDocument.builder()
.crawlerStatus(CrawlerDocumentStatus.ERROR.toString())
.crawlerStatusDesc(why.getClass().getSimpleName() + ": " + why.getMessage())
.timestamp(LocalDateTime.now().toString())
.url(url.toString())
.build();
}
public static CrawledDocument createUnknownHostError(EdgeUrl url) {
return CrawledDocument.builder()
.crawlerStatus(CrawlerDocumentStatus.ERROR.toString())
.crawlerStatusDesc("Unknown Host")
.timestamp(LocalDateTime.now().toString())
.url(url.toString())
.build();
}
public static CrawledDocument createTimeoutErrorRsp(EdgeUrl url) {
return CrawledDocument.builder()
.crawlerStatus("Timeout")
.timestamp(LocalDateTime.now().toString())
.url(url.toString())
.build();
}
public static CrawledDocument createErrorResponse(EdgeUrl url, HttpFetchResult.ResultOk rsp, CrawlerDocumentStatus status, String why) {
return CrawledDocument.builder()
.crawlerStatus(status.toString())
.crawlerStatusDesc(why)
.headers(rsp.headers().toString())
.contentType(Objects.requireNonNullElse(rsp.headers().get("Content-Type"), ""))
.timestamp(LocalDateTime.now().toString())
.httpStatus(rsp.statusCode())
.url(url.toString())
.build();
}
public static CrawledDocument createErrorResponse(EdgeUrl url, String contentType, int statusCode, CrawlerDocumentStatus status, String why) {
return CrawledDocument.builder()
.crawlerStatus(status.toString())
.crawlerStatusDesc(why)
.headers("")
.contentType(contentType)
.timestamp(LocalDateTime.now().toString())
.httpStatus(statusCode)
.url(url.toString())
.build();
}
public static CrawledDocument createRedirectResponse(EdgeUrl url, HttpFetchResult.ResultOk rsp, EdgeUrl responseUrl) {
return CrawledDocument.builder()
.crawlerStatus(CrawlerDocumentStatus.REDIRECT.name())
.redirectUrl(responseUrl.toString())
.headers(rsp.headers().toString())
.contentType(Objects.requireNonNullElse(rsp.headers().get("Content-Type"), ""))
.timestamp(LocalDateTime.now().toString())
.httpStatus(rsp.statusCode())
.url(url.toString())
.build();
}
public static CrawledDocument createRobotsError(EdgeUrl url) {
return CrawledDocument.builder()
.url(url.toString())
.timestamp(LocalDateTime.now().toString())
.httpStatus(-1)
.crawlerStatus(CrawlerDocumentStatus.ROBOTS_TXT.name())
.build();
}
public static CrawledDocument createRetryError(EdgeUrl url) {
return CrawledDocument.builder()
.url(url.toString())
.timestamp(LocalDateTime.now().toString())
.httpStatus(429)
.crawlerStatus(CrawlerDocumentStatus.ERROR.name())
.build();
}
}

View File

@ -3,11 +3,15 @@ package nu.marginalia.crawl.retreival;
import com.google.common.hash.HashFunction;
import com.google.common.hash.Hashing;
import crawlercommons.robots.SimpleRobotRules;
import lombok.SneakyThrows;
import nu.marginalia.atags.model.DomainLinks;
import nu.marginalia.contenttype.ContentType;
import nu.marginalia.crawl.retreival.fetcher.ContentTags;
import nu.marginalia.crawl.retreival.fetcher.HttpFetcher;
import nu.marginalia.crawl.retreival.fetcher.SitemapRetriever;
import nu.marginalia.crawling.body.HttpFetchResult;
import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
import nu.marginalia.crawl.retreival.revisit.CrawlerRevisitor;
import nu.marginalia.crawl.retreival.revisit.DocumentWithReference;
import nu.marginalia.crawl.retreival.sitemap.SitemapFetcher;
import nu.marginalia.link_parser.LinkParser;
import nu.marginalia.crawling.model.*;
import nu.marginalia.ip_blocklist.UrlBlocklist;
@ -19,54 +23,49 @@ import org.jsoup.nodes.Document;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import javax.annotation.Nullable;
import java.io.IOException;
import java.net.InetAddress;
import java.net.UnknownHostException;
import java.time.LocalDateTime;
import java.nio.file.Path;
import java.util.*;
import java.util.function.Consumer;
public class CrawlerRetreiver {
public class CrawlerRetreiver implements AutoCloseable {
private static final int MAX_ERRORS = 20;
private static final int HTTP_429_RETRY_LIMIT = 1; // Retry 429s once
private final HttpFetcher fetcher;
private final String domain;
private final Consumer<SerializableCrawlData> crawledDomainWriter;
private static final LinkParser linkParser = new LinkParser();
private static final Logger logger = LoggerFactory.getLogger(CrawlerRetreiver.class);
private static final HashFunction hashMethod = Hashing.murmur3_128(0);
private static final UrlBlocklist urlBlocklist = new UrlBlocklist();
private static final LinkFilterSelector linkFilterSelector = new LinkFilterSelector();
private final DomainProber domainProber;
private final SitemapRetriever sitemapRetriever;
private final DomainCrawlFrontier crawlFrontier;
private final WarcRecorder warcRecorder;
private final CrawlerRevisitor crawlerRevisitor;
private final SitemapFetcher sitemapFetcher;
int errorCount = 0;
/** recrawlState tag for documents that had a HTTP status 304 */
private static final String documentWasRetainedTag = "RETAINED/304";
/** recrawlState tag for documents that had a 200 status but were identical to a previous version */
private static final String documentWasSameTag = "SAME-BY-COMPARISON";
public CrawlerRetreiver(HttpFetcher fetcher,
DomainProber domainProber,
CrawlSpecRecord specs,
Consumer<SerializableCrawlData> writer) {
WarcRecorder warcRecorder)
{
this.warcRecorder = warcRecorder;
this.fetcher = fetcher;
this.domainProber = domainProber;
domain = specs.domain;
crawledDomainWriter = writer;
this.crawlFrontier = new DomainCrawlFrontier(new EdgeDomain(domain), Objects.requireNonNullElse(specs.urls, List.of()), specs.crawlDepth);
sitemapRetriever = fetcher.createSitemapRetriever();
crawlFrontier = new DomainCrawlFrontier(new EdgeDomain(domain), Objects.requireNonNullElse(specs.urls, List.of()), specs.crawlDepth);
crawlerRevisitor = new CrawlerRevisitor(crawlFrontier, this, warcRecorder);
sitemapFetcher = new SitemapFetcher(crawlFrontier, fetcher.createSitemapRetriever());
// We must always crawl the index page first, this is assumed when fingerprinting the server
var fst = crawlFrontier.peek();
@ -90,43 +89,42 @@ public class CrawlerRetreiver {
public int fetch(DomainLinks domainLinks, CrawlDataReference oldCrawlData) {
final DomainProber.ProbeResult probeResult = domainProber.probeDomain(fetcher, domain, crawlFrontier.peek());
return switch (probeResult) {
case DomainProber.ProbeResultOk(EdgeUrl probedUrl) -> crawlDomain(oldCrawlData, probedUrl, domainLinks);
case DomainProber.ProbeResultError(CrawlerDomainStatus status, String desc) -> {
crawledDomainWriter.accept(
CrawledDomain.builder()
.crawlerStatus(status.name())
.crawlerStatusDesc(desc)
.domain(domain)
.ip(findIp(domain))
.build()
);
yield 1;
}
case DomainProber.ProbeResultRedirect(EdgeDomain redirectDomain) -> {
crawledDomainWriter.accept(
CrawledDomain.builder()
.crawlerStatus(CrawlerDomainStatus.REDIRECT.name())
.crawlerStatusDesc("Redirected to different domain")
.redirectDomain(redirectDomain.toString())
.domain(domain)
.ip(findIp(domain))
.build()
);
yield 1;
}
};
try {
return crawlDomain(oldCrawlData, probeResult, domainLinks);
}
catch (Exception ex) {
logger.error("Error crawling domain {}", domain, ex);
return 0;
}
}
private int crawlDomain(CrawlDataReference oldCrawlData, EdgeUrl rootUrl, DomainLinks domainLinks) {
public void syncAbortedRun(Path warcFile) {
var resync = new CrawlerWarcResynchronizer(crawlFrontier, warcRecorder);
resync.run(warcFile);
}
private int crawlDomain(CrawlDataReference oldCrawlData, DomainProber.ProbeResult probeResult, DomainLinks domainLinks) throws IOException, InterruptedException {
String ip = findIp(domain);
EdgeUrl rootUrl;
warcRecorder.writeWarcinfoHeader(ip, new EdgeDomain(domain), probeResult);
if (!(probeResult instanceof DomainProber.ProbeResultOk ok)) {
return 1;
}
else {
rootUrl = ok.probedUrl();
}
assert !crawlFrontier.isEmpty();
final SimpleRobotRules robotsRules = fetcher.fetchRobotRules(crawlFrontier.peek().domain);
final SimpleRobotRules robotsRules = fetcher.fetchRobotRules(crawlFrontier.peek().domain, warcRecorder);
final CrawlDelayTimer delayTimer = new CrawlDelayTimer(robotsRules.getCrawlDelay());
sniffRootDocument(delayTimer, rootUrl);
sniffRootDocument(rootUrl);
// Play back the old crawl data (if present) and fetch the documents comparing etags and last-modified
int recrawled = recrawl(oldCrawlData, robotsRules, delayTimer);
@ -140,9 +138,15 @@ public class CrawlerRetreiver {
crawlFrontier.addAllToQueue(domainLinks.getUrls(rootUrl.proto));
// Add links from the sitemap to the crawl frontier
downloadSitemaps(robotsRules, rootUrl);
sitemapFetcher.downloadSitemaps(robotsRules, rootUrl);
CrawledDomain ret = new CrawledDomain(domain, null, CrawlerDomainStatus.OK.name(), null, ip, new ArrayList<>(), null);
CrawledDomain ret = new CrawledDomain(domain,
null,
CrawlerDomainStatus.OK.name(),
null,
ip,
new ArrayList<>(),
null);
int fetchedCount = recrawled;
@ -154,7 +158,7 @@ public class CrawlerRetreiver {
var top = crawlFrontier.takeNextUrl();
if (!robotsRules.isAllowed(top.toString())) {
crawledDomainWriter.accept(createRobotsError(top));
warcRecorder.flagAsRobotsTxtError(top);
continue;
}
@ -177,149 +181,43 @@ public class CrawlerRetreiver {
continue;
if (fetchWriteAndSleep(top, delayTimer, DocumentWithReference.empty()).isPresent()) {
fetchedCount++;
try {
if (fetchWriteAndSleep(top, delayTimer, DocumentWithReference.empty()).isOk()) {
fetchedCount++;
}
}
catch (InterruptedException ex) {
Thread.currentThread().interrupt();
break;
}
}
ret.cookies = fetcher.getCookies();
crawledDomainWriter.accept(ret);
return fetchedCount;
}
/** Performs a re-crawl of old documents, comparing etags and last-modified */
private int recrawl(CrawlDataReference oldCrawlData,
SimpleRobotRules robotsRules,
CrawlDelayTimer delayTimer) {
int recrawled = 0;
int retained = 0;
for (;;) {
CrawledDocument doc = oldCrawlData.nextDocument();
if (doc == null) {
break;
}
// This Shouldn't Happen (TM)
var urlMaybe = EdgeUrl.parse(doc.url);
if (urlMaybe.isEmpty()) continue;
var url = urlMaybe.get();
// If we've previously 404:d on this URL, we'll refrain from trying to fetch it again
if (doc.httpStatus == 404) {
crawlFrontier.addVisited(url);
continue;
}
if (doc.httpStatus != 200) continue;
if (!robotsRules.isAllowed(url.toString())) {
crawledDomainWriter.accept(createRobotsError(url));
continue;
}
if (!crawlFrontier.filterLink(url))
continue;
if (!crawlFrontier.addVisited(url))
continue;
if (recrawled > 5
&& retained > 0.9 * recrawled
&& Math.random() < 0.9)
{
// Since it looks like most of these documents haven't changed,
// we'll load the documents directly; but we do this in a random
// fashion to make sure we eventually catch changes over time
crawledDomainWriter.accept(doc);
crawlFrontier.addVisited(url);
continue;
}
// GET the document with the stored document as a reference
// providing etag and last-modified headers, so we can recycle the
// document if it hasn't changed without actually downloading it
var fetchedDocOpt = fetchWriteAndSleep(url,
delayTimer,
new DocumentWithReference(doc, oldCrawlData));
if (fetchedDocOpt.isEmpty()) continue;
if (documentWasRetainedTag.equals(fetchedDocOpt.get().recrawlState)) retained ++;
else if (documentWasSameTag.equals(fetchedDocOpt.get().recrawlState)) retained ++;
recrawled ++;
}
return recrawled;
/** Using the old crawl data, fetch the documents comparing etags and last-modified */
private int recrawl(CrawlDataReference oldCrawlData, SimpleRobotRules robotsRules, CrawlDelayTimer delayTimer) throws InterruptedException {
return crawlerRevisitor.recrawl(oldCrawlData, robotsRules, delayTimer);
}
private void downloadSitemaps(SimpleRobotRules robotsRules, EdgeUrl rootUrl) {
List<String> sitemaps = robotsRules.getSitemaps();
List<EdgeUrl> urls = new ArrayList<>(sitemaps.size());
if (!sitemaps.isEmpty()) {
for (var url : sitemaps) {
EdgeUrl.parse(url).ifPresent(urls::add);
}
}
else {
urls.add(rootUrl.withPathAndParam("/sitemap.xml", null));
}
downloadSitemaps(urls);
}
private void downloadSitemaps(List<EdgeUrl> urls) {
Set<String> checkedSitemaps = new HashSet<>();
for (var url : urls) {
// Let's not download sitemaps from other domains for now
if (!crawlFrontier.isSameDomain(url)) {
continue;
}
if (checkedSitemaps.contains(url.path))
continue;
var sitemap = sitemapRetriever.fetchSitemap(url);
if (sitemap.isEmpty()) {
continue;
}
// ensure we don't try to download this sitemap again
// (don't move this up, as we may want to check the same
// path with different protocols until we find one that works)
checkedSitemaps.add(url.path);
crawlFrontier.addAllToQueue(sitemap);
}
logger.debug("Queue is now {}", crawlFrontier.queueSize());
}
private void sniffRootDocument(CrawlDelayTimer delayTimer, EdgeUrl rootUrl) {
private void sniffRootDocument(EdgeUrl rootUrl) {
try {
logger.debug("Configuring link filter");
var url = rootUrl.withPathAndParam("/", null);
var maybeSample = fetchUrl(url, delayTimer, DocumentWithReference.empty()).filter(sample -> sample.httpStatus == 200);
if (maybeSample.isEmpty())
var result = fetcher.fetchContent(url, warcRecorder, ContentTags.empty());
if (!(result instanceof HttpFetchResult.ResultOk ok))
return;
var sample = maybeSample.get();
if (sample.documentBody == null)
var optDoc = ok.parseDocument();
if (optDoc.isEmpty())
return;
// Sniff the software based on the sample document
var doc = Jsoup.parse(sample.documentBody);
var doc = optDoc.get();
crawlFrontier.setLinkFilter(linkFilterSelector.selectFilter(doc));
for (var link : doc.getElementsByTag("link")) {
@ -338,7 +236,7 @@ public class CrawlerRetreiver {
linkParser.parseLink(url, href)
.filter(crawlFrontier::isSameDomain)
.map(List::of)
.ifPresent(this::downloadSitemaps);
.ifPresent(sitemapFetcher::downloadSitemaps);
}
}
catch (Exception ex) {
@ -346,41 +244,67 @@ public class CrawlerRetreiver {
}
}
private Optional<CrawledDocument> fetchWriteAndSleep(EdgeUrl top,
CrawlDelayTimer timer,
DocumentWithReference reference) {
public HttpFetchResult fetchWriteAndSleep(EdgeUrl top,
CrawlDelayTimer timer,
DocumentWithReference reference) throws InterruptedException
{
logger.debug("Fetching {}", top);
HttpFetchResult fetchedDoc = new HttpFetchResult.ResultNone();
long startTime = System.currentTimeMillis();
var contentTags = reference.getContentTags();
var docOpt = fetchUrl(top, timer, reference);
if (docOpt.isPresent()) {
var doc = docOpt.get();
if (!Objects.equals(doc.recrawlState, documentWasRetainedTag)
&& reference.isContentBodySame(doc))
{
// The document didn't change since the last time
doc.recrawlState = documentWasSameTag;
// Fetch the document, retrying if we get a rate limit exception
for (int i = 0; i <= HTTP_429_RETRY_LIMIT; i++) {
try {
fetchedDoc = fetcher.fetchContent(top, warcRecorder, contentTags);
break;
}
crawledDomainWriter.accept(doc);
if (doc.url != null) {
// We may have redirected to a different path
EdgeUrl.parse(doc.url).ifPresent(crawlFrontier::addVisited);
catch (RateLimitException ex) {
timer.waitRetryDelay(ex);
}
if ("ERROR".equals(doc.crawlerStatus) && doc.httpStatus != 404) {
errorCount++;
catch (Exception ex) {
logger.warn("Failed to fetch {}", top, ex);
fetchedDoc = new HttpFetchResult.ResultException(ex);
}
}
timer.delay(System.currentTimeMillis() - startTime);
try {
if (fetchedDoc instanceof HttpFetchResult.ResultOk ok) {
var docOpt = ok.parseDocument();
if (docOpt.isPresent()) {
var doc = docOpt.get();
return docOpt;
crawlFrontier.enqueueLinksFromDocument(top, doc);
crawlFrontier.addVisited(new EdgeUrl(ok.uri()));
}
}
else if (fetchedDoc instanceof HttpFetchResult.Result304Raw && reference.doc() != null) {
var doc = reference.doc();
warcRecorder.writeReferenceCopy(top, doc.contentType, doc.httpStatus, doc.documentBody);
fetchedDoc = new HttpFetchResult.Result304ReplacedWithReference(doc.url,
new ContentType(doc.contentType, "UTF-8"),
doc.documentBody);
var parsed = Jsoup.parse(doc.documentBody);
crawlFrontier.enqueueLinksFromDocument(top, parsed);
crawlFrontier.addVisited(top);
}
else if (fetchedDoc instanceof HttpFetchResult.ResultException ex) {
errorCount ++;
}
}
catch (Exception ex) {
logger.error("Error parsing document {}", top, ex);
}
timer.waitFetchDelay(System.currentTimeMillis() - startTime);
return fetchedDoc;
}
private boolean isAllowedProtocol(String proto) {
@ -388,91 +312,6 @@ public class CrawlerRetreiver {
|| proto.equalsIgnoreCase("https");
}
private Optional<CrawledDocument> fetchUrl(EdgeUrl top, CrawlDelayTimer timer, DocumentWithReference reference) {
try {
var contentTags = reference.getContentTags();
var fetchedDoc = tryDownload(top, timer, contentTags);
CrawledDocument doc = reference.replaceOn304(fetchedDoc);
if (doc.documentBody != null) {
doc.documentBodyHash = createHash(doc.documentBody);
var parsedDoc = Jsoup.parse(doc.documentBody);
EdgeUrl url = new EdgeUrl(doc.url);
findLinks(url, parsedDoc);
findCanonicalUrl(url, parsedDoc)
.ifPresent(canonicalLink -> doc.canonicalUrl = canonicalLink.toString());
}
return Optional.of(doc);
}
catch (Exception ex) {
logger.warn("Failed to process document {}", top);
}
return Optional.empty();
}
@SneakyThrows
private CrawledDocument tryDownload(EdgeUrl top, CrawlDelayTimer timer, ContentTags tags) {
for (int i = 0; i < 2; i++) {
try {
var doc = fetcher.fetchContent(top, tags);
doc.recrawlState = "NEW";
return doc;
}
catch (RateLimitException ex) {
timer.slowDown();
int delay = ex.retryAfter();
if (delay > 0 && delay < 5000) {
Thread.sleep(delay);
}
}
}
return createRetryError(top);
}
private String createHash(String documentBodyHash) {
return hashMethod.hashUnencodedChars(documentBodyHash).toString();
}
private void findLinks(EdgeUrl baseUrl, Document parsed) {
baseUrl = linkParser.getBaseLink(parsed, baseUrl);
for (var link : parsed.getElementsByTag("a")) {
linkParser.parseLink(baseUrl, link).ifPresent(crawlFrontier::addToQueue);
}
for (var link : parsed.getElementsByTag("frame")) {
linkParser.parseFrame(baseUrl, link).ifPresent(crawlFrontier::addToQueue);
}
for (var link : parsed.getElementsByTag("iframe")) {
linkParser.parseFrame(baseUrl, link).ifPresent(crawlFrontier::addToQueue);
}
for (var link : parsed.getElementsByTag("link")) {
String rel = link.attr("rel");
if (rel.equalsIgnoreCase("next") || rel.equalsIgnoreCase("prev")) {
linkParser.parseLink(baseUrl, link).ifPresent(crawlFrontier::addToQueue);
}
}
}
private Optional<EdgeUrl> findCanonicalUrl(EdgeUrl baseUrl, Document parsed) {
baseUrl = baseUrl.domain.toRootUrl();
for (var link : parsed.select("link[rel=canonical]")) {
return linkParser.parseLink(baseUrl, link);
}
return Optional.empty();
}
private String findIp(String domain) {
try {
return InetAddress.getByName(domain).getHostAddress();
@ -481,92 +320,9 @@ public class CrawlerRetreiver {
}
}
private CrawledDocument createRobotsError(EdgeUrl url) {
return CrawledDocument.builder()
.url(url.toString())
.timestamp(LocalDateTime.now().toString())
.httpStatus(-1)
.crawlerStatus(CrawlerDocumentStatus.ROBOTS_TXT.name())
.build();
}
private CrawledDocument createRetryError(EdgeUrl url) {
return CrawledDocument.builder()
.url(url.toString())
.timestamp(LocalDateTime.now().toString())
.httpStatus(429)
.crawlerStatus(CrawlerDocumentStatus.ERROR.name())
.build();
}
private record DocumentWithReference(
@Nullable CrawledDocument doc,
@Nullable CrawlDataReference reference) {
private static final DocumentWithReference emptyInstance = new DocumentWithReference(null, null);
public static DocumentWithReference empty() {
return emptyInstance;
}
public boolean isContentBodySame(CrawledDocument newDoc) {
if (reference == null)
return false;
if (doc == null)
return false;
if (doc.documentBody == null)
return false;
if (newDoc.documentBody == null)
return false;
return reference.isContentBodySame(doc, newDoc);
}
private ContentTags getContentTags() {
if (null == doc)
return ContentTags.empty();
String headers = doc.headers;
if (headers == null)
return ContentTags.empty();
String[] headersLines = headers.split("\n");
String lastmod = null;
String etag = null;
for (String line : headersLines) {
if (line.toLowerCase().startsWith("etag:")) {
etag = line.substring(5).trim();
}
if (line.toLowerCase().startsWith("last-modified:")) {
lastmod = line.substring(14).trim();
}
}
return new ContentTags(etag, lastmod);
}
public boolean isEmpty() {
return doc == null || reference == null;
}
/** If the provided document has HTTP status 304, and the reference document is provided,
* return the reference document; otherwise return the provided document.
*/
public CrawledDocument replaceOn304(CrawledDocument fetchedDoc) {
if (doc == null)
return fetchedDoc;
// HTTP status 304 is NOT MODIFIED, which means the document is the same as it was when
// we fetched it last time. We can recycle the reference document.
if (fetchedDoc.httpStatus != 304)
return fetchedDoc;
var ret = doc;
ret.recrawlState = documentWasRetainedTag;
ret.timestamp = LocalDateTime.now().toString();
return ret;
}
@Override
public void close() throws Exception {
warcRecorder.close();
}
}

View File

@ -0,0 +1,107 @@
package nu.marginalia.crawl.retreival;
import nu.marginalia.crawling.body.DocumentBodyExtractor;
import nu.marginalia.crawling.body.DocumentBodyResult;
import nu.marginalia.crawling.body.HttpFetchResult;
import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
import nu.marginalia.model.EdgeUrl;
import org.jsoup.Jsoup;
import org.netpreserve.jwarc.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.nio.file.Path;
/**
* This class is responsible for resynchronizing the crawl frontier with a partially written
* warc file. This may happen if the crawl is interrupted or crashes.
* <p>
* This is best-effort and not guaranteed to recover all data, but it should limit
* the amount of data that is lost and needs to be re-crawled in the event of an unexpected
* shutdown.
*/
public class CrawlerWarcResynchronizer {
private final DomainCrawlFrontier crawlFrontier;
private final WarcRecorder recorder;
private static final Logger logger = LoggerFactory.getLogger(CrawlerWarcResynchronizer.class);
public CrawlerWarcResynchronizer(DomainCrawlFrontier crawlFrontier, WarcRecorder recorder) {
this.crawlFrontier = crawlFrontier;
this.recorder = recorder;
}
public void run(Path tempFile) {
// First pass, enqueue links
try (var reader = new WarcReader(tempFile)) {
WarcXResponseReference.register(reader);
WarcXEntityRefused.register(reader);
for (var item : reader) {
accept(item);
}
} catch (IOException e) {
logger.info(STR."Failed to read the full warc file \{tempFile}", e);
}
// Second pass, copy records to the new warc file
try (var reader = new WarcReader(tempFile)) {
for (var item : reader) {
recorder.resync(item);
}
} catch (IOException e) {
logger.info(STR."Failed to read the full warc file \{tempFile}", e);
}
}
public void accept(WarcRecord item) {
try {
if (item instanceof WarcResponse rsp) {
response(rsp);
} else if (item instanceof WarcRequest req) {
request(req);
} else if (item instanceof WarcXEntityRefused refused) {
refused(refused);
}
}
catch (Exception ex) {
logger.info(STR."Failed to process warc record \{item}", ex);
}
}
private void refused(WarcXEntityRefused refused) {
// In general we don't want to re-crawl URLs that were refused,
// but since circumstances may change over time, we allow
// a small chance of re-probing these entries
if (Math.random() > 0.1) {
crawlFrontier.addVisited(new EdgeUrl(refused.targetURI()));
}
}
private void request(WarcRequest request) {
EdgeUrl.parse(request.target()).ifPresent(crawlFrontier::addVisited);
}
private void response(WarcResponse rsp) {
var url = new EdgeUrl(rsp.targetURI());
crawlFrontier.addVisited(url);
try {
var response = HttpFetchResult.importWarc(rsp);
DocumentBodyExtractor
.asString(response)
.ifPresent((ct, body) ->
{
var doc = Jsoup.parse(body);
crawlFrontier.enqueueLinksFromDocument(url, doc);
});
}
catch (Exception e) {
logger.info(STR."Failed to parse response body for \{url}", e);
}
}
}
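
For context, a minimal sketch of how the resynchronizer might be wired in before a crawl resumes (the file names and frontier parameters are assumptions; exception handling elided):

Path partialWarc = Path.of("example.warc.gz");
var frontier = new DomainCrawlFrontier(new EdgeDomain("www.example.com"), List.of(), 1000);

try (var recorder = new WarcRecorder(Path.of("example-resumed.warc.gz"))) {
    // Replays the partial WARC into the frontier's visited set and link queue,
    // then copies its records into the new WARC so nothing already fetched is lost.
    new CrawlerWarcResynchronizer(frontier, recorder).run(partialWarc);
    // ... the crawl then continues against the same frontier and recorder ...
}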

View File

@ -3,14 +3,19 @@ package nu.marginalia.crawl.retreival;
import com.google.common.hash.HashFunction;
import it.unimi.dsi.fastutil.longs.LongOpenHashSet;
import nu.marginalia.ip_blocklist.UrlBlocklist;
import nu.marginalia.link_parser.LinkParser;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
import org.jsoup.nodes.Document;
import java.net.URISyntaxException;
import java.util.*;
import java.util.function.Predicate;
public class DomainCrawlFrontier {
private static final LinkParser linkParser = new LinkParser();
private final ArrayDeque<String> queue;
// To save the number of strings kept in memory,
@ -45,9 +50,14 @@ public class DomainCrawlFrontier {
}
}
/** Increase the depth of the crawl by a factor. If the current depth is smaller
* than the number of already visited documents, the base depth will be adjusted
* to the visited count first.
*/
public void increaseDepth(double depthIncreaseFactor) {
depth = (int)(depth * depthIncreaseFactor);
depth = (int)(Math.max(visited.size(), depth) * depthIncreaseFactor);
}
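// Illustrative numbers (not from the original change): with depth = 100, 250 documents
// already visited, and depthIncreaseFactor = 1.5, the new depth becomes
// (int)(Math.max(250, 100) * 1.5) = 375 -- the extension is computed from the actual
// visited count rather than the stale base depth.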
public void setLinkFilter(Predicate<EdgeUrl> linkFilter) {
this.linkFilter = linkFilter;
}
@ -141,4 +151,27 @@ public class DomainCrawlFrontier {
public int queueSize() {
return queue.size();
}
public void enqueueLinksFromDocument(EdgeUrl baseUrl, Document parsed) {
baseUrl = linkParser.getBaseLink(parsed, baseUrl);
for (var link : parsed.getElementsByTag("a")) {
linkParser.parseLink(baseUrl, link).ifPresent(this::addToQueue);
}
for (var link : parsed.getElementsByTag("frame")) {
linkParser.parseFrame(baseUrl, link).ifPresent(this::addToQueue);
}
for (var link : parsed.getElementsByTag("iframe")) {
linkParser.parseFrame(baseUrl, link).ifPresent(this::addToQueue);
}
for (var link : parsed.getElementsByTag("link")) {
String rel = link.attr("rel");
if (rel.equalsIgnoreCase("next") || rel.equalsIgnoreCase("prev")) {
linkParser.parseLink(baseUrl, link).ifPresent(this::addToQueue);
}
}
}
}
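
As a usage sketch (mirroring the call site in CrawlerWarcResynchronizer above; the domain, HTML and depth limit are made up, exception handling elided):

var frontier = new DomainCrawlFrontier(new EdgeDomain("www.example.com"), List.of(), 1000);
var page = Jsoup.parse("<a href=\"/about\">About</a>", "https://www.example.com/");
frontier.enqueueLinksFromDocument(new EdgeUrl("https://www.example.com/"), page);
// "/about" is now queued, subject to the frontier's link filter and visited set.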

View File

@ -0,0 +1,86 @@
package nu.marginalia.crawl.retreival.fetcher;
import nu.marginalia.crawling.body.ContentTypeLogic;
import nu.marginalia.model.EdgeUrl;
import okhttp3.OkHttpClient;
import okhttp3.Request;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.net.SocketTimeoutException;
import java.util.Objects;
public class ContentTypeProber {
private static final Logger logger = LoggerFactory.getLogger(ContentTypeProber.class);
private final String userAgent;
private final OkHttpClient client;
private final ContentTypeLogic contentTypeLogic = new ContentTypeLogic();
public ContentTypeProber(String userAgent, OkHttpClient httpClient) {
this.userAgent = userAgent;
this.client = httpClient;
}
/** Probe the content type of the given URL with a HEAD request.
* This is used to detect binary files, which we don't want to crawl.
* <p>
* If the URL redirects, the final URL is returned, to avoid redundant
* requests.
*
* @param url The URL to probe
* @return A ContentTypeProbeResult
*/
public ContentTypeProbeResult probeContentType(EdgeUrl url) {
logger.debug("Probing suspected binary {}", url);
var headBuilder = new Request.Builder().head()
.addHeader("User-agent", userAgent)
.addHeader("Accept-Encoding", "gzip")
.url(url.toString());
var head = headBuilder.build();
var call = client.newCall(head);
try (var rsp = call.execute()) {
var contentTypeHeader = rsp.header("Content-type");
if (contentTypeHeader != null && !contentTypeLogic.isAllowableContentType(contentTypeHeader)) {
return new ContentTypeProbeResult.BadContentType(contentTypeHeader, rsp.code());
}
// Update the URL to the final URL of the HEAD request, otherwise we might end up doing
// HEAD 301 url1 -> url2
// HEAD 200 url2
// GET 301 url1 -> url2
// GET 200 url2
// which is not what we want. We want to make as few requests as possible so as not to raise
// too many eyebrows when someone looks at the logs on the target server; it's desirable
// that the traffic looks like it makes sense, rather than like a broken bot.
var redirectUrl = new EdgeUrl(rsp.request().url().toString());
EdgeUrl ret;
if (Objects.equals(redirectUrl.domain, url.domain)) ret = redirectUrl;
else ret = url;
return new ContentTypeProbeResult.Ok(ret);
} catch (SocketTimeoutException ex) {
return new ContentTypeProbeResult.Timeout();
} catch (Exception ex) {
logger.error("Error during fetching {}[{}]", ex.getClass().getSimpleName(), ex.getMessage());
return new ContentTypeProbeResult.Exception(ex);
}
}
public sealed interface ContentTypeProbeResult {
record Ok(EdgeUrl resolvedUrl) implements ContentTypeProbeResult { }
record BadContentType(String contentType, int statusCode) implements ContentTypeProbeResult { }
record Timeout() implements ContentTypeProbeResult { }
record Exception(java.lang.Exception ex) implements ContentTypeProbeResult { }
}
}
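
For illustration, a minimal sketch of consuming the sealed probe result, mirroring how HttpFetcherImpl uses it further down (the client setup and URL are assumptions; exception handling elided):

OkHttpClient client = new OkHttpClient.Builder().build();
ContentTypeProber prober = new ContentTypeProber("search.marginalia.nu", client);

var result = prober.probeContentType(new EdgeUrl("https://www.example.com/download.bin"));

if (result instanceof ContentTypeProber.ContentTypeProbeResult.Ok ok) {
    // Safe to GET; ok.resolvedUrl() already reflects any redirect seen by the HEAD request
    System.out.println("Fetching " + ok.resolvedUrl());
} else if (result instanceof ContentTypeProber.ContentTypeProbeResult.BadContentType bad) {
    System.out.println("Skipping " + bad.contentType() + " (status " + bad.statusCode() + ")");
} else if (result instanceof ContentTypeProber.ContentTypeProbeResult.Timeout timeout) {
    System.out.println("HEAD probe timed out");
} else if (result instanceof ContentTypeProber.ContentTypeProbeResult.Exception error) {
    System.out.println("HEAD probe failed: " + error.ex());
}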

View File

@ -3,7 +3,8 @@ package nu.marginalia.crawl.retreival.fetcher;
import com.google.inject.ImplementedBy;
import crawlercommons.robots.SimpleRobotRules;
import nu.marginalia.crawl.retreival.RateLimitException;
import nu.marginalia.crawling.model.CrawledDocument;
import nu.marginalia.crawling.body.HttpFetchResult;
import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
@ -18,9 +19,9 @@ public interface HttpFetcher {
FetchResult probeDomain(EdgeUrl url);
CrawledDocument fetchContent(EdgeUrl url, ContentTags tags) throws RateLimitException;
HttpFetchResult fetchContent(EdgeUrl url, WarcRecorder recorder, ContentTags tags) throws RateLimitException;
SimpleRobotRules fetchRobotRules(EdgeDomain domain);
SimpleRobotRules fetchRobotRules(EdgeDomain domain, WarcRecorder recorder);
SitemapRetriever createSitemapRetriever();
}

View File

@ -7,43 +7,41 @@ import crawlercommons.robots.SimpleRobotRulesParser;
import lombok.SneakyThrows;
import nu.marginalia.crawl.retreival.Cookies;
import nu.marginalia.crawl.retreival.RateLimitException;
import nu.marginalia.crawling.model.CrawledDocument;
import nu.marginalia.crawling.model.CrawlerDocumentStatus;
import nu.marginalia.crawling.model.ContentType;
import nu.marginalia.crawl.retreival.fetcher.ContentTypeProber.ContentTypeProbeResult;
import nu.marginalia.crawl.retreival.fetcher.socket.FastTerminatingSocketFactory;
import nu.marginalia.crawl.retreival.fetcher.socket.IpInterceptingNetworkInterceptor;
import nu.marginalia.crawl.retreival.fetcher.socket.NoSecuritySSL;
import nu.marginalia.crawling.body.DocumentBodyExtractor;
import nu.marginalia.crawling.body.HttpFetchResult;
import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
import nu.marginalia.crawling.body.ContentTypeLogic;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.crawl.retreival.logic.ContentTypeLogic;
import nu.marginalia.crawl.retreival.logic.ContentTypeParser;
import okhttp3.*;
import org.apache.commons.io.input.BOMInputStream;
import okhttp3.ConnectionPool;
import okhttp3.Dispatcher;
import okhttp3.OkHttpClient;
import okhttp3.Request;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import javax.net.ssl.SSLException;
import javax.net.ssl.X509TrustManager;
import java.io.EOFException;
import java.io.IOException;
import java.net.*;
import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException;
import java.nio.charset.StandardCharsets;
import java.nio.charset.UnsupportedCharsetException;
import java.time.LocalDateTime;
import java.util.*;
import java.util.List;
import java.util.Objects;
import java.util.Optional;
import java.util.concurrent.TimeUnit;
import java.util.zip.GZIPInputStream;
public class HttpFetcherImpl implements HttpFetcher {
private final Logger logger = LoggerFactory.getLogger(getClass());
private final String userAgent;
private final int maxFetchSize = 1024*512;
private final Cookies cookies = new Cookies();
private static final SimpleRobotRulesParser robotsParser = new SimpleRobotRulesParser();
private final ContentTypeLogic contentTypeLogic = new ContentTypeLogic();
private static final ContentTypeLogic contentTypeLogic = new ContentTypeLogic();
private final ContentTypeProber contentTypeProber;
@Override
public void setAllowAllContentTypes(boolean allowAllContentTypes) {
@ -64,6 +62,7 @@ public class HttpFetcherImpl implements HttpFetcher {
return builder.sslSocketFactory(NoSecuritySSL.buildSocketFactory(), (X509TrustManager) NoSecuritySSL.trustAllCerts[0])
.socketFactory(ftSocketFactory)
.hostnameVerifier(NoSecuritySSL.buildHostnameVerifyer())
.addNetworkInterceptor(new IpInterceptingNetworkInterceptor())
.connectionPool(pool)
.cookieJar(cookies.getJar())
.followRedirects(true)
@ -92,13 +91,22 @@ public class HttpFetcherImpl implements HttpFetcher {
{
this.client = createClient(dispatcher, connectionPool);
this.userAgent = userAgent;
this.contentTypeProber = new ContentTypeProber(userAgent, client);
}
public HttpFetcherImpl(@Named("user-agent") String userAgent) {
this.client = createClient(null, new ConnectionPool());
this.userAgent = userAgent;
this.contentTypeProber = new ContentTypeProber(userAgent, client);
}
/**
* Probe the domain to see if it is reachable, attempting to identify which schema to use,
* and if there are any redirects. This is done by one or more HEAD requests.
*
* @param url The URL to probe.
* @return The result of the probe, indicating the state and the URL.
*/
@Override
@SneakyThrows
public FetchResult probeDomain(EdgeUrl url) {
@ -130,8 +138,9 @@ public class HttpFetcherImpl implements HttpFetcher {
@Override
@SneakyThrows
public CrawledDocument fetchContent(EdgeUrl url,
ContentTags contentTags)
public HttpFetchResult fetchContent(EdgeUrl url,
WarcRecorder warcRecorder,
ContentTags contentTags)
throws RateLimitException
{
@ -139,268 +148,54 @@ public class HttpFetcherImpl implements HttpFetcher {
// looks like it might be something else, we perform a HEAD first to check the content type
if (contentTags.isEmpty() && contentTypeLogic.isUrlLikeBinary(url))
{
logger.debug("Probing suspected binary {}", url);
var headBuilder = new Request.Builder().head()
.addHeader("User-agent", userAgent)
.url(url.toString())
.addHeader("Accept-Encoding", "gzip");
var head = headBuilder.build();
var call = client.newCall(head);
try (var rsp = call.execute()) {
var contentTypeHeader = rsp.header("Content-type");
if (contentTypeHeader != null && !contentTypeLogic.isAllowableContentType(contentTypeHeader)) {
return createErrorResponse(url, rsp, CrawlerDocumentStatus.BAD_CONTENT_TYPE, "Early probe failed");
}
// Update the URL to the final URL of the HEAD request, otherwise we might end up doing
// HEAD 301 url1 -> url2
// HEAD 200 url2
// GET 301 url1 -> url2
// GET 200 url2
// which is not what we want. Overall we want to do as few requests as possible to not raise
// too many eyebrows when looking at the logs on the target server. Overall it's probably desirable
// that it looks like the traffic makes sense, as opposed to looking like a broken bot.
var redirectUrl = new EdgeUrl(rsp.request().url().toString());
if (Objects.equals(redirectUrl.domain, url.domain))
url = redirectUrl;
ContentTypeProbeResult probeResult = contentTypeProber.probeContentType(url);
if (probeResult instanceof ContentTypeProbeResult.Ok ok) {
url = ok.resolvedUrl();
}
catch (SocketTimeoutException ex) {
return createTimeoutErrorRsp(url, ex);
else if (probeResult instanceof ContentTypeProbeResult.BadContentType badContentType) {
warcRecorder.flagAsFailedContentTypeProbe(url, badContentType.contentType(), badContentType.statusCode());
return new HttpFetchResult.ResultNone();
}
catch (Exception ex) {
logger.error("Error during fetching {}[{}]", ex.getClass().getSimpleName(), ex.getMessage());
return createHardErrorRsp(url, ex);
else if (probeResult instanceof ContentTypeProbeResult.Timeout timeout) {
warcRecorder.flagAsTimeout(url);
return new HttpFetchResult.ResultNone();
}
else if (probeResult instanceof ContentTypeProbeResult.Exception exception) {
warcRecorder.flagAsError(url, exception.ex());
return new HttpFetchResult.ResultNone();
}
}
var getBuilder = new Request.Builder().get();
getBuilder.addHeader("User-agent", userAgent)
.url(url.toString())
.addHeader("Accept-Encoding", "gzip");
getBuilder.url(url.toString())
.addHeader("Accept-Encoding", "gzip")
.addHeader("User-agent", userAgent);
contentTags.paint(getBuilder);
var get = getBuilder.build();
var call = client.newCall(get);
HttpFetchResult result = warcRecorder.fetch(client, getBuilder.build());
try (var rsp = call.execute()) {
return extractBody(url, rsp);
}
catch (RateLimitException rle) {
throw rle;
}
catch (SocketTimeoutException ex) {
return createTimeoutErrorRsp(url, ex);
}
catch (UnknownHostException ex) {
return createUnknownHostError(url, ex);
}
catch (SocketException | ProtocolException | IllegalCharsetNameException | SSLException | EOFException ex) {
// This is a bit of a grab-bag of errors that crop up
// IllegalCharsetName is egg on our face,
// but SSLException and EOFException are probably the server's fault
return createHardErrorRsp(url, ex);
}
catch (Exception ex) {
logger.error("Error during fetching", ex);
return createHardErrorRsp(url, ex);
}
}
private CrawledDocument createHardErrorRsp(EdgeUrl url, Exception why) {
return CrawledDocument.builder()
.crawlerStatus(CrawlerDocumentStatus.ERROR.toString())
.crawlerStatusDesc(why.getClass().getSimpleName() + ": " + why.getMessage())
.timestamp(LocalDateTime.now().toString())
.url(url.toString())
.build();
}
private CrawledDocument createUnknownHostError(EdgeUrl url, Exception why) {
return CrawledDocument.builder()
.crawlerStatus(CrawlerDocumentStatus.ERROR.toString())
.crawlerStatusDesc("Unknown Host")
.timestamp(LocalDateTime.now().toString())
.url(url.toString())
.build();
}
private CrawledDocument createTimeoutErrorRsp(EdgeUrl url, Exception why) {
return CrawledDocument.builder()
.crawlerStatus("Timeout")
.crawlerStatusDesc(why.getMessage())
.timestamp(LocalDateTime.now().toString())
.url(url.toString())
.build();
}
private CrawledDocument createErrorResponse(EdgeUrl url, Response rsp, CrawlerDocumentStatus status, String why) {
return CrawledDocument.builder()
.crawlerStatus(status.toString())
.crawlerStatusDesc(why)
.headers(rsp.headers().toString())
.contentType(rsp.header("Content-type"))
.timestamp(LocalDateTime.now().toString())
.httpStatus(rsp.code())
.url(url.toString())
.build();
}
private CrawledDocument extractBody(EdgeUrl url, Response rsp) throws IOException, URISyntaxException, RateLimitException {
var responseUrl = new EdgeUrl(rsp.request().url().toString());
if (!Objects.equals(responseUrl.domain, url.domain)) {
return createRedirectResponse(url, rsp, responseUrl);
}
if (rsp.code() == 429) {
throw new RateLimitException(rsp.header("Retry-After", "1000"));
}
var body = rsp.body();
if (null == body) {
return createErrorResponse(url, rsp, CrawlerDocumentStatus.ERROR, "No body");
}
var byteStream = body.byteStream();
if ("gzip".equals(rsp.header("Content-encoding"))) {
byteStream = new GZIPInputStream(byteStream);
}
byteStream = new BOMInputStream(byteStream);
var contentTypeHeader = rsp.header("Content-type");
if (contentTypeHeader != null && !contentTypeLogic.isAllowableContentType(contentTypeHeader)) {
return createErrorResponse(url, rsp, CrawlerDocumentStatus.BAD_CONTENT_TYPE, "");
}
byte[] data = byteStream.readNBytes(maxFetchSize);
var contentType = ContentTypeParser.parse(contentTypeHeader, data);
if (!contentTypeLogic.isAllowableContentType(contentType.contentType())) {
return createErrorResponse(url, rsp, CrawlerDocumentStatus.BAD_CONTENT_TYPE, "");
}
if ("Shift_JIS".equalsIgnoreCase(contentType.charset())) {
return createErrorResponse(url, rsp, CrawlerDocumentStatus.BAD_CHARSET, "");
}
if (!isXRobotsTagsPermitted(rsp.headers("X-Robots-Tag"), userAgent)) {
return CrawledDocument.builder()
.crawlerStatus(CrawlerDocumentStatus.ROBOTS_TXT.name())
.crawlerStatusDesc("X-Robots-Tag")
.url(responseUrl.toString())
.httpStatus(-1)
.timestamp(LocalDateTime.now().toString())
.headers(rsp.headers().toString())
.build();
}
var strData = getStringData(data, contentType);
var canonical = rsp.header("rel=canonical", "");
return CrawledDocument.builder()
.crawlerStatus(CrawlerDocumentStatus.OK.name())
.headers(rsp.headers().toString())
.contentType(rsp.header("Content-type"))
.timestamp(LocalDateTime.now().toString())
.canonicalUrl(canonical)
.httpStatus(rsp.code())
.url(responseUrl.toString())
.documentBody(strData)
.build();
}
/** Check X-Robots-Tag header tag to see if we are allowed to index this page.
* <p>
* Reference: <a href="https://developers.google.com/search/docs/crawling-indexing/robots-meta-tag">https://developers.google.com/search/docs/crawling-indexing/robots-meta-tag</a>
*
* @param xRobotsHeaderTags List of X-Robots-Tag values
* @param userAgent User agent string
* @return true if we are allowed to index this page
*/
// Visible for tests
public static boolean isXRobotsTagsPermitted(List<String> xRobotsHeaderTags, String userAgent) {
boolean isPermittedGeneral = true;
boolean isPermittedMarginalia = false;
boolean isForbiddenMarginalia = false;
for (String header : xRobotsHeaderTags) {
if (header.indexOf(':') >= 0) {
String[] parts = StringUtils.split(header, ":", 2);
if (parts.length < 2)
continue;
// Is this relevant to us?
if (!Objects.equals(parts[0].trim(), userAgent))
continue;
if (parts[1].contains("noindex"))
isForbiddenMarginalia = true;
else if (parts[1].contains("none"))
isForbiddenMarginalia = true;
else if (parts[1].contains("all"))
isPermittedMarginalia = true;
if (result instanceof HttpFetchResult.ResultOk ok) {
if (ok.statusCode() == 429) {
String retryAfter = Objects.requireNonNullElse(ok.header("Retry-After"), "1000");
throw new RateLimitException(retryAfter);
}
else {
if (header.contains("noindex"))
isPermittedGeneral = false;
if (header.contains("none"))
isPermittedGeneral = false;
if (ok.statusCode() == 304) {
return new HttpFetchResult.Result304Raw();
}
if (ok.statusCode() == 200) {
return ok;
}
}
if (isPermittedMarginalia)
return true;
if (isForbiddenMarginalia)
return false;
return isPermittedGeneral;
}
private String getStringData(byte[] data, ContentType contentType) {
Charset charset;
try {
charset = Charset.forName(contentType.charset());
}
catch (IllegalCharsetNameException ex) {
charset = StandardCharsets.UTF_8;
}
catch (UnsupportedCharsetException ex) {
// This is usually like Macintosh Latin
// (https://en.wikipedia.org/wiki/Macintosh_Latin_encoding)
//
// It's close enough to 8859-1 to serve
charset = StandardCharsets.ISO_8859_1;
}
return new String(data, charset);
}
private CrawledDocument createRedirectResponse(EdgeUrl url, Response rsp, EdgeUrl responseUrl) {
return CrawledDocument.builder()
.crawlerStatus(CrawlerDocumentStatus.REDIRECT.name())
.redirectUrl(responseUrl.toString())
.headers(rsp.headers().toString())
.contentType(rsp.header("Content-type"))
.timestamp(LocalDateTime.now().toString())
.httpStatus(rsp.code())
.url(url.toString())
.build();
return new HttpFetchResult.ResultNone();
}
@Override
public SimpleRobotRules fetchRobotRules(EdgeDomain domain) {
return fetchRobotsForProto("https", domain)
.or(() -> fetchRobotsForProto("http", domain))
public SimpleRobotRules fetchRobotRules(EdgeDomain domain, WarcRecorder recorder) {
return fetchRobotsForProto("https", recorder, domain)
.or(() -> fetchRobotsForProto("http", recorder, domain))
.orElseGet(() -> new SimpleRobotRules(SimpleRobotRules.RobotRulesMode.ALLOW_ALL));
}
@ -409,21 +204,31 @@ public class HttpFetcherImpl implements HttpFetcher {
return new SitemapRetriever();
}
private Optional<SimpleRobotRules> fetchRobotsForProto(String proto, EdgeDomain domain) {
private Optional<SimpleRobotRules> fetchRobotsForProto(String proto, WarcRecorder recorder, EdgeDomain domain) {
try {
var url = new EdgeUrl(proto, domain, null, "/robots.txt", null);
return Optional.of(parseRobotsTxt(fetchContent(url, ContentTags.empty())));
var getBuilder = new Request.Builder().get();
getBuilder.url(url.toString())
.addHeader("Accept-Encoding", "gzip")
.addHeader("User-agent", userAgent);
HttpFetchResult result = recorder.fetch(client, getBuilder.build());
return DocumentBodyExtractor.asBytes(result).mapOpt((contentType, body) ->
robotsParser.parseContent(url.toString(),
body,
contentType.toString(),
userAgent)
);
}
catch (Exception ex) {
return Optional.empty();
}
}
private SimpleRobotRules parseRobotsTxt(CrawledDocument doc) {
return robotsParser.parseContent(doc.url,
doc.documentBody.getBytes(),
doc.contentType,
userAgent);
}
}

View File

@ -1,4 +1,4 @@
package nu.marginalia.crawl.retreival.fetcher;
package nu.marginalia.crawl.retreival.fetcher.socket;
import javax.net.SocketFactory;
import java.io.IOException;

View File

@ -0,0 +1,31 @@
package nu.marginalia.crawl.retreival.fetcher.socket;
import okhttp3.Interceptor;
import okhttp3.Response;
import org.jetbrains.annotations.NotNull;
import java.io.IOException;
/** An interceptor that intercepts network requests and adds the remote IP address as
* a header in the response. This is used to pass the remote IP address to the Warc
* writer, as this information is not available in the response.
*/
public class IpInterceptingNetworkInterceptor implements Interceptor {
private static final String pseudoHeaderName = "X-Marginalia-Remote-IP";
@NotNull
@Override
public Response intercept(@NotNull Interceptor.Chain chain) throws IOException {
String IP = chain.connection().socket().getInetAddress().getHostAddress();
return chain.proceed(chain.request())
.newBuilder()
.addHeader(pseudoHeaderName, IP)
.build();
}
public static String getIpFromResponse(Response response) {
return response.header(pseudoHeaderName);
}
}
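
A usage sketch (the target URL is an assumption; the client setup mirrors the resynchronizer test further down, exception handling elided):

OkHttpClient client = new OkHttpClient.Builder()
        .addNetworkInterceptor(new IpInterceptingNetworkInterceptor())
        .build();

Request request = new Request.Builder().url("https://www.example.com/").get().build();
try (Response response = client.newCall(request).execute()) {
    // The pseudo-header added by the interceptor carries the remote IP of the connection
    String remoteIp = IpInterceptingNetworkInterceptor.getIpFromResponse(response);
    System.out.println("Served from " + remoteIp);
}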

View File

@ -1,4 +1,4 @@
package nu.marginalia.crawl.retreival.fetcher;
package nu.marginalia.crawl.retreival.fetcher.socket;
import lombok.SneakyThrows;
@ -8,6 +8,8 @@ import java.security.cert.X509Certificate;
public class NoSecuritySSL {
// Create a trust manager that does not validate certificate chains
// We want to accept e.g. self-signed certificates and certificates
// that are not signed by a CA that is generally trusted by the system.
public static final TrustManager[] trustAllCerts = new TrustManager[]{
new X509TrustManager() {
@Override
@ -27,7 +29,6 @@ public class NoSecuritySSL {
}
};
@SneakyThrows
public static SSLSocketFactory buildSocketFactory() {
// Install the all-trusting trust manager

View File

@ -0,0 +1,33 @@
package nu.marginalia.crawl.retreival.fetcher.warc;
import org.netpreserve.jwarc.WarcDigest;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
class WarcDigestBuilder {
private final MessageDigest digest;
private static final String digestAlgorithm = "SHA-1";
public WarcDigestBuilder() throws NoSuchAlgorithmException {
this.digest = MessageDigest.getInstance(digestAlgorithm);
}
public void update(String s) {
byte[] bytes = s.getBytes();
update(bytes, bytes.length);
}
public void update(byte[] buffer, int n) {
update(buffer, 0, n);
}
public void update(byte[] buffer, int s, int n) {
digest.update(buffer, s, n);
}
public WarcDigest build() {
return new WarcDigest(digest);
}
}

View File

@ -0,0 +1,170 @@
package nu.marginalia.crawl.retreival.fetcher.warc;
import okhttp3.Protocol;
import okhttp3.Request;
import okhttp3.Response;
import org.apache.commons.lang3.StringUtils;
import java.net.URI;
import java.net.URLEncoder;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.Map;
import java.util.StringJoiner;
import java.util.stream.Collectors;
/** We don't have access to the raw HTTP request and response, so we need to reconstruct them
* as faithfully as we can from the data we have available.
*/
public class WarcProtocolReconstructor {
static String getHttpRequestString(Request request, URI uri) {
StringBuilder requestStringBuilder = new StringBuilder();
final String encodedURL = encodeURLKeepSlashes(uri.getPath());
requestStringBuilder.append(request.method()).append(" ").append(encodedURL);
if (uri.getQuery() != null) {
requestStringBuilder.append("?").append(uri.getQuery());
}
requestStringBuilder.append(" HTTP/1.1\r\n");
requestStringBuilder.append("Host: ").append(uri.getHost()).append("\r\n");
request.headers().toMultimap().forEach((k, values) -> {
for (var value : values) {
requestStringBuilder.append(capitalizeHeader(k)).append(": ").append(value).append("\r\n");
}
});
return requestStringBuilder.toString();
}
/** Java's URLEncoder will URLEncode slashes, which is not desirable
* when sanitizing a URL for HTTP protocol purposes
*/
private static String encodeURLKeepSlashes(String URL) {
String[] parts = StringUtils.split(URL,"/");
StringJoiner joiner = new StringJoiner("/");
for (String part : parts) {
joiner.add(URLEncoder.encode(part, StandardCharsets.UTF_8));
}
return joiner.toString();
}
static String getResponseHeader(String headersAsString, int code) {
String version = "1.1";
String statusCode = String.valueOf(code);
String statusMessage = STATUS_CODE_MAP.getOrDefault(code, "Unknown");
String headerString = getHeadersAsString(headersAsString);
return STR."HTTP/\{version} \{statusCode} \{statusMessage}\r\n\{headerString}\r\n\r\n";
}
static String getResponseHeader(Response response) {
String version = response.protocol() == Protocol.HTTP_1_1 ? "1.1" : "2.0";
String statusCode = String.valueOf(response.code());
String statusMessage = STATUS_CODE_MAP.getOrDefault(response.code(), "Unknown");
String headerString = getHeadersAsString(response);
return STR."HTTP/\{version} \{statusCode} \{statusMessage}\r\n\{headerString}\r\n\r\n";
}
private static final Map<Integer, String> STATUS_CODE_MAP = Map.ofEntries(
Map.entry(200, "OK"),
Map.entry(201, "Created"),
Map.entry(202, "Accepted"),
Map.entry(203, "Non-Authoritative Information"),
Map.entry(204, "No Content"),
Map.entry(205, "Reset Content"),
Map.entry(206, "Partial Content"),
Map.entry(207, "Multi-Status"),
Map.entry(208, "Already Reported"),
Map.entry(226, "IM Used"),
Map.entry(300, "Multiple Choices"),
Map.entry(301, "Moved Permanently"),
Map.entry(302, "Found"),
Map.entry(303, "See Other"),
Map.entry(304, "Not Modified"),
Map.entry(307, "Temporary Redirect"),
Map.entry(308, "Permanent Redirect"),
Map.entry(400, "Bad Request"),
Map.entry(401, "Unauthorized"),
Map.entry(403, "Forbidden"),
Map.entry(404, "Not Found"),
Map.entry(405, "Method Not Allowed"),
Map.entry(406, "Not Acceptable"),
Map.entry(408, "Request Timeout"),
Map.entry(409, "Conflict"),
Map.entry(410, "Gone"),
Map.entry(411, "Length Required"),
Map.entry(412, "Precondition Failed"),
Map.entry(413, "Payload Too Large"),
Map.entry(414, "URI Too Long"),
Map.entry(415, "Unsupported Media Type"),
Map.entry(416, "Range Not Satisfiable"),
Map.entry(417, "Expectation Failed"),
Map.entry(418, "I'm a teapot"),
Map.entry(421, "Misdirected Request"),
Map.entry(426, "Upgrade Required"),
Map.entry(428, "Precondition Required"),
Map.entry(429, "Too Many Requests"),
Map.entry(431, "Request Header Fields Too Large"),
Map.entry(451, "Unavailable For Legal Reasons"),
Map.entry(500, "Internal Server Error"),
Map.entry(501, "Not Implemented"),
Map.entry(502, "Bad Gateway"),
Map.entry(503, "Service Unavailable"),
Map.entry(504, "Gateway Timeout"),
Map.entry(505, "HTTP Version Not Supported"),
Map.entry(506, "Variant Also Negotiates"),
Map.entry(507, "Insufficient Storage"),
Map.entry(508, "Loop Detected"),
Map.entry(510, "Not Extended"),
Map.entry(511, "Network Authentication Required")
);
static private String getHeadersAsString(String headersBlob) {
StringJoiner joiner = new StringJoiner("\r\n");
Arrays.stream(headersBlob.split("\n")).forEach(joiner::add);
return joiner.toString();
}
static private String getHeadersAsString(Response response) {
StringJoiner joiner = new StringJoiner("\r\n");
response.headers().toMultimap().forEach((k, values) -> {
String headerCapitalized = capitalizeHeader(k);
// Omit pseudoheaders injected by the crawler itself
if (headerCapitalized.startsWith("X-Marginalia"))
return;
// Omit Transfer-Encoding header, as we'll be using Content-Length
// instead in the warc file, despite what the server says
if (headerCapitalized.startsWith("Transfer-Encoding"))
return;
for (var value : values) {
joiner.add(headerCapitalized + ": " + value);
}
});
return joiner.toString();
}
// okhttp gives us flattened headers, so we need to reconstruct Camel-Kebab-Case style
// for the WARC parser's sake...
static private String capitalizeHeader(String k) {
return Arrays.stream(StringUtils.split(k, '-'))
.map(StringUtils::capitalize)
.collect(Collectors.joining("-"));
}
}
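
To make the reconstruction concrete, this is roughly what the package-private getResponseHeader(String, int) yields when WarcRecorder replays a stored document (the header blob is made up):

String storedHeaders = "Content-Type: text/html\nContent-Length: 512";
String head = WarcProtocolReconstructor.getResponseHeader(storedHeaders, 200);
// head == "HTTP/1.1 200 OK\r\n"
//       + "Content-Type: text/html\r\n"
//       + "Content-Length: 512\r\n"
//       + "\r\n"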

View File

@ -0,0 +1,402 @@
package nu.marginalia.crawl.retreival.fetcher.warc;
import nu.marginalia.crawl.retreival.DomainProber;
import nu.marginalia.crawling.body.HttpFetchResult;
import nu.marginalia.crawl.retreival.fetcher.socket.IpInterceptingNetworkInterceptor;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
import okhttp3.OkHttpClient;
import okhttp3.Request;
import org.netpreserve.jwarc.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.io.InputStream;
import java.net.InetAddress;
import java.net.URI;
import java.net.URISyntaxException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.security.NoSuchAlgorithmException;
import java.time.Instant;
import java.util.*;
/** Based on JWarc's fetch method, APL 2.0 license
* <p></p>
* This class wraps OkHttp's OkHttpClient and records the HTTP request and response in a WARC file,
* as faithfully as possible given that not all the data is available at the same time and
* needs to be reconstructed.
*/
public class WarcRecorder implements AutoCloseable {
private static final int MAX_TIME = 30_000;
private static final int MAX_SIZE = 1024 * 1024 * 10;
private final WarcWriter writer;
private final Path warcFile;
private static final Logger logger = LoggerFactory.getLogger(WarcRecorder.class);
private final ThreadLocal<byte[]> bufferThreadLocal = ThreadLocal.withInitial(() -> new byte[MAX_SIZE]);
private boolean temporaryFile = false;
// Affix a version string in case we need to change the format in the future
// in some way
private final String warcRecorderVersion = "1.0";
// We need to know if the site uses cookies so this can be reported among the search results
// -- flip this to true if we see any cookies. This information will also be painted on any
// revisited pages. It's not 100% perfect and a bit order dependent, but it's good enough.
private final WarcXCookieInformationHeader cookieInformation = new WarcXCookieInformationHeader();
/**
* Create a new WarcRecorder that will write to the given file
*
* @param warcFile The file to write to
*/
public WarcRecorder(Path warcFile) throws IOException {
this.warcFile = warcFile;
this.writer = new WarcWriter(warcFile);
}
/**
* Create a new WarcRecorder that will write to a temporary file
* and delete it when close() is called.
*/
public WarcRecorder() throws IOException {
this.warcFile = Files.createTempFile("warc", ".warc.gz");
this.writer = new WarcWriter(this.warcFile);
temporaryFile = true;
}
public HttpFetchResult fetch(OkHttpClient client, Request request) throws NoSuchAlgorithmException,
IOException,
URISyntaxException,
InterruptedException
{
URI requestUri = request.url().uri();
WarcDigestBuilder responseDigestBuilder = new WarcDigestBuilder();
WarcDigestBuilder payloadDigestBuilder = new WarcDigestBuilder();
String ip;
Instant date = Instant.now();
long startMillis = date.toEpochMilli();
var call = client.newCall(request);
int totalLength = 0;
WarcTruncationReason truncationReason = null;
ResponseDataBuffer responseDataBuffer = new ResponseDataBuffer();
cookieInformation.update(client, request.url());
try (var response = call.execute()) {
var body = response.body();
InputStream inputStream;
if (body == null) {
inputStream = null;
truncationReason = WarcTruncationReason.DISCONNECT;
}
else {
inputStream = body.byteStream();
}
ip = IpInterceptingNetworkInterceptor.getIpFromResponse(response);
String responseHeaders = WarcProtocolReconstructor.getResponseHeader(response);
responseDataBuffer.put(responseHeaders);
responseDataBuffer.updateDigest(responseDigestBuilder, 0, responseDataBuffer.length());
int dataStart = responseDataBuffer.pos();
while (inputStream != null) {
int remainingLength = responseDataBuffer.remaining();
if (remainingLength == 0)
break;
int startPos = responseDataBuffer.pos();
int n = responseDataBuffer.readFrom(inputStream, remainingLength);
if (n < 0)
break;
responseDataBuffer.updateDigest(responseDigestBuilder, startPos, n);
responseDataBuffer.updateDigest(payloadDigestBuilder, startPos, n);
totalLength += n;
if (MAX_TIME > 0 && System.currentTimeMillis() - startMillis > MAX_TIME) {
truncationReason = WarcTruncationReason.TIME;
break;
}
if (MAX_SIZE > 0 && totalLength >= MAX_SIZE) {
truncationReason = WarcTruncationReason.LENGTH;
break;
}
}
// It looks like this might be the same as requestUri, but it's not;
// it's the URI after resolving redirects.
final URI responseUri = response.request().url().uri();
WarcResponse.Builder responseBuilder = new WarcResponse.Builder(responseUri)
.blockDigest(responseDigestBuilder.build())
.date(date)
.body(MediaType.HTTP_RESPONSE, responseDataBuffer.copyBytes());
cookieInformation.paint(responseBuilder);
if (ip != null) responseBuilder.ipAddress(InetAddress.getByName(ip));
responseBuilder.payloadDigest(payloadDigestBuilder.build());
if (truncationReason != null)
responseBuilder.truncated(truncationReason);
// Build and write the response
var warcResponse = responseBuilder.build();
warcResponse.http(); // force HTTP header to be parsed before body is consumed so that caller can use it
writer.write(warcResponse);
// Build and write the request
WarcDigestBuilder requestDigestBuilder = new WarcDigestBuilder();
String httpRequestString = WarcProtocolReconstructor.getHttpRequestString(response.request(), requestUri);
requestDigestBuilder.update(httpRequestString);
WarcRequest warcRequest = new WarcRequest.Builder(requestUri)
.blockDigest(requestDigestBuilder.build())
.date(date)
.body(MediaType.HTTP_REQUEST, httpRequestString.getBytes())
.concurrentTo(warcResponse.id())
.build();
warcRequest.http(); // force HTTP header to be parsed before body is consumed so that caller can use it
writer.write(warcRequest);
return new HttpFetchResult.ResultOk(responseUri,
response.code(),
response.headers(),
ip,
responseDataBuffer.data,
dataStart,
responseDataBuffer.length() - dataStart);
}
catch (Exception ex) {
logger.warn("Failed to fetch URL {}", requestUri, ex);
return new HttpFetchResult.ResultException(ex);
}
}
public void resync(WarcRecord item) throws IOException {
writer.write(item);
}
private void saveOldResponse(EdgeUrl url, String contentType, int statusCode, String documentBody) {
try {
WarcDigestBuilder responseDigestBuilder = new WarcDigestBuilder();
WarcDigestBuilder payloadDigestBuilder = new WarcDigestBuilder();
byte[] bytes = documentBody.getBytes();
String fakeHeaders = STR."""
Content-Type: \{contentType}
Content-Length: \{bytes.length}
Content-Encoding: UTF-8
""";
String header = WarcProtocolReconstructor.getResponseHeader(fakeHeaders, statusCode);
ResponseDataBuffer responseDataBuffer = new ResponseDataBuffer();
responseDataBuffer.put(header);
responseDigestBuilder.update(header);
responseDigestBuilder.update(bytes, bytes.length);
payloadDigestBuilder.update(bytes, bytes.length);
responseDataBuffer.put(bytes, 0, bytes.length);
WarcXResponseReference.Builder builder = new WarcXResponseReference.Builder(url.asURI())
.blockDigest(responseDigestBuilder.build())
.payloadDigest(payloadDigestBuilder.build())
.date(Instant.now())
.body(MediaType.HTTP_RESPONSE, responseDataBuffer.copyBytes());
cookieInformation.paint(builder);
var reference = builder.build();
reference.http(); // force HTTP header to be parsed before body is consumed so that caller can use it
writer.write(reference);
} catch (URISyntaxException | IOException | NoSuchAlgorithmException e) {
throw new RuntimeException(e);
}
}
/**
* Flag the given URL as skipped by the crawler, so that it will not be retried.
* Knowing which URLs were skipped is still important when resynchronizing from the WARC file,
* so that the crawler can avoid re-fetching them.
*/
public void flagAsSkipped(EdgeUrl url, String contentType, int statusCode, String documentBody) {
saveOldResponse(url, contentType, statusCode, documentBody);
}
/**
* Write a reference copy of the given document data. This is used when the crawler provides
* an E-Tag or Last-Modified header, and the server responds with a 304 Not Modified. In this
* scenario we want to record the data as it was in the previous crawl, but not re-fetch it.
*/
public void writeReferenceCopy(EdgeUrl url, String contentType, int statusCode, String documentBody) {
saveOldResponse(url, contentType, statusCode, documentBody);
}
public void writeWarcinfoHeader(String ip, EdgeDomain domain, DomainProber.ProbeResult result) throws IOException {
Map<String, List<String>> fields = new HashMap<>();
fields.put("ip", List.of(ip));
fields.put("software", List.of(STR."search.marginalia.nu/\{warcRecorderVersion}"));
fields.put("domain", List.of(domain.toString()));
switch (result) {
case DomainProber.ProbeResultRedirect redirectDomain:
fields.put("X-WARC-Probe-Status", List.of(STR."REDIRECT;\{redirectDomain.domain()}"));
break;
case DomainProber.ProbeResultError error:
fields.put("X-WARC-Probe-Status", List.of(STR."\{error.status().toString()};\{error.desc()}"));
break;
case DomainProber.ProbeResultOk ok:
fields.put("X-WARC-Probe-Status", List.of("OK"));
break;
}
var warcinfo = new Warcinfo.Builder()
.date(Instant.now())
.fields(fields)
.recordId(UUID.randomUUID())
.build();
writer.write(warcinfo);
}
public void flagAsRobotsTxtError(EdgeUrl top) {
try {
WarcXEntityRefused refusal = new WarcXEntityRefused.Builder(top.asURI(), WarcXEntityRefused.documentRobotsTxtSkippedURN)
.date(Instant.now())
.build();
writer.write(refusal);
} catch (URISyntaxException | IOException e) {
throw new RuntimeException(e);
}
}
public void flagAsFailedContentTypeProbe(EdgeUrl url, String contentType, int status) {
try {
WarcXEntityRefused refusal = new WarcXEntityRefused.Builder(url.asURI(), WarcXEntityRefused.documentBadContentTypeURN)
.date(Instant.now())
.addHeader("Rejected-Content-Type", contentType)
.addHeader("Http-Status", Integer.toString(status))
.build();
writer.write(refusal);
} catch (URISyntaxException | IOException e) {
throw new RuntimeException(e);
}
}
public void flagAsError(EdgeUrl url, Exception ex) {
try {
WarcXEntityRefused refusal = new WarcXEntityRefused.Builder(url.asURI(), WarcXEntityRefused.documentUnspecifiedError)
.date(Instant.now())
.addHeader("Exception", ex.getClass().getSimpleName())
.addHeader("ErrorMessage", Objects.requireNonNullElse(ex.getMessage(), ""))
.build();
writer.write(refusal);
} catch (URISyntaxException | IOException e) {
throw new RuntimeException(e);
}
}
public void flagAsTimeout(EdgeUrl url) {
try {
WarcXEntityRefused refusal = new WarcXEntityRefused.Builder(url.asURI(), WarcXEntityRefused.documentProbeTimeout)
.date(Instant.now())
.build();
writer.write(refusal);
} catch (URISyntaxException | IOException e) {
throw new RuntimeException(e);
}
}
private class ResponseDataBuffer {
private final byte[] data;
private int length = 0;
private int pos = 0;
public ResponseDataBuffer() {
data = bufferThreadLocal.get();
}
public int pos() {
return pos;
}
public int length() {
return length;
}
public void put(String s) {
byte[] bytes = s.getBytes();
put(bytes, 0, bytes.length);
}
private void put(byte[] bytes, int i, int n) {
System.arraycopy(bytes, i, data, pos, n);
pos += n;
length += n;
}
public int readFrom(InputStream inputStream, int remainingLength) throws IOException {
int n = inputStream.read(data, pos, remainingLength);
if (n > 0) {
pos += n;
length += n;
}
return n;
}
public int remaining() {
return MAX_SIZE - pos;
}
public void updateDigest(WarcDigestBuilder digestBuilder, int startPos, int n) {
digestBuilder.update(data, startPos, n);
}
public byte[] copyBytes() {
byte[] copy = new byte[length];
System.arraycopy(data, 0, copy, 0, length);
return copy;
}
}
public void close() {
try {
writer.close();
if (temporaryFile)
Files.deleteIfExists(warcFile);
} catch (IOException e) {
throw new RuntimeException(e);
}
}
}

View File

@ -0,0 +1,108 @@
package nu.marginalia.crawl.retreival.revisit;
import crawlercommons.robots.SimpleRobotRules;
import nu.marginalia.crawl.retreival.CrawlDataReference;
import nu.marginalia.crawl.retreival.CrawlDelayTimer;
import nu.marginalia.crawl.retreival.CrawlerRetreiver;
import nu.marginalia.crawl.retreival.DomainCrawlFrontier;
import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
import nu.marginalia.crawling.model.CrawledDocument;
import nu.marginalia.model.EdgeUrl;
import org.jsoup.Jsoup;
/** This class encapsulates the logic for re-visiting a domain that has already been crawled.
* We may use information from the previous crawl to inform the next crawl, specifically the
* E-Tag and Last-Modified headers.
*/
public class CrawlerRevisitor {
private final DomainCrawlFrontier crawlFrontier;
private final CrawlerRetreiver crawlerRetreiver;
private final WarcRecorder warcRecorder;
public CrawlerRevisitor(DomainCrawlFrontier crawlFrontier,
CrawlerRetreiver crawlerRetreiver,
WarcRecorder warcRecorder) {
this.crawlFrontier = crawlFrontier;
this.crawlerRetreiver = crawlerRetreiver;
this.warcRecorder = warcRecorder;
}
/** Performs a re-crawl of old documents, comparing etags and last-modified */
public int recrawl(CrawlDataReference oldCrawlData,
SimpleRobotRules robotsRules,
CrawlDelayTimer delayTimer)
throws InterruptedException {
int recrawled = 0;
int retained = 0;
for (;;) {
CrawledDocument doc = oldCrawlData.nextDocument();
if (doc == null) {
break;
}
// This Shouldn't Happen (TM)
var urlMaybe = EdgeUrl.parse(doc.url);
if (urlMaybe.isEmpty()) continue;
var url = urlMaybe.get();
// If we've previously 404:d on this URL, we'll refrain from trying to fetch it again
if (doc.httpStatus == 404) {
crawlFrontier.addVisited(url);
continue;
}
if (doc.httpStatus != 200) continue;
if (!robotsRules.isAllowed(url.toString())) {
warcRecorder.flagAsRobotsTxtError(url);
continue;
}
if (!crawlFrontier.filterLink(url))
continue;
if (!crawlFrontier.addVisited(url))
continue;
if (recrawled > 5
&& retained > 0.9 * recrawled
&& Math.random() < 0.9)
{
// Since it looks like most of these documents haven't changed,
// we'll load the documents directly; but we do this in a random
// fashion to make sure we eventually catch changes over time
// and ensure we discover new links
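// (Illustrative numbers: once more than 90% of the documents recrawled so far have come
//  back unchanged, say 46 of 50, roughly nine out of ten of the remaining candidates are
//  recycled straight from the old crawl data, and only the rest are re-fetched to verify.)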
crawlFrontier.addVisited(url);
// Hoover up any links from the document
if (doc.httpStatus == 200 && doc.documentBody != null) {
var parsedDoc = Jsoup.parse(doc.documentBody);
crawlFrontier.enqueueLinksFromDocument(url, parsedDoc);
}
// Add a WARC record so we don't repeat this
warcRecorder.flagAsSkipped(url, doc.contentType, doc.httpStatus, doc.documentBody);
continue;
}
// GET the document with the stored document as a reference
// providing etag and last-modified headers, so we can recycle the
// document if it hasn't changed without actually downloading it
var reference = new DocumentWithReference(doc, oldCrawlData);
var result = crawlerRetreiver.fetchWriteAndSleep(url, delayTimer, reference);
if (reference.isSame(result)) {
retained++;
}
recrawled++;
}
return recrawled;
}
}

View File

@ -0,0 +1,77 @@
package nu.marginalia.crawl.retreival.revisit;
import nu.marginalia.crawl.retreival.CrawlDataReference;
import nu.marginalia.crawl.retreival.fetcher.ContentTags;
import nu.marginalia.crawling.body.DocumentBodyExtractor;
import nu.marginalia.crawling.body.DocumentBodyResult;
import nu.marginalia.crawling.body.HttpFetchResult;
import nu.marginalia.crawling.model.CrawledDocument;
import javax.annotation.Nullable;
public record DocumentWithReference(
@Nullable CrawledDocument doc,
@Nullable CrawlDataReference reference) {
private static final DocumentWithReference emptyInstance = new DocumentWithReference(null, null);
public static DocumentWithReference empty() {
return emptyInstance;
}
/** Returns true if the provided document is the same as the reference document,
* or if the result was retained via HTTP 304.
*/
public boolean isSame(HttpFetchResult result) {
if (result instanceof HttpFetchResult.Result304Raw)
return true;
if (result instanceof HttpFetchResult.Result304ReplacedWithReference)
return true;
if (!(result instanceof HttpFetchResult.ResultOk resultOk))
return false;
if (reference == null)
return false;
if (doc == null)
return false;
if (doc.documentBody == null)
return false;
if (!(DocumentBodyExtractor.asString(resultOk) instanceof DocumentBodyResult.Ok<String> bodyOk)) {
return false;
}
return reference.isContentBodySame(doc.documentBody, bodyOk.body());
}
public ContentTags getContentTags() {
if (null == doc)
return ContentTags.empty();
String headers = doc.headers;
if (headers == null)
return ContentTags.empty();
String[] headersLines = headers.split("\n");
String lastmod = null;
String etag = null;
for (String line : headersLines) {
if (line.toLowerCase().startsWith("etag:")) {
etag = line.substring(5).trim();
}
if (line.toLowerCase().startsWith("last-modified:")) {
lastmod = line.substring(14).trim();
}
}
return new ContentTags(etag, lastmod);
}
public boolean isEmpty() {
return doc == null || reference == null;
}
}

View File

@ -0,0 +1,71 @@
package nu.marginalia.crawl.retreival.sitemap;
import crawlercommons.robots.SimpleRobotRules;
import nu.marginalia.crawl.retreival.DomainCrawlFrontier;
import nu.marginalia.crawl.retreival.fetcher.SitemapRetriever;
import nu.marginalia.model.EdgeUrl;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
public class SitemapFetcher {
private final DomainCrawlFrontier crawlFrontier;
private final SitemapRetriever sitemapRetriever;
private static final Logger logger = LoggerFactory.getLogger(SitemapFetcher.class);
public SitemapFetcher(DomainCrawlFrontier crawlFrontier, SitemapRetriever sitemapRetriever) {
this.crawlFrontier = crawlFrontier;
this.sitemapRetriever = sitemapRetriever;
}
public void downloadSitemaps(SimpleRobotRules robotsRules, EdgeUrl rootUrl) {
List<String> sitemaps = robotsRules.getSitemaps();
List<EdgeUrl> urls = new ArrayList<>(sitemaps.size());
if (!sitemaps.isEmpty()) {
for (var url : sitemaps) {
EdgeUrl.parse(url).ifPresent(urls::add);
}
}
else {
urls.add(rootUrl.withPathAndParam("/sitemap.xml", null));
}
downloadSitemaps(urls);
}
public void downloadSitemaps(List<EdgeUrl> urls) {
Set<String> checkedSitemaps = new HashSet<>();
for (var url : urls) {
// Let's not download sitemaps from other domains for now
if (!crawlFrontier.isSameDomain(url)) {
continue;
}
if (checkedSitemaps.contains(url.path))
continue;
var sitemap = sitemapRetriever.fetchSitemap(url);
if (sitemap.isEmpty()) {
continue;
}
// ensure we don't try to download this sitemap again
// (don't move this up, as we may want to check the same
// path with different protocols until we find one that works)
checkedSitemaps.add(url.path);
crawlFrontier.addAllToQueue(sitemap);
}
logger.debug("Queue is now {}", crawlFrontier.queueSize());
}
}

View File

@ -0,0 +1,88 @@
package nu.marginalia.crawl.retreival;
import nu.marginalia.crawl.retreival.fetcher.socket.IpInterceptingNetworkInterceptor;
import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
import okhttp3.OkHttpClient;
import okhttp3.Request;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.netpreserve.jwarc.WarcReader;
import org.netpreserve.jwarc.WarcRequest;
import org.netpreserve.jwarc.WarcResponse;
import java.io.IOException;
import java.net.URISyntaxException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.security.NoSuchAlgorithmException;
import java.util.List;
import java.util.zip.GZIPInputStream;
import static org.junit.jupiter.api.Assertions.*;
class CrawlerWarcResynchronizerTest {
Path fileName;
Path outputFile;
OkHttpClient httpClient;
@BeforeEach
public void setUp() throws Exception {
httpClient = new OkHttpClient.Builder()
.addNetworkInterceptor(new IpInterceptingNetworkInterceptor())
.build();
fileName = Files.createTempFile("test", ".warc.gz");
outputFile = Files.createTempFile("test", ".warc.gz");
}
@AfterEach
public void tearDown() throws Exception {
Files.deleteIfExists(fileName);
Files.deleteIfExists(outputFile);
}
@Test
void run() throws IOException, URISyntaxException {
try (var oldRecorder = new WarcRecorder(fileName)) {
fetchUrl(oldRecorder, "https://www.marginalia.nu/");
fetchUrl(oldRecorder, "https://www.marginalia.nu/log/");
fetchUrl(oldRecorder, "https://www.marginalia.nu/feed/");
} catch (Exception e) {
fail(e);
}
var crawlFrontier = new DomainCrawlFrontier(new EdgeDomain("www.marginalia.nu"), List.of(), 100);
try (var newRecorder = new WarcRecorder(outputFile)) {
new CrawlerWarcResynchronizer(crawlFrontier, newRecorder).run(fileName);
}
assertTrue(crawlFrontier.isVisited(new EdgeUrl("https://www.marginalia.nu/")));
assertTrue(crawlFrontier.isVisited(new EdgeUrl("https://www.marginalia.nu/log/")));
assertTrue(crawlFrontier.isVisited(new EdgeUrl("https://www.marginalia.nu/feed/")));
try (var warcReader = new WarcReader(outputFile)) {
for (var item : warcReader) {
if (item instanceof WarcRequest req) {
System.out.println("req:" + req.target());
}
if (item instanceof WarcResponse rsp) {
System.out.println("req:" + rsp.target());
}
}
}
new GZIPInputStream(Files.newInputStream(outputFile)).transferTo(System.out);
}
void fetchUrl(WarcRecorder recorder, String url) throws NoSuchAlgorithmException, IOException, URISyntaxException, InterruptedException {
var req = new Request.Builder().url(url)
.addHeader("User-agent", "test.marginalia.nu")
.addHeader("Accept-Encoding", "gzip")
.get().build();
recorder.fetch(httpClient, req);
}
}

View File

@ -0,0 +1,59 @@
package nu.marginalia.crawl.retreival.fetcher;
import nu.marginalia.crawl.retreival.fetcher.ContentTypeProber.ContentTypeProbeResult.BadContentType;
import nu.marginalia.crawl.retreival.fetcher.ContentTypeProber.ContentTypeProbeResult.Ok;
import nu.marginalia.model.EdgeUrl;
import okhttp3.ConnectionPool;
import okhttp3.Dispatcher;
import okhttp3.OkHttpClient;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import java.net.URISyntaxException;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import static org.junit.jupiter.api.Assertions.*;
class ContentTypeProberTest {
ContentTypeProber prober;
@BeforeEach
void setUp() {
OkHttpClient client = new OkHttpClient.Builder()
.dispatcher(new Dispatcher(Executors.newVirtualThreadPerTaskExecutor()))
.connectionPool(new ConnectionPool(0, 1, TimeUnit.NANOSECONDS))
.build();
prober = new ContentTypeProber("test.marginalia.nu", client);
}
@Test
void probeContentType() throws URISyntaxException {
assertEquals(
new Ok(new EdgeUrl("https://www.marginalia.nu/robots.txt")),
prober.probeContentType(new EdgeUrl("https://www.marginalia.nu/robots.txt")),
"robots.txt is expected to pass the probing test since it's text/plain"
);
assertEquals(
new BadContentType("image/png", 200),
prober.probeContentType(new EdgeUrl("https://www.marginalia.nu/sanic.png")),
"sanic.png is expected to pass the probing test since it's image/png"
);
assertEquals(
new Ok(new EdgeUrl("https://www.marginalia.nu/dev/null")),
prober.probeContentType(new EdgeUrl("https://www.marginalia.nu/dev/null")),
"Despite being a 404, we expect this to be passed as OK as it's NotMyJob(TM) to verify response codes"
);
assertEquals(
new Ok(new EdgeUrl("https://www.marginalia.nu/projects/edge/about.gmi/")),
prober.probeContentType(new EdgeUrl("https://www.marginalia.nu/projects/edge/about.gmi")),
"about.gmi is expected to give a redirect to about.gmi/ which is served as text/html"
);
}
}

View File

@ -1,5 +1,6 @@
package nu.marginalia.crawl.retreival.fetcher;
import nu.marginalia.crawling.parquet.CrawledDocumentParquetRecordFileWriter;
import org.junit.jupiter.api.Test;
import java.util.List;
@ -7,30 +8,30 @@ import java.util.List;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertTrue;
class HttpFetcherImplTest {
class CrawledDocumentParquetRecordFileWriterTest {
@Test
public void testXRobotsTag() {
assertTrue(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("foo:"), "search.marginalia.nu"));
assertTrue(HttpFetcherImpl.isXRobotsTagsPermitted(List.of(":bar"), "search.marginalia.nu"));
assertTrue(HttpFetcherImpl.isXRobotsTagsPermitted(List.of(":"), "search.marginalia.nu"));
assertTrue(HttpFetcherImpl.isXRobotsTagsPermitted(List.of(""), "search.marginalia.nu"));
assertTrue(CrawledDocumentParquetRecordFileWriter.isXRobotsTagsPermitted(List.of("foo:"), "search.marginalia.nu"));
assertTrue(CrawledDocumentParquetRecordFileWriter.isXRobotsTagsPermitted(List.of(":bar"), "search.marginalia.nu"));
assertTrue(CrawledDocumentParquetRecordFileWriter.isXRobotsTagsPermitted(List.of(":"), "search.marginalia.nu"));
assertTrue(CrawledDocumentParquetRecordFileWriter.isXRobotsTagsPermitted(List.of(""), "search.marginalia.nu"));
assertTrue(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("doindex"), "search.marginalia.nu"));
assertFalse(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("noindex"), "search.marginalia.nu"));
assertFalse(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("none"), "search.marginalia.nu"));
assertFalse(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("search.marginalia.nu: noindex"), "search.marginalia.nu"));
assertFalse(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("search.marginalia.nu: none"), "search.marginalia.nu"));
assertTrue(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("googlebot: noindex"), "search.marginalia.nu"));
assertTrue(CrawledDocumentParquetRecordFileWriter.isXRobotsTagsPermitted(List.of("doindex"), "search.marginalia.nu"));
assertFalse(CrawledDocumentParquetRecordFileWriter.isXRobotsTagsPermitted(List.of("noindex"), "search.marginalia.nu"));
assertFalse(CrawledDocumentParquetRecordFileWriter.isXRobotsTagsPermitted(List.of("none"), "search.marginalia.nu"));
assertFalse(CrawledDocumentParquetRecordFileWriter.isXRobotsTagsPermitted(List.of("search.marginalia.nu: noindex"), "search.marginalia.nu"));
assertFalse(CrawledDocumentParquetRecordFileWriter.isXRobotsTagsPermitted(List.of("search.marginalia.nu: none"), "search.marginalia.nu"));
assertTrue(CrawledDocumentParquetRecordFileWriter.isXRobotsTagsPermitted(List.of("googlebot: noindex"), "search.marginalia.nu"));
assertTrue(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("noindex", "search.marginalia.nu: all"), "search.marginalia.nu"));
assertTrue(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("none", "search.marginalia.nu: all"), "search.marginalia.nu"));
assertFalse(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("none", "search.marginalia.nu: none"), "search.marginalia.nu"));
assertFalse(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("all", "search.marginalia.nu: none"), "search.marginalia.nu"));
assertTrue(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("search.marginalia.nu: all", "noindex"), "search.marginalia.nu"));
assertTrue(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("search.marginalia.nu: all", "none"), "search.marginalia.nu"));
assertFalse(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("search.marginalia.nu: none", "none"), "search.marginalia.nu"));
assertFalse(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("search.marginalia.nu: none", "all"), "search.marginalia.nu"));
assertTrue(CrawledDocumentParquetRecordFileWriter.isXRobotsTagsPermitted(List.of("noindex", "search.marginalia.nu: all"), "search.marginalia.nu"));
assertTrue(CrawledDocumentParquetRecordFileWriter.isXRobotsTagsPermitted(List.of("none", "search.marginalia.nu: all"), "search.marginalia.nu"));
assertFalse(CrawledDocumentParquetRecordFileWriter.isXRobotsTagsPermitted(List.of("none", "search.marginalia.nu: none"), "search.marginalia.nu"));
assertFalse(CrawledDocumentParquetRecordFileWriter.isXRobotsTagsPermitted(List.of("all", "search.marginalia.nu: none"), "search.marginalia.nu"));
assertTrue(CrawledDocumentParquetRecordFileWriter.isXRobotsTagsPermitted(List.of("search.marginalia.nu: all", "noindex"), "search.marginalia.nu"));
assertTrue(CrawledDocumentParquetRecordFileWriter.isXRobotsTagsPermitted(List.of("search.marginalia.nu: all", "none"), "search.marginalia.nu"));
assertFalse(CrawledDocumentParquetRecordFileWriter.isXRobotsTagsPermitted(List.of("search.marginalia.nu: none", "none"), "search.marginalia.nu"));
assertFalse(CrawledDocumentParquetRecordFileWriter.isXRobotsTagsPermitted(List.of("search.marginalia.nu: none", "all"), "search.marginalia.nu"));
}
}

View File

@ -0,0 +1,147 @@
package nu.marginalia.crawl.retreival.fetcher;
import nu.marginalia.UserAgent;
import nu.marginalia.crawl.retreival.fetcher.socket.IpInterceptingNetworkInterceptor;
import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
import nu.marginalia.crawling.parquet.CrawledDocumentParquetRecordFileReader;
import nu.marginalia.crawling.parquet.CrawledDocumentParquetRecordFileWriter;
import nu.marginalia.model.EdgeUrl;
import okhttp3.OkHttpClient;
import okhttp3.Request;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.netpreserve.jwarc.*;
import java.io.IOException;
import java.net.URISyntaxException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.security.NoSuchAlgorithmException;
import java.util.HashMap;
import java.util.Map;
import static org.junit.jupiter.api.Assertions.assertEquals;
class WarcRecorderTest {
Path fileNameWarc;
Path fileNameParquet;
WarcRecorder client;
OkHttpClient httpClient;
@BeforeEach
public void setUp() throws Exception {
httpClient = new OkHttpClient.Builder()
.addNetworkInterceptor(new IpInterceptingNetworkInterceptor())
.build();
fileNameWarc = Files.createTempFile("test", ".warc");
fileNameParquet = Files.createTempFile("test", ".parquet");
client = new WarcRecorder(fileNameWarc);
}
@AfterEach
public void tearDown() throws Exception {
client.close();
Files.deleteIfExists(fileNameWarc);
Files.deleteIfExists(fileNameParquet);
}
@Test
void fetch() throws NoSuchAlgorithmException, IOException, URISyntaxException, InterruptedException {
client.fetch(httpClient, new Request.Builder().url("https://www.marginalia.nu/")
.addHeader("User-agent", "test.marginalia.nu")
.addHeader("Accept-Encoding", "gzip")
.get().build());
Map<String, String> sampleData = new HashMap<>();
try (var warcReader = new WarcReader(fileNameWarc)) {
warcReader.forEach(record -> {
if (record instanceof WarcRequest req) {
sampleData.put(record.type(), req.target());
}
if (record instanceof WarcResponse rsp) {
sampleData.put(record.type(), rsp.target());
}
});
}
assertEquals("https://www.marginalia.nu/", sampleData.get("request"));
assertEquals("https://www.marginalia.nu/", sampleData.get("response"));
}
@Test
public void flagAsSkipped() throws IOException, URISyntaxException {
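// flagAsSkipped is expected to emit a synthetic response record for a document we chose not to
// re-fetch, complete with status, content type and the recorder's X-Cookies marker header.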
try (var recorder = new WarcRecorder(fileNameWarc)) {
recorder.flagAsSkipped(new EdgeUrl("https://www.marginalia.nu/"),
"text/html",
200,
"<?doctype html><html><body>test</body></html>");
}
try (var reader = new WarcReader(fileNameWarc)) {
for (var record : reader) {
if (record instanceof WarcResponse rsp) {
assertEquals("https://www.marginalia.nu/", rsp.target());
assertEquals("text/html", rsp.contentType().type());
assertEquals(200, rsp.http().status());
assertEquals("1", rsp.http().headers().first("X-Cookies").orElse(null));
}
}
}
}
@Test
public void testSaveImport() throws URISyntaxException, IOException {
try (var recorder = new WarcRecorder(fileNameWarc)) {
recorder.flagAsSkipped(new EdgeUrl("https://www.marginalia.nu/"),
"text/html",
200,
"<?doctype html><html><body>test</body></html>");
}
try (var reader = new WarcReader(fileNameWarc)) {
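// Register the custom record type so the reader can materialize WarcXResponseReference records.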
WarcXResponseReference.register(reader);
for (var record : reader) {
System.out.println(record.type());
System.out.println(record.getClass().getSimpleName());
if (record instanceof WarcXResponseReference rsp) {
assertEquals("https://www.marginalia.nu/", rsp.target());
}
}
}
}
@Test
public void testConvertToParquet() throws NoSuchAlgorithmException, IOException, URISyntaxException, InterruptedException {
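// Fetch two HTML pages and one PNG, convert the WARC to parquet, and verify that all three
// URLs survive the conversion in fetch order.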
client.fetch(httpClient, new Request.Builder().url("https://www.marginalia.nu/")
.addHeader("User-agent", "test.marginalia.nu")
.addHeader("Accept-Encoding", "gzip")
.get().build());
client.fetch(httpClient, new Request.Builder().url("https://www.marginalia.nu/log/")
.addHeader("User-agent", "test.marginalia.nu")
.addHeader("Accept-Encoding", "gzip")
.get().build());
client.fetch(httpClient, new Request.Builder().url("https://www.marginalia.nu/sanic.png")
.addHeader("User-agent", "test.marginalia.nu")
.addHeader("Accept-Encoding", "gzip")
.get().build());
client.close();
CrawledDocumentParquetRecordFileWriter.convertWarc(
"www.marginalia.nu",
new UserAgent("test"),
fileNameWarc,
fileNameParquet);
var urls = CrawledDocumentParquetRecordFileReader.stream(fileNameParquet).map(doc -> doc.url).toList();
assertEquals(3, urls.size());
assertEquals("https://www.marginalia.nu/", urls.get(0));
assertEquals("https://www.marginalia.nu/log/", urls.get(1));
assertEquals("https://www.marginalia.nu/sanic.png", urls.get(2));
}
}

View File

@ -4,11 +4,15 @@ import lombok.SneakyThrows;
import nu.marginalia.crawl.retreival.RateLimitException;
import nu.marginalia.crawl.retreival.fetcher.ContentTags;
import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl;
import nu.marginalia.crawl.retreival.logic.ContentTypeLogic;
import nu.marginalia.crawling.body.DocumentBodyExtractor;
import nu.marginalia.crawling.body.DocumentBodyResult;
import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
import nu.marginalia.crawling.body.ContentTypeLogic;
import nu.marginalia.model.EdgeUrl;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;
import java.io.IOException;
import java.net.URISyntaxException;
class HttpFetcherTest {
@ -28,16 +32,25 @@ class HttpFetcherTest {
}
@Test
void fetchUTF8() throws URISyntaxException, RateLimitException {
void fetchUTF8() throws URISyntaxException, RateLimitException, IOException {
var fetcher = new HttpFetcherImpl("nu.marginalia.edge-crawler");
var str = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu"), ContentTags.empty());
System.out.println(str.contentType);
try (var recorder = new WarcRecorder()) {
var result = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu"), recorder, ContentTags.empty());
if (DocumentBodyExtractor.asString(result) instanceof DocumentBodyResult.Ok bodyOk) {
System.out.println(bodyOk.contentType());
}
}
}
@Test
void fetchText() throws URISyntaxException, RateLimitException {
void fetchText() throws URISyntaxException, RateLimitException, IOException {
var fetcher = new HttpFetcherImpl("nu.marginalia.edge-crawler");
var str = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu/robots.txt"), ContentTags.empty());
System.out.println(str);
try (var recorder = new WarcRecorder()) {
var result = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu/robots.txt"), recorder, ContentTags.empty());
if (DocumentBodyExtractor.asString(result) instanceof DocumentBodyResult.Ok bodyOk) {
System.out.println(bodyOk.contentType());
}
}
}
}

View File

@ -5,6 +5,8 @@ import lombok.SneakyThrows;
import nu.marginalia.crawl.retreival.CrawlerRetreiver;
import nu.marginalia.crawl.retreival.DomainProber;
import nu.marginalia.crawl.retreival.fetcher.*;
import nu.marginalia.crawling.body.HttpFetchResult;
import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
import nu.marginalia.crawling.model.CrawledDocument;
import nu.marginalia.crawling.model.CrawlerDocumentStatus;
import nu.marginalia.crawling.model.SerializableCrawlData;
@ -12,17 +14,16 @@ import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawlspec.CrawlSpecRecord;
import nu.marginalia.test.CommonTestData;
import okhttp3.Headers;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.Test;
import org.mockito.Mockito;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.*;
public class CrawlerMockFetcherTest {
@ -61,44 +62,42 @@ public class CrawlerMockFetcherTest {
}
void crawl(CrawlSpecRecord spec) throws IOException {
try (var recorder = new WarcRecorder()) {
new CrawlerRetreiver(fetcherMock, new DomainProber(d -> true), spec, recorder)
.fetch();
}
}
@Test
public void testLemmy() throws URISyntaxException {
public void testLemmy() throws URISyntaxException, IOException {
List<SerializableCrawlData> out = new ArrayList<>();
registerUrlClasspathData(new EdgeUrl("https://startrek.website/"), "mock-crawl-data/lemmy/index.html");
registerUrlClasspathData(new EdgeUrl("https://startrek.website/c/startrek"), "mock-crawl-data/lemmy/c_startrek.html");
registerUrlClasspathData(new EdgeUrl("https://startrek.website/post/108995"), "mock-crawl-data/lemmy/108995.html");
new CrawlerRetreiver(fetcherMock, new DomainProber(d -> true), new CrawlSpecRecord("startrek.website", 10, new ArrayList<>()), out::add)
.fetch();
out.forEach(System.out::println);
crawl(new CrawlSpecRecord("startrek.website", 10, new ArrayList<>()));
}
@Test
public void testMediawiki() throws URISyntaxException {
public void testMediawiki() throws URISyntaxException, IOException {
List<SerializableCrawlData> out = new ArrayList<>();
registerUrlClasspathData(new EdgeUrl("https://en.wikipedia.org/"), "mock-crawl-data/mediawiki/index.html");
new CrawlerRetreiver(fetcherMock, new DomainProber(d -> true), new CrawlSpecRecord("en.wikipedia.org", 10, new ArrayList<>()), out::add)
.fetch();
out.forEach(System.out::println);
crawl(new CrawlSpecRecord("en.wikipedia.org", 10, new ArrayList<>()));
}
@Test
public void testDiscourse() throws URISyntaxException {
public void testDiscourse() throws URISyntaxException, IOException {
List<SerializableCrawlData> out = new ArrayList<>();
registerUrlClasspathData(new EdgeUrl("https://community.tt-rss.org/"), "mock-crawl-data/discourse/index.html");
registerUrlClasspathData(new EdgeUrl("https://community.tt-rss.org/t/telegram-channel-to-idle-on/3501"), "mock-crawl-data/discourse/telegram.html");
registerUrlClasspathData(new EdgeUrl("https://community.tt-rss.org/t/combined-mode-but-grid/4489"), "mock-crawl-data/discourse/grid.html");
new CrawlerRetreiver(fetcherMock, new DomainProber(d -> true), new CrawlSpecRecord("community.tt-rss.org", 100, new ArrayList<>()), out::add)
.fetch();
out.forEach(System.out::println);
crawl(new CrawlSpecRecord("community.tt-rss.org", 10, new ArrayList<>()));
}
class MockFetcher implements HttpFetcher {
@ -118,25 +117,28 @@ public class CrawlerMockFetcherTest {
return new FetchResult(FetchResultState.OK, url);
}
@SneakyThrows
@Override
public CrawledDocument fetchContent(EdgeUrl url, ContentTags tags) {
public HttpFetchResult fetchContent(EdgeUrl url, WarcRecorder recorder, ContentTags tags) {
logger.info("Fetching {}", url);
if (mockData.containsKey(url)) {
return mockData.get(url);
}
else {
return CrawledDocument.builder()
.crawlId("1")
.url(url.toString())
.contentType("text/html")
.httpStatus(404)
.crawlerStatus(CrawlerDocumentStatus.ERROR.name())
.build();
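// Wrap the mock document body in a ResultOk that mimics a successful fetch from 127.0.0.1.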
byte[] bodyBytes = mockData.get(url).documentBody.getBytes();
return new HttpFetchResult.ResultOk(
url.asURI(),
200,
new Headers.Builder().build(),
"127.0.0.1",
bodyBytes,
0,
bodyBytes.length
);
}
return new HttpFetchResult.ResultNone();
}
@Override
public SimpleRobotRules fetchRobotRules(EdgeDomain domain) {
public SimpleRobotRules fetchRobotRules(EdgeDomain domain, WarcRecorder recorder) {
return new SimpleRobotRules();
}
@ -144,5 +146,6 @@ public class CrawlerMockFetcherTest {
public SitemapRetriever createSitemapRetriever() {
return Mockito.mock(SitemapRetriever.class);
}
}
}

View File

@ -8,6 +8,7 @@ import nu.marginalia.crawl.retreival.CrawlerRetreiver;
import nu.marginalia.crawl.retreival.DomainProber;
import nu.marginalia.crawl.retreival.fetcher.HttpFetcher;
import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl;
import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
import nu.marginalia.crawling.io.CrawledDomainReader;
import nu.marginalia.crawling.io.CrawledDomainWriter;
import nu.marginalia.crawling.model.CrawledDocument;
@ -15,22 +16,24 @@ import nu.marginalia.crawling.model.CrawledDomain;
import nu.marginalia.crawling.model.SerializableCrawlData;
import nu.marginalia.model.crawlspec.CrawlSpecRecord;
import org.junit.jupiter.api.*;
import org.netpreserve.jwarc.*;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.*;
import java.util.stream.Collectors;
import java.util.zip.GZIPInputStream;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;
@Tag("slow")
class CrawlerRetreiverTest {
private HttpFetcher httpFetcher;
Path tempFile;
Path tempFile2;
@BeforeEach
public void setUp() {
httpFetcher = new HttpFetcherImpl("search.marginalia.nu; testing a bit :D");
@ -43,8 +46,62 @@ class CrawlerRetreiverTest {
System.setProperty("http.agent", WmsaHome.getUserAgent().uaString());
}
@AfterEach
public void tearDown() throws IOException {
if (tempFile != null) {
Files.deleteIfExists(tempFile);
}
if (tempFile2 != null) {
Files.deleteIfExists(tempFile2);
}
}
@Test
public void testWithKnownDomains() {
public void testWarcOutput() throws IOException {
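// Crawl a shallow spec against the live site and verify that every request recorded in the
// WARC has a matching response.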
var specs = CrawlSpecRecord
.builder()
.crawlDepth(5)
.domain("www.marginalia.nu")
.urls(List.of("https://www.marginalia.nu/misc/debian-laptop-install-log/"))
.build();
Path tempFile = null;
try {
tempFile = Files.createTempFile("crawling-process", "warc");
try (var recorder = new WarcRecorder(tempFile)) {
new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder).fetch();
} catch (IOException ex) {
Assertions.fail(ex);
}
Set<String> requests = new HashSet<>();
Set<String> responses = new HashSet<>();
try (var reader = new WarcReader(tempFile)) {
reader.forEach(record -> {
if (record instanceof WarcRequest req) {
requests.add(req.target());
System.out.println(req.type() + ":" + req.target());
}
else if (record instanceof WarcResponse rsp) {
responses.add(rsp.target());
System.out.println(rsp.type() + ":" + rsp.target());
}
else {
System.out.println(record.type());
}
});
}
assertTrue(requests.contains("https://www.marginalia.nu/misc/debian-laptop-install-log/"));
assertEquals(requests, responses);
}
finally {
if (tempFile != null)
Files.deleteIfExists(tempFile);
}
}
@Test
public void testWithKnownDomains() throws IOException {
var specs = CrawlSpecRecord
.builder()
.crawlDepth(5)
@ -54,10 +111,30 @@ class CrawlerRetreiverTest {
List<SerializableCrawlData> data = new ArrayList<>();
new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, data::add).fetch();
tempFile = Files.createTempFile("crawling-process", ".warc");
try (var recorder = new WarcRecorder(tempFile)) {
new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder).fetch();
}
catch (IOException ex) {
Assertions.fail(ex);
}
try (var stream = CrawledDomainReader.createDataStream(tempFile)) {
while (stream.hasNext()) {
if (stream.next() instanceof CrawledDocument doc) {
data.add(doc);
}
}
} catch (Exception e) {
throw new RuntimeException(e);
}
var fetchedUrls =
data.stream().filter(CrawledDocument.class::isInstance)
data.stream()
.peek(System.out::println)
.filter(CrawledDocument.class::isInstance)
.map(CrawledDocument.class::cast)
.map(doc -> doc.url)
.collect(Collectors.toSet());
@ -72,7 +149,7 @@ class CrawlerRetreiverTest {
}
@Test
public void testEmptySet() {
public void testEmptySet() throws IOException {
var specs = CrawlSpecRecord
.builder()
@ -81,9 +158,29 @@ class CrawlerRetreiverTest {
.urls(List.of())
.build();
List<SerializableCrawlData> data = new ArrayList<>();
new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, data::add).fetch();
tempFile = Files.createTempFile("crawling-process", ".warc");
try (var recorder = new WarcRecorder(tempFile)) {
new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder).fetch();
}
catch (IOException ex) {
Assertions.fail(ex);
}
try (var stream = CrawledDomainReader.createDataStream(tempFile)) {
while (stream.hasNext()) {
if (stream.next() instanceof CrawledDocument doc) {
data.add(doc);
}
}
} catch (Exception e) {
throw new RuntimeException(e);
}
data.stream().filter(CrawledDocument.class::isInstance)
.map(CrawledDocument.class::cast)
@ -115,33 +212,70 @@ class CrawlerRetreiverTest {
.build();
Path out = Files.createTempDirectory("crawling-process");
var writer = new CrawledDomainWriter(out, specs.domain, "idid");
tempFile = Files.createTempFile("crawling-process", ".warc.gz");
tempFile2 = Files.createTempFile("crawling-process", ".warc.gz");
Map<Class<? extends SerializableCrawlData>, List<SerializableCrawlData>> data = new HashMap<>();
new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, d -> {
data.computeIfAbsent(d.getClass(), k->new ArrayList<>()).add(d);
if (d instanceof CrawledDocument doc) {
System.out.println(doc.url + ": " + doc.recrawlState + "\t" + doc.httpStatus);
if (Math.random() > 0.5) {
doc.headers = "";
}
}
writer.accept(d);
}).fetch();
writer.close();
try (var recorder = new WarcRecorder(tempFile)) {
new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder).fetch();
}
catch (IOException ex) {
Assertions.fail(ex);
}
var reader = new CrawledDomainReader();
var stream = reader.createDataStream(out, specs.domain, "idid");
try (var stream = CrawledDomainReader.createDataStream(tempFile)) {
while (stream.hasNext()) {
var doc = stream.next();
data.computeIfAbsent(doc.getClass(), c -> new ArrayList<>()).add(doc);
}
} catch (Exception e) {
throw new RuntimeException(e);
}
var stream = CrawledDomainReader.createDataStream(tempFile);
CrawledDomain domain = (CrawledDomain) data.get(CrawledDomain.class).get(0);
domain.doc = data.get(CrawledDocument.class).stream().map(CrawledDocument.class::cast).collect(Collectors.toList());
try (var recorder = new WarcRecorder(tempFile2)) {
new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder).fetch(new DomainLinks(),
new CrawlDataReference(stream));
}
catch (IOException ex) {
Assertions.fail(ex);
}
new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, d -> {
if (d instanceof CrawledDocument doc) {
System.out.println(doc.url + ": " + doc.recrawlState + "\t" + doc.httpStatus);
new GZIPInputStream(Files.newInputStream(tempFile2)).transferTo(System.out);
try (var reader = new WarcReader(tempFile2)) {
WarcXResponseReference.register(reader);
reader.forEach(record -> {
if (record instanceof WarcResponse rsp) {
try {
System.out.println(rsp.type() + ":" + rsp.target() + "/" + rsp.http().status());
} catch (IOException e) {
throw new RuntimeException(e);
}
}
if (record instanceof WarcMetadata rsp) {
System.out.println("meta:" + rsp.target());
}
});
}
try (var ds = CrawledDomainReader.createDataStream(tempFile2)) {
while (ds.hasNext()) {
var doc = ds.next();
if (doc instanceof CrawledDomain dr) {
System.out.println(dr.domain + "/" + dr.crawlerStatus);
}
else if (doc instanceof CrawledDocument dc) {
System.out.println(dc.url + "/" + dc.crawlerStatus + "/" + dc.httpStatus);
}
}
}).fetch(new DomainLinks(), new CrawlDataReference(stream));
} catch (Exception e) {
throw new RuntimeException(e);
}
}
}

View File

@ -32,6 +32,7 @@ public class ConvertActor extends RecordActorPrototype {
public record Convert(FileStorageId fid) implements ActorStep {};
public record ConvertEncyclopedia(String source, String baseUrl) implements ActorStep {};
public record ConvertDirtree(String source) implements ActorStep {};
public record ConvertWarc(String source) implements ActorStep {};
public record ConvertStackexchange(String source) implements ActorStep {};
@Resume(behavior = ActorResumeBehavior.RETRY)
public record ConvertWait(FileStorageId destFid,
@ -74,6 +75,25 @@ public class ConvertActor extends RecordActorPrototype {
mqConverterOutbox.sendAsync(ConvertRequest.forDirtree(sourcePath, processedArea.id()))
);
}
case ConvertWarc(String source) -> {
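// Mirrors the dirtree sideload path: allocate a processed-data area and hand the WARC off to the converter.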
Path sourcePath = Path.of(source);
if (!Files.exists(sourcePath))
yield new Error("Source path does not exist: " + sourcePath);
String fileName = sourcePath.toFile().getName();
var base = storageService.getStorageBase(FileStorageBaseType.STORAGE);
var processedArea = storageService.allocateTemporaryStorage(base,
FileStorageType.PROCESSED_DATA, "processed-data",
"Processed Warc Data; " + fileName);
storageService.setFileStorageState(processedArea.id(), FileStorageState.NEW);
yield new ConvertWait(
processedArea.id(),
mqConverterOutbox.sendAsync(ConvertRequest.forWarc(sourcePath, processedArea.id()))
);
}
case ConvertEncyclopedia(String source, String baseUrl) -> {
Path sourcePath = Path.of(source);

View File

@ -63,8 +63,6 @@ public class ExportAtagsActor extends RecordActorPrototype {
Path inputDir = storageService.getStorage(crawlId).asPath();
var reader = new CrawledDomainReader();
try (var bw = new BufferedWriter(new OutputStreamWriter(new GZIPOutputStream(Files.newOutputStream(tmpFile, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING))));
)
{
@ -78,7 +76,7 @@ public class ExportAtagsActor extends RecordActorPrototype {
}
Path crawlDataPath = inputDir.resolve(item.relPath());
try (var stream = reader.createDataStream(crawlDataPath)) {
try (var stream = CrawledDomainReader.createDataStream(crawlDataPath)) {
exportLinks(tagWriter, stream);
}
catch (Exception ex) {

View File

@ -170,6 +170,7 @@ public class IndexQueryService extends IndexApiImplBase {
}
}
// GRPC endpoint
@SneakyThrows
public void query(nu.marginalia.index.api.RpcIndexQuery request,

View File

@ -29,13 +29,11 @@ public class CrawlDataUnfcker {
return;
}
var reader = new CrawledDomainReader();
try (var wl = new WorkLog(output.resolve("crawler.log"))) {
for (var inputItem : WorkLog.iterable(input.resolve("crawler.log"))) {
Path inputPath = input.resolve(inputItem.relPath());
var domainMaybe = readDomain(reader, inputPath).map(CrawledDomain::getDomain);
var domainMaybe = readDomain(inputPath).map(CrawledDomain::getDomain);
if (domainMaybe.isEmpty())
continue;
var domain = domainMaybe.get();
@ -43,7 +41,7 @@ public class CrawlDataUnfcker {
// Generate conformant ID
String newId = Integer.toHexString(domain.hashCode());
var outputPath = CrawlerOutputFile.createOutputPath(output, newId, domain);
var outputPath = CrawlerOutputFile.createLegacyOutputPath(output, newId, domain);
var outputFileName = outputPath.toFile().getName();
System.out.println(inputPath + " -> " + outputPath);
@ -56,13 +54,13 @@ public class CrawlDataUnfcker {
}
}
static Optional<CrawledDomain> readDomain(CrawledDomainReader reader, Path file) {
static Optional<CrawledDomain> readDomain(Path file) {
if (!Files.exists(file)) {
System.out.println("Missing file " + file);
return Optional.empty();
}
try (var stream = reader.createDataStream(file)) {
try (var stream = CrawledDomainReader.createDataStream(file)) {
while (stream.hasNext()) {
if (stream.next() instanceof CrawledDomain domain) {
return Optional.of(domain);

View File

@ -50,10 +50,9 @@ public class ExperimentRunnerMain {
experiment.args(Arrays.copyOfRange(args, 2, args.length));
Path basePath = Path.of(args[0]);
var reader = new CrawledDomainReader();
for (var item : WorkLog.iterable(basePath.resolve("crawler.log"))) {
Path crawlDataPath = basePath.resolve(item.relPath());
try (var stream = reader.createDataStream(crawlDataPath)) {
try (var stream = CrawledDomainReader.createDataStream(crawlDataPath)) {
experiment.process(stream);
}
catch (Exception ex) {

View File

@ -5,12 +5,12 @@ import nu.marginalia.crawling.model.CrawledDocument;
import nu.marginalia.crawling.model.CrawledDomain;
import java.io.IOException;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.List;
public abstract class LegacyExperiment extends Experiment {
public abstract boolean process(CrawledDomain domain);
@Override
public boolean process(SerializableCrawlDataStream dataStream) throws IOException {
List<CrawledDocument> documentList = new ArrayList<>();

View File

@ -41,6 +41,7 @@ include 'code:features-convert:topic-detection'
include 'code:features-crawl:crawl-blocklist'
include 'code:features-crawl:link-parser'
include 'code:features-crawl:content-type'
include 'code:features-index:index-journal'
include 'code:features-index:index-query'
@ -154,6 +155,8 @@ dependencyResolutionManagement {
library('duckdb', 'org.duckdb', 'duckdb_jdbc').version('0.9.1')
library('okhttp3','com.squareup.okhttp3','okhttp').version('4.11.0')
library('jwarc', 'org.netpreserve', 'jwarc').version('0.28.5')
library('httpcomponents.core','org.apache.httpcomponents','httpcore').version('4.4.15')
library('httpcomponents.client','org.apache.httpcomponents','httpclient').version('4.5.13')
library('commons.net', 'commons-net','commons-net').version('3.9.0')

View File

@ -13,6 +13,7 @@ import org.apache.parquet.io.DelegatingSeekableInputStream;
import org.apache.parquet.io.InputFile;
import org.apache.parquet.io.SeekableInputStream;
import org.apache.parquet.io.api.GroupConverter;
import org.apache.parquet.schema.LogicalTypeAnnotation;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.PrimitiveType;
@ -144,7 +145,11 @@ public final class ParquetReader<U, S> implements Spliterator<S>, Closeable {
case BINARY:
case FIXED_LEN_BYTE_ARRAY:
case INT96:
return primitiveType.stringifier().stringify(columnReader.getBinary());
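// Binary columns without a logical type annotation are handed back as raw bytes;
// annotated (e.g. string) columns keep the stringified representation.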
if (primitiveType.getLogicalTypeAnnotation() == null) {
return columnReader.getBinary().getBytes();
} else {
return primitiveType.stringifier().stringify(columnReader.getBinary());
}
case BOOLEAN:
return columnReader.getBoolean();
case DOUBLE:

View File

@ -242,7 +242,7 @@ public final class ParquetWriter<T> implements Closeable {
if (type.getLogicalTypeAnnotation() == LogicalTypeAnnotation.stringType()) {
recordConsumer.addBinary(Binary.fromString((String)value));
} else {
throw new UnsupportedOperationException("We don't support writing logical annotation type " + type.getLogicalTypeAnnotation());
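// Instead of rejecting unannotated binary types, write the value through as a raw byte array.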
recordConsumer.addBinary(Binary.fromConstantByteArray((byte[])value));
}
break;
default: