(warc) More accurate filtering of advisory records

We want to mute some of these advisory records so that they don't produce documents, but in some cases we still want a document to be produced for accounting purposes.

Added improved tests that fetch known resources on www.marginalia.nu to test the behavior when encountering bad content types and 404s.

The commit also adds some safety try-catch blocks around the charset handling, as it may sometimes explode when fed incorrect data, and we do be guessing...
Viktor Lofgren 2023-12-15 21:31:16 +01:00
parent 5329968155
commit 2e7db61808
5 changed files with 76 additions and 18 deletions
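
At a glance, the filtering contract the commit settles on, restated from the diffs below as a standalone sketch. The enum values and content-type strings are taken from the diff; the helper class itself is illustrative and not part of the codebase:

import java.util.Optional;

enum CrawlerDocumentStatus { OK, BAD_CONTENT_TYPE, BAD_CHARSET }

class AdvisoryFilterSketch {
    /**
     * Decides what a reader should do with a record based on its content type:
     * empty means "mute" (emit no document); a present status means a document
     * is emitted for accounting purposes.
     */
    static Optional<CrawlerDocumentStatus> documentStatusFor(String contentType) {
        if (contentType.startsWith("x-marginalia/advisory;state=content-type-failed-probe"))
            return Optional.of(CrawlerDocumentStatus.BAD_CONTENT_TYPE); // kept, for accounting
        if (contentType.startsWith("x-marginalia/advisory"))
            return Optional.empty(); // other advisory records are muted
        return Optional.of(CrawlerDocumentStatus.OK); // an ordinary document
    }
}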

View File

@@ -23,6 +23,7 @@ dependencies {
     implementation project(':code:features-crawl:content-type')
     implementation project(':code:libraries:language-processing')
     implementation project(':third-party:parquet-floor')
+    implementation project(':third-party:commons-codec')
     implementation libs.bundles.slf4j

View File

@@ -39,8 +39,13 @@ public class DocumentBodyExtractor {
     private static DocumentBodyResult<String> toStringResult(ContentType contentType, byte[] bytes) {
         if (contentTypeLogic.isAllowableContentType(contentType)) {
+            try {
+                return new DocumentBodyResult.Ok<>(contentType, DocumentBodyToString.getStringData(contentType, bytes));
+            }
+            catch (Exception ex) {
+                return new DocumentBodyResult.Error<>(CrawlerDocumentStatus.BAD_CONTENT_TYPE, "");
+            }
         }
         else {
             return new DocumentBodyResult.Error<>(CrawlerDocumentStatus.BAD_CONTENT_TYPE, "");
         }
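
The new try-catch is the guard mentioned in the commit message: converting bytes to a string involves resolving a charset that was guessed from headers or content, and that resolution can throw on malformed input. A minimal illustration of the failure mode, assuming the charset name comes from an untrusted Content-Type header (the internals of DocumentBodyToString.getStringData are not shown in this diff):

import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException;
import java.nio.charset.UnsupportedCharsetException;

public class CharsetGuessDemo {
    public static void main(String[] args) {
        // A misbehaving server can send garbage in the charset parameter:
        String header = "text/html; charset=utf 8!!";
        String charsetName = header.substring(header.indexOf("charset=") + "charset=".length());

        try {
            Charset cs = Charset.forName(charsetName);
            System.out.println("resolved " + cs);
        } catch (IllegalCharsetNameException | UnsupportedCharsetException ex) {
            // This is the kind of explosion the try-catch above absorbs,
            // turning it into a BAD_CONTENT_TYPE error result instead of a crash.
            System.out.println("bad charset: '" + charsetName + "'");
        }
    }
}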

View File

@@ -4,12 +4,10 @@ import lombok.SneakyThrows;
 import nu.marginalia.contenttype.ContentType;
 import nu.marginalia.contenttype.DocumentBodyToString;
 import nu.marginalia.crawling.io.SerializableCrawlDataStream;
-import nu.marginalia.crawling.model.CrawledDocument;
-import nu.marginalia.crawling.model.CrawledDomain;
-import nu.marginalia.crawling.model.CrawlerDomainStatus;
-import nu.marginalia.crawling.model.SerializableCrawlData;
+import nu.marginalia.crawling.model.*;
 import nu.marginalia.crawling.parquet.CrawledDocumentParquetRecord;
 import nu.marginalia.crawling.parquet.CrawledDocumentParquetRecordFileReader;
+import nu.marginalia.hash.MurmurHash3_128;
 import nu.marginalia.model.EdgeUrl;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -22,8 +20,9 @@ import java.util.*;
 public class ParquetSerializableCrawlDataStream implements AutoCloseable, SerializableCrawlDataStream {
     private static final Logger logger = LoggerFactory.getLogger(ParquetSerializableCrawlDataStream.class);
+    private final MurmurHash3_128 hash = new MurmurHash3_128();
     private final Iterator<CrawledDocumentParquetRecord> backingIterator;
-    private Deque<SerializableCrawlData> nextQ = new ArrayDeque<>();
+    private final Deque<SerializableCrawlData> nextQ = new ArrayDeque<>();
     private boolean wroteDomainRecord = false;
     private final Path path;
@@ -64,14 +63,13 @@ public class ParquetSerializableCrawlDataStream implements AutoCloseable, SerializableCrawlDataStream {
             status = CrawlerDomainStatus.REDIRECT;
         }
         else if (parquetRecord.contentType.equals("x-marginalia/advisory;state=blocked")) {
-            status = CrawlerDomainStatus.BLOCKED; // FIXME we don't write this yet
+            status = CrawlerDomainStatus.BLOCKED;
         }
         else if (parquetRecord.contentType.equals("x-marginalia/advisory;state=error")) {
             status = CrawlerDomainStatus.ERROR;
             statusReason = new String(parquetRecord.body);
         }
 
-        // FIXME -- cookies
         nextQ.add(new CrawledDomain(
                 parquetRecord.domain,
                 redirectDomain,
@@ -84,25 +82,36 @@ public class ParquetSerializableCrawlDataStream implements AutoCloseable, SerializableCrawlDataStream {
     }
 
     private void createDocumentRecord(CrawledDocumentParquetRecord nextRecord) {
-        if (nextRecord.contentType.startsWith("x-marginalia/advisory")) {
+        String bodyString = "";
+        CrawlerDocumentStatus status = CrawlerDocumentStatus.OK;
+
+        if (nextRecord.contentType.startsWith("x-marginalia/advisory;state=content-type-failed-probe")) {
+            status = CrawlerDocumentStatus.BAD_CONTENT_TYPE;
+        }
+        else if (nextRecord.contentType.startsWith("x-marginalia/advisory")) { // other advisory stuff we don't want
             return;
         }
-        String bodyString = DocumentBodyToString.getStringData(
+        else {
+            try {
+                bodyString = DocumentBodyToString.getStringData(
                         ContentType.parse(nextRecord.contentType),
                         nextRecord.body);
+            } catch (Exception ex) {
+                logger.error("Failed to convert body to string", ex);
+                status = CrawlerDocumentStatus.BAD_CHARSET;
+            }
+        }
 
-        // FIXME -- a lot of these fields are not set properly!
         nextQ.add(new CrawledDocument("",
                 nextRecord.url,
                 nextRecord.contentType,
                 nextRecord.timestamp.toString(),
                 nextRecord.httpStatus,
-                "OK",
+                status.toString(),
                 "",
                 "",
                 bodyString,
                 "",
+                Long.toHexString(hash.hashNearlyASCII(bodyString)), // this field isn't actually used, maybe we can skip calculating it?
                 nextRecord.url,
                 null,
                 "",

View File

@@ -168,8 +168,8 @@ public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable {
                 false,
                 0,
                 date,
-                "x-marginalia/advisory;state=error",
-                errorStatus.getBytes()
+                errorStatus,
+                new byte[0]
         );
     }
 }
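
The change above stops hard-coding the generic error state: the caller now passes the specific advisory content type through errorStatus, and the body is left empty rather than duplicating the status text. The x-marginalia/advisory;state=... strings used throughout this commit follow a small convention; a standalone sketch of it (the helper class is illustrative, not part of the codebase):

class AdvisoryContentTypes {
    static final String PREFIX = "x-marginalia/advisory";

    // Builds e.g. "x-marginalia/advisory;state=error"
    static String forState(String state) {
        return PREFIX + ";state=" + state;
    }

    // Extracts "blocked" from "x-marginalia/advisory;state=blocked",
    // or returns null for non-advisory content types
    static String stateOf(String contentType) {
        if (!contentType.startsWith(PREFIX)) return null;
        int i = contentType.indexOf(";state=");
        return i >= 0 ? contentType.substring(i + ";state=".length()) : null;
    }
}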

View File

@@ -28,7 +28,9 @@ import java.nio.file.Files;
 import java.nio.file.Path;
 import java.util.ArrayList;
 import java.util.List;
+import java.util.Set;
 import java.util.function.Predicate;
+import java.util.stream.Collectors;
 
 import static org.junit.jupiter.api.Assertions.*;
@@ -168,6 +170,47 @@ public class CrawlingThenConvertingIntegrationTest {
     }
 
+    @Test
+    public void crawlContentTypes() throws IOException {
+        var specs = CrawlSpecRecord.builder()
+                .domain("www.marginalia.nu")
+                .crawlDepth(5)
+                .urls(List.of(
+                        "https://www.marginalia.nu/sanic.png",
+                        "https://www.marginalia.nu/invalid"
+                ))
+                .build();
+
+        CrawledDomain domain = crawl(specs);
+        assertFalse(domain.doc.isEmpty());
+        assertEquals("OK", domain.crawlerStatus);
+        assertEquals("www.marginalia.nu", domain.domain);
+
+        Set<String> allUrls = domain.doc.stream().map(doc -> doc.url).collect(Collectors.toSet());
+        assertTrue(allUrls.contains("https://www.marginalia.nu/sanic.png"), "Should have record for image despite blocked content type");
+        assertTrue(allUrls.contains("https://www.marginalia.nu/invalid"), "Should have record for invalid URL");
+
+        var output = process();
+
+        assertNotNull(output);
+        assertFalse(output.documents.isEmpty());
+        assertEquals(new EdgeDomain("www.marginalia.nu"), output.domain);
+        assertEquals(DomainIndexingState.ACTIVE, output.state);
+
+        for (var doc : output.documents) {
+            if (doc.isOk()) {
+                System.out.println(doc.url + "\t" + doc.state + "\t" + doc.details.title);
+            }
+            else {
+                System.out.println(doc.url + "\t" + doc.state + "\t" + doc.stateReason);
+            }
+        }
+    }
+
     private ProcessedDomain process() {
         try (var stream = new ParquetSerializableCrawlDataStream(fileName2)) {
             return domainProcessor.process(stream);