(warc) More accurate filtering of advisory records
We want to mute some of these advisory records so that they don't produce documents, but in some cases we still want a document to be produced for accounting purposes. Added improved tests that reach for known resources on www.marginalia.nu to exercise the behavior when encountering bad content types and 404s. The commit also adds some safety try-catches around the charset handling, as it may sometimes explode when fed incorrect data, and we do be guessing...
parent 5329968155
commit 2e7db61808
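As a rough summary of the intended behavior, here is a minimal sketch of the filtering rules the reader now applies. The standalone class, enum, and method names below are illustrative only; just the content-type strings and the status names come from the actual change: a failed content-type probe still yields a document flagged BAD_CONTENT_TYPE for accounting, any other x-marginalia/advisory record is muted, and ordinary records fall back to BAD_CHARSET rather than crashing when the guessed charset turns out to be wrong.

```java
import java.util.Optional;

// Illustrative sketch only -- not the actual ParquetSerializableCrawlDataStream code.
// Only the content-type strings and status names are taken from the diff below.
class AdvisoryFilterSketch {
    enum Status { OK, BAD_CONTENT_TYPE, BAD_CHARSET }

    /** Decide what a parquet record should turn into: a status, or nothing (muted). */
    static Optional<Status> classify(String contentType, boolean bodyDecodedOk) {
        if (contentType.startsWith("x-marginalia/advisory;state=content-type-failed-probe")) {
            // keep a record for accounting purposes, flagged as a bad content type
            return Optional.of(Status.BAD_CONTENT_TYPE);
        }
        if (contentType.startsWith("x-marginalia/advisory")) {
            // all other advisory records are muted and produce no document
            return Optional.empty();
        }
        // ordinary records: charset decoding is guesswork and may fail,
        // in which case the document is kept but downgraded to BAD_CHARSET
        return Optional.of(bodyDecodedOk ? Status.OK : Status.BAD_CHARSET);
    }
}
```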
@@ -23,6 +23,7 @@ dependencies {
    implementation project(':code:features-crawl:content-type')
    implementation project(':code:libraries:language-processing')
    implementation project(':third-party:parquet-floor')
    implementation project(':third-party:commons-codec')

    implementation libs.bundles.slf4j

@@ -39,7 +39,12 @@ public class DocumentBodyExtractor {

    private static DocumentBodyResult<String> toStringResult(ContentType contentType, byte[] bytes) {
        if (contentTypeLogic.isAllowableContentType(contentType)) {
-           return new DocumentBodyResult.Ok<>(contentType, DocumentBodyToString.getStringData(contentType, bytes));
+           try {
+               return new DocumentBodyResult.Ok<>(contentType, DocumentBodyToString.getStringData(contentType, bytes));
+           }
+           catch (Exception ex) {
+               return new DocumentBodyResult.Error<>(CrawlerDocumentStatus.BAD_CONTENT_TYPE, "");
+           }
        }
        else {
            return new DocumentBodyResult.Error<>(CrawlerDocumentStatus.BAD_CONTENT_TYPE, "");
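For context on why the defensive catch above is warranted, here is a generic illustration (not code from this repository): charset lookup in the JDK throws unchecked exceptions when fed a malformed or unknown charset name, which is exactly what a guessed content type scraped from the wild can contain.

```java
import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException;
import java.nio.charset.UnsupportedCharsetException;

class CharsetGuessDemo {
    public static void main(String[] args) {
        // A content-type header from a misbehaving server may carry a nonsense charset;
        // Charset.forName() then throws rather than returning a fallback.
        for (String guess : new String[] { "utf-8", "utf 8\"", "not-a-real-charset" }) {
            try {
                System.out.println(guess + " -> " + Charset.forName(guess));
            } catch (IllegalCharsetNameException | UnsupportedCharsetException ex) {
                System.out.println(guess + " -> " + ex.getClass().getSimpleName());
            }
        }
    }
}
```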
@@ -4,12 +4,10 @@ import lombok.SneakyThrows;
import nu.marginalia.contenttype.ContentType;
import nu.marginalia.contenttype.DocumentBodyToString;
import nu.marginalia.crawling.io.SerializableCrawlDataStream;
-import nu.marginalia.crawling.model.CrawledDocument;
-import nu.marginalia.crawling.model.CrawledDomain;
-import nu.marginalia.crawling.model.CrawlerDomainStatus;
-import nu.marginalia.crawling.model.SerializableCrawlData;
+import nu.marginalia.crawling.model.*;
import nu.marginalia.crawling.parquet.CrawledDocumentParquetRecord;
import nu.marginalia.crawling.parquet.CrawledDocumentParquetRecordFileReader;
+import nu.marginalia.hash.MurmurHash3_128;
import nu.marginalia.model.EdgeUrl;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -22,8 +20,9 @@ import java.util.*;
public class ParquetSerializableCrawlDataStream implements AutoCloseable, SerializableCrawlDataStream {
    private static final Logger logger = LoggerFactory.getLogger(ParquetSerializableCrawlDataStream.class);

+   private final MurmurHash3_128 hash = new MurmurHash3_128();
    private final Iterator<CrawledDocumentParquetRecord> backingIterator;
-   private Deque<SerializableCrawlData> nextQ = new ArrayDeque<>();
+   private final Deque<SerializableCrawlData> nextQ = new ArrayDeque<>();
    private boolean wroteDomainRecord = false;
    private final Path path;

@@ -64,14 +63,13 @@ public class ParquetSerializableCrawlDataStream implements AutoCloseable, SerializableCrawlDataStream {
            status = CrawlerDomainStatus.REDIRECT;
        }
        else if (parquetRecord.contentType.equals("x-marginalia/advisory;state=blocked")) {
-           status = CrawlerDomainStatus.BLOCKED; // FIXME we don't write this yet
+           status = CrawlerDomainStatus.BLOCKED;
        }
        else if (parquetRecord.contentType.equals("x-marginalia/advisory;state=error")) {
            status = CrawlerDomainStatus.ERROR;
            statusReason = new String(parquetRecord.body);
        }

        // FIXME -- cookies
        nextQ.add(new CrawledDomain(
                parquetRecord.domain,
                redirectDomain,
@@ -84,25 +82,36 @@ public class ParquetSerializableCrawlDataStream implements AutoCloseable, SerializableCrawlDataStream {
    }

    private void createDocumentRecord(CrawledDocumentParquetRecord nextRecord) {
-       if (nextRecord.contentType.startsWith("x-marginalia/advisory")) {
+       String bodyString = "";
+       CrawlerDocumentStatus status = CrawlerDocumentStatus.OK;
+
+       if (nextRecord.contentType.startsWith("x-marginalia/advisory;state=content-type-failed-probe")) {
+           status = CrawlerDocumentStatus.BAD_CONTENT_TYPE;
+       }
+       else if (nextRecord.contentType.startsWith("x-marginalia/advisory")) { // other advisory stuff we don't want
            return;
        }
+       else {
+           try {
+               bodyString = DocumentBodyToString.getStringData(
+                       ContentType.parse(nextRecord.contentType),
+                       nextRecord.body);
+           } catch (Exception ex) {
+               logger.error("Failed to convert body to string", ex);
+               status = CrawlerDocumentStatus.BAD_CHARSET;
+           }
+       }

-       String bodyString = DocumentBodyToString.getStringData(
-               ContentType.parse(nextRecord.contentType),
-               nextRecord.body);

        // FIXME -- a lot of these fields are not set properly!
        nextQ.add(new CrawledDocument("",
                nextRecord.url,
                nextRecord.contentType,
                nextRecord.timestamp.toString(),
                nextRecord.httpStatus,
                "OK",
+               status.toString(),
                "",
                "",
                bodyString,
                "",
+               Long.toHexString(hash.hashNearlyASCII(bodyString)), // this field isn't actually used, maybe we can skip calculating it?
                nextRecord.url,
                null,
                "",
@@ -168,8 +168,8 @@ public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable {
                false,
                0,
                date,
-               "x-marginalia/advisory;state=error",
-               errorStatus.getBytes()
+               errorStatus,
+               new byte[0]
                );
        }
}
@@ -28,7 +28,9 @@ import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;
+import java.util.Set;
import java.util.function.Predicate;
+import java.util.stream.Collectors;

import static org.junit.jupiter.api.Assertions.*;

@@ -168,6 +170,47 @@ public class CrawlingThenConvertingIntegrationTest {

    }

+
+
+    @Test
+    public void crawlContentTypes() throws IOException {
+        var specs = CrawlSpecRecord.builder()
+                .domain("www.marginalia.nu")
+                .crawlDepth(5)
+                .urls(List.of(
+                        "https://www.marginalia.nu/sanic.png",
+                        "https://www.marginalia.nu/invalid"
+                ))
+                .build();
+
+        CrawledDomain domain = crawl(specs);
+        assertFalse(domain.doc.isEmpty());
+        assertEquals("OK", domain.crawlerStatus);
+        assertEquals("www.marginalia.nu", domain.domain);
+
+        Set<String> allUrls = domain.doc.stream().map(doc -> doc.url).collect(Collectors.toSet());
+        assertTrue(allUrls.contains("https://www.marginalia.nu/sanic.png"), "Should have a record for the image despite its blocked content type");
+        assertTrue(allUrls.contains("https://www.marginalia.nu/invalid"), "Should have a record for the invalid URL");
+
+        var output = process();
+
+        assertNotNull(output);
+        assertFalse(output.documents.isEmpty());
+        assertEquals(new EdgeDomain("www.marginalia.nu"), output.domain);
+        assertEquals(DomainIndexingState.ACTIVE, output.state);
+
+
+        for (var doc : output.documents) {
+            if (doc.isOk()) {
+                System.out.println(doc.url + "\t" + doc.state + "\t" + doc.details.title);
+            }
+            else {
+                System.out.println(doc.url + "\t" + doc.state + "\t" + doc.stateReason);
+            }
+        }
+
+    }
+
    private ProcessedDomain process() {
        try (var stream = new ParquetSerializableCrawlDataStream(fileName2)) {
            return domainProcessor.process(stream);