(warc) More accurate filtering of advisory records

Additionally, create records for resources that were blocked due to robots.txt, and add a test to verify this happens.
This commit is contained in:
Viktor Lofgren 2023-12-15 21:37:02 +01:00
parent 2e7db61808
commit 0f9cd9c87d
2 changed files with 40 additions and 0 deletions

View File

@ -88,6 +88,9 @@ public class ParquetSerializableCrawlDataStream implements AutoCloseable, Serial
if (nextRecord.contentType.startsWith("x-marginalia/advisory;state=content-type-failed-probe")) { if (nextRecord.contentType.startsWith("x-marginalia/advisory;state=content-type-failed-probe")) {
status = CrawlerDocumentStatus.BAD_CONTENT_TYPE; status = CrawlerDocumentStatus.BAD_CONTENT_TYPE;
} }
else if (nextRecord.contentType.startsWith("x-marginalia/advisory;state=robots-txt-skipped")) {
status = CrawlerDocumentStatus.ROBOTS_TXT;
}
else if (nextRecord.contentType.startsWith("x-marginalia/advisory")) { // other advisory stuff we don't want else if (nextRecord.contentType.startsWith("x-marginalia/advisory")) { // other advisory stuff we don't want
return; return;
} }

View File

@ -211,6 +211,43 @@ public class CrawlingThenConvertingIntegrationTest {
} }
/**
 * Crawls search.marginalia.nu starting from a search URL, checks that the
 * crawl output still contains a record for the "/search" entity (which the
 * assertion message describes as forbidden), and then runs the crawl output
 * through processing, asserting on the resulting domain and printing a
 * one-line summary per processed document.
 */
@Test
public void crawlRobotsTxt() throws IOException {
    // Single seed URL; the record asserted on below is for the same path without the query string.
    var robotsSpec = CrawlSpecRecord.builder()
            .domain("search.marginalia.nu")
            .crawlDepth(5)
            .urls(List.of("https://search.marginalia.nu/search?q=hello+world"))
            .build();

    CrawledDomain crawled = crawl(robotsSpec);

    // Crawl-level expectations.
    assertFalse(crawled.doc.isEmpty());
    assertEquals("OK", crawled.crawlerStatus);
    assertEquals("search.marginalia.nu", crawled.domain);

    boolean hasForbiddenRecord = crawled.doc.stream()
            .map(doc -> doc.url)
            .anyMatch("https://search.marginalia.nu/search"::equals);
    assertTrue(hasForbiddenRecord, "We expect a record for entities that are forbidden");

    // Processing-level expectations.
    var processed = process();

    assertNotNull(processed);
    assertFalse(processed.documents.isEmpty());
    assertEquals(new EdgeDomain("search.marginalia.nu"), processed.domain);
    assertEquals(DomainIndexingState.ACTIVE, processed.state);

    // Print url, state, and either the title (ok documents) or the state reason (everything else).
    for (var doc : processed.documents) {
        String line = doc.isOk()
                ? doc.url + "\t" + doc.state + "\t" + doc.details.title
                : doc.url + "\t" + doc.state + "\t" + doc.stateReason;
        System.out.println(line);
    }
}
private ProcessedDomain process() { private ProcessedDomain process() {
try (var stream = new ParquetSerializableCrawlDataStream(fileName2)) { try (var stream = new ParquetSerializableCrawlDataStream(fileName2)) {
return domainProcessor.process(stream); return domainProcessor.process(stream);