(warc) More accurate filtering of advisory records
Additionally, create records for resources that were blocked due to robots.txt, and add tests to verify that this happens.
This commit is contained in:
parent
2e7db61808
commit
0f9cd9c87d
@ -88,6 +88,9 @@ public class ParquetSerializableCrawlDataStream implements AutoCloseable, Serial
|
||||
if (nextRecord.contentType.startsWith("x-marginalia/advisory;state=content-type-failed-probe")) {
|
||||
status = CrawlerDocumentStatus.BAD_CONTENT_TYPE;
|
||||
}
|
||||
else if (nextRecord.contentType.startsWith("x-marginalia/advisory;state=robots-txt-skipped")) {
|
||||
status = CrawlerDocumentStatus.ROBOTS_TXT;
|
||||
}
|
||||
else if (nextRecord.contentType.startsWith("x-marginalia/advisory")) { // other advisory stuff we don't want
|
||||
return;
|
||||
}
|
||||
|
@ -211,6 +211,43 @@ public class CrawlingThenConvertingIntegrationTest {
|
||||
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void crawlRobotsTxt() throws IOException {
|
||||
var specs = CrawlSpecRecord.builder()
|
||||
.domain("search.marginalia.nu")
|
||||
.crawlDepth(5)
|
||||
.urls(List.of(
|
||||
"https://search.marginalia.nu/search?q=hello+world"
|
||||
))
|
||||
.build();
|
||||
|
||||
CrawledDomain domain = crawl(specs);
|
||||
assertFalse(domain.doc.isEmpty());
|
||||
assertEquals("OK", domain.crawlerStatus);
|
||||
assertEquals("search.marginalia.nu", domain.domain);
|
||||
|
||||
Set<String> allUrls = domain.doc.stream().map(doc -> doc.url).collect(Collectors.toSet());
|
||||
assertTrue(allUrls.contains("https://search.marginalia.nu/search"), "We expect a record for entities that are forbidden");
|
||||
|
||||
var output = process();
|
||||
|
||||
assertNotNull(output);
|
||||
assertFalse(output.documents.isEmpty());
|
||||
assertEquals(new EdgeDomain("search.marginalia.nu"), output.domain);
|
||||
assertEquals(DomainIndexingState.ACTIVE, output.state);
|
||||
|
||||
for (var doc : output.documents) {
|
||||
if (doc.isOk()) {
|
||||
System.out.println(doc.url + "\t" + doc.state + "\t" + doc.details.title);
|
||||
}
|
||||
else {
|
||||
System.out.println(doc.url + "\t" + doc.state + "\t" + doc.stateReason);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private ProcessedDomain process() {
|
||||
try (var stream = new ParquetSerializableCrawlDataStream(fileName2)) {
|
||||
return domainProcessor.process(stream);
|
||||
|
Loading…
Reference in New Issue
Block a user