(warc) More accurate filtering of advisory records
Additionally, create records for resources that were blocked due to robots.txt, along with tests to verify that this happens.
parent 2e7db61808
commit 0f9cd9c87d
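
The gist of the change, before the diffs: a URL that robots.txt forbids now leaves a synthetic advisory record in the crawl data instead of vanishing from the output. The sketch below illustrates that idea in isolation; RobotsTxtAdvisorySketch, CrawledRecord, and isAllowed are hypothetical stand-ins rather than the project's actual API, and only the advisory content-type string is taken from the commit itself.

    import java.util.ArrayList;
    import java.util.List;

    public class RobotsTxtAdvisorySketch {
        // hypothetical stand-in for the crawler's record type
        record CrawledRecord(String url, String contentType, String body) {}

        // hypothetical stand-in for consulting parsed robots.txt rules
        static boolean isAllowed(String url) {
            return !url.contains("/search");
        }

        public static void main(String[] args) {
            List<CrawledRecord> records = new ArrayList<>();
            for (String url : List.of("https://search.marginalia.nu/",
                                      "https://search.marginalia.nu/search?q=hello+world")) {
                if (!isAllowed(url)) {
                    // the skipped resource still leaves a trace in the crawl data
                    records.add(new CrawledRecord(url,
                            "x-marginalia/advisory;state=robots-txt-skipped", ""));
                }
                else {
                    records.add(new CrawledRecord(url, "text/html", "<html>...</html>"));
                }
            }
            records.forEach(r -> System.out.println(r.url() + "\t" + r.contentType()));
        }
    }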
@@ -88,6 +88,9 @@ public class ParquetSerializableCrawlDataStream implements AutoCloseable, Serial
         if (nextRecord.contentType.startsWith("x-marginalia/advisory;state=content-type-failed-probe")) {
             status = CrawlerDocumentStatus.BAD_CONTENT_TYPE;
         }
+        else if (nextRecord.contentType.startsWith("x-marginalia/advisory;state=robots-txt-skipped")) {
+            status = CrawlerDocumentStatus.ROBOTS_TXT;
+        }
         else if (nextRecord.contentType.startsWith("x-marginalia/advisory")) { // other advisory stuff we don't want
             return;
         }
@@ -211,6 +211,43 @@ public class CrawlingThenConvertingIntegrationTest {
 
     }
 
+
+    @Test
+    public void crawlRobotsTxt() throws IOException {
+        var specs = CrawlSpecRecord.builder()
+                .domain("search.marginalia.nu")
+                .crawlDepth(5)
+                .urls(List.of(
+                        "https://search.marginalia.nu/search?q=hello+world"
+                ))
+                .build();
+
+        CrawledDomain domain = crawl(specs);
+        assertFalse(domain.doc.isEmpty());
+        assertEquals("OK", domain.crawlerStatus);
+        assertEquals("search.marginalia.nu", domain.domain);
+
+        Set<String> allUrls = domain.doc.stream().map(doc -> doc.url).collect(Collectors.toSet());
+        assertTrue(allUrls.contains("https://search.marginalia.nu/search"), "We expect a record for entities that are forbidden");
+
+        var output = process();
+
+        assertNotNull(output);
+        assertFalse(output.documents.isEmpty());
+        assertEquals(new EdgeDomain("search.marginalia.nu"), output.domain);
+        assertEquals(DomainIndexingState.ACTIVE, output.state);
+
+        for (var doc : output.documents) {
+            if (doc.isOk()) {
+                System.out.println(doc.url + "\t" + doc.state + "\t" + doc.details.title);
+            }
+            else {
+                System.out.println(doc.url + "\t" + doc.state + "\t" + doc.stateReason);
+            }
+        }
+
+    }
+
     private ProcessedDomain process() {
         try (var stream = new ParquetSerializableCrawlDataStream(fileName2)) {
             return domainProcessor.process(stream);