(crawler) Update CrawlingThenConvertingIntegrationTest

This commit updates CrawlingThenConvertingIntegrationTest with additional tests for invalid, redirecting, and blocked domains. It also improves ParquetSerializableCrawlDataStream to filter out irrelevant advisory entries so they are no longer emitted as documents.
Viktor Lofgren 2023-12-15 21:04:06 +01:00
parent 2e536e3141
commit 5329968155
2 changed files with 111 additions and 12 deletions
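
The "irrelevant entries" mentioned in the commit message are the crawler's synthetic advisory records (content types under "x-marginalia/advisory", such as the redirect marker visible in the first hunk), which describe domain status rather than fetched content. A minimal sketch of the filtering rule, using only what the hunks below show; the class and helper name are hypothetical:

    import nu.marginalia.crawling.parquet.CrawledDocumentParquetRecord;

    final class AdvisoryRecords {
        // Illustration only: advisory records carry crawl status (e.g. the
        // redirect marker seen in the first hunk), not page content, so the
        // parquet stream should skip them instead of emitting documents.
        static boolean isAdvisory(CrawledDocumentParquetRecord record) {
            return record.contentType.startsWith("x-marginalia/advisory");
        }
    }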

ParquetSerializableCrawlDataStream.java

@@ -58,7 +58,7 @@ public class ParquetSerializableCrawlDataStream implements AutoCloseable, Serial
String statusReason = "";
String redirectDomain = null;
if (parquetRecord.contentType.equals("x-marginalia/advisory;state=redir")) {
if (parquetRecord.contentType.equals("x-marginalia/advisory;state=redirect")) {
EdgeUrl crawledUrl = new EdgeUrl(parquetRecord.url);
redirectDomain = crawledUrl.getDomain().toString();
status = CrawlerDomainStatus.REDIRECT;
@@ -84,6 +84,10 @@ public class ParquetSerializableCrawlDataStream implements AutoCloseable, Serial
}
private void createDocumentRecord(CrawledDocumentParquetRecord nextRecord) {
if (nextRecord.contentType.startsWith("x-marginalia/advisory")) {
return;
}
String bodyString = DocumentBodyToString.getStringData(
ContentType.parse(nextRecord.contentType),
nextRecord.body);

CrawlingThenConvertingIntegrationTest.java

@@ -4,35 +4,44 @@ import com.google.inject.Guice;
import com.google.inject.Injector;
import lombok.SneakyThrows;
import nu.marginalia.WmsaHome;
import nu.marginalia.converting.model.ProcessedDomain;
import nu.marginalia.converting.processor.DomainProcessor;
import nu.marginalia.crawl.retreival.CrawlerRetreiver;
import nu.marginalia.crawl.retreival.DomainProber;
import nu.marginalia.crawl.retreival.fetcher.HttpFetcher;
import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl;
import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
import nu.marginalia.crawling.io.SerializableCrawlDataStream;
import nu.marginalia.crawling.io.format.ParquetSerializableCrawlDataStream;
import nu.marginalia.crawling.io.format.WarcSerializableCrawlDataStream;
import nu.marginalia.crawling.model.CrawledDocument;
import nu.marginalia.crawling.model.CrawledDomain;
import nu.marginalia.crawling.model.SerializableCrawlData;
import nu.marginalia.crawling.parquet.CrawledDocumentParquetRecord;
import nu.marginalia.crawling.parquet.CrawledDocumentParquetRecordFileWriter;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.crawl.DomainIndexingState;
import nu.marginalia.model.crawlspec.CrawlSpecRecord;
import org.junit.jupiter.api.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;
import java.util.function.Predicate;
/* This is mostly a debugging utility */
import static org.junit.jupiter.api.Assertions.*;
/** Tests for the crawler and converter integration. These are pretty slow and potentially
* a bit flaky, since they attempt to fetch real websites.
*/
@Tag("slow")
public class CrawlingThenConvertingIntegrationTest {
private DomainProcessor domainProcessor;
private HttpFetcher httpFetcher;
private static final Logger logger = LoggerFactory.getLogger(CrawlingThenConvertingIntegrationTest.class);
private Path fileName;
private Path fileName2;
@@ -63,7 +72,69 @@ public class CrawlingThenConvertingIntegrationTest {
}
@Test
public void crawlThenProcess() throws IOException {
public void testInvalidDomain() throws IOException {
// Attempt to fetch an invalid domain
var specs = CrawlSpecRecord.builder()
.domain("invalid.invalid.invalid")
.crawlDepth(10)
.urls(List.of()) // add specific URLs to crawl here
.build();
CrawledDomain crawlData = crawl(specs);
assertEquals("ERROR", crawlData.crawlerStatus);
assertTrue(crawlData.doc.isEmpty());
var processedData = process();
assertNotNull(processedData);
assertTrue(processedData.documents.isEmpty());
}
@Test
public void testRedirectingDomain() throws IOException {
// Attempt to fetch a domain that redirects
var specs = CrawlSpecRecord.builder()
.domain("memex.marginalia.nu")
.crawlDepth(10)
.urls(List.of()) // add specific URLs to crawl here
.build();
CrawledDomain crawlData = crawl(specs);
assertEquals("REDIRECT", crawlData.crawlerStatus);
assertEquals("www.marginalia.nu", crawlData.redirectDomain);
assertTrue(crawlData.doc.isEmpty());
var processedData = process();
assertNotNull(processedData);
assertTrue(processedData.documents.isEmpty());
}
@Test
public void testBlockedDomain() throws IOException {
// Attempt to fetch a domain that is blocked (simulated via the blacklist predicate)
var specs = CrawlSpecRecord.builder()
.domain("search.marginalia.nu")
.crawlDepth(10)
.urls(List.of()) // add specific URLs to crawl here
.build();
CrawledDomain crawlData = crawl(specs, d->false); // simulate blocking by blacklisting everything
assertEquals("ERROR", crawlData.crawlerStatus);
assertEquals("BLOCKED;IP not allowed", crawlData.crawlerStatusDesc);
assertTrue(crawlData.doc.isEmpty());
var processedData = process();
assertNotNull(processedData);
assertTrue(processedData.documents.isEmpty());
}
@Test
public void crawlSunnyDay() throws IOException {
var specs = CrawlSpecRecord.builder()
.domain("www.marginalia.nu")
.crawlDepth(10)
@@ -71,12 +142,20 @@ public class CrawlingThenConvertingIntegrationTest {
.build();
CrawledDomain domain = crawl(specs);
assertFalse(domain.doc.isEmpty());
assertEquals("OK", domain.crawlerStatus);
assertEquals("www.marginalia.nu", domain.domain);
List<SerializableCrawlData> data = new ArrayList<>();
data.add(domain);
data.addAll(domain.doc);
boolean hasRobotsTxt = domain.doc.stream().map(doc -> doc.url).anyMatch(url -> url.endsWith("/robots.txt"));
assertFalse(hasRobotsTxt, "Robots.txt should not leave the crawler");
var output = process();
assertNotNull(output);
assertFalse(output.documents.isEmpty());
assertEquals(new EdgeDomain("www.marginalia.nu"), output.domain);
assertEquals(DomainIndexingState.ACTIVE, output.state);
var output = domainProcessor.process(SerializableCrawlDataStream.fromIterator(data.iterator()));
for (var doc : output.documents) {
if (doc.isOk()) {
@@ -89,18 +168,33 @@ public class CrawlingThenConvertingIntegrationTest {
}
private ProcessedDomain process() {
try (var stream = new ParquetSerializableCrawlDataStream(fileName2)) {
return domainProcessor.process(stream);
}
catch (Exception e) {
Assertions.fail(e);
return null; // unreachable
}
}
private CrawledDomain crawl(CrawlSpecRecord specs) throws IOException {
return crawl(specs, domain -> true);
}
private CrawledDomain crawl(CrawlSpecRecord specs, Predicate<EdgeDomain> domainBlacklist) throws IOException {
List<SerializableCrawlData> data = new ArrayList<>();
try (var recorder = new WarcRecorder(fileName)) {
new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder).fetch();
new CrawlerRetreiver(httpFetcher, new DomainProber(domainBlacklist), specs, recorder).fetch();
}
CrawledDocumentParquetRecordFileWriter.convertWarc(specs.domain, fileName, fileName2);
try (var reader = new ParquetSerializableCrawlDataStream(fileName2)) {
while (reader.hasNext()) {
data.add(reader.next());
var next = reader.next();
logger.info("{}", next);
data.add(next);
}
}
@@ -109,6 +203,7 @@ public class CrawlingThenConvertingIntegrationTest {
.map(CrawledDomain.class::cast)
.findFirst()
.get();
data.stream().filter(CrawledDocument.class::isInstance).map(CrawledDocument.class::cast).forEach(domain.doc::add);
return domain;
}