(crawler) Update CrawlingThenConvertingIntegrationTest
This commit updates CrawlingThenConvertingIntegrationTest with additional tests for invalid, redirecting, and blocked domains. ParquetSerializableCrawlDataStream has also been improved to filter out irrelevant (advisory) entries.
parent 2e536e3141
commit 5329968155
ParquetSerializableCrawlDataStream.java

@@ -58,7 +58,7 @@ public class ParquetSerializableCrawlDataStream implements AutoCloseable, Serial
         String statusReason = "";
 
         String redirectDomain = null;
-        if (parquetRecord.contentType.equals("x-marginalia/advisory;state=redir")) {
+        if (parquetRecord.contentType.equals("x-marginalia/advisory;state=redirect")) {
             EdgeUrl crawledUrl = new EdgeUrl(parquetRecord.url);
             redirectDomain = crawledUrl.getDomain().toString();
             status = CrawlerDomainStatus.REDIRECT;
@@ -84,6 +84,10 @@ public class ParquetSerializableCrawlDataStream implements AutoCloseable, Serial
     }
 
     private void createDocumentRecord(CrawledDocumentParquetRecord nextRecord) {
+        if (nextRecord.contentType.startsWith("x-marginalia/advisory")) {
+            return;
+        }
+
         String bodyString = DocumentBodyToString.getStringData(
                 ContentType.parse(nextRecord.contentType),
                 nextRecord.body);
 
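The advisory entries that the new filter skips are the synthetic x-marginalia/advisory records (redirect markers, errors and similar bookkeeping), not real pages; with the filter in place, a consumer of the stream should only ever see the CrawledDomain record and genuine CrawledDocument entries. As a rough sketch of what reading a converted parquet file back looks like, mirroring the crawl() helper in the test further down (readBack and parquetFile are illustrative names, not part of this commit):

    // Sketch only: read the parquet file back into a CrawledDomain with its documents attached,
    // the same way the crawl() helper in the test below does it.
    private CrawledDomain readBack(Path parquetFile) throws IOException {
        List<SerializableCrawlData> data = new ArrayList<>();

        try (var reader = new ParquetSerializableCrawlDataStream(parquetFile)) {
            while (reader.hasNext()) {
                data.add(reader.next()); // advisory records no longer show up here
            }
        }

        // Exactly one CrawledDomain record is expected; it carries crawlerStatus, redirectDomain, etc.
        CrawledDomain domain = data.stream()
                .filter(CrawledDomain.class::isInstance)
                .map(CrawledDomain.class::cast)
                .findFirst()
                .orElseThrow();

        // The remaining entries are plain documents; attach them to the domain record.
        data.stream()
                .filter(CrawledDocument.class::isInstance)
                .map(CrawledDocument.class::cast)
                .forEach(domain.doc::add);

        return domain;
    }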
CrawlingThenConvertingIntegrationTest.java

@@ -4,35 +4,44 @@ import com.google.inject.Guice;
 import com.google.inject.Injector;
 import lombok.SneakyThrows;
 import nu.marginalia.WmsaHome;
+import nu.marginalia.converting.model.ProcessedDomain;
 import nu.marginalia.converting.processor.DomainProcessor;
 import nu.marginalia.crawl.retreival.CrawlerRetreiver;
 import nu.marginalia.crawl.retreival.DomainProber;
 import nu.marginalia.crawl.retreival.fetcher.HttpFetcher;
 import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl;
 import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
 import nu.marginalia.crawling.io.SerializableCrawlDataStream;
 import nu.marginalia.crawling.io.format.ParquetSerializableCrawlDataStream;
 import nu.marginalia.crawling.io.format.WarcSerializableCrawlDataStream;
 import nu.marginalia.crawling.model.CrawledDocument;
 import nu.marginalia.crawling.model.CrawledDomain;
 import nu.marginalia.crawling.model.SerializableCrawlData;
 import nu.marginalia.crawling.parquet.CrawledDocumentParquetRecord;
 import nu.marginalia.crawling.parquet.CrawledDocumentParquetRecordFileWriter;
+import nu.marginalia.model.EdgeDomain;
+import nu.marginalia.model.crawl.DomainIndexingState;
 import nu.marginalia.model.crawlspec.CrawlSpecRecord;
 import org.junit.jupiter.api.*;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 import java.io.IOException;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.util.ArrayList;
 import java.util.List;
+import java.util.function.Predicate;
 
-/* This is mostly a debugging utility */
 import static org.junit.jupiter.api.Assertions.*;
 
+/** Tests for the crawler and converter integration. These are pretty slow and potentially
+ *  a bit flaky, since they attempt to fetch real websites.
+ */
+@Tag("slow")
 public class CrawlingThenConvertingIntegrationTest {
     private DomainProcessor domainProcessor;
     private HttpFetcher httpFetcher;
 
     private static final Logger logger = LoggerFactory.getLogger(CrawlingThenConvertingIntegrationTest.class);
 
     private Path fileName;
     private Path fileName2;
 
@@ -63,7 +72,69 @@ public class CrawlingThenConvertingIntegrationTest {
     }
 
     @Test
-    public void crawlThenProcess() throws IOException {
+    public void testInvalidDomain() throws IOException {
+        // Attempt to fetch an invalid domain
+        var specs = CrawlSpecRecord.builder()
+                .domain("invalid.invalid.invalid")
+                .crawlDepth(10)
+                .urls(List.of()) // add specific URLs to crawl here
+                .build();
+
+        CrawledDomain crawlData = crawl(specs);
+
+        assertEquals("ERROR", crawlData.crawlerStatus);
+        assertTrue(crawlData.doc.isEmpty());
+
+        var processedData = process();
+
+        assertNotNull(processedData);
+        assertTrue(processedData.documents.isEmpty());
+    }
+
+    @Test
+    public void testRedirectingDomain() throws IOException {
+        // Attempt to fetch a domain that redirects to another domain
+        var specs = CrawlSpecRecord.builder()
+                .domain("memex.marginalia.nu")
+                .crawlDepth(10)
+                .urls(List.of()) // add specific URLs to crawl here
+                .build();
+
+        CrawledDomain crawlData = crawl(specs);
+
+        assertEquals("REDIRECT", crawlData.crawlerStatus);
+        assertEquals("www.marginalia.nu", crawlData.redirectDomain);
+        assertTrue(crawlData.doc.isEmpty());
+
+        var processedData = process();
+
+        assertNotNull(processedData);
+        assertTrue(processedData.documents.isEmpty());
+    }
+
+    @Test
+    public void testBlockedDomain() throws IOException {
+        // Attempt to fetch a domain the crawler is not permitted to fetch
+        var specs = CrawlSpecRecord.builder()
+                .domain("search.marginalia.nu")
+                .crawlDepth(10)
+                .urls(List.of()) // add specific URLs to crawl here
+                .build();
+
+        CrawledDomain crawlData = crawl(specs, d->false); // simulate blocking by blacklisting everything
+
+        assertEquals("ERROR", crawlData.crawlerStatus);
+        assertEquals("BLOCKED;IP not allowed", crawlData.crawlerStatusDesc);
+        assertTrue(crawlData.doc.isEmpty());
+
+        var processedData = process();
+
+        assertNotNull(processedData);
+        assertTrue(processedData.documents.isEmpty());
+    }
+
+    @Test
+    public void crawlSunnyDay() throws IOException {
         var specs = CrawlSpecRecord.builder()
                 .domain("www.marginalia.nu")
                 .crawlDepth(10)
@@ -71,12 +142,20 @@ public class CrawlingThenConvertingIntegrationTest {
                 .build();
 
         CrawledDomain domain = crawl(specs);
+        assertFalse(domain.doc.isEmpty());
+        assertEquals("OK", domain.crawlerStatus);
+        assertEquals("www.marginalia.nu", domain.domain);
 
-        List<SerializableCrawlData> data = new ArrayList<>();
-        data.add(domain);
-        data.addAll(domain.doc);
+        boolean hasRobotsTxt = domain.doc.stream().map(doc -> doc.url).anyMatch(url -> url.endsWith("/robots.txt"));
+        assertFalse(hasRobotsTxt, "Robots.txt should not leave the crawler");
 
-        var output = domainProcessor.process(SerializableCrawlDataStream.fromIterator(data.iterator()));
+        var output = process();
+
+        assertNotNull(output);
+        assertFalse(output.documents.isEmpty());
+        assertEquals(new EdgeDomain("www.marginalia.nu"), output.domain);
+        assertEquals(DomainIndexingState.ACTIVE, output.state);
 
         for (var doc : output.documents) {
             if (doc.isOk()) {
@@ -89,18 +168,33 @@ public class CrawlingThenConvertingIntegrationTest {
 
     }
 
+    private ProcessedDomain process() {
+        try (var stream = new ParquetSerializableCrawlDataStream(fileName2)) {
+            return domainProcessor.process(stream);
+        }
+        catch (Exception e) {
+            Assertions.fail(e);
+            return null; // unreachable
+        }
+    }
     private CrawledDomain crawl(CrawlSpecRecord specs) throws IOException {
+        return crawl(specs, domain -> true);
+    }
+
+    private CrawledDomain crawl(CrawlSpecRecord specs, Predicate<EdgeDomain> domainBlacklist) throws IOException {
         List<SerializableCrawlData> data = new ArrayList<>();
 
         try (var recorder = new WarcRecorder(fileName)) {
-            new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder).fetch();
+            new CrawlerRetreiver(httpFetcher, new DomainProber(domainBlacklist), specs, recorder).fetch();
         }
 
         CrawledDocumentParquetRecordFileWriter.convertWarc(specs.domain, fileName, fileName2);
 
         try (var reader = new ParquetSerializableCrawlDataStream(fileName2)) {
             while (reader.hasNext()) {
-                data.add(reader.next());
+                var next = reader.next();
+                logger.info("{}", next);
+                data.add(next);
             }
         }
 
@@ -109,6 +203,7 @@ public class CrawlingThenConvertingIntegrationTest {
                 .map(CrawledDomain.class::cast)
                 .findFirst()
                 .get();
 
+        data.stream().filter(CrawledDocument.class::isInstance).map(CrawledDocument.class::cast).forEach(domain.doc::add);
        return domain;
     }
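Taken together, the helpers give new tests a compact pattern: crawl(specs) for the default case, crawl(specs, predicate) to steer the DomainProber, and process() to run the converter over whatever the crawl wrote to disk. A rough sketch of how a further test might combine them (the test name, crawl depth and final assertion are illustrative only, not part of this commit):

    @Test
    public void exampleSketch() throws IOException {
        // Hypothetical test: shallow crawl of one domain, then convert and sanity-check the result.
        var specs = CrawlSpecRecord.builder()
                .domain("www.marginalia.nu") // example domain reused from crawlSunnyDay above
                .crawlDepth(2)
                .urls(List.of())
                .build();

        CrawledDomain crawlData = crawl(specs);     // default overload: nothing is treated as blacklisted
        ProcessedDomain processedData = process();  // converts the parquet file written by crawl()

        assertNotNull(processedData);
        assertEquals(new EdgeDomain(crawlData.domain), processedData.domain);
    }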