From d71124961e41c944780bd48f7b1923c3a1361924 Mon Sep 17 00:00:00 2001
From: Viktor Lofgren
Date: Tue, 27 Jun 2023 16:11:27 +0200
Subject: [PATCH] Better tests for crawling and processing.

---
 .../processes/converting-process/build.gradle |  1 +
 ...CrawlingThenConvertingIntegrationTest.java | 80 +++++++++++++++++++
 .../retreival/CrawlerMockFetcherTest.java     |  1 -
 .../retreival/CrawlerRetreiverTest.java       |  6 +-
 4 files changed, 83 insertions(+), 5 deletions(-)
 create mode 100644 code/processes/converting-process/src/test/java/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java

diff --git a/code/processes/converting-process/build.gradle b/code/processes/converting-process/build.gradle
index 4d187874..6ef9a25c 100644
--- a/code/processes/converting-process/build.gradle
+++ b/code/processes/converting-process/build.gradle
@@ -81,6 +81,7 @@ dependencies {
     testImplementation libs.mockito
 
     testImplementation project(':code:processes:test-data')
+    testImplementation project(':code:processes:crawling-process')
 }
 
 test {
diff --git a/code/processes/converting-process/src/test/java/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java b/code/processes/converting-process/src/test/java/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java
new file mode 100644
index 00000000..890a1081
--- /dev/null
+++ b/code/processes/converting-process/src/test/java/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java
@@ -0,0 +1,80 @@
+package nu.marginalia.converting;
+
+import com.google.inject.Guice;
+import com.google.inject.Injector;
+import lombok.SneakyThrows;
+import nu.marginalia.WmsaHome;
+import nu.marginalia.converting.processor.DomainProcessor;
+import nu.marginalia.crawl.retreival.CrawlerRetreiver;
+import nu.marginalia.crawl.retreival.fetcher.HttpFetcher;
+import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl;
+import nu.marginalia.crawling.model.CrawledDocument;
+import nu.marginalia.crawling.model.CrawledDomain;
+import nu.marginalia.crawling.model.SerializableCrawlData;
+import nu.marginalia.crawling.model.spec.CrawlingSpecification;
+import org.junit.jupiter.api.BeforeAll;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Tag;
+import org.junit.jupiter.api.Test;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/* This is mostly a debugging utility */
+@Tag("slow")
+public class CrawlingThenConvertingIntegrationTest {
+    private DomainProcessor domainProcessor;
+    private HttpFetcher httpFetcher;
+
+    @SneakyThrows
+    @BeforeAll
+    public static void setUpAll() {
+        // this must be done to avoid java inserting its own user agent for the sitemap requests
+        System.setProperty("http.agent", WmsaHome.getUserAgent().uaString());
+    }
+
+    @SneakyThrows
+    @BeforeEach
+    public void setUp() {
+        Injector injector = Guice.createInjector(
+                new ConvertingIntegrationTestModule()
+        );
+
+        domainProcessor = injector.getInstance(DomainProcessor.class);
+        httpFetcher = new HttpFetcherImpl(WmsaHome.getUserAgent().uaString());
+    }
+
+    @Test
+    public void crawlThenProcess() {
+        var specs = CrawlingSpecification.builder()
+                .id("some-string")
+                .domain("www.marginalia.nu")
+                .crawlDepth(10)
+                .urls(List.of()) // add specific URLs to crawl here
+                .build();
+
+        CrawledDomain domain = crawl(specs);
+
+        var output = domainProcessor.process(domain);
+
+        for (var doc : output.documents) {
+            if (doc.isOk()) {
+                System.out.println(doc.url + "\t" + doc.state + "\t" + doc.details.title);
+            }
+            else {
+                System.out.println(doc.url + "\t" + doc.state + "\t" + doc.stateReason);
+            }
+        }
+
+    }
+
+    private CrawledDomain crawl(CrawlingSpecification specs) {
+        List<SerializableCrawlData> data = new ArrayList<>();
+
+        new CrawlerRetreiver(httpFetcher, specs, data::add).fetch();
+
+        CrawledDomain domain = data.stream().filter(CrawledDomain.class::isInstance).map(CrawledDomain.class::cast).findFirst().get();
+        data.stream().filter(CrawledDocument.class::isInstance).map(CrawledDocument.class::cast).forEach(domain.doc::add);
+        return domain;
+    }
+}
diff --git a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java
index fec0a752..d5f4581e 100644
--- a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java
+++ b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java
@@ -21,7 +21,6 @@ import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 import java.net.URISyntaxException;
-import java.nio.charset.StandardCharsets;
 import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.List;
diff --git a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java
index 1e309c71..64c7e890 100644
--- a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java
+++ b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java
@@ -8,10 +8,7 @@ import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl;
 import nu.marginalia.crawling.model.CrawledDocument;
 import nu.marginalia.crawling.model.spec.CrawlingSpecification;
 import nu.marginalia.crawling.model.SerializableCrawlData;
-import org.junit.jupiter.api.Assertions;
-import org.junit.jupiter.api.BeforeEach;
-import org.junit.jupiter.api.Tag;
-import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.*;
 
 import java.util.ArrayList;
 import java.util.List;
@@ -29,6 +26,7 @@ class CrawlerRetreiverTest {
     }
 
     @SneakyThrows
+    @BeforeAll
     public static void setUpAll() {
         // this must be done to avoid java inserting its own user agent for the sitemap requests
         System.setProperty("http.agent", WmsaHome.getUserAgent().uaString());