Better tests for crawling and processing.
This commit is contained in:
parent
fbdedf53de
commit
d71124961e
@ -81,6 +81,7 @@ dependencies {
|
||||
testImplementation libs.mockito
|
||||
|
||||
testImplementation project(':code:processes:test-data')
|
||||
testImplementation project(':code:processes:crawling-process')
|
||||
}
|
||||
|
||||
test {
|
||||
|
@ -0,0 +1,80 @@
|
||||
package nu.marginalia.converting;
|
||||
|
||||
import com.google.inject.Guice;
|
||||
import com.google.inject.Injector;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.WmsaHome;
|
||||
import nu.marginalia.converting.processor.DomainProcessor;
|
||||
import nu.marginalia.crawl.retreival.CrawlerRetreiver;
|
||||
import nu.marginalia.crawl.retreival.fetcher.HttpFetcher;
|
||||
import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl;
|
||||
import nu.marginalia.crawling.model.CrawledDocument;
|
||||
import nu.marginalia.crawling.model.CrawledDomain;
|
||||
import nu.marginalia.crawling.model.SerializableCrawlData;
|
||||
import nu.marginalia.crawling.model.spec.CrawlingSpecification;
|
||||
import org.junit.jupiter.api.BeforeAll;
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.Tag;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
/* This is mostly a debugging utility */
|
||||
@Tag("slow")
|
||||
public class CrawlingThenConvertingIntegrationTest {
|
||||
private DomainProcessor domainProcessor;
|
||||
private HttpFetcher httpFetcher;
|
||||
|
||||
@SneakyThrows
|
||||
@BeforeAll
|
||||
public static void setUpAll() {
|
||||
// this must be done to avoid java inserting its own user agent for the sitemap requests
|
||||
System.setProperty("http.agent", WmsaHome.getUserAgent().uaString());
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
@BeforeEach
|
||||
public void setUp() {
|
||||
Injector injector = Guice.createInjector(
|
||||
new ConvertingIntegrationTestModule()
|
||||
);
|
||||
|
||||
domainProcessor = injector.getInstance(DomainProcessor.class);
|
||||
httpFetcher = new HttpFetcherImpl(WmsaHome.getUserAgent().uaString());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void crawlThenProcess() {
|
||||
var specs = CrawlingSpecification.builder()
|
||||
.id("some-string")
|
||||
.domain("www.marginalia.nu")
|
||||
.crawlDepth(10)
|
||||
.urls(List.of()) // add specific URLs to crawl here
|
||||
.build();
|
||||
|
||||
CrawledDomain domain = crawl(specs);
|
||||
|
||||
var output = domainProcessor.process(domain);
|
||||
|
||||
for (var doc : output.documents) {
|
||||
if (doc.isOk()) {
|
||||
System.out.println(doc.url + "\t" + doc.state + "\t" + doc.details.title);
|
||||
}
|
||||
else {
|
||||
System.out.println(doc.url + "\t" + doc.state + "\t" + doc.stateReason);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private CrawledDomain crawl(CrawlingSpecification specs) {
|
||||
List<SerializableCrawlData> data = new ArrayList<>();
|
||||
|
||||
new CrawlerRetreiver(httpFetcher, specs, data::add).fetch();
|
||||
|
||||
CrawledDomain domain = data.stream().filter(CrawledDomain.class::isInstance).map(CrawledDomain.class::cast).findFirst().get();
|
||||
data.stream().filter(CrawledDocument.class::isInstance).map(CrawledDocument.class::cast).forEach(domain.doc::add);
|
||||
return domain;
|
||||
}
|
||||
}
|
@ -21,7 +21,6 @@ import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.net.URISyntaxException;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
|
@ -8,10 +8,7 @@ import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl;
|
||||
import nu.marginalia.crawling.model.CrawledDocument;
|
||||
import nu.marginalia.crawling.model.spec.CrawlingSpecification;
|
||||
import nu.marginalia.crawling.model.SerializableCrawlData;
|
||||
import org.junit.jupiter.api.Assertions;
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.Tag;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.junit.jupiter.api.*;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
@ -29,6 +26,7 @@ class CrawlerRetreiverTest {
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
@BeforeAll
|
||||
public static void setUpAll() {
|
||||
// this must be done to avoid java inserting its own user agent for the sitemap requests
|
||||
System.setProperty("http.agent", WmsaHome.getUserAgent().uaString());
|
||||
|
Loading…
Reference in New Issue
Block a user