(converter) Basic test coverage for sideloading-style processing

This commit is contained in:
Viktor Lofgren 2023-12-27 18:33:16 +01:00
parent 24051fec03
commit b37223c053
2 changed files with 43 additions and 10 deletions

View file

@ -75,7 +75,7 @@ public class DomainProcessor {
return fullProcessing(domain);
}
public ConverterBatchWritableIf sideloadProcessing(SerializableCrawlDataStream dataStream) {
public SideloadProcessing sideloadProcessing(SerializableCrawlDataStream dataStream) {
try {
return new SideloadProcessing(dataStream);
}
@ -86,7 +86,7 @@ public class DomainProcessor {
}
class SideloadProcessing implements ConverterBatchWritableIf, SideloadSource {
public class SideloadProcessing implements ConverterBatchWritableIf, SideloadSource {
private final SerializableCrawlDataStream dataStream;
private final ProcessedDomain domain;
private final DocumentDecorator documentDecorator;
@ -97,10 +97,9 @@ public class DomainProcessor {
SideloadProcessing(SerializableCrawlDataStream dataStream) throws IOException {
this.dataStream = dataStream;
if (!dataStream.hasNext()) {
throw new IllegalStateException("No data in stream");
}
if (!(dataStream.next() instanceof CrawledDomain crawledDomain)) {
if (!dataStream.hasNext()
|| !(dataStream.next() instanceof CrawledDomain crawledDomain))
{
throw new IllegalStateException("First record must be a domain");
}
@ -126,10 +125,11 @@ public class DomainProcessor {
@Override
public boolean hasNext() {
try {
while (next != null
&& dataStream.hasNext()
&& dataStream.next() instanceof CrawledDocument doc)
while (next == null
&& dataStream.hasNext())
{
if (!(dataStream.next() instanceof CrawledDocument doc))
continue;
if (doc.url == null || !processedUrls.add(doc.url))
continue;

View file

@ -63,7 +63,7 @@ public class ConvertingIntegrationTest {
}
@Test
public void testMemexMarginaliaNu() throws IOException {
public void testMemexMarginaliaNuFullProcessing() throws IOException {
var ret = domainProcessor.fullProcessing(asSerializableCrawlData(readMarginaliaWorkingSet()));
assertNotNull(ret);
assertEquals(ret.state, DomainIndexingState.ACTIVE);
@ -94,6 +94,39 @@ public class ConvertingIntegrationTest {
}
}
/**
 * Runs the sideload-style processing path over the memex.marginalia.nu
 * working set and checks the domain identity, the number of documents
 * produced, and basic details of every fully processed document.
 */
@Test
public void testMemexMarginaliaNuSideloadProcessing() throws IOException {
    var ret = domainProcessor.sideloadProcessing(asSerializableCrawlData(readMarginaliaWorkingSet()));
    assertNotNull(ret);
    assertEquals("memex.marginalia.nu", ret.id());

    var domain = ret.getDomain();
    // Expected value first, so a failure message reports the right side as "expected".
    assertEquals(new EdgeDomain("memex.marginalia.nu"), domain.domain);

    // Drain the document iterator so the results can be inspected more than once.
    List<ProcessedDocument> docsAll = new ArrayList<>();
    ret.getDocumentsStream().forEachRemaining(docsAll::add);
    assertTrue(docsAll.size() > 25);

    Map<UrlIndexingState, Integer> resultsByStatusCount = new HashMap<>();
    for (var doc : docsAll) {
        resultsByStatusCount.merge(doc.state, 1, Integer::sum);
    }

    // getOrDefault avoids a NullPointerException from unboxing when no document
    // reached the OK state; the test then fails on the assertion instead.
    assertTrue(resultsByStatusCount.getOrDefault(UrlIndexingState.OK, 0) > 25);

    for (var doc : docsAll) {
        // Only fully processed documents are checked for details.
        if (!doc.isProcessedFully()) {
            continue;
        }

        var details = doc.details;
        assertTrue(details.title.length() > 4);
        assertTrue(details.description.length() > 4);
        assertEquals(HtmlStandard.HTML5, details.standard);
    }
}
private CrawledDomain readMarginaliaWorkingSet() throws IOException {
String index = readClassPathFile("memex-marginalia/index");
String[] files = index.split("\n");