(converter) Basic test coverage for sideloading-style processing
This commit is contained in:
parent
24051fec03
commit
b37223c053
2 changed files with 43 additions and 10 deletions
|
@ -75,7 +75,7 @@ public class DomainProcessor {
|
|||
return fullProcessing(domain);
|
||||
}
|
||||
|
||||
public ConverterBatchWritableIf sideloadProcessing(SerializableCrawlDataStream dataStream) {
|
||||
public SideloadProcessing sideloadProcessing(SerializableCrawlDataStream dataStream) {
|
||||
try {
|
||||
return new SideloadProcessing(dataStream);
|
||||
}
|
||||
|
@ -86,7 +86,7 @@ public class DomainProcessor {
|
|||
|
||||
}
|
||||
|
||||
class SideloadProcessing implements ConverterBatchWritableIf, SideloadSource {
|
||||
public class SideloadProcessing implements ConverterBatchWritableIf, SideloadSource {
|
||||
private final SerializableCrawlDataStream dataStream;
|
||||
private final ProcessedDomain domain;
|
||||
private final DocumentDecorator documentDecorator;
|
||||
|
@ -97,10 +97,9 @@ public class DomainProcessor {
|
|||
SideloadProcessing(SerializableCrawlDataStream dataStream) throws IOException {
|
||||
this.dataStream = dataStream;
|
||||
|
||||
if (!dataStream.hasNext()) {
|
||||
throw new IllegalStateException("No data in stream");
|
||||
}
|
||||
if (!(dataStream.next() instanceof CrawledDomain crawledDomain)) {
|
||||
if (!dataStream.hasNext()
|
||||
|| !(dataStream.next() instanceof CrawledDomain crawledDomain))
|
||||
{
|
||||
throw new IllegalStateException("First record must be a domain");
|
||||
}
|
||||
|
||||
|
@ -126,10 +125,11 @@ public class DomainProcessor {
|
|||
@Override
|
||||
public boolean hasNext() {
|
||||
try {
|
||||
while (next != null
|
||||
&& dataStream.hasNext()
|
||||
&& dataStream.next() instanceof CrawledDocument doc)
|
||||
while (next == null
|
||||
&& dataStream.hasNext())
|
||||
{
|
||||
if (!(dataStream.next() instanceof CrawledDocument doc))
|
||||
continue;
|
||||
if (doc.url == null || !processedUrls.add(doc.url))
|
||||
continue;
|
||||
|
||||
|
|
|
@ -63,7 +63,7 @@ public class ConvertingIntegrationTest {
|
|||
}
|
||||
|
||||
@Test
|
||||
public void testMemexMarginaliaNu() throws IOException {
|
||||
public void testMemexMarginaliaNuFullProcessing() throws IOException {
|
||||
var ret = domainProcessor.fullProcessing(asSerializableCrawlData(readMarginaliaWorkingSet()));
|
||||
assertNotNull(ret);
|
||||
assertEquals(ret.state, DomainIndexingState.ACTIVE);
|
||||
|
@ -94,6 +94,39 @@ public class ConvertingIntegrationTest {
|
|||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testMemexMarginaliaNuSideloadProcessing() throws IOException {
|
||||
var ret = domainProcessor.sideloadProcessing(asSerializableCrawlData(readMarginaliaWorkingSet()));
|
||||
assertNotNull(ret);
|
||||
assertEquals("memex.marginalia.nu", ret.id());
|
||||
|
||||
var domain = ret.getDomain();
|
||||
assertEquals(domain.domain, new EdgeDomain("memex.marginalia.nu"));
|
||||
|
||||
List<ProcessedDocument> docsAll = new ArrayList<>();
|
||||
Map<UrlIndexingState, Integer> resultsByStatusCount = new HashMap<>();
|
||||
ret.getDocumentsStream().forEachRemaining(docsAll::add);
|
||||
assertTrue(docsAll.size() > 25);
|
||||
|
||||
docsAll.forEach(doc -> resultsByStatusCount.merge(doc.state, 1, Integer::sum));
|
||||
|
||||
assertTrue(resultsByStatusCount.get(UrlIndexingState.OK) > 25);
|
||||
|
||||
for (var doc : docsAll) {
|
||||
|
||||
if (!doc.isProcessedFully()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
var details = doc.details;
|
||||
|
||||
assertTrue(details.title.length() > 4);
|
||||
assertTrue(details.description.length() > 4);
|
||||
assertEquals(HtmlStandard.HTML5, details.standard);
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
private CrawledDomain readMarginaliaWorkingSet() throws IOException {
|
||||
String index = readClassPathFile("memex-marginalia/index");
|
||||
String[] files = index.split("\n");
|
||||
|
|
Loading…
Reference in a new issue