(converter) Add error handling and lazy load external domain links

The converter was not properly initiating the external links for each domain, causing an NPE in conversion.  This needs to be loaded later since we don't know the domain we're processing until we've seen it in the crawl data.

Also made some refactorings to make finding converter bugs easier, and finding the related domain less awkward from the SerializableCrawlData interface.
This commit is contained in:
Viktor Lofgren 2023-12-09 12:33:39 +01:00
parent fc30da0d48
commit d0982e7ba5
4 changed files with 42 additions and 5 deletions

View File

@ -4,6 +4,7 @@ import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.ToString;
import nu.marginalia.bigstring.BigString;
import nu.marginalia.model.EdgeUrl;
@Builder
@AllArgsConstructor
@ -35,4 +36,16 @@ public class CrawledDocument implements SerializableCrawlData {
return SERIAL_IDENTIFIER;
}
@Override
public String getDomain() {
if (url == null)
return null;
return EdgeUrl
.parse(url)
.map(EdgeUrl::getDomain)
.map(d -> d.domain)
.orElse(null);
}
}

View File

@ -2,4 +2,5 @@ package nu.marginalia.crawling.model;
public interface SerializableCrawlData {
String getSerialIdentifier();
String getDomain();
}

View File

@ -138,10 +138,16 @@ public class ConverterMain {
for (var domain : plan.crawlDataIterable(id -> !batchingWorkLog.isItemProcessed(id)))
{
pool.submit(() -> {
ProcessedDomain processed = processor.process(domain);
converterWriter.accept(processed);
heartbeat.setProgress(processedDomains.incrementAndGet() / (double) totalDomains);
try {
ProcessedDomain processed = processor.process(domain);
converterWriter.accept(processed);
}
catch (Exception ex) {
logger.info("Error in processing", ex);
}
finally {
heartbeat.setProgress(processedDomains.incrementAndGet() / (double) totalDomains);
}
});
}

View File

@ -55,11 +55,21 @@ public class DomainProcessor {
boolean cookies = false;
String ip = "";
DomainLinks externalDomainLinks = anchorTagsSource.getAnchorTags(ret.domain);
DomainLinks externalDomainLinks = null;
while (dataStream.hasNext()) {
var data = dataStream.next();
// Do a lazy load of the external domain links since we don't know the domain
// until we see the first document
if (externalDomainLinks == null) {
var domain = data.getDomain();
if (domain != null) {
externalDomainLinks = anchorTagsSource.getAnchorTags(domain);
}
}
if (data instanceof CrawledDomain crawledDomain) {
ret.domain = new EdgeDomain(crawledDomain.domain);
ret.ip = crawledDomain.ip;
@ -77,8 +87,15 @@ public class DomainProcessor {
try {
if (doc.url == null)
continue;
fixBadCanonicalTag(doc);
// This case should never be reachable, as we should have initiated
// the externalDomainLinks variable above if we made it past the
// doc.url == null check; but we'll leave it here just in case
// to make debugging easier if we break this.
assert externalDomainLinks != null : "externalDomainLinks has not been initialized";
docs.add(documentProcessor.process(doc, externalDomainLinks));
}
catch (Exception ex) {