(converter) Add error handling and lazy load external domain links
The converter was not properly initializing the external links for each domain, causing an NPE during conversion. The links need to be loaded lazily, since we don't know which domain we're processing until we've seen it in the crawl data. Also made some refactorings to make converter bugs easier to find, and made retrieving the related domain from the SerializableCrawlData interface less awkward.
This commit is contained in:
parent
fc30da0d48
commit
d0982e7ba5
@ -4,6 +4,7 @@ import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.ToString;
|
||||
import nu.marginalia.bigstring.BigString;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
|
||||
@Builder
|
||||
@AllArgsConstructor
|
||||
@ -35,4 +36,16 @@ public class CrawledDocument implements SerializableCrawlData {
|
||||
return SERIAL_IDENTIFIER;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getDomain() {
|
||||
if (url == null)
|
||||
return null;
|
||||
|
||||
return EdgeUrl
|
||||
.parse(url)
|
||||
.map(EdgeUrl::getDomain)
|
||||
.map(d -> d.domain)
|
||||
.orElse(null);
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -2,4 +2,5 @@ package nu.marginalia.crawling.model;
|
||||
|
||||
/**
 * Common contract for records read from the crawl data stream.
 */
public interface SerializableCrawlData {
    /** Identifier distinguishing the concrete record type during (de)serialization. */
    String getSerialIdentifier();

    /** The domain this record belongs to, or null when it cannot be determined. */
    String getDomain();
}
|
@ -138,10 +138,16 @@ public class ConverterMain {
|
||||
for (var domain : plan.crawlDataIterable(id -> !batchingWorkLog.isItemProcessed(id)))
|
||||
{
|
||||
pool.submit(() -> {
|
||||
ProcessedDomain processed = processor.process(domain);
|
||||
converterWriter.accept(processed);
|
||||
|
||||
heartbeat.setProgress(processedDomains.incrementAndGet() / (double) totalDomains);
|
||||
try {
|
||||
ProcessedDomain processed = processor.process(domain);
|
||||
converterWriter.accept(processed);
|
||||
}
|
||||
catch (Exception ex) {
|
||||
logger.info("Error in processing", ex);
|
||||
}
|
||||
finally {
|
||||
heartbeat.setProgress(processedDomains.incrementAndGet() / (double) totalDomains);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
|
@ -55,11 +55,21 @@ public class DomainProcessor {
|
||||
boolean cookies = false;
|
||||
String ip = "";
|
||||
|
||||
DomainLinks externalDomainLinks = anchorTagsSource.getAnchorTags(ret.domain);
|
||||
DomainLinks externalDomainLinks = null;
|
||||
|
||||
while (dataStream.hasNext()) {
|
||||
var data = dataStream.next();
|
||||
|
||||
// Do a lazy load of the external domain links since we don't know the domain
|
||||
// until we see the first document
|
||||
if (externalDomainLinks == null) {
|
||||
var domain = data.getDomain();
|
||||
|
||||
if (domain != null) {
|
||||
externalDomainLinks = anchorTagsSource.getAnchorTags(domain);
|
||||
}
|
||||
}
|
||||
|
||||
if (data instanceof CrawledDomain crawledDomain) {
|
||||
ret.domain = new EdgeDomain(crawledDomain.domain);
|
||||
ret.ip = crawledDomain.ip;
|
||||
@ -77,8 +87,15 @@ public class DomainProcessor {
|
||||
try {
|
||||
if (doc.url == null)
|
||||
continue;
|
||||
|
||||
fixBadCanonicalTag(doc);
|
||||
|
||||
// This case should never be reachable, as we should have initialized
|
||||
// the externalDomainLinks variable above if we made it past the
|
||||
// doc.url == null check; but we'll leave it here just in case
|
||||
// to make debugging easier if we break this.
|
||||
assert externalDomainLinks != null : "externalDomainLinks has not been initialized";
|
||||
|
||||
docs.add(documentProcessor.process(doc, externalDomainLinks));
|
||||
}
|
||||
catch (Exception ex) {
|
||||
|
Loading…
Reference in New Issue
Block a user