From d0982e7ba5f709fc14aa2cf387e3a77324294feb Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Sat, 9 Dec 2023 12:33:39 +0100 Subject: [PATCH] (converter) Add error handling and lazy load external domain links The converter was not properly initializing the external links for each domain, causing an NPE in conversion. This needs to be loaded later since we don't know the domain we're processing until we've seen it in the crawl data. Also made some refactorings to make finding converter bugs easier, and finding the related domain less awkward from the SerializableCrawlData interface. --- .../crawling/model/CrawledDocument.java | 13 +++++++++++++ .../crawling/model/SerializableCrawlData.java | 1 + .../marginalia/converting/ConverterMain.java | 14 ++++++++++---- .../converting/processor/DomainProcessor.java | 19 ++++++++++++++++++- 4 files changed, 42 insertions(+), 5 deletions(-) diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/CrawledDocument.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/CrawledDocument.java index 94d13235..143c775b 100644 --- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/CrawledDocument.java +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/CrawledDocument.java @@ -4,6 +4,7 @@ import lombok.AllArgsConstructor; import lombok.Builder; import lombok.ToString; import nu.marginalia.bigstring.BigString; +import nu.marginalia.model.EdgeUrl; @Builder @AllArgsConstructor @@ -35,4 +36,16 @@ public class CrawledDocument implements SerializableCrawlData { return SERIAL_IDENTIFIER; } + @Override + public String getDomain() { + if (url == null) + return null; + + return EdgeUrl + .parse(url) + .map(EdgeUrl::getDomain) + .map(d -> d.domain) + .orElse(null); + } + } diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/SerializableCrawlData.java 
b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/SerializableCrawlData.java index c9804d54..48b3f65d 100644 --- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/SerializableCrawlData.java +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/SerializableCrawlData.java @@ -2,4 +2,5 @@ package nu.marginalia.crawling.model; public interface SerializableCrawlData { String getSerialIdentifier(); + String getDomain(); } diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java index fb919018..ebfb1bc2 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java @@ -138,10 +138,16 @@ public class ConverterMain { for (var domain : plan.crawlDataIterable(id -> !batchingWorkLog.isItemProcessed(id))) { pool.submit(() -> { - ProcessedDomain processed = processor.process(domain); - converterWriter.accept(processed); - - heartbeat.setProgress(processedDomains.incrementAndGet() / (double) totalDomains); + try { + ProcessedDomain processed = processor.process(domain); + converterWriter.accept(processed); + } + catch (Exception ex) { + logger.info("Error in processing", ex); + } + finally { + heartbeat.setProgress(processedDomains.incrementAndGet() / (double) totalDomains); + } }); } diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java index f9ae890c..00a05257 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java +++ 
b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java @@ -55,11 +55,21 @@ public class DomainProcessor { boolean cookies = false; String ip = ""; - DomainLinks externalDomainLinks = anchorTagsSource.getAnchorTags(ret.domain); + DomainLinks externalDomainLinks = null; while (dataStream.hasNext()) { var data = dataStream.next(); + // Do a lazy load of the external domain links since we don't know the domain + // until we see the first document + if (externalDomainLinks == null) { + var domain = data.getDomain(); + + if (domain != null) { + externalDomainLinks = anchorTagsSource.getAnchorTags(domain); + } + } + if (data instanceof CrawledDomain crawledDomain) { ret.domain = new EdgeDomain(crawledDomain.domain); ret.ip = crawledDomain.ip; @@ -77,8 +87,15 @@ public class DomainProcessor { try { if (doc.url == null) continue; + fixBadCanonicalTag(doc); + // This case should never be reachable, as we should have initiated + // the externalDomainLinks variable above if we made it past the + // doc.url == null check; but we'll leave it here just in case + // to make debugging easier if we break this. + assert externalDomainLinks != null : "externalDomainLinks has not been initialized"; + docs.add(documentProcessor.process(doc, externalDomainLinks)); } catch (Exception ex) {