diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/CrawledDocument.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/CrawledDocument.java index 94d13235..143c775b 100644 --- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/CrawledDocument.java +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/CrawledDocument.java @@ -4,6 +4,7 @@ import lombok.AllArgsConstructor; import lombok.Builder; import lombok.ToString; import nu.marginalia.bigstring.BigString; +import nu.marginalia.model.EdgeUrl; @Builder @AllArgsConstructor @@ -35,4 +36,20 @@ public class CrawledDocument implements SerializableCrawlData { return SERIAL_IDENTIFIER; } + /** + * Returns the {@code domain} field of the domain parsed from {@code url} via {@link EdgeUrl#parse}, + * or {@code null} when {@code url} is null or parsing yields no result. + */ + @Override + public String getDomain() { + if (url == null) + return null; + + return EdgeUrl + .parse(url) + .map(EdgeUrl::getDomain) + .map(d -> d.domain) + .orElse(null); + } + } diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/SerializableCrawlData.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/SerializableCrawlData.java index c9804d54..48b3f65d 100644 --- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/SerializableCrawlData.java +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/SerializableCrawlData.java @@ -2,4 +2,6 @@ package nu.marginalia.crawling.model; public interface SerializableCrawlData { String getSerialIdentifier(); + /** Domain associated with this record; implementations may return {@code null}. */ + String getDomain(); } diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java index fb919018..ebfb1bc2 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java +++ 
b/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java @@ -138,10 +138,17 @@ public class ConverterMain { for (var domain : plan.crawlDataIterable(id -> !batchingWorkLog.isItemProcessed(id))) { pool.submit(() -> { - ProcessedDomain processed = processor.process(domain); - converterWriter.accept(processed); - - heartbeat.setProgress(processedDomains.incrementAndGet() / (double) totalDomains); + try { + ProcessedDomain processed = processor.process(domain); + converterWriter.accept(processed); + } + catch (Exception ex) { + logger.error("Error in processing", ex); + } + finally { + // Advance the progress counter whether or not this domain was processed successfully + heartbeat.setProgress(processedDomains.incrementAndGet() / (double) totalDomains); + } }); } diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java index f9ae890c..00a05257 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java @@ -55,11 +55,21 @@ public class DomainProcessor { boolean cookies = false; String ip = ""; - DomainLinks externalDomainLinks = anchorTagsSource.getAnchorTags(ret.domain); + DomainLinks externalDomainLinks = null; while (dataStream.hasNext()) { var data = dataStream.next(); + // Do a lazy load of the external domain links since we don't know the domain + // until we see the first document + if (externalDomainLinks == null) { + var domain = data.getDomain(); + + if (domain != null) { + externalDomainLinks = anchorTagsSource.getAnchorTags(domain); + } + } + + if (data instanceof CrawledDomain crawledDomain) { ret.domain = new EdgeDomain(crawledDomain.domain); ret.ip = crawledDomain.ip; @@ -77,8 +87,17 @@ public class DomainProcessor { try { if (doc.url == null) continue; + fixBadCanonicalTag(doc); + 
// externalDomainLinks is normally initialized by the lazy load above, but it + // stays null while no record has yielded a domain. Assertions are disabled + // unless the JVM runs with -ea, so fail with an explicit exception rather + // than handing a null reference to the document processor. + if (externalDomainLinks == null) { + throw new IllegalStateException("externalDomainLinks has not been initialized"); + } + docs.add(documentProcessor.process(doc, externalDomainLinks)); } catch (Exception ex) {