diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/CrawledDocument.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/CrawledDocument.java index 7d85bdfd..6b9ba1be 100644 --- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/CrawledDocument.java +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/CrawledDocument.java @@ -23,11 +23,15 @@ public class CrawledDocument implements SerializableCrawlData { public String headers; public String documentBody; + + @Deprecated public String documentBodyHash; + @Deprecated public String canonicalUrl; public String redirectUrl; + @Deprecated public String recrawlState; /** This is not guaranteed to be set in all versions of the format, diff --git a/code/process-models/crawling-model/src/test/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriterTest.java b/code/process-models/crawling-model/src/test/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriterTest.java index 450d147b..c79154a4 100644 --- a/code/process-models/crawling-model/src/test/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriterTest.java +++ b/code/process-models/crawling-model/src/test/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriterTest.java @@ -72,7 +72,6 @@ class CrawledDocumentParquetRecordFileWriterTest { assertEquals("text/html", document.contentType); assertEquals("hello world", document.documentBody); assertEquals(200, document.httpStatus); - assertEquals("https://www.marginalia.nu/", document.canonicalUrl); } diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DocumentProcessor.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DocumentProcessor.java index 8e8841a0..4b5d9173 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DocumentProcessor.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DocumentProcessor.java @@ -105,13 +105,6 @@ public class DocumentProcessor { private EdgeUrl getDocumentUrl(CrawledDocument crawledDocument) throws URISyntaxException { - if (crawledDocument.canonicalUrl != null) { - try { - return new EdgeUrl(crawledDocument.canonicalUrl); - } - catch (URISyntaxException ex) { /* fallthrough */ } - } - return new EdgeUrl(crawledDocument.url); } diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java index f86b6bfe..e9794aad 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java @@ -100,8 +100,6 @@ public class DomainProcessor { if (doc.url == null || !processedUrls.add(doc.url)) continue; - fixBadCanonicalTag(doc); - if (Boolean.TRUE.equals(doc.hasCookies)) { cookies = true; } @@ -172,25 +170,6 @@ public class DomainProcessor { return false; } - private void fixBadCanonicalTag(CrawledDocument doc) { - // Some sites have a canonical tag that points to a different domain, - // but our loader can not support this, so we point these back to the - // original url. - - var canonicalOpt = EdgeUrl.parse(doc.canonicalUrl); - if (canonicalOpt.isEmpty()) return; - - var urlOpt = EdgeUrl.parse(doc.url); - if (urlOpt.isEmpty()) return; - - var urlActual = urlOpt.get(); - var canonicalActual = canonicalOpt.get(); - - if (!Objects.equals(urlActual.domain, canonicalActual.domain)) { - doc.canonicalUrl = doc.url; - } - } - private void calculateStatistics(ProcessedDomain ret, DomainLinks externalDomainLinks) { LinkGraph linkGraph = new LinkGraph(); TopKeywords topKeywords = new TopKeywords();