(converter) Add @Deprecated annotation to a few fields that should no longer be used.

This commit is contained in:
Viktor Lofgren 2023-12-15 21:42:00 +01:00
parent 0f9cd9c87d
commit 2001d0f707
4 changed files with 4 additions and 29 deletions

View File

@@ -23,11 +23,15 @@ public class CrawledDocument implements SerializableCrawlData {
public String headers; public String headers;
public String documentBody; public String documentBody;
@Deprecated
public String documentBodyHash; public String documentBodyHash;
@Deprecated
public String canonicalUrl; public String canonicalUrl;
public String redirectUrl; public String redirectUrl;
@Deprecated
public String recrawlState; public String recrawlState;
/** This is not guaranteed to be set in all versions of the format, /** This is not guaranteed to be set in all versions of the format,

View File

@@ -72,7 +72,6 @@ class CrawledDocumentParquetRecordFileWriterTest {
assertEquals("text/html", document.contentType); assertEquals("text/html", document.contentType);
assertEquals("hello world", document.documentBody); assertEquals("hello world", document.documentBody);
assertEquals(200, document.httpStatus); assertEquals(200, document.httpStatus);
assertEquals("https://www.marginalia.nu/", document.canonicalUrl);
} }

View File

@@ -105,13 +105,6 @@ public class DocumentProcessor {
private EdgeUrl getDocumentUrl(CrawledDocument crawledDocument) private EdgeUrl getDocumentUrl(CrawledDocument crawledDocument)
throws URISyntaxException throws URISyntaxException
{ {
if (crawledDocument.canonicalUrl != null) {
try {
return new EdgeUrl(crawledDocument.canonicalUrl);
}
catch (URISyntaxException ex) { /* fallthrough */ }
}
return new EdgeUrl(crawledDocument.url); return new EdgeUrl(crawledDocument.url);
} }

View File

@@ -100,8 +100,6 @@ public class DomainProcessor {
if (doc.url == null || !processedUrls.add(doc.url)) if (doc.url == null || !processedUrls.add(doc.url))
continue; continue;
fixBadCanonicalTag(doc);
if (Boolean.TRUE.equals(doc.hasCookies)) { if (Boolean.TRUE.equals(doc.hasCookies)) {
cookies = true; cookies = true;
} }
@@ -172,25 +170,6 @@ public class DomainProcessor {
return false; return false;
} }
/**
 * Overwrites a document's canonical URL with its own URL when the two
 * point at different domains.
 *
 * The document is modified in place. Nothing happens when EdgeUrl.parse
 * returns an empty result for either doc.canonicalUrl or doc.url.
 *
 * @param doc the crawled document whose canonicalUrl field may be replaced
 */
private void fixBadCanonicalTag(CrawledDocument doc) {
// Some sites have a canonical tag that points to a different domain,
// but our loader can not support this, so we point these back to the
// original url.
// Bail out if the canonical URL can not be parsed; there is nothing to compare.
var canonicalOpt = EdgeUrl.parse(doc.canonicalUrl);
if (canonicalOpt.isEmpty()) return;
// Likewise bail out if the document's own URL can not be parsed.
var urlOpt = EdgeUrl.parse(doc.url);
if (urlOpt.isEmpty()) return;
var urlActual = urlOpt.get();
var canonicalActual = canonicalOpt.get();
// Only the domain is compared; a canonical URL on the same domain is left untouched.
if (!Objects.equals(urlActual.domain, canonicalActual.domain)) {
doc.canonicalUrl = doc.url;
}
}
private void calculateStatistics(ProcessedDomain ret, DomainLinks externalDomainLinks) { private void calculateStatistics(ProcessedDomain ret, DomainLinks externalDomainLinks) {
LinkGraph linkGraph = new LinkGraph(); LinkGraph linkGraph = new LinkGraph();
TopKeywords topKeywords = new TopKeywords(); TopKeywords topKeywords = new TopKeywords();