(converter) Add @Deprecated annotation to a few fields that should no longer be used.
This commit is contained in:
parent
0f9cd9c87d
commit
2001d0f707
@ -23,11 +23,15 @@ public class CrawledDocument implements SerializableCrawlData {
|
||||
|
||||
public String headers;
|
||||
public String documentBody;
|
||||
|
||||
@Deprecated
|
||||
public String documentBodyHash;
|
||||
|
||||
@Deprecated
|
||||
public String canonicalUrl;
|
||||
public String redirectUrl;
|
||||
|
||||
@Deprecated
|
||||
public String recrawlState;
|
||||
|
||||
/** This is not guaranteed to be set in all versions of the format,
|
||||
|
@ -72,7 +72,6 @@ class CrawledDocumentParquetRecordFileWriterTest {
|
||||
assertEquals("text/html", document.contentType);
|
||||
assertEquals("hello world", document.documentBody);
|
||||
assertEquals(200, document.httpStatus);
|
||||
assertEquals("https://www.marginalia.nu/", document.canonicalUrl);
|
||||
}
|
||||
|
||||
|
||||
|
@ -105,13 +105,6 @@ public class DocumentProcessor {
|
||||
private EdgeUrl getDocumentUrl(CrawledDocument crawledDocument)
|
||||
throws URISyntaxException
|
||||
{
|
||||
if (crawledDocument.canonicalUrl != null) {
|
||||
try {
|
||||
return new EdgeUrl(crawledDocument.canonicalUrl);
|
||||
}
|
||||
catch (URISyntaxException ex) { /* fallthrough */ }
|
||||
}
|
||||
|
||||
return new EdgeUrl(crawledDocument.url);
|
||||
}
|
||||
|
||||
|
@ -100,8 +100,6 @@ public class DomainProcessor {
|
||||
if (doc.url == null || !processedUrls.add(doc.url))
|
||||
continue;
|
||||
|
||||
fixBadCanonicalTag(doc);
|
||||
|
||||
if (Boolean.TRUE.equals(doc.hasCookies)) {
|
||||
cookies = true;
|
||||
}
|
||||
@ -172,25 +170,6 @@ public class DomainProcessor {
|
||||
return false;
|
||||
}
|
||||
|
||||
private void fixBadCanonicalTag(CrawledDocument doc) {
|
||||
// Some sites have a canonical tag that points to a different domain,
|
||||
// but our loader can not support this, so we point these back to the
|
||||
// original url.
|
||||
|
||||
var canonicalOpt = EdgeUrl.parse(doc.canonicalUrl);
|
||||
if (canonicalOpt.isEmpty()) return;
|
||||
|
||||
var urlOpt = EdgeUrl.parse(doc.url);
|
||||
if (urlOpt.isEmpty()) return;
|
||||
|
||||
var urlActual = urlOpt.get();
|
||||
var canonicalActual = canonicalOpt.get();
|
||||
|
||||
if (!Objects.equals(urlActual.domain, canonicalActual.domain)) {
|
||||
doc.canonicalUrl = doc.url;
|
||||
}
|
||||
}
|
||||
|
||||
private void calculateStatistics(ProcessedDomain ret, DomainLinks externalDomainLinks) {
|
||||
LinkGraph linkGraph = new LinkGraph();
|
||||
TopKeywords topKeywords = new TopKeywords();
|
||||
|
Loading…
Reference in New Issue
Block a user