(converter) Add @Deprecated annotation to a few fields that should no longer be used.
This commit is contained in:
parent
0f9cd9c87d
commit
2001d0f707
@ -23,11 +23,15 @@ public class CrawledDocument implements SerializableCrawlData {
|
|||||||
|
|
||||||
public String headers;
|
public String headers;
|
||||||
public String documentBody;
|
public String documentBody;
|
||||||
|
|
||||||
|
@Deprecated
|
||||||
public String documentBodyHash;
|
public String documentBodyHash;
|
||||||
|
|
||||||
|
@Deprecated
|
||||||
public String canonicalUrl;
|
public String canonicalUrl;
|
||||||
public String redirectUrl;
|
public String redirectUrl;
|
||||||
|
|
||||||
|
@Deprecated
|
||||||
public String recrawlState;
|
public String recrawlState;
|
||||||
|
|
||||||
/** This is not guaranteed to be set in all versions of the format,
|
/** This is not guaranteed to be set in all versions of the format,
|
||||||
|
@ -72,7 +72,6 @@ class CrawledDocumentParquetRecordFileWriterTest {
|
|||||||
assertEquals("text/html", document.contentType);
|
assertEquals("text/html", document.contentType);
|
||||||
assertEquals("hello world", document.documentBody);
|
assertEquals("hello world", document.documentBody);
|
||||||
assertEquals(200, document.httpStatus);
|
assertEquals(200, document.httpStatus);
|
||||||
assertEquals("https://www.marginalia.nu/", document.canonicalUrl);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -105,13 +105,6 @@ public class DocumentProcessor {
|
|||||||
private EdgeUrl getDocumentUrl(CrawledDocument crawledDocument)
|
private EdgeUrl getDocumentUrl(CrawledDocument crawledDocument)
|
||||||
throws URISyntaxException
|
throws URISyntaxException
|
||||||
{
|
{
|
||||||
if (crawledDocument.canonicalUrl != null) {
|
|
||||||
try {
|
|
||||||
return new EdgeUrl(crawledDocument.canonicalUrl);
|
|
||||||
}
|
|
||||||
catch (URISyntaxException ex) { /* fallthrough */ }
|
|
||||||
}
|
|
||||||
|
|
||||||
return new EdgeUrl(crawledDocument.url);
|
return new EdgeUrl(crawledDocument.url);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -100,8 +100,6 @@ public class DomainProcessor {
|
|||||||
if (doc.url == null || !processedUrls.add(doc.url))
|
if (doc.url == null || !processedUrls.add(doc.url))
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
fixBadCanonicalTag(doc);
|
|
||||||
|
|
||||||
if (Boolean.TRUE.equals(doc.hasCookies)) {
|
if (Boolean.TRUE.equals(doc.hasCookies)) {
|
||||||
cookies = true;
|
cookies = true;
|
||||||
}
|
}
|
||||||
@ -172,25 +170,6 @@ public class DomainProcessor {
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
private void fixBadCanonicalTag(CrawledDocument doc) {
|
|
||||||
// Some sites have a canonical tag that points to a different domain,
|
|
||||||
// but our loader can not support this, so we point these back to the
|
|
||||||
// original url.
|
|
||||||
|
|
||||||
var canonicalOpt = EdgeUrl.parse(doc.canonicalUrl);
|
|
||||||
if (canonicalOpt.isEmpty()) return;
|
|
||||||
|
|
||||||
var urlOpt = EdgeUrl.parse(doc.url);
|
|
||||||
if (urlOpt.isEmpty()) return;
|
|
||||||
|
|
||||||
var urlActual = urlOpt.get();
|
|
||||||
var canonicalActual = canonicalOpt.get();
|
|
||||||
|
|
||||||
if (!Objects.equals(urlActual.domain, canonicalActual.domain)) {
|
|
||||||
doc.canonicalUrl = doc.url;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private void calculateStatistics(ProcessedDomain ret, DomainLinks externalDomainLinks) {
|
private void calculateStatistics(ProcessedDomain ret, DomainLinks externalDomainLinks) {
|
||||||
LinkGraph linkGraph = new LinkGraph();
|
LinkGraph linkGraph = new LinkGraph();
|
||||||
TopKeywords topKeywords = new TopKeywords();
|
TopKeywords topKeywords = new TopKeywords();
|
||||||
|
Loading…
Reference in New Issue
Block a user