(converter) Refactor the DomainProcessor for new format of crawl data
With the new crawler modifications, the crawl data arrives in a slightly different order, which lets us optimize the converter. This is a breaking change that is incompatible with the old style of crawl data, so it will linger as a branch for a while. The first step is to move logic out of the domain processor and into the document processor.
parent 9707366348
commit acf7bcc7a6
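
In short, the loop in DomainProcessor now assumes the CrawledDomain record arrives before its documents: when the domain record shows up, a DocumentDecorator is built with the domain-level keywords (ip:, as:, special:academia, ...), and every subsequent CrawledDocument is processed, decorated and checked for duplicates as it streams past, instead of being patched up in a second pass over the finished list. A condensed sketch of the new control flow, pieced together from the hunks below (not verbatim code; the surrounding fields, the lazy anchor-tag lookup, error handling and statistics are elided):

    // Sketch only: condensed from the DomainProcessor changes in this commit.
    DocumentDecorator documentDecorator = null;

    try (var deduplicator = new LshDocumentDeduplicator()) {
        while (dataStream.hasNext()) {
            var data = dataStream.next();

            if (data instanceof CrawledDomain crawledDomain) {
                // Domain record comes first: set up the domain-wide search terms once
                documentDecorator = new DocumentDecorator(anchorTextKeywords, externalDomainLinks);
                ret = processDomain(crawledDomain, ret, documentDecorator);
            }
            else if (data instanceof CrawledDocument doc) {
                // Each document is decorated and deduplicated immediately
                var processedDoc = documentProcessor.process(doc, externalDomainLinks, documentDecorator);
                deduplicator.markIfDuplicate(processedDoc);
                docs.add(processedDoc);
            }
        }
    }
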
DocumentDecorator.java (new file):

@@ -0,0 +1,33 @@
+package nu.marginalia.converting.processor;
+
+import nu.marginalia.atags.AnchorTextKeywords;
+import nu.marginalia.atags.model.DomainLinks;
+import nu.marginalia.converting.model.ProcessedDocument;
+
+import java.util.HashSet;
+import java.util.Set;
+
+public class DocumentDecorator {
+    private final Set<String> extraSearchTerms = new HashSet<>();
+    private final AnchorTextKeywords keywords;
+    private final DomainLinks externalDomainLinks;
+
+    public DocumentDecorator(AnchorTextKeywords keywords, DomainLinks externalDomainLinks) {
+        this.keywords = keywords;
+        this.externalDomainLinks = externalDomainLinks;
+    }
+
+    public void addTerm(String term) {
+        extraSearchTerms.add(term);
+    }
+
+    public void apply(ProcessedDocument doc) {
+        if (doc == null)
+            return;
+        if (doc.words == null)
+            return;
+
+        doc.words.addAllSyntheticTerms(extraSearchTerms);
+        doc.words.addAnchorTerms(keywords.getAnchorTextKeywords(externalDomainLinks, doc.url));
+    }
+}

DocumentProcessor.java:

@@ -4,6 +4,7 @@ import com.google.inject.Inject;
 import nu.marginalia.atags.model.DomainLinks;
 import nu.marginalia.crawling.model.CrawledDocument;
 import nu.marginalia.crawling.model.CrawlerDocumentStatus;
+import nu.marginalia.model.crawl.HtmlFeature;
 import nu.marginalia.model.crawl.UrlIndexingState;
 import nu.marginalia.converting.model.DisqualifiedException;
 import nu.marginalia.converting.model.ProcessedDocument;
@@ -38,7 +39,7 @@ public class DocumentProcessor {
         processorPlugins.add(plainTextDocumentProcessorPlugin);
     }

-    public ProcessedDocument process(CrawledDocument crawledDocument, DomainLinks externalDomainLinks) {
+    public ProcessedDocument process(CrawledDocument crawledDocument, DomainLinks externalDomainLinks, DocumentDecorator documentDecorator) {
         ProcessedDocument ret = new ProcessedDocument();

         try {
@@ -51,7 +52,7 @@ public class DocumentProcessor {
                 default -> DocumentClass.EXTERNALLY_LINKED_MULTI;
             };

-            processDocument(crawledDocument, documentClass, ret);
+            processDocument(crawledDocument, documentClass, documentDecorator, ret);
         }
         catch (DisqualifiedException ex) {
             ret.state = UrlIndexingState.DISQUALIFIED;
@@ -67,7 +68,7 @@ public class DocumentProcessor {
         return ret;
     }

-    private void processDocument(CrawledDocument crawledDocument, DocumentClass documentClass, ProcessedDocument ret) throws URISyntaxException, DisqualifiedException {
+    private void processDocument(CrawledDocument crawledDocument, DocumentClass documentClass, DocumentDecorator documentDecorator, ProcessedDocument ret) throws URISyntaxException, DisqualifiedException {

         var crawlerStatus = CrawlerDocumentStatus.valueOf(crawledDocument.crawlerStatus);
         if (crawlerStatus != CrawlerDocumentStatus.OK) {
@@ -90,6 +91,16 @@ public class DocumentProcessor {

         ret.details = detailsWithWords.details();
         ret.words = detailsWithWords.words();
+
+        documentDecorator.apply(ret);
+
+        if (Boolean.TRUE.equals(crawledDocument.hasCookies)
+                && ret.details != null
+                && ret.details.features != null)
+        {
+            ret.details.features.add(HtmlFeature.COOKIES);
+        }
+
     }

     private AbstractDocumentProcessorPlugin findPlugin(CrawledDocument crawledDocument) throws DisqualifiedException {

DomainProcessor.java:

@@ -17,7 +17,6 @@ import nu.marginalia.converting.model.ProcessedDomain;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.converting.processor.logic.links.TopKeywords;
 import nu.marginalia.converting.processor.logic.LshDocumentDeduplicator;
-import nu.marginalia.model.crawl.HtmlFeature;
 import org.apache.commons.lang3.StringUtils;
 import org.jetbrains.annotations.Nullable;
 import org.slf4j.Logger;
@@ -32,7 +31,6 @@ public class DomainProcessor {
     private final SiteWords siteWords;
     private final AnchorTagsSource anchorTagsSource;
     private final AnchorTextKeywords anchorTextKeywords;
-    private final LshDocumentDeduplicator documentDeduplicator;
     private final GeoIpDictionary geoIpDictionary;

     private final Logger logger = LoggerFactory.getLogger(getClass());
@@ -42,12 +40,11 @@ public class DomainProcessor {
                            SiteWords siteWords,
                            AnchorTagsSourceFactory anchorTagsSourceFactory,
                            AnchorTextKeywords anchorTextKeywords,
-                           LshDocumentDeduplicator documentDeduplicator, GeoIpDictionary geoIpDictionary) throws SQLException
+                           GeoIpDictionary geoIpDictionary) throws SQLException
     {
         this.documentProcessor = documentProcessor;
         this.siteWords = siteWords;
         this.anchorTextKeywords = anchorTextKeywords;
-        this.documentDeduplicator = documentDeduplicator;
         this.anchorTagsSource = anchorTagsSourceFactory.create();
         this.geoIpDictionary = geoIpDictionary;

@@ -61,15 +58,15 @@ public class DomainProcessor {
             return null;
         }

-        var ret = new ProcessedDomain();
+        ProcessedDomain ret = new ProcessedDomain();
         List<ProcessedDocument> docs = new ArrayList<>();
         Set<String> processedUrls = new HashSet<>();

-        boolean cookies = false;
-        String ip = "";
-
         DomainLinks externalDomainLinks = null;

+        DocumentDecorator documentDecorator = null;
+
         try (var deduplicator = new LshDocumentDeduplicator()){
         while (dataStream.hasNext()) {
             var data = dataStream.next();

@@ -84,94 +81,78 @@ public class DomainProcessor {
             }

             if (data instanceof CrawledDomain crawledDomain) {
-                ret.domain = new EdgeDomain(crawledDomain.domain);
-                ret.ip = crawledDomain.ip;
+                documentDecorator = new DocumentDecorator(anchorTextKeywords, externalDomainLinks);

-                cookies = crawledDomain.hasCookies();
-                ip = crawledDomain.ip;
+                ret = processDomain(crawledDomain, ret, documentDecorator);

-                if (crawledDomain.redirectDomain != null) {
-                    ret.redirect = new EdgeDomain(crawledDomain.redirectDomain);
-                }
                 ret.documents = docs;
-                ret.state = getState(crawledDomain.crawlerStatus);
-            }
-            else if (data instanceof CrawledDocument doc) {
+
+            } else if (data instanceof CrawledDocument doc) {
                 try {
                     if (doc.url == null || !processedUrls.add(doc.url))
                         continue;

-                    if (Boolean.TRUE.equals(doc.hasCookies)) {
-                        cookies = true;
-                    }
+                    var processedDoc = documentProcessor.process(doc, externalDomainLinks, documentDecorator);
+
+                    // This case should never be reachable, as we should have initiated
+                    // the externalDomainLinks variable above if we made it past the
+                    // doc.url == null check; but we'll leave it here just in case
+                    // to make debugging easier if we break this.
+                    assert externalDomainLinks != null : "externalDomainLinks has not been initialized";
+                    deduplicator.markIfDuplicate(processedDoc);

-                    docs.add(documentProcessor.process(doc, externalDomainLinks));
-                }
-                catch (Exception ex) {
+                    docs.add(processedDoc);
+                } catch (Exception ex) {
                     logger.warn("Failed to process " + doc.url, ex);
                 }
             }
         }

+        }

-        // Add late keywords and features from domain-level information
-
-        List<String> terms = new ArrayList<>();
-
-        addIpInfo(terms, ip);
-
-        if (cookies) {
-            terms.add(HtmlFeature.COOKIES.getKeyword());
-        }
-
-        if (isAcademicDomain(ret.domain)) {
-            terms.add("special:academia");
-        }
-
-        for (var document : ret.documents) {
-            if (document.details == null)
-                continue;
-
-            if (cookies) {
-                document.details.features.add(HtmlFeature.COOKIES);
-            }
-
-            document.words.addAllSyntheticTerms(terms);
-
-            document.words.addAnchorTerms(
-                anchorTextKeywords.getAnchorTextKeywords(externalDomainLinks, document.url)
-            );
-        }
-        documentDeduplicator.deduplicate(ret.documents);
         calculateStatistics(ret, externalDomainLinks);

         return ret;
     }

-    private void addIpInfo(List<String> terms, String ip) {
-        terms.add("ip:"+ip);
+    private ProcessedDomain processDomain(CrawledDomain crawledDomain,
+                                          ProcessedDomain ret,
+                                          DocumentDecorator decorator)
+    {
+        ret.domain = new EdgeDomain(crawledDomain.domain);
+        ret.ip = crawledDomain.ip;
+
+        addIpInfo(decorator, crawledDomain.ip);
+
+        if (isAcademicDomain(ret.domain)) {
+            decorator.addTerm("special:academia");
+        }
+
+        if (crawledDomain.redirectDomain != null) {
+            ret.redirect = new EdgeDomain(crawledDomain.redirectDomain);
+        }
+        ret.state = getState(crawledDomain.crawlerStatus);
+
+        return ret;
+    }
+
+
+    private void addIpInfo(DocumentDecorator decorator, String ip) {
+        decorator.addTerm("ip:"+ip);

         // Add IP location country as a term
         String country = geoIpDictionary.getCountry(ip);
         if (!country.isBlank()) { // use the ip:-prefix as there's no real confusion between e.g. ip:127.0.0.1 and ip:uk
-            terms.add("ip:"+country.toLowerCase());
+            decorator.addTerm("ip:"+country.toLowerCase());
         }

         // Add ASN as a term
         geoIpDictionary.getAsnInfo(ip).ifPresent(asnInfo -> {
-            terms.add("as:"+asnInfo.asn());
+            decorator.addTerm("as:"+asnInfo.asn());

             for (var orgPart : StringUtils.split(asnInfo.org(), '-')) {
-                terms.add("as:"+orgPart.toLowerCase());
+                decorator.addTerm("as:"+orgPart.toLowerCase());
             }

             if (isCloudy(asnInfo)) {
-                terms.add("special:cloud");
+                decorator.addTerm("special:cloud");
             }
         });


LshDocumentDeduplicator.java:

@@ -1,74 +1,43 @@
 package nu.marginalia.converting.processor.logic;

 import com.google.inject.Singleton;
+import gnu.trove.list.array.TLongArrayList;
 import nu.marginalia.model.crawl.UrlIndexingState;
 import nu.marginalia.converting.model.ProcessedDocument;
 import nu.marginalia.lsh.EasyLSH;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;

-import java.util.List;
-
 /** Deduplicates documents based on their LSH
  *
  * @see EasyLSH
  */
 @Singleton
-public class LshDocumentDeduplicator {
+public class LshDocumentDeduplicator implements AutoCloseable {

-    private final int DISTANCE_THRESHOLD = 2;
-    private final Logger logger = LoggerFactory.getLogger(getClass());
+    private final TLongArrayList hashCodes = new TLongArrayList(1000);
+    private static final int DISTANCE_THRESHOLD = 2;

-    public void deduplicate(List<ProcessedDocument> documents) {
-        ProcessedDocument[] goodDocuments = documents.stream()
-                .filter(ProcessedDocument::isProcessedFully)
-                .filter(doc -> doc.words.size() > 100)
-                .toArray(ProcessedDocument[]::new);
-
-        long[] hashCodes = new long[goodDocuments.length];
-        for (int i = 0; i < goodDocuments.length; i++) {
-            hashCodes[i] = goodDocuments[i].details.hashCode;
+    public void markIfDuplicate(ProcessedDocument document) {
+        if (!document.isProcessedFully()) {
+            return;
         }

-        // These arrays can be fairly large (~10,000) so we need to be
-        // careful about what we do in this O(n^2) loop
-
-        for (int i = 0; i < hashCodes.length; i++) {
-            for (int j = 0; j < hashCodes.length; j++) {
-                // This is basically just a 64 bit XOR and a POPCOUNT so it's pretty fast.
-                if (EasyLSH.hammingDistance(hashCodes[i], hashCodes[j]) < DISTANCE_THRESHOLD) {
-                    if (i == j)
-                        continue;
-
-                    if (flagIfDuplicate(goodDocuments[i], goodDocuments[j])) {
-                        break;
-                    }
-                }
-            }
-        }
+        if (document.words.size() < 100) {
+            return;
         }

-    private boolean flagIfDuplicate(ProcessedDocument thisDoc, ProcessedDocument otherDoc) {
-
-        // This document has already been disqualified as a duplicate
-        if (thisDoc.state != UrlIndexingState.OK)
-            return false;
-
-
-        // We might consider using thisDoc.details.metadata.topology() here instead of the
-        // URL length to determine which document is the "better" one.
-        if (thisDoc.url.path.length()
-            < otherDoc.url.path.length())
-        {
-            logger.debug("{} duplicates {}", otherDoc.url, thisDoc.url);
-
-            otherDoc.state = UrlIndexingState.DISQUALIFIED;
-            otherDoc.stateReason = "Duplicate";
-
-            return true;
-        }
-
-        return false;
+        long hashCode = document.details.hashCode;
+
+        for (int i = 0; i < hashCodes.size(); i++) {
+            if (EasyLSH.hammingDistance(hashCode, hashCodes.get(i)) < DISTANCE_THRESHOLD) {
+                document.state = UrlIndexingState.DISQUALIFIED;
+                document.stateReason = "Duplicate";
+                return;
+            }
+        }
+
+        hashCodes.add(hashCode);
     }

+    @Override
+    public void close() throws Exception {
+        hashCodes.clear(1);
+    }
 }
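
The deduplicator thus goes from a batch pass over the finished document list to a streaming check that is opened per domain and fed one document at a time, which is how the DomainProcessor hunk above uses it. A minimal usage sketch under that assumption ('documents' is just a stand-in for whatever supplies the ProcessedDocument instances):

    // Sketch only: intended call pattern for the streaming deduplicator.
    try (var deduplicator = new LshDocumentDeduplicator()) {
        for (ProcessedDocument doc : documents) {
            // Flags the document as DISQUALIFIED / "Duplicate" if its LSH hash is within
            // DISTANCE_THRESHOLD of a document seen earlier in the same stream.
            deduplicator.markIfDuplicate(doc);
        }
    } // close() discards the accumulated hash codes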

ConvertingIntegrationTest.java:

@@ -139,10 +139,13 @@ public class ConvertingIntegrationTest {

     private SerializableCrawlDataStream asSerializableCrawlData(CrawledDomain domain) {
         List<SerializableCrawlData> data = new ArrayList<>();

+        data.add(domain);
+
         if (domain.doc != null) {
             data.addAll(domain.doc);
         }
-        data.add(domain);

+
         return SerializableCrawlDataStream.fromIterator(data.iterator());
     }