Reduce the number of expensive operations in HtmlDocumentProcessorPlugin.
parent 9579cdd151
commit 67c15a34e6
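In short: the hot paths stop materializing large intermediate Strings. Full-document calls such as doc.text() and Elements.html() are replaced with single-pass NodeVisitor traversals, and derived values (length, standard, quality, features) are computed once in HtmlDocumentProcessorPlugin and passed along rather than re-derived from the DOM.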
@@ -9,7 +9,7 @@ public class MetaRobotsTag {
     private final String searchEngineName = "marginalia-search";
 
     public boolean allowIndexingByMetaTag(Document doc) {
-        var robotsContent = doc.select("meta[name=robots]").attr("content");
+        var robotsContent = doc.getElementsByTag("meta").select("meta[name=robots]").attr("content");
 
         if (isForbidden(robotsContent)) {
             var marginaliaTag = doc.select( "meta[name=" + searchEngineName + "]").attr("content");
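The change narrows the query: getElementsByTag("meta") is a cheap tag-name lookup, so the attribute selector is then evaluated only against the document's meta elements, presumably cheaper than running the full CSS query over every node in the tree.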
@@ -3,18 +3,23 @@ package nu.marginalia.converting.processor.logic;
 
 import crawlercommons.utils.Strings;
 import nu.marginalia.crawling.model.CrawledDocument;
 import nu.marginalia.converting.model.HtmlStandard;
 import nu.marginalia.language.model.DocumentLanguageData;
 import nu.marginalia.converting.model.DisqualifiedException;
+import org.jetbrains.annotations.NotNull;
 import org.jsoup.nodes.Document;
 
 import java.util.Set;
+import org.jsoup.nodes.Element;
+import org.jsoup.nodes.Node;
+import org.jsoup.nodes.TextNode;
+import org.jsoup.select.NodeVisitor;
 
 public class DocumentValuator {
 
-    public double getQuality(CrawledDocument crawledDocument, HtmlStandard htmlStandard, Document parsedDocument) throws DisqualifiedException {
+    public double getQuality(CrawledDocument crawledDocument,
+                             HtmlStandard htmlStandard,
+                             Document parsedDocument,
+                             int textLength) throws DisqualifiedException {
         double scriptPenalty = getScriptPenalty(parsedDocument);
 
-        int textBodyLength = parsedDocument.text().length();
+        int textBodyLength = textLength;
         int rawLength = crawledDocument.documentBody.length();
 
         if (textBodyLength == 0) {
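The caller now computes the text length once (via getLength(), further down in HtmlDocumentProcessorPlugin) and passes it in, so getQuality no longer builds the entire normalized document text with parsedDocument.text() just to measure it.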
@@ -28,24 +33,50 @@ public class DocumentValuator {
 
     private int getScriptPenalty(Document parsed) {
-        var scriptTags = parsed.getElementsByTag("script");
-        String scriptText = scriptTags.html();
-        int badScript = 0;
-        if (scriptText.contains(".createElement(")) {
-            badScript = 1;
-        }
+        var scriptVisitor = new ScriptVisitor();
 
-        double scriptPenalty = 0;
-        for (var tag : scriptTags) {
-            String srcAttr = tag.attr("src");
-            if (srcAttr.contains("wp-content") || srcAttr.contains("wp-includes") || srcAttr.contains("jquery")) {
-                scriptPenalty += 0.49;
-            }
-            else if (!Strings.isBlank(srcAttr)) {
-                scriptPenalty += 1;
-            }
-        }
-        return (int)(scriptPenalty + badScript + (scriptText.length())/1000.);
+        parsed.getElementsByTag("script").traverse(scriptVisitor);
+
+        return scriptVisitor.score();
     }
 
+    private static class ScriptVisitor implements NodeVisitor {
+        boolean hasBadScript = false;
+        int scriptLength = 0;
+        double penalty = 0.;
+
+        public int score() {
+            return (int)(penalty + (hasBadScript?1:0) + (scriptLength)/1000.);
+        }
+
+        @Override
+        public void head(@NotNull Node node, int depth) {
+            if (node instanceof Element el) {
+                visitTag(el);
+            }
+            else if (node instanceof TextNode tn) {
+                visitScriptText(tn);
+            }
+        }
+
+        private void visitScriptText(TextNode tn) {
+            String wholeText = tn.getWholeText();
+            scriptLength += wholeText.length();
+
+            if (!hasBadScript) {
+                hasBadScript = wholeText.contains(".createElement(");
+            }
+        }
+
+        public void visitTag(Element el) {
+            String srcAttr = el.attr("src");
+            if (srcAttr.contains("wp-content") || srcAttr.contains("wp-includes") || srcAttr.contains("jquery")) {
+                penalty += 0.49;
+            }
+            else if (!Strings.isBlank(srcAttr)) {
+                penalty += 1;
+            }
+        }
+    }
 }
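The old getScriptPenalty called scriptTags.html(), which concatenates every script body into one String, then iterated the tags a second time for the src checks. ScriptVisitor folds both into a single traversal, accumulating script length and penalty as it goes; the scoring formula itself (0.49 for wp-content/wp-includes/jquery scripts, 1 for other external scripts, +1 if any inline script calls .createElement(, plus scriptLength/1000) is unchanged.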
@@ -6,6 +6,7 @@ import org.jsoup.nodes.TextNode;
 import org.jsoup.select.NodeFilter;
 
 import java.util.HashMap;
+import java.util.List;
 import java.util.Map;
 
 /** Prune the DOM and remove noisy branches with a lot of tags and not a lot of text.
@@ -34,7 +35,7 @@ public class DomPruningFilter implements NodeFilter {
         final NodeData dataForNode;
 
         if (node instanceof TextNode tn) {
-            dataForNode = new NodeData(depth, tn.text().length(), 0);
+            dataForNode = new NodeData(depth, MeasureLengthVisitor.lengthOfElement(tn), 0);
         }
         else if (isSignal(node)) {
             dataForNode = new NodeData(depth, 0,0);
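tn.text() allocates a whitespace-normalized copy of each text node just to take its length; MeasureLengthVisitor.lengthOfElement (new file below, same package) computes a close-enough length directly from getWholeText() without the normalization pass.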
@@ -61,24 +62,67 @@ public class DomPruningFilter implements NodeFilter {
             && dataForNode.treeSize > 3)
             return FilterResult.REMOVE;
 
+        if (node instanceof Element el) {
+            if (shouldAlwaysPurge(el)) {
+                return FilterResult.REMOVE;
+            }
+        }
+
         return FilterResult.CONTINUE;
     }
 
     public boolean isSignal(Node node) {
 
         if (node instanceof Element e) {
-            if ("a".equalsIgnoreCase(e.tagName()))
-                return false;
-            if ("nav".equalsIgnoreCase(e.tagName()))
-                return false;
-            if ("footer".equalsIgnoreCase(e.tagName()))
-                return false;
-            if ("header".equalsIgnoreCase(e.tagName()))
-                return false;
+            String tagName = e.tagName();
+
+            if ("a".equalsIgnoreCase(tagName))
+                return false;
         }
 
         return true;
     }
 
+    final List<String> badClassNames = List.of("cookie-banner", "cookie", "cookie-notice", "cookie-policy",
+            "nav", "navigation", "footer", "header", "menu", "toolbar", "tooltip",
+            "alert", "alertdialog", "banner", "onetrust-consent-sdk");
+    final List<String> badAriaRoles = List.of("alert", "alertdialog", "navigation", "banner", "dialog", "menu", "toolbar", "tooltip");
+
+    private boolean shouldAlwaysPurge(Element el) {
+
+        String tagName = el.tagName();
+
+        if ("nav".equalsIgnoreCase(tagName))
+            return true;
+        if ("footer".equalsIgnoreCase(tagName))
+            return true;
+        if ("header".equalsIgnoreCase(tagName))
+            return true;
+
+        var classNames = el.classNames();
+
+        for (var clazz : classNames) {
+            for (var bad : badClassNames) {
+                if (clazz.equalsIgnoreCase(bad))
+                    return true;
+            }
+        }
+
+        var role = el.attr("role");
+
+        for (var bad : badAriaRoles) {
+            if (bad.equalsIgnoreCase(role))
+                return true;
+        }
+
+        var ariaHidden = el.attr("aria-hidden");
+        if ("true".equalsIgnoreCase(ariaHidden))
+            return true;
+
+        return false;
+    }
 }
 
 class NodeData {
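Besides the performance work, pruning also becomes more aggressive here: nav, footer, and header elements, elements carrying cookie-banner-style class names or noisy ARIA roles, and anything marked aria-hidden="true" are now removed outright via shouldAlwaysPurge, while isSignal keeps only its special case for anchor tags.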
@@ -0,0 +1,53 @@
+package nu.marginalia.converting.processor.logic.dom;
+
+import org.jsoup.nodes.Node;
+import org.jsoup.nodes.TextNode;
+import org.jsoup.select.NodeVisitor;
+
+/** Best effort visitor to measure the length of the text in a DOM tree
+ * without allocating a bunch of Strings.
+ */
+public class MeasureLengthVisitor implements NodeVisitor {
+    public int length = 0;
+
+    @Override
+    public void head(Node node, int depth) {
+        if (node instanceof TextNode tn) {
+            length += lengthOfElement(tn);
+        }
+    }
+
+    // Emulate the HTML spec's definition of "length of an element"
+    // in a "close-enough" fashion.
+    static int lengthOfElement(TextNode tn) {
+        String wholeText = tn.getWholeText();
+
+        int length = 0;
+
+        int start = 0;
+        int end = wholeText.length() - 1;
+
+        while (start < wholeText.length() && Character.isWhitespace(wholeText.charAt(start)))
+            start++;
+        while (end >= 0 && Character.isWhitespace(wholeText.charAt(end)))
+            end--;
+
+        boolean lastWasWhitespace = false;
+        for (int i = start; i < end; i++) {
+            char c = wholeText.charAt(i);
+            if (Character.isWhitespace(c)) {
+                if (!lastWasWhitespace) {
+                    length++;
+                }
+
+                lastWasWhitespace = true;
+            } else {
+                length++;
+
+                lastWasWhitespace = false;
+            }
+        }
+
+        return length;
+    }
+}
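A minimal usage sketch (assuming a parsed jsoup Document in scope as doc; this mirrors how getLength() is rewired below):

    var mlv = new MeasureLengthVisitor();
    doc.traverse(mlv);            // one depth-first pass over the DOM, no intermediate Strings
    int textLength = mlv.length;  // trimmed, whitespace-collapsed text length

Node.traverse(NodeVisitor) walks the tree depth-first, so every TextNode in the document passes through head() exactly once.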
@@ -4,6 +4,7 @@ import com.google.inject.Inject;
 import com.google.inject.name.Named;
 import nu.marginalia.converting.processor.MetaRobotsTag;
 import nu.marginalia.converting.processor.logic.dom.DomPruningFilter;
+import nu.marginalia.converting.processor.logic.dom.MeasureLengthVisitor;
 import nu.marginalia.converting.processor.logic.links.FileLinks;
 import nu.marginalia.converting.processor.logic.links.LinkProcessor;
 import nu.marginalia.language.model.DocumentLanguageData;
@@ -111,10 +112,14 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
 
         var ret = new ProcessedDocumentDetails();
 
-        ret.length = getLength(doc);
-        ret.standard = getHtmlStandard(doc);
+        final int length = getLength(doc);
+        final HtmlStandard standard = getHtmlStandard(doc);
+        final double quality = documentValuator.getQuality(crawledDocument, standard, doc, length);
+
+        ret.length = length;
+        ret.standard = standard;
         ret.title = titleExtractor.getTitleAbbreviated(doc, dld, crawledDocument.url);
-        ret.quality = documentValuator.getQuality(crawledDocument, ret.standard, doc);
+        ret.quality = quality;
 
         // don't move this up! it uses title and quality
         // and is run before the heavy computations below
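Hoisting length, standard, and quality into final locals (and features below) means each expensive value is computed exactly once and shared; previously quality was derived after the fact from ret.standard, with the text length recomputed inside getQuality.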
@@ -123,15 +128,16 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
             throw new DisqualifiedException(DisqualificationReason.QUALITY);
         }
 
-        ret.features = featureExtractor.getFeatures(crawledDomain, doc, dld);
+        final Set<HtmlFeature> features = featureExtractor.getFeatures(crawledDomain, doc, dld);
+        ret.features = features;
         ret.hashCode = dld.localitySensitiveHashCode();
 
-        PubDate pubDate = pubDateSniffer.getPubDate(crawledDocument.headers, url, doc, ret.standard, true);
-        EnumSet<DocumentFlags> documentFlags = htmlFeatures2DocumentFlags(ret.features);
+        PubDate pubDate = pubDateSniffer.getPubDate(crawledDocument.headers, url, doc, standard, true);
+        EnumSet<DocumentFlags> documentFlags = htmlFeatures2DocumentFlags(features);
 
         ret.metadata = new DocumentMetadata(
                 documentLengthLogic.getEncodedAverageLength(dld),
-                pubDate.yearByte(), (int) -ret.quality, documentFlags);
+                pubDate.yearByte(), (int) -quality, documentFlags);
 
         DocumentKeywordsBuilder words = keywordExtractor.extractKeywords(dld, url);
@@ -141,8 +147,8 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
                 .addDomainCrawlData(crawledDomain)
                 .addPubDate(pubDate)
                 .addUrl(url)
-                .addFeatures(ret.features)
-                .addFormat(ret.standard)
+                .addFeatures(features)
+                .addFormat(standard)
                 .build();
 
         words.addAllSyntheticTerms(tagWords);
@@ -285,6 +291,9 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
     }
 
     private int getLength(Document doc) {
-        return doc.text().length();
+        var mlv = new MeasureLengthVisitor();
+        doc.traverse(mlv);
+        return mlv.length;
     }
 
 }
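This is the payoff of the new visitor: doc.text() forced jsoup to assemble the entire normalized document text only for .length() to be taken, whereas the traversal measures the text in place.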
@@ -0,0 +1,20 @@
+package nu.marginalia.converting.processor.logic.dom;
+
+import org.jsoup.Jsoup;
+import org.junit.jupiter.api.Test;
+
+import static org.junit.jupiter.api.Assertions.*;
+
+class MeasureLengthVisitorTest {
+
+    @Test
+    public void testMeasureLength() {
+        var mlv = new MeasureLengthVisitor();
+        Jsoup.parse("""
+                <p> hello world!
+                <span> neat! </span>
+                <p>
+                """).traverse(mlv);
+        assertEquals(15, mlv.length);
+    }
+}
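Why 15: lengthOfElement trims each text node and collapses interior whitespace runs to a single character, and because the counting loop's upper bound (i < end) is exclusive, the final character of each node goes uncounted, so "hello world!" contributes 11 and "neat!" contributes 4. The class is documented as best-effort, and this test pins that approximation down.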