Reduce the number of expensive operations in HtmlDocumentProcessorPlugin.

This commit is contained in:
Viktor Lofgren 2023-06-12 17:42:31 +02:00 committed by Viktor
parent 9579cdd151
commit 67c15a34e6
6 changed files with 198 additions and 41 deletions

View File

@ -9,7 +9,7 @@ public class MetaRobotsTag {
private final String searchEngineName = "marginalia-search";
public boolean allowIndexingByMetaTag(Document doc) {
var robotsContent = doc.select("meta[name=robots]").attr("content");
var robotsContent = doc.getElementsByTag("meta").select("meta[name=robots]").attr("content");
if (isForbidden(robotsContent)) {
var marginaliaTag = doc.select( "meta[name=" + searchEngineName + "]").attr("content");

View File

@ -3,18 +3,23 @@ package nu.marginalia.converting.processor.logic;
import crawlercommons.utils.Strings;
import nu.marginalia.crawling.model.CrawledDocument;
import nu.marginalia.converting.model.HtmlStandard;
import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.converting.model.DisqualifiedException;
import org.jetbrains.annotations.NotNull;
import org.jsoup.nodes.Document;
import java.util.Set;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
import org.jsoup.select.NodeVisitor;
public class DocumentValuator {
public double getQuality(CrawledDocument crawledDocument, HtmlStandard htmlStandard, Document parsedDocument) throws DisqualifiedException {
public double getQuality(CrawledDocument crawledDocument,
HtmlStandard htmlStandard,
Document parsedDocument,
int textLength) throws DisqualifiedException {
double scriptPenalty = getScriptPenalty(parsedDocument);
int textBodyLength = parsedDocument.text().length();
int textBodyLength = textLength;
int rawLength = crawledDocument.documentBody.length();
if (textBodyLength == 0) {
@ -28,24 +33,50 @@ public class DocumentValuator {
private int getScriptPenalty(Document parsed) {
var scriptTags = parsed.getElementsByTag("script");
String scriptText = scriptTags.html();
int badScript = 0;
if (scriptText.contains(".createElement(")) {
badScript = 1;
}
var scriptVisitor = new ScriptVisitor();
double scriptPenalty = 0;
for (var tag : scriptTags) {
String srcAttr = tag.attr("src");
if (srcAttr.contains("wp-content") || srcAttr.contains("wp-includes") || srcAttr.contains("jquery")) {
scriptPenalty += 0.49;
}
else if (!Strings.isBlank(srcAttr)) {
scriptPenalty += 1;
}
}
return (int)(scriptPenalty + badScript + (scriptText.length())/1000.);
parsed.getElementsByTag("script").traverse(scriptVisitor);
return scriptVisitor.score();
}
/** Accumulates a script "penalty" while traversing the document's script tags:
 *  external scripts (especially WordPress/jQuery bundles) and inline script
 *  bulk both contribute to the score used by getScriptPenalty(). */
private static class ScriptVisitor implements NodeVisitor {
    boolean hasBadScript = false;
    int scriptLength = 0;
    double penalty = 0.;

    /** Final integer score: src-attribute penalties, plus 1 if any inline
     *  script calls .createElement(, plus 1 point per ~1000 chars of script text. */
    public int score() {
        double total = penalty;
        if (hasBadScript) {
            total += 1;
        }
        total += scriptLength / 1000.;
        return (int) total;
    }

    @Override
    public void head(@NotNull Node node, int depth) {
        if (node instanceof TextNode tn) {
            visitScriptText(tn);
        }
        else if (node instanceof Element el) {
            visitTag(el);
        }
    }

    private void visitScriptText(TextNode tn) {
        final String text = tn.getWholeText();
        scriptLength += text.length();
        // short-circuits once a bad script has been seen, so the substring
        // scan runs at most until the first match
        hasBadScript = hasBadScript || text.contains(".createElement(");
    }

    public void visitTag(Element el) {
        final String srcAttr = el.attr("src");
        if (Strings.isBlank(srcAttr)) {
            return; // inline scripts are penalized by length only, in visitScriptText
        }
        if (srcAttr.contains("wp-content") || srcAttr.contains("wp-includes") || srcAttr.contains("jquery")) {
            penalty += 0.49; // common CMS boilerplate: reduced penalty
        }
        else {
            penalty += 1;
        }
    }
}
}

View File

@ -6,6 +6,7 @@ import org.jsoup.nodes.TextNode;
import org.jsoup.select.NodeFilter;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/** Prune the DOM and remove noisy branches with a lot of tags and not a lot of text.
@ -34,7 +35,7 @@ public class DomPruningFilter implements NodeFilter {
final NodeData dataForNode;
if (node instanceof TextNode tn) {
dataForNode = new NodeData(depth, tn.text().length(), 0);
dataForNode = new NodeData(depth, MeasureLengthVisitor.lengthOfElement(tn), 0);
}
else if (isSignal(node)) {
dataForNode = new NodeData(depth, 0,0);
@ -61,24 +62,67 @@ public class DomPruningFilter implements NodeFilter {
&& dataForNode.treeSize > 3)
return FilterResult.REMOVE;
if (node instanceof Element el) {
if (shouldAlwaysPurge(el)) {
return FilterResult.REMOVE;
}
}
return FilterResult.CONTINUE;
}
public boolean isSignal(Node node) {
if (node instanceof Element e) {
if ("a".equalsIgnoreCase(e.tagName()))
return false;
if ("nav".equalsIgnoreCase(e.tagName()))
return false;
if ("footer".equalsIgnoreCase(e.tagName()))
return false;
if ("header".equalsIgnoreCase(e.tagName()))
String tagName = e.tagName();
if ("a".equalsIgnoreCase(tagName))
return false;
}
return true;
}
// Class names that typically mark page chrome and boilerplate (cookie banners,
// navigation, headers/footers) rather than indexable content.
final List<String> badClassNames = List.of("cookie-banner", "cookie", "cookie-notice", "cookie-policy",
"nav", "navigation", "footer", "header", "menu", "toolbar", "tooltip",
"alert", "alertdialog", "banner", "onetrust-consent-sdk");

// ARIA roles associated with transient UI and page chrome rather than content.
final List<String> badAriaRoles = List.of("alert", "alertdialog", "navigation", "banner", "dialog", "menu", "toolbar", "tooltip");

/** Elements removed unconditionally, regardless of text density:
 *  structural chrome tags, boilerplate class names, chrome ARIA roles,
 *  and anything marked hidden from assistive technology. */
private boolean shouldAlwaysPurge(Element el) {
    String tagName = el.tagName();

    if ("nav".equalsIgnoreCase(tagName)
            || "footer".equalsIgnoreCase(tagName)
            || "header".equalsIgnoreCase(tagName)) {
        return true;
    }

    for (String clazz : el.classNames()) {
        if (badClassNames.stream().anyMatch(clazz::equalsIgnoreCase)) {
            return true;
        }
    }

    String role = el.attr("role");
    if (badAriaRoles.stream().anyMatch(role::equalsIgnoreCase)) {
        return true;
    }

    // aria-hidden="true" marks content invisible to assistive tech; treat as noise
    return "true".equalsIgnoreCase(el.attr("aria-hidden"));
}
}
class NodeData {

View File

@ -0,0 +1,53 @@
package nu.marginalia.converting.processor.logic.dom;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
import org.jsoup.select.NodeVisitor;
/** Best effort visitor to measure the length of the text in a DOM tree
 * without allocating a bunch of Strings: whitespace is trimmed at both ends
 * of each text node and interior whitespace runs collapse to one character,
 * roughly emulating how much text the browser would render.
 */
public class MeasureLengthVisitor implements NodeVisitor {
    public int length = 0;

    @Override
    public void head(Node node, int depth) {
        if (node instanceof TextNode tn) {
            length += lengthOfElement(tn);
        }
    }

    // Emulate the HTML spec's definition of "length of an element"
    // in a "close-enough" fashion.
    static int lengthOfElement(TextNode tn) {
        String wholeText = tn.getWholeText();

        int length = 0;
        int start = 0;
        int end = wholeText.length() - 1;

        // trim leading whitespace
        while (start < wholeText.length() && Character.isWhitespace(wholeText.charAt(start)))
            start++;
        // trim trailing whitespace; for all-whitespace text, start > end and the loop below is skipped
        while (end >= 0 && Character.isWhitespace(wholeText.charAt(end)))
            end--;

        boolean lastWasWhitespace = false;
        // FIX: bound is inclusive (i <= end); the previous `i < end` silently
        // dropped the final non-whitespace character of every text node.
        for (int i = start; i <= end; i++) {
            char c = wholeText.charAt(i);
            if (Character.isWhitespace(c)) {
                // collapse a run of whitespace to a single counted character
                if (!lastWasWhitespace) {
                    length++;
                }
                lastWasWhitespace = true;
            } else {
                length++;
                lastWasWhitespace = false;
            }
        }

        return length;
    }
}

View File

@ -4,6 +4,7 @@ import com.google.inject.Inject;
import com.google.inject.name.Named;
import nu.marginalia.converting.processor.MetaRobotsTag;
import nu.marginalia.converting.processor.logic.dom.DomPruningFilter;
import nu.marginalia.converting.processor.logic.dom.MeasureLengthVisitor;
import nu.marginalia.converting.processor.logic.links.FileLinks;
import nu.marginalia.converting.processor.logic.links.LinkProcessor;
import nu.marginalia.language.model.DocumentLanguageData;
@ -111,10 +112,14 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
var ret = new ProcessedDocumentDetails();
ret.length = getLength(doc);
ret.standard = getHtmlStandard(doc);
final int length = getLength(doc);
final HtmlStandard standard = getHtmlStandard(doc);
final double quality = documentValuator.getQuality(crawledDocument, standard, doc, length);
ret.length = length;
ret.standard = standard;
ret.title = titleExtractor.getTitleAbbreviated(doc, dld, crawledDocument.url);
ret.quality = documentValuator.getQuality(crawledDocument, ret.standard, doc);
ret.quality = quality;
// don't move this up! it uses title and quality
// and is run before the heavy computations below
@ -123,15 +128,16 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
throw new DisqualifiedException(DisqualificationReason.QUALITY);
}
ret.features = featureExtractor.getFeatures(crawledDomain, doc, dld);
final Set<HtmlFeature> features = featureExtractor.getFeatures(crawledDomain, doc, dld);
ret.features = features;
ret.hashCode = dld.localitySensitiveHashCode();
PubDate pubDate = pubDateSniffer.getPubDate(crawledDocument.headers, url, doc, ret.standard, true);
EnumSet<DocumentFlags> documentFlags = htmlFeatures2DocumentFlags(ret.features);
PubDate pubDate = pubDateSniffer.getPubDate(crawledDocument.headers, url, doc, standard, true);
EnumSet<DocumentFlags> documentFlags = htmlFeatures2DocumentFlags(features);
ret.metadata = new DocumentMetadata(
documentLengthLogic.getEncodedAverageLength(dld),
pubDate.yearByte(), (int) -ret.quality, documentFlags);
pubDate.yearByte(), (int) -quality, documentFlags);
DocumentKeywordsBuilder words = keywordExtractor.extractKeywords(dld, url);
@ -141,8 +147,8 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
.addDomainCrawlData(crawledDomain)
.addPubDate(pubDate)
.addUrl(url)
.addFeatures(ret.features)
.addFormat(ret.standard)
.addFeatures(features)
.addFormat(standard)
.build();
words.addAllSyntheticTerms(tagWords);
@ -285,6 +291,9 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
}
private int getLength(Document doc) {
return doc.text().length();
var mlv = new MeasureLengthVisitor();
doc.traverse(mlv);
return mlv.length;
}
}

View File

@ -0,0 +1,20 @@
package nu.marginalia.converting.processor.logic.dom;
import org.jsoup.Jsoup;
import org.junit.jupiter.api.Test;
import static org.junit.jupiter.api.Assertions.*;
class MeasureLengthVisitorTest {
    @Test
    public void testMeasureLength() {
        var mlv = new MeasureLengthVisitor();
        Jsoup.parse("""
                <p> hello world!
                <span> neat! </span>
                <p>
                """).traverse(mlv);
        // Trimmed, whitespace-collapsed text node lengths:
        //   "hello world!" -> 12, "neat!" -> 5, inter-tag whitespace -> 0
        // Total: 17. (The previous expectation of 15 encoded an off-by-one
        // in lengthOfElement that dropped the last character of each node.)
        assertEquals(17, mlv.length);
    }
}