From 67c15a34e67a8d4d256e263420d9d4d57f93e696 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Mon, 12 Jun 2023 17:42:31 +0200 Subject: [PATCH] Reduce the amount of expensive operations in HtmlDocumentProcessorPlugin. --- .../converting/processor/MetaRobotsTag.java | 2 +- .../processor/logic/DocumentValuator.java | 75 +++++++++++++------ .../processor/logic/dom/DomPruningFilter.java | 60 +++++++++++++-- .../logic/dom/MeasureLengthVisitor.java | 53 +++++++++++++ .../plugin/HtmlDocumentProcessorPlugin.java | 29 ++++--- .../logic/dom/MeasureLengthVisitorTest.java | 20 +++++ 6 files changed, 198 insertions(+), 41 deletions(-) create mode 100644 code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/dom/MeasureLengthVisitor.java create mode 100644 code/processes/converting-process/src/test/java/nu/marginalia/converting/processor/logic/dom/MeasureLengthVisitorTest.java diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/MetaRobotsTag.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/MetaRobotsTag.java index 2c061a72..6e8e3dbc 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/MetaRobotsTag.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/MetaRobotsTag.java @@ -9,7 +9,7 @@ public class MetaRobotsTag { private final String searchEngineName = "marginalia-search"; public boolean allowIndexingByMetaTag(Document doc) { - var robotsContent = doc.select("meta[name=robots]").attr("content"); + var robotsContent = doc.getElementsByTag("meta").select("meta[name=robots]").attr("content"); if (isForbidden(robotsContent)) { var marginaliaTag = doc.select( "meta[name=" + searchEngineName + "]").attr("content"); diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/DocumentValuator.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/DocumentValuator.java index 6c3bf267..59b208ec 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/DocumentValuator.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/DocumentValuator.java @@ -3,18 +3,23 @@ package nu.marginalia.converting.processor.logic; import crawlercommons.utils.Strings; import nu.marginalia.crawling.model.CrawledDocument; import nu.marginalia.converting.model.HtmlStandard; -import nu.marginalia.language.model.DocumentLanguageData; import nu.marginalia.converting.model.DisqualifiedException; +import org.jetbrains.annotations.NotNull; import org.jsoup.nodes.Document; - -import java.util.Set; +import org.jsoup.nodes.Element; +import org.jsoup.nodes.Node; +import org.jsoup.nodes.TextNode; +import org.jsoup.select.NodeVisitor; public class DocumentValuator { - public double getQuality(CrawledDocument crawledDocument, HtmlStandard htmlStandard, Document parsedDocument) throws DisqualifiedException { + public double getQuality(CrawledDocument crawledDocument, + HtmlStandard htmlStandard, + Document parsedDocument, + int textLength) throws DisqualifiedException { double scriptPenalty = getScriptPenalty(parsedDocument); - int textBodyLength = parsedDocument.text().length(); + int textBodyLength = textLength; int rawLength = crawledDocument.documentBody.length(); if (textBodyLength == 0) { @@ -28,24 +33,50 @@ public class DocumentValuator { private int getScriptPenalty(Document parsed) { - var scriptTags = parsed.getElementsByTag("script"); - String scriptText = scriptTags.html(); - int badScript = 0; - if (scriptText.contains(".createElement(")) { - badScript = 1; - } + var scriptVisitor = new ScriptVisitor(); - double scriptPenalty = 0; - for (var tag : scriptTags) { - String srcAttr = tag.attr("src"); - if (srcAttr.contains("wp-content") || srcAttr.contains("wp-includes") || srcAttr.contains("jquery")) { - scriptPenalty += 0.49; - } - else if (!Strings.isBlank(srcAttr)) { - scriptPenalty += 1; - } - } - return (int)(scriptPenalty + badScript + (scriptText.length())/1000.); + parsed.getElementsByTag("script").traverse(scriptVisitor); + + return scriptVisitor.score(); } + private static class ScriptVisitor implements NodeVisitor { + boolean hasBadScript = false; + int scriptLength = 0; + double penalty = 0.; + + public int score() { + return (int)(penalty + (hasBadScript?1:0) + (scriptLength)/1000.); + } + + @Override + public void head(@NotNull Node node, int depth) { + if (node instanceof Element el) { + visitTag(el); + } + else if (node instanceof TextNode tn) { + visitScriptText(tn); + + } + } + + private void visitScriptText(TextNode tn) { + String wholeText = tn.getWholeText(); + scriptLength += wholeText.length(); + + if (!hasBadScript) { + hasBadScript = wholeText.contains(".createElement("); + } + } + + public void visitTag(Element el) { + String srcAttr = el.attr("src"); + if (srcAttr.contains("wp-content") || srcAttr.contains("wp-includes") || srcAttr.contains("jquery")) { + penalty += 0.49; + } + else if (!Strings.isBlank(srcAttr)) { + penalty += 1; + } + } + } } diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/dom/DomPruningFilter.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/dom/DomPruningFilter.java index ba236aac..6bfacf27 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/dom/DomPruningFilter.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/dom/DomPruningFilter.java @@ -6,6 +6,7 @@ import org.jsoup.nodes.TextNode; import org.jsoup.select.NodeFilter; import java.util.HashMap; +import java.util.List; import java.util.Map; /** Prune the DOM and remove noisy branches with a lot of tags and not a lot of text. @@ -34,7 +35,7 @@ public class DomPruningFilter implements NodeFilter { final NodeData dataForNode; if (node instanceof TextNode tn) { - dataForNode = new NodeData(depth, tn.text().length(), 0); + dataForNode = new NodeData(depth, MeasureLengthVisitor.lengthOfElement(tn), 0); } else if (isSignal(node)) { dataForNode = new NodeData(depth, 0,0); @@ -61,24 +62,67 @@ public class DomPruningFilter implements NodeFilter { && dataForNode.treeSize > 3) return FilterResult.REMOVE; + if (node instanceof Element el) { + if (shouldAlwaysPurge(el)) { + return FilterResult.REMOVE; + } + } + return FilterResult.CONTINUE; } public boolean isSignal(Node node) { if (node instanceof Element e) { - if ("a".equalsIgnoreCase(e.tagName())) - return false; - if ("nav".equalsIgnoreCase(e.tagName())) - return false; - if ("footer".equalsIgnoreCase(e.tagName())) - return false; - if ("header".equalsIgnoreCase(e.tagName())) + + String tagName = e.tagName(); + + if ("a".equalsIgnoreCase(tagName)) return false; } return true; } + + final List badClassNames = List.of("cookie-banner", "cookie", "cookie-notice", "cookie-policy", + "nav", "navigation", "footer", "header", "menu", "toolbar", "tooltip", + "alert", "alertdialog", "banner", "onetrust-consent-sdk"); + final List badAriaRoles = List.of("alert", "alertdialog", "navigation", "banner", "dialog", "menu", "toolbar", "tooltip"); + + + private boolean shouldAlwaysPurge(Element el) { + + String tagName = el.tagName(); + + if ("nav".equalsIgnoreCase(tagName)) + return true; + if ("footer".equalsIgnoreCase(tagName)) + return true; + if ("header".equalsIgnoreCase(tagName)) + return true; + + var classNames = el.classNames(); + + for (var clazz : classNames) { + for (var bad : badClassNames) { + if (clazz.equalsIgnoreCase(bad)) + return true; + } + } + + var role = el.attr("role"); + + for (var bad : badAriaRoles) { + if (bad.equalsIgnoreCase(role)) + return true; + } + + var ariaHidden = el.attr("aria-hidden"); + if ("true".equalsIgnoreCase(ariaHidden)) + return true; + + return false; + } } class NodeData { diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/dom/MeasureLengthVisitor.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/dom/MeasureLengthVisitor.java new file mode 100644 index 00000000..3c693583 --- /dev/null +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/dom/MeasureLengthVisitor.java @@ -0,0 +1,53 @@ +package nu.marginalia.converting.processor.logic.dom; + +import org.jsoup.nodes.Node; +import org.jsoup.nodes.TextNode; +import org.jsoup.select.NodeVisitor; + +/** Best effort visitor to measure the length of the text in a DOM tree + * without allocating a bunch of Strings. + */ +public class MeasureLengthVisitor implements NodeVisitor { + public int length = 0; + + @Override + public void head(Node node, int depth) { + if (node instanceof TextNode tn) { + length += lengthOfElement(tn); + } + } + + // Emulate the HTML spec's definition of "length of an element" + // in a "close-enough" fashion. + static int lengthOfElement(TextNode tn) { + String wholeText = tn.getWholeText(); + + int length = 0; + + int start = 0; + int end = wholeText.length() - 1; + + while (start < wholeText.length() && Character.isWhitespace(wholeText.charAt(start))) + start++; + while (end >= 0 && Character.isWhitespace(wholeText.charAt(end))) + end--; + + boolean lastWasWhitespace = false; + for (int i = start; i < end; i++) { + char c = wholeText.charAt(i); + if (Character.isWhitespace(c)) { + if (!lastWasWhitespace) { + length++; + } + + lastWasWhitespace = true; + } else { + length++; + + lastWasWhitespace = false; + } + } + + return length; + } +} diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java index 60fb344f..5c43ca66 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java @@ -4,6 +4,7 @@ import com.google.inject.Inject; import com.google.inject.name.Named; import nu.marginalia.converting.processor.MetaRobotsTag; import nu.marginalia.converting.processor.logic.dom.DomPruningFilter; +import nu.marginalia.converting.processor.logic.dom.MeasureLengthVisitor; import nu.marginalia.converting.processor.logic.links.FileLinks; import nu.marginalia.converting.processor.logic.links.LinkProcessor; import nu.marginalia.language.model.DocumentLanguageData; @@ -111,10 +112,14 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin var ret = new ProcessedDocumentDetails(); - ret.length = getLength(doc); - ret.standard = getHtmlStandard(doc); + final int length = getLength(doc); + final HtmlStandard standard = getHtmlStandard(doc); + final double quality = documentValuator.getQuality(crawledDocument, standard, doc, length); + + ret.length = length; + ret.standard = standard; ret.title = titleExtractor.getTitleAbbreviated(doc, dld, crawledDocument.url); - ret.quality = documentValuator.getQuality(crawledDocument, ret.standard, doc); + ret.quality = quality; // don't move this up! it uses title and quality // and is run before the heavy computations below @@ -123,15 +128,16 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin throw new DisqualifiedException(DisqualificationReason.QUALITY); } - ret.features = featureExtractor.getFeatures(crawledDomain, doc, dld); + final Set features = featureExtractor.getFeatures(crawledDomain, doc, dld); + ret.features = features; ret.hashCode = dld.localitySensitiveHashCode(); - PubDate pubDate = pubDateSniffer.getPubDate(crawledDocument.headers, url, doc, ret.standard, true); - EnumSet documentFlags = htmlFeatures2DocumentFlags(ret.features); + PubDate pubDate = pubDateSniffer.getPubDate(crawledDocument.headers, url, doc, standard, true); + EnumSet documentFlags = htmlFeatures2DocumentFlags(features); ret.metadata = new DocumentMetadata( documentLengthLogic.getEncodedAverageLength(dld), - pubDate.yearByte(), (int) -ret.quality, documentFlags); + pubDate.yearByte(), (int) -quality, documentFlags); DocumentKeywordsBuilder words = keywordExtractor.extractKeywords(dld, url); @@ -141,8 +147,8 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin .addDomainCrawlData(crawledDomain) .addPubDate(pubDate) .addUrl(url) - .addFeatures(ret.features) - .addFormat(ret.standard) + .addFeatures(features) + .addFormat(standard) .build(); words.addAllSyntheticTerms(tagWords); @@ -285,6 +291,9 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin } private int getLength(Document doc) { - return doc.text().length(); + var mlv = new MeasureLengthVisitor(); + doc.traverse(mlv); + return mlv.length; } + } diff --git a/code/processes/converting-process/src/test/java/nu/marginalia/converting/processor/logic/dom/MeasureLengthVisitorTest.java b/code/processes/converting-process/src/test/java/nu/marginalia/converting/processor/logic/dom/MeasureLengthVisitorTest.java new file mode 100644 index 00000000..2b44d73f --- /dev/null +++ b/code/processes/converting-process/src/test/java/nu/marginalia/converting/processor/logic/dom/MeasureLengthVisitorTest.java @@ -0,0 +1,20 @@ +package nu.marginalia.converting.processor.logic.dom; + +import org.jsoup.Jsoup; +import org.junit.jupiter.api.Test; + +import static org.junit.jupiter.api.Assertions.*; + +class MeasureLengthVisitorTest { + + @Test + public void testMeasureLength() { + var mlv = new MeasureLengthVisitor(); + Jsoup.parse(""" +

hello world! + neat! +

+ """).traverse(mlv); + assertEquals(15, mlv.length); + } +} \ No newline at end of file