From 02abe498ffe267b03bcb1f8b9fcff7e209dc8d18 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Fri, 12 Aug 2022 13:50:57 +0200 Subject: [PATCH] master (#84) Co-authored-by: vlofgren Reviewed-on: https://git.marginalia.nu/marginalia/marginalia.nu/pulls/84 --- .../wmsa/configuration/WmsaHome.java | 5 + .../edge/assistant/suggest/Suggestions.java | 2 +- .../processor/DocumentProcessor.java | 52 +++- .../processor/logic/FeatureExtractor.java | 16 +- .../processor/logic/HtmlFeature.java | 2 + .../processor/logic/LinkParser.java | 22 +- .../processor/logic/LinkProcessor.java | 11 + .../processor/logic/QueryParams.java | 8 +- .../logic/topic/AdblockSimulator.java | 254 +++++++++++------- .../edge/crawling/CrawledDomainReader.java | 52 ++-- .../wmsa/edge/index/model/IndexBlock.java | 2 +- .../edge/model/search/EdgeUrlDetails.java | 2 + .../wmsa/edge/tools/AdblockTesterTool.java | 3 +- .../edge/tools/ConverterLogicTestTool.java | 56 ++++ ...rTool.java => CrawlDataExtractorTool.java} | 64 ++--- ...oaderTool.java => FeaturesLoaderTool.java} | 14 +- .../templates/edge/search-result-metadata.hdb | 1 + 17 files changed, 388 insertions(+), 178 deletions(-) create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/ConverterLogicTestTool.java rename marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/{RecipeDetectorTool.java => CrawlDataExtractorTool.java} (51%) rename marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/{RecipesLoaderTool.java => FeaturesLoaderTool.java} (88%) diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/WmsaHome.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/WmsaHome.java index 87768ca0..f82b9527 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/WmsaHome.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/WmsaHome.java @@ -46,6 +46,10 @@ public class WmsaHome { } } + public static Path getAdsDefinition() { + return 
getHomePath().resolve("data").resolve("adblock.txt"); + } + public static Path getIPLocationDatabse() { return getHomePath().resolve("data").resolve("IP2LOCATION-LITE-DB1.CSV"); } @@ -90,4 +94,5 @@ public class WmsaHome { home.resolve("model/English.DICT"), home.resolve("model/opennlp-tok.bin")); } + } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/suggest/Suggestions.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/suggest/Suggestions.java index 20531e2d..b8284420 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/suggest/Suggestions.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/suggest/Suggestions.java @@ -61,7 +61,7 @@ public class Suggestions { } catch (IOException ex) { logger.error("Failed to load suggestions file", ex); - return new PatriciaTrie(); + return new PatriciaTrie<>(); } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DocumentProcessor.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DocumentProcessor.java index 9107f62c..ee106cce 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DocumentProcessor.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DocumentProcessor.java @@ -16,6 +16,7 @@ import nu.marginalia.wmsa.edge.crawling.model.CrawledDocument; import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain; import nu.marginalia.wmsa.edge.crawling.model.CrawlerDocumentStatus; import nu.marginalia.wmsa.edge.index.model.IndexBlock; +import nu.marginalia.wmsa.edge.model.EdgeDomain; import nu.marginalia.wmsa.edge.model.EdgeUrl; import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard; import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet; @@ -26,6 +27,7 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.net.URISyntaxException; +import java.nio.file.Path; import java.util.*; import static 
nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard.UNKNOWN; @@ -199,8 +201,19 @@ public class DocumentProcessor { baseUrl = linkParser.getBaseLink(doc, baseUrl); + EdgeDomain domain = baseUrl.domain; + for (var atag : doc.getElementsByTag("a")) { - linkParser.parseLink(baseUrl, atag).ifPresent(lp::accept); + var linkOpt = linkParser.parseLinkPermissive(baseUrl, atag); + if (linkParser.shouldIndexLink(atag)) { + linkOpt.ifPresent(lp::accept); + } + else if (linkOpt.isPresent()) { + if (linkParser.hasBinarySuffix(linkOpt.get().toString())) { + linkOpt.ifPresent(lp::acceptNonIndexable); + } + } + } for (var frame : doc.getElementsByTag("frame")) { linkParser.parseFrame(baseUrl, frame).ifPresent(lp::accept); @@ -216,13 +229,44 @@ public class DocumentProcessor { final Set linkTerms = new HashSet<>(); - for (var domain : lp.getForeignDomains()) { - linkTerms.add("links:"+domain.toString().toLowerCase()); - linkTerms.add("links:"+domain.getDomain().toLowerCase()); + for (var fd : lp.getForeignDomains()) { + linkTerms.add("links:"+fd.toString().toLowerCase()); + linkTerms.add("links:"+fd.getDomain().toLowerCase()); } words.append(IndexBlock.Meta, linkTerms); + Set fileKeywords = new HashSet<>(100); + for (var link : lp.getNonIndexableUrls()) { + + if (!Objects.equals(domain, link.domain)) { + continue; + } + + synthesizeFilenameKeyword(fileKeywords, link); + + } + + words.append(IndexBlock.Artifacts, fileKeywords); + } + + private void synthesizeFilenameKeyword(Set fileKeywords, EdgeUrl link) { + + + Path pFilename = Path.of(link.path.toLowerCase()).getFileName(); + + if (pFilename == null) return; + + String filename = pFilename.toString(); + if (filename.length() > 32 + || filename.endsWith(".xml") + || filename.endsWith(".jpg") + || filename.endsWith(".png") + || filename.endsWith(".pdf") + || filename.endsWith(".gif")) + return; + + fileKeywords.add(filename.replace(' ', '_')); } private void checkDocumentLanguage(DocumentLanguageData dld) throws 
DisqualifiedException { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/FeatureExtractor.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/FeatureExtractor.java index ceb7e8c7..440f21ac 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/FeatureExtractor.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/FeatureExtractor.java @@ -1,5 +1,8 @@ package nu.marginalia.wmsa.edge.converting.processor.logic; +import com.google.inject.Inject; +import com.google.inject.Singleton; +import nu.marginalia.wmsa.edge.converting.processor.logic.topic.AdblockSimulator; import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain; import org.jsoup.nodes.Document; @@ -7,6 +10,7 @@ import java.util.HashSet; import java.util.List; import java.util.Set; +@Singleton public class FeatureExtractor { private static final List trackers = List.of("adform.net", @@ -29,6 +33,13 @@ public class FeatureExtractor { "d31qbv1cthcecs.cloudfront.net", "linkedin.com"); + private AdblockSimulator adblockSimulator; + + @Inject + public FeatureExtractor(AdblockSimulator adblockSimulator) { + this.adblockSimulator = adblockSimulator; + } + public Set getFeatures(CrawledDomain domain, Document doc) { Set features = new HashSet<>(); @@ -37,6 +48,9 @@ public class FeatureExtractor { if (scriptTags.size() > 0) { features.add(HtmlFeature.JS); } + else if(adblockSimulator.hasAds(doc.clone())) { // Only look for ads if there is javascript + features.add(HtmlFeature.ADVERTISEMENT); + } if (!doc.getElementsByTag("object").isEmpty() || !doc.getElementsByTag("audio").isEmpty() @@ -56,7 +70,7 @@ public class FeatureExtractor { if (doc.getElementsByTag("a").stream().map(e -> e.attr("href")) .map(String::toLowerCase) .anyMatch(href -> - href.contains("amzn.to/") || href.contains("amazon.com/"))) { + href.contains("amzn.to/") || (href.contains("amazon.com/") & 
href.contains("tag=")))) { features.add(HtmlFeature.AFFILIATE_LINK); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/HtmlFeature.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/HtmlFeature.java index c7ef9fd8..5744221d 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/HtmlFeature.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/HtmlFeature.java @@ -10,6 +10,8 @@ public enum HtmlFeature { COOKIES("special:cookies"), CATEGORY_FOOD("category:food"), + + ADVERTISEMENT("special:ads"), ; private final String keyword; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/LinkParser.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/LinkParser.java index d58b15bf..50dc3da6 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/LinkParser.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/LinkParser.java @@ -40,6 +40,17 @@ public class LinkParser { .flatMap(this::createEdgeUrl); } + @Contract(pure=true) + public Optional parseLinkPermissive(EdgeUrl relativeBaseUrl, Element l) { + return Optional.of(l) + .map(this::getUrl) + .map(link -> resolveUrl(relativeBaseUrl, link)) + .flatMap(this::createURI) + .map(URI::normalize) + .map(this::renormalize) + .flatMap(this::createEdgeUrl); + } + private Optional createURI(String s) { try { return Optional.of(new URI(s)); @@ -146,17 +157,20 @@ public class LinkParser { return s.matches("^[a-zA-Z]+:.*$"); } - private boolean shouldIndexLink(Element link) { + public boolean shouldIndexLink(Element link) { return isUrlRelevant(link.attr("href")) && isRelRelevant(link.attr("rel")); - } - private boolean isRelRelevant(String rel) { + public boolean isRelRelevant(String rel) { // this is null safe return 
!"noindex".equalsIgnoreCase(rel); } + public boolean hasBinarySuffix(String href) { + return blockSuffixList.stream().anyMatch(href::endsWith); + } + private boolean isUrlRelevant(String href) { if (null == href || "".equals(href)) { return false; @@ -164,7 +178,7 @@ public class LinkParser { if (blockPrefixList.stream().anyMatch(href::startsWith)) { return false; } - if (blockSuffixList.stream().anyMatch(href::endsWith)) { + if (hasBinarySuffix(href)) { return false; } if (href.length() > 128) { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/LinkProcessor.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/LinkProcessor.java index 54c47e4c..b94f90d8 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/LinkProcessor.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/LinkProcessor.java @@ -13,6 +13,9 @@ import java.util.Set; public class LinkProcessor { private final ProcessedDocumentDetails ret; private final EdgeUrl baseUrl; + + private final Set nonIndexable = new HashSet<>(); + private final Set seenUrls = new HashSet<>(); private final Set foreignDomains = new HashSet<>(); @@ -32,6 +35,10 @@ public class LinkProcessor { public Set getForeignDomains() { return foreignDomains; } + + public Set getNonIndexableUrls() { + return nonIndexable; + } public void accept(EdgeUrl link) { if (!isLinkPermitted(link)) { @@ -87,4 +94,8 @@ public class LinkProcessor { return proto.equalsIgnoreCase("http") || proto.equalsIgnoreCase("https"); } + + public void acceptNonIndexable(EdgeUrl edgeUrl) { + nonIndexable.add(edgeUrl); + } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/QueryParams.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/QueryParams.java index ad52e347..2e5ef542 100644 --- 
a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/QueryParams.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/QueryParams.java @@ -28,9 +28,9 @@ public class QueryParams { public static boolean isPermittedParam(String path, String param) { if (path.endsWith("index.php")) { - if (param.startsWith("showtopic")) + if (param.startsWith("showtopic=")) return true; - if (param.startsWith("showforum")) + if (param.startsWith("showforum=")) return true; } if (path.endsWith("viewtopic.php")) { @@ -45,6 +45,10 @@ public class QueryParams { if (path.endsWith("showforum.php")) { return param.startsWith("v="); } + + if (path.endsWith("StoryView.py")) { // folklore.org is neat + return param.startsWith("project=") || param.startsWith("story="); + } return false; } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/topic/AdblockSimulator.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/topic/AdblockSimulator.java index 2ed74810..199e05bc 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/topic/AdblockSimulator.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/topic/AdblockSimulator.java @@ -1,133 +1,181 @@ package nu.marginalia.wmsa.edge.converting.processor.logic.topic; +import com.google.inject.Inject; +import com.google.inject.Singleton; +import nu.marginalia.wmsa.configuration.WmsaHome; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.nodes.Node; import org.jsoup.select.NodeFilter; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; import java.util.ArrayList; +import java.util.HashSet; import java.util.List; +import java.util.Set; import java.util.function.Predicate; import java.util.regex.Pattern; +@Singleton public class 
AdblockSimulator { + private final Set idRules = new HashSet<>(); - List idRules = new ArrayList(); - List classRules = new ArrayList(); - List> scriptRules = new ArrayList(); + private final Set classRules = new HashSet<>(); + private final List> scriptRules = new ArrayList<>(); - public AdblockSimulator(Path adsDefinition) throws IOException { - try (var lineStream = Files.lines(adsDefinition)) { + private final Logger logger = LoggerFactory.getLogger(getClass()); + + @Inject + public AdblockSimulator() throws IOException { + Path adDef = WmsaHome.getAdsDefinition(); + + if (!Files.exists(adDef)) { + logger.error("Can not find ads definition file in {}", adDef); + return; + } + + try (var lineStream = Files.lines(adDef)) { lineStream.skip(1).forEach(this::addRule); } } - private void addRule(String s) { - if (s.startsWith("##") && !s.contains(":")) { - if (s.startsWith("###")) { - idRules.add(s.substring(3)); - } else if(s.startsWith("##.")) { - classRules.add(s.substring(3)); - } - } - else if (!s.startsWith("!") && !s.contains("#")){ - scriptRules.add(toRegexMatcher(s)); - } - } - - private Predicate toRegexMatcher(String s) { - - System.out.println("<-" + s); - - s = s.replaceAll("\\?", "\\\\?"); - s = s.replaceAll("\\.", "\\\\."); - s = s.replaceAll("\\$", "\\\\\\$"); - - if (s.startsWith("||")) { - s = s.replaceFirst("\\|\\|","^http(s)?://"); - } - - s = s.replaceAll("\\|", "\\\\|"); - s = s.replaceAll("\\*", ".*"); - s = s.replaceAll("\\^", "[?/]"); - - - System.out.println("->" + s); - return Pattern.compile(s).asPredicate(); - } - - class RuleVisitor implements NodeFilter { - public boolean sawAds; - Pattern spPattern = Pattern.compile("\\s"); - - @Override - public FilterResult head(Node node, int depth) { - - if (node.attributesSize() > 0 && node instanceof Element elem) { // instanceof is slow - - String id = elem.id(); - for (var rule : idRules) { - if (rule.equals(id)) { - sawAds = true; - return FilterResult.STOP; - } - } - - String classes = 
elem.className(); - if (classes.isBlank()) return FilterResult.CONTINUE; - - if (classes.indexOf(' ') > 0) { - String[] classNames = spPattern.split(classes); - for (var rule : classRules) { - - for (var className : classNames) { - if (className.equals(rule)) { - sawAds = true; - return FilterResult.STOP; - } - } - } - } - else { // tag only has one class - for (var rule : classRules) { - if (classes.equals(rule)) { - sawAds = true; - return FilterResult.STOP; - } - } - } - - if ("script".equals(elem.tagName())) { - String src = elem.attr("src"); - - for (var rule : scriptRules) { - if (rule.test(src)) { - sawAds = true; - return FilterResult.STOP; - } - } - } - - return FilterResult.CONTINUE; - } - return FilterResult.CONTINUE; - } - - @Override - public FilterResult tail(Node node, int depth) { - return FilterResult.CONTINUE; - } - } - public boolean hasAds(Document document) { - RuleVisitor ruleVisitor = new RuleVisitor(); + document.filter(ruleVisitor); return ruleVisitor.sawAds; } + private void addRule(String s) { + try { + if (s.startsWith("##") && !s.contains(":")) { + if (s.startsWith("###")) { + idRules.add(s.substring(3)); + } else if (s.startsWith("##.")) { + classRules.add(s.substring(3)); + } + } else if (s.startsWith("/^")) { + int end = s.indexOf("[^\\]/"); + if (end >= 0) { + String patternString = s.substring(1, end+1); + scriptRules.add(Pattern.compile(patternString).asPredicate()); + } + } else if (!s.startsWith("!") && !s.contains("#") && !s.startsWith("@@")) { + if (!s.contains("$")) { + scriptRules.add(toRegexMatcher(s)); + } + else if (s.contains("$script") && !s.contains("domain=")) { + scriptRules.add(toRegexMatcher(s.substring(0, s.indexOf('$')))); + } + } + } + catch (Exception ex) { + System.err.println("Failed to add rule " + s); + } + } + + private Predicate toRegexMatcher(String s) { + String sOriginal = s; + if (s.isBlank()) return unused -> false; + + // In some cases, regexes aren't necessary + if 
(s.matches("[&?=/A-Za-z0-9._-]+")) { + if (s.startsWith("/")) { + return str -> str.equals(sOriginal); + } + else { + return str -> str.contains(sOriginal); + } + } + if (s.matches("[&?=/A-Za-z0-9._-]+\\*")) { + return str -> str.startsWith(sOriginal.substring(0, sOriginal.length()-1)); + } + + String s0 = s; + s = s.replaceAll("\\?", "\\\\?"); + s = s.replaceAll("\\.", "\\\\."); + + s = s.replaceAll("\\^", "[?/]"); + s = s.replaceAll("\\*", ".*"); + + if (s.startsWith("||")) { + s = s.replaceFirst("\\|\\|","^http[s]?://.*"); + } + + s = s.replaceAll("\\|", "\\\\|"); + return Pattern.compile(s).asPredicate(); + } + + + // Refrain from cleaning up this code, it's very hot code and needs to be fast. + // This version is about 100x faster than a "clean" first-stab implementation. + + class RuleVisitor implements NodeFilter { + public boolean sawAds; + + Pattern spPattern = Pattern.compile("\\s"); + + @Override + public FilterResult head(Node node, int depth) { + + if (node.attributesSize() > 0 && node instanceof Element elem) { + if (testId(elem) || testClass(elem) || testScriptTags(elem)) { + sawAds = true; + return FilterResult.STOP; + } + } + return FilterResult.CONTINUE; + } + + private boolean testScriptTags(Element elem) { + if (!"script".equals(elem.tagName())) { + return false; + } + + String src = elem.attr("src"); + for (var rule : scriptRules) { + if (rule.test(src)) { + return true; + } + } + + return false; + } + + private boolean testId(Element elem) { + String id = elem.id(); + + return idRules.contains(id); + } + + private boolean testClass(Element elem) { + String classes = elem.className(); + if (classes.isBlank()) + return false; + + if (classes.indexOf(' ') > 0) { + String[] classNames = spPattern.split(classes); + for (var className : classNames) { + if (classRules.contains(className)) + return true; + } + } + else { // tag only has one class, no need to split + return classRules.contains(classes); + } + + return false; + } + @Override + 
public FilterResult tail(Node node, int depth) { + return FilterResult.CONTINUE; + } + + } + } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawledDomainReader.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawledDomainReader.java index 81121af7..acc9708c 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawledDomainReader.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawledDomainReader.java @@ -5,17 +5,17 @@ import com.google.gson.Gson; import com.google.gson.GsonBuilder; import nu.marginalia.wmsa.edge.crawling.model.CrawledDocument; import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import java.io.*; +import java.io.BufferedReader; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStreamReader; import java.nio.file.Path; import java.util.ArrayList; import java.util.List; public class CrawledDomainReader { private final Gson gson = new GsonBuilder().create(); - private static final Logger logger = LoggerFactory.getLogger(CrawledDomainReader.class); public CrawledDomainReader() { } @@ -23,29 +23,45 @@ public class CrawledDomainReader { public CrawledDomain read(Path path) throws IOException { List docs = new ArrayList<>(); CrawledDomain domain = null; - try (var br = new BufferedReader(new InputStreamReader(new ZstdInputStream(new BufferedInputStream(new FileInputStream(path.toFile())))))) { - String line; - while ((line = br.readLine()) != null) { - if (line.startsWith("//")) { - String nextLine = br.readLine(); - if (nextLine == null) break; - if (line.equals(CrawledDomain.SERIAL_IDENTIFIER)) { - domain = gson.fromJson(nextLine, CrawledDomain.class); - } else if (line.equals(CrawledDocument.SERIAL_IDENTIFIER)) { - docs.add(gson.fromJson(nextLine, CrawledDocument.class)); + + try (var br = new BufferedReader(new InputStreamReader(new ZstdInputStream(new 
FileInputStream(path.toFile()))))) { + br.mark(2); + boolean legacy = '{' == br.read(); + br.reset(); + + if (legacy) { + domain = gson.fromJson(br, CrawledDomain.class); + } + else { + String line; + while ((line = br.readLine()) != null) { + if (line.startsWith("//")) { + String nextLine = br.readLine(); + if (nextLine == null) break; + + if (line.equals(CrawledDomain.SERIAL_IDENTIFIER)) { + domain = gson.fromJson(nextLine, CrawledDomain.class); + } else if (line.equals(CrawledDocument.SERIAL_IDENTIFIER)) { + docs.add(gson.fromJson(nextLine, CrawledDocument.class)); + } + } else if (line.charAt(0) == '{') { + domain = gson.fromJson(line, CrawledDomain.class); } } - else if (line.charAt(0) == '{') { - domain = gson.fromJson(line, CrawledDomain.class); - } } } if (domain == null) { return null; } - domain.doc.addAll(docs); + + if (!docs.isEmpty()) { + if (domain.doc == null) + domain.doc = new ArrayList<>(); + + domain.doc.addAll(docs); + } return domain; } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/IndexBlock.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/IndexBlock.java index 73e733a5..819706fd 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/IndexBlock.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/IndexBlock.java @@ -11,7 +11,7 @@ public enum IndexBlock { Meta(7, 7), PositionWords(8, 4.5), NamesWords(9, 5), - Unused(10, 10), + Artifacts(10, 10), Topic(11, 0.5); public final int id; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeUrlDetails.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeUrlDetails.java index f6519362..0a7cd0c9 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeUrlDetails.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeUrlDetails.java @@ -132,6 +132,8 @@ public class EdgeUrlDetails { public boolean isCookies() 
{ return HtmlFeature.hasFeature(features, HtmlFeature.COOKIES); } + public boolean isAds() { return HtmlFeature.hasFeature(features, HtmlFeature.ADVERTISEMENT); } + public boolean isSpecialDomain() { return domainState == EdgeDomainIndexingState.SPECIAL; } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/AdblockTesterTool.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/AdblockTesterTool.java index fde620c3..bd3c0429 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/AdblockTesterTool.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/AdblockTesterTool.java @@ -19,7 +19,7 @@ public class AdblockTesterTool { static { try { - simulator = new AdblockSimulator(Path.of("/home/vlofgren/easylist.txt")); + simulator = new AdblockSimulator(); } catch (IOException e) { throw new RuntimeException(e); } @@ -29,7 +29,6 @@ public class AdblockTesterTool { public static void main(String... args) throws IOException { EdgeCrawlPlan plan = new CrawlPlanLoader().load(Path.of(args[0])); - try (var iterable = plan.domainsIterable()) { for (var domain : iterable) { processDomain(domain); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/ConverterLogicTestTool.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/ConverterLogicTestTool.java new file mode 100644 index 00000000..58fff3c6 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/ConverterLogicTestTool.java @@ -0,0 +1,56 @@ +package nu.marginalia.wmsa.edge.tools; + +import com.google.inject.Guice; +import com.google.inject.Inject; +import com.google.inject.Injector; +import nu.marginalia.wmsa.edge.converting.ConverterModule; +import nu.marginalia.wmsa.edge.converting.processor.DomainProcessor; +import nu.marginalia.wmsa.edge.crawling.CrawlPlanLoader; +import nu.marginalia.wmsa.edge.index.model.IndexBlock; +import nu.marginalia.wmsa.edge.model.EdgeCrawlPlan; +import org.slf4j.Logger; +import 
org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.nio.file.Path; + +public class ConverterLogicTestTool { + + private final Logger logger = LoggerFactory.getLogger(getClass()); + + public static void main(String... args) throws IOException { + + if (args.length != 1) { + System.err.println("Arguments: crawl-plan.yaml"); + System.exit(0); + } + var plan = new CrawlPlanLoader().load(Path.of(args[0])); + + Injector injector = Guice.createInjector( + new ConverterModule(plan) + ); + + injector.getInstance(ConverterLogicTestTool.class); + } + + @Inject + public ConverterLogicTestTool( + EdgeCrawlPlan plan, + DomainProcessor processor + ) throws Exception { + + plan.forEachCrawledDomain(domain -> { + var ret = processor.process(domain); + ret.documents.forEach(doc -> { + if (doc.words == null) + return; + var artifacts = doc.words.get(IndexBlock.Artifacts); + if (artifacts.size() > 0) { + System.out.println(doc.url + ": " + artifacts); + } + }); + }); + + } + +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/RecipeDetectorTool.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/CrawlDataExtractorTool.java similarity index 51% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/RecipeDetectorTool.java rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/CrawlDataExtractorTool.java index e1b6c822..b0e86d7f 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/RecipeDetectorTool.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/CrawlDataExtractorTool.java @@ -1,13 +1,8 @@ package nu.marginalia.wmsa.edge.tools; -import nu.marginalia.util.language.conf.LanguageModels; -import nu.marginalia.util.language.processing.SentenceExtractor; -import nu.marginalia.util.language.processing.model.DocumentLanguageData; -import nu.marginalia.wmsa.configuration.WmsaHome; +import lombok.SneakyThrows; import nu.marginalia.wmsa.configuration.module.DatabaseModule; 
-import nu.marginalia.wmsa.edge.converting.processor.logic.topic.RecipeDetector; -import nu.marginalia.wmsa.edge.converting.processor.logic.topic.TextileCraftDetector; -import nu.marginalia.wmsa.edge.converting.processor.logic.topic.WoodworkingDetector; +import nu.marginalia.wmsa.edge.converting.processor.logic.topic.AdblockSimulator; import nu.marginalia.wmsa.edge.crawling.CrawlPlanLoader; import nu.marginalia.wmsa.edge.crawling.model.CrawledDocument; import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain; @@ -20,21 +15,24 @@ import java.nio.file.Path; import java.sql.SQLException; import java.util.HashSet; import java.util.Set; -import java.util.concurrent.ForkJoinPool; -import java.util.concurrent.TimeUnit; +import java.util.concurrent.*; import static nu.marginalia.wmsa.edge.converting.processor.DocumentProcessor.isAcceptedContentType; -public class RecipeDetectorTool { - private static final TextileCraftDetector textileCraftDetector = new TextileCraftDetector(); - private static final WoodworkingDetector woodworkingDetector = new WoodworkingDetector(); - private static final RecipeDetector recipeDetector = new RecipeDetector(); +public class CrawlDataExtractorTool { + private static final AdblockSimulator abs; - private static final LanguageModels lm = WmsaHome.getLanguageModels(); - private static final SentenceExtractor sentenceExtractor = new SentenceExtractor(lm); + static { + try { + abs = new AdblockSimulator(); + } catch (IOException e) { + throw new RuntimeException(e); + } + } private static final Set urls = new HashSet<>(50_000_000); + @SneakyThrows public static void main(String... 
args) throws IOException { EdgeCrawlPlan plan = new CrawlPlanLoader().load(Path.of(args[0])); DatabaseModule module = new DatabaseModule(); @@ -51,15 +49,25 @@ public class RecipeDetectorTool { ex.printStackTrace(); } - ForkJoinPool pool = new ForkJoinPool(16); + LinkedBlockingQueue queue = new LinkedBlockingQueue<>(10); + ExecutorService pool = new ThreadPoolExecutor(10, 20, 5, TimeUnit.MINUTES, queue); + Semaphore sem = new Semaphore(20); try (var iterable = plan.domainsIterable()) { for (var domain : iterable) { - pool.execute(() -> processDomain(domain)); + sem.acquire(); + pool.execute(() -> { + try { processDomain(domain); } + finally { sem.release(); } + }); } + } catch (InterruptedException e) { + throw new RuntimeException(e); } - while (!pool.awaitQuiescence(1, TimeUnit.HOURS)); + pool.shutdown(); + + while (!pool.awaitTermination(1, TimeUnit.MINUTES)); } private static void processDomain(CrawledDomain domain) { @@ -78,24 +86,8 @@ public class RecipeDetectorTool { private static void processDocument(CrawledDocument doc) { Document parsedDocument = Jsoup.parse(doc.documentBody); - parsedDocument.getElementsByTag("a").remove(); - parsedDocument.getElementsByTag("nav").remove(); - - DocumentLanguageData dld = sentenceExtractor.extractSentences(parsedDocument); - - double prob = 100*recipeDetector.testP(dld); - if (prob > 50) { - System.out.printf("#%3.2f recipe\t%s\n%s\n", prob, parsedDocument.title(), doc.url); - } - - prob = 100*woodworkingDetector.testP(dld); - if (prob > 20) { - System.out.printf("#%3.2f woodworking\t%s\n%s\n", prob, parsedDocument.title(), doc.url); - } - - prob = 100*textileCraftDetector.testP(dld); - if (prob > 20) { - System.out.printf("#%3.2f textilecraft\t%s\n%s\n", prob, parsedDocument.title(), doc.url); + if (abs.hasAds(parsedDocument)) { + System.out.println(doc.url); } } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/RecipesLoaderTool.java 
b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/FeaturesLoaderTool.java similarity index 88% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/RecipesLoaderTool.java rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/FeaturesLoaderTool.java index 96fc9e47..2e3398da 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/RecipesLoaderTool.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/FeaturesLoaderTool.java @@ -3,6 +3,7 @@ package nu.marginalia.wmsa.edge.tools; import com.zaxxer.hikari.HikariDataSource; import nu.marginalia.wmsa.configuration.module.DatabaseModule; import nu.marginalia.wmsa.configuration.server.Context; +import nu.marginalia.wmsa.edge.converting.processor.logic.HtmlFeature; import nu.marginalia.wmsa.edge.index.client.EdgeIndexClient; import nu.marginalia.wmsa.edge.index.model.IndexBlock; import nu.marginalia.wmsa.edge.model.EdgeId; @@ -20,19 +21,20 @@ import java.util.List; import java.util.Map; import java.util.Objects; -import static nu.marginalia.wmsa.edge.converting.processor.logic.HtmlFeature.CATEGORY_FOOD; - -public class RecipesLoaderTool { +public class FeaturesLoaderTool { public static void main(String... args) { + HtmlFeature feature = HtmlFeature.valueOf(args[0]); + Path file = Path.of(args[1]); + try (EdgeIndexClient client = new EdgeIndexClient(); HikariDataSource ds = new DatabaseModule().provideConnection(); Connection conn = ds.getConnection(); PreparedStatement ps = conn.prepareStatement("UPDATE EC_PAGE_DATA SET FEATURES = FEATURES | ? 
WHERE ID=?"); - var linesStream = Files.lines(Path.of(args[0]))) { + var linesStream = Files.lines(file)) { var urls = getUrls(ds); - var wordSet = new EdgePageWordSet(new EdgePageWords(IndexBlock.Meta, List.of(CATEGORY_FOOD.getKeyword()))); + var wordSet = new EdgePageWordSet(new EdgePageWords(IndexBlock.Meta, List.of(feature.getKeyword()))); linesStream .map(urls::get) .filter(Objects::nonNull) @@ -42,7 +44,7 @@ public class RecipesLoaderTool { try { ps.setInt(2, urlId); - ps.setInt(1, CATEGORY_FOOD.getFeatureBit()); + ps.setInt(1, feature.getFeatureBit()); ps.executeUpdate(); } catch (SQLException ex) { diff --git a/marginalia_nu/src/main/resources/templates/edge/search-result-metadata.hdb b/marginalia_nu/src/main/resources/templates/edge/search-result-metadata.hdb index 9127d2e6..27e41fd7 100644 --- a/marginalia_nu/src/main/resources/templates/edge/search-result-metadata.hdb +++ b/marginalia_nu/src/main/resources/templates/edge/search-result-metadata.hdb @@ -3,6 +3,7 @@ {{#if media}}🎞️{{/if}} {{#if affiliate}}💳️{{/if}} {{#if cookies}}👁️️{{/if}} +{{#if ads}}⚠️️️{{/if}} {{format}} {{#unless focusDomain}} {{{rankingSymbol}}}