From 9c6e3b177299dc077aebe00e531d8f6e31875ab4 Mon Sep 17 00:00:00 2001
From: vlofgren
Date: Wed, 10 Aug 2022 15:04:25 +0200
Subject: [PATCH 1/4] Topical detection (experimental), Adblock simulation
 (experimental)

---
 .../logic/topic/AdblockSimulator.java         | 133 +++++++++++++++
 .../logic/{ => topic}/RecipeDetector.java     |   4 +-
 .../logic/topic/TextileCraftDetector.java     | 158 ++++++++++++++++++
 .../logic/topic/WoodworkingDetector.java      | 134 +++++++++++++++
 .../edge/crawling/CrawledDomainReader.java    |   9 +-
 .../wmsa/edge/crawling/WorkLog.java           |  21 ++-
 .../wmsa/edge/model/EdgeCrawlPlan.java        |  51 +++++-
 .../wmsa/edge/tools/AdblockTesterTool.java    |  58 +++++++
 .../wmsa/edge/tools/RecipeDetectorTool.java   |  33 +++-
 9 files changed, 578 insertions(+), 23 deletions(-)
 create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/topic/AdblockSimulator.java
 rename marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/{ => topic}/RecipeDetector.java (98%)
 create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/topic/TextileCraftDetector.java
 create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/topic/WoodworkingDetector.java
 create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/AdblockTesterTool.java

diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/topic/AdblockSimulator.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/topic/AdblockSimulator.java
new file mode 100644
index 00000000..2ed74810
--- /dev/null
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/topic/AdblockSimulator.java
@@ -0,0 +1,133 @@
+package nu.marginalia.wmsa.edge.converting.processor.logic.topic;
+
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.nodes.Node;
+import org.jsoup.select.NodeFilter;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.function.Predicate;
+import java.util.regex.Pattern;
+
+public class AdblockSimulator {
+
+    List<String> idRules = new ArrayList();
+    List<String> classRules = new ArrayList();
+    List<Predicate<String>> scriptRules = new ArrayList();
+
+    public AdblockSimulator(Path adsDefinition) throws IOException {
+        try (var lineStream = Files.lines(adsDefinition)) {
+            lineStream.skip(1).forEach(this::addRule);
+        }
+    }
+
+    private void addRule(String s) {
+        if (s.startsWith("##") && !s.contains(":")) {
+            if (s.startsWith("###")) {
+                idRules.add(s.substring(3));
+            } else if(s.startsWith("##.")) {
+                classRules.add(s.substring(3));
+            }
+        }
+        else if (!s.startsWith("!") && !s.contains("#")){
+            scriptRules.add(toRegexMatcher(s));
+        }
+    }
+
+    private Predicate<String> toRegexMatcher(String s) {
+
+        System.out.println("<-" + s);
+
+        s = s.replaceAll("\\?", "\\\\?");
+        s = s.replaceAll("\\.", "\\\\.");
+        s = s.replaceAll("\\$", "\\\\\\$");
+
+        if (s.startsWith("||")) {
+            s = s.replaceFirst("\\|\\|","^http(s)?://");
+        }
+
+        s = s.replaceAll("\\|", "\\\\|");
+        s = s.replaceAll("\\*", ".*");
+        s = s.replaceAll("\\^", "[?/]");
+
+
+        System.out.println("->" + s);
+        return Pattern.compile(s).asPredicate();
+    }
+
+    class RuleVisitor implements NodeFilter {
+        public boolean sawAds;
+        Pattern spPattern = Pattern.compile("\\s");
+
+        @Override
+        public FilterResult head(Node node, int depth) {
+
+            if (node.attributesSize() > 0 && node instanceof Element elem) { // instanceof is slow
+
+                String id = elem.id();
+                for (var rule : idRules) {
+                    if (rule.equals(id)) {
+                        sawAds = true;
+                        return FilterResult.STOP;
+                    }
+                }
+
+                String classes = elem.className();
+                if (classes.isBlank()) return FilterResult.CONTINUE;
+
+                if (classes.indexOf(' ') > 0) {
+                    String[] classNames = spPattern.split(classes);
+                    for (var rule : classRules) {
+
+                        for (var className : classNames) {
+                            if (className.equals(rule)) {
+                                sawAds = true;
+                                return FilterResult.STOP;
+                            }
+                        }
+                    }
+                }
+                else { // tag only has one class
+                    for (var rule : classRules) {
+                        if (classes.equals(rule)) {
+                            sawAds = true;
+                            return FilterResult.STOP;
+                        }
+                    }
+                }
+
+                if ("script".equals(elem.tagName())) {
+                    String src = elem.attr("src");
+
+                    for (var rule : scriptRules) {
+                        if (rule.test(src)) {
+                            sawAds = true;
+                            return FilterResult.STOP;
+                        }
+                    }
+                }
+
+                return FilterResult.CONTINUE;
+            }
+            return FilterResult.CONTINUE;
+        }
+
+        @Override
+        public FilterResult tail(Node node, int depth) {
+            return FilterResult.CONTINUE;
+        }
+    }
+
+    public boolean hasAds(Document document) {
+
+        RuleVisitor ruleVisitor = new RuleVisitor();
+        document.filter(ruleVisitor);
+
+        return ruleVisitor.sawAds;
+    }
+
+}
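
A note on the translation above: toRegexMatcher rewrites easylist-style URL patterns into Java regexes by escaping regex metacharacters and mapping the adblock wildcards ("||" anchors to the scheme, "*" is a wildcard, "^" is a separator character). A self-contained sketch of what a translated rule looks like in use — the rule here is a made-up example, not one taken from easylist.txt:

    import java.util.regex.Pattern;

    class AdblockRuleDemo {
        public static void main(String[] args) {
            // The hypothetical rule "||ads.*.example.com^" becomes, after the
            // substitutions performed by toRegexMatcher:
            Pattern p = Pattern.compile("^http(s)?://ads\\..*\\.example\\.com[?/]");

            System.out.println(p.asPredicate().test("https://ads.tracker.example.com/pixel")); // true
            System.out.println(p.asPredicate().test("https://www.example.com/index.html"));    // false
        }
    }
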
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/RecipeDetector.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/topic/RecipeDetector.java
similarity index 98%
rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/RecipeDetector.java
rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/topic/RecipeDetector.java
index 4b77cba2..17f8d992 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/RecipeDetector.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/topic/RecipeDetector.java
@@ -1,4 +1,4 @@
-package nu.marginalia.wmsa.edge.converting.processor.logic;
+package nu.marginalia.wmsa.edge.converting.processor.logic.topic;
 
 import ca.rmen.porterstemmer.PorterStemmer;
 import nu.marginalia.util.language.processing.model.DocumentLanguageData;
@@ -205,7 +205,7 @@ public class RecipeDetector {
 
     }
 
-    public double recipeP(DocumentLanguageData dld) {
+    public double testP(DocumentLanguageData dld) {
         Map<String, Double> values = new HashMap<>();
         int count = 0;
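
The rename from recipeP to testP gives all the topic detectors a common shape: build a DocumentLanguageData, call testP, read off a score. A rough usage sketch, mirroring what RecipeDetectorTool does further down in this patch (assumes WmsaHome's language models are configured):

    static double recipeScore(String html) {
        var sentenceExtractor = new SentenceExtractor(WmsaHome.getLanguageModels());
        var recipeDetector = new RecipeDetector();

        Document parsed = Jsoup.parse(html); // 'html' stands in for a crawled document body
        DocumentLanguageData dld = sentenceExtractor.extractSentences(parsed);

        return 100 * recipeDetector.testP(dld); // roughly a percentage-like score
    }
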
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/topic/TextileCraftDetector.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/topic/TextileCraftDetector.java
new file mode 100644
index 00000000..1146c620
--- /dev/null
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/topic/TextileCraftDetector.java
@@ -0,0 +1,158 @@
+package nu.marginalia.wmsa.edge.converting.processor.logic.topic;
+
+import ca.rmen.porterstemmer.PorterStemmer;
+import nu.marginalia.util.language.processing.model.DocumentLanguageData;
+
+import java.util.HashMap;
+import java.util.Map;
+
+import static java.lang.Math.max;
+import static java.lang.Math.sqrt;
+
+public class TextileCraftDetector {
+    private static final int AVG_LENGTH = 1000;
+
+    private final Map<String, Double> termValues = new HashMap<>();
+
+    public TextileCraftDetector() {
+        PorterStemmer ps = new PorterStemmer();
+
+        termValues.put(ps.stemWord("shop"), -0.1);
+        termValues.put(ps.stemWord("newsletter"), -0.1);
+        termValues.put(ps.stemWord("cart"), -0.1);
+        termValues.put(ps.stemWord("item"), -0.025);
+        termValues.put(ps.stemWord("price"), -0.1);
+        termValues.put(ps.stemWord("book"), -0.1);
+        termValues.put(ps.stemWord("order"), -0.1);
+        termValues.put(ps.stemWord("exhibition"), -0.1);
+
+        termValues.put(ps.stemWord("knit"), 0.05);
+        termValues.put(ps.stemWord("stitch"), 0.05);
+        termValues.put(ps.stemWord("yarn"), 0.05);
+        termValues.put(ps.stemWord("crochet"), 0.05);
+        termValues.put(ps.stemWord("ravelry"), 0.15);
+
+        termValues.put(ps.stemWord("stockinette"), 0.075);
+        termValues.put(ps.stemWord("purl"), 0.075);
+        termValues.put(ps.stemWord("ksp"), 0.075);
+        termValues.put(ps.stemWord("kwise"), 0.075);
+        termValues.put(ps.stemWord("k2tog"), 0.075);
+        termValues.put(ps.stemWord("k1b"), 0.075);
+        termValues.put(ps.stemWord("psso"), 0.075);
+        termValues.put(ps.stemWord("p2sso"), 0.075);
+        termValues.put(ps.stemWord("pwise"), 0.075);
+        termValues.put(ps.stemWord("yrn"), 0.075);
+        termValues.put(ps.stemWord("yon"), 0.075);
+        termValues.put(ps.stemWord("entrelac"), 0.075);
+        termValues.put(ps.stemWord("thrum"), 0.075);
+        termValues.put(ps.stemWord("bobbin"), 0.025);
+
+        termValues.put(ps.stemWord("boucle"), 0.075);
+        termValues.put(ps.stemWord("lopi"), 0.075);
+        termValues.put(ps.stemWord("eyelash"), 0.01);
+        termValues.put(ps.stemWord("variegated"), 0.075);
+
+        termValues.put(ps.stemWord("serge"), 0.04);
+        termValues.put(ps.stemWord("selvage"), 0.075);
+        termValues.put(ps.stemWord("topstitch"), 0.075);
+
+        termValues.put(ps.stemWord("gauge"), 0.01);
+        termValues.put(ps.stemWord("design"), 0.01);
+        termValues.put(ps.stemWord("pattern"), 0.01);
+        termValues.put(ps.stemWord("layer"), 0.01);
+        termValues.put(ps.stemWord("color"), 0.01);
+        termValues.put(ps.stemWord("colour"), 0.01);
+        termValues.put(ps.stemWord("chart"), 0.01);
+        termValues.put(ps.stemWord("grid"), 0.01);
+        termValues.put(ps.stemWord("wool"), 0.01);
+        termValues.put(ps.stemWord("acrylic"), 0.01);
+        termValues.put(ps.stemWord("loose"), 0.01);
+        termValues.put(ps.stemWord("loop"), 0.01);
+        termValues.put(ps.stemWord("needle"), 0.01);
+        termValues.put(ps.stemWord("row"), 0.01);
+        termValues.put(ps.stemWord("circular"), 0.01);
+        termValues.put(ps.stemWord("sew"), 0.01);
+        termValues.put(ps.stemWord("size"), 0.01);
+        termValues.put(ps.stemWord("repeat"), 0.01);
+        termValues.put(ps.stemWord("repetition"), 0.01);
+        termValues.put(ps.stemWord("basketweave"), 0.01);
+        termValues.put(ps.stemWord("weave"), 0.01);
+        termValues.put(ps.stemWord("loom"), 0.01);
+        termValues.put(ps.stemWord("warp"), 0.01);
+        termValues.put(ps.stemWord("weft"), 0.01);
+        termValues.put(ps.stemWord("shuttle"), 0.01);
+        termValues.put(ps.stemWord("brioche"), 0.01);
+        termValues.put(ps.stemWord("spool"), 0.01);
+        termValues.put(ps.stemWord("hem"), 0.01);
+        termValues.put(ps.stemWord("bodice"), 0.01);
+        termValues.put(ps.stemWord("seam"), 0.01);
+        termValues.put(ps.stemWord("allowance"), 0.01);
+        termValues.put(ps.stemWord("crinoline"), 0.01);
+        termValues.put(ps.stemWord("petticoat"), 0.01);
+        termValues.put(ps.stemWord("armscye"), 0.01);
+        termValues.put(ps.stemWord("baste"), 0.01);
+        termValues.put(ps.stemWord("cord"), 0.01);
+        termValues.put(ps.stemWord("darning"), 0.01);
+        termValues.put(ps.stemWord("draping"), 0.01);
+        termValues.put(ps.stemWord("embroider"), 0.01);
+        termValues.put(ps.stemWord("eyelet"), 0.01);
+        termValues.put(ps.stemWord("godet"), 0.01);
+        termValues.put(ps.stemWord("gore"), 0.01);
+        termValues.put(ps.stemWord("grain"), 0.01);
+        termValues.put(ps.stemWord("jersey"), 0.01);
+        termValues.put(ps.stemWord("lining"), 0.01);
+        termValues.put(ps.stemWord("muslin"), 0.01);
+        termValues.put(ps.stemWord("needlework"), 0.01);
+        termValues.put(ps.stemWord("pleat"), 0.01);
+        termValues.put(ps.stemWord("quilt"), 0.01);
+        termValues.put(ps.stemWord("silk"), 0.01);
+
+        termValues.put(ps.stemWord("sloper"), 0.01);
+        termValues.put(ps.stemWord("surplice"), 0.01);
+        termValues.put(ps.stemWord("thread"), 0.01);
+        termValues.put(ps.stemWord("twill"), 0.01);
+
+        termValues.put(ps.stemWord("ch"), 0.01);
+        termValues.put(ps.stemWord("sp"), 0.01);
+        termValues.put(ps.stemWord("sl"), 0.01);
+        termValues.put(ps.stemWord("sc"), 0.01);
+        termValues.put(ps.stemWord("ss"), 0.01);
+        termValues.put(ps.stemWord("hdc"), 0.01);
+        termValues.put(ps.stemWord("turn"), 0.01);
+        termValues.put(ps.stemWord("skip"), 0.01);
+        termValues.put(ps.stemWord("round"), 0.01);
+        termValues.put(ps.stemWord("ring"), 0.01);
+
+        termValues.put(ps.stemWord("sequin"), 0.01);
+        termValues.put(ps.stemWord("bobble"), 0.01);
+        termValues.put(ps.stemWord("puff"), 0.01);
+        termValues.put(ps.stemWord("v-stitch"), 0.01);
+    }
+
+    public double testP(DocumentLanguageData dld) {
+
+        Map<String, Double> values = new HashMap<>();
+        int count = 0;
+        for (var sentence : dld.sentences) {
+
+            for (var word : sentence) {
+                count++;
+
+                final String stemmed = word.stemmed();
+                final Double value = termValues.get(stemmed);
+
+                if (value != null) {
+                    values.merge(stemmed, value, (a,b) -> 0.5*a + b);
+                }
+            }
+
+        }
+
+        if (count == 0) return 0.;
+
+        double lengthPenalty = sqrt(AVG_LENGTH)/sqrt(max(AVG_LENGTH, count));
+
+        return values.values().stream().mapToDouble(Double::valueOf).sum() * lengthPenalty;
+    }
+
+}
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/topic/WoodworkingDetector.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/topic/WoodworkingDetector.java
new file mode 100644
index 00000000..bb4a0cd0
--- /dev/null
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/topic/WoodworkingDetector.java
@@ -0,0 +1,134 @@
+package nu.marginalia.wmsa.edge.converting.processor.logic.topic;
+
+import ca.rmen.porterstemmer.PorterStemmer;
+import nu.marginalia.util.language.processing.model.DocumentLanguageData;
+
+import java.util.HashMap;
+import java.util.Map;
+
+import static java.lang.Math.max;
+import static java.lang.Math.sqrt;
+
+public class WoodworkingDetector {
+    private static final int AVG_LENGTH = 1000;
+
+    private final Map<String, Double> termValues = new HashMap<>();
+
+    public WoodworkingDetector() {
+        PorterStemmer ps = new PorterStemmer();
+
+        termValues.put(ps.stemWord("shop"), -0.1);
+        termValues.put(ps.stemWord("newsletter"), -0.1);
+        termValues.put(ps.stemWord("cart"), -0.1);
+        termValues.put(ps.stemWord("item"), -0.025);
+        termValues.put(ps.stemWord("price"), -0.1);
+        termValues.put(ps.stemWord("book"), -0.1);
+        termValues.put(ps.stemWord("order"), -0.1);
+        termValues.put(ps.stemWord("exhibition"), -0.1);
+
+        // woodworking and joinery
+        termValues.put(ps.stemWord("apse"), 0.01);
+        termValues.put(ps.stemWord("baluster"), 0.01);
+        termValues.put(ps.stemWord("beam"), 0.01);
+        termValues.put(ps.stemWord("cornice"), 0.01);
+        termValues.put(ps.stemWord("drill"), 0.01);
+        termValues.put(ps.stemWord("nail"), 0.01);
+        termValues.put(ps.stemWord("saw"), 0.01);
+        termValues.put(ps.stemWord("hacksaw"), 0.01);
+        termValues.put(ps.stemWord("bandsaw"), 0.01);
+        termValues.put(ps.stemWord("whipsaw"), 0.01);
+        termValues.put(ps.stemWord("gimlet"), 0.01);
+        termValues.put(ps.stemWord("clamp"), 0.01);
+        termValues.put(ps.stemWord("glue"), 0.01);
+        termValues.put(ps.stemWord("cut"), 0.01);
+        termValues.put(ps.stemWord("plane"), 0.01);
+        termValues.put(ps.stemWord("sand"), 0.01);
+        termValues.put(ps.stemWord("bevel"), 0.01);
+        termValues.put(ps.stemWord("chamfer"), 0.01);
+        termValues.put(ps.stemWord("dado"), 0.075);
+        termValues.put(ps.stemWord("dowel"), 0.05);
+        termValues.put(ps.stemWord("dovetail"), 0.05);
+        termValues.put(ps.stemWord("joint"), 0.01);
+        termValues.put(ps.stemWord("level"), 0.01);
+        termValues.put(ps.stemWord("edge"), 0.01);
+        termValues.put(ps.stemWord("face"), 0.01);
+        termValues.put(ps.stemWord("fibreboard"), 0.01);
+        termValues.put(ps.stemWord("fiberboard"), 0.01);
+        termValues.put(ps.stemWord("battens"), 0.01);
+        termValues.put(ps.stemWord("furring"), 0.01);
+        termValues.put(ps.stemWord("glulam"), 0.025);
+        termValues.put(ps.stemWord("hardboard"), 0.025);
+        termValues.put(ps.stemWord("hardwood"), 0.01);
+        termValues.put(ps.stemWord("jamb"), 0.015);
+        termValues.put(ps.stemWord("kerf"), 0.025);
+        termValues.put(ps.stemWord("lvl"), 0.025);
+        termValues.put(ps.stemWord("laminated"), 0.01);
+        termValues.put(ps.stemWord("lignin"), 0.01);
+        termValues.put(ps.stemWord("mitre"), 0.01);
+        termValues.put(ps.stemWord("mortise"), 0.015);
+        termValues.put(ps.stemWord("mullion"), 0.01);
+        termValues.put(ps.stemWord("newel"), 0.01);
+        termValues.put(ps.stemWord("nogging"), 0.01);
+        termValues.put(ps.stemWord("ogee"), 0.01);
+        termValues.put(ps.stemWord("ogive"), 0.01);
+        termValues.put(ps.stemWord("ovolo"), 0.01);
+        termValues.put(ps.stemWord("drawknife"), 0.01);
+        termValues.put(ps.stemWord("plywood"), 0.01);
+        termValues.put(ps.stemWord("purlin"), 0.01);
+        termValues.put(ps.stemWord("riser"), 0.01);
+        termValues.put(ps.stemWord("sapwood"), 0.01);
+        termValues.put(ps.stemWord("shingle"), 0.01);
+        termValues.put(ps.stemWord("softwood"), 0.01);
+        termValues.put(ps.stemWord("sapwood"), 0.01);
+        termValues.put(ps.stemWord("stave"), 0.01);
+        termValues.put(ps.stemWord("stopper"), 0.01);
+        termValues.put(ps.stemWord("stud"), 0.01); // beep beep beep, huh, the stud detector seems to work just fine :D
+        termValues.put(ps.stemWord("transom"), 0.01);
+        termValues.put(ps.stemWord("v-joint"), 0.015);
+        termValues.put(ps.stemWord("veneer"), 0.01);
+        termValues.put(ps.stemWord("quartersaw"), 0.015);
+        termValues.put(ps.stemWord("screw"), 0.01);
+        termValues.put(ps.stemWord("woodturning"), 0.01);
+
+        termValues.put(ps.stemWord("pine"), 0.005);
+        termValues.put(ps.stemWord("balsa"), 0.01);
+        termValues.put(ps.stemWord("poplar"), 0.005);
+
+        termValues.put(ps.stemWord("nut"), 0.01);
+        termValues.put(ps.stemWord("bolt"), 0.01);
+        termValues.put(ps.stemWord("tack"), 0.01);
+        termValues.put(ps.stemWord("hinge"), 0.01);
+        termValues.put(ps.stemWord("brass"), 0.01);
+        termValues.put(ps.stemWord("fitting"), 0.01);
+
+        termValues.put(ps.stemWord("diy"), 0.015);
+        termValues.put(ps.stemWord("dozuki"), 0.01);
+    }
+
+    public double testP(DocumentLanguageData dld) {
+
+        Map<String, Double> values = new HashMap<>();
+        int count = 0;
+        for (var sentence : dld.sentences) {
+
+            for (var word : sentence) {
+                count++;
+
+                final String stemmed = word.stemmed();
+                final Double value = termValues.get(stemmed);
+
+                if (value != null) {
+                    values.merge(stemmed, value, (a,b) -> 0.5*a + b);
+                }
+            }
+
+        }
+
+        if (count == 0) return 0.;
+
+        double lengthPenalty = sqrt(AVG_LENGTH)/sqrt(max(AVG_LENGTH, count));
+
+        return values.values().stream().mapToDouble(Double::valueOf).sum() * lengthPenalty;
+    }
+
+}
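
Both detectors above share the same scoring model: a term's first occurrence contributes its full weight, repeat occurrences fold in as 0.5 * previous + weight, and the total is damped by the length penalty sqrt(AVG_LENGTH) / sqrt(max(AVG_LENGTH, wordCount)). A worked example with illustrative numbers: a 4000-word page mentioning "dovetail" (0.05) twice and "kerf" (0.025) once accumulates (0.5 * 0.05 + 0.05) + 0.025 = 0.1, and the length penalty sqrt(1000/4000) = 0.5 halves that to a final score of 0.05.
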
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawledDomainReader.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawledDomainReader.java
index 0bd36d06..21b80993 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawledDomainReader.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawledDomainReader.java
@@ -22,5 +22,12 @@ public class CrawledDomainReader {
             return gson.fromJson(br, CrawledDomain.class);
         }
     }
-
+    public CrawledDomain readRuntimeExcept(Path path) {
+        try (var br = new BufferedReader(new InputStreamReader(new ZstdInputStream(new BufferedInputStream(new FileInputStream(path.toFile())))))) {
+            return gson.fromJson(br, CrawledDomain.class);
+        }
+        catch (Exception ex) {
+            throw new RuntimeException(ex);
+        }
+    }
 }
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/WorkLog.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/WorkLog.java
index 276d3651..fb5bf5b2 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/WorkLog.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/WorkLog.java
@@ -1,9 +1,11 @@
 package nu.marginalia.wmsa.edge.crawling;
 
+import com.google.errorprone.annotations.MustBeClosed;
 import nu.marginalia.wmsa.edge.crawling.model.CrawlLogEntry;
 import org.apache.logging.log4j.util.Strings;
 
-import java.io.*;
+import java.io.FileOutputStream;
+import java.io.IOException;
 import java.nio.charset.StandardCharsets;
 import java.nio.file.Files;
 import java.nio.file.Path;
@@ -12,6 +14,7 @@ import java.util.HashSet;
 import java.util.Set;
 import java.util.function.Consumer;
 import java.util.regex.Pattern;
+import java.util.stream.Stream;
 
 public class WorkLog implements AutoCloseable {
     private final Set<String> finishedJobs = new HashSet<>();
@@ -29,15 +32,21 @@ public class WorkLog implements AutoCloseable {
             return;
         }
 
-        try (var lines = Files.lines(logFile)) {
-            lines.filter(WorkLog::isJobId).map(line -> {
-                String[] parts = line.split("\\s+");
-                return new CrawlLogEntry(parts[0], parts[1], parts[2], Integer.parseInt(parts[3]));
-            }).forEach(entryConsumer);
+        try (var entries = streamLog(logFile)) {
+            entries.forEach(entryConsumer);
         } catch (IOException e) {
             e.printStackTrace();
         }
     }
+
+    @MustBeClosed
+    public static Stream<CrawlLogEntry> streamLog(Path logFile) throws IOException {
+        return Files.lines(logFile).filter(WorkLog::isJobId).map(line -> {
+            String[] parts = line.split("\\s+");
+            return new CrawlLogEntry(parts[0], parts[1], parts[2], Integer.parseInt(parts[3]));
+        });
+    }
+
     private void loadLog(Path logFile) throws IOException {
         if (!Files.exists(logFile)) {
             return;
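
The @MustBeClosed annotation on streamLog is the point of this refactor: the returned Stream wraps Files.lines and holds an open file handle, so the caller must close it. A minimal sketch of the intended call pattern — the same shape readLog above and EdgeCrawlPlan below use; logFile is a stand-in path:

    try (Stream<CrawlLogEntry> entries = WorkLog.streamLog(logFile)) {
        entries.forEach(entry -> System.out.println(entry.path()));
    }
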
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeCrawlPlan.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeCrawlPlan.java
index 3515c48a..926b9d74 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeCrawlPlan.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeCrawlPlan.java
@@ -1,15 +1,20 @@
 package nu.marginalia.wmsa.edge.model;
 
+import com.google.errorprone.annotations.MustBeClosed;
 import lombok.AllArgsConstructor;
 import lombok.NoArgsConstructor;
 import lombok.ToString;
 import nu.marginalia.wmsa.edge.crawling.CrawledDomainReader;
 import nu.marginalia.wmsa.edge.crawling.WorkLog;
+import nu.marginalia.wmsa.edge.crawling.model.CrawlLogEntry;
 import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain;
+import org.jetbrains.annotations.NotNull;
 
 import java.io.IOException;
 import java.nio.file.Path;
+import java.util.Iterator;
 import java.util.function.Consumer;
+import java.util.stream.Stream;
 
 @AllArgsConstructor @NoArgsConstructor @ToString
 public class EdgeCrawlPlan {
@@ -49,12 +54,44 @@ public class EdgeCrawlPlan {
     public void forEachCrawledDomain(Consumer<CrawledDomain> consumer) {
         final CrawledDomainReader reader = new CrawledDomainReader();
 
-        WorkLog.readLog(crawl.getLogFile(), entry -> {
-            try {
-                consumer.accept(reader.read(getCrawledFilePath(entry.path())));
-            } catch (IOException e) {
-                throw new RuntimeException(e);
-            }
-        });
+        try (Stream<CrawlLogEntry> entryStream = WorkLog.streamLog(crawl.getLogFile())) {
+            entryStream
+                    .map(CrawlLogEntry::path)
+                    .map(this::getCrawledFilePath)
+                    .map(reader::readRuntimeExcept)
+                    .forEach(consumer);
+        }
+        catch (IOException ex) {
+            throw new RuntimeException(ex);
+        }
+    }
+
+    @MustBeClosed
+    public DomainsIterable domainsIterable() throws IOException {
+        return new DomainsIterable();
+    }
+
+    public class DomainsIterable implements Iterable<CrawledDomain>, AutoCloseable {
+        private final Stream<CrawledDomain> stream;
+
+        DomainsIterable() throws IOException {
+            final CrawledDomainReader reader = new CrawledDomainReader();
+
+            stream = WorkLog.streamLog(crawl.getLogFile())
+                    .map(CrawlLogEntry::path)
+                    .map(EdgeCrawlPlan.this::getCrawledFilePath)
+                    .map(reader::readRuntimeExcept);
+        }
+
+        @Override
+        public void close() {
+            stream.close();
+        }
+
+        @NotNull
+        @Override
+        public Iterator<CrawledDomain> iterator() {
+            return stream.iterator();
+        }
     }
 }
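
Because DomainsIterable wraps a Stream, it is effectively single-use — iterator() can only be consumed once per instance — and it should live inside try-with-resources so the underlying work log file is closed. A usage sketch (AdblockTesterTool below follows exactly this shape; processDomain is a placeholder for per-domain work):

    try (var domains = plan.domainsIterable()) {
        for (CrawledDomain domain : domains) {
            processDomain(domain);
        }
    }
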
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/AdblockTesterTool.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/AdblockTesterTool.java
new file mode 100644
index 00000000..fde620c3
--- /dev/null
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/AdblockTesterTool.java
@@ -0,0 +1,58 @@
+package nu.marginalia.wmsa.edge.tools;
+
+import nu.marginalia.wmsa.edge.converting.processor.logic.topic.AdblockSimulator;
+import nu.marginalia.wmsa.edge.crawling.CrawlPlanLoader;
+import nu.marginalia.wmsa.edge.crawling.model.CrawledDocument;
+import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain;
+import nu.marginalia.wmsa.edge.model.EdgeCrawlPlan;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+
+import java.io.IOException;
+import java.nio.file.Path;
+
+import static nu.marginalia.wmsa.edge.converting.processor.DocumentProcessor.isAcceptedContentType;
+
+public class AdblockTesterTool {
+
+    static AdblockSimulator simulator;
+
+    static {
+        try {
+            simulator = new AdblockSimulator(Path.of("/home/vlofgren/easylist.txt"));
+        } catch (IOException e) {
+            throw new RuntimeException(e);
+        }
+    }
+
+
+    public static void main(String...
args) throws IOException { + EdgeCrawlPlan plan = new CrawlPlanLoader().load(Path.of(args[0])); + + + try (var iterable = plan.domainsIterable()) { + for (var domain : iterable) { + processDomain(domain); + } + } + + } + + private static void processDomain(CrawledDomain domain) { + if (domain.doc == null) return; + for (var doc : domain.doc) { + if (isAcceptedContentType(doc) && "OK".equals(doc.crawlerStatus)) { + processDocument(doc); + } + } + } + + + private static void processDocument(CrawledDocument doc) { + Document parsedDocument = Jsoup.parse(doc.documentBody); + + if (simulator.hasAds(parsedDocument)) { + System.out.println(doc.url); + } + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/RecipeDetectorTool.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/RecipeDetectorTool.java index 480e85b8..e1b6c822 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/RecipeDetectorTool.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/RecipeDetectorTool.java @@ -5,9 +5,10 @@ import nu.marginalia.util.language.processing.SentenceExtractor; import nu.marginalia.util.language.processing.model.DocumentLanguageData; import nu.marginalia.wmsa.configuration.WmsaHome; import nu.marginalia.wmsa.configuration.module.DatabaseModule; -import nu.marginalia.wmsa.edge.converting.processor.logic.RecipeDetector; +import nu.marginalia.wmsa.edge.converting.processor.logic.topic.RecipeDetector; +import nu.marginalia.wmsa.edge.converting.processor.logic.topic.TextileCraftDetector; +import nu.marginalia.wmsa.edge.converting.processor.logic.topic.WoodworkingDetector; import nu.marginalia.wmsa.edge.crawling.CrawlPlanLoader; -import nu.marginalia.wmsa.edge.crawling.CrawledDomainReader; import nu.marginalia.wmsa.edge.crawling.model.CrawledDocument; import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain; import nu.marginalia.wmsa.edge.model.EdgeCrawlPlan; @@ -25,8 +26,10 @@ import java.util.concurrent.TimeUnit; import static nu.marginalia.wmsa.edge.converting.processor.DocumentProcessor.isAcceptedContentType; public class RecipeDetectorTool { - private static final CrawledDomainReader reader = new CrawledDomainReader(); - private static final RecipeDetector detector = new RecipeDetector(); + private static final TextileCraftDetector textileCraftDetector = new TextileCraftDetector(); + private static final WoodworkingDetector woodworkingDetector = new WoodworkingDetector(); + private static final RecipeDetector recipeDetector = new RecipeDetector(); + private static final LanguageModels lm = WmsaHome.getLanguageModels(); private static final SentenceExtractor sentenceExtractor = new SentenceExtractor(lm); @@ -49,7 +52,12 @@ public class RecipeDetectorTool { } ForkJoinPool pool = new ForkJoinPool(16); - plan.forEachCrawledDomain(data -> pool.execute(() -> processDomain(data))); + + try (var iterable = plan.domainsIterable()) { + for (var domain : iterable) { + pool.execute(() -> processDomain(domain)); + } + } while (!pool.awaitQuiescence(1, TimeUnit.HOURS)); } @@ -74,9 +82,20 @@ public class RecipeDetectorTool { parsedDocument.getElementsByTag("nav").remove(); DocumentLanguageData dld = sentenceExtractor.extractSentences(parsedDocument); - double prob = 100*detector.recipeP(dld); + + double prob = 100*recipeDetector.testP(dld); if (prob > 50) { - System.out.printf("%3.2f\t%s\n", prob, doc.url); + System.out.printf("#%3.2f recipe\t%s\n%s\n", prob, parsedDocument.title(), doc.url); + } + + prob = 100*woodworkingDetector.testP(dld); + if 
(prob > 20) { + System.out.printf("#%3.2f woodworking\t%s\n%s\n", prob, parsedDocument.title(), doc.url); + } + + prob = 100*textileCraftDetector.testP(dld); + if (prob > 20) { + System.out.printf("#%3.2f textilecraft\t%s\n%s\n", prob, parsedDocument.title(), doc.url); } } } From ce09fce639b900d0b779748eac5b67518224528c Mon Sep 17 00:00:00 2001 From: vlofgren Date: Wed, 10 Aug 2022 17:03:58 +0200 Subject: [PATCH 2/4] Faster crawling --- .../wmsa/edge/converting/ConverterMain.java | 89 +++++------------- .../converting/LinkKeywordExtractorMain.java | 1 + .../edge/crawling/CrawledDomainReader.java | 33 ++++++- .../edge/crawling/CrawledDomainWriter.java | 35 ++++--- .../wmsa/edge/crawling/CrawlerMain.java | 94 +++++++------------ .../edge/crawling/model/CrawledDocument.java | 8 +- .../edge/crawling/model/CrawledDomain.java | 8 +- .../crawling/model/SerializableCrawlData.java | 5 + .../crawling/retreival/CrawlerRetreiver.java | 30 ++++-- .../edge/crawling/retreival/HttpFetcher.java | 3 - .../wmsa/edge/model/EdgeCrawlPlan.java | 23 ++++- 11 files changed, 179 insertions(+), 150 deletions(-) create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/model/SerializableCrawlData.java diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ConverterMain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ConverterMain.java index 973554d2..93814b46 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ConverterMain.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ConverterMain.java @@ -1,6 +1,5 @@ package nu.marginalia.wmsa.edge.converting; -import com.google.common.base.Strings; import com.google.gson.Gson; import com.google.inject.Guice; import com.google.inject.Inject; @@ -10,8 +9,6 @@ import nu.marginalia.wmsa.edge.converting.interpreter.Instruction; import nu.marginalia.wmsa.edge.converting.processor.DomainProcessor; import nu.marginalia.wmsa.edge.converting.processor.InstructionsCompiler; import nu.marginalia.wmsa.edge.crawling.CrawlPlanLoader; -import nu.marginalia.wmsa.edge.crawling.CrawledDomainReader; -import nu.marginalia.wmsa.edge.crawling.CrawlerSpecificationLoader; import nu.marginalia.wmsa.edge.crawling.WorkLog; import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain; import nu.marginalia.wmsa.edge.model.EdgeCrawlPlan; @@ -20,24 +17,13 @@ import org.slf4j.LoggerFactory; import java.io.IOException; import java.nio.file.Path; -import java.util.HashMap; import java.util.List; -import java.util.Map; public class ConverterMain { private final Logger logger = LoggerFactory.getLogger(getClass()); - private final DomainProcessor processor; - private final InstructionsCompiler compiler; - private final WorkLog processLog; private final CrawledInstructionWriter instructionWriter; - private final Gson gson; - private final CrawledDomainReader reader = new CrawledDomainReader(); - - private final Map domainToId = new HashMap<>(); - private final Map idToFileName = new HashMap<>(); - public static void main(String... 
args) throws IOException { if (args.length != 1) { @@ -60,65 +46,42 @@ public class ConverterMain { InstructionsCompiler compiler, Gson gson ) throws Exception { - this.processor = processor; - this.compiler = compiler; - this.gson = gson; instructionWriter = new CrawledInstructionWriter(plan.process.getDir(), gson); - logger.info("Loading input spec"); - CrawlerSpecificationLoader.readInputSpec(plan.getJobSpec(), - spec -> domainToId.put(spec.domain, spec.id)); - - logger.info("Replaying crawl log"); - WorkLog.readLog(plan.crawl.getLogFile(), - entry -> idToFileName.put(entry.id(), entry.path())); - logger.info("Starting pipe"); - processLog = new WorkLog(plan.process.getLogFile()); + try (WorkLog processLog = plan.createProcessWorkLog()) { + var pipe = new ParallelPipe("Crawler", 48, 4, 2) { - var pipe = new ParallelPipe("Crawler", 48, 4, 2) { - @Override - protected ProcessingInstructions onProcess(CrawledDomain domainData) { - var processed = processor.process(domainData); - return new ProcessingInstructions(domainData.id, compiler.compile(processed)); - } + @Override + protected ProcessingInstructions onProcess(CrawledDomain domainData) { + var processed = processor.process(domainData); + var compiled = compiler.compile(processed); - @Override - protected void onReceive(ProcessingInstructions processedInstructions) throws IOException { - var instructions = processedInstructions.instructions; - instructions.removeIf(Instruction::isNoOp); - - String where = instructionWriter.accept(processedInstructions.id, instructions); - processLog.setJobToFinished(processedInstructions.id, where, instructions.size()); - } - }; - - domainToId.forEach((domain, id) -> { - String fileName = idToFileName.get(id); - - if (Strings.isNullOrEmpty(fileName)) - return; - - Path dest = plan.getCrawledFilePath(fileName); - - logger.info("{} - {} - {}", domain, id, dest); - - if (!processLog.isJobFinished(id)) { - try { - var cd = reader.read(dest); - pipe.accept(cd); - - } catch (IOException e) { - logger.error("Failed to read {}", dest); + return new ProcessingInstructions(domainData.id, compiled); } - } - }); - pipe.join(); + @Override + protected void onReceive(ProcessingInstructions processedInstructions) throws IOException { + var instructions = processedInstructions.instructions; + instructions.removeIf(Instruction::isNoOp); - processLog.close(); + String where = instructionWriter.accept(processedInstructions.id, instructions); + processLog.setJobToFinished(processedInstructions.id, where, instructions.size()); + } + + }; + + plan.forEachCrawledDomain(domain -> { + if (!processLog.isJobFinished(domain.id)) { + logger.info("{} - {}", domain.domain, domain.id); + pipe.accept(domain); + } + }); + + pipe.join(); + } logger.info("Finished"); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LinkKeywordExtractorMain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LinkKeywordExtractorMain.java index 99c93740..12044fda 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LinkKeywordExtractorMain.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LinkKeywordExtractorMain.java @@ -61,6 +61,7 @@ public class LinkKeywordExtractorMain { .forEach(crawledUrls::add); logger.info("Loading input spec"); + CrawlerSpecificationLoader.readInputSpec(plan.getJobSpec(), spec -> { crawledDomains.add(spec.domain); }); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawledDomainReader.java 
b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawledDomainReader.java index 21b80993..81121af7 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawledDomainReader.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawledDomainReader.java @@ -3,12 +3,15 @@ package nu.marginalia.wmsa.edge.crawling; import com.github.luben.zstd.ZstdInputStream; import com.google.gson.Gson; import com.google.gson.GsonBuilder; +import nu.marginalia.wmsa.edge.crawling.model.CrawledDocument; import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.*; import java.nio.file.Path; +import java.util.ArrayList; +import java.util.List; public class CrawledDomainReader { private final Gson gson = new GsonBuilder().create(); @@ -18,13 +21,37 @@ public class CrawledDomainReader { } public CrawledDomain read(Path path) throws IOException { + List docs = new ArrayList<>(); + CrawledDomain domain = null; try (var br = new BufferedReader(new InputStreamReader(new ZstdInputStream(new BufferedInputStream(new FileInputStream(path.toFile())))))) { - return gson.fromJson(br, CrawledDomain.class); + String line; + while ((line = br.readLine()) != null) { + if (line.startsWith("//")) { + String nextLine = br.readLine(); + if (nextLine == null) break; + + if (line.equals(CrawledDomain.SERIAL_IDENTIFIER)) { + domain = gson.fromJson(nextLine, CrawledDomain.class); + } else if (line.equals(CrawledDocument.SERIAL_IDENTIFIER)) { + docs.add(gson.fromJson(nextLine, CrawledDocument.class)); + } + } + else if (line.charAt(0) == '{') { + domain = gson.fromJson(line, CrawledDomain.class); + } + } } + + if (domain == null) { + return null; + } + domain.doc.addAll(docs); + return domain; } + public CrawledDomain readRuntimeExcept(Path path) { - try (var br = new BufferedReader(new InputStreamReader(new ZstdInputStream(new BufferedInputStream(new FileInputStream(path.toFile())))))) { - return gson.fromJson(br, CrawledDomain.class); + try { + return read(path); } catch (Exception ex) { throw new RuntimeException(ex); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawledDomainWriter.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawledDomainWriter.java index ce0c7216..8866a55d 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawledDomainWriter.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawledDomainWriter.java @@ -3,40 +3,44 @@ package nu.marginalia.wmsa.edge.crawling; import com.github.luben.zstd.ZstdOutputStream; import com.google.gson.Gson; import com.google.gson.GsonBuilder; -import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain; +import nu.marginalia.wmsa.edge.crawling.model.SerializableCrawlData; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.BufferedOutputStream; -import java.io.FileOutputStream; import java.io.IOException; import java.io.OutputStreamWriter; +import java.io.Writer; import java.nio.file.Files; import java.nio.file.Path; -public class CrawledDomainWriter { +public class CrawledDomainWriter implements AutoCloseable { private final Path outputDir; private final Gson gson = new GsonBuilder().create(); private static final Logger logger = LoggerFactory.getLogger(CrawledDomainWriter.class); + private final Writer writer; + private final Path outputFile; - public CrawledDomainWriter(Path outputDir) { + public CrawledDomainWriter(Path outputDir, String name, 
String id) throws IOException { this.outputDir = outputDir; if (!Files.isDirectory(outputDir)) { throw new IllegalArgumentException("Output dir " + outputDir + " does not exist"); } + + outputFile = getOutputFile(id, name); + writer = new OutputStreamWriter(new ZstdOutputStream(new BufferedOutputStream(Files.newOutputStream(outputFile)))); } - public String accept(CrawledDomain domainData) throws IOException { - Path outputFile = getOutputFile(domainData.id, domainData.domain); + public Path getOutputFile() { + return outputFile; + } - try (var outputStream = new OutputStreamWriter(new ZstdOutputStream(new BufferedOutputStream(new FileOutputStream(outputFile.toFile()))))) { - logger.info("Writing {} - {}", domainData.id, domainData.domain); - - gson.toJson(domainData, outputStream); - } - - return outputFile.getFileName().toString(); + public void accept(SerializableCrawlData data) throws IOException { + writer.write(data.getSerialIdentifier()); + writer.write('\n'); + gson.toJson(data, writer); + writer.write('\n'); } private Path getOutputFile(String id, String name) throws IOException { @@ -63,4 +67,9 @@ public class CrawledDomainWriter { return nameSaneBuilder.toString(); } + + @Override + public void close() throws IOException { + writer.close(); + } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlerMain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlerMain.java index 7238dce0..6d23c4d1 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlerMain.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlerMain.java @@ -4,68 +4,38 @@ import com.google.gson.Gson; import com.google.gson.GsonBuilder; import nu.marginalia.wmsa.configuration.UserAgent; import nu.marginalia.wmsa.configuration.WmsaHome; -import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain; import nu.marginalia.wmsa.edge.crawling.model.CrawlingSpecification; import nu.marginalia.wmsa.edge.crawling.retreival.CrawlerRetreiver; import nu.marginalia.wmsa.edge.crawling.retreival.HttpFetcher; -import nu.marginalia.util.ParallelPipe; import nu.marginalia.wmsa.edge.model.EdgeCrawlPlan; import okhttp3.Dispatcher; import okhttp3.internal.Util; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.io.*; import java.nio.file.Path; -import java.util.concurrent.Semaphore; -import java.util.concurrent.SynchronousQueue; -import java.util.concurrent.ThreadPoolExecutor; -import java.util.concurrent.TimeUnit; +import java.util.concurrent.*; public class CrawlerMain implements AutoCloseable { public static Gson gson = new GsonBuilder().create(); private final Logger logger = LoggerFactory.getLogger(getClass()); - private final Path inputSpec; + private final EdgeCrawlPlan plan; + private final Path crawlDataDir; private final WorkLog workLog; - private final CrawledDomainWriter domainWriter; - private final int numberOfThreads; - private final ParallelPipe pipe; private final Dispatcher dispatcher = new Dispatcher(new ThreadPoolExecutor(0, Integer.MAX_VALUE, 5, TimeUnit.SECONDS, new SynchronousQueue<>(), Util.threadFactory("OkHttp Dispatcher", true))); private final UserAgent userAgent; public CrawlerMain(EdgeCrawlPlan plan) throws Exception { - this.inputSpec = plan.getJobSpec(); - this.numberOfThreads = 512; + this.plan = plan; this.userAgent = WmsaHome.getUserAgent(); - workLog = new WorkLog(plan.crawl.getLogFile()); - domainWriter = new CrawledDomainWriter(plan.crawl.getDir()); - - Semaphore sem = new 
Semaphore(250_000); - - pipe = new ParallelPipe<>("Crawler", numberOfThreads, 2, 1) { - @Override - protected CrawledDomain onProcess(CrawlingSpecification crawlingSpecification) throws Exception { - int toAcquire = crawlingSpecification.urls.size(); - sem.acquire(toAcquire); - try { - return fetchDomain(crawlingSpecification); - } - finally { - sem.release(toAcquire); - } - } - - @Override - protected void onReceive(CrawledDomain crawledDomain) throws IOException { - writeDomain(crawledDomain); - } - }; + workLog = plan.createCrawlWorkLog(); + crawlDataDir = plan.crawl.getDir(); } public static void main(String... args) throws Exception { @@ -84,48 +54,54 @@ public class CrawlerMain implements AutoCloseable { crawler.run(); } - // TODO (2022-05-24): Some thread isn't set to daemon mode, need to explicitly harakiri the process, find why? System.exit(0); } - private CrawledDomain fetchDomain(CrawlingSpecification specification) { + private void fetchDomain(CrawlingSpecification specification) { if (workLog.isJobFinished(specification.id)) - return null; + return; var fetcher = new HttpFetcher(userAgent.uaString(), dispatcher); - try { - var retreiver = new CrawlerRetreiver(fetcher, specification); + try (var writer = new CrawledDomainWriter(crawlDataDir, specification.domain, specification.id)) { + var retreiver = new CrawlerRetreiver(fetcher, specification, writer); - return retreiver.fetch(); + int size = retreiver.fetch(); + + workLog.setJobToFinished(specification.id, writer.getOutputFile().toString(), size); + + logger.info("Fetched {}", specification.domain); } catch (Exception e) { logger.error("Error fetching domain", e); - return null; } } - private void writeDomain(CrawledDomain crawledDomain) throws IOException { - String name = domainWriter.accept(crawledDomain); - workLog.setJobToFinished(crawledDomain.id, name, crawledDomain.size()); - } - public void run() throws InterruptedException { // First a validation run to ensure the file is all good to parse logger.info("Validating JSON"); - CrawlerSpecificationLoader.readInputSpec(inputSpec, spec -> {}); + plan.forEachCrawlingSpecification(unused -> {}); - logger.info("Starting pipe"); - CrawlerSpecificationLoader.readInputSpec(inputSpec, pipe::accept); + logger.info("Let's go"); - if (!AbortMonitor.getInstance().isAlive()) { - logger.info("Aborting"); - pipe.clearQueues(); - } - else { - logger.info("All jobs queued, waiting for pipe to finish"); - } - pipe.join(); + final int poolSize = 1024; + + BlockingQueue queue = new LinkedBlockingQueue<>(10); + ThreadPoolExecutor pool = new ThreadPoolExecutor(poolSize/128, poolSize, 5, TimeUnit.MINUTES, queue); // maybe need to set -Xss for JVM to deal with this? 
+ + AbortMonitor abortMonitor = AbortMonitor.getInstance(); + + plan.forEachCrawlingSpecification(spec -> { + if (abortMonitor.isAlive()) { + pool.execute(() -> fetchDomain(spec)); + } + }); + + logger.info("Awaiting termination"); + + pool.shutdown(); + + while (!pool.awaitTermination(1, TimeUnit.SECONDS)); logger.info("All finished"); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/model/CrawledDocument.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/model/CrawledDocument.java index 5d8d7e54..d43315a0 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/model/CrawledDocument.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/model/CrawledDocument.java @@ -3,7 +3,7 @@ package nu.marginalia.wmsa.edge.crawling.model; import lombok.Builder; @Builder -public class CrawledDocument { +public class CrawledDocument implements SerializableCrawlData { public String crawlId; public String url; @@ -22,4 +22,10 @@ public class CrawledDocument { public String canonicalUrl; public String redirectUrl; + + public static final String SERIAL_IDENTIFIER = "// DOCUMENT"; + @Override + public String getSerialIdentifier() { + return SERIAL_IDENTIFIER; + } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/model/CrawledDomain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/model/CrawledDomain.java index 1a0a3f46..a4c365d7 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/model/CrawledDomain.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/model/CrawledDomain.java @@ -7,7 +7,7 @@ import lombok.Data; import java.util.List; @AllArgsConstructor @Data @Builder -public class CrawledDomain { +public class CrawledDomain implements SerializableCrawlData { public String id; public String domain; @@ -24,4 +24,10 @@ public class CrawledDomain { if (doc == null) return 0; return doc.size(); } + + public static final String SERIAL_IDENTIFIER = "// DOMAIN"; + @Override + public String getSerialIdentifier() { + return SERIAL_IDENTIFIER; + } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/model/SerializableCrawlData.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/model/SerializableCrawlData.java new file mode 100644 index 00000000..015ea743 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/model/SerializableCrawlData.java @@ -0,0 +1,5 @@ +package nu.marginalia.wmsa.edge.crawling.model; + +public interface SerializableCrawlData { + String getSerialIdentifier(); +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/CrawlerRetreiver.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/CrawlerRetreiver.java index b9fb79c5..f8b8eab8 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/CrawlerRetreiver.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/CrawlerRetreiver.java @@ -4,6 +4,7 @@ import com.google.common.hash.HashFunction; import com.google.common.hash.Hashing; import lombok.SneakyThrows; import nu.marginalia.wmsa.edge.converting.processor.logic.LinkParser; +import nu.marginalia.wmsa.edge.crawling.CrawledDomainWriter; import nu.marginalia.wmsa.edge.crawling.blocklist.GeoIpBlocklist; import nu.marginalia.wmsa.edge.crawling.blocklist.IpBlockList; import nu.marginalia.wmsa.edge.crawling.blocklist.UrlBlocklist; @@ -14,6 +15,7 @@ import 
org.jsoup.nodes.Document; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.io.IOException; import java.net.InetAddress; import java.net.UnknownHostException; import java.time.LocalDateTime; @@ -29,6 +31,7 @@ public class CrawlerRetreiver { private final int depth; private final String id; private final String domain; + private final CrawledDomainWriter crawledDomainWriter; private static final LinkParser linkParser = new LinkParser(); private static final Logger logger = LoggerFactory.getLogger(CrawlerRetreiver.class); @@ -45,7 +48,7 @@ public class CrawlerRetreiver { } } - public CrawlerRetreiver(HttpFetcher fetcher, CrawlingSpecification specs) { + public CrawlerRetreiver(HttpFetcher fetcher, CrawlingSpecification specs, CrawledDomainWriter crawledDomainWriter) { this.fetcher = fetcher; visited = new HashSet<>((int)(specs.urls.size() * 1.5)); known = new HashSet<>(specs.urls.size() * 10); @@ -53,6 +56,7 @@ public class CrawlerRetreiver { depth = specs.crawlDepth; id = specs.id; domain = specs.domain; + this.crawledDomainWriter = crawledDomainWriter; specs.urls.stream() .map(this::parseUrl) @@ -78,12 +82,18 @@ public class CrawlerRetreiver { } } - public CrawledDomain fetch() { + public int fetch() throws IOException { logger.info("Fetching {}", domain); Optional probeResult = probeDomainForProblems(domain); - return probeResult.orElseGet(this::crawlDomain); + if (probeResult.isPresent()) { + crawledDomainWriter.accept(probeResult.get()); + return 1; + } + else { + return crawlDomain(); + } } private Optional probeDomainForProblems(String domain) { @@ -118,7 +128,7 @@ public class CrawlerRetreiver { return Optional.empty(); } - private CrawledDomain crawlDomain() { + private int crawlDomain() throws IOException { String ip = findIp(domain); assert !queue.isEmpty(); @@ -130,6 +140,8 @@ public class CrawlerRetreiver { CrawledDomain ret = new CrawledDomain(id, domain, null, CrawlerDomainStatus.OK.name(), null, ip, docs, null); int visitedCount = 0; + int fetchedCount = 0; + while (!queue.isEmpty() && visitedCount < depth) { var top = queue.removeFirst(); @@ -150,7 +162,11 @@ public class CrawlerRetreiver { logger.debug("Fetching {}", top); long startTime = System.currentTimeMillis(); - fetchUrl(top).ifPresent(ret.doc::add); + var doc = fetchUrl(top); + if (doc.isPresent()) { + fetchedCount++; + crawledDomainWriter.accept(doc.get()); + } long crawledTime = System.currentTimeMillis() - startTime; delay(crawlDelay, crawledTime); @@ -160,7 +176,9 @@ public class CrawlerRetreiver { ret.cookies = fetcher.getCookies(); - return ret; + crawledDomainWriter.accept(ret); + + return fetchedCount; } private Optional fetchUrl(EdgeUrl top) { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/HttpFetcher.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/HttpFetcher.java index 53180137..967e0203 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/HttpFetcher.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/HttpFetcher.java @@ -7,7 +7,6 @@ import crawlercommons.robots.SimpleRobotRulesParser; import lombok.AllArgsConstructor; import lombok.SneakyThrows; import lombok.ToString; -import nu.marginalia.wmsa.edge.converting.processor.logic.LinkParser; import nu.marginalia.wmsa.edge.crawling.model.CrawledDocument; import nu.marginalia.wmsa.edge.crawling.model.CrawlerDocumentStatus; import nu.marginalia.wmsa.edge.model.EdgeDomain; @@ -43,8 +42,6 @@ public class HttpFetcher 
{
 
     private static final SimpleRobotRulesParser robotsParser = new SimpleRobotRulesParser();
 
-    private final LinkParser linkParser = new LinkParser();
-
     public void setAllowAllContentTypes(boolean allowAllContentTypes) {
         this.allowAllContentTypes = allowAllContentTypes;
     }
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeCrawlPlan.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeCrawlPlan.java
index 926b9d74..28b4255b 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeCrawlPlan.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeCrawlPlan.java
@@ -5,9 +5,11 @@ import lombok.AllArgsConstructor;
 import lombok.NoArgsConstructor;
 import lombok.ToString;
 import nu.marginalia.wmsa.edge.crawling.CrawledDomainReader;
+import nu.marginalia.wmsa.edge.crawling.CrawlerSpecificationLoader;
 import nu.marginalia.wmsa.edge.crawling.WorkLog;
 import nu.marginalia.wmsa.edge.crawling.model.CrawlLogEntry;
 import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain;
+import nu.marginalia.wmsa.edge.crawling.model.CrawlingSpecification;
 import org.jetbrains.annotations.NotNull;
 
 import java.io.IOException;
@@ -51,7 +53,26 @@ public class EdgeCrawlPlan {
         return process.getDir().resolve(sp1).resolve(sp2).resolve(fileName);
     }
 
-    public void forEachCrawledDomain(Consumer<CrawledDomain> consumer) {
+    public WorkLog createCrawlWorkLog() throws IOException {
+        return new WorkLog(crawl.getLogFile());
+    }
+
+    public WorkLog createProcessWorkLog() throws IOException {
+        return new WorkLog(process.getLogFile());
+    }
+
+    public void forEachCrawlingSpecification(Consumer<CrawlingSpecification> consumer) {
+        CrawlerSpecificationLoader.readInputSpec(getJobSpec(), consumer);
+    }
+
+    public void forEachCrawlingLogEntry(Consumer<CrawlLogEntry> consumer) {
+        WorkLog.readLog(this.crawl.getLogFile(), consumer);
+    }
+    public void forEachProcessingLogEntry(Consumer<CrawlLogEntry> consumer) {
+        WorkLog.readLog(this.process.getLogFile(), consumer);
+    }
+
+    public void forEachCrawledDomain(Consumer<CrawledDomain> consumer) {
         final CrawledDomainReader reader = new CrawledDomainReader();
 
         try (Stream<CrawlLogEntry> entryStream = WorkLog.streamLog(crawl.getLogFile())) {

From ffde8c83051f2abf1945eb01f8d0c61d1afe19a9 Mon Sep 17 00:00:00 2001
From: vlofgren
Date: Wed, 10 Aug 2022 18:46:13 +0200
Subject: [PATCH 3/4] Faster crawling

---
 .../wmsa/edge/crawling/CrawlerMain.java       | 43 +++++++++++++------
 .../crawling/retreival/CrawlerRetreiver.java  |  6 +--
 2 files changed, 33 insertions(+), 16 deletions(-)

diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlerMain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlerMain.java
index 6d23c4d1..f6c9a5b6 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlerMain.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlerMain.java
@@ -29,11 +29,17 @@ public class CrawlerMain implements AutoCloseable {
             new SynchronousQueue<>(), Util.threadFactory("OkHttp Dispatcher", true)));
 
     private final UserAgent userAgent;
+    private final ThreadPoolExecutor pool;
+    final int poolSize = 256;
+    final int poolQueueSize = 32;
 
     public CrawlerMain(EdgeCrawlPlan plan) throws Exception {
         this.plan = plan;
         this.userAgent = WmsaHome.getUserAgent();
 
+        BlockingQueue<Runnable> queue = new LinkedBlockingQueue<>(poolQueueSize);
+        pool = new ThreadPoolExecutor(poolSize/128, poolSize, 5, TimeUnit.MINUTES, queue); // maybe need to set -Xss for JVM to deal with this?
+ workLog = plan.createCrawlWorkLog(); crawlDataDir = plan.crawl.getDir(); } @@ -84,31 +90,44 @@ public class CrawlerMain implements AutoCloseable { logger.info("Let's go"); - final int poolSize = 1024; - - BlockingQueue queue = new LinkedBlockingQueue<>(10); - ThreadPoolExecutor pool = new ThreadPoolExecutor(poolSize/128, poolSize, 5, TimeUnit.MINUTES, queue); // maybe need to set -Xss for JVM to deal with this? - AbortMonitor abortMonitor = AbortMonitor.getInstance(); + + Semaphore taskSem = new Semaphore(poolSize); + plan.forEachCrawlingSpecification(spec -> { if (abortMonitor.isAlive()) { - pool.execute(() -> fetchDomain(spec)); + try { + taskSem.acquire(); + } catch (InterruptedException e) { + throw new RuntimeException(e); + } + + pool.execute(() -> { + try { + fetchDomain(spec); + } + finally { + taskSem.release(); + } + }); } }); - logger.info("Awaiting termination"); - pool.shutdown(); - - while (!pool.awaitTermination(1, TimeUnit.SECONDS)); - - logger.info("All finished"); } public void close() throws Exception { + logger.info("Awaiting termination"); + pool.shutdown(); + + while (!pool.awaitTermination(1, TimeUnit.SECONDS)); + logger.info("All finished"); + workLog.close(); dispatcher.executorService().shutdownNow(); + + } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/CrawlerRetreiver.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/CrawlerRetreiver.java index f8b8eab8..802211ce 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/CrawlerRetreiver.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/CrawlerRetreiver.java @@ -83,8 +83,6 @@ public class CrawlerRetreiver { } public int fetch() throws IOException { - logger.info("Fetching {}", domain); - Optional probeResult = probeDomainForProblems(domain); if (probeResult.isPresent()) { @@ -272,10 +270,10 @@ public class CrawlerRetreiver { @SneakyThrows private void delay(long crawlDelay, long timeParsed) { if (crawlDelay >= 1) { - if (timeParsed/1000 > crawlDelay) + if (timeParsed > crawlDelay) return; - Thread.sleep(Math.min(1000*crawlDelay-timeParsed, 5000)); + Thread.sleep(Math.min(crawlDelay-timeParsed, 5000)); } else { if (timeParsed > DEFAULT_CRAWL_DELAY_MS) From ba9e0d98296ba0170ee17e576b4ecf8cf9598485 Mon Sep 17 00:00:00 2001 From: vlofgren Date: Wed, 10 Aug 2022 19:50:14 +0200 Subject: [PATCH 4/4] Add features to suggestions --- .../wmsa/edge/assistant/suggest/Suggestions.java | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/suggest/Suggestions.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/suggest/Suggestions.java index 8c920028..20531e2d 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/suggest/Suggestions.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/suggest/Suggestions.java @@ -4,6 +4,7 @@ import com.google.inject.Inject; import com.google.inject.name.Named; import nu.marginalia.wmsa.edge.assistant.dict.NGramDict; import nu.marginalia.wmsa.edge.assistant.dict.SpellChecker; +import nu.marginalia.wmsa.edge.converting.processor.logic.HtmlFeature; import org.apache.commons.collections4.trie.PatriciaTrie; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -49,6 +50,13 @@ public class Suggestions { .map(String::toLowerCase) .forEach(w -> ret.put(w, w)); + for (var feature : HtmlFeature.values()) { + String keyword = feature.getKeyword(); + + 
ret.put(keyword, keyword); + ret.put("-" + keyword, "-"+ keyword); + } + return ret; } catch (IOException ex) {