Topical detection (experimental), Adblock simulation (experimental)
parent 4c9e1fa686
commit 9c6e3b1772
@@ -0,0 +1,133 @@
package nu.marginalia.wmsa.edge.converting.processor.logic.topic;

import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.select.NodeFilter;

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;
import java.util.function.Predicate;
import java.util.regex.Pattern;

public class AdblockSimulator {

    List<String> idRules = new ArrayList<>();
    List<String> classRules = new ArrayList<>();
    List<Predicate<String>> scriptRules = new ArrayList<>();

    public AdblockSimulator(Path adsDefinition) throws IOException {
        try (var lineStream = Files.lines(adsDefinition)) {
            lineStream.skip(1).forEach(this::addRule);
        }
    }

    private void addRule(String s) {
        if (s.startsWith("##") && !s.contains(":")) {
            if (s.startsWith("###")) {
                idRules.add(s.substring(3));
            } else if (s.startsWith("##.")) {
                classRules.add(s.substring(3));
            }
        }
        else if (!s.startsWith("!") && !s.contains("#")) {
            scriptRules.add(toRegexMatcher(s));
        }
    }

    private Predicate<String> toRegexMatcher(String s) {

        System.out.println("<-" + s);

        s = s.replaceAll("\\?", "\\\\?");
        s = s.replaceAll("\\.", "\\\\.");
        s = s.replaceAll("\\$", "\\\\\\$");

        if (s.startsWith("||")) {
            s = s.replaceFirst("\\|\\|", "^http(s)?://");
        }

        s = s.replaceAll("\\|", "\\\\|");
        s = s.replaceAll("\\*", ".*");
        s = s.replaceAll("\\^", "[?/]");

        System.out.println("->" + s);
        return Pattern.compile(s).asPredicate();
    }

    class RuleVisitor implements NodeFilter {
        public boolean sawAds;
        Pattern spPattern = Pattern.compile("\\s");

        @Override
        public FilterResult head(Node node, int depth) {

            if (node.attributesSize() > 0 && node instanceof Element elem) { // instanceof is slow

                String id = elem.id();
                for (var rule : idRules) {
                    if (rule.equals(id)) {
                        sawAds = true;
                        return FilterResult.STOP;
                    }
                }

                String classes = elem.className();
                if (classes.isBlank()) return FilterResult.CONTINUE; // NB: class-less elements (including script tags without a class) stop here and skip the src check below

                if (classes.indexOf(' ') > 0) {
                    String[] classNames = spPattern.split(classes);
                    for (var rule : classRules) {

                        for (var className : classNames) {
                            if (className.equals(rule)) {
                                sawAds = true;
                                return FilterResult.STOP;
                            }
                        }
                    }
                }
                else { // tag only has one class
                    for (var rule : classRules) {
                        if (classes.equals(rule)) {
                            sawAds = true;
                            return FilterResult.STOP;
                        }
                    }
                }

                if ("script".equals(elem.tagName())) {
                    String src = elem.attr("src");

                    for (var rule : scriptRules) {
                        if (rule.test(src)) {
                            sawAds = true;
                            return FilterResult.STOP;
                        }
                    }
                }

                return FilterResult.CONTINUE;
            }
            return FilterResult.CONTINUE;
        }

        @Override
        public FilterResult tail(Node node, int depth) {
            return FilterResult.CONTINUE;
        }
    }

    public boolean hasAds(Document document) {

        RuleVisitor ruleVisitor = new RuleVisitor();
        document.filter(ruleVisitor);

        return ruleVisitor.sawAds;
    }

}
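The simulator reads an EasyList-format filter file (the first line, normally the [Adblock Plus] header, is skipped), keeps element-hiding rules of the form ###id and ##.class as exact string matches, and compiles the remaining non-comment URL rules into regex predicates that are tested against the src attribute of script tags. A minimal usage sketch follows; it is not part of this commit, and the filter file path and sample HTML are placeholders:

import nu.marginalia.wmsa.edge.converting.processor.logic.topic.AdblockSimulator;
import org.jsoup.Jsoup;
import java.nio.file.Path;

class AdblockSimulatorExample {
    public static void main(String[] args) throws Exception {
        // Any EasyList-format file will do; this path is a placeholder.
        var simulator = new AdblockSimulator(Path.of("easylist.txt"));

        var doc = Jsoup.parse("""
                <html><body>
                  <div id="ad-banner">buy stuff</div>
                  <script src="https://ads.example.com/tracker.js"></script>
                </body></html>""");

        // true if any id, class, or script-src rule matches
        System.out.println(simulator.hasAds(doc));
    }
}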
@@ -1,4 +1,4 @@
-package nu.marginalia.wmsa.edge.converting.processor.logic;
+package nu.marginalia.wmsa.edge.converting.processor.logic.topic;
 
 import ca.rmen.porterstemmer.PorterStemmer;
 import nu.marginalia.util.language.processing.model.DocumentLanguageData;
@@ -205,7 +205,7 @@ public class RecipeDetector {
 
     }
 
-    public double recipeP(DocumentLanguageData dld) {
+    public double testP(DocumentLanguageData dld) {
 
         Map<String, Double> values = new HashMap<>();
         int count = 0;
@@ -0,0 +1,158 @@
package nu.marginalia.wmsa.edge.converting.processor.logic.topic;

import ca.rmen.porterstemmer.PorterStemmer;
import nu.marginalia.util.language.processing.model.DocumentLanguageData;

import java.util.HashMap;
import java.util.Map;

import static java.lang.Math.max;
import static java.lang.Math.sqrt;

public class TextileCraftDetector {
    private static final int AVG_LENGTH = 1000;

    private final Map<String, Double> termValues = new HashMap<>();

    public TextileCraftDetector() {
        PorterStemmer ps = new PorterStemmer();

        termValues.put(ps.stemWord("shop"), -0.1);
        termValues.put(ps.stemWord("newsletter"), -0.1);
        termValues.put(ps.stemWord("cart"), -0.1);
        termValues.put(ps.stemWord("item"), -0.025);
        termValues.put(ps.stemWord("price"), -0.1);
        termValues.put(ps.stemWord("book"), -0.1);
        termValues.put(ps.stemWord("order"), -0.1);
        termValues.put(ps.stemWord("exhibition"), -0.1);

        termValues.put(ps.stemWord("knit"), 0.05);
        termValues.put(ps.stemWord("stitch"), 0.05);
        termValues.put(ps.stemWord("yarn"), 0.05);
        termValues.put(ps.stemWord("crochet"), 0.05);
        termValues.put(ps.stemWord("ravelry"), 0.15);

        termValues.put(ps.stemWord("stockinette"), 0.075);
        termValues.put(ps.stemWord("purl"), 0.075);
        termValues.put(ps.stemWord("ksp"), 0.075);
        termValues.put(ps.stemWord("kwise"), 0.075);
        termValues.put(ps.stemWord("k2tog"), 0.075);
        termValues.put(ps.stemWord("k1b"), 0.075);
        termValues.put(ps.stemWord("psso"), 0.075);
        termValues.put(ps.stemWord("p2sso"), 0.075);
        termValues.put(ps.stemWord("pwise"), 0.075);
        termValues.put(ps.stemWord("yrn"), 0.075);
        termValues.put(ps.stemWord("yon"), 0.075);
        termValues.put(ps.stemWord("entrelac"), 0.075);
        termValues.put(ps.stemWord("thrum"), 0.075);
        termValues.put(ps.stemWord("bobbin"), 0.025);

        termValues.put(ps.stemWord("boucle"), 0.075);
        termValues.put(ps.stemWord("lopi"), 0.075);
        termValues.put(ps.stemWord("eyelash"), 0.01);
        termValues.put(ps.stemWord("variegated"), 0.075);

        termValues.put(ps.stemWord("serge"), 0.04);
        termValues.put(ps.stemWord("selvage"), 0.075);
        termValues.put(ps.stemWord("topstitch"), 0.075);

        termValues.put(ps.stemWord("gauge"), 0.01);
        termValues.put(ps.stemWord("design"), 0.01);
        termValues.put(ps.stemWord("pattern"), 0.01);
        termValues.put(ps.stemWord("layer"), 0.01);
        termValues.put(ps.stemWord("color"), 0.01);
        termValues.put(ps.stemWord("colour"), 0.01);
        termValues.put(ps.stemWord("chart"), 0.01);
        termValues.put(ps.stemWord("grid"), 0.01);
        termValues.put(ps.stemWord("wool"), 0.01);
        termValues.put(ps.stemWord("acrylic"), 0.01);
        termValues.put(ps.stemWord("loose"), 0.01);
        termValues.put(ps.stemWord("loop"), 0.01);
        termValues.put(ps.stemWord("needle"), 0.01);
        termValues.put(ps.stemWord("row"), 0.01);
        termValues.put(ps.stemWord("circular"), 0.01);
        termValues.put(ps.stemWord("sew"), 0.01);
        termValues.put(ps.stemWord("size"), 0.01);
        termValues.put(ps.stemWord("repeat"), 0.01);
        termValues.put(ps.stemWord("repetition"), 0.01);
        termValues.put(ps.stemWord("basketweave"), 0.01);
        termValues.put(ps.stemWord("weave"), 0.01);
        termValues.put(ps.stemWord("loom"), 0.01);
        termValues.put(ps.stemWord("warp"), 0.01);
        termValues.put(ps.stemWord("weft"), 0.01);
        termValues.put(ps.stemWord("shuttle"), 0.01);
        termValues.put(ps.stemWord("brioche"), 0.01);
        termValues.put(ps.stemWord("spool"), 0.01);
        termValues.put(ps.stemWord("hem"), 0.01);
        termValues.put(ps.stemWord("bodice"), 0.01);
        termValues.put(ps.stemWord("seam"), 0.01);
        termValues.put(ps.stemWord("allowance"), 0.01);
        termValues.put(ps.stemWord("crinoline"), 0.01);
        termValues.put(ps.stemWord("petticoat"), 0.01);
        termValues.put(ps.stemWord("armscye"), 0.01);
        termValues.put(ps.stemWord("baste"), 0.01);
        termValues.put(ps.stemWord("cord"), 0.01);
        termValues.put(ps.stemWord("darning"), 0.01);
        termValues.put(ps.stemWord("draping"), 0.01);
        termValues.put(ps.stemWord("embroider"), 0.01);
        termValues.put(ps.stemWord("eyelet"), 0.01);
        termValues.put(ps.stemWord("godet"), 0.01);
        termValues.put(ps.stemWord("gore"), 0.01);
        termValues.put(ps.stemWord("grain"), 0.01);
        termValues.put(ps.stemWord("jersey"), 0.01);
        termValues.put(ps.stemWord("lining"), 0.01);
        termValues.put(ps.stemWord("muslin"), 0.01);
        termValues.put(ps.stemWord("needlework"), 0.01);
        termValues.put(ps.stemWord("pleat"), 0.01);
        termValues.put(ps.stemWord("quilt"), 0.01);
        termValues.put(ps.stemWord("silk"), 0.01);

        termValues.put(ps.stemWord("sloper"), 0.01);
        termValues.put(ps.stemWord("surplice"), 0.01);
        termValues.put(ps.stemWord("thread"), 0.01);
        termValues.put(ps.stemWord("twill"), 0.01);

        termValues.put(ps.stemWord("ch"), 0.01);
        termValues.put(ps.stemWord("sp"), 0.01);
        termValues.put(ps.stemWord("sl"), 0.01);
        termValues.put(ps.stemWord("sc"), 0.01);
        termValues.put(ps.stemWord("ss"), 0.01);
        termValues.put(ps.stemWord("hdc"), 0.01);
        termValues.put(ps.stemWord("turn"), 0.01);
        termValues.put(ps.stemWord("skip"), 0.01);
        termValues.put(ps.stemWord("round"), 0.01);
        termValues.put(ps.stemWord("ring"), 0.01);

        termValues.put(ps.stemWord("sequin"), 0.01);
        termValues.put(ps.stemWord("bobble"), 0.01);
        termValues.put(ps.stemWord("puff"), 0.01);
        termValues.put(ps.stemWord("v-stitch"), 0.01);
    }

    public double testP(DocumentLanguageData dld) {

        Map<String, Double> values = new HashMap<>();
        int count = 0;
        for (var sentence : dld.sentences) {

            for (var word : sentence) {
                count++;

                final String stemmed = word.stemmed();
                final Double value = termValues.get(stemmed);

                if (value != null) {
                    values.merge(stemmed, value, (a,b) -> 0.5*a + b);
                }
            }

        }

        if (count == 0) return 0.;

        double lengthPenalty = sqrt(AVG_LENGTH)/sqrt(max(AVG_LENGTH, count));

        return values.values().stream().mapToDouble(Double::valueOf).sum() * lengthPenalty;
    }

}
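The scoring in testP works the same way in all of the topic detectors: each occurrence of a weighted term is folded into a per-term accumulator with (a,b) -> 0.5*a + b, so repeats contribute with diminishing returns rather than linearly, and the summed weights are scaled by sqrt(AVG_LENGTH)/sqrt(max(AVG_LENGTH, count)) so long documents are not favored simply for containing more words. A standalone sketch of that arithmetic, not part of the commit, with made-up numbers:

import java.util.HashMap;
import java.util.Map;

class DetectorScoringSketch {
    public static void main(String[] args) {
        Map<String, Double> values = new HashMap<>();
        double w = 0.05; // hypothetical term weight, e.g. the one for "knit"

        // Three occurrences of the same term, merged the way testP merges them.
        for (int i = 0; i < 3; i++) {
            values.merge("knit", w, (a, b) -> 0.5 * a + b);
        }
        // 0.05, then 0.5*0.05 + 0.05 = 0.075, then 0.5*0.075 + 0.05 = 0.0875
        System.out.println(values.get("knit")); // ~0.0875

        // A 4000-word page gets half the weight of a 1000-word page.
        int count = 4000;
        double lengthPenalty = Math.sqrt(1000) / Math.sqrt(Math.max(1000, count));
        System.out.println(lengthPenalty); // ~0.5
    }
}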
@@ -0,0 +1,134 @@
package nu.marginalia.wmsa.edge.converting.processor.logic.topic;

import ca.rmen.porterstemmer.PorterStemmer;
import nu.marginalia.util.language.processing.model.DocumentLanguageData;

import java.util.HashMap;
import java.util.Map;

import static java.lang.Math.max;
import static java.lang.Math.sqrt;

public class WoodworkingDetector {
    private static final int AVG_LENGTH = 1000;

    private final Map<String, Double> termValues = new HashMap<>();

    public WoodworkingDetector() {
        PorterStemmer ps = new PorterStemmer();

        termValues.put(ps.stemWord("shop"), -0.1);
        termValues.put(ps.stemWord("newsletter"), -0.1);
        termValues.put(ps.stemWord("cart"), -0.1);
        termValues.put(ps.stemWord("item"), -0.025);
        termValues.put(ps.stemWord("price"), -0.1);
        termValues.put(ps.stemWord("book"), -0.1);
        termValues.put(ps.stemWord("order"), -0.1);
        termValues.put(ps.stemWord("exhibition"), -0.1);

        // woodworking and joinery
        termValues.put(ps.stemWord("apse"), 0.01);
        termValues.put(ps.stemWord("baluster"), 0.01);
        termValues.put(ps.stemWord("beam"), 0.01);
        termValues.put(ps.stemWord("cornice"), 0.01);
        termValues.put(ps.stemWord("drill"), 0.01);
        termValues.put(ps.stemWord("nail"), 0.01);
        termValues.put(ps.stemWord("saw"), 0.01);
        termValues.put(ps.stemWord("hacksaw"), 0.01);
        termValues.put(ps.stemWord("bandsaw"), 0.01);
        termValues.put(ps.stemWord("whipsaw"), 0.01);
        termValues.put(ps.stemWord("gimlet"), 0.01);
        termValues.put(ps.stemWord("clamp"), 0.01);
        termValues.put(ps.stemWord("glue"), 0.01);
        termValues.put(ps.stemWord("cut"), 0.01);
        termValues.put(ps.stemWord("plane"), 0.01);
        termValues.put(ps.stemWord("sand"), 0.01);
        termValues.put(ps.stemWord("bevel"), 0.01);
        termValues.put(ps.stemWord("chamfer"), 0.01);
        termValues.put(ps.stemWord("dado"), 0.075);
        termValues.put(ps.stemWord("dowel"), 0.05);
        termValues.put(ps.stemWord("dovetail"), 0.05);
        termValues.put(ps.stemWord("joint"), 0.01);
        termValues.put(ps.stemWord("level"), 0.01);
        termValues.put(ps.stemWord("edge"), 0.01);
        termValues.put(ps.stemWord("face"), 0.01);
        termValues.put(ps.stemWord("fibreboard"), 0.01);
        termValues.put(ps.stemWord("fiberboard"), 0.01);
        termValues.put(ps.stemWord("battens"), 0.01);
        termValues.put(ps.stemWord("furring"), 0.01);
        termValues.put(ps.stemWord("glulam"), 0.025);
        termValues.put(ps.stemWord("hardboard"), 0.025);
        termValues.put(ps.stemWord("hardwood"), 0.01);
        termValues.put(ps.stemWord("jamb"), 0.015);
        termValues.put(ps.stemWord("kerf"), 0.025);
        termValues.put(ps.stemWord("lvl"), 0.025);
        termValues.put(ps.stemWord("laminated"), 0.01);
        termValues.put(ps.stemWord("lignin"), 0.01);
        termValues.put(ps.stemWord("mitre"), 0.01);
        termValues.put(ps.stemWord("mortise"), 0.015);
        termValues.put(ps.stemWord("mullion"), 0.01);
        termValues.put(ps.stemWord("newel"), 0.01);
        termValues.put(ps.stemWord("nogging"), 0.01);
        termValues.put(ps.stemWord("ogee"), 0.01);
        termValues.put(ps.stemWord("ogive"), 0.01);
        termValues.put(ps.stemWord("ovolo"), 0.01);
        termValues.put(ps.stemWord("drawknife"), 0.01);
        termValues.put(ps.stemWord("plywood"), 0.01);
        termValues.put(ps.stemWord("purlin"), 0.01);
        termValues.put(ps.stemWord("riser"), 0.01);
        termValues.put(ps.stemWord("sapwood"), 0.01);
        termValues.put(ps.stemWord("shingle"), 0.01);
        termValues.put(ps.stemWord("softwood"), 0.01);
        termValues.put(ps.stemWord("sapwood"), 0.01);
        termValues.put(ps.stemWord("stave"), 0.01);
        termValues.put(ps.stemWord("stopper"), 0.01);
        termValues.put(ps.stemWord("stud"), 0.01); // beep beep beep, huh, the stud detector seems to work just fine :D
        termValues.put(ps.stemWord("transom"), 0.01);
        termValues.put(ps.stemWord("v-joint"), 0.015);
        termValues.put(ps.stemWord("veneer"), 0.01);
        termValues.put(ps.stemWord("quartersaw"), 0.015);
        termValues.put(ps.stemWord("screw"), 0.01);
        termValues.put(ps.stemWord("woodturning"), 0.01);

        termValues.put(ps.stemWord("pine"), 0.005);
        termValues.put(ps.stemWord("balsa"), 0.01);
        termValues.put(ps.stemWord("poplar"), 0.005);

        termValues.put(ps.stemWord("nut"), 0.01);
        termValues.put(ps.stemWord("bolt"), 0.01);
        termValues.put(ps.stemWord("tack"), 0.01);
        termValues.put(ps.stemWord("hinge"), 0.01);
        termValues.put(ps.stemWord("brass"), 0.01);
        termValues.put(ps.stemWord("fitting"), 0.01);

        termValues.put(ps.stemWord("diy"), 0.015);
        termValues.put(ps.stemWord("dozuki"), 0.01);
    }

    public double testP(DocumentLanguageData dld) {

        Map<String, Double> values = new HashMap<>();
        int count = 0;
        for (var sentence : dld.sentences) {

            for (var word : sentence) {
                count++;

                final String stemmed = word.stemmed();
                final Double value = termValues.get(stemmed);

                if (value != null) {
                    values.merge(stemmed, value, (a,b) -> 0.5*a + b);
                }
            }

        }

        if (count == 0) return 0.;

        double lengthPenalty = sqrt(AVG_LENGTH)/sqrt(max(AVG_LENGTH, count));

        return values.values().stream().mapToDouble(Double::valueOf).sum() * lengthPenalty;
    }

}
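WoodworkingDetector and TextileCraftDetector follow the same recipe as RecipeDetector above: a bag of stemmed term weights, with negative weights on storefront vocabulary ("shop", "cart", "price") so pages that merely sell supplies score lower than pages about the craft itself. Since all three now expose testP(DocumentLanguageData), they can be applied uniformly; a small sketch of that, not part of the commit, assuming a DocumentLanguageData instance is already at hand:

import java.util.Map;
import java.util.function.ToDoubleFunction;

import nu.marginalia.util.language.processing.model.DocumentLanguageData;
import nu.marginalia.wmsa.edge.converting.processor.logic.topic.*;

class TopicScoringSketch {
    // Runs every detector over the same document and prints the scores as percentages.
    static void scoreAll(DocumentLanguageData dld) {
        Map<String, ToDoubleFunction<DocumentLanguageData>> detectors = Map.of(
                "recipe", new RecipeDetector()::testP,
                "woodworking", new WoodworkingDetector()::testP,
                "textilecraft", new TextileCraftDetector()::testP);

        detectors.forEach((name, detector) ->
                System.out.printf("%-12s %5.2f%%%n", name, 100 * detector.applyAsDouble(dld)));
    }
}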
@@ -22,5 +22,12 @@ public class CrawledDomainReader {
             return gson.fromJson(br, CrawledDomain.class);
         }
     }
+
+    public CrawledDomain readRuntimeExcept(Path path) {
+        try (var br = new BufferedReader(new InputStreamReader(new ZstdInputStream(new BufferedInputStream(new FileInputStream(path.toFile())))))) {
+            return gson.fromJson(br, CrawledDomain.class);
+        }
+        catch (Exception ex) {
+            throw new RuntimeException(ex);
+        }
+    }
 }
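The new readRuntimeExcept duplicates the existing read method but wraps the checked exception in a RuntimeException, which is what allows it to be used as a method reference inside the stream pipelines added to EdgeCrawlPlan below. The pattern in isolation, as a hedged sketch rather than the project's code:

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.stream.Stream;

class UncheckedWrapperSketch {
    // A method with a checked exception cannot be passed to Stream.map directly...
    static String readChecked(Path p) throws IOException {
        return Files.readString(p);
    }

    // ...but a wrapper that rethrows as RuntimeException can.
    static String readRuntimeExcept(Path p) {
        try {
            return readChecked(p);
        } catch (IOException ex) {
            throw new RuntimeException(ex);
        }
    }

    static void printAll(Stream<Path> paths) {
        paths.map(UncheckedWrapperSketch::readRuntimeExcept)
             .forEach(System.out::println);
    }
}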
@@ -1,9 +1,11 @@
 package nu.marginalia.wmsa.edge.crawling;
 
+import com.google.errorprone.annotations.MustBeClosed;
 import nu.marginalia.wmsa.edge.crawling.model.CrawlLogEntry;
 import org.apache.logging.log4j.util.Strings;
 
 import java.io.*;
 import java.io.FileOutputStream;
 import java.io.IOException;
 import java.nio.charset.StandardCharsets;
 import java.nio.file.Files;
 import java.nio.file.Path;
@@ -12,6 +14,7 @@ import java.util.HashSet;
 import java.util.Set;
 import java.util.function.Consumer;
 import java.util.regex.Pattern;
+import java.util.stream.Stream;
 
 public class WorkLog implements AutoCloseable {
     private final Set<String> finishedJobs = new HashSet<>();
@@ -29,15 +32,21 @@ public class WorkLog implements AutoCloseable {
             return;
         }
 
-        try (var lines = Files.lines(logFile)) {
-            lines.filter(WorkLog::isJobId).map(line -> {
-                String[] parts = line.split("\\s+");
-                return new CrawlLogEntry(parts[0], parts[1], parts[2], Integer.parseInt(parts[3]));
-            }).forEach(entryConsumer);
+        try (var entries = streamLog(logFile)) {
+            entries.forEach(entryConsumer);
         } catch (IOException e) {
             e.printStackTrace();
         }
     }
 
+    @MustBeClosed
+    public static Stream<CrawlLogEntry> streamLog(Path logFile) throws IOException {
+        return Files.lines(logFile).filter(WorkLog::isJobId).map(line -> {
+            String[] parts = line.split("\\s+");
+            return new CrawlLogEntry(parts[0], parts[1], parts[2], Integer.parseInt(parts[3]));
+        });
+    }
+
     private void loadLog(Path logFile) throws IOException {
         if (!Files.exists(logFile)) {
             return;
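readLog now delegates to the new streamLog, which exposes the parsed crawl log as a Stream<CrawlLogEntry>. Because Files.lines keeps the underlying file open until the stream is closed, streamLog is annotated @MustBeClosed (error-prone flags callers that do not close it), and it is meant to be consumed inside try-with-resources. A minimal caller sketch, not part of the commit; the log path is a placeholder:

import java.io.IOException;
import java.nio.file.Path;

import nu.marginalia.wmsa.edge.crawling.WorkLog;

class WorkLogStreamSketch {
    public static void main(String[] args) throws IOException {
        // try-with-resources satisfies @MustBeClosed and releases the file handle
        try (var entries = WorkLog.streamLog(Path.of("crawl/crawler.log"))) {
            entries.limit(10).forEach(entry -> System.out.println(entry.path()));
        }
    }
}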
@@ -1,15 +1,20 @@
 package nu.marginalia.wmsa.edge.model;
 
+import com.google.errorprone.annotations.MustBeClosed;
 import lombok.AllArgsConstructor;
 import lombok.NoArgsConstructor;
 import lombok.ToString;
 import nu.marginalia.wmsa.edge.crawling.CrawledDomainReader;
 import nu.marginalia.wmsa.edge.crawling.WorkLog;
+import nu.marginalia.wmsa.edge.crawling.model.CrawlLogEntry;
 import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain;
+import org.jetbrains.annotations.NotNull;
 
 import java.io.IOException;
 import java.nio.file.Path;
+import java.util.Iterator;
 import java.util.function.Consumer;
+import java.util.stream.Stream;
 
 @AllArgsConstructor @NoArgsConstructor @ToString
 public class EdgeCrawlPlan {
@@ -49,12 +54,44 @@ public class EdgeCrawlPlan {
     public void forEachCrawledDomain(Consumer<CrawledDomain> consumer) {
         final CrawledDomainReader reader = new CrawledDomainReader();
 
-        WorkLog.readLog(crawl.getLogFile(), entry -> {
-            try {
-                consumer.accept(reader.read(getCrawledFilePath(entry.path())));
-            } catch (IOException e) {
-                throw new RuntimeException(e);
+        try (Stream<CrawlLogEntry> entryStream = WorkLog.streamLog(crawl.getLogFile())) {
+            entryStream
+                    .map(CrawlLogEntry::path)
+                    .map(this::getCrawledFilePath)
+                    .map(reader::readRuntimeExcept)
+                    .forEach(consumer);
         }
+        catch (IOException ex) {
+            throw new RuntimeException(ex);
+        }
     }
 
+    @MustBeClosed
+    public DomainsIterable domainsIterable() throws IOException {
+        return new DomainsIterable();
+    }
+
+    public class DomainsIterable implements Iterable<CrawledDomain>, AutoCloseable {
+        private final Stream<CrawledDomain> stream;
+
+        DomainsIterable() throws IOException {
+            final CrawledDomainReader reader = new CrawledDomainReader();
+
+            stream = WorkLog.streamLog(crawl.getLogFile())
+                    .map(CrawlLogEntry::path)
+                    .map(EdgeCrawlPlan.this::getCrawledFilePath)
+                    .map(reader::readRuntimeExcept);
+        }
+
+        @Override
+        public void close() {
+            stream.close();
+        }
+
+        @NotNull
+        @Override
+        public Iterator<CrawledDomain> iterator() {
+            return stream.iterator();
+        }
-    });
+    }
 }
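domainsIterable() lets callers walk the crawl with an ordinary for-each loop while the domains are still read lazily, one at a time, from the compressed crawl files; since the backing Stream keeps the work log open, DomainsIterable is AutoCloseable and belongs in try-with-resources, as AdblockTesterTool below demonstrates. Another small sketch, not part of the commit, that counts crawled documents without holding the whole crawl in memory:

import java.io.IOException;

import nu.marginalia.wmsa.edge.model.EdgeCrawlPlan;

class CrawlPlanIterationSketch {
    static long countDocuments(EdgeCrawlPlan plan) throws IOException {
        long total = 0;
        try (var domains = plan.domainsIterable()) {
            for (var domain : domains) {
                if (domain.doc == null) continue;
                for (var doc : domain.doc) {
                    total++;
                }
            }
        }
        return total;
    }
}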
@@ -0,0 +1,58 @@
package nu.marginalia.wmsa.edge.tools;

import nu.marginalia.wmsa.edge.converting.processor.logic.topic.AdblockSimulator;
import nu.marginalia.wmsa.edge.crawling.CrawlPlanLoader;
import nu.marginalia.wmsa.edge.crawling.model.CrawledDocument;
import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain;
import nu.marginalia.wmsa.edge.model.EdgeCrawlPlan;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import java.io.IOException;
import java.nio.file.Path;

import static nu.marginalia.wmsa.edge.converting.processor.DocumentProcessor.isAcceptedContentType;

public class AdblockTesterTool {

    static AdblockSimulator simulator;

    static {
        try {
            simulator = new AdblockSimulator(Path.of("/home/vlofgren/easylist.txt"));
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    public static void main(String... args) throws IOException {
        EdgeCrawlPlan plan = new CrawlPlanLoader().load(Path.of(args[0]));

        try (var iterable = plan.domainsIterable()) {
            for (var domain : iterable) {
                processDomain(domain);
            }
        }
    }

    private static void processDomain(CrawledDomain domain) {
        if (domain.doc == null) return;
        for (var doc : domain.doc) {
            if (isAcceptedContentType(doc) && "OK".equals(doc.crawlerStatus)) {
                processDocument(doc);
            }
        }
    }

    private static void processDocument(CrawledDocument doc) {
        Document parsedDocument = Jsoup.parse(doc.documentBody);

        if (simulator.hasAds(parsedDocument)) {
            System.out.println(doc.url);
        }
    }
}
@@ -5,9 +5,10 @@ import nu.marginalia.util.language.processing.SentenceExtractor;
 import nu.marginalia.util.language.processing.model.DocumentLanguageData;
 import nu.marginalia.wmsa.configuration.WmsaHome;
 import nu.marginalia.wmsa.configuration.module.DatabaseModule;
-import nu.marginalia.wmsa.edge.converting.processor.logic.RecipeDetector;
+import nu.marginalia.wmsa.edge.converting.processor.logic.topic.RecipeDetector;
+import nu.marginalia.wmsa.edge.converting.processor.logic.topic.TextileCraftDetector;
+import nu.marginalia.wmsa.edge.converting.processor.logic.topic.WoodworkingDetector;
 import nu.marginalia.wmsa.edge.crawling.CrawlPlanLoader;
 import nu.marginalia.wmsa.edge.crawling.CrawledDomainReader;
 import nu.marginalia.wmsa.edge.crawling.model.CrawledDocument;
 import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain;
 import nu.marginalia.wmsa.edge.model.EdgeCrawlPlan;
@@ -25,8 +26,10 @@ import java.util.concurrent.TimeUnit;
 import static nu.marginalia.wmsa.edge.converting.processor.DocumentProcessor.isAcceptedContentType;
 
 public class RecipeDetectorTool {
     private static final CrawledDomainReader reader = new CrawledDomainReader();
-    private static final RecipeDetector detector = new RecipeDetector();
+    private static final TextileCraftDetector textileCraftDetector = new TextileCraftDetector();
+    private static final WoodworkingDetector woodworkingDetector = new WoodworkingDetector();
+    private static final RecipeDetector recipeDetector = new RecipeDetector();
 
     private static final LanguageModels lm = WmsaHome.getLanguageModels();
     private static final SentenceExtractor sentenceExtractor = new SentenceExtractor(lm);
 
@@ -49,7 +52,12 @@ public class RecipeDetectorTool {
         }
 
         ForkJoinPool pool = new ForkJoinPool(16);
-        plan.forEachCrawledDomain(data -> pool.execute(() -> processDomain(data)));
+
+        try (var iterable = plan.domainsIterable()) {
+            for (var domain : iterable) {
+                pool.execute(() -> processDomain(domain));
+            }
+        }
 
         while (!pool.awaitQuiescence(1, TimeUnit.HOURS));
     }
@@ -74,9 +82,20 @@ public class RecipeDetectorTool {
         parsedDocument.getElementsByTag("nav").remove();
 
         DocumentLanguageData dld = sentenceExtractor.extractSentences(parsedDocument);
-        double prob = 100*detector.recipeP(dld);
 
+        double prob = 100*recipeDetector.testP(dld);
         if (prob > 50) {
-            System.out.printf("%3.2f\t%s\n", prob, doc.url);
+            System.out.printf("#%3.2f recipe\t%s\n%s\n", prob, parsedDocument.title(), doc.url);
         }
 
+        prob = 100*woodworkingDetector.testP(dld);
+        if (prob > 20) {
+            System.out.printf("#%3.2f woodworking\t%s\n%s\n", prob, parsedDocument.title(), doc.url);
+        }
+
+        prob = 100*textileCraftDetector.testP(dld);
+        if (prob > 20) {
+            System.out.printf("#%3.2f textilecraft\t%s\n%s\n", prob, parsedDocument.title(), doc.url);
+        }
     }
 }