Topical detection (experimental),

Adblock simulation (experimental)
This commit is contained in:
vlofgren 2022-08-10 15:04:25 +02:00
parent 4c9e1fa686
commit 9c6e3b1772
9 changed files with 578 additions and 23 deletions

View File

@ -0,0 +1,133 @@
package nu.marginalia.wmsa.edge.converting.processor.logic.topic;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.select.NodeFilter;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;
import java.util.function.Predicate;
import java.util.regex.Pattern;
/**
 * Rough simulation of an ad blocker: loads rules from an Adblock-style filter
 * list and reports whether a parsed document contains an element those rules
 * would match.
 *
 * <p>Three kinds of rules are understood:
 * <ul>
 *   <li>{@code ###id} - matches an element by its id attribute</li>
 *   <li>{@code ##.name} - matches an element carrying that class name</li>
 *   <li>any other line that is not a comment ({@code !}), not an exception
 *       ({@code @@}) and contains no {@code #} - treated as a URL pattern and
 *       tested against the {@code src} attribute of script elements</li>
 * </ul>
 * Element rules containing a {@code :} are ignored.
 */
public class AdblockSimulator {
    /** Splits a class attribute on any run of whitespace, not just single spaces. */
    private static final Pattern WHITESPACE = Pattern.compile("\\s+");

    /** Element ids taken from {@code ###id} rules. */
    List<String> idRules = new ArrayList<>();
    /** Class names taken from {@code ##.name} rules. */
    List<String> classRules = new ArrayList<>();
    /** URL matchers applied to the src attribute of script elements. */
    List<Predicate<String>> scriptRules = new ArrayList<>();

    /**
     * Loads the rules from a filter list file.
     *
     * @param adsDefinition path of the filter list; its first line is skipped
     * @throws IOException if the file cannot be read
     */
    public AdblockSimulator(Path adsDefinition) throws IOException {
        try (var lineStream = Files.lines(adsDefinition)) {
            // The first line is skipped unconditionally (presumably the list's header line).
            lineStream.skip(1).forEach(this::addRule);
        }
    }

    /** Classifies one line of the filter list and stores it in the matching rule list. */
    private void addRule(String s) {
        // A blank line would otherwise compile to an empty pattern, which matches every URL.
        if (s.isBlank()) {
            return;
        }

        if (s.startsWith("##") && !s.contains(":")) {
            if (s.length() <= 3) {
                // Nothing after the prefix; an empty id or class name is not a usable rule.
                return;
            }
            if (s.startsWith("###")) {
                idRules.add(s.substring(3));
            } else if (s.startsWith("##.")) {
                classRules.add(s.substring(3));
            }
        }
        // "@@" marks an exception (allow) rule, which must not be treated as a blocking rule.
        else if (!s.startsWith("!") && !s.startsWith("@@") && !s.contains("#")) {
            scriptRules.add(toRegexMatcher(s));
        }
    }

    /**
     * Converts a URL rule into a predicate that searches a URL for the pattern.
     *
     * <p>A leading {@code ||} anchors the pattern to the start of an http(s) URL,
     * {@code *} matches any run of characters and {@code ^} matches a {@code ?}
     * or {@code /} separator. Every other character is matched literally.
     *
     * <p>NOTE(review): filter options following {@code $} are not interpreted;
     * they are matched as literal text, as before. Rules carrying options will
     * therefore rarely match a real URL. Confirm whether they should be stripped.
     */
    private Predicate<String> toRegexMatcher(String s) {
        StringBuilder regex = new StringBuilder();
        String body = s;

        if (body.startsWith("||")) {
            // Emit the anchor directly so that the '^' translation below cannot rewrite it.
            regex.append("^http(s)?://");
            body = body.substring(2);
        }

        StringBuilder literal = new StringBuilder();
        for (int i = 0; i < body.length(); i++) {
            char c = body.charAt(i);
            if (c == '*' || c == '^') {
                appendQuoted(regex, literal);
                regex.append(c == '*' ? ".*" : "[?/]");
            } else {
                literal.append(c);
            }
        }
        appendQuoted(regex, literal);

        return Pattern.compile(regex.toString()).asPredicate();
    }

    /** Appends the pending literal text to the regex in quoted form and clears it. */
    private static void appendQuoted(StringBuilder regex, StringBuilder literal) {
        if (literal.length() > 0) {
            // Quoting keeps characters such as '+', '(' or '[' from being read as regex syntax.
            regex.append(Pattern.quote(literal.toString()));
            literal.setLength(0);
        }
    }

    /** Node filter that stops the traversal at the first element matching any rule. */
    class RuleVisitor implements NodeFilter {
        /** Set to true once an element matching a rule has been visited. */
        public boolean sawAds;

        @Override
        public FilterResult head(Node node, int depth) {
            if (node.attributesSize() == 0) {
                return FilterResult.CONTINUE;
            }
            if (!(node instanceof Element elem)) {
                return FilterResult.CONTINUE;
            }

            // The script check runs even when the element has no class attribute.
            if (matchesIdRule(elem) || matchesClassRule(elem) || matchesScriptRule(elem)) {
                sawAds = true;
                return FilterResult.STOP;
            }
            return FilterResult.CONTINUE;
        }

        @Override
        public FilterResult tail(Node node, int depth) {
            return FilterResult.CONTINUE;
        }

        private boolean matchesIdRule(Element elem) {
            String id = elem.id();
            return !id.isEmpty() && idRules.contains(id);
        }

        private boolean matchesClassRule(Element elem) {
            String classes = elem.className();
            if (classes.isBlank()) {
                return false;
            }
            for (String className : WHITESPACE.split(classes.strip())) {
                if (classRules.contains(className)) {
                    return true;
                }
            }
            return false;
        }

        private boolean matchesScriptRule(Element elem) {
            if (!"script".equals(elem.tagName())) {
                return false;
            }
            String src = elem.attr("src");
            if (src.isEmpty()) {
                // Inline script: there is no URL to test.
                return false;
            }
            for (var rule : scriptRules) {
                if (rule.test(src)) {
                    return true;
                }
            }
            return false;
        }
    }

    /**
     * Tests a document against the loaded rules.
     *
     * @param document the parsed document to inspect
     * @return true if at least one element matches an id, class or script rule
     */
    public boolean hasAds(Document document) {
        RuleVisitor ruleVisitor = new RuleVisitor();
        document.filter(ruleVisitor);
        return ruleVisitor.sawAds;
    }
}

View File

@ -1,4 +1,4 @@
package nu.marginalia.wmsa.edge.converting.processor.logic;
package nu.marginalia.wmsa.edge.converting.processor.logic.topic;
import ca.rmen.porterstemmer.PorterStemmer;
import nu.marginalia.util.language.processing.model.DocumentLanguageData;
@ -205,7 +205,7 @@ public class RecipeDetector {
}
public double recipeP(DocumentLanguageData dld) {
public double testP(DocumentLanguageData dld) {
Map<String, Double> values = new HashMap<>();
int count = 0;

View File

@ -0,0 +1,158 @@
package nu.marginalia.wmsa.edge.converting.processor.logic.topic;
import ca.rmen.porterstemmer.PorterStemmer;
import nu.marginalia.util.language.processing.model.DocumentLanguageData;
import java.util.HashMap;
import java.util.Map;
import static java.lang.Math.max;
import static java.lang.Math.sqrt;
/**
 * Keyword-based scorer for documents about textile crafts.
 *
 * <p>Each known term is stored under its Porter stem together with a weight.
 * Craft vocabulary carries a positive weight, while a few commerce-flavoured
 * terms carry a negative one.
 */
public class TextileCraftDetector {
    private static final int AVG_LENGTH = 1000;

    private final Map<String, Double> termValues = new HashMap<>();

    /** Populates the term table; terms are registered in a fixed order. */
    public TextileCraftDetector() {
        final PorterStemmer stemmer = new PorterStemmer();

        register(stemmer, -0.1, "shop", "newsletter", "cart");
        register(stemmer, -0.025, "item");
        register(stemmer, -0.1, "price", "book", "order", "exhibition");

        register(stemmer, 0.05, "knit", "stitch", "yarn", "crochet");
        register(stemmer, 0.15, "ravelry");

        register(stemmer, 0.075,
                "stockinette", "purl", "ksp", "kwise", "k2tog", "k1b", "psso",
                "p2sso", "pwise", "yrn", "yon", "entrelac", "thrum");
        register(stemmer, 0.025, "bobbin");
        register(stemmer, 0.075, "boucle", "lopi");
        register(stemmer, 0.01, "eyelash");
        register(stemmer, 0.075, "variegated");
        register(stemmer, 0.04, "serge");
        register(stemmer, 0.075, "selvage", "topstitch");

        register(stemmer, 0.01,
                "gauge", "design", "pattern", "layer", "color", "colour", "chart",
                "grid", "wool", "acrylic", "loose", "loop", "needle", "row",
                "circular", "sew", "size", "repeat", "repetition", "basketweave",
                "weave", "loom", "warp", "weft", "shuttle", "brioche", "spool",
                "hem", "bodice", "seam", "allowance", "crinoline", "petticoat",
                "armscye", "baste", "cord", "darning", "draping", "embroider",
                "eyelet", "godet", "gore", "grain", "jersey", "lining", "muslin",
                "needlework", "pleat", "quilt", "silk", "sloper", "surplice",
                "thread", "twill",
                "ch", "sp", "sl", "sc", "ss", "hdc", "turn", "skip", "round",
                "ring", "sequin", "bobble", "puff", "v-stitch");
    }

    /** Stores every given word under its stem with the given weight, in argument order. */
    private void register(PorterStemmer stemmer, double weight, String... words) {
        for (String w : words) {
            termValues.put(stemmer.stemWord(w), weight);
        }
    }

    /**
     * Scores a document against the term table.
     *
     * <p>The first occurrence of a term contributes its weight; on every further
     * occurrence the accumulated value for that term is halved before the weight
     * is added again. The per-term totals are summed and scaled down for
     * documents longer than {@code AVG_LENGTH} words.
     *
     * @param dld the sentences of the document
     * @return the score, or 0 for a document without any words
     */
    public double testP(DocumentLanguageData dld) {
        final Map<String, Double> scoreByStem = new HashMap<>();
        int wordCount = 0;

        for (var sentence : dld.sentences) {
            for (var word : sentence) {
                wordCount++;

                final String stem = word.stemmed();
                final Double weight = termValues.get(stem);
                if (weight == null) {
                    continue;
                }

                final Double soFar = scoreByStem.get(stem);
                if (soFar == null) {
                    scoreByStem.put(stem, weight);
                } else {
                    scoreByStem.put(stem, 0.5 * soFar + weight);
                }
            }
        }

        if (wordCount == 0) {
            return 0.;
        }

        final double lengthPenalty = Math.sqrt(AVG_LENGTH) / Math.sqrt(Math.max(AVG_LENGTH, wordCount));
        final double total = scoreByStem.values().stream().mapToDouble(Double::doubleValue).sum();
        return total * lengthPenalty;
    }
}

View File

@ -0,0 +1,134 @@
package nu.marginalia.wmsa.edge.converting.processor.logic.topic;
import ca.rmen.porterstemmer.PorterStemmer;
import nu.marginalia.util.language.processing.model.DocumentLanguageData;
import java.util.HashMap;
import java.util.Map;
import static java.lang.Math.max;
import static java.lang.Math.sqrt;
/**
 * Keyword-based scorer for documents about woodworking and joinery.
 *
 * <p>Each known term is stored under its Porter stem together with a weight.
 * Trade vocabulary carries a positive weight, while a few commerce-flavoured
 * terms carry a negative one.
 */
public class WoodworkingDetector {
    private static final int AVG_LENGTH = 1000;

    private final Map<String, Double> termValues = new HashMap<>();

    /** Populates the term table; terms are registered in a fixed order. */
    public WoodworkingDetector() {
        PorterStemmer ps = new PorterStemmer();

        addTerms(ps, -0.1, "shop", "newsletter", "cart");
        addTerms(ps, -0.025, "item");
        addTerms(ps, -0.1, "price", "book", "order", "exhibition");

        // woodworking and joinery
        addTerms(ps, 0.01,
                "apse", "baluster", "beam", "cornice", "drill", "nail", "saw",
                "hacksaw", "bandsaw", "whipsaw", "gimlet", "clamp", "glue", "cut",
                "plane", "sand", "bevel", "chamfer");
        addTerms(ps, 0.075, "dado");
        addTerms(ps, 0.05, "dowel", "dovetail");
        addTerms(ps, 0.01,
                "joint", "level", "edge", "face", "fibreboard", "fiberboard",
                "battens", "furring");
        addTerms(ps, 0.025, "glulam", "hardboard");
        addTerms(ps, 0.01, "hardwood");
        addTerms(ps, 0.015, "jamb");
        addTerms(ps, 0.025, "kerf", "lvl");
        addTerms(ps, 0.01, "laminated", "lignin", "mitre");
        addTerms(ps, 0.015, "mortise");
        // NOTE(review): "sapwood" used to be registered twice with the same weight;
        // the redundant entry is dropped. It may have been meant to be a different
        // term - confirm with the author.
        // beep beep beep, huh, the stud detector seems to work just well :D
        addTerms(ps, 0.01,
                "mullion", "newel", "nogging", "ogee", "ogive", "ovolo",
                "drawknife", "plywood", "purlin", "riser", "sapwood", "shingle",
                "softwood", "stave", "stopper", "stud", "transom");
        addTerms(ps, 0.015, "v-joint");
        addTerms(ps, 0.01, "veneer");
        addTerms(ps, 0.015, "quartersaw");
        addTerms(ps, 0.01, "screw", "woodturning");
        addTerms(ps, 0.005, "pine");
        addTerms(ps, 0.01, "balsa");
        addTerms(ps, 0.005, "poplar");
        addTerms(ps, 0.01, "nut", "bolt", "tack", "hinge", "brass", "fitting");
        addTerms(ps, 0.015, "diy");
        addTerms(ps, 0.01, "dozuki");
    }

    /** Stores every given word under its stem with the given weight, in argument order. */
    private void addTerms(PorterStemmer ps, double weight, String... words) {
        for (String word : words) {
            termValues.put(ps.stemWord(word), weight);
        }
    }

    /**
     * Scores a document against the term table.
     *
     * <p>The first occurrence of a term contributes its weight; on every further
     * occurrence the accumulated value for that term is halved before the weight
     * is added again, so repetition has diminishing returns. The per-term totals
     * are summed and scaled down for documents longer than {@code AVG_LENGTH}
     * words.
     *
     * @param dld the sentences of the document
     * @return the score, or 0 for a document without any words
     */
    public double testP(DocumentLanguageData dld) {
        Map<String, Double> values = new HashMap<>();
        int count = 0;

        for (var sentence : dld.sentences) {
            for (var word : sentence) {
                count++;

                final String stemmed = word.stemmed();
                final Double value = termValues.get(stemmed);

                if (value != null) {
                    values.merge(stemmed, value, (a, b) -> 0.5 * a + b);
                }
            }
        }

        if (count == 0) {
            return 0.;
        }

        // Documents up to AVG_LENGTH words are not penalised; longer ones are scaled by sqrt(AVG_LENGTH / count).
        double lengthPenalty = Math.sqrt(AVG_LENGTH) / Math.sqrt(Math.max(AVG_LENGTH, count));

        return values.values().stream().mapToDouble(Double::doubleValue).sum() * lengthPenalty;
    }
}

View File

@ -22,5 +22,12 @@ public class CrawledDomainReader {
return gson.fromJson(br, CrawledDomain.class);
}
}
/**
 * Reads a Zstd-compressed JSON file into a {@link CrawledDomain}, rethrowing any
 * failure as an unchecked exception so the method can be used as a method
 * reference in stream pipelines.
 *
 * @param path the compressed file to read
 * @return the deserialized domain
 * @throws RuntimeException wrapping whatever went wrong; the message names the file
 */
public CrawledDomain readRuntimeExcept(Path path) {
    // The file stream is declared as its own resource so that it is closed even if
    // constructing one of the wrapping streams throws.
    // NOTE(review): InputStreamReader uses the platform default charset here - confirm
    // whether UTF-8 should be requested explicitly.
    try (var fileIn = new FileInputStream(path.toFile());
         var br = new BufferedReader(new InputStreamReader(new ZstdInputStream(new BufferedInputStream(fileIn))))) {
        return gson.fromJson(br, CrawledDomain.class);
    }
    catch (Exception ex) {
        throw new RuntimeException("Failed to read crawled domain from " + path, ex);
    }
}
}

View File

@ -1,9 +1,11 @@
package nu.marginalia.wmsa.edge.crawling;
import com.google.errorprone.annotations.MustBeClosed;
import nu.marginalia.wmsa.edge.crawling.model.CrawlLogEntry;
import org.apache.logging.log4j.util.Strings;
import java.io.*;
import java.io.FileOutputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
@ -12,6 +14,7 @@ import java.util.HashSet;
import java.util.Set;
import java.util.function.Consumer;
import java.util.regex.Pattern;
import java.util.stream.Stream;
public class WorkLog implements AutoCloseable {
private final Set<String> finishedJobs = new HashSet<>();
@ -29,15 +32,21 @@ public class WorkLog implements AutoCloseable {
return;
}
try (var lines = Files.lines(logFile)) {
lines.filter(WorkLog::isJobId).map(line -> {
String[] parts = line.split("\\s+");
return new CrawlLogEntry(parts[0], parts[1], parts[2], Integer.parseInt(parts[3]));
}).forEach(entryConsumer);
try (var entries = streamLog(logFile)) {
entries.forEach(entryConsumer);
} catch (IOException e) {
e.printStackTrace();
}
}
/**
 * Opens a work log and lazily parses its job lines into entries.
 *
 * <p>Lines rejected by {@code isJobId} are dropped; each remaining line is
 * split on whitespace and its first four fields are used, the fourth one as an
 * integer. The returned stream keeps the file open until it is closed.
 *
 * @param logFile the log file to read
 * @return a stream of parsed entries that the caller must close
 * @throws IOException if the file cannot be opened
 */
@MustBeClosed
public static Stream<CrawlLogEntry> streamLog(Path logFile) throws IOException {
    final Stream<String> jobLines = Files.lines(logFile).filter(WorkLog::isJobId);

    return jobLines.map(jobLine -> {
        final String[] fields = jobLine.split("\\s+");
        return new CrawlLogEntry(fields[0], fields[1], fields[2], Integer.parseInt(fields[3]));
    });
}
private void loadLog(Path logFile) throws IOException {
if (!Files.exists(logFile)) {
return;

View File

@ -1,15 +1,20 @@
package nu.marginalia.wmsa.edge.model;
import com.google.errorprone.annotations.MustBeClosed;
import lombok.AllArgsConstructor;
import lombok.NoArgsConstructor;
import lombok.ToString;
import nu.marginalia.wmsa.edge.crawling.CrawledDomainReader;
import nu.marginalia.wmsa.edge.crawling.WorkLog;
import nu.marginalia.wmsa.edge.crawling.model.CrawlLogEntry;
import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain;
import org.jetbrains.annotations.NotNull;
import java.io.IOException;
import java.nio.file.Path;
import java.util.Iterator;
import java.util.function.Consumer;
import java.util.stream.Stream;
@AllArgsConstructor @NoArgsConstructor @ToString
public class EdgeCrawlPlan {
@ -49,12 +54,44 @@ public class EdgeCrawlPlan {
public void forEachCrawledDomain(Consumer<CrawledDomain> consumer) {
final CrawledDomainReader reader = new CrawledDomainReader();
WorkLog.readLog(crawl.getLogFile(), entry -> {
try {
consumer.accept(reader.read(getCrawledFilePath(entry.path())));
} catch (IOException e) {
throw new RuntimeException(e);
}
});
try (Stream<CrawlLogEntry> entryStream = WorkLog.streamLog(crawl.getLogFile())) {
entryStream
.map(CrawlLogEntry::path)
.map(this::getCrawledFilePath)
.map(reader::readRuntimeExcept)
.forEach(consumer);
}
catch (IOException ex) {
throw new RuntimeException(ex);
}
}
/**
 * Creates an iterable over the crawled domains listed in this plan's crawl log.
 *
 * @return an iterable backed by an open stream; the caller must close it,
 *         typically with try-with-resources
 * @throws IOException if the crawl log cannot be opened
 */
@MustBeClosed
public DomainsIterable domainsIterable() throws IOException {
    return this.new DomainsIterable();
}
/**
 * Closeable view over the crawled domains referenced by the crawl log.
 *
 * <p>The view wraps a single stream, so {@link #iterator()} hands out that
 * stream's iterator; domains are read as the iteration advances.
 */
public class DomainsIterable implements Iterable<CrawledDomain>, AutoCloseable {
    private final Stream<CrawledDomain> domains;

    /**
     * Opens the crawl log and sets up the lazy pipeline that turns each log
     * entry into the domain stored at its path.
     *
     * @throws IOException if the crawl log cannot be opened
     */
    DomainsIterable() throws IOException {
        final CrawledDomainReader domainReader = new CrawledDomainReader();
        final Stream<CrawlLogEntry> logEntries = WorkLog.streamLog(crawl.getLogFile());

        this.domains = logEntries
                .map(entry -> getCrawledFilePath(entry.path()))
                .map(file -> domainReader.readRuntimeExcept(file));
    }

    /** Closes the underlying stream. */
    @Override
    public void close() {
        domains.close();
    }

    /** Returns the iterator of the underlying stream. */
    @NotNull
    @Override
    public Iterator<CrawledDomain> iterator() {
        return domains.iterator();
    }
}
}

View File

@ -0,0 +1,58 @@
package nu.marginalia.wmsa.edge.tools;
import nu.marginalia.wmsa.edge.converting.processor.logic.topic.AdblockSimulator;
import nu.marginalia.wmsa.edge.crawling.CrawlPlanLoader;
import nu.marginalia.wmsa.edge.crawling.model.CrawledDocument;
import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain;
import nu.marginalia.wmsa.edge.model.EdgeCrawlPlan;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import java.io.IOException;
import java.nio.file.Path;
import static nu.marginalia.wmsa.edge.converting.processor.DocumentProcessor.isAcceptedContentType;
/**
 * Command-line tool that runs the {@link AdblockSimulator} over every accepted
 * document of a crawl plan and prints the URL of each document in which the
 * simulator finds ads.
 *
 * <p>Usage: {@code AdblockTesterTool <crawl-plan-file>}. The filter list is
 * read from the path in the {@code adblock.easylist} system property, falling
 * back to the built-in default path when the property is not set.
 */
public class AdblockTesterTool {
    /** Filter list location used when the system property is not set. */
    private static final String DEFAULT_EASYLIST_PATH = "/home/vlofgren/easylist.txt";

    static AdblockSimulator simulator;

    static {
        // The default path only exists on the original author's machine, so allow an override.
        Path easyListPath = Path.of(System.getProperty("adblock.easylist", DEFAULT_EASYLIST_PATH));
        try {
            simulator = new AdblockSimulator(easyListPath);
        } catch (IOException e) {
            throw new RuntimeException("Could not load adblock rules from " + easyListPath, e);
        }
    }

    /**
     * Entry point.
     *
     * @param args a single argument, the path of the crawl plan file
     * @throws IOException if the plan or the crawl log cannot be read
     */
    public static void main(String... args) throws IOException {
        if (args.length < 1) {
            System.err.println("Usage: AdblockTesterTool <crawl-plan-file>");
            System.exit(1);
        }

        EdgeCrawlPlan plan = new CrawlPlanLoader().load(Path.of(args[0]));

        try (var iterable = plan.domainsIterable()) {
            for (var domain : iterable) {
                processDomain(domain);
            }
        }
    }

    /** Runs the simulator over each document of the domain that has status OK and an accepted content type. */
    private static void processDomain(CrawledDomain domain) {
        if (domain.doc == null) {
            return;
        }
        for (var doc : domain.doc) {
            if (isAcceptedContentType(doc) && "OK".equals(doc.crawlerStatus)) {
                processDocument(doc);
            }
        }
    }

    /** Parses the document body and prints the document's URL if the simulator reports ads. */
    private static void processDocument(CrawledDocument doc) {
        // NOTE(review): guards against a missing body; confirm whether an OK document can lack one.
        if (doc.documentBody == null) {
            return;
        }

        Document parsedDocument = Jsoup.parse(doc.documentBody);

        if (simulator.hasAds(parsedDocument)) {
            System.out.println(doc.url);
        }
    }
}

View File

@ -5,9 +5,10 @@ import nu.marginalia.util.language.processing.SentenceExtractor;
import nu.marginalia.util.language.processing.model.DocumentLanguageData;
import nu.marginalia.wmsa.configuration.WmsaHome;
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
import nu.marginalia.wmsa.edge.converting.processor.logic.RecipeDetector;
import nu.marginalia.wmsa.edge.converting.processor.logic.topic.RecipeDetector;
import nu.marginalia.wmsa.edge.converting.processor.logic.topic.TextileCraftDetector;
import nu.marginalia.wmsa.edge.converting.processor.logic.topic.WoodworkingDetector;
import nu.marginalia.wmsa.edge.crawling.CrawlPlanLoader;
import nu.marginalia.wmsa.edge.crawling.CrawledDomainReader;
import nu.marginalia.wmsa.edge.crawling.model.CrawledDocument;
import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain;
import nu.marginalia.wmsa.edge.model.EdgeCrawlPlan;
@ -25,8 +26,10 @@ import java.util.concurrent.TimeUnit;
import static nu.marginalia.wmsa.edge.converting.processor.DocumentProcessor.isAcceptedContentType;
public class RecipeDetectorTool {
private static final CrawledDomainReader reader = new CrawledDomainReader();
private static final RecipeDetector detector = new RecipeDetector();
private static final TextileCraftDetector textileCraftDetector = new TextileCraftDetector();
private static final WoodworkingDetector woodworkingDetector = new WoodworkingDetector();
private static final RecipeDetector recipeDetector = new RecipeDetector();
private static final LanguageModels lm = WmsaHome.getLanguageModels();
private static final SentenceExtractor sentenceExtractor = new SentenceExtractor(lm);
@ -49,7 +52,12 @@ public class RecipeDetectorTool {
}
ForkJoinPool pool = new ForkJoinPool(16);
plan.forEachCrawledDomain(data -> pool.execute(() -> processDomain(data)));
try (var iterable = plan.domainsIterable()) {
for (var domain : iterable) {
pool.execute(() -> processDomain(domain));
}
}
while (!pool.awaitQuiescence(1, TimeUnit.HOURS));
}
@ -74,9 +82,20 @@ public class RecipeDetectorTool {
parsedDocument.getElementsByTag("nav").remove();
DocumentLanguageData dld = sentenceExtractor.extractSentences(parsedDocument);
double prob = 100*detector.recipeP(dld);
double prob = 100*recipeDetector.testP(dld);
if (prob > 50) {
System.out.printf("%3.2f\t%s\n", prob, doc.url);
System.out.printf("#%3.2f recipe\t%s\n%s\n", prob, parsedDocument.title(), doc.url);
}
prob = 100*woodworkingDetector.testP(dld);
if (prob > 20) {
System.out.printf("#%3.2f woodworking\t%s\n%s\n", prob, parsedDocument.title(), doc.url);
}
prob = 100*textileCraftDetector.testP(dld);
if (prob > 20) {
System.out.printf("#%3.2f textilecraft\t%s\n%s\n", prob, parsedDocument.title(), doc.url);
}
}
}