Merge pull request 'Add features to suggestions' (#83) from master into release
Reviewed-on: https://git.marginalia.nu/marginalia/marginalia.nu/pulls/83
This commit is contained in:
commit f3a8a20321
@ -4,6 +4,7 @@ import com.google.inject.Inject;
import com.google.inject.name.Named;
import nu.marginalia.wmsa.edge.assistant.dict.NGramDict;
import nu.marginalia.wmsa.edge.assistant.dict.SpellChecker;
import nu.marginalia.wmsa.edge.converting.processor.logic.HtmlFeature;
import org.apache.commons.collections4.trie.PatriciaTrie;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -49,6 +50,13 @@ public class Suggestions {
.map(String::toLowerCase)
.forEach(w -> ret.put(w, w));

for (var feature : HtmlFeature.values()) {
String keyword = feature.getKeyword();

ret.put(keyword, keyword);
ret.put("-" + keyword, "-"+ keyword);
}

return ret;
}
catch (IOException ex) {
@ -1,6 +1,5 @@
package nu.marginalia.wmsa.edge.converting;

import com.google.common.base.Strings;
import com.google.gson.Gson;
import com.google.inject.Guice;
import com.google.inject.Inject;
@ -10,8 +9,6 @@ import nu.marginalia.wmsa.edge.converting.interpreter.Instruction;
import nu.marginalia.wmsa.edge.converting.processor.DomainProcessor;
import nu.marginalia.wmsa.edge.converting.processor.InstructionsCompiler;
import nu.marginalia.wmsa.edge.crawling.CrawlPlanLoader;
import nu.marginalia.wmsa.edge.crawling.CrawledDomainReader;
import nu.marginalia.wmsa.edge.crawling.CrawlerSpecificationLoader;
import nu.marginalia.wmsa.edge.crawling.WorkLog;
import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain;
import nu.marginalia.wmsa.edge.model.EdgeCrawlPlan;
@ -20,24 +17,13 @@ import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.nio.file.Path;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class ConverterMain {

private final Logger logger = LoggerFactory.getLogger(getClass());
private final DomainProcessor processor;
private final InstructionsCompiler compiler;
private final WorkLog processLog;
private final CrawledInstructionWriter instructionWriter;

private final Gson gson;
private final CrawledDomainReader reader = new CrawledDomainReader();

private final Map<String, String> domainToId = new HashMap<>();
private final Map<String, String> idToFileName = new HashMap<>();

public static void main(String... args) throws IOException {

if (args.length != 1) {
@ -60,29 +46,20 @@ public class ConverterMain {
InstructionsCompiler compiler,
Gson gson
) throws Exception {
this.processor = processor;
this.compiler = compiler;
this.gson = gson;

instructionWriter = new CrawledInstructionWriter(plan.process.getDir(), gson);

logger.info("Loading input spec");
CrawlerSpecificationLoader.readInputSpec(plan.getJobSpec(),
spec -> domainToId.put(spec.domain, spec.id));

logger.info("Replaying crawl log");
WorkLog.readLog(plan.crawl.getLogFile(),
entry -> idToFileName.put(entry.id(), entry.path()));

logger.info("Starting pipe");
processLog = new WorkLog(plan.process.getLogFile());

try (WorkLog processLog = plan.createProcessWorkLog()) {
var pipe = new ParallelPipe<CrawledDomain, ProcessingInstructions>("Crawler", 48, 4, 2) {

@Override
protected ProcessingInstructions onProcess(CrawledDomain domainData) {
var processed = processor.process(domainData);
return new ProcessingInstructions(domainData.id, compiler.compile(processed));
var compiled = compiler.compile(processed);

return new ProcessingInstructions(domainData.id, compiled);
}

@Override
@ -93,32 +70,18 @@ public class ConverterMain {
String where = instructionWriter.accept(processedInstructions.id, instructions);
processLog.setJobToFinished(processedInstructions.id, where, instructions.size());
}

};

domainToId.forEach((domain, id) -> {
String fileName = idToFileName.get(id);

if (Strings.isNullOrEmpty(fileName))
return;

Path dest = plan.getCrawledFilePath(fileName);

logger.info("{} - {} - {}", domain, id, dest);

if (!processLog.isJobFinished(id)) {
try {
var cd = reader.read(dest);
pipe.accept(cd);

} catch (IOException e) {
logger.error("Failed to read {}", dest);
}
plan.forEachCrawledDomain(domain -> {
if (!processLog.isJobFinished(domain.id)) {
logger.info("{} - {}", domain.domain, domain.id);
pipe.accept(domain);
}
});

pipe.join();

processLog.close();
}

logger.info("Finished");
@ -61,6 +61,7 @@ public class LinkKeywordExtractorMain {
.forEach(crawledUrls::add);

logger.info("Loading input spec");

CrawlerSpecificationLoader.readInputSpec(plan.getJobSpec(),
spec -> { crawledDomains.add(spec.domain); });
@ -0,0 +1,133 @@
package nu.marginalia.wmsa.edge.converting.processor.logic.topic;

import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.select.NodeFilter;

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;
import java.util.function.Predicate;
import java.util.regex.Pattern;

public class AdblockSimulator {

List<String> idRules = new ArrayList();
List<String> classRules = new ArrayList();
List<Predicate<String>> scriptRules = new ArrayList();

public AdblockSimulator(Path adsDefinition) throws IOException {
try (var lineStream = Files.lines(adsDefinition)) {
lineStream.skip(1).forEach(this::addRule);
}
}

private void addRule(String s) {
if (s.startsWith("##") && !s.contains(":")) {
if (s.startsWith("###")) {
idRules.add(s.substring(3));
} else if(s.startsWith("##.")) {
classRules.add(s.substring(3));
}
}
else if (!s.startsWith("!") && !s.contains("#")){
scriptRules.add(toRegexMatcher(s));
}
}

private Predicate<String> toRegexMatcher(String s) {

System.out.println("<-" + s);

s = s.replaceAll("\\?", "\\\\?");
s = s.replaceAll("\\.", "\\\\.");
s = s.replaceAll("\\$", "\\\\\\$");

if (s.startsWith("||")) {
s = s.replaceFirst("\\|\\|","^http(s)?://");
}

s = s.replaceAll("\\|", "\\\\|");
s = s.replaceAll("\\*", ".*");
s = s.replaceAll("\\^", "[?/]");

System.out.println("->" + s);
return Pattern.compile(s).asPredicate();
}

class RuleVisitor implements NodeFilter {
public boolean sawAds;
Pattern spPattern = Pattern.compile("\\s");

@Override
public FilterResult head(Node node, int depth) {

if (node.attributesSize() > 0 && node instanceof Element elem) { // instanceof is slow

String id = elem.id();
for (var rule : idRules) {
if (rule.equals(id)) {
sawAds = true;
return FilterResult.STOP;
}
}

String classes = elem.className();
if (classes.isBlank()) return FilterResult.CONTINUE;

if (classes.indexOf(' ') > 0) {
String[] classNames = spPattern.split(classes);
for (var rule : classRules) {

for (var className : classNames) {
if (className.equals(rule)) {
sawAds = true;
return FilterResult.STOP;
}
}
}
}
else { // tag only has one class
for (var rule : classRules) {
if (classes.equals(rule)) {
sawAds = true;
return FilterResult.STOP;
}
}
}

if ("script".equals(elem.tagName())) {
String src = elem.attr("src");

for (var rule : scriptRules) {
if (rule.test(src)) {
sawAds = true;
return FilterResult.STOP;
}
}
}

return FilterResult.CONTINUE;
}
return FilterResult.CONTINUE;
}

@Override
public FilterResult tail(Node node, int depth) {
return FilterResult.CONTINUE;
}
}

public boolean hasAds(Document document) {

RuleVisitor ruleVisitor = new RuleVisitor();
document.filter(ruleVisitor);

return ruleVisitor.sawAds;
}

}
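AdblockSimulator reads an EasyList-style rule file (skipping the header line), keeps ###id and ##.class element-hiding rules, and turns the remaining non-comment, non-element rules into regex predicates that are tested against script src attributes; hasAds() then walks a jsoup document with the RuleVisitor above. A minimal usage sketch, with a hypothetical rule-file path and page; hasAds() returns true only if some loaded rule actually matches:

    import org.jsoup.Jsoup;
    import java.nio.file.Path;

    class AdblockSketch {
        public static void main(String[] args) throws Exception {
            // Hypothetical path to an EasyList-format file on disk.
            var simulator = new AdblockSimulator(Path.of("/tmp/easylist.txt"));

            // Hypothetical page markup; any matching id/class/script rule flips sawAds.
            var doc = Jsoup.parse("<html><body><div id='carbonads'></div></body></html>");
            System.out.println(simulator.hasAds(doc));
        }
    }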
@ -1,4 +1,4 @@
package nu.marginalia.wmsa.edge.converting.processor.logic;
package nu.marginalia.wmsa.edge.converting.processor.logic.topic;

import ca.rmen.porterstemmer.PorterStemmer;
import nu.marginalia.util.language.processing.model.DocumentLanguageData;
@ -205,7 +205,7 @@ public class RecipeDetector {

}

public double recipeP(DocumentLanguageData dld) {
public double testP(DocumentLanguageData dld) {

Map<String, Double> values = new HashMap<>();
int count = 0;
@ -0,0 +1,158 @@
package nu.marginalia.wmsa.edge.converting.processor.logic.topic;

import ca.rmen.porterstemmer.PorterStemmer;
import nu.marginalia.util.language.processing.model.DocumentLanguageData;

import java.util.HashMap;
import java.util.Map;

import static java.lang.Math.max;
import static java.lang.Math.sqrt;

public class TextileCraftDetector {
private static final int AVG_LENGTH = 1000;

private final Map<String, Double> termValues = new HashMap<>();

public TextileCraftDetector() {
PorterStemmer ps = new PorterStemmer();

termValues.put(ps.stemWord("shop"), -0.1);
termValues.put(ps.stemWord("newsletter"), -0.1);
termValues.put(ps.stemWord("cart"), -0.1);
termValues.put(ps.stemWord("item"), -0.025);
termValues.put(ps.stemWord("price"), -0.1);
termValues.put(ps.stemWord("book"), -0.1);
termValues.put(ps.stemWord("order"), -0.1);
termValues.put(ps.stemWord("exhibition"), -0.1);

termValues.put(ps.stemWord("knit"), 0.05);
termValues.put(ps.stemWord("stitch"), 0.05);
termValues.put(ps.stemWord("yarn"), 0.05);
termValues.put(ps.stemWord("crochet"), 0.05);
termValues.put(ps.stemWord("ravelry"), 0.15);

termValues.put(ps.stemWord("stockinette"), 0.075);
termValues.put(ps.stemWord("purl"), 0.075);
termValues.put(ps.stemWord("ksp"), 0.075);
termValues.put(ps.stemWord("kwise"), 0.075);
termValues.put(ps.stemWord("k2tog"), 0.075);
termValues.put(ps.stemWord("k1b"), 0.075);
termValues.put(ps.stemWord("psso"), 0.075);
termValues.put(ps.stemWord("p2sso"), 0.075);
termValues.put(ps.stemWord("pwise"), 0.075);
termValues.put(ps.stemWord("yrn"), 0.075);
termValues.put(ps.stemWord("yon"), 0.075);
termValues.put(ps.stemWord("entrelac"), 0.075);
termValues.put(ps.stemWord("thrum"), 0.075);
termValues.put(ps.stemWord("bobbin"), 0.025);

termValues.put(ps.stemWord("boucle"), 0.075);
termValues.put(ps.stemWord("lopi"), 0.075);
termValues.put(ps.stemWord("eyelash"), 0.01);
termValues.put(ps.stemWord("variegated"), 0.075);

termValues.put(ps.stemWord("serge"), 0.04);
termValues.put(ps.stemWord("selvage"), 0.075);
termValues.put(ps.stemWord("topstitch"), 0.075);

termValues.put(ps.stemWord("gauge"), 0.01);
termValues.put(ps.stemWord("design"), 0.01);
termValues.put(ps.stemWord("pattern"), 0.01);
termValues.put(ps.stemWord("layer"), 0.01);
termValues.put(ps.stemWord("color"), 0.01);
termValues.put(ps.stemWord("colour"), 0.01);
termValues.put(ps.stemWord("chart"), 0.01);
termValues.put(ps.stemWord("grid"), 0.01);
termValues.put(ps.stemWord("wool"), 0.01);
termValues.put(ps.stemWord("acrylic"), 0.01);
termValues.put(ps.stemWord("loose"), 0.01);
termValues.put(ps.stemWord("loop"), 0.01);
termValues.put(ps.stemWord("needle"), 0.01);
termValues.put(ps.stemWord("row"), 0.01);
termValues.put(ps.stemWord("circular"), 0.01);
termValues.put(ps.stemWord("sew"), 0.01);
termValues.put(ps.stemWord("size"), 0.01);
termValues.put(ps.stemWord("repeat"), 0.01);
termValues.put(ps.stemWord("repetition"), 0.01);
termValues.put(ps.stemWord("basketweave"), 0.01);
termValues.put(ps.stemWord("weave"), 0.01);
termValues.put(ps.stemWord("loom"), 0.01);
termValues.put(ps.stemWord("warp"), 0.01);
termValues.put(ps.stemWord("weft"), 0.01);
termValues.put(ps.stemWord("shuttle"), 0.01);
termValues.put(ps.stemWord("brioche"), 0.01);
termValues.put(ps.stemWord("spool"), 0.01);
termValues.put(ps.stemWord("hem"), 0.01);
termValues.put(ps.stemWord("bodice"), 0.01);
termValues.put(ps.stemWord("seam"), 0.01);
termValues.put(ps.stemWord("allowance"), 0.01);
termValues.put(ps.stemWord("crinoline"), 0.01);
termValues.put(ps.stemWord("petticoat"), 0.01);
termValues.put(ps.stemWord("armscye"), 0.01);
termValues.put(ps.stemWord("baste"), 0.01);
termValues.put(ps.stemWord("cord"), 0.01);
termValues.put(ps.stemWord("darning"), 0.01);
termValues.put(ps.stemWord("draping"), 0.01);
termValues.put(ps.stemWord("embroider"), 0.01);
termValues.put(ps.stemWord("eyelet"), 0.01);
termValues.put(ps.stemWord("godet"), 0.01);
termValues.put(ps.stemWord("gore"), 0.01);
termValues.put(ps.stemWord("grain"), 0.01);
termValues.put(ps.stemWord("jersey"), 0.01);
termValues.put(ps.stemWord("lining"), 0.01);
termValues.put(ps.stemWord("muslin"), 0.01);
termValues.put(ps.stemWord("needlework"), 0.01);
termValues.put(ps.stemWord("pleat"), 0.01);
termValues.put(ps.stemWord("quilt"), 0.01);
termValues.put(ps.stemWord("silk"), 0.01);

termValues.put(ps.stemWord("sloper"), 0.01);
termValues.put(ps.stemWord("surplice"), 0.01);
termValues.put(ps.stemWord("thread"), 0.01);
termValues.put(ps.stemWord("twill"), 0.01);

termValues.put(ps.stemWord("ch"), 0.01);
termValues.put(ps.stemWord("sp"), 0.01);
termValues.put(ps.stemWord("sl"), 0.01);
termValues.put(ps.stemWord("sc"), 0.01);
termValues.put(ps.stemWord("ss"), 0.01);
termValues.put(ps.stemWord("hdc"), 0.01);
termValues.put(ps.stemWord("turn"), 0.01);
termValues.put(ps.stemWord("skip"), 0.01);
termValues.put(ps.stemWord("round"), 0.01);
termValues.put(ps.stemWord("ring"), 0.01);

termValues.put(ps.stemWord("sequin"), 0.01);
termValues.put(ps.stemWord("bobble"), 0.01);
termValues.put(ps.stemWord("puff"), 0.01);
termValues.put(ps.stemWord("v-stitch"), 0.01);
}

public double testP(DocumentLanguageData dld) {

Map<String, Double> values = new HashMap<>();
int count = 0;
for (var sentence : dld.sentences) {

for (var word : sentence) {
count++;

final String stemmed = word.stemmed();
final Double value = termValues.get(stemmed);

if (value != null) {
values.merge(stemmed, value, (a,b) -> 0.5*a + b);
}
}

}

if (count == 0) return 0.;

double lengthPenalty = sqrt(AVG_LENGTH)/sqrt(max(AVG_LENGTH, count));

return values.values().stream().mapToDouble(Double::valueOf).sum() * lengthPenalty;
}

}
@ -0,0 +1,134 @@
package nu.marginalia.wmsa.edge.converting.processor.logic.topic;

import ca.rmen.porterstemmer.PorterStemmer;
import nu.marginalia.util.language.processing.model.DocumentLanguageData;

import java.util.HashMap;
import java.util.Map;

import static java.lang.Math.max;
import static java.lang.Math.sqrt;

public class WoodworkingDetector {
private static final int AVG_LENGTH = 1000;

private final Map<String, Double> termValues = new HashMap<>();

public WoodworkingDetector() {
PorterStemmer ps = new PorterStemmer();

termValues.put(ps.stemWord("shop"), -0.1);
termValues.put(ps.stemWord("newsletter"), -0.1);
termValues.put(ps.stemWord("cart"), -0.1);
termValues.put(ps.stemWord("item"), -0.025);
termValues.put(ps.stemWord("price"), -0.1);
termValues.put(ps.stemWord("book"), -0.1);
termValues.put(ps.stemWord("order"), -0.1);
termValues.put(ps.stemWord("exhibition"), -0.1);

// woodworking and joinery
termValues.put(ps.stemWord("apse"), 0.01);
termValues.put(ps.stemWord("baluster"), 0.01);
termValues.put(ps.stemWord("beam"), 0.01);
termValues.put(ps.stemWord("cornice"), 0.01);
termValues.put(ps.stemWord("drill"), 0.01);
termValues.put(ps.stemWord("nail"), 0.01);
termValues.put(ps.stemWord("saw"), 0.01);
termValues.put(ps.stemWord("hacksaw"), 0.01);
termValues.put(ps.stemWord("bandsaw"), 0.01);
termValues.put(ps.stemWord("whipsaw"), 0.01);
termValues.put(ps.stemWord("gimlet"), 0.01);
termValues.put(ps.stemWord("clamp"), 0.01);
termValues.put(ps.stemWord("glue"), 0.01);
termValues.put(ps.stemWord("cut"), 0.01);
termValues.put(ps.stemWord("plane"), 0.01);
termValues.put(ps.stemWord("sand"), 0.01);
termValues.put(ps.stemWord("bevel"), 0.01);
termValues.put(ps.stemWord("chamfer"), 0.01);
termValues.put(ps.stemWord("dado"), 0.075);
termValues.put(ps.stemWord("dowel"), 0.05);
termValues.put(ps.stemWord("dovetail"), 0.05);
termValues.put(ps.stemWord("joint"), 0.01);
termValues.put(ps.stemWord("level"), 0.01);
termValues.put(ps.stemWord("edge"), 0.01);
termValues.put(ps.stemWord("face"), 0.01);
termValues.put(ps.stemWord("fibreboard"), 0.01);
termValues.put(ps.stemWord("fiberboard"), 0.01);
termValues.put(ps.stemWord("battens"), 0.01);
termValues.put(ps.stemWord("furring"), 0.01);
termValues.put(ps.stemWord("glulam"), 0.025);
termValues.put(ps.stemWord("hardboard"), 0.025);
termValues.put(ps.stemWord("hardwood"), 0.01);
termValues.put(ps.stemWord("jamb"), 0.015);
termValues.put(ps.stemWord("kerf"), 0.025);
termValues.put(ps.stemWord("lvl"), 0.025);
termValues.put(ps.stemWord("laminated"), 0.01);
termValues.put(ps.stemWord("lignin"), 0.01);
termValues.put(ps.stemWord("mitre"), 0.01);
termValues.put(ps.stemWord("mortise"), 0.015);
termValues.put(ps.stemWord("mullion"), 0.01);
termValues.put(ps.stemWord("newel"), 0.01);
termValues.put(ps.stemWord("nogging"), 0.01);
termValues.put(ps.stemWord("ogee"), 0.01);
termValues.put(ps.stemWord("ogive"), 0.01);
termValues.put(ps.stemWord("ovolo"), 0.01);
termValues.put(ps.stemWord("drawknife"), 0.01);
termValues.put(ps.stemWord("plywood"), 0.01);
termValues.put(ps.stemWord("purlin"), 0.01);
termValues.put(ps.stemWord("riser"), 0.01);
termValues.put(ps.stemWord("sapwood"), 0.01);
termValues.put(ps.stemWord("shingle"), 0.01);
termValues.put(ps.stemWord("softwood"), 0.01);
termValues.put(ps.stemWord("sapwood"), 0.01);
termValues.put(ps.stemWord("stave"), 0.01);
termValues.put(ps.stemWord("stopper"), 0.01);
termValues.put(ps.stemWord("stud"), 0.01); // beep beep beep, huh, the stud detector seems to work just well :D
termValues.put(ps.stemWord("transom"), 0.01);
termValues.put(ps.stemWord("v-joint"), 0.015);
termValues.put(ps.stemWord("veneer"), 0.01);
termValues.put(ps.stemWord("quartersaw"), 0.015);
termValues.put(ps.stemWord("screw"), 0.01);
termValues.put(ps.stemWord("woodturning"), 0.01);

termValues.put(ps.stemWord("pine"), 0.005);
termValues.put(ps.stemWord("balsa"), 0.01);
termValues.put(ps.stemWord("poplar"), 0.005);

termValues.put(ps.stemWord("nut"), 0.01);
termValues.put(ps.stemWord("bolt"), 0.01);
termValues.put(ps.stemWord("tack"), 0.01);
termValues.put(ps.stemWord("hinge"), 0.01);
termValues.put(ps.stemWord("brass"), 0.01);
termValues.put(ps.stemWord("fitting"), 0.01);

termValues.put(ps.stemWord("diy"), 0.015);
termValues.put(ps.stemWord("dozuki"), 0.01);
}

public double testP(DocumentLanguageData dld) {

Map<String, Double> values = new HashMap<>();
int count = 0;
for (var sentence : dld.sentences) {

for (var word : sentence) {
count++;

final String stemmed = word.stemmed();
final Double value = termValues.get(stemmed);

if (value != null) {
values.merge(stemmed, value, (a,b) -> 0.5*a + b);
}
}

}

if (count == 0) return 0.;

double lengthPenalty = sqrt(AVG_LENGTH)/sqrt(max(AVG_LENGTH, count));

return values.values().stream().mapToDouble(Double::valueOf).sum() * lengthPenalty;
}

}
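Both new detectors score a page the same way as the renamed RecipeDetector.testP above: every stemmed word found in the term table contributes its weight, repeated occurrences of the same term are damped by the merge function (accumulated value becomes 0.5 * previous + weight), and the raw sum is scaled by a length penalty of sqrt(AVG_LENGTH) / sqrt(max(AVG_LENGTH, count)). As a worked example under those formulas: a 4000-word page whose matched terms sum to 0.5 scores 0.5 * sqrt(1000/4000) = 0.25, while a 1000-word page with the same matches keeps the full 0.5. RecipeDetectorTool further down multiplies these scores by 100 before applying its 50/20/20 print thresholds.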
@ -3,12 +3,15 @@ package nu.marginalia.wmsa.edge.crawling;
import com.github.luben.zstd.ZstdInputStream;
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import nu.marginalia.wmsa.edge.crawling.model.CrawledDocument;
import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.*;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;

public class CrawledDomainReader {
private final Gson gson = new GsonBuilder().create();
@ -18,9 +21,40 @@ public class CrawledDomainReader {
}

public CrawledDomain read(Path path) throws IOException {
List<CrawledDocument> docs = new ArrayList<>();
CrawledDomain domain = null;
try (var br = new BufferedReader(new InputStreamReader(new ZstdInputStream(new BufferedInputStream(new FileInputStream(path.toFile())))))) {
return gson.fromJson(br, CrawledDomain.class);
String line;
while ((line = br.readLine()) != null) {
if (line.startsWith("//")) {
String nextLine = br.readLine();
if (nextLine == null) break;

if (line.equals(CrawledDomain.SERIAL_IDENTIFIER)) {
domain = gson.fromJson(nextLine, CrawledDomain.class);
} else if (line.equals(CrawledDocument.SERIAL_IDENTIFIER)) {
docs.add(gson.fromJson(nextLine, CrawledDocument.class));
}
}
else if (line.charAt(0) == '{') {
domain = gson.fromJson(line, CrawledDomain.class);
}
}
}

if (domain == null) {
return null;
}
domain.doc.addAll(docs);
return domain;
}

public CrawledDomain readRuntimeExcept(Path path) {
try {
return read(path);
}
catch (Exception ex) {
throw new RuntimeException(ex);
}
}
}
@ -3,40 +3,44 @@ package nu.marginalia.wmsa.edge.crawling;
import com.github.luben.zstd.ZstdOutputStream;
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain;
import nu.marginalia.wmsa.edge.crawling.model.SerializableCrawlData;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.BufferedOutputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.nio.file.Files;
import java.nio.file.Path;

public class CrawledDomainWriter {
public class CrawledDomainWriter implements AutoCloseable {
private final Path outputDir;
private final Gson gson = new GsonBuilder().create();
private static final Logger logger = LoggerFactory.getLogger(CrawledDomainWriter.class);
private final Writer writer;
private final Path outputFile;

public CrawledDomainWriter(Path outputDir) {
public CrawledDomainWriter(Path outputDir, String name, String id) throws IOException {
this.outputDir = outputDir;

if (!Files.isDirectory(outputDir)) {
throw new IllegalArgumentException("Output dir " + outputDir + " does not exist");
}

outputFile = getOutputFile(id, name);
writer = new OutputStreamWriter(new ZstdOutputStream(new BufferedOutputStream(Files.newOutputStream(outputFile))));
}

public String accept(CrawledDomain domainData) throws IOException {
Path outputFile = getOutputFile(domainData.id, domainData.domain);

try (var outputStream = new OutputStreamWriter(new ZstdOutputStream(new BufferedOutputStream(new FileOutputStream(outputFile.toFile()))))) {
logger.info("Writing {} - {}", domainData.id, domainData.domain);

gson.toJson(domainData, outputStream);
public Path getOutputFile() {
return outputFile;
}

return outputFile.getFileName().toString();
public void accept(SerializableCrawlData data) throws IOException {
writer.write(data.getSerialIdentifier());
writer.write('\n');
gson.toJson(data, writer);
writer.write('\n');
}

private Path getOutputFile(String id, String name) throws IOException {
@ -63,4 +67,9 @@ public class CrawledDomainWriter {
return nameSaneBuilder.toString();

}

@Override
public void close() throws IOException {
writer.close();
}
}
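CrawledDomainWriter now streams records instead of serializing one large CrawledDomain object: each accept() call writes the record's serial identifier ("// DOMAIN" or "// DOCUMENT") on one line and its single-line JSON on the next, which is exactly the framing the new CrawledDomainReader.read() parses back. A minimal round-trip sketch under stated assumptions: the output directory, domain name, id and IP below are hypothetical, and the CrawledDomain constructor arguments follow the call seen in CrawlerRetreiver further down.

    import java.nio.file.Path;
    import java.util.ArrayList;

    class CrawlDataRoundTripSketch {
        public static void main(String[] args) throws Exception {
            Path outDir = Path.of("/tmp/crawl-data");   // hypothetical, assumed to already exist

            var domain = new CrawledDomain("0000", "www.example.com", null, "OK",
                    null, "127.0.0.1", new ArrayList<>(), null);

            var writer = new CrawledDomainWriter(outDir, domain.domain, domain.id);
            try (writer) {
                // Fetched documents would be accepted here, one "// DOCUMENT" + JSON pair each.
                writer.accept(domain);                   // "// DOMAIN" identifier line + one JSON line
            }

            // The reader reassembles the domain record and any document records that preceded it.
            CrawledDomain roundTripped = new CrawledDomainReader().read(writer.getOutputFile());
            System.out.println(roundTripped.domain);
        }
    }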
@ -4,68 +4,44 @@ import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import nu.marginalia.wmsa.configuration.UserAgent;
import nu.marginalia.wmsa.configuration.WmsaHome;
import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain;
import nu.marginalia.wmsa.edge.crawling.model.CrawlingSpecification;
import nu.marginalia.wmsa.edge.crawling.retreival.CrawlerRetreiver;
import nu.marginalia.wmsa.edge.crawling.retreival.HttpFetcher;
import nu.marginalia.util.ParallelPipe;
import nu.marginalia.wmsa.edge.model.EdgeCrawlPlan;
import okhttp3.Dispatcher;
import okhttp3.internal.Util;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.*;
import java.nio.file.Path;
import java.util.concurrent.Semaphore;
import java.util.concurrent.SynchronousQueue;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.*;

public class CrawlerMain implements AutoCloseable {
public static Gson gson = new GsonBuilder().create();
private final Logger logger = LoggerFactory.getLogger(getClass());

private final Path inputSpec;
private final EdgeCrawlPlan plan;
private final Path crawlDataDir;

private final WorkLog workLog;
private final CrawledDomainWriter domainWriter;

private final int numberOfThreads;
private final ParallelPipe<CrawlingSpecification, CrawledDomain> pipe;
private final Dispatcher dispatcher = new Dispatcher(new ThreadPoolExecutor(0, Integer.MAX_VALUE, 5, TimeUnit.SECONDS,
new SynchronousQueue<>(), Util.threadFactory("OkHttp Dispatcher", true)));

private final UserAgent userAgent;
private final ThreadPoolExecutor pool;
final int poolSize = 256;
final int poolQueueSize = 32;

public CrawlerMain(EdgeCrawlPlan plan) throws Exception {
this.inputSpec = plan.getJobSpec();
this.numberOfThreads = 512;
this.plan = plan;
this.userAgent = WmsaHome.getUserAgent();

workLog = new WorkLog(plan.crawl.getLogFile());
domainWriter = new CrawledDomainWriter(plan.crawl.getDir());
BlockingQueue<Runnable> queue = new LinkedBlockingQueue<>(poolQueueSize);
pool = new ThreadPoolExecutor(poolSize/128, poolSize, 5, TimeUnit.MINUTES, queue); // maybe need to set -Xss for JVM to deal with this?

Semaphore sem = new Semaphore(250_000);

pipe = new ParallelPipe<>("Crawler", numberOfThreads, 2, 1) {
@Override
protected CrawledDomain onProcess(CrawlingSpecification crawlingSpecification) throws Exception {
int toAcquire = crawlingSpecification.urls.size();
sem.acquire(toAcquire);
try {
return fetchDomain(crawlingSpecification);
}
finally {
sem.release(toAcquire);
}
}

@Override
protected void onReceive(CrawledDomain crawledDomain) throws IOException {
writeDomain(crawledDomain);
}
};
workLog = plan.createCrawlWorkLog();
crawlDataDir = plan.crawl.getDir();
}

public static void main(String... args) throws Exception {
@ -84,55 +60,74 @@ public class CrawlerMain implements AutoCloseable {
crawler.run();
}

// TODO (2022-05-24): Some thread isn't set to daemon mode, need to explicitly harakiri the process, find why?
System.exit(0);
}

private CrawledDomain fetchDomain(CrawlingSpecification specification) {
private void fetchDomain(CrawlingSpecification specification) {
if (workLog.isJobFinished(specification.id))
return null;
return;

var fetcher = new HttpFetcher(userAgent.uaString(), dispatcher);

try {
var retreiver = new CrawlerRetreiver(fetcher, specification);
try (var writer = new CrawledDomainWriter(crawlDataDir, specification.domain, specification.id)) {
var retreiver = new CrawlerRetreiver(fetcher, specification, writer);

return retreiver.fetch();
int size = retreiver.fetch();

workLog.setJobToFinished(specification.id, writer.getOutputFile().toString(), size);

logger.info("Fetched {}", specification.domain);
} catch (Exception e) {
logger.error("Error fetching domain", e);
return null;
}
}

private void writeDomain(CrawledDomain crawledDomain) throws IOException {
String name = domainWriter.accept(crawledDomain);
workLog.setJobToFinished(crawledDomain.id, name, crawledDomain.size());
}

public void run() throws InterruptedException {
// First a validation run to ensure the file is all good to parse

logger.info("Validating JSON");
CrawlerSpecificationLoader.readInputSpec(inputSpec, spec -> {});
plan.forEachCrawlingSpecification(unused -> {});

logger.info("Starting pipe");
CrawlerSpecificationLoader.readInputSpec(inputSpec, pipe::accept);
logger.info("Let's go");

if (!AbortMonitor.getInstance().isAlive()) {
logger.info("Aborting");
pipe.clearQueues();
AbortMonitor abortMonitor = AbortMonitor.getInstance();

Semaphore taskSem = new Semaphore(poolSize);

plan.forEachCrawlingSpecification(spec -> {
if (abortMonitor.isAlive()) {
try {
taskSem.acquire();
} catch (InterruptedException e) {
throw new RuntimeException(e);
}
else {
logger.info("All jobs queued, waiting for pipe to finish");
}
pipe.join();

logger.info("All finished");
pool.execute(() -> {
try {
fetchDomain(spec);
}
finally {
taskSem.release();
}
});
}
});

}

public void close() throws Exception {
logger.info("Awaiting termination");
pool.shutdown();

while (!pool.awaitTermination(1, TimeUnit.SECONDS));
logger.info("All finished");

workLog.close();
dispatcher.executorService().shutdownNow();

}

}
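The reworked run() above replaces the ParallelPipe with a plain ThreadPoolExecutor fed through a bounded queue, plus a Semaphore sized to the pool that the spec-reading loop acquires before submitting each crawl job and releases when the job finishes, so the producer blocks instead of queueing the whole crawl plan at once. A stripped-down, runnable sketch of that throttling pattern (sizes follow the diff; the crawl work itself is stubbed out):

    import java.util.List;
    import java.util.concurrent.*;

    class ThrottledSubmitSketch {
        public static void main(String[] args) throws Exception {
            var pool = new ThreadPoolExecutor(2, 256, 5, TimeUnit.MINUTES,
                    new LinkedBlockingQueue<>(32));      // bounded queue, as in the diff
            var taskSem = new Semaphore(256);            // at most 256 jobs in flight

            for (String spec : List.of("a", "b", "c")) { // stand-in for forEachCrawlingSpecification
                taskSem.acquire();                       // producer blocks when the pool is saturated
                pool.execute(() -> {
                    try {
                        System.out.println("crawling " + spec);  // stand-in for fetchDomain(spec)
                    } finally {
                        taskSem.release();
                    }
                });
            }
            pool.shutdown();
            while (!pool.awaitTermination(1, TimeUnit.SECONDS));
        }
    }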
@ -1,9 +1,11 @@
package nu.marginalia.wmsa.edge.crawling;

import com.google.errorprone.annotations.MustBeClosed;
import nu.marginalia.wmsa.edge.crawling.model.CrawlLogEntry;
import org.apache.logging.log4j.util.Strings;

import java.io.*;
import java.io.FileOutputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
@ -12,6 +14,7 @@ import java.util.HashSet;
import java.util.Set;
import java.util.function.Consumer;
import java.util.regex.Pattern;
import java.util.stream.Stream;

public class WorkLog implements AutoCloseable {
private final Set<String> finishedJobs = new HashSet<>();
@ -29,15 +32,21 @@ public class WorkLog implements AutoCloseable {
return;
}

try (var lines = Files.lines(logFile)) {
lines.filter(WorkLog::isJobId).map(line -> {
String[] parts = line.split("\\s+");
return new CrawlLogEntry(parts[0], parts[1], parts[2], Integer.parseInt(parts[3]));
}).forEach(entryConsumer);
try (var entries = streamLog(logFile)) {
entries.forEach(entryConsumer);
} catch (IOException e) {
e.printStackTrace();
}
}

@MustBeClosed
public static Stream<CrawlLogEntry> streamLog(Path logFile) throws IOException {
return Files.lines(logFile).filter(WorkLog::isJobId).map(line -> {
String[] parts = line.split("\\s+");
return new CrawlLogEntry(parts[0], parts[1], parts[2], Integer.parseInt(parts[3]));
});
}

private void loadLog(Path logFile) throws IOException {
if (!Files.exists(logFile)) {
return;
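The new streamLog helper returns a Stream backed by Files.lines, hence the @MustBeClosed annotation; callers are expected to wrap it in try-with-resources, as readLog above and EdgeCrawlPlan further down now do. A minimal fragment, assuming a hypothetical log path and the surrounding imports:

    try (Stream<CrawlLogEntry> entries = WorkLog.streamLog(Path.of("/tmp/crawler.log"))) {
        entries.forEach(e -> System.out.println(e.id() + " -> " + e.path()));
    }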
@ -3,7 +3,7 @@ package nu.marginalia.wmsa.edge.crawling.model;
import lombok.Builder;

@Builder
public class CrawledDocument {
public class CrawledDocument implements SerializableCrawlData {
public String crawlId;

public String url;
@ -22,4 +22,10 @@ public class CrawledDocument {

public String canonicalUrl;
public String redirectUrl;

public static final String SERIAL_IDENTIFIER = "// DOCUMENT";
@Override
public String getSerialIdentifier() {
return SERIAL_IDENTIFIER;
}
}
@ -7,7 +7,7 @@ import lombok.Data;
import java.util.List;

@AllArgsConstructor @Data @Builder
public class CrawledDomain {
public class CrawledDomain implements SerializableCrawlData {
public String id;
public String domain;

@ -24,4 +24,10 @@ public class CrawledDomain {
if (doc == null) return 0;
return doc.size();
}

public static final String SERIAL_IDENTIFIER = "// DOMAIN";
@Override
public String getSerialIdentifier() {
return SERIAL_IDENTIFIER;
}
}
@ -0,0 +1,5 @@
package nu.marginalia.wmsa.edge.crawling.model;

public interface SerializableCrawlData {
String getSerialIdentifier();
}
@ -4,6 +4,7 @@ import com.google.common.hash.HashFunction;
import com.google.common.hash.Hashing;
import lombok.SneakyThrows;
import nu.marginalia.wmsa.edge.converting.processor.logic.LinkParser;
import nu.marginalia.wmsa.edge.crawling.CrawledDomainWriter;
import nu.marginalia.wmsa.edge.crawling.blocklist.GeoIpBlocklist;
import nu.marginalia.wmsa.edge.crawling.blocklist.IpBlockList;
import nu.marginalia.wmsa.edge.crawling.blocklist.UrlBlocklist;
@ -14,6 +15,7 @@ import org.jsoup.nodes.Document;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.net.InetAddress;
import java.net.UnknownHostException;
import java.time.LocalDateTime;
@ -29,6 +31,7 @@ public class CrawlerRetreiver {
private final int depth;
private final String id;
private final String domain;
private final CrawledDomainWriter crawledDomainWriter;

private static final LinkParser linkParser = new LinkParser();
private static final Logger logger = LoggerFactory.getLogger(CrawlerRetreiver.class);
@ -45,7 +48,7 @@ public class CrawlerRetreiver {
}
}

public CrawlerRetreiver(HttpFetcher fetcher, CrawlingSpecification specs) {
public CrawlerRetreiver(HttpFetcher fetcher, CrawlingSpecification specs, CrawledDomainWriter crawledDomainWriter) {
this.fetcher = fetcher;
visited = new HashSet<>((int)(specs.urls.size() * 1.5));
known = new HashSet<>(specs.urls.size() * 10);
@ -53,6 +56,7 @@ public class CrawlerRetreiver {
depth = specs.crawlDepth;
id = specs.id;
domain = specs.domain;
this.crawledDomainWriter = crawledDomainWriter;

specs.urls.stream()
.map(this::parseUrl)
@ -78,12 +82,16 @@ public class CrawlerRetreiver {
}
}

public CrawledDomain fetch() {
logger.info("Fetching {}", domain);

public int fetch() throws IOException {
Optional<CrawledDomain> probeResult = probeDomainForProblems(domain);

return probeResult.orElseGet(this::crawlDomain);
if (probeResult.isPresent()) {
crawledDomainWriter.accept(probeResult.get());
return 1;
}
else {
return crawlDomain();
}
}

private Optional<CrawledDomain> probeDomainForProblems(String domain) {
@ -118,7 +126,7 @@ public class CrawlerRetreiver {
return Optional.empty();
}

private CrawledDomain crawlDomain() {
private int crawlDomain() throws IOException {
String ip = findIp(domain);

assert !queue.isEmpty();
@ -130,6 +138,8 @@ public class CrawlerRetreiver {
CrawledDomain ret = new CrawledDomain(id, domain, null, CrawlerDomainStatus.OK.name(), null, ip, docs, null);

int visitedCount = 0;
int fetchedCount = 0;

while (!queue.isEmpty() && visitedCount < depth) {
var top = queue.removeFirst();

@ -150,7 +160,11 @@ public class CrawlerRetreiver {
logger.debug("Fetching {}", top);
long startTime = System.currentTimeMillis();

fetchUrl(top).ifPresent(ret.doc::add);
var doc = fetchUrl(top);
if (doc.isPresent()) {
fetchedCount++;
crawledDomainWriter.accept(doc.get());
}

long crawledTime = System.currentTimeMillis() - startTime;
delay(crawlDelay, crawledTime);
@ -160,7 +174,9 @@ public class CrawlerRetreiver {

ret.cookies = fetcher.getCookies();

return ret;
crawledDomainWriter.accept(ret);

return fetchedCount;
}

private Optional<CrawledDocument> fetchUrl(EdgeUrl top) {
@ -254,10 +270,10 @@ public class CrawlerRetreiver {
@SneakyThrows
private void delay(long crawlDelay, long timeParsed) {
if (crawlDelay >= 1) {
if (timeParsed/1000 > crawlDelay)
if (timeParsed > crawlDelay)
return;

Thread.sleep(Math.min(1000*crawlDelay-timeParsed, 5000));
Thread.sleep(Math.min(crawlDelay-timeParsed, 5000));
}
else {
if (timeParsed > DEFAULT_CRAWL_DELAY_MS)
@ -7,7 +7,6 @@ import crawlercommons.robots.SimpleRobotRulesParser;
import lombok.AllArgsConstructor;
import lombok.SneakyThrows;
import lombok.ToString;
import nu.marginalia.wmsa.edge.converting.processor.logic.LinkParser;
import nu.marginalia.wmsa.edge.crawling.model.CrawledDocument;
import nu.marginalia.wmsa.edge.crawling.model.CrawlerDocumentStatus;
import nu.marginalia.wmsa.edge.model.EdgeDomain;
@ -43,8 +42,6 @@ public class HttpFetcher {

private static final SimpleRobotRulesParser robotsParser = new SimpleRobotRulesParser();

private final LinkParser linkParser = new LinkParser();

public void setAllowAllContentTypes(boolean allowAllContentTypes) {
this.allowAllContentTypes = allowAllContentTypes;
}
@ -1,15 +1,22 @@
package nu.marginalia.wmsa.edge.model;

import com.google.errorprone.annotations.MustBeClosed;
import lombok.AllArgsConstructor;
import lombok.NoArgsConstructor;
import lombok.ToString;
import nu.marginalia.wmsa.edge.crawling.CrawledDomainReader;
import nu.marginalia.wmsa.edge.crawling.CrawlerSpecificationLoader;
import nu.marginalia.wmsa.edge.crawling.WorkLog;
import nu.marginalia.wmsa.edge.crawling.model.CrawlLogEntry;
import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain;
import nu.marginalia.wmsa.edge.crawling.model.CrawlingSpecification;
import org.jetbrains.annotations.NotNull;

import java.io.IOException;
import java.nio.file.Path;
import java.util.Iterator;
import java.util.function.Consumer;
import java.util.stream.Stream;

@AllArgsConstructor @NoArgsConstructor @ToString
public class EdgeCrawlPlan {
@ -46,15 +53,66 @@ public class EdgeCrawlPlan {
return process.getDir().resolve(sp1).resolve(sp2).resolve(fileName);
}

public WorkLog createCrawlWorkLog() throws IOException {
return new WorkLog(crawl.getLogFile());
}

public WorkLog createProcessWorkLog() throws IOException {
return new WorkLog(process.getLogFile());
}

public void forEachCrawlingSpecification(Consumer<CrawlingSpecification> consumer) {
CrawlerSpecificationLoader.readInputSpec(getJobSpec(), consumer);
}

public void forEachCrawlingLogEntry(Consumer<CrawlLogEntry> consumer) {
WorkLog.readLog(this.crawl.getLogFile(), consumer);
}
public void forEachProcessingLogEntry(Consumer<CrawlLogEntry> consumer) {
WorkLog.readLog(this.process.getLogFile(), consumer);
}

public void forEachCrawledDomain(Consumer<CrawledDomain> consumer) {
final CrawledDomainReader reader = new CrawledDomainReader();

WorkLog.readLog(crawl.getLogFile(), entry -> {
try {
consumer.accept(reader.read(getCrawledFilePath(entry.path())));
} catch (IOException e) {
throw new RuntimeException(e);
try (Stream<CrawlLogEntry> entryStream = WorkLog.streamLog(crawl.getLogFile())) {
entryStream
.map(CrawlLogEntry::path)
.map(this::getCrawledFilePath)
.map(reader::readRuntimeExcept)
.forEach(consumer);
}
catch (IOException ex) {
throw new RuntimeException(ex);
}
}

@MustBeClosed
public DomainsIterable domainsIterable() throws IOException {
return new DomainsIterable();
}

public class DomainsIterable implements Iterable<CrawledDomain>, AutoCloseable {
private final Stream<CrawledDomain> stream;

DomainsIterable() throws IOException {
final CrawledDomainReader reader = new CrawledDomainReader();

stream = WorkLog.streamLog(crawl.getLogFile())
.map(CrawlLogEntry::path)
.map(EdgeCrawlPlan.this::getCrawledFilePath)
.map(reader::readRuntimeExcept);
}

@Override
public void close() {
stream.close();
}

@NotNull
@Override
public Iterator<CrawledDomain> iterator() {
return stream.iterator();
}
});
}
}
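domainsIterable() wraps an open Stream over the crawl work log, resolving each entry to its crawl file and reading it with readRuntimeExcept, which is why it is also @MustBeClosed; the tools below use it in try-with-resources. A minimal fragment, assuming plan is an already-loaded EdgeCrawlPlan:

    try (var domains = plan.domainsIterable()) {
        for (CrawledDomain domain : domains) {
            System.out.println(domain.domain + ": " + domain.size() + " documents");
        }
    }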
@ -0,0 +1,58 @@
package nu.marginalia.wmsa.edge.tools;

import nu.marginalia.wmsa.edge.converting.processor.logic.topic.AdblockSimulator;
import nu.marginalia.wmsa.edge.crawling.CrawlPlanLoader;
import nu.marginalia.wmsa.edge.crawling.model.CrawledDocument;
import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain;
import nu.marginalia.wmsa.edge.model.EdgeCrawlPlan;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import java.io.IOException;
import java.nio.file.Path;

import static nu.marginalia.wmsa.edge.converting.processor.DocumentProcessor.isAcceptedContentType;

public class AdblockTesterTool {

static AdblockSimulator simulator;

static {
try {
simulator = new AdblockSimulator(Path.of("/home/vlofgren/easylist.txt"));
} catch (IOException e) {
throw new RuntimeException(e);
}
}

public static void main(String... args) throws IOException {
EdgeCrawlPlan plan = new CrawlPlanLoader().load(Path.of(args[0]));

try (var iterable = plan.domainsIterable()) {
for (var domain : iterable) {
processDomain(domain);
}
}

}

private static void processDomain(CrawledDomain domain) {
if (domain.doc == null) return;
for (var doc : domain.doc) {
if (isAcceptedContentType(doc) && "OK".equals(doc.crawlerStatus)) {
processDocument(doc);
}
}
}

private static void processDocument(CrawledDocument doc) {
Document parsedDocument = Jsoup.parse(doc.documentBody);

if (simulator.hasAds(parsedDocument)) {
System.out.println(doc.url);
}
}
}
@ -5,9 +5,10 @@ import nu.marginalia.util.language.processing.SentenceExtractor;
import nu.marginalia.util.language.processing.model.DocumentLanguageData;
import nu.marginalia.wmsa.configuration.WmsaHome;
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
import nu.marginalia.wmsa.edge.converting.processor.logic.RecipeDetector;
import nu.marginalia.wmsa.edge.converting.processor.logic.topic.RecipeDetector;
import nu.marginalia.wmsa.edge.converting.processor.logic.topic.TextileCraftDetector;
import nu.marginalia.wmsa.edge.converting.processor.logic.topic.WoodworkingDetector;
import nu.marginalia.wmsa.edge.crawling.CrawlPlanLoader;
import nu.marginalia.wmsa.edge.crawling.CrawledDomainReader;
import nu.marginalia.wmsa.edge.crawling.model.CrawledDocument;
import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain;
import nu.marginalia.wmsa.edge.model.EdgeCrawlPlan;
@ -25,8 +26,10 @@ import java.util.concurrent.TimeUnit;
import static nu.marginalia.wmsa.edge.converting.processor.DocumentProcessor.isAcceptedContentType;

public class RecipeDetectorTool {
private static final CrawledDomainReader reader = new CrawledDomainReader();
private static final RecipeDetector detector = new RecipeDetector();
private static final TextileCraftDetector textileCraftDetector = new TextileCraftDetector();
private static final WoodworkingDetector woodworkingDetector = new WoodworkingDetector();
private static final RecipeDetector recipeDetector = new RecipeDetector();

private static final LanguageModels lm = WmsaHome.getLanguageModels();
private static final SentenceExtractor sentenceExtractor = new SentenceExtractor(lm);

@ -49,7 +52,12 @@ public class RecipeDetectorTool {
}

ForkJoinPool pool = new ForkJoinPool(16);
plan.forEachCrawledDomain(data -> pool.execute(() -> processDomain(data)));

try (var iterable = plan.domainsIterable()) {
for (var domain : iterable) {
pool.execute(() -> processDomain(domain));
}
}

while (!pool.awaitQuiescence(1, TimeUnit.HOURS));
}
@ -74,9 +82,20 @@ public class RecipeDetectorTool {
parsedDocument.getElementsByTag("nav").remove();

DocumentLanguageData dld = sentenceExtractor.extractSentences(parsedDocument);
double prob = 100*detector.recipeP(dld);

double prob = 100*recipeDetector.testP(dld);
if (prob > 50) {
System.out.printf("%3.2f\t%s\n", prob, doc.url);
System.out.printf("#%3.2f recipe\t%s\n%s\n", prob, parsedDocument.title(), doc.url);
}

prob = 100*woodworkingDetector.testP(dld);
if (prob > 20) {
System.out.printf("#%3.2f woodworking\t%s\n%s\n", prob, parsedDocument.title(), doc.url);
}

prob = 100*textileCraftDetector.testP(dld);
if (prob > 20) {
System.out.printf("#%3.2f textilecraft\t%s\n%s\n", prob, parsedDocument.title(), doc.url);
}
}
}