Add advertisement feature to search,
add adblock simulation to processor, add filename and email address extraction to processor.
This commit is contained in:
parent
0e28ff5a72
commit
30d2a707ff
@ -46,6 +46,10 @@ public class WmsaHome {
|
||||
}
|
||||
}
|
||||
|
||||
public static Path getAdsDefinition() {
|
||||
return getHomePath().resolve("data").resolve("adblock.txt");
|
||||
}
|
||||
|
||||
public static Path getIPLocationDatabse() {
|
||||
return getHomePath().resolve("data").resolve("IP2LOCATION-LITE-DB1.CSV");
|
||||
}
|
||||
@ -90,4 +94,5 @@ public class WmsaHome {
|
||||
home.resolve("model/English.DICT"),
|
||||
home.resolve("model/opennlp-tok.bin"));
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -16,6 +16,7 @@ import nu.marginalia.wmsa.edge.crawling.model.CrawledDocument;
|
||||
import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain;
|
||||
import nu.marginalia.wmsa.edge.crawling.model.CrawlerDocumentStatus;
|
||||
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeDomain;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeUrl;
|
||||
import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard;
|
||||
import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet;
|
||||
@ -26,6 +27,7 @@ import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.net.URISyntaxException;
|
||||
import java.nio.file.Path;
|
||||
import java.util.*;
|
||||
|
||||
import static nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard.UNKNOWN;
|
||||
@ -199,8 +201,19 @@ public class DocumentProcessor {
|
||||
|
||||
baseUrl = linkParser.getBaseLink(doc, baseUrl);
|
||||
|
||||
EdgeDomain domain = baseUrl.domain;
|
||||
|
||||
for (var atag : doc.getElementsByTag("a")) {
|
||||
linkParser.parseLink(baseUrl, atag).ifPresent(lp::accept);
|
||||
var linkOpt = linkParser.parseLinkPermissive(baseUrl, atag);
|
||||
if (linkParser.shouldIndexLink(atag)) {
|
||||
linkOpt.ifPresent(lp::accept);
|
||||
}
|
||||
else if (linkOpt.isPresent()) {
|
||||
if (linkParser.hasBinarySuffix(linkOpt.get().toString())) {
|
||||
linkOpt.ifPresent(lp::acceptNonIndexable);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
for (var frame : doc.getElementsByTag("frame")) {
|
||||
linkParser.parseFrame(baseUrl, frame).ifPresent(lp::accept);
|
||||
@ -216,13 +229,44 @@ public class DocumentProcessor {
|
||||
|
||||
final Set<String> linkTerms = new HashSet<>();
|
||||
|
||||
for (var domain : lp.getForeignDomains()) {
|
||||
linkTerms.add("links:"+domain.toString().toLowerCase());
|
||||
linkTerms.add("links:"+domain.getDomain().toLowerCase());
|
||||
for (var fd : lp.getForeignDomains()) {
|
||||
linkTerms.add("links:"+fd.toString().toLowerCase());
|
||||
linkTerms.add("links:"+fd.getDomain().toLowerCase());
|
||||
}
|
||||
|
||||
words.append(IndexBlock.Meta, linkTerms);
|
||||
|
||||
Set<String> fileKeywords = new HashSet<>(100);
|
||||
for (var link : lp.getNonIndexableUrls()) {
|
||||
|
||||
if (!Objects.equals(domain, link.domain)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
synthesizeFilenameKeyword(fileKeywords, link);
|
||||
|
||||
}
|
||||
|
||||
words.append(IndexBlock.Artifacts, fileKeywords);
|
||||
}
|
||||
|
||||
private void synthesizeFilenameKeyword(Set<String> fileKeywords, EdgeUrl link) {
|
||||
|
||||
|
||||
Path pFilename = Path.of(link.path.toLowerCase()).getFileName();
|
||||
|
||||
if (pFilename == null) return;
|
||||
|
||||
String filename = pFilename.toString();
|
||||
if (filename.length() > 32
|
||||
|| filename.endsWith(".xml")
|
||||
|| filename.endsWith(".jpg")
|
||||
|| filename.endsWith(".png")
|
||||
|| filename.endsWith(".pdf")
|
||||
|| filename.endsWith(".gif"))
|
||||
return;
|
||||
|
||||
fileKeywords.add(filename.replace(' ', '_'));
|
||||
}
|
||||
|
||||
private void checkDocumentLanguage(DocumentLanguageData dld) throws DisqualifiedException {
|
||||
|
@ -1,5 +1,8 @@
|
||||
package nu.marginalia.wmsa.edge.converting.processor.logic;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import nu.marginalia.wmsa.edge.converting.processor.logic.topic.AdblockSimulator;
|
||||
import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain;
|
||||
import org.jsoup.nodes.Document;
|
||||
|
||||
@ -7,6 +10,7 @@ import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
@Singleton
|
||||
public class FeatureExtractor {
|
||||
|
||||
private static final List<String> trackers = List.of("adform.net",
|
||||
@ -29,6 +33,13 @@ public class FeatureExtractor {
|
||||
"d31qbv1cthcecs.cloudfront.net",
|
||||
"linkedin.com");
|
||||
|
||||
private AdblockSimulator adblockSimulator;
|
||||
|
||||
/**
 * Creates a feature extractor backed by the given adblock simulator.
 *
 * @param adblockSimulator simulator stored in the {@code adblockSimulator} field
 */
@Inject
|
||||
public FeatureExtractor(AdblockSimulator adblockSimulator) {
|
||||
this.adblockSimulator = adblockSimulator;
|
||||
}
|
||||
|
||||
public Set<HtmlFeature> getFeatures(CrawledDomain domain, Document doc) {
|
||||
Set<HtmlFeature> features = new HashSet<>();
|
||||
|
||||
@ -37,6 +48,9 @@ public class FeatureExtractor {
|
||||
if (scriptTags.size() > 0) {
    features.add(HtmlFeature.JS);

    // Only look for ads if there is javascript. This check must be nested inside the
    // script-tag branch; as an "else if" it would run only for documents without scripts.
    // The simulator is handed a clone of the document.
    if (adblockSimulator.hasAds(doc.clone())) {
        features.add(HtmlFeature.ADVERTISEMENT);
    }
}
|
||||
|
||||
if (!doc.getElementsByTag("object").isEmpty()
|
||||
|| !doc.getElementsByTag("audio").isEmpty()
|
||||
@ -56,7 +70,7 @@ public class FeatureExtractor {
|
||||
if (doc.getElementsByTag("a").stream().map(e -> e.attr("href"))
|
||||
.map(String::toLowerCase)
|
||||
.anyMatch(href ->
|
||||
href.contains("amzn.to/") || href.contains("amazon.com/"))) {
|
||||
href.contains("amzn.to/") || (href.contains("amazon.com/") & href.contains("tag=")))) {
|
||||
features.add(HtmlFeature.AFFILIATE_LINK);
|
||||
}
|
||||
|
||||
|
@ -10,6 +10,8 @@ public enum HtmlFeature {
|
||||
COOKIES("special:cookies"),
|
||||
|
||||
CATEGORY_FOOD("category:food"),
|
||||
|
||||
ADVERTISEMENT("special:ads"),
|
||||
;
|
||||
|
||||
private final String keyword;
|
||||
|
@ -40,6 +40,17 @@ public class LinkParser {
|
||||
.flatMap(this::createEdgeUrl);
|
||||
}
|
||||
|
||||
@Contract(pure=true)
|
||||
public Optional<EdgeUrl> parseLinkPermissive(EdgeUrl relativeBaseUrl, Element l) {
|
||||
return Optional.of(l)
|
||||
.map(this::getUrl)
|
||||
.map(link -> resolveUrl(relativeBaseUrl, link))
|
||||
.flatMap(this::createURI)
|
||||
.map(URI::normalize)
|
||||
.map(this::renormalize)
|
||||
.flatMap(this::createEdgeUrl);
|
||||
}
|
||||
|
||||
private Optional<URI> createURI(String s) {
|
||||
try {
|
||||
return Optional.of(new URI(s));
|
||||
@ -146,17 +157,20 @@ public class LinkParser {
|
||||
return s.matches("^[a-zA-Z]+:.*$");
|
||||
}
|
||||
|
||||
private boolean shouldIndexLink(Element link) {
|
||||
public boolean shouldIndexLink(Element link) {
|
||||
return isUrlRelevant(link.attr("href"))
|
||||
&& isRelRelevant(link.attr("rel"));
|
||||
|
||||
}
|
||||
|
||||
private boolean isRelRelevant(String rel) {
|
||||
public boolean isRelRelevant(String rel) {
|
||||
// this is null safe
|
||||
return !"noindex".equalsIgnoreCase(rel);
|
||||
}
|
||||
|
||||
public boolean hasBinarySuffix(String href) {
|
||||
return blockSuffixList.stream().anyMatch(href::endsWith);
|
||||
}
|
||||
|
||||
private boolean isUrlRelevant(String href) {
|
||||
if (null == href || "".equals(href)) {
|
||||
return false;
|
||||
@ -164,7 +178,7 @@ public class LinkParser {
|
||||
if (blockPrefixList.stream().anyMatch(href::startsWith)) {
|
||||
return false;
|
||||
}
|
||||
if (blockSuffixList.stream().anyMatch(href::endsWith)) {
|
||||
if (hasBinarySuffix(href)) {
|
||||
return false;
|
||||
}
|
||||
if (href.length() > 128) {
|
||||
|
@ -13,6 +13,9 @@ import java.util.Set;
|
||||
public class LinkProcessor {
|
||||
private final ProcessedDocumentDetails ret;
|
||||
private final EdgeUrl baseUrl;
|
||||
|
||||
private final Set<EdgeUrl> nonIndexable = new HashSet<>();
|
||||
|
||||
private final Set<EdgeUrl> seenUrls = new HashSet<>();
|
||||
private final Set<EdgeDomain> foreignDomains = new HashSet<>();
|
||||
|
||||
@ -32,6 +35,10 @@ public class LinkProcessor {
|
||||
public Set<EdgeDomain> getForeignDomains() {
|
||||
return foreignDomains;
|
||||
}
|
||||
|
||||
/**
 * Returns the URLs collected through {@code acceptNonIndexable}.
 * The backing set itself is returned, not a copy.
 */
public Set<EdgeUrl> getNonIndexableUrls() {
|
||||
return nonIndexable;
|
||||
}
|
||||
|
||||
public void accept(EdgeUrl link) {
|
||||
if (!isLinkPermitted(link)) {
|
||||
@ -87,4 +94,8 @@ public class LinkProcessor {
|
||||
return proto.equalsIgnoreCase("http")
|
||||
|| proto.equalsIgnoreCase("https");
|
||||
}
|
||||
|
||||
/**
 * Records a URL in the non-indexable set. No permission or protocol check is applied here.
 *
 * @param edgeUrl URL to remember
 */
public void acceptNonIndexable(EdgeUrl edgeUrl) {
|
||||
nonIndexable.add(edgeUrl);
|
||||
}
|
||||
}
|
||||
|
@ -28,9 +28,9 @@ public class QueryParams {
|
||||
|
||||
public static boolean isPermittedParam(String path, String param) {
|
||||
if (path.endsWith("index.php")) {
|
||||
if (param.startsWith("showtopic"))
|
||||
if (param.startsWith("showtopic="))
|
||||
return true;
|
||||
if (param.startsWith("showforum"))
|
||||
if (param.startsWith("showforum="))
|
||||
return true;
|
||||
}
|
||||
if (path.endsWith("viewtopic.php")) {
|
||||
@ -45,6 +45,10 @@ public class QueryParams {
|
||||
if (path.endsWith("showforum.php")) {
|
||||
return param.startsWith("v=");
|
||||
}
|
||||
|
||||
if (path.endsWith("StoryView.py")) { // folklore.org is neat
|
||||
return param.startsWith("project=") || param.startsWith("story=");
|
||||
}
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
@ -1,133 +1,181 @@
|
||||
package nu.marginalia.wmsa.edge.converting.processor.logic.topic;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import nu.marginalia.wmsa.configuration.WmsaHome;
|
||||
import org.jsoup.nodes.Document;
|
||||
import org.jsoup.nodes.Element;
|
||||
import org.jsoup.nodes.Node;
|
||||
import org.jsoup.select.NodeFilter;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.function.Predicate;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
@Singleton
|
||||
public class AdblockSimulator {
|
||||
private final Set<String> idRules = new HashSet<>();
|
||||
|
||||
List<String> idRules = new ArrayList();
|
||||
List<String> classRules = new ArrayList();
|
||||
List<Predicate<String>> scriptRules = new ArrayList();
|
||||
private final Set<String> classRules = new HashSet<>();
|
||||
private final List<Predicate<String>> scriptRules = new ArrayList<>();
|
||||
|
||||
public AdblockSimulator(Path adsDefinition) throws IOException {
|
||||
try (var lineStream = Files.lines(adsDefinition)) {
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
|
||||
@Inject
|
||||
public AdblockSimulator() throws IOException {
|
||||
Path adDef = WmsaHome.getAdsDefinition();
|
||||
|
||||
if (!Files.exists(adDef)) {
|
||||
logger.error("Can not find ads definition file in {}", adDef);
|
||||
return;
|
||||
}
|
||||
|
||||
try (var lineStream = Files.lines(adDef)) {
|
||||
lineStream.skip(1).forEach(this::addRule);
|
||||
}
|
||||
}
|
||||
|
||||
private void addRule(String s) {
|
||||
if (s.startsWith("##") && !s.contains(":")) {
|
||||
if (s.startsWith("###")) {
|
||||
idRules.add(s.substring(3));
|
||||
} else if(s.startsWith("##.")) {
|
||||
classRules.add(s.substring(3));
|
||||
}
|
||||
}
|
||||
else if (!s.startsWith("!") && !s.contains("#")){
|
||||
scriptRules.add(toRegexMatcher(s));
|
||||
}
|
||||
}
|
||||
|
||||
private Predicate<String> toRegexMatcher(String s) {
|
||||
|
||||
System.out.println("<-" + s);
|
||||
|
||||
s = s.replaceAll("\\?", "\\\\?");
|
||||
s = s.replaceAll("\\.", "\\\\.");
|
||||
s = s.replaceAll("\\$", "\\\\\\$");
|
||||
|
||||
if (s.startsWith("||")) {
|
||||
s = s.replaceFirst("\\|\\|","^http(s)?://");
|
||||
}
|
||||
|
||||
s = s.replaceAll("\\|", "\\\\|");
|
||||
s = s.replaceAll("\\*", ".*");
|
||||
s = s.replaceAll("\\^", "[?/]");
|
||||
|
||||
|
||||
System.out.println("->" + s);
|
||||
return Pattern.compile(s).asPredicate();
|
||||
}
|
||||
|
||||
class RuleVisitor implements NodeFilter {
|
||||
public boolean sawAds;
|
||||
Pattern spPattern = Pattern.compile("\\s");
|
||||
|
||||
@Override
|
||||
public FilterResult head(Node node, int depth) {
|
||||
|
||||
if (node.attributesSize() > 0 && node instanceof Element elem) { // instanceof is slow
|
||||
|
||||
String id = elem.id();
|
||||
for (var rule : idRules) {
|
||||
if (rule.equals(id)) {
|
||||
sawAds = true;
|
||||
return FilterResult.STOP;
|
||||
}
|
||||
}
|
||||
|
||||
String classes = elem.className();
|
||||
if (classes.isBlank()) return FilterResult.CONTINUE;
|
||||
|
||||
if (classes.indexOf(' ') > 0) {
|
||||
String[] classNames = spPattern.split(classes);
|
||||
for (var rule : classRules) {
|
||||
|
||||
for (var className : classNames) {
|
||||
if (className.equals(rule)) {
|
||||
sawAds = true;
|
||||
return FilterResult.STOP;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
else { // tag only has one class
|
||||
for (var rule : classRules) {
|
||||
if (classes.equals(rule)) {
|
||||
sawAds = true;
|
||||
return FilterResult.STOP;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if ("script".equals(elem.tagName())) {
|
||||
String src = elem.attr("src");
|
||||
|
||||
for (var rule : scriptRules) {
|
||||
if (rule.test(src)) {
|
||||
sawAds = true;
|
||||
return FilterResult.STOP;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return FilterResult.CONTINUE;
|
||||
}
|
||||
return FilterResult.CONTINUE;
|
||||
}
|
||||
|
||||
@Override
|
||||
public FilterResult tail(Node node, int depth) {
|
||||
return FilterResult.CONTINUE;
|
||||
}
|
||||
}
|
||||
|
||||
public boolean hasAds(Document document) {
|
||||
|
||||
RuleVisitor ruleVisitor = new RuleVisitor();
|
||||
|
||||
document.filter(ruleVisitor);
|
||||
|
||||
return ruleVisitor.sawAds;
|
||||
}
|
||||
|
||||
private void addRule(String s) {
|
||||
try {
|
||||
if (s.startsWith("##") && !s.contains(":")) {
|
||||
if (s.startsWith("###")) {
|
||||
idRules.add(s.substring(3));
|
||||
} else if (s.startsWith("##.")) {
|
||||
classRules.add(s.substring(3));
|
||||
}
|
||||
} else if (s.startsWith("/^")) {
|
||||
int end = s.indexOf("[^\\]/");
|
||||
if (end >= 0) {
|
||||
String patternString = s.substring(1, end+1);
|
||||
scriptRules.add(Pattern.compile(patternString).asPredicate());
|
||||
}
|
||||
} else if (!s.startsWith("!") && !s.contains("#") && !s.startsWith("@@")) {
|
||||
if (!s.contains("$")) {
|
||||
scriptRules.add(toRegexMatcher(s));
|
||||
}
|
||||
else if (s.contains("$script") && !s.contains("domain=")) {
|
||||
scriptRules.add(toRegexMatcher(s.substring(0, s.indexOf('$'))));
|
||||
}
|
||||
}
|
||||
}
|
||||
catch (Exception ex) {
|
||||
System.err.println("Failed to add rule " + s);
|
||||
}
|
||||
}
|
||||
|
||||
private Predicate<String> toRegexMatcher(String s) {
|
||||
String sOriginal = s;
|
||||
if (s.isBlank()) return unused -> false;
|
||||
|
||||
// In some cases, regexes aren't necessary
|
||||
if (s.matches("[&?=/A-Za-z0-9._-]+")) {
|
||||
if (s.startsWith("/")) {
|
||||
return str -> str.equals(sOriginal);
|
||||
}
|
||||
else {
|
||||
return str -> str.contains(sOriginal);
|
||||
}
|
||||
}
|
||||
if (s.matches("[&?=/A-Za-z0-9._-]+\\*")) {
|
||||
return str -> str.startsWith(sOriginal.substring(0, sOriginal.length()-1));
|
||||
}
|
||||
|
||||
String s0 = s;
|
||||
s = s.replaceAll("\\?", "\\\\?");
|
||||
s = s.replaceAll("\\.", "\\\\.");
|
||||
|
||||
s = s.replaceAll("\\^", "[?/]");
|
||||
s = s.replaceAll("\\*", ".*");
|
||||
|
||||
if (s.startsWith("||")) {
|
||||
s = s.replaceFirst("\\|\\|","^http[s]?://.*");
|
||||
}
|
||||
|
||||
s = s.replaceAll("\\|", "\\\\|");
|
||||
return Pattern.compile(s).asPredicate();
|
||||
}
|
||||
|
||||
|
||||
// Refrain from cleaning up this code, it's very hot code and needs to be fast.
|
||||
// This version is about 100x faster than a "clean" first-stab implementation.
|
||||
|
||||
/**
 * Node filter that checks elements against the enclosing simulator's id, class and
 * script rules. On the first match it sets {@link #sawAds} and stops the traversal.
 */
class RuleVisitor implements NodeFilter {
|
||||
// Set to true once any element has matched a rule
public boolean sawAds;
|
||||
|
||||
// Splits a class attribute on single whitespace characters
Pattern spPattern = Pattern.compile("\\s");
|
||||
|
||||
/**
 * Tests the node when it is first encountered. Only elements that carry at least one
 * attribute are examined; a match sets {@code sawAds} and returns STOP.
 */
@Override
|
||||
public FilterResult head(Node node, int depth) {
|
||||
|
||||
if (node.attributesSize() > 0 && node instanceof Element elem) {
|
||||
if (testId(elem) || testClass(elem) || testScriptTags(elem)) {
|
||||
sawAds = true;
|
||||
return FilterResult.STOP;
|
||||
}
|
||||
}
|
||||
return FilterResult.CONTINUE;
|
||||
}
|
||||
|
||||
/**
 * For {@code script} elements, tests the {@code src} attribute against every script rule.
 * Any other element yields false.
 */
private boolean testScriptTags(Element elem) {
|
||||
if (!"script".equals(elem.tagName())) {
|
||||
return false;
|
||||
}
|
||||
|
||||
String src = elem.attr("src");
|
||||
for (var rule : scriptRules) {
|
||||
if (rule.test(src)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
/** Checks whether the element's id is one of the id rules. */
private boolean testId(Element elem) {
|
||||
String id = elem.id();
|
||||
|
||||
return idRules.contains(id);
|
||||
}
|
||||
|
||||
/**
 * Checks whether any class name of the element is one of the class rules.
 * A blank class attribute yields false.
 */
private boolean testClass(Element elem) {
|
||||
String classes = elem.className();
|
||||
if (classes.isBlank())
|
||||
return false;
|
||||
|
||||
// A space past the first character means several class names; test each one
if (classes.indexOf(' ') > 0) {
|
||||
String[] classNames = spPattern.split(classes);
|
||||
for (var className : classNames) {
|
||||
if (classRules.contains(className))
|
||||
return true;
|
||||
}
|
||||
}
|
||||
else { // tag only has one class, no need to split
|
||||
return classRules.contains(classes);
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
/** Nothing is done when leaving a node; the traversal simply continues. */
@Override
|
||||
public FilterResult tail(Node node, int depth) {
|
||||
return FilterResult.CONTINUE;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -5,17 +5,17 @@ import com.google.gson.Gson;
|
||||
import com.google.gson.GsonBuilder;
|
||||
import nu.marginalia.wmsa.edge.crawling.model.CrawledDocument;
|
||||
import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.*;
|
||||
import java.io.BufferedReader;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStreamReader;
|
||||
import java.nio.file.Path;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
public class CrawledDomainReader {
|
||||
private final Gson gson = new GsonBuilder().create();
|
||||
private static final Logger logger = LoggerFactory.getLogger(CrawledDomainReader.class);
|
||||
|
||||
public CrawledDomainReader() {
|
||||
}
|
||||
@ -23,29 +23,45 @@ public class CrawledDomainReader {
|
||||
public CrawledDomain read(Path path) throws IOException {
|
||||
List<CrawledDocument> docs = new ArrayList<>();
|
||||
CrawledDomain domain = null;
|
||||
try (var br = new BufferedReader(new InputStreamReader(new ZstdInputStream(new BufferedInputStream(new FileInputStream(path.toFile())))))) {
|
||||
String line;
|
||||
while ((line = br.readLine()) != null) {
|
||||
if (line.startsWith("//")) {
|
||||
String nextLine = br.readLine();
|
||||
if (nextLine == null) break;
|
||||
|
||||
if (line.equals(CrawledDomain.SERIAL_IDENTIFIER)) {
|
||||
domain = gson.fromJson(nextLine, CrawledDomain.class);
|
||||
} else if (line.equals(CrawledDocument.SERIAL_IDENTIFIER)) {
|
||||
docs.add(gson.fromJson(nextLine, CrawledDocument.class));
|
||||
|
||||
try (var br = new BufferedReader(new InputStreamReader(new ZstdInputStream(new FileInputStream(path.toFile()))))) {
|
||||
br.mark(2);
|
||||
boolean legacy = '{' == br.read();
|
||||
br.reset();
|
||||
|
||||
if (legacy) {
|
||||
domain = gson.fromJson(br, CrawledDomain.class);
|
||||
}
|
||||
else {
|
||||
String line;
|
||||
while ((line = br.readLine()) != null) {
|
||||
if (line.startsWith("//")) {
|
||||
String nextLine = br.readLine();
|
||||
if (nextLine == null) break;
|
||||
|
||||
if (line.equals(CrawledDomain.SERIAL_IDENTIFIER)) {
|
||||
domain = gson.fromJson(nextLine, CrawledDomain.class);
|
||||
} else if (line.equals(CrawledDocument.SERIAL_IDENTIFIER)) {
|
||||
docs.add(gson.fromJson(nextLine, CrawledDocument.class));
|
||||
}
|
||||
} else if (line.charAt(0) == '{') {
|
||||
domain = gson.fromJson(line, CrawledDomain.class);
|
||||
}
|
||||
}
|
||||
else if (line.charAt(0) == '{') {
|
||||
domain = gson.fromJson(line, CrawledDomain.class);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (domain == null) {
|
||||
return null;
|
||||
}
|
||||
domain.doc.addAll(docs);
|
||||
|
||||
if (!docs.isEmpty()) {
|
||||
if (domain.doc == null)
|
||||
domain.doc = new ArrayList<>();
|
||||
|
||||
domain.doc.addAll(docs);
|
||||
}
|
||||
return domain;
|
||||
}
|
||||
|
||||
|
@ -11,7 +11,7 @@ public enum IndexBlock {
|
||||
Meta(7, 7),
|
||||
PositionWords(8, 4.5),
|
||||
NamesWords(9, 5),
|
||||
Unused(10, 10),
|
||||
Artifacts(10, 10),
|
||||
Topic(11, 0.5);
|
||||
|
||||
public final int id;
|
||||
|
@ -132,6 +132,8 @@ public class EdgeUrlDetails {
|
||||
public boolean isCookies() {
|
||||
return HtmlFeature.hasFeature(features, HtmlFeature.COOKIES);
|
||||
}
|
||||
/** Reports whether the {@code ADVERTISEMENT} feature is present in {@code features}. */
public boolean isAds() { return HtmlFeature.hasFeature(features, HtmlFeature.ADVERTISEMENT); }
|
||||
|
||||
public boolean isSpecialDomain() {
|
||||
return domainState == EdgeDomainIndexingState.SPECIAL;
|
||||
}
|
||||
|
@ -19,7 +19,7 @@ public class AdblockTesterTool {
|
||||
|
||||
static {
|
||||
try {
|
||||
simulator = new AdblockSimulator(Path.of("/home/vlofgren/easylist.txt"));
|
||||
simulator = new AdblockSimulator();
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
@ -29,7 +29,6 @@ public class AdblockTesterTool {
|
||||
public static void main(String... args) throws IOException {
|
||||
EdgeCrawlPlan plan = new CrawlPlanLoader().load(Path.of(args[0]));
|
||||
|
||||
|
||||
try (var iterable = plan.domainsIterable()) {
|
||||
for (var domain : iterable) {
|
||||
processDomain(domain);
|
||||
|
@ -0,0 +1,56 @@
|
||||
package nu.marginalia.wmsa.edge.tools;
|
||||
|
||||
import com.google.inject.Guice;
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Injector;
|
||||
import nu.marginalia.wmsa.edge.converting.ConverterModule;
|
||||
import nu.marginalia.wmsa.edge.converting.processor.DomainProcessor;
|
||||
import nu.marginalia.wmsa.edge.crawling.CrawlPlanLoader;
|
||||
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeCrawlPlan;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Path;
|
||||
|
||||
public class ConverterLogicTestTool {
|
||||
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
|
||||
public static void main(String... args) throws IOException {
|
||||
|
||||
if (args.length != 1) {
|
||||
System.err.println("Arguments: crawl-plan.yaml");
|
||||
System.exit(0);
|
||||
}
|
||||
var plan = new CrawlPlanLoader().load(Path.of(args[0]));
|
||||
|
||||
Injector injector = Guice.createInjector(
|
||||
new ConverterModule(plan)
|
||||
);
|
||||
|
||||
injector.getInstance(ConverterLogicTestTool.class);
|
||||
}
|
||||
|
||||
@Inject
|
||||
public ConverterLogicTestTool(
|
||||
EdgeCrawlPlan plan,
|
||||
DomainProcessor processor
|
||||
) throws Exception {
|
||||
|
||||
plan.forEachCrawledDomain(domain -> {
|
||||
var ret = processor.process(domain);
|
||||
ret.documents.forEach(doc -> {
|
||||
if (doc.words == null)
|
||||
return;
|
||||
var artifacts = doc.words.get(IndexBlock.Artifacts);
|
||||
if (artifacts.size() > 0) {
|
||||
System.out.println(doc.url + ": " + artifacts);
|
||||
}
|
||||
});
|
||||
});
|
||||
|
||||
}
|
||||
|
||||
}
|
@ -1,13 +1,8 @@
|
||||
package nu.marginalia.wmsa.edge.tools;
|
||||
|
||||
import nu.marginalia.util.language.conf.LanguageModels;
|
||||
import nu.marginalia.util.language.processing.SentenceExtractor;
|
||||
import nu.marginalia.util.language.processing.model.DocumentLanguageData;
|
||||
import nu.marginalia.wmsa.configuration.WmsaHome;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
|
||||
import nu.marginalia.wmsa.edge.converting.processor.logic.topic.RecipeDetector;
|
||||
import nu.marginalia.wmsa.edge.converting.processor.logic.topic.TextileCraftDetector;
|
||||
import nu.marginalia.wmsa.edge.converting.processor.logic.topic.WoodworkingDetector;
|
||||
import nu.marginalia.wmsa.edge.converting.processor.logic.topic.AdblockSimulator;
|
||||
import nu.marginalia.wmsa.edge.crawling.CrawlPlanLoader;
|
||||
import nu.marginalia.wmsa.edge.crawling.model.CrawledDocument;
|
||||
import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain;
|
||||
@ -20,21 +15,24 @@ import java.nio.file.Path;
|
||||
import java.sql.SQLException;
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
import java.util.concurrent.ForkJoinPool;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import java.util.concurrent.*;
|
||||
|
||||
import static nu.marginalia.wmsa.edge.converting.processor.DocumentProcessor.isAcceptedContentType;
|
||||
|
||||
public class RecipeDetectorTool {
|
||||
private static final TextileCraftDetector textileCraftDetector = new TextileCraftDetector();
|
||||
private static final WoodworkingDetector woodworkingDetector = new WoodworkingDetector();
|
||||
private static final RecipeDetector recipeDetector = new RecipeDetector();
|
||||
public class CrawlDataExtractorTool {
|
||||
private static final AdblockSimulator abs;
|
||||
|
||||
private static final LanguageModels lm = WmsaHome.getLanguageModels();
|
||||
private static final SentenceExtractor sentenceExtractor = new SentenceExtractor(lm);
|
||||
static {
|
||||
try {
|
||||
abs = new AdblockSimulator();
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
private static final Set<String> urls = new HashSet<>(50_000_000);
|
||||
|
||||
@SneakyThrows
|
||||
public static void main(String... args) throws IOException {
|
||||
EdgeCrawlPlan plan = new CrawlPlanLoader().load(Path.of(args[0]));
|
||||
DatabaseModule module = new DatabaseModule();
|
||||
@ -51,15 +49,25 @@ public class RecipeDetectorTool {
|
||||
ex.printStackTrace();
|
||||
}
|
||||
|
||||
ForkJoinPool pool = new ForkJoinPool(16);
|
||||
LinkedBlockingQueue<Runnable> queue = new LinkedBlockingQueue<>(10);
|
||||
ExecutorService pool = new ThreadPoolExecutor(10, 20, 5, TimeUnit.MINUTES, queue);
|
||||
Semaphore sem = new Semaphore(20);
|
||||
|
||||
try (var iterable = plan.domainsIterable()) {
|
||||
for (var domain : iterable) {
|
||||
pool.execute(() -> processDomain(domain));
|
||||
sem.acquire();
|
||||
pool.execute(() -> {
|
||||
try { processDomain(domain); }
|
||||
finally { sem.release(); }
|
||||
});
|
||||
}
|
||||
} catch (InterruptedException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
|
||||
while (!pool.awaitQuiescence(1, TimeUnit.HOURS));
|
||||
pool.shutdown();
|
||||
|
||||
while (!pool.awaitTermination(1, TimeUnit.MINUTES));
|
||||
}
|
||||
|
||||
private static void processDomain(CrawledDomain domain) {
|
||||
@ -78,24 +86,8 @@ public class RecipeDetectorTool {
|
||||
private static void processDocument(CrawledDocument doc) {
|
||||
Document parsedDocument = Jsoup.parse(doc.documentBody);
|
||||
|
||||
parsedDocument.getElementsByTag("a").remove();
|
||||
parsedDocument.getElementsByTag("nav").remove();
|
||||
|
||||
DocumentLanguageData dld = sentenceExtractor.extractSentences(parsedDocument);
|
||||
|
||||
double prob = 100*recipeDetector.testP(dld);
|
||||
if (prob > 50) {
|
||||
System.out.printf("#%3.2f recipe\t%s\n%s\n", prob, parsedDocument.title(), doc.url);
|
||||
}
|
||||
|
||||
prob = 100*woodworkingDetector.testP(dld);
|
||||
if (prob > 20) {
|
||||
System.out.printf("#%3.2f woodworking\t%s\n%s\n", prob, parsedDocument.title(), doc.url);
|
||||
}
|
||||
|
||||
prob = 100*textileCraftDetector.testP(dld);
|
||||
if (prob > 20) {
|
||||
System.out.printf("#%3.2f textilecraft\t%s\n%s\n", prob, parsedDocument.title(), doc.url);
|
||||
if (abs.hasAds(parsedDocument)) {
|
||||
System.out.println(doc.url);
|
||||
}
|
||||
}
|
||||
}
|
@ -3,6 +3,7 @@ package nu.marginalia.wmsa.edge.tools;
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
|
||||
import nu.marginalia.wmsa.configuration.server.Context;
|
||||
import nu.marginalia.wmsa.edge.converting.processor.logic.HtmlFeature;
|
||||
import nu.marginalia.wmsa.edge.index.client.EdgeIndexClient;
|
||||
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeId;
|
||||
@ -20,19 +21,20 @@ import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Objects;
|
||||
|
||||
import static nu.marginalia.wmsa.edge.converting.processor.logic.HtmlFeature.CATEGORY_FOOD;
|
||||
|
||||
public class RecipesLoaderTool {
|
||||
public class FeaturesLoaderTool {
|
||||
public static void main(String... args) {
|
||||
|
||||
HtmlFeature feature = HtmlFeature.valueOf(args[0]);
|
||||
Path file = Path.of(args[1]);
|
||||
|
||||
try (EdgeIndexClient client = new EdgeIndexClient();
|
||||
HikariDataSource ds = new DatabaseModule().provideConnection();
|
||||
Connection conn = ds.getConnection();
|
||||
PreparedStatement ps = conn.prepareStatement("UPDATE EC_PAGE_DATA SET FEATURES = FEATURES | ? WHERE ID=?");
|
||||
var linesStream = Files.lines(Path.of(args[0]))) {
|
||||
var linesStream = Files.lines(file)) {
|
||||
|
||||
var urls = getUrls(ds);
|
||||
var wordSet = new EdgePageWordSet(new EdgePageWords(IndexBlock.Meta, List.of(CATEGORY_FOOD.getKeyword())));
|
||||
var wordSet = new EdgePageWordSet(new EdgePageWords(IndexBlock.Meta, List.of(feature.getKeyword())));
|
||||
linesStream
|
||||
.map(urls::get)
|
||||
.filter(Objects::nonNull)
|
||||
@ -42,7 +44,7 @@ public class RecipesLoaderTool {
|
||||
|
||||
try {
|
||||
ps.setInt(2, urlId);
|
||||
ps.setInt(1, CATEGORY_FOOD.getFeatureBit());
|
||||
ps.setInt(1, feature.getFeatureBit());
|
||||
ps.executeUpdate();
|
||||
}
|
||||
catch (SQLException ex) {
|
@ -3,6 +3,7 @@
|
||||
{{#if media}}<abbr title="audio or video" class="meta">🎞️</abbr>{{/if}}
|
||||
{{#if affiliate}}<abbr title="possible amazon affiliate link (experimental; unreliable)" class="meta">💳️</abbr>{{/if}}
|
||||
{{#if cookies}}<abbr title="cookies" class="meta">👁️️</abbr>{{/if}}
|
||||
{{#if ads}}<abbr title="ads (experimental)" class="meta">⚠️️️</abbr>{{/if}}
|
||||
<span class="meta">{{format}}</span>
|
||||
{{#unless focusDomain}}
|
||||
<span class="rank-symbol" title="{{rankingSymbolDesc}}">{{{rankingSymbol}}}</span>
|
||||
|
Loading…
Reference in New Issue
Block a user