Add advertisement feature to search,
add adblock simulation to processor, add filename and email address extraction to processor.
This commit is contained in:
parent
0e28ff5a72
commit
30d2a707ff
@ -46,6 +46,10 @@ public class WmsaHome {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static Path getAdsDefinition() {
|
||||||
|
return getHomePath().resolve("data").resolve("adblock.txt");
|
||||||
|
}
|
||||||
|
|
||||||
public static Path getIPLocationDatabse() {
|
public static Path getIPLocationDatabse() {
|
||||||
return getHomePath().resolve("data").resolve("IP2LOCATION-LITE-DB1.CSV");
|
return getHomePath().resolve("data").resolve("IP2LOCATION-LITE-DB1.CSV");
|
||||||
}
|
}
|
||||||
@ -90,4 +94,5 @@ public class WmsaHome {
|
|||||||
home.resolve("model/English.DICT"),
|
home.resolve("model/English.DICT"),
|
||||||
home.resolve("model/opennlp-tok.bin"));
|
home.resolve("model/opennlp-tok.bin"));
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -16,6 +16,7 @@ import nu.marginalia.wmsa.edge.crawling.model.CrawledDocument;
|
|||||||
import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain;
|
import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain;
|
||||||
import nu.marginalia.wmsa.edge.crawling.model.CrawlerDocumentStatus;
|
import nu.marginalia.wmsa.edge.crawling.model.CrawlerDocumentStatus;
|
||||||
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
|
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
|
||||||
|
import nu.marginalia.wmsa.edge.model.EdgeDomain;
|
||||||
import nu.marginalia.wmsa.edge.model.EdgeUrl;
|
import nu.marginalia.wmsa.edge.model.EdgeUrl;
|
||||||
import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard;
|
import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard;
|
||||||
import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet;
|
import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet;
|
||||||
@ -26,6 +27,7 @@ import org.slf4j.Logger;
|
|||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
import java.net.URISyntaxException;
|
import java.net.URISyntaxException;
|
||||||
|
import java.nio.file.Path;
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
|
|
||||||
import static nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard.UNKNOWN;
|
import static nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard.UNKNOWN;
|
||||||
@ -199,8 +201,19 @@ public class DocumentProcessor {
|
|||||||
|
|
||||||
baseUrl = linkParser.getBaseLink(doc, baseUrl);
|
baseUrl = linkParser.getBaseLink(doc, baseUrl);
|
||||||
|
|
||||||
|
EdgeDomain domain = baseUrl.domain;
|
||||||
|
|
||||||
for (var atag : doc.getElementsByTag("a")) {
|
for (var atag : doc.getElementsByTag("a")) {
|
||||||
linkParser.parseLink(baseUrl, atag).ifPresent(lp::accept);
|
var linkOpt = linkParser.parseLinkPermissive(baseUrl, atag);
|
||||||
|
if (linkParser.shouldIndexLink(atag)) {
|
||||||
|
linkOpt.ifPresent(lp::accept);
|
||||||
|
}
|
||||||
|
else if (linkOpt.isPresent()) {
|
||||||
|
if (linkParser.hasBinarySuffix(linkOpt.get().toString())) {
|
||||||
|
linkOpt.ifPresent(lp::acceptNonIndexable);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
for (var frame : doc.getElementsByTag("frame")) {
|
for (var frame : doc.getElementsByTag("frame")) {
|
||||||
linkParser.parseFrame(baseUrl, frame).ifPresent(lp::accept);
|
linkParser.parseFrame(baseUrl, frame).ifPresent(lp::accept);
|
||||||
@ -216,13 +229,44 @@ public class DocumentProcessor {
|
|||||||
|
|
||||||
final Set<String> linkTerms = new HashSet<>();
|
final Set<String> linkTerms = new HashSet<>();
|
||||||
|
|
||||||
for (var domain : lp.getForeignDomains()) {
|
for (var fd : lp.getForeignDomains()) {
|
||||||
linkTerms.add("links:"+domain.toString().toLowerCase());
|
linkTerms.add("links:"+fd.toString().toLowerCase());
|
||||||
linkTerms.add("links:"+domain.getDomain().toLowerCase());
|
linkTerms.add("links:"+fd.getDomain().toLowerCase());
|
||||||
}
|
}
|
||||||
|
|
||||||
words.append(IndexBlock.Meta, linkTerms);
|
words.append(IndexBlock.Meta, linkTerms);
|
||||||
|
|
||||||
|
Set<String> fileKeywords = new HashSet<>(100);
|
||||||
|
for (var link : lp.getNonIndexableUrls()) {
|
||||||
|
|
||||||
|
if (!Objects.equals(domain, link.domain)) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
synthesizeFilenameKeyword(fileKeywords, link);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
words.append(IndexBlock.Artifacts, fileKeywords);
|
||||||
|
}
|
||||||
|
|
||||||
|
private void synthesizeFilenameKeyword(Set<String> fileKeywords, EdgeUrl link) {
|
||||||
|
|
||||||
|
|
||||||
|
Path pFilename = Path.of(link.path.toLowerCase()).getFileName();
|
||||||
|
|
||||||
|
if (pFilename == null) return;
|
||||||
|
|
||||||
|
String filename = pFilename.toString();
|
||||||
|
if (filename.length() > 32
|
||||||
|
|| filename.endsWith(".xml")
|
||||||
|
|| filename.endsWith(".jpg")
|
||||||
|
|| filename.endsWith(".png")
|
||||||
|
|| filename.endsWith(".pdf")
|
||||||
|
|| filename.endsWith(".gif"))
|
||||||
|
return;
|
||||||
|
|
||||||
|
fileKeywords.add(filename.replace(' ', '_'));
|
||||||
}
|
}
|
||||||
|
|
||||||
private void checkDocumentLanguage(DocumentLanguageData dld) throws DisqualifiedException {
|
private void checkDocumentLanguage(DocumentLanguageData dld) throws DisqualifiedException {
|
||||||
|
@ -1,5 +1,8 @@
|
|||||||
package nu.marginalia.wmsa.edge.converting.processor.logic;
|
package nu.marginalia.wmsa.edge.converting.processor.logic;
|
||||||
|
|
||||||
|
import com.google.inject.Inject;
|
||||||
|
import com.google.inject.Singleton;
|
||||||
|
import nu.marginalia.wmsa.edge.converting.processor.logic.topic.AdblockSimulator;
|
||||||
import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain;
|
import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain;
|
||||||
import org.jsoup.nodes.Document;
|
import org.jsoup.nodes.Document;
|
||||||
|
|
||||||
@ -7,6 +10,7 @@ import java.util.HashSet;
|
|||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
|
|
||||||
|
@Singleton
|
||||||
public class FeatureExtractor {
|
public class FeatureExtractor {
|
||||||
|
|
||||||
private static final List<String> trackers = List.of("adform.net",
|
private static final List<String> trackers = List.of("adform.net",
|
||||||
@ -29,6 +33,13 @@ public class FeatureExtractor {
|
|||||||
"d31qbv1cthcecs.cloudfront.net",
|
"d31qbv1cthcecs.cloudfront.net",
|
||||||
"linkedin.com");
|
"linkedin.com");
|
||||||
|
|
||||||
|
private AdblockSimulator adblockSimulator;
|
||||||
|
|
||||||
|
@Inject
|
||||||
|
public FeatureExtractor(AdblockSimulator adblockSimulator) {
|
||||||
|
this.adblockSimulator = adblockSimulator;
|
||||||
|
}
|
||||||
|
|
||||||
public Set<HtmlFeature> getFeatures(CrawledDomain domain, Document doc) {
|
public Set<HtmlFeature> getFeatures(CrawledDomain domain, Document doc) {
|
||||||
Set<HtmlFeature> features = new HashSet<>();
|
Set<HtmlFeature> features = new HashSet<>();
|
||||||
|
|
||||||
@ -37,6 +48,9 @@ public class FeatureExtractor {
|
|||||||
if (scriptTags.size() > 0) {
|
if (scriptTags.size() > 0) {
|
||||||
features.add(HtmlFeature.JS);
|
features.add(HtmlFeature.JS);
|
||||||
}
|
}
|
||||||
|
else if(adblockSimulator.hasAds(doc.clone())) { // Only look for ads if there is javascript
|
||||||
|
features.add(HtmlFeature.ADVERTISEMENT);
|
||||||
|
}
|
||||||
|
|
||||||
if (!doc.getElementsByTag("object").isEmpty()
|
if (!doc.getElementsByTag("object").isEmpty()
|
||||||
|| !doc.getElementsByTag("audio").isEmpty()
|
|| !doc.getElementsByTag("audio").isEmpty()
|
||||||
@ -56,7 +70,7 @@ public class FeatureExtractor {
|
|||||||
if (doc.getElementsByTag("a").stream().map(e -> e.attr("href"))
|
if (doc.getElementsByTag("a").stream().map(e -> e.attr("href"))
|
||||||
.map(String::toLowerCase)
|
.map(String::toLowerCase)
|
||||||
.anyMatch(href ->
|
.anyMatch(href ->
|
||||||
href.contains("amzn.to/") || href.contains("amazon.com/"))) {
|
href.contains("amzn.to/") || (href.contains("amazon.com/") & href.contains("tag=")))) {
|
||||||
features.add(HtmlFeature.AFFILIATE_LINK);
|
features.add(HtmlFeature.AFFILIATE_LINK);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -10,6 +10,8 @@ public enum HtmlFeature {
|
|||||||
COOKIES("special:cookies"),
|
COOKIES("special:cookies"),
|
||||||
|
|
||||||
CATEGORY_FOOD("category:food"),
|
CATEGORY_FOOD("category:food"),
|
||||||
|
|
||||||
|
ADVERTISEMENT("special:ads"),
|
||||||
;
|
;
|
||||||
|
|
||||||
private final String keyword;
|
private final String keyword;
|
||||||
|
@ -40,6 +40,17 @@ public class LinkParser {
|
|||||||
.flatMap(this::createEdgeUrl);
|
.flatMap(this::createEdgeUrl);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Contract(pure=true)
|
||||||
|
public Optional<EdgeUrl> parseLinkPermissive(EdgeUrl relativeBaseUrl, Element l) {
|
||||||
|
return Optional.of(l)
|
||||||
|
.map(this::getUrl)
|
||||||
|
.map(link -> resolveUrl(relativeBaseUrl, link))
|
||||||
|
.flatMap(this::createURI)
|
||||||
|
.map(URI::normalize)
|
||||||
|
.map(this::renormalize)
|
||||||
|
.flatMap(this::createEdgeUrl);
|
||||||
|
}
|
||||||
|
|
||||||
private Optional<URI> createURI(String s) {
|
private Optional<URI> createURI(String s) {
|
||||||
try {
|
try {
|
||||||
return Optional.of(new URI(s));
|
return Optional.of(new URI(s));
|
||||||
@ -146,17 +157,20 @@ public class LinkParser {
|
|||||||
return s.matches("^[a-zA-Z]+:.*$");
|
return s.matches("^[a-zA-Z]+:.*$");
|
||||||
}
|
}
|
||||||
|
|
||||||
private boolean shouldIndexLink(Element link) {
|
public boolean shouldIndexLink(Element link) {
|
||||||
return isUrlRelevant(link.attr("href"))
|
return isUrlRelevant(link.attr("href"))
|
||||||
&& isRelRelevant(link.attr("rel"));
|
&& isRelRelevant(link.attr("rel"));
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private boolean isRelRelevant(String rel) {
|
public boolean isRelRelevant(String rel) {
|
||||||
// this is null safe
|
// this is null safe
|
||||||
return !"noindex".equalsIgnoreCase(rel);
|
return !"noindex".equalsIgnoreCase(rel);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public boolean hasBinarySuffix(String href) {
|
||||||
|
return blockSuffixList.stream().anyMatch(href::endsWith);
|
||||||
|
}
|
||||||
|
|
||||||
private boolean isUrlRelevant(String href) {
|
private boolean isUrlRelevant(String href) {
|
||||||
if (null == href || "".equals(href)) {
|
if (null == href || "".equals(href)) {
|
||||||
return false;
|
return false;
|
||||||
@ -164,7 +178,7 @@ public class LinkParser {
|
|||||||
if (blockPrefixList.stream().anyMatch(href::startsWith)) {
|
if (blockPrefixList.stream().anyMatch(href::startsWith)) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
if (blockSuffixList.stream().anyMatch(href::endsWith)) {
|
if (hasBinarySuffix(href)) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
if (href.length() > 128) {
|
if (href.length() > 128) {
|
||||||
|
@ -13,6 +13,9 @@ import java.util.Set;
|
|||||||
public class LinkProcessor {
|
public class LinkProcessor {
|
||||||
private final ProcessedDocumentDetails ret;
|
private final ProcessedDocumentDetails ret;
|
||||||
private final EdgeUrl baseUrl;
|
private final EdgeUrl baseUrl;
|
||||||
|
|
||||||
|
private final Set<EdgeUrl> nonIndexable = new HashSet<>();
|
||||||
|
|
||||||
private final Set<EdgeUrl> seenUrls = new HashSet<>();
|
private final Set<EdgeUrl> seenUrls = new HashSet<>();
|
||||||
private final Set<EdgeDomain> foreignDomains = new HashSet<>();
|
private final Set<EdgeDomain> foreignDomains = new HashSet<>();
|
||||||
|
|
||||||
@ -33,6 +36,10 @@ public class LinkProcessor {
|
|||||||
return foreignDomains;
|
return foreignDomains;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public Set<EdgeUrl> getNonIndexableUrls() {
|
||||||
|
return nonIndexable;
|
||||||
|
}
|
||||||
|
|
||||||
public void accept(EdgeUrl link) {
|
public void accept(EdgeUrl link) {
|
||||||
if (!isLinkPermitted(link)) {
|
if (!isLinkPermitted(link)) {
|
||||||
return;
|
return;
|
||||||
@ -87,4 +94,8 @@ public class LinkProcessor {
|
|||||||
return proto.equalsIgnoreCase("http")
|
return proto.equalsIgnoreCase("http")
|
||||||
|| proto.equalsIgnoreCase("https");
|
|| proto.equalsIgnoreCase("https");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void acceptNonIndexable(EdgeUrl edgeUrl) {
|
||||||
|
nonIndexable.add(edgeUrl);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@ -28,9 +28,9 @@ public class QueryParams {
|
|||||||
|
|
||||||
public static boolean isPermittedParam(String path, String param) {
|
public static boolean isPermittedParam(String path, String param) {
|
||||||
if (path.endsWith("index.php")) {
|
if (path.endsWith("index.php")) {
|
||||||
if (param.startsWith("showtopic"))
|
if (param.startsWith("showtopic="))
|
||||||
return true;
|
return true;
|
||||||
if (param.startsWith("showforum"))
|
if (param.startsWith("showforum="))
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
if (path.endsWith("viewtopic.php")) {
|
if (path.endsWith("viewtopic.php")) {
|
||||||
@ -45,6 +45,10 @@ public class QueryParams {
|
|||||||
if (path.endsWith("showforum.php")) {
|
if (path.endsWith("showforum.php")) {
|
||||||
return param.startsWith("v=");
|
return param.startsWith("v=");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (path.endsWith("StoryView.py")) { // folklore.org is neat
|
||||||
|
return param.startsWith("project=") || param.startsWith("story=");
|
||||||
|
}
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1,133 +1,181 @@
|
|||||||
package nu.marginalia.wmsa.edge.converting.processor.logic.topic;
|
package nu.marginalia.wmsa.edge.converting.processor.logic.topic;
|
||||||
|
|
||||||
|
import com.google.inject.Inject;
|
||||||
|
import com.google.inject.Singleton;
|
||||||
|
import nu.marginalia.wmsa.configuration.WmsaHome;
|
||||||
import org.jsoup.nodes.Document;
|
import org.jsoup.nodes.Document;
|
||||||
import org.jsoup.nodes.Element;
|
import org.jsoup.nodes.Element;
|
||||||
import org.jsoup.nodes.Node;
|
import org.jsoup.nodes.Node;
|
||||||
import org.jsoup.select.NodeFilter;
|
import org.jsoup.select.NodeFilter;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.nio.file.Files;
|
import java.nio.file.Files;
|
||||||
import java.nio.file.Path;
|
import java.nio.file.Path;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
|
import java.util.HashSet;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
import java.util.Set;
|
||||||
import java.util.function.Predicate;
|
import java.util.function.Predicate;
|
||||||
import java.util.regex.Pattern;
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
|
@Singleton
|
||||||
public class AdblockSimulator {
|
public class AdblockSimulator {
|
||||||
|
private final Set<String> idRules = new HashSet<>();
|
||||||
|
|
||||||
List<String> idRules = new ArrayList();
|
private final Set<String> classRules = new HashSet<>();
|
||||||
List<String> classRules = new ArrayList();
|
private final List<Predicate<String>> scriptRules = new ArrayList<>();
|
||||||
List<Predicate<String>> scriptRules = new ArrayList();
|
|
||||||
|
|
||||||
public AdblockSimulator(Path adsDefinition) throws IOException {
|
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||||
try (var lineStream = Files.lines(adsDefinition)) {
|
|
||||||
|
@Inject
|
||||||
|
public AdblockSimulator() throws IOException {
|
||||||
|
Path adDef = WmsaHome.getAdsDefinition();
|
||||||
|
|
||||||
|
if (!Files.exists(adDef)) {
|
||||||
|
logger.error("Can not find ads definition file in {}", adDef);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
try (var lineStream = Files.lines(adDef)) {
|
||||||
lineStream.skip(1).forEach(this::addRule);
|
lineStream.skip(1).forEach(this::addRule);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private void addRule(String s) {
|
|
||||||
if (s.startsWith("##") && !s.contains(":")) {
|
|
||||||
if (s.startsWith("###")) {
|
|
||||||
idRules.add(s.substring(3));
|
|
||||||
} else if(s.startsWith("##.")) {
|
|
||||||
classRules.add(s.substring(3));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
else if (!s.startsWith("!") && !s.contains("#")){
|
|
||||||
scriptRules.add(toRegexMatcher(s));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private Predicate<String> toRegexMatcher(String s) {
|
|
||||||
|
|
||||||
System.out.println("<-" + s);
|
|
||||||
|
|
||||||
s = s.replaceAll("\\?", "\\\\?");
|
|
||||||
s = s.replaceAll("\\.", "\\\\.");
|
|
||||||
s = s.replaceAll("\\$", "\\\\\\$");
|
|
||||||
|
|
||||||
if (s.startsWith("||")) {
|
|
||||||
s = s.replaceFirst("\\|\\|","^http(s)?://");
|
|
||||||
}
|
|
||||||
|
|
||||||
s = s.replaceAll("\\|", "\\\\|");
|
|
||||||
s = s.replaceAll("\\*", ".*");
|
|
||||||
s = s.replaceAll("\\^", "[?/]");
|
|
||||||
|
|
||||||
|
|
||||||
System.out.println("->" + s);
|
|
||||||
return Pattern.compile(s).asPredicate();
|
|
||||||
}
|
|
||||||
|
|
||||||
class RuleVisitor implements NodeFilter {
|
|
||||||
public boolean sawAds;
|
|
||||||
Pattern spPattern = Pattern.compile("\\s");
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public FilterResult head(Node node, int depth) {
|
|
||||||
|
|
||||||
if (node.attributesSize() > 0 && node instanceof Element elem) { // instanceof is slow
|
|
||||||
|
|
||||||
String id = elem.id();
|
|
||||||
for (var rule : idRules) {
|
|
||||||
if (rule.equals(id)) {
|
|
||||||
sawAds = true;
|
|
||||||
return FilterResult.STOP;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
String classes = elem.className();
|
|
||||||
if (classes.isBlank()) return FilterResult.CONTINUE;
|
|
||||||
|
|
||||||
if (classes.indexOf(' ') > 0) {
|
|
||||||
String[] classNames = spPattern.split(classes);
|
|
||||||
for (var rule : classRules) {
|
|
||||||
|
|
||||||
for (var className : classNames) {
|
|
||||||
if (className.equals(rule)) {
|
|
||||||
sawAds = true;
|
|
||||||
return FilterResult.STOP;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
else { // tag only has one class
|
|
||||||
for (var rule : classRules) {
|
|
||||||
if (classes.equals(rule)) {
|
|
||||||
sawAds = true;
|
|
||||||
return FilterResult.STOP;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if ("script".equals(elem.tagName())) {
|
|
||||||
String src = elem.attr("src");
|
|
||||||
|
|
||||||
for (var rule : scriptRules) {
|
|
||||||
if (rule.test(src)) {
|
|
||||||
sawAds = true;
|
|
||||||
return FilterResult.STOP;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return FilterResult.CONTINUE;
|
|
||||||
}
|
|
||||||
return FilterResult.CONTINUE;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public FilterResult tail(Node node, int depth) {
|
|
||||||
return FilterResult.CONTINUE;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
public boolean hasAds(Document document) {
|
public boolean hasAds(Document document) {
|
||||||
|
|
||||||
RuleVisitor ruleVisitor = new RuleVisitor();
|
RuleVisitor ruleVisitor = new RuleVisitor();
|
||||||
|
|
||||||
document.filter(ruleVisitor);
|
document.filter(ruleVisitor);
|
||||||
|
|
||||||
return ruleVisitor.sawAds;
|
return ruleVisitor.sawAds;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private void addRule(String s) {
|
||||||
|
try {
|
||||||
|
if (s.startsWith("##") && !s.contains(":")) {
|
||||||
|
if (s.startsWith("###")) {
|
||||||
|
idRules.add(s.substring(3));
|
||||||
|
} else if (s.startsWith("##.")) {
|
||||||
|
classRules.add(s.substring(3));
|
||||||
|
}
|
||||||
|
} else if (s.startsWith("/^")) {
|
||||||
|
int end = s.indexOf("[^\\]/");
|
||||||
|
if (end >= 0) {
|
||||||
|
String patternString = s.substring(1, end+1);
|
||||||
|
scriptRules.add(Pattern.compile(patternString).asPredicate());
|
||||||
|
}
|
||||||
|
} else if (!s.startsWith("!") && !s.contains("#") && !s.startsWith("@@")) {
|
||||||
|
if (!s.contains("$")) {
|
||||||
|
scriptRules.add(toRegexMatcher(s));
|
||||||
|
}
|
||||||
|
else if (s.contains("$script") && !s.contains("domain=")) {
|
||||||
|
scriptRules.add(toRegexMatcher(s.substring(0, s.indexOf('$'))));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
catch (Exception ex) {
|
||||||
|
System.err.println("Failed to add rule " + s);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private Predicate<String> toRegexMatcher(String s) {
|
||||||
|
String sOriginal = s;
|
||||||
|
if (s.isBlank()) return unused -> false;
|
||||||
|
|
||||||
|
// In some cases, regexes aren't necessary
|
||||||
|
if (s.matches("[&?=/A-Za-z0-9._-]+")) {
|
||||||
|
if (s.startsWith("/")) {
|
||||||
|
return str -> str.equals(sOriginal);
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
return str -> str.contains(sOriginal);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (s.matches("[&?=/A-Za-z0-9._-]+\\*")) {
|
||||||
|
return str -> str.startsWith(sOriginal.substring(0, sOriginal.length()-1));
|
||||||
|
}
|
||||||
|
|
||||||
|
String s0 = s;
|
||||||
|
s = s.replaceAll("\\?", "\\\\?");
|
||||||
|
s = s.replaceAll("\\.", "\\\\.");
|
||||||
|
|
||||||
|
s = s.replaceAll("\\^", "[?/]");
|
||||||
|
s = s.replaceAll("\\*", ".*");
|
||||||
|
|
||||||
|
if (s.startsWith("||")) {
|
||||||
|
s = s.replaceFirst("\\|\\|","^http[s]?://.*");
|
||||||
|
}
|
||||||
|
|
||||||
|
s = s.replaceAll("\\|", "\\\\|");
|
||||||
|
return Pattern.compile(s).asPredicate();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// Refrain from cleaning up this code, it's very hot code and needs to be fast.
|
||||||
|
// This version is about 100x faster than the a "clean" first stab implementation.
|
||||||
|
|
||||||
|
class RuleVisitor implements NodeFilter {
|
||||||
|
public boolean sawAds;
|
||||||
|
|
||||||
|
Pattern spPattern = Pattern.compile("\\s");
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public FilterResult head(Node node, int depth) {
|
||||||
|
|
||||||
|
if (node.attributesSize() > 0 && node instanceof Element elem) {
|
||||||
|
if (testId(elem) || testClass(elem) || testScriptTags(elem)) {
|
||||||
|
sawAds = true;
|
||||||
|
return FilterResult.STOP;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return FilterResult.CONTINUE;
|
||||||
|
}
|
||||||
|
|
||||||
|
private boolean testScriptTags(Element elem) {
|
||||||
|
if (!"script".equals(elem.tagName())) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
String src = elem.attr("src");
|
||||||
|
for (var rule : scriptRules) {
|
||||||
|
if (rule.test(src)) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
private boolean testId(Element elem) {
|
||||||
|
String id = elem.id();
|
||||||
|
|
||||||
|
return idRules.contains(id);
|
||||||
|
}
|
||||||
|
|
||||||
|
private boolean testClass(Element elem) {
|
||||||
|
String classes = elem.className();
|
||||||
|
if (classes.isBlank())
|
||||||
|
return false;
|
||||||
|
|
||||||
|
if (classes.indexOf(' ') > 0) {
|
||||||
|
String[] classNames = spPattern.split(classes);
|
||||||
|
for (var className : classNames) {
|
||||||
|
if (classRules.contains(className))
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else { // tag only has one class, no need to split
|
||||||
|
return classRules.contains(classes);
|
||||||
|
}
|
||||||
|
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
@Override
|
||||||
|
public FilterResult tail(Node node, int depth) {
|
||||||
|
return FilterResult.CONTINUE;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -5,17 +5,17 @@ import com.google.gson.Gson;
|
|||||||
import com.google.gson.GsonBuilder;
|
import com.google.gson.GsonBuilder;
|
||||||
import nu.marginalia.wmsa.edge.crawling.model.CrawledDocument;
|
import nu.marginalia.wmsa.edge.crawling.model.CrawledDocument;
|
||||||
import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain;
|
import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain;
|
||||||
import org.slf4j.Logger;
|
|
||||||
import org.slf4j.LoggerFactory;
|
|
||||||
|
|
||||||
import java.io.*;
|
import java.io.BufferedReader;
|
||||||
|
import java.io.FileInputStream;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.InputStreamReader;
|
||||||
import java.nio.file.Path;
|
import java.nio.file.Path;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
public class CrawledDomainReader {
|
public class CrawledDomainReader {
|
||||||
private final Gson gson = new GsonBuilder().create();
|
private final Gson gson = new GsonBuilder().create();
|
||||||
private static final Logger logger = LoggerFactory.getLogger(CrawledDomainReader.class);
|
|
||||||
|
|
||||||
public CrawledDomainReader() {
|
public CrawledDomainReader() {
|
||||||
}
|
}
|
||||||
@ -23,7 +23,17 @@ public class CrawledDomainReader {
|
|||||||
public CrawledDomain read(Path path) throws IOException {
|
public CrawledDomain read(Path path) throws IOException {
|
||||||
List<CrawledDocument> docs = new ArrayList<>();
|
List<CrawledDocument> docs = new ArrayList<>();
|
||||||
CrawledDomain domain = null;
|
CrawledDomain domain = null;
|
||||||
try (var br = new BufferedReader(new InputStreamReader(new ZstdInputStream(new BufferedInputStream(new FileInputStream(path.toFile())))))) {
|
|
||||||
|
|
||||||
|
try (var br = new BufferedReader(new InputStreamReader(new ZstdInputStream(new FileInputStream(path.toFile()))))) {
|
||||||
|
br.mark(2);
|
||||||
|
boolean legacy = '{' == br.read();
|
||||||
|
br.reset();
|
||||||
|
|
||||||
|
if (legacy) {
|
||||||
|
domain = gson.fromJson(br, CrawledDomain.class);
|
||||||
|
}
|
||||||
|
else {
|
||||||
String line;
|
String line;
|
||||||
while ((line = br.readLine()) != null) {
|
while ((line = br.readLine()) != null) {
|
||||||
if (line.startsWith("//")) {
|
if (line.startsWith("//")) {
|
||||||
@ -35,17 +45,23 @@ public class CrawledDomainReader {
|
|||||||
} else if (line.equals(CrawledDocument.SERIAL_IDENTIFIER)) {
|
} else if (line.equals(CrawledDocument.SERIAL_IDENTIFIER)) {
|
||||||
docs.add(gson.fromJson(nextLine, CrawledDocument.class));
|
docs.add(gson.fromJson(nextLine, CrawledDocument.class));
|
||||||
}
|
}
|
||||||
}
|
} else if (line.charAt(0) == '{') {
|
||||||
else if (line.charAt(0) == '{') {
|
|
||||||
domain = gson.fromJson(line, CrawledDomain.class);
|
domain = gson.fromJson(line, CrawledDomain.class);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if (domain == null) {
|
if (domain == null) {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (!docs.isEmpty()) {
|
||||||
|
if (domain.doc == null)
|
||||||
|
domain.doc = new ArrayList<>();
|
||||||
|
|
||||||
domain.doc.addAll(docs);
|
domain.doc.addAll(docs);
|
||||||
|
}
|
||||||
return domain;
|
return domain;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -11,7 +11,7 @@ public enum IndexBlock {
|
|||||||
Meta(7, 7),
|
Meta(7, 7),
|
||||||
PositionWords(8, 4.5),
|
PositionWords(8, 4.5),
|
||||||
NamesWords(9, 5),
|
NamesWords(9, 5),
|
||||||
Unused(10, 10),
|
Artifacts(10, 10),
|
||||||
Topic(11, 0.5);
|
Topic(11, 0.5);
|
||||||
|
|
||||||
public final int id;
|
public final int id;
|
||||||
|
@ -132,6 +132,8 @@ public class EdgeUrlDetails {
|
|||||||
public boolean isCookies() {
|
public boolean isCookies() {
|
||||||
return HtmlFeature.hasFeature(features, HtmlFeature.COOKIES);
|
return HtmlFeature.hasFeature(features, HtmlFeature.COOKIES);
|
||||||
}
|
}
|
||||||
|
/**
 * @return whether {@code HtmlFeature.hasFeature} reports
 *         {@code HtmlFeature.ADVERTISEMENT} for this object's {@code features}
 */
public boolean isAds() {
    return HtmlFeature.hasFeature(features, HtmlFeature.ADVERTISEMENT);
}
|
||||||
|
|
||||||
public boolean isSpecialDomain() {
|
public boolean isSpecialDomain() {
|
||||||
return domainState == EdgeDomainIndexingState.SPECIAL;
|
return domainState == EdgeDomainIndexingState.SPECIAL;
|
||||||
}
|
}
|
||||||
|
@ -19,7 +19,7 @@ public class AdblockTesterTool {
|
|||||||
|
|
||||||
static {
|
static {
|
||||||
try {
|
try {
|
||||||
simulator = new AdblockSimulator(Path.of("/home/vlofgren/easylist.txt"));
|
simulator = new AdblockSimulator();
|
||||||
} catch (IOException e) {
|
} catch (IOException e) {
|
||||||
throw new RuntimeException(e);
|
throw new RuntimeException(e);
|
||||||
}
|
}
|
||||||
@ -29,7 +29,6 @@ public class AdblockTesterTool {
|
|||||||
public static void main(String... args) throws IOException {
|
public static void main(String... args) throws IOException {
|
||||||
EdgeCrawlPlan plan = new CrawlPlanLoader().load(Path.of(args[0]));
|
EdgeCrawlPlan plan = new CrawlPlanLoader().load(Path.of(args[0]));
|
||||||
|
|
||||||
|
|
||||||
try (var iterable = plan.domainsIterable()) {
|
try (var iterable = plan.domainsIterable()) {
|
||||||
for (var domain : iterable) {
|
for (var domain : iterable) {
|
||||||
processDomain(domain);
|
processDomain(domain);
|
||||||
|
@ -0,0 +1,56 @@
|
|||||||
|
package nu.marginalia.wmsa.edge.tools;
|
||||||
|
|
||||||
|
import com.google.inject.Guice;
|
||||||
|
import com.google.inject.Inject;
|
||||||
|
import com.google.inject.Injector;
|
||||||
|
import nu.marginalia.wmsa.edge.converting.ConverterModule;
|
||||||
|
import nu.marginalia.wmsa.edge.converting.processor.DomainProcessor;
|
||||||
|
import nu.marginalia.wmsa.edge.crawling.CrawlPlanLoader;
|
||||||
|
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
|
||||||
|
import nu.marginalia.wmsa.edge.model.EdgeCrawlPlan;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.nio.file.Path;
|
||||||
|
|
||||||
|
public class ConverterLogicTestTool {
|
||||||
|
|
||||||
|
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||||
|
|
||||||
|
public static void main(String... args) throws IOException {
|
||||||
|
|
||||||
|
if (args.length != 1) {
|
||||||
|
System.err.println("Arguments: crawl-plan.yaml");
|
||||||
|
System.exit(0);
|
||||||
|
}
|
||||||
|
var plan = new CrawlPlanLoader().load(Path.of(args[0]));
|
||||||
|
|
||||||
|
Injector injector = Guice.createInjector(
|
||||||
|
new ConverterModule(plan)
|
||||||
|
);
|
||||||
|
|
||||||
|
injector.getInstance(ConverterLogicTestTool.class);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Inject
|
||||||
|
public ConverterLogicTestTool(
|
||||||
|
EdgeCrawlPlan plan,
|
||||||
|
DomainProcessor processor
|
||||||
|
) throws Exception {
|
||||||
|
|
||||||
|
plan.forEachCrawledDomain(domain -> {
|
||||||
|
var ret = processor.process(domain);
|
||||||
|
ret.documents.forEach(doc -> {
|
||||||
|
if (doc.words == null)
|
||||||
|
return;
|
||||||
|
var artifacts = doc.words.get(IndexBlock.Artifacts);
|
||||||
|
if (artifacts.size() > 0) {
|
||||||
|
System.out.println(doc.url + ": " + artifacts);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
@ -1,13 +1,8 @@
|
|||||||
package nu.marginalia.wmsa.edge.tools;
|
package nu.marginalia.wmsa.edge.tools;
|
||||||
|
|
||||||
import nu.marginalia.util.language.conf.LanguageModels;
|
import lombok.SneakyThrows;
|
||||||
import nu.marginalia.util.language.processing.SentenceExtractor;
|
|
||||||
import nu.marginalia.util.language.processing.model.DocumentLanguageData;
|
|
||||||
import nu.marginalia.wmsa.configuration.WmsaHome;
|
|
||||||
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
|
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
|
||||||
import nu.marginalia.wmsa.edge.converting.processor.logic.topic.RecipeDetector;
|
import nu.marginalia.wmsa.edge.converting.processor.logic.topic.AdblockSimulator;
|
||||||
import nu.marginalia.wmsa.edge.converting.processor.logic.topic.TextileCraftDetector;
|
|
||||||
import nu.marginalia.wmsa.edge.converting.processor.logic.topic.WoodworkingDetector;
|
|
||||||
import nu.marginalia.wmsa.edge.crawling.CrawlPlanLoader;
|
import nu.marginalia.wmsa.edge.crawling.CrawlPlanLoader;
|
||||||
import nu.marginalia.wmsa.edge.crawling.model.CrawledDocument;
|
import nu.marginalia.wmsa.edge.crawling.model.CrawledDocument;
|
||||||
import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain;
|
import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain;
|
||||||
@ -20,21 +15,24 @@ import java.nio.file.Path;
|
|||||||
import java.sql.SQLException;
|
import java.sql.SQLException;
|
||||||
import java.util.HashSet;
|
import java.util.HashSet;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
import java.util.concurrent.ForkJoinPool;
|
import java.util.concurrent.*;
|
||||||
import java.util.concurrent.TimeUnit;
|
|
||||||
|
|
||||||
import static nu.marginalia.wmsa.edge.converting.processor.DocumentProcessor.isAcceptedContentType;
|
import static nu.marginalia.wmsa.edge.converting.processor.DocumentProcessor.isAcceptedContentType;
|
||||||
|
|
||||||
public class RecipeDetectorTool {
|
public class CrawlDataExtractorTool {
|
||||||
private static final TextileCraftDetector textileCraftDetector = new TextileCraftDetector();
|
private static final AdblockSimulator abs;
|
||||||
private static final WoodworkingDetector woodworkingDetector = new WoodworkingDetector();
|
|
||||||
private static final RecipeDetector recipeDetector = new RecipeDetector();
|
|
||||||
|
|
||||||
private static final LanguageModels lm = WmsaHome.getLanguageModels();
|
static {
|
||||||
private static final SentenceExtractor sentenceExtractor = new SentenceExtractor(lm);
|
try {
|
||||||
|
abs = new AdblockSimulator();
|
||||||
|
} catch (IOException e) {
|
||||||
|
throw new RuntimeException(e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
private static final Set<String> urls = new HashSet<>(50_000_000);
|
private static final Set<String> urls = new HashSet<>(50_000_000);
|
||||||
|
|
||||||
|
@SneakyThrows
|
||||||
public static void main(String... args) throws IOException {
|
public static void main(String... args) throws IOException {
|
||||||
EdgeCrawlPlan plan = new CrawlPlanLoader().load(Path.of(args[0]));
|
EdgeCrawlPlan plan = new CrawlPlanLoader().load(Path.of(args[0]));
|
||||||
DatabaseModule module = new DatabaseModule();
|
DatabaseModule module = new DatabaseModule();
|
||||||
@ -51,15 +49,25 @@ public class RecipeDetectorTool {
|
|||||||
ex.printStackTrace();
|
ex.printStackTrace();
|
||||||
}
|
}
|
||||||
|
|
||||||
ForkJoinPool pool = new ForkJoinPool(16);
|
LinkedBlockingQueue<Runnable> queue = new LinkedBlockingQueue<>(10);
|
||||||
|
ExecutorService pool = new ThreadPoolExecutor(10, 20, 5, TimeUnit.MINUTES, queue);
|
||||||
|
Semaphore sem = new Semaphore(20);
|
||||||
|
|
||||||
try (var iterable = plan.domainsIterable()) {
|
try (var iterable = plan.domainsIterable()) {
|
||||||
for (var domain : iterable) {
|
for (var domain : iterable) {
|
||||||
pool.execute(() -> processDomain(domain));
|
sem.acquire();
|
||||||
|
pool.execute(() -> {
|
||||||
|
try { processDomain(domain); }
|
||||||
|
finally { sem.release(); }
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
} catch (InterruptedException e) {
|
||||||
|
throw new RuntimeException(e);
|
||||||
}
|
}
|
||||||
|
|
||||||
while (!pool.awaitQuiescence(1, TimeUnit.HOURS));
|
pool.shutdown();
|
||||||
|
|
||||||
|
while (!pool.awaitTermination(1, TimeUnit.MINUTES));
|
||||||
}
|
}
|
||||||
|
|
||||||
private static void processDomain(CrawledDomain domain) {
|
private static void processDomain(CrawledDomain domain) {
|
||||||
@ -78,24 +86,8 @@ public class RecipeDetectorTool {
|
|||||||
private static void processDocument(CrawledDocument doc) {
|
private static void processDocument(CrawledDocument doc) {
|
||||||
Document parsedDocument = Jsoup.parse(doc.documentBody);
|
Document parsedDocument = Jsoup.parse(doc.documentBody);
|
||||||
|
|
||||||
parsedDocument.getElementsByTag("a").remove();
|
if (abs.hasAds(parsedDocument)) {
|
||||||
parsedDocument.getElementsByTag("nav").remove();
|
System.out.println(doc.url);
|
||||||
|
|
||||||
DocumentLanguageData dld = sentenceExtractor.extractSentences(parsedDocument);
|
|
||||||
|
|
||||||
double prob = 100*recipeDetector.testP(dld);
|
|
||||||
if (prob > 50) {
|
|
||||||
System.out.printf("#%3.2f recipe\t%s\n%s\n", prob, parsedDocument.title(), doc.url);
|
|
||||||
}
|
|
||||||
|
|
||||||
prob = 100*woodworkingDetector.testP(dld);
|
|
||||||
if (prob > 20) {
|
|
||||||
System.out.printf("#%3.2f woodworking\t%s\n%s\n", prob, parsedDocument.title(), doc.url);
|
|
||||||
}
|
|
||||||
|
|
||||||
prob = 100*textileCraftDetector.testP(dld);
|
|
||||||
if (prob > 20) {
|
|
||||||
System.out.printf("#%3.2f textilecraft\t%s\n%s\n", prob, parsedDocument.title(), doc.url);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
@ -3,6 +3,7 @@ package nu.marginalia.wmsa.edge.tools;
|
|||||||
import com.zaxxer.hikari.HikariDataSource;
|
import com.zaxxer.hikari.HikariDataSource;
|
||||||
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
|
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
|
||||||
import nu.marginalia.wmsa.configuration.server.Context;
|
import nu.marginalia.wmsa.configuration.server.Context;
|
||||||
|
import nu.marginalia.wmsa.edge.converting.processor.logic.HtmlFeature;
|
||||||
import nu.marginalia.wmsa.edge.index.client.EdgeIndexClient;
|
import nu.marginalia.wmsa.edge.index.client.EdgeIndexClient;
|
||||||
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
|
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
|
||||||
import nu.marginalia.wmsa.edge.model.EdgeId;
|
import nu.marginalia.wmsa.edge.model.EdgeId;
|
||||||
@ -20,19 +21,20 @@ import java.util.List;
|
|||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.Objects;
|
import java.util.Objects;
|
||||||
|
|
||||||
import static nu.marginalia.wmsa.edge.converting.processor.logic.HtmlFeature.CATEGORY_FOOD;
|
public class FeaturesLoaderTool {
|
||||||
|
|
||||||
public class RecipesLoaderTool {
|
|
||||||
public static void main(String... args) {
|
public static void main(String... args) {
|
||||||
|
|
||||||
|
HtmlFeature feature = HtmlFeature.valueOf(args[0]);
|
||||||
|
Path file = Path.of(args[1]);
|
||||||
|
|
||||||
try (EdgeIndexClient client = new EdgeIndexClient();
|
try (EdgeIndexClient client = new EdgeIndexClient();
|
||||||
HikariDataSource ds = new DatabaseModule().provideConnection();
|
HikariDataSource ds = new DatabaseModule().provideConnection();
|
||||||
Connection conn = ds.getConnection();
|
Connection conn = ds.getConnection();
|
||||||
PreparedStatement ps = conn.prepareStatement("UPDATE EC_PAGE_DATA SET FEATURES = FEATURES | ? WHERE ID=?");
|
PreparedStatement ps = conn.prepareStatement("UPDATE EC_PAGE_DATA SET FEATURES = FEATURES | ? WHERE ID=?");
|
||||||
var linesStream = Files.lines(Path.of(args[0]))) {
|
var linesStream = Files.lines(file)) {
|
||||||
|
|
||||||
var urls = getUrls(ds);
|
var urls = getUrls(ds);
|
||||||
var wordSet = new EdgePageWordSet(new EdgePageWords(IndexBlock.Meta, List.of(CATEGORY_FOOD.getKeyword())));
|
var wordSet = new EdgePageWordSet(new EdgePageWords(IndexBlock.Meta, List.of(feature.getKeyword())));
|
||||||
linesStream
|
linesStream
|
||||||
.map(urls::get)
|
.map(urls::get)
|
||||||
.filter(Objects::nonNull)
|
.filter(Objects::nonNull)
|
||||||
@ -42,7 +44,7 @@ public class RecipesLoaderTool {
|
|||||||
|
|
||||||
try {
|
try {
|
||||||
ps.setInt(2, urlId);
|
ps.setInt(2, urlId);
|
||||||
ps.setInt(1, CATEGORY_FOOD.getFeatureBit());
|
ps.setInt(1, feature.getFeatureBit());
|
||||||
ps.executeUpdate();
|
ps.executeUpdate();
|
||||||
}
|
}
|
||||||
catch (SQLException ex) {
|
catch (SQLException ex) {
|
@ -3,6 +3,7 @@
|
|||||||
{{#if media}}<abbr title="audio or video" class="meta">🎞️</abbr>{{/if}}
|
{{#if media}}<abbr title="audio or video" class="meta">🎞️</abbr>{{/if}}
|
||||||
{{#if affiliate}}<abbr title="possible amazon affiliate link (experimental; unreliable)" class="meta">💳️</abbr>{{/if}}
|
{{#if affiliate}}<abbr title="possible amazon affiliate link (experimental; unreliable)" class="meta">💳️</abbr>{{/if}}
|
||||||
{{#if cookies}}<abbr title="cookies" class="meta">👁️️</abbr>{{/if}}
|
{{#if cookies}}<abbr title="cookies" class="meta">👁️️</abbr>{{/if}}
|
||||||
|
{{#if ads}}<abbr title="ads (experimental)" class="meta">⚠️️️</abbr>{{/if}}
|
||||||
<span class="meta">{{format}}</span>
|
<span class="meta">{{format}}</span>
|
||||||
{{#unless focusDomain}}
|
{{#unless focusDomain}}
|
||||||
<span class="rank-symbol" title="{{rankingSymbolDesc}}">{{{rankingSymbol}}}</span>
|
<span class="rank-symbol" title="{{rankingSymbolDesc}}">{{{rankingSymbol}}}</span>
|
||||||
|
Loading…
Reference in New Issue
Block a user