Add advertisement feature to search.

Add adblock simulation to processor.
Add filename and email address extraction to processor.
This commit is contained in:
vlofgren 2022-08-12 13:50:18 +02:00
parent 0e28ff5a72
commit 30d2a707ff
16 changed files with 387 additions and 177 deletions

View File

@ -46,6 +46,10 @@ public class WmsaHome {
}
}
// Location of the adblock rule definition file consumed by AdblockSimulator.
// Resolves to <home>/data/adblock.txt; existence is checked by the consumer, not here.
public static Path getAdsDefinition() {
return getHomePath().resolve("data").resolve("adblock.txt");
}
// Location of the IP2Location LITE geolocation CSV database.
// NOTE(review): method name has a typo ("Databse"); renaming would break existing
// callers, so it is left as-is here.
public static Path getIPLocationDatabse() {
return getHomePath().resolve("data").resolve("IP2LOCATION-LITE-DB1.CSV");
}
@ -90,4 +94,5 @@ public class WmsaHome {
home.resolve("model/English.DICT"),
home.resolve("model/opennlp-tok.bin"));
}
}

View File

@ -16,6 +16,7 @@ import nu.marginalia.wmsa.edge.crawling.model.CrawledDocument;
import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain;
import nu.marginalia.wmsa.edge.crawling.model.CrawlerDocumentStatus;
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
import nu.marginalia.wmsa.edge.model.EdgeDomain;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard;
import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet;
@ -26,6 +27,7 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.net.URISyntaxException;
import java.nio.file.Path;
import java.util.*;
import static nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard.UNKNOWN;
@ -199,8 +201,19 @@ public class DocumentProcessor {
baseUrl = linkParser.getBaseLink(doc, baseUrl);
EdgeDomain domain = baseUrl.domain;
for (var atag : doc.getElementsByTag("a")) {
linkParser.parseLink(baseUrl, atag).ifPresent(lp::accept);
var linkOpt = linkParser.parseLinkPermissive(baseUrl, atag);
if (linkParser.shouldIndexLink(atag)) {
linkOpt.ifPresent(lp::accept);
}
else if (linkOpt.isPresent()) {
if (linkParser.hasBinarySuffix(linkOpt.get().toString())) {
linkOpt.ifPresent(lp::acceptNonIndexable);
}
}
}
for (var frame : doc.getElementsByTag("frame")) {
linkParser.parseFrame(baseUrl, frame).ifPresent(lp::accept);
@ -216,13 +229,44 @@ public class DocumentProcessor {
final Set<String> linkTerms = new HashSet<>();
for (var domain : lp.getForeignDomains()) {
linkTerms.add("links:"+domain.toString().toLowerCase());
linkTerms.add("links:"+domain.getDomain().toLowerCase());
for (var fd : lp.getForeignDomains()) {
linkTerms.add("links:"+fd.toString().toLowerCase());
linkTerms.add("links:"+fd.getDomain().toLowerCase());
}
words.append(IndexBlock.Meta, linkTerms);
Set<String> fileKeywords = new HashSet<>(100);
for (var link : lp.getNonIndexableUrls()) {
if (!Objects.equals(domain, link.domain)) {
continue;
}
synthesizeFilenameKeyword(fileKeywords, link);
}
words.append(IndexBlock.Artifacts, fileKeywords);
}
// Derives a searchable keyword from the final path segment of a same-domain,
// non-indexable link (e.g. "report_2021.zip"), adding it to fileKeywords.
// Overly long names and common media/data suffixes are skipped; spaces are
// replaced with underscores so the keyword survives tokenization.
private void synthesizeFilenameKeyword(Set<String> fileKeywords, EdgeUrl link) {
    Path lastSegment = Path.of(link.path.toLowerCase()).getFileName();
    if (lastSegment == null) {
        return;
    }

    String name = lastSegment.toString();

    // Reject names that are too long or end in a suffix we don't want as a keyword
    boolean excluded = name.length() > 32
            || List.of(".xml", ".jpg", ".png", ".pdf", ".gif")
                   .stream().anyMatch(name::endsWith);
    if (excluded) {
        return;
    }

    fileKeywords.add(name.replace(' ', '_'));
}
private void checkDocumentLanguage(DocumentLanguageData dld) throws DisqualifiedException {

View File

@ -1,5 +1,8 @@
package nu.marginalia.wmsa.edge.converting.processor.logic;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import nu.marginalia.wmsa.edge.converting.processor.logic.topic.AdblockSimulator;
import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain;
import org.jsoup.nodes.Document;
@ -7,6 +10,7 @@ import java.util.HashSet;
import java.util.List;
import java.util.Set;
@Singleton
public class FeatureExtractor {
private static final List<String> trackers = List.of("adform.net",
@ -29,6 +33,13 @@ public class FeatureExtractor {
"d31qbv1cthcecs.cloudfront.net",
"linkedin.com");
private AdblockSimulator adblockSimulator;
@Inject
public FeatureExtractor(AdblockSimulator adblockSimulator) {
this.adblockSimulator = adblockSimulator;
}
public Set<HtmlFeature> getFeatures(CrawledDomain domain, Document doc) {
Set<HtmlFeature> features = new HashSet<>();
@ -37,6 +48,9 @@ public class FeatureExtractor {
if (scriptTags.size() > 0) {
features.add(HtmlFeature.JS);
}
else if(adblockSimulator.hasAds(doc.clone())) { // Only look for ads if there is javascript
features.add(HtmlFeature.ADVERTISEMENT);
}
if (!doc.getElementsByTag("object").isEmpty()
|| !doc.getElementsByTag("audio").isEmpty()
@ -56,7 +70,7 @@ public class FeatureExtractor {
if (doc.getElementsByTag("a").stream().map(e -> e.attr("href"))
.map(String::toLowerCase)
.anyMatch(href ->
href.contains("amzn.to/") || href.contains("amazon.com/"))) {
href.contains("amzn.to/") || (href.contains("amazon.com/") & href.contains("tag=")))) {
features.add(HtmlFeature.AFFILIATE_LINK);
}

View File

@ -10,6 +10,8 @@ public enum HtmlFeature {
COOKIES("special:cookies"),
CATEGORY_FOOD("category:food"),
ADVERTISEMENT("special:ads"),
;
private final String keyword;

View File

@ -40,6 +40,17 @@ public class LinkParser {
.flatMap(this::createEdgeUrl);
}
// Permissive link parsing: resolves the anchor's href against the base URL and
// normalizes it into an EdgeUrl, WITHOUT applying indexing-relevancy filtering —
// callers are expected to apply shouldIndexLink()/hasBinarySuffix() themselves
// (see DocumentProcessor). Returns empty if the href cannot be resolved.
@Contract(pure=true)
public Optional<EdgeUrl> parseLinkPermissive(EdgeUrl relativeBaseUrl, Element l) {
return Optional.of(l)
.map(this::getUrl)
.map(link -> resolveUrl(relativeBaseUrl, link))
.flatMap(this::createURI)
.map(URI::normalize)
.map(this::renormalize)
.flatMap(this::createEdgeUrl);
}
private Optional<URI> createURI(String s) {
try {
return Optional.of(new URI(s));
@ -146,17 +157,20 @@ public class LinkParser {
return s.matches("^[a-zA-Z]+:.*$");
}
private boolean shouldIndexLink(Element link) {
public boolean shouldIndexLink(Element link) {
return isUrlRelevant(link.attr("href"))
&& isRelRelevant(link.attr("rel"));
}
private boolean isRelRelevant(String rel) {
public boolean isRelRelevant(String rel) {
// this is null safe
return !"noindex".equalsIgnoreCase(rel);
}
// True if the href ends with one of the configured blocked file suffixes
// (blockSuffixList). Such links are excluded from regular indexing, but callers
// may still record them as non-indexable artifacts.
public boolean hasBinarySuffix(String href) {
return blockSuffixList.stream().anyMatch(href::endsWith);
}
private boolean isUrlRelevant(String href) {
if (null == href || "".equals(href)) {
return false;
@ -164,7 +178,7 @@ public class LinkParser {
if (blockPrefixList.stream().anyMatch(href::startsWith)) {
return false;
}
if (blockSuffixList.stream().anyMatch(href::endsWith)) {
if (hasBinarySuffix(href)) {
return false;
}
if (href.length() > 128) {

View File

@ -13,6 +13,9 @@ import java.util.Set;
public class LinkProcessor {
private final ProcessedDocumentDetails ret;
private final EdgeUrl baseUrl;
private final Set<EdgeUrl> nonIndexable = new HashSet<>();
private final Set<EdgeUrl> seenUrls = new HashSet<>();
private final Set<EdgeDomain> foreignDomains = new HashSet<>();
@ -33,6 +36,10 @@ public class LinkProcessor {
return foreignDomains;
}
// URLs seen on the page that were deliberately not indexed (e.g. links with
// binary file suffixes); populated via acceptNonIndexable().
public Set<EdgeUrl> getNonIndexableUrls() {
return nonIndexable;
}
public void accept(EdgeUrl link) {
if (!isLinkPermitted(link)) {
return;
@ -87,4 +94,8 @@ public class LinkProcessor {
return proto.equalsIgnoreCase("http")
|| proto.equalsIgnoreCase("https");
}
// Records a URL that should be remembered (e.g. for filename keyword synthesis)
// without being indexed as a regular link. Note: unlike accept(), no permission
// filtering is applied here.
public void acceptNonIndexable(EdgeUrl edgeUrl) {
nonIndexable.add(edgeUrl);
}
}

View File

@ -28,9 +28,9 @@ public class QueryParams {
public static boolean isPermittedParam(String path, String param) {
if (path.endsWith("index.php")) {
if (param.startsWith("showtopic"))
if (param.startsWith("showtopic="))
return true;
if (param.startsWith("showforum"))
if (param.startsWith("showforum="))
return true;
}
if (path.endsWith("viewtopic.php")) {
@ -45,6 +45,10 @@ public class QueryParams {
if (path.endsWith("showforum.php")) {
return param.startsWith("v=");
}
if (path.endsWith("StoryView.py")) { // folklore.org is neat
return param.startsWith("project=") || param.startsWith("story=");
}
return false;
}
}

View File

@ -1,133 +1,181 @@
package nu.marginalia.wmsa.edge.converting.processor.logic.topic;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import nu.marginalia.wmsa.configuration.WmsaHome;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.select.NodeFilter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.function.Predicate;
import java.util.regex.Pattern;
@Singleton
public class AdblockSimulator {
private final Set<String> idRules = new HashSet<>();
List<String> idRules = new ArrayList();
List<String> classRules = new ArrayList();
List<Predicate<String>> scriptRules = new ArrayList();
private final Set<String> classRules = new HashSet<>();
private final List<Predicate<String>> scriptRules = new ArrayList<>();
public AdblockSimulator(Path adsDefinition) throws IOException {
try (var lineStream = Files.lines(adsDefinition)) {
private final Logger logger = LoggerFactory.getLogger(getClass());
// Loads adblock rules from WmsaHome.getAdsDefinition(). If the definition file
// is missing, an error is logged and the rule sets stay empty (hasAds() will
// then always return false) — a deliberate soft failure rather than a crash.
@Inject
public AdblockSimulator() throws IOException {
Path adDef = WmsaHome.getAdsDefinition();
if (!Files.exists(adDef)) {
logger.error("Can not find ads definition file in {}", adDef);
return;
}
try (var lineStream = Files.lines(adDef)) {
lineStream.skip(1).forEach(this::addRule); // skip(1): first line is presumably a format header, not a rule — TODO confirm
}
}
// Runs the loaded adblock rules over the document; returns true as soon as any
// element matches an id, class, or script-src rule (the visitor stops early on
// the first hit).
public boolean hasAds(Document document) {
RuleVisitor ruleVisitor = new RuleVisitor();
document.filter(ruleVisitor);
return ruleVisitor.sawAds;
}
private void addRule(String s) {
try {
if (s.startsWith("##") && !s.contains(":")) {
if (s.startsWith("###")) {
idRules.add(s.substring(3));
} else if (s.startsWith("##.")) {
classRules.add(s.substring(3));
}
} else if (s.startsWith("/^")) {
int end = s.indexOf("[^\\]/");
if (end >= 0) {
String patternString = s.substring(1, end+1);
scriptRules.add(Pattern.compile(patternString).asPredicate());
}
else if (!s.startsWith("!") && !s.contains("#")){
} else if (!s.startsWith("!") && !s.contains("#") && !s.startsWith("@@")) {
if (!s.contains("$")) {
scriptRules.add(toRegexMatcher(s));
}
else if (s.contains("$script") && !s.contains("domain=")) {
scriptRules.add(toRegexMatcher(s.substring(0, s.indexOf('$'))));
}
}
}
catch (Exception ex) {
System.err.println("Failed to add rule " + s);
}
}
private Predicate<String> toRegexMatcher(String s) {
String sOriginal = s;
if (s.isBlank()) return unused -> false;
System.out.println("<-" + s);
// In some cases, regexes aren't necessary
if (s.matches("[&?=/A-Za-z0-9._-]+")) {
if (s.startsWith("/")) {
return str -> str.equals(sOriginal);
}
else {
return str -> str.contains(sOriginal);
}
}
if (s.matches("[&?=/A-Za-z0-9._-]+\\*")) {
return str -> str.startsWith(sOriginal.substring(0, sOriginal.length()-1));
}
String s0 = s;
s = s.replaceAll("\\?", "\\\\?");
s = s.replaceAll("\\.", "\\\\.");
s = s.replaceAll("\\$", "\\\\\\$");
s = s.replaceAll("\\^", "[?/]");
s = s.replaceAll("\\*", ".*");
if (s.startsWith("||")) {
s = s.replaceFirst("\\|\\|","^http(s)?://");
s = s.replaceFirst("\\|\\|","^http[s]?://.*");
}
s = s.replaceAll("\\|", "\\\\|");
s = s.replaceAll("\\*", ".*");
s = s.replaceAll("\\^", "[?/]");
System.out.println("->" + s);
return Pattern.compile(s).asPredicate();
}
// Refrain from cleaning up this code, it's very hot code and needs to be fast.
// This version is about 100x faster than the a "clean" first stab implementation.
class RuleVisitor implements NodeFilter {
public boolean sawAds;
Pattern spPattern = Pattern.compile("\\s");
@Override
public FilterResult head(Node node, int depth) {
if (node.attributesSize() > 0 && node instanceof Element elem) { // instanceof is slow
String id = elem.id();
for (var rule : idRules) {
if (rule.equals(id)) {
if (node.attributesSize() > 0 && node instanceof Element elem) {
if (testId(elem) || testClass(elem) || testScriptTags(elem)) {
sawAds = true;
return FilterResult.STOP;
}
}
return FilterResult.CONTINUE;
}
// Matches a <script> element's src attribute against the script URL rules.
// Non-script elements never match. Kept as a plain loop — per the class comment,
// this is hot code run for every element visited.
private boolean testScriptTags(Element elem) {
if (!"script".equals(elem.tagName())) {
return false;
}
String src = elem.attr("src");
for (var rule : scriptRules) {
if (rule.test(src)) {
return true;
}
}
return false;
}
// Exact-match test of the element's id attribute against the id rule set.
private boolean testId(Element elem) {
String id = elem.id();
return idRules.contains(id);
}
private boolean testClass(Element elem) {
String classes = elem.className();
if (classes.isBlank()) return FilterResult.CONTINUE;
if (classes.isBlank())
return false;
if (classes.indexOf(' ') > 0) {
String[] classNames = spPattern.split(classes);
for (var rule : classRules) {
for (var className : classNames) {
if (className.equals(rule)) {
sawAds = true;
return FilterResult.STOP;
}
}
}
}
else { // tag only has one class
for (var rule : classRules) {
if (classes.equals(rule)) {
sawAds = true;
return FilterResult.STOP;
if (classRules.contains(className))
return true;
}
}
else { // tag only has one class, no need to split
return classRules.contains(classes);
}
if ("script".equals(elem.tagName())) {
String src = elem.attr("src");
for (var rule : scriptRules) {
if (rule.test(src)) {
sawAds = true;
return FilterResult.STOP;
return false;
}
}
}
return FilterResult.CONTINUE;
}
return FilterResult.CONTINUE;
}
// No work on the way back up the tree; all matching happens in head().
@Override
public FilterResult tail(Node node, int depth) {
return FilterResult.CONTINUE;
}
}
public boolean hasAds(Document document) {
RuleVisitor ruleVisitor = new RuleVisitor();
document.filter(ruleVisitor);
return ruleVisitor.sawAds;
}
}

View File

@ -5,17 +5,17 @@ import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import nu.marginalia.wmsa.edge.crawling.model.CrawledDocument;
import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.*;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;
public class CrawledDomainReader {
private final Gson gson = new GsonBuilder().create();
private static final Logger logger = LoggerFactory.getLogger(CrawledDomainReader.class);
public CrawledDomainReader() {
}
@ -23,7 +23,17 @@ public class CrawledDomainReader {
public CrawledDomain read(Path path) throws IOException {
List<CrawledDocument> docs = new ArrayList<>();
CrawledDomain domain = null;
try (var br = new BufferedReader(new InputStreamReader(new ZstdInputStream(new BufferedInputStream(new FileInputStream(path.toFile())))))) {
try (var br = new BufferedReader(new InputStreamReader(new ZstdInputStream(new FileInputStream(path.toFile()))))) {
br.mark(2);
boolean legacy = '{' == br.read();
br.reset();
if (legacy) {
domain = gson.fromJson(br, CrawledDomain.class);
}
else {
String line;
while ((line = br.readLine()) != null) {
if (line.startsWith("//")) {
@ -35,17 +45,23 @@ public class CrawledDomainReader {
} else if (line.equals(CrawledDocument.SERIAL_IDENTIFIER)) {
docs.add(gson.fromJson(nextLine, CrawledDocument.class));
}
}
else if (line.charAt(0) == '{') {
} else if (line.charAt(0) == '{') {
domain = gson.fromJson(line, CrawledDomain.class);
}
}
}
}
if (domain == null) {
return null;
}
if (!docs.isEmpty()) {
if (domain.doc == null)
domain.doc = new ArrayList<>();
domain.doc.addAll(docs);
}
return domain;
}

View File

@ -11,7 +11,7 @@ public enum IndexBlock {
Meta(7, 7),
PositionWords(8, 4.5),
NamesWords(9, 5),
Unused(10, 10),
Artifacts(10, 10),
Topic(11, 0.5);
public final int id;

View File

@ -132,6 +132,8 @@ public class EdgeUrlDetails {
public boolean isCookies() {
return HtmlFeature.hasFeature(features, HtmlFeature.COOKIES);
}
// True if the advertisement feature bit (special:ads) is set for this result.
public boolean isAds() { return HtmlFeature.hasFeature(features, HtmlFeature.ADVERTISEMENT); }
public boolean isSpecialDomain() {
return domainState == EdgeDomainIndexingState.SPECIAL;
}

View File

@ -19,7 +19,7 @@ public class AdblockTesterTool {
static {
try {
simulator = new AdblockSimulator(Path.of("/home/vlofgren/easylist.txt"));
simulator = new AdblockSimulator();
} catch (IOException e) {
throw new RuntimeException(e);
}
@ -29,7 +29,6 @@ public class AdblockTesterTool {
public static void main(String... args) throws IOException {
EdgeCrawlPlan plan = new CrawlPlanLoader().load(Path.of(args[0]));
try (var iterable = plan.domainsIterable()) {
for (var domain : iterable) {
processDomain(domain);

View File

@ -0,0 +1,56 @@
package nu.marginalia.wmsa.edge.tools;
import com.google.inject.Guice;
import com.google.inject.Inject;
import com.google.inject.Injector;
import nu.marginalia.wmsa.edge.converting.ConverterModule;
import nu.marginalia.wmsa.edge.converting.processor.DomainProcessor;
import nu.marginalia.wmsa.edge.crawling.CrawlPlanLoader;
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
import nu.marginalia.wmsa.edge.model.EdgeCrawlPlan;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.nio.file.Path;
// One-off diagnostic tool: runs the converter over a crawl plan and prints each
// document's Artifacts-block keywords (e.g. synthesized filename keywords).
public class ConverterLogicTestTool {
private final Logger logger = LoggerFactory.getLogger(getClass());
// Usage: ConverterLogicTestTool crawl-plan.yaml
public static void main(String... args) throws IOException {
if (args.length != 1) {
System.err.println("Arguments: crawl-plan.yaml");
System.exit(0);
}
var plan = new CrawlPlanLoader().load(Path.of(args[0]));
// Guice wires DomainProcessor and its dependencies from the crawl plan
Injector injector = Guice.createInjector(
new ConverterModule(plan)
);
// All processing happens in the injected constructor below
injector.getInstance(ConverterLogicTestTool.class);
}
@Inject
public ConverterLogicTestTool(
EdgeCrawlPlan plan,
DomainProcessor processor
) throws Exception {
plan.forEachCrawledDomain(domain -> {
var ret = processor.process(domain);
ret.documents.forEach(doc -> {
if (doc.words == null)
return;
var artifacts = doc.words.get(IndexBlock.Artifacts);
if (artifacts.size() > 0) {
System.out.println(doc.url + ": " + artifacts);
}
});
});
}
}

View File

@ -1,13 +1,8 @@
package nu.marginalia.wmsa.edge.tools;
import nu.marginalia.util.language.conf.LanguageModels;
import nu.marginalia.util.language.processing.SentenceExtractor;
import nu.marginalia.util.language.processing.model.DocumentLanguageData;
import nu.marginalia.wmsa.configuration.WmsaHome;
import lombok.SneakyThrows;
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
import nu.marginalia.wmsa.edge.converting.processor.logic.topic.RecipeDetector;
import nu.marginalia.wmsa.edge.converting.processor.logic.topic.TextileCraftDetector;
import nu.marginalia.wmsa.edge.converting.processor.logic.topic.WoodworkingDetector;
import nu.marginalia.wmsa.edge.converting.processor.logic.topic.AdblockSimulator;
import nu.marginalia.wmsa.edge.crawling.CrawlPlanLoader;
import nu.marginalia.wmsa.edge.crawling.model.CrawledDocument;
import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain;
@ -20,21 +15,24 @@ import java.nio.file.Path;
import java.sql.SQLException;
import java.util.HashSet;
import java.util.Set;
import java.util.concurrent.ForkJoinPool;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.*;
import static nu.marginalia.wmsa.edge.converting.processor.DocumentProcessor.isAcceptedContentType;
public class RecipeDetectorTool {
private static final TextileCraftDetector textileCraftDetector = new TextileCraftDetector();
private static final WoodworkingDetector woodworkingDetector = new WoodworkingDetector();
private static final RecipeDetector recipeDetector = new RecipeDetector();
public class CrawlDataExtractorTool {
private static final AdblockSimulator abs;
private static final LanguageModels lm = WmsaHome.getLanguageModels();
private static final SentenceExtractor sentenceExtractor = new SentenceExtractor(lm);
static {
try {
abs = new AdblockSimulator();
} catch (IOException e) {
throw new RuntimeException(e);
}
}
private static final Set<String> urls = new HashSet<>(50_000_000);
@SneakyThrows
public static void main(String... args) throws IOException {
EdgeCrawlPlan plan = new CrawlPlanLoader().load(Path.of(args[0]));
DatabaseModule module = new DatabaseModule();
@ -51,15 +49,25 @@ public class RecipeDetectorTool {
ex.printStackTrace();
}
ForkJoinPool pool = new ForkJoinPool(16);
LinkedBlockingQueue<Runnable> queue = new LinkedBlockingQueue<>(10);
ExecutorService pool = new ThreadPoolExecutor(10, 20, 5, TimeUnit.MINUTES, queue);
Semaphore sem = new Semaphore(20);
try (var iterable = plan.domainsIterable()) {
for (var domain : iterable) {
pool.execute(() -> processDomain(domain));
sem.acquire();
pool.execute(() -> {
try { processDomain(domain); }
finally { sem.release(); }
});
}
} catch (InterruptedException e) {
throw new RuntimeException(e);
}
while (!pool.awaitQuiescence(1, TimeUnit.HOURS));
pool.shutdown();
while (!pool.awaitTermination(1, TimeUnit.MINUTES));
}
private static void processDomain(CrawledDomain domain) {
@ -78,24 +86,8 @@ public class RecipeDetectorTool {
private static void processDocument(CrawledDocument doc) {
Document parsedDocument = Jsoup.parse(doc.documentBody);
parsedDocument.getElementsByTag("a").remove();
parsedDocument.getElementsByTag("nav").remove();
DocumentLanguageData dld = sentenceExtractor.extractSentences(parsedDocument);
double prob = 100*recipeDetector.testP(dld);
if (prob > 50) {
System.out.printf("#%3.2f recipe\t%s\n%s\n", prob, parsedDocument.title(), doc.url);
}
prob = 100*woodworkingDetector.testP(dld);
if (prob > 20) {
System.out.printf("#%3.2f woodworking\t%s\n%s\n", prob, parsedDocument.title(), doc.url);
}
prob = 100*textileCraftDetector.testP(dld);
if (prob > 20) {
System.out.printf("#%3.2f textilecraft\t%s\n%s\n", prob, parsedDocument.title(), doc.url);
if (abs.hasAds(parsedDocument)) {
System.out.println(doc.url);
}
}
}

View File

@ -3,6 +3,7 @@ package nu.marginalia.wmsa.edge.tools;
import com.zaxxer.hikari.HikariDataSource;
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
import nu.marginalia.wmsa.configuration.server.Context;
import nu.marginalia.wmsa.edge.converting.processor.logic.HtmlFeature;
import nu.marginalia.wmsa.edge.index.client.EdgeIndexClient;
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
import nu.marginalia.wmsa.edge.model.EdgeId;
@ -20,19 +21,20 @@ import java.util.List;
import java.util.Map;
import java.util.Objects;
import static nu.marginalia.wmsa.edge.converting.processor.logic.HtmlFeature.CATEGORY_FOOD;
public class RecipesLoaderTool {
public class FeaturesLoaderTool {
public static void main(String... args) {
HtmlFeature feature = HtmlFeature.valueOf(args[0]);
Path file = Path.of(args[1]);
try (EdgeIndexClient client = new EdgeIndexClient();
HikariDataSource ds = new DatabaseModule().provideConnection();
Connection conn = ds.getConnection();
PreparedStatement ps = conn.prepareStatement("UPDATE EC_PAGE_DATA SET FEATURES = FEATURES | ? WHERE ID=?");
var linesStream = Files.lines(Path.of(args[0]))) {
var linesStream = Files.lines(file)) {
var urls = getUrls(ds);
var wordSet = new EdgePageWordSet(new EdgePageWords(IndexBlock.Meta, List.of(CATEGORY_FOOD.getKeyword())));
var wordSet = new EdgePageWordSet(new EdgePageWords(IndexBlock.Meta, List.of(feature.getKeyword())));
linesStream
.map(urls::get)
.filter(Objects::nonNull)
@ -42,7 +44,7 @@ public class RecipesLoaderTool {
try {
ps.setInt(2, urlId);
ps.setInt(1, CATEGORY_FOOD.getFeatureBit());
ps.setInt(1, feature.getFeatureBit());
ps.executeUpdate();
}
catch (SQLException ex) {

View File

@ -3,6 +3,7 @@
{{#if media}}<abbr title="audio or video" class="meta">🎞️</abbr>{{/if}}
{{#if affiliate}}<abbr title="possible amazon affiliate link (experimental; unreliable)" class="meta">💳️</abbr>{{/if}}
{{#if cookies}}<abbr title="cookies" class="meta">👁️️</abbr>{{/if}}
{{#if ads}}<abbr title="ads (experimental)" class="meta">⚠️️️</abbr>{{/if}}
<span class="meta">{{format}}</span>
{{#unless focusDomain}}
<span class="rank-symbol" title="{{rankingSymbolDesc}}">{{{rankingSymbol}}}</span>