From 37a7296759cf79ef7c4316b40b99b0621502c018 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Sat, 17 Feb 2024 14:32:36 +0100 Subject: [PATCH] (sideload) Clean up the sideloading code Clean up the sideloading code a bit, making the Reddit sideloader use the more sophisticated SideloaderProcessing approach to sideloading, instead of mimicking StackexchangeSideloader's cruder approach. The reddit sideloader now uses the SideloaderProcessing class. It also properly sets js-attributes for the sideloaded documents. The control GUI now also filters the upload directory items based on name, and disables the items that do not have appropriate filenames. --- .../executor/upload/UploadDirItem.java | 25 +++ .../converting/processor/DocumentClass.java | 4 +- .../sideload/SideloadSourceFactory.java | 3 +- .../sideload/SideloaderProcessing.java | 23 +-- .../sideload/dirtree/DirtreeSideloader.java | 4 + .../EncyclopediaMarginaliaNuSideloader.java | 5 + .../sideload/reddit/RedditSideloader.java | 158 ++++++++---------- .../StackexchangeSideloader.java | 3 + .../sideload/warc/WarcSideloader.java | 9 +- .../actions/partial-sideload-encyclopedia.hdb | 4 +- .../partial-sideload-stackexchange.hdb | 4 +- .../node/actions/partial-sideload-warc.hdb | 4 +- 12 files changed, 139 insertions(+), 107 deletions(-) diff --git a/code/api/executor-api/src/main/java/nu/marginalia/executor/upload/UploadDirItem.java b/code/api/executor-api/src/main/java/nu/marginalia/executor/upload/UploadDirItem.java index c1953f8f..a00344b2 100644 --- a/code/api/executor-api/src/main/java/nu/marginalia/executor/upload/UploadDirItem.java +++ b/code/api/executor-api/src/main/java/nu/marginalia/executor/upload/UploadDirItem.java @@ -6,4 +6,29 @@ public record UploadDirItem ( boolean isDirectory, long size ) { + + public boolean isZim() { + if (name.endsWith(".zim")) + return true; + if (name.contains(".zim.") && name.endsWith(".db")) + return true; + return false; + } + + public boolean isStackexchange7z() { + if 
(name.endsWith(".7z")) + return true; + if (name.contains(".7z.") && name.endsWith(".db")) + return true; + return isDirectory; + } + + public boolean isWarc() { + if (name.endsWith(".warc")) + return true; + if (name.contains(".warc.gz")) + return true; + return isDirectory; + } + } diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DocumentClass.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DocumentClass.java index 408d3105..e02c35a0 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DocumentClass.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DocumentClass.java @@ -7,7 +7,9 @@ public enum DocumentClass { NORMAL, EXTERNALLY_LINKED_ONCE, EXTERNALLY_LINKED_MULTI, - /** A document that is not linked to, but is sideloaded. Ignore most inclusion checks. */ + /** A document that is not known to be linked to, + * but is sideloaded. 
This excludes most inclusion + * checks and always loads the document as-is */ SIDELOAD; public boolean enforceQualityLimits() { diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloadSourceFactory.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloadSourceFactory.java index 18cf0104..8c6e92d2 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloadSourceFactory.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloadSourceFactory.java @@ -65,7 +65,8 @@ public class SideloadSourceFactory { public SideloadSource sideloadReddit(Path pathToDbFiles) throws IOException { return sideload(pathToDbFiles, new PathSuffixPredicate(".db"), - (List paths) -> new RedditSideloader(paths, sentenceExtractorProvider, documentKeywordExtractor)); + (List paths) -> new RedditSideloader(paths, + anchorTagsSourceFactory, anchorTextKeywords, sideloaderProcessing)); } public Collection sideloadStackexchange(Path pathToDbFileRoot) throws IOException { diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloaderProcessing.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloaderProcessing.java index f888a6b0..32a0ec62 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloaderProcessing.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloaderProcessing.java @@ -36,9 +36,11 @@ public class SideloaderProcessing { List extraKeywords, DomainLinks domainLinks, GeneratorType type, + DocumentClass documentClass, + int pubYear, int size) throws URISyntaxException { var crawledDoc = new CrawledDocument( - "encyclopedia.marginalia.nu", + "synthetic", url, "text/html", LocalDateTime.now().toString(), @@ -59,9 +61,6 @@ public class 
SideloaderProcessing { // Give the document processing preferential treatment if this is a sideloaded wiki, since we // truncate the document to the first paragraph, which typically is too short to be included // on its own. - final DocumentClass documentClass; - if (type == GeneratorType.WIKI) documentClass = DocumentClass.SIDELOAD; - else documentClass = DocumentClass.NORMAL; var ret = new ProcessedDocument(); try { @@ -72,11 +71,13 @@ public class SideloaderProcessing { for (String keyword : extraKeywords) ret.words.add(keyword, WordFlags.Subjects.asBit()); - if (type == GeneratorType.WIKI) - ret.words.add("generator:wiki", WordFlags.Subjects.asBit()); - else if (type == GeneratorType.DOCS) - ret.words.add("generator:docs", WordFlags.Subjects.asBit()); - + if (type == GeneratorType.WIKI) { + ret.words.addAllSyntheticTerms(List.of("generator:wiki")); + } else if (type == GeneratorType.DOCS) { + ret.words.addAllSyntheticTerms(List.of("generator:docs")); + } else if (type == GeneratorType.FORUM) { + ret.words.addAllSyntheticTerms(List.of("generator:forum")); + } ret.details = details.details(); // Add a few things that we know about the document @@ -84,14 +85,14 @@ public class SideloaderProcessing { // so stripped down ret.details.standard = HtmlStandard.HTML5; - ret.details.pubYear = LocalDateTime.now().getYear(); + ret.details.pubYear = pubYear; ret.details.features.add(HtmlFeature.JS); ret.details.features.add(HtmlFeature.TRACKING); ret.details.quality = -4.5; ret.details.generator = type; ret.details.metadata = new DocumentMetadata(3, - PubDate.toYearByte(ret.details.pubYear), + PubDate.toYearByte(pubYear), (int) -ret.details.quality, switch (type) { case WIKI -> EnumSet.of(DocumentFlags.GeneratorWiki, DocumentFlags.Sideloaded); diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/dirtree/DirtreeSideloader.java 
b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/dirtree/DirtreeSideloader.java index a8e729e3..252f9086 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/dirtree/DirtreeSideloader.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/dirtree/DirtreeSideloader.java @@ -5,6 +5,7 @@ import nu.marginalia.atags.model.DomainLinks; import nu.marginalia.converting.model.GeneratorType; import nu.marginalia.converting.model.ProcessedDocument; import nu.marginalia.converting.model.ProcessedDomain; +import nu.marginalia.converting.processor.DocumentClass; import nu.marginalia.converting.sideload.SideloadSource; import nu.marginalia.converting.sideload.SideloaderProcessing; import nu.marginalia.model.EdgeDomain; @@ -13,6 +14,7 @@ import nu.marginalia.model.crawl.DomainIndexingState; import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; +import java.time.LocalDate; import java.util.Iterator; import java.util.List; import java.util.stream.Stream; @@ -83,6 +85,8 @@ public class DirtreeSideloader implements SideloadSource, AutoCloseable { return sideloaderProcessing .processDocument(url, body, extraKeywords, new DomainLinks(), GeneratorType.DOCS, + DocumentClass.NORMAL, + LocalDate.now().getYear(), 10_000); } diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/encyclopedia/EncyclopediaMarginaliaNuSideloader.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/encyclopedia/EncyclopediaMarginaliaNuSideloader.java index 009adf3a..8f36e6a1 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/encyclopedia/EncyclopediaMarginaliaNuSideloader.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/encyclopedia/EncyclopediaMarginaliaNuSideloader.java @@ -10,6 +10,7 @@ import 
nu.marginalia.converting.model.DisqualifiedException; import nu.marginalia.converting.model.GeneratorType; import nu.marginalia.converting.model.ProcessedDocument; import nu.marginalia.converting.model.ProcessedDomain; +import nu.marginalia.converting.processor.DocumentClass; import nu.marginalia.converting.sideload.SideloadSource; import nu.marginalia.converting.sideload.SideloaderProcessing; import nu.marginalia.model.EdgeDomain; @@ -27,6 +28,7 @@ import java.net.URLEncoder; import java.nio.charset.StandardCharsets; import java.nio.file.Path; import java.sql.*; +import java.time.LocalDate; import java.util.Iterator; import java.util.List; @@ -115,6 +117,7 @@ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoC .append("") .append(title) .append("") + .append("") .append("
"); for (String part : parts) { @@ -131,6 +134,8 @@ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoC List.of("encyclopedia", "wiki"), domainLinks, GeneratorType.WIKI, + DocumentClass.SIDELOAD, + LocalDate.now().getYear(), 10_000_000); // Add anchor text keywords diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/reddit/RedditSideloader.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/reddit/RedditSideloader.java index 97e519d6..24706317 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/reddit/RedditSideloader.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/reddit/RedditSideloader.java @@ -1,34 +1,28 @@ package nu.marginalia.converting.sideload.reddit; +import nu.marginalia.atags.AnchorTextKeywords; import nu.marginalia.atags.model.DomainLinks; +import nu.marginalia.atags.source.AnchorTagsSourceFactory; import nu.marginalia.converting.model.GeneratorType; import nu.marginalia.converting.model.ProcessedDocument; -import nu.marginalia.converting.model.ProcessedDocumentDetails; import nu.marginalia.converting.model.ProcessedDomain; +import nu.marginalia.converting.processor.DocumentClass; import nu.marginalia.converting.sideload.SideloadSource; +import nu.marginalia.converting.sideload.SideloaderProcessing; import nu.marginalia.integration.reddit.db.RedditDb; -import nu.marginalia.keyword.DocumentKeywordExtractor; -import nu.marginalia.language.sentence.ThreadLocalSentenceExtractorProvider; import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.crawl.DomainIndexingState; -import nu.marginalia.model.crawl.HtmlFeature; -import nu.marginalia.model.crawl.PubDate; -import nu.marginalia.model.crawl.UrlIndexingState; -import nu.marginalia.model.html.HtmlStandard; -import nu.marginalia.model.idx.DocumentFlags; -import 
nu.marginalia.model.idx.DocumentMetadata; import nu.marginalia.model.idx.WordFlags; import nu.marginalia.util.ProcessingIterator; import org.apache.commons.lang3.StringUtils; -import org.jsoup.Jsoup; import java.net.URISyntaxException; import java.nio.file.Path; import java.time.Instant; import java.time.LocalDate; import java.time.ZoneOffset; -import java.util.EnumSet; +import java.util.ArrayList; import java.util.Iterator; import java.util.List; @@ -36,15 +30,18 @@ public class RedditSideloader implements SideloadSource { private static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(RedditSideloader.class); private final List dbFiles; - private final ThreadLocalSentenceExtractorProvider sentenceExtractorProvider; - private final DocumentKeywordExtractor keywordExtractor; + private final AnchorTagsSourceFactory anchorTagsSourceFactory; + private final AnchorTextKeywords anchorTextKeywords; + private final SideloaderProcessing sideloaderProcessing; public RedditSideloader(List listToDbFiles, - ThreadLocalSentenceExtractorProvider sentenceExtractorProvider, - DocumentKeywordExtractor keywordExtractor) { + AnchorTagsSourceFactory anchorTagsSourceFactory, + AnchorTextKeywords anchorTextKeywords, + SideloaderProcessing sideloaderProcessing) { this.dbFiles = listToDbFiles; - this.sentenceExtractorProvider = sentenceExtractorProvider; - this.keywordExtractor = keywordExtractor; + this.anchorTagsSourceFactory = anchorTagsSourceFactory; + this.anchorTextKeywords = anchorTextKeywords; + this.sideloaderProcessing = sideloaderProcessing; } @Override @@ -115,81 +112,68 @@ public class RedditSideloader implements SideloadSource { DomainLinks domainLinks) throws URISyntaxException { String fullUrl = "https://old.reddit.com" + permalink; - StringBuilder fullHtml = new StringBuilder(); - fullHtml.append("").append(title).append(""); - fullHtml.append("

").append(title).append("

"); - fullHtml.append("

").append(body).append("

"); - fullHtml.append(""); + int pubYear = LocalDate + .ofInstant(Instant.ofEpochSecond(createdUtc), ZoneOffset.UTC) + .getYear(); - var ret = new ProcessedDocument(); - try { + String fullHtml = STR.""" + + + + \{title} + + + +

\{title}

+
+

\{body}

+
+ + + """; - var url = new EdgeUrl(fullUrl); - var doc = Jsoup.parse(fullHtml.toString()); - var dld = sentenceExtractorProvider.get().extractSentences(doc); + List extraKeywords = new ArrayList<>(); - ret.url = url; - ret.words = keywordExtractor.extractKeywords(dld, url); + extraKeywords.add("reddit"); + extraKeywords.add(subreddit); + extraKeywords.add("r/" + subreddit); - ret.words.addAllSyntheticTerms(List.of( - "js:true", - "site:reddit.com", - "site:old.reddit.com", - "site:www.reddit.com", - "special:ads", - "special:tracking", - "generator:forum", - subreddit - )); - - ret.words.add(subreddit, WordFlags.Subjects.asBit()); - ret.words.add("reddit", - WordFlags.ExternalLink.asBit() - | WordFlags.Subjects.asBit() - | WordFlags.Synthetic.asBit() - | WordFlags.NamesWords.asBit()); - ret.words.add(subreddit.toLowerCase(), - WordFlags.ExternalLink.asBit() - | WordFlags.NamesWords.asBit() - | WordFlags.Synthetic.asBit() - ); - if (!"[deleted]".equals(author)) - ret.words.add(author, WordFlags.NamesWords.asBit() | WordFlags.Synthetic.asBit()); - - var date = LocalDate.ofInstant( - Instant.ofEpochSecond(createdUtc), - ZoneOffset.UTC); - int year = date.getYear(); - - ret.details = new ProcessedDocumentDetails(); - ret.details.pubYear = year; - ret.details.quality = -5; - ret.details.metadata = new DocumentMetadata(3, - PubDate.toYearByte(year), - (int) -ret.details.quality, - EnumSet.of(DocumentFlags.GeneratorForum)); - ret.details.features = EnumSet.of(HtmlFeature.JS, HtmlFeature.TRACKING); - - ret.details.metadata.withSizeAndTopology(10000, score); - - ret.details.generator = GeneratorType.DOCS; - ret.details.title = StringUtils.truncate(STR."[/r/\{subreddit}] \{title}", 128); - ret.details.description = StringUtils.truncate(body, 255); - ret.details.length = 128; - - ret.details.standard = HtmlStandard.HTML5; - ret.details.feedLinks = List.of(); - ret.details.linksExternal = List.of(); - ret.details.linksInternal = List.of(); - ret.state = UrlIndexingState.OK; - 
ret.stateReason = "SIDELOAD"; + if (!StringUtils.isBlank(author) && !author.equals("[deleted]")) { + extraKeywords.add(author); } - catch (Exception e) { - logger.warn("Failed to process document", e); - ret.url = new EdgeUrl(fullUrl); - ret.state = UrlIndexingState.DISQUALIFIED; - ret.stateReason = "SIDELOAD"; + + var doc = sideloaderProcessing + .processDocument(fullUrl, + fullHtml, + List.of("encyclopedia", "wiki"), + domainLinks, + GeneratorType.WIKI, + DocumentClass.SIDELOAD, + pubYear, + 10_000_000); + + + if (doc.isProcessedFully()) { + for (String url : List.of( + STR."https://old.reddit.com/r/\{permalink}", + STR."https://www.reddit.com/r/\{permalink}", + STR."https://reddit.com/r/\{permalink}" + )) { + EdgeUrl.parse(url) + .map(parsed -> anchorTextKeywords.getAnchorTextKeywords(domainLinks, parsed)) + .filter(parsed -> !parsed.isEmpty()) + .ifPresent(doc.words::addAnchorTerms); + } + + for (var keyword : extraKeywords) { + doc.words.add(keyword, WordFlags.Subjects.asBit()); + } + + // Insert topology information + doc.details.metadata.withSizeAndTopology(50_000_000, score); } - return ret; + + + return doc; }; } diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/stackexchange/StackexchangeSideloader.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/stackexchange/StackexchangeSideloader.java index 46ad47e2..53be14aa 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/stackexchange/StackexchangeSideloader.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/stackexchange/StackexchangeSideloader.java @@ -105,6 +105,9 @@ public class StackexchangeSideloader implements SideloadSource { StringBuilder fullHtml = new StringBuilder(); fullHtml.append("").append(post.title()).append(""); + // Add a bogus script tag to make sure we get the JS flag + fullHtml.append(""); + fullHtml.append("

").append(post.title()).append("

"); for (var comment : post.bodies()) { fullHtml.append("

").append(comment).append("

"); diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/warc/WarcSideloader.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/warc/WarcSideloader.java index edb670fa..791f0665 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/warc/WarcSideloader.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/warc/WarcSideloader.java @@ -7,6 +7,7 @@ import nu.marginalia.contenttype.DocumentBodyToString; import nu.marginalia.converting.model.GeneratorType; import nu.marginalia.converting.model.ProcessedDocument; import nu.marginalia.converting.model.ProcessedDomain; +import nu.marginalia.converting.processor.DocumentClass; import nu.marginalia.converting.sideload.SideloadSource; import nu.marginalia.converting.sideload.SideloaderProcessing; import nu.marginalia.model.EdgeDomain; @@ -19,6 +20,7 @@ import org.slf4j.LoggerFactory; import java.io.IOException; import java.net.URISyntaxException; import java.nio.file.Path; +import java.time.LocalDate; import java.util.Iterator; import java.util.List; import java.util.Objects; @@ -130,8 +132,13 @@ public class WarcSideloader implements SideloadSource, AutoCloseable { } return Optional.of(sideloaderProcessing - .processDocument(url, body.get(), List.of(), new DomainLinks(), + .processDocument(url, + body.get(), + List.of(), + new DomainLinks(), GeneratorType.DOCS, + DocumentClass.SIDELOAD, + LocalDate.now().getYear(), // TODO: This should be the actual year of the document 10_000)); } diff --git a/code/services-core/control-service/src/main/resources/templates/control/node/actions/partial-sideload-encyclopedia.hdb b/code/services-core/control-service/src/main/resources/templates/control/node/actions/partial-sideload-encyclopedia.hdb index 39d5c686..388d970d 100644 --- 
a/code/services-core/control-service/src/main/resources/templates/control/node/actions/partial-sideload-encyclopedia.hdb +++ b/code/services-core/control-service/src/main/resources/templates/control/node/actions/partial-sideload-encyclopedia.hdb @@ -20,8 +20,8 @@ FilenameSizeLast Modified {{#each uploadDirContents.items}} - - + + {{#unless directory}}{{size}}{{/unless}} diff --git a/code/services-core/control-service/src/main/resources/templates/control/node/actions/partial-sideload-stackexchange.hdb b/code/services-core/control-service/src/main/resources/templates/control/node/actions/partial-sideload-stackexchange.hdb index f5f73f84..adaa1040 100644 --- a/code/services-core/control-service/src/main/resources/templates/control/node/actions/partial-sideload-stackexchange.hdb +++ b/code/services-core/control-service/src/main/resources/templates/control/node/actions/partial-sideload-stackexchange.hdb @@ -13,8 +13,8 @@ information how to do this. FilenameSizeLast Modified {{#each uploadDirContents.items}} - - + + {{#unless directory}}{{size}}{{/unless}} diff --git a/code/services-core/control-service/src/main/resources/templates/control/node/actions/partial-sideload-warc.hdb b/code/services-core/control-service/src/main/resources/templates/control/node/actions/partial-sideload-warc.hdb index 7680b7b8..7d728992 100644 --- a/code/services-core/control-service/src/main/resources/templates/control/node/actions/partial-sideload-warc.hdb +++ b/code/services-core/control-service/src/main/resources/templates/control/node/actions/partial-sideload-warc.hdb @@ -12,9 +12,9 @@ A warc export can be created using e.g. wget:

FilenameSizeLast Modified {{#each uploadDirContents.items}} - + - + {{#unless directory}}{{size}}{{/unless}} {{shortTimestamp lastModifiedTime}}