(sideload) Clean up the sideloading code

Clean up the sideloading code a bit, making the Reddit sideloader use the more sophisticated SideloaderProcessing approach to sideloading instead of mimicking StackexchangeSideloader's cruder approach.

The Reddit sideloader now uses the SideloaderProcessing class; a rough sketch of the new call shape follows the commit metadata below. It also properly sets js-attributes for the sideloaded documents.

The control GUI now also filters the upload directory items based on name, and disables the items that do not have appropriate filenames.
commit 37a7296759
parent ebbe49d17b
Author: Viktor Lofgren
Date:   2024-02-17 14:32:36 +01:00

12 changed files with 139 additions and 107 deletions
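In rough terms, the Reddit sideloader no longer assembles a ProcessedDocument field by field, but renders a small synthetic HTML page and hands it to SideloaderProcessing. A sketch of the new call shape, for orientation only (parameter names mirror the diffs below; the keyword list and size value here are illustrative rather than the actual arguments):

        // Sketch only; see the RedditSideloader diff further down for the real call.
        var doc = sideloaderProcessing.processDocument(
                fullUrl,                 // synthetic https://old.reddit.com/... URL
                fullHtml,                // generated HTML, including a dummy <script> tag so the JS flag gets set
                List.of("reddit"),       // extra keywords (illustrative)
                domainLinks,
                GeneratorType.WIKI,
                DocumentClass.SIDELOAD,  // skips most inclusion checks for sideloaded content
                pubYear,                 // derived from the post's creation timestamp
                10_000_000);             // size hint (illustrative)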

View File

@@ -6,4 +6,29 @@ public record UploadDirItem (
         boolean isDirectory,
         long size
 ) {
+
+    public boolean isZim() {
+        if (name.endsWith(".zim"))
+            return true;
+        if (name.contains(".zim.") && name.endsWith(".db"))
+            return true;
+        return false;
+    }
+
+    public boolean isStackexchange7z() {
+        if (name.endsWith(".7z"))
+            return true;
+        if (name.contains(".7z.") && name.endsWith(".db"))
+            return true;
+        return isDirectory;
+    }
+
+    public boolean isWarc() {
+        if (name.endsWith(".warc"))
+            return true;
+        if (name.contains(".warc.gz"))
+            return true;
+        return isDirectory;
+    }
+
 }
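As a usage illustration (the filenames below are invented; the logic is simply a restatement of the predicates above):

    // Hypothetical example: the ZIM check accepts both raw .zim files and .zim-derived .db files.
    String name = "wikipedia_en.zim.db";
    boolean zim = name.endsWith(".zim")
            || (name.contains(".zim.") && name.endsWith(".db"));   // true for this name
    // Likewise, "dump.7z", "askubuntu.7z.db" and directories satisfy isStackexchange7z(),
    // while "crawl.warc", "crawl.warc.gz" and directories satisfy isWarc(); directories never satisfy isZim().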

View File

@@ -7,7 +7,9 @@ public enum DocumentClass {
     NORMAL,
     EXTERNALLY_LINKED_ONCE,
     EXTERNALLY_LINKED_MULTI,
-    /** A document that is not linked to, but is sideloaded. Ignore most inclusion checks. */
+    /** A document that is not known to be linked to,
+     *  but is sideloaded.  This excludes most inclusion
+     *  checks and always loads the document as-is */
     SIDELOAD;
 
     public boolean enforceQualityLimits() {

View File

@@ -65,7 +65,8 @@ public class SideloadSourceFactory {
     public SideloadSource sideloadReddit(Path pathToDbFiles) throws IOException {
         return sideload(pathToDbFiles,
                 new PathSuffixPredicate(".db"),
-                (List<Path> paths) -> new RedditSideloader(paths, sentenceExtractorProvider, documentKeywordExtractor));
+                (List<Path> paths) -> new RedditSideloader(paths,
+                        anchorTagsSourceFactory, anchorTextKeywords, sideloaderProcessing));
     }
 
     public Collection<? extends SideloadSource> sideloadStackexchange(Path pathToDbFileRoot) throws IOException {

View File

@@ -36,9 +36,11 @@ public class SideloaderProcessing {
                                              List<String> extraKeywords,
                                              DomainLinks domainLinks,
                                              GeneratorType type,
+                                             DocumentClass documentClass,
+                                             int pubYear,
                                              int size) throws URISyntaxException {
         var crawledDoc = new CrawledDocument(
-                "encyclopedia.marginalia.nu",
+                "synthetic",
                 url,
                 "text/html",
                 LocalDateTime.now().toString(),
@@ -59,9 +61,6 @@
         // Give the document processing preferential treatment if this is a sideloaded wiki, since we
         // truncate the document to the first paragraph, which typically is too short to be included
         // on its own.
-        final DocumentClass documentClass;
-        if (type == GeneratorType.WIKI) documentClass = DocumentClass.SIDELOAD;
-        else documentClass = DocumentClass.NORMAL;
 
         var ret = new ProcessedDocument();
         try {
@@ -72,11 +71,13 @@
             for (String keyword : extraKeywords)
                 ret.words.add(keyword, WordFlags.Subjects.asBit());
 
-            if (type == GeneratorType.WIKI)
-                ret.words.add("generator:wiki", WordFlags.Subjects.asBit());
-            else if (type == GeneratorType.DOCS)
-                ret.words.add("generator:docs", WordFlags.Subjects.asBit());
-
+            if (type == GeneratorType.WIKI) {
+                ret.words.addAllSyntheticTerms(List.of("generator:wiki"));
+            } else if (type == GeneratorType.DOCS) {
+                ret.words.addAllSyntheticTerms(List.of("generator:docs"));
+            } else if (type == GeneratorType.FORUM) {
+                ret.words.addAllSyntheticTerms(List.of("generator:forum"));
+            }
             ret.details = details.details();
 
             // Add a few things that we know about the document
@@ -84,14 +85,14 @@
             // so stripped down
             ret.details.standard = HtmlStandard.HTML5;
-            ret.details.pubYear = LocalDateTime.now().getYear();
+            ret.details.pubYear = pubYear;
             ret.details.features.add(HtmlFeature.JS);
             ret.details.features.add(HtmlFeature.TRACKING);
             ret.details.quality = -4.5;
             ret.details.generator = type;
 
             ret.details.metadata = new DocumentMetadata(3,
-                    PubDate.toYearByte(ret.details.pubYear),
+                    PubDate.toYearByte(pubYear),
                     (int) -ret.details.quality,
                     switch (type) {
                         case WIKI -> EnumSet.of(DocumentFlags.GeneratorWiki, DocumentFlags.Sideloaded);

View File

@@ -5,6 +5,7 @@ import nu.marginalia.atags.model.DomainLinks;
 import nu.marginalia.converting.model.GeneratorType;
 import nu.marginalia.converting.model.ProcessedDocument;
 import nu.marginalia.converting.model.ProcessedDomain;
+import nu.marginalia.converting.processor.DocumentClass;
 import nu.marginalia.converting.sideload.SideloadSource;
 import nu.marginalia.converting.sideload.SideloaderProcessing;
 import nu.marginalia.model.EdgeDomain;
@@ -13,6 +14,7 @@ import nu.marginalia.model.crawl.DomainIndexingState;
 import java.io.IOException;
 import java.nio.file.Files;
 import java.nio.file.Path;
+import java.time.LocalDate;
 import java.util.Iterator;
 import java.util.List;
 import java.util.stream.Stream;
@@ -83,6 +85,8 @@ public class DirtreeSideloader implements SideloadSource, AutoCloseable {
         return sideloaderProcessing
                 .processDocument(url, body, extraKeywords, new DomainLinks(),
                         GeneratorType.DOCS,
+                        DocumentClass.NORMAL,
+                        LocalDate.now().getYear(),
                         10_000);
     }
 

View File

@@ -10,6 +10,7 @@ import nu.marginalia.converting.model.DisqualifiedException;
 import nu.marginalia.converting.model.GeneratorType;
 import nu.marginalia.converting.model.ProcessedDocument;
 import nu.marginalia.converting.model.ProcessedDomain;
+import nu.marginalia.converting.processor.DocumentClass;
 import nu.marginalia.converting.sideload.SideloadSource;
 import nu.marginalia.converting.sideload.SideloaderProcessing;
 import nu.marginalia.model.EdgeDomain;
@@ -27,6 +28,7 @@ import java.net.URLEncoder;
 import java.nio.charset.StandardCharsets;
 import java.nio.file.Path;
 import java.sql.*;
+import java.time.LocalDate;
 import java.util.Iterator;
 import java.util.List;
 
@@ -115,6 +117,7 @@ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoCloseable {
                 .append("<!DOCTYPE html><html><head><title>")
                 .append(title)
                 .append("</title></head><body>")
+                .append("<script src=\"https://www.example.com/dummy.js\" type=\"text/javascript\"></script>")
                 .append("<div class=\"mw-content-text\">");
 
         for (String part : parts) {
@@ -131,6 +134,8 @@
                 List.of("encyclopedia", "wiki"),
                 domainLinks,
                 GeneratorType.WIKI,
+                DocumentClass.SIDELOAD,
+                LocalDate.now().getYear(),
                 10_000_000);
 
         // Add anchor text keywords

View File

@@ -1,34 +1,28 @@
 package nu.marginalia.converting.sideload.reddit;
 
+import nu.marginalia.atags.AnchorTextKeywords;
 import nu.marginalia.atags.model.DomainLinks;
+import nu.marginalia.atags.source.AnchorTagsSourceFactory;
 import nu.marginalia.converting.model.GeneratorType;
 import nu.marginalia.converting.model.ProcessedDocument;
-import nu.marginalia.converting.model.ProcessedDocumentDetails;
 import nu.marginalia.converting.model.ProcessedDomain;
+import nu.marginalia.converting.processor.DocumentClass;
 import nu.marginalia.converting.sideload.SideloadSource;
+import nu.marginalia.converting.sideload.SideloaderProcessing;
 import nu.marginalia.integration.reddit.db.RedditDb;
-import nu.marginalia.keyword.DocumentKeywordExtractor;
-import nu.marginalia.language.sentence.ThreadLocalSentenceExtractorProvider;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.crawl.DomainIndexingState;
-import nu.marginalia.model.crawl.HtmlFeature;
-import nu.marginalia.model.crawl.PubDate;
-import nu.marginalia.model.crawl.UrlIndexingState;
-import nu.marginalia.model.html.HtmlStandard;
-import nu.marginalia.model.idx.DocumentFlags;
-import nu.marginalia.model.idx.DocumentMetadata;
 import nu.marginalia.model.idx.WordFlags;
 import nu.marginalia.util.ProcessingIterator;
 import org.apache.commons.lang3.StringUtils;
-import org.jsoup.Jsoup;
 
 import java.net.URISyntaxException;
 import java.nio.file.Path;
 import java.time.Instant;
 import java.time.LocalDate;
 import java.time.ZoneOffset;
-import java.util.EnumSet;
+import java.util.ArrayList;
 import java.util.Iterator;
 import java.util.List;
 
@@ -36,15 +30,18 @@ public class RedditSideloader implements SideloadSource {
     private static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(RedditSideloader.class);
 
     private final List<Path> dbFiles;
-    private final ThreadLocalSentenceExtractorProvider sentenceExtractorProvider;
-    private final DocumentKeywordExtractor keywordExtractor;
+    private final AnchorTagsSourceFactory anchorTagsSourceFactory;
+    private final AnchorTextKeywords anchorTextKeywords;
+    private final SideloaderProcessing sideloaderProcessing;
 
     public RedditSideloader(List<Path> listToDbFiles,
-                            ThreadLocalSentenceExtractorProvider sentenceExtractorProvider,
-                            DocumentKeywordExtractor keywordExtractor) {
+                            AnchorTagsSourceFactory anchorTagsSourceFactory,
+                            AnchorTextKeywords anchorTextKeywords,
+                            SideloaderProcessing sideloaderProcessing) {
         this.dbFiles = listToDbFiles;
-        this.sentenceExtractorProvider = sentenceExtractorProvider;
-        this.keywordExtractor = keywordExtractor;
+        this.anchorTagsSourceFactory = anchorTagsSourceFactory;
+        this.anchorTextKeywords = anchorTextKeywords;
+        this.sideloaderProcessing = sideloaderProcessing;
     }
 
     @Override
@@ -115,81 +112,68 @@ public class RedditSideloader implements SideloadSource {
                                            DomainLinks domainLinks) throws URISyntaxException {
         String fullUrl = "https://old.reddit.com" + permalink;
 
-        StringBuilder fullHtml = new StringBuilder();
-        fullHtml.append("<!DOCTYPE html><html><head><title>").append(title).append("</title></head><body>");
-        fullHtml.append("<h1>").append(title).append("</h1>");
-        fullHtml.append("<p>").append(body).append("</p>");
-        fullHtml.append("</body></html>");
-
-        var ret = new ProcessedDocument();
-        try {
-            var url = new EdgeUrl(fullUrl);
-            var doc = Jsoup.parse(fullHtml.toString());
-            var dld = sentenceExtractorProvider.get().extractSentences(doc);
-
-            ret.url = url;
-            ret.words = keywordExtractor.extractKeywords(dld, url);
-
-            ret.words.addAllSyntheticTerms(List.of(
-                    "js:true",
-                    "site:reddit.com",
-                    "site:old.reddit.com",
-                    "site:www.reddit.com",
-                    "special:ads",
-                    "special:tracking",
-                    "generator:forum",
-                    subreddit
-            ));
-
-            ret.words.add(subreddit, WordFlags.Subjects.asBit());
-            ret.words.add("reddit",
-                    WordFlags.ExternalLink.asBit()
-                    | WordFlags.Subjects.asBit()
-                    | WordFlags.Synthetic.asBit()
-                    | WordFlags.NamesWords.asBit());
-            ret.words.add(subreddit.toLowerCase(),
-                    WordFlags.ExternalLink.asBit()
-                    | WordFlags.NamesWords.asBit()
-                    | WordFlags.Synthetic.asBit()
-            );
-
-            if (!"[deleted]".equals(author))
-                ret.words.add(author, WordFlags.NamesWords.asBit() | WordFlags.Synthetic.asBit());
-
-            var date = LocalDate.ofInstant(
-                    Instant.ofEpochSecond(createdUtc),
-                    ZoneOffset.UTC);
-            int year = date.getYear();
-
-            ret.details = new ProcessedDocumentDetails();
-            ret.details.pubYear = year;
-            ret.details.quality = -5;
-            ret.details.metadata = new DocumentMetadata(3,
-                    PubDate.toYearByte(year),
-                    (int) -ret.details.quality,
-                    EnumSet.of(DocumentFlags.GeneratorForum));
-            ret.details.features = EnumSet.of(HtmlFeature.JS, HtmlFeature.TRACKING);
-            ret.details.metadata.withSizeAndTopology(10000, score);
-
-            ret.details.generator = GeneratorType.DOCS;
-            ret.details.title = StringUtils.truncate(STR."[/r/\{subreddit}] \{title}", 128);
-            ret.details.description = StringUtils.truncate(body, 255);
-            ret.details.length = 128;
-
-            ret.details.standard = HtmlStandard.HTML5;
-            ret.details.feedLinks = List.of();
-            ret.details.linksExternal = List.of();
-            ret.details.linksInternal = List.of();
-
-            ret.state = UrlIndexingState.OK;
-            ret.stateReason = "SIDELOAD";
-        }
-        catch (Exception e) {
-            logger.warn("Failed to process document", e);
-            ret.url = new EdgeUrl(fullUrl);
-            ret.state = UrlIndexingState.DISQUALIFIED;
-            ret.stateReason = "SIDELOAD";
-        }
-
-        return ret;
+        int pubYear = LocalDate
+                .ofInstant(Instant.ofEpochSecond(createdUtc), ZoneOffset.UTC)
+                .getYear();
+
+        String fullHtml = STR."""
+                <!DOCTYPE html>
+                <html>
+                <head>
+                  <title>\{title}</title>
+                  <script src="https://www.example.com/dummy.js" type="text/javascript"></script>
+                </head>
+                <body>
+                  <h1>\{title}</h1>
+                  <article>
+                    <p>\{body}</p>
+                  </article>
+                </body>
+                </html>
+                """;
+
+        List<String> extraKeywords = new ArrayList<>();
+
+        extraKeywords.add("reddit");
+        extraKeywords.add(subreddit);
+        extraKeywords.add("r/" + subreddit);
+
+        if (!StringUtils.isBlank(author) && !author.equals("[deleted]")) {
+            extraKeywords.add(author);
+        }
+
+        var doc = sideloaderProcessing
+                .processDocument(fullUrl,
+                        fullHtml,
+                        List.of("encyclopedia", "wiki"),
+                        domainLinks,
+                        GeneratorType.WIKI,
+                        DocumentClass.SIDELOAD,
+                        pubYear,
+                        10_000_000);
+
+        if (doc.isProcessedFully()) {
+            for (String url : List.of(
+                    STR."https://old.reddit.com/r/\{permalink}",
+                    STR."https://www.reddit.com/r/\{permalink}",
+                    STR."https://reddit.com/r/\{permalink}"
+            )) {
+                EdgeUrl.parse(url)
+                        .map(parsed -> anchorTextKeywords.getAnchorTextKeywords(domainLinks, parsed))
+                        .filter(parsed -> !parsed.isEmpty())
+                        .ifPresent(doc.words::addAnchorTerms);
+            }
+
+            for (var keyword : extraKeywords) {
+                doc.words.add(keyword, WordFlags.Subjects.asBit());
+            }
+
+            // Insert topology information
+            doc.details.metadata.withSizeAndTopology(50_000_000, score);
+        }
+
+        return doc;
     };
 }

View File

@@ -105,6 +105,9 @@ public class StackexchangeSideloader implements SideloadSource {
 
         StringBuilder fullHtml = new StringBuilder();
         fullHtml.append("<!DOCTYPE html><html><head><title>").append(post.title()).append("</title></head><body>");
+
+        // Add a bogus script tag to make sure we get the JS flag
+        fullHtml.append("<script src=\"https://www.example.com/dummy.js\" type=\"text/javascript\"></script>");
         fullHtml.append("<p>").append(post.title()).append("</p>");
         for (var comment : post.bodies()) {
             fullHtml.append("<p>").append(comment).append("</p>");

View File

@@ -7,6 +7,7 @@ import nu.marginalia.contenttype.DocumentBodyToString;
 import nu.marginalia.converting.model.GeneratorType;
 import nu.marginalia.converting.model.ProcessedDocument;
 import nu.marginalia.converting.model.ProcessedDomain;
+import nu.marginalia.converting.processor.DocumentClass;
 import nu.marginalia.converting.sideload.SideloadSource;
 import nu.marginalia.converting.sideload.SideloaderProcessing;
 import nu.marginalia.model.EdgeDomain;
@@ -19,6 +20,7 @@ import org.slf4j.LoggerFactory;
 import java.io.IOException;
 import java.net.URISyntaxException;
 import java.nio.file.Path;
+import java.time.LocalDate;
 import java.util.Iterator;
 import java.util.List;
 import java.util.Objects;
@@ -130,8 +132,13 @@ public class WarcSideloader implements SideloadSource, AutoCloseable {
         }
 
         return Optional.of(sideloaderProcessing
-                .processDocument(url, body.get(), List.of(), new DomainLinks(),
+                .processDocument(url,
+                        body.get(),
+                        List.of(),
+                        new DomainLinks(),
                         GeneratorType.DOCS,
+                        DocumentClass.SIDELOAD,
+                        LocalDate.now().getYear(), // TODO: This should be the actual year of the document
                         10_000));
     }
 

View File

@@ -20,8 +20,8 @@
                 <th></th><th>Filename</th><th>Size</th><th>Last Modified</th>
                 {{#each uploadDirContents.items}}
                 <tr>
-                    <td><input {{#if directory}}disabled{{/if}} class="form-check-input" type="radio" name="source" id="{{name}}" value="{{name}}"></td>
-                    <td {{#if directory}}class="text-muted"{{/if}}>
+                    <td><input {{#unless isZim}}disabled{{/unless}} class="form-check-input" type="radio" name="source" id="{{name}}" value="{{name}}"></td>
+                    <td {{#unless isZim}}class="text-muted"{{/unless}}>
                         <label class="form-check-label" for="{{name}}">{{name}}{{#if directory}}/{{/if}}</label>
                     </td>
                     <td>{{#unless directory}}{{size}}{{/unless}}</td>

View File

@@ -13,8 +13,8 @@ information how to do this.
                 <th></th><th>Filename</th><th>Size</th><th>Last Modified</th>
                 {{#each uploadDirContents.items}}
                 <tr>
-                    <td><input class="form-check-input" type="radio" name="source" id="{{name}}" value="{{name}}"></td>
-                    <td>
+                    <td><input {{#unless isStackexchange7z}}disabled{{/unless}} class="form-check-input" type="radio" name="source" id="{{name}}" value="{{name}}"></td>
+                    <td {{#unless isStackexchange7z}}class="text-muted"{{/unless}}>
                         <label class="form-check-label" for="{{name}}">{{name}}{{#if directory}}/{{/if}}</label>
                     </td>
                     <td>{{#unless directory}}{{size}}{{/unless}}</td>

View File

@@ -12,9 +12,9 @@ A warc export can be created using e.g. wget: <p>
                 <th></th><th>Filename</th><th>Size</th><th>Last Modified</th>
                 {{#each uploadDirContents.items}}
                 <tr>
-                    <td><input class="form-check-input" type="radio" name="source" id="{{name}}" value="{{name}}"></td>
+                    <td><input {{#unless isWarc}}disabled{{/unless}} class="form-check-input" type="radio" name="source" id="{{name}}" value="{{name}}"></td>
                     <td>
-                        <label class="form-check-label" for="{{name}}">{{name}}{{#if directory}}/{{/if}}</label>
+                        <label {{#unless isWarc}}class="text-muted"{{/unless}} class="form-check-label" for="{{name}}">{{name}}{{#if directory}}/{{/if}}</label>
                     </td>
                     <td>{{#unless directory}}{{size}}{{/unless}}</td>
                     <td title={{lastModifiedTime}}>{{shortTimestamp lastModifiedTime}}</td>