(sideload) Clean up the sideloading code

Clean up the sideloading code a bit, making the Reddit sideloader use the more sophisticated SideloaderProcessing approach to sideloading, instead of mimicking StackexchangeSideloader's cruder approach.

The reddit sideloader now uses the SideloaderProcessing class.  It also properly sets js-attributes for the sideloaded documents.

The control GUI now also filters the upload directory items based on name, and disables the items that do not have appropriate filenames.
This commit is contained in:
Viktor Lofgren 2024-02-17 14:32:36 +01:00
parent ebbe49d17b
commit 37a7296759
12 changed files with 139 additions and 107 deletions

View File

@ -6,4 +6,29 @@ public record UploadDirItem (
boolean isDirectory,
long size
) {
/** Tells whether this item's name looks like a ZIM input: either the name
 *  ends in ".zim", or it contains ".zim." and ends in ".db".
 *
 * @return true if the name matches one of the two patterns
 */
public boolean isZim() {
    final boolean dbWithZimInfix = name.contains(".zim.") && name.endsWith(".db");

    return name.endsWith(".zim") || dbWithZimInfix;
}
/** Tells whether this item is selectable as a stackexchange input: the name
 *  ends in ".7z", or contains ".7z." and ends in ".db", or the item is a
 *  directory.
 *
 * @return true if the name matches, or if the item is a directory
 */
public boolean isStackexchange7z() {
    // Name checks come first, directories are accepted as the fallback
    return name.endsWith(".7z")
        || (name.contains(".7z.") && name.endsWith(".db"))
        || isDirectory;
}
/** Tells whether this item is selectable as a WARC input: the name ends in
 *  ".warc", or contains ".warc.gz" anywhere, or the item is a directory.
 *
 * @return true if the name matches, or if the item is a directory
 */
public boolean isWarc() {
    // Name checks come first, directories are accepted as the fallback
    return name.endsWith(".warc")
        || name.contains(".warc.gz")
        || isDirectory;
}
}

View File

@ -7,7 +7,9 @@ public enum DocumentClass {
NORMAL,
EXTERNALLY_LINKED_ONCE,
EXTERNALLY_LINKED_MULTI,
/** A document that is not linked to, but is sideloaded. Ignore most inclusion checks. */
/** A document that is not known to be linked to,
* but is sideloaded. This excludes most inclusion
* checks and always loads the document as-is */
SIDELOAD;
public boolean enforceQualityLimits() {

View File

@ -65,7 +65,8 @@ public class SideloadSourceFactory {
public SideloadSource sideloadReddit(Path pathToDbFiles) throws IOException {
return sideload(pathToDbFiles,
new PathSuffixPredicate(".db"),
(List<Path> paths) -> new RedditSideloader(paths, sentenceExtractorProvider, documentKeywordExtractor));
(List<Path> paths) -> new RedditSideloader(paths,
anchorTagsSourceFactory, anchorTextKeywords, sideloaderProcessing));
}
public Collection<? extends SideloadSource> sideloadStackexchange(Path pathToDbFileRoot) throws IOException {

View File

@ -36,9 +36,11 @@ public class SideloaderProcessing {
List<String> extraKeywords,
DomainLinks domainLinks,
GeneratorType type,
DocumentClass documentClass,
int pubYear,
int size) throws URISyntaxException {
var crawledDoc = new CrawledDocument(
"encyclopedia.marginalia.nu",
"synthetic",
url,
"text/html",
LocalDateTime.now().toString(),
@ -59,9 +61,6 @@ public class SideloaderProcessing {
// Give the document processing preferential treatment if this is a sideloaded wiki, since we
// truncate the document to the first paragraph, which typically is too short to be included
// on its own.
final DocumentClass documentClass;
if (type == GeneratorType.WIKI) documentClass = DocumentClass.SIDELOAD;
else documentClass = DocumentClass.NORMAL;
var ret = new ProcessedDocument();
try {
@ -72,11 +71,13 @@ public class SideloaderProcessing {
for (String keyword : extraKeywords)
ret.words.add(keyword, WordFlags.Subjects.asBit());
if (type == GeneratorType.WIKI)
ret.words.add("generator:wiki", WordFlags.Subjects.asBit());
else if (type == GeneratorType.DOCS)
ret.words.add("generator:docs", WordFlags.Subjects.asBit());
if (type == GeneratorType.WIKI) {
ret.words.addAllSyntheticTerms(List.of("generator:wiki"));
} else if (type == GeneratorType.DOCS) {
ret.words.addAllSyntheticTerms(List.of("generator:docs"));
} else if (type == GeneratorType.FORUM) {
ret.words.addAllSyntheticTerms(List.of("generator:forum"));
}
ret.details = details.details();
// Add a few things that we know about the document
@ -84,14 +85,14 @@ public class SideloaderProcessing {
// so stripped down
ret.details.standard = HtmlStandard.HTML5;
ret.details.pubYear = LocalDateTime.now().getYear();
ret.details.pubYear = pubYear;
ret.details.features.add(HtmlFeature.JS);
ret.details.features.add(HtmlFeature.TRACKING);
ret.details.quality = -4.5;
ret.details.generator = type;
ret.details.metadata = new DocumentMetadata(3,
PubDate.toYearByte(ret.details.pubYear),
PubDate.toYearByte(pubYear),
(int) -ret.details.quality,
switch (type) {
case WIKI -> EnumSet.of(DocumentFlags.GeneratorWiki, DocumentFlags.Sideloaded);

View File

@ -5,6 +5,7 @@ import nu.marginalia.atags.model.DomainLinks;
import nu.marginalia.converting.model.GeneratorType;
import nu.marginalia.converting.model.ProcessedDocument;
import nu.marginalia.converting.model.ProcessedDomain;
import nu.marginalia.converting.processor.DocumentClass;
import nu.marginalia.converting.sideload.SideloadSource;
import nu.marginalia.converting.sideload.SideloaderProcessing;
import nu.marginalia.model.EdgeDomain;
@ -13,6 +14,7 @@ import nu.marginalia.model.crawl.DomainIndexingState;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.time.LocalDate;
import java.util.Iterator;
import java.util.List;
import java.util.stream.Stream;
@ -83,6 +85,8 @@ public class DirtreeSideloader implements SideloadSource, AutoCloseable {
return sideloaderProcessing
.processDocument(url, body, extraKeywords, new DomainLinks(),
GeneratorType.DOCS,
DocumentClass.NORMAL,
LocalDate.now().getYear(),
10_000);
}

View File

@ -10,6 +10,7 @@ import nu.marginalia.converting.model.DisqualifiedException;
import nu.marginalia.converting.model.GeneratorType;
import nu.marginalia.converting.model.ProcessedDocument;
import nu.marginalia.converting.model.ProcessedDomain;
import nu.marginalia.converting.processor.DocumentClass;
import nu.marginalia.converting.sideload.SideloadSource;
import nu.marginalia.converting.sideload.SideloaderProcessing;
import nu.marginalia.model.EdgeDomain;
@ -27,6 +28,7 @@ import java.net.URLEncoder;
import java.nio.charset.StandardCharsets;
import java.nio.file.Path;
import java.sql.*;
import java.time.LocalDate;
import java.util.Iterator;
import java.util.List;
@ -115,6 +117,7 @@ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoC
.append("<!DOCTYPE html><html><head><title>")
.append(title)
.append("</title></head><body>")
.append("<script src=\"https://www.example.com/dummy.js\" type=\"text/javascript\"></script>")
.append("<div class=\"mw-content-text\">");
for (String part : parts) {
@ -131,6 +134,8 @@ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoC
List.of("encyclopedia", "wiki"),
domainLinks,
GeneratorType.WIKI,
DocumentClass.SIDELOAD,
LocalDate.now().getYear(),
10_000_000);
// Add anchor text keywords

View File

@ -1,34 +1,28 @@
package nu.marginalia.converting.sideload.reddit;
import nu.marginalia.atags.AnchorTextKeywords;
import nu.marginalia.atags.model.DomainLinks;
import nu.marginalia.atags.source.AnchorTagsSourceFactory;
import nu.marginalia.converting.model.GeneratorType;
import nu.marginalia.converting.model.ProcessedDocument;
import nu.marginalia.converting.model.ProcessedDocumentDetails;
import nu.marginalia.converting.model.ProcessedDomain;
import nu.marginalia.converting.processor.DocumentClass;
import nu.marginalia.converting.sideload.SideloadSource;
import nu.marginalia.converting.sideload.SideloaderProcessing;
import nu.marginalia.integration.reddit.db.RedditDb;
import nu.marginalia.keyword.DocumentKeywordExtractor;
import nu.marginalia.language.sentence.ThreadLocalSentenceExtractorProvider;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawl.DomainIndexingState;
import nu.marginalia.model.crawl.HtmlFeature;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.model.crawl.UrlIndexingState;
import nu.marginalia.model.html.HtmlStandard;
import nu.marginalia.model.idx.DocumentFlags;
import nu.marginalia.model.idx.DocumentMetadata;
import nu.marginalia.model.idx.WordFlags;
import nu.marginalia.util.ProcessingIterator;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import java.net.URISyntaxException;
import java.nio.file.Path;
import java.time.Instant;
import java.time.LocalDate;
import java.time.ZoneOffset;
import java.util.EnumSet;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
@ -36,15 +30,18 @@ public class RedditSideloader implements SideloadSource {
private static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(RedditSideloader.class);
private final List<Path> dbFiles;
private final ThreadLocalSentenceExtractorProvider sentenceExtractorProvider;
private final DocumentKeywordExtractor keywordExtractor;
private final AnchorTagsSourceFactory anchorTagsSourceFactory;
private final AnchorTextKeywords anchorTextKeywords;
private final SideloaderProcessing sideloaderProcessing;
public RedditSideloader(List<Path> listToDbFiles,
ThreadLocalSentenceExtractorProvider sentenceExtractorProvider,
DocumentKeywordExtractor keywordExtractor) {
AnchorTagsSourceFactory anchorTagsSourceFactory,
AnchorTextKeywords anchorTextKeywords,
SideloaderProcessing sideloaderProcessing) {
this.dbFiles = listToDbFiles;
this.sentenceExtractorProvider = sentenceExtractorProvider;
this.keywordExtractor = keywordExtractor;
this.anchorTagsSourceFactory = anchorTagsSourceFactory;
this.anchorTextKeywords = anchorTextKeywords;
this.sideloaderProcessing = sideloaderProcessing;
}
@Override
@ -115,81 +112,68 @@ public class RedditSideloader implements SideloadSource {
DomainLinks domainLinks) throws URISyntaxException {
String fullUrl = "https://old.reddit.com" + permalink;
StringBuilder fullHtml = new StringBuilder();
fullHtml.append("<!DOCTYPE html><html><head><title>").append(title).append("</title></head><body>");
fullHtml.append("<h1>").append(title).append("</h1>");
fullHtml.append("<p>").append(body).append("</p>");
fullHtml.append("</body></html>");
int pubYear = LocalDate
.ofInstant(Instant.ofEpochSecond(createdUtc), ZoneOffset.UTC)
.getYear();
var ret = new ProcessedDocument();
try {
String fullHtml = STR."""
<!DOCTYPE html>
<html>
<head>
<title>\{title}</title>
<script src="https://www.example.com/dummy.js" type="text/javascript"></script>
</head>
<body>
<h1>\{title}</h1>
<article>
<p>\{body}</p>
</article>
</body>
</html>
""";
var url = new EdgeUrl(fullUrl);
var doc = Jsoup.parse(fullHtml.toString());
var dld = sentenceExtractorProvider.get().extractSentences(doc);
List<String> extraKeywords = new ArrayList<>();
ret.url = url;
ret.words = keywordExtractor.extractKeywords(dld, url);
extraKeywords.add("reddit");
extraKeywords.add(subreddit);
extraKeywords.add("r/" + subreddit);
ret.words.addAllSyntheticTerms(List.of(
"js:true",
"site:reddit.com",
"site:old.reddit.com",
"site:www.reddit.com",
"special:ads",
"special:tracking",
"generator:forum",
subreddit
));
ret.words.add(subreddit, WordFlags.Subjects.asBit());
ret.words.add("reddit",
WordFlags.ExternalLink.asBit()
| WordFlags.Subjects.asBit()
| WordFlags.Synthetic.asBit()
| WordFlags.NamesWords.asBit());
ret.words.add(subreddit.toLowerCase(),
WordFlags.ExternalLink.asBit()
| WordFlags.NamesWords.asBit()
| WordFlags.Synthetic.asBit()
);
if (!"[deleted]".equals(author))
ret.words.add(author, WordFlags.NamesWords.asBit() | WordFlags.Synthetic.asBit());
var date = LocalDate.ofInstant(
Instant.ofEpochSecond(createdUtc),
ZoneOffset.UTC);
int year = date.getYear();
ret.details = new ProcessedDocumentDetails();
ret.details.pubYear = year;
ret.details.quality = -5;
ret.details.metadata = new DocumentMetadata(3,
PubDate.toYearByte(year),
(int) -ret.details.quality,
EnumSet.of(DocumentFlags.GeneratorForum));
ret.details.features = EnumSet.of(HtmlFeature.JS, HtmlFeature.TRACKING);
ret.details.metadata.withSizeAndTopology(10000, score);
ret.details.generator = GeneratorType.DOCS;
ret.details.title = StringUtils.truncate(STR."[/r/\{subreddit}] \{title}", 128);
ret.details.description = StringUtils.truncate(body, 255);
ret.details.length = 128;
ret.details.standard = HtmlStandard.HTML5;
ret.details.feedLinks = List.of();
ret.details.linksExternal = List.of();
ret.details.linksInternal = List.of();
ret.state = UrlIndexingState.OK;
ret.stateReason = "SIDELOAD";
if (!StringUtils.isBlank(author) && !author.equals("[deleted]")) {
extraKeywords.add(author);
}
catch (Exception e) {
logger.warn("Failed to process document", e);
ret.url = new EdgeUrl(fullUrl);
ret.state = UrlIndexingState.DISQUALIFIED;
ret.stateReason = "SIDELOAD";
// Run the post through the shared sideload pipeline.  The keywords collected
// above (reddit, subreddit, author) are handed over as extra keywords, which
// processDocument adds with the Subjects flag, and the generator type is FORUM
// so the document gets the "generator:forum" term instead of the wiki one.
var doc = sideloaderProcessing
        .processDocument(fullUrl,
                fullHtml,
                extraKeywords,
                domainLinks,
                GeneratorType.FORUM,
                DocumentClass.SIDELOAD,
                pubYear,
                10_000_000);

if (doc.isProcessedFully()) {
    // Look up anchor text under each host alias of the post.  The permalink is
    // appended directly to the host, exactly as fullUrl is built, so no extra
    // "/r/" segment is inserted here.
    for (String url : List.of(
            STR."https://old.reddit.com\{permalink}",
            STR."https://www.reddit.com\{permalink}",
            STR."https://reddit.com\{permalink}"
    )) {
        EdgeUrl.parse(url)
                .map(parsed -> anchorTextKeywords.getAnchorTextKeywords(domainLinks, parsed))
                .filter(parsed -> !parsed.isEmpty())
                .ifPresent(doc.words::addAnchorTerms);
    }

    // Insert topology information
    // NOTE(review): the return value is discarded; if withSizeAndTopology returns
    // a new DocumentMetadata instead of mutating in place, this has no effect -- confirm
    doc.details.metadata.withSizeAndTopology(50_000_000, score);
}
return ret;
return doc;
};
}

View File

@ -105,6 +105,9 @@ public class StackexchangeSideloader implements SideloadSource {
StringBuilder fullHtml = new StringBuilder();
fullHtml.append("<!DOCTYPE html><html><head><title>").append(post.title()).append("</title></head><body>");
// Add a bogus script tag to make sure we get the JS flag
fullHtml.append("<script src=\"https://www.example.com/dummy.js\" type=\"text/javascript\"></script>");
fullHtml.append("<p>").append(post.title()).append("</p>");
for (var comment : post.bodies()) {
fullHtml.append("<p>").append(comment).append("</p>");

View File

@ -7,6 +7,7 @@ import nu.marginalia.contenttype.DocumentBodyToString;
import nu.marginalia.converting.model.GeneratorType;
import nu.marginalia.converting.model.ProcessedDocument;
import nu.marginalia.converting.model.ProcessedDomain;
import nu.marginalia.converting.processor.DocumentClass;
import nu.marginalia.converting.sideload.SideloadSource;
import nu.marginalia.converting.sideload.SideloaderProcessing;
import nu.marginalia.model.EdgeDomain;
@ -19,6 +20,7 @@ import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.net.URISyntaxException;
import java.nio.file.Path;
import java.time.LocalDate;
import java.util.Iterator;
import java.util.List;
import java.util.Objects;
@ -130,8 +132,13 @@ public class WarcSideloader implements SideloadSource, AutoCloseable {
}
return Optional.of(sideloaderProcessing
.processDocument(url, body.get(), List.of(), new DomainLinks(),
.processDocument(url,
body.get(),
List.of(),
new DomainLinks(),
GeneratorType.DOCS,
DocumentClass.SIDELOAD,
LocalDate.now().getYear(), // TODO: This should be the actual year of the document
10_000));
}

View File

@ -20,8 +20,8 @@
<th></th><th>Filename</th><th>Size</th><th>Last Modified</th>
{{#each uploadDirContents.items}}
<tr>
<td><input {{#if directory}}disabled{{/if}} class="form-check-input" type="radio" name="source" id="{{name}}" value="{{name}}"></td>
<td {{#if directory}}class="text-muted"{{/if}}>
<td><input {{#unless isZim}}disabled{{/unless}} class="form-check-input" type="radio" name="source" id="{{name}}" value="{{name}}"></td>
<td {{#unless isZim}}class="text-muted"{{/unless}}>
<label class="form-check-label" for="{{name}}">{{name}}{{#if directory}}/{{/if}}</label>
</td>
<td>{{#unless directory}}{{size}}{{/unless}}</td>

View File

@ -13,8 +13,8 @@ information how to do this.
<th></th><th>Filename</th><th>Size</th><th>Last Modified</th>
{{#each uploadDirContents.items}}
<tr>
<td><input class="form-check-input" type="radio" name="source" id="{{name}}" value="{{name}}"></td>
<td>
<td><input {{#unless isStackexchange7z}}disabled{{/unless}} class="form-check-input" type="radio" name="source" id="{{name}}" value="{{name}}"></td>
<td {{#unless isStackexchange7z}}class="text-muted"{{/unless}}>
<label class="form-check-label" for="{{name}}">{{name}}{{#if directory}}/{{/if}}</label>
</td>
<td>{{#unless directory}}{{size}}{{/unless}}</td>

View File

@ -12,9 +12,9 @@ A warc export can be created using e.g. wget: <p>
<th></th><th>Filename</th><th>Size</th><th>Last Modified</th>
{{#each uploadDirContents.items}}
<tr>
<td><input class="form-check-input" type="radio" name="source" id="{{name}}" value="{{name}}"></td>
<td><input {{#unless isWarc}}disabled{{/unless}} class="form-check-input" type="radio" name="source" id="{{name}}" value="{{name}}"></td>
<td>
<label class="form-check-label" for="{{name}}">{{name}}{{#if directory}}/{{/if}}</label>
{{!-- A tag may only carry one class attribute, so text-muted is appended to the existing one for non-WARC items --}}
<label class="form-check-label{{#unless isWarc}} text-muted{{/unless}}" for="{{name}}">{{name}}{{#if directory}}/{{/if}}</label>
</td>
<td>{{#unless directory}}{{size}}{{/unless}}</td>
<td title={{lastModifiedTime}}>{{shortTimestamp lastModifiedTime}}</td>