(sideload) Clean up the sideloading code
Clean up the sideloading code a bit, making the Reddit sideloader use the more sophisticated SideloaderProcessing approach to sideloading, instead of mimicking StackexchangeSideloader's cruder approach. The Reddit sideloader now uses the SideloaderProcessing class. It also properly sets js-attributes for the sideloaded documents. The control GUI now also filters the upload directory items based on name, and disables the items that do not have appropriate filenames.
This commit is contained in:
parent
ebbe49d17b
commit
37a7296759
@ -6,4 +6,29 @@ public record UploadDirItem (
|
||||
boolean isDirectory,
|
||||
long size
|
||||
) {
|
||||
|
||||
public boolean isZim() {
|
||||
if (name.endsWith(".zim"))
|
||||
return true;
|
||||
if (name.contains(".zim.") && name.endsWith(".db"))
|
||||
return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
public boolean isStackexchange7z() {
|
||||
if (name.endsWith(".7z"))
|
||||
return true;
|
||||
if (name.contains(".7z.") && name.endsWith(".db"))
|
||||
return true;
|
||||
return isDirectory;
|
||||
}
|
||||
|
||||
public boolean isWarc() {
|
||||
if (name.endsWith(".warc"))
|
||||
return true;
|
||||
if (name.contains(".warc.gz"))
|
||||
return true;
|
||||
return isDirectory;
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -7,7 +7,9 @@ public enum DocumentClass {
|
||||
NORMAL,
|
||||
EXTERNALLY_LINKED_ONCE,
|
||||
EXTERNALLY_LINKED_MULTI,
|
||||
/** A document that is not linked to, but is sideloaded. Ignore most inclusion checks. */
|
||||
/** A document that is not known to be linked to,
|
||||
* but is sideloaded. This excludes most inclusion
|
||||
* checks and always loads the document as-is */
|
||||
SIDELOAD;
|
||||
|
||||
public boolean enforceQualityLimits() {
|
||||
|
@ -65,7 +65,8 @@ public class SideloadSourceFactory {
|
||||
public SideloadSource sideloadReddit(Path pathToDbFiles) throws IOException {
|
||||
return sideload(pathToDbFiles,
|
||||
new PathSuffixPredicate(".db"),
|
||||
(List<Path> paths) -> new RedditSideloader(paths, sentenceExtractorProvider, documentKeywordExtractor));
|
||||
(List<Path> paths) -> new RedditSideloader(paths,
|
||||
anchorTagsSourceFactory, anchorTextKeywords, sideloaderProcessing));
|
||||
}
|
||||
|
||||
public Collection<? extends SideloadSource> sideloadStackexchange(Path pathToDbFileRoot) throws IOException {
|
||||
|
@ -36,9 +36,11 @@ public class SideloaderProcessing {
|
||||
List<String> extraKeywords,
|
||||
DomainLinks domainLinks,
|
||||
GeneratorType type,
|
||||
DocumentClass documentClass,
|
||||
int pubYear,
|
||||
int size) throws URISyntaxException {
|
||||
var crawledDoc = new CrawledDocument(
|
||||
"encyclopedia.marginalia.nu",
|
||||
"synthetic",
|
||||
url,
|
||||
"text/html",
|
||||
LocalDateTime.now().toString(),
|
||||
@ -59,9 +61,6 @@ public class SideloaderProcessing {
|
||||
// Give the document processing preferential treatment if this is a sideloaded wiki, since we
|
||||
// truncate the document to the first paragraph, which typically is too short to be included
|
||||
// on its own.
|
||||
final DocumentClass documentClass;
|
||||
if (type == GeneratorType.WIKI) documentClass = DocumentClass.SIDELOAD;
|
||||
else documentClass = DocumentClass.NORMAL;
|
||||
|
||||
var ret = new ProcessedDocument();
|
||||
try {
|
||||
@ -72,11 +71,13 @@ public class SideloaderProcessing {
|
||||
for (String keyword : extraKeywords)
|
||||
ret.words.add(keyword, WordFlags.Subjects.asBit());
|
||||
|
||||
if (type == GeneratorType.WIKI)
|
||||
ret.words.add("generator:wiki", WordFlags.Subjects.asBit());
|
||||
else if (type == GeneratorType.DOCS)
|
||||
ret.words.add("generator:docs", WordFlags.Subjects.asBit());
|
||||
|
||||
if (type == GeneratorType.WIKI) {
|
||||
ret.words.addAllSyntheticTerms(List.of("generator:wiki"));
|
||||
} else if (type == GeneratorType.DOCS) {
|
||||
ret.words.addAllSyntheticTerms(List.of("generator:docs"));
|
||||
} else if (type == GeneratorType.FORUM) {
|
||||
ret.words.addAllSyntheticTerms(List.of("generator:forum"));
|
||||
}
|
||||
ret.details = details.details();
|
||||
|
||||
// Add a few things that we know about the document
|
||||
@ -84,14 +85,14 @@ public class SideloaderProcessing {
|
||||
// so stripped down
|
||||
|
||||
ret.details.standard = HtmlStandard.HTML5;
|
||||
ret.details.pubYear = LocalDateTime.now().getYear();
|
||||
ret.details.pubYear = pubYear;
|
||||
ret.details.features.add(HtmlFeature.JS);
|
||||
ret.details.features.add(HtmlFeature.TRACKING);
|
||||
ret.details.quality = -4.5;
|
||||
ret.details.generator = type;
|
||||
|
||||
ret.details.metadata = new DocumentMetadata(3,
|
||||
PubDate.toYearByte(ret.details.pubYear),
|
||||
PubDate.toYearByte(pubYear),
|
||||
(int) -ret.details.quality,
|
||||
switch (type) {
|
||||
case WIKI -> EnumSet.of(DocumentFlags.GeneratorWiki, DocumentFlags.Sideloaded);
|
||||
|
@ -5,6 +5,7 @@ import nu.marginalia.atags.model.DomainLinks;
|
||||
import nu.marginalia.converting.model.GeneratorType;
|
||||
import nu.marginalia.converting.model.ProcessedDocument;
|
||||
import nu.marginalia.converting.model.ProcessedDomain;
|
||||
import nu.marginalia.converting.processor.DocumentClass;
|
||||
import nu.marginalia.converting.sideload.SideloadSource;
|
||||
import nu.marginalia.converting.sideload.SideloaderProcessing;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
@ -13,6 +14,7 @@ import nu.marginalia.model.crawl.DomainIndexingState;
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.time.LocalDate;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.stream.Stream;
|
||||
@ -83,6 +85,8 @@ public class DirtreeSideloader implements SideloadSource, AutoCloseable {
|
||||
return sideloaderProcessing
|
||||
.processDocument(url, body, extraKeywords, new DomainLinks(),
|
||||
GeneratorType.DOCS,
|
||||
DocumentClass.NORMAL,
|
||||
LocalDate.now().getYear(),
|
||||
10_000);
|
||||
}
|
||||
|
||||
|
@ -10,6 +10,7 @@ import nu.marginalia.converting.model.DisqualifiedException;
|
||||
import nu.marginalia.converting.model.GeneratorType;
|
||||
import nu.marginalia.converting.model.ProcessedDocument;
|
||||
import nu.marginalia.converting.model.ProcessedDomain;
|
||||
import nu.marginalia.converting.processor.DocumentClass;
|
||||
import nu.marginalia.converting.sideload.SideloadSource;
|
||||
import nu.marginalia.converting.sideload.SideloaderProcessing;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
@ -27,6 +28,7 @@ import java.net.URLEncoder;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.nio.file.Path;
|
||||
import java.sql.*;
|
||||
import java.time.LocalDate;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
|
||||
@ -115,6 +117,7 @@ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoC
|
||||
.append("<!DOCTYPE html><html><head><title>")
|
||||
.append(title)
|
||||
.append("</title></head><body>")
|
||||
.append("<script src=\"https://www.example.com/dummy.js\" type=\"text/javascript\"></script>")
|
||||
.append("<div class=\"mw-content-text\">");
|
||||
|
||||
for (String part : parts) {
|
||||
@ -131,6 +134,8 @@ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoC
|
||||
List.of("encyclopedia", "wiki"),
|
||||
domainLinks,
|
||||
GeneratorType.WIKI,
|
||||
DocumentClass.SIDELOAD,
|
||||
LocalDate.now().getYear(),
|
||||
10_000_000);
|
||||
|
||||
// Add anchor text keywords
|
||||
|
@ -1,34 +1,28 @@
|
||||
package nu.marginalia.converting.sideload.reddit;
|
||||
|
||||
import nu.marginalia.atags.AnchorTextKeywords;
|
||||
import nu.marginalia.atags.model.DomainLinks;
|
||||
import nu.marginalia.atags.source.AnchorTagsSourceFactory;
|
||||
import nu.marginalia.converting.model.GeneratorType;
|
||||
import nu.marginalia.converting.model.ProcessedDocument;
|
||||
import nu.marginalia.converting.model.ProcessedDocumentDetails;
|
||||
import nu.marginalia.converting.model.ProcessedDomain;
|
||||
import nu.marginalia.converting.processor.DocumentClass;
|
||||
import nu.marginalia.converting.sideload.SideloadSource;
|
||||
import nu.marginalia.converting.sideload.SideloaderProcessing;
|
||||
import nu.marginalia.integration.reddit.db.RedditDb;
|
||||
import nu.marginalia.keyword.DocumentKeywordExtractor;
|
||||
import nu.marginalia.language.sentence.ThreadLocalSentenceExtractorProvider;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import nu.marginalia.model.crawl.DomainIndexingState;
|
||||
import nu.marginalia.model.crawl.HtmlFeature;
|
||||
import nu.marginalia.model.crawl.PubDate;
|
||||
import nu.marginalia.model.crawl.UrlIndexingState;
|
||||
import nu.marginalia.model.html.HtmlStandard;
|
||||
import nu.marginalia.model.idx.DocumentFlags;
|
||||
import nu.marginalia.model.idx.DocumentMetadata;
|
||||
import nu.marginalia.model.idx.WordFlags;
|
||||
import nu.marginalia.util.ProcessingIterator;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.jsoup.Jsoup;
|
||||
|
||||
import java.net.URISyntaxException;
|
||||
import java.nio.file.Path;
|
||||
import java.time.Instant;
|
||||
import java.time.LocalDate;
|
||||
import java.time.ZoneOffset;
|
||||
import java.util.EnumSet;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
|
||||
@ -36,15 +30,18 @@ public class RedditSideloader implements SideloadSource {
|
||||
private static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(RedditSideloader.class);
|
||||
|
||||
private final List<Path> dbFiles;
|
||||
private final ThreadLocalSentenceExtractorProvider sentenceExtractorProvider;
|
||||
private final DocumentKeywordExtractor keywordExtractor;
|
||||
private final AnchorTagsSourceFactory anchorTagsSourceFactory;
|
||||
private final AnchorTextKeywords anchorTextKeywords;
|
||||
private final SideloaderProcessing sideloaderProcessing;
|
||||
|
||||
public RedditSideloader(List<Path> listToDbFiles,
|
||||
ThreadLocalSentenceExtractorProvider sentenceExtractorProvider,
|
||||
DocumentKeywordExtractor keywordExtractor) {
|
||||
AnchorTagsSourceFactory anchorTagsSourceFactory,
|
||||
AnchorTextKeywords anchorTextKeywords,
|
||||
SideloaderProcessing sideloaderProcessing) {
|
||||
this.dbFiles = listToDbFiles;
|
||||
this.sentenceExtractorProvider = sentenceExtractorProvider;
|
||||
this.keywordExtractor = keywordExtractor;
|
||||
this.anchorTagsSourceFactory = anchorTagsSourceFactory;
|
||||
this.anchorTextKeywords = anchorTextKeywords;
|
||||
this.sideloaderProcessing = sideloaderProcessing;
|
||||
}
|
||||
|
||||
@Override
|
||||
@ -115,81 +112,68 @@ public class RedditSideloader implements SideloadSource {
|
||||
DomainLinks domainLinks) throws URISyntaxException {
|
||||
String fullUrl = "https://old.reddit.com" + permalink;
|
||||
|
||||
StringBuilder fullHtml = new StringBuilder();
|
||||
fullHtml.append("<!DOCTYPE html><html><head><title>").append(title).append("</title></head><body>");
|
||||
fullHtml.append("<h1>").append(title).append("</h1>");
|
||||
fullHtml.append("<p>").append(body).append("</p>");
|
||||
fullHtml.append("</body></html>");
|
||||
int pubYear = LocalDate
|
||||
.ofInstant(Instant.ofEpochSecond(createdUtc), ZoneOffset.UTC)
|
||||
.getYear();
|
||||
|
||||
var ret = new ProcessedDocument();
|
||||
try {
|
||||
String fullHtml = STR."""
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<title>\{title}</title>
|
||||
<script src="https://www.example.com/dummy.js" type="text/javascript"></script>
|
||||
</head>
|
||||
<body>
|
||||
<h1>\{title}</h1>
|
||||
<article>
|
||||
<p>\{body}</p>
|
||||
</article>
|
||||
</body>
|
||||
</html>
|
||||
""";
|
||||
|
||||
var url = new EdgeUrl(fullUrl);
|
||||
var doc = Jsoup.parse(fullHtml.toString());
|
||||
var dld = sentenceExtractorProvider.get().extractSentences(doc);
|
||||
List<String> extraKeywords = new ArrayList<>();
|
||||
|
||||
ret.url = url;
|
||||
ret.words = keywordExtractor.extractKeywords(dld, url);
|
||||
extraKeywords.add("reddit");
|
||||
extraKeywords.add(subreddit);
|
||||
extraKeywords.add("r/" + subreddit);
|
||||
|
||||
ret.words.addAllSyntheticTerms(List.of(
|
||||
"js:true",
|
||||
"site:reddit.com",
|
||||
"site:old.reddit.com",
|
||||
"site:www.reddit.com",
|
||||
"special:ads",
|
||||
"special:tracking",
|
||||
"generator:forum",
|
||||
subreddit
|
||||
));
|
||||
|
||||
ret.words.add(subreddit, WordFlags.Subjects.asBit());
|
||||
ret.words.add("reddit",
|
||||
WordFlags.ExternalLink.asBit()
|
||||
| WordFlags.Subjects.asBit()
|
||||
| WordFlags.Synthetic.asBit()
|
||||
| WordFlags.NamesWords.asBit());
|
||||
ret.words.add(subreddit.toLowerCase(),
|
||||
WordFlags.ExternalLink.asBit()
|
||||
| WordFlags.NamesWords.asBit()
|
||||
| WordFlags.Synthetic.asBit()
|
||||
);
|
||||
if (!"[deleted]".equals(author))
|
||||
ret.words.add(author, WordFlags.NamesWords.asBit() | WordFlags.Synthetic.asBit());
|
||||
|
||||
var date = LocalDate.ofInstant(
|
||||
Instant.ofEpochSecond(createdUtc),
|
||||
ZoneOffset.UTC);
|
||||
int year = date.getYear();
|
||||
|
||||
ret.details = new ProcessedDocumentDetails();
|
||||
ret.details.pubYear = year;
|
||||
ret.details.quality = -5;
|
||||
ret.details.metadata = new DocumentMetadata(3,
|
||||
PubDate.toYearByte(year),
|
||||
(int) -ret.details.quality,
|
||||
EnumSet.of(DocumentFlags.GeneratorForum));
|
||||
ret.details.features = EnumSet.of(HtmlFeature.JS, HtmlFeature.TRACKING);
|
||||
|
||||
ret.details.metadata.withSizeAndTopology(10000, score);
|
||||
|
||||
ret.details.generator = GeneratorType.DOCS;
|
||||
ret.details.title = StringUtils.truncate(STR."[/r/\{subreddit}] \{title}", 128);
|
||||
ret.details.description = StringUtils.truncate(body, 255);
|
||||
ret.details.length = 128;
|
||||
|
||||
ret.details.standard = HtmlStandard.HTML5;
|
||||
ret.details.feedLinks = List.of();
|
||||
ret.details.linksExternal = List.of();
|
||||
ret.details.linksInternal = List.of();
|
||||
ret.state = UrlIndexingState.OK;
|
||||
ret.stateReason = "SIDELOAD";
|
||||
if (!StringUtils.isBlank(author) && !author.equals("[deleted]")) {
|
||||
extraKeywords.add(author);
|
||||
}
|
||||
catch (Exception e) {
|
||||
logger.warn("Failed to process document", e);
|
||||
ret.url = new EdgeUrl(fullUrl);
|
||||
ret.state = UrlIndexingState.DISQUALIFIED;
|
||||
ret.stateReason = "SIDELOAD";
|
||||
|
||||
var doc = sideloaderProcessing
|
||||
.processDocument(fullUrl,
|
||||
fullHtml,
|
||||
List.of("encyclopedia", "wiki"),
|
||||
domainLinks,
|
||||
GeneratorType.WIKI,
|
||||
DocumentClass.SIDELOAD,
|
||||
pubYear,
|
||||
10_000_000);
|
||||
|
||||
|
||||
if (doc.isProcessedFully()) {
|
||||
for (String url : List.of(
|
||||
STR."https://old.reddit.com/r/\{permalink}",
|
||||
STR."https://www.reddit.com/r/\{permalink}",
|
||||
STR."https://reddit.com/r/\{permalink}"
|
||||
)) {
|
||||
EdgeUrl.parse(url)
|
||||
.map(parsed -> anchorTextKeywords.getAnchorTextKeywords(domainLinks, parsed))
|
||||
.filter(parsed -> !parsed.isEmpty())
|
||||
.ifPresent(doc.words::addAnchorTerms);
|
||||
}
|
||||
|
||||
for (var keyword : extraKeywords) {
|
||||
doc.words.add(keyword, WordFlags.Subjects.asBit());
|
||||
}
|
||||
|
||||
// Insert topology information
|
||||
doc.details.metadata.withSizeAndTopology(50_000_000, score);
|
||||
}
|
||||
return ret;
|
||||
|
||||
|
||||
return doc;
|
||||
};
|
||||
}
|
||||
|
@ -105,6 +105,9 @@ public class StackexchangeSideloader implements SideloadSource {
|
||||
|
||||
StringBuilder fullHtml = new StringBuilder();
|
||||
fullHtml.append("<!DOCTYPE html><html><head><title>").append(post.title()).append("</title></head><body>");
|
||||
// Add a bogus script tag to make sure we get the JS flag
|
||||
fullHtml.append("<script src=\"https://www.example.com/dummy.js\" type=\"text/javascript\"></script>");
|
||||
|
||||
fullHtml.append("<p>").append(post.title()).append("</p>");
|
||||
for (var comment : post.bodies()) {
|
||||
fullHtml.append("<p>").append(comment).append("</p>");
|
||||
|
@ -7,6 +7,7 @@ import nu.marginalia.contenttype.DocumentBodyToString;
|
||||
import nu.marginalia.converting.model.GeneratorType;
|
||||
import nu.marginalia.converting.model.ProcessedDocument;
|
||||
import nu.marginalia.converting.model.ProcessedDomain;
|
||||
import nu.marginalia.converting.processor.DocumentClass;
|
||||
import nu.marginalia.converting.sideload.SideloadSource;
|
||||
import nu.marginalia.converting.sideload.SideloaderProcessing;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
@ -19,6 +20,7 @@ import org.slf4j.LoggerFactory;
|
||||
import java.io.IOException;
|
||||
import java.net.URISyntaxException;
|
||||
import java.nio.file.Path;
|
||||
import java.time.LocalDate;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.Objects;
|
||||
@ -130,8 +132,13 @@ public class WarcSideloader implements SideloadSource, AutoCloseable {
|
||||
}
|
||||
|
||||
return Optional.of(sideloaderProcessing
|
||||
.processDocument(url, body.get(), List.of(), new DomainLinks(),
|
||||
.processDocument(url,
|
||||
body.get(),
|
||||
List.of(),
|
||||
new DomainLinks(),
|
||||
GeneratorType.DOCS,
|
||||
DocumentClass.SIDELOAD,
|
||||
LocalDate.now().getYear(), // TODO: This should be the actual year of the document
|
||||
10_000));
|
||||
}
|
||||
|
||||
|
@ -20,8 +20,8 @@
|
||||
<th></th><th>Filename</th><th>Size</th><th>Last Modified</th>
|
||||
{{#each uploadDirContents.items}}
|
||||
<tr>
|
||||
<td><input {{#if directory}}disabled{{/if}} class="form-check-input" type="radio" name="source" id="{{name}}" value="{{name}}"></td>
|
||||
<td {{#if directory}}class="text-muted"{{/if}}>
|
||||
<td><input {{#unless isZim}}disabled{{/unless}} class="form-check-input" type="radio" name="source" id="{{name}}" value="{{name}}"></td>
|
||||
<td {{#unless isZim}}class="text-muted"{{/unless}}>
|
||||
<label class="form-check-label" for="{{name}}">{{name}}{{#if directory}}/{{/if}}</label>
|
||||
</td>
|
||||
<td>{{#unless directory}}{{size}}{{/unless}}</td>
|
||||
|
@ -13,8 +13,8 @@ information how to do this.
|
||||
<th></th><th>Filename</th><th>Size</th><th>Last Modified</th>
|
||||
{{#each uploadDirContents.items}}
|
||||
<tr>
|
||||
<td><input class="form-check-input" type="radio" name="source" id="{{name}}" value="{{name}}"></td>
|
||||
<td>
|
||||
<td><input {{#unless isStackexchange7z}}disabled{{/unless}} class="form-check-input" type="radio" name="source" id="{{name}}" value="{{name}}"></td>
|
||||
<td {{#unless isStackexchange7z}}class="text-muted"{{/unless}}>
|
||||
<label class="form-check-label" for="{{name}}">{{name}}{{#if directory}}/{{/if}}</label>
|
||||
</td>
|
||||
<td>{{#unless directory}}{{size}}{{/unless}}</td>
|
||||
|
@ -12,9 +12,9 @@ A warc export can be created using e.g. wget: <p>
|
||||
<th></th><th>Filename</th><th>Size</th><th>Last Modified</th>
|
||||
{{#each uploadDirContents.items}}
|
||||
<tr>
|
||||
<td><input class="form-check-input" type="radio" name="source" id="{{name}}" value="{{name}}"></td>
|
||||
<td><input {{#unless isWarc}}disabled{{/unless}} class="form-check-input" type="radio" name="source" id="{{name}}" value="{{name}}"></td>
|
||||
<td>
|
||||
<label class="form-check-label" for="{{name}}">{{name}}{{#if directory}}/{{/if}}</label>
|
||||
<label class="form-check-label{{#unless isWarc}} text-muted{{/unless}}" for="{{name}}">{{name}}{{#if directory}}/{{/if}}</label>
|
||||
</td>
|
||||
<td>{{#unless directory}}{{size}}{{/unless}}</td>
|
||||
<td title={{lastModifiedTime}}>{{shortTimestamp lastModifiedTime}}</td>
|
||||
|
Loading…
Reference in New Issue
Block a user