(sideload) Fix sideloading so that sideloaded documents don't get disproportionately good rankings
Also add generator type flags so that e.g. Wikipedia shows up in the wikis filter.
This commit is contained in:
parent
e9a01caa5c
commit
e5cee1f46d
@ -3,15 +3,22 @@ package nu.marginalia.converting.sideload;
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import nu.marginalia.atags.model.DomainLinks;
|
||||
import nu.marginalia.converting.model.GeneratorType;
|
||||
import nu.marginalia.converting.model.ProcessedDocument;
|
||||
import nu.marginalia.converting.processor.plugin.HtmlDocumentProcessorPlugin;
|
||||
import nu.marginalia.crawling.model.CrawledDocument;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import nu.marginalia.model.crawl.HtmlFeature;
|
||||
import nu.marginalia.model.crawl.PubDate;
|
||||
import nu.marginalia.model.crawl.UrlIndexingState;
|
||||
import nu.marginalia.model.html.HtmlStandard;
|
||||
import nu.marginalia.model.idx.DocumentFlags;
|
||||
import nu.marginalia.model.idx.DocumentMetadata;
|
||||
import nu.marginalia.model.idx.WordFlags;
|
||||
|
||||
import java.net.URISyntaxException;
|
||||
import java.time.LocalDateTime;
|
||||
import java.util.EnumSet;
|
||||
import java.util.List;
|
||||
|
||||
@Singleton
|
||||
@ -27,6 +34,7 @@ public class SideloaderProcessing {
|
||||
String body,
|
||||
List<String> extraKeywords,
|
||||
DomainLinks domainLinks,
|
||||
GeneratorType type,
|
||||
int size) throws URISyntaxException {
|
||||
var crawledDoc = new CrawledDocument(
|
||||
"encyclopedia.marginalia.nu",
|
||||
@ -55,6 +63,27 @@ public class SideloaderProcessing {
|
||||
|
||||
ret.details = details.details();
|
||||
|
||||
// Add a few things that we know about the document
|
||||
// that we can't get from the sideloaded data since it's
|
||||
// so stripped down
|
||||
|
||||
ret.details.standard = HtmlStandard.HTML5;
|
||||
ret.details.pubYear = LocalDateTime.now().getYear();
|
||||
ret.details.features.add(HtmlFeature.JS);
|
||||
ret.details.features.add(HtmlFeature.TRACKING);
|
||||
ret.details.quality = -10;
|
||||
ret.details.generator = type;
|
||||
|
||||
ret.details.metadata = new DocumentMetadata(3,
|
||||
PubDate.toYearByte(ret.details.pubYear),
|
||||
(int) -ret.details.quality,
|
||||
switch (type) {
|
||||
case WIKI -> EnumSet.of(DocumentFlags.GeneratorWiki);
|
||||
case DOCS -> EnumSet.of(DocumentFlags.GeneratorDocs);
|
||||
default -> EnumSet.noneOf(DocumentFlags.class);
|
||||
});
|
||||
|
||||
|
||||
// FIXME (2023-11-06): For encyclopedia loading, this will likely only work when the domain specified is en.wikipedia.org
|
||||
// We don't have access to the article name at this point to generate an equivalent URL... It's not a huge
|
||||
// deal but something to keep in mind
|
||||
|
@ -2,6 +2,7 @@ package nu.marginalia.converting.sideload.dirtree;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.atags.model.DomainLinks;
|
||||
import nu.marginalia.converting.model.GeneratorType;
|
||||
import nu.marginalia.converting.model.ProcessedDocument;
|
||||
import nu.marginalia.converting.model.ProcessedDomain;
|
||||
import nu.marginalia.converting.sideload.SideloadSource;
|
||||
@ -79,7 +80,9 @@ public class DirtreeSideloader implements SideloadSource, AutoCloseable {
|
||||
}
|
||||
|
||||
return sideloaderProcessing
|
||||
.processDocument(url, body, extraKeywords, new DomainLinks(), 10_000);
|
||||
.processDocument(url, body, extraKeywords, new DomainLinks(),
|
||||
GeneratorType.DOCS,
|
||||
10_000);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -6,6 +6,7 @@ import lombok.SneakyThrows;
|
||||
import nu.marginalia.atags.model.DomainLinks;
|
||||
import nu.marginalia.atags.source.AnchorTagsSourceFactory;
|
||||
import nu.marginalia.converting.model.DisqualifiedException;
|
||||
import nu.marginalia.converting.model.GeneratorType;
|
||||
import nu.marginalia.converting.model.ProcessedDocument;
|
||||
import nu.marginalia.converting.model.ProcessedDomain;
|
||||
import nu.marginalia.converting.sideload.SideloadSource;
|
||||
@ -184,6 +185,7 @@ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoC
|
||||
fullHtml.toString(),
|
||||
List.of("encyclopedia", "wiki"),
|
||||
domainLinks,
|
||||
GeneratorType.WIKI,
|
||||
10_000_000);
|
||||
}
|
||||
|
||||
|
@ -126,9 +126,13 @@ public class StackexchangeSideloader implements SideloadSource {
|
||||
|
||||
ret.details = new ProcessedDocumentDetails();
|
||||
ret.details.pubYear = post.year();
|
||||
ret.details.quality = 10;
|
||||
ret.details.quality = -10;
|
||||
ret.details.metadata = new DocumentMetadata(3,
|
||||
PubDate.toYearByte(ret.details.pubYear), (int) -ret.details.quality, EnumSet.noneOf(DocumentFlags.class));
|
||||
PubDate.toYearByte(ret.details.pubYear),
|
||||
(int) -ret.details.quality,
|
||||
EnumSet.of(DocumentFlags.GeneratorDocs));
|
||||
ret.details.features.add(HtmlFeature.JS);
|
||||
ret.details.features.add(HtmlFeature.TRACKING);
|
||||
|
||||
ret.details.metadata.withSizeAndTopology(10000, 0);
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user