(sideload) Fix sideloading so that it doesn't get disproportionately good rankings

Also add type flags so that, e.g., Wikipedia shows up in the wikis filter.
This commit is contained in:
Viktor Lofgren 2023-11-12 14:56:26 +01:00
parent e9a01caa5c
commit e5cee1f46d
4 changed files with 41 additions and 3 deletions

View File

@ -3,15 +3,22 @@ package nu.marginalia.converting.sideload;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import nu.marginalia.atags.model.DomainLinks;
import nu.marginalia.converting.model.GeneratorType;
import nu.marginalia.converting.model.ProcessedDocument;
import nu.marginalia.converting.processor.plugin.HtmlDocumentProcessorPlugin;
import nu.marginalia.crawling.model.CrawledDocument;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawl.HtmlFeature;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.model.crawl.UrlIndexingState;
import nu.marginalia.model.html.HtmlStandard;
import nu.marginalia.model.idx.DocumentFlags;
import nu.marginalia.model.idx.DocumentMetadata;
import nu.marginalia.model.idx.WordFlags;
import java.net.URISyntaxException;
import java.time.LocalDateTime;
import java.util.EnumSet;
import java.util.List;
@Singleton
@ -27,6 +34,7 @@ public class SideloaderProcessing {
String body,
List<String> extraKeywords,
DomainLinks domainLinks,
GeneratorType type,
int size) throws URISyntaxException {
var crawledDoc = new CrawledDocument(
"encyclopedia.marginalia.nu",
@ -55,6 +63,27 @@ public class SideloaderProcessing {
ret.details = details.details();
// Add a few things that we know about the document
// that we can't get from the sideloaded data since it's
// so stripped down
ret.details.standard = HtmlStandard.HTML5;
ret.details.pubYear = LocalDateTime.now().getYear();
ret.details.features.add(HtmlFeature.JS);
ret.details.features.add(HtmlFeature.TRACKING);
ret.details.quality = -10;
ret.details.generator = type;
ret.details.metadata = new DocumentMetadata(3,
PubDate.toYearByte(ret.details.pubYear),
(int) -ret.details.quality,
switch (type) {
case WIKI -> EnumSet.of(DocumentFlags.GeneratorWiki);
case DOCS -> EnumSet.of(DocumentFlags.GeneratorDocs);
default -> EnumSet.noneOf(DocumentFlags.class);
});
// FIXME (2023-11-06): For encyclopedia loading, this will likely only work when the domain specified is en.wikipedia.org
// We don't have access to the article name at this point to generate an equivalent URL... It's not a huge
// deal but something to keep in mind

View File

@ -2,6 +2,7 @@ package nu.marginalia.converting.sideload.dirtree;
import lombok.SneakyThrows;
import nu.marginalia.atags.model.DomainLinks;
import nu.marginalia.converting.model.GeneratorType;
import nu.marginalia.converting.model.ProcessedDocument;
import nu.marginalia.converting.model.ProcessedDomain;
import nu.marginalia.converting.sideload.SideloadSource;
@ -79,7 +80,9 @@ public class DirtreeSideloader implements SideloadSource, AutoCloseable {
}
return sideloaderProcessing
.processDocument(url, body, extraKeywords, new DomainLinks(), 10_000);
.processDocument(url, body, extraKeywords, new DomainLinks(),
GeneratorType.DOCS,
10_000);
}
@Override

View File

@ -6,6 +6,7 @@ import lombok.SneakyThrows;
import nu.marginalia.atags.model.DomainLinks;
import nu.marginalia.atags.source.AnchorTagsSourceFactory;
import nu.marginalia.converting.model.DisqualifiedException;
import nu.marginalia.converting.model.GeneratorType;
import nu.marginalia.converting.model.ProcessedDocument;
import nu.marginalia.converting.model.ProcessedDomain;
import nu.marginalia.converting.sideload.SideloadSource;
@ -184,6 +185,7 @@ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoC
fullHtml.toString(),
List.of("encyclopedia", "wiki"),
domainLinks,
GeneratorType.WIKI,
10_000_000);
}

View File

@ -126,9 +126,13 @@ public class StackexchangeSideloader implements SideloadSource {
ret.details = new ProcessedDocumentDetails();
ret.details.pubYear = post.year();
ret.details.quality = 10;
ret.details.quality = -10;
ret.details.metadata = new DocumentMetadata(3,
PubDate.toYearByte(ret.details.pubYear), (int) -ret.details.quality, EnumSet.noneOf(DocumentFlags.class));
PubDate.toYearByte(ret.details.pubYear),
(int) -ret.details.quality,
EnumSet.of(DocumentFlags.GeneratorDocs));
ret.details.features.add(HtmlFeature.JS);
ret.details.features.add(HtmlFeature.TRACKING);
ret.details.metadata.withSizeAndTopology(10000, 0);