(converter) Integrate atags.parquet with the encyclopedia sideloader

Also clean up stackexchange and dirtree a bit.
This commit is contained in:
Viktor Lofgren 2023-11-06 18:03:01 +01:00
parent ebd10a5f28
commit e0c769fd19
7 changed files with 65 additions and 10 deletions

View File

@ -53,6 +53,9 @@ public class AnchorTagsSourceFactory {
// that needs to be loaded into the duckdb instance to a more manageable level, and keeps
// the memory footprint of the service down.
private List<EdgeDomain> getRelevantDomainsByNodeAffinity() {
if (dataSource == null)
return List.of();
try (var conn = dataSource.getConnection();
var stmt = conn.prepareStatement("""
SELECT DOMAIN_NAME

View File

@ -2,6 +2,7 @@ package nu.marginalia.converting.sideload;
import com.google.gson.Gson;
import com.google.inject.Inject;
import nu.marginalia.atags.source.AnchorTagsSourceFactory;
import nu.marginalia.converting.sideload.dirtree.DirtreeSideloaderFactory;
import nu.marginalia.converting.sideload.encyclopedia.EncyclopediaMarginaliaNuSideloader;
import nu.marginalia.converting.sideload.stackexchange.StackexchangeSideloader;
@ -19,6 +20,7 @@ public class SideloadSourceFactory {
private final SideloaderProcessing sideloaderProcessing;
private final ThreadLocalSentenceExtractorProvider sentenceExtractorProvider;
private final DocumentKeywordExtractor documentKeywordExtractor;
private final AnchorTagsSourceFactory anchorTagsSourceFactory;
private final DirtreeSideloaderFactory dirtreeSideloaderFactory;
@Inject
@ -26,16 +28,18 @@ public class SideloadSourceFactory {
SideloaderProcessing sideloaderProcessing,
ThreadLocalSentenceExtractorProvider sentenceExtractorProvider,
DocumentKeywordExtractor documentKeywordExtractor,
AnchorTagsSourceFactory anchorTagsSourceFactory,
DirtreeSideloaderFactory dirtreeSideloaderFactory) {
this.gson = gson;
this.sideloaderProcessing = sideloaderProcessing;
this.sentenceExtractorProvider = sentenceExtractorProvider;
this.documentKeywordExtractor = documentKeywordExtractor;
this.anchorTagsSourceFactory = anchorTagsSourceFactory;
this.dirtreeSideloaderFactory = dirtreeSideloaderFactory;
}
public SideloadSource sideloadEncyclopediaMarginaliaNu(Path pathToDbFile, String baseUrl) throws SQLException {
return new EncyclopediaMarginaliaNuSideloader(pathToDbFile, baseUrl, gson, sideloaderProcessing);
return new EncyclopediaMarginaliaNuSideloader(pathToDbFile, baseUrl, gson, anchorTagsSourceFactory, sideloaderProcessing);
}
public Collection<? extends SideloadSource> sideloadDirtree(Path pathToYamlFile) throws IOException {

View File

@ -2,6 +2,7 @@ package nu.marginalia.converting.sideload;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import nu.marginalia.atags.model.DomainLinks;
import nu.marginalia.converting.model.ProcessedDocument;
import nu.marginalia.converting.processor.plugin.HtmlDocumentProcessorPlugin;
import nu.marginalia.crawling.model.CrawledDocument;
@ -25,6 +26,7 @@ public class SideloaderProcessing {
public ProcessedDocument processDocument(String url,
String body,
List<String> extraKeywords,
DomainLinks domainLinks,
int size) throws URISyntaxException {
var crawledDoc = new CrawledDocument(
"encyclopedia.marginalia.nu",
@ -52,8 +54,15 @@ public class SideloaderProcessing {
ret.words.add(keyword, WordFlags.Subjects.asBit());
ret.details = details.details();
// FIXME (2023-11-06): For encyclopedia loading, this will likely only work when the domain specified is en.wikipedia.org
// We don't have access to the article name at this point to generate an equivalent URL... It's not a huge
// deal but something to keep in mind
int topology = domainLinks.countForUrl(new EdgeUrl(url));
ret.details.metadata = ret.details.metadata
.withSizeAndTopology(size, Math.max(0, 32 - url.length()) / 4);
.withSizeAndTopology(size, topology);
ret.url = new EdgeUrl(url);
ret.state = UrlIndexingState.OK;
ret.stateReason = "SIDELOAD";

View File

@ -1,6 +1,7 @@
package nu.marginalia.converting.sideload.dirtree;
import lombok.SneakyThrows;
import nu.marginalia.atags.model.DomainLinks;
import nu.marginalia.converting.model.ProcessedDocument;
import nu.marginalia.converting.model.ProcessedDomain;
import nu.marginalia.converting.sideload.SideloadSource;
@ -78,7 +79,7 @@ public class DirtreeSideloader implements SideloadSource, AutoCloseable {
}
return sideloaderProcessing
.processDocument(url, body, extraKeywords, 10_000);
.processDocument(url, body, extraKeywords, new DomainLinks(), 10_000);
}
@Override

View File

@ -3,13 +3,18 @@ package nu.marginalia.converting.sideload.encyclopedia;
import com.github.luben.zstd.ZstdInputStream;
import com.google.gson.Gson;
import lombok.SneakyThrows;
import nu.marginalia.atags.model.DomainLinks;
import nu.marginalia.atags.source.AnchorTagsSourceFactory;
import nu.marginalia.converting.model.DisqualifiedException;
import nu.marginalia.converting.model.ProcessedDocument;
import nu.marginalia.converting.model.ProcessedDomain;
import nu.marginalia.converting.sideload.SideloadSource;
import nu.marginalia.converting.sideload.SideloaderProcessing;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawl.DomainIndexingState;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.ByteArrayInputStream;
import java.io.IOException;
@ -38,10 +43,13 @@ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoC
private final EdgeUrl baseUrl;
private final Gson gson;
private final SideloaderProcessing sideloaderProcessing;
private final AnchorTagsSourceFactory anchorTagsSourceFactory;
private static final Logger logger = LoggerFactory.getLogger(EncyclopediaMarginaliaNuSideloader.class);
public EncyclopediaMarginaliaNuSideloader(Path pathToDbFile,
String baseUrl,
Gson gson,
AnchorTagsSourceFactory anchorTagsSourceFactory,
SideloaderProcessing sideloaderProcessing) throws SQLException {
this.baseUrl = EdgeUrl.parse(baseUrl).orElseThrow(AssertionError::new);
this.gson = gson;
@ -49,6 +57,7 @@ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoC
String sqliteDbString = "jdbc:sqlite:" + pathToDbFile.toString();
connection = DriverManager.getConnection(sqliteDbString);
this.anchorTagsSourceFactory = anchorTagsSourceFactory;
}
@ -72,6 +81,8 @@ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoC
ExecutorService executorService = Executors.newFixedThreadPool(16);
Semaphore sem = new Semaphore(16);
DomainLinks domainLinks = getDomainLinks();
executorService.submit(() -> {
try {
var stmt = connection.prepareStatement("""
@ -89,7 +100,7 @@ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoC
executorService.submit(() -> {
try {
docs.add(convertDocument(articleParts.parts, title, url));
docs.add(convertDocument(articleParts.parts, title, url, domainLinks));
} catch (URISyntaxException | DisqualifiedException e) {
e.printStackTrace();
} finally {
@ -122,9 +133,21 @@ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoC
};
}
/**
 * Fetches the anchor tag links for en.wikipedia.org.
 *
 * An anchor tags source restricted to the en.wikipedia.org domain is created
 * through the injected factory, queried once, and closed again before returning.
 *
 * @return the links reported by the source, or an empty {@link DomainLinks} if
 *         creating, querying or closing the source throws (the failure is logged)
 */
private DomainLinks getDomainLinks() {
    try {
        List<EdgeDomain> wikipediaOnly = List.of(new EdgeDomain("en.wikipedia.org"));

        try (var atagsSource = anchorTagsSourceFactory.create(wikipediaOnly)) {
            return atagsSource.getAnchorTags("en.wikipedia.org");
        }
    }
    catch (Exception ex) {
        // Best effort: anchor tag data is optional, so fall through to the empty default
        logger.error("Failed to create anchor tags source", ex);
    }

    return new DomainLinks();
}
ProcessedDocument processJust(String url) throws SQLException, IOException, URISyntaxException, DisqualifiedException {
var stmt = connection.prepareStatement("""
SELECT url,title,html FROM articles
SELECT url,title,html
FROM articles
WHERE url=?
""");
stmt.setFetchSize(100);
@ -135,12 +158,16 @@ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoC
var articleParts = fromCompressedJson(rs.getBytes("html"), ArticleParts.class);
String title = rs.getString("title");
return convertDocument(articleParts.parts, title, URLEncoder.encode(rs.getString("url"), StandardCharsets.UTF_8));
return convertDocument(articleParts.parts,
title,
URLEncoder.encode(rs.getString("url"), StandardCharsets.UTF_8),
new DomainLinks() // FIXME (2023-11-06): An empty DomainLinks is passed here; this single-document path does not load anchor tag data via getDomainLinks().
);
}
return null;
}
private ProcessedDocument convertDocument(List<String> parts, String title, String url) throws URISyntaxException, DisqualifiedException {
private ProcessedDocument convertDocument(List<String> parts, String title, String url, DomainLinks domainLinks) throws URISyntaxException, DisqualifiedException {
String fullUrl = baseUrl.toString() + url;
StringBuilder fullHtml = new StringBuilder();
@ -156,6 +183,7 @@ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoC
.processDocument(fullUrl,
fullHtml.toString(),
List.of("encyclopedia", "wiki"),
domainLinks,
10_000_000);
}

View File

@ -126,9 +126,12 @@ public class StackexchangeSideloader implements SideloadSource {
ret.details = new ProcessedDocumentDetails();
ret.details.pubYear = post.year();
ret.details.quality = 5;
ret.details.metadata = new DocumentMetadata(4,
ret.details.quality = 10;
ret.details.metadata = new DocumentMetadata(3,
PubDate.toYearByte(ret.details.pubYear), (int) -ret.details.quality, EnumSet.noneOf(DocumentFlags.class));
// The result must be assigned back: withSizeAndTopology is used as a copy-style
// "wither" elsewhere (SideloaderProcessing stores its return value), so calling it
// without assignment would discard the size/topology values.
ret.details.metadata = ret.details.metadata.withSizeAndTopology(10000, 0);
ret.details.features = EnumSet.noneOf(HtmlFeature.class);
ret.details.generator = GeneratorType.DOCS;
ret.details.title = StringUtils.truncate(post.title(), 128);

View File

@ -3,6 +3,8 @@ package nu.marginalia.converting.sideload.encyclopedia;
import com.google.inject.AbstractModule;
import com.google.inject.Guice;
import gnu.trove.list.array.TLongArrayList;
import nu.marginalia.atags.source.AnchorTagsSourceFactory;
import nu.marginalia.atags.model.DomainLinks;
import nu.marginalia.converting.ConverterModule;
import nu.marginalia.converting.model.DisqualifiedException;
import nu.marginalia.converting.processor.ConverterDomainTypes;
@ -30,6 +32,7 @@ import java.util.Map;
import java.util.Set;
import static org.junit.jupiter.api.Assertions.*;
import static org.mockito.Mockito.when;
class EncyclopediaMarginaliaNuSideloaderTest {
Path tempFile;
@ -68,7 +71,7 @@ class EncyclopediaMarginaliaNuSideloaderTest {
return;
}
var domainTypesMock = Mockito.mock(ConverterDomainTypes.class);
Mockito.when(domainTypesMock.isBlog(Mockito.any())).thenReturn(false);
when(domainTypesMock.isBlog(Mockito.any())).thenReturn(false);
var processing = Guice.createInjector(new ConverterModule(),
new AbstractModule() {
public void configure() {
@ -78,10 +81,14 @@ class EncyclopediaMarginaliaNuSideloaderTest {
)
.getInstance(SideloaderProcessing.class);
var atagsFactory = Mockito.mock(AnchorTagsSourceFactory.class);
when(atagsFactory.create(Mockito.any())).thenReturn(domain -> new DomainLinks());
var sideloader = new EncyclopediaMarginaliaNuSideloader(
pathToDbFile,
"https://en.wikipedia.org/wiki/",
GsonFactory.get(),
atagsFactory,
processing
);