(converter) Integrate atags.parquet with the encyclopedia sideloader
Also clean up stackexchange and dirtree a bit.
This commit is contained in:
parent
ebd10a5f28
commit
e0c769fd19
@ -53,6 +53,9 @@ public class AnchorTagsSourceFactory {
|
||||
// that needs to be loaded into the duckdb instance to a more manageable level, and keeps
|
||||
// the memory footprint of the service down.
|
||||
private List<EdgeDomain> getRelevantDomainsByNodeAffinity() {
|
||||
if (dataSource == null)
|
||||
return List.of();
|
||||
|
||||
try (var conn = dataSource.getConnection();
|
||||
var stmt = conn.prepareStatement("""
|
||||
SELECT DOMAIN_NAME
|
||||
|
@ -2,6 +2,7 @@ package nu.marginalia.converting.sideload;
|
||||
|
||||
import com.google.gson.Gson;
|
||||
import com.google.inject.Inject;
|
||||
import nu.marginalia.atags.source.AnchorTagsSourceFactory;
|
||||
import nu.marginalia.converting.sideload.dirtree.DirtreeSideloaderFactory;
|
||||
import nu.marginalia.converting.sideload.encyclopedia.EncyclopediaMarginaliaNuSideloader;
|
||||
import nu.marginalia.converting.sideload.stackexchange.StackexchangeSideloader;
|
||||
@ -19,6 +20,7 @@ public class SideloadSourceFactory {
|
||||
private final SideloaderProcessing sideloaderProcessing;
|
||||
private final ThreadLocalSentenceExtractorProvider sentenceExtractorProvider;
|
||||
private final DocumentKeywordExtractor documentKeywordExtractor;
|
||||
private final AnchorTagsSourceFactory anchorTagsSourceFactory;
|
||||
private final DirtreeSideloaderFactory dirtreeSideloaderFactory;
|
||||
|
||||
@Inject
|
||||
@ -26,16 +28,18 @@ public class SideloadSourceFactory {
|
||||
SideloaderProcessing sideloaderProcessing,
|
||||
ThreadLocalSentenceExtractorProvider sentenceExtractorProvider,
|
||||
DocumentKeywordExtractor documentKeywordExtractor,
|
||||
AnchorTagsSourceFactory anchorTagsSourceFactory,
|
||||
DirtreeSideloaderFactory dirtreeSideloaderFactory) {
|
||||
this.gson = gson;
|
||||
this.sideloaderProcessing = sideloaderProcessing;
|
||||
this.sentenceExtractorProvider = sentenceExtractorProvider;
|
||||
this.documentKeywordExtractor = documentKeywordExtractor;
|
||||
this.anchorTagsSourceFactory = anchorTagsSourceFactory;
|
||||
this.dirtreeSideloaderFactory = dirtreeSideloaderFactory;
|
||||
}
|
||||
|
||||
public SideloadSource sideloadEncyclopediaMarginaliaNu(Path pathToDbFile, String baseUrl) throws SQLException {
|
||||
return new EncyclopediaMarginaliaNuSideloader(pathToDbFile, baseUrl, gson, sideloaderProcessing);
|
||||
return new EncyclopediaMarginaliaNuSideloader(pathToDbFile, baseUrl, gson, anchorTagsSourceFactory, sideloaderProcessing);
|
||||
}
|
||||
|
||||
public Collection<? extends SideloadSource> sideloadDirtree(Path pathToYamlFile) throws IOException {
|
||||
|
@ -2,6 +2,7 @@ package nu.marginalia.converting.sideload;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import nu.marginalia.atags.model.DomainLinks;
|
||||
import nu.marginalia.converting.model.ProcessedDocument;
|
||||
import nu.marginalia.converting.processor.plugin.HtmlDocumentProcessorPlugin;
|
||||
import nu.marginalia.crawling.model.CrawledDocument;
|
||||
@ -25,6 +26,7 @@ public class SideloaderProcessing {
|
||||
public ProcessedDocument processDocument(String url,
|
||||
String body,
|
||||
List<String> extraKeywords,
|
||||
DomainLinks domainLinks,
|
||||
int size) throws URISyntaxException {
|
||||
var crawledDoc = new CrawledDocument(
|
||||
"encyclopedia.marginalia.nu",
|
||||
@ -52,8 +54,15 @@ public class SideloaderProcessing {
|
||||
ret.words.add(keyword, WordFlags.Subjects.asBit());
|
||||
|
||||
ret.details = details.details();
|
||||
|
||||
// FIXME (2023-11-06): For encyclopedia loading, this will likely only work when the domain specified is en.wikipedia.org
|
||||
// We don't have access to the article name at this point to generate an equivalent URL... It's not a huge
|
||||
// deal but something to keep in mind
|
||||
int topology = domainLinks.countForUrl(new EdgeUrl(url));
|
||||
|
||||
ret.details.metadata = ret.details.metadata
|
||||
.withSizeAndTopology(size, Math.max(0, 32 - url.length()) / 4);
|
||||
.withSizeAndTopology(size, topology);
|
||||
|
||||
ret.url = new EdgeUrl(url);
|
||||
ret.state = UrlIndexingState.OK;
|
||||
ret.stateReason = "SIDELOAD";
|
||||
|
@ -1,6 +1,7 @@
|
||||
package nu.marginalia.converting.sideload.dirtree;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.atags.model.DomainLinks;
|
||||
import nu.marginalia.converting.model.ProcessedDocument;
|
||||
import nu.marginalia.converting.model.ProcessedDomain;
|
||||
import nu.marginalia.converting.sideload.SideloadSource;
|
||||
@ -78,7 +79,7 @@ public class DirtreeSideloader implements SideloadSource, AutoCloseable {
|
||||
}
|
||||
|
||||
return sideloaderProcessing
|
||||
.processDocument(url, body, extraKeywords, 10_000);
|
||||
.processDocument(url, body, extraKeywords, new DomainLinks(), 10_000);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -3,13 +3,18 @@ package nu.marginalia.converting.sideload.encyclopedia;
|
||||
import com.github.luben.zstd.ZstdInputStream;
|
||||
import com.google.gson.Gson;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.atags.model.DomainLinks;
|
||||
import nu.marginalia.atags.source.AnchorTagsSourceFactory;
|
||||
import nu.marginalia.converting.model.DisqualifiedException;
|
||||
import nu.marginalia.converting.model.ProcessedDocument;
|
||||
import nu.marginalia.converting.model.ProcessedDomain;
|
||||
import nu.marginalia.converting.sideload.SideloadSource;
|
||||
import nu.marginalia.converting.sideload.SideloaderProcessing;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import nu.marginalia.model.crawl.DomainIndexingState;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.IOException;
|
||||
@ -38,10 +43,13 @@ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoC
|
||||
private final EdgeUrl baseUrl;
|
||||
private final Gson gson;
|
||||
private final SideloaderProcessing sideloaderProcessing;
|
||||
private final AnchorTagsSourceFactory anchorTagsSourceFactory;
|
||||
private static final Logger logger = LoggerFactory.getLogger(EncyclopediaMarginaliaNuSideloader.class);
|
||||
|
||||
public EncyclopediaMarginaliaNuSideloader(Path pathToDbFile,
|
||||
String baseUrl,
|
||||
Gson gson,
|
||||
AnchorTagsSourceFactory anchorTagsSourceFactory,
|
||||
SideloaderProcessing sideloaderProcessing) throws SQLException {
|
||||
this.baseUrl = EdgeUrl.parse(baseUrl).orElseThrow(AssertionError::new);
|
||||
this.gson = gson;
|
||||
@ -49,6 +57,7 @@ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoC
|
||||
String sqliteDbString = "jdbc:sqlite:" + pathToDbFile.toString();
|
||||
|
||||
connection = DriverManager.getConnection(sqliteDbString);
|
||||
this.anchorTagsSourceFactory = anchorTagsSourceFactory;
|
||||
|
||||
}
|
||||
|
||||
@ -72,6 +81,8 @@ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoC
|
||||
ExecutorService executorService = Executors.newFixedThreadPool(16);
|
||||
Semaphore sem = new Semaphore(16);
|
||||
|
||||
DomainLinks domainLinks = getDomainLinks();
|
||||
|
||||
executorService.submit(() -> {
|
||||
try {
|
||||
var stmt = connection.prepareStatement("""
|
||||
@ -89,7 +100,7 @@ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoC
|
||||
|
||||
executorService.submit(() -> {
|
||||
try {
|
||||
docs.add(convertDocument(articleParts.parts, title, url));
|
||||
docs.add(convertDocument(articleParts.parts, title, url, domainLinks));
|
||||
} catch (URISyntaxException | DisqualifiedException e) {
|
||||
e.printStackTrace();
|
||||
} finally {
|
||||
@ -122,9 +133,21 @@ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoC
|
||||
};
|
||||
}
|
||||
|
||||
private DomainLinks getDomainLinks() {
|
||||
try (var source = anchorTagsSourceFactory.create(List.of(new EdgeDomain("en.wikipedia.org")))) {
|
||||
return source.getAnchorTags("en.wikipedia.org");
|
||||
}
|
||||
catch (Exception ex) {
|
||||
logger.error("Failed to create anchor tags source", ex);
|
||||
return new DomainLinks();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
ProcessedDocument processJust(String url) throws SQLException, IOException, URISyntaxException, DisqualifiedException {
|
||||
var stmt = connection.prepareStatement("""
|
||||
SELECT url,title,html FROM articles
|
||||
SELECT url,title,html
|
||||
FROM articles
|
||||
WHERE url=?
|
||||
""");
|
||||
stmt.setFetchSize(100);
|
||||
@ -135,12 +158,16 @@ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoC
|
||||
var articleParts = fromCompressedJson(rs.getBytes("html"), ArticleParts.class);
|
||||
String title = rs.getString("title");
|
||||
|
||||
return convertDocument(articleParts.parts, title, URLEncoder.encode(rs.getString("url"), StandardCharsets.UTF_8));
|
||||
return convertDocument(articleParts.parts,
|
||||
title,
|
||||
URLEncoder.encode(rs.getString("url"), StandardCharsets.UTF_8),
|
||||
new DomainLinks() // FIXME (2023-11-06): No anchor tag data is loaded on this single-document path, so an empty DomainLinks is passed.
|
||||
);
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
private ProcessedDocument convertDocument(List<String> parts, String title, String url) throws URISyntaxException, DisqualifiedException {
|
||||
private ProcessedDocument convertDocument(List<String> parts, String title, String url, DomainLinks domainLinks) throws URISyntaxException, DisqualifiedException {
|
||||
String fullUrl = baseUrl.toString() + url;
|
||||
|
||||
StringBuilder fullHtml = new StringBuilder();
|
||||
@ -156,6 +183,7 @@ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoC
|
||||
.processDocument(fullUrl,
|
||||
fullHtml.toString(),
|
||||
List.of("encyclopedia", "wiki"),
|
||||
domainLinks,
|
||||
10_000_000);
|
||||
}
|
||||
|
||||
|
@ -126,9 +126,12 @@ public class StackexchangeSideloader implements SideloadSource {
|
||||
|
||||
ret.details = new ProcessedDocumentDetails();
|
||||
ret.details.pubYear = post.year();
|
||||
ret.details.quality = 5;
|
||||
ret.details.metadata = new DocumentMetadata(4,
|
||||
ret.details.quality = 10;
|
||||
ret.details.metadata = new DocumentMetadata(3,
|
||||
PubDate.toYearByte(ret.details.pubYear), (int) -ret.details.quality, EnumSet.noneOf(DocumentFlags.class));
|
||||
|
||||
// Keep the returned value: withSizeAndTopology appears to return the updated metadata
// rather than modifying it in place, so an unassigned call would discard the size/topology.
ret.details.metadata = ret.details.metadata.withSizeAndTopology(10000, 0);
|
||||
|
||||
ret.details.features = EnumSet.noneOf(HtmlFeature.class);
|
||||
ret.details.generator = GeneratorType.DOCS;
|
||||
ret.details.title = StringUtils.truncate(post.title(), 128);
|
||||
|
@ -3,6 +3,8 @@ package nu.marginalia.converting.sideload.encyclopedia;
|
||||
import com.google.inject.AbstractModule;
|
||||
import com.google.inject.Guice;
|
||||
import gnu.trove.list.array.TLongArrayList;
|
||||
import nu.marginalia.atags.source.AnchorTagsSourceFactory;
|
||||
import nu.marginalia.atags.model.DomainLinks;
|
||||
import nu.marginalia.converting.ConverterModule;
|
||||
import nu.marginalia.converting.model.DisqualifiedException;
|
||||
import nu.marginalia.converting.processor.ConverterDomainTypes;
|
||||
@ -30,6 +32,7 @@ import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.*;
|
||||
import static org.mockito.Mockito.when;
|
||||
|
||||
class EncyclopediaMarginaliaNuSideloaderTest {
|
||||
Path tempFile;
|
||||
@ -68,7 +71,7 @@ class EncyclopediaMarginaliaNuSideloaderTest {
|
||||
return;
|
||||
}
|
||||
var domainTypesMock = Mockito.mock(ConverterDomainTypes.class);
|
||||
Mockito.when(domainTypesMock.isBlog(Mockito.any())).thenReturn(false);
|
||||
when(domainTypesMock.isBlog(Mockito.any())).thenReturn(false);
|
||||
var processing = Guice.createInjector(new ConverterModule(),
|
||||
new AbstractModule() {
|
||||
public void configure() {
|
||||
@ -78,10 +81,14 @@ class EncyclopediaMarginaliaNuSideloaderTest {
|
||||
)
|
||||
.getInstance(SideloaderProcessing.class);
|
||||
|
||||
var atagsFactory = Mockito.mock(AnchorTagsSourceFactory.class);
|
||||
when(atagsFactory.create(Mockito.any())).thenReturn(domain -> new DomainLinks());
|
||||
|
||||
var sideloader = new EncyclopediaMarginaliaNuSideloader(
|
||||
pathToDbFile,
|
||||
"https://en.wikipedia.org/wiki/",
|
||||
GsonFactory.get(),
|
||||
atagsFactory,
|
||||
processing
|
||||
);
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user