(converter) Integrate zim->db conversion into automatic encyclopedia processing workflow

Previously, in order to load encyclopedia data into the search engine, it was necessary to use the encyclopedia.marginalia.nu converter to first create a .db file.  This isn't very ergonomic, so parts of that codebase were lifted in as a third-party library, and conversion from .zim to .db is now done automatically.

The output file name is based on the original file name, plus a CRC32 hash and a .db extension, so that the converted data can be reused on repeat loads.
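In code, the derived name is just the source path with the hash and extension appended; a minimal sketch (the hash itself is computed by ConvertActor.getCrc32FileHash, shown further down):

    String deriveDbName(String zimPath, String crc32Hex) {
        // e.g. "wikipedia_en_100.zim" + "1a2b3c4d" -> "wikipedia_en_100.zim.1a2b3c4d.db"
        return zimPath + "." + crc32Hex + ".db";
    }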
Viktor Lofgren 2024-01-19 13:59:03 +01:00
parent 22c8fb3f59
commit 27ffb8fa8a
21 changed files with 895 additions and 19 deletions

View File

@ -2,6 +2,8 @@ package nu.marginalia.loading;
import nu.marginalia.io.processed.ProcessedDataFileNames;
import nu.marginalia.worklog.BatchingWorkLogInspector;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.nio.file.Path;
@ -10,13 +12,21 @@ import java.util.*;
public class LoaderInputData {
private final List<Path> sourceDirectories;
private static final Logger logger = LoggerFactory.getLogger(LoaderInputData.class);
private final Map<Path, Integer> lastGoodBatch = new HashMap<>();
public LoaderInputData(List<Path> sourceDirectories) throws IOException {
this.sourceDirectories = sourceDirectories;
for (var source : sourceDirectories) {
lastGoodBatch.put(source, BatchingWorkLogInspector.getValidBatches(source.resolve("processor.log")));
int lastGoodBatch = BatchingWorkLogInspector.getValidBatches(source.resolve("processor.log"));
this.lastGoodBatch.put(source, lastGoodBatch);
if (lastGoodBatch == 0) {
// This is useful diagnostic information, so we log it as a warning
logger.warn("No valid batches found in {}", source);
}
}
}

View File

@ -1,11 +1,17 @@
<h1 class="my-3">Sideload Encyclopedia</h1>
<div class="my-3 p-3 border bg-light">
<p>This will sideload a pre-converted MediaWiki-style OpenZim data set.
See the <a href="https://github.com/MarginaliaSearch/MarginaliaSearch/blob/master/doc/sideloading-howto.md">sideloading howto</a>
for instructions on how to produce this file. </p>
<p>Place an articles.db file in the upload directory on the server, and select it from the list
below. </p>
<p>This will side-load a MediaWiki-style OpenZim data set. Place a zim file in the uploads directory.
For Wikipedia, the zim file can be downloaded from <a href="https://download.kiwix.org/zim/wikipedia/">https://download.kiwix.org/zim/wikipedia/</a>.
The en_all_nopic sets are recommended for Wikipedia, since they are smaller and do not contain images
(which are not used anyway). For testing, the _mini or _en_100 sets are good choices.</p>
<p>The zim file will be converted to a sqlite database (.db-file) with a name similar to
that of the zim file, which is then automatically turned into processed data.</p>
<p>Since the first stage of processing is very time-consuming, the sqlite database can
also be loaded from this form.</p>
</div>
<form method="post" action="actions/sideload-encyclopedia" onsubmit="return confirm('Confirm sideloading')">
<div class="my-3 py-3">

View File

@ -45,6 +45,7 @@ dependencies {
implementation project(':code:api:query-api')
implementation project(':code:api:process-mqapi')
implementation project(':code:api:executor-api')
implementation project(':third-party:encyclopedia-marginalia-nu')
implementation libs.bundles.slf4j

View File

@ -7,6 +7,7 @@ import nu.marginalia.actor.prototype.RecordActorPrototype;
import nu.marginalia.actor.state.ActorResumeBehavior;
import nu.marginalia.actor.state.ActorStep;
import nu.marginalia.actor.state.Resume;
import nu.marginalia.encyclopedia.EncyclopediaConverter;
import nu.marginalia.process.ProcessOutboxes;
import nu.marginalia.process.ProcessService;
import nu.marginalia.storage.FileStorageService;
@ -16,21 +17,27 @@ import nu.marginalia.storage.model.FileStorageState;
import nu.marginalia.storage.model.FileStorageType;
import nu.marginalia.mq.MqMessageState;
import nu.marginalia.mq.outbox.MqOutbox;
import nu.marginalia.mqapi.converting.ConvertAction;
import nu.marginalia.mqapi.converting.ConvertRequest;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.zip.CRC32;
@Singleton
public class ConvertActor extends RecordActorPrototype {
private static final Logger logger = LoggerFactory.getLogger(ConvertActor.class);
private final ActorProcessWatcher processWatcher;
private final MqOutbox mqConverterOutbox;
private final FileStorageService storageService;
private final Gson gson;
public record Convert(FileStorageId fid) implements ActorStep {};
public record ConvertEncyclopedia(String source, String baseUrl) implements ActorStep {};
public record PredigestEncyclopedia(String source, String dest, String baseUrl) implements ActorStep {};
public record ConvertDirtree(String source) implements ActorStep {};
public record ConvertWarc(String source) implements ActorStep {};
public record ConvertStackexchange(String source) implements ActorStep {};
@ -100,6 +107,19 @@ public class ConvertActor extends RecordActorPrototype {
if (!Files.exists(sourcePath))
yield new Error("Source path does not exist: " + sourcePath);
if (source.toLowerCase().endsWith(".zim")) {
// If we're fed a ZIM file, we need to convert it to a sqlite database first
String hash = getCrc32FileHash(sourcePath);
// To avoid re-converting the same file, we'll assign the file a name based on its hash
// and the original filename. This way, if we're fed the same file again, we'll be able to just
// re-use the predigested database file.
yield new PredigestEncyclopedia(source, STR."\{source}.\{hash}.db", baseUrl);
} else if (!source.endsWith(".db")) {
yield new Error("Source path must be a ZIM or pre-digested sqlite database file (.db)");
}
String fileName = sourcePath.toFile().getName();
var base = storageService.getStorageBase(FileStorageBaseType.STORAGE);
@ -114,6 +134,36 @@ public class ConvertActor extends RecordActorPrototype {
mqConverterOutbox.sendAsync(ConvertRequest.forEncyclopedia(sourcePath, baseUrl, processedArea.id()))
);
}
case PredigestEncyclopedia(String source, String dest, String baseUrl) -> {
Path sourcePath = Path.of(source);
if (!Files.exists(sourcePath)) {
yield new Error("Source path does not exist: " + sourcePath);
}
Path destPath = Path.of(dest);
if (Files.exists(destPath)) {
// Already predigested, go straight to convert step
yield new ConvertEncyclopedia(dest, baseUrl);
}
Path tempFile = Files.createTempFile(destPath.getParent(), "encyclopedia", "db.tmp");
try {
EncyclopediaConverter.convert(sourcePath, tempFile);
Files.move(tempFile, destPath);
}
catch (Exception e) {
logger.error("Failed to convert ZIM file to sqlite database", e);
Files.deleteIfExists(tempFile);
Files.deleteIfExists(destPath);
yield new Error("Failed to convert ZIM file to sqlite database: " + e.getMessage());
}
// Go back to convert step with the new database file
yield new ConvertEncyclopedia(dest, baseUrl);
}
case ConvertStackexchange(String source) -> {
Path sourcePath = Path.of(source);
@ -150,6 +200,22 @@ public class ConvertActor extends RecordActorPrototype {
};
}
private String getCrc32FileHash(Path file) throws IOException {
ByteBuffer buffer = ByteBuffer.allocate(8192);
try (var channel = Files.newByteChannel(file)) {
CRC32 crc = new CRC32();
while (channel.read(buffer) > 0) {
buffer.flip();
crc.update(buffer);
buffer.clear();
}
return Long.toHexString(crc.getValue());
}
}
@Override
public String describe() {
return "Convert a set of crawl data into a format suitable for loading into the database.";
@ -165,6 +231,5 @@ public class ConvertActor extends RecordActorPrototype {
this.processWatcher = processWatcher;
this.mqConverterOutbox = processOutboxes.getConverterOutbox();
this.storageService = storageService;
this.gson = gson;
}
}

View File

@ -99,6 +99,7 @@ include 'third-party:monkey-patch-opennlp'
include 'third-party:monkey-patch-gson'
include 'third-party:commons-codec'
include 'third-party:parquet-floor'
include 'third-party:encyclopedia-marginalia-nu'
dependencyResolutionManagement {

View File

@ -9,9 +9,9 @@ or lack an artifact, or to override some default that is inappropriate for the t
* [RDRPosTagger](rdrpostagger/) - GPL3
* [PorterStemmer](porterstemmer/) - LGPL3
* [Uppend](uppend/) - MIT
* [OpenZIM](openzim/) - GPL-2.0
* [OpenZIM](openzim/) - GPL-2.0+
* [Commons Codec](commons-codec/) - Apache 2.0
* [encyclopedia.marginalia.nu](encyclopedia-marginalia-nu/) - GPL-2.0+
### Repackaged
* [SymSpell](symspell/) - LGPL-3.0
* [Count-Min-Sketch](count-min-sketch/) - Apache 2.0

View File

@ -0,0 +1,26 @@
plugins {
id 'java'
}
java {
toolchain {
languageVersion.set(JavaLanguageVersion.of(21))
}
}
dependencies {
implementation libs.jsoup
implementation libs.notnull
implementation libs.bundles.gson
implementation libs.zstd
implementation libs.bundles.slf4j
implementation project(':code:libraries:blocking-thread-pool')
implementation project(':third-party:xz')
implementation project(':third-party:openzim')
}
test {
useJUnitPlatform()
}

View File

@ -0,0 +1,5 @@
This package contains a severely stripped-down version of the codebase from
[encyclopedia.marginalia.nu](https://encyclopedia.marginalia.nu/).
The extracted code is a ZimFile reader and WikiHTML cleaner. It is used by the
encyclopedia side-loader.
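A minimal usage sketch (the file paths are hypothetical):

    import nu.marginalia.encyclopedia.EncyclopediaConverter;
    import java.nio.file.Path;

    // Convert a ZIM archive to the sqlite article database used by the side-loader;
    // throws IOException, SQLException, or InterruptedException on failure
    EncyclopediaConverter.convert(
            Path.of("/uploads/wikipedia_en_100.zim"),  // input (hypothetical)
            Path.of("/uploads/wikipedia_en_100.db"));  // output (hypothetical)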

View File

@ -0,0 +1,67 @@
package nu.marginalia.encyclopedia;
import nu.marginalia.encyclopedia.cleaner.WikiCleaner;
import nu.marginalia.encyclopedia.store.ArticleDbProvider;
import nu.marginalia.encyclopedia.store.ArticleStoreWriter;
import nu.marginalia.util.SimpleBlockingThreadPool;
import org.openzim.ZIMTypes.ZIMFile;
import org.openzim.ZIMTypes.ZIMReader;
import org.slf4j.LoggerFactory;
import org.slf4j.Logger;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.sql.SQLException;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.function.BiConsumer;
import java.util.function.Predicate;
/** Converts an OpenZim file with Wikipedia articles to a SQLite database
* with cleaned-up MediaWiki HTML
*/
public class EncyclopediaConverter {
private static final Logger logger = LoggerFactory.getLogger(EncyclopediaConverter.class);
public static void convert(Path inputFile, Path outputFile) throws IOException, SQLException, InterruptedException {
var wc = new WikiCleaner();
var pool = new SimpleBlockingThreadPool("Convert ZIM",
Math.clamp(Runtime.getRuntime().availableProcessors() - 2, 1, 32),
2);
var size = new AtomicInteger();
if (!Files.exists(inputFile)) {
throw new IllegalStateException("ZIM file not found: " + inputFile);
}
Files.deleteIfExists(outputFile);
try (var asw = new ArticleStoreWriter(new ArticleDbProvider(outputFile))) {
Predicate<Integer> keepGoing = (s) -> true;
BiConsumer<String, String> handleArticle = (url, html) -> {
if (pool.isTerminated())
return;
pool.submitQuietly(() -> {
int sz = size.incrementAndGet();
if (sz % 1000 == 0) {
System.out.printf("\u001b[2K\r%d", sz);
}
asw.add(wc.cleanWikiJunk(url, html));
});
};
new ZIMReader(new ZIMFile(inputFile.toString())).forEachArticles(handleArticle, keepGoing);
pool.shutDown();
logger.info("Waiting for pool to finish");
while (!pool.awaitTermination(1, TimeUnit.SECONDS)) {
// ...
}
}
}
}

View File

@ -0,0 +1,60 @@
package nu.marginalia.encyclopedia.cleaner;
import lombok.Builder;
import org.jsoup.nodes.Comment;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.select.NodeFilter;
import java.util.Set;
import java.util.function.Predicate;
import java.util.regex.Pattern;
@Builder
public class CleanerFilter implements NodeFilter {
final Set<String> badTags;
final Set<String> badIds;
final Set<String> badClasses;
final Set<Predicate<Element>> predicates;
private static final Pattern spacePattern = Pattern.compile("\\s+");
@Override
public FilterResult head(Node node, int depth) {
if (node instanceof Element el) {
if (badTags != null && badTags.contains(el.tagName()))
return FilterResult.REMOVE;
if (badIds != null && badIds.contains(el.id()))
return FilterResult.REMOVE;
if (badClasses != null) {
String className = el.className();
if (className.contains(" ")) {
String[] parts = spacePattern.split(className);
for (var c : parts) {
if (badClasses.contains(c))
return FilterResult.REMOVE;
}
}
else if (badClasses.contains(className)) {
return FilterResult.REMOVE;
}
}
if (predicates != null) {
for (var pred : predicates) {
if (pred.test(el))
return FilterResult.REMOVE;
}
}
}
if (node instanceof Comment) {
return FilterResult.REMOVE;
}
return FilterResult.CONTINUE;
}
}

View File

@ -0,0 +1,329 @@
package nu.marginalia.encyclopedia.cleaner;
import nu.marginalia.encyclopedia.cleaner.model.ArticleData;
import nu.marginalia.encyclopedia.cleaner.model.ArticleParts;
import nu.marginalia.encyclopedia.model.Article;
import nu.marginalia.encyclopedia.model.Link;
import nu.marginalia.encyclopedia.model.LinkList;
import org.jetbrains.annotations.NotNull;
import org.jsoup.Jsoup;
import org.jsoup.nodes.*;
import org.jsoup.select.Elements;
import java.util.ArrayList;
import java.util.List;
import java.util.Optional;
import java.util.Set;
public class WikiCleaner {
private static final String licenseFooter = "This article is issued from Wikipedia. The text is licensed under Creative Commons - Attribution - Sharealike. Additional terms may apply for the media files.";
public ArticleData cleanWikiJunk(String url, String html) {
return cleanWikiJunk(url, Jsoup.parse(html));
}
private boolean isPresentationRole(Element el) {
return "presentation".equals(el.attr("role"));
}
private boolean isLicenseFooter(Element el) {
// We'll add our own later
if ("div".equals(el.tagName())) {
return licenseFooter.equals(el.wholeOwnText().trim());
}
return false;
}
public ArticleData cleanWikiJunk(String url, Document doc) {
if (doc.getElementById("content") == null) {
return null;
}
List<Link> disambig = getDisambiguationLinks(doc);
List<Link> topLinks = getWikiPageLinks(doc);
doc.filter(CleanerFilter.builder()
.badClasses(Set.of("infobox", "collapsible", "navbar", "printfooter",
"mw-editsection", "thumb", "sidebar", "navbox", "mw-jump-link",
"vertical-navbox", "mw-indicators", "noprint", "sistersitebox",
"BarChartTemplate"))
.badIds(Set.of("coordinates", "mw-page-base", "mw-head-base", "site-notice", "contentSub", "contentSub2"))
.badTags(Set.of("footer", "script", "object", "embed", "audio", "style", "noscript", "link", "meta", "img"))
.predicates(Set.of(this::isPresentationRole, this::isLicenseFooter))
.build());
doc.getElementsByTag("a").forEach(tag -> {
var href = tag.attr("href");
var parent = tag.parent();
if (null != parent && "li".equals(parent.tagName())) {
tag.removeAttr("title");
if (href.startsWith("http://")) {
tag.addClass("extern-link");
tag.attr("rel", "nofollow");
}
} else {
tag.replaceWith(new TextNode(tag.text()));
}
});
doc.getElementsByTag("cite").tagName("span");
doc.filter(CleanerFilter.builder()
.badIds(Set.of("toc", "catlinks", "Notes", "mw-navigation", "mw-data-after-content", "jump-to-nav"))
.badClasses(Set.of("mw-references-wrap", "references", "reference", "siteSub", "refbegin"))
.build()
);
doc.getAllElements().forEach(elem -> {
if (elem.parent() != null
&& "summary".equals(elem.parent().tagName()))
{
elem.parent().replaceWith(elem);
}
});
doc.getElementsByClass("mwe-math-element").forEach(mathSpan -> {
var mathTag = mathSpan.getElementsByTag("math").first();
if (mathTag != null) {
mathSpan.replaceWith(mathTag);
}
});
doc.getElementsByTag("span").forEach(elem -> {
if ("pre".equals(elem.parent().tagName())) {
if (elem.hasClass("linenos")) {
elem.replaceWith(new TextNode(String.format("%-4s", elem.text())));
}
else {
elem.replaceWith(new TextNode(elem.text()));
}
}
else {
elem.replaceWith(new TextNode(" " + elem.text() + " "));
}
});
doc.getElementsByTag("details").forEach(deets -> {
if (deets.children().size() == 1) {
deets.replaceWith(deets.children().first());
}
else {
deets.tagName("div");
}
});
removeSingularlyNestedDivs(doc);
removeEmptyTags(doc, "li");
removeEmptyTags(doc, "ul");
removeEmptyTags(doc, "div");
doc.getElementsByTag("p").forEach(elem -> {
if ("blockquote".equals(elem.parent().tagName())) {
elem.replaceWith(new TextNode(elem.text()));
}
});
removeEmptyTags(doc, "p");
cascadingHeaderCleanup(doc, "h4", "h3", "h2");
cascadingHeaderCleanup(doc, "h3", "h2");
cascadingHeaderCleanup(doc, "h2");
doc.getElementsByTag("table").forEach(table -> {
table.attr("border", "1");
if ("right".equals(table.attr("align"))) {
table.remove();
}
});
doc.getAllElements().forEach(elem -> {
removeWikiClassNames(elem);
elem.removeAttr("lang");
elem.removeAttr("dir");
elem.removeAttr("id");
elem.removeAttr("role");
elem.removeAttr("style");
elem.removeAttr("tabindex");
elem.removeAttr("aria-haspopup");
elem.removeAttr("data-section-id");
elem.removeAttr("aria-expanded");
elem.removeAttr("aria-pressed");
elem.removeAttr("open");
elem.removeAttr("data-level");
});
doc.getElementsByTag("table").remove();
// Remove the first header since we'll insert our own in the templating
Optional.ofNullable(doc.getElementsByTag("h1").first()).ifPresent(Element::remove);
ArticleParts articleParts = getDocumentParts(doc);
return new Article(
url,
doc.title(),
articleParts.getSummary(),
articleParts,
new LinkList(topLinks),
new LinkList(disambig)
).asData();
}
private void removeWikiClassNames(Element elem) {
final String classNames = elem.className();
// Note that the class name string isn't split: splitting is fairly
// expensive, and most tags don't have any classes at all. Instead we
// optimistically check for substring presence, and accept paying for an
// unnecessary removeClass call in the rare case of a false positive
if (classNames.contains("verb")) {
elem.removeClass("verb");
}
if (classNames.contains("extern-link")) {
elem.removeClass("extern-link");
}
if (classNames.contains("margin-note")) {
elem.removeClass("margin-note");
}
if (classNames.contains("wikitable")) {
elem.removeClass("wikitable");
}
}
public static ArticleParts getDocumentParts(Document doc) {
// We expect the document to be one container div with a bunch of children
// each corresponding to a section of the document
var rootDiv = doc.getElementsByTag("div").first();
if (null == rootDiv) {
return new ArticleParts(List.of());
}
// To be maximally useful, we want the article as a series of divs corresponding to
// logical sections of the article
List<String> parts = new ArrayList<>();
Element normalizingDiv = null;
for (Element child : rootDiv.children()) {
boolean isDiv = "div".equals(child.tagName());
if (!isDiv && normalizingDiv == null) {
normalizingDiv = new Element("div");
}
if (isDiv && normalizingDiv != null) {
if (normalizingDiv.childrenSize() > 0) {
parts.add(normalizingDiv.outerHtml());
}
normalizingDiv = null;
}
if (normalizingDiv != null) normalizingDiv.appendChild(child.clone());
if (isDiv && child.childrenSize() > 0) parts.add(child.outerHtml());
}
if (normalizingDiv != null &&
normalizingDiv.childrenSize() > 0)
{
parts.add(normalizingDiv.outerHtml());
}
return new ArticleParts(parts);
}
private void removeSingularlyNestedDivs(Document doc) {
// Remove divs that only contain a single div, and replace them with the inner div
for (Element div : doc.getElementsByTag("div")) {
final Elements children = div.children();
if (children.size() != 1)
continue;
final Element childDiv = children.first();
if (null != childDiv && "div".equals(childDiv.tagName())) {
div.replaceWith(childDiv);
}
}
}
private void cascadingHeaderCleanup(Document doc, String currH, String... nextHeaders) {
doc.getElementsByTag(currH).forEach(elem -> {
var next = elem.nextElementSibling();
if (next == null) {
elem.remove();
return;
}
String nextTagName = next.tagName();
if (currH.equals(nextTagName)) {
elem.remove();
}
else for (String h : nextHeaders) {
if (h.equals(nextTagName)) {
elem.remove();
}
}
});
}
private void removeEmptyTags(Document doc, String tag) {
doc.getElementsByTag(tag).forEach(elem -> {
if (elem.text().isBlank() && elem.getElementsByTag("img").isEmpty()) {
elem.replaceWith(new TextNode(" "));
}
});
}
@NotNull
private List<Link> getWikiPageLinks(Document doc) {
List<Link> topLinks = new ArrayList<>();
doc.select("p a").forEach(atag -> {
String href = atag.attr("href");
if (!href.isBlank()
&& !href.contains(":")
&& !href.startsWith("#")
) {
topLinks.add(new Link(href, atag.attr("title")));
}
});
return topLinks;
}
@NotNull
private List<Link> getDisambiguationLinks(Document doc) {
List<Link> disambig = new ArrayList<>();
for (var note: doc.getElementsByClass("hatnote")) {
for (var atag : note.getElementsByTag("a")) {
String href = atag.attr("href");
if (atag.hasClass("mw-disambig") && !href.isBlank()) {
disambig.add(new Link(href, atag.attr("title")));
}
}
note.remove();
}
return disambig;
}
}

View File

@ -0,0 +1,10 @@
package nu.marginalia.encyclopedia.cleaner.model;
public record ArticleData(
String url,
String title,
String summary,
byte[] parts,
byte[] links,
byte[] disambigs) {
}

View File

@ -0,0 +1,44 @@
package nu.marginalia.encyclopedia.cleaner.model;
import org.jsoup.Jsoup;
import java.util.List;
public record ArticleParts(List<String> parts) {
public ArticleParts(String... parts) {
this(List.of(parts));
}
public String articleHtml() {
StringBuilder sb = new StringBuilder();
for (String part : parts()) {
sb.append(part);
}
return sb.toString();
}
public String getSummary() {
if (parts.isEmpty())
return "";
String firstPart = parts.get(0);
var doclet = Jsoup.parse(firstPart);
doclet.getElementsByTag("b").tagName("span");
var firstP = doclet.select("p").first();
if (null == firstP)
return "";
StringBuilder ret = new StringBuilder();
ret.append(firstP.outerHtml());
var nextSibling = firstP.nextElementSibling();
if (nextSibling != null &&
!"p".equals(nextSibling.tagName()) &&
!"table".equals(nextSibling.tagName()))
{
ret.append(" ").append(nextSibling.outerHtml());
}
return ret.toString();
}
}

View File

@ -0,0 +1,35 @@
package nu.marginalia.encyclopedia.model;
import nu.marginalia.encyclopedia.cleaner.model.ArticleData;
import nu.marginalia.encyclopedia.cleaner.model.ArticleParts;
import nu.marginalia.encyclopedia.store.ArticleCodec;
public record Article (
String url,
String title,
String summary,
ArticleParts parts,
LinkList urls,
LinkList disambigs)
{
public ArticleData asData() {
return new ArticleData(
url(),
title(),
summary(),
ArticleCodec.toCompressedJson(parts),
ArticleCodec.toCompressedJson(urls),
ArticleCodec.toCompressedJson(disambigs)
);
}
/** Used by template */
public String articleHtml() {
if (parts == null) {
return "";
}
return parts.articleHtml();
}
}

View File

@ -0,0 +1,3 @@
package nu.marginalia.encyclopedia.model;
public record Link(String url, String text) { }

View File

@ -0,0 +1,13 @@
package nu.marginalia.encyclopedia.model;
import java.util.List;
public record LinkList(List<Link> links) {
public LinkList(Link... links) {
this(List.of(links));
}
public int size() {
return links.size();
}
}

View File

@ -0,0 +1,33 @@
package nu.marginalia.encyclopedia.model;
import org.jetbrains.annotations.NotNull;
import java.util.ArrayList;
import java.util.List;
public record ReferencedArticle(String title,
List<String> aliases,
String url,
String summary) implements Comparable<ReferencedArticle> {
public ReferencedArticle(String title, String url, String summary) {
this(title, List.of(), url, summary);
}
public ReferencedArticle withAliases(List<String> aliases) {
if (aliases != null && aliases.size() > 1) {
var cleanAliases = new ArrayList<>(aliases);
cleanAliases.remove(title());
return new ReferencedArticle(title(), cleanAliases, url(), summary());
}
return this;
}
private String compareKey() {
return url.toLowerCase();
}
@Override
public int compareTo(@NotNull ReferencedArticle referencedArticle) {
return compareKey().compareTo(referencedArticle.compareKey());
}
}

View File

@ -0,0 +1,25 @@
package nu.marginalia.encyclopedia.store;
import com.github.luben.zstd.Zstd;
import com.github.luben.zstd.ZstdInputStream;
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import marcono1234.gson.recordadapter.RecordTypeAdapterFactory;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
public class ArticleCodec {
private static final Gson gson = new GsonBuilder()
.registerTypeAdapterFactory(RecordTypeAdapterFactory.builder().allowMissingComponentValues().create())
.create();
public static byte[] toCompressedJson(Object any) {
return Zstd.compress(gson.toJson(any).getBytes());
}
public static <T> T fromCompressedJson(byte[] stream, Class<T> type) throws IOException {
return gson.fromJson(new InputStreamReader(new ZstdInputStream(new ByteArrayInputStream(stream))), type);
}
}
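A round-trip sketch using the model types from this package (the example values are hypothetical):

    // Compress a LinkList to a zstd-compressed JSON blob and decode it back
    LinkList links = new LinkList(new Link("/wiki/Example", "Example"));
    byte[] blob = ArticleCodec.toCompressedJson(links);
    LinkList decoded = ArticleCodec.fromCompressedJson(blob, LinkList.class); // throws IOException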

View File

@ -0,0 +1,33 @@
package nu.marginalia.encyclopedia.store;
import java.nio.file.Path;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.SQLException;
public class ArticleDbProvider {
private final Connection connection;
public ArticleDbProvider(Path filename) throws SQLException {
String sqliteDbString = "jdbc:sqlite:" + filename.toString();
connection = DriverManager.getConnection(sqliteDbString);
try (var stmt = connection.createStatement()) {
stmt.executeUpdate("""
CREATE TABLE IF NOT EXISTS articles (
url TEXT PRIMARY KEY,
title TEXT NOT NULL,
summary TEXT NOT NULL,
html BLOB NOT NULL,
urls BLOB NOT NULL,
disambigs BLOB NOT NULL
)
""");
}
}
public Connection getConnection() {
return connection;
}
}
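For illustration, a hypothetical read-back against the schema above (this commit only adds the writer; the query below is an assumed example, not part of the codebase):

    try (var stmt = provider.getConnection().prepareStatement(
            "SELECT title, html FROM articles WHERE url = ?")) {
        stmt.setString(1, "/wiki/Example"); // hypothetical url
        var rs = stmt.executeQuery();
        if (rs.next()) {
            // the html column holds the zstd-compressed JSON of ArticleParts
            var parts = ArticleCodec.fromCompressedJson(rs.getBytes("html"), ArticleParts.class);
        }
    }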

View File

@ -0,0 +1,102 @@
package nu.marginalia.encyclopedia.store;
import nu.marginalia.encyclopedia.cleaner.model.ArticleData;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.sql.Connection;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.TimeUnit;
public class ArticleStoreWriter implements AutoCloseable {
private final Logger logger = LoggerFactory.getLogger(getClass());
private final Connection connection;
private final LinkedBlockingQueue<ArticleData> queue = new LinkedBlockingQueue<>(1000);
Thread insertThread;
volatile boolean running;
public ArticleStoreWriter(ArticleDbProvider dbProvider) throws SQLException {
connection = dbProvider.getConnection();
try (var stmt = connection.createStatement()) {
stmt.execute("PRAGMA synchronous = OFF");
stmt.execute("PRAGMA journal_mode = MEMORY");
}
running = true;
insertThread = new Thread(this::insertLoop);
insertThread.start();
}
private void insertLoop() {
List<ArticleData> toAdd = new ArrayList<>();
while (running || !queue.isEmpty()) {
try {
while (0 != queue.drainTo(toAdd, 100)) {
insertItems(toAdd);
toAdd.clear();
}
if (queue.isEmpty()) {
// Yield for a moment to avoid busy looping
TimeUnit.NANOSECONDS.sleep(100);
}
} catch (SQLException e) {
e.printStackTrace();
} catch (InterruptedException e) {
throw new RuntimeException(e);
}
}
}
private void insertItems(List<ArticleData> toAdd) throws SQLException {
try (var stmt = connection.prepareStatement("""
INSERT OR IGNORE INTO articles (url, title, html, summary, urls, disambigs)
VALUES (?, ?, ?, ?, ?, ?)
"""))
{
connection.setAutoCommit(false); // Disable auto-commit mode
for (var article : toAdd) {
stmt.setString(1, article.url());
stmt.setString(2, article.title());
stmt.setBytes(3, article.parts());
stmt.setString(4, article.summary());
stmt.setBytes(5, article.links());
stmt.setBytes(6, article.disambigs());
stmt.addBatch();
}
stmt.executeBatch();
connection.commit(); // Commit the transaction
} catch (SQLException e) {
connection.rollback(); // Rollback the transaction in case of error
logger.warn("SQL error", e);
} finally {
connection.setAutoCommit(true); // Re-enable auto-commit mode
}
}
public void add(ArticleData article) {
try {
queue.put(article);
}
catch (InterruptedException e) {
logger.warn("Interrupted", e);
throw new RuntimeException(e);
}
}
public void close() {
running = false;
try {
insertThread.join();
connection.close();
} catch (InterruptedException|SQLException e) {
logger.warn("Error", e);
}
}
}

View File

@ -221,7 +221,7 @@ public class ZIMReader {
// Gives the minimum required information needed for the given articleName
public DirectoryEntry forEachArticles(BiConsumer<String, String> consumer, Predicate<Integer> blobPred)
throws IOException {
throws IOException, InterruptedException {
int numberOfArticles = mFile.getArticleCount();
long beg = mFile.getTitlePtrPos();
@ -237,6 +237,10 @@ public class ZIMReader {
for (long i = beg; i < end; i+=4) {
var entry = getDirectoryInfoAtTitlePosition(i);
if (Thread.interrupted()) {
throw new InterruptedException();
}
if (((i-beg)%100_000) == 0) {
System.out.printf("%f%%\n", ((i-beg) * 100.) / (end-beg));
}
@ -249,21 +253,25 @@ public class ZIMReader {
System.out.println("Iterating over " + data.keySet().stream().mapToInt(Integer::intValue).max() + "clusters");
data.forEach((pos,blobs) -> {
if (!blobPred.test(pos)) {
return;
}
var iter = data.entrySet().iterator();
while (iter.hasNext()) {
if (Thread.interrupted()) throw new InterruptedException();
var next = iter.next();
int pos = next.getKey();
if (!blobPred.test(pos)) continue;
Map<Integer, String> blobs = next.getValue();
try {
getArticleData(consumer, pos, blobs);
}
catch (Exception ex) {
ex.printStackTrace();
throw new RuntimeException(ex);
}
});
}
return null;
}