(converter) Integrate zim->db conversion into automatic encyclopedia processing workflow
Previously, in order to load encyclopedia data into the search engine, it was necessary to use the encyclopedia.marginalia.nu converter to first create a .db-file. This isn't very ergonomic, so parts of that code-base were lifted in as a 3rd-party library, and conversion from .zim to .db is now done automatically. The output file name is based on the original filename plus a crc32 hash and a .db ending, to ensure we can recycle the data on repeat loads.
This commit is contained in:
parent
22c8fb3f59
commit
27ffb8fa8a
@ -2,6 +2,8 @@ package nu.marginalia.loading;
|
||||
|
||||
import nu.marginalia.io.processed.ProcessedDataFileNames;
|
||||
import nu.marginalia.worklog.BatchingWorkLogInspector;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Path;
|
||||
@ -10,13 +12,21 @@ import java.util.*;
|
||||
|
||||
public class LoaderInputData {
|
||||
private final List<Path> sourceDirectories;
|
||||
private static final Logger logger = LoggerFactory.getLogger(LoaderInputData.class);
|
||||
private final Map<Path, Integer> lastGoodBatch = new HashMap<>();
|
||||
|
||||
/** Records the given source directories and, for each, the number of valid
 *  batches reported by its processor.log work log.
 *
 * @param sourceDirectories directories containing processed data to load
 * @throws IOException if a processor.log cannot be read
 */
public LoaderInputData(List<Path> sourceDirectories) throws IOException {
    this.sourceDirectories = sourceDirectories;

    for (var source : sourceDirectories) {
        // Inspect the work log exactly once per source; the result is both
        // stored and checked, so avoid calling getValidBatches twice
        int lastGoodBatch = BatchingWorkLogInspector.getValidBatches(source.resolve("processor.log"));

        this.lastGoodBatch.put(source, lastGoodBatch);

        if (lastGoodBatch == 0) {
            // This is useful diagnostic information, so we log it as a warning
            logger.warn("No valid batches found in {}", source);
        }
    }
}
|
||||
|
@ -1,11 +1,17 @@
|
||||
<h1 class="my-3">Sideload Encyclopedia</h1>
|
||||
|
||||
<div class="my-3 p-3 border bg-light">
|
||||
<p>This will sideload a pre-converted MediaWiki-style OpenZim data set.
|
||||
See the <a href="https://github.com/MarginaliaSearch/MarginaliaSearch/blob/master/doc/sideloading-howto.md">sideloading howto</a>
|
||||
for instructions how to produce this file. </p>
|
||||
<p>Place an articles.db file in the upload directory on the server, and select it from the list
|
||||
below. </p>
|
||||
<p>This will side-load a MediaWiki-style OpenZim data set. Place a zim file in the uploads directory.
|
||||
For Wikipedia, the zim file can be downloaded from <a href="https://download.kiwix.org/zim/wikipedia/">https://download.kiwix.org/zim/wikipedia/</a>.
|
||||
The en_all_nopic sets are recommended for Wikipedia, since they are smaller and do not contain images
|
||||
(which are not used anyway). For testing, the _mini or _en_100 sets are good choices.
|
||||
<p></p>
|
||||
The zim file will be converted to a sqlite database (.db-file) with a similar name to
|
||||
the zim file, which then automatically is turned into processed data.
|
||||
<p></p>
|
||||
Since the first stage of processing is very time-consuming, the sqlite database can
|
||||
also be loaded from this form.
|
||||
</p>
|
||||
</div>
|
||||
<form method="post" action="actions/sideload-encyclopedia" onsubmit="return confirm('Confirm sideloading')">
|
||||
<div class="my-3 py-3">
|
||||
|
@ -45,6 +45,7 @@ dependencies {
|
||||
implementation project(':code:api:query-api')
|
||||
implementation project(':code:api:process-mqapi')
|
||||
implementation project(':code:api:executor-api')
|
||||
implementation project(':third-party:encyclopedia-marginalia-nu')
|
||||
|
||||
implementation libs.bundles.slf4j
|
||||
|
||||
|
@ -7,6 +7,7 @@ import nu.marginalia.actor.prototype.RecordActorPrototype;
|
||||
import nu.marginalia.actor.state.ActorResumeBehavior;
|
||||
import nu.marginalia.actor.state.ActorStep;
|
||||
import nu.marginalia.actor.state.Resume;
|
||||
import nu.marginalia.encyclopedia.EncyclopediaConverter;
|
||||
import nu.marginalia.process.ProcessOutboxes;
|
||||
import nu.marginalia.process.ProcessService;
|
||||
import nu.marginalia.storage.FileStorageService;
|
||||
@ -16,21 +17,27 @@ import nu.marginalia.storage.model.FileStorageState;
|
||||
import nu.marginalia.storage.model.FileStorageType;
|
||||
import nu.marginalia.mq.MqMessageState;
|
||||
import nu.marginalia.mq.outbox.MqOutbox;
|
||||
import nu.marginalia.mqapi.converting.ConvertAction;
|
||||
import nu.marginalia.mqapi.converting.ConvertRequest;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.zip.CRC32;
|
||||
|
||||
@Singleton
|
||||
public class ConvertActor extends RecordActorPrototype {
|
||||
|
||||
private static final Logger logger = LoggerFactory.getLogger(ConvertActor.class);
|
||||
private final ActorProcessWatcher processWatcher;
|
||||
private final MqOutbox mqConverterOutbox;
|
||||
private final FileStorageService storageService;
|
||||
private final Gson gson;
|
||||
|
||||
public record Convert(FileStorageId fid) implements ActorStep {};
|
||||
public record ConvertEncyclopedia(String source, String baseUrl) implements ActorStep {};
|
||||
public record PredigestEncyclopedia(String source, String dest, String baseUrl) implements ActorStep {};
|
||||
public record ConvertDirtree(String source) implements ActorStep {};
|
||||
public record ConvertWarc(String source) implements ActorStep {};
|
||||
public record ConvertStackexchange(String source) implements ActorStep {};
|
||||
@ -100,6 +107,19 @@ public class ConvertActor extends RecordActorPrototype {
|
||||
if (!Files.exists(sourcePath))
|
||||
yield new Error("Source path does not exist: " + sourcePath);
|
||||
|
||||
if (source.toLowerCase().endsWith(".zim")) {
|
||||
// If we're fed a ZIM file, we need to convert it to a sqlite database first
|
||||
String hash = getCrc32FileHash(sourcePath);
|
||||
|
||||
// To avoid re-converting the same file, we'll assign the file a name based on its hash
|
||||
// and the original filename. This way, if we're fed the same file again, we'll be able to just
|
||||
// re-use the predigested database file.
|
||||
yield new PredigestEncyclopedia(source, STR."\{source}.\{hash}.db", baseUrl);
|
||||
} else if (!source.endsWith(".db")) {
|
||||
yield new Error("Source path must be a ZIM or pre-digested sqlite database file (.db)");
|
||||
}
|
||||
|
||||
|
||||
String fileName = sourcePath.toFile().getName();
|
||||
|
||||
var base = storageService.getStorageBase(FileStorageBaseType.STORAGE);
|
||||
@ -114,6 +134,36 @@ public class ConvertActor extends RecordActorPrototype {
|
||||
mqConverterOutbox.sendAsync(ConvertRequest.forEncyclopedia(sourcePath, baseUrl, processedArea.id()))
|
||||
);
|
||||
}
|
||||
case PredigestEncyclopedia(String source, String dest, String baseUrl) -> {
|
||||
Path sourcePath = Path.of(source);
|
||||
|
||||
if (!Files.exists(sourcePath)) {
|
||||
yield new Error("Source path does not exist: " + sourcePath);
|
||||
}
|
||||
|
||||
Path destPath = Path.of(dest);
|
||||
if (Files.exists(destPath)) {
|
||||
// Already predigested, go straight to convert step
|
||||
yield new ConvertEncyclopedia(dest, baseUrl);
|
||||
}
|
||||
|
||||
Path tempFile = Files.createTempFile(destPath.getParent(), "encyclopedia", "db.tmp");
|
||||
|
||||
try {
|
||||
EncyclopediaConverter.convert(sourcePath, tempFile);
|
||||
Files.move(tempFile, destPath);
|
||||
}
|
||||
catch (Exception e) {
|
||||
logger.error("Failed to convert ZIM file to sqlite database", e);
|
||||
Files.deleteIfExists(tempFile);
|
||||
Files.deleteIfExists(destPath);
|
||||
|
||||
yield new Error("Failed to convert ZIM file to sqlite database: " + e.getMessage());
|
||||
}
|
||||
|
||||
// Go back to convert step with the new database file
|
||||
yield new ConvertEncyclopedia(dest, baseUrl);
|
||||
}
|
||||
case ConvertStackexchange(String source) -> {
|
||||
|
||||
Path sourcePath = Path.of(source);
|
||||
@ -150,6 +200,22 @@ public class ConvertActor extends RecordActorPrototype {
|
||||
};
|
||||
}
|
||||
|
||||
private String getCrc32FileHash(Path file) throws IOException {
|
||||
ByteBuffer buffer = ByteBuffer.allocate(8192);
|
||||
|
||||
try (var channel = Files.newByteChannel(file)) {
|
||||
CRC32 crc = new CRC32();
|
||||
|
||||
while (channel.read(buffer) > 0) {
|
||||
buffer.flip();
|
||||
crc.update(buffer);
|
||||
buffer.clear();
|
||||
}
|
||||
|
||||
return Long.toHexString(crc.getValue());
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public String describe() {
|
||||
return "Convert a set of crawl data into a format suitable for loading into the database.";
|
||||
@ -165,6 +231,5 @@ public class ConvertActor extends RecordActorPrototype {
|
||||
this.processWatcher = processWatcher;
|
||||
this.mqConverterOutbox = processOutboxes.getConverterOutbox();
|
||||
this.storageService = storageService;
|
||||
this.gson = gson;
|
||||
}
|
||||
}
|
||||
|
@ -99,6 +99,7 @@ include 'third-party:monkey-patch-opennlp'
|
||||
include 'third-party:monkey-patch-gson'
|
||||
include 'third-party:commons-codec'
|
||||
include 'third-party:parquet-floor'
|
||||
include 'third-party:encyclopedia-marginalia-nu'
|
||||
|
||||
|
||||
dependencyResolutionManagement {
|
||||
|
4
third-party/README.md
vendored
4
third-party/README.md
vendored
@ -9,9 +9,9 @@ or lack an artifact, or to override some default that is inappropriate for the t
|
||||
* [RDRPosTagger](rdrpostagger/) - GPL3
|
||||
* [PorterStemmer](porterstemmer/) - LGPL3
|
||||
* [Uppend](uppend/) - MIT
|
||||
* [OpenZIM](openzim/) - GPL-2.0
|
||||
* [OpenZIM](openzim/) - GPL-2.0+
|
||||
* [Commons Codec](commons-codec/) - Apache 2.0
|
||||
|
||||
* [encyclopedia.marginalia.nu](encyclopedia-marginalia-nu/) - GPL 2.0+
|
||||
### Repackaged
|
||||
* [SymSpell](symspell/) - LGPL-3.0
|
||||
* [Count-Min-Sketch](count-min-sketch/) - Apache 2.0
|
||||
|
26
third-party/encyclopedia-marginalia-nu/build.gradle
vendored
Normal file
26
third-party/encyclopedia-marginalia-nu/build.gradle
vendored
Normal file
@ -0,0 +1,26 @@
|
||||
plugins {
|
||||
id 'java'
|
||||
}
|
||||
|
||||
java {
|
||||
toolchain {
|
||||
languageVersion.set(JavaLanguageVersion.of(21))
|
||||
}
|
||||
}
|
||||
|
||||
dependencies {
|
||||
implementation libs.jsoup
|
||||
implementation libs.notnull
|
||||
implementation libs.bundles.gson
|
||||
implementation libs.zstd
|
||||
implementation libs.bundles.slf4j
|
||||
|
||||
implementation project(':code:libraries:blocking-thread-pool')
|
||||
|
||||
implementation project(':third-party:xz')
|
||||
implementation project(':third-party:openzim')
|
||||
}
|
||||
|
||||
test {
|
||||
useJUnitPlatform()
|
||||
}
|
5
third-party/encyclopedia-marginalia-nu/readme.md
vendored
Normal file
5
third-party/encyclopedia-marginalia-nu/readme.md
vendored
Normal file
@ -0,0 +1,5 @@
|
||||
This package contains a severely stripped down version of the codebase from
|
||||
[encyclopedia.marginalia.nu](https://encyclopedia.marginalia.nu/).
|
||||
|
||||
The extracted code is a ZimFile reader and WikiHTML cleaner. It is used by the
|
||||
encyclopedia side-loader.
|
@ -0,0 +1,67 @@
|
||||
package nu.marginalia.encyclopedia;
|
||||
|
||||
import nu.marginalia.encyclopedia.cleaner.WikiCleaner;
|
||||
import nu.marginalia.encyclopedia.store.ArticleDbProvider;
|
||||
import nu.marginalia.encyclopedia.store.ArticleStoreWriter;
|
||||
import nu.marginalia.util.SimpleBlockingThreadPool;
|
||||
import org.openzim.ZIMTypes.ZIMFile;
|
||||
import org.openzim.ZIMTypes.ZIMReader;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.slf4j.Logger;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.sql.SQLException;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
import java.util.function.BiConsumer;
|
||||
import java.util.function.Predicate;
|
||||
|
||||
/** Converts an OpenZim file with Wikipedia articles to a SQLite database
|
||||
* with cleaned-up MediaWiki HTML
|
||||
*/
|
||||
public class EncyclopediaConverter {
|
||||
private static final Logger logger = LoggerFactory.getLogger(EncyclopediaConverter.class);
|
||||
|
||||
public static void convert(Path inputFile, Path outputFile) throws IOException, SQLException, InterruptedException {
|
||||
var wc = new WikiCleaner();
|
||||
var pool = new SimpleBlockingThreadPool("Convert ZIM",
|
||||
Math.clamp(Runtime.getRuntime().availableProcessors() - 2, 1, 32),
|
||||
2);
|
||||
var size = new AtomicInteger();
|
||||
|
||||
if (!Files.exists(inputFile)) {
|
||||
throw new IllegalStateException("ZIM file not found: " + inputFile);
|
||||
}
|
||||
Files.deleteIfExists(outputFile);
|
||||
|
||||
try (var asw = new ArticleStoreWriter(new ArticleDbProvider(outputFile))) {
|
||||
Predicate<Integer> keepGoing = (s) -> true;
|
||||
|
||||
BiConsumer<String, String> handleArticle = (url, html) -> {
|
||||
if (pool.isTerminated())
|
||||
return;
|
||||
|
||||
pool.submitQuietly(() -> {
|
||||
int sz = size.incrementAndGet();
|
||||
if (sz % 1000 == 0) {
|
||||
System.out.printf("\u001b[2K\r%d", sz);
|
||||
}
|
||||
asw.add(wc.cleanWikiJunk(url, html));
|
||||
});
|
||||
|
||||
size.incrementAndGet();
|
||||
};
|
||||
|
||||
new ZIMReader(new ZIMFile(inputFile.toString())).forEachArticles(handleArticle, keepGoing);
|
||||
|
||||
pool.shutDown();
|
||||
logger.info("Waiting for pool to finish");
|
||||
|
||||
while (!pool.awaitTermination(1, TimeUnit.SECONDS)) {
|
||||
// ...
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
@ -0,0 +1,60 @@
|
||||
package nu.marginalia.encyclopedia.cleaner;
|
||||
|
||||
import lombok.Builder;
|
||||
import org.jsoup.nodes.Comment;
|
||||
import org.jsoup.nodes.Element;
|
||||
import org.jsoup.nodes.Node;
|
||||
import org.jsoup.select.NodeFilter;
|
||||
|
||||
import java.util.Set;
|
||||
import java.util.function.Predicate;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
@Builder
|
||||
public class CleanerFilter implements NodeFilter {
|
||||
final Set<String> badTags;
|
||||
final Set<String> badIds;
|
||||
final Set<String> badClasses;
|
||||
|
||||
final Set<Predicate<Element>> predicates;
|
||||
|
||||
private static final Pattern spacePattern = Pattern.compile("\\s+");
|
||||
|
||||
@Override
|
||||
public FilterResult head(Node node, int depth) {
|
||||
if (node instanceof Element el) {
|
||||
if (badTags != null && badTags.contains(el.tagName()))
|
||||
return FilterResult.REMOVE;
|
||||
|
||||
if (badIds != null && badIds.contains(el.id()))
|
||||
return FilterResult.REMOVE;
|
||||
|
||||
if (badClasses != null) {
|
||||
String className = el.className();
|
||||
if (className.contains(" ")) {
|
||||
String[] parts = spacePattern.split(className);
|
||||
for (var c : parts) {
|
||||
if (badClasses.contains(c))
|
||||
return FilterResult.REMOVE;
|
||||
}
|
||||
}
|
||||
else if (badClasses.contains(className)) {
|
||||
return FilterResult.REMOVE;
|
||||
}
|
||||
}
|
||||
|
||||
if (predicates != null) {
|
||||
for (var pred : predicates) {
|
||||
if (pred.test(el))
|
||||
return FilterResult.REMOVE;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (node instanceof Comment) {
|
||||
return FilterResult.REMOVE;
|
||||
}
|
||||
|
||||
return FilterResult.CONTINUE;
|
||||
}
|
||||
}
|
@ -0,0 +1,329 @@
|
||||
package nu.marginalia.encyclopedia.cleaner;
|
||||
|
||||
import nu.marginalia.encyclopedia.cleaner.model.ArticleData;
|
||||
import nu.marginalia.encyclopedia.cleaner.model.ArticleParts;
|
||||
import nu.marginalia.encyclopedia.model.Article;
|
||||
import nu.marginalia.encyclopedia.model.Link;
|
||||
import nu.marginalia.encyclopedia.model.LinkList;
|
||||
import org.jetbrains.annotations.NotNull;
|
||||
import org.jsoup.Jsoup;
|
||||
import org.jsoup.nodes.*;
|
||||
import org.jsoup.select.Elements;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Optional;
|
||||
import java.util.Set;
|
||||
|
||||
public class WikiCleaner {
|
||||
|
||||
private static final String licenseFooter = "This article is issued from Wikipedia. The text is licensed under Creative Commons - Attribution - Sharealike. Additional terms may apply for the media files.";
|
||||
public ArticleData cleanWikiJunk(String url, String html) {
|
||||
return cleanWikiJunk(url, Jsoup.parse(html));
|
||||
}
|
||||
|
||||
private boolean isPresentationRole(Element el) {
|
||||
return "presentation".equals(el.attr("role"));
|
||||
}
|
||||
private boolean isLicenseFooter(Element el) {
|
||||
// We'll add our own later
|
||||
if ("div".equals(el.tagName())) {
|
||||
return licenseFooter.equals(el.wholeOwnText().trim());
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
public ArticleData cleanWikiJunk(String url, Document doc) {
|
||||
|
||||
if (doc.getElementById("content") == null) {
|
||||
return null;
|
||||
}
|
||||
|
||||
List<Link> disambig = getDisambiguationLinks(doc);
|
||||
List<Link> topLinks = getWikiPageLinks(doc);
|
||||
|
||||
doc.filter(CleanerFilter.builder()
|
||||
.badClasses(Set.of("infobox", "collapsible", "navbar", "printfooter",
|
||||
"mw-editsection", "thumb", "sidebar", "navbox", "mw-jump-link",
|
||||
"vertical-navbox", "mw-indicators", "noprint", "sistersitebox",
|
||||
"BarChartTemplate"))
|
||||
.badIds(Set.of("coordinates", "mw-page-base", "mw-head-base", "site-notice", "contentSub", "contentSub2"))
|
||||
.badTags(Set.of("footer", "script", "object", "embed", "audio", "style", "nosript", "link", "meta", "img"))
|
||||
.predicates(Set.of(this::isPresentationRole, this::isLicenseFooter))
|
||||
.build());
|
||||
|
||||
doc.getElementsByTag("a").forEach(tag -> {
|
||||
var href = tag.attr("href");
|
||||
var parent = tag.parent();
|
||||
|
||||
if (null != parent && "li".equals(parent.tagName())) {
|
||||
tag.removeAttr("title");
|
||||
|
||||
if (href.startsWith("http://")) {
|
||||
tag.addClass("extern-link");
|
||||
tag.attr("rel", "nofollow");
|
||||
}
|
||||
} else {
|
||||
tag.replaceWith(new TextNode(tag.text()));
|
||||
}
|
||||
});
|
||||
|
||||
doc.getElementsByTag("cite").tagName("span");
|
||||
|
||||
doc.filter(CleanerFilter.builder()
|
||||
.badIds(Set.of("toc", "catlinks", "Notes", "mw-navigation", "mw-data-after-content", "jump-to-nav"))
|
||||
.badClasses(Set.of("mw-references-wrap", "references", "reference", "siteSub", "refbegin"))
|
||||
.build()
|
||||
);
|
||||
|
||||
doc.getAllElements().forEach(elem -> {
|
||||
if (elem.parent() != null
|
||||
&& "summary".equals(elem.parent().tagName()))
|
||||
{
|
||||
elem.parent().replaceWith(elem);
|
||||
}
|
||||
});
|
||||
|
||||
doc.getElementsByClass("mwe-math-element").forEach(mathSpan -> {
|
||||
var mathTag = mathSpan.getElementsByTag("math").first();
|
||||
if (mathTag != null) {
|
||||
mathSpan.replaceWith(mathTag);
|
||||
}
|
||||
});
|
||||
|
||||
doc.getElementsByTag("span").forEach(elem -> {
|
||||
if ("pre".equals(elem.parent().tagName())) {
|
||||
if (elem.hasClass("linenos")) {
|
||||
elem.replaceWith(new TextNode(String.format("%-4s", elem.text())));
|
||||
}
|
||||
else {
|
||||
elem.replaceWith(new TextNode(elem.text()));
|
||||
}
|
||||
}
|
||||
else {
|
||||
elem.replaceWith(new TextNode(" " + elem.text() + " "));
|
||||
}
|
||||
});
|
||||
|
||||
doc.getElementsByTag("details").forEach(deets -> {
|
||||
if (deets.children().size() == 1) {
|
||||
deets.replaceWith(deets.children().first());
|
||||
}
|
||||
else {
|
||||
deets.tagName("div");
|
||||
}
|
||||
});
|
||||
|
||||
removeSingularlyNestedDivs(doc);
|
||||
|
||||
removeEmptyTags(doc, "li");
|
||||
removeEmptyTags(doc, "ul");
|
||||
removeEmptyTags(doc, "div");
|
||||
|
||||
doc.getElementsByTag("p").forEach(elem -> {
|
||||
if ("blockquote".equals(elem.parent().tagName())) {
|
||||
elem.replaceWith(new TextNode(elem.text()));
|
||||
}
|
||||
});
|
||||
|
||||
removeEmptyTags(doc, "p");
|
||||
|
||||
|
||||
|
||||
cascadingHeaderCleanup(doc, "h4", "h3", "h2");
|
||||
cascadingHeaderCleanup(doc, "h3", "h2");
|
||||
cascadingHeaderCleanup(doc, "h2");
|
||||
|
||||
doc.getElementsByTag("table").forEach(table -> {
|
||||
table.attr("border", "1");
|
||||
|
||||
if ("right".equals(table.attr("align"))) {
|
||||
table.remove();
|
||||
}
|
||||
});
|
||||
|
||||
doc.getAllElements().forEach(elem -> {
|
||||
removeWikiClassNames(elem);
|
||||
|
||||
elem.removeAttr("lang");
|
||||
elem.removeAttr("dir");
|
||||
elem.removeAttr("id");
|
||||
elem.removeAttr("role");
|
||||
elem.removeAttr("style");
|
||||
elem.removeAttr("tabindex");
|
||||
elem.removeAttr("aria-haspopup");
|
||||
elem.removeAttr("data-section-id");
|
||||
elem.removeAttr("aria-expanded");
|
||||
elem.removeAttr("aria-pressed");
|
||||
elem.removeAttr("open");
|
||||
elem.removeAttr("data-level");
|
||||
});
|
||||
|
||||
doc.getElementsByTag("table").remove();
|
||||
|
||||
// Remove the first header since we'll insert our own in the templating
|
||||
Optional.ofNullable(doc.getElementsByTag("h1").first()).ifPresent(Element::remove);
|
||||
|
||||
ArticleParts articleParts = getDocumentParts(doc);
|
||||
|
||||
return new Article(
|
||||
url,
|
||||
doc.title(),
|
||||
articleParts.getSummary(),
|
||||
articleParts,
|
||||
new LinkList(topLinks),
|
||||
new LinkList(disambig)
|
||||
).asData();
|
||||
}
|
||||
|
||||
private void removeWikiClassNames(Element elem) {
|
||||
final String classNames = elem.className();
|
||||
|
||||
// Note that the string with class names isn't split,
|
||||
// this is fairly expensive and since most tags don't even
|
||||
// have classes, we'll optimistically check for presence and then
|
||||
// pay for the expensive removeClass operation even if unnecessary
|
||||
// due to a false positive
|
||||
|
||||
if (classNames.contains("verb")) {
|
||||
elem.removeClass("verb");
|
||||
}
|
||||
|
||||
if (classNames.contains("extern-link")) {
|
||||
elem.removeClass("extern-link");
|
||||
}
|
||||
|
||||
if (classNames.contains("margin-note")) {
|
||||
elem.removeClass("margin-note");
|
||||
}
|
||||
|
||||
if (classNames.contains("wikitable")) {
|
||||
elem.removeClass("wikitable");
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/** Splits the cleaned document into per-section HTML fragments.
 *
 * @param doc the cleaned article document; expected to be one container div
 *            whose children correspond to sections
 * @return the sections as an ArticleParts; empty if no container div exists
 */
public static ArticleParts getDocumentParts(Document doc) {

    // We expect the document to be one container div with a bunch of children
    // each corresponding to a section of the document

    var rootDiv = doc.getElementsByTag("div").first();

    if (null == rootDiv) {
        // Nothing recognizable; return an empty part list rather than null
        return new ArticleParts(List.of());
    }

    // To be maximally useful, we want the article as a series of divs corresponding to
    // logical sections of the article

    List<String> parts = new ArrayList<>();

    // Accumulates runs of consecutive non-div children into a synthetic
    // wrapper div, which is flushed whenever a real div is encountered
    Element normalizingDiv = null;
    for (Element child : rootDiv.children()) {
        boolean isDiv = "div".equals(child.tagName());

        // Start a synthetic section when we hit loose (non-div) content
        if (!isDiv && normalizingDiv == null) {
            normalizingDiv = new Element("div");
        }

        // A real div ends any in-progress synthetic section; flush it first
        if (isDiv && normalizingDiv != null) {
            if (normalizingDiv.childrenSize() > 0) {
                parts.add(normalizingDiv.outerHtml());
            }
            normalizingDiv = null;
        }

        // clone() so the child can be placed in the synthetic div without
        // being detached from the source document mid-iteration
        if (normalizingDiv != null) normalizingDiv.appendChild(child.clone());
        if (isDiv && child.childrenSize() > 0) parts.add(child.outerHtml());

    }
    // Flush a trailing synthetic section, if any
    if (normalizingDiv != null &&
            normalizingDiv.childrenSize() > 0)
    {
        parts.add(normalizingDiv.outerHtml());
    }

    return new ArticleParts(parts);
}
|
||||
|
||||
private void removeSingularlyNestedDivs(Document doc) {
|
||||
// Remove divs that only contain a single div, and replace them with the inner div
|
||||
|
||||
for (Element div : doc.getElementsByTag("div")) {
|
||||
final Elements children = div.children();
|
||||
|
||||
if (children.size() != 1)
|
||||
continue;
|
||||
|
||||
final Element childDiv = children.first();
|
||||
|
||||
if (null != childDiv && "div".equals(childDiv.tagName())) {
|
||||
div.replaceWith(childDiv);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void cascadingHeaderCleanup(Document doc, String currH, String... nextHeaders) {
|
||||
doc.getElementsByTag(currH).forEach(elem -> {
|
||||
var next = elem.nextElementSibling();
|
||||
if (next == null) {
|
||||
elem.remove();
|
||||
return;
|
||||
}
|
||||
String nextTagName = next.tagName();
|
||||
if (currH.equals(nextTagName)) {
|
||||
elem.remove();
|
||||
}
|
||||
else for (String h : nextHeaders) {
|
||||
if (h.equals(nextTagName)) {
|
||||
elem.remove();
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
private void removeEmptyTags(Document doc, String tag) {
|
||||
doc.getElementsByTag(tag).forEach(elem -> {
|
||||
if (elem.text().isBlank() && elem.getElementsByTag("img").isEmpty()) {
|
||||
elem.replaceWith(new TextNode(" "));
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
@NotNull
|
||||
private List<Link> getWikiPageLinks(Document doc) {
|
||||
List<Link> topLinks = new ArrayList<>();
|
||||
doc.select("p a").forEach(atag -> {
|
||||
String href = atag.attr("href");
|
||||
|
||||
if (!href.isBlank()
|
||||
&& !href.contains(":")
|
||||
&& !href.startsWith("#")
|
||||
) {
|
||||
topLinks.add(new Link(href, atag.attr("title")));
|
||||
}
|
||||
});
|
||||
return topLinks;
|
||||
}
|
||||
|
||||
|
||||
@NotNull
|
||||
private List<Link> getDisambiguationLinks(Document doc) {
|
||||
List<Link> disambig = new ArrayList<>();
|
||||
|
||||
for (var note: doc.getElementsByClass("hatnote")) {
|
||||
for (var atag : note.getElementsByTag("a")) {
|
||||
String href = atag.attr("href");
|
||||
if (atag.hasClass("mw-disambig") && !href.isBlank()) {
|
||||
disambig.add(new Link(href, atag.attr("title")));
|
||||
}
|
||||
}
|
||||
note.remove();
|
||||
}
|
||||
|
||||
return disambig;
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,10 @@
|
||||
package nu.marginalia.encyclopedia.cleaner.model;
|
||||
|
||||
/** Flat, serialization-ready form of a cleaned article.
 *  The parts, links and disambigs fields hold zstd-compressed JSON
 *  (see ArticleCodec) rather than live objects. */
public record ArticleData(
        String url,
        String title,
        String summary,
        byte[] parts,
        byte[] links,
        byte[] disambigs) {
}
|
@ -0,0 +1,44 @@
|
||||
package nu.marginalia.encyclopedia.cleaner.model;
|
||||
|
||||
import org.jsoup.Jsoup;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
public record ArticleParts(List<String> parts) {
|
||||
public ArticleParts(String... parts) {
|
||||
this(List.of(parts));
|
||||
}
|
||||
public String articleHtml() {
|
||||
StringBuilder sb = new StringBuilder();
|
||||
for (String part : parts()) {
|
||||
sb.append(part);
|
||||
}
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
public String getSummary() {
|
||||
if (parts.isEmpty())
|
||||
return "";
|
||||
|
||||
String firstPart = parts.get(0);
|
||||
var doclet = Jsoup.parse(firstPart);
|
||||
doclet.getElementsByTag("b").tagName("span");
|
||||
var firstP = doclet.select("p").first();
|
||||
|
||||
if (null == firstP)
|
||||
return "";
|
||||
|
||||
StringBuilder ret = new StringBuilder();
|
||||
ret.append(firstP.outerHtml());
|
||||
|
||||
var nextSibling = firstP.nextElementSibling();
|
||||
|
||||
if (nextSibling != null &&
|
||||
!"p".equals(nextSibling.tagName()) &&
|
||||
!"table".equals(nextSibling.tagName()))
|
||||
{
|
||||
ret.append(" ").append(nextSibling.outerHtml());
|
||||
}
|
||||
return ret.toString();
|
||||
}
|
||||
}
|
@ -0,0 +1,35 @@
|
||||
package nu.marginalia.encyclopedia.model;
|
||||
|
||||
import nu.marginalia.encyclopedia.cleaner.model.ArticleData;
|
||||
import nu.marginalia.encyclopedia.cleaner.model.ArticleParts;
|
||||
import nu.marginalia.encyclopedia.store.ArticleCodec;
|
||||
|
||||
public record Article (
|
||||
String url,
|
||||
String title,
|
||||
String summary,
|
||||
ArticleParts parts,
|
||||
LinkList urls,
|
||||
LinkList disambigs)
|
||||
{
|
||||
|
||||
public ArticleData asData() {
|
||||
return new ArticleData(
|
||||
url(),
|
||||
title(),
|
||||
summary(),
|
||||
ArticleCodec.toCompressedJson(parts),
|
||||
ArticleCodec.toCompressedJson(urls),
|
||||
ArticleCodec.toCompressedJson(disambigs)
|
||||
);
|
||||
}
|
||||
|
||||
/** Used by template */
|
||||
public String articleHtml() {
|
||||
if (parts == null) {
|
||||
return "";
|
||||
}
|
||||
|
||||
return parts.articleHtml();
|
||||
}
|
||||
}
|
@ -0,0 +1,3 @@
|
||||
package nu.marginalia.encyclopedia.model;
|
||||
|
||||
/** A hyperlink, pairing a target URL with its anchor/title text. */
public record Link(String url, String text) { }
|
@ -0,0 +1,13 @@
|
||||
package nu.marginalia.encyclopedia.model;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
/** A list of article links, e.g. outbound page links or disambiguation links. */
public record LinkList(List<Link> links) {
    /** Convenience varargs constructor; produces an immutable list. */
    public LinkList(Link... links) {
        this(List.of(links));
    }

    /** Number of links in the list. */
    public int size() {
        return links.size();
    }
}
|
@ -0,0 +1,33 @@
|
||||
package nu.marginalia.encyclopedia.model;
|
||||
|
||||
import org.jetbrains.annotations.NotNull;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
/** An article referenced from another article, with enough context to render
 *  a link and a preview. Ordered case-insensitively by URL. */
public record ReferencedArticle(String title,
                                List<String> aliases,
                                String url,
                                String summary) implements Comparable<ReferencedArticle> {
    /** Convenience constructor for an article without aliases. */
    public ReferencedArticle(String title, String url, String summary) {
        this(title, List.of(), url, summary);
    }

    /** Returns a copy whose alias list excludes the article's own title.
     *  NOTE(review): the filtering only runs when there are at least two
     *  aliases; a single alias equal to the title is kept unchanged --
     *  confirm whether that is intended. */
    public ReferencedArticle withAliases(List<String> aliases) {
        if (aliases != null && aliases.size() > 1) {
            var cleanAliases = new ArrayList<>(aliases);
            cleanAliases.remove(title());
            return new ReferencedArticle(title(), cleanAliases, url(), summary());
        }

        return this;
    }

    // Ordering key: URLs compared case-insensitively.
    // NOTE(review): this ordering is not consistent with the record's equals
    // (which also compares title/aliases/summary); take care when using this
    // type in sorted sets or maps.
    private String compareKey() {
        return url.toLowerCase();
    }
    @Override
    public int compareTo(@NotNull ReferencedArticle referencedArticle) {
        return compareKey().compareTo(referencedArticle.compareKey());
    }
}
|
@ -0,0 +1,25 @@
|
||||
package nu.marginalia.encyclopedia.store;
|
||||
|
||||
import com.github.luben.zstd.Zstd;
|
||||
import com.github.luben.zstd.ZstdInputStream;
|
||||
import com.google.gson.Gson;
|
||||
import com.google.gson.GsonBuilder;
|
||||
import marcono1234.gson.recordadapter.RecordTypeAdapterFactory;
|
||||
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStreamReader;
|
||||
|
||||
public class ArticleCodec {
|
||||
private static final Gson gson = new GsonBuilder()
|
||||
.registerTypeAdapterFactory(RecordTypeAdapterFactory.builder().allowMissingComponentValues().create())
|
||||
.create();
|
||||
|
||||
public static byte[] toCompressedJson(Object any) {
|
||||
return Zstd.compress(gson.toJson(any).getBytes());
|
||||
}
|
||||
public static <T> T fromCompressedJson(byte[] stream, Class<T> type) throws IOException {
|
||||
return gson.fromJson(new InputStreamReader(new ZstdInputStream(new ByteArrayInputStream(stream))), type);
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,33 @@
|
||||
package nu.marginalia.encyclopedia.store;
|
||||
|
||||
import java.nio.file.Path;
|
||||
import java.sql.Connection;
|
||||
import java.sql.DriverManager;
|
||||
import java.sql.SQLException;
|
||||
|
||||
/** Opens (creating if necessary) the sqlite database that stores converted
 *  encyclopedia articles, and ensures the articles table exists. */
public class ArticleDbProvider {
    private final Connection connection;

    /**
     * @param filename path of the sqlite database file to open or create
     * @throws SQLException if the database cannot be opened or the schema created
     */
    public ArticleDbProvider(Path filename) throws SQLException {
        String sqliteDbString = "jdbc:sqlite:" + filename.toString();
        connection = DriverManager.getConnection(sqliteDbString);

        try (var stmt = connection.createStatement()) {
            // Blob columns hold zstd-compressed JSON; see ArticleCodec
            stmt.executeUpdate("""
                    CREATE TABLE IF NOT EXISTS articles (
                        url TEXT PRIMARY KEY,
                        title TEXT NOT NULL,
                        summary TEXT NOT NULL,
                        html BLOB NOT NULL,
                        urls BLOB NOT NULL,
                        disambigs BLOB NOT NULL
                        )
                    """);

        }
    }

    /** Exposes the shared connection; it is owned and kept open by this provider. */
    public Connection getConnection() {
        return connection;
    }
}
|
@ -0,0 +1,102 @@
|
||||
package nu.marginalia.encyclopedia.store;

import nu.marginalia.encyclopedia.cleaner.model.ArticleData;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.sql.Connection;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.TimeUnit;

/** Asynchronously batches {@link ArticleData} inserts into the articles table.
 * <p>
 * Producers call {@link #add(ArticleData)}, which enqueues onto a bounded
 * queue (providing back-pressure); a dedicated thread drains the queue and
 * writes batches inside explicit transactions.  Not safe to use after
 * {@link #close()}.
 */
public class ArticleStoreWriter implements AutoCloseable {
    private final Logger logger = LoggerFactory.getLogger(getClass());
    private final Connection connection;

    // Bounded so a slow database applies back-pressure to the producer
    private final LinkedBlockingQueue<ArticleData> queue = new LinkedBlockingQueue<>(1000);

    Thread insertThread;
    volatile boolean running;

    public ArticleStoreWriter(ArticleDbProvider dbProvider) throws SQLException {
        connection = dbProvider.getConnection();

        // Trade durability for insert speed; acceptable since the database
        // can be regenerated from the source data if a crash corrupts it
        try (var stmt = connection.createStatement()) {
            stmt.execute("PRAGMA synchronous = OFF");
            stmt.execute("PRAGMA journal_mode = MEMORY");
        }

        running = true;
        insertThread = new Thread(this::insertLoop);
        insertThread.start();
    }

    /** Drains the queue in batches of up to 100 until close() is requested
     * and the queue is empty. */
    private void insertLoop() {
        List<ArticleData> toAdd = new ArrayList<>();
        while (running || !queue.isEmpty()) {
            try {
                while (0 != queue.drainTo(toAdd, 100)) {
                    insertItems(toAdd);
                    toAdd.clear();
                }
                if (queue.isEmpty()) {
                    // Yield for a moment to avoid busy looping
                    TimeUnit.NANOSECONDS.sleep(100);
                }
            } catch (SQLException e) {
                // Log through the class logger (was printStackTrace(), which
                // bypasses log routing); keep draining subsequent batches
                logger.warn("SQL error in insert loop", e);
            } catch (InterruptedException e) {
                // Restore interrupt status and stop; throwing would just kill
                // the thread with an uncaught exception anyway
                Thread.currentThread().interrupt();
                break;
            }
        }
    }

    /** Writes one batch inside a single transaction; rolls back on error. */
    private void insertItems(List<ArticleData> toAdd) throws SQLException {
        try (var stmt = connection.prepareStatement("""
                INSERT OR IGNORE INTO articles (url, title, html, summary, urls, disambigs)
                VALUES (?, ?, ?, ?, ?, ?)
                """))
        {
            connection.setAutoCommit(false); // Disable auto-commit mode
            for (var article : toAdd) {
                stmt.setString(1, article.url());
                stmt.setString(2, article.title());
                stmt.setBytes(3, article.parts());
                stmt.setString(4, article.summary());
                stmt.setBytes(5, article.links());
                stmt.setBytes(6, article.disambigs());

                stmt.addBatch();
            }
            stmt.executeBatch();
            connection.commit(); // Commit the transaction
        } catch (SQLException e) {
            connection.rollback(); // Rollback the transaction in case of error
            logger.warn("SQL error", e);
        } finally {
            connection.setAutoCommit(true); // Re-enable auto-commit mode
        }
    }

    /** Enqueues an article for insertion, blocking if the queue is full.
     *
     * @throws RuntimeException if the calling thread is interrupted while
     *                          waiting for queue capacity
     */
    public void add(ArticleData article) {
        try {
            queue.put(article);
        }
        catch (InterruptedException e) {
            Thread.currentThread().interrupt(); // preserve interrupt status
            logger.warn("Interrupted", e);
            throw new RuntimeException(e);
        }
    }

    /** Signals the writer thread to finish draining the queue, then waits
     * for it and closes the connection. */
    public void close() {
        running = false;
        try {
            insertThread.join();
            connection.close();
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt(); // preserve interrupt status
            logger.warn("Interrupted while closing", e);
        } catch (SQLException e) {
            logger.warn("Error closing connection", e);
        }
    }

}
|
@ -221,7 +221,7 @@ public class ZIMReader {
|
||||
|
||||
// Gives the minimum required information needed for the given articleName
|
||||
public DirectoryEntry forEachArticles(BiConsumer<String, String> consumer, Predicate<Integer> blobPred)
|
||||
throws IOException {
|
||||
throws IOException, InterruptedException {
|
||||
|
||||
int numberOfArticles = mFile.getArticleCount();
|
||||
long beg = mFile.getTitlePtrPos();
|
||||
@ -237,6 +237,10 @@ public class ZIMReader {
|
||||
for (long i = beg; i < end; i+=4) {
|
||||
var entry = getDirectoryInfoAtTitlePosition(i);
|
||||
|
||||
if (Thread.interrupted()) {
|
||||
throw new InterruptedException();
|
||||
}
|
||||
|
||||
if (((i-beg)%100_000) == 0) {
|
||||
System.out.printf("%f%%\n", ((i-beg) * 100.) / (end-beg));
|
||||
}
|
||||
@ -249,21 +253,25 @@ public class ZIMReader {
|
||||
|
||||
System.out.println("Iterating over " + data.keySet().stream().mapToInt(Integer::intValue).max() + "clusters");
|
||||
|
||||
data.forEach((pos,blobs) -> {
|
||||
if (!blobPred.test(pos)) {
|
||||
return;
|
||||
}
|
||||
var iter = data.entrySet().iterator();
|
||||
while (iter.hasNext()) {
|
||||
if (Thread.interrupted()) throw new InterruptedException();
|
||||
|
||||
var next = iter.next();
|
||||
int pos = next.getKey();
|
||||
|
||||
if (!blobPred.test(pos)) continue;
|
||||
Map<Integer, String> blobs = next.getValue();
|
||||
|
||||
try {
|
||||
getArticleData(consumer, pos, blobs);
|
||||
}
|
||||
catch (Exception ex) {
|
||||
ex.printStackTrace();
|
||||
throw new RuntimeException(ex);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
return null;
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user