From 27ffb8fa8a81f4d740027d3dfd0c10e5b8ee2fdd Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Fri, 19 Jan 2024 13:59:03 +0100 Subject: [PATCH] (converter) Integrate zim->db conversion into automatic encyclopedia processing workflow Previously, in order to load encyclopedia data into the search engine, it was necessary to use the encyclopedia.marginalia.nu converter to first create a .db-file. This isn't very ergonomic, so parts of that code-base was lifted in as a 3rd party library, and conversion from .zim to .db is now done automatically. The output file name is based on the original filename, plus a crc32 hash and a .db-ending, to ensure we can recycle the data on repeat loads. --- .../marginalia/loading/LoaderInputData.java | 12 +- .../actions/partial-sideload-encyclopedia.hdb | 16 +- .../executor-service/build.gradle | 1 + .../marginalia/actor/task/ConvertActor.java | 71 +++- settings.gradle | 1 + third-party/README.md | 4 +- .../encyclopedia-marginalia-nu/build.gradle | 26 ++ .../encyclopedia-marginalia-nu/readme.md | 5 + .../encyclopedia/EncyclopediaConverter.java | 67 ++++ .../encyclopedia/cleaner/CleanerFilter.java | 60 ++++ .../encyclopedia/cleaner/WikiCleaner.java | 329 ++++++++++++++++++ .../cleaner/model/ArticleData.java | 10 + .../cleaner/model/ArticleParts.java | 44 +++ .../encyclopedia/model/Article.java | 35 ++ .../marginalia/encyclopedia/model/Link.java | 3 + .../encyclopedia/model/LinkList.java | 13 + .../encyclopedia/model/ReferencedArticle.java | 33 ++ .../encyclopedia/store/ArticleCodec.java | 25 ++ .../encyclopedia/store/ArticleDbProvider.java | 33 ++ .../store/ArticleStoreWriter.java | 102 ++++++ .../java/org/openzim/ZIMTypes/ZIMReader.java | 24 +- 21 files changed, 895 insertions(+), 19 deletions(-) create mode 100644 third-party/encyclopedia-marginalia-nu/build.gradle create mode 100644 third-party/encyclopedia-marginalia-nu/readme.md create mode 100644 
third-party/encyclopedia-marginalia-nu/src/main/java/nu/marginalia/encyclopedia/EncyclopediaConverter.java create mode 100644 third-party/encyclopedia-marginalia-nu/src/main/java/nu/marginalia/encyclopedia/cleaner/CleanerFilter.java create mode 100644 third-party/encyclopedia-marginalia-nu/src/main/java/nu/marginalia/encyclopedia/cleaner/WikiCleaner.java create mode 100644 third-party/encyclopedia-marginalia-nu/src/main/java/nu/marginalia/encyclopedia/cleaner/model/ArticleData.java create mode 100644 third-party/encyclopedia-marginalia-nu/src/main/java/nu/marginalia/encyclopedia/cleaner/model/ArticleParts.java create mode 100644 third-party/encyclopedia-marginalia-nu/src/main/java/nu/marginalia/encyclopedia/model/Article.java create mode 100644 third-party/encyclopedia-marginalia-nu/src/main/java/nu/marginalia/encyclopedia/model/Link.java create mode 100644 third-party/encyclopedia-marginalia-nu/src/main/java/nu/marginalia/encyclopedia/model/LinkList.java create mode 100644 third-party/encyclopedia-marginalia-nu/src/main/java/nu/marginalia/encyclopedia/model/ReferencedArticle.java create mode 100644 third-party/encyclopedia-marginalia-nu/src/main/java/nu/marginalia/encyclopedia/store/ArticleCodec.java create mode 100644 third-party/encyclopedia-marginalia-nu/src/main/java/nu/marginalia/encyclopedia/store/ArticleDbProvider.java create mode 100644 third-party/encyclopedia-marginalia-nu/src/main/java/nu/marginalia/encyclopedia/store/ArticleStoreWriter.java diff --git a/code/processes/loading-process/src/main/java/nu/marginalia/loading/LoaderInputData.java b/code/processes/loading-process/src/main/java/nu/marginalia/loading/LoaderInputData.java index cc54ea29..21f878f0 100644 --- a/code/processes/loading-process/src/main/java/nu/marginalia/loading/LoaderInputData.java +++ b/code/processes/loading-process/src/main/java/nu/marginalia/loading/LoaderInputData.java @@ -2,6 +2,8 @@ package nu.marginalia.loading; import nu.marginalia.io.processed.ProcessedDataFileNames; 
import nu.marginalia.worklog.BatchingWorkLogInspector; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import java.io.IOException; import java.nio.file.Path; @@ -10,13 +12,21 @@ import java.util.*; public class LoaderInputData { private final List sourceDirectories; + private static final Logger logger = LoggerFactory.getLogger(LoaderInputData.class); private final Map lastGoodBatch = new HashMap<>(); public LoaderInputData(List sourceDirectories) throws IOException { this.sourceDirectories = sourceDirectories; for (var source : sourceDirectories) { - lastGoodBatch.put(source, BatchingWorkLogInspector.getValidBatches(source.resolve("processor.log"))); + int lastGoodBatch = BatchingWorkLogInspector.getValidBatches(source.resolve("processor.log")); + + this.lastGoodBatch.put(source, lastGoodBatch); + + if (lastGoodBatch == 0) { + // This is useful diagnostic information, so we log it as a warning + logger.warn("No valid batches found in {}", source); + } } } diff --git a/code/services-core/control-service/src/main/resources/templates/control/node/actions/partial-sideload-encyclopedia.hdb b/code/services-core/control-service/src/main/resources/templates/control/node/actions/partial-sideload-encyclopedia.hdb index 7e84676e..62bbce9a 100644 --- a/code/services-core/control-service/src/main/resources/templates/control/node/actions/partial-sideload-encyclopedia.hdb +++ b/code/services-core/control-service/src/main/resources/templates/control/node/actions/partial-sideload-encyclopedia.hdb @@ -1,11 +1,17 @@

Sideload Encyclopedia

-

This will sideload a pre-converted MediaWiki-style OpenZim data set. - See the sideloading howto - for instructions how to produce this file.

-

Place an articles.db file in the upload directory on the server, and select it from the list - below.

+

This will side-load a MediaWiki-style OpenZim data set. Place a zim file in the uploads directory. + For Wikipedia, the zim file can be downloaded from https://download.kiwix.org/zim/wikipedia/. + The en_all_nopic sets are recommended for Wikipedia, since they are smaller and do not contain images + (which are not used anyway). For testing, the _mini or _en_100 sets are good choices. +

+ The zim file will be converted to a sqlite database (.db-file) with a similar name to + the zim file, which is then automatically turned into processed data. +

+ Since the first stage of processing is very time-consuming, the sqlite database can + also be loaded from this form. +

diff --git a/code/services-core/executor-service/build.gradle b/code/services-core/executor-service/build.gradle index be550b7f..adaab6fb 100644 --- a/code/services-core/executor-service/build.gradle +++ b/code/services-core/executor-service/build.gradle @@ -45,6 +45,7 @@ dependencies { implementation project(':code:api:query-api') implementation project(':code:api:process-mqapi') implementation project(':code:api:executor-api') + implementation project(':third-party:encyclopedia-marginalia-nu') implementation libs.bundles.slf4j diff --git a/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/ConvertActor.java b/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/ConvertActor.java index 4af4852e..40b547a5 100644 --- a/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/ConvertActor.java +++ b/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/ConvertActor.java @@ -7,6 +7,7 @@ import nu.marginalia.actor.prototype.RecordActorPrototype; import nu.marginalia.actor.state.ActorResumeBehavior; import nu.marginalia.actor.state.ActorStep; import nu.marginalia.actor.state.Resume; +import nu.marginalia.encyclopedia.EncyclopediaConverter; import nu.marginalia.process.ProcessOutboxes; import nu.marginalia.process.ProcessService; import nu.marginalia.storage.FileStorageService; @@ -16,21 +17,27 @@ import nu.marginalia.storage.model.FileStorageState; import nu.marginalia.storage.model.FileStorageType; import nu.marginalia.mq.MqMessageState; import nu.marginalia.mq.outbox.MqOutbox; -import nu.marginalia.mqapi.converting.ConvertAction; import nu.marginalia.mqapi.converting.ConvertRequest; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import java.io.IOException; +import java.nio.ByteBuffer; import java.nio.file.Files; import java.nio.file.Path; +import java.util.zip.CRC32; @Singleton public class ConvertActor extends RecordActorPrototype { + + private static final Logger logger 
= LoggerFactory.getLogger(ConvertActor.class); private final ActorProcessWatcher processWatcher; private final MqOutbox mqConverterOutbox; private final FileStorageService storageService; - private final Gson gson; public record Convert(FileStorageId fid) implements ActorStep {}; public record ConvertEncyclopedia(String source, String baseUrl) implements ActorStep {}; + public record PredigestEncyclopedia(String source, String dest, String baseUrl) implements ActorStep {}; public record ConvertDirtree(String source) implements ActorStep {}; public record ConvertWarc(String source) implements ActorStep {}; public record ConvertStackexchange(String source) implements ActorStep {}; @@ -100,6 +107,19 @@ public class ConvertActor extends RecordActorPrototype { if (!Files.exists(sourcePath)) yield new Error("Source path does not exist: " + sourcePath); + if (source.toLowerCase().endsWith(".zim")) { + // If we're fed a ZIM file, we need to convert it to a sqlite database first + String hash = getCrc32FileHash(sourcePath); + + // To avoid re-converting the same file, we'll assign the file a name based on its hash + // and the original filename. This way, if we're fed the same file again, we'll be able to just + // re-use the predigested database file. 
+ yield new PredigestEncyclopedia(source, STR."\{source}.\{hash}.db", baseUrl); + } else if (!source.endsWith(".db")) { + yield new Error("Source path must be a ZIM or pre-digested sqlite database file (.db)"); + } + + String fileName = sourcePath.toFile().getName(); var base = storageService.getStorageBase(FileStorageBaseType.STORAGE); @@ -114,6 +134,36 @@ public class ConvertActor extends RecordActorPrototype { mqConverterOutbox.sendAsync(ConvertRequest.forEncyclopedia(sourcePath, baseUrl, processedArea.id())) ); } + case PredigestEncyclopedia(String source, String dest, String baseUrl) -> { + Path sourcePath = Path.of(source); + + if (!Files.exists(sourcePath)) { + yield new Error("Source path does not exist: " + sourcePath); + } + + Path destPath = Path.of(dest); + if (Files.exists(destPath)) { + // Already predigested, go straight to convert step + yield new ConvertEncyclopedia(dest, baseUrl); + } + + Path tempFile = Files.createTempFile(destPath.getParent(), "encyclopedia", "db.tmp"); + + try { + EncyclopediaConverter.convert(sourcePath, tempFile); + Files.move(tempFile, destPath); + } + catch (Exception e) { + logger.error("Failed to convert ZIM file to sqlite database", e); + Files.deleteIfExists(tempFile); + Files.deleteIfExists(destPath); + + yield new Error("Failed to convert ZIM file to sqlite database: " + e.getMessage()); + } + + // Go back to convert step with the new database file + yield new ConvertEncyclopedia(dest, baseUrl); + } case ConvertStackexchange(String source) -> { Path sourcePath = Path.of(source); @@ -150,6 +200,22 @@ public class ConvertActor extends RecordActorPrototype { }; } + private String getCrc32FileHash(Path file) throws IOException { + ByteBuffer buffer = ByteBuffer.allocate(8192); + + try (var channel = Files.newByteChannel(file)) { + CRC32 crc = new CRC32(); + + while (channel.read(buffer) > 0) { + buffer.flip(); + crc.update(buffer); + buffer.clear(); + } + + return Long.toHexString(crc.getValue()); + } + } + @Override 
public String describe() { return "Convert a set of crawl data into a format suitable for loading into the database."; @@ -165,6 +231,5 @@ public class ConvertActor extends RecordActorPrototype { this.processWatcher = processWatcher; this.mqConverterOutbox = processOutboxes.getConverterOutbox(); this.storageService = storageService; - this.gson = gson; } } diff --git a/settings.gradle b/settings.gradle index 779ab288..68d593b6 100644 --- a/settings.gradle +++ b/settings.gradle @@ -99,6 +99,7 @@ include 'third-party:monkey-patch-opennlp' include 'third-party:monkey-patch-gson' include 'third-party:commons-codec' include 'third-party:parquet-floor' +include 'third-party:encyclopedia-marginalia-nu' dependencyResolutionManagement { diff --git a/third-party/README.md b/third-party/README.md index d6b8a834..8949f377 100644 --- a/third-party/README.md +++ b/third-party/README.md @@ -9,9 +9,9 @@ or lack an artifact, or to override some default that is inappropriate for the t * [RDRPosTagger](rdrpostagger/) - GPL3 * [PorterStemmer](porterstemmer/) - LGPL3 * [Uppend](uppend/) - MIT -* [OpenZIM](openzim/) - GPL-2.0 +* [OpenZIM](openzim/) - GPL-2.0+ * [Commons Codec](commons-codec/) - Apache 2.0 - +* [encylopedia.marginalia.nu](encyclopedia-marginalia-nu/) - GPL 2.0+ ### Repackaged * [SymSpell](symspell/) - LGPL-3.0 * [Count-Min-Sketch](count-min-sketch/) - Apache 2.0 diff --git a/third-party/encyclopedia-marginalia-nu/build.gradle b/third-party/encyclopedia-marginalia-nu/build.gradle new file mode 100644 index 00000000..443b599d --- /dev/null +++ b/third-party/encyclopedia-marginalia-nu/build.gradle @@ -0,0 +1,26 @@ +plugins { + id 'java' +} + +java { + toolchain { + languageVersion.set(JavaLanguageVersion.of(21)) + } +} + +dependencies { + implementation libs.jsoup + implementation libs.notnull + implementation libs.bundles.gson + implementation libs.zstd + implementation libs.bundles.slf4j + + implementation project(':code:libraries:blocking-thread-pool') + + implementation 
project(':third-party:xz') + implementation project(':third-party:openzim') +} + +test { + useJUnitPlatform() +} diff --git a/third-party/encyclopedia-marginalia-nu/readme.md b/third-party/encyclopedia-marginalia-nu/readme.md new file mode 100644 index 00000000..5a6b38e0 --- /dev/null +++ b/third-party/encyclopedia-marginalia-nu/readme.md @@ -0,0 +1,5 @@ +This package contains a severely stripped down version of the codebase from +[encyclopedia.marginalia.nu](https://encyclopedia.marginalia.nu/). + +The extracted code is a ZimFile reader and WikiHTML cleaner. It is used by the +encyclopedia side-loader. \ No newline at end of file diff --git a/third-party/encyclopedia-marginalia-nu/src/main/java/nu/marginalia/encyclopedia/EncyclopediaConverter.java b/third-party/encyclopedia-marginalia-nu/src/main/java/nu/marginalia/encyclopedia/EncyclopediaConverter.java new file mode 100644 index 00000000..1920f536 --- /dev/null +++ b/third-party/encyclopedia-marginalia-nu/src/main/java/nu/marginalia/encyclopedia/EncyclopediaConverter.java @@ -0,0 +1,67 @@ +package nu.marginalia.encyclopedia; + +import nu.marginalia.encyclopedia.cleaner.WikiCleaner; +import nu.marginalia.encyclopedia.store.ArticleDbProvider; +import nu.marginalia.encyclopedia.store.ArticleStoreWriter; +import nu.marginalia.util.SimpleBlockingThreadPool; +import org.openzim.ZIMTypes.ZIMFile; +import org.openzim.ZIMTypes.ZIMReader; +import org.slf4j.LoggerFactory; +import org.slf4j.Logger; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.sql.SQLException; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.function.BiConsumer; +import java.util.function.Predicate; + +/** Converts an OpenZim file with Wikipedia articles to a SQLite database + * with cleaned-up MediaWiki HTML + */ +public class EncyclopediaConverter { + private static final Logger logger = LoggerFactory.getLogger(EncyclopediaConverter.class); + 
+ public static void convert(Path inputFile, Path outputFile) throws IOException, SQLException, InterruptedException { + var wc = new WikiCleaner(); + var pool = new SimpleBlockingThreadPool("Convert ZIM", + Math.clamp(Runtime.getRuntime().availableProcessors() - 2, 1, 32), + 2); + var size = new AtomicInteger(); + + if (!Files.exists(inputFile)) { + throw new IllegalStateException("ZIM file not found: " + inputFile); + } + Files.deleteIfExists(outputFile); + + try (var asw = new ArticleStoreWriter(new ArticleDbProvider(outputFile))) { + Predicate keepGoing = (s) -> true; + + BiConsumer handleArticle = (url, html) -> { + if (pool.isTerminated()) + return; + + pool.submitQuietly(() -> { + int sz = size.incrementAndGet(); + if (sz % 1000 == 0) { + System.out.printf("\u001b[2K\r%d", sz); + } + asw.add(wc.cleanWikiJunk(url, html)); + }); + + size.incrementAndGet(); + }; + + new ZIMReader(new ZIMFile(inputFile.toString())).forEachArticles(handleArticle, keepGoing); + + pool.shutDown(); + logger.info("Waiting for pool to finish"); + + while (!pool.awaitTermination(1, TimeUnit.SECONDS)) { + // ... 
+ } + } + } +} diff --git a/third-party/encyclopedia-marginalia-nu/src/main/java/nu/marginalia/encyclopedia/cleaner/CleanerFilter.java b/third-party/encyclopedia-marginalia-nu/src/main/java/nu/marginalia/encyclopedia/cleaner/CleanerFilter.java new file mode 100644 index 00000000..d0248fab --- /dev/null +++ b/third-party/encyclopedia-marginalia-nu/src/main/java/nu/marginalia/encyclopedia/cleaner/CleanerFilter.java @@ -0,0 +1,60 @@ +package nu.marginalia.encyclopedia.cleaner; + +import lombok.Builder; +import org.jsoup.nodes.Comment; +import org.jsoup.nodes.Element; +import org.jsoup.nodes.Node; +import org.jsoup.select.NodeFilter; + +import java.util.Set; +import java.util.function.Predicate; +import java.util.regex.Pattern; + +@Builder +public class CleanerFilter implements NodeFilter { + final Set badTags; + final Set badIds; + final Set badClasses; + + final Set> predicates; + + private static final Pattern spacePattern = Pattern.compile("\\s+"); + + @Override + public FilterResult head(Node node, int depth) { + if (node instanceof Element el) { + if (badTags != null && badTags.contains(el.tagName())) + return FilterResult.REMOVE; + + if (badIds != null && badIds.contains(el.id())) + return FilterResult.REMOVE; + + if (badClasses != null) { + String className = el.className(); + if (className.contains(" ")) { + String[] parts = spacePattern.split(className); + for (var c : parts) { + if (badClasses.contains(c)) + return FilterResult.REMOVE; + } + } + else if (badClasses.contains(className)) { + return FilterResult.REMOVE; + } + } + + if (predicates != null) { + for (var pred : predicates) { + if (pred.test(el)) + return FilterResult.REMOVE; + } + } + } + + if (node instanceof Comment) { + return FilterResult.REMOVE; + } + + return FilterResult.CONTINUE; + } +} diff --git a/third-party/encyclopedia-marginalia-nu/src/main/java/nu/marginalia/encyclopedia/cleaner/WikiCleaner.java 
b/third-party/encyclopedia-marginalia-nu/src/main/java/nu/marginalia/encyclopedia/cleaner/WikiCleaner.java new file mode 100644 index 00000000..42805a5f --- /dev/null +++ b/third-party/encyclopedia-marginalia-nu/src/main/java/nu/marginalia/encyclopedia/cleaner/WikiCleaner.java @@ -0,0 +1,329 @@ +package nu.marginalia.encyclopedia.cleaner; + +import nu.marginalia.encyclopedia.cleaner.model.ArticleData; +import nu.marginalia.encyclopedia.cleaner.model.ArticleParts; +import nu.marginalia.encyclopedia.model.Article; +import nu.marginalia.encyclopedia.model.Link; +import nu.marginalia.encyclopedia.model.LinkList; +import org.jetbrains.annotations.NotNull; +import org.jsoup.Jsoup; +import org.jsoup.nodes.*; +import org.jsoup.select.Elements; + +import java.util.ArrayList; +import java.util.List; +import java.util.Optional; +import java.util.Set; + +public class WikiCleaner { + + private static final String licenseFooter = "This article is issued from Wikipedia. The text is licensed under Creative Commons - Attribution - Sharealike. 
Additional terms may apply for the media files."; + public ArticleData cleanWikiJunk(String url, String html) { + return cleanWikiJunk(url, Jsoup.parse(html)); + } + + private boolean isPresentationRole(Element el) { + return "presentation".equals(el.attr("role")); + } + private boolean isLicenseFooter(Element el) { + // We'll add our own later + if ("div".equals(el.tagName())) { + return licenseFooter.equals(el.wholeOwnText().trim()); + } + + return false; + } + + public ArticleData cleanWikiJunk(String url, Document doc) { + + if (doc.getElementById("content") == null) { + return null; + } + + List disambig = getDisambiguationLinks(doc); + List topLinks = getWikiPageLinks(doc); + + doc.filter(CleanerFilter.builder() + .badClasses(Set.of("infobox", "collapsible", "navbar", "printfooter", + "mw-editsection", "thumb", "sidebar", "navbox", "mw-jump-link", + "vertical-navbox", "mw-indicators", "noprint", "sistersitebox", + "BarChartTemplate")) + .badIds(Set.of("coordinates", "mw-page-base", "mw-head-base", "site-notice", "contentSub", "contentSub2")) + .badTags(Set.of("footer", "script", "object", "embed", "audio", "style", "nosript", "link", "meta", "img")) + .predicates(Set.of(this::isPresentationRole, this::isLicenseFooter)) + .build()); + + doc.getElementsByTag("a").forEach(tag -> { + var href = tag.attr("href"); + var parent = tag.parent(); + + if (null != parent && "li".equals(parent.tagName())) { + tag.removeAttr("title"); + + if (href.startsWith("http://")) { + tag.addClass("extern-link"); + tag.attr("rel", "nofollow"); + } + } else { + tag.replaceWith(new TextNode(tag.text())); + } + }); + + doc.getElementsByTag("cite").tagName("span"); + + doc.filter(CleanerFilter.builder() + .badIds(Set.of("toc", "catlinks", "Notes", "mw-navigation", "mw-data-after-content", "jump-to-nav")) + .badClasses(Set.of("mw-references-wrap", "references", "reference", "siteSub", "refbegin")) + .build() + ); + + doc.getAllElements().forEach(elem -> { + if (elem.parent() != null + && 
"summary".equals(elem.parent().tagName())) + { + elem.parent().replaceWith(elem); + } + }); + + doc.getElementsByClass("mwe-math-element").forEach(mathSpan -> { + var mathTag = mathSpan.getElementsByTag("math").first(); + if (mathTag != null) { + mathSpan.replaceWith(mathTag); + } + }); + + doc.getElementsByTag("span").forEach(elem -> { + if ("pre".equals(elem.parent().tagName())) { + if (elem.hasClass("linenos")) { + elem.replaceWith(new TextNode(String.format("%-4s", elem.text()))); + } + else { + elem.replaceWith(new TextNode(elem.text())); + } + } + else { + elem.replaceWith(new TextNode(" " + elem.text() + " ")); + } + }); + + doc.getElementsByTag("details").forEach(deets -> { + if (deets.children().size() == 1) { + deets.replaceWith(deets.children().first()); + } + else { + deets.tagName("div"); + } + }); + + removeSingularlyNestedDivs(doc); + + removeEmptyTags(doc, "li"); + removeEmptyTags(doc, "ul"); + removeEmptyTags(doc, "div"); + + doc.getElementsByTag("p").forEach(elem -> { + if ("blockquote".equals(elem.parent().tagName())) { + elem.replaceWith(new TextNode(elem.text())); + } + }); + + removeEmptyTags(doc, "p"); + + + + cascadingHeaderCleanup(doc, "h4", "h3", "h2"); + cascadingHeaderCleanup(doc, "h3", "h2"); + cascadingHeaderCleanup(doc, "h2"); + + doc.getElementsByTag("table").forEach(table -> { + table.attr("border", "1"); + + if ("right".equals(table.attr("align"))) { + table.remove(); + } + }); + + doc.getAllElements().forEach(elem -> { + removeWikiClassNames(elem); + + elem.removeAttr("lang"); + elem.removeAttr("dir"); + elem.removeAttr("id"); + elem.removeAttr("role"); + elem.removeAttr("style"); + elem.removeAttr("tabindex"); + elem.removeAttr("aria-haspopup"); + elem.removeAttr("data-section-id"); + elem.removeAttr("aria-expanded"); + elem.removeAttr("aria-pressed"); + elem.removeAttr("open"); + elem.removeAttr("data-level"); + }); + + doc.getElementsByTag("table").remove(); + + // Remove the first header since we'll insert our own in the 
templating + Optional.ofNullable(doc.getElementsByTag("h1").first()).ifPresent(Element::remove); + + ArticleParts articleParts = getDocumentParts(doc); + + return new Article( + url, + doc.title(), + articleParts.getSummary(), + articleParts, + new LinkList(topLinks), + new LinkList(disambig) + ).asData(); + } + + private void removeWikiClassNames(Element elem) { + final String classNames = elem.className(); + + // Note that the string with class names isn't split, + // this is fairly expensive and since most tags don't even + // have classes, we'll optimistically check for presence and then + // pay for the expensive removeClass operation even if unnecessary + // due to a false positive + + if (classNames.contains("verb")) { + elem.removeClass("verb"); + } + + if (classNames.contains("extern-link")) { + elem.removeClass("extern-link"); + } + + if (classNames.contains("margin-note")) { + elem.removeClass("margin-note"); + } + + if (classNames.contains("wikitable")) { + elem.removeClass("wikitable"); + } + + } + + public static ArticleParts getDocumentParts(Document doc) { + + // We expect the document to be one container div with a bunch of children + // each corresponding to a section of the document + + var rootDiv = doc.getElementsByTag("div").first(); + + if (null == rootDiv) { + return new ArticleParts(List.of()); + } + + // To be maximally useful, we want the article as a series of divs corresponding to + // logical sections of the article + + List parts = new ArrayList<>(); + + Element normalizingDiv = null; + for (Element child : rootDiv.children()) { + boolean isDiv = "div".equals(child.tagName()); + + if (!isDiv && normalizingDiv == null) { + normalizingDiv = new Element("div"); + } + + if (isDiv && normalizingDiv != null) { + if (normalizingDiv.childrenSize() > 0) { + parts.add(normalizingDiv.outerHtml()); + } + normalizingDiv = null; + } + + if (normalizingDiv != null) normalizingDiv.appendChild(child.clone()); + if (isDiv && child.childrenSize() > 0) 
parts.add(child.outerHtml()); + + } + if (normalizingDiv != null && + normalizingDiv.childrenSize() > 0) + { + parts.add(normalizingDiv.outerHtml()); + } + + return new ArticleParts(parts); + } + + private void removeSingularlyNestedDivs(Document doc) { + // Remove divs that only contain a single div, and replace them with the inner div + + for (Element div : doc.getElementsByTag("div")) { + final Elements children = div.children(); + + if (children.size() != 1) + continue; + + final Element childDiv = children.first(); + + if (null != childDiv && "div".equals(childDiv.tagName())) { + div.replaceWith(childDiv); + } + } + } + + private void cascadingHeaderCleanup(Document doc, String currH, String... nextHeaders) { + doc.getElementsByTag(currH).forEach(elem -> { + var next = elem.nextElementSibling(); + if (next == null) { + elem.remove(); + return; + } + String nextTagName = next.tagName(); + if (currH.equals(nextTagName)) { + elem.remove(); + } + else for (String h : nextHeaders) { + if (h.equals(nextTagName)) { + elem.remove(); + } + } + }); + } + + private void removeEmptyTags(Document doc, String tag) { + doc.getElementsByTag(tag).forEach(elem -> { + if (elem.text().isBlank() && elem.getElementsByTag("img").isEmpty()) { + elem.replaceWith(new TextNode(" ")); + } + }); + } + + @NotNull + private List getWikiPageLinks(Document doc) { + List topLinks = new ArrayList<>(); + doc.select("p a").forEach(atag -> { + String href = atag.attr("href"); + + if (!href.isBlank() + && !href.contains(":") + && !href.startsWith("#") + ) { + topLinks.add(new Link(href, atag.attr("title"))); + } + }); + return topLinks; + } + + + @NotNull + private List getDisambiguationLinks(Document doc) { + List disambig = new ArrayList<>(); + + for (var note: doc.getElementsByClass("hatnote")) { + for (var atag : note.getElementsByTag("a")) { + String href = atag.attr("href"); + if (atag.hasClass("mw-disambig") && !href.isBlank()) { + disambig.add(new Link(href, atag.attr("title"))); + } + } + 
note.remove(); + } + + return disambig; + } + +} diff --git a/third-party/encyclopedia-marginalia-nu/src/main/java/nu/marginalia/encyclopedia/cleaner/model/ArticleData.java b/third-party/encyclopedia-marginalia-nu/src/main/java/nu/marginalia/encyclopedia/cleaner/model/ArticleData.java new file mode 100644 index 00000000..009c8056 --- /dev/null +++ b/third-party/encyclopedia-marginalia-nu/src/main/java/nu/marginalia/encyclopedia/cleaner/model/ArticleData.java @@ -0,0 +1,10 @@ +package nu.marginalia.encyclopedia.cleaner.model; + +public record ArticleData( + String url, + String title, + String summary, + byte[] parts, + byte[] links, + byte[] disambigs) { +} diff --git a/third-party/encyclopedia-marginalia-nu/src/main/java/nu/marginalia/encyclopedia/cleaner/model/ArticleParts.java b/third-party/encyclopedia-marginalia-nu/src/main/java/nu/marginalia/encyclopedia/cleaner/model/ArticleParts.java new file mode 100644 index 00000000..475390da --- /dev/null +++ b/third-party/encyclopedia-marginalia-nu/src/main/java/nu/marginalia/encyclopedia/cleaner/model/ArticleParts.java @@ -0,0 +1,44 @@ +package nu.marginalia.encyclopedia.cleaner.model; + +import org.jsoup.Jsoup; + +import java.util.List; + +public record ArticleParts(List parts) { + public ArticleParts(String... 
parts) { + this(List.of(parts)); + } + public String articleHtml() { + StringBuilder sb = new StringBuilder(); + for (String part : parts()) { + sb.append(part); + } + return sb.toString(); + } + + public String getSummary() { + if (parts.isEmpty()) + return ""; + + String firstPart = parts.get(0); + var doclet = Jsoup.parse(firstPart); + doclet.getElementsByTag("b").tagName("span"); + var firstP = doclet.select("p").first(); + + if (null == firstP) + return ""; + + StringBuilder ret = new StringBuilder(); + ret.append(firstP.outerHtml()); + + var nextSibling = firstP.nextElementSibling(); + + if (nextSibling != null && + !"p".equals(nextSibling.tagName()) && + !"table".equals(nextSibling.tagName())) + { + ret.append(" ").append(nextSibling.outerHtml()); + } + return ret.toString(); + } +} diff --git a/third-party/encyclopedia-marginalia-nu/src/main/java/nu/marginalia/encyclopedia/model/Article.java b/third-party/encyclopedia-marginalia-nu/src/main/java/nu/marginalia/encyclopedia/model/Article.java new file mode 100644 index 00000000..d317b7d7 --- /dev/null +++ b/third-party/encyclopedia-marginalia-nu/src/main/java/nu/marginalia/encyclopedia/model/Article.java @@ -0,0 +1,35 @@ +package nu.marginalia.encyclopedia.model; + +import nu.marginalia.encyclopedia.cleaner.model.ArticleData; +import nu.marginalia.encyclopedia.cleaner.model.ArticleParts; +import nu.marginalia.encyclopedia.store.ArticleCodec; + +public record Article ( + String url, + String title, + String summary, + ArticleParts parts, + LinkList urls, + LinkList disambigs) +{ + + public ArticleData asData() { + return new ArticleData( + url(), + title(), + summary(), + ArticleCodec.toCompressedJson(parts), + ArticleCodec.toCompressedJson(urls), + ArticleCodec.toCompressedJson(disambigs) + ); + } + + /** Used by template */ + public String articleHtml() { + if (parts == null) { + return ""; + } + + return parts.articleHtml(); + } +} diff --git 
a/third-party/encyclopedia-marginalia-nu/src/main/java/nu/marginalia/encyclopedia/model/Link.java b/third-party/encyclopedia-marginalia-nu/src/main/java/nu/marginalia/encyclopedia/model/Link.java new file mode 100644 index 00000000..42acae3c --- /dev/null +++ b/third-party/encyclopedia-marginalia-nu/src/main/java/nu/marginalia/encyclopedia/model/Link.java @@ -0,0 +1,3 @@ +package nu.marginalia.encyclopedia.model; + +public record Link(String url, String text) { } diff --git a/third-party/encyclopedia-marginalia-nu/src/main/java/nu/marginalia/encyclopedia/model/LinkList.java b/third-party/encyclopedia-marginalia-nu/src/main/java/nu/marginalia/encyclopedia/model/LinkList.java new file mode 100644 index 00000000..58c2d03f --- /dev/null +++ b/third-party/encyclopedia-marginalia-nu/src/main/java/nu/marginalia/encyclopedia/model/LinkList.java @@ -0,0 +1,13 @@ +package nu.marginalia.encyclopedia.model; + +import java.util.List; + +public record LinkList(List links) { + public LinkList(Link... links) { + this(List.of(links)); + } + + public int size() { + return links.size(); + } +} diff --git a/third-party/encyclopedia-marginalia-nu/src/main/java/nu/marginalia/encyclopedia/model/ReferencedArticle.java b/third-party/encyclopedia-marginalia-nu/src/main/java/nu/marginalia/encyclopedia/model/ReferencedArticle.java new file mode 100644 index 00000000..66038edf --- /dev/null +++ b/third-party/encyclopedia-marginalia-nu/src/main/java/nu/marginalia/encyclopedia/model/ReferencedArticle.java @@ -0,0 +1,33 @@ +package nu.marginalia.encyclopedia.model; + +import org.jetbrains.annotations.NotNull; + +import java.util.ArrayList; +import java.util.List; + +public record ReferencedArticle(String title, + List aliases, + String url, + String summary) implements Comparable { + public ReferencedArticle(String title, String url, String summary) { + this(title, List.of(), url, summary); + } + + public ReferencedArticle withAliases(List aliases) { + if (aliases != null && aliases.size() > 1) { 
+ var cleanAliases = new ArrayList<>(aliases); + cleanAliases.remove(title()); + return new ReferencedArticle(title(), cleanAliases, url(), summary()); + } + + return this; + } + + private String compareKey() { + return url.toLowerCase(); + } + @Override + public int compareTo(@NotNull ReferencedArticle referencedArticle) { + return compareKey().compareTo(referencedArticle.compareKey()); + } +} diff --git a/third-party/encyclopedia-marginalia-nu/src/main/java/nu/marginalia/encyclopedia/store/ArticleCodec.java b/third-party/encyclopedia-marginalia-nu/src/main/java/nu/marginalia/encyclopedia/store/ArticleCodec.java new file mode 100644 index 00000000..6aff67d7 --- /dev/null +++ b/third-party/encyclopedia-marginalia-nu/src/main/java/nu/marginalia/encyclopedia/store/ArticleCodec.java @@ -0,0 +1,25 @@ +package nu.marginalia.encyclopedia.store; + +import com.github.luben.zstd.Zstd; +import com.github.luben.zstd.ZstdInputStream; +import com.google.gson.Gson; +import com.google.gson.GsonBuilder; +import marcono1234.gson.recordadapter.RecordTypeAdapterFactory; + +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.io.InputStreamReader; + +public class ArticleCodec { + private static final Gson gson = new GsonBuilder() + .registerTypeAdapterFactory(RecordTypeAdapterFactory.builder().allowMissingComponentValues().create()) + .create(); + + public static byte[] toCompressedJson(Object any) { + return Zstd.compress(gson.toJson(any).getBytes()); + } + public static <T> T fromCompressedJson(byte[] stream, Class<T> type) throws IOException { + return gson.fromJson(new InputStreamReader(new ZstdInputStream(new ByteArrayInputStream(stream))), type); + } + +} diff --git a/third-party/encyclopedia-marginalia-nu/src/main/java/nu/marginalia/encyclopedia/store/ArticleDbProvider.java b/third-party/encyclopedia-marginalia-nu/src/main/java/nu/marginalia/encyclopedia/store/ArticleDbProvider.java new file mode 100644 index 00000000..86ea5d5d --- /dev/null +++ 
b/third-party/encyclopedia-marginalia-nu/src/main/java/nu/marginalia/encyclopedia/store/ArticleDbProvider.java @@ -0,0 +1,33 @@ +package nu.marginalia.encyclopedia.store; + +import java.nio.file.Path; +import java.sql.Connection; +import java.sql.DriverManager; +import java.sql.SQLException; + +public class ArticleDbProvider { + private final Connection connection; + + public ArticleDbProvider(Path filename) throws SQLException { + String sqliteDbString = "jdbc:sqlite:" + filename.toString(); + connection = DriverManager.getConnection(sqliteDbString); + + try (var stmt = connection.createStatement()) { + stmt.executeUpdate(""" + CREATE TABLE IF NOT EXISTS articles ( + url TEXT PRIMARY KEY, + title TEXT NOT NULL, + summary TEXT NOT NULL, + html BLOB NOT NULL, + urls BLOB NOT NULL, + disambigs BLOB NOT NULL + ) + """); + + } + } + + public Connection getConnection() { + return connection; + } +} diff --git a/third-party/encyclopedia-marginalia-nu/src/main/java/nu/marginalia/encyclopedia/store/ArticleStoreWriter.java b/third-party/encyclopedia-marginalia-nu/src/main/java/nu/marginalia/encyclopedia/store/ArticleStoreWriter.java new file mode 100644 index 00000000..20b6b7df --- /dev/null +++ b/third-party/encyclopedia-marginalia-nu/src/main/java/nu/marginalia/encyclopedia/store/ArticleStoreWriter.java @@ -0,0 +1,102 @@ +package nu.marginalia.encyclopedia.store; + +import nu.marginalia.encyclopedia.cleaner.model.ArticleData; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.sql.Connection; +import java.sql.SQLException; +import java.util.ArrayList; +import java.util.List; +import java.util.concurrent.LinkedBlockingQueue; +import java.util.concurrent.TimeUnit; + +public class ArticleStoreWriter implements AutoCloseable { + private final Logger logger = LoggerFactory.getLogger(getClass()); + private final Connection connection; + private final LinkedBlockingQueue<ArticleData> queue = new LinkedBlockingQueue<>(1000); + + Thread insertThread; + volatile boolean 
running; + + public ArticleStoreWriter(ArticleDbProvider dbProvider) throws SQLException { + connection = dbProvider.getConnection(); + + try (var stmt = connection.createStatement()) { + stmt.execute("PRAGMA synchronous = OFF"); + stmt.execute("PRAGMA journal_mode = MEMORY"); + } + + running = true; + insertThread = new Thread(this::insertLoop); + insertThread.start(); + } + + private void insertLoop() { + List<ArticleData> toAdd = new ArrayList<>(); + while (running || !queue.isEmpty()) { + try { + while (0 != queue.drainTo(toAdd, 100)) { + insertItems(toAdd); + toAdd.clear(); + } + if (queue.isEmpty()) { + // Yield for a moment to avoid busy looping + TimeUnit.NANOSECONDS.sleep(100); + } + } catch (SQLException e) { + e.printStackTrace(); + } catch (InterruptedException e) { + throw new RuntimeException(e); + } + } + } + + private void insertItems(List<ArticleData> toAdd) throws SQLException { + try (var stmt = connection.prepareStatement(""" + INSERT OR IGNORE INTO articles (url, title, html, summary, urls, disambigs) + VALUES (?, ?, ?, ?, ?, ?) 
+ """)) + { + connection.setAutoCommit(false); // Disable auto-commit mode + for (var article : toAdd) { + stmt.setString(1, article.url()); + stmt.setString(2, article.title()); + stmt.setBytes(3, article.parts()); + stmt.setString(4, article.summary()); + stmt.setBytes(5, article.links()); + stmt.setBytes(6, article.disambigs()); + + stmt.addBatch(); + } + stmt.executeBatch(); + connection.commit(); // Commit the transaction + } catch (SQLException e) { + connection.rollback(); // Rollback the transaction in case of error + logger.warn("SQL error", e); + } finally { + connection.setAutoCommit(true); // Re-enable auto-commit mode + } + } + + public void add(ArticleData article) { + try { + queue.put(article); + } + catch (InterruptedException e) { + logger.warn("Interrupted", e); + throw new RuntimeException(e); + } + } + + public void close() { + running = false; + try { + insertThread.join(); + connection.close(); + } catch (InterruptedException|SQLException e) { + logger.warn("Error", e); + } + } + +} diff --git a/third-party/openzim/src/main/java/org/openzim/ZIMTypes/ZIMReader.java b/third-party/openzim/src/main/java/org/openzim/ZIMTypes/ZIMReader.java index 1f001d36..e2fcaf6e 100644 --- a/third-party/openzim/src/main/java/org/openzim/ZIMTypes/ZIMReader.java +++ b/third-party/openzim/src/main/java/org/openzim/ZIMTypes/ZIMReader.java @@ -221,7 +221,7 @@ public class ZIMReader { // Gives the minimum required information needed for the given articleName public DirectoryEntry forEachArticles(BiConsumer<String, String> consumer, Predicate<Integer> blobPred) - throws IOException { + throws IOException, InterruptedException { int numberOfArticles = mFile.getArticleCount(); long beg = mFile.getTitlePtrPos(); @@ -237,6 +237,10 @@ public class ZIMReader { for (long i = beg; i < end; i+=4) { var entry = getDirectoryInfoAtTitlePosition(i); + if (Thread.interrupted()) { + throw new InterruptedException(); + } + if (((i-beg)%100_000) == 0) { System.out.printf("%f%%\n", ((i-beg) * 100.) 
/ (end-beg)); } @@ -249,21 +253,25 @@ public class ZIMReader { System.out.println("Iterating over " + data.keySet().stream().mapToInt(Integer::intValue).max() + "clusters"); - data.forEach((pos,blobs) -> { - if (!blobPred.test(pos)) { - return; - } + var iter = data.entrySet().iterator(); + while (iter.hasNext()) { + if (Thread.interrupted()) throw new InterruptedException(); + + var next = iter.next(); + int pos = next.getKey(); + + if (!blobPred.test(pos)) continue; + Map<Integer, String> blobs = next.getValue(); try { getArticleData(consumer, pos, blobs); } catch (Exception ex) { - ex.printStackTrace(); + throw new RuntimeException(ex); } - }); + } return null; - }