(converter) Integrate zim->db conversion into automatic encyclopedia processing workflow
Previously, in order to load encyclopedia data into the search engine, it was necessary to use the encyclopedia.marginalia.nu converter to first create a .db-file. This isn't very ergonomic, so parts of that code-base were lifted in as a 3rd-party library, and conversion from .zim to .db is now done automatically. The output file name is based on the original filename plus a crc32 hash and a .db ending, to ensure we can recycle the data on repeat loads.
This commit is contained in:
parent
22c8fb3f59
commit
27ffb8fa8a
@ -2,6 +2,8 @@ package nu.marginalia.loading;
|
||||
|
||||
import nu.marginalia.io.processed.ProcessedDataFileNames;
|
||||
import nu.marginalia.worklog.BatchingWorkLogInspector;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Path;
|
||||
@ -10,13 +12,21 @@ import java.util.*;
|
||||
|
||||
public class LoaderInputData {
|
||||
private final List<Path> sourceDirectories;
|
||||
private static final Logger logger = LoggerFactory.getLogger(LoaderInputData.class);
|
||||
private final Map<Path, Integer> lastGoodBatch = new HashMap<>();
|
||||
|
||||
/** Records the given source directories and, for each, the number of valid
 *  batches reported by its processor.log work log.
 *
 * @param sourceDirectories directories containing processed data to load
 * @throws IOException if a processor.log cannot be read
 */
public LoaderInputData(List<Path> sourceDirectories) throws IOException {
    this.sourceDirectories = sourceDirectories;

    for (var source : sourceDirectories) {
        // Inspect the work log exactly once per source; the result is both
        // stored and checked, so avoid calling getValidBatches twice
        int lastGoodBatch = BatchingWorkLogInspector.getValidBatches(source.resolve("processor.log"));

        this.lastGoodBatch.put(source, lastGoodBatch);

        if (lastGoodBatch == 0) {
            // This is useful diagnostic information, so we log it as a warning
            logger.warn("No valid batches found in {}", source);
        }
    }
}
|
||||
|
@ -1,11 +1,17 @@
|
||||
<h1 class="my-3">Sideload Encyclopedia</h1>
|
||||
|
||||
<div class="my-3 p-3 border bg-light">
|
||||
<p>This will sideload a pre-converted MediaWiki-style OpenZim data set.
|
||||
See the <a href="https://github.com/MarginaliaSearch/MarginaliaSearch/blob/master/doc/sideloading-howto.md">sideloading howto</a>
|
||||
for instructions how to produce this file. </p>
|
||||
<p>Place an articles.db file in the upload directory on the server, and select it from the list
|
||||
below. </p>
|
||||
<p>This will side-load a MediaWiki-style OpenZim data set. Place a zim file in the uploads directory.
|
||||
For Wikipedia, the zim file can be downloaded from <a href="https://download.kiwix.org/zim/wikipedia/">https://download.kiwix.org/zim/wikipedia/</a>.
|
||||
The en_all_nopic sets are recommended for Wikipedia, since they are smaller and do not contain images
|
||||
(which are not used anyway). For testing, the _mini or _en_100 sets are good choices.
|
||||
<p></p>
|
||||
The zim file will be converted to a sqlite database (.db-file) with a similar name to
|
||||
the zim file, which then automatically is turned into processed data.
|
||||
<p></p>
|
||||
Since the first stage of processing is very time-consuming, the sqlite database can
|
||||
also be loaded from this form.
|
||||
</p>
|
||||
</div>
|
||||
<form method="post" action="actions/sideload-encyclopedia" onsubmit="return confirm('Confirm sideloading')">
|
||||
<div class="my-3 py-3">
|
||||
|
@ -45,6 +45,7 @@ dependencies {
|
||||
implementation project(':code:api:query-api')
|
||||
implementation project(':code:api:process-mqapi')
|
||||
implementation project(':code:api:executor-api')
|
||||
implementation project(':third-party:encyclopedia-marginalia-nu')
|
||||
|
||||
implementation libs.bundles.slf4j
|
||||
|
||||
|
@ -7,6 +7,7 @@ import nu.marginalia.actor.prototype.RecordActorPrototype;
|
||||
import nu.marginalia.actor.state.ActorResumeBehavior;
|
||||
import nu.marginalia.actor.state.ActorStep;
|
||||
import nu.marginalia.actor.state.Resume;
|
||||
import nu.marginalia.encyclopedia.EncyclopediaConverter;
|
||||
import nu.marginalia.process.ProcessOutboxes;
|
||||
import nu.marginalia.process.ProcessService;
|
||||
import nu.marginalia.storage.FileStorageService;
|
||||
@ -16,21 +17,27 @@ import nu.marginalia.storage.model.FileStorageState;
|
||||
import nu.marginalia.storage.model.FileStorageType;
|
||||
import nu.marginalia.mq.MqMessageState;
|
||||
import nu.marginalia.mq.outbox.MqOutbox;
|
||||
import nu.marginalia.mqapi.converting.ConvertAction;
|
||||
import nu.marginalia.mqapi.converting.ConvertRequest;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.zip.CRC32;
|
||||
|
||||
@Singleton
|
||||
public class ConvertActor extends RecordActorPrototype {
|
||||
|
||||
private static final Logger logger = LoggerFactory.getLogger(ConvertActor.class);
|
||||
private final ActorProcessWatcher processWatcher;
|
||||
private final MqOutbox mqConverterOutbox;
|
||||
private final FileStorageService storageService;
|
||||
private final Gson gson;
|
||||
|
||||
public record Convert(FileStorageId fid) implements ActorStep {};
|
||||
public record ConvertEncyclopedia(String source, String baseUrl) implements ActorStep {};
|
||||
public record PredigestEncyclopedia(String source, String dest, String baseUrl) implements ActorStep {};
|
||||
public record ConvertDirtree(String source) implements ActorStep {};
|
||||
public record ConvertWarc(String source) implements ActorStep {};
|
||||
public record ConvertStackexchange(String source) implements ActorStep {};
|
||||
@ -100,6 +107,19 @@ public class ConvertActor extends RecordActorPrototype {
|
||||
if (!Files.exists(sourcePath))
|
||||
yield new Error("Source path does not exist: " + sourcePath);
|
||||
|
||||
if (source.toLowerCase().endsWith(".zim")) {
|
||||
// If we're fed a ZIM file, we need to convert it to a sqlite database first
|
||||
String hash = getCrc32FileHash(sourcePath);
|
||||
|
||||
// To avoid re-converting the same file, we'll assign the file a name based on its hash
|
||||
// and the original filename. This way, if we're fed the same file again, we'll be able to just
|
||||
// re-use the predigested database file.
|
||||
yield new PredigestEncyclopedia(source, STR."\{source}.\{hash}.db", baseUrl);
|
||||
} else if (!source.endsWith(".db")) {
|
||||
yield new Error("Source path must be a ZIM or pre-digested sqlite database file (.db)");
|
||||
}
|
||||
|
||||
|
||||
String fileName = sourcePath.toFile().getName();
|
||||
|
||||
var base = storageService.getStorageBase(FileStorageBaseType.STORAGE);
|
||||
@ -114,6 +134,36 @@ public class ConvertActor extends RecordActorPrototype {
|
||||
mqConverterOutbox.sendAsync(ConvertRequest.forEncyclopedia(sourcePath, baseUrl, processedArea.id()))
|
||||
);
|
||||
}
|
||||
case PredigestEncyclopedia(String source, String dest, String baseUrl) -> {
|
||||
Path sourcePath = Path.of(source);
|
||||
|
||||
if (!Files.exists(sourcePath)) {
|
||||
yield new Error("Source path does not exist: " + sourcePath);
|
||||
}
|
||||
|
||||
Path destPath = Path.of(dest);
|
||||
if (Files.exists(destPath)) {
|
||||
// Already predigested, go straight to convert step
|
||||
yield new ConvertEncyclopedia(dest, baseUrl);
|
||||
}
|
||||
|
||||
Path tempFile = Files.createTempFile(destPath.getParent(), "encyclopedia", "db.tmp");
|
||||
|
||||
try {
|
||||
EncyclopediaConverter.convert(sourcePath, tempFile);
|
||||
Files.move(tempFile, destPath);
|
||||
}
|
||||
catch (Exception e) {
|
||||
logger.error("Failed to convert ZIM file to sqlite database", e);
|
||||
Files.deleteIfExists(tempFile);
|
||||
Files.deleteIfExists(destPath);
|
||||
|
||||
yield new Error("Failed to convert ZIM file to sqlite database: " + e.getMessage());
|
||||
}
|
||||
|
||||
// Go back to convert step with the new database file
|
||||
yield new ConvertEncyclopedia(dest, baseUrl);
|
||||
}
|
||||
case ConvertStackexchange(String source) -> {
|
||||
|
||||
Path sourcePath = Path.of(source);
|
||||
@ -150,6 +200,22 @@ public class ConvertActor extends RecordActorPrototype {
|
||||
};
|
||||
}
|
||||
|
||||
private String getCrc32FileHash(Path file) throws IOException {
|
||||
ByteBuffer buffer = ByteBuffer.allocate(8192);
|
||||
|
||||
try (var channel = Files.newByteChannel(file)) {
|
||||
CRC32 crc = new CRC32();
|
||||
|
||||
while (channel.read(buffer) > 0) {
|
||||
buffer.flip();
|
||||
crc.update(buffer);
|
||||
buffer.clear();
|
||||
}
|
||||
|
||||
return Long.toHexString(crc.getValue());
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public String describe() {
|
||||
return "Convert a set of crawl data into a format suitable for loading into the database.";
|
||||
@ -165,6 +231,5 @@ public class ConvertActor extends RecordActorPrototype {
|
||||
this.processWatcher = processWatcher;
|
||||
this.mqConverterOutbox = processOutboxes.getConverterOutbox();
|
||||
this.storageService = storageService;
|
||||
this.gson = gson;
|
||||
}
|
||||
}
|
||||
|
@ -99,6 +99,7 @@ include 'third-party:monkey-patch-opennlp'
|
||||
include 'third-party:monkey-patch-gson'
|
||||
include 'third-party:commons-codec'
|
||||
include 'third-party:parquet-floor'
|
||||
include 'third-party:encyclopedia-marginalia-nu'
|
||||
|
||||
|
||||
dependencyResolutionManagement {
|
||||
|
4
third-party/README.md
vendored
4
third-party/README.md
vendored
@ -9,9 +9,9 @@ or lack an artifact, or to override some default that is inappropriate for the t
|
||||
* [RDRPosTagger](rdrpostagger/) - GPL3
|
||||
* [PorterStemmer](porterstemmer/) - LGPL3
|
||||
* [Uppend](uppend/) - MIT
|
||||
* [OpenZIM](openzim/) - GPL-2.0
|
||||
* [OpenZIM](openzim/) - GPL-2.0+
|
||||
* [Commons Codec](commons-codec/) - Apache 2.0
|
||||
|
||||
* [encyclopedia.marginalia.nu](encyclopedia-marginalia-nu/) - GPL 2.0+
|
||||
### Repackaged
|
||||
* [SymSpell](symspell/) - LGPL-3.0
|
||||
* [Count-Min-Sketch](count-min-sketch/) - Apache 2.0
|
||||
|
26
third-party/encyclopedia-marginalia-nu/build.gradle
vendored
Normal file
26
third-party/encyclopedia-marginalia-nu/build.gradle
vendored
Normal file
@ -0,0 +1,26 @@
|
||||
plugins {
|
||||
id 'java'
|
||||
}
|
||||
|
||||
java {
|
||||
toolchain {
|
||||
languageVersion.set(JavaLanguageVersion.of(21))
|
||||
}
|
||||
}
|
||||
|
||||
dependencies {
|
||||
implementation libs.jsoup
|
||||
implementation libs.notnull
|
||||
implementation libs.bundles.gson
|
||||
implementation libs.zstd
|
||||
implementation libs.bundles.slf4j
|
||||
|
||||
implementation project(':code:libraries:blocking-thread-pool')
|
||||
|
||||
implementation project(':third-party:xz')
|
||||
implementation project(':third-party:openzim')
|
||||
}
|
||||
|
||||
test {
|
||||
useJUnitPlatform()
|
||||
}
|
5
third-party/encyclopedia-marginalia-nu/readme.md
vendored
Normal file
5
third-party/encyclopedia-marginalia-nu/readme.md
vendored
Normal file
@ -0,0 +1,5 @@
|
||||
This package contains a severely stripped down version of the codebase from
|
||||
[encyclopedia.marginalia.nu](https://encyclopedia.marginalia.nu/).
|
||||
|
||||
The extracted code is a ZimFile reader and WikiHTML cleaner. It is used by the
|
||||
encyclopedia side-loader.
|
@ -0,0 +1,67 @@
|
||||
package nu.marginalia.encyclopedia;
|
||||
|
||||
import nu.marginalia.encyclopedia.cleaner.WikiCleaner;
|
||||
import nu.marginalia.encyclopedia.store.ArticleDbProvider;
|
||||
import nu.marginalia.encyclopedia.store.ArticleStoreWriter;
|
||||
import nu.marginalia.util.SimpleBlockingThreadPool;
|
||||
import org.openzim.ZIMTypes.ZIMFile;
|
||||
import org.openzim.ZIMTypes.ZIMReader;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.slf4j.Logger;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.sql.SQLException;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
import java.util.function.BiConsumer;
|
||||
import java.util.function.Predicate;
|
||||
|
||||
/** Converts an OpenZim file with Wikipedia articles to a SQLite database
|
||||
* with cleaned-up MediaWiki HTML
|
||||
*/
|
||||
public class EncyclopediaConverter {
|
||||
private static final Logger logger = LoggerFactory.getLogger(EncyclopediaConverter.class);
|
||||
|
||||
public static void convert(Path inputFile, Path outputFile) throws IOException, SQLException, InterruptedException {
|
||||
var wc = new WikiCleaner();
|
||||
var pool = new SimpleBlockingThreadPool("Convert ZIM",
|
||||
Math.clamp(Runtime.getRuntime().availableProcessors() - 2, 1, 32),
|
||||
2);
|
||||
var size = new AtomicInteger();
|
||||
|
||||
if (!Files.exists(inputFile)) {
|
||||
throw new IllegalStateException("ZIM file not found: " + inputFile);
|
||||
}
|
||||
Files.deleteIfExists(outputFile);
|
||||
|
||||
try (var asw = new ArticleStoreWriter(new ArticleDbProvider(outputFile))) {
|
||||
Predicate<Integer> keepGoing = (s) -> true;
|
||||
|
||||
BiConsumer<String, String> handleArticle = (url, html) -> {
|
||||
if (pool.isTerminated())
|
||||
return;
|
||||
|
||||
pool.submitQuietly(() -> {
|
||||
int sz = size.incrementAndGet();
|
||||
if (sz % 1000 == 0) {
|
||||
System.out.printf("\u001b[2K\r%d", sz);
|
||||
}
|
||||
asw.add(wc.cleanWikiJunk(url, html));
|
||||
});
|
||||
|
||||
size.incrementAndGet();
|
||||
};
|
||||
|
||||
new ZIMReader(new ZIMFile(inputFile.toString())).forEachArticles(handleArticle, keepGoing);
|
||||
|
||||
pool.shutDown();
|
||||
logger.info("Waiting for pool to finish");
|
||||
|
||||
while (!pool.awaitTermination(1, TimeUnit.SECONDS)) {
|
||||
// ...
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
@ -0,0 +1,60 @@
|
||||
package nu.marginalia.encyclopedia.cleaner;
|
||||
|
||||
import lombok.Builder;
|
||||
import org.jsoup.nodes.Comment;
|
||||
import org.jsoup.nodes.Element;
|
||||
import org.jsoup.nodes.Node;
|
||||
import org.jsoup.select.NodeFilter;
|
||||
|
||||
import java.util.Set;
|
||||
import java.util.function.Predicate;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
@Builder
|
||||
public class CleanerFilter implements NodeFilter {
|
||||
final Set<String> badTags;
|
||||
final Set<String> badIds;
|
||||
final Set<String> badClasses;
|
||||
|
||||
final Set<Predicate<Element>> predicates;
|
||||
|
||||
private static final Pattern spacePattern = Pattern.compile("\\s+");
|
||||
|
||||
@Override
|
||||
public FilterResult head(Node node, int depth) {
|
||||
if (node instanceof Element el) {
|
||||
if (badTags != null && badTags.contains(el.tagName()))
|
||||
return FilterResult.REMOVE;
|
||||
|
||||
if (badIds != null && badIds.contains(el.id()))
|
||||
return FilterResult.REMOVE;
|
||||
|
||||
if (badClasses != null) {
|
||||
String className = el.className();
|
||||
if (className.contains(" ")) {
|
||||
String[] parts = spacePattern.split(className);
|
||||
for (var c : parts) {
|
||||
if (badClasses.contains(c))
|
||||
return FilterResult.REMOVE;
|
||||
}
|
||||
}
|
||||
else if (badClasses.contains(className)) {
|
||||
return FilterResult.REMOVE;
|
||||
}
|
||||
}
|
||||
|
||||
if (predicates != null) {
|
||||
for (var pred : predicates) {
|
||||
if (pred.test(el))
|
||||
return FilterResult.REMOVE;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (node instanceof Comment) {
|
||||
return FilterResult.REMOVE;
|
||||
}
|
||||
|
||||
return FilterResult.CONTINUE;
|
||||
}
|
||||
}
|
@ -0,0 +1,329 @@
|
||||
package nu.marginalia.encyclopedia.cleaner;
|
||||
|
||||
import nu.marginalia.encyclopedia.cleaner.model.ArticleData;
|
||||
import nu.marginalia.encyclopedia.cleaner.model.ArticleParts;
|
||||
import nu.marginalia.encyclopedia.model.Article;
|
||||
import nu.marginalia.encyclopedia.model.Link;
|
||||
import nu.marginalia.encyclopedia.model.LinkList;
|
||||
import org.jetbrains.annotations.NotNull;
|
||||
import org.jsoup.Jsoup;
|
||||
import org.jsoup.nodes.*;
|
||||
import org.jsoup.select.Elements;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Optional;
|
||||
import java.util.Set;
|
||||
|
||||
public class WikiCleaner {
|
||||
|
||||
private static final String licenseFooter = "This article is issued from Wikipedia. The text is licensed under Creative Commons - Attribution - Sharealike. Additional terms may apply for the media files.";
|
||||
public ArticleData cleanWikiJunk(String url, String html) {
|
||||
return cleanWikiJunk(url, Jsoup.parse(html));
|
||||
}
|
||||
|
||||
private boolean isPresentationRole(Element el) {
|
||||
return "presentation".equals(el.attr("role"));
|
||||
}
|
||||
private boolean isLicenseFooter(Element el) {
|
||||
// We'll add our own later
|
||||
if ("div".equals(el.tagName())) {
|
||||
return licenseFooter.equals(el.wholeOwnText().trim());
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
public ArticleData cleanWikiJunk(String url, Document doc) {
|
||||
|
||||
if (doc.getElementById("content") == null) {
|
||||
return null;
|
||||
}
|
||||
|
||||
List<Link> disambig = getDisambiguationLinks(doc);
|
||||
List<Link> topLinks = getWikiPageLinks(doc);
|
||||
|
||||
doc.filter(CleanerFilter.builder()
|
||||
.badClasses(Set.of("infobox", "collapsible", "navbar", "printfooter",
|
||||
"mw-editsection", "thumb", "sidebar", "navbox", "mw-jump-link",
|
||||
"vertical-navbox", "mw-indicators", "noprint", "sistersitebox",
|
||||
"BarChartTemplate"))
|
||||
.badIds(Set.of("coordinates", "mw-page-base", "mw-head-base", "site-notice", "contentSub", "contentSub2"))
|
||||
.badTags(Set.of("footer", "script", "object", "embed", "audio", "style", "nosript", "link", "meta", "img"))
|
||||
.predicates(Set.of(this::isPresentationRole, this::isLicenseFooter))
|
||||
.build());
|
||||
|
||||
doc.getElementsByTag("a").forEach(tag -> {
|
||||
var href = tag.attr("href");
|
||||
var parent = tag.parent();
|
||||
|
||||
if (null != parent && "li".equals(parent.tagName())) {
|
||||
tag.removeAttr("title");
|
||||
|
||||
if (href.startsWith("http://")) {
|
||||
tag.addClass("extern-link");
|
||||
tag.attr("rel", "nofollow");
|
||||
}
|
||||
} else {
|
||||
tag.replaceWith(new TextNode(tag.text()));
|
||||
}
|
||||
});
|
||||
|
||||
doc.getElementsByTag("cite").tagName("span");
|
||||
|
||||
doc.filter(CleanerFilter.builder()
|
||||
.badIds(Set.of("toc", "catlinks", "Notes", "mw-navigation", "mw-data-after-content", "jump-to-nav"))
|
||||
.badClasses(Set.of("mw-references-wrap", "references", "reference", "siteSub", "refbegin"))
|
||||
.build()
|
||||
);
|
||||
|
||||
doc.getAllElements().forEach(elem -> {
|
||||
if (elem.parent() != null
|
||||
&& "summary".equals(elem.parent().tagName()))
|
||||
{
|
||||
elem.parent().replaceWith(elem);
|
||||
}
|
||||
});
|
||||
|
||||
doc.getElementsByClass("mwe-math-element").forEach(mathSpan -> {
|
||||
var mathTag = mathSpan.getElementsByTag("math").first();
|
||||
if (mathTag != null) {
|
||||
mathSpan.replaceWith(mathTag);
|
||||
}
|
||||
});
|
||||
|
||||
doc.getElementsByTag("span").forEach(elem -> {
|
||||
if ("pre".equals(elem.parent().tagName())) {
|
||||
if (elem.hasClass("linenos")) {
|
||||
elem.replaceWith(new TextNode(String.format("%-4s", elem.text())));
|
||||
}
|
||||
else {
|
||||
elem.replaceWith(new TextNode(elem.text()));
|
||||
}
|
||||
}
|
||||
else {
|
||||
elem.replaceWith(new TextNode(" " + elem.text() + " "));
|
||||
}
|
||||
});
|
||||
|
||||
doc.getElementsByTag("details").forEach(deets -> {
|
||||
if (deets.children().size() == 1) {
|
||||
deets.replaceWith(deets.children().first());
|
||||
}
|
||||
else {
|
||||
deets.tagName("div");
|
||||
}
|
||||
});
|
||||
|
||||
removeSingularlyNestedDivs(doc);
|
||||
|
||||
removeEmptyTags(doc, "li");
|
||||
removeEmptyTags(doc, "ul");
|
||||
removeEmptyTags(doc, "div");
|
||||
|
||||
doc.getElementsByTag("p").forEach(elem -> {
|
||||
if ("blockquote".equals(elem.parent().tagName())) {
|
||||
elem.replaceWith(new TextNode(elem.text()));
|
||||
}
|
||||
});
|
||||
|
||||
removeEmptyTags(doc, "p");
|
||||
|
||||
|
||||
|
||||
cascadingHeaderCleanup(doc, "h4", "h3", "h2");
|
||||
cascadingHeaderCleanup(doc, "h3", "h2");
|
||||
cascadingHeaderCleanup(doc, "h2");
|
||||
|
||||
doc.getElementsByTag("table").forEach(table -> {
|
||||
table.attr("border", "1");
|
||||
|
||||
if ("right".equals(table.attr("align"))) {
|
||||
table.remove();
|
||||
}
|
||||
});
|
||||
|
||||
doc.getAllElements().forEach(elem -> {
|
||||
removeWikiClassNames(elem);
|
||||
|
||||
elem.removeAttr("lang");
|
||||
elem.removeAttr("dir");
|
||||
elem.removeAttr("id");
|
||||
elem.removeAttr("role");
|
||||
elem.removeAttr("style");
|
||||
elem.removeAttr("tabindex");
|
||||
elem.removeAttr("aria-haspopup");
|
||||
elem.removeAttr("data-section-id");
|
||||
elem.removeAttr("aria-expanded");
|
||||
elem.removeAttr("aria-pressed");
|
||||
elem.removeAttr("open");
|
||||
elem.removeAttr("data-level");
|
||||
});
|
||||
|
||||
doc.getElementsByTag("table").remove();
|
||||
|
||||
// Remove the first header since we'll insert our own in the templating
|
||||
Optional.ofNullable(doc.getElementsByTag("h1").first()).ifPresent(Element::remove);
|
||||
|
||||
ArticleParts articleParts = getDocumentParts(doc);
|
||||
|
||||
return new Article(
|
||||
url,
|
||||
doc.title(),
|
||||
articleParts.getSummary(),
|
||||
articleParts,
|
||||
new LinkList(topLinks),
|
||||
new LinkList(disambig)
|
||||
).asData();
|
||||
}
|
||||
|
||||
private void removeWikiClassNames(Element elem) {
|
||||
final String classNames = elem.className();
|
||||
|
||||
// Note that the string with class names isn't split,
|
||||
// this is fairly expensive and since most tags don't even
|
||||
// have classes, we'll optimistically check for presence and then
|
||||
// pay for the expensive removeClass operation even if unnecessary
|
||||
// due to a false positive
|
||||
|
||||
if (classNames.contains("verb")) {
|
||||
elem.removeClass("verb");
|
||||
}
|
||||
|
||||
if (classNames.contains("extern-link")) {
|
||||
elem.removeClass("extern-link");
|
||||
}
|
||||
|
||||
if (classNames.contains("margin-note")) {
|
||||
elem.removeClass("margin-note");
|
||||
}
|
||||
|
||||
if (classNames.contains("wikitable")) {
|
||||
elem.removeClass("wikitable");
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/** Splits the cleaned document into per-section HTML fragments.
 *
 * @param doc the cleaned article document; expected to be one container div
 *            whose children correspond to sections
 * @return the sections as an ArticleParts; empty if no container div exists
 */
public static ArticleParts getDocumentParts(Document doc) {

    // We expect the document to be one container div with a bunch of children
    // each corresponding to a section of the document

    var rootDiv = doc.getElementsByTag("div").first();

    if (null == rootDiv) {
        // Nothing recognizable; return an empty part list rather than null
        return new ArticleParts(List.of());
    }

    // To be maximally useful, we want the article as a series of divs corresponding to
    // logical sections of the article

    List<String> parts = new ArrayList<>();

    // Accumulates runs of consecutive non-div children into a synthetic
    // wrapper div, which is flushed whenever a real div is encountered
    Element normalizingDiv = null;
    for (Element child : rootDiv.children()) {
        boolean isDiv = "div".equals(child.tagName());

        // Start a synthetic section when we hit loose (non-div) content
        if (!isDiv && normalizingDiv == null) {
            normalizingDiv = new Element("div");
        }

        // A real div ends any in-progress synthetic section; flush it first
        if (isDiv && normalizingDiv != null) {
            if (normalizingDiv.childrenSize() > 0) {
                parts.add(normalizingDiv.outerHtml());
            }
            normalizingDiv = null;
        }

        // clone() so the child can be placed in the synthetic div without
        // being detached from the source document mid-iteration
        if (normalizingDiv != null) normalizingDiv.appendChild(child.clone());
        if (isDiv && child.childrenSize() > 0) parts.add(child.outerHtml());

    }
    // Flush a trailing synthetic section, if any
    if (normalizingDiv != null &&
            normalizingDiv.childrenSize() > 0)
    {
        parts.add(normalizingDiv.outerHtml());
    }

    return new ArticleParts(parts);
}
|
||||
|
||||
private void removeSingularlyNestedDivs(Document doc) {
|
||||
// Remove divs that only contain a single div, and replace them with the inner div
|
||||
|
||||
for (Element div : doc.getElementsByTag("div")) {
|
||||
final Elements children = div.children();
|
||||
|
||||
if (children.size() != 1)
|
||||
continue;
|
||||
|
||||
final Element childDiv = children.first();
|
||||
|
||||
if (null != childDiv && "div".equals(childDiv.tagName())) {
|
||||
div.replaceWith(childDiv);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void cascadingHeaderCleanup(Document doc, String currH, String... nextHeaders) {
|
||||
doc.getElementsByTag(currH).forEach(elem -> {
|
||||
var next = elem.nextElementSibling();
|
||||
if (next == null) {
|
||||
elem.remove();
|
||||
return;
|
||||
}
|
||||
String nextTagName = next.tagName();
|
||||
if (currH.equals(nextTagName)) {
|
||||
elem.remove();
|
||||
}
|
||||
else for (String h : nextHeaders) {
|
||||
if (h.equals(nextTagName)) {
|
||||
elem.remove();
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
private void removeEmptyTags(Document doc, String tag) {
|
||||
doc.getElementsByTag(tag).forEach(elem -> {
|
||||
if (elem.text().isBlank() && elem.getElementsByTag("img").isEmpty()) {
|
||||
elem.replaceWith(new TextNode(" "));
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
@NotNull
|
||||
private List<Link> getWikiPageLinks(Document doc) {
|
||||
List<Link> topLinks = new ArrayList<>();
|
||||
doc.select("p a").forEach(atag -> {
|
||||
String href = atag.attr("href");
|
||||
|
||||
if (!href.isBlank()
|
||||
&& !href.contains(":")
|
||||
&& !href.startsWith("#")
|
||||
) {
|
||||
topLinks.add(new Link(href, atag.attr("title")));
|
||||
}
|
||||
});
|
||||
return topLinks;
|
||||
}
|
||||
|
||||
|
||||
@NotNull
|
||||
private List<Link> getDisambiguationLinks(Document doc) {
|
||||
List<Link> disambig = new ArrayList<>();
|
||||
|
||||
for (var note: doc.getElementsByClass("hatnote")) {
|
||||
for (var atag : note.getElementsByTag("a")) {
|
||||
String href = atag.attr("href");
|
||||
if (atag.hasClass("mw-disambig") && !href.isBlank()) {
|
||||
disambig.add(new Link(href, atag.attr("title")));
|
||||
}
|
||||
}
|
||||
note.remove();
|
||||
}
|
||||
|
||||
return disambig;
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,10 @@
|
||||
package nu.marginalia.encyclopedia.cleaner.model;
|
||||
|
||||
/** Flat, serialization-ready form of a cleaned article.
 *  The parts, links and disambigs fields hold zstd-compressed JSON
 *  (see ArticleCodec) rather than live objects. */
public record ArticleData(
        String url,
        String title,
        String summary,
        byte[] parts,
        byte[] links,
        byte[] disambigs) {
}
|
@ -0,0 +1,44 @@
|
||||
package nu.marginalia.encyclopedia.cleaner.model;
|
||||
|
||||
import org.jsoup.Jsoup;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
public record ArticleParts(List<String> parts) {
|
||||
public ArticleParts(String... parts) {
|
||||
this(List.of(parts));
|
||||
}
|
||||
public String articleHtml() {
|
||||
StringBuilder sb = new StringBuilder();
|
||||
for (String part : parts()) {
|
||||
sb.append(part);
|
||||
}
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
public String getSummary() {
|
||||
if (parts.isEmpty())
|
||||
return "";
|
||||
|
||||
String firstPart = parts.get(0);
|
||||
var doclet = Jsoup.parse(firstPart);
|
||||
doclet.getElementsByTag("b").tagName("span");
|
||||
var firstP = doclet.select("p").first();
|
||||
|
||||
if (null == firstP)
|
||||
return "";
|
||||
|
||||
StringBuilder ret = new StringBuilder();
|
||||
ret.append(firstP.outerHtml());
|
||||
|
||||
var nextSibling = firstP.nextElementSibling();
|
||||
|
||||
if (nextSibling != null &&
|
||||
!"p".equals(nextSibling.tagName()) &&
|
||||
!"table".equals(nextSibling.tagName()))
|
||||
{
|
||||
ret.append(" ").append(nextSibling.outerHtml());
|
||||
}
|
||||
return ret.toString();
|
||||
}
|
||||
}
|
@ -0,0 +1,35 @@
|
||||
package nu.marginalia.encyclopedia.model;
|
||||
|
||||
import nu.marginalia.encyclopedia.cleaner.model.ArticleData;
|
||||
import nu.marginalia.encyclopedia.cleaner.model.ArticleParts;
|
||||
import nu.marginalia.encyclopedia.store.ArticleCodec;
|
||||
|
||||
public record Article (
|
||||
String url,
|
||||
String title,
|
||||
String summary,
|
||||
ArticleParts parts,
|
||||
LinkList urls,
|
||||
LinkList disambigs)
|
||||
{
|
||||
|
||||
public ArticleData asData() {
|
||||
return new ArticleData(
|
||||
url(),
|
||||
title(),
|
||||
summary(),
|
||||
ArticleCodec.toCompressedJson(parts),
|
||||
ArticleCodec.toCompressedJson(urls),
|
||||
ArticleCodec.toCompressedJson(disambigs)
|
||||
);
|
||||
}
|
||||
|
||||
/** Used by template */
|
||||
public String articleHtml() {
|
||||
if (parts == null) {
|
||||
return "";
|
||||
}
|
||||
|
||||
return parts.articleHtml();
|
||||
}
|
||||
}
|
@ -0,0 +1,3 @@
|
||||
package nu.marginalia.encyclopedia.model;
|
||||
|
||||
/** A hyperlink, pairing a target URL with its anchor/title text. */
public record Link(String url, String text) { }
|
@ -0,0 +1,13 @@
|
||||
package nu.marginalia.encyclopedia.model;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
/** A list of article links, e.g. outbound page links or disambiguation links. */
public record LinkList(List<Link> links) {
    /** Convenience varargs constructor; produces an immutable list. */
    public LinkList(Link... links) {
        this(List.of(links));
    }

    /** Number of links in the list. */
    public int size() {
        return links.size();
    }
}
|
@ -0,0 +1,33 @@
|
||||
package nu.marginalia.encyclopedia.model;
|
||||
|
||||
import org.jetbrains.annotations.NotNull;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
/** An article referenced from another article, with enough context to render
 *  a link and a preview. Ordered case-insensitively by URL. */
public record ReferencedArticle(String title,
                                List<String> aliases,
                                String url,
                                String summary) implements Comparable<ReferencedArticle> {
    /** Convenience constructor for an article without aliases. */
    public ReferencedArticle(String title, String url, String summary) {
        this(title, List.of(), url, summary);
    }

    /** Returns a copy whose alias list excludes the article's own title.
     *  NOTE(review): the filtering only runs when there are at least two
     *  aliases; a single alias equal to the title is kept unchanged --
     *  confirm whether that is intended. */
    public ReferencedArticle withAliases(List<String> aliases) {
        if (aliases != null && aliases.size() > 1) {
            var cleanAliases = new ArrayList<>(aliases);
            cleanAliases.remove(title());
            return new ReferencedArticle(title(), cleanAliases, url(), summary());
        }

        return this;
    }

    // Ordering key: URLs compared case-insensitively.
    // NOTE(review): this ordering is not consistent with the record's equals
    // (which also compares title/aliases/summary); take care when using this
    // type in sorted sets or maps.
    private String compareKey() {
        return url.toLowerCase();
    }
    @Override
    public int compareTo(@NotNull ReferencedArticle referencedArticle) {
        return compareKey().compareTo(referencedArticle.compareKey());
    }
}
|
@ -0,0 +1,25 @@
|
||||
package nu.marginalia.encyclopedia.store;
|
||||
|
||||
import com.github.luben.zstd.Zstd;
|
||||
import com.github.luben.zstd.ZstdInputStream;
|
||||
import com.google.gson.Gson;
|
||||
import com.google.gson.GsonBuilder;
|
||||
import marcono1234.gson.recordadapter.RecordTypeAdapterFactory;
|
||||
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStreamReader;
|
||||
|
||||
public class ArticleCodec {
|
||||
private static final Gson gson = new GsonBuilder()
|
||||
.registerTypeAdapterFactory(RecordTypeAdapterFactory.builder().allowMissingComponentValues().create())
|
||||
.create();
|
||||
|
||||
public static byte[] toCompressedJson(Object any) {
|
||||
return Zstd.compress(gson.toJson(any).getBytes());
|
||||
}
|
||||
public static <T> T fromCompressedJson(byte[] stream, Class<T> type) throws IOException {
|
||||
return gson.fromJson(new InputStreamReader(new ZstdInputStream(new ByteArrayInputStream(stream))), type);
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,33 @@
|
||||
package nu.marginalia.encyclopedia.store;
|
||||
|
||||
import java.nio.file.Path;
|
||||
import java.sql.Connection;
|
||||
import java.sql.DriverManager;
|
||||
import java.sql.SQLException;
|
||||
|
||||
/** Opens (creating if necessary) the sqlite database that stores converted
 *  encyclopedia articles, and ensures the articles table exists. */
public class ArticleDbProvider {
    private final Connection connection;

    /**
     * @param filename path of the sqlite database file to open or create
     * @throws SQLException if the database cannot be opened or the schema created
     */
    public ArticleDbProvider(Path filename) throws SQLException {
        String sqliteDbString = "jdbc:sqlite:" + filename.toString();
        connection = DriverManager.getConnection(sqliteDbString);

        try (var stmt = connection.createStatement()) {
            // Blob columns hold zstd-compressed JSON; see ArticleCodec
            stmt.executeUpdate("""
                    CREATE TABLE IF NOT EXISTS articles (
                        url TEXT PRIMARY KEY,
                        title TEXT NOT NULL,
                        summary TEXT NOT NULL,
                        html BLOB NOT NULL,
                        urls BLOB NOT NULL,
                        disambigs BLOB NOT NULL
                        )
                    """);

        }
    }

    /** Exposes the shared connection; it is owned and kept open by this provider. */
    public Connection getConnection() {
        return connection;
    }
}
|
@ -0,0 +1,102 @@
|
||||
package nu.marginalia.encyclopedia.store;

import nu.marginalia.encyclopedia.cleaner.model.ArticleData;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.sql.Connection;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.TimeUnit;

/** Asynchronously batches {@link ArticleData} inserts into the articles table.
 * <p>
 * Producers call {@link #add(ArticleData)}, which enqueues onto a bounded
 * queue (providing back-pressure); a dedicated thread drains the queue and
 * writes batches inside explicit transactions.  Not safe to use after
 * {@link #close()}.
 */
public class ArticleStoreWriter implements AutoCloseable {
    private final Logger logger = LoggerFactory.getLogger(getClass());
    private final Connection connection;

    // Bounded so a slow database applies back-pressure to the producer
    private final LinkedBlockingQueue<ArticleData> queue = new LinkedBlockingQueue<>(1000);

    Thread insertThread;
    volatile boolean running;

    public ArticleStoreWriter(ArticleDbProvider dbProvider) throws SQLException {
        connection = dbProvider.getConnection();

        // Trade durability for insert speed; acceptable since the database
        // can be regenerated from the source data if a crash corrupts it
        try (var stmt = connection.createStatement()) {
            stmt.execute("PRAGMA synchronous = OFF");
            stmt.execute("PRAGMA journal_mode = MEMORY");
        }

        running = true;
        insertThread = new Thread(this::insertLoop);
        insertThread.start();
    }

    /** Drains the queue in batches of up to 100 until close() is requested
     * and the queue is empty. */
    private void insertLoop() {
        List<ArticleData> toAdd = new ArrayList<>();
        while (running || !queue.isEmpty()) {
            try {
                while (0 != queue.drainTo(toAdd, 100)) {
                    insertItems(toAdd);
                    toAdd.clear();
                }
                if (queue.isEmpty()) {
                    // Yield for a moment to avoid busy looping
                    TimeUnit.NANOSECONDS.sleep(100);
                }
            } catch (SQLException e) {
                // Log through the class logger (was printStackTrace(), which
                // bypasses log routing); keep draining subsequent batches
                logger.warn("SQL error in insert loop", e);
            } catch (InterruptedException e) {
                // Restore interrupt status and stop; throwing would just kill
                // the thread with an uncaught exception anyway
                Thread.currentThread().interrupt();
                break;
            }
        }
    }

    /** Writes one batch inside a single transaction; rolls back on error. */
    private void insertItems(List<ArticleData> toAdd) throws SQLException {
        try (var stmt = connection.prepareStatement("""
                INSERT OR IGNORE INTO articles (url, title, html, summary, urls, disambigs)
                VALUES (?, ?, ?, ?, ?, ?)
                """))
        {
            connection.setAutoCommit(false); // Disable auto-commit mode
            for (var article : toAdd) {
                stmt.setString(1, article.url());
                stmt.setString(2, article.title());
                stmt.setBytes(3, article.parts());
                stmt.setString(4, article.summary());
                stmt.setBytes(5, article.links());
                stmt.setBytes(6, article.disambigs());

                stmt.addBatch();
            }
            stmt.executeBatch();
            connection.commit(); // Commit the transaction
        } catch (SQLException e) {
            connection.rollback(); // Rollback the transaction in case of error
            logger.warn("SQL error", e);
        } finally {
            connection.setAutoCommit(true); // Re-enable auto-commit mode
        }
    }

    /** Enqueues an article for insertion, blocking if the queue is full.
     *
     * @throws RuntimeException if the calling thread is interrupted while
     *                          waiting for queue capacity
     */
    public void add(ArticleData article) {
        try {
            queue.put(article);
        }
        catch (InterruptedException e) {
            Thread.currentThread().interrupt(); // preserve interrupt status
            logger.warn("Interrupted", e);
            throw new RuntimeException(e);
        }
    }

    /** Signals the writer thread to finish draining the queue, then waits
     * for it and closes the connection. */
    public void close() {
        running = false;
        try {
            insertThread.join();
            connection.close();
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt(); // preserve interrupt status
            logger.warn("Interrupted while closing", e);
        } catch (SQLException e) {
            logger.warn("Error closing connection", e);
        }
    }

}
|
@ -221,7 +221,7 @@ public class ZIMReader {
|
||||
|
||||
// Gives the minimum required information needed for the given articleName
|
||||
public DirectoryEntry forEachArticles(BiConsumer<String, String> consumer, Predicate<Integer> blobPred)
|
||||
throws IOException {
|
||||
throws IOException, InterruptedException {
|
||||
|
||||
int numberOfArticles = mFile.getArticleCount();
|
||||
long beg = mFile.getTitlePtrPos();
|
||||
@ -237,6 +237,10 @@ public class ZIMReader {
|
||||
for (long i = beg; i < end; i+=4) {
|
||||
var entry = getDirectoryInfoAtTitlePosition(i);
|
||||
|
||||
if (Thread.interrupted()) {
|
||||
throw new InterruptedException();
|
||||
}
|
||||
|
||||
if (((i-beg)%100_000) == 0) {
|
||||
System.out.printf("%f%%\n", ((i-beg) * 100.) / (end-beg));
|
||||
}
|
||||
@ -249,21 +253,25 @@ public class ZIMReader {
|
||||
|
||||
System.out.println("Iterating over " + data.keySet().stream().mapToInt(Integer::intValue).max() + "clusters");
|
||||
|
||||
data.forEach((pos,blobs) -> {
|
||||
if (!blobPred.test(pos)) {
|
||||
return;
|
||||
}
|
||||
var iter = data.entrySet().iterator();
|
||||
while (iter.hasNext()) {
|
||||
if (Thread.interrupted()) throw new InterruptedException();
|
||||
|
||||
var next = iter.next();
|
||||
int pos = next.getKey();
|
||||
|
||||
if (!blobPred.test(pos)) continue;
|
||||
Map<Integer, String> blobs = next.getValue();
|
||||
|
||||
try {
|
||||
getArticleData(consumer, pos, blobs);
|
||||
}
|
||||
catch (Exception ex) {
|
||||
ex.printStackTrace();
|
||||
throw new RuntimeException(ex);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
return null;
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user