(converter, stackexchange-xml) Add the ability to sideload stackexchange data

2023-09-21 12:48:33 +02:00 · 2023-09-21 12:48:33 +02:00 · 70aa04c047
commit 70aa04c047
parent 4aa47e87f2
13 changed files with 290 additions and 307 deletions
--- a/code/features-convert/stackexchange-xml/src/main/java/nu/marginalia/integration/stackexchange/sqlite/StackExchangePostsDb.java
+++ b/code/features-convert/stackexchange-xml/src/main/java/nu/marginalia/integration/stackexchange/sqlite/StackExchangePostsDb.java
@ -155,6 +155,8 @@ public class StackExchangePostsDb {
                String title = "";
                int year = 2023;

+                String tags = "";
+
                List<Future<String>> partWork = new ArrayList<>();
                var commonPool = ForkJoinPool.commonPool();
                while (rs.next()) {
@ -162,6 +164,11 @@ public class StackExchangePostsDb {

                    if (maybeTitle != null && !maybeTitle.isBlank())
                        title = maybeTitle;
+
+                    String maybeTags = rs.getString("tags");
+                    if (maybeTags != null && !maybeTags.isBlank())
+                        tags = maybeTags;
+
                    int origSize = rs.getInt("origSize");

                    year = Math.min(year, rs.getInt("postYear"));
@ -177,7 +184,7 @@ public class StackExchangePostsDb {
                    parts.add(workItem.get());
                }

-                if (!consumer.test(new CombinedPostModel(ordinal++, threadId, title, year, parts)))
+                if (!consumer.test(new CombinedPostModel(ordinal++, threadId, title, year, parts, tags)))
                    break;
            }

@ -188,11 +195,27 @@ public class StackExchangePostsDb {

    }

+    /**
+     * Reads the domain name recorded in the {@code metadata} table of a
+     * stackexchange sqlite database.
+     *
+     * @param pathToDbFile path of the sqlite db file to open
+     * @return the {@code domainName} column of the first row in {@code metadata}
+     * @throws SQLException if the database cannot be opened or queried
+     * @throws IllegalArgumentException if the {@code metadata} table has no rows
+     */
+    public static String getDomainName(Path pathToDbFile) throws SQLException {
+        String connStr = "jdbc:sqlite:" + pathToDbFile;
+
+        // Connection, statement and result set are all scoped to the try block,
+        // so each is closed on every exit path, including the throw below
+        try (var connection = DriverManager.getConnection(connStr);
+             var stmt = connection.prepareStatement("SELECT domainName FROM metadata");
+             var rs = stmt.executeQuery()
+        ) {
+            if (!rs.next()) {
+                throw new IllegalArgumentException("No metadata in db file " + pathToDbFile);
+            }
+            return rs.getString(1);
+        }
+    }
+
    public record CombinedPostModel(int ordinal,
                                    int threadId,
                                    String title,
                                    int year,
-                                    List<String> bodies)
+                                    List<String> bodies,
+                                    String tags)
    { }

 }
--- a/code/processes/converting-process/build.gradle
+++ b/code/processes/converting-process/build.gradle
@ -53,6 +53,7 @@ dependencies {
    implementation project(':code:features-convert:pubdate')
    implementation project(':code:features-convert:keyword-extraction')
    implementation project(':code:features-convert:summary-extraction')
+    implementation project(':code:features-convert:stackexchange-xml')

    implementation project(':code:features-crawl:crawl-blocklist')
    implementation project(':code:features-crawl:link-parser')
--- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java
+++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java
@ -25,6 +25,7 @@ import nu.marginalia.converting.processor.DomainProcessor;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

+import java.nio.file.Files;
 import java.nio.file.Path;
 import java.sql.SQLException;
 import java.util.Collection;
@ -86,12 +87,19 @@ public class ConverterMain {

    public void convert(Collection<? extends SideloadSource> sideloadSources, Path writeDir) throws Exception {
        try (var writer = new ConverterBatchWriter(writeDir, 0);
+             var taskHeartbeat = heartbeat.createAdHocTaskHeartbeat("Sideloading");
             BatchingWorkLog batchingWorkLog = new BatchingWorkLogImpl(writeDir.resolve("processor.log"))
        ) {
+
+            int i = 0;
            for (var sideloadSource : sideloadSources) {
                logger.info("Sideloading {}", sideloadSource.getDomain());
+
+                taskHeartbeat.progress(sideloadSource.getDomain().toString(), i++, sideloadSources.size());
+
                writer.write(sideloadSource);
            }
+            taskHeartbeat.progress("Finished", i, sideloadSources.size());

            // We write an empty log with just a finish marker for the sideloading action
            batchingWorkLog.logFinishedBatch();
@ -242,9 +250,8 @@ public class ConverterMain {
            }
            case SideloadStackexchange -> {
                var processData = fileStorageService.getStorage(request.processedDataStorage);
-                var domainName = filePath.toFile().getName().substring(0, filePath.toFile().getName().lastIndexOf('.'));

-                yield new SideloadAction(sideloadSourceFactory.sideloadStackexchange(filePath, domainName),
+                yield new SideloadAction(sideloadSourceFactory.sideloadStackexchange(filePath),
                        processData.asPath(),
                        msg, inbox);
            }
--- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloadSourceFactory.java
+++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloadSourceFactory.java
@ -9,6 +9,7 @@ import nu.marginalia.keyword.DocumentKeywordExtractor;
 import nu.marginalia.language.sentence.SentenceExtractor;

 import java.io.IOException;
+import java.nio.file.Files;
 import java.nio.file.Path;
 import java.sql.SQLException;
 import java.util.Collection;
@ -42,8 +43,13 @@ public class SideloadSourceFactory {
    }

    /** Creates one sideload source for each .db file found beneath the given directory */
-    @Deprecated()
-    public SideloadSource sideloadStackexchange(Path pathTo7zFile, String domainName) {
-        return new StackexchangeSideloader(pathTo7zFile, domainName, sentenceExtractor, documentKeywordExtractor);
+    public Collection<? extends SideloadSource> sideloadStackexchange(Path pathToDbFileRoot) throws IOException {
+        // The walk stream must be closed, so the sideloaders are collected
+        // into a list inside the try block rather than returned lazily
+        try (var dirs = Files.walk(pathToDbFileRoot)) {
+            return dirs
+                .filter(Files::isRegularFile)
+                .filter(f -> f.getFileName().toString().endsWith(".db"))
+                .map(dbFile -> new StackexchangeSideloader(dbFile, sentenceExtractor, documentKeywordExtractor))
+                .toList();
+        }
    }
 }
--- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/stackexchange/StackExchange7zReader.java
+++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/stackexchange/StackExchange7zReader.java
@ -1,229 +0,0 @@
-package nu.marginalia.converting.sideload.stackexchange;
-
-import lombok.SneakyThrows;
-import org.apache.commons.compress.archivers.sevenz.SevenZArchiveEntry;
-import org.apache.commons.compress.archivers.sevenz.SevenZFile;
-
-import javax.xml.namespace.QName;
-import javax.xml.stream.XMLInputFactory;
-import javax.xml.stream.XMLStreamException;
-import java.io.IOException;
-import java.nio.file.Path;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Iterator;
-import java.util.List;
-import java.util.stream.Collectors;
-
-@Deprecated
-public class StackExchange7zReader {
-    private final Path pathTo7zFile;
-
-    public StackExchange7zReader(Path pathTo7zFile) {
-        this.pathTo7zFile = pathTo7zFile;
-    }
-
-    public List<String> getIds() throws IOException, XMLStreamException {
-        try (SevenZFile file = new SevenZFile(pathTo7zFile.toFile())) {
-            for (SevenZArchiveEntry entry : file.getEntries()) {
-                if ("Posts.xml".equals(entry.getName())) {
-                    return getIds(file, entry);
-                }
-            }
-        }
-        return List.of();
-    }
-
-
-    private List<String> getIds(SevenZFile file, SevenZArchiveEntry entry) throws IOException, XMLStreamException {
-        List<String> ids = new ArrayList<>(10000);
-
-        XMLInputFactory xmlInputFactory = XMLInputFactory.newInstance();
-        var idField = new QName("Id");
-
-        try (var inputStream = file.getInputStream(entry)) {
-
-            var xmlReader = xmlInputFactory.createXMLEventReader(inputStream);
-
-            while (xmlReader.hasNext()) {
-                var event = xmlReader.nextEvent();
-                if (!event.isStartElement()) continue;
-
-                var startEvent = event.asStartElement();
-                if (!"row".equals(startEvent.getName().getLocalPart())) continue;
-
-                var fieldValue = startEvent.getAttributeByName(idField);
-                if (fieldValue != null) {
-                    ids.add(fieldValue.getValue());
-                }
-            }
-        }
-
-        return ids;
-    }
-
-    public Iterator<Post> postIterator() throws IOException, XMLStreamException {
-        SevenZFile postsFile = new SevenZFile(pathTo7zFile.toFile());
-        SevenZFile commentsFile = new SevenZFile(pathTo7zFile.toFile());
-
-        SevenZArchiveEntry postsEntry = null;
-        SevenZArchiveEntry commentsEntry = null;
-
-        for (SevenZArchiveEntry entry : postsFile.getEntries()) {
-            if ("Posts.xml".equals(entry.getName())) {
-                postsEntry = entry;
-                break;
-            }
-        }
-
-        for (SevenZArchiveEntry entry : commentsFile.getEntries()) {
-            if ("Comments.xml".equals(entry.getName())) {
-                commentsEntry = entry;
-                break;
-            }
-        }
-
-        if (postsEntry == null || commentsEntry == null) {
-            postsFile.close();
-            commentsFile.close();
-
-            throw new IOException("Posts.xml or Comments.xml not found in 7z file");
-        }
-
-        var postsInputStream = postsFile.getInputStream(postsEntry);
-        var commentsInputStream = commentsFile.getInputStream(commentsEntry);
-
-        XMLInputFactory xmlInputFactory = XMLInputFactory.newInstance();
-
-        var postsXmlReader = xmlInputFactory.createXMLEventReader(postsInputStream);
-        var commentsXmlReader = xmlInputFactory.createXMLEventReader(commentsInputStream);
-
-        QName titleName = new QName("Title");
-        QName idName = new QName("Id");
-        QName bodyName = new QName("Body");
-        QName tagsName = new QName("Tags");
-        QName creationDateName = new QName("CreationDate");
-        QName score = new QName("Score");
-
-        QName postIdName = new QName("PostId");
-        QName textName = new QName("Text");
-
-        return new Iterator<>() {
-            Post next = null;
-            Comment nextComment = null;
-
-            @SneakyThrows
-            @Override
-            public boolean hasNext() {
-                if (next != null)
-                    return true;
-
-                while (postsXmlReader.hasNext()) {
-                    var event = postsXmlReader.nextEvent();
-                    if (!event.isStartElement()) continue;
-
-                    var startEvent = event.asStartElement();
-                    if (!"row".equals(startEvent.getName().getLocalPart())) continue;
-
-                    var scoreAttribute = startEvent.getAttributeByName(score);
-                    if (scoreAttribute == null) continue;
-                    int score = Integer.parseInt(scoreAttribute.getValue());
-                    if (score < 1) continue;
-
-                    var titleAttribute = startEvent.getAttributeByName(titleName);
-                    if (titleAttribute == null) continue;
-                    String title = titleAttribute.getValue();
-
-                    var idAttribute = startEvent.getAttributeByName(idName);
-                    if (idAttribute == null) continue;
-                    int id = Integer.parseInt(idAttribute.getValue());
-
-                    var bodyAttribute = startEvent.getAttributeByName(bodyName);
-                    if (bodyAttribute == null) continue;
-                    String body = bodyAttribute.getValue();
-
-                    var tagsAttribute = startEvent.getAttributeByName(tagsName);
-                    if (tagsAttribute == null) continue;
-                    String tags = tagsAttribute.getValue();
-                    List<String> tagsParsed = parseTags(tags);
-                    var creationDateAttribute = startEvent.getAttributeByName(creationDateName);
-                    if (creationDateAttribute == null) continue;
-                    String creationDate = creationDateAttribute.getValue();
-                    int year = Integer.parseInt(creationDate.substring(0, 4));
-
-                    List<Comment> comments = new ArrayList<>();
-                    do {
-                        if (nextComment == null) continue;
-
-                        if (nextComment.postId > id) {
-                            break;
-                        }
-                        if (nextComment.postId == id) {
-                            comments.add(nextComment);
-                            nextComment = null;
-                        }
-                    }
-                    while (readNextComment());
-
-                    next = new Post(title, tagsParsed, year, id, body, comments);
-                    return true;
-                }
-
-                postsInputStream.close();
-                commentsInputStream.close();
-                postsFile.close();
-                commentsFile.close();
-
-                return false;
-            }
-
-            private boolean readNextComment() throws XMLStreamException {
-                while (commentsXmlReader.hasNext()) {
-                    var event = commentsXmlReader.nextEvent();
-                    if (!event.isStartElement()) continue;
-
-                    var startEvent = event.asStartElement();
-                    if (!"row".equals(startEvent.getName().getLocalPart())) continue;
-
-                    var postIdAttribute = startEvent.getAttributeByName(postIdName);
-                    if (postIdAttribute == null) continue;
-                    int postId = Integer.parseInt(postIdAttribute.getValue());
-
-                    var textAttribute = startEvent.getAttributeByName(textName);
-                    if (textAttribute == null) continue;
-                    String text = textAttribute.getValue();
-
-                    nextComment = new Comment(postId, text);
-                    return true;
-                }
-                return false;
-            }
-
-            @Override
-            public Post next() {
-                if (hasNext()) {
-                    var ret = next;
-                    next = null;
-                    return ret;
-                }
-
-                throw new IllegalStateException("No more posts");
-            }
-        };
-    }
-
-    private List<String> parseTags(String tags) {
-        return Arrays.stream(tags.split("<|>"))
-                .filter(s -> !s.isBlank())
-                .collect(Collectors.toList());
-    }
-
-
-    public record Post(String title, List<String> tags, int year, int id, String body, List<Comment> comments) {
-
-    }
-
-    public record Comment(int postId, String text) {
-
-    }
-}
--- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/stackexchange/StackexchangeSideloader.java
+++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/stackexchange/StackexchangeSideloader.java
@ -3,6 +3,7 @@ package nu.marginalia.converting.sideload.stackexchange;
 import lombok.SneakyThrows;
 import nu.marginalia.converting.model.*;
 import nu.marginalia.converting.sideload.SideloadSource;
+import nu.marginalia.integration.stackexchange.sqlite.StackExchangePostsDb;
 import nu.marginalia.keyword.DocumentKeywordExtractor;
 import nu.marginalia.language.sentence.SentenceExtractor;
 import nu.marginalia.model.EdgeDomain;
@ -15,31 +16,32 @@ import nu.marginalia.model.html.HtmlStandard;
 import nu.marginalia.model.idx.DocumentFlags;
 import nu.marginalia.model.idx.DocumentMetadata;
 import nu.marginalia.model.idx.WordFlags;
+import nu.marginalia.util.SimpleBlockingThreadPool;
 import org.apache.commons.lang3.StringUtils;
 import org.jsoup.Jsoup;

-import javax.xml.stream.XMLStreamException;
-import java.io.IOException;
 import java.nio.file.Path;
+import java.util.Arrays;
 import java.util.EnumSet;
 import java.util.Iterator;
 import java.util.List;
+import java.util.concurrent.ArrayBlockingQueue;
+import java.util.concurrent.TimeUnit;

-/** This code is broken */
-@Deprecated()
 public class StackexchangeSideloader implements SideloadSource {
-    private final StackExchange7zReader reader;
    private final SentenceExtractor sentenceExtractor;
    private final DocumentKeywordExtractor keywordExtractor;
    private final String domainName;

-    public StackexchangeSideloader(Path pathTo7zFile,
-                                   String domainName,
+    private final Path dbFile;
+
+    @SneakyThrows
+    public StackexchangeSideloader(Path pathToDbFile,
                                   SentenceExtractor sentenceExtractor,
                                   DocumentKeywordExtractor keywordExtractor
    ) {
-        this.domainName = domainName;
-        reader = new StackExchange7zReader(pathTo7zFile);
+        this.dbFile = pathToDbFile;
+        this.domainName = StackExchangePostsDb.getDomainName(pathToDbFile);
        this.sentenceExtractor = sentenceExtractor;
        this.keywordExtractor = keywordExtractor;
    }
@ -57,36 +59,48 @@ public class StackexchangeSideloader implements SideloadSource {

    @Override
    public Iterator<ProcessedDocument> getDocumentsStream() {
-        try {
-            var baseIter = reader.postIterator();
-            return new Iterator<>() {

-                @Override
-                public boolean hasNext() {
-                    return baseIter.hasNext();
+        var postsReader = new PostsReader();
+        Thread readerThread = new Thread(postsReader);
+        readerThread.setDaemon(true);
+        readerThread.start();
+
+        return new Iterator<>() {
+
+            ProcessedDocument nextModel = null;
+
+            @SneakyThrows
+            @Override
+            public boolean hasNext() {
+                if (nextModel != null)
+                    return true;
+                nextModel = postsReader.next();
+
+                return nextModel != null;
+            }
+
+            @Override
+            public ProcessedDocument next() {
+                if (hasNext()) {
+                    var ret = nextModel;
+                    nextModel = null;
+                    return ret;
                }

-                @Override
-                public ProcessedDocument next() {
-                    return convert(baseIter.next());
-                }
-            };
-        } catch (IOException e) {
-            throw new RuntimeException(e);
-        } catch (XMLStreamException e) {
-            throw new RuntimeException(e);
-        }
+                throw new IllegalStateException();
+            }
+        };
    }

    @SneakyThrows
-    private ProcessedDocument convert(StackExchange7zReader.Post post) {
-        String fullUrl = "https://" + domainName + "/questions/" + post.id();
+    private ProcessedDocument convert(StackExchangePostsDb.CombinedPostModel post) {
+        String fullUrl = "https://" + domainName + "/questions/" + post.threadId();

        StringBuilder fullHtml = new StringBuilder();
        fullHtml.append("<!DOCTYPE html><html><head><title>").append(post.title()).append("</title></head><body>");
        fullHtml.append("<p>").append(post.title()).append("</p>");
-        for (var comment : post.comments()) {
-            fullHtml.append("<p>").append(comment.text()).append("</p>");
+        for (var comment : post.bodies()) {
+            fullHtml.append("<p>").append(comment).append("</p>");
        }
        fullHtml.append("</body></html>");

@ -99,10 +113,17 @@ public class StackexchangeSideloader implements SideloadSource {

            ret.url = url;
            ret.words = keywordExtractor.extractKeywords(dld, url);
-            ret.words.addJustNoMeta("site:"+domainName);
-            ret.words.addJustNoMeta("site:"+url.domain.domain);
-            ret.words.addJustNoMeta(url.domain.domain);
-            ret.words.setFlagOnMetadataForWords(WordFlags.Subjects, post.tags());
+            ret.words.addAllSyntheticTerms(List.of(
+                    "site:" + domainName,
+                    "site:" + url.domain.domain,
+                    url.domain.domain
+            ));
+
+            if (!post.tags().isBlank()) {
+                List<String> subjects = Arrays.asList(post.tags().split(","));
+                ret.words.setFlagOnMetadataForWords(WordFlags.Subjects, subjects);
+            }
+
            ret.details = new ProcessedDocumentDetails();
            ret.details.pubYear = post.year();
            ret.details.quality = 5;
@ -129,4 +150,44 @@ public class StackexchangeSideloader implements SideloadSource {

        return ret;
    }
+
+    /**
+     * Reads posts from the db file, converts them on a worker pool, and hands
+     * the converted documents to the consumer through a bounded queue.
+     * <p>
+     * {@link #run()} is the producer side; {@link #next()} is the consumer side.
+     */
+    class PostsReader implements Runnable {
+        private final ArrayBlockingQueue<ProcessedDocument> results = new ArrayBlockingQueue<>(16);
+        private final SimpleBlockingThreadPool pool = new SimpleBlockingThreadPool("Sideloading Stackexchange", 16, 4);
+        volatile boolean isRunning = true;
+
+        /** Feeds every post in the db file to {@link #enqueue}, then shuts the pool down. */
+        @Override
+        public void run() {
+            try {
+                StackExchangePostsDb.forEachPost(dbFile, this::enqueue);
+            }
+            finally {
+                // Runs even if reading the db fails, so that next() can
+                // eventually observe isFinished() and stop polling
+                isRunning = false;
+                pool.shutDown();
+            }
+        }
+
+        /**
+         * Submits conversion of a single post to the worker pool; the worker
+         * puts the converted document on the results queue.
+         *
+         * @return always {@code true}
+         */
+        @SneakyThrows
+        private boolean enqueue(StackExchangePostsDb.CombinedPostModel model) {
+            pool.submit(() -> results.put(convert(model)));
+
+            return true;
+        }
+
+        /**
+         * Waits for the next converted document.
+         *
+         * @return the next document, or {@code null} once {@link #isFinished()} holds
+         * @throws InterruptedException if interrupted while waiting on the queue
+         */
+        public ProcessedDocument next() throws InterruptedException {
+            do {
+                // Poll with a timeout rather than take(), so the finished
+                // condition is re-checked periodically
+                var next = results.poll(1, TimeUnit.SECONDS);
+                if (next != null) {
+                    return next;
+                }
+            } while (!isFinished());
+
+            return null;
+        }
+
+        /**
+         * @return {@code true} when reading has stopped, the pool reports
+         *         terminated, and the results queue is empty
+         */
+        public boolean isFinished() {
+            // Order matters: the pool must be seen as terminated *before* the
+            // queue is seen as empty.  Checked the other way around, a worker
+            // could put its last document and the pool terminate between the
+            // two checks, and that document would be silently dropped.
+            // NOTE(review): assumes isTerminated() is only true once no submitted
+            // task can still run -- confirm against SimpleBlockingThreadPool
+            return !isRunning &&
+                    pool.isTerminated() &&
+                    results.isEmpty();
+        }
+    }
 }
--- a/code/processes/converting-process/src/test/java/nu/marginalia/converting/sideload/StackexchangeSideloaderTest.java
+++ b/code/processes/converting-process/src/test/java/nu/marginalia/converting/sideload/StackexchangeSideloaderTest.java
@ -1,24 +0,0 @@
-package nu.marginalia.converting.sideload;
-
-import nu.marginalia.converting.sideload.stackexchange.StackExchange7zReader;
-import org.junit.jupiter.api.Disabled;
-import org.junit.jupiter.api.Test;
-
-import javax.xml.stream.XMLStreamException;
-import java.io.IOException;
-import java.nio.file.Path;
-
-class StackexchangeSideloaderTest {
-    @Test
-    @Disabled
-    public void test7zFile() throws IOException, XMLStreamException {
-        var stackExchangeReader = new StackExchange7zReader(Path.of("/mnt/storage/stackexchange/scifi.meta.stackexchange.com.7z"));
-
-        System.out.println(stackExchangeReader.getIds());
-
-        var iter = stackExchangeReader.postIterator();
-        while (iter.hasNext()) {
-            System.out.println(iter.next());
-        }
-    }
-}
--- a/code/services-core/control-service/src/main/java/nu/marginalia/control/ControlService.java
+++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/ControlService.java
@ -190,6 +190,7 @@ public class ControlService extends Service {
        Spark.post("/public/actions/truncate-links-database", controlActionsService::truncateLinkDatabase, redirectToActors);
        Spark.post("/public/actions/sideload-encyclopedia", controlActionsService::sideloadEncyclopedia, redirectToActors);
        Spark.post("/public/actions/sideload-dirtree", controlActionsService::sideloadDirtree, redirectToActors);
+        Spark.post("/public/actions/sideload-stackexchange", controlActionsService::sideloadStackexchange, redirectToActors);

        // Review Random Domains
        Spark.get("/public/review-random-domains", this::reviewRandomDomainsModel, reviewRandomDomainsRenderer::render);
--- a/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/ControlActionsService.java
+++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/ControlActionsService.java
@ -129,6 +129,21 @@ public class ControlActionsService {
        return "";
    }

+    /**
+     * Handles the "Sideload Stackexchange" action: validates the {@code source}
+     * query parameter, logs the user action and starts the convert actor
+     * in its stackexchange state.
+     *
+     * @param request  must carry a {@code source} parameter naming an existing path
+     * @param response unused
+     * @return an empty body on success
+     */
+    public Object sideloadStackexchange(Request request, Response response) throws Exception {
+
+        String source = request.queryParams("source");
+        if (source == null || source.isBlank()) {
+            // Path.of(null) throws, and Path.of("") is the empty path, which
+            // resolves to the working directory and passes the existence check
+            // below; reject both instead of starting a conversion by accident
+            Spark.halt(400);
+            return "Missing parameter 'source'";
+        }
+
+        Path sourcePath = Path.of(source);
+        if (!Files.exists(sourcePath)) {
+            Spark.halt(404);
+            return "No such file " + sourcePath;
+        }
+
+        eventLog.logEvent("USER-ACTION", "SIDELOAD STACKEXCHANGE");
+
+        actors.startFrom(Actor.CONVERT, ConvertActor.CONVERT_STACKEXCHANGE, sourcePath.toString());
+
+        return "";
+    }
+
+
    public Object triggerRepartition(Request request, Response response) throws Exception {
        indexClient.outbox().sendAsync(IndexMqEndpoints.INDEX_REPARTITION, "");

--- a/code/services-core/control-service/src/main/resources/templates/control/actions.hdb
+++ b/code/services-core/control-service/src/main/resources/templates/control/actions.hdb
@ -54,6 +54,20 @@
                </form>
            </td>
        </tr>
+        <tr>
+            <td><b>Sideload Stackexchange</b><p>
+                Will load a set of pre-converted stackexchange .db files.
+            </td>
+            <td>
+                <form method="post" action="/actions/sideload-stackexchange" onsubmit="return confirm('Confirm sideloading')">
+                    <label for="source">Directory with .db files location on server</label><br>
+                    <input id="source" name="source" value="">
+                    <br><br>
+
+                    <input type="submit" value="Sideload Stackexchange">
+                </form>
+            </td>
+        </tr>
        <tr>
            <td>
                <b>Reload Blogs List</b>
--- a/doc/readme.md
+++ b/doc/readme.md
@ -6,6 +6,7 @@ Start in [📁 ../code/](../code/) and poke around.
 ## Operations

 * [System Properties](system-properties.md) - JVM property flags
+* [Sideloading How-To](sideloading-howto.md) - How to sideload various data sets

 ## Set-up

--- a/doc/sideloading-howto.md
+++ b/doc/sideloading-howto.md
@ -0,0 +1,120 @@
+# Sideloading How-To
+
+(This document is a bit of a draft to get this down in writing
+while it's still fresh in my head.)
+
+Some websites are much larger than others; these include
+Wikipedia, Stack Overflow, and a few others.  They are so
+large they are impractical to crawl in the traditional fashion,
+but luckily they make available data dumps that can be processed
+and loaded into the search engine through other means.
+
+## Sideloading a directory tree
+
+For relatively small websites, ad-hoc side-loading is available directly from a
+folder structure on the hard drive. This is intended for loading manuals, 
+documentation and similar data sets that are large and slowly changing.
+
+A website can be archived with wget, like this
+
+```bash
+UA="search.marginalia.nu" \
+DOMAIN="www.example.com" \
+wget -nc -x --continue -w 1 -r -U ${UA} -A "html" ${DOMAIN}
+```
+
+After doing this to a bunch of websites, create a YAML file something like this:
+
+```yaml
+sources:
+- name: jdk-20
+  dir: "jdk-20/"
+  domainName: "docs.oracle.com"
+  baseUrl: "https://docs.oracle.com/en/java/javase/20/docs"
+  keywords:
+  - "java"
+  - "docs"
+  - "documentation"
+  - "javadoc"
+- name: python3
+  dir: "python-3.11.5/"
+  domainName: "docs.python.org"
+  baseUrl: "https://docs.python.org/3/"
+  keywords:
+  - "python"
+  - "docs"
+  - "documentation"
+- name: mariadb.com
+  dir: "mariadb.com/"
+  domainName: "mariadb.com"
+  baseUrl: "https://mariadb.com/"
+  keywords:
+  - "sql"
+  - "docs"
+  - "mariadb"
+  - "mysql"
+```
+
+|parameter|description|
+|----|----|
+|name|Purely informative|
+|dir|Path of website contents relative to the location of the yaml file|
+|domainName|The domain name of the website|
+|baseUrl|This URL will be prefixed to the contents of `dir`|
+|keywords|These supplemental keywords will be injected in each document|
+
+The directory structure corresponding to the above might look like
+
+```
+docs-index.yaml
+jdk-20/
+jdk-20/resources/
+jdk-20/api/
+jdk-20/api/[...]
+jdk-20/specs/
+jdk-20/specs/[...]
+jdk-20/index.html
+mariadb.com
+mariadb.com/kb/
+mariadb.com/kb/[...]
+python-3.11.5
+python-3.11.5/genindex-B.html
+python-3.11.5/library/
+python-3.11.5/distutils/
+python-3.11.5/[...]
+[...]
+```
+
+This yaml-file can be processed and loaded into the search engine through the
+Actions view.
+
+## Sideloading Wikipedia
+
+For now, this workflow depends on using the conversion process from
+[https://encyclopedia.marginalia.nu/](https://encyclopedia.marginalia.nu/)
+to pre-digest the data.  This is because it uses OpenZIM which has a
+license that is incompatible with this project.
+
+Build the [encyclopedia.marginalia.nu Code](https://github.com/MarginaliaSearch/encyclopedia.marginalia.nu)
+and follow the instructions for downloading a ZIM file, and then run something like
+
+```$ ./encyclopedia convert file.zim articles.db```
+
+This db-file can be processed and loaded into the search engine through the
+Actions view.
+
+FIXME: It will currently only point to encyclopedia.marginalia.nu and not main Wikipedia;
+this should be made configurable.
+
+## Sideloading Stack Overflow/Stackexchange
+
+Stackexchange makes dumps available on Archive.org.  These are unfortunately in a format that
+needs some heavy-handed pre-processing before they can be loaded.  A tool is available for 
+this in [tools/stackexchange-converter](../code/tools/stackexchange-converter).
+
+After running `gradlew dist`, this tool is found in `build/dist/stackexchange-converter`.
+Follow the instructions in the stackexchange-converter readme to
+convert the stackexchange xml.7z-files to sqlite db-files.
+
+A directory with such db-files can be processed and loaded into the 
+search engine through the Actions view.
--- a/doc/system-properties.md
+++ b/doc/system-properties.md
@ -7,12 +7,6 @@ These are JVM system properties used by each service
 |-------------|------------|-------------------------------------------------------|
 | website-url |https://search.marginalia.nu/|Overrides the website URL used in rendering|

-## Index Service
-
-|flag| values     | description                                                   |
-|---|------------|---------------------------------------------------------------|
-|lexiconSizeHint| 1000000000 | The default size of the lexicon, speeds up start time in prod |
-
 ## Crawler Process
 |flag| values     | description                                           |
 |---|------------|-------------------------------------------------------|
@ -26,12 +20,5 @@ These are JVM system properties used by each service
 ## Loader Process
 |flag| values     | description                                           |
 |---|------------|-------------------------------------------------------|
-|lexiconSizeHint| 800000000  | The default size of the lexicon                       |
 |local-index-path| /some/path | Selects the location the loader will write index data |
-|crawl.rootDirRewrite|/some/path|Sets the base directory of a crawl plan |
-
-## Other
-
-|flag| values     | description                                 |
-|---|------------|---------------------------------------------|
-|bigstring.disabled| true/false | Disables transparent big string compression |
+|crawl.rootDirRewrite|/some/path|Sets the base directory of a crawl plan |