From 70aa04c047c3fe4bcef57c6e0cc2a1abf442d3ec Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Thu, 21 Sep 2023 12:48:33 +0200 Subject: [PATCH] (converter, stackexchange-xml) Add the ability to sideload stackexchange data --- .../sqlite/StackExchangePostsDb.java | 27 ++- .../processes/converting-process/build.gradle | 1 + .../marginalia/converting/ConverterMain.java | 11 +- .../sideload/SideloadSourceFactory.java | 12 +- .../stackexchange/StackExchange7zReader.java | 229 ------------------ .../StackexchangeSideloader.java | 127 +++++++--- .../sideload/StackexchangeSideloaderTest.java | 24 -- .../nu/marginalia/control/ControlService.java | 1 + .../control/svc/ControlActionsService.java | 15 ++ .../resources/templates/control/actions.hdb | 14 ++ doc/readme.md | 1 + doc/sideloading-howto.md | 120 +++++++++ doc/system-properties.md | 15 +- 13 files changed, 290 insertions(+), 307 deletions(-) delete mode 100644 code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/stackexchange/StackExchange7zReader.java delete mode 100644 code/processes/converting-process/src/test/java/nu/marginalia/converting/sideload/StackexchangeSideloaderTest.java create mode 100644 doc/sideloading-howto.md diff --git a/code/features-convert/stackexchange-xml/src/main/java/nu/marginalia/integration/stackexchange/sqlite/StackExchangePostsDb.java b/code/features-convert/stackexchange-xml/src/main/java/nu/marginalia/integration/stackexchange/sqlite/StackExchangePostsDb.java index bf78f5f1..e9bbc38f 100644 --- a/code/features-convert/stackexchange-xml/src/main/java/nu/marginalia/integration/stackexchange/sqlite/StackExchangePostsDb.java +++ b/code/features-convert/stackexchange-xml/src/main/java/nu/marginalia/integration/stackexchange/sqlite/StackExchangePostsDb.java @@ -155,6 +155,8 @@ public class StackExchangePostsDb { String title = ""; int year = 2023; + String tags = ""; + List> partWork = new ArrayList<>(); var commonPool = ForkJoinPool.commonPool(); while 
(rs.next()) { @@ -162,6 +164,11 @@ public class StackExchangePostsDb { if (maybeTitle != null && !maybeTitle.isBlank()) title = maybeTitle; + + String maybeTags = rs.getString("tags"); + if (maybeTags != null && !maybeTags.isBlank()) + tags = maybeTags; + int origSize = rs.getInt("origSize"); year = Math.min(year, rs.getInt("postYear")); @@ -177,7 +184,7 @@ public class StackExchangePostsDb { parts.add(workItem.get()); } - if (!consumer.test(new CombinedPostModel(ordinal++, threadId, title, year, parts))) + if (!consumer.test(new CombinedPostModel(ordinal++, threadId, title, year, parts, tags))) break; } @@ -188,11 +195,27 @@ public class StackExchangePostsDb { } + public static String getDomainName(Path pathToDbFile) throws SQLException { + String connStr = "jdbc:sqlite:" + pathToDbFile; + + try (var connection = DriverManager.getConnection(connStr); + var stmt = connection.prepareStatement("SELECT domainName FROM metadata") + ) { + var rs = stmt.executeQuery(); + if (rs.next()) { + return rs.getString(1); + } + throw new IllegalArgumentException("No metadata in db file " + pathToDbFile); + } + + } + public record CombinedPostModel(int ordinal, int threadId, String title, int year, - List bodies) + List bodies, + String tags) { } } diff --git a/code/processes/converting-process/build.gradle b/code/processes/converting-process/build.gradle index ebd8d5ed..ff8133ca 100644 --- a/code/processes/converting-process/build.gradle +++ b/code/processes/converting-process/build.gradle @@ -53,6 +53,7 @@ dependencies { implementation project(':code:features-convert:pubdate') implementation project(':code:features-convert:keyword-extraction') implementation project(':code:features-convert:summary-extraction') + implementation project(':code:features-convert:stackexchange-xml') implementation project(':code:features-crawl:crawl-blocklist') implementation project(':code:features-crawl:link-parser') diff --git 
a/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java index a5e3b35e..67b7c939 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java @@ -25,6 +25,7 @@ import nu.marginalia.converting.processor.DomainProcessor; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.nio.file.Files; import java.nio.file.Path; import java.sql.SQLException; import java.util.Collection; @@ -86,12 +87,19 @@ public class ConverterMain { public void convert(Collection sideloadSources, Path writeDir) throws Exception { try (var writer = new ConverterBatchWriter(writeDir, 0); + var taskHeartbeat = heartbeat.createAdHocTaskHeartbeat("Sideloading"); BatchingWorkLog batchingWorkLog = new BatchingWorkLogImpl(writeDir.resolve("processor.log")) ) { + + int i = 0; for (var sideloadSource : sideloadSources) { logger.info("Sideloading {}", sideloadSource.getDomain()); + + taskHeartbeat.progress(sideloadSource.getDomain().toString(), i++, sideloadSources.size()); + writer.write(sideloadSource); } + taskHeartbeat.progress("Finished", i, sideloadSources.size()); // We write an empty log with just a finish marker for the sideloading action batchingWorkLog.logFinishedBatch(); @@ -242,9 +250,8 @@ public class ConverterMain { } case SideloadStackexchange -> { var processData = fileStorageService.getStorage(request.processedDataStorage); - var domainName = filePath.toFile().getName().substring(0, filePath.toFile().getName().lastIndexOf('.')); - yield new SideloadAction(sideloadSourceFactory.sideloadStackexchange(filePath, domainName), + yield new SideloadAction(sideloadSourceFactory.sideloadStackexchange(filePath), processData.asPath(), msg, inbox); } diff --git 
a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloadSourceFactory.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloadSourceFactory.java index ca2832c4..18db740e 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloadSourceFactory.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloadSourceFactory.java @@ -9,6 +9,7 @@ import nu.marginalia.keyword.DocumentKeywordExtractor; import nu.marginalia.language.sentence.SentenceExtractor; import java.io.IOException; +import java.nio.file.Files; import java.nio.file.Path; import java.sql.SQLException; import java.util.Collection; @@ -42,8 +43,13 @@ public class SideloadSourceFactory { } /** Do not use, this code isn't finished */ - @Deprecated() - public SideloadSource sideloadStackexchange(Path pathTo7zFile, String domainName) { - return new StackexchangeSideloader(pathTo7zFile, domainName, sentenceExtractor, documentKeywordExtractor); + public Collection sideloadStackexchange(Path pathToDbFileRoot) throws IOException { + try (var dirs = Files.walk(pathToDbFileRoot)) { + return dirs + .filter(Files::isRegularFile) + .filter(f -> f.toFile().getName().endsWith(".db")) + .map(dbFile -> new StackexchangeSideloader(dbFile, sentenceExtractor, documentKeywordExtractor)) + .toList(); + } } } diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/stackexchange/StackExchange7zReader.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/stackexchange/StackExchange7zReader.java deleted file mode 100644 index 9f899d2d..00000000 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/stackexchange/StackExchange7zReader.java +++ /dev/null @@ -1,229 +0,0 @@ -package nu.marginalia.converting.sideload.stackexchange; - -import lombok.SneakyThrows; -import 
org.apache.commons.compress.archivers.sevenz.SevenZArchiveEntry; -import org.apache.commons.compress.archivers.sevenz.SevenZFile; - -import javax.xml.namespace.QName; -import javax.xml.stream.XMLInputFactory; -import javax.xml.stream.XMLStreamException; -import java.io.IOException; -import java.nio.file.Path; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Iterator; -import java.util.List; -import java.util.stream.Collectors; - -@Deprecated -public class StackExchange7zReader { - private final Path pathTo7zFile; - - public StackExchange7zReader(Path pathTo7zFile) { - this.pathTo7zFile = pathTo7zFile; - } - - public List getIds() throws IOException, XMLStreamException { - try (SevenZFile file = new SevenZFile(pathTo7zFile.toFile())) { - for (SevenZArchiveEntry entry : file.getEntries()) { - if ("Posts.xml".equals(entry.getName())) { - return getIds(file, entry); - } - } - } - return List.of(); - } - - - private List getIds(SevenZFile file, SevenZArchiveEntry entry) throws IOException, XMLStreamException { - List ids = new ArrayList<>(10000); - - XMLInputFactory xmlInputFactory = XMLInputFactory.newInstance(); - var idField = new QName("Id"); - - try (var inputStream = file.getInputStream(entry)) { - - var xmlReader = xmlInputFactory.createXMLEventReader(inputStream); - - while (xmlReader.hasNext()) { - var event = xmlReader.nextEvent(); - if (!event.isStartElement()) continue; - - var startEvent = event.asStartElement(); - if (!"row".equals(startEvent.getName().getLocalPart())) continue; - - var fieldValue = startEvent.getAttributeByName(idField); - if (fieldValue != null) { - ids.add(fieldValue.getValue()); - } - } - } - - return ids; - } - - public Iterator postIterator() throws IOException, XMLStreamException { - SevenZFile postsFile = new SevenZFile(pathTo7zFile.toFile()); - SevenZFile commentsFile = new SevenZFile(pathTo7zFile.toFile()); - - SevenZArchiveEntry postsEntry = null; - SevenZArchiveEntry commentsEntry = null; - - for 
(SevenZArchiveEntry entry : postsFile.getEntries()) { - if ("Posts.xml".equals(entry.getName())) { - postsEntry = entry; - break; - } - } - - for (SevenZArchiveEntry entry : commentsFile.getEntries()) { - if ("Comments.xml".equals(entry.getName())) { - commentsEntry = entry; - break; - } - } - - if (postsEntry == null || commentsEntry == null) { - postsFile.close(); - commentsFile.close(); - - throw new IOException("Posts.xml or Comments.xml not found in 7z file"); - } - - var postsInputStream = postsFile.getInputStream(postsEntry); - var commentsInputStream = commentsFile.getInputStream(commentsEntry); - - XMLInputFactory xmlInputFactory = XMLInputFactory.newInstance(); - - var postsXmlReader = xmlInputFactory.createXMLEventReader(postsInputStream); - var commentsXmlReader = xmlInputFactory.createXMLEventReader(commentsInputStream); - - QName titleName = new QName("Title"); - QName idName = new QName("Id"); - QName bodyName = new QName("Body"); - QName tagsName = new QName("Tags"); - QName creationDateName = new QName("CreationDate"); - QName score = new QName("Score"); - - QName postIdName = new QName("PostId"); - QName textName = new QName("Text"); - - return new Iterator<>() { - Post next = null; - Comment nextComment = null; - - @SneakyThrows - @Override - public boolean hasNext() { - if (next != null) - return true; - - while (postsXmlReader.hasNext()) { - var event = postsXmlReader.nextEvent(); - if (!event.isStartElement()) continue; - - var startEvent = event.asStartElement(); - if (!"row".equals(startEvent.getName().getLocalPart())) continue; - - var scoreAttribute = startEvent.getAttributeByName(score); - if (scoreAttribute == null) continue; - int score = Integer.parseInt(scoreAttribute.getValue()); - if (score < 1) continue; - - var titleAttribute = startEvent.getAttributeByName(titleName); - if (titleAttribute == null) continue; - String title = titleAttribute.getValue(); - - var idAttribute = startEvent.getAttributeByName(idName); - if (idAttribute 
== null) continue; - int id = Integer.parseInt(idAttribute.getValue()); - - var bodyAttribute = startEvent.getAttributeByName(bodyName); - if (bodyAttribute == null) continue; - String body = bodyAttribute.getValue(); - - var tagsAttribute = startEvent.getAttributeByName(tagsName); - if (tagsAttribute == null) continue; - String tags = tagsAttribute.getValue(); - List tagsParsed = parseTags(tags); - var creationDateAttribute = startEvent.getAttributeByName(creationDateName); - if (creationDateAttribute == null) continue; - String creationDate = creationDateAttribute.getValue(); - int year = Integer.parseInt(creationDate.substring(0, 4)); - - List comments = new ArrayList<>(); - do { - if (nextComment == null) continue; - - if (nextComment.postId > id) { - break; - } - if (nextComment.postId == id) { - comments.add(nextComment); - nextComment = null; - } - } - while (readNextComment()); - - next = new Post(title, tagsParsed, year, id, body, comments); - return true; - } - - postsInputStream.close(); - commentsInputStream.close(); - postsFile.close(); - commentsFile.close(); - - return false; - } - - private boolean readNextComment() throws XMLStreamException { - while (commentsXmlReader.hasNext()) { - var event = commentsXmlReader.nextEvent(); - if (!event.isStartElement()) continue; - - var startEvent = event.asStartElement(); - if (!"row".equals(startEvent.getName().getLocalPart())) continue; - - var postIdAttribute = startEvent.getAttributeByName(postIdName); - if (postIdAttribute == null) continue; - int postId = Integer.parseInt(postIdAttribute.getValue()); - - var textAttribute = startEvent.getAttributeByName(textName); - if (textAttribute == null) continue; - String text = textAttribute.getValue(); - - nextComment = new Comment(postId, text); - return true; - } - return false; - } - - @Override - public Post next() { - if (hasNext()) { - var ret = next; - next = null; - return ret; - } - - throw new IllegalStateException("No more posts"); - } - }; - } - - 
private List parseTags(String tags) { - return Arrays.stream(tags.split("<|>")) - .filter(s -> !s.isBlank()) - .collect(Collectors.toList()); - } - - - public record Post(String title, List tags, int year, int id, String body, List comments) { - - } - - public record Comment(int postId, String text) { - - } -} diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/stackexchange/StackexchangeSideloader.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/stackexchange/StackexchangeSideloader.java index f5dd4351..a1b1ffac 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/stackexchange/StackexchangeSideloader.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/stackexchange/StackexchangeSideloader.java @@ -3,6 +3,7 @@ package nu.marginalia.converting.sideload.stackexchange; import lombok.SneakyThrows; import nu.marginalia.converting.model.*; import nu.marginalia.converting.sideload.SideloadSource; +import nu.marginalia.integration.stackexchange.sqlite.StackExchangePostsDb; import nu.marginalia.keyword.DocumentKeywordExtractor; import nu.marginalia.language.sentence.SentenceExtractor; import nu.marginalia.model.EdgeDomain; @@ -15,31 +16,32 @@ import nu.marginalia.model.html.HtmlStandard; import nu.marginalia.model.idx.DocumentFlags; import nu.marginalia.model.idx.DocumentMetadata; import nu.marginalia.model.idx.WordFlags; +import nu.marginalia.util.SimpleBlockingThreadPool; import org.apache.commons.lang3.StringUtils; import org.jsoup.Jsoup; -import javax.xml.stream.XMLStreamException; -import java.io.IOException; import java.nio.file.Path; +import java.util.Arrays; import java.util.EnumSet; import java.util.Iterator; import java.util.List; +import java.util.concurrent.ArrayBlockingQueue; +import java.util.concurrent.TimeUnit; -/** This code is broken */ -@Deprecated() public class StackexchangeSideloader 
implements SideloadSource { - private final StackExchange7zReader reader; private final SentenceExtractor sentenceExtractor; private final DocumentKeywordExtractor keywordExtractor; private final String domainName; - public StackexchangeSideloader(Path pathTo7zFile, - String domainName, + private final Path dbFile; + + @SneakyThrows + public StackexchangeSideloader(Path pathToDbFile, SentenceExtractor sentenceExtractor, DocumentKeywordExtractor keywordExtractor ) { - this.domainName = domainName; - reader = new StackExchange7zReader(pathTo7zFile); + this.dbFile = pathToDbFile; + this.domainName = StackExchangePostsDb.getDomainName(pathToDbFile); this.sentenceExtractor = sentenceExtractor; this.keywordExtractor = keywordExtractor; } @@ -57,36 +59,48 @@ public class StackexchangeSideloader implements SideloadSource { @Override public Iterator getDocumentsStream() { - try { - var baseIter = reader.postIterator(); - return new Iterator<>() { - @Override - public boolean hasNext() { - return baseIter.hasNext(); + var postsReader = new PostsReader(); + Thread readerThread = new Thread(postsReader); + readerThread.setDaemon(true); + readerThread.start(); + + return new Iterator<>() { + + ProcessedDocument nextModel = null; + + @SneakyThrows + @Override + public boolean hasNext() { + if (nextModel != null) + return true; + nextModel = postsReader.next(); + + return nextModel != null; + } + + @Override + public ProcessedDocument next() { + if (hasNext()) { + var ret = nextModel; + nextModel = null; + return ret; } - @Override - public ProcessedDocument next() { - return convert(baseIter.next()); - } - }; - } catch (IOException e) { - throw new RuntimeException(e); - } catch (XMLStreamException e) { - throw new RuntimeException(e); - } + throw new IllegalStateException(); + } + }; } @SneakyThrows - private ProcessedDocument convert(StackExchange7zReader.Post post) { - String fullUrl = "https://" + domainName + "/questions/" + post.id(); + private ProcessedDocument 
convert(StackExchangePostsDb.CombinedPostModel post) { + String fullUrl = "https://" + domainName + "/questions/" + post.threadId(); StringBuilder fullHtml = new StringBuilder(); fullHtml.append("").append(post.title()).append(""); fullHtml.append("

").append(post.title()).append("

"); - for (var comment : post.comments()) { - fullHtml.append("

").append(comment.text()).append("

"); + for (var comment : post.bodies()) { + fullHtml.append("

").append(comment).append("

"); } fullHtml.append(""); @@ -99,10 +113,17 @@ public class StackexchangeSideloader implements SideloadSource { ret.url = url; ret.words = keywordExtractor.extractKeywords(dld, url); - ret.words.addJustNoMeta("site:"+domainName); - ret.words.addJustNoMeta("site:"+url.domain.domain); - ret.words.addJustNoMeta(url.domain.domain); - ret.words.setFlagOnMetadataForWords(WordFlags.Subjects, post.tags()); + ret.words.addAllSyntheticTerms(List.of( + "site:" + domainName, + "site:" + url.domain.domain, + url.domain.domain + )); + + if (!post.tags().isBlank()) { + List subjects = Arrays.asList(post.tags().split(",")); + ret.words.setFlagOnMetadataForWords(WordFlags.Subjects, subjects); + } + ret.details = new ProcessedDocumentDetails(); ret.details.pubYear = post.year(); ret.details.quality = 5; @@ -129,4 +150,44 @@ public class StackexchangeSideloader implements SideloadSource { return ret; } + + class PostsReader implements Runnable { + private final ArrayBlockingQueue results = new ArrayBlockingQueue<>(16); + private final SimpleBlockingThreadPool pool = new SimpleBlockingThreadPool("Sideloading Stackexchange", 16, 4); + volatile boolean isRunning = true; + + public void run() { + try { + StackExchangePostsDb.forEachPost(dbFile, this::enqueue); + } + finally { + isRunning = false; + pool.shutDown(); + } + } + + @SneakyThrows + private boolean enqueue(StackExchangePostsDb.CombinedPostModel model) { + pool.submit(() -> results.put(convert(model))); + + return true; + } + + public ProcessedDocument next() throws InterruptedException { + do { + var next = results.poll(1, TimeUnit.SECONDS); + if (next != null) { + return next; + } + } while (!isFinished()); + + return null; + } + + public boolean isFinished() { + return !isRunning && + results.isEmpty() && + pool.isTerminated(); + } + } } diff --git a/code/processes/converting-process/src/test/java/nu/marginalia/converting/sideload/StackexchangeSideloaderTest.java 
b/code/processes/converting-process/src/test/java/nu/marginalia/converting/sideload/StackexchangeSideloaderTest.java deleted file mode 100644 index 56374b0b..00000000 --- a/code/processes/converting-process/src/test/java/nu/marginalia/converting/sideload/StackexchangeSideloaderTest.java +++ /dev/null @@ -1,24 +0,0 @@ -package nu.marginalia.converting.sideload; - -import nu.marginalia.converting.sideload.stackexchange.StackExchange7zReader; -import org.junit.jupiter.api.Disabled; -import org.junit.jupiter.api.Test; - -import javax.xml.stream.XMLStreamException; -import java.io.IOException; -import java.nio.file.Path; - -class StackexchangeSideloaderTest { - @Test - @Disabled - public void test7zFile() throws IOException, XMLStreamException { - var stackExchangeReader = new StackExchange7zReader(Path.of("/mnt/storage/stackexchange/scifi.meta.stackexchange.com.7z")); - - System.out.println(stackExchangeReader.getIds()); - - var iter = stackExchangeReader.postIterator(); - while (iter.hasNext()) { - System.out.println(iter.next()); - } - } -} \ No newline at end of file diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/ControlService.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/ControlService.java index 83c86590..c6adb096 100644 --- a/code/services-core/control-service/src/main/java/nu/marginalia/control/ControlService.java +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/ControlService.java @@ -190,6 +190,7 @@ public class ControlService extends Service { Spark.post("/public/actions/truncate-links-database", controlActionsService::truncateLinkDatabase, redirectToActors); Spark.post("/public/actions/sideload-encyclopedia", controlActionsService::sideloadEncyclopedia, redirectToActors); Spark.post("/public/actions/sideload-dirtree", controlActionsService::sideloadDirtree, redirectToActors); + Spark.post("/public/actions/sideload-stackexchange", 
controlActionsService::sideloadStackexchange, redirectToActors); // Review Random Domains Spark.get("/public/review-random-domains", this::reviewRandomDomainsModel, reviewRandomDomainsRenderer::render); diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/ControlActionsService.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/ControlActionsService.java index 3407ca0a..5b4bfd5c 100644 --- a/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/ControlActionsService.java +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/ControlActionsService.java @@ -129,6 +129,21 @@ public class ControlActionsService { return ""; } + public Object sideloadStackexchange(Request request, Response response) throws Exception { + + Path sourcePath = Path.of(request.queryParams("source")); + if (!Files.exists(sourcePath)) { + Spark.halt(404); + return "No such file " + sourcePath; + } + + eventLog.logEvent("USER-ACTION", "SIDELOAD STACKEXCHANGE"); + + actors.startFrom(Actor.CONVERT, ConvertActor.CONVERT_STACKEXCHANGE, sourcePath.toString()); + + return ""; + } + public Object triggerRepartition(Request request, Response response) throws Exception { indexClient.outbox().sendAsync(IndexMqEndpoints.INDEX_REPARTITION, ""); diff --git a/code/services-core/control-service/src/main/resources/templates/control/actions.hdb b/code/services-core/control-service/src/main/resources/templates/control/actions.hdb index 85ac2c3d..b372b304 100644 --- a/code/services-core/control-service/src/main/resources/templates/control/actions.hdb +++ b/code/services-core/control-service/src/main/resources/templates/control/actions.hdb @@ -54,6 +54,20 @@ + + Sideload Stackexchange

+ Will load a set of pre-converted stackexchange .db files. + + +

+
+ +

+ + +
+ + Reload Blogs List diff --git a/doc/readme.md b/doc/readme.md index 9a591213..a5da9973 100644 --- a/doc/readme.md +++ b/doc/readme.md @@ -6,6 +6,7 @@ Start in [📁 ../code/](../code/) and poke around. ## Operations * [System Properties](system-properties.md) - JVM property flags +* [Sideloading How-To](sideloading-howto.md) - How to sideload various data sets ## Set-up diff --git a/doc/sideloading-howto.md b/doc/sideloading-howto.md new file mode 100644 index 00000000..5c337423 --- /dev/null +++ b/doc/sideloading-howto.md @@ -0,0 +1,120 @@ +# Sideloading How-To + +(This document is a bit of a draft to get this down in writing +while it's still fresh in my head.) + +Some websites are much larger than others; this includes +Wikipedia, Stack Overflow, and a few others. They are so +large they are impractical to crawl in the traditional fashion, +but luckily they make available data dumps that can be processed +and loaded into the search engine through other means. + +## Sideloading a directory tree + +For relatively small websites, ad-hoc side-loading is available directly from a +folder structure on the hard drive. This is intended for loading manuals, +documentation and similar data sets that are large and slowly changing. 
+ +A website can be archived with wget, like this + +```bash +UA="search.marginalia.nu" \ +DOMAIN="www.example.com" \ +wget -nc -x --continue -w 1 -r -U ${UA} -A "html" ${DOMAIN} +``` + +After doing this to a bunch of websites, create a YAML file something like this: + +```yaml +sources: +- name: jdk-20 + dir: "jdk-20/" + domainName: "docs.oracle.com" + baseUrl: "https://docs.oracle.com/en/java/javase/20/docs" + keywords: + - "java" + - "docs" + - "documentation" + - "javadoc" +- name: python3 + dir: "python-3.11.5/" + domainName: "docs.python.org" + baseUrl: "https://docs.python.org/3/" + keywords: + - "python" + - "docs" + - "documentation" +- name: mariadb.com + dir: "mariadb.com/" + domainName: "mariadb.com" + baseUrl: "https://mariadb.com/" + keywords: + - "sql" + - "docs" + - "mariadb" + - "mysql" +``` + +|parameter|description| +|----|----| +|name|Purely informative| +|dir|Path of website contents relative to the location of the yaml file| +|domainName|The domain name of the website| +|baseUrl|This URL will be prefixed to the contents of `dir`| +|keywords|These supplemental keywords will be injected in each document| + +The directory structure corresponding to the above might look like + +``` +docs-index.yaml +jdk-20/ +jdk-20/resources/ +jdk-20/api/ +jdk-20/api/[...] +jdk-20/specs/ +jdk-20/specs/[...] +jdk-20/index.html +mariadb.com +mariadb.com/kb/ +mariadb.com/kb/[...] +python-3.11.5 +python-3.11.5/genindex-B.html +python-3.11.5/library/ +python-3.11.5/distutils/ +python-3.11.5/[...] +[...] +``` + +This yaml-file can be processed and loaded into the search engine through the +Actions view. + +## Sideloading Wikipedia + +For now, this workflow depends on using the conversion process from +[https://encyclopedia.marginalia.nu/](https://encyclopedia.marginalia.nu/) +to pre-digest the data. This is because it uses OpenZIM which has a +license that is incompatible with this project. 
+ +Build the [encyclopedia.marginalia.nu Code](https://github.com/MarginaliaSearch/encyclopedia.marginalia.nu) +and follow the instructions for downloading a ZIM file, and then run something like + +```$./encyclopedia convert file.zim articles.db``` + +This db-file can be processed and loaded into the search engine through the +Actions view. + +FIXME: It will currently only point to encyclopedia.marginalia.nu and not main Wikipedia; +this should be made configurable. + +## Sideloading Stack Overflow/Stackexchange + +Stackexchange makes dumps available on Archive.org. These are unfortunately in a format that +needs some heavy-handed pre-processing before they can be loaded. A tool is available for +this in [tools/stackexchange-converter](../code/tools/stackexchange-converter). + +After running `gradlew dist`, this tool is found in `build/dist/stackexchange-converter`. +Follow the instructions in the stackexchange-converter readme, and +convert the stackexchange xml.7z-files to sqlite db-files. + +A directory with such db-files can be processed and loaded into the +search engine through the Actions view. 
\ No newline at end of file diff --git a/doc/system-properties.md b/doc/system-properties.md index e79228bd..c1b77478 100644 --- a/doc/system-properties.md +++ b/doc/system-properties.md @@ -7,12 +7,6 @@ These are JVM system properties used by each service |-------------|------------|-------------------------------------------------------| | website-url |https://search.marginalia.nu/|Overrides the website URL used in rendering| -## Index Service - -|flag| values | description | -|---|------------|---------------------------------------------------------------| -|lexiconSizeHint| 1000000000 | The default size of the lexicon, speeds up start time in prod | - ## Crawler Process |flag| values | description | |---|------------|-------------------------------------------------------| @@ -26,12 +20,5 @@ These are JVM system properties used by each service ## Loader Process |flag| values | description | |---|------------|-------------------------------------------------------| -|lexiconSizeHint| 800000000 | The default size of the lexicon | |local-index-path| /some/path | Selects the location the loader will write index data | -|crawl.rootDirRewrite|/some/path|Sets the base directory of a crawl plan | - -## Other - -|flag| values | description | -|---|------------|---------------------------------------------| -|bigstring.disabled| true/false | Disables transparent big string compression | \ No newline at end of file +|crawl.rootDirRewrite|/some/path|Sets the base directory of a crawl plan | \ No newline at end of file