(converter, stackexchange-xml) Add the ability to sideload stackexchange data

This commit is contained in:
Viktor Lofgren 2023-09-21 12:48:33 +02:00
parent 4aa47e87f2
commit 70aa04c047
13 changed files with 290 additions and 307 deletions

View File

@ -155,6 +155,8 @@ public class StackExchangePostsDb {
String title = "";
int year = 2023;
String tags = "";
List<Future<String>> partWork = new ArrayList<>();
var commonPool = ForkJoinPool.commonPool();
while (rs.next()) {
@ -162,6 +164,11 @@ public class StackExchangePostsDb {
if (maybeTitle != null && !maybeTitle.isBlank())
title = maybeTitle;
String maybeTags = rs.getString("tags");
if (maybeTags != null && !maybeTags.isBlank())
tags = maybeTags;
int origSize = rs.getInt("origSize");
year = Math.min(year, rs.getInt("postYear"));
@ -177,7 +184,7 @@ public class StackExchangePostsDb {
parts.add(workItem.get());
}
if (!consumer.test(new CombinedPostModel(ordinal++, threadId, title, year, parts)))
if (!consumer.test(new CombinedPostModel(ordinal++, threadId, title, year, parts, tags)))
break;
}
@ -188,11 +195,27 @@ public class StackExchangePostsDb {
}
/**
 * Reads the domain name stored in the {@code metadata} table of a
 * stackexchange sqlite database file.
 *
 * @param pathToDbFile path of the sqlite db file to open
 * @return the {@code domainName} value of the first row of the metadata table
 * @throws SQLException if the database cannot be opened or queried
 * @throws IllegalArgumentException if the metadata table is empty, or the
 *         stored domain name is NULL or blank
 */
public static String getDomainName(Path pathToDbFile) throws SQLException {
    String connStr = "jdbc:sqlite:" + pathToDbFile;

    // All three JDBC resources are released in reverse order when the block exits
    try (var connection = DriverManager.getConnection(connStr);
         var stmt = connection.prepareStatement("SELECT domainName FROM metadata");
         var rs = stmt.executeQuery()
    ) {
        if (!rs.next()) {
            throw new IllegalArgumentException("No metadata in db file " + pathToDbFile);
        }

        String domainName = rs.getString(1);

        // A NULL column would otherwise be string-concatenated into "null" by callers
        if (domainName == null || domainName.isBlank()) {
            throw new IllegalArgumentException("No metadata in db file " + pathToDbFile);
        }

        return domainName;
    }
}
public record CombinedPostModel(int ordinal,
int threadId,
String title,
int year,
List<String> bodies)
List<String> bodies,
String tags)
{ }
}

View File

@ -53,6 +53,7 @@ dependencies {
implementation project(':code:features-convert:pubdate')
implementation project(':code:features-convert:keyword-extraction')
implementation project(':code:features-convert:summary-extraction')
implementation project(':code:features-convert:stackexchange-xml')
implementation project(':code:features-crawl:crawl-blocklist')
implementation project(':code:features-crawl:link-parser')

View File

@ -25,6 +25,7 @@ import nu.marginalia.converting.processor.DomainProcessor;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.nio.file.Files;
import java.nio.file.Path;
import java.sql.SQLException;
import java.util.Collection;
@ -86,12 +87,19 @@ public class ConverterMain {
public void convert(Collection<? extends SideloadSource> sideloadSources, Path writeDir) throws Exception {
try (var writer = new ConverterBatchWriter(writeDir, 0);
var taskHeartbeat = heartbeat.createAdHocTaskHeartbeat("Sideloading");
BatchingWorkLog batchingWorkLog = new BatchingWorkLogImpl(writeDir.resolve("processor.log"))
) {
int i = 0;
for (var sideloadSource : sideloadSources) {
logger.info("Sideloading {}", sideloadSource.getDomain());
taskHeartbeat.progress(sideloadSource.getDomain().toString(), i++, sideloadSources.size());
writer.write(sideloadSource);
}
taskHeartbeat.progress("Finished", i, sideloadSources.size());
// We write an empty log with just a finish marker for the sideloading action
batchingWorkLog.logFinishedBatch();
@ -242,9 +250,8 @@ public class ConverterMain {
}
case SideloadStackexchange -> {
var processData = fileStorageService.getStorage(request.processedDataStorage);
var domainName = filePath.toFile().getName().substring(0, filePath.toFile().getName().lastIndexOf('.'));
yield new SideloadAction(sideloadSourceFactory.sideloadStackexchange(filePath, domainName),
yield new SideloadAction(sideloadSourceFactory.sideloadStackexchange(filePath),
processData.asPath(),
msg, inbox);
}

View File

@ -9,6 +9,7 @@ import nu.marginalia.keyword.DocumentKeywordExtractor;
import nu.marginalia.language.sentence.SentenceExtractor;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.sql.SQLException;
import java.util.Collection;
@ -42,8 +43,13 @@ public class SideloadSourceFactory {
}
/** Do not use, this code isn't finished */
@Deprecated()
public SideloadSource sideloadStackexchange(Path pathTo7zFile, String domainName) {
return new StackexchangeSideloader(pathTo7zFile, domainName, sentenceExtractor, documentKeywordExtractor);
public Collection<? extends SideloadSource> sideloadStackexchange(Path pathToDbFileRoot) throws IOException {
try (var dirs = Files.walk(pathToDbFileRoot)) {
return dirs
.filter(Files::isRegularFile)
.filter(f -> f.toFile().getName().endsWith(".db"))
.map(dbFile -> new StackexchangeSideloader(dbFile, sentenceExtractor, documentKeywordExtractor))
.toList();
}
}
}

View File

@ -1,229 +0,0 @@
package nu.marginalia.converting.sideload.stackexchange;
import lombok.SneakyThrows;
import org.apache.commons.compress.archivers.sevenz.SevenZArchiveEntry;
import org.apache.commons.compress.archivers.sevenz.SevenZFile;
import javax.xml.namespace.QName;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamException;
import java.io.IOException;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.stream.Collectors;
/**
 * Reads posts and comments out of a stackexchange data dump, i.e. a 7z archive
 * containing the entries {@code Posts.xml} and {@code Comments.xml}.
 * <p>
 * NOTE(review): {@link #postIterator()} pairs comments with posts by advancing
 * both XML files in lock-step, which presumably requires both files to be
 * ordered by ascending post id. Confirm against the dump format before
 * relying on the comment lists.
 */
@Deprecated
public class StackExchange7zReader {
    private final Path pathTo7zFile;

    /**
     * @param pathTo7zFile location of the 7z archive to read
     */
    public StackExchange7zReader(Path pathTo7zFile) {
        this.pathTo7zFile = pathTo7zFile;
    }

    /**
     * Collects the {@code Id} attribute of every {@code row} element in Posts.xml.
     *
     * @return the ids in document order, or an empty list if the archive
     *         has no entry named Posts.xml
     */
    public List<String> getIds() throws IOException, XMLStreamException {
        try (SevenZFile file = new SevenZFile(pathTo7zFile.toFile())) {
            SevenZArchiveEntry postsEntry = findEntry(file, "Posts.xml");
            if (postsEntry != null) {
                return getIds(file, postsEntry);
            }
        }

        return List.of();
    }

    /** Returns the first archive entry with the given name, or null if there is none. */
    private static SevenZArchiveEntry findEntry(SevenZFile file, String name) {
        for (SevenZArchiveEntry entry : file.getEntries()) {
            if (name.equals(entry.getName())) {
                return entry;
            }
        }
        return null;
    }

    /**
     * Creates a StAX factory with DTD and external entity processing turned off;
     * the archives come from an external source and none of that is needed
     * to read attribute values off {@code row} elements.
     */
    private static XMLInputFactory newXmlInputFactory() {
        XMLInputFactory factory = XMLInputFactory.newInstance();
        factory.setProperty(XMLInputFactory.SUPPORT_DTD, Boolean.FALSE);
        factory.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, Boolean.FALSE);
        return factory;
    }

    /** Streams through the given archive entry and gathers the Id attribute of each row element. */
    private List<String> getIds(SevenZFile file, SevenZArchiveEntry entry) throws IOException, XMLStreamException {
        List<String> ids = new ArrayList<>(10000);

        XMLInputFactory xmlInputFactory = newXmlInputFactory();
        var idField = new QName("Id");

        try (var inputStream = file.getInputStream(entry)) {
            var xmlReader = xmlInputFactory.createXMLEventReader(inputStream);

            while (xmlReader.hasNext()) {
                var event = xmlReader.nextEvent();
                if (!event.isStartElement()) continue;

                var startEvent = event.asStartElement();
                if (!"row".equals(startEvent.getName().getLocalPart())) continue;

                var fieldValue = startEvent.getAttributeByName(idField);
                if (fieldValue != null) {
                    ids.add(fieldValue.getValue());
                }
            }
        }

        return ids;
    }

    /**
     * Opens the archive twice (once for Posts.xml, once for Comments.xml) and
     * returns an iterator over the posts that have a score of at least 1 and
     * carry all of the Title, Id, Body, Tags and CreationDate attributes.
     * <p>
     * The underlying files are closed when the iterator is exhausted, or
     * immediately if setting up the iterator fails. An iterator that is
     * abandoned before exhaustion keeps the archive open.
     *
     * @throws IOException if the archive cannot be read, or lacks
     *         Posts.xml or Comments.xml
     * @throws XMLStreamException if the XML readers cannot be created
     */
    public Iterator<Post> postIterator() throws IOException, XMLStreamException {
        SevenZFile postsFile = null;
        SevenZFile commentsFile = null;

        try {
            postsFile = new SevenZFile(pathTo7zFile.toFile());
            commentsFile = new SevenZFile(pathTo7zFile.toFile());

            return openPostIterator(postsFile, commentsFile);
        }
        catch (IOException | XMLStreamException | RuntimeException e) {
            // Nothing was handed to the caller, so nobody else can close these
            closeAfterFailure(postsFile, e);
            closeAfterFailure(commentsFile, e);
            throw e;
        }
    }

    /** Closes the file if it was opened, attaching any close failure to the primary exception. */
    private static void closeAfterFailure(SevenZFile file, Throwable primary) {
        if (file == null) return;

        try {
            file.close();
        }
        catch (IOException closeFailure) {
            primary.addSuppressed(closeFailure);
        }
    }

    /**
     * Builds the post iterator on top of two already opened archive handles.
     * On failure the caller is responsible for closing the handles.
     */
    private Iterator<Post> openPostIterator(SevenZFile postsFile, SevenZFile commentsFile)
            throws IOException, XMLStreamException
    {
        SevenZArchiveEntry postsEntry = findEntry(postsFile, "Posts.xml");
        SevenZArchiveEntry commentsEntry = findEntry(commentsFile, "Comments.xml");

        if (postsEntry == null || commentsEntry == null) {
            throw new IOException("Posts.xml or Comments.xml not found in 7z file");
        }

        var postsInputStream = postsFile.getInputStream(postsEntry);
        var commentsInputStream = commentsFile.getInputStream(commentsEntry);

        XMLInputFactory xmlInputFactory = newXmlInputFactory();

        var postsXmlReader = xmlInputFactory.createXMLEventReader(postsInputStream);
        var commentsXmlReader = xmlInputFactory.createXMLEventReader(commentsInputStream);

        QName titleName = new QName("Title");
        QName idName = new QName("Id");
        QName bodyName = new QName("Body");
        QName tagsName = new QName("Tags");
        QName creationDateName = new QName("CreationDate");
        QName scoreName = new QName("Score");

        QName postIdName = new QName("PostId");
        QName textName = new QName("Text");

        return new Iterator<>() {
            /** The post that the next call to next() will hand out, if already read */
            Post next = null;
            /** A comment that has been read but not yet attached to a post */
            Comment nextComment = null;
            /** Set once the posts file has been read to its end and everything is closed */
            boolean exhausted = false;

            @SneakyThrows
            @Override
            public boolean hasNext() {
                if (next != null)
                    return true;

                // The readers are closed at this point and must not be touched again
                if (exhausted)
                    return false;

                while (postsXmlReader.hasNext()) {
                    var event = postsXmlReader.nextEvent();
                    if (!event.isStartElement()) continue;

                    var startEvent = event.asStartElement();
                    if (!"row".equals(startEvent.getName().getLocalPart())) continue;

                    var scoreAttribute = startEvent.getAttributeByName(scoreName);
                    if (scoreAttribute == null) continue;
                    int postScore = Integer.parseInt(scoreAttribute.getValue());
                    if (postScore < 1) continue;

                    var titleAttribute = startEvent.getAttributeByName(titleName);
                    if (titleAttribute == null) continue;
                    String title = titleAttribute.getValue();

                    var idAttribute = startEvent.getAttributeByName(idName);
                    if (idAttribute == null) continue;
                    int id = Integer.parseInt(idAttribute.getValue());

                    var bodyAttribute = startEvent.getAttributeByName(bodyName);
                    if (bodyAttribute == null) continue;
                    String body = bodyAttribute.getValue();

                    var tagsAttribute = startEvent.getAttributeByName(tagsName);
                    if (tagsAttribute == null) continue;
                    String tags = tagsAttribute.getValue();
                    List<String> tagsParsed = parseTags(tags);

                    var creationDateAttribute = startEvent.getAttributeByName(creationDateName);
                    if (creationDateAttribute == null) continue;
                    String creationDate = creationDateAttribute.getValue();
                    // The year is taken from the first four characters of the date string
                    int year = Integer.parseInt(creationDate.substring(0, 4));

                    List<Comment> comments = new ArrayList<>();
                    do {
                        // Nothing buffered yet; 'continue' jumps to readNextComment() below
                        if (nextComment == null) continue;

                        // The buffered comment belongs to a later post, keep it for then
                        if (nextComment.postId > id) {
                            break;
                        }
                        if (nextComment.postId == id) {
                            comments.add(nextComment);
                            nextComment = null;
                        }
                        // A buffered comment with a lower post id is overwritten by the next read
                    }
                    while (readNextComment());

                    next = new Post(title, tagsParsed, year, id, body, comments);
                    return true;
                }

                exhausted = true;
                closeAll();

                return false;
            }

            /**
             * Closes both streams and both archive handles. Resources are closed in
             * reverse order of listing, and each is closed even if an earlier close throws.
             */
            private void closeAll() throws IOException {
                try (commentsFile; postsFile; commentsInputStream; postsInputStream) {
                    // nothing to do, this block exists only to close the resources
                }
            }

            /**
             * Reads ahead in the comments file to the next row that has both a PostId
             * and a Text attribute and buffers it in nextComment.
             *
             * @return false if the comments file has no more such rows
             */
            private boolean readNextComment() throws XMLStreamException {
                while (commentsXmlReader.hasNext()) {
                    var event = commentsXmlReader.nextEvent();
                    if (!event.isStartElement()) continue;

                    var startEvent = event.asStartElement();
                    if (!"row".equals(startEvent.getName().getLocalPart())) continue;

                    var postIdAttribute = startEvent.getAttributeByName(postIdName);
                    if (postIdAttribute == null) continue;
                    int postId = Integer.parseInt(postIdAttribute.getValue());

                    var textAttribute = startEvent.getAttributeByName(textName);
                    if (textAttribute == null) continue;
                    String text = textAttribute.getValue();

                    nextComment = new Comment(postId, text);
                    return true;
                }
                return false;
            }

            @Override
            public Post next() {
                if (hasNext()) {
                    var ret = next;
                    next = null;
                    return ret;
                }

                throw new IllegalStateException("No more posts");
            }
        };
    }

    /** Splits a tag string on the characters {@code <} and {@code >}, dropping blank fragments. */
    private List<String> parseTags(String tags) {
        return Arrays.stream(tags.split("<|>"))
                .filter(s -> !s.isBlank())
                .collect(Collectors.toList());
    }

    /** A post row from Posts.xml together with the comments matched to it. */
    public record Post(String title, List<String> tags, int year, int id, String body, List<Comment> comments) {
    }

    /** A comment row from Comments.xml. */
    public record Comment(int postId, String text) {
    }
}

View File

@ -3,6 +3,7 @@ package nu.marginalia.converting.sideload.stackexchange;
import lombok.SneakyThrows;
import nu.marginalia.converting.model.*;
import nu.marginalia.converting.sideload.SideloadSource;
import nu.marginalia.integration.stackexchange.sqlite.StackExchangePostsDb;
import nu.marginalia.keyword.DocumentKeywordExtractor;
import nu.marginalia.language.sentence.SentenceExtractor;
import nu.marginalia.model.EdgeDomain;
@ -15,31 +16,32 @@ import nu.marginalia.model.html.HtmlStandard;
import nu.marginalia.model.idx.DocumentFlags;
import nu.marginalia.model.idx.DocumentMetadata;
import nu.marginalia.model.idx.WordFlags;
import nu.marginalia.util.SimpleBlockingThreadPool;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import javax.xml.stream.XMLStreamException;
import java.io.IOException;
import java.nio.file.Path;
import java.util.Arrays;
import java.util.EnumSet;
import java.util.Iterator;
import java.util.List;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.TimeUnit;
/** This code is broken */
@Deprecated()
public class StackexchangeSideloader implements SideloadSource {
private final StackExchange7zReader reader;
private final SentenceExtractor sentenceExtractor;
private final DocumentKeywordExtractor keywordExtractor;
private final String domainName;
public StackexchangeSideloader(Path pathTo7zFile,
String domainName,
private final Path dbFile;
@SneakyThrows
public StackexchangeSideloader(Path pathToDbFile,
SentenceExtractor sentenceExtractor,
DocumentKeywordExtractor keywordExtractor
) {
this.domainName = domainName;
reader = new StackExchange7zReader(pathTo7zFile);
this.dbFile = pathToDbFile;
this.domainName = StackExchangePostsDb.getDomainName(pathToDbFile);
this.sentenceExtractor = sentenceExtractor;
this.keywordExtractor = keywordExtractor;
}
@ -57,36 +59,48 @@ public class StackexchangeSideloader implements SideloadSource {
@Override
public Iterator<ProcessedDocument> getDocumentsStream() {
try {
var baseIter = reader.postIterator();
return new Iterator<>() {
@Override
public boolean hasNext() {
return baseIter.hasNext();
var postsReader = new PostsReader();
Thread readerThread = new Thread(postsReader);
readerThread.setDaemon(true);
readerThread.start();
return new Iterator<>() {
ProcessedDocument nextModel = null;
@SneakyThrows
@Override
public boolean hasNext() {
if (nextModel != null)
return true;
nextModel = postsReader.next();
return nextModel != null;
}
@Override
public ProcessedDocument next() {
if (hasNext()) {
var ret = nextModel;
nextModel = null;
return ret;
}
@Override
public ProcessedDocument next() {
return convert(baseIter.next());
}
};
} catch (IOException e) {
throw new RuntimeException(e);
} catch (XMLStreamException e) {
throw new RuntimeException(e);
}
throw new IllegalStateException();
}
};
}
@SneakyThrows
private ProcessedDocument convert(StackExchange7zReader.Post post) {
String fullUrl = "https://" + domainName + "/questions/" + post.id();
private ProcessedDocument convert(StackExchangePostsDb.CombinedPostModel post) {
String fullUrl = "https://" + domainName + "/questions/" + post.threadId();
StringBuilder fullHtml = new StringBuilder();
fullHtml.append("<!DOCTYPE html><html><head><title>").append(post.title()).append("</title></head><body>");
fullHtml.append("<p>").append(post.title()).append("</p>");
for (var comment : post.comments()) {
fullHtml.append("<p>").append(comment.text()).append("</p>");
for (var comment : post.bodies()) {
fullHtml.append("<p>").append(comment).append("</p>");
}
fullHtml.append("</body></html>");
@ -99,10 +113,17 @@ public class StackexchangeSideloader implements SideloadSource {
ret.url = url;
ret.words = keywordExtractor.extractKeywords(dld, url);
ret.words.addJustNoMeta("site:"+domainName);
ret.words.addJustNoMeta("site:"+url.domain.domain);
ret.words.addJustNoMeta(url.domain.domain);
ret.words.setFlagOnMetadataForWords(WordFlags.Subjects, post.tags());
ret.words.addAllSyntheticTerms(List.of(
"site:" + domainName,
"site:" + url.domain.domain,
url.domain.domain
));
if (!post.tags().isBlank()) {
List<String> subjects = Arrays.asList(post.tags().split(","));
ret.words.setFlagOnMetadataForWords(WordFlags.Subjects, subjects);
}
ret.details = new ProcessedDocumentDetails();
ret.details.pubYear = post.year();
ret.details.quality = 5;
@ -129,4 +150,44 @@ public class StackexchangeSideloader implements SideloadSource {
return ret;
}
/**
 * Producer half of the document stream: run() walks the posts in the db file
 * and submits each one to a worker pool, whose tasks convert the post and put
 * the result on a bounded queue. The consumer pulls documents off the queue
 * with next().
 */
class PostsReader implements Runnable {
    // Bounded, so the conversion tasks block in put() when the consumer falls behind
    private final ArrayBlockingQueue<ProcessedDocument> results = new ArrayBlockingQueue<>(16);
    private final SimpleBlockingThreadPool pool = new SimpleBlockingThreadPool("Sideloading Stackexchange", 16, 4);

    // True until run() has submitted the last post (or failed)
    volatile boolean isRunning = true;

    @Override
    public void run() {
        try {
            StackExchangePostsDb.forEachPost(dbFile, this::enqueue);
        }
        finally {
            isRunning = false;
            pool.shutDown();
        }
    }

    /**
     * Callback for forEachPost: schedules conversion of one post.
     *
     * @return always true
     */
    @SneakyThrows
    private boolean enqueue(StackExchangePostsDb.CombinedPostModel model) {
        pool.submit(() -> results.put(convert(model)));

        return true;
    }

    /**
     * Waits for the next converted document, polling the queue in one second
     * intervals.
     *
     * @return the next document, or null once isFinished() reports true
     * @throws InterruptedException if interrupted while waiting on the queue
     */
    public ProcessedDocument next() throws InterruptedException {
        do {
            var next = results.poll(1, TimeUnit.SECONDS);
            if (next != null) {
                return next;
            }
        } while (!isFinished());

        return null;
    }

    /**
     * True when no more documents will become available.
     * <p>
     * The order of the checks matters: the queue must be inspected last.
     * If it were inspected before the pool, a task could put its result
     * and finish between the two checks, and that document would be lost.
     * NOTE(review): this assumes pool.isTerminated() only returns true after
     * every submitted task has completed; confirm in SimpleBlockingThreadPool.
     */
    public boolean isFinished() {
        return !isRunning &&
                pool.isTerminated() &&
                results.isEmpty();
    }
}
}

View File

@ -1,24 +0,0 @@
package nu.marginalia.converting.sideload;
import nu.marginalia.converting.sideload.stackexchange.StackExchange7zReader;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import javax.xml.stream.XMLStreamException;
import java.io.IOException;
import java.nio.file.Path;
/** Manual smoke test for the 7z reader; disabled because it needs a local data dump. */
class StackexchangeSideloaderTest {

    /** Prints every post id, and then every post, found in the archive. */
    @Test
    @Disabled
    public void test7zFile() throws IOException, XMLStreamException {
        Path archivePath = Path.of("/mnt/storage/stackexchange/scifi.meta.stackexchange.com.7z");
        StackExchange7zReader reader = new StackExchange7zReader(archivePath);

        System.out.println(reader.getIds());

        for (var posts = reader.postIterator(); posts.hasNext(); ) {
            System.out.println(posts.next());
        }
    }
}

View File

@ -190,6 +190,7 @@ public class ControlService extends Service {
Spark.post("/public/actions/truncate-links-database", controlActionsService::truncateLinkDatabase, redirectToActors);
Spark.post("/public/actions/sideload-encyclopedia", controlActionsService::sideloadEncyclopedia, redirectToActors);
Spark.post("/public/actions/sideload-dirtree", controlActionsService::sideloadDirtree, redirectToActors);
Spark.post("/public/actions/sideload-stackexchange", controlActionsService::sideloadStackexchange, redirectToActors);
// Review Random Domains
Spark.get("/public/review-random-domains", this::reviewRandomDomainsModel, reviewRandomDomainsRenderer::render);

View File

@ -129,6 +129,21 @@ public class ControlActionsService {
return "";
}
/**
 * Handler for the sideload-stackexchange action: validates the {@code source}
 * query parameter and starts the convert actor in its stackexchange state.
 *
 * @param request  must carry a {@code source} parameter naming an existing path on the server
 * @param response unused
 * @return an empty string on success
 */
public Object sideloadStackexchange(Request request, Response response) throws Exception {
    String source = request.queryParams("source");

    // Path.of(null) would fail with a NullPointerException
    if (source == null || source.isBlank()) {
        String message = "Missing parameter 'source'";
        Spark.halt(400, message);
        return message;
    }

    Path sourcePath = Path.of(source);
    if (!Files.exists(sourcePath)) {
        String message = "No such file " + sourcePath;
        // halt() aborts the request by throwing, so the message has to be
        // passed along here; the return below is only a fallback
        Spark.halt(404, message);
        return message;
    }

    eventLog.logEvent("USER-ACTION", "SIDELOAD STACKEXCHANGE");

    actors.startFrom(Actor.CONVERT, ConvertActor.CONVERT_STACKEXCHANGE, sourcePath.toString());

    return "";
}
public Object triggerRepartition(Request request, Response response) throws Exception {
indexClient.outbox().sendAsync(IndexMqEndpoints.INDEX_REPARTITION, "");

View File

@ -54,6 +54,20 @@
</form>
</td>
</tr>
<tr>
<td><b>Sideload Stackexchange</b><p>
Will load a set of pre-converted stackexchange .db files.
</td>
<td>
<form method="post" action="/actions/sideload-stackexchange" onsubmit="return confirm('Confirm sideloading')">
<label for="source">Directory with .db files location on server</label><br>
<input id="source" name="source" value="">
<br><br>
<input type="submit" value="Sideload Stackexchange">
</form>
</td>
</tr>
<tr>
<td>
<b>Reload Blogs List</b>

View File

@ -6,6 +6,7 @@ Start in [📁 ../code/](../code/) and poke around.
## Operations
* [System Properties](system-properties.md) - JVM property flags
* [Sideloading How-To](sideloading-howto.md) - How to sideload various data sets
## Set-up

120
doc/sideloading-howto.md Normal file
View File

@ -0,0 +1,120 @@
# Sideloading How-To
(This document is a bit of a draft to get this down in writing
while it's still fresh in my head.)
Some websites are much larger than others; these include
Wikipedia, Stack Overflow, and a few others. They are so
large that they are impractical to crawl in the traditional fashion,
but luckily they make data dumps available that can be processed
and loaded into the search engine through other means.
## Sideloading a directory tree
For relatively small websites, ad-hoc side-loading is available directly from a
folder structure on the hard drive. This is intended for loading manuals,
documentation and similar data sets that are large and slowly changing.
A website can be archived with wget, like this:
```bash
UA="search.marginalia.nu" \
DOMAIN="www.example.com" \
wget -nc -x --continue -w 1 -r -U ${UA} -A "html" ${DOMAIN}
```
After doing this to a bunch of websites, create a YAML file something like this:
```yaml
sources:
- name: jdk-20
dir: "jdk-20/"
domainName: "docs.oracle.com"
baseUrl: "https://docs.oracle.com/en/java/javase/20/docs"
keywords:
- "java"
- "docs"
- "documentation"
- "javadoc"
- name: python3
dir: "python-3.11.5/"
domainName: "docs.python.org"
baseUrl: "https://docs.python.org/3/"
keywords:
- "python"
- "docs"
- "documentation"
- name: mariadb.com
dir: "mariadb.com/"
domainName: "mariadb.com"
baseUrl: "https://mariadb.com/"
keywords:
- "sql"
- "docs"
- "mariadb"
- "mysql"
```
|parameter|description|
|----|----|
|name|Purely informative|
|dir|Path of website contents relative to the location of the yaml file|
|domainName|The domain name of the website|
|baseUrl|This URL will be prefixed to the contents of `dir`|
|keywords|These supplemental keywords will be injected in each document|
The directory structure corresponding to the above might look like this:
```
docs-index.yaml
jdk-20/
jdk-20/resources/
jdk-20/api/
jdk-20/api/[...]
jdk-20/specs/
jdk-20/specs/[...]
jdk-20/index.html
mariadb.com
mariadb.com/kb/
mariadb.com/kb/[...]
python-3.11.5
python-3.11.5/genindex-B.html
python-3.11.5/library/
python-3.11.5/distutils/
python-3.11.5/[...]
[...]
```
This yaml-file can be processed and loaded into the search engine through the
Actions view.
## Sideloading Wikipedia
For now, this workflow depends on using the conversion process from
[https://encyclopedia.marginalia.nu/](https://encyclopedia.marginalia.nu/)
to pre-digest the data. This is because it uses OpenZIM which has a
license that is incompatible with this project.
Build the [encyclopedia.marginalia.nu Code](https://github.com/MarginaliaSearch/encyclopedia.marginalia.nu)
and follow the instructions for downloading a ZIM file, and then run something like
```$ ./encyclopedia convert file.zim articles.db```
This db-file can be processed and loaded into the search engine through the
Actions view.
FIXME: It will currently only point to encyclopedia.marginalia.nu and not main Wikipedia;
this should be made configurable.
## Sideloading Stack Overflow/Stackexchange
Stackexchange makes dumps available on Archive.org. These are unfortunately in a format that
needs some heavy-handed pre-processing before they can be loaded. A tool is available for
this in [tools/stackexchange-converter](../code/tools/stackexchange-converter).
After running `gradlew dist`, this tool is found in `build/dist/stackexchange-converter`.
Follow the instructions in the stackexchange-converter readme to
convert the stackexchange xml.7z-files to sqlite db-files.
A directory with such db-files can be processed and loaded into the
search engine through the Actions view.

View File

@ -7,12 +7,6 @@ These are JVM system properties used by each service
|-------------|------------|-------------------------------------------------------|
| website-url |https://search.marginalia.nu/|Overrides the website URL used in rendering|
## Index Service
|flag| values | description |
|---|------------|---------------------------------------------------------------|
|lexiconSizeHint| 1000000000 | The default size of the lexicon, speeds up start time in prod |
## Crawler Process
|flag| values | description |
|---|------------|-------------------------------------------------------|
@ -26,12 +20,5 @@ These are JVM system properties used by each service
## Loader Process
|flag| values | description |
|---|------------|-------------------------------------------------------|
|lexiconSizeHint| 800000000 | The default size of the lexicon |
|local-index-path| /some/path | Selects the location the loader will write index data |
|crawl.rootDirRewrite|/some/path|Sets the base directory of a crawl plan |
## Other
|flag| values | description |
|---|------------|---------------------------------------------|
|bigstring.disabled| true/false | Disables transparent big string compression |
|crawl.rootDirRewrite|/some/path|Sets the base directory of a crawl plan |