(converter, stackexchange-xml) Add the ability to sideload stackexchange data
parent 4aa47e87f2
commit 70aa04c047
@@ -155,6 +155,8 @@ public class StackExchangePostsDb {
         String title = "";
         int year = 2023;
 
+        String tags = "";
+
         List<Future<String>> partWork = new ArrayList<>();
         var commonPool = ForkJoinPool.commonPool();
         while (rs.next()) {
@@ -162,6 +164,11 @@ public class StackExchangePostsDb {
 
                 if (maybeTitle != null && !maybeTitle.isBlank())
                     title = maybeTitle;
+
+                String maybeTags = rs.getString("tags");
+                if (maybeTags != null && !maybeTags.isBlank())
+                    tags = maybeTags;
+
                 int origSize = rs.getInt("origSize");
 
                 year = Math.min(year, rs.getInt("postYear"));
@@ -177,7 +184,7 @@ public class StackExchangePostsDb {
                 parts.add(workItem.get());
             }
 
-            if (!consumer.test(new CombinedPostModel(ordinal++, threadId, title, year, parts)))
+            if (!consumer.test(new CombinedPostModel(ordinal++, threadId, title, year, parts, tags)))
                 break;
         }
 
@@ -188,11 +195,27 @@ public class StackExchangePostsDb {
 
     }
 
+    public static String getDomainName(Path pathToDbFile) throws SQLException {
+        String connStr = "jdbc:sqlite:" + pathToDbFile;
+
+        try (var connection = DriverManager.getConnection(connStr);
+             var stmt = connection.prepareStatement("SELECT domainName FROM metadata")
+        ) {
+            var rs = stmt.executeQuery();
+            if (rs.next()) {
+                return rs.getString(1);
+            }
+            throw new IllegalArgumentException("No metadata in db file " + pathToDbFile);
+        }
+    }
+
     public record CombinedPostModel(int ordinal,
                                     int threadId,
                                     String title,
                                     int year,
-                                    List<String> bodies)
+                                    List<String> bodies,
+                                    String tags)
     { }
 
 }
@@ -53,6 +53,7 @@ dependencies {
     implementation project(':code:features-convert:pubdate')
     implementation project(':code:features-convert:keyword-extraction')
     implementation project(':code:features-convert:summary-extraction')
+    implementation project(':code:features-convert:stackexchange-xml')
 
     implementation project(':code:features-crawl:crawl-blocklist')
     implementation project(':code:features-crawl:link-parser')
@@ -25,6 +25,7 @@ import nu.marginalia.converting.processor.DomainProcessor;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 import java.nio.file.Files;
 import java.nio.file.Path;
+import java.sql.SQLException;
 import java.util.Collection;
@@ -86,12 +87,19 @@ public class ConverterMain {
 
     public void convert(Collection<? extends SideloadSource> sideloadSources, Path writeDir) throws Exception {
         try (var writer = new ConverterBatchWriter(writeDir, 0);
+             var taskHeartbeat = heartbeat.createAdHocTaskHeartbeat("Sideloading");
              BatchingWorkLog batchingWorkLog = new BatchingWorkLogImpl(writeDir.resolve("processor.log"))
         ) {
+            int i = 0;
             for (var sideloadSource : sideloadSources) {
                 logger.info("Sideloading {}", sideloadSource.getDomain());
+
+                taskHeartbeat.progress(sideloadSource.getDomain().toString(), i++, sideloadSources.size());
+
                 writer.write(sideloadSource);
             }
+            taskHeartbeat.progress("Finished", i, sideloadSources.size());
 
             // We write an empty log with just a finish marker for the sideloading action
             batchingWorkLog.logFinishedBatch();
@@ -242,9 +250,8 @@ public class ConverterMain {
             }
             case SideloadStackexchange -> {
                 var processData = fileStorageService.getStorage(request.processedDataStorage);
-                var domainName = filePath.toFile().getName().substring(0, filePath.toFile().getName().lastIndexOf('.'));
 
-                yield new SideloadAction(sideloadSourceFactory.sideloadStackexchange(filePath, domainName),
+                yield new SideloadAction(sideloadSourceFactory.sideloadStackexchange(filePath),
                         processData.asPath(),
                         msg, inbox);
             }
@@ -9,6 +9,7 @@ import nu.marginalia.keyword.DocumentKeywordExtractor;
 import nu.marginalia.language.sentence.SentenceExtractor;
 
+import java.io.IOException;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.sql.SQLException;
 import java.util.Collection;
@@ -42,8 +43,13 @@ public class SideloadSourceFactory {
     }
 
-    /** Do not use, this code isn't finished */
-    @Deprecated()
-    public SideloadSource sideloadStackexchange(Path pathTo7zFile, String domainName) {
-        return new StackexchangeSideloader(pathTo7zFile, domainName, sentenceExtractor, documentKeywordExtractor);
+    public Collection<? extends SideloadSource> sideloadStackexchange(Path pathToDbFileRoot) throws IOException {
+        try (var dirs = Files.walk(pathToDbFileRoot)) {
+            return dirs
+                    .filter(Files::isRegularFile)
+                    .filter(f -> f.toFile().getName().endsWith(".db"))
+                    .map(dbFile -> new StackexchangeSideloader(dbFile, sentenceExtractor, documentKeywordExtractor))
+                    .toList();
+        }
     }
 }
StackExchange7zReader.java (deleted file)
@@ -1,229 +0,0 @@
-package nu.marginalia.converting.sideload.stackexchange;
-
-import lombok.SneakyThrows;
-import org.apache.commons.compress.archivers.sevenz.SevenZArchiveEntry;
-import org.apache.commons.compress.archivers.sevenz.SevenZFile;
-
-import javax.xml.namespace.QName;
-import javax.xml.stream.XMLInputFactory;
-import javax.xml.stream.XMLStreamException;
-import java.io.IOException;
-import java.nio.file.Path;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Iterator;
-import java.util.List;
-import java.util.stream.Collectors;
-
-@Deprecated
-public class StackExchange7zReader {
-    private final Path pathTo7zFile;
-
-    public StackExchange7zReader(Path pathTo7zFile) {
-        this.pathTo7zFile = pathTo7zFile;
-    }
-
-    public List<String> getIds() throws IOException, XMLStreamException {
-        try (SevenZFile file = new SevenZFile(pathTo7zFile.toFile())) {
-            for (SevenZArchiveEntry entry : file.getEntries()) {
-                if ("Posts.xml".equals(entry.getName())) {
-                    return getIds(file, entry);
-                }
-            }
-        }
-        return List.of();
-    }
-
-
-    private List<String> getIds(SevenZFile file, SevenZArchiveEntry entry) throws IOException, XMLStreamException {
-        List<String> ids = new ArrayList<>(10000);
-
-        XMLInputFactory xmlInputFactory = XMLInputFactory.newInstance();
-        var idField = new QName("Id");
-
-        try (var inputStream = file.getInputStream(entry)) {
-
-            var xmlReader = xmlInputFactory.createXMLEventReader(inputStream);
-
-            while (xmlReader.hasNext()) {
-                var event = xmlReader.nextEvent();
-                if (!event.isStartElement()) continue;
-
-                var startEvent = event.asStartElement();
-                if (!"row".equals(startEvent.getName().getLocalPart())) continue;
-
-                var fieldValue = startEvent.getAttributeByName(idField);
-                if (fieldValue != null) {
-                    ids.add(fieldValue.getValue());
-                }
-            }
-        }
-
-        return ids;
-    }
-
-    public Iterator<Post> postIterator() throws IOException, XMLStreamException {
-        SevenZFile postsFile = new SevenZFile(pathTo7zFile.toFile());
-        SevenZFile commentsFile = new SevenZFile(pathTo7zFile.toFile());
-
-        SevenZArchiveEntry postsEntry = null;
-        SevenZArchiveEntry commentsEntry = null;
-
-        for (SevenZArchiveEntry entry : postsFile.getEntries()) {
-            if ("Posts.xml".equals(entry.getName())) {
-                postsEntry = entry;
-                break;
-            }
-        }
-
-        for (SevenZArchiveEntry entry : commentsFile.getEntries()) {
-            if ("Comments.xml".equals(entry.getName())) {
-                commentsEntry = entry;
-                break;
-            }
-        }
-
-        if (postsEntry == null || commentsEntry == null) {
-            postsFile.close();
-            commentsFile.close();
-
-            throw new IOException("Posts.xml or Comments.xml not found in 7z file");
-        }
-
-        var postsInputStream = postsFile.getInputStream(postsEntry);
-        var commentsInputStream = commentsFile.getInputStream(commentsEntry);
-
-        XMLInputFactory xmlInputFactory = XMLInputFactory.newInstance();
-
-        var postsXmlReader = xmlInputFactory.createXMLEventReader(postsInputStream);
-        var commentsXmlReader = xmlInputFactory.createXMLEventReader(commentsInputStream);
-
-        QName titleName = new QName("Title");
-        QName idName = new QName("Id");
-        QName bodyName = new QName("Body");
-        QName tagsName = new QName("Tags");
-        QName creationDateName = new QName("CreationDate");
-        QName score = new QName("Score");
-
-        QName postIdName = new QName("PostId");
-        QName textName = new QName("Text");
-
-        return new Iterator<>() {
-            Post next = null;
-            Comment nextComment = null;
-
-            @SneakyThrows
-            @Override
-            public boolean hasNext() {
-                if (next != null)
-                    return true;
-
-                while (postsXmlReader.hasNext()) {
-                    var event = postsXmlReader.nextEvent();
-                    if (!event.isStartElement()) continue;
-
-                    var startEvent = event.asStartElement();
-                    if (!"row".equals(startEvent.getName().getLocalPart())) continue;
-
-                    var scoreAttribute = startEvent.getAttributeByName(score);
-                    if (scoreAttribute == null) continue;
-                    int score = Integer.parseInt(scoreAttribute.getValue());
-                    if (score < 1) continue;
-
-                    var titleAttribute = startEvent.getAttributeByName(titleName);
-                    if (titleAttribute == null) continue;
-                    String title = titleAttribute.getValue();
-
-                    var idAttribute = startEvent.getAttributeByName(idName);
-                    if (idAttribute == null) continue;
-                    int id = Integer.parseInt(idAttribute.getValue());
-
-                    var bodyAttribute = startEvent.getAttributeByName(bodyName);
-                    if (bodyAttribute == null) continue;
-                    String body = bodyAttribute.getValue();
-
-                    var tagsAttribute = startEvent.getAttributeByName(tagsName);
-                    if (tagsAttribute == null) continue;
-                    String tags = tagsAttribute.getValue();
-                    List<String> tagsParsed = parseTags(tags);
-                    var creationDateAttribute = startEvent.getAttributeByName(creationDateName);
-                    if (creationDateAttribute == null) continue;
-                    String creationDate = creationDateAttribute.getValue();
-                    int year = Integer.parseInt(creationDate.substring(0, 4));
-
-                    List<Comment> comments = new ArrayList<>();
-                    do {
-                        if (nextComment == null) continue;
-
-                        if (nextComment.postId > id) {
-                            break;
-                        }
-                        if (nextComment.postId == id) {
-                            comments.add(nextComment);
-                            nextComment = null;
-                        }
-                    }
-                    while (readNextComment());
-
-                    next = new Post(title, tagsParsed, year, id, body, comments);
-                    return true;
-                }
-
-                postsInputStream.close();
-                commentsInputStream.close();
-                postsFile.close();
-                commentsFile.close();
-
-                return false;
-            }
-
-            private boolean readNextComment() throws XMLStreamException {
-                while (commentsXmlReader.hasNext()) {
-                    var event = commentsXmlReader.nextEvent();
-                    if (!event.isStartElement()) continue;
-
-                    var startEvent = event.asStartElement();
-                    if (!"row".equals(startEvent.getName().getLocalPart())) continue;
-
-                    var postIdAttribute = startEvent.getAttributeByName(postIdName);
-                    if (postIdAttribute == null) continue;
-                    int postId = Integer.parseInt(postIdAttribute.getValue());
-
-                    var textAttribute = startEvent.getAttributeByName(textName);
-                    if (textAttribute == null) continue;
-                    String text = textAttribute.getValue();
-
-                    nextComment = new Comment(postId, text);
-                    return true;
-                }
-                return false;
-            }
-
-            @Override
-            public Post next() {
-                if (hasNext()) {
-                    var ret = next;
-                    next = null;
-                    return ret;
-                }
-
-                throw new IllegalStateException("No more posts");
-            }
-        };
-    }
-
-    private List<String> parseTags(String tags) {
-        return Arrays.stream(tags.split("<|>"))
-                .filter(s -> !s.isBlank())
-                .collect(Collectors.toList());
-    }
-
-
-    public record Post(String title, List<String> tags, int year, int id, String body, List<Comment> comments) {
-
-    }
-
-    public record Comment(int postId, String text) {
-
-    }
-}
@@ -3,6 +3,7 @@ package nu.marginalia.converting.sideload.stackexchange;
 import lombok.SneakyThrows;
 import nu.marginalia.converting.model.*;
 import nu.marginalia.converting.sideload.SideloadSource;
+import nu.marginalia.integration.stackexchange.sqlite.StackExchangePostsDb;
 import nu.marginalia.keyword.DocumentKeywordExtractor;
 import nu.marginalia.language.sentence.SentenceExtractor;
 import nu.marginalia.model.EdgeDomain;
@@ -15,31 +16,32 @@ import nu.marginalia.model.html.HtmlStandard;
 import nu.marginalia.model.idx.DocumentFlags;
 import nu.marginalia.model.idx.DocumentMetadata;
 import nu.marginalia.model.idx.WordFlags;
+import nu.marginalia.util.SimpleBlockingThreadPool;
 import org.apache.commons.lang3.StringUtils;
 import org.jsoup.Jsoup;
 
-import javax.xml.stream.XMLStreamException;
-import java.io.IOException;
 import java.nio.file.Path;
+import java.util.Arrays;
 import java.util.EnumSet;
 import java.util.Iterator;
 import java.util.List;
+import java.util.concurrent.ArrayBlockingQueue;
+import java.util.concurrent.TimeUnit;
 
-/** This code is broken */
-@Deprecated()
 public class StackexchangeSideloader implements SideloadSource {
-    private final StackExchange7zReader reader;
     private final SentenceExtractor sentenceExtractor;
     private final DocumentKeywordExtractor keywordExtractor;
     private final String domainName;
 
-    public StackexchangeSideloader(Path pathTo7zFile,
-                                   String domainName,
+    private final Path dbFile;
+
+    @SneakyThrows
+    public StackexchangeSideloader(Path pathToDbFile,
                                    SentenceExtractor sentenceExtractor,
                                    DocumentKeywordExtractor keywordExtractor
     ) {
-        this.domainName = domainName;
-        reader = new StackExchange7zReader(pathTo7zFile);
+        this.dbFile = pathToDbFile;
+        this.domainName = StackExchangePostsDb.getDomainName(pathToDbFile);
         this.sentenceExtractor = sentenceExtractor;
         this.keywordExtractor = keywordExtractor;
     }
@@ -57,36 +59,48 @@ public class StackexchangeSideloader implements SideloadSource {
 
     @Override
     public Iterator<ProcessedDocument> getDocumentsStream() {
-        try {
-            var baseIter = reader.postIterator();
-            return new Iterator<>() {
-
-                @Override
-                public boolean hasNext() {
-                    return baseIter.hasNext();
-                }
+        var postsReader = new PostsReader();
+        Thread readerThread = new Thread(postsReader);
+        readerThread.setDaemon(true);
+        readerThread.start();
+
+        return new Iterator<>() {
+            ProcessedDocument nextModel = null;
+
+            @SneakyThrows
+            @Override
+            public boolean hasNext() {
+                if (nextModel != null)
+                    return true;
+                nextModel = postsReader.next();
+
+                return nextModel != null;
+            }
+
+            @Override
+            public ProcessedDocument next() {
+                if (hasNext()) {
+                    var ret = nextModel;
+                    nextModel = null;
+                    return ret;
+                }
 
-                @Override
-                public ProcessedDocument next() {
-                    return convert(baseIter.next());
-                }
-            };
-        } catch (IOException e) {
-            throw new RuntimeException(e);
-        } catch (XMLStreamException e) {
-            throw new RuntimeException(e);
-        }
+                throw new IllegalStateException();
+            }
+        };
     }
 
     @SneakyThrows
-    private ProcessedDocument convert(StackExchange7zReader.Post post) {
-        String fullUrl = "https://" + domainName + "/questions/" + post.id();
+    private ProcessedDocument convert(StackExchangePostsDb.CombinedPostModel post) {
+        String fullUrl = "https://" + domainName + "/questions/" + post.threadId();
 
         StringBuilder fullHtml = new StringBuilder();
         fullHtml.append("<!DOCTYPE html><html><head><title>").append(post.title()).append("</title></head><body>");
         fullHtml.append("<p>").append(post.title()).append("</p>");
-        for (var comment : post.comments()) {
-            fullHtml.append("<p>").append(comment.text()).append("</p>");
+        for (var comment : post.bodies()) {
+            fullHtml.append("<p>").append(comment).append("</p>");
         }
         fullHtml.append("</body></html>");
 
@@ -99,10 +113,17 @@ public class StackexchangeSideloader implements SideloadSource {
 
         ret.url = url;
         ret.words = keywordExtractor.extractKeywords(dld, url);
-        ret.words.addJustNoMeta("site:"+domainName);
-        ret.words.addJustNoMeta("site:"+url.domain.domain);
-        ret.words.addJustNoMeta(url.domain.domain);
-        ret.words.setFlagOnMetadataForWords(WordFlags.Subjects, post.tags());
+        ret.words.addAllSyntheticTerms(List.of(
+                "site:" + domainName,
+                "site:" + url.domain.domain,
+                url.domain.domain
+        ));
+
+        if (!post.tags().isBlank()) {
+            List<String> subjects = Arrays.asList(post.tags().split(","));
+            ret.words.setFlagOnMetadataForWords(WordFlags.Subjects, subjects);
+        }
 
         ret.details = new ProcessedDocumentDetails();
         ret.details.pubYear = post.year();
         ret.details.quality = 5;
@@ -129,4 +150,44 @@ public class StackexchangeSideloader implements SideloadSource {
 
         return ret;
     }
+
+    class PostsReader implements Runnable {
+        private final ArrayBlockingQueue<ProcessedDocument> results = new ArrayBlockingQueue<>(16);
+        private final SimpleBlockingThreadPool pool = new SimpleBlockingThreadPool("Sideloading Stackexchange", 16, 4);
+        volatile boolean isRunning = true;
+
+        public void run() {
+            try {
+                StackExchangePostsDb.forEachPost(dbFile, this::enqueue);
+            }
+            finally {
+                isRunning = false;
+                pool.shutDown();
+            }
+        }
+
+        @SneakyThrows
+        private boolean enqueue(StackExchangePostsDb.CombinedPostModel model) {
+            pool.submit(() -> results.put(convert(model)));
+
+            return true;
+        }
+
+        public ProcessedDocument next() throws InterruptedException {
+            do {
+                var next = results.poll(1, TimeUnit.SECONDS);
+                if (next != null) {
+                    return next;
+                }
+            } while (!isFinished());
+
+            return null;
+        }
+
+        public boolean isFinished() {
+            return !isRunning &&
+                    results.isEmpty() &&
+                    pool.isTerminated();
+        }
+    }
 }
@ -1,24 +0,0 @@
|
||||
package nu.marginalia.converting.sideload;
|
||||
|
||||
import nu.marginalia.converting.sideload.stackexchange.StackExchange7zReader;
|
||||
import org.junit.jupiter.api.Disabled;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import javax.xml.stream.XMLStreamException;
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Path;
|
||||
|
||||
class StackexchangeSideloaderTest {
|
||||
@Test
|
||||
@Disabled
|
||||
public void test7zFile() throws IOException, XMLStreamException {
|
||||
var stackExchangeReader = new StackExchange7zReader(Path.of("/mnt/storage/stackexchange/scifi.meta.stackexchange.com.7z"));
|
||||
|
||||
System.out.println(stackExchangeReader.getIds());
|
||||
|
||||
var iter = stackExchangeReader.postIterator();
|
||||
while (iter.hasNext()) {
|
||||
System.out.println(iter.next());
|
||||
}
|
||||
}
|
||||
}
|
@@ -190,6 +190,7 @@ public class ControlService extends Service {
         Spark.post("/public/actions/truncate-links-database", controlActionsService::truncateLinkDatabase, redirectToActors);
         Spark.post("/public/actions/sideload-encyclopedia", controlActionsService::sideloadEncyclopedia, redirectToActors);
         Spark.post("/public/actions/sideload-dirtree", controlActionsService::sideloadDirtree, redirectToActors);
+        Spark.post("/public/actions/sideload-stackexchange", controlActionsService::sideloadStackexchange, redirectToActors);
 
         // Review Random Domains
         Spark.get("/public/review-random-domains", this::reviewRandomDomainsModel, reviewRandomDomainsRenderer::render);
@@ -129,6 +129,21 @@ public class ControlActionsService {
         return "";
     }
 
+    public Object sideloadStackexchange(Request request, Response response) throws Exception {
+
+        Path sourcePath = Path.of(request.queryParams("source"));
+        if (!Files.exists(sourcePath)) {
+            Spark.halt(404);
+            return "No such file " + sourcePath;
+        }
+
+        eventLog.logEvent("USER-ACTION", "SIDELOAD STACKEXCHANGE");
+
+        actors.startFrom(Actor.CONVERT, ConvertActor.CONVERT_STACKEXCHANGE, sourcePath.toString());
+
+        return "";
+    }
+
     public Object triggerRepartition(Request request, Response response) throws Exception {
         indexClient.outbox().sendAsync(IndexMqEndpoints.INDEX_REPARTITION, "");
@@ -54,6 +54,20 @@
                 </form>
             </td>
         </tr>
+        <tr>
+            <td><b>Sideload Stackexchange</b><p>
+                Will load a set of pre-converted stackexchange .db files.
+            </td>
+            <td>
+                <form method="post" action="/actions/sideload-stackexchange" onsubmit="return confirm('Confirm sideloading')">
+                    <label for="source">Directory with .db files location on server</label><br>
+                    <input id="source" name="source" value="">
+                    <br><br>
+
+                    <input type="submit" value="Sideload Stackexchange">
+                </form>
+            </td>
+        </tr>
        <tr>
            <td>
                <b>Reload Blogs List</b>
@@ -6,6 +6,7 @@ Start in [📁 ../code/](../code/) and poke around.
 ## Operations
 
 * [System Properties](system-properties.md) - JVM property flags
+* [Sideloading How-To](sideloading-howto.md) - How to sideload various data sets
 
 ## Set-up
 
doc/sideloading-howto.md (new file, 120 lines)
@@ -0,0 +1,120 @@
# Sideloading How-To

(This document is a bit of a draft, to get this down in writing
while it's still fresh in my head.)

Some websites are much larger than others; this includes
Wikipedia, Stack Overflow, and a few more. They are so large
that they are impractical to crawl in the traditional fashion,
but luckily they make data dumps available that can be processed
and loaded into the search engine through other means.
## Sideloading a directory tree

For relatively small websites, ad-hoc side-loading is available directly from a
folder structure on the hard drive. This is intended for loading manuals,
documentation and similar data sets that are large and slowly changing.

A website can be archived with wget, like this:

```bash
UA="search.marginalia.nu" \
DOMAIN="www.example.com" \
wget -nc -x --continue -w 1 -r -U ${UA} -A "html" ${DOMAIN}
```
After doing this to a bunch of websites, create a YAML file something like this:

```yaml
sources:
- name: jdk-20
  dir: "jdk-20/"
  domainName: "docs.oracle.com"
  baseUrl: "https://docs.oracle.com/en/java/javase/20/docs"
  keywords:
  - "java"
  - "docs"
  - "documentation"
  - "javadoc"
- name: python3
  dir: "python-3.11.5/"
  domainName: "docs.python.org"
  baseUrl: "https://docs.python.org/3/"
  keywords:
  - "python"
  - "docs"
  - "documentation"
- name: mariadb.com
  dir: "mariadb.com/"
  domainName: "mariadb.com"
  baseUrl: "https://mariadb.com/"
  keywords:
  - "sql"
  - "docs"
  - "mariadb"
  - "mysql"
```
|parameter|description|
|----|----|
|name|Purely informative|
|dir|Path of website contents relative to the location of the yaml file|
|domainName|The domain name of the website|
|baseUrl|This URL will be prefixed to the contents of `dir`|
|keywords|These supplemental keywords will be injected in each document|
The directory structure corresponding to the above might look like

```
docs-index.yaml
jdk-20/
jdk-20/resources/
jdk-20/api/
jdk-20/api/[...]
jdk-20/specs/
jdk-20/specs/[...]
jdk-20/index.html
mariadb.com
mariadb.com/kb/
mariadb.com/kb/[...]
python-3.11.5
python-3.11.5/genindex-B.html
python-3.11.5/library/
python-3.11.5/distutils/
python-3.11.5/[...]
[...]
```

This yaml-file can be processed and loaded into the search engine through the
Actions view.
## Sideloading Wikipedia

For now, this workflow depends on using the conversion process from
[https://encyclopedia.marginalia.nu/](https://encyclopedia.marginalia.nu/)
to pre-digest the data. This is because that tool depends on OpenZIM, which
has a license that is incompatible with this project.

Build the [encyclopedia.marginalia.nu code](https://github.com/MarginaliaSearch/encyclopedia.marginalia.nu)
and follow the instructions for downloading a ZIM file, and then run something like

```bash
$ ./encyclopedia convert file.zim articles.db
```

This db-file can be processed and loaded into the search engine through the
Actions view.

FIXME: It will currently only point to encyclopedia.marginalia.nu and not main Wikipedia;
this should be made configurable.
## Sideloading Stack Overflow/Stackexchange

Stackexchange makes dumps available on Archive.org. These are unfortunately in a format that
needs some heavy-handed pre-processing before they can be loaded. A tool is available for
this in [tools/stackexchange-converter](../code/tools/stackexchange-converter).
After running `gradlew dist`, this tool is found in `build/dist/stackexchange-converter`.
Follow the instructions in the stackexchange-converter readme to
convert the stackexchange xml.7z-files to sqlite db-files, roughly as sketched below.
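The conversion step looks something like the following; the launcher path and
argument order here are hypothetical, and the converter's readme is authoritative:

```bash
# Hypothetical invocation -- consult the stackexchange-converter readme
# for the actual launcher name and arguments.
./build/dist/stackexchange-converter/bin/stackexchange-converter \
    academia.stackexchange.com.7z \
    academia.stackexchange.com.db
```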
A directory with such db-files can be processed and loaded into the
search engine through the Actions view, using a layout like the one
sketched below.
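Judging by the sideloader code above (`SideloadSourceFactory.sideloadStackexchange`
walks the given directory and picks up every regular file ending in `.db`), the
directory handed to the Actions form might look like this. The file names are
purely illustrative, since the domain name is read from the `metadata` table
inside each db file rather than from the file name:

```
/data/stackexchange/
  academia.stackexchange.com.db
  askubuntu.com.db
  superuser.com.db
```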
@@ -7,12 +7,6 @@ These are JVM system properties used by each service
 |-------------|------------|-------------------------------------------------------|
 | website-url |https://search.marginalia.nu/|Overrides the website URL used in rendering|
 
-## Index Service
-
-|flag| values | description |
-|---|------------|---------------------------------------------------------------|
-|lexiconSizeHint| 1000000000 | The default size of the lexicon, speeds up start time in prod |
-
 ## Crawler Process
 
 |flag| values | description |
 |---|------------|-------------------------------------------------------|
@@ -26,12 +20,5 @@ These are JVM system properties used by each service
 ## Loader Process
 |flag| values | description |
 |---|------------|-------------------------------------------------------|
-|lexiconSizeHint| 800000000 | The default size of the lexicon |
-|local-index-path| /some/path | Selects the location the loader will write index data |
 |crawl.rootDirRewrite|/some/path|Sets the base directory of a crawl plan |
 
 ## Other
 
 |flag| values | description |
 |---|------------|---------------------------------------------|
 |bigstring.disabled| true/false | Disables transparent big string compression |
 |crawl.rootDirRewrite|/some/path|Sets the base directory of a crawl plan |