(stackexchange-integration) Add better comments

This commit is contained in:
Viktor Lofgren 2023-09-20 14:43:06 +02:00
parent 6bbf40d7d2
commit 3b4d08f52b

View File

@ -3,11 +3,8 @@ package nu.marginalia.integration.stackexchange.sqlite;
import com.github.luben.zstd.Zstd;
import gnu.trove.list.TIntList;
import gnu.trove.list.array.TIntArrayList;
import gnu.trove.map.TIntIntMap;
import gnu.trove.map.hash.TIntIntHashMap;
import lombok.SneakyThrows;
import nu.marginalia.integration.stackexchange.xml.StackExchangeXmlPostReader;
import org.apache.commons.compress.compressors.zstandard.ZstdUtils;
import javax.xml.stream.XMLStreamException;
import java.io.IOException;
@ -22,8 +19,19 @@ import java.util.concurrent.ForkJoinPool;
import java.util.concurrent.Future;
import java.util.function.Predicate;
/** Because stackexchange's XML format is a stream of entities that reference their parent,
* and we want to process them in a thread-by-thread order, it is necessary to use something
* to essentially re-order the data.
* <p>
* This class uses SQLite to perform this task. The actual post bodies are compressed to keep
* the size of the file down. It is strongly advisable to read off an SSD and not a mechanical
* hard drive when processing these database files, the difference in processing time is 20 minutes
* vs 6+ hours.
* <p>
*/
public class StackExchangePostsDb {
/** Construct a SQLIte file containing the Posts in the stack exchange-style 7z file */
@SneakyThrows
public static void create(Path sqliteFile,
Path stackExchange7zFile) {
@ -62,6 +70,8 @@ public class StackExchangePostsDb {
var post = iter.next();
insertPost.setInt(1, post.id());
// We invent a new field called threadId, which is the id of the post if it's
// a question, or the parent if it's an answer
if (post.parentId() == null) insertPost.setInt(2, post.id());
else insertPost.setInt(2, post.parentId());
@ -92,6 +102,11 @@ public class StackExchangePostsDb {
}
}
/** Iterate over each post in the sqlite post database.
* Each post will be assigned an ordinal number that is different from the id of the post. This is
* necessary as stackexchange's entry count exceeds the ~67 million entries that UrlIdCodec can encode
* for a single domain, despite having less than 67 million 'threads'.
* */
@SneakyThrows
public static void forEachPost(
Path sqliteFile,
@ -108,20 +123,24 @@ public class StackExchangePostsDb {
WHERE threadId = ?
""")
) {
// Step 1 is to export a list of thread IDs from the database
TIntList threadIds = new TIntArrayList(10_000);
ResultSet rs = selectThreadIds.executeQuery();
while (rs.next()) {
threadIds.add(rs.getInt(1));
}
System.out.println("Got " + threadIds.size() + " IDs");
// Step 2: Iterate over each thread
var idIterator = threadIds.iterator();
int ordinal = 0;
while (idIterator.hasNext()) {
queryPostContents.setInt(1, idIterator.next());
int threadId = idIterator.next();
// Query posts with this threadId
queryPostContents.setInt(1, threadId);
rs = queryPostContents.executeQuery();
List<String> parts = new ArrayList<>();
@ -139,6 +158,7 @@ public class StackExchangePostsDb {
year = Math.min(year, rs.getInt("postYear"));
// Decompress the bodies
byte[] bytes = rs.getBytes("body");
partWork.add(commonPool.submit(
() -> new String(Zstd.decompress(bytes, origSize)
@ -149,7 +169,7 @@ public class StackExchangePostsDb {
parts.add(workItem.get());
}
if (!consumer.test(new CombinedPostModel(ordinal++, title, year, parts)))
if (!consumer.test(new CombinedPostModel(ordinal++, threadId, title, year, parts)))
break;
}
@ -161,11 +181,10 @@ public class StackExchangePostsDb {
}
public record CombinedPostModel(int ordinal,
int threadId,
String title,
int year,
List<String> bodies)
{
}
{ }
}