(stackexchange-integration) Add better comments

2023-09-20 14:43:06 +02:00 · 2023-09-20 14:43:06 +02:00 · 3b4d08f52b
commit 3b4d08f52b
parent 6bbf40d7d2
1 changed files with 29 additions and 10 deletions
--- a/code/features-convert/stackexchange-xml/src/main/java/nu/marginalia/integration/stackexchange/sqlite/StackExchangePostsDb.java
+++ b/code/features-convert/stackexchange-xml/src/main/java/nu/marginalia/integration/stackexchange/sqlite/StackExchangePostsDb.java
@ -3,11 +3,8 @@ package nu.marginalia.integration.stackexchange.sqlite;
 import com.github.luben.zstd.Zstd;
 import gnu.trove.list.TIntList;
 import gnu.trove.list.array.TIntArrayList;
-import gnu.trove.map.TIntIntMap;
-import gnu.trove.map.hash.TIntIntHashMap;
 import lombok.SneakyThrows;
 import nu.marginalia.integration.stackexchange.xml.StackExchangeXmlPostReader;
-import org.apache.commons.compress.compressors.zstandard.ZstdUtils;

 import javax.xml.stream.XMLStreamException;
 import java.io.IOException;
@ -22,8 +19,19 @@ import java.util.concurrent.ForkJoinPool;
 import java.util.concurrent.Future;
 import java.util.function.Predicate;

+/**  Because stackexchange's XML format is a stream of entities that reference their parent,
+ * and we want to process them in a thread-by-thread order, it is necessary to use something
+ * to essentially re-order the data.
+ * <p>
+ * This class uses SQLite to perform this task.  The actual post bodies are compressed to keep
+ * the size of the file down.  It is strongly advisable to read off an SSD and not a mechanical
+ * hard drive when processing these database files; the difference in processing time is 20 minutes
+ * vs 6+ hours.
+ * <p>
+ */
 public class StackExchangePostsDb {

+    /** Construct a SQLite file containing the Posts in the stack exchange-style 7z file */
    @SneakyThrows
    public static void create(Path sqliteFile,
                       Path stackExchange7zFile) {
@ -62,6 +70,8 @@ public class StackExchangePostsDb {
                var post = iter.next();
                insertPost.setInt(1, post.id());

+                // We invent a new field called threadId, which is the id of the post if it's
+                // a question, or the parent if it's an answer
                if (post.parentId() == null) insertPost.setInt(2, post.id());
                else insertPost.setInt(2, post.parentId());

@ -92,6 +102,11 @@ public class StackExchangePostsDb {
        }
    }

+    /** Iterate over each post in the sqlite post database.
+     * Each post will be assigned an ordinal number that is different from the id of the post.  This is
+     * necessary as stackexchange's entry count exceeds the ~67 million entries that UrlIdCodec can encode
+     * for a single domain, despite having fewer than 67 million 'threads'.
+     * */
    @SneakyThrows
    public static void forEachPost(
            Path sqliteFile,
@ -108,20 +123,24 @@ public class StackExchangePostsDb {
                    WHERE threadId = ?
                    """)
        ) {
+
+            // Step 1: Export a list of thread IDs from the database
            TIntList threadIds = new TIntArrayList(10_000);
            ResultSet rs = selectThreadIds.executeQuery();
-
            while (rs.next()) {
                threadIds.add(rs.getInt(1));
            }

            System.out.println("Got " + threadIds.size() + " IDs");

+            // Step 2: Iterate over each thread
            var idIterator = threadIds.iterator();
            int ordinal = 0;
-
            while (idIterator.hasNext()) {
-                queryPostContents.setInt(1, idIterator.next());
+                int threadId = idIterator.next();
+
+                // Query posts with this threadId
+                queryPostContents.setInt(1, threadId);
                rs = queryPostContents.executeQuery();

                List<String> parts = new ArrayList<>();
@ -139,6 +158,7 @@ public class StackExchangePostsDb {

                    year = Math.min(year, rs.getInt("postYear"));

+                    // Decompress the bodies
                    byte[] bytes = rs.getBytes("body");
                    partWork.add(commonPool.submit(
                            () -> new String(Zstd.decompress(bytes, origSize)
@ -149,7 +169,7 @@ public class StackExchangePostsDb {
                    parts.add(workItem.get());
                }

-                if (!consumer.test(new CombinedPostModel(ordinal++, title, year, parts)))
+                if (!consumer.test(new CombinedPostModel(ordinal++, threadId, title, year, parts)))
                    break;
            }

@ -161,11 +181,10 @@ public class StackExchangePostsDb {
    }

    public record CombinedPostModel(int ordinal,
+                                    int threadId,
                                    String title,
                                    int year,
                                    List<String> bodies)
-    {
-
-    }
+    { }

 }