(stackexchange-integration) Add better comments
This commit is contained in:
parent
6bbf40d7d2
commit
3b4d08f52b
@ -3,11 +3,8 @@ package nu.marginalia.integration.stackexchange.sqlite;
|
||||
import com.github.luben.zstd.Zstd;
|
||||
import gnu.trove.list.TIntList;
|
||||
import gnu.trove.list.array.TIntArrayList;
|
||||
import gnu.trove.map.TIntIntMap;
|
||||
import gnu.trove.map.hash.TIntIntHashMap;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.integration.stackexchange.xml.StackExchangeXmlPostReader;
|
||||
import org.apache.commons.compress.compressors.zstandard.ZstdUtils;
|
||||
|
||||
import javax.xml.stream.XMLStreamException;
|
||||
import java.io.IOException;
|
||||
@ -22,8 +19,19 @@ import java.util.concurrent.ForkJoinPool;
|
||||
import java.util.concurrent.Future;
|
||||
import java.util.function.Predicate;
|
||||
|
||||
/** Because stackexchange's XML format is a stream of entities that reference their parent,
|
||||
* and we want to process them in a thread-by-thread order, it is necessary to use something
|
||||
* to essentially re-order the data.
|
||||
* <p>
|
||||
* This class uses SQLite to perform this task. The actual post bodies are compressed to keep
|
||||
* the size of the file down. It is strongly advisable to read off an SSD and not a mechanical
|
||||
* hard drive when processing these database files, the difference in processing time is 20 minutes
|
||||
* vs 6+ hours.
|
||||
* <p>
|
||||
*/
|
||||
public class StackExchangePostsDb {
|
||||
|
||||
/** Construct a SQLIte file containing the Posts in the stack exchange-style 7z file */
|
||||
@SneakyThrows
|
||||
public static void create(Path sqliteFile,
|
||||
Path stackExchange7zFile) {
|
||||
@ -62,6 +70,8 @@ public class StackExchangePostsDb {
|
||||
var post = iter.next();
|
||||
insertPost.setInt(1, post.id());
|
||||
|
||||
// We invent a new field called threadId, which is the id of the post if it's
|
||||
// a question, or the parent if it's an answer
|
||||
if (post.parentId() == null) insertPost.setInt(2, post.id());
|
||||
else insertPost.setInt(2, post.parentId());
|
||||
|
||||
@ -92,6 +102,11 @@ public class StackExchangePostsDb {
|
||||
}
|
||||
}
|
||||
|
||||
/** Iterate over each post in the sqlite post database.
|
||||
* Each post will be assigned an ordinal number that is different from the id of the post. This is
|
||||
* necessary as stackexchange's entry count exceeds the ~67 million entries that UrlIdCodec can encode
|
||||
* for a single domain, despite having less than 67 million 'threads'.
|
||||
* */
|
||||
@SneakyThrows
|
||||
public static void forEachPost(
|
||||
Path sqliteFile,
|
||||
@ -108,20 +123,24 @@ public class StackExchangePostsDb {
|
||||
WHERE threadId = ?
|
||||
""")
|
||||
) {
|
||||
|
||||
// Step 1 is to export a list of thread IDs from the database
|
||||
TIntList threadIds = new TIntArrayList(10_000);
|
||||
ResultSet rs = selectThreadIds.executeQuery();
|
||||
|
||||
while (rs.next()) {
|
||||
threadIds.add(rs.getInt(1));
|
||||
}
|
||||
|
||||
System.out.println("Got " + threadIds.size() + " IDs");
|
||||
|
||||
// Step 2: Iterate over each thread
|
||||
var idIterator = threadIds.iterator();
|
||||
int ordinal = 0;
|
||||
|
||||
while (idIterator.hasNext()) {
|
||||
queryPostContents.setInt(1, idIterator.next());
|
||||
int threadId = idIterator.next();
|
||||
|
||||
// Query posts with this threadId
|
||||
queryPostContents.setInt(1, threadId);
|
||||
rs = queryPostContents.executeQuery();
|
||||
|
||||
List<String> parts = new ArrayList<>();
|
||||
@ -139,6 +158,7 @@ public class StackExchangePostsDb {
|
||||
|
||||
year = Math.min(year, rs.getInt("postYear"));
|
||||
|
||||
// Decompress the bodies
|
||||
byte[] bytes = rs.getBytes("body");
|
||||
partWork.add(commonPool.submit(
|
||||
() -> new String(Zstd.decompress(bytes, origSize)
|
||||
@ -149,7 +169,7 @@ public class StackExchangePostsDb {
|
||||
parts.add(workItem.get());
|
||||
}
|
||||
|
||||
if (!consumer.test(new CombinedPostModel(ordinal++, title, year, parts)))
|
||||
if (!consumer.test(new CombinedPostModel(ordinal++, threadId, title, year, parts)))
|
||||
break;
|
||||
}
|
||||
|
||||
@ -161,11 +181,10 @@ public class StackExchangePostsDb {
|
||||
}
|
||||
|
||||
public record CombinedPostModel(int ordinal,
|
||||
int threadId,
|
||||
String title,
|
||||
int year,
|
||||
List<String> bodies)
|
||||
{
|
||||
|
||||
}
|
||||
{ }
|
||||
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user