(index-journal) Improve documentation and code quality

This commit is contained in:
Viktor Lofgren 2024-02-15 10:51:49 +01:00
parent d970836605
commit dcc5cfb7c0
25 changed files with 76 additions and 34 deletions

View File

@ -6,12 +6,19 @@ This journal is written by [processes/loading-process](../../processes/loading-p
when constructing the [forward](../index-forward) and [reverse](../index-reverse) when constructing the [forward](../index-forward) and [reverse](../index-reverse)
indices. indices.
A journal file consists of a file header followed by a zstd-compressed list of entries.
Each entry contains a header with document-level data, and a data section
with keyword-level data.
The journal data may be split into multiple files, and the journal writers and readers
are designed to handle this transparently via their *Paging* implementations.
## Central Classes ## Central Classes
### Model ### Model
* [IndexJournalEntry](src/main/java/nu.marginalia.index/journal/model/IndexJournalEntry.java) * [IndexJournalEntry](src/main/java/nu/marginalia/index/journal/model/IndexJournalEntry.java)
* [IndexJournalEntryHeader](src/main/java/nu.marginalia.index/journal/model/IndexJournalEntryHeader.java) * [IndexJournalEntryHeader](src/main/java/nu/marginalia/index/journal/model/IndexJournalEntryHeader.java)
* [IndexJournalEntryData](src/main/java/nu.marginalia.index/journal/model/IndexJournalEntryData.java) * [IndexJournalEntryData](src/main/java/nu/marginalia/index/journal/model/IndexJournalEntryData.java)
### I/O ### I/O
* [IndexJournalReader](src/main/java/nu.marginalia.index/journal/reader/IndexJournalReader.java) * [IndexJournalReader](src/main/java/nu/marginalia/index/journal/reader/IndexJournalReader.java)
* [IndexJournalWriter](src/main/java/nu.marginalia.index/journal/writer/IndexJournalWriter.java) * [IndexJournalWriter](src/main/java/nu/marginalia/index/journal/writer/IndexJournalWriter.java)

View File

@ -1,4 +0,0 @@
package nu.marginalia.index.journal.model;
public record IndexJournalFileHeader(long fileSize, long wordCount) {
}

View File

@ -1,3 +0,0 @@
package nu.marginalia.index.journal.model;
public record IndexJournalStatistics(int highestWord, int documentCardinality) { }

View File

@ -1,4 +1,4 @@
package nu.marginallia.index.journal; package nu.marginalia.index.journal;
import java.io.IOException; import java.io.IOException;
import java.nio.file.Files; import java.nio.file.Files;

View File

@ -2,6 +2,14 @@ package nu.marginalia.index.journal.model;
import nu.marginalia.model.id.UrlIdCodec; import nu.marginalia.model.id.UrlIdCodec;
/** An entry in the index journal.
*
* @param header the header of the entry, containing document level data
* @param data the data of the entry, containing keyword level data
*
* @see IndexJournalEntryHeader
* @see IndexJournalEntryData
*/
public record IndexJournalEntry(IndexJournalEntryHeader header, IndexJournalEntryData data) { public record IndexJournalEntry(IndexJournalEntryHeader header, IndexJournalEntryData data) {
public static IndexJournalEntryBuilder builder(long documentId, long documentMeta) { public static IndexJournalEntryBuilder builder(long documentId, long documentMeta) {

View File

@ -1,11 +1,22 @@
package nu.marginalia.index.journal.model; package nu.marginalia.index.journal.model;
import java.io.DataOutputStream; import nu.marginalia.index.journal.reader.IndexJournalReader;
import java.io.IOException; import nu.marginalia.model.idx.WordMetadata;
import java.nio.ByteBuffer;
import java.util.Arrays; import java.util.Arrays;
import java.util.Iterator; import java.util.Iterator;
/** The keyword data of an index journal entry.
* The data itself is an interleaved array of
* word ids and metadata.
* <p>
* Odd entries are term ids, even entries are encoded WordMetadata records.
* </p>
* <p>The civilized way of reading the journal data is to use an IndexJournalReader</p>
*
* @see WordMetadata
* @see IndexJournalReader
*/
public class IndexJournalEntryData implements Iterable<IndexJournalEntryData.Record> { public class IndexJournalEntryData implements Iterable<IndexJournalEntryData.Record> {
private final int size; private final int size;
public final long[] underlyingArray; public final long[] underlyingArray;

View File

@ -1,5 +1,20 @@
package nu.marginalia.index.journal.model; package nu.marginalia.index.journal.model;
import nu.marginalia.model.crawl.HtmlFeature;
import nu.marginalia.model.id.UrlIdCodec;
import nu.marginalia.model.idx.DocumentMetadata;
/** The header of an index journal entry.
*
* @param entrySize the size of the entry
* @param documentFeatures the features of the document, as an encoded HtmlFeature
* @param combinedId the combined document id, encoded with UrlIdCodec
* @param documentMeta the metadata of the document, as an encoded DocumentMetadata
*
* @see DocumentMetadata
* @see HtmlFeature
* @see UrlIdCodec
*/
public record IndexJournalEntryHeader(int entrySize, public record IndexJournalEntryHeader(int entrySize,
int documentFeatures, int documentFeatures,
long combinedId, long combinedId,

View File

@ -0,0 +1,10 @@
package nu.marginalia.index.journal.model;
/** The header of an index journal file. This is the first 16 bytes of the file
* (two 64-bit longs), and is not compressed.
*
* @param fileSizeRecords the number of records (entries) stored in the file
* @param reserved reserved for future use; should be 0
*/
public record IndexJournalFileHeader(long fileSizeRecords, long reserved) {
}

View File

@ -8,7 +8,6 @@ import java.io.DataInputStream;
import java.io.IOException; import java.io.IOException;
import java.nio.ByteBuffer; import java.nio.ByteBuffer;
import java.nio.LongBuffer; import java.nio.LongBuffer;
import java.util.Arrays;
public class IndexJournalReadEntry { public class IndexJournalReadEntry {
public final IndexJournalEntryHeader header; public final IndexJournalEntryHeader header;

View File

@ -1,21 +1,23 @@
package nu.marginalia.index.journal.reader; package nu.marginalia.index.journal.reader;
import nu.marginalia.index.journal.reader.pointer.IndexJournalPointer; import nu.marginalia.index.journal.reader.pointer.IndexJournalPointer;
import nu.marginalia.model.idx.WordFlags;
import java.io.IOException; import java.io.IOException;
import java.nio.file.Path; import java.nio.file.Path;
import java.util.function.LongConsumer; import java.util.function.LongConsumer;
import java.util.function.LongPredicate; import java.util.function.LongPredicate;
/** Tools for reading the index journal. */
public interface IndexJournalReader { public interface IndexJournalReader {
int FILE_HEADER_SIZE_LONGS = 2; int FILE_HEADER_SIZE_LONGS = 2;
int FILE_HEADER_SIZE_BYTES = 8 * FILE_HEADER_SIZE_LONGS; int FILE_HEADER_SIZE_BYTES = 8 * FILE_HEADER_SIZE_LONGS;
/** Create a reader for a single file. */
static IndexJournalReader singleFile(Path fileName) throws IOException { static IndexJournalReader singleFile(Path fileName) throws IOException {
return new IndexJournalReaderSingleFile(fileName); return new IndexJournalReaderSingleFile(fileName);
} }
/** Create a reader for a set of files. */
static IndexJournalReader paging(Path baseDir) throws IOException { static IndexJournalReader paging(Path baseDir) throws IOException {
return new IndexJournalReaderPagingImpl(baseDir); return new IndexJournalReaderPagingImpl(baseDir);
} }
@ -36,9 +38,13 @@ public interface IndexJournalReader {
} }
} }
/** Create a new pointer to the journal. The IndexJournalPointer is
* a two-tiered iterator that allows iteration over both document records
* and their keywords.
*/
IndexJournalPointer newPointer(); IndexJournalPointer newPointer();
/** Create a reader wrapping this one that filters the entries based on the term metadata. */
default IndexJournalReader filtering(LongPredicate termMetaFilter) { default IndexJournalReader filtering(LongPredicate termMetaFilter) {
return new FilteringIndexJournalReader(this, termMetaFilter); return new FilteringIndexJournalReader(this, termMetaFilter);
} }

View File

@ -1,7 +1,7 @@
package nu.marginalia.index.journal.reader; package nu.marginalia.index.journal.reader;
import nu.marginalia.index.journal.reader.pointer.IndexJournalPointer; import nu.marginalia.index.journal.reader.pointer.IndexJournalPointer;
import nu.marginallia.index.journal.IndexJournalFileNames; import nu.marginalia.index.journal.IndexJournalFileNames;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;

View File

@ -76,7 +76,7 @@ class SingleFileJournalPointer implements IndexJournalPointer {
recordIdx = -2; recordIdx = -2;
entryData = null; entryData = null;
if (++docIdx < fileHeader.fileSize()) { if (++docIdx < fileHeader.fileSizeRecords()) {
entry = IndexJournalReadEntry.read(dataInputStream); entry = IndexJournalReadEntry.read(dataInputStream);
return true; return true;
} }

View File

@ -3,7 +3,7 @@ package nu.marginalia.index.journal.writer;
import lombok.SneakyThrows; import lombok.SneakyThrows;
import nu.marginalia.index.journal.model.IndexJournalEntryData; import nu.marginalia.index.journal.model.IndexJournalEntryData;
import nu.marginalia.index.journal.model.IndexJournalEntryHeader; import nu.marginalia.index.journal.model.IndexJournalEntryHeader;
import nu.marginallia.index.journal.IndexJournalFileNames; import nu.marginalia.index.journal.IndexJournalFileNames;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;

View File

@ -123,7 +123,7 @@ public class IndexJournalWriterSingleFileImpl implements IndexJournalWriter{
// Finalize the file by writing a header in the beginning // Finalize the file by writing a header in the beginning
ByteBuffer header = ByteBuffer.allocate(16); ByteBuffer header = ByteBuffer.allocate(16);
header.putLong(numEntries); header.putLong(numEntries);
header.putLong(0); header.putLong(0); // reserved for future use
header.flip(); header.flip();
while (header.position() < header.limit()) { while (header.position() < header.limit()) {

View File

@ -2,7 +2,7 @@ package nu.marginalia.index.construction;
import lombok.SneakyThrows; import lombok.SneakyThrows;
import nu.marginalia.process.control.ProcessHeartbeat; import nu.marginalia.process.control.ProcessHeartbeat;
import nu.marginallia.index.journal.IndexJournalFileNames; import nu.marginalia.index.journal.IndexJournalFileNames;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;

View File

@ -5,14 +5,11 @@ import it.unimi.dsi.fastutil.longs.Long2LongOpenHashMap;
import it.unimi.dsi.fastutil.longs.LongIterator; import it.unimi.dsi.fastutil.longs.LongIterator;
import nu.marginalia.array.LongArray; import nu.marginalia.array.LongArray;
import nu.marginalia.array.LongArrayFactory; import nu.marginalia.array.LongArrayFactory;
import nu.marginalia.array.algo.SortingContext;
import nu.marginalia.index.journal.reader.IndexJournalReader; import nu.marginalia.index.journal.reader.IndexJournalReader;
import java.io.IOException; import java.io.IOException;
import java.nio.channels.FileChannel;
import java.nio.file.Files; import java.nio.file.Files;
import java.nio.file.Path; import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
/** A pair of file-backed arrays of sorted wordIds /** A pair of file-backed arrays of sorted wordIds
* and the count of documents associated with each wordId. * and the count of documents associated with each wordId.

View File

@ -6,7 +6,6 @@ import nu.marginalia.index.construction.DocIdRewriter;
import nu.marginalia.index.construction.ReversePreindex; import nu.marginalia.index.construction.ReversePreindex;
import nu.marginalia.index.construction.TestJournalFactory; import nu.marginalia.index.construction.TestJournalFactory;
import nu.marginalia.index.construction.TestJournalFactory.EntryDataWithWordMeta; import nu.marginalia.index.construction.TestJournalFactory.EntryDataWithWordMeta;
import nu.marginalia.index.journal.reader.IndexJournalReader;
import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test; import org.junit.jupiter.api.Test;

View File

@ -12,7 +12,7 @@ import nu.marginalia.index.journal.writer.IndexJournalWriterPagingImpl;
import nu.marginalia.index.journal.writer.IndexJournalWriter; import nu.marginalia.index.journal.writer.IndexJournalWriter;
import nu.marginalia.keyword.model.DocumentKeywords; import nu.marginalia.keyword.model.DocumentKeywords;
import nu.marginalia.model.idx.DocumentMetadata; import nu.marginalia.model.idx.DocumentMetadata;
import nu.marginallia.index.journal.IndexJournalFileNames; import nu.marginalia.index.journal.IndexJournalFileNames;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;

View File

@ -7,7 +7,7 @@ import nu.marginalia.index.journal.reader.IndexJournalReaderSingleFile;
import nu.marginalia.keyword.model.DocumentKeywords; import nu.marginalia.keyword.model.DocumentKeywords;
import nu.marginalia.loading.LoaderIndexJournalWriter; import nu.marginalia.loading.LoaderIndexJournalWriter;
import nu.marginalia.model.idx.DocumentMetadata; import nu.marginalia.model.idx.DocumentMetadata;
import nu.marginallia.index.journal.IndexJournalFileNames; import nu.marginalia.index.journal.IndexJournalFileNames;
import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test; import org.junit.jupiter.api.Test;

View File

@ -7,7 +7,7 @@ import nu.marginalia.service.control.ServiceHeartbeat;
import nu.marginalia.storage.FileStorageService; import nu.marginalia.storage.FileStorageService;
import nu.marginalia.storage.model.FileStorageId; import nu.marginalia.storage.model.FileStorageId;
import nu.marginalia.storage.model.FileStorageType; import nu.marginalia.storage.model.FileStorageType;
import nu.marginallia.index.journal.IndexJournalFileNames; import nu.marginalia.index.journal.IndexJournalFileNames;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import com.google.inject.Inject; import com.google.inject.Inject;

View File

@ -10,7 +10,6 @@ import nu.marginalia.index.ReverseIndexFullFileNames;
import nu.marginalia.index.ReverseIndexPrioFileNames; import nu.marginalia.index.ReverseIndexPrioFileNames;
import nu.marginalia.index.client.model.query.SearchSpecification; import nu.marginalia.index.client.model.query.SearchSpecification;
import nu.marginalia.index.client.model.query.SearchSubquery; import nu.marginalia.index.client.model.query.SearchSubquery;
import nu.marginalia.index.client.model.query.SearchSetIdentifier;
import nu.marginalia.index.client.model.results.ResultRankingParameters; import nu.marginalia.index.client.model.results.ResultRankingParameters;
import nu.marginalia.index.construction.DocIdRewriter; import nu.marginalia.index.construction.DocIdRewriter;
import nu.marginalia.index.construction.ReverseIndexConstructor; import nu.marginalia.index.construction.ReverseIndexConstructor;

View File

@ -7,7 +7,6 @@ import nu.marginalia.storage.FileStorageService;
import nu.marginalia.hash.MurmurHash3_128; import nu.marginalia.hash.MurmurHash3_128;
import nu.marginalia.index.ReverseIndexFullFileNames; import nu.marginalia.index.ReverseIndexFullFileNames;
import nu.marginalia.index.ReverseIndexPrioFileNames; import nu.marginalia.index.ReverseIndexPrioFileNames;
import nu.marginalia.index.client.model.query.SearchSetIdentifier;
import nu.marginalia.index.client.model.query.SearchSpecification; import nu.marginalia.index.client.model.query.SearchSpecification;
import nu.marginalia.index.client.model.query.SearchSubquery; import nu.marginalia.index.client.model.query.SearchSubquery;
import nu.marginalia.index.client.model.results.ResultRankingParameters; import nu.marginalia.index.client.model.results.ResultRankingParameters;

View File

@ -13,7 +13,6 @@ import nu.marginalia.process.control.ProcessHeartbeat;
import nu.marginalia.ranking.DomainRankings; import nu.marginalia.ranking.DomainRankings;
import nu.marginalia.index.svc.searchset.SearchSetAny; import nu.marginalia.index.svc.searchset.SearchSetAny;
import nu.marginalia.index.util.TestUtil; import nu.marginalia.index.util.TestUtil;
import nu.marginalia.index.client.model.query.SearchSetIdentifier;
import nu.marginalia.service.control.*; import nu.marginalia.service.control.*;
import nu.marginalia.service.id.ServiceId; import nu.marginalia.service.id.ServiceId;
import nu.marginalia.service.module.ServiceConfiguration; import nu.marginalia.service.module.ServiceConfiguration;