(index-journal) Improve documentation and code quality

This commit is contained in:
Viktor Lofgren 2024-02-15 10:51:49 +01:00
parent d970836605
commit dcc5cfb7c0
25 changed files with 76 additions and 34 deletions

View File

@ -6,12 +6,19 @@ This journal is written by [processes/loading-process](../../processes/loading-p
when constructing the [forward](../index-forward) and [reverse](../index-reverse)
indices.
A journal file consists of an uncompressed file header, followed by a zstd-compressed list of entries.
Each entry contains a header with document-level data, and a data section
with keyword-level data.
The journal data may be split into multiple files, and the journal writers and readers
are designed to handle this transparently via their *Paging* implementation.
## Central Classes
### Model
* [IndexJournalEntry](src/main/java/nu.marginalia.index/journal/model/IndexJournalEntry.java)
* [IndexJournalEntryHeader](src/main/java/nu.marginalia.index/journal/model/IndexJournalEntryHeader.java)
* [IndexJournalEntryData](src/main/java/nu.marginalia.index/journal/model/IndexJournalEntryData.java)
* [IndexJournalEntry](src/main/java/nu/marginalia/index/journal/model/IndexJournalEntry.java)
* [IndexJournalEntryHeader](src/main/java/nu/marginalia/index/journal/model/IndexJournalEntryHeader.java)
* [IndexJournalEntryData](src/main/java/nu/marginalia/index/journal/model/IndexJournalEntryData.java)
### I/O
* [IndexJournalReader](src/main/java/nu.marginalia.index/journal/reader/IndexJournalReader.java)
* [IndexJournalWriter](src/main/java/nu.marginalia.index/journal/writer/IndexJournalWriter.java)
* [IndexJournalReader](src/main/java/nu/marginalia/index/journal/reader/IndexJournalReader.java)
* [IndexJournalWriter](src/main/java/nu/marginalia/index/journal/writer/IndexJournalWriter.java)

View File

@ -1,4 +0,0 @@
package nu.marginalia.index.journal.model;
public record IndexJournalFileHeader(long fileSize, long wordCount) {
}

View File

@ -1,3 +0,0 @@
package nu.marginalia.index.journal.model;
public record IndexJournalStatistics(int highestWord, int documentCardinality) { }

View File

@ -1,4 +1,4 @@
package nu.marginallia.index.journal;
package nu.marginalia.index.journal;
import java.io.IOException;
import java.nio.file.Files;

View File

@ -2,6 +2,14 @@ package nu.marginalia.index.journal.model;
import nu.marginalia.model.id.UrlIdCodec;
/** An entry in the index journal.
*
* @param header the header of the entry, containing document level data
* @param data the data of the entry, containing keyword level data
*
* @see IndexJournalEntryHeader
* @see IndexJournalEntryData
*/
public record IndexJournalEntry(IndexJournalEntryHeader header, IndexJournalEntryData data) {
public static IndexJournalEntryBuilder builder(long documentId, long documentMeta) {

View File

@ -1,11 +1,22 @@
package nu.marginalia.index.journal.model;
import java.io.DataOutputStream;
import java.io.IOException;
import java.nio.ByteBuffer;
import nu.marginalia.index.journal.reader.IndexJournalReader;
import nu.marginalia.model.idx.WordMetadata;
import java.util.Arrays;
import java.util.Iterator;
/** The keyword data of an index journal entry.
* The data itself is an interleaved array of
* word ids and metadata.
* <p>
* Entries come in pairs: counting from one, odd entries are term ids and even entries are encoded WordMetadata records.
* </p>
* <p>The civilized way of reading the journal data is to use an IndexJournalReader</p>
*
* @see WordMetadata
* @see IndexJournalReader
*/
public class IndexJournalEntryData implements Iterable<IndexJournalEntryData.Record> {
private final int size;
public final long[] underlyingArray;

View File

@ -1,5 +1,20 @@
package nu.marginalia.index.journal.model;
import nu.marginalia.model.crawl.HtmlFeature;
import nu.marginalia.model.id.UrlIdCodec;
import nu.marginalia.model.idx.DocumentMetadata;
/** The header of an index journal entry.
*
* @param entrySize the size of the entry
* @param documentFeatures the features of the document, as an encoded HtmlFeature
* @param combinedId the combined document id, encoded with UrlIdCodec
* @param documentMeta the metadata of the document, as an encoded DocumentMetadata
*
* @see DocumentMetadata
* @see HtmlFeature
* @see UrlIdCodec
*/
public record IndexJournalEntryHeader(int entrySize,
int documentFeatures,
long combinedId,

View File

@ -0,0 +1,10 @@
package nu.marginalia.index.journal.model;
/** The header of an index journal file. This is the first 16 bytes of the file,
* and is not compressed.
* <p>
* The 16 bytes hold two 8-byte longs: the record count, followed by a reserved value.
* </p>
*
* @param fileSizeRecords the size of the file in number of records, i.e. how many journal entries the file holds
* @param reserved reserved for future use; should be 0
*/
public record IndexJournalFileHeader(long fileSizeRecords, long reserved) {
}

View File

@ -8,7 +8,6 @@ import java.io.DataInputStream;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.LongBuffer;
import java.util.Arrays;
public class IndexJournalReadEntry {
public final IndexJournalEntryHeader header;

View File

@ -1,21 +1,23 @@
package nu.marginalia.index.journal.reader;
import nu.marginalia.index.journal.reader.pointer.IndexJournalPointer;
import nu.marginalia.model.idx.WordFlags;
import java.io.IOException;
import java.nio.file.Path;
import java.util.function.LongConsumer;
import java.util.function.LongPredicate;
/** Tools for reading the index journal. */
public interface IndexJournalReader {
int FILE_HEADER_SIZE_LONGS = 2;
int FILE_HEADER_SIZE_BYTES = 8 * FILE_HEADER_SIZE_LONGS;
/** Create a reader for a single journal file.
*
* @param fileName path of the journal file to read
* @return a reader backed by an IndexJournalReaderSingleFile
* @throws IOException if constructing the underlying reader fails with an I/O error
*/
static IndexJournalReader singleFile(Path fileName) throws IOException {
return new IndexJournalReaderSingleFile(fileName);
}
/** Create a reader for a set of files.
*
* @param baseDir base path handed to the paging reader; presumably the directory
*                containing the journal files (TODO confirm against IndexJournalReaderPagingImpl)
* @return a reader backed by an IndexJournalReaderPagingImpl
* @throws IOException if constructing the underlying reader fails with an I/O error
*/
static IndexJournalReader paging(Path baseDir) throws IOException {
return new IndexJournalReaderPagingImpl(baseDir);
}
@ -36,9 +38,13 @@ public interface IndexJournalReader {
}
}
/** Create a new pointer to the journal. The IndexJournalPointer is
* a two-tiered iterator that allows iteration over both document records
* and their keywords.
*
* @return a new IndexJournalPointer for this reader
*/
IndexJournalPointer newPointer();
/** Create a reader that filters the entries based on the term metadata.
*
* @param termMetaFilter predicate over encoded term metadata values; presumably only
*                       terms whose metadata passes the predicate are exposed
*                       (TODO confirm against FilteringIndexJournalReader)
* @return a FilteringIndexJournalReader wrapping this reader and the given predicate
*/
default IndexJournalReader filtering(LongPredicate termMetaFilter) {
return new FilteringIndexJournalReader(this, termMetaFilter);
}

View File

@ -1,7 +1,7 @@
package nu.marginalia.index.journal.reader;
import nu.marginalia.index.journal.reader.pointer.IndexJournalPointer;
import nu.marginallia.index.journal.IndexJournalFileNames;
import nu.marginalia.index.journal.IndexJournalFileNames;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

View File

@ -76,7 +76,7 @@ class SingleFileJournalPointer implements IndexJournalPointer {
recordIdx = -2;
entryData = null;
if (++docIdx < fileHeader.fileSize()) {
if (++docIdx < fileHeader.fileSizeRecords()) {
entry = IndexJournalReadEntry.read(dataInputStream);
return true;
}

View File

@ -3,7 +3,7 @@ package nu.marginalia.index.journal.writer;
import lombok.SneakyThrows;
import nu.marginalia.index.journal.model.IndexJournalEntryData;
import nu.marginalia.index.journal.model.IndexJournalEntryHeader;
import nu.marginallia.index.journal.IndexJournalFileNames;
import nu.marginalia.index.journal.IndexJournalFileNames;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

View File

@ -123,7 +123,7 @@ public class IndexJournalWriterSingleFileImpl implements IndexJournalWriter{
// Finalize the file by writing a header in the beginning
ByteBuffer header = ByteBuffer.allocate(16);
header.putLong(numEntries);
header.putLong(0);
header.putLong(0); // reserved for future use
header.flip();
while (header.position() < header.limit()) {

View File

@ -2,7 +2,7 @@ package nu.marginalia.index.construction;
import lombok.SneakyThrows;
import nu.marginalia.process.control.ProcessHeartbeat;
import nu.marginallia.index.journal.IndexJournalFileNames;
import nu.marginalia.index.journal.IndexJournalFileNames;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

View File

@ -5,14 +5,11 @@ import it.unimi.dsi.fastutil.longs.Long2LongOpenHashMap;
import it.unimi.dsi.fastutil.longs.LongIterator;
import nu.marginalia.array.LongArray;
import nu.marginalia.array.LongArrayFactory;
import nu.marginalia.array.algo.SortingContext;
import nu.marginalia.index.journal.reader.IndexJournalReader;
import java.io.IOException;
import java.nio.channels.FileChannel;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
/** A pair of file-backed arrays of sorted wordIds
* and the count of documents associated with each wordId.

View File

@ -6,7 +6,6 @@ import nu.marginalia.index.construction.DocIdRewriter;
import nu.marginalia.index.construction.ReversePreindex;
import nu.marginalia.index.construction.TestJournalFactory;
import nu.marginalia.index.construction.TestJournalFactory.EntryDataWithWordMeta;
import nu.marginalia.index.journal.reader.IndexJournalReader;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;

View File

@ -12,7 +12,7 @@ import nu.marginalia.index.journal.writer.IndexJournalWriterPagingImpl;
import nu.marginalia.index.journal.writer.IndexJournalWriter;
import nu.marginalia.keyword.model.DocumentKeywords;
import nu.marginalia.model.idx.DocumentMetadata;
import nu.marginallia.index.journal.IndexJournalFileNames;
import nu.marginalia.index.journal.IndexJournalFileNames;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

View File

@ -7,7 +7,7 @@ import nu.marginalia.index.journal.reader.IndexJournalReaderSingleFile;
import nu.marginalia.keyword.model.DocumentKeywords;
import nu.marginalia.loading.LoaderIndexJournalWriter;
import nu.marginalia.model.idx.DocumentMetadata;
import nu.marginallia.index.journal.IndexJournalFileNames;
import nu.marginalia.index.journal.IndexJournalFileNames;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;

View File

@ -7,7 +7,7 @@ import nu.marginalia.service.control.ServiceHeartbeat;
import nu.marginalia.storage.FileStorageService;
import nu.marginalia.storage.model.FileStorageId;
import nu.marginalia.storage.model.FileStorageType;
import nu.marginallia.index.journal.IndexJournalFileNames;
import nu.marginalia.index.journal.IndexJournalFileNames;
import org.apache.commons.io.IOUtils;
import com.google.inject.Inject;

View File

@ -10,7 +10,6 @@ import nu.marginalia.index.ReverseIndexFullFileNames;
import nu.marginalia.index.ReverseIndexPrioFileNames;
import nu.marginalia.index.client.model.query.SearchSpecification;
import nu.marginalia.index.client.model.query.SearchSubquery;
import nu.marginalia.index.client.model.query.SearchSetIdentifier;
import nu.marginalia.index.client.model.results.ResultRankingParameters;
import nu.marginalia.index.construction.DocIdRewriter;
import nu.marginalia.index.construction.ReverseIndexConstructor;

View File

@ -7,7 +7,6 @@ import nu.marginalia.storage.FileStorageService;
import nu.marginalia.hash.MurmurHash3_128;
import nu.marginalia.index.ReverseIndexFullFileNames;
import nu.marginalia.index.ReverseIndexPrioFileNames;
import nu.marginalia.index.client.model.query.SearchSetIdentifier;
import nu.marginalia.index.client.model.query.SearchSpecification;
import nu.marginalia.index.client.model.query.SearchSubquery;
import nu.marginalia.index.client.model.results.ResultRankingParameters;

View File

@ -13,7 +13,6 @@ import nu.marginalia.process.control.ProcessHeartbeat;
import nu.marginalia.ranking.DomainRankings;
import nu.marginalia.index.svc.searchset.SearchSetAny;
import nu.marginalia.index.util.TestUtil;
import nu.marginalia.index.client.model.query.SearchSetIdentifier;
import nu.marginalia.service.control.*;
import nu.marginalia.service.id.ServiceId;
import nu.marginalia.service.module.ServiceConfiguration;