(index-journal) Improve documentation and code quality
This commit is contained in:
parent
d970836605
commit
dcc5cfb7c0
@ -6,12 +6,19 @@ This journal is written by [processes/loading-process](../../processes/loading-p
|
||||
when constructing the [forward](../index-forward) and [reverse](../index-reverse)
|
||||
indices.
|
||||
|
||||
The journal format is a file header, followed by a zstd-compressed list of entries,
|
||||
each containing a header with document-level data, and a data section
|
||||
with keyword-level data.
|
||||
|
||||
The journal data may be split into multiple files, and the journal writers and readers
|
||||
are designed to handle this transparently via their respective *Paging* implementations.
|
||||
|
||||
## Central Classes
|
||||
|
||||
### Model
|
||||
* [IndexJournalEntry](src/main/java/nu.marginalia.index/journal/model/IndexJournalEntry.java)
|
||||
* [IndexJournalEntryHeader](src/main/java/nu.marginalia.index/journal/model/IndexJournalEntryHeader.java)
|
||||
* [IndexJournalEntryData](src/main/java/nu.marginalia.index/journal/model/IndexJournalEntryData.java)
|
||||
* [IndexJournalEntry](src/main/java/nu/marginalia/index/journal/model/IndexJournalEntry.java)
|
||||
* [IndexJournalEntryHeader](src/main/java/nu/marginalia/index/journal/model/IndexJournalEntryHeader.java)
|
||||
* [IndexJournalEntryData](src/main/java/nu/marginalia/index/journal/model/IndexJournalEntryData.java)
|
||||
### I/O
|
||||
* [IndexJournalReader](src/main/java/nu.marginalia.index/journal/reader/IndexJournalReader.java)
|
||||
* [IndexJournalWriter](src/main/java/nu.marginalia.index/journal/writer/IndexJournalWriter.java)
|
||||
* [IndexJournalReader](src/main/java/nu/marginalia/index/journal/reader/IndexJournalReader.java)
|
||||
* [IndexJournalWriter](src/main/java/nu/marginalia/index/journal/writer/IndexJournalWriter.java)
|
@ -1,4 +0,0 @@
|
||||
package nu.marginalia.index.journal.model;
|
||||
|
||||
public record IndexJournalFileHeader(long fileSize, long wordCount) {
|
||||
}
|
@ -1,3 +0,0 @@
|
||||
package nu.marginalia.index.journal.model;
|
||||
|
||||
public record IndexJournalStatistics(int highestWord, int documentCardinality) { }
|
@ -1,4 +1,4 @@
|
||||
package nu.marginallia.index.journal;
|
||||
package nu.marginalia.index.journal;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
@ -2,6 +2,14 @@ package nu.marginalia.index.journal.model;
|
||||
|
||||
import nu.marginalia.model.id.UrlIdCodec;
|
||||
|
||||
/** An entry in the index journal.
|
||||
*
|
||||
* @param header the header of the entry, containing document level data
|
||||
* @param data the data of the entry, containing keyword level data
|
||||
*
|
||||
* @see IndexJournalEntryHeader
|
||||
* @see IndexJournalEntryData
|
||||
*/
|
||||
public record IndexJournalEntry(IndexJournalEntryHeader header, IndexJournalEntryData data) {
|
||||
|
||||
public static IndexJournalEntryBuilder builder(long documentId, long documentMeta) {
|
@ -1,11 +1,22 @@
|
||||
package nu.marginalia.index.journal.model;
|
||||
|
||||
import java.io.DataOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.nio.ByteBuffer;
|
||||
import nu.marginalia.index.journal.reader.IndexJournalReader;
|
||||
import nu.marginalia.model.idx.WordMetadata;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.Iterator;
|
||||
|
||||
/** The keyword data of an index journal entry.
|
||||
* The data itself is an interleaved array of
|
||||
* word ids and metadata.
|
||||
* <p>
|
||||
* Odd entries are term ids, even entries are encoded WordMetadata records.
|
||||
* </p>
|
||||
* <p>The recommended way of reading the journal data is through an {@link IndexJournalReader}.</p>
|
||||
*
|
||||
* @see WordMetadata
|
||||
* @see IndexJournalReader
|
||||
*/
|
||||
public class IndexJournalEntryData implements Iterable<IndexJournalEntryData.Record> {
|
||||
private final int size;
|
||||
public final long[] underlyingArray;
|
@ -1,5 +1,20 @@
|
||||
package nu.marginalia.index.journal.model;
|
||||
|
||||
import nu.marginalia.model.crawl.HtmlFeature;
|
||||
import nu.marginalia.model.id.UrlIdCodec;
|
||||
import nu.marginalia.model.idx.DocumentMetadata;
|
||||
|
||||
/** The header of an index journal entry.
|
||||
*
|
||||
* @param entrySize the size of the entry
|
||||
* @param documentFeatures the features of the document, as an encoded HtmlFeature
|
||||
* @param combinedId the combined document id, encoded with UrlIdCodec
|
||||
* @param documentMeta the metadata of the document, as an encoded DocumentMetadata
|
||||
*
|
||||
* @see DocumentMetadata
|
||||
* @see HtmlFeature
|
||||
* @see UrlIdCodec
|
||||
*/
|
||||
public record IndexJournalEntryHeader(int entrySize,
|
||||
int documentFeatures,
|
||||
long combinedId,
|
@ -0,0 +1,10 @@
|
||||
package nu.marginalia.index.journal.model;
|
||||
|
||||
/** The header of an index journal file. This is the first 16 bytes of the file,
|
||||
* and is not compressed.
|
||||
*
|
||||
* @param fileSizeRecords the size of the file in number of records
|
||||
* @param reserved reserved for future use; should be 0
|
||||
*/
|
||||
public record IndexJournalFileHeader(long fileSizeRecords, long reserved) {
|
||||
}
|
@ -8,7 +8,6 @@ import java.io.DataInputStream;
|
||||
import java.io.IOException;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.nio.LongBuffer;
|
||||
import java.util.Arrays;
|
||||
|
||||
public class IndexJournalReadEntry {
|
||||
public final IndexJournalEntryHeader header;
|
@ -1,21 +1,23 @@
|
||||
package nu.marginalia.index.journal.reader;
|
||||
|
||||
import nu.marginalia.index.journal.reader.pointer.IndexJournalPointer;
|
||||
import nu.marginalia.model.idx.WordFlags;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Path;
|
||||
import java.util.function.LongConsumer;
|
||||
import java.util.function.LongPredicate;
|
||||
|
||||
/** Tools for reading the index journal. */
|
||||
public interface IndexJournalReader {
|
||||
int FILE_HEADER_SIZE_LONGS = 2;
|
||||
int FILE_HEADER_SIZE_BYTES = 8 * FILE_HEADER_SIZE_LONGS;
|
||||
|
||||
/** Create a reader for a single file. */
|
||||
static IndexJournalReader singleFile(Path fileName) throws IOException {
|
||||
return new IndexJournalReaderSingleFile(fileName);
|
||||
}
|
||||
|
||||
/** Create a reader for a set of files. */
|
||||
static IndexJournalReader paging(Path baseDir) throws IOException {
|
||||
return new IndexJournalReaderPagingImpl(baseDir);
|
||||
}
|
||||
@ -36,9 +38,13 @@ public interface IndexJournalReader {
|
||||
}
|
||||
}
|
||||
|
||||
/** Create a new pointer to the journal. The IndexJournalPointer is
|
||||
* a two-tiered iterator that allows iteration over both document records
|
||||
* and their keywords.
|
||||
*/
|
||||
IndexJournalPointer newPointer();
|
||||
|
||||
|
||||
/** Reader that filters the entries based on the term metadata. */
|
||||
default IndexJournalReader filtering(LongPredicate termMetaFilter) {
|
||||
return new FilteringIndexJournalReader(this, termMetaFilter);
|
||||
}
|
@ -1,7 +1,7 @@
|
||||
package nu.marginalia.index.journal.reader;
|
||||
|
||||
import nu.marginalia.index.journal.reader.pointer.IndexJournalPointer;
|
||||
import nu.marginallia.index.journal.IndexJournalFileNames;
|
||||
import nu.marginalia.index.journal.IndexJournalFileNames;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
@ -76,7 +76,7 @@ class SingleFileJournalPointer implements IndexJournalPointer {
|
||||
recordIdx = -2;
|
||||
entryData = null;
|
||||
|
||||
if (++docIdx < fileHeader.fileSize()) {
|
||||
if (++docIdx < fileHeader.fileSizeRecords()) {
|
||||
entry = IndexJournalReadEntry.read(dataInputStream);
|
||||
return true;
|
||||
}
|
@ -3,7 +3,7 @@ package nu.marginalia.index.journal.writer;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.index.journal.model.IndexJournalEntryData;
|
||||
import nu.marginalia.index.journal.model.IndexJournalEntryHeader;
|
||||
import nu.marginallia.index.journal.IndexJournalFileNames;
|
||||
import nu.marginalia.index.journal.IndexJournalFileNames;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
@ -123,7 +123,7 @@ public class IndexJournalWriterSingleFileImpl implements IndexJournalWriter{
|
||||
// Finalize the file by writing a header at the beginning
|
||||
ByteBuffer header = ByteBuffer.allocate(16);
|
||||
header.putLong(numEntries);
|
||||
header.putLong(0);
|
||||
header.putLong(0); // reserved for future use
|
||||
header.flip();
|
||||
|
||||
while (header.position() < header.limit()) {
|
@ -2,7 +2,7 @@ package nu.marginalia.index.construction;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.process.control.ProcessHeartbeat;
|
||||
import nu.marginallia.index.journal.IndexJournalFileNames;
|
||||
import nu.marginalia.index.journal.IndexJournalFileNames;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
|
@ -5,14 +5,11 @@ import it.unimi.dsi.fastutil.longs.Long2LongOpenHashMap;
|
||||
import it.unimi.dsi.fastutil.longs.LongIterator;
|
||||
import nu.marginalia.array.LongArray;
|
||||
import nu.marginalia.array.LongArrayFactory;
|
||||
import nu.marginalia.array.algo.SortingContext;
|
||||
import nu.marginalia.index.journal.reader.IndexJournalReader;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.channels.FileChannel;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.StandardOpenOption;
|
||||
|
||||
/** A pair of file-backed arrays of sorted wordIds
|
||||
* and the count of documents associated with each wordId.
|
||||
|
@ -6,7 +6,6 @@ import nu.marginalia.index.construction.DocIdRewriter;
|
||||
import nu.marginalia.index.construction.ReversePreindex;
|
||||
import nu.marginalia.index.construction.TestJournalFactory;
|
||||
import nu.marginalia.index.construction.TestJournalFactory.EntryDataWithWordMeta;
|
||||
import nu.marginalia.index.journal.reader.IndexJournalReader;
|
||||
import org.junit.jupiter.api.AfterEach;
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
@ -12,7 +12,7 @@ import nu.marginalia.index.journal.writer.IndexJournalWriterPagingImpl;
|
||||
import nu.marginalia.index.journal.writer.IndexJournalWriter;
|
||||
import nu.marginalia.keyword.model.DocumentKeywords;
|
||||
import nu.marginalia.model.idx.DocumentMetadata;
|
||||
import nu.marginallia.index.journal.IndexJournalFileNames;
|
||||
import nu.marginalia.index.journal.IndexJournalFileNames;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
|
@ -7,7 +7,7 @@ import nu.marginalia.index.journal.reader.IndexJournalReaderSingleFile;
|
||||
import nu.marginalia.keyword.model.DocumentKeywords;
|
||||
import nu.marginalia.loading.LoaderIndexJournalWriter;
|
||||
import nu.marginalia.model.idx.DocumentMetadata;
|
||||
import nu.marginallia.index.journal.IndexJournalFileNames;
|
||||
import nu.marginalia.index.journal.IndexJournalFileNames;
|
||||
import org.junit.jupiter.api.AfterEach;
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
@ -7,7 +7,7 @@ import nu.marginalia.service.control.ServiceHeartbeat;
|
||||
import nu.marginalia.storage.FileStorageService;
|
||||
import nu.marginalia.storage.model.FileStorageId;
|
||||
import nu.marginalia.storage.model.FileStorageType;
|
||||
import nu.marginallia.index.journal.IndexJournalFileNames;
|
||||
import nu.marginalia.index.journal.IndexJournalFileNames;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
|
@ -10,7 +10,6 @@ import nu.marginalia.index.ReverseIndexFullFileNames;
|
||||
import nu.marginalia.index.ReverseIndexPrioFileNames;
|
||||
import nu.marginalia.index.client.model.query.SearchSpecification;
|
||||
import nu.marginalia.index.client.model.query.SearchSubquery;
|
||||
import nu.marginalia.index.client.model.query.SearchSetIdentifier;
|
||||
import nu.marginalia.index.client.model.results.ResultRankingParameters;
|
||||
import nu.marginalia.index.construction.DocIdRewriter;
|
||||
import nu.marginalia.index.construction.ReverseIndexConstructor;
|
||||
|
@ -7,7 +7,6 @@ import nu.marginalia.storage.FileStorageService;
|
||||
import nu.marginalia.hash.MurmurHash3_128;
|
||||
import nu.marginalia.index.ReverseIndexFullFileNames;
|
||||
import nu.marginalia.index.ReverseIndexPrioFileNames;
|
||||
import nu.marginalia.index.client.model.query.SearchSetIdentifier;
|
||||
import nu.marginalia.index.client.model.query.SearchSpecification;
|
||||
import nu.marginalia.index.client.model.query.SearchSubquery;
|
||||
import nu.marginalia.index.client.model.results.ResultRankingParameters;
|
||||
|
@ -13,7 +13,6 @@ import nu.marginalia.process.control.ProcessHeartbeat;
|
||||
import nu.marginalia.ranking.DomainRankings;
|
||||
import nu.marginalia.index.svc.searchset.SearchSetAny;
|
||||
import nu.marginalia.index.util.TestUtil;
|
||||
import nu.marginalia.index.client.model.query.SearchSetIdentifier;
|
||||
import nu.marginalia.service.control.*;
|
||||
import nu.marginalia.service.id.ServiceId;
|
||||
import nu.marginalia.service.module.ServiceConfiguration;
|
||||
|
Loading…
Reference in New Issue
Block a user