(loader) Update the size of the keyword files created by the loader

Previously these ended up being about 200 MB each, which is wastefully small. Increasing the size of these files makes the index construction faster.
This commit is contained in:
Viktor Lofgren 2024-01-10 17:09:19 +01:00
parent f44222ce53
commit 14b7680328
4 changed files with 23 additions and 3 deletions

View File

@ -6,7 +6,12 @@ import nu.marginalia.index.journal.model.IndexJournalEntryHeader;
import java.io.IOException;
public interface IndexJournalWriter {
/** Responsible for writing to the index journal.
* <p></p>
* @see IndexJournalWriterSingleFileImpl
* @see IndexJournalWriterPagingImpl
*/
public interface IndexJournalWriter extends AutoCloseable {
void put(IndexJournalEntryHeader header, IndexJournalEntryData entry);
default void put(IndexJournalEntry entry) {
put(entry.header(), entry.data());

View File

@ -10,10 +10,25 @@ import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.nio.file.Path;
/** IndexJournalWriter implementation that creates a sequence of journal files,
* delegating to IndexJournalWriterSingleFileImpl to write the individual files.
*
*/
public class IndexJournalWriterPagingImpl implements IndexJournalWriter {
private final Path outputDir;
private int fileNumber = 0;
/* Number of entries to write to each file before switching to the next.
*
* A large limit increases the memory footprint of the index, but reduces
* the construction time. A small limit reduces the memory footprint, but
* increases the construction time.
*
* The limit is set to 1,000,000, which amounts to about 1 GB on disk.
*/
private static final int SWITCH_LIMIT = 1_000_000;
private final Logger logger = LoggerFactory.getLogger(getClass());
private IndexJournalWriter currentWriter = null;
private int inputsForFile = 0;
@ -35,7 +50,7 @@ public class IndexJournalWriterPagingImpl implements IndexJournalWriter {
@Override
@SneakyThrows
public void put(IndexJournalEntryHeader header, IndexJournalEntryData entry) {
if (++inputsForFile > 100_000) {
if (++inputsForFile > SWITCH_LIMIT) {
inputsForFile = 0;
switchToNextWriter();
}

View File

@ -17,6 +17,7 @@ import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
import java.nio.file.attribute.PosixFilePermissions;
/** IndexJournalWriter implementation that creates a single journal file */
public class IndexJournalWriterSingleFileImpl implements IndexJournalWriter{
private static final int ZSTD_BUFFER_SIZE = 8192;

View File

@ -3,7 +3,6 @@ package nu.marginalia.loading.documents;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import nu.marginalia.io.processed.DocumentRecordParquetFileReader;
import nu.marginalia.io.processed.ProcessedDataFileNames;
import nu.marginalia.keyword.model.DocumentKeywords;
import nu.marginalia.loading.LoaderIndexJournalWriter;
import nu.marginalia.loading.LoaderInputData;