Index optimizations that should reduce small object churn and IOPS a bit.
parent 460dd098b0
commit beafdfda9c
@@ -36,6 +36,16 @@ sourceSets {
            resources.srcDir file('src/e2e/resources')
        }
    }
    jmh {
        java {
            java {
                compileClasspath += main.output + test.output
                runtimeClasspath += main.output + test.output
                srcDir file('src/jmh/java')
            }
            resources.srcDir file('src/jmh/resources')
        }
    }
}

java {
@@ -43,7 +53,9 @@ java {
        languageVersion.set(JavaLanguageVersion.of(17))
    }
}

jmhJar {
    zip64 true
}
dependencies {
    implementation project(':third_party')

@@ -142,6 +154,9 @@ dependencies {
    implementation 'org.seleniumhq.selenium:selenium-chrome-driver:4.1.4'
    implementation 'org.seleniumhq.selenium:selenium-java:4.3.0'
    implementation 'org.sejda.imageio:webp-imageio:0.1.6'

    jmh 'org.openjdk.jmh:jmh-core:1.35'
    jmh 'org.openjdk.jmh:jmh-generator-annprocess:1.35'
}

configurations {
@@ -0,0 +1,85 @@
package nu.marginalia;

import lombok.SneakyThrows;
import nu.marginalia.util.multimap.MultimapFileLong;
import org.openjdk.jmh.annotations.*;

import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Arrays;
import java.util.stream.IntStream;
import java.util.stream.LongStream;

public class ByteBufferBlockReadVsIndividualRead {

    @State(Scope.Benchmark)
    public static class ByteBufferState {
        private MultimapFileLong mmf;
        private Path file;
        private static final int size = 800*1024*1024;

        @Setup(Level.Iteration)
        @SneakyThrows
        public void setUp() {
            file = Files.createTempFile("jmh", ".dat");
            mmf = MultimapFileLong.forOutput(file, size);
            for (int i = 0; i < size; i++) {
                mmf.put(i, i);
            }
        }

        @TearDown(Level.Iteration)
        @SneakyThrows
        public void tearDown() {
            mmf.close();
            Files.delete(file);
        }

        LongStream basicStream() {
            return IntStream.range(0, size).mapToLong(mmf::get);
        }

        LongStream blockStream(int blockSize) {
            long urlOffset = 0;
            long endOffset = size;

            long[] arry = new long[blockSize];

            return LongStream
                    .iterate(urlOffset, i -> i < endOffset, i -> i + blockSize)
                    .flatMap(pos -> {
                        int sz = (int)(Math.min(pos+blockSize, endOffset) - pos);
                        mmf.read(arry, sz, pos);
                        return Arrays.stream(arry, 0, sz);
                    });
        }
    }

    // @Benchmark @BenchmarkMode(Mode.Throughput)
    // @Fork(value = 1, warmups = 1)
    // @Warmup(iterations = 1)
    public long testBasic(ByteBufferState state) {
        return state.basicStream().sum();
    }

    @Benchmark @BenchmarkMode(Mode.Throughput)
    @Fork(value = 1, warmups = 1)
    @Warmup(iterations = 0)
    public long testBlock128(ByteBufferState state) {
        return state.blockStream(128).sum();
    }

    @Benchmark @BenchmarkMode(Mode.Throughput)
    @Fork(value = 1, warmups = 1)
    @Warmup(iterations = 0)
    public long testBlock1024(ByteBufferState state) {
        return state.blockStream(1024).sum();
    }

    @Benchmark @BenchmarkMode(Mode.Throughput)
    @Fork(value = 1, warmups = 1)
    @Warmup(iterations = 0)
    public long testBlock8192(ByteBufferState state) {
        return state.blockStream(8192).sum();
    }
}
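The benchmark pits one mmf.get() call per long against block reads into a reusable buffer, which is the same access-pattern change applied to SearchIndex further down. A minimal way to run it outside of Gradle is JMH's programmatic Runner API; the launcher class below is only an illustration and is not part of this commit.

package nu.marginalia;

import org.openjdk.jmh.runner.Runner;
import org.openjdk.jmh.runner.RunnerException;
import org.openjdk.jmh.runner.options.Options;
import org.openjdk.jmh.runner.options.OptionsBuilder;

// Hypothetical launcher, not part of this commit: runs the block-read benchmarks
// defined in ByteBufferBlockReadVsIndividualRead via JMH's programmatic API.
public class BenchmarkRunner {
    public static void main(String[] args) throws RunnerException {
        Options opts = new OptionsBuilder()
                .include(ByteBufferBlockReadVsIndividualRead.class.getSimpleName())
                .forks(1)
                .build();
        new Runner(opts).run();
    }
}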
@@ -0,0 +1,111 @@
package nu.marginalia.util.btree;

import nu.marginalia.util.btree.model.BTreeContext;
import nu.marginalia.util.btree.model.BTreeHeader;
import nu.marginalia.util.multimap.MultimapFileLong;
import nu.marginalia.util.multimap.MultimapSearcher;

import static java.lang.Math.min;

public class CachingBTreeReader {

    private final MultimapFileLong file;
    public final BTreeContext ctx;

    private final MultimapSearcher dataSearcher;

    public CachingBTreeReader(MultimapFileLong file, BTreeContext ctx) {
        this.file = file;
        this.dataSearcher = MultimapSearcher.forContext(file, ctx.equalityMask(), ctx.entrySize());

        this.ctx = ctx;
    }

    public BTreeHeader getHeader(long fileOffset) {
        return new BTreeHeader(file.get(fileOffset), file.get(fileOffset+1), file.get(fileOffset+2));
    }

    public Cache prepareCache() {
        return new Cache();
    }

    /**
     * @return file offset of entry matching keyRaw, negative if absent
     */
    public long findEntry(BTreeHeader header, Cache cache, final long keyRaw) {
        final int blockSize = ctx.BLOCK_SIZE_WORDS();

        final long key = keyRaw & ctx.equalityMask();
        final long dataAddress = header.dataOffsetLongs();

        final long searchStart;
        final long numEntries;

        if (header.layers() == 0) { // For small data, there is no index block, only a flat data block
            searchStart = dataAddress;
            numEntries = header.numEntries();
        }
        else {
            cache.load(header);

            long dataLayerOffset = searchIndex(header, cache, key);
            if (dataLayerOffset < 0) {
                return dataLayerOffset;
            }

            searchStart = dataAddress + dataLayerOffset * ctx.entrySize();
            numEntries = min(header.numEntries() - dataLayerOffset, blockSize);
        }

        return dataSearcher.binarySearch(key, searchStart, numEntries);
    }

    private long searchIndex(BTreeHeader header, Cache cache, long key) {
        final int blockSize = ctx.BLOCK_SIZE_WORDS();
        long layerOffset = 0;

        for (int i = header.layers() - 1; i >= 0; --i) {
            final long indexLayerBlockOffset = header.relativeIndexLayerOffset(ctx, i) + layerOffset;

            final long nextLayerOffset = cache.relativePositionInIndex(key, (int) indexLayerBlockOffset, blockSize);
            if (nextLayerOffset < 0)
                return nextLayerOffset;

            layerOffset = blockSize * (nextLayerOffset + layerOffset);
        }

        return layerOffset;
    }

    public class Cache {
        long[] indexData;

        public void load(BTreeHeader header) {
            if (indexData != null)
                return;

            int size = (int)(header.dataOffsetLongs() - header.indexOffsetLongs());
            indexData = new long[size];
            file.read(indexData, header.indexOffsetLongs());
        }

        long relativePositionInIndex(long key, int fromIndex, int n) {
            int low = 0;
            int high = n - 1;

            while (low <= high) {
                int mid = (low + high) >>> 1;
                long midVal = indexData[fromIndex + mid];

                if (midVal < key)
                    low = mid + 1;
                else if (midVal > key)
                    high = mid - 1;
                else
                    return mid;
            }
            return low;
        }
    }
}
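The reader is meant to be paired with one Cache per tree that is probed repeatedly: the first findEntry call pulls the whole index region into a long[] once, and later lookups only binary-search that in-memory copy plus one data block. A rough usage sketch; the helper class, method and variable names are illustrative and not part of the commit.

package nu.marginalia.util.btree;

import nu.marginalia.util.btree.model.BTreeContext;
import nu.marginalia.util.btree.model.BTreeHeader;
import nu.marginalia.util.multimap.MultimapFileLong;

// Hypothetical illustration, not part of the commit: probes one B-tree with a
// shared Cache so the index layers are only read from the file once.
class CachingBTreeReaderUsageExample {
    static long sumFoundValues(MultimapFileLong file, BTreeContext ctx,
                               long treeOffset, long[] keysToProbe) {
        CachingBTreeReader reader = new CachingBTreeReader(file, ctx);
        BTreeHeader header = reader.getHeader(treeOffset);
        CachingBTreeReader.Cache cache = reader.prepareCache();

        long sum = 0;
        for (long key : keysToProbe) {
            long offset = reader.findEntry(header, cache, key);
            if (offset >= 0) {
                sum += file.get(offset); // entry found at this file offset
            }
        }
        return sum;
    }
}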
@@ -93,7 +93,7 @@ public class MultimapFileLong implements AutoCloseable, MultimapFileLongSlice {
        channel = file.getChannel();
        mappedSize = 0;

        logger.debug("Creating multimap file size = {} / buffer size = {}, mode = {}",
        logger.trace("Creating multimap file size = {} / buffer size = {}, mode = {}",
                readableSize(mapSizeBytes), readableSize(8L*bufferSizeWords), mode);
    }

@@ -149,7 +149,7 @@ public class MultimapFileLong implements AutoCloseable, MultimapFileLongSlice {
    }

    public void force() {
        logger.debug("Forcing");
        logger.trace("Forcing");

        for (MappedByteBuffer buffer: mappedByteBuffers) {
            buffer.force();
@@ -125,8 +125,11 @@ public class EdgeIndexBucket {
        else {
            query = indexReader.findWord(block, budget, filter, orderedIncludes[0]);
        }

        for (int i = 1; i < orderedIncludes.length; i++) {
        int i;
        for (i = 1; (i < 2 && i < orderedIncludes.length) || i < orderedIncludes.length-1; i++) {
            query = query.alsoCached(orderedIncludes[i]);
        }
        for (; i < orderedIncludes.length; i++) {
            query = query.also(orderedIncludes[i]);
        }
        for (int term : searchTerms.excludes) {
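The rewritten loop bounds are easy to misread; replaying them in isolation shows that every include term except the last is intersected through the cached alsoCached path, while the final term falls through to the plain also path (a two-term query uses only the cached path). A throwaway sketch, not part of the commit, that just replays the same bounds:

// Standalone demo: prints which include-term positions take the cached path
// vs. the plain path for different include counts, using the exact loop
// conditions from the EdgeIndexBucket change above.
public class IncludeSplitDemo {
    public static void main(String[] args) {
        for (int length = 2; length <= 5; length++) {
            StringBuilder sb = new StringBuilder("length=" + length + ": ");
            int i;
            for (i = 1; (i < 2 && i < length) || i < length - 1; i++) {
                sb.append("alsoCached(").append(i).append(") ");
            }
            for (; i < length; i++) {
                sb.append("also(").append(i).append(") ");
            }
            System.out.println(sb);
        }
    }
}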
@@ -5,6 +5,8 @@ import com.google.inject.name.Named;
import com.upserve.uppend.blobs.NativeIO;
import io.reactivex.rxjava3.schedulers.Schedulers;
import nu.marginalia.util.btree.BTreeReader;
import nu.marginalia.util.btree.CachingBTreeReader;
import nu.marginalia.util.btree.model.BTreeHeader;
import nu.marginalia.util.multimap.MultimapFileLong;
import nu.marginalia.wmsa.edge.index.conversion.SearchIndexConverter;
import org.slf4j.Logger;
@@ -13,6 +15,7 @@ import org.slf4j.LoggerFactory;
import java.io.File;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.util.Arrays;
import java.util.stream.LongStream;

public class SearchIndex implements AutoCloseable {
@@ -21,6 +24,8 @@ public class SearchIndex implements AutoCloseable {
    private final IndexWordsTable words;
    private final RandomAccessFile wordsFile;
    private final BTreeReader bTreeReader;
    private final CachingBTreeReader cachingBTreeReader;

    private final Logger logger;

    @Inject
@@ -40,6 +45,7 @@ public class SearchIndex implements AutoCloseable {
        words = IndexWordsTable.ofFile(wordsFile);

        bTreeReader = new BTreeReader(urls, SearchIndexConverter.urlsBTreeContext);
        cachingBTreeReader = new CachingBTreeReader(urls, SearchIndexConverter.urlsBTreeContext);

        Schedulers.io().scheduleDirect(() -> madvise(urls, bTreeReader));
    }
@@ -64,27 +70,16 @@
        if (length < 0) return 0;
        if (length > 0) return length;

        var range = rangeForWord(wordId);
        if (range.isPresent()) {
            return bTreeReader.getHeader(range.dataOffset).numEntries();
        }
        return 0;
        return rangeForWord(wordId).numEntries();
    }

    public UrlIndexTree rangeForWord(int wordId) {
        return new UrlIndexTree(words.positionForWord(wordId));
    }

    public boolean hasUrl(long url, UrlIndexTree range) {
        if (!range.isPresent())
            return false;

        return bTreeReader.findEntry(bTreeReader.getHeader(range.dataOffset), url) >= 0;
    }

    public class UrlIndexTree {
        final long dataOffset;

        private BTreeHeader header;
        public UrlIndexTree(long dataOffset) {
            this.dataOffset = dataOffset;
        }
@@ -93,15 +88,65 @@ public class SearchIndex implements AutoCloseable {
            if (dataOffset < 0) {
                return LongStream.empty();
            }
            var header = bTreeReader.getHeader(dataOffset);
            if (header == null) {
                header = bTreeReader.getHeader(dataOffset);
            }

            long urlOffset = header.dataOffsetLongs();
            return LongStream.range(urlOffset, urlOffset + header.numEntries()).map(urls::get);
            long endOffset = header.dataOffsetLongs() + header.numEntries();
            int stepSize = Math.min(1024, header.numEntries());

            long[] buffer = new long[stepSize];

            return LongStream
                    .iterate(urlOffset, i -> i < endOffset, i -> i + stepSize)
                    .flatMap(pos -> {
                        int sz = (int)(Math.min(pos+stepSize, endOffset) - pos);
                        urls.read(buffer, sz, pos);
                        return Arrays.stream(buffer, 0, sz);
                    });
        }

        public boolean isPresent() {
            return dataOffset >= 0;
        }

        public long numEntries() {
            if (header != null) {
                return header.numEntries();
            }
            else if (dataOffset < 0) return 0L;
            else {
                header = bTreeReader.getHeader(dataOffset);
                return header.numEntries();
            }
        }

        public boolean hasUrl(long url) {
            if (header != null) {
                return bTreeReader.findEntry(header, url) >= 0;
            }
            else if (dataOffset < 0) return false;
            else {
                header = bTreeReader.getHeader(dataOffset);
                return bTreeReader.findEntry(header, url) >= 0;
            }
        }

        public boolean hasUrl(CachingBTreeReader.Cache cache, long url) {
            if (header != null) {
                return cachingBTreeReader.findEntry(header, cache, url) >= 0;
            }
            else if (dataOffset < 0) return false;
            else {
                header = bTreeReader.getHeader(dataOffset);
                return cachingBTreeReader.findEntry(header, cache, url) >= 0;
            }
        }

        public CachingBTreeReader.Cache createIndexCache() {
            return cachingBTreeReader.prepareCache();
        }
    }
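Together, rangeForWord, createIndexCache and hasUrl(cache, url) let query code resolve a word's posting tree once and then test many candidate URLs against it without re-reading the upper tree layers, which is how the query builder below uses it. A hypothetical helper, shown only to make the calling pattern explicit; it is not part of the commit.

package nu.marginalia.wmsa.edge.index.reader;

import java.util.stream.LongStream;

// Hypothetical helper, not part of the commit: filters candidate urls against the
// posting tree for one word, reusing a single index cache for every probe.
class CachedRangeFilterExample {
    static LongStream retainUrlsInWord(SearchIndex index, int wordId, LongStream candidateUrls) {
        var range = index.rangeForWord(wordId);
        var cache = range.createIndexCache();

        // Each probe is an in-memory index walk plus one data-block search.
        return candidateUrls.filter(url -> range.hasUrl(cache, url));
    }
}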
@@ -1,14 +1,11 @@
package nu.marginalia.wmsa.edge.index.reader;

import com.google.common.cache.Cache;
import com.google.common.cache.CacheBuilder;
import com.google.inject.Inject;
import lombok.SneakyThrows;
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
import nu.marginalia.wmsa.edge.index.reader.query.IndexQueryBuilder;
import nu.marginalia.wmsa.edge.index.reader.query.IndexSearchBudget;
import nu.marginalia.wmsa.edge.index.reader.query.Query;
import org.apache.commons.lang3.tuple.Pair;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

@@ -27,7 +24,6 @@ public class SearchIndexReader implements AutoCloseable {
    private final EnumMap<IndexBlock, IndexQueryBuilder> underspecifiedQueryBuilders;

    private final Logger logger = LoggerFactory.getLogger(getClass());
    private final Cache<Pair<IndexBlock, Integer>, Long> numHitsCache = CacheBuilder.newBuilder().maximumSize(1000).build();

    private static final IndexBlock[] indicesBySearchOrder = new IndexBlock[] {
        IndexBlock.Top,
@@ -131,26 +127,20 @@ public class SearchIndexReader implements AutoCloseable {
        for (var idx : indices.values()) {
            idx.close();
        }
        numHitsCache.invalidateAll();
        numHitsCache.cleanUp();
    }

    @SneakyThrows
    public long numHits(IndexBlock block, int word) {
        return numHitsCache.get(Pair.of(block, word), () -> numHitsForBlockWord(block, word));
    }

    private long numHitsForBlockWord(IndexBlock block, int word) {
        IndexQueryBuilder builder = queryBuilders.get(block);

        if (builder == null)
            return 0L;

        return builder
                .getIndicies()
                .stream()
                .mapToLong(idx -> idx.numUrls(word))
                .sum();
        long hits = 0;
        for (var index : builder.getIndicies()) {
            hits += index.numUrls(word);
        }
        return hits;
    }

    public IndexBlock getBlockForResult(int searchTerm, long urlId) {
@@ -163,7 +153,7 @@ public class SearchIndexReader implements AutoCloseable {

            var range = index.rangeForWord(searchTerm);

            if (index.hasUrl(urlId, range)) {
            if (range.hasUrl(urlId)) {
                return block;
            }
        }
@@ -174,8 +164,8 @@ public class SearchIndexReader implements AutoCloseable {
        final var index = indices.get(block);
        if (null == index) return false;

        final var range = index.rangeForWord(searchTerm);

        return index.hasUrl(urlId, range);
        return index
                .rangeForWord(searchTerm)
                .hasUrl(urlId);
    }
}
@@ -6,7 +6,6 @@ import nu.marginalia.wmsa.edge.index.reader.SearchIndex;
import java.util.Collection;
import java.util.List;
import java.util.Objects;
import java.util.function.Function;
import java.util.function.LongPredicate;
import java.util.function.Supplier;
import java.util.stream.Collectors;
@@ -51,13 +50,13 @@ public class IndexQueryBuilder {

        var fstRange = requiredIndices.get(relevantIndices[0]).rangeForWord(wordId);

        return new QueryForIndices(budget, () ->
                Streams.concat(IntStream.range(1, relevantIndices.length)
                        .mapToObj(i -> underspecifiedPairStream(budget, 1000, relevantIndices[0], relevantIndices[i], wordId))
                        .flatMapToLong(Function.identity()),
                        fstRange.stream().takeWhile(budget::take))
                .filter(filter)
        );
        LongStream priorityStream = underspecifiedPairStream(budget, 1000, relevantIndices[0], relevantIndices[0], wordId);
        for (int i = 1; i < relevantIndices.length; i++) {
            priorityStream = Streams.concat(priorityStream, underspecifiedPairStream(budget, 1000, relevantIndices[0], relevantIndices[i], wordId));
        }
        LongStream stream = LongStream.concat(priorityStream, fstRange.stream().takeWhile(budget::take)).filter(filter);

        return new QueryForIndices(budget, () -> stream);
    }

    private LongStream underspecifiedPairStream(IndexSearchBudget budget, int limit, int firstIdx, int otherIdx, int wordId) {
@@ -77,10 +76,9 @@ public class IndexQueryBuilder {
        }

        var sndRange = snd.rangeForWord(wordId);
        var cache = sndRange.createIndexCache();

        return fst.rangeForWord(wordId).stream().takeWhile(budget::take).limit(limit).filter(
                url -> snd.hasUrl(url, sndRange)
        );
        return fst.rangeForWord(wordId).stream().takeWhile(budget::take).limit(limit).filter(data -> sndRange.hasUrl(cache, data));
    }

@@ -110,6 +108,12 @@ public class IndexQueryBuilder {
                () -> requiredIndices.stream().flatMapToLong(idx -> alsoStream(idx, wordId)));
        }

        @Override
        public Query alsoCached(int wordId) {
            return new QueryForIndices(budget,
                    () -> requiredIndices.stream().flatMapToLong(idx -> alsoStreamCached(idx, wordId)));
        }

        @Override
        public Query not(int wordId) {
            // Happens when an index simply isn't present, won't find data anyway
@@ -123,12 +127,21 @@
        private LongStream alsoStream(SearchIndex idx, int wordId) {
            var range = idx.rangeForWord(wordId);

            return stream().filter(url -> idx.hasUrl(url, range)).takeWhile(budget::take);
            return stream().filter(range::hasUrl).takeWhile(budget::take);
        }

        private LongStream alsoStreamCached(SearchIndex idx, int wordId) {
            var range = idx.rangeForWord(wordId);
            var cache = range.createIndexCache();

            return stream().filter(data -> range.hasUrl(cache, data)).takeWhile(budget::take);
        }

        private LongStream notStream(int wordId) {
            var bodyRange = excludeIndex.rangeForWord(wordId);
            return stream().filter(url -> !excludeIndex.hasUrl(url, bodyRange)).takeWhile(budget::take);
            var cache = bodyRange.createIndexCache();

            return stream().filter(url -> !bodyRange.hasUrl(cache, url)).takeWhile(budget::take);
        }

        public LongStream stream() {
@@ -7,6 +7,9 @@ public interface Query {
        @Override
        public Query also(int wordId) { return this; }

        @Override
        public Query alsoCached(int wordId) { return this; }

        @Override
        public Query not(int wordId) { return this; }

@@ -15,6 +18,8 @@ public interface Query {
    };

    Query also(int wordId);
    Query alsoCached(int wordId);

    Query not(int wordId);

    LongStream stream();
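With alsoCached on the interface, cached and uncached constraints can be mixed on the same query chain, mirroring the EdgeIndexBucket change above. An illustrative fragment; the surrounding variables (indexReader, block, budget, filter, includes, excludes) are assumed to exist and are not part of the commit.

// Illustrative only: first include resolved via findWord, the next include
// intersected through the cached path, the last via the plain path, then an
// excluded term removed before streaming results.
Query query = indexReader.findWord(block, budget, filter, includes[0])
        .alsoCached(includes[1])
        .also(includes[2])
        .not(excludes[0]);

LongStream results = query.stream();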
@@ -0,0 +1,335 @@
package nu.marginalia.util.btree;

import nu.marginalia.util.btree.model.BTreeContext;
import nu.marginalia.util.btree.model.BTreeHeader;
import nu.marginalia.util.multimap.MultimapFileLong;
import org.junit.jupiter.api.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.io.RandomAccessFile;
import java.nio.channels.FileChannel;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.HashSet;
import java.util.Set;
import java.util.StringJoiner;

import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;

class BTreeWriterTestCachedReader {

    final BTreeContext ctx = new BTreeContext(4, 2, 0xFFFF_FFFF_FFFF_FFFFL, 3);
    final BTreeWriter writer = new BTreeWriter(null, ctx);

    Logger logger = LoggerFactory.getLogger(getClass());

    @Test
    void testSmallDataBlock() {
        var header = writer.makeHeader(1024, ctx.BLOCK_SIZE_WORDS()/2);
        assertEquals(1024 + BTreeHeader.BTreeHeaderSizeLongs, header.dataOffsetLongs());
        assertEquals(header.dataOffsetLongs(), header.indexOffsetLongs());
    }

    @Test
    void testLayerCount() {
        int wsq = ctx.BLOCK_SIZE_WORDS()*ctx.BLOCK_SIZE_WORDS();
        int wcub = ctx.BLOCK_SIZE_WORDS()*ctx.BLOCK_SIZE_WORDS()*ctx.BLOCK_SIZE_WORDS();

        assertEquals(2, writer.makeHeader(1024, wsq-1).layers());
        assertEquals(2, writer.makeHeader(1024, wsq).layers());
        assertEquals(3, writer.makeHeader(1024, wsq+1).layers());

        assertEquals(3, writer.makeHeader(1024, wcub-1).layers());
        assertEquals(3, writer.makeHeader(1024, wcub).layers());
        assertEquals(4, writer.makeHeader(1024, wcub+1).layers());
    }

    @Test
    void testLayerOffset() {
        int wcub = ctx.BLOCK_SIZE_WORDS()*ctx.BLOCK_SIZE_WORDS()*ctx.BLOCK_SIZE_WORDS();
        System.out.println(writer.makeHeader(1025, wcub).relativeIndexLayerOffset(ctx, 0));
        System.out.println(writer.makeHeader(1025, wcub).relativeIndexLayerOffset(ctx, 1));
        System.out.println(writer.makeHeader(1025, wcub).relativeIndexLayerOffset(ctx, 2));

        for (int i = 0; i < 1024; i++) {
            var header = writer.makeHeader(0, i);

            printTreeLayout(i, header, ctx);

            if (header.layers() >= 1) {
                assertEquals(1, ctx.indexLayerSize(i, header.layers() - 1) / ctx.BLOCK_SIZE_WORDS());
            }
        }
    }

    private void printTreeLayout(int numEntries, BTreeHeader header, BTreeContext ctx) {
        StringJoiner sj = new StringJoiner(",");
        for (int l = 0; l < header.layers(); l++) {
            sj.add(""+ctx.indexLayerSize(numEntries, l)/ctx.BLOCK_SIZE_WORDS());
        }
        System.out.println(numEntries + ":" + sj);
    }

    @Test
    public void testWriteEntrySize2() throws IOException {

        var tempFile = Files.createTempFile(Path.of("/tmp"), "tst", "dat");
        Set<Integer> toPut = new HashSet<>();

        for (int i = 0; i < 500; i++) {
            while (!toPut.add((int)(Integer.MAX_VALUE * Math.random())));
        }

        int[] data = toPut.stream().mapToInt(Integer::valueOf).sorted().toArray();

        try {
            RandomAccessFile raf = new RandomAccessFile(tempFile.toFile(), "rw");
            MultimapFileLong mmf = new MultimapFileLong(raf, FileChannel.MapMode.READ_WRITE, 10000, 1000);

            {
                var writer = new BTreeWriter(mmf, ctx);
                writer.write(0, toPut.size(), (slice) -> {
                    for (int i = 0; i < data.length; i++) {
                        slice.put(2L*i, data[i]);
                        slice.put(2L*i + 1, i);
                    }
                });
                mmf.force();
            }

            {
                var reader = new CachingBTreeReader(mmf, ctx);
                var cache = reader.prepareCache();
                var header = reader.getHeader(0);
                for (int i = 0; i < data.length; i++) {
                    long offset = reader.findEntry(header, cache, data[i]);
                    assertTrue(offset >= 0, "Negative offset for " + i + " -> " + offset);
                    assertEquals(i, mmf.get(offset+1));
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            Files.delete(tempFile);
        }
    }

    @Test
    public void testWriteEntrySize2Small() throws IOException {

        var tempFile = Files.createTempFile(Path.of("/tmp"), "tst", "dat");
        Set<Integer> toPut = new HashSet<>();

        for (int i = 0; i < 5; i++) {
            while (!toPut.add((int)(Integer.MAX_VALUE * Math.random())));
        }

        int[] data = toPut.stream().mapToInt(Integer::valueOf).sorted().toArray();

        try {
            RandomAccessFile raf = new RandomAccessFile(tempFile.toFile(), "rw");
            MultimapFileLong mmf = new MultimapFileLong(raf, FileChannel.MapMode.READ_WRITE, 10000, 1000);

            {
                var writer = new BTreeWriter(mmf, ctx);
                writer.write(0, toPut.size(), (slice) -> {
                    for (int i = 0; i < data.length; i++) {
                        slice.put(2L*i, data[i]);
                        slice.put(2L*i + 1, i);
                    }
                });
                mmf.force();
            }

            {
                var reader = new CachingBTreeReader(mmf, ctx);
                var cache = reader.prepareCache();
                var header = reader.getHeader(0);
                for (int i = 0; i < data.length; i++) {
                    long offset = reader.findEntry(header, cache, data[i]);
                    assertTrue(offset >= 0, "Negative offset for " + i + " -> " + offset);
                    assertEquals(i, mmf.get(offset+1));
                }

                for (int i = 0; i < 500; i++) {
                    long val = (long)(Long.MAX_VALUE * Math.random());
                    while (toPut.contains((int)val)) val = (long)(Long.MAX_VALUE * Math.random());
                    assertEquals(-1, reader.findEntry(header, cache, val));
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            Files.delete(tempFile);
        }
    }

    @Test
    public void testWriteEqualityNotMasked() throws IOException {
        for (int bs = 2; bs <= 4; bs++) {
            var tempFile = Files.createTempFile(Path.of("/tmp"), "tst", "dat");
            Set<Long> toPut = new HashSet<>();

            var ctx = new BTreeContext(5, 1, ~0, bs);

            for (int i = 0; i < 500; i++) {
                while (!toPut.add((long) (Long.MAX_VALUE * Math.random()))) ;
            }

            long[] data = toPut.stream().mapToLong(Long::valueOf).sorted().toArray();

            try (MultimapFileLong mmf = MultimapFileLong.forOutput(tempFile, 1000)) {
                {
                    var writer = new BTreeWriter(mmf, ctx);
                    writer.write(0, toPut.size(), (slice) -> {
                        for (int i = 0; i < data.length; i++) {
                            slice.put(i, data[i]);
                        }
                    });
                    mmf.force();
                }

                {
                    var reader = new CachingBTreeReader(mmf, ctx);
                    var cache = reader.prepareCache();
                    var header = reader.getHeader(0);

                    printTreeLayout(toPut.size(), header, ctx);

                    for (int i = 0; i < data.length; i++) {
                        long offset = reader.findEntry(header, cache, data[i]);
                        assertTrue(offset >= 0, "Negative offset for " + i + " -> " + offset);
                        assertEquals(data[i], mmf.get(offset));
                    }

                    for (int i = 0; i < 500; i++) {
                        long val = (long) (Long.MAX_VALUE * Math.random());
                        while (toPut.contains(val)) val = (long) (Long.MAX_VALUE * Math.random());
                        assertEquals(-1, reader.findEntry(header, cache, val));
                    }
                }
            } catch (Exception e) {
                e.printStackTrace();
            } finally {
                Files.delete(tempFile);
            }
        }
    }

    @Test
    public void testWriteEqualityMasked() throws IOException {

        for (int bs = 2; bs <= 4; bs++) {
            var tempFile = Files.createTempFile(Path.of("/tmp"), "tst", "dat");
            Set<Long> toPut = new HashSet<>();

            long mask = 0xFFFF_FFFF_0000_0000L;
            var ctx = new BTreeContext(5, 1, mask, bs);

            for (int i = 0; i < 500; i++) {
                while (!toPut.add((long) (Long.MAX_VALUE * Math.random()))) ;
            }

            long[] data = toPut.stream().mapToLong(Long::valueOf).sorted().toArray();

            try (MultimapFileLong mmf = MultimapFileLong.forOutput(tempFile, 1000)) {
                {
                    var writer = new BTreeWriter(mmf, ctx);
                    writer.write(0, toPut.size(), (slice) -> {
                        for (int i = 0; i < data.length; i++) {
                            slice.put(i, data[i]);
                        }
                    });
                    mmf.force();
                }

                {
                    var reader = new CachingBTreeReader(mmf, ctx);
                    var cache = reader.prepareCache();
                    var header = reader.getHeader(0);

                    printTreeLayout(toPut.size(), header, ctx);

                    for (int i = 0; i < data.length; i++) {
                        long offset = reader.findEntry(header, cache, data[i] & mask);
                        assertTrue(offset >= 0, "Negative offset for " + i + " -> " + offset);
                        assertEquals(data[i], mmf.get(offset));
                    }

                    for (int i = 0; i < 500; i++) {
                        long val = (long) (Long.MAX_VALUE * Math.random());
                        while (toPut.contains(val)) val = (long) (Long.MAX_VALUE * Math.random());
                        assertEquals(-1, reader.findEntry(header, cache, val & mask));
                    }
                }
            } catch (Exception e) {
                e.printStackTrace();
            } finally {
                Files.delete(tempFile);
            }
        }
    }

    @Test
    public void testWriteTwoEqualityMasked() throws IOException {

        for (int bs = 2; bs <= 4; bs++) {
            var tempFile = Files.createTempFile(Path.of("/tmp"), "tst", "dat");
            Set<Long> toPut = new HashSet<>();

            long mask = 0xFFFF_FFFF_0000_0000L;
            var ctx = new BTreeContext(5, 2, mask, bs);

            for (int i = 0; i < 500; i++) {
                while (!toPut.add((long) (Long.MAX_VALUE * Math.random()))) ;
            }

            long[] data = toPut.stream().mapToLong(Long::valueOf).sorted().toArray();

            try (MultimapFileLong mmf = MultimapFileLong.forOutput(tempFile, 1000)) {
                {
                    var writer = new BTreeWriter(mmf, ctx);
                    writer.write(0, toPut.size(), (slice) -> {
                        for (int i = 0; i < data.length; i++) {
                            slice.put(i*2L, data[i]);
                            slice.put(i*2L+1, i);
                        }
                    });
                    mmf.force();
                }

                {
                    var reader = new CachingBTreeReader(mmf, ctx);
                    var cache = reader.prepareCache();
                    var header = reader.getHeader(0);

                    printTreeLayout(toPut.size(), header, ctx);

                    for (int i = 0; i < data.length; i++) {
                        long offset = reader.findEntry(header, cache, data[i] & mask);
                        assertTrue(offset >= 0, "Negative offset for " + i + " -> " + offset);
                        assertEquals(data[i], mmf.get(offset));
                        assertEquals(i, mmf.get(offset+1));
                    }

                    for (int i = 0; i < 500; i++) {
                        long val = (long) (Long.MAX_VALUE * Math.random());
                        while (toPut.contains(val)) val = (long) (Long.MAX_VALUE * Math.random());
                        assertEquals(-1, reader.findEntry(header, cache, val & mask));
                    }
                }
            } catch (Exception e) {
                e.printStackTrace();
            } finally {
                Files.delete(tempFile);
            }
        }
    }
}