Clean up the index query handling code.

This commit is contained in:
Viktor Lofgren 2023-04-10 14:50:57 +02:00
parent e49b1dd155
commit ccc41d1717
13 changed files with 101 additions and 89 deletions

View File

@ -10,9 +10,14 @@ import java.util.stream.Collectors;
public class IndexQuery {
private final List<EntrySource> sources;
private final List<QueryFilterStepIf> inclusionFilter = new ArrayList<>(10);
public final IndexQueryPriority queryPriority;
public IndexQuery(List<EntrySource> sources) {
public final int fetchSizeMultiplier;
public IndexQuery(List<EntrySource> sources, IndexQueryPriority priority, int fetchSizeMultiplier) {
this.sources = sources;
this.queryPriority = priority;
this.fetchSizeMultiplier = fetchSizeMultiplier;
}
public void addInclusionFilter(QueryFilterStepIf filter) {

View File

@ -0,0 +1,14 @@
package nu.marginalia.index.query;
/** Designates the presumptive value of an {@link IndexQuery}, i.e. how likely
 * the query is expected to be to produce relevant results.
 */
public enum IndexQueryPriority {
/** This query is likely to produce highly relevant results */
BEST,
/** This query may produce relevant results */
GOOD,
/** This is a fallback query; only execute it if no higher-priority query returned any results */
FALLBACK
}

View File

@ -3,7 +3,6 @@ package nu.marginalia.index.full;
import nu.marginalia.array.buffer.LongQueryBuffer;
import nu.marginalia.btree.BTreeReader;
import nu.marginalia.index.query.EntrySource;
import nu.marginalia.index.query.ReverseIndexEntrySourceBehavior;
import static java.lang.Math.min;
@ -15,14 +14,11 @@ public class ReverseIndexFullEntrySource implements EntrySource {
final int entrySize;
private final int wordId;
private final ReverseIndexEntrySourceBehavior behavior;
public ReverseIndexFullEntrySource(BTreeReader reader,
int entrySize,
ReverseIndexEntrySourceBehavior behavior,
int wordId) {
this.reader = reader;
this.behavior = behavior;
this.entrySize = entrySize;
this.wordId = wordId;
@ -37,13 +33,6 @@ public class ReverseIndexFullEntrySource implements EntrySource {
@Override
public void read(LongQueryBuffer buffer) {
if (behavior == ReverseIndexEntrySourceBehavior.DO_NOT_PREFER
&& buffer.hasRetainedData())
{
pos = endOffset;
return;
}
buffer.end = min(buffer.end, endOffset - pos);
reader.readData(buffer.data, buffer.end, pos);
pos += buffer.end;
@ -71,6 +60,6 @@ public class ReverseIndexFullEntrySource implements EntrySource {
@Override
public String indexName() {
return "Priority:" + wordId;
return "Full:" + wordId;
}
}

View File

@ -1,6 +1,5 @@
package nu.marginalia.index.full;
import nu.marginalia.index.query.ReverseIndexEntrySourceBehavior;
import nu.marginalia.index.query.ReverseIndexRejectFilter;
import nu.marginalia.index.query.ReverseIndexRetainFilter;
import nu.marginalia.array.LongArray;
@ -51,7 +50,7 @@ public class ReverseIndexFullReader {
return createReaderNew(offset).findEntry(documentId) >= 0;
}
public EntrySource documents(int wordId, ReverseIndexEntrySourceBehavior behavior) {
public EntrySource documents(int wordId) {
if (null == words) {
logger.warn("Reverse index is not ready, dropping query");
return new EmptyEntrySource();
@ -63,7 +62,7 @@ public class ReverseIndexFullReader {
if (offset < 0) return new EmptyEntrySource();
return new ReverseIndexFullEntrySource(createReaderNew(offset), ReverseIndexFullParameters.ENTRY_SIZE, behavior, wordId);
return new ReverseIndexFullEntrySource(createReaderNew(offset), ReverseIndexFullParameters.ENTRY_SIZE, wordId);
}
public QueryFilterStepIf also(int wordId) {

View File

@ -3,7 +3,6 @@ package nu.marginalia.index.priority;
import nu.marginalia.array.buffer.LongQueryBuffer;
import nu.marginalia.btree.BTreeReader;
import nu.marginalia.index.query.EntrySource;
import nu.marginalia.index.query.ReverseIndexEntrySourceBehavior;
import static java.lang.Math.min;
@ -13,12 +12,10 @@ public class ReverseIndexPriorityEntrySource implements EntrySource {
int pos;
int endOffset;
private final ReverseIndexEntrySourceBehavior behavior;
private final int wordId;
public ReverseIndexPriorityEntrySource(BTreeReader reader, ReverseIndexEntrySourceBehavior behavior, int wordId) {
public ReverseIndexPriorityEntrySource(BTreeReader reader, int wordId) {
this.reader = reader;
this.behavior = behavior;
this.wordId = wordId;
pos = 0;
@ -32,13 +29,6 @@ public class ReverseIndexPriorityEntrySource implements EntrySource {
@Override
public void read(LongQueryBuffer buffer) {
if (behavior == ReverseIndexEntrySourceBehavior.DO_NOT_PREFER
&& buffer.hasRetainedData())
{
pos = endOffset;
return;
}
buffer.end = min(buffer.end, endOffset - pos);
reader.readData(buffer.data, buffer.end, pos);
pos += buffer.end;

View File

@ -1,6 +1,5 @@
package nu.marginalia.index.priority;
import nu.marginalia.index.query.ReverseIndexEntrySourceBehavior;
import nu.marginalia.index.query.EntrySource;
import nu.marginalia.array.LongArray;
import nu.marginalia.btree.BTreeReader;
@ -46,7 +45,7 @@ public class ReverseIndexPriorityReader {
if (offset < 0) return new EmptyEntrySource();
return new ReverseIndexPriorityEntrySource(createReaderNew(offset), ReverseIndexEntrySourceBehavior.DO_PREFER, wordId);
return new ReverseIndexPriorityEntrySource(createReaderNew(offset), wordId);
}
private BTreeReader createReaderNew(long offset) {

View File

@ -1,11 +0,0 @@
package nu.marginalia.index.query;
/** Tells an entry source whether it should always be read, or be skipped
 * once the query buffer already holds retained data.
 */
public enum ReverseIndexEntrySourceBehavior {
/** Eagerly read from this entry source */
DO_PREFER,
/** Do not use this entry source if entries have been fetched
 * from another entry source
 */
DO_NOT_PREFER
}

View File

@ -8,7 +8,6 @@ import nu.marginalia.index.journal.model.IndexJournalEntry;
import nu.marginalia.index.journal.reader.IndexJournalReaderSingleCompressedFile;
import nu.marginalia.index.journal.writer.IndexJournalWriterImpl;
import nu.marginalia.index.journal.writer.IndexJournalWriter;
import nu.marginalia.index.query.ReverseIndexEntrySourceBehavior;
import nu.marginalia.ranking.DomainRankings;
import nu.marginalia.lexicon.KeywordLexicon;
import nu.marginalia.lexicon.journal.KeywordLexiconJournal;
@ -101,22 +100,22 @@ class ReverseIndexFullConverterTest {
System.out.println(reverseIndexReader.isWordInDoc(keywordLexicon.getReadOnly("2"), 3));
var buffer = new LongQueryBuffer(32);
reverseIndexReader.documents(keywordLexicon.getReadOnly("1"), ReverseIndexEntrySourceBehavior.DO_PREFER).read(buffer);
reverseIndexReader.documents(keywordLexicon.getReadOnly("1")).read(buffer);
assertArrayEquals(LongStream.range(1, 17).map(v -> v | (255L << 32)).toArray(), buffer.copyData());
System.out.println(buffer);
buffer.reset();
reverseIndexReader.documents(keywordLexicon.getReadOnly("2"), ReverseIndexEntrySourceBehavior.DO_PREFER).read(buffer);
reverseIndexReader.documents(keywordLexicon.getReadOnly("2")).read(buffer);
assertArrayEquals(LongStream.range(1, 17).map(v -> v*2).map(v -> v | (255L << 32)).toArray(), buffer.copyData());
System.out.println(buffer);
buffer.reset();
reverseIndexReader.documents(keywordLexicon.getReadOnly("3"), ReverseIndexEntrySourceBehavior.DO_PREFER).read(buffer);
reverseIndexReader.documents(keywordLexicon.getReadOnly("3")).read(buffer);
assertArrayEquals(LongStream.range(1, 17).map(v -> v*3).map(v -> v | (255L << 32)).toArray(), buffer.copyData());
System.out.println(buffer);
buffer.reset();
var es = reverseIndexReader.documents(keywordLexicon.getReadOnly("7"), ReverseIndexEntrySourceBehavior.DO_PREFER);
var es = reverseIndexReader.documents(keywordLexicon.getReadOnly("7"));
do {
buffer.reset();
es.read(buffer);

View File

@ -10,7 +10,6 @@ import nu.marginalia.index.journal.reader.IndexJournalReaderSingleCompressedFile
import nu.marginalia.index.journal.writer.IndexJournalWriterImpl;
import nu.marginalia.index.journal.writer.IndexJournalWriter;
import nu.marginalia.index.priority.ReverseIndexPriorityParameters;
import nu.marginalia.index.query.ReverseIndexEntrySourceBehavior;
import nu.marginalia.ranking.DomainRankings;
import nu.marginalia.lexicon.KeywordLexicon;
import nu.marginalia.lexicon.journal.KeywordLexiconJournal;
@ -123,7 +122,7 @@ class ReverseIndexFullConverterTest2 {
for (int i = workSetStart; i < workSetSize; i++) {
var es = reverseReader.documents(i, ReverseIndexEntrySourceBehavior.DO_PREFER);
var es = reverseReader.documents(i);
LongQueryBuffer lqb = new LongQueryBuffer(100);
while (es.hasMore()) {
lqb.reset();
@ -148,7 +147,7 @@ class ReverseIndexFullConverterTest2 {
for (int i = workSetStart; i < workSetSize; i++) {
var es = reverseReader.documents(i, ReverseIndexEntrySourceBehavior.DO_PREFER);
var es = reverseReader.documents(i);
LongQueryBuffer lqb = new LongQueryBuffer(100);
while (es.hasMore()) {
lqb.reset();

View File

@ -3,10 +3,7 @@ package nu.marginalia.index.index;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import nu.marginalia.index.IndexServicesFactory;
import nu.marginalia.index.query.IndexQuery;
import nu.marginalia.index.query.IndexQueryBuilder;
import nu.marginalia.index.query.IndexQueryParams;
import nu.marginalia.index.query.ReverseIndexEntrySourceBehavior;
import nu.marginalia.index.query.*;
import nu.marginalia.index.query.filter.QueryFilterStepFromPredicate;
import nu.marginalia.index.svc.IndexSearchSetsService;
import org.jetbrains.annotations.NotNull;
@ -15,7 +12,6 @@ import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.concurrent.locks.Lock;
@ -102,32 +98,39 @@ public class SearchIndex {
}
final int[] orderedIncludes = terms.sortedDistinctIncludes(this::compareKeywords);
final int[] orderedIncludesPrio = terms.sortedDistinctIncludes(this::compareKeywordsPrio);
List<IndexQueryBuilder> queryHeads = new ArrayList<>(10);
List<IndexQuery> queries = new ArrayList<>(10);
// Fetch more results than specified for short queries, as the query itself is cheap and the
// priority index may contain a considerable amount of less interesting results
final int fetchSizeMultiplier;
if (orderedIncludes.length == 1) fetchSizeMultiplier = 4;
else fetchSizeMultiplier = 1;
// To ensure that good results are processed first, create query heads for the priority index that filter for terms
// that contain pairs of two search terms
if (orderedIncludes.length > 1) {
for (int i = 0; i + 1 < orderedIncludes.length; i++) {
for (int j = i + 1; j < orderedIncludes.length; j++) {
if (orderedIncludesPrio.length > 1) {
for (int i = 0; i + 1 < orderedIncludesPrio.length; i++) {
for (int j = i + 1; j < orderedIncludesPrio.length; j++) {
var entrySource = indexReader
.findPriorityWord(orderedIncludes[i])
.alsoPrio(orderedIncludes[j]);
.findPriorityWord(IndexQueryPriority.BEST, orderedIncludesPrio[i], fetchSizeMultiplier)
.alsoPrio(orderedIncludesPrio[j]);
queryHeads.add(entrySource);
}
}
}
// Next consider entries that appear only once in the priority index
for (var wordId : orderedIncludes) {
queryHeads.add(indexReader.findPriorityWord(wordId));
for (var wordId : orderedIncludesPrio) {
queryHeads.add(indexReader.findPriorityWord(IndexQueryPriority.GOOD, wordId, fetchSizeMultiplier));
}
// Finally consider terms in the full index, but only do this for sufficiently long queries
// as short queries tend to be too underspecified to produce anything other than CPU warmth
if (orderedIncludes.length > 3) {
queryHeads.add(indexReader.findFullWord(orderedIncludes[0], ReverseIndexEntrySourceBehavior.DO_NOT_PREFER));
if (orderedIncludes.length >= 3) {
queryHeads.add(indexReader.findFullWord(IndexQueryPriority.FALLBACK, orderedIncludes[0], fetchSizeMultiplier));
}
for (var query : queryHeads) {
@ -162,6 +165,12 @@ public class SearchIndex {
);
}
/** Comparator over word ids based on {@code indexReader.numHitsPrio}.
 * Ids with fewer reported hits order before ids with more.
 *
 * @param a the first word id
 * @param b the second word id
 * @return negative, zero or positive as {@code a} has fewer, equally many
 *         or more hits than {@code b}
 */
private int compareKeywordsPrio(int a, int b) {
    final long hitsForA = indexReader.numHitsPrio(a);
    final long hitsForB = indexReader.numHitsPrio(b);

    return Long.compare(hitsForA, hitsForB);
}
/** Replaces the values of ids with their associated metadata, or 0L if absent */
public long[] getTermMetadata(int termId, long[] docs) {
return indexReader.getMetadata(termId, docs);

View File

@ -68,6 +68,6 @@ public class SearchIndexQueryBuilder implements IndexQueryBuilder {
/** Returns the {@link IndexQuery} held by this builder.
 *
 * @return the builder's query instance
 */
public IndexQuery build() {
return query;
}
}
}

View File

@ -2,14 +2,10 @@ package nu.marginalia.index.index;
import nu.marginalia.index.forward.ForwardIndexReader;
import nu.marginalia.index.forward.ParamMatchingQueryFilter;
import nu.marginalia.index.query.EntrySource;
import nu.marginalia.index.query.IndexQuery;
import nu.marginalia.index.query.IndexQueryBuilder;
import nu.marginalia.index.query.IndexQueryParams;
import nu.marginalia.index.query.*;
import nu.marginalia.index.query.filter.QueryFilterStepIf;
import nu.marginalia.index.priority.ReverseIndexPriorityReader;
import nu.marginalia.index.full.ReverseIndexFullReader;
import nu.marginalia.index.query.ReverseIndexEntrySourceBehavior;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -31,17 +27,21 @@ public class SearchIndexReader {
this.reverseIndexPriorityReader = reverseIndexPriorityReader;
}
public IndexQueryBuilder findPriorityWord(int wordId) {
return new SearchIndexQueryBuilder(reverseIndexFullReader, reverseIndexPriorityReader, new IndexQuery(
List.of(reverseIndexPriorityReader.priorityDocuments(wordId))), wordId);
/** Creates a query builder whose query reads its entries from the priority
 * reverse index for the given word.
 *
 * @param priority the priority assigned to the resulting query
 * @param wordId the id of the word to look up
 * @param fetchSizeMultiplier the fetch size multiplier stored on the resulting query
 * @return a builder wrapping the newly created query
 */
public IndexQueryBuilder findPriorityWord(IndexQueryPriority priority, int wordId, int fetchSizeMultiplier) {
    var prioritySource = reverseIndexPriorityReader.priorityDocuments(wordId);
    IndexQuery query = new IndexQuery(List.of(prioritySource), priority, fetchSizeMultiplier);

    return new SearchIndexQueryBuilder(reverseIndexFullReader, reverseIndexPriorityReader, query, wordId);
}
public IndexQueryBuilder findFullWord(int wordId, ReverseIndexEntrySourceBehavior behavior) {
return new SearchIndexQueryBuilder(reverseIndexFullReader, reverseIndexPriorityReader, new IndexQuery(
List.of(reverseIndexFullReader.documents(wordId, behavior))), wordId);
/** Creates a query builder whose query reads its entries from the full
 * reverse index for the given word.
 *
 * @param priority the priority assigned to the resulting query
 * @param wordId the id of the word to look up
 * @param fetchSizeMultiplier the fetch size multiplier stored on the resulting query
 * @return a builder wrapping the newly created query
 */
public IndexQueryBuilder findFullWord(IndexQueryPriority priority, int wordId, int fetchSizeMultiplier) {
    var fullSource = reverseIndexFullReader.documents(wordId);
    IndexQuery query = new IndexQuery(List.of(fullSource), priority, fetchSizeMultiplier);

    return new SearchIndexQueryBuilder(reverseIndexFullReader, reverseIndexPriorityReader, query, wordId);
}
QueryFilterStepIf filterForParams(IndexQueryParams params) {
/** Creates a filter step that matches entries against the given query
 * parameters, backed by the forward index reader.
 *
 * @param params the query parameters to match against
 * @return a new {@code ParamMatchingQueryFilter} for {@code params}
 */
public QueryFilterStepIf filterForParams(IndexQueryParams params) {
    final QueryFilterStepIf paramFilter = new ParamMatchingQueryFilter(params, forwardIndexReader);
    return paramFilter;
}

View File

@ -17,6 +17,7 @@ import nu.marginalia.index.client.model.query.SearchSpecification;
import nu.marginalia.array.buffer.LongQueryBuffer;
import nu.marginalia.index.index.SearchIndex;
import nu.marginalia.index.index.SearchIndexSearchTerms;
import nu.marginalia.index.query.IndexQueryPriority;
import nu.marginalia.index.results.IndexMetadataService;
import nu.marginalia.index.searchset.SearchSet;
import nu.marginalia.index.results.IndexResultValuator;
@ -24,6 +25,7 @@ import nu.marginalia.index.query.IndexQuery;
import nu.marginalia.index.results.IndexResultDomainDeduplicator;
import nu.marginalia.index.svc.searchset.SmallSearchSet;
import nu.marginalia.model.gson.GsonFactory;
import nu.marginalia.util.QueryParams;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.slf4j.Marker;
@ -152,13 +154,22 @@ public class IndexQueryService {
private TLongList evaluateSubqueries(SearchParameters params) {
final TLongList results = new TLongArrayList(params.fetchSize);
outer:
// These queries are various term combinations
for (var subquery : params.subqueries) {
if (!params.hasTimeLeft()) {
logger.info("Query timed out {}, ({}), -{}",
subquery.searchTermsInclude, subquery.searchTermsAdvice, subquery.searchTermsExclude);
break;
}
logger.info(queryMarker, "{}", subquery);
final SearchIndexSearchTerms searchTerms = searchTermsSvc.getSearchTerms(subquery);
if (searchTerms.isEmpty()) {
logger.info(queryMarker, "empty");
continue;
}
@ -167,26 +178,35 @@ public class IndexQueryService {
// These queries are different indices for one subquery
List<IndexQuery> queries = params.createIndexQueries(index, searchTerms);
for (var query : queries) {
var resultsForSq = executeQuery(query, params, fetchSizeMultiplier(params, searchTerms));
if (!params.hasTimeLeft()) {
break;
}
if (omitQuery(params, query, results.size())) {
logger.info(queryMarker, "Omitting {}", query);
continue;
}
var resultsForSq = executeQuery(query, params);
logger.info(queryMarker, "{} from {}", resultsForSq.size(), query);
results.addAll(resultsForSq);
if (!params.hasTimeLeft()) {
logger.info("Query timed out {}, ({}), -{}",
subquery.searchTermsInclude, subquery.searchTermsAdvice, subquery.searchTermsExclude);
break outer;
}
}
}
return results;
}
private int fetchSizeMultiplier(SearchParameters params, SearchIndexSearchTerms terms) {
if (terms.size() == 1) {
return 4;
}
return 1;
/** Decides whether a query should be skipped, based on its priority and
 * the number of results gathered so far.
 * <ul>
 *   <li>{@code BEST} queries are never omitted</li>
 *   <li>{@code GOOD} queries are omitted once more than a quarter of
 *       {@code params.fetchSize} results have been found</li>
 *   <li>{@code FALLBACK} queries are omitted as soon as any result exists</li>
 * </ul>
 *
 * @param params the search parameters providing the fetch size
 * @param query the query under consideration
 * @param resultCount the number of results collected so far
 * @return {@code true} if the query should not be executed
 */
private boolean omitQuery(SearchParameters params, IndexQuery query, int resultCount) {
    return switch (query.queryPriority) {
        case FALLBACK -> resultCount != 0;
        case GOOD -> resultCount > params.fetchSize / 4;
        case BEST -> false;
    };
}
private void logSearchTerms(SearchSubquery subquery, SearchIndexSearchTerms searchTerms) {
@ -214,9 +234,9 @@ public class IndexQueryService {
}
}
private TLongArrayList executeQuery(IndexQuery query, SearchParameters params, int fetchSizeMultiplier)
private TLongArrayList executeQuery(IndexQuery query, SearchParameters params)
{
final int fetchSize = params.fetchSize * fetchSizeMultiplier;
final int fetchSize = params.fetchSize * query.fetchSizeMultiplier;
final TLongArrayList results = new TLongArrayList(fetchSize);
final LongQueryBuffer buffer = new LongQueryBuffer(fetchSize);