NPE fix for index buckets that aren't loaded, experimental new query mode for domains.

This commit is contained in:
vlofgren 2022-07-28 17:16:19 +02:00
parent 514074112e
commit 23b7a5fc22
12 changed files with 147 additions and 29 deletions

View File

@ -241,7 +241,6 @@ public class EdgeSearchE2ETest extends E2ETestBase {
Files.move(driver.getScreenshotAs(OutputType.FILE).toPath(), screenshotFilename("site-search"));
assertEquals(List.of("Frog", "Binomial nomenclature", "Mantis", "Amphibian"), getTitlesFromSearchResults(html));
}
@Test

View File

@ -12,7 +12,6 @@ public class RankingDomainData {
private int alias;
private EdgeDomainIndexingState state;
public final int knownUrls;
public boolean peripheral;
public int resolveAlias() {
if (alias == 0) return id;

View File

@ -56,7 +56,7 @@ public class RankingDomainFetcher {
while (rsp.next()) {
int id = rsp.getInt(1);
if (!blacklist.isBlacklisted(id)) {
consumer.accept(new RankingDomainData(id, rsp.getString(2), rsp.getInt(3), EdgeDomainIndexingState.valueOf(rsp.getString(4)), rsp.getInt(5), false));
consumer.accept(new RankingDomainData(id, rsp.getString(2), rsp.getInt(3), EdgeDomainIndexingState.valueOf(rsp.getString(4)), rsp.getInt(5)));
}
}
}

View File

@ -108,6 +108,14 @@ public class PerusePageRankV2 {
domainIndexToId.put(domainIndexToId.size(), id);
domainIdToIndex.put(id, domainIdToIndex.size());
});
domainFetcher.getPeripheralDomains(domainData -> {
int id = domainData.id;
domainsById.put(id, domainData);
domainIndexToId.put(domainIndexToId.size(), id);
domainIdToIndex.put(id, domainIdToIndex.size());
});
linkDataSrc2Dest = new TIntArrayList[domainIndexToId.size()];
linkDataDest2Src = new TIntArrayList[domainIndexToId.size()];

View File

@ -1,21 +1,20 @@
package nu.marginalia.wmsa.edge.index;
import nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalWriter;
import nu.marginalia.wmsa.edge.index.model.EdgeIndexSearchTerms;
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
import nu.marginalia.wmsa.edge.index.reader.SearchIndexReader;
import nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalWriter;
import nu.marginalia.wmsa.edge.index.reader.query.IndexSearchBudget;
import nu.marginalia.wmsa.edge.index.reader.query.Query;
import org.jetbrains.annotations.NotNull;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.*;
import java.util.Comparator;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReadWriteLock;
import java.util.concurrent.locks.ReentrantReadWriteLock;
import java.util.function.LongPredicate;
import java.util.stream.Collectors;
import java.util.stream.LongStream;
public class EdgeIndexBucket {
@ -101,6 +100,10 @@ public class EdgeIndexBucket {
return indexReader != null;
}
/**
 * Delegates a "hot domains" lookup for a single keyword to this bucket's index reader.
 *
 * @param block       index block to search
 * @param wordId      ID of the keyword
 * @param queryDepth  forwarded unchanged to the index reader
 * @param minHitCount forwarded unchanged to the index reader
 * @param maxResults  forwarded unchanged to the index reader
 * @return the reader's result stream, or an empty stream when no index reader
 *         is loaded for this bucket
 */
public LongStream findHotDomainsForKeyword(IndexBlock block, int wordId, int queryDepth, int minHitCount, int maxResults) {
    // A bucket whose index has not been loaded has no reader; answer with
    // "no results" instead of failing with a NullPointerException, in the
    // same way getQuery() guards its access to the reader.
    if (null == indexReader) {
        logger.warn("Index reader not ready {}", block);
        return LongStream.empty();
    }

    return indexReader.findHotDomainsForKeyword(block, wordId, queryDepth, minHitCount, maxResults);
}
public LongStream getQuery(IndexBlock block, LongPredicate filter, IndexSearchBudget budget, EdgeIndexSearchTerms searchTerms) {
if (null == indexReader) {
logger.warn("Index reader not neady {}", block);
@ -114,15 +117,8 @@ public class EdgeIndexBucket {
.mapToInt(Integer::intValue)
.toArray();
if (logger.isDebugEnabled()) {
logger.debug("Includes: ({}); excludes: ({})", Arrays.
stream(orderedIncludes)
.mapToObj(String::valueOf)
.collect(Collectors.joining(",")),
searchTerms.excludes.stream().map(String::valueOf).collect(Collectors.joining(",")));
}
Query query;
if (orderedIncludes.length == 1) {
query = indexReader.findUnderspecified(block, budget, filter, orderedIncludes[0]);
}
@ -136,6 +132,7 @@ public class EdgeIndexBucket {
for (int term : searchTerms.excludes) {
query = query.not(term);
}
return query.stream();
}

View File

@ -32,6 +32,8 @@ import nu.marginalia.wmsa.edge.model.EdgeUrl;
import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet;
import nu.marginalia.wmsa.edge.model.crawl.EdgePageWords;
import nu.marginalia.wmsa.edge.model.search.*;
import nu.marginalia.wmsa.edge.model.search.domain.EdgeDomainSearchResults;
import nu.marginalia.wmsa.edge.model.search.domain.EdgeDomainSearchSpecification;
import org.apache.http.HttpStatus;
import org.jetbrains.annotations.NotNull;
import org.slf4j.Logger;
@ -88,6 +90,7 @@ public class EdgeIndexService extends Service {
Spark.post("/words/", this::putWords);
Spark.post("/search/", this::search, gson::toJson);
Spark.post("/search-domain/", this::searchDomain, gson::toJson);
Spark.post("/dictionary/*", this::getWordId, gson::toJson);
@ -204,6 +207,26 @@ public class EdgeIndexService extends Service {
.toArray();
}
/**
 * Handles {@code POST /search-domain/}: parses an {@link EdgeDomainSearchSpecification}
 * from the request body, resolves its keyword to a word ID and runs a hot-domains
 * query against the requested bucket.
 *
 * @param request  Spark request whose body is the JSON-encoded specification
 * @param response Spark response (not modified here)
 * @return an {@link EdgeDomainSearchResults} holding the keyword and the low 32 bits
 *         of every value produced by the query, wrapped as URL IDs
 */
private Object searchDomain(Request request, Response response) {
    if (indexes.getDictionaryReader() == null) {
        logger.warn("Dictionary reader not yet initialized");
        halt(HttpStatus.SC_SERVICE_UNAVAILABLE, "Come back in a few minutes");
    }

    String json = request.body();
    EdgeDomainSearchSpecification specsSet = gson.fromJson(json, EdgeDomainSearchSpecification.class);

    // Gson yields null for an empty body and leaves absent fields null; reject
    // such requests explicitly rather than failing with a NullPointerException.
    if (specsSet == null || specsSet.keyword == null) {
        logger.warn("Malformed domain search specification");
        halt(HttpStatus.SC_BAD_REQUEST, "Malformed search specification");
    }

    final int wordId = keywordLexicon.getOrInsert(specsSet.keyword);

    // Go through getHotDomainsQuery() so that an out-of-range bucket produces an
    // empty result instead of an unchecked bucket lookup.
    List<EdgeId<EdgeUrl>> urlIds = getHotDomainsQuery(specsSet.bucket, specsSet.block, wordId,
                    specsSet.queryDepth, specsSet.minHitCount, specsSet.maxResults)
            // keep only the lower 32 bits of each combined value
            .mapToObj(lv -> new EdgeId<EdgeUrl>((int) (lv & 0xFFFF_FFFFL)))
            .toList();

    return new EdgeDomainSearchResults(specsSet.keyword, urlIds);
}
private Object search(Request request, Response response) {
if (indexes.getDictionaryReader() == null) {
logger.warn("Dictionary reader not yet initialized");
@ -387,6 +410,16 @@ public class EdgeIndexService extends Service {
}
/**
 * Runs a hot-domains keyword lookup against one index bucket.
 *
 * @return the bucket's result stream, or an empty stream (after logging a
 *         warning) when {@code bucket} is not a valid bucket
 */
public LongStream getHotDomainsQuery(int bucket, IndexBlock block, int wordId,
                                     int queryDepth, int minHitCount, int maxResults) {
    if (indexes.isValidBucket(bucket)) {
        final var targetBucket = indexes.getBucket(bucket);
        return targetBucket.findHotDomainsForKeyword(block, wordId, queryDepth, minHitCount, maxResults);
    }

    logger.warn("Invalid bucket {}", bucket);
    return LongStream.empty();
}
private LongStream getQuery(int bucket, IndexSearchBudget budget, IndexBlock block,
LongPredicate filter, EdgeIndexSearchTerms searchTerms) {
if (!indexes.isValidBucket(bucket)) {

View File

@ -10,10 +10,14 @@ import nu.marginalia.wmsa.client.HttpStatusCode;
import nu.marginalia.wmsa.configuration.ServiceDescriptor;
import nu.marginalia.wmsa.configuration.server.Context;
import nu.marginalia.wmsa.edge.index.model.EdgePutWordsRequest;
import nu.marginalia.wmsa.edge.model.*;
import nu.marginalia.wmsa.edge.model.EdgeDomain;
import nu.marginalia.wmsa.edge.model.EdgeId;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet;
import nu.marginalia.wmsa.edge.model.search.EdgeSearchResultSet;
import nu.marginalia.wmsa.edge.model.search.EdgeSearchSpecification;
import nu.marginalia.wmsa.edge.model.search.domain.EdgeDomainSearchResults;
import nu.marginalia.wmsa.edge.model.search.domain.EdgeDomainSearchSpecification;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -60,6 +64,11 @@ public class EdgeIndexClient extends AbstractDynamicClient {
.blockingGet();
}
/**
 * Posts a domain search specification to the index service's
 * {@code /search-domain/} endpoint and blocks until the first response arrives.
 *
 * @param ctx   request context
 * @param specs the domain search to perform
 * @return the first result emitted for the request
 */
@CheckReturnValue
public EdgeDomainSearchResults queryDomains(Context ctx, EdgeDomainSearchSpecification specs) {
    final var pendingResults = this.postGet(ctx, "/search-domain/", specs, EdgeDomainSearchResults.class);

    return pendingResults.blockingFirst();
}
@CheckReturnValue
public Observable<Boolean> isBlocked(Context ctx) {

View File

@ -13,8 +13,10 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.EnumMap;
import java.util.List;
import java.util.Objects;
import java.util.function.LongPredicate;
import java.util.stream.Collectors;
import java.util.stream.LongStream;
import java.util.stream.Stream;
public class SearchIndexReader implements AutoCloseable {
@ -55,18 +57,53 @@ public class SearchIndexReader implements AutoCloseable {
queryBuilders = new EnumMap<>(IndexBlock.class);
underspecifiedQueryBuilders = new EnumMap<>(IndexBlock.class);
queryBuilders.put(IndexBlock.Words, new IndexQueryBuilder(Stream.of(metaIndex, titleKeywordsIndex, topicIndex, titleIndex, topIndex, midIndex, lowIndex, namesIndex, wordsIndex).collect(Collectors.toList()), wordsIndex));
queryBuilders.put(IndexBlock.Low, new IndexQueryBuilder(Stream.of(metaIndex, titleKeywordsIndex, topicIndex, titleIndex, topIndex, midIndex, lowIndex, namesIndex).collect(Collectors.toList()), wordsIndex));
queryBuilders.put(IndexBlock.Middle, new IndexQueryBuilder(Stream.of(metaIndex, titleKeywordsIndex, topicIndex, titleIndex, topIndex, midIndex).collect(Collectors.toList()), wordsIndex));
queryBuilders.put(IndexBlock.Top, new IndexQueryBuilder(Stream.of(metaIndex, titleKeywordsIndex, topicIndex, titleIndex, topIndex).collect(Collectors.toList()), wordsIndex));
queryBuilders.put(IndexBlock.PositionWords, new IndexQueryBuilder(Stream.of(metaIndex, titleKeywordsIndex, topicIndex, titleIndex, namesIndex, positionIndex).collect(Collectors.toList()), wordsIndex));
queryBuilders.put(IndexBlock.NamesWords, new IndexQueryBuilder(Stream.of(metaIndex, titleKeywordsIndex, topicIndex, titleIndex, namesIndex).collect(Collectors.toList()), wordsIndex));
queryBuilders.put(IndexBlock.Link, new IndexQueryBuilder(Stream.of(metaIndex, titleKeywordsIndex, topicIndex, titleIndex, linkIndex).collect(Collectors.toList()), wordsIndex));
queryBuilders.put(IndexBlock.Title, new IndexQueryBuilder(Stream.of(metaIndex, titleKeywordsIndex, topicIndex, titleIndex).collect(Collectors.toList()), wordsIndex));
queryBuilders.put(IndexBlock.TitleKeywords, new IndexQueryBuilder(Stream.of(metaIndex, titleKeywordsIndex).collect(Collectors.toList()), wordsIndex));
queryBuilders.put(IndexBlock.Words, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, topicIndex, titleIndex, topIndex, midIndex, lowIndex, namesIndex, wordsIndex), wordsIndex));
queryBuilders.put(IndexBlock.Low, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, topicIndex, titleIndex, topIndex, midIndex, lowIndex, namesIndex), wordsIndex));
queryBuilders.put(IndexBlock.Middle, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, topicIndex, titleIndex, topIndex, midIndex), wordsIndex));
queryBuilders.put(IndexBlock.Top, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, topicIndex, titleIndex, topIndex), wordsIndex));
queryBuilders.put(IndexBlock.PositionWords, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, topicIndex, titleIndex, namesIndex, positionIndex), wordsIndex));
queryBuilders.put(IndexBlock.NamesWords, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, topicIndex, titleIndex, namesIndex), wordsIndex));
queryBuilders.put(IndexBlock.Link, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, topicIndex, titleIndex, linkIndex), wordsIndex));
queryBuilders.put(IndexBlock.Title, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, topicIndex, titleIndex), wordsIndex));
queryBuilders.put(IndexBlock.TitleKeywords, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex), wordsIndex));
underspecifiedQueryBuilders.put(IndexBlock.TitleKeywords, new IndexQueryBuilder(Stream.of(titleKeywordsIndex, linkIndex, topicIndex, topIndex, midIndex, lowIndex, namesIndex, positionIndex, metaIndex).collect(Collectors.toList()), wordsIndex));
underspecifiedQueryBuilders.put(IndexBlock.Link, new IndexQueryBuilder(Stream.of(linkIndex, topicIndex, topIndex, midIndex, lowIndex, namesIndex, positionIndex, metaIndex).collect(Collectors.toList()), wordsIndex));
underspecifiedQueryBuilders.put(IndexBlock.TitleKeywords, new IndexQueryBuilder(listOfNonNulls(titleKeywordsIndex, linkIndex, topicIndex, topIndex, midIndex, lowIndex, namesIndex, positionIndex, metaIndex), wordsIndex));
underspecifiedQueryBuilders.put(IndexBlock.Link, new IndexQueryBuilder(listOfNonNulls(linkIndex, topicIndex, topIndex, midIndex, lowIndex, namesIndex, positionIndex, metaIndex), wordsIndex));
}
/**
 * Collects the given values into a list, leaving out any that are null.
 *
 * @param vals candidate values, any of which may be null
 * @return a list of the non-null values in their original order
 */
@SafeVarargs
public final <T> List<T> listOfNonNulls(T... vals) {
    final Stream<T> present = Stream.of(vals).filter(val -> val != null);

    return present.toList();
}
/**
 * Scans the entries stored for a keyword in one index block and emits one value
 * for each run of consecutive entries sharing the same upper 32 bits, once that
 * run has become long enough.
 *
 * @param block       index block to read; an unknown block yields an empty stream
 * @param wordId      ID of the keyword whose range is scanned
 * @param queryDepth  maximum number of entries inspected, applied before filtering
 * @param minHitCount repeat count at which a run is emitted (see note below)
 * @param maxResults  maximum number of values returned, applied after filtering
 * @return the entries at which a run reached the threshold, at most one per run
 */
public LongStream findHotDomainsForKeyword(IndexBlock block, int wordId, int queryDepth, int minHitCount, int maxResults) {
    final var blockIndex = indices.get(block);
    if (blockIndex == null) {
        return LongStream.empty();
    }

    // Mutable state carried between predicate invocations: the entry that
    // opened the current run and the number of repeats seen since then.
    final long[] runStart = { Long.MIN_VALUE };
    final int[] repeats = { 0 };

    // NOTE(review): the counter is reset to 0 on the first entry of a run and is
    // compared before being incremented, so a run is emitted on its
    // (minHitCount + 2)-th entry. Confirm this is the intended meaning of
    // minHitCount.
    // presumably the upper 32 bits identify the domain; verify against the index layout
    final LongPredicate runReachedThreshold = candidate -> {
        final boolean continuesRun = (runStart[0] >>> 32) == (candidate >>> 32);
        if (!continuesRun) {
            runStart[0] = candidate;
            repeats[0] = 0;
            return false;
        }
        return repeats[0]++ == minHitCount;
    };

    return blockIndex.rangeForWord(wordId)
            .stream()
            .limit(queryDepth)
            .filter(runReachedThreshold)
            .limit(maxResults);
}
public Query findUnderspecified(
@ -116,6 +153,7 @@ public class SearchIndexReader implements AutoCloseable {
}
var range = index.rangeForWord(searchTerm);
if (index.hasUrl(urlId, range)) {
return block;
}

View File

@ -112,6 +112,11 @@ public class IndexQueryBuilder {
/**
 * Derives a query that excludes results matching the given word.
 *
 * @param wordId ID of the word to exclude
 * @return a new query over {@code notStream(wordId)}, or a query that produces
 *         an empty stream when there is no exclude index
 */
@Override
public Query not(int wordId) {
    // A missing exclude index means the index simply isn't present; nothing
    // would be found anyway, so the derived query just produces no results.
    return excludeIndex == null
            ? new QueryForIndices(budget, LongStream::empty)
            : new QueryForIndices(budget, () -> notStream(wordId));
}

View File

@ -15,12 +15,12 @@ import java.util.List;
public class EdgeSearchResultItem {
public final int blockId;
public final int queryLength;
public final EdgeId<EdgeDomain> domain;
public final EdgeId<EdgeDomain> domain; // this isn't the external domain ID, but a ranking
public final EdgeId<EdgeUrl> url;
public final List<EdgeSearchResultKeywordScore> scores;
public EdgeSearchResultItem(int blockId, int queryLength, long val) {
int urlId = (int) (val & 0xFFFFFFFFL);
int urlId = (int) (val & 0xFFFF_FFFFL);
int domainId = (int) (val >>> 32);
this.queryLength = queryLength;

View File

@ -0,0 +1,15 @@
package nu.marginalia.wmsa.edge.model.search.domain;
import lombok.AllArgsConstructor;
import lombok.Getter;
import lombok.ToString;
import nu.marginalia.wmsa.edge.model.EdgeId;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import java.util.List;
/**
 * Result of a domain search: the keyword that was searched for together with
 * the IDs found for it.
 * <p>
 * The declaration order of the fields determines the parameter order of the
 * Lombok-generated all-args constructor.
 */
@AllArgsConstructor @Getter @ToString
public class EdgeDomainSearchResults {
/** The keyword this result set belongs to. */
public final String keyword;
/** IDs of the matches, typed as URL IDs. */
public final List<EdgeId<EdgeUrl>> results;
}

View File

@ -0,0 +1,15 @@
package nu.marginalia.wmsa.edge.model.search.domain;
import lombok.AllArgsConstructor;
import lombok.ToString;
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
/**
 * Parameters of a domain search request.
 * <p>
 * The declaration order of the fields determines the parameter order of the
 * Lombok-generated all-args constructor.
 */
@ToString @AllArgsConstructor
public class EdgeDomainSearchSpecification {
/** Index bucket to search. */
public final int bucket;
/** Index block within the bucket to search. */
public final IndexBlock block;
/** Keyword to look up. */
public final String keyword;
/** Upper bound on the number of index entries inspected for the keyword. */
public final int queryDepth;
/** Hit-count threshold a domain has to reach to be included in the results. */
public final int minHitCount;
/** Upper bound on the number of results returned. */
public final int maxResults;
}