NPE fix for index buckets that aren't loaded, experimental new query mode for domains.
This commit is contained in:
parent
514074112e
commit
23b7a5fc22
@ -241,7 +241,6 @@ public class EdgeSearchE2ETest extends E2ETestBase {
|
||||
Files.move(driver.getScreenshotAs(OutputType.FILE).toPath(), screenshotFilename("site-search"));
|
||||
|
||||
assertEquals(List.of("Frog", "Binomial nomenclature", "Mantis", "Amphibian"), getTitlesFromSearchResults(html));
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
|
@ -12,7 +12,6 @@ public class RankingDomainData {
|
||||
private int alias;
|
||||
private EdgeDomainIndexingState state;
|
||||
public final int knownUrls;
|
||||
public boolean peripheral;
|
||||
|
||||
public int resolveAlias() {
|
||||
if (alias == 0) return id;
|
||||
|
@ -56,7 +56,7 @@ public class RankingDomainFetcher {
|
||||
while (rsp.next()) {
|
||||
int id = rsp.getInt(1);
|
||||
if (!blacklist.isBlacklisted(id)) {
|
||||
consumer.accept(new RankingDomainData(id, rsp.getString(2), rsp.getInt(3), EdgeDomainIndexingState.valueOf(rsp.getString(4)), rsp.getInt(5), false));
|
||||
consumer.accept(new RankingDomainData(id, rsp.getString(2), rsp.getInt(3), EdgeDomainIndexingState.valueOf(rsp.getString(4)), rsp.getInt(5)));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -108,6 +108,14 @@ public class PerusePageRankV2 {
|
||||
domainIndexToId.put(domainIndexToId.size(), id);
|
||||
domainIdToIndex.put(id, domainIdToIndex.size());
|
||||
});
|
||||
domainFetcher.getPeripheralDomains(domainData -> {
|
||||
int id = domainData.id;
|
||||
|
||||
domainsById.put(id, domainData);
|
||||
|
||||
domainIndexToId.put(domainIndexToId.size(), id);
|
||||
domainIdToIndex.put(id, domainIdToIndex.size());
|
||||
});
|
||||
|
||||
linkDataSrc2Dest = new TIntArrayList[domainIndexToId.size()];
|
||||
linkDataDest2Src = new TIntArrayList[domainIndexToId.size()];
|
||||
|
@ -1,21 +1,20 @@
|
||||
package nu.marginalia.wmsa.edge.index;
|
||||
|
||||
import nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalWriter;
|
||||
import nu.marginalia.wmsa.edge.index.model.EdgeIndexSearchTerms;
|
||||
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
|
||||
import nu.marginalia.wmsa.edge.index.reader.SearchIndexReader;
|
||||
import nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalWriter;
|
||||
import nu.marginalia.wmsa.edge.index.reader.query.IndexSearchBudget;
|
||||
import nu.marginalia.wmsa.edge.index.reader.query.Query;
|
||||
import org.jetbrains.annotations.NotNull;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.util.*;
|
||||
import java.util.Comparator;
|
||||
import java.util.concurrent.locks.Lock;
|
||||
import java.util.concurrent.locks.ReadWriteLock;
|
||||
import java.util.concurrent.locks.ReentrantReadWriteLock;
|
||||
import java.util.function.LongPredicate;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.LongStream;
|
||||
|
||||
public class EdgeIndexBucket {
|
||||
@ -101,6 +100,10 @@ public class EdgeIndexBucket {
|
||||
return indexReader != null;
|
||||
}
|
||||
|
||||
public LongStream findHotDomainsForKeyword(IndexBlock block, int wordId, int queryDepth, int minHitCount, int maxResults) {
|
||||
return indexReader.findHotDomainsForKeyword(block, wordId, queryDepth, minHitCount, maxResults);
|
||||
}
|
||||
|
||||
public LongStream getQuery(IndexBlock block, LongPredicate filter, IndexSearchBudget budget, EdgeIndexSearchTerms searchTerms) {
|
||||
if (null == indexReader) {
|
||||
logger.warn("Index reader not neady {}", block);
|
||||
@ -114,15 +117,8 @@ public class EdgeIndexBucket {
|
||||
.mapToInt(Integer::intValue)
|
||||
.toArray();
|
||||
|
||||
|
||||
if (logger.isDebugEnabled()) {
|
||||
logger.debug("Includes: ({}); excludes: ({})", Arrays.
|
||||
stream(orderedIncludes)
|
||||
.mapToObj(String::valueOf)
|
||||
.collect(Collectors.joining(",")),
|
||||
searchTerms.excludes.stream().map(String::valueOf).collect(Collectors.joining(",")));
|
||||
}
|
||||
Query query;
|
||||
|
||||
if (orderedIncludes.length == 1) {
|
||||
query = indexReader.findUnderspecified(block, budget, filter, orderedIncludes[0]);
|
||||
}
|
||||
@ -136,6 +132,7 @@ public class EdgeIndexBucket {
|
||||
for (int term : searchTerms.excludes) {
|
||||
query = query.not(term);
|
||||
}
|
||||
|
||||
return query.stream();
|
||||
}
|
||||
|
||||
|
@ -32,6 +32,8 @@ import nu.marginalia.wmsa.edge.model.EdgeUrl;
|
||||
import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet;
|
||||
import nu.marginalia.wmsa.edge.model.crawl.EdgePageWords;
|
||||
import nu.marginalia.wmsa.edge.model.search.*;
|
||||
import nu.marginalia.wmsa.edge.model.search.domain.EdgeDomainSearchResults;
|
||||
import nu.marginalia.wmsa.edge.model.search.domain.EdgeDomainSearchSpecification;
|
||||
import org.apache.http.HttpStatus;
|
||||
import org.jetbrains.annotations.NotNull;
|
||||
import org.slf4j.Logger;
|
||||
@ -88,6 +90,7 @@ public class EdgeIndexService extends Service {
|
||||
|
||||
Spark.post("/words/", this::putWords);
|
||||
Spark.post("/search/", this::search, gson::toJson);
|
||||
Spark.post("/search-domain/", this::searchDomain, gson::toJson);
|
||||
|
||||
Spark.post("/dictionary/*", this::getWordId, gson::toJson);
|
||||
|
||||
@ -204,6 +207,26 @@ public class EdgeIndexService extends Service {
|
||||
.toArray();
|
||||
}
|
||||
|
||||
private Object searchDomain(Request request, Response response) {
|
||||
if (indexes.getDictionaryReader() == null) {
|
||||
logger.warn("Dictionary reader not yet initialized");
|
||||
halt(HttpStatus.SC_SERVICE_UNAVAILABLE, "Come back in a few minutes");
|
||||
}
|
||||
|
||||
String json = request.body();
|
||||
EdgeDomainSearchSpecification specsSet = gson.fromJson(json, EdgeDomainSearchSpecification.class);
|
||||
|
||||
final int wordId = keywordLexicon.getOrInsert(specsSet.keyword);
|
||||
|
||||
List<EdgeId<EdgeUrl>> urlIds = indexes
|
||||
.getBucket(specsSet.bucket)
|
||||
.findHotDomainsForKeyword(specsSet.block, wordId, specsSet.queryDepth, specsSet.minHitCount, specsSet.maxResults)
|
||||
.mapToObj(lv -> new EdgeId<EdgeUrl>((int)(lv & 0xFFFF_FFFFL)))
|
||||
.toList();
|
||||
|
||||
return new EdgeDomainSearchResults(specsSet.keyword, urlIds);
|
||||
}
|
||||
|
||||
private Object search(Request request, Response response) {
|
||||
if (indexes.getDictionaryReader() == null) {
|
||||
logger.warn("Dictionary reader not yet initialized");
|
||||
@ -387,6 +410,16 @@ public class EdgeIndexService extends Service {
|
||||
|
||||
}
|
||||
|
||||
public LongStream getHotDomainsQuery(int bucket, IndexBlock block, int wordId,
|
||||
int queryDepth, int minHitCount, int maxResults) {
|
||||
if (!indexes.isValidBucket(bucket)) {
|
||||
logger.warn("Invalid bucket {}", bucket);
|
||||
return LongStream.empty();
|
||||
}
|
||||
|
||||
return indexes.getBucket(bucket).findHotDomainsForKeyword(block, wordId, queryDepth, minHitCount, maxResults);
|
||||
}
|
||||
|
||||
private LongStream getQuery(int bucket, IndexSearchBudget budget, IndexBlock block,
|
||||
LongPredicate filter, EdgeIndexSearchTerms searchTerms) {
|
||||
if (!indexes.isValidBucket(bucket)) {
|
||||
|
@ -10,10 +10,14 @@ import nu.marginalia.wmsa.client.HttpStatusCode;
|
||||
import nu.marginalia.wmsa.configuration.ServiceDescriptor;
|
||||
import nu.marginalia.wmsa.configuration.server.Context;
|
||||
import nu.marginalia.wmsa.edge.index.model.EdgePutWordsRequest;
|
||||
import nu.marginalia.wmsa.edge.model.*;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeDomain;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeId;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeUrl;
|
||||
import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet;
|
||||
import nu.marginalia.wmsa.edge.model.search.EdgeSearchResultSet;
|
||||
import nu.marginalia.wmsa.edge.model.search.EdgeSearchSpecification;
|
||||
import nu.marginalia.wmsa.edge.model.search.domain.EdgeDomainSearchResults;
|
||||
import nu.marginalia.wmsa.edge.model.search.domain.EdgeDomainSearchSpecification;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
@ -60,6 +64,11 @@ public class EdgeIndexClient extends AbstractDynamicClient {
|
||||
.blockingGet();
|
||||
}
|
||||
|
||||
@CheckReturnValue
|
||||
public EdgeDomainSearchResults queryDomains(Context ctx, EdgeDomainSearchSpecification specs) {
|
||||
return this.postGet(ctx, "/search-domain/", specs, EdgeDomainSearchResults.class).blockingFirst();
|
||||
}
|
||||
|
||||
|
||||
@CheckReturnValue
|
||||
public Observable<Boolean> isBlocked(Context ctx) {
|
||||
|
@ -13,8 +13,10 @@ import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.util.EnumMap;
|
||||
import java.util.List;
|
||||
import java.util.Objects;
|
||||
import java.util.function.LongPredicate;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.LongStream;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
public class SearchIndexReader implements AutoCloseable {
|
||||
@ -55,18 +57,53 @@ public class SearchIndexReader implements AutoCloseable {
|
||||
queryBuilders = new EnumMap<>(IndexBlock.class);
|
||||
underspecifiedQueryBuilders = new EnumMap<>(IndexBlock.class);
|
||||
|
||||
queryBuilders.put(IndexBlock.Words, new IndexQueryBuilder(Stream.of(metaIndex, titleKeywordsIndex, topicIndex, titleIndex, topIndex, midIndex, lowIndex, namesIndex, wordsIndex).collect(Collectors.toList()), wordsIndex));
|
||||
queryBuilders.put(IndexBlock.Low, new IndexQueryBuilder(Stream.of(metaIndex, titleKeywordsIndex, topicIndex, titleIndex, topIndex, midIndex, lowIndex, namesIndex).collect(Collectors.toList()), wordsIndex));
|
||||
queryBuilders.put(IndexBlock.Middle, new IndexQueryBuilder(Stream.of(metaIndex, titleKeywordsIndex, topicIndex, titleIndex, topIndex, midIndex).collect(Collectors.toList()), wordsIndex));
|
||||
queryBuilders.put(IndexBlock.Top, new IndexQueryBuilder(Stream.of(metaIndex, titleKeywordsIndex, topicIndex, titleIndex, topIndex).collect(Collectors.toList()), wordsIndex));
|
||||
queryBuilders.put(IndexBlock.PositionWords, new IndexQueryBuilder(Stream.of(metaIndex, titleKeywordsIndex, topicIndex, titleIndex, namesIndex, positionIndex).collect(Collectors.toList()), wordsIndex));
|
||||
queryBuilders.put(IndexBlock.NamesWords, new IndexQueryBuilder(Stream.of(metaIndex, titleKeywordsIndex, topicIndex, titleIndex, namesIndex).collect(Collectors.toList()), wordsIndex));
|
||||
queryBuilders.put(IndexBlock.Link, new IndexQueryBuilder(Stream.of(metaIndex, titleKeywordsIndex, topicIndex, titleIndex, linkIndex).collect(Collectors.toList()), wordsIndex));
|
||||
queryBuilders.put(IndexBlock.Title, new IndexQueryBuilder(Stream.of(metaIndex, titleKeywordsIndex, topicIndex, titleIndex).collect(Collectors.toList()), wordsIndex));
|
||||
queryBuilders.put(IndexBlock.TitleKeywords, new IndexQueryBuilder(Stream.of(metaIndex, titleKeywordsIndex).collect(Collectors.toList()), wordsIndex));
|
||||
queryBuilders.put(IndexBlock.Words, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, topicIndex, titleIndex, topIndex, midIndex, lowIndex, namesIndex, wordsIndex), wordsIndex));
|
||||
queryBuilders.put(IndexBlock.Low, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, topicIndex, titleIndex, topIndex, midIndex, lowIndex, namesIndex), wordsIndex));
|
||||
queryBuilders.put(IndexBlock.Middle, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, topicIndex, titleIndex, topIndex, midIndex), wordsIndex));
|
||||
queryBuilders.put(IndexBlock.Top, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, topicIndex, titleIndex, topIndex), wordsIndex));
|
||||
queryBuilders.put(IndexBlock.PositionWords, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, topicIndex, titleIndex, namesIndex, positionIndex), wordsIndex));
|
||||
queryBuilders.put(IndexBlock.NamesWords, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, topicIndex, titleIndex, namesIndex), wordsIndex));
|
||||
queryBuilders.put(IndexBlock.Link, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, topicIndex, titleIndex, linkIndex), wordsIndex));
|
||||
queryBuilders.put(IndexBlock.Title, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, topicIndex, titleIndex), wordsIndex));
|
||||
queryBuilders.put(IndexBlock.TitleKeywords, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex), wordsIndex));
|
||||
|
||||
underspecifiedQueryBuilders.put(IndexBlock.TitleKeywords, new IndexQueryBuilder(Stream.of(titleKeywordsIndex, linkIndex, topicIndex, topIndex, midIndex, lowIndex, namesIndex, positionIndex, metaIndex).collect(Collectors.toList()), wordsIndex));
|
||||
underspecifiedQueryBuilders.put(IndexBlock.Link, new IndexQueryBuilder(Stream.of(linkIndex, topicIndex, topIndex, midIndex, lowIndex, namesIndex, positionIndex, metaIndex).collect(Collectors.toList()), wordsIndex));
|
||||
underspecifiedQueryBuilders.put(IndexBlock.TitleKeywords, new IndexQueryBuilder(listOfNonNulls(titleKeywordsIndex, linkIndex, topicIndex, topIndex, midIndex, lowIndex, namesIndex, positionIndex, metaIndex), wordsIndex));
|
||||
underspecifiedQueryBuilders.put(IndexBlock.Link, new IndexQueryBuilder(listOfNonNulls(linkIndex, topicIndex, topIndex, midIndex, lowIndex, namesIndex, positionIndex, metaIndex), wordsIndex));
|
||||
}
|
||||
|
||||
@SafeVarargs
|
||||
public final <T> List<T> listOfNonNulls(T... vals) {
|
||||
return Stream.of(vals).filter(Objects::nonNull).toList();
|
||||
}
|
||||
|
||||
|
||||
public LongStream findHotDomainsForKeyword(IndexBlock block, int wordId, int queryDepth, int minHitCount, int maxResults) {
|
||||
var index = indices.get(block);
|
||||
|
||||
if (index == null)
|
||||
return LongStream.empty();
|
||||
|
||||
return index.rangeForWord(wordId)
|
||||
.stream()
|
||||
.limit(queryDepth)
|
||||
.filter(new LongPredicate() {
|
||||
long last = Long.MIN_VALUE;
|
||||
int count = 0;
|
||||
|
||||
@Override
|
||||
public boolean test(long value) {
|
||||
if ((last >>> 32L) == (value >>> 32L)) {
|
||||
return count++ == minHitCount;
|
||||
}
|
||||
else {
|
||||
last = value;
|
||||
count = 0;
|
||||
|
||||
}
|
||||
return false;
|
||||
}
|
||||
})
|
||||
.limit(maxResults);
|
||||
}
|
||||
|
||||
public Query findUnderspecified(
|
||||
@ -116,6 +153,7 @@ public class SearchIndexReader implements AutoCloseable {
|
||||
}
|
||||
|
||||
var range = index.rangeForWord(searchTerm);
|
||||
|
||||
if (index.hasUrl(urlId, range)) {
|
||||
return block;
|
||||
}
|
||||
|
@ -112,6 +112,11 @@ public class IndexQueryBuilder {
|
||||
|
||||
@Override
|
||||
public Query not(int wordId) {
|
||||
// Happens when an index simply isn't present, won't find data anyway
|
||||
// so it's safe to no-op the query
|
||||
if (excludeIndex == null)
|
||||
return new QueryForIndices(budget, LongStream::empty);
|
||||
|
||||
return new QueryForIndices(budget, () -> notStream(wordId));
|
||||
}
|
||||
|
||||
|
@ -15,12 +15,12 @@ import java.util.List;
|
||||
public class EdgeSearchResultItem {
|
||||
public final int blockId;
|
||||
public final int queryLength;
|
||||
public final EdgeId<EdgeDomain> domain;
|
||||
public final EdgeId<EdgeDomain> domain; // this isn't the external domain ID, but a ranking
|
||||
public final EdgeId<EdgeUrl> url;
|
||||
public final List<EdgeSearchResultKeywordScore> scores;
|
||||
|
||||
public EdgeSearchResultItem(int blockId, int queryLength, long val) {
|
||||
int urlId = (int) (val & 0xFFFFFFFFL);
|
||||
int urlId = (int) (val & 0xFFFF_FFFFL);
|
||||
int domainId = (int) (val >>> 32);
|
||||
|
||||
this.queryLength = queryLength;
|
||||
|
@ -0,0 +1,15 @@
|
||||
package nu.marginalia.wmsa.edge.model.search.domain;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Getter;
|
||||
import lombok.ToString;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeId;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeUrl;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
@AllArgsConstructor @Getter @ToString
|
||||
public class EdgeDomainSearchResults {
|
||||
public final String keyword;
|
||||
public final List<EdgeId<EdgeUrl>> results;
|
||||
}
|
@ -0,0 +1,15 @@
|
||||
package nu.marginalia.wmsa.edge.model.search.domain;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.ToString;
|
||||
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
|
||||
|
||||
@ToString @AllArgsConstructor
|
||||
public class EdgeDomainSearchSpecification {
|
||||
public final int bucket;
|
||||
public final IndexBlock block;
|
||||
public final String keyword;
|
||||
public final int queryDepth;
|
||||
public final int minHitCount;
|
||||
public final int maxResults;
|
||||
}
|
Loading…
Reference in New Issue
Block a user