From 23b7a5fc22db7cae6d31248f3188d454fd051272 Mon Sep 17 00:00:00 2001 From: vlofgren Date: Thu, 28 Jul 2022 17:16:19 +0200 Subject: [PATCH] NPE fix for index buckets that aren't loaded, experimental new query mode for domains. --- .../wmsa/edge/EdgeSearchE2ETest.java | 1 - .../util/ranking/RankingDomainData.java | 1 - .../util/ranking/RankingDomainFetcher.java | 2 +- .../util/ranking/tool/PerusePageRankV2.java | 8 +++ .../wmsa/edge/index/EdgeIndexBucket.java | 19 +++--- .../wmsa/edge/index/EdgeIndexService.java | 33 ++++++++++ .../edge/index/client/EdgeIndexClient.java | 11 +++- .../edge/index/reader/SearchIndexReader.java | 62 +++++++++++++++---- .../index/reader/query/IndexQueryBuilder.java | 5 ++ .../model/search/EdgeSearchResultItem.java | 4 +- .../domain/EdgeDomainSearchResults.java | 15 +++++ .../domain/EdgeDomainSearchSpecification.java | 15 +++++ 12 files changed, 147 insertions(+), 29 deletions(-) create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/domain/EdgeDomainSearchResults.java create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/domain/EdgeDomainSearchSpecification.java diff --git a/marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/EdgeSearchE2ETest.java b/marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/EdgeSearchE2ETest.java index 643e61a5..1d14af88 100644 --- a/marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/EdgeSearchE2ETest.java +++ b/marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/EdgeSearchE2ETest.java @@ -241,7 +241,6 @@ public class EdgeSearchE2ETest extends E2ETestBase { Files.move(driver.getScreenshotAs(OutputType.FILE).toPath(), screenshotFilename("site-search")); assertEquals(List.of("Frog", "Binomial nomenclature", "Mantis", "Amphibian"), getTitlesFromSearchResults(html)); - } @Test diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/RankingDomainData.java b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/RankingDomainData.java index 29424b35..2a4b0f65 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/RankingDomainData.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/RankingDomainData.java @@ -12,7 +12,6 @@ public class RankingDomainData { private int alias; private EdgeDomainIndexingState state; public final int knownUrls; - public boolean peripheral; public int resolveAlias() { if (alias == 0) return id; diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/RankingDomainFetcher.java b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/RankingDomainFetcher.java index 79285a83..da47434f 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/RankingDomainFetcher.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/RankingDomainFetcher.java @@ -56,7 +56,7 @@ public class RankingDomainFetcher { while (rsp.next()) { int id = rsp.getInt(1); if (!blacklist.isBlacklisted(id)) { - consumer.accept(new RankingDomainData(id, rsp.getString(2), rsp.getInt(3), EdgeDomainIndexingState.valueOf(rsp.getString(4)), rsp.getInt(5), false)); + consumer.accept(new RankingDomainData(id, rsp.getString(2), rsp.getInt(3), EdgeDomainIndexingState.valueOf(rsp.getString(4)), rsp.getInt(5))); } } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/PerusePageRankV2.java b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/PerusePageRankV2.java index 29effdcf..b8b31c8c 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/PerusePageRankV2.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/PerusePageRankV2.java @@ -108,6 +108,14 @@ public class PerusePageRankV2 { domainIndexToId.put(domainIndexToId.size(), id); domainIdToIndex.put(id, domainIdToIndex.size()); }); + domainFetcher.getPeripheralDomains(domainData -> { + int id = domainData.id; + + domainsById.put(id, domainData); + + domainIndexToId.put(domainIndexToId.size(), id); + domainIdToIndex.put(id, domainIdToIndex.size()); + }); linkDataSrc2Dest = new TIntArrayList[domainIndexToId.size()]; linkDataDest2Src = new TIntArrayList[domainIndexToId.size()]; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexBucket.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexBucket.java index 09890252..c883b513 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexBucket.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexBucket.java @@ -1,21 +1,20 @@ package nu.marginalia.wmsa.edge.index; +import nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalWriter; import nu.marginalia.wmsa.edge.index.model.EdgeIndexSearchTerms; import nu.marginalia.wmsa.edge.index.model.IndexBlock; import nu.marginalia.wmsa.edge.index.reader.SearchIndexReader; -import nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalWriter; import nu.marginalia.wmsa.edge.index.reader.query.IndexSearchBudget; import nu.marginalia.wmsa.edge.index.reader.query.Query; import org.jetbrains.annotations.NotNull; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.util.*; +import java.util.Comparator; import java.util.concurrent.locks.Lock; import java.util.concurrent.locks.ReadWriteLock; import java.util.concurrent.locks.ReentrantReadWriteLock; import java.util.function.LongPredicate; -import java.util.stream.Collectors; import java.util.stream.LongStream; public class EdgeIndexBucket { @@ -101,6 +100,10 @@ public class EdgeIndexBucket { return indexReader != null; } + public LongStream findHotDomainsForKeyword(IndexBlock block, int wordId, int queryDepth, int minHitCount, int maxResults) { + return indexReader.findHotDomainsForKeyword(block, wordId, queryDepth, minHitCount, maxResults); + } + public LongStream getQuery(IndexBlock block, LongPredicate filter, IndexSearchBudget budget, EdgeIndexSearchTerms searchTerms) { if (null == indexReader) { logger.warn("Index reader not neady {}", block); @@ -114,15 +117,8 @@ public class EdgeIndexBucket { .mapToInt(Integer::intValue) .toArray(); - - if (logger.isDebugEnabled()) { - logger.debug("Includes: ({}); excludes: ({})", Arrays. - stream(orderedIncludes) - .mapToObj(String::valueOf) - .collect(Collectors.joining(",")), - searchTerms.excludes.stream().map(String::valueOf).collect(Collectors.joining(","))); - } Query query; + if (orderedIncludes.length == 1) { query = indexReader.findUnderspecified(block, budget, filter, orderedIncludes[0]); } @@ -136,6 +132,7 @@ public class EdgeIndexBucket { for (int term : searchTerms.excludes) { query = query.not(term); } + return query.stream(); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexService.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexService.java index b4915df7..ece52178 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexService.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexService.java @@ -32,6 +32,8 @@ import nu.marginalia.wmsa.edge.model.EdgeUrl; import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet; import nu.marginalia.wmsa.edge.model.crawl.EdgePageWords; import nu.marginalia.wmsa.edge.model.search.*; +import nu.marginalia.wmsa.edge.model.search.domain.EdgeDomainSearchResults; +import nu.marginalia.wmsa.edge.model.search.domain.EdgeDomainSearchSpecification; import org.apache.http.HttpStatus; import org.jetbrains.annotations.NotNull; import org.slf4j.Logger; @@ -88,6 +90,7 @@ public class EdgeIndexService extends Service { Spark.post("/words/", this::putWords); Spark.post("/search/", this::search, gson::toJson); + Spark.post("/search-domain/", this::searchDomain, gson::toJson); Spark.post("/dictionary/*", this::getWordId, gson::toJson); @@ -204,6 +207,26 @@ public class EdgeIndexService extends Service { .toArray(); } + private Object searchDomain(Request request, Response response) { + if (indexes.getDictionaryReader() == null) { + logger.warn("Dictionary reader not yet initialized"); + halt(HttpStatus.SC_SERVICE_UNAVAILABLE, "Come back in a few minutes"); + } + + String json = request.body(); + EdgeDomainSearchSpecification specsSet = gson.fromJson(json, EdgeDomainSearchSpecification.class); + + final int wordId = keywordLexicon.getOrInsert(specsSet.keyword); + + List> urlIds = indexes + .getBucket(specsSet.bucket) + .findHotDomainsForKeyword(specsSet.block, wordId, specsSet.queryDepth, specsSet.minHitCount, specsSet.maxResults) + .mapToObj(lv -> new EdgeId((int)(lv & 0xFFFF_FFFFL))) + .toList(); + + return new EdgeDomainSearchResults(specsSet.keyword, urlIds); + } + private Object search(Request request, Response response) { if (indexes.getDictionaryReader() == null) { logger.warn("Dictionary reader not yet initialized"); @@ -387,6 +410,16 @@ public class EdgeIndexService extends Service { } + public LongStream getHotDomainsQuery(int bucket, IndexBlock block, int wordId, + int queryDepth, int minHitCount, int maxResults) { + if (!indexes.isValidBucket(bucket)) { + logger.warn("Invalid bucket {}", bucket); + return LongStream.empty(); + } + + return indexes.getBucket(bucket).findHotDomainsForKeyword(block, wordId, queryDepth, minHitCount, maxResults); + } + private LongStream getQuery(int bucket, IndexSearchBudget budget, IndexBlock block, LongPredicate filter, EdgeIndexSearchTerms searchTerms) { if (!indexes.isValidBucket(bucket)) { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/client/EdgeIndexClient.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/client/EdgeIndexClient.java index 6f64ceae..4a9c1737 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/client/EdgeIndexClient.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/client/EdgeIndexClient.java @@ -10,10 +10,14 @@ import nu.marginalia.wmsa.client.HttpStatusCode; import nu.marginalia.wmsa.configuration.ServiceDescriptor; import nu.marginalia.wmsa.configuration.server.Context; import nu.marginalia.wmsa.edge.index.model.EdgePutWordsRequest; -import nu.marginalia.wmsa.edge.model.*; +import nu.marginalia.wmsa.edge.model.EdgeDomain; +import nu.marginalia.wmsa.edge.model.EdgeId; +import nu.marginalia.wmsa.edge.model.EdgeUrl; import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet; import nu.marginalia.wmsa.edge.model.search.EdgeSearchResultSet; import nu.marginalia.wmsa.edge.model.search.EdgeSearchSpecification; +import nu.marginalia.wmsa.edge.model.search.domain.EdgeDomainSearchResults; +import nu.marginalia.wmsa.edge.model.search.domain.EdgeDomainSearchSpecification; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -60,6 +64,11 @@ public class EdgeIndexClient extends AbstractDynamicClient { .blockingGet(); } + @CheckReturnValue + public EdgeDomainSearchResults queryDomains(Context ctx, EdgeDomainSearchSpecification specs) { + return this.postGet(ctx, "/search-domain/", specs, EdgeDomainSearchResults.class).blockingFirst(); + } + @CheckReturnValue public Observable isBlocked(Context ctx) { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndexReader.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndexReader.java index 8e7fea81..e4061982 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndexReader.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndexReader.java @@ -13,8 +13,10 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.util.EnumMap; +import java.util.List; +import java.util.Objects; import java.util.function.LongPredicate; -import java.util.stream.Collectors; +import java.util.stream.LongStream; import java.util.stream.Stream; public class SearchIndexReader implements AutoCloseable { @@ -55,18 +57,53 @@ public class SearchIndexReader implements AutoCloseable { queryBuilders = new EnumMap<>(IndexBlock.class); underspecifiedQueryBuilders = new EnumMap<>(IndexBlock.class); - queryBuilders.put(IndexBlock.Words, new IndexQueryBuilder(Stream.of(metaIndex, titleKeywordsIndex, topicIndex, titleIndex, topIndex, midIndex, lowIndex, namesIndex, wordsIndex).collect(Collectors.toList()), wordsIndex)); - queryBuilders.put(IndexBlock.Low, new IndexQueryBuilder(Stream.of(metaIndex, titleKeywordsIndex, topicIndex, titleIndex, topIndex, midIndex, lowIndex, namesIndex).collect(Collectors.toList()), wordsIndex)); - queryBuilders.put(IndexBlock.Middle, new IndexQueryBuilder(Stream.of(metaIndex, titleKeywordsIndex, topicIndex, titleIndex, topIndex, midIndex).collect(Collectors.toList()), wordsIndex)); - queryBuilders.put(IndexBlock.Top, new IndexQueryBuilder(Stream.of(metaIndex, titleKeywordsIndex, topicIndex, titleIndex, topIndex).collect(Collectors.toList()), wordsIndex)); - queryBuilders.put(IndexBlock.PositionWords, new IndexQueryBuilder(Stream.of(metaIndex, titleKeywordsIndex, topicIndex, titleIndex, namesIndex, positionIndex).collect(Collectors.toList()), wordsIndex)); - queryBuilders.put(IndexBlock.NamesWords, new IndexQueryBuilder(Stream.of(metaIndex, titleKeywordsIndex, topicIndex, titleIndex, namesIndex).collect(Collectors.toList()), wordsIndex)); - queryBuilders.put(IndexBlock.Link, new IndexQueryBuilder(Stream.of(metaIndex, titleKeywordsIndex, topicIndex, titleIndex, linkIndex).collect(Collectors.toList()), wordsIndex)); - queryBuilders.put(IndexBlock.Title, new IndexQueryBuilder(Stream.of(metaIndex, titleKeywordsIndex, topicIndex, titleIndex).collect(Collectors.toList()), wordsIndex)); - queryBuilders.put(IndexBlock.TitleKeywords, new IndexQueryBuilder(Stream.of(metaIndex, titleKeywordsIndex).collect(Collectors.toList()), wordsIndex)); + queryBuilders.put(IndexBlock.Words, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, topicIndex, titleIndex, topIndex, midIndex, lowIndex, namesIndex, wordsIndex), wordsIndex)); + queryBuilders.put(IndexBlock.Low, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, topicIndex, titleIndex, topIndex, midIndex, lowIndex, namesIndex), wordsIndex)); + queryBuilders.put(IndexBlock.Middle, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, topicIndex, titleIndex, topIndex, midIndex), wordsIndex)); + queryBuilders.put(IndexBlock.Top, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, topicIndex, titleIndex, topIndex), wordsIndex)); + queryBuilders.put(IndexBlock.PositionWords, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, topicIndex, titleIndex, namesIndex, positionIndex), wordsIndex)); + queryBuilders.put(IndexBlock.NamesWords, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, topicIndex, titleIndex, namesIndex), wordsIndex)); + queryBuilders.put(IndexBlock.Link, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, topicIndex, titleIndex, linkIndex), wordsIndex)); + queryBuilders.put(IndexBlock.Title, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, topicIndex, titleIndex), wordsIndex)); + queryBuilders.put(IndexBlock.TitleKeywords, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex), wordsIndex)); - underspecifiedQueryBuilders.put(IndexBlock.TitleKeywords, new IndexQueryBuilder(Stream.of(titleKeywordsIndex, linkIndex, topicIndex, topIndex, midIndex, lowIndex, namesIndex, positionIndex, metaIndex).collect(Collectors.toList()), wordsIndex)); - underspecifiedQueryBuilders.put(IndexBlock.Link, new IndexQueryBuilder(Stream.of(linkIndex, topicIndex, topIndex, midIndex, lowIndex, namesIndex, positionIndex, metaIndex).collect(Collectors.toList()), wordsIndex)); + underspecifiedQueryBuilders.put(IndexBlock.TitleKeywords, new IndexQueryBuilder(listOfNonNulls(titleKeywordsIndex, linkIndex, topicIndex, topIndex, midIndex, lowIndex, namesIndex, positionIndex, metaIndex), wordsIndex)); + underspecifiedQueryBuilders.put(IndexBlock.Link, new IndexQueryBuilder(listOfNonNulls(linkIndex, topicIndex, topIndex, midIndex, lowIndex, namesIndex, positionIndex, metaIndex), wordsIndex)); + } + + @SafeVarargs + public final List listOfNonNulls(T... vals) { + return Stream.of(vals).filter(Objects::nonNull).toList(); + } + + + public LongStream findHotDomainsForKeyword(IndexBlock block, int wordId, int queryDepth, int minHitCount, int maxResults) { + var index = indices.get(block); + + if (index == null) + return LongStream.empty(); + + return index.rangeForWord(wordId) + .stream() + .limit(queryDepth) + .filter(new LongPredicate() { + long last = Long.MIN_VALUE; + int count = 0; + + @Override + public boolean test(long value) { + if ((last >>> 32L) == (value >>> 32L)) { + return count++ == minHitCount; + } + else { + last = value; + count = 0; + + } + return false; + } + }) + .limit(maxResults); } public Query findUnderspecified( @@ -116,6 +153,7 @@ public class SearchIndexReader implements AutoCloseable { } var range = index.rangeForWord(searchTerm); + if (index.hasUrl(urlId, range)) { return block; } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/query/IndexQueryBuilder.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/query/IndexQueryBuilder.java index 6f54dd2d..82b4dbf3 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/query/IndexQueryBuilder.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/query/IndexQueryBuilder.java @@ -112,6 +112,11 @@ public class IndexQueryBuilder { @Override public Query not(int wordId) { + // Happens when an index simply isn't present, won't find data anyway + // so it's safe to no-op the query + if (excludeIndex == null) + return new QueryForIndices(budget, LongStream::empty); + return new QueryForIndices(budget, () -> notStream(wordId)); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeSearchResultItem.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeSearchResultItem.java index 66438279..d8c66fc2 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeSearchResultItem.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeSearchResultItem.java @@ -15,12 +15,12 @@ import java.util.List; public class EdgeSearchResultItem { public final int blockId; public final int queryLength; - public final EdgeId domain; + public final EdgeId domain; // this isn't the external domain ID, but a ranking public final EdgeId url; public final List scores; public EdgeSearchResultItem(int blockId, int queryLength, long val) { - int urlId = (int) (val & 0xFFFFFFFFL); + int urlId = (int) (val & 0xFFFF_FFFFL); int domainId = (int) (val >>> 32); this.queryLength = queryLength; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/domain/EdgeDomainSearchResults.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/domain/EdgeDomainSearchResults.java new file mode 100644 index 00000000..3b41c9e0 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/domain/EdgeDomainSearchResults.java @@ -0,0 +1,15 @@ +package nu.marginalia.wmsa.edge.model.search.domain; + +import lombok.AllArgsConstructor; +import lombok.Getter; +import lombok.ToString; +import nu.marginalia.wmsa.edge.model.EdgeId; +import nu.marginalia.wmsa.edge.model.EdgeUrl; + +import java.util.List; + +@AllArgsConstructor @Getter @ToString +public class EdgeDomainSearchResults { + public final String keyword; + public final List> results; +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/domain/EdgeDomainSearchSpecification.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/domain/EdgeDomainSearchSpecification.java new file mode 100644 index 00000000..435752d4 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/domain/EdgeDomainSearchSpecification.java @@ -0,0 +1,15 @@ +package nu.marginalia.wmsa.edge.model.search.domain; + +import lombok.AllArgsConstructor; +import lombok.ToString; +import nu.marginalia.wmsa.edge.index.model.IndexBlock; + +@ToString @AllArgsConstructor +public class EdgeDomainSearchSpecification { + public final int bucket; + public final IndexBlock block; + public final String keyword; + public final int queryDepth; + public final int minHitCount; + public final int maxResults; +}