From 427f3e922fb4296b89509fe72f0d1d18e9f3eb85 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Sun, 25 Feb 2024 12:46:30 +0100 Subject: [PATCH] (index) Retire count operation, clean up index code. --- .../api/searchquery/QueryProtobufCodec.java | 4 -- .../searchquery/model/query/QueryParams.java | 2 - .../model/query/SearchSpecification.java | 2 - .../api/src/main/protobuf/query-api.proto | 2 - .../searchquery/query_parser/QueryParser.java | 2 - .../searchquery/query_parser/token/Token.java | 1 - .../query_parser/token/TokenType.java | 1 - .../query_parser/token/TokenVisitor.java | 1 - .../searchquery/svc/QueryFactory.java | 1 - .../svc/QueryLimitsAccumulator.java | 7 --- .../svc/QuerySearchTermsAccumulator.java | 2 - .../query/svc/QueryFactoryTest.java | 1 - .../nu/marginalia/index/IndexGrpcService.java | 7 +-- .../marginalia/index/model/QueryParams.java | 3 -- .../index/model/SearchParameters.java | 2 - .../results/IndexResultValuatorService.java | 49 ++++++------------- ...IndexQueryServiceIntegrationSmokeTest.java | 3 -- .../IndexQueryServiceIntegrationTest.java | 1 - .../search/SearchQueryParamFactory.java | 4 -- 19 files changed, 18 insertions(+), 77 deletions(-) diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/QueryProtobufCodec.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/QueryProtobufCodec.java index 58550362..28d14c82 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/QueryProtobufCodec.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/QueryProtobufCodec.java @@ -34,7 +34,6 @@ public class QueryProtobufCodec { builder.setYear(IndexProtobufCodec.convertSpecLimit(query.specs.year)); builder.setSize(IndexProtobufCodec.convertSpecLimit(query.specs.size)); builder.setRank(IndexProtobufCodec.convertSpecLimit(query.specs.rank)); - builder.setDomainCount(IndexProtobufCodec.convertSpecLimit(query.specs.domainCount)); builder.setQueryLimits(IndexProtobufCodec.convertQueryLimits(query.specs.queryLimits)); @@ -63,7 +62,6 @@ public class QueryProtobufCodec { builder.setYear(IndexProtobufCodec.convertSpecLimit(query.specs.year)); builder.setSize(IndexProtobufCodec.convertSpecLimit(query.specs.size)); builder.setRank(IndexProtobufCodec.convertSpecLimit(query.specs.rank)); - builder.setDomainCount(IndexProtobufCodec.convertSpecLimit(query.specs.domainCount)); builder.setQueryLimits(IndexProtobufCodec.convertQueryLimits(query.specs.queryLimits)); @@ -92,7 +90,6 @@ public class QueryProtobufCodec { IndexProtobufCodec.convertSpecLimit(request.getYear()), IndexProtobufCodec.convertSpecLimit(request.getSize()), IndexProtobufCodec.convertSpecLimit(request.getRank()), - IndexProtobufCodec.convertSpecLimit(request.getDomainCount()), request.getDomainIdsList(), IndexProtobufCodec.convertQueryLimits(request.getQueryLimits()), request.getSearchSetIdentifier(), @@ -174,7 +171,6 @@ public class QueryProtobufCodec { IndexProtobufCodec.convertSpecLimit(specs.getYear()), IndexProtobufCodec.convertSpecLimit(specs.getSize()), IndexProtobufCodec.convertSpecLimit(specs.getRank()), - IndexProtobufCodec.convertSpecLimit(specs.getDomainCount()), IndexProtobufCodec.convertQueryLimits(specs.getQueryLimits()), QueryStrategy.valueOf(specs.getQueryStrategy()), IndexProtobufCodec.convertRankingParameterss(specs.getParameters()) diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/query/QueryParams.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/query/QueryParams.java index b713217b..176b977e 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/query/QueryParams.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/query/QueryParams.java @@ -19,7 +19,6 @@ public record QueryParams( SpecificationLimit year, SpecificationLimit size, SpecificationLimit rank, - SpecificationLimit domainCount, List domainIds, QueryLimits limits, String identifier, @@ -37,7 +36,6 @@ public record QueryParams( SpecificationLimit.none(), SpecificationLimit.none(), SpecificationLimit.none(), - SpecificationLimit.none(), List.of(), limits, identifier, diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/query/SearchSpecification.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/query/SearchSpecification.java index 2b5d1867..be2a6895 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/query/SearchSpecification.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/query/SearchSpecification.java @@ -24,8 +24,6 @@ public class SearchSpecification { public final SpecificationLimit size; public final SpecificationLimit rank; - public final SpecificationLimit domainCount; - public final QueryLimits queryLimits; public final QueryStrategy queryStrategy; diff --git a/code/functions/search-query/api/src/main/protobuf/query-api.proto b/code/functions/search-query/api/src/main/protobuf/query-api.proto index e508ba34..f5ec5e8d 100644 --- a/code/functions/search-query/api/src/main/protobuf/query-api.proto +++ b/code/functions/search-query/api/src/main/protobuf/query-api.proto @@ -25,7 +25,6 @@ message RpcQsQuery { RpcSpecLimit year = 8; RpcSpecLimit size = 9; RpcSpecLimit rank = 10; - RpcSpecLimit domainCount = 11; repeated int32 domainIds = 12; RpcQueryLimits queryLimits = 13; string searchSetIdentifier = 14; @@ -61,7 +60,6 @@ message RpcIndexQuery { RpcSpecLimit year = 6; RpcSpecLimit size = 7; RpcSpecLimit rank = 8; - RpcSpecLimit domainCount = 9; RpcQueryLimits queryLimits = 10; string queryStrategy = 11; // Named query configuration RpcResultRankingParameters parameters = 12; diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryParser.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryParser.java index 27ad1cbc..bbaf5c87 100644 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryParser.java +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryParser.java @@ -82,8 +82,6 @@ public class QueryParser { entity.replace(new Token(TokenType.SIZE_TERM, t.str.substring(4), t.displayStr)); } else if (t.str.startsWith("rank") && t.str.matches("rank[=><]\\d+")) { entity.replace(new Token(TokenType.RANK_TERM, t.str.substring(4), t.displayStr)); - } else if (t.str.startsWith("count") && t.str.matches("count[=><]\\d+")) { - entity.replace(new Token(TokenType.DOMAIN_COUNT_TERM, t.str.substring(5), t.displayStr)); } else if (t.str.startsWith("qs=")) { entity.replace(new Token(TokenType.QS_TERM, t.str.substring(3), t.displayStr)); } else if (t.str.contains(":")) { diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/token/Token.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/token/Token.java index b973d777..06c28972 100644 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/token/Token.java +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/token/Token.java @@ -40,7 +40,6 @@ public class Token { case YEAR_TERM: visitor.onYearTerm(this); break; case RANK_TERM: visitor.onRankTerm(this); break; - case DOMAIN_COUNT_TERM: visitor.onDomainCountTerm(this); break; case SIZE_TERM: visitor.onSizeTerm(this); break; case QS_TERM: visitor.onQsTerm(this); break; diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/token/TokenType.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/token/TokenType.java index 8c542e5a..85d55c35 100644 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/token/TokenType.java +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/token/TokenType.java @@ -16,7 +16,6 @@ public enum TokenType implements Predicate { YEAR_TERM, SIZE_TERM, RANK_TERM, - DOMAIN_COUNT_TERM, NEAR_TERM, QS_TERM, diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/token/TokenVisitor.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/token/TokenVisitor.java index f6a48d4d..2e14f837 100644 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/token/TokenVisitor.java +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/token/TokenVisitor.java @@ -9,7 +9,6 @@ public interface TokenVisitor { void onYearTerm(Token token); void onSizeTerm(Token token); void onRankTerm(Token token); - void onDomainCountTerm(Token token); void onQualityTerm(Token token); void onQsTerm(Token token); } diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QueryFactory.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QueryFactory.java index 4ed97afb..f641e52f 100644 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QueryFactory.java +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QueryFactory.java @@ -127,7 +127,6 @@ public class QueryFactory { .subqueries(subqueries) .humanQuery(query) .quality(qualityLimits.qualityLimit) - .domainCount(qualityLimits.domainCount) .year(qualityLimits.year) .size(qualityLimits.size) .rank(qualityLimits.rank) diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QueryLimitsAccumulator.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QueryLimitsAccumulator.java index 4d230650..1b49bab3 100644 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QueryLimitsAccumulator.java +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QueryLimitsAccumulator.java @@ -11,7 +11,6 @@ public class QueryLimitsAccumulator implements TokenVisitor { public SpecificationLimit year; public SpecificationLimit size; public SpecificationLimit rank; - public SpecificationLimit domainCount; public QueryStrategy queryStrategy = QueryStrategy.AUTO; @@ -20,7 +19,6 @@ public class QueryLimitsAccumulator implements TokenVisitor { year = params.year(); size = params.size(); rank = params.rank(); - domainCount = params.domainCount(); } private SpecificationLimit parseSpecificationLimit(String str) { @@ -67,11 +65,6 @@ public class QueryLimitsAccumulator implements TokenVisitor { rank = parseSpecificationLimit(token.str); } - @Override - public void onDomainCountTerm(Token token) { - domainCount = parseSpecificationLimit(token.str); - } - @Override public void onQualityTerm(Token token) { qualityLimit = parseSpecificationLimit(token.str); diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QuerySearchTermsAccumulator.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QuerySearchTermsAccumulator.java index 00991eac..e4def0d0 100644 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QuerySearchTermsAccumulator.java +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QuerySearchTermsAccumulator.java @@ -103,8 +103,6 @@ public class QuerySearchTermsAccumulator implements TokenVisitor { @Override public void onRankTerm(Token token) {} @Override - public void onDomainCountTerm(Token token) {} - @Override public void onQualityTerm(Token token) {} @Override public void onQsTerm(Token token) {} diff --git a/code/functions/search-query/test/nu/marginalia/query/svc/QueryFactoryTest.java b/code/functions/search-query/test/nu/marginalia/query/svc/QueryFactoryTest.java index bafb929d..fe93a1f6 100644 --- a/code/functions/search-query/test/nu/marginalia/query/svc/QueryFactoryTest.java +++ b/code/functions/search-query/test/nu/marginalia/query/svc/QueryFactoryTest.java @@ -48,7 +48,6 @@ public class QueryFactoryTest { SpecificationLimit.none(), SpecificationLimit.none(), SpecificationLimit.none(), - SpecificationLimit.none(), null, new QueryLimits(100, 100, 100, 100), "NONE", diff --git a/code/index/java/nu/marginalia/index/IndexGrpcService.java b/code/index/java/nu/marginalia/index/IndexGrpcService.java index b210c851..f7821276 100644 --- a/code/index/java/nu/marginalia/index/IndexGrpcService.java +++ b/code/index/java/nu/marginalia/index/IndexGrpcService.java @@ -79,7 +79,6 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase { private final IndexQueryService indexQueryService; private final IndexResultValuatorService resultValuator; - private final int nodeId; private final String nodeName; private final int indexValuationThreads = Integer.getInteger("index.valuationThreads", 8); @@ -91,7 +90,7 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase { IndexQueryService indexQueryService, IndexResultValuatorService resultValuator) { - this.nodeId = serviceConfiguration.node(); + var nodeId = serviceConfiguration.node(); this.nodeName = Integer.toString(nodeId); this.index = index; this.searchSetsService = searchSetsService; @@ -107,6 +106,8 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase { try { var params = new SearchParameters(request, getSearchSet(request)); + long endTime = System.currentTimeMillis() + request.getQueryLimits().getTimeoutMs(); + SearchResultSet results = wmsa_query_time .labels(nodeName, "GRPC") .time(() -> { @@ -119,7 +120,7 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase { .labels(nodeName, "GRPC") .set(params.getDataCost()); - if (!params.hasTimeLeft()) { + if (System.currentTimeMillis() >= endTime) { wmsa_query_timeouts .labels(nodeName, "GRPC") .inc(); diff --git a/code/index/java/nu/marginalia/index/model/QueryParams.java b/code/index/java/nu/marginalia/index/model/QueryParams.java index 87774347..56e40551 100644 --- a/code/index/java/nu/marginalia/index/model/QueryParams.java +++ b/code/index/java/nu/marginalia/index/model/QueryParams.java @@ -10,8 +10,6 @@ import nu.marginalia.index.query.limit.SpecificationLimit; * @param year The year limit. * @param size The size limit. Eliminates results from domains that do not satisfy the size criteria. * @param rank The rank limit. Eliminates results from domains that do not satisfy the domain rank criteria. - * @param domainCount The domain count limit. Filters out results from domains that do not contain enough - * documents that match the query. * @param searchSet The search set. Limits the search to a set of domains. * @param queryStrategy The query strategy. May impose additional constraints on the query, such as requiring * the keywords to appear in the title, or in the domain. @@ -20,7 +18,6 @@ public record QueryParams(SpecificationLimit qualityLimit, SpecificationLimit year, SpecificationLimit size, SpecificationLimit rank, - SpecificationLimit domainCount, SearchSet searchSet, QueryStrategy queryStrategy ) diff --git a/code/index/java/nu/marginalia/index/model/SearchParameters.java b/code/index/java/nu/marginalia/index/model/SearchParameters.java index 7d440563..0594bd68 100644 --- a/code/index/java/nu/marginalia/index/model/SearchParameters.java +++ b/code/index/java/nu/marginalia/index/model/SearchParameters.java @@ -52,7 +52,6 @@ public class SearchParameters { specsSet.year, specsSet.size, specsSet.rank, - specsSet.domainCount, searchSet, specsSet.queryStrategy); @@ -80,7 +79,6 @@ public class SearchParameters { convertSpecLimit(request.getYear()), convertSpecLimit(request.getSize()), convertSpecLimit(request.getRank()), - convertSpecLimit(request.getDomainCount()), searchSet, QueryStrategy.valueOf(request.getQueryStrategy())); diff --git a/code/index/java/nu/marginalia/index/results/IndexResultValuatorService.java b/code/index/java/nu/marginalia/index/results/IndexResultValuatorService.java index 9fa1713a..9251a5d2 100644 --- a/code/index/java/nu/marginalia/index/results/IndexResultValuatorService.java +++ b/code/index/java/nu/marginalia/index/results/IndexResultValuatorService.java @@ -19,6 +19,7 @@ import org.slf4j.LoggerFactory; import java.sql.SQLException; import java.util.*; +import java.util.function.Consumer; import java.util.stream.Collectors; @Singleton @@ -77,33 +78,15 @@ public class IndexResultValuatorService { for (var item : results) { if (domainCountFilter.test(item)) { - resultsList.add(item); + // It's important that this filter runs across all results, not just the top N + if (resultsList.size() < params.limitTotal) { + resultsList.add(item); + } } } - if (!params.queryParams.domainCount().isNone()) { - // Remove items that don't meet the domain count requirement - // This isn't perfect because the domain count is calculated - // after the results are sorted - resultsList.removeIf(item -> !params.queryParams.domainCount().test(domainCountFilter.getCount(item))); - } - - if (resultsList.size() > params.limitTotal) { - // This can't be made a stream limit() operation because we need domainCountFilter - // to run over the entire list to provide accurate statistics - - resultsList.subList(params.limitTotal, resultsList.size()).clear(); - } - - // populate results with the total number of results encountered from - // the same domain so this information can be presented to the user - for (var result : resultsList) { - result.resultsFromDomain = domainCountFilter.getCount(result); - } - - LongArrayList idsList = new LongArrayList(resultsList.size()); - for (var result : resultsList) { - idsList.add(result.getCombinedId()); + for (var item : resultsList) { + item.resultsFromDomain = domainCountFilter.getCount(item); } return decorateAndRerank(resultsList, rankingContext); @@ -125,23 +108,19 @@ public class IndexResultValuatorService { for (var item : documentDbReader.getUrlDetails(idsList)) urlDetailsById.put(item.urlId(), item); - List decoratedItems = new ArrayList<>(); + List resultItems = new ArrayList<>(rawResults.size()); for (var result : rawResults) { - var docData = urlDetailsById.get(result.getDocumentId()); + var id = result.getDocumentId(); + var docData = urlDetailsById.get(id); - if (null == docData) { - logger.warn("No data for document id {}", result.getDocumentId()); + if (docData == null) { + logger.warn("No document data for id {}", id); continue; } - decoratedItems.add(createCombinedItem(result, docData, rankingContext)); + resultItems.add(createCombinedItem(result, docData, rankingContext)); } - - if (decoratedItems.size() != rawResults.size()) - logger.warn("Result list shrunk during decoration?"); - - decoratedItems.sort(Comparator.naturalOrder()); - return decoratedItems; + return resultItems; } private DecoratedSearchResultItem createCombinedItem(SearchResultItem result, diff --git a/code/index/test/nu/marginalia/index/IndexQueryServiceIntegrationSmokeTest.java b/code/index/test/nu/marginalia/index/IndexQueryServiceIntegrationSmokeTest.java index 0370898f..634481f4 100644 --- a/code/index/test/nu/marginalia/index/IndexQueryServiceIntegrationSmokeTest.java +++ b/code/index/test/nu/marginalia/index/IndexQueryServiceIntegrationSmokeTest.java @@ -120,7 +120,6 @@ public class IndexQueryServiceIntegrationSmokeTest { .quality(SpecificationLimit.none()) .size(SpecificationLimit.none()) .rank(SpecificationLimit.none()) - .domainCount(SpecificationLimit.none()) .rankingParams(ResultRankingParameters.sensibleDefaults()) .domains(new ArrayList<>()) .searchSetIdentifier("NONE") @@ -164,7 +163,6 @@ public class IndexQueryServiceIntegrationSmokeTest { .quality(SpecificationLimit.none()) .size(SpecificationLimit.none()) .rank(SpecificationLimit.none()) - .domainCount(SpecificationLimit.none()) .rankingParams(ResultRankingParameters.sensibleDefaults()) .queryStrategy(QueryStrategy.SENTENCE) .domains(List.of(2)) @@ -201,7 +199,6 @@ public class IndexQueryServiceIntegrationSmokeTest { .year(SpecificationLimit.equals(1998)) .size(SpecificationLimit.none()) .rank(SpecificationLimit.none()) - .domainCount(SpecificationLimit.none()) .queryStrategy(QueryStrategy.SENTENCE) .searchSetIdentifier("NONE") .rankingParams(ResultRankingParameters.sensibleDefaults()) diff --git a/code/index/test/nu/marginalia/index/IndexQueryServiceIntegrationTest.java b/code/index/test/nu/marginalia/index/IndexQueryServiceIntegrationTest.java index 3431c18e..6def5bbc 100644 --- a/code/index/test/nu/marginalia/index/IndexQueryServiceIntegrationTest.java +++ b/code/index/test/nu/marginalia/index/IndexQueryServiceIntegrationTest.java @@ -422,7 +422,6 @@ public class IndexQueryServiceIntegrationTest { .quality(SpecificationLimit.none()) .size(SpecificationLimit.none()) .rank(SpecificationLimit.none()) - .domainCount(SpecificationLimit.none()) .rankingParams(ResultRankingParameters.sensibleDefaults()) .domains(new ArrayList<>()) .searchSetIdentifier("NONE") diff --git a/code/services-application/search-service/java/nu/marginalia/search/SearchQueryParamFactory.java b/code/services-application/search-service/java/nu/marginalia/search/SearchQueryParamFactory.java index 100e62b6..15c8567e 100644 --- a/code/services-application/search-service/java/nu/marginalia/search/SearchQueryParamFactory.java +++ b/code/services-application/search-service/java/nu/marginalia/search/SearchQueryParamFactory.java @@ -33,7 +33,6 @@ public class SearchQueryParamFactory { profile.getYearLimit(), profile.getSizeLimit(), SpecificationLimit.none(), - SpecificationLimit.none(), List.of(), new QueryLimits(5, 100, 200, 8192), profile.searchSetIdentifier.name(), @@ -54,7 +53,6 @@ public class SearchQueryParamFactory { SpecificationLimit.none(), SpecificationLimit.none(), SpecificationLimit.none(), - SpecificationLimit.none(), List.of(), new QueryLimits(count, count, 100, 512), SearchSetIdentifier.NONE.name(), @@ -74,7 +72,6 @@ public class SearchQueryParamFactory { SpecificationLimit.none(), SpecificationLimit.none(), SpecificationLimit.none(), - SpecificationLimit.none(), List.of(), new QueryLimits(100, 100, 100, 512), SearchSetIdentifier.NONE.name(), @@ -94,7 +91,6 @@ public class SearchQueryParamFactory { SpecificationLimit.none(), SpecificationLimit.none(), SpecificationLimit.none(), - SpecificationLimit.none(), List.of(), new QueryLimits(100, 100, 100, 512), SearchSetIdentifier.NONE.name(),