From a77846373b887b64a5b112b0ba130a32fb92a0af Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Sun, 11 Feb 2024 19:48:48 +0100 Subject: [PATCH] (search) Experimental support for clustering search results Improves clustering of results. --- .../nu/marginalia/search/SearchOperator.java | 2 +- .../search/SearchResultClusterer.java | 26 ++-------------- .../search/model/ClusteredUrlDetails.java | 30 ++++++++++++------- .../search/model/SearchProfile.java | 10 ------- .../search/parts/search-result-rest.hdb | 21 ++++++++++--- .../templates/search/search-results.hdb | 9 ++++-- 6 files changed, 46 insertions(+), 52 deletions(-) diff --git a/code/services-application/search-service/src/main/java/nu/marginalia/search/SearchOperator.java b/code/services-application/search-service/src/main/java/nu/marginalia/search/SearchOperator.java index f87ade26..b3597950 100644 --- a/code/services-application/search-service/src/main/java/nu/marginalia/search/SearchOperator.java +++ b/code/services-application/search-service/src/main/java/nu/marginalia/search/SearchOperator.java @@ -104,7 +104,7 @@ public class SearchOperator { String evalResult = getFutureOrDefault(eval, ""); List clusteredResults = SearchResultClusterer - .selectStrategy(queryResponse, userParams) + .selectStrategy(queryResponse) .clusterResults(queryResults, 25); return DecoratedSearchResults.builder() diff --git a/code/services-application/search-service/src/main/java/nu/marginalia/search/SearchResultClusterer.java b/code/services-application/search-service/src/main/java/nu/marginalia/search/SearchResultClusterer.java index 4c57ea8f..a2869879 100644 --- a/code/services-application/search-service/src/main/java/nu/marginalia/search/SearchResultClusterer.java +++ b/code/services-application/search-service/src/main/java/nu/marginalia/search/SearchResultClusterer.java @@ -16,14 +16,11 @@ public class SearchResultClusterer { List clusterResults(List results, int total); } - public static SearchResultClusterStrategy selectStrategy(QueryResponse response, SearchParameters params) { + public static SearchResultClusterStrategy selectStrategy(QueryResponse response) { if (response.domain() != null && !response.domain().isBlank()) return SearchResultClusterer::noOp; - if (params.profile().clusterResults()) - return SearchResultClusterer::byDomain; - - return SearchResultClusterer::clusterThenSplit; + return SearchResultClusterer::byDomain; } /** No clustering, just return the results as is */ @@ -55,23 +52,4 @@ public class SearchResultClusterer { .toList(); } - /** Cluster the results by domain to find the best result for each domain, - * then split the clusters into singletons, and return the top "total" clusters - */ - private static List clusterThenSplit(List results, int total) { - if (results.isEmpty()) - return List.of(); - - return results.stream() - .collect( - Collectors.groupingBy(details -> details.domainId) - ) - .values().stream() - .map(ClusteredUrlDetails::new) - .mapMulti(ClusteredUrlDetails::forEachSingle) - .sorted() - .limit(total) - .toList(); - } - } diff --git a/code/services-application/search-service/src/main/java/nu/marginalia/search/model/ClusteredUrlDetails.java b/code/services-application/search-service/src/main/java/nu/marginalia/search/model/ClusteredUrlDetails.java index 16655cdd..0f9bbfaf 100644 --- a/code/services-application/search-service/src/main/java/nu/marginalia/search/model/ClusteredUrlDetails.java +++ b/code/services-application/search-service/src/main/java/nu/marginalia/search/model/ClusteredUrlDetails.java @@ -2,10 +2,12 @@ package nu.marginalia.search.model; import lombok.Getter; import nu.marginalia.model.EdgeDomain; +import nu.marginalia.model.idx.WordFlags; import org.jetbrains.annotations.NotNull; import java.util.*; import java.util.function.Consumer; +import java.util.stream.Collectors; /** A class to hold a list of UrlDetails, grouped by domain, where the first one is the main result * and the rest are additional results, for summary display. */ @@ -34,12 +36,29 @@ public class ClusteredUrlDetails implements Comparable { this.rest = queue .stream() + .filter(this::isEligbleForInclusion) .takeWhile(next -> next.termScore <= scoreLimit) .toList(); } } + private boolean isEligbleForInclusion(UrlDetails urlDetails) { + return urlDetails.resultItem.keywordScores.stream() + .filter(score -> !score.keyword.contains(":")) + .collect(Collectors.toMap( + score -> score.subquery, + score -> score.hasTermFlag(WordFlags.Title) + | score.hasTermFlag(WordFlags.ExternalLink) + | score.hasTermFlag(WordFlags.UrlDomain) + | score.hasTermFlag(WordFlags.UrlPath) + | score.hasTermFlag(WordFlags.Subjects) + , + (a, b) -> a && b + )) + .containsValue(Boolean.TRUE); + } + public ClusteredUrlDetails(@NotNull UrlDetails onlyFirst) { this.first = onlyFirst; this.rest = Collections.emptyList(); @@ -53,17 +72,6 @@ public class ClusteredUrlDetails implements Comparable { @Getter public final List rest; - public void forEachSingle(Consumer consumer) { - if (rest.isEmpty()) - consumer.accept(this); - else { - consumer.accept(new ClusteredUrlDetails(first)); - rest.stream() - .map(ClusteredUrlDetails::new) - .forEach(consumer); - } - } - public void splitSmallClusters(Consumer consumer) { if (rest.isEmpty()) consumer.accept(this); diff --git a/code/services-application/search-service/src/main/java/nu/marginalia/search/model/SearchProfile.java b/code/services-application/search-service/src/main/java/nu/marginalia/search/model/SearchProfile.java index 9a40e65c..85fafa8f 100644 --- a/code/services-application/search-service/src/main/java/nu/marginalia/search/model/SearchProfile.java +++ b/code/services-application/search-service/src/main/java/nu/marginalia/search/model/SearchProfile.java @@ -101,15 +101,5 @@ public enum SearchProfile { else return SpecificationLimit.none(); } - public boolean clusterResults() { - if (this == FORUM) - return true; - if (this == WIKI) - return true; - if (this == DOCS) - return true; - return false; - } - } diff --git a/code/services-application/search-service/src/main/resources/templates/search/parts/search-result-rest.hdb b/code/services-application/search-service/src/main/resources/templates/search/parts/search-result-rest.hdb index 7efc5b5e..764462d8 100644 --- a/code/services-application/search-service/src/main/resources/templates/search/parts/search-result-rest.hdb +++ b/code/services-application/search-service/src/main/resources/templates/search/parts/search-result-rest.hdb @@ -1,4 +1,11 @@ -
+ +
+{{#with first}} + +

{{title}}

+

{{description}}

+ +{{/with}} @@ -7,11 +14,17 @@
  • {{title}}
  • {{/each}} -{{#if remainingCount}} +{{#with first}}
    - {{remainingCount}} more + Info + {{resultsFromSameDomain}}+ +
    + {{#if problemCount}} ⚠ {{problemCount}} {{/if}} + +
    -{{/if}} +{{/with}}

    \ No newline at end of file diff --git a/code/services-application/search-service/src/main/resources/templates/search/search-results.hdb b/code/services-application/search-service/src/main/resources/templates/search/search-results.hdb index 197f5666..60a90a97 100644 --- a/code/services-application/search-service/src/main/resources/templates/search/search-results.hdb +++ b/code/services-application/search-service/src/main/resources/templates/search/search-results.hdb @@ -25,8 +25,13 @@ {{/if}} {{#each results}} - {{#with first}} {{>search/parts/search-result}} {{/with}} - {{#if hasMultiple}} {{>search/parts/search-result-rest}} {{/if}} + {{#if hasMultiple}} + {{>search/parts/search-result-rest}} + {{else}} + {{#with first}} + {{>search/parts/search-result}} + {{/with}} + {{/if}} {{/each}}