(search) Experimental support for clustering search results

Improves clustering of results.
This commit is contained in:
Viktor Lofgren 2024-02-11 19:48:48 +01:00
parent bcd0dabb92
commit a77846373b
6 changed files with 46 additions and 52 deletions

View file

@ -104,7 +104,7 @@ public class SearchOperator {
String evalResult = getFutureOrDefault(eval, "");
List<ClusteredUrlDetails> clusteredResults = SearchResultClusterer
.selectStrategy(queryResponse, userParams)
.selectStrategy(queryResponse)
.clusterResults(queryResults, 25);
return DecoratedSearchResults.builder()

View file

@ -16,14 +16,11 @@ public class SearchResultClusterer {
List<ClusteredUrlDetails> clusterResults(List<UrlDetails> results, int total);
}
public static SearchResultClusterStrategy selectStrategy(QueryResponse response, SearchParameters params) {
public static SearchResultClusterStrategy selectStrategy(QueryResponse response) {
if (response.domain() != null && !response.domain().isBlank())
return SearchResultClusterer::noOp;
if (params.profile().clusterResults())
return SearchResultClusterer::byDomain;
return SearchResultClusterer::clusterThenSplit;
}
/** No clustering, just return the results as is */
@ -55,23 +52,4 @@ public class SearchResultClusterer {
.toList();
}
/**
 * Groups the results by their domain id, builds one cluster per domain,
 * breaks every cluster apart again into single-result clusters, and
 * returns at most {@code total} of them in their natural sort order.
 *
 * @param results the results to cluster; an empty list yields an empty list
 * @param total   the maximum number of clusters to return
 * @return the sorted, truncated list of single-result clusters
 */
private static List<ClusteredUrlDetails> clusterThenSplit(List<UrlDetails> results, int total) {
    if (results.isEmpty()) {
        return List.of();
    }

    // One bucket of results per domain id
    var resultsByDomain = results.stream()
            .collect(Collectors.groupingBy(details -> details.domainId));

    return resultsByDomain.values().stream()
            .map(ClusteredUrlDetails::new)
            // forEachSingle emits each member of the cluster as its own singleton cluster
            .mapMulti(ClusteredUrlDetails::forEachSingle)
            .sorted()
            .limit(total)
            .toList();
}
}

View file

@ -2,10 +2,12 @@ package nu.marginalia.search.model;
import lombok.Getter;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.idx.WordFlags;
import org.jetbrains.annotations.NotNull;
import java.util.*;
import java.util.function.Consumer;
import java.util.stream.Collectors;
/** A class to hold a list of UrlDetails, grouped by domain, where the first one is the main result
* and the rest are additional results, for summary display. */
@ -34,12 +36,29 @@ public class ClusteredUrlDetails implements Comparable<ClusteredUrlDetails> {
this.rest = queue
.stream()
.filter(this::isEligbleForInclusion)
.takeWhile(next -> next.termScore <= scoreLimit)
.toList();
}
}
/**
 * Decides whether a result may be listed among the additional results of a cluster.
 * <p>
 * Keyword scores whose keyword contains a ':' are ignored. The remaining scores are
 * combined per subquery: a subquery qualifies only if every one of its keywords carries
 * at least one of the Title, ExternalLink, UrlDomain, UrlPath or Subjects flags.
 *
 * @param urlDetails the candidate result
 * @return true if at least one subquery qualifies; false otherwise, including when
 *         no keyword scores remain after filtering
 */
private boolean isEligbleForInclusion(UrlDetails urlDetails) {
    // subquery -> "all of its keywords have one of the flags of interest"
    var allKeywordsFlaggedBySubquery = urlDetails.resultItem.keywordScores.stream()
            .filter(score -> !score.keyword.contains(":"))
            .collect(Collectors.toMap(
                    score -> score.subquery,
                    // non-short-circuit OR, so every flag is always queried
                    score -> score.hasTermFlag(WordFlags.Title)
                            | score.hasTermFlag(WordFlags.ExternalLink)
                            | score.hasTermFlag(WordFlags.UrlDomain)
                            | score.hasTermFlag(WordFlags.UrlPath)
                            | score.hasTermFlag(WordFlags.Subjects),
                    Boolean::logicalAnd
            ));

    return allKeywordsFlaggedBySubquery.containsValue(Boolean.TRUE);
}
public ClusteredUrlDetails(@NotNull UrlDetails onlyFirst) {
this.first = onlyFirst;
this.rest = Collections.emptyList();
@ -53,17 +72,6 @@ public class ClusteredUrlDetails implements Comparable<ClusteredUrlDetails> {
@Getter
public final List<UrlDetails> rest;
/**
 * Feeds the consumer one single-result cluster per result held by this cluster.
 * <p>
 * When there are no additional results, this instance itself is passed on;
 * otherwise a new singleton cluster is created for the first result, followed
 * by one for each of the additional results in order.
 *
 * @param consumer receives the single-result clusters
 */
public void forEachSingle(Consumer<ClusteredUrlDetails> consumer) {
    if (rest.isEmpty()) {
        // Already a singleton, pass it along unchanged
        consumer.accept(this);
        return;
    }

    consumer.accept(new ClusteredUrlDetails(first));
    for (UrlDetails additional : rest) {
        consumer.accept(new ClusteredUrlDetails(additional));
    }
}
public void splitSmallClusters(Consumer<ClusteredUrlDetails> consumer) {
if (rest.isEmpty())
consumer.accept(this);

View file

@ -101,15 +101,5 @@ public enum SearchProfile {
else return SpecificationLimit.none();
}
/**
 * Reports whether this profile is one of FORUM, WIKI or DOCS.
 *
 * @return true for FORUM, WIKI and DOCS; false for every other profile
 */
public boolean clusterResults() {
    return this == FORUM
        || this == WIKI
        || this == DOCS;
}
}

View file

@ -1,4 +1,11 @@
<section class="card search-result">
<section data-ms-rank="{{first.matchRank}}" class="card search-result" >
{{#with first}}
<div class="url"><a rel="nofollow external" href="{{url}}">{{url}}</a></div>
<h2> <a tabindex="-1" class="title" rel="nofollow external" href="{{url}}">{{title}}</a> </h2>
<p class="description">{{description}}</p>
{{/with}}
<div class="utils">
Also from <a href="/site/{{first.url.domain}}">{{first.url.domain}}</a>
</div>
@ -7,11 +14,17 @@
<li><a href="{{url}}">{{title}}</a></li>
{{/each}}
</ul>
{{#if remainingCount}}
{{#with first}}
<div class="utils">
<a href="/site-search/{{first.url.domain}}/{{query}}?profile={{profile}}">{{remainingCount}}</a> more</li>
<a href="/site/{{url.domain}}" title="Domain Information">Info</a>
<a href="/site-search/{{url.domain}}/{{query}}?profile={{profile}}" title="More results from this domain">{{resultsFromSameDomain}}+</a>
<div class="meta">
{{#if problemCount}} <span class="problems" title="{{problems}}"> ⚠ {{problemCount}} </span> {{/if}}
<span aria-hidden="true" class="meta positions"
title="Positions where keywords were found within the document">{{positions}}</span>
</div>
</div>
{{/if}}
{{/with}}
</section>
<hr class="w3m-helper" />

View file

@ -25,8 +25,13 @@
</div>
{{/if}}
{{#each results}}
{{#with first}} {{>search/parts/search-result}} {{/with}}
{{#if hasMultiple}} {{>search/parts/search-result-rest}} {{/if}}
{{#if hasMultiple}}
{{>search/parts/search-result-rest}}
{{else}}
{{#with first}}
{{>search/parts/search-result}}
{{/with}}
{{/if}}
{{/each}}
</section>