(search) Experimental support for clustering search results
Improves clustering of results.
This commit is contained in:
parent
bcd0dabb92
commit
a77846373b
6 changed files with 46 additions and 52 deletions
|
@ -104,7 +104,7 @@ public class SearchOperator {
|
|||
String evalResult = getFutureOrDefault(eval, "");
|
||||
|
||||
List<ClusteredUrlDetails> clusteredResults = SearchResultClusterer
|
||||
.selectStrategy(queryResponse, userParams)
|
||||
.selectStrategy(queryResponse)
|
||||
.clusterResults(queryResults, 25);
|
||||
|
||||
return DecoratedSearchResults.builder()
|
||||
|
|
|
@ -16,14 +16,11 @@ public class SearchResultClusterer {
|
|||
List<ClusteredUrlDetails> clusterResults(List<UrlDetails> results, int total);
|
||||
}
|
||||
|
||||
// Chooses the clustering strategy to apply to a query response.
// NOTE(review): this text is a rendered commit diff, so lines from the pre- and post-commit
// versions of the method are interleaved (two signature lines follow) — confirm against the
// real file which of the lines below belong to the current revision.
public static SearchResultClusterStrategy selectStrategy(QueryResponse response, SearchParameters params) {
|
||||
public static SearchResultClusterStrategy selectStrategy(QueryResponse response) {
|
||||
// A response carrying a non-null, non-blank domain() gets the no-op strategy
if (response.domain() != null && !response.domain().isBlank())
|
||||
return SearchResultClusterer::noOp;
|
||||

||||
// Profiles whose clusterResults() is true are clustered by domain
if (params.profile().clusterResults())
|
||||
return SearchResultClusterer::byDomain;
|
||||

||||
// Fallback strategy when neither branch above returned
return SearchResultClusterer::clusterThenSplit;
|
||||
}
|
||||
|
||||
/** No clustering, just return the results as is */
|
||||
|
@ -55,23 +52,4 @@ public class SearchResultClusterer {
|
|||
.toList();
|
||||
}
|
||||
|
||||
/** Cluster the results by domain to find the best result for each domain,
|
||||
* then split the clusters into singletons, and return the top "total" clusters
|
||||
*/
|
||||
private static List<ClusteredUrlDetails> clusterThenSplit(List<UrlDetails> results, int total) {
|
||||
if (results.isEmpty())
|
||||
return List.of();
|
||||
|
||||
return results.stream()
|
||||
.collect(
|
||||
Collectors.groupingBy(details -> details.domainId)
|
||||
)
|
||||
.values().stream()
|
||||
.map(ClusteredUrlDetails::new)
|
||||
.mapMulti(ClusteredUrlDetails::forEachSingle)
|
||||
.sorted()
|
||||
.limit(total)
|
||||
.toList();
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -2,10 +2,12 @@ package nu.marginalia.search.model;
|
|||
|
||||
import lombok.Getter;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
import nu.marginalia.model.idx.WordFlags;
|
||||
import org.jetbrains.annotations.NotNull;
|
||||
|
||||
import java.util.*;
|
||||
import java.util.function.Consumer;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
/** A class to hold a list of UrlDetails, grouped by domain, where the first one is the main result
|
||||
* and the rest are additional results, for summary display. */
|
||||
|
@ -34,12 +36,29 @@ public class ClusteredUrlDetails implements Comparable<ClusteredUrlDetails> {
|
|||
|
||||
this.rest = queue
|
||||
.stream()
|
||||
.filter(this::isEligbleForInclusion)
|
||||
.takeWhile(next -> next.termScore <= scoreLimit)
|
||||
.toList();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private boolean isEligbleForInclusion(UrlDetails urlDetails) {
|
||||
return urlDetails.resultItem.keywordScores.stream()
|
||||
.filter(score -> !score.keyword.contains(":"))
|
||||
.collect(Collectors.toMap(
|
||||
score -> score.subquery,
|
||||
score -> score.hasTermFlag(WordFlags.Title)
|
||||
| score.hasTermFlag(WordFlags.ExternalLink)
|
||||
| score.hasTermFlag(WordFlags.UrlDomain)
|
||||
| score.hasTermFlag(WordFlags.UrlPath)
|
||||
| score.hasTermFlag(WordFlags.Subjects)
|
||||
,
|
||||
(a, b) -> a && b
|
||||
))
|
||||
.containsValue(Boolean.TRUE);
|
||||
}
|
||||
|
||||
public ClusteredUrlDetails(@NotNull UrlDetails onlyFirst) {
|
||||
this.first = onlyFirst;
|
||||
this.rest = Collections.emptyList();
|
||||
|
@ -53,17 +72,6 @@ public class ClusteredUrlDetails implements Comparable<ClusteredUrlDetails> {
|
|||
@Getter
|
||||
public final List<UrlDetails> rest;
|
||||
|
||||
public void forEachSingle(Consumer<ClusteredUrlDetails> consumer) {
|
||||
if (rest.isEmpty())
|
||||
consumer.accept(this);
|
||||
else {
|
||||
consumer.accept(new ClusteredUrlDetails(first));
|
||||
rest.stream()
|
||||
.map(ClusteredUrlDetails::new)
|
||||
.forEach(consumer);
|
||||
}
|
||||
}
|
||||
|
||||
public void splitSmallClusters(Consumer<ClusteredUrlDetails> consumer) {
|
||||
if (rest.isEmpty())
|
||||
consumer.accept(this);
|
||||
|
|
|
@ -101,15 +101,5 @@ public enum SearchProfile {
|
|||
else return SpecificationLimit.none();
|
||||
}
|
||||
|
||||
public boolean clusterResults() {
|
||||
if (this == FORUM)
|
||||
return true;
|
||||
if (this == WIKI)
|
||||
return true;
|
||||
if (this == DOCS)
|
||||
return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
|
|
@ -1,4 +1,11 @@
|
|||
<section class="card search-result">
|
||||
|
||||
<section data-ms-rank="{{first.matchRank}}" class="card search-result" >
|
||||
{{#with first}}
|
||||
<div class="url"><a rel="nofollow external" href="{{url}}">{{url}}</a></div>
|
||||
<h2> <a tabindex="-1" class="title" rel="nofollow external" href="{{url}}">{{title}}</a> </h2>
|
||||
<p class="description">{{description}}</p>
|
||||
|
||||
{{/with}}
|
||||
<div class="utils">
|
||||
Also from <a href="/site/{{first.url.domain}}">{{first.url.domain}}</a>
|
||||
</div>
|
||||
|
@ -7,11 +14,17 @@
|
|||
<li><a href="{{url}}">{{title}}</a></li>
|
||||
{{/each}}
|
||||
</ul>
|
||||
{{#if remainingCount}}
|
||||
{{#with first}}
|
||||
<div class="utils">
|
||||
<a href="/site-search/{{first.url.domain}}/{{query}}?profile={{profile}}">{{remainingCount}}</a> more</li>
|
||||
<a href="/site/{{url.domain}}" title="Domain Information">Info</a>
|
||||
<a href="/site-search/{{url.domain}}/{{query}}?profile={{profile}}" title="More results from this domain">{{resultsFromSameDomain}}+</a>
|
||||
<div class="meta">
|
||||
{{#if problemCount}} <span class="problems" title="{{problems}}"> ⚠ {{problemCount}} </span> {{/if}}
|
||||
<span aria-hidden="true" class="meta positions"
|
||||
title="Positions where keywords were found within the document">{{positions}}</span>
|
||||
</div>
|
||||
</div>
|
||||
{{/if}}
|
||||
{{/with}}
|
||||
</section>
|
||||
|
||||
<hr class="w3m-helper" />
|
|
@ -25,8 +25,13 @@
|
|||
</div>
|
||||
{{/if}}
|
||||
{{#each results}}
|
||||
{{#with first}} {{>search/parts/search-result}} {{/with}}
|
||||
{{#if hasMultiple}} {{>search/parts/search-result-rest}} {{/if}}
|
||||
{{#if hasMultiple}}
|
||||
{{>search/parts/search-result-rest}}
|
||||
{{else}}
|
||||
{{#with first}}
|
||||
{{>search/parts/search-result}}
|
||||
{{/with}}
|
||||
{{/if}}
|
||||
{{/each}}
|
||||
</section>
|
||||
|
||||
|
|
Loading…
Reference in a new issue