(search) Experimental support for clustering search results
Improves clustering of results.
This commit is contained in:
parent
bcd0dabb92
commit
a77846373b
@ -104,7 +104,7 @@ public class SearchOperator {
|
|||||||
String evalResult = getFutureOrDefault(eval, "");
|
String evalResult = getFutureOrDefault(eval, "");
|
||||||
|
|
||||||
List<ClusteredUrlDetails> clusteredResults = SearchResultClusterer
|
List<ClusteredUrlDetails> clusteredResults = SearchResultClusterer
|
||||||
.selectStrategy(queryResponse, userParams)
|
.selectStrategy(queryResponse)
|
||||||
.clusterResults(queryResults, 25);
|
.clusterResults(queryResults, 25);
|
||||||
|
|
||||||
return DecoratedSearchResults.builder()
|
return DecoratedSearchResults.builder()
|
||||||
|
@ -16,14 +16,11 @@ public class SearchResultClusterer {
|
|||||||
List<ClusteredUrlDetails> clusterResults(List<UrlDetails> results, int total);
|
List<ClusteredUrlDetails> clusterResults(List<UrlDetails> results, int total);
|
||||||
}
|
}
|
||||||
|
|
||||||
public static SearchResultClusterStrategy selectStrategy(QueryResponse response, SearchParameters params) {
|
public static SearchResultClusterStrategy selectStrategy(QueryResponse response) {
|
||||||
if (response.domain() != null && !response.domain().isBlank())
|
if (response.domain() != null && !response.domain().isBlank())
|
||||||
return SearchResultClusterer::noOp;
|
return SearchResultClusterer::noOp;
|
||||||
|
|
||||||
if (params.profile().clusterResults())
|
return SearchResultClusterer::byDomain;
|
||||||
return SearchResultClusterer::byDomain;
|
|
||||||
|
|
||||||
return SearchResultClusterer::clusterThenSplit;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/** No clustering, just return the results as is */
|
/** No clustering, just return the results as is */
|
||||||
@ -55,23 +52,4 @@ public class SearchResultClusterer {
|
|||||||
.toList();
|
.toList();
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Cluster the results by domain to find the best result for each domain,
|
|
||||||
* then split the clusters into singletons, and return the top "total" clusters
|
|
||||||
*
* Results are grouped on their domainId, each group is turned into a cluster,
* every cluster is flattened via forEachSingle, and the resulting
* singletons are sorted by ClusteredUrlDetails' natural (Comparable) order.
*
* @param results the results to cluster; an empty list yields an empty list
* @param total the maximum number of clusters to return
* @return at most "total" clusters, in sorted order
*/
|
|
||||||
private static List<ClusteredUrlDetails> clusterThenSplit(List<UrlDetails> results, int total) {
|
|
||||||
if (results.isEmpty())
|
|
||||||
return List.of();
|
|
||||||
|
|
||||||
return results.stream()
|
|
||||||
.collect(
|
|
||||||
Collectors.groupingBy(details -> details.domainId)
|
|
||||||
)
|
|
||||||
.values().stream()
|
|
||||||
.map(ClusteredUrlDetails::new)
|
|
||||||
.mapMulti(ClusteredUrlDetails::forEachSingle)
|
|
||||||
.sorted()
|
|
||||||
.limit(total)
|
|
||||||
.toList();
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -2,10 +2,12 @@ package nu.marginalia.search.model;
|
|||||||
|
|
||||||
import lombok.Getter;
|
import lombok.Getter;
|
||||||
import nu.marginalia.model.EdgeDomain;
|
import nu.marginalia.model.EdgeDomain;
|
||||||
|
import nu.marginalia.model.idx.WordFlags;
|
||||||
import org.jetbrains.annotations.NotNull;
|
import org.jetbrains.annotations.NotNull;
|
||||||
|
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
import java.util.function.Consumer;
|
import java.util.function.Consumer;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
/** A class to hold a list of UrlDetails, grouped by domain, where the first one is the main result
|
/** A class to hold a list of UrlDetails, grouped by domain, where the first one is the main result
|
||||||
* and the rest are additional results, for summary display. */
|
* and the rest are additional results, for summary display. */
|
||||||
@ -34,12 +36,29 @@ public class ClusteredUrlDetails implements Comparable<ClusteredUrlDetails> {
|
|||||||
|
|
||||||
this.rest = queue
|
this.rest = queue
|
||||||
.stream()
|
.stream()
|
||||||
|
.filter(this::isEligbleForInclusion)
|
||||||
.takeWhile(next -> next.termScore <= scoreLimit)
|
.takeWhile(next -> next.termScore <= scoreLimit)
|
||||||
.toList();
|
.toList();
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Decides whether a result may be listed among the additional ("rest") results of a cluster.
 *
 * Keyword scores whose keyword contains ':' are ignored. The remaining scores are
 * grouped by subquery; a subquery qualifies only if every one of its keywords has at
 * least one of the Title, ExternalLink, UrlDomain, UrlPath or Subjects flags (the merge
 * function ANDs the per-keyword outcomes together).
 *
 * @param urlDetails the candidate result whose keyword scores are inspected
 * @return true if at least one subquery qualifies; false otherwise, including when
 *         no keyword scores remain after filtering
 */
private boolean isEligbleForInclusion(UrlDetails urlDetails) {
|
||||||
|
return urlDetails.resultItem.keywordScores.stream()
|
||||||
|
.filter(score -> !score.keyword.contains(":"))
|
||||||
|
.collect(Collectors.toMap(
|
||||||
|
score -> score.subquery,
|
||||||
|
score -> score.hasTermFlag(WordFlags.Title)
|
||||||
|
| score.hasTermFlag(WordFlags.ExternalLink)
|
||||||
|
| score.hasTermFlag(WordFlags.UrlDomain)
|
||||||
|
| score.hasTermFlag(WordFlags.UrlPath)
|
||||||
|
| score.hasTermFlag(WordFlags.Subjects)
|
||||||
|
,
|
||||||
|
(a, b) -> a && b
|
||||||
|
))
|
||||||
|
.containsValue(Boolean.TRUE);
|
||||||
|
}
|
||||||
|
|
||||||
public ClusteredUrlDetails(@NotNull UrlDetails onlyFirst) {
|
public ClusteredUrlDetails(@NotNull UrlDetails onlyFirst) {
|
||||||
this.first = onlyFirst;
|
this.first = onlyFirst;
|
||||||
this.rest = Collections.emptyList();
|
this.rest = Collections.emptyList();
|
||||||
@ -53,17 +72,6 @@ public class ClusteredUrlDetails implements Comparable<ClusteredUrlDetails> {
|
|||||||
@Getter
|
@Getter
|
||||||
public final List<UrlDetails> rest;
|
public final List<UrlDetails> rest;
|
||||||
|
|
||||||
/** Feeds every result held by this cluster to the consumer as a cluster of its own.
 *
 * When there are no additional results, this instance is passed along unchanged;
 * otherwise a new single-result cluster is created for the first result and for
 * each entry in rest, in that order.
 *
 * @param consumer receives one single-result cluster per result in this cluster
 */
public void forEachSingle(Consumer<ClusteredUrlDetails> consumer) {
|
|
||||||
if (rest.isEmpty())
|
|
||||||
consumer.accept(this);
|
|
||||||
else {
|
|
||||||
consumer.accept(new ClusteredUrlDetails(first));
|
|
||||||
rest.stream()
|
|
||||||
.map(ClusteredUrlDetails::new)
|
|
||||||
.forEach(consumer);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
public void splitSmallClusters(Consumer<ClusteredUrlDetails> consumer) {
|
public void splitSmallClusters(Consumer<ClusteredUrlDetails> consumer) {
|
||||||
if (rest.isEmpty())
|
if (rest.isEmpty())
|
||||||
consumer.accept(this);
|
consumer.accept(this);
|
||||||
|
@ -101,15 +101,5 @@ public enum SearchProfile {
|
|||||||
else return SpecificationLimit.none();
|
else return SpecificationLimit.none();
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Tells whether search results for this profile should be clustered.
 *
 * @return true for the FORUM, WIKI and DOCS profiles, false for every other profile
 */
public boolean clusterResults() {
|
|
||||||
if (this == FORUM)
|
|
||||||
return true;
|
|
||||||
if (this == WIKI)
|
|
||||||
return true;
|
|
||||||
if (this == DOCS)
|
|
||||||
return true;
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1,4 +1,11 @@
|
|||||||
<section class="card search-result">
|
|
||||||
|
<section data-ms-rank="{{first.matchRank}}" class="card search-result" >
|
||||||
|
{{#with first}}
|
||||||
|
<div class="url"><a rel="nofollow external" href="{{url}}">{{url}}</a></div>
|
||||||
|
<h2> <a tabindex="-1" class="title" rel="nofollow external" href="{{url}}">{{title}}</a> </h2>
|
||||||
|
<p class="description">{{description}}</p>
|
||||||
|
|
||||||
|
{{/with}}
|
||||||
<div class="utils">
|
<div class="utils">
|
||||||
Also from <a href="/site/{{first.url.domain}}">{{first.url.domain}}</a>
|
Also from <a href="/site/{{first.url.domain}}">{{first.url.domain}}</a>
|
||||||
</div>
|
</div>
|
||||||
@ -7,11 +14,17 @@
|
|||||||
<li><a href="{{url}}">{{title}}</a></li>
|
<li><a href="{{url}}">{{title}}</a></li>
|
||||||
{{/each}}
|
{{/each}}
|
||||||
</ul>
|
</ul>
|
||||||
{{#if remainingCount}}
|
{{#with first}}
|
||||||
<div class="utils">
|
<div class="utils">
|
||||||
<a href="/site-search/{{first.url.domain}}/{{query}}?profile={{profile}}">{{remainingCount}}</a> more</li>
|
<a href="/site/{{url.domain}}" title="Domain Information">Info</a>
|
||||||
|
<a href="/site-search/{{url.domain}}/{{query}}?profile={{profile}}" title="More results from this domain">{{resultsFromSameDomain}}+</a>
|
||||||
|
<div class="meta">
|
||||||
|
{{#if problemCount}} <span class="problems" title="{{problems}}"> ⚠ {{problemCount}} </span> {{/if}}
|
||||||
|
<span aria-hidden="true" class="meta positions"
|
||||||
|
title="Positions where keywords were found within the document">{{positions}}</span>
|
||||||
|
</div>
|
||||||
</div>
|
</div>
|
||||||
{{/if}}
|
{{/with}}
|
||||||
</section>
|
</section>
|
||||||
|
|
||||||
<hr class="w3m-helper" />
|
<hr class="w3m-helper" />
|
@ -25,8 +25,13 @@
|
|||||||
</div>
|
</div>
|
||||||
{{/if}}
|
{{/if}}
|
||||||
{{#each results}}
|
{{#each results}}
|
||||||
{{#with first}} {{>search/parts/search-result}} {{/with}}
|
{{#if hasMultiple}}
|
||||||
{{#if hasMultiple}} {{>search/parts/search-result-rest}} {{/if}}
|
{{>search/parts/search-result-rest}}
|
||||||
|
{{else}}
|
||||||
|
{{#with first}}
|
||||||
|
{{>search/parts/search-result}}
|
||||||
|
{{/with}}
|
||||||
|
{{/if}}
|
||||||
{{/each}}
|
{{/each}}
|
||||||
</section>
|
</section>
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user