(search) Experimental support for clustering search results
Adds experimental support for clustering search results, e.g. by domain. As a first stage, this is only enabled for the wiki and forum filters. The commit also cleans up the UrlDetails class, which contained a number of vestigial entries.
This commit is contained in:
parent
9d68062553
commit
bcd0dabb92
@ -11,11 +11,8 @@ import nu.marginalia.db.DbDomainQueries;
|
||||
import nu.marginalia.query.client.QueryClient;
|
||||
import nu.marginalia.query.model.QueryResponse;
|
||||
import nu.marginalia.search.command.SearchParameters;
|
||||
import nu.marginalia.search.model.SearchFilters;
|
||||
import nu.marginalia.search.model.SearchProfile;
|
||||
import nu.marginalia.search.model.UrlDetails;
|
||||
import nu.marginalia.search.model.*;
|
||||
import nu.marginalia.client.Context;
|
||||
import nu.marginalia.search.model.DecoratedSearchResults;
|
||||
import nu.marginalia.search.svc.SearchQueryIndexService;
|
||||
import nu.marginalia.search.svc.SearchUnitConversionService;
|
||||
import org.apache.logging.log4j.util.Strings;
|
||||
@ -106,11 +103,15 @@ public class SearchOperator {
|
||||
|
||||
String evalResult = getFutureOrDefault(eval, "");
|
||||
|
||||
List<ClusteredUrlDetails> clusteredResults = SearchResultClusterer
|
||||
.selectStrategy(queryResponse, userParams)
|
||||
.clusterResults(queryResults, 25);
|
||||
|
||||
return DecoratedSearchResults.builder()
|
||||
.params(userParams)
|
||||
.problems(getProblems(ctx, evalResult, queryResults, queryResponse))
|
||||
.evalResult(evalResult)
|
||||
.results(queryResults)
|
||||
.results(clusteredResults)
|
||||
.filters(new SearchFilters(websiteUrl, userParams))
|
||||
.focusDomain(queryResponse.domain())
|
||||
.focusDomainId(getDomainId(queryResponse.domain()))
|
||||
|
@ -34,7 +34,7 @@ public class SearchQueryParamFactory {
|
||||
SpecificationLimit.none(),
|
||||
SpecificationLimit.none(),
|
||||
List.of(),
|
||||
new QueryLimits(1, 25, 200, 8192),
|
||||
new QueryLimits(5, 100, 200, 8192),
|
||||
profile.searchSetIdentifier.name()
|
||||
);
|
||||
|
||||
|
@ -0,0 +1,77 @@
|
||||
package nu.marginalia.search;
|
||||
|
||||
import nu.marginalia.query.model.QueryResponse;
|
||||
import nu.marginalia.search.command.SearchParameters;
|
||||
import nu.marginalia.search.model.ClusteredUrlDetails;
|
||||
import nu.marginalia.search.model.UrlDetails;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
/** Functions for clustering search results */
|
||||
public class SearchResultClusterer {
|
||||
private SearchResultClusterer() {}
|
||||
|
||||
public interface SearchResultClusterStrategy {
|
||||
List<ClusteredUrlDetails> clusterResults(List<UrlDetails> results, int total);
|
||||
}
|
||||
|
||||
public static SearchResultClusterStrategy selectStrategy(QueryResponse response, SearchParameters params) {
|
||||
if (response.domain() != null && !response.domain().isBlank())
|
||||
return SearchResultClusterer::noOp;
|
||||
|
||||
if (params.profile().clusterResults())
|
||||
return SearchResultClusterer::byDomain;
|
||||
|
||||
return SearchResultClusterer::clusterThenSplit;
|
||||
}
|
||||
|
||||
/** No clustering, just return the results as is */
|
||||
private static List<ClusteredUrlDetails> noOp(List<UrlDetails> results, int total) {
|
||||
if (results.isEmpty())
|
||||
return List.of();
|
||||
|
||||
return results.stream()
|
||||
.map(ClusteredUrlDetails::new)
|
||||
.toList();
|
||||
}
|
||||
|
||||
/** Cluster the results by domain, and return the top "total" clusters
|
||||
* sorted by the relevance of the best result
|
||||
*/
|
||||
private static List<ClusteredUrlDetails> byDomain(List<UrlDetails> results, int total) {
|
||||
if (results.isEmpty())
|
||||
return List.of();
|
||||
|
||||
return results.stream()
|
||||
.collect(
|
||||
Collectors.groupingBy(details -> details.domainId)
|
||||
)
|
||||
.values().stream()
|
||||
.map(ClusteredUrlDetails::new)
|
||||
.mapMulti(ClusteredUrlDetails::splitSmallClusters) // split small clusters into singletons
|
||||
.sorted()
|
||||
.limit(total)
|
||||
.toList();
|
||||
}
|
||||
|
||||
/** Cluster the results by domain to find the best result for each domain,
|
||||
* then split the clusters into singletons, and return the top "total" clusters
|
||||
*/
|
||||
private static List<ClusteredUrlDetails> clusterThenSplit(List<UrlDetails> results, int total) {
|
||||
if (results.isEmpty())
|
||||
return List.of();
|
||||
|
||||
return results.stream()
|
||||
.collect(
|
||||
Collectors.groupingBy(details -> details.domainId)
|
||||
)
|
||||
.values().stream()
|
||||
.map(ClusteredUrlDetails::new)
|
||||
.mapMulti(ClusteredUrlDetails::forEachSingle)
|
||||
.sorted()
|
||||
.limit(total)
|
||||
.toList();
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,103 @@
|
||||
package nu.marginalia.search.model;
|
||||
|
||||
import lombok.Getter;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
import org.jetbrains.annotations.NotNull;
|
||||
|
||||
import java.util.*;
|
||||
import java.util.function.Consumer;
|
||||
|
||||
/** A class to hold a list of UrlDetails, grouped by domain, where the first one is the main result
|
||||
* and the rest are additional results, for summary display. */
|
||||
public class ClusteredUrlDetails implements Comparable<ClusteredUrlDetails> {
|
||||
|
||||
/** Create a new ClusteredUrlDetails from a collection of UrlDetails,
|
||||
* with the best result as "first", and the others, in descending order
|
||||
* of quality as the "rest"...
|
||||
*
|
||||
* @param details A collection of UrlDetails, which must not be empty.
|
||||
*/
|
||||
public ClusteredUrlDetails(Collection<UrlDetails> details) {
|
||||
var queue = new PriorityQueue<>(details);
|
||||
|
||||
if (queue.isEmpty())
|
||||
throw new IllegalArgumentException("Empty list of details");
|
||||
|
||||
this.first = queue.poll();
|
||||
|
||||
if (queue.isEmpty()) {
|
||||
this.rest = Collections.emptyList();
|
||||
}
|
||||
else {
|
||||
double bestScore = first.termScore;
|
||||
double scoreLimit = Math.min(4.0, bestScore * 1.25);
|
||||
|
||||
this.rest = queue
|
||||
.stream()
|
||||
.takeWhile(next -> next.termScore <= scoreLimit)
|
||||
.toList();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
public ClusteredUrlDetails(@NotNull UrlDetails onlyFirst) {
|
||||
this.first = onlyFirst;
|
||||
this.rest = Collections.emptyList();
|
||||
}
|
||||
|
||||
@NotNull
|
||||
@Getter
|
||||
public final UrlDetails first;
|
||||
|
||||
@NotNull
|
||||
@Getter
|
||||
public final List<UrlDetails> rest;
|
||||
|
||||
public void forEachSingle(Consumer<ClusteredUrlDetails> consumer) {
|
||||
if (rest.isEmpty())
|
||||
consumer.accept(this);
|
||||
else {
|
||||
consumer.accept(new ClusteredUrlDetails(first));
|
||||
rest.stream()
|
||||
.map(ClusteredUrlDetails::new)
|
||||
.forEach(consumer);
|
||||
}
|
||||
}
|
||||
|
||||
public void splitSmallClusters(Consumer<ClusteredUrlDetails> consumer) {
|
||||
if (rest.isEmpty())
|
||||
consumer.accept(this);
|
||||
else if (rest.size() < 2) { // Only one additional result
|
||||
consumer.accept(new ClusteredUrlDetails(first));
|
||||
rest.stream()
|
||||
.map(ClusteredUrlDetails::new)
|
||||
.forEach(consumer);
|
||||
}
|
||||
else {
|
||||
consumer.accept(this);
|
||||
}
|
||||
}
|
||||
|
||||
public EdgeDomain getDomain() {
|
||||
return first.url.getDomain();
|
||||
}
|
||||
|
||||
public boolean hasMultiple() {
|
||||
return !rest.isEmpty();
|
||||
}
|
||||
|
||||
/** Returns the total number of results from the same domain,
|
||||
* including such results that are not included here. */
|
||||
public int totalCount() {
|
||||
return first.resultsFromSameDomain;
|
||||
}
|
||||
|
||||
public int remainingCount() {
|
||||
return totalCount() - 1 - rest.size();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int compareTo(@NotNull ClusteredUrlDetails o) {
|
||||
return Objects.compare(first, o.first, UrlDetails::compareTo);
|
||||
}
|
||||
}
|
@ -13,7 +13,7 @@ public class DecoratedSearchResults {
|
||||
private final List<String> problems;
|
||||
private final String evalResult;
|
||||
|
||||
public final List<UrlDetails> results;
|
||||
public final List<ClusteredUrlDetails> results;
|
||||
|
||||
private final String focusDomain;
|
||||
private final int focusDomainId;
|
||||
|
@ -1,30 +0,0 @@
|
||||
package nu.marginalia.search.model;
|
||||
|
||||
import lombok.Builder;
|
||||
import lombok.Getter;
|
||||
|
||||
@Getter
|
||||
@Builder
|
||||
public class PageScoreAdjustment {
|
||||
final double titleAdj;
|
||||
final double titleFullHit;
|
||||
final double urlAdj;
|
||||
final double domainAdj;
|
||||
final double descAdj;
|
||||
final double descHitsAdj;
|
||||
|
||||
private static final PageScoreAdjustment zero = new PageScoreAdjustment(0,0, 0,0,0, 0);
|
||||
public static PageScoreAdjustment zero() {
|
||||
return zero;
|
||||
}
|
||||
|
||||
public double getScore() {
|
||||
return titleAdj + titleFullHit + urlAdj + domainAdj + descAdj + descHitsAdj;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return String.format("(%2.2f %2.2f %2.2f %2.2f %2.2f %2.2f)=%2.2f",
|
||||
titleAdj, titleFullHit, urlAdj, domainAdj, descAdj, descHitsAdj, getScore());
|
||||
}
|
||||
}
|
@ -101,5 +101,15 @@ public enum SearchProfile {
|
||||
else return SpecificationLimit.none();
|
||||
}
|
||||
|
||||
public boolean clusterResults() {
|
||||
if (this == FORUM)
|
||||
return true;
|
||||
if (this == WIKI)
|
||||
return true;
|
||||
if (this == DOCS)
|
||||
return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
@ -7,31 +7,24 @@ import nu.marginalia.model.EdgeUrl;
|
||||
import nu.marginalia.model.crawl.DomainIndexingState;
|
||||
import nu.marginalia.model.crawl.HtmlFeature;
|
||||
|
||||
import java.util.EnumSet;
|
||||
import java.util.List;
|
||||
import java.util.StringJoiner;
|
||||
|
||||
/** A class to hold details about a single search result. */
|
||||
@AllArgsConstructor @NoArgsConstructor @With @Getter @ToString
|
||||
public class UrlDetails {
|
||||
public class UrlDetails implements Comparable<UrlDetails> {
|
||||
public long id;
|
||||
public int domainId;
|
||||
|
||||
public EdgeUrl url;
|
||||
public String title;
|
||||
public String description;
|
||||
|
||||
public double urlQuality;
|
||||
|
||||
public int words;
|
||||
public String format;
|
||||
public int features;
|
||||
|
||||
public String ip;
|
||||
public DomainIndexingState domainState;
|
||||
|
||||
public long dataHash;
|
||||
|
||||
public PageScoreAdjustment urlQualityAdjustment;
|
||||
public long rankingId;
|
||||
public double termScore;
|
||||
|
||||
public int resultsFromSameDomain;
|
||||
@ -39,7 +32,6 @@ public class UrlDetails {
|
||||
public String positions;
|
||||
public SearchResultItem resultItem;
|
||||
public List<SearchResultKeywordScore> keywordScores;
|
||||
public long combinedId;
|
||||
|
||||
public boolean hasMoreResults() {
|
||||
return resultsFromSameDomain > 1;
|
||||
@ -69,6 +61,13 @@ public class UrlDetails {
|
||||
return Long.hashCode(id);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int compareTo(UrlDetails other) {
|
||||
int result = Double.compare(getTermScore(), other.getTermScore());
|
||||
if (result == 0) result = Long.compare(getId(), other.getId());
|
||||
return result;
|
||||
}
|
||||
|
||||
public boolean equals(Object other) {
|
||||
if (other == null) {
|
||||
return false;
|
||||
@ -81,6 +80,7 @@ public class UrlDetails {
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
public String getTitle() {
|
||||
if (title == null || title.isBlank()) {
|
||||
return url.toString();
|
||||
@ -88,22 +88,11 @@ public class UrlDetails {
|
||||
return title;
|
||||
}
|
||||
|
||||
public String getQualityPercent() {
|
||||
return String.format("%2.2f%%", 100*Math.exp(urlQuality+urlQualityAdjustment.getScore()));
|
||||
}
|
||||
|
||||
public double getRanking() {
|
||||
double lengthAdjustment = Math.max(1, words / (words + 10000.));
|
||||
return getFeatureScore()*Math.sqrt(1+rankingId)/Math.max(1E-10, lengthAdjustment *(0.7+0.3*Math.exp(urlQualityAdjustment.getScore())));
|
||||
}
|
||||
|
||||
public boolean isPlainText() {
|
||||
return "PLAIN".equals(format);
|
||||
}
|
||||
|
||||
public int getProblemCount() {
|
||||
int numProblems = 0;
|
||||
|
||||
int mask = HtmlFeature.JS.getFeatureBit()
|
||||
| HtmlFeature.COOKIES.getFeatureBit()
|
||||
| HtmlFeature.TRACKING.getFeatureBit()
|
||||
@ -153,8 +142,6 @@ public class UrlDetails {
|
||||
}
|
||||
public boolean isAds() { return HtmlFeature.hasFeature(features, HtmlFeature.TRACKING_ADTECH); }
|
||||
|
||||
public int getLogRank() { return (int) Math.round(Math.min(Math.log(1+rankingId),10)); }
|
||||
|
||||
public int getMatchRank() {
|
||||
if (termScore <= 1) return 1;
|
||||
if (termScore <= 2) return 2;
|
||||
@ -163,23 +150,4 @@ public class UrlDetails {
|
||||
|
||||
return 10;
|
||||
}
|
||||
|
||||
public double getFeatureScore() {
|
||||
double score = 1;
|
||||
if (isScripts()) {
|
||||
score+=1;
|
||||
} else if(!"HTML5".equals(format)) {
|
||||
score+=0.5;
|
||||
}
|
||||
if (isAffiliate()) {
|
||||
score += 2.5;
|
||||
}
|
||||
if (isTracking()) {
|
||||
score += 1.5;
|
||||
}
|
||||
if (isCookies()) {
|
||||
score += 1.5;
|
||||
}
|
||||
return score;
|
||||
}
|
||||
}
|
||||
|
@ -10,7 +10,6 @@ import nu.marginalia.index.client.model.results.DecoratedSearchResultItem;
|
||||
import nu.marginalia.index.client.model.results.SearchResultItem;
|
||||
import nu.marginalia.model.crawl.DomainIndexingState;
|
||||
import nu.marginalia.query.model.QueryResponse;
|
||||
import nu.marginalia.search.model.PageScoreAdjustment;
|
||||
import nu.marginalia.search.model.UrlDetails;
|
||||
import nu.marginalia.search.results.UrlDeduplicator;
|
||||
import org.slf4j.Logger;
|
||||
@ -22,7 +21,6 @@ import java.util.*;
|
||||
|
||||
@Singleton
|
||||
public class SearchQueryIndexService {
|
||||
private final Comparator<UrlDetails> resultListComparator;
|
||||
private final SearchQueryCountService searchVisitorCount;
|
||||
private final Marker queryMarker = MarkerFactory.getMarker("QUERY");
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
@ -30,11 +28,6 @@ public class SearchQueryIndexService {
|
||||
@Inject
|
||||
public SearchQueryIndexService(SearchQueryCountService searchVisitorCount) {
|
||||
this.searchVisitorCount = searchVisitorCount;
|
||||
|
||||
resultListComparator = Comparator.comparing(UrlDetails::getTermScore)
|
||||
.thenComparing(UrlDetails::getRanking)
|
||||
.thenComparing(UrlDetails::getId);
|
||||
|
||||
}
|
||||
|
||||
public List<UrlDetails> getResultsFromQuery(QueryResponse queryResponse) {
|
||||
@ -46,7 +39,8 @@ public class SearchQueryIndexService {
|
||||
|
||||
// Decorate and sort the results
|
||||
List<UrlDetails> urlDetails = getAllUrlDetails(results);
|
||||
urlDetails.sort(resultListComparator);
|
||||
|
||||
urlDetails.sort(Comparator.naturalOrder());
|
||||
|
||||
return urlDetails;
|
||||
}
|
||||
@ -81,6 +75,7 @@ public class SearchQueryIndexService {
|
||||
@SneakyThrows
|
||||
public List<UrlDetails> getAllUrlDetails(List<DecoratedSearchResultItem> resultSet) {
|
||||
List<UrlDetails> ret = new ArrayList<>(resultSet.size());
|
||||
|
||||
for (var detail : resultSet) {
|
||||
ret.add(new UrlDetails(
|
||||
detail.documentId(),
|
||||
@ -88,21 +83,14 @@ public class SearchQueryIndexService {
|
||||
detail.url,
|
||||
detail.title,
|
||||
detail.description,
|
||||
detail.urlQuality,
|
||||
detail.wordsTotal,
|
||||
detail.format,
|
||||
detail.features,
|
||||
"",
|
||||
DomainIndexingState.ACTIVE,
|
||||
detail.dataHash,
|
||||
PageScoreAdjustment.zero(), // urlQualityAdjustment
|
||||
detail.rankingId(),
|
||||
detail.rankingScore, // termScore
|
||||
detail.resultsFromDomain(),
|
||||
getPositionsString(detail.rawIndexResult),
|
||||
detail.rawIndexResult,
|
||||
detail.rawIndexResult.keywordScores,
|
||||
0L
|
||||
detail.rawIndexResult.keywordScores
|
||||
));
|
||||
}
|
||||
|
||||
|
@ -607,6 +607,16 @@ footer {
|
||||
padding: 1ch;
|
||||
margin: 0;
|
||||
}
|
||||
|
||||
ul.additional-results {
|
||||
background-color: $fg-light;
|
||||
padding: 1ch;
|
||||
list-style: none;
|
||||
margin: 0;
|
||||
a {
|
||||
color: $fg-dark;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
.search-result[data-ms-rank="1"] { .url, h2 { filter: grayscale(0%); } }
|
||||
|
@ -0,0 +1,17 @@
|
||||
<section class="card search-result">
|
||||
<div class="utils">
|
||||
Also from <a href="/site/{{first.url.domain}}">{{first.url.domain}}</a>
|
||||
</div>
|
||||
<ul class="additional-results">
|
||||
{{#each rest}}
|
||||
<li><a href="{{url}}">{{title}}</a></li>
|
||||
{{/each}}
|
||||
</ul>
|
||||
{{#if remainingCount}}
|
||||
<div class="utils">
|
||||
<a href="/site-search/{{first.url.domain}}/{{query}}?profile={{profile}}">{{remainingCount}}</a> more
|
||||
</div>
|
||||
{{/if}}
|
||||
</section>
|
||||
|
||||
<hr class="w3m-helper" />
|
@ -1,5 +1,6 @@
|
||||
<section data-rs-rank="{{logRank}}" data-ms-rank="{{matchRank}}"
|
||||
class="card search-result" >
|
||||
<!-- {{termScore}} -->
|
||||
<section data-ms-rank="{{matchRank}}" class="card search-result" >
|
||||
|
||||
<div class="url"><a rel="nofollow external" href="{{url}}">{{url}}</a></div>
|
||||
<h2> <a tabindex="-1" class="title" rel="nofollow external" href="{{url}}">{{title}}</a> </h2>
|
||||
<p class="description">{{description}}</p>
|
||||
|
@ -23,9 +23,11 @@
|
||||
<div class="infobox">
|
||||
Showing search results from <a href="/site/{{focusDomain}}">{{focusDomain}}</a>.
|
||||
</div>
|
||||
|
||||
{{/if}}
|
||||
{{#each results}}{{>search/parts/search-result}}{{/each}}
|
||||
{{#each results}}
|
||||
{{#with first}} {{>search/parts/search-result}} {{/with}}
|
||||
{{#if hasMultiple}} {{>search/parts/search-result-rest}} {{/if}}
|
||||
{{/each}}
|
||||
</section>
|
||||
|
||||
{{#with filters}}
|
||||
|
Loading…
Reference in New Issue
Block a user