(search) Experimental support for clustering search results

Adds experimental support for clustering search results, e.g. by domain.  As a first stage, this is only enabled for the wiki, forum and docs filters.

The commit also cleans up the UrlDetails class, which contained a number of vestigial entries.
This commit is contained in:
Viktor Lofgren 2024-02-11 17:31:38 +01:00
parent 9d68062553
commit bcd0dabb92
13 changed files with 247 additions and 100 deletions

View File

@ -11,11 +11,8 @@ import nu.marginalia.db.DbDomainQueries;
import nu.marginalia.query.client.QueryClient;
import nu.marginalia.query.model.QueryResponse;
import nu.marginalia.search.command.SearchParameters;
import nu.marginalia.search.model.SearchFilters;
import nu.marginalia.search.model.SearchProfile;
import nu.marginalia.search.model.UrlDetails;
import nu.marginalia.search.model.*;
import nu.marginalia.client.Context;
import nu.marginalia.search.model.DecoratedSearchResults;
import nu.marginalia.search.svc.SearchQueryIndexService;
import nu.marginalia.search.svc.SearchUnitConversionService;
import org.apache.logging.log4j.util.Strings;
@ -106,11 +103,15 @@ public class SearchOperator {
String evalResult = getFutureOrDefault(eval, "");
List<ClusteredUrlDetails> clusteredResults = SearchResultClusterer
.selectStrategy(queryResponse, userParams)
.clusterResults(queryResults, 25);
return DecoratedSearchResults.builder()
.params(userParams)
.problems(getProblems(ctx, evalResult, queryResults, queryResponse))
.evalResult(evalResult)
.results(queryResults)
.results(clusteredResults)
.filters(new SearchFilters(websiteUrl, userParams))
.focusDomain(queryResponse.domain())
.focusDomainId(getDomainId(queryResponse.domain()))

View File

@ -34,7 +34,7 @@ public class SearchQueryParamFactory {
SpecificationLimit.none(),
SpecificationLimit.none(),
List.of(),
new QueryLimits(1, 25, 200, 8192),
new QueryLimits(5, 100, 200, 8192),
profile.searchSetIdentifier.name()
);

View File

@ -0,0 +1,77 @@
package nu.marginalia.search;
import nu.marginalia.query.model.QueryResponse;
import nu.marginalia.search.command.SearchParameters;
import nu.marginalia.search.model.ClusteredUrlDetails;
import nu.marginalia.search.model.UrlDetails;
import java.util.List;
import java.util.stream.Collectors;
/** Static helpers that arrange a flat list of search results into clusters */
public class SearchResultClusterer {
    private SearchResultClusterer() {}

    /** A function that arranges a list of results into clusters;
     * "total" is the requested upper bound on the number of clusters.
     */
    public interface SearchResultClusterStrategy {
        List<ClusteredUrlDetails> clusterResults(List<UrlDetails> results, int total);
    }

    /** Pick the clustering strategy for a query:  no clustering when the response
     * is focused on a single domain, clustering by domain when the search profile
     * asks for it, and otherwise cluster-then-split.
     */
    public static SearchResultClusterStrategy selectStrategy(QueryResponse response, SearchParameters params) {
        var focusDomain = response.domain();
        boolean hasFocusDomain = focusDomain != null && !focusDomain.isBlank();

        if (hasFocusDomain) {
            return SearchResultClusterer::noOp;
        }

        return params.profile().clusterResults()
                ? SearchResultClusterer::byDomain
                : SearchResultClusterer::clusterThenSplit;
    }

    /** Wrap each result in its own singleton cluster, keeping the input order;
     * the "total" argument is not used by this strategy.
     */
    private static List<ClusteredUrlDetails> noOp(List<UrlDetails> results, int total) {
        if (results.isEmpty()) {
            return List.of();
        }

        return results.stream()
                .map(result -> new ClusteredUrlDetails(result))
                .toList();
    }

    /** Group the results by domain id, break up clusters that are too small
     * to be worth showing as a group, and keep the first "total" clusters
     * according to the natural ordering of ClusteredUrlDetails.
     */
    private static List<ClusteredUrlDetails> byDomain(List<UrlDetails> results, int total) {
        if (results.isEmpty()) {
            return List.of();
        }

        var resultsByDomain = results.stream()
                .collect(Collectors.groupingBy(result -> result.domainId));

        return resultsByDomain.values().stream()
                .map(ClusteredUrlDetails::new)
                .mapMulti(ClusteredUrlDetails::splitSmallClusters) // small clusters become singletons
                .sorted()
                .limit(total)
                .toList();
    }

    /** Group the results by domain id so that each cluster selects its members,
     * then flatten every cluster back into singletons and keep the first "total"
     * of them according to the natural ordering of ClusteredUrlDetails.
     */
    private static List<ClusteredUrlDetails> clusterThenSplit(List<UrlDetails> results, int total) {
        if (results.isEmpty()) {
            return List.of();
        }

        var resultsByDomain = results.stream()
                .collect(Collectors.groupingBy(result -> result.domainId));

        return resultsByDomain.values().stream()
                .map(ClusteredUrlDetails::new)
                .mapMulti(ClusteredUrlDetails::forEachSingle)
                .sorted()
                .limit(total)
                .toList();
    }
}

View File

@ -0,0 +1,103 @@
package nu.marginalia.search.model;
import lombok.Getter;
import nu.marginalia.model.EdgeDomain;
import org.jetbrains.annotations.NotNull;
import java.util.*;
import java.util.function.Consumer;
/** A class to hold a list of UrlDetails, grouped by domain, where the first one is the main result
 * and the rest are additional results, for summary display.
 * <p>
 * Clusters are ordered by their main result, see {@link #compareTo}.
 */
public class ClusteredUrlDetails implements Comparable<ClusteredUrlDetails> {

    /** The best result of the cluster */
    @NotNull
    @Getter
    public final UrlDetails first;

    /** The additional results of the cluster, best first; may be empty */
    @NotNull
    @Getter
    public final List<UrlDetails> rest;

    /** Create a new ClusteredUrlDetails from a collection of UrlDetails,
     * with the best result as "first", and the others, in descending order
     * of quality as the "rest".  Only results whose term score is within
     * the score limit derived from the best result are kept in "rest".
     *
     * @param details A collection of UrlDetails, which must not be empty.
     * @throws IllegalArgumentException if details is empty
     */
    public ClusteredUrlDetails(Collection<UrlDetails> details) {
        if (details.isEmpty())
            throw new IllegalArgumentException("Empty list of details");

        // Sort explicitly:  iterating or streaming a PriorityQueue does not visit
        // the elements in priority order, so the ordering of "rest" and the
        // takeWhile() cut-off below both require a fully sorted list.
        List<UrlDetails> ranked = new ArrayList<>(details);
        ranked.sort(Comparator.naturalOrder());

        this.first = ranked.get(0);

        if (ranked.size() == 1) {
            this.rest = Collections.emptyList();
        }
        else {
            double bestScore = first.termScore;
            double scoreLimit = Math.min(4.0, bestScore * 1.25);

            // The list is in natural (best-first) order, so everything after
            // the first rejected element would be rejected as well
            this.rest = ranked.stream()
                    .skip(1)
                    .takeWhile(next -> next.termScore <= scoreLimit)
                    .toList();
        }
    }

    /** Create a singleton cluster holding only the given result. */
    public ClusteredUrlDetails(@NotNull UrlDetails onlyFirst) {
        this.first = onlyFirst;
        this.rest = Collections.emptyList();
    }

    /** Pass each result of this cluster to the consumer as a singleton cluster,
     * the main result first.  Intended for use with Stream.mapMulti(). */
    public void forEachSingle(Consumer<ClusteredUrlDetails> consumer) {
        if (rest.isEmpty())
            consumer.accept(this);
        else
            emitSingles(consumer);
    }

    /** Pass this cluster to the consumer, unless it has exactly one additional
     * result, in which case the two results are passed as singleton clusters
     * instead.  Intended for use with Stream.mapMulti(). */
    public void splitSmallClusters(Consumer<ClusteredUrlDetails> consumer) {
        if (rest.size() == 1) // Only one additional result
            emitSingles(consumer);
        else
            consumer.accept(this);
    }

    /** Emit the main result and each additional result as singleton clusters */
    private void emitSingles(Consumer<ClusteredUrlDetails> consumer) {
        consumer.accept(new ClusteredUrlDetails(first));

        for (UrlDetails additional : rest) {
            consumer.accept(new ClusteredUrlDetails(additional));
        }
    }

    /** Returns the domain of the main result */
    public EdgeDomain getDomain() {
        return first.url.getDomain();
    }

    /** Returns true if the cluster has results in addition to the main one */
    public boolean hasMultiple() {
        return !rest.isEmpty();
    }

    /** Returns the total number of results from the same domain,
     * including such results that are not included here. */
    public int totalCount() {
        return first.resultsFromSameDomain;
    }

    /** Returns the number of results from the same domain that are not
     * included in this cluster; never negative. */
    public int remainingCount() {
        // Clamp so that a total count smaller than the cluster itself
        // can't produce a negative "N more" in the rendered page
        return Math.max(0, totalCount() - 1 - rest.size());
    }

    /** Orders clusters by their main result, using the natural ordering of UrlDetails */
    @Override
    public int compareTo(@NotNull ClusteredUrlDetails o) {
        return Objects.compare(first, o.first, UrlDetails::compareTo);
    }
}

View File

@ -13,7 +13,7 @@ public class DecoratedSearchResults {
private final List<String> problems;
private final String evalResult;
public final List<UrlDetails> results;
public final List<ClusteredUrlDetails> results;
private final String focusDomain;
private final int focusDomainId;

View File

@ -1,30 +0,0 @@
package nu.marginalia.search.model;
import lombok.Builder;
import lombok.Getter;
/** The individual adjustment terms applied to a page's score, along with their sum. */
@Getter
@Builder
public class PageScoreAdjustment {
    final double titleAdj;
    final double titleFullHit;
    final double urlAdj;
    final double domainAdj;
    final double descAdj;
    final double descHitsAdj;

    /** Shared instance where every adjustment term is zero */
    private static final PageScoreAdjustment NO_ADJUSTMENT =
            new PageScoreAdjustment(0, 0, 0, 0, 0, 0);

    /** Returns the shared all-zero adjustment. */
    public static PageScoreAdjustment zero() {
        return NO_ADJUSTMENT;
    }

    /** Returns the sum of all adjustment terms. */
    public double getScore() {
        // Accumulate in declaration order to keep the floating point result stable
        double total = titleAdj;
        total += titleFullHit;
        total += urlAdj;
        total += domainAdj;
        total += descAdj;
        total += descHitsAdj;
        return total;
    }

    /** Renders the six terms in parentheses, followed by their sum. */
    @Override
    public String toString() {
        final String template = "(%2.2f %2.2f %2.2f %2.2f %2.2f %2.2f)=%2.2f";

        return String.format(template,
                titleAdj, titleFullHit, urlAdj, domainAdj, descAdj, descHitsAdj,
                getScore());
    }
}

View File

@ -101,5 +101,15 @@ public enum SearchProfile {
else return SpecificationLimit.none();
}
/** Returns true if search results should be clustered for this profile,
 * which is the case for the FORUM, WIKI and DOCS profiles. */
public boolean clusterResults() {
    return this == FORUM
        || this == WIKI
        || this == DOCS;
}
}

View File

@ -7,31 +7,24 @@ import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawl.DomainIndexingState;
import nu.marginalia.model.crawl.HtmlFeature;
import java.util.EnumSet;
import java.util.List;
import java.util.StringJoiner;
/** A class to hold details about a single search result. */
@AllArgsConstructor @NoArgsConstructor @With @Getter @ToString
public class UrlDetails {
public class UrlDetails implements Comparable<UrlDetails> {
public long id;
public int domainId;
public EdgeUrl url;
public String title;
public String description;
public double urlQuality;
public int words;
public String format;
public int features;
public String ip;
public DomainIndexingState domainState;
public long dataHash;
public PageScoreAdjustment urlQualityAdjustment;
public long rankingId;
public double termScore;
public int resultsFromSameDomain;
@ -39,7 +32,6 @@ public class UrlDetails {
public String positions;
public SearchResultItem resultItem;
public List<SearchResultKeywordScore> keywordScores;
public long combinedId;
public boolean hasMoreResults() {
return resultsFromSameDomain > 1;
@ -69,6 +61,13 @@ public class UrlDetails {
return Long.hashCode(id);
}
/** Orders results by ascending term score, using the id as a tie-breaker. */
@Override
public int compareTo(UrlDetails other) {
    int byTermScore = Double.compare(getTermScore(), other.getTermScore());
    if (byTermScore != 0) {
        return byTermScore;
    }
    return Long.compare(getId(), other.getId());
}
public boolean equals(Object other) {
if (other == null) {
return false;
@ -81,6 +80,7 @@ public class UrlDetails {
}
return false;
}
public String getTitle() {
if (title == null || title.isBlank()) {
return url.toString();
@ -88,22 +88,11 @@ public class UrlDetails {
return title;
}
/** Formats exp(urlQuality + adjustment score), scaled by 100, as a percentage with two decimals. */
public String getQualityPercent() {
    double adjustedQuality = urlQuality + urlQualityAdjustment.getScore();
    double percent = 100 * Math.exp(adjustedQuality);

    return String.format("%2.2f%%", percent);
}
/** Combines the feature score, the ranking id and the quality adjustment into a single ranking value. */
public double getRanking() {
    // NOTE(review): words / (words + 10000.) stays below 1 for non-negative word counts,
    // so this max() looks like it always yields 1 -- confirm whether that is intended
    double lengthAdjustment = Math.max(1, words / (words + 10000.));

    double qualityFactor = 0.7 + 0.3 * Math.exp(urlQualityAdjustment.getScore());

    // The floor keeps the division below away from zero
    double divisor = Math.max(1E-10, lengthAdjustment * qualityFactor);

    return getFeatureScore() * Math.sqrt(1 + rankingId) / divisor;
}
/** Returns true if the format is exactly "PLAIN"; a missing format counts as false. */
public boolean isPlainText() {
    return format != null && format.equals("PLAIN");
}
public int getProblemCount() {
int numProblems = 0;
int mask = HtmlFeature.JS.getFeatureBit()
| HtmlFeature.COOKIES.getFeatureBit()
| HtmlFeature.TRACKING.getFeatureBit()
@ -153,8 +142,6 @@ public class UrlDetails {
}
/** Returns true if the TRACKING_ADTECH feature is set in the feature mask. */
public boolean isAds() {
    return HtmlFeature.hasFeature(features, HtmlFeature.TRACKING_ADTECH);
}
/** Returns the natural logarithm of (1 + rankingId), capped at 10 and rounded to the nearest integer. */
public int getLogRank() {
    double cappedLog = Math.min(Math.log(1 + rankingId), 10);

    return (int) Math.round(cappedLog);
}
public int getMatchRank() {
if (termScore <= 1) return 1;
if (termScore <= 2) return 2;
@ -163,23 +150,4 @@ public class UrlDetails {
return 10;
}
/** Computes a score that starts at 1 and grows with each flagged feature:
 * scripts (or, failing that, a format other than HTML5), affiliate links,
 * tracking and cookies. */
public double getFeatureScore() {
    double total = 1;

    // Scripts take precedence over the format check
    if (isScripts()) {
        total += 1;
    }
    else if (!"HTML5".equals(format)) {
        total += 0.5;
    }

    total += isAffiliate() ? 2.5 : 0;
    total += isTracking() ? 1.5 : 0;
    total += isCookies() ? 1.5 : 0;

    return total;
}
}

View File

@ -10,7 +10,6 @@ import nu.marginalia.index.client.model.results.DecoratedSearchResultItem;
import nu.marginalia.index.client.model.results.SearchResultItem;
import nu.marginalia.model.crawl.DomainIndexingState;
import nu.marginalia.query.model.QueryResponse;
import nu.marginalia.search.model.PageScoreAdjustment;
import nu.marginalia.search.model.UrlDetails;
import nu.marginalia.search.results.UrlDeduplicator;
import org.slf4j.Logger;
@ -22,7 +21,6 @@ import java.util.*;
@Singleton
public class SearchQueryIndexService {
private final Comparator<UrlDetails> resultListComparator;
private final SearchQueryCountService searchVisitorCount;
private final Marker queryMarker = MarkerFactory.getMarker("QUERY");
private final Logger logger = LoggerFactory.getLogger(getClass());
@ -30,11 +28,6 @@ public class SearchQueryIndexService {
@Inject
public SearchQueryIndexService(SearchQueryCountService searchVisitorCount) {
this.searchVisitorCount = searchVisitorCount;
resultListComparator = Comparator.comparing(UrlDetails::getTermScore)
.thenComparing(UrlDetails::getRanking)
.thenComparing(UrlDetails::getId);
}
public List<UrlDetails> getResultsFromQuery(QueryResponse queryResponse) {
@ -46,7 +39,8 @@ public class SearchQueryIndexService {
// Decorate and sort the results
List<UrlDetails> urlDetails = getAllUrlDetails(results);
urlDetails.sort(resultListComparator);
urlDetails.sort(Comparator.naturalOrder());
return urlDetails;
}
@ -81,6 +75,7 @@ public class SearchQueryIndexService {
@SneakyThrows
public List<UrlDetails> getAllUrlDetails(List<DecoratedSearchResultItem> resultSet) {
List<UrlDetails> ret = new ArrayList<>(resultSet.size());
for (var detail : resultSet) {
ret.add(new UrlDetails(
detail.documentId(),
@ -88,21 +83,14 @@ public class SearchQueryIndexService {
detail.url,
detail.title,
detail.description,
detail.urlQuality,
detail.wordsTotal,
detail.format,
detail.features,
"",
DomainIndexingState.ACTIVE,
detail.dataHash,
PageScoreAdjustment.zero(), // urlQualityAdjustment
detail.rankingId(),
detail.rankingScore, // termScore
detail.resultsFromDomain(),
getPositionsString(detail.rawIndexResult),
detail.rawIndexResult,
detail.rawIndexResult.keywordScores,
0L
detail.rawIndexResult.keywordScores
));
}

View File

@ -607,6 +607,16 @@ footer {
padding: 1ch;
margin: 0;
}
ul.additional-results {
background-color: $fg-light;
padding: 1ch;
list-style: none;
margin: 0;
a {
color: $fg-dark;
}
}
}
.search-result[data-ms-rank="1"] { .url, h2 { filter: grayscale(0%); } }

View File

@ -0,0 +1,17 @@
<section class="card search-result">
<div class="utils">
Also from <a href="/site/{{first.url.domain}}">{{first.url.domain}}</a>
</div>
<ul class="additional-results">
{{#each rest}}
<li><a href="{{url}}">{{title}}</a></li>
{{/each}}
</ul>
{{#if remainingCount}}
<div class="utils">
<a href="/site-search/{{first.url.domain}}/{{query}}?profile={{profile}}">{{remainingCount}}</a> more
</div>
{{/if}}
</section>
<hr class="w3m-helper" />

View File

@ -1,5 +1,6 @@
<section data-rs-rank="{{logRank}}" data-ms-rank="{{matchRank}}"
class="card search-result" >
<!-- {{termScore}} -->
<section data-ms-rank="{{matchRank}}" class="card search-result" >
<div class="url"><a rel="nofollow external" href="{{url}}">{{url}}</a></div>
<h2> <a tabindex="-1" class="title" rel="nofollow external" href="{{url}}">{{title}}</a> </h2>
<p class="description">{{description}}</p>

View File

@ -23,9 +23,11 @@
<div class="infobox">
Showing search results from <a href="/site/{{focusDomain}}">{{focusDomain}}</a>.
</div>
{{/if}}
{{#each results}}{{>search/parts/search-result}}{{/each}}
{{#each results}}
{{#with first}} {{>search/parts/search-result}} {{/with}}
{{#if hasMultiple}} {{>search/parts/search-result-rest}} {{/if}}
{{/each}}
</section>
{{#with filters}}