diff --git a/code/services-application/search-service/src/main/java/nu/marginalia/search/SearchOperator.java b/code/services-application/search-service/src/main/java/nu/marginalia/search/SearchOperator.java index cd630b36..f87ade26 100644 --- a/code/services-application/search-service/src/main/java/nu/marginalia/search/SearchOperator.java +++ b/code/services-application/search-service/src/main/java/nu/marginalia/search/SearchOperator.java @@ -11,11 +11,8 @@ import nu.marginalia.db.DbDomainQueries; import nu.marginalia.query.client.QueryClient; import nu.marginalia.query.model.QueryResponse; import nu.marginalia.search.command.SearchParameters; -import nu.marginalia.search.model.SearchFilters; -import nu.marginalia.search.model.SearchProfile; -import nu.marginalia.search.model.UrlDetails; +import nu.marginalia.search.model.*; import nu.marginalia.client.Context; -import nu.marginalia.search.model.DecoratedSearchResults; import nu.marginalia.search.svc.SearchQueryIndexService; import nu.marginalia.search.svc.SearchUnitConversionService; import org.apache.logging.log4j.util.Strings; @@ -106,11 +103,15 @@ public class SearchOperator { String evalResult = getFutureOrDefault(eval, ""); + List clusteredResults = SearchResultClusterer + .selectStrategy(queryResponse, userParams) + .clusterResults(queryResults, 25); + return DecoratedSearchResults.builder() .params(userParams) .problems(getProblems(ctx, evalResult, queryResults, queryResponse)) .evalResult(evalResult) - .results(queryResults) + .results(clusteredResults) .filters(new SearchFilters(websiteUrl, userParams)) .focusDomain(queryResponse.domain()) .focusDomainId(getDomainId(queryResponse.domain())) diff --git a/code/services-application/search-service/src/main/java/nu/marginalia/search/SearchQueryParamFactory.java b/code/services-application/search-service/src/main/java/nu/marginalia/search/SearchQueryParamFactory.java index edb1b62f..38c3961e 100644 --- a/code/services-application/search-service/src/main/java/nu/marginalia/search/SearchQueryParamFactory.java +++ b/code/services-application/search-service/src/main/java/nu/marginalia/search/SearchQueryParamFactory.java @@ -34,7 +34,7 @@ public class SearchQueryParamFactory { SpecificationLimit.none(), SpecificationLimit.none(), List.of(), - new QueryLimits(1, 25, 200, 8192), + new QueryLimits(5, 100, 200, 8192), profile.searchSetIdentifier.name() ); diff --git a/code/services-application/search-service/src/main/java/nu/marginalia/search/SearchResultClusterer.java b/code/services-application/search-service/src/main/java/nu/marginalia/search/SearchResultClusterer.java new file mode 100644 index 00000000..4c57ea8f --- /dev/null +++ b/code/services-application/search-service/src/main/java/nu/marginalia/search/SearchResultClusterer.java @@ -0,0 +1,77 @@ +package nu.marginalia.search; + +import nu.marginalia.query.model.QueryResponse; +import nu.marginalia.search.command.SearchParameters; +import nu.marginalia.search.model.ClusteredUrlDetails; +import nu.marginalia.search.model.UrlDetails; + +import java.util.List; +import java.util.stream.Collectors; + +/** Functions for clustering search results */ +public class SearchResultClusterer { + private SearchResultClusterer() {} + + public interface SearchResultClusterStrategy { + List clusterResults(List results, int total); + } + + public static SearchResultClusterStrategy selectStrategy(QueryResponse response, SearchParameters params) { + if (response.domain() != null && !response.domain().isBlank()) + return SearchResultClusterer::noOp; + + if (params.profile().clusterResults()) + return SearchResultClusterer::byDomain; + + return SearchResultClusterer::clusterThenSplit; + } + + /** No clustering, just return the results as is */ + private static List noOp(List results, int total) { + if (results.isEmpty()) + return List.of(); + + return results.stream() + .map(ClusteredUrlDetails::new) + .toList(); + } + + /** Cluster the results by domain, and return the top "total" clusters + * sorted by the relevance of the best result + */ + private static List byDomain(List results, int total) { + if (results.isEmpty()) + return List.of(); + + return results.stream() + .collect( + Collectors.groupingBy(details -> details.domainId) + ) + .values().stream() + .map(ClusteredUrlDetails::new) + .mapMulti(ClusteredUrlDetails::splitSmallClusters) // split small clusters into singletons + .sorted() + .limit(total) + .toList(); + } + + /** Cluster the results by domain to find the best result for each domain, + * then split the clusters into singletons, and return the top "total" clusters + */ + private static List clusterThenSplit(List results, int total) { + if (results.isEmpty()) + return List.of(); + + return results.stream() + .collect( + Collectors.groupingBy(details -> details.domainId) + ) + .values().stream() + .map(ClusteredUrlDetails::new) + .mapMulti(ClusteredUrlDetails::forEachSingle) + .sorted() + .limit(total) + .toList(); + } + +} diff --git a/code/services-application/search-service/src/main/java/nu/marginalia/search/model/ClusteredUrlDetails.java b/code/services-application/search-service/src/main/java/nu/marginalia/search/model/ClusteredUrlDetails.java new file mode 100644 index 00000000..16655cdd --- /dev/null +++ b/code/services-application/search-service/src/main/java/nu/marginalia/search/model/ClusteredUrlDetails.java @@ -0,0 +1,103 @@ +package nu.marginalia.search.model; + +import lombok.Getter; +import nu.marginalia.model.EdgeDomain; +import org.jetbrains.annotations.NotNull; + +import java.util.*; +import java.util.function.Consumer; + +/** A class to hold a list of UrlDetails, grouped by domain, where the first one is the main result + * and the rest are additional results, for summary display. */ +public class ClusteredUrlDetails implements Comparable { + + /** Create a new ClusteredUrlDetails from a collection of UrlDetails, + * with the best result as "first", and the others, in descending order + * of quality as the "rest"... + * + * @param details A collection of UrlDetails, which must not be empty. + */ + public ClusteredUrlDetails(Collection details) { + var queue = new PriorityQueue<>(details); + + if (queue.isEmpty()) + throw new IllegalArgumentException("Empty list of details"); + + this.first = queue.poll(); + + if (queue.isEmpty()) { + this.rest = Collections.emptyList(); + } + else { + double bestScore = first.termScore; + double scoreLimit = Math.min(4.0, bestScore * 1.25); + + this.rest = queue + .stream() + .takeWhile(next -> next.termScore <= scoreLimit) + .toList(); + } + + } + + public ClusteredUrlDetails(@NotNull UrlDetails onlyFirst) { + this.first = onlyFirst; + this.rest = Collections.emptyList(); + } + + @NotNull + @Getter + public final UrlDetails first; + + @NotNull + @Getter + public final List rest; + + public void forEachSingle(Consumer consumer) { + if (rest.isEmpty()) + consumer.accept(this); + else { + consumer.accept(new ClusteredUrlDetails(first)); + rest.stream() + .map(ClusteredUrlDetails::new) + .forEach(consumer); + } + } + + public void splitSmallClusters(Consumer consumer) { + if (rest.isEmpty()) + consumer.accept(this); + else if (rest.size() < 2) { // Only one additional result + consumer.accept(new ClusteredUrlDetails(first)); + rest.stream() + .map(ClusteredUrlDetails::new) + .forEach(consumer); + } + else { + consumer.accept(this); + } + } + + public EdgeDomain getDomain() { + return first.url.getDomain(); + } + + public boolean hasMultiple() { + return !rest.isEmpty(); + } + + /** Returns the total number of results from the same domain, + * including such results that are not included here. */ + public int totalCount() { + return first.resultsFromSameDomain; + } + + public int remainingCount() { + return totalCount() - 1 - rest.size(); + } + + @Override + public int compareTo(@NotNull ClusteredUrlDetails o) { + return Objects.compare(first, o.first, UrlDetails::compareTo); + } +} diff --git a/code/services-application/search-service/src/main/java/nu/marginalia/search/model/DecoratedSearchResults.java b/code/services-application/search-service/src/main/java/nu/marginalia/search/model/DecoratedSearchResults.java index bdea053e..de6783d8 100644 --- a/code/services-application/search-service/src/main/java/nu/marginalia/search/model/DecoratedSearchResults.java +++ b/code/services-application/search-service/src/main/java/nu/marginalia/search/model/DecoratedSearchResults.java @@ -13,7 +13,7 @@ public class DecoratedSearchResults { private final List problems; private final String evalResult; - public final List results; + public final List results; private final String focusDomain; private final int focusDomainId; diff --git a/code/services-application/search-service/src/main/java/nu/marginalia/search/model/PageScoreAdjustment.java b/code/services-application/search-service/src/main/java/nu/marginalia/search/model/PageScoreAdjustment.java deleted file mode 100644 index 7bff0296..00000000 --- a/code/services-application/search-service/src/main/java/nu/marginalia/search/model/PageScoreAdjustment.java +++ /dev/null @@ -1,30 +0,0 @@ -package nu.marginalia.search.model; - -import lombok.Builder; -import lombok.Getter; - -@Getter -@Builder -public class PageScoreAdjustment { - final double titleAdj; - final double titleFullHit; - final double urlAdj; - final double domainAdj; - final double descAdj; - final double descHitsAdj; - - private static final PageScoreAdjustment zero = new PageScoreAdjustment(0,0, 0,0,0, 0); - public static PageScoreAdjustment zero() { - return zero; - } - - public double getScore() { - return titleAdj + titleFullHit + urlAdj + domainAdj + descAdj + descHitsAdj; - } - - @Override - public String toString() { - return String.format("(%2.2f %2.2f %2.2f %2.2f %2.2f %2.2f)=%2.2f", - titleAdj, titleFullHit, urlAdj, domainAdj, descAdj, descHitsAdj, getScore()); - } -} diff --git a/code/services-application/search-service/src/main/java/nu/marginalia/search/model/SearchProfile.java b/code/services-application/search-service/src/main/java/nu/marginalia/search/model/SearchProfile.java index 85fafa8f..9a40e65c 100644 --- a/code/services-application/search-service/src/main/java/nu/marginalia/search/model/SearchProfile.java +++ b/code/services-application/search-service/src/main/java/nu/marginalia/search/model/SearchProfile.java @@ -101,5 +101,15 @@ public enum SearchProfile { else return SpecificationLimit.none(); } + public boolean clusterResults() { + if (this == FORUM) + return true; + if (this == WIKI) + return true; + if (this == DOCS) + return true; + return false; + } + } diff --git a/code/services-application/search-service/src/main/java/nu/marginalia/search/model/UrlDetails.java b/code/services-application/search-service/src/main/java/nu/marginalia/search/model/UrlDetails.java index ae4f551e..4968a876 100644 --- a/code/services-application/search-service/src/main/java/nu/marginalia/search/model/UrlDetails.java +++ b/code/services-application/search-service/src/main/java/nu/marginalia/search/model/UrlDetails.java @@ -7,31 +7,24 @@ import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.crawl.DomainIndexingState; import nu.marginalia.model.crawl.HtmlFeature; -import java.util.EnumSet; import java.util.List; import java.util.StringJoiner; +/** A class to hold details about a single search result. */ @AllArgsConstructor @NoArgsConstructor @With @Getter @ToString -public class UrlDetails { +public class UrlDetails implements Comparable { public long id; public int domainId; + public EdgeUrl url; public String title; public String description; - public double urlQuality; - - public int words; public String format; public int features; - public String ip; public DomainIndexingState domainState; - public long dataHash; - - public PageScoreAdjustment urlQualityAdjustment; - public long rankingId; public double termScore; public int resultsFromSameDomain; @@ -39,7 +32,6 @@ public class UrlDetails { public String positions; public SearchResultItem resultItem; public List keywordScores; - public long combinedId; public boolean hasMoreResults() { return resultsFromSameDomain > 1; @@ -69,6 +61,13 @@ public class UrlDetails { return Long.hashCode(id); } + @Override + public int compareTo(UrlDetails other) { + int result = Double.compare(getTermScore(), other.getTermScore()); + if (result == 0) result = Long.compare(getId(), other.getId()); + return result; + } + public boolean equals(Object other) { if (other == null) { return false; @@ -81,6 +80,7 @@ public class UrlDetails { } return false; } + public String getTitle() { if (title == null || title.isBlank()) { return url.toString(); @@ -88,22 +88,11 @@ public class UrlDetails { return title; } - public String getQualityPercent() { - return String.format("%2.2f%%", 100*Math.exp(urlQuality+urlQualityAdjustment.getScore())); - } - - public double getRanking() { - double lengthAdjustment = Math.max(1, words / (words + 10000.)); - return getFeatureScore()*Math.sqrt(1+rankingId)/Math.max(1E-10, lengthAdjustment *(0.7+0.3*Math.exp(urlQualityAdjustment.getScore()))); - } - public boolean isPlainText() { return "PLAIN".equals(format); } public int getProblemCount() { - int numProblems = 0; - int mask = HtmlFeature.JS.getFeatureBit() | HtmlFeature.COOKIES.getFeatureBit() | HtmlFeature.TRACKING.getFeatureBit() @@ -153,8 +142,6 @@ public class UrlDetails { } public boolean isAds() { return HtmlFeature.hasFeature(features, HtmlFeature.TRACKING_ADTECH); } - public int getLogRank() { return (int) Math.round(Math.min(Math.log(1+rankingId),10)); } - public int getMatchRank() { if (termScore <= 1) return 1; if (termScore <= 2) return 2; @@ -163,23 +150,4 @@ public class UrlDetails { return 10; } - - public double getFeatureScore() { - double score = 1; - if (isScripts()) { - score+=1; - } else if(!"HTML5".equals(format)) { - score+=0.5; - } - if (isAffiliate()) { - score += 2.5; - } - if (isTracking()) { - score += 1.5; - } - if (isCookies()) { - score += 1.5; - } - return score; - } } diff --git a/code/services-application/search-service/src/main/java/nu/marginalia/search/svc/SearchQueryIndexService.java b/code/services-application/search-service/src/main/java/nu/marginalia/search/svc/SearchQueryIndexService.java index 36de5b4a..87194275 100644 --- a/code/services-application/search-service/src/main/java/nu/marginalia/search/svc/SearchQueryIndexService.java +++ b/code/services-application/search-service/src/main/java/nu/marginalia/search/svc/SearchQueryIndexService.java @@ -10,7 +10,6 @@ import nu.marginalia.index.client.model.results.DecoratedSearchResultItem; import nu.marginalia.index.client.model.results.SearchResultItem; import nu.marginalia.model.crawl.DomainIndexingState; import nu.marginalia.query.model.QueryResponse; -import nu.marginalia.search.model.PageScoreAdjustment; import nu.marginalia.search.model.UrlDetails; import nu.marginalia.search.results.UrlDeduplicator; import org.slf4j.Logger; @@ -22,7 +21,6 @@ import java.util.*; @Singleton public class SearchQueryIndexService { - private final Comparator resultListComparator; private final SearchQueryCountService searchVisitorCount; private final Marker queryMarker = MarkerFactory.getMarker("QUERY"); private final Logger logger = LoggerFactory.getLogger(getClass()); @@ -30,11 +28,6 @@ public class SearchQueryIndexService { @Inject public SearchQueryIndexService(SearchQueryCountService searchVisitorCount) { this.searchVisitorCount = searchVisitorCount; - - resultListComparator = Comparator.comparing(UrlDetails::getTermScore) - .thenComparing(UrlDetails::getRanking) - .thenComparing(UrlDetails::getId); - } public List getResultsFromQuery(QueryResponse queryResponse) { @@ -46,7 +39,8 @@ public class SearchQueryIndexService { // Decorate and sort the results List urlDetails = getAllUrlDetails(results); - urlDetails.sort(resultListComparator); + + urlDetails.sort(Comparator.naturalOrder()); return urlDetails; } @@ -81,6 +75,7 @@ public class SearchQueryIndexService { @SneakyThrows public List getAllUrlDetails(List resultSet) { List ret = new ArrayList<>(resultSet.size()); + for (var detail : resultSet) { ret.add(new UrlDetails( detail.documentId(), @@ -88,21 +83,14 @@ public class SearchQueryIndexService { detail.url, detail.title, detail.description, - detail.urlQuality, - detail.wordsTotal, detail.format, detail.features, - "", DomainIndexingState.ACTIVE, - detail.dataHash, - PageScoreAdjustment.zero(), // urlQualityAdjustment - detail.rankingId(), detail.rankingScore, // termScore detail.resultsFromDomain(), getPositionsString(detail.rawIndexResult), detail.rawIndexResult, - detail.rawIndexResult.keywordScores, - 0L + detail.rawIndexResult.keywordScores )); } diff --git a/code/services-application/search-service/src/main/resources/static/search/serp.scss b/code/services-application/search-service/src/main/resources/static/search/serp.scss index 0c44415c..e9a70a99 100644 --- a/code/services-application/search-service/src/main/resources/static/search/serp.scss +++ b/code/services-application/search-service/src/main/resources/static/search/serp.scss @@ -607,6 +607,16 @@ footer { padding: 1ch; margin: 0; } + + ul.additional-results { + background-color: $fg-light; + padding: 1ch; + list-style: none; + margin: 0; + a { + color: $fg-dark; + } + } } .search-result[data-ms-rank="1"] { .url, h2 { filter: grayscale(0%); } } diff --git a/code/services-application/search-service/src/main/resources/templates/search/parts/search-result-rest.hdb b/code/services-application/search-service/src/main/resources/templates/search/parts/search-result-rest.hdb new file mode 100644 index 00000000..7efc5b5e --- /dev/null +++ b/code/services-application/search-service/src/main/resources/templates/search/parts/search-result-rest.hdb @@ -0,0 +1,17 @@ +
+
+ Also from {{first.url.domain}} +
+ +{{#if remainingCount}} + +{{/if}} +
+ +
\ No newline at end of file diff --git a/code/services-application/search-service/src/main/resources/templates/search/parts/search-result.hdb b/code/services-application/search-service/src/main/resources/templates/search/parts/search-result.hdb index 2c9c8f8a..3efe29e3 100644 --- a/code/services-application/search-service/src/main/resources/templates/search/parts/search-result.hdb +++ b/code/services-application/search-service/src/main/resources/templates/search/parts/search-result.hdb @@ -1,5 +1,6 @@ -
+ +
+

{{title}}

{{description}}

diff --git a/code/services-application/search-service/src/main/resources/templates/search/search-results.hdb b/code/services-application/search-service/src/main/resources/templates/search/search-results.hdb index d37231cb..197f5666 100644 --- a/code/services-application/search-service/src/main/resources/templates/search/search-results.hdb +++ b/code/services-application/search-service/src/main/resources/templates/search/search-results.hdb @@ -23,9 +23,11 @@
Showing search results from {{focusDomain}}.
- {{/if}} - {{#each results}}{{>search/parts/search-result}}{{/each}} + {{#each results}} + {{#with first}} {{>search/parts/search-result}} {{/with}} + {{#if hasMultiple}} {{>search/parts/search-result-rest}} {{/if}} + {{/each}}
{{#with filters}}