diff --git a/code/services-core/search-service/src/main/java/nu/marginalia/search/SearchOperator.java b/code/services-core/search-service/src/main/java/nu/marginalia/search/SearchOperator.java index 5a328234..b5a1fd2b 100644 --- a/code/services-core/search-service/src/main/java/nu/marginalia/search/SearchOperator.java +++ b/code/services-core/search-service/src/main/java/nu/marginalia/search/SearchOperator.java @@ -66,7 +66,7 @@ public class SearchOperator { logger.info(queryMarker, "Human terms (API): {}", Strings.join(processedQuery.searchTermsHuman, ',')); - return searchQueryService.executeQuery(ctx, processedQuery); + return searchQueryService.executeQuery(ctx, processedQuery.specs); } public DecoratedSearchResults doSearch(Context ctx, UserSearchParameters params) { @@ -76,7 +76,7 @@ public class SearchOperator { logger.info(queryMarker, "Human terms: {}", Strings.join(processedQuery.searchTermsHuman, ',')); - List queryResults = searchQueryService.executeQuery(ctx, processedQuery); + List queryResults = searchQueryService.executeQuery(ctx, processedQuery.specs); logger.info(queryMarker, "Search Result Count: {}", queryResults.size()); diff --git a/code/services-core/search-service/src/main/java/nu/marginalia/search/command/commands/SiteListCommand.java b/code/services-core/search-service/src/main/java/nu/marginalia/search/command/commands/SiteListCommand.java index e6300b1c..19c3bd7a 100644 --- a/code/services-core/search-service/src/main/java/nu/marginalia/search/command/commands/SiteListCommand.java +++ b/code/services-core/search-service/src/main/java/nu/marginalia/search/command/commands/SiteListCommand.java @@ -64,7 +64,7 @@ public class SiteListCommand implements SearchCommandInterface { int domainId = -1; if (null != domain) { var dumbQuery = queryFactory.createQuery(SearchProfile.CORPO, 100, 100, "site:"+domain); - resultSet = searchQueryIndexService.executeQuery(ctx, dumbQuery); + resultSet = searchQueryIndexService.executeQuery(ctx, dumbQuery.specs); var maybeId = domainQueries.tryGetDomainId(domain); if (maybeId.isPresent()) { domainId = maybeId.getAsInt(); diff --git a/code/services-core/search-service/src/main/java/nu/marginalia/search/model/UrlDetails.java b/code/services-core/search-service/src/main/java/nu/marginalia/search/model/UrlDetails.java index 92524ca4..92c22e59 100644 --- a/code/services-core/search-service/src/main/java/nu/marginalia/search/model/UrlDetails.java +++ b/code/services-core/search-service/src/main/java/nu/marginalia/search/model/UrlDetails.java @@ -9,7 +9,6 @@ import nu.marginalia.model.crawl.HtmlFeature; import java.util.EnumSet; import java.util.List; -import java.util.Objects; import java.util.StringJoiner; @AllArgsConstructor @NoArgsConstructor @With @Getter @ToString @@ -98,14 +97,6 @@ public class UrlDetails { return getFeatureScore()*Math.sqrt(1+rankingId)/Math.max(1E-10, lengthAdjustment *(0.7+0.3*Math.exp(urlQualityAdjustment.getScore()))); } - public int getSuperficialHash() { - return Objects.hash(url.path, title); - } - public String getSuperficialHashStr() { - return String.format("%8X", getSuperficialHash()); - } - - public String getGeminiLink() { return url.proto + "://" + url.domain.toString() + url.path.replace(" ", "%20").replace("\"", "%22"); } @@ -173,9 +164,6 @@ public class UrlDetails { } public boolean isAds() { return HtmlFeature.hasFeature(features, HtmlFeature.TRACKING_ADTECH); } - public boolean isSpecialDomain() { - return domainState == DomainIndexingState.SPECIAL; - } public int getLogRank() { return (int) Math.round(Math.min(Math.log(1+rankingId),10)); } public int getMatchRank() { diff --git a/code/services-core/search-service/src/main/java/nu/marginalia/search/results/SearchResultDecorator.java b/code/services-core/search-service/src/main/java/nu/marginalia/search/results/SearchResultDecorator.java index 095382fe..83c5a0ce 100644 --- a/code/services-core/search-service/src/main/java/nu/marginalia/search/results/SearchResultDecorator.java +++ b/code/services-core/search-service/src/main/java/nu/marginalia/search/results/SearchResultDecorator.java @@ -3,6 +3,7 @@ package nu.marginalia.search.results; import it.unimi.dsi.fastutil.ints.Int2LongArrayMap; import lombok.SneakyThrows; import nu.marginalia.bbpc.BrailleBlockPunchCards; +import nu.marginalia.index.client.model.results.DecoratedSearchResultItem; import nu.marginalia.index.client.model.results.SearchResultItem; import nu.marginalia.index.client.model.results.SearchResultSet; import nu.marginalia.model.crawl.DomainIndexingState; @@ -18,9 +19,9 @@ public class SearchResultDecorator { private final Logger logger = LoggerFactory.getLogger(getClass()); @SneakyThrows - public List getAllUrlDetails(SearchResultSet resultSet) { + public List getAllUrlDetails(List resultSet) { List ret = new ArrayList<>(resultSet.size()); - for (var detail : resultSet.results) { + for (var detail : resultSet) { ret.add(new UrlDetails( detail.documentId(), detail.domainId(), diff --git a/code/services-core/search-service/src/main/java/nu/marginalia/search/results/UrlDeduplicator.java b/code/services-core/search-service/src/main/java/nu/marginalia/search/results/UrlDeduplicator.java index 7b083ef8..0c18294c 100644 --- a/code/services-core/search-service/src/main/java/nu/marginalia/search/results/UrlDeduplicator.java +++ b/code/services-core/search-service/src/main/java/nu/marginalia/search/results/UrlDeduplicator.java @@ -4,11 +4,14 @@ import gnu.trove.list.TLongList; import gnu.trove.list.array.TLongArrayList; import gnu.trove.map.hash.TObjectIntHashMap; import gnu.trove.set.hash.TIntHashSet; +import nu.marginalia.index.client.model.results.DecoratedSearchResultItem; import nu.marginalia.search.model.UrlDetails; import nu.marginalia.lsh.EasyLSH; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.util.Objects; + public class UrlDeduplicator { private final int LSH_SIMILARITY_THRESHOLD = 2; private static final Logger logger = LoggerFactory.getLogger(UrlDeduplicator.class); @@ -22,7 +25,7 @@ public class UrlDeduplicator { this.resultsPerKey = resultsPerKey; } - public synchronized boolean shouldRemove(UrlDetails details) { + public synchronized boolean shouldRemove(DecoratedSearchResultItem details) { if (!deduplicateOnSuperficialHash(details)) return true; if (!deduplicateOnLSH(details)) @@ -33,11 +36,11 @@ public class UrlDeduplicator { return false; } - private boolean deduplicateOnSuperficialHash(UrlDetails details) { - return seenSuperficialhashes.add(details.getSuperficialHash()); + private boolean deduplicateOnSuperficialHash(DecoratedSearchResultItem details) { + return seenSuperficialhashes.add(Objects.hash(details.url.path, details.title)); } - private boolean deduplicateOnLSH(UrlDetails details) { + private boolean deduplicateOnLSH(DecoratedSearchResultItem details) { long thisHash = details.dataHash; if (0 == thisHash) @@ -53,16 +56,9 @@ public class UrlDeduplicator { } - private boolean limitResultsPerDomain(UrlDetails details) { + private boolean limitResultsPerDomain(DecoratedSearchResultItem details) { final var domain = details.getUrl().getDomain(); - final String key; - - if (!details.isSpecialDomain()) { - key = domain.getLongDomainKey(); - } - else { - key = domain.getDomainKey(); - } + final String key = domain.getDomainKey(); return keyCount.adjustOrPutValue(key, 1, 1) < resultsPerKey; } diff --git a/code/services-core/search-service/src/main/java/nu/marginalia/search/svc/SearchQueryIndexService.java b/code/services-core/search-service/src/main/java/nu/marginalia/search/svc/SearchQueryIndexService.java index 4355793b..4968299f 100644 --- a/code/services-core/search-service/src/main/java/nu/marginalia/search/svc/SearchQueryIndexService.java +++ b/code/services-core/search-service/src/main/java/nu/marginalia/search/svc/SearchQueryIndexService.java @@ -4,6 +4,7 @@ import com.google.inject.Inject; import com.google.inject.Singleton; import nu.marginalia.index.client.IndexClient; import nu.marginalia.index.client.model.query.SearchSpecification; +import nu.marginalia.index.client.model.results.DecoratedSearchResultItem; import nu.marginalia.index.client.model.results.SearchResultSet; import nu.marginalia.search.model.PageScoreAdjustment; import nu.marginalia.search.model.UrlDetails; @@ -42,9 +43,12 @@ public class SearchQueryIndexService { } - public List executeQuery(Context ctx, SearchQuery processedQuery) { + public List executeQuery(Context ctx, SearchSpecification specs) { // Send the query - final SearchResultSet results = indexClient.query(ctx, processedQuery.specs); + final var queryResponse = indexClient.query(ctx, specs); + + // Remove duplicates and other chaff + final var results = limitAndDeduplicateResults(specs, queryResponse.results); // Update the query count (this is what you see on the front page) searchVisitorCount.registerQuery(); @@ -53,14 +57,14 @@ public class SearchQueryIndexService { List urlDetails = resultDecorator.getAllUrlDetails(results); urlDetails.sort(resultListComparator); - return limitAndDeduplicateResults(processedQuery, urlDetails); + return urlDetails; } - private List limitAndDeduplicateResults(SearchQuery processedQuery, List decoratedResults) { - var limits = processedQuery.specs.queryLimits; + private List limitAndDeduplicateResults(SearchSpecification specs, List decoratedResults) { + var limits = specs.queryLimits; UrlDeduplicator deduplicator = new UrlDeduplicator(limits.resultsByDomain()); - List retList = new ArrayList<>(limits.resultsTotal()); + List retList = new ArrayList<>(limits.resultsTotal()); int dedupCount = 0; for (var item : decoratedResults) {