(search) Refactor SearchQueryIndexService in preparation for feature extraction.

Prefer working on DecoratedSearchResultItem over UrlDetails.
This commit is contained in:
Viktor Lofgren 2023-10-08 17:15:41 +02:00
parent 77ccab7d80
commit cf366c602f
6 changed files with 25 additions and 36 deletions

View File

@ -66,7 +66,7 @@ public class SearchOperator {
logger.info(queryMarker, "Human terms (API): {}", Strings.join(processedQuery.searchTermsHuman, ','));
return searchQueryService.executeQuery(ctx, processedQuery);
return searchQueryService.executeQuery(ctx, processedQuery.specs);
}
public DecoratedSearchResults doSearch(Context ctx, UserSearchParameters params) {
@ -76,7 +76,7 @@ public class SearchOperator {
logger.info(queryMarker, "Human terms: {}", Strings.join(processedQuery.searchTermsHuman, ','));
List<UrlDetails> queryResults = searchQueryService.executeQuery(ctx, processedQuery);
List<UrlDetails> queryResults = searchQueryService.executeQuery(ctx, processedQuery.specs);
logger.info(queryMarker, "Search Result Count: {}", queryResults.size());

View File

@ -64,7 +64,7 @@ public class SiteListCommand implements SearchCommandInterface {
int domainId = -1;
if (null != domain) {
var dumbQuery = queryFactory.createQuery(SearchProfile.CORPO, 100, 100, "site:"+domain);
resultSet = searchQueryIndexService.executeQuery(ctx, dumbQuery);
resultSet = searchQueryIndexService.executeQuery(ctx, dumbQuery.specs);
var maybeId = domainQueries.tryGetDomainId(domain);
if (maybeId.isPresent()) {
domainId = maybeId.getAsInt();

View File

@ -9,7 +9,6 @@ import nu.marginalia.model.crawl.HtmlFeature;
import java.util.EnumSet;
import java.util.List;
import java.util.Objects;
import java.util.StringJoiner;
@AllArgsConstructor @NoArgsConstructor @With @Getter @ToString
@ -98,14 +97,6 @@ public class UrlDetails {
return getFeatureScore()*Math.sqrt(1+rankingId)/Math.max(1E-10, lengthAdjustment *(0.7+0.3*Math.exp(urlQualityAdjustment.getScore())));
}
public int getSuperficialHash() {
return Objects.hash(url.path, title);
}
public String getSuperficialHashStr() {
return String.format("%8X", getSuperficialHash());
}
public String getGeminiLink() {
return url.proto + "://" + url.domain.toString() + url.path.replace(" ", "%20").replace("\"", "%22");
}
@ -173,9 +164,6 @@ public class UrlDetails {
}
public boolean isAds() { return HtmlFeature.hasFeature(features, HtmlFeature.TRACKING_ADTECH); }
public boolean isSpecialDomain() {
return domainState == DomainIndexingState.SPECIAL;
}
public int getLogRank() { return (int) Math.round(Math.min(Math.log(1+rankingId),10)); }
public int getMatchRank() {

View File

@ -3,6 +3,7 @@ package nu.marginalia.search.results;
import it.unimi.dsi.fastutil.ints.Int2LongArrayMap;
import lombok.SneakyThrows;
import nu.marginalia.bbpc.BrailleBlockPunchCards;
import nu.marginalia.index.client.model.results.DecoratedSearchResultItem;
import nu.marginalia.index.client.model.results.SearchResultItem;
import nu.marginalia.index.client.model.results.SearchResultSet;
import nu.marginalia.model.crawl.DomainIndexingState;
@ -18,9 +19,9 @@ public class SearchResultDecorator {
private final Logger logger = LoggerFactory.getLogger(getClass());
@SneakyThrows
public List<UrlDetails> getAllUrlDetails(SearchResultSet resultSet) {
public List<UrlDetails> getAllUrlDetails(List<DecoratedSearchResultItem> resultSet) {
List<UrlDetails> ret = new ArrayList<>(resultSet.size());
for (var detail : resultSet.results) {
for (var detail : resultSet) {
ret.add(new UrlDetails(
detail.documentId(),
detail.domainId(),

View File

@ -4,11 +4,14 @@ import gnu.trove.list.TLongList;
import gnu.trove.list.array.TLongArrayList;
import gnu.trove.map.hash.TObjectIntHashMap;
import gnu.trove.set.hash.TIntHashSet;
import nu.marginalia.index.client.model.results.DecoratedSearchResultItem;
import nu.marginalia.search.model.UrlDetails;
import nu.marginalia.lsh.EasyLSH;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.Objects;
public class UrlDeduplicator {
private final int LSH_SIMILARITY_THRESHOLD = 2;
private static final Logger logger = LoggerFactory.getLogger(UrlDeduplicator.class);
@ -22,7 +25,7 @@ public class UrlDeduplicator {
this.resultsPerKey = resultsPerKey;
}
public synchronized boolean shouldRemove(UrlDetails details) {
public synchronized boolean shouldRemove(DecoratedSearchResultItem details) {
if (!deduplicateOnSuperficialHash(details))
return true;
if (!deduplicateOnLSH(details))
@ -33,11 +36,11 @@ public class UrlDeduplicator {
return false;
}
private boolean deduplicateOnSuperficialHash(UrlDetails details) {
return seenSuperficialhashes.add(details.getSuperficialHash());
private boolean deduplicateOnSuperficialHash(DecoratedSearchResultItem details) {
return seenSuperficialhashes.add(Objects.hash(details.url.path, details.title));
}
private boolean deduplicateOnLSH(UrlDetails details) {
private boolean deduplicateOnLSH(DecoratedSearchResultItem details) {
long thisHash = details.dataHash;
if (0 == thisHash)
@ -53,16 +56,9 @@ public class UrlDeduplicator {
}
private boolean limitResultsPerDomain(UrlDetails details) {
private boolean limitResultsPerDomain(DecoratedSearchResultItem details) {
final var domain = details.getUrl().getDomain();
final String key;
if (!details.isSpecialDomain()) {
key = domain.getLongDomainKey();
}
else {
key = domain.getDomainKey();
}
final String key = domain.getDomainKey();
return keyCount.adjustOrPutValue(key, 1, 1) < resultsPerKey;
}

View File

@ -4,6 +4,7 @@ import com.google.inject.Inject;
import com.google.inject.Singleton;
import nu.marginalia.index.client.IndexClient;
import nu.marginalia.index.client.model.query.SearchSpecification;
import nu.marginalia.index.client.model.results.DecoratedSearchResultItem;
import nu.marginalia.index.client.model.results.SearchResultSet;
import nu.marginalia.search.model.PageScoreAdjustment;
import nu.marginalia.search.model.UrlDetails;
@ -42,9 +43,12 @@ public class SearchQueryIndexService {
}
public List<UrlDetails> executeQuery(Context ctx, SearchQuery processedQuery) {
public List<UrlDetails> executeQuery(Context ctx, SearchSpecification specs) {
// Send the query
final SearchResultSet results = indexClient.query(ctx, processedQuery.specs);
final var queryResponse = indexClient.query(ctx, specs);
// Remove duplicates and other chaff
final var results = limitAndDeduplicateResults(specs, queryResponse.results);
// Update the query count (this is what you see on the front page)
searchVisitorCount.registerQuery();
@ -53,14 +57,14 @@ public class SearchQueryIndexService {
List<UrlDetails> urlDetails = resultDecorator.getAllUrlDetails(results);
urlDetails.sort(resultListComparator);
return limitAndDeduplicateResults(processedQuery, urlDetails);
return urlDetails;
}
private List<UrlDetails> limitAndDeduplicateResults(SearchQuery processedQuery, List<UrlDetails> decoratedResults) {
var limits = processedQuery.specs.queryLimits;
private List<DecoratedSearchResultItem> limitAndDeduplicateResults(SearchSpecification specs, List<DecoratedSearchResultItem> decoratedResults) {
var limits = specs.queryLimits;
UrlDeduplicator deduplicator = new UrlDeduplicator(limits.resultsByDomain());
List<UrlDetails> retList = new ArrayList<>(limits.resultsTotal());
List<DecoratedSearchResultItem> retList = new ArrayList<>(limits.resultsTotal());
int dedupCount = 0;
for (var item : decoratedResults) {