(search) Refactor SearchQueryIndexService in preparation for feature extraction.
Prefer working on DecoratedSearchResultItem instead of UrlDetails.
This commit is contained in:
parent
77ccab7d80
commit
cf366c602f
@ -66,7 +66,7 @@ public class SearchOperator {
|
||||
|
||||
logger.info(queryMarker, "Human terms (API): {}", Strings.join(processedQuery.searchTermsHuman, ','));
|
||||
|
||||
return searchQueryService.executeQuery(ctx, processedQuery);
|
||||
return searchQueryService.executeQuery(ctx, processedQuery.specs);
|
||||
}
|
||||
|
||||
public DecoratedSearchResults doSearch(Context ctx, UserSearchParameters params) {
|
||||
@ -76,7 +76,7 @@ public class SearchOperator {
|
||||
|
||||
logger.info(queryMarker, "Human terms: {}", Strings.join(processedQuery.searchTermsHuman, ','));
|
||||
|
||||
List<UrlDetails> queryResults = searchQueryService.executeQuery(ctx, processedQuery);
|
||||
List<UrlDetails> queryResults = searchQueryService.executeQuery(ctx, processedQuery.specs);
|
||||
|
||||
logger.info(queryMarker, "Search Result Count: {}", queryResults.size());
|
||||
|
||||
|
@ -64,7 +64,7 @@ public class SiteListCommand implements SearchCommandInterface {
|
||||
int domainId = -1;
|
||||
if (null != domain) {
|
||||
var dumbQuery = queryFactory.createQuery(SearchProfile.CORPO, 100, 100, "site:"+domain);
|
||||
resultSet = searchQueryIndexService.executeQuery(ctx, dumbQuery);
|
||||
resultSet = searchQueryIndexService.executeQuery(ctx, dumbQuery.specs);
|
||||
var maybeId = domainQueries.tryGetDomainId(domain);
|
||||
if (maybeId.isPresent()) {
|
||||
domainId = maybeId.getAsInt();
|
||||
|
@ -9,7 +9,6 @@ import nu.marginalia.model.crawl.HtmlFeature;
|
||||
|
||||
import java.util.EnumSet;
|
||||
import java.util.List;
|
||||
import java.util.Objects;
|
||||
import java.util.StringJoiner;
|
||||
|
||||
@AllArgsConstructor @NoArgsConstructor @With @Getter @ToString
|
||||
@ -98,14 +97,6 @@ public class UrlDetails {
|
||||
return getFeatureScore()*Math.sqrt(1+rankingId)/Math.max(1E-10, lengthAdjustment *(0.7+0.3*Math.exp(urlQualityAdjustment.getScore())));
|
||||
}
|
||||
|
||||
public int getSuperficialHash() {
|
||||
return Objects.hash(url.path, title);
|
||||
}
|
||||
public String getSuperficialHashStr() {
|
||||
return String.format("%8X", getSuperficialHash());
|
||||
}
|
||||
|
||||
|
||||
public String getGeminiLink() {
|
||||
return url.proto + "://" + url.domain.toString() + url.path.replace(" ", "%20").replace("\"", "%22");
|
||||
}
|
||||
@ -173,9 +164,6 @@ public class UrlDetails {
|
||||
}
|
||||
public boolean isAds() { return HtmlFeature.hasFeature(features, HtmlFeature.TRACKING_ADTECH); }
|
||||
|
||||
public boolean isSpecialDomain() {
|
||||
return domainState == DomainIndexingState.SPECIAL;
|
||||
}
|
||||
public int getLogRank() { return (int) Math.round(Math.min(Math.log(1+rankingId),10)); }
|
||||
|
||||
public int getMatchRank() {
|
||||
|
@ -3,6 +3,7 @@ package nu.marginalia.search.results;
|
||||
import it.unimi.dsi.fastutil.ints.Int2LongArrayMap;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.bbpc.BrailleBlockPunchCards;
|
||||
import nu.marginalia.index.client.model.results.DecoratedSearchResultItem;
|
||||
import nu.marginalia.index.client.model.results.SearchResultItem;
|
||||
import nu.marginalia.index.client.model.results.SearchResultSet;
|
||||
import nu.marginalia.model.crawl.DomainIndexingState;
|
||||
@ -18,9 +19,9 @@ public class SearchResultDecorator {
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
|
||||
@SneakyThrows
|
||||
public List<UrlDetails> getAllUrlDetails(SearchResultSet resultSet) {
|
||||
public List<UrlDetails> getAllUrlDetails(List<DecoratedSearchResultItem> resultSet) {
|
||||
List<UrlDetails> ret = new ArrayList<>(resultSet.size());
|
||||
for (var detail : resultSet.results) {
|
||||
for (var detail : resultSet) {
|
||||
ret.add(new UrlDetails(
|
||||
detail.documentId(),
|
||||
detail.domainId(),
|
||||
|
@ -4,11 +4,14 @@ import gnu.trove.list.TLongList;
|
||||
import gnu.trove.list.array.TLongArrayList;
|
||||
import gnu.trove.map.hash.TObjectIntHashMap;
|
||||
import gnu.trove.set.hash.TIntHashSet;
|
||||
import nu.marginalia.index.client.model.results.DecoratedSearchResultItem;
|
||||
import nu.marginalia.search.model.UrlDetails;
|
||||
import nu.marginalia.lsh.EasyLSH;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.util.Objects;
|
||||
|
||||
public class UrlDeduplicator {
|
||||
private final int LSH_SIMILARITY_THRESHOLD = 2;
|
||||
private static final Logger logger = LoggerFactory.getLogger(UrlDeduplicator.class);
|
||||
@ -22,7 +25,7 @@ public class UrlDeduplicator {
|
||||
this.resultsPerKey = resultsPerKey;
|
||||
}
|
||||
|
||||
public synchronized boolean shouldRemove(UrlDetails details) {
|
||||
public synchronized boolean shouldRemove(DecoratedSearchResultItem details) {
|
||||
if (!deduplicateOnSuperficialHash(details))
|
||||
return true;
|
||||
if (!deduplicateOnLSH(details))
|
||||
@ -33,11 +36,11 @@ public class UrlDeduplicator {
|
||||
return false;
|
||||
}
|
||||
|
||||
private boolean deduplicateOnSuperficialHash(UrlDetails details) {
|
||||
return seenSuperficialhashes.add(details.getSuperficialHash());
|
||||
private boolean deduplicateOnSuperficialHash(DecoratedSearchResultItem details) {
|
||||
return seenSuperficialhashes.add(Objects.hash(details.url.path, details.title));
|
||||
}
|
||||
|
||||
private boolean deduplicateOnLSH(UrlDetails details) {
|
||||
private boolean deduplicateOnLSH(DecoratedSearchResultItem details) {
|
||||
long thisHash = details.dataHash;
|
||||
|
||||
if (0 == thisHash)
|
||||
@ -53,16 +56,9 @@ public class UrlDeduplicator {
|
||||
|
||||
}
|
||||
|
||||
private boolean limitResultsPerDomain(UrlDetails details) {
|
||||
private boolean limitResultsPerDomain(DecoratedSearchResultItem details) {
|
||||
final var domain = details.getUrl().getDomain();
|
||||
final String key;
|
||||
|
||||
if (!details.isSpecialDomain()) {
|
||||
key = domain.getLongDomainKey();
|
||||
}
|
||||
else {
|
||||
key = domain.getDomainKey();
|
||||
}
|
||||
final String key = domain.getDomainKey();
|
||||
|
||||
return keyCount.adjustOrPutValue(key, 1, 1) < resultsPerKey;
|
||||
}
|
||||
|
@ -4,6 +4,7 @@ import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import nu.marginalia.index.client.IndexClient;
|
||||
import nu.marginalia.index.client.model.query.SearchSpecification;
|
||||
import nu.marginalia.index.client.model.results.DecoratedSearchResultItem;
|
||||
import nu.marginalia.index.client.model.results.SearchResultSet;
|
||||
import nu.marginalia.search.model.PageScoreAdjustment;
|
||||
import nu.marginalia.search.model.UrlDetails;
|
||||
@ -42,9 +43,12 @@ public class SearchQueryIndexService {
|
||||
|
||||
}
|
||||
|
||||
public List<UrlDetails> executeQuery(Context ctx, SearchQuery processedQuery) {
|
||||
public List<UrlDetails> executeQuery(Context ctx, SearchSpecification specs) {
|
||||
// Send the query
|
||||
final SearchResultSet results = indexClient.query(ctx, processedQuery.specs);
|
||||
final var queryResponse = indexClient.query(ctx, specs);
|
||||
|
||||
// Remove duplicates and other chaff
|
||||
final var results = limitAndDeduplicateResults(specs, queryResponse.results);
|
||||
|
||||
// Update the query count (this is what you see on the front page)
|
||||
searchVisitorCount.registerQuery();
|
||||
@ -53,14 +57,14 @@ public class SearchQueryIndexService {
|
||||
List<UrlDetails> urlDetails = resultDecorator.getAllUrlDetails(results);
|
||||
urlDetails.sort(resultListComparator);
|
||||
|
||||
return limitAndDeduplicateResults(processedQuery, urlDetails);
|
||||
return urlDetails;
|
||||
}
|
||||
|
||||
private List<UrlDetails> limitAndDeduplicateResults(SearchQuery processedQuery, List<UrlDetails> decoratedResults) {
|
||||
var limits = processedQuery.specs.queryLimits;
|
||||
private List<DecoratedSearchResultItem> limitAndDeduplicateResults(SearchSpecification specs, List<DecoratedSearchResultItem> decoratedResults) {
|
||||
var limits = specs.queryLimits;
|
||||
|
||||
UrlDeduplicator deduplicator = new UrlDeduplicator(limits.resultsByDomain());
|
||||
List<UrlDetails> retList = new ArrayList<>(limits.resultsTotal());
|
||||
List<DecoratedSearchResultItem> retList = new ArrayList<>(limits.resultsTotal());
|
||||
|
||||
int dedupCount = 0;
|
||||
for (var item : decoratedResults) {
|
||||
|
Loading…
Reference in New Issue
Block a user