(search/index) Add a new keyword "count"

This keyword filters results by how many times the term appears on the domain. The intent is to help in building e.g. a domain search feature. It's also very helpful when tracking down spammy domains.
This commit is contained in:
Viktor Lofgren 2023-12-25 20:38:29 +01:00
parent c0eaca220c
commit 4763077b76
18 changed files with 61 additions and 29 deletions

View File

@ -24,6 +24,8 @@ public class SearchSpecification {
public final SpecificationLimit size; public final SpecificationLimit size;
public final SpecificationLimit rank; public final SpecificationLimit rank;
public final SpecificationLimit domainCount;
public final QueryLimits queryLimits; public final QueryLimits queryLimits;
public final QueryStrategy queryStrategy; public final QueryStrategy queryStrategy;

View File

@ -25,9 +25,11 @@ message RpcQsQuery {
RpcSpecLimit year = 8; RpcSpecLimit year = 8;
RpcSpecLimit size = 9; RpcSpecLimit size = 9;
RpcSpecLimit rank = 10; RpcSpecLimit rank = 10;
repeated int32 domainIds = 11; RpcSpecLimit domainCount = 11;
RpcQueryLimits queryLimits = 12; repeated int32 domainIds = 12;
string searchSetIdentifier = 13; RpcQueryLimits queryLimits = 13;
string searchSetIdentifier = 14;
} }
/* Query service query response */ /* Query service query response */
@ -49,9 +51,10 @@ message RpcIndexQuery {
RpcSpecLimit year = 6; RpcSpecLimit year = 6;
RpcSpecLimit size = 7; RpcSpecLimit size = 7;
RpcSpecLimit rank = 8; RpcSpecLimit rank = 8;
RpcQueryLimits queryLimits = 9; RpcSpecLimit domainCount = 9;
string queryStrategy = 10; // Named query configuration RpcQueryLimits queryLimits = 10;
RpcResultRankingParameters parameters = 11; string queryStrategy = 11; // Named query configuration
RpcResultRankingParameters parameters = 12;
} }
/* A tagged union encoding some limit on a field */ /* A tagged union encoding some limit on a field */

View File

@ -38,6 +38,7 @@ public class QueryProtobufCodec {
builder.setYear(convertSpecLimit(query.specs.year)); builder.setYear(convertSpecLimit(query.specs.year));
builder.setSize(convertSpecLimit(query.specs.size)); builder.setSize(convertSpecLimit(query.specs.size));
builder.setRank(convertSpecLimit(query.specs.rank)); builder.setRank(convertSpecLimit(query.specs.rank));
builder.setDomainCount(convertSpecLimit(query.specs.domainCount));
builder.setQueryLimits(IndexProtobufCodec.convertQueryLimits(query.specs.queryLimits)); builder.setQueryLimits(IndexProtobufCodec.convertQueryLimits(query.specs.queryLimits));
builder.setQueryStrategy(query.specs.queryStrategy.name()); builder.setQueryStrategy(query.specs.queryStrategy.name());
@ -58,6 +59,7 @@ public class QueryProtobufCodec {
convertSpecLimit(request.getYear()), convertSpecLimit(request.getYear()),
convertSpecLimit(request.getSize()), convertSpecLimit(request.getSize()),
convertSpecLimit(request.getRank()), convertSpecLimit(request.getRank()),
convertSpecLimit(request.getDomainCount()),
request.getDomainIdsList(), request.getDomainIdsList(),
IndexProtobufCodec.convertQueryLimits(request.getQueryLimits()), IndexProtobufCodec.convertQueryLimits(request.getQueryLimits()),
SearchSetIdentifier.valueOf(request.getSearchSetIdentifier())); SearchSetIdentifier.valueOf(request.getSearchSetIdentifier()));
@ -137,6 +139,7 @@ public class QueryProtobufCodec {
IndexProtobufCodec.convertSpecLimit(specs.getYear()), IndexProtobufCodec.convertSpecLimit(specs.getYear()),
IndexProtobufCodec.convertSpecLimit(specs.getSize()), IndexProtobufCodec.convertSpecLimit(specs.getSize()),
IndexProtobufCodec.convertSpecLimit(specs.getRank()), IndexProtobufCodec.convertSpecLimit(specs.getRank()),
IndexProtobufCodec.convertSpecLimit(specs.getDomainCount()),
IndexProtobufCodec.convertQueryLimits(specs.getQueryLimits()), IndexProtobufCodec.convertQueryLimits(specs.getQueryLimits()),
QueryStrategy.valueOf(specs.getQueryStrategy()), QueryStrategy.valueOf(specs.getQueryStrategy()),
convertRankingParameterss(specs.getParameters()) convertRankingParameterss(specs.getParameters())

View File

@ -20,6 +20,7 @@ public record QueryParams(
SpecificationLimit year, SpecificationLimit year,
SpecificationLimit size, SpecificationLimit size,
SpecificationLimit rank, SpecificationLimit rank,
SpecificationLimit domainCount,
List<Integer> domainIds, List<Integer> domainIds,
QueryLimits limits, QueryLimits limits,
SearchSetIdentifier identifier SearchSetIdentifier identifier
@ -35,6 +36,7 @@ public record QueryParams(
SpecificationLimit.none(), SpecificationLimit.none(),
SpecificationLimit.none(), SpecificationLimit.none(),
SpecificationLimit.none(), SpecificationLimit.none(),
SpecificationLimit.none(),
List.of(), List.of(),
limits, limits,
identifier identifier

View File

@ -8,6 +8,7 @@ public record IndexQueryParams(SpecificationLimit qualityLimit,
SpecificationLimit year, SpecificationLimit year,
SpecificationLimit size, SpecificationLimit size,
SpecificationLimit rank, SpecificationLimit rank,
SpecificationLimit domainCount,
SearchSet searchSet, SearchSet searchSet,
QueryStrategy queryStrategy QueryStrategy queryStrategy
) )

View File

@ -1,8 +1,10 @@
package nu.marginalia.index.query.limit; package nu.marginalia.index.query.limit;
import lombok.ToString;
public record SpecificationLimit(SpecificationLimitType type, int value) { public record SpecificationLimit(SpecificationLimitType type, int value) {
public boolean isNone() {
return type == SpecificationLimitType.NONE;
}
public static SpecificationLimit none() { public static SpecificationLimit none() {
return new SpecificationLimit(SpecificationLimitType.NONE, 0); return new SpecificationLimit(SpecificationLimitType.NONE, 0);
} }

View File

@ -82,6 +82,8 @@ public class QueryParser {
entity.replace(new Token(TokenType.SIZE_TERM, t.str.substring(4), t.displayStr)); entity.replace(new Token(TokenType.SIZE_TERM, t.str.substring(4), t.displayStr));
} else if (t.str.startsWith("rank") && t.str.matches("rank[=><]\\d+")) { } else if (t.str.startsWith("rank") && t.str.matches("rank[=><]\\d+")) {
entity.replace(new Token(TokenType.RANK_TERM, t.str.substring(4), t.displayStr)); entity.replace(new Token(TokenType.RANK_TERM, t.str.substring(4), t.displayStr));
} else if (t.str.startsWith("count") && t.str.matches("count[=><]\\d+")) {
entity.replace(new Token(TokenType.DOMAIN_COUNT_TERM, t.str.substring(5), t.displayStr));
} else if (t.str.startsWith("qs=")) { } else if (t.str.startsWith("qs=")) {
entity.replace(new Token(TokenType.QS_TERM, t.str.substring(3), t.displayStr)); entity.replace(new Token(TokenType.QS_TERM, t.str.substring(3), t.displayStr));
} else if (t.str.contains(":")) { } else if (t.str.contains(":")) {

View File

@ -40,6 +40,7 @@ public class Token {
case YEAR_TERM: visitor.onYearTerm(this); break; case YEAR_TERM: visitor.onYearTerm(this); break;
case RANK_TERM: visitor.onRankTerm(this); break; case RANK_TERM: visitor.onRankTerm(this); break;
case DOMAIN_COUNT_TERM: visitor.onDomainCountTerm(this); break;
case SIZE_TERM: visitor.onSizeTerm(this); break; case SIZE_TERM: visitor.onSizeTerm(this); break;
case QS_TERM: visitor.onQsTerm(this); break; case QS_TERM: visitor.onQsTerm(this); break;

View File

@ -16,6 +16,7 @@ public enum TokenType implements Predicate<Token> {
YEAR_TERM, YEAR_TERM,
SIZE_TERM, SIZE_TERM,
RANK_TERM, RANK_TERM,
DOMAIN_COUNT_TERM,
NEAR_TERM, NEAR_TERM,
QS_TERM, QS_TERM,

View File

@ -9,6 +9,7 @@ public interface TokenVisitor {
void onYearTerm(Token token); void onYearTerm(Token token);
void onSizeTerm(Token token); void onSizeTerm(Token token);
void onRankTerm(Token token); void onRankTerm(Token token);
void onDomainCountTerm(Token token);
void onQualityTerm(Token token); void onQualityTerm(Token token);
void onQsTerm(Token token); void onQsTerm(Token token);
} }

View File

@ -29,6 +29,7 @@ public class SearchQueryParamFactory {
profile.getYearLimit(), profile.getYearLimit(),
profile.getSizeLimit(), profile.getSizeLimit(),
SpecificationLimit.none(), SpecificationLimit.none(),
SpecificationLimit.none(),
List.of(), List.of(),
new QueryLimits(1, 100, 200, 8192), new QueryLimits(1, 100, 200, 8192),
profile.searchSetIdentifier profile.searchSetIdentifier
@ -47,6 +48,7 @@ public class SearchQueryParamFactory {
SpecificationLimit.none(), SpecificationLimit.none(),
SpecificationLimit.none(), SpecificationLimit.none(),
SpecificationLimit.none(), SpecificationLimit.none(),
SpecificationLimit.none(),
List.of(), List.of(),
new QueryLimits(count, count, 100, 512), new QueryLimits(count, count, 100, 512),
SearchSetIdentifier.NONE SearchSetIdentifier.NONE
@ -64,6 +66,7 @@ public class SearchQueryParamFactory {
SpecificationLimit.none(), SpecificationLimit.none(),
SpecificationLimit.none(), SpecificationLimit.none(),
SpecificationLimit.none(), SpecificationLimit.none(),
SpecificationLimit.none(),
List.of(), List.of(),
new QueryLimits(100, 100, 100, 512), new QueryLimits(100, 100, 100, 512),
SearchSetIdentifier.NONE SearchSetIdentifier.NONE
@ -81,6 +84,7 @@ public class SearchQueryParamFactory {
SpecificationLimit.none(), SpecificationLimit.none(),
SpecificationLimit.none(), SpecificationLimit.none(),
SpecificationLimit.none(), SpecificationLimit.none(),
SpecificationLimit.none(),
List.of(), List.of(),
new QueryLimits(100, 100, 100, 512), new QueryLimits(100, 100, 100, 512),
SearchSetIdentifier.NONE SearchSetIdentifier.NONE

View File

@ -56,8 +56,12 @@
<tr><td>year=2005</td><td>(beta) The document was ostensibly published in 2005</td></tr> <tr><td>year=2005</td><td>(beta) The document was ostensibly published in 2005</td></tr>
<tr><td>year&lt;2005</td><td>(beta) The document was ostensibly published in or before 2005</td></tr> <tr><td>year&lt;2005</td><td>(beta) The document was ostensibly published in or before 2005</td></tr>
<tr><td>rank&gt;50</td><td>(beta) The ranking of the website is at least 50 in a span of 1 - 255</td></tr> <tr><td>rank&gt;50</td><td>The ranking of the website is at least 50 in a span of 1 - 255</td></tr>
<tr><td>rank&lt;50</td><td>(beta) The ranking of the website is at most 50 in a span of 1 - 255</td></tr> <tr><td>rank&lt;50</td><td>The ranking of the website is at most 50 in a span of 1 - 255</td></tr>
<tr><td>count&gt;10</td><td> The search term must appear in at least 10 results from the domain</td></tr>
<tr><td>count&lt;10</td><td> The search term must appear in at most 10 results from the domain</td></tr>
<tr><td>format:html5</td><td>Filter documents using the HTML5 standard. This is typically modern websites.</td></tr> <tr><td>format:html5</td><td>Filter documents using the HTML5 standard. This is typically modern websites.</td></tr>
<tr><td>format:xhtml</td><td>Filter documents using the XHTML standard</td></tr> <tr><td>format:xhtml</td><td>Filter documents using the XHTML standard</td></tr>

View File

@ -415,6 +415,13 @@ public class IndexQueryService extends IndexApiImplBase {
} }
} }
if (!params.queryParams.domainCount().isNone()) {
// Remove items that don't meet the domain count requirement
// This isn't perfect because the domain count is calculated
// after the results are sorted
resultsList.removeIf(item -> !params.queryParams.domainCount().test(domainCountFilter.getCount(item)));
}
if (resultsList.size() > params.limitTotal) { if (resultsList.size() > params.limitTotal) {
// This can't be made a stream limit() operation because we need domainCountFilter // This can't be made a stream limit() operation because we need domainCountFilter
// to run over the entire list to provide accurate statistics // to run over the entire list to provide accurate statistics

View File

@ -65,6 +65,7 @@ public class SearchParameters {
specsSet.year, specsSet.year,
specsSet.size, specsSet.size,
specsSet.rank, specsSet.rank,
specsSet.domainCount,
searchSet, searchSet,
specsSet.queryStrategy); specsSet.queryStrategy);
@ -90,6 +91,7 @@ public class SearchParameters {
IndexProtobufCodec.convertSpecLimit(request.getYear()), IndexProtobufCodec.convertSpecLimit(request.getYear()),
IndexProtobufCodec.convertSpecLimit(request.getSize()), IndexProtobufCodec.convertSpecLimit(request.getSize()),
IndexProtobufCodec.convertSpecLimit(request.getRank()), IndexProtobufCodec.convertSpecLimit(request.getRank()),
IndexProtobufCodec.convertSpecLimit(request.getDomainCount()),
searchSet, searchSet,
QueryStrategy.valueOf(request.getQueryStrategy())); QueryStrategy.valueOf(request.getQueryStrategy()));

View File

@ -127,6 +127,7 @@ public class QueryFactory {
.subqueries(subqueries) .subqueries(subqueries)
.humanQuery(query) .humanQuery(query)
.quality(qualityLimits.qualityLimit) .quality(qualityLimits.qualityLimit)
.domainCount(qualityLimits.domainCount)
.year(qualityLimits.year) .year(qualityLimits.year)
.size(qualityLimits.size) .size(qualityLimits.size)
.rank(qualityLimits.rank) .rank(qualityLimits.rank)

View File

@ -11,6 +11,7 @@ public class QueryLimitsAccumulator implements TokenVisitor {
public SpecificationLimit year; public SpecificationLimit year;
public SpecificationLimit size; public SpecificationLimit size;
public SpecificationLimit rank; public SpecificationLimit rank;
public SpecificationLimit domainCount;
public QueryStrategy queryStrategy = QueryStrategy.AUTO; public QueryStrategy queryStrategy = QueryStrategy.AUTO;
@ -19,6 +20,7 @@ public class QueryLimitsAccumulator implements TokenVisitor {
year = params.year(); year = params.year();
size = params.size(); size = params.size();
rank = params.rank(); rank = params.rank();
domainCount = params.domainCount();
} }
private SpecificationLimit parseSpecificationLimit(String str) { private SpecificationLimit parseSpecificationLimit(String str) {
@ -64,6 +66,11 @@ public class QueryLimitsAccumulator implements TokenVisitor {
rank = parseSpecificationLimit(token.str); rank = parseSpecificationLimit(token.str);
} }
@Override
public void onDomainCountTerm(Token token) {
domainCount = parseSpecificationLimit(token.str);
}
@Override @Override
public void onQualityTerm(Token token) { public void onQualityTerm(Token token) {
qualityLimit = parseSpecificationLimit(token.str); qualityLimit = parseSpecificationLimit(token.str);

View File

@ -97,27 +97,15 @@ public class QuerySearchTermsAccumulator implements TokenVisitor {
} }
@Override @Override
public void onYearTerm(Token token) { public void onYearTerm(Token token) {}
}
@Override @Override
public void onSizeTerm(Token token) { public void onSizeTerm(Token token) {}
}
@Override @Override
public void onRankTerm(Token token) { public void onRankTerm(Token token) {}
}
@Override @Override
public void onQualityTerm(Token token) { public void onDomainCountTerm(Token token) {}
}
@Override @Override
public void onQsTerm(Token token) { public void onQualityTerm(Token token) {}
@Override
} public void onQsTerm(Token token) {}
} }

View File

@ -46,6 +46,7 @@ public class QueryFactoryTest {
SpecificationLimit.none(), SpecificationLimit.none(),
SpecificationLimit.none(), SpecificationLimit.none(),
SpecificationLimit.none(), SpecificationLimit.none(),
SpecificationLimit.none(),
null, null,
new QueryLimits(100, 100, 100, 100), new QueryLimits(100, 100, 100, 100),
SearchSetIdentifier.BLOGS)).specs; SearchSetIdentifier.BLOGS)).specs;