(search/index) Add a new keyword "count"
This keyword filters results by how many times the search term appears on the domain. It is intended to help in building e.g. a domain search feature, and is also very helpful when tracking down spammy domains.
This commit is contained in:
parent
c0eaca220c
commit
4763077b76
@ -24,6 +24,8 @@ public class SearchSpecification {
|
|||||||
public final SpecificationLimit size;
|
public final SpecificationLimit size;
|
||||||
public final SpecificationLimit rank;
|
public final SpecificationLimit rank;
|
||||||
|
|
||||||
|
public final SpecificationLimit domainCount;
|
||||||
|
|
||||||
public final QueryLimits queryLimits;
|
public final QueryLimits queryLimits;
|
||||||
|
|
||||||
public final QueryStrategy queryStrategy;
|
public final QueryStrategy queryStrategy;
|
||||||
|
@ -25,9 +25,11 @@ message RpcQsQuery {
|
|||||||
RpcSpecLimit year = 8;
|
RpcSpecLimit year = 8;
|
||||||
RpcSpecLimit size = 9;
|
RpcSpecLimit size = 9;
|
||||||
RpcSpecLimit rank = 10;
|
RpcSpecLimit rank = 10;
|
||||||
repeated int32 domainIds = 11;
|
RpcSpecLimit domainCount = 11;
|
||||||
RpcQueryLimits queryLimits = 12;
|
repeated int32 domainIds = 12;
|
||||||
string searchSetIdentifier = 13;
|
RpcQueryLimits queryLimits = 13;
|
||||||
|
string searchSetIdentifier = 14;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Query service query response */
|
/* Query service query response */
|
||||||
@ -49,9 +51,10 @@ message RpcIndexQuery {
|
|||||||
RpcSpecLimit year = 6;
|
RpcSpecLimit year = 6;
|
||||||
RpcSpecLimit size = 7;
|
RpcSpecLimit size = 7;
|
||||||
RpcSpecLimit rank = 8;
|
RpcSpecLimit rank = 8;
|
||||||
RpcQueryLimits queryLimits = 9;
|
RpcSpecLimit domainCount = 9;
|
||||||
string queryStrategy = 10; // Named query configuration
|
RpcQueryLimits queryLimits = 10;
|
||||||
RpcResultRankingParameters parameters = 11;
|
string queryStrategy = 11; // Named query configuration
|
||||||
|
RpcResultRankingParameters parameters = 12;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* A tagged union encoding some limit on a field */
|
/* A tagged union encoding some limit on a field */
|
||||||
|
@ -38,6 +38,7 @@ public class QueryProtobufCodec {
|
|||||||
builder.setYear(convertSpecLimit(query.specs.year));
|
builder.setYear(convertSpecLimit(query.specs.year));
|
||||||
builder.setSize(convertSpecLimit(query.specs.size));
|
builder.setSize(convertSpecLimit(query.specs.size));
|
||||||
builder.setRank(convertSpecLimit(query.specs.rank));
|
builder.setRank(convertSpecLimit(query.specs.rank));
|
||||||
|
builder.setDomainCount(convertSpecLimit(query.specs.domainCount));
|
||||||
|
|
||||||
builder.setQueryLimits(IndexProtobufCodec.convertQueryLimits(query.specs.queryLimits));
|
builder.setQueryLimits(IndexProtobufCodec.convertQueryLimits(query.specs.queryLimits));
|
||||||
builder.setQueryStrategy(query.specs.queryStrategy.name());
|
builder.setQueryStrategy(query.specs.queryStrategy.name());
|
||||||
@ -58,6 +59,7 @@ public class QueryProtobufCodec {
|
|||||||
convertSpecLimit(request.getYear()),
|
convertSpecLimit(request.getYear()),
|
||||||
convertSpecLimit(request.getSize()),
|
convertSpecLimit(request.getSize()),
|
||||||
convertSpecLimit(request.getRank()),
|
convertSpecLimit(request.getRank()),
|
||||||
|
convertSpecLimit(request.getDomainCount()),
|
||||||
request.getDomainIdsList(),
|
request.getDomainIdsList(),
|
||||||
IndexProtobufCodec.convertQueryLimits(request.getQueryLimits()),
|
IndexProtobufCodec.convertQueryLimits(request.getQueryLimits()),
|
||||||
SearchSetIdentifier.valueOf(request.getSearchSetIdentifier()));
|
SearchSetIdentifier.valueOf(request.getSearchSetIdentifier()));
|
||||||
@ -137,6 +139,7 @@ public class QueryProtobufCodec {
|
|||||||
IndexProtobufCodec.convertSpecLimit(specs.getYear()),
|
IndexProtobufCodec.convertSpecLimit(specs.getYear()),
|
||||||
IndexProtobufCodec.convertSpecLimit(specs.getSize()),
|
IndexProtobufCodec.convertSpecLimit(specs.getSize()),
|
||||||
IndexProtobufCodec.convertSpecLimit(specs.getRank()),
|
IndexProtobufCodec.convertSpecLimit(specs.getRank()),
|
||||||
|
IndexProtobufCodec.convertSpecLimit(specs.getDomainCount()),
|
||||||
IndexProtobufCodec.convertQueryLimits(specs.getQueryLimits()),
|
IndexProtobufCodec.convertQueryLimits(specs.getQueryLimits()),
|
||||||
QueryStrategy.valueOf(specs.getQueryStrategy()),
|
QueryStrategy.valueOf(specs.getQueryStrategy()),
|
||||||
convertRankingParameterss(specs.getParameters())
|
convertRankingParameterss(specs.getParameters())
|
||||||
|
@ -20,6 +20,7 @@ public record QueryParams(
|
|||||||
SpecificationLimit year,
|
SpecificationLimit year,
|
||||||
SpecificationLimit size,
|
SpecificationLimit size,
|
||||||
SpecificationLimit rank,
|
SpecificationLimit rank,
|
||||||
|
SpecificationLimit domainCount,
|
||||||
List<Integer> domainIds,
|
List<Integer> domainIds,
|
||||||
QueryLimits limits,
|
QueryLimits limits,
|
||||||
SearchSetIdentifier identifier
|
SearchSetIdentifier identifier
|
||||||
@ -35,6 +36,7 @@ public record QueryParams(
|
|||||||
SpecificationLimit.none(),
|
SpecificationLimit.none(),
|
||||||
SpecificationLimit.none(),
|
SpecificationLimit.none(),
|
||||||
SpecificationLimit.none(),
|
SpecificationLimit.none(),
|
||||||
|
SpecificationLimit.none(),
|
||||||
List.of(),
|
List.of(),
|
||||||
limits,
|
limits,
|
||||||
identifier
|
identifier
|
||||||
|
@ -8,6 +8,7 @@ public record IndexQueryParams(SpecificationLimit qualityLimit,
|
|||||||
SpecificationLimit year,
|
SpecificationLimit year,
|
||||||
SpecificationLimit size,
|
SpecificationLimit size,
|
||||||
SpecificationLimit rank,
|
SpecificationLimit rank,
|
||||||
|
SpecificationLimit domainCount,
|
||||||
SearchSet searchSet,
|
SearchSet searchSet,
|
||||||
QueryStrategy queryStrategy
|
QueryStrategy queryStrategy
|
||||||
)
|
)
|
||||||
|
@ -1,8 +1,10 @@
|
|||||||
package nu.marginalia.index.query.limit;
|
package nu.marginalia.index.query.limit;
|
||||||
|
|
||||||
import lombok.ToString;
|
|
||||||
|
|
||||||
public record SpecificationLimit(SpecificationLimitType type, int value) {
|
public record SpecificationLimit(SpecificationLimitType type, int value) {
|
||||||
|
public boolean isNone() {
|
||||||
|
return type == SpecificationLimitType.NONE;
|
||||||
|
}
|
||||||
|
|
||||||
public static SpecificationLimit none() {
|
public static SpecificationLimit none() {
|
||||||
return new SpecificationLimit(SpecificationLimitType.NONE, 0);
|
return new SpecificationLimit(SpecificationLimitType.NONE, 0);
|
||||||
}
|
}
|
||||||
|
@ -82,6 +82,8 @@ public class QueryParser {
|
|||||||
entity.replace(new Token(TokenType.SIZE_TERM, t.str.substring(4), t.displayStr));
|
entity.replace(new Token(TokenType.SIZE_TERM, t.str.substring(4), t.displayStr));
|
||||||
} else if (t.str.startsWith("rank") && t.str.matches("rank[=><]\\d+")) {
|
} else if (t.str.startsWith("rank") && t.str.matches("rank[=><]\\d+")) {
|
||||||
entity.replace(new Token(TokenType.RANK_TERM, t.str.substring(4), t.displayStr));
|
entity.replace(new Token(TokenType.RANK_TERM, t.str.substring(4), t.displayStr));
|
||||||
|
} else if (t.str.startsWith("count") && t.str.matches("count[=><]\\d+")) {
|
||||||
|
entity.replace(new Token(TokenType.DOMAIN_COUNT_TERM, t.str.substring(5), t.displayStr));
|
||||||
} else if (t.str.startsWith("qs=")) {
|
} else if (t.str.startsWith("qs=")) {
|
||||||
entity.replace(new Token(TokenType.QS_TERM, t.str.substring(3), t.displayStr));
|
entity.replace(new Token(TokenType.QS_TERM, t.str.substring(3), t.displayStr));
|
||||||
} else if (t.str.contains(":")) {
|
} else if (t.str.contains(":")) {
|
||||||
|
@ -40,6 +40,7 @@ public class Token {
|
|||||||
|
|
||||||
case YEAR_TERM: visitor.onYearTerm(this); break;
|
case YEAR_TERM: visitor.onYearTerm(this); break;
|
||||||
case RANK_TERM: visitor.onRankTerm(this); break;
|
case RANK_TERM: visitor.onRankTerm(this); break;
|
||||||
|
case DOMAIN_COUNT_TERM: visitor.onDomainCountTerm(this); break;
|
||||||
case SIZE_TERM: visitor.onSizeTerm(this); break;
|
case SIZE_TERM: visitor.onSizeTerm(this); break;
|
||||||
case QS_TERM: visitor.onQsTerm(this); break;
|
case QS_TERM: visitor.onQsTerm(this); break;
|
||||||
|
|
||||||
|
@ -16,6 +16,7 @@ public enum TokenType implements Predicate<Token> {
|
|||||||
YEAR_TERM,
|
YEAR_TERM,
|
||||||
SIZE_TERM,
|
SIZE_TERM,
|
||||||
RANK_TERM,
|
RANK_TERM,
|
||||||
|
DOMAIN_COUNT_TERM,
|
||||||
NEAR_TERM,
|
NEAR_TERM,
|
||||||
|
|
||||||
QS_TERM,
|
QS_TERM,
|
||||||
|
@ -9,6 +9,7 @@ public interface TokenVisitor {
|
|||||||
void onYearTerm(Token token);
|
void onYearTerm(Token token);
|
||||||
void onSizeTerm(Token token);
|
void onSizeTerm(Token token);
|
||||||
void onRankTerm(Token token);
|
void onRankTerm(Token token);
|
||||||
|
void onDomainCountTerm(Token token);
|
||||||
void onQualityTerm(Token token);
|
void onQualityTerm(Token token);
|
||||||
void onQsTerm(Token token);
|
void onQsTerm(Token token);
|
||||||
}
|
}
|
||||||
|
@ -29,6 +29,7 @@ public class SearchQueryParamFactory {
|
|||||||
profile.getYearLimit(),
|
profile.getYearLimit(),
|
||||||
profile.getSizeLimit(),
|
profile.getSizeLimit(),
|
||||||
SpecificationLimit.none(),
|
SpecificationLimit.none(),
|
||||||
|
SpecificationLimit.none(),
|
||||||
List.of(),
|
List.of(),
|
||||||
new QueryLimits(1, 100, 200, 8192),
|
new QueryLimits(1, 100, 200, 8192),
|
||||||
profile.searchSetIdentifier
|
profile.searchSetIdentifier
|
||||||
@ -47,6 +48,7 @@ public class SearchQueryParamFactory {
|
|||||||
SpecificationLimit.none(),
|
SpecificationLimit.none(),
|
||||||
SpecificationLimit.none(),
|
SpecificationLimit.none(),
|
||||||
SpecificationLimit.none(),
|
SpecificationLimit.none(),
|
||||||
|
SpecificationLimit.none(),
|
||||||
List.of(),
|
List.of(),
|
||||||
new QueryLimits(count, count, 100, 512),
|
new QueryLimits(count, count, 100, 512),
|
||||||
SearchSetIdentifier.NONE
|
SearchSetIdentifier.NONE
|
||||||
@ -64,6 +66,7 @@ public class SearchQueryParamFactory {
|
|||||||
SpecificationLimit.none(),
|
SpecificationLimit.none(),
|
||||||
SpecificationLimit.none(),
|
SpecificationLimit.none(),
|
||||||
SpecificationLimit.none(),
|
SpecificationLimit.none(),
|
||||||
|
SpecificationLimit.none(),
|
||||||
List.of(),
|
List.of(),
|
||||||
new QueryLimits(100, 100, 100, 512),
|
new QueryLimits(100, 100, 100, 512),
|
||||||
SearchSetIdentifier.NONE
|
SearchSetIdentifier.NONE
|
||||||
@ -81,6 +84,7 @@ public class SearchQueryParamFactory {
|
|||||||
SpecificationLimit.none(),
|
SpecificationLimit.none(),
|
||||||
SpecificationLimit.none(),
|
SpecificationLimit.none(),
|
||||||
SpecificationLimit.none(),
|
SpecificationLimit.none(),
|
||||||
|
SpecificationLimit.none(),
|
||||||
List.of(),
|
List.of(),
|
||||||
new QueryLimits(100, 100, 100, 512),
|
new QueryLimits(100, 100, 100, 512),
|
||||||
SearchSetIdentifier.NONE
|
SearchSetIdentifier.NONE
|
||||||
|
@ -56,8 +56,12 @@
|
|||||||
<tr><td>year=2005</td><td>(beta) The document was ostensibly published in 2005</td></tr>
|
<tr><td>year=2005</td><td>(beta) The document was ostensibly published in 2005</td></tr>
|
||||||
<tr><td>year<2005</td><td>(beta) The document was ostensibly published in or before 2005</td></tr>
|
<tr><td>year<2005</td><td>(beta) The document was ostensibly published in or before 2005</td></tr>
|
||||||
|
|
||||||
<tr><td>rank>50</td><td>(beta) The ranking of the website is at least 50 in a span of 1 - 255</td></tr>
|
<tr><td>rank>50</td><td>The ranking of the website is at least 50 in a span of 1 - 255</td></tr>
|
||||||
<tr><td>rank<50</td><td>(beta) The ranking of the website is at most 50 in a span of 1 - 255</td></tr>
|
<tr><td>rank<50</td><td>The ranking of the website is at most 50 in a span of 1 - 255</td></tr>
|
||||||
|
|
||||||
|
<tr><td>count>10</td><td> The search term must appear in at least 10 results from the domain</td></tr>
|
||||||
|
<tr><td>count<10</td><td> The search term must appear in at most 10 results from the domain</td></tr>
|
||||||
|
|
||||||
|
|
||||||
<tr><td>format:html5</td><td>Filter documents using the HTML5 standard. This is typically modern websites.</td></tr>
|
<tr><td>format:html5</td><td>Filter documents using the HTML5 standard. This is typically modern websites.</td></tr>
|
||||||
<tr><td>format:xhtml</td><td>Filter documents using the XHTML standard</td></tr>
|
<tr><td>format:xhtml</td><td>Filter documents using the XHTML standard</td></tr>
|
||||||
|
@ -415,6 +415,13 @@ public class IndexQueryService extends IndexApiImplBase {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (!params.queryParams.domainCount().isNone()) {
|
||||||
|
// Remove items that don't meet the domain count requirement
|
||||||
|
// This isn't perfect because the domain count is calculated
|
||||||
|
// after the results are sorted
|
||||||
|
resultsList.removeIf(item -> !params.queryParams.domainCount().test(domainCountFilter.getCount(item)));
|
||||||
|
}
|
||||||
|
|
||||||
if (resultsList.size() > params.limitTotal) {
|
if (resultsList.size() > params.limitTotal) {
|
||||||
// This can't be made a stream limit() operation because we need domainCountFilter
|
// This can't be made a stream limit() operation because we need domainCountFilter
|
||||||
// to run over the entire list to provide accurate statistics
|
// to run over the entire list to provide accurate statistics
|
||||||
|
@ -65,6 +65,7 @@ public class SearchParameters {
|
|||||||
specsSet.year,
|
specsSet.year,
|
||||||
specsSet.size,
|
specsSet.size,
|
||||||
specsSet.rank,
|
specsSet.rank,
|
||||||
|
specsSet.domainCount,
|
||||||
searchSet,
|
searchSet,
|
||||||
specsSet.queryStrategy);
|
specsSet.queryStrategy);
|
||||||
|
|
||||||
@ -90,6 +91,7 @@ public class SearchParameters {
|
|||||||
IndexProtobufCodec.convertSpecLimit(request.getYear()),
|
IndexProtobufCodec.convertSpecLimit(request.getYear()),
|
||||||
IndexProtobufCodec.convertSpecLimit(request.getSize()),
|
IndexProtobufCodec.convertSpecLimit(request.getSize()),
|
||||||
IndexProtobufCodec.convertSpecLimit(request.getRank()),
|
IndexProtobufCodec.convertSpecLimit(request.getRank()),
|
||||||
|
IndexProtobufCodec.convertSpecLimit(request.getDomainCount()),
|
||||||
searchSet,
|
searchSet,
|
||||||
QueryStrategy.valueOf(request.getQueryStrategy()));
|
QueryStrategy.valueOf(request.getQueryStrategy()));
|
||||||
|
|
||||||
|
@ -127,6 +127,7 @@ public class QueryFactory {
|
|||||||
.subqueries(subqueries)
|
.subqueries(subqueries)
|
||||||
.humanQuery(query)
|
.humanQuery(query)
|
||||||
.quality(qualityLimits.qualityLimit)
|
.quality(qualityLimits.qualityLimit)
|
||||||
|
.domainCount(qualityLimits.domainCount)
|
||||||
.year(qualityLimits.year)
|
.year(qualityLimits.year)
|
||||||
.size(qualityLimits.size)
|
.size(qualityLimits.size)
|
||||||
.rank(qualityLimits.rank)
|
.rank(qualityLimits.rank)
|
||||||
|
@ -11,6 +11,7 @@ public class QueryLimitsAccumulator implements TokenVisitor {
|
|||||||
public SpecificationLimit year;
|
public SpecificationLimit year;
|
||||||
public SpecificationLimit size;
|
public SpecificationLimit size;
|
||||||
public SpecificationLimit rank;
|
public SpecificationLimit rank;
|
||||||
|
public SpecificationLimit domainCount;
|
||||||
|
|
||||||
public QueryStrategy queryStrategy = QueryStrategy.AUTO;
|
public QueryStrategy queryStrategy = QueryStrategy.AUTO;
|
||||||
|
|
||||||
@ -19,6 +20,7 @@ public class QueryLimitsAccumulator implements TokenVisitor {
|
|||||||
year = params.year();
|
year = params.year();
|
||||||
size = params.size();
|
size = params.size();
|
||||||
rank = params.rank();
|
rank = params.rank();
|
||||||
|
domainCount = params.domainCount();
|
||||||
}
|
}
|
||||||
|
|
||||||
private SpecificationLimit parseSpecificationLimit(String str) {
|
private SpecificationLimit parseSpecificationLimit(String str) {
|
||||||
@ -64,6 +66,11 @@ public class QueryLimitsAccumulator implements TokenVisitor {
|
|||||||
rank = parseSpecificationLimit(token.str);
|
rank = parseSpecificationLimit(token.str);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void onDomainCountTerm(Token token) {
|
||||||
|
domainCount = parseSpecificationLimit(token.str);
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void onQualityTerm(Token token) {
|
public void onQualityTerm(Token token) {
|
||||||
qualityLimit = parseSpecificationLimit(token.str);
|
qualityLimit = parseSpecificationLimit(token.str);
|
||||||
|
@ -97,27 +97,15 @@ public class QuerySearchTermsAccumulator implements TokenVisitor {
|
|||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void onYearTerm(Token token) {
|
public void onYearTerm(Token token) {}
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void onSizeTerm(Token token) {
|
public void onSizeTerm(Token token) {}
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void onRankTerm(Token token) {
|
public void onRankTerm(Token token) {}
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void onQualityTerm(Token token) {
|
public void onDomainCountTerm(Token token) {}
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void onQsTerm(Token token) {
|
public void onQualityTerm(Token token) {}
|
||||||
|
@Override
|
||||||
}
|
public void onQsTerm(Token token) {}
|
||||||
}
|
}
|
||||||
|
@ -46,6 +46,7 @@ public class QueryFactoryTest {
|
|||||||
SpecificationLimit.none(),
|
SpecificationLimit.none(),
|
||||||
SpecificationLimit.none(),
|
SpecificationLimit.none(),
|
||||||
SpecificationLimit.none(),
|
SpecificationLimit.none(),
|
||||||
|
SpecificationLimit.none(),
|
||||||
null,
|
null,
|
||||||
new QueryLimits(100, 100, 100, 100),
|
new QueryLimits(100, 100, 100, 100),
|
||||||
SearchSetIdentifier.BLOGS)).specs;
|
SearchSetIdentifier.BLOGS)).specs;
|
||||||
|
Loading…
Reference in New Issue
Block a user