Merge pull request #53 from MarginaliaSearch/standalone-index
Move ranking to the index-service, and query parsing to a new query-service; separate out the search-service
This commit is contained in:
commit
c8d820c17b
@ -5,4 +5,6 @@ public class IndexMqEndpoints {
|
|||||||
public static final String INDEX_REPARTITION = "INDEX-REPARTITION";
|
public static final String INDEX_REPARTITION = "INDEX-REPARTITION";
|
||||||
public static final String SWITCH_INDEX = "SWITCH-INDEX";
|
public static final String SWITCH_INDEX = "SWITCH-INDEX";
|
||||||
|
|
||||||
|
public static final String SWITCH_LINKDB = "SWITCH_LINKDB";
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -0,0 +1,82 @@
|
|||||||
|
package nu.marginalia.index.client.model.results;
|
||||||
|
|
||||||
|
import lombok.Getter;
|
||||||
|
import lombok.ToString;
|
||||||
|
import nu.marginalia.model.EdgeUrl;
|
||||||
|
import org.jetbrains.annotations.NotNull;
|
||||||
|
|
||||||
|
import javax.annotation.Nullable;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
@Getter
|
||||||
|
@ToString
|
||||||
|
public class DecoratedSearchResultItem {
|
||||||
|
public final SearchResultItem rawIndexResult;
|
||||||
|
|
||||||
|
@NotNull
|
||||||
|
public final EdgeUrl url;
|
||||||
|
@NotNull
|
||||||
|
public final String title;
|
||||||
|
@NotNull
|
||||||
|
public final String description;
|
||||||
|
public final double urlQuality;
|
||||||
|
@NotNull
|
||||||
|
public final String format;
|
||||||
|
|
||||||
|
/** Document features bitmask, see HtmlFeature */
|
||||||
|
public final int features;
|
||||||
|
|
||||||
|
@Nullable
|
||||||
|
public final Integer pubYear;
|
||||||
|
public final long dataHash;
|
||||||
|
public final int wordsTotal;
|
||||||
|
public final double rankingScore;
|
||||||
|
|
||||||
|
public long documentId() {
|
||||||
|
return rawIndexResult.getDocumentId();
|
||||||
|
}
|
||||||
|
public int domainId() {
|
||||||
|
return rawIndexResult.getDomainId();
|
||||||
|
}
|
||||||
|
public int resultsFromDomain() {
|
||||||
|
return rawIndexResult.getResultsFromDomain();
|
||||||
|
}
|
||||||
|
|
||||||
|
public List<SearchResultKeywordScore> keywordScores() {
|
||||||
|
return rawIndexResult.getKeywordScores();
|
||||||
|
}
|
||||||
|
|
||||||
|
public long rankingId() {
|
||||||
|
return rawIndexResult.getRanking();
|
||||||
|
}
|
||||||
|
|
||||||
|
public DecoratedSearchResultItem(SearchResultItem rawIndexResult,
|
||||||
|
@NotNull
|
||||||
|
EdgeUrl url,
|
||||||
|
@NotNull
|
||||||
|
String title,
|
||||||
|
@NotNull
|
||||||
|
String description,
|
||||||
|
double urlQuality,
|
||||||
|
@NotNull
|
||||||
|
String format,
|
||||||
|
int features,
|
||||||
|
@Nullable
|
||||||
|
Integer pubYear,
|
||||||
|
long dataHash,
|
||||||
|
int wordsTotal,
|
||||||
|
double rankingScore)
|
||||||
|
{
|
||||||
|
this.rawIndexResult = rawIndexResult;
|
||||||
|
this.url = url;
|
||||||
|
this.title = title;
|
||||||
|
this.description = description;
|
||||||
|
this.urlQuality = urlQuality;
|
||||||
|
this.format = format;
|
||||||
|
this.features = features;
|
||||||
|
this.pubYear = pubYear;
|
||||||
|
this.dataHash = dataHash;
|
||||||
|
this.wordsTotal = wordsTotal;
|
||||||
|
this.rankingScore = rankingScore;
|
||||||
|
}
|
||||||
|
}
|
@ -8,8 +8,7 @@ import java.util.List;
|
|||||||
|
|
||||||
@AllArgsConstructor @Getter @ToString
|
@AllArgsConstructor @Getter @ToString
|
||||||
public class SearchResultSet {
|
public class SearchResultSet {
|
||||||
public List<SearchResultItem> results;
|
public List<DecoratedSearchResultItem> results;
|
||||||
public ResultRankingContext rankingContext;
|
|
||||||
public int size() {
|
public int size() {
|
||||||
return results.size();
|
return results.size();
|
||||||
}
|
}
|
||||||
|
@ -1,7 +1,5 @@
|
|||||||
plugins {
|
plugins {
|
||||||
id 'java'
|
id 'java'
|
||||||
|
|
||||||
|
|
||||||
id 'jvm-test-suite'
|
id 'jvm-test-suite'
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -13,8 +11,10 @@ java {
|
|||||||
|
|
||||||
dependencies {
|
dependencies {
|
||||||
implementation project(':code:common:model')
|
implementation project(':code:common:model')
|
||||||
|
implementation project(':code:api:index-api')
|
||||||
implementation project(':code:common:config')
|
implementation project(':code:common:config')
|
||||||
implementation project(':code:libraries:message-queue')
|
implementation project(':code:libraries:message-queue')
|
||||||
|
implementation project(':code:features-index:index-query')
|
||||||
implementation project(':code:common:service-discovery')
|
implementation project(':code:common:service-discovery')
|
||||||
implementation project(':code:common:service-client')
|
implementation project(':code:common:service-client')
|
||||||
|
|
@ -0,0 +1,65 @@
|
|||||||
|
package nu.marginalia.query.client;
|
||||||
|
|
||||||
|
import com.google.inject.Inject;
|
||||||
|
import com.google.inject.Singleton;
|
||||||
|
import io.prometheus.client.Summary;
|
||||||
|
import io.reactivex.rxjava3.core.Observable;
|
||||||
|
import nu.marginalia.WmsaHome;
|
||||||
|
import nu.marginalia.client.AbstractDynamicClient;
|
||||||
|
import nu.marginalia.client.Context;
|
||||||
|
import nu.marginalia.index.client.model.query.SearchSpecification;
|
||||||
|
import nu.marginalia.index.client.model.results.SearchResultSet;
|
||||||
|
import nu.marginalia.model.gson.GsonFactory;
|
||||||
|
import nu.marginalia.mq.MessageQueueFactory;
|
||||||
|
import nu.marginalia.mq.outbox.MqOutbox;
|
||||||
|
import nu.marginalia.query.model.QueryParams;
|
||||||
|
import nu.marginalia.query.model.QueryResponse;
|
||||||
|
import nu.marginalia.service.descriptor.ServiceDescriptors;
|
||||||
|
import nu.marginalia.service.id.ServiceId;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
import javax.annotation.CheckReturnValue;
|
||||||
|
import java.util.UUID;
|
||||||
|
|
||||||
|
@Singleton
|
||||||
|
public class QueryClient extends AbstractDynamicClient {
|
||||||
|
|
||||||
|
private static final Summary wmsa_search_index_api_delegate_time = Summary.build().name("wmsa_search_index_api_delegate_time").help("-").register();
|
||||||
|
private static final Summary wmsa_search_index_api_search_time = Summary.build().name("wmsa_search_index_api_search_time").help("-").register();
|
||||||
|
|
||||||
|
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||||
|
|
||||||
|
private final MqOutbox outbox;
|
||||||
|
|
||||||
|
@Inject
|
||||||
|
public QueryClient(ServiceDescriptors descriptors,
|
||||||
|
MessageQueueFactory messageQueueFactory) {
|
||||||
|
|
||||||
|
super(descriptors.forId(ServiceId.Query), WmsaHome.getHostsFile(), GsonFactory::get);
|
||||||
|
|
||||||
|
String inboxName = ServiceId.Query.name + ":" + "0";
|
||||||
|
String outboxName = System.getProperty("service-name", UUID.randomUUID().toString());
|
||||||
|
|
||||||
|
outbox = messageQueueFactory.createOutbox(inboxName, outboxName, UUID.randomUUID());
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Delegate an Index API style query directly to the index service */
|
||||||
|
@CheckReturnValue
|
||||||
|
public SearchResultSet delegate(Context ctx, SearchSpecification specs) {
|
||||||
|
return wmsa_search_index_api_delegate_time.time(
|
||||||
|
() -> this.postGet(ctx, "/delegate/", specs, SearchResultSet.class).blockingFirst()
|
||||||
|
);
|
||||||
|
}
|
||||||
|
@CheckReturnValue
|
||||||
|
public QueryResponse search(Context ctx, QueryParams params) {
|
||||||
|
return wmsa_search_index_api_search_time.time(
|
||||||
|
() -> this.postGet(ctx, "/search/", params, QueryResponse.class).blockingFirst()
|
||||||
|
);
|
||||||
|
}
|
||||||
|
public MqOutbox outbox() {
|
||||||
|
return outbox;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
@ -0,0 +1,21 @@
|
|||||||
|
package nu.marginalia.query.model;
|
||||||
|
|
||||||
|
import nu.marginalia.index.client.model.query.SearchSpecification;
|
||||||
|
|
||||||
|
import java.util.*;
|
||||||
|
|
||||||
|
public class ProcessedQuery {
|
||||||
|
public final SearchSpecification specs;
|
||||||
|
public final List<String> searchTermsHuman;
|
||||||
|
public final String domain;
|
||||||
|
|
||||||
|
public ProcessedQuery(SearchSpecification specs, List<String> searchTermsHuman, String domain) {
|
||||||
|
this.specs = specs;
|
||||||
|
this.searchTermsHuman = searchTermsHuman;
|
||||||
|
this.domain = domain;
|
||||||
|
}
|
||||||
|
|
||||||
|
public ProcessedQuery(SearchSpecification justSpecs) {
|
||||||
|
this(justSpecs, List.of(), null);
|
||||||
|
}
|
||||||
|
}
|
@ -0,0 +1,41 @@
|
|||||||
|
package nu.marginalia.query.model;
|
||||||
|
|
||||||
|
import nu.marginalia.index.client.model.query.SearchSetIdentifier;
|
||||||
|
import nu.marginalia.index.client.model.query.SearchSpecification;
|
||||||
|
import nu.marginalia.index.query.limit.QueryLimits;
|
||||||
|
import nu.marginalia.index.query.limit.SpecificationLimit;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
public record QueryParams(
|
||||||
|
String humanQuery,
|
||||||
|
String nearDomain,
|
||||||
|
List<String> tacitIncludes,
|
||||||
|
List<String> tacitExcludes,
|
||||||
|
List<String> tacitPriority,
|
||||||
|
List<String> tacitAdvice,
|
||||||
|
SpecificationLimit quality,
|
||||||
|
SpecificationLimit year,
|
||||||
|
SpecificationLimit size,
|
||||||
|
SpecificationLimit rank,
|
||||||
|
List<Integer> domainIds,
|
||||||
|
QueryLimits limits,
|
||||||
|
SearchSetIdentifier identifier
|
||||||
|
)
|
||||||
|
{
|
||||||
|
public QueryParams(String query, QueryLimits limits, SearchSetIdentifier identifier) {
|
||||||
|
this(query, null,
|
||||||
|
List.of(),
|
||||||
|
List.of(),
|
||||||
|
List.of(),
|
||||||
|
List.of(),
|
||||||
|
SpecificationLimit.none(),
|
||||||
|
SpecificationLimit.none(),
|
||||||
|
SpecificationLimit.none(),
|
||||||
|
SpecificationLimit.none(),
|
||||||
|
List.of(),
|
||||||
|
limits,
|
||||||
|
identifier
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
@ -0,0 +1,23 @@
|
|||||||
|
package nu.marginalia.query.model;
|
||||||
|
|
||||||
|
import nu.marginalia.index.client.model.query.SearchSpecification;
|
||||||
|
import nu.marginalia.index.client.model.results.DecoratedSearchResultItem;
|
||||||
|
|
||||||
|
import java.util.HashSet;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
|
public record QueryResponse(SearchSpecification specs,
|
||||||
|
List<DecoratedSearchResultItem> results,
|
||||||
|
List<String> searchTermsHuman,
|
||||||
|
List<String> problems,
|
||||||
|
String domain)
|
||||||
|
{
|
||||||
|
public Set<String> getAllKeywords() {
|
||||||
|
Set<String> keywords = new HashSet<>(100);
|
||||||
|
for (var sq : specs.subqueries) {
|
||||||
|
keywords.addAll(sq.searchTermsInclude);
|
||||||
|
}
|
||||||
|
return keywords;
|
||||||
|
}
|
||||||
|
}
|
@ -3,7 +3,7 @@
|
|||||||
## Core Services
|
## Core Services
|
||||||
|
|
||||||
* [assistant-api](assistant-api/)
|
* [assistant-api](assistant-api/)
|
||||||
* [search-api](search-api/)
|
* [query-api](query-api/)
|
||||||
* [index-api](index-api/)
|
* [index-api](index-api/)
|
||||||
|
|
||||||
These are clients for the [core services](../services-core/), along with what models
|
These are clients for the [core services](../services-core/), along with what models
|
||||||
@ -13,11 +13,11 @@ are necessary for speaking to them. They each implement the abstract client clas
|
|||||||
All that is necessary is to `@Inject` them into the constructor and then
|
All that is necessary is to `@Inject` them into the constructor and then
|
||||||
requests can be sent.
|
requests can be sent.
|
||||||
|
|
||||||
**Note:** If you are looking for the public API, it's handled by the api service in [services-satellite/api-service](../services-satellite/api-service).
|
**Note:** If you are looking for the public API, it's handled by the api service in [services-application/api-service](../services-application/api-service).
|
||||||
|
|
||||||
## MQ-API Process API
|
## MQ-API Process API
|
||||||
|
|
||||||
[process-mqapi](process-mqapi/) defines requests and inboxes for the message queue based API used
|
[process-mqapi](process-mqapi/) defines requests and inboxes for the message queue based API used
|
||||||
for interacting with processes.
|
for interacting with processes.
|
||||||
|
|
||||||
See [libraries/message-queue](../libraries/message-queue) and [services-satellite/control-service](../services-satellite/control-service).
|
See [libraries/message-queue](../libraries/message-queue) and [services-core/control-service](../services-core/control-service).
|
@ -1,8 +0,0 @@
|
|||||||
# Search API
|
|
||||||
|
|
||||||
Client and models for talking to the [search-service](../../services-core/search-service),
|
|
||||||
implemented with the base client from [service-client](../../common/service-client).
|
|
||||||
|
|
||||||
## Central Classes
|
|
||||||
|
|
||||||
* [SearchClient](src/main/java/nu/marginalia/search/client/SearchClient.java)
|
|
@ -1,52 +0,0 @@
|
|||||||
package nu.marginalia.search.client;
|
|
||||||
|
|
||||||
import com.google.inject.Inject;
|
|
||||||
import com.google.inject.Singleton;
|
|
||||||
import io.reactivex.rxjava3.core.Observable;
|
|
||||||
import nu.marginalia.client.AbstractDynamicClient;
|
|
||||||
import nu.marginalia.model.gson.GsonFactory;
|
|
||||||
import nu.marginalia.mq.MessageQueueFactory;
|
|
||||||
import nu.marginalia.mq.outbox.MqOutbox;
|
|
||||||
import nu.marginalia.search.client.model.ApiSearchResults;
|
|
||||||
import nu.marginalia.service.descriptor.ServiceDescriptors;
|
|
||||||
import nu.marginalia.service.id.ServiceId;
|
|
||||||
import nu.marginalia.WmsaHome;
|
|
||||||
import nu.marginalia.client.Context;
|
|
||||||
import org.slf4j.Logger;
|
|
||||||
import org.slf4j.LoggerFactory;
|
|
||||||
|
|
||||||
import javax.annotation.CheckReturnValue;
|
|
||||||
import java.net.URLEncoder;
|
|
||||||
import java.nio.charset.StandardCharsets;
|
|
||||||
import java.util.UUID;
|
|
||||||
|
|
||||||
@Singleton
|
|
||||||
public class SearchClient extends AbstractDynamicClient {
|
|
||||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
|
||||||
|
|
||||||
private final MqOutbox outbox;
|
|
||||||
|
|
||||||
@Inject
|
|
||||||
public SearchClient(ServiceDescriptors descriptors,
|
|
||||||
MessageQueueFactory messageQueueFactory) {
|
|
||||||
|
|
||||||
super(descriptors.forId(ServiceId.Search), WmsaHome.getHostsFile(), GsonFactory::get);
|
|
||||||
|
|
||||||
String inboxName = ServiceId.Search.name + ":" + "0";
|
|
||||||
String outboxName = System.getProperty("service-name", UUID.randomUUID().toString());
|
|
||||||
|
|
||||||
outbox = messageQueueFactory.createOutbox(inboxName, outboxName, UUID.randomUUID());
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public MqOutbox outbox() {
|
|
||||||
return outbox;
|
|
||||||
}
|
|
||||||
|
|
||||||
@CheckReturnValue
|
|
||||||
public Observable<ApiSearchResults> query(Context ctx, String queryString, int count, int profile) {
|
|
||||||
return this.get(ctx, String.format("/api/search?query=%s&count=%d&index=%d", URLEncoder.encode(queryString, StandardCharsets.UTF_8), count, profile), ApiSearchResults.class);
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
@ -1,7 +0,0 @@
|
|||||||
package nu.marginalia.search.client;
|
|
||||||
|
|
||||||
public class SearchMqEndpoints {
|
|
||||||
/** Flushes the URL caches, run if significant changes have occurred in the URLs database */
|
|
||||||
public static final String FLUSH_CACHES = "FLUSH_CACHES";
|
|
||||||
public static final String SWITCH_LINKDB = "SWITCH_LINKDB";
|
|
||||||
}
|
|
@ -8,4 +8,4 @@ this information take effect in production immediately, even before
|
|||||||
the information was searchable.
|
the information was searchable.
|
||||||
|
|
||||||
It is constructed by the [loading-process](../../processes/loading-process), and consumed
|
It is constructed by the [loading-process](../../processes/loading-process), and consumed
|
||||||
by the [search-service](../../services-core/search-service).
|
by the [index-service](../../services-core/index-service).
|
@ -62,6 +62,15 @@ public class LinkdbReader {
|
|||||||
connection = createConnection();
|
connection = createConnection();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Re-establishes the connection, useful in tests and not
|
||||||
|
* much else */
|
||||||
|
public void reconnect() throws SQLException {
|
||||||
|
if (connection != null)
|
||||||
|
connection.close();
|
||||||
|
|
||||||
|
connection = createConnection();
|
||||||
|
}
|
||||||
|
|
||||||
public List<String> getUrlsFromDomain(int domainId) throws SQLException {
|
public List<String> getUrlsFromDomain(int domainId) throws SQLException {
|
||||||
if (connection == null ||
|
if (connection == null ||
|
||||||
connection.isClosed())
|
connection.isClosed())
|
||||||
|
@ -6,7 +6,6 @@ This package contains common models to the search engine
|
|||||||
|
|
||||||
* [EdgeDomain](src/main/java/nu/marginalia/model/EdgeDomain.java)
|
* [EdgeDomain](src/main/java/nu/marginalia/model/EdgeDomain.java)
|
||||||
* [EdgeUrl](src/main/java/nu/marginalia/model/EdgeUrl.java)
|
* [EdgeUrl](src/main/java/nu/marginalia/model/EdgeUrl.java)
|
||||||
* [EdgeId](src/main/java/nu/marginalia/model/id/EdgeId.java)
|
|
||||||
* [DocumentMetadata](src/main/java/nu/marginalia/model/idx/DocumentMetadata.java)
|
* [DocumentMetadata](src/main/java/nu/marginalia/model/idx/DocumentMetadata.java)
|
||||||
* [DocumentFlags](src/main/java/nu/marginalia/model/idx/DocumentFlags.java)
|
* [DocumentFlags](src/main/java/nu/marginalia/model/idx/DocumentFlags.java)
|
||||||
* [WordMetadata](src/main/java/nu/marginalia/model/idx/WordMetadata.java)
|
* [WordMetadata](src/main/java/nu/marginalia/model/idx/WordMetadata.java)
|
||||||
|
@ -10,6 +10,7 @@ public class SearchServiceDescriptors {
|
|||||||
public static ServiceDescriptors descriptors = new ServiceDescriptors(
|
public static ServiceDescriptors descriptors = new ServiceDescriptors(
|
||||||
List.of(new ServiceDescriptor(ServiceId.Api, 5004),
|
List.of(new ServiceDescriptor(ServiceId.Api, 5004),
|
||||||
new ServiceDescriptor(ServiceId.Index, 5021),
|
new ServiceDescriptor(ServiceId.Index, 5021),
|
||||||
|
new ServiceDescriptor(ServiceId.Query, 5022),
|
||||||
new ServiceDescriptor(ServiceId.Search, 5023),
|
new ServiceDescriptor(ServiceId.Search, 5023),
|
||||||
new ServiceDescriptor(ServiceId.Assistant, 5025),
|
new ServiceDescriptor(ServiceId.Assistant, 5025),
|
||||||
new ServiceDescriptor(ServiceId.Dating, 5070),
|
new ServiceDescriptor(ServiceId.Dating, 5070),
|
||||||
|
@ -6,6 +6,7 @@ public enum ServiceId {
|
|||||||
Api("api-service"),
|
Api("api-service"),
|
||||||
Search("search-service"),
|
Search("search-service"),
|
||||||
Index("index-service"),
|
Index("index-service"),
|
||||||
|
Query("query-service"),
|
||||||
|
|
||||||
Control("control-service"),
|
Control("control-service"),
|
||||||
|
|
||||||
|
@ -17,7 +17,7 @@ Contains domain ranking algorithms.
|
|||||||
|
|
||||||
## See Also
|
## See Also
|
||||||
|
|
||||||
* [features-search/result-ranking](../../features-search/result-ranking) - Ranks search results
|
* [result-ranking](../result-ranking) - Ranks search results
|
||||||
|
|
||||||
## Useful Resources
|
## Useful Resources
|
||||||
|
|
||||||
|
@ -1,4 +1,7 @@
|
|||||||
package nu.marginalia.index.query.limit;
|
package nu.marginalia.index.query.limit;
|
||||||
|
|
||||||
public record QueryLimits(int resultsByDomain, int resultsTotal, int timeoutMs, int fetchSize) {
|
public record QueryLimits(int resultsByDomain, int resultsTotal, int timeoutMs, int fetchSize) {
|
||||||
|
public QueryLimits forSingleDomain() {
|
||||||
|
return new QueryLimits(resultsTotal, resultsTotal, timeoutMs, fetchSize);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@ -14,11 +14,11 @@ These indices rely heavily on the [libraries/btree](../libraries/btree) and [lib
|
|||||||
|
|
||||||
## Algorithms
|
## Algorithms
|
||||||
|
|
||||||
* [domain-ranking](domain-ranking/) contains ranking algorithms.
|
* [domain-ranking](domain-ranking/) contains domain ranking algorithms.
|
||||||
|
* [result-ranking](result-ranking/) contains logic for ranking search results by relevance.
|
||||||
|
|
||||||
# Libraries
|
# Libraries
|
||||||
|
|
||||||
* [index-query](index-query/) contains structures for evaluating search queries.
|
* [index-query](index-query/) contains structures for evaluating search queries.
|
||||||
* [index-journal](index-journal/) contains tools for writing and reading index data.
|
* [index-journal](index-journal/) contains tools for writing and reading index data.
|
||||||
* [lexicon](lexicon/) contains a mapping between words' string representation and an unique integer identifier.
|
|
||||||
|
|
||||||
|
@ -11,4 +11,4 @@ results higher.
|
|||||||
|
|
||||||
## See Also
|
## See Also
|
||||||
|
|
||||||
* [features-index/domain-ranking](../../features-index/domain-ranking) - Ranks domains
|
* [features-index/domain-ranking](../domain-ranking) - Ranks domains
|
@ -1,6 +1,6 @@
|
|||||||
# Query Parser
|
# Query Parser
|
||||||
|
|
||||||
End-user search query parsing tools used by the [search-service](../../services-core/search-service).
|
End-user search query parsing tools used by the [query-service](../../services-core/query-service).
|
||||||
|
|
||||||
## Central Classes
|
## Central Classes
|
||||||
|
|
@ -36,7 +36,6 @@ public class Token {
|
|||||||
case EXCLUDE_TERM: visitor.onExcludeTerm(this); break;
|
case EXCLUDE_TERM: visitor.onExcludeTerm(this); break;
|
||||||
case PRIORTY_TERM: visitor.onPriorityTerm(this); break;
|
case PRIORTY_TERM: visitor.onPriorityTerm(this); break;
|
||||||
case ADVICE_TERM: visitor.onAdviceTerm(this); break;
|
case ADVICE_TERM: visitor.onAdviceTerm(this); break;
|
||||||
case NEAR_TERM: visitor.onNearTerm(this); break;
|
|
||||||
case LITERAL_TERM: visitor.onLiteralTerm(this); break;
|
case LITERAL_TERM: visitor.onLiteralTerm(this); break;
|
||||||
|
|
||||||
case YEAR_TERM: visitor.onYearTerm(this); break;
|
case YEAR_TERM: visitor.onYearTerm(this); break;
|
@ -6,8 +6,6 @@ public interface TokenVisitor {
|
|||||||
void onExcludeTerm(Token token);
|
void onExcludeTerm(Token token);
|
||||||
void onPriorityTerm(Token token);
|
void onPriorityTerm(Token token);
|
||||||
void onAdviceTerm(Token token);
|
void onAdviceTerm(Token token);
|
||||||
void onNearTerm(Token token);
|
|
||||||
|
|
||||||
void onYearTerm(Token token);
|
void onYearTerm(Token token);
|
||||||
void onSizeTerm(Token token);
|
void onSizeTerm(Token token);
|
||||||
void onRankTerm(Token token);
|
void onRankTerm(Token token);
|
7
code/features-qs/readme.md
Normal file
7
code/features-qs/readme.md
Normal file
@ -0,0 +1,7 @@
|
|||||||
|
**Note**: This package is called `features-qs` rather than `features-query` because the latter,
|
||||||
|
though more consistent with other packages like features-index, would be very confusing
|
||||||
|
as there are other packages elsewhere with the 'query' name (e.g. features-index/index-query).
|
||||||
|
|
||||||
|
## Contents
|
||||||
|
|
||||||
|
* [query-parser](query-parser/) contains code for parsing the user-facing query grammar.
|
@ -3,8 +3,6 @@
|
|||||||
These are bits of search-engine related code that are relatively isolated pieces of business logic,
|
These are bits of search-engine related code that are relatively isolated pieces of business logic,
|
||||||
that benefit from the clarity of being kept separate from the rest of the
|
that benefit from the clarity of being kept separate from the rest of the
|
||||||
search engine code.
|
search engine code.
|
||||||
|
*
|
||||||
* [query-parser](query-parser/) contains code for parsing the user-facing query grammar.
|
|
||||||
* [result-ranking](result-ranking/) contains logic for ranking search results by relevance.
|
|
||||||
* [screenshots](screenshots/) and [random-websites](random-websites/) contains SQL queries random
|
* [screenshots](screenshots/) and [random-websites](random-websites/) contain SQL queries for the random
|
||||||
exploration mode.
|
exploration mode.
|
||||||
|
@ -13,4 +13,4 @@ its words, how they stem, POS tags, and so on.
|
|||||||
[features-convert/keyword-extraction](../../features-convert/keyword-extraction) uses this code to identify which keywords
|
[features-convert/keyword-extraction](../../features-convert/keyword-extraction) uses this code to identify which keywords
|
||||||
are important.
|
are important.
|
||||||
|
|
||||||
[features-search/query-parser](../../features-search/query-parser) also does some language processing.
|
[features-qs/query-parser](../../features-qs/query-parser) also does some language processing.
|
@ -7,7 +7,6 @@ Contains models shared by the [crawling-process](../../processes/crawling-proces
|
|||||||
|
|
||||||
* [CrawledDocument](src/main/java/nu/marginalia/crawling/model/CrawledDocument.java)
|
* [CrawledDocument](src/main/java/nu/marginalia/crawling/model/CrawledDocument.java)
|
||||||
* [CrawledDomain](src/main/java/nu/marginalia/crawling/model/CrawledDomain.java)
|
* [CrawledDomain](src/main/java/nu/marginalia/crawling/model/CrawledDomain.java)
|
||||||
* [CrawlingSpecification](src/main/java/nu/marginalia/crawling/model/spec/CrawlingSpecification.java)
|
|
||||||
|
|
||||||
### Serialization
|
### Serialization
|
||||||
* [CrawledDomainReader](src/main/java/nu/marginalia/crawling/io/CrawledDomainReader.java)
|
* [CrawledDomainReader](src/main/java/nu/marginalia/crawling/io/CrawledDomainReader.java)
|
||||||
|
@ -8,7 +8,7 @@ into per-domain snapshots.
|
|||||||
* [CrawlerMain](src/main/java/nu/marginalia/crawl/CrawlerMain.java) orchestrates the crawling.
|
* [CrawlerMain](src/main/java/nu/marginalia/crawl/CrawlerMain.java) orchestrates the crawling.
|
||||||
* [CrawlerRetreiver](src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java)
|
* [CrawlerRetreiver](src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java)
|
||||||
visits known addresses from a domain and downloads each document.
|
visits known addresses from a domain and downloads each document.
|
||||||
* [HttpFetcher](src/main/java/nu/marginalia/crawl/retreival/HttpFetcher.java)
|
* [HttpFetcher](src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java)
|
||||||
fetches a URL.
|
fetches a URL.
|
||||||
|
|
||||||
## See Also
|
## See Also
|
||||||
|
@ -32,7 +32,7 @@ dependencies {
|
|||||||
implementation project(':code:libraries:message-queue')
|
implementation project(':code:libraries:message-queue')
|
||||||
implementation project(':code:libraries:language-processing')
|
implementation project(':code:libraries:language-processing')
|
||||||
implementation project(':third-party:commons-codec')
|
implementation project(':third-party:commons-codec')
|
||||||
testImplementation project(':code:services-core:search-service')
|
testImplementation project(':code:services-application:search-service')
|
||||||
|
|
||||||
implementation project(':code:process-models:crawling-model')
|
implementation project(':code:process-models:crawling-model')
|
||||||
implementation project(':code:process-models:processed-data')
|
implementation project(':code:process-models:processed-data')
|
||||||
|
@ -6,5 +6,4 @@ the index-service.
|
|||||||
|
|
||||||
## Central Classes
|
## Central Classes
|
||||||
|
|
||||||
* [LoaderMain](src/main/java/nu/marginalia/loading/LoaderMain.java) main class.
|
* [LoaderMain](src/main/java/nu/marginalia/loading/LoaderMain.java) main class.
|
||||||
* [Loader](src/main/java/nu/marginalia/loading/loader/Loader.java) evaluates loading instructions.
|
|
@ -5,8 +5,7 @@
|
|||||||
The [crawling-process](crawling-process/) fetches website contents and saves them
|
The [crawling-process](crawling-process/) fetches website contents and saves them
|
||||||
as compressed JSON models described in [crawling-model](../process-models/crawling-model/).
|
as compressed JSON models described in [crawling-model](../process-models/crawling-model/).
|
||||||
|
|
||||||
The operation is specified by a crawl job specification. This is generated by [tools/crawl-job-extractor](../tools/crawl-job-extractor/)
|
The operation is specified by a [crawl specification](../process-models/crawl-spec), which can be created in the control GUI.
|
||||||
based on the content in the database.
|
|
||||||
|
|
||||||
## 2. Converting Process
|
## 2. Converting Process
|
||||||
|
|
||||||
|
@ -14,14 +14,15 @@ A map of the most important components and how they relate can be found below.
|
|||||||
|
|
||||||
### Services
|
### Services
|
||||||
* [core services](services-core/) "macroservices", stateful, memory hungry doing heavy lifting.
|
* [core services](services-core/) "macroservices", stateful, memory hungry doing heavy lifting.
|
||||||
* * [control-service](services-core/control-service)
|
* * [control](services-core/control-service)
|
||||||
* * [search](services-core/search-service)
|
* * [query](services-core/query-service)
|
||||||
* * [index](services-core/index-service)
|
* * [index](services-core/index-service)
|
||||||
* * [assistant](services-core/assistant-service)
|
* * [assistant](services-core/assistant-service)
|
||||||
* [satellite services](services-satellite/) "microservices", stateless providing additional functionality.
|
* [application services](services-application/) "microservices", stateless providing additional functionality and making an application out of the search engine.
|
||||||
* * [api](services-satellite/api-service) - public API
|
* * [api](services-application/api-service) - public API
|
||||||
* * [dating](services-satellite/dating-service) - [https://explore.marginalia.nu/](https://explore.marginalia.nu/)
|
* * [search](services-application/search-service) - marginalia search application
|
||||||
* * [explorer](services-satellite/explorer-service) - [https://explore2.marginalia.nu/](https://explore2.marginalia.nu/)
|
* * [dating](services-application/dating-service) - [https://explore.marginalia.nu/](https://explore.marginalia.nu/)
|
||||||
|
* * [explorer](services-application/explorer-service) - [https://explore2.marginalia.nu/](https://explore2.marginalia.nu/)
|
||||||
* an [internal API](api/)
|
* an [internal API](api/)
|
||||||
|
|
||||||
### Processes
|
### Processes
|
||||||
|
@ -28,7 +28,9 @@ dependencies {
|
|||||||
implementation project(':code:common:config')
|
implementation project(':code:common:config')
|
||||||
implementation project(':code:common:service-discovery')
|
implementation project(':code:common:service-discovery')
|
||||||
implementation project(':code:common:service-client')
|
implementation project(':code:common:service-client')
|
||||||
implementation project(':code:api:search-api')
|
implementation project(':code:api:query-api')
|
||||||
|
implementation project(':code:api:index-api')
|
||||||
|
implementation project(':code:features-index:index-query')
|
||||||
|
|
||||||
implementation libs.bundles.slf4j
|
implementation libs.bundles.slf4j
|
||||||
|
|
@ -0,0 +1,111 @@
|
|||||||
|
package nu.marginalia.api;
|
||||||
|
|
||||||
|
import com.google.inject.Inject;
|
||||||
|
import com.google.inject.Singleton;
|
||||||
|
import nu.marginalia.api.model.ApiSearchResult;
|
||||||
|
import nu.marginalia.api.model.ApiSearchResultQueryDetails;
|
||||||
|
import nu.marginalia.api.model.ApiSearchResults;
|
||||||
|
import nu.marginalia.client.Context;
|
||||||
|
import nu.marginalia.index.client.model.query.SearchSetIdentifier;
|
||||||
|
import nu.marginalia.index.client.model.results.DecoratedSearchResultItem;
|
||||||
|
import nu.marginalia.index.client.model.results.SearchResultKeywordScore;
|
||||||
|
import nu.marginalia.index.query.limit.QueryLimits;
|
||||||
|
import nu.marginalia.index.query.limit.SpecificationLimit;
|
||||||
|
import nu.marginalia.index.searchset.SearchSet;
|
||||||
|
import nu.marginalia.model.idx.WordMetadata;
|
||||||
|
import nu.marginalia.query.client.QueryClient;
|
||||||
|
import nu.marginalia.query.model.QueryParams;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Comparator;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Set;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
|
/** Executes searches on behalf of the public API: forwards the query to the
 * query service through {@link QueryClient} and converts the returned
 * {@link DecoratedSearchResultItem}s into {@link ApiSearchResult}s.
 */
@Singleton
|
||||||
|
public class ApiSearchOperator {
|
||||||
|
private final QueryClient queryClient;
|
||||||
|
|
||||||
|
@Inject
|
||||||
|
public ApiSearchOperator(QueryClient queryClient) {
|
||||||
|
this.queryClient = queryClient;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Runs a search and returns the converted results.
 * <p>
 * The results are sorted by {@code ApiSearchResult::getQuality} in reversed
 * (descending) order and truncated to {@code count} entries.
 *
 * @param context request context handed on to the query client
 * @param query   the query string; also echoed back in the returned object
 * @param count   maximum number of results to return
 * @param index   numeric selector translated by {@link #selectSearchSet(int)}
 * @return the results; the first constructor argument is the literal "RESTRICTED"
 *         (presumably a license placeholder that the caller replaces via
 *         withLicense -- TODO confirm against ApiSearchResults)
 */
public ApiSearchResults query(Context context,
|
||||||
|
String query,
|
||||||
|
int count,
|
||||||
|
int index)
|
||||||
|
{
|
||||||
|
var rsp = queryClient.search(context, createParams(query, count, index));
|
||||||
|
|
||||||
|
return new ApiSearchResults("RESTRICTED", query,
|
||||||
|
rsp.results()
|
||||||
|
.stream()
|
||||||
|
.map(this::convert)
|
||||||
|
.sorted(Comparator.comparing(ApiSearchResult::getQuality).reversed())
|
||||||
|
.limit(count)
|
||||||
|
.collect(Collectors.toList()));
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Builds the parameters sent to the query service.
 * <p>
 * The second QueryLimits argument is {@code count} capped at 100; the other
 * three are the constants 2, 150 and 8192.
 * NOTE(review): the meaning of the individual QueryLimits arguments is not
 * visible from this class -- confirm against QueryLimits before changing them.
 */
private QueryParams createParams(String query, int count, int index) {
|
||||||
|
SearchSetIdentifier searchSet = selectSearchSet(index);
|
||||||
|
|
||||||
|
return new QueryParams(
|
||||||
|
query,
|
||||||
|
new QueryLimits(
|
||||||
|
2,
|
||||||
|
Math.min(100, count),
|
||||||
|
150,
|
||||||
|
8192),
|
||||||
|
searchSet);
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Translates the numeric index parameter into a search set:
 * 1 selects SMALLWEB, 2 selects RETRO, every other value selects NONE.
 * The explicit cases 0, 3 and 5 yield the same value as the default branch.
 */
private SearchSetIdentifier selectSearchSet(int index) {
|
||||||
|
return switch (index) {
|
||||||
|
case 0 -> SearchSetIdentifier.NONE;
|
||||||
|
case 1 -> SearchSetIdentifier.SMALLWEB;
|
||||||
|
case 2 -> SearchSetIdentifier.RETRO;
|
||||||
|
case 3 -> SearchSetIdentifier.NONE;
|
||||||
|
case 5 -> SearchSetIdentifier.NONE;
|
||||||
|
default -> SearchSetIdentifier.NONE;
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/** Converts one decorated index result into the API result model.
 * <p>
 * When the raw index result is present, its keyword scores are grouped by
 * subquery and each group becomes one list of query details. A group is
 * dropped entirely as soon as one of its entries has empty word metadata.
 * When the raw index result is null the details list stays empty.
 * A non-finite ranking score is reported as -100.
 */
ApiSearchResult convert(DecoratedSearchResultItem url) {
|
||||||
|
List<List<ApiSearchResultQueryDetails>> details = new ArrayList<>();
|
||||||
|
if (url.rawIndexResult != null) {
|
||||||
|
var bySet = url.rawIndexResult.keywordScores.stream().collect(Collectors.groupingBy(SearchResultKeywordScore::subquery));
|
||||||
|
|
||||||
|
outer:
|
||||||
|
for (var entries : bySet.values()) {
|
||||||
|
List<ApiSearchResultQueryDetails> lst = new ArrayList<>();
|
||||||
|
for (var entry : entries) {
|
||||||
|
var metadata = new WordMetadata(entry.encodedWordMetadata());
|
||||||
|
if (metadata.isEmpty())
|
||||||
|
// skips the rest of this group AND the details.add(lst) below
continue outer;
|
||||||
|
|
||||||
|
Set<String> flags = metadata.flagSet().stream().map(Object::toString).collect(Collectors.toSet());
|
||||||
|
// second argument is the number of set bits in metadata.positions()
lst.add(new ApiSearchResultQueryDetails(entry.keyword, Long.bitCount(metadata.positions()), flags));
|
||||||
|
}
|
||||||
|
details.add(lst);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return new ApiSearchResult(
|
||||||
|
url.url.toString(),
|
||||||
|
url.getTitle(),
|
||||||
|
url.getDescription(),
|
||||||
|
sanitizeNaN(url.rankingScore, -100),
|
||||||
|
details
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Returns {@code value} unless it is not finite, in which case
 * {@code alternative} is returned. Despite the name this covers positive
 * and negative infinity as well as NaN, since the check is Double.isFinite.
 */
private double sanitizeNaN(double value, double alternative) {
|
||||||
|
if (!Double.isFinite(value)) {
|
||||||
|
return alternative;
|
||||||
|
}
|
||||||
|
return value;
|
||||||
|
}
|
||||||
|
}
|
@ -3,13 +3,13 @@ package nu.marginalia.api;
|
|||||||
import com.google.gson.Gson;
|
import com.google.gson.Gson;
|
||||||
import com.google.inject.Inject;
|
import com.google.inject.Inject;
|
||||||
import nu.marginalia.api.model.ApiLicense;
|
import nu.marginalia.api.model.ApiLicense;
|
||||||
|
import nu.marginalia.api.model.ApiSearchResults;
|
||||||
import nu.marginalia.api.svc.LicenseService;
|
import nu.marginalia.api.svc.LicenseService;
|
||||||
import nu.marginalia.api.svc.RateLimiterService;
|
import nu.marginalia.api.svc.RateLimiterService;
|
||||||
import nu.marginalia.api.svc.ResponseCache;
|
import nu.marginalia.api.svc.ResponseCache;
|
||||||
import nu.marginalia.client.Context;
|
import nu.marginalia.client.Context;
|
||||||
import nu.marginalia.model.gson.GsonFactory;
|
import nu.marginalia.model.gson.GsonFactory;
|
||||||
import nu.marginalia.search.client.SearchClient;
|
import nu.marginalia.query.client.QueryClient;
|
||||||
import nu.marginalia.search.client.model.ApiSearchResults;
|
|
||||||
import nu.marginalia.service.server.*;
|
import nu.marginalia.service.server.*;
|
||||||
import nu.marginalia.service.server.mq.MqNotification;
|
import nu.marginalia.service.server.mq.MqNotification;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
@ -24,29 +24,32 @@ public class ApiService extends Service {
|
|||||||
|
|
||||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||||
private final Gson gson = GsonFactory.get();
|
private final Gson gson = GsonFactory.get();
|
||||||
private final SearchClient searchClient;
|
private final QueryClient queryClient;
|
||||||
|
|
||||||
private final ResponseCache responseCache;
|
private final ResponseCache responseCache;
|
||||||
private final LicenseService licenseService;
|
private final LicenseService licenseService;
|
||||||
private final RateLimiterService rateLimiterService;
|
private final RateLimiterService rateLimiterService;
|
||||||
|
private final ApiSearchOperator searchOperator;
|
||||||
|
|
||||||
// Marker for filtering out sensitive content from the persistent logs
|
// Marker for filtering out sensitive content from the persistent logs
|
||||||
private final Marker queryMarker = MarkerFactory.getMarker("QUERY");
|
private final Marker queryMarker = MarkerFactory.getMarker("QUERY");
|
||||||
|
|
||||||
@Inject
|
@Inject
|
||||||
public ApiService(BaseServiceParams params,
|
public ApiService(BaseServiceParams params,
|
||||||
SearchClient searchClient,
|
QueryClient queryClient,
|
||||||
ResponseCache responseCache,
|
ResponseCache responseCache,
|
||||||
LicenseService licenseService,
|
LicenseService licenseService,
|
||||||
RateLimiterService rateLimiterService
|
RateLimiterService rateLimiterService,
|
||||||
|
ApiSearchOperator searchOperator
|
||||||
) {
|
) {
|
||||||
|
|
||||||
super(params);
|
super(params);
|
||||||
|
|
||||||
this.searchClient = searchClient;
|
this.queryClient = queryClient;
|
||||||
this.responseCache = responseCache;
|
this.responseCache = responseCache;
|
||||||
this.licenseService = licenseService;
|
this.licenseService = licenseService;
|
||||||
this.rateLimiterService = rateLimiterService;
|
this.rateLimiterService = rateLimiterService;
|
||||||
|
this.searchOperator = searchOperator;
|
||||||
|
|
||||||
Spark.get("/public/api/", (rq, rsp) -> {
|
Spark.get("/public/api/", (rq, rsp) -> {
|
||||||
rsp.redirect("https://memex.marginalia.nu/projects/edge/api.gmi");
|
rsp.redirect("https://memex.marginalia.nu/projects/edge/api.gmi");
|
||||||
@ -76,6 +79,8 @@ public class ApiService extends Service {
|
|||||||
|
|
||||||
var license = licenseService.getLicense(request.params("key"));
|
var license = licenseService.getLicense(request.params("key"));
|
||||||
|
|
||||||
|
response.type("application/json");
|
||||||
|
|
||||||
var cachedResponse = responseCache.getResults(license, args[0], request.queryString());
|
var cachedResponse = responseCache.getResults(license, args[0], request.queryString());
|
||||||
if (cachedResponse.isPresent()) {
|
if (cachedResponse.isPresent()) {
|
||||||
return cachedResponse.get();
|
return cachedResponse.get();
|
||||||
@ -87,7 +92,6 @@ public class ApiService extends Service {
|
|||||||
// We set content type late because in the case of error, we don't want to tell the client
|
// We set content type late because in the case of error, we don't want to tell the client
|
||||||
// that the error message is JSON when it is plain text.
|
// that the error message is JSON when it is plain text.
|
||||||
|
|
||||||
response.type("application/json");
|
|
||||||
|
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
@ -102,8 +106,9 @@ public class ApiService extends Service {
|
|||||||
|
|
||||||
logger.info(queryMarker, "{} Search {}", license.key, query);
|
logger.info(queryMarker, "{} Search {}", license.key, query);
|
||||||
|
|
||||||
return searchClient.query(Context.fromRequest(request), query, count, index)
|
return searchOperator
|
||||||
.blockingFirst().withLicense(license.getLicense());
|
.query(Context.fromRequest(request), query, count, index)
|
||||||
|
.withLicense(license.getLicense());
|
||||||
}
|
}
|
||||||
|
|
||||||
private int intParam(Request request, String name, int defaultValue) {
|
private int intParam(Request request, String name, int defaultValue) {
|
@ -1,4 +1,4 @@
|
|||||||
package nu.marginalia.search.client.model;
|
package nu.marginalia.api.model;
|
||||||
|
|
||||||
import lombok.AllArgsConstructor;
|
import lombok.AllArgsConstructor;
|
||||||
import lombok.Getter;
|
import lombok.Getter;
|
@ -1,4 +1,4 @@
|
|||||||
package nu.marginalia.search.client.model;
|
package nu.marginalia.api.model;
|
||||||
|
|
||||||
import lombok.AllArgsConstructor;
|
import lombok.AllArgsConstructor;
|
||||||
import lombok.Getter;
|
import lombok.Getter;
|
@ -1,4 +1,4 @@
|
|||||||
package nu.marginalia.search.client.model;
|
package nu.marginalia.api.model;
|
||||||
|
|
||||||
import lombok.AllArgsConstructor;
|
import lombok.AllArgsConstructor;
|
||||||
import lombok.Getter;
|
import lombok.Getter;
|
@ -3,20 +3,12 @@ package nu.marginalia.api.svc;
|
|||||||
import com.google.common.cache.Cache;
|
import com.google.common.cache.Cache;
|
||||||
import com.google.common.cache.CacheBuilder;
|
import com.google.common.cache.CacheBuilder;
|
||||||
import com.google.inject.Singleton;
|
import com.google.inject.Singleton;
|
||||||
import nu.marginalia.api.model.ApiLicense;
|
import nu.marginalia.api.model.*;
|
||||||
import nu.marginalia.search.client.model.ApiSearchResults;
|
|
||||||
|
|
||||||
import java.time.Duration;
|
import java.time.Duration;
|
||||||
import java.util.Optional;
|
import java.util.Optional;
|
||||||
|
|
||||||
/** This response cache exists entirely to help SearXNG with its rate limiting.
|
/** This response cache exists entirely to help clients with their rate limiting.
|
||||||
* For some reason they're hitting the API with like 5-12 identical requests.
|
|
||||||
* <p/>
|
|
||||||
* I've submitted an issue, they were like nah mang it works fine must
|
|
||||||
* be something else ¯\_(ツ)_/¯.
|
|
||||||
* <p/>
|
|
||||||
* So we're going to cache the API responses for a short while to mitigate the
|
|
||||||
* impact of such shotgun queries on the ratelimit.
|
|
||||||
*/
|
*/
|
||||||
@Singleton
|
@Singleton
|
||||||
public class ResponseCache {
|
public class ResponseCache {
|
@ -1,7 +1,6 @@
|
|||||||
package nu.marginalia.api.svc;
|
package nu.marginalia.api.svc;
|
||||||
|
|
||||||
import nu.marginalia.api.model.ApiLicense;
|
import nu.marginalia.api.model.*;
|
||||||
import nu.marginalia.search.client.model.ApiSearchResults;
|
|
||||||
import org.junit.jupiter.api.AfterEach;
|
import org.junit.jupiter.api.AfterEach;
|
||||||
import org.junit.jupiter.api.BeforeEach;
|
import org.junit.jupiter.api.BeforeEach;
|
||||||
import org.junit.jupiter.api.Test;
|
import org.junit.jupiter.api.Test;
|
9
code/services-application/readme.md
Normal file
9
code/services-application/readme.md
Normal file
@ -0,0 +1,9 @@
|
|||||||
|
# Application Services
|
||||||
|
|
||||||
|
The application services offer user interfaces/applications around
|
||||||
|
interacting with the [core services](../services-core).
|
||||||
|
|
||||||
|
* The [api-service](api-service/) offers a public API
|
||||||
|
* The [dating-service](dating-service/) is [explore.marginalia.nu](https://explore.marginalia.nu/)
|
||||||
|
* The [explorer-service](explorer-service/) is [explore2.marginalia.nu](https://explore2.marginalia.nu/)
|
||||||
|
* The [search-service](search-service/) is the main application for [search.marginalia.nu](https://search.marginalia.nu/)
|
@ -25,7 +25,6 @@ dependencies {
|
|||||||
implementation project(':code:common:model')
|
implementation project(':code:common:model')
|
||||||
implementation project(':code:common:service')
|
implementation project(':code:common:service')
|
||||||
implementation project(':code:common:config')
|
implementation project(':code:common:config')
|
||||||
implementation project(':code:common:linkdb')
|
|
||||||
implementation project(':code:features-index:index-query')
|
implementation project(':code:features-index:index-query')
|
||||||
|
|
||||||
implementation project(':code:libraries:easy-lsh')
|
implementation project(':code:libraries:easy-lsh')
|
||||||
@ -34,16 +33,16 @@ dependencies {
|
|||||||
implementation project(':code:libraries:term-frequency-dict')
|
implementation project(':code:libraries:term-frequency-dict')
|
||||||
|
|
||||||
implementation project(':code:api:assistant-api')
|
implementation project(':code:api:assistant-api')
|
||||||
|
implementation project(':code:api:query-api')
|
||||||
implementation project(':code:api:index-api')
|
implementation project(':code:api:index-api')
|
||||||
implementation project(':code:api:search-api')
|
|
||||||
implementation project(':code:common:service-discovery')
|
implementation project(':code:common:service-discovery')
|
||||||
implementation project(':code:common:service-client')
|
implementation project(':code:common:service-client')
|
||||||
implementation project(':code:common:renderer')
|
implementation project(':code:common:renderer')
|
||||||
|
|
||||||
implementation project(':code:features-search:screenshots')
|
implementation project(':code:features-search:screenshots')
|
||||||
implementation project(':code:features-search:random-websites')
|
implementation project(':code:features-search:random-websites')
|
||||||
implementation project(':code:features-search:query-parser')
|
implementation project(':code:features-qs:query-parser')
|
||||||
implementation project(':code:features-search:result-ranking')
|
implementation project(':code:features-index:result-ranking')
|
||||||
|
|
||||||
implementation libs.bundles.slf4j
|
implementation libs.bundles.slf4j
|
||||||
|
|
@ -20,8 +20,6 @@ appropriate services.
|
|||||||
* [CommandEvaluator](src/main/java/nu/marginalia/search/command/CommandEvaluator.java) interprets a user query and acts
|
* [CommandEvaluator](src/main/java/nu/marginalia/search/command/CommandEvaluator.java) interprets a user query and acts
|
||||||
upon it, dealing with special operations like `browse:` or `site:`.
|
upon it, dealing with special operations like `browse:` or `site:`.
|
||||||
|
|
||||||
* [QueryFactory](src/main/java/nu/marginalia/search/query/QueryFactory.java) parses a search query into a machine readable query specification.
|
|
||||||
|
|
||||||
* [SearchQueryIndexService](src/main/java/nu/marginalia/search/svc/SearchQueryIndexService.java) passes a parsed search query to the index service, and
|
* [SearchQueryIndexService](src/main/java/nu/marginalia/search/svc/SearchQueryIndexService.java) passes a parsed search query to the index service, and
|
||||||
then decorates the search results so that they can be rendered.
|
then decorates the search results so that they can be rendered.
|
||||||
|
|
@ -0,0 +1,15 @@
|
|||||||
|
package nu.marginalia.search;
|
||||||
|
|
||||||
|
import com.google.inject.AbstractModule;
|
||||||
|
import nu.marginalia.LanguageModels;
|
||||||
|
import nu.marginalia.WebsiteUrl;
|
||||||
|
import nu.marginalia.WmsaHome;
|
||||||
|
|
||||||
|
/** Guice module for the search service.
 * <p>
 * Binds {@link LanguageModels} to the instance returned by
 * {@code WmsaHome.getLanguageModels()} and {@link WebsiteUrl} to the value of
 * the {@code website-url} system property, falling back to
 * {@code https://search.marginalia.nu/} when the property is not set.
 */
public class SearchModule extends AbstractModule {
|
||||||
|
|
||||||
|
public void configure() {
|
||||||
|
bind(LanguageModels.class).toInstance(WmsaHome.getLanguageModels());
|
||||||
|
bind(WebsiteUrl.class).toInstance(new WebsiteUrl(System.getProperty("website-url", "https://search.marginalia.nu/")));
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
@ -7,12 +7,12 @@ import io.reactivex.rxjava3.schedulers.Schedulers;
|
|||||||
import nu.marginalia.assistant.client.AssistantClient;
|
import nu.marginalia.assistant.client.AssistantClient;
|
||||||
import nu.marginalia.model.EdgeDomain;
|
import nu.marginalia.model.EdgeDomain;
|
||||||
import nu.marginalia.db.DbDomainQueries;
|
import nu.marginalia.db.DbDomainQueries;
|
||||||
|
import nu.marginalia.query.client.QueryClient;
|
||||||
|
import nu.marginalia.query.model.QueryResponse;
|
||||||
import nu.marginalia.search.model.UrlDetails;
|
import nu.marginalia.search.model.UrlDetails;
|
||||||
import nu.marginalia.client.Context;
|
import nu.marginalia.client.Context;
|
||||||
import nu.marginalia.search.model.DecoratedSearchResults;
|
import nu.marginalia.search.model.DecoratedSearchResults;
|
||||||
import nu.marginalia.search.query.QueryFactory;
|
import nu.marginalia.search.model.UserSearchParameters;
|
||||||
import nu.marginalia.search.query.model.SearchQuery;
|
|
||||||
import nu.marginalia.search.query.model.UserSearchParameters;
|
|
||||||
import nu.marginalia.search.svc.SearchQueryIndexService;
|
import nu.marginalia.search.svc.SearchQueryIndexService;
|
||||||
import nu.marginalia.search.svc.SearchUnitConversionService;
|
import nu.marginalia.search.svc.SearchUnitConversionService;
|
||||||
import org.apache.logging.log4j.util.Strings;
|
import org.apache.logging.log4j.util.Strings;
|
||||||
@ -37,58 +37,59 @@ public class SearchOperator {
|
|||||||
|
|
||||||
private final AssistantClient assistantClient;
|
private final AssistantClient assistantClient;
|
||||||
private final DbDomainQueries domainQueries;
|
private final DbDomainQueries domainQueries;
|
||||||
private final QueryFactory queryFactory;
|
private final QueryClient queryClient;
|
||||||
|
|
||||||
private final SearchQueryIndexService searchQueryService;
|
private final SearchQueryIndexService searchQueryService;
|
||||||
|
private final SearchQueryParamFactory paramFactory;
|
||||||
private final SearchUnitConversionService searchUnitConversionService;
|
private final SearchUnitConversionService searchUnitConversionService;
|
||||||
|
|
||||||
|
|
||||||
@Inject
|
@Inject
|
||||||
public SearchOperator(AssistantClient assistantClient,
|
public SearchOperator(AssistantClient assistantClient,
|
||||||
DbDomainQueries domainQueries,
|
DbDomainQueries domainQueries,
|
||||||
QueryFactory queryFactory,
|
QueryClient queryClient,
|
||||||
SearchQueryIndexService searchQueryService,
|
SearchQueryIndexService searchQueryService,
|
||||||
SearchUnitConversionService searchUnitConversionService) {
|
SearchQueryParamFactory paramFactory,
|
||||||
|
SearchUnitConversionService searchUnitConversionService)
|
||||||
|
{
|
||||||
|
|
||||||
this.assistantClient = assistantClient;
|
this.assistantClient = assistantClient;
|
||||||
this.domainQueries = domainQueries;
|
this.domainQueries = domainQueries;
|
||||||
this.queryFactory = queryFactory;
|
this.queryClient = queryClient;
|
||||||
|
|
||||||
this.searchQueryService = searchQueryService;
|
this.searchQueryService = searchQueryService;
|
||||||
|
this.paramFactory = paramFactory;
|
||||||
this.searchUnitConversionService = searchUnitConversionService;
|
this.searchUnitConversionService = searchUnitConversionService;
|
||||||
}
|
}
|
||||||
|
|
||||||
public List<UrlDetails> doApiSearch(Context ctx,
|
public List<UrlDetails> doSiteSearch(Context ctx,
|
||||||
UserSearchParameters params) {
|
String domain) {
|
||||||
|
|
||||||
|
var queryParams = paramFactory.forSiteSearch(domain);
|
||||||
|
var queryResponse = queryClient.search(ctx, queryParams);
|
||||||
|
|
||||||
SearchQuery processedQuery = queryFactory.createQuery(params);
|
return searchQueryService.getResultsFromQuery(queryResponse);
|
||||||
|
|
||||||
logger.info(queryMarker, "Human terms (API): {}", Strings.join(processedQuery.searchTermsHuman, ','));
|
|
||||||
|
|
||||||
return searchQueryService.executeQuery(ctx, processedQuery);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public DecoratedSearchResults doSearch(Context ctx, UserSearchParameters params) {
|
public DecoratedSearchResults doSearch(Context ctx, UserSearchParameters userParams) {
|
||||||
|
|
||||||
Future<String> eval = searchUnitConversionService.tryEval(ctx, params.humanQuery());
|
Future<String> eval = searchUnitConversionService.tryEval(ctx, userParams.humanQuery());
|
||||||
SearchQuery processedQuery = queryFactory.createQuery(params);
|
var queryParams = paramFactory.forRegularSearch(userParams);
|
||||||
|
var queryResponse = queryClient.search(ctx, queryParams);
|
||||||
|
|
||||||
logger.info(queryMarker, "Human terms: {}", Strings.join(processedQuery.searchTermsHuman, ','));
|
List<UrlDetails> queryResults = searchQueryService.getResultsFromQuery(queryResponse);
|
||||||
|
|
||||||
List<UrlDetails> queryResults = searchQueryService.executeQuery(ctx, processedQuery);
|
|
||||||
|
|
||||||
|
logger.info(queryMarker, "Human terms: {}", Strings.join(queryResponse.searchTermsHuman(), ','));
|
||||||
logger.info(queryMarker, "Search Result Count: {}", queryResults.size());
|
logger.info(queryMarker, "Search Result Count: {}", queryResults.size());
|
||||||
|
|
||||||
String evalResult = getFutureOrDefault(eval, "");
|
String evalResult = getFutureOrDefault(eval, "");
|
||||||
|
|
||||||
return DecoratedSearchResults.builder()
|
return DecoratedSearchResults.builder()
|
||||||
.params(params)
|
.params(userParams)
|
||||||
.problems(getProblems(ctx, evalResult, queryResults, processedQuery))
|
.problems(getProblems(ctx, evalResult, queryResults, queryResponse))
|
||||||
.evalResult(evalResult)
|
.evalResult(evalResult)
|
||||||
.results(queryResults)
|
.results(queryResults)
|
||||||
.focusDomain(processedQuery.domain)
|
.focusDomain(queryResponse.domain())
|
||||||
.focusDomainId(getDomainId(processedQuery.domain))
|
.focusDomainId(getDomainId(queryResponse.domain()))
|
||||||
.build();
|
.build();
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -113,20 +114,20 @@ public class SearchOperator {
|
|||||||
return domainQueries.tryGetDomainId(new EdgeDomain(domain)).orElse(-1);
|
return domainQueries.tryGetDomainId(new EdgeDomain(domain)).orElse(-1);
|
||||||
}
|
}
|
||||||
|
|
||||||
private List<String> getProblems(Context ctx, String evalResult, List<UrlDetails> queryResults, SearchQuery processedQuery) {
|
private List<String> getProblems(Context ctx, String evalResult, List<UrlDetails> queryResults, QueryResponse response) {
|
||||||
final List<String> problems = new ArrayList<>(processedQuery.problems);
|
final List<String> problems = new ArrayList<>(response.problems());
|
||||||
boolean siteSearch = processedQuery.domain != null;
|
boolean siteSearch = response.domain() != null;
|
||||||
|
|
||||||
if (!siteSearch) {
|
if (!siteSearch) {
|
||||||
if (queryResults.size() <= 5 && null == evalResult) {
|
if (queryResults.size() <= 5 && null == evalResult) {
|
||||||
spellCheckTerms(ctx, processedQuery).forEach(problems::add);
|
spellCheckTerms(ctx, response).forEach(problems::add);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (queryResults.size() <= 5) {
|
if (queryResults.size() <= 5) {
|
||||||
problems.add("Try rephrasing the query, changing the word order or using synonyms to get different results. <a href=\"https://memex.marginalia.nu/projects/edge/search-tips.gmi\">Tips</a>.");
|
problems.add("Try rephrasing the query, changing the word order or using synonyms to get different results. <a href=\"https://memex.marginalia.nu/projects/edge/search-tips.gmi\">Tips</a>.");
|
||||||
}
|
}
|
||||||
|
|
||||||
Set<String> representativeKeywords = processedQuery.getAllKeywords();
|
Set<String> representativeKeywords = response.getAllKeywords();
|
||||||
if (representativeKeywords.size()>1 && (representativeKeywords.contains("definition") || representativeKeywords.contains("define") || representativeKeywords.contains("meaning")))
|
if (representativeKeywords.size()>1 && (representativeKeywords.contains("definition") || representativeKeywords.contains("define") || representativeKeywords.contains("meaning")))
|
||||||
{
|
{
|
||||||
problems.add("Tip: Try using a query that looks like <tt>define:word</tt> if you want a dictionary definition");
|
problems.add("Tip: Try using a query that looks like <tt>define:word</tt> if you want a dictionary definition");
|
||||||
@ -137,8 +138,8 @@ public class SearchOperator {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private Iterable<String> spellCheckTerms(Context ctx, SearchQuery disjointedQuery) {
|
private Iterable<String> spellCheckTerms(Context ctx, QueryResponse response) {
|
||||||
return Observable.fromIterable(disjointedQuery.searchTermsHuman)
|
return Observable.fromIterable(response.searchTermsHuman())
|
||||||
.subscribeOn(Schedulers.io())
|
.subscribeOn(Schedulers.io())
|
||||||
.flatMap(term -> assistantClient.spellCheck(ctx, term)
|
.flatMap(term -> assistantClient.spellCheck(ctx, term)
|
||||||
.onErrorReturn(e -> Collections.emptyList())
|
.onErrorReturn(e -> Collections.emptyList())
|
@ -0,0 +1,53 @@
|
|||||||
|
package nu.marginalia.search;
|
||||||
|
|
||||||
|
import nu.marginalia.index.client.model.query.SearchSetIdentifier;
|
||||||
|
import nu.marginalia.index.client.model.query.SearchSubquery;
|
||||||
|
import nu.marginalia.index.query.limit.QueryLimits;
|
||||||
|
import nu.marginalia.index.query.limit.SpecificationLimit;
|
||||||
|
import nu.marginalia.query.model.QueryParams;
|
||||||
|
import nu.marginalia.search.model.UserSearchParameters;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
/** Creates the {@link QueryParams} that the search service sends to the
 * query service, for regular searches and for site searches.
 * <p>
 * NOTE(review): QueryParams is constructed positionally here; the names of
 * its parameters are not visible from this class, so verify the argument
 * order against QueryParams before editing either method.
 */
public class SearchQueryParamFactory {
|
||||||
|
|
||||||
|
/** Builds the parameters for a regular user search.
 * <p>
 * An empty {@link SearchSubquery} is used as a scratch object: the user's
 * profile adds its tacit terms to it, and the resulting include, exclude,
 * priority and advice term lists are passed on. The quality, year and size
 * limits and the search set identifier also come from the profile.
 */
public QueryParams forRegularSearch(UserSearchParameters userParams) {
|
||||||
|
SearchSubquery prototype = new SearchSubquery();
|
||||||
|
var profile = userParams.profile();
|
||||||
|
profile.addTacitTerms(prototype);
|
||||||
|
|
||||||
|
return new QueryParams(
|
||||||
|
userParams.humanQuery(),
|
||||||
|
null,
|
||||||
|
prototype.searchTermsInclude,
|
||||||
|
prototype.searchTermsExclude,
|
||||||
|
prototype.searchTermsPriority,
|
||||||
|
prototype.searchTermsAdvice,
|
||||||
|
profile.getQualityLimit(),
|
||||||
|
profile.getYearLimit(),
|
||||||
|
profile.getSizeLimit(),
|
||||||
|
SpecificationLimit.none(),
|
||||||
|
List.of(),
|
||||||
|
new QueryLimits(2, 100, 200, 8192),
|
||||||
|
profile.searchSetIdentifier
|
||||||
|
);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Builds the parameters for listing results from a single domain.
 * <p>
 * The query string is {@code "site:"} followed by the domain. No extra
 * terms are supplied, every SpecificationLimit is {@code none()}, and the
 * search set is NONE. The QueryLimits differ from the regular search
 * (100, 100, 100, 512 instead of 2, 100, 200, 8192).
 */
public QueryParams forSiteSearch(String domain) {
|
||||||
|
return new QueryParams("site:"+domain,
|
||||||
|
null,
|
||||||
|
List.of(),
|
||||||
|
List.of(),
|
||||||
|
List.of(),
|
||||||
|
List.of(),
|
||||||
|
SpecificationLimit.none(),
|
||||||
|
SpecificationLimit.none(),
|
||||||
|
SpecificationLimit.none(),
|
||||||
|
SpecificationLimit.none(),
|
||||||
|
List.of(),
|
||||||
|
new QueryLimits(100, 100, 100, 512),
|
||||||
|
SearchSetIdentifier.NONE
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
@ -6,15 +6,11 @@ import lombok.SneakyThrows;
|
|||||||
import nu.marginalia.WebsiteUrl;
|
import nu.marginalia.WebsiteUrl;
|
||||||
import nu.marginalia.client.Context;
|
import nu.marginalia.client.Context;
|
||||||
import nu.marginalia.db.storage.FileStorageService;
|
import nu.marginalia.db.storage.FileStorageService;
|
||||||
import nu.marginalia.db.storage.model.FileStorageType;
|
|
||||||
import nu.marginalia.linkdb.LinkdbReader;
|
|
||||||
import nu.marginalia.model.gson.GsonFactory;
|
import nu.marginalia.model.gson.GsonFactory;
|
||||||
import nu.marginalia.search.client.SearchMqEndpoints;
|
|
||||||
import nu.marginalia.search.svc.SearchFrontPageService;
|
import nu.marginalia.search.svc.SearchFrontPageService;
|
||||||
import nu.marginalia.search.svc.*;
|
import nu.marginalia.search.svc.*;
|
||||||
import nu.marginalia.service.control.ServiceEventLog;
|
import nu.marginalia.service.control.ServiceEventLog;
|
||||||
import nu.marginalia.service.server.*;
|
import nu.marginalia.service.server.*;
|
||||||
import nu.marginalia.service.server.mq.MqNotification;
|
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
import spark.Request;
|
import spark.Request;
|
||||||
@ -23,18 +19,13 @@ import spark.Spark;
|
|||||||
|
|
||||||
import java.net.URLEncoder;
|
import java.net.URLEncoder;
|
||||||
import java.nio.charset.StandardCharsets;
|
import java.nio.charset.StandardCharsets;
|
||||||
import java.nio.file.Files;
|
|
||||||
import java.nio.file.Path;
|
|
||||||
|
|
||||||
public class SearchService extends Service {
|
public class SearchService extends Service {
|
||||||
|
|
||||||
private final WebsiteUrl websiteUrl;
|
private final WebsiteUrl websiteUrl;
|
||||||
private final StaticResources staticResources;
|
private final StaticResources staticResources;
|
||||||
private final FileStorageService fileStorageService;
|
|
||||||
private final LinkdbReader linkdbReader;
|
|
||||||
|
|
||||||
private static final Logger logger = LoggerFactory.getLogger(SearchService.class);
|
private static final Logger logger = LoggerFactory.getLogger(SearchService.class);
|
||||||
private final ServiceEventLog eventLog;
|
|
||||||
|
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
@Inject
|
@Inject
|
||||||
@ -45,18 +36,12 @@ public class SearchService extends Service {
|
|||||||
SearchErrorPageService errorPageService,
|
SearchErrorPageService errorPageService,
|
||||||
SearchAddToCrawlQueueService addToCrawlQueueService,
|
SearchAddToCrawlQueueService addToCrawlQueueService,
|
||||||
SearchFlagSiteService flagSiteService,
|
SearchFlagSiteService flagSiteService,
|
||||||
SearchQueryService searchQueryService,
|
SearchQueryService searchQueryService
|
||||||
SearchApiQueryService apiQueryService,
|
|
||||||
FileStorageService fileStorageService,
|
|
||||||
LinkdbReader linkdbReader
|
|
||||||
) {
|
) {
|
||||||
super(params);
|
super(params);
|
||||||
|
|
||||||
this.eventLog = params.eventLog;
|
|
||||||
this.websiteUrl = websiteUrl;
|
this.websiteUrl = websiteUrl;
|
||||||
this.staticResources = staticResources;
|
this.staticResources = staticResources;
|
||||||
this.fileStorageService = fileStorageService;
|
|
||||||
this.linkdbReader = linkdbReader;
|
|
||||||
|
|
||||||
Spark.staticFiles.expireTime(600);
|
Spark.staticFiles.expireTime(600);
|
||||||
|
|
||||||
@ -64,7 +49,6 @@ public class SearchService extends Service {
|
|||||||
|
|
||||||
Gson gson = GsonFactory.get();
|
Gson gson = GsonFactory.get();
|
||||||
|
|
||||||
Spark.get("/api/search", apiQueryService::apiSearch, gson::toJson);
|
|
||||||
Spark.get("/public/search", searchQueryService::pathSearch);
|
Spark.get("/public/search", searchQueryService::pathSearch);
|
||||||
Spark.get("/public/site-search/:site/*", this::siteSearchRedir);
|
Spark.get("/public/site-search/:site/*", this::siteSearchRedir);
|
||||||
Spark.get("/public/", frontPageService::render);
|
Spark.get("/public/", frontPageService::render);
|
||||||
@ -87,21 +71,6 @@ public class SearchService extends Service {
|
|||||||
Spark.awaitInitialization();
|
Spark.awaitInitialization();
|
||||||
}
|
}
|
||||||
|
|
||||||
@SneakyThrows
|
|
||||||
@MqNotification(endpoint = SearchMqEndpoints.SWITCH_LINKDB)
|
|
||||||
public void switchLinkdb(String unusedArg) {
|
|
||||||
logger.info("Switching link database");
|
|
||||||
|
|
||||||
Path newPath = fileStorageService.getStorageByType(FileStorageType.LINKDB_STAGING)
|
|
||||||
.asPath()
|
|
||||||
.resolve("links.db");
|
|
||||||
|
|
||||||
if (Files.exists(newPath)) {
|
|
||||||
eventLog.logEvent("SEARCH-SWITCH-LINKDB", "");
|
|
||||||
linkdbReader.switchInput(newPath);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private Object serveStatic(Request request, Response response) {
|
private Object serveStatic(Request request, Response response) {
|
||||||
String resource = request.params("resource");
|
String resource = request.params("resource");
|
||||||
staticResources.serveStatic("search", resource, request, response);
|
staticResources.serveStatic("search", resource, request, response);
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue
Block a user