Merge pull request #53 from MarginaliaSearch/standalone-index

Move ranking to the index-service, and query parsing to a new query-service; separate out the search-service
This commit is contained in:
Viktor 2023-10-09 15:42:06 +02:00 committed by GitHub
commit c8d820c17b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
192 changed files with 1242 additions and 818 deletions

View File

@ -5,4 +5,6 @@ public class IndexMqEndpoints {
public static final String INDEX_REPARTITION = "INDEX-REPARTITION";
public static final String SWITCH_INDEX = "SWITCH-INDEX";
public static final String SWITCH_LINKDB = "SWITCH_LINKDB";
}

View File

@ -0,0 +1,82 @@
package nu.marginalia.index.client.model.results;
import lombok.Getter;
import lombok.ToString;
import nu.marginalia.model.EdgeUrl;
import org.jetbrains.annotations.NotNull;
import javax.annotation.Nullable;
import java.util.List;
/**
 * A raw index result bundled with descriptive details about the document it
 * refers to (URL, title, description, format, ...).
 * <p>
 * Identifiers and keyword scores are not stored separately here; the
 * delegating accessors read them from {@link #rawIndexResult} on every call.
 */
@Getter
@ToString
public class DecoratedSearchResultItem {
    /** The index result that the delegating accessors below read from. */
    public final SearchResultItem rawIndexResult;

    @NotNull
    public final EdgeUrl url;
    @NotNull
    public final String title;
    @NotNull
    public final String description;
    public final double urlQuality;
    @NotNull
    public final String format;

    /** Document features bitmask, see HtmlFeature */
    public final int features;

    /** Publication year; may be null. */
    @Nullable
    public final Integer pubYear;
    public final long dataHash;
    public final int wordsTotal;
    public final double rankingScore;

    /** Assigns every field from the argument of the same name; no validation or copying is done. */
    public DecoratedSearchResultItem(SearchResultItem rawIndexResult,
                                     @NotNull EdgeUrl url,
                                     @NotNull String title,
                                     @NotNull String description,
                                     double urlQuality,
                                     @NotNull String format,
                                     int features,
                                     @Nullable Integer pubYear,
                                     long dataHash,
                                     int wordsTotal,
                                     double rankingScore)
    {
        this.rawIndexResult = rawIndexResult;

        this.url = url;
        this.title = title;
        this.description = description;
        this.urlQuality = urlQuality;
        this.format = format;
        this.features = features;
        this.pubYear = pubYear;
        this.dataHash = dataHash;
        this.wordsTotal = wordsTotal;
        this.rankingScore = rankingScore;
    }

    /** @return the document id reported by the raw index result */
    public long documentId() {
        return this.rawIndexResult.getDocumentId();
    }

    /** @return the domain id reported by the raw index result */
    public int domainId() {
        return this.rawIndexResult.getDomainId();
    }

    /** @return the results-from-domain count reported by the raw index result */
    public int resultsFromDomain() {
        return this.rawIndexResult.getResultsFromDomain();
    }

    /** @return the keyword scores held by the raw index result */
    public List<SearchResultKeywordScore> keywordScores() {
        return this.rawIndexResult.getKeywordScores();
    }

    /** @return the ranking value reported by the raw index result */
    public long rankingId() {
        return this.rawIndexResult.getRanking();
    }
}

View File

@ -8,8 +8,7 @@ import java.util.List;
@AllArgsConstructor @Getter @ToString
public class SearchResultSet {
public List<SearchResultItem> results;
public ResultRankingContext rankingContext;
public List<DecoratedSearchResultItem> results;
public int size() {
return results.size();
}

View File

@ -1,7 +1,5 @@
plugins {
id 'java'
id 'jvm-test-suite'
}
@ -13,8 +11,10 @@ java {
dependencies {
implementation project(':code:common:model')
implementation project(':code:api:index-api')
implementation project(':code:common:config')
implementation project(':code:libraries:message-queue')
implementation project(':code:features-index:index-query')
implementation project(':code:common:service-discovery')
implementation project(':code:common:service-client')

View File

@ -0,0 +1,65 @@
package nu.marginalia.query.client;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import io.prometheus.client.Summary;
import io.reactivex.rxjava3.core.Observable;
import nu.marginalia.WmsaHome;
import nu.marginalia.client.AbstractDynamicClient;
import nu.marginalia.client.Context;
import nu.marginalia.index.client.model.query.SearchSpecification;
import nu.marginalia.index.client.model.results.SearchResultSet;
import nu.marginalia.model.gson.GsonFactory;
import nu.marginalia.mq.MessageQueueFactory;
import nu.marginalia.mq.outbox.MqOutbox;
import nu.marginalia.query.model.QueryParams;
import nu.marginalia.query.model.QueryResponse;
import nu.marginalia.service.descriptor.ServiceDescriptors;
import nu.marginalia.service.id.ServiceId;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import javax.annotation.CheckReturnValue;
import java.util.UUID;
/**
 * Client for the query service. Offers blocking calls to its
 * {@code /delegate/} and {@code /search/} endpoints, plus a message queue
 * outbox addressed to the query service's inbox.
 */
@Singleton
public class QueryClient extends AbstractDynamicClient {

    private static final Summary wmsa_search_index_api_delegate_time = Summary.build()
            .name("wmsa_search_index_api_delegate_time")
            .help("-")
            .register();

    private static final Summary wmsa_search_index_api_search_time = Summary.build()
            .name("wmsa_search_index_api_search_time")
            .help("-")
            .register();

    private final Logger logger = LoggerFactory.getLogger(getClass());
    private final MqOutbox outbox;

    /**
     * Sets up the HTTP client for {@link ServiceId#Query} and creates the outbox.
     * The outbox name is taken from the {@code service-name} system property,
     * falling back to a random UUID string.
     */
    @Inject
    public QueryClient(ServiceDescriptors descriptors,
                       MessageQueueFactory messageQueueFactory)
    {
        super(descriptors.forId(ServiceId.Query), WmsaHome.getHostsFile(), GsonFactory::get);

        final String queryInbox = ServiceId.Query.name + ":" + "0";
        final String ownName = System.getProperty("service-name", UUID.randomUUID().toString());
        final UUID outboxUuid = UUID.randomUUID();

        this.outbox = messageQueueFactory.createOutbox(queryInbox, ownName, outboxUuid);
    }

    /**
     * Delegate an Index API style query directly to the index service.
     * Posts {@code specs} to {@code /delegate/} and blocks for the first
     * response; the call is timed by a Prometheus summary.
     */
    @CheckReturnValue
    public SearchResultSet delegate(Context ctx, SearchSpecification specs) {
        return wmsa_search_index_api_delegate_time.time(() -> {
            return this.postGet(ctx, "/delegate/", specs, SearchResultSet.class).blockingFirst();
        });
    }

    /**
     * Posts {@code params} to {@code /search/} and blocks for the first
     * response; the call is timed by a Prometheus summary.
     */
    @CheckReturnValue
    public QueryResponse search(Context ctx, QueryParams params) {
        return wmsa_search_index_api_search_time.time(() -> {
            return this.postGet(ctx, "/search/", params, QueryResponse.class).blockingFirst();
        });
    }

    /** @return the outbox created in the constructor */
    public MqOutbox outbox() {
        return this.outbox;
    }
}

View File

@ -0,0 +1,21 @@
package nu.marginalia.query.model;
import nu.marginalia.index.client.model.query.SearchSpecification;
import java.util.*;
/**
 * Immutable holder for a search specification together with the
 * human-readable search terms and a domain string.
 */
public class ProcessedQuery {
    public final SearchSpecification specs;
    public final List<String> searchTermsHuman;
    /** May be null; the specs-only constructor leaves it null. */
    public final String domain;

    /** Wraps just a specification: the human terms are an empty list and the domain is null. */
    public ProcessedQuery(SearchSpecification justSpecs) {
        this(justSpecs, List.of(), null);
    }

    /** Stores the three arguments as given, without copying. */
    public ProcessedQuery(SearchSpecification specs,
                          List<String> searchTermsHuman,
                          String domain)
    {
        this.specs = specs;
        this.searchTermsHuman = searchTermsHuman;
        this.domain = domain;
    }
}

View File

@ -0,0 +1,41 @@
package nu.marginalia.query.model;
import nu.marginalia.index.client.model.query.SearchSetIdentifier;
import nu.marginalia.index.client.model.query.SearchSpecification;
import nu.marginalia.index.query.limit.QueryLimits;
import nu.marginalia.index.query.limit.SpecificationLimit;
import java.util.List;
/**
 * Parameters of a search request sent to the query service: the
 * human-entered query plus additional term lists, specification limits,
 * domain ids, query limits and the search set to use.
 */
public record QueryParams(String humanQuery,
                          String nearDomain,
                          List<String> tacitIncludes,
                          List<String> tacitExcludes,
                          List<String> tacitPriority,
                          List<String> tacitAdvice,
                          SpecificationLimit quality,
                          SpecificationLimit year,
                          SpecificationLimit size,
                          SpecificationLimit rank,
                          List<Integer> domainIds,
                          QueryLimits limits,
                          SearchSetIdentifier identifier)
{
    /**
     * Builds parameters for a plain query: {@code nearDomain} is null, all
     * four tacit term lists and {@code domainIds} are empty, and every
     * specification limit is {@link SpecificationLimit#none()}.
     */
    public QueryParams(String query, QueryLimits limits, SearchSetIdentifier identifier) {
        this(query,
             null,
             List.of(), List.of(), List.of(), List.of(),
             SpecificationLimit.none(), SpecificationLimit.none(),
             SpecificationLimit.none(), SpecificationLimit.none(),
             List.of(),
             limits,
             identifier);
    }
}

View File

@ -0,0 +1,23 @@
package nu.marginalia.query.model;
import nu.marginalia.index.client.model.query.SearchSpecification;
import nu.marginalia.index.client.model.results.DecoratedSearchResultItem;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
/**
 * Response to a search request: the specification that was used, the
 * decorated results, the human-readable search terms, a list of problems
 * and a domain string.
 */
public record QueryResponse(SearchSpecification specs,
                            List<DecoratedSearchResultItem> results,
                            List<String> searchTermsHuman,
                            List<String> problems,
                            String domain)
{
    /**
     * Collects the include-terms of every subquery in {@code specs} into one set.
     *
     * @return a new mutable set; duplicates across subqueries collapse
     */
    public Set<String> getAllKeywords() {
        final Set<String> included = new HashSet<>(100);

        for (var subquery : specs.subqueries) {
            for (var term : subquery.searchTermsInclude) {
                included.add(term);
            }
        }

        return included;
    }
}

View File

@ -3,7 +3,7 @@
## Core Services
* [assistant-api](assistant-api/)
* [search-api](search-api/)
* [query-api](query-api/)
* [index-api](index-api/)
These are clients for the [core services](../services-core/), along with what models
@ -13,11 +13,11 @@ are necessary for speaking to them. They each implement the abstract client clas
All that is necessary is to `@Inject` them into the constructor and then
requests can be sent.
**Note:** If you are looking for the public API, it's handled by the api service in [services-satellite/api-service](../services-satellite/api-service).
**Note:** If you are looking for the public API, it's handled by the api service in [services-application/api-service](../services-application/api-service).
## MQ-API Process API
[process-mqapi](process-mqapi/) defines requests and inboxes for the message queue based API used
for interacting with processes.
See [libraries/message-queue](../libraries/message-queue) and [services-satellite/control-service](../services-satellite/control-service).
See [libraries/message-queue](../libraries/message-queue) and [services-core/control-service](../services-core/control-service).

View File

@ -1,8 +0,0 @@
# Search API
Client and models for talking to the [search-service](../../services-core/search-service),
implemented with the base client from [service-client](../../common/service-client).
## Central Classes
* [SearchClient](src/main/java/nu/marginalia/search/client/SearchClient.java)

View File

@ -1,52 +0,0 @@
package nu.marginalia.search.client;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import io.reactivex.rxjava3.core.Observable;
import nu.marginalia.client.AbstractDynamicClient;
import nu.marginalia.model.gson.GsonFactory;
import nu.marginalia.mq.MessageQueueFactory;
import nu.marginalia.mq.outbox.MqOutbox;
import nu.marginalia.search.client.model.ApiSearchResults;
import nu.marginalia.service.descriptor.ServiceDescriptors;
import nu.marginalia.service.id.ServiceId;
import nu.marginalia.WmsaHome;
import nu.marginalia.client.Context;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import javax.annotation.CheckReturnValue;
import java.net.URLEncoder;
import java.nio.charset.StandardCharsets;
import java.util.UUID;
@Singleton
public class SearchClient extends AbstractDynamicClient {
private final Logger logger = LoggerFactory.getLogger(getClass());
private final MqOutbox outbox;
@Inject
public SearchClient(ServiceDescriptors descriptors,
MessageQueueFactory messageQueueFactory) {
super(descriptors.forId(ServiceId.Search), WmsaHome.getHostsFile(), GsonFactory::get);
String inboxName = ServiceId.Search.name + ":" + "0";
String outboxName = System.getProperty("service-name", UUID.randomUUID().toString());
outbox = messageQueueFactory.createOutbox(inboxName, outboxName, UUID.randomUUID());
}
public MqOutbox outbox() {
return outbox;
}
@CheckReturnValue
public Observable<ApiSearchResults> query(Context ctx, String queryString, int count, int profile) {
return this.get(ctx, String.format("/api/search?query=%s&count=%d&index=%d", URLEncoder.encode(queryString, StandardCharsets.UTF_8), count, profile), ApiSearchResults.class);
}
}

View File

@ -1,7 +0,0 @@
package nu.marginalia.search.client;
public class SearchMqEndpoints {
/** Flushes the URL caches, run if significant changes have occurred in the URLs database */
public static final String FLUSH_CACHES = "FLUSH_CACHES";
public static final String SWITCH_LINKDB = "SWITCH_LINKDB";
}

View File

@ -8,4 +8,4 @@ this information take effect in production immediately, even before
the information was searchable.
It is constructed by the [loading-process](../../processes/loading-process), and consumed
by the [search-service](../../services-core/search-service).
by the [index-service](../../services-core/index-service).

View File

@ -62,6 +62,15 @@ public class LinkdbReader {
connection = createConnection();
}
/** Drops the current connection, if there is one, and replaces it with
 * a fresh one from createConnection(). Useful in tests and not much else.
 *
 * @throws SQLException if closing the existing connection fails
 *         (presumably also if createConnection() fails -- confirm)
 */
public void reconnect() throws SQLException {
    if (null != connection) {
        connection.close();
    }

    connection = createConnection();
}
public List<String> getUrlsFromDomain(int domainId) throws SQLException {
if (connection == null ||
connection.isClosed())

View File

@ -6,7 +6,6 @@ This package contains common models to the search engine
* [EdgeDomain](src/main/java/nu/marginalia/model/EdgeDomain.java)
* [EdgeUrl](src/main/java/nu/marginalia/model/EdgeUrl.java)
* [EdgeId](src/main/java/nu/marginalia/model/id/EdgeId.java)
* [DocumentMetadata](src/main/java/nu/marginalia/model/idx/DocumentMetadata.java)
* [DocumentFlags](src/main/java/nu/marginalia/model/idx/DocumentFlags.java)
* [WordMetadata](src/main/java/nu/marginalia/model/idx/WordMetadata.java)

View File

@ -10,6 +10,7 @@ public class SearchServiceDescriptors {
public static ServiceDescriptors descriptors = new ServiceDescriptors(
List.of(new ServiceDescriptor(ServiceId.Api, 5004),
new ServiceDescriptor(ServiceId.Index, 5021),
new ServiceDescriptor(ServiceId.Query, 5022),
new ServiceDescriptor(ServiceId.Search, 5023),
new ServiceDescriptor(ServiceId.Assistant, 5025),
new ServiceDescriptor(ServiceId.Dating, 5070),

View File

@ -6,6 +6,7 @@ public enum ServiceId {
Api("api-service"),
Search("search-service"),
Index("index-service"),
Query("query-service"),
Control("control-service"),

View File

@ -17,7 +17,7 @@ Contains domain ranking algorithms.
## See Also
* [features-search/result-ranking](../../features-search/result-ranking) - Ranks search results
* [result-ranking](../result-ranking) - Ranks search results
## Useful Resources

View File

@ -1,4 +1,7 @@
package nu.marginalia.index.query.limit;
/** The four numeric limits that accompany a query. */
public record QueryLimits(int resultsByDomain, int resultsTotal, int timeoutMs, int fetchSize) {

    /**
     * Returns a copy of these limits in which {@code resultsByDomain} is set
     * to {@code resultsTotal}; the other three values are carried over.
     */
    public QueryLimits forSingleDomain() {
        final int perDomain = this.resultsTotal;

        return new QueryLimits(perDomain, this.resultsTotal, this.timeoutMs, this.fetchSize);
    }
}

View File

@ -14,11 +14,11 @@ These indices rely heavily on the [libraries/btree](../libraries/btree) and [lib
## Algorithms
* [domain-ranking](domain-ranking/) contains ranking algorithms.
* [domain-ranking](domain-ranking/) contains domain ranking algorithms.
* [result-ranking](result-ranking/) contains logic for ranking search results by relevance.
# Libraries
* [index-query](index-query/) contains structures for evaluating search queries.
* [index-journal](index-journal/) contains tools for writing and reading index data.
* [lexicon](lexicon/) contains a mapping between words' string representation and a unique integer identifier.

View File

@ -11,4 +11,4 @@ results higher.
## See Also
* [features-index/domain-ranking](../../features-index/domain-ranking) - Ranks domains
* [features-index/domain-ranking](../domain-ranking) - Ranks domains

View File

@ -1,6 +1,6 @@
# Query Parser
End-user search query parsing tools used by the [search-service](../../services-core/search-service).
End-user search query parsing tools used by the [query-service](../../services-core/query-service).
## Central Classes

View File

@ -36,7 +36,6 @@ public class Token {
case EXCLUDE_TERM: visitor.onExcludeTerm(this); break;
case PRIORTY_TERM: visitor.onPriorityTerm(this); break;
case ADVICE_TERM: visitor.onAdviceTerm(this); break;
case NEAR_TERM: visitor.onNearTerm(this); break;
case LITERAL_TERM: visitor.onLiteralTerm(this); break;
case YEAR_TERM: visitor.onYearTerm(this); break;

View File

@ -6,8 +6,6 @@ public interface TokenVisitor {
void onExcludeTerm(Token token);
void onPriorityTerm(Token token);
void onAdviceTerm(Token token);
void onNearTerm(Token token);
void onYearTerm(Token token);
void onSizeTerm(Token token);
void onRankTerm(Token token);

View File

@ -0,0 +1,7 @@
**Note**: This package is called `features-qs` rather than `features-query` because the latter,
though more consistent with other packages like features-index, would be very confusing
as there are other packages elsewhere with the 'query' name (e.g. features-index/index-query).
## Contents
* [query-parser](query-parser/) contains code for parsing the user-facing query grammar.

View File

@ -3,8 +3,6 @@
These are bits of search-engine related code that are relatively isolated pieces of business logic,
that benefit from the clarity of being kept separate from the rest of the
search engine code.
* [query-parser](query-parser/) contains code for parsing the user-facing query grammar.
* [result-ranking](result-ranking/) contains logic for ranking search results by relevance.
*
* [screenshots](screenshots/) and [random-websites](random-websites/) contain SQL queries for the random
exploration mode.

View File

@ -13,4 +13,4 @@ its words, how they stem, POS tags, and so on.
[features-convert/keyword-extraction](../../features-convert/keyword-extraction) uses this code to identify which keywords
are important.
[features-search/query-parser](../../features-search/query-parser) also does some language processing.
[features-qs/query-parser](../../features-qs/query-parser) also does some language processing.

View File

@ -7,7 +7,6 @@ Contains models shared by the [crawling-process](../../processes/crawling-proces
* [CrawledDocument](src/main/java/nu/marginalia/crawling/model/CrawledDocument.java)
* [CrawledDomain](src/main/java/nu/marginalia/crawling/model/CrawledDomain.java)
* [CrawlingSpecification](src/main/java/nu/marginalia/crawling/model/spec/CrawlingSpecification.java)
### Serialization
* [CrawledDomainReader](src/main/java/nu/marginalia/crawling/io/CrawledDomainReader.java)

View File

@ -8,7 +8,7 @@ into per-domain snapshots.
* [CrawlerMain](src/main/java/nu/marginalia/crawl/CrawlerMain.java) orchestrates the crawling.
* [CrawlerRetreiver](src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java)
visits known addresses from a domain and downloads each document.
* [HttpFetcher](src/main/java/nu/marginalia/crawl/retreival/HttpFetcher.java)
* [HttpFetcher](src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java)
fetches a URL.
## See Also

View File

@ -32,7 +32,7 @@ dependencies {
implementation project(':code:libraries:message-queue')
implementation project(':code:libraries:language-processing')
implementation project(':third-party:commons-codec')
testImplementation project(':code:services-core:search-service')
testImplementation project(':code:services-application:search-service')
implementation project(':code:process-models:crawling-model')
implementation project(':code:process-models:processed-data')

View File

@ -6,5 +6,4 @@ the index-service.
## Central Classes
* [LoaderMain](src/main/java/nu/marginalia/loading/LoaderMain.java) main class.
* [Loader](src/main/java/nu/marginalia/loading/loader/Loader.java) evaluates loading instructions.
* [LoaderMain](src/main/java/nu/marginalia/loading/LoaderMain.java) main class.

View File

@ -5,8 +5,7 @@
The [crawling-process](crawling-process/) fetches website contents and saves them
as compressed JSON models described in [crawling-model](../process-models/crawling-model/).
The operation is specified by a crawl job specification. This is generated by [tools/crawl-job-extractor](../tools/crawl-job-extractor/)
based on the content in the database.
The operation is specified by a [crawl specification](../process-models/crawl-spec), which can be created in the control GUI.
## 2. Converting Process

View File

@ -14,14 +14,15 @@ A map of the most important components and how they relate can be found below.
### Services
* [core services](services-core/) "macroservices", stateful, memory hungry doing heavy lifting.
* * [control-service](services-core/control-service)
* * [search](services-core/search-service)
* * [control](services-core/control-service)
* * [query](services-core/query-service)
* * [index](services-core/index-service)
* * [assistant](services-core/assistant-service)
* [satellite services](services-satellite/) "microservices", stateless providing additional functionality.
* * [api](services-satellite/api-service) - public API
* * [dating](services-satellite/dating-service) - [https://explore.marginalia.nu/](https://explore.marginalia.nu/)
* * [explorer](services-satellite/explorer-service) - [https://explore2.marginalia.nu/](https://explore2.marginalia.nu/)
* [application services](services-application/) "microservices", stateless providing additional functionality and making an application out of the search engine.
* * [api](services-application/api-service) - public API
* * [search](services-application/search-service) - marginalia search application
* * [dating](services-application/dating-service) - [https://explore.marginalia.nu/](https://explore.marginalia.nu/)
* * [explorer](services-application/explorer-service) - [https://explore2.marginalia.nu/](https://explore2.marginalia.nu/)
* an [internal API](api/)
### Processes

View File

@ -28,7 +28,9 @@ dependencies {
implementation project(':code:common:config')
implementation project(':code:common:service-discovery')
implementation project(':code:common:service-client')
implementation project(':code:api:search-api')
implementation project(':code:api:query-api')
implementation project(':code:api:index-api')
implementation project(':code:features-index:index-query')
implementation libs.bundles.slf4j

View File

@ -0,0 +1,111 @@
package nu.marginalia.api;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import nu.marginalia.api.model.ApiSearchResult;
import nu.marginalia.api.model.ApiSearchResultQueryDetails;
import nu.marginalia.api.model.ApiSearchResults;
import nu.marginalia.client.Context;
import nu.marginalia.index.client.model.query.SearchSetIdentifier;
import nu.marginalia.index.client.model.results.DecoratedSearchResultItem;
import nu.marginalia.index.client.model.results.SearchResultKeywordScore;
import nu.marginalia.index.query.limit.QueryLimits;
import nu.marginalia.index.query.limit.SpecificationLimit;
import nu.marginalia.index.searchset.SearchSet;
import nu.marginalia.model.idx.WordMetadata;
import nu.marginalia.query.client.QueryClient;
import nu.marginalia.query.model.QueryParams;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
/**
 * Runs searches for the public API: builds query parameters, sends them to
 * the query service through {@link QueryClient}, and converts the decorated
 * results into the API result model.
 */
@Singleton
public class ApiSearchOperator {
    private final QueryClient queryClient;

    @Inject
    public ApiSearchOperator(QueryClient queryClient) {
        this.queryClient = queryClient;
    }

    /**
     * Executes a search and returns the converted results, sorted by
     * descending quality and truncated to {@code count} entries.
     *
     * @param context request context passed on to the query client
     * @param query   the query string, echoed back in the response
     * @param count   maximum number of results to return; negative values
     *                are treated as zero
     * @param index   numeric search set selector, see {@link #selectSearchSet(int)}
     */
    public ApiSearchResults query(Context context,
                                  String query,
                                  int count,
                                  int index)
    {
        // Stream.limit() throws IllegalArgumentException for a negative size, and a
        // negative total would otherwise be forwarded in the QueryLimits as well.
        final int maxResults = Math.max(0, count);

        var rsp = queryClient.search(context, createParams(query, maxResults, index));

        return new ApiSearchResults("RESTRICTED", query,
                rsp.results()
                        .stream()
                        .map(this::convert)
                        .sorted(Comparator.comparing(ApiSearchResult::getQuality).reversed())
                        .limit(maxResults)
                        .collect(Collectors.toList()));
    }

    /**
     * Builds the query parameters: 2 results per domain, at most
     * {@code min(100, count)} results in total, a timeout of 150 ms and a
     * fetch size of 8192.
     */
    private QueryParams createParams(String query, int count, int index) {
        SearchSetIdentifier searchSet = selectSearchSet(index);

        return new QueryParams(
                query,
                new QueryLimits(
                        2,
                        Math.min(100, count),
                        150,
                        8192),
                searchSet);
    }

    /**
     * Maps the numeric index parameter to a search set. Only 1 (SMALLWEB)
     * and 2 (RETRO) select a specific set; every other value, including
     * 0, 3 and 5, maps to NONE.
     */
    private SearchSetIdentifier selectSearchSet(int index) {
        return switch (index) {
            case 1 -> SearchSetIdentifier.SMALLWEB;
            case 2 -> SearchSetIdentifier.RETRO;
            default -> SearchSetIdentifier.NONE;
        };
    }

    /**
     * Converts one decorated result into the API model. The keyword scores
     * of the raw index result are grouped by subquery, and each group
     * becomes one list of details.
     */
    ApiSearchResult convert(DecoratedSearchResultItem url) {
        List<List<ApiSearchResultQueryDetails>> details = new ArrayList<>();

        if (url.rawIndexResult != null) {
            var bySet = url.rawIndexResult.keywordScores.stream()
                    .collect(Collectors.groupingBy(SearchResultKeywordScore::subquery));

            outer:
            for (var entries : bySet.values()) {
                List<ApiSearchResultQueryDetails> lst = new ArrayList<>();

                for (var entry : entries) {
                    var metadata = new WordMetadata(entry.encodedWordMetadata());

                    // One keyword with empty metadata discards the whole subquery group
                    if (metadata.isEmpty())
                        continue outer;

                    Set<String> flags = metadata.flagSet().stream()
                            .map(Object::toString)
                            .collect(Collectors.toSet());

                    lst.add(new ApiSearchResultQueryDetails(
                            entry.keyword,
                            Long.bitCount(metadata.positions()),
                            flags));
                }

                details.add(lst);
            }
        }

        return new ApiSearchResult(
                url.url.toString(),
                url.getTitle(),
                url.getDescription(),
                sanitizeNaN(url.rankingScore, -100),
                details
        );
    }

    /**
     * Returns {@code value} unless it is NaN or infinite, in which case
     * {@code alternative} is returned instead.
     */
    private double sanitizeNaN(double value, double alternative) {
        if (!Double.isFinite(value)) {
            return alternative;
        }
        return value;
    }
}

View File

@ -3,13 +3,13 @@ package nu.marginalia.api;
import com.google.gson.Gson;
import com.google.inject.Inject;
import nu.marginalia.api.model.ApiLicense;
import nu.marginalia.api.model.ApiSearchResults;
import nu.marginalia.api.svc.LicenseService;
import nu.marginalia.api.svc.RateLimiterService;
import nu.marginalia.api.svc.ResponseCache;
import nu.marginalia.client.Context;
import nu.marginalia.model.gson.GsonFactory;
import nu.marginalia.search.client.SearchClient;
import nu.marginalia.search.client.model.ApiSearchResults;
import nu.marginalia.query.client.QueryClient;
import nu.marginalia.service.server.*;
import nu.marginalia.service.server.mq.MqNotification;
import org.slf4j.Logger;
@ -24,29 +24,32 @@ public class ApiService extends Service {
private final Logger logger = LoggerFactory.getLogger(getClass());
private final Gson gson = GsonFactory.get();
private final SearchClient searchClient;
private final QueryClient queryClient;
private final ResponseCache responseCache;
private final LicenseService licenseService;
private final RateLimiterService rateLimiterService;
private final ApiSearchOperator searchOperator;
// Marker for filtering out sensitive content from the persistent logs
private final Marker queryMarker = MarkerFactory.getMarker("QUERY");
@Inject
public ApiService(BaseServiceParams params,
SearchClient searchClient,
QueryClient queryClient,
ResponseCache responseCache,
LicenseService licenseService,
RateLimiterService rateLimiterService
RateLimiterService rateLimiterService,
ApiSearchOperator searchOperator
) {
super(params);
this.searchClient = searchClient;
this.queryClient = queryClient;
this.responseCache = responseCache;
this.licenseService = licenseService;
this.rateLimiterService = rateLimiterService;
this.searchOperator = searchOperator;
Spark.get("/public/api/", (rq, rsp) -> {
rsp.redirect("https://memex.marginalia.nu/projects/edge/api.gmi");
@ -76,6 +79,8 @@ public class ApiService extends Service {
var license = licenseService.getLicense(request.params("key"));
response.type("application/json");
var cachedResponse = responseCache.getResults(license, args[0], request.queryString());
if (cachedResponse.isPresent()) {
return cachedResponse.get();
@ -87,7 +92,6 @@ public class ApiService extends Service {
// We set content type late because in the case of error, we don't want to tell the client
// that the error message is JSON when it is plain text.
response.type("application/json");
return result;
}
@ -102,8 +106,9 @@ public class ApiService extends Service {
logger.info(queryMarker, "{} Search {}", license.key, query);
return searchClient.query(Context.fromRequest(request), query, count, index)
.blockingFirst().withLicense(license.getLicense());
return searchOperator
.query(Context.fromRequest(request), query, count, index)
.withLicense(license.getLicense());
}
private int intParam(Request request, String name, int defaultValue) {

View File

@ -1,4 +1,4 @@
package nu.marginalia.search.client.model;
package nu.marginalia.api.model;
import lombok.AllArgsConstructor;
import lombok.Getter;

View File

@ -1,4 +1,4 @@
package nu.marginalia.search.client.model;
package nu.marginalia.api.model;
import lombok.AllArgsConstructor;
import lombok.Getter;

View File

@ -1,4 +1,4 @@
package nu.marginalia.search.client.model;
package nu.marginalia.api.model;
import lombok.AllArgsConstructor;
import lombok.Getter;

View File

@ -3,20 +3,12 @@ package nu.marginalia.api.svc;
import com.google.common.cache.Cache;
import com.google.common.cache.CacheBuilder;
import com.google.inject.Singleton;
import nu.marginalia.api.model.ApiLicense;
import nu.marginalia.search.client.model.ApiSearchResults;
import nu.marginalia.api.model.*;
import java.time.Duration;
import java.util.Optional;
/** This response cache exists entirely to help SearXNG with its rate limiting.
* For some reason they're hitting the API with like 5-12 identical requests.
* <p/>
* I've submitted an issue, they were like nah mang it works fine must
* be something else ¯\_()_/¯.
* <p/>
* So we're going to cache the API responses for a short while to mitigate the
* impact of such shotgun queries on the ratelimit.
/** This response cache exists entirely to help clients with their rate limiting.
*/
@Singleton
public class ResponseCache {

View File

@ -1,7 +1,6 @@
package nu.marginalia.api.svc;
import nu.marginalia.api.model.ApiLicense;
import nu.marginalia.search.client.model.ApiSearchResults;
import nu.marginalia.api.model.*;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;

View File

@ -0,0 +1,9 @@
# Application Services
The application services offer user interfaces/applications around
interacting with the [core services](../services-core).
* The [api-service](api-service/) offers a public API
* The [dating-service](dating-service/) is [explore.marginalia.nu](https://explore.marginalia.nu/)
* The [explorer-service](explorer-service/) is [explore2.marginalia.nu](https://explore2.marginalia.nu/)
* The [search-service](search-service/) is the main application for [search.marginalia.nu](https://search.marginalia.nu/)

View File

@ -25,7 +25,6 @@ dependencies {
implementation project(':code:common:model')
implementation project(':code:common:service')
implementation project(':code:common:config')
implementation project(':code:common:linkdb')
implementation project(':code:features-index:index-query')
implementation project(':code:libraries:easy-lsh')
@ -34,16 +33,16 @@ dependencies {
implementation project(':code:libraries:term-frequency-dict')
implementation project(':code:api:assistant-api')
implementation project(':code:api:query-api')
implementation project(':code:api:index-api')
implementation project(':code:api:search-api')
implementation project(':code:common:service-discovery')
implementation project(':code:common:service-client')
implementation project(':code:common:renderer')
implementation project(':code:features-search:screenshots')
implementation project(':code:features-search:random-websites')
implementation project(':code:features-search:query-parser')
implementation project(':code:features-search:result-ranking')
implementation project(':code:features-qs:query-parser')
implementation project(':code:features-index:result-ranking')
implementation libs.bundles.slf4j

View File

@ -20,8 +20,6 @@ appropriate services.
* [CommandEvaluator](src/main/java/nu/marginalia/search/command/CommandEvaluator.java) interprets a user query and acts
upon it, dealing with special operations like `browse:` or `site:`.
* [QueryFactory](src/main/java/nu/marginalia/search/query/QueryFactory.java) parses a search query into a machine readable query specification.
* [SearchQueryIndexService](src/main/java/nu/marginalia/search/svc/SearchQueryIndexService.java) passes a parsed search query to the index service, and
then decorates the search results so that they can be rendered.

View File

@ -0,0 +1,15 @@
package nu.marginalia.search;
import com.google.inject.AbstractModule;
import nu.marginalia.LanguageModels;
import nu.marginalia.WebsiteUrl;
import nu.marginalia.WmsaHome;
/**
 * Guice module for the search service. Binds the language models from
 * {@link WmsaHome} and the website URL, which is read from the
 * {@code website-url} system property with a built-in default.
 */
public class SearchModule extends AbstractModule {

    public void configure() {
        bind(LanguageModels.class).toInstance(WmsaHome.getLanguageModels());

        final String websiteUrl = System.getProperty("website-url", "https://search.marginalia.nu/");
        bind(WebsiteUrl.class).toInstance(new WebsiteUrl(websiteUrl));
    }
}

View File

@ -7,12 +7,12 @@ import io.reactivex.rxjava3.schedulers.Schedulers;
import nu.marginalia.assistant.client.AssistantClient;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.db.DbDomainQueries;
import nu.marginalia.query.client.QueryClient;
import nu.marginalia.query.model.QueryResponse;
import nu.marginalia.search.model.UrlDetails;
import nu.marginalia.client.Context;
import nu.marginalia.search.model.DecoratedSearchResults;
import nu.marginalia.search.query.QueryFactory;
import nu.marginalia.search.query.model.SearchQuery;
import nu.marginalia.search.query.model.UserSearchParameters;
import nu.marginalia.search.model.UserSearchParameters;
import nu.marginalia.search.svc.SearchQueryIndexService;
import nu.marginalia.search.svc.SearchUnitConversionService;
import org.apache.logging.log4j.util.Strings;
@ -37,58 +37,59 @@ public class SearchOperator {
private final AssistantClient assistantClient;
private final DbDomainQueries domainQueries;
private final QueryFactory queryFactory;
private final QueryClient queryClient;
private final SearchQueryIndexService searchQueryService;
private final SearchQueryParamFactory paramFactory;
private final SearchUnitConversionService searchUnitConversionService;
@Inject
public SearchOperator(AssistantClient assistantClient,
DbDomainQueries domainQueries,
QueryFactory queryFactory,
QueryClient queryClient,
SearchQueryIndexService searchQueryService,
SearchUnitConversionService searchUnitConversionService) {
SearchQueryParamFactory paramFactory,
SearchUnitConversionService searchUnitConversionService)
{
this.assistantClient = assistantClient;
this.domainQueries = domainQueries;
this.queryFactory = queryFactory;
this.queryClient = queryClient;
this.searchQueryService = searchQueryService;
this.paramFactory = paramFactory;
this.searchUnitConversionService = searchUnitConversionService;
}
public List<UrlDetails> doApiSearch(Context ctx,
UserSearchParameters params) {
public List<UrlDetails> doSiteSearch(Context ctx,
String domain) {
var queryParams = paramFactory.forSiteSearch(domain);
var queryResponse = queryClient.search(ctx, queryParams);
SearchQuery processedQuery = queryFactory.createQuery(params);
logger.info(queryMarker, "Human terms (API): {}", Strings.join(processedQuery.searchTermsHuman, ','));
return searchQueryService.executeQuery(ctx, processedQuery);
return searchQueryService.getResultsFromQuery(queryResponse);
}
public DecoratedSearchResults doSearch(Context ctx, UserSearchParameters params) {
public DecoratedSearchResults doSearch(Context ctx, UserSearchParameters userParams) {
Future<String> eval = searchUnitConversionService.tryEval(ctx, params.humanQuery());
SearchQuery processedQuery = queryFactory.createQuery(params);
Future<String> eval = searchUnitConversionService.tryEval(ctx, userParams.humanQuery());
var queryParams = paramFactory.forRegularSearch(userParams);
var queryResponse = queryClient.search(ctx, queryParams);
logger.info(queryMarker, "Human terms: {}", Strings.join(processedQuery.searchTermsHuman, ','));
List<UrlDetails> queryResults = searchQueryService.executeQuery(ctx, processedQuery);
List<UrlDetails> queryResults = searchQueryService.getResultsFromQuery(queryResponse);
logger.info(queryMarker, "Human terms: {}", Strings.join(queryResponse.searchTermsHuman(), ','));
logger.info(queryMarker, "Search Result Count: {}", queryResults.size());
String evalResult = getFutureOrDefault(eval, "");
return DecoratedSearchResults.builder()
.params(params)
.problems(getProblems(ctx, evalResult, queryResults, processedQuery))
.params(userParams)
.problems(getProblems(ctx, evalResult, queryResults, queryResponse))
.evalResult(evalResult)
.results(queryResults)
.focusDomain(processedQuery.domain)
.focusDomainId(getDomainId(processedQuery.domain))
.focusDomain(queryResponse.domain())
.focusDomainId(getDomainId(queryResponse.domain()))
.build();
}
@ -113,20 +114,20 @@ public class SearchOperator {
return domainQueries.tryGetDomainId(new EdgeDomain(domain)).orElse(-1);
}
private List<String> getProblems(Context ctx, String evalResult, List<UrlDetails> queryResults, SearchQuery processedQuery) {
final List<String> problems = new ArrayList<>(processedQuery.problems);
boolean siteSearch = processedQuery.domain != null;
private List<String> getProblems(Context ctx, String evalResult, List<UrlDetails> queryResults, QueryResponse response) {
final List<String> problems = new ArrayList<>(response.problems());
boolean siteSearch = response.domain() != null;
if (!siteSearch) {
if (queryResults.size() <= 5 && null == evalResult) {
spellCheckTerms(ctx, processedQuery).forEach(problems::add);
spellCheckTerms(ctx, response).forEach(problems::add);
}
if (queryResults.size() <= 5) {
problems.add("Try rephrasing the query, changing the word order or using synonyms to get different results. <a href=\"https://memex.marginalia.nu/projects/edge/search-tips.gmi\">Tips</a>.");
}
Set<String> representativeKeywords = processedQuery.getAllKeywords();
Set<String> representativeKeywords = response.getAllKeywords();
if (representativeKeywords.size()>1 && (representativeKeywords.contains("definition") || representativeKeywords.contains("define") || representativeKeywords.contains("meaning")))
{
problems.add("Tip: Try using a query that looks like <tt>define:word</tt> if you want a dictionary definition");
@ -137,8 +138,8 @@ public class SearchOperator {
}
private Iterable<String> spellCheckTerms(Context ctx, SearchQuery disjointedQuery) {
return Observable.fromIterable(disjointedQuery.searchTermsHuman)
private Iterable<String> spellCheckTerms(Context ctx, QueryResponse response) {
return Observable.fromIterable(response.searchTermsHuman())
.subscribeOn(Schedulers.io())
.flatMap(term -> assistantClient.spellCheck(ctx, term)
.onErrorReturn(e -> Collections.emptyList())

View File

@ -0,0 +1,53 @@
package nu.marginalia.search;
import nu.marginalia.index.client.model.query.SearchSetIdentifier;
import nu.marginalia.index.client.model.query.SearchSubquery;
import nu.marginalia.index.query.limit.QueryLimits;
import nu.marginalia.index.query.limit.SpecificationLimit;
import nu.marginalia.query.model.QueryParams;
import nu.marginalia.search.model.UserSearchParameters;
import java.util.List;
/**
 * Builds {@link QueryParams} instances for the two kinds of query visible here:
 * a regular search driven by the user's search parameters, and a search
 * restricted to a single site.
 */
public class SearchQueryParamFactory {

    /**
     * Builds the parameters for a regular search.
     *
     * <p>The human query string is passed through as-is. The include, exclude,
     * priority and advice term lists are whatever the user's profile adds to a
     * freshly created {@link SearchSubquery}; the quality, year and size limits
     * and the search set identifier are also taken from the profile.
     *
     * @param userParams the user's search request, supplying the query string and profile
     * @return the assembled query parameters
     */
    public QueryParams forRegularSearch(UserSearchParameters userParams) {
        // Scratch subquery: used only as a carrier for the terms the profile adds
        final SearchSubquery tacitTerms = new SearchSubquery();

        final var searchProfile = userParams.profile();
        searchProfile.addTacitTerms(tacitTerms);

        return new QueryParams(
                userParams.humanQuery(),
                null, // NOTE(review): meaning of this positional argument is not visible here -- confirm against QueryParams
                tacitTerms.searchTermsInclude,
                tacitTerms.searchTermsExclude,
                tacitTerms.searchTermsPriority,
                tacitTerms.searchTermsAdvice,
                searchProfile.getQualityLimit(),
                searchProfile.getYearLimit(),
                searchProfile.getSizeLimit(),
                SpecificationLimit.none(), // fourth limit is left unrestricted; NOTE(review): confirm which dimension it constrains
                List.of(),
                new QueryLimits(2, 100, 200, 8192), // NOTE(review): field meanings are defined by QueryLimits -- confirm there
                searchProfile.searchSetIdentifier
        );
    }

    /**
     * Builds the parameters for a site search.
     *
     * <p>The query string is {@code "site:"} followed by the given domain. All
     * term lists are empty, all four specification limits are
     * {@link SpecificationLimit#none()}, and the search set is
     * {@link SearchSetIdentifier#NONE}.
     *
     * @param domain the domain appended after the {@code site:} prefix
     * @return the assembled query parameters
     */
    public QueryParams forSiteSearch(String domain) {
        final String siteQuery = "site:" + domain;

        return new QueryParams(
                siteQuery,
                null, // NOTE(review): same unexplained positional argument as in forRegularSearch
                List.of(),
                List.of(),
                List.of(),
                List.of(),
                SpecificationLimit.none(),
                SpecificationLimit.none(),
                SpecificationLimit.none(),
                SpecificationLimit.none(),
                List.of(),
                new QueryLimits(100, 100, 100, 512), // differs from the regular-search limits; NOTE(review): confirm field meanings in QueryLimits
                SearchSetIdentifier.NONE
        );
    }
}

View File

@ -6,15 +6,11 @@ import lombok.SneakyThrows;
import nu.marginalia.WebsiteUrl;
import nu.marginalia.client.Context;
import nu.marginalia.db.storage.FileStorageService;
import nu.marginalia.db.storage.model.FileStorageType;
import nu.marginalia.linkdb.LinkdbReader;
import nu.marginalia.model.gson.GsonFactory;
import nu.marginalia.search.client.SearchMqEndpoints;
import nu.marginalia.search.svc.SearchFrontPageService;
import nu.marginalia.search.svc.*;
import nu.marginalia.service.control.ServiceEventLog;
import nu.marginalia.service.server.*;
import nu.marginalia.service.server.mq.MqNotification;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import spark.Request;
@ -23,18 +19,13 @@ import spark.Spark;
import java.net.URLEncoder;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
public class SearchService extends Service {
private final WebsiteUrl websiteUrl;
private final StaticResources staticResources;
private final FileStorageService fileStorageService;
private final LinkdbReader linkdbReader;
private static final Logger logger = LoggerFactory.getLogger(SearchService.class);
private final ServiceEventLog eventLog;
@SneakyThrows
@Inject
@ -45,18 +36,12 @@ public class SearchService extends Service {
SearchErrorPageService errorPageService,
SearchAddToCrawlQueueService addToCrawlQueueService,
SearchFlagSiteService flagSiteService,
SearchQueryService searchQueryService,
SearchApiQueryService apiQueryService,
FileStorageService fileStorageService,
LinkdbReader linkdbReader
SearchQueryService searchQueryService
) {
super(params);
this.eventLog = params.eventLog;
this.websiteUrl = websiteUrl;
this.staticResources = staticResources;
this.fileStorageService = fileStorageService;
this.linkdbReader = linkdbReader;
Spark.staticFiles.expireTime(600);
@ -64,7 +49,6 @@ public class SearchService extends Service {
Gson gson = GsonFactory.get();
Spark.get("/api/search", apiQueryService::apiSearch, gson::toJson);
Spark.get("/public/search", searchQueryService::pathSearch);
Spark.get("/public/site-search/:site/*", this::siteSearchRedir);
Spark.get("/public/", frontPageService::render);
@ -87,21 +71,6 @@ public class SearchService extends Service {
Spark.awaitInitialization();
}
@SneakyThrows
@MqNotification(endpoint = SearchMqEndpoints.SWITCH_LINKDB)
public void switchLinkdb(String unusedArg) {
logger.info("Switching link database");
Path newPath = fileStorageService.getStorageByType(FileStorageType.LINKDB_STAGING)
.asPath()
.resolve("links.db");
if (Files.exists(newPath)) {
eventLog.logEvent("SEARCH-SWITCH-LINKDB", "");
linkdbReader.switchInput(newPath);
}
}
private Object serveStatic(Request request, Response response) {
String resource = request.params("resource");
staticResources.serveStatic("search", resource, request, response);

Some files were not shown because too many files have changed in this diff Show More