Merge pull request 'Experimental domain-searching feature' (#45) from master into release
Reviewed-on: https://git.marginalia.nu/marginalia/marginalia.nu/pulls/45
This commit is contained in:
commit
6f5b610043
@ -2,14 +2,13 @@ package nu.marginalia.wmsa.edge.data.dao;
|
||||
|
||||
import com.google.inject.ImplementedBy;
|
||||
import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklist;
|
||||
import nu.marginalia.wmsa.edge.model.*;
|
||||
import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeDomain;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeId;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeUrl;
|
||||
import nu.marginalia.wmsa.edge.model.search.EdgeUrlDetails;
|
||||
import nu.marginalia.wmsa.edge.search.model.BrowseResult;
|
||||
|
||||
import java.util.Collection;
|
||||
import java.util.List;
|
||||
import java.util.Optional;
|
||||
|
||||
@ImplementedBy(EdgeDataStoreDaoImpl.class)
|
||||
public interface EdgeDataStoreDao {
|
||||
@ -18,6 +17,9 @@ public interface EdgeDataStoreDao {
|
||||
List<BrowseResult> getDomainNeighborsAdjacent(EdgeId<EdgeDomain> domainId, EdgeDomainBlacklist backlist, int count);
|
||||
|
||||
List<BrowseResult> getRandomDomains(int count, EdgeDomainBlacklist backlist);
|
||||
|
||||
List<BrowseResult> getBrowseResultFromUrlIds(List<EdgeId<EdgeUrl>> urlId);
|
||||
|
||||
List<EdgeUrlDetails> getUrlDetailsMulti(List<EdgeId<EdgeUrl>> ids);
|
||||
|
||||
EdgeDomain getDomain(EdgeId<EdgeDomain> id);
|
||||
|
@ -19,6 +19,7 @@ import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.sql.SQLException;
|
||||
import java.util.*;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
|
||||
public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
|
||||
@ -264,6 +265,31 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
|
||||
return domains;
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<BrowseResult> getBrowseResultFromUrlIds(List<EdgeId<EdgeUrl>> urlId) {
|
||||
List<BrowseResult> ret = new ArrayList<>(urlId.size());
|
||||
|
||||
try (var conn = dataSource.getConnection()) {
|
||||
try (var stmt = conn.createStatement()) {
|
||||
// this is safe, string cocatenation is of integers
|
||||
String inStmt = urlId.stream().map(id -> Integer.toString(id.id())).collect(Collectors.joining(", ", "(", ")"));
|
||||
|
||||
var rsp = stmt.executeQuery("SELECT DOMAIN_ID, DOMAIN_NAME FROM EC_URL WHERE ID IN " + inStmt);
|
||||
while (rsp.next()) {
|
||||
int id = rsp.getInt(1);
|
||||
String domain = rsp.getString(2);
|
||||
|
||||
ret.add(new BrowseResult(new EdgeDomain(domain).toRootUrl(), id));
|
||||
}
|
||||
}
|
||||
}
|
||||
catch (SQLException ex) {
|
||||
logger.error("SQL error", ex);
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
@Override
|
||||
@SneakyThrows
|
||||
public EdgeDomain getDomain(EdgeId<EdgeDomain> id) {
|
||||
|
@ -2,8 +2,8 @@ package nu.marginalia.wmsa.edge.model.search;
|
||||
|
||||
import lombok.*;
|
||||
import nu.marginalia.wmsa.edge.converting.processor.logic.HtmlFeature;
|
||||
import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeUrl;
|
||||
import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState;
|
||||
import nu.marginalia.wmsa.edge.search.model.EdgeSearchRankingSymbols;
|
||||
|
||||
import java.util.Objects;
|
||||
@ -21,12 +21,9 @@ public class EdgeUrlDetails {
|
||||
public String format;
|
||||
public int features;
|
||||
|
||||
|
||||
|
||||
public String ip; // BROKEN
|
||||
public String ip;
|
||||
public EdgeDomainIndexingState domainState;
|
||||
|
||||
|
||||
public int dataHash;
|
||||
|
||||
public EdgePageScoreAdjustment urlQualityAdjustment;
|
||||
|
@ -15,6 +15,8 @@ import nu.marginalia.wmsa.edge.model.EdgeDomain;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeId;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeUrl;
|
||||
import nu.marginalia.wmsa.edge.model.search.*;
|
||||
import nu.marginalia.wmsa.edge.model.search.domain.EdgeDomainSearchSpecification;
|
||||
import nu.marginalia.wmsa.edge.search.model.BrowseResult;
|
||||
import nu.marginalia.wmsa.edge.search.model.DecoratedSearchResultSet;
|
||||
import nu.marginalia.wmsa.edge.search.model.DecoratedSearchResults;
|
||||
import nu.marginalia.wmsa.edge.search.query.QueryFactory;
|
||||
@ -97,15 +99,32 @@ public class EdgeSearchOperator {
|
||||
|
||||
String evalResult = getEvalResult(eval);
|
||||
|
||||
List<BrowseResult> domainResults = getDomainResults(ctx, processedQuery.specs);
|
||||
|
||||
return new DecoratedSearchResults(params,
|
||||
getProblems(ctx, params.humanQuery(), evalResult, queryResults, processedQuery),
|
||||
evalResult,
|
||||
definitions.onErrorReturn((e) -> new WikiArticles()).blockingFirst(),
|
||||
queryResults.resultSet,
|
||||
domainResults,
|
||||
processedQuery.domain,
|
||||
getDomainId(processedQuery.domain));
|
||||
}
|
||||
|
||||
private List<BrowseResult> getDomainResults(Context ctx, EdgeSearchSpecification specs) {
|
||||
List<String> keywords = specs.subqueries.stream().filter(sq -> sq.searchTermsExclude.isEmpty() && sq.searchTermsInclude.size() == 1)
|
||||
.findFirst().map(sq -> sq.searchTermsExclude).orElseGet(Collections::emptyList);
|
||||
|
||||
if (keywords.size() == 1) {
|
||||
var request = new EdgeDomainSearchSpecification(specs.buckets.get(0), IndexBlock.TitleKeywords, keywords.get(0), 10_000, 10, 5);
|
||||
var response = indexClient.queryDomains(ctx, request);
|
||||
|
||||
return edgeDataStoreDao.getBrowseResultFromUrlIds(response.results);
|
||||
}
|
||||
|
||||
return Collections.emptyList();
|
||||
}
|
||||
|
||||
private String getEvalResult(@Nullable Future<String> eval) {
|
||||
if (eval == null || eval.isCancelled()) {
|
||||
return "";
|
||||
|
@ -6,10 +6,10 @@ import nu.marginalia.wmsa.edge.assistant.screenshot.ScreenshotService;
|
||||
import nu.marginalia.wmsa.edge.data.dao.EdgeDataStoreDao;
|
||||
import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklist;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeDomain;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeId;
|
||||
import nu.marginalia.wmsa.edge.search.command.SearchCommandInterface;
|
||||
import nu.marginalia.wmsa.edge.search.command.SearchParameters;
|
||||
import nu.marginalia.wmsa.edge.search.model.BrowseResultSet;
|
||||
import nu.marginalia.wmsa.edge.search.results.BrowseResultCleaner;
|
||||
import nu.marginalia.wmsa.renderer.mustache.MustacheRenderer;
|
||||
import nu.marginalia.wmsa.renderer.mustache.RendererFactory;
|
||||
import org.slf4j.Logger;
|
||||
@ -28,6 +28,7 @@ public class BrowseCommand implements SearchCommandInterface {
|
||||
private final ScreenshotService screenshotService;
|
||||
private final EdgeDomainBlacklist blacklist;
|
||||
private final MustacheRenderer<BrowseResultSet> browseResultsRenderer;
|
||||
private final BrowseResultCleaner browseResultCleaner;
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
private final Predicate<String> queryPatternPredicate = Pattern.compile("^browse:[.A-Za-z\\-0-9]+$").asPredicate();
|
||||
|
||||
@ -35,12 +36,14 @@ public class BrowseCommand implements SearchCommandInterface {
|
||||
public BrowseCommand(EdgeDataStoreDao edgeDataStoreDao,
|
||||
ScreenshotService screenshotService,
|
||||
EdgeDomainBlacklist blacklist,
|
||||
RendererFactory rendererFactory)
|
||||
RendererFactory rendererFactory,
|
||||
BrowseResultCleaner browseResultCleaner)
|
||||
throws IOException
|
||||
{
|
||||
this.edgeDataStoreDao = edgeDataStoreDao;
|
||||
this.screenshotService = screenshotService;
|
||||
this.blacklist = blacklist;
|
||||
this.browseResultCleaner = browseResultCleaner;
|
||||
|
||||
browseResultsRenderer = rendererFactory.renderer("edge/browse-results");
|
||||
}
|
||||
@ -66,9 +69,7 @@ public class BrowseCommand implements SearchCommandInterface {
|
||||
if ("random".equals(word)) {
|
||||
var results = edgeDataStoreDao.getRandomDomains(25, blacklist);
|
||||
|
||||
results.removeIf(res ->
|
||||
!screenshotService.hasScreenshot(new EdgeId<>(res.domainId))
|
||||
|| !domainHashes.add(res.domainHash()));
|
||||
results.removeIf(browseResultCleaner.shouldRemoveResultPredicate());
|
||||
|
||||
return new BrowseResultSet(results);
|
||||
}
|
||||
@ -76,9 +77,7 @@ public class BrowseCommand implements SearchCommandInterface {
|
||||
var domain = edgeDataStoreDao.getDomainId(new EdgeDomain(word));
|
||||
var neighbors = edgeDataStoreDao.getDomainNeighborsAdjacent(domain, blacklist, 45);
|
||||
|
||||
neighbors.removeIf(res ->
|
||||
!screenshotService.hasScreenshot(new EdgeId<>(res.domainId))
|
||||
|| !domainHashes.add(res.domainHash()));
|
||||
neighbors.removeIf(browseResultCleaner.shouldRemoveResultPredicate());
|
||||
|
||||
return new BrowseResultSet(neighbors);
|
||||
}
|
||||
|
@ -10,6 +10,7 @@ import nu.marginalia.wmsa.edge.search.command.SearchCommandInterface;
|
||||
import nu.marginalia.wmsa.edge.search.command.SearchParameters;
|
||||
import nu.marginalia.wmsa.edge.search.model.DecoratedSearchResults;
|
||||
import nu.marginalia.wmsa.edge.search.query.model.EdgeUserSearchParameters;
|
||||
import nu.marginalia.wmsa.edge.search.results.BrowseResultCleaner;
|
||||
import nu.marginalia.wmsa.renderer.mustache.MustacheRenderer;
|
||||
import nu.marginalia.wmsa.renderer.mustache.RendererFactory;
|
||||
|
||||
@ -24,17 +25,21 @@ public class SearchCommand implements SearchCommandInterface {
|
||||
private final EdgeSearchOperator searchOperator;
|
||||
private final UnitConversion unitConversion;
|
||||
private final MustacheRenderer<DecoratedSearchResults> searchResultsRenderer;
|
||||
private BrowseResultCleaner browseResultCleaner;
|
||||
|
||||
@Inject
|
||||
public SearchCommand(EdgeDomainBlacklist blacklist,
|
||||
EdgeDataStoreDao dataStoreDao,
|
||||
EdgeSearchOperator searchOperator,
|
||||
UnitConversion unitConversion,
|
||||
RendererFactory rendererFactory) throws IOException {
|
||||
RendererFactory rendererFactory,
|
||||
BrowseResultCleaner browseResultCleaner
|
||||
) throws IOException {
|
||||
this.blacklist = blacklist;
|
||||
this.dataStoreDao = dataStoreDao;
|
||||
this.searchOperator = searchOperator;
|
||||
this.unitConversion = unitConversion;
|
||||
this.browseResultCleaner = browseResultCleaner;
|
||||
|
||||
searchResultsRenderer = rendererFactory.renderer("edge/search-results");
|
||||
}
|
||||
@ -46,7 +51,8 @@ public class SearchCommand implements SearchCommandInterface {
|
||||
EdgeUserSearchParameters params = new EdgeUserSearchParameters(query, parameters.profile(), parameters.js());
|
||||
DecoratedSearchResults results = searchOperator.doSearch(ctx, params, eval);
|
||||
|
||||
results.getResults().removeIf(detail -> blacklist.isBlacklisted(dataStoreDao.getDomainId(detail.url.domain)));
|
||||
results.results.removeIf(detail -> blacklist.isBlacklisted(dataStoreDao.getDomainId(detail.url.domain)));
|
||||
results.domainResults.removeIf(browseResultCleaner.shouldRemoveResultPredicate());
|
||||
|
||||
return Optional.of(searchResultsRenderer.render(results));
|
||||
}
|
||||
|
@ -14,7 +14,9 @@ public class DecoratedSearchResults {
|
||||
private final List<String> problems;
|
||||
private final String evalResult;
|
||||
private final WikiArticles wiki;
|
||||
private final List<EdgeUrlDetails> results;
|
||||
|
||||
public final List<EdgeUrlDetails> results;
|
||||
public final List<BrowseResult> domainResults;
|
||||
|
||||
private final String focusDomain;
|
||||
private final int focusDomainId;
|
||||
|
@ -0,0 +1,28 @@
|
||||
package nu.marginalia.wmsa.edge.search.results;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import nu.marginalia.wmsa.edge.assistant.screenshot.ScreenshotService;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeId;
|
||||
import nu.marginalia.wmsa.edge.search.model.BrowseResult;
|
||||
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
import java.util.function.Predicate;
|
||||
|
||||
@Singleton
|
||||
public class BrowseResultCleaner {
|
||||
private final ScreenshotService screenshotService;
|
||||
|
||||
@Inject
|
||||
public BrowseResultCleaner(ScreenshotService screenshotService) {
|
||||
this.screenshotService = screenshotService;
|
||||
}
|
||||
|
||||
public Predicate<BrowseResult> shouldRemoveResultPredicate() {
|
||||
Set<String> domainHashes = new HashSet<>(100);
|
||||
|
||||
return (res) -> !screenshotService.hasScreenshot(new EdgeId<>(res.domainId))
|
||||
|| !domainHashes.add(res.domainHash());
|
||||
}
|
||||
}
|
@ -37,6 +37,7 @@
|
||||
</section>
|
||||
{{/if}}
|
||||
|
||||
{{#each domainResults}}{{>edge/browse-result}}{{/each}}
|
||||
{{#each results}}{{>edge/search-result}}{{/each}}
|
||||
|
||||
{{#unless evalResult}}{{#if problems}}<section class="card problems onlyscreen"><h2>Suggestions</h2><ul class="onlyscreen search-problems">{{#each problems}}<li>{{{.}}}</li>{{/each}}</ul></section> {{/if}}{{/unless}}
|
||||
|
Loading…
Reference in New Issue
Block a user