From bf328a059722d45f7f25010b38dfb83241b0af6b Mon Sep 17 00:00:00 2001 From: vlofgren Date: Thu, 28 Jul 2022 17:58:41 +0200 Subject: [PATCH] Experimental domain-searching feature --- .../wmsa/edge/data/dao/EdgeDataStoreDao.java | 10 ++++--- .../edge/data/dao/EdgeDataStoreDaoImpl.java | 26 +++++++++++++++++ .../edge/model/search/EdgeUrlDetails.java | 7 ++--- .../wmsa/edge/search/EdgeSearchOperator.java | 19 +++++++++++++ .../command/commands/BrowseCommand.java | 15 +++++----- .../command/commands/SearchCommand.java | 10 +++++-- .../search/model/DecoratedSearchResults.java | 4 ++- .../search/results/BrowseResultCleaner.java | 28 +++++++++++++++++++ .../templates/edge/search-results.hdb | 1 + 9 files changed, 100 insertions(+), 20 deletions(-) create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/results/BrowseResultCleaner.java diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/EdgeDataStoreDao.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/EdgeDataStoreDao.java index 2f309b07..5bc65659 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/EdgeDataStoreDao.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/EdgeDataStoreDao.java @@ -2,14 +2,13 @@ package nu.marginalia.wmsa.edge.data.dao; import com.google.inject.ImplementedBy; import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklist; -import nu.marginalia.wmsa.edge.model.*; -import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState; +import nu.marginalia.wmsa.edge.model.EdgeDomain; +import nu.marginalia.wmsa.edge.model.EdgeId; +import nu.marginalia.wmsa.edge.model.EdgeUrl; import nu.marginalia.wmsa.edge.model.search.EdgeUrlDetails; import nu.marginalia.wmsa.edge.search.model.BrowseResult; -import java.util.Collection; import java.util.List; -import java.util.Optional; @ImplementedBy(EdgeDataStoreDaoImpl.class) public interface EdgeDataStoreDao { @@ -18,6 +17,9 @@ public interface 
EdgeDataStoreDao { List getDomainNeighborsAdjacent(EdgeId domainId, EdgeDomainBlacklist backlist, int count); List getRandomDomains(int count, EdgeDomainBlacklist backlist); + + List getBrowseResultFromUrlIds(List> urlId); + List getUrlDetailsMulti(List> ids); EdgeDomain getDomain(EdgeId id); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/EdgeDataStoreDaoImpl.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/EdgeDataStoreDaoImpl.java index c73089b0..c14b5f0e 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/EdgeDataStoreDaoImpl.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/EdgeDataStoreDaoImpl.java @@ -19,6 +19,7 @@ import org.slf4j.LoggerFactory; import java.sql.SQLException; import java.util.*; +import java.util.stream.Collectors; public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao { @@ -264,6 +265,31 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao { return domains; } + @Override + public List getBrowseResultFromUrlIds(List> urlId) { + List ret = new ArrayList<>(urlId.size()); + + try (var conn = dataSource.getConnection()) { + try (var stmt = conn.createStatement()) { + // this is safe, string concatenation is of integers + String inStmt = urlId.stream().map(id -> Integer.toString(id.id())).collect(Collectors.joining(", ", "(", ")")); + + var rsp = stmt.executeQuery("SELECT DOMAIN_ID, DOMAIN_NAME FROM EC_URL WHERE ID IN " + inStmt); + while (rsp.next()) { + int id = rsp.getInt(1); + String domain = rsp.getString(2); + + ret.add(new BrowseResult(new EdgeDomain(domain).toRootUrl(), id)); + } + } + } + catch (SQLException ex) { + logger.error("SQL error", ex); + } + + return ret; + } + @Override @SneakyThrows public EdgeDomain getDomain(EdgeId id) { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeUrlDetails.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeUrlDetails.java index
d46aa79e..a0b5ec5e 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeUrlDetails.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeUrlDetails.java @@ -2,8 +2,8 @@ package nu.marginalia.wmsa.edge.model.search; import lombok.*; import nu.marginalia.wmsa.edge.converting.processor.logic.HtmlFeature; -import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState; import nu.marginalia.wmsa.edge.model.EdgeUrl; +import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState; import nu.marginalia.wmsa.edge.search.model.EdgeSearchRankingSymbols; import java.util.Objects; @@ -21,12 +21,9 @@ public class EdgeUrlDetails { public String format; public int features; - - - public String ip; // BROKEN + public String ip; public EdgeDomainIndexingState domainState; - public int dataHash; public EdgePageScoreAdjustment urlQualityAdjustment; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchOperator.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchOperator.java index 7ba8f167..e1ea2cb3 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchOperator.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchOperator.java @@ -15,6 +15,8 @@ import nu.marginalia.wmsa.edge.model.EdgeDomain; import nu.marginalia.wmsa.edge.model.EdgeId; import nu.marginalia.wmsa.edge.model.EdgeUrl; import nu.marginalia.wmsa.edge.model.search.*; +import nu.marginalia.wmsa.edge.model.search.domain.EdgeDomainSearchSpecification; +import nu.marginalia.wmsa.edge.search.model.BrowseResult; import nu.marginalia.wmsa.edge.search.model.DecoratedSearchResultSet; import nu.marginalia.wmsa.edge.search.model.DecoratedSearchResults; import nu.marginalia.wmsa.edge.search.query.QueryFactory; @@ -97,15 +99,32 @@ public class EdgeSearchOperator { String evalResult = getEvalResult(eval); + List domainResults = getDomainResults(ctx, 
processedQuery.specs); + return new DecoratedSearchResults(params, getProblems(ctx, params.humanQuery(), evalResult, queryResults, processedQuery), evalResult, definitions.onErrorReturn((e) -> new WikiArticles()).blockingFirst(), queryResults.resultSet, + domainResults, processedQuery.domain, getDomainId(processedQuery.domain)); } + private List getDomainResults(Context ctx, EdgeSearchSpecification specs) { + List keywords = specs.subqueries.stream().filter(sq -> sq.searchTermsExclude.isEmpty() && sq.searchTermsInclude.size() == 1) + .findFirst().map(sq -> sq.searchTermsInclude).orElseGet(Collections::emptyList); /* take the include terms: the filter above guarantees the exclude list is empty, so mapping to it could never yield a keyword */ + + if (keywords.size() == 1) { + var request = new EdgeDomainSearchSpecification(specs.buckets.get(0), IndexBlock.TitleKeywords, keywords.get(0), 10_000, 10, 5); + var response = indexClient.queryDomains(ctx, request); + + return edgeDataStoreDao.getBrowseResultFromUrlIds(response.results); + } + + return Collections.emptyList(); + } + private String getEvalResult(@Nullable Future eval) { if (eval == null || eval.isCancelled()) { return ""; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/commands/BrowseCommand.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/commands/BrowseCommand.java index ba5714ea..268cc18e 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/commands/BrowseCommand.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/commands/BrowseCommand.java @@ -6,10 +6,10 @@ import nu.marginalia.wmsa.edge.assistant.screenshot.ScreenshotService; import nu.marginalia.wmsa.edge.data.dao.EdgeDataStoreDao; import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklist; import nu.marginalia.wmsa.edge.model.EdgeDomain; -import nu.marginalia.wmsa.edge.model.EdgeId; import nu.marginalia.wmsa.edge.search.command.SearchCommandInterface; import nu.marginalia.wmsa.edge.search.command.SearchParameters; import
nu.marginalia.wmsa.edge.search.model.BrowseResultSet; +import nu.marginalia.wmsa.edge.search.results.BrowseResultCleaner; import nu.marginalia.wmsa.renderer.mustache.MustacheRenderer; import nu.marginalia.wmsa.renderer.mustache.RendererFactory; import org.slf4j.Logger; @@ -28,6 +28,7 @@ public class BrowseCommand implements SearchCommandInterface { private final ScreenshotService screenshotService; private final EdgeDomainBlacklist blacklist; private final MustacheRenderer browseResultsRenderer; + private final BrowseResultCleaner browseResultCleaner; private final Logger logger = LoggerFactory.getLogger(getClass()); private final Predicate queryPatternPredicate = Pattern.compile("^browse:[.A-Za-z\\-0-9]+$").asPredicate(); @@ -35,12 +36,14 @@ public class BrowseCommand implements SearchCommandInterface { public BrowseCommand(EdgeDataStoreDao edgeDataStoreDao, ScreenshotService screenshotService, EdgeDomainBlacklist blacklist, - RendererFactory rendererFactory) + RendererFactory rendererFactory, + BrowseResultCleaner browseResultCleaner) throws IOException { this.edgeDataStoreDao = edgeDataStoreDao; this.screenshotService = screenshotService; this.blacklist = blacklist; + this.browseResultCleaner = browseResultCleaner; browseResultsRenderer = rendererFactory.renderer("edge/browse-results"); } @@ -66,9 +69,7 @@ public class BrowseCommand implements SearchCommandInterface { if ("random".equals(word)) { var results = edgeDataStoreDao.getRandomDomains(25, blacklist); - results.removeIf(res -> - !screenshotService.hasScreenshot(new EdgeId<>(res.domainId)) - || !domainHashes.add(res.domainHash())); + results.removeIf(browseResultCleaner.shouldRemoveResultPredicate()); return new BrowseResultSet(results); } @@ -76,9 +77,7 @@ public class BrowseCommand implements SearchCommandInterface { var domain = edgeDataStoreDao.getDomainId(new EdgeDomain(word)); var neighbors = edgeDataStoreDao.getDomainNeighborsAdjacent(domain, blacklist, 45); - neighbors.removeIf(res -> - 
!screenshotService.hasScreenshot(new EdgeId<>(res.domainId)) - || !domainHashes.add(res.domainHash())); + neighbors.removeIf(browseResultCleaner.shouldRemoveResultPredicate()); return new BrowseResultSet(neighbors); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/commands/SearchCommand.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/commands/SearchCommand.java index 67df810a..f394da31 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/commands/SearchCommand.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/commands/SearchCommand.java @@ -10,6 +10,7 @@ import nu.marginalia.wmsa.edge.search.command.SearchCommandInterface; import nu.marginalia.wmsa.edge.search.command.SearchParameters; import nu.marginalia.wmsa.edge.search.model.DecoratedSearchResults; import nu.marginalia.wmsa.edge.search.query.model.EdgeUserSearchParameters; +import nu.marginalia.wmsa.edge.search.results.BrowseResultCleaner; import nu.marginalia.wmsa.renderer.mustache.MustacheRenderer; import nu.marginalia.wmsa.renderer.mustache.RendererFactory; @@ -24,17 +25,21 @@ public class SearchCommand implements SearchCommandInterface { private final EdgeSearchOperator searchOperator; private final UnitConversion unitConversion; private final MustacheRenderer searchResultsRenderer; + private final BrowseResultCleaner browseResultCleaner; @Inject public SearchCommand(EdgeDomainBlacklist blacklist, EdgeDataStoreDao dataStoreDao, EdgeSearchOperator searchOperator, UnitConversion unitConversion, - RendererFactory rendererFactory) throws IOException { + RendererFactory rendererFactory, + BrowseResultCleaner browseResultCleaner + ) throws IOException { this.blacklist = blacklist; this.dataStoreDao = dataStoreDao; this.searchOperator = searchOperator; this.unitConversion = unitConversion; + this.browseResultCleaner = browseResultCleaner; searchResultsRenderer =
rendererFactory.renderer("edge/search-results"); } @@ -46,7 +51,8 @@ public class SearchCommand implements SearchCommandInterface { EdgeUserSearchParameters params = new EdgeUserSearchParameters(query, parameters.profile(), parameters.js()); DecoratedSearchResults results = searchOperator.doSearch(ctx, params, eval); - results.getResults().removeIf(detail -> blacklist.isBlacklisted(dataStoreDao.getDomainId(detail.url.domain))); + results.results.removeIf(detail -> blacklist.isBlacklisted(dataStoreDao.getDomainId(detail.url.domain))); + results.domainResults.removeIf(browseResultCleaner.shouldRemoveResultPredicate()); return Optional.of(searchResultsRenderer.render(results)); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/model/DecoratedSearchResults.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/model/DecoratedSearchResults.java index 30288099..fb9717af 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/model/DecoratedSearchResults.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/model/DecoratedSearchResults.java @@ -14,7 +14,9 @@ public class DecoratedSearchResults { private final List problems; private final String evalResult; private final WikiArticles wiki; - private final List results; + + public final List results; + public final List domainResults; private final String focusDomain; private final int focusDomainId; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/results/BrowseResultCleaner.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/results/BrowseResultCleaner.java new file mode 100644 index 00000000..c54442ff --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/results/BrowseResultCleaner.java @@ -0,0 +1,28 @@ +package nu.marginalia.wmsa.edge.search.results; + +import com.google.inject.Inject; +import com.google.inject.Singleton; +import nu.marginalia.wmsa.edge.assistant.screenshot.ScreenshotService; 
+import nu.marginalia.wmsa.edge.model.EdgeId; +import nu.marginalia.wmsa.edge.search.model.BrowseResult; + +import java.util.HashSet; +import java.util.Set; +import java.util.function.Predicate; + +@Singleton +public class BrowseResultCleaner { /* Supplies the removal filter applied to lists of BrowseResult. */ + private final ScreenshotService screenshotService; + + @Inject + public BrowseResultCleaner(ScreenshotService screenshotService) { + this.screenshotService = screenshotService; + } + + public Predicate shouldRemoveResultPredicate() { /* Returns a stateful predicate; every call creates its own set of seen domain hashes. */ + Set domainHashes = new HashSet<>(100); + + return (res) -> !screenshotService.hasScreenshot(new EdgeId<>(res.domainId)) + || !domainHashes.add(res.domainHash()); /* true (remove) when there is no screenshot, or when this domain hash was already added by an earlier result */ + } +} diff --git a/marginalia_nu/src/main/resources/templates/edge/search-results.hdb b/marginalia_nu/src/main/resources/templates/edge/search-results.hdb index 82d7f707..19688a48 100644 --- a/marginalia_nu/src/main/resources/templates/edge/search-results.hdb +++ b/marginalia_nu/src/main/resources/templates/edge/search-results.hdb @@ -37,6 +37,7 @@ {{/if}} + {{#each domainResults}}{{>edge/browse-result}}{{/each}} {{#each results}}{{>edge/search-result}}{{/each}} {{#unless evalResult}}{{#if problems}}

Suggestions

    {{#each problems}}
  • {{{.}}}
  • {{/each}}
{{/if}}{{/unless}}