Merge pull request 'Experimental domain-searching feature' (#45) from master into release

Reviewed-on: https://git.marginalia.nu/marginalia/marginalia.nu/pulls/45
This commit is contained in:
Viktor Lofgren 2022-07-28 17:59:28 +02:00
commit 6f5b610043
9 changed files with 100 additions and 20 deletions

View File

@ -2,14 +2,13 @@ package nu.marginalia.wmsa.edge.data.dao;
import com.google.inject.ImplementedBy;
import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklist;
import nu.marginalia.wmsa.edge.model.*;
import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState;
import nu.marginalia.wmsa.edge.model.EdgeDomain;
import nu.marginalia.wmsa.edge.model.EdgeId;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import nu.marginalia.wmsa.edge.model.search.EdgeUrlDetails;
import nu.marginalia.wmsa.edge.search.model.BrowseResult;
import java.util.Collection;
import java.util.List;
import java.util.Optional;
@ImplementedBy(EdgeDataStoreDaoImpl.class)
public interface EdgeDataStoreDao {
@ -18,6 +17,9 @@ public interface EdgeDataStoreDao {
List<BrowseResult> getDomainNeighborsAdjacent(EdgeId<EdgeDomain> domainId, EdgeDomainBlacklist backlist, int count);
List<BrowseResult> getRandomDomains(int count, EdgeDomainBlacklist backlist);
List<BrowseResult> getBrowseResultFromUrlIds(List<EdgeId<EdgeUrl>> urlId);
List<EdgeUrlDetails> getUrlDetailsMulti(List<EdgeId<EdgeUrl>> ids);
EdgeDomain getDomain(EdgeId<EdgeDomain> id);

View File

@ -19,6 +19,7 @@ import org.slf4j.LoggerFactory;
import java.sql.SQLException;
import java.util.*;
import java.util.stream.Collectors;
public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
@ -264,6 +265,31 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
return domains;
}
/**
 * Looks up the domain id and domain name for each of the given URL ids and
 * wraps every returned row in a {@link BrowseResult} pointing at the domain's root URL.
 *
 * @param urlId ids of the URLs to look up; may be empty
 * @return a mutable list with one entry per row returned by the query; empty when
 *         {@code urlId} is empty. If the query fails, the error is logged and
 *         whatever was collected before the failure is returned.
 */
@Override
public List<BrowseResult> getBrowseResultFromUrlIds(List<EdgeId<EdgeUrl>> urlId) {
    List<BrowseResult> ret = new ArrayList<>(urlId.size());

    // An empty id list would render as "IN ()", which is not valid SQL and would
    // only produce a logged SQLException; skip the round trip entirely.
    if (urlId.isEmpty()) {
        return ret;
    }

    try (var conn = dataSource.getConnection();
         var stmt = conn.createStatement()) {
        // this is safe, string concatenation is of integers
        String inStmt = urlId.stream()
                .map(id -> Integer.toString(id.id()))
                .collect(Collectors.joining(", ", "(", ")"));

        try (var rsp = stmt.executeQuery("SELECT DOMAIN_ID, DOMAIN_NAME FROM EC_URL WHERE ID IN " + inStmt)) {
            while (rsp.next()) {
                int id = rsp.getInt(1);
                String domain = rsp.getString(2);

                ret.add(new BrowseResult(new EdgeDomain(domain).toRootUrl(), id));
            }
        }
    }
    catch (SQLException ex) {
        logger.error("SQL error", ex);
    }

    return ret;
}
@Override
@SneakyThrows
public EdgeDomain getDomain(EdgeId<EdgeDomain> id) {

View File

@ -2,8 +2,8 @@ package nu.marginalia.wmsa.edge.model.search;
import lombok.*;
import nu.marginalia.wmsa.edge.converting.processor.logic.HtmlFeature;
import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState;
import nu.marginalia.wmsa.edge.search.model.EdgeSearchRankingSymbols;
import java.util.Objects;
@ -21,12 +21,9 @@ public class EdgeUrlDetails {
public String format;
public int features;
public String ip; // BROKEN
public String ip;
public EdgeDomainIndexingState domainState;
public int dataHash;
public EdgePageScoreAdjustment urlQualityAdjustment;

View File

@ -15,6 +15,8 @@ import nu.marginalia.wmsa.edge.model.EdgeDomain;
import nu.marginalia.wmsa.edge.model.EdgeId;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import nu.marginalia.wmsa.edge.model.search.*;
import nu.marginalia.wmsa.edge.model.search.domain.EdgeDomainSearchSpecification;
import nu.marginalia.wmsa.edge.search.model.BrowseResult;
import nu.marginalia.wmsa.edge.search.model.DecoratedSearchResultSet;
import nu.marginalia.wmsa.edge.search.model.DecoratedSearchResults;
import nu.marginalia.wmsa.edge.search.query.QueryFactory;
@ -97,15 +99,32 @@ public class EdgeSearchOperator {
String evalResult = getEvalResult(eval);
List<BrowseResult> domainResults = getDomainResults(ctx, processedQuery.specs);
return new DecoratedSearchResults(params,
getProblems(ctx, params.humanQuery(), evalResult, queryResults, processedQuery),
evalResult,
definitions.onErrorReturn((e) -> new WikiArticles()).blockingFirst(),
queryResults.resultSet,
domainResults,
processedQuery.domain,
getDomainId(processedQuery.domain));
}
/**
 * Runs a domain search for queries that boil down to a single keyword.
 *
 * The first subquery that has exactly one include term and no exclude terms
 * supplies the keyword. If no such subquery exists, no domain search is made.
 *
 * @param ctx   request context passed through to the index client
 * @param specs the processed search specification
 * @return browse results for the URL ids returned by the domain query, or an
 *         empty list when the query is not a single-keyword query
 */
private List<BrowseResult> getDomainResults(Context ctx, EdgeSearchSpecification specs) {
    // Take the include terms of the matching subquery. The filter guarantees the
    // exclude list is empty, so mapping to it could never yield a keyword.
    List<String> keywords = specs.subqueries.stream()
            .filter(sq -> sq.searchTermsExclude.isEmpty() && sq.searchTermsInclude.size() == 1)
            .findFirst()
            .map(sq -> sq.searchTermsInclude)
            .orElseGet(Collections::emptyList);

    if (keywords.size() == 1) {
        var request = new EdgeDomainSearchSpecification(specs.buckets.get(0), IndexBlock.TitleKeywords, keywords.get(0), 10_000, 10, 5);
        var response = indexClient.queryDomains(ctx, request);

        return edgeDataStoreDao.getBrowseResultFromUrlIds(response.results);
    }

    return Collections.emptyList();
}
private String getEvalResult(@Nullable Future<String> eval) {
if (eval == null || eval.isCancelled()) {
return "";

View File

@ -6,10 +6,10 @@ import nu.marginalia.wmsa.edge.assistant.screenshot.ScreenshotService;
import nu.marginalia.wmsa.edge.data.dao.EdgeDataStoreDao;
import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklist;
import nu.marginalia.wmsa.edge.model.EdgeDomain;
import nu.marginalia.wmsa.edge.model.EdgeId;
import nu.marginalia.wmsa.edge.search.command.SearchCommandInterface;
import nu.marginalia.wmsa.edge.search.command.SearchParameters;
import nu.marginalia.wmsa.edge.search.model.BrowseResultSet;
import nu.marginalia.wmsa.edge.search.results.BrowseResultCleaner;
import nu.marginalia.wmsa.renderer.mustache.MustacheRenderer;
import nu.marginalia.wmsa.renderer.mustache.RendererFactory;
import org.slf4j.Logger;
@ -28,6 +28,7 @@ public class BrowseCommand implements SearchCommandInterface {
private final ScreenshotService screenshotService;
private final EdgeDomainBlacklist blacklist;
private final MustacheRenderer<BrowseResultSet> browseResultsRenderer;
private final BrowseResultCleaner browseResultCleaner;
private final Logger logger = LoggerFactory.getLogger(getClass());
private final Predicate<String> queryPatternPredicate = Pattern.compile("^browse:[.A-Za-z\\-0-9]+$").asPredicate();
@ -35,12 +36,14 @@ public class BrowseCommand implements SearchCommandInterface {
public BrowseCommand(EdgeDataStoreDao edgeDataStoreDao,
ScreenshotService screenshotService,
EdgeDomainBlacklist blacklist,
RendererFactory rendererFactory)
RendererFactory rendererFactory,
BrowseResultCleaner browseResultCleaner)
throws IOException
{
this.edgeDataStoreDao = edgeDataStoreDao;
this.screenshotService = screenshotService;
this.blacklist = blacklist;
this.browseResultCleaner = browseResultCleaner;
browseResultsRenderer = rendererFactory.renderer("edge/browse-results");
}
@ -66,9 +69,7 @@ public class BrowseCommand implements SearchCommandInterface {
if ("random".equals(word)) {
var results = edgeDataStoreDao.getRandomDomains(25, blacklist);
results.removeIf(res ->
!screenshotService.hasScreenshot(new EdgeId<>(res.domainId))
|| !domainHashes.add(res.domainHash()));
results.removeIf(browseResultCleaner.shouldRemoveResultPredicate());
return new BrowseResultSet(results);
}
@ -76,9 +77,7 @@ public class BrowseCommand implements SearchCommandInterface {
var domain = edgeDataStoreDao.getDomainId(new EdgeDomain(word));
var neighbors = edgeDataStoreDao.getDomainNeighborsAdjacent(domain, blacklist, 45);
neighbors.removeIf(res ->
!screenshotService.hasScreenshot(new EdgeId<>(res.domainId))
|| !domainHashes.add(res.domainHash()));
neighbors.removeIf(browseResultCleaner.shouldRemoveResultPredicate());
return new BrowseResultSet(neighbors);
}

View File

@ -10,6 +10,7 @@ import nu.marginalia.wmsa.edge.search.command.SearchCommandInterface;
import nu.marginalia.wmsa.edge.search.command.SearchParameters;
import nu.marginalia.wmsa.edge.search.model.DecoratedSearchResults;
import nu.marginalia.wmsa.edge.search.query.model.EdgeUserSearchParameters;
import nu.marginalia.wmsa.edge.search.results.BrowseResultCleaner;
import nu.marginalia.wmsa.renderer.mustache.MustacheRenderer;
import nu.marginalia.wmsa.renderer.mustache.RendererFactory;
@ -24,17 +25,21 @@ public class SearchCommand implements SearchCommandInterface {
private final EdgeSearchOperator searchOperator;
private final UnitConversion unitConversion;
private final MustacheRenderer<DecoratedSearchResults> searchResultsRenderer;
private BrowseResultCleaner browseResultCleaner;
@Inject
public SearchCommand(EdgeDomainBlacklist blacklist,
EdgeDataStoreDao dataStoreDao,
EdgeSearchOperator searchOperator,
UnitConversion unitConversion,
RendererFactory rendererFactory) throws IOException {
RendererFactory rendererFactory,
BrowseResultCleaner browseResultCleaner
) throws IOException {
this.blacklist = blacklist;
this.dataStoreDao = dataStoreDao;
this.searchOperator = searchOperator;
this.unitConversion = unitConversion;
this.browseResultCleaner = browseResultCleaner;
searchResultsRenderer = rendererFactory.renderer("edge/search-results");
}
@ -46,7 +51,8 @@ public class SearchCommand implements SearchCommandInterface {
EdgeUserSearchParameters params = new EdgeUserSearchParameters(query, parameters.profile(), parameters.js());
DecoratedSearchResults results = searchOperator.doSearch(ctx, params, eval);
results.getResults().removeIf(detail -> blacklist.isBlacklisted(dataStoreDao.getDomainId(detail.url.domain)));
results.results.removeIf(detail -> blacklist.isBlacklisted(dataStoreDao.getDomainId(detail.url.domain)));
results.domainResults.removeIf(browseResultCleaner.shouldRemoveResultPredicate());
return Optional.of(searchResultsRenderer.render(results));
}

View File

@ -14,7 +14,9 @@ public class DecoratedSearchResults {
private final List<String> problems;
private final String evalResult;
private final WikiArticles wiki;
private final List<EdgeUrlDetails> results;
public final List<EdgeUrlDetails> results;
public final List<BrowseResult> domainResults;
private final String focusDomain;
private final int focusDomainId;

View File

@ -0,0 +1,28 @@
package nu.marginalia.wmsa.edge.search.results;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import nu.marginalia.wmsa.edge.assistant.screenshot.ScreenshotService;
import nu.marginalia.wmsa.edge.model.EdgeId;
import nu.marginalia.wmsa.edge.search.model.BrowseResult;
import java.util.HashSet;
import java.util.Set;
import java.util.function.Predicate;
/**
 * Builds filters that weed out browse results which have no screenshot or
 * which repeat a domain hash already seen in the same list.
 */
@Singleton
public class BrowseResultCleaner {
    private final ScreenshotService screenshotService;

    @Inject
    public BrowseResultCleaner(ScreenshotService screenshotService) {
        this.screenshotService = screenshotService;
    }

    /**
     * Creates a new removal predicate, suitable for {@code Collection.removeIf}.
     *
     * The predicate answers {@code true} (remove) when the result's domain has no
     * screenshot, or when its domain hash has already been recorded by this same
     * predicate instance. Each call returns a predicate with its own, initially
     * empty, set of seen hashes, so a fresh predicate should be requested for
     * every list that is cleaned.
     *
     * @return a stateful predicate that is true for results that should be dropped
     */
    public Predicate<BrowseResult> shouldRemoveResultPredicate() {
        final Set<String> seenHashes = new HashSet<>(100);

        return candidate -> {
            // Results without a screenshot are dropped before their hash is recorded.
            if (!screenshotService.hasScreenshot(new EdgeId<>(candidate.domainId))) {
                return true;
            }
            // add() returns false when the hash was already present, i.e. a duplicate.
            return !seenHashes.add(candidate.domainHash());
        };
    }
}

View File

@ -37,6 +37,7 @@
</section>
{{/if}}
{{#each domainResults}}{{>edge/browse-result}}{{/each}}
{{#each results}}{{>edge/search-result}}{{/each}}
{{#unless evalResult}}{{#if problems}}<section class="card problems onlyscreen"><h2>Suggestions</h2><ul class="onlyscreen search-problems">{{#each problems}}<li>{{{.}}}</li>{{/each}}</ul></section> {{/if}}{{/unless}}