diff --git a/code/features-index/index-forward/src/main/java/nu/marginalia/index/forward/ParamMatchingQueryFilter.java b/code/features-index/index-forward/src/main/java/nu/marginalia/index/forward/ParamMatchingQueryFilter.java index 6e320ef6..8d22516b 100644 --- a/code/features-index/index-forward/src/main/java/nu/marginalia/index/forward/ParamMatchingQueryFilter.java +++ b/code/features-index/index-forward/src/main/java/nu/marginalia/index/forward/ParamMatchingQueryFilter.java @@ -20,7 +20,7 @@ public class ParamMatchingQueryFilter implements QueryFilterStepIf { int domainId = forwardIndexReader.getDomainId(urlId); long meta = forwardIndexReader.getDocMeta(urlId); - if (!validateDomain(domainId)) { + if (!validateDomain(domainId, meta)) { return false; } @@ -43,8 +43,8 @@ public class ParamMatchingQueryFilter implements QueryFilterStepIf { return true; } - private boolean validateDomain(int domainId) { - return params.searchSet().contains(domainId); + private boolean validateDomain(int domainId, long meta) { + return params.searchSet().contains(domainId, meta); } private boolean validateQuality(long meta) { diff --git a/code/features-index/index-query/src/main/java/nu/marginalia/index/searchset/SearchSet.java b/code/features-index/index-query/src/main/java/nu/marginalia/index/searchset/SearchSet.java index 0cc40e4d..529950e7 100644 --- a/code/features-index/index-query/src/main/java/nu/marginalia/index/searchset/SearchSet.java +++ b/code/features-index/index-query/src/main/java/nu/marginalia/index/searchset/SearchSet.java @@ -1,6 +1,12 @@ package nu.marginalia.index.searchset; public interface SearchSet { - boolean contains(int urlId); + + /** + * Returns true if the given urlId is contained in the set + * or if the documentMetadata satisfies the set's own inclusion criteria + * + */ + boolean contains(int urlId, long documentMetadata); } diff --git a/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/searchset/RankingSearchSet.java 
b/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/searchset/RankingSearchSet.java index 777fd33f..1367afe4 100644 --- a/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/searchset/RankingSearchSet.java +++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/searchset/RankingSearchSet.java @@ -3,6 +3,8 @@ package nu.marginalia.index.svc.searchset; import it.unimi.dsi.fastutil.ints.IntOpenHashSet; import nu.marginalia.index.client.model.query.SearchSetIdentifier; import nu.marginalia.index.searchset.SearchSet; +import nu.marginalia.model.idx.DocumentFlags; +import nu.marginalia.model.idx.DocumentMetadata; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -13,7 +15,7 @@ import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.StandardOpenOption; -/** A serializable bit map of domains +/** A serializable bit map of domains corresponding to a method of ranking the domains * * @see SearchSetIdentifier * @@ -61,10 +63,27 @@ public class RankingSearchSet implements SearchSet { } @Override - public boolean contains(int urlId) { - // Fallback on allow-all if no items are in set + public boolean contains(int urlId, long documentMetadata) { + // For ranked search sets, exclude excessively commercial sites + // TODO: Maybe this particular check should be moved up to the search service and be opt-in? 
+ if (DocumentMetadata.hasFlags(documentMetadata, DocumentFlags.GeneratorSpammy.asBit())) { + return false; + } - return set.contains(urlId) || set.isEmpty(); + // This is the main check + if (set.contains(urlId) || set.isEmpty()) { + return true; + } + + // For the rest, let through some domains that are not in the set based on the generator tag + if (identifier == SearchSetIdentifier.SMALLWEB) { + return DocumentMetadata.hasFlags(documentMetadata, DocumentFlags.GeneratorBlog.asBit()); + } + if (identifier == SearchSetIdentifier.RETRO) { + return DocumentMetadata.hasFlags(documentMetadata, DocumentFlags.GeneratorVintage.asBit()); + } + + return DocumentMetadata.hasFlags(documentMetadata, DocumentFlags.GeneratorForumWiki.asBit()); } public void write() throws IOException { diff --git a/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/searchset/SearchSetAny.java b/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/searchset/SearchSetAny.java index 63b433ac..2f457974 100644 --- a/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/searchset/SearchSetAny.java +++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/searchset/SearchSetAny.java @@ -4,7 +4,7 @@ import nu.marginalia.index.searchset.SearchSet; public class SearchSetAny implements SearchSet { @Override - public boolean contains(int urlId) { + public boolean contains(int urlId, long meta) { return true; } diff --git a/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/searchset/SmallSearchSet.java b/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/searchset/SmallSearchSet.java index 8d261df8..37cc07e0 100644 --- a/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/searchset/SmallSearchSet.java +++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/searchset/SmallSearchSet.java @@ -6,6 +6,7 @@ import nu.marginalia.index.searchset.SearchSet; import 
java.util.Arrays; import java.util.Collection; +/** A specialized search set for a small number of entries, for use when specifying the exact domains to query */ public class SmallSearchSet implements SearchSet { public TIntHashSet entries; @@ -14,7 +15,7 @@ public class SmallSearchSet implements SearchSet { } @Override - public boolean contains(int domainId) { + public boolean contains(int domainId, long meta) { return entries.contains(domainId); } diff --git a/code/services-core/index-service/src/test/java/nu/marginalia/index/svc/searchset/RankingSearchSetTest.java b/code/services-core/index-service/src/test/java/nu/marginalia/index/svc/searchset/RankingSearchSetTest.java index 0618db1c..67a72797 100644 --- a/code/services-core/index-service/src/test/java/nu/marginalia/index/svc/searchset/RankingSearchSetTest.java +++ b/code/services-core/index-service/src/test/java/nu/marginalia/index/svc/searchset/RankingSearchSetTest.java @@ -26,10 +26,10 @@ class RankingSearchSetTest { set.write(); RankingSearchSet set2 = new RankingSearchSet(SearchSetIdentifier.ACADEMIA, p); - assertTrue(set2.contains(1)); - assertTrue(set2.contains(5)); - assertTrue(set2.contains(7)); - assertTrue(set2.contains(9)); + assertTrue(set2.contains(1, 0)); + assertTrue(set2.contains(5, 0)); + assertTrue(set2.contains(7, 0)); + assertTrue(set2.contains(9, 0)); Files.delete(p);