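Use the document's generator tag to complement the search set's document selection.

This will let through, e.g., a page built with a modern static site generator (SSG) in the small web filter, even when its domain is not in the set.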
This commit is contained in:
Viktor Lofgren 2023-06-22 17:21:33 +02:00
parent b5ef67ed28
commit 55c65f0935
6 changed files with 40 additions and 14 deletions

View File

@ -20,7 +20,7 @@ public class ParamMatchingQueryFilter implements QueryFilterStepIf {
int domainId = forwardIndexReader.getDomainId(urlId);
long meta = forwardIndexReader.getDocMeta(urlId);
if (!validateDomain(domainId)) {
if (!validateDomain(domainId, meta)) {
return false;
}
@ -43,8 +43,8 @@ public class ParamMatchingQueryFilter implements QueryFilterStepIf {
return true;
}
private boolean validateDomain(int domainId) {
return params.searchSet().contains(domainId);
/** Asks the search set held by {@code params} whether the document passes,
 * handing it both the domain id and the document metadata.
 *
 * @param domainId domain id read from the forward index for the document
 * @param meta document metadata read from the forward index for the document
 * @return the answer of {@code params.searchSet().contains(domainId, meta)}
 */
private boolean validateDomain(int domainId, long meta) {
return params.searchSet().contains(domainId, meta);
}
private boolean validateQuality(long meta) {

View File

@ -1,6 +1,12 @@
package nu.marginalia.index.searchset;
public interface SearchSet {
boolean contains(int urlId);
/**
 * Decides whether a document passes this search set.
 *
 * Implementations may answer from set membership of the id alone, or may
 * also consult the metadata to admit or reject a document regardless of
 * membership.
 *
 * @param urlId id to look up in the set. NOTE(review): the caller visible
 *              alongside this interface passes a domain id here; confirm
 *              which kind of id is intended
 * @param documentMetadata metadata of the document being tested
 * @return true if the document should be let through
 */
boolean contains(int urlId, long documentMetadata);
}

View File

@ -3,6 +3,8 @@ package nu.marginalia.index.svc.searchset;
import it.unimi.dsi.fastutil.ints.IntOpenHashSet;
import nu.marginalia.index.client.model.query.SearchSetIdentifier;
import nu.marginalia.index.searchset.SearchSet;
import nu.marginalia.model.idx.DocumentFlags;
import nu.marginalia.model.idx.DocumentMetadata;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -13,7 +15,7 @@ import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
/** A serializable bit map of domains
/** A serializable bit map of domains corresponding to a method of ranking the domains
*
* @see SearchSetIdentifier
*
@ -61,10 +63,27 @@ public class RankingSearchSet implements SearchSet {
}
/** Tests a document against this ranked set in three steps: a document
 * flagged GeneratorSpammy is always rejected; otherwise membership in the
 * set (or an empty set) accepts it; otherwise a generator flag selected by
 * this set's identifier decides.
 *
 * @param urlId id looked up in the backing set
 * @param documentMetadata metadata whose DocumentFlags bits are tested
 * @return true if the document is let through
 */
@Override
public boolean contains(int urlId) {
// Fallback on allow-all if no items are in set
public boolean contains(int urlId, long documentMetadata) {
// For ranked search sets, exclude excessively commercial sites
// (anything carrying the GeneratorSpammy flag), even if the id is in the set
// TODO: Maybe this particular check should be moved up to the search service and be opt-in?
if (DocumentMetadata.hasFlags(documentMetadata, DocumentFlags.GeneratorSpammy.asBit())) {
return false;
}
return set.contains(urlId) || set.isEmpty();
// This is the main check: the id is in the set, or the set is empty
// (an empty set allows everything)
if (set.contains(urlId) || set.isEmpty()) {
return true;
}
// For the rest, let through some domains that are not in the set based on the generator tag
// SMALLWEB additionally admits documents with the GeneratorBlog flag
if (identifier == SearchSetIdentifier.SMALLWEB) {
return DocumentMetadata.hasFlags(documentMetadata, DocumentFlags.GeneratorBlog.asBit());
}
// RETRO additionally admits documents with the GeneratorVintage flag
if (identifier == SearchSetIdentifier.RETRO) {
return DocumentMetadata.hasFlags(documentMetadata, DocumentFlags.GeneratorVintage.asBit());
}
// Every other identifier additionally admits documents with the GeneratorForumWiki flag
return DocumentMetadata.hasFlags(documentMetadata, DocumentFlags.GeneratorForumWiki.asBit());
}
public void write() throws IOException {

View File

@ -4,7 +4,7 @@ import nu.marginalia.index.searchset.SearchSet;
public class SearchSetAny implements SearchSet {
@Override
public boolean contains(int urlId) {
public boolean contains(int urlId, long meta) {
return true;
}

View File

@ -6,6 +6,7 @@ import nu.marginalia.index.searchset.SearchSet;
import java.util.Arrays;
import java.util.Collection;
/** A specialized search set for a small number of entries, for use when specifying the exact domains to query */
public class SmallSearchSet implements SearchSet {
public TIntHashSet entries;
@ -14,7 +15,7 @@ public class SmallSearchSet implements SearchSet {
}
@Override
public boolean contains(int domainId) {
/** Plain membership test against {@code entries}.
 *
 * @param domainId id looked up in {@code entries}
 * @param meta not consulted by this implementation
 * @return true if {@code entries} contains {@code domainId}
 */
public boolean contains(int domainId, long meta) {
return entries.contains(domainId);
}

View File

@ -26,10 +26,10 @@ class RankingSearchSetTest {
set.write();
RankingSearchSet set2 = new RankingSearchSet(SearchSetIdentifier.ACADEMIA, p);
assertTrue(set2.contains(1));
assertTrue(set2.contains(5));
assertTrue(set2.contains(7));
assertTrue(set2.contains(9));
assertTrue(set2.contains(1, 0));
assertTrue(set2.contains(5, 0));
assertTrue(set2.contains(7, 0));
assertTrue(set2.contains(9, 0));
Files.delete(p);