Use the document's generator metadata to complement the search set's document selection.
This will let through, e.g., a site built with a modern static site generator (SSG) in the small web filter.
This commit is contained in:
parent
b5ef67ed28
commit
55c65f0935
@ -20,7 +20,7 @@ public class ParamMatchingQueryFilter implements QueryFilterStepIf {
|
||||
int domainId = forwardIndexReader.getDomainId(urlId);
|
||||
long meta = forwardIndexReader.getDocMeta(urlId);
|
||||
|
||||
if (!validateDomain(domainId)) {
|
||||
if (!validateDomain(domainId, meta)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
@ -43,8 +43,8 @@ public class ParamMatchingQueryFilter implements QueryFilterStepIf {
|
||||
return true;
|
||||
}
|
||||
|
||||
private boolean validateDomain(int domainId) {
|
||||
return params.searchSet().contains(domainId);
|
||||
private boolean validateDomain(int domainId, long meta) {
|
||||
return params.searchSet().contains(domainId, meta);
|
||||
}
|
||||
|
||||
private boolean validateQuality(long meta) {
|
||||
|
@ -1,6 +1,12 @@
|
||||
package nu.marginalia.index.searchset;
|
||||
|
||||
public interface SearchSet {
|
||||
boolean contains(int urlId);
|
||||
|
||||
/**
|
||||
* Returns true if the given urlId is contained in the set
|
||||
* or if the documentMetadata otherwise qualifies the document for the set
|
||||
*
|
||||
*/
|
||||
boolean contains(int urlId, long documentMetadata);
|
||||
|
||||
}
|
||||
|
@ -3,6 +3,8 @@ package nu.marginalia.index.svc.searchset;
|
||||
import it.unimi.dsi.fastutil.ints.IntOpenHashSet;
|
||||
import nu.marginalia.index.client.model.query.SearchSetIdentifier;
|
||||
import nu.marginalia.index.searchset.SearchSet;
|
||||
import nu.marginalia.model.idx.DocumentFlags;
|
||||
import nu.marginalia.model.idx.DocumentMetadata;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
@ -13,7 +15,7 @@ import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.StandardOpenOption;
|
||||
|
||||
/** A serializable bit map of domains
|
||||
/** A serializable bit map of domains corresponding to a method of ranking the domains
|
||||
*
|
||||
* @see SearchSetIdentifier
|
||||
*
|
||||
@ -61,10 +63,27 @@ public class RankingSearchSet implements SearchSet {
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean contains(int urlId) {
|
||||
// Fallback on allow-all if no items are in set
|
||||
public boolean contains(int urlId, long documentMetadata) {
|
||||
// For ranked search sets, exclude excessively commercial sites
|
||||
// TODO: Maybe this particular check should be moved up to the search service and be opt-in?
|
||||
if (DocumentMetadata.hasFlags(documentMetadata, DocumentFlags.GeneratorSpammy.asBit())) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return set.contains(urlId) || set.isEmpty();
|
||||
// This is the main check
|
||||
if (set.contains(urlId) || set.isEmpty()) {
|
||||
return true;
|
||||
}
|
||||
|
||||
// For the rest, let through some domains that are not in the set based on the generator tag
|
||||
if (identifier == SearchSetIdentifier.SMALLWEB) {
|
||||
return DocumentMetadata.hasFlags(documentMetadata, DocumentFlags.GeneratorBlog.asBit());
|
||||
}
|
||||
if (identifier == SearchSetIdentifier.RETRO) {
|
||||
return DocumentMetadata.hasFlags(documentMetadata, DocumentFlags.GeneratorVintage.asBit());
|
||||
}
|
||||
|
||||
return DocumentMetadata.hasFlags(documentMetadata, DocumentFlags.GeneratorForumWiki.asBit());
|
||||
}
|
||||
|
||||
public void write() throws IOException {
|
||||
|
@ -4,7 +4,7 @@ import nu.marginalia.index.searchset.SearchSet;
|
||||
|
||||
public class SearchSetAny implements SearchSet {
|
||||
@Override
|
||||
public boolean contains(int urlId) {
|
||||
public boolean contains(int urlId, long meta) {
|
||||
return true;
|
||||
}
|
||||
|
||||
|
@ -6,6 +6,7 @@ import nu.marginalia.index.searchset.SearchSet;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collection;
|
||||
|
||||
/** A specialized search set for a small number of entries, for use when specifying the exact domains to query */
|
||||
public class SmallSearchSet implements SearchSet {
|
||||
public TIntHashSet entries;
|
||||
|
||||
@ -14,7 +15,7 @@ public class SmallSearchSet implements SearchSet {
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean contains(int domainId) {
|
||||
public boolean contains(int domainId, long meta) {
|
||||
return entries.contains(domainId);
|
||||
}
|
||||
|
||||
|
@ -26,10 +26,10 @@ class RankingSearchSetTest {
|
||||
set.write();
|
||||
|
||||
RankingSearchSet set2 = new RankingSearchSet(SearchSetIdentifier.ACADEMIA, p);
|
||||
assertTrue(set2.contains(1));
|
||||
assertTrue(set2.contains(5));
|
||||
assertTrue(set2.contains(7));
|
||||
assertTrue(set2.contains(9));
|
||||
assertTrue(set2.contains(1, 0));
|
||||
assertTrue(set2.contains(5, 0));
|
||||
assertTrue(set2.contains(7, 0));
|
||||
assertTrue(set2.contains(9, 0));
|
||||
|
||||
Files.delete(p);
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user