Word feature bits for words that appear in the URL, a new search profile for plain text files, and better plain text titles.
parent 2bc212d65c
commit 722ff3bffb
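In outline: keywords that appear in a document's URL path or domain name are now marked with dedicated word flag bits, which ranking and query code can then test cheaply. A minimal standalone sketch of the flag mechanics (the enum here is hypothetical and trimmed; the real flags live in EdgePageWordFlags below):

    // Hypothetical, trimmed mirror of the flag plumbing in this commit: a word
    // seen in the URL gets a flag bit in its encoded metadata, and query-time
    // code later tests for that bit.
    enum WordFlagSketch {
        Title, UrlPath, UrlDomain;

        int asBit() { return 1 << ordinal(); }
    }

    class FlagDemo {
        public static void main(String[] args) {
            long metadata = WordFlagSketch.UrlPath.asBit() | WordFlagSketch.Title.asBit();

            // Analogous to EdgePageWordFlags.UrlPath.isPresent(metadata)
            boolean inUrl = (metadata & WordFlagSketch.UrlPath.asBit()) != 0;
            System.out.println(inUrl); // true
        }
    }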
@@ -58,6 +58,14 @@ public record EdgeSearchResultKeywordScore(int set,
             sum -= 1;
         }
 
+        if (hasTermFlag(EdgePageWordFlags.UrlDomain)) {
+            sum -= 5;
+        }
+
+        if (hasTermFlag(EdgePageWordFlags.UrlPath)) {
+            sum -= 5;
+        }
+
         double tfIdf = WordMetadata.decodeTfidf(encodedWordMetadata);
         int positionBits = WordMetadata.decodePositions(encodedWordMetadata);
 
@@ -33,7 +33,18 @@ public enum EdgePageWordFlags {
     /** Word is important to adjacent documents
      * @see SiteWords
      * */
-    SiteAdjacent;
+    SiteAdjacent,
+
+    /** Keyword appears in URL path
+     *
+     */
+    UrlPath,
+
+    /** Keyword appears in domain name
+     *
+     */
+    UrlDomain
+    ;
 
     public int asBit() {
         return 1 << ordinal();
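Since asBit() derives each flag's bit from ordinal(), appending UrlPath and UrlDomain after the existing constants keeps previously encoded metadata stable. A sketch of that property (enum trimmed to the constants visible here, so the concrete bit values differ from the real enum's):

    enum FlagOrderSketch {
        SiteAdjacent, UrlPath, UrlDomain;

        public int asBit() { return 1 << ordinal(); }
    }

    class BitOrderDemo {
        public static void main(String[] args) {
            // New constants claim the next free bits; earlier bits are untouched.
            for (var f : FlagOrderSketch.values()) {
                System.out.printf("%s -> 0x%x%n", f, f.asBit());
            }
        }
    }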
@@ -13,7 +13,9 @@ public record WordMetadata(int tfIdf,
                            int positions,
                            byte flags) {
 
     // 8 unused bits at the beginning
+    // Bottom 16 bits are used for flags
+    public static final long FLAGS_MASK = 0xFFFFL;
 
     public static final long TF_IDF_MASK = 0xFFFFL;
     public static final int TF_IDF_SHIFT = 16;
@@ -21,7 +23,6 @@ public record WordMetadata(int tfIdf,
     public static final int POSITIONS_SHIFT = 32;
     public static final long POSITIONS_MASK = 0xFFFF_FFFFL;
 
-    public static final long FLAGS_MASK = 0xFF;
 
 
     public WordMetadata() {
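Taken together, the masks and shifts describe a 64-bit layout: flags in bits 0-15 (FLAGS_MASK widened from 0xFF to 0xFFFF to make room for the new bits), tf-idf in bits 16-31, and the position bitmask in bits 32-63. A sketch of encode/decode under that assumption (the record's actual methods may differ in detail):

    class WordMetadataLayoutSketch {
        static long encode(int tfIdf, int positions, int flags) {
            return (flags & 0xFFFFL)
                 | ((tfIdf & 0xFFFFL) << 16)
                 | ((positions & 0xFFFF_FFFFL) << 32);
        }

        static int decodeFlags(long meta)     { return (int) (meta & 0xFFFFL); }
        static int decodeTfidf(long meta)     { return (int) ((meta >>> 16) & 0xFFFFL); }
        static int decodePositions(long meta) { return (int) ((meta >>> 32) & 0xFFFF_FFFFL); }

        public static void main(String[] args) {
            long meta = encode(120, 0b1011, 0x3);
            // Round-trips: flags=3, tfIdf=120, positions=11
            System.out.printf("0x%016x -> %d %d %d%n",
                    meta, decodeFlags(meta), decodeTfidf(meta), decodePositions(meta));
        }
    }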
@@ -9,6 +9,7 @@ import nu.marginalia.language.model.KeywordMetadata;
 import nu.marginalia.language.model.WordRep;
 import nu.marginalia.converting.model.DocumentKeywordsBuilder;
 import nu.marginalia.language.statistics.TermFrequencyDict;
+import nu.marginalia.model.EdgeUrl;
 
 import javax.inject.Inject;
 import java.util.*;
@@ -23,9 +24,12 @@ public class DocumentKeywordExtractor {
     private final ArtifactKeywords artifactKeywords;
 
     private final SimpleKeywords simpleKeywords;
 
+    private final UrlKeywords urlKeywords;
+
     private final DocumentKeywordPositionBitmaskExtractor keywordPositions;
 
 
     @Inject
     public DocumentKeywordExtractor(TermFrequencyDict dict) {
         keywordExtractor = new KeywordExtractor();
@@ -33,6 +37,7 @@ public class DocumentKeywordExtractor {
         keywordPositions = new DocumentKeywordPositionBitmaskExtractor(keywordExtractor);
         artifactKeywords = new ArtifactKeywords();
 
+        urlKeywords = new UrlKeywords();
         tfIdfCounter = new KeywordCounter(dict, keywordExtractor);
         nameCounter = new NameCounter(keywordExtractor);
         subjectCounter = new SubjectCounter(keywordExtractor);
@@ -40,7 +45,7 @@ public class DocumentKeywordExtractor {
     }
 
 
-    public DocumentKeywordsBuilder extractKeywords(DocumentLanguageData documentLanguageData) {
+    public DocumentKeywordsBuilder extractKeywords(DocumentLanguageData documentLanguageData, EdgeUrl url) {
 
         KeywordMetadata keywordMetadata = keywordPositions.getWordPositions(documentLanguageData);
 
@@ -49,12 +54,14 @@ public class DocumentKeywordExtractor {
         List<WordRep> titleWords = extractTitleWords(documentLanguageData);
         List<WordRep> wordsNamesAll = nameCounter.count(documentLanguageData, 2);
         List<WordRep> subjects = subjectCounter.count(keywordMetadata, documentLanguageData);
 
         List<String> artifacts = artifactKeywords.getArtifactKeywords(documentLanguageData);
 
-        for (var rep : titleWords) keywordMetadata.titleKeywords().add(rep.stemmed);
-        for (var rep : wordsNamesAll) keywordMetadata.namesKeywords().add(rep.stemmed);
-        for (var rep : subjects) keywordMetadata.subjectKeywords().add(rep.stemmed);
+        for (var rep : titleWords) keywordMetadata.titleKeywords.add(rep.stemmed);
+        for (var rep : wordsNamesAll) keywordMetadata.namesKeywords.add(rep.stemmed);
+        for (var rep : subjects) keywordMetadata.subjectKeywords.add(rep.stemmed);
+
+        keywordMetadata.urlKeywords.addAll(urlKeywords.getUrlKeywords(url));
+        keywordMetadata.domainKeywords.addAll(urlKeywords.getDomainKeywords(url));
 
         DocumentKeywordsBuilder wordsBuilder = new DocumentKeywordsBuilder();
 
@@ -18,7 +18,7 @@ public class DocumentKeywordPositionBitmaskExtractor {
     public KeywordMetadata getWordPositions(DocumentLanguageData dld) {
         final KeywordMetadata keywordMetadata = new KeywordMetadata();
 
-        Object2IntOpenHashMap<String> ret = keywordMetadata.positionMask();
+        Object2IntOpenHashMap<String> ret = keywordMetadata.positionMask;
 
         // Mark the title words as position 0
         for (var sent : dld.titleSentences) {
@@ -51,7 +51,7 @@ public class KeywordCounter {
             }
         }
 
-        Object2IntOpenHashMap<String> tfIdf = keywordMetadata.wordsTfIdf();
+        Object2IntOpenHashMap<String> tfIdf = keywordMetadata.wordsTfIdf;
         List<WordRep> tfIdfHigh = new ArrayList<>();
 
         int maxVal = maxValue(counts);
@@ -28,7 +28,6 @@ public class SubjectCounter {
 
     public List<WordRep> count(KeywordMetadata keywordMetadata, DocumentLanguageData dld) {
-
         Map<String, Integer> counts = new HashMap<>();
         Map<String, Set<WordRep>> instances = new HashMap<>();
 
         for (var sentence : dld.sentences) {
@@ -81,7 +80,7 @@ public class SubjectCounter {
             return sum / parts.length;
         }
 
-        return keywordMetadata.wordsTfIdf().getOrDefault(stemmed, 0);
+        return keywordMetadata.wordsTfIdf.getOrDefault(stemmed, 0);
     }
 
     private boolean isDetOrAdverbOrVerb(String posTag) {
@@ -0,0 +1,27 @@
+package nu.marginalia.converting.processor.keywords.extractors;
+
+import ca.rmen.porterstemmer.PorterStemmer;
+import nu.marginalia.model.EdgeUrl;
+
+import java.util.Arrays;
+import java.util.Set;
+import java.util.stream.Collectors;
+
+public class UrlKeywords {
+    private final PorterStemmer ps = new PorterStemmer();
+
+    public Set<String> getUrlKeywords(EdgeUrl url) {
+        String path = url.path;
+
+        return Arrays.stream(path.split("[^a-z0-9A-Z]+"))
+                .map(ps::stemWord)
+                .collect(Collectors.toSet());
+    }
+
+    public Set<String> getDomainKeywords(EdgeUrl url) {
+        return Arrays.stream(url.domain.domain.split("[^a-z0-9A-Z]+"))
+                .filter(s -> s.length() > 3)
+                .map(ps::stemWord)
+                .collect(Collectors.toSet());
+    }
+}
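A usage sketch of the new class: both methods split on runs of non-alphanumeric characters and Porter-stem each token, so URL words land in the index in the same stemmed form as body text. The URL and printed stems here are illustrative; exact output depends on the stemmer.

    // Hypothetical usage, assuming the project classes above are on the classpath.
    import nu.marginalia.converting.processor.keywords.extractors.UrlKeywords;
    import nu.marginalia.model.EdgeUrl;

    class UrlKeywordsDemo {
        public static void main(String[] args) throws Exception {
            var url = new EdgeUrl("https://memex.marginalia.nu/topic/programming.gmi");
            var urlKeywords = new UrlKeywords();

            // Stemmed path tokens, roughly {"topic", "program", "gmi"}
            System.out.println(urlKeywords.getUrlKeywords(url));

            // Stemmed domain tokens longer than 3 characters, roughly {"memex", "marginalia"}
            System.out.println(urlKeywords.getDomainKeywords(url));
        }
    }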
@@ -50,7 +50,20 @@ public class PlainTextLogic {
             return candidates.get(0).trim();
         }
 
-        return url.path.substring(url.path.lastIndexOf('/'));
+        return getFileNameFromPath(url);
+
     }
+
+    private String getFileNameFromPath(EdgeUrl url) {
+        final String path = url.path;
+
+        int lastSlash = path.lastIndexOf('/');
+
+        if (lastSlash + 1 < path.length()) {
+            return path.substring(lastSlash + 1);
+        }
+
+        return path;
+    }
 
     public boolean isSideline(String s) {
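The previous expression kept the leading slash and degenerated to "/" for paths ending in a slash; the helper now strips the separator and falls back to the full path. Boundary behavior, checked against a standalone copy of the logic:

    class FileNameDemo {
        // Standalone copy of the new helper's logic, for illustration.
        static String fileNameFromPath(String path) {
            int lastSlash = path.lastIndexOf('/');
            if (lastSlash + 1 < path.length()) {
                return path.substring(lastSlash + 1);
            }
            return path;
        }

        public static void main(String[] args) {
            System.out.println(fileNameFromPath("/docs/readme.txt")); // "readme.txt" (old code: "/readme.txt")
            System.out.println(fileNameFromPath("/docs/"));           // "/docs/"    (old code: "/")
        }
    }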
@@ -122,7 +122,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
         PubDate pubDate = pubDateSniffer.getPubDate(crawledDocument.headers, url, doc, ret.standard, true);
         ret.metadata = new DocumentMetadata(url.depth(), pubDate.yearByte(), 0, (int) -ret.quality, EnumSet.noneOf(EdgePageDocumentFlags.class));
 
-        DocumentKeywordsBuilder words = keywordExtractor.extractKeywords(dld);
+        DocumentKeywordsBuilder words = keywordExtractor.extractKeywords(dld, url);
 
         new MetaTagsBuilder()
                 .addDomainCrawlData(crawledDomain)
@@ -91,7 +91,7 @@ public class PlainTextDocumentProcessorPlugin extends AbstractDocumentProcessorP
 
         ret.metadata = new DocumentMetadata(url.depth(), pubDate.yearByte(), 0, (int) -ret.quality, EnumSet.of(EdgePageDocumentFlags.PlainText));
 
-        DocumentKeywordsBuilder words = keywordExtractor.extractKeywords(dld);
+        DocumentKeywordsBuilder words = keywordExtractor.extractKeywords(dld, url);
 
         new MetaTagsBuilder()
                 .addDomainCrawlData(crawledDomain)
@@ -11,6 +11,7 @@ import nu.marginalia.language.statistics.TermFrequencyDict;
 import nu.marginalia.language.keywords.KeywordExtractor;
 import nu.marginalia.language.model.WordSeparator;
 import nu.marginalia.WmsaHome;
+import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.crawl.EdgePageWordFlags;
 import nu.marginalia.model.idx.WordMetadata;
 import nu.marginalia.test.util.TestLanguageModels;
@@ -42,6 +43,7 @@ class SentenceExtractorTest {
     }
 
 
+    @SneakyThrows
     public static void main(String... args) throws IOException {
         final LanguageModels lm = TestLanguageModels.getLanguageModels();
 
@@ -52,6 +54,7 @@ class SentenceExtractorTest {
         SentenceExtractor se = new SentenceExtractor(lm);
 
         var dict = new TermFrequencyDict(lm);
+        var url = new EdgeUrl("https://memex.marginalia.nu/");
         DocumentKeywordExtractor documentKeywordExtractor = new DocumentKeywordExtractor(dict);
 
         for (;;) {
@@ -60,7 +63,7 @@ class SentenceExtractorTest {
                 var doc = Jsoup.parse(Files.readString(file.toPath()));
                 long start = System.currentTimeMillis();
                 var dld = se.extractSentences(doc);
-                documentKeywordExtractor.extractKeywords(dld);
+                documentKeywordExtractor.extractKeywords(dld, url);
                 total += (System.currentTimeMillis() - start);
             }
             System.out.println(total);
@@ -122,7 +125,7 @@ class SentenceExtractorTest {
                 Jsoup.parse(Files.readString(Path.of("/home/vlofgren/man open (2) openat.html"))));
 
         var dict = new TermFrequencyDict(lm);
-        System.out.println(new DocumentKeywordExtractor(dict).extractKeywords(result));
+        System.out.println(new DocumentKeywordExtractor(dict).extractKeywords(result, new EdgeUrl("https://memex.marginalia.nu/")));
     }
 
     @Test
@@ -0,0 +1,17 @@
+package nu.marginalia.ranking.accumulator;
+
+import it.unimi.dsi.fastutil.ints.IntOpenHashSet;
+
+public class RankingResultHashSetAccumulator implements RankingResultAccumulator<IntOpenHashSet> {
+    private final IntOpenHashSet result = new IntOpenHashSet();
+
+    @Override
+    public void add(int domainId, int rank) {
+        result.add(domainId);
+    }
+
+    @Override
+    public IntOpenHashSet get() {
+        return result;
+    }
+}
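The accumulator abstraction lets the page-rank routine stay agnostic about its result type: it feeds (domainId, rank) pairs into a supplier-provided accumulator and returns whatever get() yields. A condensed sketch of that contract, with the interface shape inferred from the implementation above:

    import it.unimi.dsi.fastutil.ints.IntOpenHashSet;
    import java.util.function.Supplier;

    class AccumulatorDemo {
        interface Accumulator<T> {            // stand-in for RankingResultAccumulator<T>
            void add(int domainId, int rank);
            T get();
        }

        static <T> T rankAll(int[] domainIds, Supplier<Accumulator<T>> factory) {
            var acc = factory.get();
            for (int i = 0; i < domainIds.length; i++) {
                acc.add(domainIds[i], i);     // rank: position in the ordering
            }
            return acc.get();
        }

        public static void main(String[] args) {
            IntOpenHashSet ids = rankAll(new int[] {5, 9, 5}, () -> new Accumulator<IntOpenHashSet>() {
                private final IntOpenHashSet result = new IntOpenHashSet();
                public void add(int domainId, int rank) { result.add(domainId); }
                public IntOpenHashSet get() { return result; }
            });
            System.out.println(ids); // e.g. {5, 9}; duplicates collapse
        }
    }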
@@ -7,6 +7,8 @@ public enum QueryStrategy {
     REQUIRE_FIELD_SITE,
     REQUIRE_FIELD_TITLE,
     REQUIRE_FIELD_SUBJECT,
+    REQUIRE_FIELD_URL,
+    REQUIRE_FIELD_DOMAIN,
 
     AUTO
 }
@@ -4,10 +4,13 @@ import nu.marginalia.index.journal.model.IndexJournalEntryData;
 import nu.marginalia.model.crawl.EdgePageWordFlags;
 
 public class ReverseIndexPriorityParameters {
-    private static final long highPriorityFlags = EdgePageWordFlags.Title.asBit()
+    private static final long highPriorityFlags =
+            EdgePageWordFlags.Title.asBit()
             | EdgePageWordFlags.Subjects.asBit()
             | EdgePageWordFlags.TfIdfHigh.asBit()
             | EdgePageWordFlags.NamesWords.asBit()
+            | EdgePageWordFlags.UrlDomain.asBit()
+            | EdgePageWordFlags.UrlPath.asBit()
             | EdgePageWordFlags.Site.asBit()
             | EdgePageWordFlags.SiteAdjacent.asBit();
 
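With the mask extended, URL hits become eligible for the priority index. A word qualifies when its flag bits intersect highPriorityFlags; a minimal check, assuming the project's EdgePageWordFlags is on the classpath:

    import nu.marginalia.model.crawl.EdgePageWordFlags;

    class PriorityCheckDemo {
        public static void main(String[] args) {
            // Abbreviated mask, same construction as above.
            long highPriorityFlags = EdgePageWordFlags.Title.asBit()
                    | EdgePageWordFlags.UrlDomain.asBit()
                    | EdgePageWordFlags.UrlPath.asBit();

            long wordFlags = EdgePageWordFlags.UrlPath.asBit();

            // Any overlap between the word's flags and the mask means high priority.
            System.out.println((wordFlags & highPriorityFlags) != 0); // true
        }
    }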
@@ -11,11 +11,16 @@ import java.util.Objects;
 public final class KeywordMetadata {
 
     private static final WordFrequencyData empty = new WordFrequencyData(0);
-    private final HashSet<String> titleKeywords = new HashSet<>(50);
-    private final HashSet<String> subjectKeywords = new HashSet<>(10);
-    private final HashSet<String> namesKeywords = new HashSet<>(50);
-    private final Object2IntOpenHashMap<String> wordsTfIdf;
-    private final Object2IntOpenHashMap<String> positionMask;
+    public final HashSet<String> titleKeywords = new HashSet<>(50);
+    public final HashSet<String> subjectKeywords = new HashSet<>(10);
+    public final HashSet<String> namesKeywords = new HashSet<>(50);
+
+    public final HashSet<String> urlKeywords = new HashSet<>(10);
+
+    public final HashSet<String> domainKeywords = new HashSet<>(10);
+
+    public final Object2IntOpenHashMap<String> wordsTfIdf;
+    public final Object2IntOpenHashMap<String> positionMask;
     private final EnumSet<EdgePageWordFlags> wordFlagsTemplate;
 
     public KeywordMetadata(EnumSet<EdgePageWordFlags> flags) {
@@ -45,31 +50,17 @@ public final class KeywordMetadata {
         if (titleKeywords.contains(stemmed))
             flags.add(EdgePageWordFlags.Title);
 
+        if (urlKeywords.contains(stemmed))
+            flags.add(EdgePageWordFlags.UrlPath);
+
+        if (domainKeywords.contains(stemmed))
+            flags.add(EdgePageWordFlags.UrlDomain);
+
         int positions = positionMask.getOrDefault(stemmed, 0);
 
         return new WordMetadata(tfidf, positions, flags).encode();
     }
 
-    public HashSet<String> titleKeywords() {
-        return titleKeywords;
-    }
-
-    public HashSet<String> subjectKeywords() {
-        return subjectKeywords;
-    }
-
-    public HashSet<String> namesKeywords() {
-        return namesKeywords;
-    }
-
-    public Object2IntOpenHashMap<String> wordsTfIdf() {
-        return wordsTfIdf;
-    }
-
-    public Object2IntOpenHashMap<String> positionMask() {
-        return positionMask;
-    }
-
     public EnumSet<EdgePageWordFlags> wordFlagsTemplate() {
         return wordFlagsTemplate;
     }
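The flag assembly is plain set membership: each keyword set contributes its flag when it contains the stemmed word, and the new urlKeywords and domainKeywords sets slot into the same pattern. A condensed, hypothetical version:

    import java.util.EnumSet;
    import java.util.Set;

    class FlagAssemblySketch {
        enum Flag { Title, UrlPath, UrlDomain }

        static EnumSet<Flag> flagsFor(String stemmed,
                                      Set<String> titleKeywords,
                                      Set<String> urlKeywords,
                                      Set<String> domainKeywords) {
            var flags = EnumSet.noneOf(Flag.class);
            if (titleKeywords.contains(stemmed))  flags.add(Flag.Title);
            if (urlKeywords.contains(stemmed))    flags.add(Flag.UrlPath);
            if (domainKeywords.contains(stemmed)) flags.add(Flag.UrlDomain);
            return flags;
        }

        public static void main(String[] args) {
            // A stemmed word found only in the domain name gets just UrlDomain.
            System.out.println(flagsFor("marginalia", Set.of(), Set.of(), Set.of("marginalia")));
        }
    }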
@@ -98,7 +98,7 @@ public class SearchIndex {
         IndexQueryBuilder query =
                 switch(params.queryStrategy()) {
                     case SENTENCE -> indexReader.findWordAsSentence(orderedIncludes);
-                    case TOPIC, REQUIRE_FIELD_SITE, REQUIRE_FIELD_TITLE, REQUIRE_FIELD_SUBJECT
+                    case TOPIC, REQUIRE_FIELD_SITE, REQUIRE_FIELD_TITLE, REQUIRE_FIELD_SUBJECT, REQUIRE_FIELD_DOMAIN, REQUIRE_FIELD_URL
                             -> indexReader.findWordAsTopic(orderedIncludes);
                     case AUTO -> indexReader.findWordTopicDynamicMode(orderedIncludes);
                 };
@@ -152,6 +152,12 @@ public class IndexResultValuator {
         else if (queryStrategy == QueryStrategy.REQUIRE_FIELD_TITLE) {
             return EdgePageWordFlags.Title.isPresent(metadata);
         }
+        else if (queryStrategy == QueryStrategy.REQUIRE_FIELD_URL) {
+            return EdgePageWordFlags.UrlPath.isPresent(metadata);
+        }
+        else if (queryStrategy == QueryStrategy.REQUIRE_FIELD_DOMAIN) {
+            return EdgePageWordFlags.UrlDomain.isPresent(metadata);
+        }
         return true;
     }
 
@@ -9,6 +9,7 @@ import nu.marginalia.ranking.ReversePageRank;
 import nu.marginalia.ranking.StandardPageRank;
 import nu.marginalia.ranking.accumulator.RankingResultBitSetAccumulator;
 import nu.marginalia.ranking.accumulator.RankingResultHashMapAccumulator;
+import nu.marginalia.ranking.accumulator.RankingResultHashSetAccumulator;
 import nu.marginalia.ranking.data.RankingDomainFetcher;
 import nu.marginalia.ranking.data.RankingDomainFetcherForSimilarityData;
 import nu.marginalia.index.svc.searchset.RankingSearchSet;
@@ -101,7 +102,7 @@ public class IndexSearchSetsService {
         var entry = rankingSettings.retro;
 
         var spr = new StandardPageRank(similarityDomains, entry.domains.toArray(String[]::new));
-        var data = spr.pageRankWithPeripheralNodes(entry.max, RankingResultBitSetAccumulator::new);
+        var data = spr.pageRankWithPeripheralNodes(entry.max, RankingResultHashSetAccumulator::new);
 
         synchronized (this) {
             retroSet = new RankingSearchSet(SearchSetIdentifier.RETRO, retroSet.source, data);
@@ -115,7 +116,7 @@ public class IndexSearchSetsService {
 
         var rpr = new ReversePageRank(similarityDomains, entry.domains.toArray(String[]::new));
         rpr.setMaxKnownUrls(750);
-        var data = rpr.pageRankWithPeripheralNodes(entry.max, RankingResultBitSetAccumulator::new);
+        var data = rpr.pageRankWithPeripheralNodes(entry.max, RankingResultHashSetAccumulator::new);
 
         synchronized (this) {
             smallWebSet = new RankingSearchSet(SearchSetIdentifier.SMALLWEB, smallWebSet.source, data);
@@ -128,7 +129,7 @@ public class IndexSearchSetsService {
         var entry = rankingSettings.academia;
 
         var spr = new StandardPageRank(similarityDomains, entry.domains.toArray(String[]::new));
-        var data = spr.pageRankWithPeripheralNodes(entry.max, RankingResultBitSetAccumulator::new);
+        var data = spr.pageRankWithPeripheralNodes(entry.max, RankingResultHashSetAccumulator::new);
 
         synchronized (this) {
             academiaSet = new RankingSearchSet(SearchSetIdentifier.ACADEMIA, academiaSet.source, data);
@@ -1,8 +1,8 @@
 package nu.marginalia.index.svc.searchset;
 
+import it.unimi.dsi.fastutil.ints.IntOpenHashSet;
 import nu.marginalia.index.client.model.query.SearchSetIdentifier;
 import nu.marginalia.index.searchset.SearchSet;
-import org.roaringbitmap.RoaringBitmap;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -21,11 +21,11 @@ import java.nio.file.StandardOpenOption;
 public class RankingSearchSet implements SearchSet {
 
     private final Logger logger = LoggerFactory.getLogger(getClass());
-    private final RoaringBitmap set;
+    private final IntOpenHashSet set;
     public final SearchSetIdentifier identifier;
     public final Path source;
 
-    public RankingSearchSet(SearchSetIdentifier identifier, Path source, RoaringBitmap set) {
+    public RankingSearchSet(SearchSetIdentifier identifier, Path source, IntOpenHashSet set) {
         this.identifier = identifier;
         this.source = source;
         this.set = set;
@@ -36,7 +36,7 @@ public class RankingSearchSet implements SearchSet {
         this.source = source;
 
         if (!Files.exists(source)) {
-            set = new RoaringBitmap();
+            set = new IntOpenHashSet();
         }
         else {
             set = load(source);
@@ -47,8 +47,8 @@ public class RankingSearchSet implements SearchSet {
         }
     }
 
-    private static RoaringBitmap load(Path source) throws IOException {
-        var set = new RoaringBitmap();
+    private static IntOpenHashSet load(Path source) throws IOException {
+        var set = new IntOpenHashSet();
         try (var ds = new DataInputStream(Files.newInputStream(source, StandardOpenOption.READ))) {
             for (;;) {
                 try {
@@ -73,8 +73,8 @@ public class RankingSearchSet implements SearchSet {
                 StandardOpenOption.CREATE,
                 StandardOpenOption.TRUNCATE_EXISTING)))
         {
-            for (var iter = set.getIntIterator(); iter.hasNext();) {
-                ds.writeInt(iter.next());
+            for (var iter = set.intIterator(); iter.hasNext();) {
+                ds.writeInt(iter.nextInt());
             }
         }
     }
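The iterator change follows from the type swap: fastutil's IntIterator offers nextInt(), which returns a primitive and avoids the Integer boxing that next() would incur on every element. A minimal sketch of the write loop:

    import it.unimi.dsi.fastutil.ints.IntOpenHashSet;
    import java.io.ByteArrayOutputStream;
    import java.io.DataOutputStream;
    import java.io.IOException;

    class SerializeDemo {
        public static void main(String[] args) throws IOException {
            var set = new IntOpenHashSet(new int[] {1, 2, 3});

            var bytes = new ByteArrayOutputStream();
            try (var ds = new DataOutputStream(bytes)) {
                // nextInt() returns a primitive int, no boxing
                for (var iter = set.intIterator(); iter.hasNext(); ) {
                    ds.writeInt(iter.nextInt());
                }
            }
            System.out.println(bytes.size()); // 12: four bytes per int
        }
    }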
@@ -17,9 +17,9 @@ public enum SearchProfile {
     CORPO_CLEAN("corpo-clean", SearchSetIdentifier.NONE),
     ACADEMIA("academia", SearchSetIdentifier.ACADEMIA),
 
+    PLAIN_TEXT("plain-text", SearchSetIdentifier.NONE),
     FOOD("food", SearchSetIdentifier.NONE),
     CRAFTS("crafts", SearchSetIdentifier.NONE),
 
     CLASSICS("classics", SearchSetIdentifier.NONE),
     ;
@@ -55,6 +55,9 @@
             subquery.searchTermsPriority.add("format:html123");
             subquery.searchTermsPriority.add("js:false");
         }
+        if (this == PLAIN_TEXT) {
+            subquery.searchTermsInclude.add("format:plain");
+        }
         if (this == FOOD) {
             subquery.searchTermsInclude.add(HtmlFeature.CATEGORY_FOOD.getKeyword());
         }
@@ -282,6 +282,8 @@ public class QueryFactory {
             case "RF_TITLE" -> QueryStrategy.REQUIRE_FIELD_TITLE;
             case "RF_SUBJECT" -> QueryStrategy.REQUIRE_FIELD_SUBJECT;
             case "RF_SITE" -> QueryStrategy.REQUIRE_FIELD_SITE;
+            case "RF_URL" -> QueryStrategy.REQUIRE_FIELD_URL;
+            case "RF_DOMAIN" -> QueryStrategy.REQUIRE_FIELD_DOMAIN;
             case "SENTENCE" -> QueryStrategy.SENTENCE;
             case "TOPIC" -> QueryStrategy.TOPIC;
             default -> QueryStrategy.AUTO;
@@ -152,6 +152,14 @@ public class SearchResultValuator {
         else if (theKeyword.wordMetadata.hasFlag(EdgePageWordFlags.Site)) {
             return totalFactor * 0.7;
         }
+
+        if (theKeyword.wordMetadata.hasFlag(EdgePageWordFlags.UrlDomain)) {
+            return totalFactor * 0.8;
+        }
+        else if (theKeyword.wordMetadata.hasFlag(EdgePageWordFlags.UrlPath)) {
+            return totalFactor * 0.9;
+        }
+
         return totalFactor;
     }
 
@@ -212,6 +220,8 @@ public class SearchResultValuator {
         final boolean siteAdjacent = flags.contains(EdgePageWordFlags.SiteAdjacent);
         final boolean subject = flags.contains(EdgePageWordFlags.Subjects);
         final boolean names = flags.contains(EdgePageWordFlags.NamesWords);
+        final boolean urlDomain = flags.contains(EdgePageWordFlags.UrlDomain);
+        final boolean urlPath = flags.contains(EdgePageWordFlags.UrlPath);
 
         if (title) {
             if (titleLength <= 64) {
@@ -236,6 +246,13 @@ public class SearchResultValuator {
             f *= Math.pow(0.8, k);
         }
 
+        if (urlDomain) {
+            f *= Math.pow(0.8, k);
+        }
+        else if (urlPath) {
+            f *= Math.pow(0.9, k);
+        }
+
         if (!title && !subject && names) {
             f *= Math.pow(0.9, k);
         }
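As with the other flag bonuses, the URL factors compound with the exponent k: a UrlDomain match scales f by 0.8^k and a UrlPath match by 0.9^k, so with k = 2 the factors come to about 0.64 and 0.81 (a lower factor presumably being the stronger boost, consistent with the Site flag's 0.7 above). A two-line check of the arithmetic:

    class WeightDemo {
        public static void main(String[] args) {
            double k = 2; // exponent used by the surrounding valuation code
            System.out.println(Math.pow(0.8, k)); // ~0.64, UrlDomain match
            System.out.println(Math.pow(0.9, k)); // ~0.81, UrlPath match
        }
    }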
@@ -11,9 +11,12 @@
             <option {{#eq profile "default"}}selected{{/eq}} value="default">Popular Sites</option>
             <option {{#eq profile "modern"}}selected{{/eq}} value="modern">Blogs and Personal Websites</option>
             <option {{#eq profile "academia"}}selected{{/eq}} value="academia">Academia</option>
-            <option {{#eq profile "vintage"}}selected{{/eq}} value="vintage">Web 1.0</option>
             <option {{#eq profile "corpo"}}selected{{/eq}} value="corpo">No Domain Ranking</option>
         </optgroup>
+        <optgroup label="Vintage">
+            <option {{#eq profile "vintage"}}selected{{/eq}} value="vintage">Web 1.0</option>
+            <option {{#eq profile "plain-text"}}selected{{/eq}} value="plain-text">Text Files</option>
+        </optgroup>
         <optgroup label="Topics Search">
             <option {{#eq profile "food"}}selected{{/eq}} value="food">Recipes 🍳</option>
             <option {{#eq profile "crafts"}}selected{{/eq}} value="crafts">Crafts 🪡🔨 (WIP; mostly textile-craft)</option>