Word feature bit for words that appear in the URL, new search profile for plain text files, better plain text titles.
This commit is contained in:
parent
2bc212d65c
commit
722ff3bffb
@ -58,6 +58,14 @@ public record EdgeSearchResultKeywordScore(int set,
|
|||||||
sum -= 1;
|
sum -= 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (hasTermFlag(EdgePageWordFlags.UrlDomain)) {
|
||||||
|
sum -= 5;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (hasTermFlag(EdgePageWordFlags.UrlPath)) {
|
||||||
|
sum -= 5;
|
||||||
|
}
|
||||||
|
|
||||||
double tfIdf = WordMetadata.decodeTfidf(encodedWordMetadata);
|
double tfIdf = WordMetadata.decodeTfidf(encodedWordMetadata);
|
||||||
int positionBits = WordMetadata.decodePositions(encodedWordMetadata);
|
int positionBits = WordMetadata.decodePositions(encodedWordMetadata);
|
||||||
|
|
||||||
|
@ -33,7 +33,18 @@ public enum EdgePageWordFlags {
|
|||||||
/** Word is important to adjacent documents
|
/** Word is important to adjacent documents
|
||||||
* @see SiteWords
|
* @see SiteWords
|
||||||
* */
|
* */
|
||||||
SiteAdjacent;
|
SiteAdjacent,
|
||||||
|
|
||||||
|
/** Keyword appears in URL path
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
UrlPath,
|
||||||
|
|
||||||
|
/** Keyword appears in domain name
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
UrlDomain
|
||||||
|
;
|
||||||
|
|
||||||
public int asBit() {
|
public int asBit() {
|
||||||
return 1 << ordinal();
|
return 1 << ordinal();
|
||||||
|
@ -13,7 +13,9 @@ public record WordMetadata(int tfIdf,
|
|||||||
int positions,
|
int positions,
|
||||||
byte flags) {
|
byte flags) {
|
||||||
|
|
||||||
// 8 unused bits at the beginning
|
// Bottom 16 bits are used for flags
|
||||||
|
|
||||||
|
public static final long FLAGS_MASK = 0xFFFFL;
|
||||||
|
|
||||||
public static final long TF_IDF_MASK = 0xFFFFL;
|
public static final long TF_IDF_MASK = 0xFFFFL;
|
||||||
public static final int TF_IDF_SHIFT = 16;
|
public static final int TF_IDF_SHIFT = 16;
|
||||||
@ -21,7 +23,6 @@ public record WordMetadata(int tfIdf,
|
|||||||
public static final int POSITIONS_SHIFT = 32;
|
public static final int POSITIONS_SHIFT = 32;
|
||||||
public static final long POSITIONS_MASK = 0xFFFF_FFFFL;
|
public static final long POSITIONS_MASK = 0xFFFF_FFFFL;
|
||||||
|
|
||||||
public static final long FLAGS_MASK = 0xFF;
|
|
||||||
|
|
||||||
|
|
||||||
public WordMetadata() {
|
public WordMetadata() {
|
||||||
|
@ -9,6 +9,7 @@ import nu.marginalia.language.model.KeywordMetadata;
|
|||||||
import nu.marginalia.language.model.WordRep;
|
import nu.marginalia.language.model.WordRep;
|
||||||
import nu.marginalia.converting.model.DocumentKeywordsBuilder;
|
import nu.marginalia.converting.model.DocumentKeywordsBuilder;
|
||||||
import nu.marginalia.language.statistics.TermFrequencyDict;
|
import nu.marginalia.language.statistics.TermFrequencyDict;
|
||||||
|
import nu.marginalia.model.EdgeUrl;
|
||||||
|
|
||||||
import javax.inject.Inject;
|
import javax.inject.Inject;
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
@ -23,9 +24,12 @@ public class DocumentKeywordExtractor {
|
|||||||
private final ArtifactKeywords artifactKeywords;
|
private final ArtifactKeywords artifactKeywords;
|
||||||
|
|
||||||
private final SimpleKeywords simpleKeywords;
|
private final SimpleKeywords simpleKeywords;
|
||||||
|
|
||||||
|
private final UrlKeywords urlKeywords;
|
||||||
private final DocumentKeywordPositionBitmaskExtractor keywordPositions;
|
private final DocumentKeywordPositionBitmaskExtractor keywordPositions;
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@Inject
|
@Inject
|
||||||
public DocumentKeywordExtractor(TermFrequencyDict dict) {
|
public DocumentKeywordExtractor(TermFrequencyDict dict) {
|
||||||
keywordExtractor = new KeywordExtractor();
|
keywordExtractor = new KeywordExtractor();
|
||||||
@ -33,6 +37,7 @@ public class DocumentKeywordExtractor {
|
|||||||
keywordPositions = new DocumentKeywordPositionBitmaskExtractor(keywordExtractor);
|
keywordPositions = new DocumentKeywordPositionBitmaskExtractor(keywordExtractor);
|
||||||
artifactKeywords = new ArtifactKeywords();
|
artifactKeywords = new ArtifactKeywords();
|
||||||
|
|
||||||
|
urlKeywords = new UrlKeywords();
|
||||||
tfIdfCounter = new KeywordCounter(dict, keywordExtractor);
|
tfIdfCounter = new KeywordCounter(dict, keywordExtractor);
|
||||||
nameCounter = new NameCounter(keywordExtractor);
|
nameCounter = new NameCounter(keywordExtractor);
|
||||||
subjectCounter = new SubjectCounter(keywordExtractor);
|
subjectCounter = new SubjectCounter(keywordExtractor);
|
||||||
@ -40,7 +45,7 @@ public class DocumentKeywordExtractor {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public DocumentKeywordsBuilder extractKeywords(DocumentLanguageData documentLanguageData) {
|
public DocumentKeywordsBuilder extractKeywords(DocumentLanguageData documentLanguageData, EdgeUrl url) {
|
||||||
|
|
||||||
KeywordMetadata keywordMetadata = keywordPositions.getWordPositions(documentLanguageData);
|
KeywordMetadata keywordMetadata = keywordPositions.getWordPositions(documentLanguageData);
|
||||||
|
|
||||||
@ -49,12 +54,14 @@ public class DocumentKeywordExtractor {
|
|||||||
List<WordRep> titleWords = extractTitleWords(documentLanguageData);
|
List<WordRep> titleWords = extractTitleWords(documentLanguageData);
|
||||||
List<WordRep> wordsNamesAll = nameCounter.count(documentLanguageData, 2);
|
List<WordRep> wordsNamesAll = nameCounter.count(documentLanguageData, 2);
|
||||||
List<WordRep> subjects = subjectCounter.count(keywordMetadata, documentLanguageData);
|
List<WordRep> subjects = subjectCounter.count(keywordMetadata, documentLanguageData);
|
||||||
|
|
||||||
List<String> artifacts = artifactKeywords.getArtifactKeywords(documentLanguageData);
|
List<String> artifacts = artifactKeywords.getArtifactKeywords(documentLanguageData);
|
||||||
|
|
||||||
for (var rep : titleWords) keywordMetadata.titleKeywords().add(rep.stemmed);
|
for (var rep : titleWords) keywordMetadata.titleKeywords.add(rep.stemmed);
|
||||||
for (var rep : wordsNamesAll) keywordMetadata.namesKeywords().add(rep.stemmed);
|
for (var rep : wordsNamesAll) keywordMetadata.namesKeywords.add(rep.stemmed);
|
||||||
for (var rep : subjects) keywordMetadata.subjectKeywords().add(rep.stemmed);
|
for (var rep : subjects) keywordMetadata.subjectKeywords.add(rep.stemmed);
|
||||||
|
|
||||||
|
keywordMetadata.urlKeywords.addAll(urlKeywords.getUrlKeywords(url));
|
||||||
|
keywordMetadata.domainKeywords.addAll(urlKeywords.getDomainKeywords(url));
|
||||||
|
|
||||||
DocumentKeywordsBuilder wordsBuilder = new DocumentKeywordsBuilder();
|
DocumentKeywordsBuilder wordsBuilder = new DocumentKeywordsBuilder();
|
||||||
|
|
||||||
|
@ -18,7 +18,7 @@ public class DocumentKeywordPositionBitmaskExtractor {
|
|||||||
public KeywordMetadata getWordPositions(DocumentLanguageData dld) {
|
public KeywordMetadata getWordPositions(DocumentLanguageData dld) {
|
||||||
final KeywordMetadata keywordMetadata = new KeywordMetadata();
|
final KeywordMetadata keywordMetadata = new KeywordMetadata();
|
||||||
|
|
||||||
Object2IntOpenHashMap<String> ret = keywordMetadata.positionMask();
|
Object2IntOpenHashMap<String> ret = keywordMetadata.positionMask;
|
||||||
|
|
||||||
// Mark the title words as position 0
|
// Mark the title words as position 0
|
||||||
for (var sent : dld.titleSentences) {
|
for (var sent : dld.titleSentences) {
|
||||||
|
@ -51,7 +51,7 @@ public class KeywordCounter {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
Object2IntOpenHashMap<String> tfIdf = keywordMetadata.wordsTfIdf();
|
Object2IntOpenHashMap<String> tfIdf = keywordMetadata.wordsTfIdf;
|
||||||
List<WordRep> tfIdfHigh = new ArrayList<>();
|
List<WordRep> tfIdfHigh = new ArrayList<>();
|
||||||
|
|
||||||
int maxVal = maxValue(counts);
|
int maxVal = maxValue(counts);
|
||||||
|
@ -28,7 +28,6 @@ public class SubjectCounter {
|
|||||||
|
|
||||||
public List<WordRep> count(KeywordMetadata keywordMetadata, DocumentLanguageData dld) {
|
public List<WordRep> count(KeywordMetadata keywordMetadata, DocumentLanguageData dld) {
|
||||||
|
|
||||||
Map<String, Integer> counts = new HashMap<>();
|
|
||||||
Map<String, Set<WordRep>> instances = new HashMap<>();
|
Map<String, Set<WordRep>> instances = new HashMap<>();
|
||||||
|
|
||||||
for (var sentence : dld.sentences) {
|
for (var sentence : dld.sentences) {
|
||||||
@ -81,7 +80,7 @@ public class SubjectCounter {
|
|||||||
return sum / parts.length;
|
return sum / parts.length;
|
||||||
}
|
}
|
||||||
|
|
||||||
return keywordMetadata.wordsTfIdf().getOrDefault(stemmed, 0);
|
return keywordMetadata.wordsTfIdf.getOrDefault(stemmed, 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
private boolean isDetOrAdverbOrVerb(String posTag) {
|
private boolean isDetOrAdverbOrVerb(String posTag) {
|
||||||
|
@ -0,0 +1,27 @@
|
|||||||
|
package nu.marginalia.converting.processor.keywords.extractors;
|
||||||
|
|
||||||
|
import ca.rmen.porterstemmer.PorterStemmer;
|
||||||
|
import nu.marginalia.model.EdgeUrl;
|
||||||
|
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.Set;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
|
public class UrlKeywords {
|
||||||
|
private final PorterStemmer ps = new PorterStemmer();
|
||||||
|
|
||||||
|
public Set<String> getUrlKeywords(EdgeUrl url) {
|
||||||
|
String path = url.path;
|
||||||
|
|
||||||
|
return Arrays.stream(path.split("[^a-z0-9A-Z]+"))
|
||||||
|
.map(ps::stemWord)
|
||||||
|
.collect(Collectors.toSet());
|
||||||
|
}
|
||||||
|
|
||||||
|
public Set<String> getDomainKeywords(EdgeUrl url) {
|
||||||
|
return Arrays.stream(url.domain.domain.split("[^a-z0-9A-Z]+"))
|
||||||
|
.filter(s -> s.length() > 3)
|
||||||
|
.map(ps::stemWord)
|
||||||
|
.collect(Collectors.toSet());
|
||||||
|
}
|
||||||
|
}
|
@ -50,7 +50,20 @@ public class PlainTextLogic {
|
|||||||
return candidates.get(0).trim();
|
return candidates.get(0).trim();
|
||||||
}
|
}
|
||||||
|
|
||||||
return url.path.substring(url.path.lastIndexOf('/'));
|
return getFileNameFromPath(url);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
private String getFileNameFromPath(EdgeUrl url) {
|
||||||
|
final String path = url.path;
|
||||||
|
|
||||||
|
int lastSlash = path.lastIndexOf('/');
|
||||||
|
|
||||||
|
if (lastSlash + 1 < path.length()) {
|
||||||
|
return path.substring(lastSlash + 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
return path;
|
||||||
}
|
}
|
||||||
|
|
||||||
public boolean isSideline(String s) {
|
public boolean isSideline(String s) {
|
||||||
|
@ -122,7 +122,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
|
|||||||
PubDate pubDate = pubDateSniffer.getPubDate(crawledDocument.headers, url, doc, ret.standard, true);
|
PubDate pubDate = pubDateSniffer.getPubDate(crawledDocument.headers, url, doc, ret.standard, true);
|
||||||
ret.metadata = new DocumentMetadata(url.depth(), pubDate.yearByte(), 0, (int) -ret.quality, EnumSet.noneOf(EdgePageDocumentFlags.class));
|
ret.metadata = new DocumentMetadata(url.depth(), pubDate.yearByte(), 0, (int) -ret.quality, EnumSet.noneOf(EdgePageDocumentFlags.class));
|
||||||
|
|
||||||
DocumentKeywordsBuilder words = keywordExtractor.extractKeywords(dld);
|
DocumentKeywordsBuilder words = keywordExtractor.extractKeywords(dld, url);
|
||||||
|
|
||||||
new MetaTagsBuilder()
|
new MetaTagsBuilder()
|
||||||
.addDomainCrawlData(crawledDomain)
|
.addDomainCrawlData(crawledDomain)
|
||||||
|
@ -91,7 +91,7 @@ public class PlainTextDocumentProcessorPlugin extends AbstractDocumentProcessorP
|
|||||||
|
|
||||||
ret.metadata = new DocumentMetadata(url.depth(), pubDate.yearByte(), 0, (int) -ret.quality, EnumSet.of(EdgePageDocumentFlags.PlainText));
|
ret.metadata = new DocumentMetadata(url.depth(), pubDate.yearByte(), 0, (int) -ret.quality, EnumSet.of(EdgePageDocumentFlags.PlainText));
|
||||||
|
|
||||||
DocumentKeywordsBuilder words = keywordExtractor.extractKeywords(dld);
|
DocumentKeywordsBuilder words = keywordExtractor.extractKeywords(dld, url);
|
||||||
|
|
||||||
new MetaTagsBuilder()
|
new MetaTagsBuilder()
|
||||||
.addDomainCrawlData(crawledDomain)
|
.addDomainCrawlData(crawledDomain)
|
||||||
|
@ -11,6 +11,7 @@ import nu.marginalia.language.statistics.TermFrequencyDict;
|
|||||||
import nu.marginalia.language.keywords.KeywordExtractor;
|
import nu.marginalia.language.keywords.KeywordExtractor;
|
||||||
import nu.marginalia.language.model.WordSeparator;
|
import nu.marginalia.language.model.WordSeparator;
|
||||||
import nu.marginalia.WmsaHome;
|
import nu.marginalia.WmsaHome;
|
||||||
|
import nu.marginalia.model.EdgeUrl;
|
||||||
import nu.marginalia.model.crawl.EdgePageWordFlags;
|
import nu.marginalia.model.crawl.EdgePageWordFlags;
|
||||||
import nu.marginalia.model.idx.WordMetadata;
|
import nu.marginalia.model.idx.WordMetadata;
|
||||||
import nu.marginalia.test.util.TestLanguageModels;
|
import nu.marginalia.test.util.TestLanguageModels;
|
||||||
@ -42,6 +43,7 @@ class SentenceExtractorTest {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@SneakyThrows
|
||||||
public static void main(String... args) throws IOException {
|
public static void main(String... args) throws IOException {
|
||||||
final LanguageModels lm = TestLanguageModels.getLanguageModels();
|
final LanguageModels lm = TestLanguageModels.getLanguageModels();
|
||||||
|
|
||||||
@ -52,6 +54,7 @@ class SentenceExtractorTest {
|
|||||||
SentenceExtractor se = new SentenceExtractor(lm);
|
SentenceExtractor se = new SentenceExtractor(lm);
|
||||||
|
|
||||||
var dict = new TermFrequencyDict(lm);
|
var dict = new TermFrequencyDict(lm);
|
||||||
|
var url = new EdgeUrl("https://memex.marginalia.nu/");
|
||||||
DocumentKeywordExtractor documentKeywordExtractor = new DocumentKeywordExtractor(dict);
|
DocumentKeywordExtractor documentKeywordExtractor = new DocumentKeywordExtractor(dict);
|
||||||
|
|
||||||
for (;;) {
|
for (;;) {
|
||||||
@ -60,7 +63,7 @@ class SentenceExtractorTest {
|
|||||||
var doc = Jsoup.parse(Files.readString(file.toPath()));
|
var doc = Jsoup.parse(Files.readString(file.toPath()));
|
||||||
long start = System.currentTimeMillis();
|
long start = System.currentTimeMillis();
|
||||||
var dld = se.extractSentences(doc);
|
var dld = se.extractSentences(doc);
|
||||||
documentKeywordExtractor.extractKeywords(dld);
|
documentKeywordExtractor.extractKeywords(dld, url);
|
||||||
total += (System.currentTimeMillis() - start);
|
total += (System.currentTimeMillis() - start);
|
||||||
}
|
}
|
||||||
System.out.println(total);
|
System.out.println(total);
|
||||||
@ -122,7 +125,7 @@ class SentenceExtractorTest {
|
|||||||
Jsoup.parse(Files.readString(Path.of("/home/vlofgren/man open (2) openat.html"))));
|
Jsoup.parse(Files.readString(Path.of("/home/vlofgren/man open (2) openat.html"))));
|
||||||
|
|
||||||
var dict = new TermFrequencyDict(lm);
|
var dict = new TermFrequencyDict(lm);
|
||||||
System.out.println(new DocumentKeywordExtractor(dict).extractKeywords(result));
|
System.out.println(new DocumentKeywordExtractor(dict).extractKeywords(result, new EdgeUrl("https://memex.marginalia.nu/")));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
@ -0,0 +1,17 @@
|
|||||||
|
package nu.marginalia.ranking.accumulator;
|
||||||
|
|
||||||
|
import it.unimi.dsi.fastutil.ints.IntOpenHashSet;
|
||||||
|
|
||||||
|
public class RankingResultHashSetAccumulator implements RankingResultAccumulator<IntOpenHashSet> {
|
||||||
|
private final IntOpenHashSet result = new IntOpenHashSet();
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void add(int domainId, int rank) {
|
||||||
|
result.add(domainId);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public IntOpenHashSet get() {
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
}
|
@ -7,6 +7,8 @@ public enum QueryStrategy {
|
|||||||
REQUIRE_FIELD_SITE,
|
REQUIRE_FIELD_SITE,
|
||||||
REQUIRE_FIELD_TITLE,
|
REQUIRE_FIELD_TITLE,
|
||||||
REQUIRE_FIELD_SUBJECT,
|
REQUIRE_FIELD_SUBJECT,
|
||||||
|
REQUIRE_FIELD_URL,
|
||||||
|
REQUIRE_FIELD_DOMAIN,
|
||||||
|
|
||||||
AUTO
|
AUTO
|
||||||
}
|
}
|
||||||
|
@ -4,10 +4,13 @@ import nu.marginalia.index.journal.model.IndexJournalEntryData;
|
|||||||
import nu.marginalia.model.crawl.EdgePageWordFlags;
|
import nu.marginalia.model.crawl.EdgePageWordFlags;
|
||||||
|
|
||||||
public class ReverseIndexPriorityParameters {
|
public class ReverseIndexPriorityParameters {
|
||||||
private static final long highPriorityFlags = EdgePageWordFlags.Title.asBit()
|
private static final long highPriorityFlags =
|
||||||
|
EdgePageWordFlags.Title.asBit()
|
||||||
| EdgePageWordFlags.Subjects.asBit()
|
| EdgePageWordFlags.Subjects.asBit()
|
||||||
| EdgePageWordFlags.TfIdfHigh.asBit()
|
| EdgePageWordFlags.TfIdfHigh.asBit()
|
||||||
| EdgePageWordFlags.NamesWords.asBit()
|
| EdgePageWordFlags.NamesWords.asBit()
|
||||||
|
| EdgePageWordFlags.UrlDomain.asBit()
|
||||||
|
| EdgePageWordFlags.UrlPath.asBit()
|
||||||
| EdgePageWordFlags.Site.asBit()
|
| EdgePageWordFlags.Site.asBit()
|
||||||
| EdgePageWordFlags.SiteAdjacent.asBit();
|
| EdgePageWordFlags.SiteAdjacent.asBit();
|
||||||
|
|
||||||
|
@ -11,11 +11,16 @@ import java.util.Objects;
|
|||||||
public final class KeywordMetadata {
|
public final class KeywordMetadata {
|
||||||
|
|
||||||
private static final WordFrequencyData empty = new WordFrequencyData(0);
|
private static final WordFrequencyData empty = new WordFrequencyData(0);
|
||||||
private final HashSet<String> titleKeywords = new HashSet<>(50);
|
public final HashSet<String> titleKeywords = new HashSet<>(50);
|
||||||
private final HashSet<String> subjectKeywords = new HashSet<>(10);
|
public final HashSet<String> subjectKeywords = new HashSet<>(10);
|
||||||
private final HashSet<String> namesKeywords = new HashSet<>(50);
|
public final HashSet<String> namesKeywords = new HashSet<>(50);
|
||||||
private final Object2IntOpenHashMap<String> wordsTfIdf;
|
|
||||||
private final Object2IntOpenHashMap<String> positionMask;
|
public final HashSet<String> urlKeywords = new HashSet<>(10);
|
||||||
|
|
||||||
|
public final HashSet<String> domainKeywords = new HashSet<>(10);
|
||||||
|
|
||||||
|
public final Object2IntOpenHashMap<String> wordsTfIdf;
|
||||||
|
public final Object2IntOpenHashMap<String> positionMask;
|
||||||
private final EnumSet<EdgePageWordFlags> wordFlagsTemplate;
|
private final EnumSet<EdgePageWordFlags> wordFlagsTemplate;
|
||||||
|
|
||||||
public KeywordMetadata(EnumSet<EdgePageWordFlags> flags) {
|
public KeywordMetadata(EnumSet<EdgePageWordFlags> flags) {
|
||||||
@ -45,31 +50,17 @@ public final class KeywordMetadata {
|
|||||||
if (titleKeywords.contains(stemmed))
|
if (titleKeywords.contains(stemmed))
|
||||||
flags.add(EdgePageWordFlags.Title);
|
flags.add(EdgePageWordFlags.Title);
|
||||||
|
|
||||||
|
if (urlKeywords.contains(stemmed))
|
||||||
|
flags.add(EdgePageWordFlags.UrlPath);
|
||||||
|
|
||||||
|
if (domainKeywords.contains(stemmed))
|
||||||
|
flags.add(EdgePageWordFlags.UrlDomain);
|
||||||
|
|
||||||
int positions = positionMask.getOrDefault(stemmed, 0);
|
int positions = positionMask.getOrDefault(stemmed, 0);
|
||||||
|
|
||||||
return new WordMetadata(tfidf, positions, flags).encode();
|
return new WordMetadata(tfidf, positions, flags).encode();
|
||||||
}
|
}
|
||||||
|
|
||||||
public HashSet<String> titleKeywords() {
|
|
||||||
return titleKeywords;
|
|
||||||
}
|
|
||||||
|
|
||||||
public HashSet<String> subjectKeywords() {
|
|
||||||
return subjectKeywords;
|
|
||||||
}
|
|
||||||
|
|
||||||
public HashSet<String> namesKeywords() {
|
|
||||||
return namesKeywords;
|
|
||||||
}
|
|
||||||
|
|
||||||
public Object2IntOpenHashMap<String> wordsTfIdf() {
|
|
||||||
return wordsTfIdf;
|
|
||||||
}
|
|
||||||
|
|
||||||
public Object2IntOpenHashMap<String> positionMask() {
|
|
||||||
return positionMask;
|
|
||||||
}
|
|
||||||
|
|
||||||
public EnumSet<EdgePageWordFlags> wordFlagsTemplate() {
|
public EnumSet<EdgePageWordFlags> wordFlagsTemplate() {
|
||||||
return wordFlagsTemplate;
|
return wordFlagsTemplate;
|
||||||
}
|
}
|
||||||
|
@ -98,7 +98,7 @@ public class SearchIndex {
|
|||||||
IndexQueryBuilder query =
|
IndexQueryBuilder query =
|
||||||
switch(params.queryStrategy()) {
|
switch(params.queryStrategy()) {
|
||||||
case SENTENCE -> indexReader.findWordAsSentence(orderedIncludes);
|
case SENTENCE -> indexReader.findWordAsSentence(orderedIncludes);
|
||||||
case TOPIC, REQUIRE_FIELD_SITE, REQUIRE_FIELD_TITLE, REQUIRE_FIELD_SUBJECT
|
case TOPIC, REQUIRE_FIELD_SITE, REQUIRE_FIELD_TITLE, REQUIRE_FIELD_SUBJECT, REQUIRE_FIELD_DOMAIN, REQUIRE_FIELD_URL
|
||||||
-> indexReader.findWordAsTopic(orderedIncludes);
|
-> indexReader.findWordAsTopic(orderedIncludes);
|
||||||
case AUTO -> indexReader.findWordTopicDynamicMode(orderedIncludes);
|
case AUTO -> indexReader.findWordTopicDynamicMode(orderedIncludes);
|
||||||
};
|
};
|
||||||
|
@ -152,6 +152,12 @@ public class IndexResultValuator {
|
|||||||
else if (queryStrategy == QueryStrategy.REQUIRE_FIELD_TITLE) {
|
else if (queryStrategy == QueryStrategy.REQUIRE_FIELD_TITLE) {
|
||||||
return EdgePageWordFlags.Title.isPresent(metadata);
|
return EdgePageWordFlags.Title.isPresent(metadata);
|
||||||
}
|
}
|
||||||
|
else if (queryStrategy == QueryStrategy.REQUIRE_FIELD_URL) {
|
||||||
|
return EdgePageWordFlags.UrlPath.isPresent(metadata);
|
||||||
|
}
|
||||||
|
else if (queryStrategy == QueryStrategy.REQUIRE_FIELD_DOMAIN) {
|
||||||
|
return EdgePageWordFlags.UrlDomain.isPresent(metadata);
|
||||||
|
}
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -9,6 +9,7 @@ import nu.marginalia.ranking.ReversePageRank;
|
|||||||
import nu.marginalia.ranking.StandardPageRank;
|
import nu.marginalia.ranking.StandardPageRank;
|
||||||
import nu.marginalia.ranking.accumulator.RankingResultBitSetAccumulator;
|
import nu.marginalia.ranking.accumulator.RankingResultBitSetAccumulator;
|
||||||
import nu.marginalia.ranking.accumulator.RankingResultHashMapAccumulator;
|
import nu.marginalia.ranking.accumulator.RankingResultHashMapAccumulator;
|
||||||
|
import nu.marginalia.ranking.accumulator.RankingResultHashSetAccumulator;
|
||||||
import nu.marginalia.ranking.data.RankingDomainFetcher;
|
import nu.marginalia.ranking.data.RankingDomainFetcher;
|
||||||
import nu.marginalia.ranking.data.RankingDomainFetcherForSimilarityData;
|
import nu.marginalia.ranking.data.RankingDomainFetcherForSimilarityData;
|
||||||
import nu.marginalia.index.svc.searchset.RankingSearchSet;
|
import nu.marginalia.index.svc.searchset.RankingSearchSet;
|
||||||
@ -101,7 +102,7 @@ public class IndexSearchSetsService {
|
|||||||
var entry = rankingSettings.retro;
|
var entry = rankingSettings.retro;
|
||||||
|
|
||||||
var spr = new StandardPageRank(similarityDomains, entry.domains.toArray(String[]::new));
|
var spr = new StandardPageRank(similarityDomains, entry.domains.toArray(String[]::new));
|
||||||
var data = spr.pageRankWithPeripheralNodes(entry.max, RankingResultBitSetAccumulator::new);
|
var data = spr.pageRankWithPeripheralNodes(entry.max, RankingResultHashSetAccumulator::new);
|
||||||
|
|
||||||
synchronized (this) {
|
synchronized (this) {
|
||||||
retroSet = new RankingSearchSet(SearchSetIdentifier.RETRO, retroSet.source, data);
|
retroSet = new RankingSearchSet(SearchSetIdentifier.RETRO, retroSet.source, data);
|
||||||
@ -115,7 +116,7 @@ public class IndexSearchSetsService {
|
|||||||
|
|
||||||
var rpr = new ReversePageRank(similarityDomains, entry.domains.toArray(String[]::new));
|
var rpr = new ReversePageRank(similarityDomains, entry.domains.toArray(String[]::new));
|
||||||
rpr.setMaxKnownUrls(750);
|
rpr.setMaxKnownUrls(750);
|
||||||
var data = rpr.pageRankWithPeripheralNodes(entry.max, RankingResultBitSetAccumulator::new);
|
var data = rpr.pageRankWithPeripheralNodes(entry.max, RankingResultHashSetAccumulator::new);
|
||||||
|
|
||||||
synchronized (this) {
|
synchronized (this) {
|
||||||
smallWebSet = new RankingSearchSet(SearchSetIdentifier.SMALLWEB, smallWebSet.source, data);
|
smallWebSet = new RankingSearchSet(SearchSetIdentifier.SMALLWEB, smallWebSet.source, data);
|
||||||
@ -128,7 +129,7 @@ public class IndexSearchSetsService {
|
|||||||
var entry = rankingSettings.academia;
|
var entry = rankingSettings.academia;
|
||||||
|
|
||||||
var spr = new StandardPageRank(similarityDomains, entry.domains.toArray(String[]::new));
|
var spr = new StandardPageRank(similarityDomains, entry.domains.toArray(String[]::new));
|
||||||
var data = spr.pageRankWithPeripheralNodes(entry.max, RankingResultBitSetAccumulator::new);
|
var data = spr.pageRankWithPeripheralNodes(entry.max, RankingResultHashSetAccumulator::new);
|
||||||
|
|
||||||
synchronized (this) {
|
synchronized (this) {
|
||||||
academiaSet = new RankingSearchSet(SearchSetIdentifier.ACADEMIA, academiaSet.source, data);
|
academiaSet = new RankingSearchSet(SearchSetIdentifier.ACADEMIA, academiaSet.source, data);
|
||||||
|
@ -1,8 +1,8 @@
|
|||||||
package nu.marginalia.index.svc.searchset;
|
package nu.marginalia.index.svc.searchset;
|
||||||
|
|
||||||
|
import it.unimi.dsi.fastutil.ints.IntOpenHashSet;
|
||||||
import nu.marginalia.index.client.model.query.SearchSetIdentifier;
|
import nu.marginalia.index.client.model.query.SearchSetIdentifier;
|
||||||
import nu.marginalia.index.searchset.SearchSet;
|
import nu.marginalia.index.searchset.SearchSet;
|
||||||
import org.roaringbitmap.RoaringBitmap;
|
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
@ -21,11 +21,11 @@ import java.nio.file.StandardOpenOption;
|
|||||||
public class RankingSearchSet implements SearchSet {
|
public class RankingSearchSet implements SearchSet {
|
||||||
|
|
||||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||||
private final RoaringBitmap set;
|
private final IntOpenHashSet set;
|
||||||
public final SearchSetIdentifier identifier;
|
public final SearchSetIdentifier identifier;
|
||||||
public final Path source;
|
public final Path source;
|
||||||
|
|
||||||
public RankingSearchSet(SearchSetIdentifier identifier, Path source, RoaringBitmap set) {
|
public RankingSearchSet(SearchSetIdentifier identifier, Path source, IntOpenHashSet set) {
|
||||||
this.identifier = identifier;
|
this.identifier = identifier;
|
||||||
this.source = source;
|
this.source = source;
|
||||||
this.set = set;
|
this.set = set;
|
||||||
@ -36,7 +36,7 @@ public class RankingSearchSet implements SearchSet {
|
|||||||
this.source = source;
|
this.source = source;
|
||||||
|
|
||||||
if (!Files.exists(source)) {
|
if (!Files.exists(source)) {
|
||||||
set = new RoaringBitmap();
|
set = new IntOpenHashSet();
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
set = load(source);
|
set = load(source);
|
||||||
@ -47,8 +47,8 @@ public class RankingSearchSet implements SearchSet {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private static RoaringBitmap load(Path source) throws IOException {
|
private static IntOpenHashSet load(Path source) throws IOException {
|
||||||
var set = new RoaringBitmap();
|
var set = new IntOpenHashSet();
|
||||||
try (var ds = new DataInputStream(Files.newInputStream(source, StandardOpenOption.READ))) {
|
try (var ds = new DataInputStream(Files.newInputStream(source, StandardOpenOption.READ))) {
|
||||||
for (;;) {
|
for (;;) {
|
||||||
try {
|
try {
|
||||||
@ -73,8 +73,8 @@ public class RankingSearchSet implements SearchSet {
|
|||||||
StandardOpenOption.CREATE,
|
StandardOpenOption.CREATE,
|
||||||
StandardOpenOption.TRUNCATE_EXISTING)))
|
StandardOpenOption.TRUNCATE_EXISTING)))
|
||||||
{
|
{
|
||||||
for (var iter = set.getIntIterator(); iter.hasNext();) {
|
for (var iter = set.intIterator(); iter.hasNext();) {
|
||||||
ds.writeInt(iter.next());
|
ds.writeInt(iter.nextInt());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -17,9 +17,9 @@ public enum SearchProfile {
|
|||||||
CORPO_CLEAN("corpo-clean", SearchSetIdentifier.NONE),
|
CORPO_CLEAN("corpo-clean", SearchSetIdentifier.NONE),
|
||||||
ACADEMIA("academia", SearchSetIdentifier.ACADEMIA),
|
ACADEMIA("academia", SearchSetIdentifier.ACADEMIA),
|
||||||
|
|
||||||
|
PLAIN_TEXT("plain-text", SearchSetIdentifier.NONE),
|
||||||
FOOD("food", SearchSetIdentifier.NONE),
|
FOOD("food", SearchSetIdentifier.NONE),
|
||||||
CRAFTS("crafts", SearchSetIdentifier.NONE),
|
CRAFTS("crafts", SearchSetIdentifier.NONE),
|
||||||
|
|
||||||
CLASSICS("classics", SearchSetIdentifier.NONE),
|
CLASSICS("classics", SearchSetIdentifier.NONE),
|
||||||
;
|
;
|
||||||
|
|
||||||
@ -55,6 +55,9 @@ public enum SearchProfile {
|
|||||||
subquery.searchTermsPriority.add("format:html123");
|
subquery.searchTermsPriority.add("format:html123");
|
||||||
subquery.searchTermsPriority.add("js:false");
|
subquery.searchTermsPriority.add("js:false");
|
||||||
}
|
}
|
||||||
|
if (this == PLAIN_TEXT) {
|
||||||
|
subquery.searchTermsInclude.add("format:plain");
|
||||||
|
}
|
||||||
if (this == FOOD) {
|
if (this == FOOD) {
|
||||||
subquery.searchTermsInclude.add(HtmlFeature.CATEGORY_FOOD.getKeyword());
|
subquery.searchTermsInclude.add(HtmlFeature.CATEGORY_FOOD.getKeyword());
|
||||||
}
|
}
|
||||||
|
@ -282,6 +282,8 @@ public class QueryFactory {
|
|||||||
case "RF_TITLE" -> QueryStrategy.REQUIRE_FIELD_TITLE;
|
case "RF_TITLE" -> QueryStrategy.REQUIRE_FIELD_TITLE;
|
||||||
case "RF_SUBJECT" -> QueryStrategy.REQUIRE_FIELD_SUBJECT;
|
case "RF_SUBJECT" -> QueryStrategy.REQUIRE_FIELD_SUBJECT;
|
||||||
case "RF_SITE" -> QueryStrategy.REQUIRE_FIELD_SITE;
|
case "RF_SITE" -> QueryStrategy.REQUIRE_FIELD_SITE;
|
||||||
|
case "RF_URL" -> QueryStrategy.REQUIRE_FIELD_URL;
|
||||||
|
case "RF_DOMAIN" -> QueryStrategy.REQUIRE_FIELD_DOMAIN;
|
||||||
case "SENTENCE" -> QueryStrategy.SENTENCE;
|
case "SENTENCE" -> QueryStrategy.SENTENCE;
|
||||||
case "TOPIC" -> QueryStrategy.TOPIC;
|
case "TOPIC" -> QueryStrategy.TOPIC;
|
||||||
default -> QueryStrategy.AUTO;
|
default -> QueryStrategy.AUTO;
|
||||||
|
@ -152,6 +152,14 @@ public class SearchResultValuator {
|
|||||||
else if (theKeyword.wordMetadata.hasFlag(EdgePageWordFlags.Site)) {
|
else if (theKeyword.wordMetadata.hasFlag(EdgePageWordFlags.Site)) {
|
||||||
return totalFactor * 0.7;
|
return totalFactor * 0.7;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (theKeyword.wordMetadata.hasFlag(EdgePageWordFlags.UrlDomain)) {
|
||||||
|
return totalFactor * 0.8;
|
||||||
|
}
|
||||||
|
else if (theKeyword.wordMetadata.hasFlag(EdgePageWordFlags.UrlPath)) {
|
||||||
|
return totalFactor * 0.9;
|
||||||
|
}
|
||||||
|
|
||||||
return totalFactor;
|
return totalFactor;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -212,6 +220,8 @@ public class SearchResultValuator {
|
|||||||
final boolean siteAdjacent = flags.contains(EdgePageWordFlags.SiteAdjacent);
|
final boolean siteAdjacent = flags.contains(EdgePageWordFlags.SiteAdjacent);
|
||||||
final boolean subject = flags.contains(EdgePageWordFlags.Subjects);
|
final boolean subject = flags.contains(EdgePageWordFlags.Subjects);
|
||||||
final boolean names = flags.contains(EdgePageWordFlags.NamesWords);
|
final boolean names = flags.contains(EdgePageWordFlags.NamesWords);
|
||||||
|
final boolean urlDomain = flags.contains(EdgePageWordFlags.UrlDomain);
|
||||||
|
final boolean urlPath = flags.contains(EdgePageWordFlags.UrlPath);
|
||||||
|
|
||||||
if (title) {
|
if (title) {
|
||||||
if (titleLength <= 64) {
|
if (titleLength <= 64) {
|
||||||
@ -236,6 +246,13 @@ public class SearchResultValuator {
|
|||||||
f *= Math.pow(0.8, k);
|
f *= Math.pow(0.8, k);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (urlDomain) {
|
||||||
|
f *= Math.pow(0.8, k);
|
||||||
|
}
|
||||||
|
else if (urlPath) {
|
||||||
|
f *= Math.pow(0.9, k);
|
||||||
|
}
|
||||||
|
|
||||||
if (!title && !subject && names) {
|
if (!title && !subject && names) {
|
||||||
f *= Math.pow(0.9, k);
|
f *= Math.pow(0.9, k);
|
||||||
}
|
}
|
||||||
|
@ -8,16 +8,19 @@
|
|||||||
<div class="settings">
|
<div class="settings">
|
||||||
<select name="profile" id="profile">
|
<select name="profile" id="profile">
|
||||||
<optgroup label="General Search">
|
<optgroup label="General Search">
|
||||||
<option {{#eq profile "default"}}selected{{/eq}} value="default">Popular Sites</option>
|
<option {{#eq profile "default"}}selected{{/eq}} value="default">Popular Sites</option>
|
||||||
<option {{#eq profile "modern"}}selected{{/eq}} value="modern">Blogs and Personal Websites</option>
|
<option {{#eq profile "modern"}}selected{{/eq}} value="modern">Blogs and Personal Websites</option>
|
||||||
<option {{#eq profile "academia"}}selected{{/eq}} value="academia">Academia</option>
|
<option {{#eq profile "academia"}}selected{{/eq}} value="academia">Academia</option>
|
||||||
<option {{#eq profile "vintage"}}selected{{/eq}} value="vintage">Web 1.0</option>
|
<option {{#eq profile "corpo"}}selected{{/eq}} value="corpo">No Domain Ranking</option>
|
||||||
<option {{#eq profile "corpo"}}selected{{/eq}} value="corpo">No Domain Ranking</option>
|
</optgroup>
|
||||||
|
<optgroup label="Vintage">
|
||||||
|
<option {{#eq profile "vintage"}}selected{{/eq}} value="vintage">Web 1.0</option>
|
||||||
|
<option {{#eq profile "plain-text"}}selected{{/eq}} value="plain-text">Text Files</option>
|
||||||
</optgroup>
|
</optgroup>
|
||||||
<optgroup label="Topics Search">
|
<optgroup label="Topics Search">
|
||||||
<option {{#eq profile "food"}}selected{{/eq}} value="food">Recipes 🍳</option>
|
<option {{#eq profile "food"}}selected{{/eq}} value="food">Recipes 🍳</option>
|
||||||
<option {{#eq profile "crafts"}}selected{{/eq}} value="crafts">Crafts 🪡🔨 (WIP; mostly textile-craft)</option>
|
<option {{#eq profile "crafts"}}selected{{/eq}} value="crafts">Crafts 🪡🔨 (WIP; mostly textile-craft)</option>
|
||||||
<option {{#eq profile "classics"}}selected{{/eq}} value="classics">Classics and Antiquity 📜</option>
|
<option {{#eq profile "classics"}}selected{{/eq}} value="classics">Classics and Antiquity 📜</option>
|
||||||
</optgroup>
|
</optgroup>
|
||||||
|
|
||||||
</select>
|
</select>
|
||||||
|
Loading…
Reference in New Issue
Block a user