Word feature bit for words that appear in the URL, new search profile for plain text files, better plain text titles.

Viktor Lofgren 2023-03-10 16:46:56 +01:00
parent 2bc212d65c
commit 722ff3bffb
24 changed files with 178 additions and 64 deletions

View File

@@ -58,6 +58,14 @@ public record EdgeSearchResultKeywordScore(int set,
         sum -= 1;
     }
 
+    if (hasTermFlag(EdgePageWordFlags.UrlDomain)) {
+        sum -= 5;
+    }
+
+    if (hasTermFlag(EdgePageWordFlags.UrlPath)) {
+        sum -= 5;
+    }
+
     double tfIdf = WordMetadata.decodeTfidf(encodedWordMetadata);
     int positionBits = WordMetadata.decodePositions(encodedWordMetadata);
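The new checks grant a fixed scoring bonus when a search term carries one of the new URL flags; note that sum is decremented, so, consistent with the sum -= 1 above, a smaller sum means a better score. hasTermFlag itself is not part of this diff; given the packing in WordMetadata further down, it presumably amounts to a bit test along these lines (a sketch, not the actual implementation):

    // Hypothetical sketch of hasTermFlag -- assumes the flag bits sit in the
    // low bits of encodedWordMetadata, per WordMetadata.FLAGS_MASK below.
    private boolean hasTermFlag(EdgePageWordFlags flag) {
        return (encodedWordMetadata & flag.asBit()) != 0;
    }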

View File

@@ -33,7 +33,18 @@ public enum EdgePageWordFlags {
     /** Word is important to adjacent documents
      * @see SiteWords
      * */
-    SiteAdjacent;
+    SiteAdjacent,
+
+    /** Keyword appears in URL path
+     *
+     */
+    UrlPath,
+
+    /** Keyword appears in domain name
+     *
+     */
+    UrlDomain
+    ;
 
     public int asBit() {
         return 1 << ordinal();
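Since asBit() maps every constant to a distinct power of two, a set of flags packs into a single long and combines with bitwise OR, exactly as ReverseIndexPriorityParameters does further down. A quick illustration:

    // Each flag is one bit: combine with |, test with &.
    long mask = EdgePageWordFlags.UrlDomain.asBit() | EdgePageWordFlags.UrlPath.asBit();
    boolean hasUrlPath = (mask & EdgePageWordFlags.UrlPath.asBit()) != 0; // true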

View File

@@ -13,7 +13,9 @@ public record WordMetadata(int tfIdf,
                            int positions,
                            byte flags) {
-    // 8 unsused bits at the beginning
+    // Bottom 16 bits are used for flags
+    public static final long FLAGS_MASK = 0xFFFFL;
+
     public static final long TF_IDF_MASK = 0xFFFFL;
     public static final int TF_IDF_SHIFT = 16;
@@ -21,7 +23,6 @@ public record WordMetadata(int tfIdf,
     public static final int POSITIONS_SHIFT = 32;
     public static final long POSITIONS_MASK = 0xFFFF_FFFFL;
-    public static final long FLAGS_MASK = 0xFF;
 
     public WordMetadata() {
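Taken together, the constants now describe a 64-bit layout with flags in the low 16 bits (previously only 8), tf-idf in bits 16-31, and the position bitmask in the high 32 bits. A sketch of the presumed packing — not the actual encode() implementation — built only from the constants above:

    // bits  0..15  flags      (FLAGS_MASK)
    // bits 16..31  tf-idf     (TF_IDF_MASK, TF_IDF_SHIFT)
    // bits 32..63  positions  (POSITIONS_MASK, POSITIONS_SHIFT)
    long encoded = ((long) positions & POSITIONS_MASK) << POSITIONS_SHIFT
                 | ((long) tfIdf & TF_IDF_MASK) << TF_IDF_SHIFT
                 | (flags & FLAGS_MASK);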

View File

@@ -9,6 +9,7 @@ import nu.marginalia.language.model.KeywordMetadata;
 import nu.marginalia.language.model.WordRep;
 import nu.marginalia.converting.model.DocumentKeywordsBuilder;
 import nu.marginalia.language.statistics.TermFrequencyDict;
+import nu.marginalia.model.EdgeUrl;
 
 import javax.inject.Inject;
 import java.util.*;
@@ -23,9 +24,12 @@ public class DocumentKeywordExtractor {
     private final ArtifactKeywords artifactKeywords;
     private final SimpleKeywords simpleKeywords;
+    private final UrlKeywords urlKeywords;
+
     private final DocumentKeywordPositionBitmaskExtractor keywordPositions;
 
     @Inject
     public DocumentKeywordExtractor(TermFrequencyDict dict) {
         keywordExtractor = new KeywordExtractor();
@@ -33,6 +37,7 @@ public class DocumentKeywordExtractor {
         keywordPositions = new DocumentKeywordPositionBitmaskExtractor(keywordExtractor);
         artifactKeywords = new ArtifactKeywords();
+        urlKeywords = new UrlKeywords();
 
         tfIdfCounter = new KeywordCounter(dict, keywordExtractor);
         nameCounter = new NameCounter(keywordExtractor);
         subjectCounter = new SubjectCounter(keywordExtractor);
@@ -40,7 +45,7 @@ public class DocumentKeywordExtractor {
     }
 
-    public DocumentKeywordsBuilder extractKeywords(DocumentLanguageData documentLanguageData) {
+    public DocumentKeywordsBuilder extractKeywords(DocumentLanguageData documentLanguageData, EdgeUrl url) {
 
         KeywordMetadata keywordMetadata = keywordPositions.getWordPositions(documentLanguageData);
@@ -49,12 +54,14 @@ public class DocumentKeywordExtractor {
         List<WordRep> titleWords = extractTitleWords(documentLanguageData);
         List<WordRep> wordsNamesAll = nameCounter.count(documentLanguageData, 2);
         List<WordRep> subjects = subjectCounter.count(keywordMetadata, documentLanguageData);
         List<String> artifacts = artifactKeywords.getArtifactKeywords(documentLanguageData);
 
-        for (var rep : titleWords) keywordMetadata.titleKeywords().add(rep.stemmed);
-        for (var rep : wordsNamesAll) keywordMetadata.namesKeywords().add(rep.stemmed);
-        for (var rep : subjects) keywordMetadata.subjectKeywords().add(rep.stemmed);
+        for (var rep : titleWords) keywordMetadata.titleKeywords.add(rep.stemmed);
+        for (var rep : wordsNamesAll) keywordMetadata.namesKeywords.add(rep.stemmed);
+        for (var rep : subjects) keywordMetadata.subjectKeywords.add(rep.stemmed);
+
+        keywordMetadata.urlKeywords.addAll(urlKeywords.getUrlKeywords(url));
+        keywordMetadata.domainKeywords.addAll(urlKeywords.getDomainKeywords(url));
 
         DocumentKeywordsBuilder wordsBuilder = new DocumentKeywordsBuilder();

View File

@@ -18,7 +18,7 @@ public class DocumentKeywordPositionBitmaskExtractor {
     public KeywordMetadata getWordPositions(DocumentLanguageData dld) {
         final KeywordMetadata keywordMetadata = new KeywordMetadata();
 
-        Object2IntOpenHashMap<String> ret = keywordMetadata.positionMask();
+        Object2IntOpenHashMap<String> ret = keywordMetadata.positionMask;
 
         // Mark the title words as position 0
         for (var sent : dld.titleSentences) {

View File

@@ -51,7 +51,7 @@ public class KeywordCounter {
             }
         }
 
-        Object2IntOpenHashMap<String> tfIdf = keywordMetadata.wordsTfIdf();
+        Object2IntOpenHashMap<String> tfIdf = keywordMetadata.wordsTfIdf;
 
         List<WordRep> tfIdfHigh = new ArrayList<>();
 
         int maxVal = maxValue(counts);

View File

@@ -28,7 +28,6 @@ public class SubjectCounter {
     public List<WordRep> count(KeywordMetadata keywordMetadata, DocumentLanguageData dld) {
 
         Map<String, Integer> counts = new HashMap<>();
         Map<String, Set<WordRep>> instances = new HashMap<>();
 
         for (var sentence : dld.sentences) {
@@ -81,7 +80,7 @@ public class SubjectCounter {
             return sum / parts.length;
         }
 
-        return keywordMetadata.wordsTfIdf().getOrDefault(stemmed, 0);
+        return keywordMetadata.wordsTfIdf.getOrDefault(stemmed, 0);
     }
 
     private boolean isDetOrAdverbOrVerb(String posTag) {

View File

@@ -0,0 +1,27 @@
+package nu.marginalia.converting.processor.keywords.extractors;
+
+import ca.rmen.porterstemmer.PorterStemmer;
+import nu.marginalia.model.EdgeUrl;
+
+import java.util.Arrays;
+import java.util.Set;
+import java.util.stream.Collectors;
+
+public class UrlKeywords {
+    private final PorterStemmer ps = new PorterStemmer();
+
+    public Set<String> getUrlKeywords(EdgeUrl url) {
+        String path = url.path;
+
+        return Arrays.stream(path.split("[^a-z0-9A-Z]+"))
+                .map(ps::stemWord)
+                .collect(Collectors.toSet());
+    }
+
+    public Set<String> getDomainKeywords(EdgeUrl url) {
+        return Arrays.stream(url.domain.domain.split("[^a-z0-9A-Z]+"))
+                .filter(s -> s.length() > 3)
+                .map(ps::stemWord)
+                .collect(Collectors.toSet());
+    }
+}
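For illustration, a hypothetical URL run through the extractor above (EdgeUrl constructed from a string the way the test code further down does; exception handling elided):

    var kw = new UrlKeywords();
    var url = new EdgeUrl("https://memex.marginalia.nu/log/plain-text.gmi"); // made-up URL
    kw.getUrlKeywords(url);    // Porter stems of "log", "plain", "text", "gmi"
    kw.getDomainKeywords(url); // stems of "memex" and "marginalia"; "nu" fails the length > 3 filter

Note that the path split also yields a leading empty string from the initial slash, which the code does not filter out; presumably harmless downstream.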

View File

@@ -50,7 +50,20 @@ public class PlainTextLogic {
         return candidates.get(0).trim();
     }
 
-    return url.path.substring(url.path.lastIndexOf('/'));
+    return getFileNameFromPath(url);
 }
 
+private String getFileNameFromPath(EdgeUrl url) {
+    final String path = url.path;
+
+    int lastSlash = path.lastIndexOf('/');
+
+    if (lastSlash + 1 < path.length()) {
+        return path.substring(lastSlash + 1);
+    }
+
+    return path;
+}
+
 public boolean isSideline(String s) {
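The new helper drops the leading slash that the old one-liner kept, and falls back to the full path when the slash is the last character:

    // "/docs/readme.txt" -> "readme.txt"   (the old code returned "/readme.txt")
    // "/docs/"           -> "/docs/"       (lastSlash + 1 == path.length(), so the fallback applies)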

View File

@@ -122,7 +122,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
     PubDate pubDate = pubDateSniffer.getPubDate(crawledDocument.headers, url, doc, ret.standard, true);
 
     ret.metadata = new DocumentMetadata(url.depth(), pubDate.yearByte(), 0, (int) -ret.quality, EnumSet.noneOf(EdgePageDocumentFlags.class));
 
-    DocumentKeywordsBuilder words = keywordExtractor.extractKeywords(dld);
+    DocumentKeywordsBuilder words = keywordExtractor.extractKeywords(dld, url);
 
     new MetaTagsBuilder()
         .addDomainCrawlData(crawledDomain)

View File

@@ -91,7 +91,7 @@ public class PlainTextDocumentProcessorPlugin extends AbstractDocumentProcessorP
     ret.metadata = new DocumentMetadata(url.depth(), pubDate.yearByte(), 0, (int) -ret.quality, EnumSet.of(EdgePageDocumentFlags.PlainText));
 
-    DocumentKeywordsBuilder words = keywordExtractor.extractKeywords(dld);
+    DocumentKeywordsBuilder words = keywordExtractor.extractKeywords(dld, url);
 
     new MetaTagsBuilder()
         .addDomainCrawlData(crawledDomain)

View File

@@ -11,6 +11,7 @@ import nu.marginalia.language.statistics.TermFrequencyDict;
 import nu.marginalia.language.keywords.KeywordExtractor;
 import nu.marginalia.language.model.WordSeparator;
 import nu.marginalia.WmsaHome;
+import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.crawl.EdgePageWordFlags;
 import nu.marginalia.model.idx.WordMetadata;
 import nu.marginalia.test.util.TestLanguageModels;
@@ -42,6 +43,7 @@ class SentenceExtractorTest {
     }
 
+    @SneakyThrows
     public static void main(String... args) throws IOException {
         final LanguageModels lm = TestLanguageModels.getLanguageModels();
@@ -52,6 +54,7 @@ class SentenceExtractorTest {
         SentenceExtractor se = new SentenceExtractor(lm);
 
         var dict = new TermFrequencyDict(lm);
+        var url = new EdgeUrl("https://memex.marginalia.nu/");
         DocumentKeywordExtractor documentKeywordExtractor = new DocumentKeywordExtractor(dict);
 
         for (;;) {
@@ -60,7 +63,7 @@ class SentenceExtractorTest {
             var doc = Jsoup.parse(Files.readString(file.toPath()));
             long start = System.currentTimeMillis();
             var dld = se.extractSentences(doc);
-            documentKeywordExtractor.extractKeywords(dld);
+            documentKeywordExtractor.extractKeywords(dld, url);
             total += (System.currentTimeMillis() - start);
         }
         System.out.println(total);
@@ -122,7 +125,7 @@ class SentenceExtractorTest {
             Jsoup.parse(Files.readString(Path.of("/home/vlofgren/man open (2) openat.html"))));
 
         var dict = new TermFrequencyDict(lm);
-        System.out.println(new DocumentKeywordExtractor(dict).extractKeywords(result));
+        System.out.println(new DocumentKeywordExtractor(dict).extractKeywords(result, new EdgeUrl("https://memex.marginalia.nu/")));
     }
 
     @Test

View File

@@ -0,0 +1,17 @@
+package nu.marginalia.ranking.accumulator;
+
+import it.unimi.dsi.fastutil.ints.IntOpenHashSet;
+
+public class RankingResultHashSetAccumulator implements RankingResultAccumulator<IntOpenHashSet> {
+    private final IntOpenHashSet result = new IntOpenHashSet();
+
+    @Override
+    public void add(int domainId, int rank) {
+        result.add(domainId);
+    }
+
+    @Override
+    public IntOpenHashSet get() {
+        return result;
+    }
+}
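The RankingResultAccumulator interface is not included in this diff; inferred from the two overrides, it presumably looks like the sketch below. Note that rank is accepted but ignored — a hash set only records membership, unlike, presumably, the RankingResultHashMapAccumulator imported elsewhere in this commit.

    // Presumed shape of the interface, inferred from the overrides above.
    public interface RankingResultAccumulator<T> {
        void add(int domainId, int rank);
        T get();
    }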

View File

@@ -7,6 +7,8 @@ public enum QueryStrategy {
     REQUIRE_FIELD_SITE,
     REQUIRE_FIELD_TITLE,
     REQUIRE_FIELD_SUBJECT,
+    REQUIRE_FIELD_URL,
+    REQUIRE_FIELD_DOMAIN,
 
     AUTO
 }

View File

@@ -4,10 +4,13 @@ import nu.marginalia.index.journal.model.IndexJournalEntryData;
 import nu.marginalia.model.crawl.EdgePageWordFlags;
 
 public class ReverseIndexPriorityParameters {
-    private static final long highPriorityFlags = EdgePageWordFlags.Title.asBit()
+    private static final long highPriorityFlags =
+            EdgePageWordFlags.Title.asBit()
             | EdgePageWordFlags.Subjects.asBit()
             | EdgePageWordFlags.TfIdfHigh.asBit()
             | EdgePageWordFlags.NamesWords.asBit()
+            | EdgePageWordFlags.UrlDomain.asBit()
+            | EdgePageWordFlags.UrlPath.asBit()
             | EdgePageWordFlags.Site.asBit()
             | EdgePageWordFlags.SiteAdjacent.asBit();

View File

@@ -11,11 +11,16 @@ import java.util.Objects;
 public final class KeywordMetadata {
 
     private static final WordFrequencyData empty = new WordFrequencyData(0);
 
-    private final HashSet<String> titleKeywords = new HashSet<>(50);
-    private final HashSet<String> subjectKeywords = new HashSet<>(10);
-    private final HashSet<String> namesKeywords = new HashSet<>(50);
-    private final Object2IntOpenHashMap<String> wordsTfIdf;
-    private final Object2IntOpenHashMap<String> positionMask;
+    public final HashSet<String> titleKeywords = new HashSet<>(50);
+    public final HashSet<String> subjectKeywords = new HashSet<>(10);
+    public final HashSet<String> namesKeywords = new HashSet<>(50);
+
+    public final HashSet<String> urlKeywords = new HashSet<>(10);
+    public final HashSet<String> domainKeywords = new HashSet<>(10);
+
+    public final Object2IntOpenHashMap<String> wordsTfIdf;
+    public final Object2IntOpenHashMap<String> positionMask;
     private final EnumSet<EdgePageWordFlags> wordFlagsTemplate;
 
     public KeywordMetadata(EnumSet<EdgePageWordFlags> flags) {
@@ -45,31 +50,17 @@ public final class KeywordMetadata {
     if (titleKeywords.contains(stemmed))
         flags.add(EdgePageWordFlags.Title);
 
+    if (urlKeywords.contains(stemmed))
+        flags.add(EdgePageWordFlags.UrlPath);
+
+    if (domainKeywords.contains(stemmed))
+        flags.add(EdgePageWordFlags.UrlDomain);
+
     int positions = positionMask.getOrDefault(stemmed, 0);
 
     return new WordMetadata(tfidf, positions, flags).encode();
 }
 
-public HashSet<String> titleKeywords() {
-    return titleKeywords;
-}
-
-public HashSet<String> subjectKeywords() {
-    return subjectKeywords;
-}
-
-public HashSet<String> namesKeywords() {
-    return namesKeywords;
-}
-
-public Object2IntOpenHashMap<String> wordsTfIdf() {
-    return wordsTfIdf;
-}
-
-public Object2IntOpenHashMap<String> positionMask() {
-    return positionMask;
-}
-
 public EnumSet<EdgePageWordFlags> wordFlagsTemplate() {
     return wordFlagsTemplate;
 }

View File

@@ -98,7 +98,7 @@ public class SearchIndex {
     IndexQueryBuilder query =
         switch(params.queryStrategy()) {
             case SENTENCE -> indexReader.findWordAsSentence(orderedIncludes);
-            case TOPIC, REQUIRE_FIELD_SITE, REQUIRE_FIELD_TITLE, REQUIRE_FIELD_SUBJECT
+            case TOPIC, REQUIRE_FIELD_SITE, REQUIRE_FIELD_TITLE, REQUIRE_FIELD_SUBJECT, REQUIRE_FIELD_DOMAIN, REQUIRE_FIELD_URL
                 -> indexReader.findWordAsTopic(orderedIncludes);
             case AUTO -> indexReader.findWordTopicDynamicMode(orderedIncludes);
         };

View File

@@ -152,6 +152,12 @@ public class IndexResultValuator {
     else if (queryStrategy == QueryStrategy.REQUIRE_FIELD_TITLE) {
         return EdgePageWordFlags.Title.isPresent(metadata);
     }
+    else if (queryStrategy == QueryStrategy.REQUIRE_FIELD_URL) {
+        return EdgePageWordFlags.UrlPath.isPresent(metadata);
+    }
+    else if (queryStrategy == QueryStrategy.REQUIRE_FIELD_DOMAIN) {
+        return EdgePageWordFlags.UrlDomain.isPresent(metadata);
+    }
 
     return true;
 }

View File

@@ -9,6 +9,7 @@ import nu.marginalia.ranking.ReversePageRank;
 import nu.marginalia.ranking.StandardPageRank;
 import nu.marginalia.ranking.accumulator.RankingResultBitSetAccumulator;
 import nu.marginalia.ranking.accumulator.RankingResultHashMapAccumulator;
+import nu.marginalia.ranking.accumulator.RankingResultHashSetAccumulator;
 import nu.marginalia.ranking.data.RankingDomainFetcher;
 import nu.marginalia.ranking.data.RankingDomainFetcherForSimilarityData;
 import nu.marginalia.index.svc.searchset.RankingSearchSet;
@@ -101,7 +102,7 @@ public class IndexSearchSetsService {
     var entry = rankingSettings.retro;
     var spr = new StandardPageRank(similarityDomains, entry.domains.toArray(String[]::new));
-    var data = spr.pageRankWithPeripheralNodes(entry.max, RankingResultBitSetAccumulator::new);
+    var data = spr.pageRankWithPeripheralNodes(entry.max, RankingResultHashSetAccumulator::new);
 
     synchronized (this) {
         retroSet = new RankingSearchSet(SearchSetIdentifier.RETRO, retroSet.source, data);
@@ -115,7 +116,7 @@ public class IndexSearchSetsService {
     var rpr = new ReversePageRank(similarityDomains, entry.domains.toArray(String[]::new));
     rpr.setMaxKnownUrls(750);
 
-    var data = rpr.pageRankWithPeripheralNodes(entry.max, RankingResultBitSetAccumulator::new);
+    var data = rpr.pageRankWithPeripheralNodes(entry.max, RankingResultHashSetAccumulator::new);
 
     synchronized (this) {
         smallWebSet = new RankingSearchSet(SearchSetIdentifier.SMALLWEB, smallWebSet.source, data);
@@ -128,7 +129,7 @@ public class IndexSearchSetsService {
     var entry = rankingSettings.academia;
     var spr = new StandardPageRank(similarityDomains, entry.domains.toArray(String[]::new));
-    var data = spr.pageRankWithPeripheralNodes(entry.max, RankingResultBitSetAccumulator::new);
+    var data = spr.pageRankWithPeripheralNodes(entry.max, RankingResultHashSetAccumulator::new);
 
     synchronized (this) {
         academiaSet = new RankingSearchSet(SearchSetIdentifier.ACADEMIA, academiaSet.source, data);
View File

@@ -1,8 +1,8 @@
 package nu.marginalia.index.svc.searchset;
 
+import it.unimi.dsi.fastutil.ints.IntOpenHashSet;
 import nu.marginalia.index.client.model.query.SearchSetIdentifier;
 import nu.marginalia.index.searchset.SearchSet;
-import org.roaringbitmap.RoaringBitmap;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -21,11 +21,11 @@ import java.nio.file.StandardOpenOption;
 public class RankingSearchSet implements SearchSet {
     private final Logger logger = LoggerFactory.getLogger(getClass());
 
-    private final RoaringBitmap set;
+    private final IntOpenHashSet set;
 
     public final SearchSetIdentifier identifier;
     public final Path source;
 
-    public RankingSearchSet(SearchSetIdentifier identifier, Path source, RoaringBitmap set) {
+    public RankingSearchSet(SearchSetIdentifier identifier, Path source, IntOpenHashSet set) {
         this.identifier = identifier;
         this.source = source;
         this.set = set;
@@ -36,7 +36,7 @@ public class RankingSearchSet implements SearchSet {
         this.source = source;
 
         if (!Files.exists(source)) {
-            set = new RoaringBitmap();
+            set = new IntOpenHashSet();
         }
         else {
            set = load(source);
@@ -47,8 +47,8 @@ public class RankingSearchSet implements SearchSet {
         }
     }
 
-    private static RoaringBitmap load(Path source) throws IOException {
-        var set = new RoaringBitmap();
+    private static IntOpenHashSet load(Path source) throws IOException {
+        var set = new IntOpenHashSet();
         try (var ds = new DataInputStream(Files.newInputStream(source, StandardOpenOption.READ))) {
             for (;;) {
                 try {
@@ -73,8 +73,8 @@ public class RankingSearchSet implements SearchSet {
                 StandardOpenOption.CREATE,
                 StandardOpenOption.TRUNCATE_EXISTING)))
         {
-            for (var iter = set.getIntIterator(); iter.hasNext();) {
-                ds.writeInt(iter.next());
+            for (var iter = set.intIterator(); iter.hasNext();) {
+                ds.writeInt(iter.nextInt());
            }
        }
    }
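Only the in-memory representation changes here; the on-disk format stays a bare sequence of 4-byte ints (DataOutputStream.writeInt is big-endian), read back until end-of-file in load(). The switch to the type-specific iterator also avoids boxing:

    // fastutil's int iterator returns primitives; nextInt() allocates no Integer objects.
    var ids = new IntOpenHashSet();
    ids.add(1); ids.add(5); ids.add(42);
    for (var it = ids.intIterator(); it.hasNext(); ) {
        int id = it.nextInt();
    }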

View File

@@ -17,9 +17,9 @@ public enum SearchProfile {
     CORPO_CLEAN("corpo-clean", SearchSetIdentifier.NONE),
     ACADEMIA("academia", SearchSetIdentifier.ACADEMIA),
+    PLAIN_TEXT("plain-text", SearchSetIdentifier.NONE),
     FOOD("food", SearchSetIdentifier.NONE),
     CRAFTS("crafts", SearchSetIdentifier.NONE),
     CLASSICS("classics", SearchSetIdentifier.NONE),
     ;
@@ -55,6 +55,9 @@ public enum SearchProfile {
         subquery.searchTermsPriority.add("format:html123");
         subquery.searchTermsPriority.add("js:false");
     }
+    if (this == PLAIN_TEXT) {
+        subquery.searchTermsInclude.add("format:plain");
+    }
     if (this == FOOD) {
         subquery.searchTermsInclude.add(HtmlFeature.CATEGORY_FOOD.getKeyword());
     }

View File

@@ -282,6 +282,8 @@ public class QueryFactory {
     case "RF_TITLE" -> QueryStrategy.REQUIRE_FIELD_TITLE;
     case "RF_SUBJECT" -> QueryStrategy.REQUIRE_FIELD_SUBJECT;
     case "RF_SITE" -> QueryStrategy.REQUIRE_FIELD_SITE;
+    case "RF_URL" -> QueryStrategy.REQUIRE_FIELD_URL;
+    case "RF_DOMAIN" -> QueryStrategy.REQUIRE_FIELD_DOMAIN;
     case "SENTENCE" -> QueryStrategy.SENTENCE;
     case "TOPIC" -> QueryStrategy.TOPIC;
     default -> QueryStrategy.AUTO;

View File

@@ -152,6 +152,14 @@ public class SearchResultValuator {
     else if (theKeyword.wordMetadata.hasFlag(EdgePageWordFlags.Site)) {
         return totalFactor * 0.7;
     }
 
+    if (theKeyword.wordMetadata.hasFlag(EdgePageWordFlags.UrlDomain)) {
+        return totalFactor * 0.8;
+    }
+    else if (theKeyword.wordMetadata.hasFlag(EdgePageWordFlags.UrlPath)) {
+        return totalFactor * 0.9;
+    }
+
     return totalFactor;
 }
@@ -212,6 +220,8 @@ public class SearchResultValuator {
     final boolean siteAdjacent = flags.contains(EdgePageWordFlags.SiteAdjacent);
     final boolean subject = flags.contains(EdgePageWordFlags.Subjects);
     final boolean names = flags.contains(EdgePageWordFlags.NamesWords);
+    final boolean urlDomain = flags.contains(EdgePageWordFlags.UrlDomain);
+    final boolean urlPath = flags.contains(EdgePageWordFlags.UrlPath);
 
     if (title) {
         if (titleLength <= 64) {
@@ -236,6 +246,13 @@ public class SearchResultValuator {
         f *= Math.pow(0.8, k);
     }
 
+    if (urlDomain) {
+        f *= Math.pow(0.8, k);
+    }
+    else if (urlPath) {
+        f *= Math.pow(0.9, k);
+    }
+
     if (!title && !subject && names) {
         f *= Math.pow(0.9, k);
     }
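Illustrative arithmetic for the first hunk, assuming — consistent with the 0.7 multiplier already applied for the Site flag — that smaller factors mean a stronger match:

    // totalFactor = 1.0:
    //   term flagged UrlDomain -> 1.0 * 0.8 = 0.8   (domain matches weigh more)
    //   term flagged UrlPath   -> 1.0 * 0.9 = 0.9
    // The second hunk compounds the same weights per matched term: f *= Math.pow(0.8, k).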

View File

@@ -11,9 +11,12 @@
 <option {{#eq profile "default"}}selected{{/eq}} value="default">Popular Sites</option>
 <option {{#eq profile "modern"}}selected{{/eq}} value="modern">Blogs and Personal Websites</option>
 <option {{#eq profile "academia"}}selected{{/eq}} value="academia">Academia</option>
-<option {{#eq profile "vintage"}}selected{{/eq}} value="vintage">Web 1.0</option>
 <option {{#eq profile "corpo"}}selected{{/eq}} value="corpo">No Domain Ranking</option>
 </optgroup>
+<optgroup label="Vintage">
+    <option {{#eq profile "vintage"}}selected{{/eq}} value="vintage">Web 1.0</option>
+    <option {{#eq profile "plain-text"}}selected{{/eq}} value="plain-text">Text Files</option>
+</optgroup>
 <optgroup label="Topics Search">
     <option {{#eq profile "food"}}selected{{/eq}} value="food">Recipes &#127859;</option>
     <option {{#eq profile "crafts"}}selected{{/eq}} value="crafts">Crafts &#129697;&#128296; (WIP; mostly textile-craft)</option>