Tweaks for search result relevance

vlofgren 2022-08-29 18:01:07 +02:00
parent 3f2854a5e9
commit 813399401e
40 changed files with 509 additions and 254 deletions

View File

@@ -1,6 +1,10 @@
 package nu.marginalia.util;

+import java.io.IOException;
 import java.nio.ByteBuffer;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.StandardOpenOption;

 public class DenseBitMap {
     public static final long MAX_CAPACITY_2GB_16BN_ITEMS=(1L<<34)-8;
@@ -15,6 +19,31 @@ public class DenseBitMap {
         this.buffer = ByteBuffer.allocateDirect((int)((cardinality / 8) + (misaligned ? 1 : 0)));
     }

+    public static DenseBitMap loadFromFile(Path file) throws IOException {
+        long size = Files.size(file);
+        var dbm = new DenseBitMap(size/8);
+
+        try (var bc = Files.newByteChannel(file)) {
+            while (dbm.buffer.position() < dbm.buffer.capacity()) {
+                bc.read(dbm.buffer);
+            }
+        }
+        dbm.buffer.clear();
+
+        return dbm;
+    }
+
+    public void writeToFile(Path file) throws IOException {
+        try (var bc = Files.newByteChannel(file, StandardOpenOption.CREATE, StandardOpenOption.WRITE)) {
+            while (buffer.position() < buffer.capacity()) {
+                bc.write(buffer);
+            }
+        }
+        buffer.clear();
+    }
+
     public boolean get(long pos) {
         return (buffer.get((int)(pos >>> 3)) & ((byte)1 << (int)(pos & 7))) != 0;
     }
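Note: the new methods above persist the raw bit buffer to disk and back. A minimal round-trip sketch, not part of the commit; the class name and file path are made up for illustration:

    import java.nio.file.Path;
    import nu.marginalia.util.DenseBitMap;

    class DenseBitMapRoundTrip {
        public static void main(String[] args) throws Exception {
            var bits = new DenseBitMap(1024);            // cardinality in bits; backed by a 128-byte buffer
            bits.set(17);                                // mark one position
            bits.writeToFile(Path.of("/tmp/bits.dat"));  // dump the buffer to disk

            var reloaded = DenseBitMap.loadFromFile(Path.of("/tmp/bits.dat"));
            System.out.println(reloaded.get(17));        // query the same position after reloading
        }
    }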

View File

@@ -8,7 +8,7 @@ import nu.marginalia.util.language.processing.SentenceExtractor;
 import nu.marginalia.util.language.processing.model.DocumentSentence;
 import nu.marginalia.util.language.processing.model.WordRep;
 import nu.marginalia.util.language.processing.model.tag.WordSeparator;
-import nu.marginalia.wmsa.edge.assistant.dict.NGramDict;
+import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict;
 import org.jsoup.nodes.Document;

 import java.io.FileNotFoundException;
@@ -30,7 +30,7 @@ public class DocumentDebugger {
     Path tempDir;
     public DocumentDebugger(LanguageModels lm) throws IOException {
         se = new SentenceExtractor(lm);
-        var dict = new NGramDict(lm);
+        var dict = new TermFrequencyDict(lm);
         ke = new KeywordExtractor();
         kc = new KeywordCounter(dict, ke);

View File

@@ -6,8 +6,9 @@ import java.nio.file.Path;
 @AllArgsConstructor
 public class LanguageModels {
-    public final Path ngramDictionary;
-    public final Path ngramFrequency;
+    public final Path ngramBloomFilter;
+    public final Path termFrequencies;
     public final Path openNLPSentenceDetectionData;
     public final Path posRules;
     public final Path posDict;

View File

@@ -4,7 +4,7 @@ import com.google.common.collect.Sets;
 import nu.marginalia.util.language.WordPatterns;
 import nu.marginalia.util.language.processing.model.DocumentLanguageData;
 import nu.marginalia.util.language.processing.model.WordRep;
-import nu.marginalia.wmsa.edge.assistant.dict.NGramDict;
+import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict;
 import nu.marginalia.wmsa.edge.index.model.IndexBlock;
 import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet;
 import nu.marginalia.wmsa.edge.model.crawl.EdgePageWords;
@@ -19,29 +19,54 @@ public class DocumentKeywordExtractor {
     private final KeywordExtractor keywordExtractor;
     private final KeywordCounter tfIdfCounter;
     private final NameCounter nameCounter;
-    private final LongNameCounter longNameCounter;
     private final SubjectCounter subjectCounter;

-    private final NGramDict dict;
+    private final TermFrequencyDict dict;
+    private final double docCount;

     @Inject
-    public DocumentKeywordExtractor(NGramDict dict) {
+    public DocumentKeywordExtractor(TermFrequencyDict dict) {
         this.dict = dict;
+        docCount = dict.docCount();

         keywordExtractor = new KeywordExtractor();
         tfIdfCounter = new KeywordCounter(dict, keywordExtractor);
         nameCounter = new NameCounter(keywordExtractor);
-        longNameCounter = new LongNameCounter(dict, keywordExtractor);
         subjectCounter = new SubjectCounter(keywordExtractor);
     }

+    public EdgePageWordSet extractKeywordsMinimal(DocumentLanguageData documentLanguageData) {
+        List<WordRep> titleWords = extractTitleWords(documentLanguageData);
+
+        KeywordCounter.WordHistogram wordsTfIdf = tfIdfCounter.countHisto(documentLanguageData);
+        List<WordRep> wordsNamesAll = nameCounter.count(documentLanguageData, 1);
+        List<WordRep> subjects = subjectCounter.count(documentLanguageData);
+
+        List<WordRep> midKeywords = new ArrayList<>(wordsTfIdf.mid());
+        List<WordRep> topKeywords = new ArrayList<>(wordsTfIdf.top());
+
+        Collection<String> artifacts = getArtifacts(documentLanguageData);
+
+        return new EdgePageWordSet(
+                createWords(IndexBlock.Subjects, subjects),
+                createWords(IndexBlock.Title, titleWords),
+                createWords(IndexBlock.NamesWords, wordsNamesAll),
+                createWords(IndexBlock.Tfidf_Top, topKeywords),
+                createWords(IndexBlock.Tfidf_Middle, midKeywords),
+                new EdgePageWords(IndexBlock.Artifacts, artifacts)
+        );
+    }
+
     public EdgePageWordSet extractKeywords(DocumentLanguageData documentLanguageData) {
         List<WordRep> titleWords = extractTitleWords(documentLanguageData);

         KeywordCounter.WordHistogram wordsTfIdf = tfIdfCounter.countHisto(documentLanguageData);
-        List<WordRep> wordsNamesRepeated = nameCounter.count(documentLanguageData, 2);
         List<WordRep> wordsNamesAll = nameCounter.count(documentLanguageData, 1);
         List<WordRep> subjects = subjectCounter.count(documentLanguageData);
@@ -49,12 +74,9 @@ public class DocumentKeywordExtractor {
         List<WordRep> midKeywords = new ArrayList<>(wordsTfIdf.mid());
         List<WordRep> topKeywords = new ArrayList<>(wordsTfIdf.top());

-        var wordsToMatchWithTitle = joinWordLists(topKeywords, wordsNamesRepeated, subjects);
-
         Collection<String> artifacts = getArtifacts(documentLanguageData);

         var wordSet = new EdgePageWordSet(
-                createWords(IndexBlock.TitleKeywords, overlappingStems(titleWords, wordsToMatchWithTitle)),
                 createWords(IndexBlock.Subjects, subjects),
                 createWords(IndexBlock.Title, titleWords),
                 createWords(IndexBlock.NamesWords, wordsNamesAll),
@@ -121,7 +143,7 @@ public class DocumentKeywordExtractor {
         else {
             lastSet = counts.entrySet().stream()
                     .sorted(Comparator.comparing(e -> {
-                        double N = 11820118.; // Number of documents in term freq dictionary
+                        double N = docCount; // Number of documents in term freq dictionary

                         // Caveat: This is actually the *negated* term score, because the second logarithm has
                         // its parameter inverted (log(a^b) = b log(a); here b = -1)

View File

@@ -2,7 +2,7 @@ package nu.marginalia.util.language.processing;
 import nu.marginalia.util.language.processing.model.DocumentLanguageData;
 import nu.marginalia.util.language.processing.model.WordRep;
-import nu.marginalia.wmsa.edge.assistant.dict.NGramDict;
+import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict;

 import java.util.HashMap;
 import java.util.HashSet;
@@ -12,11 +12,13 @@ import java.util.regex.Pattern;
 public class KeywordCounter {
     private final KeywordExtractor keywordExtractor;
-    private final NGramDict dict;
+    private final TermFrequencyDict dict;
+    private final double docCount;

-    public KeywordCounter(NGramDict dict, KeywordExtractor keywordExtractor) {
+    public KeywordCounter(TermFrequencyDict dict, KeywordExtractor keywordExtractor) {
         this.dict = dict;
         this.keywordExtractor = keywordExtractor;
+        this.docCount = (double) dict.docCount();
     }

     public WordHistogram countHisto(DocumentLanguageData dld) {
@@ -71,7 +73,7 @@ public class KeywordCounter {
         if (freq < 1) {
             freq = 10;
         }
-        return (0.1 + 0.9*value/maxValue) * Math.log((1.1+freq)/11820118.);
+        return (0.1 + 0.9*value/maxValue) * Math.log((1.1+freq)/docCount);
     }

     public record WordHistogram(Set<WordRep> lower, Set<WordRep> mid, Set<WordRep> top) { }
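For intuition about the change above: only the denominator of the logarithm moves from a hard-coded constant to the dictionary's document count, and rarer terms still produce more strongly negative scores. A toy illustration with invented numbers, not taken from the codebase:

    class TermScoreSketch {
        // Mirrors the scoring expression above; all inputs are hypothetical.
        static double score(double value, double maxValue, double freq, double docCount) {
            return (0.1 + 0.9*value/maxValue) * Math.log((1.1+freq)/docCount);
        }

        public static void main(String[] args) {
            double docCount = 11820118;                             // the old hard-coded corpus size
            System.out.println(score(4, 10, 100, docCount));        // rare term: strongly negative
            System.out.println(score(4, 10, 1_000_000, docCount));  // common term: closer to zero
        }
    }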

View File

@@ -1,9 +1,9 @@
 package nu.marginalia.util.language.processing;

+import nu.marginalia.util.language.WordPatterns;
 import nu.marginalia.util.language.processing.model.DocumentSentence;
 import nu.marginalia.util.language.processing.model.WordSpan;
 import nu.marginalia.util.language.processing.model.tag.WordSeparator;
-import nu.marginalia.util.language.WordPatterns;

 import java.lang.ref.SoftReference;
 import java.util.ArrayList;
@@ -377,4 +377,6 @@ public class KeywordExtractor {
         return (posTag.startsWith("JJ") || posTag.startsWith("R") || posTag.startsWith("VBG"));
     }
 }

View File

@@ -3,7 +3,7 @@ package nu.marginalia.util.language.processing;
 import nu.marginalia.util.language.processing.model.DocumentLanguageData;
 import nu.marginalia.util.language.processing.model.DocumentSentence;
 import nu.marginalia.util.language.processing.model.WordRep;
-import nu.marginalia.wmsa.edge.assistant.dict.NGramDict;
+import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict;

 import java.util.*;
 import java.util.regex.Pattern;
@@ -11,10 +11,11 @@ import java.util.stream.Collectors;
 public class LongNameCounter {
     private final KeywordExtractor keywordExtractor;

-    private final NGramDict dict;
+    private final TermFrequencyDict dict;
+    private final double docCount;

-    public LongNameCounter(NGramDict dict, KeywordExtractor keywordExtractor) {
+    public LongNameCounter(TermFrequencyDict dict, KeywordExtractor keywordExtractor) {
         this.dict = dict;
+        docCount = (double) dict.docCount();
         this.keywordExtractor = keywordExtractor;
     }

View File

@@ -87,7 +87,7 @@ public class WmsaHome {
         final Path home = getHomePath();

         return new LanguageModels(
-                home.resolve("model/ngrams-generous-emstr.bin"),
+                home.resolve("model/ngrams.bin"),
                 home.resolve("model/tfreq-new-algo3.bin"),
                 home.resolve("model/opennlp-sentence.bin"),
                 home.resolve("model/English.RDR"),

View File

@@ -0,0 +1,78 @@
+package nu.marginalia.wmsa.edge.assistant.dict;
+
+import ca.rmen.porterstemmer.PorterStemmer;
+import com.google.common.hash.HashFunction;
+import com.google.common.hash.Hashing;
+import com.google.inject.Inject;
+import nu.marginalia.util.DenseBitMap;
+import nu.marginalia.util.language.conf.LanguageModels;
+import nu.marginalia.wmsa.configuration.WmsaHome;
+import nu.marginalia.wmsa.edge.index.lexicon.journal.KeywordLexiconJournalFile;
+
+import java.io.File;
+import java.io.IOException;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Path;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.regex.Pattern;
+
+public class NGramBloomFilter {
+    private final DenseBitMap bitMap;
+    private static final PorterStemmer ps = new PorterStemmer();
+    private static final HashFunction hasher = Hashing.murmur3_128(0);
+
+    @Inject
+    public NGramBloomFilter() throws IOException {
+        this(WmsaHome.getLanguageModels());
+    }
+
+    public NGramBloomFilter(LanguageModels lm) throws IOException {
+        this(DenseBitMap.loadFromFile(lm.ngramBloomFilter));
+    }
+
+    public NGramBloomFilter(DenseBitMap bitMap) {
+        this.bitMap = bitMap;
+    }
+
+    public boolean isKnownNGram(String word) {
+        long bit = bitForWord(word, bitMap.cardinality);
+        return bitMap.get(bit);
+    }
+
+    public static void main(String... args) throws IOException {
+        var filter = convertFromDictionaryFile(new File(args[0]));
+        filter.bitMap.writeToFile(Path.of(args[1]));
+    }
+
+    public static NGramBloomFilter load(Path file) throws IOException {
+        return new NGramBloomFilter(DenseBitMap.loadFromFile(file));
+    }
+
+    public static NGramBloomFilter convertFromDictionaryFile(File file) throws IOException {
+        DenseBitMap bitMap = new DenseBitMap(1024*1024*1024L);
+        AtomicInteger popCount = new AtomicInteger();
+        try (var f = new KeywordLexiconJournalFile(file)) {
+            f.loadFile(data -> {
+                long bit = bitForWord(new String(data), bitMap.cardinality);
+                if (!bitMap.set(bit))
+                    popCount.incrementAndGet();
+            });
+        }
+        System.out.println("popcount = " + popCount.get());
+        return new NGramBloomFilter(bitMap);
+    }
+
+    private static final Pattern underscore = Pattern.compile("_");
+
+    private static long bitForWord(String s, long n) {
+        String[] parts = underscore.split(s);
+        long hc = 0;
+        for (String part : parts) {
+            hc = hc * 31 + hasher.hashString(ps.stemWord(part), StandardCharsets.UTF_8).padToLong();
+        }
+        return (hc & 0x7FFF_FFFF_FFFF_FFFFL) % n;
+    }
+}
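A brief usage sketch for the new filter; the path and terms are invented. With a single hash position per n-gram, the filter can return false positives but not false negatives for entries that were loaded:

    import java.nio.file.Path;
    import nu.marginalia.wmsa.edge.assistant.dict.NGramBloomFilter;

    class NGramBloomFilterUsage {
        public static void main(String[] args) throws Exception {
            var filter = NGramBloomFilter.load(Path.of("/where/the/model/lives/ngrams.bin")); // hypothetical path

            // Each underscore-separated part is stemmed before hashing, so inflected
            // variants of the same n-gram tend to map to the same bit.
            System.out.println(filter.isKnownNGram("mechanical_keyboard"));
            System.out.println(filter.isKnownNGram("mechanical_keyboards"));
        }
    }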

View File

@@ -9,7 +9,6 @@ import nu.marginalia.util.language.processing.model.DocumentLanguageData;
 import nu.marginalia.wmsa.configuration.WmsaHome;
 import nu.marginalia.wmsa.edge.converting.processor.logic.DomPruner;
 import nu.marginalia.wmsa.edge.crawling.CrawlPlanLoader;
-import opennlp.tools.langdetect.LanguageDetector;
 import org.jsoup.Jsoup;
 import org.jsoup.nodes.Document;
 import org.slf4j.Logger;
@@ -21,12 +20,17 @@ import javax.inject.Singleton;
 import java.io.*;
 import java.nio.file.Files;
 import java.nio.file.Path;
-import java.util.*;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Set;
+import java.util.concurrent.ForkJoinPool;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicInteger;
 import java.util.regex.Pattern;
 import java.util.stream.Collectors;

 @Singleton
-public class NGramDict {
+public class TermFrequencyDict {

     private final TLongIntHashMap wordRates = new TLongIntHashMap(1_000_000, 0.5f, 0, 0);
@@ -34,21 +38,22 @@ public class NGramDict {
     private static final Pattern separator = Pattern.compile("[_ ]+");
     private static final PorterStemmer ps = new PorterStemmer();

+    private static final long DOC_COUNT_KEY = ~0L;
+
     private static long fileSize(Path p) throws IOException {
         return Files.size(p);
     }

     @Inject
-    public NGramDict(@Nullable LanguageModels models) {
+    public TermFrequencyDict(@Nullable LanguageModels models) {
         if (models == null) {
             return;
         }

-        if (models.ngramFrequency != null) {
-            try (var frequencyData = new DataInputStream(new BufferedInputStream(new FileInputStream(models.ngramFrequency.toFile())))) {
+        if (models.termFrequencies != null) {
+            try (var frequencyData = new DataInputStream(new BufferedInputStream(new FileInputStream(models.termFrequencies.toFile())))) {

-                wordRates.ensureCapacity((int)(fileSize(models.ngramFrequency)/16));
+                wordRates.ensureCapacity((int)(fileSize(models.termFrequencies)/16));

                 for (;;) {
                     wordRates.put(frequencyData.readLong(), (int) frequencyData.readLong());
@@ -56,7 +61,7 @@ public class TermFrequencyDict {
             } catch (EOFException eof) {
                 // ok
             } catch (IOException e) {
-                logger.error("IO Exception reading " + models.ngramFrequency, e);
+                logger.error("IO Exception reading " + models.termFrequencies, e);
             }
         }

@@ -64,60 +69,100 @@ public class TermFrequencyDict {
     }

-    public static void main(String... args) throws IOException {
+    public int docCount() {
+        int cnt = wordRates.get(DOC_COUNT_KEY);
+
+        if (cnt == 0) {
+            cnt = 11820118; // legacy
+        }
+        return cnt;
+    }
+
+    public static void main(String... args) throws IOException, InterruptedException {
         if (args.length != 2) {
             System.err.println("Expected arguments: plan.yaml out-file");
         }
-        String inFile = args[0];
         String outFile = args[1];

         var plan = new CrawlPlanLoader().load(Path.of(args[0]));

-        SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels());
+        ThreadLocal<SentenceExtractor> se = ThreadLocal.withInitial(() -> new SentenceExtractor(WmsaHome.getLanguageModels()));

         DomPruner pruner = new DomPruner();
         LanguageFilter lf = new LanguageFilter();

-        Map<String, Integer> counts = new HashMap<>(100_000_000);
-        Set<String> words = new HashSet<>(10_000);
+        TLongIntHashMap counts = new TLongIntHashMap(100_000_000, 0.7f, -1, -1);
+
+        ForkJoinPool fjp = new ForkJoinPool(24);
+        AtomicInteger docCount = new AtomicInteger();

         for (var domain : plan.domainsIterable()) { // leaks file descriptor, is fine
             if (domain.doc == null)
                 continue;

-            for (var doc : domain.doc) {
-                if (doc.documentBody == null)
-                    continue;
-
-                Document parsed = Jsoup.parse(doc.documentBody);
-                pruner.prune(parsed, 0.5);
-
-                DocumentLanguageData dld = se.extractSentences(parsed);
-
-                if (lf.dictionaryAgreement(dld) < 0.1) {
-                    continue;
-                }
-
-                for (var sent : dld.sentences) {
-                    for (var word : sent) {
-                        words.add(word.stemmed());
-                    }
-                }
-
-                for (var word : words) {
-                    counts.merge(word, 1, Integer::sum);
-                }
-
-                words.clear();
-            }
-        }
-
-        counts.forEach((w,c) -> {
-            if (c > 3) {
-                System.out.println(w + ":" + c);
-            }
-        });
+            fjp.execute(() -> {
+                for (var doc : domain.doc) {
+                    if (doc.documentBody == null)
+                        continue;
+
+                    docCount.incrementAndGet();
+
+                    Document parsed = Jsoup.parse(doc.documentBody);
+                    pruner.prune(parsed, 0.5);
+
+                    DocumentLanguageData dld = se.get().extractSentences(parsed);
+
+                    if (lf.dictionaryAgreement(dld) < 0.1) {
+                        return;
+                    }
+
+                    Set<String> words = new HashSet<>(10_000);
+
+                    for (var sent : dld.sentences) {
+                        for (var word : sent) {
+                            words.add(word.stemmed());
+                        }
+                    }
+
+                    fjp.execute(() -> {
+                        synchronized (counts) {
+                            for (var word : words) {
+                                counts.adjustOrPutValue(longHash(word.getBytes()), 1, 1);
+                            }
+                        }
+                    });
+                }
+            });
+        }
+
+        fjp.shutdown();
+        fjp.awaitTermination(10, TimeUnit.SECONDS);
+
+        try (var dos = new DataOutputStream(Files.newOutputStream(Path.of(outFile)))) {
+            synchronized (counts) {
+                counts.put(DOC_COUNT_KEY, docCount.get());
+
+                counts.forEachEntry((hash, cnt) -> {
+                    try {
+                        dos.writeLong(hash);
+                        dos.writeLong(cnt);
+                    } catch (IOException e) {
+                        throw new RuntimeException(e);
+                    }
+                    return true;
+                });
+            }
+        }
+
+        System.out.println(docCount.get());
+//
+//        counts.forEachEntry((w,c) -> {
+//            if (c > 3L) {
+//                System.out.println(w + ":" + c);
+//            }
+//            return true;
+//        });
     }
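The exporter above writes the dictionary as a flat stream of (hash, count) pairs of longs, with the total document count stored under the ~0L sentinel key. A minimal reader sketch under that assumption; the file name is made up:

    import java.io.BufferedInputStream;
    import java.io.DataInputStream;
    import java.io.EOFException;
    import java.io.IOException;
    import java.nio.file.Files;
    import java.nio.file.Path;

    class TermFrequencyFileDump {
        public static void main(String[] args) throws IOException {
            try (var in = new DataInputStream(new BufferedInputStream(
                    Files.newInputStream(Path.of("tfreq.bin"))))) {
                for (;;) {
                    long hash = in.readLong();
                    long count = in.readLong();
                    if (hash == ~0L) {                    // DOC_COUNT_KEY
                        System.out.println("documents: " + count);
                    }
                }
            } catch (EOFException endOfStream) {
                // the pair stream simply ends at EOF, mirroring the constructor above
            }
        }
    }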

View File

@@ -2,8 +2,8 @@ package nu.marginalia.wmsa.edge.assistant.suggest;

 import com.google.inject.Inject;
 import com.google.inject.name.Named;
-import nu.marginalia.wmsa.edge.assistant.dict.NGramDict;
 import nu.marginalia.wmsa.edge.assistant.dict.SpellChecker;
+import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict;
 import nu.marginalia.wmsa.edge.converting.processor.logic.HtmlFeature;
 import org.apache.commons.collections4.trie.PatriciaTrie;
 import org.slf4j.Logger;
@@ -21,7 +21,7 @@ import java.util.stream.Stream;
 public class Suggestions {
     private final PatriciaTrie<String> suggestionsTrie;
-    private final NGramDict nGramDict;
+    private final TermFrequencyDict termFrequencyDict;
     private final SpellChecker spellChecker;

     private static final Pattern suggestionPattern = Pattern.compile("^[a-zA-Z0-9]+( [a-zA-Z0-9]+)*$");
@@ -31,12 +31,12 @@ public class Suggestions {
     @Inject
     public Suggestions(@Named("suggestions-file") Path suggestionsFile,
                        SpellChecker spellChecker,
-                       NGramDict dict
+                       TermFrequencyDict dict
                        ) {
         this.spellChecker = spellChecker;

         suggestionsTrie = loadSuggestions(suggestionsFile);
-        nGramDict = dict;
+        termFrequencyDict = dict;

         logger.info("Loaded {} suggestions", suggestionsTrie.size());
     }
@@ -138,7 +138,7 @@ public class Suggestions {
         }

         Map<String, Long> scach = new HashMap<>(512);
-        Function<String, Long> valr = s -> -nGramDict.getTermFreqHash(scach.computeIfAbsent(s, NGramDict::getStringHash));
+        Function<String, Long> valr = s -> -termFrequencyDict.getTermFreqHash(scach.computeIfAbsent(s, TermFrequencyDict::getStringHash));

         return Stream.iterate(start.getKey(), Objects::nonNull, suggestionsTrie::nextKey)
                 .takeWhile(s -> s.startsWith(prefix))

View File

@@ -6,7 +6,7 @@ import lombok.SneakyThrows;
 import nu.marginalia.util.DenseBitMap;
 import nu.marginalia.util.language.WordPatterns;
 import nu.marginalia.wmsa.configuration.WmsaHome;
-import nu.marginalia.wmsa.edge.assistant.dict.NGramDict;
+import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict;
 import nu.marginalia.wmsa.edge.converting.processor.logic.LinkParser;
 import nu.marginalia.wmsa.edge.model.EdgeUrl;
 import org.apache.logging.log4j.util.Strings;
@@ -36,7 +36,7 @@ public class AnchorTextExtractor {
     // de-duplicating billions of shuffled (url, word) tuples on limited hardware
     private final DenseBitMap deduplicateHashBitset = new DenseBitMap(DenseBitMap.MAX_CAPACITY_2GB_16BN_ITEMS);

-    private final NGramDict ngramDict = new NGramDict(WmsaHome.getLanguageModels());
+    private final TermFrequencyDict ngramDict = new TermFrequencyDict(WmsaHome.getLanguageModels());

     public AnchorTextExtractor(Predicate<String> includeDomainPredicate,
                                Predicate<EdgeUrl> includeUrlPredicate,

View File

@@ -17,6 +17,7 @@ public class DisqualifiedException extends Exception {
         LANGUAGE,
         STATUS,
         QUALITY,
-        ACCEPTABLE_ADS
+        ACCEPTABLE_ADS,
+        FORBIDDEN
     }
 }

View File

@@ -86,10 +86,6 @@ public class DocumentProcessor {
         if (isAcceptedContentType(crawledDocument)) {
             var detailsWords = createDetails(crawledDomain, crawledDocument);

-            if (detailsWords.details().quality < minDocumentQuality) {
-                throw new DisqualifiedException(DisqualificationReason.QUALITY);
-            }
-
             ret.details = detailsWords.details();
             ret.words = detailsWords.words();
         }
@@ -141,11 +137,14 @@ public class DocumentProcessor {
     private DetailsWithWords createDetails(CrawledDomain crawledDomain, CrawledDocument crawledDocument)
             throws DisqualifiedException, URISyntaxException {

-        var doc = Jsoup.parse(crawledDocument.documentBody);
+        Document doc = Jsoup.parse(crawledDocument.documentBody);

         if (AcceptableAds.hasAcceptableAdsTag(doc)) {
             throw new DisqualifiedException(DisqualificationReason.ACCEPTABLE_ADS);
         }
+        if (doc.select("meta[name=robots]").attr("content").contains("noindex")) {
+            throw new DisqualifiedException(DisqualificationReason.FORBIDDEN);
+        }

         DomPruner domPruner = new DomPruner();
         Document prunedDoc = doc.clone();
@@ -160,11 +159,17 @@
         ret.length = getLength(doc);
         ret.standard = getHtmlStandard(doc);
         ret.title = titleExtractor.getTitleAbbreviated(doc, dld, crawledDocument.url);
-        ret.features = featureExtractor.getFeatures(crawledDomain, doc);
+        ret.features = featureExtractor.getFeatures(crawledDomain, doc, dld);
         ret.quality = documentValuator.getQuality(ret.standard, doc, dld);
         ret.hashCode = HashCode.fromString(crawledDocument.documentBodyHash).asLong();

-        var words = getWords(dld);
+        EdgePageWordSet words;
+        if (ret.quality < minDocumentQuality || dld.totalNumWords() < minDocumentLength) {
+            words = keywordExtractor.extractKeywordsMinimal(dld);
+        }
+        else {
+            words = keywordExtractor.extractKeywords(dld);
+        }

         var url = new EdgeUrl(crawledDocument.url);
         addMetaWords(ret, url, crawledDomain, words);
@@ -195,7 +200,6 @@
         ret.features.stream().map(HtmlFeature::getKeyword).forEach(tagWords::add);

         words.append(IndexBlock.Meta, tagWords);
-        words.append(IndexBlock.Words_1, tagWords);
     }

     private void getLinks(EdgeUrl baseUrl, ProcessedDocumentDetails ret, Document doc, EdgePageWordSet words) {
@@ -255,7 +259,6 @@
     private void synthesizeFilenameKeyword(Set<String> fileKeywords, EdgeUrl link) {
         Path pFilename = Path.of(link.path.toLowerCase()).getFileName();

         if (pFilename == null) return;
@@ -273,10 +276,6 @@
     }

     private void checkDocumentLanguage(DocumentLanguageData dld) throws DisqualifiedException {
-        if (dld.totalNumWords() < minDocumentLength) {
-            throw new DisqualifiedException(DisqualificationReason.LENGTH);
-        }
-
         double languageAgreement = languageFilter.dictionaryAgreement(dld);
         if (languageAgreement < 0.1) {
             throw new DisqualifiedException(DisqualificationReason.LANGUAGE);
@@ -292,10 +291,6 @@
         return htmlStandard;
     }

-    private EdgePageWordSet getWords(DocumentLanguageData dld) {
-        return keywordExtractor.extractKeywords(dld);
-    }
-
     private String getDescription(Document doc) {
         return summaryExtractor.extractSummary(doc);
     }

View File

@@ -6,13 +6,13 @@ import nu.marginalia.wmsa.edge.converting.model.ProcessedDocument;
 import nu.marginalia.wmsa.edge.converting.model.ProcessedDomain;
 import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain;
 import nu.marginalia.wmsa.edge.crawling.model.CrawlerDomainStatus;
+import nu.marginalia.wmsa.edge.index.model.IndexBlock;
 import nu.marginalia.wmsa.edge.model.EdgeDomain;
 import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState;
 import nu.marginalia.wmsa.edge.model.crawl.EdgeUrlState;

-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.List;
+import java.util.*;
+import java.util.stream.Collectors;

 public class DomainProcessor {
     private final DocumentProcessor documentProcessor;
@@ -45,6 +45,8 @@ public class DomainProcessor {
                     ret.documents.add(processedDoc);
                 }
             }
+
+            addCommonSiteWords(ret);
         }
         else {
             ret.documents = Collections.emptyList();
@@ -60,6 +62,40 @@
         return ret;
     }

+    private void addCommonSiteWords(ProcessedDomain ret) {
+
+        if (ret.documents.size() < 25)
+            return;
+
+        Map<String, Integer> topKeywordCount = new HashMap<>(ret.documents.size()*10);
+
+        for (var doc : ret.documents) {
+            if (doc.words == null)
+                continue;
+
+            for (var word : doc.words.get(IndexBlock.Tfidf_Top).words) {
+                topKeywordCount.merge(word, -1, Integer::sum);
+            }
+        }
+
+        if (topKeywordCount.values().stream().mapToInt(i -> i).sum() > -100)
+            return;
+
+        Set<String> topWords = topKeywordCount.entrySet().stream()
+                .filter(e -> e.getValue() < -10)
+                .sorted(Map.Entry.comparingByValue()).limit(5)
+                .map(Map.Entry::getKey)
+                .collect(Collectors.toSet());
+
+        if (!topWords.isEmpty()) {
+            for (var doc : ret.documents) {
+                if (doc.words != null) {
+                    doc.words.get(IndexBlock.Site).addAll(topWords);
+                }
+            }
+        }
+    }
+
     private double getAverageQuality(List<ProcessedDocument> documents) {
         int n = 0;
         double q = 0.;

View File

@@ -4,7 +4,7 @@ import org.jsoup.nodes.Document;
 import org.jsoup.nodes.Element;
 import org.jsoup.nodes.Node;
 import org.jsoup.nodes.TextNode;
-import org.jsoup.select.NodeVisitor;
+import org.jsoup.select.NodeFilter;

 import java.util.HashMap;
 import java.util.Map;
@@ -12,100 +12,103 @@ import java.util.Map;
 public class DomPruner {

     public void prune(Document document, double pruneThreshold) {
-        PruningVisitor pruningVisitor = new PruningVisitor();
-        document.traverse(pruningVisitor);
-
-        pruningVisitor.data.forEach((node, data) -> {
-            if (data.depth <= 1) {
-                return;
-            }
-            if (data.signalNodeSize == 0) node.remove();
-            else if (data.noiseNodeSize > 0
-                    && data.signalRate() < pruneThreshold
-                    && data.treeSize > 3) {
-                node.remove();
-            }
-        });
+        document.filter(new PruningFilter(pruneThreshold));
     }
+}

-    private static class PruningVisitor implements NodeVisitor {
-        private final Map<Node, NodeData> data = new HashMap<>();
-        private final NodeData dummy = new NodeData(Integer.MAX_VALUE, 1, 0);
-
-        @Override
-        public void head(Node node, int depth) {}
-
-        @Override
-        public void tail(Node node, int depth) {
-            final NodeData dataForNode;
-
-            if (node instanceof TextNode tn) {
-                dataForNode = new NodeData(depth, tn.text().length(), 0);
-            }
-            else if (isSignal(node)) {
-                dataForNode = new NodeData(depth, 0, 0);
-                for (var childNode : node.childNodes()) {
-                    dataForNode.add(data.getOrDefault(childNode, dummy));
-                }
-            }
-            else {
-                dataForNode = new NodeData(depth, 0, 0);
-                for (var childNode : node.childNodes()) {
-                    dataForNode.addAsNoise(data.getOrDefault(childNode, dummy));
-                }
-            }
-
-            data.put(node, dataForNode);
-        }
-
-        public boolean isSignal(Node node) {
-            if (node instanceof Element e) {
-                if ("a".equalsIgnoreCase(e.tagName()))
-                    return false;
-                if ("nav".equalsIgnoreCase(e.tagName()))
-                    return false;
-                if ("footer".equalsIgnoreCase(e.tagName()))
-                    return false;
-                if ("header".equalsIgnoreCase(e.tagName()))
-                    return false;
-            }
-            return true;
-        }
-    }
-
-    private static class NodeData {
-        int signalNodeSize;
-        int noiseNodeSize;
-        int treeSize = 1;
-        int depth;
-
-        private NodeData(int depth, int signalNodeSize, int noiseNodeSize) {
-            this.depth = depth;
-            this.signalNodeSize = signalNodeSize;
-            this.noiseNodeSize = noiseNodeSize;
-        }
-
-        public void add(NodeData other) {
-            signalNodeSize += other.signalNodeSize;
-            noiseNodeSize += other.noiseNodeSize;
-            treeSize += other.treeSize;
-        }
-
-        public void addAsNoise(NodeData other) {
-            noiseNodeSize += other.noiseNodeSize + other.signalNodeSize;
-            treeSize += other.treeSize;
-        }
-
-        public double signalRate() {
-            return signalNodeSize / (double)(signalNodeSize + noiseNodeSize);
-        }
-    }
-}
+class PruningFilter implements NodeFilter {
+    private final Map<Node, NodeData> data = new HashMap<>();
+    private final NodeData dummy = new NodeData(Integer.MAX_VALUE, 1, 0);
+    private double pruneThreshold;
+
+    public PruningFilter(double pruneThreshold) {
+        this.pruneThreshold = pruneThreshold;
+    }
+
+    @Override
+    public FilterResult head(Node node, int depth) {
+        return FilterResult.CONTINUE;
+    }
+
+    @Override
+    public FilterResult tail(Node node, int depth) {
+        final NodeData dataForNode;
+
+        if (node instanceof TextNode tn) {
+            dataForNode = new NodeData(depth, tn.text().length(), 0);
+        }
+        else if (isSignal(node)) {
+            dataForNode = new NodeData(depth, 0, 0);
+            for (var childNode : node.childNodes()) {
+                dataForNode.add(data.getOrDefault(childNode, dummy));
+            }
+        }
+        else {
+            dataForNode = new NodeData(depth, 0, 0);
+            for (var childNode : node.childNodes()) {
+                dataForNode.addAsNoise(data.getOrDefault(childNode, dummy));
+            }
+        }
+
+        data.put(node, dataForNode);
+
+        if (dataForNode.depth <= 1)
+            return FilterResult.CONTINUE;
+
+        if (dataForNode.signalNodeSize == 0)
+            return FilterResult.REMOVE;
+
+        if (dataForNode.noiseNodeSize > 0
+                && dataForNode.signalRate() < pruneThreshold
+                && dataForNode.treeSize > 3)
+            return FilterResult.REMOVE;
+
+        return FilterResult.CONTINUE;
+    }
+
+    public boolean isSignal(Node node) {
+        if (node instanceof Element e) {
+            if ("a".equalsIgnoreCase(e.tagName()))
+                return false;
+            if ("nav".equalsIgnoreCase(e.tagName()))
+                return false;
+            if ("footer".equalsIgnoreCase(e.tagName()))
+                return false;
+            if ("header".equalsIgnoreCase(e.tagName()))
+                return false;
+        }
+        return true;
+    }
+}
+
+class NodeData {
+    int signalNodeSize;
+    int noiseNodeSize;
+    int treeSize = 1;
+    int depth;
+
+    NodeData(int depth, int signalNodeSize, int noiseNodeSize) {
+        this.depth = depth;
+        this.signalNodeSize = signalNodeSize;
+        this.noiseNodeSize = noiseNodeSize;
+    }
+
+    public void add(NodeData other) {
+        signalNodeSize += other.signalNodeSize;
+        noiseNodeSize += other.noiseNodeSize;
+        treeSize += other.treeSize;
+    }
+
+    public void addAsNoise(NodeData other) {
+        noiseNodeSize += other.noiseNodeSize + other.signalNodeSize;
+        treeSize += other.treeSize;
+    }
+
+    public double signalRate() {
+        return signalNodeSize / (double)(signalNodeSize + noiseNodeSize);
+    }
+}
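A short usage sketch for the rewritten pruner; the HTML is hand-written for illustration. Subtrees whose text is dominated by link, nav, header or footer content below the threshold are removed during the single filter pass:

    import nu.marginalia.wmsa.edge.converting.processor.logic.DomPruner;
    import org.jsoup.Jsoup;

    class DomPrunerUsage {
        public static void main(String[] args) {
            var doc = Jsoup.parse(
                "<html><body>" +
                "<nav><a href='/'>home</a> <a href='/about'>about</a> <a href='/faq'>faq</a></nav>" +
                "<article><p>A long run of actual article text that should survive pruning.</p></article>" +
                "</body></html>");

            new DomPruner().prune(doc, 0.5);

            System.out.println(doc.body().html()); // the nav block is expected to be pruned away
        }
    }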

View File

@@ -2,7 +2,11 @@ package nu.marginalia.wmsa.edge.converting.processor.logic;

 import com.google.inject.Inject;
 import com.google.inject.Singleton;
+import nu.marginalia.util.language.processing.model.DocumentLanguageData;
 import nu.marginalia.wmsa.edge.converting.processor.logic.topic.AdblockSimulator;
+import nu.marginalia.wmsa.edge.converting.processor.logic.topic.RecipeDetector;
+import nu.marginalia.wmsa.edge.converting.processor.logic.topic.TextileCraftDetector;
+import nu.marginalia.wmsa.edge.converting.processor.logic.topic.WoodworkingDetector;
 import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain;
 import org.jsoup.nodes.Document;
 import org.jsoup.nodes.Element;
@@ -35,14 +39,20 @@ public class FeatureExtractor {
             "d31qbv1cthcecs.cloudfront.net",
             "linkedin.com");

-    private AdblockSimulator adblockSimulator;
+    private final AdblockSimulator adblockSimulator;
+    private final RecipeDetector recipeDetector;
+    private final TextileCraftDetector textileCraftDetector;
+    private final WoodworkingDetector woodworkingDetector;

     @Inject
-    public FeatureExtractor(AdblockSimulator adblockSimulator) {
+    public FeatureExtractor(AdblockSimulator adblockSimulator, RecipeDetector recipeDetector, TextileCraftDetector textileCraftDetector, WoodworkingDetector woodworkingDetector) {
         this.adblockSimulator = adblockSimulator;
+        this.recipeDetector = recipeDetector;
+        this.textileCraftDetector = textileCraftDetector;
+        this.woodworkingDetector = woodworkingDetector;
     }

-    public Set<HtmlFeature> getFeatures(CrawledDomain domain, Document doc) {
+    public Set<HtmlFeature> getFeatures(CrawledDomain domain, Document doc, DocumentLanguageData dld) {
         final Set<HtmlFeature> features = new HashSet<>();

         final Elements scriptTags = doc.getElementsByTag("script");
@@ -81,9 +91,14 @@ public class FeatureExtractor {
             }
         }

-        if (!domain.cookies.isEmpty()) {
+        if (!domain.cookies.isEmpty())
             features.add(HtmlFeature.COOKIES);
-        }
+
+        if (recipeDetector.testP(dld) > 0.5)
+            features.add(HtmlFeature.CATEGORY_FOOD);
+        // these should be mutually exclusive
+        else if (woodworkingDetector.testP(dld) > 0.3 || textileCraftDetector.testP(dld) > 0.3)
+            features.add(HtmlFeature.CATEGORY_CRAFTS);

         return features;
     }

View File

@@ -12,6 +12,8 @@ public enum HtmlFeature {
     CATEGORY_FOOD("category:food"),

     ADVERTISEMENT("special:ads"),
+
+    CATEGORY_CRAFTS("category:crafts"),
     ;

     private final String keyword;

View File

@@ -1,6 +1,7 @@
 package nu.marginalia.wmsa.edge.converting.processor.logic.topic;

 import ca.rmen.porterstemmer.PorterStemmer;
+import com.google.inject.Inject;
 import nu.marginalia.util.language.processing.model.DocumentLanguageData;

 import java.util.HashMap;
@@ -14,6 +15,7 @@ public class RecipeDetector {

     private final Map<String, Double> termValues = new HashMap<>();

+    @Inject
     public RecipeDetector() {
         PorterStemmer ps = new PorterStemmer();

View File

@@ -1,6 +1,7 @@
 package nu.marginalia.wmsa.edge.converting.processor.logic.topic;

 import ca.rmen.porterstemmer.PorterStemmer;
+import com.google.inject.Inject;
 import nu.marginalia.util.language.processing.model.DocumentLanguageData;

 import java.util.HashMap;
@@ -14,6 +15,7 @@ public class TextileCraftDetector {

     private final Map<String, Double> termValues = new HashMap<>();

+    @Inject
     public TextileCraftDetector() {
         PorterStemmer ps = new PorterStemmer();

View File

@@ -1,6 +1,7 @@
 package nu.marginalia.wmsa.edge.converting.processor.logic.topic;

 import ca.rmen.porterstemmer.PorterStemmer;
+import com.google.inject.Inject;
 import nu.marginalia.util.language.processing.model.DocumentLanguageData;

 import java.util.HashMap;
@@ -14,6 +15,7 @@ public class WoodworkingDetector {

     private final Map<String, Double> termValues = new HashMap<>();

+    @Inject
     public WoodworkingDetector() {
         PorterStemmer ps = new PorterStemmer();

View File

@@ -15,7 +15,7 @@ import java.util.concurrent.locks.ReadWriteLock;
 import java.util.concurrent.locks.ReentrantReadWriteLock;
 import java.util.function.Consumer;

-public class KeywordLexiconJournalFile {
+public class KeywordLexiconJournalFile implements AutoCloseable {
     private final RandomAccessFile journalFileRAF;
     private final File journalFile;
     private final Logger logger = LoggerFactory.getLogger(getClass());

View File

@@ -4,22 +4,24 @@ public enum IndexBlock {
     TitleKeywords(0, 0),
     Title(1, 1),

-    Link(2, 1.25),
+    Link(2, 1.15),

-    Subjects(3, 0.5),
-    NamesWords(4, 5),
+    Subjects(3, 3.0),
+    NamesWords(4, 3.0),
     Artifacts(5, 10),
     Meta(6, 7),

-    Tfidf_Top(7, 2),
-    Tfidf_Middle(8, 2.5),
-    Tfidf_Lower(9, 5.0),
+    Tfidf_Top(7, 0.5),
+    Tfidf_Middle(8, 1.25),
+    Tfidf_Lower(9, 1.5),

     Words_1(10, 3.0),
     Words_2(11, 3.5),
     Words_4(12, 4.0),
     Words_8(13, 4.5),
     Words_16Plus(14, 7.0),
+
+    Site(15, 1.2),
     ;

     public final int id;

View File

@@ -29,7 +29,6 @@ public class SearchIndexReader implements AutoCloseable {
             IndexBlock.Tfidf_Top,
             IndexBlock.Tfidf_Middle,
             IndexBlock.Tfidf_Lower,
-            IndexBlock.NamesWords,
             IndexBlock.Words_1,
             IndexBlock.Words_2,
             IndexBlock.Words_4,
@@ -62,15 +61,14 @@ public class SearchIndexReader implements AutoCloseable {
         queryBuilders = new EnumMap<>(IndexBlock.class);
         underspecifiedQueryBuilders = new EnumMap<>(IndexBlock.class);

-        queryBuilders.put(IndexBlock.Words_1, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, titleIndex, topIndex, words1), words1));
-        queryBuilders.put(IndexBlock.Words_2, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, titleIndex, topIndex, words2), words1));
-        queryBuilders.put(IndexBlock.Words_4, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, titleIndex, topIndex, words4), words1));
-        queryBuilders.put(IndexBlock.Words_8, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, titleIndex, topIndex, words8), words1));
-        queryBuilders.put(IndexBlock.Words_16Plus, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, titleIndex, topIndex, words1, words2, words4, words8, words16, artifacts), words1));
+        queryBuilders.put(IndexBlock.Words_1, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleIndex, topIndex, words1), words1));
+        queryBuilders.put(IndexBlock.Words_2, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleIndex, topIndex, words2), words1));
+        queryBuilders.put(IndexBlock.Words_4, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleIndex, topIndex, words4), words1));
+        queryBuilders.put(IndexBlock.Words_8, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleIndex, topIndex, words8), words1));
+        queryBuilders.put(IndexBlock.Words_16Plus, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleIndex, topIndex, words1, words2, words4, words8, words16, artifacts), words1));

-        underspecifiedQueryBuilders.put(IndexBlock.TitleKeywords, new IndexQueryBuilder(listOfNonNulls(titleKeywordsIndex, linkIndex, topicIndex, topIndex, midIndex, lowIndex, namesIndex, metaIndex), words1));
-        underspecifiedQueryBuilders.put(IndexBlock.Title, new IndexQueryBuilder(listOfNonNulls(titleIndex, topicIndex, linkIndex, topicIndex, topIndex, midIndex, lowIndex, namesIndex, metaIndex), words1));
-        underspecifiedQueryBuilders.put(IndexBlock.Tfidf_Top, new IndexQueryBuilder(listOfNonNulls(topIndex, linkIndex, midIndex, lowIndex, namesIndex, metaIndex), words1));
+        underspecifiedQueryBuilders.put(IndexBlock.Title, new IndexQueryBuilder(listOfNonNulls(titleIndex, linkIndex, topIndex, namesIndex, topicIndex, metaIndex), words1));
+        underspecifiedQueryBuilders.put(IndexBlock.Tfidf_Top, new IndexQueryBuilder(listOfNonNulls(linkIndex, namesIndex, topIndex, midIndex, lowIndex, topicIndex, metaIndex), words1));
     }

     @SafeVarargs

View File

@@ -19,7 +19,8 @@ public class EdgePageWordSet {
     public EdgePageWords get(IndexBlock block) {
         var words = wordSets.get(block);
         if (words == null) {
-            return new EdgePageWords(block);
+            words = new EdgePageWords(block);
+            wordSets.put(block, words);
         }
         return words;
     }

View File

@@ -1,7 +1,7 @@
 package nu.marginalia.wmsa.edge.search.query;

 import com.google.inject.Inject;
-import nu.marginalia.wmsa.edge.assistant.dict.NGramDict;
+import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

@@ -13,11 +13,11 @@ import java.util.stream.Collectors;
 public class EnglishDictionary {
     private final Set<String> englishWords = new HashSet<>();
-    private final NGramDict dict;
+    private final TermFrequencyDict dict;
     private final Logger logger = LoggerFactory.getLogger(getClass());

     @Inject
-    public EnglishDictionary(NGramDict dict) {
+    public EnglishDictionary(TermFrequencyDict dict) {
         this.dict = dict;

         try (var resource = Objects.requireNonNull(ClassLoader.getSystemResourceAsStream("dictionary/en-words"),
                 "Could not load word frequency table");

View File

@@ -4,7 +4,8 @@ import com.google.inject.Inject;
 import com.google.inject.Singleton;
 import nu.marginalia.util.language.WordPatterns;
 import nu.marginalia.util.language.conf.LanguageModels;
-import nu.marginalia.wmsa.edge.assistant.dict.NGramDict;
+import nu.marginalia.wmsa.edge.assistant.dict.NGramBloomFilter;
+import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict;
 import nu.marginalia.wmsa.edge.index.model.IndexBlock;
 import nu.marginalia.wmsa.edge.model.search.EdgeSearchSpecification;
 import nu.marginalia.wmsa.edge.model.search.EdgeSearchSubquery;
@@ -22,20 +23,22 @@
 public class QueryFactory {

     private final LanguageModels lm;
-    private final NGramDict dict;
+    private final TermFrequencyDict dict;
     private final EnglishDictionary englishDictionary;
+    private final NGramBloomFilter nGramBloomFilter;
     private final Logger logger = LoggerFactory.getLogger(getClass());

     @Inject
-    public QueryFactory(LanguageModels lm, NGramDict dict, EnglishDictionary englishDictionary) {
+    public QueryFactory(LanguageModels lm, TermFrequencyDict dict, EnglishDictionary englishDictionary, NGramBloomFilter nGramBloomFilter) {
         this.lm = lm;
         this.dict = dict;
         this.englishDictionary = englishDictionary;
+        this.nGramBloomFilter = nGramBloomFilter;
     }

     public QueryParser getParser() {
-        return new QueryParser(englishDictionary, new QueryVariants(lm ,dict, englishDictionary));
+        return new QueryParser(englishDictionary, new QueryVariants(lm ,dict, nGramBloomFilter, englishDictionary));
     }

     public EdgeSearchQuery createQuery(EdgeUserSearchParameters params) {

View File

@@ -10,7 +10,8 @@ import nu.marginalia.util.language.processing.KeywordExtractor;
 import nu.marginalia.util.language.processing.SentenceExtractor;
 import nu.marginalia.util.language.processing.model.DocumentSentence;
 import nu.marginalia.util.language.processing.model.WordSpan;
-import nu.marginalia.wmsa.edge.assistant.dict.NGramDict;
+import nu.marginalia.wmsa.edge.assistant.dict.NGramBloomFilter;
+import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict;
 import opennlp.tools.stemmer.PorterStemmer;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -24,14 +25,18 @@ public class QueryVariants {
     private final Logger logger = LoggerFactory.getLogger(getClass());
     private final KeywordExtractor keywordExtractor;
     private final SentenceExtractor sentenceExtractor;
-    private final NGramDict dict;
+    private final TermFrequencyDict dict;
     private final PorterStemmer ps = new PorterStemmer();

-    private final static int MAX_NGRAM_LENGTH = 4;
+    private final NGramBloomFilter nGramBloomFilter;
     private final EnglishDictionary englishDictionary;

     @Inject
-    public QueryVariants(LanguageModels lm, NGramDict dict, EnglishDictionary englishDictionary) {
+    public QueryVariants(LanguageModels lm,
+                         TermFrequencyDict dict,
+                         NGramBloomFilter nGramBloomFilter,
+                         EnglishDictionary englishDictionary) {
+        this.nGramBloomFilter = nGramBloomFilter;
         this.englishDictionary = englishDictionary;
         this.keywordExtractor = new KeywordExtractor();
         this.sentenceExtractor = new SentenceExtractor(lm);
@@ -154,11 +159,11 @@ public class QueryVariants {
             double q = 0;
             for (var word : lst) {
                 String[] parts = underscore.split(word);
-                StringJoiner combined = new StringJoiner("_");
+                double qp = 0;
                 for (String part : parts) {
-                    combined.add(ps.stem(part));
+                    qp += 1./(1+ dict.getTermFreq(part));
                 }
-                q += Math.log(1 + dict.getTermFreqStemmed(combined.toString()));
+                q += 1.0 / qp;
             }
             ret.add(new QueryVariant(lst, q));
         }
@@ -215,8 +220,8 @@ public class QueryVariants {
             while (wordMatcher.find(ws) && stemmedMatcher.find(ss)) {
                 ws = wordMatcher.start()+1;
                 ss = stemmedMatcher.start()+1;
-                if (dict.getTermFreqStemmed(splitAtNumBoundaryAndStem(span.word, stemmedMatcher.start(), "_")) > 0
-                        || dict.getTermFreqStemmed(splitAtNumBoundaryAndStem(span.word, stemmedMatcher.start(), "-")) > 0)
+                if (nGramBloomFilter.isKnownNGram(splitAtNumBoundary(span.word, stemmedMatcher.start(), "_"))
+                        || nGramBloomFilter.isKnownNGram(splitAtNumBoundary(span.word, stemmedMatcher.start(), "-")))
                 {
                     String combined = splitAtNumBoundary(span.word, wordMatcher.start(), "_");
                     asTokens2.add(combined);
@@ -242,7 +247,7 @@ public class QueryVariants {
         for (var span : ls) {
             var matcher = dashBoundary.matcher(span.word);
-            if (matcher.find() && dict.getTermFreqStemmed(ps.stem(dashBoundary.matcher(span.word).replaceAll(""))) > 0) {
+            if (matcher.find() && nGramBloomFilter.isKnownNGram(ps.stem(dashBoundary.matcher(span.word).replaceAll("")))) {
                 dash = true;
                 String combined = dashBoundary.matcher(span.word).replaceAll("");
                 asTokens2.add(combined);
@@ -262,10 +267,6 @@ public class QueryVariants {
         return in.substring(0, splitPoint+1) + joiner + in.substring(splitPoint+1);
     }

-    private String splitAtNumBoundaryAndStem(String in, int splitPoint, String joiner) {
-        return ps.stem(in.substring(0, splitPoint+1)) + joiner + ps.stem(in.substring(splitPoint+1));
-    }
-
     private List<List<Word>> getWordSpans(TreeMap<Integer, List<WordSpan>> byStart, DocumentSentence sentence, List<ArrayList<WordSpan>> livingSpans) {
         List<List<Word>> goodSpans = new ArrayList<>();
         for (int i = 0; i < sentence.length(); i++) {

View File

@@ -3,7 +3,7 @@ package nu.marginalia.wmsa.edge.search.results;
import com.google.inject.Inject; import com.google.inject.Inject;
import com.google.inject.Singleton; import com.google.inject.Singleton;
import nu.marginalia.util.language.WordPatterns; import nu.marginalia.util.language.WordPatterns;
import nu.marginalia.wmsa.edge.assistant.dict.NGramDict; import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict;
import nu.marginalia.wmsa.edge.index.model.IndexBlock; import nu.marginalia.wmsa.edge.index.model.IndexBlock;
import nu.marginalia.wmsa.edge.model.search.EdgeSearchResultKeywordScore; import nu.marginalia.wmsa.edge.model.search.EdgeSearchResultKeywordScore;
@@ -12,7 +12,7 @@ import java.util.regex.Pattern;
@Singleton @Singleton
public class SearchResultValuator { public class SearchResultValuator {
private final NGramDict dict; private final TermFrequencyDict dict;
private static final Pattern separator = Pattern.compile("_"); private static final Pattern separator = Pattern.compile("_");
@@ -20,7 +20,7 @@ public class SearchResultValuator {
private static final int AVG_LENGTH = 1400; private static final int AVG_LENGTH = 1400;
@Inject @Inject
public SearchResultValuator(NGramDict dict) { public SearchResultValuator(TermFrequencyDict dict) {
this.dict = dict; this.dict = dict;
} }

View File

@@ -2,7 +2,7 @@
<html> <html>
<head> <head>
<meta charset="UTF-8"> <meta charset="UTF-8">
<title>Marginalia Search}</title> <title>Marginalia Search</title>
<link rel="stylesheet" href="/style-new.css" /> <link rel="stylesheet" href="/style-new.css" />
<link rel="search" type="application/opensearchdescription+xml" href="/opensearch.xml" title="Marginalia"> <link rel="search" type="application/opensearchdescription+xml" href="/opensearch.xml" title="Marginalia">

View File

@@ -1,7 +1,7 @@
package nu.marginalia.util; package nu.marginalia.util;
import nu.marginalia.wmsa.configuration.WmsaHome;
import nu.marginalia.util.language.conf.LanguageModels; import nu.marginalia.util.language.conf.LanguageModels;
import nu.marginalia.wmsa.configuration.WmsaHome;
import java.nio.file.Files; import java.nio.file.Files;
import java.nio.file.Path; import java.nio.file.Path;
@@ -26,7 +26,7 @@ public class TestLanguageModels {
var languageModelsHome = getLanguageModelsPath(); var languageModelsHome = getLanguageModelsPath();
return new LanguageModels( return new LanguageModels(
languageModelsHome.resolve("ngrams-generous-emstr.bin"), languageModelsHome.resolve("ngrams.bin"),
languageModelsHome.resolve("tfreq-generous-emstr.bin"), languageModelsHome.resolve("tfreq-generous-emstr.bin"),
languageModelsHome.resolve("opennlp-sentence.bin"), languageModelsHome.resolve("opennlp-sentence.bin"),
languageModelsHome.resolve("English.RDR"), languageModelsHome.resolve("English.RDR"),

View File

@@ -1,9 +1,9 @@
package nu.marginalia.wmsa.edge.assistant.suggest; package nu.marginalia.wmsa.edge.assistant.suggest;
import nu.marginalia.util.TestLanguageModels; import nu.marginalia.util.TestLanguageModels;
import nu.marginalia.wmsa.edge.assistant.dict.NGramDict;
import nu.marginalia.wmsa.edge.assistant.dict.SpellChecker;
import nu.marginalia.util.language.conf.LanguageModels; import nu.marginalia.util.language.conf.LanguageModels;
import nu.marginalia.wmsa.edge.assistant.dict.SpellChecker;
import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict;
import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test; import org.junit.jupiter.api.Test;
@@ -17,7 +17,7 @@ class SuggestionsTest {
public static void setUp() { public static void setUp() {
LanguageModels lm = TestLanguageModels.getLanguageModels(); LanguageModels lm = TestLanguageModels.getLanguageModels();
suggestions = new Suggestions(Path.of("/home/vlofgren/Work/sql-titles-clean"), suggestions = new Suggestions(Path.of("/home/vlofgren/Work/sql-titles-clean"),
new SpellChecker(), new NGramDict(lm)); new SpellChecker(), new TermFrequencyDict(lm));
} }
@Test @Test

View File

@@ -10,7 +10,7 @@ import nu.marginalia.util.language.processing.SentenceExtractor;
import nu.marginalia.util.language.processing.model.WordRep; import nu.marginalia.util.language.processing.model.WordRep;
import nu.marginalia.util.language.processing.model.WordSpan; import nu.marginalia.util.language.processing.model.WordSpan;
import nu.marginalia.util.language.processing.model.tag.WordSeparator; import nu.marginalia.util.language.processing.model.tag.WordSeparator;
import nu.marginalia.wmsa.edge.assistant.dict.NGramDict; import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict;
import nu.marginalia.wmsa.edge.integration.wikipedia.WikipediaReader; import nu.marginalia.wmsa.edge.integration.wikipedia.WikipediaReader;
import nu.marginalia.wmsa.edge.model.EdgeDomain; import nu.marginalia.wmsa.edge.model.EdgeDomain;
import org.jsoup.Jsoup; import org.jsoup.Jsoup;
@@ -45,7 +45,7 @@ class SentenceExtractorTest {
System.out.println("Running"); System.out.println("Running");
var dict = new NGramDict(lm); var dict = new TermFrequencyDict(lm);
SentenceExtractor se = new SentenceExtractor(lm); SentenceExtractor se = new SentenceExtractor(lm);
KeywordExtractor keywordExtractor = new KeywordExtractor(); KeywordExtractor keywordExtractor = new KeywordExtractor();
@@ -85,7 +85,7 @@ class SentenceExtractorTest {
System.out.println("Running"); System.out.println("Running");
var dict = new NGramDict(lm); var dict = new TermFrequencyDict(lm);
DocumentKeywordExtractor documentKeywordExtractor = new DocumentKeywordExtractor(dict); DocumentKeywordExtractor documentKeywordExtractor = new DocumentKeywordExtractor(dict);
@@ -110,7 +110,7 @@ class SentenceExtractorTest {
System.out.println("Running"); System.out.println("Running");
var dict = new NGramDict(lm); var dict = new TermFrequencyDict(lm);
DocumentKeywordExtractor documentKeywordExtractor = new DocumentKeywordExtractor(dict); DocumentKeywordExtractor documentKeywordExtractor = new DocumentKeywordExtractor(dict);
@@ -154,7 +154,7 @@ class SentenceExtractorTest {
public void testSE() { public void testSE() {
var result = newSe.extractSentences(Jsoup.parse(new URL("https://memex.marginalia.nu/log/26-personalized-pagerank.gmi"), 10000)); var result = newSe.extractSentences(Jsoup.parse(new URL("https://memex.marginalia.nu/log/26-personalized-pagerank.gmi"), 10000));
var dict = new NGramDict(lm); var dict = new TermFrequencyDict(lm);
System.out.println(new DocumentKeywordExtractor(dict).extractKeywords(result)); System.out.println(new DocumentKeywordExtractor(dict).extractKeywords(result));

View File

@@ -1,10 +1,10 @@
package nu.marginalia.wmsa.edge.integration.arxiv; package nu.marginalia.wmsa.edge.integration.arxiv;
import nu.marginalia.util.TestLanguageModels; import nu.marginalia.util.TestLanguageModels;
import nu.marginalia.wmsa.edge.assistant.dict.NGramDict;
import nu.marginalia.util.language.conf.LanguageModels; import nu.marginalia.util.language.conf.LanguageModels;
import nu.marginalia.util.language.processing.DocumentKeywordExtractor; import nu.marginalia.util.language.processing.DocumentKeywordExtractor;
import nu.marginalia.util.language.processing.SentenceExtractor; import nu.marginalia.util.language.processing.SentenceExtractor;
import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict;
import nu.marginalia.wmsa.edge.integration.arxiv.model.ArxivMetadata; import nu.marginalia.wmsa.edge.integration.arxiv.model.ArxivMetadata;
import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test; import org.junit.jupiter.api.Test;
@@ -26,7 +26,7 @@ class ArxivParserTest {
@Test @Test
void extractKeywords() throws IOException { void extractKeywords() throws IOException {
var dict = new NGramDict(lm); var dict = new TermFrequencyDict(lm);
DocumentKeywordExtractor documentKeywordExtractor = new DocumentKeywordExtractor(dict); DocumentKeywordExtractor documentKeywordExtractor = new DocumentKeywordExtractor(dict);

View File

@@ -1,11 +1,11 @@
package nu.marginalia.wmsa.edge.integration.stackoverflow; package nu.marginalia.wmsa.edge.integration.stackoverflow;
import nu.marginalia.util.ParallelPipe;
import nu.marginalia.util.TestLanguageModels; import nu.marginalia.util.TestLanguageModels;
import nu.marginalia.wmsa.edge.assistant.dict.NGramDict;
import nu.marginalia.util.language.conf.LanguageModels; import nu.marginalia.util.language.conf.LanguageModels;
import nu.marginalia.util.language.processing.DocumentKeywordExtractor; import nu.marginalia.util.language.processing.DocumentKeywordExtractor;
import nu.marginalia.util.language.processing.SentenceExtractor; import nu.marginalia.util.language.processing.SentenceExtractor;
import nu.marginalia.util.ParallelPipe; import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict;
import nu.marginalia.wmsa.edge.integration.model.BasicDocumentData; import nu.marginalia.wmsa.edge.integration.model.BasicDocumentData;
import nu.marginalia.wmsa.edge.integration.stackoverflow.model.StackOverflowPost; import nu.marginalia.wmsa.edge.integration.stackoverflow.model.StackOverflowPost;
import nu.marginalia.wmsa.edge.model.EdgeDomain; import nu.marginalia.wmsa.edge.model.EdgeDomain;
@@ -20,7 +20,7 @@ public class StackOverflowPostsTest {
@Test @Disabled("this is stupidly slow") @Test @Disabled("this is stupidly slow")
public void test() throws ParserConfigurationException, SAXException, InterruptedException { public void test() throws ParserConfigurationException, SAXException, InterruptedException {
var documentKeywordExtractor = new DocumentKeywordExtractor(new NGramDict(lm)); var documentKeywordExtractor = new DocumentKeywordExtractor(new TermFrequencyDict(lm));
ThreadLocal<StackOverflowPostProcessor> processor = ThreadLocal.withInitial(() -> { ThreadLocal<StackOverflowPostProcessor> processor = ThreadLocal.withInitial(() -> {
return new StackOverflowPostProcessor(new SentenceExtractor(lm), documentKeywordExtractor); return new StackOverflowPostProcessor(new SentenceExtractor(lm), documentKeywordExtractor);

View File

@@ -1,13 +1,13 @@
package nu.marginalia.wmsa.edge.integration.wikipedia; package nu.marginalia.wmsa.edge.integration.wikipedia;
import lombok.SneakyThrows; import lombok.SneakyThrows;
import nu.marginalia.util.ParallelPipe;
import nu.marginalia.util.TestLanguageModels; import nu.marginalia.util.TestLanguageModels;
import nu.marginalia.wmsa.edge.assistant.dict.NGramDict;
import nu.marginalia.util.language.DocumentDebugger; import nu.marginalia.util.language.DocumentDebugger;
import nu.marginalia.util.language.conf.LanguageModels; import nu.marginalia.util.language.conf.LanguageModels;
import nu.marginalia.util.language.processing.DocumentKeywordExtractor; import nu.marginalia.util.language.processing.DocumentKeywordExtractor;
import nu.marginalia.util.language.processing.SentenceExtractor; import nu.marginalia.util.language.processing.SentenceExtractor;
import nu.marginalia.util.ParallelPipe; import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict;
import nu.marginalia.wmsa.edge.integration.model.BasicDocumentData; import nu.marginalia.wmsa.edge.integration.model.BasicDocumentData;
import nu.marginalia.wmsa.edge.integration.wikipedia.model.WikipediaArticle; import nu.marginalia.wmsa.edge.integration.wikipedia.model.WikipediaArticle;
import nu.marginalia.wmsa.edge.model.EdgeDomain; import nu.marginalia.wmsa.edge.model.EdgeDomain;
@@ -21,7 +21,7 @@ public class WikipediaTest {
@Test @SneakyThrows @Test @SneakyThrows
public void test() { public void test() {
var documentKeywordExtractor = new DocumentKeywordExtractor(new NGramDict(lm)); var documentKeywordExtractor = new DocumentKeywordExtractor(new TermFrequencyDict(lm));
ThreadLocal<WikipediaProcessor> processor = ThreadLocal.withInitial(() -> { ThreadLocal<WikipediaProcessor> processor = ThreadLocal.withInitial(() -> {
return new WikipediaProcessor(new SentenceExtractor(lm), documentKeywordExtractor); return new WikipediaProcessor(new SentenceExtractor(lm), documentKeywordExtractor);
}); });
@@ -48,7 +48,7 @@ public class WikipediaTest {
@Test @SneakyThrows @Test @SneakyThrows
public void test2() { public void test2() {
var documentKeywordExtractor = new DocumentKeywordExtractor(new NGramDict(lm)); var documentKeywordExtractor = new DocumentKeywordExtractor(new TermFrequencyDict(lm));
var debugger = new DocumentDebugger(lm); var debugger = new DocumentDebugger(lm);
ThreadLocal<WikipediaProcessor> processor = ThreadLocal.withInitial(() -> { ThreadLocal<WikipediaProcessor> processor = ThreadLocal.withInitial(() -> {

View File

@@ -1,31 +1,35 @@
package nu.marginalia.wmsa.edge.search.query; package nu.marginalia.wmsa.edge.search.query;
import nu.marginalia.util.TestLanguageModels; import nu.marginalia.util.TestLanguageModels;
import nu.marginalia.wmsa.edge.assistant.dict.NGramDict;
import nu.marginalia.util.language.conf.LanguageModels; import nu.marginalia.util.language.conf.LanguageModels;
import nu.marginalia.wmsa.edge.assistant.dict.NGramBloomFilter;
import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict;
import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test; import org.junit.jupiter.api.Test;
import java.io.IOException;
import java.util.List; import java.util.List;
import static org.junit.jupiter.api.Assertions.*; import static org.junit.jupiter.api.Assertions.assertEquals;
class BodyQueryParserTest { class BodyQueryParserTest {
private QueryParser parser; private QueryParser parser;
private static NGramDict dict; private static TermFrequencyDict dict;
private static EnglishDictionary englishDictionary; private static EnglishDictionary englishDictionary;
private static NGramBloomFilter nGramBloomFilter;
private static final LanguageModels lm = TestLanguageModels.getLanguageModels(); private static final LanguageModels lm = TestLanguageModels.getLanguageModels();
@BeforeAll @BeforeAll
public static void init() { public static void init() throws IOException {
dict = new NGramDict(lm); dict = new TermFrequencyDict(lm);
nGramBloomFilter = new NGramBloomFilter(lm);
englishDictionary = new EnglishDictionary(dict); englishDictionary = new EnglishDictionary(dict);
} }
@BeforeEach @BeforeEach
public void setUp() { public void setUp() {
parser = new QueryParser(englishDictionary, new QueryVariants(lm, dict, englishDictionary)); parser = new QueryParser(englishDictionary, new QueryVariants(lm, dict, nGramBloomFilter, englishDictionary));
} }
@Test @Test

View File

@@ -1,8 +1,8 @@
package nu.marginalia.wmsa.edge.search.query; package nu.marginalia.wmsa.edge.search.query;
import nu.marginalia.util.TestLanguageModels; import nu.marginalia.util.TestLanguageModels;
import nu.marginalia.wmsa.edge.assistant.dict.NGramDict;
import nu.marginalia.util.language.conf.LanguageModels; import nu.marginalia.util.language.conf.LanguageModels;
import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict;
import org.junit.jupiter.api.Test; import org.junit.jupiter.api.Test;
class EnglishDictionaryTest { class EnglishDictionaryTest {
@@ -11,7 +11,7 @@ class EnglishDictionaryTest {
void getWordVariants() { void getWordVariants() {
LanguageModels lm = TestLanguageModels.getLanguageModels(); LanguageModels lm = TestLanguageModels.getLanguageModels();
var dict = new NGramDict(lm); var dict = new TermFrequencyDict(lm);
new EnglishDictionary(dict).getWordVariants("dos").forEach(System.out::println); new EnglishDictionary(dict).getWordVariants("dos").forEach(System.out::println);
} }
} }

View File

@@ -1,25 +1,29 @@
package nu.marginalia.wmsa.edge.search.query; package nu.marginalia.wmsa.edge.search.query;
import nu.marginalia.util.TestLanguageModels; import nu.marginalia.util.TestLanguageModels;
import nu.marginalia.wmsa.edge.assistant.dict.NGramDict;
import nu.marginalia.util.language.conf.LanguageModels; import nu.marginalia.util.language.conf.LanguageModels;
import nu.marginalia.wmsa.edge.assistant.dict.NGramBloomFilter;
import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict;
import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test; import org.junit.jupiter.api.Test;
import java.io.IOException;
import java.util.stream.Collectors; import java.util.stream.Collectors;
class QueryParserTest { class QueryParserTest {
private QueryParser parser; private QueryParser parser;
private static NGramDict dict; private static TermFrequencyDict dict;
private static EnglishDictionary englishDictionary; private static EnglishDictionary englishDictionary;
private static NGramBloomFilter nGramBloomFilter;
private static final LanguageModels lm = TestLanguageModels.getLanguageModels(); private static final LanguageModels lm = TestLanguageModels.getLanguageModels();
@BeforeEach @BeforeEach
public void setUp() { public void setUp() throws IOException {
dict = new NGramDict(lm); dict = new TermFrequencyDict(lm);
nGramBloomFilter = new NGramBloomFilter(lm);
englishDictionary = new EnglishDictionary(dict); englishDictionary = new EnglishDictionary(dict);
parser = new QueryParser(englishDictionary, new QueryVariants(lm, dict, englishDictionary)); parser = new QueryParser(englishDictionary, new QueryVariants(lm, dict, nGramBloomFilter, englishDictionary));
} }
@Test @Test

View File

@@ -3,23 +3,27 @@ package nu.marginalia.wmsa.edge.search.query;
import nu.marginalia.util.TestLanguageModels; import nu.marginalia.util.TestLanguageModels;
import nu.marginalia.util.language.conf.LanguageModels; import nu.marginalia.util.language.conf.LanguageModels;
import nu.marginalia.util.language.processing.SentenceExtractor; import nu.marginalia.util.language.processing.SentenceExtractor;
import nu.marginalia.wmsa.edge.assistant.dict.NGramDict; import nu.marginalia.wmsa.edge.assistant.dict.NGramBloomFilter;
import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict;
import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test; import org.junit.jupiter.api.Test;
import java.io.IOException;
class QueryVariantsTest { class QueryVariantsTest {
QueryVariants variants; QueryVariants variants;
QueryParser parser; QueryParser parser;
SentenceExtractor se; SentenceExtractor se;
@BeforeEach @BeforeEach
public void setUp() { public void setUp() throws IOException {
LanguageModels lm = TestLanguageModels.getLanguageModels(); LanguageModels lm = TestLanguageModels.getLanguageModels();
se = new SentenceExtractor(lm); se = new SentenceExtractor(lm);
var dict = new NGramDict(lm); var dict = new TermFrequencyDict(lm);
variants = new QueryVariants(lm, dict, new EnglishDictionary(dict)); var ngrams = new NGramBloomFilter(lm);
variants = new QueryVariants(lm, dict, ngrams, new EnglishDictionary(dict));
parser = new QueryParser(new EnglishDictionary(dict), variants); parser = new QueryParser(new EnglishDictionary(dict), variants);
} }