Tweaks for search result relevance

This commit is contained in: parent 3f2854a5e9 · commit 813399401e
DenseBitMap.java

@@ -1,6 +1,10 @@
 package nu.marginalia.util;
 
+import java.io.IOException;
 import java.nio.ByteBuffer;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.StandardOpenOption;
 
 public class DenseBitMap {
     public static final long MAX_CAPACITY_2GB_16BN_ITEMS=(1L<<34)-8;
@@ -15,6 +19,31 @@ public class DenseBitMap {
         this.buffer = ByteBuffer.allocateDirect((int)((cardinality / 8) + (misaligned ? 1 : 0)));
     }
 
+    public static DenseBitMap loadFromFile(Path file) throws IOException {
+        long size = Files.size(file);
+        var dbm = new DenseBitMap(size/8);
+
+        try (var bc = Files.newByteChannel(file)) {
+            while (dbm.buffer.position() < dbm.buffer.capacity()) {
+                bc.read(dbm.buffer);
+            }
+        }
+        dbm.buffer.clear();
+
+        return dbm;
+    }
+
+    public void writeToFile(Path file) throws IOException {
+
+        try (var bc = Files.newByteChannel(file, StandardOpenOption.CREATE, StandardOpenOption.WRITE)) {
+            while (buffer.position() < buffer.capacity()) {
+                bc.write(buffer);
+            }
+        }
+
+        buffer.clear();
+    }
+
     public boolean get(long pos) {
         return (buffer.get((int)(pos >>> 3)) & ((byte)1 << (int)(pos & 7))) != 0;
     }
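Note on the new persistence methods and the existing get(): a bit position is addressed by byte index (pos >>> 3) and bit-within-byte (pos & 7). A minimal standalone sketch of that addressing against a plain byte[] (illustration only, not code from the commit):

    // Illustration only: the bit addressing DenseBitMap.get() uses, shown on a plain byte[].
    // pos >>> 3 selects the containing byte, pos & 7 the bit inside it.
    public class BitAddressingDemo {
        public static void main(String[] args) {
            byte[] bits = new byte[2];                     // room for 16 bits
            long pos = 11;                                 // byte 1, bit 3
            bits[(int)(pos >>> 3)] |= (byte)(1 << (int)(pos & 7));                        // set
            boolean isSet = (bits[(int)(pos >>> 3)] & ((byte)1 << (int)(pos & 7))) != 0;  // get
            System.out.println(isSet);                     // prints true
        }
    }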
DocumentDebugger.java

@@ -8,7 +8,7 @@ import nu.marginalia.util.language.processing.SentenceExtractor;
 import nu.marginalia.util.language.processing.model.DocumentSentence;
 import nu.marginalia.util.language.processing.model.WordRep;
 import nu.marginalia.util.language.processing.model.tag.WordSeparator;
-import nu.marginalia.wmsa.edge.assistant.dict.NGramDict;
+import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict;
 import org.jsoup.nodes.Document;
 
 import java.io.FileNotFoundException;
@@ -30,7 +30,7 @@ public class DocumentDebugger {
     Path tempDir;
     public DocumentDebugger(LanguageModels lm) throws IOException {
         se = new SentenceExtractor(lm);
-        var dict = new NGramDict(lm);
+        var dict = new TermFrequencyDict(lm);
         ke = new KeywordExtractor();
 
         kc = new KeywordCounter(dict, ke);
LanguageModels.java

@@ -6,8 +6,9 @@ import java.nio.file.Path;
 
 @AllArgsConstructor
 public class LanguageModels {
-    public final Path ngramDictionary;
-    public final Path ngramFrequency;
+    public final Path ngramBloomFilter;
+    public final Path termFrequencies;
 
     public final Path openNLPSentenceDetectionData;
     public final Path posRules;
     public final Path posDict;
DocumentKeywordExtractor.java

@@ -4,7 +4,7 @@ import com.google.common.collect.Sets;
 import nu.marginalia.util.language.WordPatterns;
 import nu.marginalia.util.language.processing.model.DocumentLanguageData;
 import nu.marginalia.util.language.processing.model.WordRep;
-import nu.marginalia.wmsa.edge.assistant.dict.NGramDict;
+import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict;
 import nu.marginalia.wmsa.edge.index.model.IndexBlock;
 import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet;
 import nu.marginalia.wmsa.edge.model.crawl.EdgePageWords;
@@ -19,29 +19,54 @@ public class DocumentKeywordExtractor {
     private final KeywordExtractor keywordExtractor;
     private final KeywordCounter tfIdfCounter;
     private final NameCounter nameCounter;
-    private final LongNameCounter longNameCounter;
     private final SubjectCounter subjectCounter;
 
-    private final NGramDict dict;
+    private final TermFrequencyDict dict;
+    private final double docCount;
 
     @Inject
-    public DocumentKeywordExtractor(NGramDict dict) {
+    public DocumentKeywordExtractor(TermFrequencyDict dict) {
         this.dict = dict;
+        docCount = dict.docCount();
 
         keywordExtractor = new KeywordExtractor();
 
         tfIdfCounter = new KeywordCounter(dict, keywordExtractor);
         nameCounter = new NameCounter(keywordExtractor);
-        longNameCounter = new LongNameCounter(dict, keywordExtractor);
         subjectCounter = new SubjectCounter(keywordExtractor);
     }
 
 
+    public EdgePageWordSet extractKeywordsMinimal(DocumentLanguageData documentLanguageData) {
+
+        List<WordRep> titleWords = extractTitleWords(documentLanguageData);
+
+        KeywordCounter.WordHistogram wordsTfIdf = tfIdfCounter.countHisto(documentLanguageData);
+        List<WordRep> wordsNamesAll = nameCounter.count(documentLanguageData, 1);
+        List<WordRep> subjects = subjectCounter.count(documentLanguageData);
+
+        List<WordRep> midKeywords = new ArrayList<>(wordsTfIdf.mid());
+        List<WordRep> topKeywords = new ArrayList<>(wordsTfIdf.top());
+
+        Collection<String> artifacts = getArtifacts(documentLanguageData);
+
+        return new EdgePageWordSet(
+                createWords(IndexBlock.Subjects, subjects),
+                createWords(IndexBlock.Title, titleWords),
+                createWords(IndexBlock.NamesWords, wordsNamesAll),
+                createWords(IndexBlock.Tfidf_Top, topKeywords),
+                createWords(IndexBlock.Tfidf_Middle, midKeywords),
+                new EdgePageWords(IndexBlock.Artifacts, artifacts)
+        );
+    }
+
+
     public EdgePageWordSet extractKeywords(DocumentLanguageData documentLanguageData) {
 
         List<WordRep> titleWords = extractTitleWords(documentLanguageData);
 
         KeywordCounter.WordHistogram wordsTfIdf = tfIdfCounter.countHisto(documentLanguageData);
-        List<WordRep> wordsNamesRepeated = nameCounter.count(documentLanguageData, 2);
         List<WordRep> wordsNamesAll = nameCounter.count(documentLanguageData, 1);
         List<WordRep> subjects = subjectCounter.count(documentLanguageData);
 
@@ -49,12 +74,9 @@ public class DocumentKeywordExtractor {
         List<WordRep> midKeywords = new ArrayList<>(wordsTfIdf.mid());
         List<WordRep> topKeywords = new ArrayList<>(wordsTfIdf.top());
 
-        var wordsToMatchWithTitle = joinWordLists(topKeywords, wordsNamesRepeated, subjects);
-
         Collection<String> artifacts = getArtifacts(documentLanguageData);
 
         var wordSet = new EdgePageWordSet(
-                createWords(IndexBlock.TitleKeywords, overlappingStems(titleWords, wordsToMatchWithTitle)),
                 createWords(IndexBlock.Subjects, subjects),
                 createWords(IndexBlock.Title, titleWords),
                 createWords(IndexBlock.NamesWords, wordsNamesAll),
@@ -121,7 +143,7 @@ public class DocumentKeywordExtractor {
         else {
             lastSet = counts.entrySet().stream()
                     .sorted(Comparator.comparing(e -> {
-                        double N = 11820118.; // Number of documents in term freq dictionary
+                        double N = docCount; // Number of documents in term freq dictionary
 
                         // Caveat: This is actually the *negated* term score, because the second logarithm has
                         // its parameter inverted (log(a^b) = b log(a); here b = -1)
KeywordCounter.java

@@ -2,7 +2,7 @@ package nu.marginalia.util.language.processing;
 
 import nu.marginalia.util.language.processing.model.DocumentLanguageData;
 import nu.marginalia.util.language.processing.model.WordRep;
-import nu.marginalia.wmsa.edge.assistant.dict.NGramDict;
+import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict;
 
 import java.util.HashMap;
 import java.util.HashSet;
@@ -12,11 +12,13 @@ import java.util.regex.Pattern;
 
 public class KeywordCounter {
     private final KeywordExtractor keywordExtractor;
-    private final NGramDict dict;
+    private final TermFrequencyDict dict;
+    private final double docCount;
 
-    public KeywordCounter(NGramDict dict, KeywordExtractor keywordExtractor) {
+    public KeywordCounter(TermFrequencyDict dict, KeywordExtractor keywordExtractor) {
         this.dict = dict;
         this.keywordExtractor = keywordExtractor;
+        this.docCount = (double) dict.docCount();
     }
 
     public WordHistogram countHisto(DocumentLanguageData dld) {
@@ -71,7 +73,7 @@ public class KeywordCounter {
         if (freq < 1) {
             freq = 10;
         }
-        return (0.1 + 0.9*value/maxValue) * Math.log((1.1+freq)/11820118.);
+        return (0.1 + 0.9*value/maxValue) * Math.log((1.1+freq)/docCount);
     }
 
     public record WordHistogram(Set<WordRep> lower, Set<WordRep> mid, Set<WordRep> top) { }
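Note on the scoring change above: the hard-coded corpus size 11820118 is replaced with docCount read from the dictionary. Math.log((1.1+freq)/docCount) is negative whenever a term is rarer than the corpus is large, so rarer terms score more strongly negative; as the caveat in DocumentKeywordExtractor says, the value is a negated term score. A small worked illustration (the numbers are made up, and the rounded outputs are approximate):

    // Illustration only: how the KeywordCounter term score behaves for a common vs. a rare term.
    // value/maxValue stand in for the in-document counts; docCount would come from
    // TermFrequencyDict.docCount().
    public class TermScoreDemo {
        static double score(double value, double maxValue, double freq, double docCount) {
            return (0.1 + 0.9 * value / maxValue) * Math.log((1.1 + freq) / docCount);
        }
        public static void main(String[] args) {
            double docCount = 11_820_118;                          // legacy fallback corpus size
            System.out.println(score(3, 5, 2_000_000, docCount));  // common term: roughly -1.1
            System.out.println(score(3, 5, 40, docCount));         // rare term:   roughly -8.0
        }
    }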
KeywordExtractor.java

@@ -1,9 +1,9 @@
 package nu.marginalia.util.language.processing;
 
+import nu.marginalia.util.language.WordPatterns;
 import nu.marginalia.util.language.processing.model.DocumentSentence;
 import nu.marginalia.util.language.processing.model.WordSpan;
 import nu.marginalia.util.language.processing.model.tag.WordSeparator;
-import nu.marginalia.util.language.WordPatterns;
 
 import java.lang.ref.SoftReference;
 import java.util.ArrayList;
@@ -377,4 +377,6 @@ public class KeywordExtractor {
 
         return (posTag.startsWith("JJ") || posTag.startsWith("R") || posTag.startsWith("VBG"));
     }
+
+
 }
LongNameCounter.java

@@ -3,7 +3,7 @@ package nu.marginalia.util.language.processing;
 import nu.marginalia.util.language.processing.model.DocumentLanguageData;
 import nu.marginalia.util.language.processing.model.DocumentSentence;
 import nu.marginalia.util.language.processing.model.WordRep;
-import nu.marginalia.wmsa.edge.assistant.dict.NGramDict;
+import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict;
 
 import java.util.*;
 import java.util.regex.Pattern;
@@ -11,10 +11,11 @@ import java.util.stream.Collectors;
 
 public class LongNameCounter {
     private final KeywordExtractor keywordExtractor;
-    private final NGramDict dict;
+    private final TermFrequencyDict dict;
+    private final double docCount;
-    public LongNameCounter(NGramDict dict, KeywordExtractor keywordExtractor) {
+    public LongNameCounter(TermFrequencyDict dict, KeywordExtractor keywordExtractor) {
         this.dict = dict;
+        docCount = (double) dict.docCount();
         this.keywordExtractor = keywordExtractor;
     }
 
WmsaHome.java

@@ -87,7 +87,7 @@ public class WmsaHome {
         final Path home = getHomePath();
 
         return new LanguageModels(
-                home.resolve("model/ngrams-generous-emstr.bin"),
+                home.resolve("model/ngrams.bin"),
                 home.resolve("model/tfreq-new-algo3.bin"),
                 home.resolve("model/opennlp-sentence.bin"),
                 home.resolve("model/English.RDR"),
NGramBloomFilter.java (new file)

@@ -0,0 +1,78 @@
+package nu.marginalia.wmsa.edge.assistant.dict;
+
+import ca.rmen.porterstemmer.PorterStemmer;
+import com.google.common.hash.HashFunction;
+import com.google.common.hash.Hashing;
+import com.google.inject.Inject;
+import nu.marginalia.util.DenseBitMap;
+import nu.marginalia.util.language.conf.LanguageModels;
+import nu.marginalia.wmsa.configuration.WmsaHome;
+import nu.marginalia.wmsa.edge.index.lexicon.journal.KeywordLexiconJournalFile;
+
+import java.io.File;
+import java.io.IOException;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Path;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.regex.Pattern;
+
+public class NGramBloomFilter {
+    private final DenseBitMap bitMap;
+    private static final PorterStemmer ps = new PorterStemmer();
+    private static final HashFunction hasher = Hashing.murmur3_128(0);
+
+    @Inject
+    public NGramBloomFilter() throws IOException {
+        this(WmsaHome.getLanguageModels());
+    }
+
+    public NGramBloomFilter(LanguageModels lm) throws IOException {
+        this(DenseBitMap.loadFromFile(lm.ngramBloomFilter));
+    }
+
+    public NGramBloomFilter(DenseBitMap bitMap) {
+        this.bitMap = bitMap;
+    }
+
+    public boolean isKnownNGram(String word) {
+        long bit = bitForWord(word, bitMap.cardinality);
+
+        return bitMap.get(bit);
+    }
+
+    public static void main(String... args) throws IOException {
+        var filter = convertFromDictionaryFile(new File(args[0]));
+        filter.bitMap.writeToFile(Path.of(args[1]));
+    }
+
+    public static NGramBloomFilter load(Path file) throws IOException {
+        return new NGramBloomFilter(DenseBitMap.loadFromFile(file));
+    }
+
+    public static NGramBloomFilter convertFromDictionaryFile(File file) throws IOException {
+        DenseBitMap bitMap = new DenseBitMap(1024*1024*1024L);
+        AtomicInteger popCount = new AtomicInteger();
+        try (var f = new KeywordLexiconJournalFile(file)) {
+            f.loadFile(data -> {
+                long bit = bitForWord(new String(data), bitMap.cardinality);
+                if (!bitMap.set(bit))
+                    popCount.incrementAndGet();
+            });
+        }
+
+        System.out.println("popcount = " + popCount.get());
+        return new NGramBloomFilter(bitMap);
+    }
+
+    private static final Pattern underscore = Pattern.compile("_");
+
+    private static long bitForWord(String s, long n) {
+        String[] parts = underscore.split(s);
+        long hc = 0;
+        for (String part : parts) {
+            hc = hc * 31 + hasher.hashString(ps.stemWord(part), StandardCharsets.UTF_8).padToLong();
+        }
+        return (hc & 0x7FFF_FFFF_FFFF_FFFFL) % n;
+    }
+
+}
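Note on the new filter: bitForWord() splits a term on underscores, stems each part, folds the murmur3 hashes together with a *31 accumulator, and maps the result onto the bitmap with a modulo; isKnownNGram() then just tests that single bit. A hedged usage sketch (the example n-grams are placeholders; it assumes the language models are installed under the WMSA home as elsewhere in this commit):

    import nu.marginalia.wmsa.configuration.WmsaHome;
    import nu.marginalia.wmsa.edge.assistant.dict.NGramBloomFilter;

    // Usage sketch only. Multi-word terms use '_' as the separator, matching bitForWord().
    public class NGramBloomFilterDemo {
        public static void main(String[] args) throws java.io.IOException {
            NGramBloomFilter filter = new NGramBloomFilter(WmsaHome.getLanguageModels());

            System.out.println(filter.isKnownNGram("search_engine"));
            System.out.println(filter.isKnownNGram("qwzxv_blorp"));   // almost certainly false

            // As with any Bloom-style filter, "true" can be a false positive from a hash
            // collision; "false" is definite: the n-gram was not in the dictionary file.
        }
    }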
TermFrequencyDict.java (class renamed from NGramDict)

@@ -9,7 +9,6 @@ import nu.marginalia.util.language.processing.model.DocumentLanguageData;
 import nu.marginalia.wmsa.configuration.WmsaHome;
 import nu.marginalia.wmsa.edge.converting.processor.logic.DomPruner;
 import nu.marginalia.wmsa.edge.crawling.CrawlPlanLoader;
-import opennlp.tools.langdetect.LanguageDetector;
 import org.jsoup.Jsoup;
 import org.jsoup.nodes.Document;
 import org.slf4j.Logger;
@@ -21,12 +20,17 @@ import javax.inject.Singleton;
 import java.io.*;
 import java.nio.file.Files;
 import java.nio.file.Path;
-import java.util.*;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Set;
+import java.util.concurrent.ForkJoinPool;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicInteger;
 import java.util.regex.Pattern;
 import java.util.stream.Collectors;
 
 @Singleton
-public class NGramDict {
+public class TermFrequencyDict {
 
     private final TLongIntHashMap wordRates = new TLongIntHashMap(1_000_000, 0.5f, 0, 0);
 
@@ -34,21 +38,22 @@ public class NGramDict {
     private static final Pattern separator = Pattern.compile("[_ ]+");
     private static final PorterStemmer ps = new PorterStemmer();
 
+    private static final long DOC_COUNT_KEY = ~0L;
     private static long fileSize(Path p) throws IOException {
         return Files.size(p);
     }
 
     @Inject
-    public NGramDict(@Nullable LanguageModels models) {
+    public TermFrequencyDict(@Nullable LanguageModels models) {
         if (models == null) {
             return;
         }
 
-        if (models.ngramFrequency != null) {
+        if (models.termFrequencies != null) {
 
-            try (var frequencyData = new DataInputStream(new BufferedInputStream(new FileInputStream(models.ngramFrequency.toFile())))) {
+            try (var frequencyData = new DataInputStream(new BufferedInputStream(new FileInputStream(models.termFrequencies.toFile())))) {
 
-                wordRates.ensureCapacity((int)(fileSize(models.ngramFrequency)/16));
+                wordRates.ensureCapacity((int)(fileSize(models.termFrequencies)/16));
 
                 for (;;) {
                     wordRates.put(frequencyData.readLong(), (int) frequencyData.readLong());
@@ -56,7 +61,7 @@ public class NGramDict {
             } catch (EOFException eof) {
                 // ok
             } catch (IOException e) {
-                logger.error("IO Exception reading " + models.ngramFrequency, e);
+                logger.error("IO Exception reading " + models.termFrequencies, e);
             }
         }
 
@@ -64,60 +69,100 @@ public class NGramDict {
     }
 
 
-    public static void main(String... args) throws IOException {
+    public int docCount() {
+        int cnt = wordRates.get(DOC_COUNT_KEY);
+
+        if (cnt == 0) {
+            cnt = 11820118; // legacy
+        }
+        return cnt;
+    }
+
+    public static void main(String... args) throws IOException, InterruptedException {
         if (args.length != 2) {
             System.err.println("Expected arguments: plan.yaml out-file");
         }
-        String inFile = args[0];
         String outFile = args[1];
 
         var plan = new CrawlPlanLoader().load(Path.of(args[0]));
 
-        SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels());
+        ThreadLocal<SentenceExtractor> se = ThreadLocal.withInitial(() -> new SentenceExtractor(WmsaHome.getLanguageModels()));
         DomPruner pruner = new DomPruner();
         LanguageFilter lf = new LanguageFilter();
 
-        Map<String, Integer> counts = new HashMap<>(100_000_000);
-        Set<String> words = new HashSet<>(10_000);
+        TLongIntHashMap counts = new TLongIntHashMap(100_000_000, 0.7f, -1, -1);
+
+        ForkJoinPool fjp = new ForkJoinPool(24);
+        AtomicInteger docCount = new AtomicInteger();
 
         for (var domain : plan.domainsIterable()) { // leaks file descriptor, is fine
 
             if (domain.doc == null)
                 continue;
 
-            for (var doc : domain.doc) {
-                if (doc.documentBody == null)
-                    continue;
-
-                Document parsed = Jsoup.parse(doc.documentBody);
-                pruner.prune(parsed, 0.5);
-
-                DocumentLanguageData dld = se.extractSentences(parsed);
-
-                if (lf.dictionaryAgreement(dld) < 0.1) {
-                    continue;
-                }
-
-                for (var sent : dld.sentences) {
-                    for (var word : sent) {
-                        words.add(word.stemmed());
-                    }
-                }
-
-                for (var word : words) {
-                    counts.merge(word, 1, Integer::sum);
-                }
-
-                words.clear();
-            }
+            fjp.execute(() -> {
+
+                for (var doc : domain.doc) {
+                    if (doc.documentBody == null)
+                        continue;
+                    docCount.incrementAndGet();
+
+                    Document parsed = Jsoup.parse(doc.documentBody);
+                    pruner.prune(parsed, 0.5);
+
+                    DocumentLanguageData dld = se.get().extractSentences(parsed);
+
+                    if (lf.dictionaryAgreement(dld) < 0.1) {
+                        return;
+                    }
+
+                    Set<String> words = new HashSet<>(10_000);
+
+                    for (var sent : dld.sentences) {
+                        for (var word : sent) {
+                            words.add(word.stemmed());
+                        }
+                    }
+
+                    fjp.execute(() -> {
+                        synchronized (counts) {
+                            for (var word : words) {
+                                counts.adjustOrPutValue(longHash(word.getBytes()), 1, 1);
+                            }
+                        }
+                    });
+
+                }
+            });
         }
 
+        fjp.shutdown();
+        fjp.awaitTermination(10, TimeUnit.SECONDS);
+
+        try (var dos = new DataOutputStream(Files.newOutputStream(Path.of(outFile)))) {
+            synchronized (counts) {
+                counts.put(DOC_COUNT_KEY, docCount.get());
+
+                counts.forEachEntry((hash, cnt) -> {
+                    try {
+                        dos.writeLong(hash);
+                        dos.writeLong(cnt);
+                    } catch (IOException e) {
+                        throw new RuntimeException(e);
+                    }
+                    return true;
+                });
+            }
+        }
 
-        counts.forEach((w,c) -> {
-            if (c > 3) {
-                System.out.println(w + ":" + c);
-            }
-        });
+        System.out.println(docCount.get());
+//
+//        counts.forEachEntry((w,c) -> {
+//            if (c > 3L) {
+//                System.out.println(w + ":" + c);
+//            }
+//            return true;
+//        });
 
     }
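Note on the file format implied by main() and the constructor above: the term frequency file is a flat sequence of (long hash, long count) pairs, and the document count is stashed in the same map under the reserved key DOC_COUNT_KEY (~0L, all bits set), which is why the constructor can read pairs until EOF and why docCount() falls back to the old hard-coded 11820118 when the key is absent. A sketch of a reader for that format (illustration only, not part of the commit):

    // Illustration only: dump the document count from a tfreq file written by
    // TermFrequencyDict.main(). Pairs of longs, terminated by EOF like the constructor.
    import java.io.*;
    import java.nio.file.*;

    public class TermFreqFileDump {
        public static void main(String[] args) throws IOException {
            try (var in = new DataInputStream(new BufferedInputStream(Files.newInputStream(Path.of(args[0]))))) {
                for (;;) {
                    long hash = in.readLong();
                    long count = in.readLong();
                    if (hash == ~0L) System.out.println("documents: " + count);
                }
            } catch (EOFException eof) {
                // end of file, same termination pattern the constructor uses
            }
        }
    }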
Suggestions.java

@@ -2,8 +2,8 @@ package nu.marginalia.wmsa.edge.assistant.suggest;
 
 import com.google.inject.Inject;
 import com.google.inject.name.Named;
-import nu.marginalia.wmsa.edge.assistant.dict.NGramDict;
 import nu.marginalia.wmsa.edge.assistant.dict.SpellChecker;
+import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict;
 import nu.marginalia.wmsa.edge.converting.processor.logic.HtmlFeature;
 import org.apache.commons.collections4.trie.PatriciaTrie;
 import org.slf4j.Logger;
@@ -21,7 +21,7 @@ import java.util.stream.Stream;
 
 public class Suggestions {
     private final PatriciaTrie<String> suggestionsTrie;
-    private final NGramDict nGramDict;
+    private final TermFrequencyDict termFrequencyDict;
     private final SpellChecker spellChecker;
 
     private static final Pattern suggestionPattern = Pattern.compile("^[a-zA-Z0-9]+( [a-zA-Z0-9]+)*$");
@@ -31,12 +31,12 @@ public class Suggestions {
     @Inject
     public Suggestions(@Named("suggestions-file") Path suggestionsFile,
                        SpellChecker spellChecker,
-                       NGramDict dict
+                       TermFrequencyDict dict
                        ) {
         this.spellChecker = spellChecker;
 
         suggestionsTrie = loadSuggestions(suggestionsFile);
-        nGramDict = dict;
+        termFrequencyDict = dict;
 
         logger.info("Loaded {} suggestions", suggestionsTrie.size());
     }
@@ -138,7 +138,7 @@ public class Suggestions {
         }
 
         Map<String, Long> scach = new HashMap<>(512);
-        Function<String, Long> valr = s -> -nGramDict.getTermFreqHash(scach.computeIfAbsent(s, NGramDict::getStringHash));
+        Function<String, Long> valr = s -> -termFrequencyDict.getTermFreqHash(scach.computeIfAbsent(s, TermFrequencyDict::getStringHash));
 
         return Stream.iterate(start.getKey(), Objects::nonNull, suggestionsTrie::nextKey)
                 .takeWhile(s -> s.startsWith(prefix))
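Note on valr above: it yields the negated term frequency for a candidate suggestion, presumably so that an ordinary ascending sort puts the most frequent completions first (the sort itself is outside the visible hunk). The negation trick in isolation (illustration only, made-up data):

    // Illustration only: negating a frequency turns "sort ascending" into "most frequent first".
    import java.util.*;

    public class NegatedKeyDemo {
        public static void main(String[] args) {
            Map<String, Long> freq = Map.of("linux", 90_000L, "lisp", 12_000L, "lilypond", 800L);
            List<String> suggestions = new ArrayList<>(freq.keySet());
            suggestions.sort(Comparator.comparingLong((String s) -> -freq.get(s)));
            System.out.println(suggestions);  // [linux, lisp, lilypond]
        }
    }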
AnchorTextExtractor.java

@@ -6,7 +6,7 @@ import lombok.SneakyThrows;
 import nu.marginalia.util.DenseBitMap;
 import nu.marginalia.util.language.WordPatterns;
 import nu.marginalia.wmsa.configuration.WmsaHome;
-import nu.marginalia.wmsa.edge.assistant.dict.NGramDict;
+import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict;
 import nu.marginalia.wmsa.edge.converting.processor.logic.LinkParser;
 import nu.marginalia.wmsa.edge.model.EdgeUrl;
 import org.apache.logging.log4j.util.Strings;
@@ -36,7 +36,7 @@ public class AnchorTextExtractor {
     // de-duplicating billions of shuffled (url, word) tuples on limited hardware
     private final DenseBitMap deduplicateHashBitset = new DenseBitMap(DenseBitMap.MAX_CAPACITY_2GB_16BN_ITEMS);
 
-    private final NGramDict ngramDict = new NGramDict(WmsaHome.getLanguageModels());
+    private final TermFrequencyDict ngramDict = new TermFrequencyDict(WmsaHome.getLanguageModels());
 
     public AnchorTextExtractor(Predicate<String> includeDomainPredicate,
                                Predicate<EdgeUrl> includeUrlPredicate,
DisqualifiedException.java

@@ -17,6 +17,7 @@ public class DisqualifiedException extends Exception {
         LANGUAGE,
         STATUS,
         QUALITY,
-        ACCEPTABLE_ADS
+        ACCEPTABLE_ADS,
+        FORBIDDEN
     }
 }
DocumentProcessor.java

@@ -86,10 +86,6 @@ public class DocumentProcessor {
             if (isAcceptedContentType(crawledDocument)) {
                 var detailsWords = createDetails(crawledDomain, crawledDocument);
 
-                if (detailsWords.details().quality < minDocumentQuality) {
-                    throw new DisqualifiedException(DisqualificationReason.QUALITY);
-                }
-
                 ret.details = detailsWords.details();
                 ret.words = detailsWords.words();
             }
@@ -141,11 +137,14 @@ public class DocumentProcessor {
     private DetailsWithWords createDetails(CrawledDomain crawledDomain, CrawledDocument crawledDocument)
             throws DisqualifiedException, URISyntaxException {
 
-        var doc = Jsoup.parse(crawledDocument.documentBody);
+        Document doc = Jsoup.parse(crawledDocument.documentBody);
 
         if (AcceptableAds.hasAcceptableAdsTag(doc)) {
             throw new DisqualifiedException(DisqualificationReason.ACCEPTABLE_ADS);
         }
+        if (doc.select("meta[name=robots]").attr("content").contains("noindex")) {
+            throw new DisqualifiedException(DisqualificationReason.FORBIDDEN);
+        }
 
         DomPruner domPruner = new DomPruner();
         Document prunedDoc = doc.clone();
@@ -160,11 +159,17 @@ public class DocumentProcessor {
         ret.length = getLength(doc);
         ret.standard = getHtmlStandard(doc);
         ret.title = titleExtractor.getTitleAbbreviated(doc, dld, crawledDocument.url);
-        ret.features = featureExtractor.getFeatures(crawledDomain, doc);
+        ret.features = featureExtractor.getFeatures(crawledDomain, doc, dld);
         ret.quality = documentValuator.getQuality(ret.standard, doc, dld);
         ret.hashCode = HashCode.fromString(crawledDocument.documentBodyHash).asLong();
 
-        var words = getWords(dld);
+        EdgePageWordSet words;
+        if (ret.quality < minDocumentQuality || dld.totalNumWords() < minDocumentLength) {
+            words = keywordExtractor.extractKeywordsMinimal(dld);
+        }
+        else {
+            words = keywordExtractor.extractKeywords(dld);
+        }
 
         var url = new EdgeUrl(crawledDocument.url);
         addMetaWords(ret, url, crawledDomain, words);
@@ -195,7 +200,6 @@ public class DocumentProcessor {
         ret.features.stream().map(HtmlFeature::getKeyword).forEach(tagWords::add);
 
         words.append(IndexBlock.Meta, tagWords);
-        words.append(IndexBlock.Words_1, tagWords);
     }
 
     private void getLinks(EdgeUrl baseUrl, ProcessedDocumentDetails ret, Document doc, EdgePageWordSet words) {
@@ -255,7 +259,6 @@ public class DocumentProcessor {
 
     private void synthesizeFilenameKeyword(Set<String> fileKeywords, EdgeUrl link) {
 
-
         Path pFilename = Path.of(link.path.toLowerCase()).getFileName();
 
         if (pFilename == null) return;
@@ -273,10 +276,6 @@ public class DocumentProcessor {
     }
 
     private void checkDocumentLanguage(DocumentLanguageData dld) throws DisqualifiedException {
-        if (dld.totalNumWords() < minDocumentLength) {
-            throw new DisqualifiedException(DisqualificationReason.LENGTH);
-        }
-
         double languageAgreement = languageFilter.dictionaryAgreement(dld);
         if (languageAgreement < 0.1) {
             throw new DisqualifiedException(DisqualificationReason.LANGUAGE);
@@ -292,10 +291,6 @@ public class DocumentProcessor {
         return htmlStandard;
     }
 
-    private EdgePageWordSet getWords(DocumentLanguageData dld) {
-        return keywordExtractor.extractKeywords(dld);
-    }
-
     private String getDescription(Document doc) {
         return summaryExtractor.extractSummary(doc);
     }
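Two behavioral notes on the hunks above: pages whose robots meta tag contains noindex are now rejected with the new FORBIDDEN reason, and low-quality or very short documents are no longer thrown away; they get the reduced extractKeywordsMinimal() treatment instead. The noindex check in isolation (illustration only):

    // Illustration only: the jsoup selector used for the new noindex check.
    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Document;

    public class NoindexCheckDemo {
        public static void main(String[] args) {
            Document doc = Jsoup.parse(
                "<html><head><meta name=\"robots\" content=\"noindex, nofollow\"></head></html>");
            // select() finds the tag; attr() returns "" if it is missing, so contains() is safe.
            boolean noindex = doc.select("meta[name=robots]").attr("content").contains("noindex");
            System.out.println(noindex); // true
        }
    }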
DomainProcessor.java

@@ -6,13 +6,13 @@ import nu.marginalia.wmsa.edge.converting.model.ProcessedDocument;
 import nu.marginalia.wmsa.edge.converting.model.ProcessedDomain;
 import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain;
 import nu.marginalia.wmsa.edge.crawling.model.CrawlerDomainStatus;
+import nu.marginalia.wmsa.edge.index.model.IndexBlock;
 import nu.marginalia.wmsa.edge.model.EdgeDomain;
 import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState;
 import nu.marginalia.wmsa.edge.model.crawl.EdgeUrlState;
 
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.List;
+import java.util.*;
+import java.util.stream.Collectors;
 
 public class DomainProcessor {
     private final DocumentProcessor documentProcessor;
@@ -45,6 +45,8 @@ public class DomainProcessor {
                     ret.documents.add(processedDoc);
                 }
             }
+
+            addCommonSiteWords(ret);
         }
         else {
             ret.documents = Collections.emptyList();
@@ -60,6 +62,40 @@ public class DomainProcessor {
         return ret;
     }
 
+    private void addCommonSiteWords(ProcessedDomain ret) {
+
+        if (ret.documents.size() < 25)
+            return;
+
+        Map<String, Integer> topKeywordCount = new HashMap<>(ret.documents.size()*10);
+
+        for (var doc : ret.documents) {
+            if (doc.words == null)
+                continue;
+
+            for (var word : doc.words.get(IndexBlock.Tfidf_Top).words) {
+                topKeywordCount.merge(word, -1, Integer::sum);
+            }
+        }
+
+        if (topKeywordCount.values().stream().mapToInt(i -> i).sum() > -100)
+            return;
+
+        Set<String> topWords = topKeywordCount.entrySet().stream()
+                .filter(e -> e.getValue() < -10)
+                .sorted(Map.Entry.comparingByValue()).limit(5)
+                .map(Map.Entry::getKey)
+                .collect(Collectors.toSet());
+
+        if (!topWords.isEmpty()) {
+            for (var doc : ret.documents) {
+                if (doc.words != null) {
+                    doc.words.get(IndexBlock.Site).addAll(topWords);
+                }
+            }
+        }
+    }
+
     private double getAverageQuality(List<ProcessedDocument> documents) {
         int n = 0;
         double q = 0.;
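Note on addCommonSiteWords() above: it counts each top-tfidf word with negative increments, so the natural ascending order of Map.Entry.comparingByValue() puts the most frequent words first; words seen on more than ten documents (value below -10) of a sufficiently large site are then copied into every document's Site block. The negative-count idiom in isolation (illustration only, made-up words):

    // Illustration only: merging with -1 makes "most common" sort first under the natural
    // ascending order used by Map.Entry.comparingByValue() in addCommonSiteWords().
    import java.util.*;
    import java.util.stream.Collectors;

    public class NegativeCountDemo {
        public static void main(String[] args) {
            Map<String, Integer> counts = new HashMap<>();
            List<String> words = List.of("forum", "forum", "forum", "news", "forum", "news");
            for (String word : words) counts.merge(word, -1, Integer::sum);

            List<String> mostCommonFirst = counts.entrySet().stream()
                    .sorted(Map.Entry.comparingByValue())   // -4 (forum) sorts before -2 (news)
                    .map(Map.Entry::getKey)
                    .collect(Collectors.toList());
            System.out.println(mostCommonFirst);            // [forum, news]
        }
    }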
DomPruner.java

@@ -4,7 +4,7 @@ import org.jsoup.nodes.Document;
 import org.jsoup.nodes.Element;
 import org.jsoup.nodes.Node;
 import org.jsoup.nodes.TextNode;
-import org.jsoup.select.NodeVisitor;
+import org.jsoup.select.NodeFilter;
 
 import java.util.HashMap;
 import java.util.Map;
@@ -12,100 +12,103 @@ import java.util.Map;
 public class DomPruner {
 
     public void prune(Document document, double pruneThreshold) {
-        PruningVisitor pruningVisitor = new PruningVisitor();
-        document.traverse(pruningVisitor);
-
-        pruningVisitor.data.forEach((node, data) -> {
-            if (data.depth <= 1) {
-                return;
-            }
-            if (data.signalNodeSize == 0) node.remove();
-            else if (data.noiseNodeSize > 0
-                    && data.signalRate() < pruneThreshold
-                    && data.treeSize > 3) {
-                node.remove();
-            }
-        });
+        document.filter(new PruningFilter(pruneThreshold));
     }
 
-    private static class PruningVisitor implements NodeVisitor {
-
-        private final Map<Node, NodeData> data = new HashMap<>();
-        private final NodeData dummy = new NodeData(Integer.MAX_VALUE, 1, 0);
-
-        @Override
-        public void head(Node node, int depth) {}
-
-        @Override
-        public void tail(Node node, int depth) {
-            final NodeData dataForNode;
-
-            if (node instanceof TextNode tn) {
-                dataForNode = new NodeData(depth, tn.text().length(), 0);
-            }
-            else if (isSignal(node)) {
-                dataForNode = new NodeData(depth, 0,0);
-                for (var childNode : node.childNodes()) {
-                    dataForNode.add(data.getOrDefault(childNode, dummy));
-                }
-            }
-            else {
-                dataForNode = new NodeData(depth, 0,0);
-                for (var childNode : node.childNodes()) {
-                    dataForNode.addAsNoise(data.getOrDefault(childNode, dummy));
-                }
-            }
-
-            data.put(node, dataForNode);
-        }
-
-        public boolean isSignal(Node node) {
-
-            if (node instanceof Element e) {
-                if ("a".equalsIgnoreCase(e.tagName()))
-                    return false;
-                if ("nav".equalsIgnoreCase(e.tagName()))
-                    return false;
-                if ("footer".equalsIgnoreCase(e.tagName()))
-                    return false;
-                if ("header".equalsIgnoreCase(e.tagName()))
-                    return false;
-            }
-
-            return true;
-        }
-    }
-
-    private static class NodeData {
-        int signalNodeSize;
-        int noiseNodeSize;
-        int treeSize = 1;
-        int depth;
-
-        private NodeData(int depth, int signalNodeSize, int noiseNodeSize) {
-            this.depth = depth;
-            this.signalNodeSize = signalNodeSize;
-            this.noiseNodeSize = noiseNodeSize;
-        }
-
-        public void add(NodeData other) {
-            signalNodeSize += other.signalNodeSize;
-            noiseNodeSize += other.noiseNodeSize;
-            treeSize += other.treeSize;
-        }
-
-        public void addAsNoise(NodeData other) {
-            noiseNodeSize += other.noiseNodeSize + other.signalNodeSize;
-            treeSize += other.treeSize;
-        }
-
-        public double signalRate() {
-            return signalNodeSize / (double)(signalNodeSize + noiseNodeSize);
-        }
-    }
 }
+
+class PruningFilter implements NodeFilter {
+
+    private final Map<Node, NodeData> data = new HashMap<>();
+    private final NodeData dummy = new NodeData(Integer.MAX_VALUE, 1, 0);
+    private double pruneThreshold;
+
+    public PruningFilter(double pruneThreshold) {
+        this.pruneThreshold = pruneThreshold;
+    }
+
+    @Override
+    public FilterResult head(Node node, int depth) {
+        return FilterResult.CONTINUE;
+    }
+
+    @Override
+    public FilterResult tail(Node node, int depth) {
+        final NodeData dataForNode;
+
+        if (node instanceof TextNode tn) {
+            dataForNode = new NodeData(depth, tn.text().length(), 0);
+        }
+        else if (isSignal(node)) {
+            dataForNode = new NodeData(depth, 0,0);
+            for (var childNode : node.childNodes()) {
+                dataForNode.add(data.getOrDefault(childNode, dummy));
+            }
+        }
+        else {
+            dataForNode = new NodeData(depth, 0,0);
+            for (var childNode : node.childNodes()) {
+                dataForNode.addAsNoise(data.getOrDefault(childNode, dummy));
+            }
+        }
+
+        data.put(node, dataForNode);
+
+        if (dataForNode.depth <= 1)
+            return FilterResult.CONTINUE;
+
+        if (dataForNode.signalNodeSize == 0)
+            return FilterResult.REMOVE;
+        if (dataForNode.noiseNodeSize > 0
+                && dataForNode.signalRate() < pruneThreshold
+                && dataForNode.treeSize > 3)
+            return FilterResult.REMOVE;
+
+        return FilterResult.CONTINUE;
+    }
+
+    public boolean isSignal(Node node) {
+
+        if (node instanceof Element e) {
+            if ("a".equalsIgnoreCase(e.tagName()))
+                return false;
+            if ("nav".equalsIgnoreCase(e.tagName()))
+                return false;
+            if ("footer".equalsIgnoreCase(e.tagName()))
+                return false;
+            if ("header".equalsIgnoreCase(e.tagName()))
+                return false;
+        }
+
+        return true;
+    }
+}
+
+class NodeData {
+    int signalNodeSize;
+    int noiseNodeSize;
+    int treeSize = 1;
+    int depth;
+
+    NodeData(int depth, int signalNodeSize, int noiseNodeSize) {
+        this.depth = depth;
+        this.signalNodeSize = signalNodeSize;
+        this.noiseNodeSize = noiseNodeSize;
+    }
+
+    public void add(NodeData other) {
+        signalNodeSize += other.signalNodeSize;
+        noiseNodeSize += other.noiseNodeSize;
+        treeSize += other.treeSize;
+    }
+
+    public void addAsNoise(NodeData other) {
+        noiseNodeSize += other.noiseNodeSize + other.signalNodeSize;
+        treeSize += other.treeSize;
+    }
+
+    public double signalRate() {
+        return signalNodeSize / (double)(signalNodeSize + noiseNodeSize);
+    }
+}
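Note on the rewrite above: the signal/noise bookkeeping moves from a NodeVisitor plus a removal pass into a jsoup NodeFilter, which can return FilterResult.REMOVE during traversal, so low-signal subtrees are dropped in a single pass and the prune threshold becomes a constructor argument instead of a captured variable. Usage stays a one-liner; a hedged sketch:

    // Usage sketch: pruning boilerplate-heavy markup before text extraction, the way
    // TermFrequencyDict.main() and DocumentProcessor use DomPruner elsewhere in this commit.
    import nu.marginalia.wmsa.edge.converting.processor.logic.DomPruner;
    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Document;

    public class DomPrunerDemo {
        public static void main(String[] args) {
            Document doc = Jsoup.parse(
                "<body><nav><a href='/'>Home</a><a href='/about'>About</a></nav>"
              + "<article>A long enough run of actual text that should survive pruning.</article></body>");

            new DomPruner().prune(doc, 0.5);  // drop subtrees whose signal rate falls below 0.5

            System.out.println(doc.body().html());  // the article text remains; link-only nav tends to go
        }
    }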
FeatureExtractor.java

@@ -2,7 +2,11 @@ package nu.marginalia.wmsa.edge.converting.processor.logic;
 
 import com.google.inject.Inject;
 import com.google.inject.Singleton;
+import nu.marginalia.util.language.processing.model.DocumentLanguageData;
 import nu.marginalia.wmsa.edge.converting.processor.logic.topic.AdblockSimulator;
+import nu.marginalia.wmsa.edge.converting.processor.logic.topic.RecipeDetector;
+import nu.marginalia.wmsa.edge.converting.processor.logic.topic.TextileCraftDetector;
+import nu.marginalia.wmsa.edge.converting.processor.logic.topic.WoodworkingDetector;
 import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain;
 import org.jsoup.nodes.Document;
 import org.jsoup.nodes.Element;
@@ -35,14 +39,20 @@ public class FeatureExtractor {
             "d31qbv1cthcecs.cloudfront.net",
             "linkedin.com");
 
-    private AdblockSimulator adblockSimulator;
+    private final AdblockSimulator adblockSimulator;
+    private final RecipeDetector recipeDetector;
+    private final TextileCraftDetector textileCraftDetector;
+    private final WoodworkingDetector woodworkingDetector;
 
     @Inject
-    public FeatureExtractor(AdblockSimulator adblockSimulator) {
+    public FeatureExtractor(AdblockSimulator adblockSimulator, RecipeDetector recipeDetector, TextileCraftDetector textileCraftDetector, WoodworkingDetector woodworkingDetector) {
         this.adblockSimulator = adblockSimulator;
+        this.recipeDetector = recipeDetector;
+        this.textileCraftDetector = textileCraftDetector;
+        this.woodworkingDetector = woodworkingDetector;
     }
 
-    public Set<HtmlFeature> getFeatures(CrawledDomain domain, Document doc) {
+    public Set<HtmlFeature> getFeatures(CrawledDomain domain, Document doc, DocumentLanguageData dld) {
         final Set<HtmlFeature> features = new HashSet<>();
 
         final Elements scriptTags = doc.getElementsByTag("script");
@@ -81,9 +91,14 @@ public class FeatureExtractor {
             }
         }
 
-        if (!domain.cookies.isEmpty()) {
+        if (!domain.cookies.isEmpty())
             features.add(HtmlFeature.COOKIES);
-        }
+
+        if (recipeDetector.testP(dld) > 0.5)
+            features.add(HtmlFeature.CATEGORY_FOOD);
+        // these should be mutually exclusive
+        else if (woodworkingDetector.testP(dld) > 0.3 || textileCraftDetector.testP(dld) > 0.3)
+            features.add(HtmlFeature.CATEGORY_CRAFTS);
 
         return features;
     }
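Note on the new arguments above: getFeatures() now receives the DocumentLanguageData so the topic detectors can score the text. Food needs a testP() above 0.5, the two craft detectors share a lower 0.3 bar and are only consulted when the recipe detector did not fire. The decision pulled out on its own (illustration only; the three doubles stand in for the detectors' testP(dld) results):

    import java.util.Optional;
    import nu.marginalia.wmsa.edge.converting.processor.logic.HtmlFeature;

    // Illustration only: the category decision added to getFeatures(), in isolation.
    public class CategoryDecisionDemo {
        static Optional<HtmlFeature> categorize(double recipeP, double woodP, double textileP) {
            if (recipeP > 0.5) return Optional.of(HtmlFeature.CATEGORY_FOOD);
            // the else-if in the commit keeps food and crafts mutually exclusive
            if (woodP > 0.3 || textileP > 0.3) return Optional.of(HtmlFeature.CATEGORY_CRAFTS);
            return Optional.empty();
        }
        public static void main(String[] args) {
            System.out.println(categorize(0.7, 0.1, 0.0)); // Optional[CATEGORY_FOOD]
            System.out.println(categorize(0.2, 0.4, 0.0)); // Optional[CATEGORY_CRAFTS]
        }
    }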
HtmlFeature.java

@@ -12,6 +12,8 @@ public enum HtmlFeature {
     CATEGORY_FOOD("category:food"),
 
     ADVERTISEMENT("special:ads"),
+
+    CATEGORY_CRAFTS("category:crafts"),
     ;
 
     private final String keyword;
RecipeDetector.java

@@ -1,6 +1,7 @@
 package nu.marginalia.wmsa.edge.converting.processor.logic.topic;
 
 import ca.rmen.porterstemmer.PorterStemmer;
+import com.google.inject.Inject;
 import nu.marginalia.util.language.processing.model.DocumentLanguageData;
 
 import java.util.HashMap;
@@ -14,6 +15,7 @@ public class RecipeDetector {
 
     private final Map<String, Double> termValues = new HashMap<>();
 
+    @Inject
     public RecipeDetector() {
         PorterStemmer ps = new PorterStemmer();
 
TextileCraftDetector.java

@@ -1,6 +1,7 @@
 package nu.marginalia.wmsa.edge.converting.processor.logic.topic;
 
 import ca.rmen.porterstemmer.PorterStemmer;
+import com.google.inject.Inject;
 import nu.marginalia.util.language.processing.model.DocumentLanguageData;
 
 import java.util.HashMap;
@@ -14,6 +15,7 @@ public class TextileCraftDetector {
 
     private final Map<String, Double> termValues = new HashMap<>();
 
+    @Inject
    public TextileCraftDetector() {
         PorterStemmer ps = new PorterStemmer();
 
WoodworkingDetector.java

@@ -1,6 +1,7 @@
 package nu.marginalia.wmsa.edge.converting.processor.logic.topic;
 
 import ca.rmen.porterstemmer.PorterStemmer;
+import com.google.inject.Inject;
 import nu.marginalia.util.language.processing.model.DocumentLanguageData;
 
 import java.util.HashMap;
@@ -14,6 +15,7 @@ public class WoodworkingDetector {
 
     private final Map<String, Double> termValues = new HashMap<>();
 
+    @Inject
     public WoodworkingDetector() {
         PorterStemmer ps = new PorterStemmer();
 
KeywordLexiconJournalFile.java

@@ -15,7 +15,7 @@ import java.util.concurrent.locks.ReadWriteLock;
 import java.util.concurrent.locks.ReentrantReadWriteLock;
 import java.util.function.Consumer;
 
-public class KeywordLexiconJournalFile {
+public class KeywordLexiconJournalFile implements AutoCloseable {
     private final RandomAccessFile journalFileRAF;
     private final File journalFile;
     private final Logger logger = LoggerFactory.getLogger(getClass());
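Note: implementing AutoCloseable is what lets NGramBloomFilter.convertFromDictionaryFile() open the journal in try-with-resources. A minimal usage sketch (the path is a placeholder, and the entry type is assumed to match the new String(data) usage in the bloom filter converter):

    import java.io.File;
    import nu.marginalia.wmsa.edge.index.lexicon.journal.KeywordLexiconJournalFile;

    // Usage sketch only: dump journal entries, letting try-with-resources close the file.
    public class JournalDumpDemo {
        public static void main(String[] args) throws Exception {
            try (var journal = new KeywordLexiconJournalFile(new File("/path/to/dictionary.dat"))) {
                journal.loadFile(entry -> System.out.println(new String(entry)));
            }
        }
    }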
IndexBlock.java

@@ -4,22 +4,24 @@ public enum IndexBlock {
     TitleKeywords(0, 0),
     Title(1, 1),
 
-    Link(2, 1.25),
+    Link(2, 1.15),
 
-    Subjects(3, 0.5),
-    NamesWords(4, 5),
+    Subjects(3, 3.0),
+    NamesWords(4, 3.0),
     Artifacts(5, 10),
     Meta(6, 7),
 
-    Tfidf_Top(7, 2),
-    Tfidf_Middle(8, 2.5),
-    Tfidf_Lower(9, 5.0),
+    Tfidf_Top(7, 0.5),
+    Tfidf_Middle(8, 1.25),
+    Tfidf_Lower(9, 1.5),
 
     Words_1(10, 3.0),
     Words_2(11, 3.5),
     Words_4(12, 4.0),
     Words_8(13, 4.5),
     Words_16Plus(14, 7.0),
+
+    Site(15, 1.2),
     ;
 
     public final int id;
SearchIndexReader.java

@@ -29,7 +29,6 @@ public class SearchIndexReader implements AutoCloseable {
             IndexBlock.Tfidf_Top,
             IndexBlock.Tfidf_Middle,
             IndexBlock.Tfidf_Lower,
-            IndexBlock.NamesWords,
             IndexBlock.Words_1,
             IndexBlock.Words_2,
             IndexBlock.Words_4,
@@ -62,15 +61,14 @@ public class SearchIndexReader implements AutoCloseable {
         queryBuilders = new EnumMap<>(IndexBlock.class);
         underspecifiedQueryBuilders = new EnumMap<>(IndexBlock.class);
 
-        queryBuilders.put(IndexBlock.Words_1, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, titleIndex, topIndex, words1), words1));
-        queryBuilders.put(IndexBlock.Words_2, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, titleIndex, topIndex, words2), words1));
-        queryBuilders.put(IndexBlock.Words_4, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, titleIndex, topIndex, words4), words1));
-        queryBuilders.put(IndexBlock.Words_8, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, titleIndex, topIndex, words8), words1));
-        queryBuilders.put(IndexBlock.Words_16Plus, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, titleIndex, topIndex, words1, words2, words4, words8, words16, artifacts), words1));
+        queryBuilders.put(IndexBlock.Words_1, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleIndex, topIndex, words1), words1));
+        queryBuilders.put(IndexBlock.Words_2, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleIndex, topIndex, words2), words1));
+        queryBuilders.put(IndexBlock.Words_4, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleIndex, topIndex, words4), words1));
+        queryBuilders.put(IndexBlock.Words_8, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleIndex, topIndex, words8), words1));
+        queryBuilders.put(IndexBlock.Words_16Plus, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleIndex, topIndex, words1, words2, words4, words8, words16, artifacts), words1));
 
-        underspecifiedQueryBuilders.put(IndexBlock.TitleKeywords, new IndexQueryBuilder(listOfNonNulls(titleKeywordsIndex, linkIndex, topicIndex, topIndex, midIndex, lowIndex, namesIndex, metaIndex), words1));
-        underspecifiedQueryBuilders.put(IndexBlock.Title, new IndexQueryBuilder(listOfNonNulls(titleIndex, topicIndex, linkIndex, topicIndex, topIndex, midIndex, lowIndex, namesIndex, metaIndex), words1));
-        underspecifiedQueryBuilders.put(IndexBlock.Tfidf_Top, new IndexQueryBuilder(listOfNonNulls(topIndex, linkIndex, midIndex, lowIndex, namesIndex, metaIndex), words1));
+        underspecifiedQueryBuilders.put(IndexBlock.Title, new IndexQueryBuilder(listOfNonNulls(titleIndex, linkIndex, topIndex, namesIndex, topicIndex, metaIndex), words1));
+        underspecifiedQueryBuilders.put(IndexBlock.Tfidf_Top, new IndexQueryBuilder(listOfNonNulls(linkIndex, namesIndex, topIndex, midIndex, lowIndex, topicIndex, metaIndex), words1));
     }
 
     @SafeVarargs
@@ -19,7 +19,8 @@ public class EdgePageWordSet {
 public EdgePageWords get(IndexBlock block) {
 var words = wordSets.get(block);
 if (words == null) {
-return new EdgePageWords(block);
+words = new EdgePageWords(block);
+wordSets.put(block, words);
 }
 return words;
 }
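The change turns get() into get-or-create: the freshly constructed EdgePageWords is registered in wordSets, so words added through the returned reference are no longer lost. Assuming wordSets is a Map<IndexBlock, EdgePageWords>, the new body is equivalent to the idiomatic one-liner:

    // get-or-create in one step; equivalent to the null check and put above
    var words = wordSets.computeIfAbsent(block, EdgePageWords::new);
    return words;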
@@ -1,7 +1,7 @@
 package nu.marginalia.wmsa.edge.search.query;

 import com.google.inject.Inject;
-import nu.marginalia.wmsa.edge.assistant.dict.NGramDict;
+import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

@@ -13,11 +13,11 @@ import java.util.stream.Collectors;

 public class EnglishDictionary {
 private final Set<String> englishWords = new HashSet<>();
-private final NGramDict dict;
+private final TermFrequencyDict dict;
 private final Logger logger = LoggerFactory.getLogger(getClass());

 @Inject
-public EnglishDictionary(NGramDict dict) {
+public EnglishDictionary(TermFrequencyDict dict) {
 this.dict = dict;
 try (var resource = Objects.requireNonNull(ClassLoader.getSystemResourceAsStream("dictionary/en-words"),
 "Could not load word frequency table");
@@ -4,7 +4,8 @@ import com.google.inject.Inject;
 import com.google.inject.Singleton;
 import nu.marginalia.util.language.WordPatterns;
 import nu.marginalia.util.language.conf.LanguageModels;
-import nu.marginalia.wmsa.edge.assistant.dict.NGramDict;
+import nu.marginalia.wmsa.edge.assistant.dict.NGramBloomFilter;
+import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict;
 import nu.marginalia.wmsa.edge.index.model.IndexBlock;
 import nu.marginalia.wmsa.edge.model.search.EdgeSearchSpecification;
 import nu.marginalia.wmsa.edge.model.search.EdgeSearchSubquery;
@@ -22,20 +23,22 @@ import java.util.*;
 public class QueryFactory {

 private final LanguageModels lm;
-private final NGramDict dict;
+private final TermFrequencyDict dict;
 private final EnglishDictionary englishDictionary;
+private final NGramBloomFilter nGramBloomFilter;
 private final Logger logger = LoggerFactory.getLogger(getClass());

 @Inject
-public QueryFactory(LanguageModels lm, NGramDict dict, EnglishDictionary englishDictionary) {
+public QueryFactory(LanguageModels lm, TermFrequencyDict dict, EnglishDictionary englishDictionary, NGramBloomFilter nGramBloomFilter) {
 this.lm = lm;
 this.dict = dict;

 this.englishDictionary = englishDictionary;
+this.nGramBloomFilter = nGramBloomFilter;
 }

 public QueryParser getParser() {
-return new QueryParser(englishDictionary, new QueryVariants(lm ,dict, englishDictionary));
+return new QueryParser(englishDictionary, new QueryVariants(lm ,dict, nGramBloomFilter, englishDictionary));
 }

 public EdgeSearchQuery createQuery(EdgeUserSearchParameters params) {
@@ -10,7 +10,8 @@ import nu.marginalia.util.language.processing.KeywordExtractor;
 import nu.marginalia.util.language.processing.SentenceExtractor;
 import nu.marginalia.util.language.processing.model.DocumentSentence;
 import nu.marginalia.util.language.processing.model.WordSpan;
-import nu.marginalia.wmsa.edge.assistant.dict.NGramDict;
+import nu.marginalia.wmsa.edge.assistant.dict.NGramBloomFilter;
+import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict;
 import opennlp.tools.stemmer.PorterStemmer;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -24,14 +25,18 @@ public class QueryVariants {
 private final Logger logger = LoggerFactory.getLogger(getClass());
 private final KeywordExtractor keywordExtractor;
 private final SentenceExtractor sentenceExtractor;
-private final NGramDict dict;
+private final TermFrequencyDict dict;
 private final PorterStemmer ps = new PorterStemmer();

-private final static int MAX_NGRAM_LENGTH = 4;
+private final NGramBloomFilter nGramBloomFilter;
 private final EnglishDictionary englishDictionary;

 @Inject
-public QueryVariants(LanguageModels lm, NGramDict dict, EnglishDictionary englishDictionary) {
+public QueryVariants(LanguageModels lm,
+                     TermFrequencyDict dict,
+                     NGramBloomFilter nGramBloomFilter,
+                     EnglishDictionary englishDictionary) {
+this.nGramBloomFilter = nGramBloomFilter;
 this.englishDictionary = englishDictionary;
 this.keywordExtractor = new KeywordExtractor();
 this.sentenceExtractor = new SentenceExtractor(lm);
@@ -154,11 +159,11 @@ public class QueryVariants {
 double q = 0;
 for (var word : lst) {
 String[] parts = underscore.split(word);
-StringJoiner combined = new StringJoiner("_");
+double qp = 0;
 for (String part : parts) {
-combined.add(ps.stem(part));
+qp += 1./(1+ dict.getTermFreq(part));
 }
-q += Math.log(1 + dict.getTermFreqStemmed(combined.toString()));
+q += 1.0 / qp;
 }
 ret.add(new QueryVariant(lst, q));
 }
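The scoring change replaces the old lookup of the stemmed, joined n-gram's frequency with an estimate built from the parts: each underscore-separated part contributes 1/(1 + freq), and inverting the sum gives a harmonic-mean-style frequency proxy that is dominated by the rarest part, so a compound term is only rated as common as its least frequent component. A self-contained sketch of just that arithmetic (the frequency lookup is stubbed here; in the real code it comes from TermFrequencyDict#getTermFreq):

    import java.util.List;
    import java.util.Map;

    class VariantScoreSketch {
        // Stub standing in for TermFrequencyDict#getTermFreq; values are made up.
        static long getTermFreq(String part) {
            return Map.of("marginalia", 50L, "search", 2_000_000L).getOrDefault(part, 10_000L);
        }

        // Mirrors the new loop body: sum 1/(1+freq) over the parts, then invert.
        static double wordScore(List<String> parts) {
            double qp = 0;
            for (String part : parts) {
                qp += 1. / (1 + getTermFreq(part));
            }
            return 1.0 / qp;
        }

        public static void main(String[] args) {
            // Prints roughly 51: the rare part "marginalia" caps the estimate.
            System.out.println(wordScore(List.of("marginalia", "search")));
        }
    }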
@@ -215,8 +220,8 @@ public class QueryVariants {
 while (wordMatcher.find(ws) && stemmedMatcher.find(ss)) {
 ws = wordMatcher.start()+1;
 ss = stemmedMatcher.start()+1;
-if (dict.getTermFreqStemmed(splitAtNumBoundaryAndStem(span.word, stemmedMatcher.start(), "_")) > 0
-|| dict.getTermFreqStemmed(splitAtNumBoundaryAndStem(span.word, stemmedMatcher.start(), "-")) > 0)
+if (nGramBloomFilter.isKnownNGram(splitAtNumBoundary(span.word, stemmedMatcher.start(), "_"))
+|| nGramBloomFilter.isKnownNGram(splitAtNumBoundary(span.word, stemmedMatcher.start(), "-")))
 {
 String combined = splitAtNumBoundary(span.word, wordMatcher.start(), "_");
 asTokens2.add(combined);
@@ -242,7 +247,7 @@ public class QueryVariants {

 for (var span : ls) {
 var matcher = dashBoundary.matcher(span.word);
-if (matcher.find() && dict.getTermFreqStemmed(ps.stem(dashBoundary.matcher(span.word).replaceAll(""))) > 0) {
+if (matcher.find() && nGramBloomFilter.isKnownNGram(ps.stem(dashBoundary.matcher(span.word).replaceAll("")))) {
 dash = true;
 String combined = dashBoundary.matcher(span.word).replaceAll("");
 asTokens2.add(combined);
@@ -262,10 +267,6 @@ public class QueryVariants {
 return in.substring(0, splitPoint+1) + joiner + in.substring(splitPoint+1);
 }

-private String splitAtNumBoundaryAndStem(String in, int splitPoint, String joiner) {
-return ps.stem(in.substring(0, splitPoint+1)) + joiner + ps.stem(in.substring(splitPoint+1));
-}
-
 private List<List<Word>> getWordSpans(TreeMap<Integer, List<WordSpan>> byStart, DocumentSentence sentence, List<ArrayList<WordSpan>> livingSpans) {
 List<List<Word>> goodSpans = new ArrayList<>();
 for (int i = 0; i < sentence.length(); i++) {
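The isKnownNGram checks above swap an exact frequency lookup for a probabilistic membership test: a Bloom filter can answer "have I seen this n-gram?" with a small false-positive rate while using far less memory than the full term-frequency table, and it never yields false negatives. A sketch of the technique using Guava's BloomFilter (an illustration of the idea, not the project's actual NGramBloomFilter implementation; the sizing numbers are assumptions):

    import com.google.common.hash.BloomFilter;
    import com.google.common.hash.Funnels;
    import java.nio.charset.StandardCharsets;

    class NGramBloomFilterSketch {
        private final BloomFilter<CharSequence> filter =
                BloomFilter.create(Funnels.stringFunnel(StandardCharsets.UTF_8),
                        10_000_000,  // expected number of distinct n-grams (assumed)
                        0.01);       // target ~1% false-positive rate

        void add(String ngram) {
            filter.put(ngram);
        }

        // Counterpart of isKnownNGram(): may say yes for an unseen n-gram (false positive),
        // but never says no for one that was added.
        boolean isKnownNGram(String ngram) {
            return filter.mightContain(ngram);
        }
    }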
@@ -3,7 +3,7 @@ package nu.marginalia.wmsa.edge.search.results;
 import com.google.inject.Inject;
 import com.google.inject.Singleton;
 import nu.marginalia.util.language.WordPatterns;
-import nu.marginalia.wmsa.edge.assistant.dict.NGramDict;
+import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict;
 import nu.marginalia.wmsa.edge.index.model.IndexBlock;
 import nu.marginalia.wmsa.edge.model.search.EdgeSearchResultKeywordScore;

@@ -12,7 +12,7 @@ import java.util.regex.Pattern;

 @Singleton
 public class SearchResultValuator {
-private final NGramDict dict;
+private final TermFrequencyDict dict;

 private static final Pattern separator = Pattern.compile("_");

@@ -20,7 +20,7 @@ public class SearchResultValuator {
 private static final int AVG_LENGTH = 1400;

 @Inject
-public SearchResultValuator(NGramDict dict) {
+public SearchResultValuator(TermFrequencyDict dict) {
 this.dict = dict;
 }

@@ -2,7 +2,7 @@
 <html>
 <head>
 <meta charset="UTF-8">
-<title>Marginalia Search}</title>
+<title>Marginalia Search</title>

 <link rel="stylesheet" href="/style-new.css" />
 <link rel="search" type="application/opensearchdescription+xml" href="/opensearch.xml" title="Marginalia">
@@ -1,7 +1,7 @@
 package nu.marginalia.util;

-import nu.marginalia.wmsa.configuration.WmsaHome;
 import nu.marginalia.util.language.conf.LanguageModels;
+import nu.marginalia.wmsa.configuration.WmsaHome;

 import java.nio.file.Files;
 import java.nio.file.Path;
@@ -26,7 +26,7 @@ public class TestLanguageModels {
 var languageModelsHome = getLanguageModelsPath();

 return new LanguageModels(
-languageModelsHome.resolve("ngrams-generous-emstr.bin"),
+languageModelsHome.resolve("ngrams.bin"),
 languageModelsHome.resolve("tfreq-generous-emstr.bin"),
 languageModelsHome.resolve("opennlp-sentence.bin"),
 languageModelsHome.resolve("English.RDR"),
@@ -1,9 +1,9 @@
 package nu.marginalia.wmsa.edge.assistant.suggest;

 import nu.marginalia.util.TestLanguageModels;
-import nu.marginalia.wmsa.edge.assistant.dict.NGramDict;
-import nu.marginalia.wmsa.edge.assistant.dict.SpellChecker;
 import nu.marginalia.util.language.conf.LanguageModels;
+import nu.marginalia.wmsa.edge.assistant.dict.SpellChecker;
+import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict;
 import org.junit.jupiter.api.BeforeAll;
 import org.junit.jupiter.api.Test;

@@ -17,7 +17,7 @@ class SuggestionsTest {
 public static void setUp() {
 LanguageModels lm = TestLanguageModels.getLanguageModels();
 suggestions = new Suggestions(Path.of("/home/vlofgren/Work/sql-titles-clean"),
-new SpellChecker(), new NGramDict(lm));
+new SpellChecker(), new TermFrequencyDict(lm));
 }

 @Test
@@ -10,7 +10,7 @@ import nu.marginalia.util.language.processing.SentenceExtractor;
 import nu.marginalia.util.language.processing.model.WordRep;
 import nu.marginalia.util.language.processing.model.WordSpan;
 import nu.marginalia.util.language.processing.model.tag.WordSeparator;
-import nu.marginalia.wmsa.edge.assistant.dict.NGramDict;
+import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict;
 import nu.marginalia.wmsa.edge.integration.wikipedia.WikipediaReader;
 import nu.marginalia.wmsa.edge.model.EdgeDomain;
 import org.jsoup.Jsoup;
@@ -45,7 +45,7 @@ class SentenceExtractorTest {

 System.out.println("Running");

-var dict = new NGramDict(lm);
+var dict = new TermFrequencyDict(lm);

 SentenceExtractor se = new SentenceExtractor(lm);
 KeywordExtractor keywordExtractor = new KeywordExtractor();
@@ -85,7 +85,7 @@ class SentenceExtractorTest {

 System.out.println("Running");

-var dict = new NGramDict(lm);
+var dict = new TermFrequencyDict(lm);

 DocumentKeywordExtractor documentKeywordExtractor = new DocumentKeywordExtractor(dict);

@@ -110,7 +110,7 @@ class SentenceExtractorTest {

 System.out.println("Running");

-var dict = new NGramDict(lm);
+var dict = new TermFrequencyDict(lm);

 DocumentKeywordExtractor documentKeywordExtractor = new DocumentKeywordExtractor(dict);

@@ -154,7 +154,7 @@ class SentenceExtractorTest {
 public void testSE() {
 var result = newSe.extractSentences(Jsoup.parse(new URL("https://memex.marginalia.nu/log/26-personalized-pagerank.gmi"), 10000));

-var dict = new NGramDict(lm);
+var dict = new TermFrequencyDict(lm);
 System.out.println(new DocumentKeywordExtractor(dict).extractKeywords(result));

@@ -1,10 +1,10 @@
 package nu.marginalia.wmsa.edge.integration.arxiv;

 import nu.marginalia.util.TestLanguageModels;
-import nu.marginalia.wmsa.edge.assistant.dict.NGramDict;
 import nu.marginalia.util.language.conf.LanguageModels;
 import nu.marginalia.util.language.processing.DocumentKeywordExtractor;
 import nu.marginalia.util.language.processing.SentenceExtractor;
+import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict;
 import nu.marginalia.wmsa.edge.integration.arxiv.model.ArxivMetadata;
 import org.junit.jupiter.api.Disabled;
 import org.junit.jupiter.api.Test;
@@ -26,7 +26,7 @@ class ArxivParserTest {

 @Test
 void extractKeywords() throws IOException {
-var dict = new NGramDict(lm);
+var dict = new TermFrequencyDict(lm);

 DocumentKeywordExtractor documentKeywordExtractor = new DocumentKeywordExtractor(dict);

@@ -1,11 +1,11 @@
 package nu.marginalia.wmsa.edge.integration.stackoverflow;

+import nu.marginalia.util.ParallelPipe;
 import nu.marginalia.util.TestLanguageModels;
-import nu.marginalia.wmsa.edge.assistant.dict.NGramDict;
 import nu.marginalia.util.language.conf.LanguageModels;
 import nu.marginalia.util.language.processing.DocumentKeywordExtractor;
 import nu.marginalia.util.language.processing.SentenceExtractor;
-import nu.marginalia.util.ParallelPipe;
+import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict;
 import nu.marginalia.wmsa.edge.integration.model.BasicDocumentData;
 import nu.marginalia.wmsa.edge.integration.stackoverflow.model.StackOverflowPost;
 import nu.marginalia.wmsa.edge.model.EdgeDomain;
@@ -20,7 +20,7 @@ public class StackOverflowPostsTest {

 @Test @Disabled("this is stupidly slow")
 public void test() throws ParserConfigurationException, SAXException, InterruptedException {
-var documentKeywordExtractor = new DocumentKeywordExtractor(new NGramDict(lm));
+var documentKeywordExtractor = new DocumentKeywordExtractor(new TermFrequencyDict(lm));

 ThreadLocal<StackOverflowPostProcessor> processor = ThreadLocal.withInitial(() -> {
 return new StackOverflowPostProcessor(new SentenceExtractor(lm), documentKeywordExtractor);
@@ -1,13 +1,13 @@
 package nu.marginalia.wmsa.edge.integration.wikipedia;

 import lombok.SneakyThrows;
+import nu.marginalia.util.ParallelPipe;
 import nu.marginalia.util.TestLanguageModels;
-import nu.marginalia.wmsa.edge.assistant.dict.NGramDict;
 import nu.marginalia.util.language.DocumentDebugger;
 import nu.marginalia.util.language.conf.LanguageModels;
 import nu.marginalia.util.language.processing.DocumentKeywordExtractor;
 import nu.marginalia.util.language.processing.SentenceExtractor;
-import nu.marginalia.util.ParallelPipe;
+import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict;
 import nu.marginalia.wmsa.edge.integration.model.BasicDocumentData;
 import nu.marginalia.wmsa.edge.integration.wikipedia.model.WikipediaArticle;
 import nu.marginalia.wmsa.edge.model.EdgeDomain;
@@ -21,7 +21,7 @@ public class WikipediaTest {

 @Test @SneakyThrows
 public void test() {
-var documentKeywordExtractor = new DocumentKeywordExtractor(new NGramDict(lm));
+var documentKeywordExtractor = new DocumentKeywordExtractor(new TermFrequencyDict(lm));
 ThreadLocal<WikipediaProcessor> processor = ThreadLocal.withInitial(() -> {
 return new WikipediaProcessor(new SentenceExtractor(lm), documentKeywordExtractor);
 });
@@ -48,7 +48,7 @@ public class WikipediaTest {

 @Test @SneakyThrows
 public void test2() {
-var documentKeywordExtractor = new DocumentKeywordExtractor(new NGramDict(lm));
+var documentKeywordExtractor = new DocumentKeywordExtractor(new TermFrequencyDict(lm));
 var debugger = new DocumentDebugger(lm);

 ThreadLocal<WikipediaProcessor> processor = ThreadLocal.withInitial(() -> {
@@ -1,31 +1,35 @@
 package nu.marginalia.wmsa.edge.search.query;

 import nu.marginalia.util.TestLanguageModels;
-import nu.marginalia.wmsa.edge.assistant.dict.NGramDict;
 import nu.marginalia.util.language.conf.LanguageModels;
+import nu.marginalia.wmsa.edge.assistant.dict.NGramBloomFilter;
+import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict;
 import org.junit.jupiter.api.BeforeAll;
 import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Test;

+import java.io.IOException;
 import java.util.List;

-import static org.junit.jupiter.api.Assertions.*;
+import static org.junit.jupiter.api.Assertions.assertEquals;

 class BodyQueryParserTest {
 private QueryParser parser;
-private static NGramDict dict;
+private static TermFrequencyDict dict;
 private static EnglishDictionary englishDictionary;
+private static NGramBloomFilter nGramBloomFilter;
 private static final LanguageModels lm = TestLanguageModels.getLanguageModels();

 @BeforeAll
-public static void init() {
-dict = new NGramDict(lm);
+public static void init() throws IOException {
+dict = new TermFrequencyDict(lm);
+nGramBloomFilter = new NGramBloomFilter(lm);
 englishDictionary = new EnglishDictionary(dict);
 }

 @BeforeEach
 public void setUp() {
-parser = new QueryParser(englishDictionary, new QueryVariants(lm, dict, englishDictionary));
+parser = new QueryParser(englishDictionary, new QueryVariants(lm, dict, nGramBloomFilter, englishDictionary));
 }

 @Test
@@ -1,8 +1,8 @@
 package nu.marginalia.wmsa.edge.search.query;

 import nu.marginalia.util.TestLanguageModels;
-import nu.marginalia.wmsa.edge.assistant.dict.NGramDict;
 import nu.marginalia.util.language.conf.LanguageModels;
+import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict;
 import org.junit.jupiter.api.Test;

 class EnglishDictionaryTest {
@@ -11,7 +11,7 @@ class EnglishDictionaryTest {
 void getWordVariants() {
 LanguageModels lm = TestLanguageModels.getLanguageModels();

-var dict = new NGramDict(lm);
+var dict = new TermFrequencyDict(lm);
 new EnglishDictionary(dict).getWordVariants("dos").forEach(System.out::println);
 }
 }
@@ -1,25 +1,29 @@
 package nu.marginalia.wmsa.edge.search.query;

 import nu.marginalia.util.TestLanguageModels;
-import nu.marginalia.wmsa.edge.assistant.dict.NGramDict;
 import nu.marginalia.util.language.conf.LanguageModels;
+import nu.marginalia.wmsa.edge.assistant.dict.NGramBloomFilter;
+import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict;
 import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Test;

+import java.io.IOException;
 import java.util.stream.Collectors;

 class QueryParserTest {
 private QueryParser parser;
-private static NGramDict dict;
+private static TermFrequencyDict dict;
 private static EnglishDictionary englishDictionary;
+private static NGramBloomFilter nGramBloomFilter;
 private static final LanguageModels lm = TestLanguageModels.getLanguageModels();

 @BeforeEach
-public void setUp() {
-dict = new NGramDict(lm);
+public void setUp() throws IOException {
+dict = new TermFrequencyDict(lm);
+nGramBloomFilter = new NGramBloomFilter(lm);
 englishDictionary = new EnglishDictionary(dict);

-parser = new QueryParser(englishDictionary, new QueryVariants(lm, dict, englishDictionary));
+parser = new QueryParser(englishDictionary, new QueryVariants(lm, dict, nGramBloomFilter, englishDictionary));
 }

 @Test
@@ -3,23 +3,27 @@ package nu.marginalia.wmsa.edge.search.query;
 import nu.marginalia.util.TestLanguageModels;
 import nu.marginalia.util.language.conf.LanguageModels;
 import nu.marginalia.util.language.processing.SentenceExtractor;
-import nu.marginalia.wmsa.edge.assistant.dict.NGramDict;
+import nu.marginalia.wmsa.edge.assistant.dict.NGramBloomFilter;
+import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict;
 import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Test;

+import java.io.IOException;

 class QueryVariantsTest {
 QueryVariants variants;
 QueryParser parser;
 SentenceExtractor se;

 @BeforeEach
-public void setUp() {
+public void setUp() throws IOException {
 LanguageModels lm = TestLanguageModels.getLanguageModels();

 se = new SentenceExtractor(lm);

-var dict = new NGramDict(lm);
-variants = new QueryVariants(lm, dict, new EnglishDictionary(dict));
+var dict = new TermFrequencyDict(lm);
+var ngrams = new NGramBloomFilter(lm);
+variants = new QueryVariants(lm, dict, ngrams, new EnglishDictionary(dict));
 parser = new QueryParser(new EnglishDictionary(dict), variants);
 }
