Tweaks for search result relevance
commit 5f993c72dd
parent 813399401e
@@ -63,7 +63,7 @@ public class WordPatterns {
         if (word.isBlank()) {
             return false;
         }
-        if (hasMoreThanTwo(word, '-', 2)) {
+        if (hasMoreThanTwo(word, '-', 4)) {
             return false;
         }
         if (hasMoreThanTwo(word, '+', 2)) {
@@ -80,7 +80,7 @@ public class WordPatterns {
             if (Character.isDigit(word.charAt(i))) {
                 numDigits++;
             }
-            if (numDigits > 6)
+            if (numDigits > 16)
                 return false;
         }

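Taken together, the two WordPatterns hunks above loosen the word filter: up to four hyphens (previously two) and up to sixteen digits (previously six) are now tolerated before a token is rejected. A minimal sketch of the combined effect, assuming hasMoreThanTwo(word, c, n) means "c occurs more than n times in word"; countOccurrences is a stand-in helper, and the real WordPatterns applies further checks not shown in this diff:

    static boolean passesRelaxedFilter(String word) {
        if (word.isBlank()) return false;
        if (countOccurrences(word, '-') > 4) return false;    // was > 2 before this commit
        if (countOccurrences(word, '+') > 2) return false;    // unchanged

        int numDigits = 0;
        for (int i = 0; i < word.length(); i++) {
            if (Character.isDigit(word.charAt(i))) numDigits++;
            if (numDigits > 16) return false;                  // was > 6 before this commit
        }
        return true;
    }

    static int countOccurrences(String s, char c) {
        int n = 0;
        for (int i = 0; i < s.length(); i++) {
            if (s.charAt(i) == c) n++;
        }
        return n;
    }

The practical effect is that heavily hyphenated terms and long version or serial numbers survive into the keyword index instead of being dropped.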
@@ -5,8 +5,8 @@ import java.util.regex.Pattern;

 public class AsciiFlattener {

-    private static final Pattern nonAscii = Pattern.compile("[^a-zA-Z0-9_.'+@#:]+");
-    private static final Pattern plainAsciiPattern = Pattern.compile("^[a-zA-Z0-9_.'+@#:]+$");
+    private static final Pattern nonAscii = Pattern.compile("[^a-zA-Z0-9_.'+@#:\\-]+");
+    private static final Pattern plainAsciiPattern = Pattern.compile("^[a-zA-Z0-9_.'+@#:\\-]+$");
     private static final Predicate<String> plainAscii = plainAsciiPattern.asMatchPredicate();

     public static String flattenUnicode(String s) {

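Both AsciiFlattener patterns gain \- in their character classes, so hyphenated tokens now count as plain ASCII rather than being flattened or scrubbed. A quick illustrative check against the new pattern (not code from the repository; Pattern and Predicate come from java.util.regex and java.util.function):

    Predicate<String> plainAscii =
            Pattern.compile("^[a-zA-Z0-9_.'+@#:\\-]+$").asMatchPredicate();

    plainAscii.test("e-mail");       // true after this change, false before: '-' was not in the class
    plainAscii.test("smörgåsbord");  // false either way; non-ASCII letters are still excluded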
@@ -1,6 +1,5 @@
 package nu.marginalia.util.language.processing;

-import com.google.common.collect.Sets;
 import nu.marginalia.util.language.WordPatterns;
 import nu.marginalia.util.language.processing.model.DocumentLanguageData;
 import nu.marginalia.util.language.processing.model.WordRep;
@@ -12,7 +11,6 @@ import nu.marginalia.wmsa.edge.model.crawl.EdgePageWords;
 import javax.inject.Inject;
 import java.util.*;
 import java.util.stream.Collectors;
-import java.util.stream.Stream;

 public class DocumentKeywordExtractor {

@@ -42,7 +40,7 @@ public class DocumentKeywordExtractor {
         List<WordRep> titleWords = extractTitleWords(documentLanguageData);

         KeywordCounter.WordHistogram wordsTfIdf = tfIdfCounter.countHisto(documentLanguageData);
-        List<WordRep> wordsNamesAll = nameCounter.count(documentLanguageData, 1);
+        List<WordRep> wordsNamesAll = nameCounter.count(documentLanguageData, 2);
         List<WordRep> subjects = subjectCounter.count(documentLanguageData);

         List<WordRep> midKeywords = new ArrayList<>(wordsTfIdf.mid());
@@ -190,30 +188,7 @@ public class DocumentKeywordExtractor {
                 .collect(Collectors.toList());
     }

-    private Collection<WordRep> joinWordLists(List<WordRep>... words) {
-        int size = 0;
-        for (var lst : words) {
-            size += lst.size();
-        }
-        if (size == 0)
-            return Collections.emptyList();
-
-        final LinkedHashSet<WordRep> ret = new LinkedHashSet<>(size);
-        for (var lst : words) {
-            ret.addAll(lst);
-        }
-        return ret;
-    }
-
-
     public EdgePageWords createWords(IndexBlock block, Collection<WordRep> words) {
         return new EdgePageWords(block, words.stream().map(w -> w.word).map(AsciiFlattener::flattenUnicode).filter(WordPatterns.wordQualitiesPredicate).collect(Collectors.toSet()));
     }
-
-    private Set<WordRep> overlappingStems(Collection<WordRep> wordsA, Collection<WordRep> wordsB) {
-        Set<String> stemmedA = wordsA.stream().map(WordRep::getStemmed).collect(Collectors.toSet());
-        Set<String> stemmedB = wordsB.stream().map(WordRep::getStemmed).collect(Collectors.toSet());
-        Set<String> stemmedIntersect = Sets.intersection(stemmedA, stemmedB);
-        return Stream.concat(wordsA.stream(), wordsB.stream()).filter(w -> stemmedIntersect.contains(w.getStemmed())).collect(Collectors.toSet());
-    }
 }

@@ -1,5 +1,6 @@
 package nu.marginalia.util.language.processing;

+import nu.marginalia.util.language.WordPatterns;
 import nu.marginalia.util.language.processing.model.DocumentLanguageData;
 import nu.marginalia.util.language.processing.model.WordRep;
 import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict;
@@ -22,17 +23,20 @@ public class KeywordCounter {
     }

     public WordHistogram countHisto(DocumentLanguageData dld) {
-        HashMap<String, Double> counts = new HashMap<>(1000);
+        HashMap<String, Integer> counts = new HashMap<>(1000);
         HashMap<String, HashSet<WordRep>> instances = new HashMap<>(1000);


         for (var sent : dld.sentences) {
             var keywords = keywordExtractor.getKeywordsFromSentence(sent);
             for (var span : keywords) {
+                if (span.size() == 1 &&
+                        WordPatterns.isStopWord(sent.words[span.start]))
+                    continue;
+
                 String stemmed = sent.constructStemmedWordFromSpan(span);

-                counts.merge(stemmed, 1., Double::sum);
+                counts.merge(stemmed, 1, Integer::sum);
                 instances.computeIfAbsent(stemmed, k -> new HashSet<>()).add(new WordRep(sent, span));
             }
         }
@@ -43,15 +47,23 @@ public class KeywordCounter {
         Set<WordRep> h10 = new HashSet<>();
         Set<WordRep> h15 = new HashSet<>();

+        int doubleWordCount = 0;
+
         for (var entry : counts.entrySet()) {
             double value = getTermValue(entry, maxC);

+            double avgCnt = entry.getValue();
+            String wordStemmed = entry.getKey();
+
             Set<WordRep> histogram;
-            if (value < -3) histogram = h15;
-            else if (value < -2) histogram = h10;
-            else if (value < -1) histogram = h5;
+            if (value < -3 && avgCnt>1) histogram = h15;
+            else if (value < -1.75 && avgCnt>1) histogram = h10;
+            else if (value < -1 &&
+                    (!wordStemmed.contains("_") || doubleWordCount++ < 50))
+                histogram = h5;
             else continue;

-            histogram.addAll(instances.get(entry.getKey()));
+            histogram.addAll(instances.get(wordStemmed));
         }

         return new WordHistogram(h5, h10, h15);
@@ -59,7 +71,7 @@ public class KeywordCounter {

     private static final Pattern separator = Pattern.compile("_");

-    public double getTermValue(Map.Entry<String, Double> e, double maxValue) {
+    public double getTermValue(Map.Entry<String, Integer> e, double maxValue) {
         String[] parts = separator.split(e.getKey());
         double totalValue = 0.;
         for (String part : parts) {
@@ -71,9 +83,9 @@ public class KeywordCounter {
         double value(String key, double value, double maxValue) {
             double freq = dict.getTermFreqStemmed(key);
             if (freq < 1) {
-                freq = 10;
+                freq = 1;
             }
-            return (0.1 + 0.9*value/maxValue) * Math.log((1.1+freq)/docCount);
+            return (0.1 + 0.9*value/maxValue) * Math.log(freq/docCount);
         }

     public record WordHistogram(Set<WordRep> lower, Set<WordRep> mid, Set<WordRep> top) { }

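In KeywordCounter the histogram counts switch from Double to Integer, single-word stopwords are skipped up front, and the bucketing thresholds now also require a term to occur more than once (avgCnt>1) for the two rarest buckets, with at most fifty underscore-joined multi-word terms admitted to the h5 bucket. The per-part score in value() also changes: a term missing from the frequency dictionary is floored at 1 instead of 10, and the 1.1 offset inside the logarithm is dropped. A rough, self-contained restatement of the post-change arithmetic, with freq and docCount standing in for the TermFrequencyDict lookup and the corpus size (illustrative only):

    static double termValue(double inDocCount, double maxInDocCount, double freq, double docCount) {
        if (freq < 1) {
            freq = 1;   // previously floored at 10
        }
        return (0.1 + 0.9 * inDocCount / maxInDocCount) * Math.log(freq / docCount);
    }

Assuming freq never exceeds docCount, the logarithm is zero or negative, so rarer terms get lower (more negative) scores; that is what the value < -3, < -1.75 and < -1 thresholds in countHisto select on.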
@@ -10,84 +10,9 @@ import java.util.ArrayList;
 import java.util.Collections;
 import java.util.List;
 import java.util.Set;
-import java.util.function.Function;
-import java.util.stream.IntStream;
-import java.util.stream.Stream;

 public class KeywordExtractor {

-    public boolean isLegacy() {
-        return legacy;
-    }
-
-    public void setLegacy(boolean legacy) {
-        this.legacy = legacy;
-    }
-
-    private boolean legacy;
-
-    public WordSpan[] getNameLikes(DocumentSentence sentence) {
-        var direct = IntStream.range(0, sentence.length())
-                .filter(i -> sentence.posTags[i].startsWith("N"))
-                .mapToObj(i -> new WordSpan(i, i+1))
-                ;
-        var two = IntStream.range(1, sentence.length())
-                .filter(i -> sentence.separators[i-1] == WordSeparator.SPACE)
-                .filter(i -> isName(i, sentence, Collections.emptySet()))
-                .filter(i -> isName(i -1, sentence, Collections.emptySet()))
-                .mapToObj(i -> new WordSpan(i-1, i+1))
-                ;
-
-        var a_in_b = IntStream.range(2, sentence.length())
-                .filter(i -> sentence.separators[i-1] == WordSeparator.SPACE)
-                .filter(i -> isProperNoun(i, sentence))
-                .filter(i -> isJoiner(sentence, i-1))
-                .filter(i -> isProperNoun(i-2, sentence))
-                .mapToObj(i -> new WordSpan(i-2, i+1))
-                ;
-
-        var a_in_det_b = IntStream.range(3, sentence.length())
-                .filter(i -> sentence.separators[i-1] == WordSeparator.SPACE
-                        && sentence.separators[i-2] == WordSeparator.SPACE)
-                .filter(i -> isProperNoun(i, sentence))
-                .filter(i -> isJoiner(sentence, i-1))
-                .filter(i -> sentence.posTags[i-2].equals("DT"))
-                .filter(i -> isProperNoun(i-3, sentence))
-                .mapToObj(i -> new WordSpan(i-3, i+1))
-                ;
-        var a_in_in_b = IntStream.range(3, sentence.length())
-                .filter(i -> sentence.separators[i-1] == WordSeparator.SPACE
-                        && sentence.separators[i-2] == WordSeparator.SPACE)
-                .filter(i -> isProperNoun(i, sentence))
-                .filter(i -> isJoiner(sentence, i-1) || isProperNoun(i-1, sentence))
-                .filter(i -> isJoiner(sentence, i-2) || isProperNoun(i-2, sentence))
-                .filter(i -> isProperNoun(i-3, sentence))
-                .mapToObj(i -> new WordSpan(i-3, i+1))
-                ;
-        var three = IntStream.range(2, sentence.length())
-                .filter(i -> sentence.separators[i-1] == WordSeparator.SPACE
-                        && sentence.separators[i-2] == WordSeparator.SPACE)
-                .filter(i -> isName(i, sentence, Collections.emptySet()))
-                .filter(i -> isName(i-1, sentence, Collections.emptySet()))
-                .filter(i -> isName(i-2, sentence, Collections.emptySet()))
-                .mapToObj(i -> new WordSpan(i-2, i+1))
-                ;
-        var four = IntStream.range(3, sentence.length())
-                .filter(i -> sentence.separators[i-1] == WordSeparator.SPACE
-                        && sentence.separators[i-2] == WordSeparator.SPACE
-                        && sentence.separators[i-3] == WordSeparator.SPACE)
-                .filter(i -> isName(i, sentence, Collections.emptySet()))
-                .filter(i -> isName(i - 1, sentence, Collections.emptySet()))
-                .filter(i -> isName(i - 2, sentence, Collections.emptySet()))
-                .filter(i -> isName(i - 3, sentence, Collections.emptySet()))
-                .mapToObj(i -> new WordSpan(i-3, i+1))
-                ;
-
-        return Stream.of(direct, two, a_in_b, a_in_in_b, a_in_det_b, three, four).flatMap(Function.identity())
-                .toArray(WordSpan[]::new);
-    }
-
-
     public WordSpan[] getNames(DocumentSentence sentence) {
         List<WordSpan> spans = new ArrayList<>(sentence.length());

@@ -214,7 +139,7 @@ public class KeywordExtractor {
         }
         String word = sentence.constructWordFromSpan(w);

-        if (word.isBlank() || WordPatterns.isStopWord(word)) return false;
+        if (word.isBlank() || !WordPatterns.filter(word)) return false;
         if (sentence.posTags[w.start].equals("CC")) return false;
         if (sentence.posTags[w.end-1].equals("IN")) return false;
         if (sentence.posTags[w.end-1].equals("DT")) return false;

@@ -22,6 +22,9 @@ public class NameCounter {
             DocumentSentence sent = dld.sentences[i];
             var keywords = keywordExtractor.getNames(sent);
             for (var span : keywords) {
+                if (span.size() <= 1)
+                    continue;
+
                 var stemmed = sent.constructStemmedWordFromSpan(span);

                 counts.merge(stemmed, 1., Double::sum);

@@ -52,7 +52,7 @@ public class ConverterMain {
         logger.info("Starting pipe");

         try (WorkLog processLog = plan.createProcessWorkLog()) {
-            var pipe = new ParallelPipe<CrawledDomain, ProcessingInstructions>("Crawler", 48, 4, 2) {
+            var pipe = new ParallelPipe<CrawledDomain, ProcessingInstructions>("Crawler", 16, 4, 2) {

                 @Override
                 protected ProcessingInstructions onProcess(CrawledDomain domainData) {
@@ -73,12 +73,7 @@ public class ConverterMain {

             };

-            plan.forEachCrawledDomain(domain -> {
-                if (!processLog.isJobFinished(domain.id)) {
-                    logger.info("{} - {}", domain.domain, domain.id);
-                    pipe.accept(domain);
-                }
-            });
+            plan.forEachCrawledDomain(id -> !processLog.isJobFinished(id), pipe::accept);

             pipe.join();
         }

@@ -3,6 +3,14 @@ package nu.marginalia.wmsa.edge.converting;
 import com.github.luben.zstd.ZstdOutputStream;
 import com.google.gson.Gson;
 import nu.marginalia.wmsa.edge.converting.interpreter.Instruction;
+import nu.marginalia.wmsa.edge.converting.interpreter.Interpreter;
+import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DocumentKeywords;
+import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DomainLink;
+import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadProcessedDocument;
+import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadProcessedDocumentWithError;
+import nu.marginalia.wmsa.edge.model.EdgeDomain;
+import nu.marginalia.wmsa.edge.model.EdgeUrl;
+import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

@@ -36,7 +44,9 @@ public class CrawledInstructionWriter {
         }

         try (var outputStream = new OutputStreamWriter(new ZstdOutputStream(new BufferedOutputStream(new FileOutputStream(outputFile.toFile()))))) {
-            logger.info("Writing {} - {}", id, instructionList.size());
+            SummarizingInterpreter summary = new SummarizingInterpreter(instructionList);
+            logger.info("Writing {} - {} - {}", id, instructionList.size(), summary);
+
             for (var instr : instructionList) {
                 outputStream.append(instr.tag().name());
@@ -59,4 +69,54 @@ public class CrawledInstructionWriter {
         }
         return destDir.resolve(id + ".pzstd");
     }
+
+    private static class SummarizingInterpreter implements Interpreter {
+
+        private SummarizingInterpreter(List<Instruction> instructions) {
+            for (var i : instructions) {
+                i.apply(this);
+            }
+        }
+
+        private String domainName;
+        private int ok = 0;
+        private int error = 0;
+
+        public String toString() {
+            return String.format("%s - %d %d", domainName, ok, error);
+        }
+
+        @Override
+        public void loadUrl(EdgeUrl[] url) {}
+
+        @Override
+        public void loadDomain(EdgeDomain[] domain) {}
+
+        @Override
+        public void loadRssFeed(EdgeUrl[] rssFeed) {}
+
+        @Override
+        public void loadDomainLink(DomainLink[] links) {}
+
+        @Override
+        public void loadProcessedDomain(EdgeDomain domain, EdgeDomainIndexingState state, String ip) {
+            this.domainName = domain.toString();
+        }
+
+        @Override
+        public void loadProcessedDocument(LoadProcessedDocument loadProcessedDocument) {
+            ok++;
+        }
+
+        @Override
+        public void loadProcessedDocumentWithError(LoadProcessedDocumentWithError loadProcessedDocumentWithError) {
+            error++;
+        }
+
+        @Override
+        public void loadKeywords(EdgeUrl url, DocumentKeywords[] words) {}
+
+        @Override
+        public void loadDomainRedirect(DomainLink link) {}
+    }
 }

@@ -72,7 +72,7 @@ public class Loader implements Interpreter {
     @Override
     public void loadDomainLink(DomainLink[] links) {
         logger.debug("loadDomainLink({})", links, null);
-        sqlLoadDomainLinks.load(links);
+        sqlLoadDomainLinks.load(data, links);
     }

     @Override

@@ -40,13 +40,20 @@ public class SqlLoadDomainLinks {
         }
     }

-    public void load(DomainLink[] links) {
+    public void load(LoaderData data, DomainLink[] links) {

         try (var connection = dataSource.getConnection();
+             var nukeExistingLinksForDomain =
+                     connection.prepareStatement("""
+                             DELETE FROM EC_DOMAIN_LINK WHERE SOURCE_DOMAIN_ID=?
+                             """);
             var stmt =
                     connection.prepareCall("CALL INSERT_LINK(?,?)"))
         {
+            nukeExistingLinksForDomain.setInt(1, data.getDomainId(links[0].from()));
+            nukeExistingLinksForDomain.executeUpdate();
+
             for (DomainLink link : links) {
                 stmt.setString(1, link.from().toString());
                 stmt.setString(2, link.to().toString());

@@ -15,6 +15,7 @@ public class ProcessedDocument {
     public EdgePageWordSet words;

     public EdgeUrlState state;
+    public String stateReason;

     public OptionalDouble quality() {
         if (details != null) {

@@ -70,11 +70,22 @@ public class DocumentProcessor {
         this.summaryExtractor = summaryExtractor;
     }

+    public ProcessedDocument makeDisqualifiedStub(CrawledDocument crawledDocument) {
+        ProcessedDocument ret = new ProcessedDocument();
+
+        try {
+            ret.state = EdgeUrlState.DISQUALIFIED;
+            ret.url = getDocumentUrl(crawledDocument);
+        }
+        catch (Exception ex) {}
+
+        return ret;
+    }
     public ProcessedDocument process(CrawledDocument crawledDocument, CrawledDomain crawledDomain) {
         ProcessedDocument ret = new ProcessedDocument();

         try {
-            ret.url = new EdgeUrl(crawledDocument.url);
+            ret.url = getDocumentUrl(crawledDocument);
             ret.state = crawlerStatusToUrlState(crawledDocument.crawlerStatus, crawledDocument.httpStatus);

             if (ret.state == EdgeUrlState.OK) {
@@ -99,17 +110,31 @@ public class DocumentProcessor {
         }
         catch (DisqualifiedException ex) {
             ret.state = EdgeUrlState.DISQUALIFIED;
+            ret.stateReason = ex.reason.toString();
             logger.debug("Disqualified {}: {}", ret.url, ex.reason);
         }
         catch (Exception ex) {
             ret.state = EdgeUrlState.DISQUALIFIED;
-            logger.info("Failed to convert " + ret.url, ex);
+            logger.info("Failed to convert " + crawledDocument.url, ex);
             ex.printStackTrace();
         }

         return ret;
     }

+    private EdgeUrl getDocumentUrl(CrawledDocument crawledDocument)
+            throws URISyntaxException
+    {
+        if (crawledDocument.canonicalUrl != null) {
+            try {
+                return new EdgeUrl(crawledDocument.canonicalUrl);
+            }
+            catch (URISyntaxException ex) { /* fallthrough */ }
+        }
+
+        return new EdgeUrl(crawledDocument.url);
+    }
+
     public static boolean isAcceptedContentType(CrawledDocument crawledDocument) {
         if (crawledDocument.contentType == null) {
             return false;
@@ -155,20 +180,26 @@ public class DocumentProcessor {

         var ret = new ProcessedDocumentDetails();

-        ret.description = getDescription(doc);
         ret.length = getLength(doc);
         ret.standard = getHtmlStandard(doc);
         ret.title = titleExtractor.getTitleAbbreviated(doc, dld, crawledDocument.url);
-        ret.features = featureExtractor.getFeatures(crawledDomain, doc, dld);
         ret.quality = documentValuator.getQuality(ret.standard, doc, dld);
         ret.hashCode = HashCode.fromString(crawledDocument.documentBodyHash).asLong();

+        final boolean doSimpleProcessing = ret.quality < minDocumentQuality;
+
         EdgePageWordSet words;
-        if (ret.quality < minDocumentQuality || dld.totalNumWords() < minDocumentLength) {
+        if (doSimpleProcessing) {
+            ret.features = Set.of(HtmlFeature.UNKNOWN);
             words = keywordExtractor.extractKeywordsMinimal(dld);
+            ret.description = "";
         }
         else {
+            ret.features = featureExtractor.getFeatures(crawledDomain, doc, dld);
             words = keywordExtractor.extractKeywords(dld);
+            ret.description = getDescription(doc);
         }

         var url = new EdgeUrl(crawledDocument.url);
@@ -276,6 +307,10 @@ public class DocumentProcessor {
     }

     private void checkDocumentLanguage(DocumentLanguageData dld) throws DisqualifiedException {
+        if (dld.totalNumWords() < minDocumentLength) {
+            throw new DisqualifiedException(DisqualificationReason.LENGTH);
+        }
+
         double languageAgreement = languageFilter.dictionaryAgreement(dld);
         if (languageAgreement < 0.1) {
             throw new DisqualifiedException(DisqualificationReason.LANGUAGE);

@@ -1,23 +1,27 @@
 package nu.marginalia.wmsa.edge.converting.processor;

+import com.google.common.base.Strings;
 import com.google.inject.Inject;
 import com.google.inject.name.Named;
 import nu.marginalia.wmsa.edge.converting.model.ProcessedDocument;
 import nu.marginalia.wmsa.edge.converting.model.ProcessedDomain;
+import nu.marginalia.wmsa.edge.converting.processor.logic.CommonKeywordExtractor;
+import nu.marginalia.wmsa.edge.crawling.model.CrawledDocument;
 import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain;
 import nu.marginalia.wmsa.edge.crawling.model.CrawlerDomainStatus;
 import nu.marginalia.wmsa.edge.index.model.IndexBlock;
 import nu.marginalia.wmsa.edge.model.EdgeDomain;
 import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState;
-import nu.marginalia.wmsa.edge.model.crawl.EdgeUrlState;

 import java.util.*;
-import java.util.stream.Collectors;

 public class DomainProcessor {
+    private static final CommonKeywordExtractor commonKeywordExtractor = new CommonKeywordExtractor();
+
     private final DocumentProcessor documentProcessor;
     private final Double minAvgDocumentQuality;


     @Inject
     public DomainProcessor(DocumentProcessor documentProcessor,
                            @Named("min-avg-document-quality") Double minAvgDocumentQuality
@@ -39,61 +43,71 @@ public class DomainProcessor {
         if (crawledDomain.doc != null) {
             ret.documents = new ArrayList<>(crawledDomain.doc.size());

+            fixBadCanonicalTags(crawledDomain.doc);
+
+            DocumentDisqualifier disqualifier = new DocumentDisqualifier();
             for (var doc : crawledDomain.doc) {
+                if (disqualifier.isQualified()) {
                     var processedDoc = documentProcessor.process(doc, crawledDomain);

                     if (processedDoc.url != null) {
                         ret.documents.add(processedDoc);
+                        processedDoc.quality().ifPresent(disqualifier::offer);
+                    }
+                    else if ("LANGUAGE".equals(processedDoc.stateReason)) {
+                        disqualifier.offer(-100);
+                    }
+                }
+                else { // Short-circuit processing if quality is too low
+                    var stub = documentProcessor.makeDisqualifiedStub(doc);
+                    if (stub.url != null) {
+                        ret.documents.add(stub);
+                    }
                 }
             }

-            addCommonSiteWords(ret);
+            Set<String> commonSiteWords = new HashSet<>(10);
+
+            commonSiteWords.addAll(commonKeywordExtractor.getCommonSiteWords(ret, IndexBlock.Tfidf_Top, IndexBlock.Subjects));
+            commonSiteWords.addAll(commonKeywordExtractor.getCommonSiteWords(ret, IndexBlock.Title));
+
+            if (!commonSiteWords.isEmpty()) {
+                for (var doc : ret.documents) {
+                    if (doc.words != null) {
+                        doc.words.get(IndexBlock.Site).addAll(commonSiteWords);
+                    }
+                }
+            }
         }
         else {
             ret.documents = Collections.emptyList();
         }

-        double averageQuality = getAverageQuality(ret.documents);
-        if (averageQuality < minAvgDocumentQuality) {
-            ret.documents.forEach(doc -> doc.state = EdgeUrlState.DISQUALIFIED);
-        }
-
         ret.state = getState(crawledDomain.crawlerStatus);

         return ret;
     }

-    private void addCommonSiteWords(ProcessedDomain ret) {
-
-        if (ret.documents.size() < 25)
-            return;
-
-        Map<String, Integer> topKeywordCount = new HashMap<>(ret.documents.size()*10);
-
-        for (var doc : ret.documents) {
-            if (doc.words == null)
-                continue;
-
-            for (var word : doc.words.get(IndexBlock.Tfidf_Top).words) {
-                topKeywordCount.merge(word, -1, Integer::sum);
+    private void fixBadCanonicalTags(List<CrawledDocument> docs) {
+        Map<String, Set<String>> seenCanonicals = new HashMap<>();
+
+        // Sometimes sites set a blanket canonical link to their root page
+        // this removes such links from consideration
+
+        for (var document : docs) {
+            if (!Strings.isNullOrEmpty(document.canonicalUrl) && !Objects.equals(document.canonicalUrl, document.url)) {
+                seenCanonicals.computeIfAbsent(document.canonicalUrl, url -> new HashSet<>()).add(document.documentBodyHash);
             }
         }

-        if (topKeywordCount.values().stream().mapToInt(i -> i).sum() > -100)
-            return;
-
-        Set<String> topWords = topKeywordCount.entrySet().stream()
-                .filter(e -> e.getValue() < -10)
-                .sorted(Map.Entry.comparingByValue()).limit(5)
-                .map(Map.Entry::getKey)
-                .collect(Collectors.toSet());
-
-        if (!topWords.isEmpty()) {
-            for (var doc : ret.documents) {
-                if (doc.words != null) {
-                    doc.words.get(IndexBlock.Site).addAll(topWords);
-                }
-            }
-        }
+        for (var document : docs) {
+            if (!Strings.isNullOrEmpty(document.canonicalUrl)
+                    && !Objects.equals(document.canonicalUrl, document.url)
+                    && seenCanonicals.getOrDefault(document.canonicalUrl, Collections.emptySet()).size() > 1) {
+                document.canonicalUrl = document.url;
+            }
+        }
     }

     private double getAverageQuality(List<ProcessedDocument> documents) {
@@ -120,4 +134,20 @@ public class DomainProcessor {
             default -> EdgeDomainIndexingState.ERROR;
         };
     }
+
+    class DocumentDisqualifier {
+        int count;
+        int goodCount;
+
+        void offer(double quality) {
+            count++;
+            if (quality > minAvgDocumentQuality) {
+                goodCount++;
+            }
+        }
+
+        boolean isQualified() {
+            return count < 25 || goodCount*10 >= count;
+        }
+    }
 }

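Two of the DomainProcessor additions deserve a plain-language note. DocumentDisqualifier short-circuits low-quality sites: the first 25 documents of a domain are always processed, and after that, processing continues only while at least one in ten of the offered documents cleared the quality threshold; everything else is reduced to a disqualified stub. The rule, restated as a free-standing check (illustrative, not code from the repository):

    static boolean isQualified(int count, int goodCount) {
        return count < 25 || goodCount * 10 >= count;   // same rule as DocumentDisqualifier.isQualified()
    }
    // isQualified(30, 2) -> false: the rest of the domain is stubbed out
    // isQualified(30, 4) -> true:  full processing continues

fixBadCanonicalTags, meanwhile, only overrides a document's canonical URL when the same canonical target has been claimed by more than one distinct document body hash, which is the signature of a blanket rel=canonical pointing at the site root rather than a legitimate duplicate.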
@@ -0,0 +1,71 @@
+package nu.marginalia.wmsa.edge.converting.processor.logic;
+
+import ca.rmen.porterstemmer.PorterStemmer;
+import nu.marginalia.wmsa.edge.converting.model.ProcessedDomain;
+import nu.marginalia.wmsa.edge.index.model.IndexBlock;
+
+import java.util.*;
+
+public class CommonKeywordExtractor {
+    private final PorterStemmer ps = new PorterStemmer();
+
+    private static final int MIN_REQUIRED_DOCUMENTS = 25;
+
+    private static final int REQUIRED_TOTAL_COUNT_FOR_CONSIDERATION = 100;
+    private static final double QUALIFYING_PROPORTION_FOR_KEYWORD = .25;
+
+    private static final int MAX_SITE_KEYWORDS_TO_EXTRACT = 5;
+
+    public List<String> getCommonSiteWords(ProcessedDomain ret, IndexBlock... sourceBlocks) {
+
+        if (ret.documents.size() < MIN_REQUIRED_DOCUMENTS)
+            return Collections.emptyList();
+
+        final Map<String, String> wordToStemmedMemoized = new HashMap<>(ret.documents.size()*10);
+
+        final Map<String, Integer> topStemmedKeywordCount = new HashMap<>(ret.documents.size()*10);
+        final Map<String, Set<String>> stemmedToNonstemmedVariants = new HashMap<>(ret.documents.size()*10);
+
+        int qualifiedDocCount = 0;
+        for (var doc : ret.documents) {
+            if (doc.words == null)
+                continue;
+
+            qualifiedDocCount++;
+
+            for (var block : sourceBlocks) {
+                for (var word : doc.words.get(block).words) {
+                    String wordStemmed = wordToStemmedMemoized.computeIfAbsent(word, ps::stemWord);
+
+                    // Count by negative values to sort by Map.Entry.comparingByValue() in reverse
+                    topStemmedKeywordCount.merge(wordStemmed, -1, Integer::sum);
+
+                    stemmedToNonstemmedVariants.computeIfAbsent(wordStemmed, w -> new HashSet<>()).add(word);
+                }
+            }
+        }
+
+        int totalValue = 0;
+        for (int value : topStemmedKeywordCount.values()) {
+            totalValue += value;
+        }
+
+        if (totalValue > -REQUIRED_TOTAL_COUNT_FOR_CONSIDERATION)
+            return Collections.emptyList();
+
+        List<String> topWords = new ArrayList<>(MAX_SITE_KEYWORDS_TO_EXTRACT);
+
+        double qualifyingValue = -qualifiedDocCount * QUALIFYING_PROPORTION_FOR_KEYWORD;
+
+        topStemmedKeywordCount.entrySet().stream()
+                .filter(e -> e.getValue() < qualifyingValue)
+                .sorted(Map.Entry.comparingByValue())
+                .limit(MAX_SITE_KEYWORDS_TO_EXTRACT)
+                .forEach(e -> topWords.addAll(stemmedToNonstemmedVariants.get(e.getKey())));
+
+
+        return topWords;
+
+    }
+
+}

@@ -14,6 +14,8 @@ public enum HtmlFeature {
     ADVERTISEMENT("special:ads"),

     CATEGORY_CRAFTS("category:crafts"),
+
+    UNKNOWN("special:uncategorized")
     ;

     private final String keyword;

@@ -13,10 +13,14 @@ import java.io.InputStreamReader;
 import java.nio.file.Path;
 import java.util.ArrayList;
 import java.util.List;
+import java.util.concurrent.ForkJoinPool;
+import java.util.concurrent.TimeUnit;

 public class CrawledDomainReader {
     private final Gson gson = new GsonBuilder().create();

+    private final ForkJoinPool pool = new ForkJoinPool(4);
+
     public CrawledDomainReader() {
     }

@@ -43,7 +47,12 @@ public class CrawledDomainReader {
                 if (line.equals(CrawledDomain.SERIAL_IDENTIFIER)) {
                     domain = gson.fromJson(nextLine, CrawledDomain.class);
                 } else if (line.equals(CrawledDocument.SERIAL_IDENTIFIER)) {
-                    docs.add(gson.fromJson(nextLine, CrawledDocument.class));
+                    pool.execute(() -> {
+                        var doc = gson.fromJson(nextLine, CrawledDocument.class);
+                        synchronized (docs) {
+                            docs.add(doc);
+                        }
+                    });
                 }
             } else if (line.charAt(0) == '{') {
                 domain = gson.fromJson(line, CrawledDomain.class);
@@ -52,6 +61,8 @@ public class CrawledDomainReader {
             }
         }

+        pool.awaitQuiescence(10, TimeUnit.SECONDS);
+
         if (domain == null) {
             return null;
         }

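CrawledDomainReader now hands each serialized document to a small ForkJoinPool and waits for the pool to go quiet before returning, so JSON deserialization of large crawl files is spread over several cores; a side effect is that documents no longer end up in file order. The same pattern in isolation, as a sketch with illustrative names (parse stands in for the expensive gson.fromJson step):

    import java.util.ArrayList;
    import java.util.List;
    import java.util.concurrent.ForkJoinPool;
    import java.util.concurrent.TimeUnit;
    import java.util.function.Function;

    class ParallelLineParser<T> {
        private final ForkJoinPool pool = new ForkJoinPool(4);

        List<T> parseAll(List<String> lines, Function<String, T> parse) {
            List<T> out = new ArrayList<>();
            for (String line : lines) {
                pool.execute(() -> {
                    T value = parse.apply(line);   // expensive deserialization off the calling thread
                    synchronized (out) {           // the shared list needs a lock, as in the diff above
                        out.add(value);
                    }
                });
            }
            pool.awaitQuiescence(10, TimeUnit.SECONDS);
            return out;
        }
    }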
@@ -6,5 +6,6 @@ public enum CrawlerDocumentStatus {
     BAD_CHARSET,
     REDIRECT,
     ROBOTS_TXT,
-    ERROR
+    ERROR,
+    Timeout
 }

@@ -1,5 +1,6 @@
 package nu.marginalia.wmsa.edge.data.dao;

+import com.google.common.base.Strings;
 import com.google.common.cache.Cache;
 import com.google.common.cache.CacheBuilder;
 import com.google.common.util.concurrent.UncheckedExecutionException;
@@ -113,9 +114,12 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
                         Double.MAX_VALUE, // termScore
                         0 // queryLength
                 );
-                if (val.urlQuality >= QUALITY_LOWER_BOUND_CUTOFF) {
-                    result.add(val);
+                if (val.urlQuality <= QUALITY_LOWER_BOUND_CUTOFF
+                        && Strings.isNullOrEmpty(val.description)
+                        && val.url.path.length() > 1) {
+                    continue;
                 }
+                result.add(val);

             }
         }

@@ -6,7 +6,6 @@ import com.google.inject.Inject;
 import com.google.inject.name.Named;
 import com.google.protobuf.InvalidProtocolBufferException;
 import gnu.trove.map.TLongIntMap;
-import gnu.trove.map.hash.TIntIntHashMap;
 import gnu.trove.map.hash.TLongIntHashMap;
 import gnu.trove.set.hash.TIntHashSet;
 import io.prometheus.client.Histogram;
@@ -227,13 +226,8 @@ public class EdgeIndexService extends Service {

         long start = System.currentTimeMillis();
         try {
-            if (specsSet.isStagger()) {
-                return new EdgeSearchResultSet(searchStaggered(specsSet));
-            }
-            else {
                 return new EdgeSearchResultSet(searchStraight(specsSet));
             }
-        }
         catch (HaltException ex) {
             logger.warn("Halt", ex);
             throw ex;
@@ -249,59 +243,9 @@ public class EdgeIndexService extends Service {
         }
     }

-    private Map<IndexBlock, List<EdgeSearchResults>> searchStaggered(EdgeSearchSpecification specsSet) {
-        int count = 0;
-
-        final Map<IndexBlock, List<EdgeSearchResults>> results = new HashMap<>();
-        final TIntHashSet seenResults = new TIntHashSet();
-
-        final DomainResultCountFilter[] domainCountFilter = new DomainResultCountFilter[] {
-                new DomainResultCountFilter(specsSet.limitByDomain),
-                new DomainResultCountFilter(specsSet.limitByDomain)
-        };
-
-        final IndexSearchBudget budget = new IndexSearchBudget(SEARCH_BUDGET_TIMEOUT_MS);
-        final TIntIntHashMap limitsPerBucketRemaining = new TIntIntHashMap(6, 0.7f, 0, specsSet.limitByBucket);
-
-        for (int i = 0; i < specsSet.buckets.size(); i+=2) {
-            for (var sq : specsSet.subqueries) {
-                for (int j = 0; j < 2 && i + j < specsSet.buckets.size(); j++) {
-                    Optional<EdgeIndexSearchTerms> searchTerms = getSearchTerms(sq);
-
-                    if (searchTerms.isEmpty())
-                        continue;
-
-                    var result = performSearch(searchTerms.get(),
-                            budget,
-                            seenResults,
-                            domainCountFilter[j],
-                            sq,
-                            List.of(specsSet.buckets.get(i+j)),
-                            specsSet,
-                            Math.min(limitsPerBucketRemaining.get(i+j), specsSet.limitTotal - count)
-                    );
-
-                    if (logger.isDebugEnabled()) {
-                        logger.debug("{} -> {} {} {}", sq.block, specsSet.buckets.get(i+j), sq.searchTermsInclude, result.results.values().stream().mapToInt(List::size).sum());
-                    }
-
-                    int sz = result.size();
-                    count += sz;
-                    limitsPerBucketRemaining.adjustOrPutValue(i+j, -sz, specsSet.limitByBucket-sz);
-
-                    if (sz > 0) {
-                        results.computeIfAbsent(sq.block, s -> new ArrayList<>()).add(result);
-                    }
-                }
-            }
-        }
-
-        return results;
-    }
-
     @NotNull
-    private Map<IndexBlock, List<EdgeSearchResults>> searchStraight(EdgeSearchSpecification specsSet) {
-        Map<IndexBlock, List<EdgeSearchResults>> results = new HashMap<>();
+    private Map<IndexBlock, List<EdgeSearchResultItem>> searchStraight(EdgeSearchSpecification specsSet) {
+        Map<IndexBlock, List<EdgeSearchResultItem>> results = new HashMap<>();
         int count = 0;
         TIntHashSet seenResults = new TIntHashSet();

@@ -314,25 +258,38 @@ public class EdgeIndexService extends Service {
             if (searchTerms.isEmpty())
                 continue;

-            var result = performSearch(searchTerms.get(),
+            var resultForSq = performSearch(searchTerms.get(),
                     budget, seenResults, domainCountFilter,
                     sq, specsSet.buckets, specsSet,
                     specsSet.limitTotal - count);

             if (logger.isDebugEnabled()) {
-                logger.debug("{} -> {} {}", sq.block, sq.searchTermsInclude, result.size());
+                logger.debug("{} -> {} {}", sq.block, sq.searchTermsInclude, resultForSq.size());
             }

-            count += result.size();
-            if (result.size() > 0) {
-                results.computeIfAbsent(sq.block, s -> new ArrayList<>()).add(result);
+            count += resultForSq.size();
+            if (resultForSq.size() > 0) {
+                results.computeIfAbsent(sq.block, s -> new ArrayList<>()).addAll(resultForSq);
             }
         }

+
+        List<List<String>> distinctSearchTerms = specsSet.subqueries.stream().map(sq -> sq.searchTermsInclude).distinct().toList();
+
+        results.forEach((index, blockResults) -> {
+            for (var result : blockResults) {
+                for (int i = 0; i < distinctSearchTerms.size(); i++) {
+                    for (var term : distinctSearchTerms.get(i)) {
+                        result.scores.add(getSearchTermScore(i, result.bucketId, term, result.getCombinedId()));
+                    }
+                }
+            }
+        });
+
         return results;
     }

-    private EdgeSearchResults performSearch(EdgeIndexSearchTerms searchTerms,
+    private List<EdgeSearchResultItem> performSearch(EdgeIndexSearchTerms searchTerms,
                                             IndexSearchBudget budget,
                                             TIntHashSet seenResults,
                                             DomainResultCountFilter domainCountFilter,
@@ -342,14 +299,14 @@ public class EdgeIndexService extends Service {
                                             int limit)
     {
         if (limit <= 0) {
-            return new EdgeSearchResults();
+            return new ArrayList<>();
         }

-        final Map<Integer, List<EdgeSearchResultItem>> results = new HashMap<>();
+        final List<EdgeSearchResultItem> results = new ArrayList<>();
         final DomainResultCountFilter localFilter = new DomainResultCountFilter(specs.limitByDomain);

         for (int i : specBuckets) {
-            int foundResultsCount = results.values().stream().mapToInt(List::size).sum();
+            int foundResultsCount = results.size();

             if (foundResultsCount >= specs.limitTotal || foundResultsCount >= limit)
                 break;
@@ -362,38 +319,33 @@ public class EdgeIndexService extends Service {
                     .limit(specs.limitTotal * 3L)
                     .distinct()
                     .limit(Math.min(specs.limitByBucket
-                            - results.values().stream().mapToInt(Collection::size).sum(), limit - foundResultsCount))
+                            - results.size(), limit - foundResultsCount))
                     .forEach(resultsForBucket::add);


             for (var result : resultsForBucket) {
                 seenResults.add(result.url.id());
             }
-            for (var result : resultsForBucket) {
-                for (var searchTerm : sq.searchTermsInclude) {
-                    result.scores.add(getSearchTermScore(i, searchTerm, result.getCombinedId()));
-                }
-            }
-
             domainCountFilter.addAll(i, resultsForBucket);

-            if (!resultsForBucket.isEmpty()) {
-                results.put(i, resultsForBucket);
-            }
+            results.addAll(resultsForBucket);
         }

-        return new EdgeSearchResults(results);
+        return results;
     }

-    private EdgeSearchResultKeywordScore getSearchTermScore(int bucketId, String term, long urlId) {
+    private EdgeSearchResultKeywordScore getSearchTermScore(int set, int bucketId, String term, long urlId) {
        final int termId = indexes.getDictionaryReader().get(term);

        var bucket = indexes.getBucket(bucketId);

-        return new EdgeSearchResultKeywordScore(term,
+        return new EdgeSearchResultKeywordScore(set, term,
                 bucket.getTermScore(termId, urlId),
                 bucket.isTermInBucket(IndexBlock.Title, termId, urlId),
-                bucket.isTermInBucket(IndexBlock.Link, termId, urlId)
+                bucket.isTermInBucket(IndexBlock.Link, termId, urlId),
+                bucket.isTermInBucket(IndexBlock.Site, termId, urlId),
+                bucket.isTermInBucket(IndexBlock.Subjects, termId, urlId)
         );

     }

@@ -2,20 +2,20 @@ package nu.marginalia.wmsa.edge.index.model;

 public enum IndexBlock {
     TitleKeywords(0, 0),
-    Title(1, 1),
+    Title(1, 0),

     Link(2, 1.15),

-    Subjects(3, 3.0),
+    Subjects(3, 1.0),
     NamesWords(4, 3.0),
     Artifacts(5, 10),
     Meta(6, 7),

-    Tfidf_Top(7, 0.5),
-    Tfidf_Middle(8, 1.25),
-    Tfidf_Lower(9, 1.5),
+    Tfidf_Top(7, 1.5),
+    Tfidf_Middle(8, 2),
+    Tfidf_Lower(9, 3.5),

-    Words_1(10, 3.0),
+    Words_1(10, 2.0),
     Words_2(11, 3.5),
     Words_4(12, 4.0),
     Words_8(13, 4.5),

@@ -47,7 +47,7 @@ public class SearchIndexReader implements AutoCloseable {
         var linkIndex = indices.get(IndexBlock.Link);
         var titleIndex = indices.get(IndexBlock.Title);
         var namesIndex = indices.get(IndexBlock.NamesWords);
-        var titleKeywordsIndex = indices.get(IndexBlock.TitleKeywords);
+        var siteIndex = indices.get(IndexBlock.Site);
         var metaIndex = indices.get(IndexBlock.Meta);
         var topicIndex = indices.get(IndexBlock.Subjects);

@@ -61,14 +61,17 @@ public class SearchIndexReader implements AutoCloseable {
         queryBuilders = new EnumMap<>(IndexBlock.class);
         underspecifiedQueryBuilders = new EnumMap<>(IndexBlock.class);

+        queryBuilders.put(IndexBlock.Title, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleIndex, linkIndex), words1));
         queryBuilders.put(IndexBlock.Words_1, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleIndex, topIndex, words1), words1));
         queryBuilders.put(IndexBlock.Words_2, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleIndex, topIndex, words2), words1));
         queryBuilders.put(IndexBlock.Words_4, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleIndex, topIndex, words4), words1));
         queryBuilders.put(IndexBlock.Words_8, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleIndex, topIndex, words8), words1));
         queryBuilders.put(IndexBlock.Words_16Plus, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleIndex, topIndex, words1, words2, words4, words8, words16, artifacts), words1));

-        underspecifiedQueryBuilders.put(IndexBlock.Title, new IndexQueryBuilder(listOfNonNulls(titleIndex, linkIndex, topIndex, namesIndex, topicIndex, metaIndex), words1));
-        underspecifiedQueryBuilders.put(IndexBlock.Tfidf_Top, new IndexQueryBuilder(listOfNonNulls(linkIndex, namesIndex, topIndex, midIndex, lowIndex, topicIndex, metaIndex), words1));
+        underspecifiedQueryBuilders.put(IndexBlock.Title, new IndexQueryBuilder(listOfNonNulls(titleIndex, linkIndex, topIndex, siteIndex, namesIndex, topicIndex, metaIndex), words1));
+        underspecifiedQueryBuilders.put(IndexBlock.Tfidf_Top, new IndexQueryBuilder(listOfNonNulls(topIndex, linkIndex, namesIndex, siteIndex, midIndex, topicIndex, metaIndex), words1));
+        underspecifiedQueryBuilders.put(IndexBlock.Tfidf_Middle, new IndexQueryBuilder(listOfNonNulls(midIndex, linkIndex, namesIndex, topIndex, siteIndex, midIndex, lowIndex, topicIndex, metaIndex), words1));
+        underspecifiedQueryBuilders.put(IndexBlock.Tfidf_Lower, new IndexQueryBuilder(listOfNonNulls(midIndex, linkIndex, namesIndex, topIndex, siteIndex, midIndex, lowIndex, topicIndex, metaIndex, artifacts), words1));
     }

     @SafeVarargs

@@ -46,7 +46,7 @@ public class IndexQueryBuilder {
             return new QueryForIndices(budget, LongStream::empty);
         }
         else if (relevantIndices.length == 1 || relevantIndices[0] != 0) {
-            return build(budget, filter, wordId);
+            return new QueryForIndices(budget, LongStream::empty);
         }

         var fstRange = requiredIndices.get(relevantIndices[0]).rangeForWord(wordId);

@ -16,6 +16,7 @@ import java.io.IOException;
 import java.nio.file.Path;
 import java.util.Iterator;
 import java.util.function.Consumer;
+import java.util.function.Predicate;
 import java.util.stream.Stream;

 @AllArgsConstructor @NoArgsConstructor @ToString
@ -86,7 +87,21 @@ public class EdgeCrawlPlan {
             throw new RuntimeException(ex);
         }
     }

+    public void forEachCrawledDomain(Predicate<String> idReadPredicate, Consumer<CrawledDomain> consumer) {
+        final CrawledDomainReader reader = new CrawledDomainReader();
+
+        try (Stream<CrawlLogEntry> entryStream = WorkLog.streamLog(crawl.getLogFile())) {
+            entryStream
+                    .filter(entry -> idReadPredicate.test(entry.id()))
+                    .map(CrawlLogEntry::path)
+                    .map(this::getCrawledFilePath)
+                    .map(reader::readRuntimeExcept)
+                    .forEach(consumer);
+        }
+        catch (IOException ex) {
+            throw new RuntimeException(ex);
+        }
+    }
     @MustBeClosed
     public DomainsIterable domainsIterable() throws IOException {
         return new DomainsIterable();
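Note: the new forEachCrawledDomain overload filters the crawl work log by id before the corresponding domain data is read and handed to the consumer. A rough, self-contained sketch of the same filter-then-load pipeline using only JDK types follows; the LogEntry record and the reader stand-in are illustrative, not the project's classes.

import java.util.List;
import java.util.function.Consumer;
import java.util.function.Predicate;

public class FilteredLogSketch {
    record LogEntry(String id, String path) {}

    public static void main(String[] args) {
        List<LogEntry> workLog = List.of(
                new LogEntry("id-0001", "crawl/0001.zstd"),
                new LogEntry("id-0002", "crawl/0002.zstd"));

        // e.g. skip entries that an earlier run already processed
        Predicate<String> idReadPredicate = id -> !id.equals("id-0001");
        Consumer<String> consumer = body -> System.out.println("processing " + body);

        workLog.stream()
                .filter(entry -> idReadPredicate.test(entry.id()))
                .map(LogEntry::path)                        // resolve the file to read
                .map(path -> "<contents of " + path + ">")  // stand-in for the domain reader
                .forEach(consumer);
    }
}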
@ -13,18 +13,18 @@ import java.util.List;

 @AllArgsConstructor @ToString @Getter @EqualsAndHashCode
 public class EdgeSearchResultItem {
-    public final int blockId;
+    public final int bucketId;
     public final int queryLength;
     public final EdgeId<EdgeDomain> domain; // this isn't the external domain ID, but a ranking
     public final EdgeId<EdgeUrl> url;
     public final List<EdgeSearchResultKeywordScore> scores;

-    public EdgeSearchResultItem(int blockId, int queryLength, long val) {
+    public EdgeSearchResultItem(int bucketId, int queryLength, long val) {
         int urlId = (int) (val & 0xFFFF_FFFFL);
         int domainId = (int) (val >>> 32);

         this.queryLength = queryLength;
-        this.blockId = blockId;
+        this.bucketId = bucketId;

         url = new EdgeId<>(urlId);
         domain = new EdgeId<>(domainId);
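Note: the constructor takes a single long that encodes both ids, with the domain id in the high 32 bits and the url id in the low 32 bits, which is what the mask and shift above unpack. A minimal sketch of that packing scheme; the combine helper is hypothetical and not part of the codebase.

public class PackedIdExample {
    // Hypothetical helper: pack a domain id and a url id into one long,
    // mirroring how EdgeSearchResultItem decodes its 'val' argument.
    static long combine(int domainId, int urlId) {
        return ((long) domainId << 32) | (urlId & 0xFFFF_FFFFL);
    }

    public static void main(String[] args) {
        long val = combine(7, 1234);
        int urlId = (int) (val & 0xFFFF_FFFFL); // low 32 bits -> 1234
        int domainId = (int) (val >>> 32);      // high 32 bits -> 7
        System.out.println(domainId + " / " + urlId);
    }
}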
@ -1,14 +1,6 @@
 package nu.marginalia.wmsa.edge.model.search;

-import lombok.AllArgsConstructor;
-import lombok.EqualsAndHashCode;
-import lombok.ToString;
 import nu.marginalia.wmsa.edge.index.model.IndexBlock;

-@AllArgsConstructor @ToString @EqualsAndHashCode
-public class EdgeSearchResultKeywordScore {
-    public final String keyword;
-    public final IndexBlock index;
-    public boolean title;
-    public boolean link;
+public record EdgeSearchResultKeywordScore(int set, String keyword, IndexBlock index, boolean title, boolean link, boolean site, boolean subject) {
 }
@ -10,7 +10,7 @@ import java.util.Map;

 @AllArgsConstructor @Getter @ToString
 public class EdgeSearchResultSet {
-    public Map<IndexBlock, List<EdgeSearchResults>> resultsList;
+    public Map<IndexBlock, List<EdgeSearchResultItem>> resultsList;

     public int size() {
         return resultsList.values().stream().mapToInt(List::size).sum();
@ -4,29 +4,23 @@ import lombok.AllArgsConstructor;
 import lombok.Getter;
 import lombok.ToString;

-import java.util.HashMap;
+import java.util.ArrayList;
 import java.util.List;
-import java.util.Map;
-import java.util.stream.Collectors;
 import java.util.stream.Stream;

 @AllArgsConstructor @Getter @ToString
 public class EdgeSearchResults {
-    public final Map<Integer, List<EdgeSearchResultItem>> results;
+    public final List<EdgeSearchResultItem> results;

     public EdgeSearchResults() {
-        results = new HashMap<>();
+        results = new ArrayList<>();
     }

     public int size() {
-        return results.values().stream().mapToInt(List::size).sum();
+        return results.size();
     }

     public Stream<EdgeSearchResultItem> stream() {
-        return results.values().stream().flatMap(List::stream);
+        return results.stream();
-    }
-
-    public List<EdgeSearchResultItem> getAllItems() {
-        return stream().collect(Collectors.toList());
     }
 }
@ -94,7 +94,7 @@ public class EdgeUrlDetails {
     }

     public double getRanking() {
-        double lengthAdjustment = Math.max(1, words / (words + 1000.));
+        double lengthAdjustment = Math.max(1, words / (words + 10000.));
         return getFeatureScore()*Math.sqrt(1+rankingId)/Math.max(1E-10, lengthAdjustment *(0.7+0.3*Math.exp(urlQualityAdjustment.getScore())));
     }

@ -132,6 +132,7 @@ public class EdgeUrlDetails {
     public boolean isCookies() {
         return HtmlFeature.hasFeature(features, HtmlFeature.COOKIES);
     }
+    public boolean isUnknown() { return HtmlFeature.hasFeature(features, HtmlFeature.UNKNOWN); }
     public boolean isAds() { return HtmlFeature.hasFeature(features, HtmlFeature.ADVERTISEMENT); }

     public boolean isSpecialDomain() {
@ -39,6 +39,7 @@ import javax.annotation.Nullable;
 import java.util.*;
 import java.util.concurrent.Future;
 import java.util.concurrent.TimeUnit;
+import java.util.regex.Pattern;
 import java.util.stream.Collectors;

 @Singleton
@ -236,6 +237,8 @@ public class EdgeSearchOperator {
     }

+    private final Pattern titleSplitPattern = Pattern.compile("[:!|./]|(\\s-|-\\s)|\\s{2,}");
+
     private EdgePageScoreAdjustment adjustScoreBasedOnQuery(EdgeUrlDetails p, EdgeSearchSpecification specs) {
         String titleLC = p.title == null ? "" : p.title.toLowerCase();
         String descLC = p.description == null ? "" : p.description.toLowerCase();
@ -248,11 +251,16 @@ public class EdgeSearchOperator {
                 .toArray(String[]::new);
         int termCount = searchTermsLC.length;

-        String[] titleParts = titleLC.split("[:!|./]|(\\s-|-\\s)|\\s{2,}");
         double titleHitsAdj = 0.;
+        final String[] titleParts = titleSplitPattern.split(titleLC);
         for (String titlePart : titleParts) {
-            titleHitsAdj += Arrays.stream(searchTermsLC).filter(titlePart::contains).mapToInt(String::length).sum()
-                    / (double) Math.max(1, titlePart.trim().length());
+            double hits = 0;
+            for (String term : searchTermsLC) {
+                if (titlePart.contains(term)) {
+                    hits += term.length();
+                }
+            }
+            titleHitsAdj += hits / Math.max(1, titlePart.length());
         }

         double titleFullHit = 0.;
@ -299,10 +307,8 @@ public class EdgeSearchOperator {
         logger.debug("{}", resultSet);

         for (IndexBlock block : indexBlockSearchOrder) {
-            for (var results : resultSet.resultsList.getOrDefault(block, Collections.emptyList())) {
-                var items = results.getAllItems();
-                queryResults.append(100, resultDecorator.decorateSearchResults(items, block, deduplicator));
-            }
+            queryResults.append(100, resultDecorator.decorateSearchResults(resultSet.resultsList.getOrDefault(block, Collections.emptyList()),
+                    block, deduplicator));
         }
     }

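Note: the rewritten title loop credits each title segment with the total length of the query terms it contains, normalized by the segment's length. A small standalone sketch of that calculation with an assumed title and query (values for illustration only):

import java.util.regex.Pattern;

public class TitleHitsSketch {
    public static void main(String[] args) {
        Pattern titleSplitPattern = Pattern.compile("[:!|./]|(\\s-|-\\s)|\\s{2,}");
        String titleLC = "practical java tips | marginalia"; // assumed example title
        String[] searchTermsLC = { "java", "tips" };          // assumed example query

        double titleHitsAdj = 0.;
        for (String titlePart : titleSplitPattern.split(titleLC)) {
            double hits = 0;
            for (String term : searchTermsLC) {
                if (titlePart.contains(term)) {
                    hits += term.length(); // matched characters in this segment
                }
            }
            titleHitsAdj += hits / Math.max(1, titlePart.length());
        }
        System.out.println(titleHitsAdj); // 0.4: 8 matched chars over a 20-char first segment
    }
}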
@ -10,31 +10,31 @@ import java.util.stream.Collectors;

 public enum EdgeSearchProfile {
     DEFAULT("default",
-            List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Tfidf_Top, IndexBlock.Link,
+            List.of(IndexBlock.Title, IndexBlock.Tfidf_Top, IndexBlock.Tfidf_Middle, IndexBlock.Link,
                     IndexBlock.Words_1, IndexBlock.Words_2, IndexBlock.Words_4, IndexBlock.Words_8, IndexBlock.Words_16Plus
             ),
             0, 1),
     MODERN("modern",
-            List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Tfidf_Top, IndexBlock.Tfidf_Middle, IndexBlock.Link, IndexBlock.NamesWords,
+            List.of(IndexBlock.Title, IndexBlock.Tfidf_Top, IndexBlock.Tfidf_Middle, IndexBlock.Link, IndexBlock.NamesWords,
                     IndexBlock.Words_1, IndexBlock.Words_2, IndexBlock.Words_4, IndexBlock.Words_8, IndexBlock.Words_16Plus
             ),
             2),
     CORPO("corpo",
-            List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Tfidf_Top, IndexBlock.Tfidf_Lower, IndexBlock.Link, IndexBlock.NamesWords,
+            List.of(IndexBlock.Title, IndexBlock.Tfidf_Top, IndexBlock.Tfidf_Middle, IndexBlock.Tfidf_Lower, IndexBlock.Link, IndexBlock.NamesWords,
                     IndexBlock.Words_1, IndexBlock.Words_2, IndexBlock.Words_4, IndexBlock.Words_8, IndexBlock.Words_16Plus),
             4, 5, 7),
     YOLO("yolo",
-            List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Tfidf_Top, IndexBlock.Tfidf_Lower, IndexBlock.Link, IndexBlock.NamesWords,
+            List.of(IndexBlock.Title, IndexBlock.Tfidf_Top, IndexBlock.Tfidf_Lower, IndexBlock.Link, IndexBlock.NamesWords,
                     IndexBlock.Words_1, IndexBlock.Words_2, IndexBlock.Words_4, IndexBlock.Words_8, IndexBlock.Words_16Plus),
             0, 2, 1, 3, 4, 6),
     CORPO_CLEAN("corpo-clean",
-            List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Tfidf_Top, IndexBlock.Tfidf_Middle, IndexBlock.Tfidf_Lower, IndexBlock.Link, IndexBlock.NamesWords),
+            List.of(IndexBlock.Title, IndexBlock.Tfidf_Top, IndexBlock.Tfidf_Middle, IndexBlock.Tfidf_Lower, IndexBlock.Link, IndexBlock.NamesWords),
             4, 5),
     ACADEMIA("academia",
-            List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Tfidf_Top, IndexBlock.Tfidf_Middle, IndexBlock.Tfidf_Lower, IndexBlock.Link, IndexBlock.NamesWords),
+            List.of( IndexBlock.Title, IndexBlock.Tfidf_Top, IndexBlock.Tfidf_Middle, IndexBlock.Tfidf_Lower, IndexBlock.Link, IndexBlock.NamesWords),
             3),
     FOOD("food",
-            List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Tfidf_Top, IndexBlock.Tfidf_Middle, IndexBlock.Tfidf_Lower, IndexBlock.Link, IndexBlock.Words_1, IndexBlock.NamesWords),
+            List.of(IndexBlock.Title, IndexBlock.Tfidf_Top, IndexBlock.Tfidf_Middle, IndexBlock.Tfidf_Lower, IndexBlock.Link, IndexBlock.Words_1, IndexBlock.NamesWords),
             2, 0),
     ;

@ -1,7 +1,7 @@
 package nu.marginalia.wmsa.edge.search.query;

 import com.google.inject.Inject;
-import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict;
+import nu.marginalia.wmsa.edge.assistant.dict.NGramBloomFilter;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

@ -13,12 +13,12 @@ import java.util.stream.Collectors;

 public class EnglishDictionary {
     private final Set<String> englishWords = new HashSet<>();
-    private final TermFrequencyDict dict;
+    private final NGramBloomFilter bloomFilter;
     private final Logger logger = LoggerFactory.getLogger(getClass());

     @Inject
-    public EnglishDictionary(TermFrequencyDict dict) {
-        this.dict = dict;
+    public EnglishDictionary(NGramBloomFilter bloomFilter) {
+        this.bloomFilter = bloomFilter;
         try (var resource = Objects.requireNonNull(ClassLoader.getSystemResourceAsStream("dictionary/en-words"),
                 "Could not load word frequency table");
              var br = new BufferedReader(new InputStreamReader(resource))
@ -44,10 +44,9 @@ public class EnglishDictionary {

     public Collection<String> getWordVariants(String s) {
         var variants = findWordVariants(s);
-        long freqBaseline = dict.getTermFreq(s);

         var ret = variants.stream()
-                .filter(var -> freqBaseline*10 > dict.getTermFreq(var) && freqBaseline/10 < dict.getTermFreq(var)
+                .filter(bloomFilter::isKnownNGram
                 ).collect(Collectors.toList());

         if (s.equals("recipe") || s.equals("recipes")) {
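Note: this change replaces the frequency-band test (a variant had to fall within a factor of ten of the base word's term frequency) with a simple membership check against an n-gram bloom filter. A rough sketch of the same filtering step, using a plain Predicate<String> as a stand-in for NGramBloomFilter::isKnownNGram and a made-up word set:

import java.util.List;
import java.util.Set;
import java.util.function.Predicate;
import java.util.stream.Collectors;

public class VariantFilterSketch {
    public static void main(String[] args) {
        // Stand-in for the bloom filter: any known n-gram passes.
        Set<String> knownNGrams = Set.of("recipe", "recipes", "reciped");
        Predicate<String> isKnownNGram = knownNGrams::contains;

        List<String> variants = List.of("recipes", "reciping", "reciped");
        List<String> kept = variants.stream()
                .filter(isKnownNGram)
                .collect(Collectors.toList());

        System.out.println(kept); // [recipes, reciped]
    }
}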
@ -130,7 +130,7 @@ public class QueryFactory {
         }
     }

-    EdgeSearchSubquery subquery = new EdgeSearchSubquery(searchTermsInclude, searchTermsExclude, IndexBlock.TitleKeywords);
+    EdgeSearchSubquery subquery = new EdgeSearchSubquery(searchTermsInclude, searchTermsExclude, IndexBlock.Title);

     params.profile().addTacitTerms(subquery);
     params.jsSetting().addTacitTerms(subquery);
@ -101,13 +101,27 @@ public class SearchResultDecorator {
         if (!missedIds.isEmpty()) {
             logger.warn("Could not look up documents: {}", missedIds.toArray());
         }
-        retList.sort(Comparator.comparing(EdgeUrlDetails::getTermScore));
+        retList.sort(Comparator.comparing(EdgeUrlDetails::getTermScore)
+                .thenComparing(url -> url.url.path.length()));
         return retList;
     }

     private double calculateTermScore(IndexBlock block, EdgeSearchResultItem resultItem, EdgeUrlDetails details) {
-        return valuator.evaluateTerms(resultItem.scores, block, details.words) / Math.sqrt(1 + resultItem.queryLength)
+        int titleLength = details.title.length();
+
+        double value = valuator.evaluateTerms(resultItem.scores, block, details.words,titleLength) / Math.sqrt(1 + resultItem.queryLength)
                 + ((details.domainState == EdgeDomainIndexingState.SPECIAL) ? 1.25 : 0);
+
+        System.out.println("---");
+        System.out.println(details.getUrl());
+        System.out.println(details.getTitle());
+        System.out.println(details.words);
+        for (var score : resultItem.scores) {
+            System.out.println(block + ":" + score);
+        }
+        System.out.println(value);
+
+        return value;
     }

 }
@ -16,8 +16,8 @@ public class SearchResultValuator {

     private static final Pattern separator = Pattern.compile("_");

-    private static final int MIN_LENGTH = 500;
-    private static final int AVG_LENGTH = 1400;
+    private static final int MIN_LENGTH = 2000;
+    private static final int AVG_LENGTH = 5000;

     @Inject
     public SearchResultValuator(TermFrequencyDict dict) {
@ -26,11 +26,18 @@ public class SearchResultValuator {


     // This is basically a bargain bin BM25
-    public double evaluateTerms(List<EdgeSearchResultKeywordScore> rawScores, IndexBlock block, int length) {
-        EdgeSearchResultKeywordScore[] scores = rawScores.stream().filter(w -> !w.keyword.contains(":")).toArray(EdgeSearchResultKeywordScore[]::new);
+    public double evaluateTerms(List<EdgeSearchResultKeywordScore> rawScores, IndexBlock block, int length, int titleLength) {
+        int sets = 1 + rawScores.stream().mapToInt(EdgeSearchResultKeywordScore::set).max().orElse(0);
+
+        double bestScore = 1000;
+        double bestLtsFactor = 1.;
+
+        for (int set = 0; set <= sets; set++) {
+            int thisSet = set;
+            EdgeSearchResultKeywordScore[] scores = rawScores.stream().filter(w -> w.set() == thisSet && !w.keyword().contains(":")).toArray(EdgeSearchResultKeywordScore[]::new);

             if (scores.length == 0) {
-                return IndexBlock.Words_1.sortOrder;
+                continue;
             }

             final double[] weights = getTermWeights(scores);
@ -39,45 +46,65 @@ public class SearchResultValuator {
             double termSum = 0.;
             double factorSum = 0.;

+            double ltsFactor = 1.0;

             for (int i = 0; i < scores.length; i++) {

                 final double factor = 1. / (1.0 + weights[i]);

                 factorSum += factor;

-                double termValue = (scores[i].index.sortOrder + 0.5) * factor;
+                double termValue = (scores[i].index().sortOrder + 0.5) * factor;

-                if (!scores[i].link && !scores[i].title) {
-                    termValue *= lengthPenalty;
+                termValue /= lengthPenalty;
+
+                if (scores[i].link()) {
+                    ltsFactor *= Math.pow(0.5, 1. / scores.length);
                 }
-                else if (scores[i].link) {
-                    termValue /= 4.75;
+                if (scores[i].title()) {
+                    if (titleLength <= 64) {
+                        ltsFactor *= Math.pow(0.5, 1. / scores.length);
+                    }
+                    else if (titleLength < 96) {
+                        ltsFactor *= Math.pow(0.75, 1. / scores.length);
+                    }
+                    else {
+                        ltsFactor *= Math.pow(0.9, 1. / scores.length);
+                    }
+                }
+                if (scores[i].subject()) {
+                    ltsFactor *= Math.pow(0.8, 1. / scores.length);
                 }

                 termSum += termValue;
             }

-            assert factorSum != 0 ;
+            assert factorSum != 0;

-            if (block == IndexBlock.Title || block == IndexBlock.TitleKeywords) {
-                return block.sortOrder + (termSum / factorSum) / 5;
+            double value = termSum / factorSum;
+
+            bestLtsFactor = Math.min(bestLtsFactor, ltsFactor);
+            bestScore = Math.min(bestScore, value);
         }

-        return termSum / factorSum;
+        return (0.7+0.3*block.sortOrder)*bestScore * bestLtsFactor;
     }

     private double getLengthPenalty(int length) {
         if (length < MIN_LENGTH) {
             length = MIN_LENGTH;
         }
-        return (0.7 + 0.3 * length / AVG_LENGTH);
+        if (length > AVG_LENGTH) {
+            length = AVG_LENGTH;
+        }
+        return (0.5 + 0.5 * length / AVG_LENGTH);
     }

     private double[] getTermWeights(EdgeSearchResultKeywordScore[] scores) {
         double[] weights = new double[scores.length];

         for (int i = 0; i < scores.length; i++) {
-            String[] parts = separator.split(scores[i].keyword);
+            String[] parts = separator.split(scores[i].keyword());
             double sumScore = 0.;

             int count = 0;
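Note: each matched flag above multiplies ltsFactor by Math.pow(base, 1.0 / scores.length), so when every term in a set carries the flag the accumulated factor works out to the base itself (0.5 for link hits; 0.5, 0.75 or 0.9 for title hits depending on title length; 0.8 for subject hits). With the new constants, getLengthPenalty clamps the document length to the 2000–5000 range and maps it onto roughly 0.7–1.0. A minimal sketch of the factor accumulation, with flag values assumed for illustration:

public class LtsFactorSketch {
    public static void main(String[] args) {
        // Assume three terms in the set, all of which hit a short (<= 64 char) title.
        boolean[] titleHits = { true, true, true };
        int n = titleHits.length;

        double ltsFactor = 1.0;
        for (boolean hit : titleHits) {
            if (hit) {
                ltsFactor *= Math.pow(0.5, 1. / n); // same per-term multiplier as evaluateTerms
            }
        }
        System.out.println(ltsFactor); // ~0.5: three partial multipliers combine to the full base
    }
}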
@ -3,21 +3,35 @@ package nu.marginalia.wmsa.edge.tools;
 import com.google.inject.Guice;
 import com.google.inject.Inject;
 import com.google.inject.Injector;
+import nu.marginalia.util.language.processing.SentenceExtractor;
+import nu.marginalia.wmsa.configuration.WmsaHome;
 import nu.marginalia.wmsa.edge.converting.ConverterModule;
 import nu.marginalia.wmsa.edge.converting.processor.DomainProcessor;
+import nu.marginalia.wmsa.edge.converting.processor.logic.DomPruner;
+import nu.marginalia.wmsa.edge.converting.processor.logic.topic.RecipeDetector;
+import nu.marginalia.wmsa.edge.converting.processor.logic.topic.TextileCraftDetector;
+import nu.marginalia.wmsa.edge.converting.processor.logic.topic.WoodworkingDetector;
 import nu.marginalia.wmsa.edge.crawling.CrawlPlanLoader;
-import nu.marginalia.wmsa.edge.index.model.IndexBlock;
 import nu.marginalia.wmsa.edge.model.EdgeCrawlPlan;
+import org.jsoup.Jsoup;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

 import java.io.IOException;
 import java.nio.file.Path;
+import java.util.concurrent.ForkJoinPool;

 public class ConverterLogicTestTool {

     private final Logger logger = LoggerFactory.getLogger(getClass());

+    DomPruner domPruner = new DomPruner();
+    RecipeDetector recipeDetector = new RecipeDetector();
+    WoodworkingDetector woodworkingDetector = new WoodworkingDetector();
+    TextileCraftDetector textileCraftDetector = new TextileCraftDetector();
+
+    SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels());
+
     public static void main(String... args) throws IOException {

         if (args.length != 1) {
@ -38,19 +52,42 @@ public class ConverterLogicTestTool {
             EdgeCrawlPlan plan,
             DomainProcessor processor
             ) throws Exception {
+        var cp = new ForkJoinPool(16);

         plan.forEachCrawledDomain(domain -> {
-            var ret = processor.process(domain);
-            ret.documents.forEach(doc -> {
-                if (doc.words == null)
+            if (domain.doc == null) return;
+
+            for (var doc : domain.doc) {
+                if (doc.documentBody == null) continue;
+
+                Runnable task = () -> {
+                    var parsed = Jsoup.parse(doc.documentBody);
+
+                    domPruner.prune(parsed, 0.5);
+                    var dld = se.extractSentences(parsed);
+
+                    if (dld.totalNumWords() < 250)
                         return;
-                var artifacts = doc.words.get(IndexBlock.Artifacts);
-                if (artifacts.size() > 0) {
-                    System.out.println(doc.url + ": " + artifacts);
+                    if (textileCraftDetector.testP(dld) > 0.3) {
+                        System.out.println("textilecraft\t" + doc.url);
+                    }
+                    if (woodworkingDetector.testP(dld) > 0.2) {
+                        System.out.println("woodworking\t" + doc.url);
+                    }
+                    if (recipeDetector.testP(dld) > 0.5) {
+                        System.out.println("recipe\t" + doc.url);
+                    }
+                };
+
+                if (cp.getQueuedSubmissionCount() > 32) {
+                    task.run();
+                } else {
+                    cp.execute(task);
                 }
             }
         });
-            });

     }

 }
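Note: the reworked test tool hands each document-classification task to a ForkJoinPool, but runs the task inline once more than 32 submissions are queued, which keeps the queue bounded without blocking. A small standalone sketch of that throttling pattern; the task body here is a stand-in for the real per-document work:

import java.util.concurrent.ForkJoinPool;
import java.util.concurrent.TimeUnit;

public class ThrottledSubmitSketch {
    public static void main(String[] args) {
        ForkJoinPool cp = new ForkJoinPool(16);

        for (int i = 0; i < 1000; i++) {
            final int n = i;
            Runnable task = () -> Math.sqrt(n); // stand-in for the per-document analysis

            if (cp.getQueuedSubmissionCount() > 32) {
                task.run();        // backpressure: do the work on the submitting thread
            } else {
                cp.execute(task);  // otherwise let the pool run it concurrently
            }
        }

        cp.shutdown();
        cp.awaitQuiescence(10, TimeUnit.SECONDS);
    }
}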
@ -151,15 +151,6 @@ i've
 it's
 it
 i'm
-1
-2
-3
-4
-5
-6
-7
-8
-9
 .
 ..
 ...
@ -42,7 +42,7 @@ class SqlLoadDomainLinksTest {
     @Test
     public void loadDomainLinks() {
         var loader = new SqlLoadDomainLinks(dataSource);
-        loader.load(new DomainLink[] { new DomainLink(new EdgeDomain("www.marginalia.nu"), new EdgeDomain("memex.marginalia.nu")) });
+        loader.load(loaderData, new DomainLink[] { new DomainLink(new EdgeDomain("www.marginalia.nu"), new EdgeDomain("memex.marginalia.nu")) });
     }

 }
@ -113,38 +113,13 @@ class SentenceExtractorTest {
         var dict = new TermFrequencyDict(lm);

         DocumentKeywordExtractor documentKeywordExtractor = new DocumentKeywordExtractor(dict);

-//        documentKeywordExtractorLegacy.setLegacy(true);
-
-//        for (;;) {
         long st = System.currentTimeMillis();
         for (var file : Objects.requireNonNull(data.toFile().listFiles())) {

             var newResult = newSe.extractSentences(Jsoup.parse(Files.readString(file.toPath())));

             var newRes = documentKeywordExtractor.extractKeywords(newResult);

-//            var legacyRes = documentKeywordExtractorLegacy.extractKeywords(newResult);
-//
-//            EdgePageWordSet difference = new EdgePageWordSet();
-//            for (IndexBlock block : IndexBlock.values()) {
-//                var newWords = new HashSet<>(newRes.get(block).words);
-//                var oldWords = new HashSet<>(legacyRes.get(block).words);
-//                newWords.removeAll(oldWords);
-//                if (!newWords.isEmpty()) {
-//                    difference.append(block, newWords);
-//                }
-//            }
-//            System.out.println(difference);
             System.out.println(newRes);
-//            System.out.println("---");
         }
         System.out.println(System.currentTimeMillis() - st);
-//        }

     }

@ -1,156 +0,0 @@
-package nu.marginalia.wmsa.edge.index.service;
-
-import com.zaxxer.hikari.HikariDataSource;
-import lombok.SneakyThrows;
-import nu.marginalia.util.TestUtil;
-import nu.marginalia.wmsa.configuration.server.Context;
-import nu.marginalia.wmsa.configuration.server.Initialization;
-import nu.marginalia.wmsa.edge.index.EdgeIndexService;
-import nu.marginalia.wmsa.edge.index.IndexServicesFactory;
-import nu.marginalia.wmsa.edge.index.client.EdgeIndexClient;
-import nu.marginalia.wmsa.edge.index.conversion.SearchIndexPartitioner;
-import nu.marginalia.wmsa.edge.index.model.IndexBlock;
-import nu.marginalia.wmsa.edge.index.reader.SearchIndexes;
-import nu.marginalia.wmsa.edge.model.EdgeId;
-import nu.marginalia.wmsa.edge.model.EdgeUrl;
-import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet;
-import nu.marginalia.wmsa.edge.model.crawl.EdgePageWords;
-import nu.marginalia.wmsa.edge.model.search.EdgeSearchSpecification;
-import org.junit.jupiter.api.AfterAll;
-import org.junit.jupiter.api.BeforeAll;
-import org.junit.jupiter.api.Tag;
-import org.junit.jupiter.api.Test;
-import org.junit.jupiter.api.parallel.Execution;
-import org.junit.jupiter.api.parallel.ExecutionMode;
-import org.junit.jupiter.api.parallel.ResourceAccessMode;
-import org.junit.jupiter.api.parallel.ResourceLock;
-import spark.Spark;
-
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.util.Arrays;
-import java.util.List;
-import java.util.stream.Collectors;
-
-import static nu.marginalia.util.TestUtil.getConnection;
-import static org.junit.jupiter.api.Assertions.assertEquals;
-import static org.junit.jupiter.api.Assertions.assertTrue;
-
-@ResourceLock(value = "mariadb", mode = ResourceAccessMode.READ_WRITE)
-@Execution(ExecutionMode.SAME_THREAD)
-@Tag("db")
-public class EdgeIndexClientTest {
-    private static HikariDataSource dataSource;
-    private static EdgeIndexService service;
-    private static EdgeIndexClient client;
-    private static Path tempDir;
-    private static SearchIndexes indexes;
-
-    @SneakyThrows
-    public static HikariDataSource provideConnection() {
-        return getConnection();
-    }
-
-    static final int testPort = TestUtil.getPort();
-
-    @SneakyThrows
-    @BeforeAll
-    public static void setUpClass() {
-        Spark.port(testPort);
-        System.setProperty("service-name", "edge-index");
-
-        dataSource = provideConnection();
-        dataSource.setKeepaliveTime(100);
-        dataSource.setIdleTimeout(100);
-        client = new EdgeIndexClient();
-        client.setServiceRoute("127.0.0.1", testPort);
-
-        tempDir = Files.createTempDirectory("EdgeIndexClientTest");
-
-        var servicesFactory = new IndexServicesFactory(tempDir,tempDir,tempDir,tempDir,
-                "writer-index",
-                "writer-dictionary",
-                "index-words-read",
-                "index-urls-read",
-                "index-words-write",
-                "index-urls-write",
-                1L<<24,
-                id->false,
-                new SearchIndexPartitioner(null)
-        );
-
-        var init = new Initialization();
-        indexes = new SearchIndexes(servicesFactory, new SearchIndexPartitioner(null));
-        service = new EdgeIndexService("127.0.0.1",
-                testPort,
-                init, null,
-                indexes,
-                servicesFactory);
-
-        Spark.awaitInitialization();
-        init.setReady();
-    }
-
-    @Test
-    public void testMultiBucketHit() {
-        putWords(1, 1, -2, "fancy", "anagram", "dilbert", "whoah", "engram");
-        putWords(2, 2, -5, "quibble", "angry", "whoah", "fancy");
-        putWords(3, 3, -0.01, "strong", "manly", "muscles");
-        indexes.repartition();
-        indexes.preconvert();
-        indexes.reindexAll();
-
-        var results = client.query(Context.internal(), EdgeSearchSpecification.justIncludes("fancy")).resultsList.get(IndexBlock.Title).get(0).results;
-        System.out.println(results);
-        List<EdgeId<EdgeUrl>> flatResults = results.values().stream().flatMap(List::stream).map(rs -> rs.url).collect(Collectors.toList());
-
-        assertEquals(2, flatResults.size());
-        assertTrue(flatResults.contains(new EdgeId<EdgeUrl>(1)));
-        assertTrue(flatResults.contains(new EdgeId<EdgeUrl>(2)));
-    }
-
-    @Test
-    public void testHighHit() {
-        putWords(2, 5, -100, "trapphus");
-        indexes.repartition();
-        indexes.preconvert();
-        indexes.reindexAll();
-        var rsp = client.query(Context.internal(), EdgeSearchSpecification.justIncludes("trapphus"));
-        System.out.println(rsp);
-        assertEquals(5, rsp.resultsList.get(IndexBlock.Title).get(0).results.get(0).get(0).url.id());
-    }
-
-
-    @Test
-    public void testSearchDomain() {
-        putWords(8, 1, -2, "domain");
-        putWords(8, 2, -5, "domain");
-        putWords(10, 3, -0.01, "domain");
-        putWords(11, 3, -0.01, "domain");
-        putWords(12, 3, -0.01, "domain");
-        indexes.repartition();
-        indexes.preconvert();
-        indexes.reindexAll();
-
-        var results = client.query(Context.internal(), EdgeSearchSpecification.justIncludes("fancy")).resultsList.get(IndexBlock.Title).get(0).results;
-        System.out.println(results);
-        List<EdgeId<EdgeUrl>> flatResults = results.values().stream().flatMap(List::stream).map(rs -> rs.url).collect(Collectors.toList());
-
-        assertEquals(2, flatResults.size());
-        assertTrue(flatResults.contains(new EdgeId<EdgeUrl>(1)));
-        assertTrue(flatResults.contains(new EdgeId<EdgeUrl>(2)));
-    }
-
-    void putWords(int didx, int idx, double quality, String... words) {
-        EdgePageWords epw = new EdgePageWords(IndexBlock.Title);
-        epw.addAll(Arrays.asList(words));
-        client.putWords(Context.internal(), new EdgeId<>(didx), new EdgeId<>(idx),
-                new EdgePageWordSet(epw), 0).blockingSubscribe();
-    }
-
-    @AfterAll
-    public static void tearDownClass() {
-        nu.marginalia.util.test.TestUtil.clearTempDir(tempDir);
-    }
-
-}
@ -24,7 +24,7 @@ class BodyQueryParserTest {
     public static void init() throws IOException {
         dict = new TermFrequencyDict(lm);
         nGramBloomFilter = new NGramBloomFilter(lm);
-        englishDictionary = new EnglishDictionary(dict);
+        englishDictionary = new EnglishDictionary(nGramBloomFilter);
     }

     @BeforeEach
@ -1,17 +0,0 @@
-package nu.marginalia.wmsa.edge.search.query;
-
-import nu.marginalia.util.TestLanguageModels;
-import nu.marginalia.util.language.conf.LanguageModels;
-import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict;
-import org.junit.jupiter.api.Test;
-
-class EnglishDictionaryTest {
-
-    @Test
-    void getWordVariants() {
-        LanguageModels lm = TestLanguageModels.getLanguageModels();
-
-        var dict = new TermFrequencyDict(lm);
-        new EnglishDictionary(dict).getWordVariants("dos").forEach(System.out::println);
-    }
-}
@ -21,7 +21,7 @@ class QueryParserTest {
     public void setUp() throws IOException {
        dict = new TermFrequencyDict(lm);
        nGramBloomFilter = new NGramBloomFilter(lm);
-       englishDictionary = new EnglishDictionary(dict);
+       englishDictionary = new EnglishDictionary(nGramBloomFilter);

        parser = new QueryParser(englishDictionary, new QueryVariants(lm, dict, nGramBloomFilter, englishDictionary));
     }
@ -23,8 +23,8 @@ class QueryVariantsTest {

         var dict = new TermFrequencyDict(lm);
         var ngrams = new NGramBloomFilter(lm);
-        variants = new QueryVariants(lm, dict, ngrams, new EnglishDictionary(dict));
-        parser = new QueryParser(new EnglishDictionary(dict), variants);
+        variants = new QueryVariants(lm, dict, ngrams, new EnglishDictionary(ngrams));
+        parser = new QueryParser(new EnglishDictionary(ngrams), variants);
     }

     @Test