Tweaks for search result relevance

commit 5f993c72dd (parent 813399401e)
@@ -63,7 +63,7 @@ public class WordPatterns {
         if (word.isBlank()) {
             return false;
         }
-        if (hasMoreThanTwo(word, '-', 2)) {
+        if (hasMoreThanTwo(word, '-', 4)) {
             return false;
         }
         if (hasMoreThanTwo(word, '+', 2)) {
@@ -80,7 +80,7 @@ public class WordPatterns {
             if (Character.isDigit(word.charAt(i))) {
                 numDigits++;
             }
-            if (numDigits > 6)
+            if (numDigits > 16)
                 return false;
         }

@@ -5,8 +5,8 @@ import java.util.regex.Pattern;

 public class AsciiFlattener {

-    private static final Pattern nonAscii = Pattern.compile("[^a-zA-Z0-9_.'+@#:]+");
-    private static final Pattern plainAsciiPattern = Pattern.compile("^[a-zA-Z0-9_.'+@#:]+$");
+    private static final Pattern nonAscii = Pattern.compile("[^a-zA-Z0-9_.'+@#:\\-]+");
+    private static final Pattern plainAsciiPattern = Pattern.compile("^[a-zA-Z0-9_.'+@#:\\-]+$");
     private static final Predicate<String> plainAscii = plainAsciiPattern.asMatchPredicate();

     public static String flattenUnicode(String s) {
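The change above adds `\-` to both character classes, so hyphenated words now count as plain ASCII and keep their hyphens instead of being flattened. A minimal sketch of the assumed effect, exercising only the patterns shown in the hunk (the sample strings are hypothetical):

    // plainAsciiPattern as defined after this change
    Pattern plainAsciiPattern = Pattern.compile("^[a-zA-Z0-9_.'+@#:\\-]+$");
    plainAsciiPattern.matcher("e-mail").matches();  // true: hyphen is now accepted as-is
    plainAsciiPattern.matcher("naïve").matches();   // false: still falls through to flattening
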
@@ -1,6 +1,5 @@
 package nu.marginalia.util.language.processing;

-import com.google.common.collect.Sets;
 import nu.marginalia.util.language.WordPatterns;
 import nu.marginalia.util.language.processing.model.DocumentLanguageData;
 import nu.marginalia.util.language.processing.model.WordRep;
@@ -12,7 +11,6 @@ import nu.marginalia.wmsa.edge.model.crawl.EdgePageWords;
 import javax.inject.Inject;
 import java.util.*;
 import java.util.stream.Collectors;
-import java.util.stream.Stream;

 public class DocumentKeywordExtractor {

@@ -42,7 +40,7 @@ public class DocumentKeywordExtractor {
         List<WordRep> titleWords = extractTitleWords(documentLanguageData);

         KeywordCounter.WordHistogram wordsTfIdf = tfIdfCounter.countHisto(documentLanguageData);
-        List<WordRep> wordsNamesAll = nameCounter.count(documentLanguageData, 1);
+        List<WordRep> wordsNamesAll = nameCounter.count(documentLanguageData, 2);
         List<WordRep> subjects = subjectCounter.count(documentLanguageData);

         List<WordRep> midKeywords = new ArrayList<>(wordsTfIdf.mid());
@@ -190,30 +188,7 @@ public class DocumentKeywordExtractor {
                 .collect(Collectors.toList());
     }

-    private Collection<WordRep> joinWordLists(List<WordRep>... words) {
-        int size = 0;
-        for (var lst : words) {
-            size += lst.size();
-        }
-        if (size == 0)
-            return Collections.emptyList();
-
-        final LinkedHashSet<WordRep> ret = new LinkedHashSet<>(size);
-        for (var lst : words) {
-            ret.addAll(lst);
-        }
-        return ret;
-    }
-
-
     public EdgePageWords createWords(IndexBlock block, Collection<WordRep> words) {
         return new EdgePageWords(block, words.stream().map(w -> w.word).map(AsciiFlattener::flattenUnicode).filter(WordPatterns.wordQualitiesPredicate).collect(Collectors.toSet()));
     }
-
-    private Set<WordRep> overlappingStems(Collection<WordRep> wordsA, Collection<WordRep> wordsB) {
-        Set<String> stemmedA = wordsA.stream().map(WordRep::getStemmed).collect(Collectors.toSet());
-        Set<String> stemmedB = wordsB.stream().map(WordRep::getStemmed).collect(Collectors.toSet());
-        Set<String> stemmedIntersect = Sets.intersection(stemmedA, stemmedB);
-        return Stream.concat(wordsA.stream(), wordsB.stream()).filter(w -> stemmedIntersect.contains(w.getStemmed())).collect(Collectors.toSet());
-    }
 }
@@ -1,5 +1,6 @@
 package nu.marginalia.util.language.processing;

+import nu.marginalia.util.language.WordPatterns;
 import nu.marginalia.util.language.processing.model.DocumentLanguageData;
 import nu.marginalia.util.language.processing.model.WordRep;
 import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict;
@@ -22,17 +23,20 @@ public class KeywordCounter {
     }

     public WordHistogram countHisto(DocumentLanguageData dld) {
-        HashMap<String, Double> counts = new HashMap<>(1000);
+        HashMap<String, Integer> counts = new HashMap<>(1000);
         HashMap<String, HashSet<WordRep>> instances = new HashMap<>(1000);


         for (var sent : dld.sentences) {
             var keywords = keywordExtractor.getKeywordsFromSentence(sent);
             for (var span : keywords) {
+                if (span.size() == 1 &&
+                    WordPatterns.isStopWord(sent.words[span.start]))
+                    continue;

                 String stemmed = sent.constructStemmedWordFromSpan(span);

-                counts.merge(stemmed, 1., Double::sum);
+                counts.merge(stemmed, 1, Integer::sum);
                 instances.computeIfAbsent(stemmed, k -> new HashSet<>()).add(new WordRep(sent, span));
             }
         }
@@ -43,15 +47,23 @@ public class KeywordCounter {
         Set<WordRep> h10 = new HashSet<>();
         Set<WordRep> h15 = new HashSet<>();

+        int doubleWordCount = 0;
+
         for (var entry : counts.entrySet()) {
             double value = getTermValue(entry, maxC);

+            double avgCnt = entry.getValue();
+            String wordStemmed = entry.getKey();

             Set<WordRep> histogram;
-            if (value < -3) histogram = h15;
-            else if (value < -2) histogram = h10;
-            else if (value < -1) histogram = h5;
+            if (value < -3 && avgCnt>1) histogram = h15;
+            else if (value < -1.75 && avgCnt>1) histogram = h10;
+            else if (value < -1 &&
+                     (!wordStemmed.contains("_") || doubleWordCount++ < 50))
+                histogram = h5;
             else continue;

-            histogram.addAll(instances.get(entry.getKey()));
+            histogram.addAll(instances.get(wordStemmed));
         }

         return new WordHistogram(h5, h10, h15);
@@ -59,7 +71,7 @@ public class KeywordCounter {

     private static final Pattern separator = Pattern.compile("_");

-    public double getTermValue(Map.Entry<String, Double> e, double maxValue) {
+    public double getTermValue(Map.Entry<String, Integer> e, double maxValue) {
         String[] parts = separator.split(e.getKey());
         double totalValue = 0.;
         for (String part : parts) {
@@ -71,9 +83,9 @@ public class KeywordCounter {
     double value(String key, double value, double maxValue) {
         double freq = dict.getTermFreqStemmed(key);
         if (freq < 1) {
-            freq = 10;
+            freq = 1;
         }
-        return (0.1 + 0.9*value/maxValue) * Math.log((1.1+freq)/docCount);
+        return (0.1 + 0.9*value/maxValue) * Math.log(freq/docCount);
     }

     public record WordHistogram(Set<WordRep> lower, Set<WordRep> mid, Set<WordRep> top) { }
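The reworked value() is an inverse-document-frequency style weight: Math.log(freq/docCount) is negative and grows in magnitude as the stem gets rarer, which is why the histogram selection above keys on value < -1 / -1.75 / -3. A rough worked illustration with hypothetical numbers (docCount, frequencies and the count factor are made up for the example):

    // assume docCount = 1_000_000 and a count factor of ~1.0
    Math.log(50 / 1_000_000.);      // ≈ -9.9 -> rare stem, passes the value < -3 threshold for the top bucket
    Math.log(400_000 / 1_000_000.); // ≈ -0.9 -> common stem, fails every threshold and is skipped
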
@@ -10,84 +10,9 @@ import java.util.ArrayList;
 import java.util.Collections;
 import java.util.List;
 import java.util.Set;
-import java.util.function.Function;
-import java.util.stream.IntStream;
-import java.util.stream.Stream;

 public class KeywordExtractor {

-    public boolean isLegacy() {
-        return legacy;
-    }
-
-    public void setLegacy(boolean legacy) {
-        this.legacy = legacy;
-    }
-
-    private boolean legacy;
-
-    public WordSpan[] getNameLikes(DocumentSentence sentence) {
-        var direct = IntStream.range(0, sentence.length())
-                .filter(i -> sentence.posTags[i].startsWith("N"))
-                .mapToObj(i -> new WordSpan(i, i+1))
-                ;
-        var two = IntStream.range(1, sentence.length())
-                .filter(i -> sentence.separators[i-1] == WordSeparator.SPACE)
-                .filter(i -> isName(i, sentence, Collections.emptySet()))
-                .filter(i -> isName(i -1, sentence, Collections.emptySet()))
-                .mapToObj(i -> new WordSpan(i-1, i+1))
-                ;
-
-        var a_in_b = IntStream.range(2, sentence.length())
-                .filter(i -> sentence.separators[i-1] == WordSeparator.SPACE)
-                .filter(i -> isProperNoun(i, sentence))
-                .filter(i -> isJoiner(sentence, i-1))
-                .filter(i -> isProperNoun(i-2, sentence))
-                .mapToObj(i -> new WordSpan(i-2, i+1))
-                ;
-
-        var a_in_det_b = IntStream.range(3, sentence.length())
-                .filter(i -> sentence.separators[i-1] == WordSeparator.SPACE
-                        && sentence.separators[i-2] == WordSeparator.SPACE)
-                .filter(i -> isProperNoun(i, sentence))
-                .filter(i -> isJoiner(sentence, i-1))
-                .filter(i -> sentence.posTags[i-2].equals("DT"))
-                .filter(i -> isProperNoun(i-3, sentence))
-                .mapToObj(i -> new WordSpan(i-3, i+1))
-                ;
-        var a_in_in_b = IntStream.range(3, sentence.length())
-                .filter(i -> sentence.separators[i-1] == WordSeparator.SPACE
-                        && sentence.separators[i-2] == WordSeparator.SPACE)
-                .filter(i -> isProperNoun(i, sentence))
-                .filter(i -> isJoiner(sentence, i-1) || isProperNoun(i-1, sentence))
-                .filter(i -> isJoiner(sentence, i-2) || isProperNoun(i-2, sentence))
-                .filter(i -> isProperNoun(i-3, sentence))
-                .mapToObj(i -> new WordSpan(i-3, i+1))
-                ;
-        var three = IntStream.range(2, sentence.length())
-                .filter(i -> sentence.separators[i-1] == WordSeparator.SPACE
-                        && sentence.separators[i-2] == WordSeparator.SPACE)
-                .filter(i -> isName(i, sentence, Collections.emptySet()))
-                .filter(i -> isName(i-1, sentence, Collections.emptySet()))
-                .filter(i -> isName(i-2, sentence, Collections.emptySet()))
-                .mapToObj(i -> new WordSpan(i-2, i+1))
-                ;
-        var four = IntStream.range(3, sentence.length())
-                .filter(i -> sentence.separators[i-1] == WordSeparator.SPACE
-                        && sentence.separators[i-2] == WordSeparator.SPACE
-                        && sentence.separators[i-3] == WordSeparator.SPACE)
-                .filter(i -> isName(i, sentence, Collections.emptySet()))
-                .filter(i -> isName(i - 1, sentence, Collections.emptySet()))
-                .filter(i -> isName(i - 2, sentence, Collections.emptySet()))
-                .filter(i -> isName(i - 3, sentence, Collections.emptySet()))
-                .mapToObj(i -> new WordSpan(i-3, i+1))
-                ;
-
-        return Stream.of(direct, two, a_in_b, a_in_in_b, a_in_det_b, three, four).flatMap(Function.identity())
-                .toArray(WordSpan[]::new);
-    }
-
-
     public WordSpan[] getNames(DocumentSentence sentence) {
         List<WordSpan> spans = new ArrayList<>(sentence.length());

@@ -214,7 +139,7 @@ public class KeywordExtractor {
         }
         String word = sentence.constructWordFromSpan(w);

-        if (word.isBlank() || WordPatterns.isStopWord(word)) return false;
+        if (word.isBlank() || !WordPatterns.filter(word)) return false;
         if (sentence.posTags[w.start].equals("CC")) return false;
         if (sentence.posTags[w.end-1].equals("IN")) return false;
         if (sentence.posTags[w.end-1].equals("DT")) return false;
@@ -22,6 +22,9 @@ public class NameCounter {
             DocumentSentence sent = dld.sentences[i];
             var keywords = keywordExtractor.getNames(sent);
             for (var span : keywords) {
+                if (span.size() <= 1)
+                    continue;
+
                 var stemmed = sent.constructStemmedWordFromSpan(span);

                 counts.merge(stemmed, 1., Double::sum);
@@ -52,7 +52,7 @@ public class ConverterMain {
         logger.info("Starting pipe");

         try (WorkLog processLog = plan.createProcessWorkLog()) {
-            var pipe = new ParallelPipe<CrawledDomain, ProcessingInstructions>("Crawler", 48, 4, 2) {
+            var pipe = new ParallelPipe<CrawledDomain, ProcessingInstructions>("Crawler", 16, 4, 2) {

                 @Override
                 protected ProcessingInstructions onProcess(CrawledDomain domainData) {
@@ -73,12 +73,7 @@ public class ConverterMain {

             };

-            plan.forEachCrawledDomain(domain -> {
-                if (!processLog.isJobFinished(domain.id)) {
-                    logger.info("{} - {}", domain.domain, domain.id);
-                    pipe.accept(domain);
-                }
-            });
+            plan.forEachCrawledDomain(id -> !processLog.isJobFinished(id), pipe::accept);

             pipe.join();
         }
@@ -3,6 +3,14 @@ package nu.marginalia.wmsa.edge.converting;
 import com.github.luben.zstd.ZstdOutputStream;
 import com.google.gson.Gson;
 import nu.marginalia.wmsa.edge.converting.interpreter.Instruction;
+import nu.marginalia.wmsa.edge.converting.interpreter.Interpreter;
+import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DocumentKeywords;
+import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DomainLink;
+import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadProcessedDocument;
+import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadProcessedDocumentWithError;
+import nu.marginalia.wmsa.edge.model.EdgeDomain;
+import nu.marginalia.wmsa.edge.model.EdgeUrl;
+import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -36,7 +44,9 @@ public class CrawledInstructionWriter {
         }

         try (var outputStream = new OutputStreamWriter(new ZstdOutputStream(new BufferedOutputStream(new FileOutputStream(outputFile.toFile()))))) {
-            logger.info("Writing {} - {}", id, instructionList.size());
+
+            SummarizingInterpreter summary = new SummarizingInterpreter(instructionList);
+            logger.info("Writing {} - {} - {}", id, instructionList.size(), summary);

             for (var instr : instructionList) {
                 outputStream.append(instr.tag().name());
@@ -59,4 +69,54 @@ public class CrawledInstructionWriter {
         }
         return destDir.resolve(id + ".pzstd");
     }
+
+    private static class SummarizingInterpreter implements Interpreter {
+
+        private SummarizingInterpreter(List<Instruction> instructions) {
+            for (var i : instructions) {
+                i.apply(this);
+            }
+        }
+
+        private String domainName;
+        private int ok = 0;
+        private int error = 0;
+
+        public String toString() {
+            return String.format("%s - %d %d", domainName, ok, error);
+        }
+
+        @Override
+        public void loadUrl(EdgeUrl[] url) {}
+
+        @Override
+        public void loadDomain(EdgeDomain[] domain) {}
+
+        @Override
+        public void loadRssFeed(EdgeUrl[] rssFeed) {}
+
+        @Override
+        public void loadDomainLink(DomainLink[] links) {}
+
+        @Override
+        public void loadProcessedDomain(EdgeDomain domain, EdgeDomainIndexingState state, String ip) {
+            this.domainName = domain.toString();
+        }
+
+        @Override
+        public void loadProcessedDocument(LoadProcessedDocument loadProcessedDocument) {
+            ok++;
+        }
+
+        @Override
+        public void loadProcessedDocumentWithError(LoadProcessedDocumentWithError loadProcessedDocumentWithError) {
+            error++;
+        }
+
+        @Override
+        public void loadKeywords(EdgeUrl url, DocumentKeywords[] words) {}
+
+        @Override
+        public void loadDomainRedirect(DomainLink link) {}
+    }
 }
@@ -72,7 +72,7 @@ public class Loader implements Interpreter {
     @Override
     public void loadDomainLink(DomainLink[] links) {
         logger.debug("loadDomainLink({})", links, null);
-        sqlLoadDomainLinks.load(links);
+        sqlLoadDomainLinks.load(data, links);
     }

     @Override
@@ -40,13 +40,20 @@ public class SqlLoadDomainLinks {
         }
     }

-    public void load(DomainLink[] links) {
+    public void load(LoaderData data, DomainLink[] links) {

         try (var connection = dataSource.getConnection();
+             var nukeExistingLinksForDomain =
+                     connection.prepareStatement("""
+                             DELETE FROM EC_DOMAIN_LINK WHERE SOURCE_DOMAIN_ID=?
+                             """);
              var stmt =
                      connection.prepareCall("CALL INSERT_LINK(?,?)"))
         {
+            nukeExistingLinksForDomain.setInt(1, data.getDomainId(links[0].from()));
+            nukeExistingLinksForDomain.executeUpdate();
+
             for (DomainLink link : links) {
                 stmt.setString(1, link.from().toString());
                 stmt.setString(2, link.to().toString());
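The new delete-before-insert step makes reloading a domain's links idempotent: the existing EC_DOMAIN_LINK rows for the source domain are wiped before the stored procedure repopulates them. A minimal sketch of the assumed call pattern (variable names are illustrative, not from the diff):

    // all links in a batch share the same source domain, so links[0].from() identifies it
    sqlLoadDomainLinks.load(data, links);   // DELETE old rows for the source, then CALL INSERT_LINK(?,?) per link
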
@@ -15,6 +15,7 @@ public class ProcessedDocument {
     public EdgePageWordSet words;

     public EdgeUrlState state;
+    public String stateReason;

     public OptionalDouble quality() {
         if (details != null) {
@@ -70,11 +70,22 @@ public class DocumentProcessor {
         this.summaryExtractor = summaryExtractor;
     }

+    public ProcessedDocument makeDisqualifiedStub(CrawledDocument crawledDocument) {
+        ProcessedDocument ret = new ProcessedDocument();
+
+        try {
+            ret.state = EdgeUrlState.DISQUALIFIED;
+            ret.url = getDocumentUrl(crawledDocument);
+        }
+        catch (Exception ex) {}
+
+        return ret;
+    }
     public ProcessedDocument process(CrawledDocument crawledDocument, CrawledDomain crawledDomain) {
         ProcessedDocument ret = new ProcessedDocument();

         try {
-            ret.url = new EdgeUrl(crawledDocument.url);
+            ret.url = getDocumentUrl(crawledDocument);
             ret.state = crawlerStatusToUrlState(crawledDocument.crawlerStatus, crawledDocument.httpStatus);

             if (ret.state == EdgeUrlState.OK) {
@@ -99,17 +110,31 @@ public class DocumentProcessor {
         }
         catch (DisqualifiedException ex) {
             ret.state = EdgeUrlState.DISQUALIFIED;
+            ret.stateReason = ex.reason.toString();
             logger.debug("Disqualified {}: {}", ret.url, ex.reason);
         }
         catch (Exception ex) {
             ret.state = EdgeUrlState.DISQUALIFIED;
-            logger.info("Failed to convert " + ret.url, ex);
+            logger.info("Failed to convert " + crawledDocument.url, ex);
             ex.printStackTrace();
         }

         return ret;
     }

+    private EdgeUrl getDocumentUrl(CrawledDocument crawledDocument)
+            throws URISyntaxException
+    {
+        if (crawledDocument.canonicalUrl != null) {
+            try {
+                return new EdgeUrl(crawledDocument.canonicalUrl);
+            }
+            catch (URISyntaxException ex) { /* fallthrough */ }
+        }
+
+        return new EdgeUrl(crawledDocument.url);
+    }
+
     public static boolean isAcceptedContentType(CrawledDocument crawledDocument) {
         if (crawledDocument.contentType == null) {
             return false;
@@ -155,20 +180,26 @@ public class DocumentProcessor {

         var ret = new ProcessedDocumentDetails();

-        ret.description = getDescription(doc);
-
         ret.length = getLength(doc);
         ret.standard = getHtmlStandard(doc);
         ret.title = titleExtractor.getTitleAbbreviated(doc, dld, crawledDocument.url);
-        ret.features = featureExtractor.getFeatures(crawledDomain, doc, dld);

         ret.quality = documentValuator.getQuality(ret.standard, doc, dld);
         ret.hashCode = HashCode.fromString(crawledDocument.documentBodyHash).asLong();

+        final boolean doSimpleProcessing = ret.quality < minDocumentQuality;
+
         EdgePageWordSet words;
-        if (ret.quality < minDocumentQuality || dld.totalNumWords() < minDocumentLength) {
+        if (doSimpleProcessing) {
+            ret.features = Set.of(HtmlFeature.UNKNOWN);
             words = keywordExtractor.extractKeywordsMinimal(dld);
+            ret.description = "";
         }
         else {
+            ret.features = featureExtractor.getFeatures(crawledDomain, doc, dld);
             words = keywordExtractor.extractKeywords(dld);
+            ret.description = getDescription(doc);
         }

         var url = new EdgeUrl(crawledDocument.url);
@@ -276,6 +307,10 @@ public class DocumentProcessor {
     }

     private void checkDocumentLanguage(DocumentLanguageData dld) throws DisqualifiedException {
+        if (dld.totalNumWords() < minDocumentLength) {
+            throw new DisqualifiedException(DisqualificationReason.LENGTH);
+        }
+
         double languageAgreement = languageFilter.dictionaryAgreement(dld);
         if (languageAgreement < 0.1) {
             throw new DisqualifiedException(DisqualificationReason.LANGUAGE);
@@ -1,23 +1,27 @@
 package nu.marginalia.wmsa.edge.converting.processor;

+import com.google.common.base.Strings;
 import com.google.inject.Inject;
 import com.google.inject.name.Named;
 import nu.marginalia.wmsa.edge.converting.model.ProcessedDocument;
 import nu.marginalia.wmsa.edge.converting.model.ProcessedDomain;
+import nu.marginalia.wmsa.edge.converting.processor.logic.CommonKeywordExtractor;
+import nu.marginalia.wmsa.edge.crawling.model.CrawledDocument;
 import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain;
 import nu.marginalia.wmsa.edge.crawling.model.CrawlerDomainStatus;
 import nu.marginalia.wmsa.edge.index.model.IndexBlock;
 import nu.marginalia.wmsa.edge.model.EdgeDomain;
 import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState;
 import nu.marginalia.wmsa.edge.model.crawl.EdgeUrlState;

 import java.util.*;
-import java.util.stream.Collectors;

 public class DomainProcessor {
+    private static final CommonKeywordExtractor commonKeywordExtractor = new CommonKeywordExtractor();
+
     private final DocumentProcessor documentProcessor;
     private final Double minAvgDocumentQuality;


     @Inject
     public DomainProcessor(DocumentProcessor documentProcessor,
                            @Named("min-avg-document-quality") Double minAvgDocumentQuality
@@ -39,61 +43,71 @@ public class DomainProcessor {
         if (crawledDomain.doc != null) {
             ret.documents = new ArrayList<>(crawledDomain.doc.size());

+            fixBadCanonicalTags(crawledDomain.doc);
+
             DocumentDisqualifier disqualifier = new DocumentDisqualifier();
             for (var doc : crawledDomain.doc) {
-                var processedDoc = documentProcessor.process(doc, crawledDomain);
-                if (processedDoc.url != null) {
-                    ret.documents.add(processedDoc);
+                if (disqualifier.isQualified()) {
+                    var processedDoc = documentProcessor.process(doc, crawledDomain);
+
+                    if (processedDoc.url != null) {
+                        ret.documents.add(processedDoc);
+                        processedDoc.quality().ifPresent(disqualifier::offer);
+                    }
+                    else if ("LANGUAGE".equals(processedDoc.stateReason)) {
+                        disqualifier.offer(-100);
+                    }
+                }
+                else { // Short-circuit processing if quality is too low
+                    var stub = documentProcessor.makeDisqualifiedStub(doc);
+                    if (stub.url != null) {
+                        ret.documents.add(stub);
+                    }
                 }
             }

-            addCommonSiteWords(ret);
+            Set<String> commonSiteWords = new HashSet<>(10);
+
+            commonSiteWords.addAll(commonKeywordExtractor.getCommonSiteWords(ret, IndexBlock.Tfidf_Top, IndexBlock.Subjects));
+            commonSiteWords.addAll(commonKeywordExtractor.getCommonSiteWords(ret, IndexBlock.Title));
+
+            if (!commonSiteWords.isEmpty()) {
+                for (var doc : ret.documents) {
+                    if (doc.words != null) {
+                        doc.words.get(IndexBlock.Site).addAll(commonSiteWords);
+                    }
+                }
+            }
         }
         else {
             ret.documents = Collections.emptyList();
         }

-        double averageQuality = getAverageQuality(ret.documents);
-        if (averageQuality < minAvgDocumentQuality) {
-            ret.documents.forEach(doc -> doc.state = EdgeUrlState.DISQUALIFIED);
-        }
-
         ret.state = getState(crawledDomain.crawlerStatus);

         return ret;
     }

-    private void addCommonSiteWords(ProcessedDomain ret) {
-
-        if (ret.documents.size() < 25)
-            return;
-
-        Map<String, Integer> topKeywordCount = new HashMap<>(ret.documents.size()*10);
-
-        for (var doc : ret.documents) {
-            if (doc.words == null)
-                continue;
-
-            for (var word : doc.words.get(IndexBlock.Tfidf_Top).words) {
-                topKeywordCount.merge(word, -1, Integer::sum);
-            }
-        }
-
-        if (topKeywordCount.values().stream().mapToInt(i -> i).sum() > -100)
-            return;
-
-        Set<String> topWords = topKeywordCount.entrySet().stream()
-                .filter(e -> e.getValue() < -10)
-                .sorted(Map.Entry.comparingByValue()).limit(5)
-                .map(Map.Entry::getKey)
-                .collect(Collectors.toSet());
-
-        if (!topWords.isEmpty()) {
-            for (var doc : ret.documents) {
-                if (doc.words != null) {
-                    doc.words.get(IndexBlock.Site).addAll(topWords);
-                }
-            }
-        }
+    private void fixBadCanonicalTags(List<CrawledDocument> docs) {
+        Map<String, Set<String>> seenCanonicals = new HashMap<>();
+
+        // Sometimes sites set a blanket canonical link to their root page
+        // this removes such links from consideration
+
+        for (var document : docs) {
+            if (!Strings.isNullOrEmpty(document.canonicalUrl) && !Objects.equals(document.canonicalUrl, document.url)) {
+                seenCanonicals.computeIfAbsent(document.canonicalUrl, url -> new HashSet<>()).add(document.documentBodyHash);
+            }
+        }
+
+        for (var document : docs) {
+            if (!Strings.isNullOrEmpty(document.canonicalUrl)
+                    && !Objects.equals(document.canonicalUrl, document.url)
+                    && seenCanonicals.getOrDefault(document.canonicalUrl, Collections.emptySet()).size() > 1) {
+                document.canonicalUrl = document.url;
+            }
+        }

     }

     private double getAverageQuality(List<ProcessedDocument> documents) {
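fixBadCanonicalTags deals with sites that declare a blanket canonical link pointing at one page: a canonical URL is only distrusted when documents with different body hashes all claim it. The rule the two passes implement, restated compactly for reference:

    // pass 1: canonicalUrl -> set of distinct body hashes that claim it
    // pass 2: if a document's canonicalUrl differs from its own URL and that canonical is
    //         claimed by more than one distinct body hash, reset canonicalUrl = url
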
@@ -120,4 +134,20 @@ public class DomainProcessor {
             default -> EdgeDomainIndexingState.ERROR;
         };
     }

+    class DocumentDisqualifier {
+        int count;
+        int goodCount;
+
+        void offer(double quality) {
+            count++;
+            if (quality > minAvgDocumentQuality) {
+                goodCount++;
+            }
+        }
+
+        boolean isQualified() {
+            return count < 25 || goodCount*10 >= count;
+        }
+    }
 }
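DocumentDisqualifier replaces the old whole-domain average-quality pass with an incremental check: the first 25 documents always get full processing, and after that full processing continues only while at least one in ten offered documents clears the quality cutoff. Spot checks of isQualified() with hypothetical counts:

    // count = 10, goodCount = 0 -> 10 < 25                -> qualified (still warming up)
    // count = 40, goodCount = 4 -> 4*10 >= 40             -> qualified
    // count = 40, goodCount = 3 -> 40 >= 25 and 30 < 40   -> not qualified; remaining docs become disqualified stubs
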
@@ -0,0 +1,71 @@
+package nu.marginalia.wmsa.edge.converting.processor.logic;
+
+import ca.rmen.porterstemmer.PorterStemmer;
+import nu.marginalia.wmsa.edge.converting.model.ProcessedDomain;
+import nu.marginalia.wmsa.edge.index.model.IndexBlock;
+
+import java.util.*;
+
+public class CommonKeywordExtractor {
+    private final PorterStemmer ps = new PorterStemmer();
+
+    private static final int MIN_REQUIRED_DOCUMENTS = 25;
+
+    private static final int REQUIRED_TOTAL_COUNT_FOR_CONSIDERATION = 100;
+    private static final double QUALIFYING_PROPORTION_FOR_KEYWORD = .25;
+
+    private static final int MAX_SITE_KEYWORDS_TO_EXTRACT = 5;
+
+    public List<String> getCommonSiteWords(ProcessedDomain ret, IndexBlock... sourceBlocks) {
+
+        if (ret.documents.size() < MIN_REQUIRED_DOCUMENTS)
+            return Collections.emptyList();
+
+        final Map<String, String> wordToStemmedMemoized = new HashMap<>(ret.documents.size()*10);
+
+        final Map<String, Integer> topStemmedKeywordCount = new HashMap<>(ret.documents.size()*10);
+        final Map<String, Set<String>> stemmedToNonstemmedVariants = new HashMap<>(ret.documents.size()*10);
+
+        int qualifiedDocCount = 0;
+        for (var doc : ret.documents) {
+            if (doc.words == null)
+                continue;
+
+            qualifiedDocCount++;
+
+            for (var block : sourceBlocks) {
+                for (var word : doc.words.get(block).words) {
+                    String wordStemmed = wordToStemmedMemoized.computeIfAbsent(word, ps::stemWord);
+
+                    // Count by negative values to sort by Map.Entry.comparingByValue() in reverse
+                    topStemmedKeywordCount.merge(wordStemmed, -1, Integer::sum);
+
+                    stemmedToNonstemmedVariants.computeIfAbsent(wordStemmed, w -> new HashSet<>()).add(word);
+                }
+            }
+        }
+
+        int totalValue = 0;
+        for (int value : topStemmedKeywordCount.values()) {
+            totalValue += value;
+        }
+
+        if (totalValue > -REQUIRED_TOTAL_COUNT_FOR_CONSIDERATION)
+            return Collections.emptyList();
+
+        List<String> topWords = new ArrayList<>(MAX_SITE_KEYWORDS_TO_EXTRACT);
+
+        double qualifyingValue = -qualifiedDocCount * QUALIFYING_PROPORTION_FOR_KEYWORD;
+
+        topStemmedKeywordCount.entrySet().stream()
+                .filter(e -> e.getValue() < qualifyingValue)
+                .sorted(Map.Entry.comparingByValue())
+                .limit(MAX_SITE_KEYWORDS_TO_EXTRACT)
+                .forEach(e -> topWords.addAll(stemmedToNonstemmedVariants.get(e.getKey())));
+
+
+        return topWords;
+
+    }
+
+}
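This new class backs the DomainProcessor change earlier in the commit: stems that recur across a site's documents in the given blocks get their surface forms promoted into IndexBlock.Site. A brief usage sketch matching the call sites shown above (only the surrounding variable names are assumed):

    Set<String> commonSiteWords = new HashSet<>(10);
    commonSiteWords.addAll(commonKeywordExtractor.getCommonSiteWords(ret, IndexBlock.Tfidf_Top, IndexBlock.Subjects));
    commonSiteWords.addAll(commonKeywordExtractor.getCommonSiteWords(ret, IndexBlock.Title));
    // returns nothing unless the domain has at least 25 processed documents and a stem
    // appears in more than a quarter of them (QUALIFYING_PROPORTION_FOR_KEYWORD = .25)
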
@@ -14,6 +14,8 @@ public enum HtmlFeature {
     ADVERTISEMENT("special:ads"),

+    CATEGORY_CRAFTS("category:crafts"),
+
     UNKNOWN("special:uncategorized")
     ;

     private final String keyword;
@@ -13,10 +13,14 @@ import java.io.InputStreamReader;
 import java.nio.file.Path;
 import java.util.ArrayList;
 import java.util.List;
+import java.util.concurrent.ForkJoinPool;
+import java.util.concurrent.TimeUnit;

 public class CrawledDomainReader {
     private final Gson gson = new GsonBuilder().create();

+    private final ForkJoinPool pool = new ForkJoinPool(4);
+
     public CrawledDomainReader() {
     }

@@ -43,7 +47,12 @@ public class CrawledDomainReader {
             if (line.equals(CrawledDomain.SERIAL_IDENTIFIER)) {
                 domain = gson.fromJson(nextLine, CrawledDomain.class);
             } else if (line.equals(CrawledDocument.SERIAL_IDENTIFIER)) {
-                docs.add(gson.fromJson(nextLine, CrawledDocument.class));
+                pool.execute(() -> {
+                    var doc = gson.fromJson(nextLine, CrawledDocument.class);
+                    synchronized (docs) {
+                        docs.add(doc);
+                    }
+                });
             }
         } else if (line.charAt(0) == '{') {
             domain = gson.fromJson(line, CrawledDomain.class);
@@ -52,6 +61,8 @@ public class CrawledDomainReader {
             }
         }

+        pool.awaitQuiescence(10, TimeUnit.SECONDS);
+
        if (domain == null) {
            return null;
        }
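Per-document JSON parsing is now handed to a four-thread ForkJoinPool, with only the list append synchronized; awaitQuiescence(10, TimeUnit.SECONDS) then blocks until the queued parse tasks have drained before the domain is assembled, so documents may end up appended out of file order. The same pattern in a self-contained form (hypothetical Doc type and input, for illustration only):

    ForkJoinPool pool = new ForkJoinPool(4);
    List<Doc> docs = new ArrayList<>();
    for (String json : jsonLines) {
        pool.execute(() -> {
            Doc doc = gson.fromJson(json, Doc.class);   // the expensive part runs in parallel
            synchronized (docs) { docs.add(doc); }      // the cheap append is serialized
        });
    }
    pool.awaitQuiescence(10, TimeUnit.SECONDS);         // wait for outstanding parse tasks
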
@@ -6,5 +6,6 @@ public enum CrawlerDocumentStatus {
     BAD_CHARSET,
     REDIRECT,
     ROBOTS_TXT,
-    ERROR
+    ERROR,
+    Timeout
 }
@@ -1,5 +1,6 @@
 package nu.marginalia.wmsa.edge.data.dao;

+import com.google.common.base.Strings;
 import com.google.common.cache.Cache;
 import com.google.common.cache.CacheBuilder;
 import com.google.common.util.concurrent.UncheckedExecutionException;
@@ -113,9 +114,12 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
                         Double.MAX_VALUE, // termScore
                         0 // queryLength
                 );
-                if (val.urlQuality >= QUALITY_LOWER_BOUND_CUTOFF) {
-                    result.add(val);
+                if (val.urlQuality <= QUALITY_LOWER_BOUND_CUTOFF
+                    && Strings.isNullOrEmpty(val.description)
+                    && val.url.path.length() > 1) {
+                    continue;
                 }
+                result.add(val);

             }
         }
@@ -6,7 +6,6 @@ import com.google.inject.Inject;
 import com.google.inject.name.Named;
 import com.google.protobuf.InvalidProtocolBufferException;
 import gnu.trove.map.TLongIntMap;
-import gnu.trove.map.hash.TIntIntHashMap;
 import gnu.trove.map.hash.TLongIntHashMap;
 import gnu.trove.set.hash.TIntHashSet;
 import io.prometheus.client.Histogram;
@@ -227,12 +226,7 @@ public class EdgeIndexService extends Service {

         long start = System.currentTimeMillis();
         try {
-            if (specsSet.isStagger()) {
-                return new EdgeSearchResultSet(searchStaggered(specsSet));
-            }
-            else {
-                return new EdgeSearchResultSet(searchStraight(specsSet));
-            }
+            return new EdgeSearchResultSet(searchStraight(specsSet));
         }
         catch (HaltException ex) {
             logger.warn("Halt", ex);
@@ -249,59 +243,9 @@ public class EdgeIndexService extends Service {
         }
     }

-    private Map<IndexBlock, List<EdgeSearchResults>> searchStaggered(EdgeSearchSpecification specsSet) {
-        int count = 0;
-
-        final Map<IndexBlock, List<EdgeSearchResults>> results = new HashMap<>();
-        final TIntHashSet seenResults = new TIntHashSet();
-
-        final DomainResultCountFilter[] domainCountFilter = new DomainResultCountFilter[] {
-                new DomainResultCountFilter(specsSet.limitByDomain),
-                new DomainResultCountFilter(specsSet.limitByDomain)
-        };
-
-        final IndexSearchBudget budget = new IndexSearchBudget(SEARCH_BUDGET_TIMEOUT_MS);
-        final TIntIntHashMap limitsPerBucketRemaining = new TIntIntHashMap(6, 0.7f, 0, specsSet.limitByBucket);
-
-        for (int i = 0; i < specsSet.buckets.size(); i+=2) {
-            for (var sq : specsSet.subqueries) {
-                for (int j = 0; j < 2 && i + j < specsSet.buckets.size(); j++) {
-                    Optional<EdgeIndexSearchTerms> searchTerms = getSearchTerms(sq);
-
-                    if (searchTerms.isEmpty())
-                        continue;
-
-                    var result = performSearch(searchTerms.get(),
-                            budget,
-                            seenResults,
-                            domainCountFilter[j],
-                            sq,
-                            List.of(specsSet.buckets.get(i+j)),
-                            specsSet,
-                            Math.min(limitsPerBucketRemaining.get(i+j), specsSet.limitTotal - count)
-                    );
-
-                    if (logger.isDebugEnabled()) {
-                        logger.debug("{} -> {} {} {}", sq.block, specsSet.buckets.get(i+j), sq.searchTermsInclude, result.results.values().stream().mapToInt(List::size).sum());
-                    }
-
-                    int sz = result.size();
-                    count += sz;
-                    limitsPerBucketRemaining.adjustOrPutValue(i+j, -sz, specsSet.limitByBucket-sz);
-
-                    if (sz > 0) {
-                        results.computeIfAbsent(sq.block, s -> new ArrayList<>()).add(result);
-                    }
-                }
-            }
-        }
-
-        return results;
-    }
-
     @NotNull
-    private Map<IndexBlock, List<EdgeSearchResults>> searchStraight(EdgeSearchSpecification specsSet) {
-        Map<IndexBlock, List<EdgeSearchResults>> results = new HashMap<>();
+    private Map<IndexBlock, List<EdgeSearchResultItem>> searchStraight(EdgeSearchSpecification specsSet) {
+        Map<IndexBlock, List<EdgeSearchResultItem>> results = new HashMap<>();
         int count = 0;
         TIntHashSet seenResults = new TIntHashSet();

@@ -314,25 +258,38 @@ public class EdgeIndexService extends Service {
                 if (searchTerms.isEmpty())
                     continue;

-                var result = performSearch(searchTerms.get(),
+                var resultForSq = performSearch(searchTerms.get(),
                         budget, seenResults, domainCountFilter,
                         sq, specsSet.buckets, specsSet,
                         specsSet.limitTotal - count);

                 if (logger.isDebugEnabled()) {
-                    logger.debug("{} -> {} {}", sq.block, sq.searchTermsInclude, result.size());
+                    logger.debug("{} -> {} {}", sq.block, sq.searchTermsInclude, resultForSq.size());
                 }

-                count += result.size();
-                if (result.size() > 0) {
-                    results.computeIfAbsent(sq.block, s -> new ArrayList<>()).add(result);
+                count += resultForSq.size();
+                if (resultForSq.size() > 0) {
+                    results.computeIfAbsent(sq.block, s -> new ArrayList<>()).addAll(resultForSq);
                 }
         }

+
+        List<List<String>> distinctSearchTerms = specsSet.subqueries.stream().map(sq -> sq.searchTermsInclude).distinct().toList();
+
+        results.forEach((index, blockResults) -> {
+            for (var result : blockResults) {
+                for (int i = 0; i < distinctSearchTerms.size(); i++) {
+                    for (var term : distinctSearchTerms.get(i)) {
+                        result.scores.add(getSearchTermScore(i, result.bucketId, term, result.getCombinedId()));
+                    }
+                }
+            }
+        });
+
         return results;
     }

-    private EdgeSearchResults performSearch(EdgeIndexSearchTerms searchTerms,
+    private List<EdgeSearchResultItem> performSearch(EdgeIndexSearchTerms searchTerms,
                                             IndexSearchBudget budget,
                                             TIntHashSet seenResults,
                                             DomainResultCountFilter domainCountFilter,
@@ -342,14 +299,14 @@ public class EdgeIndexService extends Service {
                                             int limit)
     {
         if (limit <= 0) {
-            return new EdgeSearchResults();
+            return new ArrayList<>();
         }

-        final Map<Integer, List<EdgeSearchResultItem>> results = new HashMap<>();
+        final List<EdgeSearchResultItem> results = new ArrayList<>();
         final DomainResultCountFilter localFilter = new DomainResultCountFilter(specs.limitByDomain);

         for (int i : specBuckets) {
-            int foundResultsCount = results.values().stream().mapToInt(List::size).sum();
+            int foundResultsCount = results.size();

             if (foundResultsCount >= specs.limitTotal || foundResultsCount >= limit)
                 break;
@@ -362,38 +319,33 @@ public class EdgeIndexService extends Service {
                     .limit(specs.limitTotal * 3L)
                     .distinct()
                     .limit(Math.min(specs.limitByBucket
-                            - results.values().stream().mapToInt(Collection::size).sum(), limit - foundResultsCount))
+                            - results.size(), limit - foundResultsCount))
                     .forEach(resultsForBucket::add);


             for (var result : resultsForBucket) {
                 seenResults.add(result.url.id());
             }
-            for (var result : resultsForBucket) {
-                for (var searchTerm : sq.searchTermsInclude) {
-                    result.scores.add(getSearchTermScore(i, searchTerm, result.getCombinedId()));
-                }
-            }
-
             domainCountFilter.addAll(i, resultsForBucket);

-            if (!resultsForBucket.isEmpty()) {
-                results.put(i, resultsForBucket);
-            }
+            results.addAll(resultsForBucket);
         }

-        return new EdgeSearchResults(results);
+        return results;
     }

-    private EdgeSearchResultKeywordScore getSearchTermScore(int bucketId, String term, long urlId) {
+    private EdgeSearchResultKeywordScore getSearchTermScore(int set, int bucketId, String term, long urlId) {
         final int termId = indexes.getDictionaryReader().get(term);

         var bucket = indexes.getBucket(bucketId);

-        return new EdgeSearchResultKeywordScore(term,
+        return new EdgeSearchResultKeywordScore(set, term,
                 bucket.getTermScore(termId, urlId),
                 bucket.isTermInBucket(IndexBlock.Title, termId, urlId),
-                bucket.isTermInBucket(IndexBlock.Link, termId, urlId)
+                bucket.isTermInBucket(IndexBlock.Link, termId, urlId),
+                bucket.isTermInBucket(IndexBlock.Site, termId, urlId),
+                bucket.isTermInBucket(IndexBlock.Subjects, termId, urlId)
         );

     }
@@ -2,20 +2,20 @@ package nu.marginalia.wmsa.edge.index.model;

 public enum IndexBlock {
     TitleKeywords(0, 0),
-    Title(1, 1),
+    Title(1, 0),

     Link(2, 1.15),

-    Subjects(3, 3.0),
+    Subjects(3, 1.0),
     NamesWords(4, 3.0),
     Artifacts(5, 10),
     Meta(6, 7),

-    Tfidf_Top(7, 0.5),
-    Tfidf_Middle(8, 1.25),
-    Tfidf_Lower(9, 1.5),
+    Tfidf_Top(7, 1.5),
+    Tfidf_Middle(8, 2),
+    Tfidf_Lower(9, 3.5),

-    Words_1(10, 3.0),
+    Words_1(10, 2.0),
     Words_2(11, 3.5),
     Words_4(12, 4.0),
     Words_8(13, 4.5),
@@ -47,7 +47,7 @@ public class SearchIndexReader implements AutoCloseable {
         var linkIndex = indices.get(IndexBlock.Link);
         var titleIndex = indices.get(IndexBlock.Title);
         var namesIndex = indices.get(IndexBlock.NamesWords);
-        var titleKeywordsIndex = indices.get(IndexBlock.TitleKeywords);
+        var siteIndex = indices.get(IndexBlock.Site);
         var metaIndex = indices.get(IndexBlock.Meta);
         var topicIndex = indices.get(IndexBlock.Subjects);

@@ -61,14 +61,17 @@ public class SearchIndexReader implements AutoCloseable {
         queryBuilders = new EnumMap<>(IndexBlock.class);
         underspecifiedQueryBuilders = new EnumMap<>(IndexBlock.class);

+        queryBuilders.put(IndexBlock.Title, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleIndex, linkIndex), words1));
         queryBuilders.put(IndexBlock.Words_1, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleIndex, topIndex, words1), words1));
         queryBuilders.put(IndexBlock.Words_2, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleIndex, topIndex, words2), words1));
         queryBuilders.put(IndexBlock.Words_4, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleIndex, topIndex, words4), words1));
         queryBuilders.put(IndexBlock.Words_8, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleIndex, topIndex, words8), words1));
         queryBuilders.put(IndexBlock.Words_16Plus, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleIndex, topIndex, words1, words2, words4, words8, words16, artifacts), words1));

-        underspecifiedQueryBuilders.put(IndexBlock.Title, new IndexQueryBuilder(listOfNonNulls(titleIndex, linkIndex, topIndex, namesIndex, topicIndex, metaIndex), words1));
-        underspecifiedQueryBuilders.put(IndexBlock.Tfidf_Top, new IndexQueryBuilder(listOfNonNulls(linkIndex, namesIndex, topIndex, midIndex, lowIndex, topicIndex, metaIndex), words1));
+        underspecifiedQueryBuilders.put(IndexBlock.Title, new IndexQueryBuilder(listOfNonNulls(titleIndex, linkIndex, topIndex, siteIndex, namesIndex, topicIndex, metaIndex), words1));
+        underspecifiedQueryBuilders.put(IndexBlock.Tfidf_Top, new IndexQueryBuilder(listOfNonNulls(topIndex, linkIndex, namesIndex, siteIndex, midIndex, topicIndex, metaIndex), words1));
+        underspecifiedQueryBuilders.put(IndexBlock.Tfidf_Middle, new IndexQueryBuilder(listOfNonNulls(midIndex, linkIndex, namesIndex, topIndex, siteIndex, midIndex, lowIndex, topicIndex, metaIndex), words1));
+        underspecifiedQueryBuilders.put(IndexBlock.Tfidf_Lower, new IndexQueryBuilder(listOfNonNulls(midIndex, linkIndex, namesIndex, topIndex, siteIndex, midIndex, lowIndex, topicIndex, metaIndex, artifacts), words1));
     }

     @SafeVarargs
@@ -46,7 +46,7 @@ public class IndexQueryBuilder {
             return new QueryForIndices(budget, LongStream::empty);
         }
         else if (relevantIndices.length == 1 || relevantIndices[0] != 0) {
-            return build(budget, filter, wordId);
+            return new QueryForIndices(budget, LongStream::empty);
         }

         var fstRange = requiredIndices.get(relevantIndices[0]).rangeForWord(wordId);
@@ -16,6 +16,7 @@ import java.io.IOException;
 import java.nio.file.Path;
 import java.util.Iterator;
 import java.util.function.Consumer;
+import java.util.function.Predicate;
 import java.util.stream.Stream;

 @AllArgsConstructor @NoArgsConstructor @ToString
@@ -86,7 +87,21 @@ public class EdgeCrawlPlan {
             throw new RuntimeException(ex);
         }
     }
+    public void forEachCrawledDomain(Predicate<String> idReadPredicate, Consumer<CrawledDomain> consumer) {
+        final CrawledDomainReader reader = new CrawledDomainReader();
+
+        try (Stream<CrawlLogEntry> entryStream = WorkLog.streamLog(crawl.getLogFile())) {
+            entryStream
+                    .filter(entry -> idReadPredicate.test(entry.id()))
+                    .map(CrawlLogEntry::path)
+                    .map(this::getCrawledFilePath)
+                    .map(reader::readRuntimeExcept)
+                    .forEach(consumer);
+        }
+        catch (IOException ex) {
+            throw new RuntimeException(ex);
+        }
+    }
     @MustBeClosed
     public DomainsIterable domainsIterable() throws IOException {
         return new DomainsIterable();
@@ -13,18 +13,18 @@ import java.util.List;

 @AllArgsConstructor @ToString @Getter @EqualsAndHashCode
 public class EdgeSearchResultItem {
-    public final int blockId;
+    public final int bucketId;
     public final int queryLength;
     public final EdgeId<EdgeDomain> domain; // this isn't the external domain ID, but a ranking
     public final EdgeId<EdgeUrl> url;
     public final List<EdgeSearchResultKeywordScore> scores;

-    public EdgeSearchResultItem(int blockId, int queryLength, long val) {
+    public EdgeSearchResultItem(int bucketId, int queryLength, long val) {
         int urlId = (int) (val & 0xFFFF_FFFFL);
         int domainId = (int) (val >>> 32);

         this.queryLength = queryLength;
-        this.blockId = blockId;
+        this.bucketId = bucketId;

         url = new EdgeId<>(urlId);
         domain = new EdgeId<>(domainId);
@@ -1,14 +1,6 @@
 package nu.marginalia.wmsa.edge.model.search;

-import lombok.AllArgsConstructor;
-import lombok.EqualsAndHashCode;
-import lombok.ToString;
 import nu.marginalia.wmsa.edge.index.model.IndexBlock;

-@AllArgsConstructor @ToString @EqualsAndHashCode
-public class EdgeSearchResultKeywordScore {
-    public final String keyword;
-    public final IndexBlock index;
-    public boolean title;
-    public boolean link;
+public record EdgeSearchResultKeywordScore(int set, String keyword, IndexBlock index, boolean title, boolean link, boolean site, boolean subject) {
 }
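Converting the class to a record drops the Lombok annotations (records supply equals, hashCode and toString) and threads through a set index plus the new site/subject flags. A minimal construction sketch with illustrative values:

    var score = new EdgeSearchResultKeywordScore(0, "marginalia", IndexBlock.Title,
            true,    // term found in the Title block
            false,   // not in Link
            false,   // not in Site
            false);  // not in Subjects
    score.keyword();  // accessor methods replace the old public fields
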
@@ -10,7 +10,7 @@ import java.util.Map;

 @AllArgsConstructor @Getter @ToString
 public class EdgeSearchResultSet {
-    public Map<IndexBlock, List<EdgeSearchResults>> resultsList;
+    public Map<IndexBlock, List<EdgeSearchResultItem>> resultsList;

     public int size() {
         return resultsList.values().stream().mapToInt(List::size).sum();
@@ -4,29 +4,23 @@ import lombok.AllArgsConstructor;
 import lombok.Getter;
 import lombok.ToString;

-import java.util.HashMap;
+import java.util.ArrayList;
 import java.util.List;
-import java.util.Map;
-import java.util.stream.Collectors;
 import java.util.stream.Stream;

 @AllArgsConstructor @Getter @ToString
 public class EdgeSearchResults {
-    public final Map<Integer, List<EdgeSearchResultItem>> results;
+    public final List<EdgeSearchResultItem> results;

     public EdgeSearchResults() {
-        results = new HashMap<>();
+        results = new ArrayList<>();
     }

     public int size() {
-        return results.values().stream().mapToInt(List::size).sum();
+        return results.size();
     }

     public Stream<EdgeSearchResultItem> stream() {
-        return results.values().stream().flatMap(List::stream);
-    }
-
-    public List<EdgeSearchResultItem> getAllItems() {
-        return stream().collect(Collectors.toList());
+        return results.stream();
     }
 }
@@ -94,7 +94,7 @@ public class EdgeUrlDetails {
     }

     public double getRanking() {
-        double lengthAdjustment = Math.max(1, words / (words + 1000.));
+        double lengthAdjustment = Math.max(1, words / (words + 10000.));
         return getFeatureScore()*Math.sqrt(1+rankingId)/Math.max(1E-10, lengthAdjustment *(0.7+0.3*Math.exp(urlQualityAdjustment.getScore())));
     }

@@ -132,6 +132,7 @@ public class EdgeUrlDetails {
     public boolean isCookies() {
         return HtmlFeature.hasFeature(features, HtmlFeature.COOKIES);
     }
+    public boolean isUnknown() { return HtmlFeature.hasFeature(features, HtmlFeature.UNKNOWN); }
     public boolean isAds() { return HtmlFeature.hasFeature(features, HtmlFeature.ADVERTISEMENT); }

     public boolean isSpecialDomain() {
@@ -39,6 +39,7 @@ import javax.annotation.Nullable;
 import java.util.*;
 import java.util.concurrent.Future;
 import java.util.concurrent.TimeUnit;
+import java.util.regex.Pattern;
 import java.util.stream.Collectors;

 @Singleton
@@ -236,6 +237,8 @@ public class EdgeSearchOperator {
     }


+    private final Pattern titleSplitPattern = Pattern.compile("[:!|./]|(\\s-|-\\s)|\\s{2,}");
+
     private EdgePageScoreAdjustment adjustScoreBasedOnQuery(EdgeUrlDetails p, EdgeSearchSpecification specs) {
         String titleLC = p.title == null ? "" : p.title.toLowerCase();
         String descLC = p.description == null ? "" : p.description.toLowerCase();
@@ -248,11 +251,16 @@ public class EdgeSearchOperator {
                 .toArray(String[]::new);
         int termCount = searchTermsLC.length;

-        String[] titleParts = titleLC.split("[:!|./]|(\\s-|-\\s)|\\s{2,}");
         double titleHitsAdj = 0.;
+        final String[] titleParts = titleSplitPattern.split(titleLC);
         for (String titlePart : titleParts) {
-            titleHitsAdj += Arrays.stream(searchTermsLC).filter(titlePart::contains).mapToInt(String::length).sum()
-                    / (double) Math.max(1, titlePart.trim().length());
+            double hits = 0;
+            for (String term : searchTermsLC) {
+                if (titlePart.contains(term)) {
+                    hits += term.length();
+                }
+            }
+            titleHitsAdj += hits / Math.max(1, titlePart.length());
         }

         double titleFullHit = 0.;
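The rewritten loop scores each title segment by the fraction of its characters covered by matching query terms, so short segments that are mostly query words dominate the adjustment. A rough worked example with a hypothetical query and title:

    // terms: ["linear", "algebra"]; title: "Linear Algebra | MIT OpenCourseWare"
    // titleSplitPattern splits on '|' -> ["linear algebra ", " mit opencourseware"]
    // segment 1: hits = 6 + 7 = 13, length 15 -> 13/15 ≈ 0.87
    // segment 2: hits = 0                     -> 0.0
    // titleHitsAdj ≈ 0.87
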
@@ -299,10 +307,8 @@ public class EdgeSearchOperator {
         logger.debug("{}", resultSet);

         for (IndexBlock block : indexBlockSearchOrder) {
-            for (var results : resultSet.resultsList.getOrDefault(block, Collections.emptyList())) {
-                var items = results.getAllItems();
-                queryResults.append(100, resultDecorator.decorateSearchResults(items, block, deduplicator));
-            }
+            queryResults.append(100, resultDecorator.decorateSearchResults(resultSet.resultsList.getOrDefault(block, Collections.emptyList()),
+                    block, deduplicator));
         }
     }

@@ -10,31 +10,31 @@ import java.util.stream.Collectors;

 public enum EdgeSearchProfile {
     DEFAULT("default",
-            List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Tfidf_Top, IndexBlock.Link,
+            List.of(IndexBlock.Title, IndexBlock.Tfidf_Top, IndexBlock.Tfidf_Middle, IndexBlock.Link,
                     IndexBlock.Words_1, IndexBlock.Words_2, IndexBlock.Words_4, IndexBlock.Words_8, IndexBlock.Words_16Plus
             ),
             0, 1),
     MODERN("modern",
-            List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Tfidf_Top, IndexBlock.Tfidf_Middle, IndexBlock.Link, IndexBlock.NamesWords,
+            List.of(IndexBlock.Title, IndexBlock.Tfidf_Top, IndexBlock.Tfidf_Middle, IndexBlock.Link, IndexBlock.NamesWords,
                     IndexBlock.Words_1, IndexBlock.Words_2, IndexBlock.Words_4, IndexBlock.Words_8, IndexBlock.Words_16Plus
             ),
             2),
     CORPO("corpo",
-            List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Tfidf_Top, IndexBlock.Tfidf_Lower, IndexBlock.Link, IndexBlock.NamesWords,
+            List.of(IndexBlock.Title, IndexBlock.Tfidf_Top, IndexBlock.Tfidf_Middle, IndexBlock.Tfidf_Lower, IndexBlock.Link, IndexBlock.NamesWords,
                     IndexBlock.Words_1, IndexBlock.Words_2, IndexBlock.Words_4, IndexBlock.Words_8, IndexBlock.Words_16Plus),
             4, 5, 7),
     YOLO("yolo",
-            List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Tfidf_Top, IndexBlock.Tfidf_Lower, IndexBlock.Link, IndexBlock.NamesWords,
+            List.of(IndexBlock.Title, IndexBlock.Tfidf_Top, IndexBlock.Tfidf_Lower, IndexBlock.Link, IndexBlock.NamesWords,
                     IndexBlock.Words_1, IndexBlock.Words_2, IndexBlock.Words_4, IndexBlock.Words_8, IndexBlock.Words_16Plus),
             0, 2, 1, 3, 4, 6),
     CORPO_CLEAN("corpo-clean",
-            List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Tfidf_Top, IndexBlock.Tfidf_Middle, IndexBlock.Tfidf_Lower, IndexBlock.Link, IndexBlock.NamesWords),
+            List.of(IndexBlock.Title, IndexBlock.Tfidf_Top, IndexBlock.Tfidf_Middle, IndexBlock.Tfidf_Lower, IndexBlock.Link, IndexBlock.NamesWords),
             4, 5),
     ACADEMIA("academia",
-            List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Tfidf_Top, IndexBlock.Tfidf_Middle, IndexBlock.Tfidf_Lower, IndexBlock.Link, IndexBlock.NamesWords),
+            List.of( IndexBlock.Title, IndexBlock.Tfidf_Top, IndexBlock.Tfidf_Middle, IndexBlock.Tfidf_Lower, IndexBlock.Link, IndexBlock.NamesWords),
             3),
     FOOD("food",
-            List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Tfidf_Top, IndexBlock.Tfidf_Middle, IndexBlock.Tfidf_Lower, IndexBlock.Link, IndexBlock.Words_1, IndexBlock.NamesWords),
+            List.of(IndexBlock.Title, IndexBlock.Tfidf_Top, IndexBlock.Tfidf_Middle, IndexBlock.Tfidf_Lower, IndexBlock.Link, IndexBlock.Words_1, IndexBlock.NamesWords),
             2, 0),
     ;

@@ -1,7 +1,7 @@
 package nu.marginalia.wmsa.edge.search.query;

 import com.google.inject.Inject;
-import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict;
+import nu.marginalia.wmsa.edge.assistant.dict.NGramBloomFilter;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -13,12 +13,12 @@ import java.util.stream.Collectors;

 public class EnglishDictionary {
     private final Set<String> englishWords = new HashSet<>();
-    private final TermFrequencyDict dict;
+    private final NGramBloomFilter bloomFilter;
     private final Logger logger = LoggerFactory.getLogger(getClass());

     @Inject
-    public EnglishDictionary(TermFrequencyDict dict) {
-        this.dict = dict;
+    public EnglishDictionary(NGramBloomFilter bloomFilter) {
+        this.bloomFilter = bloomFilter;
         try (var resource = Objects.requireNonNull(ClassLoader.getSystemResourceAsStream("dictionary/en-words"),
                 "Could not load word frequency table");
              var br = new BufferedReader(new InputStreamReader(resource))
@@ -44,10 +44,9 @@ public class EnglishDictionary {

     public Collection<String> getWordVariants(String s) {
         var variants = findWordVariants(s);
-        long freqBaseline = dict.getTermFreq(s);

         var ret = variants.stream()
-                .filter(var -> freqBaseline*10 > dict.getTermFreq(var) && freqBaseline/10 < dict.getTermFreq(var)
+                .filter(bloomFilter::isKnownNGram
                 ).collect(Collectors.toList());

         if (s.equals("recipe") || s.equals("recipes")) {
@@ -130,7 +130,7 @@ public class QueryFactory {
             }
         }

-        EdgeSearchSubquery subquery = new EdgeSearchSubquery(searchTermsInclude, searchTermsExclude, IndexBlock.TitleKeywords);
+        EdgeSearchSubquery subquery = new EdgeSearchSubquery(searchTermsInclude, searchTermsExclude, IndexBlock.Title);

         params.profile().addTacitTerms(subquery);
         params.jsSetting().addTacitTerms(subquery);
@@ -101,13 +101,27 @@ public class SearchResultDecorator {
         if (!missedIds.isEmpty()) {
             logger.warn("Could not look up documents: {}", missedIds.toArray());
         }
-        retList.sort(Comparator.comparing(EdgeUrlDetails::getTermScore));
+        retList.sort(Comparator.comparing(EdgeUrlDetails::getTermScore)
+                .thenComparing(url -> url.url.path.length()));
         return retList;
     }

     private double calculateTermScore(IndexBlock block, EdgeSearchResultItem resultItem, EdgeUrlDetails details) {
-        return valuator.evaluateTerms(resultItem.scores, block, details.words) / Math.sqrt(1 + resultItem.queryLength)
+        int titleLength = details.title.length();
+
+        double value = valuator.evaluateTerms(resultItem.scores, block, details.words,titleLength) / Math.sqrt(1 + resultItem.queryLength)
                 + ((details.domainState == EdgeDomainIndexingState.SPECIAL) ? 1.25 : 0);
+
+        System.out.println("---");
+        System.out.println(details.getUrl());
+        System.out.println(details.getTitle());
+        System.out.println(details.words);
+        for (var score : resultItem.scores) {
+            System.out.println(block + ":" + score);
+        }
+        System.out.println(value);
+
+        return value;
     }

 }
@@ -16,8 +16,8 @@ public class SearchResultValuator {

     private static final Pattern separator = Pattern.compile("_");

-    private static final int MIN_LENGTH = 500;
-    private static final int AVG_LENGTH = 1400;
+    private static final int MIN_LENGTH = 2000;
+    private static final int AVG_LENGTH = 5000;

     @Inject
     public SearchResultValuator(TermFrequencyDict dict) {
@@ -26,58 +26,85 @@ public class SearchResultValuator {


     // This is basically a bargain bin BM25
-    public double evaluateTerms(List<EdgeSearchResultKeywordScore> rawScores, IndexBlock block, int length) {
-        EdgeSearchResultKeywordScore[] scores = rawScores.stream().filter(w -> !w.keyword.contains(":")).toArray(EdgeSearchResultKeywordScore[]::new);
+    public double evaluateTerms(List<EdgeSearchResultKeywordScore> rawScores, IndexBlock block, int length, int titleLength) {
+        int sets = 1 + rawScores.stream().mapToInt(EdgeSearchResultKeywordScore::set).max().orElse(0);

-        if (scores.length == 0) {
-            return IndexBlock.Words_1.sortOrder;
-        }
+        double bestScore = 1000;
+        double bestLtsFactor = 1.;

-        final double[] weights = getTermWeights(scores);
-        final double lengthPenalty = getLengthPenalty(length);
+        for (int set = 0; set <= sets; set++) {
+            int thisSet = set;
+            EdgeSearchResultKeywordScore[] scores = rawScores.stream().filter(w -> w.set() == thisSet && !w.keyword().contains(":")).toArray(EdgeSearchResultKeywordScore[]::new);

-        double termSum = 0.;
-        double factorSum = 0.;
-
-        for (int i = 0; i < scores.length; i++) {
-
-            final double factor = 1. / (1.0 + weights[i]);
-
-            factorSum += factor;
-
-            double termValue = (scores[i].index.sortOrder + 0.5) * factor;
-
-            if (!scores[i].link && !scores[i].title) {
-                termValue *= lengthPenalty;
-            }
-            else if (scores[i].link) {
-                termValue /= 4.75;
+            if (scores.length == 0) {
+                continue;
             }

-            termSum += termValue;
+            final double[] weights = getTermWeights(scores);
+            final double lengthPenalty = getLengthPenalty(length);
+
+            double termSum = 0.;
+            double factorSum = 0.;
+
+            double ltsFactor = 1.0;
+
+            for (int i = 0; i < scores.length; i++) {
+
+                final double factor = 1. / (1.0 + weights[i]);
+
+                factorSum += factor;
+
+                double termValue = (scores[i].index().sortOrder + 0.5) * factor;
+
+                termValue /= lengthPenalty;
+
+                if (scores[i].link()) {
+                    ltsFactor *= Math.pow(0.5, 1. / scores.length);
+                }
+                if (scores[i].title()) {
+                    if (titleLength <= 64) {
+                        ltsFactor *= Math.pow(0.5, 1. / scores.length);
+                    }
+                    else if (titleLength < 96) {
+                        ltsFactor *= Math.pow(0.75, 1. / scores.length);
+                    }
+                    else {
+                        ltsFactor *= Math.pow(0.9, 1. / scores.length);
+                    }
+                }
+                if (scores[i].subject()) {
+                    ltsFactor *= Math.pow(0.8, 1. / scores.length);
+                }
+
+                termSum += termValue;
+            }
+
+            assert factorSum != 0;
+
+            double value = termSum / factorSum;
+
+            bestLtsFactor = Math.min(bestLtsFactor, ltsFactor);
+            bestScore = Math.min(bestScore, value);
         }

-        assert factorSum != 0 ;
-
-        if (block == IndexBlock.Title || block == IndexBlock.TitleKeywords) {
-            return block.sortOrder + (termSum / factorSum) / 5;
-        }
-
-        return termSum / factorSum;
+        return (0.7+0.3*block.sortOrder)*bestScore * bestLtsFactor;
     }

     private double getLengthPenalty(int length) {
         if (length < MIN_LENGTH) {
             length = MIN_LENGTH;
         }
-        return (0.7 + 0.3 * length / AVG_LENGTH);
+        if (length > AVG_LENGTH) {
+            length = AVG_LENGTH;
+        }
+        return (0.5 + 0.5 * length / AVG_LENGTH);
     }

     private double[] getTermWeights(EdgeSearchResultKeywordScore[] scores) {
         double[] weights = new double[scores.length];

         for (int i = 0; i < scores.length; i++) {
-            String[] parts = separator.split(scores[i].keyword);
+            String[] parts = separator.split(scores[i].keyword());
             double sumScore = 0.;
||||
|
||||
int count = 0;
|
||||
|
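The reworked valuator now scores each keyword set separately, keeps the best (lowest) set value, scales it by the accumulated link/title/subject factor, and folds in the block's sort order via (0.7 + 0.3 * sortOrder). The length handling is the easiest piece to sanity-check in isolation: document length is clamped to the [MIN_LENGTH, AVG_LENGTH] range of 2000 to 5000 and mapped linearly onto [0.7, 1.0], and since each term value is divided by this factor, very short documents end up with a larger (worse) value. A standalone check of those numbers, reusing the constants from the hunk above:

class LengthPenaltySketch {
    private static final int MIN_LENGTH = 2000;
    private static final int AVG_LENGTH = 5000;

    // Mirrors the new getLengthPenalty: clamp the length, then map it linearly onto [0.7, 1.0].
    static double getLengthPenalty(int length) {
        if (length < MIN_LENGTH) length = MIN_LENGTH;
        if (length > AVG_LENGTH) length = AVG_LENGTH;
        return 0.5 + 0.5 * length / AVG_LENGTH;
    }

    public static void main(String[] args) {
        System.out.println(getLengthPenalty(500));    // 0.7  -> termValue / 0.7 is larger, i.e. penalized
        System.out.println(getLengthPenalty(3500));   // 0.85
        System.out.println(getLengthPenalty(20_000)); // 1.0  -> no extra credit beyond AVG_LENGTH
    }
}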
@@ -3,21 +3,35 @@ package nu.marginalia.wmsa.edge.tools;
import com.google.inject.Guice;
import com.google.inject.Inject;
import com.google.inject.Injector;
import nu.marginalia.util.language.processing.SentenceExtractor;
import nu.marginalia.wmsa.configuration.WmsaHome;
import nu.marginalia.wmsa.edge.converting.ConverterModule;
import nu.marginalia.wmsa.edge.converting.processor.DomainProcessor;
import nu.marginalia.wmsa.edge.converting.processor.logic.DomPruner;
import nu.marginalia.wmsa.edge.converting.processor.logic.topic.RecipeDetector;
import nu.marginalia.wmsa.edge.converting.processor.logic.topic.TextileCraftDetector;
import nu.marginalia.wmsa.edge.converting.processor.logic.topic.WoodworkingDetector;
import nu.marginalia.wmsa.edge.crawling.CrawlPlanLoader;
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
import nu.marginalia.wmsa.edge.model.EdgeCrawlPlan;
import org.jsoup.Jsoup;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.nio.file.Path;
import java.util.concurrent.ForkJoinPool;

public class ConverterLogicTestTool {

    private final Logger logger = LoggerFactory.getLogger(getClass());

    DomPruner domPruner = new DomPruner();
    RecipeDetector recipeDetector = new RecipeDetector();
    WoodworkingDetector woodworkingDetector = new WoodworkingDetector();
    TextileCraftDetector textileCraftDetector = new TextileCraftDetector();

    SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels());

    public static void main(String... args) throws IOException {

        if (args.length != 1) {
@@ -38,19 +52,42 @@ public class ConverterLogicTestTool {
            EdgeCrawlPlan plan,
            DomainProcessor processor
    ) throws Exception {
        var cp = new ForkJoinPool(16);

        plan.forEachCrawledDomain(domain -> {
            var ret = processor.process(domain);
            ret.documents.forEach(doc -> {
                if (doc.words == null)
                    return;
                var artifacts = doc.words.get(IndexBlock.Artifacts);
                if (artifacts.size() > 0) {
                    System.out.println(doc.url + ": " + artifacts);
                }
            });
        });
            if (domain.doc == null) return;

            for (var doc : domain.doc) {
                if (doc.documentBody == null) continue;

                Runnable task = () -> {
                    var parsed = Jsoup.parse(doc.documentBody);

                    domPruner.prune(parsed, 0.5);
                    var dld = se.extractSentences(parsed);

                    if (dld.totalNumWords() < 250)
                        return;

                    if (textileCraftDetector.testP(dld) > 0.3) {
                        System.out.println("textilecraft\t" + doc.url);
                    }
                    if (woodworkingDetector.testP(dld) > 0.2) {
                        System.out.println("woodworking\t" + doc.url);
                    }
                    if (recipeDetector.testP(dld) > 0.5) {
                        System.out.println("recipe\t" + doc.url);
                    }
                };

                if (cp.getQueuedSubmissionCount() > 32) {
                    task.run();
                } else {
                    cp.execute(task);
                }
            }
        });
    }

}
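The submission logic at the end of the hunk is a simple backpressure measure: when the ForkJoinPool already has more than 32 queued submissions, the task runs inline on the calling thread instead of being queued, which keeps the queue bounded while crawled domains stream in. A self-contained sketch of that pattern follows; the pool size and threshold come from the diff, the work itself is a placeholder:

import java.util.concurrent.ForkJoinPool;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicLong;

class BoundedSubmitSketch {
    public static void main(String[] args) throws InterruptedException {
        var cp = new ForkJoinPool(16);
        var processed = new AtomicLong();

        for (int i = 0; i < 1_000; i++) {
            Runnable task = processed::incrementAndGet; // placeholder for the per-document analysis

            if (cp.getQueuedSubmissionCount() > 32) {
                task.run();        // queue already deep: run inline on the submitting thread
            } else {
                cp.execute(task);  // otherwise hand the work to the pool
            }
        }

        cp.shutdown();
        cp.awaitTermination(1, TimeUnit.MINUTES);
        System.out.println(processed.get()); // 1000
    }
}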
@@ -151,15 +151,6 @@ i've
it's
it
i'm
1
2
3
4
5
6
7
8
9
.
..
...
@@ -42,7 +42,7 @@ class SqlLoadDomainLinksTest {
    @Test
    public void loadDomainLinks() {
        var loader = new SqlLoadDomainLinks(dataSource);
        loader.load(new DomainLink[] { new DomainLink(new EdgeDomain("www.marginalia.nu"), new EdgeDomain("memex.marginalia.nu")) });
        loader.load(loaderData, new DomainLink[] { new DomainLink(new EdgeDomain("www.marginalia.nu"), new EdgeDomain("memex.marginalia.nu")) });
    }

}
@@ -113,38 +113,13 @@ class SentenceExtractorTest {
        var dict = new TermFrequencyDict(lm);

        DocumentKeywordExtractor documentKeywordExtractor = new DocumentKeywordExtractor(dict);

        // documentKeywordExtractorLegacy.setLegacy(true);

        // for (;;) {
        long st = System.currentTimeMillis();
        for (var file : Objects.requireNonNull(data.toFile().listFiles())) {

            var newResult = newSe.extractSentences(Jsoup.parse(Files.readString(file.toPath())));

            var newRes = documentKeywordExtractor.extractKeywords(newResult);

            // var legacyRes = documentKeywordExtractorLegacy.extractKeywords(newResult);
            //
            // EdgePageWordSet difference = new EdgePageWordSet();
            // for (IndexBlock block : IndexBlock.values()) {
            //     var newWords = new HashSet<>(newRes.get(block).words);
            //     var oldWords = new HashSet<>(legacyRes.get(block).words);
            //     newWords.removeAll(oldWords);
            //     if (!newWords.isEmpty()) {
            //         difference.append(block, newWords);
            //     }
            // }
            // System.out.println(difference);
            System.out.println(newRes);
            // System.out.println("---");
        }
        System.out.println(System.currentTimeMillis() - st);
        // }
        long st = System.currentTimeMillis();
        for (var file : Objects.requireNonNull(data.toFile().listFiles())) {
            var newResult = newSe.extractSentences(Jsoup.parse(Files.readString(file.toPath())));
            var newRes = documentKeywordExtractor.extractKeywords(newResult);
            System.out.println(newRes);
        }
        System.out.println(System.currentTimeMillis() - st);

    }

@@ -1,156 +0,0 @@
package nu.marginalia.wmsa.edge.index.service;

import com.zaxxer.hikari.HikariDataSource;
import lombok.SneakyThrows;
import nu.marginalia.util.TestUtil;
import nu.marginalia.wmsa.configuration.server.Context;
import nu.marginalia.wmsa.configuration.server.Initialization;
import nu.marginalia.wmsa.edge.index.EdgeIndexService;
import nu.marginalia.wmsa.edge.index.IndexServicesFactory;
import nu.marginalia.wmsa.edge.index.client.EdgeIndexClient;
import nu.marginalia.wmsa.edge.index.conversion.SearchIndexPartitioner;
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
import nu.marginalia.wmsa.edge.index.reader.SearchIndexes;
import nu.marginalia.wmsa.edge.model.EdgeId;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet;
import nu.marginalia.wmsa.edge.model.crawl.EdgePageWords;
import nu.marginalia.wmsa.edge.model.search.EdgeSearchSpecification;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Tag;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.parallel.Execution;
import org.junit.jupiter.api.parallel.ExecutionMode;
import org.junit.jupiter.api.parallel.ResourceAccessMode;
import org.junit.jupiter.api.parallel.ResourceLock;
import spark.Spark;

import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;

import static nu.marginalia.util.TestUtil.getConnection;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;

@ResourceLock(value = "mariadb", mode = ResourceAccessMode.READ_WRITE)
@Execution(ExecutionMode.SAME_THREAD)
@Tag("db")
public class EdgeIndexClientTest {
    private static HikariDataSource dataSource;
    private static EdgeIndexService service;
    private static EdgeIndexClient client;
    private static Path tempDir;
    private static SearchIndexes indexes;

    @SneakyThrows
    public static HikariDataSource provideConnection() {
        return getConnection();
    }

    static final int testPort = TestUtil.getPort();

    @SneakyThrows
    @BeforeAll
    public static void setUpClass() {
        Spark.port(testPort);
        System.setProperty("service-name", "edge-index");

        dataSource = provideConnection();
        dataSource.setKeepaliveTime(100);
        dataSource.setIdleTimeout(100);
        client = new EdgeIndexClient();
        client.setServiceRoute("127.0.0.1", testPort);

        tempDir = Files.createTempDirectory("EdgeIndexClientTest");

        var servicesFactory = new IndexServicesFactory(tempDir,tempDir,tempDir,tempDir,
                "writer-index",
                "writer-dictionary",
                "index-words-read",
                "index-urls-read",
                "index-words-write",
                "index-urls-write",
                1L<<24,
                id->false,
                new SearchIndexPartitioner(null)
        );

        var init = new Initialization();
        indexes = new SearchIndexes(servicesFactory, new SearchIndexPartitioner(null));
        service = new EdgeIndexService("127.0.0.1",
                testPort,
                init, null,
                indexes,
                servicesFactory);

        Spark.awaitInitialization();
        init.setReady();
    }

    @Test
    public void testMultiBucketHit() {
        putWords(1, 1, -2, "fancy", "anagram", "dilbert", "whoah", "engram");
        putWords(2, 2, -5, "quibble", "angry", "whoah", "fancy");
        putWords(3, 3, -0.01, "strong", "manly", "muscles");
        indexes.repartition();
        indexes.preconvert();
        indexes.reindexAll();

        var results = client.query(Context.internal(), EdgeSearchSpecification.justIncludes("fancy")).resultsList.get(IndexBlock.Title).get(0).results;
        System.out.println(results);
        List<EdgeId<EdgeUrl>> flatResults = results.values().stream().flatMap(List::stream).map(rs -> rs.url).collect(Collectors.toList());

        assertEquals(2, flatResults.size());
        assertTrue(flatResults.contains(new EdgeId<EdgeUrl>(1)));
        assertTrue(flatResults.contains(new EdgeId<EdgeUrl>(2)));
    }

    @Test
    public void testHighHit() {
        putWords(2, 5, -100, "trapphus");
        indexes.repartition();
        indexes.preconvert();
        indexes.reindexAll();
        var rsp = client.query(Context.internal(), EdgeSearchSpecification.justIncludes("trapphus"));
        System.out.println(rsp);
        assertEquals(5, rsp.resultsList.get(IndexBlock.Title).get(0).results.get(0).get(0).url.id());
    }


    @Test
    public void testSearchDomain() {
        putWords(8, 1, -2, "domain");
        putWords(8, 2, -5, "domain");
        putWords(10, 3, -0.01, "domain");
        putWords(11, 3, -0.01, "domain");
        putWords(12, 3, -0.01, "domain");
        indexes.repartition();
        indexes.preconvert();
        indexes.reindexAll();

        var results = client.query(Context.internal(), EdgeSearchSpecification.justIncludes("fancy")).resultsList.get(IndexBlock.Title).get(0).results;
        System.out.println(results);
        List<EdgeId<EdgeUrl>> flatResults = results.values().stream().flatMap(List::stream).map(rs -> rs.url).collect(Collectors.toList());

        assertEquals(2, flatResults.size());
        assertTrue(flatResults.contains(new EdgeId<EdgeUrl>(1)));
        assertTrue(flatResults.contains(new EdgeId<EdgeUrl>(2)));
    }

    void putWords(int didx, int idx, double quality, String... words) {
        EdgePageWords epw = new EdgePageWords(IndexBlock.Title);
        epw.addAll(Arrays.asList(words));
        client.putWords(Context.internal(), new EdgeId<>(didx), new EdgeId<>(idx),
                new EdgePageWordSet(epw), 0).blockingSubscribe();
    }

    @AfterAll
    public static void tearDownClass() {
        nu.marginalia.util.test.TestUtil.clearTempDir(tempDir);
    }

}
@@ -24,7 +24,7 @@ class BodyQueryParserTest {
    public static void init() throws IOException {
        dict = new TermFrequencyDict(lm);
        nGramBloomFilter = new NGramBloomFilter(lm);
        englishDictionary = new EnglishDictionary(dict);
        englishDictionary = new EnglishDictionary(nGramBloomFilter);
    }

    @BeforeEach
@@ -1,17 +0,0 @@
package nu.marginalia.wmsa.edge.search.query;

import nu.marginalia.util.TestLanguageModels;
import nu.marginalia.util.language.conf.LanguageModels;
import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict;
import org.junit.jupiter.api.Test;

class EnglishDictionaryTest {

    @Test
    void getWordVariants() {
        LanguageModels lm = TestLanguageModels.getLanguageModels();

        var dict = new TermFrequencyDict(lm);
        new EnglishDictionary(dict).getWordVariants("dos").forEach(System.out::println);
    }
}
@@ -21,7 +21,7 @@ class QueryParserTest {
    public void setUp() throws IOException {
        dict = new TermFrequencyDict(lm);
        nGramBloomFilter = new NGramBloomFilter(lm);
        englishDictionary = new EnglishDictionary(dict);
        englishDictionary = new EnglishDictionary(nGramBloomFilter);

        parser = new QueryParser(englishDictionary, new QueryVariants(lm, dict, nGramBloomFilter, englishDictionary));
    }
@@ -23,8 +23,8 @@ class QueryVariantsTest {

        var dict = new TermFrequencyDict(lm);
        var ngrams = new NGramBloomFilter(lm);
        variants = new QueryVariants(lm, dict, ngrams, new EnglishDictionary(dict));
        parser = new QueryParser(new EnglishDictionary(dict), variants);
        variants = new QueryVariants(lm, dict, ngrams, new EnglishDictionary(ngrams));
        parser = new QueryParser(new EnglishDictionary(ngrams), variants);
    }

    @Test