Tweaks for search result relevance

vlofgren 2022-09-02 09:34:20 +02:00
parent 813399401e
commit 5f993c72dd
43 changed files with 539 additions and 574 deletions

View File

@ -63,7 +63,7 @@ public class WordPatterns {
if (word.isBlank()) {
return false;
}
if (hasMoreThanTwo(word, '-', 2)) {
if (hasMoreThanTwo(word, '-', 4)) {
return false;
}
if (hasMoreThanTwo(word, '+', 2)) {
@ -80,7 +80,7 @@ public class WordPatterns {
if (Character.isDigit(word.charAt(i))) {
numDigits++;
}
if (numDigits > 6)
if (numDigits > 16)
return false;
}
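The hunk above relaxes the token filter: a word may now contain up to four hyphens (previously two) and up to sixteen digits (previously six) before being rejected. Below is a standalone sketch of the new thresholds; only the limits come from the diff, the class and helper names are made up stand-ins for WordPatterns.hasMoreThanTwo.

public class TokenFilterSketch {
    // Stand-in for WordPatterns.hasMoreThanTwo(word, c, limit)
    static boolean exceedsCount(String word, char c, int limit) {
        int n = 0;
        for (int i = 0; i < word.length(); i++) {
            if (word.charAt(i) == c && ++n > limit) return true;
        }
        return false;
    }

    static boolean isAcceptable(String word) {
        if (word.isBlank()) return false;
        if (exceedsCount(word, '-', 4)) return false;  // was 2 before this commit
        if (exceedsCount(word, '+', 2)) return false;  // unchanged
        int digits = 0;
        for (int i = 0; i < word.length(); i++) {
            if (Character.isDigit(word.charAt(i)) && ++digits > 16) return false;  // was 6
        }
        return true;
    }

    public static void main(String[] args) {
        System.out.println(isAcceptable("state-of-the-art-ish"));        // 4 hyphens: now accepted
        System.out.println(isAcceptable("192.168.0.1.10.20.30.40.50"));  // 18 digits: still rejected
    }
}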

View File

@ -5,8 +5,8 @@ import java.util.regex.Pattern;
public class AsciiFlattener {
private static final Pattern nonAscii = Pattern.compile("[^a-zA-Z0-9_.'+@#:]+");
private static final Pattern plainAsciiPattern = Pattern.compile("^[a-zA-Z0-9_.'+@#:]+$");
private static final Pattern nonAscii = Pattern.compile("[^a-zA-Z0-9_.'+@#:\\-]+");
private static final Pattern plainAsciiPattern = Pattern.compile("^[a-zA-Z0-9_.'+@#:\\-]+$");
private static final Predicate<String> plainAscii = plainAsciiPattern.asMatchPredicate();
public static String flattenUnicode(String s) {
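Adding \- to both character classes means hyphenated ASCII tokens now count as plain ASCII and pass through the flattener untouched. A quick check against the new pattern, using only java.util.regex, nothing project-specific:

import java.util.regex.Pattern;

public class PlainAsciiCheck {
    public static void main(String[] args) {
        Pattern plainAscii = Pattern.compile("^[a-zA-Z0-9_.'+@#:\\-]+$");
        System.out.println(plainAscii.matcher("e-mail").matches()); // true with the widened class
        System.out.println(plainAscii.matcher("naïve").matches());  // false; still goes through flattening
    }
}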

View File

@ -1,6 +1,5 @@
package nu.marginalia.util.language.processing;
import com.google.common.collect.Sets;
import nu.marginalia.util.language.WordPatterns;
import nu.marginalia.util.language.processing.model.DocumentLanguageData;
import nu.marginalia.util.language.processing.model.WordRep;
@ -12,7 +11,6 @@ import nu.marginalia.wmsa.edge.model.crawl.EdgePageWords;
import javax.inject.Inject;
import java.util.*;
import java.util.stream.Collectors;
import java.util.stream.Stream;
public class DocumentKeywordExtractor {
@ -42,7 +40,7 @@ public class DocumentKeywordExtractor {
List<WordRep> titleWords = extractTitleWords(documentLanguageData);
KeywordCounter.WordHistogram wordsTfIdf = tfIdfCounter.countHisto(documentLanguageData);
List<WordRep> wordsNamesAll = nameCounter.count(documentLanguageData, 1);
List<WordRep> wordsNamesAll = nameCounter.count(documentLanguageData, 2);
List<WordRep> subjects = subjectCounter.count(documentLanguageData);
List<WordRep> midKeywords = new ArrayList<>(wordsTfIdf.mid());
@ -190,30 +188,7 @@ public class DocumentKeywordExtractor {
.collect(Collectors.toList());
}
private Collection<WordRep> joinWordLists(List<WordRep>... words) {
int size = 0;
for (var lst : words) {
size += lst.size();
}
if (size == 0)
return Collections.emptyList();
final LinkedHashSet<WordRep> ret = new LinkedHashSet<>(size);
for (var lst : words) {
ret.addAll(lst);
}
return ret;
}
public EdgePageWords createWords(IndexBlock block, Collection<WordRep> words) {
return new EdgePageWords(block, words.stream().map(w -> w.word).map(AsciiFlattener::flattenUnicode).filter(WordPatterns.wordQualitiesPredicate).collect(Collectors.toSet()));
}
private Set<WordRep> overlappingStems(Collection<WordRep> wordsA, Collection<WordRep> wordsB) {
Set<String> stemmedA = wordsA.stream().map(WordRep::getStemmed).collect(Collectors.toSet());
Set<String> stemmedB = wordsB.stream().map(WordRep::getStemmed).collect(Collectors.toSet());
Set<String> stemmedIntersect = Sets.intersection(stemmedA, stemmedB);
return Stream.concat(wordsA.stream(), wordsB.stream()).filter(w -> stemmedIntersect.contains(w.getStemmed())).collect(Collectors.toSet());
}
}

View File

@ -1,5 +1,6 @@
package nu.marginalia.util.language.processing;
import nu.marginalia.util.language.WordPatterns;
import nu.marginalia.util.language.processing.model.DocumentLanguageData;
import nu.marginalia.util.language.processing.model.WordRep;
import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict;
@ -22,17 +23,20 @@ public class KeywordCounter {
}
public WordHistogram countHisto(DocumentLanguageData dld) {
HashMap<String, Double> counts = new HashMap<>(1000);
HashMap<String, Integer> counts = new HashMap<>(1000);
HashMap<String, HashSet<WordRep>> instances = new HashMap<>(1000);
for (var sent : dld.sentences) {
var keywords = keywordExtractor.getKeywordsFromSentence(sent);
for (var span : keywords) {
if (span.size() == 1 &&
WordPatterns.isStopWord(sent.words[span.start]))
continue;
String stemmed = sent.constructStemmedWordFromSpan(span);
counts.merge(stemmed, 1., Double::sum);
counts.merge(stemmed, 1, Integer::sum);
instances.computeIfAbsent(stemmed, k -> new HashSet<>()).add(new WordRep(sent, span));
}
}
@ -43,15 +47,23 @@ public class KeywordCounter {
Set<WordRep> h10 = new HashSet<>();
Set<WordRep> h15 = new HashSet<>();
int doubleWordCount = 0;
for (var entry : counts.entrySet()) {
double value = getTermValue(entry, maxC);
double avgCnt = entry.getValue();
String wordStemmed = entry.getKey();
Set<WordRep> histogram;
if (value < -3) histogram = h15;
else if (value < -2) histogram = h10;
else if (value < -1) histogram = h5;
if (value < -3 && avgCnt>1) histogram = h15;
else if (value < -1.75 && avgCnt>1) histogram = h10;
else if (value < -1 &&
(!wordStemmed.contains("_") || doubleWordCount++ < 50))
histogram = h5;
else continue;
histogram.addAll(instances.get(entry.getKey()));
histogram.addAll(instances.get(wordStemmed));
}
return new WordHistogram(h5, h10, h15);
@ -59,7 +71,7 @@ public class KeywordCounter {
private static final Pattern separator = Pattern.compile("_");
public double getTermValue(Map.Entry<String, Double> e, double maxValue) {
public double getTermValue(Map.Entry<String, Integer> e, double maxValue) {
String[] parts = separator.split(e.getKey());
double totalValue = 0.;
for (String part : parts) {
@ -71,9 +83,9 @@ public class KeywordCounter {
double value(String key, double value, double maxValue) {
double freq = dict.getTermFreqStemmed(key);
if (freq < 1) {
freq = 10;
freq = 1;
}
return (0.1 + 0.9*value/maxValue) * Math.log((1.1+freq)/docCount);
return (0.1 + 0.9*value/maxValue) * Math.log(freq/docCount);
}
public record WordHistogram(Set<WordRep> lower, Set<WordRep> mid, Set<WordRep> top) { }
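The histogram now accumulates integer counts, floors unseen corpus terms at a frequency of 1 instead of 10, and drops the 1.1 smoothing term inside the logarithm, all of which push rare terms toward more negative scores. A worked example of the inner scoring function for a single-word term; docCount and the sample numbers are assumptions, only the formula and the bucket thresholds come from the diff:

public class TermValueExample {
    public static void main(String[] args) {
        double docCount = 1_000_000;  // assumed size of the term frequency dictionary
        double maxValue = 10;         // highest raw count among this document's terms
        double value    = 4;          // raw count of the term being scored
        double freq     = 250;        // corpus frequency; floored at 1 (was 10) when unknown

        double score = (0.1 + 0.9 * value / maxValue) * Math.log(freq / docCount);
        System.out.println(score);    // ≈ -3.8: below -3 with a count above 1, so the term
                                      // lands in the "top" histogram bucket (h15)
    }
}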

View File

@ -10,84 +10,9 @@ import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Set;
import java.util.function.Function;
import java.util.stream.IntStream;
import java.util.stream.Stream;
public class KeywordExtractor {
public boolean isLegacy() {
return legacy;
}
public void setLegacy(boolean legacy) {
this.legacy = legacy;
}
private boolean legacy;
public WordSpan[] getNameLikes(DocumentSentence sentence) {
var direct = IntStream.range(0, sentence.length())
.filter(i -> sentence.posTags[i].startsWith("N"))
.mapToObj(i -> new WordSpan(i, i+1))
;
var two = IntStream.range(1, sentence.length())
.filter(i -> sentence.separators[i-1] == WordSeparator.SPACE)
.filter(i -> isName(i, sentence, Collections.emptySet()))
.filter(i -> isName(i -1, sentence, Collections.emptySet()))
.mapToObj(i -> new WordSpan(i-1, i+1))
;
var a_in_b = IntStream.range(2, sentence.length())
.filter(i -> sentence.separators[i-1] == WordSeparator.SPACE)
.filter(i -> isProperNoun(i, sentence))
.filter(i -> isJoiner(sentence, i-1))
.filter(i -> isProperNoun(i-2, sentence))
.mapToObj(i -> new WordSpan(i-2, i+1))
;
var a_in_det_b = IntStream.range(3, sentence.length())
.filter(i -> sentence.separators[i-1] == WordSeparator.SPACE
&& sentence.separators[i-2] == WordSeparator.SPACE)
.filter(i -> isProperNoun(i, sentence))
.filter(i -> isJoiner(sentence, i-1))
.filter(i -> sentence.posTags[i-2].equals("DT"))
.filter(i -> isProperNoun(i-3, sentence))
.mapToObj(i -> new WordSpan(i-3, i+1))
;
var a_in_in_b = IntStream.range(3, sentence.length())
.filter(i -> sentence.separators[i-1] == WordSeparator.SPACE
&& sentence.separators[i-2] == WordSeparator.SPACE)
.filter(i -> isProperNoun(i, sentence))
.filter(i -> isJoiner(sentence, i-1) || isProperNoun(i-1, sentence))
.filter(i -> isJoiner(sentence, i-2) || isProperNoun(i-2, sentence))
.filter(i -> isProperNoun(i-3, sentence))
.mapToObj(i -> new WordSpan(i-3, i+1))
;
var three = IntStream.range(2, sentence.length())
.filter(i -> sentence.separators[i-1] == WordSeparator.SPACE
&& sentence.separators[i-2] == WordSeparator.SPACE)
.filter(i -> isName(i, sentence, Collections.emptySet()))
.filter(i -> isName(i-1, sentence, Collections.emptySet()))
.filter(i -> isName(i-2, sentence, Collections.emptySet()))
.mapToObj(i -> new WordSpan(i-2, i+1))
;
var four = IntStream.range(3, sentence.length())
.filter(i -> sentence.separators[i-1] == WordSeparator.SPACE
&& sentence.separators[i-2] == WordSeparator.SPACE
&& sentence.separators[i-3] == WordSeparator.SPACE)
.filter(i -> isName(i, sentence, Collections.emptySet()))
.filter(i -> isName(i - 1, sentence, Collections.emptySet()))
.filter(i -> isName(i - 2, sentence, Collections.emptySet()))
.filter(i -> isName(i - 3, sentence, Collections.emptySet()))
.mapToObj(i -> new WordSpan(i-3, i+1))
;
return Stream.of(direct, two, a_in_b, a_in_in_b, a_in_det_b, three, four).flatMap(Function.identity())
.toArray(WordSpan[]::new);
}
public WordSpan[] getNames(DocumentSentence sentence) {
List<WordSpan> spans = new ArrayList<>(sentence.length());
@ -214,7 +139,7 @@ public class KeywordExtractor {
}
String word = sentence.constructWordFromSpan(w);
if (word.isBlank() || WordPatterns.isStopWord(word)) return false;
if (word.isBlank() || !WordPatterns.filter(word)) return false;
if (sentence.posTags[w.start].equals("CC")) return false;
if (sentence.posTags[w.end-1].equals("IN")) return false;
if (sentence.posTags[w.end-1].equals("DT")) return false;

View File

@ -22,6 +22,9 @@ public class NameCounter {
DocumentSentence sent = dld.sentences[i];
var keywords = keywordExtractor.getNames(sent);
for (var span : keywords) {
if (span.size() <= 1)
continue;
var stemmed = sent.constructStemmedWordFromSpan(span);
counts.merge(stemmed, 1., Double::sum);

View File

@ -52,7 +52,7 @@ public class ConverterMain {
logger.info("Starting pipe");
try (WorkLog processLog = plan.createProcessWorkLog()) {
var pipe = new ParallelPipe<CrawledDomain, ProcessingInstructions>("Crawler", 48, 4, 2) {
var pipe = new ParallelPipe<CrawledDomain, ProcessingInstructions>("Crawler", 16, 4, 2) {
@Override
protected ProcessingInstructions onProcess(CrawledDomain domainData) {
@ -73,12 +73,7 @@ public class ConverterMain {
};
plan.forEachCrawledDomain(domain -> {
if (!processLog.isJobFinished(domain.id)) {
logger.info("{} - {}", domain.domain, domain.id);
pipe.accept(domain);
}
});
plan.forEachCrawledDomain(id -> !processLog.isJobFinished(id), pipe::accept);
pipe.join();
}

View File

@ -3,6 +3,14 @@ package nu.marginalia.wmsa.edge.converting;
import com.github.luben.zstd.ZstdOutputStream;
import com.google.gson.Gson;
import nu.marginalia.wmsa.edge.converting.interpreter.Instruction;
import nu.marginalia.wmsa.edge.converting.interpreter.Interpreter;
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DocumentKeywords;
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DomainLink;
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadProcessedDocument;
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadProcessedDocumentWithError;
import nu.marginalia.wmsa.edge.model.EdgeDomain;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -36,7 +44,9 @@ public class CrawledInstructionWriter {
}
try (var outputStream = new OutputStreamWriter(new ZstdOutputStream(new BufferedOutputStream(new FileOutputStream(outputFile.toFile()))))) {
logger.info("Writing {} - {}", id, instructionList.size());
SummarizingInterpreter summary = new SummarizingInterpreter(instructionList);
logger.info("Writing {} - {} - {}", id, instructionList.size(), summary);
for (var instr : instructionList) {
outputStream.append(instr.tag().name());
@ -59,4 +69,54 @@ public class CrawledInstructionWriter {
}
return destDir.resolve(id + ".pzstd");
}
private static class SummarizingInterpreter implements Interpreter {
private SummarizingInterpreter(List<Instruction> instructions) {
for (var i : instructions) {
i.apply(this);
}
}
private String domainName;
private int ok = 0;
private int error = 0;
public String toString() {
return String.format("%s - %d %d", domainName, ok, error);
}
@Override
public void loadUrl(EdgeUrl[] url) {}
@Override
public void loadDomain(EdgeDomain[] domain) {}
@Override
public void loadRssFeed(EdgeUrl[] rssFeed) {}
@Override
public void loadDomainLink(DomainLink[] links) {}
@Override
public void loadProcessedDomain(EdgeDomain domain, EdgeDomainIndexingState state, String ip) {
this.domainName = domain.toString();
}
@Override
public void loadProcessedDocument(LoadProcessedDocument loadProcessedDocument) {
ok++;
}
@Override
public void loadProcessedDocumentWithError(LoadProcessedDocumentWithError loadProcessedDocumentWithError) {
error++;
}
@Override
public void loadKeywords(EdgeUrl url, DocumentKeywords[] words) {}
@Override
public void loadDomainRedirect(DomainLink link) {}
}
}

View File

@ -72,7 +72,7 @@ public class Loader implements Interpreter {
@Override
public void loadDomainLink(DomainLink[] links) {
logger.debug("loadDomainLink({})", links, null);
sqlLoadDomainLinks.load(links);
sqlLoadDomainLinks.load(data, links);
}
@Override

View File

@ -40,13 +40,20 @@ public class SqlLoadDomainLinks {
}
}
public void load(DomainLink[] links) {
public void load(LoaderData data, DomainLink[] links) {
try (var connection = dataSource.getConnection();
var nukeExistingLinksForDomain =
connection.prepareStatement("""
DELETE FROM EC_DOMAIN_LINK WHERE SOURCE_DOMAIN_ID=?
""");
var stmt =
connection.prepareCall("CALL INSERT_LINK(?,?)"))
{
nukeExistingLinksForDomain.setInt(1, data.getDomainId(links[0].from()));
nukeExistingLinksForDomain.executeUpdate();
for (DomainLink link : links) {
stmt.setString(1, link.from().toString());
stmt.setString(2, link.to().toString());
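load() now takes the LoaderData context and deletes any previously stored links for the source domain before reinserting, so reprocessing a domain no longer accumulates stale rows in EC_DOMAIN_LINK. A compile-level sketch of the same delete-then-reinsert shape; the table name and the INSERT_LINK procedure come from the diff, while the signature and the tail of the loop are guesses since the hunk is cut off:

import javax.sql.DataSource;
import java.sql.SQLException;
import java.util.List;

public class DomainLinkReloadSketch {
    void load(DataSource dataSource, int sourceDomainId, List<String[]> links) throws SQLException {
        try (var connection = dataSource.getConnection();
             var nukeExistingLinksForDomain = connection.prepareStatement(
                     "DELETE FROM EC_DOMAIN_LINK WHERE SOURCE_DOMAIN_ID=?");
             var stmt = connection.prepareCall("CALL INSERT_LINK(?,?)")) {

            // Clear the previous link set for this source domain first.
            nukeExistingLinksForDomain.setInt(1, sourceDomainId);
            nukeExistingLinksForDomain.executeUpdate();

            // Reinsert the current link set.
            for (String[] link : links) {
                stmt.setString(1, link[0]);  // from-domain
                stmt.setString(2, link[1]);  // to-domain
                stmt.executeUpdate();        // the real loop tail isn't shown in the hunk
            }
        }
    }
}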

View File

@ -15,6 +15,7 @@ public class ProcessedDocument {
public EdgePageWordSet words;
public EdgeUrlState state;
public String stateReason;
public OptionalDouble quality() {
if (details != null) {

View File

@ -70,11 +70,22 @@ public class DocumentProcessor {
this.summaryExtractor = summaryExtractor;
}
public ProcessedDocument makeDisqualifiedStub(CrawledDocument crawledDocument) {
ProcessedDocument ret = new ProcessedDocument();
try {
ret.state = EdgeUrlState.DISQUALIFIED;
ret.url = getDocumentUrl(crawledDocument);
}
catch (Exception ex) {}
return ret;
}
public ProcessedDocument process(CrawledDocument crawledDocument, CrawledDomain crawledDomain) {
ProcessedDocument ret = new ProcessedDocument();
try {
ret.url = new EdgeUrl(crawledDocument.url);
ret.url = getDocumentUrl(crawledDocument);
ret.state = crawlerStatusToUrlState(crawledDocument.crawlerStatus, crawledDocument.httpStatus);
if (ret.state == EdgeUrlState.OK) {
@ -99,17 +110,31 @@ public class DocumentProcessor {
}
catch (DisqualifiedException ex) {
ret.state = EdgeUrlState.DISQUALIFIED;
ret.stateReason = ex.reason.toString();
logger.debug("Disqualified {}: {}", ret.url, ex.reason);
}
catch (Exception ex) {
ret.state = EdgeUrlState.DISQUALIFIED;
logger.info("Failed to convert " + ret.url, ex);
logger.info("Failed to convert " + crawledDocument.url, ex);
ex.printStackTrace();
}
return ret;
}
private EdgeUrl getDocumentUrl(CrawledDocument crawledDocument)
throws URISyntaxException
{
if (crawledDocument.canonicalUrl != null) {
try {
return new EdgeUrl(crawledDocument.canonicalUrl);
}
catch (URISyntaxException ex) { /* fallthrough */ }
}
return new EdgeUrl(crawledDocument.url);
}
public static boolean isAcceptedContentType(CrawledDocument crawledDocument) {
if (crawledDocument.contentType == null) {
return false;
@ -155,20 +180,26 @@ public class DocumentProcessor {
var ret = new ProcessedDocumentDetails();
ret.description = getDescription(doc);
ret.length = getLength(doc);
ret.standard = getHtmlStandard(doc);
ret.title = titleExtractor.getTitleAbbreviated(doc, dld, crawledDocument.url);
ret.features = featureExtractor.getFeatures(crawledDomain, doc, dld);
ret.quality = documentValuator.getQuality(ret.standard, doc, dld);
ret.hashCode = HashCode.fromString(crawledDocument.documentBodyHash).asLong();
final boolean doSimpleProcessing = ret.quality < minDocumentQuality;
EdgePageWordSet words;
if (ret.quality < minDocumentQuality || dld.totalNumWords() < minDocumentLength) {
if (doSimpleProcessing) {
ret.features = Set.of(HtmlFeature.UNKNOWN);
words = keywordExtractor.extractKeywordsMinimal(dld);
ret.description = "";
}
else {
ret.features = featureExtractor.getFeatures(crawledDomain, doc, dld);
words = keywordExtractor.extractKeywords(dld);
ret.description = getDescription(doc);
}
var url = new EdgeUrl(crawledDocument.url);
@ -276,6 +307,10 @@ public class DocumentProcessor {
}
private void checkDocumentLanguage(DocumentLanguageData dld) throws DisqualifiedException {
if (dld.totalNumWords() < minDocumentLength) {
throw new DisqualifiedException(DisqualificationReason.LENGTH);
}
double languageAgreement = languageFilter.dictionaryAgreement(dld);
if (languageAgreement < 0.1) {
throw new DisqualifiedException(DisqualificationReason.LANGUAGE);
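Two related changes in DocumentProcessor: documents that are short-circuited at the domain level now get a DISQUALIFIED stub with a resolved URL, and URL resolution prefers a parseable canonical URL over the fetched one. A minimal sketch of that preference, using java.net.URI in place of EdgeUrl:

import java.net.URI;
import java.net.URISyntaxException;

public class CanonicalUrlSketch {
    // Prefer the canonical URL when it parses, otherwise fall back to the fetched URL.
    static URI documentUrl(String canonicalUrl, String fetchedUrl) throws URISyntaxException {
        if (canonicalUrl != null) {
            try {
                return new URI(canonicalUrl);
            } catch (URISyntaxException ex) { /* fall through to the fetched URL */ }
        }
        return new URI(fetchedUrl);
    }

    public static void main(String[] args) throws URISyntaxException {
        System.out.println(documentUrl("https://example.com/page", "https://example.com/page?utm=1"));
        System.out.println(documentUrl(":::broken:::", "https://example.com/fallback"));
    }
}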

View File

@ -1,23 +1,27 @@
package nu.marginalia.wmsa.edge.converting.processor;
import com.google.common.base.Strings;
import com.google.inject.Inject;
import com.google.inject.name.Named;
import nu.marginalia.wmsa.edge.converting.model.ProcessedDocument;
import nu.marginalia.wmsa.edge.converting.model.ProcessedDomain;
import nu.marginalia.wmsa.edge.converting.processor.logic.CommonKeywordExtractor;
import nu.marginalia.wmsa.edge.crawling.model.CrawledDocument;
import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain;
import nu.marginalia.wmsa.edge.crawling.model.CrawlerDomainStatus;
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
import nu.marginalia.wmsa.edge.model.EdgeDomain;
import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState;
import nu.marginalia.wmsa.edge.model.crawl.EdgeUrlState;
import java.util.*;
import java.util.stream.Collectors;
public class DomainProcessor {
private static final CommonKeywordExtractor commonKeywordExtractor = new CommonKeywordExtractor();
private final DocumentProcessor documentProcessor;
private final Double minAvgDocumentQuality;
@Inject
public DomainProcessor(DocumentProcessor documentProcessor,
@Named("min-avg-document-quality") Double minAvgDocumentQuality
@ -39,61 +43,71 @@ public class DomainProcessor {
if (crawledDomain.doc != null) {
ret.documents = new ArrayList<>(crawledDomain.doc.size());
fixBadCanonicalTags(crawledDomain.doc);
DocumentDisqualifier disqualifier = new DocumentDisqualifier();
for (var doc : crawledDomain.doc) {
var processedDoc = documentProcessor.process(doc, crawledDomain);
if (processedDoc.url != null) {
ret.documents.add(processedDoc);
if (disqualifier.isQualified()) {
var processedDoc = documentProcessor.process(doc, crawledDomain);
if (processedDoc.url != null) {
ret.documents.add(processedDoc);
processedDoc.quality().ifPresent(disqualifier::offer);
}
else if ("LANGUAGE".equals(processedDoc.stateReason)) {
disqualifier.offer(-100);
}
}
else { // Short-circuit processing if quality is too low
var stub = documentProcessor.makeDisqualifiedStub(doc);
if (stub.url != null) {
ret.documents.add(stub);
}
}
}
addCommonSiteWords(ret);
Set<String> commonSiteWords = new HashSet<>(10);
commonSiteWords.addAll(commonKeywordExtractor.getCommonSiteWords(ret, IndexBlock.Tfidf_Top, IndexBlock.Subjects));
commonSiteWords.addAll(commonKeywordExtractor.getCommonSiteWords(ret, IndexBlock.Title));
if (!commonSiteWords.isEmpty()) {
for (var doc : ret.documents) {
if (doc.words != null) {
doc.words.get(IndexBlock.Site).addAll(commonSiteWords);
}
}
}
}
else {
ret.documents = Collections.emptyList();
}
double averageQuality = getAverageQuality(ret.documents);
if (averageQuality < minAvgDocumentQuality) {
ret.documents.forEach(doc -> doc.state = EdgeUrlState.DISQUALIFIED);
}
ret.state = getState(crawledDomain.crawlerStatus);
return ret;
}
private void addCommonSiteWords(ProcessedDomain ret) {
private void fixBadCanonicalTags(List<CrawledDocument> docs) {
Map<String, Set<String>> seenCanonicals = new HashMap<>();
if (ret.documents.size() < 25)
return;
// Sometimes sites set a blanket canonical link to their root page
// this removes such links from consideration
Map<String, Integer> topKeywordCount = new HashMap<>(ret.documents.size()*10);
for (var doc : ret.documents) {
if (doc.words == null)
continue;
for (var word : doc.words.get(IndexBlock.Tfidf_Top).words) {
topKeywordCount.merge(word, -1, Integer::sum);
for (var document : docs) {
if (!Strings.isNullOrEmpty(document.canonicalUrl) && !Objects.equals(document.canonicalUrl, document.url)) {
seenCanonicals.computeIfAbsent(document.canonicalUrl, url -> new HashSet<>()).add(document.documentBodyHash);
}
}
if (topKeywordCount.values().stream().mapToInt(i -> i).sum() > -100)
return;
Set<String> topWords = topKeywordCount.entrySet().stream()
.filter(e -> e.getValue() < -10)
.sorted(Map.Entry.comparingByValue()).limit(5)
.map(Map.Entry::getKey)
.collect(Collectors.toSet());
if (!topWords.isEmpty()) {
for (var doc : ret.documents) {
if (doc.words != null) {
doc.words.get(IndexBlock.Site).addAll(topWords);
}
for (var document : docs) {
if (!Strings.isNullOrEmpty(document.canonicalUrl)
&& !Objects.equals(document.canonicalUrl, document.url)
&& seenCanonicals.getOrDefault(document.canonicalUrl, Collections.emptySet()).size() > 1) {
document.canonicalUrl = document.url;
}
}
}
private double getAverageQuality(List<ProcessedDocument> documents) {
@ -120,4 +134,20 @@ public class DomainProcessor {
default -> EdgeDomainIndexingState.ERROR;
};
}
class DocumentDisqualifier {
int count;
int goodCount;
void offer(double quality) {
count++;
if (quality > minAvgDocumentQuality) {
goodCount++;
}
}
boolean isQualified() {
return count < 25 || goodCount*10 >= count;
}
}
}
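The new DocumentDisqualifier lets a domain fail fast: after the first 25 documents it has scored, full processing continues only while at least one in ten of them cleared the minimum quality bar; the rest become disqualified stubs. A standalone version of that rule, with an assumed quality threshold since the real value is injected via configuration:

public class DisqualifierSketch {
    static final double MIN_AVG_DOCUMENT_QUALITY = -5.0; // assumption; injected in the real code

    int count;
    int goodCount;

    void offer(double quality) {
        count++;
        if (quality > MIN_AVG_DOCUMENT_QUALITY) goodCount++;
    }

    boolean isQualified() {
        return count < 25 || goodCount * 10 >= count;
    }

    public static void main(String[] args) {
        var d = new DisqualifierSketch();
        for (int i = 0; i < 30; i++) d.offer(-10);  // thirty low-quality documents in a row
        System.out.println(d.isQualified());        // false: the remaining documents get stubbed
    }
}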

View File

@ -0,0 +1,71 @@
package nu.marginalia.wmsa.edge.converting.processor.logic;
import ca.rmen.porterstemmer.PorterStemmer;
import nu.marginalia.wmsa.edge.converting.model.ProcessedDomain;
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
import java.util.*;
public class CommonKeywordExtractor {
private final PorterStemmer ps = new PorterStemmer();
private static final int MIN_REQUIRED_DOCUMENTS = 25;
private static final int REQUIRED_TOTAL_COUNT_FOR_CONSIDERATION = 100;
private static final double QUALIFYING_PROPORTION_FOR_KEYWORD = .25;
private static final int MAX_SITE_KEYWORDS_TO_EXTRACT = 5;
public List<String> getCommonSiteWords(ProcessedDomain ret, IndexBlock... sourceBlocks) {
if (ret.documents.size() < MIN_REQUIRED_DOCUMENTS)
return Collections.emptyList();
final Map<String, String> wordToStemmedMemoized = new HashMap<>(ret.documents.size()*10);
final Map<String, Integer> topStemmedKeywordCount = new HashMap<>(ret.documents.size()*10);
final Map<String, Set<String>> stemmedToNonstemmedVariants = new HashMap<>(ret.documents.size()*10);
int qualifiedDocCount = 0;
for (var doc : ret.documents) {
if (doc.words == null)
continue;
qualifiedDocCount++;
for (var block : sourceBlocks) {
for (var word : doc.words.get(block).words) {
String wordStemmed = wordToStemmedMemoized.computeIfAbsent(word, ps::stemWord);
// Count by negative values to sort by Map.Entry.comparingByValue() in reverse
topStemmedKeywordCount.merge(wordStemmed, -1, Integer::sum);
stemmedToNonstemmedVariants.computeIfAbsent(wordStemmed, w -> new HashSet<>()).add(word);
}
}
}
int totalValue = 0;
for (int value : topStemmedKeywordCount.values()) {
totalValue += value;
}
if (totalValue > -REQUIRED_TOTAL_COUNT_FOR_CONSIDERATION)
return Collections.emptyList();
List<String> topWords = new ArrayList<>(MAX_SITE_KEYWORDS_TO_EXTRACT);
double qualifyingValue = -qualifiedDocCount * QUALIFYING_PROPORTION_FOR_KEYWORD;
topStemmedKeywordCount.entrySet().stream()
.filter(e -> e.getValue() < qualifyingValue)
.sorted(Map.Entry.comparingByValue())
.limit(MAX_SITE_KEYWORDS_TO_EXTRACT)
.forEach(e -> topWords.addAll(stemmedToNonstemmedVariants.get(e.getKey())));
return topWords;
}
}
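CommonKeywordExtractor generalises the old addCommonSiteWords logic: counts are accumulated as negative numbers so that Map.Entry.comparingByValue() sorts most-frequent first, and a stemmed keyword is promoted to a site-wide keyword once it occurs in more than a quarter of the qualified documents (given at least 25 documents and 100 keyword occurrences overall). A worked example of the qualifying threshold with made-up numbers:

public class SiteWordThresholdExample {
    public static void main(String[] args) {
        int qualifiedDocCount = 200;
        double qualifyingValue = -qualifiedDocCount * 0.25;   // -50; counts are stored negated
        int observedCount = -73;                              // keyword counted 73 times across documents
        System.out.println(observedCount < qualifyingValue);  // true: promoted, ends up in the Site block
    }
}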

View File

@ -14,6 +14,8 @@ public enum HtmlFeature {
ADVERTISEMENT("special:ads"),
CATEGORY_CRAFTS("category:crafts"),
UNKNOWN("special:uncategorized")
;
private final String keyword;

View File

@ -13,10 +13,14 @@ import java.io.InputStreamReader;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.ForkJoinPool;
import java.util.concurrent.TimeUnit;
public class CrawledDomainReader {
private final Gson gson = new GsonBuilder().create();
private final ForkJoinPool pool = new ForkJoinPool(4);
public CrawledDomainReader() {
}
@ -43,7 +47,12 @@ public class CrawledDomainReader {
if (line.equals(CrawledDomain.SERIAL_IDENTIFIER)) {
domain = gson.fromJson(nextLine, CrawledDomain.class);
} else if (line.equals(CrawledDocument.SERIAL_IDENTIFIER)) {
docs.add(gson.fromJson(nextLine, CrawledDocument.class));
pool.execute(() -> {
var doc = gson.fromJson(nextLine, CrawledDocument.class);
synchronized (docs) {
docs.add(doc);
}
});
}
} else if (line.charAt(0) == '{') {
domain = gson.fromJson(line, CrawledDomain.class);
@ -52,6 +61,8 @@ public class CrawledDomainReader {
}
}
pool.awaitQuiescence(10, TimeUnit.SECONDS);
if (domain == null) {
return null;
}
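CrawledDomainReader now parses document JSON on a small ForkJoinPool, guarding the shared list with a synchronized block and calling awaitQuiescence before the domain is assembled. A minimal sketch of that pattern, with integer parsing standing in for gson.fromJson:

import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.ForkJoinPool;
import java.util.concurrent.TimeUnit;

public class ParallelParseSketch {
    public static void main(String[] args) {
        ForkJoinPool pool = new ForkJoinPool(4);
        List<Integer> docs = new ArrayList<>();

        for (String line : List.of("1", "2", "3")) {
            pool.execute(() -> {
                int doc = Integer.parseInt(line);  // stand-in for gson.fromJson(line, CrawledDocument.class)
                synchronized (docs) {              // ArrayList is not thread-safe
                    docs.add(doc);
                }
            });
        }

        pool.awaitQuiescence(10, TimeUnit.SECONDS);  // wait for outstanding parse tasks
        System.out.println(docs);
    }
}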

View File

@ -6,5 +6,6 @@ public enum CrawlerDocumentStatus {
BAD_CHARSET,
REDIRECT,
ROBOTS_TXT,
ERROR
ERROR,
Timeout
}

View File

@ -1,5 +1,6 @@
package nu.marginalia.wmsa.edge.data.dao;
import com.google.common.base.Strings;
import com.google.common.cache.Cache;
import com.google.common.cache.CacheBuilder;
import com.google.common.util.concurrent.UncheckedExecutionException;
@ -113,9 +114,12 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
Double.MAX_VALUE, // termScore
0 // queryLength
);
if (val.urlQuality >= QUALITY_LOWER_BOUND_CUTOFF) {
result.add(val);
if (val.urlQuality <= QUALITY_LOWER_BOUND_CUTOFF
&& Strings.isNullOrEmpty(val.description)
&& val.url.path.length() > 1) {
continue;
}
result.add(val);
}
}

View File

@ -6,7 +6,6 @@ import com.google.inject.Inject;
import com.google.inject.name.Named;
import com.google.protobuf.InvalidProtocolBufferException;
import gnu.trove.map.TLongIntMap;
import gnu.trove.map.hash.TIntIntHashMap;
import gnu.trove.map.hash.TLongIntHashMap;
import gnu.trove.set.hash.TIntHashSet;
import io.prometheus.client.Histogram;
@ -227,12 +226,7 @@ public class EdgeIndexService extends Service {
long start = System.currentTimeMillis();
try {
if (specsSet.isStagger()) {
return new EdgeSearchResultSet(searchStaggered(specsSet));
}
else {
return new EdgeSearchResultSet(searchStraight(specsSet));
}
return new EdgeSearchResultSet(searchStraight(specsSet));
}
catch (HaltException ex) {
logger.warn("Halt", ex);
@ -249,59 +243,9 @@ public class EdgeIndexService extends Service {
}
}
private Map<IndexBlock, List<EdgeSearchResults>> searchStaggered(EdgeSearchSpecification specsSet) {
int count = 0;
final Map<IndexBlock, List<EdgeSearchResults>> results = new HashMap<>();
final TIntHashSet seenResults = new TIntHashSet();
final DomainResultCountFilter[] domainCountFilter = new DomainResultCountFilter[] {
new DomainResultCountFilter(specsSet.limitByDomain),
new DomainResultCountFilter(specsSet.limitByDomain)
};
final IndexSearchBudget budget = new IndexSearchBudget(SEARCH_BUDGET_TIMEOUT_MS);
final TIntIntHashMap limitsPerBucketRemaining = new TIntIntHashMap(6, 0.7f, 0, specsSet.limitByBucket);
for (int i = 0; i < specsSet.buckets.size(); i+=2) {
for (var sq : specsSet.subqueries) {
for (int j = 0; j < 2 && i + j < specsSet.buckets.size(); j++) {
Optional<EdgeIndexSearchTerms> searchTerms = getSearchTerms(sq);
if (searchTerms.isEmpty())
continue;
var result = performSearch(searchTerms.get(),
budget,
seenResults,
domainCountFilter[j],
sq,
List.of(specsSet.buckets.get(i+j)),
specsSet,
Math.min(limitsPerBucketRemaining.get(i+j), specsSet.limitTotal - count)
);
if (logger.isDebugEnabled()) {
logger.debug("{} -> {} {} {}", sq.block, specsSet.buckets.get(i+j), sq.searchTermsInclude, result.results.values().stream().mapToInt(List::size).sum());
}
int sz = result.size();
count += sz;
limitsPerBucketRemaining.adjustOrPutValue(i+j, -sz, specsSet.limitByBucket-sz);
if (sz > 0) {
results.computeIfAbsent(sq.block, s -> new ArrayList<>()).add(result);
}
}
}
}
return results;
}
@NotNull
private Map<IndexBlock, List<EdgeSearchResults>> searchStraight(EdgeSearchSpecification specsSet) {
Map<IndexBlock, List<EdgeSearchResults>> results = new HashMap<>();
private Map<IndexBlock, List<EdgeSearchResultItem>> searchStraight(EdgeSearchSpecification specsSet) {
Map<IndexBlock, List<EdgeSearchResultItem>> results = new HashMap<>();
int count = 0;
TIntHashSet seenResults = new TIntHashSet();
@ -314,25 +258,38 @@ public class EdgeIndexService extends Service {
if (searchTerms.isEmpty())
continue;
var result = performSearch(searchTerms.get(),
var resultForSq = performSearch(searchTerms.get(),
budget, seenResults, domainCountFilter,
sq, specsSet.buckets, specsSet,
specsSet.limitTotal - count);
if (logger.isDebugEnabled()) {
logger.debug("{} -> {} {}", sq.block, sq.searchTermsInclude, result.size());
logger.debug("{} -> {} {}", sq.block, sq.searchTermsInclude, resultForSq.size());
}
count += result.size();
if (result.size() > 0) {
results.computeIfAbsent(sq.block, s -> new ArrayList<>()).add(result);
count += resultForSq.size();
if (resultForSq.size() > 0) {
results.computeIfAbsent(sq.block, s -> new ArrayList<>()).addAll(resultForSq);
}
}
List<List<String>> distinctSearchTerms = specsSet.subqueries.stream().map(sq -> sq.searchTermsInclude).distinct().toList();
results.forEach((index, blockResults) -> {
for (var result : blockResults) {
for (int i = 0; i < distinctSearchTerms.size(); i++) {
for (var term : distinctSearchTerms.get(i)) {
result.scores.add(getSearchTermScore(i, result.bucketId, term, result.getCombinedId()));
}
}
}
});
return results;
}
private EdgeSearchResults performSearch(EdgeIndexSearchTerms searchTerms,
private List<EdgeSearchResultItem> performSearch(EdgeIndexSearchTerms searchTerms,
IndexSearchBudget budget,
TIntHashSet seenResults,
DomainResultCountFilter domainCountFilter,
@ -342,14 +299,14 @@ public class EdgeIndexService extends Service {
int limit)
{
if (limit <= 0) {
return new EdgeSearchResults();
return new ArrayList<>();
}
final Map<Integer, List<EdgeSearchResultItem>> results = new HashMap<>();
final List<EdgeSearchResultItem> results = new ArrayList<>();
final DomainResultCountFilter localFilter = new DomainResultCountFilter(specs.limitByDomain);
for (int i : specBuckets) {
int foundResultsCount = results.values().stream().mapToInt(List::size).sum();
int foundResultsCount = results.size();
if (foundResultsCount >= specs.limitTotal || foundResultsCount >= limit)
break;
@ -362,38 +319,33 @@ public class EdgeIndexService extends Service {
.limit(specs.limitTotal * 3L)
.distinct()
.limit(Math.min(specs.limitByBucket
- results.values().stream().mapToInt(Collection::size).sum(), limit - foundResultsCount))
- results.size(), limit - foundResultsCount))
.forEach(resultsForBucket::add);
for (var result : resultsForBucket) {
seenResults.add(result.url.id());
}
for (var result : resultsForBucket) {
for (var searchTerm : sq.searchTermsInclude) {
result.scores.add(getSearchTermScore(i, searchTerm, result.getCombinedId()));
}
}
domainCountFilter.addAll(i, resultsForBucket);
if (!resultsForBucket.isEmpty()) {
results.put(i, resultsForBucket);
}
results.addAll(resultsForBucket);
}
return new EdgeSearchResults(results);
return results;
}
private EdgeSearchResultKeywordScore getSearchTermScore(int bucketId, String term, long urlId) {
private EdgeSearchResultKeywordScore getSearchTermScore(int set, int bucketId, String term, long urlId) {
final int termId = indexes.getDictionaryReader().get(term);
var bucket = indexes.getBucket(bucketId);
return new EdgeSearchResultKeywordScore(term,
return new EdgeSearchResultKeywordScore(set, term,
bucket.getTermScore(termId, urlId),
bucket.isTermInBucket(IndexBlock.Title, termId, urlId),
bucket.isTermInBucket(IndexBlock.Link, termId, urlId)
bucket.isTermInBucket(IndexBlock.Link, termId, urlId),
bucket.isTermInBucket(IndexBlock.Site, termId, urlId),
bucket.isTermInBucket(IndexBlock.Subjects, termId, urlId)
);
}

View File

@ -2,20 +2,20 @@ package nu.marginalia.wmsa.edge.index.model;
public enum IndexBlock {
TitleKeywords(0, 0),
Title(1, 1),
Title(1, 0),
Link(2, 1.15),
Subjects(3, 3.0),
Subjects(3, 1.0),
NamesWords(4, 3.0),
Artifacts(5, 10),
Meta(6, 7),
Tfidf_Top(7, 0.5),
Tfidf_Middle(8, 1.25),
Tfidf_Lower(9, 1.5),
Tfidf_Top(7, 1.5),
Tfidf_Middle(8, 2),
Tfidf_Lower(9, 3.5),
Words_1(10, 3.0),
Words_1(10, 2.0),
Words_2(11, 3.5),
Words_4(12, 4.0),
Words_8(13, 4.5),

View File

@ -47,7 +47,7 @@ public class SearchIndexReader implements AutoCloseable {
var linkIndex = indices.get(IndexBlock.Link);
var titleIndex = indices.get(IndexBlock.Title);
var namesIndex = indices.get(IndexBlock.NamesWords);
var titleKeywordsIndex = indices.get(IndexBlock.TitleKeywords);
var siteIndex = indices.get(IndexBlock.Site);
var metaIndex = indices.get(IndexBlock.Meta);
var topicIndex = indices.get(IndexBlock.Subjects);
@ -61,14 +61,17 @@ public class SearchIndexReader implements AutoCloseable {
queryBuilders = new EnumMap<>(IndexBlock.class);
underspecifiedQueryBuilders = new EnumMap<>(IndexBlock.class);
queryBuilders.put(IndexBlock.Title, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleIndex, linkIndex), words1));
queryBuilders.put(IndexBlock.Words_1, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleIndex, topIndex, words1), words1));
queryBuilders.put(IndexBlock.Words_2, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleIndex, topIndex, words2), words1));
queryBuilders.put(IndexBlock.Words_4, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleIndex, topIndex, words4), words1));
queryBuilders.put(IndexBlock.Words_8, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleIndex, topIndex, words8), words1));
queryBuilders.put(IndexBlock.Words_16Plus, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleIndex, topIndex, words1, words2, words4, words8, words16, artifacts), words1));
underspecifiedQueryBuilders.put(IndexBlock.Title, new IndexQueryBuilder(listOfNonNulls(titleIndex, linkIndex, topIndex, namesIndex, topicIndex, metaIndex), words1));
underspecifiedQueryBuilders.put(IndexBlock.Tfidf_Top, new IndexQueryBuilder(listOfNonNulls(linkIndex, namesIndex, topIndex, midIndex, lowIndex, topicIndex, metaIndex), words1));
underspecifiedQueryBuilders.put(IndexBlock.Title, new IndexQueryBuilder(listOfNonNulls(titleIndex, linkIndex, topIndex, siteIndex, namesIndex, topicIndex, metaIndex), words1));
underspecifiedQueryBuilders.put(IndexBlock.Tfidf_Top, new IndexQueryBuilder(listOfNonNulls(topIndex, linkIndex, namesIndex, siteIndex, midIndex, topicIndex, metaIndex), words1));
underspecifiedQueryBuilders.put(IndexBlock.Tfidf_Middle, new IndexQueryBuilder(listOfNonNulls(midIndex, linkIndex, namesIndex, topIndex, siteIndex, midIndex, lowIndex, topicIndex, metaIndex), words1));
underspecifiedQueryBuilders.put(IndexBlock.Tfidf_Lower, new IndexQueryBuilder(listOfNonNulls(midIndex, linkIndex, namesIndex, topIndex, siteIndex, midIndex, lowIndex, topicIndex, metaIndex, artifacts), words1));
}
@SafeVarargs

View File

@ -46,7 +46,7 @@ public class IndexQueryBuilder {
return new QueryForIndices(budget, LongStream::empty);
}
else if (relevantIndices.length == 1 || relevantIndices[0] != 0) {
return build(budget, filter, wordId);
return new QueryForIndices(budget, LongStream::empty);
}
var fstRange = requiredIndices.get(relevantIndices[0]).rangeForWord(wordId);

View File

@ -16,6 +16,7 @@ import java.io.IOException;
import java.nio.file.Path;
import java.util.Iterator;
import java.util.function.Consumer;
import java.util.function.Predicate;
import java.util.stream.Stream;
@AllArgsConstructor @NoArgsConstructor @ToString
@ -86,7 +87,21 @@ public class EdgeCrawlPlan {
throw new RuntimeException(ex);
}
}
public void forEachCrawledDomain(Predicate<String> idReadPredicate, Consumer<CrawledDomain> consumer) {
final CrawledDomainReader reader = new CrawledDomainReader();
try (Stream<CrawlLogEntry> entryStream = WorkLog.streamLog(crawl.getLogFile())) {
entryStream
.filter(entry -> idReadPredicate.test(entry.id()))
.map(CrawlLogEntry::path)
.map(this::getCrawledFilePath)
.map(reader::readRuntimeExcept)
.forEach(consumer);
}
catch (IOException ex) {
throw new RuntimeException(ex);
}
}
@MustBeClosed
public DomainsIterable domainsIterable() throws IOException {
return new DomainsIterable();
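The new forEachCrawledDomain overload filters crawl-log entries by id before reading the relatively expensive crawl data, which is what lets ConverterMain skip already-finished jobs without loading them. A toy version of the same filter-then-consume shape, with plain strings in place of CrawledDomain:

import java.util.List;
import java.util.Set;
import java.util.function.Consumer;
import java.util.function.Predicate;

public class FilteredIterationSketch {
    static void forEachCrawledDomain(List<String> ids, Predicate<String> idReadPredicate, Consumer<String> consumer) {
        ids.stream()
           .filter(idReadPredicate)  // cheap id check happens before any data is loaded
           .forEach(consumer);       // the real code maps id -> path -> parsed CrawledDomain here
    }

    public static void main(String[] args) {
        Set<String> finished = Set.of("domain-1");
        forEachCrawledDomain(List.of("domain-1", "domain-2"), id -> !finished.contains(id), System.out::println);
    }
}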

View File

@ -13,18 +13,18 @@ import java.util.List;
@AllArgsConstructor @ToString @Getter @EqualsAndHashCode
public class EdgeSearchResultItem {
public final int blockId;
public final int bucketId;
public final int queryLength;
public final EdgeId<EdgeDomain> domain; // this isn't the external domain ID, but a ranking
public final EdgeId<EdgeUrl> url;
public final List<EdgeSearchResultKeywordScore> scores;
public EdgeSearchResultItem(int blockId, int queryLength, long val) {
public EdgeSearchResultItem(int bucketId, int queryLength, long val) {
int urlId = (int) (val & 0xFFFF_FFFFL);
int domainId = (int) (val >>> 32);
this.queryLength = queryLength;
this.blockId = blockId;
this.bucketId = bucketId;
url = new EdgeId<>(urlId);
domain = new EdgeId<>(domainId);

View File

@ -1,14 +1,6 @@
package nu.marginalia.wmsa.edge.model.search;
import lombok.AllArgsConstructor;
import lombok.EqualsAndHashCode;
import lombok.ToString;
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
@AllArgsConstructor @ToString @EqualsAndHashCode
public class EdgeSearchResultKeywordScore {
public final String keyword;
public final IndexBlock index;
public boolean title;
public boolean link;
public record EdgeSearchResultKeywordScore(int set, String keyword, IndexBlock index, boolean title, boolean link, boolean site, boolean subject) {
}

View File

@ -10,7 +10,7 @@ import java.util.Map;
@AllArgsConstructor @Getter @ToString
public class EdgeSearchResultSet {
public Map<IndexBlock, List<EdgeSearchResults>> resultsList;
public Map<IndexBlock, List<EdgeSearchResultItem>> resultsList;
public int size() {
return resultsList.values().stream().mapToInt(List::size).sum();

View File

@ -4,29 +4,23 @@ import lombok.AllArgsConstructor;
import lombok.Getter;
import lombok.ToString;
import java.util.HashMap;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import java.util.stream.Stream;
@AllArgsConstructor @Getter @ToString
public class EdgeSearchResults {
public final Map<Integer, List<EdgeSearchResultItem>> results;
public final List<EdgeSearchResultItem> results;
public EdgeSearchResults() {
results = new HashMap<>();
results = new ArrayList<>();
}
public int size() {
return results.values().stream().mapToInt(List::size).sum();
return results.size();
}
public Stream<EdgeSearchResultItem> stream() {
return results.values().stream().flatMap(List::stream);
}
public List<EdgeSearchResultItem> getAllItems() {
return stream().collect(Collectors.toList());
return results.stream();
}
}

View File

@ -94,7 +94,7 @@ public class EdgeUrlDetails {
}
public double getRanking() {
double lengthAdjustment = Math.max(1, words / (words + 1000.));
double lengthAdjustment = Math.max(1, words / (words + 10000.));
return getFeatureScore()*Math.sqrt(1+rankingId)/Math.max(1E-10, lengthAdjustment *(0.7+0.3*Math.exp(urlQualityAdjustment.getScore())));
}
@ -132,6 +132,7 @@ public class EdgeUrlDetails {
public boolean isCookies() {
return HtmlFeature.hasFeature(features, HtmlFeature.COOKIES);
}
public boolean isUnknown() { return HtmlFeature.hasFeature(features, HtmlFeature.UNKNOWN); }
public boolean isAds() { return HtmlFeature.hasFeature(features, HtmlFeature.ADVERTISEMENT); }
public boolean isSpecialDomain() {

View File

@ -39,6 +39,7 @@ import javax.annotation.Nullable;
import java.util.*;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
@Singleton
@ -236,6 +237,8 @@ public class EdgeSearchOperator {
}
private final Pattern titleSplitPattern = Pattern.compile("[:!|./]|(\\s-|-\\s)|\\s{2,}");
private EdgePageScoreAdjustment adjustScoreBasedOnQuery(EdgeUrlDetails p, EdgeSearchSpecification specs) {
String titleLC = p.title == null ? "" : p.title.toLowerCase();
String descLC = p.description == null ? "" : p.description.toLowerCase();
@ -248,11 +251,16 @@ public class EdgeSearchOperator {
.toArray(String[]::new);
int termCount = searchTermsLC.length;
String[] titleParts = titleLC.split("[:!|./]|(\\s-|-\\s)|\\s{2,}");
double titleHitsAdj = 0.;
final String[] titleParts = titleSplitPattern.split(titleLC);
for (String titlePart : titleParts) {
titleHitsAdj += Arrays.stream(searchTermsLC).filter(titlePart::contains).mapToInt(String::length).sum()
/ (double) Math.max(1, titlePart.trim().length());
double hits = 0;
for (String term : searchTermsLC) {
if (titlePart.contains(term)) {
hits += term.length();
}
}
titleHitsAdj += hits / Math.max(1, titlePart.length());
}
double titleFullHit = 0.;
@ -299,10 +307,8 @@ public class EdgeSearchOperator {
logger.debug("{}", resultSet);
for (IndexBlock block : indexBlockSearchOrder) {
for (var results : resultSet.resultsList.getOrDefault(block, Collections.emptyList())) {
var items = results.getAllItems();
queryResults.append(100, resultDecorator.decorateSearchResults(items, block, deduplicator));
}
queryResults.append(100, resultDecorator.decorateSearchResults(resultSet.resultsList.getOrDefault(block, Collections.emptyList()),
block, deduplicator));
}
}
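The title scoring above now uses a precompiled pattern and an explicit loop: each part of the split title contributes the combined length of the query terms it contains, normalised by that part's length. A worked example with assumed query terms:

import java.util.regex.Pattern;

public class TitleHitsExample {
    public static void main(String[] args) {
        Pattern titleSplitPattern = Pattern.compile("[:!|./]|(\\s-|-\\s)|\\s{2,}");
        String titleLC = "linear algebra done right: solutions";
        String[] searchTermsLC = { "linear", "algebra" };

        double titleHitsAdj = 0.;
        for (String titlePart : titleSplitPattern.split(titleLC)) {
            double hits = 0;
            for (String term : searchTermsLC) {
                if (titlePart.contains(term)) hits += term.length();
            }
            titleHitsAdj += hits / Math.max(1, titlePart.length());
        }
        System.out.println(titleHitsAdj); // (6 + 7) / 25 = 0.52 from the first part, 0 from the second
    }
}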

View File

@ -10,31 +10,31 @@ import java.util.stream.Collectors;
public enum EdgeSearchProfile {
DEFAULT("default",
List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Tfidf_Top, IndexBlock.Link,
List.of(IndexBlock.Title, IndexBlock.Tfidf_Top, IndexBlock.Tfidf_Middle, IndexBlock.Link,
IndexBlock.Words_1, IndexBlock.Words_2, IndexBlock.Words_4, IndexBlock.Words_8, IndexBlock.Words_16Plus
),
0, 1),
MODERN("modern",
List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Tfidf_Top, IndexBlock.Tfidf_Middle, IndexBlock.Link, IndexBlock.NamesWords,
List.of(IndexBlock.Title, IndexBlock.Tfidf_Top, IndexBlock.Tfidf_Middle, IndexBlock.Link, IndexBlock.NamesWords,
IndexBlock.Words_1, IndexBlock.Words_2, IndexBlock.Words_4, IndexBlock.Words_8, IndexBlock.Words_16Plus
),
2),
CORPO("corpo",
List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Tfidf_Top, IndexBlock.Tfidf_Lower, IndexBlock.Link, IndexBlock.NamesWords,
List.of(IndexBlock.Title, IndexBlock.Tfidf_Top, IndexBlock.Tfidf_Middle, IndexBlock.Tfidf_Lower, IndexBlock.Link, IndexBlock.NamesWords,
IndexBlock.Words_1, IndexBlock.Words_2, IndexBlock.Words_4, IndexBlock.Words_8, IndexBlock.Words_16Plus),
4, 5, 7),
YOLO("yolo",
List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Tfidf_Top, IndexBlock.Tfidf_Lower, IndexBlock.Link, IndexBlock.NamesWords,
List.of(IndexBlock.Title, IndexBlock.Tfidf_Top, IndexBlock.Tfidf_Lower, IndexBlock.Link, IndexBlock.NamesWords,
IndexBlock.Words_1, IndexBlock.Words_2, IndexBlock.Words_4, IndexBlock.Words_8, IndexBlock.Words_16Plus),
0, 2, 1, 3, 4, 6),
CORPO_CLEAN("corpo-clean",
List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Tfidf_Top, IndexBlock.Tfidf_Middle, IndexBlock.Tfidf_Lower, IndexBlock.Link, IndexBlock.NamesWords),
List.of(IndexBlock.Title, IndexBlock.Tfidf_Top, IndexBlock.Tfidf_Middle, IndexBlock.Tfidf_Lower, IndexBlock.Link, IndexBlock.NamesWords),
4, 5),
ACADEMIA("academia",
List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Tfidf_Top, IndexBlock.Tfidf_Middle, IndexBlock.Tfidf_Lower, IndexBlock.Link, IndexBlock.NamesWords),
List.of( IndexBlock.Title, IndexBlock.Tfidf_Top, IndexBlock.Tfidf_Middle, IndexBlock.Tfidf_Lower, IndexBlock.Link, IndexBlock.NamesWords),
3),
FOOD("food",
List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Tfidf_Top, IndexBlock.Tfidf_Middle, IndexBlock.Tfidf_Lower, IndexBlock.Link, IndexBlock.Words_1, IndexBlock.NamesWords),
List.of(IndexBlock.Title, IndexBlock.Tfidf_Top, IndexBlock.Tfidf_Middle, IndexBlock.Tfidf_Lower, IndexBlock.Link, IndexBlock.Words_1, IndexBlock.NamesWords),
2, 0),
;

View File

@ -1,7 +1,7 @@
package nu.marginalia.wmsa.edge.search.query;
import com.google.inject.Inject;
import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict;
import nu.marginalia.wmsa.edge.assistant.dict.NGramBloomFilter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -13,12 +13,12 @@ import java.util.stream.Collectors;
public class EnglishDictionary {
private final Set<String> englishWords = new HashSet<>();
private final TermFrequencyDict dict;
private final NGramBloomFilter bloomFilter;
private final Logger logger = LoggerFactory.getLogger(getClass());
@Inject
public EnglishDictionary(TermFrequencyDict dict) {
this.dict = dict;
public EnglishDictionary(NGramBloomFilter bloomFilter) {
this.bloomFilter = bloomFilter;
try (var resource = Objects.requireNonNull(ClassLoader.getSystemResourceAsStream("dictionary/en-words"),
"Could not load word frequency table");
var br = new BufferedReader(new InputStreamReader(resource))
@ -44,10 +44,9 @@ public class EnglishDictionary {
public Collection<String> getWordVariants(String s) {
var variants = findWordVariants(s);
long freqBaseline = dict.getTermFreq(s);
var ret = variants.stream()
.filter(var -> freqBaseline*10 > dict.getTermFreq(var) && freqBaseline/10 < dict.getTermFreq(var)
.filter(bloomFilter::isKnownNGram
).collect(Collectors.toList());
if (s.equals("recipe") || s.equals("recipes")) {
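Variant filtering now asks the n-gram bloom filter whether a candidate is a known n-gram instead of comparing corpus frequencies against the base word. A small stand-in, using a Predicate where the real code passes NGramBloomFilter::isKnownNGram:

import java.util.List;
import java.util.Set;
import java.util.function.Predicate;
import java.util.stream.Collectors;

public class VariantFilterSketch {
    static List<String> filterVariants(List<String> variants, Predicate<String> isKnownNGram) {
        return variants.stream().filter(isKnownNGram).collect(Collectors.toList());
    }

    public static void main(String[] args) {
        Predicate<String> bloom = Set.of("recipe", "recipes")::contains;  // toy "filter" contents
        System.out.println(filterVariants(List.of("recipes", "recipies"), bloom)); // [recipes]
    }
}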

View File

@ -130,7 +130,7 @@ public class QueryFactory {
}
}
EdgeSearchSubquery subquery = new EdgeSearchSubquery(searchTermsInclude, searchTermsExclude, IndexBlock.TitleKeywords);
EdgeSearchSubquery subquery = new EdgeSearchSubquery(searchTermsInclude, searchTermsExclude, IndexBlock.Title);
params.profile().addTacitTerms(subquery);
params.jsSetting().addTacitTerms(subquery);

View File

@ -101,13 +101,27 @@ public class SearchResultDecorator {
if (!missedIds.isEmpty()) {
logger.warn("Could not look up documents: {}", missedIds.toArray());
}
retList.sort(Comparator.comparing(EdgeUrlDetails::getTermScore));
retList.sort(Comparator.comparing(EdgeUrlDetails::getTermScore)
.thenComparing(url -> url.url.path.length()));
return retList;
}
private double calculateTermScore(IndexBlock block, EdgeSearchResultItem resultItem, EdgeUrlDetails details) {
return valuator.evaluateTerms(resultItem.scores, block, details.words) / Math.sqrt(1 + resultItem.queryLength)
int titleLength = details.title.length();
double value = valuator.evaluateTerms(resultItem.scores, block, details.words,titleLength) / Math.sqrt(1 + resultItem.queryLength)
+ ((details.domainState == EdgeDomainIndexingState.SPECIAL) ? 1.25 : 0);
System.out.println("---");
System.out.println(details.getUrl());
System.out.println(details.getTitle());
System.out.println(details.words);
for (var score : resultItem.scores) {
System.out.println(block + ":" + score);
}
System.out.println(value);
return value;
}
}

View File

@ -16,8 +16,8 @@ public class SearchResultValuator {
private static final Pattern separator = Pattern.compile("_");
private static final int MIN_LENGTH = 500;
private static final int AVG_LENGTH = 1400;
private static final int MIN_LENGTH = 2000;
private static final int AVG_LENGTH = 5000;
@Inject
public SearchResultValuator(TermFrequencyDict dict) {
@ -26,58 +26,85 @@ public class SearchResultValuator {
// This is basically a bargain bin BM25
public double evaluateTerms(List<EdgeSearchResultKeywordScore> rawScores, IndexBlock block, int length) {
EdgeSearchResultKeywordScore[] scores = rawScores.stream().filter(w -> !w.keyword.contains(":")).toArray(EdgeSearchResultKeywordScore[]::new);
public double evaluateTerms(List<EdgeSearchResultKeywordScore> rawScores, IndexBlock block, int length, int titleLength) {
int sets = 1 + rawScores.stream().mapToInt(EdgeSearchResultKeywordScore::set).max().orElse(0);
if (scores.length == 0) {
return IndexBlock.Words_1.sortOrder;
}
double bestScore = 1000;
double bestLtsFactor = 1.;
final double[] weights = getTermWeights(scores);
final double lengthPenalty = getLengthPenalty(length);
for (int set = 0; set <= sets; set++) {
int thisSet = set;
EdgeSearchResultKeywordScore[] scores = rawScores.stream().filter(w -> w.set() == thisSet && !w.keyword().contains(":")).toArray(EdgeSearchResultKeywordScore[]::new);
double termSum = 0.;
double factorSum = 0.;
for (int i = 0; i < scores.length; i++) {
final double factor = 1. / (1.0 + weights[i]);
factorSum += factor;
double termValue = (scores[i].index.sortOrder + 0.5) * factor;
if (!scores[i].link && !scores[i].title) {
termValue *= lengthPenalty;
}
else if (scores[i].link) {
termValue /= 4.75;
if (scores.length == 0) {
continue;
}
termSum += termValue;
final double[] weights = getTermWeights(scores);
final double lengthPenalty = getLengthPenalty(length);
double termSum = 0.;
double factorSum = 0.;
double ltsFactor = 1.0;
for (int i = 0; i < scores.length; i++) {
final double factor = 1. / (1.0 + weights[i]);
factorSum += factor;
double termValue = (scores[i].index().sortOrder + 0.5) * factor;
termValue /= lengthPenalty;
if (scores[i].link()) {
ltsFactor *= Math.pow(0.5, 1. / scores.length);
}
if (scores[i].title()) {
if (titleLength <= 64) {
ltsFactor *= Math.pow(0.5, 1. / scores.length);
}
else if (titleLength < 96) {
ltsFactor *= Math.pow(0.75, 1. / scores.length);
}
else {
ltsFactor *= Math.pow(0.9, 1. / scores.length);
}
}
if (scores[i].subject()) {
ltsFactor *= Math.pow(0.8, 1. / scores.length);
}
termSum += termValue;
}
assert factorSum != 0;
double value = termSum / factorSum;
bestLtsFactor = Math.min(bestLtsFactor, ltsFactor);
bestScore = Math.min(bestScore, value);
}
assert factorSum != 0 ;
if (block == IndexBlock.Title || block == IndexBlock.TitleKeywords) {
return block.sortOrder + (termSum / factorSum) / 5;
}
return termSum / factorSum;
return (0.7+0.3*block.sortOrder)*bestScore * bestLtsFactor;
}
private double getLengthPenalty(int length) {
if (length < MIN_LENGTH) {
length = MIN_LENGTH;
}
return (0.7 + 0.3 * length / AVG_LENGTH);
if (length > AVG_LENGTH) {
length = AVG_LENGTH;
}
return (0.5 + 0.5 * length / AVG_LENGTH);
}
private double[] getTermWeights(EdgeSearchResultKeywordScore[] scores) {
double[] weights = new double[scores.length];
for (int i = 0; i < scores.length; i++) {
String[] parts = separator.split(scores[i].keyword);
String[] parts = separator.split(scores[i].keyword());
double sumScore = 0.;
int count = 0;
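The valuator is retuned in two ways: scores are now computed per search-term set and the lowest (best) per-set score is kept, and the length penalty operates on a clamped [2000, 5000] range of the length value (the decorator passes details.words) with the floor lowered from 0.7 to 0.5. A worked example of the new penalty curve:

public class LengthPenaltyExample {
    static double getLengthPenalty(int length) {
        final int MIN_LENGTH = 2000;  // was 500
        final int AVG_LENGTH = 5000;  // was 1400
        if (length < MIN_LENGTH) length = MIN_LENGTH;
        if (length > AVG_LENGTH) length = AVG_LENGTH;
        return 0.5 + 0.5 * length / AVG_LENGTH;
    }

    public static void main(String[] args) {
        System.out.println(getLengthPenalty(800));    // 0.7  (clamped up to 2000)
        System.out.println(getLengthPenalty(3500));   // 0.85
        System.out.println(getLengthPenalty(20000));  // 1.0  (clamped down to 5000)
    }
}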

View File

@ -3,21 +3,35 @@ package nu.marginalia.wmsa.edge.tools;
import com.google.inject.Guice;
import com.google.inject.Inject;
import com.google.inject.Injector;
import nu.marginalia.util.language.processing.SentenceExtractor;
import nu.marginalia.wmsa.configuration.WmsaHome;
import nu.marginalia.wmsa.edge.converting.ConverterModule;
import nu.marginalia.wmsa.edge.converting.processor.DomainProcessor;
import nu.marginalia.wmsa.edge.converting.processor.logic.DomPruner;
import nu.marginalia.wmsa.edge.converting.processor.logic.topic.RecipeDetector;
import nu.marginalia.wmsa.edge.converting.processor.logic.topic.TextileCraftDetector;
import nu.marginalia.wmsa.edge.converting.processor.logic.topic.WoodworkingDetector;
import nu.marginalia.wmsa.edge.crawling.CrawlPlanLoader;
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
import nu.marginalia.wmsa.edge.model.EdgeCrawlPlan;
import org.jsoup.Jsoup;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.nio.file.Path;
import java.util.concurrent.ForkJoinPool;
public class ConverterLogicTestTool {
private final Logger logger = LoggerFactory.getLogger(getClass());
DomPruner domPruner = new DomPruner();
RecipeDetector recipeDetector = new RecipeDetector();
WoodworkingDetector woodworkingDetector = new WoodworkingDetector();
TextileCraftDetector textileCraftDetector = new TextileCraftDetector();
SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels());
public static void main(String... args) throws IOException {
if (args.length != 1) {
@ -38,19 +52,42 @@ public class ConverterLogicTestTool {
EdgeCrawlPlan plan,
DomainProcessor processor
) throws Exception {
var cp = new ForkJoinPool(16);
plan.forEachCrawledDomain(domain -> {
var ret = processor.process(domain);
ret.documents.forEach(doc -> {
if (doc.words == null)
return;
var artifacts = doc.words.get(IndexBlock.Artifacts);
if (artifacts.size() > 0) {
System.out.println(doc.url + ": " + artifacts);
}
});
});
if (domain.doc == null) return;
for (var doc : domain.doc) {
if (doc.documentBody == null) continue;
Runnable task = () -> {
var parsed = Jsoup.parse(doc.documentBody);
domPruner.prune(parsed, 0.5);
var dld = se.extractSentences(parsed);
if (dld.totalNumWords() < 250)
return;
if (textileCraftDetector.testP(dld) > 0.3) {
System.out.println("textilecraft\t" + doc.url);
}
if (woodworkingDetector.testP(dld) > 0.2) {
System.out.println("woodworking\t" + doc.url);
}
if (recipeDetector.testP(dld) > 0.5) {
System.out.println("recipe\t" + doc.url);
}
};
if (cp.getQueuedSubmissionCount() > 32) {
task.run();
} else {
cp.execute(task);
}
}
});
}
}
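The test tool now fans document classification out over a ForkJoinPool, but falls back to running the task on the calling thread once more than 32 submissions are queued, a simple caller-runs form of backpressure. A minimal sketch of that throttling pattern:

import java.util.concurrent.ForkJoinPool;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicLong;

public class BackpressureSketch {
    public static void main(String[] args) {
        ForkJoinPool cp = new ForkJoinPool(16);
        AtomicLong processed = new AtomicLong();

        for (int i = 0; i < 1_000; i++) {
            Runnable task = processed::incrementAndGet;  // stand-in for parse + topic detection

            if (cp.getQueuedSubmissionCount() > 32) {
                task.run();       // caller runs the task: the producer slows itself down
            } else {
                cp.execute(task);
            }
        }

        cp.awaitQuiescence(10, TimeUnit.SECONDS);
        System.out.println(processed.get()); // 1000
    }
}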

View File

@ -151,15 +151,6 @@ i've
it's
it
i'm
1
2
3
4
5
6
7
8
9
.
..
...

View File

@ -42,7 +42,7 @@ class SqlLoadDomainLinksTest {
@Test
public void loadDomainLinks() {
var loader = new SqlLoadDomainLinks(dataSource);
loader.load(new DomainLink[] { new DomainLink(new EdgeDomain("www.marginalia.nu"), new EdgeDomain("memex.marginalia.nu")) });
loader.load(loaderData, new DomainLink[] { new DomainLink(new EdgeDomain("www.marginalia.nu"), new EdgeDomain("memex.marginalia.nu")) });
}
}

View File

@ -113,38 +113,13 @@ class SentenceExtractorTest {
var dict = new TermFrequencyDict(lm);
DocumentKeywordExtractor documentKeywordExtractor = new DocumentKeywordExtractor(dict);
// documentKeywordExtractorLegacy.setLegacy(true);
// for (;;) {
long st = System.currentTimeMillis();
for (var file : Objects.requireNonNull(data.toFile().listFiles())) {
var newResult = newSe.extractSentences(Jsoup.parse(Files.readString(file.toPath())));
var newRes = documentKeywordExtractor.extractKeywords(newResult);
// var legacyRes = documentKeywordExtractorLegacy.extractKeywords(newResult);
//
// EdgePageWordSet difference = new EdgePageWordSet();
// for (IndexBlock block : IndexBlock.values()) {
// var newWords = new HashSet<>(newRes.get(block).words);
// var oldWords = new HashSet<>(legacyRes.get(block).words);
// newWords.removeAll(oldWords);
// if (!newWords.isEmpty()) {
// difference.append(block, newWords);
// }
// }
// System.out.println(difference);
System.out.println(newRes);
// System.out.println("---");
}
System.out.println(System.currentTimeMillis() - st);
// }
long st = System.currentTimeMillis();
for (var file : Objects.requireNonNull(data.toFile().listFiles())) {
var newResult = newSe.extractSentences(Jsoup.parse(Files.readString(file.toPath())));
var newRes = documentKeywordExtractor.extractKeywords(newResult);
System.out.println(newRes);
}
System.out.println(System.currentTimeMillis() - st);
}

View File

@ -1,156 +0,0 @@
package nu.marginalia.wmsa.edge.index.service;
import com.zaxxer.hikari.HikariDataSource;
import lombok.SneakyThrows;
import nu.marginalia.util.TestUtil;
import nu.marginalia.wmsa.configuration.server.Context;
import nu.marginalia.wmsa.configuration.server.Initialization;
import nu.marginalia.wmsa.edge.index.EdgeIndexService;
import nu.marginalia.wmsa.edge.index.IndexServicesFactory;
import nu.marginalia.wmsa.edge.index.client.EdgeIndexClient;
import nu.marginalia.wmsa.edge.index.conversion.SearchIndexPartitioner;
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
import nu.marginalia.wmsa.edge.index.reader.SearchIndexes;
import nu.marginalia.wmsa.edge.model.EdgeId;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet;
import nu.marginalia.wmsa.edge.model.crawl.EdgePageWords;
import nu.marginalia.wmsa.edge.model.search.EdgeSearchSpecification;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Tag;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.parallel.Execution;
import org.junit.jupiter.api.parallel.ExecutionMode;
import org.junit.jupiter.api.parallel.ResourceAccessMode;
import org.junit.jupiter.api.parallel.ResourceLock;
import spark.Spark;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;
import static nu.marginalia.util.TestUtil.getConnection;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;
@ResourceLock(value = "mariadb", mode = ResourceAccessMode.READ_WRITE)
@Execution(ExecutionMode.SAME_THREAD)
@Tag("db")
public class EdgeIndexClientTest {
private static HikariDataSource dataSource;
private static EdgeIndexService service;
private static EdgeIndexClient client;
private static Path tempDir;
private static SearchIndexes indexes;
@SneakyThrows
public static HikariDataSource provideConnection() {
return getConnection();
}
static final int testPort = TestUtil.getPort();
@SneakyThrows
@BeforeAll
public static void setUpClass() {
Spark.port(testPort);
System.setProperty("service-name", "edge-index");
dataSource = provideConnection();
dataSource.setKeepaliveTime(100);
dataSource.setIdleTimeout(100);
client = new EdgeIndexClient();
client.setServiceRoute("127.0.0.1", testPort);
tempDir = Files.createTempDirectory("EdgeIndexClientTest");
var servicesFactory = new IndexServicesFactory(tempDir,tempDir,tempDir,tempDir,
"writer-index",
"writer-dictionary",
"index-words-read",
"index-urls-read",
"index-words-write",
"index-urls-write",
1L<<24,
id->false,
new SearchIndexPartitioner(null)
);
var init = new Initialization();
indexes = new SearchIndexes(servicesFactory, new SearchIndexPartitioner(null));
service = new EdgeIndexService("127.0.0.1",
testPort,
init, null,
indexes,
servicesFactory);
Spark.awaitInitialization();
init.setReady();
}
@Test
public void testMultiBucketHit() {
putWords(1, 1, -2, "fancy", "anagram", "dilbert", "whoah", "engram");
putWords(2, 2, -5, "quibble", "angry", "whoah", "fancy");
putWords(3, 3, -0.01, "strong", "manly", "muscles");
indexes.repartition();
indexes.preconvert();
indexes.reindexAll();
var results = client.query(Context.internal(), EdgeSearchSpecification.justIncludes("fancy")).resultsList.get(IndexBlock.Title).get(0).results;
System.out.println(results);
List<EdgeId<EdgeUrl>> flatResults = results.values().stream().flatMap(List::stream).map(rs -> rs.url).collect(Collectors.toList());
assertEquals(2, flatResults.size());
assertTrue(flatResults.contains(new EdgeId<EdgeUrl>(1)));
assertTrue(flatResults.contains(new EdgeId<EdgeUrl>(2)));
}
@Test
public void testHighHit() {
putWords(2, 5, -100, "trapphus");
indexes.repartition();
indexes.preconvert();
indexes.reindexAll();
var rsp = client.query(Context.internal(), EdgeSearchSpecification.justIncludes("trapphus"));
System.out.println(rsp);
assertEquals(5, rsp.resultsList.get(IndexBlock.Title).get(0).results.get(0).get(0).url.id());
}
@Test
public void testSearchDomain() {
putWords(8, 1, -2, "domain");
putWords(8, 2, -5, "domain");
putWords(10, 3, -0.01, "domain");
putWords(11, 3, -0.01, "domain");
putWords(12, 3, -0.01, "domain");
indexes.repartition();
indexes.preconvert();
indexes.reindexAll();
var results = client.query(Context.internal(), EdgeSearchSpecification.justIncludes("fancy")).resultsList.get(IndexBlock.Title).get(0).results;
System.out.println(results);
List<EdgeId<EdgeUrl>> flatResults = results.values().stream().flatMap(List::stream).map(rs -> rs.url).collect(Collectors.toList());
assertEquals(2, flatResults.size());
assertTrue(flatResults.contains(new EdgeId<EdgeUrl>(1)));
assertTrue(flatResults.contains(new EdgeId<EdgeUrl>(2)));
}
void putWords(int didx, int idx, double quality, String... words) {
EdgePageWords epw = new EdgePageWords(IndexBlock.Title);
epw.addAll(Arrays.asList(words));
client.putWords(Context.internal(), new EdgeId<>(didx), new EdgeId<>(idx),
new EdgePageWordSet(epw), 0).blockingSubscribe();
}
@AfterAll
public static void tearDownClass() {
nu.marginalia.util.test.TestUtil.clearTempDir(tempDir);
}
}

View File

@ -24,7 +24,7 @@ class BodyQueryParserTest {
public static void init() throws IOException {
dict = new TermFrequencyDict(lm);
nGramBloomFilter = new NGramBloomFilter(lm);
englishDictionary = new EnglishDictionary(dict);
englishDictionary = new EnglishDictionary(nGramBloomFilter);
}
@BeforeEach

View File

@ -1,17 +0,0 @@
package nu.marginalia.wmsa.edge.search.query;
import nu.marginalia.util.TestLanguageModels;
import nu.marginalia.util.language.conf.LanguageModels;
import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict;
import org.junit.jupiter.api.Test;
class EnglishDictionaryTest {
@Test
void getWordVariants() {
LanguageModels lm = TestLanguageModels.getLanguageModels();
var dict = new TermFrequencyDict(lm);
new EnglishDictionary(dict).getWordVariants("dos").forEach(System.out::println);
}
}

View File

@ -21,7 +21,7 @@ class QueryParserTest {
public void setUp() throws IOException {
dict = new TermFrequencyDict(lm);
nGramBloomFilter = new NGramBloomFilter(lm);
englishDictionary = new EnglishDictionary(dict);
englishDictionary = new EnglishDictionary(nGramBloomFilter);
parser = new QueryParser(englishDictionary, new QueryVariants(lm, dict, nGramBloomFilter, englishDictionary));
}

View File

@ -23,8 +23,8 @@ class QueryVariantsTest {
var dict = new TermFrequencyDict(lm);
var ngrams = new NGramBloomFilter(lm);
variants = new QueryVariants(lm, dict, ngrams, new EnglishDictionary(dict));
parser = new QueryParser(new EnglishDictionary(dict), variants);
variants = new QueryVariants(lm, dict, ngrams, new EnglishDictionary(ngrams));
parser = new QueryParser(new EnglishDictionary(ngrams), variants);
}
@Test