Change TF-IDF normalization to reduce the number of not-so-relevant matches.

vlofgren 2022-08-27 11:38:29 +02:00
parent f4ad7aaf33
commit c865d6c6b2
6 changed files with 68 additions and 59 deletions
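The gist of the change: the per-term TF-IDF weight used to scale the raw in-document count logarithmically, and the extractor then took the top of a sorted list. The weight is now max-normalized (the term's count divided by the count of the document's most frequent term), and terms are bucketed into top/mid/lower significance bands by fixed thresholds. A minimal standalone sketch of the before/after weighting, with illustrative parameter names (termCount, maxCount and docFreq are not the names used in the codebase; the corpus-size constant 11820118 is the one already present in the code):

class TermValueSketch {
    // Old: log-scaled term frequency
    static double oldValue(double termCount, double docFreq) {
        return (1 + Math.log(termCount)) * Math.log((1.1 + docFreq) / 11820118.);
    }

    // New: max-normalized ("augmented") term frequency, so the first factor stays in [0.1, 1.0]
    static double newValue(double termCount, double maxCount, double docFreq) {
        return (0.1 + 0.9 * termCount / maxCount) * Math.log((1.1 + docFreq) / 11820118.);
    }
}

Because docFreq is far smaller than the corpus constant, the logarithm is negative; the more negative the result, the rarer and more significant the term. Bounding the term-frequency factor keeps heavily repeated words from dominating, which is presumably how the change reduces the not-so-relevant matches.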


@@ -69,7 +69,7 @@ public class DocumentDebugger {
  Set<String> reps = new HashSet<>();
  // kc.count(languageData, 0.75).forEach(rep -> reps.add(rep.stemmed));
- kc.count(languageData).forEach(rep -> reps.add(rep.stemmed));
+ // kc.count(languageData).forEach(rep -> reps.add(rep.stemmed));
  try (var pw = new PrintWriter(new FileOutputStream(output.toFile()))) {


@@ -40,35 +40,27 @@ public class DocumentKeywordExtractor {
  List<WordRep> titleWords = extractTitleWords(documentLanguageData);
- List<WordRep> wordsTfIdf = tfIdfCounter.count(documentLanguageData);
+ KeywordCounter.WordHistogram wordsTfIdf = tfIdfCounter.countHisto(documentLanguageData);
  List<WordRep> wordsNamesRepeated = nameCounter.count(documentLanguageData, 2);
  List<WordRep> wordsNamesAll = nameCounter.count(documentLanguageData, 1);
  List<WordRep> subjects = subjectCounter.count(documentLanguageData);
- int totalSize = wordsTfIdf.size();
+ List<WordRep> lowKeywords = new ArrayList<>(wordsTfIdf.lower());
+ List<WordRep> midKeywords = new ArrayList<>(wordsTfIdf.mid());
+ List<WordRep> topKeywords = new ArrayList<>(wordsTfIdf.top());
- List<WordRep> lowKeywords = new ArrayList<>(totalSize / 2);
- List<WordRep> midKeywords = new ArrayList<>(totalSize / 2);
- List<WordRep> topKeywords = new ArrayList<>(totalSize / 2);
- for(var v : wordsTfIdf) {
-     if (topKeywords.size() <= totalSize / 10) topKeywords.add(v);
-     else if (midKeywords.size() <= totalSize / 5) midKeywords.add(v);
-     else lowKeywords.add(v);
- }
- var wordsToMatchWithTitle = joinWordLists(topKeywords, midKeywords, wordsNamesRepeated, subjects);
+ var wordsToMatchWithTitle = joinWordLists(topKeywords, wordsNamesRepeated, subjects);
  Collection<String> artifacts = getArtifacts(documentLanguageData);
  var wordSet = new EdgePageWordSet(
      createWords(IndexBlock.TitleKeywords, overlappingStems(titleWords, wordsToMatchWithTitle)),
-     createWords(IndexBlock.Topic, subjects),
+     createWords(IndexBlock.Subjects, subjects),
      createWords(IndexBlock.Title, titleWords),
      createWords(IndexBlock.NamesWords, wordsNamesAll),
-     createWords(IndexBlock.Top, topKeywords),
-     createWords(IndexBlock.Middle, midKeywords),
-     createWords(IndexBlock.Low, lowKeywords),
+     createWords(IndexBlock.Tfidf_Top, topKeywords),
+     createWords(IndexBlock.Tfidf_Middle, midKeywords),
+     createWords(IndexBlock.Tfidf_Lower, lowKeywords),
      new EdgePageWords(IndexBlock.Artifacts, artifacts)
  );
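In short, DocumentKeywordExtractor no longer sorts all TF-IDF terms and slices the list proportionally (top ~10%, next ~20%, rest); it consumes the pre-bucketed histogram returned by KeywordCounter.countHisto and writes each band into the renamed Tfidf_* index blocks. A rough sketch of the new call site, with the EdgePageWordSet construction and the other extractors omitted:

KeywordCounter.WordHistogram histogram = tfIdfCounter.countHisto(documentLanguageData);
List<WordRep> topKeywords = new ArrayList<>(histogram.top());    // stored under IndexBlock.Tfidf_Top
List<WordRep> midKeywords = new ArrayList<>(histogram.mid());    // stored under IndexBlock.Tfidf_Middle
List<WordRep> lowKeywords = new ArrayList<>(histogram.lower());  // stored under IndexBlock.Tfidf_Lower

Note that only the top band (together with repeated names and subjects) is now matched against the title; previously the mid band took part in that matching as well.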


@@ -4,9 +4,11 @@ import nu.marginalia.util.language.processing.model.DocumentLanguageData;
  import nu.marginalia.util.language.processing.model.WordRep;
  import nu.marginalia.wmsa.edge.assistant.dict.NGramDict;
- import java.util.*;
+ import java.util.HashMap;
+ import java.util.HashSet;
+ import java.util.Map;
+ import java.util.Set;
  import java.util.regex.Pattern;
- import java.util.stream.Collectors;
  public class KeywordCounter {
  private final KeywordExtractor keywordExtractor;
@@ -17,10 +19,11 @@ public class KeywordCounter {
  this.keywordExtractor = keywordExtractor;
  }
- public List<WordRep> count(DocumentLanguageData dld) {
+ public WordHistogram countHisto(DocumentLanguageData dld) {
  HashMap<String, Double> counts = new HashMap<>(1000);
  HashMap<String, HashSet<WordRep>> instances = new HashMap<>(1000);
  for (var sent : dld.sentences) {
      var keywords = keywordExtractor.getKeywordsFromSentence(sent);
      for (var span : keywords) {
@@ -32,34 +35,44 @@ public class KeywordCounter {
  }
  }
- return counts.entrySet().stream()
-     .filter(e -> e.getValue() > 1)
-     .sorted(Comparator.comparing(this::getTermValue))
-     .map(Map.Entry::getKey)
-     .flatMap(w -> instances.get(w).stream())
-     .filter(w -> w.word.length() > 1)
-     .limit(150)
-     .collect(Collectors.toList());
+ double maxC = counts.values().stream().mapToDouble(Double::valueOf).max().orElse(1);
+ Set<WordRep> h5 = new HashSet<>();
+ Set<WordRep> h10 = new HashSet<>();
+ Set<WordRep> h15 = new HashSet<>();
+ for (var entry : counts.entrySet()) {
+     double value = getTermValue(entry, maxC);
+     Set<WordRep> histogram;
+     if (value < -3) histogram = h15;
+     else if (value < -2) histogram = h10;
+     else if (value < -1) histogram = h5;
+     else continue;
+     histogram.addAll(instances.get(entry.getKey()));
+ }
+ return new WordHistogram(h5, h10, h15);
  }
  private static final Pattern separator = Pattern.compile("_");
- public double getTermValue(Map.Entry<String, Double> e) {
+ public double getTermValue(Map.Entry<String, Double> e, double maxValue) {
  String[] parts = separator.split(e.getKey());
  double totalValue = 0.;
  for (String part : parts) {
-     totalValue += value(part, e.getValue());
+     totalValue += value(part, e.getValue(), maxValue);
  }
- return totalValue / Math.sqrt(parts.length);
+ return totalValue / parts.length;
  }
- double value(String key, double value) {
+ double value(String key, double value, double maxValue) {
  double freq = dict.getTermFreqStemmed(key);
  if (freq < 1) {
      freq = 10;
  }
- return (1+Math.log(value)) * Math.log((1.1+freq)/11820118.);
+ return (0.1 + 0.9*value/maxValue) * Math.log((1.1+freq)/11820118.);
  }
+ public record WordHistogram(Set<WordRep> lower, Set<WordRep> mid, Set<WordRep> top) { }
  }
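To make the thresholds concrete, a worked example (the counts and the corpus frequency of 1000 are made up; the corpus-size constant and the cut-offs are the ones in the code above). For a term that occurs 5 times in a document whose most frequent term occurs 10 times:

    (0.1 + 0.9 * 5 / 10.0) * Math.log((1.1 + 1000) / 11820118.)   // ≈ 0.55 * -9.38 ≈ -5.2, below -3: top band

The same document-side factor applied to a very common word (corpus frequency around 2,000,000) gives roughly 0.55 * -1.78 ≈ -0.98, which is above -1 and is dropped from the histogram altogether. The logarithm is always negative here, so "more negative" means rarer and more significant. getTermValue also averages the value over the underscore-separated parts of compound keywords, now as a plain mean rather than dividing by the square root of the part count, which weighs long compounds somewhat less favorably than before.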


@@ -3,16 +3,20 @@ package nu.marginalia.wmsa.edge.index.model;
  public enum IndexBlock {
  TitleKeywords(0, 0),
  Title(1, 1),
  Link(2, 1.25),
- Top(3, 2),
- Middle(4, 2.5),
- Low(5, 3.0),
- Words_1(6, 3.0),
- Meta(7, 7),
- Words_2(8, 3.5),
- NamesWords(9, 5),
- Artifacts(10, 10),
- Topic(11, 0.5),
+ Subjects(3, 0.5),
+ NamesWords(4, 5),
+ Artifacts(5, 10),
+ Meta(6, 7),
+ Tfidf_Top(7, 2),
+ Tfidf_Middle(8, 2.5),
+ Tfidf_Lower(9, 5.0),
+ Words_1(10, 3.0),
+ Words_2(11, 3.5),
  Words_4(12, 4.0),
  Words_8(13, 4.5),
  Words_16Plus(14, 7.0),
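For orientation: this hunk does not show the enum's constructor, but the second argument reads like a ranking weight, with larger values meaning a lower-priority block (an assumption, as is the field naming below). Under that reading, the old Topic(11, 0.5) simply becomes Subjects(3, 0.5), while the lowest TF-IDF band goes from Low(5, 3.0) to Tfidf_Lower(9, 5.0), pushing weak TF-IDF matches further down. A trimmed, hypothetical re-declaration just to show the assumed shape:

public enum IndexBlock {
    Tfidf_Top(7, 2), Tfidf_Middle(8, 2.5), Tfidf_Lower(9, 5.0);

    public final int id;
    public final double weight;   // assumed meaning: larger = weighted lower in results

    IndexBlock(int id, double weight) {
        this.id = id;
        this.weight = weight;
    }
}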


@@ -26,9 +26,9 @@ public class SearchIndexReader implements AutoCloseable {
  private final Logger logger = LoggerFactory.getLogger(getClass());
  private static final IndexBlock[] indicesBySearchOrder = new IndexBlock[] {
-     IndexBlock.Top,
-     IndexBlock.Middle,
-     IndexBlock.Low,
+     IndexBlock.Tfidf_Top,
+     IndexBlock.Tfidf_Middle,
+     IndexBlock.Tfidf_Lower,
      IndexBlock.NamesWords,
      IndexBlock.Words_1,
      IndexBlock.Words_2,
@@ -42,15 +42,15 @@ public class SearchIndexReader implements AutoCloseable {
  EnumMap<IndexBlock, SearchIndex> indices) {
  this.indices = indices;
- var lowIndex = indices.get(IndexBlock.Low);
- var midIndex = indices.get(IndexBlock.Middle);
- var topIndex = indices.get(IndexBlock.Top);
+ var lowIndex = indices.get(IndexBlock.Tfidf_Lower);
+ var midIndex = indices.get(IndexBlock.Tfidf_Middle);
+ var topIndex = indices.get(IndexBlock.Tfidf_Top);
  var linkIndex = indices.get(IndexBlock.Link);
  var titleIndex = indices.get(IndexBlock.Title);
  var namesIndex = indices.get(IndexBlock.NamesWords);
  var titleKeywordsIndex = indices.get(IndexBlock.TitleKeywords);
  var metaIndex = indices.get(IndexBlock.Meta);
- var topicIndex = indices.get(IndexBlock.Topic);
+ var topicIndex = indices.get(IndexBlock.Subjects);
  var words1 = indices.get(IndexBlock.Words_1);
  var words2 = indices.get(IndexBlock.Words_2);
@@ -70,7 +70,7 @@ public class SearchIndexReader implements AutoCloseable {
  underspecifiedQueryBuilders.put(IndexBlock.TitleKeywords, new IndexQueryBuilder(listOfNonNulls(titleKeywordsIndex, linkIndex, topicIndex, topIndex, midIndex, lowIndex, namesIndex, metaIndex), words1));
  underspecifiedQueryBuilders.put(IndexBlock.Title, new IndexQueryBuilder(listOfNonNulls(titleIndex, topicIndex, linkIndex, topicIndex, topIndex, midIndex, lowIndex, namesIndex, metaIndex), words1));
- underspecifiedQueryBuilders.put(IndexBlock.Top, new IndexQueryBuilder(listOfNonNulls(topIndex, linkIndex, midIndex, lowIndex, namesIndex, metaIndex), words1));
+ underspecifiedQueryBuilders.put(IndexBlock.Tfidf_Top, new IndexQueryBuilder(listOfNonNulls(topIndex, linkIndex, midIndex, lowIndex, namesIndex, metaIndex), words1));
  }
  @SafeVarargs


@@ -10,31 +10,31 @@ import java.util.stream.Collectors;
  public enum EdgeSearchProfile {
  DEFAULT("default",
-     List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Link,
+     List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Tfidf_Top, IndexBlock.Link,
      IndexBlock.Words_1, IndexBlock.Words_2, IndexBlock.Words_4, IndexBlock.Words_8, IndexBlock.Words_16Plus
      ),
      0, 1),
  MODERN("modern",
-     List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Link, IndexBlock.NamesWords,
+     List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Tfidf_Top, IndexBlock.Tfidf_Middle, IndexBlock.Link, IndexBlock.NamesWords,
      IndexBlock.Words_1, IndexBlock.Words_2, IndexBlock.Words_4, IndexBlock.Words_8, IndexBlock.Words_16Plus
      ),
      2),
  CORPO("corpo",
-     List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Low, IndexBlock.Link, IndexBlock.NamesWords,
+     List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Tfidf_Top, IndexBlock.Tfidf_Lower, IndexBlock.Link, IndexBlock.NamesWords,
      IndexBlock.Words_1, IndexBlock.Words_2, IndexBlock.Words_4, IndexBlock.Words_8, IndexBlock.Words_16Plus),
      4, 5, 7),
  YOLO("yolo",
-     List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Low, IndexBlock.Link, IndexBlock.NamesWords,
+     List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Tfidf_Top, IndexBlock.Tfidf_Lower, IndexBlock.Link, IndexBlock.NamesWords,
      IndexBlock.Words_1, IndexBlock.Words_2, IndexBlock.Words_4, IndexBlock.Words_8, IndexBlock.Words_16Plus),
      0, 2, 1, 3, 4, 6),
  CORPO_CLEAN("corpo-clean",
-     List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.NamesWords),
+     List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Tfidf_Top, IndexBlock.Tfidf_Middle, IndexBlock.Tfidf_Lower, IndexBlock.Link, IndexBlock.NamesWords),
      4, 5),
  ACADEMIA("academia",
-     List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.NamesWords),
+     List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Tfidf_Top, IndexBlock.Tfidf_Middle, IndexBlock.Tfidf_Lower, IndexBlock.Link, IndexBlock.NamesWords),
      3),
  FOOD("food",
-     List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.Words_1, IndexBlock.NamesWords),
+     List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Tfidf_Top, IndexBlock.Tfidf_Middle, IndexBlock.Tfidf_Lower, IndexBlock.Link, IndexBlock.Words_1, IndexBlock.NamesWords),
      2, 0),
  ;