Change TF-IDF normalization to reduce the number of less-relevant matches.
This commit is contained in: parent f4ad7aaf33, commit c865d6c6b2.
@ -69,7 +69,7 @@ public class DocumentDebugger {
|
||||
Set<String> reps = new HashSet<>();
|
||||
|
||||
// kc.count(languageData, 0.75).forEach(rep -> reps.add(rep.stemmed));
|
||||
kc.count(languageData).forEach(rep -> reps.add(rep.stemmed));
|
||||
// kc.count(languageData).forEach(rep -> reps.add(rep.stemmed));
|
||||
|
||||
try (var pw = new PrintWriter(new FileOutputStream(output.toFile()))) {
|
||||
|
||||
|
@ -40,35 +40,27 @@ public class DocumentKeywordExtractor {
|
||||
|
||||
List<WordRep> titleWords = extractTitleWords(documentLanguageData);
|
||||
|
||||
List<WordRep> wordsTfIdf = tfIdfCounter.count(documentLanguageData);
|
||||
KeywordCounter.WordHistogram wordsTfIdf = tfIdfCounter.countHisto(documentLanguageData);
|
||||
List<WordRep> wordsNamesRepeated = nameCounter.count(documentLanguageData, 2);
|
||||
List<WordRep> wordsNamesAll = nameCounter.count(documentLanguageData, 1);
|
||||
List<WordRep> subjects = subjectCounter.count(documentLanguageData);
|
||||
|
||||
int totalSize = wordsTfIdf.size();
|
||||
List<WordRep> lowKeywords = new ArrayList<>(wordsTfIdf.lower());
|
||||
List<WordRep> midKeywords = new ArrayList<>(wordsTfIdf.mid());
|
||||
List<WordRep> topKeywords = new ArrayList<>(wordsTfIdf.top());
|
||||
|
||||
List<WordRep> lowKeywords = new ArrayList<>(totalSize / 2);
|
||||
List<WordRep> midKeywords = new ArrayList<>(totalSize / 2);
|
||||
List<WordRep> topKeywords = new ArrayList<>(totalSize / 2);
|
||||
|
||||
for(var v : wordsTfIdf) {
|
||||
if (topKeywords.size() <= totalSize / 10) topKeywords.add(v);
|
||||
else if (midKeywords.size() <= totalSize / 5) midKeywords.add(v);
|
||||
else lowKeywords.add(v);
|
||||
}
|
||||
|
||||
var wordsToMatchWithTitle = joinWordLists(topKeywords, midKeywords, wordsNamesRepeated, subjects);
|
||||
var wordsToMatchWithTitle = joinWordLists(topKeywords, wordsNamesRepeated, subjects);
|
||||
|
||||
Collection<String> artifacts = getArtifacts(documentLanguageData);
|
||||
|
||||
var wordSet = new EdgePageWordSet(
|
||||
createWords(IndexBlock.TitleKeywords, overlappingStems(titleWords, wordsToMatchWithTitle)),
|
||||
createWords(IndexBlock.Topic, subjects),
|
||||
createWords(IndexBlock.Subjects, subjects),
|
||||
createWords(IndexBlock.Title, titleWords),
|
||||
createWords(IndexBlock.NamesWords, wordsNamesAll),
|
||||
createWords(IndexBlock.Top, topKeywords),
|
||||
createWords(IndexBlock.Middle, midKeywords),
|
||||
createWords(IndexBlock.Low, lowKeywords),
|
||||
createWords(IndexBlock.Tfidf_Top, topKeywords),
|
||||
createWords(IndexBlock.Tfidf_Middle, midKeywords),
|
||||
createWords(IndexBlock.Tfidf_Lower, lowKeywords),
|
||||
new EdgePageWords(IndexBlock.Artifacts, artifacts)
|
||||
);
|
||||
|
||||
|
@ -4,9 +4,11 @@ import nu.marginalia.util.language.processing.model.DocumentLanguageData;
|
||||
import nu.marginalia.util.language.processing.model.WordRep;
|
||||
import nu.marginalia.wmsa.edge.assistant.dict.NGramDict;
|
||||
|
||||
import java.util.*;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
public class KeywordCounter {
|
||||
private final KeywordExtractor keywordExtractor;
|
||||
@ -17,10 +19,11 @@ public class KeywordCounter {
|
||||
this.keywordExtractor = keywordExtractor;
|
||||
}
|
||||
|
||||
public List<WordRep> count(DocumentLanguageData dld) {
|
||||
public WordHistogram countHisto(DocumentLanguageData dld) {
|
||||
HashMap<String, Double> counts = new HashMap<>(1000);
|
||||
HashMap<String, HashSet<WordRep>> instances = new HashMap<>(1000);
|
||||
|
||||
|
||||
for (var sent : dld.sentences) {
|
||||
var keywords = keywordExtractor.getKeywordsFromSentence(sent);
|
||||
for (var span : keywords) {
|
||||
@ -32,34 +35,44 @@ public class KeywordCounter {
|
||||
}
|
||||
}
|
||||
|
||||
return counts.entrySet().stream()
|
||||
.filter(e -> e.getValue() > 1)
|
||||
.sorted(Comparator.comparing(this::getTermValue))
|
||||
.map(Map.Entry::getKey)
|
||||
.flatMap(w -> instances.get(w).stream())
|
||||
.filter(w -> w.word.length() > 1)
|
||||
.limit(150)
|
||||
.collect(Collectors.toList());
|
||||
double maxC = counts.values().stream().mapToDouble(Double::valueOf).max().orElse(1);
|
||||
|
||||
Set<WordRep> h5 = new HashSet<>();
|
||||
Set<WordRep> h10 = new HashSet<>();
|
||||
Set<WordRep> h15 = new HashSet<>();
|
||||
|
||||
for (var entry : counts.entrySet()) {
|
||||
double value = getTermValue(entry, maxC);
|
||||
Set<WordRep> histogram;
|
||||
if (value < -3) histogram = h15;
|
||||
else if (value < -2) histogram = h10;
|
||||
else if (value < -1) histogram = h5;
|
||||
else continue;
|
||||
|
||||
histogram.addAll(instances.get(entry.getKey()));
|
||||
}
|
||||
|
||||
return new WordHistogram(h5, h10, h15);
|
||||
}
|
||||
|
||||
private static final Pattern separator = Pattern.compile("_");
|
||||
|
||||
public double getTermValue(Map.Entry<String, Double> e) {
|
||||
public double getTermValue(Map.Entry<String, Double> e, double maxValue) {
|
||||
String[] parts = separator.split(e.getKey());
|
||||
double totalValue = 0.;
|
||||
for (String part : parts) {
|
||||
totalValue += value(part, e.getValue());
|
||||
totalValue += value(part, e.getValue(), maxValue);
|
||||
}
|
||||
return totalValue / Math.sqrt(parts.length);
|
||||
return totalValue / parts.length;
|
||||
}
|
||||
|
||||
double value(String key, double value) {
|
||||
double value(String key, double value, double maxValue) {
|
||||
double freq = dict.getTermFreqStemmed(key);
|
||||
if (freq < 1) {
|
||||
freq = 10;
|
||||
}
|
||||
return (1+Math.log(value)) * Math.log((1.1+freq)/11820118.);
|
||||
return (0.1 + 0.9*value/maxValue) * Math.log((1.1+freq)/11820118.);
|
||||
}
|
||||
|
||||
|
||||
public record WordHistogram(Set<WordRep> lower, Set<WordRep> mid, Set<WordRep> top) { }
|
||||
}
|
||||
|
@ -3,16 +3,20 @@ package nu.marginalia.wmsa.edge.index.model;
|
||||
public enum IndexBlock {
|
||||
TitleKeywords(0, 0),
|
||||
Title(1, 1),
|
||||
|
||||
Link(2, 1.25),
|
||||
Top(3, 2),
|
||||
Middle(4, 2.5),
|
||||
Low(5, 3.0),
|
||||
Words_1(6, 3.0),
|
||||
Meta(7, 7),
|
||||
Words_2(8, 3.5),
|
||||
NamesWords(9, 5),
|
||||
Artifacts(10, 10),
|
||||
Topic(11, 0.5),
|
||||
|
||||
Subjects(3, 0.5),
|
||||
NamesWords(4, 5),
|
||||
Artifacts(5, 10),
|
||||
Meta(6, 7),
|
||||
|
||||
Tfidf_Top(7, 2),
|
||||
Tfidf_Middle(8, 2.5),
|
||||
Tfidf_Lower(9, 5.0),
|
||||
|
||||
Words_1(10, 3.0),
|
||||
Words_2(11, 3.5),
|
||||
Words_4(12, 4.0),
|
||||
Words_8(13, 4.5),
|
||||
Words_16Plus(14, 7.0),
|
||||
|
@ -26,9 +26,9 @@ public class SearchIndexReader implements AutoCloseable {
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
|
||||
private static final IndexBlock[] indicesBySearchOrder = new IndexBlock[] {
|
||||
IndexBlock.Top,
|
||||
IndexBlock.Middle,
|
||||
IndexBlock.Low,
|
||||
IndexBlock.Tfidf_Top,
|
||||
IndexBlock.Tfidf_Middle,
|
||||
IndexBlock.Tfidf_Lower,
|
||||
IndexBlock.NamesWords,
|
||||
IndexBlock.Words_1,
|
||||
IndexBlock.Words_2,
|
||||
@ -42,15 +42,15 @@ public class SearchIndexReader implements AutoCloseable {
|
||||
EnumMap<IndexBlock, SearchIndex> indices) {
|
||||
this.indices = indices;
|
||||
|
||||
var lowIndex = indices.get(IndexBlock.Low);
|
||||
var midIndex = indices.get(IndexBlock.Middle);
|
||||
var topIndex = indices.get(IndexBlock.Top);
|
||||
var lowIndex = indices.get(IndexBlock.Tfidf_Lower);
|
||||
var midIndex = indices.get(IndexBlock.Tfidf_Middle);
|
||||
var topIndex = indices.get(IndexBlock.Tfidf_Top);
|
||||
var linkIndex = indices.get(IndexBlock.Link);
|
||||
var titleIndex = indices.get(IndexBlock.Title);
|
||||
var namesIndex = indices.get(IndexBlock.NamesWords);
|
||||
var titleKeywordsIndex = indices.get(IndexBlock.TitleKeywords);
|
||||
var metaIndex = indices.get(IndexBlock.Meta);
|
||||
var topicIndex = indices.get(IndexBlock.Topic);
|
||||
var topicIndex = indices.get(IndexBlock.Subjects);
|
||||
|
||||
var words1 = indices.get(IndexBlock.Words_1);
|
||||
var words2 = indices.get(IndexBlock.Words_2);
|
||||
@ -70,7 +70,7 @@ public class SearchIndexReader implements AutoCloseable {
|
||||
|
||||
underspecifiedQueryBuilders.put(IndexBlock.TitleKeywords, new IndexQueryBuilder(listOfNonNulls(titleKeywordsIndex, linkIndex, topicIndex, topIndex, midIndex, lowIndex, namesIndex, metaIndex), words1));
|
||||
underspecifiedQueryBuilders.put(IndexBlock.Title, new IndexQueryBuilder(listOfNonNulls(titleIndex, topicIndex, linkIndex, topicIndex, topIndex, midIndex, lowIndex, namesIndex, metaIndex), words1));
|
||||
underspecifiedQueryBuilders.put(IndexBlock.Top, new IndexQueryBuilder(listOfNonNulls(topIndex, linkIndex, midIndex, lowIndex, namesIndex, metaIndex), words1));
|
||||
underspecifiedQueryBuilders.put(IndexBlock.Tfidf_Top, new IndexQueryBuilder(listOfNonNulls(topIndex, linkIndex, midIndex, lowIndex, namesIndex, metaIndex), words1));
|
||||
}
|
||||
|
||||
@SafeVarargs
|
||||
|
@ -10,31 +10,31 @@ import java.util.stream.Collectors;
|
||||
|
||||
public enum EdgeSearchProfile {
|
||||
DEFAULT("default",
|
||||
List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Link,
|
||||
List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Tfidf_Top, IndexBlock.Link,
|
||||
IndexBlock.Words_1, IndexBlock.Words_2, IndexBlock.Words_4, IndexBlock.Words_8, IndexBlock.Words_16Plus
|
||||
),
|
||||
0, 1),
|
||||
MODERN("modern",
|
||||
List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Link, IndexBlock.NamesWords,
|
||||
List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Tfidf_Top, IndexBlock.Tfidf_Middle, IndexBlock.Link, IndexBlock.NamesWords,
|
||||
IndexBlock.Words_1, IndexBlock.Words_2, IndexBlock.Words_4, IndexBlock.Words_8, IndexBlock.Words_16Plus
|
||||
),
|
||||
2),
|
||||
CORPO("corpo",
|
||||
List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Low, IndexBlock.Link, IndexBlock.NamesWords,
|
||||
List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Tfidf_Top, IndexBlock.Tfidf_Lower, IndexBlock.Link, IndexBlock.NamesWords,
|
||||
IndexBlock.Words_1, IndexBlock.Words_2, IndexBlock.Words_4, IndexBlock.Words_8, IndexBlock.Words_16Plus),
|
||||
4, 5, 7),
|
||||
YOLO("yolo",
|
||||
List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Low, IndexBlock.Link, IndexBlock.NamesWords,
|
||||
List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Tfidf_Top, IndexBlock.Tfidf_Lower, IndexBlock.Link, IndexBlock.NamesWords,
|
||||
IndexBlock.Words_1, IndexBlock.Words_2, IndexBlock.Words_4, IndexBlock.Words_8, IndexBlock.Words_16Plus),
|
||||
0, 2, 1, 3, 4, 6),
|
||||
CORPO_CLEAN("corpo-clean",
|
||||
List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.NamesWords),
|
||||
List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Tfidf_Top, IndexBlock.Tfidf_Middle, IndexBlock.Tfidf_Lower, IndexBlock.Link, IndexBlock.NamesWords),
|
||||
4, 5),
|
||||
ACADEMIA("academia",
|
||||
List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.NamesWords),
|
||||
List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Tfidf_Top, IndexBlock.Tfidf_Middle, IndexBlock.Tfidf_Lower, IndexBlock.Link, IndexBlock.NamesWords),
|
||||
3),
|
||||
FOOD("food",
|
||||
List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.Words_1, IndexBlock.NamesWords),
|
||||
List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Tfidf_Top, IndexBlock.Tfidf_Middle, IndexBlock.Tfidf_Lower, IndexBlock.Link, IndexBlock.Words_1, IndexBlock.NamesWords),
|
||||
2, 0),
|
||||
;
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user