Change TF-IDF normalization to reduce the number of not-so-relevant matches.

vlofgren 2022-08-27 11:38:29 +02:00
parent f4ad7aaf33
commit c865d6c6b2
6 changed files with 68 additions and 59 deletions
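The gist of the change: the per-term TF-IDF weight used to scale the raw in-document count logarithmically, and the extractor then took the top of a sorted list. The weight is now max-normalized (the term's count divided by the count of the document's most frequent term), and terms are bucketed into top/mid/lower significance bands by fixed thresholds. A minimal standalone sketch of the before/after weighting, with illustrative parameter names (termCount, maxCount and docFreq are not the names used in the codebase; the corpus-size constant 11820118 is the one already present in the code):

class TermValueSketch {
    // Old: log-scaled term frequency
    static double oldValue(double termCount, double docFreq) {
        return (1 + Math.log(termCount)) * Math.log((1.1 + docFreq) / 11820118.);
    }

    // New: max-normalized ("augmented") term frequency, so the first factor stays in [0.1, 1.0]
    static double newValue(double termCount, double maxCount, double docFreq) {
        return (0.1 + 0.9 * termCount / maxCount) * Math.log((1.1 + docFreq) / 11820118.);
    }
}

Because docFreq is far smaller than the corpus constant, the logarithm is negative; the more negative the result, the rarer and more significant the term. Bounding the term-frequency factor keeps heavily repeated words from dominating, which is presumably how the change reduces the not-so-relevant matches.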


@@ -69,7 +69,7 @@ public class DocumentDebugger {
  Set<String> reps = new HashSet<>();
  // kc.count(languageData, 0.75).forEach(rep -> reps.add(rep.stemmed));
- kc.count(languageData).forEach(rep -> reps.add(rep.stemmed));
+ // kc.count(languageData).forEach(rep -> reps.add(rep.stemmed));
  try (var pw = new PrintWriter(new FileOutputStream(output.toFile()))) {


@@ -40,35 +40,27 @@ public class DocumentKeywordExtractor {
  List<WordRep> titleWords = extractTitleWords(documentLanguageData);
- List<WordRep> wordsTfIdf = tfIdfCounter.count(documentLanguageData);
+ KeywordCounter.WordHistogram wordsTfIdf = tfIdfCounter.countHisto(documentLanguageData);
  List<WordRep> wordsNamesRepeated = nameCounter.count(documentLanguageData, 2);
  List<WordRep> wordsNamesAll = nameCounter.count(documentLanguageData, 1);
  List<WordRep> subjects = subjectCounter.count(documentLanguageData);
- int totalSize = wordsTfIdf.size();
+ List<WordRep> lowKeywords = new ArrayList<>(wordsTfIdf.lower());
+ List<WordRep> midKeywords = new ArrayList<>(wordsTfIdf.mid());
+ List<WordRep> topKeywords = new ArrayList<>(wordsTfIdf.top());
- List<WordRep> lowKeywords = new ArrayList<>(totalSize / 2);
- List<WordRep> midKeywords = new ArrayList<>(totalSize / 2);
- List<WordRep> topKeywords = new ArrayList<>(totalSize / 2);
- for(var v : wordsTfIdf) {
-     if (topKeywords.size() <= totalSize / 10) topKeywords.add(v);
-     else if (midKeywords.size() <= totalSize / 5) midKeywords.add(v);
-     else lowKeywords.add(v);
- }
- var wordsToMatchWithTitle = joinWordLists(topKeywords, midKeywords, wordsNamesRepeated, subjects);
+ var wordsToMatchWithTitle = joinWordLists(topKeywords, wordsNamesRepeated, subjects);
  Collection<String> artifacts = getArtifacts(documentLanguageData);
  var wordSet = new EdgePageWordSet(
      createWords(IndexBlock.TitleKeywords, overlappingStems(titleWords, wordsToMatchWithTitle)),
-     createWords(IndexBlock.Topic, subjects),
+     createWords(IndexBlock.Subjects, subjects),
      createWords(IndexBlock.Title, titleWords),
      createWords(IndexBlock.NamesWords, wordsNamesAll),
-     createWords(IndexBlock.Top, topKeywords),
-     createWords(IndexBlock.Middle, midKeywords),
-     createWords(IndexBlock.Low, lowKeywords),
+     createWords(IndexBlock.Tfidf_Top, topKeywords),
+     createWords(IndexBlock.Tfidf_Middle, midKeywords),
+     createWords(IndexBlock.Tfidf_Lower, lowKeywords),
      new EdgePageWords(IndexBlock.Artifacts, artifacts)
  );
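In short, DocumentKeywordExtractor no longer sorts all TF-IDF terms and slices the list proportionally (top ~10%, next ~20%, rest); it consumes the pre-bucketed histogram returned by KeywordCounter.countHisto and writes each band into the renamed Tfidf_* index blocks. A rough sketch of the new call site, with the EdgePageWordSet construction and the other extractors omitted:

KeywordCounter.WordHistogram histogram = tfIdfCounter.countHisto(documentLanguageData);
List<WordRep> topKeywords = new ArrayList<>(histogram.top());    // stored under IndexBlock.Tfidf_Top
List<WordRep> midKeywords = new ArrayList<>(histogram.mid());    // stored under IndexBlock.Tfidf_Middle
List<WordRep> lowKeywords = new ArrayList<>(histogram.lower());  // stored under IndexBlock.Tfidf_Lower

Note that only the top band (together with repeated names and subjects) is now matched against the title; previously the mid band took part in that matching as well.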


@@ -4,9 +4,11 @@ import nu.marginalia.util.language.processing.model.DocumentLanguageData;
  import nu.marginalia.util.language.processing.model.WordRep;
  import nu.marginalia.wmsa.edge.assistant.dict.NGramDict;
- import java.util.*;
+ import java.util.HashMap;
+ import java.util.HashSet;
+ import java.util.Map;
+ import java.util.Set;
  import java.util.regex.Pattern;
- import java.util.stream.Collectors;
  public class KeywordCounter {
  private final KeywordExtractor keywordExtractor;
@@ -17,10 +19,11 @@ public class KeywordCounter {
  this.keywordExtractor = keywordExtractor;
  }
- public List<WordRep> count(DocumentLanguageData dld) {
+ public WordHistogram countHisto(DocumentLanguageData dld) {
  HashMap<String, Double> counts = new HashMap<>(1000);
  HashMap<String, HashSet<WordRep>> instances = new HashMap<>(1000);
  for (var sent : dld.sentences) {
      var keywords = keywordExtractor.getKeywordsFromSentence(sent);
      for (var span : keywords) {
@@ -32,34 +35,44 @@ public class KeywordCounter {
  }
  }
- return counts.entrySet().stream()
-     .filter(e -> e.getValue() > 1)
-     .sorted(Comparator.comparing(this::getTermValue))
-     .map(Map.Entry::getKey)
-     .flatMap(w -> instances.get(w).stream())
-     .filter(w -> w.word.length() > 1)
-     .limit(150)
-     .collect(Collectors.toList());
+ double maxC = counts.values().stream().mapToDouble(Double::valueOf).max().orElse(1);
+ Set<WordRep> h5 = new HashSet<>();
+ Set<WordRep> h10 = new HashSet<>();
+ Set<WordRep> h15 = new HashSet<>();
+ for (var entry : counts.entrySet()) {
+     double value = getTermValue(entry, maxC);
+     Set<WordRep> histogram;
+     if (value < -3) histogram = h15;
+     else if (value < -2) histogram = h10;
+     else if (value < -1) histogram = h5;
+     else continue;
+     histogram.addAll(instances.get(entry.getKey()));
+ }
+ return new WordHistogram(h5, h10, h15);
  }
  private static final Pattern separator = Pattern.compile("_");
- public double getTermValue(Map.Entry<String, Double> e) {
+ public double getTermValue(Map.Entry<String, Double> e, double maxValue) {
  String[] parts = separator.split(e.getKey());
  double totalValue = 0.;
  for (String part : parts) {
-     totalValue += value(part, e.getValue());
+     totalValue += value(part, e.getValue(), maxValue);
  }
- return totalValue / Math.sqrt(parts.length);
+ return totalValue / parts.length;
  }
- double value(String key, double value) {
+ double value(String key, double value, double maxValue) {
  double freq = dict.getTermFreqStemmed(key);
  if (freq < 1) {
      freq = 10;
  }
- return (1+Math.log(value)) * Math.log((1.1+freq)/11820118.);
+ return (0.1 + 0.9*value/maxValue) * Math.log((1.1+freq)/11820118.);
  }
+ public record WordHistogram(Set<WordRep> lower, Set<WordRep> mid, Set<WordRep> top) { }
  }
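To make the thresholds concrete, a worked example (the counts and the corpus frequency of 1000 are made up; the corpus-size constant and the cut-offs are the ones in the code above). For a term that occurs 5 times in a document whose most frequent term occurs 10 times:

    (0.1 + 0.9 * 5 / 10.0) * Math.log((1.1 + 1000) / 11820118.)   // ≈ 0.55 * -9.38 ≈ -5.2, below -3: top band

The same document-side factor applied to a very common word (corpus frequency around 2,000,000) gives roughly 0.55 * -1.78 ≈ -0.98, which is above -1 and is dropped from the histogram altogether. The logarithm is always negative here, so "more negative" means rarer and more significant. getTermValue also averages the value over the underscore-separated parts of compound keywords, now as a plain mean rather than dividing by the square root of the part count, which weighs long compounds somewhat less favorably than before.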


@@ -3,16 +3,20 @@ package nu.marginalia.wmsa.edge.index.model;
  public enum IndexBlock {
  TitleKeywords(0, 0),
  Title(1, 1),
  Link(2, 1.25),
- Top(3, 2),
- Middle(4, 2.5),
- Low(5, 3.0),
- Words_1(6, 3.0),
- Meta(7, 7),
- Words_2(8, 3.5),
- NamesWords(9, 5),
- Artifacts(10, 10),
- Topic(11, 0.5),
+ Subjects(3, 0.5),
+ NamesWords(4, 5),
+ Artifacts(5, 10),
+ Meta(6, 7),
+ Tfidf_Top(7, 2),
+ Tfidf_Middle(8, 2.5),
+ Tfidf_Lower(9, 5.0),
+ Words_1(10, 3.0),
+ Words_2(11, 3.5),
  Words_4(12, 4.0),
  Words_8(13, 4.5),
  Words_16Plus(14, 7.0),
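For orientation: this hunk does not show the enum's constructor, but the second argument reads like a ranking weight, with larger values meaning a lower-priority block (an assumption, as is the field naming below). Under that reading, the old Topic(11, 0.5) simply becomes Subjects(3, 0.5), while the lowest TF-IDF band goes from Low(5, 3.0) to Tfidf_Lower(9, 5.0), pushing weak TF-IDF matches further down. A trimmed, hypothetical re-declaration just to show the assumed shape:

public enum IndexBlock {
    Tfidf_Top(7, 2), Tfidf_Middle(8, 2.5), Tfidf_Lower(9, 5.0);

    public final int id;
    public final double weight;   // assumed meaning: larger = weighted lower in results

    IndexBlock(int id, double weight) {
        this.id = id;
        this.weight = weight;
    }
}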


@@ -26,9 +26,9 @@ public class SearchIndexReader implements AutoCloseable {
  private final Logger logger = LoggerFactory.getLogger(getClass());
  private static final IndexBlock[] indicesBySearchOrder = new IndexBlock[] {
-     IndexBlock.Top,
-     IndexBlock.Middle,
-     IndexBlock.Low,
+     IndexBlock.Tfidf_Top,
+     IndexBlock.Tfidf_Middle,
+     IndexBlock.Tfidf_Lower,
      IndexBlock.NamesWords,
      IndexBlock.Words_1,
      IndexBlock.Words_2,
@@ -42,15 +42,15 @@ public class SearchIndexReader implements AutoCloseable {
  EnumMap<IndexBlock, SearchIndex> indices) {
  this.indices = indices;
- var lowIndex = indices.get(IndexBlock.Low);
- var midIndex = indices.get(IndexBlock.Middle);
- var topIndex = indices.get(IndexBlock.Top);
+ var lowIndex = indices.get(IndexBlock.Tfidf_Lower);
+ var midIndex = indices.get(IndexBlock.Tfidf_Middle);
+ var topIndex = indices.get(IndexBlock.Tfidf_Top);
  var linkIndex = indices.get(IndexBlock.Link);
  var titleIndex = indices.get(IndexBlock.Title);
  var namesIndex = indices.get(IndexBlock.NamesWords);
  var titleKeywordsIndex = indices.get(IndexBlock.TitleKeywords);
  var metaIndex = indices.get(IndexBlock.Meta);
- var topicIndex = indices.get(IndexBlock.Topic);
+ var topicIndex = indices.get(IndexBlock.Subjects);
  var words1 = indices.get(IndexBlock.Words_1);
  var words2 = indices.get(IndexBlock.Words_2);
@@ -70,7 +70,7 @@ public class SearchIndexReader implements AutoCloseable {
  underspecifiedQueryBuilders.put(IndexBlock.TitleKeywords, new IndexQueryBuilder(listOfNonNulls(titleKeywordsIndex, linkIndex, topicIndex, topIndex, midIndex, lowIndex, namesIndex, metaIndex), words1));
  underspecifiedQueryBuilders.put(IndexBlock.Title, new IndexQueryBuilder(listOfNonNulls(titleIndex, topicIndex, linkIndex, topicIndex, topIndex, midIndex, lowIndex, namesIndex, metaIndex), words1));
- underspecifiedQueryBuilders.put(IndexBlock.Top, new IndexQueryBuilder(listOfNonNulls(topIndex, linkIndex, midIndex, lowIndex, namesIndex, metaIndex), words1));
+ underspecifiedQueryBuilders.put(IndexBlock.Tfidf_Top, new IndexQueryBuilder(listOfNonNulls(topIndex, linkIndex, midIndex, lowIndex, namesIndex, metaIndex), words1));
  }
  @SafeVarargs


@@ -10,31 +10,31 @@ import java.util.stream.Collectors;
  public enum EdgeSearchProfile {
  DEFAULT("default",
-     List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Link,
+     List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Tfidf_Top, IndexBlock.Link,
      IndexBlock.Words_1, IndexBlock.Words_2, IndexBlock.Words_4, IndexBlock.Words_8, IndexBlock.Words_16Plus
      ),
      0, 1),
  MODERN("modern",
-     List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Link, IndexBlock.NamesWords,
+     List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Tfidf_Top, IndexBlock.Tfidf_Middle, IndexBlock.Link, IndexBlock.NamesWords,
      IndexBlock.Words_1, IndexBlock.Words_2, IndexBlock.Words_4, IndexBlock.Words_8, IndexBlock.Words_16Plus
      ),
      2),
  CORPO("corpo",
-     List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Low, IndexBlock.Link, IndexBlock.NamesWords,
+     List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Tfidf_Top, IndexBlock.Tfidf_Lower, IndexBlock.Link, IndexBlock.NamesWords,
      IndexBlock.Words_1, IndexBlock.Words_2, IndexBlock.Words_4, IndexBlock.Words_8, IndexBlock.Words_16Plus),
      4, 5, 7),
  YOLO("yolo",
-     List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Low, IndexBlock.Link, IndexBlock.NamesWords,
+     List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Tfidf_Top, IndexBlock.Tfidf_Lower, IndexBlock.Link, IndexBlock.NamesWords,
      IndexBlock.Words_1, IndexBlock.Words_2, IndexBlock.Words_4, IndexBlock.Words_8, IndexBlock.Words_16Plus),
      0, 2, 1, 3, 4, 6),
  CORPO_CLEAN("corpo-clean",
-     List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.NamesWords),
+     List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Tfidf_Top, IndexBlock.Tfidf_Middle, IndexBlock.Tfidf_Lower, IndexBlock.Link, IndexBlock.NamesWords),
      4, 5),
  ACADEMIA("academia",
-     List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.NamesWords),
+     List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Tfidf_Top, IndexBlock.Tfidf_Middle, IndexBlock.Tfidf_Lower, IndexBlock.Link, IndexBlock.NamesWords),
      3),
  FOOD("food",
-     List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.Words_1, IndexBlock.NamesWords),
+     List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Tfidf_Top, IndexBlock.Tfidf_Middle, IndexBlock.Tfidf_Lower, IndexBlock.Link, IndexBlock.Words_1, IndexBlock.NamesWords),
      2, 0),
  ;