Expose additional functionality through WordsTfIdfCounts.

Raise the requirement for being flagged as high TF-IDF from 2 occurrences to 3.
This commit is contained in:
Viktor Lofgren 2023-06-12 17:42:31 +02:00 committed by Viktor
parent 4138233ddf
commit 443cf0cf1e

View File

@ -2,7 +2,6 @@ package nu.marginalia.keyword.extractors;
import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap;
import nu.marginalia.keyword.WordReps;
import nu.marginalia.language.WordPatterns;
import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.language.model.DocumentSentence;
import nu.marginalia.language.model.WordRep;
@ -18,7 +17,7 @@ import static java.lang.Math.max;
/** Extract counts and TF-IDF for the words in the document,
* keep track of high-scoring words for flagging
*/
public class WordsTfIdfCounts implements WordReps {
public class WordsTfIdfCounts implements WordReps, Comparator<WordRep> {
private final TermFrequencyDict dict;
private final double docCount;
@ -41,7 +40,7 @@ public class WordsTfIdfCounts implements WordReps {
int value = getTermValue(key, cnt, maxVal);
tfIdf.put(key, value);
if (cnt > 1 && value > 100) {
if (cnt > 2 && value > 100) {
highTfIdfInstances.add(key);
}
});
@ -74,6 +73,10 @@ public class WordsTfIdfCounts implements WordReps {
return counts;
}
/** Look up the term frequency dictionary's value for the stemmed form
 * of the given word representation.
 *
 * @param rep the word representation whose {@code stemmed} field is used as the lookup key
 * @return whatever {@code dict.getTermFreqStemmed} reports for that stemmed form
 */
public long termFrequencyDictValue(WordRep rep) {
    final long dictionaryFrequency = dict.getTermFreqStemmed(rep.stemmed);
    return dictionaryFrequency;
}
private String spanToStemmed(DocumentSentence sentence, WordSpan span) {
if (span.size() == 1)
return sentence.stemmedWords[span.start];
@ -133,4 +136,8 @@ public class WordsTfIdfCounts implements WordReps {
return (0.1 + 0.9*value/maxValue) * Math.log(freq/docCount);
}
/** Order two word representations by the value stored for them in {@code tfIdf},
 * ascending. A representation with no entry is treated as having the value 0.
 *
 * @param o1 the first word representation
 * @param o2 the second word representation
 * @return a negative number, zero, or a positive number as the value for {@code o1}
 *         is less than, equal to, or greater than the value for {@code o2}
 */
@Override
public int compare(WordRep o1, WordRep o2) {
    // NOTE(review): the lookup key here is the WordRep itself; confirm that tfIdf is
    // keyed by WordRep and not by the stemmed string, otherwise both lookups yield 0.
    final int left = tfIdf.getOrDefault(o1, 0);
    final int right = tfIdf.getOrDefault(o2, 0);

    // Integer.compare instead of subtraction: a difference of two ints can overflow
    // and flip sign, which would violate the Comparator contract.
    return Integer.compare(left, right);
}
}