From 3f2854a5e910b6d7ef809ecf0095bfe059e2f195 Mon Sep 17 00:00:00 2001 From: vlofgren Date: Sat, 27 Aug 2022 20:30:18 +0200 Subject: [PATCH] WIP n-gram loader --- .../marginalia/wmsa/edge/assistant/dict/NGramDict.java | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/dict/NGramDict.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/dict/NGramDict.java index 1c2f1c8a..992ddbba 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/dict/NGramDict.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/dict/NGramDict.java @@ -78,6 +78,7 @@ public class NGramDict { LanguageFilter lf = new LanguageFilter(); Map counts = new HashMap<>(100_000_000); + Set words = new HashSet<>(10_000); for (var domain : plan.domainsIterable()) { // leaks file descriptor, is fine @@ -97,11 +98,18 @@ public class NGramDict { continue; } + for (var sent : dld.sentences) { for (var word : sent) { - counts.merge(word.stemmed(), 1, Integer::sum); + words.add(word.stemmed()); } } + + for (var word : words) { + counts.merge(word, 1, Integer::sum); + } + + words.clear(); } }