From 0ecab53635a54059ef4f8980db915dec99903e27 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Mon, 13 Mar 2023 23:40:26 +0100 Subject: [PATCH] Yet more restructuring. --- .../keyword-extraction/build.gradle | 1 + .../DocumentKeywordExtractor.java | 2 +- .../extractors/WordsTfIdfCounts.java | 2 +- .../SentenceExtractorTest.java | 2 +- .../extractors/SubjectLikeKeywordsTest.java | 2 +- .../index/forward/ForwardIndexReader.java | 12 ----- .../features-search/query-parser/build.gradle | 2 + .../query_parser/QueryVariants.java | 4 +- .../query_parser/BodyQueryParserTest.java | 4 +- .../query_parser/QueryVariantsTest.java | 4 +- .../language-processing/build.gradle | 1 + code/libraries/language-processing/readme.md | 3 -- .../statistics/EnglishDictionary.java | 1 + .../libraries/ngram-bloom-filter/build.gradle | 42 +++++++++++++++ .../ngram_bloom_filter}/DenseBitMap.java | 2 +- .../ngram_bloom_filter}/NGramBloomFilter.java | 22 +------- .../ngram_bloom_filter}/DenseBitMapTest.java | 2 +- .../term-frequency-dict/build.gradle | 52 +++++++++++++++++++ .../TermFrequencyDict.java | 51 ++++++++---------- .../assistant-service/build.gradle | 3 +- .../assistant/suggest/Suggestions.java | 2 +- .../services-core/search-service/build.gradle | 2 + .../marginalia/search/query/QueryFactory.java | 4 +- .../valuation/SearchResultValuator.java | 2 +- .../search/query/QueryFactoryTest.java | 4 +- .../valuation/SearchResultValuatorTest.java | 2 +- settings.gradle | 2 + 27 files changed, 147 insertions(+), 85 deletions(-) create mode 100644 code/libraries/ngram-bloom-filter/build.gradle rename code/libraries/{language-processing/src/main/java/nu/marginalia/language/statistics => ngram-bloom-filter/src/main/java/nu/marginalia/ngram_bloom_filter}/DenseBitMap.java (97%) rename code/libraries/{language-processing/src/main/java/nu/marginalia/language/statistics => ngram-bloom-filter/src/main/java/nu/marginalia/ngram_bloom_filter}/NGramBloomFilter.java (68%) rename code/libraries/{language-processing/src/test/java/nu/marginalia/language/statistics => ngram-bloom-filter/src/test/java/nu/marginalia/ngram_bloom_filter}/DenseBitMapTest.java (96%) create mode 100644 code/libraries/term-frequency-dict/build.gradle rename code/libraries/{language-processing/src/main/java/nu/marginalia/language/statistics => term-frequency-dict/src/main/java/nu/marginalia/term_frequency_dict}/TermFrequencyDict.java (82%) diff --git a/code/features-convert/keyword-extraction/build.gradle b/code/features-convert/keyword-extraction/build.gradle index 53020518..e4467155 100644 --- a/code/features-convert/keyword-extraction/build.gradle +++ b/code/features-convert/keyword-extraction/build.gradle @@ -23,6 +23,7 @@ dependencies { implementation project(':code:common:model') implementation project(':code:common:config') implementation project(':code:libraries:language-processing') + implementation project(':code:libraries:term-frequency-dict') implementation libs.lombok annotationProcessor libs.lombok diff --git a/code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword_extraction/DocumentKeywordExtractor.java b/code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword_extraction/DocumentKeywordExtractor.java index e9737abf..7dad2cfe 100644 --- a/code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword_extraction/DocumentKeywordExtractor.java +++ b/code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword_extraction/DocumentKeywordExtractor.java @@ -6,7 +6,7 @@ import nu.marginalia.language.WordPatterns; import nu.marginalia.language.encoding.AsciiFlattener; import nu.marginalia.language.model.DocumentLanguageData; import nu.marginalia.language.model.WordRep; -import nu.marginalia.language.statistics.TermFrequencyDict; +import nu.marginalia.term_frequency_dict.TermFrequencyDict; import nu.marginalia.model.EdgeUrl; import javax.inject.Inject; diff --git a/code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword_extraction/extractors/WordsTfIdfCounts.java b/code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword_extraction/extractors/WordsTfIdfCounts.java index fc262449..859685e8 100644 --- a/code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword_extraction/extractors/WordsTfIdfCounts.java +++ b/code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword_extraction/extractors/WordsTfIdfCounts.java @@ -8,7 +8,7 @@ import nu.marginalia.language.model.DocumentSentence; import nu.marginalia.language.model.WordRep; import nu.marginalia.keyword_extraction.KeywordExtractor; import nu.marginalia.language.model.WordSpan; -import nu.marginalia.language.statistics.TermFrequencyDict; +import nu.marginalia.term_frequency_dict.TermFrequencyDict; import org.apache.commons.lang3.StringUtils; import java.util.*; diff --git a/code/features-convert/keyword-extraction/src/test/java/nu/marginalia/keyword_extraction/SentenceExtractorTest.java b/code/features-convert/keyword-extraction/src/test/java/nu/marginalia/keyword_extraction/SentenceExtractorTest.java index cfd1d9d9..b0edf5b8 100644 --- a/code/features-convert/keyword-extraction/src/test/java/nu/marginalia/keyword_extraction/SentenceExtractorTest.java +++ b/code/features-convert/keyword-extraction/src/test/java/nu/marginalia/keyword_extraction/SentenceExtractorTest.java @@ -6,7 +6,7 @@ import nu.marginalia.language.WordPatterns; import nu.marginalia.language.model.WordRep; import nu.marginalia.language.model.WordSpan; import nu.marginalia.language.sentence.SentenceExtractor; -import nu.marginalia.language.statistics.TermFrequencyDict; +import nu.marginalia.term_frequency_dict.TermFrequencyDict; import nu.marginalia.language.model.WordSeparator; import nu.marginalia.WmsaHome; import nu.marginalia.model.EdgeUrl; diff --git a/code/features-convert/keyword-extraction/src/test/java/nu/marginalia/keyword_extraction/extractors/SubjectLikeKeywordsTest.java b/code/features-convert/keyword-extraction/src/test/java/nu/marginalia/keyword_extraction/extractors/SubjectLikeKeywordsTest.java index 895beb4c..eb3de606 100644 --- a/code/features-convert/keyword-extraction/src/test/java/nu/marginalia/keyword_extraction/extractors/SubjectLikeKeywordsTest.java +++ b/code/features-convert/keyword-extraction/src/test/java/nu/marginalia/keyword_extraction/extractors/SubjectLikeKeywordsTest.java @@ -3,7 +3,7 @@ package nu.marginalia.keyword_extraction.extractors; import com.google.common.collect.Sets; import nu.marginalia.keyword_extraction.KeywordExtractor; import nu.marginalia.language.sentence.SentenceExtractor; -import nu.marginalia.language.statistics.TermFrequencyDict; +import nu.marginalia.term_frequency_dict.TermFrequencyDict; import nu.marginalia.test.util.TestLanguageModels; import org.junit.jupiter.api.Test; diff --git a/code/features-index/index-forward/src/main/java/nu/marginalia/index/forward/ForwardIndexReader.java b/code/features-index/index-forward/src/main/java/nu/marginalia/index/forward/ForwardIndexReader.java index 1837b079..76e13951 100644 --- a/code/features-index/index-forward/src/main/java/nu/marginalia/index/forward/ForwardIndexReader.java +++ b/code/features-index/index-forward/src/main/java/nu/marginalia/index/forward/ForwardIndexReader.java @@ -52,7 +52,6 @@ public class ForwardIndexReader { private static TLongIntHashMap loadIds(Path idsFile) throws IOException { var idsArray = LongArray.mmapRead(idsFile); - idsArray.advice(NativeIO.Advice.Sequential); var ids = new TLongIntHashMap((int) idsArray.size(), 0.5f, -1, -1); @@ -85,20 +84,9 @@ public class ForwardIndexReader { return Math.max(0, (int) data.get(ENTRY_SIZE * offset + DOMAIN_OFFSET)); } - public DocPost docPost(long docId) { - long offset = idxForDoc(docId); - if (offset < 0) throw new IllegalStateException("Forward index is not loaded"); - - final long meta = data.get(ENTRY_SIZE * offset + METADATA_OFFSET); - final int domain = Math.max(0, (int) data.get(ENTRY_SIZE * offset + DOMAIN_OFFSET)); - - return new DocPost(meta, domain); - } - private int idxForDoc(long docId) { return idToOffset.get(docId); } - public record DocPost(long meta, int domainId) {} } diff --git a/code/features-search/query-parser/build.gradle b/code/features-search/query-parser/build.gradle index 0ee4ea94..44d4b14b 100644 --- a/code/features-search/query-parser/build.gradle +++ b/code/features-search/query-parser/build.gradle @@ -13,6 +13,8 @@ java { } dependencies { implementation project(':code:libraries:language-processing') + implementation project(':code:libraries:ngram-bloom-filter') + implementation project(':code:libraries:term-frequency-dict') implementation project(':code:features-convert:keyword-extraction') implementation project(':code:common:config') implementation project(':code:common:model') diff --git a/code/features-search/query-parser/src/main/java/nu/marginalia/query_parser/QueryVariants.java b/code/features-search/query-parser/src/main/java/nu/marginalia/query_parser/QueryVariants.java index 800b4443..d0590183 100644 --- a/code/features-search/query-parser/src/main/java/nu/marginalia/query_parser/QueryVariants.java +++ b/code/features-search/query-parser/src/main/java/nu/marginalia/query_parser/QueryVariants.java @@ -8,8 +8,8 @@ import nu.marginalia.LanguageModels; import nu.marginalia.keyword_extraction.KeywordExtractor; import nu.marginalia.language.statistics.EnglishDictionary; import nu.marginalia.language.sentence.SentenceExtractor; -import nu.marginalia.language.statistics.NGramBloomFilter; -import nu.marginalia.language.statistics.TermFrequencyDict; +import nu.marginalia.ngram_bloom_filter.NGramBloomFilter; +import nu.marginalia.term_frequency_dict.TermFrequencyDict; import nu.marginalia.language.model.DocumentSentence; import nu.marginalia.language.model.WordSpan; import nu.marginalia.query_parser.token.Token; diff --git a/code/features-search/query-parser/src/test/java/nu/marginalia/query_parser/BodyQueryParserTest.java b/code/features-search/query-parser/src/test/java/nu/marginalia/query_parser/BodyQueryParserTest.java index 27f282b4..cd9a61eb 100644 --- a/code/features-search/query-parser/src/test/java/nu/marginalia/query_parser/BodyQueryParserTest.java +++ b/code/features-search/query-parser/src/test/java/nu/marginalia/query_parser/BodyQueryParserTest.java @@ -2,8 +2,8 @@ package nu.marginalia.query_parser; import nu.marginalia.LanguageModels; import nu.marginalia.language.statistics.EnglishDictionary; -import nu.marginalia.language.statistics.NGramBloomFilter; -import nu.marginalia.language.statistics.TermFrequencyDict; +import nu.marginalia.ngram_bloom_filter.NGramBloomFilter; +import nu.marginalia.term_frequency_dict.TermFrequencyDict; import nu.marginalia.query_parser.token.TokenType; import nu.marginalia.util.TestLanguageModels; import org.junit.jupiter.api.BeforeAll; diff --git a/code/features-search/query-parser/src/test/java/nu/marginalia/query_parser/QueryVariantsTest.java b/code/features-search/query-parser/src/test/java/nu/marginalia/query_parser/QueryVariantsTest.java index 0abd0cc1..e67a6940 100644 --- a/code/features-search/query-parser/src/test/java/nu/marginalia/query_parser/QueryVariantsTest.java +++ b/code/features-search/query-parser/src/test/java/nu/marginalia/query_parser/QueryVariantsTest.java @@ -2,8 +2,8 @@ package nu.marginalia.query_parser; import nu.marginalia.LanguageModels; import nu.marginalia.language.statistics.EnglishDictionary; -import nu.marginalia.language.statistics.NGramBloomFilter; -import nu.marginalia.language.statistics.TermFrequencyDict; +import nu.marginalia.ngram_bloom_filter.NGramBloomFilter; +import nu.marginalia.term_frequency_dict.TermFrequencyDict; import nu.marginalia.util.TestLanguageModels; import nu.marginalia.language.sentence.SentenceExtractor; import org.junit.jupiter.api.BeforeAll; diff --git a/code/libraries/language-processing/build.gradle b/code/libraries/language-processing/build.gradle index f0d52d1f..e71d3c27 100644 --- a/code/libraries/language-processing/build.gradle +++ b/code/libraries/language-processing/build.gradle @@ -21,6 +21,7 @@ dependencies { implementation project(':code:common:model') implementation project(':code:common:config') implementation project(':code:libraries:easy-lsh') + implementation project(':code:libraries:term-frequency-dict') implementation libs.lombok annotationProcessor libs.lombok diff --git a/code/libraries/language-processing/readme.md b/code/libraries/language-processing/readme.md index ee3fd14b..20821b13 100644 --- a/code/libraries/language-processing/readme.md +++ b/code/libraries/language-processing/readme.md @@ -8,9 +8,6 @@ This library contains various tools used in language processing. Creates a [DocumentLanguageData](src/main/java/nu/marginalia/language/model/DocumentLanguageData.java) from a text, containing its words, how they stem, POS tags, and so on. -* [TermFrequencyDict](src/main/java/nu/marginalia/language/statistics/TermFrequencyDict.java) -* [NGramBloomFilter](src/main/java/nu/marginalia/language/statistics/NGramBloomFilter.java) - ## See Also [features-convert/keyword-extraction](../../features-convert/keyword-extraction) uses this code to identify which keywords diff --git a/code/libraries/language-processing/src/main/java/nu/marginalia/language/statistics/EnglishDictionary.java b/code/libraries/language-processing/src/main/java/nu/marginalia/language/statistics/EnglishDictionary.java index f861fdda..d96c0666 100644 --- a/code/libraries/language-processing/src/main/java/nu/marginalia/language/statistics/EnglishDictionary.java +++ b/code/libraries/language-processing/src/main/java/nu/marginalia/language/statistics/EnglishDictionary.java @@ -1,6 +1,7 @@ package nu.marginalia.language.statistics; import com.google.inject.Inject; +import nu.marginalia.term_frequency_dict.TermFrequencyDict; import org.slf4j.Logger; import org.slf4j.LoggerFactory; diff --git a/code/libraries/ngram-bloom-filter/build.gradle b/code/libraries/ngram-bloom-filter/build.gradle new file mode 100644 index 00000000..1d8dabd7 --- /dev/null +++ b/code/libraries/ngram-bloom-filter/build.gradle @@ -0,0 +1,42 @@ +plugins { + id 'java' + id "io.freefair.lombok" version "5.3.3.3" + + id "de.undercouch.download" version "5.1.0" +} + +java { + toolchain { + languageVersion.set(JavaLanguageVersion.of(17)) + } +} + +dependencies { + implementation project(':code:common:config') + implementation project(':third-party:porterstemmer') + + implementation libs.lombok + annotationProcessor libs.lombok + implementation libs.bundles.slf4j + implementation libs.notnull + + implementation libs.bundles.nlp + implementation libs.guice + implementation libs.trove + implementation libs.fastutil + + testImplementation libs.bundles.slf4j.test + testImplementation libs.bundles.junit + testImplementation libs.mockito +} + + +test { + useJUnitPlatform() +} + +task fastTests(type: Test) { + useJUnitPlatform { + excludeTags "slow" + } +} diff --git a/code/libraries/language-processing/src/main/java/nu/marginalia/language/statistics/DenseBitMap.java b/code/libraries/ngram-bloom-filter/src/main/java/nu/marginalia/ngram_bloom_filter/DenseBitMap.java similarity index 97% rename from code/libraries/language-processing/src/main/java/nu/marginalia/language/statistics/DenseBitMap.java rename to code/libraries/ngram-bloom-filter/src/main/java/nu/marginalia/ngram_bloom_filter/DenseBitMap.java index c31a386c..a69576cc 100644 --- a/code/libraries/language-processing/src/main/java/nu/marginalia/language/statistics/DenseBitMap.java +++ b/code/libraries/ngram-bloom-filter/src/main/java/nu/marginalia/ngram_bloom_filter/DenseBitMap.java @@ -1,4 +1,4 @@ -package nu.marginalia.language.statistics; +package nu.marginalia.ngram_bloom_filter; import java.io.IOException; import java.nio.ByteBuffer; diff --git a/code/libraries/language-processing/src/main/java/nu/marginalia/language/statistics/NGramBloomFilter.java b/code/libraries/ngram-bloom-filter/src/main/java/nu/marginalia/ngram_bloom_filter/NGramBloomFilter.java similarity index 68% rename from code/libraries/language-processing/src/main/java/nu/marginalia/language/statistics/NGramBloomFilter.java rename to code/libraries/ngram-bloom-filter/src/main/java/nu/marginalia/ngram_bloom_filter/NGramBloomFilter.java index 367ccf37..85af1367 100644 --- a/code/libraries/language-processing/src/main/java/nu/marginalia/language/statistics/NGramBloomFilter.java +++ b/code/libraries/ngram-bloom-filter/src/main/java/nu/marginalia/ngram_bloom_filter/NGramBloomFilter.java @@ -1,4 +1,4 @@ -package nu.marginalia.language.statistics; +package nu.marginalia.ngram_bloom_filter; import ca.rmen.porterstemmer.PorterStemmer; import com.google.common.hash.HashFunction; @@ -46,30 +46,10 @@ public class NGramBloomFilter { return bitMap.get(bit); } -// public static void main(String... args) throws IOException { -// var filter = convertFromDictionaryFile(new File(args[0])); -// filter.bitMap.writeToFile(Path.of(args[1])); -// } - public static NGramBloomFilter load(Path file) throws IOException { return new NGramBloomFilter(DenseBitMap.loadFromFile(file)); } -// public static NGramBloomFilter convertFromDictionaryFile(File file) throws IOException { -// DenseBitMap bitMap = new DenseBitMap(1024*1024*1024L); -// AtomicInteger popCount = new AtomicInteger(); -// try (var f = new KeywordLexiconJournalFile(file)) { -// f.loadFile(data -> { -// long bit = bitForWord(new String(data), bitMap.cardinality); -// if (!bitMap.set(bit)) -// popCount.incrementAndGet(); -// }); -// } -// -// System.out.println("popcount = " + popCount.get()); -// return new NGramBloomFilter(bitMap); -// } - private static final Pattern underscore = Pattern.compile("_"); private static long bitForWord(String s, long n) { diff --git a/code/libraries/language-processing/src/test/java/nu/marginalia/language/statistics/DenseBitMapTest.java b/code/libraries/ngram-bloom-filter/src/test/java/nu/marginalia/ngram_bloom_filter/DenseBitMapTest.java similarity index 96% rename from code/libraries/language-processing/src/test/java/nu/marginalia/language/statistics/DenseBitMapTest.java rename to code/libraries/ngram-bloom-filter/src/test/java/nu/marginalia/ngram_bloom_filter/DenseBitMapTest.java index 1a506c4c..783b2ca9 100644 --- a/code/libraries/language-processing/src/test/java/nu/marginalia/language/statistics/DenseBitMapTest.java +++ b/code/libraries/ngram-bloom-filter/src/test/java/nu/marginalia/ngram_bloom_filter/DenseBitMapTest.java @@ -1,4 +1,4 @@ -package nu.marginalia.language.statistics; +package nu.marginalia.ngram_bloom_filter; import org.junit.jupiter.api.Test; diff --git a/code/libraries/term-frequency-dict/build.gradle b/code/libraries/term-frequency-dict/build.gradle new file mode 100644 index 00000000..f0d52d1f --- /dev/null +++ b/code/libraries/term-frequency-dict/build.gradle @@ -0,0 +1,52 @@ +plugins { + id 'java' + id "io.freefair.lombok" version "5.3.3.3" + + id "me.champeau.jmh" version "0.6.6" + id "de.undercouch.download" version "5.1.0" + + id 'jvm-test-suite' +} + +java { + toolchain { + languageVersion.set(JavaLanguageVersion.of(17)) + } +} + +dependencies { + implementation project(':third-party:rdrpostagger') + implementation project(':third-party:porterstemmer') + implementation project(':third-party:monkey-patch-opennlp') + implementation project(':code:common:model') + implementation project(':code:common:config') + implementation project(':code:libraries:easy-lsh') + + implementation libs.lombok + annotationProcessor libs.lombok + implementation libs.bundles.slf4j + implementation libs.notnull + + implementation libs.guice + implementation libs.jsoup + implementation libs.trove + implementation libs.fastutil + + implementation libs.bundles.nlp + implementation libs.commons.lang3 + + testImplementation libs.bundles.slf4j.test + testImplementation libs.bundles.junit + testImplementation libs.mockito +} + + +test { + useJUnitPlatform() +} + +task fastTests(type: Test) { + useJUnitPlatform { + excludeTags "slow" + } +} diff --git a/code/libraries/language-processing/src/main/java/nu/marginalia/language/statistics/TermFrequencyDict.java b/code/libraries/term-frequency-dict/src/main/java/nu/marginalia/term_frequency_dict/TermFrequencyDict.java similarity index 82% rename from code/libraries/language-processing/src/main/java/nu/marginalia/language/statistics/TermFrequencyDict.java rename to code/libraries/term-frequency-dict/src/main/java/nu/marginalia/term_frequency_dict/TermFrequencyDict.java index 1ac5c601..16f40528 100644 --- a/code/libraries/language-processing/src/main/java/nu/marginalia/language/statistics/TermFrequencyDict.java +++ b/code/libraries/term-frequency-dict/src/main/java/nu/marginalia/term_frequency_dict/TermFrequencyDict.java @@ -1,60 +1,51 @@ -package nu.marginalia.language.statistics; +package nu.marginalia.term_frequency_dict; import ca.rmen.porterstemmer.PorterStemmer; import gnu.trove.map.hash.TLongIntHashMap; import nu.marginalia.LanguageModels; +import org.apache.commons.lang3.StringUtils; +import org.jetbrains.annotations.NotNull; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import javax.annotation.Nullable; import javax.inject.Inject; import javax.inject.Singleton; import java.io.*; import java.nio.file.Files; import java.nio.file.Path; -import java.util.Arrays; -import java.util.regex.Pattern; -import java.util.stream.Collectors; @Singleton public class TermFrequencyDict { - private final TLongIntHashMap wordRates = new TLongIntHashMap(1_000_000, 0.5f, 0, 0); - private final Logger logger = LoggerFactory.getLogger(getClass()); - private static final Pattern separator = Pattern.compile("[_ ]+"); private static final PorterStemmer ps = new PorterStemmer(); private static final long DOC_COUNT_KEY = ~0L; - private static long fileSize(Path p) throws IOException { - return Files.size(p); - } @Inject - public TermFrequencyDict(@Nullable LanguageModels models) { - if (models == null) { - return; - } + public TermFrequencyDict(@NotNull LanguageModels models) { + this(models.termFrequencies); + } - if (models.termFrequencies != null) { + public TermFrequencyDict(Path file) { + try (var frequencyData = new DataInputStream(new BufferedInputStream(new FileInputStream(file.toFile())))) { + wordRates.ensureCapacity((int)(Files.size(file)/16)); - try (var frequencyData = new DataInputStream(new BufferedInputStream(new FileInputStream(models.termFrequencies.toFile())))) { - - wordRates.ensureCapacity((int)(fileSize(models.termFrequencies)/16)); - - for (;;) { - wordRates.put(frequencyData.readLong(), (int) frequencyData.readLong()); - } - } catch (EOFException eof) { - // ok - } catch (IOException e) { - logger.error("IO Exception reading " + models.termFrequencies, e); + for (;;) { + wordRates.put(frequencyData.readLong(), (int) frequencyData.readLong()); } + } catch (EOFException eof) { + // ok + } catch (IOException e) { + logger.error("IO Exception reading " + file, e); } logger.info("Read {} N-grams frequencies", wordRates.size()); } + public TermFrequencyDict(TLongIntHashMap data) { + wordRates.putAll(data); + } public int docCount() { int cnt = wordRates.get(DOC_COUNT_KEY); @@ -65,6 +56,7 @@ public class TermFrequencyDict { return cnt; } +// WIP refactoring, this needs a new home: // // public static void main(String... args) throws IOException, InterruptedException { // if (args.length != 2) { @@ -151,8 +143,8 @@ public class TermFrequencyDict { // } public static long getStringHash(String s) { - String[] strings = separator.split(s); - if (s.length() > 1) { + if (s.indexOf(' ') >= 0 || s.indexOf('_') >= 0) { + String[] strings = StringUtils.split(s, " _"); byte[][] parts = new byte[strings.length][]; for (int i = 0; i < parts.length; i++) { parts[i] = ps.stemWord(strings[i]).getBytes(); @@ -163,6 +155,7 @@ public class TermFrequencyDict { return longHash(s.getBytes()); } } + public long getTermFreqHash(long hash) { return wordRates.get(hash); } diff --git a/code/services-core/assistant-service/build.gradle b/code/services-core/assistant-service/build.gradle index 50821032..62a8c8a6 100644 --- a/code/services-core/assistant-service/build.gradle +++ b/code/services-core/assistant-service/build.gradle @@ -31,8 +31,9 @@ dependencies { implementation project(':code:common:service-client') implementation project(':code:features-search:screenshots') - implementation project(':code:libraries:language-processing') + implementation project(':code:libraries:language-processing') + implementation project(':code:libraries:term-frequency-dict') implementation libs.lombok annotationProcessor libs.lombok diff --git a/code/services-core/assistant-service/src/main/java/nu/marginalia/assistant/suggest/Suggestions.java b/code/services-core/assistant-service/src/main/java/nu/marginalia/assistant/suggest/Suggestions.java index cb59ffd4..2fd62091 100644 --- a/code/services-core/assistant-service/src/main/java/nu/marginalia/assistant/suggest/Suggestions.java +++ b/code/services-core/assistant-service/src/main/java/nu/marginalia/assistant/suggest/Suggestions.java @@ -2,7 +2,7 @@ package nu.marginalia.assistant.suggest; import com.google.inject.Inject; import com.google.inject.name.Named; -import nu.marginalia.language.statistics.TermFrequencyDict; +import nu.marginalia.term_frequency_dict.TermFrequencyDict; import nu.marginalia.model.crawl.HtmlFeature; import nu.marginalia.assistant.dict.SpellChecker; import org.apache.commons.collections4.trie.PatriciaTrie; diff --git a/code/services-core/search-service/build.gradle b/code/services-core/search-service/build.gradle index 22f96602..5715a8ff 100644 --- a/code/services-core/search-service/build.gradle +++ b/code/services-core/search-service/build.gradle @@ -29,6 +29,8 @@ dependencies { implementation project(':code:libraries:easy-lsh') implementation project(':code:libraries:language-processing') implementation project(':code:libraries:braille-block-punch-cards') + implementation project(':code:libraries:ngram-bloom-filter') + implementation project(':code:libraries:term-frequency-dict') implementation project(':code:api:assistant-api') implementation project(':code:api:index-api') diff --git a/code/services-core/search-service/src/main/java/nu/marginalia/search/query/QueryFactory.java b/code/services-core/search-service/src/main/java/nu/marginalia/search/query/QueryFactory.java index ca38e25a..e1425ffd 100644 --- a/code/services-core/search-service/src/main/java/nu/marginalia/search/query/QueryFactory.java +++ b/code/services-core/search-service/src/main/java/nu/marginalia/search/query/QueryFactory.java @@ -9,8 +9,8 @@ import nu.marginalia.index.query.limit.QueryLimits; import nu.marginalia.index.query.limit.QueryStrategy; import nu.marginalia.index.query.limit.SpecificationLimit; import nu.marginalia.language.statistics.EnglishDictionary; -import nu.marginalia.language.statistics.NGramBloomFilter; -import nu.marginalia.language.statistics.TermFrequencyDict; +import nu.marginalia.ngram_bloom_filter.NGramBloomFilter; +import nu.marginalia.term_frequency_dict.TermFrequencyDict; import nu.marginalia.query_parser.QueryParser; import nu.marginalia.query_parser.QueryPermutation; import nu.marginalia.query_parser.QueryVariants; diff --git a/code/services-core/search-service/src/main/java/nu/marginalia/search/valuation/SearchResultValuator.java b/code/services-core/search-service/src/main/java/nu/marginalia/search/valuation/SearchResultValuator.java index 8055dcf8..045fd48f 100644 --- a/code/services-core/search-service/src/main/java/nu/marginalia/search/valuation/SearchResultValuator.java +++ b/code/services-core/search-service/src/main/java/nu/marginalia/search/valuation/SearchResultValuator.java @@ -2,7 +2,7 @@ package nu.marginalia.search.valuation; import com.google.inject.Inject; import com.google.inject.Singleton; -import nu.marginalia.language.statistics.TermFrequencyDict; +import nu.marginalia.term_frequency_dict.TermFrequencyDict; import nu.marginalia.model.idx.WordFlags; import nu.marginalia.model.idx.WordMetadata; import nu.marginalia.index.client.model.results.SearchResultKeywordScore; diff --git a/code/services-core/search-service/src/test/java/nu/marginalia/search/query/QueryFactoryTest.java b/code/services-core/search-service/src/test/java/nu/marginalia/search/query/QueryFactoryTest.java index b3ec6dca..09fe52c3 100644 --- a/code/services-core/search-service/src/test/java/nu/marginalia/search/query/QueryFactoryTest.java +++ b/code/services-core/search-service/src/test/java/nu/marginalia/search/query/QueryFactoryTest.java @@ -4,8 +4,8 @@ import nu.marginalia.WmsaHome; import nu.marginalia.index.query.limit.SpecificationLimitType; import nu.marginalia.language.statistics.EnglishDictionary; import nu.marginalia.index.client.model.query.SearchSpecification; -import nu.marginalia.language.statistics.NGramBloomFilter; -import nu.marginalia.language.statistics.TermFrequencyDict; +import nu.marginalia.ngram_bloom_filter.NGramBloomFilter; +import nu.marginalia.term_frequency_dict.TermFrequencyDict; import nu.marginalia.search.command.SearchJsParameter; import nu.marginalia.search.model.SearchProfile; import nu.marginalia.search.query.model.UserSearchParameters; diff --git a/code/services-core/search-service/src/test/java/nu/marginalia/search/valuation/SearchResultValuatorTest.java b/code/services-core/search-service/src/test/java/nu/marginalia/search/valuation/SearchResultValuatorTest.java index aaa4a56a..e3b22bcc 100644 --- a/code/services-core/search-service/src/test/java/nu/marginalia/search/valuation/SearchResultValuatorTest.java +++ b/code/services-core/search-service/src/test/java/nu/marginalia/search/valuation/SearchResultValuatorTest.java @@ -1,7 +1,7 @@ package nu.marginalia.search.valuation; import nu.marginalia.index.client.model.results.SearchResultKeywordScore; -import nu.marginalia.language.statistics.TermFrequencyDict; +import nu.marginalia.term_frequency_dict.TermFrequencyDict; import nu.marginalia.model.idx.DocumentFlags; import nu.marginalia.model.idx.WordFlags; import nu.marginalia.model.crawl.PubDate; diff --git a/settings.gradle b/settings.gradle index 9ce6f7c8..6e014e19 100644 --- a/settings.gradle +++ b/settings.gradle @@ -17,6 +17,8 @@ include 'code:libraries:random-write-funnel' include 'code:libraries:next-prime' include 'code:libraries:braille-block-punch-cards' include 'code:libraries:language-processing' +include 'code:libraries:ngram-bloom-filter' +include 'code:libraries:term-frequency-dict' include 'code:features-search:screenshots' include 'code:features-search:random-websites'