Yet more restructuring.
parent d82532b7f1
commit 0ecab53635
@@ -23,6 +23,7 @@ dependencies {
     implementation project(':code:common:model')
     implementation project(':code:common:config')
     implementation project(':code:libraries:language-processing')
+    implementation project(':code:libraries:term-frequency-dict')
 
     implementation libs.lombok
     annotationProcessor libs.lombok
@@ -6,7 +6,7 @@ import nu.marginalia.language.WordPatterns;
 import nu.marginalia.language.encoding.AsciiFlattener;
 import nu.marginalia.language.model.DocumentLanguageData;
 import nu.marginalia.language.model.WordRep;
-import nu.marginalia.language.statistics.TermFrequencyDict;
+import nu.marginalia.term_frequency_dict.TermFrequencyDict;
 import nu.marginalia.model.EdgeUrl;
 
 import javax.inject.Inject;
@@ -8,7 +8,7 @@ import nu.marginalia.language.model.DocumentSentence;
 import nu.marginalia.language.model.WordRep;
 import nu.marginalia.keyword_extraction.KeywordExtractor;
 import nu.marginalia.language.model.WordSpan;
-import nu.marginalia.language.statistics.TermFrequencyDict;
+import nu.marginalia.term_frequency_dict.TermFrequencyDict;
 import org.apache.commons.lang3.StringUtils;
 
 import java.util.*;
@@ -6,7 +6,7 @@ import nu.marginalia.language.WordPatterns;
 import nu.marginalia.language.model.WordRep;
 import nu.marginalia.language.model.WordSpan;
 import nu.marginalia.language.sentence.SentenceExtractor;
-import nu.marginalia.language.statistics.TermFrequencyDict;
+import nu.marginalia.term_frequency_dict.TermFrequencyDict;
 import nu.marginalia.language.model.WordSeparator;
 import nu.marginalia.WmsaHome;
 import nu.marginalia.model.EdgeUrl;
@@ -3,7 +3,7 @@ package nu.marginalia.keyword_extraction.extractors;
 import com.google.common.collect.Sets;
 import nu.marginalia.keyword_extraction.KeywordExtractor;
 import nu.marginalia.language.sentence.SentenceExtractor;
-import nu.marginalia.language.statistics.TermFrequencyDict;
+import nu.marginalia.term_frequency_dict.TermFrequencyDict;
 import nu.marginalia.test.util.TestLanguageModels;
 import org.junit.jupiter.api.Test;
 
@@ -52,7 +52,6 @@ public class ForwardIndexReader {
 
     private static TLongIntHashMap loadIds(Path idsFile) throws IOException {
         var idsArray = LongArray.mmapRead(idsFile);
-        idsArray.advice(NativeIO.Advice.Sequential);
 
         var ids = new TLongIntHashMap((int) idsArray.size(), 0.5f, -1, -1);
 
@@ -85,20 +84,9 @@ public class ForwardIndexReader {
         return Math.max(0, (int) data.get(ENTRY_SIZE * offset + DOMAIN_OFFSET));
     }
 
-    public DocPost docPost(long docId) {
-        long offset = idxForDoc(docId);
-        if (offset < 0) throw new IllegalStateException("Forward index is not loaded");
-
-        final long meta = data.get(ENTRY_SIZE * offset + METADATA_OFFSET);
-        final int domain = Math.max(0, (int) data.get(ENTRY_SIZE * offset + DOMAIN_OFFSET));
-
-        return new DocPost(meta, domain);
-    }
-
     private int idxForDoc(long docId) {
         return idToOffset.get(docId);
     }
 
 
-    public record DocPost(long meta, int domainId) {}
 }
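For orientation, the two ForwardIndexReader hunks above address a flat array of fixed-size entries: `idToOffset` maps a document id to an entry index, and fields are read at `ENTRY_SIZE * offset + FIELD_OFFSET`. The sketch below only restates that addressing pattern; the concrete constant values are assumptions for illustration, not taken from this commit.

```java
// Illustrative sketch of the forward-index addressing pattern seen above.
// The constant values here are assumptions for the example only.
class ForwardIndexAddressing {
    static final int ENTRY_SIZE = 2;       // assumed: one metadata slot + one domain slot per entry
    static final int METADATA_OFFSET = 0;  // assumed slot position
    static final int DOMAIN_OFFSET = 1;    // assumed slot position

    // Entry index -> position of its metadata word in the flat long array.
    static long metadataPos(long entryOffset) {
        return ENTRY_SIZE * entryOffset + METADATA_OFFSET;
    }

    // Entry index -> position of its domain id in the flat long array.
    static long domainPos(long entryOffset) {
        return ENTRY_SIZE * entryOffset + DOMAIN_OFFSET;
    }
}
```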
@@ -13,6 +13,8 @@ java {
 }
 dependencies {
     implementation project(':code:libraries:language-processing')
+    implementation project(':code:libraries:ngram-bloom-filter')
+    implementation project(':code:libraries:term-frequency-dict')
     implementation project(':code:features-convert:keyword-extraction')
     implementation project(':code:common:config')
     implementation project(':code:common:model')
@@ -8,8 +8,8 @@ import nu.marginalia.LanguageModels;
 import nu.marginalia.keyword_extraction.KeywordExtractor;
 import nu.marginalia.language.statistics.EnglishDictionary;
 import nu.marginalia.language.sentence.SentenceExtractor;
-import nu.marginalia.language.statistics.NGramBloomFilter;
-import nu.marginalia.language.statistics.TermFrequencyDict;
+import nu.marginalia.ngram_bloom_filter.NGramBloomFilter;
+import nu.marginalia.term_frequency_dict.TermFrequencyDict;
 import nu.marginalia.language.model.DocumentSentence;
 import nu.marginalia.language.model.WordSpan;
 import nu.marginalia.query_parser.token.Token;
@@ -2,8 +2,8 @@ package nu.marginalia.query_parser;
 
 import nu.marginalia.LanguageModels;
 import nu.marginalia.language.statistics.EnglishDictionary;
-import nu.marginalia.language.statistics.NGramBloomFilter;
-import nu.marginalia.language.statistics.TermFrequencyDict;
+import nu.marginalia.ngram_bloom_filter.NGramBloomFilter;
+import nu.marginalia.term_frequency_dict.TermFrequencyDict;
 import nu.marginalia.query_parser.token.TokenType;
 import nu.marginalia.util.TestLanguageModels;
 import org.junit.jupiter.api.BeforeAll;
@@ -2,8 +2,8 @@ package nu.marginalia.query_parser;
 
 import nu.marginalia.LanguageModels;
 import nu.marginalia.language.statistics.EnglishDictionary;
-import nu.marginalia.language.statistics.NGramBloomFilter;
-import nu.marginalia.language.statistics.TermFrequencyDict;
+import nu.marginalia.ngram_bloom_filter.NGramBloomFilter;
+import nu.marginalia.term_frequency_dict.TermFrequencyDict;
 import nu.marginalia.util.TestLanguageModels;
 import nu.marginalia.language.sentence.SentenceExtractor;
 import org.junit.jupiter.api.BeforeAll;
@@ -21,6 +21,7 @@ dependencies {
     implementation project(':code:common:model')
     implementation project(':code:common:config')
     implementation project(':code:libraries:easy-lsh')
+    implementation project(':code:libraries:term-frequency-dict')
 
     implementation libs.lombok
     annotationProcessor libs.lombok
@@ -8,9 +8,6 @@ This library contains various tools used in language processing.
 Creates a [DocumentLanguageData](src/main/java/nu/marginalia/language/model/DocumentLanguageData.java) from a text, containing
 its words, how they stem, POS tags, and so on.
 
-* [TermFrequencyDict](src/main/java/nu/marginalia/language/statistics/TermFrequencyDict.java)
-* [NGramBloomFilter](src/main/java/nu/marginalia/language/statistics/NGramBloomFilter.java)
-
 ## See Also
 
 [features-convert/keyword-extraction](../../features-convert/keyword-extraction) uses this code to identify which keywords
@@ -1,6 +1,7 @@
 package nu.marginalia.language.statistics;
 
 import com.google.inject.Inject;
+import nu.marginalia.term_frequency_dict.TermFrequencyDict;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
code/libraries/ngram-bloom-filter/build.gradle (new file, 42 lines)
@@ -0,0 +1,42 @@
+plugins {
+    id 'java'
+    id "io.freefair.lombok" version "5.3.3.3"
+
+    id "de.undercouch.download" version "5.1.0"
+}
+
+java {
+    toolchain {
+        languageVersion.set(JavaLanguageVersion.of(17))
+    }
+}
+
+dependencies {
+    implementation project(':code:common:config')
+    implementation project(':third-party:porterstemmer')
+
+    implementation libs.lombok
+    annotationProcessor libs.lombok
+    implementation libs.bundles.slf4j
+    implementation libs.notnull
+
+    implementation libs.bundles.nlp
+    implementation libs.guice
+    implementation libs.trove
+    implementation libs.fastutil
+
+    testImplementation libs.bundles.slf4j.test
+    testImplementation libs.bundles.junit
+    testImplementation libs.mockito
+}
+
+
+test {
+    useJUnitPlatform()
+}
+
+task fastTests(type: Test) {
+    useJUnitPlatform {
+        excludeTags "slow"
+    }
+}
@@ -1,4 +1,4 @@
-package nu.marginalia.language.statistics;
+package nu.marginalia.ngram_bloom_filter;
 
 import java.io.IOException;
 import java.nio.ByteBuffer;
@@ -1,4 +1,4 @@
-package nu.marginalia.language.statistics;
+package nu.marginalia.ngram_bloom_filter;
 
 import ca.rmen.porterstemmer.PorterStemmer;
 import com.google.common.hash.HashFunction;
@@ -46,30 +46,10 @@ public class NGramBloomFilter {
         return bitMap.get(bit);
     }
 
-// public static void main(String... args) throws IOException {
-//     var filter = convertFromDictionaryFile(new File(args[0]));
-//     filter.bitMap.writeToFile(Path.of(args[1]));
-// }
-
     public static NGramBloomFilter load(Path file) throws IOException {
         return new NGramBloomFilter(DenseBitMap.loadFromFile(file));
    }
 
-// public static NGramBloomFilter convertFromDictionaryFile(File file) throws IOException {
-//     DenseBitMap bitMap = new DenseBitMap(1024*1024*1024L);
-//     AtomicInteger popCount = new AtomicInteger();
-//     try (var f = new KeywordLexiconJournalFile(file)) {
-//         f.loadFile(data -> {
-//             long bit = bitForWord(new String(data), bitMap.cardinality);
-//             if (!bitMap.set(bit))
-//                 popCount.incrementAndGet();
-//         });
-//     }
-//
-//     System.out.println("popcount = " + popCount.get());
-//     return new NGramBloomFilter(bitMap);
-// }
-
     private static final Pattern underscore = Pattern.compile("_");
 
     private static long bitForWord(String s, long n) {
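Since NGramBloomFilter now lives in its own `nu.marginalia.ngram_bloom_filter` module, here is a minimal sketch of loading a previously serialized filter via the `load` factory shown above. The file path is hypothetical, and the membership-query method is not shown in this hunk, so it is left out.

```java
import java.io.IOException;
import java.nio.file.Path;

import nu.marginalia.ngram_bloom_filter.NGramBloomFilter;

class NGramBloomFilterLoadExample {
    public static void main(String[] args) throws IOException {
        // Hypothetical location of a serialized DenseBitMap; not part of this commit.
        Path bitmapFile = Path.of("/tmp/ngrams.bin");

        // load() wraps DenseBitMap.loadFromFile(), per the hunk above.
        NGramBloomFilter filter = NGramBloomFilter.load(bitmapFile);

        System.out.println("Loaded n-gram bloom filter: " + filter);
    }
}
```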
@ -1,4 +1,4 @@
|
||||
package nu.marginalia.language.statistics;
|
||||
package nu.marginalia.ngram_bloom_filter;
|
||||
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
code/libraries/term-frequency-dict/build.gradle (new file, 52 lines)
@@ -0,0 +1,52 @@
+plugins {
+    id 'java'
+    id "io.freefair.lombok" version "5.3.3.3"
+
+    id "me.champeau.jmh" version "0.6.6"
+    id "de.undercouch.download" version "5.1.0"
+
+    id 'jvm-test-suite'
+}
+
+java {
+    toolchain {
+        languageVersion.set(JavaLanguageVersion.of(17))
+    }
+}
+
+dependencies {
+    implementation project(':third-party:rdrpostagger')
+    implementation project(':third-party:porterstemmer')
+    implementation project(':third-party:monkey-patch-opennlp')
+    implementation project(':code:common:model')
+    implementation project(':code:common:config')
+    implementation project(':code:libraries:easy-lsh')
+
+    implementation libs.lombok
+    annotationProcessor libs.lombok
+    implementation libs.bundles.slf4j
+    implementation libs.notnull
+
+    implementation libs.guice
+    implementation libs.jsoup
+    implementation libs.trove
+    implementation libs.fastutil
+
+    implementation libs.bundles.nlp
+    implementation libs.commons.lang3
+
+    testImplementation libs.bundles.slf4j.test
+    testImplementation libs.bundles.junit
+    testImplementation libs.mockito
+}
+
+
+test {
+    useJUnitPlatform()
+}
+
+task fastTests(type: Test) {
+    useJUnitPlatform {
+        excludeTags "slow"
+    }
+}
@@ -1,60 +1,51 @@
-package nu.marginalia.language.statistics;
+package nu.marginalia.term_frequency_dict;
 
 import ca.rmen.porterstemmer.PorterStemmer;
 import gnu.trove.map.hash.TLongIntHashMap;
 import nu.marginalia.LanguageModels;
+import org.apache.commons.lang3.StringUtils;
+import org.jetbrains.annotations.NotNull;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import javax.annotation.Nullable;
 import javax.inject.Inject;
 import javax.inject.Singleton;
 import java.io.*;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.util.Arrays;
 import java.util.regex.Pattern;
 import java.util.stream.Collectors;
 
 @Singleton
 public class TermFrequencyDict {
 
     private final TLongIntHashMap wordRates = new TLongIntHashMap(1_000_000, 0.5f, 0, 0);
 
     private final Logger logger = LoggerFactory.getLogger(getClass());
     private static final Pattern separator = Pattern.compile("[_ ]+");
     private static final PorterStemmer ps = new PorterStemmer();
 
     private static final long DOC_COUNT_KEY = ~0L;
-    private static long fileSize(Path p) throws IOException {
-        return Files.size(p);
-    }
 
     @Inject
-    public TermFrequencyDict(@Nullable LanguageModels models) {
-        if (models == null) {
-            return;
-        }
+    public TermFrequencyDict(@NotNull LanguageModels models) {
+        this(models.termFrequencies);
+    }
 
-        if (models.termFrequencies != null) {
+    public TermFrequencyDict(Path file) {
+        try (var frequencyData = new DataInputStream(new BufferedInputStream(new FileInputStream(file.toFile())))) {
+            wordRates.ensureCapacity((int)(Files.size(file)/16));
 
-            try (var frequencyData = new DataInputStream(new BufferedInputStream(new FileInputStream(models.termFrequencies.toFile())))) {
-
-                wordRates.ensureCapacity((int)(fileSize(models.termFrequencies)/16));
-
-                for (;;) {
-                    wordRates.put(frequencyData.readLong(), (int) frequencyData.readLong());
-                }
-            } catch (EOFException eof) {
-                // ok
-            } catch (IOException e) {
-                logger.error("IO Exception reading " + models.termFrequencies, e);
+            for (;;) {
+                wordRates.put(frequencyData.readLong(), (int) frequencyData.readLong());
+            }
+        } catch (EOFException eof) {
+            // ok
+        } catch (IOException e) {
+            logger.error("IO Exception reading " + file, e);
         }
 
         logger.info("Read {} N-grams frequencies", wordRates.size());
     }
 
+    public TermFrequencyDict(TLongIntHashMap data) {
+        wordRates.putAll(data);
+    }
+
     public int docCount() {
         int cnt = wordRates.get(DOC_COUNT_KEY);
@@ -65,6 +56,7 @@ public class TermFrequencyDict {
         return cnt;
     }
 
+// WIP refactoring, this needs a new home:
 //
 // public static void main(String... args) throws IOException, InterruptedException {
 //     if (args.length != 2) {
@@ -151,8 +143,8 @@ public class TermFrequencyDict {
 // }
 
     public static long getStringHash(String s) {
-        String[] strings = separator.split(s);
-        if (s.length() > 1) {
+        if (s.indexOf(' ') >= 0 || s.indexOf('_') >= 0) {
+            String[] strings = StringUtils.split(s, " _");
             byte[][] parts = new byte[strings.length][];
             for (int i = 0; i < parts.length; i++) {
                 parts[i] = ps.stemWord(strings[i]).getBytes();
@@ -163,6 +155,7 @@ public class TermFrequencyDict {
             return longHash(s.getBytes());
         }
     }
+
     public long getTermFreqHash(long hash) {
         return wordRates.get(hash);
     }
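To illustrate the constructor refactoring above: the `LanguageModels` constructor now simply delegates to a new `Path`-based constructor, and a `TLongIntHashMap` constructor is available for in-memory data. Below is a minimal usage sketch, assuming the API exactly as shown in these hunks; the file path is hypothetical.

```java
import java.nio.file.Path;

import nu.marginalia.term_frequency_dict.TermFrequencyDict;

class TermFrequencyDictExample {
    public static void main(String[] args) {
        // Hypothetical path to a serialized term-frequency file; not part of this commit.
        Path frequencies = Path.of("/tmp/tfreq.bin");

        // The new Path constructor streams (hash, count) long pairs until EOF.
        TermFrequencyDict dict = new TermFrequencyDict(frequencies);

        // Terms are keyed by getStringHash(), which stems and hashes multi-word terms.
        long hash = TermFrequencyDict.getStringHash("search engine");

        System.out.println("term frequency = " + dict.getTermFreqHash(hash));
        System.out.println("document count = " + dict.docCount());
    }
}
```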
@@ -31,8 +31,9 @@ dependencies {
     implementation project(':code:common:service-client')
 
     implementation project(':code:features-search:screenshots')
-    implementation project(':code:libraries:language-processing')
 
+    implementation project(':code:libraries:language-processing')
+    implementation project(':code:libraries:term-frequency-dict')
 
     implementation libs.lombok
     annotationProcessor libs.lombok
@@ -2,7 +2,7 @@ package nu.marginalia.assistant.suggest;
 
 import com.google.inject.Inject;
 import com.google.inject.name.Named;
-import nu.marginalia.language.statistics.TermFrequencyDict;
+import nu.marginalia.term_frequency_dict.TermFrequencyDict;
 import nu.marginalia.model.crawl.HtmlFeature;
 import nu.marginalia.assistant.dict.SpellChecker;
 import org.apache.commons.collections4.trie.PatriciaTrie;
@@ -29,6 +29,8 @@ dependencies {
     implementation project(':code:libraries:easy-lsh')
     implementation project(':code:libraries:language-processing')
     implementation project(':code:libraries:braille-block-punch-cards')
+    implementation project(':code:libraries:ngram-bloom-filter')
+    implementation project(':code:libraries:term-frequency-dict')
 
     implementation project(':code:api:assistant-api')
     implementation project(':code:api:index-api')
@@ -9,8 +9,8 @@ import nu.marginalia.index.query.limit.QueryLimits;
 import nu.marginalia.index.query.limit.QueryStrategy;
 import nu.marginalia.index.query.limit.SpecificationLimit;
 import nu.marginalia.language.statistics.EnglishDictionary;
-import nu.marginalia.language.statistics.NGramBloomFilter;
-import nu.marginalia.language.statistics.TermFrequencyDict;
+import nu.marginalia.ngram_bloom_filter.NGramBloomFilter;
+import nu.marginalia.term_frequency_dict.TermFrequencyDict;
 import nu.marginalia.query_parser.QueryParser;
 import nu.marginalia.query_parser.QueryPermutation;
 import nu.marginalia.query_parser.QueryVariants;
@@ -2,7 +2,7 @@ package nu.marginalia.search.valuation;
 
 import com.google.inject.Inject;
 import com.google.inject.Singleton;
-import nu.marginalia.language.statistics.TermFrequencyDict;
+import nu.marginalia.term_frequency_dict.TermFrequencyDict;
 import nu.marginalia.model.idx.WordFlags;
 import nu.marginalia.model.idx.WordMetadata;
 import nu.marginalia.index.client.model.results.SearchResultKeywordScore;
@@ -4,8 +4,8 @@ import nu.marginalia.WmsaHome;
 import nu.marginalia.index.query.limit.SpecificationLimitType;
 import nu.marginalia.language.statistics.EnglishDictionary;
 import nu.marginalia.index.client.model.query.SearchSpecification;
-import nu.marginalia.language.statistics.NGramBloomFilter;
-import nu.marginalia.language.statistics.TermFrequencyDict;
+import nu.marginalia.ngram_bloom_filter.NGramBloomFilter;
+import nu.marginalia.term_frequency_dict.TermFrequencyDict;
 import nu.marginalia.search.command.SearchJsParameter;
 import nu.marginalia.search.model.SearchProfile;
 import nu.marginalia.search.query.model.UserSearchParameters;
@@ -1,7 +1,7 @@
 package nu.marginalia.search.valuation;
 
 import nu.marginalia.index.client.model.results.SearchResultKeywordScore;
-import nu.marginalia.language.statistics.TermFrequencyDict;
+import nu.marginalia.term_frequency_dict.TermFrequencyDict;
 import nu.marginalia.model.idx.DocumentFlags;
 import nu.marginalia.model.idx.WordFlags;
 import nu.marginalia.model.crawl.PubDate;
@@ -17,6 +17,8 @@ include 'code:libraries:random-write-funnel'
 include 'code:libraries:next-prime'
 include 'code:libraries:braille-block-punch-cards'
 include 'code:libraries:language-processing'
+include 'code:libraries:ngram-bloom-filter'
+include 'code:libraries:term-frequency-dict'
 
 include 'code:features-search:screenshots'
 include 'code:features-search:random-websites'