Yet more restructuring.

This commit is contained in:
Viktor Lofgren 2023-03-13 23:40:26 +01:00
parent d82532b7f1
commit 0ecab53635
27 changed files with 147 additions and 85 deletions

View File

@ -23,6 +23,7 @@ dependencies {
implementation project(':code:common:model')
implementation project(':code:common:config')
implementation project(':code:libraries:language-processing')
implementation project(':code:libraries:term-frequency-dict')
implementation libs.lombok
annotationProcessor libs.lombok

View File

@ -6,7 +6,7 @@ import nu.marginalia.language.WordPatterns;
import nu.marginalia.language.encoding.AsciiFlattener;
import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.language.model.WordRep;
import nu.marginalia.language.statistics.TermFrequencyDict;
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
import nu.marginalia.model.EdgeUrl;
import javax.inject.Inject;

View File

@ -8,7 +8,7 @@ import nu.marginalia.language.model.DocumentSentence;
import nu.marginalia.language.model.WordRep;
import nu.marginalia.keyword_extraction.KeywordExtractor;
import nu.marginalia.language.model.WordSpan;
import nu.marginalia.language.statistics.TermFrequencyDict;
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
import org.apache.commons.lang3.StringUtils;
import java.util.*;

View File

@ -6,7 +6,7 @@ import nu.marginalia.language.WordPatterns;
import nu.marginalia.language.model.WordRep;
import nu.marginalia.language.model.WordSpan;
import nu.marginalia.language.sentence.SentenceExtractor;
import nu.marginalia.language.statistics.TermFrequencyDict;
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
import nu.marginalia.language.model.WordSeparator;
import nu.marginalia.WmsaHome;
import nu.marginalia.model.EdgeUrl;

View File

@ -3,7 +3,7 @@ package nu.marginalia.keyword_extraction.extractors;
import com.google.common.collect.Sets;
import nu.marginalia.keyword_extraction.KeywordExtractor;
import nu.marginalia.language.sentence.SentenceExtractor;
import nu.marginalia.language.statistics.TermFrequencyDict;
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
import nu.marginalia.test.util.TestLanguageModels;
import org.junit.jupiter.api.Test;

View File

@ -52,7 +52,6 @@ public class ForwardIndexReader {
private static TLongIntHashMap loadIds(Path idsFile) throws IOException {
var idsArray = LongArray.mmapRead(idsFile);
idsArray.advice(NativeIO.Advice.Sequential);
var ids = new TLongIntHashMap((int) idsArray.size(), 0.5f, -1, -1);
@ -85,20 +84,9 @@ public class ForwardIndexReader {
return Math.max(0, (int) data.get(ENTRY_SIZE * offset + DOMAIN_OFFSET));
}
/**
 * Reads the metadata and domain id stored for a document and bundles them
 * into a {@link DocPost}.
 *
 * @param docId id of the document to look up
 * @return the entry's metadata value and its domain id, the latter clamped
 *         so that it is never negative
 * @throws IllegalStateException if {@code idxForDoc} yields a negative
 *         offset for {@code docId}
 */
public DocPost docPost(long docId) {
    final long entry = idxForDoc(docId);
    if (entry < 0) {
        throw new IllegalStateException("Forward index is not loaded");
    }

    // Both fields live in the same entry; compute its start once.
    final long base = ENTRY_SIZE * entry;
    final long metadata = data.get(base + METADATA_OFFSET);
    final long rawDomain = data.get(base + DOMAIN_OFFSET);

    return new DocPost(metadata, Math.max(0, (int) rawDomain));
}
/**
 * Resolves a document id to the value stored for it in {@code idToOffset}.
 * Callers in this class treat a negative result as "no usable entry".
 *
 * @param docId id of the document to resolve
 * @return whatever {@code idToOffset} holds for {@code docId}
 */
private int idxForDoc(long docId) {
    final int idx = idToOffset.get(docId);
    return idx;
}
/**
 * Pair of values read from a single forward index entry.
 *
 * @param meta     value read at the entry's {@code METADATA_OFFSET}
 * @param domainId value read at the entry's {@code DOMAIN_OFFSET}; {@code docPost}
 *                 clamps it to be non-negative before constructing the record
 */
public record DocPost(long meta, int domainId) {}
}

View File

@ -13,6 +13,8 @@ java {
}
dependencies {
implementation project(':code:libraries:language-processing')
implementation project(':code:libraries:ngram-bloom-filter')
implementation project(':code:libraries:term-frequency-dict')
implementation project(':code:features-convert:keyword-extraction')
implementation project(':code:common:config')
implementation project(':code:common:model')

View File

@ -8,8 +8,8 @@ import nu.marginalia.LanguageModels;
import nu.marginalia.keyword_extraction.KeywordExtractor;
import nu.marginalia.language.statistics.EnglishDictionary;
import nu.marginalia.language.sentence.SentenceExtractor;
import nu.marginalia.language.statistics.NGramBloomFilter;
import nu.marginalia.language.statistics.TermFrequencyDict;
import nu.marginalia.ngram_bloom_filter.NGramBloomFilter;
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
import nu.marginalia.language.model.DocumentSentence;
import nu.marginalia.language.model.WordSpan;
import nu.marginalia.query_parser.token.Token;

View File

@ -2,8 +2,8 @@ package nu.marginalia.query_parser;
import nu.marginalia.LanguageModels;
import nu.marginalia.language.statistics.EnglishDictionary;
import nu.marginalia.language.statistics.NGramBloomFilter;
import nu.marginalia.language.statistics.TermFrequencyDict;
import nu.marginalia.ngram_bloom_filter.NGramBloomFilter;
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
import nu.marginalia.query_parser.token.TokenType;
import nu.marginalia.util.TestLanguageModels;
import org.junit.jupiter.api.BeforeAll;

View File

@ -2,8 +2,8 @@ package nu.marginalia.query_parser;
import nu.marginalia.LanguageModels;
import nu.marginalia.language.statistics.EnglishDictionary;
import nu.marginalia.language.statistics.NGramBloomFilter;
import nu.marginalia.language.statistics.TermFrequencyDict;
import nu.marginalia.ngram_bloom_filter.NGramBloomFilter;
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
import nu.marginalia.util.TestLanguageModels;
import nu.marginalia.language.sentence.SentenceExtractor;
import org.junit.jupiter.api.BeforeAll;

View File

@ -21,6 +21,7 @@ dependencies {
implementation project(':code:common:model')
implementation project(':code:common:config')
implementation project(':code:libraries:easy-lsh')
implementation project(':code:libraries:term-frequency-dict')
implementation libs.lombok
annotationProcessor libs.lombok

View File

@ -8,9 +8,6 @@ This library contains various tools used in language processing.
Creates a [DocumentLanguageData](src/main/java/nu/marginalia/language/model/DocumentLanguageData.java) from a text, containing
its words, how they stem, POS tags, and so on.
* [TermFrequencyDict](src/main/java/nu/marginalia/language/statistics/TermFrequencyDict.java)
* [NGramBloomFilter](src/main/java/nu/marginalia/language/statistics/NGramBloomFilter.java)
## See Also
[features-convert/keyword-extraction](../../features-convert/keyword-extraction) uses this code to identify which keywords

View File

@ -1,6 +1,7 @@
package nu.marginalia.language.statistics;
import com.google.inject.Inject;
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

View File

@ -0,0 +1,42 @@
// Gradle build script for a Java library module, added as a new file by this commit.
// NOTE(review): presumably code/libraries/ngram-bloom-filter/build.gradle — confirm against the repo.
plugins {
id 'java'
// Lombok integration for the annotation-generated code used in the sources.
id "io.freefair.lombok" version "5.3.3.3"
// NOTE(review): no download task is declared in this script — confirm the plugin is still needed.
id "de.undercouch.download" version "5.1.0"
}
// Compile and run with a Java 17 toolchain regardless of the JDK that launches Gradle.
java {
toolchain {
languageVersion.set(JavaLanguageVersion.of(17))
}
}
dependencies {
// Sibling projects from this multi-project build.
implementation project(':code:common:config')
implementation project(':third-party:porterstemmer')
// `libs.*` entries are resolved outside this script (presumably the project's version catalog).
implementation libs.lombok
annotationProcessor libs.lombok
implementation libs.bundles.slf4j
implementation libs.notnull
implementation libs.bundles.nlp
implementation libs.guice
implementation libs.trove
implementation libs.fastutil
// Test-only dependencies.
testImplementation libs.bundles.slf4j.test
testImplementation libs.bundles.junit
testImplementation libs.mockito
}
// Run the default test task on the JUnit Platform.
test {
useJUnitPlatform()
}
// Additional test task that skips every test tagged "slow".
task fastTests(type: Test) {
useJUnitPlatform {
excludeTags "slow"
}
}

View File

@ -1,4 +1,4 @@
package nu.marginalia.language.statistics;
package nu.marginalia.ngram_bloom_filter;
import java.io.IOException;
import java.nio.ByteBuffer;

View File

@ -1,4 +1,4 @@
package nu.marginalia.language.statistics;
package nu.marginalia.ngram_bloom_filter;
import ca.rmen.porterstemmer.PorterStemmer;
import com.google.common.hash.HashFunction;
@ -46,30 +46,10 @@ public class NGramBloomFilter {
return bitMap.get(bit);
}
// public static void main(String... args) throws IOException {
// var filter = convertFromDictionaryFile(new File(args[0]));
// filter.bitMap.writeToFile(Path.of(args[1]));
// }
/**
 * Creates a filter backed by the bitmap stored in the given file.
 *
 * @param file path passed on to {@code DenseBitMap.loadFromFile}
 * @return a new {@code NGramBloomFilter} wrapping the loaded bitmap
 * @throws IOException if the bitmap cannot be loaded
 */
public static NGramBloomFilter load(Path file) throws IOException {
    var loadedBits = DenseBitMap.loadFromFile(file);
    return new NGramBloomFilter(loadedBits);
}
// public static NGramBloomFilter convertFromDictionaryFile(File file) throws IOException {
// DenseBitMap bitMap = new DenseBitMap(1024*1024*1024L);
// AtomicInteger popCount = new AtomicInteger();
// try (var f = new KeywordLexiconJournalFile(file)) {
// f.loadFile(data -> {
// long bit = bitForWord(new String(data), bitMap.cardinality);
// if (!bitMap.set(bit))
// popCount.incrementAndGet();
// });
// }
//
// System.out.println("popcount = " + popCount.get());
// return new NGramBloomFilter(bitMap);
// }
private static final Pattern underscore = Pattern.compile("_");
private static long bitForWord(String s, long n) {

View File

@ -1,4 +1,4 @@
package nu.marginalia.language.statistics;
package nu.marginalia.ngram_bloom_filter;
import org.junit.jupiter.api.Test;

View File

@ -0,0 +1,52 @@
// Gradle build script for a Java library module, added as a new file by this commit.
// NOTE(review): presumably code/libraries/term-frequency-dict/build.gradle — confirm against the repo.
plugins {
id 'java'
// Lombok integration for the annotation-generated code used in the sources.
id "io.freefair.lombok" version "5.3.3.3"
// NOTE(review): JMH, download and jvm-test-suite are applied but nothing in this script
// configures them — confirm they are needed here.
id "me.champeau.jmh" version "0.6.6"
id "de.undercouch.download" version "5.1.0"
id 'jvm-test-suite'
}
// Compile and run with a Java 17 toolchain regardless of the JDK that launches Gradle.
java {
toolchain {
languageVersion.set(JavaLanguageVersion.of(17))
}
}
dependencies {
// Sibling projects from this multi-project build.
// NOTE(review): verify each of these is actually used by this module's sources.
implementation project(':third-party:rdrpostagger')
implementation project(':third-party:porterstemmer')
implementation project(':third-party:monkey-patch-opennlp')
implementation project(':code:common:model')
implementation project(':code:common:config')
implementation project(':code:libraries:easy-lsh')
// `libs.*` entries are resolved outside this script (presumably the project's version catalog).
implementation libs.lombok
annotationProcessor libs.lombok
implementation libs.bundles.slf4j
implementation libs.notnull
implementation libs.guice
implementation libs.jsoup
implementation libs.trove
implementation libs.fastutil
implementation libs.bundles.nlp
implementation libs.commons.lang3
// Test-only dependencies.
testImplementation libs.bundles.slf4j.test
testImplementation libs.bundles.junit
testImplementation libs.mockito
}
// Run the default test task on the JUnit Platform.
test {
useJUnitPlatform()
}
// Additional test task that skips every test tagged "slow".
task fastTests(type: Test) {
useJUnitPlatform {
excludeTags "slow"
}
}

View File

@ -1,60 +1,51 @@
package nu.marginalia.language.statistics;
package nu.marginalia.term_frequency_dict;
import ca.rmen.porterstemmer.PorterStemmer;
import gnu.trove.map.hash.TLongIntHashMap;
import nu.marginalia.LanguageModels;
import org.apache.commons.lang3.StringUtils;
import org.jetbrains.annotations.NotNull;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import javax.annotation.Nullable;
import javax.inject.Inject;
import javax.inject.Singleton;
import java.io.*;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Arrays;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
@Singleton
public class TermFrequencyDict {
private final TLongIntHashMap wordRates = new TLongIntHashMap(1_000_000, 0.5f, 0, 0);
private final Logger logger = LoggerFactory.getLogger(getClass());
private static final Pattern separator = Pattern.compile("[_ ]+");
private static final PorterStemmer ps = new PorterStemmer();
private static final long DOC_COUNT_KEY = ~0L;
/**
 * Returns the size of the given file as reported by {@link Files#size(Path)}.
 *
 * @param p file to measure
 * @return the size reported for {@code p}
 * @throws IOException if the size cannot be determined
 */
private static long fileSize(Path p) throws IOException {
    final long bytes = Files.size(p);
    return bytes;
}
@Inject
public TermFrequencyDict(@Nullable LanguageModels models) {
if (models == null) {
return;
}
public TermFrequencyDict(@NotNull LanguageModels models) {
this(models.termFrequencies);
}
if (models.termFrequencies != null) {
public TermFrequencyDict(Path file) {
try (var frequencyData = new DataInputStream(new BufferedInputStream(new FileInputStream(file.toFile())))) {
wordRates.ensureCapacity((int)(Files.size(file)/16));
try (var frequencyData = new DataInputStream(new BufferedInputStream(new FileInputStream(models.termFrequencies.toFile())))) {
wordRates.ensureCapacity((int)(fileSize(models.termFrequencies)/16));
for (;;) {
wordRates.put(frequencyData.readLong(), (int) frequencyData.readLong());
}
} catch (EOFException eof) {
// ok
} catch (IOException e) {
logger.error("IO Exception reading " + models.termFrequencies, e);
for (;;) {
wordRates.put(frequencyData.readLong(), (int) frequencyData.readLong());
}
} catch (EOFException eof) {
// ok
} catch (IOException e) {
logger.error("IO Exception reading " + file, e);
}
logger.info("Read {} N-grams frequencies", wordRates.size());
}
/**
 * Creates a dictionary whose {@code wordRates} map is filled with the
 * entries of the given map; no file is read.
 *
 * @param data entries to copy into this dictionary
 */
public TermFrequencyDict(TLongIntHashMap data) {
    this.wordRates.putAll(data);
}
public int docCount() {
int cnt = wordRates.get(DOC_COUNT_KEY);
@ -65,6 +56,7 @@ public class TermFrequencyDict {
return cnt;
}
// WIP refactoring, this needs a new home:
//
// public static void main(String... args) throws IOException, InterruptedException {
// if (args.length != 2) {
@ -151,8 +143,8 @@ public class TermFrequencyDict {
// }
public static long getStringHash(String s) {
String[] strings = separator.split(s);
if (s.length() > 1) {
if (s.indexOf(' ') >= 0 || s.indexOf('_') >= 0) {
String[] strings = StringUtils.split(s, " _");
byte[][] parts = new byte[strings.length][];
for (int i = 0; i < parts.length; i++) {
parts[i] = ps.stemWord(strings[i]).getBytes();
@ -163,6 +155,7 @@ public class TermFrequencyDict {
return longHash(s.getBytes());
}
}
/**
 * Looks up the value stored in {@code wordRates} under an already-computed hash.
 *
 * @param hash key to look up
 * @return the stored value widened to {@code long}, or the map's
 *         no-entry value when the key is absent
 */
public long getTermFreqHash(long hash) {
    final long stored = wordRates.get(hash);
    return stored;
}

View File

@ -31,8 +31,9 @@ dependencies {
implementation project(':code:common:service-client')
implementation project(':code:features-search:screenshots')
implementation project(':code:libraries:language-processing')
implementation project(':code:libraries:language-processing')
implementation project(':code:libraries:term-frequency-dict')
implementation libs.lombok
annotationProcessor libs.lombok

View File

@ -2,7 +2,7 @@ package nu.marginalia.assistant.suggest;
import com.google.inject.Inject;
import com.google.inject.name.Named;
import nu.marginalia.language.statistics.TermFrequencyDict;
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
import nu.marginalia.model.crawl.HtmlFeature;
import nu.marginalia.assistant.dict.SpellChecker;
import org.apache.commons.collections4.trie.PatriciaTrie;

View File

@ -29,6 +29,8 @@ dependencies {
implementation project(':code:libraries:easy-lsh')
implementation project(':code:libraries:language-processing')
implementation project(':code:libraries:braille-block-punch-cards')
implementation project(':code:libraries:ngram-bloom-filter')
implementation project(':code:libraries:term-frequency-dict')
implementation project(':code:api:assistant-api')
implementation project(':code:api:index-api')

View File

@ -9,8 +9,8 @@ import nu.marginalia.index.query.limit.QueryLimits;
import nu.marginalia.index.query.limit.QueryStrategy;
import nu.marginalia.index.query.limit.SpecificationLimit;
import nu.marginalia.language.statistics.EnglishDictionary;
import nu.marginalia.language.statistics.NGramBloomFilter;
import nu.marginalia.language.statistics.TermFrequencyDict;
import nu.marginalia.ngram_bloom_filter.NGramBloomFilter;
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
import nu.marginalia.query_parser.QueryParser;
import nu.marginalia.query_parser.QueryPermutation;
import nu.marginalia.query_parser.QueryVariants;

View File

@ -2,7 +2,7 @@ package nu.marginalia.search.valuation;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import nu.marginalia.language.statistics.TermFrequencyDict;
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
import nu.marginalia.model.idx.WordFlags;
import nu.marginalia.model.idx.WordMetadata;
import nu.marginalia.index.client.model.results.SearchResultKeywordScore;

View File

@ -4,8 +4,8 @@ import nu.marginalia.WmsaHome;
import nu.marginalia.index.query.limit.SpecificationLimitType;
import nu.marginalia.language.statistics.EnglishDictionary;
import nu.marginalia.index.client.model.query.SearchSpecification;
import nu.marginalia.language.statistics.NGramBloomFilter;
import nu.marginalia.language.statistics.TermFrequencyDict;
import nu.marginalia.ngram_bloom_filter.NGramBloomFilter;
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
import nu.marginalia.search.command.SearchJsParameter;
import nu.marginalia.search.model.SearchProfile;
import nu.marginalia.search.query.model.UserSearchParameters;

View File

@ -1,7 +1,7 @@
package nu.marginalia.search.valuation;
import nu.marginalia.index.client.model.results.SearchResultKeywordScore;
import nu.marginalia.language.statistics.TermFrequencyDict;
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
import nu.marginalia.model.idx.DocumentFlags;
import nu.marginalia.model.idx.WordFlags;
import nu.marginalia.model.crawl.PubDate;

View File

@ -17,6 +17,8 @@ include 'code:libraries:random-write-funnel'
include 'code:libraries:next-prime'
include 'code:libraries:braille-block-punch-cards'
include 'code:libraries:language-processing'
include 'code:libraries:ngram-bloom-filter'
include 'code:libraries:term-frequency-dict'
include 'code:features-search:screenshots'
include 'code:features-search:random-websites'