More restructuring, big bug fixes in keyword extraction.

Viktor Lofgren 2023-03-13 17:39:53 +01:00
parent 281f1322a9
commit d82532b7f1
381 changed files with 1205 additions and 820 deletions

View File

@ -1,3 +0,0 @@
# Crawl Common
Contains model classes shared by the whole crawl-process-load ecosystem.

View File

@ -1,10 +0,0 @@
log4j2.isThreadContextMapInheritable=true
status = info
appender.console.type = Console
appender.console.name = LogToConsole
appender.console.layout.type = PatternLayout
appender.console.layout.pattern = %highlight{%-5level}{FATAL=red, ERROR=red, WARN=yellow} %c{1}- %msg{nolookups}%n
appender.console.filter.http.type = MarkerFilter
rootLogger.level = info
rootLogger.appenderRef.console.ref = LogToConsole
#rootLogger.appenderRef.http.ref = LogHttpTraffic

View File

@ -1,4 +0,0 @@
# Converting Models
Contains models shared by the [converting-process](../../crawl-processes/converting-process/) and
[loading-process](../../crawl-processes/loading-process/).

View File

@ -1,101 +0,0 @@
package nu.marginalia.converting.processor.keywords;
import nu.marginalia.converting.processor.keywords.extractors.*;
import nu.marginalia.language.WordPatterns;
import nu.marginalia.language.encoding.AsciiFlattener;
import nu.marginalia.language.keywords.KeywordExtractor;
import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.language.model.KeywordMetadata;
import nu.marginalia.language.model.WordRep;
import nu.marginalia.converting.model.DocumentKeywordsBuilder;
import nu.marginalia.language.statistics.TermFrequencyDict;
import nu.marginalia.model.EdgeUrl;
import javax.inject.Inject;
import java.util.*;
import java.util.stream.Collectors;
public class DocumentKeywordExtractor {
private final KeywordExtractor keywordExtractor;
private final KeywordCounter tfIdfCounter;
private final NameCounter nameCounter;
private final SubjectCounter subjectCounter;
private final ArtifactKeywords artifactKeywords;
private final SimpleKeywords simpleKeywords;
private final UrlKeywords urlKeywords;
private final DocumentKeywordPositionBitmaskExtractor keywordPositions;
@Inject
public DocumentKeywordExtractor(TermFrequencyDict dict) {
keywordExtractor = new KeywordExtractor();
keywordPositions = new DocumentKeywordPositionBitmaskExtractor(keywordExtractor);
artifactKeywords = new ArtifactKeywords();
urlKeywords = new UrlKeywords();
tfIdfCounter = new KeywordCounter(dict, keywordExtractor);
nameCounter = new NameCounter(keywordExtractor);
subjectCounter = new SubjectCounter(keywordExtractor);
simpleKeywords = new SimpleKeywords(keywordExtractor);
}
public DocumentKeywordsBuilder extractKeywords(DocumentLanguageData documentLanguageData, EdgeUrl url) {
KeywordMetadata keywordMetadata = keywordPositions.getWordPositions(documentLanguageData);
List<WordRep> wordsTfIdf = tfIdfCounter.updateWordStatistics(keywordMetadata, documentLanguageData);
List<WordRep> titleWords = extractTitleWords(documentLanguageData);
List<WordRep> wordsNamesAll = nameCounter.count(documentLanguageData, 2);
List<WordRep> subjects = subjectCounter.count(keywordMetadata, documentLanguageData);
List<String> artifacts = artifactKeywords.getArtifactKeywords(documentLanguageData);
for (var rep : titleWords) keywordMetadata.titleKeywords.add(rep.stemmed);
for (var rep : wordsNamesAll) keywordMetadata.namesKeywords.add(rep.stemmed);
for (var rep : subjects) keywordMetadata.subjectKeywords.add(rep.stemmed);
keywordMetadata.urlKeywords.addAll(urlKeywords.getUrlKeywords(url));
keywordMetadata.domainKeywords.addAll(urlKeywords.getDomainKeywords(url));
DocumentKeywordsBuilder wordsBuilder = new DocumentKeywordsBuilder();
simpleKeywords.getSimpleWords(wordsBuilder, keywordMetadata, documentLanguageData);
createWords(wordsBuilder, keywordMetadata, wordsTfIdf);
createWords(wordsBuilder, keywordMetadata, titleWords);
createWords(wordsBuilder, keywordMetadata, subjects);
wordsBuilder.addAllSyntheticTerms(artifacts);
return wordsBuilder;
}
private List<WordRep> extractTitleWords(DocumentLanguageData documentLanguageData) {
return Arrays.stream(documentLanguageData.titleSentences).flatMap(sent ->
keywordExtractor.getWordsFromSentence(sent).stream().sorted().distinct().map(w -> new WordRep(sent, w)))
.limit(100)
.collect(Collectors.toList());
}
public void createWords(DocumentKeywordsBuilder wordsBuilder,
KeywordMetadata metadata,
Collection<WordRep> words) {
for (var word : words) {
String flatWord = AsciiFlattener.flattenUnicode(word.word);
if (WordPatterns.hasWordQualities(flatWord)) {
wordsBuilder.add(flatWord, metadata.getMetadataForWord(metadata.wordFlagsTemplate(), word.stemmed));
}
}
}
}

View File

@ -1,109 +0,0 @@
package nu.marginalia.converting.processor.keywords.extractors;
import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap;
import nu.marginalia.language.WordPatterns;
import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.language.model.KeywordMetadata;
import nu.marginalia.language.model.WordRep;
import nu.marginalia.language.keywords.KeywordExtractor;
import nu.marginalia.language.statistics.TermFrequencyDict;
import org.apache.commons.lang3.StringUtils;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import static java.lang.Math.max;
public class KeywordCounter {
private final KeywordExtractor keywordExtractor;
private final TermFrequencyDict dict;
private final double docCount;
public KeywordCounter(TermFrequencyDict dict, KeywordExtractor keywordExtractor) {
this.dict = dict;
this.keywordExtractor = keywordExtractor;
this.docCount = dict.docCount();
}
public List<WordRep> updateWordStatistics(KeywordMetadata keywordMetadata, DocumentLanguageData dld) {
Object2IntOpenHashMap<String> counts = new Object2IntOpenHashMap<>(10_000, 0.7f);
HashMap<String, HashSet<WordRep>> instances = new HashMap<>(15000);
for (var sent : dld.sentences) {
var keywords = keywordExtractor.getKeywordsFromSentence(sent);
for (var span : keywords) {
if (span.size() == 1 && WordPatterns.isStopWord(sent.words[span.start])) {
continue;
}
var rep = new WordRep(sent, span);
counts.mergeInt(rep.stemmed, 1, Integer::sum);
var instanceSet = instances.computeIfAbsent(rep.stemmed, k -> new HashSet<>(16));
if (instanceSet.size() < 4) {
instanceSet.add(rep);
}
}
}
Object2IntOpenHashMap<String> tfIdf = keywordMetadata.wordsTfIdf;
List<WordRep> tfIdfHigh = new ArrayList<>();
int maxVal = maxValue(counts);
counts.forEach((key, cnt) -> {
int value = getTermValue(key, cnt, maxVal);
tfIdf.put(key, value);
if (cnt > 1 && value > 100) {
tfIdfHigh.addAll(instances.get(key));
}
});
return tfIdfHigh;
}
private int maxValue(Object2IntOpenHashMap<?> map) {
int maxC = 0;
for (int c : map.values()) {
maxC = max(c, maxC);
}
return maxC;
}
public int getTermValue(String key, int count, double maxValue) {
if (key.indexOf('_') >= 0) {
String[] parts = StringUtils.split(key, '_');
double totalValue = 0.;
for (String part : parts) {
totalValue += value(part, count, maxValue);
}
return normalizeValue(totalValue / parts.length);
}
else {
return normalizeValue(value(key, count, maxValue));
}
}
int normalizeValue(double v) {
return (int)(-v*75);
}
double value(String key, double value, double maxValue) {
double freq = dict.getTermFreqStemmed(key);
if (freq < 1) {
freq = 1;
}
return (0.1 + 0.9*value/maxValue) * Math.log(freq/docCount);
}
}

View File

@ -1,45 +0,0 @@
package nu.marginalia.converting.processor.keywords.extractors;
import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.language.model.DocumentSentence;
import nu.marginalia.language.model.WordRep;
import nu.marginalia.language.keywords.KeywordExtractor;
import java.util.*;
import java.util.stream.Collectors;
public class NameCounter {
private final KeywordExtractor keywordExtractor;
public NameCounter(KeywordExtractor keywordExtractor) {
this.keywordExtractor = keywordExtractor;
}
public List<WordRep> count(DocumentLanguageData dld, int minCount) {
HashMap<String, Double> counts = new HashMap<>(1000);
HashMap<String, HashSet<WordRep>> instances = new HashMap<>(1000);
for (int i = 0; i < dld.sentences.length; i++) {
DocumentSentence sent = dld.sentences[i];
var keywords = keywordExtractor.getProperNames(sent);
for (var span : keywords) {
if (span.size() <= 1)
continue;
var stemmed = sent.constructStemmedWordFromSpan(span);
counts.merge(stemmed, 1., Double::sum);
instances.computeIfAbsent(stemmed, k -> new HashSet<>()).add(new WordRep(sent, span));
}
}
return counts.entrySet().stream()
.filter(e -> e.getValue() >= minCount)
.sorted(Comparator.comparing(e -> -e.getValue()))
.limit(150)
.map(Map.Entry::getKey)
.flatMap(w -> instances.get(w).stream())
.collect(Collectors.toList());
}
}

View File

@ -1,54 +0,0 @@
package nu.marginalia.converting.processor.keywords.extractors;
import nu.marginalia.converting.model.DocumentKeywordsBuilder;
import nu.marginalia.language.WordPatterns;
import nu.marginalia.language.encoding.AsciiFlattener;
import nu.marginalia.language.keywords.KeywordExtractor;
import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.language.model.KeywordMetadata;
import nu.marginalia.language.model.WordRep;
import nu.marginalia.model.idx.WordFlags;
import java.util.EnumSet;
public class SimpleKeywords {
private final KeywordExtractor keywordExtractor;
public SimpleKeywords(KeywordExtractor keywordExtractor) {
this.keywordExtractor = keywordExtractor;
}
public void getSimpleWords(DocumentKeywordsBuilder wordsBuilder,
KeywordMetadata metadata,
DocumentLanguageData documentLanguageData) {
EnumSet<WordFlags> flagsTemplate = EnumSet.noneOf(WordFlags.class);
for (var sent : documentLanguageData.sentences) {
if (wordsBuilder.size() > 1500)
break;
for (var word : sent) {
if (word.isStopWord()) {
continue;
}
String w = AsciiFlattener.flattenUnicode(word.wordLowerCase());
if (WordPatterns.singleWordQualitiesPredicate.test(w)) {
wordsBuilder.add(w, metadata.getMetadataForWord(flagsTemplate, word.stemmed()));
}
}
for (var names : keywordExtractor.getProperNames(sent)) {
var rep = new WordRep(sent, names);
String w = AsciiFlattener.flattenUnicode(rep.word);
wordsBuilder.add(w, metadata.getMetadataForWord(flagsTemplate, rep.stemmed));
}
}
}
}

View File

@ -1,27 +0,0 @@
package nu.marginalia.converting.processor.keywords.extractors;
import ca.rmen.porterstemmer.PorterStemmer;
import nu.marginalia.model.EdgeUrl;
import java.util.Arrays;
import java.util.Set;
import java.util.stream.Collectors;
public class UrlKeywords {
private final PorterStemmer ps = new PorterStemmer();
public Set<String> getUrlKeywords(EdgeUrl url) {
String path = url.path;
return Arrays.stream(path.split("[^a-z0-9A-Z]+"))
.map(ps::stemWord)
.collect(Collectors.toSet());
}
public Set<String> getDomainKeywords(EdgeUrl url) {
return Arrays.stream(url.domain.domain.split("[^a-z0-9A-Z]+"))
.filter(s -> s.length() > 3)
.map(ps::stemWord)
.collect(Collectors.toSet());
}
}

View File

@ -1,129 +0,0 @@
package nu.marginalia.converting.tool;
import nu.marginalia.LanguageModels;
import nu.marginalia.converting.processor.keywords.extractors.KeywordCounter;
import nu.marginalia.converting.processor.keywords.extractors.NameCounter;
import nu.marginalia.language.keywords.KeywordExtractor;
import nu.marginalia.language.sentence.SentenceExtractor;
import nu.marginalia.language.model.DocumentSentence;
import nu.marginalia.language.model.WordRep;
import nu.marginalia.language.model.WordSeparator;
import nu.marginalia.language.statistics.TermFrequencyDict;
import org.jsoup.nodes.Document;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.PrintWriter;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.*;
import java.util.stream.Collectors;
public class DocumentDebugger {
private final KeywordCounter kc;
private final SentenceExtractor se;
private final KeywordExtractor ke;
private final NameCounter nc;
final Map<String, Path> docsByPath = new TreeMap<>();
Path tempDir;
public DocumentDebugger(LanguageModels lm) throws IOException {
se = new SentenceExtractor(lm);
var dict = new TermFrequencyDict(lm);
ke = new KeywordExtractor();
kc = new KeywordCounter(dict, ke);
nc = new NameCounter(ke);
tempDir = Files.createTempDirectory("documentdebugger");
}
public void writeIndex() throws FileNotFoundException {
var output = tempDir.resolve("index.html");
try (var pw = new PrintWriter(new FileOutputStream(output.toFile()))) {
pw.println("<ul>");
docsByPath.forEach((name, path) -> {
pw.println("<li>");
pw.printf("<a href=\"file://%s\">%s</a>", path, name);
pw.println("</li>");
});
pw.println("</ul>");
}
System.out.println(output);
}
public Path debugDocument(String name, Document document) throws IOException {
var output = tempDir.resolve(name.substring(name.lastIndexOf("/")+1)+".html");
docsByPath.put(name, output);
document.select("table,sup,.reference").remove();
var languageData = se.extractSentences(document);
Set<String> reps = new HashSet<>();
try (var pw = new PrintWriter(new FileOutputStream(output.toFile()))) {
for (var sent : languageData.titleSentences) {
pw.print("<h1>");
printSent(pw, sent, reps);
pw.println("</h1>");
}
for (var sent : languageData.sentences) {
pw.println("<div>");
printSent(pw, sent, reps);
pw.println("</div>");
}
}
return output;
}
private void printSent(PrintWriter pw, DocumentSentence sent, Set<String> words) {
TreeMap<Integer, Set<WordRep>> spans = new TreeMap<>();
var names = ke.getKeywordsFromSentence(sent);
for (var span : names) {
for (int j = 0; j < span.size(); j++) {
spans.computeIfAbsent(span.start + j, n -> new HashSet<>()).add(new WordRep(sent, span));
}
}
for (int i = 0; i < sent.words.length; i++) {
List<WordRep> matches = spans.getOrDefault(i, Collections.emptySet()).stream().filter(rep -> true || words.contains(rep.stemmed)).collect(Collectors.toList());
printTag(pw, sent, i, matches);
}
}
private void printTag(PrintWriter pw, DocumentSentence sent, int i, List<WordRep> matches) {
String style;
if (matches.isEmpty()) {
style = "";
}
else if (matches.size() == 1 && !matches.get(0).word.contains("_")) {
style = "text-decoration: underline; color: #00f";
}
else {
style = "text-decoration: underline; color: #f00";
}
pw.printf("<ruby title=\"%s\" style=\"%s\">",
matches.stream().map(rep -> rep.word).collect(Collectors.joining(", ")),
style
);
pw.print(sent.words[i]);
pw.print("<rt>"); pw.println(sent.posTags[i]); pw.print("</rt>");
pw.print("</ruby> ");
if (sent.separators[i] == WordSeparator.COMMA)
pw.printf(", ");
}
}

View File

@ -0,0 +1,56 @@
plugins {
id 'java'
id "io.freefair.lombok" version "5.3.3.3"
id 'application'
id 'jvm-test-suite'
}
java {
toolchain {
languageVersion.set(JavaLanguageVersion.of(17))
}
}
application {
mainClass = 'nu.marginalia.converting.ConverterMain'
applicationName = 'converter-process'
}
tasks.distZip.enabled = false
dependencies {
implementation project(':third-party:porterstemmer')
implementation project(':code:common:model')
implementation project(':code:common:config')
implementation project(':code:libraries:language-processing')
implementation libs.lombok
annotationProcessor libs.lombok
implementation libs.bundles.slf4j
implementation libs.notnull
implementation libs.jsoup
implementation libs.commons.lang3
implementation libs.guice
implementation libs.guava
implementation libs.trove
implementation libs.fastutil
testImplementation libs.bundles.slf4j.test
testImplementation libs.bundles.junit
testImplementation libs.mockito
}
test {
maxHeapSize = "8G"
useJUnitPlatform()
}
task fastTests(type: Test) {
useJUnitPlatform {
excludeTags "slow"
}
}

View File

@ -0,0 +1,13 @@
# Keyword Extraction
This code deals with identifying keywords in a document, their positions in the document,
their importance based on [TF-IDF](https://en.wikipedia.org/wiki/Tf-idf) and their grammatical
functions based on [POS tags](https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html).
## Central Classes
* [DocumentKeywordExtractor](src/main/java/nu/marginalia/keyword_extraction/DocumentKeywordExtractor.java)
## See Also
* [libraries/language-processing](../../libraries/language-processing) does a lot of the heavy lifting.
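## Example
A minimal sketch of how the extractor is wired together, assuming the `SentenceExtractor` and
`TermFrequencyDict` setup used in the tests; the language models, the input text and the URL
are placeholders:

```java
var lm = TestLanguageModels.getLanguageModels();
var dict = new TermFrequencyDict(lm);
var se = new SentenceExtractor(lm);
var extractor = new DocumentKeywordExtractor(dict);

// EdgeUrl's constructor may throw URISyntaxException
var dld = se.extractSentences("Document body text ...", "Document title");
var words = extractor.extractKeywords(dld, new EdgeUrl("https://www.example.com/"));
```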

View File

@ -0,0 +1,104 @@
package nu.marginalia.keyword_extraction;
import nu.marginalia.keyword_extraction.extractors.*;
import nu.marginalia.keyword_extraction.model.DocumentKeywordsBuilder;
import nu.marginalia.language.WordPatterns;
import nu.marginalia.language.encoding.AsciiFlattener;
import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.language.model.WordRep;
import nu.marginalia.language.statistics.TermFrequencyDict;
import nu.marginalia.model.EdgeUrl;
import javax.inject.Inject;
public class DocumentKeywordExtractor {
private final KeywordExtractor keywordExtractor;
private final TermFrequencyDict dict;
@Inject
public DocumentKeywordExtractor(TermFrequencyDict dict) {
this.dict = dict;
this.keywordExtractor = new KeywordExtractor();
}
public DocumentKeywordsBuilder extractKeywords(DocumentLanguageData dld, EdgeUrl url) {
var bitmask = new KeywordPositionBitmask(keywordExtractor, dld);
var tfIdfCounts = new WordsTfIdfCounts(dict, keywordExtractor, dld);
var titleKeywords = new TitleKeywords(keywordExtractor, dld);
var nameLikeKeywords = new NameLikeKeywords(keywordExtractor, dld, 2);
var subjectLikeKeywords = new SubjectLikeKeywords(keywordExtractor, tfIdfCounts, dld);
var artifactKeywords = new ArtifactKeywords(dld);
var urlKeywords = new UrlKeywords(url);
var keywordMetadata = KeywordMetadata.builder()
.bitmask(bitmask)
.tfIdfCounts(tfIdfCounts)
.titleKeywords(titleKeywords)
.nameLikeKeywords(nameLikeKeywords)
.subjectLikeKeywords(subjectLikeKeywords)
.urlKeywords(urlKeywords)
.build();
DocumentKeywordsBuilder wordsBuilder = new DocumentKeywordsBuilder();
createSimpleWords(wordsBuilder, keywordMetadata, dld);
createWordsFromSet(wordsBuilder, keywordMetadata, tfIdfCounts);
createWordsFromSet(wordsBuilder, keywordMetadata, titleKeywords);
createWordsFromSet(wordsBuilder, keywordMetadata, subjectLikeKeywords);
createWordsFromSet(wordsBuilder, keywordMetadata, nameLikeKeywords);
wordsBuilder.addAllSyntheticTerms(artifactKeywords.getWords());
return wordsBuilder;
}
private void createWordsFromSet(DocumentKeywordsBuilder wordsBuilder,
KeywordMetadata metadata,
WordReps words) {
for (var word : words.getReps()) {
String flatWord = AsciiFlattener.flattenUnicode(word.word);
if (WordPatterns.hasWordQualities(flatWord)) {
wordsBuilder.add(flatWord, metadata.getMetadataForWord(word.stemmed));
}
}
}
private void createSimpleWords(DocumentKeywordsBuilder wordsBuilder,
KeywordMetadata metadata,
DocumentLanguageData documentLanguageData)
{
for (var sent : documentLanguageData.sentences) {
if (wordsBuilder.size() > 1500)
break;
for (var word : sent) {
if (word.isStopWord()) {
continue;
}
String w = AsciiFlattener.flattenUnicode(word.wordLowerCase());
if (WordPatterns.singleWordQualitiesPredicate.test(w)) {
wordsBuilder.add(w, metadata.getMetadataForWord(word.stemmed()));
}
}
for (var names : keywordExtractor.getProperNames(sent)) {
var rep = new WordRep(sent, names);
String w = AsciiFlattener.flattenUnicode(rep.word);
wordsBuilder.add(w, metadata.getMetadataForWord(rep.stemmed));
}
}
}
}

View File

@ -1,4 +1,4 @@
package nu.marginalia.language.keywords;
package nu.marginalia.keyword_extraction;
import nu.marginalia.language.WordPatterns;
import nu.marginalia.language.model.DocumentSentence;
@ -60,8 +60,9 @@ public class KeywordExtractor {
List<WordSpan> spans = new ArrayList<>(2 * sentence.length());
for (int i = 0; i < sentence.length(); i++) {
if (isNoun(i, sentence))
spans.add(new WordSpan(i, i+1));
if (isNoun(i, sentence)) {
spans.add(new WordSpan(i, i + 1));
}
}
for (int i = 1; i < sentence.length(); i++) {

View File

@ -0,0 +1,64 @@
package nu.marginalia.keyword_extraction;
import lombok.Builder;
import nu.marginalia.keyword_extraction.extractors.*;
import nu.marginalia.model.idx.WordMetadata;
import nu.marginalia.model.idx.WordFlags;
import java.util.EnumSet;
class KeywordMetadata {
private final KeywordPositionBitmask bitmask;
private final TitleKeywords titleKeywords;
private final NameLikeKeywords nameLikeKeywords;
private final SubjectLikeKeywords subjectLikeKeywords;
private final UrlKeywords urlKeywords;
private final WordsTfIdfCounts tfIdfCounts;
@Builder
public KeywordMetadata(
KeywordPositionBitmask bitmask,
TitleKeywords titleKeywords,
NameLikeKeywords nameLikeKeywords,
SubjectLikeKeywords subjectLikeKeywords,
UrlKeywords urlKeywords,
WordsTfIdfCounts tfIdfCounts) {
this.bitmask = bitmask;
this.titleKeywords = titleKeywords;
this.nameLikeKeywords = nameLikeKeywords;
this.subjectLikeKeywords = subjectLikeKeywords;
this.urlKeywords = urlKeywords;
this.tfIdfCounts = tfIdfCounts;
}
public long getMetadataForWord(String stemmed) {
int tfidf = tfIdfCounts.getTfIdf(stemmed);
EnumSet<WordFlags> flags = EnumSet.noneOf(WordFlags.class);
if (tfidf > 100)
flags.add(WordFlags.TfIdfHigh);
if (subjectLikeKeywords.contains(stemmed))
flags.add(WordFlags.Subjects);
if (nameLikeKeywords.contains(stemmed))
flags.add(WordFlags.NamesWords);
if (titleKeywords.contains(stemmed))
flags.add(WordFlags.Title);
if (urlKeywords.containsUrl(stemmed))
flags.add(WordFlags.UrlPath);
if (urlKeywords.containsDomain(stemmed))
flags.add(WordFlags.UrlDomain);
int positions = bitmask.get(stemmed);
return new WordMetadata(tfidf, positions, flags).encode();
}
}
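For illustration, a sketch rather than code from this commit (the word, the stemmer instance and the metadata object are assumed to be in scope): a stemmed word with a TF-IDF score above 100 that also occurs in the title and in the URL path would come out flagged TfIdfHigh, Title and UrlPath.
long meta = keywordMetadata.getMetadataForWord(ps.stemWord("similarity"));
// flags would be {TfIdfHigh, Title, UrlPath}; encode() packs the tf-idf score,
// the position bitmask and the flag set into a single long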

View File

@ -0,0 +1,9 @@
package nu.marginalia.keyword_extraction;
import nu.marginalia.language.model.WordRep;
import java.util.Collection;
public interface WordReps {
Collection<WordRep> getReps();
}

View File

@ -1,19 +1,18 @@
package nu.marginalia.converting.processor.keywords.extractors;
package nu.marginalia.keyword_extraction.extractors;
import nu.marginalia.language.model.DocumentLanguageData;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.*;
import java.util.regex.Pattern;
public class ArtifactKeywords {
private static final Pattern mailLikePattern = Pattern.compile("[a-zA-Z0-9._\\-]+@[a-zA-Z0-9]+(\\.[a-zA-Z0-9]+)+");
public List<String> getArtifactKeywords(DocumentLanguageData documentLanguageData) {
Set<String> reps = new HashSet<>();
private final Set<String> words;
public ArtifactKeywords(DocumentLanguageData documentLanguageData) {
words = new HashSet<>();
for (var sent : documentLanguageData.sentences) {
for (var word : sent) {
@ -24,22 +23,25 @@ public class ArtifactKeywords {
continue;
}
reps.add(lc);
words.add(lc);
String domain = lc.substring(lc.indexOf('@'));
String user = lc.substring(0, lc.indexOf('@'));
if (!domain.equals("@hotmail.com") && !domain.equals("@gmail.com") && !domain.equals("@paypal.com")) {
reps.add(domain);
words.add(domain.substring(1));
words.add(domain);
}
if (!user.equals("info") && !user.equals("legal") && !user.equals("contact") && !user.equals("donotreply")) {
reps.add(user);
words.add(user);
}
}
}
return new ArrayList<>(reps);
}
public Collection<String> getWords() {
return words;
}
}

View File

@ -1,35 +1,27 @@
package nu.marginalia.converting.processor.keywords.extractors;
package nu.marginalia.keyword_extraction.extractors;
import com.google.inject.Inject;
import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap;
import nu.marginalia.language.keywords.KeywordExtractor;
import nu.marginalia.keyword_extraction.KeywordExtractor;
import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.language.model.KeywordMetadata;
/** Generates a position bitmask for each word in a document */
public class DocumentKeywordPositionBitmaskExtractor {
private final KeywordExtractor keywordExtractor;
public class KeywordPositionBitmask {
private final Object2IntOpenHashMap<String> positionMask = new Object2IntOpenHashMap<>(10_000, 0.7f);
@Inject
public DocumentKeywordPositionBitmaskExtractor(KeywordExtractor keywordExtractor) {
this.keywordExtractor = keywordExtractor;
}
public KeywordMetadata getWordPositions(DocumentLanguageData dld) {
final KeywordMetadata keywordMetadata = new KeywordMetadata();
Object2IntOpenHashMap<String> ret = keywordMetadata.positionMask;
public KeywordPositionBitmask(KeywordExtractor keywordExtractor, DocumentLanguageData dld) {
// Mark the title words as position 0
for (var sent : dld.titleSentences) {
int posBit = 1;
for (var word : sent) {
ret.merge(word.stemmed(), posBit, this::bitwiseOr);
positionMask.merge(word.stemmed(), posBit, this::bitwiseOr);
}
for (var span : keywordExtractor.getProperNames(sent)) {
ret.merge(sent.constructStemmedWordFromSpan(span), posBit, this::bitwiseOr);
positionMask.merge(sent.constructStemmedWordFromSpan(span), posBit, this::bitwiseOr);
}
}
@ -40,17 +32,19 @@ public class DocumentKeywordPositionBitmaskExtractor {
int posBit = (int)((1L << linePos.pos()) & 0xFFFF_FFFFL);
for (var word : sent) {
ret.merge(word.stemmed(), posBit, this::bitwiseOr);
positionMask.merge(word.stemmed(), posBit, this::bitwiseOr);
}
for (var span : keywordExtractor.getProperNames(sent)) {
ret.merge(sent.constructStemmedWordFromSpan(span), posBit, this::bitwiseOr);
positionMask.merge(sent.constructStemmedWordFromSpan(span), posBit, this::bitwiseOr);
}
linePos.next();
}
}
return keywordMetadata;
public int get(String stemmed) {
return positionMask.getOrDefault(stemmed, 0);
}
private int bitwiseOr(int a, int b) {
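To make the bitmask concrete: bit 0 marks words from the title, and each later bit marks a line position further into the document. A minimal sketch, with illustrative values that are not from this commit:
var bitmask = new KeywordPositionBitmask(new KeywordExtractor(), dld);
int positions = bitmask.get("marginalia"); // hypothetical stemmed word
// positions == 0b100001 would mean: present in the title (bit 0)
// and first seen around line position 5 (bit 5)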

View File

@ -0,0 +1,70 @@
package nu.marginalia.keyword_extraction.extractors;
import com.google.common.base.CharMatcher;
import it.unimi.dsi.fastutil.objects.Object2IntMap;
import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap;
import nu.marginalia.keyword_extraction.WordReps;
import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.language.model.DocumentSentence;
import nu.marginalia.language.model.WordRep;
import nu.marginalia.keyword_extraction.KeywordExtractor;
import java.util.*;
import java.util.stream.Collectors;
/** Keywords that look like they could be a name */
public class NameLikeKeywords implements WordReps {
private final List<WordRep> nameWords;
private final Set<String> stemmed;
public NameLikeKeywords(KeywordExtractor keywordExtractor, DocumentLanguageData dld, int minCount) {
Object2IntOpenHashMap<String> counts = new Object2IntOpenHashMap<>(1000);
HashMap<String, HashSet<WordRep>> instances = new HashMap<>(1000);
final var isUpperCase = CharMatcher.forPredicate(Character::isUpperCase);
for (int i = 0; i < dld.sentences.length; i++) {
DocumentSentence sent = dld.sentences[i];
var keywords = keywordExtractor.getProperNames(sent);
for (var span : keywords) {
if (span.size() <= 1 && isUpperCase.matchesAllOf(sent.words[span.start]))
continue;
var stemmed = sent.constructStemmedWordFromSpan(span);
counts.addTo(stemmed, -1);
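// counted negatively so the ascending sort below puts the most frequent names first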
instances.computeIfAbsent(stemmed, k -> new HashSet<>()).add(new WordRep(sent, span));
}
}
nameWords = counts.object2IntEntrySet().stream()
.filter(e -> hasEnough(e, minCount))
.sorted(Comparator.comparingInt(Object2IntMap.Entry::getIntValue))
.limit(150)
.map(Map.Entry::getKey)
.flatMap(w -> instances.get(w).stream())
.collect(Collectors.toList());
stemmed = nameWords.stream().map(WordRep::getStemmed).collect(Collectors.toSet());
}
public boolean hasEnough(Object2IntMap.Entry<String> entry, int minCount) {
final int count = -entry.getIntValue();
if (entry.getKey().contains("_")) {
return count >= minCount;
}
else {
return count >= minCount + 1;
}
}
public boolean contains(String wordStemmed) {
return stemmed.contains(wordStemmed);
}
@Override
public Collection<WordRep> getReps() {
return nameWords;
}
}

View File

@ -1,48 +1,49 @@
package nu.marginalia.converting.processor.keywords.extractors;
package nu.marginalia.keyword_extraction.extractors;
import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap;
import nu.marginalia.keyword_extraction.WordReps;
import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.language.model.KeywordMetadata;
import nu.marginalia.language.model.WordRep;
import nu.marginalia.language.model.WordSpan;
import nu.marginalia.language.model.WordSeparator;
import nu.marginalia.language.keywords.KeywordExtractor;
import nu.marginalia.keyword_extraction.KeywordExtractor;
import org.apache.commons.lang3.StringUtils;
import java.util.*;
import java.util.stream.Collectors;
public class SubjectCounter {
private final KeywordExtractor keywordExtractor;
public SubjectCounter(KeywordExtractor keywordExtractor) {
this.keywordExtractor = keywordExtractor;
}
public class SubjectLikeKeywords implements WordReps {
private final List<WordRep> wordList;
private final Set<String> stemmed;
// Seeks out subjects in a sentence by constructs like
//
// [Name] (Verbs) (the|a|Adverb|Verb) ...
// [Name] (Verbs) (the|a|Adverb|Verb|Noun) ...
// e.g.
//
// Greeks bearing gifts -> Greeks
// Steve McQueen drove fast | cars -> Steve McQueen
public List<WordRep> count(KeywordMetadata keywordMetadata, DocumentLanguageData dld) {
public SubjectLikeKeywords(KeywordExtractor keywordExtractor,
WordsTfIdfCounts tfIdfCounts,
DocumentLanguageData dld) {
Map<String, Set<WordRep>> instances = new HashMap<>();
for (var sentence : dld.sentences) {
for (WordSpan kw : keywordExtractor.getNouns(sentence)) {
if (kw.end + 2 >= sentence.length()) {
continue;
}
if (sentence.separators[kw.end] == WordSeparator.COMMA
|| sentence.separators[kw.end + 1] == WordSeparator.COMMA)
break;
continue;
String nextTag = sentence.posTags[kw.end];
String nextNextTag = sentence.posTags[kw.end+1];
if (isVerb(nextTag) && isDetOrAdverbOrVerb(nextNextTag)) {
if (isVerb(nextTag) && isDetOrAdverbOrVerbOrNoun(nextNextTag)) {
var span = new WordSpan(kw.start, kw.end);
var rep = new WordRep(sentence, span);
@ -53,18 +54,30 @@ public class SubjectCounter {
}
}
Map<String, Integer> scores = new HashMap<>(instances.size());
Object2IntOpenHashMap<String> scores = new Object2IntOpenHashMap<>(instances.size());
for (String stemmed : instances.keySet()) {
scores.put(stemmed, getTermTfIdf(keywordMetadata, stemmed));
scores.put(stemmed, getTermTfIdf(tfIdfCounts, stemmed));
}
return scores.entrySet().stream()
.filter(e -> e.getValue() >= 150)
wordList = scores.object2IntEntrySet().stream()
.filter(e -> e.getIntValue() >= 100)
.flatMap(e -> instances.getOrDefault(e.getKey(), Collections.emptySet()).stream())
.collect(Collectors.toList());
stemmed = wordList.stream().map(WordRep::getStemmed).collect(Collectors.toSet());
}
private int getTermTfIdf(KeywordMetadata keywordMetadata, String stemmed) {
public boolean contains(String wordStemmed) {
return stemmed.contains(wordStemmed);
}
@Override
public Collection<WordRep> getReps() {
return wordList;
}
private int getTermTfIdf(WordsTfIdfCounts tfIdfCounts, String stemmed) {
if (stemmed.contains("_")) {
int sum = 0;
String[] parts = StringUtils.split(stemmed, '_');
@ -74,25 +87,26 @@ public class SubjectCounter {
}
for (String part : parts) {
sum += getTermTfIdf(keywordMetadata, part);
sum += getTermTfIdf(tfIdfCounts, part);
}
return sum / parts.length;
}
return keywordMetadata.wordsTfIdf.getOrDefault(stemmed, 0);
return tfIdfCounts.getTfIdf(stemmed);
}
private boolean isDetOrAdverbOrVerb(String posTag) {
private boolean isDetOrAdverbOrVerbOrNoun(String posTag) {
return "DT".equals(posTag) // determinant
|| "RB".equals(posTag) // adverb
|| posTag.startsWith("RB") // adverb
|| posTag.startsWith("VB") // verb
|| posTag.startsWith("JJ"); // adjective
|| posTag.startsWith("JJ") // adjective
|| posTag.startsWith("P")
|| posTag.startsWith("NN");
}
boolean isVerb(String posTag) {
return posTag.startsWith("VB")
&& !posTag.equals("VB"); // not interested in the infinitive
}
}
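A worked reading of the comment's own examples (Penn Treebank tags): in "Steve McQueen drove fast cars", the proper-noun span "Steve McQueen" (NNP NNP) is followed by "drove" (VBD, a verb but not the bare infinitive VB, so isVerb holds) and "fast" (RB/JJ), so the span qualifies as a subject. In "Greeks bearing gifts", the tags after "Greeks" are VBG and NNS; the widened isDetOrAdverbOrVerbOrNoun check accepts that trailing noun, so this example now matches as well.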

View File

@ -0,0 +1,35 @@
package nu.marginalia.keyword_extraction.extractors;
import nu.marginalia.keyword_extraction.WordReps;
import nu.marginalia.keyword_extraction.KeywordExtractor;
import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.language.model.WordRep;
import java.util.Arrays;
import java.util.Collection;
import java.util.Set;
import java.util.stream.Collectors;
/** Extract keywords from the title */
public class TitleKeywords implements WordReps {
private final Set<WordRep> titleKeywords;
private final Set<String> stemmed;
public TitleKeywords(KeywordExtractor keywordExtractor, DocumentLanguageData documentLanguageData) {
titleKeywords = Arrays.stream(documentLanguageData.titleSentences).flatMap(sent ->
keywordExtractor.getWordsFromSentence(sent).stream().sorted().distinct().map(w -> new WordRep(sent, w)))
.limit(100)
.collect(Collectors.toSet());
stemmed = titleKeywords.stream().map(WordRep::getStemmed).collect(Collectors.toSet());
}
public boolean contains(String wordStemmed) {
return stemmed.contains(wordStemmed);
}
@Override
public Collection<WordRep> getReps() {
return titleKeywords;
}
}

View File

@ -0,0 +1,40 @@
package nu.marginalia.keyword_extraction.extractors;
import ca.rmen.porterstemmer.PorterStemmer;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
import java.util.Arrays;
import java.util.Set;
import java.util.stream.Collectors;
/** Extract keywords from the URL */
public class UrlKeywords {
private static final PorterStemmer ps = new PorterStemmer();
private final Set<String> urlKeywords;
private final Set<String> domainKeywords;
public UrlKeywords(EdgeUrl url) {
String path = url.path;
urlKeywords = Arrays.stream(path.split("[^a-z0-9A-Z]+"))
.map(ps::stemWord)
.collect(Collectors.toSet());
domainKeywords = Arrays.stream(url.domain.toString().split("[^a-z0-9A-Z]+"))
.filter(s -> s.length() > 3)
.map(ps::stemWord)
.collect(Collectors.toSet());
}
public boolean containsUrl(String stemmed) {
return urlKeywords.contains(stemmed);
}
public boolean containsDomain(String stemmed) {
return domainKeywords.contains(stemmed);
}
}

View File

@ -0,0 +1,143 @@
package nu.marginalia.keyword_extraction.extractors;
import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap;
import nu.marginalia.keyword_extraction.WordReps;
import nu.marginalia.language.WordPatterns;
import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.language.model.DocumentSentence;
import nu.marginalia.language.model.WordRep;
import nu.marginalia.keyword_extraction.KeywordExtractor;
import nu.marginalia.language.model.WordSpan;
import nu.marginalia.language.statistics.TermFrequencyDict;
import org.apache.commons.lang3.StringUtils;
import java.util.*;
import static java.lang.Math.max;
/** Extract counts and TF-IDF for the words in the document,
* keep track of high-scoring words for flagging
*/
public class WordsTfIdfCounts implements WordReps {
private final TermFrequencyDict dict;
private final double docCount;
private final Object2IntOpenHashMap<String> tfIdf;
private final Set<WordRep> tfIdfHigh;
public WordsTfIdfCounts(TermFrequencyDict dict,
KeywordExtractor keywordExtractor,
DocumentLanguageData dld) {
this.dict = dict;
this.docCount = dict.docCount();
this.tfIdf = new Object2IntOpenHashMap<>(10_000);
var counts = getCounts(keywordExtractor, dld);
int maxVal = maxValue(counts);
Set<String> highTfIdfInstances = new HashSet<>();
counts.forEach((key, cnt) -> {
int value = getTermValue(key, cnt, maxVal);
tfIdf.put(key, value);
if (cnt > 1 && value > 100) {
highTfIdfInstances.add(key);
}
});
// Collect words with a high TF-IDF so that they can be marked with a bit flag
tfIdfHigh = new HashSet<>(100);
for (var sent : dld.sentences) {
var keywords = keywordExtractor.getKeywordsFromSentence(sent);
for (var span : keywords) {
if (highTfIdfInstances.contains(spanToStemmed(sent, span))) {
tfIdfHigh.add(new WordRep(sent, span));
}
}
}
}
private Object2IntOpenHashMap<String> getCounts(KeywordExtractor keywordExtractor, DocumentLanguageData dld) {
Object2IntOpenHashMap<String> counts = new Object2IntOpenHashMap<>(10_000, 0.7f);
counts.defaultReturnValue(0);
for (var sent : dld.sentences) {
var keywords = keywordExtractor.getKeywordsFromSentence(sent);
for (var span : keywords) {
if (span.size() == 1
&& WordPatterns.isStopWord(sent.words[span.start]))
{
continue;
}
counts.addTo(spanToStemmed(sent, span), 1);
}
}
return counts;
}
private String spanToStemmed(DocumentSentence sentence, WordSpan span) {
if (span.size() == 1)
return sentence.stemmedWords[span.start];
StringBuilder builder = new StringBuilder();
for (int i = span.start; i < span.end; i++) {
if (!builder.isEmpty())
builder.append('_');
builder.append(sentence.stemmedWords[i]);
}
return builder.toString();
}
public int getTfIdf(String stemmed) {
return tfIdf.getOrDefault(stemmed, 0);
}
@Override
public Collection<WordRep> getReps() {
return tfIdfHigh;
}
private int maxValue(Object2IntOpenHashMap<?> map) {
int maxC = 0;
for (int c : map.values()) {
maxC = max(c, maxC);
}
return maxC;
}
public int getTermValue(String key, int count, double maxValue) {
if (key.indexOf('_') >= 0) {
String[] parts = StringUtils.split(key, '_');
double totalValue = 0.;
for (String part : parts) {
totalValue += value(part, count, maxValue);
}
return normalizeValue(totalValue / parts.length);
}
else {
return normalizeValue(value(key, count, maxValue));
}
}
int normalizeValue(double v) {
return (int)(-v*75);
}
double value(String key, double value, double maxValue) {
double freq = dict.getTermFreqStemmed(key);
if (freq < 1) {
freq = 1;
}
return (0.1 + 0.9*value/maxValue) * Math.log(freq/docCount);
}
}
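A worked example of the scoring above, with hypothetical numbers:
// Hypothetical inputs: docCount = 1_000_000, dictionary frequency freq = 100,
// count in this document = 10, maxValue (the most frequent term's count) = 20
double v = (0.1 + 0.9 * 10.0 / 20.0) * Math.log(100.0 / 1_000_000.0);
// v = 0.55 * ln(1e-4) ≈ 0.55 * -9.21 ≈ -5.07
// normalizeValue then yields (int)(-v * 75) = 379; since 379 > 100 and the
// count is above 1, the term's representations are collected as TF-IDF-high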

View File

@ -1,4 +1,4 @@
package nu.marginalia.converting.model;
package nu.marginalia.keyword_extraction.model;
import nu.marginalia.model.idx.WordMetadata;

View File

@ -1,4 +1,4 @@
package nu.marginalia.converting.model;
package nu.marginalia.keyword_extraction.model;
import it.unimi.dsi.fastutil.objects.Object2LongLinkedOpenHashMap;
import lombok.Getter;
@ -63,10 +63,9 @@ public class DocumentKeywordsBuilder {
public void addAllSyntheticTerms(Collection<String> newWords) {
long meta = WordFlags.Synthetic.asBit();
newWords.forEach(word -> {
words.putIfAbsent(word, meta);
});
// Only add the synthetic flag if the words aren't already present
newWords.forEach(word -> words.putIfAbsent(word, meta));
}
public List<String> getWordsWithAnyFlag(long flags) {

View File

@ -1,4 +1,4 @@
package nu.marginalia.converting.processor.keywords;
package nu.marginalia.keyword_extraction;
import lombok.SneakyThrows;
import nu.marginalia.LanguageModels;
@ -7,7 +7,6 @@ import nu.marginalia.language.model.WordRep;
import nu.marginalia.language.model.WordSpan;
import nu.marginalia.language.sentence.SentenceExtractor;
import nu.marginalia.language.statistics.TermFrequencyDict;
import nu.marginalia.language.keywords.KeywordExtractor;
import nu.marginalia.language.model.WordSeparator;
import nu.marginalia.WmsaHome;
import nu.marginalia.model.EdgeUrl;

View File

@ -0,0 +1,22 @@
package nu.marginalia.keyword_extraction.extractors;
import nu.marginalia.language.sentence.SentenceExtractor;
import nu.marginalia.test.util.TestLanguageModels;
import org.junit.jupiter.api.Test;
import static org.junit.jupiter.api.Assertions.*;
class ArtifactKeywordsTest {
@Test
public void testExtractArtifacts() {
SentenceExtractor se = new SentenceExtractor(TestLanguageModels.getLanguageModels());
var artifacts = new ArtifactKeywords(se.extractSentences("Hello I'm <vlofgren@marginalia.nu>, what's up?", "hello!"));
System.out.println(artifacts.getWords());
assertTrue(artifacts.getWords().contains("vlofgren"));
assertTrue(artifacts.getWords().contains("marginalia.nu"));
assertTrue(artifacts.getWords().contains("@marginalia.nu"));
assertTrue(artifacts.getWords().contains("vlofgren@marginalia.nu"));
}
}

View File

@ -0,0 +1,53 @@
package nu.marginalia.keyword_extraction.extractors;
import com.google.common.collect.Sets;
import nu.marginalia.keyword_extraction.KeywordExtractor;
import nu.marginalia.language.sentence.SentenceExtractor;
import nu.marginalia.test.util.TestLanguageModels;
import org.junit.jupiter.api.Test;
import java.util.Collections;
import java.util.Set;
import java.util.stream.Collectors;
import static org.junit.jupiter.api.Assertions.*;
class NameLikeKeywordsTest {
String text = """
In 60 BC, Caesar, Crassus, and Pompey formed the First Triumvirate, an informal political alliance that
dominated Roman politics for several years. Their attempts to amass power as Populares were opposed by
the Optimates within the Roman Senate, among them Cato the Younger with the frequent support of Cicero.
Caesar rose to become one of the most powerful politicians in the Roman Republic through a string of
military victories in the Gallic Wars, completed by 51 BC, which greatly extended Roman territory.
During this time he both invaded Britain and built a bridge across the Rhine river. These achievements
and the support of his veteran army threatened to eclipse the standing of Pompey, who had realigned himself
with the Senate after the death of Crassus in 53 BC. With the Gallic Wars concluded, the Senate ordered
Caesar to step down from his military command and return to Rome. In 49 BC, Caesar openly defied the
Senate's authority by crossing the Rubicon and marching towards Rome at the head of an army. This
began Caesar's civil war, which he won, leaving him in a position of near unchallenged power and
influence in 45 BC.
After assuming control of government, Caesar began a program of social and governmental reforms,
including the creation of the Julian calendar. He gave citizenship to many residents of far regions
of the Roman Republic. He initiated land reform and support for veterans. He centralized the
bureaucracy of the Republic and was eventually proclaimed "dictator for life" (dictator perpetuo).
His populist and authoritarian reforms angered the elites, who began to conspire against him. On the
Ides of March (15 March) 44 BC, Caesar was assassinated by a group of rebellious senators led by Brutus
and Cassius, who stabbed him to death. A new series of civil wars broke out and the constitutional
government of the Republic was never fully restored. Caesar's great-nephew and adopted heir Octavian,
later known as Augustus, rose to sole power after defeating his opponents in the last civil war of
the Roman Republic. Octavian set about solidifying his power, and the era of the Roman Empire began.
""";
@Test
public void test() {
SentenceExtractor se = new SentenceExtractor(TestLanguageModels.getLanguageModels());
NameLikeKeywords keywords = new NameLikeKeywords(new KeywordExtractor(), se.extractSentences(text, "Julius Caesar"), 2);
Set<String> actual = keywords.getReps().stream().map(rep -> rep.word).collect(Collectors.toSet());
Set<String> expected = Set.of("caesar", "senate", "roman", "republic", "roman_republic");
// rome isn't counted because PorterStemmer is derp
assertEquals(Collections.emptySet(), Sets.symmetricDifference(actual, expected));
}
}

View File

@ -0,0 +1,62 @@
package nu.marginalia.keyword_extraction.extractors;
import com.google.common.collect.Sets;
import nu.marginalia.keyword_extraction.KeywordExtractor;
import nu.marginalia.language.sentence.SentenceExtractor;
import nu.marginalia.language.statistics.TermFrequencyDict;
import nu.marginalia.test.util.TestLanguageModels;
import org.junit.jupiter.api.Test;
import java.util.Collections;
import java.util.Set;
import java.util.stream.Collectors;
import static org.junit.jupiter.api.Assertions.assertEquals;
class SubjectLikeKeywordsTest {
String text = """
In 60 BC, Caesar, Crassus, and Pompey formed the First Triumvirate, an informal political alliance that
dominated Roman politics for several years. Their attempts to amass power as Populares were opposed by
the Optimates within the Roman Senate, among them Cato the Younger with the frequent support of Cicero.
Caesar rose to become one of the most powerful politicians in the Roman Republic through a string of
military victories in the Gallic Wars, completed by 51 BC, which greatly extended Roman territory.
During this time he both invaded Britain and built a bridge across the Rhine river. These achievements
and the support of his veteran army threatened to eclipse the standing of Pompey, who had realigned himself
with the Senate after the death of Crassus in 53 BC. With the Gallic Wars concluded, the Senate ordered
Caesar to step down from his military command and return to Rome. In 49 BC, Caesar openly defied the
Senate's authority by crossing the Rubicon and marching towards Rome at the head of an army. This
began Caesar's civil war, which he won, leaving him in a position of near unchallenged power and
influence in 45 BC.
After assuming control of government, Caesar began a program of social and governmental reforms,
including the creation of the Julian calendar. He gave citizenship to many residents of far regions
of the Roman Republic. He initiated land reform and support for veterans. He centralized the
bureaucracy of the Republic and was eventually proclaimed "dictator for life" (dictator perpetuo).
His populist and authoritarian reforms angered the elites, who began to conspire against him. On the
Ides of March (15 March) 44 BC, Caesar was assassinated by a group of rebellious senators led by Brutus
and Cassius, who stabbed him to death. A new series of civil wars broke out and the constitutional
government of the Republic was never fully restored. Caesar's great-nephew and adopted heir Octavian,
later known as Augustus, rose to sole power after defeating his opponents in the last civil war of
the Roman Republic. Octavian set about solidifying his power, and the era of the Roman Empire began.
""";
@Test
public void test() {
var lm = TestLanguageModels.getLanguageModels();
var dict = new TermFrequencyDict(lm);
SentenceExtractor se = new SentenceExtractor(lm);
var dld = se.extractSentences(text, "Julius Caesar");
WordsTfIdfCounts tfIdfCounts = new WordsTfIdfCounts(dict, new KeywordExtractor(), dld);
SubjectLikeKeywords keywords = new SubjectLikeKeywords(new KeywordExtractor(),
tfIdfCounts,
dld);
Set<String> actual = keywords.getReps().stream().map(rep -> rep.word).collect(Collectors.toSet());
Set<String> expected = Set.of("caesar", "republic", "authoritarian_reforms", "senate", "pompey", "reforms", "government_of_the_republic");
assertEquals(Collections.emptySet(), Sets.symmetricDifference(actual, expected));
}
}

View File

@ -0,0 +1,208 @@
package nu.marginalia.keyword_extraction.extractors;
import com.google.common.collect.Sets;
import nu.marginalia.keyword_extraction.KeywordExtractor;
import nu.marginalia.language.sentence.SentenceExtractor;
import nu.marginalia.test.util.TestLanguageModels;
import org.jsoup.Jsoup;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;
import java.util.Collections;
import java.util.Set;
import java.util.stream.Collectors;
class TitleKeywordsTest {
String document = """
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<title>MEMEX - Creepy Website Similarity [ 2022-12-26 ]</title>
<link rel="stylesheet" href="/style-new.css" />
<meta name="viewport" content="width=device-width, initial-scale=1.0">
\s
</head>
<body class="double" lang="en">
<header>
<nav>
<a href="http://www.marginalia.nu/">Marginalia</a>
<a href="http://search.marginalia.nu/">Search Engine</a>
<a href="http://encyclopedia.marginalia.nu/">Encyclopedia</a>
</nav>
</header>
<nav class="topbar">
<h1>Memex</h1>
<a href="/" class="path root"><img src="/ico/root.png" title="root"> marginalia</a>
<a href="/log" class="path dir"><img src="/ico/dir.png" title="dir"> log</a>
<a href="/log/69-creepy-website-similarity.gmi" class="path file"><img src="/ico/file.png" title="file"> 69-creepy-website-similarity.gmi</a>
</nav>
<article>
<section id="memex-node">
<h1 id="1">Creepy Website Similarity [ 2022-12-26 ]</h1>
<br>
This is a write-up about an experiment from a few months ago, in how to find websites that are similar to each other. Website similarity is useful for many things, including discovering new websites to crawl, as well as suggesting similar websites in the Marginalia Search random exploration mode.<br>
<br>
<dl class="link"><dt><a class="external" href="https://explore2.marginalia.nu/">https://explore2.marginalia.nu/</a></dt><dd>A link to a slapdash interface for exploring the experimental data.</dd></dl>
<br>
The approach chosen was to use the link graph to look for websites that are linked to from the same websites. This turned out to work remarkably well. <br>
<br>
There are some alternative feature spaces that might have been used, such as TF-IDF data. Using incident links turned out to be surprisingly powerful, almost to an uncanny degree as it's able to find similarities even among websites that Marginalia doesn't index.<br>
<br>
As a whole the feature shares a lot of similarity with how you would construct a recommendation algorithm of the type "other shoppers also bought", and in doing so also exposes how creepy they can be. You can't build a recommendation engine without building a tool for profiling. It's largely the same thing.<br>
<br>
If you for example point the website explorer to the fringes of politics, it will map that web-space with terrifying accuracy.<br>
<br>
<dl class="link"><dt><a class="external" href="https://explore2.marginalia.nu/search?domain=qanon.pub">https://explore2.marginalia.nu/search?domain=qanon.pub</a></dt><dd>qanon.pub's neighbors</dd></dl>
<br>
Note again how few of those websites are actually indexed by Marginalia. Only those websites with 'MS' links are! The rest are inferred from the data. On the one hand it's fascinating and cool, on the other it's deeply troubling: If I can create such a map on PC in my living room, imagine what might be accomplished with a datacenter.<br>
<br>
You might think "Well what's the problem? QAnon deserves all the scrutiny, give them nowhere to hide!". Except this sort of tool could conceivably work just as well for mapping democracy advocates in Hong Kong, Putin-critics in Russia, gay people in Uganda, and so forth.<br>
<br>
<h2 id="1.1">Implementation details</h2>
<br>
In practice, cosine similarity is used to compare the similarity between websites. This is a statistical method perhaps most commonly used in machine learning, but it has other uses as well. <br>
<br>
Cosine similarity is calculated by taking the inner product of two vectors and dividing by their norms<br>
<br>
<pre>
a x b
p = ---------\s
|a| |b|</pre>
<br>
As you might remember from linear algebra, this is a measure of how much two vectors "pull in the same direction". The cosine similarity of two identical vectors is unity, and for orthogonal vectors it is zero.<br>
<br>
This data has extremely high dimensionality, the vector space consists of nearly 10 million domains, so most "standard" tools like numpy/scipy will not load the data without serious massaging. That juice doesn't appear to be worth the squeeze when it's just as easy to roll what you need on your own (which you'd probably need to do regardless to get it into those tools, Random Reprojection or some such). <br>
<br>
Since the vectors in questions are just bitmaps, either a website has a link or it does not, the vector product can be simplified to a logical AND operation. The first stab at the problem was to use RoaringBitmaps.<br>
<br>
<pre>
double cosineSimilarity(RoaringBitmap a, RoaringBitmap b) {
double andCardinality = RoaringBitmap.andCardinality(a, b);
andCardinality /= Math.sqrt(a.getCardinality());
andCardinality /= Math.sqrt(b.getCardinality());
return andCardinality;
}
</pre>
<br>
This works but it's just a bit too slow to be practical. Sacrificing some memory for speed turns out to be necessary. Roaring Bitmaps are memory efficient, but a general purpose library. It's easy to create a drop-in replacement that implements only andCardinality() and getCardinality() in a way that caters to the specifics of the data. <br>
<br>
A simple 64 bit bloom filter makes it possible to short-circuit a lot of the calculations since many vectors are small and trivially don't overlap. The vector data is stored in sorted lists. Comparing sorted lists is very cache friendly and fast, while using relatively little memory. Storing a dense matrix would require RAM on the order of hundreds of terabytes so that's no good.<br>
<br>
The actual code rewritten for brevity, as a sketch the and-cardinality calculation looks like this, and performs about 5-20x faster than RoaringBitmaps for this specific use case:<br>
<br>
<pre>
int andCardinality(AndCardIntSet a, AndCardIntSet b) {
if ((a.hash & b.hash) == 0) {
return 0;
}
int i = 0, j = 0;
int card = 0;
do {
int diff = a.backingList.getQuick(i) - b.backingList.getQuick(j);
if (diff &lt; 0) i++;
else if (diff &gt; 0) j++;
else {
i++;
j++;
card++;
}
} while (i &lt; a.getCardinality() && j &lt; b.getCardinality());
return card;
\s
}
</pre>
<br>
This calculates similarities between websites at a rate where it's feasible to pre-calculate the similarities between all known websites within a couple of days. It's on the cusp of being viable to offer ad-hoc calculations, but not quite without being a denial of service-hazard. <br>
<br>
To do this in real time, the search space could be reduced using some form of locality-sensitive hash scheme, although for a proof of concept this performs well enough on its own. <br>
<br>
<h2 id="1.2">Closing thoughts</h2>
<br>
This has been online for a while and I've been debating whether to do this write-up. To be honest this is probably the creepiest piece of software I've built. <br>
<br>
At the same time, I can't imagine I'm the first to conceive of doing this. To repeat, you almost can't build a suggestions engine without this type of side-effect, and recommendations are *everywhere* these days. They are on Spotify, Youtube, Facebook, Reddit, Twitter, Amazon, Netflix, Google, even small web shops have them. <br>
<br>
In that light, it's better to make the discovery public and highlight its potential so that it might serve as an example of how and why these recommendation algorithms are profoundly creepy. <br>
<br>
<h2 id="1.3">Topic</h2>
<br>
<a class="internal" href="/topic/astrolabe.gmi">/topic/astrolabe.gmi</a><br>
<a class="internal" href="/topic/programming.gmi">/topic/programming.gmi</a><br>
</section>
<div id="sidebar">
<section class="tools">
<h1>69-creepy-website-similarity.gmi</h1>
<a class="download" href="/api/raw?url=/log/69-creepy-website-similarity.gmi">Raw</a><br>
<a rel="nofollow" href="/api/update?url=/log/69-creepy-website-similarity.gmi" class="verb">Edit</a>
<a rel="nofollow" href="/api/rename?type=gmi&url=/log/69-creepy-website-similarity.gmi" class="verb">Rename</a>
\s
<br/>
<div class="toc">
\s
<a href="#1" class="heading-1">1 Creepy Website Similarity [ 2022-12-26 ]</a>
\s
<a href="#1.1" class="heading-2">1.1 Implementation details</a>
\s
<a href="#1.2" class="heading-2">1.2 Closing thoughts</a>
\s
<a href="#1.3" class="heading-2">1.3 Topic</a>
\s
</div>
</section>
<section id="memex-backlinks">
<h1 id="backlinks"> Backlinks </h1>
<dl>
<dt><a href="/log/73-new-approach-to-ranking.gmi">/log/73-new-approach-to-ranking.gmi</a></dt>
<dd>A new approach to domain ranking [ 2023-02-06 ] - See Also</dd>
</dl>
</section>
</div>
</article>
<footer>
Reach me at <a class="fancy-teknisk" href="mailto:kontakt@marginalia.nu">kontakt@marginalia.nu</a>.
<br />
</footer>
</body>
""";
@Test
public void extractTitleWords() {
var se = new SentenceExtractor(TestLanguageModels.getLanguageModels());
var reps = new TitleKeywords(new KeywordExtractor(), se.extractSentences(Jsoup.parse(document))).getReps();
var words = reps.stream().map(rep -> rep.word).collect(Collectors.toSet());
Set<String> expected = Set.of(
"creepy",
"website",
"similarity",
"creepy_website",
"website_similarity",
"creepy_website_similarity",
"memex", "2022-12-26");
Assertions.assertEquals(Collections.emptySet(),
Sets.symmetricDifference(words, expected));
}
}

View File

@ -0,0 +1,39 @@
package nu.marginalia.keyword_extraction.extractors;
import ca.rmen.porterstemmer.PorterStemmer;
import nu.marginalia.model.EdgeUrl;
import org.junit.jupiter.api.Test;
import java.net.URISyntaxException;
import static org.junit.jupiter.api.Assertions.*;
class UrlKeywordsTest {
private final PorterStemmer ps = new PorterStemmer();
@Test
void containsDomain() throws URISyntaxException {
var keywords = new UrlKeywords(new EdgeUrl("https://memex.marginalia.nu/log/69-creepy-website-similarity.gmi"));
assertTrue(keywords.containsDomain(ps.stemWord("memex")));
assertTrue(keywords.containsDomain(ps.stemWord("marginalia")));
}
@Test
void containsDomainNoWWWNoCom() throws URISyntaxException {
var keywords = new UrlKeywords(new EdgeUrl("https://www.example.com/log/69-creepy-website-similarity.gmi"));
assertTrue(keywords.containsDomain(ps.stemWord("example")));
assertFalse(keywords.containsDomain(ps.stemWord("www")));
assertFalse(keywords.containsDomain(ps.stemWord("com")));
}
@Test
void pathFragments() throws URISyntaxException {
var keywords = new UrlKeywords(new EdgeUrl("https://memex.marginalia.nu/log/69-creepy-website-similarity.gmi"));
assertTrue(keywords.containsUrl(ps.stemWord("creepy")));
assertTrue(keywords.containsUrl(ps.stemWord("website")));
assertTrue(keywords.containsUrl(ps.stemWord("similarity")));
assertTrue(keywords.containsUrl(ps.stemWord("69")));
assertTrue(keywords.containsUrl(ps.stemWord("log")));
assertFalse(keywords.containsUrl(ps.stemWord("memex")));
}
}

View File

@ -15,7 +15,7 @@ java {
dependencies {
implementation project(':code:common:model')
implementation project(':code:crawl-models:common')
implementation project(':code:process-models:converting-model')
implementation libs.lombok
annotationProcessor libs.lombok
@ -23,7 +23,7 @@ dependencies {
implementation libs.bundles.slf4j
implementation libs.guice
implementation libs.notnull
implementation libs.gson
implementation libs.bundles.gson
implementation libs.jsoup
testImplementation libs.bundles.slf4j.test

View File

@ -1,8 +1,8 @@
package nu.marginalia.pubdate;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.crawling.common.model.HtmlStandard;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.converting.model.HtmlStandard;
import org.jsoup.nodes.Document;
import java.util.Optional;

View File

@ -1,6 +1,6 @@
package nu.marginalia.pubdate;
import nu.marginalia.crawling.common.model.HtmlStandard;
import nu.marginalia.converting.model.HtmlStandard;
import nu.marginalia.model.crawl.PubDate;
import java.time.DateTimeException;

View File

@ -1,6 +1,6 @@
package nu.marginalia.pubdate;
import nu.marginalia.crawling.common.model.HtmlStandard;
import nu.marginalia.converting.model.HtmlStandard;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.pubdate.heuristic.*;

View File

@ -1,6 +1,6 @@
package nu.marginalia.pubdate.heuristic;
import nu.marginalia.crawling.common.model.HtmlStandard;
import nu.marginalia.converting.model.HtmlStandard;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.pubdate.PubDateHeuristic;
import nu.marginalia.pubdate.PubDateParser;

View File

@ -1,6 +1,6 @@
package nu.marginalia.pubdate.heuristic;
import nu.marginalia.crawling.common.model.HtmlStandard;
import nu.marginalia.converting.model.HtmlStandard;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.pubdate.PubDateEffortLevel;
import nu.marginalia.pubdate.PubDateHeuristic;

View File

@ -1,6 +1,6 @@
package nu.marginalia.pubdate.heuristic;
import nu.marginalia.crawling.common.model.HtmlStandard;
import nu.marginalia.converting.model.HtmlStandard;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.pubdate.PubDateEffortLevel;
import nu.marginalia.pubdate.PubDateHeuristic;

View File

@ -1,6 +1,6 @@
package nu.marginalia.pubdate.heuristic;
import nu.marginalia.crawling.common.model.HtmlStandard;
import nu.marginalia.converting.model.HtmlStandard;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.pubdate.PubDateHeuristic;
import nu.marginalia.pubdate.PubDateParser;

View File

@ -1,6 +1,6 @@
package nu.marginalia.pubdate.heuristic;
import nu.marginalia.crawling.common.model.HtmlStandard;
import nu.marginalia.converting.model.HtmlStandard;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.pubdate.PubDateHeuristic;
import nu.marginalia.pubdate.PubDateParser;

View File

@ -1,6 +1,6 @@
package nu.marginalia.pubdate.heuristic;
import nu.marginalia.crawling.common.model.HtmlStandard;
import nu.marginalia.converting.model.HtmlStandard;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.pubdate.PubDateHeuristic;
import nu.marginalia.pubdate.PubDateParser;

View File

@ -3,7 +3,7 @@ package nu.marginalia.pubdate.heuristic;
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import com.google.gson.JsonSyntaxException;
import nu.marginalia.crawling.common.model.HtmlStandard;
import nu.marginalia.converting.model.HtmlStandard;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.pubdate.PubDateHeuristic;
import nu.marginalia.pubdate.PubDateParser;

View File

@ -1,6 +1,6 @@
package nu.marginalia.pubdate.heuristic;
import nu.marginalia.crawling.common.model.HtmlStandard;
import nu.marginalia.converting.model.HtmlStandard;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.pubdate.PubDateHeuristic;
import nu.marginalia.pubdate.PubDateParser;

View File

@ -1,6 +1,6 @@
package nu.marginalia.pubdate.heuristic;
import nu.marginalia.crawling.common.model.HtmlStandard;
import nu.marginalia.converting.model.HtmlStandard;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.pubdate.PubDateHeuristic;

View File

@ -1,6 +1,6 @@
package nu.marginalia.pubdate.heuristic;
import nu.marginalia.crawling.common.model.HtmlStandard;
import nu.marginalia.converting.model.HtmlStandard;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.pubdate.PubDateEffortLevel;
import nu.marginalia.pubdate.PubDateHeuristic;

View File

@ -1,6 +1,6 @@
package nu.marginalia.pubdate.heuristic;
import nu.marginalia.crawling.common.model.HtmlStandard;
import nu.marginalia.converting.model.HtmlStandard;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.pubdate.PubDateEffortLevel;
import nu.marginalia.pubdate.PubDateHeuristic;

View File

@ -1,6 +1,6 @@
package nu.marginalia.pubdate.heuristic;
import nu.marginalia.crawling.common.model.HtmlStandard;
import nu.marginalia.converting.model.HtmlStandard;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.pubdate.PubDateHeuristic;
import nu.marginalia.pubdate.PubDateParser;

View File

@ -1,6 +1,6 @@
package nu.marginalia.pubdate.heuristic;
import nu.marginalia.crawling.common.model.HtmlStandard;
import nu.marginalia.converting.model.HtmlStandard;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.pubdate.PubDateHeuristic;
import nu.marginalia.pubdate.PubDateParser;
@ -17,7 +17,8 @@ public class PubDateHeuristicUrlPatternPass2 implements PubDateHeuristic {
private static final Pattern yearUrlPattern = Pattern.compile("/\\d{4}/");
@Override
public Optional<PubDate> apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
public Optional<PubDate> apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url,
Document document, HtmlStandard htmlStandard) {
final String urlString = url.path;
var matcher = yearUrlPattern.matcher(urlString);
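The heuristic above amounts to scanning the URL path for a four-digit segment and treating it as a candidate publication year. A self-contained sketch of the idea (the class name, capture group, and year bounds are illustrative assumptions, not the production code):

```java
import java.util.Optional;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

// Sketch of the /yyyy/ URL-path heuristic; bounds are assumed, not Marginalia's actual limits.
class UrlYearSketch {
    private static final Pattern YEAR = Pattern.compile("/(\\d{4})/");

    static Optional<Integer> guessYear(String path) {
        Matcher m = YEAR.matcher(path);
        if (m.find()) {
            int year = Integer.parseInt(m.group(1));
            if (year >= 1990 && year <= 2030) // plausible publication years (assumed range)
                return Optional.of(year);
        }
        return Optional.empty();
    }

    public static void main(String[] args) {
        System.out.println(guessYear("/log/2022/12/26/post.gmi")); // Optional[2022]
        System.out.println(guessYear("/about.html"));              // Optional.empty
    }
}
```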

View File

@ -2,7 +2,7 @@ package nu.marginalia.pubdate;
import nu.marginalia.WmsaHome;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.crawling.common.model.HtmlStandard;
import nu.marginalia.converting.model.HtmlStandard;
import nu.marginalia.pubdate.heuristic.PubDateHeuristicDOMParsingPass2;
import org.jsoup.Jsoup;
import org.junit.jupiter.api.Test;

View File

@ -0,0 +1,11 @@
# Converter Features
## Major features
* [keyword-extraction](keyword-extraction/) - Identifies keywords to index in a document
## Smaller features
* [adblock](adblock/) - Simulates Adblock
* [pubdate](pubdate/) - Determines when a document was published
* [topic-detection](topic-detection/) - Tries to identify the topic of a website

View File

@ -16,7 +16,7 @@ dependencies {
implementation project(':code:common:config')
implementation project(':code:features-crawl:work-log')
implementation project(':code:libraries:guarded-regex')
implementation project(':code:crawl-models:crawling-model')
implementation project(':code:process-models:crawling-model')
implementation libs.notnull
implementation libs.lombok

View File

@ -1,4 +1,4 @@
package nu.marginalia.crawling.common.plan;
package nu.marginalia.crawl_plan;
import com.google.errorprone.annotations.MustBeClosed;
import lombok.AllArgsConstructor;

View File

@ -1,4 +1,4 @@
package nu.marginalia.crawling.common.plan;
package nu.marginalia.crawl_plan;
import org.yaml.snakeyaml.Yaml;

View File

@ -1,4 +1,4 @@
package nu.marginalia.crawling.common.plan;
package nu.marginalia.crawl_plan;
import com.github.luben.zstd.ZstdInputStream;
import com.google.gson.Gson;

View File

@ -3,11 +3,6 @@
These are relatively isolated pieces of search-engine-related business logic
that benefit from the clarity of being kept separate from the rest of the crawling code.
* [adblock](adblock/) - Simulates Adblock
* [pubdate](pubdate/) - Determines when a document was published
* [topic-detection](topic-detection/) - Tries to identify the topic of a website
* [crawl-blocklist](crawl-blocklist/) - IP and URL blocklists
* [work-log](work-log/) - Work journal for resuming long processes
* [link-parser](link-parser/) - Code for parsing and normalizing links

View File

@ -16,70 +16,72 @@ public class ParamMatchingQueryFilter implements QueryFilterStepIf {
@Override
public boolean test(long docId) {
var post = forwardIndexReader.docPost(docId & 0xFFFF_FFFFL);
int urlId = (int) (docId & 0xFFFF_FFFFL);
int domainId = forwardIndexReader.getDomainId(urlId);
long meta = forwardIndexReader.getDocMeta(urlId);
if (!validateDomain(post)) {
if (!validateDomain(domainId)) {
return false;
}
if (!validateQuality(post)) {
if (!validateQuality(meta)) {
return false;
}
if (!validateYear(post)) {
if (!validateYear(meta)) {
return false;
}
if (!validateSize(post)) {
if (!validateSize(meta)) {
return false;
}
if (!validateRank(post)) {
if (!validateRank(meta)) {
return false;
}
return true;
}
private boolean validateDomain(ForwardIndexReader.DocPost post) {
return params.searchSet().contains(post.domainId());
private boolean validateDomain(int domainId) {
return params.searchSet().contains(domainId);
}
private boolean validateQuality(ForwardIndexReader.DocPost post) {
private boolean validateQuality(long meta) {
final var limit = params.qualityLimit();
if (limit.type() == SpecificationLimitType.NONE) {
return true;
}
final int quality = DocumentMetadata.decodeQuality(post.meta());
final int quality = DocumentMetadata.decodeQuality(meta);
return limit.test(quality);
}
private boolean validateYear(ForwardIndexReader.DocPost post) {
private boolean validateYear(long meta) {
if (params.year().type() == SpecificationLimitType.NONE)
return true;
int postVal = DocumentMetadata.decodeYear(post.meta());
int postVal = DocumentMetadata.decodeYear(meta);
return params.year().test(postVal);
}
private boolean validateSize(ForwardIndexReader.DocPost post) {
private boolean validateSize(long meta) {
if (params.size().type() == SpecificationLimitType.NONE)
return true;
int postVal = DocumentMetadata.decodeSize(post.meta());
int postVal = DocumentMetadata.decodeSize(meta);
return params.size().test(postVal);
}
private boolean validateRank(ForwardIndexReader.DocPost post) {
private boolean validateRank(long meta) {
if (params.rank().type() == SpecificationLimitType.NONE)
return true;
int postVal = DocumentMetadata.decodeRank(post.meta());
int postVal = DocumentMetadata.decodeRank(meta);
return params.rank().test(postVal);
}
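The filter now reads the domain id and the packed metadata long straight out of the forward index (the low 32 bits of docId select the url id) instead of materializing a DocPost record per tested document, which matters in a hot filter loop. A rough sketch of the kind of bit-packed decoding DocumentMetadata performs (field widths and offsets here are assumptions for illustration, not the actual layout):

```java
// Illustrative sketch of decoding packed document metadata from a single long.
// Field widths and offsets are assumptions; the real DocumentMetadata layout may differ.
final class PackedMetaSketch {
    static long encode(int quality, int year, int size, int rank) {
        return (quality & 0xFL)         // bits 0-3:  quality (assumed width)
             | ((year & 0xFFL) << 4)    // bits 4-11: year offset (assumed)
             | ((size & 0xFFL) << 12)   // bits 12-19: size bucket (assumed)
             | ((rank & 0xFFL) << 20);  // bits 20-27: rank bucket (assumed)
    }

    static int decodeQuality(long meta) { return (int)  (meta         & 0xF);  }
    static int decodeYear(long meta)    { return (int) ((meta >>> 4)  & 0xFF); }
    static int decodeSize(long meta)    { return (int) ((meta >>> 12) & 0xFF); }
    static int decodeRank(long meta)    { return (int) ((meta >>> 20) & 0xFF); }
}
```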

View File

@ -2,7 +2,7 @@
The index journal contains a list of entries with keywords and keyword metadata per document.
This journal is written by [crawl-processes/loading-process](../../crawl-processes/loading-process) and read
This journal is written by [processes/loading-process](../../processes/loading-process) and read
when constructing the [forward](../index-forward) and [reverse](../index-reverse)
indices.
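Conceptually, each journal entry pairs a document with its keywords and their packed metadata. A hypothetical shape for such an entry (field names are illustrative; the real journal is a binary format with its own framing):

```java
// Hypothetical entry shape, for illustration only; not the actual journal format.
record JournalEntrySketch(long documentId,
                          long documentMeta,
                          String[] keywords,
                          long[] keywordMeta) { }
```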

View File

@ -3,7 +3,7 @@
The lexicon contains a mapping from words to identifiers. This lexicon is populated from a journal.
The words themselves aren't stored as keys; rather, a 64-bit hash of each word is.
The lexicon is written by [crawl-processes/loading-process](../../crawl-processes/loading-process) and read when
The lexicon is written by [processes/loading-process](../../processes/loading-process) and read when
[services-core/index-service](../../services-core/index-service) interprets queries.
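As a rough sketch of that hash-keyed scheme (the map type and hash function here are stand-ins, not the production implementation):

```java
import java.util.HashMap;
import java.util.Map;

// Sketch of a lexicon keyed on a 64-bit hash of the word rather than the word itself.
class LexiconSketch {
    private final Map<Long, Integer> idsByHash = new HashMap<>();
    private int nextId = 0;

    // Stand-in hash; the real dictionary uses its own stable 64-bit hash.
    static long hash64(String word) {
        long h = 1125899906842597L;
        for (int i = 0; i < word.length(); i++)
            h = 31 * h + word.charAt(i);
        return h;
    }

    int getOrCreateId(String word) {
        return idsByHash.computeIfAbsent(hash64(word), k -> nextId++);
    }
}
```

Hashing keeps the lexicon's footprint bounded by identifiers rather than strings, at the cost of a small collision risk.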
## Central Classes

View File

@ -13,6 +13,7 @@ java {
}
dependencies {
implementation project(':code:libraries:language-processing')
implementation project(':code:features-convert:keyword-extraction')
implementation project(':code:common:config')
implementation project(':code:common:model')

View File

@ -5,8 +5,8 @@ import lombok.EqualsAndHashCode;
import lombok.Getter;
import lombok.ToString;
import nu.marginalia.LanguageModels;
import nu.marginalia.keyword_extraction.KeywordExtractor;
import nu.marginalia.language.statistics.EnglishDictionary;
import nu.marginalia.language.keywords.KeywordExtractor;
import nu.marginalia.language.sentence.SentenceExtractor;
import nu.marginalia.language.statistics.NGramBloomFilter;
import nu.marginalia.language.statistics.TermFrequencyDict;

View File

@ -11,7 +11,7 @@ public class CompressedBigString implements BigString {
private final int length;
private final byte[] encoded;
private static final LZ4Factory lz4Factory = LZ4Factory.fastestInstance();;
private static final LZ4Factory lz4Factory = LZ4Factory.fastestInstance();
private static final LZ4Compressor compressor = lz4Factory.fastCompressor();
private static final LZ4FastDecompressor decompressor = lz4Factory.fastDecompressor();

View File

@ -0,0 +1,19 @@
# Language Processing
This library contains various tools used in language processing.
## Central Classes
* [SentenceExtractor](src/main/java/nu/marginalia/language/sentence/SentenceExtractor.java) -
Creates a [DocumentLanguageData](src/main/java/nu/marginalia/language/model/DocumentLanguageData.java) from a text, containing
its words, their stems, POS tags, and so on.
* [TermFrequencyDict](src/main/java/nu/marginalia/language/statistics/TermFrequencyDict.java)
* [NGramBloomFilter](src/main/java/nu/marginalia/language/statistics/NGramBloomFilter.java)
## See Also
[features-convert/keyword-extraction](../../features-convert/keyword-extraction) uses this code to identify which keywords
are important.
[features-search/query-parser](../../features-search/query-parser) also does some language processing.
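A minimal usage sketch, mirroring the constructor and call pattern visible in the tests in this commit (the model source and the `sentences` field name are assumptions):

```java
import nu.marginalia.WmsaHome;
import nu.marginalia.language.sentence.SentenceExtractor;
import org.jsoup.Jsoup;

// Usage sketch; follows the constructor/call pattern from this commit's tests.
public class SentenceExtractorDemo {
    public static void main(String[] args) {
        var se = new SentenceExtractor(WmsaHome.getLanguageModels()); // model source is an assumption
        var dld = se.extractSentences(Jsoup.parse("<title>Hello World</title><p>A short example.</p>"));
        for (var sentence : dld.sentences) { // field name assumed for illustration
            System.out.println(sentence);
        }
    }
}
```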

View File

@ -1,98 +0,0 @@
package nu.marginalia.language.model;
import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap;
import nu.marginalia.model.idx.WordMetadata;
import nu.marginalia.model.idx.WordFlags;
import java.util.EnumSet;
import java.util.HashSet;
import java.util.Objects;
public final class KeywordMetadata {
private static final WordFrequencyData empty = new WordFrequencyData(0);
public final HashSet<String> titleKeywords = new HashSet<>(50);
public final HashSet<String> subjectKeywords = new HashSet<>(10);
public final HashSet<String> namesKeywords = new HashSet<>(50);
public final HashSet<String> urlKeywords = new HashSet<>(10);
public final HashSet<String> domainKeywords = new HashSet<>(10);
public final Object2IntOpenHashMap<String> wordsTfIdf;
public final Object2IntOpenHashMap<String> positionMask;
private final EnumSet<WordFlags> wordFlagsTemplate;
public KeywordMetadata(EnumSet<WordFlags> flags) {
this.positionMask = new Object2IntOpenHashMap<>(10_000, 0.7f);
this.wordsTfIdf = new Object2IntOpenHashMap<>(10_000, 0.7f);
this.wordFlagsTemplate = flags;
}
public KeywordMetadata() {
this(EnumSet.noneOf(WordFlags.class));
}
public long getMetadataForWord(EnumSet<WordFlags> flagsTemplate, String stemmed) {
int tfidf = wordsTfIdf.getOrDefault(stemmed, 0);
EnumSet<WordFlags> flags = flagsTemplate.clone();
if (tfidf > 100)
flags.add(WordFlags.TfIdfHigh);
if (subjectKeywords.contains(stemmed))
flags.add(WordFlags.Subjects);
if (namesKeywords.contains(stemmed))
flags.add(WordFlags.NamesWords);
if (titleKeywords.contains(stemmed))
flags.add(WordFlags.Title);
if (urlKeywords.contains(stemmed))
flags.add(WordFlags.UrlPath);
if (domainKeywords.contains(stemmed))
flags.add(WordFlags.UrlDomain);
int positions = positionMask.getOrDefault(stemmed, 0);
return new WordMetadata(tfidf, positions, flags).encode();
}
public EnumSet<WordFlags> wordFlagsTemplate() {
return wordFlagsTemplate;
}
@Override
public boolean equals(Object obj) {
if (obj == this) return true;
if (obj == null || obj.getClass() != this.getClass()) return false;
var that = (KeywordMetadata) obj;
return Objects.equals(this.titleKeywords, that.titleKeywords) &&
Objects.equals(this.subjectKeywords, that.subjectKeywords) &&
Objects.equals(this.namesKeywords, that.namesKeywords) &&
Objects.equals(this.wordsTfIdf, that.wordsTfIdf) &&
Objects.equals(this.positionMask, that.positionMask) &&
Objects.equals(this.wordFlagsTemplate, that.wordFlagsTemplate);
}
@Override
public int hashCode() {
return Objects.hash(titleKeywords, subjectKeywords, namesKeywords, wordsTfIdf, positionMask, wordFlagsTemplate);
}
@Override
public String toString() {
return "KeywordMetadata[" +
"titleKeywords=" + titleKeywords + ", " +
"subjectKeywords=" + subjectKeywords + ", " +
"namesKeywords=" + namesKeywords + ", " +
"wordsTfIdf=" + wordsTfIdf + ", " +
"positionMask=" + positionMask + ", " +
"wordFlagsTemplate=" + wordFlagsTemplate + ']';
}
}
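getMetadataForWord above folds an EnumSet of WordFlags, together with tf-idf and position data, into a single packed word via WordMetadata.encode(). A generic sketch of that EnumSet-to-bitmask pattern (the flag names and bit layout here are illustrative assumptions):

```java
import java.util.EnumSet;

// Generic sketch of packing an EnumSet into a long bitmask,
// the kind of encoding WordMetadata.encode() performs internally.
enum Flag { TFIDF_HIGH, SUBJECTS, NAMES, TITLE, URL_PATH, URL_DOMAIN }

class FlagPackSketch {
    static long encode(EnumSet<Flag> flags) {
        long mask = 0;
        for (Flag f : flags)
            mask |= 1L << f.ordinal();
        return mask;
    }

    static EnumSet<Flag> decode(long mask) {
        EnumSet<Flag> flags = EnumSet.noneOf(Flag.class);
        for (Flag f : Flag.values())
            if ((mask & (1L << f.ordinal())) != 0)
                flags.add(f);
        return flags;
    }
}
```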

View File

@ -37,16 +37,6 @@ public class WordSpan implements Comparable<WordSpan>{
}
public boolean hasSimilarWords(DocumentSentence s, WordSpan other) {
for (int i = start; i < end; i++) {
for (int j = other.start; j < other.end; j++) {
if (s.stemmedWords[i].equals(s.stemmedWords[j]))
return true;
}
}
return false;
}
public String toString() {
return String.format("WordSpan[%s,%s]", start, end);
}

View File

@ -173,17 +173,7 @@ public class TermFrequencyDict {
return wordRates.get(longHash(s.getBytes()));
}
public static String getStemmedString(String s) {
String[] strings = separator.split(s);
if (s.length() > 1) {
return Arrays.stream(strings).map(ps::stemWord).collect(Collectors.joining("_"));
}
else {
return s;
}
}
// If this ever changes, we need to re-generate the term frequency dictionary
public static long longHash(byte[]... bytesSets) {
if (bytesSets == null || bytesSets.length == 0)
return 0;

View File

@ -17,7 +17,7 @@ dependencies {
implementation project(':code:common:service-client')
implementation project(':code:libraries:language-processing')
implementation project(':code:crawl-models:common')
implementation project(':code:features-convert:keyword-extraction')
implementation libs.lombok
annotationProcessor libs.lombok

View File

@ -0,0 +1,4 @@
# Converting Models
Contains models shared by the [converting-process](../../processes/converting-process/) and
[loading-process](../../processes/loading-process/).

View File

@ -1,10 +1,10 @@
package nu.marginalia.converting.instruction;
import nu.marginalia.keyword_extraction.model.DocumentKeywords;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawl.DomainIndexingState;
import nu.marginalia.model.idx.DocumentMetadata;
import nu.marginalia.converting.model.DocumentKeywords;
import nu.marginalia.converting.instruction.instructions.DomainLink;
import nu.marginalia.converting.instruction.instructions.LoadProcessedDocument;
import nu.marginalia.converting.instruction.instructions.LoadProcessedDocumentWithError;

View File

@ -1,6 +1,6 @@
package nu.marginalia.converting.instruction.instructions;
import nu.marginalia.converting.model.DocumentKeywords;
import nu.marginalia.keyword_extraction.model.DocumentKeywords;
import nu.marginalia.model.idx.DocumentMetadata;
import nu.marginalia.converting.instruction.Instruction;
import nu.marginalia.converting.instruction.InstructionTag;

View File

@ -1,6 +1,5 @@
package nu.marginalia.converting.instruction.instructions;
import nu.marginalia.crawling.common.model.HtmlStandard;
import nu.marginalia.model.crawl.UrlIndexingState;
import nu.marginalia.converting.instruction.Instruction;
import nu.marginalia.converting.instruction.InstructionTag;
@ -14,7 +13,7 @@ public record LoadProcessedDocument(EdgeUrl url,
String title,
String description,
int htmlFeatures,
HtmlStandard standard,
String standard,
int length,
long hash,
double quality,

View File

@ -1,4 +1,4 @@
package nu.marginalia.crawling.common.model;
package nu.marginalia.converting.model;
public enum HtmlStandard {

View File

@ -1,7 +1,7 @@
# Crawling Models
Contains models shared by the [crawling-process](../../crawl-processes/crawling-process/) and
[converting-process](../../crawl-processes/converting-process/).
Contains models shared by the [crawling-process](../../processes/crawling-process/) and
[converting-process](../../processes/converting-process/).
## Central Classes

Some files were not shown because too many files have changed in this diff.