More restructuring, big bug fixes in keyword extraction.

parent 281f1322a9
commit d82532b7f1

@@ -1,3 +0,0 @@
# Crawl Common

Contains model classes shared by the whole crawl-process-load ecosystem.

@@ -1,10 +0,0 @@
log4j2.isThreadContextMapInheritable=true
status = info
appender.console.type = Console
appender.console.name = LogToConsole
appender.console.layout.type = PatternLayout
appender.console.layout.pattern = %highlight{%-5level}{FATAL=red, ERROR=red, WARN=yellow} %c{1}- %msg{nolookups}%n
appender.console.filter.http.type = MarkerFilter
rootLogger.level = info
rootLogger.appenderRef.console.ref = LogToConsole
#rootLogger.appenderRef.http.ref = LogHttpTraffic

@@ -1,4 +0,0 @@
# Converting Models

Contains models shared by the [converting-process](../../crawl-processes/converting-process/) and
[loading-process](../../crawl-processes/loading-process/).

@@ -1,101 +0,0 @@
package nu.marginalia.converting.processor.keywords;

import nu.marginalia.converting.processor.keywords.extractors.*;
import nu.marginalia.language.WordPatterns;
import nu.marginalia.language.encoding.AsciiFlattener;
import nu.marginalia.language.keywords.KeywordExtractor;
import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.language.model.KeywordMetadata;
import nu.marginalia.language.model.WordRep;
import nu.marginalia.converting.model.DocumentKeywordsBuilder;
import nu.marginalia.language.statistics.TermFrequencyDict;
import nu.marginalia.model.EdgeUrl;

import javax.inject.Inject;
import java.util.*;
import java.util.stream.Collectors;

public class DocumentKeywordExtractor {

    private final KeywordExtractor keywordExtractor;
    private final KeywordCounter tfIdfCounter;
    private final NameCounter nameCounter;
    private final SubjectCounter subjectCounter;
    private final ArtifactKeywords artifactKeywords;

    private final SimpleKeywords simpleKeywords;

    private final UrlKeywords urlKeywords;
    private final DocumentKeywordPositionBitmaskExtractor keywordPositions;


    @Inject
    public DocumentKeywordExtractor(TermFrequencyDict dict) {
        keywordExtractor = new KeywordExtractor();

        keywordPositions = new DocumentKeywordPositionBitmaskExtractor(keywordExtractor);
        artifactKeywords = new ArtifactKeywords();

        urlKeywords = new UrlKeywords();
        tfIdfCounter = new KeywordCounter(dict, keywordExtractor);
        nameCounter = new NameCounter(keywordExtractor);
        subjectCounter = new SubjectCounter(keywordExtractor);
        simpleKeywords = new SimpleKeywords(keywordExtractor);
    }


    public DocumentKeywordsBuilder extractKeywords(DocumentLanguageData documentLanguageData, EdgeUrl url) {

        KeywordMetadata keywordMetadata = keywordPositions.getWordPositions(documentLanguageData);

        List<WordRep> wordsTfIdf = tfIdfCounter.updateWordStatistics(keywordMetadata, documentLanguageData);

        List<WordRep> titleWords = extractTitleWords(documentLanguageData);
        List<WordRep> wordsNamesAll = nameCounter.count(documentLanguageData, 2);
        List<WordRep> subjects = subjectCounter.count(keywordMetadata, documentLanguageData);
        List<String> artifacts = artifactKeywords.getArtifactKeywords(documentLanguageData);

        for (var rep : titleWords) keywordMetadata.titleKeywords.add(rep.stemmed);
        for (var rep : wordsNamesAll) keywordMetadata.namesKeywords.add(rep.stemmed);
        for (var rep : subjects) keywordMetadata.subjectKeywords.add(rep.stemmed);

        keywordMetadata.urlKeywords.addAll(urlKeywords.getUrlKeywords(url));
        keywordMetadata.domainKeywords.addAll(urlKeywords.getDomainKeywords(url));

        DocumentKeywordsBuilder wordsBuilder = new DocumentKeywordsBuilder();

        simpleKeywords.getSimpleWords(wordsBuilder, keywordMetadata, documentLanguageData);

        createWords(wordsBuilder, keywordMetadata, wordsTfIdf);
        createWords(wordsBuilder, keywordMetadata, titleWords);
        createWords(wordsBuilder, keywordMetadata, subjects);

        wordsBuilder.addAllSyntheticTerms(artifacts);

        return wordsBuilder;
    }


    private List<WordRep> extractTitleWords(DocumentLanguageData documentLanguageData) {
        return Arrays.stream(documentLanguageData.titleSentences).flatMap(sent ->
                keywordExtractor.getWordsFromSentence(sent).stream().sorted().distinct().map(w -> new WordRep(sent, w)))
                .limit(100)
                .collect(Collectors.toList());
    }

    public void createWords(DocumentKeywordsBuilder wordsBuilder,
                            KeywordMetadata metadata,
                            Collection<WordRep> words) {

        for (var word : words) {

            String flatWord = AsciiFlattener.flattenUnicode(word.word);

            if (WordPatterns.hasWordQualities(flatWord)) {
                wordsBuilder.add(flatWord, metadata.getMetadataForWord(metadata.wordFlagsTemplate(), word.stemmed));
            }
        }
    }

}

@@ -1,109 +0,0 @@
package nu.marginalia.converting.processor.keywords.extractors;

import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap;
import nu.marginalia.language.WordPatterns;
import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.language.model.KeywordMetadata;
import nu.marginalia.language.model.WordRep;
import nu.marginalia.language.keywords.KeywordExtractor;
import nu.marginalia.language.statistics.TermFrequencyDict;
import org.apache.commons.lang3.StringUtils;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;

import static java.lang.Math.max;

public class KeywordCounter {
    private final KeywordExtractor keywordExtractor;
    private final TermFrequencyDict dict;
    private final double docCount;

    public KeywordCounter(TermFrequencyDict dict, KeywordExtractor keywordExtractor) {
        this.dict = dict;
        this.keywordExtractor = keywordExtractor;
        this.docCount = dict.docCount();
    }

    public List<WordRep> updateWordStatistics(KeywordMetadata keywordMetadata, DocumentLanguageData dld) {
        Object2IntOpenHashMap<String> counts = new Object2IntOpenHashMap<>(10_000, 0.7f);
        HashMap<String, HashSet<WordRep>> instances = new HashMap<>(15000);


        for (var sent : dld.sentences) {
            var keywords = keywordExtractor.getKeywordsFromSentence(sent);
            for (var span : keywords) {

                if (span.size() == 1 && WordPatterns.isStopWord(sent.words[span.start])) {
                    continue;
                }

                var rep = new WordRep(sent, span);

                counts.mergeInt(rep.stemmed, 1, Integer::sum);

                var instanceSet = instances.computeIfAbsent(rep.stemmed, k -> new HashSet<>(16));
                if (instanceSet.size() < 4) {
                    instanceSet.add(rep);
                }
            }
        }

        Object2IntOpenHashMap<String> tfIdf = keywordMetadata.wordsTfIdf;
        List<WordRep> tfIdfHigh = new ArrayList<>();

        int maxVal = maxValue(counts);

        counts.forEach((key, cnt) -> {
            int value = getTermValue(key, cnt, maxVal);

            tfIdf.put(key, value);

            if (cnt > 1 && value > 100) {
                tfIdfHigh.addAll(instances.get(key));
            }
        });

        return tfIdfHigh;
    }

    private int maxValue(Object2IntOpenHashMap<?> map) {
        int maxC = 0;

        for (int c : map.values()) {
            maxC = max(c, maxC);
        }

        return maxC;
    }

    public int getTermValue(String key, int count, double maxValue) {
        if (key.indexOf('_') >= 0) {
            String[] parts = StringUtils.split(key, '_');
            double totalValue = 0.;
            for (String part : parts) {
                totalValue += value(part, count, maxValue);
            }
            return normalizeValue(totalValue / parts.length);
        }
        else {
            return normalizeValue(value(key, count, maxValue));
        }
    }

    int normalizeValue(double v) {
        return (int)(-v*75);
    }

    double value(String key, double value, double maxValue) {
        double freq = dict.getTermFreqStemmed(key);
        if (freq < 1) {
            freq = 1;
        }
        return (0.1 + 0.9*value/maxValue) * Math.log(freq/docCount);
    }

}

@@ -1,45 +0,0 @@
package nu.marginalia.converting.processor.keywords.extractors;

import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.language.model.DocumentSentence;
import nu.marginalia.language.model.WordRep;
import nu.marginalia.language.keywords.KeywordExtractor;

import java.util.*;
import java.util.stream.Collectors;

public class NameCounter {
    private final KeywordExtractor keywordExtractor;

    public NameCounter(KeywordExtractor keywordExtractor) {
        this.keywordExtractor = keywordExtractor;
    }

    public List<WordRep> count(DocumentLanguageData dld, int minCount) {
        HashMap<String, Double> counts = new HashMap<>(1000);
        HashMap<String, HashSet<WordRep>> instances = new HashMap<>(1000);

        for (int i = 0; i < dld.sentences.length; i++) {
            DocumentSentence sent = dld.sentences[i];
            var keywords = keywordExtractor.getProperNames(sent);
            for (var span : keywords) {
                if (span.size() <= 1)
                    continue;

                var stemmed = sent.constructStemmedWordFromSpan(span);

                counts.merge(stemmed, 1., Double::sum);
                instances.computeIfAbsent(stemmed, k -> new HashSet<>()).add(new WordRep(sent, span));
            }
        }

        return counts.entrySet().stream()
                .filter(e -> e.getValue() >= minCount)
                .sorted(Comparator.comparing(e -> -e.getValue()))
                .limit(150)
                .map(Map.Entry::getKey)
                .flatMap(w -> instances.get(w).stream())
                .collect(Collectors.toList());
    }

}

@@ -1,54 +0,0 @@
package nu.marginalia.converting.processor.keywords.extractors;

import nu.marginalia.converting.model.DocumentKeywordsBuilder;
import nu.marginalia.language.WordPatterns;
import nu.marginalia.language.encoding.AsciiFlattener;
import nu.marginalia.language.keywords.KeywordExtractor;
import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.language.model.KeywordMetadata;
import nu.marginalia.language.model.WordRep;
import nu.marginalia.model.idx.WordFlags;

import java.util.EnumSet;

public class SimpleKeywords {
    private final KeywordExtractor keywordExtractor;

    public SimpleKeywords(KeywordExtractor keywordExtractor) {
        this.keywordExtractor = keywordExtractor;
    }

    public void getSimpleWords(DocumentKeywordsBuilder wordsBuilder,
                               KeywordMetadata metadata,
                               DocumentLanguageData documentLanguageData) {

        EnumSet<WordFlags> flagsTemplate = EnumSet.noneOf(WordFlags.class);

        for (var sent : documentLanguageData.sentences) {

            if (wordsBuilder.size() > 1500)
                break;

            for (var word : sent) {
                if (word.isStopWord()) {
                    continue;
                }

                String w = AsciiFlattener.flattenUnicode(word.wordLowerCase());
                if (WordPatterns.singleWordQualitiesPredicate.test(w)) {
                    wordsBuilder.add(w, metadata.getMetadataForWord(flagsTemplate, word.stemmed()));
                }
            }

            for (var names : keywordExtractor.getProperNames(sent)) {
                var rep = new WordRep(sent, names);
                String w = AsciiFlattener.flattenUnicode(rep.word);

                wordsBuilder.add(w, metadata.getMetadataForWord(flagsTemplate, rep.stemmed));
            }
        }

    }

}

@@ -1,27 +0,0 @@
package nu.marginalia.converting.processor.keywords.extractors;

import ca.rmen.porterstemmer.PorterStemmer;
import nu.marginalia.model.EdgeUrl;

import java.util.Arrays;
import java.util.Set;
import java.util.stream.Collectors;

public class UrlKeywords {
    private final PorterStemmer ps = new PorterStemmer();

    public Set<String> getUrlKeywords(EdgeUrl url) {
        String path = url.path;

        return Arrays.stream(path.split("[^a-z0-9A-Z]+"))
                .map(ps::stemWord)
                .collect(Collectors.toSet());
    }

    public Set<String> getDomainKeywords(EdgeUrl url) {
        return Arrays.stream(url.domain.domain.split("[^a-z0-9A-Z]+"))
                .filter(s -> s.length() > 3)
                .map(ps::stemWord)
                .collect(Collectors.toSet());
    }
}

@@ -1,129 +0,0 @@
package nu.marginalia.converting.tool;

import nu.marginalia.LanguageModels;
import nu.marginalia.converting.processor.keywords.extractors.KeywordCounter;
import nu.marginalia.converting.processor.keywords.extractors.NameCounter;
import nu.marginalia.language.keywords.KeywordExtractor;
import nu.marginalia.language.sentence.SentenceExtractor;
import nu.marginalia.language.model.DocumentSentence;
import nu.marginalia.language.model.WordRep;
import nu.marginalia.language.model.WordSeparator;
import nu.marginalia.language.statistics.TermFrequencyDict;
import org.jsoup.nodes.Document;

import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.PrintWriter;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.*;
import java.util.stream.Collectors;

public class DocumentDebugger {
    private final KeywordCounter kc;
    private final SentenceExtractor se;
    private final KeywordExtractor ke;
    private final NameCounter nc;

    final Map<String, Path> docsByPath = new TreeMap<>();
    Path tempDir;
    public DocumentDebugger(LanguageModels lm) throws IOException {
        se = new SentenceExtractor(lm);
        var dict = new TermFrequencyDict(lm);
        ke = new KeywordExtractor();

        kc = new KeywordCounter(dict, ke);
        nc = new NameCounter(ke);

        tempDir = Files.createTempDirectory("documentdebugger");
    }

    public void writeIndex() throws FileNotFoundException {
        var output = tempDir.resolve("index.html");

        try (var pw = new PrintWriter(new FileOutputStream(output.toFile()))) {
            pw.println("<ul>");

            docsByPath.forEach((name, path) -> {
                pw.println("<li>");
                pw.printf("<a href=\"file://%s\">%s</a>", path, name);
                pw.println("</li>");
            });


            pw.println("</ul>");
        }

        System.out.println(output);
    }

    public Path debugDocument(String name, Document document) throws IOException {

        var output = tempDir.resolve(name.substring(name.lastIndexOf("/")+1)+".html");
        docsByPath.put(name, output);

        document.select("table,sup,.reference").remove();
        var languageData = se.extractSentences(document);

        Set<String> reps = new HashSet<>();

        try (var pw = new PrintWriter(new FileOutputStream(output.toFile()))) {

            for (var sent : languageData.titleSentences) {
                pw.print("<h1>");
                printSent(pw, sent, reps);
                pw.println("</h1>");
            }

            for (var sent : languageData.sentences) {
                pw.println("<div>");
                printSent(pw, sent, reps);
                pw.println("</div>");
            }
        }

        return output;
    }

    private void printSent(PrintWriter pw, DocumentSentence sent, Set<String> words) {
        TreeMap<Integer, Set<WordRep>> spans = new TreeMap<>();

        var names = ke.getKeywordsFromSentence(sent);

        for (var span : names) {
            for (int j = 0; j < span.size(); j++) {
                spans.computeIfAbsent(span.start + j, n -> new HashSet<>()).add(new WordRep(sent, span));
            }
        }

        for (int i = 0; i < sent.words.length; i++) {
            List<WordRep> matches = spans.getOrDefault(i, Collections.emptySet()).stream().filter(rep -> true || words.contains(rep.stemmed)).collect(Collectors.toList());

            printTag(pw, sent, i, matches);
        }
    }

    private void printTag(PrintWriter pw, DocumentSentence sent, int i, List<WordRep> matches) {

        String style;
        if (matches.isEmpty()) {
            style = "";
        }
        else if (matches.size() == 1 && !matches.get(0).word.contains("_")) {
            style = "text-decoration: underline; color: #00f";
        }
        else {
            style = "text-decoration: underline; color: #f00";
        }
        pw.printf("<ruby title=\"%s\" style=\"%s\">",
                matches.stream().map(rep -> rep.word).collect(Collectors.joining(", ")),
                style
        );
        pw.print(sent.words[i]);
        pw.print("<rt>"); pw.println(sent.posTags[i]); pw.print("</rt>");
        pw.print("</ruby> ");
        if (sent.separators[i] == WordSeparator.COMMA)
            pw.printf(", ");
    }
}

code/features-convert/keyword-extraction/build.gradle (new file, 56 lines)
@@ -0,0 +1,56 @@
plugins {
    id 'java'
    id "io.freefair.lombok" version "5.3.3.3"
    id 'application'
    id 'jvm-test-suite'
}

java {
    toolchain {
        languageVersion.set(JavaLanguageVersion.of(17))
    }
}

application {
    mainClass = 'nu.marginalia.converting.ConverterMain'
    applicationName = 'converter-process'
}

tasks.distZip.enabled = false

dependencies {
    implementation project(':third-party:porterstemmer')
    implementation project(':code:common:model')
    implementation project(':code:common:config')
    implementation project(':code:libraries:language-processing')

    implementation libs.lombok
    annotationProcessor libs.lombok
    implementation libs.bundles.slf4j

    implementation libs.notnull

    implementation libs.jsoup
    implementation libs.commons.lang3

    implementation libs.guice
    implementation libs.guava

    implementation libs.trove
    implementation libs.fastutil

    testImplementation libs.bundles.slf4j.test
    testImplementation libs.bundles.junit
    testImplementation libs.mockito
}

test {
    maxHeapSize = "8G"
    useJUnitPlatform()
}

task fastTests(type: Test) {
    useJUnitPlatform {
        excludeTags "slow"
    }
}

code/features-convert/keyword-extraction/readme.md (new file, 13 lines)
@@ -0,0 +1,13 @@
# Keyword Extraction

This code deals with identifying keywords in a document, their positions in the document,
their importance based on [TF-IDF](https://en.wikipedia.org/wiki/Tf-idf), and their grammatical
functions based on [POS tags](https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html).

## Central Classes

* [DocumentKeywordExtractor](src/main/java/nu/marginalia/keyword_extraction/DocumentKeywordExtractor.java)

## See Also

* [libraries/language-processing](../../libraries/language-processing) does a lot of the heavy lifting.

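A minimal usage sketch for the class the readme points at, pieced together from the constructors and tests in this commit (it is not part of the committed readme). The `EdgeUrl(String)` constructor and the `TestLanguageModels` helper, a test utility used by the new tests below, are assumptions here:

```java
// Sketch: wiring the new DocumentKeywordExtractor together, based on the classes in this commit.
// Assumed: TestLanguageModels.getLanguageModels() (test helper) and the EdgeUrl(String) constructor.
import nu.marginalia.keyword_extraction.DocumentKeywordExtractor;
import nu.marginalia.language.sentence.SentenceExtractor;
import nu.marginalia.language.statistics.TermFrequencyDict;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.test.util.TestLanguageModels;

class KeywordExtractionSketch {
    public static void main(String[] args) throws Exception {
        var lm = TestLanguageModels.getLanguageModels();

        var se = new SentenceExtractor(lm);
        var dict = new TermFrequencyDict(lm);
        var extractor = new DocumentKeywordExtractor(dict);

        // extractSentences(text, title) is the overload the new tests use
        var dld = se.extractSentences("Caesar crossed the Rubicon in 49 BC.", "Julius Caesar");

        var keywords = extractor.extractKeywords(dld, new EdgeUrl("https://www.example.com/caesar"));
        System.out.println(keywords.size() + " keywords extracted");
    }
}
```
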
@@ -0,0 +1,104 @@
package nu.marginalia.keyword_extraction;

import nu.marginalia.keyword_extraction.extractors.*;
import nu.marginalia.keyword_extraction.model.DocumentKeywordsBuilder;
import nu.marginalia.language.WordPatterns;
import nu.marginalia.language.encoding.AsciiFlattener;
import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.language.model.WordRep;
import nu.marginalia.language.statistics.TermFrequencyDict;
import nu.marginalia.model.EdgeUrl;

import javax.inject.Inject;

public class DocumentKeywordExtractor {

    private final KeywordExtractor keywordExtractor;
    private final TermFrequencyDict dict;


    @Inject
    public DocumentKeywordExtractor(TermFrequencyDict dict) {
        this.dict = dict;
        this.keywordExtractor = new KeywordExtractor();
    }


    public DocumentKeywordsBuilder extractKeywords(DocumentLanguageData dld, EdgeUrl url) {

        var bitmask = new KeywordPositionBitmask(keywordExtractor, dld);
        var tfIdfCounts = new WordsTfIdfCounts(dict, keywordExtractor, dld);

        var titleKeywords = new TitleKeywords(keywordExtractor, dld);
        var nameLikeKeywords = new NameLikeKeywords(keywordExtractor, dld, 2);
        var subjectLikeKeywords = new SubjectLikeKeywords(keywordExtractor, tfIdfCounts, dld);
        var artifactKeywords = new ArtifactKeywords(dld);
        var urlKeywords = new UrlKeywords(url);

        var keywordMetadata = KeywordMetadata.builder()
                .bitmask(bitmask)
                .tfIdfCounts(tfIdfCounts)
                .titleKeywords(titleKeywords)
                .nameLikeKeywords(nameLikeKeywords)
                .subjectLikeKeywords(subjectLikeKeywords)
                .urlKeywords(urlKeywords)
                .build();

        DocumentKeywordsBuilder wordsBuilder = new DocumentKeywordsBuilder();

        createSimpleWords(wordsBuilder, keywordMetadata, dld);

        createWordsFromSet(wordsBuilder, keywordMetadata, tfIdfCounts);
        createWordsFromSet(wordsBuilder, keywordMetadata, titleKeywords);
        createWordsFromSet(wordsBuilder, keywordMetadata, subjectLikeKeywords);
        createWordsFromSet(wordsBuilder, keywordMetadata, nameLikeKeywords);

        wordsBuilder.addAllSyntheticTerms(artifactKeywords.getWords());

        return wordsBuilder;
    }


    private void createWordsFromSet(DocumentKeywordsBuilder wordsBuilder,
                                    KeywordMetadata metadata,
                                    WordReps words) {

        for (var word : words.getReps()) {

            String flatWord = AsciiFlattener.flattenUnicode(word.word);

            if (WordPatterns.hasWordQualities(flatWord)) {
                wordsBuilder.add(flatWord, metadata.getMetadataForWord(word.stemmed));
            }
        }
    }

    private void createSimpleWords(DocumentKeywordsBuilder wordsBuilder,
                                   KeywordMetadata metadata,
                                   DocumentLanguageData documentLanguageData)
    {
        for (var sent : documentLanguageData.sentences) {

            if (wordsBuilder.size() > 1500)
                break;

            for (var word : sent) {
                if (word.isStopWord()) {
                    continue;
                }

                String w = AsciiFlattener.flattenUnicode(word.wordLowerCase());
                if (WordPatterns.singleWordQualitiesPredicate.test(w)) {
                    wordsBuilder.add(w, metadata.getMetadataForWord(word.stemmed()));
                }
            }

            for (var names : keywordExtractor.getProperNames(sent)) {
                var rep = new WordRep(sent, names);
                String w = AsciiFlattener.flattenUnicode(rep.word);

                wordsBuilder.add(w, metadata.getMetadataForWord(rep.stemmed));
            }
        }
    }
}

@@ -1,4 +1,4 @@
package nu.marginalia.language.keywords;
package nu.marginalia.keyword_extraction;

import nu.marginalia.language.WordPatterns;
import nu.marginalia.language.model.DocumentSentence;
@@ -60,8 +60,9 @@ public class KeywordExtractor {
        List<WordSpan> spans = new ArrayList<>(2 * sentence.length());

        for (int i = 0; i < sentence.length(); i++) {
            if (isNoun(i, sentence))
                spans.add(new WordSpan(i, i+1));
            if (isNoun(i, sentence)) {
                spans.add(new WordSpan(i, i + 1));
            }
        }

        for (int i = 1; i < sentence.length(); i++) {

@@ -0,0 +1,64 @@
package nu.marginalia.keyword_extraction;

import lombok.Builder;
import nu.marginalia.keyword_extraction.extractors.*;
import nu.marginalia.model.idx.WordMetadata;
import nu.marginalia.model.idx.WordFlags;

import java.util.EnumSet;

class KeywordMetadata {

    private final KeywordPositionBitmask bitmask;
    private final TitleKeywords titleKeywords;
    private final NameLikeKeywords nameLikeKeywords;
    private final SubjectLikeKeywords subjectLikeKeywords;
    private final UrlKeywords urlKeywords;
    private final WordsTfIdfCounts tfIdfCounts;

    @Builder
    public KeywordMetadata(
            KeywordPositionBitmask bitmask,
            TitleKeywords titleKeywords,
            NameLikeKeywords nameLikeKeywords,
            SubjectLikeKeywords subjectLikeKeywords,
            UrlKeywords urlKeywords,
            WordsTfIdfCounts tfIdfCounts) {

        this.bitmask = bitmask;
        this.titleKeywords = titleKeywords;
        this.nameLikeKeywords = nameLikeKeywords;
        this.subjectLikeKeywords = subjectLikeKeywords;
        this.urlKeywords = urlKeywords;
        this.tfIdfCounts = tfIdfCounts;
    }

    public long getMetadataForWord(String stemmed) {

        int tfidf = tfIdfCounts.getTfIdf(stemmed);
        EnumSet<WordFlags> flags = EnumSet.noneOf(WordFlags.class);

        if (tfidf > 100)
            flags.add(WordFlags.TfIdfHigh);

        if (subjectLikeKeywords.contains(stemmed))
            flags.add(WordFlags.Subjects);

        if (nameLikeKeywords.contains(stemmed))
            flags.add(WordFlags.NamesWords);

        if (titleKeywords.contains(stemmed))
            flags.add(WordFlags.Title);

        if (urlKeywords.containsUrl(stemmed))
            flags.add(WordFlags.UrlPath);

        if (urlKeywords.containsDomain(stemmed))
            flags.add(WordFlags.UrlDomain);

        int positions = bitmask.get(stemmed);

        return new WordMetadata(tfidf, positions, flags).encode();
    }

}

@@ -0,0 +1,9 @@
package nu.marginalia.keyword_extraction;

import nu.marginalia.language.model.WordRep;

import java.util.Collection;

public interface WordReps {
    Collection<WordRep> getReps();
}

@@ -1,19 +1,18 @@
package nu.marginalia.converting.processor.keywords.extractors;
package nu.marginalia.keyword_extraction.extractors;

import nu.marginalia.language.model.DocumentLanguageData;

import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.*;
import java.util.regex.Pattern;

public class ArtifactKeywords {

    private static final Pattern mailLikePattern = Pattern.compile("[a-zA-Z0-9._\\-]+@[a-zA-Z0-9]+(\\.[a-zA-Z0-9]+)+");

    public List<String> getArtifactKeywords(DocumentLanguageData documentLanguageData) {
        Set<String> reps = new HashSet<>();
    private final Set<String> words;

    public ArtifactKeywords(DocumentLanguageData documentLanguageData) {
        words = new HashSet<>();

        for (var sent : documentLanguageData.sentences) {
            for (var word : sent) {
@@ -24,22 +23,25 @@ public class ArtifactKeywords {
                    continue;
                }

                reps.add(lc);
                words.add(lc);

                String domain = lc.substring(lc.indexOf('@'));
                String user = lc.substring(0, lc.indexOf('@'));

                if (!domain.equals("@hotmail.com") && !domain.equals("@gmail.com") && !domain.equals("@paypal.com")) {
                    reps.add(domain);
                    words.add(domain.substring(1));
                    words.add(domain);
                }
                if (!user.equals("info") && !user.equals("legal") && !user.equals("contact") && !user.equals("donotreply")) {
                    reps.add(user);
                    words.add(user);
                }

            }
        }

        return new ArrayList<>(reps);
    }

    public Collection<String> getWords() {
        return words;
    }
}

@@ -1,35 +1,27 @@
package nu.marginalia.converting.processor.keywords.extractors;
package nu.marginalia.keyword_extraction.extractors;

import com.google.inject.Inject;
import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap;
import nu.marginalia.language.keywords.KeywordExtractor;
import nu.marginalia.keyword_extraction.KeywordExtractor;
import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.language.model.KeywordMetadata;

/** Generates a position bitmask for each word in a document */
public class DocumentKeywordPositionBitmaskExtractor {
    private final KeywordExtractor keywordExtractor;
public class KeywordPositionBitmask {
    private final Object2IntOpenHashMap<String> positionMask = new Object2IntOpenHashMap<>(10_000, 0.7f);

    @Inject
    public DocumentKeywordPositionBitmaskExtractor(KeywordExtractor keywordExtractor) {
        this.keywordExtractor = keywordExtractor;
    }

    public KeywordMetadata getWordPositions(DocumentLanguageData dld) {
        final KeywordMetadata keywordMetadata = new KeywordMetadata();

        Object2IntOpenHashMap<String> ret = keywordMetadata.positionMask;
    public KeywordPositionBitmask(KeywordExtractor keywordExtractor, DocumentLanguageData dld) {

        // Mark the title words as position 0
        for (var sent : dld.titleSentences) {
            int posBit = 1;

            for (var word : sent) {
                ret.merge(word.stemmed(), posBit, this::bitwiseOr);
                positionMask.merge(word.stemmed(), posBit, this::bitwiseOr);
            }

            for (var span : keywordExtractor.getProperNames(sent)) {
                ret.merge(sent.constructStemmedWordFromSpan(span), posBit, this::bitwiseOr);
                positionMask.merge(sent.constructStemmedWordFromSpan(span), posBit, this::bitwiseOr);
            }
        }

@@ -40,17 +32,19 @@ public class DocumentKeywordPositionBitmaskExtractor {
            int posBit = (int)((1L << linePos.pos()) & 0xFFFF_FFFFL);

            for (var word : sent) {
                ret.merge(word.stemmed(), posBit, this::bitwiseOr);
                positionMask.merge(word.stemmed(), posBit, this::bitwiseOr);
            }

            for (var span : keywordExtractor.getProperNames(sent)) {
                ret.merge(sent.constructStemmedWordFromSpan(span), posBit, this::bitwiseOr);
                positionMask.merge(sent.constructStemmedWordFromSpan(span), posBit, this::bitwiseOr);
            }

            linePos.next();
        }
    }

        return keywordMetadata;
    public int get(String stemmed) {
        return positionMask.getOrDefault(stemmed, 0);
    }

    private int bitwiseOr(int a, int b) {

@@ -0,0 +1,70 @@
package nu.marginalia.keyword_extraction.extractors;

import com.google.common.base.CharMatcher;
import it.unimi.dsi.fastutil.objects.Object2IntMap;
import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap;
import nu.marginalia.keyword_extraction.WordReps;
import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.language.model.DocumentSentence;
import nu.marginalia.language.model.WordRep;
import nu.marginalia.keyword_extraction.KeywordExtractor;

import java.util.*;
import java.util.stream.Collectors;

/** Keywords that look like they could be a name */
public class NameLikeKeywords implements WordReps {
    private final List<WordRep> nameWords;
    private final Set<String> stemmed;

    public NameLikeKeywords(KeywordExtractor keywordExtractor, DocumentLanguageData dld, int minCount) {
        Object2IntOpenHashMap<String> counts = new Object2IntOpenHashMap<>(1000);
        HashMap<String, HashSet<WordRep>> instances = new HashMap<>(1000);

        final var isUpperCase = CharMatcher.forPredicate(Character::isUpperCase);

        for (int i = 0; i < dld.sentences.length; i++) {
            DocumentSentence sent = dld.sentences[i];
            var keywords = keywordExtractor.getProperNames(sent);
            for (var span : keywords) {
                if (span.size() <= 1 && isUpperCase.matchesAllOf(sent.words[span.start]))
                    continue;

                var stemmed = sent.constructStemmedWordFromSpan(span);

                counts.addTo(stemmed, -1);
                instances.computeIfAbsent(stemmed, k -> new HashSet<>()).add(new WordRep(sent, span));
            }
        }

        nameWords = counts.object2IntEntrySet().stream()
                .filter(e -> hasEnough(e, minCount))
                .sorted(Comparator.comparingInt(Object2IntMap.Entry::getIntValue))
                .limit(150)
                .map(Map.Entry::getKey)
                .flatMap(w -> instances.get(w).stream())
                .collect(Collectors.toList());

        stemmed = nameWords.stream().map(WordRep::getStemmed).collect(Collectors.toSet());
    }

    public boolean hasEnough(Object2IntMap.Entry<String> entry, int minCount) {
        final int count = -entry.getIntValue();

        if (entry.getKey().contains("_")) {
            return count >= minCount;
        }
        else {
            return count >= minCount + 1;
        }
    }

    public boolean contains(String wordStemmed) {
        return stemmed.contains(wordStemmed);
    }

    @Override
    public Collection<WordRep> getReps() {
        return nameWords;
    }
}

@@ -1,48 +1,49 @@
package nu.marginalia.converting.processor.keywords.extractors;
package nu.marginalia.keyword_extraction.extractors;

import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap;
import nu.marginalia.keyword_extraction.WordReps;
import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.language.model.KeywordMetadata;
import nu.marginalia.language.model.WordRep;
import nu.marginalia.language.model.WordSpan;
import nu.marginalia.language.model.WordSeparator;
import nu.marginalia.language.keywords.KeywordExtractor;
import nu.marginalia.keyword_extraction.KeywordExtractor;
import org.apache.commons.lang3.StringUtils;

import java.util.*;
import java.util.stream.Collectors;

public class SubjectCounter {
    private final KeywordExtractor keywordExtractor;

    public SubjectCounter(KeywordExtractor keywordExtractor) {
        this.keywordExtractor = keywordExtractor;
    }
public class SubjectLikeKeywords implements WordReps {
    private final List<WordRep> wordList;
    private final Set<String> stemmed;

    // Seeks out subjects in a sentence by constructs like
    //
    // [Name] (Verbs) (the|a|Adverb|Verb) ...
    // [Name] (Verbs) (the|a|Adverb|Verb|Noun) ...
    // e.g.
    //
    // Greeks bearing gifts -> Greeks
    // Steve McQueen drove fast | cars -> Steve McQueen

    public List<WordRep> count(KeywordMetadata keywordMetadata, DocumentLanguageData dld) {
    public SubjectLikeKeywords(KeywordExtractor keywordExtractor,
                               WordsTfIdfCounts tfIdfCounts,
                               DocumentLanguageData dld) {

        Map<String, Set<WordRep>> instances = new HashMap<>();

        for (var sentence : dld.sentences) {
            for (WordSpan kw : keywordExtractor.getNouns(sentence)) {

                if (kw.end + 2 >= sentence.length()) {
                    continue;
                }
                if (sentence.separators[kw.end] == WordSeparator.COMMA
                        || sentence.separators[kw.end + 1] == WordSeparator.COMMA)
                    break;
                    continue;

                String nextTag = sentence.posTags[kw.end];
                String nextNextTag = sentence.posTags[kw.end+1];

                if (isVerb(nextTag) && isDetOrAdverbOrVerb(nextNextTag)) {
                if (isVerb(nextTag) && isDetOrAdverbOrVerbOrNoun(nextNextTag)) {
                    var span = new WordSpan(kw.start, kw.end);
                    var rep = new WordRep(sentence, span);

@@ -53,18 +54,30 @@ public class SubjectCounter {
            }
        }

        Map<String, Integer> scores = new HashMap<>(instances.size());
        Object2IntOpenHashMap<String> scores = new Object2IntOpenHashMap<>(instances.size());
        for (String stemmed : instances.keySet()) {
            scores.put(stemmed, getTermTfIdf(keywordMetadata, stemmed));
            scores.put(stemmed, getTermTfIdf(tfIdfCounts, stemmed));
        }

        return scores.entrySet().stream()
                .filter(e -> e.getValue() >= 150)
        wordList = scores.object2IntEntrySet().stream()
                .filter(e -> e.getIntValue() >= 100)
                .flatMap(e -> instances.getOrDefault(e.getKey(), Collections.emptySet()).stream())
                .collect(Collectors.toList());


        stemmed = wordList.stream().map(WordRep::getStemmed).collect(Collectors.toSet());
    }

    private int getTermTfIdf(KeywordMetadata keywordMetadata, String stemmed) {
    public boolean contains(String wordStemmed) {
        return stemmed.contains(wordStemmed);
    }

    @Override
    public Collection<WordRep> getReps() {
        return wordList;
    }

    private int getTermTfIdf(WordsTfIdfCounts tfIdfCounts, String stemmed) {
        if (stemmed.contains("_")) {
            int sum = 0;
            String[] parts = StringUtils.split(stemmed, '_');
@@ -74,25 +87,26 @@ public class SubjectCounter {
            }

            for (String part : parts) {
                sum += getTermTfIdf(keywordMetadata, part);
                sum += getTermTfIdf(tfIdfCounts, part);
            }

            return sum / parts.length;
        }

        return keywordMetadata.wordsTfIdf.getOrDefault(stemmed, 0);
        return tfIdfCounts.getTfIdf(stemmed);
    }

    private boolean isDetOrAdverbOrVerb(String posTag) {
    private boolean isDetOrAdverbOrVerbOrNoun(String posTag) {
        return "DT".equals(posTag) // determinant
                || "RB".equals(posTag) // adverb
                || posTag.startsWith("RB") // adverb
                || posTag.startsWith("VB") // verb
                || posTag.startsWith("JJ"); // adjective
                || posTag.startsWith("JJ") // adjective
                || posTag.startsWith("P")
                || posTag.startsWith("NN");
    }

    boolean isVerb(String posTag) {
        return posTag.startsWith("VB")
                && !posTag.equals("VB"); // not interested in the infinitive
    }

}

@@ -0,0 +1,35 @@
package nu.marginalia.keyword_extraction.extractors;

import nu.marginalia.keyword_extraction.WordReps;
import nu.marginalia.keyword_extraction.KeywordExtractor;
import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.language.model.WordRep;

import java.util.Arrays;
import java.util.Collection;
import java.util.Set;
import java.util.stream.Collectors;

/** Extract keywords from the title */
public class TitleKeywords implements WordReps {
    private final Set<WordRep> titleKeywords;
    private final Set<String> stemmed;

    public TitleKeywords(KeywordExtractor keywordExtractor, DocumentLanguageData documentLanguageData) {
        titleKeywords = Arrays.stream(documentLanguageData.titleSentences).flatMap(sent ->
                keywordExtractor.getWordsFromSentence(sent).stream().sorted().distinct().map(w -> new WordRep(sent, w)))
                .limit(100)
                .collect(Collectors.toSet());

        stemmed = titleKeywords.stream().map(WordRep::getStemmed).collect(Collectors.toSet());
    }

    public boolean contains(String wordStemmed) {
        return stemmed.contains(wordStemmed);
    }

    @Override
    public Collection<WordRep> getReps() {
        return titleKeywords;
    }
}

@@ -0,0 +1,40 @@
package nu.marginalia.keyword_extraction.extractors;

import ca.rmen.porterstemmer.PorterStemmer;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;

import java.util.Arrays;
import java.util.Set;
import java.util.stream.Collectors;

/** Extract keywords from the URL */
public class UrlKeywords {
    private static final PorterStemmer ps = new PorterStemmer();

    private final Set<String> urlKeywords;
    private final Set<String> domainKeywords;

    public UrlKeywords(EdgeUrl url) {
        String path = url.path;


        urlKeywords = Arrays.stream(path.split("[^a-z0-9A-Z]+"))
                .map(ps::stemWord)
                .collect(Collectors.toSet());

        domainKeywords = Arrays.stream(url.domain.toString().split("[^a-z0-9A-Z]+"))
                .filter(s -> s.length() > 3)
                .map(ps::stemWord)
                .collect(Collectors.toSet());
    }

    public boolean containsUrl(String stemmed) {
        return urlKeywords.contains(stemmed);
    }


    public boolean containsDomain(String stemmed) {
        return domainKeywords.contains(stemmed);
    }
}

@@ -0,0 +1,143 @@
package nu.marginalia.keyword_extraction.extractors;

import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap;
import nu.marginalia.keyword_extraction.WordReps;
import nu.marginalia.language.WordPatterns;
import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.language.model.DocumentSentence;
import nu.marginalia.language.model.WordRep;
import nu.marginalia.keyword_extraction.KeywordExtractor;
import nu.marginalia.language.model.WordSpan;
import nu.marginalia.language.statistics.TermFrequencyDict;
import org.apache.commons.lang3.StringUtils;

import java.util.*;

import static java.lang.Math.max;

/** Extract counts and TF-IDF for the words in the document,
 * keep track of high-scoring words for flagging
 */
public class WordsTfIdfCounts implements WordReps {
    private final TermFrequencyDict dict;
    private final double docCount;

    private final Object2IntOpenHashMap<String> tfIdf;
    private final Set<WordRep> tfIdfHigh;

    public WordsTfIdfCounts(TermFrequencyDict dict,
                            KeywordExtractor keywordExtractor,
                            DocumentLanguageData dld) {
        this.dict = dict;
        this.docCount = dict.docCount();

        this.tfIdf = new Object2IntOpenHashMap<>(10_000);

        var counts = getCounts(keywordExtractor, dld);
        int maxVal = maxValue(counts);
        Set<String> highTfIdfInstances = new HashSet<>();

        counts.forEach((key, cnt) -> {
            int value = getTermValue(key, cnt, maxVal);

            tfIdf.put(key, value);
            if (cnt > 1 && value > 100) {
                highTfIdfInstances.add(key);
            }
        });

        // Collect words with a high TF-IDF so that they can be marked with a bit flag

        tfIdfHigh = new HashSet<>(100);
        for (var sent : dld.sentences) {
            var keywords = keywordExtractor.getKeywordsFromSentence(sent);
            for (var span : keywords) {
                if (highTfIdfInstances.contains(spanToStemmed(sent, span))) {
                    tfIdfHigh.add(new WordRep(sent, span));
                }
            }
        }

    }

    private Object2IntOpenHashMap<String> getCounts(KeywordExtractor keywordExtractor, DocumentLanguageData dld) {
        Object2IntOpenHashMap<String> counts = new Object2IntOpenHashMap<>(10_000, 0.7f);
        counts.defaultReturnValue(0);

        for (var sent : dld.sentences) {
            var keywords = keywordExtractor.getKeywordsFromSentence(sent);
            for (var span : keywords) {

                if (span.size() == 1
                        && WordPatterns.isStopWord(sent.words[span.start]))
                {
                    continue;
                }

                counts.addTo(spanToStemmed(sent, span), 1);
            }
        }

        return counts;
    }

    private String spanToStemmed(DocumentSentence sentence, WordSpan span) {
        if (span.size() == 1)
            return sentence.stemmedWords[span.start];

        StringBuilder builder = new StringBuilder();
        for (int i = span.start; i < span.end; i++) {
            if (!builder.isEmpty())
                builder.append('_');
            builder.append(sentence.stemmedWords[i]);
        }
        return builder.toString();

    }

    public int getTfIdf(String stemmed) {
        return tfIdf.getOrDefault(stemmed, 0);
    }

    @Override
    public Collection<WordRep> getReps() {
        return tfIdfHigh;
    }

    private int maxValue(Object2IntOpenHashMap<?> map) {
        int maxC = 0;

        for (int c : map.values()) {
            maxC = max(c, maxC);
        }

        return maxC;
    }

    public int getTermValue(String key, int count, double maxValue) {
        if (key.indexOf('_') >= 0) {
            String[] parts = StringUtils.split(key, '_');
            double totalValue = 0.;
            for (String part : parts) {
                totalValue += value(part, count, maxValue);
            }
            return normalizeValue(totalValue / parts.length);
        }
        else {
            return normalizeValue(value(key, count, maxValue));
        }
    }

    int normalizeValue(double v) {
        return (int)(-v*75);
    }

    double value(String key, double value, double maxValue) {
        double freq = dict.getTermFreqStemmed(key);
        if (freq < 1) {
            freq = 1;
        }
        return (0.1 + 0.9*value/maxValue) * Math.log(freq/docCount);
    }

}

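As a reading aid derived from the code above (it is not something this commit adds), the scoring implemented by getTermValue, value and normalizeValue works out to

$$\mathrm{score}(w) \;=\; \Big\lfloor -75 \cdot \Big(0.1 + 0.9\,\frac{c(w)}{c_{\max}}\Big) \cdot \ln\frac{\max(f(w),\,1)}{N} \Big\rfloor$$

where $c(w)$ is the in-document count of the keyword, $c_{\max}$ the largest count in the document, $f(w)$ the corpus frequency from TermFrequencyDict, and $N$ the corpus document count; underscore-joined multi-word keys average the per-part values before normalization. Since $f(w) \le N$, the logarithm is non-positive and the score non-negative, so rare terms that occur often in the document score highest. Keys with a count above 1 and a score above 100 become the high-TF-IDF representatives returned by getReps(), and the same threshold later sets the TfIdfHigh flag in KeywordMetadata.
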
@@ -1,4 +1,4 @@
package nu.marginalia.converting.model;
package nu.marginalia.keyword_extraction.model;


import nu.marginalia.model.idx.WordMetadata;

@@ -1,4 +1,4 @@
package nu.marginalia.converting.model;
package nu.marginalia.keyword_extraction.model;

import it.unimi.dsi.fastutil.objects.Object2LongLinkedOpenHashMap;
import lombok.Getter;
@@ -63,10 +63,9 @@ public class DocumentKeywordsBuilder {
    public void addAllSyntheticTerms(Collection<String> newWords) {
        long meta = WordFlags.Synthetic.asBit();

        newWords.forEach(word -> {
            words.putIfAbsent(word, meta);
        });
        // Only add the synthetic flag if the words aren't already present

        newWords.forEach(word -> words.putIfAbsent(word, meta));
    }

    public List<String> getWordsWithAnyFlag(long flags) {

@@ -1,4 +1,4 @@
package nu.marginalia.converting.processor.keywords;
package nu.marginalia.keyword_extraction;

import lombok.SneakyThrows;
import nu.marginalia.LanguageModels;
@@ -7,7 +7,6 @@ import nu.marginalia.language.model.WordRep;
import nu.marginalia.language.model.WordSpan;
import nu.marginalia.language.sentence.SentenceExtractor;
import nu.marginalia.language.statistics.TermFrequencyDict;
import nu.marginalia.language.keywords.KeywordExtractor;
import nu.marginalia.language.model.WordSeparator;
import nu.marginalia.WmsaHome;
import nu.marginalia.model.EdgeUrl;

@@ -0,0 +1,22 @@
package nu.marginalia.keyword_extraction.extractors;

import nu.marginalia.language.sentence.SentenceExtractor;
import nu.marginalia.test.util.TestLanguageModels;
import org.junit.jupiter.api.Test;

import static org.junit.jupiter.api.Assertions.*;

class ArtifactKeywordsTest {

    @Test
    public void testExtractArtifacts() {
        SentenceExtractor se = new SentenceExtractor(TestLanguageModels.getLanguageModels());

        var artifacts = new ArtifactKeywords(se.extractSentences("Hello I'm <vlofgren@marginalia.nu>, what's up?", "hello!"));
        System.out.println(artifacts.getWords());
        assertTrue(artifacts.getWords().contains("vlofgren"));
        assertTrue(artifacts.getWords().contains("marginalia.nu"));
        assertTrue(artifacts.getWords().contains("@marginalia.nu"));
        assertTrue(artifacts.getWords().contains("vlofgren@marginalia.nu"));
    }
}

@@ -0,0 +1,53 @@
package nu.marginalia.keyword_extraction.extractors;

import com.google.common.collect.Sets;
import nu.marginalia.keyword_extraction.KeywordExtractor;
import nu.marginalia.language.sentence.SentenceExtractor;
import nu.marginalia.test.util.TestLanguageModels;
import org.junit.jupiter.api.Test;

import java.util.Collections;
import java.util.Set;
import java.util.stream.Collectors;

import static org.junit.jupiter.api.Assertions.*;

class NameLikeKeywordsTest {
    String text = """
            In 60 BC, Caesar, Crassus, and Pompey formed the First Triumvirate, an informal political alliance that
            dominated Roman politics for several years. Their attempts to amass power as Populares were opposed by
            the Optimates within the Roman Senate, among them Cato the Younger with the frequent support of Cicero.
            Caesar rose to become one of the most powerful politicians in the Roman Republic through a string of
            military victories in the Gallic Wars, completed by 51 BC, which greatly extended Roman territory.
            During this time he both invaded Britain and built a bridge across the Rhine river. These achievements
            and the support of his veteran army threatened to eclipse the standing of Pompey, who had realigned himself
            with the Senate after the death of Crassus in 53 BC. With the Gallic Wars concluded, the Senate ordered
            Caesar to step down from his military command and return to Rome. In 49 BC, Caesar openly defied the
            Senate's authority by crossing the Rubicon and marching towards Rome at the head of an army. This
            began Caesar's civil war, which he won, leaving him in a position of near unchallenged power and
            influence in 45 BC.

            After assuming control of government, Caesar began a program of social and governmental reforms,
            including the creation of the Julian calendar. He gave citizenship to many residents of far regions
            of the Roman Republic. He initiated land reform and support for veterans. He centralized the
            bureaucracy of the Republic and was eventually proclaimed "dictator for life" (dictator perpetuo).
            His populist and authoritarian reforms angered the elites, who began to conspire against him. On the
            Ides of March (15 March) 44 BC, Caesar was assassinated by a group of rebellious senators led by Brutus
            and Cassius, who stabbed him to death. A new series of civil wars broke out and the constitutional
            government of the Republic was never fully restored. Caesar's great-nephew and adopted heir Octavian,
            later known as Augustus, rose to sole power after defeating his opponents in the last civil war of
            the Roman Republic. Octavian set about solidifying his power, and the era of the Roman Empire began.
            """;

    @Test
    public void test() {
        SentenceExtractor se = new SentenceExtractor(TestLanguageModels.getLanguageModels());
        NameLikeKeywords keywords = new NameLikeKeywords(new KeywordExtractor(), se.extractSentences(text, "Julius Caesar"), 2);
        Set<String> actual = keywords.getReps().stream().map(rep -> rep.word).collect(Collectors.toSet());
        Set<String> expected = Set.of("caesar", "senate", "roman", "republic", "roman_republic");

        // rome isn't counted because PorterStemmer is derp

        assertEquals(Collections.emptySet(), Sets.symmetricDifference(actual, expected));
    }
}

@@ -0,0 +1,62 @@
package nu.marginalia.keyword_extraction.extractors;

import com.google.common.collect.Sets;
import nu.marginalia.keyword_extraction.KeywordExtractor;
import nu.marginalia.language.sentence.SentenceExtractor;
import nu.marginalia.language.statistics.TermFrequencyDict;
import nu.marginalia.test.util.TestLanguageModels;
import org.junit.jupiter.api.Test;

import java.util.Collections;
import java.util.Set;
import java.util.stream.Collectors;

import static org.junit.jupiter.api.Assertions.assertEquals;

class SubjectLikeKeywordsTest {
    String text = """
            In 60 BC, Caesar, Crassus, and Pompey formed the First Triumvirate, an informal political alliance that
            dominated Roman politics for several years. Their attempts to amass power as Populares were opposed by
            the Optimates within the Roman Senate, among them Cato the Younger with the frequent support of Cicero.
            Caesar rose to become one of the most powerful politicians in the Roman Republic through a string of
            military victories in the Gallic Wars, completed by 51 BC, which greatly extended Roman territory.
            During this time he both invaded Britain and built a bridge across the Rhine river. These achievements
            and the support of his veteran army threatened to eclipse the standing of Pompey, who had realigned himself
            with the Senate after the death of Crassus in 53 BC. With the Gallic Wars concluded, the Senate ordered
            Caesar to step down from his military command and return to Rome. In 49 BC, Caesar openly defied the
            Senate's authority by crossing the Rubicon and marching towards Rome at the head of an army. This
            began Caesar's civil war, which he won, leaving him in a position of near unchallenged power and
            influence in 45 BC.

            After assuming control of government, Caesar began a program of social and governmental reforms,
            including the creation of the Julian calendar. He gave citizenship to many residents of far regions
            of the Roman Republic. He initiated land reform and support for veterans. He centralized the
            bureaucracy of the Republic and was eventually proclaimed "dictator for life" (dictator perpetuo).
            His populist and authoritarian reforms angered the elites, who began to conspire against him. On the
            Ides of March (15 March) 44 BC, Caesar was assassinated by a group of rebellious senators led by Brutus
            and Cassius, who stabbed him to death. A new series of civil wars broke out and the constitutional
            government of the Republic was never fully restored. Caesar's great-nephew and adopted heir Octavian,
            later known as Augustus, rose to sole power after defeating his opponents in the last civil war of
            the Roman Republic. Octavian set about solidifying his power, and the era of the Roman Empire began.
            """;

    @Test
    public void test() {
        var lm = TestLanguageModels.getLanguageModels();
        var dict = new TermFrequencyDict(lm);

        SentenceExtractor se = new SentenceExtractor(lm);
        var dld = se.extractSentences(text, "Julius Caesar");

        WordsTfIdfCounts tfIdfCounts = new WordsTfIdfCounts(dict, new KeywordExtractor(), dld);
        SubjectLikeKeywords keywords = new SubjectLikeKeywords(new KeywordExtractor(),
                tfIdfCounts,
                dld);

        Set<String> actual = keywords.getReps().stream().map(rep -> rep.word).collect(Collectors.toSet());
        Set<String> expected = Set.of("caesar", "republic", "authoritarian_reforms", "senate", "pompey", "reforms", "government_of_the_republic");

        assertEquals(Collections.emptySet(), Sets.symmetricDifference(actual, expected));
    }

}

@@ -0,0 +1,208 @@
package nu.marginalia.keyword_extraction.extractors;

import com.google.common.collect.Sets;
import nu.marginalia.keyword_extraction.KeywordExtractor;
import nu.marginalia.language.sentence.SentenceExtractor;
import nu.marginalia.test.util.TestLanguageModels;
import org.jsoup.Jsoup;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;

import java.util.Collections;
import java.util.Set;
import java.util.stream.Collectors;

class TitleKeywordsTest {

String document = """
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<title>MEMEX - Creepy Website Similarity [ 2022-12-26 ]</title>
<link rel="stylesheet" href="/style-new.css" />
<meta name="viewport" content="width=device-width, initial-scale=1.0">
\s
</head>
<body class="double" lang="en">

<header>
<nav>
<a href="http://www.marginalia.nu/">Marginalia</a>
<a href="http://search.marginalia.nu/">Search Engine</a>
<a href="http://encyclopedia.marginalia.nu/">Encyclopedia</a>
</nav>
</header>
<nav class="topbar">
<h1>Memex</h1>

<a href="/" class="path root"><img src="/ico/root.png" title="root"> marginalia</a>

<a href="/log" class="path dir"><img src="/ico/dir.png" title="dir"> log</a>

<a href="/log/69-creepy-website-similarity.gmi" class="path file"><img src="/ico/file.png" title="file"> 69-creepy-website-similarity.gmi</a>

</nav>

<article>
<section id="memex-node">
<h1 id="1">Creepy Website Similarity [ 2022-12-26 ]</h1>
<br>
This is a write-up about an experiment from a few months ago, in how to find websites that are similar to each other. Website similarity is useful for many things, including discovering new websites to crawl, as well as suggesting similar websites in the Marginalia Search random exploration mode.<br>
<br>
<dl class="link"><dt><a class="external" href="https://explore2.marginalia.nu/">https://explore2.marginalia.nu/</a></dt><dd>A link to a slapdash interface for exploring the experimental data.</dd></dl>
<br>
The approach chosen was to use the link graph look for websites that are linked to from the same websites. This turned out to work remarkably well. <br>
<br>
There are some alternative feature spaces that might have been used, such as TF-IDF data. Using incident links turned out to be surprisingly powerful, almost to an uncanny degree as it's able to find similarities even among websites that Marginalia doesn't index.<br>
<br>
As a whole the feature shares a lot of similarity with how you would construct a recommendation algorithm of the type "other shoppers also bought", and in doing so also exposes how creepy they can be. You can't build a recommendation engine without building a tool for profiling. It's largely the same thing.<br>
<br>
If you for example point the website explorer to the fringes of politics, it will map that web-space with terrifying accuracy.<br>
<br>
<dl class="link"><dt><a class="external" href="https://explore2.marginalia.nu/search?domain=qanon.pub">https://explore2.marginalia.nu/search?domain=qanon.pub</a></dt><dd>qanon.pub's neighbors</dd></dl>
<br>
Note again how few of those websites are actually indexed by Marginalia. Only those websites with 'MS' links are! The rest are inferred from the data. On the one hand it's fascinating and cool, on the other it's deeply troubling: If I can create such a map on PC in my living room, imagine what might be accomplished with a datacenter.<br>
<br>
You might think "Well what's the problem? QAnon deserves all the scrutiny, give them nowhere to hide!". Except this sort of tool could concievably work just as well as well for mapping democracy advocates in Hong Kong, Putin-critics in Russia, gay people in Uganda, and so forth.<br>
<br>
<h2 id="1.1">Implementation details</h2>
<br>
In practice, cosine similarity is used to compare the similarity between websites. This is a statistical method perhaps most commonly used in machine learning, but it has other uses as well. <br>
<br>
Cosine similarity is calculated by taking the inner product of two vectors and dividing by their norms<br>
<br>
<pre>
a x b
p = ---------\s
|a| |b|</pre>
<br>
As you might remember from linear algebra, this is a measure of how much two vectors "pull in the same direction". The cosine similarity of two identical vectors is unity, and for orthogonal vectors it is zero.<br>
<br>
This data has extremely high dimensionality, the vector space consists of nearly 10 million domains, so most "standard" tools like numpy/scipy will not load the data without serious massaging. That juice doesn't appear to be worth the squeeze when it's just as easy to roll what you need on your own (which you'd probably need to do regardless to get it into those tools, Random Reprojection or some such). <br>
<br>
Since the vectors in questions are just bitmaps, either a website has a link or it does not, the vector product can be simplified to a logical AND operation. The first stab at the problem was to use RoaringBitmaps.<br>
<br>
<pre>
double cosineSimilarity(RoaringBitmap a, RoaringBitmap b) {
double andCardinality = RoaringBitmap.andCardinality(a, b);
andCardinality /= Math.sqrt(a.getCardinality());
andCardinality /= Math.sqrt(b.getCardinality());
return andCardinality;
}
</pre>
<br>
This works but it's just a bit too slow to be practical. Sacrificing some memory for speed turns out to be necessary. Roaring Bitmaps are memory efficient, but a general purpose library. It's easy to create a drop-in replacement that implements only andCardinality() and getCardinality() in a way that caters to the specifics of the data. <br>
<br>
A simple 64 bit bloom filter makes it possible to short-circuit a lot of the calculations since many vectors are small and trivially don't overlap. The vector data is stored in sorted lists. Comparing sorted lists is very cache friendly and fast, while using relatively little memory. Storing a dense matrix would require RAM on the order of hundreds of terabytes so that's no good.<br>
<br>
The actual code rewritten for brevity, as a sketch the and-cardinality calculation looks like this, and performs about 5-20x faster than RoaringBitmaps for this specfic use case:<br>
<br>
<pre>

int andCardinality(AndCardIntSet a, AndCardIntSet b) {

if ((a.hash & b.hash) == 0) {
return 0;
}

int i = 0, j = 0;
int card = 0;

do {
int diff = a.backingList.getQuick(i) - b.backingList.getQuick(j);

if (diff < 0) i++;
else if (diff > 0) j++;
else {
i++;
j++;
card++;
}
} while (i < a.getCardinality() && j < b.getCardinality());

return card;
\s
}
</pre>
<br>
This calculates similarities between websites at a rate where it's feasible to pre-calculate the similarities between all known websites within a couple of days. It's on the cusp of being viable to offer ad-hoc calculations, but not quite without being a denial of service-hazard. <br>
<br>
To do this in real time, the search space could be reduced using some form of locality-sensitive hash scheme, although for a proof of concept this performs well enough on its own. <br>
<br>
<h2 id="1.2">Closing thoughts</h2>
<br>
This has been online for a while and I've been debating whether to do this write-up. To be honest this is probably the creepiest piece of software I've built. <br>
<br>
At the same time, I can't imagine I'm the first to conceive of doing this. To repeat, you almost can't build a suggestions engine without this type of side-effect, and recommendations are *everywhere* these days. They are on Spotify, Youtube, Facebook, Reddit, Twitter, Amazon, Netflix, Google, even small web shops have them. <br>
<br>
In that light, it's better to make the discovery public and highlight its potential so that it might serve as an example of how and why these recommendation algorithms are profoundly creepy. <br>
<br>
<h2 id="1.3">Topic</h2>
<br>
<a class="internal" href="/topic/astrolabe.gmi">/topic/astrolabe.gmi</a><br>
<a class="internal" href="/topic/programming.gmi">/topic/programming.gmi</a><br>



</section>
<div id="sidebar">
<section class="tools">
<h1>69-creepy-website-similarity.gmi</h1>
<a class="download" href="/api/raw?url=/log/69-creepy-website-similarity.gmi">Raw</a><br>
<a rel="nofollow" href="/api/update?url=/log/69-creepy-website-similarity.gmi" class="verb">Edit</a>
<a rel="nofollow" href="/api/rename?type=gmi&url=/log/69-creepy-website-similarity.gmi" class="verb">Rename</a>
\s
<br/>
<div class="toc">
\s
<a href="#1" class="heading-1">1 Creepy Website Similarity [ 2022-12-26 ]</a>
\s
<a href="#1.1" class="heading-2">1.1 Implementation details</a>
\s
<a href="#1.2" class="heading-2">1.2 Closing thoughts</a>
\s
<a href="#1.3" class="heading-2">1.3 Topic</a>
\s
</div>
</section>


<section id="memex-backlinks">
<h1 id="backlinks"> Backlinks </h1>
<dl>
<dt><a href="/log/73-new-approach-to-ranking.gmi">/log/73-new-approach-to-ranking.gmi</a></dt>
<dd>A new approach to domain ranking [ 2023-02-06 ] - See Also</dd>
</dl>
</section>


</div>
</article>
<footer>
Reach me at <a class="fancy-teknisk" href="mailto:kontakt@marginalia.nu">kontakt@marginalia.nu</a>.
<br />
</footer>
</body>
""";

@Test
public void extractTitleWords() {
var se = new SentenceExtractor(TestLanguageModels.getLanguageModels());

var reps = new TitleKeywords(new KeywordExtractor(), se.extractSentences(Jsoup.parse(document))).getReps();
var words = reps.stream().map(rep -> rep.word).collect(Collectors.toSet());

Set<String> expected = Set.of(
"creepy",
"website",
"similarity",
"creepy_website",
"website_similarity",
"creepy_website_similarity",
"memex", "2022-12-26");

Assertions.assertEquals(Collections.emptySet(),
Sets.symmetricDifference(words, expected));
}
}
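The blog post quoted in the fixture above describes the similarity measure only in fragments (one `<pre>` block for the RoaringBitmap version, another for the sorted-list and-cardinality loop). Purely as a self-contained illustration of that idea — sorted link-id lists, a 64-bit hash used as a cheap bloom-filter-style short-circuit, and cosine similarity computed as AND-cardinality over the norms — a standalone sketch could look like the following. The class name, the hashing choice, and the exact structure here are assumptions for illustration, not the code used by the crawler.

```java
// Hypothetical sketch of the similarity measure described in the quoted post.
class LinkVector {
    final int[] sorted;  // ascending ids of referring domains
    final long hash;     // 64-bit "bloom filter" over the ids

    LinkVector(int[] sortedIds) {
        this.sorted = sortedIds;
        long h = 0;
        for (int id : sortedIds) h |= 1L << (id & 63);
        this.hash = h;
    }

    static int andCardinality(LinkVector a, LinkVector b) {
        if ((a.hash & b.hash) == 0) return 0; // trivially disjoint, skip the merge

        int i = 0, j = 0, card = 0;
        while (i < a.sorted.length && j < b.sorted.length) {
            int diff = a.sorted[i] - b.sorted[j];
            if (diff < 0) i++;
            else if (diff > 0) j++;
            else { i++; j++; card++; }
        }
        return card;
    }

    static double cosineSimilarity(LinkVector a, LinkVector b) {
        if (a.sorted.length == 0 || b.sorted.length == 0) return 0;
        return andCardinality(a, b)
                / (Math.sqrt(a.sorted.length) * Math.sqrt(b.sorted.length));
    }
}
```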
@@ -0,0 +1,39 @@
package nu.marginalia.keyword_extraction.extractors;

import ca.rmen.porterstemmer.PorterStemmer;
import nu.marginalia.model.EdgeUrl;
import org.junit.jupiter.api.Test;

import java.net.URISyntaxException;

import static org.junit.jupiter.api.Assertions.*;

class UrlKeywordsTest {
private final PorterStemmer ps = new PorterStemmer();

@Test
void containsDomain() throws URISyntaxException {
var keywords = new UrlKeywords(new EdgeUrl("https://memex.marginalia.nu/log/69-creepy-website-similarity.gmi"));
assertTrue(keywords.containsDomain(ps.stemWord("memex")));
assertTrue(keywords.containsDomain(ps.stemWord("marginalia")));
}

@Test
void containsDomainNoWWWNoCom() throws URISyntaxException {
var keywords = new UrlKeywords(new EdgeUrl("https://www.example.com/log/69-creepy-website-similarity.gmi"));
assertTrue(keywords.containsDomain(ps.stemWord("example")));
assertFalse(keywords.containsDomain(ps.stemWord("www")));
assertFalse(keywords.containsDomain(ps.stemWord("com")));
}

@Test
void pathFragments() throws URISyntaxException {
var keywords = new UrlKeywords(new EdgeUrl("https://memex.marginalia.nu/log/69-creepy-website-similarity.gmi"));
assertTrue(keywords.containsUrl(ps.stemWord("creepy")));
assertTrue(keywords.containsUrl(ps.stemWord("website")));
assertTrue(keywords.containsUrl(ps.stemWord("similarity")));
assertTrue(keywords.containsUrl(ps.stemWord("69")));
assertTrue(keywords.containsUrl(ps.stemWord("log")));
assertFalse(keywords.containsUrl(ps.stemWord("memex")));
}
}
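The UrlKeywords class itself is not part of this excerpt of the diff. Purely as an illustration of the behaviour the tests above pin down — stemmed tokens from the host minus `www` and the TLD, and stemmed tokens from the path — a standalone sketch might look like this. The class name mirrors the test, but the body is an assumption built on java.net.URI, not the committed implementation (which takes an EdgeUrl).

```java
import ca.rmen.porterstemmer.PorterStemmer;

import java.net.URI;
import java.util.Arrays;
import java.util.Set;
import java.util.stream.Collectors;

// Hypothetical sketch of a UrlKeywords-like extractor; not the committed class.
class UrlKeywordsSketch {
    private static final PorterStemmer ps = new PorterStemmer();
    private final Set<String> domainTerms;
    private final Set<String> pathTerms;

    UrlKeywordsSketch(String url) {
        URI uri = URI.create(url);

        // Host terms: stem each dot-separated label, dropping "www" and the final TLD label.
        String[] labels = uri.getHost().split("\\.");
        domainTerms = Arrays.stream(labels, 0, labels.length - 1)
                .filter(l -> !l.equalsIgnoreCase("www"))
                .map(ps::stemWord)
                .collect(Collectors.toSet());

        // Path terms: stem every alphanumeric run in the path.
        pathTerms = Arrays.stream(uri.getPath().split("[^A-Za-z0-9]+"))
                .filter(s -> !s.isBlank())
                .map(ps::stemWord)
                .collect(Collectors.toSet());
    }

    boolean containsDomain(String stemmed) { return domainTerms.contains(stemmed); }
    boolean containsUrl(String stemmed)    { return pathTerms.contains(stemmed); }
}
```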
@@ -15,7 +15,7 @@ java {

dependencies {
implementation project(':code:common:model')
implementation project(':code:crawl-models:common')
implementation project(':code:process-models:converting-model')

implementation libs.lombok
annotationProcessor libs.lombok
@@ -23,7 +23,7 @@ dependencies {
implementation libs.bundles.slf4j
implementation libs.guice
implementation libs.notnull
implementation libs.gson
implementation libs.bundles.gson
implementation libs.jsoup

testImplementation libs.bundles.slf4j.test
@@ -1,8 +1,8 @@
package nu.marginalia.pubdate;

import nu.marginalia.model.EdgeUrl;
import nu.marginalia.crawling.common.model.HtmlStandard;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.converting.model.HtmlStandard;
import org.jsoup.nodes.Document;

import java.util.Optional;
@@ -1,6 +1,6 @@
package nu.marginalia.pubdate;

import nu.marginalia.crawling.common.model.HtmlStandard;
import nu.marginalia.converting.model.HtmlStandard;
import nu.marginalia.model.crawl.PubDate;

import java.time.DateTimeException;
@@ -1,6 +1,6 @@
package nu.marginalia.pubdate;

import nu.marginalia.crawling.common.model.HtmlStandard;
import nu.marginalia.converting.model.HtmlStandard;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.pubdate.heuristic.*;
@@ -1,6 +1,6 @@
package nu.marginalia.pubdate.heuristic;

import nu.marginalia.crawling.common.model.HtmlStandard;
import nu.marginalia.converting.model.HtmlStandard;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.pubdate.PubDateHeuristic;
import nu.marginalia.pubdate.PubDateParser;
@@ -1,6 +1,6 @@
package nu.marginalia.pubdate.heuristic;

import nu.marginalia.crawling.common.model.HtmlStandard;
import nu.marginalia.converting.model.HtmlStandard;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.pubdate.PubDateEffortLevel;
import nu.marginalia.pubdate.PubDateHeuristic;
@@ -1,6 +1,6 @@
package nu.marginalia.pubdate.heuristic;

import nu.marginalia.crawling.common.model.HtmlStandard;
import nu.marginalia.converting.model.HtmlStandard;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.pubdate.PubDateEffortLevel;
import nu.marginalia.pubdate.PubDateHeuristic;
@@ -1,6 +1,6 @@
package nu.marginalia.pubdate.heuristic;

import nu.marginalia.crawling.common.model.HtmlStandard;
import nu.marginalia.converting.model.HtmlStandard;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.pubdate.PubDateHeuristic;
import nu.marginalia.pubdate.PubDateParser;
@@ -1,6 +1,6 @@
package nu.marginalia.pubdate.heuristic;

import nu.marginalia.crawling.common.model.HtmlStandard;
import nu.marginalia.converting.model.HtmlStandard;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.pubdate.PubDateHeuristic;
import nu.marginalia.pubdate.PubDateParser;
@@ -1,6 +1,6 @@
package nu.marginalia.pubdate.heuristic;

import nu.marginalia.crawling.common.model.HtmlStandard;
import nu.marginalia.converting.model.HtmlStandard;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.pubdate.PubDateHeuristic;
import nu.marginalia.pubdate.PubDateParser;
@@ -3,7 +3,7 @@ package nu.marginalia.pubdate.heuristic;
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import com.google.gson.JsonSyntaxException;
import nu.marginalia.crawling.common.model.HtmlStandard;
import nu.marginalia.converting.model.HtmlStandard;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.pubdate.PubDateHeuristic;
import nu.marginalia.pubdate.PubDateParser;
@@ -1,6 +1,6 @@
package nu.marginalia.pubdate.heuristic;

import nu.marginalia.crawling.common.model.HtmlStandard;
import nu.marginalia.converting.model.HtmlStandard;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.pubdate.PubDateHeuristic;
import nu.marginalia.pubdate.PubDateParser;
@@ -1,6 +1,6 @@
package nu.marginalia.pubdate.heuristic;

import nu.marginalia.crawling.common.model.HtmlStandard;
import nu.marginalia.converting.model.HtmlStandard;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.pubdate.PubDateHeuristic;
@@ -1,6 +1,6 @@
package nu.marginalia.pubdate.heuristic;

import nu.marginalia.crawling.common.model.HtmlStandard;
import nu.marginalia.converting.model.HtmlStandard;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.pubdate.PubDateEffortLevel;
import nu.marginalia.pubdate.PubDateHeuristic;
@@ -1,6 +1,6 @@
package nu.marginalia.pubdate.heuristic;

import nu.marginalia.crawling.common.model.HtmlStandard;
import nu.marginalia.converting.model.HtmlStandard;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.pubdate.PubDateEffortLevel;
import nu.marginalia.pubdate.PubDateHeuristic;
@@ -1,6 +1,6 @@
package nu.marginalia.pubdate.heuristic;

import nu.marginalia.crawling.common.model.HtmlStandard;
import nu.marginalia.converting.model.HtmlStandard;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.pubdate.PubDateHeuristic;
import nu.marginalia.pubdate.PubDateParser;
@@ -1,6 +1,6 @@
package nu.marginalia.pubdate.heuristic;

import nu.marginalia.crawling.common.model.HtmlStandard;
import nu.marginalia.converting.model.HtmlStandard;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.pubdate.PubDateHeuristic;
import nu.marginalia.pubdate.PubDateParser;
@@ -17,7 +17,8 @@ public class PubDateHeuristicUrlPatternPass2 implements PubDateHeuristic {
private static final Pattern yearUrlPattern = Pattern.compile("/\\d{4}/");

@Override
public Optional<PubDate> apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
public Optional<PubDate> apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url,
Document document, HtmlStandard htmlStandard) {
final String urlString = url.path;

var matcher = yearUrlPattern.matcher(urlString);
@@ -2,7 +2,7 @@ package nu.marginalia.pubdate;

import nu.marginalia.WmsaHome;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.crawling.common.model.HtmlStandard;
import nu.marginalia.converting.model.HtmlStandard;
import nu.marginalia.pubdate.heuristic.PubDateHeuristicDOMParsingPass2;
import org.jsoup.Jsoup;
import org.junit.jupiter.api.Test;
code/features-convert/readme.md (new file, 11 lines)
@@ -0,0 +1,11 @@
# Converter Features

## Major features

* [keyword-extraction](keyword-extraction/) - Identifies keywords to index in a document

## Smaller features:

* [adblock](adblock/) - Simulates Adblock
* [pubdate](pubdate/) - Determines when a document was published
* [topic-detection](topic-detection/) - Tries to identify the topic of a website
@@ -16,7 +16,7 @@ dependencies {
implementation project(':code:common:config')
implementation project(':code:features-crawl:work-log')
implementation project(':code:libraries:guarded-regex')
implementation project(':code:crawl-models:crawling-model')
implementation project(':code:process-models:crawling-model')

implementation libs.notnull
implementation libs.lombok
@@ -1,4 +1,4 @@
package nu.marginalia.crawling.common.plan;
package nu.marginalia.crawl_plan;

import com.google.errorprone.annotations.MustBeClosed;
import lombok.AllArgsConstructor;
@@ -1,4 +1,4 @@
package nu.marginalia.crawling.common.plan;
package nu.marginalia.crawl_plan;

import org.yaml.snakeyaml.Yaml;
@@ -1,4 +1,4 @@
package nu.marginalia.crawling.common.plan;
package nu.marginalia.crawl_plan;

import com.github.luben.zstd.ZstdInputStream;
import com.google.gson.Gson;
@@ -3,11 +3,6 @@
These are bits of search-engine related code that are relatively isolated pieces of business logic,
that benefit from the clarity of being kept separate from the rest of the crawling code.


* [adblock](adblock/) - Simulates Adblock
* [pubdate](pubdate/) - Determines when a document was published
* [topic-detection](topic-detection/) - Tries to identify the topic of a website

* [crawl-blocklist](crawl-blocklist/) - IP and URL blocklists
* [work-log](work-log/) - Work journal for resuming long processes
* [link-parser](link-parser/) - Code for parsing and normalizing links
@@ -16,70 +16,72 @@ public class ParamMatchingQueryFilter implements QueryFilterStepIf {

@Override
public boolean test(long docId) {
var post = forwardIndexReader.docPost(docId & 0xFFFF_FFFFL);
int urlId = (int) (docId & 0xFFFF_FFFFL);
int domainId = forwardIndexReader.getDomainId(urlId);
long meta = forwardIndexReader.getDocMeta(urlId);

if (!validateDomain(post)) {
if (!validateDomain(domainId)) {
return false;
}

if (!validateQuality(post)) {
if (!validateQuality(meta)) {
return false;
}

if (!validateYear(post)) {
if (!validateYear(meta)) {
return false;
}

if (!validateSize(post)) {
if (!validateSize(meta)) {
return false;
}

if (!validateRank(post)) {
if (!validateRank(meta)) {
return false;
}

return true;
}

private boolean validateDomain(ForwardIndexReader.DocPost post) {
return params.searchSet().contains(post.domainId());
private boolean validateDomain(int domainId) {
return params.searchSet().contains(domainId);
}

private boolean validateQuality(ForwardIndexReader.DocPost post) {
private boolean validateQuality(long meta) {
final var limit = params.qualityLimit();

if (limit.type() == SpecificationLimitType.NONE) {
return true;
}

final int quality = DocumentMetadata.decodeQuality(post.meta());
final int quality = DocumentMetadata.decodeQuality(meta);

return limit.test(quality);
}

private boolean validateYear(ForwardIndexReader.DocPost post) {
private boolean validateYear(long meta) {
if (params.year().type() == SpecificationLimitType.NONE)
return true;

int postVal = DocumentMetadata.decodeYear(post.meta());
int postVal = DocumentMetadata.decodeYear(meta);

return params.year().test(postVal);
}

private boolean validateSize(ForwardIndexReader.DocPost post) {
private boolean validateSize(long meta) {
if (params.size().type() == SpecificationLimitType.NONE)
return true;

int postVal = DocumentMetadata.decodeSize(post.meta());
int postVal = DocumentMetadata.decodeSize(meta);

return params.size().test(postVal);
}

private boolean validateRank(ForwardIndexReader.DocPost post) {
private boolean validateRank(long meta) {
if (params.rank().type() == SpecificationLimitType.NONE)
return true;

int postVal = DocumentMetadata.decodeRank(post.meta());
int postVal = DocumentMetadata.decodeRank(meta);

return params.rank().test(postVal);
}
@@ -2,7 +2,7 @@

The index journal contains a list of entries with keywords and keyword metadata per document.

This journal is written by [crawl-processes/loading-process](../../crawl-processes/loading-process) and read
This journal is written by [processes/loading-process](../../processes/loading-process) and read
when constructing the [forward](../index-forward) and [reverse](../index-reverse)
indices.

@@ -3,7 +3,7 @@
The lexicon contains a mapping for words to identifiers. This lexicon is populated from a journal.
The actual word data isn't mapped, but rather a 64 bit hash.

The lexicon is written by [crawl-processes/loading-process](../../crawl-processes/loading-process) and read when
The lexicon is written by [processes/loading-process](../../processes/loading-process) and read when
[services-core/index-service](../../services-core/index-service) interprets queries.

## Central Classes
@@ -13,6 +13,7 @@ java {
}
dependencies {
implementation project(':code:libraries:language-processing')
implementation project(':code:features-convert:keyword-extraction')
implementation project(':code:common:config')
implementation project(':code:common:model')

@@ -5,8 +5,8 @@ import lombok.EqualsAndHashCode;
import lombok.Getter;
import lombok.ToString;
import nu.marginalia.LanguageModels;
import nu.marginalia.keyword_extraction.KeywordExtractor;
import nu.marginalia.language.statistics.EnglishDictionary;
import nu.marginalia.language.keywords.KeywordExtractor;
import nu.marginalia.language.sentence.SentenceExtractor;
import nu.marginalia.language.statistics.NGramBloomFilter;
import nu.marginalia.language.statistics.TermFrequencyDict;
@@ -11,7 +11,7 @@ public class CompressedBigString implements BigString {
private final int length;
private final byte[] encoded;

private static final LZ4Factory lz4Factory = LZ4Factory.fastestInstance();;
private static final LZ4Factory lz4Factory = LZ4Factory.fastestInstance();
private static final LZ4Compressor compressor = lz4Factory.fastCompressor();
private static final LZ4FastDecompressor decompressor = lz4Factory.fastDecompressor();

code/libraries/language-processing/readme.md (new file, 19 lines)
@@ -0,0 +1,19 @@
# Language Processing

This library contains various tools used in language processing.

## Central Classes

* [SentenceExtractor](src/main/java/nu/marginalia/language/sentence/SentenceExtractor.java) -
Creates a [DocumentLanguageData](src/main/java/nu/marginalia/language/model/DocumentLanguageData.java) from a text, containing
its words, how they stem, POS tags, and so on.

* [TermFrequencyDict](src/main/java/nu/marginalia/language/statistics/TermFrequencyDict.java)
* [NGramBloomFilter](src/main/java/nu/marginalia/language/statistics/NGramBloomFilter.java)

## See Also

[features-convert/keyword-extraction](../../features-convert/keyword-extraction) uses this code to identify which keywords
are important.

[features-search/query-parser](../../features-search/query-parser) also does some language processing.
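For orientation, the tests added elsewhere in this commit drive these classes roughly as in the sketch below. The constructor and method calls mirror those tests; the `TestLanguageModels` helper is a test utility, and the exact shape of `DocumentLanguageData` is not shown in this part of the diff.

```java
// Rough usage sketch, following the test code in this commit.
var lm = TestLanguageModels.getLanguageModels();
var se = new SentenceExtractor(lm);

// Produces a DocumentLanguageData: the document's sentences with
// their words, stemmed forms and POS tags.
var dld = se.extractSentences("Caesar crossed the Rubicon.", "Julius Caesar");

// Downstream, the keyword extractors in features-convert consume it, e.g.:
var tfIdf = new WordsTfIdfCounts(new TermFrequencyDict(lm), new KeywordExtractor(), dld);
var subjects = new SubjectLikeKeywords(new KeywordExtractor(), tfIdf, dld);
```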
@@ -1,98 +0,0 @@
package nu.marginalia.language.model;

import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap;
import nu.marginalia.model.idx.WordMetadata;
import nu.marginalia.model.idx.WordFlags;

import java.util.EnumSet;
import java.util.HashSet;
import java.util.Objects;

public final class KeywordMetadata {

private static final WordFrequencyData empty = new WordFrequencyData(0);
public final HashSet<String> titleKeywords = new HashSet<>(50);
public final HashSet<String> subjectKeywords = new HashSet<>(10);
public final HashSet<String> namesKeywords = new HashSet<>(50);

public final HashSet<String> urlKeywords = new HashSet<>(10);

public final HashSet<String> domainKeywords = new HashSet<>(10);

public final Object2IntOpenHashMap<String> wordsTfIdf;
public final Object2IntOpenHashMap<String> positionMask;
private final EnumSet<WordFlags> wordFlagsTemplate;

public KeywordMetadata(EnumSet<WordFlags> flags) {
this.positionMask = new Object2IntOpenHashMap<>(10_000, 0.7f);
this.wordsTfIdf = new Object2IntOpenHashMap<>(10_000, 0.7f);
this.wordFlagsTemplate = flags;
}

public KeywordMetadata() {
this(EnumSet.noneOf(WordFlags.class));
}

public long getMetadataForWord(EnumSet<WordFlags> flagsTemplate, String stemmed) {

int tfidf = wordsTfIdf.getOrDefault(stemmed, 0);
EnumSet<WordFlags> flags = flagsTemplate.clone();

if (tfidf > 100)
flags.add(WordFlags.TfIdfHigh);

if (subjectKeywords.contains(stemmed))
flags.add(WordFlags.Subjects);

if (namesKeywords.contains(stemmed))
flags.add(WordFlags.NamesWords);

if (titleKeywords.contains(stemmed))
flags.add(WordFlags.Title);

if (urlKeywords.contains(stemmed))
flags.add(WordFlags.UrlPath);

if (domainKeywords.contains(stemmed))
flags.add(WordFlags.UrlDomain);

int positions = positionMask.getOrDefault(stemmed, 0);

return new WordMetadata(tfidf, positions, flags).encode();
}

public EnumSet<WordFlags> wordFlagsTemplate() {
return wordFlagsTemplate;
}

@Override
public boolean equals(Object obj) {
if (obj == this) return true;
if (obj == null || obj.getClass() != this.getClass()) return false;
var that = (KeywordMetadata) obj;
return Objects.equals(this.titleKeywords, that.titleKeywords) &&
Objects.equals(this.subjectKeywords, that.subjectKeywords) &&
Objects.equals(this.namesKeywords, that.namesKeywords) &&
Objects.equals(this.wordsTfIdf, that.wordsTfIdf) &&
Objects.equals(this.positionMask, that.positionMask) &&
Objects.equals(this.wordFlagsTemplate, that.wordFlagsTemplate);
}

@Override
public int hashCode() {
return Objects.hash(titleKeywords, subjectKeywords, namesKeywords, wordsTfIdf, positionMask, wordFlagsTemplate);
}

@Override
public String toString() {
return "KeywordMetadata[" +
"titleKeywords=" + titleKeywords + ", " +
"subjectKeywords=" + subjectKeywords + ", " +
"namesKeywords=" + namesKeywords + ", " +
"wordsTfIdf=" + wordsTfIdf + ", " +
"positionMask=" + positionMask + ", " +
"wordFlagsTemplate=" + wordFlagsTemplate + ']';
}


}
@@ -37,16 +37,6 @@ public class WordSpan implements Comparable<WordSpan>{

}

public boolean hasSimilarWords(DocumentSentence s, WordSpan other) {
for (int i = start; i < end; i++) {
for (int j = other.start; j < other.end; j++) {
if (s.stemmedWords[i].equals(s.stemmedWords[j]))
return true;
}
}
return false;
}

public String toString() {
return String.format("WordSpan[%s,%s]", start, end);
}
@@ -173,17 +173,7 @@ public class TermFrequencyDict {
return wordRates.get(longHash(s.getBytes()));
}

public static String getStemmedString(String s) {
String[] strings = separator.split(s);
if (s.length() > 1) {
return Arrays.stream(strings).map(ps::stemWord).collect(Collectors.joining("_"));
}
else {
return s;
}

}

// If this ever changes, we need to re-generate the term frequency dictionary
public static long longHash(byte[]... bytesSets) {
if (bytesSets == null || bytesSets.length == 0)
return 0;
@@ -17,7 +17,7 @@ dependencies {
implementation project(':code:common:service-client')
implementation project(':code:libraries:language-processing')

implementation project(':code:crawl-models:common')
implementation project(':code:features-convert:keyword-extraction')

implementation libs.lombok
annotationProcessor libs.lombok
code/process-models/converting-model/readme.md (new file, 4 lines)
@@ -0,0 +1,4 @@
# Converting Models

Contains models shared by the [converting-process](../../processes/converting-process/) and
[loading-process](../../processes/loading-process/).
@@ -1,10 +1,10 @@
package nu.marginalia.converting.instruction;

import nu.marginalia.keyword_extraction.model.DocumentKeywords;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawl.DomainIndexingState;
import nu.marginalia.model.idx.DocumentMetadata;
import nu.marginalia.converting.model.DocumentKeywords;
import nu.marginalia.converting.instruction.instructions.DomainLink;
import nu.marginalia.converting.instruction.instructions.LoadProcessedDocument;
import nu.marginalia.converting.instruction.instructions.LoadProcessedDocumentWithError;
@@ -1,6 +1,6 @@
package nu.marginalia.converting.instruction.instructions;

import nu.marginalia.converting.model.DocumentKeywords;
import nu.marginalia.keyword_extraction.model.DocumentKeywords;
import nu.marginalia.model.idx.DocumentMetadata;
import nu.marginalia.converting.instruction.Instruction;
import nu.marginalia.converting.instruction.InstructionTag;
@@ -1,6 +1,5 @@
package nu.marginalia.converting.instruction.instructions;

import nu.marginalia.crawling.common.model.HtmlStandard;
import nu.marginalia.model.crawl.UrlIndexingState;
import nu.marginalia.converting.instruction.Instruction;
import nu.marginalia.converting.instruction.InstructionTag;
@@ -14,7 +13,7 @@ public record LoadProcessedDocument(EdgeUrl url,
String title,
String description,
int htmlFeatures,
HtmlStandard standard,
String standard,
int length,
long hash,
double quality,
@@ -1,4 +1,4 @@
package nu.marginalia.crawling.common.model;
package nu.marginalia.converting.model;


public enum HtmlStandard {
@@ -1,7 +1,7 @@
# Crawling Models

Contains models shared by the [crawling-process](../../crawl-processes/crawling-process/) and
[converting-process](../../crawl-processes/converting-process/).
Contains models shared by the [crawling-process](../../processes/crawling-process/) and
[converting-process](../../processes/converting-process/).

## Central Classes
Some files were not shown because too many files have changed in this diff.