(control) New export actions for RSS/Atom feeds and term frequency data

This commit also refactors the executor a bit, and introduces a new converter feature, data-extractors, for this class of jobs.
parent: 4665af6c42
commit: c41e68aaab
@ -100,6 +100,13 @@ public class ExecutorClient extends AbstractDynamicClient {
    public void exportAtags(Context ctx, int node, String fid) {
        post(ctx, node, "/export/atags?fid="+fid, "").blockingSubscribe();
    }
    public void exportRssFeeds(Context ctx, int node, String fid) {
        post(ctx, node, "/export/feeds?fid="+fid, "").blockingSubscribe();
    }
    public void exportTermFrequencies(Context ctx, int node, String fid) {
        post(ctx, node, "/export/termfreq?fid="+fid, "").blockingSubscribe();
    }

    public void exportData(Context ctx, int node) {
        post(ctx, node, "/export/data", "").blockingSubscribe();
    }
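For orientation, here is a minimal sketch of how these new client methods end up being driven from a Spark-style control handler. It mirrors the ControlNodeActionsService change further down in this diff; the class name, handler wiring and the import path for Context are assumptions, not part of the commit.

```java
// Illustrative sketch only; mirrors ControlNodeActionsService.exportFromCrawlData below.
import nu.marginalia.client.Context;   // assumed import path for the client Context
import spark.Request;
import spark.Response;

public class ExportActionSketch {
    private final ExecutorClient executorClient;

    public ExportActionSketch(ExecutorClient executorClient) {
        this.executorClient = executorClient;
    }

    public Object exportFromCrawlData(Request req, Response rsp) {
        int node = Integer.parseInt(req.params("id"));       // node id from the route
        String source = req.queryParams("source");           // FileStorageId of the crawl data
        String exportType = req.queryParams("exportType");   // "atags", "rss" or "termFreq"

        switch (exportType) {
            case "atags"    -> executorClient.exportAtags(Context.fromRequest(req), node, source);
            case "rss"      -> executorClient.exportRssFeeds(Context.fromRequest(req), node, source);
            case "termFreq" -> executorClient.exportTermFrequencies(Context.fromRequest(req), node, source);
            default         -> rsp.status(404);
        }
        return "";
    }
}
```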
40  code/features-convert/data-extractors/build.gradle  Normal file
@ -0,0 +1,40 @@
plugins {
    id 'java'

    id "de.undercouch.download" version "5.1.0"

    id 'jvm-test-suite'
}

java {
    toolchain {
        languageVersion.set(JavaLanguageVersion.of(21))
    }
}

dependencies {
    implementation project(':code:common:config')
    implementation project(':code:common:process')
    implementation project(':code:common:model')
    implementation project(':code:libraries:language-processing')
    implementation project(':code:libraries:term-frequency-dict')
    implementation project(':code:features-crawl:link-parser')
    implementation project(':code:features-convert:anchor-keywords')
    implementation project(':code:process-models:crawling-model')
    implementation project(':code:processes:converting-process')
    implementation project(':third-party:commons-codec')

    implementation libs.bundles.slf4j
    implementation libs.guice
    implementation libs.trove
    implementation libs.commons.lang3
    implementation libs.notnull
    implementation libs.jsoup

    testImplementation libs.bundles.slf4j.test
    testImplementation libs.bundles.junit
    testImplementation libs.mockito
}
7  code/features-convert/data-extractors/readme.md  Normal file
@ -0,0 +1,7 @@
Contains converter-*like* extraction jobs that operate on crawled data to produce export files.

## Important classes

* [AtagExporter](src/main/java/nu/marginalia/extractor/AtagExporter.java) - extracts anchor texts from the crawled data.
* [FeedExporter](src/main/java/nu/marginalia/extractor/FeedExporter.java) - tries to find RSS/Atom feeds within the crawled data.
* [TermFrequencyExporter](src/main/java/nu/marginalia/extractor/TermFrequencyExporter.java) - exports the 'TF' part of TF-IDF.
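The three exporters listed above all follow the same shape: read a crawl data storage area, write a result file into an export storage area. A rough skeleton of that shared pattern, using a hypothetical ExampleExporter that is not part of the commit (the surrounding calls are the ones AtagExporter and FeedExporter below actually use):

```java
package nu.marginalia.extractor;

// Hypothetical skeleton illustrating the pattern shared by the exporters in this module.
import com.google.inject.Inject;
import nu.marginalia.process.log.WorkLog;
import nu.marginalia.storage.FileStorageService;
import nu.marginalia.storage.model.FileStorage;
import nu.marginalia.storage.model.FileStorageId;

import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardCopyOption;

public class ExampleExporter implements ExporterIf {
    private final FileStorageService storageService;

    @Inject
    public ExampleExporter(FileStorageService storageService) {
        this.storageService = storageService;
    }

    @Override
    public void export(FileStorageId crawlId, FileStorageId destId) throws Exception {
        Path inputDir = storageService.getStorage(crawlId).asPath();   // crawl data to read
        FileStorage destStorage = storageService.getStorage(destId);   // export area to write into

        // Work against a temp file in the destination area, then move it into place atomically
        Path tmpFile = Files.createTempFile(destStorage.asPath(), "example", ".dat");
        try {
            // The crawler.log work log enumerates the per-domain crawl data files
            for (var item : WorkLog.iterable(inputDir.resolve("crawler.log"))) {
                Path crawlDataPath = inputDir.resolve(item.relPath());
                // ... open crawlDataPath with CrawledDomainReader and append output to tmpFile ...
            }

            Files.move(tmpFile, destStorage.asPath().resolve("example.dat"),
                    StandardCopyOption.ATOMIC_MOVE, StandardCopyOption.REPLACE_EXISTING);
        }
        finally {
            Files.deleteIfExists(tmpFile);
        }
    }
}
```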
@ -0,0 +1,196 @@
|
||||
package nu.marginalia.extractor;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import gnu.trove.set.hash.TLongHashSet;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.crawling.io.CrawledDomainReader;
|
||||
import nu.marginalia.crawling.io.SerializableCrawlDataStream;
|
||||
import nu.marginalia.crawling.model.CrawledDocument;
|
||||
import nu.marginalia.hash.MurmurHash3_128;
|
||||
import nu.marginalia.link_parser.LinkParser;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import nu.marginalia.process.log.WorkLog;
|
||||
import nu.marginalia.storage.FileStorageService;
|
||||
import nu.marginalia.storage.model.FileStorage;
|
||||
import nu.marginalia.storage.model.FileStorageId;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.jsoup.Jsoup;
|
||||
|
||||
import java.io.BufferedWriter;
|
||||
import java.io.IOException;
|
||||
import java.io.OutputStreamWriter;
|
||||
import java.net.URISyntaxException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.StandardCopyOption;
|
||||
import java.nio.file.StandardOpenOption;
|
||||
import java.nio.file.attribute.PosixFilePermissions;
|
||||
import java.util.Objects;
|
||||
import java.util.zip.GZIPOutputStream;
|
||||
|
||||
public class AtagExporter implements ExporterIf {
|
||||
private static final LinkParser linkParser = new LinkParser();
|
||||
private static final MurmurHash3_128 hash = new MurmurHash3_128();
|
||||
private final FileStorageService storageService;
|
||||
|
||||
@Inject
|
||||
public AtagExporter(FileStorageService storageService) {
|
||||
this.storageService = storageService;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void export(FileStorageId crawlId, FileStorageId destId) throws Exception {
|
||||
FileStorage destStorage = storageService.getStorage(destId);
|
||||
|
||||
var tmpFile = Files.createTempFile(destStorage.asPath(), "atags", ".csv.gz",
|
||||
PosixFilePermissions.asFileAttribute(PosixFilePermissions.fromString("rw-r--r--")));
|
||||
|
||||
Path inputDir = storageService.getStorage(crawlId).asPath();
|
||||
|
||||
try (var bw = new BufferedWriter(new OutputStreamWriter(new GZIPOutputStream(Files.newOutputStream(tmpFile, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING))));
|
||||
)
|
||||
{
|
||||
Path crawlerLogFile = inputDir.resolve("crawler.log");
|
||||
|
||||
var tagWriter = new ATagCsvWriter(bw);
|
||||
|
||||
for (var item : WorkLog.iterable(crawlerLogFile)) {
|
||||
if (Thread.interrupted()) {
|
||||
throw new InterruptedException();
|
||||
}
|
||||
|
||||
Path crawlDataPath = inputDir.resolve(item.relPath());
|
||||
try (var stream = CrawledDomainReader.createDataStream(CrawledDomainReader.CompatibilityLevel.FAST, crawlDataPath)) {
|
||||
exportLinks(tagWriter, stream);
|
||||
}
|
||||
catch (Exception ex) {
|
||||
ex.printStackTrace();
|
||||
}
|
||||
}
|
||||
|
||||
            Files.move(tmpFile, destStorage.asPath().resolve("atags.csv.gz"), StandardCopyOption.ATOMIC_MOVE, StandardCopyOption.REPLACE_EXISTING);
        }
        catch (Exception ex) {
            // Surface failures (e.g. from the final move) rather than swallowing them silently
            ex.printStackTrace();
        }
        finally {
            Files.deleteIfExists(tmpFile);
        }
    }
|
||||
|
||||
|
||||
private boolean exportLinks(ATagCsvWriter exporter, SerializableCrawlDataStream stream) throws IOException, URISyntaxException {
|
||||
ATagLinkFilter linkFilter = new ATagLinkFilter();
|
||||
|
||||
while (stream.hasNext()) {
|
||||
if (!(stream.next() instanceof CrawledDocument doc))
|
||||
continue;
|
||||
if (null == doc.documentBody)
|
||||
continue;
|
||||
|
||||
var baseUrl = new EdgeUrl(doc.url);
|
||||
var parsed = Jsoup.parse(doc.documentBody);
|
||||
|
||||
for (var atag : parsed.getElementsByTag("a")) {
|
||||
String linkText = atag.text();
|
||||
|
||||
if (!linkFilter.isLinkTextEligible(linkText)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
var linkOpt = linkParser.parseLinkPermissive(baseUrl, atag);
|
||||
linkOpt
|
||||
.filter(url -> linkFilter.isEligible(url, baseUrl, linkText))
|
||||
.ifPresent(url -> exporter.accept(url, baseUrl.domain, linkText));
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
private static class ATagLinkFilter {
|
||||
private final TLongHashSet hashes = new TLongHashSet();
|
||||
|
||||
private boolean isLinkTextEligible(String linkText) {
|
||||
// Filter out the most obviously uninteresting anchor texts
|
||||
|
||||
if (linkText.isBlank())
|
||||
return false;
|
||||
if (linkText.startsWith("this"))
|
||||
return false;
|
||||
if (linkText.equalsIgnoreCase("here"))
|
||||
return false;
|
||||
if (linkText.equalsIgnoreCase("click here"))
|
||||
return false;
|
||||
|
||||
if (!StringUtils.isAsciiPrintable(linkText)) // This also filters out newlines, a good thing!
|
||||
return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
private boolean isEligible(EdgeUrl url, EdgeUrl baseUrl, String linkText) {
|
||||
if (!"http".equals(url.proto) && !"https".equals(url.proto))
|
||||
return false;
|
||||
|
||||
// This is an artifact of the link parser typically
|
||||
if ("example.com".equals(url.domain.topDomain))
|
||||
return false;
|
||||
|
||||
if (linkText.contains(url.domain.toString()))
|
||||
return false;
|
||||
if (Objects.equals(url.domain, baseUrl.domain))
|
||||
return false;
|
||||
|
||||
String urlString = url.toString();
|
||||
if (!StringUtils.isAsciiPrintable(urlString)) { // This also filters out newlines, a good thing!
|
||||
return false;
|
||||
}
|
||||
|
||||
            // Deduplicate by hash; we've already checked that the strings are ASCII printable so we don't
            // need to be concerned about using the fast ASCII hash.
            // add() returns false when this (text, url) pair has been seen before, so reject it in that case.
            if (!hashes.add(hash.hashLowerBytes(linkText) ^ hash.hashLowerBytes(urlString))) {
                return false;
            }
|
||||
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private static class ATagCsvWriter {
|
||||
private final BufferedWriter writer;
|
||||
|
||||
private ATagCsvWriter(BufferedWriter writer) {
|
||||
this.writer = writer;
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
public void accept(EdgeUrl url, EdgeDomain sourceDomain, String linkText) {
|
||||
final String urlString = urlWithNoSchema(url);
|
||||
|
||||
writer.write(String.format("\"%s\",\"%s\",\"%s\"\n",
|
||||
csvify(urlString),
|
||||
csvify(linkText),
|
||||
csvify(sourceDomain)));
|
||||
}
|
||||
|
||||
private static String urlWithNoSchema(EdgeUrl url) {
|
||||
StringBuilder sb = new StringBuilder();
|
||||
|
||||
sb.append(url.domain).append(url.path);
|
||||
|
||||
if (url.param != null)
|
||||
sb.append('?').append(url.param);
|
||||
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
private static String csvify(Object field) {
|
||||
return field.toString().replace("\"", "\"\"");
|
||||
}
|
||||
|
||||
}
|
||||
}
|
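ATagCsvWriter above defines the on-disk format of atags.csv.gz: three quoted fields per row (URL without schema, link text, source domain), with embedded quotes doubled. A hedged sketch of a consumer follows; it is not part of the commit, and it assumes no field contains the literal sequence "," between quotes, so a real consumer should prefer a proper CSV parser. The feeds.csv.gz produced by FeedExporter further down can be read the same way.

```java
// Sketch of a reader for atags.csv.gz; not part of the commit.
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.zip.GZIPInputStream;

public class AtagsCsvReaderSketch {
    public static void main(String[] args) throws Exception {
        Path csvFile = Path.of(args[0]); // e.g. .../atags.csv.gz in the export storage area

        try (var reader = new BufferedReader(new InputStreamReader(
                new GZIPInputStream(Files.newInputStream(csvFile)), StandardCharsets.UTF_8)))
        {
            String line;
            while ((line = reader.readLine()) != null) {
                if (line.length() < 2)
                    continue;

                // Each row is "url","link text","source domain"; strip outer quotes, split on ","
                String[] fields = line.substring(1, line.length() - 1).split("\",\"", -1);
                if (fields.length != 3)
                    continue;

                String url = fields[0].replace("\"\"", "\"");
                String linkText = fields[1].replace("\"\"", "\"");
                String sourceDomain = fields[2].replace("\"\"", "\"");

                System.out.printf("%s -> %s (%s)%n", sourceDomain, url, linkText);
            }
        }
    }
}
```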
@ -0,0 +1,7 @@
package nu.marginalia.extractor;

import nu.marginalia.storage.model.FileStorageId;

public interface ExporterIf {
    void export(FileStorageId crawlId, FileStorageId destId) throws Exception;
}
@ -0,0 +1,131 @@
|
||||
package nu.marginalia.extractor;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.crawling.io.CrawledDomainReader;
|
||||
import nu.marginalia.crawling.io.SerializableCrawlDataStream;
|
||||
import nu.marginalia.crawling.model.CrawledDocument;
|
||||
import nu.marginalia.link_parser.FeedExtractor;
|
||||
import nu.marginalia.link_parser.LinkParser;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import nu.marginalia.process.log.WorkLog;
|
||||
import nu.marginalia.storage.FileStorageService;
|
||||
import nu.marginalia.storage.model.FileStorage;
|
||||
import nu.marginalia.storage.model.FileStorageId;
|
||||
import org.jsoup.Jsoup;
|
||||
|
||||
import java.io.BufferedWriter;
|
||||
import java.io.IOException;
|
||||
import java.io.OutputStreamWriter;
|
||||
import java.net.URISyntaxException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.StandardCopyOption;
|
||||
import java.nio.file.StandardOpenOption;
|
||||
import java.nio.file.attribute.PosixFilePermissions;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Comparator;
|
||||
import java.util.List;
|
||||
import java.util.zip.GZIPOutputStream;
|
||||
|
||||
public class FeedExporter implements ExporterIf {
|
||||
private final FileStorageService storageService;
|
||||
|
||||
|
||||
@Inject
|
||||
public FeedExporter(FileStorageService storageService) {
|
||||
this.storageService = storageService;
|
||||
}
|
||||
|
||||
public void export(FileStorageId crawlId, FileStorageId destId) throws Exception {
|
||||
FileStorage destStorage = storageService.getStorage(destId);
|
||||
|
||||
var tmpFile = Files.createTempFile(destStorage.asPath(), "feeds", ".csv.gz",
|
||||
PosixFilePermissions.asFileAttribute(PosixFilePermissions.fromString("rw-r--r--")));
|
||||
|
||||
Path inputDir = storageService.getStorage(crawlId).asPath();
|
||||
|
||||
try (var bw = new BufferedWriter(new OutputStreamWriter(new GZIPOutputStream(Files.newOutputStream(tmpFile, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING)))))
|
||||
{
|
||||
Path crawlerLogFile = inputDir.resolve("crawler.log");
|
||||
|
||||
var tagWriter = new FeedCsvWriter(bw);
|
||||
|
||||
for (var item : WorkLog.iterable(crawlerLogFile)) {
|
||||
if (Thread.interrupted()) {
|
||||
throw new InterruptedException();
|
||||
}
|
||||
|
||||
Path crawlDataPath = inputDir.resolve(item.relPath());
|
||||
try (var stream = CrawledDomainReader.createDataStream(CrawledDomainReader.CompatibilityLevel.COMPATIBLE, crawlDataPath)) {
|
||||
exportFeeds(tagWriter, stream);
|
||||
}
|
||||
catch (Exception ex) {
|
||||
ex.printStackTrace();
|
||||
}
|
||||
}
|
||||
|
||||
Files.move(tmpFile, destStorage.asPath().resolve("feeds.csv.gz"), StandardCopyOption.ATOMIC_MOVE, StandardCopyOption.REPLACE_EXISTING);
|
||||
}
|
||||
finally {
|
||||
Files.deleteIfExists(tmpFile);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private boolean exportFeeds(FeedCsvWriter exporter, SerializableCrawlDataStream stream) throws IOException, URISyntaxException {
|
||||
FeedExtractor feedExtractor = new FeedExtractor(new LinkParser());
|
||||
|
||||
int size = stream.sizeHint();
|
||||
|
||||
while (stream.hasNext()) {
|
||||
if (!(stream.next() instanceof CrawledDocument doc))
|
||||
continue;
|
||||
if (null == doc.documentBody)
|
||||
continue;
|
||||
|
||||
var baseUrl = new EdgeUrl(doc.url);
|
||||
var parsed = Jsoup.parse(doc.documentBody);
|
||||
|
||||
List<EdgeUrl> feedUrls = new ArrayList<>();
|
||||
for (var link : parsed.select("link[rel=alternate]")) {
|
||||
feedExtractor
|
||||
.getFeedFromAlternateTag(baseUrl, link)
|
||||
.ifPresent(feedUrls::add);
|
||||
}
|
||||
|
||||
// Take the shortest path if there are multiple
|
||||
if (!feedUrls.isEmpty()) {
|
||||
feedUrls.sort(Comparator.comparing(url -> url.path.length()));
|
||||
exporter.accept(baseUrl.domain, size, feedUrls.getFirst());
|
||||
}
|
||||
|
||||
// Only consider the first viable document, otherwise this will be very slow
|
||||
break;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
private static class FeedCsvWriter {
|
||||
private final BufferedWriter writer;
|
||||
|
||||
private FeedCsvWriter(BufferedWriter writer) {
|
||||
this.writer = writer;
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
public void accept(EdgeDomain domain, int size, EdgeUrl path) {
|
||||
writer.write(String.format("\"%s\",\"%s\",\"%s\"\n",
|
||||
csvify(domain),
|
||||
csvify(size),
|
||||
csvify(path)));
|
||||
}
|
||||
|
||||
private static String csvify(Object field) {
|
||||
return field.toString().replace("\"", "\"\"");
|
||||
}
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,147 @@
|
||||
package nu.marginalia.extractor;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import gnu.trove.map.hash.TLongIntHashMap;
|
||||
import gnu.trove.set.hash.TLongHashSet;
|
||||
import nu.marginalia.WmsaHome;
|
||||
import nu.marginalia.converting.processor.logic.dom.DomPruningFilter;
|
||||
import nu.marginalia.crawling.io.CrawledDomainReader;
|
||||
import nu.marginalia.crawling.model.CrawledDocument;
|
||||
import nu.marginalia.language.filter.LanguageFilter;
|
||||
import nu.marginalia.language.model.DocumentLanguageData;
|
||||
import nu.marginalia.language.sentence.SentenceExtractor;
|
||||
import nu.marginalia.process.log.WorkLog;
|
||||
import nu.marginalia.storage.FileStorageService;
|
||||
import nu.marginalia.storage.model.FileStorage;
|
||||
import nu.marginalia.storage.model.FileStorageId;
|
||||
import org.jsoup.Jsoup;
|
||||
import org.jsoup.nodes.Document;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.DataOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.StandardCopyOption;
|
||||
import java.nio.file.attribute.PosixFilePermissions;
|
||||
import java.util.concurrent.ForkJoinPool;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
|
||||
import static nu.marginalia.term_frequency_dict.TermFrequencyDict.DOC_COUNT_KEY;
|
||||
import static nu.marginalia.term_frequency_dict.TermFrequencyDict.longHash;
|
||||
|
||||
public class TermFrequencyExporter implements ExporterIf {
|
||||
private final FileStorageService storageService;
|
||||
private final LanguageFilter lf = new LanguageFilter(WmsaHome.getLanguageModels());
|
||||
private static final Logger logger = LoggerFactory.getLogger(TermFrequencyExporter.class);
|
||||
|
||||
@Inject
|
||||
public TermFrequencyExporter(FileStorageService storageService) {
|
||||
this.storageService = storageService;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void export(FileStorageId crawlId, FileStorageId destId) throws Exception {
|
||||
Path inputDir = storageService.getStorage(crawlId).asPath();
|
||||
FileStorage destStorage = storageService.getStorage(destId);
|
||||
|
||||
ThreadLocal<SentenceExtractor> se = ThreadLocal.withInitial(() -> new SentenceExtractor(WmsaHome.getLanguageModels()));
|
||||
|
||||
TLongIntHashMap counts = new TLongIntHashMap(100_000_000, 0.7f, -1, -1);
|
||||
AtomicInteger docCount = new AtomicInteger();
|
||||
|
||||
try (ForkJoinPool fjp = new ForkJoinPool(Math.max(2, Runtime.getRuntime().availableProcessors() / 2))) {
|
||||
|
||||
Path crawlerLogFile = inputDir.resolve("crawler.log");
|
||||
|
||||
for (var item : WorkLog.iterable(crawlerLogFile)) {
|
||||
if (Thread.interrupted()) {
|
||||
fjp.shutdownNow();
|
||||
|
||||
throw new InterruptedException();
|
||||
}
|
||||
|
||||
Path crawlDataPath = inputDir.resolve(item.relPath());
|
||||
fjp.execute(() -> processFile(crawlDataPath, counts, docCount, se.get()));
|
||||
}
|
||||
|
||||
while (!fjp.isQuiescent()) {
|
||||
if (fjp.awaitQuiescence(10, TimeUnit.SECONDS))
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
var tmpFile = Files.createTempFile(destStorage.asPath(), "freqs", ".dat.tmp",
|
||||
PosixFilePermissions.asFileAttribute(PosixFilePermissions.fromString("rw-r--r--")));
|
||||
|
||||
try (var dos = new DataOutputStream(Files.newOutputStream(tmpFile))) {
|
||||
synchronized (counts) {
|
||||
counts.put(DOC_COUNT_KEY, docCount.get());
|
||||
|
||||
counts.forEachEntry((hash, cnt) -> {
|
||||
try {
|
||||
dos.writeLong(hash);
|
||||
dos.writeLong(cnt);
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
return true;
|
||||
});
|
||||
}
|
||||
Files.move(tmpFile, destStorage.asPath().resolve("freqs.dat"), StandardCopyOption.ATOMIC_MOVE, StandardCopyOption.REPLACE_EXISTING);
|
||||
}
|
||||
catch (Exception ex) {
|
||||
logger.error("Error writing file {}", tmpFile, ex);
|
||||
Files.deleteIfExists(tmpFile);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private void processFile(Path crawlDataPath, TLongIntHashMap counts, AtomicInteger docCount, SentenceExtractor se) {
|
||||
TLongHashSet words = new TLongHashSet(10_000);
|
||||
try (var stream = CrawledDomainReader.createDataStream(CrawledDomainReader.CompatibilityLevel.FAST, crawlDataPath)) {
|
||||
while (stream.hasNext()) {
|
||||
if (Thread.interrupted())
|
||||
return;
|
||||
|
||||
if (!(stream.next() instanceof CrawledDocument doc)) continue;
|
||||
if (doc.documentBody == null) continue;
|
||||
if (!doc.contentType.startsWith("text/html"))
|
||||
continue;
|
||||
|
||||
docCount.incrementAndGet();
|
||||
|
||||
Document parsed = Jsoup.parse(doc.documentBody);
|
||||
parsed.body().filter(new DomPruningFilter(0.5));
|
||||
|
||||
DocumentLanguageData dld = se.extractSentences(parsed);
|
||||
|
||||
if (lf.dictionaryAgreement(dld) < 0.1) {
|
||||
return;
|
||||
}
|
||||
|
||||
for (var sent : dld.sentences) {
|
||||
for (var word : sent) {
|
||||
words.add(longHash(word.stemmed().getBytes(StandardCharsets.UTF_8)));
|
||||
}
|
||||
}
|
||||
|
||||
synchronized (counts) {
|
||||
words.forEach(w -> {
|
||||
counts.adjustOrPutValue(w, 1, 1);
|
||||
return true;
|
||||
});
|
||||
}
|
||||
|
||||
words.clear();
|
||||
}
|
||||
}
|
||||
catch (Exception ex) {
|
||||
logger.error("Error processing file {}", crawlDataPath, ex);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
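The freqs.dat file written above is a flat sequence of (hash, count) pairs, each written as a long, with the DOC_COUNT_KEY entry carrying the total document count. Since each term is counted at most once per document, the counts are effectively document frequencies. A hedged sketch of reading the file back and deriving an IDF-style weight (not part of the commit; the term name used is a placeholder and is assumed to already be stemmed, like the exporter's input):

```java
// Sketch of a reader for freqs.dat; not part of the commit.
import gnu.trove.map.hash.TLongIntHashMap;

import java.io.DataInputStream;
import java.io.EOFException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;

import static nu.marginalia.term_frequency_dict.TermFrequencyDict.DOC_COUNT_KEY;
import static nu.marginalia.term_frequency_dict.TermFrequencyDict.longHash;

public class TermFreqReaderSketch {
    public static void main(String[] args) throws Exception {
        Path freqFile = Path.of(args[0]); // e.g. .../freqs.dat in the export storage area

        TLongIntHashMap counts = new TLongIntHashMap(100_000, 0.7f, -1, 0);
        try (var dis = new DataInputStream(Files.newInputStream(freqFile))) {
            for (;;) {
                long hash;
                try {
                    hash = dis.readLong();
                } catch (EOFException end) {
                    break;
                }
                long cnt = dis.readLong();   // the exporter writes counts as longs
                counts.put(hash, (int) cnt);
            }
        }

        int docCount = counts.get(DOC_COUNT_KEY);

        // IDF-style weight: rarer terms get larger weights
        String stemmedTerm = "exampl";       // placeholder, assumed pre-stemmed
        long termHash = longHash(stemmedTerm.getBytes(StandardCharsets.UTF_8));
        int termDocs = Math.max(1, counts.get(termHash));
        double idf = Math.log((double) docCount / termDocs);

        System.out.printf("docs=%d, df(%s)=%d, idf=%.3f%n", docCount, stemmedTerm, termDocs, idf);
    }
}
```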
@ -1,4 +1,4 @@
|
||||
package nu.marginalia.converting.processor.logic;
|
||||
package nu.marginalia.link_parser;
|
||||
|
||||
import nu.marginalia.link_parser.LinkParser;
|
||||
import nu.marginalia.model.EdgeUrl;
|
@ -1,4 +1,4 @@
|
||||
package nu.marginalia.converting.language;
|
||||
package nu.marginalia.language.filter;
|
||||
|
||||
import com.github.jfasttext.JFastText;
|
||||
import nu.marginalia.LanguageModels;
|
@ -1,4 +1,4 @@
|
||||
package nu.marginalia.converting.language;
|
||||
package nu.marginalia.language.filter;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.LanguageModels;
|
@ -1,4 +1,4 @@
|
||||
package nu.marginalia.converting.language;
|
||||
package nu.marginalia.language.filter;
|
||||
|
||||
import nu.marginalia.language.model.DocumentLanguageData;
|
||||
|
@ -1,4 +1,4 @@
|
||||
package nu.marginalia.converting.language;
|
||||
package nu.marginalia.language.filter;
|
||||
|
||||
import nu.marginalia.language.model.DocumentLanguageData;
|
||||
|
@ -1,6 +1,5 @@
|
||||
package nu.marginalia.converting.language;
|
||||
package nu.marginalia.language.filter;
|
||||
|
||||
import nu.marginalia.converting.util.TestLanguageModels;
|
||||
import org.jsoup.Jsoup;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
@ -0,0 +1,38 @@
|
||||
package nu.marginalia.language.filter;
|
||||
|
||||
import nu.marginalia.LanguageModels;
|
||||
import nu.marginalia.WmsaHome;
|
||||
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.Optional;
|
||||
|
||||
public class TestLanguageModels {
|
||||
private static final Path LANGUAGE_MODELS_DEFAULT = WmsaHome.getHomePath().resolve("model");
|
||||
|
||||
public static Path getLanguageModelsPath() {
|
||||
final Path languageModelsHome = Optional.ofNullable(System.getenv("LANGUAGE_MODELS_HOME"))
|
||||
.map(Path::of)
|
||||
.orElse(LANGUAGE_MODELS_DEFAULT);
|
||||
|
||||
if (!Files.isDirectory(languageModelsHome)) {
|
||||
throw new IllegalStateException("Could not find $LANGUAGE_MODELS_HOME, see doc/language-models.md");
|
||||
}
|
||||
return languageModelsHome;
|
||||
}
|
||||
|
||||
public static LanguageModels getLanguageModels() {
|
||||
|
||||
var languageModelsHome = getLanguageModelsPath();
|
||||
|
||||
return new LanguageModels(
|
||||
languageModelsHome.resolve("ngrams.bin"),
|
||||
languageModelsHome.resolve("tfreq-new-algo3.bin"),
|
||||
languageModelsHome.resolve("opennlp-sentence.bin"),
|
||||
languageModelsHome.resolve("English.RDR"),
|
||||
languageModelsHome.resolve("English.DICT"),
|
||||
languageModelsHome.resolve("opennlp-tokens.bin"),
|
||||
languageModelsHome.resolve("lid.176.ftz")
|
||||
);
|
||||
}
|
||||
}
|
@ -2,7 +2,7 @@ package nu.marginalia.converting.processor.plugin;
|
||||
|
||||
import nu.marginalia.converting.processor.DocumentClass;
|
||||
import nu.marginalia.crawling.model.CrawledDocument;
|
||||
import nu.marginalia.converting.language.LanguageFilter;
|
||||
import nu.marginalia.language.filter.LanguageFilter;
|
||||
import nu.marginalia.language.model.DocumentLanguageData;
|
||||
import nu.marginalia.model.html.HtmlStandard;
|
||||
import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
|
||||
|
@ -2,7 +2,7 @@ package nu.marginalia.converting.processor.plugin;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.name.Named;
|
||||
import nu.marginalia.converting.language.LanguageFilter;
|
||||
import nu.marginalia.language.filter.LanguageFilter;
|
||||
import nu.marginalia.converting.model.GeneratorType;
|
||||
import nu.marginalia.converting.processor.DocumentClass;
|
||||
import nu.marginalia.converting.processor.MetaRobotsTag;
|
||||
@ -12,6 +12,7 @@ import nu.marginalia.converting.processor.logic.links.LinkProcessor;
|
||||
import nu.marginalia.converting.processor.plugin.specialization.*;
|
||||
import nu.marginalia.language.model.DocumentLanguageData;
|
||||
import nu.marginalia.language.sentence.ThreadLocalSentenceExtractorProvider;
|
||||
import nu.marginalia.link_parser.FeedExtractor;
|
||||
import nu.marginalia.model.crawl.HtmlFeature;
|
||||
import nu.marginalia.link_parser.LinkParser;
|
||||
import nu.marginalia.crawling.model.CrawledDocument;
|
||||
|
@ -2,7 +2,7 @@ package nu.marginalia.converting.processor.plugin;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.name.Named;
|
||||
import nu.marginalia.converting.language.LanguageFilter;
|
||||
import nu.marginalia.language.filter.LanguageFilter;
|
||||
import nu.marginalia.converting.processor.DocumentClass;
|
||||
import nu.marginalia.converting.processor.logic.DocumentLengthLogic;
|
||||
import nu.marginalia.crawling.model.CrawledDocument;
|
||||
|
@ -79,8 +79,11 @@ public class ControlNodeActionsService {
|
||||
Spark.post("/public/nodes/:id/actions/new-crawl-specs", this::createNewSpecsAction,
|
||||
redirectControl.renderRedirectAcknowledgement("Creating", "../actions?view=new-crawl")
|
||||
);
|
||||
Spark.post("/public/nodes/:id/actions/export-data", this::exportData,
|
||||
redirectControl.renderRedirectAcknowledgement("Exporting", "../storage/exports")
|
||||
Spark.post("/public/nodes/:id/actions/export-db-data", this::exportDbData,
|
||||
redirectControl.renderRedirectAcknowledgement("Exporting", "..")
|
||||
);
|
||||
Spark.post("/public/nodes/:id/actions/export-from-crawl-data", this::exportFromCrawlData,
|
||||
redirectControl.renderRedirectAcknowledgement("Exporting", "..")
|
||||
);
|
||||
}
|
||||
|
||||
@ -233,8 +236,29 @@ public class ControlNodeActionsService {
|
||||
return "";
|
||||
}
|
||||
|
||||
private Object exportData(Request req, Response rsp) {
|
||||
private Object exportDbData(Request req, Response rsp) {
|
||||
executorClient.exportData(Context.fromRequest(req), Integer.parseInt(req.params("id")));
|
||||
|
||||
return "";
|
||||
}
|
||||
|
||||
private Object exportFromCrawlData(Request req, Response rsp) {
|
||||
String exportType = req.queryParams("exportType");
|
||||
String source = req.queryParams("source");
|
||||
|
||||
if (exportType.equals("atags")) {
|
||||
executorClient.exportAtags(Context.fromRequest(req), Integer.parseInt(req.params("id")), source);
|
||||
}
|
||||
else if (exportType.equals("rss")) {
|
||||
executorClient.exportRssFeeds(Context.fromRequest(req), Integer.parseInt(req.params("id")), source);
|
||||
}
|
||||
else if (exportType.equals("termFreq")) {
|
||||
executorClient.exportTermFrequencies(Context.fromRequest(req), Integer.parseInt(req.params("id")), source);
|
||||
}
|
||||
else {
|
||||
rsp.status(404);
|
||||
}
|
||||
|
||||
return "";
|
||||
}
|
||||
}
|
||||
|
@ -97,9 +97,6 @@ public class ControlNodeService {
|
||||
Spark.post("/public/nodes/:id/storage/reset-state/:fid", this::resetState,
|
||||
redirectControl.renderRedirectAcknowledgement("Restoring", "..")
|
||||
);
|
||||
Spark.post("/public/nodes/:id/storage/:fid/export-atags", this::exportAtags,
|
||||
redirectControl.renderRedirectAcknowledgement("Exporting", "../../storage/exports")
|
||||
);
|
||||
Spark.post("/public/nodes/:id/fsms/:fsm/start", this::startFsm);
|
||||
Spark.post("/public/nodes/:id/fsms/:fsm/stop", this::stopFsm);
|
||||
}
|
||||
@ -109,11 +106,6 @@ public class ControlNodeService {
|
||||
return "";
|
||||
}
|
||||
|
||||
private Object exportAtags(Request req, Response rsp) {
|
||||
executorClient.exportAtags(Context.fromRequest(req), Integer.parseInt(req.params("id")), req.params("fid"));
|
||||
return "";
|
||||
}
|
||||
|
||||
public Object startFsm(Request req, Response rsp) throws Exception {
|
||||
executorClient.startFsm(Context.fromRequest(req), Integer.parseInt(req.params("id")), req.params("fsm").toUpperCase());
|
||||
|
||||
|
@ -1,11 +1,11 @@
|
||||
<h1 class="my-3">Export Data</h1>
|
||||
<h1 class="my-3">Create Data Export</h1>
|
||||
|
||||
<div class="my-3 p-3 border bg-light">
|
||||
This will export database data: Domains, blacklist and domain links. The exported data will be saved as
|
||||
a new <a href="/nodes/{{node.id}}/storage/exports">exports storage object</a>.
|
||||
</div>
|
||||
|
||||
<form method="post" action="actions/export-data" onsubmit="return confirm('Confirm export')">
|
||||
<form method="post" action="actions/export-db-data" onsubmit="return confirm('Confirm export')">
|
||||
<div class="my-3 py-3">
|
||||
<div class="row">
|
||||
<div class="col">
|
@ -0,0 +1,78 @@
<h1 class="my-3">Export From Crawl Data</h1>

<div class="my-3 p-3 border bg-light">
    This will run an extraction job against a crawl data set.  The generated data will be available as
    an <a href="/nodes/{{node.id}}/storage/exports">export object</a>.
</div>

<form method="post" action="actions/export-from-crawl-data" onsubmit="return confirm('Confirm export')">
    <h2>Select a source</h2>

    <table class="table">
        <tr>
            <th>Use</th>
            <th>Path</th>
            <th>Description</th>
            <th>Details</th>
        </tr>
        {{#each allCrawlData}}
        <tr>
            <td><input {{#if active}}checked{{/if}} {{#if new}}disabled{{/if}} {{#if delete}}disabled{{/if}} class="form-check-input" type="radio" name="source" id="{{id}}" value="{{id}}"></td>
            <td><label for="{{id}}" class="form-check-label">{{path}}</label></td>
            <td>{{description}}
                <span class="text-danger">{{#if new}}[CREATING]{{/if}}</span>
                <span class="text-danger">{{#if delete}}[DELETING]{{/if}}</span>
            </td>
            <td><a href="/nodes/{{node}}/storage/details?fid={{id}}">[Details]</a></td>
        </tr>
        {{/each}}
    </table>

    <h2>Select the export operation to run</h2>
    <div class="form-check">
        <input class="form-check-input" type="radio" name="exportType" id="exportTypeAtags" value="atags">
        <label class="form-check-label" for="exportTypeAtags">
            Extract anchor texts
        </label>
        <div>
            <small class="text-muted">
                Creates a CSV file with information related to external anchor tags.  External anchor tags can be
                used to improve search result accuracy, since they often describe what they are linking to better
                than the destination page itself does.
            </small>
        </div>
    </div>
    <div class="form-check">
        <input class="form-check-input" type="radio" name="exportType" id="exportTypeRSS" value="rss">
        <label class="form-check-label" for="exportTypeRSS">
            Extract RSS feeds
        </label>
        <div>
            <small class="text-muted">
                Runs a best-effort attempt at extracting RSS and Atom feeds from the crawl data.  The operation
                only considers the root document of each domain in the crawl data set, and only keeps the shortest
                feed URL found in that document.  The result is a CSV file containing the domain name, the number
                of documents in the crawl data for that domain, and the feed URL.
            </small>
        </div>
    </div>
    <div class="form-check">
        <input class="form-check-input" type="radio" name="exportType" id="exportTypeTermFreq" value="termFreq">
        <label class="form-check-label" for="exportTypeTermFreq">
            Extract term frequency data
        </label>
        <div>
            <small class="text-muted">
                Creates a binary data file consisting of term hashes and frequencies.  This is the TF side of TF-IDF,
                and is used to evaluate the importance of a term in relation to how frequently it occurs in the
                crawled documents.
            </small>
        </div>
    </div>
    <div class="my-3 py-3">
        <div class="row">
            <div class="col">
                <button type="submit" class="btn btn-primary">Export</button>
            </div>
        </div>
    </div>
</form>
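For completeness, a hedged sketch of driving this form's action programmatically. The route and the exportType/source form fields come from the ControlNodeActionsService and template changes above; the base URL, node id and storage id are placeholders, and whether the /public-prefixed route is reachable directly or sits behind a proxy depends on the deployment.

```java
// Hypothetical trigger for the export-from-crawl-data action; not part of the commit.
import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;

public class TriggerCrawlDataExport {
    public static void main(String[] args) throws Exception {
        String controlBaseUrl = "http://localhost:8081";   // assumption: local control service
        String body = "exportType=rss&source=123";         // form fields defined by the template above; 123 is a placeholder storage id

        HttpRequest request = HttpRequest.newBuilder()
                .uri(URI.create(controlBaseUrl + "/public/nodes/1/actions/export-from-crawl-data"))
                .header("Content-Type", "application/x-www-form-urlencoded")
                .POST(HttpRequest.BodyPublishers.ofString(body))
                .build();

        HttpResponse<String> rsp = HttpClient.newHttpClient()
                .send(request, HttpResponse.BodyHandlers.ofString());
        System.out.println("Status: " + rsp.statusCode());
    }
}
```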
@ -18,7 +18,8 @@
|
||||
{{#if view.sideload-encyclopedia}} {{> control/node/actions/partial-sideload-encyclopedia }} {{/if}}
|
||||
{{#if view.sideload-stackexchange}} {{> control/node/actions/partial-sideload-stackexchange }} {{/if}}
|
||||
{{#if view.sideload-warc}} {{> control/node/actions/partial-sideload-warc }} {{/if}}
|
||||
{{#if view.export-data}} {{> control/node/actions/partial-export-data }} {{/if}}
|
||||
{{#if view.export-db-data}} {{> control/node/actions/partial-export-db-data }} {{/if}}
|
||||
{{#if view.export-from-crawl-data}} {{> control/node/actions/partial-export-from-crawl-data }} {{/if}}
|
||||
{{#if view.restore-backup}} {{> control/node/actions/partial-restore-backup }} {{/if}}
|
||||
<div class="mt-10"> </div>
|
||||
</div>
|
||||
|
@ -51,15 +51,6 @@
|
||||
</form>
|
||||
{{/if}}
|
||||
|
||||
{{#if isAtagsExportable}}
|
||||
<form method="post" action="/nodes/{{node.id}}/storage/{{storage.id}}/export-atags" onsubmit="return confirm('Confirm export of anchor tags from {{storage.path}}')">
|
||||
<tr>
|
||||
<td>Export anchor tags from this crawl data</td>
|
||||
<td><button class="btn btn-secondary" type="submit">Export</button></td>
|
||||
</tr>
|
||||
</form>
|
||||
{{/if}}
|
||||
|
||||
{{#if isDeletable}}
|
||||
<form method="post" action="/nodes/{{node.id}}/storage/{{storage.id}}/delete" onsubmit="return confirm('Confirm deletion of {{storage.path}}')">
|
||||
<tr>
|
||||
|
@ -23,7 +23,8 @@
|
||||
<li><a class="dropdown-item" href="/nodes/{{node.id}}/actions?view=sideload-stackexchange">Sideload Stackexchange</a></li>
|
||||
<li><a class="dropdown-item" href="/nodes/{{node.id}}/actions?view=sideload-warc">Sideload WARC Files</a></li>
|
||||
<li><hr class="dropdown-divider"></li>
|
||||
<li><a class="dropdown-item" href="/nodes/{{node.id}}/actions?view=export-data">Export Database Data</a></li>
|
||||
<li><a class="dropdown-item" href="/nodes/{{node.id}}/actions?view=export-db-data">Export Database Data</a></li>
|
||||
<li><a class="dropdown-item" href="/nodes/{{node.id}}/actions?view=export-from-crawl-data">Export From Crawl Data</a></li>
|
||||
<li><a class="dropdown-item" href="/nodes/{{node.id}}/actions?view=restore-backup">Restore Index Backup</a></li>
|
||||
</ul>
|
||||
</li>
|
||||
|
@ -39,6 +39,7 @@ dependencies {
|
||||
implementation project(':code:process-models:crawl-spec')
|
||||
implementation project(':code:process-models:crawling-model')
|
||||
implementation project(':code:features-crawl:link-parser')
|
||||
implementation project(':code:features-convert:data-extractors')
|
||||
implementation project(':code:features-index:index-journal')
|
||||
implementation project(':code:api:index-api')
|
||||
implementation project(':code:api:query-api')
|
||||
|
@ -13,6 +13,8 @@ public enum ExecutorActor {
|
||||
CRAWL_JOB_EXTRACTOR,
|
||||
EXPORT_DATA,
|
||||
EXPORT_ATAGS,
|
||||
EXPORT_TERM_FREQUENCIES,
|
||||
EXPORT_FEEDS,
|
||||
PROC_INDEX_CONSTRUCTOR_SPAWNER,
|
||||
CONVERT,
|
||||
RESTORE_BACKUP;
|
||||
|
@ -44,6 +44,8 @@ public class ExecutorActorControlService {
|
||||
CrawlJobExtractorActor crawlJobExtractorActor,
|
||||
ExportDataActor exportDataActor,
|
||||
ExportAtagsActor exportAtagsActor,
|
||||
ExportFeedsActor exportFeedsActor,
|
||||
ExportTermFreqActor exportTermFrequenciesActor,
|
||||
ExecutorActorStateMachines stateMachines) {
|
||||
this.messageQueueFactory = messageQueueFactory;
|
||||
this.eventLog = baseServiceParams.eventLog;
|
||||
@ -68,6 +70,8 @@ public class ExecutorActorControlService {
|
||||
register(ExecutorActor.CRAWL_JOB_EXTRACTOR, crawlJobExtractorActor);
|
||||
register(ExecutorActor.EXPORT_DATA, exportDataActor);
|
||||
register(ExecutorActor.EXPORT_ATAGS, exportAtagsActor);
|
||||
register(ExecutorActor.EXPORT_TERM_FREQUENCIES, exportTermFrequenciesActor);
|
||||
register(ExecutorActor.EXPORT_FEEDS, exportFeedsActor);
|
||||
}
|
||||
|
||||
private void register(ExecutorActor process, RecordActorPrototype graph) {
|
||||
|
@ -3,44 +3,19 @@ package nu.marginalia.actor.task;
|
||||
import com.google.gson.Gson;
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import gnu.trove.set.hash.TLongHashSet;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.hash.MurmurHash3_128;
|
||||
import nu.marginalia.link_parser.LinkParser;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
import nu.marginalia.extractor.AtagExporter;
|
||||
import nu.marginalia.extractor.ExporterIf;
|
||||
import nu.marginalia.storage.model.*;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.jsoup.Jsoup;
|
||||
import nu.marginalia.actor.prototype.RecordActorPrototype;
|
||||
import nu.marginalia.actor.state.ActorStep;
|
||||
import nu.marginalia.crawling.io.CrawledDomainReader;
|
||||
import nu.marginalia.crawling.io.SerializableCrawlDataStream;
|
||||
import nu.marginalia.crawling.model.CrawledDocument;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import nu.marginalia.process.log.WorkLog;
|
||||
import nu.marginalia.storage.FileStorageService;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.BufferedWriter;
|
||||
import java.io.IOException;
|
||||
import java.io.OutputStreamWriter;
|
||||
import java.net.URISyntaxException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.StandardCopyOption;
|
||||
import java.nio.file.StandardOpenOption;
|
||||
import java.nio.file.attribute.PosixFilePermissions;
|
||||
import java.time.LocalDateTime;
|
||||
import java.util.Objects;
|
||||
import java.util.zip.GZIPOutputStream;
|
||||
|
||||
@Singleton
|
||||
public class ExportAtagsActor extends RecordActorPrototype {
|
||||
private static final LinkParser linkParser = new LinkParser();
|
||||
private static final MurmurHash3_128 hash = new MurmurHash3_128();
|
||||
private final FileStorageService storageService;
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
private final ExporterIf atagExporter;
|
||||
|
||||
public record Export(FileStorageId crawlId) implements ActorStep {}
|
||||
public record Run(FileStorageId crawlId, FileStorageId destId) implements ActorStep {}
|
||||
@ -55,46 +30,15 @@ public class ExportAtagsActor extends RecordActorPrototype {
|
||||
yield new Run(crawlId, storage.id());
|
||||
}
|
||||
case Run(FileStorageId crawlId, FileStorageId destId) -> {
|
||||
FileStorage destStorage = storageService.getStorage(destId);
|
||||
storageService.setFileStorageState(destId, FileStorageState.NEW);
|
||||
|
||||
var tmpFile = Files.createTempFile(destStorage.asPath(), "atags", ".csv.gz",
|
||||
PosixFilePermissions.asFileAttribute(PosixFilePermissions.fromString("rw-r--r--")));
|
||||
|
||||
Path inputDir = storageService.getStorage(crawlId).asPath();
|
||||
|
||||
try (var bw = new BufferedWriter(new OutputStreamWriter(new GZIPOutputStream(Files.newOutputStream(tmpFile, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING))));
|
||||
)
|
||||
{
|
||||
Path crawlerLogFile = inputDir.resolve("crawler.log");
|
||||
|
||||
var tagWriter = new ATagCsvWriter(bw);
|
||||
|
||||
for (var item : WorkLog.iterable(crawlerLogFile)) {
|
||||
if (Thread.interrupted()) {
|
||||
throw new InterruptedException();
|
||||
}
|
||||
|
||||
Path crawlDataPath = inputDir.resolve(item.relPath());
|
||||
try (var stream = CrawledDomainReader.createDataStream(CrawledDomainReader.CompatibilityLevel.FAST, crawlDataPath)) {
|
||||
exportLinks(tagWriter, stream);
|
||||
}
|
||||
catch (Exception ex) {
|
||||
ex.printStackTrace();
|
||||
}
|
||||
}
|
||||
|
||||
Files.move(tmpFile, destStorage.asPath().resolve("atags.csv.gz"), StandardCopyOption.ATOMIC_MOVE, StandardCopyOption.REPLACE_EXISTING);
|
||||
|
||||
try {
|
||||
atagExporter.export(crawlId, destId);
|
||||
storageService.setFileStorageState(destId, FileStorageState.UNSET);
|
||||
}
|
||||
catch (Exception ex) {
|
||||
logger.error("Failed to export blacklist", ex);
|
||||
storageService.setFileStorageState(destId, FileStorageState.DELETE);
|
||||
yield new Error("Failed to export blacklist");
|
||||
}
|
||||
finally {
|
||||
Files.deleteIfExists(tmpFile);
|
||||
yield new Error("Failed to export data");
|
||||
}
|
||||
|
||||
yield new End();
|
||||
@ -103,117 +47,6 @@ public class ExportAtagsActor extends RecordActorPrototype {
|
||||
};
|
||||
}
|
||||
|
||||
private boolean exportLinks(ATagCsvWriter exporter, SerializableCrawlDataStream stream) throws IOException, URISyntaxException {
|
||||
ATagLinkFilter linkFilter = new ATagLinkFilter();
|
||||
|
||||
while (stream.hasNext()) {
|
||||
if (!(stream.next() instanceof CrawledDocument doc))
|
||||
continue;
|
||||
if (null == doc.documentBody)
|
||||
continue;
|
||||
|
||||
var baseUrl = new EdgeUrl(doc.url);
|
||||
var parsed = Jsoup.parse(doc.documentBody);
|
||||
|
||||
for (var atag : parsed.getElementsByTag("a")) {
|
||||
String linkText = atag.text();
|
||||
|
||||
if (!linkFilter.isLinkTextEligible(linkText)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
var linkOpt = linkParser.parseLinkPermissive(baseUrl, atag);
|
||||
linkOpt
|
||||
.filter(url -> linkFilter.isEligible(url, baseUrl, linkText))
|
||||
.ifPresent(url -> exporter.accept(url, baseUrl.domain, linkText));
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
private static class ATagLinkFilter {
|
||||
private final TLongHashSet hashes = new TLongHashSet();
|
||||
|
||||
private boolean isLinkTextEligible(String linkText) {
|
||||
// Filter out the most obviously uninteresting anchor texts
|
||||
|
||||
if (linkText.isBlank())
|
||||
return false;
|
||||
if (linkText.startsWith("this"))
|
||||
return false;
|
||||
if (linkText.equalsIgnoreCase("here"))
|
||||
return false;
|
||||
if (linkText.equalsIgnoreCase("click here"))
|
||||
return false;
|
||||
|
||||
if (!StringUtils.isAsciiPrintable(linkText)) // This also filters out newlines, a good thing!
|
||||
return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
private boolean isEligible(EdgeUrl url, EdgeUrl baseUrl, String linkText) {
|
||||
if (!"http".equals(url.proto) && !"https".equals(url.proto))
|
||||
return false;
|
||||
|
||||
// This is an artifact of the link parser typically
|
||||
if ("example.com".equals(url.domain.topDomain))
|
||||
return false;
|
||||
|
||||
if (linkText.contains(url.domain.toString()))
|
||||
return false;
|
||||
if (Objects.equals(url.domain, baseUrl.domain))
|
||||
return false;
|
||||
|
||||
String urlString = url.toString();
|
||||
if (!StringUtils.isAsciiPrintable(urlString)) { // This also filters out newlines, a good thing!
|
||||
return false;
|
||||
}
|
||||
|
||||
// Deduplicate by hash; we've already checked that the strings are ASCII printable so we don't
|
||||
// need to be concerned about using the fast ASCII hash
|
||||
if (hashes.add(hash.hashLowerBytes(linkText) ^ hash.hashLowerBytes(urlString))) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private static class ATagCsvWriter {
|
||||
private final BufferedWriter writer;
|
||||
|
||||
private ATagCsvWriter(BufferedWriter writer) {
|
||||
this.writer = writer;
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
public void accept(EdgeUrl url, EdgeDomain sourceDomain, String linkText) {
|
||||
final String urlString = urlWithNoSchema(url);
|
||||
|
||||
writer.write(String.format("\"%s\",\"%s\",\"%s\"\n",
|
||||
csvify(urlString),
|
||||
csvify(linkText),
|
||||
csvify(sourceDomain)));
|
||||
}
|
||||
|
||||
private static String urlWithNoSchema(EdgeUrl url) {
|
||||
StringBuilder sb = new StringBuilder();
|
||||
|
||||
sb.append(url.domain).append(url.path);
|
||||
|
||||
if (url.param != null)
|
||||
sb.append('?').append(url.param);
|
||||
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
private static String csvify(Object field) {
|
||||
return field.toString().replace("\"", "\"\"");
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public String describe() {
|
||||
@ -222,10 +55,12 @@ public class ExportAtagsActor extends RecordActorPrototype {
|
||||
|
||||
@Inject
|
||||
public ExportAtagsActor(Gson gson,
|
||||
FileStorageService storageService)
|
||||
FileStorageService storageService,
|
||||
AtagExporter atagExporter)
|
||||
{
|
||||
super(gson);
|
||||
this.storageService = storageService;
|
||||
this.atagExporter = atagExporter;
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -0,0 +1,72 @@
|
||||
package nu.marginalia.actor.task;
|
||||
|
||||
import com.google.gson.Gson;
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import nu.marginalia.actor.prototype.RecordActorPrototype;
|
||||
import nu.marginalia.actor.state.ActorStep;
|
||||
import nu.marginalia.extractor.ExporterIf;
|
||||
import nu.marginalia.extractor.FeedExporter;
|
||||
import nu.marginalia.storage.FileStorageService;
|
||||
import nu.marginalia.storage.model.FileStorageBaseType;
|
||||
import nu.marginalia.storage.model.FileStorageId;
|
||||
import nu.marginalia.storage.model.FileStorageState;
|
||||
import nu.marginalia.storage.model.FileStorageType;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.time.LocalDateTime;
|
||||
|
||||
@Singleton
|
||||
public class ExportFeedsActor extends RecordActorPrototype {
|
||||
private final FileStorageService storageService;
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
|
||||
private final ExporterIf feedExporter;
|
||||
public record Export(FileStorageId crawlId) implements ActorStep {}
|
||||
public record Run(FileStorageId crawlId, FileStorageId destId) implements ActorStep {}
|
||||
@Override
|
||||
public ActorStep transition(ActorStep self) throws Exception {
|
||||
return switch(self) {
|
||||
case Export(FileStorageId crawlId) -> {
|
||||
var storageBase = storageService.getStorageBase(FileStorageBaseType.STORAGE);
|
||||
var storage = storageService.allocateTemporaryStorage(storageBase, FileStorageType.EXPORT, "feed-export", "Feeds " + LocalDateTime.now());
|
||||
|
||||
if (storage == null) yield new Error("Bad storage id");
|
||||
yield new Run(crawlId, storage.id());
|
||||
}
|
||||
case Run(FileStorageId crawlId, FileStorageId destId) -> {
|
||||
storageService.setFileStorageState(destId, FileStorageState.NEW);
|
||||
|
||||
try {
|
||||
feedExporter.export(crawlId, destId);
|
||||
storageService.setFileStorageState(destId, FileStorageState.UNSET);
|
||||
}
|
||||
catch (Exception ex) {
|
||||
storageService.setFileStorageState(destId, FileStorageState.DELETE);
|
||||
yield new Error("Failed to export data");
|
||||
}
|
||||
|
||||
yield new End();
|
||||
}
|
||||
default -> new Error();
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String describe() {
|
||||
return "Export RSS/Atom feeds from crawl data";
|
||||
}
|
||||
|
||||
@Inject
|
||||
public ExportFeedsActor(Gson gson,
|
||||
FileStorageService storageService,
|
||||
FeedExporter feedExporter)
|
||||
{
|
||||
super(gson);
|
||||
this.storageService = storageService;
|
||||
this.feedExporter = feedExporter;
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,68 @@
|
||||
package nu.marginalia.actor.task;
|
||||
|
||||
import com.google.gson.Gson;
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import nu.marginalia.actor.prototype.RecordActorPrototype;
|
||||
import nu.marginalia.actor.state.ActorStep;
|
||||
import nu.marginalia.extractor.ExporterIf;
|
||||
import nu.marginalia.extractor.TermFrequencyExporter;
|
||||
import nu.marginalia.storage.FileStorageService;
|
||||
import nu.marginalia.storage.model.FileStorageBaseType;
|
||||
import nu.marginalia.storage.model.FileStorageId;
|
||||
import nu.marginalia.storage.model.FileStorageState;
|
||||
import nu.marginalia.storage.model.FileStorageType;
|
||||
|
||||
import java.time.LocalDateTime;
|
||||
|
||||
@Singleton
|
||||
public class ExportTermFreqActor extends RecordActorPrototype {
|
||||
private final FileStorageService storageService;
|
||||
private final ExporterIf exporter;
|
||||
public record Export(FileStorageId crawlId) implements ActorStep {}
|
||||
public record Run(FileStorageId crawlId, FileStorageId destId) implements ActorStep {}
|
||||
@Override
|
||||
public ActorStep transition(ActorStep self) throws Exception {
|
||||
return switch(self) {
|
||||
case Export(FileStorageId crawlId) -> {
|
||||
var storageBase = storageService.getStorageBase(FileStorageBaseType.STORAGE);
|
||||
var storage = storageService.allocateTemporaryStorage(storageBase, FileStorageType.EXPORT, "term-freq-export", "Term Frequencies " + LocalDateTime.now());
|
||||
|
||||
if (storage == null) yield new Error("Bad storage id");
|
||||
yield new Run(crawlId, storage.id());
|
||||
}
|
||||
case Run(FileStorageId crawlId, FileStorageId destId) -> {
|
||||
storageService.setFileStorageState(destId, FileStorageState.NEW);
|
||||
|
||||
try {
|
||||
exporter.export(crawlId, destId);
|
||||
storageService.setFileStorageState(destId, FileStorageState.UNSET);
|
||||
}
|
||||
catch (Exception ex) {
|
||||
storageService.setFileStorageState(destId, FileStorageState.DELETE);
|
||||
yield new Error("Failed to export data");
|
||||
}
|
||||
|
||||
yield new End();
|
||||
}
|
||||
default -> new Error();
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String describe() {
|
||||
return "Export term frequencies from crawl data";
|
||||
}
|
||||
|
||||
@Inject
|
||||
public ExportTermFreqActor(Gson gson,
|
||||
FileStorageService storageService,
|
||||
TermFrequencyExporter exporter)
|
||||
{
|
||||
super(gson);
|
||||
this.storageService = storageService;
|
||||
this.exporter = exporter;
|
||||
}
|
||||
|
||||
}
|
@ -72,6 +72,8 @@ public class ExecutorSvc extends Service {
|
||||
Spark.post("/sideload/encyclopedia", sideloadService::sideloadEncyclopedia);
|
||||
|
||||
Spark.post("/export/atags", exportService::exportAtags);
|
||||
Spark.post("/export/feeds", exportService::exportFeeds);
|
||||
Spark.post("/export/termfreq", exportService::exportTermFrequencies);
|
||||
Spark.post("/export/data", exportService::exportData);
|
||||
|
||||
Spark.post("/backup/:fid/restore", backupService::restore);
|
||||
|
@ -28,4 +28,14 @@ public class ExportService {
|
||||
return "";
|
||||
}
|
||||
|
||||
public Object exportFeeds(Request request, Response response) throws Exception {
|
||||
actorControlService.startFrom(ExecutorActor.EXPORT_FEEDS, new ExportAtagsActor.Export(FileStorageId.parse(request.queryParams("fid"))));
|
||||
return "";
|
||||
}
|
||||
public Object exportTermFrequencies(Request request, Response response) throws Exception {
|
||||
actorControlService.startFrom(ExecutorActor.EXPORT_TERM_FREQUENCIES, new ExportAtagsActor.Export(FileStorageId.parse(request.queryParams("fid"))));
|
||||
return "";
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
@ -2,20 +2,15 @@ package nu.marginalia.tools.experiments;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import nu.marginalia.WmsaHome;
|
||||
import nu.marginalia.adblock.AdblockSimulator;
|
||||
import nu.marginalia.adblock.GoogleAnwersSpamDetector;
|
||||
import nu.marginalia.converting.processor.DocumentProcessor;
|
||||
import nu.marginalia.converting.processor.logic.dom.DomPruningFilter;
|
||||
import nu.marginalia.crawling.model.CrawledDocument;
|
||||
import nu.marginalia.crawling.model.CrawledDomain;
|
||||
import nu.marginalia.language.sentence.SentenceExtractor;
|
||||
import nu.marginalia.tools.Experiment;
|
||||
import nu.marginalia.tools.LegacyExperiment;
|
||||
import nu.marginalia.topic.RecipeDetector;
|
||||
import nu.marginalia.topic.TextileCraftDetector;
|
||||
import nu.marginalia.topic.WoodworkingDetector;
|
||||
import org.jsoup.Jsoup;
|
||||
import org.jsoup.nodes.Document;
|
||||
|
||||
public class TopicExperiment extends LegacyExperiment {
|
||||
|
||||
|
@ -3,7 +3,7 @@ package nu.marginalia.tools;
|
||||
import gnu.trove.map.hash.TLongIntHashMap;
|
||||
import gnu.trove.set.hash.TLongHashSet;
|
||||
import nu.marginalia.WmsaHome;
|
||||
import nu.marginalia.converting.language.LanguageFilter;
|
||||
import nu.marginalia.language.filter.LanguageFilter;
|
||||
import nu.marginalia.converting.processor.logic.dom.DomPruningFilter;
|
||||
import nu.marginalia.language.model.DocumentLanguageData;
|
||||
import nu.marginalia.language.sentence.SentenceExtractor;
|
||||
|
@ -35,6 +35,7 @@ include 'code:features-index:result-ranking'
|
||||
|
||||
include 'code:features-convert:adblock'
|
||||
include 'code:features-convert:anchor-keywords'
|
||||
include 'code:features-convert:data-extractors'
|
||||
include 'code:features-convert:stackexchange-xml'
|
||||
include 'code:features-convert:pubdate'
|
||||
include 'code:features-convert:summary-extraction'
|
||||
|