(control) New export actions for RSS/Atom feeds and term frequency data

This commit also refactors the executor a bit and introduces a new converter feature called data-extractors for this class of jobs.
Viktor Lofgren 2024-01-15 14:54:26 +01:00
parent 4665af6c42
commit c41e68aaab
35 changed files with 864 additions and 214 deletions

View File

@@ -100,6 +100,13 @@ public class ExecutorClient extends AbstractDynamicClient {
public void exportAtags(Context ctx, int node, String fid) {
post(ctx, node, "/export/atags?fid="+fid, "").blockingSubscribe();
}
public void exportRssFeeds(Context ctx, int node, String fid) {
post(ctx, node, "/export/feeds?fid="+fid, "").blockingSubscribe();
}
public void exportTermFrequencies(Context ctx, int node, String fid) {
post(ctx, node, "/export/termfreq?fid="+fid, "").blockingSubscribe();
}
public void exportData(Context ctx, int node) {
post(ctx, node, "/export/data", "").blockingSubscribe();
}
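
For orientation, a minimal caller sketch (not part of this commit) showing how the new client methods map onto the executor's HTTP endpoints. The injector and the Context.internal() factory are assumptions about the surrounding wiring:

// Hypothetical usage: trigger the new exports on node 1 for crawl data in file storage 42
ExecutorClient client = injector.getInstance(ExecutorClient.class); // injector assumed
Context ctx = Context.internal(); // assumed factory; in the control service it comes from Context.fromRequest(req)
client.exportRssFeeds(ctx, 1, "42");        // POST /export/feeds?fid=42
client.exportTermFrequencies(ctx, 1, "42"); // POST /export/termfreq?fid=42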

View File

@@ -0,0 +1,40 @@
plugins {
id 'java'
id "de.undercouch.download" version "5.1.0"
id 'jvm-test-suite'
}
java {
toolchain {
languageVersion.set(JavaLanguageVersion.of(21))
}
}
dependencies {
implementation project(':code:common:config')
implementation project(':code:common:process')
implementation project(':code:common:model')
implementation project(':code:libraries:language-processing')
implementation project(':code:libraries:term-frequency-dict')
implementation project(':code:features-crawl:link-parser')
implementation project(':code:features-convert:anchor-keywords')
implementation project(':code:process-models:crawling-model')
implementation project(':code:processes:converting-process')
implementation project(':third-party:commons-codec')
implementation libs.bundles.slf4j
implementation libs.guice
implementation libs.trove
implementation libs.commons.lang3
implementation libs.notnull
implementation libs.jsoup
testImplementation libs.bundles.slf4j.test
testImplementation libs.bundles.junit
testImplementation libs.mockito
}

View File

@@ -0,0 +1,7 @@
Contains converter-*like* extraction jobs that operate on crawled data to produce export files.
## Important classes
* [AtagExporter](src/main/java/nu/marginalia/extractor/AtagExporter.java) - extracts anchor texts from the crawled data.
* [FeedExporter](src/main/java/nu/marginalia/extractor/FeedExporter.java) - tries to find RSS/Atom feeds within the crawled data.
* [TermFrequencyExporter](src/main/java/nu/marginalia/extractor/TermFrequencyExporter.java) - exports the 'TF' part of TF-IDF.
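
All three implement the same small contract, ExporterIf (shown in full further down in this diff). A sketch of how a caller drives an exporter, with the names of the storage ids as stand-ins for real values:

// Sketch only: read crawl data from one FileStorage, write the export artifact into another
ExporterIf exporter = new FeedExporter(fileStorageService); // fileStorageService assumed injected
exporter.export(crawlDataStorageId, exportStorageId);       // FileStorageId values assumed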

View File

@@ -0,0 +1,196 @@
package nu.marginalia.extractor;
import com.google.inject.Inject;
import gnu.trove.set.hash.TLongHashSet;
import lombok.SneakyThrows;
import nu.marginalia.crawling.io.CrawledDomainReader;
import nu.marginalia.crawling.io.SerializableCrawlDataStream;
import nu.marginalia.crawling.model.CrawledDocument;
import nu.marginalia.hash.MurmurHash3_128;
import nu.marginalia.link_parser.LinkParser;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.process.log.WorkLog;
import nu.marginalia.storage.FileStorageService;
import nu.marginalia.storage.model.FileStorage;
import nu.marginalia.storage.model.FileStorageId;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import java.io.BufferedWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.net.URISyntaxException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardCopyOption;
import java.nio.file.StandardOpenOption;
import java.nio.file.attribute.PosixFilePermissions;
import java.util.Objects;
import java.util.zip.GZIPOutputStream;
public class AtagExporter implements ExporterIf {
private static final LinkParser linkParser = new LinkParser();
private static final MurmurHash3_128 hash = new MurmurHash3_128();
private final FileStorageService storageService;
@Inject
public AtagExporter(FileStorageService storageService) {
this.storageService = storageService;
}
@Override
public void export(FileStorageId crawlId, FileStorageId destId) throws Exception {
FileStorage destStorage = storageService.getStorage(destId);
var tmpFile = Files.createTempFile(destStorage.asPath(), "atags", ".csv.gz",
PosixFilePermissions.asFileAttribute(PosixFilePermissions.fromString("rw-r--r--")));
Path inputDir = storageService.getStorage(crawlId).asPath();
try (var bw = new BufferedWriter(new OutputStreamWriter(new GZIPOutputStream(Files.newOutputStream(tmpFile, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING)))))
{
Path crawlerLogFile = inputDir.resolve("crawler.log");
var tagWriter = new ATagCsvWriter(bw);
for (var item : WorkLog.iterable(crawlerLogFile)) {
if (Thread.interrupted()) {
throw new InterruptedException();
}
Path crawlDataPath = inputDir.resolve(item.relPath());
try (var stream = CrawledDomainReader.createDataStream(CrawledDomainReader.CompatibilityLevel.FAST, crawlDataPath)) {
exportLinks(tagWriter, stream);
}
catch (Exception ex) {
ex.printStackTrace();
}
}
Files.move(tmpFile, destStorage.asPath().resolve("atags.csv.gz"), StandardCopyOption.ATOMIC_MOVE, StandardCopyOption.REPLACE_EXISTING);
}
catch (Exception ex) {
// Don't swallow failures from writing or moving the output file
ex.printStackTrace();
}
finally {
Files.deleteIfExists(tmpFile);
}
}
private boolean exportLinks(ATagCsvWriter exporter, SerializableCrawlDataStream stream) throws IOException, URISyntaxException {
ATagLinkFilter linkFilter = new ATagLinkFilter();
while (stream.hasNext()) {
if (!(stream.next() instanceof CrawledDocument doc))
continue;
if (null == doc.documentBody)
continue;
var baseUrl = new EdgeUrl(doc.url);
var parsed = Jsoup.parse(doc.documentBody);
for (var atag : parsed.getElementsByTag("a")) {
String linkText = atag.text();
if (!linkFilter.isLinkTextEligible(linkText)) {
continue;
}
var linkOpt = linkParser.parseLinkPermissive(baseUrl, atag);
linkOpt
.filter(url -> linkFilter.isEligible(url, baseUrl, linkText))
.ifPresent(url -> exporter.accept(url, baseUrl.domain, linkText));
}
}
return true;
}
private static class ATagLinkFilter {
private final TLongHashSet hashes = new TLongHashSet();
private boolean isLinkTextEligible(String linkText) {
// Filter out the most obviously uninteresting anchor texts
if (linkText.isBlank())
return false;
if (linkText.startsWith("this"))
return false;
if (linkText.equalsIgnoreCase("here"))
return false;
if (linkText.equalsIgnoreCase("click here"))
return false;
if (!StringUtils.isAsciiPrintable(linkText)) // This also filters out newlines, a good thing!
return false;
return true;
}
private boolean isEligible(EdgeUrl url, EdgeUrl baseUrl, String linkText) {
if (!"http".equals(url.proto) && !"https".equals(url.proto))
return false;
// This is an artifact of the link parser typically
if ("example.com".equals(url.domain.topDomain))
return false;
if (linkText.contains(url.domain.toString()))
return false;
if (Objects.equals(url.domain, baseUrl.domain))
return false;
String urlString = url.toString();
if (!StringUtils.isAsciiPrintable(urlString)) { // This also filters out newlines, a good thing!
return false;
}
// Deduplicate by hash; we've already checked that the strings are ASCII printable so we don't
// need to be concerned about using the fast ASCII hash
if (!hashes.add(hash.hashLowerBytes(linkText) ^ hash.hashLowerBytes(urlString))) {
return false;
}
return true;
}
}
private static class ATagCsvWriter {
private final BufferedWriter writer;
private ATagCsvWriter(BufferedWriter writer) {
this.writer = writer;
}
@SneakyThrows
public void accept(EdgeUrl url, EdgeDomain sourceDomain, String linkText) {
final String urlString = urlWithNoSchema(url);
writer.write(String.format("\"%s\",\"%s\",\"%s\"\n",
csvify(urlString),
csvify(linkText),
csvify(sourceDomain)));
}
private static String urlWithNoSchema(EdgeUrl url) {
StringBuilder sb = new StringBuilder();
sb.append(url.domain).append(url.path);
if (url.param != null)
sb.append('?').append(url.param);
return sb.toString();
}
private static String csvify(Object field) {
return field.toString().replace("\"", "\"\"");
}
}
}
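
For downstream consumers, a sketch (not part of this commit) of reading atags.csv.gz back. It assumes the three-column quoted layout ATagCsvWriter emits above, and uses a naive split that holds as long as no field itself contains the "," separator sequence:

import java.io.*;
import java.nio.charset.StandardCharsets;
import java.nio.file.*;
import java.util.zip.GZIPInputStream;

class AtagsCsvReaderSketch {
    public static void main(String[] args) throws IOException {
        try (var reader = new BufferedReader(new InputStreamReader(
                new GZIPInputStream(Files.newInputStream(Path.of("atags.csv.gz"))),
                StandardCharsets.UTF_8)))
        {
            String line;
            while ((line = reader.readLine()) != null) {
                // Strip the outer quotes, then split on the column separator.
                // Columns: destination URL (schemeless), anchor text, source domain.
                String[] cols = line.substring(1, line.length() - 1).split("\",\"");
                if (cols.length != 3) continue; // skip rows the naive parser can't handle
                String url = cols[0].replace("\"\"", "\"");
                String linkText = cols[1].replace("\"\"", "\"");
                String sourceDomain = cols[2].replace("\"\"", "\"");
                System.out.println(sourceDomain + " -> " + url + " [" + linkText + "]");
            }
        }
    }
}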

View File

@@ -0,0 +1,7 @@
package nu.marginalia.extractor;
import nu.marginalia.storage.model.FileStorageId;
public interface ExporterIf {
void export(FileStorageId crawlId, FileStorageId destId) throws Exception;
}

View File

@@ -0,0 +1,131 @@
package nu.marginalia.extractor;
import com.google.inject.Inject;
import lombok.SneakyThrows;
import nu.marginalia.crawling.io.CrawledDomainReader;
import nu.marginalia.crawling.io.SerializableCrawlDataStream;
import nu.marginalia.crawling.model.CrawledDocument;
import nu.marginalia.link_parser.FeedExtractor;
import nu.marginalia.link_parser.LinkParser;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.process.log.WorkLog;
import nu.marginalia.storage.FileStorageService;
import nu.marginalia.storage.model.FileStorage;
import nu.marginalia.storage.model.FileStorageId;
import org.jsoup.Jsoup;
import java.io.BufferedWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.net.URISyntaxException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardCopyOption;
import java.nio.file.StandardOpenOption;
import java.nio.file.attribute.PosixFilePermissions;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import java.util.zip.GZIPOutputStream;
public class FeedExporter implements ExporterIf {
private final FileStorageService storageService;
@Inject
public FeedExporter(FileStorageService storageService) {
this.storageService = storageService;
}
public void export(FileStorageId crawlId, FileStorageId destId) throws Exception {
FileStorage destStorage = storageService.getStorage(destId);
var tmpFile = Files.createTempFile(destStorage.asPath(), "feeds", ".csv.gz",
PosixFilePermissions.asFileAttribute(PosixFilePermissions.fromString("rw-r--r--")));
Path inputDir = storageService.getStorage(crawlId).asPath();
try (var bw = new BufferedWriter(new OutputStreamWriter(new GZIPOutputStream(Files.newOutputStream(tmpFile, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING)))))
{
Path crawlerLogFile = inputDir.resolve("crawler.log");
var tagWriter = new FeedCsvWriter(bw);
for (var item : WorkLog.iterable(crawlerLogFile)) {
if (Thread.interrupted()) {
throw new InterruptedException();
}
Path crawlDataPath = inputDir.resolve(item.relPath());
try (var stream = CrawledDomainReader.createDataStream(CrawledDomainReader.CompatibilityLevel.COMPATIBLE, crawlDataPath)) {
exportFeeds(tagWriter, stream);
}
catch (Exception ex) {
ex.printStackTrace();
}
}
Files.move(tmpFile, destStorage.asPath().resolve("feeds.csv.gz"), StandardCopyOption.ATOMIC_MOVE, StandardCopyOption.REPLACE_EXISTING);
}
finally {
Files.deleteIfExists(tmpFile);
}
}
private boolean exportFeeds(FeedCsvWriter exporter, SerializableCrawlDataStream stream) throws IOException, URISyntaxException {
FeedExtractor feedExtractor = new FeedExtractor(new LinkParser());
int size = stream.sizeHint();
while (stream.hasNext()) {
if (!(stream.next() instanceof CrawledDocument doc))
continue;
if (null == doc.documentBody)
continue;
var baseUrl = new EdgeUrl(doc.url);
var parsed = Jsoup.parse(doc.documentBody);
List<EdgeUrl> feedUrls = new ArrayList<>();
for (var link : parsed.select("link[rel=alternate]")) {
feedExtractor
.getFeedFromAlternateTag(baseUrl, link)
.ifPresent(feedUrls::add);
}
// Take the shortest path if there are multiple
if (!feedUrls.isEmpty()) {
feedUrls.sort(Comparator.comparing(url -> url.path.length()));
exporter.accept(baseUrl.domain, size, feedUrls.getFirst());
}
// Only consider the first viable document, otherwise this will be very slow
break;
}
return true;
}
private static class FeedCsvWriter {
private final BufferedWriter writer;
private FeedCsvWriter(BufferedWriter writer) {
this.writer = writer;
}
@SneakyThrows
public void accept(EdgeDomain domain, int size, EdgeUrl path) {
writer.write(String.format("\"%s\",\"%s\",\"%s\"\n",
csvify(domain),
csvify(size),
csvify(path)));
}
private static String csvify(Object field) {
return field.toString().replace("\"", "\"\"");
}
}
}

View File

@@ -0,0 +1,147 @@
package nu.marginalia.extractor;
import com.google.inject.Inject;
import gnu.trove.map.hash.TLongIntHashMap;
import gnu.trove.set.hash.TLongHashSet;
import nu.marginalia.WmsaHome;
import nu.marginalia.converting.processor.logic.dom.DomPruningFilter;
import nu.marginalia.crawling.io.CrawledDomainReader;
import nu.marginalia.crawling.model.CrawledDocument;
import nu.marginalia.language.filter.LanguageFilter;
import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.language.sentence.SentenceExtractor;
import nu.marginalia.process.log.WorkLog;
import nu.marginalia.storage.FileStorageService;
import nu.marginalia.storage.model.FileStorage;
import nu.marginalia.storage.model.FileStorageId;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.DataOutputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardCopyOption;
import java.nio.file.attribute.PosixFilePermissions;
import java.util.concurrent.ForkJoinPool;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import static nu.marginalia.term_frequency_dict.TermFrequencyDict.DOC_COUNT_KEY;
import static nu.marginalia.term_frequency_dict.TermFrequencyDict.longHash;
public class TermFrequencyExporter implements ExporterIf {
private final FileStorageService storageService;
private final LanguageFilter lf = new LanguageFilter(WmsaHome.getLanguageModels());
private static final Logger logger = LoggerFactory.getLogger(TermFrequencyExporter.class);
@Inject
public TermFrequencyExporter(FileStorageService storageService) {
this.storageService = storageService;
}
@Override
public void export(FileStorageId crawlId, FileStorageId destId) throws Exception {
Path inputDir = storageService.getStorage(crawlId).asPath();
FileStorage destStorage = storageService.getStorage(destId);
ThreadLocal<SentenceExtractor> se = ThreadLocal.withInitial(() -> new SentenceExtractor(WmsaHome.getLanguageModels()));
TLongIntHashMap counts = new TLongIntHashMap(100_000_000, 0.7f, -1, -1);
AtomicInteger docCount = new AtomicInteger();
try (ForkJoinPool fjp = new ForkJoinPool(Math.max(2, Runtime.getRuntime().availableProcessors() / 2))) {
Path crawlerLogFile = inputDir.resolve("crawler.log");
for (var item : WorkLog.iterable(crawlerLogFile)) {
if (Thread.interrupted()) {
fjp.shutdownNow();
throw new InterruptedException();
}
Path crawlDataPath = inputDir.resolve(item.relPath());
fjp.execute(() -> processFile(crawlDataPath, counts, docCount, se.get()));
}
while (!fjp.isQuiescent()) {
if (fjp.awaitQuiescence(10, TimeUnit.SECONDS))
break;
}
}
var tmpFile = Files.createTempFile(destStorage.asPath(), "freqs", ".dat.tmp",
PosixFilePermissions.asFileAttribute(PosixFilePermissions.fromString("rw-r--r--")));
try (var dos = new DataOutputStream(Files.newOutputStream(tmpFile))) {
synchronized (counts) {
counts.put(DOC_COUNT_KEY, docCount.get());
counts.forEachEntry((hash, cnt) -> {
try {
dos.writeLong(hash);
dos.writeLong(cnt);
} catch (IOException e) {
throw new RuntimeException(e);
}
return true;
});
}
Files.move(tmpFile, destStorage.asPath().resolve("freqs.dat"), StandardCopyOption.ATOMIC_MOVE, StandardCopyOption.REPLACE_EXISTING);
}
catch (Exception ex) {
logger.error("Error writing file {}", tmpFile, ex);
Files.deleteIfExists(tmpFile);
}
}
private void processFile(Path crawlDataPath, TLongIntHashMap counts, AtomicInteger docCount, SentenceExtractor se) {
TLongHashSet words = new TLongHashSet(10_000);
try (var stream = CrawledDomainReader.createDataStream(CrawledDomainReader.CompatibilityLevel.FAST, crawlDataPath)) {
while (stream.hasNext()) {
if (Thread.interrupted())
return;
if (!(stream.next() instanceof CrawledDocument doc)) continue;
if (doc.documentBody == null) continue;
if (!doc.contentType.startsWith("text/html"))
continue;
docCount.incrementAndGet();
Document parsed = Jsoup.parse(doc.documentBody);
parsed.body().filter(new DomPruningFilter(0.5));
DocumentLanguageData dld = se.extractSentences(parsed);
if (lf.dictionaryAgreement(dld) < 0.1) {
return;
}
for (var sent : dld.sentences) {
for (var word : sent) {
words.add(longHash(word.stemmed().getBytes(StandardCharsets.UTF_8)));
}
}
synchronized (counts) {
words.forEach(w -> {
counts.adjustOrPutValue(w, 1, 1);
return true;
});
}
words.clear();
}
}
catch (Exception ex) {
logger.error("Error processing file {}", crawlDataPath, ex);
}
}
}
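
The freqs.dat format written above is simply a flat sequence of (term hash, count) pairs as big-endian longs, with one special entry under DOC_COUNT_KEY holding the total document count. A minimal reader sketch (not part of this commit):

import java.io.*;
import java.nio.file.*;

class FreqsDatReaderSketch {
    public static void main(String[] args) throws IOException {
        try (var dis = new DataInputStream(new BufferedInputStream(
                Files.newInputStream(Path.of("freqs.dat")))))
        {
            while (true) {
                long termHash, count;
                try {
                    termHash = dis.readLong();
                    count = dis.readLong(); // written with writeLong above, though the counts are ints
                } catch (EOFException e) {
                    break; // clean end of file
                }
                System.out.println(termHash + "\t" + count);
            }
        }
    }
}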

View File

@@ -1,4 +1,4 @@
package nu.marginalia.converting.processor.logic;
package nu.marginalia.link_parser;
import nu.marginalia.link_parser.LinkParser;
import nu.marginalia.model.EdgeUrl;

View File

@@ -1,4 +1,4 @@
package nu.marginalia.converting.language;
package nu.marginalia.language.filter;
import com.github.jfasttext.JFastText;
import nu.marginalia.LanguageModels;

View File

@@ -1,4 +1,4 @@
package nu.marginalia.converting.language;
package nu.marginalia.language.filter;
import lombok.SneakyThrows;
import nu.marginalia.LanguageModels;

View File

@@ -1,4 +1,4 @@
package nu.marginalia.converting.language;
package nu.marginalia.language.filter;
import nu.marginalia.language.model.DocumentLanguageData;

View File

@@ -1,4 +1,4 @@
package nu.marginalia.converting.language;
package nu.marginalia.language.filter;
import nu.marginalia.language.model.DocumentLanguageData;

View File

@@ -1,6 +1,5 @@
package nu.marginalia.converting.language;
package nu.marginalia.language.filter;
import nu.marginalia.converting.util.TestLanguageModels;
import org.jsoup.Jsoup;
import org.junit.jupiter.api.Test;

View File

@@ -0,0 +1,38 @@
package nu.marginalia.language.filter;
import nu.marginalia.LanguageModels;
import nu.marginalia.WmsaHome;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Optional;
public class TestLanguageModels {
private static final Path LANGUAGE_MODELS_DEFAULT = WmsaHome.getHomePath().resolve("model");
public static Path getLanguageModelsPath() {
final Path languageModelsHome = Optional.ofNullable(System.getenv("LANGUAGE_MODELS_HOME"))
.map(Path::of)
.orElse(LANGUAGE_MODELS_DEFAULT);
if (!Files.isDirectory(languageModelsHome)) {
throw new IllegalStateException("Could not find $LANGUAGE_MODELS_HOME, see doc/language-models.md");
}
return languageModelsHome;
}
public static LanguageModels getLanguageModels() {
var languageModelsHome = getLanguageModelsPath();
return new LanguageModels(
languageModelsHome.resolve("ngrams.bin"),
languageModelsHome.resolve("tfreq-new-algo3.bin"),
languageModelsHome.resolve("opennlp-sentence.bin"),
languageModelsHome.resolve("English.RDR"),
languageModelsHome.resolve("English.DICT"),
languageModelsHome.resolve("opennlp-tokens.bin"),
languageModelsHome.resolve("lid.176.ftz")
);
}
}

View File

@@ -2,7 +2,7 @@ package nu.marginalia.converting.processor.plugin;
import nu.marginalia.converting.processor.DocumentClass;
import nu.marginalia.crawling.model.CrawledDocument;
import nu.marginalia.converting.language.LanguageFilter;
import nu.marginalia.language.filter.LanguageFilter;
import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.model.html.HtmlStandard;
import nu.marginalia.keyword.model.DocumentKeywordsBuilder;

View File

@@ -2,7 +2,7 @@ package nu.marginalia.converting.processor.plugin;
import com.google.inject.Inject;
import com.google.inject.name.Named;
import nu.marginalia.converting.language.LanguageFilter;
import nu.marginalia.language.filter.LanguageFilter;
import nu.marginalia.converting.model.GeneratorType;
import nu.marginalia.converting.processor.DocumentClass;
import nu.marginalia.converting.processor.MetaRobotsTag;
@@ -12,6 +12,7 @@ import nu.marginalia.converting.processor.logic.links.LinkProcessor;
import nu.marginalia.converting.processor.plugin.specialization.*;
import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.language.sentence.ThreadLocalSentenceExtractorProvider;
import nu.marginalia.link_parser.FeedExtractor;
import nu.marginalia.model.crawl.HtmlFeature;
import nu.marginalia.link_parser.LinkParser;
import nu.marginalia.crawling.model.CrawledDocument;

View File

@@ -2,7 +2,7 @@ package nu.marginalia.converting.processor.plugin;
import com.google.inject.Inject;
import com.google.inject.name.Named;
import nu.marginalia.converting.language.LanguageFilter;
import nu.marginalia.language.filter.LanguageFilter;
import nu.marginalia.converting.processor.DocumentClass;
import nu.marginalia.converting.processor.logic.DocumentLengthLogic;
import nu.marginalia.crawling.model.CrawledDocument;

View File

@@ -79,8 +79,11 @@ public class ControlNodeActionsService {
Spark.post("/public/nodes/:id/actions/new-crawl-specs", this::createNewSpecsAction,
redirectControl.renderRedirectAcknowledgement("Creating", "../actions?view=new-crawl")
);
Spark.post("/public/nodes/:id/actions/export-data", this::exportData,
redirectControl.renderRedirectAcknowledgement("Exporting", "../storage/exports")
Spark.post("/public/nodes/:id/actions/export-db-data", this::exportDbData,
redirectControl.renderRedirectAcknowledgement("Exporting", "..")
);
Spark.post("/public/nodes/:id/actions/export-from-crawl-data", this::exportFromCrawlData,
redirectControl.renderRedirectAcknowledgement("Exporting", "..")
);
}
@@ -233,8 +236,29 @@ public class ControlNodeActionsService {
return "";
}
private Object exportData(Request req, Response rsp) {
private Object exportDbData(Request req, Response rsp) {
executorClient.exportData(Context.fromRequest(req), Integer.parseInt(req.params("id")));
return "";
}
private Object exportFromCrawlData(Request req, Response rsp) {
String exportType = req.queryParams("exportType");
String source = req.queryParams("source");
if (exportType.equals("atags")) {
executorClient.exportAtags(Context.fromRequest(req), Integer.parseInt(req.params("id")), source);
}
else if (exportType.equals("rss")) {
executorClient.exportRssFeeds(Context.fromRequest(req), Integer.parseInt(req.params("id")), source);
}
else if (exportType.equals("termFreq")) {
executorClient.exportTermFrequencies(Context.fromRequest(req), Integer.parseInt(req.params("id")), source);
}
else {
rsp.status(404);
}
return "";
}
}

View File

@@ -97,9 +97,6 @@ public class ControlNodeService {
Spark.post("/public/nodes/:id/storage/reset-state/:fid", this::resetState,
redirectControl.renderRedirectAcknowledgement("Restoring", "..")
);
Spark.post("/public/nodes/:id/storage/:fid/export-atags", this::exportAtags,
redirectControl.renderRedirectAcknowledgement("Exporting", "../../storage/exports")
);
Spark.post("/public/nodes/:id/fsms/:fsm/start", this::startFsm);
Spark.post("/public/nodes/:id/fsms/:fsm/stop", this::stopFsm);
}
@@ -109,11 +106,6 @@
return "";
}
private Object exportAtags(Request req, Response rsp) {
executorClient.exportAtags(Context.fromRequest(req), Integer.parseInt(req.params("id")), req.params("fid"));
return "";
}
public Object startFsm(Request req, Response rsp) throws Exception {
executorClient.startFsm(Context.fromRequest(req), Integer.parseInt(req.params("id")), req.params("fsm").toUpperCase());

View File

@@ -1,11 +1,11 @@
<h1 class="my-3">Export Data</h1>
<h1 class="my-3">Create Data Export</h1>
<div class="my-3 p-3 border bg-light">
This will export database data: domains, blacklist, and domain links. The exported data will be saved as
a new <a href="/nodes/{{node.id}}/storage/exports">exports storage object</a>.
</div>
<form method="post" action="actions/export-data" onsubmit="return confirm('Confirm export')">
<form method="post" action="actions/export-db-data" onsubmit="return confirm('Confirm export')">
<div class="my-3 py-3">
<div class="row">
<div class="col">

View File

@@ -0,0 +1,78 @@
<h1 class="my-3">Export From Crawl Data</h1>
<div class="my-3 p-3 border bg-light">
This will run an extraction job against a crawl data set. The generated data will be available as
an <a href="/nodes/{{node.id}}/storage/exports">export object</a>.
</div>
<form method="post" action="actions/export-from-crawl-data" onsubmit="return confirm('Confirm export')">
<h2>Select a source</h2>
<table class="table">
<tr>
<th>Use</th>
<th>Path</th>
<th>Description</th>
<th>Details</th>
</tr>
{{#each allCrawlData}}
<tr>
<td><input {{#if active}}checked{{/if}} {{#if new}}disabled{{/if}} {{#if delete}}disabled{{/if}} class="form-check-input" type="radio" name="source" id="{{id}}" value="{{id}}"></td>
<td><label for="{{id}}" class="form-check-label" >{{path}}</label></td>
<td>{{description}}
<span class="text-danger">{{#if new}}[CREATING]{{/if}}</span>
<span class="text-danger">{{#if delete}}[DELETING]{{/if}}</span>
</td>
<td><a href="/nodes/{{node}}/storage/details?fid={{id}}">[Details]</a></td>
</tr>
{{/each}}
</table>
<h2>Select the export operation to run</h2>
<div class="form-check">
<input class="form-check-input" type="radio" name="exportType" id="exportTypeAtags" value="atags">
<label class="form-check-label" for="exportTypeAtags">
Extract anchor texts
</label>
<div>
<small class="text-muted">
Creates a CSV file of anchor texts harvested from external anchor tags. These anchor texts can be
used to improve search result accuracy, since they often describe the page they link to better than
that page describes itself.
</small>
</div>
</div>
<div class="form-check">
<input class="form-check-input" type="radio" name="exportType" id="exportTypeRSS" value="rss">
<label class="form-check-label" for="exportTypeRSS">
Extract RSS feeds
</label>
<div>
<small class="text-muted">
Runs a best-effort attempt at extracting RSS and Atom feeds from the crawl data. The operation
only considers the root page of each crawled domain, and if that page advertises several feeds,
only the one with the shortest URL path is kept. The result is a CSV containing the domain name,
the number of documents in the crawl data for that domain, and the feed URL.
</small>
</div>
</div>
<div class="form-check">
<input class="form-check-input" type="radio" name="exportType" id="exportTypeTermFreq" value="termFreq">
<label class="form-check-label" for="exportTypeTermFreq">
Extract term frequency data
</label>
<div>
<small class="text-muted">
Creates a binary data file consisting of term hashes and frequencies. This is the TF side of TF-IDF,
used to weigh the importance of a term in a document against how common the term is overall.
</small>
</div>
</div>
<div class="my-3 py-3">
<div class="row">
<div class="col">
<button type="submit" class="btn btn-primary">Export</button>
</div>
</div>
</div>
</form>
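
For concreteness, hypothetical example rows for the two CSV exports (all values invented): an atags.csv.gz row reads "blog.example.org/posts/1","a post about feeds","example.com" (destination URL without scheme, anchor text, source domain), while a feeds.csv.gz row reads "example.com","154","example.com/feed.xml" (domain, document count, feed URL). The term frequency file is binary rather than CSV, as described above.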

View File

@@ -18,7 +18,8 @@
{{#if view.sideload-encyclopedia}} {{> control/node/actions/partial-sideload-encyclopedia }} {{/if}}
{{#if view.sideload-stackexchange}} {{> control/node/actions/partial-sideload-stackexchange }} {{/if}}
{{#if view.sideload-warc}} {{> control/node/actions/partial-sideload-warc }} {{/if}}
{{#if view.export-data}} {{> control/node/actions/partial-export-data }} {{/if}}
{{#if view.export-db-data}} {{> control/node/actions/partial-export-db-data }} {{/if}}
{{#if view.export-from-crawl-data}} {{> control/node/actions/partial-export-from-crawl-data }} {{/if}}
{{#if view.restore-backup}} {{> control/node/actions/partial-restore-backup }} {{/if}}
<div class="mt-10">&nbsp;</div>
</div>

View File

@@ -51,15 +51,6 @@
</form>
{{/if}}
{{#if isAtagsExportable}}
<form method="post" action="/nodes/{{node.id}}/storage/{{storage.id}}/export-atags" onsubmit="return confirm('Confirm export of anchor tags from {{storage.path}}')">
<tr>
<td>Export anchor tags from this crawl data</td>
<td><button class="btn btn-secondary" type="submit">Export</button></td>
</tr>
</form>
{{/if}}
{{#if isDeletable}}
<form method="post" action="/nodes/{{node.id}}/storage/{{storage.id}}/delete" onsubmit="return confirm('Confirm deletion of {{storage.path}}')">
<tr>

View File

@@ -23,7 +23,8 @@
<li><a class="dropdown-item" href="/nodes/{{node.id}}/actions?view=sideload-stackexchange">Sideload Stackexchange</a></li>
<li><a class="dropdown-item" href="/nodes/{{node.id}}/actions?view=sideload-warc">Sideload WARC Files</a></li>
<li><hr class="dropdown-divider"></li>
<li><a class="dropdown-item" href="/nodes/{{node.id}}/actions?view=export-data">Export Database Data</a></li>
<li><a class="dropdown-item" href="/nodes/{{node.id}}/actions?view=export-db-data">Export Database Data</a></li>
<li><a class="dropdown-item" href="/nodes/{{node.id}}/actions?view=export-from-crawl-data">Export From Crawl Data</a></li>
<li><a class="dropdown-item" href="/nodes/{{node.id}}/actions?view=restore-backup">Restore Index Backup</a></li>
</ul>
</li>

View File

@@ -39,6 +39,7 @@ dependencies {
implementation project(':code:process-models:crawl-spec')
implementation project(':code:process-models:crawling-model')
implementation project(':code:features-crawl:link-parser')
implementation project(':code:features-convert:data-extractors')
implementation project(':code:features-index:index-journal')
implementation project(':code:api:index-api')
implementation project(':code:api:query-api')

View File

@@ -13,6 +13,8 @@ public enum ExecutorActor {
CRAWL_JOB_EXTRACTOR,
EXPORT_DATA,
EXPORT_ATAGS,
EXPORT_TERM_FREQUENCIES,
EXPORT_FEEDS,
PROC_INDEX_CONSTRUCTOR_SPAWNER,
CONVERT,
RESTORE_BACKUP;

View File

@@ -44,6 +44,8 @@ public class ExecutorActorControlService {
CrawlJobExtractorActor crawlJobExtractorActor,
ExportDataActor exportDataActor,
ExportAtagsActor exportAtagsActor,
ExportFeedsActor exportFeedsActor,
ExportTermFreqActor exportTermFrequenciesActor,
ExecutorActorStateMachines stateMachines) {
this.messageQueueFactory = messageQueueFactory;
this.eventLog = baseServiceParams.eventLog;
@@ -68,6 +70,8 @@
register(ExecutorActor.CRAWL_JOB_EXTRACTOR, crawlJobExtractorActor);
register(ExecutorActor.EXPORT_DATA, exportDataActor);
register(ExecutorActor.EXPORT_ATAGS, exportAtagsActor);
register(ExecutorActor.EXPORT_TERM_FREQUENCIES, exportTermFrequenciesActor);
register(ExecutorActor.EXPORT_FEEDS, exportFeedsActor);
}
private void register(ExecutorActor process, RecordActorPrototype graph) {

View File

@@ -3,44 +3,19 @@ package nu.marginalia.actor.task;
import com.google.gson.Gson;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import gnu.trove.set.hash.TLongHashSet;
import lombok.SneakyThrows;
import nu.marginalia.hash.MurmurHash3_128;
import nu.marginalia.link_parser.LinkParser;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.extractor.AtagExporter;
import nu.marginalia.extractor.ExporterIf;
import nu.marginalia.storage.model.*;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import nu.marginalia.actor.prototype.RecordActorPrototype;
import nu.marginalia.actor.state.ActorStep;
import nu.marginalia.crawling.io.CrawledDomainReader;
import nu.marginalia.crawling.io.SerializableCrawlDataStream;
import nu.marginalia.crawling.model.CrawledDocument;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.process.log.WorkLog;
import nu.marginalia.storage.FileStorageService;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.BufferedWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.net.URISyntaxException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardCopyOption;
import java.nio.file.StandardOpenOption;
import java.nio.file.attribute.PosixFilePermissions;
import java.time.LocalDateTime;
import java.util.Objects;
import java.util.zip.GZIPOutputStream;
@Singleton
public class ExportAtagsActor extends RecordActorPrototype {
private static final LinkParser linkParser = new LinkParser();
private static final MurmurHash3_128 hash = new MurmurHash3_128();
private final FileStorageService storageService;
private final Logger logger = LoggerFactory.getLogger(getClass());
private final ExporterIf atagExporter;
public record Export(FileStorageId crawlId) implements ActorStep {}
public record Run(FileStorageId crawlId, FileStorageId destId) implements ActorStep {}
@@ -55,46 +30,15 @@ public class ExportAtagsActor extends RecordActorPrototype {
yield new Run(crawlId, storage.id());
}
case Run(FileStorageId crawlId, FileStorageId destId) -> {
FileStorage destStorage = storageService.getStorage(destId);
storageService.setFileStorageState(destId, FileStorageState.NEW);
var tmpFile = Files.createTempFile(destStorage.asPath(), "atags", ".csv.gz",
PosixFilePermissions.asFileAttribute(PosixFilePermissions.fromString("rw-r--r--")));
Path inputDir = storageService.getStorage(crawlId).asPath();
try (var bw = new BufferedWriter(new OutputStreamWriter(new GZIPOutputStream(Files.newOutputStream(tmpFile, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING)))))
{
Path crawlerLogFile = inputDir.resolve("crawler.log");
var tagWriter = new ATagCsvWriter(bw);
for (var item : WorkLog.iterable(crawlerLogFile)) {
if (Thread.interrupted()) {
throw new InterruptedException();
}
Path crawlDataPath = inputDir.resolve(item.relPath());
try (var stream = CrawledDomainReader.createDataStream(CrawledDomainReader.CompatibilityLevel.FAST, crawlDataPath)) {
exportLinks(tagWriter, stream);
}
catch (Exception ex) {
ex.printStackTrace();
}
}
Files.move(tmpFile, destStorage.asPath().resolve("atags.csv.gz"), StandardCopyOption.ATOMIC_MOVE, StandardCopyOption.REPLACE_EXISTING);
try {
atagExporter.export(crawlId, destId);
storageService.setFileStorageState(destId, FileStorageState.UNSET);
}
catch (Exception ex) {
logger.error("Failed to export blacklist", ex);
storageService.setFileStorageState(destId, FileStorageState.DELETE);
yield new Error("Failed to export blacklist");
}
finally {
Files.deleteIfExists(tmpFile);
yield new Error("Failed to export data");
}
yield new End();
@@ -103,117 +47,6 @@ public class ExportAtagsActor extends RecordActorPrototype {
};
}
private boolean exportLinks(ATagCsvWriter exporter, SerializableCrawlDataStream stream) throws IOException, URISyntaxException {
ATagLinkFilter linkFilter = new ATagLinkFilter();
while (stream.hasNext()) {
if (!(stream.next() instanceof CrawledDocument doc))
continue;
if (null == doc.documentBody)
continue;
var baseUrl = new EdgeUrl(doc.url);
var parsed = Jsoup.parse(doc.documentBody);
for (var atag : parsed.getElementsByTag("a")) {
String linkText = atag.text();
if (!linkFilter.isLinkTextEligible(linkText)) {
continue;
}
var linkOpt = linkParser.parseLinkPermissive(baseUrl, atag);
linkOpt
.filter(url -> linkFilter.isEligible(url, baseUrl, linkText))
.ifPresent(url -> exporter.accept(url, baseUrl.domain, linkText));
}
}
return true;
}
private static class ATagLinkFilter {
private final TLongHashSet hashes = new TLongHashSet();
private boolean isLinkTextEligible(String linkText) {
// Filter out the most obviously uninteresting anchor texts
if (linkText.isBlank())
return false;
if (linkText.startsWith("this"))
return false;
if (linkText.equalsIgnoreCase("here"))
return false;
if (linkText.equalsIgnoreCase("click here"))
return false;
if (!StringUtils.isAsciiPrintable(linkText)) // This also filters out newlines, a good thing!
return false;
return true;
}
private boolean isEligible(EdgeUrl url, EdgeUrl baseUrl, String linkText) {
if (!"http".equals(url.proto) && !"https".equals(url.proto))
return false;
// This is an artifact of the link parser typically
if ("example.com".equals(url.domain.topDomain))
return false;
if (linkText.contains(url.domain.toString()))
return false;
if (Objects.equals(url.domain, baseUrl.domain))
return false;
String urlString = url.toString();
if (!StringUtils.isAsciiPrintable(urlString)) { // This also filters out newlines, a good thing!
return false;
}
// Deduplicate by hash; we've already checked that the strings are ASCII printable so we don't
// need to be concerned about using the fast ASCII hash
if (!hashes.add(hash.hashLowerBytes(linkText) ^ hash.hashLowerBytes(urlString))) {
return false;
}
return true;
}
}
private static class ATagCsvWriter {
private final BufferedWriter writer;
private ATagCsvWriter(BufferedWriter writer) {
this.writer = writer;
}
@SneakyThrows
public void accept(EdgeUrl url, EdgeDomain sourceDomain, String linkText) {
final String urlString = urlWithNoSchema(url);
writer.write(String.format("\"%s\",\"%s\",\"%s\"\n",
csvify(urlString),
csvify(linkText),
csvify(sourceDomain)));
}
private static String urlWithNoSchema(EdgeUrl url) {
StringBuilder sb = new StringBuilder();
sb.append(url.domain).append(url.path);
if (url.param != null)
sb.append('?').append(url.param);
return sb.toString();
}
private static String csvify(Object field) {
return field.toString().replace("\"", "\"\"");
}
}
@Override
public String describe() {
@@ -222,10 +55,12 @@ public class ExportAtagsActor extends RecordActorPrototype {
@Inject
public ExportAtagsActor(Gson gson,
FileStorageService storageService)
FileStorageService storageService,
AtagExporter atagExporter)
{
super(gson);
this.storageService = storageService;
this.atagExporter = atagExporter;
}
}

View File

@@ -0,0 +1,72 @@
package nu.marginalia.actor.task;
import com.google.gson.Gson;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import nu.marginalia.actor.prototype.RecordActorPrototype;
import nu.marginalia.actor.state.ActorStep;
import nu.marginalia.extractor.ExporterIf;
import nu.marginalia.extractor.FeedExporter;
import nu.marginalia.storage.FileStorageService;
import nu.marginalia.storage.model.FileStorageBaseType;
import nu.marginalia.storage.model.FileStorageId;
import nu.marginalia.storage.model.FileStorageState;
import nu.marginalia.storage.model.FileStorageType;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.time.LocalDateTime;
@Singleton
public class ExportFeedsActor extends RecordActorPrototype {
private final FileStorageService storageService;
private final Logger logger = LoggerFactory.getLogger(getClass());
private final ExporterIf feedExporter;
public record Export(FileStorageId crawlId) implements ActorStep {}
public record Run(FileStorageId crawlId, FileStorageId destId) implements ActorStep {}
@Override
public ActorStep transition(ActorStep self) throws Exception {
return switch(self) {
case Export(FileStorageId crawlId) -> {
var storageBase = storageService.getStorageBase(FileStorageBaseType.STORAGE);
var storage = storageService.allocateTemporaryStorage(storageBase, FileStorageType.EXPORT, "feed-export", "Feeds " + LocalDateTime.now());
if (storage == null) yield new Error("Bad storage id");
yield new Run(crawlId, storage.id());
}
case Run(FileStorageId crawlId, FileStorageId destId) -> {
storageService.setFileStorageState(destId, FileStorageState.NEW);
try {
feedExporter.export(crawlId, destId);
storageService.setFileStorageState(destId, FileStorageState.UNSET);
}
catch (Exception ex) {
storageService.setFileStorageState(destId, FileStorageState.DELETE);
yield new Error("Failed to export data");
}
yield new End();
}
default -> new Error();
};
}
@Override
public String describe() {
return "Export RSS/Atom feeds from crawl data";
}
@Inject
public ExportFeedsActor(Gson gson,
FileStorageService storageService,
FeedExporter feedExporter)
{
super(gson);
this.storageService = storageService;
this.feedExporter = feedExporter;
}
}

View File

@@ -0,0 +1,68 @@
package nu.marginalia.actor.task;
import com.google.gson.Gson;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import nu.marginalia.actor.prototype.RecordActorPrototype;
import nu.marginalia.actor.state.ActorStep;
import nu.marginalia.extractor.ExporterIf;
import nu.marginalia.extractor.TermFrequencyExporter;
import nu.marginalia.storage.FileStorageService;
import nu.marginalia.storage.model.FileStorageBaseType;
import nu.marginalia.storage.model.FileStorageId;
import nu.marginalia.storage.model.FileStorageState;
import nu.marginalia.storage.model.FileStorageType;
import java.time.LocalDateTime;
@Singleton
public class ExportTermFreqActor extends RecordActorPrototype {
private final FileStorageService storageService;
private final ExporterIf exporter;
public record Export(FileStorageId crawlId) implements ActorStep {}
public record Run(FileStorageId crawlId, FileStorageId destId) implements ActorStep {}
@Override
public ActorStep transition(ActorStep self) throws Exception {
return switch(self) {
case Export(FileStorageId crawlId) -> {
var storageBase = storageService.getStorageBase(FileStorageBaseType.STORAGE);
var storage = storageService.allocateTemporaryStorage(storageBase, FileStorageType.EXPORT, "term-freq-export", "Term Frequencies " + LocalDateTime.now());
if (storage == null) yield new Error("Bad storage id");
yield new Run(crawlId, storage.id());
}
case Run(FileStorageId crawlId, FileStorageId destId) -> {
storageService.setFileStorageState(destId, FileStorageState.NEW);
try {
exporter.export(crawlId, destId);
storageService.setFileStorageState(destId, FileStorageState.UNSET);
}
catch (Exception ex) {
storageService.setFileStorageState(destId, FileStorageState.DELETE);
yield new Error("Failed to export data");
}
yield new End();
}
default -> new Error();
};
}
@Override
public String describe() {
return "Export term frequencies from crawl data";
}
@Inject
public ExportTermFreqActor(Gson gson,
FileStorageService storageService,
TermFrequencyExporter exporter)
{
super(gson);
this.storageService = storageService;
this.exporter = exporter;
}
}
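
Both new actors follow the same two-step protocol as the refactored ExportAtagsActor: an Export step allocates the destination storage, and a Run step delegates to the exporter and settles the storage state. A sketch of kicking one off, mirroring what ExportService does in the hunk below (the fid string is a stand-in):

// Hypothetical trigger for the term frequency export
actorControlService.startFrom(ExecutorActor.EXPORT_TERM_FREQUENCIES,
        new ExportTermFreqActor.Export(FileStorageId.parse("42")));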

View File

@@ -72,6 +72,8 @@ public class ExecutorSvc extends Service {
Spark.post("/sideload/encyclopedia", sideloadService::sideloadEncyclopedia);
Spark.post("/export/atags", exportService::exportAtags);
Spark.post("/export/feeds", exportService::exportFeeds);
Spark.post("/export/termfreq", exportService::exportTermFrequencies);
Spark.post("/export/data", exportService::exportData);
Spark.post("/backup/:fid/restore", backupService::restore);

View File

@@ -28,4 +28,14 @@ public class ExportService {
return "";
}
public Object exportFeeds(Request request, Response response) throws Exception {
actorControlService.startFrom(ExecutorActor.EXPORT_FEEDS, new ExportFeedsActor.Export(FileStorageId.parse(request.queryParams("fid"))));
return "";
}
public Object exportTermFrequencies(Request request, Response response) throws Exception {
actorControlService.startFrom(ExecutorActor.EXPORT_TERM_FREQUENCIES, new ExportTermFreqActor.Export(FileStorageId.parse(request.queryParams("fid"))));
return "";
}
}

View File

@@ -2,20 +2,15 @@ package nu.marginalia.tools.experiments;
import com.google.inject.Inject;
import nu.marginalia.WmsaHome;
import nu.marginalia.adblock.AdblockSimulator;
import nu.marginalia.adblock.GoogleAnwersSpamDetector;
import nu.marginalia.converting.processor.DocumentProcessor;
import nu.marginalia.converting.processor.logic.dom.DomPruningFilter;
import nu.marginalia.crawling.model.CrawledDocument;
import nu.marginalia.crawling.model.CrawledDomain;
import nu.marginalia.language.sentence.SentenceExtractor;
import nu.marginalia.tools.Experiment;
import nu.marginalia.tools.LegacyExperiment;
import nu.marginalia.topic.RecipeDetector;
import nu.marginalia.topic.TextileCraftDetector;
import nu.marginalia.topic.WoodworkingDetector;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
public class TopicExperiment extends LegacyExperiment {

View File

@@ -3,7 +3,7 @@ package nu.marginalia.tools;
import gnu.trove.map.hash.TLongIntHashMap;
import gnu.trove.set.hash.TLongHashSet;
import nu.marginalia.WmsaHome;
import nu.marginalia.converting.language.LanguageFilter;
import nu.marginalia.language.filter.LanguageFilter;
import nu.marginalia.converting.processor.logic.dom.DomPruningFilter;
import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.language.sentence.SentenceExtractor;

View File

@@ -35,6 +35,7 @@ include 'code:features-index:result-ranking'
include 'code:features-convert:adblock'
include 'code:features-convert:anchor-keywords'
include 'code:features-convert:data-extractors'
include 'code:features-convert:stackexchange-xml'
include 'code:features-convert:pubdate'
include 'code:features-convert:summary-extraction'