(control) New GUI for exporting crawl data samples

Not going to win any beauty pageants, but this is pretty peripheral functionality.
This commit is contained in:
Viktor Lofgren 2024-01-23 17:07:45 +01:00
parent 400f4840ad
commit 805afad4fe
13 changed files with 265 additions and 3 deletions


@@ -100,6 +100,10 @@ public class ExecutorClient extends AbstractDynamicClient {
public void exportAtags(Context ctx, int node, FileStorageId fid) {
post(ctx, node, "/export/atags?fid="+fid, "").blockingSubscribe();
}
public void exportSampleData(Context ctx, int node, FileStorageId fid, int size, String name) {
post(ctx, node, "/export/sample-data?fid="+fid+"&size="+size+"&name="+URLEncoder.encode(name, StandardCharsets.UTF_8), "").blockingSubscribe();
}
public void exportRssFeeds(Context ctx, int node, FileStorageId fid) {
post(ctx, node, "/export/feeds?fid="+fid, "").blockingSubscribe();
}
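
For context, a minimal sketch of how the new client method might be invoked from control-side code; the node id, storage id, and sample parameters below are hypothetical, not part of this commit:

// Hypothetical caller (sketch): request a 1000-domain sample named
// "test-sample" from crawl data storage 123 on node 1, assuming a Context
// and an ExecutorClient are available from the surrounding code.
void requestSample(Context ctx, ExecutorClient executorClient) {
    FileStorageId fid = FileStorageId.parse("123");
    executorClient.exportSampleData(ctx, 1, fid, 1000, "test-sample");
}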


@@ -27,11 +27,12 @@ public record WorkLogEntry(String id, String ts, String path, int cnt) {
return path;
}
/** Return the path relative to the crawl root */
public String relPath() {
// Compatibility trick!
String relPath = fileName();
return relPath.substring(0, 2) + "/" + relPath.substring(2, 4) + "/" + relPath;
return STR."\{relPath.substring(0, 2)}/\{relPath.substring(2, 4)}/\{relPath}";
}
}
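
The relPath() helper reconstructs the crawler's two-level directory fan-out, which shards files under the crawl root by the first four characters of the file name. A small illustration with a hypothetical file name:

// Sketch: an entry whose file name is "00aabbcc.parquet" resolves to
// "00/aa/00aabbcc.parquet" relative to the crawl root.
String fileName = "00aabbcc.parquet";
String rel = fileName.substring(0, 2) + "/" + fileName.substring(2, 4) + "/" + fileName;
assert rel.equals("00/aa/00aabbcc.parquet");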


@@ -30,6 +30,7 @@ dependencies {
implementation libs.guice
implementation libs.trove
implementation libs.commons.lang3
implementation libs.commons.compress
implementation libs.notnull
implementation libs.jsoup


@@ -0,0 +1,97 @@
package nu.marginalia.extractor;
import com.google.inject.Inject;
import nu.marginalia.process.log.WorkLog;
import nu.marginalia.process.log.WorkLogEntry;
import nu.marginalia.storage.FileStorageService;
import nu.marginalia.storage.model.FileStorage;
import nu.marginalia.storage.model.FileStorageId;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardCopyOption;
import java.nio.file.StandardOpenOption;
import java.nio.file.attribute.PosixFilePermissions;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import org.apache.commons.compress.archivers.tar.TarArchiveOutputStream;
import org.apache.commons.compress.utils.IOUtils;
public class SampleDataExporter {
private final FileStorageService storageService;
@Inject
public SampleDataExporter(FileStorageService storageService) {
this.storageService = storageService;
}
public void export(FileStorageId crawlId, FileStorageId destId, int size, String name) throws SQLException, IOException {
FileStorage destStorage = storageService.getStorage(destId);
Path inputDir = storageService.getStorage(crawlId).asPath();
Path crawlerLogFile = inputDir.resolve("crawler.log");
List<WorkLogEntry> entriesAll = new ArrayList<>(100_000);
for (var item : WorkLog.iterable(crawlerLogFile)) {
if (item.cnt() < 2) continue;
entriesAll.add(item);
}
if (entriesAll.size() > size) {
Collections.shuffle(entriesAll);
entriesAll = entriesAll.subList(0, size);
}
Path newCrawlerLogFile = Files.createTempFile(destStorage.asPath(), "crawler", ".log",
PosixFilePermissions.asFileAttribute(PosixFilePermissions.fromString("rw-r--r--")));
try (var bw = Files.newBufferedWriter(newCrawlerLogFile, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING)) {
for (var item : entriesAll) {
bw.write(STR."\{item.id()} \{item.ts()} \{item.relPath()} \{item.cnt()}\n");
}
}
Path newManifestJsonFile = Files.createTempFile(destStorage.asPath(), "manifest", ".json",
PosixFilePermissions.asFileAttribute(PosixFilePermissions.fromString("rw-r--r--")));
Files.writeString(newManifestJsonFile, STR."""
{ "description": "\{name.replaceAll("[\"\\\\]", "_")}",
"type": "CRAWL_DATA" }
""");
var tmpTarFile = Files.createTempFile(destStorage.asPath(), "data", ".tar",
PosixFilePermissions.asFileAttribute(PosixFilePermissions.fromString("rw-r--r--")));
try (var stream = new TarArchiveOutputStream(Files.newOutputStream(tmpTarFile, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING))) {
for (var item : entriesAll) {
Path crawlDataPath = inputDir.resolve(item.relPath());
if (!Files.exists(crawlDataPath)) continue;
addFileToTar(stream, crawlDataPath, item.relPath());
}
addFileToTar(stream, newCrawlerLogFile, "crawler.log");
addFileToTar(stream, newManifestJsonFile, "marginalia-manifest.json");
}
finally {
Files.deleteIfExists(newCrawlerLogFile);
Files.deleteIfExists(newManifestJsonFile);
}
Files.move(tmpTarFile, destStorage.asPath().resolve("crawl-data.tar"), StandardCopyOption.ATOMIC_MOVE, StandardCopyOption.REPLACE_EXISTING);
}
private void addFileToTar(TarArchiveOutputStream outputStream, Path file, String fileName) throws IOException {
var entry = outputStream.createArchiveEntry(file.toFile(), fileName);
entry.setSize(Files.size(file));
outputStream.putArchiveEntry(entry);
try (var fis = Files.newInputStream(file)) {
IOUtils.copy(fis, outputStream);
} finally {
outputStream.closeArchiveEntry();
}
}
}
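
The exporter bundles the sampled crawl data, the trimmed crawler.log, and the manifest into a single crawl-data.tar. As a sanity check, the archive can be listed back with the same commons-compress library; a hedged sketch, with a hypothetical tar location:

import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
import java.nio.file.Files;
import java.nio.file.Path;

// Sketch: list the entries of an exported sample tarball to verify that the
// crawl data, crawler.log, and marginalia-manifest.json all made it in.
public class ListSampleTar {
    public static void main(String[] args) throws Exception {
        Path tar = Path.of("/storage/export/crawl-data.tar"); // hypothetical path
        try (var in = new TarArchiveInputStream(Files.newInputStream(tar))) {
            for (var entry = in.getNextEntry(); entry != null; entry = in.getNextEntry()) {
                System.out.println(entry.getName() + "\t" + entry.getSize());
            }
        }
    }
}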


@@ -86,6 +86,9 @@ public class ControlNodeActionsService {
Spark.post("/public/nodes/:id/actions/export-from-crawl-data", this::exportFromCrawlData,
redirectControl.renderRedirectAcknowledgement("Exporting", "..")
);
Spark.post("/public/nodes/:id/actions/export-sample-data", this::exportSampleData,
redirectControl.renderRedirectAcknowledgement("Exporting", "..")
);
}
public Object sideloadEncyclopedia(Request request, Response response) {
@@ -286,6 +289,15 @@ public class ControlNodeActionsService {
return "";
}
private Object exportSampleData(Request req, Response rsp) {
FileStorageId source = parseSourceFileStorageId(req.queryParams("source"));
int size = Integer.parseInt(req.queryParams("size"));
String name = req.queryParams("name");
executorClient.exportSampleData(Context.fromRequest(req), Integer.parseInt(req.params("id")), source, size, name);
return "";
}
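
The new route can also be exercised directly over HTTP with the same form fields the handler reads (source, size, name). A sketch using java.net.http; the host, port, node id, and parameter values are hypothetical:

// Sketch: POST the form fields the GUI submits. Assumes the control service
// listens on localhost:8081 (hypothetical) and that storage id 123 holds
// crawl data on node 1.
public class TriggerSampleExport {
    public static void main(String[] args) throws Exception {
        var client = java.net.http.HttpClient.newHttpClient();
        var req = java.net.http.HttpRequest.newBuilder()
                .uri(java.net.URI.create("http://localhost:8081/public/nodes/1/actions/export-sample-data"))
                .header("Content-Type", "application/x-www-form-urlencoded")
                .POST(java.net.http.HttpRequest.BodyPublishers.ofString("source=123&size=1000&name=test-sample"))
                .build();
        var rsp = client.send(req, java.net.http.HttpResponse.BodyHandlers.ofString());
        System.out.println(rsp.statusCode());
    }
}
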
private Path parseSourcePath(String source) {


@@ -0,0 +1,52 @@
<h1 class="my-3">Export Sample Data</h1>
<div class="my-3 p-3 border bg-light">
This will create a set of sample crawl data from a larger set of crawl data.
The generated data will be available as an <a href="/nodes/{{node.id}}/storage/exports">export object</a>.
</div>
<form method="post" action="actions/export-sample-data" onsubmit="return confirm('Confirm export')">
<h2>Select a source</h2>
<table class="table">
<tr>
<th>Use</th>
<th>Path</th>
<th>Description</th>
<th>Details</th>
</tr>
{{#each allCrawlData}}
<tr>
<td><input {{#if active}}checked{{/if}} {{#if new}}disabled{{/if}} {{#if delete}}disabled{{/if}} class="form-check-input" type="radio" name="source" id="{{id}}" value="{{id}}"></td>
<td><label for="{{id}}" class="form-check-label" >{{path}}</label></td>
<td>{{description}}
<span class="text-danger">{{#if new}}[CREATING]{{/if}}</span>
<span class="text-danger">{{#if delete}}[DELETING]{{/if}}</span>
</td>
<td><a href="/nodes/{{node}}/storage/details?fid={{id}}">[Details]</a></td>
</tr>
{{/each}}
</table>
<h2>Parameters</h2>
<div class="mb-3">
<label for="size">Size</label>
<div><input type="text" name="size" id="size" pattern="\d+" /></div>
<small class="text-muted">How many domains to include in the sample set</small>
</div>
<div class="mb-3">
<label for="name">Name</label>
<div><input type="text" name="name" id="name" /></div>
<small class="text-muted">A name for the sample set. This name will show up in the
description of the crawl data when it's imported.</small>
</div>
<div class="my-3 py-3">
<div class="row">
<div class="col">
<button type="submit" class="btn btn-primary">Export</button>
</div>
</div>
</div>
</form>


@@ -21,6 +21,7 @@
{{#if view.sideload-dirtree}} {{> control/node/actions/partial-sideload-dirtree }} {{/if}}
{{#if view.export-db-data}} {{> control/node/actions/partial-export-db-data }} {{/if}}
{{#if view.export-from-crawl-data}} {{> control/node/actions/partial-export-from-crawl-data }} {{/if}}
{{#if view.export-sample-data}} {{> control/node/actions/partial-export-sample-data }} {{/if}}
{{#if view.restore-backup}} {{> control/node/actions/partial-restore-backup }} {{/if}}
<div class="mt-10">&nbsp;</div>
</div>


@@ -26,6 +26,7 @@
<li><a class="dropdown-item" href="/nodes/{{node.id}}/actions?view=sideload-dirtree">Sideload Dirtree</a></li>
<li><hr class="dropdown-divider"></li>
<li><a class="dropdown-item" href="/nodes/{{node.id}}/actions?view=export-db-data">Export Database Data</a></li>
<li><a class="dropdown-item" href="/nodes/{{node.id}}/actions?view=export-sample-data">Export Sample Crawl Data</a></li>
<li><a class="dropdown-item" href="/nodes/{{node.id}}/actions?view=export-from-crawl-data">Export From Crawl Data</a></li>
<li><hr class="dropdown-divider"></li>
<li><a class="dropdown-item" href="/nodes/{{node.id}}/actions?view=restore-backup">Restore Index Backup</a></li>


@@ -17,7 +17,7 @@ public enum ExecutorActor {
EXPORT_FEEDS,
PROC_INDEX_CONSTRUCTOR_SPAWNER,
CONVERT,
RESTORE_BACKUP;
RESTORE_BACKUP, EXPORT_SAMPLE_DATA;
public String id() {
return "fsm:" + name().toLowerCase();


@@ -45,6 +45,7 @@ public class ExecutorActorControlService {
ExportDataActor exportDataActor,
ExportAtagsActor exportAtagsActor,
ExportFeedsActor exportFeedsActor,
ExportSampleDataActor exportSampleDataActor,
ExportTermFreqActor exportTermFrequenciesActor,
ExecutorActorStateMachines stateMachines) {
this.messageQueueFactory = messageQueueFactory;
@@ -68,10 +69,12 @@
register(ExecutorActor.ADJACENCY_CALCULATION, triggerAdjacencyCalculationActor);
register(ExecutorActor.CRAWL_JOB_EXTRACTOR, crawlJobExtractorActor);
register(ExecutorActor.EXPORT_DATA, exportDataActor);
register(ExecutorActor.EXPORT_ATAGS, exportAtagsActor);
register(ExecutorActor.EXPORT_TERM_FREQUENCIES, exportTermFrequenciesActor);
register(ExecutorActor.EXPORT_FEEDS, exportFeedsActor);
register(ExecutorActor.EXPORT_SAMPLE_DATA, exportSampleDataActor);
register(ExecutorActor.EXPORT_TERM_FREQUENCIES, exportTermFrequenciesActor);
}
private void register(ExecutorActor process, RecordActorPrototype graph) {


@@ -0,0 +1,79 @@
package nu.marginalia.actor.task;
import com.google.gson.Gson;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import nu.marginalia.actor.prototype.RecordActorPrototype;
import nu.marginalia.actor.state.ActorStep;
import nu.marginalia.extractor.SampleDataExporter;
import nu.marginalia.storage.FileStorageService;
import nu.marginalia.storage.model.FileStorageBaseType;
import nu.marginalia.storage.model.FileStorageId;
import nu.marginalia.storage.model.FileStorageState;
import nu.marginalia.storage.model.FileStorageType;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.time.LocalDateTime;
@Singleton
public class ExportSampleDataActor extends RecordActorPrototype {
private final FileStorageService storageService;
private final Logger logger = LoggerFactory.getLogger(getClass());
private final SampleDataExporter dataExporter;
public record Export(FileStorageId crawlId, int size, String name) implements ActorStep {}
public record Run(FileStorageId crawlId, FileStorageId destId, int size, String name) implements ActorStep {}
@Override
public ActorStep transition(ActorStep self) throws Exception {
return switch(self) {
case Export(FileStorageId crawlId, int size, String name) -> {
var storageBase = storageService.getStorageBase(FileStorageBaseType.STORAGE);
var storage = storageService.allocateTemporaryStorage(storageBase, FileStorageType.EXPORT,
"crawl-sample-export",
STR."Crawl Data Sample \{name}/\{size} \{LocalDateTime.now()}"
);
if (storage == null) yield new Error("Bad storage id");
yield new Run(crawlId, storage.id(), size, name);
}
case Run(FileStorageId crawlId, FileStorageId destId, int size, String name) -> {
storageService.setFileStorageState(destId, FileStorageState.NEW);
try {
dataExporter.export(crawlId, destId, size, name);
storageService.setFileStorageState(destId, FileStorageState.UNSET);
}
catch (Exception ex) {
storageService.setFileStorageState(destId, FileStorageState.DELETE);
logger.error("Failed to export data", ex);
yield new Error("Failed to export data");
}
yield new End();
}
default -> new Error();
};
}
@Override
public String describe() {
return "Export sample crawl data";
}
@Inject
public ExportSampleDataActor(Gson gson,
FileStorageService storageService,
SampleDataExporter dataExporter)
{
super(gson);
this.storageService = storageService;
this.dataExporter = dataExporter;
}
}


@@ -72,6 +72,7 @@ public class ExecutorSvc extends Service {
Spark.post("/sideload/encyclopedia", sideloadService::sideloadEncyclopedia);
Spark.post("/export/atags", exportService::exportAtags);
Spark.post("/export/sample-data", exportService::exportSampleData);
Spark.post("/export/feeds", exportService::exportFeeds);
Spark.post("/export/termfreq", exportService::exportTermFrequencies);
Spark.post("/export/data", exportService::exportData);


@@ -6,6 +6,7 @@ import nu.marginalia.actor.ExecutorActorControlService;
import nu.marginalia.actor.task.ConvertActor;
import nu.marginalia.actor.task.ExportAtagsActor;
import nu.marginalia.actor.task.ExportDataActor;
import nu.marginalia.actor.task.ExportSampleDataActor;
import nu.marginalia.storage.model.FileStorageId;
import spark.Request;
import spark.Response;
@@ -23,6 +24,15 @@ public class ExportService {
return "";
}
public Object exportSampleData(Request request, Response response) throws Exception {
actorControlService.startFrom(ExecutorActor.EXPORT_SAMPLE_DATA, new ExportSampleDataActor.Export(
FileStorageId.parse(request.queryParams("fid")),
Integer.parseInt(request.queryParams("size")),
request.queryParams("name")
));
return "";
}
public Object exportAtags(Request request, Response response) throws Exception {
actorControlService.startFrom(ExecutorActor.EXPORT_ATAGS, new ExportAtagsActor.Export(FileStorageId.parse(request.queryParams("fid"))));
return "";