From 805afad4fe55da3ea8878bbfc96f1ed1fec32710 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Tue, 23 Jan 2024 17:07:45 +0100 Subject: [PATCH] (control) New GUI for exporting crawl data samples Not going to win any beauty pageants, but this is pretty peripheral functionality. --- .../executor/client/ExecutorClient.java | 4 + .../marginalia/process/log/WorkLogEntry.java | 3 +- .../data-extractors/build.gradle | 1 + .../extractor/SampleDataExporter.java | 97 +++++++++++++++++++ .../node/svc/ControlNodeActionsService.java | 12 +++ .../actions/partial-export-sample-data.hdb | 52 ++++++++++ .../templates/control/node/node-actions.hdb | 1 + .../control/node/partial-node-nav.hdb | 1 + .../nu/marginalia/actor/ExecutorActor.java | 2 +- .../actor/ExecutorActorControlService.java | 5 +- .../actor/task/ExportSampleDataActor.java | 79 +++++++++++++++ .../nu/marginalia/executor/ExecutorSvc.java | 1 + .../executor/svc/ExportService.java | 10 ++ 13 files changed, 265 insertions(+), 3 deletions(-) create mode 100644 code/features-convert/data-extractors/src/main/java/nu/marginalia/extractor/SampleDataExporter.java create mode 100644 code/services-core/control-service/src/main/resources/templates/control/node/actions/partial-export-sample-data.hdb create mode 100644 code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/ExportSampleDataActor.java diff --git a/code/api/executor-api/src/main/java/nu/marginalia/executor/client/ExecutorClient.java b/code/api/executor-api/src/main/java/nu/marginalia/executor/client/ExecutorClient.java index 294a9e86..b5554edd 100644 --- a/code/api/executor-api/src/main/java/nu/marginalia/executor/client/ExecutorClient.java +++ b/code/api/executor-api/src/main/java/nu/marginalia/executor/client/ExecutorClient.java @@ -100,6 +100,10 @@ public class ExecutorClient extends AbstractDynamicClient { public void exportAtags(Context ctx, int node, FileStorageId fid) { post(ctx, node, "/export/atags?fid="+fid, "").blockingSubscribe(); } + 
/** Ask the executor on the given node to export a sample of the crawl data in fid: POSTs to /export/sample-data with fid, size and the URL-encoded name as query parameters. */ public void exportSampleData(Context ctx, int node, FileStorageId fid, int size, String name) { + post(ctx, node, "/export/sample-data?fid="+fid+"&size="+size+"&name="+URLEncoder.encode(name, StandardCharsets.UTF_8), "").blockingSubscribe(); + } + public void exportRssFeeds(Context ctx, int node, FileStorageId fid) { post(ctx, node, "/export/feeds?fid="+fid, "").blockingSubscribe(); } diff --git a/code/common/process/src/main/java/nu/marginalia/process/log/WorkLogEntry.java b/code/common/process/src/main/java/nu/marginalia/process/log/WorkLogEntry.java index 306389f6..f57d1880 100644 --- a/code/common/process/src/main/java/nu/marginalia/process/log/WorkLogEntry.java +++ b/code/common/process/src/main/java/nu/marginalia/process/log/WorkLogEntry.java @@ -27,11 +27,12 @@ public record WorkLogEntry(String id, String ts, String path, int cnt) { return path; } + /** Return the path relative to the crawl root */ public String relPath() { // Compatibility trick! String relPath = fileName(); - return relPath.substring(0, 2) + "/" + relPath.substring(2, 4) + "/" + relPath; + /* Result is "<chars 0-1>/<chars 2-3>/<file name>", i.e. two directory levels taken from the start of the file name */ return STR."\{relPath.substring(0, 2)}/\{relPath.substring(2, 4)}/\{relPath}"; } } diff --git a/code/features-convert/data-extractors/build.gradle b/code/features-convert/data-extractors/build.gradle index 217d5044..67c42633 100644 --- a/code/features-convert/data-extractors/build.gradle +++ b/code/features-convert/data-extractors/build.gradle @@ -30,6 +30,7 @@ dependencies { implementation libs.guice implementation libs.trove implementation libs.commons.lang3 + implementation libs.commons.compress implementation libs.notnull implementation libs.jsoup diff --git a/code/features-convert/data-extractors/src/main/java/nu/marginalia/extractor/SampleDataExporter.java b/code/features-convert/data-extractors/src/main/java/nu/marginalia/extractor/SampleDataExporter.java new file mode 100644 index 00000000..427e35e0 --- /dev/null +++ 
b/code/features-convert/data-extractors/src/main/java/nu/marginalia/extractor/SampleDataExporter.java @@ -0,0 +1,97 @@ +package nu.marginalia.extractor; + +import com.google.inject.Inject; +import nu.marginalia.process.log.WorkLog; +import nu.marginalia.process.log.WorkLogEntry; +import nu.marginalia.storage.FileStorageService; +import nu.marginalia.storage.model.FileStorage; +import nu.marginalia.storage.model.FileStorageId; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.StandardCopyOption; +import java.nio.file.StandardOpenOption; +import java.nio.file.attribute.PosixFilePermissions; +import java.sql.SQLException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +import org.apache.commons.compress.archivers.tar.TarArchiveOutputStream; +import org.apache.commons.compress.utils.IOUtils; + +public class SampleDataExporter { + private final FileStorageService storageService; + + @Inject + public SampleDataExporter(FileStorageService storageService) { + this.storageService = storageService; + } + public void export(FileStorageId crawlId, FileStorageId destId, int size, String name) throws SQLException, IOException { + FileStorage destStorage = storageService.getStorage(destId); + Path inputDir = storageService.getStorage(crawlId).asPath(); + + Path crawlerLogFile = inputDir.resolve("crawler.log"); + + List entriesAll = new ArrayList<>(100_000); + + for (var item : WorkLog.iterable(crawlerLogFile)) { + if (item.cnt() < 2) continue; + entriesAll.add(item); + } + + if (entriesAll.size() > size) { + Collections.shuffle(entriesAll); + entriesAll = entriesAll.subList(0, size); + } + + Path newCrawlerLogFile = Files.createTempFile(destStorage.asPath(), "crawler", ".log", + PosixFilePermissions.asFileAttribute(PosixFilePermissions.fromString("rw-r--r--"))); + try (var bw = Files.newBufferedWriter(newCrawlerLogFile, StandardOpenOption.CREATE, 
StandardOpenOption.TRUNCATE_EXISTING)) { + for (var item : entriesAll) { + bw.write(STR."\{item.id()} \{item.ts()} \{item.relPath()} \{item.cnt()}\n"); + } + } + + Path newManifestJsonFile = Files.createTempFile(destStorage.asPath(), "manifest", ".json", + PosixFilePermissions.asFileAttribute(PosixFilePermissions.fromString("rw-r--r--"))); + Files.writeString(newManifestJsonFile, STR.""" + { "description": "\{name.replace("[\"\\]", "_")}", + "type": "CRAWL_DATA" } + """); + + var tmpTarFile = Files.createTempFile(destStorage.asPath(), "data", ".tar", + PosixFilePermissions.asFileAttribute(PosixFilePermissions.fromString("rw-r--r--"))); + + try (var stream = new TarArchiveOutputStream(Files.newOutputStream(tmpTarFile, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING))) { + for (var item : entriesAll) { + Path crawlDataPath = inputDir.resolve(item.relPath()); + if (!Files.exists(crawlDataPath)) continue; + + addFileToTar(stream, crawlDataPath, item.relPath()); + } + + addFileToTar(stream, newCrawlerLogFile, "crawler.log"); + addFileToTar(stream, newManifestJsonFile, "marginalia-manifest.json"); + } + finally { + Files.deleteIfExists(newCrawlerLogFile); + Files.deleteIfExists(newManifestJsonFile); + } + + Files.move(tmpTarFile, destStorage.asPath().resolve("crawl-data.tar"), StandardCopyOption.ATOMIC_MOVE, StandardCopyOption.REPLACE_EXISTING); + } + + private void addFileToTar(TarArchiveOutputStream outputStream, Path file, String fileName) throws IOException { + var entry = outputStream.createArchiveEntry(file.toFile(), fileName); + entry.setSize(Files.size(file)); + outputStream.putArchiveEntry(entry); + + try (var fis = Files.newInputStream(file)) { + IOUtils.copy(fis, outputStream); + } finally { + outputStream.closeArchiveEntry(); + } + } +} diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/node/svc/ControlNodeActionsService.java 
b/code/services-core/control-service/src/main/java/nu/marginalia/control/node/svc/ControlNodeActionsService.java index b729bad5..16b06998 100644 --- a/code/services-core/control-service/src/main/java/nu/marginalia/control/node/svc/ControlNodeActionsService.java +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/node/svc/ControlNodeActionsService.java @@ -86,6 +86,9 @@ public class ControlNodeActionsService { Spark.post("/public/nodes/:id/actions/export-from-crawl-data", this::exportFromCrawlData, redirectControl.renderRedirectAcknowledgement("Exporting", "..") ); + Spark.post("/public/nodes/:id/actions/export-sample-data", this::exportSampleData, + redirectControl.renderRedirectAcknowledgement("Exporting", "..") + ); } public Object sideloadEncyclopedia(Request request, Response response) { @@ -286,6 +289,15 @@ public class ControlNodeActionsService { return ""; } + /** Handler for the export-sample-data action: reads the "source", "size" and "name" request parameters and asks the executor on node :id to start the export; always returns an empty string. */ private Object exportSampleData(Request req, Response rsp) { + FileStorageId source = parseSourceFileStorageId(req.queryParams("source")); + int size = Integer.parseInt(req.queryParams("size")); + String name = req.queryParams("name"); + + executorClient.exportSampleData(Context.fromRequest(req), Integer.parseInt(req.params("id")), source, size, name); + + return ""; + } private Path parseSourcePath(String source) { diff --git a/code/services-core/control-service/src/main/resources/templates/control/node/actions/partial-export-sample-data.hdb b/code/services-core/control-service/src/main/resources/templates/control/node/actions/partial-export-sample-data.hdb new file mode 100644 index 00000000..2a5cb223 --- /dev/null +++ b/code/services-core/control-service/src/main/resources/templates/control/node/actions/partial-export-sample-data.hdb @@ -0,0 +1,52 @@ +

Export Sample Data

+ +
+ This will create a set of sample crawl data from a larger set of crawl data. + The generated data will be available as an export object. +
+ +
+

Select a source

+ + + + + + + + + {{#each allCrawlData}} + + + + + + + {{/each}} +
UsePathDescriptionDetails
{{description}} + {{#if new}}[CREATING]{{/if}} + {{#if delete}}[DELETING]{{/if}} + [Details]
+ +

Parameters

+ +
+ +
+ How many domains to include in the sample set +
+
+ +
+ A name for the sample set. This name will show up in the + description of the crawl data when it's imported. +
+ +
+
+
+ +
+
+
+
\ No newline at end of file diff --git a/code/services-core/control-service/src/main/resources/templates/control/node/node-actions.hdb b/code/services-core/control-service/src/main/resources/templates/control/node/node-actions.hdb index 682132ac..736c7961 100644 --- a/code/services-core/control-service/src/main/resources/templates/control/node/node-actions.hdb +++ b/code/services-core/control-service/src/main/resources/templates/control/node/node-actions.hdb @@ -21,6 +21,7 @@ {{#if view.sideload-dirtree}} {{> control/node/actions/partial-sideload-dirtree }} {{/if}} {{#if view.export-db-data}} {{> control/node/actions/partial-export-db-data }} {{/if}} {{#if view.export-from-crawl-data}} {{> control/node/actions/partial-export-from-crawl-data }} {{/if}} + {{#if view.export-sample-data}} {{> control/node/actions/partial-export-sample-data }} {{/if}} {{#if view.restore-backup}} {{> control/node/actions/partial-restore-backup }} {{/if}}
 
diff --git a/code/services-core/control-service/src/main/resources/templates/control/node/partial-node-nav.hdb b/code/services-core/control-service/src/main/resources/templates/control/node/partial-node-nav.hdb index bf51f439..ddb5ff4e 100644 --- a/code/services-core/control-service/src/main/resources/templates/control/node/partial-node-nav.hdb +++ b/code/services-core/control-service/src/main/resources/templates/control/node/partial-node-nav.hdb @@ -26,6 +26,7 @@
  • Sideload Dirtree
  • Export Database Data
  • +
  • Export Sample Crawl Data
  • Export From Crawl Data
  • Restore Index Backup
  • diff --git a/code/services-core/executor-service/src/main/java/nu/marginalia/actor/ExecutorActor.java b/code/services-core/executor-service/src/main/java/nu/marginalia/actor/ExecutorActor.java index 53d9601e..d06549ba 100644 --- a/code/services-core/executor-service/src/main/java/nu/marginalia/actor/ExecutorActor.java +++ b/code/services-core/executor-service/src/main/java/nu/marginalia/actor/ExecutorActor.java @@ -17,7 +17,7 @@ public enum ExecutorActor { EXPORT_FEEDS, PROC_INDEX_CONSTRUCTOR_SPAWNER, CONVERT, - RESTORE_BACKUP; + RESTORE_BACKUP, EXPORT_SAMPLE_DATA; public String id() { return "fsm:" + name().toLowerCase(); diff --git a/code/services-core/executor-service/src/main/java/nu/marginalia/actor/ExecutorActorControlService.java b/code/services-core/executor-service/src/main/java/nu/marginalia/actor/ExecutorActorControlService.java index b02fc12a..9ff2d1ed 100644 --- a/code/services-core/executor-service/src/main/java/nu/marginalia/actor/ExecutorActorControlService.java +++ b/code/services-core/executor-service/src/main/java/nu/marginalia/actor/ExecutorActorControlService.java @@ -45,6 +45,7 @@ public class ExecutorActorControlService { ExportDataActor exportDataActor, ExportAtagsActor exportAtagsActor, ExportFeedsActor exportFeedsActor, + ExportSampleDataActor exportSampleDataActor, ExportTermFreqActor exportTermFrequenciesActor, ExecutorActorStateMachines stateMachines) { this.messageQueueFactory = messageQueueFactory; @@ -68,10 +69,12 @@ public class ExecutorActorControlService { register(ExecutorActor.ADJACENCY_CALCULATION, triggerAdjacencyCalculationActor); register(ExecutorActor.CRAWL_JOB_EXTRACTOR, crawlJobExtractorActor); + register(ExecutorActor.EXPORT_DATA, exportDataActor); register(ExecutorActor.EXPORT_ATAGS, exportAtagsActor); - register(ExecutorActor.EXPORT_TERM_FREQUENCIES, exportTermFrequenciesActor); register(ExecutorActor.EXPORT_FEEDS, exportFeedsActor); + register(ExecutorActor.EXPORT_SAMPLE_DATA, exportSampleDataActor); + 
register(ExecutorActor.EXPORT_TERM_FREQUENCIES, exportTermFrequenciesActor); } private void register(ExecutorActor process, RecordActorPrototype graph) { diff --git a/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/ExportSampleDataActor.java b/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/ExportSampleDataActor.java new file mode 100644 index 00000000..9954f619 --- /dev/null +++ b/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/ExportSampleDataActor.java @@ -0,0 +1,79 @@ +package nu.marginalia.actor.task; + +import com.google.gson.Gson; +import com.google.inject.Inject; +import com.google.inject.Singleton; +import nu.marginalia.actor.prototype.RecordActorPrototype; +import nu.marginalia.actor.state.ActorStep; +import nu.marginalia.extractor.ExporterIf; +import nu.marginalia.extractor.FeedExporter; +import nu.marginalia.extractor.SampleDataExporter; +import nu.marginalia.storage.FileStorageService; +import nu.marginalia.storage.model.FileStorageBaseType; +import nu.marginalia.storage.model.FileStorageId; +import nu.marginalia.storage.model.FileStorageState; +import nu.marginalia.storage.model.FileStorageType; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.time.LocalDateTime; + +@Singleton +public class ExportSampleDataActor extends RecordActorPrototype { + private final FileStorageService storageService; + private final Logger logger = LoggerFactory.getLogger(getClass()); + + private final SampleDataExporter dataExporter; + public record Export(FileStorageId crawlId, int size, String name) implements ActorStep {} + public record Run(FileStorageId crawlId, FileStorageId destId, int size, String name) implements ActorStep {} + @Override + public ActorStep transition(ActorStep self) throws Exception { + return switch(self) { + case Export(FileStorageId crawlId, int size, String name) -> { + var storageBase = 
storageService.getStorageBase(FileStorageBaseType.STORAGE); + var storage = storageService.allocateTemporaryStorage(storageBase, FileStorageType.EXPORT, + "crawl-sample-export", + STR."Crawl Data Sample \{name}/\{size} \{LocalDateTime.now()}" + ); + + if (storage == null) yield new Error("Bad storage id"); + yield new Run(crawlId, storage.id(), size, name); + } + case Run(FileStorageId crawlId, FileStorageId destId, int size, String name) -> { + storageService.setFileStorageState(destId, FileStorageState.NEW); + + try { + dataExporter.export(crawlId, destId, size, name); + storageService.setFileStorageState(destId, FileStorageState.UNSET); + } + catch (Exception ex) { + storageService.setFileStorageState(destId, FileStorageState.DELETE); + + logger.error("Failed to export data", ex); + + yield new Error("Failed to export data"); + } + + yield new End(); + } + default -> new Error(); + }; + } + + + @Override + public String describe() { + return "Export RSS/Atom feeds from crawl data"; + } + + @Inject + public ExportSampleDataActor(Gson gson, + FileStorageService storageService, + SampleDataExporter dataExporter) + { + super(gson); + this.storageService = storageService; + this.dataExporter = dataExporter; + } + +} diff --git a/code/services-core/executor-service/src/main/java/nu/marginalia/executor/ExecutorSvc.java b/code/services-core/executor-service/src/main/java/nu/marginalia/executor/ExecutorSvc.java index ee4d0891..d4420624 100644 --- a/code/services-core/executor-service/src/main/java/nu/marginalia/executor/ExecutorSvc.java +++ b/code/services-core/executor-service/src/main/java/nu/marginalia/executor/ExecutorSvc.java @@ -72,6 +72,7 @@ public class ExecutorSvc extends Service { Spark.post("/sideload/encyclopedia", sideloadService::sideloadEncyclopedia); Spark.post("/export/atags", exportService::exportAtags); + Spark.post("/export/sample-data", exportService::exportSampleData); Spark.post("/export/feeds", exportService::exportFeeds); 
Spark.post("/export/termfreq", exportService::exportTermFrequencies); Spark.post("/export/data", exportService::exportData); diff --git a/code/services-core/executor-service/src/main/java/nu/marginalia/executor/svc/ExportService.java b/code/services-core/executor-service/src/main/java/nu/marginalia/executor/svc/ExportService.java index 2ada2d7c..388af5b5 100644 --- a/code/services-core/executor-service/src/main/java/nu/marginalia/executor/svc/ExportService.java +++ b/code/services-core/executor-service/src/main/java/nu/marginalia/executor/svc/ExportService.java @@ -6,6 +6,7 @@ import nu.marginalia.actor.ExecutorActorControlService; import nu.marginalia.actor.task.ConvertActor; import nu.marginalia.actor.task.ExportAtagsActor; import nu.marginalia.actor.task.ExportDataActor; +import nu.marginalia.actor.task.ExportSampleDataActor; import nu.marginalia.storage.model.FileStorageId; import spark.Request; import spark.Response; @@ -23,6 +24,15 @@ public class ExportService { return ""; } + /** Start the EXPORT_SAMPLE_DATA actor from an Export step built out of the "fid", "size" and "name" query parameters; always returns an empty string. */ public Object exportSampleData(Request request, Response response) throws Exception { + actorControlService.startFrom(ExecutorActor.EXPORT_SAMPLE_DATA, new ExportSampleDataActor.Export( + FileStorageId.parse(request.queryParams("fid")), + Integer.parseInt(request.queryParams("size")), + request.queryParams("name") + )); + return ""; + } + public Object exportAtags(Request request, Response response) throws Exception { actorControlService.startFrom(ExecutorActor.EXPORT_ATAGS, new ExportAtagsActor.Export(FileStorageId.parse(request.queryParams("fid")))); return "";