diff --git a/code/common/db/src/main/resources/db/migration/V23_09_1_000__filestorage_livedb.sql b/code/common/db/src/main/resources/db/migration/V23_09_1_000__drop_ecurl.sql similarity index 100% rename from code/common/db/src/main/resources/db/migration/V23_09_1_000__filestorage_livedb.sql rename to code/common/db/src/main/resources/db/migration/V23_09_1_000__drop_ecurl.sql diff --git a/code/common/db/src/main/resources/db/migration/V23_09_2_000__filestorage_backup.sql b/code/common/db/src/main/resources/db/migration/V23_09_2_000__filestorage_backup.sql new file mode 100644 index 00000000..15016501 --- /dev/null +++ b/code/common/db/src/main/resources/db/migration/V23_09_2_000__filestorage_backup.sql @@ -0,0 +1,3 @@ +INSERT IGNORE INTO FILE_STORAGE_BASE(NAME, PATH, TYPE, PERMIT_TEMP) +VALUES +('Backup Storage', '/backup', 'BACKUP', true); \ No newline at end of file diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConversionLog.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConversionLog.java index 10c11e21..865e6d6b 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConversionLog.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConversionLog.java @@ -2,15 +2,8 @@ package nu.marginalia.converting; import com.github.luben.zstd.RecyclingBufferPool; import com.github.luben.zstd.ZstdOutputStream; -import nu.marginalia.model.crawl.DomainIndexingState; -import nu.marginalia.model.idx.DocumentMetadata; import nu.marginalia.converting.instruction.Interpreter; -import nu.marginalia.keyword.model.DocumentKeywords; -import nu.marginalia.converting.instruction.instructions.DomainLink; -import nu.marginalia.converting.instruction.instructions.LoadProcessedDocument; import nu.marginalia.converting.instruction.instructions.LoadProcessedDocumentWithError; -import nu.marginalia.model.EdgeDomain; -import nu.marginalia.model.EdgeUrl; import 
java.io.BufferedOutputStream; import java.io.IOException; diff --git a/code/services-core/control-service/build.gradle b/code/services-core/control-service/build.gradle index c2cbcb24..3c2b5a0e 100644 --- a/code/services-core/control-service/build.gradle +++ b/code/services-core/control-service/build.gradle @@ -44,6 +44,7 @@ dependencies { implementation libs.prometheus implementation libs.notnull implementation libs.guice + implementation libs.zstd implementation libs.trove implementation libs.spark diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/ControlService.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/ControlService.java index 773f9597..ae8c9f74 100644 --- a/code/services-core/control-service/src/main/java/nu/marginalia/control/ControlService.java +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/ControlService.java @@ -80,6 +80,7 @@ public class ControlService extends Service { var storageRenderer = rendererFactory.renderer("control/storage-overview"); var storageSpecsRenderer = rendererFactory.renderer("control/storage-specs"); var storageCrawlsRenderer = rendererFactory.renderer("control/storage-crawls"); + var storageBackupsRenderer = rendererFactory.renderer("control/storage-backups"); var storageProcessedRenderer = rendererFactory.renderer("control/storage-processed"); var reviewRandomDomainsRenderer = rendererFactory.renderer("control/review-random-domains"); @@ -146,6 +147,7 @@ public class ControlService extends Service { Spark.get("/public/storage", this::storageModel, storageRenderer::render); Spark.get("/public/storage/specs", this::storageModelSpecs, storageSpecsRenderer::render); Spark.get("/public/storage/crawls", this::storageModelCrawls, storageCrawlsRenderer::render); + Spark.get("/public/storage/backups", this::storageModelBackups, storageBackupsRenderer::render); Spark.get("/public/storage/processed", this::storageModelProcessed, 
storageProcessedRenderer::render); Spark.get("/public/storage/:id", this::storageDetailsModel, storageDetailsRenderer::render); Spark.get("/public/storage/:id/file", controlFileStorageService::downloadFileFromStorage); @@ -157,6 +159,7 @@ public class ControlService extends Service { Spark.post("/public/storage/:fid/process", controlActorService::triggerProcessing, redirectToActors); Spark.post("/public/storage/:fid/process-and-load", controlActorService::triggerProcessingWithLoad, redirectToActors); Spark.post("/public/storage/:fid/load", controlActorService::loadProcessedData, redirectToActors); + Spark.post("/public/storage/:fid/restore-backup", controlActorService::restoreBackup, redirectToActors); Spark.post("/public/storage/specs", controlActorService::createCrawlSpecification, redirectToStorage); Spark.post("/public/storage/:fid/delete", controlFileStorageService::flagFileForDeletionRequest, redirectToStorage); @@ -359,6 +362,9 @@ public class ControlService extends Service { private Object storageModelCrawls(Request request, Response response) { return Map.of("storage", controlFileStorageService.getStorageList(FileStorageType.CRAWL_DATA)); } + private Object storageModelBackups(Request request, Response response) { + return Map.of("storage", controlFileStorageService.getStorageList(FileStorageType.BACKUP)); + } private Object storageModelProcessed(Request request, Response response) { return Map.of("storage", controlFileStorageService.getStorageList(FileStorageType.PROCESSED_DATA)); } diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/Actor.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/Actor.java index 23db8d8b..9eb625ce 100644 --- a/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/Actor.java +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/Actor.java @@ -15,7 +15,8 @@ public enum Actor { EXPORT_DATA, 
TRUNCATE_LINK_DATABASE, INDEX_CONSTRUCTOR_MONITOR, - CONVERT; + CONVERT, + RESTORE_BACKUP; public String id() { return "fsm:" + name().toLowerCase(); diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/ControlActors.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/ControlActors.java index 3575ecb2..3aea2bf9 100644 --- a/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/ControlActors.java +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/ControlActors.java @@ -38,6 +38,7 @@ public class ControlActors { ConvertAndLoadActor convertAndLoadActor, CrawlActor crawlActor, RecrawlActor recrawlActor, + RestoreBackupActor restoreBackupActor, ConverterMonitorActor converterMonitorFSM, CrawlerMonitorActor crawlerMonitorActor, LoaderMonitorActor loaderMonitor, @@ -57,6 +58,7 @@ public class ControlActors { register(Actor.CRAWL, crawlActor); register(Actor.RECRAWL, recrawlActor); register(Actor.CONVERT, convertActor); + register(Actor.RESTORE_BACKUP, restoreBackupActor); register(Actor.CONVERT_AND_LOAD, convertAndLoadActor); register(Actor.INDEX_CONSTRUCTOR_MONITOR, indexConstructorMonitorActor); diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/task/ConvertAndLoadActor.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/task/ConvertAndLoadActor.java index c8e2acb9..df9d87f0 100644 --- a/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/task/ConvertAndLoadActor.java +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/task/ConvertAndLoadActor.java @@ -9,6 +9,8 @@ import lombok.With; import nu.marginalia.actor.ActorStateFactory; import nu.marginalia.control.process.ProcessOutboxes; import nu.marginalia.control.process.ProcessService; +import nu.marginalia.control.svc.BackupService; +import 
nu.marginalia.db.storage.model.FileStorage; import nu.marginalia.index.client.IndexClient; import nu.marginalia.index.client.IndexMqEndpoints; import nu.marginalia.mqapi.converting.ConvertAction; @@ -27,11 +29,15 @@ import nu.marginalia.actor.state.ActorState; import nu.marginalia.actor.state.ActorResumeBehavior; import nu.marginalia.search.client.SearchClient; import nu.marginalia.search.client.SearchMqEndpoints; +import org.apache.commons.io.IOUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; - +import com.github.luben.zstd.ZstdOutputStream; +import java.io.IOException; import java.nio.file.Files; import java.nio.file.StandardCopyOption; +import java.sql.SQLException; +import java.time.LocalDateTime; @Singleton public class ConvertAndLoadActor extends AbstractActorPrototype { @@ -42,6 +48,7 @@ public class ConvertAndLoadActor extends AbstractActorPrototype { public static final String RECONVERT = "RECONVERT"; public static final String RECONVERT_WAIT = "RECONVERT-WAIT"; public static final String LOAD = "LOAD"; + public static final String BACKUP = "BACKUP"; public static final String REPARTITION = "REPARTITION"; public static final String REINDEX_FWD = "REINDEX_FWD"; public static final String REINDEX_FULL = "REINDEX_FULL"; @@ -56,6 +63,7 @@ public class ConvertAndLoadActor extends AbstractActorPrototype { private final MqOutbox indexOutbox; private final MqOutbox searchOutbox; private final FileStorageService storageService; + private final BackupService backupService; private final Gson gson; private final Logger logger = LoggerFactory.getLogger(getClass()); @@ -80,6 +88,7 @@ public class ConvertAndLoadActor extends AbstractActorPrototype { FileStorageService storageService, IndexClient indexClient, SearchClient searchClient, + BackupService backupService, Gson gson ) { @@ -91,6 +100,7 @@ public class ConvertAndLoadActor extends AbstractActorPrototype { this.mqLoaderOutbox = processOutboxes.getLoaderOutbox(); this.mqIndexConstructorOutbox = 
processOutboxes.getIndexConstructorOutbox(); this.storageService = storageService; + this.backupService = backupService; this.gson = gson; } @@ -163,12 +173,12 @@ public class ConvertAndLoadActor extends AbstractActorPrototype { @ActorState( name = LOAD, - next = REPARTITION, + next = BACKUP, resume = ActorResumeBehavior.RETRY, description = """ Instruct the loader to process the data """) - public void load(Message message) throws Exception { + public Message load(Message message) throws Exception { if (message.loaderMsgId <= 0) { var request = new LoadRequest(message.processedStorageId); long id = mqLoaderOutbox.sendAsync(LoadRequest.class.getSimpleName(), gson.toJson(request)); @@ -180,6 +190,18 @@ public class ConvertAndLoadActor extends AbstractActorPrototype { if (rsp.state() != MqMessageState.OK) error("Loader failed"); + return message; + } + + @ActorState( + name = BACKUP, + next = REPARTITION, + resume = ActorResumeBehavior.RETRY, + description = """ + Create a backup snapshot of the new data + """) + public void createBackup(Message message) throws SQLException, IOException { + backupService.createBackupFromStaging(message.processedStorageId); } @ActorState( diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/task/RestoreBackupActor.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/task/RestoreBackupActor.java new file mode 100644 index 00000000..96629208 --- /dev/null +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/task/RestoreBackupActor.java @@ -0,0 +1,49 @@ +package nu.marginalia.control.actor.task; + +import com.google.inject.Inject; +import nu.marginalia.actor.ActorStateFactory; +import nu.marginalia.actor.prototype.AbstractActorPrototype; +import nu.marginalia.actor.state.ActorResumeBehavior; +import nu.marginalia.actor.state.ActorState; +import nu.marginalia.control.actor.Actor; +import nu.marginalia.control.svc.BackupService; +import 
nu.marginalia.db.storage.model.FileStorageId; +import nu.marginalia.mq.persistence.MqPersistence; + + +public class RestoreBackupActor extends AbstractActorPrototype { + // States + + public static final String RESTORE = "RESTORE"; + public static final String END = "END"; + + private final BackupService backupService; + private final MqPersistence mqPersistence; + + @Override + public String describe() { + return "Restores a backed up set of index data"; + } + @Inject + public RestoreBackupActor(ActorStateFactory stateFactory, + MqPersistence mqPersistence, + BackupService backupService + ) { + super(stateFactory); + this.mqPersistence = mqPersistence; + this.backupService = backupService; + } + + @ActorState(name=RESTORE, next = END, resume = ActorResumeBehavior.ERROR) + public void restoreBackup(FileStorageId id) throws Exception { + backupService.restoreBackup(id); + + mqPersistence.sendNewMessage( + Actor.CONVERT_AND_LOAD.id(), + null, + null, + ConvertAndLoadActor.REPARTITION, + "", + null); + } +} diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/model/FileStorageWithActions.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/model/FileStorageWithActions.java index 803ad21a..d77d28f2 100644 --- a/code/services-core/control-service/src/main/java/nu/marginalia/control/model/FileStorageWithActions.java +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/model/FileStorageWithActions.java @@ -15,10 +15,16 @@ public record FileStorageWithActions(FileStorage storage) { public boolean isLoadable() { return storage.type() == FileStorageType.PROCESSED_DATA; } + public boolean isRestorable() { + return storage.type() == FileStorageType.BACKUP; + } public boolean isConvertible() { return storage.type() == FileStorageType.CRAWL_DATA; } public boolean isDeletable() { - return storage.base().type() == FileStorageBaseType.SLOW; + var baseType = storage.base().type(); + + return baseType == 
FileStorageBaseType.SLOW
+                || baseType == FileStorageBaseType.BACKUP;
     }
 }
diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/BackupService.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/BackupService.java
new file mode 100644
index 00000000..edf7fb55
--- /dev/null
+++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/BackupService.java
@@ -0,0 +1,115 @@
+package nu.marginalia.control.svc;
+
+import com.github.luben.zstd.ZstdInputStream;
+import com.github.luben.zstd.ZstdOutputStream;
+import nu.marginalia.db.storage.FileStorageService;
+import nu.marginalia.db.storage.model.FileStorage;
+import nu.marginalia.db.storage.model.FileStorageBaseType;
+import nu.marginalia.db.storage.model.FileStorageId;
+import nu.marginalia.db.storage.model.FileStorageType;
+import org.apache.commons.io.IOUtils;
+
+import javax.inject.Inject;
+import java.io.IOException;
+import java.nio.file.Files;
+import java.sql.SQLException;
+import java.time.LocalDateTime;
+
+/** Creates backup snapshots of the _STAGING storage areas and reads them back.
+ *  <p>
+ *  In a snapshot, links.db and dictionary.dat are stored as zstd streams,
+ *  while page-index.dat is stored as a plain copy.
+ */
+public class BackupService {
+
+    private final FileStorageService storageService;
+
+    @Inject
+    public BackupService(FileStorageService storageService) {
+        this.storageService = storageService;
+    }
+
+    /** Create a new backup of the contents in the _STAGING storage areas.
+     * This backup can later be rehydrated and quickly loaded into _LIVE.
+     *
+     * @param associatedId id of the storage the new backup storage is related to
+     * */
+    public void createBackupFromStaging(FileStorageId associatedId) throws SQLException, IOException {
+        var backupBase = storageService.getStorageBase(FileStorageBaseType.BACKUP);
+
+        String desc = "Pre-load backup snapshot " + LocalDateTime.now();
+
+        var backupStorage = storageService.allocateTemporaryStorage(backupBase, FileStorageType.BACKUP, "snapshot", desc);
+
+        storageService.relateFileStorages(associatedId, backupStorage.id());
+
+        var indexStagingStorage = storageService.getStorageByType(FileStorageType.INDEX_STAGING);
+        var linkdbStagingStorage = storageService.getStorageByType(FileStorageType.LINKDB_STAGING);
+        var lexiconStagingStorage = storageService.getStorageByType(FileStorageType.LEXICON_STAGING);
+
+        backupFileCompressed("links.db", linkdbStagingStorage, backupStorage);
+        backupFileCompressed("dictionary.dat", lexiconStagingStorage, backupStorage);
+        // This file format is already compressed
+        backupFileNoCompression("page-index.dat", indexStagingStorage, backupStorage);
+    }
+
+
+    /** Read back a backup into _STAGING
+     *
+     * @param backupId id of the backup storage to read from
+     */
+    public void restoreBackup(FileStorageId backupId) throws SQLException, IOException {
+        var backupStorage = storageService.getStorage(backupId);
+
+        var indexStagingStorage = storageService.getStorageByType(FileStorageType.INDEX_STAGING);
+        var linkdbStagingStorage = storageService.getStorageByType(FileStorageType.LINKDB_STAGING);
+        var lexiconStagingStorage = storageService.getStorageByType(FileStorageType.LEXICON_STAGING);
+
+        restoreBackupCompressed("links.db", linkdbStagingStorage, backupStorage);
+        restoreBackupCompressed("dictionary.dat", lexiconStagingStorage, backupStorage);
+        restoreBackupNoCompression("page-index.dat", indexStagingStorage, backupStorage);
+    }
+
+
+    /** Copy fileName from inputStorage into backupStorage without compression */
+    private void backupFileNoCompression(String fileName, FileStorage inputStorage, FileStorage backupStorage) throws IOException
+    {
+        try (var is = Files.newInputStream(inputStorage.asPath().resolve(fileName));
+             var os = Files.newOutputStream(backupStorage.asPath().resolve(fileName))
+        ) {
+            IOUtils.copyLarge(is, os);
+        }
+    }
+
+    /** Copy fileName from backupStorage into destStorage without decompression */
+    private void restoreBackupNoCompression(String fileName, FileStorage destStorage, FileStorage backupStorage) throws IOException {
+        try (var is = Files.newInputStream(backupStorage.asPath().resolve(fileName));
+             var os = Files.newOutputStream(destStorage.asPath().resolve(fileName))
+        ) {
+            IOUtils.copyLarge(is, os);
+        }
+    }
+
+    /** Write a zstd-compressed copy of fileName from inputStorage into backupStorage */
+    private void backupFileCompressed(String fileName, FileStorage inputStorage, FileStorage backupStorage) throws IOException
+    {
+        try (var is = Files.newInputStream(inputStorage.asPath().resolve(fileName));
+             var os = new ZstdOutputStream(Files.newOutputStream(backupStorage.asPath().resolve(fileName)))
+        ) {
+            IOUtils.copyLarge(is, os);
+        }
+    }
+
+    /** Decompress the zstd stream fileName in backupStorage into destStorage.
+     *  The output has to be opened in destStorage; opening it in backupStorage
+     *  would truncate the very file that is being read.
+     */
+    private void restoreBackupCompressed(String fileName, FileStorage destStorage, FileStorage backupStorage) throws IOException
+    {
+        try (var is = new ZstdInputStream(Files.newInputStream(backupStorage.asPath().resolve(fileName)));
+             var os = Files.newOutputStream(destStorage.asPath().resolve(fileName))
+        ) {
+            IOUtils.copyLarge(is, os);
+        }
+    }
+}
diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/ControlActorService.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/ControlActorService.java
index 7507e3d1..fbb2b818 100644
--- a/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/ControlActorService.java
+++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/ControlActorService.java
@@ -3,10 +3,7 @@ package nu.marginalia.control.svc;
 import com.google.inject.Inject;
 import com.google.inject.Singleton;
 import nu.marginalia.control.actor.ControlActors;
-import nu.marginalia.control.actor.task.ConvertActor;
-import nu.marginalia.control.actor.task.CrawlJobExtractorActor;
-import nu.marginalia.control.actor.task.ConvertAndLoadActor;
-import
nu.marginalia.control.actor.task.RecrawlActor; +import nu.marginalia.control.actor.task.*; import nu.marginalia.control.actor.Actor; import nu.marginalia.control.model.ActorRunState; import nu.marginalia.control.model.ActorStateGraph; @@ -158,4 +155,12 @@ public class ControlActorService { return ""; } + + public Object restoreBackup(Request request, Response response) throws Exception { + var fid = FileStorageId.parse(request.params("fid")); + controlActors.startFrom(Actor.RESTORE_BACKUP, RestoreBackupActor.RESTORE, fid); + return ""; + } + + } \ No newline at end of file diff --git a/code/services-core/control-service/src/main/resources/templates/control/partials/storage-types.hdb b/code/services-core/control-service/src/main/resources/templates/control/partials/storage-types.hdb index 575797f9..85c39898 100644 --- a/code/services-core/control-service/src/main/resources/templates/control/partials/storage-types.hdb +++ b/code/services-core/control-service/src/main/resources/templates/control/partials/storage-types.hdb @@ -3,4 +3,5 @@ Specifications Crawl Data Processed Data + Backups \ No newline at end of file diff --git a/code/services-core/control-service/src/main/resources/templates/control/storage-backups.hdb b/code/services-core/control-service/src/main/resources/templates/control/storage-backups.hdb new file mode 100644 index 00000000..b5450dd1 --- /dev/null +++ b/code/services-core/control-service/src/main/resources/templates/control/storage-backups.hdb @@ -0,0 +1,27 @@ + + + + Control Service + + + + + {{> control/partials/nav}} +
+ {{> control/partials/storage-types}} +

Backups

+ {{> control/partials/storage-table}} + +

About

+

Backups are compressed snapshots of index data, lexicon data and the document database.

+

Assuming no changes have been made to the binary format of these files, they are recoverable.

+
+ + + + + \ No newline at end of file diff --git a/code/services-core/control-service/src/main/resources/templates/control/storage-details.hdb b/code/services-core/control-service/src/main/resources/templates/control/storage-details.hdb index 65cbd144..ebb8f033 100644 --- a/code/services-core/control-service/src/main/resources/templates/control/storage-details.hdb +++ b/code/services-core/control-service/src/main/resources/templates/control/storage-details.hdb @@ -59,6 +59,14 @@ {{/if}} + {{#if isRestorable}} +
+ + Restore into live index + + +
+ {{/if}} {{#if isLoadable}}
diff --git a/docker-compose.yml b/docker-compose.yml index 0c5c3fd1..fd9c2a1a 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -3,6 +3,7 @@ x-svc: &service - "run/env/service.env" volumes: - vol:/vol + - backup:/backup - conf:/wmsa/conf:ro - model:/wmsa/model - data:/wmsa/data @@ -123,6 +124,12 @@ volumes: type: none o: bind device: run/vol + backup: + driver: local + driver_opts: + type: none + o: bind + device: run/backup logs: driver: local driver_opts: diff --git a/run/setup.sh b/run/setup.sh index 0ea929cf..71d736b7 100755 --- a/run/setup.sh +++ b/run/setup.sh @@ -18,7 +18,7 @@ function download_model { pushd $(dirname $0) -mkdir -p model logs db samples install vol/ir/{0,1}/ vol/{lr,lw} vol/iw/{0,1}/search-sets vol/{tmpf,tmps} vol/ss vol/{ldbw,ldbr} data samples/export +mkdir -p model logs db samples backup install vol/{ir,iw} vol/{lr,lw} vol/ss vol/{ldbw,ldbr} data samples/export download_model model/English.DICT https://raw.githubusercontent.com/datquocnguyen/RDRPOSTagger/master/Models/POS/English.DICT download_model model/English.RDR https://raw.githubusercontent.com/datquocnguyen/RDRPOSTagger/master/Models/POS/English.RDR