(index,control) Recoverable index backups

This commit is contained in:
Viktor Lofgren 2023-08-25 14:57:43 +02:00
parent e710e057e2
commit 194a6057dd
17 changed files with 245 additions and 17 deletions

View File

@ -0,0 +1,3 @@
-- Registers a storage base named 'Backup Storage' at path '/backup' with
-- type 'BACKUP' and PERMIT_TEMP enabled.  INSERT IGNORE is used so that
-- re-running the migration does not fail if the row cannot be inserted
-- (presumably because it already exists -- confirm against the table's keys).
INSERT IGNORE INTO FILE_STORAGE_BASE(NAME, PATH, TYPE, PERMIT_TEMP)
VALUES
('Backup Storage', '/backup', 'BACKUP', true);

View File

@ -2,15 +2,8 @@ package nu.marginalia.converting;
import com.github.luben.zstd.RecyclingBufferPool;
import com.github.luben.zstd.ZstdOutputStream;
import nu.marginalia.model.crawl.DomainIndexingState;
import nu.marginalia.model.idx.DocumentMetadata;
import nu.marginalia.converting.instruction.Interpreter;
import nu.marginalia.keyword.model.DocumentKeywords;
import nu.marginalia.converting.instruction.instructions.DomainLink;
import nu.marginalia.converting.instruction.instructions.LoadProcessedDocument;
import nu.marginalia.converting.instruction.instructions.LoadProcessedDocumentWithError;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
import java.io.BufferedOutputStream;
import java.io.IOException;

View File

@ -44,6 +44,7 @@ dependencies {
implementation libs.prometheus
implementation libs.notnull
implementation libs.guice
implementation libs.zstd
implementation libs.trove
implementation libs.spark

View File

@ -80,6 +80,7 @@ public class ControlService extends Service {
var storageRenderer = rendererFactory.renderer("control/storage-overview");
var storageSpecsRenderer = rendererFactory.renderer("control/storage-specs");
var storageCrawlsRenderer = rendererFactory.renderer("control/storage-crawls");
var storageBackupsRenderer = rendererFactory.renderer("control/storage-backups");
var storageProcessedRenderer = rendererFactory.renderer("control/storage-processed");
var reviewRandomDomainsRenderer = rendererFactory.renderer("control/review-random-domains");
@ -146,6 +147,7 @@ public class ControlService extends Service {
Spark.get("/public/storage", this::storageModel, storageRenderer::render);
Spark.get("/public/storage/specs", this::storageModelSpecs, storageSpecsRenderer::render);
Spark.get("/public/storage/crawls", this::storageModelCrawls, storageCrawlsRenderer::render);
Spark.get("/public/storage/backups", this::storageModelBackups, storageBackupsRenderer::render);
Spark.get("/public/storage/processed", this::storageModelProcessed, storageProcessedRenderer::render);
Spark.get("/public/storage/:id", this::storageDetailsModel, storageDetailsRenderer::render);
Spark.get("/public/storage/:id/file", controlFileStorageService::downloadFileFromStorage);
@ -157,6 +159,7 @@ public class ControlService extends Service {
Spark.post("/public/storage/:fid/process", controlActorService::triggerProcessing, redirectToActors);
Spark.post("/public/storage/:fid/process-and-load", controlActorService::triggerProcessingWithLoad, redirectToActors);
Spark.post("/public/storage/:fid/load", controlActorService::loadProcessedData, redirectToActors);
Spark.post("/public/storage/:fid/restore-backup", controlActorService::restoreBackup, redirectToActors);
Spark.post("/public/storage/specs", controlActorService::createCrawlSpecification, redirectToStorage);
Spark.post("/public/storage/:fid/delete", controlFileStorageService::flagFileForDeletionRequest, redirectToStorage);
@ -359,6 +362,9 @@ public class ControlService extends Service {
/** Builds the template model for the crawl data listing: a single
 *  "storage" entry holding the storages of type CRAWL_DATA. */
private Object storageModelCrawls(Request request, Response response) {
    var crawlStorages = controlFileStorageService.getStorageList(FileStorageType.CRAWL_DATA);

    return Map.of("storage", crawlStorages);
}
/** Builds the template model for the backups listing: a single
 *  "storage" entry holding the storages of type BACKUP. */
private Object storageModelBackups(Request request, Response response) {
    var backupStorages = controlFileStorageService.getStorageList(FileStorageType.BACKUP);

    return Map.of("storage", backupStorages);
}
/** Builds the template model for the processed data listing: a single
 *  "storage" entry holding the storages of type PROCESSED_DATA. */
private Object storageModelProcessed(Request request, Response response) {
    var processedStorages = controlFileStorageService.getStorageList(FileStorageType.PROCESSED_DATA);

    return Map.of("storage", processedStorages);
}

View File

@ -15,7 +15,8 @@ public enum Actor {
EXPORT_DATA,
TRUNCATE_LINK_DATABASE,
INDEX_CONSTRUCTOR_MONITOR,
CONVERT;
CONVERT,
RESTORE_BACKUP;
public String id() {
return "fsm:" + name().toLowerCase();

View File

@ -38,6 +38,7 @@ public class ControlActors {
ConvertAndLoadActor convertAndLoadActor,
CrawlActor crawlActor,
RecrawlActor recrawlActor,
RestoreBackupActor restoreBackupActor,
ConverterMonitorActor converterMonitorFSM,
CrawlerMonitorActor crawlerMonitorActor,
LoaderMonitorActor loaderMonitor,
@ -57,6 +58,7 @@ public class ControlActors {
register(Actor.CRAWL, crawlActor);
register(Actor.RECRAWL, recrawlActor);
register(Actor.CONVERT, convertActor);
register(Actor.RESTORE_BACKUP, restoreBackupActor);
register(Actor.CONVERT_AND_LOAD, convertAndLoadActor);
register(Actor.INDEX_CONSTRUCTOR_MONITOR, indexConstructorMonitorActor);

View File

@ -9,6 +9,8 @@ import lombok.With;
import nu.marginalia.actor.ActorStateFactory;
import nu.marginalia.control.process.ProcessOutboxes;
import nu.marginalia.control.process.ProcessService;
import nu.marginalia.control.svc.BackupService;
import nu.marginalia.db.storage.model.FileStorage;
import nu.marginalia.index.client.IndexClient;
import nu.marginalia.index.client.IndexMqEndpoints;
import nu.marginalia.mqapi.converting.ConvertAction;
@ -27,11 +29,15 @@ import nu.marginalia.actor.state.ActorState;
import nu.marginalia.actor.state.ActorResumeBehavior;
import nu.marginalia.search.client.SearchClient;
import nu.marginalia.search.client.SearchMqEndpoints;
import org.apache.commons.io.IOUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.github.luben.zstd.ZstdOutputStream;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.StandardCopyOption;
import java.sql.SQLException;
import java.time.LocalDateTime;
@Singleton
public class ConvertAndLoadActor extends AbstractActorPrototype {
@ -42,6 +48,7 @@ public class ConvertAndLoadActor extends AbstractActorPrototype {
public static final String RECONVERT = "RECONVERT";
public static final String RECONVERT_WAIT = "RECONVERT-WAIT";
public static final String LOAD = "LOAD";
public static final String BACKUP = "BACKUP";
public static final String REPARTITION = "REPARTITION";
public static final String REINDEX_FWD = "REINDEX_FWD";
public static final String REINDEX_FULL = "REINDEX_FULL";
@ -56,6 +63,7 @@ public class ConvertAndLoadActor extends AbstractActorPrototype {
private final MqOutbox indexOutbox;
private final MqOutbox searchOutbox;
private final FileStorageService storageService;
private final BackupService backupService;
private final Gson gson;
private final Logger logger = LoggerFactory.getLogger(getClass());
@ -80,6 +88,7 @@ public class ConvertAndLoadActor extends AbstractActorPrototype {
FileStorageService storageService,
IndexClient indexClient,
SearchClient searchClient,
BackupService backupService,
Gson gson
)
{
@ -91,6 +100,7 @@ public class ConvertAndLoadActor extends AbstractActorPrototype {
this.mqLoaderOutbox = processOutboxes.getLoaderOutbox();
this.mqIndexConstructorOutbox = processOutboxes.getIndexConstructorOutbox();
this.storageService = storageService;
this.backupService = backupService;
this.gson = gson;
}
@ -163,12 +173,12 @@ public class ConvertAndLoadActor extends AbstractActorPrototype {
@ActorState(
name = LOAD,
next = REPARTITION,
next = BACKUP,
resume = ActorResumeBehavior.RETRY,
description = """
Instruct the loader to process the data
""")
public void load(Message message) throws Exception {
public Message load(Message message) throws Exception {
if (message.loaderMsgId <= 0) {
var request = new LoadRequest(message.processedStorageId);
long id = mqLoaderOutbox.sendAsync(LoadRequest.class.getSimpleName(), gson.toJson(request));
@ -180,6 +190,18 @@ public class ConvertAndLoadActor extends AbstractActorPrototype {
if (rsp.state() != MqMessageState.OK)
error("Loader failed");
return message;
}
/** BACKUP state: snapshots the staging data via the backup service and
 *  associates the snapshot with the processed storage carried by the message.
 *  On completion the actor proceeds to REPARTITION.
 *
 *  @param message actor state message; only {@code processedStorageId} is read here
 *  @throws SQLException propagated from the backup service
 *  @throws IOException  propagated from the backup service
 */
@ActorState(
        name = BACKUP,
        next = REPARTITION,
        resume = ActorResumeBehavior.RETRY,
        description = """
        Create a backup snapshot of the new data
        """)
public void createBackup(Message message) throws SQLException, IOException {
    var processedId = message.processedStorageId;

    backupService.createBackupFromStaging(processedId);
}
@ActorState(

View File

@ -0,0 +1,49 @@
package nu.marginalia.control.actor.task;
import com.google.inject.Inject;
import nu.marginalia.actor.ActorStateFactory;
import nu.marginalia.actor.prototype.AbstractActorPrototype;
import nu.marginalia.actor.state.ActorResumeBehavior;
import nu.marginalia.actor.state.ActorState;
import nu.marginalia.control.actor.Actor;
import nu.marginalia.control.svc.BackupService;
import nu.marginalia.db.storage.model.FileStorageId;
import nu.marginalia.mq.persistence.MqPersistence;
public class RestoreBackupActor extends AbstractActorPrototype {
// States
public static final String RESTORE = "RESTORE";
public static final String END = "END";
private final BackupService backupService;
private final MqPersistence mqPersistence;
@Override
public String describe() {
return "Restores a backed up set of index data";
}
@Inject
public RestoreBackupActor(ActorStateFactory stateFactory,
MqPersistence mqPersistence,
BackupService backupService
) {
super(stateFactory);
this.mqPersistence = mqPersistence;
this.backupService = backupService;
}
@ActorState(name=RESTORE, next = END, resume = ActorResumeBehavior.ERROR)
public void restoreBackup(FileStorageId id) throws Exception {
backupService.restoreBackup(id);
mqPersistence.sendNewMessage(
Actor.CONVERT_AND_LOAD.id(),
null,
null,
ConvertAndLoadActor.REPARTITION,
"",
null);
}
}

View File

@ -15,10 +15,16 @@ public record FileStorageWithActions(FileStorage storage) {
/** @return true when the wrapped storage holds PROCESSED_DATA */
public boolean isLoadable() {
    var type = storage.type();

    return FileStorageType.PROCESSED_DATA == type;
}
/** @return true when the wrapped storage is of type BACKUP */
public boolean isRestorable() {
    var type = storage.type();

    return FileStorageType.BACKUP == type;
}
/** @return true when the wrapped storage holds CRAWL_DATA */
public boolean isConvertible() {
    var type = storage.type();

    return FileStorageType.CRAWL_DATA == type;
}
/** A storage is deletable when the base it lives on is of type SLOW or BACKUP.
 *
 *  @return true if the storage's base type is SLOW or BACKUP
 */
public boolean isDeletable() {
    // The earlier SLOW-only return statement is dropped: left in place ahead of
    // this logic it would make the BACKUP check unreachable.
    var baseType = storage.base().type();

    return baseType == FileStorageBaseType.SLOW
        || baseType == FileStorageBaseType.BACKUP;
}
}

View File

@ -0,0 +1,97 @@
package nu.marginalia.control.svc;
import com.github.luben.zstd.ZstdInputStream;
import com.github.luben.zstd.ZstdOutputStream;
import nu.marginalia.db.storage.FileStorageService;
import nu.marginalia.db.storage.model.FileStorage;
import nu.marginalia.db.storage.model.FileStorageBaseType;
import nu.marginalia.db.storage.model.FileStorageId;
import nu.marginalia.db.storage.model.FileStorageType;
import org.apache.commons.io.IOUtils;
import javax.inject.Inject;
import java.io.IOException;
import java.nio.file.Files;
import java.sql.SQLException;
import java.time.LocalDateTime;
/** Creates and restores backup snapshots of the files kept in the
 *  index, link database and lexicon _STAGING storage areas.
 */
public class BackupService {

    private final FileStorageService storageService;

    @Inject
    public BackupService(FileStorageService storageService) {
        this.storageService = storageService;
    }

    /** Create a new backup of the contents in the _STAGING storage areas.
     * This backup can later be rehydrated and quickly loaded into _LIVE.
     *
     * @param associatedId storage that the new backup storage is related to
     * @throws SQLException propagated from the storage service
     * @throws IOException  if any of the files cannot be read or written
     * */
    public void createBackupFromStaging(FileStorageId associatedId) throws SQLException, IOException {
        var backupBase = storageService.getStorageBase(FileStorageBaseType.BACKUP);

        String desc = "Pre-load backup snapshot " + LocalDateTime.now();

        var backupStorage = storageService.allocateTemporaryStorage(backupBase, FileStorageType.BACKUP, "snapshot", desc);

        storageService.relateFileStorages(associatedId, backupStorage.id());

        var indexStagingStorage = storageService.getStorageByType(FileStorageType.INDEX_STAGING);
        var linkdbStagingStorage = storageService.getStorageByType(FileStorageType.LINKDB_STAGING);
        var lexiconStagingStorage = storageService.getStorageByType(FileStorageType.LEXICON_STAGING);

        backupFileCompressed("links.db", linkdbStagingStorage, backupStorage);
        backupFileCompressed("dictionary.dat", lexiconStagingStorage, backupStorage);

        // This file format is already compressed
        backupFileNoCompression("page-index.dat", indexStagingStorage, backupStorage);
    }

    /** Read back a backup into _STAGING
     *
     * @param backupId id of the backup storage to read from
     * @throws SQLException propagated from the storage service
     * @throws IOException  if any of the files cannot be read or written
     */
    public void restoreBackup(FileStorageId backupId) throws SQLException, IOException {
        var backupStorage = storageService.getStorage(backupId);

        var indexStagingStorage = storageService.getStorageByType(FileStorageType.INDEX_STAGING);
        var linkdbStagingStorage = storageService.getStorageByType(FileStorageType.LINKDB_STAGING);
        var lexiconStagingStorage = storageService.getStorageByType(FileStorageType.LEXICON_STAGING);

        restoreBackupCompressed("links.db", linkdbStagingStorage, backupStorage);
        restoreBackupCompressed("dictionary.dat", lexiconStagingStorage, backupStorage);
        restoreBackupNoCompression("page-index.dat", indexStagingStorage, backupStorage);
    }

    /** Copy fileName verbatim from inputStorage into backupStorage. */
    private void backupFileNoCompression(String fileName, FileStorage inputStorage, FileStorage backupStorage) throws IOException
    {
        try (var is = Files.newInputStream(inputStorage.asPath().resolve(fileName));
             var os = Files.newOutputStream(backupStorage.asPath().resolve(fileName))
        ) {
            IOUtils.copyLarge(is, os);
        }
    }

    /** Copy fileName verbatim from backupStorage into destStorage. */
    private void restoreBackupNoCompression(String fileName, FileStorage destStorage, FileStorage backupStorage) throws IOException
    {
        try (var is = Files.newInputStream(backupStorage.asPath().resolve(fileName));
             var os = Files.newOutputStream(destStorage.asPath().resolve(fileName))
        ) {
            IOUtils.copyLarge(is, os);
        }
    }

    /** Copy fileName from inputStorage into backupStorage, zstd-compressing it on the way. */
    private void backupFileCompressed(String fileName, FileStorage inputStorage, FileStorage backupStorage) throws IOException
    {
        try (var is = Files.newInputStream(inputStorage.asPath().resolve(fileName));
             var os = new ZstdOutputStream(Files.newOutputStream(backupStorage.asPath().resolve(fileName)))
        ) {
            IOUtils.copyLarge(is, os);
        }
    }

    /** Copy fileName from backupStorage into destStorage, zstd-decompressing it on the way. */
    private void restoreBackupCompressed(String fileName, FileStorage destStorage, FileStorage backupStorage) throws IOException
    {
        // The output must be opened on the destination storage.  Opening it on
        // the backup path would truncate the very file being read and leave
        // the destination untouched.
        try (var is = new ZstdInputStream(Files.newInputStream(backupStorage.asPath().resolve(fileName)));
             var os = Files.newOutputStream(destStorage.asPath().resolve(fileName))
        ) {
            IOUtils.copyLarge(is, os);
        }
    }
}

View File

@ -3,10 +3,7 @@ package nu.marginalia.control.svc;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import nu.marginalia.control.actor.ControlActors;
import nu.marginalia.control.actor.task.ConvertActor;
import nu.marginalia.control.actor.task.CrawlJobExtractorActor;
import nu.marginalia.control.actor.task.ConvertAndLoadActor;
import nu.marginalia.control.actor.task.RecrawlActor;
import nu.marginalia.control.actor.task.*;
import nu.marginalia.control.actor.Actor;
import nu.marginalia.control.model.ActorRunState;
import nu.marginalia.control.model.ActorStateGraph;
@ -158,4 +155,12 @@ public class ControlActorService {
return "";
}
/** Request handler that starts the RESTORE_BACKUP actor in its RESTORE state
 *  for the storage id given by the "fid" path parameter.
 *
 *  @return an empty string
 *  @throws Exception if the id cannot be parsed or the actor cannot be started
 */
public Object restoreBackup(Request request, Response response) throws Exception {
    String fidParam = request.params("fid");
    var backupId = FileStorageId.parse(fidParam);

    controlActors.startFrom(Actor.RESTORE_BACKUP, RestoreBackupActor.RESTORE, backupId);

    return "";
}
}

View File

@ -3,4 +3,5 @@
<a href="/storage/specs">Specifications</a>
<a href="/storage/crawls">Crawl Data</a>
<a href="/storage/processed">Processed Data</a>
<a href="/storage/backups">Backups</a>
</nav>

View File

@ -0,0 +1,27 @@
<!DOCTYPE html>
<html>
<head>
<title>Control Service</title>
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<link rel="stylesheet" href="/style.css" />
</head>
<body>
{{!-- Backups listing page: navigation, storage type links and the shared storage table partial, followed by a short explanation. --}}
{{> control/partials/nav}}
<section>
{{> control/partials/storage-types}}
<h1>Backups</h1>
{{> control/partials/storage-table}}
<h2>About</h2>
<p>Backups are compressed snapshots of index data, lexicon data and the document database.</p>
<p>Assuming no changes have been made to the binary format of these files, they are recoverable.</p>
</section>
</body>
<script src="/refresh.js"></script>
<script>
// Call refresh() for "storage" every 30 seconds (30000 ms).
// refresh() is defined in /refresh.js; presumably it reloads the element
// with that id in place -- confirm against refresh.js.
window.setInterval(() => {
refresh(["storage"]);
}, 30000);
</script>
</html>

View File

@ -59,6 +59,14 @@
</tr>
</form>
{{/if}}
{{#if isRestorable}}
<form method="post" action="/storage/{{storage.id}}/restore-backup" onsubmit="return confirm('Confirm restoring backup {{storage.path}}')">
<tr>
<td>Restore into live index</td>
<td><button type="submit">Restore</button></td>
</tr>
</form>
{{/if}}
{{#if isLoadable}}
<form method="post" action="/storage/{{storage.id}}/load" onsubmit="return confirm('Confirm loading of {{storage.path}}')">
<tr>

View File

@ -3,6 +3,7 @@ x-svc: &service
- "run/env/service.env"
volumes:
- vol:/vol
- backup:/backup
- conf:/wmsa/conf:ro
- model:/wmsa/model
- data:/wmsa/data
@ -123,6 +124,12 @@ volumes:
type: none
o: bind
device: run/vol
backup:
driver: local
driver_opts:
type: none
o: bind
device: run/backup
logs:
driver: local
driver_opts:

View File

@ -18,7 +18,7 @@ function download_model {
pushd $(dirname $0)
mkdir -p model logs db samples install vol/ir/{0,1}/ vol/{lr,lw} vol/iw/{0,1}/search-sets vol/{tmpf,tmps} vol/ss vol/{ldbw,ldbr} data samples/export
mkdir -p model logs db samples backup install vol/{ir,iw} vol/{lr,lw} vol/ss vol/{ldbw,ldbr} data samples/export
download_model model/English.DICT https://raw.githubusercontent.com/datquocnguyen/RDRPOSTagger/master/Models/POS/English.DICT
download_model model/English.RDR https://raw.githubusercontent.com/datquocnguyen/RDRPOSTagger/master/Models/POS/English.RDR