feat(volsnap): volume snapshot restore (backlog #6)
Restore a captured volume snapshot onto an image workload's live host-bind
data volumes, then redeploy — the most destructive workload action, built to
the adversarially-reviewed design (C1–C6) with all data-loss guards.
- Engine.Restore (engine-owned): all-or-nothing pre-flight re-resolution from
the workload's CURRENT config (never the tamperable manifest), per-filesystem
disk pre-check, per-workload lock, container quiesce, extract-to-tmp, durable
pre-restore snapshot, write-ahead journal, atomic rename swap, redeploy, and
crash-recovery sweep (RecoverInterruptedRestores) wired before serving.
- internal/keyedmutex: shared per-key lock; deployer now serializes every
deploy entrypoint per workload via DispatchPlugin (+ LockWorkload/RedeployLocked
for the restore re-dispatch, no deadlock).
- Untrusted-archive extractor: zip-slip containment, type allow-list (reg/dir
only), decompression-bomb cap, manifest-index bounds.
- POST /api/workloads/{id}/snapshots/{sid}/restore: admin, X-Confirm-Restore
header (CSRF), per-workload single-flight (409).
- WebUI: Restore button + danger ConfirmDialog + busy state + i18n (en/ru).
Scope: image-source only; scopes absolute/stage/project (driven off the same
supportedScopes constant capture uses).
Plan-reviewed before coding; per-phase go/security/ts reviews; final review
READY TO MERGE. Security review caught + fixed a CRITICAL manifest-Source path
traversal (re-derive target from current config + base containment).
Plan: plans/volume-snapshot-restore/
This commit is contained in:
@@ -0,0 +1,400 @@
|
||||
package volsnap
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
|
||||
"github.com/google/uuid"
|
||||
|
||||
"github.com/alexei/tinyforge/internal/store"
|
||||
)
|
||||
|
||||
// Lifecycle is the deploy-side seam Engine.Restore needs but volsnap must not
|
||||
// import directly (it would couple the snapshot package to the deployer/docker
|
||||
// packages). The composition root supplies an adapter over the Deployer +
|
||||
// Docker client via Engine.SetLifecycle.
|
||||
type Lifecycle interface {
|
||||
// Lock acquires the per-workload deploy lock (C1) and returns the release
|
||||
// func. Held by Restore across stop→swap→redeploy.
|
||||
Lock(workloadID string) func()
|
||||
// StopContainers stops every running container for the workload (C4 quiesce)
|
||||
// and returns the image tag the newest running container was on, so the
|
||||
// redeploy can bring the SAME version back up ("" ⇒ source default tag).
|
||||
StopContainers(ctx context.Context, workloadID string) (runningTag string, err error)
|
||||
// Redeploy re-dispatches the workload's current config WITHOUT re-acquiring
|
||||
// the per-workload lock (the caller holds it). reference pins the image tag.
|
||||
Redeploy(ctx context.Context, w store.Workload, reference string) error
|
||||
}
|
||||
|
||||
// SetLifecycle wires the deploy-side seam. Pass nil to leave restore disabled.
|
||||
func (e *Engine) SetLifecycle(lc Lifecycle) { e.lifecycle = lc }
|
||||
|
||||
// restoreJournal is the on-disk write-ahead record of an in-flight restore.
|
||||
// Written before the first destructive rename and deleted on completion; the
|
||||
// startup RecoverInterruptedRestores sweep replays it after a crash.
|
||||
type restoreJournal struct {
|
||||
SnapshotID string `json:"snapshot_id"`
|
||||
WorkloadID string `json:"workload_id"`
|
||||
Volumes []journalVolume `json:"volumes"`
|
||||
}
|
||||
|
||||
type journalVolume struct {
|
||||
Live string `json:"live"`
|
||||
Old string `json:"old"`
|
||||
Tmp string `json:"tmp"`
|
||||
Swapped bool `json:"swapped"`
|
||||
HadOld bool `json:"had_old"`
|
||||
}
|
||||
|
||||
// staged pairs a resolved volume with its per-restore staging dirs.
|
||||
type staged struct {
|
||||
rv resolvedVol
|
||||
tmp string
|
||||
old string
|
||||
}
|
||||
|
||||
// Restore overwrites the workload's live host-bind volumes with a snapshot's
|
||||
// contents and brings the app back up. It is the single, engine-owned entry
|
||||
// point for the data-loss-sensitive restore flow (image-source workloads only).
|
||||
//
|
||||
// Ordering is deliberate and crash-aware:
|
||||
//
|
||||
// pre-flight (re-resolve all volumes C3, size + per-fs disk check C5) — abort
|
||||
// here touches nothing
|
||||
// → Lock (C1) → re-validate workload → StopContainers (C4 quiesce)
|
||||
// → extract ALL volumes to sibling .tmp staging dirs (reads the source archive
|
||||
// fully BEFORE the next step can prune it; shrinks the later destructive
|
||||
// window to pure renames — R3)
|
||||
// → capture a pre-restore snapshot (durable escape hatch, after quiesce,
|
||||
// before any destructive rename — folded suggestion)
|
||||
// → write the restore journal (R3 crash recovery)
|
||||
// → swap each volume atomically (rename live→.old, .tmp→live — C2)
|
||||
// → Redeploy (C4 — image containers are recreated, never reused)
|
||||
// → remove .old + journal, emit audit event
|
||||
//
|
||||
// Engine.Restore holds NO e.mu (R1): per-workload serialization is the
|
||||
// Lifecycle lock; e.Create takes its own e.mu for the pre-restore archive
|
||||
// write, so calling it here cannot self-deadlock.
|
||||
func (e *Engine) Restore(ctx context.Context, snapshotID, workloadID string) error {
|
||||
if e.lifecycle == nil {
|
||||
return fmt.Errorf("restore: lifecycle not configured")
|
||||
}
|
||||
|
||||
snap, err := e.store.GetVolumeSnapshot(snapshotID)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if snap.WorkloadID != workloadID {
|
||||
return fmt.Errorf("snapshot %s does not belong to workload %s", snapshotID, workloadID)
|
||||
}
|
||||
w, err := e.store.GetWorkloadByID(workloadID)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if w.SourceKind != "image" {
|
||||
return fmt.Errorf("restore is only supported for image-source workloads")
|
||||
}
|
||||
settings, err := e.store.GetSettings()
|
||||
if err != nil {
|
||||
return fmt.Errorf("load settings: %w", err)
|
||||
}
|
||||
|
||||
manifest, err := parseManifest(snap)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
resolved, err := preflightResolve(e.store, w, settings, manifest) // C3 all-or-nothing
|
||||
if err != nil {
|
||||
return fmt.Errorf("pre-flight: %w", err)
|
||||
}
|
||||
archivePath, err := e.FilePath(snap)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
perIndex, _, err := archiveUncompressedSize(archivePath, maxRestoreUncompressedBytes)
|
||||
if err != nil {
|
||||
return fmt.Errorf("size snapshot: %w", err)
|
||||
}
|
||||
if err := checkDiskSpace(resolved, perIndex); err != nil { // C5
|
||||
return err
|
||||
}
|
||||
|
||||
// ── past pre-flight: take the per-workload lock and quiesce ──────────────
|
||||
unlock := e.lifecycle.Lock(workloadID)
|
||||
defer unlock()
|
||||
|
||||
// A teardown may have won the lock and deleted the workload while we waited.
|
||||
if _, err := e.store.GetWorkloadByID(workloadID); err != nil {
|
||||
return fmt.Errorf("workload disappeared before restore: %w", err)
|
||||
}
|
||||
|
||||
tag, err := e.lifecycle.StopContainers(ctx, workloadID) // C4 stop
|
||||
if err != nil {
|
||||
return fmt.Errorf("stop containers: %w", err)
|
||||
}
|
||||
|
||||
// Extract every volume to its staging dir FIRST. This reads the source
|
||||
// archive fully before the pre-restore capture below can prune it, and
|
||||
// leaves only pure renames for the destructive phase (R3).
|
||||
token := uuid.New().String()[:8]
|
||||
stagedVols := make([]staged, 0, len(resolved))
|
||||
for _, rv := range resolved {
|
||||
tmp, old := stagingDirs(rv.LivePath, token, rv.Index)
|
||||
if _, exErr := safeExtractIndex(archivePath, rv.Index, tmp, maxRestoreUncompressedBytes); exErr != nil {
|
||||
cleanupStaging(stagedVols)
|
||||
_ = os.RemoveAll(tmp)
|
||||
// Nothing swapped yet — bring the app back up on its original data.
|
||||
e.redeployAfterAbort(ctx, w, tag)
|
||||
return fmt.Errorf("extract volume %q: %w", rv.Target, exErr)
|
||||
}
|
||||
stagedVols = append(stagedVols, staged{rv: rv, tmp: tmp, old: old})
|
||||
}
|
||||
|
||||
// Durable pre-restore snapshot (escape hatch). Quiesced (after stop), and
|
||||
// the source archive is already fully extracted so a prune here is harmless.
|
||||
// Best-effort, matching the DB-restore precedent: a failure is logged but
|
||||
// does not abort — the .old dirs + journal are the in-operation safety net.
|
||||
if _, err := e.Create(w, settings, "pre-restore"); err != nil {
|
||||
slog.Warn("restore: pre-restore snapshot failed (continuing)",
|
||||
"workload", workloadID, "error", err)
|
||||
}
|
||||
|
||||
// Journal before the first destructive rename so a crash can be recovered.
|
||||
jr := restoreJournal{SnapshotID: snapshotID, WorkloadID: workloadID}
|
||||
for _, sv := range stagedVols {
|
||||
jr.Volumes = append(jr.Volumes, journalVolume{Live: sv.rv.LivePath, Old: sv.old, Tmp: sv.tmp})
|
||||
}
|
||||
if err := e.writeJournal(jr); err != nil {
|
||||
cleanupStaging(stagedVols)
|
||||
e.redeployAfterAbort(ctx, w, tag)
|
||||
return fmt.Errorf("write restore journal: %w", err)
|
||||
}
|
||||
|
||||
// ── destructive phase: pure atomic renames ──────────────────────────────
|
||||
done := make([]swap, 0, len(stagedVols))
|
||||
for i, sv := range stagedVols {
|
||||
hadOld, swErr := swapVolumeDir(sv.rv.LivePath, sv.tmp, sv.old)
|
||||
if swErr != nil {
|
||||
rollbackSwaps(done) // restore already-swapped volumes
|
||||
cleanupStagingFrom(stagedVols, i) // drop remaining un-swapped tmp/old
|
||||
e.removeJournal(workloadID)
|
||||
e.redeployAfterAbort(ctx, w, tag)
|
||||
return fmt.Errorf("swap volume %q: %w", sv.rv.Target, swErr)
|
||||
}
|
||||
done = append(done, swap{live: sv.rv.LivePath, old: sv.old, tmp: sv.tmp, hadOld: hadOld})
|
||||
jr.Volumes[i].Swapped = true
|
||||
jr.Volumes[i].HadOld = hadOld
|
||||
_ = e.writeJournal(jr) // progress checkpoint (best-effort)
|
||||
}
|
||||
|
||||
// Bring the app back up against the restored data (C4 — recreate, redeploy).
|
||||
if err := e.lifecycle.Redeploy(ctx, w, tag); err != nil {
|
||||
// The data IS restored; only the app failed to come back. Do NOT roll
|
||||
// back the volumes — surface the redeploy error so the operator retries
|
||||
// a deploy. Clean up the .old set-asides and the journal.
|
||||
cleanupOld(done)
|
||||
e.removeJournal(workloadID)
|
||||
return fmt.Errorf("redeploy after restore: %w", err)
|
||||
}
|
||||
|
||||
cleanupOld(done)
|
||||
e.removeJournal(workloadID)
|
||||
e.emitRestoreEvent(workloadID, snapshotID, len(done))
|
||||
slog.Info("volume snapshot restored", "workload", workloadID, "snapshot", snapshotID, "volumes", len(done))
|
||||
return nil
|
||||
}
|
||||
|
||||
// redeployAfterAbort re-dispatches after an aborted restore so a stopped app
|
||||
// does not stay down. Best-effort: the error is logged, not returned (the
|
||||
// restore failure is the primary error the caller surfaces).
|
||||
func (e *Engine) redeployAfterAbort(ctx context.Context, w store.Workload, tag string) {
|
||||
if err := e.lifecycle.Redeploy(ctx, w, tag); err != nil {
|
||||
slog.Warn("restore: redeploy after abort failed", "workload", w.ID, "error", err)
|
||||
}
|
||||
}
|
||||
|
||||
// RecoverInterruptedRestores replays restore journals left by a crash mid-
|
||||
// restore, mirroring CleanOrphans (run once at startup, before serving). For
|
||||
// each volume: a completed swap keeps the restored live dir and drops the set-
|
||||
// aside original; an incomplete swap that left live missing is reverted from
|
||||
// .old; stray staging dirs are removed. Returns the number of journals handled.
|
||||
func (e *Engine) RecoverInterruptedRestores() (int, error) {
|
||||
e.mu.Lock()
|
||||
defer e.mu.Unlock()
|
||||
|
||||
entries, err := os.ReadDir(e.snapDir)
|
||||
if err != nil {
|
||||
return 0, fmt.Errorf("read snapshot dir: %w", err)
|
||||
}
|
||||
recovered := 0
|
||||
for _, ent := range entries {
|
||||
name := ent.Name()
|
||||
if ent.IsDir() || !strings.HasPrefix(name, "restore-") || !strings.HasSuffix(name, ".json") {
|
||||
continue
|
||||
}
|
||||
path := filepath.Join(e.snapDir, name)
|
||||
data, rerr := os.ReadFile(path)
|
||||
if rerr != nil {
|
||||
slog.Warn("restore recovery: read journal", "file", name, "error", rerr)
|
||||
continue
|
||||
}
|
||||
var jr restoreJournal
|
||||
if jerr := json.Unmarshal(data, &jr); jerr != nil {
|
||||
slog.Warn("restore recovery: parse journal", "file", name, "error", jerr)
|
||||
continue
|
||||
}
|
||||
slog.Warn("restore recovery: replaying interrupted restore",
|
||||
"workload", jr.WorkloadID, "snapshot", jr.SnapshotID, "volumes", len(jr.Volumes))
|
||||
for _, v := range jr.Volumes {
|
||||
recoverVolume(v)
|
||||
}
|
||||
if rmErr := os.Remove(path); rmErr != nil {
|
||||
slog.Warn("restore recovery: remove journal", "file", name, "error", rmErr)
|
||||
}
|
||||
recovered++
|
||||
}
|
||||
return recovered, nil
|
||||
}
|
||||
|
||||
// recoverVolume reconciles a single volume's on-disk state from its journal
|
||||
// entry after a crash. Each branch leaves the live dir intact (either restored
|
||||
// or original) and removes staging leftovers.
|
||||
func recoverVolume(v journalVolume) {
|
||||
if v.Swapped {
|
||||
// Swap completed: live already holds restored data. Drop the set-aside.
|
||||
_ = os.RemoveAll(v.Old)
|
||||
_ = os.RemoveAll(v.Tmp)
|
||||
return
|
||||
}
|
||||
if _, err := os.Lstat(v.Live); os.IsNotExist(err) {
|
||||
if _, oerr := os.Lstat(v.Old); oerr == nil {
|
||||
// Crashed mid-rename (live→old done, tmp→live not): revert.
|
||||
_ = os.Rename(v.Old, v.Live)
|
||||
}
|
||||
} else {
|
||||
// live is intact (original). Any .old is a dangling partial copy.
|
||||
_ = os.RemoveAll(v.Old)
|
||||
}
|
||||
_ = os.RemoveAll(v.Tmp)
|
||||
}
|
||||
|
||||
// ── journal + cleanup helpers ───────────────────────────────────────────────
|
||||
|
||||
func (e *Engine) journalPath(workloadID string) string {
|
||||
// workloadID is a server-generated id (loaded from the DB before we get
|
||||
// here). filepath.Base defends against any separator sneaking into the name.
|
||||
return filepath.Join(e.snapDir, "restore-"+filepath.Base(workloadID)+".json")
|
||||
}
|
||||
|
||||
func (e *Engine) writeJournal(jr restoreJournal) error {
|
||||
data, err := json.Marshal(jr)
|
||||
if err != nil {
|
||||
return fmt.Errorf("encode journal: %w", err)
|
||||
}
|
||||
// Write atomically (tmp + rename): a torn journal would silently disable the
|
||||
// recovery sweep (RecoverInterruptedRestores skips unparseable journals), so
|
||||
// a crash mid-write must never leave a half-written WAL on disk. The .tmp
|
||||
// suffix is ignored by the recovery scan (it matches *.json only).
|
||||
final := e.journalPath(jr.WorkloadID)
|
||||
tmp := final + ".tmp"
|
||||
if err := os.WriteFile(tmp, data, 0o600); err != nil {
|
||||
return fmt.Errorf("write journal: %w", err)
|
||||
}
|
||||
if err := os.Rename(tmp, final); err != nil {
|
||||
_ = os.Remove(tmp)
|
||||
return fmt.Errorf("commit journal: %w", err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (e *Engine) removeJournal(workloadID string) {
|
||||
if err := os.Remove(e.journalPath(workloadID)); err != nil && !os.IsNotExist(err) {
|
||||
slog.Warn("restore: remove journal", "workload", workloadID, "error", err)
|
||||
}
|
||||
}
|
||||
|
||||
func (e *Engine) emitRestoreEvent(workloadID, snapshotID string, volumes int) {
|
||||
meta, _ := json.Marshal(map[string]any{"snapshot_id": snapshotID, "volumes": volumes})
|
||||
if _, err := e.store.InsertEvent(store.EventLog{
|
||||
Source: "volsnap",
|
||||
WorkloadID: workloadID,
|
||||
Severity: "info",
|
||||
Message: "volume snapshot restored",
|
||||
Metadata: string(meta),
|
||||
}); err != nil {
|
||||
slog.Warn("restore: record event", "workload", workloadID, "error", err)
|
||||
}
|
||||
}
|
||||
|
||||
// cleanupStaging removes the tmp + old staging dirs for every staged volume
|
||||
// (used when aborting before the swap phase).
|
||||
func cleanupStaging(sv []staged) {
|
||||
for _, s := range sv {
|
||||
_ = os.RemoveAll(s.tmp)
|
||||
_ = os.RemoveAll(s.old)
|
||||
}
|
||||
}
|
||||
|
||||
// cleanupStagingFrom removes staging dirs from index `from` onward (the volumes
|
||||
// not yet swapped when a swap failed).
|
||||
func cleanupStagingFrom(sv []staged, from int) {
|
||||
for i := from; i < len(sv); i++ {
|
||||
_ = os.RemoveAll(sv[i].tmp)
|
||||
_ = os.RemoveAll(sv[i].old)
|
||||
}
|
||||
}
|
||||
|
||||
// cleanupOld removes the .old set-aside dirs after a successful (or data-
|
||||
// committed) restore to reclaim disk; the pre-restore snapshot is the durable
|
||||
// rollback target.
|
||||
func cleanupOld(done []swap) {
|
||||
for _, s := range done {
|
||||
_ = os.RemoveAll(s.old)
|
||||
}
|
||||
}
|
||||
|
||||
// checkDiskSpace verifies each target filesystem has room for the volumes that
|
||||
// will be staged on it (C5). Peak usage co-locates the live copy (renamed
|
||||
// aside, no new space) and the extracted copy (new space ≈ uncompressed size),
|
||||
// so the new allocation per filesystem is the sum of its volumes' extracted
|
||||
// sizes plus headroom. The estimate is a lower bound (see archiveUncompressedSize);
|
||||
// a mid-extract ENOSPC is still caught and rolled back.
|
||||
func checkDiskSpace(resolved []resolvedVol, perIndex map[int]int64) error {
|
||||
needByParent := map[string]int64{}
|
||||
for _, rv := range resolved {
|
||||
needByParent[filepath.Dir(rv.LivePath)] += perIndex[rv.Index]
|
||||
}
|
||||
for parent, need := range needByParent {
|
||||
probe := firstExistingAncestor(parent)
|
||||
free, err := freeDiskBytes(probe)
|
||||
if err != nil {
|
||||
return fmt.Errorf("check disk space at %s: %w", probe, err)
|
||||
}
|
||||
if int64(free) < need+diskFreeHeadroomBytes {
|
||||
return fmt.Errorf("insufficient disk space at %s: need ~%d bytes, have %d",
|
||||
parent, need+diskFreeHeadroomBytes, free)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// firstExistingAncestor walks up p until it finds a path that exists, so the
|
||||
// free-space probe has a real filesystem to stat even when the volume dir (or
|
||||
// its parent) hasn't been created yet.
|
||||
func firstExistingAncestor(p string) string {
|
||||
for {
|
||||
if _, err := os.Stat(p); err == nil {
|
||||
return p
|
||||
}
|
||||
parent := filepath.Dir(p)
|
||||
if parent == p {
|
||||
return p
|
||||
}
|
||||
p = parent
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user