Files
alexei.dolgolyov 1c47030854 feat(volsnap): volume snapshot restore (backlog #6)
Restore a captured volume snapshot onto an image workload's live host-bind
data volumes, then redeploy — the most destructive workload action, built to
the adversarially-reviewed design (C1–C6) with all data-loss guards.

- Engine.Restore (engine-owned): all-or-nothing pre-flight re-resolution from
  the workload's CURRENT config (never the tamperable manifest), per-filesystem
  disk pre-check, per-workload lock, container quiesce, extract-to-tmp, durable
  pre-restore snapshot, write-ahead journal, atomic rename swap, redeploy, and
  crash-recovery sweep (RecoverInterruptedRestores) wired before serving.
- internal/keyedmutex: shared per-key lock; deployer now serializes every
  deploy entrypoint per workload via DispatchPlugin (+ LockWorkload/RedeployLocked
  for the restore re-dispatch, no deadlock).
- Untrusted-archive extractor: zip-slip containment, type allow-list (reg/dir
  only), decompression-bomb cap, manifest-index bounds.
- POST /api/workloads/{id}/snapshots/{sid}/restore: admin, X-Confirm-Restore
  header (CSRF), per-workload single-flight (409).
- WebUI: Restore button + danger ConfirmDialog + busy state + i18n (en/ru).

Scope: image-source only; scopes absolute/stage/project (driven off the same
supportedScopes constant capture uses).

Plan-reviewed before coding; per-phase go/security/ts reviews; final review
READY TO MERGE. Security review caught + fixed a CRITICAL manifest-Source path
traversal (re-derive target from current config + base containment).

Plan: plans/volume-snapshot-restore/
2026-06-22 17:23:52 +03:00

236 lines
8.5 KiB
Go

package volsnap
import (
"archive/tar"
"compress/gzip"
"encoding/json"
"fmt"
"io"
"os"
"path"
"path/filepath"
"github.com/alexei/tinyforge/internal/store"
"github.com/alexei/tinyforge/internal/volume"
)
// Size limits applied while restoring a snapshot archive.
const (
	// maxRestoreUncompressedBytes is the ceiling (50 GiB) on the total
	// decompressed payload a restore will accept from one snapshot archive.
	// It is a decompression-bomb defence: comfortably larger than realistic
	// application data, yet a hard bound on what a hostile archive can claim.
	maxRestoreUncompressedBytes int64 = 50 * (1 << 30)

	// diskFreeHeadroomBytes is the extra free space (256 MiB) demanded on top
	// of the extracted size, so a restore never fills the target filesystem
	// completely. The live copy is renamed aside rather than duplicated, so
	// the new allocation is roughly the extracted size; the headroom absorbs
	// filesystem overhead and metadata.
	diskFreeHeadroomBytes int64 = 256 * (1 << 20)
)
// resolvedVol is a manifest volume whose live host path has been re-resolved
// against the workload's CURRENT config (all-or-nothing pre-flight, C3).
// preflightResolve builds one value per manifest entry.
type resolvedVol struct {
// Index is the manifest entry's index, copied verbatim from the manifest
// (never negative once pre-flight has accepted it).
Index int
// Target is the manifest entry's container target path; it is the key used
// to look the volume up in the workload's current config.
Target string
// Scope is the volume scope taken from the CURRENT config, not the manifest.
Scope string
// LivePath is the host path resolved from the current config; it is the
// directory the restore swap replaces.
LivePath string
}
// parseManifest unmarshals the manifest JSON stored on a snapshot row into its
// list of volumes.
//
// It returns an error when the JSON cannot be decoded, or when it decodes to
// no volumes at all (an empty manifest has nothing to restore).
func parseManifest(snap store.VolumeSnapshot) ([]SnapshotVolume, error) {
	var volumes []SnapshotVolume
	err := json.Unmarshal([]byte(snap.Manifest), &volumes)
	switch {
	case err != nil:
		return nil, fmt.Errorf("parse snapshot manifest: %w", err)
	case len(volumes) == 0:
		return nil, fmt.Errorf("snapshot manifest is empty")
	}
	return volumes, nil
}
// preflightResolve re-derives every manifest volume's live host path from the
// workload's CURRENT config, ALL-OR-NOTHING (C3). It returns an error — and the
// caller MUST abort BEFORE stopping containers or touching disk — if any
// manifest entry:
//
//   - has a negative index,
//   - repeats an index or a target already seen earlier in the manifest,
//   - names a target the workload no longer declares,
//   - maps to a scope that is not in supportedScopes,
//   - cannot be resolved to a host path, or
//   - resolves (for non-absolute scopes) outside settings.BaseVolumePath.
//
// Config drift mid-restore is silent corruption, so nothing is returned unless
// every entry passes.
//
// SECURITY: the swap target is keyed on the manifest's container Target path,
// but its host directory is derived from the CURRENT (trusted, operator-set)
// Source/Scope — never from the snapshot manifest's persisted Source/Scope. The
// manifest column is attacker-influenceable (e.g. a restored/tampered DB), and
// trusting its Source for stage/project scope would let `Source:"../../etc"`
// redirect the destructive rename-swap outside the volume tree. As defence in
// depth, base-relative resolved paths are asserted to stay under BaseVolumePath.
//
// For the same reason duplicates are rejected here: a repeated Target would
// swap the same live directory twice, and a repeated Index would make two
// volumes share staging-directory names (those are derived from the index), so
// the restore would only fail after the destructive phase had already begun.
func preflightResolve(st *store.Store, w store.Workload, settings store.Settings, manifest []SnapshotVolume) ([]resolvedVol, error) {
	current, err := volumesByTarget(st, w)
	if err != nil {
		return nil, fmt.Errorf("load current volumes: %w", err)
	}
	params := volume.ResolveWorkloadParams{
		BasePath:           settings.BaseVolumePath,
		WorkloadID:         w.ID,
		WorkloadName:       w.Name,
		AllowedVolumePaths: settings.AllowedVolumePaths,
	}

	seenIndexes := make(map[int]struct{}, len(manifest))
	seenTargets := make(map[string]struct{}, len(manifest))
	out := make([]resolvedVol, 0, len(manifest))
	for _, mv := range manifest {
		// A negative index can never name an archive subtree.
		if mv.Index < 0 {
			return nil, fmt.Errorf("volume %q has invalid index %d", mv.Target, mv.Index)
		}
		// The manifest is untrusted: each index and each target may appear once.
		if _, dup := seenIndexes[mv.Index]; dup {
			return nil, fmt.Errorf("volume %q reuses manifest index %d", mv.Target, mv.Index)
		}
		seenIndexes[mv.Index] = struct{}{}
		if _, dup := seenTargets[mv.Target]; dup {
			return nil, fmt.Errorf("volume %q appears more than once in the manifest", mv.Target)
		}
		seenTargets[mv.Target] = struct{}{}

		cur, ok := current[mv.Target]
		if !ok {
			return nil, fmt.Errorf("volume %q is no longer declared by the workload", mv.Target)
		}
		if !supportedScopes[cur.Scope] {
			return nil, fmt.Errorf("volume %q scope %q is not restorable", mv.Target, cur.Scope)
		}
		live, err := volume.ResolveWorkloadPath(cur, params)
		if err != nil {
			return nil, fmt.Errorf("resolve volume %q (%s): %w", mv.Target, cur.Scope, err)
		}
		// Containment: the destructive swap target must stay inside the volume
		// root. Base-relative scopes must resolve under BaseVolumePath; absolute
		// scope is already constrained to AllowedVolumePaths by the resolver.
		if cur.Scope != string(store.VolumeScopeAbsolute) {
			contained, cerr := pathWithinBase(settings.BaseVolumePath, live)
			if cerr != nil || !contained {
				return nil, fmt.Errorf("resolved path for volume %q escapes the volume root", mv.Target)
			}
		}
		out = append(out, resolvedVol{Index: mv.Index, Target: mv.Target, Scope: cur.Scope, LivePath: live})
	}
	return out, nil
}
// pathWithinBase reports whether target, once made absolute, is base itself or
// lies somewhere beneath it. Both paths are made absolute with filepath.Abs
// before the comparison is delegated to withinDir.
//
// An empty base never contains anything: the function refuses (false, nil)
// rather than letting an unset root match every path. A non-nil error is
// returned only when one of the paths cannot be made absolute.
func pathWithinBase(base, target string) (bool, error) {
	if len(base) == 0 {
		return false, nil
	}
	var resolved [2]string
	for i, p := range [...]string{base, target} {
		abs, err := filepath.Abs(p)
		if err != nil {
			return false, err
		}
		resolved[i] = abs
	}
	return withinDir(resolved[0], resolved[1]), nil
}
// archiveUncompressedSize walks the tar headers of the gzip-compressed archive
// at archivePath and returns the declared uncompressed size of its regular
// files, both per volume index (perIndex) and summed (total). It feeds the
// per-filesystem free-space pre-check (C5).
//
// Only regular-file entries are counted. The top-level "manifest.json" entry
// and any entry for which leadingIndex reports no index are skipped.
//
// bombCap bounds the accepted total: as soon as the running sum would exceed
// it, the scan stops with an error, so a hostile archive cannot make the disk
// pre-check reason about an unbounded size. The comparison is done BEFORE
// adding, in a form that cannot overflow int64, so a header declaring an
// enormous size cannot wrap the sum negative and slip under the cap.
//
// The total is a LOWER-BOUND estimate of on-disk consumption: it sums regular-
// file bytes only, ignoring directory entries and per-file inode/block-rounding
// overhead, so a volume of many tiny files consumes more than reported. The
// real safety net is the staged extract + atomic swap (a mid-extract ENOSPC
// discards the staging dir and leaves live untouched), not this pre-check.
//
// No file body is copied by this function, but tar.Reader.Next still has to
// read past each skipped body, so the gzip stream is inflated in full; bombCap
// bounds that work too.
//
// On any error the returned map is nil and the total is 0.
func archiveUncompressedSize(archivePath string, bombCap int64) (perIndex map[int]int64, total int64, err error) {
	f, err := os.Open(archivePath)
	if err != nil {
		return nil, 0, fmt.Errorf("open archive: %w", err)
	}
	defer f.Close()

	gz, err := gzip.NewReader(f)
	if err != nil {
		return nil, 0, fmt.Errorf("gzip reader: %w", err)
	}
	defer gz.Close()

	perIndex = map[int]int64{}
	tr := tar.NewReader(gz)
	for {
		hdr, e := tr.Next()
		if e == io.EOF {
			break
		}
		if e != nil {
			return nil, 0, fmt.Errorf("read tar: %w", e)
		}
		if hdr.Typeflag != tar.TypeReg {
			continue
		}
		name := path.Clean(hdr.Name)
		if name == "manifest.json" {
			continue
		}
		idx, ok := leadingIndex(name)
		if !ok {
			continue
		}
		// Equivalent to total+hdr.Size > bombCap, but immune to int64 wrap:
		// total never exceeds bombCap here, so bombCap-total cannot overflow.
		if hdr.Size > bombCap-total {
			return nil, 0, fmt.Errorf("archive exceeds decompression cap of %d bytes", bombCap)
		}
		total += hdr.Size
		perIndex[idx] += hdr.Size
	}
	return perIndex, total, nil
}
// swap records one volume's atomic dir replacement so it can be rolled back.
// rollbackSwaps consumes these records to undo completed swaps.
type swap struct {
// live is the volume's live directory: the path the restored data was
// promoted into, and the path rollback removes again.
live string
old string // where the prior live dir was set aside; rollback only reads it when hadOld is true
tmp string // staging dir holding the freshly-extracted data
hadOld bool // whether a prior live dir existed and was moved to old
}
// stagingDirs derives the two staging paths used while swapping one volume:
// tmp (where fresh data is extracted) and old (where the current live dir is
// set aside). Both are placed in live's parent directory — i.e. as siblings of
// the live dir itself — so every rename in the swap stays within one directory
// and is therefore intra-filesystem and atomic (R2). If live is itself a
// mountpoint the rename is cross-device and fails loudly in swapVolumeDir
// rather than silently degrading to a copy.
//
// The names have the form ".tf-restore-<token>-<index>.tmp" / ".old".
func stagingDirs(live, token string, index int) (tmp, old string) {
	dir := filepath.Dir(live)
	stem := fmt.Sprintf(".tf-restore-%s-%d", token, index)
	tmp = filepath.Join(dir, stem+".tmp")
	old = filepath.Join(dir, stem+".old")
	return tmp, old
}
// swapVolumeDir performs the crash-minimal two-rename swap (C2):
//
//  1. if live exists, rename it aside to old;
//  2. make sure live's parent directory exists;
//  3. rename the staged tmp directory into place as live.
//
// If step 2 or 3 fails after live was set aside, the first rename is reverted
// so live is not left missing.
//
// hadOld reports whether a prior live dir is, on return, preserved at old:
//
//   - on success it is true exactly when live existed beforehand;
//   - on failure it is false when nothing was set aside or when the revert put
//     the original back (old no longer exists), and true only when the revert
//     itself failed — the original data is then still at old, live is missing,
//     and the returned error mentions the failed revert.
func swapVolumeDir(live, tmp, old string) (hadOld bool, err error) {
	if _, statErr := os.Lstat(live); statErr == nil {
		if rerr := os.Rename(live, old); rerr != nil {
			return false, fmt.Errorf("set aside live %s: %w", live, rerr)
		}
		hadOld = true
	} else if !os.IsNotExist(statErr) {
		return false, fmt.Errorf("stat live %s: %w", live, statErr)
	}

	// abort undoes the set-aside (if any) and reports where the original data
	// ended up, instead of silently dropping a failed revert.
	abort := func(cause error) (bool, error) {
		if !hadOld {
			return false, cause
		}
		if revErr := os.Rename(old, live); revErr != nil {
			return true, fmt.Errorf("%w (revert from %s also failed: %v)", cause, old, revErr)
		}
		return false, cause
	}

	// Only matters when live did not exist: its parent may be missing too.
	if mkErr := os.MkdirAll(filepath.Dir(live), 0o700); mkErr != nil {
		return abort(fmt.Errorf("ensure parent of %s: %w", live, mkErr))
	}
	if rerr := os.Rename(tmp, live); rerr != nil {
		return abort(fmt.Errorf("promote restored data into %s: %w", live, rerr))
	}
	return hadOld, nil
}
// rollbackSwaps undoes completed swaps, newest first. For each record it
// deletes the restored live directory and, when an original had been set
// aside, renames that original back into place.
//
// It is best-effort: filesystem errors are deliberately discarded so that a
// failure on one volume never prevents the remaining volumes from being
// attempted.
func rollbackSwaps(done []swap) {
	for n := len(done); n > 0; n-- {
		entry := done[n-1]
		_ = os.RemoveAll(entry.live) // best-effort: keep going regardless
		if !entry.hadOld {
			continue
		}
		_ = os.Rename(entry.old, entry.live) // best-effort: keep going regardless
	}
}