1c47030854
Restore a captured volume snapshot onto an image workload's live host-bind
data volumes, then redeploy — the most destructive workload action, built to
the adversarially-reviewed design (C1–C6) with all data-loss guards.
- Engine.Restore (engine-owned): all-or-nothing pre-flight re-resolution from
the workload's CURRENT config (never the tamperable manifest), per-filesystem
disk pre-check, per-workload lock, container quiesce, extract-to-tmp, durable
pre-restore snapshot, write-ahead journal, atomic rename swap, redeploy, and
crash-recovery sweep (RecoverInterruptedRestores) wired before serving.
- internal/keyedmutex: shared per-key lock; deployer now serializes every
deploy entrypoint per workload via DispatchPlugin (+ LockWorkload/RedeployLocked
for the restore re-dispatch, no deadlock).
- Untrusted-archive extractor: zip-slip containment, type allow-list (reg/dir
only), decompression-bomb cap, manifest-index bounds.
- POST /api/workloads/{id}/snapshots/{sid}/restore: admin, X-Confirm-Restore
header (CSRF), per-workload single-flight (409).
- WebUI: Restore button + danger ConfirmDialog + busy state + i18n (en/ru).
Scope: image-source only; scopes absolute/stage/project (driven off the same
supportedScopes constant capture uses).
Plan-reviewed before coding; per-phase go/security/ts reviews; final review
READY TO MERGE. Security review caught + fixed a CRITICAL manifest-Source path
traversal (re-derive target from current config + base containment).
Plan: plans/volume-snapshot-restore/
236 lines
8.5 KiB
Go
236 lines
8.5 KiB
Go
package volsnap
|
|
|
|
import (
|
|
"archive/tar"
|
|
"compress/gzip"
|
|
"encoding/json"
|
|
"fmt"
|
|
"io"
|
|
"os"
|
|
"path"
|
|
"path/filepath"
|
|
|
|
"github.com/alexei/tinyforge/internal/store"
|
|
"github.com/alexei/tinyforge/internal/volume"
|
|
)
|
|
|
|
// Size limits applied while restoring a snapshot archive.
const (
	// maxRestoreUncompressedBytes is the decompression-bomb ceiling: the
	// largest total decompressed size a snapshot archive may declare during
	// restore. 50 GiB sits well above any realistic application data volume
	// while still putting a hard bound on a hostile archive.
	maxRestoreUncompressedBytes int64 = 50 << 30

	// diskFreeHeadroomBytes is the free space demanded on top of the extracted
	// size, so a restore never runs the target filesystem completely full.
	// Setting the live copy aside is a rename and needs no new space, which
	// makes the new allocation roughly the extracted size; this 256 MiB margin
	// absorbs filesystem overhead and metadata.
	diskFreeHeadroomBytes int64 = 256 << 20
)
|
|
|
|
// resolvedVol pairs one snapshot-manifest volume with the live host path that
// was re-resolved for it from the workload's CURRENT config during the
// all-or-nothing pre-flight (C3).
//
//   - Index:    the manifest entry's index (never negative once pre-flight
//     has accepted it).
//   - Target:   the container path the volume is keyed on.
//   - Scope:    the scope taken from the current config, not from the manifest.
//   - LivePath: the resolved host directory that the restore will swap.
type resolvedVol struct {
	Index                   int
	Target, Scope, LivePath string
}
|
|
|
|
// parseManifest decodes the snapshot row's manifest JSON ([]SnapshotVolume).
|
|
func parseManifest(snap store.VolumeSnapshot) ([]SnapshotVolume, error) {
|
|
var m []SnapshotVolume
|
|
if err := json.Unmarshal([]byte(snap.Manifest), &m); err != nil {
|
|
return nil, fmt.Errorf("parse snapshot manifest: %w", err)
|
|
}
|
|
if len(m) == 0 {
|
|
return nil, fmt.Errorf("snapshot manifest is empty")
|
|
}
|
|
return m, nil
|
|
}
|
|
|
|
// preflightResolve re-derives every manifest volume's live host path from the
|
|
// workload's CURRENT config, ALL-OR-NOTHING (C3): if any snapshotted target is
|
|
// no longer declared, its scope is unsupported, or it can't resolve, it returns
|
|
// an error and the caller MUST abort BEFORE stopping containers or touching
|
|
// disk — config drift mid-restore is silent corruption.
|
|
//
|
|
// SECURITY: the swap target is keyed on the manifest's container Target path but
|
|
// its host directory is derived from the CURRENT (trusted, operator-set)
|
|
// Source/Scope — never from the snapshot manifest's persisted Source/Scope. The
|
|
// manifest column is attacker-influenceable (e.g. a restored/tampered DB), and
|
|
// trusting its Source for stage/project scope would let `Source:"../../etc"`
|
|
// redirect the destructive rename-swap outside the volume tree. As defence in
|
|
// depth, base-relative resolved paths are asserted to stay under BaseVolumePath.
|
|
func preflightResolve(st *store.Store, w store.Workload, settings store.Settings, manifest []SnapshotVolume) ([]resolvedVol, error) {
|
|
current, err := volumesByTarget(st, w)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("load current volumes: %w", err)
|
|
}
|
|
params := volume.ResolveWorkloadParams{
|
|
BasePath: settings.BaseVolumePath,
|
|
WorkloadID: w.ID,
|
|
WorkloadName: w.Name,
|
|
AllowedVolumePaths: settings.AllowedVolumePaths,
|
|
}
|
|
out := make([]resolvedVol, 0, len(manifest))
|
|
for _, mv := range manifest {
|
|
// A negative index can never name an archive subtree.
|
|
if mv.Index < 0 {
|
|
return nil, fmt.Errorf("volume %q has invalid index %d", mv.Target, mv.Index)
|
|
}
|
|
cur, ok := current[mv.Target]
|
|
if !ok {
|
|
return nil, fmt.Errorf("volume %q is no longer declared by the workload", mv.Target)
|
|
}
|
|
if !supportedScopes[cur.Scope] {
|
|
return nil, fmt.Errorf("volume %q scope %q is not restorable", mv.Target, cur.Scope)
|
|
}
|
|
live, err := volume.ResolveWorkloadPath(cur, params)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("resolve volume %q (%s): %w", mv.Target, cur.Scope, err)
|
|
}
|
|
// Containment: the destructive swap target must stay inside the volume
|
|
// root. Base-relative scopes must resolve under BaseVolumePath; absolute
|
|
// scope is already constrained to AllowedVolumePaths by the resolver.
|
|
if cur.Scope != string(store.VolumeScopeAbsolute) {
|
|
contained, cerr := pathWithinBase(settings.BaseVolumePath, live)
|
|
if cerr != nil || !contained {
|
|
return nil, fmt.Errorf("resolved path for volume %q escapes the volume root", mv.Target)
|
|
}
|
|
}
|
|
out = append(out, resolvedVol{Index: mv.Index, Target: mv.Target, Scope: cur.Scope, LivePath: live})
|
|
}
|
|
return out, nil
|
|
}
|
|
|
|
// pathWithinBase reports whether target resolves to base or a path beneath it.
|
|
// An empty base is treated as non-containing (refuse rather than allow).
|
|
func pathWithinBase(base, target string) (bool, error) {
|
|
if base == "" {
|
|
return false, nil
|
|
}
|
|
absBase, err := filepath.Abs(base)
|
|
if err != nil {
|
|
return false, err
|
|
}
|
|
absTarget, err := filepath.Abs(target)
|
|
if err != nil {
|
|
return false, err
|
|
}
|
|
return withinDir(absBase, absTarget), nil
|
|
}
|
|
|
|
// archiveUncompressedSize scans the archive's tar headers and returns the
|
|
// per-index and total uncompressed sizes, enforcing bombCap so a hostile
|
|
// archive can't make the disk pre-check allocate unbounded. Feeds the
|
|
// per-filesystem free-space pre-check (C5).
|
|
//
|
|
// The total is a LOWER-BOUND estimate of on-disk consumption: it sums regular-
|
|
// file bytes only, ignoring directory entries and per-file inode/block-rounding
|
|
// overhead, so a volume of many tiny files consumes more than reported. The
|
|
// real safety net is the staged extract + atomic swap (a mid-extract ENOSPC
|
|
// discards the staging dir and leaves live untouched), not this pre-check.
|
|
//
|
|
// "No body copy" is at the API level only — tar.Next still inflates and
|
|
// discards each skipped body, so a 50 GiB-of-headers archive does 50 GiB of
|
|
// gzip work; bombCap bounds that.
|
|
func archiveUncompressedSize(archivePath string, bombCap int64) (perIndex map[int]int64, total int64, err error) {
|
|
f, err := os.Open(archivePath)
|
|
if err != nil {
|
|
return nil, 0, fmt.Errorf("open archive: %w", err)
|
|
}
|
|
defer f.Close()
|
|
gz, err := gzip.NewReader(f)
|
|
if err != nil {
|
|
return nil, 0, fmt.Errorf("gzip reader: %w", err)
|
|
}
|
|
defer gz.Close()
|
|
|
|
perIndex = map[int]int64{}
|
|
tr := tar.NewReader(gz)
|
|
for {
|
|
hdr, e := tr.Next()
|
|
if e == io.EOF {
|
|
break
|
|
}
|
|
if e != nil {
|
|
return nil, 0, fmt.Errorf("read tar: %w", e)
|
|
}
|
|
if hdr.Typeflag != tar.TypeReg {
|
|
continue
|
|
}
|
|
name := path.Clean(hdr.Name)
|
|
if name == "manifest.json" {
|
|
continue
|
|
}
|
|
idx, ok := leadingIndex(name)
|
|
if !ok {
|
|
continue
|
|
}
|
|
total += hdr.Size
|
|
if total > bombCap {
|
|
return nil, 0, fmt.Errorf("archive exceeds decompression cap of %d bytes", bombCap)
|
|
}
|
|
perIndex[idx] += hdr.Size
|
|
}
|
|
return perIndex, total, nil
|
|
}
|
|
|
|
// swap is the record of one volume's atomic directory replacement, holding
// what is needed to roll it back.
//
//   - live:   the volume's live directory.
//   - old:    where the previous live dir was set aside ("" if live did not
//     exist).
//   - tmp:    the staging directory holding the freshly extracted data.
//   - hadOld: whether a previous live dir existed and was moved to old.
type swap struct {
	live, old, tmp string
	hadOld         bool
}
|
|
|
|
// stagingDirs derives the two per-volume staging paths used by a restore: tmp
// (freshly extracted data) and old (the set-aside previous live dir). Both are
// placed in the live dir's parent directory — i.e. as SIBLINGS of the live dir
// — under the name ".tf-restore-<token>-<index>" plus a ".tmp" / ".old"
// suffix, so that every rename in the swap stays within one filesystem and is
// therefore atomic (R2). When live is itself a mountpoint the rename is
// cross-device; swapVolumeDir uses os.Rename, which reports that as an error
// rather than silently degrading to a copy.
func stagingDirs(live, token string, index int) (tmp, old string) {
	dir := filepath.Dir(live)
	stem := fmt.Sprintf(".tf-restore-%s-%d", token, index)
	tmp = filepath.Join(dir, stem+".tmp")
	old = filepath.Join(dir, stem+".old")
	return tmp, old
}
|
|
|
|
// swapVolumeDir performs the crash-minimal two-rename swap (C2):
//
//  1. if live exists, rename it aside to old;
//  2. make sure live's parent directory exists;
//  3. rename the staged tmp into place as live.
//
// If step 2 or 3 fails after live was set aside, the first rename is reverted
// so that live is not left missing. Should that revert ALSO fail, the returned
// error says so and names old — the place where the original data now sits —
// instead of dropping the revert failure silently; the primary failure remains
// the wrapped (%w) error either way.
//
// hadOld reports whether a prior live dir existed and was set aside to old. It
// is also true on the step 2/3 failure paths, regardless of whether the revert
// succeeded. It is false when live did not exist or could not be set aside.
func swapVolumeDir(live, tmp, old string) (hadOld bool, err error) {
	switch _, statErr := os.Lstat(live); {
	case statErr == nil:
		if rerr := os.Rename(live, old); rerr != nil {
			return false, fmt.Errorf("set aside live %s: %w", live, rerr)
		}
		hadOld = true
	case !os.IsNotExist(statErr):
		return false, fmt.Errorf("stat live %s: %w", live, statErr)
	}

	// revert moves the set-aside original back into place after a failure. The
	// primary cause is passed through unchanged when the revert works (or when
	// nothing was set aside); a failed revert is appended, because it means
	// live is now missing and the data survives only at old.
	revert := func(cause error) error {
		if !hadOld {
			return cause
		}
		if rerr := os.Rename(old, live); rerr != nil {
			return fmt.Errorf("%w (revert failed, original data left at %s: %v)", cause, old, rerr)
		}
		return cause
	}

	if mkErr := os.MkdirAll(filepath.Dir(live), 0o700); mkErr != nil {
		return hadOld, revert(fmt.Errorf("ensure parent of %s: %w", live, mkErr))
	}
	if rerr := os.Rename(tmp, live); rerr != nil {
		return hadOld, revert(fmt.Errorf("promote restored data into %s: %w", live, rerr))
	}
	return hadOld, nil
}
|
|
|
|
// rollbackSwaps reverts completed swaps in reverse order: drop the restored
|
|
// live dir and move the preserved original back. Best-effort — each step is
|
|
// logged by the caller; rollback must attempt every volume regardless.
|
|
func rollbackSwaps(done []swap) {
|
|
for i := len(done) - 1; i >= 0; i-- {
|
|
s := done[i]
|
|
_ = os.RemoveAll(s.live)
|
|
if s.hadOld {
|
|
_ = os.Rename(s.old, s.live)
|
|
}
|
|
}
|
|
}
|