1c47030854
Restore a captured volume snapshot onto an image workload's live host-bind
data volumes, then redeploy — the most destructive workload action, built to
the adversarially-reviewed design (C1–C6) with all data-loss guards.
- Engine.Restore (engine-owned): all-or-nothing pre-flight re-resolution from
the workload's CURRENT config (never the tamperable manifest), per-filesystem
disk pre-check, per-workload lock, container quiesce, extract-to-tmp, durable
pre-restore snapshot, write-ahead journal, atomic rename swap, redeploy, and
crash-recovery sweep (RecoverInterruptedRestores) wired before serving.
- internal/keyedmutex: shared per-key lock; deployer now serializes every
deploy entrypoint per workload via DispatchPlugin (+ LockWorkload/RedeployLocked
for the restore re-dispatch, no deadlock).
- Untrusted-archive extractor: zip-slip containment, type allow-list (reg/dir
only), decompression-bomb cap, manifest-index bounds.
- POST /api/workloads/{id}/snapshots/{sid}/restore: admin, X-Confirm-Restore
header (CSRF), per-workload single-flight (409).
- WebUI: Restore button + danger ConfirmDialog + busy state + i18n (en/ru).
Scope: image-source workloads only; supported volume scopes are
absolute/stage/project (driven off the same supportedScopes constant that
capture uses).
Plan-reviewed before coding; per-phase go/security/ts reviews; final review
READY TO MERGE. Security review caught + fixed a CRITICAL manifest-Source path
traversal (re-derive target from current config + base containment).
Plan: plans/volume-snapshot-restore/
214 lines
6.4 KiB
Go
214 lines
6.4 KiB
Go
package volsnap
|
|
|
|
import (
|
|
"encoding/json"
|
|
"errors"
|
|
"fmt"
|
|
"log/slog"
|
|
"os"
|
|
"path/filepath"
|
|
"strings"
|
|
"sync"
|
|
"time"
|
|
|
|
"github.com/google/uuid"
|
|
|
|
"github.com/alexei/tinyforge/internal/store"
|
|
)
|
|
|
|
// maxSnapshotsPerWorkload caps how many snapshots are retained per workload.
// After a successful Create, pruneWorkload deletes the oldest snapshots beyond
// this count (best-effort) so volume snapshots cannot grow the data disk
// without bound.
const maxSnapshotsPerWorkload = 20
|
|
|
|
// ErrNoSnapshotData is returned by Create when the workload has no resolved
// host-bind volume directory to capture (the volume enumeration yields no
// refs). It is a client-actionable condition (HTTP 400), distinct from
// internal failures (HTTP 500). Create returns it unwrapped, so callers can
// match it with errors.Is.
var ErrNoSnapshotData = errors.New("no snapshottable volume data for this app")
|
|
|
|
// Engine creates and manages volume snapshots under <dataDir>/snapshots.
type Engine struct {
	// mu serializes Create, Delete and CleanOrphans, each of which holds it
	// while touching archive files and their metadata rows.
	mu sync.Mutex
	// store is the persistence handle used for snapshot metadata rows.
	store *store.Store
	// snapDir is the directory holding snapshot archives; New sets it to
	// <dataDir>/snapshots.
	snapDir string

	// lifecycle is the deploy-side seam restore needs (per-workload lock, stop,
	// redeploy). Wired post-construction via SetLifecycle from the composition
	// root so volsnap stays decoupled from the deployer/docker packages. nil
	// until wired; Restore refuses to run without it.
	lifecycle Lifecycle
}
|
|
|
|
// New creates the snapshot engine, ensuring the snapshot directory exists.
|
|
func New(st *store.Store, dataDir string) (*Engine, error) {
|
|
dir := filepath.Join(dataDir, "snapshots")
|
|
if err := os.MkdirAll(dir, 0o755); err != nil {
|
|
return nil, fmt.Errorf("create snapshot directory: %w", err)
|
|
}
|
|
return &Engine{store: st, snapDir: dir}, nil
|
|
}
|
|
|
|
// SnapDir returns the directory holding snapshot archives.
|
|
func (e *Engine) SnapDir() string { return e.snapDir }
|
|
|
|
// Create captures a snapshot of the workload's host-bind data volumes.
|
|
func (e *Engine) Create(w store.Workload, settings store.Settings, label string) (store.VolumeSnapshot, error) {
|
|
refs, _, err := SnapshotableVolumes(e.store, w, settings)
|
|
if err != nil {
|
|
return store.VolumeSnapshot{}, fmt.Errorf("enumerate volumes: %w", err)
|
|
}
|
|
if len(refs) == 0 {
|
|
return store.VolumeSnapshot{}, ErrNoSnapshotData
|
|
}
|
|
|
|
e.mu.Lock()
|
|
defer e.mu.Unlock()
|
|
|
|
filename := fmt.Sprintf("%s-%s-%s.tar.gz",
|
|
idShort(w.ID), time.Now().UTC().Format("20060102-150405"), uuid.New().String()[:8])
|
|
dest := filepath.Join(e.snapDir, filename)
|
|
|
|
manifest, err := writeArchive(dest, refs)
|
|
if err != nil {
|
|
return store.VolumeSnapshot{}, err
|
|
}
|
|
|
|
info, err := os.Stat(dest)
|
|
if err != nil {
|
|
os.Remove(dest)
|
|
return store.VolumeSnapshot{}, fmt.Errorf("stat snapshot: %w", err)
|
|
}
|
|
manifestJSON, err := json.Marshal(manifest)
|
|
if err != nil {
|
|
os.Remove(dest)
|
|
return store.VolumeSnapshot{}, fmt.Errorf("encode manifest: %w", err)
|
|
}
|
|
|
|
row, err := e.store.CreateVolumeSnapshot(store.VolumeSnapshot{
|
|
WorkloadID: w.ID,
|
|
Label: strings.TrimSpace(label),
|
|
Filename: filename,
|
|
SizeBytes: info.Size(),
|
|
Manifest: string(manifestJSON),
|
|
})
|
|
if err != nil {
|
|
os.Remove(dest) // best-effort: don't leak an orphan file
|
|
return store.VolumeSnapshot{}, fmt.Errorf("record snapshot: %w", err)
|
|
}
|
|
|
|
slog.Info("volume snapshot created", "id", row.ID, "workload", w.ID,
|
|
"volumes", len(manifest), "size", info.Size())
|
|
|
|
e.pruneWorkload(w.ID)
|
|
return row, nil
|
|
}
|
|
|
|
// List returns a workload's snapshots, newest first.
|
|
func (e *Engine) List(workloadID string) ([]store.VolumeSnapshot, error) {
|
|
return e.store.ListVolumeSnapshots(workloadID)
|
|
}
|
|
|
|
// Get returns one snapshot by id.
|
|
func (e *Engine) Get(id string) (store.VolumeSnapshot, error) {
|
|
return e.store.GetVolumeSnapshot(id)
|
|
}
|
|
|
|
// Delete removes a snapshot's archive file and its metadata row.
|
|
func (e *Engine) Delete(id string) error {
|
|
snap, err := e.store.GetVolumeSnapshot(id)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
e.mu.Lock()
|
|
defer e.mu.Unlock()
|
|
if p, perr := e.FilePath(snap); perr == nil {
|
|
if rmErr := os.Remove(p); rmErr != nil && !os.IsNotExist(rmErr) {
|
|
slog.Warn("volume snapshot: remove file", "id", id, "error", rmErr)
|
|
}
|
|
}
|
|
return e.store.DeleteVolumeSnapshot(id)
|
|
}
|
|
|
|
// FilePath resolves a snapshot's archive path and verifies it stays within the
|
|
// snapshot directory (defence-in-depth against a tampered filename column).
|
|
func (e *Engine) FilePath(snap store.VolumeSnapshot) (string, error) {
|
|
base := filepath.Base(snap.Filename)
|
|
if base == "" || base == "." || base != snap.Filename {
|
|
return "", fmt.Errorf("invalid snapshot filename")
|
|
}
|
|
p := filepath.Join(e.snapDir, base)
|
|
abs, err := filepath.Abs(p)
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
absDir, _ := filepath.Abs(e.snapDir)
|
|
if !strings.HasPrefix(abs, absDir+string(filepath.Separator)) {
|
|
return "", fmt.Errorf("snapshot path escapes snapshot directory")
|
|
}
|
|
return abs, nil
|
|
}
|
|
|
|
// CleanOrphans removes snapshot archive files that have no metadata row,
|
|
// reconciling on-disk files against the DB. Workload deletion CASCADEs the
|
|
// volume_snapshots rows but cannot reach the files; this (run at startup)
|
|
// reclaims them. Mirrors backup.Engine.CleanOrphans.
|
|
func (e *Engine) CleanOrphans() (int, error) {
|
|
e.mu.Lock()
|
|
defer e.mu.Unlock()
|
|
|
|
entries, err := os.ReadDir(e.snapDir)
|
|
if err != nil {
|
|
return 0, fmt.Errorf("read snapshot dir: %w", err)
|
|
}
|
|
filenames, err := e.store.AllVolumeSnapshotFilenames()
|
|
if err != nil {
|
|
return 0, fmt.Errorf("list snapshot filenames: %w", err)
|
|
}
|
|
known := make(map[string]bool, len(filenames))
|
|
for _, f := range filenames {
|
|
known[f] = true
|
|
}
|
|
|
|
removed := 0
|
|
for _, ent := range entries {
|
|
if ent.IsDir() || known[ent.Name()] {
|
|
continue
|
|
}
|
|
if err := os.Remove(filepath.Join(e.snapDir, ent.Name())); err != nil {
|
|
slog.Warn("volume snapshot: remove orphan", "file", ent.Name(), "error", err)
|
|
continue
|
|
}
|
|
removed++
|
|
}
|
|
return removed, nil
|
|
}
|
|
|
|
// pruneWorkload deletes snapshots beyond maxSnapshotsPerWorkload for one
|
|
// workload (oldest first). Best-effort: caller already holds e.mu.
|
|
func (e *Engine) pruneWorkload(workloadID string) {
|
|
count, err := e.store.CountVolumeSnapshots(workloadID)
|
|
if err != nil || count <= maxSnapshotsPerWorkload {
|
|
return
|
|
}
|
|
oldest, err := e.store.GetOldestVolumeSnapshots(workloadID, count-maxSnapshotsPerWorkload)
|
|
if err != nil {
|
|
slog.Warn("volume snapshot: prune query", "workload", workloadID, "error", err)
|
|
return
|
|
}
|
|
for _, snap := range oldest {
|
|
if p, perr := e.FilePath(snap); perr == nil {
|
|
_ = os.Remove(p)
|
|
}
|
|
if derr := e.store.DeleteVolumeSnapshot(snap.ID); derr != nil {
|
|
slog.Warn("volume snapshot: prune delete", "id", snap.ID, "error", derr)
|
|
}
|
|
}
|
|
}
|
|
|
|
// idShort returns id truncated to its first 8 bytes; ids of 8 bytes or fewer
// are returned unchanged. Create uses it as the leading component of archive
// filenames.
func idShort(id string) string {
	const keep = 8
	if len(id) <= keep {
		return id
	}
	return id[:keep]
}
|