feat(volsnap): volume snapshot restore (backlog #6)

Restore a captured volume snapshot onto an image workload's live host-bind data volumes, then redeploy — the most destructive workload action, built to the adversarially-reviewed design (C1–C6) with all data-loss guards. - Engine.Restore (engine-owned): all-or-nothing pre-flight re-resolution from the workload's CURRENT config (never the tamperable manifest), per-filesystem disk pre-check, per-workload lock, container quiesce, extract-to-tmp, durable pre-restore snapshot, write-ahead journal, atomic rename swap, redeploy, and crash-recovery sweep (RecoverInterruptedRestores) wired before serving. - internal/keyedmutex: shared per-key lock; deployer now serializes every deploy entrypoint per workload via DispatchPlugin (+ LockWorkload/RedeployLocked for the restore re-dispatch, no deadlock). - Untrusted-archive extractor: zip-slip containment, type allow-list (reg/dir only), decompression-bomb cap, manifest-index bounds. - POST /api/workloads/{id}/snapshots/{sid}/restore: admin, X-Confirm-Restore header (CSRF), per-workload single-flight (409). - WebUI: Restore button + danger ConfirmDialog + busy state + i18n (en/ru). Scope: image-source only; scopes absolute/stage/project (driven off the same supportedScopes constant capture uses). Plan-reviewed before coding; per-phase go/security/ts reviews; final review READY TO MERGE. Security review caught + fixed a CRITICAL manifest-Source path traversal (re-derive target from current config + base containment). Plan: plans/volume-snapshot-restore/
2026-06-22 17:23:52 +03:00
parent 8a5f69af87
commit 1c47030854
33 changed files with 2825 additions and 34 deletions
@@ -14,6 +14,7 @@ import (
 	"github.com/alexei/tinyforge/internal/dns"
 	"github.com/alexei/tinyforge/internal/docker"
 	"github.com/alexei/tinyforge/internal/events"
+	"github.com/alexei/tinyforge/internal/keyedmutex"
 	"github.com/alexei/tinyforge/internal/notify"
 	"github.com/alexei/tinyforge/internal/npm"
 	"github.com/alexei/tinyforge/internal/proxy"
@@ -56,6 +57,11 @@ type Server struct {
 	// two concurrent syncs can't race on source_config (review S5).
 	gitopsSync keyedMutex

+	// volRestoreInFlight is a per-workload single-flight guard for volume
+	// snapshot restore: a concurrent restore of the same workload is rejected
+	// fast with 409 (TryLock) rather than queuing behind the deployer lock.
+	volRestoreInFlight keyedmutex.Mutex
+
 	dnsProviderMu        sync.RWMutex
 	dnsProvider          dns.Provider
 	onDNSProviderChanged DNSProviderChangedFunc
@@ -359,6 +365,10 @@ func (s *Server) Router() chi.Router {
 				r.With(auth.AdminOnly).Get("/snapshots", s.listWorkloadSnapshots)
 				r.With(auth.AdminOnly).Get("/snapshotable", s.getWorkloadSnapshotable)
 				r.With(auth.AdminOnly).Post("/snapshots", s.createWorkloadSnapshot)
+				// Restore overwrites live volume data and restarts the app — the
+				// most destructive workload action. Admin-gated + X-Confirm-Restore
+				// header (CSRF) + per-workload single-flight, mirroring DB restore.
+				r.With(auth.AdminOnly).Post("/snapshots/{sid}/restore", s.restoreWorkloadSnapshot)

 				// Runtime view: per-source persisted state + storage usage.
 				// Read-only; safe for any authenticated user.
@@ -140,6 +140,72 @@ func (s *Server) deleteSnapshot(w http.ResponseWriter, r *http.Request) {
 	respondJSON(w, http.StatusOK, map[string]string{"status": "deleted"})
 }

+// restoreWorkloadSnapshot handles POST /api/workloads/{id}/snapshots/{sid}/restore.
+//
+// This is the most destructive workload action: it overwrites the app's live
+// volume data with the snapshot and recreates its containers. It is guarded like
+// the DB restore — admin-only, an X-Confirm-Restore header that must echo the
+// snapshot id (defeats CSRF form/img posts, which can't set custom headers), and
+// a per-workload single-flight so a double-click can't stack two restores. All
+// the dangerous lock/stop/swap/redeploy logic lives in Engine.Restore; this
+// handler only validates and delegates.
+func (s *Server) restoreWorkloadSnapshot(w http.ResponseWriter, r *http.Request) {
+	if s.snapshotEngine == nil {
+		respondError(w, http.StatusServiceUnavailable, "snapshot engine not initialized")
+		return
+	}
+	id := chi.URLParam(r, "id")
+	sid := chi.URLParam(r, "sid")
+
+	if confirm := r.Header.Get("X-Confirm-Restore"); confirm != sid {
+		respondError(w, http.StatusBadRequest,
+			"missing or mismatched X-Confirm-Restore header (must equal snapshot id)")
+		return
+	}
+
+	// Up-front validation for precise client errors (Engine.Restore re-checks
+	// ownership + source kind under the lock).
+	snap, err := s.snapshotEngine.Get(sid)
+	if err != nil {
+		respondError(w, http.StatusNotFound, "snapshot not found")
+		return
+	}
+	if snap.WorkloadID != id {
+		respondError(w, http.StatusBadRequest, "snapshot does not belong to this workload")
+		return
+	}
+	row, ok := s.loadWorkload(w, id)
+	if !ok {
+		return
+	}
+	if row.SourceKind != "image" {
+		respondError(w, http.StatusBadRequest, "restore is only supported for image-source workloads")
+		return
+	}
+
+	// Per-workload single-flight: reject a concurrent restore of the SAME
+	// workload with 409 rather than queuing it behind the deployer lock.
+	release, ok := s.volRestoreInFlight.TryLock(id)
+	if !ok {
+		respondError(w, http.StatusConflict, "a restore is already in progress for this workload")
+		return
+	}
+	defer release()
+
+	if err := s.snapshotEngine.Restore(r.Context(), sid, id); err != nil {
+		// Raw error (which can carry resolved host paths) stays in the log; the
+		// client gets a generic message.
+		slog.Error("snapshots: restore failed", "workload", id, "snapshot", sid, "error", err)
+		respondError(w, http.StatusInternalServerError, "restore failed; see server logs")
+		return
+	}
+	respondJSON(w, http.StatusOK, map[string]any{
+		"status":      "restored",
+		"workload_id": id,
+		"snapshot_id": sid,
+	})
+}
+
 // downloadSnapshot handles GET /api/snapshots/{sid}/download, streaming the
 // tar.gz archive. The resolved path is containment-checked against the
 // snapshot directory.
@@ -1,12 +1,15 @@
 package api

 import (
+	"context"
 	"io"
 	"net/http"
 	"net/http/httptest"
 	"os"
 	"path/filepath"
+	"sync"
 	"testing"
+	"time"

 	"github.com/alexei/tinyforge/internal/auth"
 	"github.com/alexei/tinyforge/internal/store"
@@ -53,7 +56,211 @@ func newSnapshotEnv(t *testing.T) (*apiTestEnv, string) {
 		t.Fatalf("update settings: %v", err)
 	}

-	return &apiTestEnv{srv: httpsrv, store: st, dispatcher: dispatcher, adminToken: tok.Token, encKey: encKey}, baseVol
+	return &apiTestEnv{srv: httpsrv, store: st, dispatcher: dispatcher, adminToken: tok.Token, encKey: encKey, snapEngine: snapEng}, baseVol
+}
+
+// doRestore issues an authenticated restore POST, optionally setting the
+// X-Confirm-Restore header (pass confirm="" to omit it).
+func (e *apiTestEnv) doRestore(t *testing.T, workloadID, sid, confirm string) *http.Response {
+	t.Helper()
+	req, err := http.NewRequest(http.MethodPost,
+		e.srv.URL+"/api/workloads/"+workloadID+"/snapshots/"+sid+"/restore", nil)
+	if err != nil {
+		t.Fatalf("new request: %v", err)
+	}
+	req.Header.Set("Authorization", "Bearer "+e.adminToken)
+	if confirm != "" {
+		req.Header.Set("X-Confirm-Restore", confirm)
+	}
+	resp, err := http.DefaultClient.Do(req)
+	if err != nil {
+		t.Fatalf("do request: %v", err)
+	}
+	return resp
+}
+
+// okLifecycle is a no-op volsnap.Lifecycle for HTTP-layer happy-path tests; the
+// deep restore behavior is covered by the volsnap engine tests.
+type okLifecycle struct{ tag string }
+
+func (l *okLifecycle) Lock(string) func()                                     { return func() {} }
+func (l *okLifecycle) StopContainers(context.Context, string) (string, error) { return l.tag, nil }
+func (l *okLifecycle) Redeploy(context.Context, store.Workload, string) error { return nil }
+
+func TestRestoreSnapshot_RequiresConfirmHeader(t *testing.T) {
+	e, _ := newSnapshotEnv(t)
+	w, _ := e.store.CreateWorkload(store.Workload{Name: "a", Kind: "project", SourceKind: "image", SourceConfig: `{"image":"x","port":80}`})
+	snap, _ := e.store.CreateVolumeSnapshot(store.VolumeSnapshot{WorkloadID: w.ID, Filename: "f.tar.gz", Manifest: "[]"})
+
+	// Missing header → 400.
+	resp := e.doRestore(t, w.ID, snap.ID, "")
+	if resp.StatusCode != http.StatusBadRequest {
+		t.Fatalf("missing header status = %d, want 400", resp.StatusCode)
+	}
+	resp.Body.Close()
+	// Mismatched header → 400.
+	resp = e.doRestore(t, w.ID, snap.ID, "not-the-sid")
+	if resp.StatusCode != http.StatusBadRequest {
+		t.Fatalf("mismatched header status = %d, want 400", resp.StatusCode)
+	}
+	resp.Body.Close()
+}
+
+func TestRestoreSnapshot_WrongWorkload(t *testing.T) {
+	e, _ := newSnapshotEnv(t)
+	w, _ := e.store.CreateWorkload(store.Workload{Name: "a", Kind: "project", SourceKind: "image", SourceConfig: `{"image":"x","port":80}`})
+	snap, _ := e.store.CreateVolumeSnapshot(store.VolumeSnapshot{WorkloadID: w.ID, Filename: "f.tar.gz", Manifest: "[]"})
+
+	resp := e.doRestore(t, "some-other-workload", snap.ID, snap.ID)
+	if resp.StatusCode != http.StatusBadRequest {
+		t.Fatalf("cross-workload restore status = %d, want 400", resp.StatusCode)
+	}
+	resp.Body.Close()
+}
+
+func TestRestoreSnapshot_NonImageWorkload(t *testing.T) {
+	e, _ := newSnapshotEnv(t)
+	w, _ := e.store.CreateWorkload(store.Workload{Name: "site", Kind: "project", SourceKind: "static", SourceConfig: `{}`})
+	snap, _ := e.store.CreateVolumeSnapshot(store.VolumeSnapshot{WorkloadID: w.ID, Filename: "f.tar.gz", Manifest: "[]"})
+
+	resp := e.doRestore(t, w.ID, snap.ID, snap.ID)
+	if resp.StatusCode != http.StatusBadRequest {
+		t.Fatalf("non-image restore status = %d, want 400", resp.StatusCode)
+	}
+	resp.Body.Close()
+}
+
+func TestRestoreSnapshot_NotFound(t *testing.T) {
+	e, _ := newSnapshotEnv(t)
+	w, _ := e.store.CreateWorkload(store.Workload{Name: "a", Kind: "project", SourceKind: "image", SourceConfig: `{"image":"x","port":80}`})
+
+	resp := e.doRestore(t, w.ID, "missing-sid", "missing-sid")
+	if resp.StatusCode != http.StatusNotFound {
+		t.Fatalf("unknown snapshot status = %d, want 404", resp.StatusCode)
+	}
+	resp.Body.Close()
+}
+
+func TestRestoreSnapshot_HappyPath(t *testing.T) {
+	e, baseVol := newSnapshotEnv(t)
+	e.snapEngine.SetLifecycle(&okLifecycle{tag: "v1"})
+
+	w, err := e.store.CreateWorkload(store.Workload{
+		Name: "data-app", Kind: "project", SourceKind: "image",
+		SourceConfig: `{"image":"reg/app","port":80,"volumes":[{"source":"data","target":"/data","scope":"project"}]}`,
+	})
+	if err != nil {
+		t.Fatalf("create workload: %v", err)
+	}
+	if _, err := e.store.SetWorkloadVolume(store.WorkloadVolume{WorkloadID: w.ID, Target: "/data", Source: "data", Scope: "project"}); err != nil {
+		t.Fatalf("set volume: %v", err)
+	}
+	id8 := w.ID
+	if len(id8) > 8 {
+		id8 = id8[:8]
+	}
+	hostDir := filepath.Join(baseVol, "data-app-"+id8, "data")
+	if err := os.MkdirAll(hostDir, 0o755); err != nil {
+		t.Fatal(err)
+	}
+	if err := os.WriteFile(filepath.Join(hostDir, "payload.txt"), []byte("ORIGINAL"), 0o644); err != nil {
+		t.Fatal(err)
+	}
+	settings, _ := e.store.GetSettings()
+	snap, err := e.snapEngine.Create(w, settings, "base")
+	if err != nil {
+		t.Fatalf("create snapshot: %v", err)
+	}
+	// Drift the live data, then restore.
+	if err := os.WriteFile(filepath.Join(hostDir, "payload.txt"), []byte("CHANGED"), 0o644); err != nil {
+		t.Fatal(err)
+	}
+
+	resp := e.doRestore(t, w.ID, snap.ID, snap.ID)
+	if resp.StatusCode != http.StatusOK {
+		body, _ := io.ReadAll(resp.Body)
+		resp.Body.Close()
+		t.Fatalf("restore status = %d, body=%s", resp.StatusCode, body)
+	}
+	resp.Body.Close()
+	if got, _ := os.ReadFile(filepath.Join(hostDir, "payload.txt")); string(got) != "ORIGINAL" {
+		t.Errorf("payload.txt = %q, want ORIGINAL (restored)", got)
+	}
+}
+
+// blockingLifecycle blocks in Lock until released, signaling when entered — so
+// a test can hold one restore in-flight and assert a second is rejected 409.
+type blockingLifecycle struct {
+	entered chan struct{}
+	release chan struct{}
+	once    sync.Once
+}
+
+func (l *blockingLifecycle) Lock(string) func() {
+	l.once.Do(func() { close(l.entered) })
+	<-l.release
+	return func() {}
+}
+func (l *blockingLifecycle) StopContainers(context.Context, string) (string, error) { return "", nil }
+func (l *blockingLifecycle) Redeploy(context.Context, store.Workload, string) error { return nil }
+
+// seedRestorable creates an image workload with a project volume + live data and
+// a captured snapshot, returning the workload and snapshot ids.
+func seedRestorable(t *testing.T, e *apiTestEnv, baseVol string) (workloadID, snapshotID string) {
+	t.Helper()
+	w, err := e.store.CreateWorkload(store.Workload{
+		Name: "sf-app", Kind: "project", SourceKind: "image",
+		SourceConfig: `{"image":"reg/app","port":80,"volumes":[{"source":"data","target":"/data","scope":"project"}]}`,
+	})
+	if err != nil {
+		t.Fatalf("create workload: %v", err)
+	}
+	id8 := w.ID
+	if len(id8) > 8 {
+		id8 = id8[:8]
+	}
+	hostDir := filepath.Join(baseVol, "sf-app-"+id8, "data")
+	if err := os.MkdirAll(hostDir, 0o755); err != nil {
+		t.Fatal(err)
+	}
+	if err := os.WriteFile(filepath.Join(hostDir, "f.txt"), []byte("data"), 0o644); err != nil {
+		t.Fatal(err)
+	}
+	settings, _ := e.store.GetSettings()
+	snap, err := e.snapEngine.Create(w, settings, "base")
+	if err != nil {
+		t.Fatalf("create snapshot: %v", err)
+	}
+	return w.ID, snap.ID
+}
+
+func TestRestoreSnapshot_SingleFlight409(t *testing.T) {
+	e, baseVol := newSnapshotEnv(t)
+	wid, sid := seedRestorable(t, e, baseVol)
+	bl := &blockingLifecycle{entered: make(chan struct{}), release: make(chan struct{})}
+	e.snapEngine.SetLifecycle(bl)
+
+	// Restore #1: passes validation, takes the single-flight, then blocks inside
+	// the engine's Lock.
+	go func() {
+		resp := e.doRestore(t, wid, sid, sid)
+		resp.Body.Close()
+	}()
+
+	select {
+	case <-bl.entered:
+	case <-time.After(3 * time.Second):
+		t.Fatal("first restore never reached the lifecycle lock")
+	}
+
+	// Restore #2 for the same workload must be rejected fast with 409.
+	resp := e.doRestore(t, wid, sid, sid)
+	got := resp.StatusCode
+	resp.Body.Close()
+	close(bl.release) // let #1 finish
+	if got != http.StatusConflict {
+		t.Fatalf("concurrent restore status = %d, want 409", got)
+	}
 }

 func TestVolumeSnapshots_EndToEnd(t *testing.T) {
@@ -15,6 +15,7 @@ import (
 	"github.com/alexei/tinyforge/internal/auth"
 	"github.com/alexei/tinyforge/internal/crypto"
 	"github.com/alexei/tinyforge/internal/store"
+	"github.com/alexei/tinyforge/internal/volsnap"
 	"github.com/alexei/tinyforge/internal/webhook"
 	"github.com/alexei/tinyforge/internal/workload/plugin"

@@ -75,6 +76,7 @@ type apiTestEnv struct {
 	dispatcher *fakeAPIDispatcher
 	adminToken string
 	encKey     [32]byte
+	snapEngine *volsnap.Engine // set by newSnapshotEnv; nil otherwise
 }

 func (e *apiTestEnv) close() { e.srv.Close() }
@@ -670,9 +672,9 @@ func TestGetWorkloadChain_ParentSelfChildren(t *testing.T) {

 	resp := e.do(t, http.MethodGet, "/api/workloads/"+parentID+"/chain", nil)
 	var got struct {
-		Parent   *map[string]any   `json:"parent"`
-		Self     map[string]any    `json:"self"`
-		Children []map[string]any  `json:"children"`
+		Parent   *map[string]any  `json:"parent"`
+		Self     map[string]any   `json:"self"`
+		Children []map[string]any `json:"children"`
 	}
 	if errMsg := decodeEnvelope(t, resp, &got); errMsg != "" {
 		t.Fatalf("envelope error: %q", errMsg)
@@ -5,6 +5,7 @@
 package deployer

 import (
+	"context"
 	"fmt"
 	"log/slog"
 	"sync"
@@ -14,9 +15,11 @@ import (
 	"github.com/alexei/tinyforge/internal/docker"
 	"github.com/alexei/tinyforge/internal/events"
 	"github.com/alexei/tinyforge/internal/health"
+	"github.com/alexei/tinyforge/internal/keyedmutex"
 	"github.com/alexei/tinyforge/internal/notify"
 	"github.com/alexei/tinyforge/internal/proxy"
 	"github.com/alexei/tinyforge/internal/store"
+	"github.com/alexei/tinyforge/internal/workload/plugin"
 )

 // Deployer owns the dependency bundle each Source plugin needs at deploy
@@ -49,6 +52,29 @@ type Deployer struct {
 	drainMu      sync.Mutex
 	activeWg     sync.WaitGroup
 	shuttingDown atomic.Bool
+
+	// workloadLocks serializes deploy-class operations per workload id so two
+	// concurrent mutators of the same workload (a manual deploy, a webhook/
+	// trigger dispatch, a rollback, a promote, OR a volume-snapshot restore)
+	// can never interleave their container/volume changes. Every deploy
+	// entrypoint funnels through DispatchPlugin, so locking there gates them
+	// all at one choke point. This is the per-workload lock activeWg is NOT
+	// (activeWg is a global drain barrier for graceful shutdown).
+	workloadLocks keyedmutex.Mutex
+}
+
+// LockWorkload acquires the per-workload deploy lock for an external critical
+// section (volume-snapshot restore) and returns the release func. The restore
+// flow holds this across stop→swap→redeploy and redeploys via RedeployLocked
+// (which does NOT re-acquire it).
+func (d *Deployer) LockWorkload(id string) func() { return d.workloadLocks.Lock(id) }
+
+// RedeployLocked re-dispatches w WITHOUT acquiring the per-workload lock,
+// because the caller (restore) already holds it via LockWorkload. Calling the
+// normal DispatchPlugin here would deadlock — Go mutexes are not reentrant.
+// Not for general use.
+func (d *Deployer) RedeployLocked(ctx context.Context, w plugin.Workload, intent plugin.DeploymentIntent) error {
+	return d.dispatchLocked(ctx, w, intent)
 }

 // EventPublisher is the interface for publishing events to the event bus.
@@ -15,6 +15,18 @@ import (
 // operator enables auto_backup_before_deploy, a pre-deploy Tinyforge DB
 // snapshot is taken here, after the source resolves and before it runs.
 func (d *Deployer) DispatchPlugin(ctx context.Context, w plugin.Workload, intent plugin.DeploymentIntent) error {
+	// C1: serialize all deploy-class work per workload. Held across the whole
+	// deploy so a concurrent deploy/rollback/promote/trigger — or a volume
+	// restore (which redeploys via RedeployLocked while holding this) — can
+	// never interleave container changes for the same workload.
+	unlock := d.workloadLocks.Lock(w.ID)
+	defer unlock()
+	return d.dispatchLocked(ctx, w, intent)
+}
+
+// dispatchLocked is DispatchPlugin's body, assuming the per-workload lock is
+// already held. RedeployLocked calls it directly during restore.
+func (d *Deployer) dispatchLocked(ctx context.Context, w plugin.Workload, intent plugin.DeploymentIntent) error {
 	if err := d.beginDispatch(); err != nil {
 		metrics.DeploysTotal.Inc(w.SourceKind, "rejected_draining")
 		return err
@@ -52,6 +64,12 @@ func (d *Deployer) DispatchPlugin(ctx context.Context, w plugin.Workload, intent
 // Used when a workload is deleted. Tracked via activeWg so Drain() honours
 // in-progress teardowns just like deploys.
 func (d *Deployer) DispatchTeardown(ctx context.Context, w plugin.Workload) error {
+	// Teardown mutates the same containers/routes a deploy does, so it takes the
+	// per-workload lock too (C1). Callers tear down distinct workload ids
+	// sequentially (e.g. preview children then parent), never nested, so no
+	// self-deadlock.
+	unlock := d.workloadLocks.Lock(w.ID)
+	defer unlock()
 	if err := d.beginDispatch(); err != nil {
 		return err
 	}
@@ -0,0 +1,48 @@
+// Package keyedmutex provides a lazily-populated per-key mutex, so a critical
+// section can be serialized per key (e.g. per workload id) without a global
+// lock. It is the shared form of the pattern that originated inline in the
+// GitOps sync handler; the deployer (per-workload deploy serialization) and the
+// volume-snapshot restore single-flight both use it.
+package keyedmutex
+
+import "sync"
+
+// Mutex hands out one *sync.Mutex per key on demand. The zero value is ready to
+// use. The internal map only grows (one entry per distinct key ever locked),
+// which is bounded in practice by the number of workloads.
+type Mutex struct {
+	mu sync.Mutex
+	m  map[string]*sync.Mutex
+}
+
+func (k *Mutex) get(key string) *sync.Mutex {
+	k.mu.Lock()
+	defer k.mu.Unlock()
+	if k.m == nil {
+		k.m = make(map[string]*sync.Mutex)
+	}
+	mu, ok := k.m[key]
+	if !ok {
+		mu = &sync.Mutex{}
+		k.m[key] = mu
+	}
+	return mu
+}
+
+// Lock blocks until the mutex for key is acquired, then returns its unlock func.
+func (k *Mutex) Lock(key string) func() {
+	mu := k.get(key)
+	mu.Lock()
+	return mu.Unlock
+}
+
+// TryLock attempts to acquire the mutex for key without blocking. On success it
+// returns the unlock func and true; if the key is already locked it returns nil
+// and false so the caller can reject (e.g. HTTP 409) instead of queuing.
+func (k *Mutex) TryLock(key string) (func(), bool) {
+	mu := k.get(key)
+	if !mu.TryLock() {
+		return nil, false
+	}
+	return mu.Unlock, true
+}
@@ -0,0 +1,83 @@
+package keyedmutex
+
+import (
+	"sync"
+	"testing"
+	"time"
+)
+
+func TestLockSerializesSameKey(t *testing.T) {
+	var m Mutex
+	unlock := m.Lock("a")
+
+	acquired := make(chan struct{})
+	go func() {
+		u := m.Lock("a")
+		close(acquired)
+		u()
+	}()
+
+	select {
+	case <-acquired:
+		t.Fatal("second Lock on the same key acquired while the first was held")
+	case <-time.After(50 * time.Millisecond):
+		// expected: blocked
+	}
+	unlock()
+	select {
+	case <-acquired:
+		// expected: now acquired
+	case <-time.After(time.Second):
+		t.Fatal("second Lock did not acquire after release")
+	}
+}
+
+func TestLockIndependentKeys(t *testing.T) {
+	var m Mutex
+	unlockA := m.Lock("a")
+	defer unlockA()
+	// A different key must not block.
+	done := make(chan struct{})
+	go func() { u := m.Lock("b"); u(); close(done) }()
+	select {
+	case <-done:
+	case <-time.After(time.Second):
+		t.Fatal("Lock on an independent key blocked")
+	}
+}
+
+func TestTryLock(t *testing.T) {
+	var m Mutex
+	unlock, ok := m.TryLock("a")
+	if !ok {
+		t.Fatal("TryLock should succeed on a free key")
+	}
+	if _, ok := m.TryLock("a"); ok {
+		t.Fatal("TryLock should fail while the key is held")
+	}
+	unlock()
+	u2, ok := m.TryLock("a")
+	if !ok {
+		t.Fatal("TryLock should succeed after release")
+	}
+	u2()
+}
+
+func TestConcurrentLockNoRace(t *testing.T) {
+	var m Mutex
+	var wg sync.WaitGroup
+	counter := 0
+	for i := 0; i < 50; i++ {
+		wg.Add(1)
+		go func() {
+			defer wg.Done()
+			u := m.Lock("shared")
+			counter++ // protected by the keyed lock
+			u()
+		}()
+	}
+	wg.Wait()
+	if counter != 50 {
+		t.Errorf("counter = %d, want 50 (lost updates ⇒ lock not serializing)", counter)
+	}
+}
@@ -0,0 +1,21 @@
+//go:build !windows
+
+package volsnap
+
+import (
+	"fmt"
+
+	"golang.org/x/sys/unix"
+)
+
+// freeDiskBytes returns the bytes available to an unprivileged process on the
+// filesystem backing path (used by the restore disk pre-check, C5). path must
+// exist; callers pass the live dir's parent.
+func freeDiskBytes(path string) (uint64, error) {
+	var st unix.Statfs_t
+	if err := unix.Statfs(path, &st); err != nil {
+		return 0, fmt.Errorf("statfs %s: %w", path, err)
+	}
+	// Bavail is blocks available to non-root; Bsize is the fragment size.
+	return st.Bavail * uint64(st.Bsize), nil
+}
@@ -0,0 +1,24 @@
+//go:build windows
+
+package volsnap
+
+import (
+	"fmt"
+
+	"golang.org/x/sys/windows"
+)
+
+// freeDiskBytes returns the bytes available to the caller on the volume backing
+// path (used by the restore disk pre-check, C5). Windows is the dev platform;
+// production runs on Linux (see disk_unix.go).
+func freeDiskBytes(path string) (uint64, error) {
+	p, err := windows.UTF16PtrFromString(path)
+	if err != nil {
+		return 0, fmt.Errorf("encode path %s: %w", path, err)
+	}
+	var freeAvail, total, totalFree uint64
+	if err := windows.GetDiskFreeSpaceEx(p, &freeAvail, &total, &totalFree); err != nil {
+		return 0, fmt.Errorf("GetDiskFreeSpaceEx %s: %w", path, err)
+	}
+	return freeAvail, nil
+}
@@ -31,6 +31,12 @@ type Engine struct {
 	mu      sync.Mutex
 	store   *store.Store
 	snapDir string
+
+	// lifecycle is the deploy-side seam restore needs (per-workload lock, stop,
+	// redeploy). Wired post-construction via SetLifecycle from the composition
+	// root so volsnap stays decoupled from the deployer/docker packages. nil
+	// until wired; Restore refuses to run without it.
+	lifecycle Lifecycle
 }

 // New creates the snapshot engine, ensuring the snapshot directory exists.
@@ -0,0 +1,162 @@
+package volsnap
+
+import (
+	"archive/tar"
+	"compress/gzip"
+	"fmt"
+	"io"
+	"os"
+	"path"
+	"path/filepath"
+	"strconv"
+	"strings"
+)
+
+// safeExtractIndex extracts the files archived under the integer subdirectory
+// `index` of a snapshot tar.gz into dest, returning the total bytes written.
+//
+// On RESTORE the archive is treated as UNTRUSTED (it may have been downloaded,
+// hand-edited, or swapped on disk), so this is hardened well beyond what the
+// capture writer emits:
+//
+//   - zip-slip: every resolved target must stay within dest (HasPrefix check on
+//     the cleaned absolute path) — a "../" or absolute member is rejected.
+//   - type allow-list: ONLY regular files and directories are materialized;
+//     symlinks, hardlinks, char/block devices, fifos, and sockets are rejected
+//     outright (never created, never followed) — they could redirect a write
+//     outside the volume or smuggle in a device node.
+//   - decompression bomb: a running byte counter is capped at bombCap; the first
+//     byte past the cap aborts the extraction.
+//
+// dest must be a fresh staging directory (files are created O_EXCL). The caller
+// performs the atomic rename-swap of dest onto the live path separately.
+func safeExtractIndex(archivePath string, index int, dest string, bombCap int64) (int64, error) {
+	f, err := os.Open(archivePath)
+	if err != nil {
+		return 0, fmt.Errorf("open archive: %w", err)
+	}
+	defer f.Close()
+
+	gz, err := gzip.NewReader(f)
+	if err != nil {
+		return 0, fmt.Errorf("gzip reader: %w", err)
+	}
+	defer gz.Close()
+
+	cleanDest, err := filepath.Abs(dest)
+	if err != nil {
+		return 0, fmt.Errorf("resolve dest: %w", err)
+	}
+	if err := os.MkdirAll(cleanDest, 0o700); err != nil {
+		return 0, fmt.Errorf("create dest: %w", err)
+	}
+
+	tr := tar.NewReader(gz)
+	var written int64
+	for {
+		hdr, err := tr.Next()
+		if err == io.EOF {
+			break
+		}
+		if err != nil {
+			return written, fmt.Errorf("read tar: %w", err)
+		}
+
+		// Archive paths are always forward-slash. path.Clean collapses any
+		// "./" / "../" so the prefix and containment checks see a normal form.
+		name := path.Clean(hdr.Name)
+		if name == "manifest.json" {
+			continue
+		}
+		rel, ok := stripIndexPrefix(name, index)
+		if !ok {
+			continue // belongs to a different volume's subtree
+		}
+
+		switch hdr.Typeflag {
+		case tar.TypeReg, tar.TypeDir:
+			// allowed
+		default:
+			return written, fmt.Errorf("archive entry %q has disallowed type %q", hdr.Name, string(hdr.Typeflag))
+		}
+
+		target := cleanDest
+		if rel != "" {
+			target = filepath.Join(cleanDest, filepath.FromSlash(rel))
+		}
+		if !withinDir(cleanDest, target) {
+			return written, fmt.Errorf("archive entry %q escapes destination", hdr.Name)
+		}
+
+		if hdr.Typeflag == tar.TypeDir {
+			if err := os.MkdirAll(target, 0o700); err != nil {
+				return written, fmt.Errorf("mkdir %s: %w", target, err)
+			}
+			continue
+		}
+
+		if err := os.MkdirAll(filepath.Dir(target), 0o700); err != nil {
+			return written, fmt.Errorf("mkdir parent of %s: %w", target, err)
+		}
+		remaining := bombCap - written
+		if remaining <= 0 {
+			return written, fmt.Errorf("archive exceeds decompression cap of %d bytes", bombCap)
+		}
+		out, err := os.OpenFile(target, os.O_CREATE|os.O_EXCL|os.O_WRONLY, 0o600)
+		if err != nil {
+			return written, fmt.Errorf("create %s: %w", target, err)
+		}
+		// LimitReader to remaining+1: if the entry is larger than the cap allows,
+		// the extra byte is copied and written>bombCap trips the guard below.
+		n, copyErr := io.Copy(out, io.LimitReader(tr, remaining+1))
+		closeErr := out.Close()
+		written += n
+		if copyErr != nil {
+			return written, fmt.Errorf("write %s: %w", target, copyErr)
+		}
+		if closeErr != nil {
+			return written, fmt.Errorf("close %s: %w", target, closeErr)
+		}
+		if written > bombCap {
+			return written, fmt.Errorf("archive exceeds decompression cap of %d bytes", bombCap)
+		}
+	}
+	return written, nil
+}
+
+// stripIndexPrefix returns the path relative to the "<index>/" subtree and
+// whether name belongs to it. name=="<index>" (the subtree root) yields ("", true).
+// The "/" boundary keeps index 1 from matching "10/...".
+func stripIndexPrefix(name string, index int) (string, bool) {
+	p := strconv.Itoa(index)
+	if name == p {
+		return "", true
+	}
+	if strings.HasPrefix(name, p+"/") {
+		return name[len(p)+1:], true
+	}
+	return "", false
+}
+
+// leadingIndex parses the first path segment of an archive entry name as the
+// volume index. Returns false for manifest.json or any non-integer prefix.
+func leadingIndex(name string) (int, bool) {
+	seg := name
+	if i := strings.IndexByte(name, '/'); i >= 0 {
+		seg = name[:i]
+	}
+	idx, err := strconv.Atoi(seg)
+	if err != nil {
+		return 0, false
+	}
+	return idx, true
+}
+
+// withinDir reports whether target is base itself or lives beneath it. Both
+// args must already be cleaned absolute paths.
+func withinDir(base, target string) bool {
+	if target == base {
+		return true
+	}
+	return strings.HasPrefix(target, base+string(filepath.Separator))
+}
@@ -0,0 +1,166 @@
+package volsnap
+
+import (
+	"archive/tar"
+	"compress/gzip"
+	"os"
+	"path/filepath"
+	"strings"
+	"testing"
+)
+
+type tentry struct {
+	name     string
+	typeflag byte
+	body     string
+	linkname string
+}
+
+// buildTarGz writes an arbitrary (possibly hostile) tar.gz so the extractor's
+// untrusted-input hardening can be exercised — writeArchive only emits well-
+// formed reg/dir entries, which is the wrong shape for these tests.
+func buildTarGz(t *testing.T, entries []tentry) string {
+	t.Helper()
+	dest := filepath.Join(t.TempDir(), "snap.tar.gz")
+	f, err := os.Create(dest)
+	if err != nil {
+		t.Fatal(err)
+	}
+	gz := gzip.NewWriter(f)
+	tw := tar.NewWriter(gz)
+	for _, e := range entries {
+		hdr := &tar.Header{Name: e.name, Typeflag: e.typeflag, Mode: 0o600, Linkname: e.linkname}
+		switch e.typeflag {
+		case tar.TypeReg:
+			hdr.Size = int64(len(e.body))
+		case tar.TypeChar, tar.TypeBlock:
+			hdr.Devmajor, hdr.Devminor = 1, 1
+		}
+		if err := tw.WriteHeader(hdr); err != nil {
+			t.Fatal(err)
+		}
+		if e.typeflag == tar.TypeReg {
+			if _, err := tw.Write([]byte(e.body)); err != nil {
+				t.Fatal(err)
+			}
+		}
+	}
+	if err := tw.Close(); err != nil {
+		t.Fatal(err)
+	}
+	if err := gz.Close(); err != nil {
+		t.Fatal(err)
+	}
+	if err := f.Close(); err != nil {
+		t.Fatal(err)
+	}
+	return dest
+}
+
+func TestSafeExtractIndex_RoundTrip(t *testing.T) {
+	arc := buildTarGz(t, []tentry{
+		{name: "0/", typeflag: tar.TypeDir},
+		{name: "0/a.txt", typeflag: tar.TypeReg, body: "hello"},
+		{name: "0/sub/", typeflag: tar.TypeDir},
+		{name: "0/sub/b.txt", typeflag: tar.TypeReg, body: "world"},
+		{name: "manifest.json", typeflag: tar.TypeReg, body: "[]"},
+	})
+	dest := filepath.Join(t.TempDir(), "out")
+	n, err := safeExtractIndex(arc, 0, dest, maxRestoreUncompressedBytes)
+	if err != nil {
+		t.Fatalf("extract: %v", err)
+	}
+	if n != int64(len("hello")+len("world")) {
+		t.Errorf("written = %d, want %d", n, len("hello")+len("world"))
+	}
+	if got, _ := os.ReadFile(filepath.Join(dest, "a.txt")); string(got) != "hello" {
+		t.Errorf("a.txt = %q", got)
+	}
+	if got, _ := os.ReadFile(filepath.Join(dest, "sub", "b.txt")); string(got) != "world" {
+		t.Errorf("sub/b.txt = %q", got)
+	}
+	if _, err := os.Stat(filepath.Join(dest, "manifest.json")); !os.IsNotExist(err) {
+		t.Error("manifest.json must not be extracted into the volume")
+	}
+}
+
+func TestSafeExtractIndex_IsolatesIndex(t *testing.T) {
+	arc := buildTarGz(t, []tentry{
+		{name: "1/keep.txt", typeflag: tar.TypeReg, body: "one"},
+		{name: "10/other.txt", typeflag: tar.TypeReg, body: "ten"},
+		{name: "2/nope.txt", typeflag: tar.TypeReg, body: "two"},
+	})
+	dest := filepath.Join(t.TempDir(), "out")
+	if _, err := safeExtractIndex(arc, 1, dest, maxRestoreUncompressedBytes); err != nil {
+		t.Fatalf("extract: %v", err)
+	}
+	if _, err := os.Stat(filepath.Join(dest, "keep.txt")); err != nil {
+		t.Errorf("index 1 file missing: %v", err)
+	}
+	// "10/" must not bleed into index 1 (prefix boundary), nor "2/".
+	for _, leaked := range []string{"other.txt", "nope.txt"} {
+		if _, err := os.Stat(filepath.Join(dest, leaked)); !os.IsNotExist(err) {
+			t.Errorf("index 1 extraction leaked %q from another index", leaked)
+		}
+	}
+}
+
+func TestSafeExtractIndex_RejectsDisallowedTypes(t *testing.T) {
+	cases := map[string]tentry{
+		"symlink":  {name: "0/link", typeflag: tar.TypeSymlink, linkname: "/etc/passwd"},
+		"hardlink": {name: "0/hard", typeflag: tar.TypeLink, linkname: "0/real"},
+		"chardev":  {name: "0/cdev", typeflag: tar.TypeChar},
+		"blockdev": {name: "0/bdev", typeflag: tar.TypeBlock},
+		"fifo":     {name: "0/fifo", typeflag: tar.TypeFifo},
+		"sparse":   {name: "0/sparse", typeflag: tar.TypeCont}, // GNU sparse / contiguous
+	}
+	for name, ent := range cases {
+		t.Run(name, func(t *testing.T) {
+			arc := buildTarGz(t, []tentry{ent})
+			dest := filepath.Join(t.TempDir(), "out")
+			if _, err := safeExtractIndex(arc, 0, dest, maxRestoreUncompressedBytes); err == nil {
+				t.Fatalf("expected %s entry to be rejected", name)
+			}
+		})
+	}
+}
+
+func TestSafeExtractIndex_RejectsBomb(t *testing.T) {
+	arc := buildTarGz(t, []tentry{
+		{name: "0/big.bin", typeflag: tar.TypeReg, body: strings.Repeat("x", 4096)},
+	})
+	dest := filepath.Join(t.TempDir(), "out")
+	if _, err := safeExtractIndex(arc, 0, dest, 1024); err == nil {
+		t.Fatal("expected extraction to abort past the decompression cap")
+	}
+}
+
+func TestSafeExtractIndex_NoEscapeOutsideDest(t *testing.T) {
+	// A "../" climb and an absolute member must never materialize a file
+	// outside dest, regardless of which guard catches it. Note the backslash
+	// case is platform-split: on Windows `..\winslip.txt` is a real climb that
+	// withinDir rejects; on Linux it is a literal one-segment filename that
+	// stays harmlessly inside dest (no guard fires). Both satisfy containment.
+	arc := buildTarGz(t, []tentry{
+		{name: "0/../../escape.txt", typeflag: tar.TypeReg, body: "pwned"},
+		{name: "/abs-escape.txt", typeflag: tar.TypeReg, body: "pwned"},
+		{name: `0/..\winslip.txt`, typeflag: tar.TypeReg, body: "pwned"},
+		{name: "0/ok.txt", typeflag: tar.TypeReg, body: "fine"},
+	})
+	outParent := t.TempDir()
+	dest := filepath.Join(outParent, "out")
+	// May or may not error depending on platform/guard; the invariant is that
+	// nothing escapes dest.
+	_, _ = safeExtractIndex(arc, 0, dest, maxRestoreUncompressedBytes)
+
+	for _, escaped := range []string{
+		filepath.Join(outParent, "escape.txt"),
+		filepath.Join(filepath.Dir(outParent), "escape.txt"),
+		filepath.Join(outParent, "abs-escape.txt"),
+		filepath.Join(outParent, "winslip.txt"),
+	} {
+		if _, err := os.Stat(escaped); err == nil {
+			t.Errorf("zip-slip escaped to %s", escaped)
+		}
+	}
+}
@@ -0,0 +1,235 @@
+package volsnap
+
+import (
+	"archive/tar"
+	"compress/gzip"
+	"encoding/json"
+	"fmt"
+	"io"
+	"os"
+	"path"
+	"path/filepath"
+
+	"github.com/alexei/tinyforge/internal/store"
+	"github.com/alexei/tinyforge/internal/volume"
+)
+
+// maxRestoreUncompressedBytes caps the total decompressed size accepted from a
+// snapshot archive during restore (decompression-bomb defence). 50 GiB is far
+// above any realistic app data volume while still bounding a hostile archive.
+const maxRestoreUncompressedBytes int64 = 50 << 30
+
+// diskFreeHeadroomBytes is extra free space required beyond the extracted size
+// so a restore never fills the target filesystem to the brim. The live copy is
+// renamed aside (no new space), so the new allocation is ~the extracted size;
+// this headroom covers filesystem overhead and metadata.
+const diskFreeHeadroomBytes int64 = 256 << 20
+
+// resolvedVol is a manifest volume whose live host path has been re-resolved
+// against the workload's CURRENT config (all-or-nothing pre-flight, C3).
+type resolvedVol struct {
+	Index    int
+	Target   string
+	Scope    string
+	LivePath string
+}
+
+// parseManifest decodes the snapshot row's manifest JSON ([]SnapshotVolume).
+func parseManifest(snap store.VolumeSnapshot) ([]SnapshotVolume, error) {
+	var m []SnapshotVolume
+	if err := json.Unmarshal([]byte(snap.Manifest), &m); err != nil {
+		return nil, fmt.Errorf("parse snapshot manifest: %w", err)
+	}
+	if len(m) == 0 {
+		return nil, fmt.Errorf("snapshot manifest is empty")
+	}
+	return m, nil
+}
+
+// preflightResolve re-derives every manifest volume's live host path from the
+// workload's CURRENT config, ALL-OR-NOTHING (C3): if any snapshotted target is
+// no longer declared, its scope is unsupported, or it can't resolve, it returns
+// an error and the caller MUST abort BEFORE stopping containers or touching
+// disk — config drift mid-restore is silent corruption.
+//
+// SECURITY: the swap target is keyed on the manifest's container Target path but
+// its host directory is derived from the CURRENT (trusted, operator-set)
+// Source/Scope — never from the snapshot manifest's persisted Source/Scope. The
+// manifest column is attacker-influenceable (e.g. a restored/tampered DB), and
+// trusting its Source for stage/project scope would let `Source:"../../etc"`
+// redirect the destructive rename-swap outside the volume tree. As defence in
+// depth, base-relative resolved paths are asserted to stay under BaseVolumePath.
+func preflightResolve(st *store.Store, w store.Workload, settings store.Settings, manifest []SnapshotVolume) ([]resolvedVol, error) {
+	current, err := volumesByTarget(st, w)
+	if err != nil {
+		return nil, fmt.Errorf("load current volumes: %w", err)
+	}
+	params := volume.ResolveWorkloadParams{
+		BasePath:           settings.BaseVolumePath,
+		WorkloadID:         w.ID,
+		WorkloadName:       w.Name,
+		AllowedVolumePaths: settings.AllowedVolumePaths,
+	}
+	out := make([]resolvedVol, 0, len(manifest))
+	for _, mv := range manifest {
+		// A negative index can never name an archive subtree.
+		if mv.Index < 0 {
+			return nil, fmt.Errorf("volume %q has invalid index %d", mv.Target, mv.Index)
+		}
+		cur, ok := current[mv.Target]
+		if !ok {
+			return nil, fmt.Errorf("volume %q is no longer declared by the workload", mv.Target)
+		}
+		if !supportedScopes[cur.Scope] {
+			return nil, fmt.Errorf("volume %q scope %q is not restorable", mv.Target, cur.Scope)
+		}
+		live, err := volume.ResolveWorkloadPath(cur, params)
+		if err != nil {
+			return nil, fmt.Errorf("resolve volume %q (%s): %w", mv.Target, cur.Scope, err)
+		}
+		// Containment: the destructive swap target must stay inside the volume
+		// root. Base-relative scopes must resolve under BaseVolumePath; absolute
+		// scope is already constrained to AllowedVolumePaths by the resolver.
+		if cur.Scope != string(store.VolumeScopeAbsolute) {
+			contained, cerr := pathWithinBase(settings.BaseVolumePath, live)
+			if cerr != nil || !contained {
+				return nil, fmt.Errorf("resolved path for volume %q escapes the volume root", mv.Target)
+			}
+		}
+		out = append(out, resolvedVol{Index: mv.Index, Target: mv.Target, Scope: cur.Scope, LivePath: live})
+	}
+	return out, nil
+}
+
+// pathWithinBase reports whether target resolves to base or a path beneath it.
+// An empty base is treated as non-containing (refuse rather than allow).
+func pathWithinBase(base, target string) (bool, error) {
+	if base == "" {
+		return false, nil
+	}
+	absBase, err := filepath.Abs(base)
+	if err != nil {
+		return false, err
+	}
+	absTarget, err := filepath.Abs(target)
+	if err != nil {
+		return false, err
+	}
+	return withinDir(absBase, absTarget), nil
+}
+
+// archiveUncompressedSize scans the archive's tar headers and returns the
+// per-index and total uncompressed sizes, enforcing bombCap so a hostile
+// archive can't make the disk pre-check allocate unbounded. Feeds the
+// per-filesystem free-space pre-check (C5).
+//
+// The total is a LOWER-BOUND estimate of on-disk consumption: it sums regular-
+// file bytes only, ignoring directory entries and per-file inode/block-rounding
+// overhead, so a volume of many tiny files consumes more than reported. The
+// real safety net is the staged extract + atomic swap (a mid-extract ENOSPC
+// discards the staging dir and leaves live untouched), not this pre-check.
+//
+// "No body copy" is at the API level only — tar.Next still inflates and
+// discards each skipped body, so a 50 GiB-of-headers archive does 50 GiB of
+// gzip work; bombCap bounds that.
+func archiveUncompressedSize(archivePath string, bombCap int64) (perIndex map[int]int64, total int64, err error) {
+	f, err := os.Open(archivePath)
+	if err != nil {
+		return nil, 0, fmt.Errorf("open archive: %w", err)
+	}
+	defer f.Close()
+	gz, err := gzip.NewReader(f)
+	if err != nil {
+		return nil, 0, fmt.Errorf("gzip reader: %w", err)
+	}
+	defer gz.Close()
+
+	perIndex = map[int]int64{}
+	tr := tar.NewReader(gz)
+	for {
+		hdr, e := tr.Next()
+		if e == io.EOF {
+			break
+		}
+		if e != nil {
+			return nil, 0, fmt.Errorf("read tar: %w", e)
+		}
+		if hdr.Typeflag != tar.TypeReg {
+			continue
+		}
+		name := path.Clean(hdr.Name)
+		if name == "manifest.json" {
+			continue
+		}
+		idx, ok := leadingIndex(name)
+		if !ok {
+			continue
+		}
+		total += hdr.Size
+		if total > bombCap {
+			return nil, 0, fmt.Errorf("archive exceeds decompression cap of %d bytes", bombCap)
+		}
+		perIndex[idx] += hdr.Size
+	}
+	return perIndex, total, nil
+}
+
+// swap records one volume's atomic dir replacement so it can be rolled back.
+type swap struct {
+	live   string
+	old    string // where the prior live dir was set aside ("" if live didn't exist)
+	tmp    string // staging dir holding the freshly-extracted data
+	hadOld bool   // whether a prior live dir existed and was moved to old
+}
+
+// stagingDirs returns the per-volume tmp and old staging paths as SIBLINGS of
+// the live dir's parent, so every rename in the swap is intra-filesystem and
+// therefore atomic (R2). A cross-device rename (live is itself a mountpoint)
+// fails loudly in swapVolumeDir rather than silently degrading to a copy.
+func stagingDirs(live, token string, index int) (tmp, old string) {
+	parent := filepath.Dir(live)
+	base := fmt.Sprintf(".tf-restore-%s-%d", token, index)
+	return filepath.Join(parent, base+".tmp"), filepath.Join(parent, base+".old")
+}
+
+// swapVolumeDir performs the crash-minimal two-rename swap: set the live dir
+// aside to old (if it exists), then move the staged tmp into place (C2). On the
+// second rename failing it reverts the first so live is never left missing.
+// Returns whether a prior live dir was preserved at old (for rollback).
+func swapVolumeDir(live, tmp, old string) (hadOld bool, err error) {
+	if _, statErr := os.Lstat(live); statErr == nil {
+		if rerr := os.Rename(live, old); rerr != nil {
+			return false, fmt.Errorf("set aside live %s: %w", live, rerr)
+		}
+		hadOld = true
+	} else if !os.IsNotExist(statErr) {
+		return false, fmt.Errorf("stat live %s: %w", live, statErr)
+	}
+
+	if mkErr := os.MkdirAll(filepath.Dir(live), 0o700); mkErr != nil {
+		if hadOld {
+			_ = os.Rename(old, live)
+		}
+		return hadOld, fmt.Errorf("ensure parent of %s: %w", live, mkErr)
+	}
+	if rerr := os.Rename(tmp, live); rerr != nil {
+		if hadOld {
+			_ = os.Rename(old, live) // revert: live is never left missing
+		}
+		return hadOld, fmt.Errorf("promote restored data into %s: %w", live, rerr)
+	}
+	return hadOld, nil
+}
+
+// rollbackSwaps reverts completed swaps in reverse order: drop the restored
+// live dir and move the preserved original back. Best-effort — each step is
+// logged by the caller; rollback must attempt every volume regardless.
+func rollbackSwaps(done []swap) {
+	for i := len(done) - 1; i >= 0; i-- {
+		s := done[i]
+		_ = os.RemoveAll(s.live)
+		if s.hadOld {
+			_ = os.Rename(s.old, s.live)
+		}
+	}
+}
@@ -0,0 +1,400 @@
+package volsnap
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"log/slog"
+	"os"
+	"path/filepath"
+	"strings"
+
+	"github.com/google/uuid"
+
+	"github.com/alexei/tinyforge/internal/store"
+)
+
+// Lifecycle is the deploy-side seam Engine.Restore needs but volsnap must not
+// import directly (it would couple the snapshot package to the deployer/docker
+// packages). The composition root supplies an adapter over the Deployer +
+// Docker client via Engine.SetLifecycle.
+type Lifecycle interface {
+	// Lock acquires the per-workload deploy lock (C1) and returns the release
+	// func. Held by Restore across stop→swap→redeploy.
+	Lock(workloadID string) func()
+	// StopContainers stops every running container for the workload (C4 quiesce)
+	// and returns the image tag the newest running container was on, so the
+	// redeploy can bring the SAME version back up ("" ⇒ source default tag).
+	StopContainers(ctx context.Context, workloadID string) (runningTag string, err error)
+	// Redeploy re-dispatches the workload's current config WITHOUT re-acquiring
+	// the per-workload lock (the caller holds it). reference pins the image tag.
+	Redeploy(ctx context.Context, w store.Workload, reference string) error
+}
+
+// SetLifecycle wires the deploy-side seam. Pass nil to leave restore disabled.
+func (e *Engine) SetLifecycle(lc Lifecycle) { e.lifecycle = lc }
+
+// restoreJournal is the on-disk write-ahead record of an in-flight restore.
+// Written before the first destructive rename and deleted on completion; the
+// startup RecoverInterruptedRestores sweep replays it after a crash.
+type restoreJournal struct {
+	SnapshotID string          `json:"snapshot_id"`
+	WorkloadID string          `json:"workload_id"`
+	Volumes    []journalVolume `json:"volumes"`
+}
+
+type journalVolume struct {
+	Live    string `json:"live"`
+	Old     string `json:"old"`
+	Tmp     string `json:"tmp"`
+	Swapped bool   `json:"swapped"`
+	HadOld  bool   `json:"had_old"`
+}
+
+// staged pairs a resolved volume with its per-restore staging dirs.
+type staged struct {
+	rv  resolvedVol
+	tmp string
+	old string
+}
+
+// Restore overwrites the workload's live host-bind volumes with a snapshot's
+// contents and brings the app back up. It is the single, engine-owned entry
+// point for the data-loss-sensitive restore flow (image-source workloads only).
+//
+// Ordering is deliberate and crash-aware:
+//
+//	pre-flight (re-resolve all volumes C3, size + per-fs disk check C5) — abort
+//	  here touches nothing
+//	→ Lock (C1)  → re-validate workload  → StopContainers (C4 quiesce)
+//	→ extract ALL volumes to sibling .tmp staging dirs (reads the source archive
+//	  fully BEFORE the next step can prune it; shrinks the later destructive
+//	  window to pure renames — R3)
+//	→ capture a pre-restore snapshot (durable escape hatch, after quiesce,
+//	  before any destructive rename — folded suggestion)
+//	→ write the restore journal (R3 crash recovery)
+//	→ swap each volume atomically (rename live→.old, .tmp→live — C2)
+//	→ Redeploy (C4 — image containers are recreated, never reused)
+//	→ remove .old + journal, emit audit event
+//
+// Engine.Restore holds NO e.mu (R1): per-workload serialization is the
+// Lifecycle lock; e.Create takes its own e.mu for the pre-restore archive
+// write, so calling it here cannot self-deadlock.
+func (e *Engine) Restore(ctx context.Context, snapshotID, workloadID string) error {
+	if e.lifecycle == nil {
+		return fmt.Errorf("restore: lifecycle not configured")
+	}
+
+	snap, err := e.store.GetVolumeSnapshot(snapshotID)
+	if err != nil {
+		return err
+	}
+	if snap.WorkloadID != workloadID {
+		return fmt.Errorf("snapshot %s does not belong to workload %s", snapshotID, workloadID)
+	}
+	w, err := e.store.GetWorkloadByID(workloadID)
+	if err != nil {
+		return err
+	}
+	if w.SourceKind != "image" {
+		return fmt.Errorf("restore is only supported for image-source workloads")
+	}
+	settings, err := e.store.GetSettings()
+	if err != nil {
+		return fmt.Errorf("load settings: %w", err)
+	}
+
+	manifest, err := parseManifest(snap)
+	if err != nil {
+		return err
+	}
+	resolved, err := preflightResolve(e.store, w, settings, manifest) // C3 all-or-nothing
+	if err != nil {
+		return fmt.Errorf("pre-flight: %w", err)
+	}
+	archivePath, err := e.FilePath(snap)
+	if err != nil {
+		return err
+	}
+	perIndex, _, err := archiveUncompressedSize(archivePath, maxRestoreUncompressedBytes)
+	if err != nil {
+		return fmt.Errorf("size snapshot: %w", err)
+	}
+	if err := checkDiskSpace(resolved, perIndex); err != nil { // C5
+		return err
+	}
+
+	// ── past pre-flight: take the per-workload lock and quiesce ──────────────
+	unlock := e.lifecycle.Lock(workloadID)
+	defer unlock()
+
+	// A teardown may have won the lock and deleted the workload while we waited.
+	if _, err := e.store.GetWorkloadByID(workloadID); err != nil {
+		return fmt.Errorf("workload disappeared before restore: %w", err)
+	}
+
+	tag, err := e.lifecycle.StopContainers(ctx, workloadID) // C4 stop
+	if err != nil {
+		return fmt.Errorf("stop containers: %w", err)
+	}
+
+	// Extract every volume to its staging dir FIRST. This reads the source
+	// archive fully before the pre-restore capture below can prune it, and
+	// leaves only pure renames for the destructive phase (R3).
+	token := uuid.New().String()[:8]
+	stagedVols := make([]staged, 0, len(resolved))
+	for _, rv := range resolved {
+		tmp, old := stagingDirs(rv.LivePath, token, rv.Index)
+		if _, exErr := safeExtractIndex(archivePath, rv.Index, tmp, maxRestoreUncompressedBytes); exErr != nil {
+			cleanupStaging(stagedVols)
+			_ = os.RemoveAll(tmp)
+			// Nothing swapped yet — bring the app back up on its original data.
+			e.redeployAfterAbort(ctx, w, tag)
+			return fmt.Errorf("extract volume %q: %w", rv.Target, exErr)
+		}
+		stagedVols = append(stagedVols, staged{rv: rv, tmp: tmp, old: old})
+	}
+
+	// Durable pre-restore snapshot (escape hatch). Quiesced (after stop), and
+	// the source archive is already fully extracted so a prune here is harmless.
+	// Best-effort, matching the DB-restore precedent: a failure is logged but
+	// does not abort — the .old dirs + journal are the in-operation safety net.
+	if _, err := e.Create(w, settings, "pre-restore"); err != nil {
+		slog.Warn("restore: pre-restore snapshot failed (continuing)",
+			"workload", workloadID, "error", err)
+	}
+
+	// Journal before the first destructive rename so a crash can be recovered.
+	jr := restoreJournal{SnapshotID: snapshotID, WorkloadID: workloadID}
+	for _, sv := range stagedVols {
+		jr.Volumes = append(jr.Volumes, journalVolume{Live: sv.rv.LivePath, Old: sv.old, Tmp: sv.tmp})
+	}
+	if err := e.writeJournal(jr); err != nil {
+		cleanupStaging(stagedVols)
+		e.redeployAfterAbort(ctx, w, tag)
+		return fmt.Errorf("write restore journal: %w", err)
+	}
+
+	// ── destructive phase: pure atomic renames ──────────────────────────────
+	done := make([]swap, 0, len(stagedVols))
+	for i, sv := range stagedVols {
+		hadOld, swErr := swapVolumeDir(sv.rv.LivePath, sv.tmp, sv.old)
+		if swErr != nil {
+			rollbackSwaps(done)               // restore already-swapped volumes
+			cleanupStagingFrom(stagedVols, i) // drop remaining un-swapped tmp/old
+			e.removeJournal(workloadID)
+			e.redeployAfterAbort(ctx, w, tag)
+			return fmt.Errorf("swap volume %q: %w", sv.rv.Target, swErr)
+		}
+		done = append(done, swap{live: sv.rv.LivePath, old: sv.old, tmp: sv.tmp, hadOld: hadOld})
+		jr.Volumes[i].Swapped = true
+		jr.Volumes[i].HadOld = hadOld
+		_ = e.writeJournal(jr) // progress checkpoint (best-effort)
+	}
+
+	// Bring the app back up against the restored data (C4 — recreate, redeploy).
+	if err := e.lifecycle.Redeploy(ctx, w, tag); err != nil {
+		// The data IS restored; only the app failed to come back. Do NOT roll
+		// back the volumes — surface the redeploy error so the operator retries
+		// a deploy. Clean up the .old set-asides and the journal.
+		cleanupOld(done)
+		e.removeJournal(workloadID)
+		return fmt.Errorf("redeploy after restore: %w", err)
+	}
+
+	cleanupOld(done)
+	e.removeJournal(workloadID)
+	e.emitRestoreEvent(workloadID, snapshotID, len(done))
+	slog.Info("volume snapshot restored", "workload", workloadID, "snapshot", snapshotID, "volumes", len(done))
+	return nil
+}
+
+// redeployAfterAbort re-dispatches after an aborted restore so a stopped app
+// does not stay down. Best-effort: the error is logged, not returned (the
+// restore failure is the primary error the caller surfaces).
+func (e *Engine) redeployAfterAbort(ctx context.Context, w store.Workload, tag string) {
+	if err := e.lifecycle.Redeploy(ctx, w, tag); err != nil {
+		slog.Warn("restore: redeploy after abort failed", "workload", w.ID, "error", err)
+	}
+}
+
+// RecoverInterruptedRestores replays restore journals left by a crash mid-
+// restore, mirroring CleanOrphans (run once at startup, before serving). For
+// each volume: a completed swap keeps the restored live dir and drops the set-
+// aside original; an incomplete swap that left live missing is reverted from
+// .old; stray staging dirs are removed. Returns the number of journals handled.
+func (e *Engine) RecoverInterruptedRestores() (int, error) {
+	e.mu.Lock()
+	defer e.mu.Unlock()
+
+	entries, err := os.ReadDir(e.snapDir)
+	if err != nil {
+		return 0, fmt.Errorf("read snapshot dir: %w", err)
+	}
+	recovered := 0
+	for _, ent := range entries {
+		name := ent.Name()
+		if ent.IsDir() || !strings.HasPrefix(name, "restore-") || !strings.HasSuffix(name, ".json") {
+			continue
+		}
+		path := filepath.Join(e.snapDir, name)
+		data, rerr := os.ReadFile(path)
+		if rerr != nil {
+			slog.Warn("restore recovery: read journal", "file", name, "error", rerr)
+			continue
+		}
+		var jr restoreJournal
+		if jerr := json.Unmarshal(data, &jr); jerr != nil {
+			slog.Warn("restore recovery: parse journal", "file", name, "error", jerr)
+			continue
+		}
+		slog.Warn("restore recovery: replaying interrupted restore",
+			"workload", jr.WorkloadID, "snapshot", jr.SnapshotID, "volumes", len(jr.Volumes))
+		for _, v := range jr.Volumes {
+			recoverVolume(v)
+		}
+		if rmErr := os.Remove(path); rmErr != nil {
+			slog.Warn("restore recovery: remove journal", "file", name, "error", rmErr)
+		}
+		recovered++
+	}
+	return recovered, nil
+}
+
+// recoverVolume reconciles a single volume's on-disk state from its journal
+// entry after a crash. Each branch leaves the live dir intact (either restored
+// or original) and removes staging leftovers.
+func recoverVolume(v journalVolume) {
+	if v.Swapped {
+		// Swap completed: live already holds restored data. Drop the set-aside.
+		_ = os.RemoveAll(v.Old)
+		_ = os.RemoveAll(v.Tmp)
+		return
+	}
+	if _, err := os.Lstat(v.Live); os.IsNotExist(err) {
+		if _, oerr := os.Lstat(v.Old); oerr == nil {
+			// Crashed mid-rename (live→old done, tmp→live not): revert.
+			_ = os.Rename(v.Old, v.Live)
+		}
+	} else {
+		// live is intact (original). Any .old is a dangling partial copy.
+		_ = os.RemoveAll(v.Old)
+	}
+	_ = os.RemoveAll(v.Tmp)
+}
+
+// ── journal + cleanup helpers ───────────────────────────────────────────────
+
+func (e *Engine) journalPath(workloadID string) string {
+	// workloadID is a server-generated id (loaded from the DB before we get
+	// here). filepath.Base defends against any separator sneaking into the name.
+	return filepath.Join(e.snapDir, "restore-"+filepath.Base(workloadID)+".json")
+}
+
+func (e *Engine) writeJournal(jr restoreJournal) error {
+	data, err := json.Marshal(jr)
+	if err != nil {
+		return fmt.Errorf("encode journal: %w", err)
+	}
+	// Write atomically (tmp + rename): a torn journal would silently disable the
+	// recovery sweep (RecoverInterruptedRestores skips unparseable journals), so
+	// a crash mid-write must never leave a half-written WAL on disk. The .tmp
+	// suffix is ignored by the recovery scan (it matches *.json only).
+	final := e.journalPath(jr.WorkloadID)
+	tmp := final + ".tmp"
+	if err := os.WriteFile(tmp, data, 0o600); err != nil {
+		return fmt.Errorf("write journal: %w", err)
+	}
+	if err := os.Rename(tmp, final); err != nil {
+		_ = os.Remove(tmp)
+		return fmt.Errorf("commit journal: %w", err)
+	}
+	return nil
+}
+
+func (e *Engine) removeJournal(workloadID string) {
+	if err := os.Remove(e.journalPath(workloadID)); err != nil && !os.IsNotExist(err) {
+		slog.Warn("restore: remove journal", "workload", workloadID, "error", err)
+	}
+}
+
+func (e *Engine) emitRestoreEvent(workloadID, snapshotID string, volumes int) {
+	meta, _ := json.Marshal(map[string]any{"snapshot_id": snapshotID, "volumes": volumes})
+	if _, err := e.store.InsertEvent(store.EventLog{
+		Source:     "volsnap",
+		WorkloadID: workloadID,
+		Severity:   "info",
+		Message:    "volume snapshot restored",
+		Metadata:   string(meta),
+	}); err != nil {
+		slog.Warn("restore: record event", "workload", workloadID, "error", err)
+	}
+}
+
+// cleanupStaging removes the tmp + old staging dirs for every staged volume
+// (used when aborting before the swap phase).
+func cleanupStaging(sv []staged) {
+	for _, s := range sv {
+		_ = os.RemoveAll(s.tmp)
+		_ = os.RemoveAll(s.old)
+	}
+}
+
+// cleanupStagingFrom removes staging dirs from index `from` onward (the volumes
+// not yet swapped when a swap failed).
+func cleanupStagingFrom(sv []staged, from int) {
+	for i := from; i < len(sv); i++ {
+		_ = os.RemoveAll(sv[i].tmp)
+		_ = os.RemoveAll(sv[i].old)
+	}
+}
+
+// cleanupOld removes the .old set-aside dirs after a successful (or data-
+// committed) restore to reclaim disk; the pre-restore snapshot is the durable
+// rollback target.
+func cleanupOld(done []swap) {
+	for _, s := range done {
+		_ = os.RemoveAll(s.old)
+	}
+}
+
+// checkDiskSpace verifies each target filesystem has room for the volumes that
+// will be staged on it (C5). Peak usage co-locates the live copy (renamed
+// aside, no new space) and the extracted copy (new space ≈ uncompressed size),
+// so the new allocation per filesystem is the sum of its volumes' extracted
+// sizes plus headroom. The estimate is a lower bound (see archiveUncompressedSize);
+// a mid-extract ENOSPC is still caught and rolled back.
+func checkDiskSpace(resolved []resolvedVol, perIndex map[int]int64) error {
+	needByParent := map[string]int64{}
+	for _, rv := range resolved {
+		needByParent[filepath.Dir(rv.LivePath)] += perIndex[rv.Index]
+	}
+	for parent, need := range needByParent {
+		probe := firstExistingAncestor(parent)
+		free, err := freeDiskBytes(probe)
+		if err != nil {
+			return fmt.Errorf("check disk space at %s: %w", probe, err)
+		}
+		if int64(free) < need+diskFreeHeadroomBytes {
+			return fmt.Errorf("insufficient disk space at %s: need ~%d bytes, have %d",
+				parent, need+diskFreeHeadroomBytes, free)
+		}
+	}
+	return nil
+}
+
+// firstExistingAncestor walks up p until it finds a path that exists, so the
+// free-space probe has a real filesystem to stat even when the volume dir (or
+// its parent) hasn't been created yet.
+func firstExistingAncestor(p string) string {
+	for {
+		if _, err := os.Stat(p); err == nil {
+			return p
+		}
+		parent := filepath.Dir(p)
+		if parent == p {
+			return p
+		}
+		p = parent
+	}
+}
@@ -0,0 +1,345 @@
+package volsnap
+
+import (
+	"archive/tar"
+	"context"
+	"errors"
+	"os"
+	"path/filepath"
+	"strings"
+	"sync"
+	"testing"
+
+	"github.com/alexei/tinyforge/internal/store"
+	"github.com/alexei/tinyforge/internal/volume"
+)
+
+// fakeLifecycle records the order of deploy-side calls and lets tests inject
+// failures, without a real deployer/docker.
+type fakeLifecycle struct {
+	mu          sync.Mutex
+	calls       []string
+	tag         string
+	stopErr     error
+	redeployErr error
+	redeployRef string
+}
+
+func (f *fakeLifecycle) rec(s string) {
+	f.mu.Lock()
+	f.calls = append(f.calls, s)
+	f.mu.Unlock()
+}
+func (f *fakeLifecycle) Lock(string) func() { f.rec("lock"); return func() { f.rec("unlock") } }
+func (f *fakeLifecycle) StopContainers(context.Context, string) (string, error) {
+	f.rec("stop")
+	return f.tag, f.stopErr
+}
+func (f *fakeLifecycle) Redeploy(_ context.Context, _ store.Workload, ref string) error {
+	f.rec("redeploy:" + ref)
+	f.redeployRef = ref
+	return f.redeployErr
+}
+func (f *fakeLifecycle) saw(s string) bool {
+	f.mu.Lock()
+	defer f.mu.Unlock()
+	for _, c := range f.calls {
+		if c == s {
+			return true
+		}
+	}
+	return false
+}
+
+func newRestoreEngine(t *testing.T) (*Engine, *store.Store, string) {
+	t.Helper()
+	st, err := store.New(":memory:")
+	if err != nil {
+		t.Fatalf("store: %v", err)
+	}
+	t.Cleanup(func() { st.Close() })
+	base := t.TempDir()
+	s, _ := st.GetSettings()
+	s.BaseVolumePath = base
+	if err := st.UpdateSettings(s); err != nil {
+		t.Fatalf("settings: %v", err)
+	}
+	eng, err := New(st, t.TempDir())
+	if err != nil {
+		t.Fatalf("engine: %v", err)
+	}
+	return eng, st, base
+}
+
+// seedImageWorkload creates an image workload with one project-scope volume and
+// returns it plus the resolved live host dir.
+func seedImageWorkload(t *testing.T, st *store.Store) (store.Workload, string) {
+	t.Helper()
+	w, err := st.CreateWorkload(store.Workload{
+		Name:         "data-app",
+		Kind:         "project",
+		SourceKind:   "image",
+		SourceConfig: `{"image":"reg/app","port":80,"volumes":[{"source":"data","target":"/data","scope":"project"}]}`,
+	})
+	if err != nil {
+		t.Fatalf("create workload: %v", err)
+	}
+	settings, _ := st.GetSettings()
+	live, err := volume.ResolveWorkloadPath(
+		store.WorkloadVolume{Source: "data", Target: "/data", Scope: "project"},
+		volume.ResolveWorkloadParams{BasePath: settings.BaseVolumePath, WorkloadID: w.ID, WorkloadName: w.Name},
+	)
+	if err != nil {
+		t.Fatalf("resolve: %v", err)
+	}
+	return w, live
+}
+
+func TestEngineRestore_HappyPath(t *testing.T) {
+	eng, st, _ := newRestoreEngine(t)
+	w, live := seedImageWorkload(t, st)
+	mkDirWith(t, live, "orig.txt", "ORIGINAL")
+
+	settings, _ := st.GetSettings()
+	snap, err := eng.Create(w, settings, "base")
+	if err != nil {
+		t.Fatalf("create snapshot: %v", err)
+	}
+
+	// Drift: the live dir now differs from the snapshot.
+	if err := os.WriteFile(filepath.Join(live, "orig.txt"), []byte("CHANGED"), 0o600); err != nil {
+		t.Fatal(err)
+	}
+	mkDirWith(t, live, "extra.txt", "NEW") // not in the snapshot
+
+	fake := &fakeLifecycle{tag: "v1.2.3"}
+	eng.SetLifecycle(fake)
+
+	// Uses the REAL eng.Create for the pre-restore capture — if Restore held
+	// e.mu this would deadlock (R1), failing the test instead of production.
+	if err := eng.Restore(context.Background(), snap.ID, w.ID); err != nil {
+		t.Fatalf("restore: %v", err)
+	}
+
+	if got := readIn(t, live, "orig.txt"); got != "ORIGINAL" {
+		t.Errorf("orig.txt = %q, want ORIGINAL (restored)", got)
+	}
+	if _, err := os.Stat(filepath.Join(live, "extra.txt")); !os.IsNotExist(err) {
+		t.Error("extra.txt should be gone — restore replaces the volume dir wholesale")
+	}
+	for _, want := range []string{"lock", "stop", "redeploy:v1.2.3", "unlock"} {
+		if !fake.saw(want) {
+			t.Errorf("expected lifecycle call %q; calls=%v", want, fake.calls)
+		}
+	}
+	if fake.redeployRef != "v1.2.3" {
+		t.Errorf("redeploy reference = %q, want the running tag v1.2.3", fake.redeployRef)
+	}
+	// A durable pre-restore snapshot was captured (base + pre-restore).
+	snaps, _ := eng.List(w.ID)
+	if len(snaps) != 2 {
+		t.Errorf("expected 2 snapshots (base + pre-restore), got %d", len(snaps))
+	}
+	// No journal left behind.
+	assertNoJournal(t, eng)
+}
+
+func TestEngineRestore_RedeployFailureKeepsRestoredData(t *testing.T) {
+	eng, st, _ := newRestoreEngine(t)
+	w, live := seedImageWorkload(t, st)
+	mkDirWith(t, live, "orig.txt", "ORIGINAL")
+	settings, _ := st.GetSettings()
+	snap, _ := eng.Create(w, settings, "base")
+	if err := os.WriteFile(filepath.Join(live, "orig.txt"), []byte("CHANGED"), 0o600); err != nil {
+		t.Fatal(err)
+	}
+
+	fake := &fakeLifecycle{tag: "v1", redeployErr: errors.New("boom")}
+	eng.SetLifecycle(fake)
+
+	err := eng.Restore(context.Background(), snap.ID, w.ID)
+	if err == nil || !strings.Contains(err.Error(), "redeploy") {
+		t.Fatalf("expected a redeploy error, got %v", err)
+	}
+	// Data is committed despite the redeploy failure — we must NOT roll it back.
+	if got := readIn(t, live, "orig.txt"); got != "ORIGINAL" {
+		t.Errorf("orig.txt = %q, want ORIGINAL (restore committed)", got)
+	}
+	assertNoJournal(t, eng)
+}
+
+func TestEngineRestore_PreflightFailDoesNotLockOrStop(t *testing.T) {
+	eng, st, _ := newRestoreEngine(t)
+	w, _ := seedImageWorkload(t, st)
+	// A snapshot whose manifest names an unsupported scope ⇒ pre-flight aborts.
+	bad, err := st.CreateVolumeSnapshot(store.VolumeSnapshot{
+		WorkloadID: w.ID, Filename: "bad.tar.gz",
+		Manifest: `[{"index":0,"target":"/x","scope":"named","source":"x"}]`,
+	})
+	if err != nil {
+		t.Fatalf("seed snapshot: %v", err)
+	}
+	fake := &fakeLifecycle{}
+	eng.SetLifecycle(fake)
+
+	if err := eng.Restore(context.Background(), bad.ID, w.ID); err == nil {
+		t.Fatal("expected pre-flight to abort on an unsupported scope")
+	}
+	if fake.saw("lock") || fake.saw("stop") {
+		t.Errorf("pre-flight abort must happen BEFORE lock/stop; calls=%v", fake.calls)
+	}
+}
+
+func TestEngineRestore_NilLifecycle(t *testing.T) {
+	eng, _, _ := newRestoreEngine(t)
+	if err := eng.Restore(context.Background(), "s", "w"); err == nil ||
+		!strings.Contains(err.Error(), "lifecycle") {
+		t.Fatalf("expected a lifecycle-not-configured error, got %v", err)
+	}
+}
+
+func TestEngineRestore_WrongWorkload(t *testing.T) {
+	eng, st, _ := newRestoreEngine(t)
+	w, live := seedImageWorkload(t, st)
+	mkDirWith(t, live, "f.txt", "x")
+	settings, _ := st.GetSettings()
+	snap, _ := eng.Create(w, settings, "base")
+	fake := &fakeLifecycle{}
+	eng.SetLifecycle(fake)
+
+	if err := eng.Restore(context.Background(), snap.ID, "some-other-workload"); err == nil {
+		t.Fatal("expected cross-workload restore to be rejected")
+	}
+	if fake.saw("lock") {
+		t.Error("must reject before taking the lock")
+	}
+}
+
+func TestEngineRestore_ExtractFailureAbortsAfterLock(t *testing.T) {
+	eng, st, _ := newRestoreEngine(t)
+	// The workload must CURRENTLY declare both targets so pre-flight passes and
+	// the failure happens during extraction (post-lock), not pre-flight.
+	w, err := st.CreateWorkload(store.Workload{
+		Name: "two-vol", Kind: "project", SourceKind: "image",
+		SourceConfig: `{"image":"x","port":80,"volumes":[` +
+			`{"source":"data","target":"/data","scope":"project"},` +
+			`{"source":"other","target":"/other","scope":"project"}]}`,
+	})
+	if err != nil {
+		t.Fatalf("create workload: %v", err)
+	}
+
+	// Hand-build a 2-volume archive where volume 1 carries a symlink entry the
+	// untrusted extractor rejects — forcing a post-lock extract failure after
+	// volume 0 has already been staged.
+	arc := buildTarGz(t, []tentry{
+		{name: "0/f.txt", typeflag: tar.TypeReg, body: "x"},
+		{name: "1/evil", typeflag: tar.TypeSymlink, linkname: "/etc/passwd"},
+	})
+	data, err := os.ReadFile(arc)
+	if err != nil {
+		t.Fatal(err)
+	}
+	fname := "extract-fail.tar.gz"
+	if err := os.WriteFile(filepath.Join(eng.snapDir, fname), data, 0o600); err != nil {
+		t.Fatal(err)
+	}
+	snap, err := st.CreateVolumeSnapshot(store.VolumeSnapshot{
+		WorkloadID: w.ID, Filename: fname,
+		Manifest: `[{"index":0,"target":"/data","scope":"project","source":"data"},` +
+			`{"index":1,"target":"/other","scope":"project","source":"other"}]`,
+	})
+	if err != nil {
+		t.Fatalf("seed snapshot: %v", err)
+	}
+
+	fake := &fakeLifecycle{tag: "v1"}
+	eng.SetLifecycle(fake)
+
+	if err := eng.Restore(context.Background(), snap.ID, w.ID); err == nil {
+		t.Fatal("expected extract failure to abort the restore")
+	}
+	// Post-lock abort: it stopped, then brought the app back (no swaps happened).
+	if !fake.saw("lock") || !fake.saw("stop") || !fake.saw("redeploy:v1") {
+		t.Errorf("expected lock+stop+redeploy after a post-lock abort; calls=%v", fake.calls)
+	}
+	// No staging or journal left behind.
+	assertNoJournal(t, eng)
+	entries, _ := os.ReadDir(eng.snapDir)
+	for _, e := range entries {
+		if strings.Contains(e.Name(), ".tf-restore-") {
+			t.Errorf("leftover staging dir: %s", e.Name())
+		}
+	}
+}
+
+func TestRecoverInterruptedRestores(t *testing.T) {
+	eng, _, _ := newRestoreEngine(t)
+	root := t.TempDir()
+
+	// A: swap completed — keep restored live, drop old.
+	liveA := filepath.Join(root, "A")
+	oldA := filepath.Join(root, ".A.old")
+	mkDirWith(t, liveA, "f", "RESTORED-A")
+	mkDirWith(t, oldA, "f", "ORIGINAL-A")
+	// B: not swapped, live present — keep original, drop tmp.
+	liveB := filepath.Join(root, "B")
+	tmpB := filepath.Join(root, ".B.tmp")
+	mkDirWith(t, liveB, "f", "ORIGINAL-B")
+	mkDirWith(t, tmpB, "f", "STAGED-B")
+	// C: crashed mid-rename — live missing, old present — revert from old.
+	liveC := filepath.Join(root, "C")
+	oldC := filepath.Join(root, ".C.old")
+	tmpC := filepath.Join(root, ".C.tmp")
+	mkDirWith(t, oldC, "f", "ORIGINAL-C")
+	mkDirWith(t, tmpC, "f", "STAGED-C")
+
+	jr := restoreJournal{SnapshotID: "snap", WorkloadID: "wl-recover", Volumes: []journalVolume{
+		{Live: liveA, Old: oldA, Swapped: true, HadOld: true},
+		{Live: liveB, Tmp: tmpB, Swapped: false},
+		{Live: liveC, Old: oldC, Tmp: tmpC, Swapped: false},
+	}}
+	if err := eng.writeJournal(jr); err != nil {
+		t.Fatalf("write journal: %v", err)
+	}
+
+	n, err := eng.RecoverInterruptedRestores()
+	if err != nil {
+		t.Fatalf("recover: %v", err)
+	}
+	if n != 1 {
+		t.Fatalf("recovered %d journals, want 1", n)
+	}
+	if got := readIn(t, liveA, "f"); got != "RESTORED-A" {
+		t.Errorf("A live = %q, want RESTORED-A (swap kept)", got)
+	}
+	if _, err := os.Stat(oldA); !os.IsNotExist(err) {
+		t.Error("A old should be removed")
+	}
+	if got := readIn(t, liveB, "f"); got != "ORIGINAL-B" {
+		t.Errorf("B live = %q, want ORIGINAL-B (untouched)", got)
+	}
+	if _, err := os.Stat(tmpB); !os.IsNotExist(err) {
+		t.Error("B tmp should be removed")
+	}
+	if got := readIn(t, liveC, "f"); got != "ORIGINAL-C" {
+		t.Errorf("C live = %q, want ORIGINAL-C (reverted from old)", got)
+	}
+	if _, err := os.Stat(tmpC); !os.IsNotExist(err) {
+		t.Error("C tmp should be removed")
+	}
+	assertNoJournal(t, eng)
+}
+
+func assertNoJournal(t *testing.T, eng *Engine) {
+	t.Helper()
+	entries, err := os.ReadDir(eng.snapDir)
+	if err != nil {
+		t.Fatal(err)
+	}
+	for _, e := range entries {
+		if strings.HasPrefix(e.Name(), "restore-") && strings.HasSuffix(e.Name(), ".json") {
+			t.Errorf("leftover restore journal: %s", e.Name())
+		}
+	}
+}
@@ -0,0 +1,270 @@
+package volsnap
+
+import (
+	"os"
+	"path/filepath"
+	"testing"
+
+	"github.com/alexei/tinyforge/internal/store"
+)
+
+func mkDirWith(t *testing.T, dir, fname, content string) {
+	t.Helper()
+	if err := os.MkdirAll(dir, 0o700); err != nil {
+		t.Fatal(err)
+	}
+	if err := os.WriteFile(filepath.Join(dir, fname), []byte(content), 0o600); err != nil {
+		t.Fatal(err)
+	}
+}
+
+func readIn(t *testing.T, dir, fname string) string {
+	t.Helper()
+	b, err := os.ReadFile(filepath.Join(dir, fname))
+	if err != nil {
+		t.Fatalf("read %s/%s: %v", dir, fname, err)
+	}
+	return string(b)
+}
+
+func TestSwapVolumeDir_ReplacesAndPreservesOld(t *testing.T) {
+	root := t.TempDir()
+	live := filepath.Join(root, "data")
+	tmp := filepath.Join(root, ".data.tmp")
+	old := filepath.Join(root, ".data.old")
+	mkDirWith(t, live, "f.txt", "ORIGINAL")
+	mkDirWith(t, tmp, "f.txt", "RESTORED")
+
+	hadOld, err := swapVolumeDir(live, tmp, old)
+	if err != nil {
+		t.Fatalf("swap: %v", err)
+	}
+	if !hadOld {
+		t.Error("hadOld should be true when a live dir existed")
+	}
+	if got := readIn(t, live, "f.txt"); got != "RESTORED" {
+		t.Errorf("live = %q, want RESTORED", got)
+	}
+	if got := readIn(t, old, "f.txt"); got != "ORIGINAL" {
+		t.Errorf("old = %q, want ORIGINAL (prior live preserved)", got)
+	}
+}
+
+func TestSwapVolumeDir_MissingLive(t *testing.T) {
+	root := t.TempDir()
+	live := filepath.Join(root, "data") // does not exist
+	tmp := filepath.Join(root, ".data.tmp")
+	old := filepath.Join(root, ".data.old")
+	mkDirWith(t, tmp, "f.txt", "RESTORED")
+
+	hadOld, err := swapVolumeDir(live, tmp, old)
+	if err != nil {
+		t.Fatalf("swap: %v", err)
+	}
+	if hadOld {
+		t.Error("hadOld should be false when no live dir existed")
+	}
+	if got := readIn(t, live, "f.txt"); got != "RESTORED" {
+		t.Errorf("live = %q, want RESTORED", got)
+	}
+}
+
+func TestSwapVolumeDir_RevertsOnSecondRenameFailure(t *testing.T) {
+	// The data-loss-critical path: the live→old rename succeeds, then tmp→live
+	// fails (here: tmp is absent). swapVolumeDir MUST self-revert old→live so
+	// the live dir is never left missing, and old must not be left dangling.
+	root := t.TempDir()
+	live := filepath.Join(root, "data")
+	old := filepath.Join(root, ".data.old")
+	mkDirWith(t, live, "f.txt", "ORIGINAL")
+
+	hadOld, err := swapVolumeDir(live, filepath.Join(root, ".data.tmp" /* absent */), old)
+	if err == nil {
+		t.Fatal("expected swap to fail when tmp is absent")
+	}
+	if got := readIn(t, live, "f.txt"); got != "ORIGINAL" {
+		t.Errorf("live = %q, want ORIGINAL restored by self-revert", got)
+	}
+	if _, statErr := os.Stat(old); !os.IsNotExist(statErr) {
+		t.Errorf("old dir should have been renamed back to live, not left dangling")
+	}
+	_ = hadOld
+}
+
+func TestStagingDirs_SameParentAsLive(t *testing.T) {
+	// R2 invariant: tmp and old must be siblings of the live dir's parent so
+	// every rename in the swap is intra-filesystem (atomic).
+	live := filepath.FromSlash("/srv/data/postgres")
+	tmp, old := stagingDirs(live, "tok", 3)
+	wantParent := filepath.Dir(live)
+	if filepath.Dir(tmp) != wantParent || filepath.Dir(old) != wantParent {
+		t.Errorf("staging dirs not siblings of live's parent: tmp=%s old=%s parent=%s", tmp, old, wantParent)
+	}
+	if tmp == old {
+		t.Error("tmp and old must be distinct paths")
+	}
+}
+
+func TestRollbackSwaps_RestoresOriginals(t *testing.T) {
+	root := t.TempDir()
+	var done []swap
+	for _, name := range []string{"vol0", "vol1"} {
+		live := filepath.Join(root, name)
+		tmp := filepath.Join(root, "."+name+".tmp")
+		old := filepath.Join(root, "."+name+".old")
+		mkDirWith(t, live, "f.txt", "ORIGINAL-"+name)
+		mkDirWith(t, tmp, "f.txt", "RESTORED-"+name)
+		hadOld, err := swapVolumeDir(live, tmp, old)
+		if err != nil {
+			t.Fatalf("swap %s: %v", name, err)
+		}
+		done = append(done, swap{live: live, old: old, tmp: tmp, hadOld: hadOld})
+	}
+	// Both are now RESTORED; rolling back must return both to ORIGINAL.
+	rollbackSwaps(done)
+	for _, name := range []string{"vol0", "vol1"} {
+		if got := readIn(t, filepath.Join(root, name), "f.txt"); got != "ORIGINAL-"+name {
+			t.Errorf("%s after rollback = %q, want ORIGINAL-%s", name, got, name)
+		}
+	}
+}
+
+func TestRollbackSwaps_PartialLeavesUnswappedIntact(t *testing.T) {
+	root := t.TempDir()
+	// vol0 swaps successfully; vol1 "fails" (its tmp is absent) so only vol0 is
+	// recorded in done. Rollback of vol0 must restore the original.
+	live0 := filepath.Join(root, "vol0")
+	tmp0 := filepath.Join(root, ".vol0.tmp")
+	old0 := filepath.Join(root, ".vol0.old")
+	mkDirWith(t, live0, "f.txt", "ORIGINAL-0")
+	mkDirWith(t, tmp0, "f.txt", "RESTORED-0")
+	hadOld, err := swapVolumeDir(live0, tmp0, old0)
+	if err != nil {
+		t.Fatalf("swap vol0: %v", err)
+	}
+
+	live1 := filepath.Join(root, "vol1")
+	mkDirWith(t, live1, "f.txt", "ORIGINAL-1")
+	if _, err := swapVolumeDir(live1, filepath.Join(root, ".vol1.tmp" /* absent */), filepath.Join(root, ".vol1.old")); err == nil {
+		t.Fatal("expected vol1 swap to fail (tmp absent)")
+	}
+
+	rollbackSwaps([]swap{{live: live0, old: old0, tmp: tmp0, hadOld: hadOld}})
+	if got := readIn(t, live0, "f.txt"); got != "ORIGINAL-0" {
+		t.Errorf("vol0 after rollback = %q, want ORIGINAL-0", got)
+	}
+	// vol1 was never swapped; its original must be untouched.
+	if got := readIn(t, live1, "f.txt"); got != "ORIGINAL-1" {
+		t.Errorf("vol1 = %q, want ORIGINAL-1 (never swapped)", got)
+	}
+}
+
+func TestPreflightResolve_AllOrNothing(t *testing.T) {
+	eng, st, base := newRestoreEngine(t)
+	_ = eng
+	w, err := st.CreateWorkload(store.Workload{
+		Name: "app", Kind: "project", SourceKind: "image",
+		SourceConfig: `{"image":"x","port":80,"volumes":[` +
+			`{"source":"data","target":"/data","scope":"project"},` +
+			`{"source":"var","target":"/var","scope":"stage"}]}`,
+	})
+	if err != nil {
+		t.Fatalf("create workload: %v", err)
+	}
+	settings, _ := st.GetSettings()
+
+	// A snapshotted target no longer declared by the workload ⇒ whole abort (C3).
+	if _, err := preflightResolve(st, w, settings, []SnapshotVolume{
+		{Index: 0, Target: "/data", Scope: "project", Source: "data"},
+		{Index: 1, Target: "/gone", Scope: "project", Source: "gone"},
+	}); err == nil {
+		t.Fatal("expected all-or-nothing abort when a target is no longer declared")
+	}
+
+	// All declared ⇒ resolves every volume under BaseVolumePath.
+	resolved, err := preflightResolve(st, w, settings, []SnapshotVolume{
+		{Index: 0, Target: "/data", Scope: "project", Source: "data"},
+		{Index: 1, Target: "/var", Scope: "stage", Source: "var"},
+	})
+	if err != nil {
+		t.Fatalf("preflight: %v", err)
+	}
+	if len(resolved) != 2 {
+		t.Fatalf("resolved %d volumes, want 2", len(resolved))
+	}
+	for _, rv := range resolved {
+		if ok, _ := pathWithinBase(base, rv.LivePath); !ok {
+			t.Errorf("volume %q resolved to %q, outside base %q", rv.Target, rv.LivePath, base)
+		}
+	}
+}
+
+// TestPreflightResolve_IgnoresManifestSource is the regression guard for the
+// security fix: a tampered manifest whose Source tries to escape (../../etc)
+// must NOT redirect the swap target — the host path is re-derived from the
+// workload's CURRENT trusted config (Source "data"), staying under the base.
+func TestPreflightResolve_IgnoresManifestSource(t *testing.T) {
+	_, st, base := newRestoreEngine(t)
+	w, err := st.CreateWorkload(store.Workload{
+		Name: "app", Kind: "project", SourceKind: "image",
+		SourceConfig: `{"image":"x","port":80,"volumes":[{"source":"data","target":"/data","scope":"project"}]}`,
+	})
+	if err != nil {
+		t.Fatalf("create workload: %v", err)
+	}
+	settings, _ := st.GetSettings()
+
+	resolved, err := preflightResolve(st, w, settings, []SnapshotVolume{
+		{Index: 0, Target: "/data", Scope: "project", Source: "../../../../etc"},
+	})
+	if err != nil {
+		t.Fatalf("preflight: %v", err)
+	}
+	if len(resolved) != 1 {
+		t.Fatalf("resolved %d, want 1", len(resolved))
+	}
+	if ok, _ := pathWithinBase(base, resolved[0].LivePath); !ok {
+		t.Errorf("manifest Source escaped containment: resolved %q outside base %q",
+			resolved[0].LivePath, base)
+	}
+	if filepath.Base(resolved[0].LivePath) != "data" {
+		t.Errorf("expected target derived from current config (data), got %q", resolved[0].LivePath)
+	}
+}
+
+func TestArchiveUncompressedSize(t *testing.T) {
+	root := t.TempDir()
+	mustWrite(t, filepath.Join(root, "a.txt"), "hello") // 5
+	if err := os.MkdirAll(filepath.Join(root, "sub"), 0o700); err != nil {
+		t.Fatal(err)
+	}
+	mustWrite(t, filepath.Join(root, "sub", "b.txt"), "world!") // 6
+	dest := filepath.Join(t.TempDir(), "snap.tar.gz")
+	if _, err := writeArchive(dest, []VolumeRef{{Target: "/d", Scope: "project", Source: "d", HostPath: root}}); err != nil {
+		t.Fatal(err)
+	}
+
+	per, total, err := archiveUncompressedSize(dest, maxRestoreUncompressedBytes)
+	if err != nil {
+		t.Fatalf("size: %v", err)
+	}
+	if total != 11 {
+		t.Errorf("total = %d, want 11", total)
+	}
+	if per[0] != 11 {
+		t.Errorf("perIndex[0] = %d, want 11", per[0])
+	}
+	if _, _, err := archiveUncompressedSize(dest, 4); err == nil {
+		t.Error("expected sizing to abort past the decompression cap")
+	}
+}
+
+func TestFreeDiskBytes(t *testing.T) {
+	n, err := freeDiskBytes(t.TempDir())
+	if err != nil {
+		t.Fatalf("freeDiskBytes: %v", err)
+	}
+	if n == 0 {
+		t.Error("expected non-zero free space on the temp filesystem")
+	}
+}
@@ -80,27 +80,10 @@ func SnapshotableVolumes(st *store.Store, w store.Workload, settings store.Setti
 		return nil, nil, nil
 	}

-	byTarget := map[string]store.WorkloadVolume{}
-
-	var cfg scVolumes
-	if w.SourceConfig != "" {
-		// Best-effort: a malformed config simply yields no inline volumes; the
-		// persisted rows below still apply.
-		_ = json.Unmarshal([]byte(w.SourceConfig), &cfg)
-	}
-	for _, v := range cfg.Volumes {
-		if v.Target == "" {
-			continue
-		}
-		byTarget[v.Target] = store.WorkloadVolume{Source: v.Source, Target: v.Target, Scope: v.Scope, Name: v.Name}
-	}
-	persisted, perr := st.ListWorkloadVolumes(w.ID)
+	byTarget, perr := volumesByTarget(st, w)
 	if perr != nil {
 		return nil, nil, perr
 	}
-	for _, p := range persisted {
-		byTarget[p.Target] = store.WorkloadVolume{Source: p.Source, Target: p.Target, Scope: p.Scope, Name: p.Name}
-	}

 	params := volume.ResolveWorkloadParams{
 		BasePath:           settings.BaseVolumePath,
@@ -132,6 +115,37 @@ func SnapshotableVolumes(st *store.Store, w store.Workload, settings store.Setti
 	return refs, skipped, nil
 }

+// volumesByTarget merges a workload's source_config inline volumes with its
+// persisted workload_volumes rows (persisted wins on a target conflict), keyed
+// by container target path. It is the authoritative current volume set, shared
+// by capture enumeration (SnapshotableVolumes) and restore pre-flight so both
+// resolve host paths the same way — restore must never trust a snapshot's
+// persisted manifest to name a host directory.
+func volumesByTarget(st *store.Store, w store.Workload) (map[string]store.WorkloadVolume, error) {
+	byTarget := map[string]store.WorkloadVolume{}
+
+	var cfg scVolumes
+	if w.SourceConfig != "" {
+		// Best-effort: a malformed config simply yields no inline volumes; the
+		// persisted rows below still apply.
+		_ = json.Unmarshal([]byte(w.SourceConfig), &cfg)
+	}
+	for _, v := range cfg.Volumes {
+		if v.Target == "" {
+			continue
+		}
+		byTarget[v.Target] = store.WorkloadVolume{Source: v.Source, Target: v.Target, Scope: v.Scope, Name: v.Name}
+	}
+	persisted, err := st.ListWorkloadVolumes(w.ID)
+	if err != nil {
+		return nil, err
+	}
+	for _, p := range persisted {
+		byTarget[p.Target] = store.WorkloadVolume{Source: p.Source, Target: p.Target, Scope: p.Scope, Name: p.Name}
+	}
+	return byTarget, nil
+}
+
 func skipReason(scope string) string {
 	switch scope {
 	case string(store.VolumeScopeInstance):