feat(volsnap): volume snapshot restore (backlog #6)

Restore a captured volume snapshot onto an image workload's live host-bind data volumes, then redeploy — the most destructive workload action, built to the adversarially-reviewed design (C1–C6) with all data-loss guards.

- Engine.Restore (engine-owned): all-or-nothing pre-flight re-resolution from the workload's CURRENT config (never the tamperable manifest), per-filesystem disk pre-check, per-workload lock, container quiesce, extract-to-tmp, durable pre-restore snapshot, write-ahead journal, atomic rename swap, redeploy, and crash-recovery sweep (RecoverInterruptedRestores) wired before serving.
- internal/keyedmutex: shared per-key lock; deployer now serializes every deploy entrypoint per workload via DispatchPlugin (+ LockWorkload/RedeployLocked for the restore re-dispatch, no deadlock).
- Untrusted-archive extractor: zip-slip containment, type allow-list (reg/dir only), decompression-bomb cap, manifest-index bounds.
- POST /api/workloads/{id}/snapshots/{sid}/restore: admin, X-Confirm-Restore header (CSRF), per-workload single-flight (409).
- WebUI: Restore button + danger ConfirmDialog + busy state + i18n (en/ru).

Scope: image-source only; scopes absolute/stage/project (driven off the same supportedScopes constant capture uses).

Plan-reviewed before coding; per-phase go/security/ts reviews; final review READY TO MERGE. Security review caught + fixed a CRITICAL manifest-Source path traversal (re-derive target from current config + base containment).

Plan: plans/volume-snapshot-restore/
2026-06-22 17:23:52 +03:00
parent 8a5f69af87
commit 1c47030854
33 changed files with 2825 additions and 34 deletions
@@ -5,6 +5,7 @@
 package deployer

 import (
+	"context"
 	"fmt"
 	"log/slog"
 	"sync"
@@ -14,9 +15,11 @@ import (
 	"github.com/alexei/tinyforge/internal/docker"
 	"github.com/alexei/tinyforge/internal/events"
 	"github.com/alexei/tinyforge/internal/health"
+	"github.com/alexei/tinyforge/internal/keyedmutex"
 	"github.com/alexei/tinyforge/internal/notify"
 	"github.com/alexei/tinyforge/internal/proxy"
 	"github.com/alexei/tinyforge/internal/store"
+	"github.com/alexei/tinyforge/internal/workload/plugin"
 )

 // Deployer owns the dependency bundle each Source plugin needs at deploy
@@ -49,6 +52,29 @@ type Deployer struct {
 	drainMu      sync.Mutex
 	activeWg     sync.WaitGroup
 	shuttingDown atomic.Bool
+
+	// workloadLocks serializes deploy-class operations per workload id so two
+	// concurrent mutators of the same workload (a manual deploy, a webhook/
+	// trigger dispatch, a rollback, a promote, OR a volume-snapshot restore)
+	// can never interleave their container/volume changes. Every deploy
+	// entrypoint funnels through DispatchPlugin, so locking there gates them
+	// all at one choke point. Unlike activeWg, which is a global drain barrier
+	// for graceful shutdown, this lock is scoped to a single workload id.
+	workloadLocks keyedmutex.Mutex
+}
+
+// LockWorkload acquires the per-workload deploy lock for id on behalf of an
+// external critical section (volume-snapshot restore) and returns the func
+// that releases it. The restore flow holds the lock across stop→swap→redeploy
+// and must redeploy via RedeployLocked, which does NOT re-acquire it.
+func (d *Deployer) LockWorkload(id string) func() { return d.workloadLocks.Lock(id) }
+
+// RedeployLocked runs the deploy body (dispatchLocked) for w WITHOUT taking
+// the per-workload lock: the caller (restore) must already hold it via
+// LockWorkload. Going through DispatchPlugin instead would deadlock, because
+// that path locks w.ID again and the lock is not reentrant. Not for general use.
+func (d *Deployer) RedeployLocked(ctx context.Context, w plugin.Workload, intent plugin.DeploymentIntent) error {
+	return d.dispatchLocked(ctx, w, intent)
 }

 // EventPublisher is the interface for publishing events to the event bus.
@@ -15,6 +15,18 @@ import (
 // operator enables auto_backup_before_deploy, a pre-deploy Tinyforge DB
 // snapshot is taken here, after the source resolves and before it runs.
 func (d *Deployer) DispatchPlugin(ctx context.Context, w plugin.Workload, intent plugin.DeploymentIntent) error {
+	// C1: serialize all deploy-class work per workload. The lock is held for
+	// the entire dispatch (released by the deferred unlock), so a concurrent
+	// deploy/rollback/promote/trigger — or a volume restore, which holds the
+	// same lock and redeploys via RedeployLocked — cannot interleave with it.
+	unlock := d.workloadLocks.Lock(w.ID)
+	defer unlock()
+	return d.dispatchLocked(ctx, w, intent)
+}
+
+// dispatchLocked is DispatchPlugin's body, assuming the per-workload lock is
+// already held. RedeployLocked calls it directly during restore.
+func (d *Deployer) dispatchLocked(ctx context.Context, w plugin.Workload, intent plugin.DeploymentIntent) error {
 	if err := d.beginDispatch(); err != nil {
 		metrics.DeploysTotal.Inc(w.SourceKind, "rejected_draining")
 		return err
@@ -52,6 +64,12 @@ func (d *Deployer) DispatchPlugin(ctx context.Context, w plugin.Workload, intent
 // Used when a workload is deleted. Tracked via activeWg so Drain() honours
 // in-progress teardowns just like deploys.
 func (d *Deployer) DispatchTeardown(ctx context.Context, w plugin.Workload) error {
+	// Teardown mutates the same containers/routes a deploy does, so it takes the
+	// per-workload lock too (C1). Callers tear down distinct workload ids
+	// sequentially (e.g. preview children then parent), never nested, so no
+	// self-deadlock.
+	unlock := d.workloadLocks.Lock(w.ID)
+	defer unlock()
 	if err := d.beginDispatch(); err != nil {
 		return err
 	}