feat(volsnap): volume snapshot restore (backlog #6)

Restore a captured volume snapshot onto an image workload's live host-bind data volumes, then redeploy — the most destructive workload action, built to the adversarially-reviewed design (C1–C6) with all data-loss guards.

- Engine.Restore (engine-owned): all-or-nothing pre-flight re-resolution from the workload's CURRENT config (never the tamperable manifest), per-filesystem disk pre-check, per-workload lock, container quiesce, extract-to-tmp, durable pre-restore snapshot, write-ahead journal, atomic rename swap, redeploy, and crash-recovery sweep (RecoverInterruptedRestores) wired before serving.
- internal/keyedmutex: shared per-key lock; deployer now serializes every deploy entrypoint per workload via DispatchPlugin (+ LockWorkload/RedeployLocked for the restore re-dispatch, no deadlock).
- Untrusted-archive extractor: zip-slip containment, type allow-list (reg/dir only), decompression-bomb cap, manifest-index bounds.
- POST /api/workloads/{id}/snapshots/{sid}/restore: admin, X-Confirm-Restore header (CSRF), per-workload single-flight (409).
- WebUI: Restore button + danger ConfirmDialog + busy state + i18n (en/ru).

Scope: image-source only; scopes absolute/stage/project (driven off the same supportedScopes constant capture uses).

Plan-reviewed before coding; per-phase go/security/ts reviews; final review READY TO MERGE. Security review caught and fixed a CRITICAL manifest-Source path traversal (re-derive target from current config + base containment).

Plan: plans/volume-snapshot-restore/
2026-06-22 17:23:52 +03:00
parent 8a5f69af87
commit 1c47030854
33 changed files with 2825 additions and 34 deletions
@@ -0,0 +1,48 @@
+// Package keyedmutex provides a lazily-populated per-key mutex, so a critical
+// section can be serialized per key (e.g. per workload id) without a global
+// lock. It is the shared form of the pattern that originated inline in the
+// GitOps sync handler; the deployer (per-workload deploy serialization) and the
+// volume-snapshot restore single-flight both use it.
+package keyedmutex
+
+import "sync"
+
+// Mutex hands out one *sync.Mutex per key on demand. The zero value is ready to
+// use. The internal map only grows (one entry per distinct key ever locked),
+// which is bounded in practice by the number of workloads. Because it holds a
+// sync.Mutex by value, a Mutex must not be copied after first use.
+type Mutex struct {
+	mu sync.Mutex             // guards m
+	m  map[string]*sync.Mutex // per-key mutexes; nil until the first get call
+}
+
+// get returns the mutex registered for key, creating and registering one the
+// first time the key is seen. The registry is allocated lazily so the zero
+// Mutex works, and it is only touched while k.mu is held. The returned mutex
+// is NOT locked by get; callers lock it themselves.
+func (k *Mutex) get(key string) *sync.Mutex {
+	k.mu.Lock()
+	defer k.mu.Unlock()
+
+	// Reading from a nil map is safe, so the lookup can come first.
+	if existing, found := k.m[key]; found {
+		return existing
+	}
+	if k.m == nil {
+		k.m = map[string]*sync.Mutex{}
+	}
+	fresh := new(sync.Mutex)
+	k.m[key] = fresh
+	return fresh
+}
+
+// Lock blocks until the mutex for key is acquired, then returns its unlock func.
+//
+// The returned func is idempotent: only its first call releases the lock. A
+// sync.Mutex is not tied to the goroutine that locked it, so handing out the
+// raw Unlock method would let a stray second call either crash the process
+// (unlock of an unlocked mutex) or silently release the lock after another
+// caller has acquired the same key.
+func (k *Mutex) Lock(key string) func() {
+	mu := k.get(key)
+	mu.Lock()
+	var once sync.Once
+	return func() { once.Do(mu.Unlock) }
+}
+
+// TryLock attempts to acquire the mutex for key without blocking. On success it
+// returns the unlock func and true; if the key is already locked it returns nil
+// and false so the caller can reject (e.g. HTTP 409) instead of queuing.
+//
+// The returned unlock func is idempotent: only its first call releases the
+// lock, so a stray second call can neither crash the process nor release the
+// lock out from under a caller that has acquired the key in the meantime.
+func (k *Mutex) TryLock(key string) (func(), bool) {
+	mu := k.get(key)
+	if !mu.TryLock() {
+		return nil, false
+	}
+	var once sync.Once
+	return func() { once.Do(mu.Unlock) }, true
+}
@@ -0,0 +1,83 @@
+package keyedmutex
+
+import (
+	"sync"
+	"testing"
+	"time"
+)
+
+// TestLockSerializesSameKey checks that a second Lock on a key that is already
+// held stays blocked until the first holder releases it.
+func TestLockSerializesSameKey(t *testing.T) {
+	var km Mutex
+	release := km.Lock("a")
+
+	gotLock := make(chan struct{})
+	go func() {
+		releaseSecond := km.Lock("a")
+		defer releaseSecond()
+		close(gotLock)
+	}()
+
+	// While the first lock is held, the contender must not get through.
+	select {
+	case <-time.After(50 * time.Millisecond):
+		// expected: blocked
+	case <-gotLock:
+		t.Fatal("second Lock on the same key acquired while the first was held")
+	}
+
+	// Once released, the contender must acquire promptly.
+	release()
+	select {
+	case <-time.After(time.Second):
+		t.Fatal("second Lock did not acquire after release")
+	case <-gotLock:
+		// expected: now acquired
+	}
+}
+
+// TestLockIndependentKeys checks that holding one key does not block a Lock
+// on a different key.
+func TestLockIndependentKeys(t *testing.T) {
+	var km Mutex
+	releaseA := km.Lock("a")
+	defer releaseA()
+
+	// A different key must not block.
+	finished := make(chan struct{})
+	go func() {
+		releaseB := km.Lock("b")
+		releaseB()
+		close(finished)
+	}()
+
+	select {
+	case <-time.After(time.Second):
+		t.Fatal("Lock on an independent key blocked")
+	case <-finished:
+	}
+}
+
+// TestTryLock walks one key through free -> held -> free and checks that
+// TryLock succeeds, fails, and succeeds again accordingly.
+func TestTryLock(t *testing.T) {
+	var km Mutex
+
+	release, acquired := km.TryLock("a")
+	if !acquired {
+		t.Fatal("TryLock should succeed on a free key")
+	}
+
+	_, acquired = km.TryLock("a")
+	if acquired {
+		t.Fatal("TryLock should fail while the key is held")
+	}
+
+	release()
+	release, acquired = km.TryLock("a")
+	if !acquired {
+		t.Fatal("TryLock should succeed after release")
+	}
+	release()
+}
+
+// TestConcurrentLockNoRace has many goroutines increment a plain int under the
+// same key; any lost update means the keyed lock failed to serialize them.
+func TestConcurrentLockNoRace(t *testing.T) {
+	const workers = 50
+	var (
+		km    Mutex
+		wg    sync.WaitGroup
+		total int
+	)
+
+	wg.Add(workers)
+	for i := 0; i < workers; i++ {
+		go func() {
+			defer wg.Done()
+			release := km.Lock("shared")
+			defer release()
+			total++ // protected by the keyed lock
+		}()
+	}
+	wg.Wait()
+
+	if total != workers {
+		t.Errorf("counter = %d, want 50 (lost updates ⇒ lock not serializing)", total)
+	}
+}