feat(volsnap): volume snapshot restore (backlog #6)

Restore a captured volume snapshot onto an image workload's live host-bind
data volumes, then redeploy — the most destructive workload action, built to
the adversarially-reviewed design (C1–C6) with all data-loss guards.

- Engine.Restore (engine-owned): all-or-nothing pre-flight re-resolution from
  the workload's CURRENT config (never the tamperable manifest), per-filesystem
  disk pre-check, per-workload lock, container quiesce, extract-to-tmp, durable
  pre-restore snapshot, write-ahead journal, atomic rename swap, redeploy, and
  crash-recovery sweep (RecoverInterruptedRestores) wired before serving.
- internal/keyedmutex: shared per-key lock; deployer now serializes every
  deploy entrypoint per workload via DispatchPlugin (+ LockWorkload/RedeployLocked
  for the restore re-dispatch, no deadlock).
- Untrusted-archive extractor: zip-slip containment, type allow-list (reg/dir
  only), decompression-bomb cap, manifest-index bounds.
- POST /api/workloads/{id}/snapshots/{sid}/restore: admin, X-Confirm-Restore
  header (CSRF), per-workload single-flight (409).
- WebUI: Restore button + danger ConfirmDialog + busy state + i18n (en/ru).

Scope: image-source workloads only; supported scopes are absolute/stage/project
(driven off the same supportedScopes constant that capture uses).

Plan-reviewed before coding; per-phase go/security/ts reviews; final review
READY TO MERGE. Security review caught + fixed a CRITICAL manifest-Source path
traversal (re-derive target from current config + base containment).

Plan: plans/volume-snapshot-restore/
This commit is contained in:
2026-06-22 17:23:52 +03:00
parent 8a5f69af87
commit 1c47030854
33 changed files with 2825 additions and 34 deletions
+48
View File
@@ -0,0 +1,48 @@
// Package keyedmutex provides a lazily-populated per-key mutex, so a critical
// section can be serialized per key (e.g. per workload id) without a global
// lock. It is the shared form of the pattern that originated inline in the
// GitOps sync handler; the deployer (per-workload deploy serialization) and the
// volume-snapshot restore single-flight both use it.
package keyedmutex
import "sync"
// Mutex is a collection of per-key locks that are created the first time a key
// is used. The zero value is ready to use, and a Mutex must not be copied once
// it has been used because it embeds a sync.Mutex by value.
//
// Entries are never removed: the internal map keeps one *sync.Mutex for every
// distinct key ever locked, which in practice is bounded by the number of
// distinct keys callers use (e.g. workload ids).
type Mutex struct {
	mu sync.Mutex             // guards m
	m  map[string]*sync.Mutex // key -> its lock; allocated lazily by get
}

// get returns the lock registered for key, creating and registering one when
// the key has not been seen before. The map is allocated here on demand so the
// zero Mutex works without a constructor.
func (km *Mutex) get(key string) *sync.Mutex {
	km.mu.Lock()
	defer km.mu.Unlock()

	// Reading a nil map is safe, so the lookup can come before the lazy make.
	if existing, found := km.m[key]; found {
		return existing
	}
	if km.m == nil {
		km.m = make(map[string]*sync.Mutex)
	}
	fresh := new(sync.Mutex)
	km.m[key] = fresh
	return fresh
}

// Lock waits until the lock for key is held by the caller and returns the
// function that releases it.
func (km *Mutex) Lock(key string) func() {
	held := km.get(key)
	held.Lock()
	return held.Unlock
}

// TryLock makes a single non-blocking attempt to take the lock for key. When
// the attempt succeeds it returns the release function and true. When the key
// is currently held it returns nil and false, letting the caller reject the
// request (e.g. HTTP 409) rather than wait in line.
func (km *Mutex) TryLock(key string) (func(), bool) {
	if candidate := km.get(key); candidate.TryLock() {
		return candidate.Unlock, true
	}
	return nil, false
}
+83
View File
@@ -0,0 +1,83 @@
package keyedmutex
import (
"sync"
"testing"
"time"
)
// TestLockSerializesSameKey checks that a second Lock on a key that is already
// held stays blocked until the first holder releases it.
func TestLockSerializesSameKey(t *testing.T) {
	var km Mutex
	release := km.Lock("a")

	// The contender closes got as soon as it owns the lock, then lets go of it.
	got := make(chan struct{})
	go func() {
		contenderRelease := km.Lock("a")
		defer contenderRelease()
		close(got)
	}()

	// While the first holder still owns the key, the contender must not get in.
	select {
	case <-time.After(50 * time.Millisecond):
		// expected: the contender is still waiting
	case <-got:
		t.Fatal("second Lock on the same key acquired while the first was held")
	}

	release()

	// After the release the contender should acquire promptly.
	select {
	case <-time.After(time.Second):
		t.Fatal("second Lock did not acquire after release")
	case <-got:
		// expected: the contender now holds (or has held) the lock
	}
}
// TestLockIndependentKeys checks that holding the lock for one key does not
// prevent another goroutine from locking a different key.
func TestLockIndependentKeys(t *testing.T) {
	var km Mutex

	// Acquire "a" now and hold it for the remainder of the test.
	defer km.Lock("a")()

	// Locking and releasing "b" must complete even though "a" is held.
	finished := make(chan struct{})
	go func() {
		km.Lock("b")()
		close(finished)
	}()

	select {
	case <-time.After(time.Second):
		t.Fatal("Lock on an independent key blocked")
	case <-finished:
	}
}
// TestTryLock walks TryLock through its three states for a single key: free,
// held, and free again after release.
func TestTryLock(t *testing.T) {
	var km Mutex

	// Free key: the attempt succeeds and hands back a release func.
	first, ok := km.TryLock("a")
	if !ok {
		t.Fatal("TryLock should succeed on a free key")
	}

	// Held key: a second attempt is refused without blocking.
	_, again := km.TryLock("a")
	if again {
		t.Fatal("TryLock should fail while the key is held")
	}

	first()

	// Released key: the lock can be taken once more.
	second, ok := km.TryLock("a")
	if !ok {
		t.Fatal("TryLock should succeed after release")
	}
	second()
}
// TestConcurrentLockNoRace has many goroutines increment a plain int while
// holding the same keyed lock. A final count below the number of goroutines
// would mean updates were lost, i.e. the lock did not serialize them.
func TestConcurrentLockNoRace(t *testing.T) {
	const workers = 50

	var (
		km    Mutex
		wg    sync.WaitGroup
		total int
	)

	wg.Add(workers)
	for w := 0; w < workers; w++ {
		go func() {
			defer wg.Done()
			release := km.Lock("shared")
			defer release() // runs before wg.Done, so the lock is freed first
			total++         // guarded by the "shared" keyed lock
		}()
	}
	wg.Wait()

	if total != workers {
		t.Errorf("counter = %d, want 50 (lost updates ⇒ lock not serializing)", total)
	}
}