feat(volsnap): volume snapshot restore (backlog #6)

Restore a captured volume snapshot onto an image workload's live host-bind
data volumes, then redeploy — the most destructive workload action, built to
the adversarially-reviewed design (C1–C6) with all data-loss guards.

- Engine.Restore (engine-owned): all-or-nothing pre-flight re-resolution from
  the workload's CURRENT config (never the tamperable manifest), per-filesystem
  disk pre-check, per-workload lock, container quiesce, extract-to-tmp, durable
  pre-restore snapshot, write-ahead journal, atomic rename swap, redeploy, and
  crash-recovery sweep (RecoverInterruptedRestores) wired before serving.
- internal/keyedmutex: shared per-key lock; deployer now serializes every
  deploy entrypoint per workload via DispatchPlugin (+ LockWorkload/RedeployLocked
  for the restore re-dispatch, no deadlock).
- Untrusted-archive extractor: zip-slip containment, type allow-list (reg/dir
  only), decompression-bomb cap, manifest-index bounds.
- POST /api/workloads/{id}/snapshots/{sid}/restore: admin, X-Confirm-Restore
  header (CSRF), per-workload single-flight (409).
- WebUI: Restore button + danger ConfirmDialog + busy state + i18n (en/ru).

Scope: image-source workloads only; supported volume scopes are
absolute/stage/project (driven off the same supportedScopes constant that
capture uses).

Plan-reviewed before coding; per-phase go/security/ts reviews; final review
READY TO MERGE. Security review caught + fixed a CRITICAL manifest-Source path
traversal (re-derive target from current config + base containment).

Plan: plans/volume-snapshot-restore/
This commit is contained in:
2026-06-22 17:23:52 +03:00
parent 8a5f69af87
commit 1c47030854
33 changed files with 2825 additions and 34 deletions
+208 -1
View File
@@ -1,12 +1,15 @@
package api
import (
"context"
"io"
"net/http"
"net/http/httptest"
"os"
"path/filepath"
"sync"
"testing"
"time"
"github.com/alexei/tinyforge/internal/auth"
"github.com/alexei/tinyforge/internal/store"
@@ -53,7 +56,211 @@ func newSnapshotEnv(t *testing.T) (*apiTestEnv, string) {
t.Fatalf("update settings: %v", err)
}
return &apiTestEnv{srv: httpsrv, store: st, dispatcher: dispatcher, adminToken: tok.Token, encKey: encKey}, baseVol
return &apiTestEnv{srv: httpsrv, store: st, dispatcher: dispatcher, adminToken: tok.Token, encKey: encKey, snapEngine: snapEng}, baseVol
}
// doRestore POSTs to the restore endpoint of snapshot sid under workloadID,
// authenticating with the environment's admin bearer token. A non-empty
// confirm is sent as the X-Confirm-Restore header; an empty confirm leaves the
// header off entirely. Request construction or transport failures abort the
// test. The caller owns the returned response and must close its body.
func (e *apiTestEnv) doRestore(t *testing.T, workloadID, sid, confirm string) *http.Response {
	t.Helper()

	endpoint := e.srv.URL + "/api/workloads/" + workloadID + "/snapshots/" + sid + "/restore"
	req, reqErr := http.NewRequest(http.MethodPost, endpoint, nil)
	if reqErr != nil {
		t.Fatalf("new request: %v", reqErr)
	}

	hdr := req.Header
	hdr.Set("Authorization", "Bearer "+e.adminToken)
	if len(confirm) > 0 {
		hdr.Set("X-Confirm-Restore", confirm)
	}

	resp, doErr := http.DefaultClient.Do(req)
	if doErr != nil {
		t.Fatalf("do request: %v", doErr)
	}
	return resp
}
// okLifecycle is a stub volsnap.Lifecycle whose every operation succeeds
// without doing any work. It serves the HTTP-layer happy-path tests; the deep
// restore behavior is exercised by the volsnap engine tests instead.
type okLifecycle struct {
	tag string // value reported by StopContainers
}

// Lock takes no lock and hands back an unlock function that does nothing.
func (ok *okLifecycle) Lock(string) func() {
	noop := func() {}
	return noop
}

// StopContainers stops nothing and returns the configured tag with a nil error.
func (ok *okLifecycle) StopContainers(context.Context, string) (string, error) {
	return ok.tag, nil
}

// Redeploy does nothing and always reports success.
func (ok *okLifecycle) Redeploy(context.Context, store.Workload, string) error {
	return nil
}
// TestRestoreSnapshot_RequiresConfirmHeader verifies that a restore request is
// answered with 400 both when the X-Confirm-Restore header is omitted and when
// it carries a value other than the snapshot id.
func TestRestoreSnapshot_RequiresConfirmHeader(t *testing.T) {
	e, _ := newSnapshotEnv(t)

	// Fixture failures must abort the test: with empty ids the requests below
	// would hit a different URL and could return 400 for an unrelated reason.
	w, err := e.store.CreateWorkload(store.Workload{Name: "a", Kind: "project", SourceKind: "image", SourceConfig: `{"image":"x","port":80}`})
	if err != nil {
		t.Fatalf("create workload: %v", err)
	}
	snap, err := e.store.CreateVolumeSnapshot(store.VolumeSnapshot{WorkloadID: w.ID, Filename: "f.tar.gz", Manifest: "[]"})
	if err != nil {
		t.Fatalf("create snapshot: %v", err)
	}

	cases := []struct {
		name    string
		confirm string // "" omits the header
	}{
		{name: "missing header", confirm: ""},
		{name: "mismatched header", confirm: "not-the-sid"},
	}
	for _, tc := range cases {
		resp := e.doRestore(t, w.ID, snap.ID, tc.confirm)
		// Capture the status and close the body before asserting so the body
		// is released even when the assertion aborts the test.
		status := resp.StatusCode
		resp.Body.Close()
		if status != http.StatusBadRequest {
			t.Fatalf("%s status = %d, want 400", tc.name, status)
		}
	}
}
// TestRestoreSnapshot_WrongWorkload verifies that restoring a snapshot through
// a workload id other than the one it was created for is rejected with 400,
// even when the confirm header matches the snapshot id.
func TestRestoreSnapshot_WrongWorkload(t *testing.T) {
	e, _ := newSnapshotEnv(t)

	// Fixture failures must abort the test: an empty snapshot id would also
	// drop the confirm header, so the 400 could come from the header check
	// rather than from the cross-workload check under test.
	w, err := e.store.CreateWorkload(store.Workload{Name: "a", Kind: "project", SourceKind: "image", SourceConfig: `{"image":"x","port":80}`})
	if err != nil {
		t.Fatalf("create workload: %v", err)
	}
	snap, err := e.store.CreateVolumeSnapshot(store.VolumeSnapshot{WorkloadID: w.ID, Filename: "f.tar.gz", Manifest: "[]"})
	if err != nil {
		t.Fatalf("create snapshot: %v", err)
	}

	resp := e.doRestore(t, "some-other-workload", snap.ID, snap.ID)
	// Close the body before asserting so it is released on failure too.
	status := resp.StatusCode
	resp.Body.Close()
	if status != http.StatusBadRequest {
		t.Fatalf("cross-workload restore status = %d, want 400", status)
	}
}
// TestRestoreSnapshot_NonImageWorkload verifies that a restore against a
// workload whose SourceKind is "static" (not "image") is rejected with 400.
func TestRestoreSnapshot_NonImageWorkload(t *testing.T) {
	e, _ := newSnapshotEnv(t)

	// Fixture failures must abort the test: with empty ids the request would
	// not reach the source-kind check this test is meant to exercise.
	w, err := e.store.CreateWorkload(store.Workload{Name: "site", Kind: "project", SourceKind: "static", SourceConfig: `{}`})
	if err != nil {
		t.Fatalf("create workload: %v", err)
	}
	snap, err := e.store.CreateVolumeSnapshot(store.VolumeSnapshot{WorkloadID: w.ID, Filename: "f.tar.gz", Manifest: "[]"})
	if err != nil {
		t.Fatalf("create snapshot: %v", err)
	}

	resp := e.doRestore(t, w.ID, snap.ID, snap.ID)
	// Close the body before asserting so it is released on failure too.
	status := resp.StatusCode
	resp.Body.Close()
	if status != http.StatusBadRequest {
		t.Fatalf("non-image restore status = %d, want 400", status)
	}
}
// TestRestoreSnapshot_NotFound verifies that restoring a snapshot id that was
// never created is answered with 404.
func TestRestoreSnapshot_NotFound(t *testing.T) {
	e, _ := newSnapshotEnv(t)

	// The workload must really exist so the 404 is attributable to the
	// missing snapshot and not to a missing workload.
	w, err := e.store.CreateWorkload(store.Workload{Name: "a", Kind: "project", SourceKind: "image", SourceConfig: `{"image":"x","port":80}`})
	if err != nil {
		t.Fatalf("create workload: %v", err)
	}

	resp := e.doRestore(t, w.ID, "missing-sid", "missing-sid")
	// Close the body before asserting so it is released on failure too.
	status := resp.StatusCode
	resp.Body.Close()
	if status != http.StatusNotFound {
		t.Fatalf("unknown snapshot status = %d, want 404", status)
	}
}
// TestRestoreSnapshot_HappyPath captures a snapshot of a project-scoped volume,
// changes the live file afterwards, restores the snapshot over HTTP, and checks
// that the request returns 200 and the file holds its captured content again.
// Container lifecycle operations are stubbed with okLifecycle.
func TestRestoreSnapshot_HappyPath(t *testing.T) {
	e, baseVol := newSnapshotEnv(t)
	e.snapEngine.SetLifecycle(&okLifecycle{tag: "v1"})

	w, err := e.store.CreateWorkload(store.Workload{
		Name: "data-app", Kind: "project", SourceKind: "image",
		SourceConfig: `{"image":"reg/app","port":80,"volumes":[{"source":"data","target":"/data","scope":"project"}]}`,
	})
	if err != nil {
		t.Fatalf("create workload: %v", err)
	}
	if _, err := e.store.SetWorkloadVolume(store.WorkloadVolume{WorkloadID: w.ID, Target: "/data", Source: "data", Scope: "project"}); err != nil {
		t.Fatalf("set volume: %v", err)
	}

	// The host directory name is built from the workload name plus the first
	// eight characters of its id (or the whole id when shorter).
	id8 := w.ID
	if len(id8) > 8 {
		id8 = id8[:8]
	}
	hostDir := filepath.Join(baseVol, "data-app-"+id8, "data")
	payload := filepath.Join(hostDir, "payload.txt")
	if err := os.MkdirAll(hostDir, 0o755); err != nil {
		t.Fatal(err)
	}
	if err := os.WriteFile(payload, []byte("ORIGINAL"), 0o644); err != nil {
		t.Fatal(err)
	}

	settings, err := e.store.GetSettings()
	if err != nil {
		t.Fatalf("get settings: %v", err)
	}
	snap, err := e.snapEngine.Create(w, settings, "base")
	if err != nil {
		t.Fatalf("create snapshot: %v", err)
	}

	// Drift the live data, then restore.
	if err := os.WriteFile(payload, []byte("CHANGED"), 0o644); err != nil {
		t.Fatal(err)
	}

	resp := e.doRestore(t, w.ID, snap.ID, snap.ID)
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		// Best-effort read: the body is only diagnostic context for the
		// failure message, so a read error is deliberately ignored.
		body, _ := io.ReadAll(resp.Body)
		t.Fatalf("restore status = %d, body=%s", resp.StatusCode, body)
	}

	got, err := os.ReadFile(payload)
	if err != nil {
		// Report an unreadable file as such instead of as an empty payload.
		t.Fatalf("read restored payload: %v", err)
	}
	if string(got) != "ORIGINAL" {
		t.Errorf("payload.txt = %q, want ORIGINAL (restored)", got)
	}
}
// blockingLifecycle is a lifecycle stub whose Lock parks the caller until the
// test closes release. The first Lock call closes entered, letting a test keep
// one restore in flight and assert that a second one is rejected with 409.
type blockingLifecycle struct {
	entered chan struct{} // closed the first time Lock is called
	release chan struct{} // Lock returns only after this is closed
	once    sync.Once     // ensures entered is closed at most once
}

// Lock announces entry (once), waits for release, and returns a no-op unlock.
func (b *blockingLifecycle) Lock(string) func() {
	announce := func() { close(b.entered) }
	b.once.Do(announce)

	<-b.release

	unlock := func() {}
	return unlock
}

// StopContainers does nothing and returns an empty tag with a nil error.
func (b *blockingLifecycle) StopContainers(context.Context, string) (string, error) {
	return "", nil
}

// Redeploy does nothing and always reports success.
func (b *blockingLifecycle) Redeploy(context.Context, store.Workload, string) error {
	return nil
}
// seedRestorable creates an image workload named "sf-app" with one
// project-scoped volume, writes a small file into the volume's host directory
// under baseVol, and captures a snapshot of it through the snapshot engine.
// It returns the workload id and the snapshot id. Any setup failure aborts the
// test.
func seedRestorable(t *testing.T, e *apiTestEnv, baseVol string) (workloadID, snapshotID string) {
	t.Helper()

	w, err := e.store.CreateWorkload(store.Workload{
		Name: "sf-app", Kind: "project", SourceKind: "image",
		SourceConfig: `{"image":"reg/app","port":80,"volumes":[{"source":"data","target":"/data","scope":"project"}]}`,
	})
	if err != nil {
		t.Fatalf("create workload: %v", err)
	}

	// The host directory name is built from the workload name plus the first
	// eight characters of its id (or the whole id when shorter).
	id8 := w.ID
	if len(id8) > 8 {
		id8 = id8[:8]
	}
	hostDir := filepath.Join(baseVol, "sf-app-"+id8, "data")
	if err := os.MkdirAll(hostDir, 0o755); err != nil {
		t.Fatal(err)
	}
	if err := os.WriteFile(filepath.Join(hostDir, "f.txt"), []byte("data"), 0o644); err != nil {
		t.Fatal(err)
	}

	// Fail on a settings read error instead of snapshotting with whatever
	// value the store returned alongside it.
	settings, err := e.store.GetSettings()
	if err != nil {
		t.Fatalf("get settings: %v", err)
	}
	snap, err := e.snapEngine.Create(w, settings, "base")
	if err != nil {
		t.Fatalf("create snapshot: %v", err)
	}
	return w.ID, snap.ID
}
// TestRestoreSnapshot_SingleFlight409 holds one restore in flight (blocked in
// the lifecycle's Lock) and verifies that a second restore for the same
// workload is rejected with 409 while the first is still running.
func TestRestoreSnapshot_SingleFlight409(t *testing.T) {
	e, baseVol := newSnapshotEnv(t)
	wid, sid := seedRestorable(t, e, baseVol)

	bl := &blockingLifecycle{entered: make(chan struct{}), release: make(chan struct{})}
	e.snapEngine.SetLifecycle(bl)

	// Restore #1: passes validation, takes the single-flight, then blocks inside
	// the engine's Lock. The request is built inline rather than via doRestore
	// because doRestore calls t.Fatalf, which must only run on the test
	// goroutine; failures here are reported with t.Errorf instead.
	firstDone := make(chan struct{})
	go func() {
		defer close(firstDone)
		req, err := http.NewRequest(http.MethodPost,
			e.srv.URL+"/api/workloads/"+wid+"/snapshots/"+sid+"/restore", nil)
		if err != nil {
			t.Errorf("restore #1: new request: %v", err)
			return
		}
		req.Header.Set("Authorization", "Bearer "+e.adminToken)
		req.Header.Set("X-Confirm-Restore", sid)
		resp, err := http.DefaultClient.Do(req)
		if err != nil {
			t.Errorf("restore #1: do request: %v", err)
			return
		}
		resp.Body.Close()
	}()
	// On every exit path (including t.Fatal), unblock #1 and wait for its
	// goroutine so it never touches t after the test has finished and never
	// leaves a request parked inside the server.
	defer func() {
		close(bl.release)
		<-firstDone
	}()

	select {
	case <-bl.entered:
	case <-time.After(3 * time.Second):
		t.Fatal("first restore never reached the lifecycle lock")
	}

	// Restore #2 for the same workload must be rejected fast with 409.
	resp := e.doRestore(t, wid, sid, sid)
	got := resp.StatusCode
	resp.Body.Close()
	if got != http.StatusConflict {
		t.Fatalf("concurrent restore status = %d, want 409", got)
	}
}
func TestVolumeSnapshots_EndToEnd(t *testing.T) {