feat(volsnap): volume snapshot restore (backlog #6)
Restore a captured volume snapshot onto an image workload's live host-bind
data volumes, then redeploy — the most destructive workload action, built to
the adversarially-reviewed design (C1–C6) with all data-loss guards.
- Engine.Restore (engine-owned): all-or-nothing pre-flight re-resolution from
the workload's CURRENT config (never the tamperable manifest), per-filesystem
disk pre-check, per-workload lock, container quiesce, extract-to-tmp, durable
pre-restore snapshot, write-ahead journal, atomic rename swap, redeploy, and
crash-recovery sweep (RecoverInterruptedRestores) wired before serving.
- internal/keyedmutex: shared per-key lock; deployer now serializes every
deploy entrypoint per workload via DispatchPlugin (+ LockWorkload/RedeployLocked
for the restore re-dispatch, no deadlock).
- Untrusted-archive extractor: zip-slip containment, type allow-list (reg/dir
only), decompression-bomb cap, manifest-index bounds.
- POST /api/workloads/{id}/snapshots/{sid}/restore: admin, X-Confirm-Restore
header (CSRF), per-workload single-flight (409).
- WebUI: Restore button + danger ConfirmDialog + busy state + i18n (en/ru).
Scope: image-source workloads only; supported volume scopes are
absolute/stage/project (driven off the same supportedScopes constant that capture uses).
Plan-reviewed before coding; per-phase go/security/ts reviews; final review
READY TO MERGE. Security review caught + fixed a CRITICAL manifest-Source path
traversal (re-derive target from current config + base containment).
Plan: plans/volume-snapshot-restore/
This commit is contained in:
@@ -1,12 +1,15 @@
|
||||
package api
|
||||
|
||||
import (
|
||||
"context"
|
||||
"io"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"sync"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/alexei/tinyforge/internal/auth"
|
||||
"github.com/alexei/tinyforge/internal/store"
|
||||
@@ -53,7 +56,211 @@ func newSnapshotEnv(t *testing.T) (*apiTestEnv, string) {
|
||||
t.Fatalf("update settings: %v", err)
|
||||
}
|
||||
|
||||
return &apiTestEnv{srv: httpsrv, store: st, dispatcher: dispatcher, adminToken: tok.Token, encKey: encKey}, baseVol
|
||||
return &apiTestEnv{srv: httpsrv, store: st, dispatcher: dispatcher, adminToken: tok.Token, encKey: encKey, snapEngine: snapEng}, baseVol
|
||||
}
|
||||
|
||||
// doRestore issues an authenticated restore POST, optionally setting the
|
||||
// X-Confirm-Restore header (pass confirm="" to omit it).
|
||||
func (e *apiTestEnv) doRestore(t *testing.T, workloadID, sid, confirm string) *http.Response {
|
||||
t.Helper()
|
||||
req, err := http.NewRequest(http.MethodPost,
|
||||
e.srv.URL+"/api/workloads/"+workloadID+"/snapshots/"+sid+"/restore", nil)
|
||||
if err != nil {
|
||||
t.Fatalf("new request: %v", err)
|
||||
}
|
||||
req.Header.Set("Authorization", "Bearer "+e.adminToken)
|
||||
if confirm != "" {
|
||||
req.Header.Set("X-Confirm-Restore", confirm)
|
||||
}
|
||||
resp, err := http.DefaultClient.Do(req)
|
||||
if err != nil {
|
||||
t.Fatalf("do request: %v", err)
|
||||
}
|
||||
return resp
|
||||
}
|
||||
|
||||
// okLifecycle is a no-op volsnap.Lifecycle for HTTP-layer happy-path tests; the
|
||||
// deep restore behavior is covered by the volsnap engine tests.
|
||||
type okLifecycle struct{ tag string }
|
||||
|
||||
func (l *okLifecycle) Lock(string) func() { return func() {} }
|
||||
func (l *okLifecycle) StopContainers(context.Context, string) (string, error) { return l.tag, nil }
|
||||
func (l *okLifecycle) Redeploy(context.Context, store.Workload, string) error { return nil }
|
||||
|
||||
func TestRestoreSnapshot_RequiresConfirmHeader(t *testing.T) {
|
||||
e, _ := newSnapshotEnv(t)
|
||||
w, _ := e.store.CreateWorkload(store.Workload{Name: "a", Kind: "project", SourceKind: "image", SourceConfig: `{"image":"x","port":80}`})
|
||||
snap, _ := e.store.CreateVolumeSnapshot(store.VolumeSnapshot{WorkloadID: w.ID, Filename: "f.tar.gz", Manifest: "[]"})
|
||||
|
||||
// Missing header → 400.
|
||||
resp := e.doRestore(t, w.ID, snap.ID, "")
|
||||
if resp.StatusCode != http.StatusBadRequest {
|
||||
t.Fatalf("missing header status = %d, want 400", resp.StatusCode)
|
||||
}
|
||||
resp.Body.Close()
|
||||
// Mismatched header → 400.
|
||||
resp = e.doRestore(t, w.ID, snap.ID, "not-the-sid")
|
||||
if resp.StatusCode != http.StatusBadRequest {
|
||||
t.Fatalf("mismatched header status = %d, want 400", resp.StatusCode)
|
||||
}
|
||||
resp.Body.Close()
|
||||
}
|
||||
|
||||
func TestRestoreSnapshot_WrongWorkload(t *testing.T) {
|
||||
e, _ := newSnapshotEnv(t)
|
||||
w, _ := e.store.CreateWorkload(store.Workload{Name: "a", Kind: "project", SourceKind: "image", SourceConfig: `{"image":"x","port":80}`})
|
||||
snap, _ := e.store.CreateVolumeSnapshot(store.VolumeSnapshot{WorkloadID: w.ID, Filename: "f.tar.gz", Manifest: "[]"})
|
||||
|
||||
resp := e.doRestore(t, "some-other-workload", snap.ID, snap.ID)
|
||||
if resp.StatusCode != http.StatusBadRequest {
|
||||
t.Fatalf("cross-workload restore status = %d, want 400", resp.StatusCode)
|
||||
}
|
||||
resp.Body.Close()
|
||||
}
|
||||
|
||||
func TestRestoreSnapshot_NonImageWorkload(t *testing.T) {
|
||||
e, _ := newSnapshotEnv(t)
|
||||
w, _ := e.store.CreateWorkload(store.Workload{Name: "site", Kind: "project", SourceKind: "static", SourceConfig: `{}`})
|
||||
snap, _ := e.store.CreateVolumeSnapshot(store.VolumeSnapshot{WorkloadID: w.ID, Filename: "f.tar.gz", Manifest: "[]"})
|
||||
|
||||
resp := e.doRestore(t, w.ID, snap.ID, snap.ID)
|
||||
if resp.StatusCode != http.StatusBadRequest {
|
||||
t.Fatalf("non-image restore status = %d, want 400", resp.StatusCode)
|
||||
}
|
||||
resp.Body.Close()
|
||||
}
|
||||
|
||||
func TestRestoreSnapshot_NotFound(t *testing.T) {
|
||||
e, _ := newSnapshotEnv(t)
|
||||
w, _ := e.store.CreateWorkload(store.Workload{Name: "a", Kind: "project", SourceKind: "image", SourceConfig: `{"image":"x","port":80}`})
|
||||
|
||||
resp := e.doRestore(t, w.ID, "missing-sid", "missing-sid")
|
||||
if resp.StatusCode != http.StatusNotFound {
|
||||
t.Fatalf("unknown snapshot status = %d, want 404", resp.StatusCode)
|
||||
}
|
||||
resp.Body.Close()
|
||||
}
|
||||
|
||||
func TestRestoreSnapshot_HappyPath(t *testing.T) {
|
||||
e, baseVol := newSnapshotEnv(t)
|
||||
e.snapEngine.SetLifecycle(&okLifecycle{tag: "v1"})
|
||||
|
||||
w, err := e.store.CreateWorkload(store.Workload{
|
||||
Name: "data-app", Kind: "project", SourceKind: "image",
|
||||
SourceConfig: `{"image":"reg/app","port":80,"volumes":[{"source":"data","target":"/data","scope":"project"}]}`,
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("create workload: %v", err)
|
||||
}
|
||||
if _, err := e.store.SetWorkloadVolume(store.WorkloadVolume{WorkloadID: w.ID, Target: "/data", Source: "data", Scope: "project"}); err != nil {
|
||||
t.Fatalf("set volume: %v", err)
|
||||
}
|
||||
id8 := w.ID
|
||||
if len(id8) > 8 {
|
||||
id8 = id8[:8]
|
||||
}
|
||||
hostDir := filepath.Join(baseVol, "data-app-"+id8, "data")
|
||||
if err := os.MkdirAll(hostDir, 0o755); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := os.WriteFile(filepath.Join(hostDir, "payload.txt"), []byte("ORIGINAL"), 0o644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
settings, _ := e.store.GetSettings()
|
||||
snap, err := e.snapEngine.Create(w, settings, "base")
|
||||
if err != nil {
|
||||
t.Fatalf("create snapshot: %v", err)
|
||||
}
|
||||
// Drift the live data, then restore.
|
||||
if err := os.WriteFile(filepath.Join(hostDir, "payload.txt"), []byte("CHANGED"), 0o644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
resp := e.doRestore(t, w.ID, snap.ID, snap.ID)
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
body, _ := io.ReadAll(resp.Body)
|
||||
resp.Body.Close()
|
||||
t.Fatalf("restore status = %d, body=%s", resp.StatusCode, body)
|
||||
}
|
||||
resp.Body.Close()
|
||||
if got, _ := os.ReadFile(filepath.Join(hostDir, "payload.txt")); string(got) != "ORIGINAL" {
|
||||
t.Errorf("payload.txt = %q, want ORIGINAL (restored)", got)
|
||||
}
|
||||
}
|
||||
|
||||
// blockingLifecycle blocks in Lock until released, signaling when entered — so
|
||||
// a test can hold one restore in-flight and assert a second is rejected 409.
|
||||
type blockingLifecycle struct {
|
||||
entered chan struct{}
|
||||
release chan struct{}
|
||||
once sync.Once
|
||||
}
|
||||
|
||||
func (l *blockingLifecycle) Lock(string) func() {
|
||||
l.once.Do(func() { close(l.entered) })
|
||||
<-l.release
|
||||
return func() {}
|
||||
}
|
||||
func (l *blockingLifecycle) StopContainers(context.Context, string) (string, error) { return "", nil }
|
||||
func (l *blockingLifecycle) Redeploy(context.Context, store.Workload, string) error { return nil }
|
||||
|
||||
// seedRestorable creates an image workload with a project volume + live data and
|
||||
// a captured snapshot, returning the workload and snapshot ids.
|
||||
func seedRestorable(t *testing.T, e *apiTestEnv, baseVol string) (workloadID, snapshotID string) {
|
||||
t.Helper()
|
||||
w, err := e.store.CreateWorkload(store.Workload{
|
||||
Name: "sf-app", Kind: "project", SourceKind: "image",
|
||||
SourceConfig: `{"image":"reg/app","port":80,"volumes":[{"source":"data","target":"/data","scope":"project"}]}`,
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("create workload: %v", err)
|
||||
}
|
||||
id8 := w.ID
|
||||
if len(id8) > 8 {
|
||||
id8 = id8[:8]
|
||||
}
|
||||
hostDir := filepath.Join(baseVol, "sf-app-"+id8, "data")
|
||||
if err := os.MkdirAll(hostDir, 0o755); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := os.WriteFile(filepath.Join(hostDir, "f.txt"), []byte("data"), 0o644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
settings, _ := e.store.GetSettings()
|
||||
snap, err := e.snapEngine.Create(w, settings, "base")
|
||||
if err != nil {
|
||||
t.Fatalf("create snapshot: %v", err)
|
||||
}
|
||||
return w.ID, snap.ID
|
||||
}
|
||||
|
||||
func TestRestoreSnapshot_SingleFlight409(t *testing.T) {
|
||||
e, baseVol := newSnapshotEnv(t)
|
||||
wid, sid := seedRestorable(t, e, baseVol)
|
||||
bl := &blockingLifecycle{entered: make(chan struct{}), release: make(chan struct{})}
|
||||
e.snapEngine.SetLifecycle(bl)
|
||||
|
||||
// Restore #1: passes validation, takes the single-flight, then blocks inside
|
||||
// the engine's Lock.
|
||||
go func() {
|
||||
resp := e.doRestore(t, wid, sid, sid)
|
||||
resp.Body.Close()
|
||||
}()
|
||||
|
||||
select {
|
||||
case <-bl.entered:
|
||||
case <-time.After(3 * time.Second):
|
||||
t.Fatal("first restore never reached the lifecycle lock")
|
||||
}
|
||||
|
||||
// Restore #2 for the same workload must be rejected fast with 409.
|
||||
resp := e.doRestore(t, wid, sid, sid)
|
||||
got := resp.StatusCode
|
||||
resp.Body.Close()
|
||||
close(bl.release) // let #1 finish
|
||||
if got != http.StatusConflict {
|
||||
t.Fatalf("concurrent restore status = %d, want 409", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestVolumeSnapshots_EndToEnd(t *testing.T) {
|
||||
|
||||
Reference in New Issue
Block a user