Files
alexei.dolgolyov 1c47030854 feat(volsnap): volume snapshot restore (backlog #6)
Restore a captured volume snapshot onto an image workload's live host-bind
data volumes, then redeploy — the most destructive workload action, built to
the adversarially-reviewed design (C1–C6) with all data-loss guards.

- Engine.Restore (engine-owned): all-or-nothing pre-flight re-resolution from
  the workload's CURRENT config (never the tamperable manifest), per-filesystem
  disk pre-check, per-workload lock, container quiesce, extract-to-tmp, durable
  pre-restore snapshot, write-ahead journal, atomic rename swap, redeploy, and
  crash-recovery sweep (RecoverInterruptedRestores) wired before serving.
- internal/keyedmutex: shared per-key lock; deployer now serializes every
  deploy entrypoint per workload via DispatchPlugin (+ LockWorkload/RedeployLocked
  for the restore re-dispatch, no deadlock).
- Untrusted-archive extractor: zip-slip containment, type allow-list (reg/dir
  only), decompression-bomb cap, manifest-index bounds.
- POST /api/workloads/{id}/snapshots/{sid}/restore: admin, X-Confirm-Restore
  header (CSRF), per-workload single-flight (409).
- WebUI: Restore button + danger ConfirmDialog + busy state + i18n (en/ru).

Scope: image-source only; scopes absolute/stage/project (driven off the same
supportedScopes constant capture uses).

Plan-reviewed before coding; per-phase go/security/ts reviews; final review
READY TO MERGE. Security review caught + fixed a CRITICAL manifest-Source path
traversal (re-derive target from current config + base containment).

Plan: plans/volume-snapshot-restore/
2026-06-22 17:23:52 +03:00

346 lines
11 KiB
Go

package volsnap
import (
"archive/tar"
"context"
"errors"
"os"
"path/filepath"
"strings"
"sync"
"testing"
"github.com/alexei/tinyforge/internal/store"
"github.com/alexei/tinyforge/internal/volume"
)
// fakeLifecycle is a test double for the deploy-side hooks the engine calls.
// Each hook appends a label to calls and returns the canned values below, so
// tests can observe the call sequence and inject failures without a real
// deployer/docker.
type fakeLifecycle struct {
	// Canned results handed back by the hooks.
	tag         string // first return value of StopContainers
	stopErr     error  // error returned by StopContainers
	redeployErr error  // error returned by Redeploy

	// Observations captured while the hooks run.
	mu          sync.Mutex // held by rec and saw while they touch calls
	calls       []string   // hook labels, in invocation order
	redeployRef string     // ref argument of the most recent Redeploy call
}
// rec appends s to the recorded call list while holding the mutex.
func (f *fakeLifecycle) rec(s string) {
	f.mu.Lock()
	defer f.mu.Unlock()
	f.calls = append(f.calls, s)
}
// Lock records "lock" and returns a release function that records "unlock"
// when it is invoked. The string argument is ignored.
func (f *fakeLifecycle) Lock(string) func() {
	f.rec("lock")
	release := func() {
		f.rec("unlock")
	}
	return release
}
// StopContainers records "stop" and returns the configured tag and stopErr.
// Both arguments are ignored.
func (f *fakeLifecycle) StopContainers(_ context.Context, _ string) (string, error) {
	f.rec("stop")
	tag, err := f.tag, f.stopErr
	return tag, err
}
// Redeploy records "redeploy:<ref>", remembers ref in redeployRef, and returns
// the configured redeployErr. The context and workload arguments are ignored.
func (f *fakeLifecycle) Redeploy(_ context.Context, _ store.Workload, ref string) error {
	label := "redeploy:" + ref
	f.rec(label)
	f.redeployRef = ref
	return f.redeployErr
}
// saw reports whether a call labelled exactly s has been recorded so far.
func (f *fakeLifecycle) saw(s string) bool {
	f.mu.Lock()
	defer f.mu.Unlock()

	found := false
	for i := 0; i < len(f.calls) && !found; i++ {
		found = f.calls[i] == s
	}
	return found
}
// newRestoreEngine builds the fixture shared by the restore tests: a store
// opened with ":memory:", whose settings have BaseVolumePath pointed at a
// fresh temp dir, and an Engine created over that store with a second temp
// dir (presumably the snapshot directory — see New).
//
// It returns the engine, the store, and the base volume path. Any setup
// failure aborts the calling test. The store is closed via t.Cleanup.
func newRestoreEngine(t *testing.T) (*Engine, *store.Store, string) {
	t.Helper()

	st, err := store.New(":memory:")
	if err != nil {
		t.Fatalf("store: %v", err)
	}
	t.Cleanup(func() { st.Close() })

	base := t.TempDir()

	// Fail loudly if the settings cannot be read: writing back a zero-valued
	// struct would silently clobber whatever the store holds.
	s, err := st.GetSettings()
	if err != nil {
		t.Fatalf("get settings: %v", err)
	}
	s.BaseVolumePath = base
	if err := st.UpdateSettings(s); err != nil {
		t.Fatalf("settings: %v", err)
	}

	eng, err := New(st, t.TempDir())
	if err != nil {
		t.Fatalf("engine: %v", err)
	}
	return eng, st, base
}
// seedImageWorkload creates an image workload with one project-scope volume
// (source "data", target "/data") and returns it together with the host
// directory that volume resolves to under the store's current BaseVolumePath.
// Any failure aborts the calling test.
func seedImageWorkload(t *testing.T, st *store.Store) (store.Workload, string) {
	t.Helper()

	w, err := st.CreateWorkload(store.Workload{
		Name:         "data-app",
		Kind:         "project",
		SourceKind:   "image",
		SourceConfig: `{"image":"reg/app","port":80,"volumes":[{"source":"data","target":"/data","scope":"project"}]}`,
	})
	if err != nil {
		t.Fatalf("create workload: %v", err)
	}

	// The live path is derived from BaseVolumePath; a failed settings read
	// would resolve against an empty base, so do not ignore the error.
	settings, err := st.GetSettings()
	if err != nil {
		t.Fatalf("get settings: %v", err)
	}

	live, err := volume.ResolveWorkloadPath(
		store.WorkloadVolume{Source: "data", Target: "/data", Scope: "project"},
		volume.ResolveWorkloadParams{BasePath: settings.BaseVolumePath, WorkloadID: w.ID, WorkloadName: w.Name},
	)
	if err != nil {
		t.Fatalf("resolve: %v", err)
	}
	return w, live
}
// TestEngineRestore_HappyPath captures a snapshot, lets the live directory
// drift (one file changed, one file added), restores the snapshot, and then
// checks that the live directory matches the snapshot again, that the
// lifecycle hooks were called with the running tag, that a second snapshot
// now exists, and that no restore journal is left behind.
func TestEngineRestore_HappyPath(t *testing.T) {
	eng, st, _ := newRestoreEngine(t)
	w, live := seedImageWorkload(t, st)
	mkDirWith(t, live, "orig.txt", "ORIGINAL")

	settings, err := st.GetSettings()
	if err != nil {
		t.Fatalf("get settings: %v", err)
	}
	snap, err := eng.Create(w, settings, "base")
	if err != nil {
		t.Fatalf("create snapshot: %v", err)
	}

	// Drift: the live dir now differs from the snapshot.
	if err := os.WriteFile(filepath.Join(live, "orig.txt"), []byte("CHANGED"), 0o600); err != nil {
		t.Fatal(err)
	}
	mkDirWith(t, live, "extra.txt", "NEW") // not in the snapshot

	fake := &fakeLifecycle{tag: "v1.2.3"}
	eng.SetLifecycle(fake)

	// Uses the REAL eng.Create for the pre-restore capture — if Restore held
	// e.mu this would deadlock (R1), failing the test instead of production.
	if err := eng.Restore(context.Background(), snap.ID, w.ID); err != nil {
		t.Fatalf("restore: %v", err)
	}

	if got := readIn(t, live, "orig.txt"); got != "ORIGINAL" {
		t.Errorf("orig.txt = %q, want ORIGINAL (restored)", got)
	}
	if _, err := os.Stat(filepath.Join(live, "extra.txt")); !os.IsNotExist(err) {
		t.Error("extra.txt should be gone — restore replaces the volume dir wholesale")
	}

	for _, want := range []string{"lock", "stop", "redeploy:v1.2.3", "unlock"} {
		if !fake.saw(want) {
			t.Errorf("expected lifecycle call %q; calls=%v", want, fake.calls)
		}
	}
	if fake.redeployRef != "v1.2.3" {
		t.Errorf("redeploy reference = %q, want the running tag v1.2.3", fake.redeployRef)
	}

	// A durable pre-restore snapshot was captured (base + pre-restore).
	// Surface a List failure directly rather than as a misleading count of 0.
	snaps, err := eng.List(w.ID)
	if err != nil {
		t.Fatalf("list snapshots: %v", err)
	}
	if len(snaps) != 2 {
		t.Errorf("expected 2 snapshots (base + pre-restore), got %d", len(snaps))
	}

	// No journal left behind.
	assertNoJournal(t, eng)
}
// TestEngineRestore_RedeployFailureKeepsRestoredData makes the fake
// lifecycle's Redeploy fail and checks that Restore reports a redeploy error
// while the live directory still holds the restored content and no journal
// remains.
func TestEngineRestore_RedeployFailureKeepsRestoredData(t *testing.T) {
	eng, st, _ := newRestoreEngine(t)
	w, live := seedImageWorkload(t, st)
	mkDirWith(t, live, "orig.txt", "ORIGINAL")

	// Fixture errors are checked so that a broken setup cannot masquerade as
	// a failure (or success) of the redeploy assertions below.
	settings, err := st.GetSettings()
	if err != nil {
		t.Fatalf("get settings: %v", err)
	}
	snap, err := eng.Create(w, settings, "base")
	if err != nil {
		t.Fatalf("create snapshot: %v", err)
	}

	if err := os.WriteFile(filepath.Join(live, "orig.txt"), []byte("CHANGED"), 0o600); err != nil {
		t.Fatal(err)
	}

	fake := &fakeLifecycle{tag: "v1", redeployErr: errors.New("boom")}
	eng.SetLifecycle(fake)

	err = eng.Restore(context.Background(), snap.ID, w.ID)
	if err == nil || !strings.Contains(err.Error(), "redeploy") {
		t.Fatalf("expected a redeploy error, got %v", err)
	}

	// Data is committed despite the redeploy failure — we must NOT roll it back.
	if got := readIn(t, live, "orig.txt"); got != "ORIGINAL" {
		t.Errorf("orig.txt = %q, want ORIGINAL (restore committed)", got)
	}
	assertNoJournal(t, eng)
}
// TestEngineRestore_PreflightFailDoesNotLockOrStop seeds a snapshot row whose
// manifest declares scope "named" and expects Restore to fail without the
// fake lifecycle ever recording a "lock" or "stop" call.
func TestEngineRestore_PreflightFailDoesNotLockOrStop(t *testing.T) {
	eng, st, _ := newRestoreEngine(t)
	w, _ := seedImageWorkload(t, st)

	// A snapshot whose manifest names an unsupported scope ⇒ pre-flight aborts.
	unsupported := store.VolumeSnapshot{
		WorkloadID: w.ID,
		Filename:   "bad.tar.gz",
		Manifest:   `[{"index":0,"target":"/x","scope":"named","source":"x"}]`,
	}
	snap, err := st.CreateVolumeSnapshot(unsupported)
	if err != nil {
		t.Fatalf("seed snapshot: %v", err)
	}

	lc := &fakeLifecycle{}
	eng.SetLifecycle(lc)

	restoreErr := eng.Restore(context.Background(), snap.ID, w.ID)
	if restoreErr == nil {
		t.Fatal("expected pre-flight to abort on an unsupported scope")
	}

	if lc.saw("lock") || lc.saw("stop") {
		t.Errorf("pre-flight abort must happen BEFORE lock/stop; calls=%v", lc.calls)
	}
}
// TestEngineRestore_NilLifecycle calls Restore on an engine that never had
// SetLifecycle invoked and expects an error whose message mentions
// "lifecycle".
func TestEngineRestore_NilLifecycle(t *testing.T) {
	eng, _, _ := newRestoreEngine(t)

	err := eng.Restore(context.Background(), "s", "w")
	if err != nil && strings.Contains(err.Error(), "lifecycle") {
		return // got the expected rejection
	}
	t.Fatalf("expected a lifecycle-not-configured error, got %v", err)
}
// TestEngineRestore_WrongWorkload captures a snapshot for one workload and
// then asks Restore to apply it to a different workload ID. The call must
// fail, and the fake lifecycle must not have recorded a "lock".
func TestEngineRestore_WrongWorkload(t *testing.T) {
	eng, st, _ := newRestoreEngine(t)
	w, live := seedImageWorkload(t, st)
	mkDirWith(t, live, "f.txt", "x")

	settings, err := st.GetSettings()
	if err != nil {
		t.Fatalf("get settings: %v", err)
	}
	// The snapshot must really exist: with a failed Create, snap.ID would be
	// empty and Restore could error for an unrelated reason, letting this
	// test pass without exercising the cross-workload check at all.
	snap, err := eng.Create(w, settings, "base")
	if err != nil {
		t.Fatalf("create snapshot: %v", err)
	}

	fake := &fakeLifecycle{}
	eng.SetLifecycle(fake)

	if err := eng.Restore(context.Background(), snap.ID, "some-other-workload"); err == nil {
		t.Fatal("expected cross-workload restore to be rejected")
	}
	if fake.saw("lock") {
		t.Error("must reject before taking the lock")
	}
}
// TestEngineRestore_ExtractFailureAbortsAfterLock feeds Restore a hand-built
// two-volume archive whose second volume contains a symlink entry. Restore
// must fail, the fake lifecycle must have recorded lock, stop and
// redeploy:v1, and the snapshot directory must contain neither a restore
// journal nor a ".tf-restore-" entry afterwards.
func TestEngineRestore_ExtractFailureAbortsAfterLock(t *testing.T) {
	eng, st, _ := newRestoreEngine(t)

	// The workload must CURRENTLY declare both targets so pre-flight passes and
	// the failure happens during extraction (post-lock), not pre-flight.
	w, err := st.CreateWorkload(store.Workload{
		Name: "two-vol", Kind: "project", SourceKind: "image",
		SourceConfig: `{"image":"x","port":80,"volumes":[` +
			`{"source":"data","target":"/data","scope":"project"},` +
			`{"source":"other","target":"/other","scope":"project"}]}`,
	})
	if err != nil {
		t.Fatalf("create workload: %v", err)
	}

	// Hand-build a 2-volume archive where volume 1 carries a symlink entry the
	// untrusted extractor rejects — forcing a post-lock extract failure after
	// volume 0 has already been staged.
	arc := buildTarGz(t, []tentry{
		{name: "0/f.txt", typeflag: tar.TypeReg, body: "x"},
		{name: "1/evil", typeflag: tar.TypeSymlink, linkname: "/etc/passwd"},
	})
	data, err := os.ReadFile(arc)
	if err != nil {
		t.Fatal(err)
	}
	fname := "extract-fail.tar.gz"
	if err := os.WriteFile(filepath.Join(eng.snapDir, fname), data, 0o600); err != nil {
		t.Fatal(err)
	}

	snap, err := st.CreateVolumeSnapshot(store.VolumeSnapshot{
		WorkloadID: w.ID, Filename: fname,
		Manifest: `[{"index":0,"target":"/data","scope":"project","source":"data"},` +
			`{"index":1,"target":"/other","scope":"project","source":"other"}]`,
	})
	if err != nil {
		t.Fatalf("seed snapshot: %v", err)
	}

	fake := &fakeLifecycle{tag: "v1"}
	eng.SetLifecycle(fake)

	if err := eng.Restore(context.Background(), snap.ID, w.ID); err == nil {
		t.Fatal("expected extract failure to abort the restore")
	}

	// Post-lock abort: it stopped, then brought the app back (no swaps happened).
	if !fake.saw("lock") || !fake.saw("stop") || !fake.saw("redeploy:v1") {
		t.Errorf("expected lock+stop+redeploy after a post-lock abort; calls=%v", fake.calls)
	}

	// No staging or journal left behind.
	assertNoJournal(t, eng)

	// A failed directory read must fail the test; otherwise the loop below
	// would iterate over nothing and the leftover check would pass vacuously.
	entries, err := os.ReadDir(eng.snapDir)
	if err != nil {
		t.Fatalf("read snapshot dir: %v", err)
	}
	for _, e := range entries {
		if strings.Contains(e.Name(), ".tf-restore-") {
			t.Errorf("leftover staging dir: %s", e.Name())
		}
	}
}
// TestRecoverInterruptedRestores writes one journal describing three volumes
// in different interrupted states, runs RecoverInterruptedRestores, and
// checks the resulting on-disk layout for each volume:
//
//   - A: journal says swapped — live keeps the restored data, old is removed.
//   - B: not swapped, live present — live is untouched, tmp is removed.
//   - C: not swapped, live missing, old present — live is rebuilt from old,
//     tmp is removed.
//
// Exactly one journal must be reported as recovered and none may remain.
func TestRecoverInterruptedRestores(t *testing.T) {
	eng, _, _ := newRestoreEngine(t)
	root := t.TempDir()

	// A: swap completed — keep restored live, drop old.
	liveA, oldA := filepath.Join(root, "A"), filepath.Join(root, ".A.old")
	mkDirWith(t, liveA, "f", "RESTORED-A")
	mkDirWith(t, oldA, "f", "ORIGINAL-A")

	// B: not swapped, live present — keep original, drop tmp.
	liveB, tmpB := filepath.Join(root, "B"), filepath.Join(root, ".B.tmp")
	mkDirWith(t, liveB, "f", "ORIGINAL-B")
	mkDirWith(t, tmpB, "f", "STAGED-B")

	// C: crashed mid-rename — live missing, old present — revert from old.
	liveC := filepath.Join(root, "C")
	oldC, tmpC := filepath.Join(root, ".C.old"), filepath.Join(root, ".C.tmp")
	mkDirWith(t, oldC, "f", "ORIGINAL-C")
	mkDirWith(t, tmpC, "f", "STAGED-C")

	journal := restoreJournal{
		SnapshotID: "snap",
		WorkloadID: "wl-recover",
		Volumes: []journalVolume{
			{Live: liveA, Old: oldA, Swapped: true, HadOld: true},
			{Live: liveB, Tmp: tmpB, Swapped: false},
			{Live: liveC, Old: oldC, Tmp: tmpC, Swapped: false},
		},
	}
	if err := eng.writeJournal(journal); err != nil {
		t.Fatalf("write journal: %v", err)
	}

	recovered, err := eng.RecoverInterruptedRestores()
	if err != nil {
		t.Fatalf("recover: %v", err)
	}
	if recovered != 1 {
		t.Fatalf("recovered %d journals, want 1", recovered)
	}

	// Per volume: the expected content of live/f, then a path that must be gone.
	expectations := []struct {
		live, want, liveFmt string
		gone, goneMsg       string
	}{
		{liveA, "RESTORED-A", "A live = %q, want RESTORED-A (swap kept)", oldA, "A old should be removed"},
		{liveB, "ORIGINAL-B", "B live = %q, want ORIGINAL-B (untouched)", tmpB, "B tmp should be removed"},
		{liveC, "ORIGINAL-C", "C live = %q, want ORIGINAL-C (reverted from old)", tmpC, "C tmp should be removed"},
	}
	for _, exp := range expectations {
		if got := readIn(t, exp.live, "f"); got != exp.want {
			t.Errorf(exp.liveFmt, got)
		}
		if _, statErr := os.Stat(exp.gone); !os.IsNotExist(statErr) {
			t.Error(exp.goneMsg)
		}
	}

	assertNoJournal(t, eng)
}
// assertNoJournal fails the test if the engine's snapshot directory cannot be
// read, and reports an error for every entry in it named "restore-*.json".
func assertNoJournal(t *testing.T, eng *Engine) {
	t.Helper()

	listing, err := os.ReadDir(eng.snapDir)
	if err != nil {
		t.Fatal(err)
	}
	for _, entry := range listing {
		name := entry.Name()
		if !strings.HasPrefix(name, "restore-") || !strings.HasSuffix(name, ".json") {
			continue
		}
		t.Errorf("leftover restore journal: %s", name)
	}
}