1c47030854
Restore a captured volume snapshot onto an image workload's live host-bind
data volumes, then redeploy — the most destructive workload action, built to
the adversarially-reviewed design (C1–C6) with all data-loss guards.
- Engine.Restore (engine-owned): all-or-nothing pre-flight re-resolution from
the workload's CURRENT config (never the tamperable manifest), per-filesystem
disk pre-check, per-workload lock, container quiesce, extract-to-tmp, durable
pre-restore snapshot, write-ahead journal, atomic rename swap, redeploy, and
crash-recovery sweep (RecoverInterruptedRestores) wired before serving.
- internal/keyedmutex: shared per-key lock; deployer now serializes every
deploy entrypoint per workload via DispatchPlugin (+ LockWorkload/RedeployLocked
for the restore re-dispatch, no deadlock).
- Untrusted-archive extractor: zip-slip containment, type allow-list (reg/dir
only), decompression-bomb cap, manifest-index bounds.
- POST /api/workloads/{id}/snapshots/{sid}/restore: admin, X-Confirm-Restore
header (CSRF), per-workload single-flight (409).
- WebUI: Restore button + danger ConfirmDialog + busy state + i18n (en/ru).
Scope: image-source only; scopes absolute/stage/project (driven off the same
supportedScopes constant capture uses).
Plan-reviewed before coding; per-phase go/security/ts reviews; final review
READY TO MERGE. Security review caught + fixed a CRITICAL manifest-Source path
traversal (re-derive target from current config + base containment).
Plan: plans/volume-snapshot-restore/
346 lines
11 KiB
Go
346 lines
11 KiB
Go
package volsnap
|
|
|
|
import (
|
|
"archive/tar"
|
|
"context"
|
|
"errors"
|
|
"os"
|
|
"path/filepath"
|
|
"strings"
|
|
"sync"
|
|
"testing"
|
|
|
|
"github.com/alexei/tinyforge/internal/store"
|
|
"github.com/alexei/tinyforge/internal/volume"
|
|
)
|
|
|
|
// fakeLifecycle is a test double for the deploy-side lifecycle hooks. It logs
// every call in order and hands back whatever tag and errors the test
// configured, so neither a real deployer nor docker is required.
type fakeLifecycle struct {
	// mu is held while the call log is appended to or searched.
	mu sync.Mutex
	// calls is the call log, in invocation order.
	calls []string
	// tag and stopErr are the values StopContainers returns.
	tag     string
	stopErr error
	// redeployErr is the value Redeploy returns.
	redeployErr error
	// redeployRef is the reference passed to the most recent Redeploy call.
	redeployRef string
}

// rec appends s to the call log while holding the mutex.
func (f *fakeLifecycle) rec(s string) {
	f.mu.Lock()
	defer f.mu.Unlock()
	f.calls = append(f.calls, s)
}

// Lock logs "lock" and returns a release function that logs "unlock" when it
// is invoked. The string argument is ignored.
func (f *fakeLifecycle) Lock(string) func() {
	f.rec("lock")
	release := func() { f.rec("unlock") }
	return release
}

// StopContainers logs "stop" and returns the configured tag and stop error.
// Both arguments are ignored.
func (f *fakeLifecycle) StopContainers(context.Context, string) (string, error) {
	f.rec("stop")
	tag, err := f.tag, f.stopErr
	return tag, err
}
|
|
func (f *fakeLifecycle) Redeploy(_ context.Context, _ store.Workload, ref string) error {
|
|
f.rec("redeploy:" + ref)
|
|
f.redeployRef = ref
|
|
return f.redeployErr
|
|
}
|
|
func (f *fakeLifecycle) saw(s string) bool {
|
|
f.mu.Lock()
|
|
defer f.mu.Unlock()
|
|
for _, c := range f.calls {
|
|
if c == s {
|
|
return true
|
|
}
|
|
}
|
|
return false
|
|
}
|
|
|
|
func newRestoreEngine(t *testing.T) (*Engine, *store.Store, string) {
|
|
t.Helper()
|
|
st, err := store.New(":memory:")
|
|
if err != nil {
|
|
t.Fatalf("store: %v", err)
|
|
}
|
|
t.Cleanup(func() { st.Close() })
|
|
base := t.TempDir()
|
|
s, _ := st.GetSettings()
|
|
s.BaseVolumePath = base
|
|
if err := st.UpdateSettings(s); err != nil {
|
|
t.Fatalf("settings: %v", err)
|
|
}
|
|
eng, err := New(st, t.TempDir())
|
|
if err != nil {
|
|
t.Fatalf("engine: %v", err)
|
|
}
|
|
return eng, st, base
|
|
}
|
|
|
|
// seedImageWorkload creates an image workload with one project-scope volume and
|
|
// returns it plus the resolved live host dir.
|
|
func seedImageWorkload(t *testing.T, st *store.Store) (store.Workload, string) {
|
|
t.Helper()
|
|
w, err := st.CreateWorkload(store.Workload{
|
|
Name: "data-app",
|
|
Kind: "project",
|
|
SourceKind: "image",
|
|
SourceConfig: `{"image":"reg/app","port":80,"volumes":[{"source":"data","target":"/data","scope":"project"}]}`,
|
|
})
|
|
if err != nil {
|
|
t.Fatalf("create workload: %v", err)
|
|
}
|
|
settings, _ := st.GetSettings()
|
|
live, err := volume.ResolveWorkloadPath(
|
|
store.WorkloadVolume{Source: "data", Target: "/data", Scope: "project"},
|
|
volume.ResolveWorkloadParams{BasePath: settings.BaseVolumePath, WorkloadID: w.ID, WorkloadName: w.Name},
|
|
)
|
|
if err != nil {
|
|
t.Fatalf("resolve: %v", err)
|
|
}
|
|
return w, live
|
|
}
|
|
|
|
func TestEngineRestore_HappyPath(t *testing.T) {
|
|
eng, st, _ := newRestoreEngine(t)
|
|
w, live := seedImageWorkload(t, st)
|
|
mkDirWith(t, live, "orig.txt", "ORIGINAL")
|
|
|
|
settings, _ := st.GetSettings()
|
|
snap, err := eng.Create(w, settings, "base")
|
|
if err != nil {
|
|
t.Fatalf("create snapshot: %v", err)
|
|
}
|
|
|
|
// Drift: the live dir now differs from the snapshot.
|
|
if err := os.WriteFile(filepath.Join(live, "orig.txt"), []byte("CHANGED"), 0o600); err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
mkDirWith(t, live, "extra.txt", "NEW") // not in the snapshot
|
|
|
|
fake := &fakeLifecycle{tag: "v1.2.3"}
|
|
eng.SetLifecycle(fake)
|
|
|
|
// Uses the REAL eng.Create for the pre-restore capture — if Restore held
|
|
// e.mu this would deadlock (R1), failing the test instead of production.
|
|
if err := eng.Restore(context.Background(), snap.ID, w.ID); err != nil {
|
|
t.Fatalf("restore: %v", err)
|
|
}
|
|
|
|
if got := readIn(t, live, "orig.txt"); got != "ORIGINAL" {
|
|
t.Errorf("orig.txt = %q, want ORIGINAL (restored)", got)
|
|
}
|
|
if _, err := os.Stat(filepath.Join(live, "extra.txt")); !os.IsNotExist(err) {
|
|
t.Error("extra.txt should be gone — restore replaces the volume dir wholesale")
|
|
}
|
|
for _, want := range []string{"lock", "stop", "redeploy:v1.2.3", "unlock"} {
|
|
if !fake.saw(want) {
|
|
t.Errorf("expected lifecycle call %q; calls=%v", want, fake.calls)
|
|
}
|
|
}
|
|
if fake.redeployRef != "v1.2.3" {
|
|
t.Errorf("redeploy reference = %q, want the running tag v1.2.3", fake.redeployRef)
|
|
}
|
|
// A durable pre-restore snapshot was captured (base + pre-restore).
|
|
snaps, _ := eng.List(w.ID)
|
|
if len(snaps) != 2 {
|
|
t.Errorf("expected 2 snapshots (base + pre-restore), got %d", len(snaps))
|
|
}
|
|
// No journal left behind.
|
|
assertNoJournal(t, eng)
|
|
}
|
|
|
|
func TestEngineRestore_RedeployFailureKeepsRestoredData(t *testing.T) {
|
|
eng, st, _ := newRestoreEngine(t)
|
|
w, live := seedImageWorkload(t, st)
|
|
mkDirWith(t, live, "orig.txt", "ORIGINAL")
|
|
settings, _ := st.GetSettings()
|
|
snap, _ := eng.Create(w, settings, "base")
|
|
if err := os.WriteFile(filepath.Join(live, "orig.txt"), []byte("CHANGED"), 0o600); err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
|
|
fake := &fakeLifecycle{tag: "v1", redeployErr: errors.New("boom")}
|
|
eng.SetLifecycle(fake)
|
|
|
|
err := eng.Restore(context.Background(), snap.ID, w.ID)
|
|
if err == nil || !strings.Contains(err.Error(), "redeploy") {
|
|
t.Fatalf("expected a redeploy error, got %v", err)
|
|
}
|
|
// Data is committed despite the redeploy failure — we must NOT roll it back.
|
|
if got := readIn(t, live, "orig.txt"); got != "ORIGINAL" {
|
|
t.Errorf("orig.txt = %q, want ORIGINAL (restore committed)", got)
|
|
}
|
|
assertNoJournal(t, eng)
|
|
}
|
|
|
|
func TestEngineRestore_PreflightFailDoesNotLockOrStop(t *testing.T) {
|
|
eng, st, _ := newRestoreEngine(t)
|
|
w, _ := seedImageWorkload(t, st)
|
|
// A snapshot whose manifest names an unsupported scope ⇒ pre-flight aborts.
|
|
bad, err := st.CreateVolumeSnapshot(store.VolumeSnapshot{
|
|
WorkloadID: w.ID, Filename: "bad.tar.gz",
|
|
Manifest: `[{"index":0,"target":"/x","scope":"named","source":"x"}]`,
|
|
})
|
|
if err != nil {
|
|
t.Fatalf("seed snapshot: %v", err)
|
|
}
|
|
fake := &fakeLifecycle{}
|
|
eng.SetLifecycle(fake)
|
|
|
|
if err := eng.Restore(context.Background(), bad.ID, w.ID); err == nil {
|
|
t.Fatal("expected pre-flight to abort on an unsupported scope")
|
|
}
|
|
if fake.saw("lock") || fake.saw("stop") {
|
|
t.Errorf("pre-flight abort must happen BEFORE lock/stop; calls=%v", fake.calls)
|
|
}
|
|
}
|
|
|
|
func TestEngineRestore_NilLifecycle(t *testing.T) {
|
|
eng, _, _ := newRestoreEngine(t)
|
|
if err := eng.Restore(context.Background(), "s", "w"); err == nil ||
|
|
!strings.Contains(err.Error(), "lifecycle") {
|
|
t.Fatalf("expected a lifecycle-not-configured error, got %v", err)
|
|
}
|
|
}
|
|
|
|
func TestEngineRestore_WrongWorkload(t *testing.T) {
|
|
eng, st, _ := newRestoreEngine(t)
|
|
w, live := seedImageWorkload(t, st)
|
|
mkDirWith(t, live, "f.txt", "x")
|
|
settings, _ := st.GetSettings()
|
|
snap, _ := eng.Create(w, settings, "base")
|
|
fake := &fakeLifecycle{}
|
|
eng.SetLifecycle(fake)
|
|
|
|
if err := eng.Restore(context.Background(), snap.ID, "some-other-workload"); err == nil {
|
|
t.Fatal("expected cross-workload restore to be rejected")
|
|
}
|
|
if fake.saw("lock") {
|
|
t.Error("must reject before taking the lock")
|
|
}
|
|
}
|
|
|
|
func TestEngineRestore_ExtractFailureAbortsAfterLock(t *testing.T) {
|
|
eng, st, _ := newRestoreEngine(t)
|
|
// The workload must CURRENTLY declare both targets so pre-flight passes and
|
|
// the failure happens during extraction (post-lock), not pre-flight.
|
|
w, err := st.CreateWorkload(store.Workload{
|
|
Name: "two-vol", Kind: "project", SourceKind: "image",
|
|
SourceConfig: `{"image":"x","port":80,"volumes":[` +
|
|
`{"source":"data","target":"/data","scope":"project"},` +
|
|
`{"source":"other","target":"/other","scope":"project"}]}`,
|
|
})
|
|
if err != nil {
|
|
t.Fatalf("create workload: %v", err)
|
|
}
|
|
|
|
// Hand-build a 2-volume archive where volume 1 carries a symlink entry the
|
|
// untrusted extractor rejects — forcing a post-lock extract failure after
|
|
// volume 0 has already been staged.
|
|
arc := buildTarGz(t, []tentry{
|
|
{name: "0/f.txt", typeflag: tar.TypeReg, body: "x"},
|
|
{name: "1/evil", typeflag: tar.TypeSymlink, linkname: "/etc/passwd"},
|
|
})
|
|
data, err := os.ReadFile(arc)
|
|
if err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
fname := "extract-fail.tar.gz"
|
|
if err := os.WriteFile(filepath.Join(eng.snapDir, fname), data, 0o600); err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
snap, err := st.CreateVolumeSnapshot(store.VolumeSnapshot{
|
|
WorkloadID: w.ID, Filename: fname,
|
|
Manifest: `[{"index":0,"target":"/data","scope":"project","source":"data"},` +
|
|
`{"index":1,"target":"/other","scope":"project","source":"other"}]`,
|
|
})
|
|
if err != nil {
|
|
t.Fatalf("seed snapshot: %v", err)
|
|
}
|
|
|
|
fake := &fakeLifecycle{tag: "v1"}
|
|
eng.SetLifecycle(fake)
|
|
|
|
if err := eng.Restore(context.Background(), snap.ID, w.ID); err == nil {
|
|
t.Fatal("expected extract failure to abort the restore")
|
|
}
|
|
// Post-lock abort: it stopped, then brought the app back (no swaps happened).
|
|
if !fake.saw("lock") || !fake.saw("stop") || !fake.saw("redeploy:v1") {
|
|
t.Errorf("expected lock+stop+redeploy after a post-lock abort; calls=%v", fake.calls)
|
|
}
|
|
// No staging or journal left behind.
|
|
assertNoJournal(t, eng)
|
|
entries, _ := os.ReadDir(eng.snapDir)
|
|
for _, e := range entries {
|
|
if strings.Contains(e.Name(), ".tf-restore-") {
|
|
t.Errorf("leftover staging dir: %s", e.Name())
|
|
}
|
|
}
|
|
}
|
|
|
|
func TestRecoverInterruptedRestores(t *testing.T) {
|
|
eng, _, _ := newRestoreEngine(t)
|
|
root := t.TempDir()
|
|
|
|
// A: swap completed — keep restored live, drop old.
|
|
liveA := filepath.Join(root, "A")
|
|
oldA := filepath.Join(root, ".A.old")
|
|
mkDirWith(t, liveA, "f", "RESTORED-A")
|
|
mkDirWith(t, oldA, "f", "ORIGINAL-A")
|
|
// B: not swapped, live present — keep original, drop tmp.
|
|
liveB := filepath.Join(root, "B")
|
|
tmpB := filepath.Join(root, ".B.tmp")
|
|
mkDirWith(t, liveB, "f", "ORIGINAL-B")
|
|
mkDirWith(t, tmpB, "f", "STAGED-B")
|
|
// C: crashed mid-rename — live missing, old present — revert from old.
|
|
liveC := filepath.Join(root, "C")
|
|
oldC := filepath.Join(root, ".C.old")
|
|
tmpC := filepath.Join(root, ".C.tmp")
|
|
mkDirWith(t, oldC, "f", "ORIGINAL-C")
|
|
mkDirWith(t, tmpC, "f", "STAGED-C")
|
|
|
|
jr := restoreJournal{SnapshotID: "snap", WorkloadID: "wl-recover", Volumes: []journalVolume{
|
|
{Live: liveA, Old: oldA, Swapped: true, HadOld: true},
|
|
{Live: liveB, Tmp: tmpB, Swapped: false},
|
|
{Live: liveC, Old: oldC, Tmp: tmpC, Swapped: false},
|
|
}}
|
|
if err := eng.writeJournal(jr); err != nil {
|
|
t.Fatalf("write journal: %v", err)
|
|
}
|
|
|
|
n, err := eng.RecoverInterruptedRestores()
|
|
if err != nil {
|
|
t.Fatalf("recover: %v", err)
|
|
}
|
|
if n != 1 {
|
|
t.Fatalf("recovered %d journals, want 1", n)
|
|
}
|
|
if got := readIn(t, liveA, "f"); got != "RESTORED-A" {
|
|
t.Errorf("A live = %q, want RESTORED-A (swap kept)", got)
|
|
}
|
|
if _, err := os.Stat(oldA); !os.IsNotExist(err) {
|
|
t.Error("A old should be removed")
|
|
}
|
|
if got := readIn(t, liveB, "f"); got != "ORIGINAL-B" {
|
|
t.Errorf("B live = %q, want ORIGINAL-B (untouched)", got)
|
|
}
|
|
if _, err := os.Stat(tmpB); !os.IsNotExist(err) {
|
|
t.Error("B tmp should be removed")
|
|
}
|
|
if got := readIn(t, liveC, "f"); got != "ORIGINAL-C" {
|
|
t.Errorf("C live = %q, want ORIGINAL-C (reverted from old)", got)
|
|
}
|
|
if _, err := os.Stat(tmpC); !os.IsNotExist(err) {
|
|
t.Error("C tmp should be removed")
|
|
}
|
|
assertNoJournal(t, eng)
|
|
}
|
|
|
|
func assertNoJournal(t *testing.T, eng *Engine) {
|
|
t.Helper()
|
|
entries, err := os.ReadDir(eng.snapDir)
|
|
if err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
for _, e := range entries {
|
|
if strings.HasPrefix(e.Name(), "restore-") && strings.HasSuffix(e.Name(), ".json") {
|
|
t.Errorf("leftover restore journal: %s", e.Name())
|
|
}
|
|
}
|
|
}
|