From 1c47030854c5a42e86bd79bcdddb263089d67c65 Mon Sep 17 00:00:00 2001 From: "alexei.dolgolyov" Date: Mon, 22 Jun 2026 17:23:52 +0300 Subject: [PATCH] feat(volsnap): volume snapshot restore (backlog #6) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Restore a captured volume snapshot onto an image workload's live host-bind data volumes, then redeploy — the most destructive workload action, built to the adversarially-reviewed design (C1–C6) with all data-loss guards. - Engine.Restore (engine-owned): all-or-nothing pre-flight re-resolution from the workload's CURRENT config (never the tamperable manifest), per-filesystem disk pre-check, per-workload lock, container quiesce, extract-to-tmp, durable pre-restore snapshot, write-ahead journal, atomic rename swap, redeploy, and crash-recovery sweep (RecoverInterruptedRestores) wired before serving. - internal/keyedmutex: shared per-key lock; deployer now serializes every deploy entrypoint per workload via DispatchPlugin (+ LockWorkload/RedeployLocked for the restore re-dispatch, no deadlock). - Untrusted-archive extractor: zip-slip containment, type allow-list (reg/dir only), decompression-bomb cap, manifest-index bounds. - POST /api/workloads/{id}/snapshots/{sid}/restore: admin, X-Confirm-Restore header (CSRF), per-workload single-flight (409). - WebUI: Restore button + danger ConfirmDialog + busy state + i18n (en/ru). Scope: image-source only; scopes absolute/stage/project (driven off the same supportedScopes constant capture uses). Plan-reviewed before coding; per-phase go/security/ts reviews; final review READY TO MERGE. Security review caught + fixed a CRITICAL manifest-Source path traversal (re-derive target from current config + base containment). Plan: plans/volume-snapshot-restore/ --- CLAUDE.md | 20 + cmd/server/main.go | 10 + cmd/server/restore_lifecycle.go | 70 +++ go.mod | 6 +- go.sum | 2 - internal/api/router.go | 10 + internal/api/volume_snapshots.go | 66 +++ internal/api/volume_snapshots_test.go | 209 ++++++++- internal/api/workloads_test.go | 8 +- internal/deployer/deployer.go | 26 ++ internal/deployer/dispatch.go | 18 + internal/keyedmutex/keyedmutex.go | 48 +++ internal/keyedmutex/keyedmutex_test.go | 83 ++++ internal/volsnap/disk_unix.go | 21 + internal/volsnap/disk_windows.go | 24 ++ internal/volsnap/engine.go | 6 + internal/volsnap/extract.go | 162 +++++++ internal/volsnap/extract_test.go | 166 ++++++++ internal/volsnap/restore.go | 235 ++++++++++ internal/volsnap/restore_engine.go | 400 ++++++++++++++++++ internal/volsnap/restore_engine_test.go | 345 +++++++++++++++ internal/volsnap/restore_test.go | 270 ++++++++++++ internal/volsnap/volumes.go | 50 ++- plans/volume-snapshot-restore/CONTEXT.md | 60 +++ plans/volume-snapshot-restore/PLAN.md | 113 +++++ .../phase-1-engine-primitives.md | 96 +++++ .../phase-2-lifecycle-locking.md | 106 +++++ plans/volume-snapshot-restore/phase-3-api.md | 76 ++++ .../phase-4-frontend.md | 70 +++ web/src/lib/api.ts | 18 +- .../components/WorkloadSnapshotsPanel.svelte | 49 ++- web/src/lib/i18n/en.json | 8 +- web/src/lib/i18n/ru.json | 8 +- 33 files changed, 2825 insertions(+), 34 deletions(-) create mode 100644 cmd/server/restore_lifecycle.go create mode 100644 internal/keyedmutex/keyedmutex.go create mode 100644 internal/keyedmutex/keyedmutex_test.go create mode 100644 internal/volsnap/disk_unix.go create mode 100644 internal/volsnap/disk_windows.go create mode 100644 internal/volsnap/extract.go create mode 100644 internal/volsnap/extract_test.go create mode 100644 internal/volsnap/restore.go create mode 100644 internal/volsnap/restore_engine.go create mode 100644 internal/volsnap/restore_engine_test.go create mode 100644 internal/volsnap/restore_test.go create mode 100644 plans/volume-snapshot-restore/CONTEXT.md create mode 100644 plans/volume-snapshot-restore/PLAN.md create mode 100644 plans/volume-snapshot-restore/phase-1-engine-primitives.md create mode 100644 plans/volume-snapshot-restore/phase-2-lifecycle-locking.md create mode 100644 plans/volume-snapshot-restore/phase-3-api.md create mode 100644 plans/volume-snapshot-restore/phase-4-frontend.md diff --git a/CLAUDE.md b/CLAUDE.md index 313d3ba..d680ea5 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -17,6 +17,26 @@ Start/restart with: `./scripts/dev-server.sh` - **"App" = workload with `source_kind !== ''`.** Triggers are first-class bindings (`workload_trigger_bindings`), NOT on the workload row — never gate app lists/counts on `trigger_kind` (it's empty for plugin workloads). Legacy pre-cutover `kind:project/stack/site` rows have an empty `source_kind` and must be excluded everywhere. - **i18n parity is mandatory** — every key in BOTH `web/src/lib/i18n/{en,ru}.json`. A missing key is NOT a build error (`$t` returns the key string), so verify parity manually. +## Backend + +- **Per-workload deploy lock.** Every deploy entrypoint (API deploy, rollback, promote, + generic-hooks, webhook trigger dispatch) funnels through `deployer.DispatchPlugin`, which + holds a per-workload `keyedmutex` lock (`internal/keyedmutex`) for the whole dispatch; + `DispatchTeardown` takes it too. This serializes all container/volume mutation per workload. + Do NOT add a deploy/teardown path that bypasses `DispatchPlugin`. Operations that must run + a deploy *while already holding* the lock (volume-snapshot restore) use + `Deployer.LockWorkload` + `RedeployLocked` (the unlocked dispatch) — calling `DispatchPlugin` + under the held lock would deadlock (Go mutexes are not reentrant). `activeWg` is a global + drain barrier for shutdown, NOT a per-workload lock. +- **Volume snapshot restore** lives in `volsnap.Engine.Restore` (engine-owned, not the API + handler): preflight re-resolves volumes from the workload's CURRENT config (never the + snapshot manifest — that's tamper-influenceable) → lock → stop → extract-to-tmp → + pre-restore snapshot → journal → atomic rename swap → redeploy. A startup + `RecoverInterruptedRestores` sweep replays the journal after a crash; it MUST be wired (with + `SetLifecycle`) before the API serves. The archive extractor treats the tar as untrusted + (zip-slip/type-allowlist/bomb-cap); the endpoint requires an `X-Confirm-Restore: ` + header (CSRF), like the DB restore. + ## Build & Test - Frontend (from `web/`): `npm run check` (svelte-check — expect 0 errors), `npm run build`, `npm run test` (vitest; pure-logic units like `sourceForms.test.ts`). diff --git a/cmd/server/main.go b/cmd/server/main.go index bc6450b..57caca3 100644 --- a/cmd/server/main.go +++ b/cmd/server/main.go @@ -419,6 +419,16 @@ func main() { apiServer.SetLogScanReloader(logScanMgr) apiServer.SetBackupEngine(backupEngine) apiServer.SetSnapshotEngine(snapshotEngine) + // Wire the restore lifecycle seam and reconcile any restore interrupted by a + // crash, BEFORE the HTTP server starts serving — so a half-applied restore is + // completed/reverted first and the restore endpoint is never reachable + // without its safety net. + snapshotEngine.SetLifecycle(&restoreLifecycle{dep: dep, docker: dockerClient, store: db}) + if n, err := snapshotEngine.RecoverInterruptedRestores(); err != nil { + slog.Warn("snapshots: recover interrupted restores on startup", "error", err) + } else if n > 0 { + slog.Info("snapshots: recovered interrupted restores on startup", "count", n) + } apiServer.SetDBPath(dbPath) apiServer.SetBackupSettingsChangedCallback(scheduleAutobackup) apiServer.SetDNSProvider(dnsProvider) diff --git a/cmd/server/restore_lifecycle.go b/cmd/server/restore_lifecycle.go new file mode 100644 index 0000000..3fa01e1 --- /dev/null +++ b/cmd/server/restore_lifecycle.go @@ -0,0 +1,70 @@ +package main + +import ( + "context" + "fmt" + "log/slog" + "time" + + "github.com/alexei/tinyforge/internal/deployer" + "github.com/alexei/tinyforge/internal/docker" + "github.com/alexei/tinyforge/internal/store" + "github.com/alexei/tinyforge/internal/workload/plugin" +) + +// restoreStopTimeoutSeconds bounds the graceful-stop window per container during +// a restore quiesce before Docker kills it. +const restoreStopTimeoutSeconds = 10 + +// restoreLifecycle adapts the deployer + Docker client + store to the +// volsnap.Lifecycle seam the volume-snapshot restore flow needs. It lives in the +// composition root so the volsnap package stays decoupled from deployer/docker. +type restoreLifecycle struct { + dep *deployer.Deployer + docker *docker.Client + store *store.Store +} + +// Lock takes the deployer's per-workload deploy lock so the restore serializes +// against every deploy entrypoint (C1). +func (l *restoreLifecycle) Lock(workloadID string) func() { return l.dep.LockWorkload(workloadID) } + +// StopContainers stops every running container for the workload (quiesce before +// the volume swap, C4) and returns the image tag the newest running container +// was on, so the redeploy brings the SAME version back up. ListContainersByWorkload +// returns rows newest-first, so the first running row is the newest. +func (l *restoreLifecycle) StopContainers(ctx context.Context, workloadID string) (string, error) { + rows, err := l.store.ListContainersByWorkload(workloadID) + if err != nil { + return "", fmt.Errorf("list containers: %w", err) + } + tag := "" + for _, c := range rows { + if c.State != "running" || c.ContainerID == "" { + continue + } + if tag == "" && c.ImageTag != "" { + tag = c.ImageTag // newest running container's tag + } + if err := l.docker.StopContainer(ctx, c.ContainerID, restoreStopTimeoutSeconds); err != nil { + return "", fmt.Errorf("stop container %s: %w", c.ContainerID, err) + } + if err := l.store.UpdateContainerState(c.ID, "stopped"); err != nil { + slog.Warn("restore: mark container stopped", "container", c.ID, "error", err) + } + } + return tag, nil +} + +// Redeploy re-dispatches the workload via the deployer's unlocked path (the +// restore already holds the per-workload lock). reference pins the image tag. +func (l *restoreLifecycle) Redeploy(ctx context.Context, w store.Workload, reference string) error { + intent := plugin.DeploymentIntent{ + Reason: "restore", + Reference: reference, + Metadata: map[string]string{"note": "redeploy after volume snapshot restore"}, + TriggeredAt: time.Now().UTC(), + TriggeredBy: "restore", + } + return l.dep.RedeployLocked(ctx, plugin.WorkloadFromStore(w), intent) +} diff --git a/go.mod b/go.mod index 34b8649..06ba475 100644 --- a/go.mod +++ b/go.mod @@ -10,8 +10,11 @@ require ( github.com/moby/moby/api v1.54.0 github.com/moby/moby/client v0.3.0 github.com/robfig/cron/v3 v3.0.1 + github.com/yuin/goldmark v1.8.2 golang.org/x/crypto v0.28.0 golang.org/x/oauth2 v0.25.0 + golang.org/x/sync v0.20.0 + golang.org/x/sys v0.33.0 gopkg.in/yaml.v3 v3.0.1 modernc.org/sqlite v1.34.5 ) @@ -34,15 +37,12 @@ require ( github.com/opencontainers/go-digest v1.0.0 // indirect github.com/opencontainers/image-spec v1.1.1 // indirect github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec // indirect - github.com/yuin/goldmark v1.8.2 // indirect go.opentelemetry.io/auto/sdk v1.1.0 // indirect go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.60.0 // indirect go.opentelemetry.io/otel v1.35.0 // indirect go.opentelemetry.io/otel/metric v1.35.0 // indirect go.opentelemetry.io/otel/trace v1.35.0 // indirect golang.org/x/mod v0.18.0 // indirect - golang.org/x/sync v0.20.0 // indirect - golang.org/x/sys v0.33.0 // indirect golang.org/x/tools v0.22.0 // indirect modernc.org/libc v1.55.3 // indirect modernc.org/mathutil v1.6.0 // indirect diff --git a/go.sum b/go.sum index 6d1a0fa..de9c4ac 100644 --- a/go.sum +++ b/go.sum @@ -85,8 +85,6 @@ golang.org/x/mod v0.18.0 h1:5+9lSbEzPSdWkH32vYPBwEpX8KwDbM52Ud9xBUvNlb0= golang.org/x/mod v0.18.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= golang.org/x/oauth2 v0.25.0 h1:CY4y7XT9v0cRI9oupztF8AgiIu99L/ksR/Xp/6jrZ70= golang.org/x/oauth2 v0.25.0/go.mod h1:XYTD2NtWslqkgxebSiOHnXEap4TF09sJSc7H1sXbhtI= -golang.org/x/sync v0.7.0 h1:YsImfSBoP9QPYL0xyKJPq0gcaJdG3rInoqxTWbfQu9M= -golang.org/x/sync v0.7.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= golang.org/x/sync v0.20.0 h1:e0PTpb7pjO8GAtTs2dQ6jYa5BWYlMuX047Dco/pItO4= golang.org/x/sync v0.20.0/go.mod h1:9xrNwdLfx4jkKbNva9FpL6vEN7evnE43NNNJQ2LF3+0= golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= diff --git a/internal/api/router.go b/internal/api/router.go index b56035c..3e58f8e 100644 --- a/internal/api/router.go +++ b/internal/api/router.go @@ -14,6 +14,7 @@ import ( "github.com/alexei/tinyforge/internal/dns" "github.com/alexei/tinyforge/internal/docker" "github.com/alexei/tinyforge/internal/events" + "github.com/alexei/tinyforge/internal/keyedmutex" "github.com/alexei/tinyforge/internal/notify" "github.com/alexei/tinyforge/internal/npm" "github.com/alexei/tinyforge/internal/proxy" @@ -56,6 +57,11 @@ type Server struct { // two concurrent syncs can't race on source_config (review S5). gitopsSync keyedMutex + // volRestoreInFlight is a per-workload single-flight guard for volume + // snapshot restore: a concurrent restore of the same workload is rejected + // fast with 409 (TryLock) rather than queuing behind the deployer lock. + volRestoreInFlight keyedmutex.Mutex + dnsProviderMu sync.RWMutex dnsProvider dns.Provider onDNSProviderChanged DNSProviderChangedFunc @@ -359,6 +365,10 @@ func (s *Server) Router() chi.Router { r.With(auth.AdminOnly).Get("/snapshots", s.listWorkloadSnapshots) r.With(auth.AdminOnly).Get("/snapshotable", s.getWorkloadSnapshotable) r.With(auth.AdminOnly).Post("/snapshots", s.createWorkloadSnapshot) + // Restore overwrites live volume data and restarts the app — the + // most destructive workload action. Admin-gated + X-Confirm-Restore + // header (CSRF) + per-workload single-flight, mirroring DB restore. + r.With(auth.AdminOnly).Post("/snapshots/{sid}/restore", s.restoreWorkloadSnapshot) // Runtime view: per-source persisted state + storage usage. // Read-only; safe for any authenticated user. diff --git a/internal/api/volume_snapshots.go b/internal/api/volume_snapshots.go index b2ea00b..846d86f 100644 --- a/internal/api/volume_snapshots.go +++ b/internal/api/volume_snapshots.go @@ -140,6 +140,72 @@ func (s *Server) deleteSnapshot(w http.ResponseWriter, r *http.Request) { respondJSON(w, http.StatusOK, map[string]string{"status": "deleted"}) } +// restoreWorkloadSnapshot handles POST /api/workloads/{id}/snapshots/{sid}/restore. +// +// This is the most destructive workload action: it overwrites the app's live +// volume data with the snapshot and recreates its containers. It is guarded like +// the DB restore — admin-only, an X-Confirm-Restore header that must echo the +// snapshot id (defeats CSRF form/img posts, which can't set custom headers), and +// a per-workload single-flight so a double-click can't stack two restores. All +// the dangerous lock/stop/swap/redeploy logic lives in Engine.Restore; this +// handler only validates and delegates. +func (s *Server) restoreWorkloadSnapshot(w http.ResponseWriter, r *http.Request) { + if s.snapshotEngine == nil { + respondError(w, http.StatusServiceUnavailable, "snapshot engine not initialized") + return + } + id := chi.URLParam(r, "id") + sid := chi.URLParam(r, "sid") + + if confirm := r.Header.Get("X-Confirm-Restore"); confirm != sid { + respondError(w, http.StatusBadRequest, + "missing or mismatched X-Confirm-Restore header (must equal snapshot id)") + return + } + + // Up-front validation for precise client errors (Engine.Restore re-checks + // ownership + source kind under the lock). + snap, err := s.snapshotEngine.Get(sid) + if err != nil { + respondError(w, http.StatusNotFound, "snapshot not found") + return + } + if snap.WorkloadID != id { + respondError(w, http.StatusBadRequest, "snapshot does not belong to this workload") + return + } + row, ok := s.loadWorkload(w, id) + if !ok { + return + } + if row.SourceKind != "image" { + respondError(w, http.StatusBadRequest, "restore is only supported for image-source workloads") + return + } + + // Per-workload single-flight: reject a concurrent restore of the SAME + // workload with 409 rather than queuing it behind the deployer lock. + release, ok := s.volRestoreInFlight.TryLock(id) + if !ok { + respondError(w, http.StatusConflict, "a restore is already in progress for this workload") + return + } + defer release() + + if err := s.snapshotEngine.Restore(r.Context(), sid, id); err != nil { + // Raw error (which can carry resolved host paths) stays in the log; the + // client gets a generic message. + slog.Error("snapshots: restore failed", "workload", id, "snapshot", sid, "error", err) + respondError(w, http.StatusInternalServerError, "restore failed; see server logs") + return + } + respondJSON(w, http.StatusOK, map[string]any{ + "status": "restored", + "workload_id": id, + "snapshot_id": sid, + }) +} + // downloadSnapshot handles GET /api/snapshots/{sid}/download, streaming the // tar.gz archive. The resolved path is containment-checked against the // snapshot directory. diff --git a/internal/api/volume_snapshots_test.go b/internal/api/volume_snapshots_test.go index a4adc3a..5c8e262 100644 --- a/internal/api/volume_snapshots_test.go +++ b/internal/api/volume_snapshots_test.go @@ -1,12 +1,15 @@ package api import ( + "context" "io" "net/http" "net/http/httptest" "os" "path/filepath" + "sync" "testing" + "time" "github.com/alexei/tinyforge/internal/auth" "github.com/alexei/tinyforge/internal/store" @@ -53,7 +56,211 @@ func newSnapshotEnv(t *testing.T) (*apiTestEnv, string) { t.Fatalf("update settings: %v", err) } - return &apiTestEnv{srv: httpsrv, store: st, dispatcher: dispatcher, adminToken: tok.Token, encKey: encKey}, baseVol + return &apiTestEnv{srv: httpsrv, store: st, dispatcher: dispatcher, adminToken: tok.Token, encKey: encKey, snapEngine: snapEng}, baseVol +} + +// doRestore issues an authenticated restore POST, optionally setting the +// X-Confirm-Restore header (pass confirm="" to omit it). +func (e *apiTestEnv) doRestore(t *testing.T, workloadID, sid, confirm string) *http.Response { + t.Helper() + req, err := http.NewRequest(http.MethodPost, + e.srv.URL+"/api/workloads/"+workloadID+"/snapshots/"+sid+"/restore", nil) + if err != nil { + t.Fatalf("new request: %v", err) + } + req.Header.Set("Authorization", "Bearer "+e.adminToken) + if confirm != "" { + req.Header.Set("X-Confirm-Restore", confirm) + } + resp, err := http.DefaultClient.Do(req) + if err != nil { + t.Fatalf("do request: %v", err) + } + return resp +} + +// okLifecycle is a no-op volsnap.Lifecycle for HTTP-layer happy-path tests; the +// deep restore behavior is covered by the volsnap engine tests. +type okLifecycle struct{ tag string } + +func (l *okLifecycle) Lock(string) func() { return func() {} } +func (l *okLifecycle) StopContainers(context.Context, string) (string, error) { return l.tag, nil } +func (l *okLifecycle) Redeploy(context.Context, store.Workload, string) error { return nil } + +func TestRestoreSnapshot_RequiresConfirmHeader(t *testing.T) { + e, _ := newSnapshotEnv(t) + w, _ := e.store.CreateWorkload(store.Workload{Name: "a", Kind: "project", SourceKind: "image", SourceConfig: `{"image":"x","port":80}`}) + snap, _ := e.store.CreateVolumeSnapshot(store.VolumeSnapshot{WorkloadID: w.ID, Filename: "f.tar.gz", Manifest: "[]"}) + + // Missing header → 400. + resp := e.doRestore(t, w.ID, snap.ID, "") + if resp.StatusCode != http.StatusBadRequest { + t.Fatalf("missing header status = %d, want 400", resp.StatusCode) + } + resp.Body.Close() + // Mismatched header → 400. + resp = e.doRestore(t, w.ID, snap.ID, "not-the-sid") + if resp.StatusCode != http.StatusBadRequest { + t.Fatalf("mismatched header status = %d, want 400", resp.StatusCode) + } + resp.Body.Close() +} + +func TestRestoreSnapshot_WrongWorkload(t *testing.T) { + e, _ := newSnapshotEnv(t) + w, _ := e.store.CreateWorkload(store.Workload{Name: "a", Kind: "project", SourceKind: "image", SourceConfig: `{"image":"x","port":80}`}) + snap, _ := e.store.CreateVolumeSnapshot(store.VolumeSnapshot{WorkloadID: w.ID, Filename: "f.tar.gz", Manifest: "[]"}) + + resp := e.doRestore(t, "some-other-workload", snap.ID, snap.ID) + if resp.StatusCode != http.StatusBadRequest { + t.Fatalf("cross-workload restore status = %d, want 400", resp.StatusCode) + } + resp.Body.Close() +} + +func TestRestoreSnapshot_NonImageWorkload(t *testing.T) { + e, _ := newSnapshotEnv(t) + w, _ := e.store.CreateWorkload(store.Workload{Name: "site", Kind: "project", SourceKind: "static", SourceConfig: `{}`}) + snap, _ := e.store.CreateVolumeSnapshot(store.VolumeSnapshot{WorkloadID: w.ID, Filename: "f.tar.gz", Manifest: "[]"}) + + resp := e.doRestore(t, w.ID, snap.ID, snap.ID) + if resp.StatusCode != http.StatusBadRequest { + t.Fatalf("non-image restore status = %d, want 400", resp.StatusCode) + } + resp.Body.Close() +} + +func TestRestoreSnapshot_NotFound(t *testing.T) { + e, _ := newSnapshotEnv(t) + w, _ := e.store.CreateWorkload(store.Workload{Name: "a", Kind: "project", SourceKind: "image", SourceConfig: `{"image":"x","port":80}`}) + + resp := e.doRestore(t, w.ID, "missing-sid", "missing-sid") + if resp.StatusCode != http.StatusNotFound { + t.Fatalf("unknown snapshot status = %d, want 404", resp.StatusCode) + } + resp.Body.Close() +} + +func TestRestoreSnapshot_HappyPath(t *testing.T) { + e, baseVol := newSnapshotEnv(t) + e.snapEngine.SetLifecycle(&okLifecycle{tag: "v1"}) + + w, err := e.store.CreateWorkload(store.Workload{ + Name: "data-app", Kind: "project", SourceKind: "image", + SourceConfig: `{"image":"reg/app","port":80,"volumes":[{"source":"data","target":"/data","scope":"project"}]}`, + }) + if err != nil { + t.Fatalf("create workload: %v", err) + } + if _, err := e.store.SetWorkloadVolume(store.WorkloadVolume{WorkloadID: w.ID, Target: "/data", Source: "data", Scope: "project"}); err != nil { + t.Fatalf("set volume: %v", err) + } + id8 := w.ID + if len(id8) > 8 { + id8 = id8[:8] + } + hostDir := filepath.Join(baseVol, "data-app-"+id8, "data") + if err := os.MkdirAll(hostDir, 0o755); err != nil { + t.Fatal(err) + } + if err := os.WriteFile(filepath.Join(hostDir, "payload.txt"), []byte("ORIGINAL"), 0o644); err != nil { + t.Fatal(err) + } + settings, _ := e.store.GetSettings() + snap, err := e.snapEngine.Create(w, settings, "base") + if err != nil { + t.Fatalf("create snapshot: %v", err) + } + // Drift the live data, then restore. + if err := os.WriteFile(filepath.Join(hostDir, "payload.txt"), []byte("CHANGED"), 0o644); err != nil { + t.Fatal(err) + } + + resp := e.doRestore(t, w.ID, snap.ID, snap.ID) + if resp.StatusCode != http.StatusOK { + body, _ := io.ReadAll(resp.Body) + resp.Body.Close() + t.Fatalf("restore status = %d, body=%s", resp.StatusCode, body) + } + resp.Body.Close() + if got, _ := os.ReadFile(filepath.Join(hostDir, "payload.txt")); string(got) != "ORIGINAL" { + t.Errorf("payload.txt = %q, want ORIGINAL (restored)", got) + } +} + +// blockingLifecycle blocks in Lock until released, signaling when entered — so +// a test can hold one restore in-flight and assert a second is rejected 409. +type blockingLifecycle struct { + entered chan struct{} + release chan struct{} + once sync.Once +} + +func (l *blockingLifecycle) Lock(string) func() { + l.once.Do(func() { close(l.entered) }) + <-l.release + return func() {} +} +func (l *blockingLifecycle) StopContainers(context.Context, string) (string, error) { return "", nil } +func (l *blockingLifecycle) Redeploy(context.Context, store.Workload, string) error { return nil } + +// seedRestorable creates an image workload with a project volume + live data and +// a captured snapshot, returning the workload and snapshot ids. +func seedRestorable(t *testing.T, e *apiTestEnv, baseVol string) (workloadID, snapshotID string) { + t.Helper() + w, err := e.store.CreateWorkload(store.Workload{ + Name: "sf-app", Kind: "project", SourceKind: "image", + SourceConfig: `{"image":"reg/app","port":80,"volumes":[{"source":"data","target":"/data","scope":"project"}]}`, + }) + if err != nil { + t.Fatalf("create workload: %v", err) + } + id8 := w.ID + if len(id8) > 8 { + id8 = id8[:8] + } + hostDir := filepath.Join(baseVol, "sf-app-"+id8, "data") + if err := os.MkdirAll(hostDir, 0o755); err != nil { + t.Fatal(err) + } + if err := os.WriteFile(filepath.Join(hostDir, "f.txt"), []byte("data"), 0o644); err != nil { + t.Fatal(err) + } + settings, _ := e.store.GetSettings() + snap, err := e.snapEngine.Create(w, settings, "base") + if err != nil { + t.Fatalf("create snapshot: %v", err) + } + return w.ID, snap.ID +} + +func TestRestoreSnapshot_SingleFlight409(t *testing.T) { + e, baseVol := newSnapshotEnv(t) + wid, sid := seedRestorable(t, e, baseVol) + bl := &blockingLifecycle{entered: make(chan struct{}), release: make(chan struct{})} + e.snapEngine.SetLifecycle(bl) + + // Restore #1: passes validation, takes the single-flight, then blocks inside + // the engine's Lock. + go func() { + resp := e.doRestore(t, wid, sid, sid) + resp.Body.Close() + }() + + select { + case <-bl.entered: + case <-time.After(3 * time.Second): + t.Fatal("first restore never reached the lifecycle lock") + } + + // Restore #2 for the same workload must be rejected fast with 409. + resp := e.doRestore(t, wid, sid, sid) + got := resp.StatusCode + resp.Body.Close() + close(bl.release) // let #1 finish + if got != http.StatusConflict { + t.Fatalf("concurrent restore status = %d, want 409", got) + } } func TestVolumeSnapshots_EndToEnd(t *testing.T) { diff --git a/internal/api/workloads_test.go b/internal/api/workloads_test.go index 4d815ad..03aee2e 100644 --- a/internal/api/workloads_test.go +++ b/internal/api/workloads_test.go @@ -15,6 +15,7 @@ import ( "github.com/alexei/tinyforge/internal/auth" "github.com/alexei/tinyforge/internal/crypto" "github.com/alexei/tinyforge/internal/store" + "github.com/alexei/tinyforge/internal/volsnap" "github.com/alexei/tinyforge/internal/webhook" "github.com/alexei/tinyforge/internal/workload/plugin" @@ -75,6 +76,7 @@ type apiTestEnv struct { dispatcher *fakeAPIDispatcher adminToken string encKey [32]byte + snapEngine *volsnap.Engine // set by newSnapshotEnv; nil otherwise } func (e *apiTestEnv) close() { e.srv.Close() } @@ -670,9 +672,9 @@ func TestGetWorkloadChain_ParentSelfChildren(t *testing.T) { resp := e.do(t, http.MethodGet, "/api/workloads/"+parentID+"/chain", nil) var got struct { - Parent *map[string]any `json:"parent"` - Self map[string]any `json:"self"` - Children []map[string]any `json:"children"` + Parent *map[string]any `json:"parent"` + Self map[string]any `json:"self"` + Children []map[string]any `json:"children"` } if errMsg := decodeEnvelope(t, resp, &got); errMsg != "" { t.Fatalf("envelope error: %q", errMsg) diff --git a/internal/deployer/deployer.go b/internal/deployer/deployer.go index dfe8957..95c7bba 100644 --- a/internal/deployer/deployer.go +++ b/internal/deployer/deployer.go @@ -5,6 +5,7 @@ package deployer import ( + "context" "fmt" "log/slog" "sync" @@ -14,9 +15,11 @@ import ( "github.com/alexei/tinyforge/internal/docker" "github.com/alexei/tinyforge/internal/events" "github.com/alexei/tinyforge/internal/health" + "github.com/alexei/tinyforge/internal/keyedmutex" "github.com/alexei/tinyforge/internal/notify" "github.com/alexei/tinyforge/internal/proxy" "github.com/alexei/tinyforge/internal/store" + "github.com/alexei/tinyforge/internal/workload/plugin" ) // Deployer owns the dependency bundle each Source plugin needs at deploy @@ -49,6 +52,29 @@ type Deployer struct { drainMu sync.Mutex activeWg sync.WaitGroup shuttingDown atomic.Bool + + // workloadLocks serializes deploy-class operations per workload id so two + // concurrent mutators of the same workload (a manual deploy, a webhook/ + // trigger dispatch, a rollback, a promote, OR a volume-snapshot restore) + // can never interleave their container/volume changes. Every deploy + // entrypoint funnels through DispatchPlugin, so locking there gates them + // all at one choke point. This is the per-workload lock activeWg is NOT + // (activeWg is a global drain barrier for graceful shutdown). + workloadLocks keyedmutex.Mutex +} + +// LockWorkload acquires the per-workload deploy lock for an external critical +// section (volume-snapshot restore) and returns the release func. The restore +// flow holds this across stop→swap→redeploy and redeploys via RedeployLocked +// (which does NOT re-acquire it). +func (d *Deployer) LockWorkload(id string) func() { return d.workloadLocks.Lock(id) } + +// RedeployLocked re-dispatches w WITHOUT acquiring the per-workload lock, +// because the caller (restore) already holds it via LockWorkload. Calling the +// normal DispatchPlugin here would deadlock — Go mutexes are not reentrant. +// Not for general use. +func (d *Deployer) RedeployLocked(ctx context.Context, w plugin.Workload, intent plugin.DeploymentIntent) error { + return d.dispatchLocked(ctx, w, intent) } // EventPublisher is the interface for publishing events to the event bus. diff --git a/internal/deployer/dispatch.go b/internal/deployer/dispatch.go index a4ffd46..79a44c4 100644 --- a/internal/deployer/dispatch.go +++ b/internal/deployer/dispatch.go @@ -15,6 +15,18 @@ import ( // operator enables auto_backup_before_deploy, a pre-deploy Tinyforge DB // snapshot is taken here, after the source resolves and before it runs. func (d *Deployer) DispatchPlugin(ctx context.Context, w plugin.Workload, intent plugin.DeploymentIntent) error { + // C1: serialize all deploy-class work per workload. Held across the whole + // deploy so a concurrent deploy/rollback/promote/trigger — or a volume + // restore (which redeploys via RedeployLocked while holding this) — can + // never interleave container changes for the same workload. + unlock := d.workloadLocks.Lock(w.ID) + defer unlock() + return d.dispatchLocked(ctx, w, intent) +} + +// dispatchLocked is DispatchPlugin's body, assuming the per-workload lock is +// already held. RedeployLocked calls it directly during restore. +func (d *Deployer) dispatchLocked(ctx context.Context, w plugin.Workload, intent plugin.DeploymentIntent) error { if err := d.beginDispatch(); err != nil { metrics.DeploysTotal.Inc(w.SourceKind, "rejected_draining") return err @@ -52,6 +64,12 @@ func (d *Deployer) DispatchPlugin(ctx context.Context, w plugin.Workload, intent // Used when a workload is deleted. Tracked via activeWg so Drain() honours // in-progress teardowns just like deploys. func (d *Deployer) DispatchTeardown(ctx context.Context, w plugin.Workload) error { + // Teardown mutates the same containers/routes a deploy does, so it takes the + // per-workload lock too (C1). Callers tear down distinct workload ids + // sequentially (e.g. preview children then parent), never nested, so no + // self-deadlock. + unlock := d.workloadLocks.Lock(w.ID) + defer unlock() if err := d.beginDispatch(); err != nil { return err } diff --git a/internal/keyedmutex/keyedmutex.go b/internal/keyedmutex/keyedmutex.go new file mode 100644 index 0000000..cc614a1 --- /dev/null +++ b/internal/keyedmutex/keyedmutex.go @@ -0,0 +1,48 @@ +// Package keyedmutex provides a lazily-populated per-key mutex, so a critical +// section can be serialized per key (e.g. per workload id) without a global +// lock. It is the shared form of the pattern that originated inline in the +// GitOps sync handler; the deployer (per-workload deploy serialization) and the +// volume-snapshot restore single-flight both use it. +package keyedmutex + +import "sync" + +// Mutex hands out one *sync.Mutex per key on demand. The zero value is ready to +// use. The internal map only grows (one entry per distinct key ever locked), +// which is bounded in practice by the number of workloads. +type Mutex struct { + mu sync.Mutex + m map[string]*sync.Mutex +} + +func (k *Mutex) get(key string) *sync.Mutex { + k.mu.Lock() + defer k.mu.Unlock() + if k.m == nil { + k.m = make(map[string]*sync.Mutex) + } + mu, ok := k.m[key] + if !ok { + mu = &sync.Mutex{} + k.m[key] = mu + } + return mu +} + +// Lock blocks until the mutex for key is acquired, then returns its unlock func. +func (k *Mutex) Lock(key string) func() { + mu := k.get(key) + mu.Lock() + return mu.Unlock +} + +// TryLock attempts to acquire the mutex for key without blocking. On success it +// returns the unlock func and true; if the key is already locked it returns nil +// and false so the caller can reject (e.g. HTTP 409) instead of queuing. +func (k *Mutex) TryLock(key string) (func(), bool) { + mu := k.get(key) + if !mu.TryLock() { + return nil, false + } + return mu.Unlock, true +} diff --git a/internal/keyedmutex/keyedmutex_test.go b/internal/keyedmutex/keyedmutex_test.go new file mode 100644 index 0000000..ecd5047 --- /dev/null +++ b/internal/keyedmutex/keyedmutex_test.go @@ -0,0 +1,83 @@ +package keyedmutex + +import ( + "sync" + "testing" + "time" +) + +func TestLockSerializesSameKey(t *testing.T) { + var m Mutex + unlock := m.Lock("a") + + acquired := make(chan struct{}) + go func() { + u := m.Lock("a") + close(acquired) + u() + }() + + select { + case <-acquired: + t.Fatal("second Lock on the same key acquired while the first was held") + case <-time.After(50 * time.Millisecond): + // expected: blocked + } + unlock() + select { + case <-acquired: + // expected: now acquired + case <-time.After(time.Second): + t.Fatal("second Lock did not acquire after release") + } +} + +func TestLockIndependentKeys(t *testing.T) { + var m Mutex + unlockA := m.Lock("a") + defer unlockA() + // A different key must not block. + done := make(chan struct{}) + go func() { u := m.Lock("b"); u(); close(done) }() + select { + case <-done: + case <-time.After(time.Second): + t.Fatal("Lock on an independent key blocked") + } +} + +func TestTryLock(t *testing.T) { + var m Mutex + unlock, ok := m.TryLock("a") + if !ok { + t.Fatal("TryLock should succeed on a free key") + } + if _, ok := m.TryLock("a"); ok { + t.Fatal("TryLock should fail while the key is held") + } + unlock() + u2, ok := m.TryLock("a") + if !ok { + t.Fatal("TryLock should succeed after release") + } + u2() +} + +func TestConcurrentLockNoRace(t *testing.T) { + var m Mutex + var wg sync.WaitGroup + counter := 0 + for i := 0; i < 50; i++ { + wg.Add(1) + go func() { + defer wg.Done() + u := m.Lock("shared") + counter++ // protected by the keyed lock + u() + }() + } + wg.Wait() + if counter != 50 { + t.Errorf("counter = %d, want 50 (lost updates ⇒ lock not serializing)", counter) + } +} diff --git a/internal/volsnap/disk_unix.go b/internal/volsnap/disk_unix.go new file mode 100644 index 0000000..3a848f7 --- /dev/null +++ b/internal/volsnap/disk_unix.go @@ -0,0 +1,21 @@ +//go:build !windows + +package volsnap + +import ( + "fmt" + + "golang.org/x/sys/unix" +) + +// freeDiskBytes returns the bytes available to an unprivileged process on the +// filesystem backing path (used by the restore disk pre-check, C5). path must +// exist; callers pass the live dir's parent. +func freeDiskBytes(path string) (uint64, error) { + var st unix.Statfs_t + if err := unix.Statfs(path, &st); err != nil { + return 0, fmt.Errorf("statfs %s: %w", path, err) + } + // Bavail is blocks available to non-root; Bsize is the fragment size. + return st.Bavail * uint64(st.Bsize), nil +} diff --git a/internal/volsnap/disk_windows.go b/internal/volsnap/disk_windows.go new file mode 100644 index 0000000..6204f9e --- /dev/null +++ b/internal/volsnap/disk_windows.go @@ -0,0 +1,24 @@ +//go:build windows + +package volsnap + +import ( + "fmt" + + "golang.org/x/sys/windows" +) + +// freeDiskBytes returns the bytes available to the caller on the volume backing +// path (used by the restore disk pre-check, C5). Windows is the dev platform; +// production runs on Linux (see disk_unix.go). +func freeDiskBytes(path string) (uint64, error) { + p, err := windows.UTF16PtrFromString(path) + if err != nil { + return 0, fmt.Errorf("encode path %s: %w", path, err) + } + var freeAvail, total, totalFree uint64 + if err := windows.GetDiskFreeSpaceEx(p, &freeAvail, &total, &totalFree); err != nil { + return 0, fmt.Errorf("GetDiskFreeSpaceEx %s: %w", path, err) + } + return freeAvail, nil +} diff --git a/internal/volsnap/engine.go b/internal/volsnap/engine.go index fe12075..b431213 100644 --- a/internal/volsnap/engine.go +++ b/internal/volsnap/engine.go @@ -31,6 +31,12 @@ type Engine struct { mu sync.Mutex store *store.Store snapDir string + + // lifecycle is the deploy-side seam restore needs (per-workload lock, stop, + // redeploy). Wired post-construction via SetLifecycle from the composition + // root so volsnap stays decoupled from the deployer/docker packages. nil + // until wired; Restore refuses to run without it. + lifecycle Lifecycle } // New creates the snapshot engine, ensuring the snapshot directory exists. diff --git a/internal/volsnap/extract.go b/internal/volsnap/extract.go new file mode 100644 index 0000000..20f3cf6 --- /dev/null +++ b/internal/volsnap/extract.go @@ -0,0 +1,162 @@ +package volsnap + +import ( + "archive/tar" + "compress/gzip" + "fmt" + "io" + "os" + "path" + "path/filepath" + "strconv" + "strings" +) + +// safeExtractIndex extracts the files archived under the integer subdirectory +// `index` of a snapshot tar.gz into dest, returning the total bytes written. +// +// On RESTORE the archive is treated as UNTRUSTED (it may have been downloaded, +// hand-edited, or swapped on disk), so this is hardened well beyond what the +// capture writer emits: +// +// - zip-slip: every resolved target must stay within dest (HasPrefix check on +// the cleaned absolute path) — a "../" or absolute member is rejected. +// - type allow-list: ONLY regular files and directories are materialized; +// symlinks, hardlinks, char/block devices, fifos, and sockets are rejected +// outright (never created, never followed) — they could redirect a write +// outside the volume or smuggle in a device node. +// - decompression bomb: a running byte counter is capped at bombCap; the first +// byte past the cap aborts the extraction. +// +// dest must be a fresh staging directory (files are created O_EXCL). The caller +// performs the atomic rename-swap of dest onto the live path separately. +func safeExtractIndex(archivePath string, index int, dest string, bombCap int64) (int64, error) { + f, err := os.Open(archivePath) + if err != nil { + return 0, fmt.Errorf("open archive: %w", err) + } + defer f.Close() + + gz, err := gzip.NewReader(f) + if err != nil { + return 0, fmt.Errorf("gzip reader: %w", err) + } + defer gz.Close() + + cleanDest, err := filepath.Abs(dest) + if err != nil { + return 0, fmt.Errorf("resolve dest: %w", err) + } + if err := os.MkdirAll(cleanDest, 0o700); err != nil { + return 0, fmt.Errorf("create dest: %w", err) + } + + tr := tar.NewReader(gz) + var written int64 + for { + hdr, err := tr.Next() + if err == io.EOF { + break + } + if err != nil { + return written, fmt.Errorf("read tar: %w", err) + } + + // Archive paths are always forward-slash. path.Clean collapses any + // "./" / "../" so the prefix and containment checks see a normal form. + name := path.Clean(hdr.Name) + if name == "manifest.json" { + continue + } + rel, ok := stripIndexPrefix(name, index) + if !ok { + continue // belongs to a different volume's subtree + } + + switch hdr.Typeflag { + case tar.TypeReg, tar.TypeDir: + // allowed + default: + return written, fmt.Errorf("archive entry %q has disallowed type %q", hdr.Name, string(hdr.Typeflag)) + } + + target := cleanDest + if rel != "" { + target = filepath.Join(cleanDest, filepath.FromSlash(rel)) + } + if !withinDir(cleanDest, target) { + return written, fmt.Errorf("archive entry %q escapes destination", hdr.Name) + } + + if hdr.Typeflag == tar.TypeDir { + if err := os.MkdirAll(target, 0o700); err != nil { + return written, fmt.Errorf("mkdir %s: %w", target, err) + } + continue + } + + if err := os.MkdirAll(filepath.Dir(target), 0o700); err != nil { + return written, fmt.Errorf("mkdir parent of %s: %w", target, err) + } + remaining := bombCap - written + if remaining <= 0 { + return written, fmt.Errorf("archive exceeds decompression cap of %d bytes", bombCap) + } + out, err := os.OpenFile(target, os.O_CREATE|os.O_EXCL|os.O_WRONLY, 0o600) + if err != nil { + return written, fmt.Errorf("create %s: %w", target, err) + } + // LimitReader to remaining+1: if the entry is larger than the cap allows, + // the extra byte is copied and written>bombCap trips the guard below. + n, copyErr := io.Copy(out, io.LimitReader(tr, remaining+1)) + closeErr := out.Close() + written += n + if copyErr != nil { + return written, fmt.Errorf("write %s: %w", target, copyErr) + } + if closeErr != nil { + return written, fmt.Errorf("close %s: %w", target, closeErr) + } + if written > bombCap { + return written, fmt.Errorf("archive exceeds decompression cap of %d bytes", bombCap) + } + } + return written, nil +} + +// stripIndexPrefix returns the path relative to the "/" subtree and +// whether name belongs to it. name=="" (the subtree root) yields ("", true). +// The "/" boundary keeps index 1 from matching "10/...". +func stripIndexPrefix(name string, index int) (string, bool) { + p := strconv.Itoa(index) + if name == p { + return "", true + } + if strings.HasPrefix(name, p+"/") { + return name[len(p)+1:], true + } + return "", false +} + +// leadingIndex parses the first path segment of an archive entry name as the +// volume index. Returns false for manifest.json or any non-integer prefix. +func leadingIndex(name string) (int, bool) { + seg := name + if i := strings.IndexByte(name, '/'); i >= 0 { + seg = name[:i] + } + idx, err := strconv.Atoi(seg) + if err != nil { + return 0, false + } + return idx, true +} + +// withinDir reports whether target is base itself or lives beneath it. Both +// args must already be cleaned absolute paths. +func withinDir(base, target string) bool { + if target == base { + return true + } + return strings.HasPrefix(target, base+string(filepath.Separator)) +} diff --git a/internal/volsnap/extract_test.go b/internal/volsnap/extract_test.go new file mode 100644 index 0000000..9ed7856 --- /dev/null +++ b/internal/volsnap/extract_test.go @@ -0,0 +1,166 @@ +package volsnap + +import ( + "archive/tar" + "compress/gzip" + "os" + "path/filepath" + "strings" + "testing" +) + +type tentry struct { + name string + typeflag byte + body string + linkname string +} + +// buildTarGz writes an arbitrary (possibly hostile) tar.gz so the extractor's +// untrusted-input hardening can be exercised — writeArchive only emits well- +// formed reg/dir entries, which is the wrong shape for these tests. +func buildTarGz(t *testing.T, entries []tentry) string { + t.Helper() + dest := filepath.Join(t.TempDir(), "snap.tar.gz") + f, err := os.Create(dest) + if err != nil { + t.Fatal(err) + } + gz := gzip.NewWriter(f) + tw := tar.NewWriter(gz) + for _, e := range entries { + hdr := &tar.Header{Name: e.name, Typeflag: e.typeflag, Mode: 0o600, Linkname: e.linkname} + switch e.typeflag { + case tar.TypeReg: + hdr.Size = int64(len(e.body)) + case tar.TypeChar, tar.TypeBlock: + hdr.Devmajor, hdr.Devminor = 1, 1 + } + if err := tw.WriteHeader(hdr); err != nil { + t.Fatal(err) + } + if e.typeflag == tar.TypeReg { + if _, err := tw.Write([]byte(e.body)); err != nil { + t.Fatal(err) + } + } + } + if err := tw.Close(); err != nil { + t.Fatal(err) + } + if err := gz.Close(); err != nil { + t.Fatal(err) + } + if err := f.Close(); err != nil { + t.Fatal(err) + } + return dest +} + +func TestSafeExtractIndex_RoundTrip(t *testing.T) { + arc := buildTarGz(t, []tentry{ + {name: "0/", typeflag: tar.TypeDir}, + {name: "0/a.txt", typeflag: tar.TypeReg, body: "hello"}, + {name: "0/sub/", typeflag: tar.TypeDir}, + {name: "0/sub/b.txt", typeflag: tar.TypeReg, body: "world"}, + {name: "manifest.json", typeflag: tar.TypeReg, body: "[]"}, + }) + dest := filepath.Join(t.TempDir(), "out") + n, err := safeExtractIndex(arc, 0, dest, maxRestoreUncompressedBytes) + if err != nil { + t.Fatalf("extract: %v", err) + } + if n != int64(len("hello")+len("world")) { + t.Errorf("written = %d, want %d", n, len("hello")+len("world")) + } + if got, _ := os.ReadFile(filepath.Join(dest, "a.txt")); string(got) != "hello" { + t.Errorf("a.txt = %q", got) + } + if got, _ := os.ReadFile(filepath.Join(dest, "sub", "b.txt")); string(got) != "world" { + t.Errorf("sub/b.txt = %q", got) + } + if _, err := os.Stat(filepath.Join(dest, "manifest.json")); !os.IsNotExist(err) { + t.Error("manifest.json must not be extracted into the volume") + } +} + +func TestSafeExtractIndex_IsolatesIndex(t *testing.T) { + arc := buildTarGz(t, []tentry{ + {name: "1/keep.txt", typeflag: tar.TypeReg, body: "one"}, + {name: "10/other.txt", typeflag: tar.TypeReg, body: "ten"}, + {name: "2/nope.txt", typeflag: tar.TypeReg, body: "two"}, + }) + dest := filepath.Join(t.TempDir(), "out") + if _, err := safeExtractIndex(arc, 1, dest, maxRestoreUncompressedBytes); err != nil { + t.Fatalf("extract: %v", err) + } + if _, err := os.Stat(filepath.Join(dest, "keep.txt")); err != nil { + t.Errorf("index 1 file missing: %v", err) + } + // "10/" must not bleed into index 1 (prefix boundary), nor "2/". + for _, leaked := range []string{"other.txt", "nope.txt"} { + if _, err := os.Stat(filepath.Join(dest, leaked)); !os.IsNotExist(err) { + t.Errorf("index 1 extraction leaked %q from another index", leaked) + } + } +} + +func TestSafeExtractIndex_RejectsDisallowedTypes(t *testing.T) { + cases := map[string]tentry{ + "symlink": {name: "0/link", typeflag: tar.TypeSymlink, linkname: "/etc/passwd"}, + "hardlink": {name: "0/hard", typeflag: tar.TypeLink, linkname: "0/real"}, + "chardev": {name: "0/cdev", typeflag: tar.TypeChar}, + "blockdev": {name: "0/bdev", typeflag: tar.TypeBlock}, + "fifo": {name: "0/fifo", typeflag: tar.TypeFifo}, + "sparse": {name: "0/sparse", typeflag: tar.TypeCont}, // GNU sparse / contiguous + } + for name, ent := range cases { + t.Run(name, func(t *testing.T) { + arc := buildTarGz(t, []tentry{ent}) + dest := filepath.Join(t.TempDir(), "out") + if _, err := safeExtractIndex(arc, 0, dest, maxRestoreUncompressedBytes); err == nil { + t.Fatalf("expected %s entry to be rejected", name) + } + }) + } +} + +func TestSafeExtractIndex_RejectsBomb(t *testing.T) { + arc := buildTarGz(t, []tentry{ + {name: "0/big.bin", typeflag: tar.TypeReg, body: strings.Repeat("x", 4096)}, + }) + dest := filepath.Join(t.TempDir(), "out") + if _, err := safeExtractIndex(arc, 0, dest, 1024); err == nil { + t.Fatal("expected extraction to abort past the decompression cap") + } +} + +func TestSafeExtractIndex_NoEscapeOutsideDest(t *testing.T) { + // A "../" climb and an absolute member must never materialize a file + // outside dest, regardless of which guard catches it. Note the backslash + // case is platform-split: on Windows `..\winslip.txt` is a real climb that + // withinDir rejects; on Linux it is a literal one-segment filename that + // stays harmlessly inside dest (no guard fires). Both satisfy containment. + arc := buildTarGz(t, []tentry{ + {name: "0/../../escape.txt", typeflag: tar.TypeReg, body: "pwned"}, + {name: "/abs-escape.txt", typeflag: tar.TypeReg, body: "pwned"}, + {name: `0/..\winslip.txt`, typeflag: tar.TypeReg, body: "pwned"}, + {name: "0/ok.txt", typeflag: tar.TypeReg, body: "fine"}, + }) + outParent := t.TempDir() + dest := filepath.Join(outParent, "out") + // May or may not error depending on platform/guard; the invariant is that + // nothing escapes dest. + _, _ = safeExtractIndex(arc, 0, dest, maxRestoreUncompressedBytes) + + for _, escaped := range []string{ + filepath.Join(outParent, "escape.txt"), + filepath.Join(filepath.Dir(outParent), "escape.txt"), + filepath.Join(outParent, "abs-escape.txt"), + filepath.Join(outParent, "winslip.txt"), + } { + if _, err := os.Stat(escaped); err == nil { + t.Errorf("zip-slip escaped to %s", escaped) + } + } +} diff --git a/internal/volsnap/restore.go b/internal/volsnap/restore.go new file mode 100644 index 0000000..29bc48a --- /dev/null +++ b/internal/volsnap/restore.go @@ -0,0 +1,235 @@ +package volsnap + +import ( + "archive/tar" + "compress/gzip" + "encoding/json" + "fmt" + "io" + "os" + "path" + "path/filepath" + + "github.com/alexei/tinyforge/internal/store" + "github.com/alexei/tinyforge/internal/volume" +) + +// maxRestoreUncompressedBytes caps the total decompressed size accepted from a +// snapshot archive during restore (decompression-bomb defence). 50 GiB is far +// above any realistic app data volume while still bounding a hostile archive. +const maxRestoreUncompressedBytes int64 = 50 << 30 + +// diskFreeHeadroomBytes is extra free space required beyond the extracted size +// so a restore never fills the target filesystem to the brim. The live copy is +// renamed aside (no new space), so the new allocation is ~the extracted size; +// this headroom covers filesystem overhead and metadata. +const diskFreeHeadroomBytes int64 = 256 << 20 + +// resolvedVol is a manifest volume whose live host path has been re-resolved +// against the workload's CURRENT config (all-or-nothing pre-flight, C3). +type resolvedVol struct { + Index int + Target string + Scope string + LivePath string +} + +// parseManifest decodes the snapshot row's manifest JSON ([]SnapshotVolume). +func parseManifest(snap store.VolumeSnapshot) ([]SnapshotVolume, error) { + var m []SnapshotVolume + if err := json.Unmarshal([]byte(snap.Manifest), &m); err != nil { + return nil, fmt.Errorf("parse snapshot manifest: %w", err) + } + if len(m) == 0 { + return nil, fmt.Errorf("snapshot manifest is empty") + } + return m, nil +} + +// preflightResolve re-derives every manifest volume's live host path from the +// workload's CURRENT config, ALL-OR-NOTHING (C3): if any snapshotted target is +// no longer declared, its scope is unsupported, or it can't resolve, it returns +// an error and the caller MUST abort BEFORE stopping containers or touching +// disk — config drift mid-restore is silent corruption. +// +// SECURITY: the swap target is keyed on the manifest's container Target path but +// its host directory is derived from the CURRENT (trusted, operator-set) +// Source/Scope — never from the snapshot manifest's persisted Source/Scope. The +// manifest column is attacker-influenceable (e.g. a restored/tampered DB), and +// trusting its Source for stage/project scope would let `Source:"../../etc"` +// redirect the destructive rename-swap outside the volume tree. As defence in +// depth, base-relative resolved paths are asserted to stay under BaseVolumePath. +func preflightResolve(st *store.Store, w store.Workload, settings store.Settings, manifest []SnapshotVolume) ([]resolvedVol, error) { + current, err := volumesByTarget(st, w) + if err != nil { + return nil, fmt.Errorf("load current volumes: %w", err) + } + params := volume.ResolveWorkloadParams{ + BasePath: settings.BaseVolumePath, + WorkloadID: w.ID, + WorkloadName: w.Name, + AllowedVolumePaths: settings.AllowedVolumePaths, + } + out := make([]resolvedVol, 0, len(manifest)) + for _, mv := range manifest { + // A negative index can never name an archive subtree. + if mv.Index < 0 { + return nil, fmt.Errorf("volume %q has invalid index %d", mv.Target, mv.Index) + } + cur, ok := current[mv.Target] + if !ok { + return nil, fmt.Errorf("volume %q is no longer declared by the workload", mv.Target) + } + if !supportedScopes[cur.Scope] { + return nil, fmt.Errorf("volume %q scope %q is not restorable", mv.Target, cur.Scope) + } + live, err := volume.ResolveWorkloadPath(cur, params) + if err != nil { + return nil, fmt.Errorf("resolve volume %q (%s): %w", mv.Target, cur.Scope, err) + } + // Containment: the destructive swap target must stay inside the volume + // root. Base-relative scopes must resolve under BaseVolumePath; absolute + // scope is already constrained to AllowedVolumePaths by the resolver. + if cur.Scope != string(store.VolumeScopeAbsolute) { + contained, cerr := pathWithinBase(settings.BaseVolumePath, live) + if cerr != nil || !contained { + return nil, fmt.Errorf("resolved path for volume %q escapes the volume root", mv.Target) + } + } + out = append(out, resolvedVol{Index: mv.Index, Target: mv.Target, Scope: cur.Scope, LivePath: live}) + } + return out, nil +} + +// pathWithinBase reports whether target resolves to base or a path beneath it. +// An empty base is treated as non-containing (refuse rather than allow). +func pathWithinBase(base, target string) (bool, error) { + if base == "" { + return false, nil + } + absBase, err := filepath.Abs(base) + if err != nil { + return false, err + } + absTarget, err := filepath.Abs(target) + if err != nil { + return false, err + } + return withinDir(absBase, absTarget), nil +} + +// archiveUncompressedSize scans the archive's tar headers and returns the +// per-index and total uncompressed sizes, enforcing bombCap so a hostile +// archive can't make the disk pre-check allocate unbounded. Feeds the +// per-filesystem free-space pre-check (C5). +// +// The total is a LOWER-BOUND estimate of on-disk consumption: it sums regular- +// file bytes only, ignoring directory entries and per-file inode/block-rounding +// overhead, so a volume of many tiny files consumes more than reported. The +// real safety net is the staged extract + atomic swap (a mid-extract ENOSPC +// discards the staging dir and leaves live untouched), not this pre-check. +// +// "No body copy" is at the API level only — tar.Next still inflates and +// discards each skipped body, so a 50 GiB-of-headers archive does 50 GiB of +// gzip work; bombCap bounds that. +func archiveUncompressedSize(archivePath string, bombCap int64) (perIndex map[int]int64, total int64, err error) { + f, err := os.Open(archivePath) + if err != nil { + return nil, 0, fmt.Errorf("open archive: %w", err) + } + defer f.Close() + gz, err := gzip.NewReader(f) + if err != nil { + return nil, 0, fmt.Errorf("gzip reader: %w", err) + } + defer gz.Close() + + perIndex = map[int]int64{} + tr := tar.NewReader(gz) + for { + hdr, e := tr.Next() + if e == io.EOF { + break + } + if e != nil { + return nil, 0, fmt.Errorf("read tar: %w", e) + } + if hdr.Typeflag != tar.TypeReg { + continue + } + name := path.Clean(hdr.Name) + if name == "manifest.json" { + continue + } + idx, ok := leadingIndex(name) + if !ok { + continue + } + total += hdr.Size + if total > bombCap { + return nil, 0, fmt.Errorf("archive exceeds decompression cap of %d bytes", bombCap) + } + perIndex[idx] += hdr.Size + } + return perIndex, total, nil +} + +// swap records one volume's atomic dir replacement so it can be rolled back. +type swap struct { + live string + old string // where the prior live dir was set aside ("" if live didn't exist) + tmp string // staging dir holding the freshly-extracted data + hadOld bool // whether a prior live dir existed and was moved to old +} + +// stagingDirs returns the per-volume tmp and old staging paths as SIBLINGS of +// the live dir's parent, so every rename in the swap is intra-filesystem and +// therefore atomic (R2). A cross-device rename (live is itself a mountpoint) +// fails loudly in swapVolumeDir rather than silently degrading to a copy. +func stagingDirs(live, token string, index int) (tmp, old string) { + parent := filepath.Dir(live) + base := fmt.Sprintf(".tf-restore-%s-%d", token, index) + return filepath.Join(parent, base+".tmp"), filepath.Join(parent, base+".old") +} + +// swapVolumeDir performs the crash-minimal two-rename swap: set the live dir +// aside to old (if it exists), then move the staged tmp into place (C2). On the +// second rename failing it reverts the first so live is never left missing. +// Returns whether a prior live dir was preserved at old (for rollback). +func swapVolumeDir(live, tmp, old string) (hadOld bool, err error) { + if _, statErr := os.Lstat(live); statErr == nil { + if rerr := os.Rename(live, old); rerr != nil { + return false, fmt.Errorf("set aside live %s: %w", live, rerr) + } + hadOld = true + } else if !os.IsNotExist(statErr) { + return false, fmt.Errorf("stat live %s: %w", live, statErr) + } + + if mkErr := os.MkdirAll(filepath.Dir(live), 0o700); mkErr != nil { + if hadOld { + _ = os.Rename(old, live) + } + return hadOld, fmt.Errorf("ensure parent of %s: %w", live, mkErr) + } + if rerr := os.Rename(tmp, live); rerr != nil { + if hadOld { + _ = os.Rename(old, live) // revert: live is never left missing + } + return hadOld, fmt.Errorf("promote restored data into %s: %w", live, rerr) + } + return hadOld, nil +} + +// rollbackSwaps reverts completed swaps in reverse order: drop the restored +// live dir and move the preserved original back. Best-effort — each step is +// logged by the caller; rollback must attempt every volume regardless. +func rollbackSwaps(done []swap) { + for i := len(done) - 1; i >= 0; i-- { + s := done[i] + _ = os.RemoveAll(s.live) + if s.hadOld { + _ = os.Rename(s.old, s.live) + } + } +} diff --git a/internal/volsnap/restore_engine.go b/internal/volsnap/restore_engine.go new file mode 100644 index 0000000..ec88de1 --- /dev/null +++ b/internal/volsnap/restore_engine.go @@ -0,0 +1,400 @@ +package volsnap + +import ( + "context" + "encoding/json" + "fmt" + "log/slog" + "os" + "path/filepath" + "strings" + + "github.com/google/uuid" + + "github.com/alexei/tinyforge/internal/store" +) + +// Lifecycle is the deploy-side seam Engine.Restore needs but volsnap must not +// import directly (it would couple the snapshot package to the deployer/docker +// packages). The composition root supplies an adapter over the Deployer + +// Docker client via Engine.SetLifecycle. +type Lifecycle interface { + // Lock acquires the per-workload deploy lock (C1) and returns the release + // func. Held by Restore across stop→swap→redeploy. + Lock(workloadID string) func() + // StopContainers stops every running container for the workload (C4 quiesce) + // and returns the image tag the newest running container was on, so the + // redeploy can bring the SAME version back up ("" ⇒ source default tag). + StopContainers(ctx context.Context, workloadID string) (runningTag string, err error) + // Redeploy re-dispatches the workload's current config WITHOUT re-acquiring + // the per-workload lock (the caller holds it). reference pins the image tag. + Redeploy(ctx context.Context, w store.Workload, reference string) error +} + +// SetLifecycle wires the deploy-side seam. Pass nil to leave restore disabled. +func (e *Engine) SetLifecycle(lc Lifecycle) { e.lifecycle = lc } + +// restoreJournal is the on-disk write-ahead record of an in-flight restore. +// Written before the first destructive rename and deleted on completion; the +// startup RecoverInterruptedRestores sweep replays it after a crash. +type restoreJournal struct { + SnapshotID string `json:"snapshot_id"` + WorkloadID string `json:"workload_id"` + Volumes []journalVolume `json:"volumes"` +} + +type journalVolume struct { + Live string `json:"live"` + Old string `json:"old"` + Tmp string `json:"tmp"` + Swapped bool `json:"swapped"` + HadOld bool `json:"had_old"` +} + +// staged pairs a resolved volume with its per-restore staging dirs. +type staged struct { + rv resolvedVol + tmp string + old string +} + +// Restore overwrites the workload's live host-bind volumes with a snapshot's +// contents and brings the app back up. It is the single, engine-owned entry +// point for the data-loss-sensitive restore flow (image-source workloads only). +// +// Ordering is deliberate and crash-aware: +// +// pre-flight (re-resolve all volumes C3, size + per-fs disk check C5) — abort +// here touches nothing +// → Lock (C1) → re-validate workload → StopContainers (C4 quiesce) +// → extract ALL volumes to sibling .tmp staging dirs (reads the source archive +// fully BEFORE the next step can prune it; shrinks the later destructive +// window to pure renames — R3) +// → capture a pre-restore snapshot (durable escape hatch, after quiesce, +// before any destructive rename — folded suggestion) +// → write the restore journal (R3 crash recovery) +// → swap each volume atomically (rename live→.old, .tmp→live — C2) +// → Redeploy (C4 — image containers are recreated, never reused) +// → remove .old + journal, emit audit event +// +// Engine.Restore holds NO e.mu (R1): per-workload serialization is the +// Lifecycle lock; e.Create takes its own e.mu for the pre-restore archive +// write, so calling it here cannot self-deadlock. +func (e *Engine) Restore(ctx context.Context, snapshotID, workloadID string) error { + if e.lifecycle == nil { + return fmt.Errorf("restore: lifecycle not configured") + } + + snap, err := e.store.GetVolumeSnapshot(snapshotID) + if err != nil { + return err + } + if snap.WorkloadID != workloadID { + return fmt.Errorf("snapshot %s does not belong to workload %s", snapshotID, workloadID) + } + w, err := e.store.GetWorkloadByID(workloadID) + if err != nil { + return err + } + if w.SourceKind != "image" { + return fmt.Errorf("restore is only supported for image-source workloads") + } + settings, err := e.store.GetSettings() + if err != nil { + return fmt.Errorf("load settings: %w", err) + } + + manifest, err := parseManifest(snap) + if err != nil { + return err + } + resolved, err := preflightResolve(e.store, w, settings, manifest) // C3 all-or-nothing + if err != nil { + return fmt.Errorf("pre-flight: %w", err) + } + archivePath, err := e.FilePath(snap) + if err != nil { + return err + } + perIndex, _, err := archiveUncompressedSize(archivePath, maxRestoreUncompressedBytes) + if err != nil { + return fmt.Errorf("size snapshot: %w", err) + } + if err := checkDiskSpace(resolved, perIndex); err != nil { // C5 + return err + } + + // ── past pre-flight: take the per-workload lock and quiesce ────────────── + unlock := e.lifecycle.Lock(workloadID) + defer unlock() + + // A teardown may have won the lock and deleted the workload while we waited. + if _, err := e.store.GetWorkloadByID(workloadID); err != nil { + return fmt.Errorf("workload disappeared before restore: %w", err) + } + + tag, err := e.lifecycle.StopContainers(ctx, workloadID) // C4 stop + if err != nil { + return fmt.Errorf("stop containers: %w", err) + } + + // Extract every volume to its staging dir FIRST. This reads the source + // archive fully before the pre-restore capture below can prune it, and + // leaves only pure renames for the destructive phase (R3). + token := uuid.New().String()[:8] + stagedVols := make([]staged, 0, len(resolved)) + for _, rv := range resolved { + tmp, old := stagingDirs(rv.LivePath, token, rv.Index) + if _, exErr := safeExtractIndex(archivePath, rv.Index, tmp, maxRestoreUncompressedBytes); exErr != nil { + cleanupStaging(stagedVols) + _ = os.RemoveAll(tmp) + // Nothing swapped yet — bring the app back up on its original data. + e.redeployAfterAbort(ctx, w, tag) + return fmt.Errorf("extract volume %q: %w", rv.Target, exErr) + } + stagedVols = append(stagedVols, staged{rv: rv, tmp: tmp, old: old}) + } + + // Durable pre-restore snapshot (escape hatch). Quiesced (after stop), and + // the source archive is already fully extracted so a prune here is harmless. + // Best-effort, matching the DB-restore precedent: a failure is logged but + // does not abort — the .old dirs + journal are the in-operation safety net. + if _, err := e.Create(w, settings, "pre-restore"); err != nil { + slog.Warn("restore: pre-restore snapshot failed (continuing)", + "workload", workloadID, "error", err) + } + + // Journal before the first destructive rename so a crash can be recovered. + jr := restoreJournal{SnapshotID: snapshotID, WorkloadID: workloadID} + for _, sv := range stagedVols { + jr.Volumes = append(jr.Volumes, journalVolume{Live: sv.rv.LivePath, Old: sv.old, Tmp: sv.tmp}) + } + if err := e.writeJournal(jr); err != nil { + cleanupStaging(stagedVols) + e.redeployAfterAbort(ctx, w, tag) + return fmt.Errorf("write restore journal: %w", err) + } + + // ── destructive phase: pure atomic renames ────────────────────────────── + done := make([]swap, 0, len(stagedVols)) + for i, sv := range stagedVols { + hadOld, swErr := swapVolumeDir(sv.rv.LivePath, sv.tmp, sv.old) + if swErr != nil { + rollbackSwaps(done) // restore already-swapped volumes + cleanupStagingFrom(stagedVols, i) // drop remaining un-swapped tmp/old + e.removeJournal(workloadID) + e.redeployAfterAbort(ctx, w, tag) + return fmt.Errorf("swap volume %q: %w", sv.rv.Target, swErr) + } + done = append(done, swap{live: sv.rv.LivePath, old: sv.old, tmp: sv.tmp, hadOld: hadOld}) + jr.Volumes[i].Swapped = true + jr.Volumes[i].HadOld = hadOld + _ = e.writeJournal(jr) // progress checkpoint (best-effort) + } + + // Bring the app back up against the restored data (C4 — recreate, redeploy). + if err := e.lifecycle.Redeploy(ctx, w, tag); err != nil { + // The data IS restored; only the app failed to come back. Do NOT roll + // back the volumes — surface the redeploy error so the operator retries + // a deploy. Clean up the .old set-asides and the journal. + cleanupOld(done) + e.removeJournal(workloadID) + return fmt.Errorf("redeploy after restore: %w", err) + } + + cleanupOld(done) + e.removeJournal(workloadID) + e.emitRestoreEvent(workloadID, snapshotID, len(done)) + slog.Info("volume snapshot restored", "workload", workloadID, "snapshot", snapshotID, "volumes", len(done)) + return nil +} + +// redeployAfterAbort re-dispatches after an aborted restore so a stopped app +// does not stay down. Best-effort: the error is logged, not returned (the +// restore failure is the primary error the caller surfaces). +func (e *Engine) redeployAfterAbort(ctx context.Context, w store.Workload, tag string) { + if err := e.lifecycle.Redeploy(ctx, w, tag); err != nil { + slog.Warn("restore: redeploy after abort failed", "workload", w.ID, "error", err) + } +} + +// RecoverInterruptedRestores replays restore journals left by a crash mid- +// restore, mirroring CleanOrphans (run once at startup, before serving). For +// each volume: a completed swap keeps the restored live dir and drops the set- +// aside original; an incomplete swap that left live missing is reverted from +// .old; stray staging dirs are removed. Returns the number of journals handled. +func (e *Engine) RecoverInterruptedRestores() (int, error) { + e.mu.Lock() + defer e.mu.Unlock() + + entries, err := os.ReadDir(e.snapDir) + if err != nil { + return 0, fmt.Errorf("read snapshot dir: %w", err) + } + recovered := 0 + for _, ent := range entries { + name := ent.Name() + if ent.IsDir() || !strings.HasPrefix(name, "restore-") || !strings.HasSuffix(name, ".json") { + continue + } + path := filepath.Join(e.snapDir, name) + data, rerr := os.ReadFile(path) + if rerr != nil { + slog.Warn("restore recovery: read journal", "file", name, "error", rerr) + continue + } + var jr restoreJournal + if jerr := json.Unmarshal(data, &jr); jerr != nil { + slog.Warn("restore recovery: parse journal", "file", name, "error", jerr) + continue + } + slog.Warn("restore recovery: replaying interrupted restore", + "workload", jr.WorkloadID, "snapshot", jr.SnapshotID, "volumes", len(jr.Volumes)) + for _, v := range jr.Volumes { + recoverVolume(v) + } + if rmErr := os.Remove(path); rmErr != nil { + slog.Warn("restore recovery: remove journal", "file", name, "error", rmErr) + } + recovered++ + } + return recovered, nil +} + +// recoverVolume reconciles a single volume's on-disk state from its journal +// entry after a crash. Each branch leaves the live dir intact (either restored +// or original) and removes staging leftovers. +func recoverVolume(v journalVolume) { + if v.Swapped { + // Swap completed: live already holds restored data. Drop the set-aside. + _ = os.RemoveAll(v.Old) + _ = os.RemoveAll(v.Tmp) + return + } + if _, err := os.Lstat(v.Live); os.IsNotExist(err) { + if _, oerr := os.Lstat(v.Old); oerr == nil { + // Crashed mid-rename (live→old done, tmp→live not): revert. + _ = os.Rename(v.Old, v.Live) + } + } else { + // live is intact (original). Any .old is a dangling partial copy. + _ = os.RemoveAll(v.Old) + } + _ = os.RemoveAll(v.Tmp) +} + +// ── journal + cleanup helpers ─────────────────────────────────────────────── + +func (e *Engine) journalPath(workloadID string) string { + // workloadID is a server-generated id (loaded from the DB before we get + // here). filepath.Base defends against any separator sneaking into the name. + return filepath.Join(e.snapDir, "restore-"+filepath.Base(workloadID)+".json") +} + +func (e *Engine) writeJournal(jr restoreJournal) error { + data, err := json.Marshal(jr) + if err != nil { + return fmt.Errorf("encode journal: %w", err) + } + // Write atomically (tmp + rename): a torn journal would silently disable the + // recovery sweep (RecoverInterruptedRestores skips unparseable journals), so + // a crash mid-write must never leave a half-written WAL on disk. The .tmp + // suffix is ignored by the recovery scan (it matches *.json only). + final := e.journalPath(jr.WorkloadID) + tmp := final + ".tmp" + if err := os.WriteFile(tmp, data, 0o600); err != nil { + return fmt.Errorf("write journal: %w", err) + } + if err := os.Rename(tmp, final); err != nil { + _ = os.Remove(tmp) + return fmt.Errorf("commit journal: %w", err) + } + return nil +} + +func (e *Engine) removeJournal(workloadID string) { + if err := os.Remove(e.journalPath(workloadID)); err != nil && !os.IsNotExist(err) { + slog.Warn("restore: remove journal", "workload", workloadID, "error", err) + } +} + +func (e *Engine) emitRestoreEvent(workloadID, snapshotID string, volumes int) { + meta, _ := json.Marshal(map[string]any{"snapshot_id": snapshotID, "volumes": volumes}) + if _, err := e.store.InsertEvent(store.EventLog{ + Source: "volsnap", + WorkloadID: workloadID, + Severity: "info", + Message: "volume snapshot restored", + Metadata: string(meta), + }); err != nil { + slog.Warn("restore: record event", "workload", workloadID, "error", err) + } +} + +// cleanupStaging removes the tmp + old staging dirs for every staged volume +// (used when aborting before the swap phase). +func cleanupStaging(sv []staged) { + for _, s := range sv { + _ = os.RemoveAll(s.tmp) + _ = os.RemoveAll(s.old) + } +} + +// cleanupStagingFrom removes staging dirs from index `from` onward (the volumes +// not yet swapped when a swap failed). +func cleanupStagingFrom(sv []staged, from int) { + for i := from; i < len(sv); i++ { + _ = os.RemoveAll(sv[i].tmp) + _ = os.RemoveAll(sv[i].old) + } +} + +// cleanupOld removes the .old set-aside dirs after a successful (or data- +// committed) restore to reclaim disk; the pre-restore snapshot is the durable +// rollback target. +func cleanupOld(done []swap) { + for _, s := range done { + _ = os.RemoveAll(s.old) + } +} + +// checkDiskSpace verifies each target filesystem has room for the volumes that +// will be staged on it (C5). Peak usage co-locates the live copy (renamed +// aside, no new space) and the extracted copy (new space ≈ uncompressed size), +// so the new allocation per filesystem is the sum of its volumes' extracted +// sizes plus headroom. The estimate is a lower bound (see archiveUncompressedSize); +// a mid-extract ENOSPC is still caught and rolled back. +func checkDiskSpace(resolved []resolvedVol, perIndex map[int]int64) error { + needByParent := map[string]int64{} + for _, rv := range resolved { + needByParent[filepath.Dir(rv.LivePath)] += perIndex[rv.Index] + } + for parent, need := range needByParent { + probe := firstExistingAncestor(parent) + free, err := freeDiskBytes(probe) + if err != nil { + return fmt.Errorf("check disk space at %s: %w", probe, err) + } + if int64(free) < need+diskFreeHeadroomBytes { + return fmt.Errorf("insufficient disk space at %s: need ~%d bytes, have %d", + parent, need+diskFreeHeadroomBytes, free) + } + } + return nil +} + +// firstExistingAncestor walks up p until it finds a path that exists, so the +// free-space probe has a real filesystem to stat even when the volume dir (or +// its parent) hasn't been created yet. +func firstExistingAncestor(p string) string { + for { + if _, err := os.Stat(p); err == nil { + return p + } + parent := filepath.Dir(p) + if parent == p { + return p + } + p = parent + } +} diff --git a/internal/volsnap/restore_engine_test.go b/internal/volsnap/restore_engine_test.go new file mode 100644 index 0000000..995a47b --- /dev/null +++ b/internal/volsnap/restore_engine_test.go @@ -0,0 +1,345 @@ +package volsnap + +import ( + "archive/tar" + "context" + "errors" + "os" + "path/filepath" + "strings" + "sync" + "testing" + + "github.com/alexei/tinyforge/internal/store" + "github.com/alexei/tinyforge/internal/volume" +) + +// fakeLifecycle records the order of deploy-side calls and lets tests inject +// failures, without a real deployer/docker. +type fakeLifecycle struct { + mu sync.Mutex + calls []string + tag string + stopErr error + redeployErr error + redeployRef string +} + +func (f *fakeLifecycle) rec(s string) { + f.mu.Lock() + f.calls = append(f.calls, s) + f.mu.Unlock() +} +func (f *fakeLifecycle) Lock(string) func() { f.rec("lock"); return func() { f.rec("unlock") } } +func (f *fakeLifecycle) StopContainers(context.Context, string) (string, error) { + f.rec("stop") + return f.tag, f.stopErr +} +func (f *fakeLifecycle) Redeploy(_ context.Context, _ store.Workload, ref string) error { + f.rec("redeploy:" + ref) + f.redeployRef = ref + return f.redeployErr +} +func (f *fakeLifecycle) saw(s string) bool { + f.mu.Lock() + defer f.mu.Unlock() + for _, c := range f.calls { + if c == s { + return true + } + } + return false +} + +func newRestoreEngine(t *testing.T) (*Engine, *store.Store, string) { + t.Helper() + st, err := store.New(":memory:") + if err != nil { + t.Fatalf("store: %v", err) + } + t.Cleanup(func() { st.Close() }) + base := t.TempDir() + s, _ := st.GetSettings() + s.BaseVolumePath = base + if err := st.UpdateSettings(s); err != nil { + t.Fatalf("settings: %v", err) + } + eng, err := New(st, t.TempDir()) + if err != nil { + t.Fatalf("engine: %v", err) + } + return eng, st, base +} + +// seedImageWorkload creates an image workload with one project-scope volume and +// returns it plus the resolved live host dir. +func seedImageWorkload(t *testing.T, st *store.Store) (store.Workload, string) { + t.Helper() + w, err := st.CreateWorkload(store.Workload{ + Name: "data-app", + Kind: "project", + SourceKind: "image", + SourceConfig: `{"image":"reg/app","port":80,"volumes":[{"source":"data","target":"/data","scope":"project"}]}`, + }) + if err != nil { + t.Fatalf("create workload: %v", err) + } + settings, _ := st.GetSettings() + live, err := volume.ResolveWorkloadPath( + store.WorkloadVolume{Source: "data", Target: "/data", Scope: "project"}, + volume.ResolveWorkloadParams{BasePath: settings.BaseVolumePath, WorkloadID: w.ID, WorkloadName: w.Name}, + ) + if err != nil { + t.Fatalf("resolve: %v", err) + } + return w, live +} + +func TestEngineRestore_HappyPath(t *testing.T) { + eng, st, _ := newRestoreEngine(t) + w, live := seedImageWorkload(t, st) + mkDirWith(t, live, "orig.txt", "ORIGINAL") + + settings, _ := st.GetSettings() + snap, err := eng.Create(w, settings, "base") + if err != nil { + t.Fatalf("create snapshot: %v", err) + } + + // Drift: the live dir now differs from the snapshot. + if err := os.WriteFile(filepath.Join(live, "orig.txt"), []byte("CHANGED"), 0o600); err != nil { + t.Fatal(err) + } + mkDirWith(t, live, "extra.txt", "NEW") // not in the snapshot + + fake := &fakeLifecycle{tag: "v1.2.3"} + eng.SetLifecycle(fake) + + // Uses the REAL eng.Create for the pre-restore capture — if Restore held + // e.mu this would deadlock (R1), failing the test instead of production. + if err := eng.Restore(context.Background(), snap.ID, w.ID); err != nil { + t.Fatalf("restore: %v", err) + } + + if got := readIn(t, live, "orig.txt"); got != "ORIGINAL" { + t.Errorf("orig.txt = %q, want ORIGINAL (restored)", got) + } + if _, err := os.Stat(filepath.Join(live, "extra.txt")); !os.IsNotExist(err) { + t.Error("extra.txt should be gone — restore replaces the volume dir wholesale") + } + for _, want := range []string{"lock", "stop", "redeploy:v1.2.3", "unlock"} { + if !fake.saw(want) { + t.Errorf("expected lifecycle call %q; calls=%v", want, fake.calls) + } + } + if fake.redeployRef != "v1.2.3" { + t.Errorf("redeploy reference = %q, want the running tag v1.2.3", fake.redeployRef) + } + // A durable pre-restore snapshot was captured (base + pre-restore). + snaps, _ := eng.List(w.ID) + if len(snaps) != 2 { + t.Errorf("expected 2 snapshots (base + pre-restore), got %d", len(snaps)) + } + // No journal left behind. + assertNoJournal(t, eng) +} + +func TestEngineRestore_RedeployFailureKeepsRestoredData(t *testing.T) { + eng, st, _ := newRestoreEngine(t) + w, live := seedImageWorkload(t, st) + mkDirWith(t, live, "orig.txt", "ORIGINAL") + settings, _ := st.GetSettings() + snap, _ := eng.Create(w, settings, "base") + if err := os.WriteFile(filepath.Join(live, "orig.txt"), []byte("CHANGED"), 0o600); err != nil { + t.Fatal(err) + } + + fake := &fakeLifecycle{tag: "v1", redeployErr: errors.New("boom")} + eng.SetLifecycle(fake) + + err := eng.Restore(context.Background(), snap.ID, w.ID) + if err == nil || !strings.Contains(err.Error(), "redeploy") { + t.Fatalf("expected a redeploy error, got %v", err) + } + // Data is committed despite the redeploy failure — we must NOT roll it back. + if got := readIn(t, live, "orig.txt"); got != "ORIGINAL" { + t.Errorf("orig.txt = %q, want ORIGINAL (restore committed)", got) + } + assertNoJournal(t, eng) +} + +func TestEngineRestore_PreflightFailDoesNotLockOrStop(t *testing.T) { + eng, st, _ := newRestoreEngine(t) + w, _ := seedImageWorkload(t, st) + // A snapshot whose manifest names an unsupported scope ⇒ pre-flight aborts. + bad, err := st.CreateVolumeSnapshot(store.VolumeSnapshot{ + WorkloadID: w.ID, Filename: "bad.tar.gz", + Manifest: `[{"index":0,"target":"/x","scope":"named","source":"x"}]`, + }) + if err != nil { + t.Fatalf("seed snapshot: %v", err) + } + fake := &fakeLifecycle{} + eng.SetLifecycle(fake) + + if err := eng.Restore(context.Background(), bad.ID, w.ID); err == nil { + t.Fatal("expected pre-flight to abort on an unsupported scope") + } + if fake.saw("lock") || fake.saw("stop") { + t.Errorf("pre-flight abort must happen BEFORE lock/stop; calls=%v", fake.calls) + } +} + +func TestEngineRestore_NilLifecycle(t *testing.T) { + eng, _, _ := newRestoreEngine(t) + if err := eng.Restore(context.Background(), "s", "w"); err == nil || + !strings.Contains(err.Error(), "lifecycle") { + t.Fatalf("expected a lifecycle-not-configured error, got %v", err) + } +} + +func TestEngineRestore_WrongWorkload(t *testing.T) { + eng, st, _ := newRestoreEngine(t) + w, live := seedImageWorkload(t, st) + mkDirWith(t, live, "f.txt", "x") + settings, _ := st.GetSettings() + snap, _ := eng.Create(w, settings, "base") + fake := &fakeLifecycle{} + eng.SetLifecycle(fake) + + if err := eng.Restore(context.Background(), snap.ID, "some-other-workload"); err == nil { + t.Fatal("expected cross-workload restore to be rejected") + } + if fake.saw("lock") { + t.Error("must reject before taking the lock") + } +} + +func TestEngineRestore_ExtractFailureAbortsAfterLock(t *testing.T) { + eng, st, _ := newRestoreEngine(t) + // The workload must CURRENTLY declare both targets so pre-flight passes and + // the failure happens during extraction (post-lock), not pre-flight. + w, err := st.CreateWorkload(store.Workload{ + Name: "two-vol", Kind: "project", SourceKind: "image", + SourceConfig: `{"image":"x","port":80,"volumes":[` + + `{"source":"data","target":"/data","scope":"project"},` + + `{"source":"other","target":"/other","scope":"project"}]}`, + }) + if err != nil { + t.Fatalf("create workload: %v", err) + } + + // Hand-build a 2-volume archive where volume 1 carries a symlink entry the + // untrusted extractor rejects — forcing a post-lock extract failure after + // volume 0 has already been staged. + arc := buildTarGz(t, []tentry{ + {name: "0/f.txt", typeflag: tar.TypeReg, body: "x"}, + {name: "1/evil", typeflag: tar.TypeSymlink, linkname: "/etc/passwd"}, + }) + data, err := os.ReadFile(arc) + if err != nil { + t.Fatal(err) + } + fname := "extract-fail.tar.gz" + if err := os.WriteFile(filepath.Join(eng.snapDir, fname), data, 0o600); err != nil { + t.Fatal(err) + } + snap, err := st.CreateVolumeSnapshot(store.VolumeSnapshot{ + WorkloadID: w.ID, Filename: fname, + Manifest: `[{"index":0,"target":"/data","scope":"project","source":"data"},` + + `{"index":1,"target":"/other","scope":"project","source":"other"}]`, + }) + if err != nil { + t.Fatalf("seed snapshot: %v", err) + } + + fake := &fakeLifecycle{tag: "v1"} + eng.SetLifecycle(fake) + + if err := eng.Restore(context.Background(), snap.ID, w.ID); err == nil { + t.Fatal("expected extract failure to abort the restore") + } + // Post-lock abort: it stopped, then brought the app back (no swaps happened). + if !fake.saw("lock") || !fake.saw("stop") || !fake.saw("redeploy:v1") { + t.Errorf("expected lock+stop+redeploy after a post-lock abort; calls=%v", fake.calls) + } + // No staging or journal left behind. + assertNoJournal(t, eng) + entries, _ := os.ReadDir(eng.snapDir) + for _, e := range entries { + if strings.Contains(e.Name(), ".tf-restore-") { + t.Errorf("leftover staging dir: %s", e.Name()) + } + } +} + +func TestRecoverInterruptedRestores(t *testing.T) { + eng, _, _ := newRestoreEngine(t) + root := t.TempDir() + + // A: swap completed — keep restored live, drop old. + liveA := filepath.Join(root, "A") + oldA := filepath.Join(root, ".A.old") + mkDirWith(t, liveA, "f", "RESTORED-A") + mkDirWith(t, oldA, "f", "ORIGINAL-A") + // B: not swapped, live present — keep original, drop tmp. + liveB := filepath.Join(root, "B") + tmpB := filepath.Join(root, ".B.tmp") + mkDirWith(t, liveB, "f", "ORIGINAL-B") + mkDirWith(t, tmpB, "f", "STAGED-B") + // C: crashed mid-rename — live missing, old present — revert from old. + liveC := filepath.Join(root, "C") + oldC := filepath.Join(root, ".C.old") + tmpC := filepath.Join(root, ".C.tmp") + mkDirWith(t, oldC, "f", "ORIGINAL-C") + mkDirWith(t, tmpC, "f", "STAGED-C") + + jr := restoreJournal{SnapshotID: "snap", WorkloadID: "wl-recover", Volumes: []journalVolume{ + {Live: liveA, Old: oldA, Swapped: true, HadOld: true}, + {Live: liveB, Tmp: tmpB, Swapped: false}, + {Live: liveC, Old: oldC, Tmp: tmpC, Swapped: false}, + }} + if err := eng.writeJournal(jr); err != nil { + t.Fatalf("write journal: %v", err) + } + + n, err := eng.RecoverInterruptedRestores() + if err != nil { + t.Fatalf("recover: %v", err) + } + if n != 1 { + t.Fatalf("recovered %d journals, want 1", n) + } + if got := readIn(t, liveA, "f"); got != "RESTORED-A" { + t.Errorf("A live = %q, want RESTORED-A (swap kept)", got) + } + if _, err := os.Stat(oldA); !os.IsNotExist(err) { + t.Error("A old should be removed") + } + if got := readIn(t, liveB, "f"); got != "ORIGINAL-B" { + t.Errorf("B live = %q, want ORIGINAL-B (untouched)", got) + } + if _, err := os.Stat(tmpB); !os.IsNotExist(err) { + t.Error("B tmp should be removed") + } + if got := readIn(t, liveC, "f"); got != "ORIGINAL-C" { + t.Errorf("C live = %q, want ORIGINAL-C (reverted from old)", got) + } + if _, err := os.Stat(tmpC); !os.IsNotExist(err) { + t.Error("C tmp should be removed") + } + assertNoJournal(t, eng) +} + +func assertNoJournal(t *testing.T, eng *Engine) { + t.Helper() + entries, err := os.ReadDir(eng.snapDir) + if err != nil { + t.Fatal(err) + } + for _, e := range entries { + if strings.HasPrefix(e.Name(), "restore-") && strings.HasSuffix(e.Name(), ".json") { + t.Errorf("leftover restore journal: %s", e.Name()) + } + } +} diff --git a/internal/volsnap/restore_test.go b/internal/volsnap/restore_test.go new file mode 100644 index 0000000..1797860 --- /dev/null +++ b/internal/volsnap/restore_test.go @@ -0,0 +1,270 @@ +package volsnap + +import ( + "os" + "path/filepath" + "testing" + + "github.com/alexei/tinyforge/internal/store" +) + +func mkDirWith(t *testing.T, dir, fname, content string) { + t.Helper() + if err := os.MkdirAll(dir, 0o700); err != nil { + t.Fatal(err) + } + if err := os.WriteFile(filepath.Join(dir, fname), []byte(content), 0o600); err != nil { + t.Fatal(err) + } +} + +func readIn(t *testing.T, dir, fname string) string { + t.Helper() + b, err := os.ReadFile(filepath.Join(dir, fname)) + if err != nil { + t.Fatalf("read %s/%s: %v", dir, fname, err) + } + return string(b) +} + +func TestSwapVolumeDir_ReplacesAndPreservesOld(t *testing.T) { + root := t.TempDir() + live := filepath.Join(root, "data") + tmp := filepath.Join(root, ".data.tmp") + old := filepath.Join(root, ".data.old") + mkDirWith(t, live, "f.txt", "ORIGINAL") + mkDirWith(t, tmp, "f.txt", "RESTORED") + + hadOld, err := swapVolumeDir(live, tmp, old) + if err != nil { + t.Fatalf("swap: %v", err) + } + if !hadOld { + t.Error("hadOld should be true when a live dir existed") + } + if got := readIn(t, live, "f.txt"); got != "RESTORED" { + t.Errorf("live = %q, want RESTORED", got) + } + if got := readIn(t, old, "f.txt"); got != "ORIGINAL" { + t.Errorf("old = %q, want ORIGINAL (prior live preserved)", got) + } +} + +func TestSwapVolumeDir_MissingLive(t *testing.T) { + root := t.TempDir() + live := filepath.Join(root, "data") // does not exist + tmp := filepath.Join(root, ".data.tmp") + old := filepath.Join(root, ".data.old") + mkDirWith(t, tmp, "f.txt", "RESTORED") + + hadOld, err := swapVolumeDir(live, tmp, old) + if err != nil { + t.Fatalf("swap: %v", err) + } + if hadOld { + t.Error("hadOld should be false when no live dir existed") + } + if got := readIn(t, live, "f.txt"); got != "RESTORED" { + t.Errorf("live = %q, want RESTORED", got) + } +} + +func TestSwapVolumeDir_RevertsOnSecondRenameFailure(t *testing.T) { + // The data-loss-critical path: the live→old rename succeeds, then tmp→live + // fails (here: tmp is absent). swapVolumeDir MUST self-revert old→live so + // the live dir is never left missing, and old must not be left dangling. + root := t.TempDir() + live := filepath.Join(root, "data") + old := filepath.Join(root, ".data.old") + mkDirWith(t, live, "f.txt", "ORIGINAL") + + hadOld, err := swapVolumeDir(live, filepath.Join(root, ".data.tmp" /* absent */), old) + if err == nil { + t.Fatal("expected swap to fail when tmp is absent") + } + if got := readIn(t, live, "f.txt"); got != "ORIGINAL" { + t.Errorf("live = %q, want ORIGINAL restored by self-revert", got) + } + if _, statErr := os.Stat(old); !os.IsNotExist(statErr) { + t.Errorf("old dir should have been renamed back to live, not left dangling") + } + _ = hadOld +} + +func TestStagingDirs_SameParentAsLive(t *testing.T) { + // R2 invariant: tmp and old must be siblings of the live dir's parent so + // every rename in the swap is intra-filesystem (atomic). + live := filepath.FromSlash("/srv/data/postgres") + tmp, old := stagingDirs(live, "tok", 3) + wantParent := filepath.Dir(live) + if filepath.Dir(tmp) != wantParent || filepath.Dir(old) != wantParent { + t.Errorf("staging dirs not siblings of live's parent: tmp=%s old=%s parent=%s", tmp, old, wantParent) + } + if tmp == old { + t.Error("tmp and old must be distinct paths") + } +} + +func TestRollbackSwaps_RestoresOriginals(t *testing.T) { + root := t.TempDir() + var done []swap + for _, name := range []string{"vol0", "vol1"} { + live := filepath.Join(root, name) + tmp := filepath.Join(root, "."+name+".tmp") + old := filepath.Join(root, "."+name+".old") + mkDirWith(t, live, "f.txt", "ORIGINAL-"+name) + mkDirWith(t, tmp, "f.txt", "RESTORED-"+name) + hadOld, err := swapVolumeDir(live, tmp, old) + if err != nil { + t.Fatalf("swap %s: %v", name, err) + } + done = append(done, swap{live: live, old: old, tmp: tmp, hadOld: hadOld}) + } + // Both are now RESTORED; rolling back must return both to ORIGINAL. + rollbackSwaps(done) + for _, name := range []string{"vol0", "vol1"} { + if got := readIn(t, filepath.Join(root, name), "f.txt"); got != "ORIGINAL-"+name { + t.Errorf("%s after rollback = %q, want ORIGINAL-%s", name, got, name) + } + } +} + +func TestRollbackSwaps_PartialLeavesUnswappedIntact(t *testing.T) { + root := t.TempDir() + // vol0 swaps successfully; vol1 "fails" (its tmp is absent) so only vol0 is + // recorded in done. Rollback of vol0 must restore the original. + live0 := filepath.Join(root, "vol0") + tmp0 := filepath.Join(root, ".vol0.tmp") + old0 := filepath.Join(root, ".vol0.old") + mkDirWith(t, live0, "f.txt", "ORIGINAL-0") + mkDirWith(t, tmp0, "f.txt", "RESTORED-0") + hadOld, err := swapVolumeDir(live0, tmp0, old0) + if err != nil { + t.Fatalf("swap vol0: %v", err) + } + + live1 := filepath.Join(root, "vol1") + mkDirWith(t, live1, "f.txt", "ORIGINAL-1") + if _, err := swapVolumeDir(live1, filepath.Join(root, ".vol1.tmp" /* absent */), filepath.Join(root, ".vol1.old")); err == nil { + t.Fatal("expected vol1 swap to fail (tmp absent)") + } + + rollbackSwaps([]swap{{live: live0, old: old0, tmp: tmp0, hadOld: hadOld}}) + if got := readIn(t, live0, "f.txt"); got != "ORIGINAL-0" { + t.Errorf("vol0 after rollback = %q, want ORIGINAL-0", got) + } + // vol1 was never swapped; its original must be untouched. + if got := readIn(t, live1, "f.txt"); got != "ORIGINAL-1" { + t.Errorf("vol1 = %q, want ORIGINAL-1 (never swapped)", got) + } +} + +func TestPreflightResolve_AllOrNothing(t *testing.T) { + eng, st, base := newRestoreEngine(t) + _ = eng + w, err := st.CreateWorkload(store.Workload{ + Name: "app", Kind: "project", SourceKind: "image", + SourceConfig: `{"image":"x","port":80,"volumes":[` + + `{"source":"data","target":"/data","scope":"project"},` + + `{"source":"var","target":"/var","scope":"stage"}]}`, + }) + if err != nil { + t.Fatalf("create workload: %v", err) + } + settings, _ := st.GetSettings() + + // A snapshotted target no longer declared by the workload ⇒ whole abort (C3). + if _, err := preflightResolve(st, w, settings, []SnapshotVolume{ + {Index: 0, Target: "/data", Scope: "project", Source: "data"}, + {Index: 1, Target: "/gone", Scope: "project", Source: "gone"}, + }); err == nil { + t.Fatal("expected all-or-nothing abort when a target is no longer declared") + } + + // All declared ⇒ resolves every volume under BaseVolumePath. + resolved, err := preflightResolve(st, w, settings, []SnapshotVolume{ + {Index: 0, Target: "/data", Scope: "project", Source: "data"}, + {Index: 1, Target: "/var", Scope: "stage", Source: "var"}, + }) + if err != nil { + t.Fatalf("preflight: %v", err) + } + if len(resolved) != 2 { + t.Fatalf("resolved %d volumes, want 2", len(resolved)) + } + for _, rv := range resolved { + if ok, _ := pathWithinBase(base, rv.LivePath); !ok { + t.Errorf("volume %q resolved to %q, outside base %q", rv.Target, rv.LivePath, base) + } + } +} + +// TestPreflightResolve_IgnoresManifestSource is the regression guard for the +// security fix: a tampered manifest whose Source tries to escape (../../etc) +// must NOT redirect the swap target — the host path is re-derived from the +// workload's CURRENT trusted config (Source "data"), staying under the base. +func TestPreflightResolve_IgnoresManifestSource(t *testing.T) { + _, st, base := newRestoreEngine(t) + w, err := st.CreateWorkload(store.Workload{ + Name: "app", Kind: "project", SourceKind: "image", + SourceConfig: `{"image":"x","port":80,"volumes":[{"source":"data","target":"/data","scope":"project"}]}`, + }) + if err != nil { + t.Fatalf("create workload: %v", err) + } + settings, _ := st.GetSettings() + + resolved, err := preflightResolve(st, w, settings, []SnapshotVolume{ + {Index: 0, Target: "/data", Scope: "project", Source: "../../../../etc"}, + }) + if err != nil { + t.Fatalf("preflight: %v", err) + } + if len(resolved) != 1 { + t.Fatalf("resolved %d, want 1", len(resolved)) + } + if ok, _ := pathWithinBase(base, resolved[0].LivePath); !ok { + t.Errorf("manifest Source escaped containment: resolved %q outside base %q", + resolved[0].LivePath, base) + } + if filepath.Base(resolved[0].LivePath) != "data" { + t.Errorf("expected target derived from current config (data), got %q", resolved[0].LivePath) + } +} + +func TestArchiveUncompressedSize(t *testing.T) { + root := t.TempDir() + mustWrite(t, filepath.Join(root, "a.txt"), "hello") // 5 + if err := os.MkdirAll(filepath.Join(root, "sub"), 0o700); err != nil { + t.Fatal(err) + } + mustWrite(t, filepath.Join(root, "sub", "b.txt"), "world!") // 6 + dest := filepath.Join(t.TempDir(), "snap.tar.gz") + if _, err := writeArchive(dest, []VolumeRef{{Target: "/d", Scope: "project", Source: "d", HostPath: root}}); err != nil { + t.Fatal(err) + } + + per, total, err := archiveUncompressedSize(dest, maxRestoreUncompressedBytes) + if err != nil { + t.Fatalf("size: %v", err) + } + if total != 11 { + t.Errorf("total = %d, want 11", total) + } + if per[0] != 11 { + t.Errorf("perIndex[0] = %d, want 11", per[0]) + } + if _, _, err := archiveUncompressedSize(dest, 4); err == nil { + t.Error("expected sizing to abort past the decompression cap") + } +} + +func TestFreeDiskBytes(t *testing.T) { + n, err := freeDiskBytes(t.TempDir()) + if err != nil { + t.Fatalf("freeDiskBytes: %v", err) + } + if n == 0 { + t.Error("expected non-zero free space on the temp filesystem") + } +} diff --git a/internal/volsnap/volumes.go b/internal/volsnap/volumes.go index 7d0728b..a20dd55 100644 --- a/internal/volsnap/volumes.go +++ b/internal/volsnap/volumes.go @@ -80,27 +80,10 @@ func SnapshotableVolumes(st *store.Store, w store.Workload, settings store.Setti return nil, nil, nil } - byTarget := map[string]store.WorkloadVolume{} - - var cfg scVolumes - if w.SourceConfig != "" { - // Best-effort: a malformed config simply yields no inline volumes; the - // persisted rows below still apply. - _ = json.Unmarshal([]byte(w.SourceConfig), &cfg) - } - for _, v := range cfg.Volumes { - if v.Target == "" { - continue - } - byTarget[v.Target] = store.WorkloadVolume{Source: v.Source, Target: v.Target, Scope: v.Scope, Name: v.Name} - } - persisted, perr := st.ListWorkloadVolumes(w.ID) + byTarget, perr := volumesByTarget(st, w) if perr != nil { return nil, nil, perr } - for _, p := range persisted { - byTarget[p.Target] = store.WorkloadVolume{Source: p.Source, Target: p.Target, Scope: p.Scope, Name: p.Name} - } params := volume.ResolveWorkloadParams{ BasePath: settings.BaseVolumePath, @@ -132,6 +115,37 @@ func SnapshotableVolumes(st *store.Store, w store.Workload, settings store.Setti return refs, skipped, nil } +// volumesByTarget merges a workload's source_config inline volumes with its +// persisted workload_volumes rows (persisted wins on a target conflict), keyed +// by container target path. It is the authoritative current volume set, shared +// by capture enumeration (SnapshotableVolumes) and restore pre-flight so both +// resolve host paths the same way — restore must never trust a snapshot's +// persisted manifest to name a host directory. +func volumesByTarget(st *store.Store, w store.Workload) (map[string]store.WorkloadVolume, error) { + byTarget := map[string]store.WorkloadVolume{} + + var cfg scVolumes + if w.SourceConfig != "" { + // Best-effort: a malformed config simply yields no inline volumes; the + // persisted rows below still apply. + _ = json.Unmarshal([]byte(w.SourceConfig), &cfg) + } + for _, v := range cfg.Volumes { + if v.Target == "" { + continue + } + byTarget[v.Target] = store.WorkloadVolume{Source: v.Source, Target: v.Target, Scope: v.Scope, Name: v.Name} + } + persisted, err := st.ListWorkloadVolumes(w.ID) + if err != nil { + return nil, err + } + for _, p := range persisted { + byTarget[p.Target] = store.WorkloadVolume{Source: p.Source, Target: p.Target, Scope: p.Scope, Name: p.Name} + } + return byTarget, nil +} + func skipReason(scope string) string { switch scope { case string(store.VolumeScopeInstance): diff --git a/plans/volume-snapshot-restore/CONTEXT.md b/plans/volume-snapshot-restore/CONTEXT.md new file mode 100644 index 0000000..2a5ea44 --- /dev/null +++ b/plans/volume-snapshot-restore/CONTEXT.md @@ -0,0 +1,60 @@ +# CONTEXT — Volume Snapshot Restore + +Working memory across phases. The orchestrator owns this file. + +## Settings (from PLAN.md header) + +- Mode: **Automated** · Execution: **Hybrid** (backend Direct, Phase 4 frontend implementer) · Strategy: **Incremental** +- Base: `main` · Branch: `feature/volume-snapshot-restore` · Remote: origin (Gitea) +- Build: `go build ./...` · Test: `go test ./internal/...` + `npm run test` · Lint: `go vet ./internal/...` + `npm run check` + +## Key codebase facts (verified during planning) + +- **Deploy choke point:** every deploy entrypoint calls `deployer.DispatchPlugin` → + put the per-workload lock there (C1). Entrypoints: `deployPluginWorkload`, + `rollbackWorkload`, `promoteFromWorkload`, `dispatchGeneric`, webhook + `fireBinding`/`handlePreviewIntent`. +- **`activeWg`/`drainMu`** in `deployer.go` = global drain barrier, NOT a per-workload lock. +- **Image idempotency short-circuit** (`image.go` Deploy ~L170-181) only fires for a + *verified-running* container → after stop, redeploy makes a fresh container; blue-green + `enforceMaxInstances` reaps the old stopped one. ⇒ stop→swap→redeploy (C4) is correct. +- **Scope resolution** (`internal/volume/resolver.go`): stage/project → `//` + (shared per-workload dir); absolute → operator's allowed path. Stage tmp/old siblings under + the live dir's PARENT so renames are same-fs (R2). +- **`volsnap.Engine`** has `e.mu` taken by Create/Delete/pruneWorkload/CleanOrphans. + `Restore` must NOT hold `e.mu` (R1). +- **Archive layout:** gzip tar, each volume under integer subdir `0/`,`1/`…, `manifest.json` + at root = `[]SnapshotVolume{Index,Target,Scope,Source}`. `supportedScopes` = + absolute/stage/project (volumes.go). +- **Precedent:** `internal/api/backups.go` `restoreBackup` — X-Confirm-Restore==id, + `restoreInFlight` CAS→409, pre-restore safety backup, atomic rename swap. +- **Composition root:** `cmd/server/main.go` constructs `deployer.New` + `volsnap.New` + + `docker` + `store`; calls `CleanOrphans` at startup (wire `RecoverInterruptedRestores` there). +- **Frontend:** `WorkloadSnapshotsPanel.svelte`; api fns `web/src/lib/api.ts` ~L581; + i18n `apps.detail.snapshots.*` in en.json + ru.json. +- `golang.org/x/sys v0.33.0` already in go.mod (indirect); build-tag precedent exists + (`lockfile_windows.go`/`lockfile_unix.go`). + +## Decisions / invariants + +- `Engine.Restore` holds NO `e.mu`; per-workload `Lifecycle.Lock` is the serialization. +- Extract ALL tmp dirs BEFORE any rename; swap is pure renames; journal tracks per-volume `swapped`. +- Pre-restore snapshot captured AFTER stop, BEFORE first rename (durable escape hatch). +- Redeploy pins the newest-running container's tag (same version back up). +- Mixed per-volume state after a mid-restore crash is an accepted v1 limit (each volume intact; pre-restore snapshot = full revert). + +## Deferred / out of scope + +- Named/project_named/instance/ephemeral scopes (consistent with capture). +- Non-image sources. +- Fully-atomic all-volumes-or-nothing restore (v1 is per-volume atomic + journal recovery). + +## Failed approaches / gotchas + +- (none yet) + +## Phase handoffs + +- Phase 1 → 2: _(filled after Phase 1)_ +- Phase 2 → 3: _(filled after Phase 2)_ +- Phase 3 → 4: _(filled after Phase 3)_ diff --git a/plans/volume-snapshot-restore/PLAN.md b/plans/volume-snapshot-restore/PLAN.md new file mode 100644 index 0000000..a23349f --- /dev/null +++ b/plans/volume-snapshot-restore/PLAN.md @@ -0,0 +1,113 @@ +# Feature: Volume Snapshot Restore (backlog #6) + +**Branch:** `feature/volume-snapshot-restore` +**Base branch:** `main` +**Created:** 2026-06-22 +**Status:** 🟡 In Progress +**Strategy:** Incremental +**Mode:** Automated +**Execution:** Hybrid — backend (Phases 1–3) Direct by the orchestrator; Phase 4 via the frontend implementer +**Remote:** origin (https://git.dolgolyov-family.by/alexei.dolgolyov/tiny-forge.git) + +## Summary + +Restore a previously-captured volume snapshot (gzip tar of an image workload's host-bind +data volumes) back onto the live volume directories, then bring the app back up. Capture +already ships (`internal/volsnap`); restore is greenfield and **data-loss-sensitive** — a +wrong design is permanent data loss, so the design was adversarially plan-reviewed twice +(prior session + this phase breakdown). + +**Scope (deliberate):** image-source workloads only; volume scopes `absolute` / `stage` / +`project` only — driven off the SAME `volsnap.supportedScopes` constant capture uses. Named +/ project_named (Docker named volumes), instance, and ephemeral scopes are out (consistent +with capture). + +## Mandatory design fixes (non-negotiable — a wrong design = permanent data loss) + +- **C1** Serialize via a per-workload `keyedMutex` (the `internal/api/gitops.go` pattern, + extracted to `internal/keyedmutex`) keyed by workload id, gating EVERY deploy entrypoint. + All entrypoints funnel through `deployer.DispatchPlugin` (verified: deploy, rollback, + promote, generic-hooks, webhook fireBinding/handlePreviewIntent), so the lock lives there. + NOT `activeWg` (a global drain barrier, not a per-workload lock). +- **C2** Extract-to-temp + atomic rename-swap (extract→`.tmp`, rename live→`.old`, rename + `.tmp`→live), NEVER in-place. Mirrors `internal/api/backups.go` restore precedent. +- **C3** All-or-nothing pre-flight re-resolution via `volume.ResolveWorkloadPath` — abort + BEFORE stopping containers if ANY manifest volume doesn't resolve (config drift = + corruption). Runs before `Lock`/`StopContainers`. +- **C4** Image containers are recreated, not reused → **stop → swap → redeploy** (re-dispatch + via `DispatchPlugin`/`RedeployLocked`), NOT `StartContainer(oldID)`. Verified: image + source's idempotency short-circuit only fires for a *verified-running* container, so a + redeploy after stop creates a fresh container on restored data; `enforceMaxInstances` + reaps the old stopped one. +- **C5** Disk-space pre-check, **per target filesystem** (peak = live + extracted coexist). +- **C6** Treat the archive as UNTRUSTED on extract: zip-slip `HasPrefix` containment, + reject symlink/hardlink/device/fifo/socket entries, manifest-index bounds, decompression- + bomb cap. Require an `X-Confirm-Restore: ` header like the DB restore (CSRF guard). + +### Folded-in (also mandatory) +- Single-flight per-workload CAS → 409 (different apps may restore concurrently). +- Auto-capture a pre-restore snapshot, **durably committed before the first destructive + rename** (the operator's clean escape hatch). +- Logic lives in `Engine.Restore` (engine), not the API handler. + +### Resolutions from the phase-breakdown plan review (2026-06-22) +- **R1 (e.mu deadlock):** `Engine.Restore` does NOT hold `e.mu`; per-workload `Lifecycle.Lock` + is the serialization. `Create`'s own `e.mu` guards only the pre-restore archive write. +- **R2 (cross-device / containment):** stage `tmp`+`old` as siblings under the **live dir's + own parent** (same filesystem ⇒ atomic rename). Detect `EXDEV` → abort/rollback loudly. +- **R3 (crash window):** durable pre-restore snapshot before any rename; **extract all tmp + dirs first, then pure renames**; restore-journal + startup `RecoverInterruptedRestores()` + sweep (revert `live-missing→.old`, clean orphan tmp). +- **R4:** C5 checks per-target-filesystem; `StopContainers` returns newest-running tag so + redeploy pins the same version, and marks rows stopped; `Engine.Restore` re-validates the + workload AFTER acquiring the lock; best-effort audit event emitted. + +## Build & Test Commands + +- **Build:** `go build ./...` +- **Test:** `go test ./internal/...` (backend); from `web/`: `npm run test` +- **Lint:** `go vet ./internal/...`; from `web/`: `npm run check` +- **Frontend build:** from `web/`: `npm run build` +- **Dev:** `./scripts/dev-server.sh` (port 8090; restart after every build) + +## Phases + +- [x] Phase 1: Restore engine primitives + path-safe extractor + unit tests [domain: backend] → [subplan](./phase-1-engine-primitives.md) +- [x] Phase 2: Engine.Restore orchestration + lifecycle/locking + rollback [domain: backend] → [subplan](./phase-2-lifecycle-locking.md) +- [x] Phase 3: API endpoint + CSRF header + single-flight + wiring + tests [domain: backend] → [subplan](./phase-3-api.md) +- [x] Phase 4: UI Restore button + ConfirmDialog + i18n en+ru [domain: frontend] → [subplan](./phase-4-frontend.md) + +## Parallelizable Phase Groups (Orchestrator mode only) + +None — strictly sequential. Each phase depends on the prior (P2 needs P1 primitives + the +Lifecycle seam; P3 wires the adapter + needs `Engine.SetLifecycle`; P4 needs the endpoint). + +## Phase Progress Log + +| Phase | Domain | Status | Review | Build | Committed | +|-------|--------|--------|--------|-------|-----------| +| Phase 1: engine primitives | backend | ✅ Done | ✅ Passed (APPROVE w/ notes) | ✅ Passed | ⬜ | +| Phase 2: lifecycle/locking | backend | ✅ Done | ✅ Passed (APPROVE w/ notes) | ✅ Passed | ⬜ | +| Phase 3: API endpoint | backend | ✅ Done | ✅ Passed (go: APPROVE w/ notes; security: fixed CRITICAL) | ✅ Passed | ⬜ | +| Phase 4: frontend | frontend | ✅ Done | ✅ Passed (ts: APPROVE) | ✅ Passed (check 0 err, build, 26 tests) | ⬜ | + +## Outstanding Warnings + +| Phase | Warning | Severity | Status (open / resolved / accepted) | +|-------|---------|----------|-------------------------------------| +| (design) | Mid-restore crash can leave a per-volume MIXED state (some restored, some original); each volume is individually intact and the pre-restore snapshot is the full escape hatch. | 🟡 | accepted (documented v1 limit) | +| 2→3 | **B1 (was Blocker):** `RecoverInterruptedRestores()` + `SetLifecycle()` MUST be wired at startup BEFORE the API server serves — restore endpoint must not be reachable without them. | 🔴→tracked | open — HARD Phase 3 prerequisite | +| 2 | W3 residual: the swap-failure-after-partial-swap ORCHESTRATION branch (rollbackSwaps glue) is covered by primitive unit tests + recovery test + extract-failure orchestration test, but not a full mid-swap fault-injection (needs an fs-fault seam not worth the production complexity). | 🟡 | accepted | + +## Final Review + +- [x] Comprehensive code review — ✅ READY TO MERGE (no blockers/warnings; 3 non-blocking notes) +- [x] Security review (untrusted-archive extraction + CSRF + admin gating) — CRITICAL found & fixed (manifest-Source path traversal); re-derive from current config + containment +- [x] All Outstanding Warnings resolved or consciously accepted +- [x] Full build passes (`go build ./...`, `npm run build`) +- [x] Full test suite passes (`go test ./internal/...`, `npm run test` 26, `npm run check` 0 err) +- [ ] Merged to `main` (squash) + +## Amendment Log + +_(none yet)_ diff --git a/plans/volume-snapshot-restore/phase-1-engine-primitives.md b/plans/volume-snapshot-restore/phase-1-engine-primitives.md new file mode 100644 index 0000000..e2c8da4 --- /dev/null +++ b/plans/volume-snapshot-restore/phase-1-engine-primitives.md @@ -0,0 +1,96 @@ +# Phase 1: Restore engine primitives + path-safe extractor + unit tests + +**Status:** ✅ Complete +**Parent plan:** [PLAN.md](./PLAN.md) +**Domain:** backend + +## Objective + +Build the dangerous filesystem primitives in isolation, fully unit-tested, with NO +docker/lifecycle wiring. Each is a pure function over directories + the store + a parsed +manifest. No caller yet (exercised by tests so not "unused"). Zero behavior change to +existing capture. + +## Tasks + +- [ ] **`internal/volsnap/extract.go`** — `safeExtractIndex(archivePath string, index int, dest string, bombCap int64) (int64, error)`: + open the gzip tar, extract only entries under the `"/"` prefix into `dest`, return + bytes written. UNTRUSTED-input guards (C6): + - zip-slip: `target := filepath.Join(dest, rel)`; require `strings.HasPrefix(filepath.Clean(target)+sep, cleanDest+sep)` (or `target == cleanDest`); reject otherwise. + - allow ONLY `tar.TypeReg` + `tar.TypeDir`; reject symlink/hardlink/char/block/fifo/socket with an error (never follow). + - decompression-bomb cap: running byte counter; abort when it would exceed `bombCap`. + - create parent dirs as needed; files `0o600`, dirs `0o700` (data dirs; ownership is the container's concern). + - skip `manifest.json` and any entry whose leading path segment ≠ `index`. +- [ ] **`internal/volsnap/restore.go`** (primitives only — NO orchestration): + - `archiveUncompressedSize(archivePath string, bombCap int64) (int64, error)` — header-only sizing pass summing `hdr.Size`, enforcing `bombCap` (feeds C5). Per-index sizes too (`map[int]int64`) so C5 can check per filesystem. + - `parseManifest(snap store.VolumeSnapshot) ([]SnapshotVolume, error)`. + - `preflightResolve(w store.Workload, settings store.Settings, manifest []SnapshotVolume) ([]resolvedVol, error)` — ALL-OR-NOTHING (C3): for every manifest volume require `supportedScopes[scope]` AND `volume.ResolveWorkloadPath` succeeds; on first failure return an error naming target+scope+reason (abort signal). `resolvedVol{Index, Target, Scope, LivePath}`. Reuses the SAME `supportedScopes` map. + - swap helpers (C2 + R2 + R3): staging is **sibling to the live dir's parent** so renames are same-filesystem. `stagingRoot(live string) string` = `filepath.Join(filepath.Dir(live), ".tf-restore-"+token)`. `swapVolumeDir(live, tmp, old string) error` = rename(live→old) then rename(tmp→live); detect `EXDEV`/cross-device and return a clear error WITHOUT having moved anything irreversibly (check device equivalence up-front or treat the rename error as fatal/rollback). `rollbackSwaps(done []swap) error` = for each completed swap in reverse, rename(live→discard), rename(old→live). + - `freeDiskBytes(path string) (uint64, error)` — platform helper. Build-tag split mirroring the repo's `lockfile_windows.go`/`lockfile_unix.go` precedent: `disk_unix.go` (`//go:build !windows`, `syscall.Statfs`) + `disk_windows.go` (`golang.org/x/sys/windows.GetDiskFreeSpaceEx`). Production target is Linux. +- [ ] **Constants:** `maxRestoreUncompressedBytes` (decompression-bomb cap) + `diskFreeSafetyMargin` named consts with rationale comments. + +## Files to Modify/Create + +- `internal/volsnap/extract.go` — untrusted extractor (new) +- `internal/volsnap/restore.go` — primitives: sizing, manifest parse, preflight, swap/rollback, free-disk (new) +- `internal/volsnap/disk_unix.go`, `internal/volsnap/disk_windows.go` — free-disk platform split (new) +- `internal/volsnap/extract_test.go`, `internal/volsnap/restore_test.go` — unit tests (new) +- `go.mod` — `golang.org/x/sys` promoted indirect→direct (already present v0.33.0) + +## Acceptance Criteria + +- Zip-slip (`../`, absolute, `..\\` on win), symlink, hardlink, device, fifo entries all rejected by `safeExtractIndex`. +- Decompression-bomb cap aborts extraction + sizing past the cap. +- Happy-path extract round-trip restores file tree + contents byte-for-byte under `dest`. +- `swapVolumeDir` + `rollbackSwaps`: full and PARTIAL-swap rollback leave the original live dirs byte-identical. +- `preflightResolve` is all-or-nothing: one unresolvable/unsupported-scope volume → error, and the caller renames nothing. +- `archiveUncompressedSize` matches the real extracted total. +- `go test ./internal/volsnap/...`, `go build ./...`, `go vet ./internal/...` all green. + +## Notes + +- Open the archive once per pass; on Unix an open fd survives a concurrent `Delete` unlink (defence against a racing snapshot delete); Windows refuses delete of an open file. Acceptable. +- `safeExtractIndex` writes into a caller-provided `dest` (the staging `tmp`), never directly onto the live path — the swap is a separate step (C2). + +## Review Checklist + +- [ ] All tasks completed +- [ ] Code follows project conventions (gofmt, wrapped errors, small funcs) +- [ ] No unintended side effects (no change to Create/List/Delete) +- [ ] Build passes +- [ ] Tests pass (new + existing) + +## Handoff to Next Phase + +Implemented files: `extract.go` (`safeExtractIndex`, `stripIndexPrefix`, `leadingIndex`, +`withinDir`), `restore.go` (`parseManifest`, `preflightResolve`, `archiveUncompressedSize`, +`swap`/`swapVolumeDir`/`rollbackSwaps`/`stagingDirs`, consts `maxRestoreUncompressedBytes` += 50 GiB, `diskFreeHeadroomBytes` = 256 MiB), `disk_unix.go`/`disk_windows.go` +(`freeDiskBytes`). Tests in `extract_test.go` + `restore_test.go`. `go.mod`: `x/sys` → +direct. + +**API contract for Phase 2 (Engine.Restore):** +- `safeExtractIndex(archivePath, index, dest, bombCap)` — extracts ONE volume's subtree into + a FRESH `dest` (uses `O_EXCL`); returns bytes written. Call once per resolved volume into + its `tmp` staging dir. +- `preflightResolve(w, settings, manifest)` → `[]resolvedVol{Index,Target,Scope,LivePath}`, + ALL-OR-NOTHING; already rejects unsupported scopes AND negative indices. Run BEFORE + Lock/StopContainers. +- `stagingDirs(live, token, index)` → `(tmp, old)` siblings of `filepath.Dir(live)` (same-fs + ⇒ atomic rename). Use a per-restore `token`. +- `swapVolumeDir(live, tmp, old)` → `(hadOld, err)`; self-reverts the first rename on failure + (live never left missing). Collect each completed swap into `[]swap{live,old,tmp,hadOld}` + and call `rollbackSwaps(done)` on any later failure. +- `archiveUncompressedSize(archivePath, bombCap)` → `(perIndex map[int]int64, total, err)` + for the C5 per-filesystem free-disk check. NOTE: it's a LOWER-BOUND (ignores dir/inode + overhead) — treat as advisory; the staged-extract+swap is the real net. +- `freeDiskBytes(path)` — pass the live dir's PARENT (where tmp/old land). + +**Phase 2 must:** extract ALL tmp dirs first, THEN swap all (shrinks the destructive +window); validate each manifest index maps to an existing archive subtree (W2 — only the +negative check is done so far); the disk pre-check should sum per-target-filesystem. + +**Review (go-reviewer, APPROVE WITH NOTES):** no blockers. Addressed in-phase: W2 (negative +index reject), W3 (explicit second-rename self-revert test), W4 (stagingDirs test), N1/N2/N4 +(comments + sparse-type rejection test). W1 (disk estimate is lower-bound) folded into Phase +2 guidance above. diff --git a/plans/volume-snapshot-restore/phase-2-lifecycle-locking.md b/plans/volume-snapshot-restore/phase-2-lifecycle-locking.md new file mode 100644 index 0000000..a050a13 --- /dev/null +++ b/plans/volume-snapshot-restore/phase-2-lifecycle-locking.md @@ -0,0 +1,106 @@ +# Phase 2: Engine.Restore orchestration + lifecycle/locking + rollback + +**Status:** ✅ Complete +**Parent plan:** [PLAN.md](./PLAN.md) +**Domain:** backend + +## Objective + +Wire the Phase 1 primitives into the full **stop → swap → redeploy** sequence under a +per-workload lock, with crash-safe rollback (journal + recovery sweep) and a durable +pre-restore auto-capture. Define the `Lifecycle` seam; modify the Deployer for per-workload +locking + an unlocked redeploy. + +## Tasks + +- [ ] **`internal/keyedmutex/keyedmutex.go`** — extract the `gitops.go` pattern into a shared + package: `type Mutex` with `Lock(key string) func()` and `TryLock(key string) (func(), bool)` + (the Try variant serves the Phase 3 API single-flight → 409). Unit test both. +- [ ] **Deployer locking (C1)** in `internal/deployer/`: + - add `workloadLocks keyedmutex.Mutex` field. + - refactor `DispatchPlugin` → `unlock := d.workloadLocks.Lock(w.ID); defer unlock(); return d.dispatchLocked(ctx, w, intent)`; move the current body into unexported `dispatchLocked`. + - wrap `DispatchTeardown` in the same per-workload lock. + - do NOT lock `DispatchReconcile` (periodic; image Reconcile is a no-op; reconciler `markMissingRows` only flips labels = benign; locking it would stall the reconcile loop behind long deploys). + - expose `func (d *Deployer) LockWorkload(id string) func()` and `func (d *Deployer) RedeployLocked(ctx, w, intent) error` (= `dispatchLocked`, doc: "caller already holds the workload lock; calling DispatchPlugin would deadlock"). +- [ ] **`volsnap.Lifecycle` interface** (in volsnap): + - `Lock(workloadID string) func()` + - `StopContainers(ctx, workloadID string) (runningTag string, err error)` — stop every running container for the workload; return the **newest-running** container's `ImageTag` (so redeploy pins the same version; empty ⇒ source default). Mark stopped rows `State="stopped"`. + - `Redeploy(ctx, w store.Workload, reference string) error` — unlocked re-dispatch, Reason `"restore"`, Reference=tag. +- [ ] **`Engine.Restore(ctx, snapshotID, workloadID string) error`** in `internal/volsnap/restore.go` + (engine owns it). Sequence — **does NOT hold `e.mu`** (R1): + 1. load snap; verify `snap.WorkloadID == workloadID`; load workload + settings; require `source_kind=="image"`. + 2. `parseManifest`; `preflightResolve` (C3 — abort if any fails); `archiveUncompressedSize` + per-filesystem `freeDiskBytes` pre-check (C5/R4 — abort). + 3. `unlock := lc.Lock(workloadID); defer unlock()` (C1). + 4. **re-validate** the workload still exists (R4 — teardown may have won the lock); abort if gone. + 5. `tag, _ := lc.StopContainers(ctx, workloadID)` (C4 stop). + 6. **durably** capture pre-restore snapshot: `e.Create(w, settings, "pre-restore")` (folded; AFTER stop = quiesced; BEFORE any rename = R3). `Create` takes its own `e.mu` — Restore must hold none. + 7. write **restore journal** `/restore-.json` (snapshotID, per-volume {live, old, tmp, swapped:false}). + 8. **extract ALL** volumes to their `tmp` staging dirs (`safeExtractIndex`) — R3 (shrinks the destructive window to pure renames). + 9. **swap** each volume (`swapVolumeDir`), updating the journal `swapped=true` per volume. + 10. on ANY error in 8–9 → `rollbackSwaps` + `lc.Redeploy(ctx, w, tag)` + delete journal + return wrapped error. + 11. success → `lc.Redeploy(ctx, w, tag)` (C4 redeploy); remove `.old` staging dirs (reclaim disk); delete journal; best-effort audit event (`store.InsertEvent` source `"volsnap"`). + - `Engine.SetLifecycle(lc Lifecycle)` setter; `Restore` errors clearly if lifecycle is nil. +- [ ] **`Engine.RecoverInterruptedRestores() (int, error)`** (R3) — startup sweep, mirrors + `CleanOrphans`: for each `restore-*.json` journal, per volume: if `swapped` → remove `old`+`tmp`; + else if live missing && old exists → rename old→live (revert mid-rename crash), remove tmp; + else (live present, not swapped) → remove tmp. Delete journal. Log loudly. (Wiring at startup + happens in Phase 3's main.go change, beside `CleanOrphans`.) + +## Files to Modify/Create + +- `internal/keyedmutex/keyedmutex.go` (+ `_test.go`) — shared lock (new) +- `internal/deployer/deployer.go`, `internal/deployer/dispatch.go` — workloadLocks, dispatchLocked, LockWorkload, RedeployLocked, locked Teardown +- `internal/volsnap/restore.go` — Lifecycle interface, Engine.Restore, RecoverInterruptedRestores, SetLifecycle, journal type +- `internal/volsnap/restore_test.go` — fake-Lifecycle orchestration tests (extends Phase 1 file) +- `internal/api/gitops.go` — (optional, low-risk) migrate `keyedMutex`→`keyedmutex.Mutex` for DRY + +## Acceptance Criteria + +- Lock re-entrancy: `Engine.Restore` → `RedeployLocked` does NOT re-acquire the workload lock (no deadlock). All existing deployer tests still pass (lock is externally transparent). +- **Happy-path orchestration test uses the REAL `Engine.Create` (real store + `t.TempDir()`)** for the pre-restore capture so the `e.mu` deadlock (R1) would fail `go test`, not prod. Asserts call order: preflight → lock → stop → create → extract-all → swap-all → redeploy → cleanup. +- Rollback test: a swap fails midway → originals restored, redeploy called, journal deleted, error returned. +- Preflight-fail test: lock/stop NEVER called (abort before lock). +- Disk-pre-check-fail test: abort before lock. +- `RecoverInterruptedRestores` test: simulate journals in each crash state → correct revert/keep/cleanup. +- `go build ./...`, `go vet ./internal/...`, `go test ./internal/...` green. + +## Notes + +- ⚠️ The Deployer lock change touches the hot deploy path — verify no existing path re-enters `DispatchPlugin` under a held lock (webhook preview = sequential teardown-then-deploy on the child, not nested — confirmed safe). +- The API single-flight (Phase 3) is a fast 409 reject; the deployer lock is the real mutex — they compose (document). + +## Review Checklist + +- [ ] All tasks completed +- [ ] Code follows project conventions +- [ ] No unintended side effects (existing deploy/teardown behavior unchanged externally) +- [ ] Build passes +- [ ] Tests pass (new + existing) + +## Handoff to Next Phase + +Implemented: `internal/keyedmutex` (Lock+TryLock, tested); deployer `workloadLocks` + +`dispatchLocked` + `LockWorkload` + `RedeployLocked`, `DispatchPlugin`/`DispatchTeardown` +now per-workload-locked (reconciler intentionally NOT). `volsnap.Lifecycle` interface, +`Engine.Restore`, `restoreJournal` (atomic write — W1), `RecoverInterruptedRestores`, +`recoverVolume`, `checkDiskSpace`, `SetLifecycle`. Tests: `restore_engine_test.go` +(happy/real-Create, redeploy-fail, preflight-abort, extract-fail-after-lock, nil-lifecycle, +wrong-workload, recovery×3 states), `keyedmutex_test.go`. Full `go test ./internal/...` green. + +**Review (go-reviewer, APPROVE WITH NOTES):** no functional blockers in this diff. Verified: +no lock re-entrancy/`e.mu` self-deadlock, no prune-race (extract-all precedes `e.Create`), +recovery state machine doesn't revert good data. Addressed in-phase: W1 (atomic journal), +W3 (extract-failure orchestration test). Residual W3 (mid-swap fault injection) accepted. + +**🔴 HARD PREREQUISITES for Phase 3 (B1 + N1 from review):** +1. Wire `snapshotEngine.RecoverInterruptedRestores()` at startup in `cmd/server/main.go`, + BEFORE the API server serves — beside the existing `CleanOrphans()` call (~main.go:333). + Without it the journal/WAL protects nothing — a crash mid-restore is unrecovered. +2. Wire `snapshotEngine.SetLifecycle(adapter)` strictly BEFORE serving (same place as + `SetSnapshotEngine`) so the `e.lifecycle` field is safely published (no race). +3. The restore endpoint MUST NOT be reachable until both are wired. + +**Lifecycle adapter (Phase 3, main.go) maps:** `Lock`→`deployer.LockWorkload`; +`StopContainers`→`store.ListContainersByWorkload` + `docker.StopContainer` each running + +`UpdateContainerState(...,"stopped")` + return newest-running `ImageTag`; +`Redeploy`→`deployer.RedeployLocked` with a `restore`-reason intent (Reference=tag). diff --git a/plans/volume-snapshot-restore/phase-3-api.md b/plans/volume-snapshot-restore/phase-3-api.md new file mode 100644 index 0000000..38116ea --- /dev/null +++ b/plans/volume-snapshot-restore/phase-3-api.md @@ -0,0 +1,76 @@ +# Phase 3: API endpoint + CSRF header + single-flight + wiring + tests + +**Status:** ✅ Complete +**Parent plan:** [PLAN.md](./PLAN.md) +**Domain:** backend + +## Objective + +Expose restore over HTTP behind the destructive-action guards, and wire the real +`Lifecycle` adapter + startup recovery sweep at the composition root. + +## Tasks + +- [ ] **`restoreWorkloadSnapshot` handler** in `internal/api/volume_snapshots.go`: + `POST /api/workloads/{id}/snapshots/{sid}/restore`. + - require `X-Confirm-Restore: ` header == path `sid` (C6, mirror `backups.go` `restoreBackup`); mismatch → 400. + - per-workload single-flight via `keyedmutex.TryLock(id)` (or a `sync.Map` LoadOrStore) → 409 if a restore for this workload is already running; release on completion. (Different apps restore concurrently.) + - load snapshot via `snapshotEngine.Get(sid)`; verify `snap.WorkloadID == id`; load workload; require `source_kind=="image"` (else 400). + - call `s.snapshotEngine.Restore(r.Context(), id, sid)`. Synchronous (mirrors `deployPluginWorkload` which blocks on dispatch). `ErrNoSnapshotData`-class / client-actionable → 400; success → 200 `{"status":"restored",...}`; other → 500 with a generic message (no internal detail leak — mirror existing handlers; raw error to `slog`). +- [ ] **Route registration** in `internal/api/router.go` under the admin group on `/workloads/{id}`, beside the existing `POST /snapshots`. Admin-gated. +- [ ] **Lifecycle adapter + recovery wiring** in `cmd/server/main.go` (composition root — already + imports deployer + docker + store + volsnap): + - a small struct implementing `volsnap.Lifecycle`: `Lock`→`deployer.LockWorkload`; `StopContainers`→`store.ListContainersByWorkload` + `docker.StopContainer` each running + `store.UpdateContainerState(...,"stopped")` + return newest-running `ImageTag`; `Redeploy`→`deployer.RedeployLocked` with `toPluginWorkload`/`WorkloadFromStore` + a `restore`-reason intent. + - `snapshotEngine.SetLifecycle(adapter)`. + - call `snapshotEngine.RecoverInterruptedRestores()` at startup beside the existing `CleanOrphans()` call; log result. + - keeps the `api` package decoupled from `deployer` (router.go deliberately avoids importing deployer). + +## Files to Modify/Create + +- `internal/api/volume_snapshots.go` — `restoreWorkloadSnapshot` handler + single-flight field on `Server` +- `internal/api/router.go` — route registration +- `cmd/server/main.go` — Lifecycle adapter, `SetLifecycle`, `RecoverInterruptedRestores` startup wiring +- `internal/api/volume_snapshots_test.go` — handler tests (extends existing file) + +## Acceptance Criteria + +- Missing/mismatched `X-Confirm-Restore` → 400. +- Concurrent second restore for the same workload → 409. +- Non-image workload → 400; snapshot belonging to another workload → 400/404. +- Happy path → 200 (with a fake engine/lifecycle or a seeded real engine). +- `go build ./...`, `go vet ./internal/...`, `go test ./internal/...` green; existing snapshot tests pass. + +## Notes + +- The handler is thin: validation + single-flight + delegate to `Engine.Restore`. All the dangerous logic stays in the engine (folded requirement). +- Confirm the admin group + `X-Confirm-Restore` together match the DB-restore threat model (custom header defeats CSRF form/img posts; admin JWT + AdminOnly gates authz). + +## Review Checklist + +- [ ] All tasks completed +- [ ] Code follows project conventions +- [ ] No unintended side effects +- [ ] Build passes +- [ ] Tests pass (new + existing) + +## Handoff to Next Phase + +Implemented: `restoreWorkloadSnapshot` handler (`internal/api/volume_snapshots.go`) — admin, +`X-Confirm-Restore: ` header, per-workload single-flight (`volRestoreInFlight.TryLock` +→ 409); route `POST /api/workloads/{id}/snapshots/{sid}/restore`; `restoreLifecycle` adapter +(`cmd/server/restore_lifecycle.go`); main.go wires `SetLifecycle` + `RecoverInterruptedRestores` +BEFORE serving (B1/N1 resolved). Tests: header miss/mismatch→400, wrong-workload→400, +non-image→400, not-found→404, happy-path→200, single-flight→409. + +**SECURITY FIX (review BLOCK → resolved):** the security review caught a CRITICAL — the +manifest's persisted `Source`/`Scope` (attacker-influenceable) was being trusted to compute +the destructive swap target, so `Source:"../../etc"` could clobber `/etc`. Fixed by +re-deriving the swap target from the workload's CURRENT config keyed by container Target path +(`volumesByTarget` shared helper) + a `pathWithinBase` containment assertion for base-relative +scopes. Regression guards: `TestPreflightResolve_IgnoresManifestSource`, +`TestPreflightResolve_AllOrNothing`. Both reviews now clear. + +**API for Phase 4 (frontend):** `POST /api/workloads/{id}/snapshots/{sid}/restore` with header +`X-Confirm-Restore: ` (REQUIRED — mirror how the DB restore sends it). 200 `{status:"restored"}`, +400 (header/ownership/non-image), 404 (not found), 409 (already in progress), 500 (engine). +Restore is synchronous (blocks until done, like deploy). diff --git a/plans/volume-snapshot-restore/phase-4-frontend.md b/plans/volume-snapshot-restore/phase-4-frontend.md new file mode 100644 index 0000000..6d67d3e --- /dev/null +++ b/plans/volume-snapshot-restore/phase-4-frontend.md @@ -0,0 +1,70 @@ +# Phase 4: UI Restore button + ConfirmDialog + i18n en+ru + +**Status:** ⬜ Not Started +**Parent plan:** [PLAN.md](./PLAN.md) +**Domain:** frontend + +Built by the **frontend implementer agent**. Must follow project conventions: Svelte 5 +runes, `ConfirmDialog` for the destructive action (NEVER `window.confirm`), `$t` with +**en+ru parity**, the existing `.panel`/`.forge-btn-ghost` vocabulary in +`WorkloadSnapshotsPanel.svelte`. + +## Tasks + +- [ ] **`web/src/lib/api.ts`**: `restoreSnapshot(workloadId: string, sid: string): Promise` + — POST `/api/workloads/${workloadId}/snapshots/${sid}/restore` with header + `X-Confirm-Restore: ${sid}`. Check how the DB restore sends its `X-Confirm-Restore` header + and reuse that fetch mechanism (the typed `post` may need a header-capable variant or a + raw `fetch` like `download` already uses). +- [ ] **`WorkloadSnapshotsPanel.svelte`**: + - add a **Restore** action per snapshot row (beside Download/Delete) → opens `ConfirmDialog`. + - ConfirmDialog: strong destructive copy — title + message making clear it **overwrites + live data and restarts the app**, and that a **pre-restore snapshot is auto-captured**; + `confirmVariant="danger"`. + - on confirm: call `restoreSnapshot`, show a "restoring…" busy state (disable row actions), + toast success/failure, then `load()` to refresh. + - update the file's top comment (currently "Restore is intentionally NOT here yet") to + reflect that restore now ships. +- [ ] **i18n**: add `apps.detail.snapshots.restore`, `.restoring`, `.restored`, + `.restoreFailed`, `.confirmRestoreTitle`, `.confirmRestoreMessage` to BOTH + `web/src/lib/i18n/en.json` and `web/src/lib/i18n/ru.json`. Verify parity manually (a missing + key is NOT a build error — `$t` returns the key string). + +## Files to Modify/Create + +- `web/src/lib/api.ts` — `restoreSnapshot` +- `web/src/lib/components/WorkloadSnapshotsPanel.svelte` — Restore button + ConfirmDialog + busy state +- `web/src/lib/i18n/en.json`, `web/src/lib/i18n/ru.json` — restore keys (parity) + +## Acceptance Criteria + +- `npm run check` 0 errors; `npm run build` succeeds; `npm run test` green. +- en/ru key parity equal (every new key in both files). +- ConfirmDialog used (no native confirm/alert); danger variant; copy warns about data overwrite + app restart. +- Restore button disabled while a restore is in flight. +- Restart dev server (`./scripts/dev-server.sh`). + +## Notes + +- Only image-source workloads expose snapshots, so no source-kind gating is needed in the panel beyond what already exists. +- Keep the Restore button visually subordinate to Download but clearly destructive (danger styling) — it's the most dangerous action in the panel. + +## Review Checklist + +- [ ] All tasks completed +- [ ] Code follows project conventions (ToggleSwitch/ConfirmDialog rules, runes) +- [ ] No unintended side effects +- [ ] Build passes (`npm run check` + `npm run build`) +- [ ] Tests pass; i18n parity verified + +## Handoff to Next Phase + +Implemented: `api.restoreSnapshot(workloadId, sid)` (POST + `X-Confirm-Restore` header, +mirrors `restoreBackup`); `WorkloadSnapshotsPanel.svelte` Restore button per row → +`ConfirmDialog` (danger, warns: overwrites live data + restarts app + auto pre-restore +snapshot) → `doRestore` with busy state (`restoringId` disables all row actions, active row +shows "Restoring…"); i18n `apps.detail.snapshots.restore*` in en+ru (parity verified). + +Verify: `npm run check` 0 errors, `npm run build` OK, `npm run test` 26 pass; i18n parity +equal; dev server restarted on :9000. typescript-reviewer: APPROVE (no blockers; one cosmetic +wording note on the success toast addressed). Final phase — done. diff --git a/web/src/lib/api.ts b/web/src/lib/api.ts index bbaa086..def9afe 100644 --- a/web/src/lib/api.ts +++ b/web/src/lib/api.ts @@ -579,8 +579,8 @@ export function backupDownloadUrl(id: string): string { } // ── Volume Snapshots ─────────────────────────────────────────────── -// Per-workload archives of host-bind data volumes. Capture-only for now -// (create/list/delete/download); restore is a separate later phase. +// Per-workload archives of host-bind data volumes: create/list/delete/ +// download, plus restore (overwrites live data + restarts the app). export interface SnapshotInfo { id: string; @@ -629,6 +629,20 @@ export function snapshotDownloadUrl(sid: string): string { return `/api/snapshots/${sid}/download`; } +export function restoreSnapshot( + workloadId: string, + sid: string +): Promise<{ status: string; workload_id: string; snapshot_id: string }> { + // X-Confirm-Restore echoes the snapshot id (same CSRF guard as the DB + // restore): the backend rejects any POST whose header doesn't match the + // path param, defeating blind cross-origin POSTs that can't set custom + // headers without a preflight. Sent alongside the bearer JWT. + return request(`/api/workloads/${workloadId}/snapshots/${sid}/restore`, { + method: 'POST', + headers: { 'X-Confirm-Restore': sid } + }); +} + // ── Health ────────────────────────────────────────────────────────── export function getHealth(): Promise<{ docker: DockerHealth; proxy?: ProxyHealth }> { diff --git a/web/src/lib/components/WorkloadSnapshotsPanel.svelte b/web/src/lib/components/WorkloadSnapshotsPanel.svelte index 028d48b..06861ea 100644 --- a/web/src/lib/components/WorkloadSnapshotsPanel.svelte +++ b/web/src/lib/components/WorkloadSnapshotsPanel.svelte @@ -3,8 +3,9 @@ * WorkloadSnapshotsPanel * * Per-workload capture of host-bind data volumes (tar.gz). Create / list / - * download / delete. Restore is intentionally NOT here yet — overwriting - * live data needs container quiesce + atomic swap and ships separately. + * download / delete / restore. Restore overwrites the app's live volume data + * and recreates its containers — it quiesces the app, atomically swaps each + * volume dir, then redeploys, and auto-captures a pre-restore snapshot first. * * "Snapshotable" coverage is shown up-front (and which volumes are skipped, * with why) so users are never misled about what is actually captured. @@ -29,6 +30,8 @@ let error = $state(''); let label = $state(''); let confirmDeleteId = $state(null); + let confirmRestoreId = $state(null); + let restoringId = $state(null); const canSnapshot = $derived((snapshotable?.volumes.length ?? 0) > 0); @@ -81,6 +84,20 @@ } } + async function doRestore(id: string): Promise { + confirmRestoreId = null; + restoringId = id; + try { + await api.restoreSnapshot(workloadId, id); + toasts.success($t('apps.detail.snapshots.restored')); + await load(); + } catch (e) { + toasts.error(e instanceof Error ? e.message : $t('apps.detail.snapshots.restoreFailed')); + } finally { + restoringId = null; + } + } + async function download(snap: api.SnapshotInfo): Promise { try { const token = getAuthToken(); @@ -194,12 +211,26 @@ {volCount(snap.manifest)} {formatBytes(snap.size_bytes)} - +