feat(volsnap): volume snapshot restore (backlog #6)
Restore a captured volume snapshot onto an image workload's live host-bind
data volumes, then redeploy — the most destructive workload action, built to
the adversarially-reviewed design (C1–C6) with all data-loss guards.
- Engine.Restore (engine-owned): all-or-nothing pre-flight re-resolution from
the workload's CURRENT config (never the tamperable manifest), per-filesystem
disk pre-check, per-workload lock, container quiesce, extract-to-tmp, durable
pre-restore snapshot, write-ahead journal, atomic rename swap, redeploy, and
crash-recovery sweep (RecoverInterruptedRestores) wired before serving.
- internal/keyedmutex: shared per-key lock; deployer now serializes every
deploy entrypoint per workload via DispatchPlugin (+ LockWorkload/RedeployLocked
for the restore re-dispatch, no deadlock).
- Untrusted-archive extractor: zip-slip containment, type allow-list (reg/dir
only), decompression-bomb cap, manifest-index bounds.
- POST /api/workloads/{id}/snapshots/{sid}/restore: admin, X-Confirm-Restore
header (CSRF), per-workload single-flight (409).
- WebUI: Restore button + danger ConfirmDialog + busy state + i18n (en/ru).
Scope: image-source workloads only; supported volume scopes are absolute/stage/project
(driven off the same supportedScopes constant that capture uses).
Plan-reviewed before coding; per-phase go/security/ts reviews; final review
READY TO MERGE. Security review caught + fixed a CRITICAL manifest-Source path
traversal (re-derive target from current config + base containment).
Plan: plans/volume-snapshot-restore/
This commit is contained in:
@@ -14,6 +14,7 @@ import (
|
||||
"github.com/alexei/tinyforge/internal/dns"
|
||||
"github.com/alexei/tinyforge/internal/docker"
|
||||
"github.com/alexei/tinyforge/internal/events"
|
||||
"github.com/alexei/tinyforge/internal/keyedmutex"
|
||||
"github.com/alexei/tinyforge/internal/notify"
|
||||
"github.com/alexei/tinyforge/internal/npm"
|
||||
"github.com/alexei/tinyforge/internal/proxy"
|
||||
@@ -56,6 +57,11 @@ type Server struct {
|
||||
// two concurrent syncs can't race on source_config (review S5).
|
||||
gitopsSync keyedMutex
|
||||
|
||||
// volRestoreInFlight is a per-workload single-flight guard for volume
|
||||
// snapshot restore: a concurrent restore of the same workload is rejected
|
||||
// fast with 409 (TryLock) rather than queuing behind the deployer lock.
|
||||
volRestoreInFlight keyedmutex.Mutex
|
||||
|
||||
dnsProviderMu sync.RWMutex
|
||||
dnsProvider dns.Provider
|
||||
onDNSProviderChanged DNSProviderChangedFunc
|
||||
@@ -359,6 +365,10 @@ func (s *Server) Router() chi.Router {
|
||||
r.With(auth.AdminOnly).Get("/snapshots", s.listWorkloadSnapshots)
|
||||
r.With(auth.AdminOnly).Get("/snapshotable", s.getWorkloadSnapshotable)
|
||||
r.With(auth.AdminOnly).Post("/snapshots", s.createWorkloadSnapshot)
|
||||
// Restore overwrites live volume data and restarts the app — the
|
||||
// most destructive workload action. Admin-gated + X-Confirm-Restore
|
||||
// header (CSRF) + per-workload single-flight, mirroring DB restore.
|
||||
r.With(auth.AdminOnly).Post("/snapshots/{sid}/restore", s.restoreWorkloadSnapshot)
|
||||
|
||||
// Runtime view: per-source persisted state + storage usage.
|
||||
// Read-only; safe for any authenticated user.
|
||||
|
||||
@@ -140,6 +140,72 @@ func (s *Server) deleteSnapshot(w http.ResponseWriter, r *http.Request) {
|
||||
respondJSON(w, http.StatusOK, map[string]string{"status": "deleted"})
|
||||
}
|
||||
|
||||
// restoreWorkloadSnapshot handles POST /api/workloads/{id}/snapshots/{sid}/restore.
|
||||
//
|
||||
// This is the most destructive workload action: it overwrites the app's live
|
||||
// volume data with the snapshot and recreates its containers. It is guarded like
|
||||
// the DB restore — admin-only, an X-Confirm-Restore header that must echo the
|
||||
// snapshot id (defeats CSRF form/img posts, which can't set custom headers), and
|
||||
// a per-workload single-flight so a double-click can't stack two restores. All
|
||||
// the dangerous lock/stop/swap/redeploy logic lives in Engine.Restore; this
|
||||
// handler only validates and delegates.
|
||||
func (s *Server) restoreWorkloadSnapshot(w http.ResponseWriter, r *http.Request) {
|
||||
if s.snapshotEngine == nil {
|
||||
respondError(w, http.StatusServiceUnavailable, "snapshot engine not initialized")
|
||||
return
|
||||
}
|
||||
id := chi.URLParam(r, "id")
|
||||
sid := chi.URLParam(r, "sid")
|
||||
|
||||
if confirm := r.Header.Get("X-Confirm-Restore"); confirm != sid {
|
||||
respondError(w, http.StatusBadRequest,
|
||||
"missing or mismatched X-Confirm-Restore header (must equal snapshot id)")
|
||||
return
|
||||
}
|
||||
|
||||
// Up-front validation for precise client errors (Engine.Restore re-checks
|
||||
// ownership + source kind under the lock).
|
||||
snap, err := s.snapshotEngine.Get(sid)
|
||||
if err != nil {
|
||||
respondError(w, http.StatusNotFound, "snapshot not found")
|
||||
return
|
||||
}
|
||||
if snap.WorkloadID != id {
|
||||
respondError(w, http.StatusBadRequest, "snapshot does not belong to this workload")
|
||||
return
|
||||
}
|
||||
row, ok := s.loadWorkload(w, id)
|
||||
if !ok {
|
||||
return
|
||||
}
|
||||
if row.SourceKind != "image" {
|
||||
respondError(w, http.StatusBadRequest, "restore is only supported for image-source workloads")
|
||||
return
|
||||
}
|
||||
|
||||
// Per-workload single-flight: reject a concurrent restore of the SAME
|
||||
// workload with 409 rather than queuing it behind the deployer lock.
|
||||
release, ok := s.volRestoreInFlight.TryLock(id)
|
||||
if !ok {
|
||||
respondError(w, http.StatusConflict, "a restore is already in progress for this workload")
|
||||
return
|
||||
}
|
||||
defer release()
|
||||
|
||||
if err := s.snapshotEngine.Restore(r.Context(), sid, id); err != nil {
|
||||
// Raw error (which can carry resolved host paths) stays in the log; the
|
||||
// client gets a generic message.
|
||||
slog.Error("snapshots: restore failed", "workload", id, "snapshot", sid, "error", err)
|
||||
respondError(w, http.StatusInternalServerError, "restore failed; see server logs")
|
||||
return
|
||||
}
|
||||
respondJSON(w, http.StatusOK, map[string]any{
|
||||
"status": "restored",
|
||||
"workload_id": id,
|
||||
"snapshot_id": sid,
|
||||
})
|
||||
}
|
||||
|
||||
// downloadSnapshot handles GET /api/snapshots/{sid}/download, streaming the
|
||||
// tar.gz archive. The resolved path is containment-checked against the
|
||||
// snapshot directory.
|
||||
|
||||
@@ -1,12 +1,15 @@
|
||||
package api
|
||||
|
||||
import (
|
||||
"context"
|
||||
"io"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"sync"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/alexei/tinyforge/internal/auth"
|
||||
"github.com/alexei/tinyforge/internal/store"
|
||||
@@ -53,7 +56,211 @@ func newSnapshotEnv(t *testing.T) (*apiTestEnv, string) {
|
||||
t.Fatalf("update settings: %v", err)
|
||||
}
|
||||
|
||||
return &apiTestEnv{srv: httpsrv, store: st, dispatcher: dispatcher, adminToken: tok.Token, encKey: encKey}, baseVol
|
||||
return &apiTestEnv{srv: httpsrv, store: st, dispatcher: dispatcher, adminToken: tok.Token, encKey: encKey, snapEngine: snapEng}, baseVol
|
||||
}
|
||||
|
||||
// doRestore issues an authenticated restore POST, optionally setting the
|
||||
// X-Confirm-Restore header (pass confirm="" to omit it).
|
||||
func (e *apiTestEnv) doRestore(t *testing.T, workloadID, sid, confirm string) *http.Response {
|
||||
t.Helper()
|
||||
req, err := http.NewRequest(http.MethodPost,
|
||||
e.srv.URL+"/api/workloads/"+workloadID+"/snapshots/"+sid+"/restore", nil)
|
||||
if err != nil {
|
||||
t.Fatalf("new request: %v", err)
|
||||
}
|
||||
req.Header.Set("Authorization", "Bearer "+e.adminToken)
|
||||
if confirm != "" {
|
||||
req.Header.Set("X-Confirm-Restore", confirm)
|
||||
}
|
||||
resp, err := http.DefaultClient.Do(req)
|
||||
if err != nil {
|
||||
t.Fatalf("do request: %v", err)
|
||||
}
|
||||
return resp
|
||||
}
|
||||
|
||||
// okLifecycle is a no-op volsnap.Lifecycle for HTTP-layer happy-path tests; the
|
||||
// deep restore behavior is covered by the volsnap engine tests.
|
||||
type okLifecycle struct{ tag string }
|
||||
|
||||
func (l *okLifecycle) Lock(string) func() { return func() {} }
|
||||
func (l *okLifecycle) StopContainers(context.Context, string) (string, error) { return l.tag, nil }
|
||||
func (l *okLifecycle) Redeploy(context.Context, store.Workload, string) error { return nil }
|
||||
|
||||
func TestRestoreSnapshot_RequiresConfirmHeader(t *testing.T) {
|
||||
e, _ := newSnapshotEnv(t)
|
||||
w, _ := e.store.CreateWorkload(store.Workload{Name: "a", Kind: "project", SourceKind: "image", SourceConfig: `{"image":"x","port":80}`})
|
||||
snap, _ := e.store.CreateVolumeSnapshot(store.VolumeSnapshot{WorkloadID: w.ID, Filename: "f.tar.gz", Manifest: "[]"})
|
||||
|
||||
// Missing header → 400.
|
||||
resp := e.doRestore(t, w.ID, snap.ID, "")
|
||||
if resp.StatusCode != http.StatusBadRequest {
|
||||
t.Fatalf("missing header status = %d, want 400", resp.StatusCode)
|
||||
}
|
||||
resp.Body.Close()
|
||||
// Mismatched header → 400.
|
||||
resp = e.doRestore(t, w.ID, snap.ID, "not-the-sid")
|
||||
if resp.StatusCode != http.StatusBadRequest {
|
||||
t.Fatalf("mismatched header status = %d, want 400", resp.StatusCode)
|
||||
}
|
||||
resp.Body.Close()
|
||||
}
|
||||
|
||||
func TestRestoreSnapshot_WrongWorkload(t *testing.T) {
|
||||
e, _ := newSnapshotEnv(t)
|
||||
w, _ := e.store.CreateWorkload(store.Workload{Name: "a", Kind: "project", SourceKind: "image", SourceConfig: `{"image":"x","port":80}`})
|
||||
snap, _ := e.store.CreateVolumeSnapshot(store.VolumeSnapshot{WorkloadID: w.ID, Filename: "f.tar.gz", Manifest: "[]"})
|
||||
|
||||
resp := e.doRestore(t, "some-other-workload", snap.ID, snap.ID)
|
||||
if resp.StatusCode != http.StatusBadRequest {
|
||||
t.Fatalf("cross-workload restore status = %d, want 400", resp.StatusCode)
|
||||
}
|
||||
resp.Body.Close()
|
||||
}
|
||||
|
||||
func TestRestoreSnapshot_NonImageWorkload(t *testing.T) {
|
||||
e, _ := newSnapshotEnv(t)
|
||||
w, _ := e.store.CreateWorkload(store.Workload{Name: "site", Kind: "project", SourceKind: "static", SourceConfig: `{}`})
|
||||
snap, _ := e.store.CreateVolumeSnapshot(store.VolumeSnapshot{WorkloadID: w.ID, Filename: "f.tar.gz", Manifest: "[]"})
|
||||
|
||||
resp := e.doRestore(t, w.ID, snap.ID, snap.ID)
|
||||
if resp.StatusCode != http.StatusBadRequest {
|
||||
t.Fatalf("non-image restore status = %d, want 400", resp.StatusCode)
|
||||
}
|
||||
resp.Body.Close()
|
||||
}
|
||||
|
||||
func TestRestoreSnapshot_NotFound(t *testing.T) {
|
||||
e, _ := newSnapshotEnv(t)
|
||||
w, _ := e.store.CreateWorkload(store.Workload{Name: "a", Kind: "project", SourceKind: "image", SourceConfig: `{"image":"x","port":80}`})
|
||||
|
||||
resp := e.doRestore(t, w.ID, "missing-sid", "missing-sid")
|
||||
if resp.StatusCode != http.StatusNotFound {
|
||||
t.Fatalf("unknown snapshot status = %d, want 404", resp.StatusCode)
|
||||
}
|
||||
resp.Body.Close()
|
||||
}
|
||||
|
||||
func TestRestoreSnapshot_HappyPath(t *testing.T) {
|
||||
e, baseVol := newSnapshotEnv(t)
|
||||
e.snapEngine.SetLifecycle(&okLifecycle{tag: "v1"})
|
||||
|
||||
w, err := e.store.CreateWorkload(store.Workload{
|
||||
Name: "data-app", Kind: "project", SourceKind: "image",
|
||||
SourceConfig: `{"image":"reg/app","port":80,"volumes":[{"source":"data","target":"/data","scope":"project"}]}`,
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("create workload: %v", err)
|
||||
}
|
||||
if _, err := e.store.SetWorkloadVolume(store.WorkloadVolume{WorkloadID: w.ID, Target: "/data", Source: "data", Scope: "project"}); err != nil {
|
||||
t.Fatalf("set volume: %v", err)
|
||||
}
|
||||
id8 := w.ID
|
||||
if len(id8) > 8 {
|
||||
id8 = id8[:8]
|
||||
}
|
||||
hostDir := filepath.Join(baseVol, "data-app-"+id8, "data")
|
||||
if err := os.MkdirAll(hostDir, 0o755); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := os.WriteFile(filepath.Join(hostDir, "payload.txt"), []byte("ORIGINAL"), 0o644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
settings, _ := e.store.GetSettings()
|
||||
snap, err := e.snapEngine.Create(w, settings, "base")
|
||||
if err != nil {
|
||||
t.Fatalf("create snapshot: %v", err)
|
||||
}
|
||||
// Drift the live data, then restore.
|
||||
if err := os.WriteFile(filepath.Join(hostDir, "payload.txt"), []byte("CHANGED"), 0o644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
resp := e.doRestore(t, w.ID, snap.ID, snap.ID)
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
body, _ := io.ReadAll(resp.Body)
|
||||
resp.Body.Close()
|
||||
t.Fatalf("restore status = %d, body=%s", resp.StatusCode, body)
|
||||
}
|
||||
resp.Body.Close()
|
||||
if got, _ := os.ReadFile(filepath.Join(hostDir, "payload.txt")); string(got) != "ORIGINAL" {
|
||||
t.Errorf("payload.txt = %q, want ORIGINAL (restored)", got)
|
||||
}
|
||||
}
|
||||
|
||||
// blockingLifecycle blocks in Lock until released, signaling when entered — so
|
||||
// a test can hold one restore in-flight and assert a second is rejected 409.
|
||||
type blockingLifecycle struct {
|
||||
entered chan struct{}
|
||||
release chan struct{}
|
||||
once sync.Once
|
||||
}
|
||||
|
||||
func (l *blockingLifecycle) Lock(string) func() {
|
||||
l.once.Do(func() { close(l.entered) })
|
||||
<-l.release
|
||||
return func() {}
|
||||
}
|
||||
func (l *blockingLifecycle) StopContainers(context.Context, string) (string, error) { return "", nil }
|
||||
func (l *blockingLifecycle) Redeploy(context.Context, store.Workload, string) error { return nil }
|
||||
|
||||
// seedRestorable creates an image workload with a project volume + live data and
|
||||
// a captured snapshot, returning the workload and snapshot ids.
|
||||
func seedRestorable(t *testing.T, e *apiTestEnv, baseVol string) (workloadID, snapshotID string) {
|
||||
t.Helper()
|
||||
w, err := e.store.CreateWorkload(store.Workload{
|
||||
Name: "sf-app", Kind: "project", SourceKind: "image",
|
||||
SourceConfig: `{"image":"reg/app","port":80,"volumes":[{"source":"data","target":"/data","scope":"project"}]}`,
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("create workload: %v", err)
|
||||
}
|
||||
id8 := w.ID
|
||||
if len(id8) > 8 {
|
||||
id8 = id8[:8]
|
||||
}
|
||||
hostDir := filepath.Join(baseVol, "sf-app-"+id8, "data")
|
||||
if err := os.MkdirAll(hostDir, 0o755); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := os.WriteFile(filepath.Join(hostDir, "f.txt"), []byte("data"), 0o644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
settings, _ := e.store.GetSettings()
|
||||
snap, err := e.snapEngine.Create(w, settings, "base")
|
||||
if err != nil {
|
||||
t.Fatalf("create snapshot: %v", err)
|
||||
}
|
||||
return w.ID, snap.ID
|
||||
}
|
||||
|
||||
func TestRestoreSnapshot_SingleFlight409(t *testing.T) {
|
||||
e, baseVol := newSnapshotEnv(t)
|
||||
wid, sid := seedRestorable(t, e, baseVol)
|
||||
bl := &blockingLifecycle{entered: make(chan struct{}), release: make(chan struct{})}
|
||||
e.snapEngine.SetLifecycle(bl)
|
||||
|
||||
// Restore #1: passes validation, takes the single-flight, then blocks inside
|
||||
// the engine's Lock.
|
||||
go func() {
|
||||
resp := e.doRestore(t, wid, sid, sid)
|
||||
resp.Body.Close()
|
||||
}()
|
||||
|
||||
select {
|
||||
case <-bl.entered:
|
||||
case <-time.After(3 * time.Second):
|
||||
t.Fatal("first restore never reached the lifecycle lock")
|
||||
}
|
||||
|
||||
// Restore #2 for the same workload must be rejected fast with 409.
|
||||
resp := e.doRestore(t, wid, sid, sid)
|
||||
got := resp.StatusCode
|
||||
resp.Body.Close()
|
||||
close(bl.release) // let #1 finish
|
||||
if got != http.StatusConflict {
|
||||
t.Fatalf("concurrent restore status = %d, want 409", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestVolumeSnapshots_EndToEnd(t *testing.T) {
|
||||
|
||||
@@ -15,6 +15,7 @@ import (
|
||||
"github.com/alexei/tinyforge/internal/auth"
|
||||
"github.com/alexei/tinyforge/internal/crypto"
|
||||
"github.com/alexei/tinyforge/internal/store"
|
||||
"github.com/alexei/tinyforge/internal/volsnap"
|
||||
"github.com/alexei/tinyforge/internal/webhook"
|
||||
"github.com/alexei/tinyforge/internal/workload/plugin"
|
||||
|
||||
@@ -75,6 +76,7 @@ type apiTestEnv struct {
|
||||
dispatcher *fakeAPIDispatcher
|
||||
adminToken string
|
||||
encKey [32]byte
|
||||
snapEngine *volsnap.Engine // set by newSnapshotEnv; nil otherwise
|
||||
}
|
||||
|
||||
// close shuts down the environment's server by calling Close on e.srv.
func (e *apiTestEnv) close() {
	e.srv.Close()
}
|
||||
@@ -670,9 +672,9 @@ func TestGetWorkloadChain_ParentSelfChildren(t *testing.T) {
|
||||
|
||||
resp := e.do(t, http.MethodGet, "/api/workloads/"+parentID+"/chain", nil)
|
||||
var got struct {
|
||||
Parent *map[string]any `json:"parent"`
|
||||
Self map[string]any `json:"self"`
|
||||
Children []map[string]any `json:"children"`
|
||||
Parent *map[string]any `json:"parent"`
|
||||
Self map[string]any `json:"self"`
|
||||
Children []map[string]any `json:"children"`
|
||||
}
|
||||
if errMsg := decodeEnvelope(t, resp, &got); errMsg != "" {
|
||||
t.Fatalf("envelope error: %q", errMsg)
|
||||
|
||||
Reference in New Issue
Block a user