feat(volsnap): volume snapshot restore (backlog #6)

Restore a captured volume snapshot onto an image workload's live host-bind data volumes, then redeploy — the most destructive workload action, built to the adversarially-reviewed design (C1–C6) with all data-loss guards.

- Engine.Restore (engine-owned): all-or-nothing pre-flight re-resolution from the workload's CURRENT config (never the tamperable manifest), per-filesystem disk pre-check, per-workload lock, container quiesce, extract-to-tmp, durable pre-restore snapshot, write-ahead journal, atomic rename swap, redeploy, and crash-recovery sweep (RecoverInterruptedRestores) wired before serving.
- internal/keyedmutex: shared per-key lock; deployer now serializes every deploy entrypoint per workload via DispatchPlugin (+ LockWorkload/RedeployLocked for the restore re-dispatch, no deadlock).
- Untrusted-archive extractor: zip-slip containment, type allow-list (reg/dir only), decompression-bomb cap, manifest-index bounds.
- POST /api/workloads/{id}/snapshots/{sid}/restore: admin, X-Confirm-Restore header (CSRF), per-workload single-flight (409).
- WebUI: Restore button + danger ConfirmDialog + busy state + i18n (en/ru).

Scope: image-source only; scopes absolute/stage/project (driven off the same supportedScopes constant capture uses).

Plan-reviewed before coding; per-phase go/security/ts reviews; final review READY TO MERGE. Security review caught + fixed a CRITICAL manifest-Source path traversal (re-derive target from current config + base containment).

Plan: plans/volume-snapshot-restore/
2026-06-22 17:23:52 +03:00
parent 8a5f69af87
commit 1c47030854
33 changed files with 2825 additions and 34 deletions
@@ -140,6 +140,72 @@ func (s *Server) deleteSnapshot(w http.ResponseWriter, r *http.Request) {
 	respondJSON(w, http.StatusOK, map[string]string{"status": "deleted"})
 }

+// restoreWorkloadSnapshot handles POST /api/workloads/{id}/snapshots/{sid}/restore.
+//
+// This is the most destructive workload action: it overwrites the app's live volume
+// data with the snapshot and recreates its containers. Guards, as for the DB restore:
+// admin-only (NOTE(review): not checked in this function; presumably enforced by the
+// router — confirm), an X-Confirm-Restore header that must equal the snapshot id (CSRF
+// form/img posts can't set custom headers), and a per-workload single-flight. The
+// lock/stop/swap/redeploy logic lives in Engine.Restore; this handler only validates and
+// delegates. Status: 503 no engine, 400 bad header/owner/source kind, 404 snapshot lookup failed, 409 restore in flight, 500 restore failed, 200 restored.
+func (s *Server) restoreWorkloadSnapshot(w http.ResponseWriter, r *http.Request) {
+	if s.snapshotEngine == nil {
+		respondError(w, http.StatusServiceUnavailable, "snapshot engine not initialized")
+		return
+	}
+	id := chi.URLParam(r, "id")
+	sid := chi.URLParam(r, "sid")

+	if confirm := r.Header.Get("X-Confirm-Restore"); confirm != sid {
+		respondError(w, http.StatusBadRequest,
+			"missing or mismatched X-Confirm-Restore header (must equal snapshot id)")
+		return
+	}

+	// Up-front validation for precise client errors (Engine.Restore re-checks ownership + source kind under the lock). Any Get error is reported as 404.
+	// NOTE(review): loadWorkload receives w and presumably writes its own error response when it returns !ok — confirm.
+	snap, err := s.snapshotEngine.Get(sid)
+	if err != nil {
+		respondError(w, http.StatusNotFound, "snapshot not found")
+		return
+	}
+	if snap.WorkloadID != id {
+		respondError(w, http.StatusBadRequest, "snapshot does not belong to this workload")
+		return
+	}
+	row, ok := s.loadWorkload(w, id)
+	if !ok {
+		return
+	}
+	if row.SourceKind != "image" {
+		respondError(w, http.StatusBadRequest, "restore is only supported for image-source workloads")
+		return
+	}

+	// Per-workload single-flight: a failed TryLock rejects a concurrent restore of the SAME workload with 409
+	// rather than queuing it behind the deployer lock. The deferred release runs when this handler returns.
+	release, ok := s.volRestoreInFlight.TryLock(id)
+	if !ok {
+		respondError(w, http.StatusConflict, "a restore is already in progress for this workload")
+		return
+	}
+	defer release()

+	if err := s.snapshotEngine.Restore(r.Context(), sid, id); err != nil {
+		// Log the raw error server-side only (it can carry resolved host paths);
+		// the client gets a generic 500 message.
+		slog.Error("snapshots: restore failed", "workload", id, "snapshot", sid, "error", err)
+		respondError(w, http.StatusInternalServerError, "restore failed; see server logs")
+		return
+	}
+	respondJSON(w, http.StatusOK, map[string]any{
+		"status":      "restored",
+		"workload_id": id,
+		"snapshot_id": sid,
+	})
+}
+
 // downloadSnapshot handles GET /api/snapshots/{sid}/download, streaming the
 // tar.gz archive. The resolved path is containment-checked against the
 // snapshot directory.