feat(apps): stepped creation wizard, branch previews, and app-creation fixes

This session (frontend focus): - Rebuild /apps/new as a 4-step wizard (Basics → Configure → Trigger → Review): WizardRail, SourceKindPicker card grid, AppManifest review, per-step validation, ConfirmDialog-based unsaved-changes guard. - Extract lib/workload/sourceForms.ts (single source of truth for source_config) + {Image,Compose,Static,Dockerfile}SourceForm + StaticDiscoveryWizard; fold the /apps/[id] edit form onto the same components (removes the duplication). Add vitest + sourceForms unit tests. - Branch preview environments UI: /chain is_preview/preview_branch + a Preview environments panel on /apps/[id] (per-branch URLs, ConfirmDialog teardown, armed state); RegistryImagePicker on the registry trigger and the image source. - Fixes: image-inspect 404 -> admin-gated POST /api/discovery/image/inspect; conflict-panel blur flicker; friendly localized discovery errors; CPU/Memory label hints; dashboard + /apps "Total workloads" count only source_kind workloads (drop stale trigger_kind gate); NPM cert/access-list name cache; EntityPicker empty-list guard. - Update CLAUDE.md frontend conventions + add a Build & Test section. Also captures pre-existing in-progress platform work (not from this session): workload notifications, Prometheus metrics export, store lockfile, health probes, backup hardening, and related store/webhook/scheduler changes.
2026-05-29 02:09:54 +03:00
parent 956943edbb
commit 410a131cec
112 changed files with 13285 additions and 2765 deletions
@@ -1,7 +1,6 @@
 package api

 import (
-	"io"
 	"log/slog"
 	"net/http"
 	"os"
@@ -118,7 +117,22 @@ func (s *Server) deleteBackup(w http.ResponseWriter, r *http.Request) {
 }

 // restoreBackup handles POST /api/backups/{id}/restore.
-// This replaces the current database with the backup and triggers a graceful shutdown.
+//
+// Restore happens in three documented stages so a failure at any stage
+// leaves the live DB intact:
+//
+//  1. PRE-FLIGHT (sync, before the HTTP response): PrepareRestore opens
+//     the candidate read-only and runs `PRAGMA integrity_check`. If it
+//     fails the live DB is untouched and we return 400 with the reason.
+//
+//  2. SAFETY NET: a pre-restore backup of the LIVE DB is created so the
+//     operator can roll back even if the candidate is later discovered
+//     to be missing data.
+//
+//  3. SWAP (async, after the response is flushed): close the live DB,
+//     atomic-rename the candidate over the live path, wipe WAL/SHM,
+//     trigger graceful shutdown. supervisord / systemd / docker
+//     restart=on-failure brings the process back with the new DB.
 func (s *Server) restoreBackup(w http.ResponseWriter, r *http.Request) {
 	if s.backupEngine == nil {
 		respondError(w, http.StatusServiceUnavailable, "backup engine not initialized")
@@ -126,13 +140,44 @@ func (s *Server) restoreBackup(w http.ResponseWriter, r *http.Request) {
 	}

 	id := chi.URLParam(r, "id")
-	restorePath, err := s.backupEngine.RestorePath(id)
-	if err != nil {
-		respondError(w, http.StatusNotFound, "backup not found: "+err.Error())
+
+	// CSRF / accidental-fire guard: the restore endpoint is the most
+	// destructive surface in the API (replaces the whole DB). Even
+	// though it sits behind AdminOnly + Bearer JWT, a blind cross-site
+	// POST or a misclicked button in any open admin tab can fire it.
+	// Require the operator's client to echo X-Confirm-Restore: <id>
+	// — matching the path param — so a CSRF post-form / image-src
+	// trick can't trigger restore (browsers don't let cross-origin
+	// requests set custom headers without a preflight).
+	if confirm := r.Header.Get("X-Confirm-Restore"); confirm != id {
+		respondError(w, http.StatusBadRequest,
+			"missing or mismatched X-Confirm-Restore header (must equal backup id)")
 		return
 	}

-	// Create a safety backup before restore so the user can undo if needed.
+	// Single-flight guard: a rapid double-click would otherwise spawn
+	// two goroutines racing s.store.Close() and the candidate-over-
+	// live rename. CAS to true here; if someone else won, return 409.
+	if !s.restoreInFlight.CompareAndSwap(false, true) {
+		respondError(w, http.StatusConflict, "a restore is already in progress")
+		return
+	}
+	// Do NOT release the flag: the swap path always ends in shutdown, so a
+	// fresh boot is the recovery path. NOTE(review): the pre-flight 400
+	// return below also leaves it set (DB still open) — later calls get 409.
+	// PRE-FLIGHT: refuse before touching anything if the candidate is
+	// not a valid SQLite database or fails integrity_check. This is the
+	// guard the prior code lacked — a corrupt backup would silently
+	// overwrite a healthy live DB.
+	restorePath, err := s.backupEngine.PrepareRestore(id)
+	if err != nil {
+		respondError(w, http.StatusBadRequest, err.Error())
+		return
+	}
+
+	// SAFETY NET: pre-restore snapshot of the live DB. A failure here
+	// is logged but does not abort — the integrity-checked candidate
+	// is still safer than refusing to restore.
 	if _, err := s.backupEngine.CreateBackup("pre-restore"); err != nil {
 		slog.Warn("failed to create pre-restore backup", "error", err)
 	}
@@ -153,41 +198,37 @@ func (s *Server) restoreBackup(w http.ResponseWriter, r *http.Request) {
 	go func() {
 		time.Sleep(500 * time.Millisecond)

-		// Close the current database to release locks.
+		// Once we begin closing the live DB the process can no longer serve
+		// requests against a sane store, so EVERY exit path from here must
+		// trigger shutdown. Returning early would leave the server limping
+		// on a closed/half-swapped database with no path to recovery except
+		// an external kill. shutdownFunc → graceful shutdown → main returns
+		// → deferred releaseLock()/db.Close() run, and the supervisor reopens
+		// whatever DB is on disk on the next boot.
+		triggerShutdown := func() {
+			if s.shutdownFunc != nil {
+				s.shutdownFunc()
+			}
+		}
+
+		// Close the current database to release locks. AtomicReplaceDB
+		// expects the live file to be unmapped before swap (especially
+		// important on Windows where open files cannot be renamed over).
 		if err := s.store.Close(); err != nil {
-			slog.Error("restore: failed to close database", "error", err)
+			slog.Error("restore: failed to close database, restarting", "error", err)
+			triggerShutdown()
 			return
 		}

-		// Copy the backup file over the main database using streaming (no full read into memory).
-		src, err := os.Open(restorePath)
-		if err != nil {
-			slog.Error("restore: failed to open backup file", "error", err)
+		if err := s.backupEngine.AtomicReplaceDB(restorePath, s.dbPath); err != nil {
+			slog.Error("restore: atomic replace failed, restarting", "error", err)
+			triggerShutdown()
 			return
 		}
-		defer src.Close()
-
-		dst, err := os.Create(s.dbPath)
-		if err != nil {
-			slog.Error("restore: failed to create database file", "error", err)
-			return
-		}
-		defer dst.Close()
-
-		if _, err := io.Copy(dst, src); err != nil {
-			slog.Error("restore: failed to copy backup to database", "error", err)
-			return
-		}
-
-		// Remove WAL and SHM files to ensure clean state.
-		os.Remove(s.dbPath + "-wal")
-		os.Remove(s.dbPath + "-shm")

 		slog.Info("restore: database replaced, triggering shutdown")

 		// Signal the server to shut down gracefully so it can be restarted.
-		if s.shutdownFunc != nil {
-			s.shutdownFunc()
-		}
+		triggerShutdown()
 	}()
 }