feat(apps): stepped creation wizard, branch previews, and app-creation fixes

This session (frontend focus):
- Rebuild /apps/new as a 4-step wizard (Basics → Configure → Trigger → Review):
  WizardRail, SourceKindPicker card grid, AppManifest review, per-step validation,
  ConfirmDialog-based unsaved-changes guard.
- Extract lib/workload/sourceForms.ts (single source of truth for source_config)
  + {Image,Compose,Static,Dockerfile}SourceForm + StaticDiscoveryWizard; fold the
  /apps/[id] edit form onto the same components (removes the duplication). Add
  vitest + sourceForms unit tests.
- Branch preview environments UI: /chain is_preview/preview_branch + a Preview
  environments panel on /apps/[id] (per-branch URLs, ConfirmDialog teardown, armed
  state); RegistryImagePicker on the registry trigger and the image source.
- Fixes: image-inspect 404 -> admin-gated POST /api/discovery/image/inspect;
  conflict-panel blur flicker; friendly localized discovery errors; CPU/Memory
  label hints; dashboard + /apps "Total workloads" count only source_kind workloads
  (drop stale trigger_kind gate); NPM cert/access-list name cache; EntityPicker
  empty-list guard.
- Update CLAUDE.md frontend conventions + add a Build & Test section.

Also captures pre-existing in-progress platform work (not from this session):
workload notifications, Prometheus metrics export, store lockfile, health probes,
backup hardening, and related store/webhook/scheduler changes.
This commit is contained in:
2026-05-29 02:09:54 +03:00
parent 956943edbb
commit 410a131cec
112 changed files with 13285 additions and 2765 deletions
+73 -32
View File
@@ -1,7 +1,6 @@
package api
import (
"io"
"log/slog"
"net/http"
"os"
@@ -118,7 +117,22 @@ func (s *Server) deleteBackup(w http.ResponseWriter, r *http.Request) {
}
// restoreBackup handles POST /api/backups/{id}/restore.
// This replaces the current database with the backup and triggers a graceful shutdown.
//
// Restore happens in three documented stages so a failure at any stage
// leaves the live DB intact:
//
// 1. PRE-FLIGHT (sync, before the HTTP response): PrepareRestore opens
// the candidate read-only and runs `PRAGMA integrity_check`. If it
// fails the live DB is untouched and we return 400 with the reason.
//
// 2. SAFETY NET: a pre-restore backup of the LIVE DB is created so the
// operator can roll back even if the candidate is later discovered
// to be missing data.
//
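// 3. SWAP (async, after the response is flushed): close the live DB,
// atomic-rename the candidate over the live path, wipe WAL/SHM,
// trigger graceful shutdown. supervisord / systemd / docker
// restart=on-failure brings the process back with the new DB.
// NOTE(review): on-failure restart policies fire only on a non-zero
// exit status — confirm the graceful shutdown exits non-zero here,
// or that the deployment uses an always-restart policy.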
func (s *Server) restoreBackup(w http.ResponseWriter, r *http.Request) {
if s.backupEngine == nil {
respondError(w, http.StatusServiceUnavailable, "backup engine not initialized")
@@ -126,13 +140,44 @@ func (s *Server) restoreBackup(w http.ResponseWriter, r *http.Request) {
}
id := chi.URLParam(r, "id")
restorePath, err := s.backupEngine.RestorePath(id)
if err != nil {
respondError(w, http.StatusNotFound, "backup not found: "+err.Error())
// CSRF / accidental-fire guard: the restore endpoint is the most
// destructive surface in the API (replaces the whole DB). Even
// though it sits behind AdminOnly + Bearer JWT, a blind cross-site
// POST or a misclicked button in any open admin tab can fire it.
// Require the operator's client to echo X-Confirm-Restore: <id>
// — matching the path param — so a CSRF post-form / image-src
// trick can't trigger restore (browsers don't let cross-origin
// requests set custom headers without a preflight).
if confirm := r.Header.Get("X-Confirm-Restore"); confirm != id {
respondError(w, http.StatusBadRequest,
"missing or mismatched X-Confirm-Restore header (must equal backup id)")
return
}
// Create a safety backup before restore so the user can undo if needed.
// Single-flight guard: a rapid double-click would otherwise spawn
// two goroutines racing s.store.Close() and the candidate-over-
// live rename. CAS to true here; if someone else won, return 409.
if !s.restoreInFlight.CompareAndSwap(false, true) {
respondError(w, http.StatusConflict, "a restore is already in progress")
return
}
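// Do NOT release the flag on the restore path — it ends in shutdown,
// and a failed swap is also terminal (the DB may be closed); a fresh
// process boot is the recovery path.
// NOTE(review): the pre-flight failure return below also leaves the
// flag set even though nothing has been touched yet, so every later
// restore request gets 409 until the process restarts — confirm this
// is intended, or reset the flag on that early return.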
// PRE-FLIGHT: refuse before touching anything if the candidate is
// not a valid SQLite database or fails integrity_check. This is the
// guard the prior code lacked — a corrupt backup would silently
// overwrite a healthy live DB.
restorePath, err := s.backupEngine.PrepareRestore(id)
if err != nil {
respondError(w, http.StatusBadRequest, err.Error())
return
}
// SAFETY NET: pre-restore snapshot of the live DB. A failure here
// is logged but does not abort — the integrity-checked candidate
// is still safer than refusing to restore.
if _, err := s.backupEngine.CreateBackup("pre-restore"); err != nil {
slog.Warn("failed to create pre-restore backup", "error", err)
}
@@ -153,41 +198,37 @@ func (s *Server) restoreBackup(w http.ResponseWriter, r *http.Request) {
go func() {
time.Sleep(500 * time.Millisecond)
// Close the current database to release locks.
// Once we begin closing the live DB the process can no longer serve
// requests against a sane store, so EVERY exit path from here must
// trigger shutdown. Returning early would leave the server limping
// on a closed/half-swapped database with no path to recovery except
// an external kill. shutdownFunc → graceful shutdown → main returns
// → deferred releaseLock()/db.Close() run, and the supervisor reopens
// whatever DB is on disk on the next boot.
triggerShutdown := func() {
if s.shutdownFunc != nil {
s.shutdownFunc()
}
}
// Close the current database to release locks. AtomicReplaceDB
// expects the live file to be unmapped before swap (especially
// important on Windows where open files cannot be renamed over).
if err := s.store.Close(); err != nil {
slog.Error("restore: failed to close database", "error", err)
slog.Error("restore: failed to close database, restarting", "error", err)
triggerShutdown()
return
}
// Copy the backup file over the main database using streaming (no full read into memory).
src, err := os.Open(restorePath)
if err != nil {
slog.Error("restore: failed to open backup file", "error", err)
if err := s.backupEngine.AtomicReplaceDB(restorePath, s.dbPath); err != nil {
slog.Error("restore: atomic replace failed, restarting", "error", err)
triggerShutdown()
return
}
defer src.Close()
dst, err := os.Create(s.dbPath)
if err != nil {
slog.Error("restore: failed to create database file", "error", err)
return
}
defer dst.Close()
if _, err := io.Copy(dst, src); err != nil {
slog.Error("restore: failed to copy backup to database", "error", err)
return
}
// Remove WAL and SHM files to ensure clean state.
os.Remove(s.dbPath + "-wal")
os.Remove(s.dbPath + "-shm")
slog.Info("restore: database replaced, triggering shutdown")
// Signal the server to shut down gracefully so it can be restarted.
if s.shutdownFunc != nil {
s.shutdownFunc()
}
triggerShutdown()
}()
}