feat(apps): stepped creation wizard, branch previews, and app-creation fixes

This session (frontend focus):

- Rebuild /apps/new as a 4-step wizard (Basics → Configure → Trigger → Review): WizardRail, SourceKindPicker card grid, AppManifest review, per-step validation, ConfirmDialog-based unsaved-changes guard.
- Extract lib/workload/sourceForms.ts (single source of truth for source_config) + {Image,Compose,Static,Dockerfile}SourceForm + StaticDiscoveryWizard; fold the /apps/[id] edit form onto the same components (removes the duplication). Add vitest + sourceForms unit tests.
- Branch preview environments UI: /chain is_preview/preview_branch + a Preview environments panel on /apps/[id] (per-branch URLs, ConfirmDialog teardown, armed state); RegistryImagePicker on the registry trigger and the image source.
- Fixes: image-inspect 404 -> admin-gated POST /api/discovery/image/inspect; conflict-panel blur flicker; friendly localized discovery errors; CPU/Memory label hints; dashboard + /apps "Total workloads" count only source_kind workloads (drop stale trigger_kind gate); NPM cert/access-list name cache; EntityPicker empty-list guard.
- Update CLAUDE.md frontend conventions + add a Build & Test section.

Also captures pre-existing in-progress platform work (not from this session): workload notifications, Prometheus metrics export, store lockfile, health probes, backup hardening, and related store/webhook/scheduler changes.
2026-05-29 02:09:54 +03:00
parent 956943edbb
commit 410a131cec
112 changed files with 13285 additions and 2765 deletions
@@ -34,7 +34,19 @@ type Deployer struct {
 	dnsMu    sync.RWMutex
 	dns      dns.Provider // nil when wildcard DNS is active

+	// proxyMu protects hot-swap of d.proxy from runtime settings updates
+	// (SetProxyProvider) racing with PluginDeps() reads on the deploy path.
+	proxyMu sync.RWMutex
+
 	// Graceful shutdown: tracks in-progress deploys.
+	//
+	// drainMu serializes the "is-draining check + activeWg.Add(1)" in
+	// beginDispatch against the "set shuttingDown + Wait()" in Drain. Without
+	// it, a dispatch could pass the draining check, Drain could then flip the
+	// flag and start Wait() with a zero counter, and the dispatch could call
+	// Add(1) concurrently with Wait — a documented sync.WaitGroup misuse
+	// (panic risk) that also lets a deploy slip past the drain barrier.
+	drainMu      sync.Mutex
 	activeWg     sync.WaitGroup
 	shuttingDown atomic.Bool
 }
@@ -73,7 +85,11 @@ func New(
 }

 // SetProxyProvider updates the proxy provider at runtime (e.g., when settings change).
+// Guarded by proxyMu so concurrent deploys that read d.proxy via PluginDeps()
+// observe a coherent value (previously a data race on d.proxy, reported under -race).
 func (d *Deployer) SetProxyProvider(provider proxy.Provider) {
+	d.proxyMu.Lock()
+	defer d.proxyMu.Unlock()
 	d.proxy = provider
 }

@@ -110,8 +126,11 @@ func (d *Deployer) SetDNSProvider(provider dns.Provider) {

 // Drain waits for all in-progress deploys to complete. Call this during graceful shutdown.
 func (d *Deployer) Drain() {
-	if !d.shuttingDown.CompareAndSwap(false, true) {
-		// Already draining.
+	d.drainMu.Lock()
+	already := d.shuttingDown.Swap(true)
+	d.drainMu.Unlock()
+	if already {
+		slog.Info("deployer: drain already in progress")
 	}
 	slog.Info("deployer: draining in-progress deploys")
 	d.activeWg.Wait()
@@ -121,11 +140,17 @@ func (d *Deployer) Drain() {
 // ShuttingDown reports whether Drain() has been called.
 func (d *Deployer) ShuttingDown() bool { return d.shuttingDown.Load() }

-// rejectIfDraining is exposed in case any plugin wants the same hard-stop
-// behaviour the legacy pipeline used.
-func (d *Deployer) rejectIfDraining() error {
+// beginDispatch atomically rejects when draining and otherwise registers the
+// in-flight unit on activeWg. The shuttingDown check and the Add(1) MUST be
+// done together under drainMu (see the field comment): Drain sets the flag
+// under the same mutex before Wait(), so once Wait() observes a zero counter
+// no further Add can race it. Callers must defer d.activeWg.Done() on success.
+func (d *Deployer) beginDispatch() error {
+	d.drainMu.Lock()
+	defer d.drainMu.Unlock()
 	if d.shuttingDown.Load() {
 		return fmt.Errorf("deployer is shutting down, rejecting new deploy")
 	}
+	d.activeWg.Add(1)
 	return nil
 }
@@ -4,6 +4,7 @@ import (
 	"context"
 	"fmt"

+	"github.com/alexei/tinyforge/internal/metrics"
 	"github.com/alexei/tinyforge/internal/workload/plugin"
 )

@@ -14,16 +15,37 @@ import (
 // triggers + image deploys still go through the legacy path, while
 // /api/hooks/generic + the unified webhook ingress go through here.
 func (d *Deployer) DispatchPlugin(ctx context.Context, w plugin.Workload, intent plugin.DeploymentIntent) error {
+	if err := d.beginDispatch(); err != nil {
+		metrics.DeploysTotal.Inc(w.SourceKind, "rejected_draining")
+		return err
+	}
+	defer d.activeWg.Done()
 	src, err := plugin.GetSource(w.SourceKind)
 	if err != nil {
+		// Unknown source: label with the constant "unknown" sentinel so a
+		// typo-spam attack can't grow the metrics map with one series per bogus
+		// source_kind; the user-supplied value still surfaces via the wrapped error / event log.
+		// NOTE(review): the rejected_draining path above still labels with the unvalidated w.SourceKind.
+		metrics.DeploysTotal.Inc("unknown", "unknown_source")
 		return fmt.Errorf("dispatch %s: %w", w.Name, err)
 	}
-	return src.Deploy(ctx, d.PluginDeps(), w, intent)
+	err = src.Deploy(ctx, d.PluginDeps(), w, intent)
+	outcome := "success"
+	if err != nil {
+		outcome = "failure"
+	}
+	metrics.DeploysTotal.Inc(w.SourceKind, outcome)
+	return err
 }

 // DispatchTeardown routes a teardown call to the matching Source plugin.
-// Used when a workload is deleted.
+// Used when a workload is deleted. Tracked via activeWg so Drain() honours
+// in-progress teardowns just like deploys.
 func (d *Deployer) DispatchTeardown(ctx context.Context, w plugin.Workload) error {
+	if err := d.beginDispatch(); err != nil {
+		return err
+	}
+	defer d.activeWg.Done()
 	src, err := plugin.GetSource(w.SourceKind)
 	if err != nil {
 		return fmt.Errorf("dispatch teardown %s: %w", w.Name, err)
@@ -33,8 +55,17 @@ func (d *Deployer) DispatchTeardown(ctx context.Context, w plugin.Workload) erro

 // DispatchReconcile routes a Reconcile call. Periodic reconciler iterates
 // every Workload and calls this; idle Sources should make it a cheap
-// no-op.
+// no-op. Tracked via activeWg so a long-running reconcile blocks Drain().
 func (d *Deployer) DispatchReconcile(ctx context.Context, w plugin.Workload) error {
+	if err := d.beginDispatch(); err != nil {
+		// Silent skip — reconcile is a periodic tick, not a user-initiated
+		// action, so we don't want to surface "draining" errors back to the
+		// reconciler loop. The next tick after restart will catch up. Routing
+		// through beginDispatch keeps the activeWg.Add atomic with the drain
+		// check (see Drain) instead of a bare shuttingDown.Load + Add race.
+		return nil
+	}
+	defer d.activeWg.Done()
 	src, err := plugin.GetSource(w.SourceKind)
 	if err != nil {
 		return fmt.Errorf("dispatch reconcile %s: %w", w.Name, err)
@@ -52,10 +83,13 @@ func (d *Deployer) PluginDeps() plugin.Deps {
 	d.dnsMu.RLock()
 	dnsProvider := d.dns
 	d.dnsMu.RUnlock()
+	d.proxyMu.RLock()
+	proxyProvider := d.proxy
+	d.proxyMu.RUnlock()
 	return plugin.Deps{
 		Store:    d.store,
 		Docker:   d.docker,
-		Proxy:    d.proxy,
+		Proxy:    proxyProvider,
 		DNS:      dnsProvider,
 		Health:   d.health,
 		Notifier: d.notifier,