feat(apps): stepped creation wizard, branch previews, and app-creation fixes

This session (frontend focus):

- Rebuild /apps/new as a 4-step wizard (Basics → Configure → Trigger → Review): WizardRail, SourceKindPicker card grid, AppManifest review, per-step validation, and a ConfirmDialog-based unsaved-changes guard.
- Extract lib/workload/sourceForms.ts (single source of truth for source_config) plus {Image,Compose,Static,Dockerfile}SourceForm and StaticDiscoveryWizard; fold the /apps/[id] edit form onto the same components (removes the duplication). Add vitest and sourceForms unit tests.
- Branch preview environments UI: /chain is_preview/preview_branch plus a Preview environments panel on /apps/[id] (per-branch URLs, ConfirmDialog teardown, armed state); RegistryImagePicker on the registry trigger and the image source.
- Fixes: image-inspect 404 -> admin-gated POST /api/discovery/image/inspect; conflict-panel blur flicker; friendly localized discovery errors; CPU/Memory label hints; dashboard + /apps "Total workloads" now count only source_kind workloads (drop stale trigger_kind gate); NPM cert/access-list name cache; EntityPicker empty-list guard.
- Update CLAUDE.md frontend conventions and add a Build & Test section.

Also captures pre-existing in-progress platform work (not from this session): workload notifications, Prometheus metrics export, store lockfile, health probes, backup hardening, and related store/webhook/scheduler changes.
2026-05-29 02:09:54 +03:00
parent 956943edbb
commit 410a131cec
112 changed files with 13285 additions and 2765 deletions
@@ -444,22 +444,12 @@ func updateStatus(deps plugin.Deps, w plugin.Workload, status, commitSHA, errMsg
 }

 // dispatchSiteNotification fires a site_sync_success or
-// site_sync_failure event to the configured outbound webhook.
-// Resolution: per-workload URL+secret first, then fall through to
-// settings.notification_url/secret. Always best-effort.
+// site_sync_failure event for the workload via the shared multi-route
+// dispatcher in plugin.DispatchNotificationForWorkload. Resolution
+// order (workload_notifications → legacy single URL → settings global)
+// is identical to the dockerfile plugin's path so receivers see
+// consistent fan-out behaviour across source kinds.
 func dispatchSiteNotification(deps plugin.Deps, w plugin.Workload, domain, status, errMsg string) {
-	if deps.Notifier == nil {
-		return
-	}
-	settings, err := deps.Store.GetSettings()
-	if err != nil {
-		slog.Warn("static site: notify settings lookup failed", "site", w.ID, "error", err)
-		return
-	}
-	url, secret, tier := resolveSiteTarget(w, settings)
-	if url == "" {
-		return
-	}
 	eventType := "site_sync_success"
 	if status == "failed" {
 		eventType = "site_sync_failure"
@@ -468,7 +458,7 @@ func dispatchSiteNotification(deps plugin.Deps, w plugin.Workload, domain, statu
 	if domain != "" {
 		siteURL = "https://" + domain
 	}
-	deps.Notifier.SendSigned(url, secret, tier, notify.Event{
+	plugin.DispatchNotificationForWorkload(deps, w, notify.Event{
 		Type:    eventType,
 		Project: w.Name,
 		URL:     siteURL,
@@ -476,16 +466,6 @@ func dispatchSiteNotification(deps plugin.Deps, w plugin.Workload, domain, statu
 	})
 }

-// resolveSiteTarget mirrors the legacy resolveSiteTarget helper but
-// reads notification config off the workload row (where it now lives
-// post-refactor) rather than the static_sites row.
-func resolveSiteTarget(w plugin.Workload, settings store.Settings) (string, string, notify.Tier) {
-	if w.NotificationURL != "" {
-		return w.NotificationURL, w.NotificationSecret, notify.TierSite
-	}
-	return settings.NotificationURL, settings.NotificationSecret, notify.TierSettings
-}
-
 // publishEvent emits a static_site_status event on the bus AND
 // persists an event_log row so the dashboard's audit trail picks it
 // up. Message format ("Static site \"%s\": %s") is preserved verbatim
@@ -165,30 +165,42 @@ func TestContainerRowID_Deterministic(t *testing.T) {
 	}
 }

-func TestLockFor_ReturnsSameLockForSameWorkload(t *testing.T) {
-	// Suffix by t.Name() so the package-global saveLocks map cannot
-	// bleed key state between tests (or between -count=N runs).
+func TestSaveLock_FreedWhenIdle(t *testing.T) {
+	// After the last holder releases, the reference-counted entry must be
+	// removed from the map so the lock table cannot grow without bound.
+	// Suffix by t.Name() so the package-global saveLocks map cannot bleed
+	// key state between tests (or between -count=N runs).
 	key := t.Name() + "-wid"
-	a := lockFor(key)
-	b := lockFor(key)
-	if a != b {
-		t.Fatalf("lockFor returned distinct locks for same workload: %p vs %p", a, b)
+	lk := acquireSaveLock(key)
+	saveLocks.mu.Lock()
+	_, present := saveLocks.locks[key]
+	saveLocks.mu.Unlock()
+	if !present {
+		t.Fatal("acquireSaveLock did not register the entry while held")
+	}
+	releaseSaveLock(key, lk)
+	saveLocks.mu.Lock()
+	_, stillPresent := saveLocks.locks[key]
+	saveLocks.mu.Unlock()
+	if stillPresent {
+		t.Fatal("releaseSaveLock left the entry behind after the last holder released")
 	}
 }

-func TestLockFor_ReturnsDistinctLocksForDifferentWorkloads(t *testing.T) {
-	a := lockFor(t.Name() + "-a")
-	b := lockFor(t.Name() + "-b")
-	if a == b {
-		t.Fatalf("lockFor returned same lock for different workloads: %p", a)
-	}
+func TestSaveLock_DistinctWorkloadsDoNotSerialize(t *testing.T) {
+	// Two different workloads must be lockable at the same time. If they
+	// shared a mutex the second acquire would block forever (deadlock).
+	a := acquireSaveLock(t.Name() + "-a")
+	b := acquireSaveLock(t.Name() + "-b")
+	releaseSaveLock(t.Name()+"-b", b)
+	releaseSaveLock(t.Name()+"-a", a)
 }

-func TestLockFor_SerializesConcurrentAcquisitions(t *testing.T) {
-	// Two goroutines holding the same lock must run sequentially. The
-	// counter would race past 2 if locking were broken; with the lock,
-	// the increment is observed monotonically.
-	lk := lockFor(t.Name() + "-wid")
+func TestSaveLock_SerializesConcurrentAcquisitions(t *testing.T) {
+	// Goroutines acquiring the same workload's lock must run sequentially.
+	// The counter would race past 1 if locking were broken; with the lock,
+	// peak in-flight stays at 1.
+	key := t.Name() + "-wid"
 	var (
 		wg      sync.WaitGroup
 		mu      sync.Mutex
@@ -199,8 +211,8 @@ func TestLockFor_SerializesConcurrentAcquisitions(t *testing.T) {
 		wg.Add(1)
 		go func() {
 			defer wg.Done()
-			lk.Lock()
-			defer lk.Unlock()
+			lk := acquireSaveLock(key)
+			defer releaseSaveLock(key, lk)

 			mu.Lock()
 			counter++
@@ -216,15 +228,15 @@ func TestLockFor_SerializesConcurrentAcquisitions(t *testing.T) {
 	}
 	wg.Wait()
 	if peak != 1 {
-		t.Fatalf("lockFor failed to serialize: peak in-flight = %d, want 1", peak)
+		t.Fatalf("acquireSaveLock failed to serialize: peak in-flight = %d, want 1", peak)
 	}
 }

-func TestLockFor_ConcurrentMapAccessIsSafe(t *testing.T) {
-	// Distinct workloads acquired in parallel must not panic on map
-	// access — exercises the outer-mutex protection inside lockFor.
-	// Each iteration uses a unique key so the test stresses the
-	// insertion path (the common case for "first deploy" callers).
+func TestSaveLock_ConcurrentMapAccessIsSafe(t *testing.T) {
+	// Distinct workloads acquired+released in parallel must not panic on map
+	// access — exercises the outer-mutex protection inside acquire/release.
+	// Each iteration uses a unique key so the test stresses the insertion +
+	// refcount-cleanup paths (the common case for "first deploy" callers).
 	prefix := t.Name() + "-"
 	var wg sync.WaitGroup
 	for i := 0; i < 50; i++ {
@@ -232,9 +244,9 @@ func TestLockFor_ConcurrentMapAccessIsSafe(t *testing.T) {
 		wg.Add(1)
 		go func() {
 			defer wg.Done()
-			lk := lockFor(prefix + strconv.Itoa(i))
-			lk.Lock()
-			lk.Unlock()
+			key := prefix + strconv.Itoa(i)
+			lk := acquireSaveLock(key)
+			releaseSaveLock(key, lk)
 		}()
 	}
 	wg.Wait()
@@ -80,26 +80,55 @@ func loadState(deps plugin.Deps, w plugin.Workload) (runtimeState, *store.Contai
 // container_id / proxy_route_id and orphaning Docker resources. The
 // mutex caps the concurrency at 1 per workload; cross-workload
 // parallelism is unaffected.
+//
+// Entries are reference-counted and removed only when the last holder
+// releases. This bounds memory (no per-workload-ID leak) WITHOUT the
+// use-after-delete hazard of deleting an entry on teardown: deleting a
+// live entry while a concurrent saveState still holds (or is about to
+// lock) it would let a fresh saveState mint a SECOND mutex for the same
+// workload, losing the RMW serialization the lock exists to provide.
 var saveLocks struct {
 	mu    sync.Mutex
-	locks map[string]*sync.Mutex
+	locks map[string]*saveLock
 }

-// lockFor returns the per-workload mutex, creating it on first use.
-// The outer mutex is held only briefly during map lookup; the returned
-// per-workload lock is what callers actually contend on.
-func lockFor(workloadID string) *sync.Mutex {
+type saveLock struct {
+	mu   sync.Mutex
+	refs int
+}
+
+// acquireSaveLock returns the per-workload lock (creating it on first use),
+// registers this caller as a holder, and takes the lock. Pair with
+// releaseSaveLock. The outer mutex is held only for the bookkeeping; callers
+// contend on the returned per-workload lock.
+func acquireSaveLock(workloadID string) *saveLock {
 	saveLocks.mu.Lock()
-	defer saveLocks.mu.Unlock()
 	if saveLocks.locks == nil {
-		saveLocks.locks = map[string]*sync.Mutex{}
+		saveLocks.locks = map[string]*saveLock{}
 	}
-	m, ok := saveLocks.locks[workloadID]
+	l, ok := saveLocks.locks[workloadID]
 	if !ok {
-		m = &sync.Mutex{}
-		saveLocks.locks[workloadID] = m
+		l = &saveLock{}
+		saveLocks.locks[workloadID] = l
 	}
-	return m
+	l.refs++
+	saveLocks.mu.Unlock()
+	l.mu.Lock()
+	return l
+}
+
+// releaseSaveLock unlocks and drops the caller's reference, removing the map
+// entry once no holders remain. Because refs is incremented under saveLocks.mu
+// before the entry can be observed for deletion, an entry with a pending
+// acquirer is never deleted.
+func releaseSaveLock(workloadID string, l *saveLock) {
+	l.mu.Unlock()
+	saveLocks.mu.Lock()
+	l.refs--
+	if l.refs == 0 {
+		delete(saveLocks.locks, workloadID)
+	}
+	saveLocks.mu.Unlock()
 }

 // saveState upserts the container row, calling mutate so callers can
@@ -115,9 +144,8 @@ func lockFor(workloadID string) *sync.Mutex {
 // Per-workload mutex serializes concurrent callers so two parallel
 // Deploys can't read the same prior state and race their writes.
 func saveState(deps plugin.Deps, w plugin.Workload, mutate func(*runtimeState, *store.Container)) error {
-	lk := lockFor(w.ID)
-	lk.Lock()
-	defer lk.Unlock()
+	lk := acquireSaveLock(w.ID)
+	defer releaseSaveLock(w.ID, lk)

 	prev, prevRow, err := loadState(deps, w)
 	if err != nil {
@@ -185,14 +185,23 @@ func TestSaveState_RecoversFromInvalidExtraJSON(t *testing.T) {
 	deps, _ := testDeps(t)
 	w := plugin.Workload{ID: t.Name() + "-wid", Name: "site"}

+	// UpsertContainer now validates extra_json at the boundary, so this
+	// test seeds a valid row first and corrupts it via raw SQL to
+	// simulate a pre-existing bad row from an upgrade / external edit.
 	if err := deps.Store.UpsertContainer(store.Container{
 		ID:           containerRowID(w),
 		WorkloadID:   w.ID,
 		WorkloadKind: string(store.WorkloadKindSite),
 		Host:         "local",
-		ExtraJSON:    `{not json`,
+		ExtraJSON:    `{}`,
 	}); err != nil {
-		t.Fatalf("seed bad row: %v", err)
+		t.Fatalf("seed row: %v", err)
+	}
+	if _, err := deps.Store.DB().Exec(
+		`UPDATE containers SET extra_json = ? WHERE id = ?`,
+		`{not json`, containerRowID(w),
+	); err != nil {
+		t.Fatalf("corrupt extra_json: %v", err)
 	}

 	err := saveState(deps, w, func(state *runtimeState, _ *store.Container) {
@@ -66,5 +66,8 @@ func teardown(ctx context.Context, deps plugin.Deps, w plugin.Workload) error {
 	if err := deps.Store.DeleteContainer(prevContainer.ID); err != nil && !errors.Is(err, store.ErrNotFound) {
 		slog.Warn("static site: failed to delete container row", "site", w.Name, "error", err)
 	}
+	// The per-workload save-mutex is reference-counted (see state.go) and
+	// frees itself when the last holder releases, so teardown no longer
+	// deletes it explicitly — doing so could race a concurrent saveState.
 	return nil
 }