feat(apps): stepped creation wizard, branch previews, and app-creation fixes

This session (frontend focus): - Rebuild /apps/new as a 4-step wizard (Basics → Configure → Trigger → Review): WizardRail, SourceKindPicker card grid, AppManifest review, per-step validation, ConfirmDialog-based unsaved-changes guard. - Extract lib/workload/sourceForms.ts (single source of truth for source_config) + {Image,Compose,Static,Dockerfile}SourceForm + StaticDiscoveryWizard; fold the /apps/[id] edit form onto the same components (removes the duplication). Add vitest + sourceForms unit tests. - Branch preview environments UI: /chain is_preview/preview_branch + a Preview environments panel on /apps/[id] (per-branch URLs, ConfirmDialog teardown, armed state); RegistryImagePicker on the registry trigger and the image source. - Fixes: image-inspect 404 -> admin-gated POST /api/discovery/image/inspect; conflict-panel blur flicker; friendly localized discovery errors; CPU/Memory label hints; dashboard + /apps "Total workloads" count only source_kind workloads (drop stale trigger_kind gate); NPM cert/access-list name cache; EntityPicker empty-list guard. - Update CLAUDE.md frontend conventions + add a Build & Test section. Also captures pre-existing in-progress platform work (not from this session): workload notifications, Prometheus metrics export, store lockfile, health probes, backup hardening, and related store/webhook/scheduler changes.
2026-05-29 02:09:54 +03:00
parent 956943edbb
commit 410a131cec
112 changed files with 13285 additions and 2765 deletions
@@ -2,6 +2,7 @@ package store

 import (
 	"database/sql"
+	"encoding/json"
 	"errors"
 	"fmt"
 	"strings"
@@ -9,6 +10,22 @@ import (
 	"github.com/google/uuid"
 )

+// validateExtraJSON ensures the extra_json column never receives an
+// invalid JSON document. The codemap (docs/CODEMAPS/container-extra-json.md)
+// is explicit that readers tolerate unknown keys — but only if the value
+// is valid JSON at all. A buggy plugin writing bare text such as `not json`
+// would silently break every reader, with no schema-level check to catch
+// it. Guarding at the store boundary keeps the invariant cheap and obvious.
+//
+// The check is purely syntactic (json.Valid): any well-formed JSON value
+// passes, including a quoted string, a number, an array or null — not just
+// objects. An empty string is accepted as-is.
+//
+// NOTE(review): readers presumably expect a JSON object (callers default
+// the column to "{}"); confirm whether non-object values should be
+// rejected here as well.
+func validateExtraJSON(v string) error {
+	if v == "" {
+		return nil
+	}
+	if !json.Valid([]byte(v)) {
+		// Report only the size, never the payload itself.
+		return fmt.Errorf("extra_json: not valid JSON (%d bytes)", len(v))
+	}
+	return nil
+}
+
 // containerColumns is the canonical column list for `containers` queries.
 // stage_id is populated by the deployer for project containers (so ListProxyRoutes
 // survives stage renames) and left empty for stacks and sites.
@@ -42,6 +59,9 @@ func (s *Store) CreateContainer(c Container) (Container, error) {
 	if c.ExtraJSON == "" {
 		c.ExtraJSON = "{}"
 	}
+	if err := validateExtraJSON(c.ExtraJSON); err != nil {
+		return Container{}, err
+	}

 	_, err := s.db.Exec(
 		`INSERT INTO containers (`+containerColumns+`)
@@ -77,6 +97,9 @@ func (s *Store) UpsertContainer(c Container) error {
 	if c.ExtraJSON == "" {
 		c.ExtraJSON = "{}"
 	}
+	if err := validateExtraJSON(c.ExtraJSON); err != nil {
+		return err
+	}

 	// SQLite UPSERT — INSERT...ON CONFLICT(id) DO UPDATE.
 	_, err := s.db.Exec(
@@ -129,6 +152,9 @@ func (s *Store) ReconcileContainer(c Container) error {
 	if c.ExtraJSON == "" {
 		c.ExtraJSON = "{}"
 	}
+	if err := validateExtraJSON(c.ExtraJSON); err != nil {
+		return err
+	}

 	// extra_json is deliberately NOT in the ON CONFLICT SET clause: the
 	// reconciler can't observe per-face route IDs from Docker, and
@@ -321,6 +347,9 @@ func (s *Store) UpdateContainer(c Container) error {
 	if c.ExtraJSON == "" {
 		c.ExtraJSON = "{}"
 	}
+	if err := validateExtraJSON(c.ExtraJSON); err != nil {
+		return err
+	}
 	result, err := s.db.Exec(
 		`UPDATE containers SET workload_id=?, workload_kind=?, role=?, stage_id=?, container_id=?,
 			image_ref=?, image_tag=?, host=?, state=?, port=?,
@@ -0,0 +1,171 @@
+package store
+
+import (
+	"errors"
+	"fmt"
+	"os"
+	"path/filepath"
+	"strconv"
+	"strings"
+)
+
+// ErrLockHeld is returned when another Tinyforge process appears to be
+// running against the same data directory. SQLite + SetMaxOpenConns(1)
+// makes this otherwise-silent collision a recipe for double-fired
+// schedulers, double-polled registries, and `extra_json` RMW corruption.
+var ErrLockHeld = errors.New("data directory is locked by another tinyforge process")
+
+// Lockfile is a portable PID file. AcquireLockfile takes it; the returned
+// Release function removes it. The contract:
+//
+//   - Lockfile is created with O_CREATE|O_EXCL — atomic on POSIX, atomic
+//     on NTFS / ReFS via the equivalent.
+//   - On collision, the existing file's PID is read; if the PID is dead,
+//     we treat the lock as stale (process crashed without cleanup),
+//     reclaim it, and proceed. Live PID → ErrLockHeld.
+//   - flock is intentionally not used: cross-platform consistency wins
+//     over advisory-lock semantics for the single-instance use case.
+//
+// NOTE(review): none of the functions in this file construct or return a
+// Lockfile value — AcquireLockfile hands back a bare release closure.
+// Confirm whether this type is used elsewhere or only anchors the
+// contract documentation above.
+type Lockfile struct {
+	// path is the absolute or dataDir-relative location of the PID file.
+	path string
+}
+
+// AcquireLockfile creates a PID-file lock under dataDir. Returns a
+// Release function the caller must defer. If another live process holds
+// the lock, returns ErrLockHeld with a hint pointing at the lockfile.
+//
+// Reclaim atomicity: when the existing lockfile names a dead PID, the
+// replacement is serialized through an auxiliary reclaim lock (see
+// reclaimStaleLock) so that, of N processes booting concurrently against
+// the same stale lockfile, EXACTLY ONE reclaims it and the rest get
+// ErrLockHeld. A bare `os.Remove`+`O_EXCL` retry — or a rename, which is
+// "last-writer-wins" — cannot guarantee this: multiple reclaimers can each
+// end up believing they own the lock, defeating the single-instance guard.
+//
+// Errors other than "already exists" from the initial create (for example
+// a missing or unwritable dataDir) are returned as-is and do NOT wrap
+// ErrLockHeld.
+//
+// NOTE(review): any readLockPID failure is handled as "malformed" and
+// leads to a reclaim attempt. That includes a lockfile another process has
+// just created but not yet written its PID into (tryCreateExclusive
+// creates the file first and writes the PID afterwards), and a lockfile
+// that vanished between the failed create and the read. Confirm the
+// empty-file window is acceptable for the single-instance guarantee.
+func AcquireLockfile(dataDir string) (release func(), err error) {
+	path := filepath.Join(dataDir, "tinyforge.lock")
+
+	// First try: clean acquire. The err declared here shadows the named
+	// result for the duration of the if/else chain only.
+	if rel, ok, err := tryCreateExclusive(path); ok {
+		return rel, nil
+	} else if err != nil {
+		return nil, err
+	}
+
+	// Existing lockfile — read PID and decide whether to reclaim.
+	pid, readErr := readLockPID(path)
+	if readErr == nil && processAlive(pid) {
+		return nil, fmt.Errorf("%w (held by pid %d, lockfile=%s)", ErrLockHeld, pid, path)
+	}
+	// Stale lock (dead pid) or malformed file — reclaim under serialization.
+	// reason is only used to enrich error messages produced during reclaim.
+	reason := "malformed existing lockfile"
+	if readErr == nil {
+		reason = fmt.Sprintf("stale lockfile (dead pid %d)", pid)
+	}
+	return reclaimStaleLock(path, reason)
+}
+
+// tryCreateExclusive attempts an atomic O_CREATE|O_EXCL create at path.
+// Returns (release, true, nil) on success; (nil, false, nil) when the
+// file already exists; (nil, false, err) on any other error.
+//
+// On success the file contains the current PID followed by a newline and
+// is created with mode 0o600. The returned release closure removes path
+// unconditionally and ignores the removal error; it does not verify that
+// the file still holds this process's PID.
+func tryCreateExclusive(path string) (func(), bool, error) {
+	f, openErr := os.OpenFile(path, os.O_CREATE|os.O_EXCL|os.O_WRONLY, 0o600)
+	if openErr != nil {
+		if os.IsExist(openErr) {
+			// Someone else owns (or once owned) the lock; not an error here.
+			return nil, false, nil
+		}
+		return nil, false, fmt.Errorf("open lockfile: %w", openErr)
+	}
+	// The file exists but is empty until this write lands, so a concurrent
+	// reader can briefly observe a lockfile without a PID.
+	if _, err := fmt.Fprintf(f, "%d\n", os.Getpid()); err != nil {
+		// Best-effort cleanup so a half-written lockfile is not left behind.
+		_ = f.Close()
+		_ = os.Remove(path)
+		return nil, false, fmt.Errorf("write lockfile: %w", err)
+	}
+	if err := f.Close(); err != nil {
+		_ = os.Remove(path)
+		return nil, false, fmt.Errorf("close lockfile: %w", err)
+	}
+	return func() { _ = os.Remove(path) }, true, nil
+}
+
+// reclaimStaleLock replaces a stale/malformed lockfile with one holding our
+// PID, serialized by an auxiliary reclaim lock. Holding the reclaim lock
+// (O_EXCL) guarantees that only one process performs the remove-and-recreate
+// of the main lockfile at a time, so concurrent reclaimers cannot each end
+// up "owning" the lock the way a rename or unguarded remove+create would
+// allow. The reclaim lock is itself liveness-checked so a reclaimer that
+// crashed mid-reclaim cannot wedge startup forever.
+//
+// lockPath is the main lockfile; the reclaim lock lives next to it at
+// lockPath+".reclaim" and is removed again before this function returns.
+// reason is appended to error messages only. Every refusal wraps
+// ErrLockHeld except a non-"exists" failure while creating the new
+// lockfile, which is returned unchanged.
+func reclaimStaleLock(lockPath, reason string) (func(), error) {
+	reclaimPath := lockPath + ".reclaim"
+	if err := acquireReclaimLock(reclaimPath); err != nil {
+		return nil, fmt.Errorf("%w (%v; %s)", ErrLockHeld, err, reason)
+	}
+	defer func() { _ = os.Remove(reclaimPath) }()
+
+	// Serialized now. Re-check the main lock: another process may have fully
+	// reclaimed it between our liveness probe and our taking the reclaim lock.
+	if pid, perr := readLockPID(lockPath); perr == nil && processAlive(pid) {
+		return nil, fmt.Errorf("%w (reclaimed by pid %d while we waited; %s)",
+			ErrLockHeld, pid, reason)
+	}
+
+	// Safe to replace: remove the stale file, then create a fresh exclusive
+	// one. Both run while we hold the reclaim lock, so no other reclaimer can
+	// observe the gap. A process making its clean first-try acquire does not
+	// take the reclaim lock, though, and can create the file in that gap.
+	if err := os.Remove(lockPath); err != nil && !os.IsNotExist(err) {
+		return nil, fmt.Errorf("%w (could not remove stale lockfile %s: %v; %s)",
+			ErrLockHeld, lockPath, err, reason)
+	}
+	rel, ok, err := tryCreateExclusive(lockPath)
+	if err != nil {
+		return nil, err
+	}
+	if !ok {
+		// Another reclaimer cannot get here while we hold the reclaim lock,
+		// but a fresh starter's first-try create can win the gap; fail safe.
+		return nil, fmt.Errorf("%w (lockfile reappeared during reclaim of %s; %s)",
+			ErrLockHeld, lockPath, reason)
+	}
+	return rel, nil
+}
+
+// acquireReclaimLock takes the auxiliary reclaim lock with O_EXCL. An
+// existing reclaim lock is honoured only while its recorded PID is alive (a
+// genuine concurrent reclaim); a stale one (dead/foreign PID) is removed once
+// and re-attempted so a crashed reclaimer cannot block boot indefinitely. Of
+// concurrent callers, O_EXCL ensures at most one acquires it; the rest fail
+// and back off to ErrLockHeld.
+//
+// On success the file at reclaimPath holds our PID and the caller is
+// responsible for removing it. On every failure path after the file was
+// created (write or close error) the file is removed again, so a failed
+// acquire never leaves behind a reclaim lock naming this live process.
+// Underlying OS errors are wrapped with %w.
+//
+// NOTE(review): a reclaim lock observed between its creation and the PID
+// write reads as empty, fails readLockPID, and is therefore treated as stale
+// and removed. Confirm whether that window needs closing.
+func acquireReclaimLock(reclaimPath string) error {
+	for attempt := 0; attempt < 2; attempt++ {
+		f, err := os.OpenFile(reclaimPath, os.O_CREATE|os.O_EXCL|os.O_WRONLY, 0o600)
+		if err == nil {
+			if _, werr := fmt.Fprintf(f, "%d\n", os.Getpid()); werr != nil {
+				_ = f.Close()
+				_ = os.Remove(reclaimPath)
+				return fmt.Errorf("write reclaim lock %s: %w", reclaimPath, werr)
+			}
+			if cerr := f.Close(); cerr != nil {
+				// The caller only cleans up after a successful acquire, so
+				// remove the file here rather than leak a lock with our PID.
+				_ = os.Remove(reclaimPath)
+				return fmt.Errorf("close reclaim lock %s: %w", reclaimPath, cerr)
+			}
+			return nil
+		}
+		if !os.IsExist(err) {
+			return fmt.Errorf("create reclaim lock %s: %w", reclaimPath, err)
+		}
+		// Reclaim lock present. A live owner means a real concurrent reclaim.
+		if pid, perr := readLockPID(reclaimPath); perr == nil && processAlive(pid) {
+			return fmt.Errorf("concurrent reclaim in progress (pid %d)", pid)
+		}
+		// Stale reclaim lock — clear it and retry the exclusive create once.
+		if rerr := os.Remove(reclaimPath); rerr != nil && !os.IsNotExist(rerr) {
+			return fmt.Errorf("remove stale reclaim lock %s: %w", reclaimPath, rerr)
+		}
+	}
+	return fmt.Errorf("could not acquire reclaim lock %s after retry", reclaimPath)
+}
+
+// readLockPID reads the file at path and parses its whitespace-trimmed
+// contents as a decimal PID. It returns the os.ReadFile error when the file
+// cannot be read, an "empty lockfile" error when the file holds only
+// whitespace, and the strconv.Atoi error when the contents are not an
+// integer. The parsed value is not range-checked here; zero or negative
+// values are returned as-is.
+func readLockPID(path string) (int, error) {
+	data, err := os.ReadFile(path)
+	if err != nil {
+		return 0, err
+	}
+	pidStr := strings.TrimSpace(string(data))
+	if pidStr == "" {
+		return 0, errors.New("empty lockfile")
+	}
+	return strconv.Atoi(pidStr)
+}
@@ -0,0 +1,137 @@
+package store
+
+import (
+	"errors"
+	"fmt"
+	"os"
+	"path/filepath"
+	"sync"
+	"testing"
+)
+
+// TestAcquireLockfile_FreshDir verifies the clean-acquire path: in an empty
+// directory the lock is granted and the lockfile contains exactly our PID
+// followed by a newline.
+func TestAcquireLockfile_FreshDir(t *testing.T) {
+	dir := t.TempDir()
+	release, err := AcquireLockfile(dir)
+	if err != nil {
+		t.Fatalf("AcquireLockfile: %v", err)
+	}
+	defer release()
+
+	// Lockfile should exist with our PID.
+	data, err := os.ReadFile(filepath.Join(dir, "tinyforge.lock"))
+	if err != nil {
+		t.Fatalf("read lockfile: %v", err)
+	}
+	want := fmt.Sprintf("%d\n", os.Getpid())
+	if string(data) != want {
+		t.Errorf("lockfile content = %q, want %q", data, want)
+	}
+}
+
+// TestAcquireLockfile_HeldByLivePID_Refused verifies that a lockfile naming
+// a live process is honoured: acquisition fails with an error that wraps
+// ErrLockHeld.
+func TestAcquireLockfile_HeldByLivePID_Refused(t *testing.T) {
+	dir := t.TempDir()
+	// Plant a lockfile holding the current PID (which is obviously alive).
+	if err := os.WriteFile(filepath.Join(dir, "tinyforge.lock"),
+		[]byte(fmt.Sprintf("%d\n", os.Getpid())), 0o600); err != nil {
+		t.Fatalf("plant lockfile: %v", err)
+	}
+	release, err := AcquireLockfile(dir)
+	if err == nil {
+		// Unexpected success: release before failing so the temp dir is clean.
+		release()
+		t.Fatal("expected ErrLockHeld, got nil")
+	}
+	if !errors.Is(err, ErrLockHeld) {
+		t.Errorf("error = %v, want wrap of ErrLockHeld", err)
+	}
+}
+
+func TestAcquireLockfile_StalePID_Reclaimed(t *testing.T) {
+	dir := t.TempDir()
+	// PID 1 would make a poor stand-in for a stale lock: on POSIX it is
+	// init/launchd/systemd, which is always alive (and on Windows the
+	// System Idle Process is PID 0, not 1). We plant a deliberately
+	// implausible PID instead: a value just below the signed 32-bit
+	// maximum, far above any plausible system maximum.
+	stalePID := 2147483640
+	if err := os.WriteFile(filepath.Join(dir, "tinyforge.lock"),
+		[]byte(fmt.Sprintf("%d\n", stalePID)), 0o600); err != nil {
+		t.Fatalf("plant stale lockfile: %v", err)
+	}
+	release, err := AcquireLockfile(dir)
+	if err != nil {
+		t.Fatalf("expected reclaim of stale lock, got: %v", err)
+	}
+	defer release()
+
+	// Verify it now holds OUR pid, not the stale one.
+	data, err := os.ReadFile(filepath.Join(dir, "tinyforge.lock"))
+	if err != nil {
+		t.Fatalf("read lockfile after reclaim: %v", err)
+	}
+	want := fmt.Sprintf("%d\n", os.Getpid())
+	if string(data) != want {
+		t.Errorf("lockfile content after reclaim = %q, want %q", data, want)
+	}
+}
+
+// TestAcquireLockfile_ConcurrentReclaim_SingleWinner races n goroutines
+// against one stale lockfile and asserts that exactly one acquires it.
+//
+// All goroutines run in this test process and therefore share one PID, so
+// any lock written by a winner reads as "held by a live process" to the
+// others; the test covers the in-process race, not a multi-process one.
+func TestAcquireLockfile_ConcurrentReclaim_SingleWinner(t *testing.T) {
+	dir := t.TempDir()
+	// Plant a stale lockfile (impossibly high, certainly-dead PID), then have
+	// many goroutines race to reclaim it. Exactly one must win; the rest must
+	// be refused with ErrLockHeld. A "last-writer-wins" reclaim would let
+	// several goroutines all believe they own the lock.
+	stalePID := 2147483640
+	if err := os.WriteFile(filepath.Join(dir, "tinyforge.lock"),
+		[]byte(fmt.Sprintf("%d\n", stalePID)), 0o600); err != nil {
+		t.Fatalf("plant stale lockfile: %v", err)
+	}
+
+	const n = 16
+	var (
+		wg       sync.WaitGroup
+		mu       sync.Mutex
+		winners  int
+		releases []func()
+	)
+	// start is closed once every goroutine has been launched so they all
+	// call AcquireLockfile at roughly the same moment.
+	start := make(chan struct{})
+	for i := 0; i < n; i++ {
+		wg.Add(1)
+		go func() {
+			defer wg.Done()
+			<-start
+			release, err := AcquireLockfile(dir)
+			if err != nil {
+				// t.Errorf (unlike t.Fatalf) may be called from a goroutine
+				// other than the one running the test function.
+				if !errors.Is(err, ErrLockHeld) {
+					t.Errorf("loser error = %v, want wrap of ErrLockHeld", err)
+				}
+				return
+			}
+			mu.Lock()
+			winners++
+			releases = append(releases, release)
+			mu.Unlock()
+		}()
+	}
+	close(start)
+	wg.Wait()
+
+	// Release before asserting so the lockfile is cleaned up on failure too.
+	for _, r := range releases {
+		r()
+	}
+	if winners != 1 {
+		t.Fatalf("concurrent reclaim winners = %d, want exactly 1", winners)
+	}
+}
+
+// TestAcquireLockfile_ReleaseRemovesFile verifies that calling the returned
+// release function deletes the lockfile from the data directory.
+func TestAcquireLockfile_ReleaseRemovesFile(t *testing.T) {
+	dir := t.TempDir()
+	release, err := AcquireLockfile(dir)
+	if err != nil {
+		t.Fatalf("AcquireLockfile: %v", err)
+	}
+	release()
+
+	path := filepath.Join(dir, "tinyforge.lock")
+	// Anything other than "does not exist" (including a nil error) is a failure.
+	if _, err := os.Stat(path); !os.IsNotExist(err) {
+		t.Errorf("lockfile still present after release: %v", err)
+	}
+}
@@ -0,0 +1,33 @@
+//go:build !windows
+
+package store
+
+import (
+	"errors"
+	"os"
+	"syscall"
+)
+
+// processAlive checks whether the given PID belongs to a running process.
+// On POSIX, kill(pid, 0) sends no signal but returns ESRCH if the PID is
+// dead, EPERM if alive-but-foreign-owned (still "alive" for our purposes).
+//
+// os.FindProcess never returns a non-nil error on Linux / macOS / *BSD
+// for any PID value — it just records the integer. The probe is purely
+// the Signal(0) result. We keep the FindProcess call to obtain the
+// *os.Process handle Signal needs; we don't branch on its error.
+//
+// Non-positive PIDs are reported as not alive without probing. Any Signal
+// error other than a permission error is treated as "not alive".
+func processAlive(pid int) bool {
+	if pid <= 0 {
+		return false
+	}
+	proc, _ := os.FindProcess(pid)
+	if proc == nil {
+		// Defensive: guards the Signal call below against a nil handle.
+		return false
+	}
+	err := proc.Signal(syscall.Signal(0))
+	if err == nil {
+		return true
+	}
+	// EPERM = alive but not ours; ESRCH = dead.
+	return errors.Is(err, os.ErrPermission) || errors.Is(err, syscall.EPERM)
+}
@@ -0,0 +1,30 @@
+//go:build windows
+
+package store
+
+import (
+	"golang.org/x/sys/windows"
+)
+
+// processAlive returns true when the given PID is currently held by a
+// running Windows process. OpenProcess with PROCESS_QUERY_LIMITED_INFORMATION
+// is the supported way to check liveness without elevation.
+//
+// Non-positive PIDs are reported as not alive. When the process exists but
+// cannot be opened (ERROR_ACCESS_DENIED) it is reported as alive, matching
+// the POSIX implementation's treatment of EPERM: a lock must never be
+// reclaimed from a process we merely lack permission to inspect. Any other
+// OpenProcess failure is treated as "no such process".
+func processAlive(pid int) bool {
+	if pid <= 0 {
+		return false
+	}
+	h, err := windows.OpenProcess(windows.PROCESS_QUERY_LIMITED_INFORMATION, false, uint32(pid))
+	if err != nil {
+		// Access denied means the PID resolved to a real process that we
+		// are not allowed to open — alive for our purposes.
+		return err == windows.ERROR_ACCESS_DENIED
+	}
+	defer windows.CloseHandle(h)
+	var exitCode uint32
+	if err := windows.GetExitCodeProcess(h, &exitCode); err != nil {
+		// Conservative: if we can't ask, assume alive so we don't reclaim
+		// an active lock. Worst case the operator sees ErrLockHeld and
+		// removes the lockfile by hand.
+		return true
+	}
+	// A process that really exited with code 259 is indistinguishable from
+	// a running one here; that errs on the side of "alive" as well.
+	const stillActive = 259 // STILL_ACTIVE
+	return exitCode == stillActive
+}
@@ -278,12 +278,20 @@ const (
 // containers.workload_kind and workloads.kind. After the hard cutover the
 // backing project / stack / static_site tables are gone — these constants
 // are just strings used to filter the unified containers index in the UI.
+//
+// `build` is the dockerfile-source kind: a container built from a
+// Dockerfile in a Git repo. Operationally it looks like a site (one
+// container, one optional public face) but its origin is the build
+// pipeline, not a static-asset extract. Dashboard filters that need to
+// distinguish "I built this from source" from "I served files from a
+// repo" should key on this value.
 type WorkloadKind string

 const (
 	WorkloadKindProject WorkloadKind = "project"
 	WorkloadKindStack   WorkloadKind = "stack"
 	WorkloadKindSite    WorkloadKind = "site"
+	WorkloadKindBuild   WorkloadKind = "build"
 )

 // Workload is the unifying primitive that abstracts Project, Stack, and StaticSite.
@@ -316,6 +324,31 @@ type Workload struct {
 	UpdatedAt               string `json:"updated_at"`
 }

+// WorkloadNotification is one configured outbound notification route for
+// a workload. Multiple rows per workload model the "one Slack channel
+// for failures, one Discord webhook for successes" routing the legacy
+// single notification_url column could not express.
+//
+// EventTypes is a comma-separated allow-list (e.g. "build_failure" or
+// "deploy_success,deploy_failure"). An empty EventTypes means the row
+// fires for every event type — the cheapest way to keep the existing
+// single-destination behaviour expressible in the new shape.
+//
+// Secret round-trips through the same crypto envelope as other stored
+// secrets; the API layer strips it from responses.
+type WorkloadNotification struct {
+	ID         string `json:"id"`
+	WorkloadID string `json:"workload_id"`
+	Name       string `json:"name"`
+	URL        string `json:"url"`
+	// Secret is tagged json:"-": encoding/json never emits it and never
+	// populates it from a decoded request body, so callers must set it
+	// explicitly.
+	Secret     string `json:"-"`
+	EventTypes string `json:"event_types"`
+	Enabled    bool   `json:"enabled"`
+	SortOrder  int    `json:"sort_order"`
+	CreatedAt  string `json:"created_at"`
+	UpdatedAt  string `json:"updated_at"`
+}
+
 // Container is the normalized index of every Tinyforge-managed container.
 // Replaces the project-specific Instance table after migration. Subdomain/
 // proxy fields are hoisted as first-class columns because ListProxyRoutes,
@@ -55,11 +55,20 @@ func New(dbPath string) (*Store, error) {
 	db.SetMaxOpenConns(1)
 	db.SetConnMaxLifetime(0)

-	// Enable WAL mode and foreign keys for better concurrency and referential integrity.
+	// Enable WAL mode and foreign keys for better concurrency and
+	// referential integrity. `synchronous=NORMAL` pairs with WAL to skip
+	// the per-commit fsync — the WAL is still synced at checkpoint time,
+	// committed data survives clean shutdowns and application crashes,
+	// and only a power loss or OS crash can roll back the last few
+	// committed transactions (acceptable for a tinyforge box).
+	// cache_size=-20000 = 20000 KiB (about 20 MB) of page cache,
+	// temp_store=MEMORY keeps indexer scratch off disk; both are pure
+	// perf knobs.
 	pragmas := []string{
 		"PRAGMA journal_mode=WAL",
+		"PRAGMA synchronous=NORMAL",
 		"PRAGMA foreign_keys=ON",
 		"PRAGMA busy_timeout=5000",
+		"PRAGMA cache_size=-20000",
+		"PRAGMA temp_store=MEMORY",
 	}
 	for _, p := range pragmas {
 		if _, err := db.Exec(p); err != nil {
@@ -284,6 +293,24 @@ func (s *Store) runMigrations() error {
 			created_at                  TEXT NOT NULL DEFAULT (datetime('now')),
 			updated_at                  TEXT NOT NULL DEFAULT (datetime('now'))
 		)`,
+		// workload_notifications: per-workload notification destinations.
+		// Each row is one route (Slack channel, Discord webhook, generic
+		// receiver, ...). event_types is a comma-separated allow-list —
+		// empty means "all events". When zero rows exist for a workload
+		// the dispatcher falls back to the legacy single notification_url
+		// column on workloads so existing setups keep working unchanged.
+		`CREATE TABLE IF NOT EXISTS workload_notifications (
+			id           TEXT PRIMARY KEY,
+			workload_id  TEXT NOT NULL REFERENCES workloads(id) ON DELETE CASCADE,
+			name         TEXT NOT NULL,
+			url          TEXT NOT NULL,
+			secret       TEXT NOT NULL DEFAULT '',
+			event_types  TEXT NOT NULL DEFAULT '',
+			enabled      INTEGER NOT NULL DEFAULT 1,
+			sort_order   INTEGER NOT NULL DEFAULT 0,
+			created_at   TEXT NOT NULL DEFAULT (datetime('now')),
+			updated_at   TEXT NOT NULL DEFAULT (datetime('now'))
+		)`,
 		// workload_trigger_bindings: many-to-many between workloads and
 		// triggers. binding_config is the per-binding override applied on
 		// top of trigger.config (top-level JSON merge, binding wins).
@@ -427,6 +454,7 @@ func (s *Store) runMigrations() error {
 		`CREATE UNIQUE INDEX IF NOT EXISTS idx_triggers_webhook_secret ON triggers(webhook_secret) WHERE webhook_secret != ''`,
 		`CREATE INDEX IF NOT EXISTS idx_bindings_workload         ON workload_trigger_bindings(workload_id)`,
 		`CREATE INDEX IF NOT EXISTS idx_bindings_trigger          ON workload_trigger_bindings(trigger_id)`,
+		`CREATE INDEX IF NOT EXISTS idx_workload_notifs_workload  ON workload_notifications(workload_id)`,
 	}
 	for _, idx := range indexes {
 		if _, err := s.db.Exec(idx); err != nil {
@@ -434,13 +462,215 @@ func (s *Store) runMigrations() error {
 		}
 	}

-	if err := s.backfillTriggersFromWorkloads(); err != nil {
+	// schema_versions table gates one-shot data migrations like the
+	// trigger backfill below. Without this, the backfill scan ran on
+	// every boot even on fully-migrated DBs — wasted I/O and (more
+	// importantly) made it impossible to tell whether a "no rows
+	// processed" was a clean state or a missed-migration bug.
+	if _, err := s.db.Exec(`CREATE TABLE IF NOT EXISTS schema_versions (
+		version    INTEGER PRIMARY KEY,
+		applied_at TEXT NOT NULL DEFAULT (datetime('now'))
+	)`); err != nil {
+		return fmt.Errorf("create schema_versions: %w", err)
+	}
+
+	if err := s.runOnce(1, "trigger backfill", s.backfillTriggersFromWorkloads); err != nil {
+		// Backfill failure is non-fatal — we log and let the operator
+		// retry. The version is only recorded on success.
 		slog.Warn("trigger backfill", "error", err)
 	}

 	return nil
 }

+// runOnce executes fn until it has succeeded once for the given version,
+// recording that success in schema_versions; afterwards it is a no-op.
+// Useful for data migrations whose source table eventually disappears (so
+// re-running becomes pointless or dangerous).
+//
+// If fn returns an error, the error is returned unchanged and no version
+// row is written, so fn runs again on the next call. The version row is
+// inserted in a separate statement after fn returns: if that insert fails,
+// fn has already taken effect and will be invoked again next time, so fn
+// should tolerate being re-run. label is used only in error and log
+// messages.
+func (s *Store) runOnce(version int, label string, fn func() error) error {
+	var applied int
+	if err := s.db.QueryRow(`SELECT COUNT(*) FROM schema_versions WHERE version = ?`, version).Scan(&applied); err != nil {
+		return fmt.Errorf("check %s: %w", label, err)
+	}
+	if applied > 0 {
+		// Already recorded as applied; skip silently.
+		return nil
+	}
+	if err := fn(); err != nil {
+		return err
+	}
+	if _, err := s.db.Exec(`INSERT INTO schema_versions (version) VALUES (?)`, version); err != nil {
+		return fmt.Errorf("mark %s applied: %w", label, err)
+	}
+	slog.Info("schema migration applied", "version", version, "label", label)
+	return nil
+}
+
+// RunOnce is the public counterpart of runOnce, exposed so cmd/server can
+// gate post-store-open migrations (e.g. crypto re-encryption that needs
+// the ENCRYPTION_KEY which Store does not own) through the same
+// schema_versions ledger.
+//
+// It delegates directly to runOnce, so version numbers passed here share
+// one namespace with the store's own migrations and must not collide
+// with them.
+func (s *Store) RunOnce(version int, label string, fn func() error) error {
+	return s.runOnce(version, label, fn)
+}
+
+// EnvelopeMigrator describes the contract a crypto package implements to
+// rewrite legacy unprefixed-hex ciphertext as versioned envelope values.
+// HasEnvelope reports whether a value already carries the new prefix.
+// Decrypt returns plaintext for either form; Encrypt always produces the
+// new envelope form. By accepting closures the store stays free of any
+// import on internal/crypto, mirroring the rest of the package layout.
+//
+// All three fields are called without a nil check by the migration code,
+// so every one of them must be set.
+type EnvelopeMigrator struct {
+	// HasEnvelope reports whether value is already in envelope form.
+	HasEnvelope func(value string) bool
+	// Decrypt returns the plaintext for ciphertext, or an error.
+	Decrypt     func(ciphertext string) (string, error)
+	// Encrypt returns the envelope-form ciphertext for plaintext.
+	Encrypt     func(plaintext string) (string, error)
+}
+
+// MigrateSecretsToEnvelope walks every column known to carry an encrypted
+// secret and rewrites legacy unprefixed-hex values into the new
+// envelope form using the current encryption key.
+//
+// Behaviour, per-row:
+//   - empty value → skip (no secret stored)
+//   - already-envelope value → skip (already migrated)
+//   - decrypt fails → skip (value is either plaintext from a v0 boot
+//     OR ciphertext from a rotated key; either way we cannot safely
+//     re-encrypt and leaving it alone preserves the existing read
+//     semantics)
+//   - decrypt succeeds → encrypt to envelope form + UPDATE
+//
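+// The whole sweep runs in a single transaction so a power-loss
+// mid-migration leaves the DB in either the pre- or post-migration
+// state, never half. Idempotent via schema_versions version 2 — the
+// next boot is a no-op. The version row is written by runOnce after
+// the transaction commits, so a crash between the two re-runs the
+// sweep once; that is harmless because already-envelope values are
+// skipped. Rows skipped because they could not be decrypted are not
+// retried once version 2 has been recorded.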
+//
+// Columns covered:
+//   - settings.npm_password
+//   - settings.cloudflare_api_token
+//   - auth_settings.oidc_client_secret
+//   - registries.token
+//   - workload_env.value WHERE encrypted=1
+func (s *Store) MigrateSecretsToEnvelope(m EnvelopeMigrator) error {
+	return s.runOnce(2, "secrets envelope migration", func() error {
+		tx, err := s.db.Begin()
+		if err != nil {
+			return fmt.Errorf("begin: %w", err)
+		}
+		defer func() { _ = tx.Rollback() }()
+
+		// Single-row tables (settings, auth_settings) — read-update inline.
+		singleRowColumns := []struct {
+			table, column string
+		}{
+			{"settings", "npm_password"},
+			{"settings", "cloudflare_api_token"},
+			{"auth_settings", "oidc_client_secret"},
+		}
+		for _, c := range singleRowColumns {
+			var v string
+			err := tx.QueryRow(
+				fmt.Sprintf(`SELECT %s FROM %s LIMIT 1`, c.column, c.table),
+			).Scan(&v)
+			if err != nil {
+				if errors.Is(err, sql.ErrNoRows) {
+					continue
+				}
+				// auth_settings may not exist on a brand-new DB until
+				// the OIDC code touches it; treat as nothing-to-migrate.
+				slog.Debug("envelope migration: column read skipped",
+					"table", c.table, "column", c.column, "error", err)
+				continue
+			}
+			migrated, ok := tryMigrate(m, v)
+			if !ok {
+				continue
+			}
+			if _, err := tx.Exec(
+				fmt.Sprintf(`UPDATE %s SET %s = ?`, c.table, c.column),
+				migrated,
+			); err != nil {
+				return fmt.Errorf("update %s.%s: %w", c.table, c.column, err)
+			}
+		}
+
+		// Multi-row: registries.token
+		if err := migrateRowColumn(tx, m,
+			`SELECT id, token FROM registries WHERE token != ''`,
+			`UPDATE registries SET token = ? WHERE id = ?`,
+		); err != nil {
+			return fmt.Errorf("registries.token: %w", err)
+		}
+
+		// Multi-row: workload_env.value WHERE encrypted=1
+		if err := migrateRowColumn(tx, m,
+			`SELECT id, value FROM workload_env WHERE encrypted = 1 AND value != ''`,
+			`UPDATE workload_env SET value = ? WHERE id = ?`,
+		); err != nil {
+			return fmt.Errorf("workload_env.value: %w", err)
+		}
+
+		if err := tx.Commit(); err != nil {
+			return fmt.Errorf("commit: %w", err)
+		}
+		return nil
+	})
+}
+
+// migrateRowColumn applies the envelope rewrite to every (id, value)
+// pair returned by selectQ. updateQ takes (newValue, id) as parameters.
+// Each row is its own attempt; one row failing migration (decrypt fail)
+// does not abort the others.
+//
+// selectQ must yield exactly two columns, id then value, both scanned as
+// strings. Rewrites are collected first and applied only after the result
+// set has been fully iterated. Unlike a skipped row, a query, scan or
+// update error aborts the call and is returned unwrapped.
+func migrateRowColumn(tx *sql.Tx, m EnvelopeMigrator, selectQ, updateQ string) error {
+	rows, err := tx.Query(selectQ)
+	if err != nil {
+		return err
+	}
+	defer rows.Close()
+	type pending struct{ id, newValue string }
+	var updates []pending
+	for rows.Next() {
+		var id, value string
+		if err := rows.Scan(&id, &value); err != nil {
+			return err
+		}
+		newValue, ok := tryMigrate(m, value)
+		if !ok {
+			// Nothing to rewrite for this row; leave the stored value alone.
+			continue
+		}
+		updates = append(updates, pending{id, newValue})
+	}
+	if err := rows.Err(); err != nil {
+		return err
+	}
+	// Presumably deferred until here so no UPDATE runs on the transaction
+	// while the SELECT cursor is still open — confirm against the driver.
+	for _, u := range updates {
+		if _, err := tx.Exec(updateQ, u.newValue, u.id); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+// tryMigrate returns the envelope-form ciphertext + true when the input
+// is a legacy unprefixed value that decrypts successfully with the
+// current key. Returns ("", false) for anything else: empty, already
+// envelope, plaintext, or decrypt-failed (rotated-key case).
+//
+// A value that decrypts but then fails to re-encrypt is also skipped, but
+// that case is logged at warn level: it signals a problem with the current
+// key rather than with the stored value, and because the migration is
+// recorded as applied afterwards the value would otherwise stay in legacy
+// form with no trace. Neither the plaintext nor the ciphertext is logged.
+func tryMigrate(m EnvelopeMigrator, v string) (string, bool) {
+	if v == "" {
+		return "", false
+	}
+	if m.HasEnvelope(v) {
+		return "", false
+	}
+	plaintext, err := m.Decrypt(v)
+	if err != nil {
+		// Expected for plaintext or rotated-key values; stay quiet.
+		return "", false
+	}
+	enc, err := m.Encrypt(plaintext)
+	if err != nil {
+		slog.Warn("envelope migration: re-encrypt failed, value left in legacy form", "error", err)
+		return "", false
+	}
+	return enc, true
+}
+
 // backfillTriggersFromWorkloads converts embedded trigger config on
 // workload rows into standalone trigger + binding rows. Runs once per
 // boot and is idempotent — only workloads with non-empty trigger_kind
@@ -0,0 +1,159 @@
+package store
+
+import (
+	"database/sql"
+	"errors"
+	"fmt"
+	"strings"
+
+	"github.com/google/uuid"
+)
+
+const workloadNotificationColumns = `id, workload_id, name, url, secret,
+	event_types, enabled, sort_order, created_at, updated_at`
+
+// scanWorkloadNotification reads one workload_notifications row from
+// scanner, which may be anything with a Scan method (the callers in this
+// file pass both a QueryRow result and a Query cursor). Columns must
+// arrive in workloadNotificationColumns order. The enabled column is
+// scanned as an integer and mapped to a bool (non-zero means enabled).
+// On a Scan error the returned struct is only partially meaningful and
+// must be ignored.
+func scanWorkloadNotification(scanner interface{ Scan(...any) error }) (WorkloadNotification, error) {
+	var n WorkloadNotification
+	var enabled int
+	err := scanner.Scan(
+		&n.ID, &n.WorkloadID, &n.Name, &n.URL, &n.Secret,
+		&n.EventTypes, &enabled, &n.SortOrder, &n.CreatedAt, &n.UpdatedAt,
+	)
+	n.Enabled = enabled != 0
+	return n, err
+}
+
+// CreateWorkloadNotification inserts a notification route. Returns the
+// populated row (with assigned id + timestamps) so callers don't need to
+// follow up with a Get.
+//
+// WorkloadID and URL are required. A caller-supplied ID is kept; an empty
+// one is replaced with a fresh UUID. CreatedAt and UpdatedAt are always
+// overwritten with the current time. Enabled is written exactly as given,
+// so a zero-value struct creates a disabled route. Secret is stored as
+// passed in; this function performs no encryption itself.
+func (s *Store) CreateWorkloadNotification(n WorkloadNotification) (WorkloadNotification, error) {
+	if n.WorkloadID == "" {
+		return WorkloadNotification{}, fmt.Errorf("workload_id is required")
+	}
+	if n.URL == "" {
+		return WorkloadNotification{}, fmt.Errorf("url is required")
+	}
+	if n.ID == "" {
+		n.ID = uuid.New().String()
+	}
+	n.CreatedAt = Now()
+	n.UpdatedAt = n.CreatedAt
+
+	_, err := s.db.Exec(
+		`INSERT INTO workload_notifications (`+workloadNotificationColumns+`)
+		 VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`,
+		n.ID, n.WorkloadID, n.Name, n.URL, n.Secret,
+		n.EventTypes, BoolToInt(n.Enabled), n.SortOrder, n.CreatedAt, n.UpdatedAt,
+	)
+	if err != nil {
+		return WorkloadNotification{}, fmt.Errorf("insert workload_notification: %w", err)
+	}
+	return n, nil
+}
+
+// ListWorkloadNotifications returns every notification row for a
+// workload ordered by (sort_order, created_at) so the UI stays stable
+// across reorderings.
+//
+// Disabled rows are included. When the workload has no rows the result is
+// an empty, non-nil slice. Returned rows carry the stored Secret value.
+func (s *Store) ListWorkloadNotifications(workloadID string) ([]WorkloadNotification, error) {
+	rows, err := s.db.Query(
+		`SELECT `+workloadNotificationColumns+`
+		 FROM workload_notifications
+		 WHERE workload_id = ?
+		 ORDER BY sort_order, created_at`,
+		workloadID,
+	)
+	if err != nil {
+		return nil, fmt.Errorf("list workload_notifications: %w", err)
+	}
+	defer rows.Close()
+
+	// Initialised to an empty slice (not nil) on purpose.
+	out := []WorkloadNotification{}
+	for rows.Next() {
+		n, err := scanWorkloadNotification(rows)
+		if err != nil {
+			return nil, fmt.Errorf("scan workload_notification: %w", err)
+		}
+		out = append(out, n)
+	}
+	return out, rows.Err()
+}
+
+// GetWorkloadNotification fetches one notification row by id. Returns
+// ErrNotFound when the row does not exist so callers can return 404
+// cleanly.
+//
+// The not-found error wraps ErrNotFound (test with errors.Is); any other
+// failure is wrapped with query context. The returned row carries the
+// stored Secret value.
+func (s *Store) GetWorkloadNotification(id string) (WorkloadNotification, error) {
+	n, err := scanWorkloadNotification(s.db.QueryRow(
+		`SELECT `+workloadNotificationColumns+`
+		 FROM workload_notifications WHERE id = ?`, id,
+	))
+	if errors.Is(err, sql.ErrNoRows) {
+		return WorkloadNotification{}, fmt.Errorf("workload_notification %s: %w", id, ErrNotFound)
+	}
+	if err != nil {
+		return WorkloadNotification{}, fmt.Errorf("query workload_notification: %w", err)
+	}
+	return n, nil
+}
+
+// UpdateWorkloadNotification rewrites an existing row. WorkloadID is
+// immutable — re-anchoring a route to a different workload would invite
+// silent reassignments after a paste-bug in the UI; recreate instead.
+//
+// Every mutable column (name, url, secret, event_types, enabled,
+// sort_order) is overwritten with the value in n, including Secret: a
+// caller that passes an empty Secret clears the stored one, so carry the
+// existing value forward when it should be kept. n.WorkloadID and
+// n.CreatedAt are ignored. Returns an error wrapping ErrNotFound when no
+// row has the given id.
+func (s *Store) UpdateWorkloadNotification(n WorkloadNotification) error {
+	if n.ID == "" {
+		return fmt.Errorf("id is required")
+	}
+	if n.URL == "" {
+		return fmt.Errorf("url is required")
+	}
+	n.UpdatedAt = Now()
+	res, err := s.db.Exec(
+		`UPDATE workload_notifications
+		 SET name = ?, url = ?, secret = ?, event_types = ?,
+		     enabled = ?, sort_order = ?, updated_at = ?
+		 WHERE id = ?`,
+		n.Name, n.URL, n.Secret, n.EventTypes,
+		BoolToInt(n.Enabled), n.SortOrder, n.UpdatedAt, n.ID,
+	)
+	if err != nil {
+		return fmt.Errorf("update workload_notification: %w", err)
+	}
+	// The RowsAffected error is discarded; a failure there leaves rows at 0
+	// and is reported as not-found.
+	rows, _ := res.RowsAffected()
+	if rows == 0 {
+		return fmt.Errorf("workload_notification %s: %w", n.ID, ErrNotFound)
+	}
+	return nil
+}
+
+// DeleteWorkloadNotification drops a single notification row.
+// Not idempotent at the error level: deleting an id that does not exist
+// (including one that was already deleted) returns an error wrapping
+// ErrNotFound so the API can map it to 404 cleanly.
+func (s *Store) DeleteWorkloadNotification(id string) error {
+	res, err := s.db.Exec(`DELETE FROM workload_notifications WHERE id = ?`, id)
+	if err != nil {
+		return fmt.Errorf("delete workload_notification: %w", err)
+	}
+	// The RowsAffected error is discarded; a failure there leaves rows at 0
+	// and is reported as not-found.
+	rows, _ := res.RowsAffected()
+	if rows == 0 {
+		return fmt.Errorf("workload_notification %s: %w", id, ErrNotFound)
+	}
+	return nil
+}
+
+// MatchesEventType returns true when the notification row is enabled and
+// its EventTypes allow-list includes eventType (or is empty, meaning
+// "match all"). A disabled row never matches, whatever its allow-list.
+// Helper exported so the notification dispatcher can fan-out filtering
+// inline without duplicating the comma-split parser.
+//
+// Entries are compared exactly (case-sensitive) after trimming
+// surrounding whitespace from each comma-separated entry; eventType
+// itself is not trimmed.
+func (n WorkloadNotification) MatchesEventType(eventType string) bool {
+	if !n.Enabled {
+		return false
+	}
+	if n.EventTypes == "" {
+		return true
+	}
+	for _, et := range strings.Split(n.EventTypes, ",") {
+		if strings.TrimSpace(et) == eventType {
+			return true
+		}
+	}
+	return false
+}
@@ -0,0 +1,170 @@
+package store
+
+import (
+	"errors"
+	"testing"
+)
+
+// seedWorkloadForNotifications inserts a bare-bones project workload
+// (image source) under the given name so that notification rows have a
+// parent to satisfy the workload_notifications FK. It fails the test
+// immediately if the insert errors and returns the new workload's ID.
+func seedWorkloadForNotifications(t *testing.T, s *Store, name string) string {
+	t.Helper()
+	seed := Workload{
+		Kind:       string(WorkloadKindProject),
+		Name:       name,
+		SourceKind: "image",
+	}
+	created, err := s.CreateWorkload(seed)
+	if err != nil {
+		t.Fatalf("seed workload: %v", err)
+	}
+	return created.ID
+}
+
+func TestCreateWorkloadNotification_RoundTrip(t *testing.T) {
+	s := newTestStore(t)
+	wlID := seedWorkloadForNotifications(t, s, "app1")
+
+	created, err := s.CreateWorkloadNotification(WorkloadNotification{
+		WorkloadID: wlID,
+		Name:       "Slack alerts",
+		URL:        "https://hooks.slack.test/x",
+		Secret:     "shh",
+		EventTypes: "deploy_failure,build_failure",
+		Enabled:    true,
+	})
+	if err != nil {
+		t.Fatalf("CreateWorkloadNotification: %v", err)
+	}
+	if created.ID == "" {
+		t.Fatal("expected ID to be assigned")
+	}
+
+	got, err := s.GetWorkloadNotification(created.ID)
+	if err != nil {
+		t.Fatalf("Get: %v", err)
+	}
+	if got.URL != "https://hooks.slack.test/x" || got.Name != "Slack alerts" {
+		t.Errorf("row mismatch: %+v", got)
+	}
+	if !got.Enabled {
+		t.Error("expected Enabled=true")
+	}
+	if got.EventTypes != "deploy_failure,build_failure" {
+		t.Errorf("event_types = %q", got.EventTypes)
+	}
+}
+
+// TestCreateWorkloadNotification_RejectsMissingURL checks that creating
+// a notification with an empty URL is refused with an error.
+func TestCreateWorkloadNotification_RejectsMissingURL(t *testing.T) {
+	s := newTestStore(t)
+	invalid := WorkloadNotification{
+		WorkloadID: seedWorkloadForNotifications(t, s, "app1"),
+		Name:       "broken",
+		URL:        "",
+	}
+	if _, err := s.CreateWorkloadNotification(invalid); err == nil {
+		t.Fatal("expected URL validation error")
+	}
+}
+
+// TestListWorkloadNotifications_SortedByOrder inserts three rows with
+// out-of-sequence SortOrder values and checks that
+// ListWorkloadNotifications returns them in ascending SortOrder.
+func TestListWorkloadNotifications_SortedByOrder(t *testing.T) {
+	s := newTestStore(t)
+	wlID := seedWorkloadForNotifications(t, s, "app1")
+
+	// Deliberately inserted out of order (30, 10, 20).
+	seeds := []WorkloadNotification{
+		{WorkloadID: wlID, Name: "C", URL: "https://c.test", SortOrder: 30},
+		{WorkloadID: wlID, Name: "A", URL: "https://a.test", SortOrder: 10},
+		{WorkloadID: wlID, Name: "B", URL: "https://b.test", SortOrder: 20},
+	}
+	for _, seed := range seeds {
+		// Fail fast on a broken insert rather than surfacing it later as
+		// a confusing row-count mismatch.
+		if _, err := s.CreateWorkloadNotification(seed); err != nil {
+			t.Fatalf("create %q: %v", seed.Name, err)
+		}
+	}
+
+	rows, err := s.ListWorkloadNotifications(wlID)
+	if err != nil {
+		t.Fatalf("list: %v", err)
+	}
+	if len(rows) != 3 {
+		t.Fatalf("len = %d, want 3", len(rows))
+	}
+	if rows[0].Name != "A" || rows[1].Name != "B" || rows[2].Name != "C" {
+		t.Errorf("sort order wrong: %q %q %q", rows[0].Name, rows[1].Name, rows[2].Name)
+	}
+}
+
+// TestUpdateWorkloadNotification_PersistsChanges creates a row, changes
+// its name, URL, enabled flag and event-type list, and checks that every
+// changed field is visible on a subsequent read.
+func TestUpdateWorkloadNotification_PersistsChanges(t *testing.T) {
+	s := newTestStore(t)
+	wlID := seedWorkloadForNotifications(t, s, "app1")
+	n, err := s.CreateWorkloadNotification(WorkloadNotification{
+		WorkloadID: wlID, Name: "old", URL: "https://old.test", Enabled: true,
+	})
+	if err != nil {
+		t.Fatalf("create: %v", err)
+	}
+	n.Name = "new"
+	n.URL = "https://new.test"
+	n.Enabled = false
+	n.EventTypes = "deploy_success"
+	if err := s.UpdateWorkloadNotification(n); err != nil {
+		t.Fatalf("update: %v", err)
+	}
+	got, err := s.GetWorkloadNotification(n.ID)
+	if err != nil {
+		t.Fatalf("get after update: %v", err)
+	}
+	if got.Name != "new" || got.URL != "https://new.test" || got.Enabled {
+		t.Errorf("update did not persist: %+v", got)
+	}
+	// EventTypes is modified above, so assert it too.
+	if got.EventTypes != "deploy_success" {
+		t.Errorf("event_types = %q, want %q", got.EventTypes, "deploy_success")
+	}
+}
+
+// TestDeleteWorkloadNotification_ReturnsNotFoundForMissing checks that
+// deleting an id with no matching row yields an error that unwraps to
+// ErrNotFound.
+func TestDeleteWorkloadNotification_ReturnsNotFoundForMissing(t *testing.T) {
+	s := newTestStore(t)
+	if err := s.DeleteWorkloadNotification("nope"); !errors.Is(err, ErrNotFound) {
+		t.Errorf("expected ErrNotFound, got %v", err)
+	}
+}
+
+// TestDeleteWorkloadNotification_CascadesFromWorkload checks that
+// deleting a workload also removes its notification rows.
+func TestDeleteWorkloadNotification_CascadesFromWorkload(t *testing.T) {
+	s := newTestStore(t)
+	wlID := seedWorkloadForNotifications(t, s, "app1")
+	if _, err := s.CreateWorkloadNotification(WorkloadNotification{
+		WorkloadID: wlID, Name: "x", URL: "https://x.test",
+	}); err != nil {
+		t.Fatalf("create notification: %v", err)
+	}
+
+	// Confirm the row exists first; otherwise a silently failed insert
+	// would let the post-delete "0 rows" assertion pass vacuously.
+	before, err := s.ListWorkloadNotifications(wlID)
+	if err != nil {
+		t.Fatalf("list before delete: %v", err)
+	}
+	if len(before) != 1 {
+		t.Fatalf("rows before delete = %d, want 1", len(before))
+	}
+
+	if err := s.DeleteWorkload(wlID); err != nil {
+		t.Fatalf("delete workload: %v", err)
+	}
+	rows, err := s.ListWorkloadNotifications(wlID)
+	if err != nil {
+		t.Fatalf("list after cascade: %v", err)
+	}
+	if len(rows) != 0 {
+		t.Errorf("expected cascade delete to remove rows, got %d", len(rows))
+	}
+}
+
+// TestMatchesEventType_AllowList drives MatchesEventType on enabled rows
+// through a table of allow-list strings and probe event types.
+func TestMatchesEventType_AllowList(t *testing.T) {
+	table := []struct {
+		eventTypes string
+		probe      string
+		want       bool
+	}{
+		// An empty allow-list matches every event type.
+		{eventTypes: "", probe: "deploy_success", want: true},
+		{eventTypes: "deploy_success,deploy_failure", probe: "deploy_success", want: true},
+		{eventTypes: "deploy_success,deploy_failure", probe: "build_failure", want: false},
+		{eventTypes: "build_failure", probe: "build_failure", want: true},
+		// Whitespace around entries is tolerated.
+		{eventTypes: " deploy_success , build_failure ", probe: "build_failure", want: true},
+	}
+	for i := range table {
+		tc := table[i]
+		row := WorkloadNotification{Enabled: true, EventTypes: tc.eventTypes}
+		if got := row.MatchesEventType(tc.probe); got != tc.want {
+			t.Errorf("MatchesEventType(%q, %q) = %v, want %v", tc.eventTypes, tc.probe, got, tc.want)
+		}
+	}
+}
+
+// TestMatchesEventType_DisabledNeverMatches checks that a disabled row
+// does not match even when its allow-list is empty ("match all").
+func TestMatchesEventType_DisabledNeverMatches(t *testing.T) {
+	disabled := WorkloadNotification{EventTypes: "", Enabled: false}
+	matched := disabled.MatchesEventType("any")
+	if matched {
+		t.Error("disabled row should never match")
+	}
+}
@@ -173,11 +173,24 @@ func (s *Store) UpdateWorkload(w Workload) error {
 	return nil
 }

-// DeleteWorkload removes a workload row. Cascading deletes for the matching
-// project/stack/site row stay with the kind-specific Delete functions; this
-// only removes the workload entry.
+// DeleteWorkload removes a workload row. Cascading deletes for FK-backed
+// child tables (workload_env, workload_volumes, workload_trigger_bindings)
+// happen via SQLite's ON DELETE CASCADE. The `containers` table doesn't
+// yet have an FK to workloads (planned migration — see ops notes), so we
+// drop its rows explicitly here in the same transaction to prevent zombie
+// container rows from outliving their owning workload.
 func (s *Store) DeleteWorkload(id string) error {
-	result, err := s.db.Exec(`DELETE FROM workloads WHERE id = ?`, id)
+	tx, err := s.db.Begin()
+	if err != nil {
+		return fmt.Errorf("begin: %w", err)
+	}
+	defer func() { _ = tx.Rollback() }()
+
+	// Explicit container cleanup until the FK migration lands.
+	if _, err := tx.Exec(`DELETE FROM containers WHERE workload_id = ?`, id); err != nil {
+		return fmt.Errorf("delete containers: %w", err)
+	}
+	result, err := tx.Exec(`DELETE FROM workloads WHERE id = ?`, id)
 	if err != nil {
 		return fmt.Errorf("delete workload: %w", err)
 	}
@@ -188,6 +201,9 @@ func (s *Store) DeleteWorkload(id string) error {
 	if n == 0 {
 		return fmt.Errorf("workload %s: %w", id, ErrNotFound)
 	}
+	if err := tx.Commit(); err != nil {
+		return fmt.Errorf("commit: %w", err)
+	}
 	return nil
 }