feat(apps): stepped creation wizard, branch previews, and app-creation fixes

This session (frontend focus):
- Rebuild /apps/new as a 4-step wizard (Basics → Configure → Trigger → Review):
  WizardRail, SourceKindPicker card grid, AppManifest review, per-step validation,
  ConfirmDialog-based unsaved-changes guard.
- Extract lib/workload/sourceForms.ts (single source of truth for source_config)
  + {Image,Compose,Static,Dockerfile}SourceForm + StaticDiscoveryWizard; fold the
  /apps/[id] edit form onto the same components (removes the duplication). Add
  vitest + sourceForms unit tests.
- Branch preview environments UI: /chain is_preview/preview_branch + a Preview
  environments panel on /apps/[id] (per-branch URLs, ConfirmDialog teardown, armed
  state); RegistryImagePicker on the registry trigger and the image source.
- Fixes: image-inspect 404 -> admin-gated POST /api/discovery/image/inspect;
  conflict-panel blur flicker; friendly localized discovery errors; CPU/Memory
  label hints; dashboard + /apps "Total workloads" count only source_kind workloads
  (drop stale trigger_kind gate); NPM cert/access-list name cache; EntityPicker
  empty-list guard.
- Update CLAUDE.md frontend conventions + add a Build & Test section.

Also captures pre-existing in-progress platform work (not from this session):
workload notifications, Prometheus metrics export, store lockfile, health probes,
backup hardening, and related store/webhook/scheduler changes.
This commit is contained in:
2026-05-29 02:09:54 +03:00
parent 956943edbb
commit 410a131cec
112 changed files with 13285 additions and 2765 deletions
+29
View File
@@ -2,6 +2,7 @@ package store
import (
"database/sql"
"encoding/json"
"errors"
"fmt"
"strings"
@@ -9,6 +10,22 @@ import (
"github.com/google/uuid"
)
// validateExtraJSON rejects values that must never reach the extra_json
// column because they are not JSON at all. Readers of the column tolerate
// unknown keys (see docs/CODEMAPS/container-extra-json.md) but cannot cope
// with a document that fails to parse, and the schema has no check of its
// own, so the store boundary is where the invariant is enforced.
//
// The empty string is accepted; the visible callers replace it with "{}"
// before calling. Any syntactically valid JSON value passes, including
// scalars and arrays: only well-formedness is checked, not shape.
//
// The returned error reports the byte length of the rejected value rather
// than its content.
func validateExtraJSON(v string) error {
	switch {
	case v == "":
		return nil
	case json.Valid([]byte(v)):
		return nil
	default:
		return fmt.Errorf("extra_json: not valid JSON (%d bytes)", len(v))
	}
}
// containerColumns is the canonical column list for `containers` queries.
// stage_id is populated by the deployer for project containers (so ListProxyRoutes
// survives stage renames) and left empty for stacks and sites.
@@ -42,6 +59,9 @@ func (s *Store) CreateContainer(c Container) (Container, error) {
if c.ExtraJSON == "" {
c.ExtraJSON = "{}"
}
if err := validateExtraJSON(c.ExtraJSON); err != nil {
return Container{}, err
}
_, err := s.db.Exec(
`INSERT INTO containers (`+containerColumns+`)
@@ -77,6 +97,9 @@ func (s *Store) UpsertContainer(c Container) error {
if c.ExtraJSON == "" {
c.ExtraJSON = "{}"
}
if err := validateExtraJSON(c.ExtraJSON); err != nil {
return err
}
// SQLite UPSERT — INSERT...ON CONFLICT(id) DO UPDATE.
_, err := s.db.Exec(
@@ -129,6 +152,9 @@ func (s *Store) ReconcileContainer(c Container) error {
if c.ExtraJSON == "" {
c.ExtraJSON = "{}"
}
if err := validateExtraJSON(c.ExtraJSON); err != nil {
return err
}
// extra_json is deliberately NOT in the ON CONFLICT SET clause: the
// reconciler can't observe per-face route IDs from Docker, and
@@ -321,6 +347,9 @@ func (s *Store) UpdateContainer(c Container) error {
if c.ExtraJSON == "" {
c.ExtraJSON = "{}"
}
if err := validateExtraJSON(c.ExtraJSON); err != nil {
return err
}
result, err := s.db.Exec(
`UPDATE containers SET workload_id=?, workload_kind=?, role=?, stage_id=?, container_id=?,
image_ref=?, image_tag=?, host=?, state=?, port=?,
+171
View File
@@ -0,0 +1,171 @@
package store
import (
"errors"
"fmt"
"os"
"path/filepath"
"strconv"
"strings"
)
// ErrLockHeld is returned when another Tinyforge process appears to be
// running against the same data directory. SQLite + SetMaxOpenConns(1)
// makes this otherwise-silent collision a recipe for double-fired
// schedulers, double-polled registries, and `extra_json` RMW corruption.
//
// AcquireLockfile and its helpers always wrap this sentinel with %w plus
// extra context (holding PID, lockfile path), so callers should match it
// with errors.Is rather than ==.
var ErrLockHeld = errors.New("data directory is locked by another tinyforge process")
// Lockfile is a portable PID file. AcquireLockfile takes it; the returned
// Release function removes it. The contract:
//
// - Lockfile is created with O_CREATE|O_EXCL — atomic on POSIX, atomic
// on NTFS / ReFS via the equivalent.
// - On collision, the existing file's PID is read; if the PID is dead,
// we treat the lock as stale (process crashed without cleanup),
// reclaim it, and proceed. Live PID → ErrLockHeld.
// - flock is intentionally not used: cross-platform consistency wins
// over advisory-lock semantics for the single-instance use case.
//
// NOTE(review): AcquireLockfile returns a bare release func and nothing
// in this part of the file constructs a Lockfile value — confirm the
// type is still used elsewhere.
type Lockfile struct {
// path is the lockfile location on disk.
path string
}
// AcquireLockfile takes the single-instance PID lock "tinyforge.lock"
// inside dataDir and returns a release function that deletes it again;
// callers are expected to defer release.
//
// Flow:
//   - An exclusive create that succeeds is a clean acquire.
//   - Otherwise the existing file's PID is read. A live PID yields an
//     error wrapping ErrLockHeld that names the PID and the lockfile path.
//   - A dead PID, or a file whose PID cannot be read or parsed, is handed
//     to reclaimStaleLock, which serializes the replacement through an
//     auxiliary ".reclaim" lock so that concurrent booters do not all
//     replace the same stale file. A plain remove-and-retry or a rename
//     ("last writer wins") would let several of them believe they own
//     the lock.
//
// NOTE(review): a lockfile that another process has just created but not
// yet written its PID into reads as empty and is therefore classified as
// malformed here — confirm that window is acceptable.
func AcquireLockfile(dataDir string) (release func(), err error) {
	path := filepath.Join(dataDir, "tinyforge.lock")

	// Fast path: nobody holds the lock.
	rel, created, createErr := tryCreateExclusive(path)
	if created {
		return rel, nil
	}
	if createErr != nil {
		return nil, createErr
	}

	// A lockfile already exists: decide between refusing and reclaiming.
	pid, readErr := readLockPID(path)
	switch {
	case readErr != nil:
		return reclaimStaleLock(path, "malformed existing lockfile")
	case processAlive(pid):
		return nil, fmt.Errorf("%w (held by pid %d, lockfile=%s)", ErrLockHeld, pid, path)
	default:
		return reclaimStaleLock(path, fmt.Sprintf("stale lockfile (dead pid %d)", pid))
	}
}
// tryCreateExclusive creates path with O_CREATE|O_EXCL (mode 0600) and
// writes the current PID followed by a newline into it.
//
// Results:
//   - (release, true, nil): the file was created; release removes it.
//   - (nil, false, nil): the file already exists.
//   - (nil, false, err): any other failure. If the file was created but
//     the write or close failed, it is removed again before returning.
func tryCreateExclusive(path string) (func(), bool, error) {
	f, err := os.OpenFile(path, os.O_CREATE|os.O_EXCL|os.O_WRONLY, 0o600)
	switch {
	case err == nil:
		// Created; fall through to populate it.
	case os.IsExist(err):
		return nil, false, nil
	default:
		return nil, false, fmt.Errorf("open lockfile: %w", err)
	}

	// remove is both the cleanup for a half-written file and the release
	// function handed to the caller. Removal errors are ignored: there is
	// nothing useful left to do with them.
	remove := func() { _ = os.Remove(path) }

	if _, werr := fmt.Fprintf(f, "%d\n", os.Getpid()); werr != nil {
		_ = f.Close()
		remove()
		return nil, false, fmt.Errorf("write lockfile: %w", werr)
	}
	if cerr := f.Close(); cerr != nil {
		remove()
		return nil, false, fmt.Errorf("close lockfile: %w", cerr)
	}
	return remove, true, nil
}
// reclaimStaleLock swaps a stale or malformed lockfile at lockPath for a
// fresh one carrying our PID. The swap (remove, then exclusive create) is
// performed while holding an auxiliary lock at lockPath+".reclaim", so
// reclaimers that go through this function take turns instead of each
// removing and recreating the main lockfile. reason is a human-readable
// description of why the lock was considered reclaimable; it is appended
// to every error produced here.
//
// Every failure is reported as an error wrapping ErrLockHeld, except an
// unexpected I/O error from the final exclusive create, which is returned
// as-is. On success the returned function removes the main lockfile.
func reclaimStaleLock(lockPath, reason string) (func(), error) {
	reclaimPath := lockPath + ".reclaim"
	if lockErr := acquireReclaimLock(reclaimPath); lockErr != nil {
		return nil, fmt.Errorf("%w (%v; %s)", ErrLockHeld, lockErr, reason)
	}
	// Drop the auxiliary lock on every exit path (best effort).
	defer func() { _ = os.Remove(reclaimPath) }()

	// Look again now that we are serialized: somebody may have completed a
	// reclaim between our caller's liveness probe and our taking the
	// reclaim lock.
	holder, readErr := readLockPID(lockPath)
	if readErr == nil && processAlive(holder) {
		return nil, fmt.Errorf("%w (reclaimed by pid %d while we waited; %s)",
			ErrLockHeld, holder, reason)
	}

	// Remove the stale file; it having vanished already is fine.
	rmErr := os.Remove(lockPath)
	if rmErr != nil && !os.IsNotExist(rmErr) {
		return nil, fmt.Errorf("%w (could not remove stale lockfile %s: %v; %s)",
			ErrLockHeld, lockPath, rmErr, reason)
	}

	release, created, createErr := tryCreateExclusive(lockPath)
	switch {
	case createErr != nil:
		return nil, createErr
	case !created:
		// The reclaim lock only excludes other reclaimers. A process doing
		// a clean first-try acquire does not take it, so the lockfile can
		// legitimately reappear in the gap; fail safe and refuse.
		return nil, fmt.Errorf("%w (lockfile reappeared during reclaim of %s; %s)",
			ErrLockHeld, lockPath, reason)
	}
	return release, nil
}
// acquireReclaimLock takes the auxiliary reclaim lock at reclaimPath by
// creating it with O_EXCL and writing our PID into it. It returns nil once
// the lock is held; the caller is responsible for removing the file.
//
// If the file already exists it is honoured only while the PID recorded
// in it is alive (a genuine concurrent reclaim). Otherwise it is treated
// as left behind by a crashed reclaimer: it is removed and the exclusive
// create is retried once, so a crash mid-reclaim cannot block boot
// indefinitely.
//
// When the file was created but could not be fully written or closed, it
// is removed again before the error is returned, so a failed attempt never
// leaves a reclaim lock carrying our (live) PID behind.
//
// Errors are plain descriptions (no %w): the caller wraps them into
// ErrLockHeld.
//
// NOTE(review): a reclaim lock that another process has created but not
// yet written its PID into reads as empty and is classified as stale
// here, so two near-simultaneous reclaimers can both end up holding it.
// Closing that window needs a design change (e.g. publishing a fully
// written file atomically) — confirm whether it matters in practice.
func acquireReclaimLock(reclaimPath string) error {
	for attempt := 0; attempt < 2; attempt++ {
		f, err := os.OpenFile(reclaimPath, os.O_CREATE|os.O_EXCL|os.O_WRONLY, 0o600)
		if err == nil {
			if _, werr := fmt.Fprintf(f, "%d\n", os.Getpid()); werr != nil {
				_ = f.Close()
				_ = os.Remove(reclaimPath)
				return fmt.Errorf("write reclaim lock %s: %v", reclaimPath, werr)
			}
			if cerr := f.Close(); cerr != nil {
				// Mirror tryCreateExclusive: do not leave a lock we are
				// about to disown on disk.
				_ = os.Remove(reclaimPath)
				return fmt.Errorf("close reclaim lock %s: %v", reclaimPath, cerr)
			}
			return nil
		}
		if !os.IsExist(err) {
			return fmt.Errorf("create reclaim lock %s: %v", reclaimPath, err)
		}

		// Reclaim lock present. A live owner means a real concurrent reclaim.
		if pid, perr := readLockPID(reclaimPath); perr == nil && processAlive(pid) {
			return fmt.Errorf("concurrent reclaim in progress (pid %d)", pid)
		}

		// Stale reclaim lock — clear it and retry the exclusive create once.
		if rerr := os.Remove(reclaimPath); rerr != nil && !os.IsNotExist(rerr) {
			return fmt.Errorf("remove stale reclaim lock %s: %v", reclaimPath, rerr)
		}
	}
	return fmt.Errorf("could not acquire reclaim lock %s after retry", reclaimPath)
}
// readLockPID reads the PID stored in the lock file at path. Surrounding
// whitespace (including the trailing newline the writers add) is ignored.
//
// It returns the os.ReadFile error unchanged when the file cannot be read,
// an "empty lockfile" error when the file holds only whitespace, and the
// strconv.Atoi error when the content is not a decimal integer.
func readLockPID(path string) (int, error) {
	raw, err := os.ReadFile(path)
	if err != nil {
		return 0, err
	}
	trimmed := strings.TrimSpace(string(raw))
	if len(trimmed) == 0 {
		return 0, errors.New("empty lockfile")
	}
	return strconv.Atoi(trimmed)
}
+137
View File
@@ -0,0 +1,137 @@
package store
import (
"errors"
"fmt"
"os"
"path/filepath"
"sync"
"testing"
)
// TestAcquireLockfile_FreshDir checks the clean-acquire path: in an empty
// directory the lock is granted and the lockfile holds our PID plus a
// trailing newline.
func TestAcquireLockfile_FreshDir(t *testing.T) {
	dir := t.TempDir()
	lockPath := filepath.Join(dir, "tinyforge.lock")

	release, err := AcquireLockfile(dir)
	if err != nil {
		t.Fatalf("AcquireLockfile: %v", err)
	}
	defer release()

	got, readErr := os.ReadFile(lockPath)
	if readErr != nil {
		t.Fatalf("read lockfile: %v", readErr)
	}
	if want := fmt.Sprintf("%d\n", os.Getpid()); string(got) != want {
		t.Errorf("lockfile content = %q, want %q", got, want)
	}
}
// TestAcquireLockfile_HeldByLivePID_Refused plants a lockfile naming the
// test process itself (necessarily alive) and expects the acquire to be
// refused with an error wrapping ErrLockHeld.
func TestAcquireLockfile_HeldByLivePID_Refused(t *testing.T) {
	dir := t.TempDir()

	livePID := []byte(fmt.Sprintf("%d\n", os.Getpid()))
	plantErr := os.WriteFile(filepath.Join(dir, "tinyforge.lock"), livePID, 0o600)
	if plantErr != nil {
		t.Fatalf("plant lockfile: %v", plantErr)
	}

	release, err := AcquireLockfile(dir)
	if err == nil {
		release()
		t.Fatal("expected ErrLockHeld, got nil")
	}
	if !errors.Is(err, ErrLockHeld) {
		t.Errorf("error = %v, want wrap of ErrLockHeld", err)
	}
}
// TestAcquireLockfile_StalePID_Reclaimed plants a lockfile naming a dead
// PID and expects AcquireLockfile to reclaim it and rewrite it with ours.
func TestAcquireLockfile_StalePID_Reclaimed(t *testing.T) {
	dir := t.TempDir()
	lockPath := filepath.Join(dir, "tinyforge.lock")

	// A low PID such as 1 would be the wrong choice: it is never our
	// process but is almost certainly alive. Use a 31-bit value far above
	// any plausible system maximum so the PID is certain to be dead.
	const stalePID = 2147483640
	plantErr := os.WriteFile(lockPath, []byte(fmt.Sprintf("%d\n", stalePID)), 0o600)
	if plantErr != nil {
		t.Fatalf("plant stale lockfile: %v", plantErr)
	}

	release, err := AcquireLockfile(dir)
	if err != nil {
		t.Fatalf("expected reclaim of stale lock, got: %v", err)
	}
	defer release()

	// The file must now carry OUR pid, not the stale one.
	got, readErr := os.ReadFile(lockPath)
	if readErr != nil {
		t.Fatalf("read lockfile after reclaim: %v", readErr)
	}
	if want := fmt.Sprintf("%d\n", os.Getpid()); string(got) != want {
		t.Errorf("lockfile content after reclaim = %q, want %q", got, want)
	}
}
// TestAcquireLockfile_ConcurrentReclaim_SingleWinner races many goroutines
// against one stale lockfile (impossibly high, certainly-dead PID).
// Exactly one must obtain the lock; every other attempt must fail with an
// error wrapping ErrLockHeld. A "last-writer-wins" reclaim would let
// several goroutines all believe they own the lock.
func TestAcquireLockfile_ConcurrentReclaim_SingleWinner(t *testing.T) {
	dir := t.TempDir()

	const stalePID = 2147483640
	plantErr := os.WriteFile(filepath.Join(dir, "tinyforge.lock"),
		[]byte(fmt.Sprintf("%d\n", stalePID)), 0o600)
	if plantErr != nil {
		t.Fatalf("plant stale lockfile: %v", plantErr)
	}

	const n = 16
	var wg sync.WaitGroup
	start := make(chan struct{})
	// Buffered to n so a winner never blocks when handing over its release.
	acquired := make(chan func(), n)

	for i := 0; i < n; i++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			<-start // line every goroutine up on the same starting gun
			release, err := AcquireLockfile(dir)
			switch {
			case err == nil:
				acquired <- release
			case !errors.Is(err, ErrLockHeld):
				t.Errorf("loser error = %v, want wrap of ErrLockHeld", err)
			}
		}()
	}
	close(start)
	wg.Wait()
	close(acquired)

	winners := 0
	for release := range acquired {
		winners++
		release()
	}
	if winners != 1 {
		t.Fatalf("concurrent reclaim winners = %d, want exactly 1", winners)
	}
}
// TestAcquireLockfile_ReleaseRemovesFile verifies that calling the release
// function returned by AcquireLockfile deletes the lockfile.
func TestAcquireLockfile_ReleaseRemovesFile(t *testing.T) {
	dir := t.TempDir()
	lockPath := filepath.Join(dir, "tinyforge.lock")

	release, err := AcquireLockfile(dir)
	if err != nil {
		t.Fatalf("AcquireLockfile: %v", err)
	}
	release()

	_, statErr := os.Stat(lockPath)
	if !os.IsNotExist(statErr) {
		t.Errorf("lockfile still present after release: %v", statErr)
	}
}
+33
View File
@@ -0,0 +1,33 @@
//go:build !windows
package store
import (
"errors"
"os"
"syscall"
)
// processAlive reports whether pid names a running process (POSIX build).
//
// The probe is signal 0: nothing is delivered, but the kernel still
// performs its existence and permission checks. A nil result means the
// process exists; a permission error means it exists but belongs to
// someone else, which still counts as alive for lock purposes. Any other
// error is treated as "not running".
//
// Non-positive PIDs are never alive. The os.FindProcess error is ignored
// on purpose: on Unix systems it only records the PID and is documented
// to always succeed, so it is used purely to obtain the *os.Process handle
// that Signal needs.
func processAlive(pid int) bool {
	if pid <= 0 {
		return false
	}
	proc, _ := os.FindProcess(pid)
	if proc == nil {
		return false
	}
	switch err := proc.Signal(syscall.Signal(0)); {
	case err == nil:
		return true
	case errors.Is(err, os.ErrPermission), errors.Is(err, syscall.EPERM):
		// Alive, just not ours to signal.
		return true
	default:
		return false
	}
}
+30
View File
@@ -0,0 +1,30 @@
//go:build windows
package store
import (
"golang.org/x/sys/windows"
)
// processAlive returns true when the given PID is currently held by a
// running Windows process. OpenProcess with PROCESS_QUERY_LIMITED_INFORMATION
// is the supported way to check liveness without elevation.
//
// Whenever the answer is uncertain the function errs on the side of
// "alive": wrongly refusing to start only costs the operator a manual
// lockfile removal, whereas wrongly reclaiming lets two instances run
// against the same data directory.
//
//   - pid <= 0: never alive.
//   - OpenProcess fails with ERROR_ACCESS_DENIED: the process exists but we
//     may not open it (for example it runs under another account), so it is
//     alive. This mirrors the EPERM handling of the POSIX implementation.
//   - OpenProcess fails otherwise: treated as not running.
//   - GetExitCodeProcess fails: assumed alive.
//   - Otherwise alive exactly when the exit code is STILL_ACTIVE (259).
//     NOTE(review): a process that genuinely exited with code 259 would be
//     reported as alive; that is the conservative direction.
func processAlive(pid int) bool {
	if pid <= 0 {
		return false
	}
	h, err := windows.OpenProcess(windows.PROCESS_QUERY_LIMITED_INFORMATION, false, uint32(pid))
	if err != nil {
		// Access denied proves the PID is in use; anything else is read as
		// "no such process".
		return err == windows.ERROR_ACCESS_DENIED
	}
	defer windows.CloseHandle(h)

	var exitCode uint32
	if err := windows.GetExitCodeProcess(h, &exitCode); err != nil {
		// Conservative: if we can't ask, assume alive so we don't reclaim
		// an active lock. Worst case the operator sees ErrLockHeld and
		// removes the lockfile by hand.
		return true
	}
	const stillActive = 259 // STILL_ACTIVE
	return exitCode == stillActive
}
+33
View File
@@ -278,12 +278,20 @@ const (
// containers.workload_kind and workloads.kind. After the hard cutover the
// backing project / stack / static_site tables are gone — these constants
// are just strings used to filter the unified containers index in the UI.
//
// `build` is the dockerfile-source kind: a container built from a
// Dockerfile in a Git repo. Operationally it looks like a site (one
// container, one optional public face) but its origin is the build
// pipeline, not a static-asset extract. Dashboard filters that need to
// distinguish "I built this from source" from "I served files from a
// repo" should key on this value.
type WorkloadKind string
const (
// WorkloadKindProject is the "project" kind value.
WorkloadKindProject WorkloadKind = "project"
// WorkloadKindStack is the "stack" kind value.
WorkloadKindStack WorkloadKind = "stack"
// WorkloadKindSite is the "site" kind value — presumably the former
// static_site workloads; confirm against the migration notes.
WorkloadKindSite WorkloadKind = "site"
// WorkloadKindBuild is the "build" kind value: a container built from a
// Dockerfile in a Git repo rather than extracted from static assets.
WorkloadKindBuild WorkloadKind = "build"
)
// Workload is the unifying primitive that abstracts Project, Stack, and StaticSite.
@@ -316,6 +324,31 @@ type Workload struct {
UpdatedAt string `json:"updated_at"`
}
// WorkloadNotification is one configured outbound notification route for
// a workload. Multiple rows per workload model the "one Slack channel
// for failures, one Discord webhook for successes" routing the legacy
// single notification_url column could not express.
//
// EventTypes is a comma-separated allow-list (e.g. "build_failure" or
// "deploy_success,deploy_failure"). An empty EventTypes means the row
// fires for every event type — the cheapest way to keep the existing
// single-destination behaviour expressible in the new shape.
//
// Secret round-trips through the same crypto envelope as other stored
// secrets; the API layer strips it from responses.
type WorkloadNotification struct {
// ID is the row's primary key.
ID string `json:"id"`
// WorkloadID references the owning workload.
WorkloadID string `json:"workload_id"`
// Name is a label for the route.
Name string `json:"name"`
// URL is the destination the notification is sent to.
URL string `json:"url"`
// Secret is excluded from JSON output via the "-" tag.
Secret string `json:"-"`
// EventTypes is the comma-separated allow-list described above.
EventTypes string `json:"event_types"`
// Enabled is stored in the database as an integer (0 or 1).
Enabled bool `json:"enabled"`
// SortOrder is the position of the route within its workload.
SortOrder int `json:"sort_order"`
// CreatedAt is the creation timestamp, stored as text.
CreatedAt string `json:"created_at"`
// UpdatedAt is the last-modification timestamp, stored as text.
UpdatedAt string `json:"updated_at"`
}
// Container is the normalized index of every Tinyforge-managed container.
// Replaces the project-specific Instance table after migration. Subdomain/
// proxy fields are hoisted as first-class columns because ListProxyRoutes,
+232 -2
View File
@@ -55,11 +55,20 @@ func New(dbPath string) (*Store, error) {
db.SetMaxOpenConns(1)
db.SetConnMaxLifetime(0)
// Enable WAL mode and foreign keys for better concurrency and referential integrity.
// Enable WAL mode and foreign keys for better concurrency and
// referential integrity. `synchronous=NORMAL` pairs with WAL to skip
// the per-write fsync — the OS still flushes on checkpoint, durability
// is preserved across clean shutdowns, and crashes lose at most the
// last few committed transactions (acceptable for a tinyforge box).
// cache_size=-20000 = 20 MiB page cache, temp_store=MEMORY keeps
// indexer scratch off disk; both are pure perf knobs.
pragmas := []string{
"PRAGMA journal_mode=WAL",
"PRAGMA synchronous=NORMAL",
"PRAGMA foreign_keys=ON",
"PRAGMA busy_timeout=5000",
"PRAGMA cache_size=-20000",
"PRAGMA temp_store=MEMORY",
}
for _, p := range pragmas {
if _, err := db.Exec(p); err != nil {
@@ -284,6 +293,24 @@ func (s *Store) runMigrations() error {
created_at TEXT NOT NULL DEFAULT (datetime('now')),
updated_at TEXT NOT NULL DEFAULT (datetime('now'))
)`,
// workload_notifications: per-workload notification destinations.
// Each row is one route (Slack channel, Discord webhook, generic
// receiver, ...). event_types is a comma-separated allow-list —
// empty means "all events". When zero rows exist for a workload
// the dispatcher falls back to the legacy single notification_url
// column on workloads so existing setups keep working unchanged.
`CREATE TABLE IF NOT EXISTS workload_notifications (
id TEXT PRIMARY KEY,
workload_id TEXT NOT NULL REFERENCES workloads(id) ON DELETE CASCADE,
name TEXT NOT NULL,
url TEXT NOT NULL,
secret TEXT NOT NULL DEFAULT '',
event_types TEXT NOT NULL DEFAULT '',
enabled INTEGER NOT NULL DEFAULT 1,
sort_order INTEGER NOT NULL DEFAULT 0,
created_at TEXT NOT NULL DEFAULT (datetime('now')),
updated_at TEXT NOT NULL DEFAULT (datetime('now'))
)`,
// workload_trigger_bindings: many-to-many between workloads and
// triggers. binding_config is the per-binding override applied on
// top of trigger.config (top-level JSON merge, binding wins).
@@ -427,6 +454,7 @@ func (s *Store) runMigrations() error {
`CREATE UNIQUE INDEX IF NOT EXISTS idx_triggers_webhook_secret ON triggers(webhook_secret) WHERE webhook_secret != ''`,
`CREATE INDEX IF NOT EXISTS idx_bindings_workload ON workload_trigger_bindings(workload_id)`,
`CREATE INDEX IF NOT EXISTS idx_bindings_trigger ON workload_trigger_bindings(trigger_id)`,
`CREATE INDEX IF NOT EXISTS idx_workload_notifs_workload ON workload_notifications(workload_id)`,
}
for _, idx := range indexes {
if _, err := s.db.Exec(idx); err != nil {
@@ -434,13 +462,215 @@ func (s *Store) runMigrations() error {
}
}
if err := s.backfillTriggersFromWorkloads(); err != nil {
// schema_versions table gates one-shot data migrations like the
// trigger backfill below. Without this, the backfill scan ran on
// every boot even on fully-migrated DBs — wasted I/O and (more
// importantly) made it impossible to tell whether a "no rows
// processed" was a clean state or a missed-migration bug.
if _, err := s.db.Exec(`CREATE TABLE IF NOT EXISTS schema_versions (
version INTEGER PRIMARY KEY,
applied_at TEXT NOT NULL DEFAULT (datetime('now'))
)`); err != nil {
return fmt.Errorf("create schema_versions: %w", err)
}
if err := s.runOnce(1, "trigger backfill", s.backfillTriggersFromWorkloads); err != nil {
// Backfill failure is non-fatal — we log and let the operator
// retry. The version is only recorded on success.
slog.Warn("trigger backfill", "error", err)
}
return nil
}
// runOnce runs fn unless schema_versions already contains a row for
// version, and records version there after fn succeeds. It is meant for
// one-shot data migrations whose source data eventually disappears, where
// re-running would be pointless or dangerous. label is used only in error
// and log messages.
//
// fn's error is returned unwrapped and leaves the version unrecorded, so
// the migration is attempted again on the next call. The check, fn, and
// the recording INSERT are separate statements rather than one
// transaction: if fn succeeds but the INSERT fails, fn will run again
// later, so fn should tolerate being repeated.
func (s *Store) runOnce(version int, label string, fn func() error) error {
	var count int
	row := s.db.QueryRow(`SELECT COUNT(*) FROM schema_versions WHERE version = ?`, version)
	if err := row.Scan(&count); err != nil {
		return fmt.Errorf("check %s: %w", label, err)
	}
	if count > 0 {
		// Already applied on an earlier run.
		return nil
	}

	if err := fn(); err != nil {
		return err
	}

	_, err := s.db.Exec(`INSERT INTO schema_versions (version) VALUES (?)`, version)
	if err != nil {
		return fmt.Errorf("mark %s applied: %w", label, err)
	}
	slog.Info("schema migration applied", "version", version, "label", label)
	return nil
}
// RunOnce exposes runOnce to other packages. It lets cmd/server gate
// migrations that can only run after the store is open (for example a
// crypto re-encryption that needs the ENCRYPTION_KEY, which Store does not
// own) through the same schema_versions ledger. Arguments and result are
// exactly those of runOnce.
func (s *Store) RunOnce(version int, label string, fn func() error) error {
	return s.runOnce(version, label, fn)
}
// EnvelopeMigrator describes the contract a crypto package implements to
// rewrite legacy unprefixed-hex ciphertext as versioned envelope values.
// HasEnvelope reports whether a value already carries the new prefix.
// Decrypt returns plaintext for either form; Encrypt always produces the
// new envelope form. By accepting closures the store stays free of any
// import on internal/crypto, mirroring the rest of the package layout.
//
// All three fields are called without a nil check, so every one must be
// set before the value is handed to MigrateSecretsToEnvelope.
type EnvelopeMigrator struct {
// HasEnvelope reports whether value is already in envelope form.
HasEnvelope func(value string) bool
// Decrypt turns stored ciphertext into plaintext.
Decrypt func(ciphertext string) (string, error)
// Encrypt turns plaintext into envelope-form ciphertext.
Encrypt func(plaintext string) (string, error)
}
// MigrateSecretsToEnvelope rewrites legacy unprefixed-hex secrets into the
// envelope form produced by m.Encrypt, for every column known to hold an
// encrypted secret.
//
// Per value (see tryMigrate):
//   - empty → left alone (no secret stored)
//   - already in envelope form → left alone
//   - m.Decrypt fails → left alone; the value is either plaintext from a
//     v0 boot or ciphertext from a rotated key, and in both cases leaving
//     it untouched preserves the existing read semantics
//   - m.Decrypt succeeds → re-encrypted and written back
//
// The sweep runs inside one transaction, so an interruption leaves the
// database either fully pre- or fully post-migration. It is gated through
// runOnce as schema_versions version 2, making later boots a no-op once
// it has succeeded.
//
// Columns covered:
//   - settings.npm_password
//   - settings.cloudflare_api_token
//   - auth_settings.oidc_client_secret
//   - registries.token
//   - workload_env.value WHERE encrypted=1
//
// NOTE(review): workload_notifications.secret is not in this list —
// presumably because that table never held legacy-format values; confirm.
func (s *Store) MigrateSecretsToEnvelope(m EnvelopeMigrator) error {
	sweep := func() error {
		tx, err := s.db.Begin()
		if err != nil {
			return fmt.Errorf("begin: %w", err)
		}
		// Rolling back after a successful Commit is a harmless no-op.
		defer func() { _ = tx.Rollback() }()

		// Single-row tables: read the one value and update it in place
		// (hence no WHERE clause on the UPDATE).
		type target struct{ table, column string }
		for _, tgt := range []target{
			{"settings", "npm_password"},
			{"settings", "cloudflare_api_token"},
			{"auth_settings", "oidc_client_secret"},
		} {
			var current string
			selectQ := fmt.Sprintf(`SELECT %s FROM %s LIMIT 1`, tgt.column, tgt.table)
			if scanErr := tx.QueryRow(selectQ).Scan(&current); scanErr != nil {
				if !errors.Is(scanErr, sql.ErrNoRows) {
					// auth_settings may not exist on a brand-new DB until
					// the OIDC code touches it; treat as nothing-to-migrate.
					slog.Debug("envelope migration: column read skipped",
						"table", tgt.table, "column", tgt.column, "error", scanErr)
				}
				continue
			}

			rewritten, ok := tryMigrate(m, current)
			if !ok {
				continue
			}
			updateQ := fmt.Sprintf(`UPDATE %s SET %s = ?`, tgt.table, tgt.column)
			if _, execErr := tx.Exec(updateQ, rewritten); execErr != nil {
				return fmt.Errorf("update %s.%s: %w", tgt.table, tgt.column, execErr)
			}
		}

		// Multi-row tables: one (id, value) pair per secret.
		multiRow := []struct{ label, selectQ, updateQ string }{
			{
				label:   "registries.token",
				selectQ: `SELECT id, token FROM registries WHERE token != ''`,
				updateQ: `UPDATE registries SET token = ? WHERE id = ?`,
			},
			{
				label:   "workload_env.value",
				selectQ: `SELECT id, value FROM workload_env WHERE encrypted = 1 AND value != ''`,
				updateQ: `UPDATE workload_env SET value = ? WHERE id = ?`,
			},
		}
		for _, mr := range multiRow {
			if rowErr := migrateRowColumn(tx, m, mr.selectQ, mr.updateQ); rowErr != nil {
				return fmt.Errorf("%s: %w", mr.label, rowErr)
			}
		}

		if commitErr := tx.Commit(); commitErr != nil {
			return fmt.Errorf("commit: %w", commitErr)
		}
		return nil
	}
	return s.runOnce(2, "secrets envelope migration", sweep)
}
// migrateRowColumn applies the envelope rewrite to a multi-row column
// inside tx. selectQ must yield (id, value) pairs; updateQ must accept
// (newValue, id) as its two parameters.
//
// Values that tryMigrate declines (empty, already migrated, undecryptable)
// are skipped without failing the others. Query, scan, and update errors
// abort and are returned unwrapped.
//
// The rewrites are collected first and applied only after the result set
// has been fully consumed, so no UPDATE is issued while the SELECT cursor
// is still being iterated on the transaction.
func migrateRowColumn(tx *sql.Tx, m EnvelopeMigrator, selectQ, updateQ string) error {
	rows, err := tx.Query(selectQ)
	if err != nil {
		return err
	}
	defer rows.Close()

	// Parallel slices: ids[i] is to be updated with values[i].
	var ids, values []string
	for rows.Next() {
		var id, old string
		if scanErr := rows.Scan(&id, &old); scanErr != nil {
			return scanErr
		}
		if fresh, ok := tryMigrate(m, old); ok {
			ids = append(ids, id)
			values = append(values, fresh)
		}
	}
	if iterErr := rows.Err(); iterErr != nil {
		return iterErr
	}

	for i, id := range ids {
		if _, execErr := tx.Exec(updateQ, values[i], id); execErr != nil {
			return execErr
		}
	}
	return nil
}
// tryMigrate converts one stored value to envelope form. It returns the
// new ciphertext and true only when v is non-empty, is not already in
// envelope form, decrypts successfully, and re-encrypts successfully.
// In every other case — empty, already envelope, plaintext, decrypt
// failure (rotated key), or encrypt failure — it returns ("", false) and
// the caller leaves the stored value untouched. Errors from m.Decrypt and
// m.Encrypt are deliberately not surfaced.
func tryMigrate(m EnvelopeMigrator, v string) (string, bool) {
	// HasEnvelope is only consulted for non-empty values.
	if v == "" || m.HasEnvelope(v) {
		return "", false
	}
	plaintext, err := m.Decrypt(v)
	if err != nil {
		return "", false
	}
	if enc, encErr := m.Encrypt(plaintext); encErr == nil {
		return enc, true
	}
	return "", false
}
// backfillTriggersFromWorkloads converts embedded trigger config on
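// workload rows into standalone trigger + binding rows. runMigrations gates
// it through runOnce (schema_versions version 1), so it is re-attempted on
// boot only until it has succeeded once. It is idempotent — only workloads with non-empty trigger_kind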
+159
View File
@@ -0,0 +1,159 @@
package store
import (
"database/sql"
"errors"
"fmt"
"strings"
"github.com/google/uuid"
)
// workloadNotificationColumns is the column list shared by every
// workload_notifications SELECT and INSERT in this file. Its order must
// match the Scan destinations in scanWorkloadNotification and the
// parameter order of the INSERT in CreateWorkloadNotification.
const workloadNotificationColumns = `id, workload_id, name, url, secret,
event_types, enabled, sort_order, created_at, updated_at`
// scanWorkloadNotification reads one workload_notifications row, laid out
// in workloadNotificationColumns order, from scanner (a *sql.Row or
// *sql.Rows — anything with a compatible Scan method).
//
// The enabled column is stored as an integer and converted to a bool
// (non-zero means true). On a Scan error the zero WorkloadNotification is
// returned together with the error unchanged, so callers can still match
// it with errors.Is (e.g. against sql.ErrNoRows) and never see a
// half-populated row.
func scanWorkloadNotification(scanner interface{ Scan(...any) error }) (WorkloadNotification, error) {
	var (
		n       WorkloadNotification
		enabled int
	)
	if err := scanner.Scan(
		&n.ID, &n.WorkloadID, &n.Name, &n.URL, &n.Secret,
		&n.EventTypes, &enabled, &n.SortOrder, &n.CreatedAt, &n.UpdatedAt,
	); err != nil {
		return WorkloadNotification{}, err
	}
	n.Enabled = enabled != 0
	return n, nil
}
// CreateWorkloadNotification stores a new notification route and returns
// the row as written, so callers do not need a follow-up Get.
//
// WorkloadID and URL are required. An empty ID is replaced with a fresh
// UUID. CreatedAt and UpdatedAt are always overwritten with the current
// time, whatever the caller supplied. On any failure the zero
// WorkloadNotification is returned with the error.
func (s *Store) CreateWorkloadNotification(n WorkloadNotification) (WorkloadNotification, error) {
	switch {
	case n.WorkloadID == "":
		return WorkloadNotification{}, fmt.Errorf("workload_id is required")
	case n.URL == "":
		return WorkloadNotification{}, fmt.Errorf("url is required")
	}

	if n.ID == "" {
		n.ID = uuid.New().String()
	}
	stamp := Now()
	n.CreatedAt, n.UpdatedAt = stamp, stamp

	query := `INSERT INTO workload_notifications (` + workloadNotificationColumns + `)
		 VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`
	if _, err := s.db.Exec(query,
		n.ID, n.WorkloadID, n.Name, n.URL, n.Secret,
		n.EventTypes, BoolToInt(n.Enabled), n.SortOrder, n.CreatedAt, n.UpdatedAt,
	); err != nil {
		return WorkloadNotification{}, fmt.Errorf("insert workload_notification: %w", err)
	}
	return n, nil
}
// ListWorkloadNotifications returns all notification routes of the given
// workload, ordered by (sort_order, created_at) so the UI ordering stays
// stable across reorderings.
//
// A workload without routes yields an empty, non-nil slice. A query or
// scan failure yields a nil slice and a wrapped error; an iteration error
// reported by rows.Err is returned alongside the rows read so far.
func (s *Store) ListWorkloadNotifications(workloadID string) ([]WorkloadNotification, error) {
	query := `SELECT ` + workloadNotificationColumns + `
		 FROM workload_notifications
		 WHERE workload_id = ?
		 ORDER BY sort_order, created_at`
	rows, err := s.db.Query(query, workloadID)
	if err != nil {
		return nil, fmt.Errorf("list workload_notifications: %w", err)
	}
	defer rows.Close()

	routes := make([]WorkloadNotification, 0)
	for rows.Next() {
		route, scanErr := scanWorkloadNotification(rows)
		if scanErr != nil {
			return nil, fmt.Errorf("scan workload_notification: %w", scanErr)
		}
		routes = append(routes, route)
	}
	return routes, rows.Err()
}
// GetWorkloadNotification loads a single notification route by id.
// A missing row is reported as an error wrapping ErrNotFound, which lets
// callers map it to a 404; any other failure is wrapped as a query error.
// The zero WorkloadNotification accompanies every error.
func (s *Store) GetWorkloadNotification(id string) (WorkloadNotification, error) {
	row := s.db.QueryRow(
		`SELECT `+workloadNotificationColumns+`
		 FROM workload_notifications WHERE id = ?`, id,
	)
	n, err := scanWorkloadNotification(row)
	switch {
	case err == nil:
		return n, nil
	case errors.Is(err, sql.ErrNoRows):
		return WorkloadNotification{}, fmt.Errorf("workload_notification %s: %w", id, ErrNotFound)
	default:
		return WorkloadNotification{}, fmt.Errorf("query workload_notification: %w", err)
	}
}
// UpdateWorkloadNotification rewrites an existing row. WorkloadID is
// immutable — re-anchoring a route to a different workload would invite
// silent reassignments after a paste-bug in the UI; recreate instead.
//
// ID and URL are required. Name, URL, Secret, EventTypes, Enabled and
// SortOrder are overwritten with the supplied values, and updated_at is
// set to the current time; created_at is left untouched.
//
// Returns an error wrapping ErrNotFound when no row has the given ID, and
// a wrapped error when the statement fails or the driver cannot report
// how many rows it affected.
func (s *Store) UpdateWorkloadNotification(n WorkloadNotification) error {
	if n.ID == "" {
		return fmt.Errorf("id is required")
	}
	if n.URL == "" {
		return fmt.Errorf("url is required")
	}
	n.UpdatedAt = Now()
	res, err := s.db.Exec(
		`UPDATE workload_notifications
		 SET name = ?, url = ?, secret = ?, event_types = ?,
		 enabled = ?, sort_order = ?, updated_at = ?
		 WHERE id = ?`,
		n.Name, n.URL, n.Secret, n.EventTypes,
		BoolToInt(n.Enabled), n.SortOrder, n.UpdatedAt, n.ID,
	)
	if err != nil {
		return fmt.Errorf("update workload_notification: %w", err)
	}
	// Without a trustworthy count we cannot tell "updated" from "missing",
	// so surface the failure instead of guessing.
	affected, err := res.RowsAffected()
	if err != nil {
		return fmt.Errorf("update workload_notification: rows affected: %w", err)
	}
	if affected == 0 {
		return fmt.Errorf("workload_notification %s: %w", n.ID, ErrNotFound)
	}
	return nil
}
// DeleteWorkloadNotification drops a single notification row by id.
//
// Deleting an id that does not exist (including one that was already
// deleted) returns an error wrapping ErrNotFound, so the API can map it to
// 404 cleanly. A wrapped error is also returned when the statement fails
// or the driver cannot report how many rows it affected.
func (s *Store) DeleteWorkloadNotification(id string) error {
	res, err := s.db.Exec(`DELETE FROM workload_notifications WHERE id = ?`, id)
	if err != nil {
		return fmt.Errorf("delete workload_notification: %w", err)
	}
	// Without a trustworthy count we cannot tell "deleted" from "missing",
	// so surface the failure instead of guessing.
	affected, err := res.RowsAffected()
	if err != nil {
		return fmt.Errorf("delete workload_notification: rows affected: %w", err)
	}
	if affected == 0 {
		return fmt.Errorf("workload_notification %s: %w", id, ErrNotFound)
	}
	return nil
}
// MatchesEventType reports whether this route should fire for eventType.
//
// A disabled route never matches. An enabled route with an empty
// EventTypes matches every event. Otherwise EventTypes is split on commas
// and each entry, with surrounding whitespace trimmed, is compared to
// eventType exactly (case-sensitive).
//
// It is exported so the notification dispatcher can filter inline without
// duplicating the comma-split parsing.
func (n WorkloadNotification) MatchesEventType(eventType string) bool {
	switch {
	case !n.Enabled:
		return false
	case n.EventTypes == "":
		return true
	}
	allowed := strings.Split(n.EventTypes, ",")
	for i := range allowed {
		if eventType == strings.TrimSpace(allowed[i]) {
			return true
		}
	}
	return false
}
@@ -0,0 +1,170 @@
package store
import (
"errors"
"testing"
)
// seedWorkloadForNotifications inserts a minimal project-kind workload
// (source kind "image") named name, so notification rows created by a
// test have a parent workload to satisfy the FK constraint on
// workload_notifications. The test is aborted if the insert fails;
// otherwise the new workload's ID is returned.
func seedWorkloadForNotifications(t *testing.T, s *Store, name string) string {
	t.Helper()

	seed := Workload{
		Kind:       string(WorkloadKindProject),
		Name:       name,
		SourceKind: "image",
	}
	created, err := s.CreateWorkload(seed)
	if err != nil {
		t.Fatalf("seed workload: %v", err)
	}
	return created.ID
}
// TestCreateWorkloadNotification_RoundTrip creates a notification, reads
// it back by id, and checks that the ID was assigned and that URL, Name,
// Enabled and EventTypes survived the round trip.
func TestCreateWorkloadNotification_RoundTrip(t *testing.T) {
	s := newTestStore(t)

	input := WorkloadNotification{
		WorkloadID: seedWorkloadForNotifications(t, s, "app1"),
		Name:       "Slack alerts",
		URL:        "https://hooks.slack.test/x",
		Secret:     "shh",
		EventTypes: "deploy_failure,build_failure",
		Enabled:    true,
	}
	created, err := s.CreateWorkloadNotification(input)
	if err != nil {
		t.Fatalf("CreateWorkloadNotification: %v", err)
	}
	if created.ID == "" {
		t.Fatal("expected ID to be assigned")
	}

	stored, err := s.GetWorkloadNotification(created.ID)
	if err != nil {
		t.Fatalf("Get: %v", err)
	}
	if !(stored.URL == "https://hooks.slack.test/x" && stored.Name == "Slack alerts") {
		t.Errorf("row mismatch: %+v", stored)
	}
	if !stored.Enabled {
		t.Error("expected Enabled=true")
	}
	if stored.EventTypes != "deploy_failure,build_failure" {
		t.Errorf("event_types = %q", stored.EventTypes)
	}
}
// TestCreateWorkloadNotification_RejectsMissingURL verifies that creating
// a notification with an empty URL returns an error.
func TestCreateWorkloadNotification_RejectsMissingURL(t *testing.T) {
	s := newTestStore(t)

	invalid := WorkloadNotification{
		WorkloadID: seedWorkloadForNotifications(t, s, "app1"),
		Name:       "broken",
		URL:        "",
	}
	if _, err := s.CreateWorkloadNotification(invalid); err == nil {
		t.Fatal("expected URL validation error")
	}
}
// TestListWorkloadNotifications_SortedByOrder inserts three rows with
// out-of-order SortOrder values and checks that
// ListWorkloadNotifications returns them sorted by SortOrder ascending.
func TestListWorkloadNotifications_SortedByOrder(t *testing.T) {
	s := newTestStore(t)
	wlID := seedWorkloadForNotifications(t, s, "app1")

	// Insert out of order; ListWorkloadNotifications should return
	// them sorted by SortOrder ascending.
	inserts := []WorkloadNotification{
		{WorkloadID: wlID, Name: "C", URL: "https://c.test", SortOrder: 30},
		{WorkloadID: wlID, Name: "A", URL: "https://a.test", SortOrder: 10},
		{WorkloadID: wlID, Name: "B", URL: "https://b.test", SortOrder: 20},
	}
	for _, in := range inserts {
		// Fail at the insert itself so a setup problem is not reported
		// later as a confusing length or ordering mismatch.
		if _, err := s.CreateWorkloadNotification(in); err != nil {
			t.Fatalf("create %q: %v", in.Name, err)
		}
	}

	rows, err := s.ListWorkloadNotifications(wlID)
	if err != nil {
		t.Fatalf("list: %v", err)
	}
	if len(rows) != 3 {
		t.Fatalf("len = %d, want 3", len(rows))
	}
	if rows[0].Name != "A" || rows[1].Name != "B" || rows[2].Name != "C" {
		t.Errorf("sort order wrong: %q %q %q", rows[0].Name, rows[1].Name, rows[2].Name)
	}
}
// TestUpdateWorkloadNotification_PersistsChanges creates a row, changes
// Name, URL, Enabled and EventTypes through UpdateWorkloadNotification,
// and checks that a fresh read reflects every one of those changes.
func TestUpdateWorkloadNotification_PersistsChanges(t *testing.T) {
	s := newTestStore(t)
	wlID := seedWorkloadForNotifications(t, s, "app1")

	n, err := s.CreateWorkloadNotification(WorkloadNotification{
		WorkloadID: wlID, Name: "old", URL: "https://old.test", Enabled: true,
	})
	if err != nil {
		t.Fatalf("create: %v", err)
	}

	n.Name = "new"
	n.URL = "https://new.test"
	n.Enabled = false
	n.EventTypes = "deploy_success"
	if err := s.UpdateWorkloadNotification(n); err != nil {
		t.Fatalf("update: %v", err)
	}

	got, err := s.GetWorkloadNotification(n.ID)
	if err != nil {
		t.Fatalf("get after update: %v", err)
	}
	if got.Name != "new" || got.URL != "https://new.test" || got.Enabled {
		t.Errorf("update did not persist: %+v", got)
	}
	// EventTypes is changed above, so it must be verified as well.
	if got.EventTypes != "deploy_success" {
		t.Errorf("event_types = %q, want %q", got.EventTypes, "deploy_success")
	}
}
// TestDeleteWorkloadNotification_ReturnsNotFoundForMissing verifies that
// deleting an id with no matching row yields an error wrapping
// ErrNotFound.
func TestDeleteWorkloadNotification_ReturnsNotFoundForMissing(t *testing.T) {
	s := newTestStore(t)

	if err := s.DeleteWorkloadNotification("nope"); !errors.Is(err, ErrNotFound) {
		t.Errorf("expected ErrNotFound, got %v", err)
	}
}
// TestDeleteWorkloadNotification_CascadesFromWorkload verifies that
// deleting a workload also removes the notification rows attached to it.
func TestDeleteWorkloadNotification_CascadesFromWorkload(t *testing.T) {
	s := newTestStore(t)
	wlID := seedWorkloadForNotifications(t, s, "app1")

	if _, err := s.CreateWorkloadNotification(WorkloadNotification{
		WorkloadID: wlID, Name: "x", URL: "https://x.test",
	}); err != nil {
		t.Fatalf("create: %v", err)
	}

	// Precondition: the row must exist before the workload is deleted,
	// otherwise the "no rows afterwards" assertion passes vacuously.
	before, err := s.ListWorkloadNotifications(wlID)
	if err != nil {
		t.Fatalf("list before delete: %v", err)
	}
	if len(before) != 1 {
		t.Fatalf("len before delete = %d, want 1", len(before))
	}

	if err := s.DeleteWorkload(wlID); err != nil {
		t.Fatalf("delete workload: %v", err)
	}

	rows, err := s.ListWorkloadNotifications(wlID)
	if err != nil {
		t.Fatalf("list after cascade: %v", err)
	}
	if len(rows) != 0 {
		t.Errorf("expected cascade delete to remove rows, got %d", len(rows))
	}
}
// TestMatchesEventType_AllowList exercises MatchesEventType on enabled
// rows: an empty allow-list, listed and unlisted event types, a
// single-entry list, and a list with padding around the entries.
func TestMatchesEventType_AllowList(t *testing.T) {
	table := []struct {
		allowList string
		event     string
		expected  bool
	}{
		{allowList: "", event: "deploy_success", expected: true}, // empty = all
		{allowList: "deploy_success,deploy_failure", event: "deploy_success", expected: true},
		{allowList: "deploy_success,deploy_failure", event: "build_failure", expected: false},
		{allowList: "build_failure", event: "build_failure", expected: true},
		{allowList: " deploy_success , build_failure ", event: "build_failure", expected: true}, // whitespace tolerated
	}

	for i := range table {
		tc := table[i]
		row := WorkloadNotification{Enabled: true, EventTypes: tc.allowList}
		if actual := row.MatchesEventType(tc.event); actual != tc.expected {
			t.Errorf("MatchesEventType(%q, %q) = %v, want %v", tc.allowList, tc.event, actual, tc.expected)
		}
	}
}
// TestMatchesEventType_DisabledNeverMatches checks that a disabled row
// does not match even when its allow-list is empty (which would match
// everything on an enabled row).
func TestMatchesEventType_DisabledNeverMatches(t *testing.T) {
	disabled := WorkloadNotification{Enabled: false, EventTypes: ""}

	if matched := disabled.MatchesEventType("any"); matched {
		t.Error("disabled row should never match")
	}
}
+20 -4
View File
@@ -173,11 +173,24 @@ func (s *Store) UpdateWorkload(w Workload) error {
return nil
}
// DeleteWorkload removes a workload row. Cascading deletes for the matching
// project/stack/site row stay with the kind-specific Delete functions; this
// only removes the workload entry.
// DeleteWorkload removes a workload row. Cascading deletes for FK-backed
// child tables (workload_env, workload_volumes, workload_trigger_bindings)
// happen via SQLite's ON DELETE CASCADE. The `containers` table doesn't
// yet have an FK to workloads (planned migration — see ops notes), so we
// drop its rows explicitly here in the same transaction to prevent zombie
// container rows from outliving their owning workload.
func (s *Store) DeleteWorkload(id string) error {
result, err := s.db.Exec(`DELETE FROM workloads WHERE id = ?`, id)
tx, err := s.db.Begin()
if err != nil {
return fmt.Errorf("begin: %w", err)
}
defer func() { _ = tx.Rollback() }()
// Explicit container cleanup until the FK migration lands.
if _, err := tx.Exec(`DELETE FROM containers WHERE workload_id = ?`, id); err != nil {
return fmt.Errorf("delete containers: %w", err)
}
result, err := tx.Exec(`DELETE FROM workloads WHERE id = ?`, id)
if err != nil {
return fmt.Errorf("delete workload: %w", err)
}
@@ -188,6 +201,9 @@ func (s *Store) DeleteWorkload(id string) error {
if n == 0 {
return fmt.Errorf("workload %s: %w", id, ErrNotFound)
}
if err := tx.Commit(); err != nil {
return fmt.Errorf("commit: %w", err)
}
return nil
}