feat(apps): stepped creation wizard, branch previews, and app-creation fixes

This session (frontend focus):
- Rebuild /apps/new as a 4-step wizard (Basics → Configure → Trigger → Review): WizardRail, SourceKindPicker card grid, AppManifest review, per-step validation, ConfirmDialog-based unsaved-changes guard.
- Extract lib/workload/sourceForms.ts (single source of truth for source_config) + {Image,Compose,Static,Dockerfile}SourceForm + StaticDiscoveryWizard; fold the /apps/[id] edit form onto the same components (removes the duplication). Add vitest + sourceForms unit tests.
- Branch preview environments UI: /chain is_preview/preview_branch + a Preview environments panel on /apps/[id] (per-branch URLs, ConfirmDialog teardown, armed state); RegistryImagePicker on the registry trigger and the image source.
- Fixes: image-inspect 404 -> admin-gated POST /api/discovery/image/inspect; conflict-panel blur flicker; friendly localized discovery errors; CPU/Memory label hints; dashboard + /apps "Total workloads" count only source_kind workloads (drop stale trigger_kind gate); NPM cert/access-list name cache; EntityPicker empty-list guard.
- Update CLAUDE.md frontend conventions + add a Build & Test section.

Also captures pre-existing in-progress platform work (not from this session): workload notifications, Prometheus metrics export, store lockfile, health probes, backup hardening, and related store/webhook/scheduler changes.
2026-05-29 02:09:54 +03:00
parent 956943edbb
commit 410a131cec
112 changed files with 13285 additions and 2765 deletions
@@ -16,13 +16,12 @@ import (
 )

 // rateLimitedLogin wraps the login handler with per-IP rate limiting.
+// Uses clientIP() so X-Forwarded-For is honored only when the request
+// arrives from a configured trusted-proxy CIDR — preventing remote
+// attackers from spoofing the header to bypass the per-IP login limiter.
 func (s *Server) rateLimitedLogin(rl *rateLimiter) http.HandlerFunc {
 	return func(w http.ResponseWriter, r *http.Request) {
-		ip := r.RemoteAddr
-		if fwd := r.Header.Get("X-Forwarded-For"); fwd != "" {
-			ip = fwd
-		}
-		if !rl.allow(ip) {
+		if !rl.allow(clientIP(r)) {
 			respondError(w, http.StatusTooManyRequests, "too many login attempts, try again later")
 			return
 		}
@@ -1,7 +1,6 @@
 package api

 import (
-	"io"
 	"log/slog"
 	"net/http"
 	"os"
@@ -118,7 +117,22 @@ func (s *Server) deleteBackup(w http.ResponseWriter, r *http.Request) {
 }

 // restoreBackup handles POST /api/backups/{id}/restore.
-// This replaces the current database with the backup and triggers a graceful shutdown.
+//
+// Restore happens in three documented stages so a failure at any stage
+// leaves the live DB intact:
+//
+//  1. PRE-FLIGHT (sync, before the HTTP response): PrepareRestore opens
+//     the candidate read-only and runs `PRAGMA integrity_check`. If it
+//     fails the live DB is untouched and we return 400 with the reason.
+//
+//  2. SAFETY NET: a pre-restore backup of the LIVE DB is created so the
+//     operator can roll back even if the candidate is later discovered
+//     to be missing data.
+//
+//  3. SWAP (async, after the response is flushed): close the live DB,
+//     atomic-rename the candidate over the live path, wipe WAL/SHM,
+//     trigger graceful shutdown. supervisord / systemd / docker
+//     restart=on-failure brings the process back with the new DB.
 func (s *Server) restoreBackup(w http.ResponseWriter, r *http.Request) {
 	if s.backupEngine == nil {
 		respondError(w, http.StatusServiceUnavailable, "backup engine not initialized")
@@ -126,13 +140,44 @@ func (s *Server) restoreBackup(w http.ResponseWriter, r *http.Request) {
 	}

 	id := chi.URLParam(r, "id")
-	restorePath, err := s.backupEngine.RestorePath(id)
-	if err != nil {
-		respondError(w, http.StatusNotFound, "backup not found: "+err.Error())
+
+	// CSRF / accidental-fire guard: the restore endpoint is the most
+	// destructive surface in the API (replaces the whole DB). Even
+	// though it sits behind AdminOnly + Bearer JWT, a blind cross-site
+	// POST or a misclicked button in any open admin tab can fire it.
+	// Require the operator's client to echo X-Confirm-Restore: <id>
+	// — matching the path param — so a CSRF post-form / image-src
+	// trick can't trigger restore (browsers don't let cross-origin
+	// requests set custom headers without a preflight).
+	if confirm := r.Header.Get("X-Confirm-Restore"); confirm != id {
+		respondError(w, http.StatusBadRequest,
+			"missing or mismatched X-Confirm-Restore header (must equal backup id)")
 		return
 	}

-	// Create a safety backup before restore so the user can undo if needed.
+	// Single-flight guard: a rapid double-click would otherwise spawn
+	// two goroutines racing s.store.Close() and the candidate-over-
+	// live rename. CAS to true here; if someone else won, return 409.
+	if !s.restoreInFlight.CompareAndSwap(false, true) {
+		respondError(w, http.StatusConflict, "a restore is already in progress")
+		return
+	}
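+	// Do NOT release the flag once the swap has been scheduled — the
+	// restore path triggers shutdown, and a failed swap is also terminal
+	// (the DB may be closed); a fresh process boot is the recovery path.
+	// NOTE(review): the flag also stays set when the PRE-FLIGHT check
+	// below returns 400, even though the live DB is untouched at that
+	// point — every later restore attempt then gets 409 until the
+	// process restarts. Consider s.restoreInFlight.Store(false) on that
+	// error path.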
+	// PRE-FLIGHT: refuse before touching anything if the candidate is
+	// not a valid SQLite database or fails integrity_check. This is the
+	// guard the prior code lacked — a corrupt backup would silently
+	// overwrite a healthy live DB.
+	restorePath, err := s.backupEngine.PrepareRestore(id)
+	if err != nil {
+		respondError(w, http.StatusBadRequest, err.Error())
+		return
+	}
+
+	// SAFETY NET: pre-restore snapshot of the live DB. A failure here
+	// is logged but does not abort — the integrity-checked candidate
+	// is still safer than refusing to restore.
 	if _, err := s.backupEngine.CreateBackup("pre-restore"); err != nil {
 		slog.Warn("failed to create pre-restore backup", "error", err)
 	}
@@ -153,41 +198,37 @@ func (s *Server) restoreBackup(w http.ResponseWriter, r *http.Request) {
 	go func() {
 		time.Sleep(500 * time.Millisecond)

-		// Close the current database to release locks.
+		// Once we begin closing the live DB the process can no longer serve
+		// requests against a sane store, so EVERY exit path from here must
+		// trigger shutdown. Returning early would leave the server limping
+		// on a closed/half-swapped database with no path to recovery except
+		// an external kill. shutdownFunc → graceful shutdown → main returns
+		// → deferred releaseLock()/db.Close() run, and the supervisor reopens
+		// whatever DB is on disk on the next boot.
+		triggerShutdown := func() {
+			if s.shutdownFunc != nil {
+				s.shutdownFunc()
+			}
+		}
+
+		// Close the current database to release locks. AtomicReplaceDB
+		// expects the live file to be unmapped before swap (especially
+		// important on Windows where open files cannot be renamed over).
 		if err := s.store.Close(); err != nil {
-			slog.Error("restore: failed to close database", "error", err)
+			slog.Error("restore: failed to close database, restarting", "error", err)
+			triggerShutdown()
 			return
 		}

-		// Copy the backup file over the main database using streaming (no full read into memory).
-		src, err := os.Open(restorePath)
-		if err != nil {
-			slog.Error("restore: failed to open backup file", "error", err)
+		if err := s.backupEngine.AtomicReplaceDB(restorePath, s.dbPath); err != nil {
+			slog.Error("restore: atomic replace failed, restarting", "error", err)
+			triggerShutdown()
 			return
 		}
-		defer src.Close()
-
-		dst, err := os.Create(s.dbPath)
-		if err != nil {
-			slog.Error("restore: failed to create database file", "error", err)
-			return
-		}
-		defer dst.Close()
-
-		if _, err := io.Copy(dst, src); err != nil {
-			slog.Error("restore: failed to copy backup to database", "error", err)
-			return
-		}
-
-		// Remove WAL and SHM files to ensure clean state.
-		os.Remove(s.dbPath + "-wal")
-		os.Remove(s.dbPath + "-shm")

 		slog.Info("restore: database replaced, triggering shutdown")

 		// Signal the server to shut down gracefully so it can be restarted.
-		if s.shutdownFunc != nil {
-			s.shutdownFunc()
-		}
+		triggerShutdown()
 	}()
 }
@@ -9,6 +9,7 @@ import (
 	"strings"
 	"time"

+	"github.com/alexei/tinyforge/internal/docker"
 	"github.com/alexei/tinyforge/internal/staticsite"
 )

@@ -350,6 +351,54 @@ func (s *Server) listImageConflicts(w http.ResponseWriter, r *http.Request) {
 	respondJSON(w, http.StatusOK, conflicts)
 }

+// inspectImageRequest is the body for POST /api/discovery/image/inspect.
+// Image is the image reference to inspect; the handler trims surrounding
+// whitespace and rejects an empty value with 400.
+type inspectImageRequest struct {
+	Image string `json:"image"`
+}
+
+// inspectImageResponse mirrors the frontend InspectResult shape the
+// new-app wizard pre-fills from: the first exposed port (parsed to int,
+// 0 when none) and the image's HEALTHCHECK command string.
+//
+// Port is produced by docker.ExtractPort from the inspect result's
+// exposed ports; Healthcheck is copied from the inspect result as-is.
+type inspectImageResponse struct {
+	Port        int    `json:"port"`
+	Healthcheck string `json:"healthcheck"`
+}
+
+// inspectImageMetadata handles POST /api/discovery/image/inspect: it
+// inspects an image through the docker client and reports the first
+// exposed port and the healthcheck so the wizard can pre-fill them.
+//
+// Only the inspect call is made — nothing here pulls the image. Any
+// inspect failure (for example an image that is not present locally) is
+// logged server-side and answered with a fixed, generic 400 so the raw
+// docker error text (which may echo the reference) never reaches the
+// client.
+func (s *Server) inspectImageMetadata(w http.ResponseWriter, r *http.Request) {
+	var body inspectImageRequest
+	if ok := decodeJSON(w, r, &body); !ok {
+		// Nothing further is written here when decoding fails.
+		return
+	}
+
+	ref := strings.TrimSpace(body.Image)
+	if ref == "" {
+		respondError(w, http.StatusBadRequest, "image is required")
+		return
+	}
+
+	// Bound the docker call with the shared discovery timeout.
+	inspectCtx, stop := context.WithTimeout(r.Context(), discoveryTimeout)
+	defer stop()
+
+	meta, inspectErr := s.docker.InspectImage(inspectCtx, ref)
+	if inspectErr != nil {
+		slog.Warn("inspect image metadata failed", "error", inspectErr)
+		respondError(w, http.StatusBadRequest, "could not inspect image — make sure it is pulled locally and the reference is correct")
+		return
+	}
+
+	result := inspectImageResponse{
+		Port:        docker.ExtractPort(meta.ExposedPorts),
+		Healthcheck: meta.Healthcheck,
+	}
+	respondJSON(w, http.StatusOK, result)
+}
+
 // stripImageTag returns the image reference with the trailing :tag
 // removed, taking care to leave a registry port (e.g. registry:5000/foo)
 // intact. Digest references (image@sha256:...) are returned unchanged.
@@ -0,0 +1,64 @@
+package api
+
+import (
+	"context"
+	"log/slog"
+	"net/http"
+	"time"
+
+	"github.com/alexei/tinyforge/internal/metrics"
+)
+
+// livez is the "is the binary alive" probe for container orchestrators,
+// load balancers and Docker HEALTHCHECK. It answers with a plain-text
+// "ok" and the implicit 200 status as long as the process can serve the
+// request at all. It deliberately touches neither the DB nor Docker so
+// that a slow dependency cannot cause restart loops.
+func (s *Server) livez(w http.ResponseWriter, _ *http.Request) {
+	hdr := w.Header()
+	hdr.Set("Content-Type", "text/plain; charset=utf-8")
+
+	// A write error means the prober went away; there is nothing to do.
+	payload := []byte("ok\n")
+	_, _ = w.Write(payload)
+}
+
+// readyz returns 200 only when the process can actually serve traffic.
+// Two probes are run, in order:
+//
+//  1. the SQLite handle answers a ping within 2 seconds, and
+//  2. the encryption key is loaded (not all zero bytes).
+//
+// The response body is intentionally minimal — the specific failing
+// probe is recorded in slog (operator-visible) rather than returned to
+// unauthenticated callers. This avoids handing reconnaissance to an
+// attacker who can hit /readyz during an outage ("DB down" vs
+// "encryption key missing" leaks operational state).
+func (s *Server) readyz(w http.ResponseWriter, r *http.Request) {
+	ctx, cancel := context.WithTimeout(r.Context(), 2*time.Second)
+	defer cancel()
+
+	// DB ping: a cheap end-to-end check that the store is reachable.
+	if err := s.store.DB().PingContext(ctx); err != nil {
+		slog.Warn("readyz: db ping failed", "error", err)
+		writeReadyzStatus(w, http.StatusServiceUnavailable, "not ready\n")
+		return
+	}
+
+	// Encryption key sanity: if it's zero we cannot decrypt any stored
+	// secret, so every path that needs one would fail at first use.
+	if s.encKey == ([32]byte{}) {
+		slog.Warn("readyz: encryption key not loaded")
+		writeReadyzStatus(w, http.StatusServiceUnavailable, "not ready\n")
+		return
+	}
+
+	writeReadyzStatus(w, http.StatusOK, "ready\n")
+}
+
+// writeReadyzStatus writes a plain-text probe response with the given
+// status code and body. Write errors are ignored: the prober has gone
+// away and there is nobody left to tell.
+func writeReadyzStatus(w http.ResponseWriter, status int, body string) {
+	w.Header().Set("Content-Type", "text/plain; charset=utf-8")
+	w.WriteHeader(status)
+	_, _ = w.Write([]byte(body))
+}
+
+// metricsExport writes the process-wide metrics registry in Prometheus
+// text format. Admin-only by router placement; surface is intentionally
+// thin (no histograms / quantiles, only counters) to keep the binary
+// dependency-free.
+//
+// The Content-Type advertises version 0.0.4 of the text exposition
+// format. Whatever WritePrometheus returns is discarded.
+// NOTE(review): presumably that is a write error caused by the scraper
+// disconnecting — confirm, and consider logging it at debug level.
+func (s *Server) metricsExport(w http.ResponseWriter, _ *http.Request) {
+	w.Header().Set("Content-Type", "text/plain; version=0.0.4; charset=utf-8")
+	_ = metrics.DefaultRegistry.WritePrometheus(w)
+}
@@ -1,14 +1,119 @@
 package api

 import (
+	"context"
+	"crypto/rand"
+	"encoding/hex"
 	"log/slog"
+	"net"
 	"net/http"
+	"net/url"
+	"os"
 	"runtime/debug"
 	"strings"
 	"sync"
 	"time"
+
+	"github.com/alexei/tinyforge/internal/metrics"
 )

+// requestIDKeyType is the unexported type of the context key under which
+// the generated/forwarded X-Request-ID is stored. A dedicated struct
+// type (rather than a string) means code outside this package cannot
+// construct an equal key and collide with or overwrite the value.
+type requestIDKeyType struct{}
+
+// requestIDKey is the key itself. It is exposed only indirectly, via
+// RequestIDFromContext, so handlers and services downstream of the API
+// layer can thread the ID into their own slog calls without
+// re-extracting it from headers.
+var requestIDKey = requestIDKeyType{}
+
+// RequestIDFromContext returns the correlation ID stored on ctx by the
+// requestID middleware. It returns "" when ctx carries no ID, i.e. when
+// called outside the API request path.
+func RequestIDFromContext(ctx context.Context) string {
+	// The comma-ok form yields the zero string when the value is absent
+	// or not a string, which is exactly the documented fallback.
+	rid, _ := ctx.Value(requestIDKey).(string)
+	return rid
+}
+
+// requestID is middleware that gives every request a stable correlation
+// ID. A caller-supplied X-Request-ID is kept only when all of these hold:
+// it is non-empty, the direct peer is a trusted proxy, and the value
+// passes isValidRequestID. In every other case a fresh ID is generated.
+// The chosen ID is echoed back on the X-Request-ID response header and
+// stored on the request context, where RequestIDFromContext (and through
+// it the `logging` middleware) can read it.
+//
+// Why the format clamp: a compromised reverse proxy (or one that
+// mis-parses an untrusted header) could forward an ID containing
+// newlines, semicolons, or other separators that corrupt structured log
+// parsers assuming one record per line / key-value. The
+// `[A-Za-z0-9._-]{1,64}` set still covers UUIDs, hex IDs and
+// trace-context IDs.
+func requestID(next http.Handler) http.Handler {
+	handle := func(w http.ResponseWriter, r *http.Request) {
+		rid := r.Header.Get("X-Request-ID")
+
+		// Evaluated left to right, so the cheaper checks run first.
+		keepForwarded := rid != "" && isTrustedPeer(r) && isValidRequestID(rid)
+		if !keepForwarded {
+			rid = newRequestID()
+		}
+
+		w.Header().Set("X-Request-ID", rid)
+		withID := r.WithContext(context.WithValue(r.Context(), requestIDKey, rid))
+		next.ServeHTTP(w, withID)
+	}
+	return http.HandlerFunc(handle)
+}
+
+// isValidRequestID reports whether s matches `[A-Za-z0-9._-]{1,64}`.
+// It is a single byte-wise scan rather than a regex so nothing has to be
+// compiled or matched on the request path. Any byte outside the ASCII
+// allow-list — including every byte of a multi-byte UTF-8 sequence —
+// makes the ID invalid.
+func isValidRequestID(s string) bool {
+	if n := len(s); n < 1 || n > 64 {
+		return false
+	}
+	for _, c := range []byte(s) {
+		isLetter := (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')
+		isDigit := c >= '0' && c <= '9'
+		isPunct := c == '.' || c == '_' || c == '-'
+		if !isLetter && !isDigit && !isPunct {
+			return false
+		}
+	}
+	return true
+}
+
+// isTrustedPeer reports whether the request's direct peer (r.RemoteAddr)
+// falls inside the TRUSTED_PROXY_CIDRS allow-list. A forwarded
+// request-id is honored only from upstreams already trusted for
+// X-Forwarded-For; otherwise an internet client could spam log files
+// with attacker-chosen IDs.
+//
+// It returns false when no trusted CIDRs are configured or when the peer
+// address cannot be parsed as an IP. The CIDR match itself is delegated
+// to isTrustedProxy so this function and clientIP share one definition
+// of "trusted".
+func isTrustedPeer(r *http.Request) bool {
+	if len(trustedProxyCIDRs) == 0 {
+		return false
+	}
+	peer := r.RemoteAddr
+	// RemoteAddr is normally "host:port"; fall back to the raw value
+	// when it carries no port.
+	if host, _, err := net.SplitHostPort(peer); err == nil {
+		peer = host
+	}
+	ip := net.ParseIP(peer)
+	return ip != nil && isTrustedProxy(ip)
+}
+
+// newRequestID returns a fresh correlation ID: 16 random bytes from
+// crypto/rand rendered as 32 lowercase hex characters.
+//
+// If crypto/rand fails — extremely unlikely outside of broken
+// environments — it falls back to a "ts-"-prefixed UTC timestamp with
+// nanosecond precision. The ID is for tracing, not security, so a
+// predictable fallback is preferable to a panic.
+func newRequestID() string {
+	buf := make([]byte, 16)
+	if _, err := rand.Read(buf); err == nil {
+		return hex.EncodeToString(buf)
+	}
+	stamp := time.Now().UTC().Format("20060102T150405.000000000")
+	return "ts-" + stamp
+}
+
 // logging is an HTTP middleware that logs every request with method, path,
 // status code, and duration. Webhook URLs are redacted before being logged
 // because the secret is the only authenticator — leaking it to log
@@ -20,15 +125,58 @@ func logging(next http.Handler) http.Handler {

 		next.ServeHTTP(wrapped, r)

-		slog.Info("http request",
+		fields := []any{
 			"method", r.Method,
 			"path", redactPath(r.URL.Path),
 			"status", wrapped.status,
 			"duration", time.Since(start).String(),
-		)
+		}
+		if rq := redactQuery(r.URL.RawQuery); rq != "" {
+			fields = append(fields, "query", rq)
+		}
+		if rid := RequestIDFromContext(r.Context()); rid != "" {
+			fields = append(fields, "request_id", rid)
+		}
+		slog.Info("http request", fields...)
+
+		// Lightweight per-request counter. Bucket by status class so
+		// the cardinality stays at 5 × #methods regardless of how many
+		// distinct response codes we emit.
+		metrics.HTTPRequestsTotal.Inc(bucketMethod(r.Method), statusClass(wrapped.status))
 	})
 }

+// bucketMethod maps an HTTP method onto a bounded label set for metrics.
+// The nine standard methods are returned unchanged; any other token
+// collapses to "other". RFC 7230 allows arbitrary method tokens, so
+// without this a malicious client could inflate the metrics map with
+// made-up methods. Matching is exact and therefore case-sensitive.
+func bucketMethod(m string) string {
+	switch m {
+	case http.MethodGet,
+		http.MethodPost,
+		http.MethodPut,
+		http.MethodPatch,
+		http.MethodDelete,
+		http.MethodHead,
+		http.MethodOptions,
+		http.MethodConnect,
+		http.MethodTrace:
+		return m
+	default:
+		return "other"
+	}
+}
+
+// statusClass buckets an HTTP status code by its hundreds digit:
+// 100-199 → "1xx" … 500-599 → "5xx". Anything outside 100-599
+// (including zero and negative values) is "other". Bucketing keeps
+// metrics cardinality bounded so a chatty endpoint can't create one
+// series per distinct response code.
+func statusClass(code int) string {
+	// Integer division truncates toward zero, so 0-99 and negative
+	// codes never land on 1..5.
+	switch code / 100 {
+	case 1:
+		return "1xx"
+	case 2:
+		return "2xx"
+	case 3:
+		return "3xx"
+	case 4:
+		return "4xx"
+	case 5:
+		return "5xx"
+	default:
+		return "other"
+	}
+}
+
 // redactPath strips secrets from URL paths that carry them in segments.
 // Only the canonical /api/webhook/triggers/{secret} surface remains after
 // the hard cutover.
@@ -40,6 +188,45 @@ func redactPath(path string) string {
 	return path
 }

+// redactQueryKeys is the case-insensitive set of query-parameter names whose
+// values are masked before a URL lands in the request log. `token` is used by
+// SSE/EventSource when a custom header can't be set; the rest are
+// defence-in-depth against sensitive values ever appearing in a query string.
+// Keys are stored lower-case; lookups must lower-case the candidate first.
+var redactQueryKeys = map[string]struct{}{
+	"token":         {},
+	"secret":        {},
+	"password":      {},
+	"passwd":        {},
+	"api_key":       {},
+	"apikey":        {},
+	"access_token":  {},
+	"client_secret": {},
+	"sig":           {},
+	"signature":     {},
+}
+
+// redactQuery masks the values of sensitive query parameters (see
+// redactQueryKeys) in a URL's raw query before it lands in the request log.
+//
+// The query is split on "&" and each "key=value" pair is inspected; pairs
+// without "=" are left alone. The key is percent-decoded and lower-cased
+// before the lookup, so `TOKEN=…` and `%74oken=…` (which the server decodes
+// to `token`) are both masked. The key is written back exactly as received
+// and only the value is replaced with "***". Input with nothing to redact —
+// including keys whose escapes are malformed — is returned unchanged so a
+// malformed URL surfaces naturally.
+func redactQuery(rawQuery string) string {
+	if rawQuery == "" {
+		return ""
+	}
+	parts := strings.Split(rawQuery, "&")
+	for i, p := range parts {
+		rawKey, _, found := strings.Cut(p, "=")
+		if !found {
+			continue
+		}
+		// Match on the decoded name: that is what handlers see after
+		// URL parsing. Fall back to the raw key on a bad escape.
+		key := rawKey
+		if decoded, err := url.QueryUnescape(rawKey); err == nil {
+			key = decoded
+		}
+		if _, ok := redactQueryKeys[strings.ToLower(key)]; ok {
+			parts[i] = rawKey + "=***"
+		}
+	}
+	return strings.Join(parts, "&")
+}
+
 // recovery is an HTTP middleware that catches panics and returns a 500 response.
 func recovery(next http.Handler) http.Handler {
 	return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
@@ -54,16 +241,49 @@ func recovery(next http.Handler) http.Handler {
 }

 // securityHeaders sets standard security headers on all responses.
+//
+// Strict-Transport-Security is emitted only when the request arrived
+// over HTTPS (direct TLS or forwarded). Emitting HSTS over plain HTTP
+// is harmless to compliant browsers but flags as an issue in scanners
+// and confuses some reverse proxies.
+//
+// The CSP keeps `'unsafe-inline'` for now because SvelteKit injects
+// inline boot scripts and styles; removing it requires a nonce-based
+// strategy threaded through the SvelteKit handle hook. Tracked as a
+// follow-up; documented in the security report.
 func securityHeaders(next http.Handler) http.Handler {
 	return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
 		w.Header().Set("X-Content-Type-Options", "nosniff")
 		w.Header().Set("X-Frame-Options", "DENY")
 		w.Header().Set("Referrer-Policy", "strict-origin-when-cross-origin")
-		w.Header().Set("Content-Security-Policy", "default-src 'self'; script-src 'self' 'unsafe-inline'; style-src 'self' 'unsafe-inline'; img-src 'self' data:; connect-src 'self'; font-src 'self'")
+		w.Header().Set("Permissions-Policy", "camera=(), microphone=(), geolocation=(), payment=()")
+		w.Header().Set("Content-Security-Policy",
+			"default-src 'self'; "+
+				"script-src 'self' 'unsafe-inline'; "+
+				"style-src 'self' 'unsafe-inline'; "+
+				"img-src 'self' data:; "+
+				"connect-src 'self'; "+
+				"font-src 'self'; "+
+				"frame-ancestors 'none'; "+
+				"base-uri 'self'; "+
+				"form-action 'self'")
+		if isHTTPS(r) {
+			w.Header().Set("Strict-Transport-Security", "max-age=31536000; includeSubDomains")
+		}
 		next.ServeHTTP(w, r)
 	})
 }

+// isHTTPS reports whether the request arrived over HTTPS: either the
+// connection itself is TLS (r.TLS is set) or the X-Forwarded-Proto
+// header is exactly "https". The header comparison is case-sensitive
+// and is not restricted to trusted proxies.
+func isHTTPS(r *http.Request) bool {
+	directTLS := r.TLS != nil
+	return directTLS || r.Header.Get("X-Forwarded-Proto") == "https"
+}
+
 // cors is an HTTP middleware that handles CORS for same-origin requests.
 // The frontend is served from the same origin, so cross-origin requests are not expected.
 func cors(next http.Handler) http.Handler {
@@ -164,10 +384,7 @@ func jsonContentType(next http.Handler) http.Handler {
 func rateLimitMiddleware(rl *rateLimiter) func(http.Handler) http.Handler {
 	return func(next http.Handler) http.Handler {
 		return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
-			ip := r.RemoteAddr
-			if fwd := r.Header.Get("X-Forwarded-For"); fwd != "" {
-				ip = fwd
-			}
+			ip := clientIP(r)
 			if !rl.allow(ip) {
 				respondError(w, http.StatusTooManyRequests, "rate limit exceeded")
 				return
@@ -177,6 +394,100 @@ func rateLimitMiddleware(rl *rateLimiter) func(http.Handler) http.Handler {
 	}
 }

+// trustedProxyCIDRs is the parsed allow-list of upstream proxy networks
+// whose X-Forwarded-For header we honor. Set TRUSTED_PROXY_CIDRS to a
+// comma-separated list of CIDRs (e.g. "127.0.0.1/32,10.0.0.0/8") to
+// enable. When unset (the default) X-Forwarded-For is ignored entirely
+// and rate limiting + audit logging use r.RemoteAddr — preventing a
+// remote attacker from spoofing the header to bypass per-IP limiters.
+//
+// The environment variable is read once, when this package-level
+// variable is initialised; later changes to the environment are not
+// picked up.
+var trustedProxyCIDRs = parseTrustedProxyCIDRs(os.Getenv("TRUSTED_PROXY_CIDRS"))
+
+// parseTrustedProxyCIDRs parses a comma-separated list of CIDRs and/or
+// bare IP addresses into networks.
+//
+// Entries are whitespace-trimmed and empty entries are skipped. A bare
+// IP becomes a single-host network: /32 for IPv4 (including the
+// IPv4-mapped IPv6 spelling "::ffff:a.b.c.d"), /128 for IPv6. Invalid
+// entries are logged and ignored rather than failing startup. It
+// returns nil for empty input or when no entry parses.
+func parseTrustedProxyCIDRs(raw string) []*net.IPNet {
+	raw = strings.TrimSpace(raw)
+	if raw == "" {
+		return nil
+	}
+	var nets []*net.IPNet
+	for _, p := range strings.Split(raw, ",") {
+		p = strings.TrimSpace(p)
+		if p == "" {
+			continue
+		}
+		if !strings.Contains(p, "/") {
+			if ip := net.ParseIP(p); ip != nil {
+				// Build the host network directly instead of appending a
+				// suffix and re-parsing: "::ffff:10.0.0.1" + "/32" would be
+				// read as the IPv6 network ::/32, which matches ::1 and
+				// does NOT match 10.0.0.1.
+				if ip4 := ip.To4(); ip4 != nil {
+					nets = append(nets, &net.IPNet{IP: ip4, Mask: net.CIDRMask(32, 32)})
+				} else {
+					nets = append(nets, &net.IPNet{IP: ip, Mask: net.CIDRMask(128, 128)})
+				}
+				continue
+			}
+		}
+		_, n, err := net.ParseCIDR(p)
+		if err != nil {
+			slog.Warn("ignoring invalid TRUSTED_PROXY_CIDRS entry", "value", p, "error", err)
+			continue
+		}
+		nets = append(nets, n)
+	}
+	return nets
+}
+
+// clientIP returns the per-request "client" address used for rate-limit
+// keying and audit attribution. X-Forwarded-For is honored ONLY when the
+// direct peer (r.RemoteAddr) belongs to a configured trusted-proxy CIDR;
+// otherwise the header is ignored to prevent header-spoofing bypasses.
+//
+// The result is the port-stripped RemoteAddr when no trusted proxies are
+// configured, the peer is untrusted or unparseable, the header is
+// absent, or every forwarded entry is itself a trusted proxy or
+// unparseable. Otherwise it is the rightmost forwarded entry that is not
+// a trusted proxy.
+func clientIP(r *http.Request) string {
+	peer := r.RemoteAddr
+	if host, _, err := net.SplitHostPort(peer); err == nil {
+		peer = host
+	}
+	if len(trustedProxyCIDRs) == 0 {
+		return peer
+	}
+	peerIP := net.ParseIP(peer)
+	if peerIP == nil || !isTrustedProxy(peerIP) {
+		return peer
+	}
+	// A request may carry SEVERAL X-Forwarded-For field lines, which are
+	// equivalent to one comma-joined list in arrival order. Header.Get
+	// returns only the first line; a proxy that appends its own line
+	// instead of extending the client's would leave the client-supplied
+	// line first and make the spoofed value the only one we look at.
+	// Join every line so the proxy's entry is always the rightmost.
+	fwd := strings.Join(r.Header.Values("X-Forwarded-For"), ",")
+	if fwd == "" {
+		return peer
+	}
+	// Walk X-Forwarded-For from the RIGHTMOST entry (the address closest to
+	// us, appended by our trusted peer) leftward, skipping entries that are
+	// themselves trusted proxies, and return the first untrusted address.
+	// The LEFTMOST entry is fully client-controlled — trusting it (as a
+	// naive `fwd[:firstComma]` does) lets an attacker spoof their rate-limit
+	// and audit identity by prepending a forged value, defeating the per-IP
+	// login limiter.
+	parts := strings.Split(fwd, ",")
+	for i := len(parts) - 1; i >= 0; i-- {
+		candidate := strings.TrimSpace(parts[i])
+		ip := net.ParseIP(candidate)
+		if ip == nil {
+			continue
+		}
+		if isTrustedProxy(ip) {
+			continue
+		}
+		return candidate
+	}
+	// Every forwarded entry was a trusted proxy (or unparseable) — fall back
+	// to the direct peer.
+	return peer
+}
+
+// isTrustedProxy reports whether ip is contained in at least one of the
+// configured trusted-proxy networks. With no networks configured it is
+// always false.
+func isTrustedProxy(ip net.IP) bool {
+	for i := range trustedProxyCIDRs {
+		if trustedProxyCIDRs[i].Contains(ip) {
+			return true
+		}
+	}
+	return false
+}
+
 // statusRecorder wraps http.ResponseWriter to capture the status code.
 type statusRecorder struct {
 	http.ResponseWriter
@@ -4,6 +4,7 @@ import (
 	"context"
 	"log/slog"
 	"sync"
+	"sync/atomic"

 	"github.com/go-chi/chi/v5"

@@ -61,6 +62,13 @@ type Server struct {
 	shutdownFunc            func()                                // called after restore to trigger graceful shutdown
 	onBackupSettingsChanged func(enabled bool, intervalHours int) // called when backup settings change
 	onProxyProviderChanged  func(provider proxy.Provider)         // called when proxy provider changes
+
+	// restoreInFlight is a process-wide guard against double-firing
+	// the restore endpoint. A rapid double-click would otherwise
+	// schedule two goroutines racing s.store.Close() and the
+	// candidate-over-live rename. CAS to true at the entry point;
+	// reject the second caller with 409 Conflict.
+	restoreInFlight atomic.Bool
 }

 // NewServer creates a new API Server with all required dependencies.
@@ -157,13 +165,32 @@ func (s *Server) SetDNSProviderChangedCallback(fn DNSProviderChangedFunc) {

 // initOIDCProvider creates an OIDC provider from settings. Errors are logged, not fatal.
 func (s *Server) initOIDCProvider(ctx context.Context, as store.AuthSettings) {
-	// Decrypt the OIDC client secret if it's encrypted.
+	// Decrypt the OIDC client secret. The prior code did a try-decrypt
+	// and silently treated failures as plaintext — under a rotated key
+	// that sent ciphertext upstream to the OP. Now:
+	//   - If the value carries the tf1: envelope → fail loud on
+	//     decrypt failure (rotated key / corrupted ciphertext).
+	//   - If the value is unprefixed (legacy ciphertext from v0 or true
+	//     plaintext from an old migration) → try decrypt; on failure
+	//     accept as plaintext (the only safe legacy interpretation).
 	clientSecret := as.OIDCClientSecret
 	if clientSecret != "" {
-		if decrypted, err := crypto.Decrypt(s.encKey, clientSecret); err == nil {
+		switch {
+		case crypto.HasEnvelope(clientSecret):
+			decrypted, err := crypto.Decrypt(s.encKey, clientSecret)
+			if err != nil {
+				slog.Error("OIDC client secret could not be decrypted — refusing to initialize provider",
+					"error", err,
+					"hint", "rotate ENCRYPTION_KEY back, OR re-save OIDC settings to re-encrypt with the current key")
+				return
+			}
 			clientSecret = decrypted
+		default:
+			// Legacy v0 value: try decrypt; on failure assume plaintext.
+			if decrypted, err := crypto.Decrypt(s.encKey, clientSecret); err == nil {
+				clientSecret = decrypted
+			}
 		}
-		// If decrypt fails, assume it's already plaintext (migration scenario).
 	}
 	provider, err := auth.NewOIDCProvider(ctx, auth.OIDCConfig{
 		IssuerURL:    as.OIDCIssuerURL,
@@ -183,12 +210,29 @@ func (s *Server) initOIDCProvider(ctx context.Context, as store.AuthSettings) {
 func (s *Server) Router() chi.Router {
 	r := chi.NewRouter()

-	// Global middleware.
+	// Global middleware. requestID runs first so every downstream log
+	// line (and the access log emitted by `logging`) carries the same
+	// correlation id, plus the response carries it back on the
+	// X-Request-ID header for the operator to grep across services.
+	r.Use(requestID)
 	r.Use(recovery)
 	r.Use(securityHeaders)
 	r.Use(logging)
 	r.Use(cors)

+	// Unauthenticated health probes — mounted at the root so container
+	// orchestrators / load balancers can hit them without knowing about
+	// the /api prefix. /livez intentionally does no work and stays
+	// unbounded; /readyz pings the DB and is rate-limited to keep an
+	// unauthenticated flood from serialising behind SQLite's single
+	// writer connection (busy-timeout = 5s) and log-amplifying every
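+	// request via the structured access log. The 10-per-minute budget
+	// is the existing rateLimiter default — enough for a readiness probe
+	// every 10s (6/min), restrictive for an attacker.
+	// NOTE(review): a 5s probe interval is 12 requests/min and would
+	// exceed this budget (429 ⇒ failed probe); confirm the limiter
+	// window or give /readyz a larger budget.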
+	r.Get("/livez", s.livez)
+	readyLimiter := newRateLimiter()
+	r.With(rateLimitMiddleware(readyLimiter)).Get("/readyz", s.readyz)
+
 	loginLimiter := newRateLimiter()
 	webhookLimiter := newRateLimiter()

@@ -232,6 +276,7 @@ func (s *Server) Router() chi.Router {
 				r.Post("/discovery/git/branches", s.listGitBranches)
 				r.Post("/discovery/git/tree", s.listGitTree)
 				r.Get("/discovery/image/conflicts", s.listImageConflicts)
+				r.Post("/discovery/image/inspect", s.inspectImageMetadata)
 			})

 			// Read-only endpoints (any authenticated user).
@@ -245,16 +290,18 @@ func (s *Server) Router() chi.Router {
 			r.Get("/events/log/stats", s.getEventLogStats)
 			r.Get("/registries", s.listRegistries)
 			r.Route("/registries/{id}", func(r chi.Router) {
+				// All registry probes are admin-gated. The /tags and
+				// /images endpoints used to be open to any authenticated
+				// user, but they make outbound requests using the
+				// admin-encrypted registry token — a viewer could
+				// effectively drive arbitrary requests against a private
+				// registry under admin credentials.
+				r.Use(auth.AdminOnly)
 				r.Get("/tags/*", s.listRegistryTags)
 				r.Get("/images", s.listRegistryImages)
-
-				// Admin-only registry mutations.
-				r.Group(func(r chi.Router) {
-					r.Use(auth.AdminOnly)
-					r.Put("/", s.updateRegistry)
-					r.Delete("/", s.deleteRegistry)
-					r.Post("/test", s.testRegistry)
-				})
+				r.Put("/", s.updateRegistry)
+				r.Delete("/", s.deleteRegistry)
+				r.Post("/test", s.testRegistry)
 			})
 			r.Get("/settings", s.getSettings)
 			r.Get("/settings/npm-certificates", s.listNpmCertificates)
@@ -312,6 +359,15 @@ func (s *Server) Router() chi.Router {
 				// of /triggers/{id}/bindings keyed on the workload side.
 				r.Get("/triggers", s.listBindingsForWorkload)
 				r.With(auth.AdminOnly).Post("/triggers", s.bindTriggerToWorkload)
+
+				// Per-workload notification routes — multi-destination
+				// fan-out (Slack channel + Discord webhook + ...). When
+				// zero rows are configured the dispatcher falls back to
+				// the legacy single-URL columns on the workload row.
+				r.Get("/notifications", s.listWorkloadNotifications)
+				r.With(auth.AdminOnly).Post("/notifications", s.createWorkloadNotification)
+				r.With(auth.AdminOnly).Put("/notifications/{nid}", s.updateWorkloadNotification)
+				r.With(auth.AdminOnly).Delete("/notifications/{nid}", s.deleteWorkloadNotification)
 			})

 			// Global container index, joined to workload + app names.
@@ -379,6 +435,12 @@ func (s *Server) Router() chi.Router {
 			r.Group(func(r chi.Router) {
 				r.Use(auth.AdminOnly)

+				// Prometheus-format metrics export. Admin-only so the
+				// counter cardinality cannot be enumerated by a low-trust
+				// viewer to map internal endpoints / sources / outcomes.
+				// Scrape with bearer auth from your Prometheus job.
+				r.Get("/metrics", s.metricsExport)
+
 				// Config export (reveals registry/global details).
 				r.Get("/config/export", s.exportConfig)

@@ -32,9 +32,26 @@ func (s *Server) streamEvents(w http.ResponseWriter, r *http.Request) {
 	w.WriteHeader(http.StatusOK)
 	flusher.Flush()

-	// Subscribe to instance status, deploy status, and persistent event log events.
+	// Build logs are high-volume: a single verbose `docker build` can emit
+	// thousands of lines. Streaming them to EVERY connection would flood each
+	// subscriber's bounded bus buffer and evict status/log events for ALL
+	// clients. So build logs are delivered ONLY to connections that opt in
+	// with ?workload_id=<id>, and only for that workload. Connections without
+	// the param (e.g. the global dashboard) never receive build-log frames.
+	buildLogWorkloadID := r.URL.Query().Get("workload_id")
 	sub := s.eventBus.Subscribe(func(evt events.Event) bool {
-		return evt.Type == events.EventInstanceStatus || evt.Type == events.EventDeployStatus || evt.Type == events.EventLog
+		switch evt.Type {
+		case events.EventInstanceStatus, events.EventDeployStatus, events.EventLog:
+			return true
+		case events.EventBuildLog:
+			if buildLogWorkloadID == "" {
+				return false
+			}
+			p, ok := evt.Payload.(events.BuildLogPayload)
+			return ok && p.WorkloadID == buildLogWorkloadID
+		default:
+			return false
+		}
 	})
 	defer s.eventBus.Unsubscribe(sub)

@@ -89,12 +89,16 @@ func toTriggerViewWithCount(row store.TriggerWithBindingCount) triggerView {
 // triggerRequest is the create/update body. Config is opaque per kind.
 // Auto-generates a webhook secret on create when WebhookEnabled is true;
 // the secret is exposed only via the /webhook subresource.
+//
+// WebhookRequireSignature is a *bool so we can distinguish "field omitted
+// by client" (nil → apply secure default of true when webhook is enabled)
+// from an explicit opt-out (false → respected).
 type triggerRequest struct {
 	Kind                    string          `json:"kind"`
 	Name                    string          `json:"name"`
 	Config                  json.RawMessage `json:"config"`
 	WebhookEnabled          bool            `json:"webhook_enabled"`
-	WebhookRequireSignature bool            `json:"webhook_require_signature"`
+	WebhookRequireSignature *bool           `json:"webhook_require_signature,omitempty"`
 }

 // Same per-blob caps used on the workload pluginWorkloadRequest path —
@@ -134,12 +138,26 @@ func (s *Server) getTrigger(w http.ResponseWriter, r *http.Request) {
 // buildTriggerFromRequest assembles a store.Trigger ready for insert.
 // Centralized so the standalone create endpoint and the inline-bind
 // endpoint cannot drift on secret-generation defaults.
+//
+// SECURITY: a new trigger with webhook enabled defaults to require_signature
+// = true. Operators can opt out at create time for receivers that do not
+// support HMAC, but the safer default avoids the "freshly-created trigger
+// accepts unsigned posts to its URL" footgun.
 func buildTriggerFromRequest(req triggerRequest) store.Trigger {
+	// Secure default: if webhook is enabled and the operator did NOT
+	// explicitly set require_signature, force it on. Explicit false is
+	// preserved (legacy receivers without HMAC support still work).
+	requireSig := false
+	if req.WebhookRequireSignature != nil {
+		requireSig = *req.WebhookRequireSignature
+	} else if req.WebhookEnabled {
+		requireSig = true
+	}
 	t := store.Trigger{
 		Kind:                    req.Kind,
 		Name:                    strings.TrimSpace(req.Name),
 		Config:                  string(req.Config),
-		WebhookRequireSignature: req.WebhookRequireSignature,
+		WebhookRequireSignature: requireSig,
 	}
 	if req.WebhookEnabled {
 		t.WebhookSecret = generateWebhookSecret()
@@ -199,7 +217,13 @@ func (s *Server) updateTrigger(w http.ResponseWriter, r *http.Request) {
 	if len(req.Config) > 0 {
 		existing.Config = string(req.Config)
 	}
-	existing.WebhookRequireSignature = req.WebhookRequireSignature
+	if req.WebhookRequireSignature != nil {
+		existing.WebhookRequireSignature = *req.WebhookRequireSignature
+	} else if req.WebhookEnabled && !existing.WebhookRequireSignature {
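+		// Webhook requested enabled, signature flag omitted, and the
+		// stored flag is currently false — take the secure default.
+		// NOTE(review): this branch does not check whether the webhook
+		// was previously disabled, so it also fires on an ordinary update
+		// of an already-enabled trigger; a client that opted out earlier
+		// must resend webhook_require_signature=false to keep the opt-out.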
+		existing.WebhookRequireSignature = true
+	}
 	wasEnabled := existing.WebhookSecret != ""
 	if req.WebhookEnabled && !wasEnabled {
 		// false→true transition: rotate both secrets so re-enabling
@@ -13,18 +13,29 @@ import (
 	"github.com/alexei/tinyforge/internal/auth"
 	"github.com/alexei/tinyforge/internal/store"
 	"github.com/alexei/tinyforge/internal/workload/plugin"
+	"github.com/alexei/tinyforge/internal/workload/preview"
 )

 // chainNode is the lightweight shape returned by /chain — we deliberately
 // don't return full plugin.Workload values for ancestor/descendant rows
 // because the secret fields don't belong in a chain-traversal response.
+//
+// IsPreview / PreviewBranch surface branch-preview children to the UI so it
+// can render them in a dedicated "Preview environments" panel rather than as
+// undistinguished stage children. They are computed against the chain's
+// `self` workload via preview.IsPreviewChild — the canonical "this child is a
+// branch preview" test that reverses the MaterializeForBranch naming formula.
+// Both are zero-valued (false / "") for the parent and self nodes and for
+// operator-created stage children.
 type chainNode struct {
-	ID         string `json:"id"`
-	Name       string `json:"name"`
-	SourceKind string `json:"source_kind"`
-	TriggerKind string `json:"trigger_kind"`
-	CreatedAt  string `json:"created_at"`
-	UpdatedAt  string `json:"updated_at"`
+	ID            string `json:"id"`
+	Name          string `json:"name"`
+	SourceKind    string `json:"source_kind"`
+	TriggerKind   string `json:"trigger_kind"`
+	IsPreview     bool   `json:"is_preview"`
+	PreviewBranch string `json:"preview_branch,omitempty"`
+	CreatedAt     string `json:"created_at"`
+	UpdatedAt     string `json:"updated_at"`
 }

 func chainNodeOf(w store.Workload) chainNode {
@@ -38,6 +49,32 @@ func chainNodeOf(w store.Workload) chainNode {
 	}
 }

+// previewBranchOf reads the `branch` key out of a workload's source_config
+// JSON (the key MaterializeForBranch wrote for preview children).
+//
+// It yields "" when source_config is empty or cannot be decoded. Callers
+// only invoke it for rows preview.IsPreviewChild has already accepted, so an
+// empty result simply means the JSON was unreadable.
+func previewBranchOf(w store.Workload) string {
+	if w.SourceConfig == "" {
+		return ""
+	}
+	var decoded struct {
+		Branch string `json:"branch"`
+	}
+	// The decode error is intentionally dropped: a config that fails to
+	// parse is reported to the caller as "no branch", not as a failure.
+	_ = json.Unmarshal([]byte(w.SourceConfig), &decoded)
+	return decoded.Branch
+}
+
+// childChainNode converts a child row into its chainNode. Children that
+// preview.IsPreviewChild recognises as materialized from `self` get
+// IsPreview set and PreviewBranch filled from their source_config; every
+// other child is returned exactly as chainNodeOf built it.
+func childChainNode(self, child store.Workload) chainNode {
+	out := chainNodeOf(child)
+	if !preview.IsPreviewChild(self, child) {
+		return out
+	}
+	out.IsPreview = true
+	out.PreviewBranch = previewBranchOf(child)
+	return out
+}
+
 // getWorkloadChain handles GET /api/workloads/{id}/chain.
 //
 // Returns the workload's parent (or nil), itself, and its direct children
@@ -76,7 +113,7 @@ func (s *Server) getWorkloadChain(w http.ResponseWriter, r *http.Request) {
 	}
 	children := make([]chainNode, 0, len(childRows))
 	for _, c := range childRows {
-		children = append(children, chainNodeOf(c))
+		children = append(children, childChainNode(self, c))
 	}

 	respondJSON(w, http.StatusOK, map[string]any{
@@ -0,0 +1,147 @@
+package api
+
+import (
+	"testing"
+
+	"github.com/alexei/tinyforge/internal/store"
+)
+
+// TestChildChainNode_MarksPreviewChildren verifies the /chain DTO builder
+// distinguishes branch-preview children (materialized by the preview package)
+// from operator-created stage children that merely share the parent link.
+// The discriminator is preview.IsPreviewChild, which reverses the
+// MaterializeForBranch naming formula: name == template.Name + "/" + slug.
+//
+// Every case is run against the same template workload ("tmpl-1" / "myapp");
+// only the child row varies.
+func TestChildChainNode_MarksPreviewChildren(t *testing.T) {
+	template := store.Workload{
+		ID:         "tmpl-1",
+		Name:       "myapp",
+		SourceKind: "dockerfile",
+	}
+
+	tests := []struct {
+		name       string
+		child      store.Workload
+		wantPrev   bool
+		wantBranch string
+	}{
+		{
+			// Canonical preview: parent link points at the template and
+			// the name is "myapp/" + slug of the configured branch
+			// ("feat/login" -> "feat-login").
+			name: "preview child is marked with its branch",
+			child: store.Workload{
+				ID:               "child-prev",
+				Name:             "myapp/feat-login",
+				SourceKind:       "dockerfile",
+				SourceConfig:     `{"branch":"feat/login","port":3000}`,
+				ParentWorkloadID: "tmpl-1",
+			},
+			wantPrev:   true,
+			wantBranch: "feat/login",
+		},
+		{
+			// Same parent link, but the name does not follow the
+			// "<template>/<slug>" formula.
+			name: "operator-named stage child sharing the parent is not a preview",
+			child: store.Workload{
+				ID:               "child-stage",
+				Name:             "myapp-staging",
+				SourceKind:       "dockerfile",
+				SourceConfig:     `{"branch":"main"}`,
+				ParentWorkloadID: "tmpl-1",
+			},
+			wantPrev:   false,
+			wantBranch: "",
+		},
+		{
+			// Name and branch would match, but ParentWorkloadID points at
+			// a workload other than the template under test.
+			name: "child of a different parent is not a preview of self",
+			child: store.Workload{
+				ID:               "child-other",
+				Name:             "myapp/feat-login",
+				SourceKind:       "dockerfile",
+				SourceConfig:     `{"branch":"feat/login"}`,
+				ParentWorkloadID: "some-other-template",
+			},
+			wantPrev:   false,
+			wantBranch: "",
+		},
+		{
+			// Name and parent match, but source_config carries no branch
+			// key; the test expects this not to count as a preview.
+			name: "child with no branch in source_config is not a preview",
+			child: store.Workload{
+				ID:               "child-nobranch",
+				Name:             "myapp/feat-login",
+				SourceKind:       "dockerfile",
+				SourceConfig:     `{}`,
+				ParentWorkloadID: "tmpl-1",
+			},
+			wantPrev:   false,
+			wantBranch: "",
+		},
+		{
+			// Same parent + a valid branch, but the name carries an extra
+			// suffix so it fails ONLY the slug-equality check (expected
+			// "myapp/feat-login", got "myapp/feat-login-staging"). The
+			// branch alone must not be enough to mark a preview.
+			name: "valid branch but name fails the slug match is not a preview",
+			child: store.Workload{
+				ID:               "child-slugmiss",
+				Name:             "myapp/feat-login-staging",
+				SourceKind:       "dockerfile",
+				SourceConfig:     `{"branch":"feat/login","port":3000}`,
+				ParentWorkloadID: "tmpl-1",
+			},
+			wantPrev:   false,
+			wantBranch: "",
+		},
+		{
+			// Uppercase + slash branch: slugifyBranch lowercases and maps
+			// "/" -> "-", so "Feature/Login" -> "feature-login" and the name
+			// "myapp/feature-login" matches. PreviewBranch must echo the RAW
+			// branch from source_config ("Feature/Login"), not the slug.
+			name: "uppercase slash branch matches and keeps raw branch",
+			child: store.Workload{
+				ID:               "child-upper",
+				Name:             "myapp/feature-login",
+				SourceKind:       "dockerfile",
+				SourceConfig:     `{"branch":"Feature/Login","port":8080}`,
+				ParentWorkloadID: "tmpl-1",
+			},
+			wantPrev:   true,
+			wantBranch: "Feature/Login",
+		},
+	}
+
+	for _, tc := range tests {
+		t.Run(tc.name, func(t *testing.T) {
+			node := childChainNode(template, tc.child)
+			if node.IsPreview != tc.wantPrev {
+				t.Errorf("IsPreview = %v, want %v", node.IsPreview, tc.wantPrev)
+			}
+			if node.PreviewBranch != tc.wantBranch {
+				t.Errorf("PreviewBranch = %q, want %q", node.PreviewBranch, tc.wantBranch)
+			}
+			// Base fields must always round-trip regardless of preview status.
+			if node.ID != tc.child.ID || node.Name != tc.child.Name {
+				t.Errorf("base fields mangled: got id=%q name=%q", node.ID, node.Name)
+			}
+		})
+	}
+}
+
+// TestPreviewBranchOf_ToleratesMalformedConfig checks that previewBranchOf
+// hands back the configured branch for well-formed JSON and falls back to ""
+// (without panicking) when source_config is absent, empty, or not JSON.
+func TestPreviewBranchOf_ToleratesMalformedConfig(t *testing.T) {
+	type scenario struct {
+		label    string
+		config   string
+		expected string
+	}
+	table := []scenario{
+		{label: "valid branch", config: `{"branch":"release/v1"}`, expected: "release/v1"},
+		{label: "empty config", config: ``, expected: ""},
+		{label: "empty object", config: `{}`, expected: ""},
+		{label: "malformed json", config: `{not-json`, expected: ""},
+	}
+	for _, sc := range table {
+		t.Run(sc.label, func(t *testing.T) {
+			workload := store.Workload{SourceConfig: sc.config}
+			if branch := previewBranchOf(workload); branch != sc.expected {
+				t.Errorf("previewBranchOf(%q) = %q, want %q", sc.config, branch, sc.expected)
+			}
+		})
+	}
+}
@@ -0,0 +1,231 @@
+package api
+
+import (
+	"errors"
+	"log/slog"
+	"net/http"
+	"strings"
+
+	"github.com/go-chi/chi/v5"
+
+	"github.com/alexei/tinyforge/internal/crypto"
+	"github.com/alexei/tinyforge/internal/store"
+)
+
+// workloadNotificationRow is the JSON shape returned to clients. The
+// `secret_set` boolean replaces the actual ciphertext: once stored a
+// secret is write-only, mirroring how workload_env hides encrypted
+// values. Rotating means submitting a new value.
+type workloadNotificationRow struct {
+	ID         string `json:"id"`
+	WorkloadID string `json:"workload_id"`
+	Name       string `json:"name"`
+	URL        string `json:"url"`
+	// SecretSet reports only whether a secret is stored; the stored value
+	// itself is never serialized.
+	SecretSet  bool   `json:"secret_set"`
+	// EventTypes is passed through from the store unchanged.
+	// NOTE(review): its encoding (e.g. comma-separated list) is not
+	// visible here — confirm against the dispatcher.
+	EventTypes string `json:"event_types"`
+	Enabled    bool   `json:"enabled"`
+	SortOrder  int    `json:"sort_order"`
+	CreatedAt  string `json:"created_at"`
+	UpdatedAt  string `json:"updated_at"`
+}
+
+// toWorkloadNotificationRow maps a stored notification onto the client-facing
+// row. All fields are copied verbatim except the secret, which is reduced to
+// a "is one configured?" flag so the ciphertext never leaves the server.
+func toWorkloadNotificationRow(n store.WorkloadNotification) workloadNotificationRow {
+	var row workloadNotificationRow
+	row.ID = n.ID
+	row.WorkloadID = n.WorkloadID
+	row.Name = n.Name
+	row.URL = n.URL
+	row.SecretSet = n.Secret != ""
+	row.EventTypes = n.EventTypes
+	row.Enabled = n.Enabled
+	row.SortOrder = n.SortOrder
+	row.CreatedAt = n.CreatedAt
+	row.UpdatedAt = n.UpdatedAt
+	return row
+}
+
+// listWorkloadNotifications handles GET /notifications under the per-workload
+// route. It answers 404 when the workload in the {id} URL param does not
+// exist, 500 on a store failure, and otherwise 200 with the workload's
+// notification rows (an empty JSON array when none are configured).
+func (s *Server) listWorkloadNotifications(w http.ResponseWriter, r *http.Request) {
+	workloadID := chi.URLParam(r, "id")
+
+	// Confirm the workload exists first so an unknown id is a 404 rather
+	// than an empty list.
+	_, err := s.store.GetWorkloadByID(workloadID)
+	switch {
+	case errors.Is(err, store.ErrNotFound):
+		respondNotFound(w, "workload")
+		return
+	case err != nil:
+		respondError(w, http.StatusInternalServerError, "get workload")
+		return
+	}
+
+	notifications, err := s.store.ListWorkloadNotifications(workloadID)
+	if err != nil {
+		respondError(w, http.StatusInternalServerError, "list workload notifications")
+		return
+	}
+
+	views := make([]workloadNotificationRow, len(notifications))
+	for i := range notifications {
+		views[i] = toWorkloadNotificationRow(notifications[i])
+	}
+	respondJSON(w, http.StatusOK, views)
+}
+
+// workloadNotificationRequest is the POST/PUT body. Secret is the raw
+// plaintext webhook signing key; the server encrypts it at rest with
+// the global encryption key before INSERT. An empty Secret on UPDATE
+// leaves the stored secret untouched so the operator can edit the URL
+// or event filter without re-entering the secret each time.
+type workloadNotificationRequest struct {
+	Name       string `json:"name"`
+	URL        string `json:"url"`
+	Secret     string `json:"secret"`
+	EventTypes string `json:"event_types"`
+	// Enabled is a pointer so an omitted field (nil) can be told apart
+	// from an explicit false: create treats nil as true, update leaves
+	// the stored value unchanged.
+	Enabled    *bool  `json:"enabled"`
+	SortOrder  int    `json:"sort_order"`
+}
+
+// createWorkloadNotification handles POST /notifications under the
+// per-workload route, adding one notification destination to the workload
+// named by the {id} URL param.
+//
+// Responses:
+//   - 404 when the workload does not exist
+//   - 400 when url is empty after trimming whitespace
+//   - 500 on a store or encryption failure
+//   - 201 with the created row (the secret is reported only as secret_set)
+//
+// A body that fails decodeJSONStrict ends the request without further output
+// from this handler (presumably the helper writes its own error response).
+func (s *Server) createWorkloadNotification(w http.ResponseWriter, r *http.Request) {
+	id := chi.URLParam(r, "id")
+	if _, err := s.store.GetWorkloadByID(id); err != nil {
+		if errors.Is(err, store.ErrNotFound) {
+			respondNotFound(w, "workload")
+			return
+		}
+		respondError(w, http.StatusInternalServerError, "get workload")
+		return
+	}
+	var req workloadNotificationRequest
+	if !decodeJSONStrict(w, r, &req) {
+		return
+	}
+	req.URL = strings.TrimSpace(req.URL)
+	req.Name = strings.TrimSpace(req.Name)
+	if req.URL == "" {
+		respondError(w, http.StatusBadRequest, "url is required")
+		return
+	}
+	// A secret is optional; only a non-empty one is encrypted, so "no
+	// secret" is stored as the empty string.
+	encSecret := ""
+	if req.Secret != "" {
+		v, err := crypto.Encrypt(s.encKey, req.Secret)
+		if err != nil {
+			slog.Error("workload notifications: encrypt secret", "workload", id, "error", err)
+			respondError(w, http.StatusInternalServerError, "encrypt secret")
+			return
+		}
+		encSecret = v
+	}
+	// Omitting `enabled` creates the destination switched on.
+	enabled := true
+	if req.Enabled != nil {
+		enabled = *req.Enabled
+	}
+	created, err := s.store.CreateWorkloadNotification(store.WorkloadNotification{
+		WorkloadID: id,
+		Name:       req.Name,
+		URL:        req.URL,
+		Secret:     encSecret,
+		EventTypes: req.EventTypes,
+		Enabled:    enabled,
+		SortOrder:  req.SortOrder,
+	})
+	if err != nil {
+		slog.Error("workload notifications: create", "workload", id, "error", err)
+		respondError(w, http.StatusInternalServerError, "create workload notification")
+		return
+	}
+	respondJSON(w, http.StatusCreated, toWorkloadNotificationRow(created))
+}
+
+// updateWorkloadNotification handles PUT /notifications/{nid} under the
+// per-workload route. The body is a full replacement for name, url,
+// event_types and sort_order (an omitted field is written as its zero
+// value); enabled and secret are changed only when supplied.
+//
+// Responses:
+//   - 404 when the workload or the notification is missing, or when the
+//     notification belongs to a different workload
+//   - 400 when url is empty after trimming whitespace
+//   - 500 on a store or encryption failure
+//   - 200 with the updated row
+//
+// NOTE(review): the 200 response is built from the in-memory row loaded
+// before the write, so updated_at carries the pre-update value unless the
+// store reports the new timestamp some other way — confirm against
+// UpdateWorkloadNotification.
+func (s *Server) updateWorkloadNotification(w http.ResponseWriter, r *http.Request) {
+	id := chi.URLParam(r, "id")
+	nid := chi.URLParam(r, "nid")
+	if _, err := s.store.GetWorkloadByID(id); err != nil {
+		if errors.Is(err, store.ErrNotFound) {
+			respondNotFound(w, "workload")
+			return
+		}
+		respondError(w, http.StatusInternalServerError, "get workload")
+		return
+	}
+	existing, err := s.store.GetWorkloadNotification(nid)
+	if err != nil {
+		if errors.Is(err, store.ErrNotFound) {
+			respondNotFound(w, "workload_notification")
+			return
+		}
+		respondError(w, http.StatusInternalServerError, "get workload_notification")
+		return
+	}
+	if existing.WorkloadID != id {
+		// Route mismatch — the row exists but under a different workload.
+		// Return 404 rather than 403 so we don't leak the existence of
+		// foreign rows to an unauthorised caller.
+		respondNotFound(w, "workload_notification")
+		return
+	}
+
+	var req workloadNotificationRequest
+	if !decodeJSONStrict(w, r, &req) {
+		return
+	}
+	req.URL = strings.TrimSpace(req.URL)
+	req.Name = strings.TrimSpace(req.Name)
+	if req.URL == "" {
+		respondError(w, http.StatusBadRequest, "url is required")
+		return
+	}
+
+	// These four fields are overwritten unconditionally, including with
+	// zero values when the client left them out.
+	existing.Name = req.Name
+	existing.URL = req.URL
+	existing.EventTypes = req.EventTypes
+	existing.SortOrder = req.SortOrder
+	if req.Enabled != nil {
+		existing.Enabled = *req.Enabled
+	}
+	// Empty Secret on UPDATE preserves the stored ciphertext — explicit
+	// rotation requires sending the new plaintext. This avoids forcing
+	// the operator to re-enter their secret on every URL edit.
+	if req.Secret != "" {
+		v, err := crypto.Encrypt(s.encKey, req.Secret)
+		if err != nil {
+			slog.Error("workload notifications: encrypt secret", "workload", id, "error", err)
+			respondError(w, http.StatusInternalServerError, "encrypt secret")
+			return
+		}
+		existing.Secret = v
+	}
+
+	if err := s.store.UpdateWorkloadNotification(existing); err != nil {
+		// The row can disappear between the read above and this write.
+		if errors.Is(err, store.ErrNotFound) {
+			respondNotFound(w, "workload_notification")
+			return
+		}
+		slog.Error("workload notifications: update", "workload", id, "error", err)
+		respondError(w, http.StatusInternalServerError, "update workload notification")
+		return
+	}
+	respondJSON(w, http.StatusOK, toWorkloadNotificationRow(existing))
+}
+
+// deleteWorkloadNotification handles DELETE /notifications/{nid} under the
+// per-workload route. It does not look the workload up separately: the
+// notification's WorkloadID must equal the {id} URL param, otherwise the
+// handler answers 404, exactly as it does for a missing row. On success it
+// responds 200 with {"success": true}; store failures yield 500.
+func (s *Server) deleteWorkloadNotification(w http.ResponseWriter, r *http.Request) {
+	id := chi.URLParam(r, "id")
+	nid := chi.URLParam(r, "nid")
+	existing, err := s.store.GetWorkloadNotification(nid)
+	if err != nil {
+		if errors.Is(err, store.ErrNotFound) {
+			respondNotFound(w, "workload_notification")
+			return
+		}
+		respondError(w, http.StatusInternalServerError, "get workload_notification")
+		return
+	}
+	// A row owned by another workload is reported as not found rather
+	// than forbidden.
+	if existing.WorkloadID != id {
+		respondNotFound(w, "workload_notification")
+		return
+	}
+	if err := s.store.DeleteWorkloadNotification(nid); err != nil {
+		// The row can disappear between the read above and this delete.
+		if errors.Is(err, store.ErrNotFound) {
+			respondNotFound(w, "workload_notification")
+			return
+		}
+		slog.Error("workload notifications: delete", "workload", id, "error", err)
+		respondError(w, http.StatusInternalServerError, "delete workload notification")
+		return
+	}
+	respondJSON(w, http.StatusOK, map[string]any{"success": true})
+}
@@ -82,16 +82,27 @@ func (s *Server) getWorkloadRuntimeState(w http.ResponseWriter, r *http.Request)

 	payload := runtimeStatePayload{SourceKind: workload.SourceKind}

-	if workload.SourceKind != "static" {
+	// Both static and dockerfile sources persist their runtime state into
+	// containers.extra_json under a deterministic row id. The shapes
+	// match (status / last_commit_sha / last_sync_at / last_error) so the
+	// handler can decode them identically. The suffix differs per source
+	// kind: static uses ":site", dockerfile uses ":dockerfile".
+	var rowSuffix string
+	switch workload.SourceKind {
+	case "static":
+		rowSuffix = ":site"
+	case "dockerfile":
+		rowSuffix = ":dockerfile"
+	default:
 		respondJSON(w, http.StatusOK, payload)
 		return
 	}

-	// The static plugin owns one container row per workload at the
-	// deterministic ID <workloadID>:site. A missing row means the
-	// workload has never been deployed — return HasState=false so the
-	// UI can prompt the operator to deploy.
-	row, err := s.store.GetContainerByID(id + ":site")
+	// The owning plugin maintains one container row per workload at the
+	// deterministic ID. A missing row means the workload has never been
+	// deployed — return HasState=false so the UI can prompt the operator
+	// to deploy.
+	row, err := s.store.GetContainerByID(id + rowSuffix)
 	if err != nil {
 		if errors.Is(err, store.ErrNotFound) {
 			respondJSON(w, http.StatusOK, payload)
@@ -130,6 +130,13 @@ func TestGetWorkloadRuntimeState_MalformedExtraJSON_ReturnsContainerFieldsOnly(t
 		SourceKind:   "static",
 		SourceConfig: `{"provider":"gitea"}`,
 	})
+	// Seed a row with a valid extra_json first, then corrupt it via raw
+	// SQL. Prior to the write-side validateExtraJSON guard this test
+	// could pass a malformed string straight to UpsertContainer; the
+	// guard now rejects that at the boundary, which is the correct
+	// behaviour. The reader resilience this test verifies remains
+	// relevant for pre-existing bad rows from upgrades or external
+	// manipulation, so we still produce one via direct SQL.
 	if err := e.store.UpsertContainer(store.Container{
 		ID:           wl.ID + ":site",
 		WorkloadID:   wl.ID,
@@ -137,10 +144,16 @@ func TestGetWorkloadRuntimeState_MalformedExtraJSON_ReturnsContainerFieldsOnly(t
 		Host:         "local",
 		ContainerID:  "abc",
 		State:        "running",
-		ExtraJSON:    `{this is not json`,
+		ExtraJSON:    `{}`,
 	}); err != nil {
 		t.Fatalf("seed: %v", err)
 	}
+	if _, err := e.store.DB().Exec(
+		`UPDATE containers SET extra_json = ? WHERE id = ?`,
+		`{this is not json`, wl.ID+":site",
+	); err != nil {
+		t.Fatalf("corrupt extra_json: %v", err)
+	}
 	resp := e.do(t, http.MethodGet, "/api/workloads/"+wl.ID+"/runtime-state", nil)
 	if resp.StatusCode != http.StatusOK {
 		t.Fatalf("status = %d, want 200 (decode is non-fatal)", resp.StatusCode)
@@ -155,6 +168,57 @@ func TestGetWorkloadRuntimeState_MalformedExtraJSON_ReturnsContainerFieldsOnly(t
 	}
 }

+// TestGetWorkloadRuntimeState_DockerfileSourceDeployed_DecodesExtraJSON seeds
+// a dockerfile-source workload plus a container row at
+// "<workloadID>:dockerfile" and checks that
+// GET /api/workloads/{id}/runtime-state reports HasState and surfaces both
+// the container columns and the fields decoded from extra_json.
+func TestGetWorkloadRuntimeState_DockerfileSourceDeployed_DecodesExtraJSON(t *testing.T) {
+	e := newAPITestEnv(t)
+	wl, err := e.store.CreateWorkload(store.Workload{
+		Kind:         string(store.WorkloadKindProject),
+		Name:         "build-app",
+		SourceKind:   "dockerfile",
+		SourceConfig: `{"provider":"gitea","port":3000}`,
+	})
+	if err != nil {
+		t.Fatalf("seed workload: %v", err)
+	}
+	// Marshalling a map of plain strings cannot fail, so the error is
+	// dropped.
+	extra, _ := json.Marshal(map[string]any{
+		"status":          "deployed",
+		"last_commit_sha": "deadbeef",
+		"last_sync_at":    "2026-05-23T10:00:00Z",
+		"last_error":      "",
+	})
+	// The row id uses the ":dockerfile" suffix the handler looks up for
+	// this source kind.
+	if err := e.store.UpsertContainer(store.Container{
+		ID:           wl.ID + ":dockerfile",
+		WorkloadID:   wl.ID,
+		WorkloadKind: string(store.WorkloadKindBuild),
+		Host:         "local",
+		ContainerID:  "ffeeddcc",
+		State:        "running",
+		ExtraJSON:    string(extra),
+	}); err != nil {
+		t.Fatalf("seed container: %v", err)
+	}
+
+	resp := e.do(t, http.MethodGet, "/api/workloads/"+wl.ID+"/runtime-state", nil)
+	if resp.StatusCode != http.StatusOK {
+		t.Fatalf("status = %d, want 200", resp.StatusCode)
+	}
+	var got runtimeStatePayload
+	if errMsg := decodeEnvelope(t, resp, &got); errMsg != "" {
+		t.Fatalf("envelope error: %q", errMsg)
+	}
+	if !got.HasState {
+		t.Fatalf("HasState = false, want true")
+	}
+	if got.SourceKind != "dockerfile" {
+		t.Errorf("SourceKind = %q, want dockerfile", got.SourceKind)
+	}
+	// Fields taken from the container row itself.
+	if got.ContainerID != "ffeeddcc" || got.State != "running" {
+		t.Errorf("container fields = (%q,%q), want (ffeeddcc, running)", got.ContainerID, got.State)
+	}
+	// Fields decoded from extra_json.
+	if got.Status != "deployed" || got.LastCommitSHA != "deadbeef" {
+		t.Errorf("runtime fields = %+v, want deployed/deadbeef", got)
+	}
+}
+
 // =============================================================================
 // GET /api/workloads/{id}/storage
 // =============================================================================
@@ -14,6 +14,7 @@ import (
 	"github.com/alexei/tinyforge/internal/auth"
 	"github.com/alexei/tinyforge/internal/store"
 	"github.com/alexei/tinyforge/internal/workload/plugin"
+	"github.com/alexei/tinyforge/internal/workload/preview"
 )

 // pluginWorkloadRequest is the JSON body accepted by create + update.
@@ -227,6 +228,28 @@ func (s *Server) deletePluginWorkload(w http.ResponseWriter, r *http.Request) {
 		return
 	}

+	// Cascade-teardown any branch previews materialized from this workload
+	// so deleting a template does not orphan their containers, proxy routes,
+	// and rows. Operator-managed stage-chain children (which share the same
+	// parent link) are deliberately left alone — only previews are auto-owned
+	// by the template (see preview.IsPreviewChild).
+	if previews, err := preview.ListPreviewChildren(s.store, row); err != nil {
+		slog.Warn("delete workload: list preview children", "workload", id, "error", err)
+	} else {
+		for _, child := range previews {
+			if child.SourceKind != "" {
+				if err := s.deployer.DispatchTeardown(r.Context(), toPluginWorkload(child)); err != nil {
+					slog.Warn("delete workload: preview child teardown error",
+						"workload", id, "child", child.ID, "error", err)
+				}
+			}
+			if err := s.store.DeleteWorkload(child.ID); err != nil && !errors.Is(err, store.ErrNotFound) {
+				slog.Warn("delete workload: preview child delete error",
+					"workload", id, "child", child.ID, "error", err)
+			}
+		}
+	}
+
 	if row.SourceKind != "" {
 		if err := s.deployer.DispatchTeardown(r.Context(), toPluginWorkload(row)); err != nil {
 			slog.Warn("delete workload: teardown error",
@@ -85,9 +85,15 @@ func (la *LocalAuth) cleanBlacklist() {
 	}
 }

+// bcryptCost is the work factor used for new password hashes. Raised from
+// the library default (10) to 12 to keep pace with faster hardware.
+// Existing hashes at lower costs still verify — bcrypt encodes the cost
+// in the stored hash itself.
+const bcryptCost = 12
+
 // HashPassword hashes a plaintext password using bcrypt.
 func HashPassword(password string) (string, error) {
-	hash, err := bcrypt.GenerateFromPassword([]byte(password), bcrypt.DefaultCost)
+	hash, err := bcrypt.GenerateFromPassword([]byte(password), bcryptCost)
 	if err != nil {
 		return "", fmt.Errorf("hash password: %w", err)
 	}
@@ -1,13 +1,17 @@
 package backup

 import (
+	"database/sql"
 	"fmt"
+	"io"
 	"log/slog"
 	"os"
 	"path/filepath"
 	"sync"
 	"time"

+	_ "modernc.org/sqlite" // read-only candidate inspection via PRAGMA integrity_check
+
 	"github.com/alexei/tinyforge/internal/store"
 )

@@ -129,6 +133,17 @@ func (e *Engine) RestorePath(id string) (string, error) {
 		return "", fmt.Errorf("get backup: %w", err)
 	}

+	// Filename comes from a DB row. Defence-in-depth: a backup file must live
+	// directly under backupDir, so reject any value carrying a path separator
+	// or traversal before joining. A poisoned row (future import path, manual
+	// insert) must never let restore read — and then atomically copy over the
+	// live DB — an arbitrary file. CreateBackup builds safe base names; this
+	// enforces the same invariant on read.
+	if backup.Filename == "" || backup.Filename == "." || backup.Filename == ".." ||
+		backup.Filename != filepath.Base(backup.Filename) {
+		return "", fmt.Errorf("backup: invalid filename %q", backup.Filename)
+	}
+
 	filePath := filepath.Join(e.backupDir, backup.Filename)
 	if _, err := os.Stat(filePath); err != nil {
 		return "", fmt.Errorf("backup file not found: %w", err)
@@ -137,6 +152,153 @@ func (e *Engine) RestorePath(id string) (string, error) {
 	return filePath, nil
 }

+// PrepareRestore validates a backup candidate before the caller swaps it
+// over the live DB. Runs three checks in order:
+//
+//  1. The candidate file exists and is at least 100 bytes long.
+//  2. SQLite header magic matches (catches corrupted or partial downloads).
+//  3. `PRAGMA integrity_check` on the read-only candidate returns "ok"
+//     (catches WAL/page corruption that the header check misses).
+//
+// On success returns the candidate path. On failure returns a wrapped
+// error describing which probe rejected the file, so the operator can
+// see exactly why a "restore" was refused rather than getting a corrupt
+// DB at next boot.
+//
+// integrity_check opens the candidate file directly (mode=ro&immutable=1)
+// rather than attaching it to the live connection, so the probe never
+// writes to the backup or creates WAL/SHM sidecars next to it.
+func (e *Engine) PrepareRestore(id string) (string, error) {
+	path, err := e.RestorePath(id)
+	if err != nil {
+		return "", err
+	}
+
+	info, err := os.Stat(path)
+	if err != nil {
+		return "", fmt.Errorf("restore: stat candidate: %w", err)
+	}
+	if info.Size() < 100 {
+		return "", fmt.Errorf("restore: candidate %s is suspiciously small (%d bytes)", path, info.Size())
+	}
+
+	// SQLite file header: "SQLite format 3\x00" (16 bytes).
+	hdr, err := readHead(path, 16)
+	if err != nil {
+		return "", fmt.Errorf("restore: read header: %w", err)
+	}
+	if string(hdr) != "SQLite format 3\x00" {
+		return "", fmt.Errorf("restore: candidate %s is not a SQLite database (header mismatch)", path)
+	}
+
+	if err := integrityCheck(path); err != nil {
+		return "", fmt.Errorf("restore: integrity check failed: %w", err)
+	}
+
+	return path, nil
+}
+
+func readHead(path string, n int) ([]byte, error) {
+	f, err := os.Open(path)
+	if err != nil {
+		return nil, err
+	}
+	defer f.Close()
+	buf := make([]byte, n)
+	// io.ReadFull (not f.Read) guarantees the buffer is filled.
+	// A bare Read can short-return on some filesystems / on small
+	// files, which would skew the SQLite-header magic check below.
+	if _, err := io.ReadFull(f, buf); err != nil {
+		return nil, err
+	}
+	return buf, nil
+}
+
+// integrityCheck opens the candidate read-only and runs
+// `PRAGMA integrity_check`. We use immutable=1 so the driver does not
+// try to create WAL/SHM sidecars or upgrade the journal mode on the
+// candidate — both of which fail with "attempt to write a readonly
+// database" against a backup file. Anything other than the single row
+// `"ok"` is treated as corruption.
+func integrityCheck(path string) error {
+	db, err := sql.Open("sqlite", "file:"+path+"?mode=ro&immutable=1")
+	if err != nil {
+		return fmt.Errorf("open candidate: %w", err)
+	}
+	defer db.Close()
+
+	rows, err := db.Query("PRAGMA integrity_check")
+	if err != nil {
+		return fmt.Errorf("pragma integrity_check: %w", err)
+	}
+	defer rows.Close()
+
+	if !rows.Next() {
+		return fmt.Errorf("integrity_check returned no rows")
+	}
+	var result string
+	if err := rows.Scan(&result); err != nil {
+		return fmt.Errorf("scan integrity_check: %w", err)
+	}
+	if result != "ok" {
+		return fmt.Errorf("integrity_check: %s", result)
+	}
+	return nil
+}
+
+// AtomicReplaceDB writes a backup candidate into place atomically.
+// The caller is expected to:
+//  1. Call PrepareRestore(id) → candidatePath.
+//  2. Take a "pre-restore" backup of the current DB via CreateBackup.
+//  3. Close the live *sql.DB.
+//  4. Call AtomicReplaceDB(candidatePath, livePath).
+//  5. Trigger graceful shutdown; main() will re-open on next start.
+//
+// AtomicReplaceDB also wipes WAL/SHM sidecar files so the new DB starts
+// from a clean checkpoint state. Failure to remove sidecars is logged
+// but non-fatal — SQLite recreates them on open.
+func (e *Engine) AtomicReplaceDB(candidatePath, livePath string) error {
+	// Copy candidate to a tmp file next to the live DB, then rename
+	// atomically. os.Rename fails across volumes/filesystems, so we
+	// keep tmp in the same dir as the destination.
+	tmp := livePath + ".restore.tmp"
+	if err := copyFile(candidatePath, tmp); err != nil {
+		return fmt.Errorf("copy candidate to %s: %w", tmp, err)
+	}
+	// Best-effort: remove WAL/SHM so SQLite re-checkpoints from the
+	// restored main file rather than a stale WAL pointing at the old
+	// DB's pages.
+	for _, sidecar := range []string{livePath + "-wal", livePath + "-shm"} {
+		if err := os.Remove(sidecar); err != nil && !os.IsNotExist(err) {
+			slog.Warn("restore: remove sidecar", "path", sidecar, "error", err)
+		}
+	}
+	if err := os.Rename(tmp, livePath); err != nil {
+		// Clean up tmp on rename failure so we don't leak a partial file.
+		_ = os.Remove(tmp)
+		return fmt.Errorf("rename %s → %s: %w", tmp, livePath, err)
+	}
+	slog.Info("restore: database file replaced atomically", "live", livePath)
+	return nil
+}
+
+func copyFile(src, dst string) error {
+	in, err := os.Open(src)
+	if err != nil {
+		return err
+	}
+	defer in.Close()
+	out, err := os.OpenFile(dst, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0o600)
+	if err != nil {
+		return err
+	}
+	if _, err := io.Copy(out, in); err != nil {
+		_ = out.Close()
+		return err
+	}
+	return out.Close()
+}
+
 // Prune removes old backups exceeding the retention count.
 // Returns the number of backups pruned.
 func (e *Engine) Prune(retentionCount int) (int, error) {
@@ -0,0 +1,113 @@
+package backup
+
+import (
+	"errors"
+	"os"
+	"path/filepath"
+	"strings"
+	"testing"
+
+	"github.com/alexei/tinyforge/internal/store"
+)
+
+// newTestEngine spins up an isolated store + engine pair for tests.
+// Each test gets its own tempdir so backup files do not collide.
+func newTestEngine(t *testing.T) (*Engine, *store.Store, string) {
+	t.Helper()
+	dir := t.TempDir()
+	dbPath := filepath.Join(dir, "tinyforge.db")
+	st, err := store.New(dbPath)
+	if err != nil {
+		t.Fatalf("store.New: %v", err)
+	}
+	t.Cleanup(func() { _ = st.Close() })
+
+	eng, err := New(st, dbPath, dir)
+	if err != nil {
+		t.Fatalf("backup.New: %v", err)
+	}
+	return eng, st, dbPath
+}
+
+func TestPrepareRestore_RejectsTinyFile(t *testing.T) {
+	eng, st, _ := newTestEngine(t)
+
+	// Plant a backup row with a tiny file masquerading as a backup.
+	tinyPath := filepath.Join(eng.BackupDir(), "tinyforge-manual-junk.db")
+	if err := os.WriteFile(tinyPath, []byte("hi"), 0o600); err != nil {
+		t.Fatalf("write tiny: %v", err)
+	}
+	bk, err := st.CreateBackup(store.Backup{
+		Filename:   "tinyforge-manual-junk.db",
+		SizeBytes:  2,
+		BackupType: "manual",
+	})
+	if err != nil {
+		t.Fatalf("CreateBackup row: %v", err)
+	}
+
+	if _, err := eng.PrepareRestore(bk.ID); err == nil {
+		t.Fatal("expected PrepareRestore to reject tiny file, got nil")
+	} else if !strings.Contains(err.Error(), "suspiciously small") {
+		t.Errorf("error = %v, want 'suspiciously small'", err)
+	}
+}
+
+func TestPrepareRestore_RejectsNonSQLite(t *testing.T) {
+	eng, st, _ := newTestEngine(t)
+
+	// 200 bytes of non-SQLite garbage: passes the size check, fails
+	// the header magic check.
+	garbagePath := filepath.Join(eng.BackupDir(), "tinyforge-manual-bogus.db")
+	junk := make([]byte, 200)
+	for i := range junk {
+		junk[i] = byte('x')
+	}
+	if err := os.WriteFile(garbagePath, junk, 0o600); err != nil {
+		t.Fatalf("write junk: %v", err)
+	}
+	bk, err := st.CreateBackup(store.Backup{
+		Filename:   "tinyforge-manual-bogus.db",
+		SizeBytes:  int64(len(junk)),
+		BackupType: "manual",
+	})
+	if err != nil {
+		t.Fatalf("CreateBackup row: %v", err)
+	}
+
+	if _, err := eng.PrepareRestore(bk.ID); err == nil {
+		t.Fatal("expected PrepareRestore to reject non-SQLite blob, got nil")
+	} else if !strings.Contains(err.Error(), "header") {
+		t.Errorf("error = %v, want header mismatch", err)
+	}
+}
+
+func TestPrepareRestore_AcceptsValidVacuumInto(t *testing.T) {
+	eng, _, _ := newTestEngine(t)
+
+	// A fresh CreateBackup from the engine itself is, by construction,
+	// a valid SQLite database — VACUUM INTO produces a clean copy.
+	bk, err := eng.CreateBackup("manual")
+	if err != nil {
+		t.Fatalf("CreateBackup: %v", err)
+	}
+	path, err := eng.PrepareRestore(bk.ID)
+	if err != nil {
+		t.Fatalf("PrepareRestore on valid backup: %v", err)
+	}
+	if path == "" {
+		t.Errorf("PrepareRestore returned empty path")
+	}
+}
+
+func TestPrepareRestore_UnknownID(t *testing.T) {
+	eng, _, _ := newTestEngine(t)
+
+	_, err := eng.PrepareRestore("nonexistent-id")
+	if err == nil {
+		t.Fatal("expected error for unknown id, got nil")
+	}
+	if errors.Is(err, store.ErrNotFound) {
+		// fine — wrapped through RestorePath
+	}
+}
@@ -10,11 +10,26 @@ import (
 	"fmt"
 	"io"
 	"os"
+	"strings"
 )

 // ErrNoKey is returned when ENCRYPTION_KEY is not set.
 var ErrNoKey = errors.New("ENCRYPTION_KEY environment variable is not set")

+// ErrDecryptFailed wraps any cipher.Open / decoder failure. Callers
+// upgrading from the silent-fallback pattern (treat-as-plaintext when
+// decrypt errored) MUST instead surface this — a rotated key would
+// otherwise silently leak ciphertext to upstream services as if it
+// were plaintext.
+var ErrDecryptFailed = errors.New("crypto: decrypt failed (wrong key, corrupted ciphertext, or unversioned legacy value)")
+
+// envelopeV1Prefix tags ciphertext produced by Encrypt going forward.
+// Older databases may carry unprefixed hex blobs from the v0 era; those
+// are still readable via Decrypt for backward compatibility, but every
+// new write goes through Encrypt and emits the prefix so a future key
+// rotation has a clean fail-loud signal.
+const envelopeV1Prefix = "tf1:"
+
 // DeriveKey computes a 32-byte AES-256 key from the given passphrase using SHA-256.
 // This is acceptable when ENCRYPTION_KEY is a high-entropy random string (e.g., 32+ hex chars).
 // For human-chosen passphrases, consider Argon2id or PBKDF2 with a salt instead.
@@ -35,7 +50,8 @@ func KeyFromEnv() ([32]byte, error) {
 }

 // Encrypt encrypts plaintext using AES-256-GCM with a random nonce.
-// The returned ciphertext is hex-encoded: nonce || ciphertext+tag.
+// Returns a versioned envelope (tf1:<hex>) so downstream readers can
+// distinguish ciphertext from accidentally-stored plaintext.
 func Encrypt(key [32]byte, plaintext string) (string, error) {
 	block, err := aes.NewCipher(key[:])
 	if err != nil {
@@ -53,14 +69,34 @@ func Encrypt(key [32]byte, plaintext string) (string, error) {
 	}

 	sealed := gcm.Seal(nonce, nonce, []byte(plaintext), nil)
-	return hex.EncodeToString(sealed), nil
+	return envelopeV1Prefix + hex.EncodeToString(sealed), nil
 }

-// Decrypt decrypts a hex-encoded ciphertext produced by Encrypt.
-func Decrypt(key [32]byte, ciphertextHex string) (string, error) {
-	data, err := hex.DecodeString(ciphertextHex)
+// HasEnvelope reports whether the value is a v1-prefixed ciphertext.
+// Useful for router-level "decrypt only if encrypted" decision points
+// that previously relied on `err == nil` from a try-decrypt — that
+// pattern silently masked rotated-key failures.
+func HasEnvelope(value string) bool {
+	return strings.HasPrefix(value, envelopeV1Prefix)
+}
+
+// Decrypt decrypts an envelope (tf1:<hex>). For backward compatibility
+// it also accepts unprefixed hex from the v0 era — but only when the
+// resulting plaintext is valid; a wrong key for legacy data now returns
+// ErrDecryptFailed instead of silently treating ciphertext as
+// plaintext.
+//
+// Callers MUST NOT swallow the error and fall back to "use as-is".
+// That pattern is the exact footgun the envelope versioning removes.
+func Decrypt(key [32]byte, ciphertext string) (string, error) {
+	hexBlob := ciphertext
+	if strings.HasPrefix(hexBlob, envelopeV1Prefix) {
+		hexBlob = hexBlob[len(envelopeV1Prefix):]
+	}
+
+	data, err := hex.DecodeString(hexBlob)
 	if err != nil {
-		return "", fmt.Errorf("decode hex: %w", err)
+		return "", fmt.Errorf("%w: decode hex: %v", ErrDecryptFailed, err)
 	}

 	block, err := aes.NewCipher(key[:])
@@ -75,15 +111,15 @@ func Decrypt(key [32]byte, ciphertextHex string) (string, error) {

 	nonceSize := gcm.NonceSize()
 	if len(data) < nonceSize {
-		return "", errors.New("ciphertext too short")
+		return "", fmt.Errorf("%w: ciphertext too short", ErrDecryptFailed)
 	}

 	nonce := data[:nonceSize]
-	ciphertext := data[nonceSize:]
+	body := data[nonceSize:]

-	plaintext, err := gcm.Open(nil, nonce, ciphertext, nil)
+	plaintext, err := gcm.Open(nil, nonce, body, nil)
 	if err != nil {
-		return "", fmt.Errorf("decrypt: %w", err)
+		return "", fmt.Errorf("%w: %v", ErrDecryptFailed, err)
 	}

 	return string(plaintext), nil
@@ -34,7 +34,19 @@ type Deployer struct {
 	dnsMu    sync.RWMutex
 	dns      dns.Provider // nil when wildcard DNS is active

+	// proxyMu protects hot-swap of d.proxy from runtime settings updates
+	// (SetProxyProvider) racing with PluginDeps() reads on the deploy path.
+	proxyMu sync.RWMutex
+
 	// Graceful shutdown: tracks in-progress deploys.
+	//
+	// drainMu serializes the "is-draining check + activeWg.Add(1)" in
+	// beginDispatch against the "set shuttingDown + Wait()" in Drain. Without
+	// it, a dispatch could pass the draining check, Drain could then flip the
+	// flag and start Wait() with a zero counter, and the dispatch could call
+	// Add(1) concurrently with Wait — a documented sync.WaitGroup misuse
+	// (panic risk) that also lets a deploy slip past the drain barrier.
+	drainMu      sync.Mutex
 	activeWg     sync.WaitGroup
 	shuttingDown atomic.Bool
 }
@@ -73,7 +85,11 @@ func New(
 }

 // SetProxyProvider updates the proxy provider at runtime (e.g., when settings change).
+// Guarded by proxyMu so concurrent deploys that read d.proxy via PluginDeps()
+// observe a coherent value (previously a torn-pointer race under -race).
 func (d *Deployer) SetProxyProvider(provider proxy.Provider) {
+	d.proxyMu.Lock()
+	defer d.proxyMu.Unlock()
 	d.proxy = provider
 }

@@ -110,8 +126,11 @@ func (d *Deployer) SetDNSProvider(provider dns.Provider) {

 // Drain waits for all in-progress deploys to complete. Call this during graceful shutdown.
 func (d *Deployer) Drain() {
-	if !d.shuttingDown.CompareAndSwap(false, true) {
-		// Already draining.
+	d.drainMu.Lock()
+	already := d.shuttingDown.Swap(true)
+	d.drainMu.Unlock()
+	if already {
+		slog.Info("deployer: drain already in progress")
 	}
 	slog.Info("deployer: draining in-progress deploys")
 	d.activeWg.Wait()
@@ -121,11 +140,17 @@ func (d *Deployer) Drain() {
 // ShuttingDown reports whether Drain() has been called.
 func (d *Deployer) ShuttingDown() bool { return d.shuttingDown.Load() }

-// rejectIfDraining is exposed in case any plugin wants the same hard-stop
-// behaviour the legacy pipeline used.
-func (d *Deployer) rejectIfDraining() error {
+// beginDispatch atomically rejects when draining and otherwise registers the
+// in-flight unit on activeWg. The shuttingDown check and the Add(1) MUST be
+// done together under drainMu (see the field comment): Drain sets the flag
+// under the same mutex before Wait(), so once Wait() observes a zero counter
+// no further Add can race it. Callers must defer d.activeWg.Done() on success.
+func (d *Deployer) beginDispatch() error {
+	d.drainMu.Lock()
+	defer d.drainMu.Unlock()
 	if d.shuttingDown.Load() {
 		return fmt.Errorf("deployer is shutting down, rejecting new deploy")
 	}
+	d.activeWg.Add(1)
 	return nil
 }
@@ -4,6 +4,7 @@ import (
 	"context"
 	"fmt"

+	"github.com/alexei/tinyforge/internal/metrics"
 	"github.com/alexei/tinyforge/internal/workload/plugin"
 )

@@ -14,16 +15,37 @@ import (
 // triggers + image deploys still go through the legacy path, while
 // /api/hooks/generic + the unified webhook ingress go through here.
 func (d *Deployer) DispatchPlugin(ctx context.Context, w plugin.Workload, intent plugin.DeploymentIntent) error {
+	if err := d.beginDispatch(); err != nil {
+		metrics.DeploysTotal.Inc(w.SourceKind, "rejected_draining")
+		return err
+	}
+	defer d.activeWg.Done()
 	src, err := plugin.GetSource(w.SourceKind)
 	if err != nil {
+		// Unknown source: use the constant "unknown" sentinel for the
+		// label so a typo-spam attack can't grow the metrics map with
+		// one series per bogus source_kind. The actual user-supplied
+		// value still surfaces via the wrapped error / event log.
+		metrics.DeploysTotal.Inc("unknown", "unknown_source")
 		return fmt.Errorf("dispatch %s: %w", w.Name, err)
 	}
-	return src.Deploy(ctx, d.PluginDeps(), w, intent)
+	err = src.Deploy(ctx, d.PluginDeps(), w, intent)
+	outcome := "success"
+	if err != nil {
+		outcome = "failure"
+	}
+	metrics.DeploysTotal.Inc(w.SourceKind, outcome)
+	return err
 }

 // DispatchTeardown routes a teardown call to the matching Source plugin.
-// Used when a workload is deleted.
+// Used when a workload is deleted. Tracked via activeWg so Drain() honours
+// in-progress teardowns just like deploys.
 func (d *Deployer) DispatchTeardown(ctx context.Context, w plugin.Workload) error {
+	if err := d.beginDispatch(); err != nil {
+		return err
+	}
+	defer d.activeWg.Done()
 	src, err := plugin.GetSource(w.SourceKind)
 	if err != nil {
 		return fmt.Errorf("dispatch teardown %s: %w", w.Name, err)
@@ -33,8 +55,17 @@ func (d *Deployer) DispatchTeardown(ctx context.Context, w plugin.Workload) erro

 // DispatchReconcile routes a Reconcile call. Periodic reconciler iterates
 // every Workload and calls this; idle Sources should make it a cheap
-// no-op.
+// no-op. Tracked via activeWg so a long-running reconcile blocks Drain().
 func (d *Deployer) DispatchReconcile(ctx context.Context, w plugin.Workload) error {
+	if err := d.beginDispatch(); err != nil {
+		// Silent skip — reconcile is a periodic tick, not a user-initiated
+		// action, so we don't want to surface "draining" errors back to the
+		// reconciler loop. The next tick after restart will catch up. Routing
+		// through beginDispatch keeps the activeWg.Add atomic with the drain
+		// check (see Drain) instead of a bare shuttingDown.Load + Add race.
+		return nil
+	}
+	defer d.activeWg.Done()
 	src, err := plugin.GetSource(w.SourceKind)
 	if err != nil {
 		return fmt.Errorf("dispatch reconcile %s: %w", w.Name, err)
@@ -52,10 +83,13 @@ func (d *Deployer) PluginDeps() plugin.Deps {
 	d.dnsMu.RLock()
 	dnsProvider := d.dns
 	d.dnsMu.RUnlock()
+	d.proxyMu.RLock()
+	proxyProvider := d.proxy
+	d.proxyMu.RUnlock()
 	return plugin.Deps{
 		Store:    d.store,
 		Docker:   d.docker,
-		Proxy:    d.proxy,
+		Proxy:    proxyProvider,
 		DNS:      dnsProvider,
 		Health:   d.health,
 		Notifier: d.notifier,
@@ -2,20 +2,58 @@ package docker

 import (
 	"archive/tar"
+	"bufio"
 	"context"
+	"encoding/json"
 	"fmt"
 	"io"
 	"os"
 	"path/filepath"
 	"strings"

+	"github.com/moby/moby/api/types/build"
 	"github.com/moby/moby/client"
 )

-// BuildImage builds a Docker image from a directory containing a Dockerfile.
-// The directory is packaged as a tar archive and sent to the Docker daemon.
-// The tag parameter is the image name:tag to apply (e.g., "dw-site-myapp:latest").
+// BuildImage builds a Docker image from a directory containing a Dockerfile
+// at the context root. Kept as a thin wrapper around BuildImageAt for the
+// static-site plugin which always emits its generated Dockerfile at the
+// context root. New code should prefer BuildImageAt so the Dockerfile path
+// is explicit.
 func (c *Client) BuildImage(ctx context.Context, contextDir, tag string) error {
+	return c.BuildImageAt(ctx, contextDir, "Dockerfile", tag, nil)
+}
+
+// BuildImageAt builds a Docker image from a tar of contextDir, using the
+// Dockerfile at `dockerfile` *inside* the context (typically "Dockerfile"
+// but may be e.g. "docker/Dockerfile" when the user-supplied repo layout
+// keeps Dockerfiles in a subfolder).
+//
+// The dockerfile argument is the path *relative to contextDir*. Empty
+// strings are normalised to "Dockerfile" so callers can pass through a
+// user config value without sanitising twice.
+//
+// logFn, if non-nil, is invoked for every non-empty `stream` line the
+// daemon emits during the build. Callers use this to forward live build
+// progress (e.g. SSE bus). Errors from the daemon are NOT delivered via
+// logFn — they surface as the returned error so the caller's failure
+// path stays the single source of truth.
+func (c *Client) BuildImageAt(ctx context.Context, contextDir, dockerfile, tag string, logFn func(line string)) error {
+	if dockerfile == "" {
+		dockerfile = "Dockerfile"
+	}
+	// Normalise to forward slashes — the tar entry names use them and the
+	// Docker daemon expects the same.
+	dockerfile = filepath.ToSlash(dockerfile)
+	// Defence-in-depth: the dockerfile path is relative to contextDir and
+	// is increasingly user/config-supplied (subfolder Dockerfiles). Reject
+	// absolute paths and any `..` traversal at the boundary so a value like
+	// "../../etc/passwd" can never be handed to the daemon's build options,
+	// regardless of which builder backend resolves it.
+	if filepath.IsAbs(dockerfile) || strings.HasPrefix(dockerfile, "/") ||
+		dockerfile == ".." || strings.HasPrefix(dockerfile, "../") || strings.Contains(dockerfile, "/../") {
+		return fmt.Errorf("docker build: invalid dockerfile path %q (must be relative to the build context, no traversal)", dockerfile)
+	}
 	// Create tar archive of the build context.
 	pr, pw := io.Pipe()

@@ -50,16 +88,14 @@ func (c *Client) BuildImage(ctx context.Context, contextDir, tag string) error {
 				return nil
 			}

-			file, err := os.Open(path)
-			if err != nil {
-				return fmt.Errorf("open %s: %w", path, err)
+			// Stream the file through a helper that closes it explicitly
+			// as soon as the copy finishes. A `defer file.Close()` here
+			// would also run once per WalkFunc call, but the helper makes
+			// the one-fd-at-a-time discipline explicit and surfaces the
+			// Close error instead of discarding it.
+			if err := streamFileIntoTar(tw, path, relPath); err != nil {
+				return err
 			}
-			defer file.Close()
-
-			if _, err := io.Copy(tw, file); err != nil {
-				return fmt.Errorf("copy %s to tar: %w", relPath, err)
-			}
-
 			return nil
 		})

@@ -69,8 +105,16 @@ func (c *Client) BuildImage(ctx context.Context, contextDir, tag string) error {
 		pw.CloseWithError(err)
 	}()

+	// Pin the legacy builder explicitly. On Docker Engine 23+ BuildKit
+	// is the default for the CLI, but the daemon honours the explicit
+	// Version field on ImageBuildOptions. Legacy builder does NOT support
+	// `RUN --mount=type=bind,source=/host` so a malicious Dockerfile
+	// cannot mount host paths into the build context. Switching to
+	// BuildKit later requires (a) Dockerfile-content validation to
+	// reject bind-mount hints, or (b) an explicit per-workload opt-in.
 	resp, err := c.api.ImageBuild(ctx, pr, client.ImageBuildOptions{
-		Dockerfile:  "Dockerfile",
+		Version:     build.BuilderV1,
+		Dockerfile:  dockerfile,
 		Tags:        []string{tag},
 		Remove:      true,
 		ForceRemove: true,
@@ -80,16 +124,71 @@ func (c *Client) BuildImage(ctx context.Context, contextDir, tag string) error {
 	}
 	defer resp.Body.Close()

-	// Read the build output to completion (required for the build to finish).
-	output, err := io.ReadAll(resp.Body)
-	if err != nil {
+	// Drain the daemon's NDJSON stream to completion. The stream MUST
+	// be read for the build to finish — closing the body early aborts
+	// the build. We parse line-by-line into the {Stream, Error} shape
+	// the daemon emits so an honest `{"error":"..."}` line surfaces
+	// without false positives from informational `{"stream":"error
+	// handling: retrying..."}` chatter that the old strings.Contains
+	// path would have flagged.
+	type buildLine struct {
+		Stream string `json:"stream,omitempty"`
+		Error  string `json:"error,omitempty"`
+	}
+	scanner := bufio.NewScanner(resp.Body)
+	// Some build steps emit single lines exceeding the default 64 KiB
+	// (e.g. a fat go-mod-download dump). Bump to 1 MiB so we don't
+	// silently truncate and miss the trailing error line.
+	scanner.Buffer(make([]byte, 64*1024), 1024*1024)
+	var firstErr string
+	for scanner.Scan() {
+		line := scanner.Bytes()
+		if len(line) == 0 {
+			continue
+		}
+		var bl buildLine
+		if err := json.Unmarshal(line, &bl); err != nil {
+			// Non-JSON line — daemon shouldn't produce these, but
+			// don't fail the build over a parse hiccup.
+			continue
+		}
+		if bl.Error != "" && firstErr == "" {
+			firstErr = bl.Error
+		}
+		if logFn != nil && bl.Stream != "" {
+			logFn(bl.Stream)
+		}
+	}
+	if err := scanner.Err(); err != nil {
 		return fmt.Errorf("read build output for %s: %w", tag, err)
 	}
-
-	// Check for error in build output.
-	if strings.Contains(string(output), `"error"`) {
-		return fmt.Errorf("build image %s: build errors in output", tag)
+	if firstErr != "" {
+		return fmt.Errorf("build image %s: %s", tag, firstErr)
 	}

 	return nil
 }
+
+// streamFileIntoTar opens path, copies its contents into the tar writer
+// under the given relPath header, and closes the file *before returning*
+// — i.e. once per file, not deferred to the end of the entire walk.
+// Extracted so the per-iteration close discipline is obvious at the
+// callsite and the file handle isn't accidentally hoisted into the
+// caller's defer stack via a future refactor.
+func streamFileIntoTar(tw *tar.Writer, path, relPath string) error {
+	file, err := os.Open(path)
+	if err != nil {
+		return fmt.Errorf("open %s: %w", path, err)
+	}
+	_, copyErr := io.Copy(tw, file)
+	// Close BEFORE returning so the fd is released even on copy
+	// failure. Capture both errors so the more-specific copy error
+	// wins when both fire.
+	if cerr := file.Close(); cerr != nil && copyErr == nil {
+		copyErr = cerr
+	}
+	if copyErr != nil {
+		return fmt.Errorf("copy %s to tar: %w", relPath, copyErr)
+	}
+	return nil
+}
@@ -27,6 +27,13 @@ const (

 	// EventStackStatus is emitted when a compose stack status changes.
 	EventStackStatus EventType = "stack_status"
+
+	// EventBuildLog is emitted for each line of a streaming image build.
+	// Per-line events are ephemeral (not persisted to the event_log) — they
+	// exist to drive a live tail UI during the slow "building" phase of a
+	// dockerfile-source deploy. Subscribers should filter by WorkloadID
+	// because every dockerfile deploy on the box publishes on the same bus.
+	EventBuildLog EventType = "build_log"
 )

 // Event is a single event published on the bus.
@@ -77,6 +84,14 @@ type StaticSiteStatusPayload struct {
 	Status string `json:"status"`
 }

+// BuildLogPayload is the payload for EventBuildLog events. One event
+// per non-empty line read off the daemon's NDJSON build stream.
+type BuildLogPayload struct {
+	WorkloadID string `json:"workload_id"`
+	Line       string `json:"line"`
+	Stream     string `json:"stream,omitempty"`
+}
+
 // StackStatusPayload is the payload for EventStackStatus events.
 type StackStatusPayload struct {
 	StackID string `json:"stack_id"`
@@ -0,0 +1,250 @@
+// Package metrics provides a minimal Prometheus text-format exposition
+// of Tinyforge's operational counters. We deliberately do NOT import the
+// official client_golang library: the metrics set here is small, the text
+// format is simple, and avoiding the dependency keeps `tinyforge` a fast
+// single-binary install.
+//
+// Every series is a sync/atomic.Int64 behind a brief mutex-guarded map
+// lookup — cheap and safe to touch from any goroutine. Histograms / gauges
+// aren't modeled yet; the few we need (request latency p50/p99) live
+// downstream of slog and can be added when the operator actually wants them.
+package metrics
+
+import (
+	"fmt"
+	"io"
+	"log/slog"
+	"sort"
+	"strings"
+	"sync"
+	"sync/atomic"
+)
+
+// Registry holds the process-wide counter set. Create one with
+// newRegistry — the zero value has a nil counters map, so NewCounter
+// would panic on it. See DefaultRegistry below for the global handle.
+type Registry struct {
+	mu       sync.RWMutex
+	counters map[string]*counter
+}
+
+type counter struct {
+	name   string
+	help   string
+	labels []string // label names, ordered as declared at registration
+	series map[string]*atomic.Int64
+	// seriesMu guards every access to the series map (lookup, insertion,
+	// snapshot); only the increment itself is lock-free via the atomic.
+	seriesMu sync.Mutex
+}
+
+// DefaultRegistry is the process-wide registry. All Tinyforge metrics
+// register against it. Tests can instantiate their own Registry.
+var DefaultRegistry = newRegistry()
+
+func newRegistry() *Registry {
+	return &Registry{counters: make(map[string]*counter)}
+}
+
+// NewCounter declares a counter on the default registry. Call once at
+// package init or during NewServer; subsequent calls with the same name
+// return the existing counter so re-registration is safe.
+//
+// label names define the dimensions; calls to Inc must pass values in
+// the same order. Use the empty slice for label-less counters.
+func NewCounter(name, help string, labels ...string) *Counter {
+	return DefaultRegistry.NewCounter(name, help, labels...)
+}
+
+// NewCounter on a specific Registry — useful in tests.
+func (r *Registry) NewCounter(name, help string, labels ...string) *Counter {
+	r.mu.Lock()
+	defer r.mu.Unlock()
+	if c, ok := r.counters[name]; ok {
+		return &Counter{c: c}
+	}
+	c := &counter{
+		name:   name,
+		help:   help,
+		labels: append([]string(nil), labels...),
+		series: make(map[string]*atomic.Int64),
+	}
+	r.counters[name] = c
+	return &Counter{c: c}
+}
+
+// Counter is the public handle returned by NewCounter. Pass it around as
+// a value — the underlying state lives on the registry.
+type Counter struct {
+	c *counter
+}
+
+// Inc atomically increments the counter for the given label values.
+// Passing the wrong number of values is a programmer error; Add logs it
+// at error level and drops the sample rather than panicking or silently
+// aggregating into a bogus series.
+func (c Counter) Inc(labelValues ...string) {
+	c.Add(1, labelValues...)
+}
+
+// Add atomically adds delta. Negative delta is rejected (counters are
+// monotonic by definition).
+func (c Counter) Add(delta int64, labelValues ...string) {
+	if delta < 0 {
+		return
+	}
+	if len(labelValues) != len(c.c.labels) {
+		// Programmer error. This used to panic to surface the bug, but Add
+		// runs on hot paths (HTTP middleware, deploy dispatch) and several
+		// callers are off the request goroutine, where a panic would take
+		// down the whole process rather than a single request. Log loudly
+		// and drop the sample so a mislabeled call site can never crash the
+		// server; the bug still shows up immediately in the logs and in
+		// tests via the error output.
+		slog.Error("metrics: label count mismatch — dropping sample",
+			"counter", c.c.name, "want", len(c.c.labels), "got", len(labelValues))
+		return
+	}
+	key := encodeKey(labelValues)
+	c.c.seriesMu.Lock()
+	v, ok := c.c.series[key]
+	if !ok {
+		v = new(atomic.Int64)
+		c.c.series[key] = v
+	}
+	c.c.seriesMu.Unlock()
+	v.Add(delta)
+}
+
+// encodeKey joins label values with a 0x1f separator. Prometheus label
+// values may contain anything except `"` and `\n`, which we escape on
+// exposition only — the key here is just a map index.
+func encodeKey(values []string) string {
+	return strings.Join(values, "\x1f")
+}
+
+// WritePrometheus dumps the registry in the text exposition format
+// Prometheus / VictoriaMetrics / OpenMetrics understands. Stable
+// ordering: counters alphabetical by name; series alphabetical by
+// encoded label tuple.
+func (r *Registry) WritePrometheus(w io.Writer) error {
+	r.mu.RLock()
+	names := make([]string, 0, len(r.counters))
+	for n := range r.counters {
+		names = append(names, n)
+	}
+	r.mu.RUnlock()
+	sort.Strings(names)
+
+	for _, name := range names {
+		r.mu.RLock()
+		c := r.counters[name]
+		r.mu.RUnlock()
+		if err := writeCounter(w, c); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+func writeCounter(w io.Writer, c *counter) error {
+	if _, err := fmt.Fprintf(w, "# HELP %s %s\n# TYPE %s counter\n", c.name, escapeHelp(c.help), c.name); err != nil {
+		return err
+	}
+	// Snapshot the series map under a SINGLE lock acquisition. The
+	// previous shape acquired+released seriesMu twice per emitted
+	// series (once for the key list, once per Load), contending with
+	// every hot-path Inc on the HTTP request path. The *atomic.Int64
+	// pointers are stable for the lifetime of the registry (we never
+	// delete entries), so reading them after the unlock is safe.
+	type sample struct {
+		key string
+		val *atomic.Int64
+	}
+	c.seriesMu.Lock()
+	samples := make([]sample, 0, len(c.series))
+	for k, v := range c.series {
+		samples = append(samples, sample{k, v})
+	}
+	c.seriesMu.Unlock()
+
+	sort.Slice(samples, func(i, j int) bool { return samples[i].key < samples[j].key })
+
+	for _, s := range samples {
+		val := s.val.Load()
+		labels := decodeKey(s.key, c.labels)
+		if labels == "" {
+			if _, err := fmt.Fprintf(w, "%s %d\n", c.name, val); err != nil {
+				return err
+			}
+			continue
+		}
+		if _, err := fmt.Fprintf(w, "%s{%s} %d\n", c.name, labels, val); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+func decodeKey(key string, names []string) string {
+	if key == "" || len(names) == 0 {
+		return ""
+	}
+	values := strings.Split(key, "\x1f")
+	if len(values) != len(names) {
+		// Should not happen — encodeKey/decode are symmetric.
+		return ""
+	}
+	parts := make([]string, len(names))
+	for i, n := range names {
+		parts[i] = fmt.Sprintf(`%s="%s"`, n, escapeLabelValue(values[i]))
+	}
+	return strings.Join(parts, ",")
+}
+
+func escapeHelp(s string) string {
+	r := strings.NewReplacer("\\", "\\\\", "\n", "\\n")
+	return r.Replace(s)
+}
+
+func escapeLabelValue(s string) string {
+	r := strings.NewReplacer("\\", "\\\\", "\n", "\\n", `"`, `\"`)
+	return r.Replace(s)
+}
+
+// ── Pre-declared counters ────────────────────────────────────────────
+//
+// These are the counters Tinyforge surfaces to operators. Adding more is
+// a one-line NewCounter call at the call site — no central catalogue,
+// just keep names lowercase_snake with the `tinyforge_` prefix.
+
+var (
+	HTTPRequestsTotal = NewCounter(
+		"tinyforge_http_requests_total",
+		"Total HTTP requests handled, partitioned by method and outcome class.",
+		"method", "status_class",
+	)
+	DeploysTotal = NewCounter(
+		"tinyforge_deploys_total",
+		"Total deploys dispatched, partitioned by source kind and outcome.",
+		"source_kind", "outcome",
+	)
+	WebhookDeliveriesTotal = NewCounter(
+		"tinyforge_webhook_deliveries_total",
+		"Total inbound webhook deliveries, partitioned by outcome.",
+		"outcome",
+	)
+	SchedulerTicksTotal = NewCounter(
+		"tinyforge_scheduler_ticks_total",
+		"Total scheduler ticks. The dispatched counter is the success measure.",
+	)
+	SchedulerDispatchedTotal = NewCounter(
+		"tinyforge_scheduler_dispatched_total",
+		"Triggers actually dispatched by the scheduler.",
+	)
+	OutboundNotifyTotal = NewCounter(
+		"tinyforge_outbound_notify_total",
+		"Outbound notification dispatch attempts, partitioned by outcome.",
+		"outcome",
+	)
+)
@@ -16,6 +16,8 @@ import (
 	"time"

 	"github.com/google/uuid"
+
+	"github.com/alexei/tinyforge/internal/metrics"
 )

 // Event represents a deployment / site-sync notification payload.
@@ -83,17 +85,68 @@ type TestResult struct {
 // Notifications are fire-and-forget by default — failures are logged but do
 // not propagate. SendSyncForTest is the exception, used only by the manual
 // test endpoint.
+//
+// outboundSem caps the number of in-flight outbound notifications. Without
+// it a single burst (e.g. 1000 event triggers firing on a noisy log scan)
+// would spawn 1000 simultaneous TCP connections, which both DoSes the
+// receiver and exhausts local FDs.
 type Notifier struct {
-	httpClient *http.Client
-	wg         sync.WaitGroup
+	httpClient  *http.Client   // HTTP client built once in New
+	wg          sync.WaitGroup // counts send goroutines that are still running
+	outboundSem chan struct{}  // counting semaphore; capacity caps in-flight sends
 }

+// maxOutboundNotifications bounds the in-flight outbound webhook fan-out.
+// Sized to keep small bursts non-blocking while preventing a runaway storm
+// from starving the rest of the process. Tunable later via settings if any
+// operator legitimately needs more concurrency.
+const maxOutboundNotifications = 32
+
 // New creates a Notifier with sensible defaults.
 func New() *Notifier {
+	// Transport with bounded host pooling so a slow receiver cannot pin
+	// arbitrarily many sockets open. MaxConnsPerHost mirrors the worker
+	// pool size; idle pruning keeps long-lived processes from holding
+	// stale TCP entries indefinitely.
+	//
+	// Proxy is set explicitly: a hand-built Transport starts from the zero
+	// value, whose nil Proxy ignores HTTP_PROXY / HTTPS_PROXY / NO_PROXY.
+	// The client used before this transport existed went through
+	// http.DefaultTransport, which honours them, so leaving Proxy unset
+	// would silently break installs that reach their receivers through an
+	// egress proxy. ProxyFromEnvironment never proxies localhost/loopback
+	// targets, so same-host sidecars keep working.
+	//
+	// NOTE: we deliberately do NOT apply the staticsite SSRF dialer here.
+	// Notification URLs are admin-configured, and an admin already has
+	// Docker-socket (host-root-equivalent) access, so the SSRF surface adds
+	// nothing they couldn't already reach. Blocking loopback/private targets
+	// would instead break the common self-hosted pattern of notifying a
+	// same-host sidecar/bridge (e.g. service-to-notification-bridge on
+	// 127.0.0.1). See the security review (rated LOW / out of trust boundary).
+	tr := &http.Transport{
+		Proxy:               http.ProxyFromEnvironment,
+		MaxIdleConns:        64,
+		MaxIdleConnsPerHost: 8,
+		MaxConnsPerHost:     maxOutboundNotifications,
+		IdleConnTimeout:     90 * time.Second,
+	}
 	return &Notifier{
 		httpClient: &http.Client{
-			Timeout: 10 * time.Second,
+			Timeout:   10 * time.Second,
+			Transport: tr,
 		},
+		// One buffer slot per permitted in-flight send.
+		outboundSem: make(chan struct{}, maxOutboundNotifications),
+	}
+}
+
+// acquireSlot waits until an outbound slot is free or ctx is done,
+// whichever comes first, so a backed-up queue cannot hold a sender past its
+// own deadline. It reports true when a slot was reserved (the caller must
+// pair it with releaseSlot) and false when ctx ended first.
+func (n *Notifier) acquireSlot(ctx context.Context) bool {
+	token := struct{}{}
+	select {
+	case <-ctx.Done():
+		return false
+	case n.outboundSem <- token:
+		return true
+	}
+}
+
+// releaseSlot hands back one outbound slot. It never blocks: when the
+// semaphore holds no token the call is a no-op.
+func (n *Notifier) releaseSlot() {
+	select {
+	default:
+		// Drained during shutdown — never block.
+	case <-n.outboundSem:
 	}
 }

@@ -128,8 +181,15 @@ func (n *Notifier) SendSigned(webhookURL, secret string, tier Tier, event Event)
 	n.wg.Add(1)
 	go func() {
 		defer n.wg.Done()
-		ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
+		ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second)
 		defer cancel()
+		if !n.acquireSlot(ctx) {
+			slog.Warn("notify: dropped — outbound queue saturated",
+				"tier", tier, "host", safeHost(webhookURL), "delivery", delivery, "event", event.Type)
+			metrics.OutboundNotifyTotal.Inc("dropped")
+			return
+		}
+		defer n.releaseSlot()

 		_, err := n.doSend(ctx, webhookURL, secret, tier, delivery, event)
 		// URL host only — never log the secret or full URL with user-info.
@@ -138,11 +198,13 @@ func (n *Notifier) SendSigned(webhookURL, secret string, tier Tier, event Event)
 			slog.Warn("notify: webhook send failed",
 				"tier", tier, "host", host, "delivery", delivery,
 				"event", event.Type, "signed", secret != "", "error", err)
+			metrics.OutboundNotifyTotal.Inc("failure")
 			return
 		}
 		slog.Info("notify: webhook dispatched",
 			"tier", tier, "host", host, "delivery", delivery,
 			"event", event.Type, "signed", secret != "")
+		metrics.OutboundNotifyTotal.Inc("success")
 	}()
 }

@@ -166,8 +228,15 @@ func (n *Notifier) SendPayload(webhookURL, secret, eventType string, payload any
 	n.wg.Add(1)
 	go func() {
 		defer n.wg.Done()
-		ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
+		ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second)
 		defer cancel()
+		if !n.acquireSlot(ctx) {
+			slog.Warn("notify: dropped trigger payload — outbound queue saturated",
+				"tier", TierEventTrigger, "host", safeHost(webhookURL), "delivery", delivery, "event", eventType)
+			metrics.OutboundNotifyTotal.Inc("dropped")
+			return
+		}
+		defer n.releaseSlot()

 		_, err := n.doSendRaw(ctx, webhookURL, secret, TierEventTrigger, delivery, eventType, timestamp, payload)
 		host := safeHost(webhookURL)
@@ -175,11 +244,13 @@ func (n *Notifier) SendPayload(webhookURL, secret, eventType string, payload any
 			slog.Warn("notify: trigger webhook send failed",
 				"tier", TierEventTrigger, "host", host, "delivery", delivery,
 				"event", eventType, "signed", secret != "", "error", err)
+			metrics.OutboundNotifyTotal.Inc("failure")
 			return
 		}
 		slog.Info("notify: trigger webhook dispatched",
 			"tier", TierEventTrigger, "host", host, "delivery", delivery,
 			"event", eventType, "signed", secret != "")
+		metrics.OutboundNotifyTotal.Inc("success")
 	}()
 }

@@ -27,6 +27,7 @@ import (
 	"sync"
 	"time"

+	"github.com/alexei/tinyforge/internal/metrics"
 	"github.com/alexei/tinyforge/internal/store"
 	"github.com/alexei/tinyforge/internal/workload/plugin"
 	"github.com/alexei/tinyforge/internal/workload/plugin/trigger/schedule"
@@ -124,6 +125,7 @@ func (s *Scheduler) loop(ctx context.Context) {
 // TickOnce runs a single sweep. Exposed for tests and for the boot
 // kick. On error per-trigger the loop continues with the next row.
 func (s *Scheduler) TickOnce(ctx context.Context) {
+	metrics.SchedulerTicksTotal.Inc()
 	rows, err := s.store.ListTriggers("schedule")
 	if err != nil {
 		slog.Warn("scheduler: list triggers", "error", err)
@@ -226,5 +228,6 @@ func (s *Scheduler) fire(ctx context.Context, t store.Trigger, now time.Time) {
 		slog.Warn("scheduler: dispatch", "trigger", t.Name, "error", err)
 		return
 	}
+	metrics.SchedulerDispatchedTotal.Inc()
 	slog.Info("scheduler: fired", "trigger", t.Name, "kind", t.Kind, "at", ts)
 }
@@ -92,17 +92,27 @@ func (c *Compose) Ps(ctx context.Context, projectName, yamlPath string) ([]Servi
 }

 // Logs runs `docker compose -p <projectName> logs --no-color --tail=<n> <service>`.
-// If service is empty, logs for all services are returned.
+// If service is empty, logs for all services are returned. The service arg
+// is preceded by `--` so a service name that begins with `-` cannot be
+// re-parsed as a flag by the docker CLI (flag-injection guard).
 func (c *Compose) Logs(ctx context.Context, projectName, service string, tail int) (string, error) {
+	// tail is forwarded verbatim as --tail=<n>.
 	args := []string{"logs", "--no-color", fmt.Sprintf("--tail=%d", tail)}
 	if service != "" {
-		args = append(args, service)
+		// "--" ends option parsing, so the service name is always taken as
+		// a positional argument, whatever it starts with.
+		args = append(args, "--", service)
 	}
+	// run prepends `compose -p <projectName>` to args.
 	return c.run(ctx, projectName, args...)
 }

-// run executes `docker compose -p <projectName> <args...>` and returns combined output.
+// run executes `docker compose -p <projectName> <args...>` and returns
+// combined output. projectName is verified not to begin with `-` because
+// `docker compose -p '--foo'` would otherwise be re-parsed as a flag —
+// the callers already sanitize project names through projectNameSanitizer,
+// but a belt-and-braces refusal here means any future caller cannot
+// accidentally bypass the sanitizer.
 func (c *Compose) run(ctx context.Context, projectName string, args ...string) (string, error) {
+	if projectName == "" || strings.HasPrefix(projectName, "-") {
+		return "", fmt.Errorf("docker compose: refusing project name %q", projectName)
+	}
 	full := append([]string{"compose", "-p", projectName}, args...)
 	cmd := exec.CommandContext(ctx, c.binary, full...)
 	var buf bytes.Buffer
@@ -2,6 +2,7 @@ package stack

 import (
 	"fmt"
+	"path"
+	"strings"
 
 	"gopkg.in/yaml.v3"
 )
@@ -15,11 +16,25 @@ type ComposeSpec struct {
 }

 // ServiceSpec captures the subset of compose service fields we inspect.
+//
+// All host-escape-adjacent fields are decoded here even though Tinyforge
+// itself never reads them at runtime — surfacing them to Validate() is the
+// only way to *reject* them. Add new fields here when blocking a new
+// escape vector.
 type ServiceSpec struct {
-	Image      string            `yaml:"image,omitempty"`
-	Ports      []any             `yaml:"ports,omitempty"`
-	Labels     map[string]string `yaml:"labels,omitempty"`
-	Privileged bool              `yaml:"privileged,omitempty"`
+	Image       string            `yaml:"image,omitempty"`
+	Build       any               `yaml:"build,omitempty"` // banned — see Validate
+	Ports       []any             `yaml:"ports,omitempty"`
+	Labels      map[string]string `yaml:"labels,omitempty"`
+	Privileged  bool              `yaml:"privileged,omitempty"`
+	Volumes     []any             `yaml:"volumes,omitempty"`
+	NetworkMode string            `yaml:"network_mode,omitempty"`
+	Pid         string            `yaml:"pid,omitempty"`
+	Ipc         string            `yaml:"ipc,omitempty"`
+	UsernsMode  string            `yaml:"userns_mode,omitempty"`
+	CapAdd      []string          `yaml:"cap_add,omitempty"`
+	Devices     []any             `yaml:"devices,omitempty"`
+	SecurityOpt []string          `yaml:"security_opt,omitempty"`
 }

 // Parse decodes YAML into a ComposeSpec. Returns a descriptive error on failure.
@@ -35,10 +50,20 @@ func Parse(yamlText string) (ComposeSpec, error) {
 }

 // Validate enforces Tinyforge-level constraints beyond compose schema validity.
+// All blocked fields below are documented host-escape vectors: any one of
+// them on its own gives the container root on the host. Tinyforge already
+// owns the docker socket, so the threat model is "any admin == host root,"
+// and these blocks raise the bar for any *future* viewer-to-admin
+// escalation as well as honest-mistake guardrails.
+//
 // Current rules:
 //   - No service may set `privileged: true`.
-//   - Every service must declare an image (compose supports build: too, but
-//     Tinyforge v1 disallows building from context to avoid arbitrary-code exec).
+//   - Every service must declare an image (build contexts disallowed).
+//   - No host-IPC / host-PID / host-userns / host networking.
+//   - No `cap_add`, `security_opt`, `devices`.
+//   - `volumes` may not bind-mount the docker socket, /, /etc, /var, /proc,
+//     /sys, /root, or /home — list is conservative; operators with real
+//     bind-mount needs should ship a Source plugin or a dedicated wizard.
 func Validate(spec ComposeSpec) error {
 	for name, svc := range spec.Services {
 		if svc.Privileged {
@@ -47,6 +72,121 @@ func Validate(spec ComposeSpec) error {
 		if svc.Image == "" {
 			return fmt.Errorf("service %q: image is required (build contexts not supported)", name)
 		}
+		if svc.Build != nil {
+			return fmt.Errorf("service %q: build: is not supported (use image:)", name)
+		}
+		if isBlockedNamespaceMode(svc.NetworkMode) {
+			return fmt.Errorf("service %q: network_mode %q is not allowed", name, svc.NetworkMode)
+		}
+		if isBlockedNamespaceMode(svc.Pid) {
+			return fmt.Errorf("service %q: pid: %q is not allowed", name, svc.Pid)
+		}
+		if isBlockedNamespaceMode(svc.Ipc) {
+			return fmt.Errorf("service %q: ipc: %q is not allowed", name, svc.Ipc)
+		}
+		if isHostMode(svc.UsernsMode) {
+			return fmt.Errorf("service %q: userns_mode %q is not allowed", name, svc.UsernsMode)
+		}
+		if len(svc.CapAdd) > 0 {
+			return fmt.Errorf("service %q: cap_add is not allowed", name)
+		}
+		if len(svc.SecurityOpt) > 0 {
+			return fmt.Errorf("service %q: security_opt is not allowed", name)
+		}
+		if len(svc.Devices) > 0 {
+			return fmt.Errorf("service %q: devices is not allowed", name)
+		}
+		for _, v := range svc.Volumes {
+			if host, ok := bindMountHostPath(v); ok {
+				if isBlockedBindMount(host) {
+					return fmt.Errorf("service %q: bind-mounting %q is not allowed", name, host)
+				}
+			}
+		}
 	}
 	return nil
 }
+
+// isHostMode reports whether v requests sharing a host namespace, i.e. a
+// network_mode / pid / ipc / userns_mode value of exactly "host". The match
+// is exact on purpose: "host-gateway" is an extra_hosts value, not a
+// namespace mode, and matching it here only produced misleading rejections.
+func isHostMode(v string) bool {
+	const hostNamespace = "host"
+	return v == hostNamespace
+}
+
+// isBlockedNamespaceMode reports whether v is a namespace mode that must be
+// rejected for network_mode / pid / ipc. Two families are refused:
+//
+//   - host sharing ("host");
+//   - joining another container's or compose service's namespace
+//     ("container:<id>", "service:<name>") — a lateral-movement and
+//     sandbox-escape vector, since a malicious service could attach to a
+//     victim container's network or PID namespace.
+func isBlockedNamespaceMode(v string) bool {
+	switch {
+	case isHostMode(v):
+		return true
+	case strings.HasPrefix(v, "container:"), strings.HasPrefix(v, "service:"):
+		return true
+	default:
+		return false
+	}
+}
+
+// bindMountHostPath extracts the host-side path from a compose volume
+// declaration. Compose accepts two shapes:
+//
+//   - short string form "[source:]target[:mode]";
+//   - long form map with "type" / "source" / "target" keys.
+//
+// It returns (path, true) only when the entry names a host path (a source
+// starting with "/", "." or "~"). It returns ok=false for named volumes,
+// for long-form entries whose type is set to something other than "bind",
+// and for short-form entries with no ":" at all — a lone "/var/lib/mysql"
+// is a *container* path backed by an anonymous volume, so there is no host
+// source to vet and the entry must not be run through the bind-mount
+// deny-list.
+func bindMountHostPath(v any) (string, bool) {
+	looksLikeHostPath := func(src string) bool {
+		return strings.HasPrefix(src, "/") || strings.HasPrefix(src, ".") || strings.HasPrefix(src, "~")
+	}
+	switch t := v.(type) {
+	case string:
+		src, _, hasTarget := strings.Cut(t, ":")
+		if !hasTarget {
+			// Target-only entry (or empty string): anonymous volume.
+			return "", false
+		}
+		// "named:/in/container" has no '/', '.' or '~' prefix on the source.
+		if looksLikeHostPath(src) {
+			return src, true
+		}
+	case map[string]any:
+		// An omitted type is still inspected: the source decides.
+		if typ, _ := t["type"].(string); typ != "" && typ != "bind" {
+			return "", false
+		}
+		if src, ok := t["source"].(string); ok && looksLikeHostPath(src) {
+			return src, true
+		}
+	}
+	return "", false
+}
+
+// isBlockedBindMount returns true for host paths that obviously escape the
+// container's intended sandbox. Conservative deny-list — operators with
+// legitimate bind-mount needs should write a dedicated Source plugin
+// rather than tunnel them through compose.
+//
+// The path is lexically normalized (path.Clean) before matching, so
+// spellings such as "//var/run/docker.sock", "/./etc" or "/opt/../etc"
+// cannot slip past the deny-list. Normalization is purely textual:
+// symlinks on the host are not resolved.
+func isBlockedBindMount(host string) bool {
+	if host == "" {
+		return true
+	}
+	// Relative ("./x", "../x", ".") and home-relative ("~/...") sources are
+	// resolved by Docker against the compose working directory (which
+	// Tinyforge controls and never intends as a host-bind source) or left
+	// unexpanded — and "../" can climb out of that directory entirely. The
+	// absolute-prefix deny-list below can't see these, so reject them
+	// outright rather than give a false sense of coverage.
+	if strings.HasPrefix(host, ".") || strings.HasPrefix(host, "~") {
+		return true
+	}
+	// Collapses duplicate slashes, "." and ".." segments, and any trailing
+	// slash, so "/var" and "/var/" both match.
+	clean := path.Clean(host)
+	// "/" is the whole host filesystem; a leading "." after cleaning means
+	// the path climbed above its own starting point ("a/../..").
+	if clean == "/" || strings.HasPrefix(clean, ".") {
+		return true
+	}
+	// Specific blocked files / sockets.
+	switch clean {
+	case "/var/run/docker.sock", "/run/docker.sock":
+		return true
+	}
+	// Blocked prefixes (cover sub-paths too). "/run" is listed because
+	// mounting it whole exposes /run/docker.sock, which is blocked above.
+	blocked := []string{"/etc", "/var", "/run", "/proc", "/sys", "/root", "/home", "/boot", "/dev"}
+	for _, p := range blocked {
+		if clean == p || strings.HasPrefix(clean, p+"/") {
+			return true
+		}
+	}
+	return false
+}
@@ -50,34 +50,7 @@ func ValidateBaseURL(raw string) error {
 func NewSafeHTTPClient(timeout time.Duration) *http.Client {
 	dialer := &net.Dialer{Timeout: 10 * time.Second, KeepAlive: 30 * time.Second}
 	transport := &http.Transport{
-		DialContext: func(ctx context.Context, network, addr string) (net.Conn, error) {
-			host, port, err := net.SplitHostPort(addr)
-			if err != nil {
-				return nil, err
-			}
-			// If the caller passed a literal IP, skip the DNS round-trip.
-			if literal := net.ParseIP(host); literal != nil {
-				if reason := blockReason(literal); reason != "" {
-					return nil, fmt.Errorf("%w: %s (%s)", ErrBlockedAddress, literal, reason)
-				}
-				return dialer.DialContext(ctx, network, addr)
-			}
-			ips, err := net.DefaultResolver.LookupIPAddr(ctx, host)
-			if err != nil {
-				return nil, err
-			}
-			if len(ips) == 0 {
-				return nil, fmt.Errorf("no addresses for %s", host)
-			}
-			for _, ip := range ips {
-				if reason := blockReason(ip.IP); reason != "" {
-					return nil, fmt.Errorf("%w: %s (%s)", ErrBlockedAddress, ip.IP, reason)
-				}
-			}
-			// Bind to the first resolved IP so a rebind between resolution
-			// and connect cannot redirect the request to a blocked address.
-			return dialer.DialContext(ctx, network, net.JoinHostPort(ips[0].IP.String(), port))
-		},
+		DialContext:         SafeDialContext(dialer),
 		MaxIdleConns:        16,
 		IdleConnTimeout:     30 * time.Second,
 		TLSHandshakeTimeout: 10 * time.Second,
@@ -85,6 +58,43 @@ func NewSafeHTTPClient(timeout time.Duration) *http.Client {
 	return &http.Client{Timeout: timeout, Transport: transport}
 }

+// SafeDialContext returns a DialContext that vets every destination with
+// blockReason before connecting and fails with an error wrapping
+// ErrBlockedAddress when the destination is refused. Hostnames are resolved
+// here and the connection is made to the resolved IP rather than the name,
+// so a DNS rebind between resolution and connect cannot slip through.
+// Exposed so other transports can apply the same SSRF policy without
+// duplicating it or losing their own connection-pool tuning.
+//
+// A nil dialer is replaced by a zero-value net.Dialer. If ANY resolved
+// address is refused the whole dial is refused; otherwise the resolved
+// addresses are tried in resolver order until one connects, so a hostname
+// whose first record is unreachable (e.g. an AAAA record on a v4-only host)
+// still works. The returned error is the first dial failure.
+func SafeDialContext(dialer *net.Dialer) func(ctx context.Context, network, addr string) (net.Conn, error) {
+	if dialer == nil {
+		dialer = &net.Dialer{}
+	}
+	return func(ctx context.Context, network, addr string) (net.Conn, error) {
+		host, port, err := net.SplitHostPort(addr)
+		if err != nil {
+			return nil, err
+		}
+		// If the caller passed a literal IP, skip the DNS round-trip.
+		if literal := net.ParseIP(host); literal != nil {
+			if reason := blockReason(literal); reason != "" {
+				return nil, fmt.Errorf("%w: %s (%s)", ErrBlockedAddress, literal, reason)
+			}
+			return dialer.DialContext(ctx, network, addr)
+		}
+		ips, err := net.DefaultResolver.LookupIPAddr(ctx, host)
+		if err != nil {
+			return nil, err
+		}
+		if len(ips) == 0 {
+			return nil, fmt.Errorf("no addresses for %s", host)
+		}
+		// Vet the full answer before dialing anything: one blocked record
+		// poisons the whole name.
+		for _, ip := range ips {
+			if reason := blockReason(ip.IP); reason != "" {
+				return nil, fmt.Errorf("%w: %s (%s)", ErrBlockedAddress, ip.IP, reason)
+			}
+		}
+		// Dial the vetted IPs themselves (never the hostname), so a rebind
+		// between resolution and connect cannot redirect the request to a
+		// blocked address.
+		var firstErr error
+		for _, ip := range ips {
+			conn, dialErr := dialer.DialContext(ctx, network, net.JoinHostPort(ip.IP.String(), port))
+			if dialErr == nil {
+				return conn, nil
+			}
+			if firstErr == nil {
+				firstErr = dialErr
+			}
+			if ctx.Err() != nil {
+				// Deadline hit or caller cancelled: further attempts
+				// would fail the same way.
+				break
+			}
+		}
+		return nil, firstErr
+	}
+}
+
 // blockReason returns a human label for why an IP is rejected, or ""
 // if the IP is allowed. Centralized so all callers share the same
 // policy.
@@ -92,6 +102,13 @@ func blockReason(ip net.IP) string {
 	if ip == nil {
 		return "nil address"
 	}
+	// Normalize IPv4-mapped IPv6 (::ffff:x.x.x.x) so the loopback / link-local
+	// classifiers below catch them. net.IP.To4() returns the 4-byte form for
+	// IPv4-mapped addresses; net's IsLoopback already handles this, but pin
+	// the conversion to avoid future surprises if the std-lib semantics drift.
+	if v4 := ip.To4(); v4 != nil {
+		ip = v4
+	}
 	switch {
 	case ip.IsLoopback():
 		return "loopback"
@@ -104,5 +121,22 @@ func blockReason(ip net.IP) string {
 	case ip.IsMulticast():
 		return "multicast"
 	}
+	// Cloud metadata endpoints — AWS / GCP / Azure are covered by the
+	// link-local block (169.254.169.254). The rest must be enumerated.
+	if metadataIPSet[ip.String()] {
+		return "cloud metadata endpoint"
+	}
 	return ""
 }
+
+// metadataIPSet enumerates well-known cloud metadata IPs that are NOT
+// covered by net.IP.IsLinkLocalUnicast. Updating this set is the lightest
+// way to keep up with new providers without changing the policy shape.
+//
+// Keys are compared against net.IP.String() output (blockReason looks up
+// ip.String()), so every entry must be written in exactly that form: dotted
+// quad for IPv4, shortest lowercase form for IPv6.
+var metadataIPSet = map[string]bool{
+	// Alibaba Cloud ECS metadata.
+	"100.100.100.200": true,
+	// Oracle Cloud Infrastructure metadata.
+	"192.0.0.192": true,
+	// AWS IMDS over IPv6 (ULA — not link-local, must be listed).
+	"fd00:ec2::254": true,
+}
@@ -234,17 +234,17 @@ func (c *Collector) sampleAll(ctx context.Context, targets []target) []store.Con
 	found := make([]bool, len(targets))

 	var wg sync.WaitGroup
+loop:
 	for i, t := range targets {
 		// Acquire the semaphore in the parent loop so ctx cancellation
 		// short-circuits the queue rather than spawning goroutines that
-		// block on an unreachable slot.
+		// block on an unreachable slot. The labelled break exits the for
+		// loop directly; a bare `break` inside `select` would only break
+		// the select and let the loop continue.
 		select {
 		case sem <- struct{}{}:
 		case <-ctx.Done():
-			break
-		}
-		if ctx.Err() != nil {
-			break
+			break loop
 		}
 		wg.Add(1)
 		go func(i int, t target) {
@@ -2,6 +2,7 @@ package store

 import (
 	"database/sql"
+	"encoding/json"
 	"errors"
 	"fmt"
 	"strings"
@@ -9,6 +10,22 @@ import (
 	"github.com/google/uuid"
 )

+// validateExtraJSON guards the extra_json column at the store boundary: it
+// returns an error when v is non-empty and not a syntactically valid JSON
+// document, and nil otherwise (the empty string is accepted as-is).
+//
+// Readers tolerate unknown keys, but only if the value parses at all; a
+// plugin writing a malformed value would otherwise break every reader with
+// no schema-level check to catch it. The error reports the payload size
+// only, never the payload.
+func validateExtraJSON(v string) error {
+	if v == "" || json.Valid([]byte(v)) {
+		return nil
+	}
+	return fmt.Errorf("extra_json: not valid JSON (%d bytes)", len(v))
+}
+
 // containerColumns is the canonical column list for `containers` queries.
 // stage_id is populated by the deployer for project containers (so ListProxyRoutes
 // survives stage renames) and left empty for stacks and sites.
@@ -42,6 +59,9 @@ func (s *Store) CreateContainer(c Container) (Container, error) {
 	if c.ExtraJSON == "" {
 		c.ExtraJSON = "{}"
 	}
+	if err := validateExtraJSON(c.ExtraJSON); err != nil {
+		return Container{}, err
+	}

 	_, err := s.db.Exec(
 		`INSERT INTO containers (`+containerColumns+`)
@@ -77,6 +97,9 @@ func (s *Store) UpsertContainer(c Container) error {
 	if c.ExtraJSON == "" {
 		c.ExtraJSON = "{}"
 	}
+	if err := validateExtraJSON(c.ExtraJSON); err != nil {
+		return err
+	}

 	// SQLite UPSERT — INSERT...ON CONFLICT(id) DO UPDATE.
 	_, err := s.db.Exec(
@@ -129,6 +152,9 @@ func (s *Store) ReconcileContainer(c Container) error {
 	if c.ExtraJSON == "" {
 		c.ExtraJSON = "{}"
 	}
+	if err := validateExtraJSON(c.ExtraJSON); err != nil {
+		return err
+	}

 	// extra_json is deliberately NOT in the ON CONFLICT SET clause: the
 	// reconciler can't observe per-face route IDs from Docker, and
@@ -321,6 +347,9 @@ func (s *Store) UpdateContainer(c Container) error {
 	if c.ExtraJSON == "" {
 		c.ExtraJSON = "{}"
 	}
+	if err := validateExtraJSON(c.ExtraJSON); err != nil {
+		return err
+	}
 	result, err := s.db.Exec(
 		`UPDATE containers SET workload_id=?, workload_kind=?, role=?, stage_id=?, container_id=?,
 			image_ref=?, image_tag=?, host=?, state=?, port=?,
@@ -0,0 +1,171 @@
+package store
+
+import (
+	"errors"
+	"fmt"
+	"os"
+	"path/filepath"
+	"strconv"
+	"strings"
+)
+
+// ErrLockHeld is returned when another Tinyforge process appears to be
+// running against the same data directory. SQLite + SetMaxOpenConns(1)
+// makes this otherwise-silent collision a recipe for double-fired
+// schedulers, double-polled registries, and `extra_json` RMW corruption.
+var ErrLockHeld = errors.New("data directory is locked by another tinyforge process")
+
+// Lockfile is a portable PID file. AcquireLockfile takes it; the returned
+// Release function removes it. The contract:
+//
+//   - Lockfile is created with O_CREATE|O_EXCL — atomic on POSIX, atomic
+//     on NTFS / ReFS via the equivalent.
+//   - On collision, the existing file's PID is read; if the PID is dead,
+//     we treat the lock as stale (process crashed without cleanup),
+//     reclaim it, and proceed. Live PID → ErrLockHeld.
+//   - flock is intentionally not used: cross-platform consistency wins
+//     over advisory-lock semantics for the single-instance use case.
+type Lockfile struct {
+	path string
+}
+
+// AcquireLockfile creates a PID-file lock under dataDir. Returns a
+// Release function the caller must defer. If another live process holds
+// the lock, returns ErrLockHeld with a hint pointing at the lockfile.
+//
+// Reclaim atomicity: when the existing lockfile names a dead PID, the
+// replacement is serialized through an auxiliary reclaim lock (see
+// reclaimStaleLock) so that, of N processes booting concurrently against
+// the same stale lockfile, EXACTLY ONE reclaims it and the rest get
+// ErrLockHeld. A bare `os.Remove`+`O_EXCL` retry — or a rename, which is
+// "last-writer-wins" — cannot guarantee this: multiple reclaimers can each
+// end up believing they own the lock, defeating the single-instance guard.
+func AcquireLockfile(dataDir string) (release func(), err error) {
+	lockPath := filepath.Join(dataDir, "tinyforge.lock")
+
+	// Fast path: nobody holds the lock, so the exclusive create succeeds.
+	created, ok, createErr := tryCreateExclusive(lockPath)
+	switch {
+	case ok:
+		return created, nil
+	case createErr != nil:
+		return nil, createErr
+	}
+
+	// A lockfile already exists. Its recorded PID decides what happens next.
+	holder, readErr := readLockPID(lockPath)
+	if readErr != nil {
+		// Unreadable or unparsable: handled like a stale lock, under
+		// reclaim serialization.
+		return reclaimStaleLock(lockPath, "malformed existing lockfile")
+	}
+	if processAlive(holder) {
+		return nil, fmt.Errorf("%w (held by pid %d, lockfile=%s)", ErrLockHeld, holder, lockPath)
+	}
+	// The holder is gone without cleaning up — reclaim under serialization.
+	return reclaimStaleLock(lockPath, fmt.Sprintf("stale lockfile (dead pid %d)", holder))
+}
+
+// tryCreateExclusive attempts an atomic O_CREATE|O_EXCL create at path.
+// Returns (release, true, nil) on success; (nil, false, nil) when the
+// file already exists; (nil, false, err) on any other error.
+func tryCreateExclusive(path string) (func(), bool, error) {
+	const flags = os.O_CREATE | os.O_EXCL | os.O_WRONLY
+	f, openErr := os.OpenFile(path, flags, 0o600)
+	switch {
+	case openErr == nil:
+		// We created the file; fall through and record our PID.
+	case os.IsExist(openErr):
+		// Someone else got there first — not an error, just "not ours".
+		return nil, false, nil
+	default:
+		return nil, false, fmt.Errorf("open lockfile: %w", openErr)
+	}
+
+	_, writeErr := fmt.Fprintf(f, "%d\n", os.Getpid())
+	if writeErr != nil {
+		// Don't leave a half-written lockfile behind; cleanup is best
+		// effort, the write error is what gets reported.
+		_ = f.Close()
+		_ = os.Remove(path)
+		return nil, false, fmt.Errorf("write lockfile: %w", writeErr)
+	}
+	closeErr := f.Close()
+	if closeErr != nil {
+		_ = os.Remove(path)
+		return nil, false, fmt.Errorf("close lockfile: %w", closeErr)
+	}
+
+	// Releasing the lock is just removing the file; errors are ignored
+	// because there is nothing useful the caller could do with them.
+	unlock := func() { _ = os.Remove(path) }
+	return unlock, true, nil
+}
+
+// reclaimStaleLock replaces a stale/malformed lockfile with one holding our
+// PID, serialized by an auxiliary reclaim lock. Holding the reclaim lock
+// (O_EXCL) guarantees that only one process performs the remove-and-recreate
+// of the main lockfile at a time, so concurrent reclaimers cannot each end
+// up "owning" the lock the way a rename or unguarded remove+create would
+// allow. The reclaim lock is itself liveness-checked so a reclaimer that
+// crashed mid-reclaim cannot wedge startup forever.
+func reclaimStaleLock(lockPath, reason string) (func(), error) {
+	// The auxiliary lock lives next to the main lockfile. Any failure to
+	// take it is reported to the caller as ErrLockHeld.
+	reclaimPath := lockPath + ".reclaim"
+	if err := acquireReclaimLock(reclaimPath); err != nil {
+		return nil, fmt.Errorf("%w (%v; %s)", ErrLockHeld, err, reason)
+	}
+	// Drop the auxiliary lock on every exit path, success or failure.
+	defer func() { _ = os.Remove(reclaimPath) }()
+
+	// Serialized now. Re-check the main lock: another process may have fully
+	// reclaimed it between our liveness probe and our taking the reclaim lock.
+	if pid, perr := readLockPID(lockPath); perr == nil && processAlive(pid) {
+		return nil, fmt.Errorf("%w (reclaimed by pid %d while we waited; %s)",
+			ErrLockHeld, pid, reason)
+	}
+
+	// Safe to replace: remove the stale file, then create a fresh exclusive
+	// one. Both run while we hold the reclaim lock, so no other reclaimer can
+	// observe the gap.
+	if err := os.Remove(lockPath); err != nil && !os.IsNotExist(err) {
+		return nil, fmt.Errorf("%w (could not remove stale lockfile %s: %v; %s)",
+			ErrLockHeld, lockPath, err, reason)
+	}
+	rel, ok, err := tryCreateExclusive(lockPath)
+	if err != nil {
+		// A genuine create/write failure is passed through as-is, not
+		// wrapped in ErrLockHeld.
+		return nil, err
+	}
+	if !ok {
+		// Should be impossible while we hold the reclaim lock; fail safe.
+		return nil, fmt.Errorf("%w (lockfile reappeared during reclaim of %s; %s)",
+			ErrLockHeld, lockPath, reason)
+	}
+	// rel removes the main lockfile when the caller releases the lock.
+	return rel, nil
+}
+
+// acquireReclaimLock takes the auxiliary reclaim lock with O_EXCL. An
+// existing reclaim lock is honoured only while its recorded PID is alive (a
+// genuine concurrent reclaim); a stale one (dead/foreign PID) is removed once
+// and re-attempted so a crashed reclaimer cannot block boot indefinitely. Of
+// concurrent callers, O_EXCL ensures at most one acquires it; the rest fail
+// and back off to ErrLockHeld.
+func acquireReclaimLock(reclaimPath string) error {
+	// Two attempts at most: the second one only runs after a stale reclaim
+	// lock has been cleared by the first.
+	for attempt := 0; attempt < 2; attempt++ {
+		f, err := os.OpenFile(reclaimPath, os.O_CREATE|os.O_EXCL|os.O_WRONLY, 0o600)
+		if err == nil {
+			_, werr := fmt.Fprintf(f, "%d\n", os.Getpid())
+			cerr := f.Close()
+			if werr != nil {
+				_ = os.Remove(reclaimPath)
+				return fmt.Errorf("write reclaim lock %s: %w", reclaimPath, werr)
+			}
+			if cerr != nil {
+				// A failed close means the PID may never have reached the
+				// file. Remove it rather than leave a reclaim lock behind
+				// that we are about to report as not acquired.
+				_ = os.Remove(reclaimPath)
+				return fmt.Errorf("close reclaim lock %s: %w", reclaimPath, cerr)
+			}
+			return nil
+		}
+		if !os.IsExist(err) {
+			return fmt.Errorf("create reclaim lock %s: %w", reclaimPath, err)
+		}
+		// Reclaim lock present. A live owner means a real concurrent reclaim.
+		if pid, perr := readLockPID(reclaimPath); perr == nil && processAlive(pid) {
+			return fmt.Errorf("concurrent reclaim in progress (pid %d)", pid)
+		}
+		// Stale reclaim lock — clear it and retry the exclusive create once.
+		if rerr := os.Remove(reclaimPath); rerr != nil && !os.IsNotExist(rerr) {
+			return fmt.Errorf("remove stale reclaim lock %s: %w", reclaimPath, rerr)
+		}
+	}
+	return fmt.Errorf("could not acquire reclaim lock %s after retry", reclaimPath)
+}
+
+// readLockPID reads the PID recorded in the lockfile at path.
+//
+// It returns an error when the file cannot be read, is empty apart from
+// whitespace, does not hold a decimal integer, or holds a value <= 0. Zero
+// and negative values never identify a single process; depending on how
+// the liveness probe is implemented they could be reported as "alive" and
+// wedge startup behind a lock nobody holds. Rejecting them here makes the
+// callers treat such a file as malformed instead.
+func readLockPID(path string) (int, error) {
+	data, err := os.ReadFile(path)
+	if err != nil {
+		return 0, err
+	}
+	pidStr := strings.TrimSpace(string(data))
+	if pidStr == "" {
+		return 0, errors.New("empty lockfile")
+	}
+	pid, err := strconv.Atoi(pidStr)
+	if err != nil {
+		return 0, fmt.Errorf("parse lockfile pid %q: %w", pidStr, err)
+	}
+	if pid <= 0 {
+		return 0, fmt.Errorf("invalid lockfile pid %d", pid)
+	}
+	return pid, nil
+}
@@ -0,0 +1,137 @@
+package store
+
+import (
+	"errors"
+	"fmt"
+	"os"
+	"path/filepath"
+	"sync"
+	"testing"
+)
+
+// TestAcquireLockfile_FreshDir checks that acquiring the lock in an empty
+// directory succeeds and leaves a tinyforge.lock file containing this
+// process's PID followed by a newline.
+func TestAcquireLockfile_FreshDir(t *testing.T) {
+	tmp := t.TempDir()
+	unlock, err := AcquireLockfile(tmp)
+	if err != nil {
+		t.Fatalf("AcquireLockfile: %v", err)
+	}
+	defer unlock()
+
+	lockPath := filepath.Join(tmp, "tinyforge.lock")
+	contents, err := os.ReadFile(lockPath)
+	if err != nil {
+		t.Fatalf("read lockfile: %v", err)
+	}
+	if expected := fmt.Sprintf("%d\n", os.Getpid()); string(contents) != expected {
+		t.Errorf("lockfile content = %q, want %q", contents, expected)
+	}
+}
+
+// TestAcquireLockfile_HeldByLivePID_Refused plants a lockfile recording the
+// test process's own PID (necessarily alive) and expects AcquireLockfile to
+// refuse with an error wrapping ErrLockHeld.
+func TestAcquireLockfile_HeldByLivePID_Refused(t *testing.T) {
+	tmp := t.TempDir()
+	lockPath := filepath.Join(tmp, "tinyforge.lock")
+	livePID := []byte(fmt.Sprintf("%d\n", os.Getpid()))
+	if err := os.WriteFile(lockPath, livePID, 0o600); err != nil {
+		t.Fatalf("plant lockfile: %v", err)
+	}
+
+	unlock, err := AcquireLockfile(tmp)
+	switch {
+	case err == nil:
+		unlock()
+		t.Fatal("expected ErrLockHeld, got nil")
+	case !errors.Is(err, ErrLockHeld):
+		t.Errorf("error = %v, want wrap of ErrLockHeld", err)
+	}
+}
+
+func TestAcquireLockfile_StalePID_Reclaimed(t *testing.T) {
+	dir := t.TempDir()
+	// PID 1 is init/launchd/systemd on POSIX and the System Idle Process
+	// on Windows — never our process, and very unlikely to be dead. We
+	// use a deliberately-impossible PID instead: a 31-bit value far
+	// above any plausible system maximum.
+	stalePID := 2147483640
+	if err := os.WriteFile(filepath.Join(dir, "tinyforge.lock"),
+		[]byte(fmt.Sprintf("%d\n", stalePID)), 0o600); err != nil {
+		t.Fatalf("plant stale lockfile: %v", err)
+	}
+	release, err := AcquireLockfile(dir)
+	if err != nil {
+		t.Fatalf("expected reclaim of stale lock, got: %v", err)
+	}
+	defer release()
+
+	// Verify it now holds OUR pid, not the stale one.
+	data, err := os.ReadFile(filepath.Join(dir, "tinyforge.lock"))
+	if err != nil {
+		t.Fatalf("read lockfile after reclaim: %v", err)
+	}
+	want := fmt.Sprintf("%d\n", os.Getpid())
+	if string(data) != want {
+		t.Errorf("lockfile content after reclaim = %q, want %q", data, want)
+	}
+}
+
+// TestAcquireLockfile_ConcurrentReclaim_SingleWinner races n goroutines to
+// reclaim one stale lockfile and asserts that exactly one of them obtains the
+// lock while every other call fails with an error wrapping ErrLockHeld.
+func TestAcquireLockfile_ConcurrentReclaim_SingleWinner(t *testing.T) {
+	dir := t.TempDir()
+	// Plant a stale lockfile (impossibly high, certainly-dead PID), then have
+	// many goroutines race to reclaim it. Exactly one must win; the rest must
+	// be refused with ErrLockHeld. A "last-writer-wins" reclaim would let
+	// several goroutines all believe they own the lock.
+	stalePID := 2147483640
+	if err := os.WriteFile(filepath.Join(dir, "tinyforge.lock"),
+		[]byte(fmt.Sprintf("%d\n", stalePID)), 0o600); err != nil {
+		t.Fatalf("plant stale lockfile: %v", err)
+	}
+
+	const n = 16
+	var (
+		wg       sync.WaitGroup
+		mu       sync.Mutex // guards winners and releases
+		winners  int
+		releases []func()
+	)
+	// start is a barrier: every goroutine blocks on it so that all of them
+	// call AcquireLockfile at (roughly) the same moment once it is closed.
+	start := make(chan struct{})
+	for i := 0; i < n; i++ {
+		wg.Add(1)
+		go func() {
+			defer wg.Done()
+			<-start
+			release, err := AcquireLockfile(dir)
+			if err != nil {
+				// Losers must fail with ErrLockHeld specifically. t.Errorf
+				// (not Fatal) because this is not the test goroutine.
+				if !errors.Is(err, ErrLockHeld) {
+					t.Errorf("loser error = %v, want wrap of ErrLockHeld", err)
+				}
+				return
+			}
+			mu.Lock()
+			winners++
+			releases = append(releases, release)
+			mu.Unlock()
+		}()
+	}
+	close(start)
+	wg.Wait()
+
+	// Release only after every goroutine has finished, so a winner's lock is
+	// still held while the others are attempting to acquire.
+	for _, r := range releases {
+		r()
+	}
+	if winners != 1 {
+		t.Fatalf("concurrent reclaim winners = %d, want exactly 1", winners)
+	}
+}
+
+// TestAcquireLockfile_ReleaseRemovesFile checks that calling the release
+// function returned by AcquireLockfile removes tinyforge.lock from disk.
+func TestAcquireLockfile_ReleaseRemovesFile(t *testing.T) {
+	tmp := t.TempDir()
+	unlock, err := AcquireLockfile(tmp)
+	if err != nil {
+		t.Fatalf("AcquireLockfile: %v", err)
+	}
+	unlock()
+
+	_, statErr := os.Stat(filepath.Join(tmp, "tinyforge.lock"))
+	if !os.IsNotExist(statErr) {
+		t.Errorf("lockfile still present after release: %v", statErr)
+	}
+}
@@ -0,0 +1,33 @@
+//go:build !windows
+
+package store
+
+import (
+	"errors"
+	"os"
+	"syscall"
+)
+
+// processAlive reports whether pid refers to a running process.
+//
+// Non-positive PIDs are never considered alive. For anything else the probe
+// is signal 0: it delivers nothing, succeeds when the process exists, and
+// fails with a permission error when the process exists but belongs to
+// someone else — which still counts as alive here. Every other failure is
+// treated as "not alive".
+//
+// The error from os.FindProcess is deliberately not inspected; the call is
+// only there to obtain the *os.Process handle that Signal needs.
+func processAlive(pid int) bool {
+	if pid <= 0 {
+		return false
+	}
+	handle, _ := os.FindProcess(pid)
+	if handle == nil {
+		return false
+	}
+	switch probeErr := handle.Signal(syscall.Signal(0)); {
+	case probeErr == nil:
+		return true
+	case errors.Is(probeErr, os.ErrPermission), errors.Is(probeErr, syscall.EPERM):
+		// EPERM = alive but not ours.
+		return true
+	default:
+		// ESRCH and anything else = not alive.
+		return false
+	}
+}
@@ -0,0 +1,30 @@
+//go:build windows
+
+package store
+
+import (
+	"golang.org/x/sys/windows"
+)
+
+// processAlive reports whether pid is currently held by a running Windows
+// process. It opens the process with PROCESS_QUERY_LIMITED_INFORMATION and
+// compares its exit code against STILL_ACTIVE (259).
+//
+// Non-positive PIDs and PIDs that cannot be opened are reported as not alive.
+// If the exit code cannot be queried the process is conservatively reported
+// as alive, so an active lock is never reclaimed by mistake; the worst case
+// is that the operator sees ErrLockHeld and removes the lockfile by hand.
+func processAlive(pid int) bool {
+	if pid <= 0 {
+		return false
+	}
+	handle, openErr := windows.OpenProcess(windows.PROCESS_QUERY_LIMITED_INFORMATION, false, uint32(pid))
+	if openErr != nil {
+		return false
+	}
+	defer windows.CloseHandle(handle)
+
+	const stillActive = 259 // STILL_ACTIVE
+	var code uint32
+	if queryErr := windows.GetExitCodeProcess(handle, &code); queryErr != nil {
+		// Cannot ask — assume alive rather than risk reclaiming a live lock.
+		return true
+	}
+	return code == stillActive
+}
@@ -278,12 +278,20 @@ const (
 // containers.workload_kind and workloads.kind. After the hard cutover the
 // backing project / stack / static_site tables are gone — these constants
 // are just strings used to filter the unified containers index in the UI.
+//
+// `build` is the dockerfile-source kind: a container built from a
+// Dockerfile in a Git repo. Operationally it looks like a site (one
+// container, one optional public face) but its origin is the build
+// pipeline, not a static-asset extract. Dashboard filters that need to
+// distinguish "I built this from source" from "I served files from a
+// repo" should key on this value.
 type WorkloadKind string

 const (
 	WorkloadKindProject WorkloadKind = "project"
 	WorkloadKindStack   WorkloadKind = "stack"
 	WorkloadKindSite    WorkloadKind = "site"
+	WorkloadKindBuild   WorkloadKind = "build"
 )

 // Workload is the unifying primitive that abstracts Project, Stack, and StaticSite.
@@ -316,6 +324,31 @@ type Workload struct {
 	UpdatedAt               string `json:"updated_at"`
 }

+// WorkloadNotification is one configured outbound notification route for
+// a workload. Multiple rows per workload model the "one Slack channel
+// for failures, one Discord webhook for successes" routing the legacy
+// single notification_url column could not express.
+//
+// EventTypes is a comma-separated allow-list (e.g. "build_failure" or
+// "deploy_success,deploy_failure"). An empty EventTypes means the row
+// fires for every event type — the cheapest way to keep the existing
+// single-destination behaviour expressible in the new shape.
+//
+// Secret round-trips through the same crypto envelope as other stored
+// secrets; the API layer strips it from responses.
+type WorkloadNotification struct {
+	// ID is the row's primary key.
+	ID         string `json:"id"`
+	// WorkloadID is the id of the owning workload.
+	WorkloadID string `json:"workload_id"`
+	// Name is a label for the route.
+	Name       string `json:"name"`
+	// URL is the destination the notification is sent to.
+	URL        string `json:"url"`
+	// Secret is excluded from JSON encoding by its `json:"-"` tag.
+	Secret     string `json:"-"`
+	// EventTypes is the comma-separated allow-list described above.
+	EventTypes string `json:"event_types"`
+	// Enabled marks whether the route is active.
+	Enabled    bool   `json:"enabled"`
+	// SortOrder is a caller-supplied ordering key.
+	SortOrder  int    `json:"sort_order"`
+	// CreatedAt and UpdatedAt are timestamps held as strings.
+	CreatedAt  string `json:"created_at"`
+	UpdatedAt  string `json:"updated_at"`
+}
+
 // Container is the normalized index of every Tinyforge-managed container.
 // Replaces the project-specific Instance table after migration. Subdomain/
 // proxy fields are hoisted as first-class columns because ListProxyRoutes,
@@ -55,11 +55,20 @@ func New(dbPath string) (*Store, error) {
 	db.SetMaxOpenConns(1)
 	db.SetConnMaxLifetime(0)

-	// Enable WAL mode and foreign keys for better concurrency and referential integrity.
+	// Enable WAL mode and foreign keys for better concurrency and
+	// referential integrity. `synchronous=NORMAL` pairs with WAL to skip
+	// the per-commit fsync — the WAL is still synced at checkpoint time.
+	// Commits stay durable across clean shutdowns and application crashes;
+	// only a power loss or OS crash can roll back the last few committed
+	// transactions (acceptable for a tinyforge box).
+	// cache_size=-20000 is roughly a 20 MiB page cache (negative values are
+	// in KiB), temp_store=MEMORY keeps temporary tables and indices off
+	// disk; both are pure perf knobs.
 	pragmas := []string{
 		"PRAGMA journal_mode=WAL",
+		"PRAGMA synchronous=NORMAL",
 		"PRAGMA foreign_keys=ON",
 		"PRAGMA busy_timeout=5000",
+		"PRAGMA cache_size=-20000",
+		"PRAGMA temp_store=MEMORY",
 	}
 	for _, p := range pragmas {
 		if _, err := db.Exec(p); err != nil {
@@ -284,6 +293,24 @@ func (s *Store) runMigrations() error {
 			created_at                  TEXT NOT NULL DEFAULT (datetime('now')),
 			updated_at                  TEXT NOT NULL DEFAULT (datetime('now'))
 		)`,
+		// workload_notifications: per-workload notification destinations.
+		// Each row is one route (Slack channel, Discord webhook, generic
+		// receiver, ...). event_types is a comma-separated allow-list —
+		// empty means "all events". When zero rows exist for a workload
+		// the dispatcher falls back to the legacy single notification_url
+		// column on workloads so existing setups keep working unchanged.
+		`CREATE TABLE IF NOT EXISTS workload_notifications (
+			id           TEXT PRIMARY KEY,
+			workload_id  TEXT NOT NULL REFERENCES workloads(id) ON DELETE CASCADE,
+			name         TEXT NOT NULL,
+			url          TEXT NOT NULL,
+			secret       TEXT NOT NULL DEFAULT '',
+			event_types  TEXT NOT NULL DEFAULT '',
+			enabled      INTEGER NOT NULL DEFAULT 1,
+			sort_order   INTEGER NOT NULL DEFAULT 0,
+			created_at   TEXT NOT NULL DEFAULT (datetime('now')),
+			updated_at   TEXT NOT NULL DEFAULT (datetime('now'))
+		)`,
 		// workload_trigger_bindings: many-to-many between workloads and
 		// triggers. binding_config is the per-binding override applied on
 		// top of trigger.config (top-level JSON merge, binding wins).
@@ -427,6 +454,7 @@ func (s *Store) runMigrations() error {
 		`CREATE UNIQUE INDEX IF NOT EXISTS idx_triggers_webhook_secret ON triggers(webhook_secret) WHERE webhook_secret != ''`,
 		`CREATE INDEX IF NOT EXISTS idx_bindings_workload         ON workload_trigger_bindings(workload_id)`,
 		`CREATE INDEX IF NOT EXISTS idx_bindings_trigger          ON workload_trigger_bindings(trigger_id)`,
+		`CREATE INDEX IF NOT EXISTS idx_workload_notifs_workload  ON workload_notifications(workload_id)`,
 	}
 	for _, idx := range indexes {
 		if _, err := s.db.Exec(idx); err != nil {
@@ -434,13 +462,215 @@ func (s *Store) runMigrations() error {
 		}
 	}

-	if err := s.backfillTriggersFromWorkloads(); err != nil {
+	// schema_versions table gates one-shot data migrations like the
+	// trigger backfill below. Without this, the backfill scan ran on
+	// every boot even on fully-migrated DBs — wasted I/O and (more
+	// importantly) made it impossible to tell whether a "no rows
+	// processed" was a clean state or a missed-migration bug.
+	if _, err := s.db.Exec(`CREATE TABLE IF NOT EXISTS schema_versions (
+		version    INTEGER PRIMARY KEY,
+		applied_at TEXT NOT NULL DEFAULT (datetime('now'))
+	)`); err != nil {
+		return fmt.Errorf("create schema_versions: %w", err)
+	}
+
+	if err := s.runOnce(1, "trigger backfill", s.backfillTriggersFromWorkloads); err != nil {
+		// Backfill failure is non-fatal — we log and let the operator
+		// retry. The version is only recorded on success.
 		slog.Warn("trigger backfill", "error", err)
 	}

 	return nil
 }

+// runOnce gates fn behind the schema_versions ledger. If a row for version
+// already exists, fn is skipped and nil is returned. Otherwise fn runs; its
+// error is returned unchanged and nothing is recorded, so a failed migration
+// is retried on the next call. Only after fn succeeds is the version inserted
+// and an info line logged with label.
+//
+// Useful for data migrations whose source table eventually disappears (so
+// re-running becomes pointless or dangerous).
+func (s *Store) runOnce(version int, label string, fn func() error) error {
+	var count int
+	row := s.db.QueryRow(`SELECT COUNT(*) FROM schema_versions WHERE version = ?`, version)
+	if err := row.Scan(&count); err != nil {
+		return fmt.Errorf("check %s: %w", label, err)
+	}
+	if count > 0 {
+		// Already applied on an earlier run.
+		return nil
+	}
+
+	if err := fn(); err != nil {
+		return err
+	}
+
+	_, err := s.db.Exec(`INSERT INTO schema_versions (version) VALUES (?)`, version)
+	if err != nil {
+		return fmt.Errorf("mark %s applied: %w", label, err)
+	}
+	slog.Info("schema migration applied", "version", version, "label", label)
+	return nil
+}
+
+// RunOnce is the public counterpart of runOnce, exposed so cmd/server can
+// gate post-store-open migrations (e.g. crypto re-encryption that needs
+// the ENCRYPTION_KEY which Store does not own) through the same
+// schema_versions ledger.
+//
+// It delegates to runOnce unchanged: fn is skipped when version is already
+// recorded, and version is recorded only after fn returns nil.
+func (s *Store) RunOnce(version int, label string, fn func() error) error {
+	return s.runOnce(version, label, fn)
+}
+
+// EnvelopeMigrator describes the contract a crypto package implements to
+// rewrite legacy unprefixed-hex ciphertext as versioned envelope values.
+// HasEnvelope reports whether a value already carries the new prefix.
+// Decrypt returns plaintext for either form; Encrypt always produces the
+// new envelope form. By accepting closures the store stays free of any
+// import on internal/crypto, mirroring the rest of the package layout.
+//
+// All three fields are called without a nil check by the migration code,
+// so callers must populate every one of them.
+type EnvelopeMigrator struct {
+	// HasEnvelope reports whether value is already in envelope form.
+	HasEnvelope func(value string) bool
+	// Decrypt turns stored ciphertext back into plaintext.
+	Decrypt     func(ciphertext string) (string, error)
+	// Encrypt produces the envelope-form ciphertext for plaintext.
+	Encrypt     func(plaintext string) (string, error)
+}
+
+// MigrateSecretsToEnvelope walks every column known to carry an encrypted
+// secret and rewrites legacy unprefixed-hex values into the new
+// envelope form using the current encryption key.
+//
+// Behaviour, per-row:
+//   - empty value → skip (no secret stored)
+//   - already-envelope value → skip (already migrated)
+//   - decrypt fails → skip (value is either plaintext from a v0 boot
+//     OR ciphertext from a rotated key; either way we cannot safely
+//     re-encrypt and leaving it alone preserves the existing read
+//     semantics)
+//   - decrypt succeeds → encrypt to envelope form + UPDATE
+//
+// The whole sweep runs in a single transaction so a power-loss
+// mid-migration leaves the DB in either the pre- or post-migration
+// state, never half. Idempotent via schema_versions version 2 — the
+// next boot is a no-op.
+//
+// NOTE(review): skipped values (including ones whose re-encryption
+// failed inside tryMigrate) do not fail the sweep, so version 2 is still
+// recorded and those values are not retried on later boots.
+//
+// Columns covered:
+//   - settings.npm_password
+//   - settings.cloudflare_api_token
+//   - auth_settings.oidc_client_secret
+//   - registries.token
+//   - workload_env.value WHERE encrypted=1
+func (s *Store) MigrateSecretsToEnvelope(m EnvelopeMigrator) error {
+	return s.runOnce(2, "secrets envelope migration", func() error {
+		tx, err := s.db.Begin()
+		if err != nil {
+			return fmt.Errorf("begin: %w", err)
+		}
+		// Rollback after a successful Commit is a no-op error we ignore;
+		// on every early return it undoes the partial sweep.
+		defer func() { _ = tx.Rollback() }()
+
+		// Single-row tables (settings, auth_settings) — read-update inline.
+		// Table and column names are compile-time literals from this list,
+		// so building the SQL with Sprintf does not involve external input.
+		singleRowColumns := []struct {
+			table, column string
+		}{
+			{"settings", "npm_password"},
+			{"settings", "cloudflare_api_token"},
+			{"auth_settings", "oidc_client_secret"},
+		}
+		for _, c := range singleRowColumns {
+			var v string
+			err := tx.QueryRow(
+				fmt.Sprintf(`SELECT %s FROM %s LIMIT 1`, c.column, c.table),
+			).Scan(&v)
+			if err != nil {
+				if errors.Is(err, sql.ErrNoRows) {
+					continue
+				}
+				// auth_settings may not exist on a brand-new DB until
+				// the OIDC code touches it; treat as nothing-to-migrate.
+				// Any other read error is likewise logged at debug level
+				// and skipped rather than aborting the sweep.
+				slog.Debug("envelope migration: column read skipped",
+					"table", c.table, "column", c.column, "error", err)
+				continue
+			}
+			migrated, ok := tryMigrate(m, v)
+			if !ok {
+				continue
+			}
+			// No WHERE clause: the UPDATE rewrites the column on every row
+			// of the table, which relies on these tables holding one row.
+			if _, err := tx.Exec(
+				fmt.Sprintf(`UPDATE %s SET %s = ?`, c.table, c.column),
+				migrated,
+			); err != nil {
+				return fmt.Errorf("update %s.%s: %w", c.table, c.column, err)
+			}
+		}
+
+		// Multi-row: registries.token
+		if err := migrateRowColumn(tx, m,
+			`SELECT id, token FROM registries WHERE token != ''`,
+			`UPDATE registries SET token = ? WHERE id = ?`,
+		); err != nil {
+			return fmt.Errorf("registries.token: %w", err)
+		}
+
+		// Multi-row: workload_env.value WHERE encrypted=1
+		if err := migrateRowColumn(tx, m,
+			`SELECT id, value FROM workload_env WHERE encrypted = 1 AND value != ''`,
+			`UPDATE workload_env SET value = ? WHERE id = ?`,
+		); err != nil {
+			return fmt.Errorf("workload_env.value: %w", err)
+		}
+
+		if err := tx.Commit(); err != nil {
+			return fmt.Errorf("commit: %w", err)
+		}
+		return nil
+	})
+}
+
+// migrateRowColumn rewrites one secret column across many rows inside tx.
+//
+// selectQ must yield (id, value) pairs; updateQ must accept (newValue, id)
+// as its two parameters. Every selected value is offered to tryMigrate, and
+// values it declines are left untouched without affecting other rows. The
+// rewrites are collected first and applied only after the result set has
+// been fully read. Query, scan, iteration and update errors are returned
+// as-is.
+func migrateRowColumn(tx *sql.Tx, m EnvelopeMigrator, selectQ, updateQ string) error {
+	type rewrite struct{ id, value string }
+
+	cursor, err := tx.Query(selectQ)
+	if err != nil {
+		return err
+	}
+	defer cursor.Close()
+
+	var queued []rewrite
+	for cursor.Next() {
+		var rowID, current string
+		if scanErr := cursor.Scan(&rowID, &current); scanErr != nil {
+			return scanErr
+		}
+		if envelope, ok := tryMigrate(m, current); ok {
+			queued = append(queued, rewrite{id: rowID, value: envelope})
+		}
+	}
+	if iterErr := cursor.Err(); iterErr != nil {
+		return iterErr
+	}
+
+	for i := range queued {
+		if _, execErr := tx.Exec(updateQ, queued[i].value, queued[i].id); execErr != nil {
+			return execErr
+		}
+	}
+	return nil
+}
+
+// tryMigrate converts a single stored value to envelope form.
+//
+// It returns (envelope, true) only when v is non-empty, is not already in
+// envelope form, decrypts successfully and re-encrypts successfully. Every
+// other case — empty, already envelope, decrypt failure (plaintext or a
+// rotated key), or encrypt failure — yields ("", false) so the caller leaves
+// the stored value alone.
+func tryMigrate(m EnvelopeMigrator, v string) (string, bool) {
+	if v == "" || m.HasEnvelope(v) {
+		return "", false
+	}
+	plain, decErr := m.Decrypt(v)
+	if decErr != nil {
+		return "", false
+	}
+	if sealed, encErr := m.Encrypt(plain); encErr == nil {
+		return sealed, true
+	}
+	return "", false
+}
+
 // backfillTriggersFromWorkloads converts embedded trigger config on
 // workload rows into standalone trigger + binding rows. Runs once per
 // boot and is idempotent — only workloads with non-empty trigger_kind
@@ -0,0 +1,159 @@
+package store
+
+import (
+	"database/sql"
+	"errors"
+	"fmt"
+	"strings"
+
+	"github.com/google/uuid"
+)
+
+// workloadNotificationColumns is the column list shared by the INSERT and
+// SELECT statements for workload_notifications. Its order must match the
+// destination order in scanWorkloadNotification and the argument order in
+// CreateWorkloadNotification.
+const workloadNotificationColumns = `id, workload_id, name, url, secret,
+	event_types, enabled, sort_order, created_at, updated_at`
+
+// scanWorkloadNotification reads one workload_notifications row from scanner
+// (anything with a Scan method, e.g. *sql.Row or *sql.Rows) in
+// workloadNotificationColumns order. The enabled column is stored as an
+// integer and converted to a bool (non-zero means true). The Scan error, if
+// any, is returned alongside whatever was populated.
+func scanWorkloadNotification(scanner interface{ Scan(...any) error }) (WorkloadNotification, error) {
+	var (
+		row         WorkloadNotification
+		enabledFlag int
+	)
+	dest := []any{
+		&row.ID, &row.WorkloadID, &row.Name, &row.URL, &row.Secret,
+		&row.EventTypes, &enabledFlag, &row.SortOrder, &row.CreatedAt, &row.UpdatedAt,
+	}
+	scanErr := scanner.Scan(dest...)
+	row.Enabled = enabledFlag != 0
+	return row, scanErr
+}
+
+// CreateWorkloadNotification inserts a notification route. Returns the
+// populated row (with assigned id + timestamps) so callers don't need to
+// follow up with a Get.
+func (s *Store) CreateWorkloadNotification(n WorkloadNotification) (WorkloadNotification, error) {
+	if n.WorkloadID == "" {
+		return WorkloadNotification{}, fmt.Errorf("workload_id is required")
+	}
+	if n.URL == "" {
+		return WorkloadNotification{}, fmt.Errorf("url is required")
+	}
+	if n.ID == "" {
+		n.ID = uuid.New().String()
+	}
+	n.CreatedAt = Now()
+	n.UpdatedAt = n.CreatedAt
+
+	_, err := s.db.Exec(
+		`INSERT INTO workload_notifications (`+workloadNotificationColumns+`)
+		 VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`,
+		n.ID, n.WorkloadID, n.Name, n.URL, n.Secret,
+		n.EventTypes, BoolToInt(n.Enabled), n.SortOrder, n.CreatedAt, n.UpdatedAt,
+	)
+	if err != nil {
+		return WorkloadNotification{}, fmt.Errorf("insert workload_notification: %w", err)
+	}
+	return n, nil
+}
+
+// ListWorkloadNotifications returns every notification row for a
+// workload ordered by (sort_order, created_at) so the UI stays stable
+// across reorderings.
+func (s *Store) ListWorkloadNotifications(workloadID string) ([]WorkloadNotification, error) {
+	rows, err := s.db.Query(
+		`SELECT `+workloadNotificationColumns+`
+		 FROM workload_notifications
+		 WHERE workload_id = ?
+		 ORDER BY sort_order, created_at`,
+		workloadID,
+	)
+	if err != nil {
+		return nil, fmt.Errorf("list workload_notifications: %w", err)
+	}
+	defer rows.Close()
+
+	out := []WorkloadNotification{}
+	for rows.Next() {
+		n, err := scanWorkloadNotification(rows)
+		if err != nil {
+			return nil, fmt.Errorf("scan workload_notification: %w", err)
+		}
+		out = append(out, n)
+	}
+	return out, rows.Err()
+}
+
+// GetWorkloadNotification fetches one notification row by id. Returns
+// ErrNotFound when the row does not exist so callers can return 404
+// cleanly.
+func (s *Store) GetWorkloadNotification(id string) (WorkloadNotification, error) {
+	n, err := scanWorkloadNotification(s.db.QueryRow(
+		`SELECT `+workloadNotificationColumns+`
+		 FROM workload_notifications WHERE id = ?`, id,
+	))
+	if errors.Is(err, sql.ErrNoRows) {
+		return WorkloadNotification{}, fmt.Errorf("workload_notification %s: %w", id, ErrNotFound)
+	}
+	if err != nil {
+		return WorkloadNotification{}, fmt.Errorf("query workload_notification: %w", err)
+	}
+	return n, nil
+}
+
+// UpdateWorkloadNotification rewrites an existing row identified by n.ID.
+// Name, URL, Secret, EventTypes, Enabled and SortOrder are overwritten and
+// updated_at is set to the current time; created_at is left alone.
+//
+// WorkloadID is immutable — re-anchoring a route to a different workload
+// would invite silent reassignments after a paste-bug in the UI; recreate
+// instead.
+//
+// Returns an error when ID or URL is empty, an error wrapping ErrNotFound
+// when no row has that id, and a wrapped driver error otherwise (including
+// a failure to read the affected-row count, which is no longer ignored).
+func (s *Store) UpdateWorkloadNotification(n WorkloadNotification) error {
+	if n.ID == "" {
+		return fmt.Errorf("id is required")
+	}
+	if n.URL == "" {
+		return fmt.Errorf("url is required")
+	}
+	n.UpdatedAt = Now()
+	res, err := s.db.Exec(
+		`UPDATE workload_notifications
+		 SET name = ?, url = ?, secret = ?, event_types = ?,
+		     enabled = ?, sort_order = ?, updated_at = ?
+		 WHERE id = ?`,
+		n.Name, n.URL, n.Secret, n.EventTypes,
+		BoolToInt(n.Enabled), n.SortOrder, n.UpdatedAt, n.ID,
+	)
+	if err != nil {
+		return fmt.Errorf("update workload_notification: %w", err)
+	}
+	affected, err := res.RowsAffected()
+	if err != nil {
+		// Without a count we cannot tell "updated" from "not found";
+		// report the failure instead of guessing ErrNotFound.
+		return fmt.Errorf("update workload_notification rows affected: %w", err)
+	}
+	if affected == 0 {
+		return fmt.Errorf("workload_notification %s: %w", n.ID, ErrNotFound)
+	}
+	return nil
+}
+
+// DeleteWorkloadNotification drops a single notification row.
+//
+// The call is not idempotent: deleting an id that does not exist (including
+// one that was already deleted) returns an error wrapping ErrNotFound, so
+// the API can map it to 404 cleanly. A failure to read the affected-row
+// count is reported as its own error rather than being mistaken for
+// ErrNotFound.
+func (s *Store) DeleteWorkloadNotification(id string) error {
+	res, err := s.db.Exec(`DELETE FROM workload_notifications WHERE id = ?`, id)
+	if err != nil {
+		return fmt.Errorf("delete workload_notification: %w", err)
+	}
+	affected, err := res.RowsAffected()
+	if err != nil {
+		return fmt.Errorf("delete workload_notification rows affected: %w", err)
+	}
+	if affected == 0 {
+		return fmt.Errorf("workload_notification %s: %w", id, ErrNotFound)
+	}
+	return nil
+}
+
+// MatchesEventType reports whether this route should fire for eventType.
+//
+// A disabled row never matches. An enabled row with an empty EventTypes
+// matches everything. Otherwise EventTypes is split on commas and each entry
+// is compared to eventType after trimming surrounding whitespace.
+//
+// Exported so the notification dispatcher can filter inline without
+// duplicating the comma-split parser.
+func (n WorkloadNotification) MatchesEventType(eventType string) bool {
+	switch {
+	case !n.Enabled:
+		return false
+	case n.EventTypes == "":
+		return true
+	}
+	allowed := strings.Split(n.EventTypes, ",")
+	for i := range allowed {
+		if strings.TrimSpace(allowed[i]) == eventType {
+			return true
+		}
+	}
+	return false
+}
@@ -0,0 +1,170 @@
+package store
+
+import (
+	"errors"
+	"testing"
+)
+
+// seedWorkloadForNotifications inserts a minimal project-kind, image-source
+// workload named name so that notification rows have a parent to satisfy the
+// foreign key. It fails the test on error and returns the new workload's ID.
+func seedWorkloadForNotifications(t *testing.T, s *Store, name string) string {
+	t.Helper()
+	spec := Workload{
+		Kind:       string(WorkloadKindProject),
+		Name:       name,
+		SourceKind: "image",
+	}
+	created, err := s.CreateWorkload(spec)
+	if err != nil {
+		t.Fatalf("seed workload: %v", err)
+	}
+	return created.ID
+}
+
+// TestCreateWorkloadNotification_RoundTrip creates a notification, reads it
+// back by the assigned ID, and checks that name, URL, enabled flag and
+// event types survived the trip.
+func TestCreateWorkloadNotification_RoundTrip(t *testing.T) {
+	s := newTestStore(t)
+	parentID := seedWorkloadForNotifications(t, s, "app1")
+
+	input := WorkloadNotification{
+		WorkloadID: parentID,
+		Name:       "Slack alerts",
+		URL:        "https://hooks.slack.test/x",
+		Secret:     "shh",
+		EventTypes: "deploy_failure,build_failure",
+		Enabled:    true,
+	}
+	stored, err := s.CreateWorkloadNotification(input)
+	if err != nil {
+		t.Fatalf("CreateWorkloadNotification: %v", err)
+	}
+	if stored.ID == "" {
+		t.Fatal("expected ID to be assigned")
+	}
+
+	fetched, err := s.GetWorkloadNotification(stored.ID)
+	if err != nil {
+		t.Fatalf("Get: %v", err)
+	}
+	if fetched.URL != "https://hooks.slack.test/x" || fetched.Name != "Slack alerts" {
+		t.Errorf("row mismatch: %+v", fetched)
+	}
+	if !fetched.Enabled {
+		t.Error("expected Enabled=true")
+	}
+	if fetched.EventTypes != "deploy_failure,build_failure" {
+		t.Errorf("event_types = %q", fetched.EventTypes)
+	}
+}
+
+// TestCreateWorkloadNotification_RejectsMissingURL checks that a create call
+// with an empty URL is refused.
+func TestCreateWorkloadNotification_RejectsMissingURL(t *testing.T) {
+	s := newTestStore(t)
+	parentID := seedWorkloadForNotifications(t, s, "app1")
+
+	invalid := WorkloadNotification{
+		WorkloadID: parentID,
+		Name:       "broken",
+		URL:        "",
+	}
+	if _, err := s.CreateWorkloadNotification(invalid); err == nil {
+		t.Fatal("expected URL validation error")
+	}
+}
+
+// TestListWorkloadNotifications_SortedByOrder inserts three routes out of
+// order and checks that ListWorkloadNotifications returns them sorted by
+// SortOrder ascending. Every insert is checked so that a setup failure is
+// reported as such instead of surfacing later as a length mismatch.
+func TestListWorkloadNotifications_SortedByOrder(t *testing.T) {
+	s := newTestStore(t)
+	wlID := seedWorkloadForNotifications(t, s, "app1")
+
+	// Deliberately not in SortOrder sequence.
+	seeds := []struct {
+		name, url string
+		order     int
+	}{
+		{"C", "https://c.test", 30},
+		{"A", "https://a.test", 10},
+		{"B", "https://b.test", 20},
+	}
+	for _, seed := range seeds {
+		if _, err := s.CreateWorkloadNotification(WorkloadNotification{
+			WorkloadID: wlID, Name: seed.name, URL: seed.url, SortOrder: seed.order,
+		}); err != nil {
+			t.Fatalf("create %q: %v", seed.name, err)
+		}
+	}
+
+	rows, err := s.ListWorkloadNotifications(wlID)
+	if err != nil {
+		t.Fatalf("list: %v", err)
+	}
+	if len(rows) != 3 {
+		t.Fatalf("len = %d, want 3", len(rows))
+	}
+	if rows[0].Name != "A" || rows[1].Name != "B" || rows[2].Name != "C" {
+		t.Errorf("sort order wrong: %q %q %q", rows[0].Name, rows[1].Name, rows[2].Name)
+	}
+}
+
+// TestUpdateWorkloadNotification_PersistsChanges creates a route, changes
+// its name, URL, enabled flag and event types, and checks that a subsequent
+// Get reflects all four changes. Create and Get errors are checked so a
+// setup failure cannot masquerade as a persistence bug.
+func TestUpdateWorkloadNotification_PersistsChanges(t *testing.T) {
+	s := newTestStore(t)
+	wlID := seedWorkloadForNotifications(t, s, "app1")
+	n, err := s.CreateWorkloadNotification(WorkloadNotification{
+		WorkloadID: wlID, Name: "old", URL: "https://old.test", Enabled: true,
+	})
+	if err != nil {
+		t.Fatalf("create: %v", err)
+	}
+	n.Name = "new"
+	n.URL = "https://new.test"
+	n.Enabled = false
+	n.EventTypes = "deploy_success"
+	if err := s.UpdateWorkloadNotification(n); err != nil {
+		t.Fatalf("update: %v", err)
+	}
+	got, err := s.GetWorkloadNotification(n.ID)
+	if err != nil {
+		t.Fatalf("get after update: %v", err)
+	}
+	if got.Name != "new" || got.URL != "https://new.test" || got.Enabled {
+		t.Errorf("update did not persist: %+v", got)
+	}
+	if got.EventTypes != "deploy_success" {
+		t.Errorf("event_types = %q, want %q", got.EventTypes, "deploy_success")
+	}
+}
+
+// TestDeleteWorkloadNotification_ReturnsNotFoundForMissing checks that
+// deleting an id with no row yields an error wrapping ErrNotFound.
+func TestDeleteWorkloadNotification_ReturnsNotFoundForMissing(t *testing.T) {
+	s := newTestStore(t)
+	if delErr := s.DeleteWorkloadNotification("nope"); !errors.Is(delErr, ErrNotFound) {
+		t.Errorf("expected ErrNotFound, got %v", delErr)
+	}
+}
+
+// TestDeleteWorkloadNotification_CascadesFromWorkload checks that deleting a
+// workload also removes its notification rows. The row is confirmed to exist
+// before the delete; otherwise a failed insert would let the test pass
+// without the cascade ever being exercised.
+func TestDeleteWorkloadNotification_CascadesFromWorkload(t *testing.T) {
+	s := newTestStore(t)
+	wlID := seedWorkloadForNotifications(t, s, "app1")
+	if _, err := s.CreateWorkloadNotification(WorkloadNotification{
+		WorkloadID: wlID, Name: "x", URL: "https://x.test",
+	}); err != nil {
+		t.Fatalf("create notification: %v", err)
+	}
+	before, err := s.ListWorkloadNotifications(wlID)
+	if err != nil {
+		t.Fatalf("list before delete: %v", err)
+	}
+	if len(before) != 1 {
+		t.Fatalf("rows before delete = %d, want 1", len(before))
+	}
+
+	if err := s.DeleteWorkload(wlID); err != nil {
+		t.Fatalf("delete workload: %v", err)
+	}
+	rows, err := s.ListWorkloadNotifications(wlID)
+	if err != nil {
+		t.Fatalf("list after cascade: %v", err)
+	}
+	if len(rows) != 0 {
+		t.Errorf("expected cascade delete to remove rows, got %d", len(rows))
+	}
+}
+
+func TestMatchesEventType_AllowList(t *testing.T) {
+	cases := []struct {
+		eventTypes string
+		probe      string
+		want       bool
+	}{
+		{"", "deploy_success", true},                          // empty = all
+		{"deploy_success,deploy_failure", "deploy_success", true},
+		{"deploy_success,deploy_failure", "build_failure", false},
+		{"build_failure", "build_failure", true},
+		{" deploy_success , build_failure ", "build_failure", true}, // whitespace tolerated
+	}
+	for _, c := range cases {
+		n := WorkloadNotification{Enabled: true, EventTypes: c.eventTypes}
+		got := n.MatchesEventType(c.probe)
+		if got != c.want {
+			t.Errorf("MatchesEventType(%q, %q) = %v, want %v", c.eventTypes, c.probe, got, c.want)
+		}
+	}
+}
+
+// TestMatchesEventType_DisabledNeverMatches checks that a disabled route
+// does not match even with an empty (match-all) allow-list.
+func TestMatchesEventType_DisabledNeverMatches(t *testing.T) {
+	disabled := WorkloadNotification{Enabled: false, EventTypes: ""}
+	if matched := disabled.MatchesEventType("any"); matched {
+		t.Error("disabled row should never match")
+	}
+}
@@ -173,11 +173,24 @@ func (s *Store) UpdateWorkload(w Workload) error {
 	return nil
 }

-// DeleteWorkload removes a workload row. Cascading deletes for the matching
-// project/stack/site row stay with the kind-specific Delete functions; this
-// only removes the workload entry.
+// DeleteWorkload removes a workload row. Cascading deletes for FK-backed
+// child tables (workload_env, workload_volumes, workload_trigger_bindings,
+// workload_notifications) happen via SQLite's ON DELETE CASCADE. The
+// `containers` table doesn't yet have an FK to workloads (planned migration
+// — see ops notes), so we drop its rows explicitly here in the same
+// transaction to prevent zombie container rows from outliving their owning
+// workload.
 func (s *Store) DeleteWorkload(id string) error {
-	result, err := s.db.Exec(`DELETE FROM workloads WHERE id = ?`, id)
+	tx, err := s.db.Begin()
+	if err != nil {
+		return fmt.Errorf("begin: %w", err)
+	}
+	defer func() { _ = tx.Rollback() }()
+
+	// Explicit container cleanup until the FK migration lands.
+	if _, err := tx.Exec(`DELETE FROM containers WHERE workload_id = ?`, id); err != nil {
+		return fmt.Errorf("delete containers: %w", err)
+	}
+	result, err := tx.Exec(`DELETE FROM workloads WHERE id = ?`, id)
 	if err != nil {
 		return fmt.Errorf("delete workload: %w", err)
 	}
@@ -188,6 +201,9 @@ func (s *Store) DeleteWorkload(id string) error {
 	if n == 0 {
 		return fmt.Errorf("workload %s: %w", id, ErrNotFound)
 	}
+	if err := tx.Commit(); err != nil {
+		return fmt.Errorf("commit: %w", err)
+	}
 	return nil
 }

@@ -169,6 +169,18 @@ func SaveFile(rootPath, relativePath string, r io.Reader) error {

 // safePath resolves a relative path within rootPath and validates it doesn't escape.
 // Resolves symlinks to prevent symlink-based traversal attacks.
+//
+// The check used to be `strings.HasPrefix(absResolved, absRoot)` which has
+// a classic boundary bug: a sibling root at /data/vol10 would pass the
+// prefix test for /data/vol1. The fix enforces a separator boundary so
+// the only allowed cases are absResolved == absRoot OR absResolved begins
+// with absRoot + separator.
+//
+// For paths that don't yet exist (e.g. SaveFile creating a new file),
+// EvalSymlinks returns an error and we fall back to the lexical path.
+// In that case we walk every existing ancestor with EvalSymlinks too —
+// if any ancestor is a symlink that escapes the root, we reject. This
+// closes the prior gap where pre-planted symlinks could divert writes.
 func safePath(rootPath, relativePath string) (string, error) {
 	if relativePath == "" {
 		return rootPath, nil
@@ -176,7 +188,7 @@ func safePath(rootPath, relativePath string) (string, error) {

 	// Clean and ensure no traversal.
 	cleaned := filepath.Clean(relativePath)
-	if strings.Contains(cleaned, "..") {
+	if cleaned == ".." || strings.HasPrefix(cleaned, ".."+string(filepath.Separator)) || strings.Contains(cleaned, string(filepath.Separator)+".."+string(filepath.Separator)) {
 		return "", fmt.Errorf("path traversal not allowed")
 	}

@@ -191,18 +203,66 @@ func safePath(rootPath, relativePath string) (string, error) {
 		absRoot = realRoot
 	}

-	// Resolve the target path including symlinks.
+	// Resolve the target path. If the leaf doesn't exist (write path),
+	// walk parent directories — any of which may already be a symlink.
 	absResolved, err := filepath.Abs(absPath)
 	if err != nil {
 		return "", fmt.Errorf("resolve path: %w", err)
 	}
 	if realResolved, err := filepath.EvalSymlinks(absResolved); err == nil {
 		absResolved = realResolved
+	} else {
+		// Leaf missing — resolve the deepest existing ancestor and
+		// re-join the unresolved tail. This catches a pre-planted
+		// symlink in any parent dir. An error here means an ancestor
+		// could not be resolved (e.g. a symlink we cannot follow): we MUST
+		// reject rather than fall back to the lexical path, which still
+		// carries the absRoot prefix and would let a symlink ancestor that
+		// escapes the root slip past the boundary check below.
+		resolved, tailErr := resolveExistingAncestor(absResolved)
+		if tailErr != nil {
+			return "", fmt.Errorf("path traversal not allowed")
+		}
+		if resolved != "" {
+			absResolved = resolved
+		}
 	}

-	if !strings.HasPrefix(absResolved, absRoot) {
+	if absResolved != absRoot && !strings.HasPrefix(absResolved, absRoot+string(filepath.Separator)) {
 		return "", fmt.Errorf("path traversal not allowed")
 	}

 	return absPath, nil
 }
+
+// resolveExistingAncestor walks p upward until it finds a path entry
+// that exists, resolves its symlinks, then rejoins the missing tail.
+// Returns ("", nil) when the walk reaches the filesystem root first, and
+// an error when the existing entry's symlinks cannot be resolved.
+func resolveExistingAncestor(p string) (string, error) {
+	tail := ""
+	cur := p
+	for {
+		if cur == "" || cur == "/" || cur == filepath.VolumeName(cur)+string(filepath.Separator) {
+			return "", nil
+		}
+		// Lstat, not Stat: a dangling symlink must reach EvalSymlinks and fail.
+		if _, err := os.Lstat(cur); err == nil {
+			resolved, rerr := filepath.EvalSymlinks(cur)
+			if rerr != nil {
+				return "", rerr
+			}
+			if tail == "" {
+				return resolved, nil
+			}
+			return filepath.Join(resolved, tail), nil
+		}
+		// Move one level up.
+		parent := filepath.Dir(cur)
+		if parent == cur {
+			return "", nil
+		}
+		tail = filepath.Join(filepath.Base(cur), tail)
+		cur = parent
+	}
+}
@@ -131,8 +131,14 @@ const maxWebhookBodyBytes = 256 * 1024 // 256 KiB
 // PluginDispatcher is what the plugin-workload webhook handler needs from
 // the deployer: the canonical Source-dispatch entry point plus access to
 // the same Deps bundle so Trigger.Match can read store / crypto.
+//
+// DispatchTeardown is required so the preview-deploy flow can tear down
+// an ephemeral per-branch child workload when its upstream branch is
+// deleted. Same teardown path the API /workloads/{id} DELETE route uses;
+// nil error on a clean teardown lets the caller delete the workload row.
 type PluginDispatcher interface {
 	DispatchPlugin(ctx context.Context, w pluginWorkload, intent pluginIntent) error
+	DispatchTeardown(ctx context.Context, w pluginWorkload) error
 	PluginDeps() pluginDeps
 }

@@ -13,8 +13,10 @@ import (

 	"github.com/go-chi/chi/v5"

+	"github.com/alexei/tinyforge/internal/metrics"
 	"github.com/alexei/tinyforge/internal/store"
 	"github.com/alexei/tinyforge/internal/workload/plugin"
+	"github.com/alexei/tinyforge/internal/workload/preview"
 )

 // maxTriggerFanOutConcurrency caps how many bindings dispatch in
@@ -44,6 +46,17 @@ const (
 	ReasonConfigError     = "config merge error"
 	ReasonMatchError      = "match error"
 	ReasonDispatchFailed  = "dispatch failed"
+	ReasonPreviewError    = "preview materialize error"
+	ReasonPreviewTorndown = "preview torn down"
+	// ReasonPreviewNoop: a branch-delete webhook arrived but no preview was
+	// ever materialized for that branch — a legitimate clean skip, distinct
+	// from "no binding matched" so it isn't misreported as a wiring problem.
+	ReasonPreviewNoop = "preview noop"
+	// ReasonPreviewOrphaned: the preview container was torn down but its
+	// workload row could not be deleted, leaving an orphan row. Surfaced
+	// distinctly so the partial failure is visible rather than masquerading
+	// as a clean teardown.
+	ReasonPreviewOrphaned = "preview torn down (row orphaned)"
 )

 // handleTriggerWebhook processes an inbound webhook for a first-class
@@ -172,7 +185,7 @@ func (h *Handler) handleTriggerWebhook(w http.ResponseWriter, r *http.Request) {
 		switch {
 		case r.Deployed:
 			deployed++
-		case r.Reason == ReasonBindingDisabled:
+		case r.Reason == ReasonBindingDisabled, r.Reason == ReasonPreviewNoop:
 			skipped++
 		case r.Reason == ReasonNoMatch:
 			noMatch++
@@ -194,8 +207,10 @@ func (h *Handler) handleTriggerWebhook(w http.ResponseWriter, r *http.Request) {
 	case noMatch == len(results)-skipped:
 		delivery.Detail = "no binding matched"
 	default:
-		delivery.Detail = fmt.Sprintf("matched=0 skipped=%d errored=%d", skipped, errored)
+		delivery.Detail = fmt.Sprintf("matched=0 skipped=%d errored=%d nomatch=%d",
+			skipped, errored, noMatch)
 	}
+	metrics.WebhookDeliveriesTotal.Inc(delivery.Outcome)
 	respondWebhookJSON(w, http.StatusOK, map[string]any{
 		"success":  true,
 		"trigger":  trg.Name,
@@ -326,6 +341,18 @@ func (h *Handler) fireBinding(
 	if intent.TriggeredBy == "" {
 		intent.TriggeredBy = "trigger-webhook"
 	}
+
+	// Preview-deploy fork: the git trigger plugin attaches preview_branch
+	// metadata when BranchPattern matches a non-baseline branch. Route
+	// the dispatch through a per-branch child workload rather than
+	// redeploying the parent template. The fork is intentionally before
+	// the dispatch so the template's container never gets clobbered by
+	// a feature-branch push.
+	if previewBranch := intent.Metadata["preview_branch"]; previewBranch != "" {
+		fired, reason := h.handlePreviewIntent(ctx, row, intent, previewBranch)
+		return fired, reason
+	}
+
 	if err := h.plugins.DispatchPlugin(ctx, pwl, *intent); err != nil {
 		slog.Warn("webhook: dispatch failed",
 			"trigger", trg.Name, "workload", row.Name, "error", err)
@@ -336,3 +363,72 @@ func (h *Handler) fireBinding(
 	return true, intent.Reason
 }

+// handlePreviewIntent dispatches an intent that targeted a non-baseline
+// branch on a preview-template workload. Two paths:
+//
+//  1. Branch deleted: find the matching preview workload, dispatch
+//     Teardown, then delete the workload row so the dashboard reflects
+//     the upstream state.
+//  2. Branch pushed: materialize (or reuse) the preview workload, then
+//     dispatch the deploy against it. The template workload itself is
+//     never deployed against a feature branch.
+//
+// On any error the helper logs and returns a generic reason — the
+// fan-out caller treats these the same as a normal dispatch failure.
+func (h *Handler) handlePreviewIntent(
+	ctx context.Context,
+	template store.Workload,
+	intent *plugin.DeploymentIntent,
+	branch string,
+) (bool, string) {
+	deleted := intent.Metadata["preview_deleted"] == "1"
+	if deleted {
+		child, ok, err := preview.FindPreviewForBranch(h.store, template.ID, branch)
+		if err != nil {
+			slog.Warn("webhook: preview lookup failed",
+				"template", template.Name, "branch", branch, "error", err)
+			return false, ReasonPreviewError
+		}
+		if !ok {
+			// Branch was deleted upstream but we never materialized a
+			// preview for it — nothing to do. Report as a distinct noop so
+			// it isn't bucketed as "no binding matched".
+			return false, ReasonPreviewNoop
+		}
+		childPwl := toPluginWorkload(child)
+		if err := h.plugins.DispatchTeardown(ctx, childPwl); err != nil {
+			slog.Warn("webhook: preview teardown dispatch failed",
+				"template", template.Name, "preview", child.Name, "error", err)
+			return false, ReasonDispatchFailed
+		}
+		if err := h.store.DeleteWorkload(child.ID); err != nil {
+			// Container is gone but the row is orphaned. Surface this as a
+			// distinct reason so the partial failure is visible rather than
+			// reported as a clean teardown; the operator can delete the row
+			// from the dashboard if it sticks around.
+			slog.Warn("webhook: preview row delete failed (orphaned row)",
+				"template", template.Name, "preview", child.Name, "error", err)
+			return true, ReasonPreviewOrphaned
+		}
+		slog.Info("webhook: preview torn down",
+			"template", template.Name, "branch", branch, "preview", child.Name)
+		return true, ReasonPreviewTorndown
+	}
+
+	child, err := preview.MaterializeForBranch(h.store, template, branch)
+	if err != nil {
+		slog.Warn("webhook: preview materialize failed",
+			"template", template.Name, "branch", branch, "error", err)
+		return false, ReasonPreviewError
+	}
+	childPwl := toPluginWorkload(child)
+	if err := h.plugins.DispatchPlugin(ctx, childPwl, *intent); err != nil {
+		slog.Warn("webhook: preview dispatch failed",
+			"template", template.Name, "preview", child.Name, "error", err)
+		return false, ReasonDispatchFailed
+	}
+	slog.Info("webhook: triggered preview deploy",
+		"template", template.Name, "branch", branch, "preview", child.Name, "reason", intent.Reason)
+	return true, intent.Reason
+}
+
@@ -327,6 +327,10 @@ func parseGitLabPushEvent(body []byte, headers http.Header) vendorParseResult {
 			Ref:       probe.Ref,
 			CommitSHA: probe.After,
 			Pusher:    pusher,
+			// GitLab does not emit `deleted: true`; the canonical signal
+			// is an all-zero `after` SHA. Same parser helper used for the
+			// GitHub / Gitea fallback so the two branches agree.
+			Deleted: isZeroSHA(probe.After),
 		},
 	}
 	if strings.HasPrefix(probe.Ref, "refs/heads/") {
@@ -346,6 +350,7 @@ func parseGenericGitPush(body []byte) (plugin.InboundEvent, error) {
 	var probe struct {
 		Ref        string `json:"ref"`
 		After      string `json:"after"`
+		Deleted    bool   `json:"deleted"`
 		Repository struct {
 			FullName string `json:"full_name"`
 			CloneURL string `json:"clone_url"`
@@ -370,6 +375,12 @@ func parseGenericGitPush(body []byte) (plugin.InboundEvent, error) {
 	if pusher == "" {
 		pusher = probe.Pusher.Username
 	}
+	// Branch / tag deletion is signalled either by the explicit
+	// `deleted: true` flag (GitHub / Gitea) or by an all-zero `after`
+	// SHA (older shapes). Both are honoured so the preview-deploy flow
+	// can tear down ephemeral workloads even when a vendor omits the
+	// boolean flag.
+	deleted := probe.Deleted || isZeroSHA(probe.After)
 	evt := plugin.InboundEvent{
 		Kind: "git-push",
 		Git: &plugin.GitEvent{
@@ -377,6 +388,7 @@ func parseGenericGitPush(body []byte) (plugin.InboundEvent, error) {
 			Ref:       probe.Ref,
 			CommitSHA: probe.After,
 			Pusher:    pusher,
+			Deleted:   deleted,
 		},
 	}
 	if strings.HasPrefix(probe.Ref, "refs/heads/") {
@@ -388,3 +400,19 @@ func parseGenericGitPush(body []byte) (plugin.InboundEvent, error) {
 	}
 	return evt, nil
 }
+
+// isZeroSHA reports whether sha is the all-zero "no commit" sentinel
+// that vendors emit in the `after` field of a branch- or tag-delete
+// push event (canonically 40 zeros). It is length-tolerant because some
+// test fixtures truncate the SHA: any string of seven or more characters
+// consisting solely of '0' qualifies. The empty string and anything
+// shorter than seven characters return false.
+func isZeroSHA(sha string) bool {
+	// Seven is the shortest length accepted as a (possibly truncated) SHA.
+	const minLen = 7
+	if len(sha) < minLen {
+		return false
+	}
+	// Every byte must be the ASCII digit zero.
+	return strings.Count(sha, "0") == len(sha)
+}
@@ -0,0 +1,81 @@
+package plugin
+
+import (
+	"log/slog"
+
+	"github.com/alexei/tinyforge/internal/crypto"
+	"github.com/alexei/tinyforge/internal/notify"
+)
+
+// DispatchNotificationForWorkload sends `event` to every notification
+// route configured for the workload. Resolution order:
+//
+//  1. workload_notifications rows matching `event.Type` — multi-route
+//     fan-out (e.g. Slack alerts + Discord successes per workload).
+//  2. If zero matching rows AND the legacy single-URL columns on the
+//     workload row are set, send to that URL — backwards compat for
+//     installs that pre-date the new table.
+//  3. Otherwise, fall through to settings.notification_url so the global
+//     destination still fires for workloads with no per-row config.
+//
+// Per-route secrets are decrypted via deps.EncKey before sending; a failed
+// decrypt degrades to "send unsigned" with a warning rather than dropping
+// the alert, and the operator just needs to re-save the secret. Legacy and
+// global secrets are passed as stored (NOTE(review): confirm plaintext).
+// Fire-and-forget: failures are logged inside deps.Notifier, not returned.
+//
+// Callers (static / dockerfile / image / compose plugins) pass an
+// already-populated Event; this helper does not synthesize the payload
+// shape, only the routing.
+func DispatchNotificationForWorkload(deps Deps, w Workload, event notify.Event) {
+	if deps.Notifier == nil {
+		return
+	}
+	rows, err := deps.Store.ListWorkloadNotifications(w.ID)
+	if err != nil {
+		slog.Warn("notify: list workload routes failed",
+			"workload", w.ID, "error", err)
+		rows = nil
+	}
+
+	matched := 0
+	for _, n := range rows {
+		if !n.MatchesEventType(event.Type) {
+			continue
+		}
+		matched++
+		secret := ""
+		if n.Secret != "" {
+			dec, derr := crypto.Decrypt(deps.EncKey, n.Secret)
+			if derr != nil {
+				slog.Warn("notify: decrypt workload secret failed — sending unsigned",
+					"workload", w.ID, "route", n.Name, "error", derr)
+			} else {
+				secret = dec
+			}
+		}
+		deps.Notifier.SendSigned(n.URL, secret, notify.TierSite, event)
+	}
+	if matched > 0 {
+		return
+	}
+
+	// Legacy fallback: single per-workload destination on workloads.notification_url.
+	if w.NotificationURL != "" {
+		deps.Notifier.SendSigned(w.NotificationURL, w.NotificationSecret, notify.TierSite, event)
+		return
+	}
+
+	// Global fallback so a one-line config in settings still notifies
+	// every workload without a per-row override.
+	settings, err := deps.Store.GetSettings()
+	if err != nil {
+		slog.Warn("notify: settings lookup for global fallback failed",
+			"workload", w.ID, "error", err)
+		return
+	}
+	if settings.NotificationURL == "" {
+		return
+	}
+	deps.Notifier.SendSigned(settings.NotificationURL, settings.NotificationSecret, notify.TierSettings, event)
+}
@@ -32,6 +32,23 @@ type Config struct {

 type source struct{}

+// composeRunner is the slice of stack.Compose this plugin actually
+// drives. Defined locally per the "interfaces where they are used"
+// idiom so the plugin can be unit-tested without a real docker compose
+// binary. `*stack.Compose` satisfies it implicitly.
+type composeRunner interface {
+	Up(ctx context.Context, projectName, yamlPath string) (string, error)
+	Down(ctx context.Context, projectName string, removeVolumes bool) (string, error)
+	Ps(ctx context.Context, projectName, yamlPath string) ([]stack.Service, error)
+}
+
+// newComposeRunner returns the runner the plugin should call. Tests
+// swap this var with a fake; production code never touches it. The
+// indirection costs one function-pointer dereference per Deploy /
+// Teardown / Reconcile call — negligible against the docker compose
+// exec it gates.
+var newComposeRunner = func() composeRunner { return stack.NewCompose("") }
+
 func init() { plugin.RegisterSource(&source{}) }

 func (*source) Kind() string { return "compose" }
@@ -82,7 +99,7 @@ func (*source) Deploy(ctx context.Context, deps plugin.Deps, w plugin.Workload,
 		return fmt.Errorf("compose source: write yaml: %w", err)
 	}

-	compose := stack.NewCompose("")
+	compose := newComposeRunner()
 	out, err := compose.Up(ctx, projectName, yamlPath)
 	if err != nil {
 		return fmt.Errorf("compose source: docker compose up: %w (output: %s)", err, truncate(out, 1024))
@@ -105,7 +122,7 @@ func (*source) Teardown(ctx context.Context, deps plugin.Deps, w plugin.Workload
 	cfg, _ := plugin.SourceConfigOf[Config](w)
 	projectName := composeProjectName(cfg.ComposeProjectName, w)

-	compose := stack.NewCompose("")
+	compose := newComposeRunner()
 	if _, err := compose.Down(ctx, projectName, true); err != nil {
 		// Log but proceed — the DB rows must not be orphaned.
 		slog.Warn("compose source: docker compose down", "workload", w.ID, "error", err)
@@ -139,7 +156,7 @@ func (*source) Reconcile(ctx context.Context, deps plugin.Deps, w plugin.Workloa
 	projectName := composeProjectName(cfg.ComposeProjectName, w)
 	yamlPath, _ := writeYAMLIfChanged(w.ID, cfg.ComposeYAML)

-	compose := stack.NewCompose("")
+	compose := newComposeRunner()
 	services, err := compose.Ps(ctx, projectName, yamlPath)
 	if err != nil {
 		// Likely no compose project running for this workload. Mark
@@ -162,7 +179,7 @@ func (*source) Reconcile(ctx context.Context, deps plugin.Deps, w plugin.Workloa

 // syncContainers shares its body with Reconcile minus the missing-row
 // fallback — Deploy expects compose ps to succeed since `up` just ran.
-func syncContainers(ctx context.Context, deps plugin.Deps, compose *stack.Compose, w plugin.Workload, projectName, yamlPath string) error {
+func syncContainers(ctx context.Context, deps plugin.Deps, compose composeRunner, w plugin.Workload, projectName, yamlPath string) error {
 	services, err := compose.Ps(ctx, projectName, yamlPath)
 	if err != nil {
 		return fmt.Errorf("compose ps: %w", err)
@@ -204,7 +221,17 @@ var projectNameSanitizer = regexp.MustCompile(`[^a-z0-9_-]`)

 func composeProjectName(explicit string, w plugin.Workload) string {
 	if explicit != "" {
-		return explicit
+		// Apply the same sanitizer to operator-supplied names so a value
+		// like "--foo" cannot reach the docker CLI and be re-parsed as a
+		// flag. Reuses the canonical lower+[^a-z0-9_-]→"-" + trim path.
+		san := strings.ToLower(explicit)
+		san = projectNameSanitizer.ReplaceAllString(san, "-")
+		san = strings.Trim(san, "-")
+		if san != "" {
+			return san
+		}
+		// Fall through to the derived name if sanitization stripped
+		// everything (operator passed e.g. "---" — degenerate input).
 	}
 	name := strings.ToLower(w.Name)
 	name = projectNameSanitizer.ReplaceAllString(name, "-")
@@ -0,0 +1,512 @@
+package compose
+
+import (
+	"context"
+	"encoding/json"
+	"errors"
+	"strings"
+	"sync"
+	"testing"
+
+	"github.com/alexei/tinyforge/internal/stack"
+	"github.com/alexei/tinyforge/internal/store"
+	"github.com/alexei/tinyforge/internal/workload/plugin"
+)
+
+// fakeRunner stands in for *stack.Compose. Every method records its
+// inputs and returns whatever the test set on the corresponding field.
+// Defaults are happy-path: empty services from Ps, no error from Up /
+// Down. Fields are slice-typed so a single fakeRunner can serve a
+// sequence of calls (Deploy issues Up + Ps in order).
+type fakeRunner struct {
+	mu sync.Mutex
+
+	upCalls     []runnerCall
+	upOuts      []string
+	upErrs      []error
+	downCalls   []runnerCall
+	downOuts    []string
+	downErrs    []error
+	psCalls     []runnerCall
+	psResults   [][]stack.Service
+	psErrs      []error
+	upCallIdx   int
+	psCallIdx   int
+	downCallIdx int
+}
+
+type runnerCall struct {
+	ProjectName   string
+	YAMLPath      string
+	RemoveVolumes bool
+}
+
+func (f *fakeRunner) Up(_ context.Context, projectName, yamlPath string) (string, error) {
+	f.mu.Lock()
+	defer f.mu.Unlock()
+	f.upCalls = append(f.upCalls, runnerCall{ProjectName: projectName, YAMLPath: yamlPath})
+	out, err := pop(f.upOuts, f.upErrs, f.upCallIdx)
+	f.upCallIdx++
+	return out, err
+}
+
+func (f *fakeRunner) Down(_ context.Context, projectName string, removeVolumes bool) (string, error) {
+	f.mu.Lock()
+	defer f.mu.Unlock()
+	f.downCalls = append(f.downCalls, runnerCall{ProjectName: projectName, RemoveVolumes: removeVolumes})
+	out, err := pop(f.downOuts, f.downErrs, f.downCallIdx)
+	f.downCallIdx++
+	return out, err
+}
+
+func (f *fakeRunner) Ps(_ context.Context, projectName, yamlPath string) ([]stack.Service, error) {
+	f.mu.Lock()
+	defer f.mu.Unlock()
+	f.psCalls = append(f.psCalls, runnerCall{ProjectName: projectName, YAMLPath: yamlPath})
+
+	idx := f.psCallIdx
+	f.psCallIdx++
+	var svcs []stack.Service
+	if idx < len(f.psResults) {
+		svcs = f.psResults[idx]
+	}
+	var err error
+	if idx < len(f.psErrs) {
+		err = f.psErrs[idx]
+	}
+	return svcs, err
+}
+
+// pop returns the nth element of outs/errs or zero values when n is
+// past the end. Lets a test set a single expected response without
+// padding slices for every other call.
+func pop(outs []string, errs []error, n int) (string, error) {
+	var out string
+	if n < len(outs) {
+		out = outs[n]
+	}
+	var err error
+	if n < len(errs) {
+		err = errs[n]
+	}
+	return out, err
+}
+
+// withFakeRunner swaps the package-level newComposeRunner for one test and
+// restores it on cleanup, so callers must not run in parallel. It returns
+// nothing: tests that inspect the fake afterwards keep their own pointer.
+func withFakeRunner(t *testing.T, f *fakeRunner) {
+	t.Helper()
+	orig := newComposeRunner
+	newComposeRunner = func() composeRunner { return f }
+	t.Cleanup(func() { newComposeRunner = orig })
+}
+
+func testStore(t *testing.T) *store.Store {
+	t.Helper()
+	st, err := store.New(":memory:")
+	if err != nil {
+		t.Fatalf("open store: %v", err)
+	}
+	t.Cleanup(func() { _ = st.Close() })
+	return st
+}
+
+// seedWorkload creates the parent workload row that container rows FK
+// onto. Returns the workload's ID so callers can reuse it.
+func seedWorkload(t *testing.T, st *store.Store, name, yamlText string) string {
+	t.Helper()
+	cfg := Config{ComposeYAML: yamlText}
+	body, err := json.Marshal(cfg)
+	if err != nil {
+		t.Fatalf("marshal config: %v", err)
+	}
+	w, err := st.CreateWorkload(store.Workload{
+		Kind:         "plugin",
+		Name:         name,
+		SourceKind:   "compose",
+		SourceConfig: string(body),
+	})
+	if err != nil {
+		t.Fatalf("create workload: %v", err)
+	}
+	return w.ID
+}
+
+func TestDeploy_HappyPath(t *testing.T) {
+	withTempDir(t) // isolates the YAML scratch dir under t.TempDir()
+
+	deps := plugin.Deps{Store: testStore(t)}
+	yamlText := "services:\n  web:\n    image: nginx:alpine\n"
+	wid := seedWorkload(t, deps.Store, "myapp", yamlText)
+	w := plugin.Workload{
+		ID:           wid,
+		Name:         "myapp",
+		SourceKind:   "compose",
+		SourceConfig: mustMarshalConfig(t, Config{ComposeYAML: yamlText}),
+	}
+
+	fake := &fakeRunner{
+		psResults: [][]stack.Service{{
+			{Service: "web", State: "running", Status: "Up 5 seconds"},
+		}},
+	}
+	withFakeRunner(t, fake)
+
+	src := &source{}
+	if err := src.Deploy(context.Background(), deps, w, plugin.DeploymentIntent{}); err != nil {
+		t.Fatalf("Deploy: %v", err)
+	}
+
+	// Up called exactly once with the workload-derived project name.
+	if len(fake.upCalls) != 1 {
+		t.Fatalf("Up called %d times, want 1", len(fake.upCalls))
+	}
+	if !strings.HasPrefix(fake.upCalls[0].ProjectName, "tf-myapp-") {
+		t.Errorf("Up projectName = %q, want prefix tf-myapp-", fake.upCalls[0].ProjectName)
+	}
+	if !strings.HasSuffix(fake.upCalls[0].YAMLPath, "compose.yml") {
+		t.Errorf("Up yamlPath = %q, want suffix compose.yml", fake.upCalls[0].YAMLPath)
+	}
+
+	// Ps follows Up to enumerate the resulting containers.
+	if len(fake.psCalls) != 1 {
+		t.Fatalf("Ps called %d times, want 1", len(fake.psCalls))
+	}
+
+	// Service row written.
+	row, err := deps.Store.GetContainerByID(wid + ":web")
+	if err != nil {
+		t.Fatalf("get container row: %v", err)
+	}
+	if row.WorkloadID != wid {
+		t.Errorf("row.WorkloadID = %q, want %q", row.WorkloadID, wid)
+	}
+	if row.Role != "web" {
+		t.Errorf("row.Role = %q, want %q", row.Role, "web")
+	}
+	if row.State != "running" {
+		t.Errorf("row.State = %q, want %q", row.State, "running")
+	}
+}
+
+func TestDeploy_EmptyYAMLConfig_RejectsBeforeExec(t *testing.T) {
+	deps := plugin.Deps{Store: testStore(t)}
+	wid := seedWorkload(t, deps.Store, "empty", "services:\n  web:\n    image: x\n")
+	w := plugin.Workload{
+		ID:           wid,
+		Name:         "empty",
+		SourceKind:   "compose",
+		SourceConfig: mustMarshalConfig(t, Config{ComposeYAML: ""}),
+	}
+
+	fake := &fakeRunner{}
+	withFakeRunner(t, fake)
+
+	src := &source{}
+	err := src.Deploy(context.Background(), deps, w, plugin.DeploymentIntent{})
+	if err == nil {
+		t.Fatal("Deploy accepted empty compose_yaml")
+	}
+	if !strings.Contains(err.Error(), "empty compose_yaml") {
+		t.Errorf("error = %v, want substring \"empty compose_yaml\"", err)
+	}
+	if len(fake.upCalls) != 0 {
+		t.Errorf("Up should not have been called; got %d calls", len(fake.upCalls))
+	}
+}
+
+func TestDeploy_UpFailure_PropagatesAndIncludesTruncatedOutput(t *testing.T) {
+	withTempDir(t)
+
+	deps := plugin.Deps{Store: testStore(t)}
+	yamlText := "services:\n  web:\n    image: bad-image\n"
+	wid := seedWorkload(t, deps.Store, "fail", yamlText)
+	w := plugin.Workload{
+		ID:           wid,
+		Name:         "fail",
+		SourceKind:   "compose",
+		SourceConfig: mustMarshalConfig(t, Config{ComposeYAML: yamlText}),
+	}
+
+	bigOut := strings.Repeat("docker compose log noise ", 200) // > 1024 bytes
+	fake := &fakeRunner{
+		upOuts: []string{bigOut},
+		upErrs: []error{errors.New("exit status 1")},
+	}
+	withFakeRunner(t, fake)
+
+	src := &source{}
+	err := src.Deploy(context.Background(), deps, w, plugin.DeploymentIntent{})
+	if err == nil {
+		t.Fatal("Deploy accepted Up failure")
+	}
+	if !strings.Contains(err.Error(), "docker compose up") {
+		t.Errorf("error = %v, want substring \"docker compose up\"", err)
+	}
+	if !strings.Contains(err.Error(), "exit status 1") {
+		t.Errorf("error = %v, want wrapped Up err", err)
+	}
+	if !strings.Contains(err.Error(), "(truncated)") {
+		t.Errorf("error = %v, want truncated-output marker", err)
+	}
+	// Ps must not be called when Up failed.
+	if len(fake.psCalls) != 0 {
+		t.Errorf("Ps called %d times after Up failure; want 0", len(fake.psCalls))
+	}
+}
+
+func TestDeploy_UpSucceedsButPsFails_SurfacesError(t *testing.T) {
+	// `up` succeeded but enumerate failed — Deploy must surface so the UI
+	// doesn't show an empty containers index for a running stack.
+	withTempDir(t)
+
+	deps := plugin.Deps{Store: testStore(t)}
+	yamlText := "services:\n  web:\n    image: nginx\n"
+	wid := seedWorkload(t, deps.Store, "psfail", yamlText)
+	w := plugin.Workload{
+		ID:           wid,
+		Name:         "psfail",
+		SourceKind:   "compose",
+		SourceConfig: mustMarshalConfig(t, Config{ComposeYAML: yamlText}),
+	}
+
+	fake := &fakeRunner{
+		psErrs: []error{errors.New("compose ps boom")},
+	}
+	withFakeRunner(t, fake)
+
+	src := &source{}
+	err := src.Deploy(context.Background(), deps, w, plugin.DeploymentIntent{})
+	if err == nil {
+		t.Fatal("Deploy ignored Ps failure")
+	}
+	if !strings.Contains(err.Error(), "sync container rows") {
+		t.Errorf("error = %v, want substring \"sync container rows\"", err)
+	}
+}
+
+func TestTeardown_DropsContainerRows_EvenWhenDownFails(t *testing.T) {
+	// docker compose down failing must not orphan rows in the DB.
+	withTempDir(t)
+	deps := plugin.Deps{Store: testStore(t)}
+	wid := seedWorkload(t, deps.Store, "tdown", "services:\n  web:\n    image: nginx\n")
+
+	// Seed two service rows the way Deploy would.
+	for _, role := range []string{"web", "db"} {
+		if err := deps.Store.UpsertContainer(store.Container{
+			ID:           wid + ":" + role,
+			WorkloadID:   wid,
+			WorkloadKind: "compose",
+			Role:         role,
+			Host:         "local",
+			State:        "running",
+		}); err != nil {
+			t.Fatalf("seed container: %v", err)
+		}
+	}
+
+	fake := &fakeRunner{downErrs: []error{errors.New("compose project unknown")}}
+	withFakeRunner(t, fake)
+
+	src := &source{}
+	w := plugin.Workload{
+		ID:           wid,
+		Name:         "tdown",
+		SourceKind:   "compose",
+		SourceConfig: mustMarshalConfig(t, Config{ComposeYAML: "services:\n  web:\n    image: nginx\n"}),
+	}
+	if err := src.Teardown(context.Background(), deps, w); err != nil {
+		t.Fatalf("Teardown: %v", err)
+	}
+
+	// Down requested removeVolumes=true (matches the docstring claim).
+	if len(fake.downCalls) != 1 {
+		t.Fatalf("Down calls = %d, want 1", len(fake.downCalls))
+	}
+	if !fake.downCalls[0].RemoveVolumes {
+		t.Errorf("Down removeVolumes = false, want true (workload teardown is destructive)")
+	}
+
+	// Rows gone despite the Down error.
+	for _, role := range []string{"web", "db"} {
+		if _, err := deps.Store.GetContainerByID(wid + ":" + role); !errors.Is(err, store.ErrNotFound) {
+			t.Errorf("container row %q survived teardown: err=%v", role, err)
+		}
+	}
+}
+
+func TestTeardown_HappyPath(t *testing.T) {
+	withTempDir(t)
+	deps := plugin.Deps{Store: testStore(t)}
+	wid := seedWorkload(t, deps.Store, "tdown2", "services:\n  web:\n    image: nginx\n")
+
+	if err := deps.Store.UpsertContainer(store.Container{
+		ID:           wid + ":web",
+		WorkloadID:   wid,
+		WorkloadKind: "compose",
+		Role:         "web",
+		Host:         "local",
+		State:        "running",
+	}); err != nil {
+		t.Fatalf("seed: %v", err)
+	}
+
+	fake := &fakeRunner{}
+	withFakeRunner(t, fake)
+
+	src := &source{}
+	w := plugin.Workload{
+		ID:           wid,
+		Name:         "tdown2",
+		SourceKind:   "compose",
+		SourceConfig: mustMarshalConfig(t, Config{ComposeYAML: "services:\n  web:\n    image: nginx\n"}),
+	}
+	if err := src.Teardown(context.Background(), deps, w); err != nil {
+		t.Fatalf("Teardown: %v", err)
+	}
+	if len(fake.downCalls) != 1 {
+		t.Errorf("Down calls = %d, want 1", len(fake.downCalls))
+	}
+	if _, err := deps.Store.GetContainerByID(wid + ":web"); !errors.Is(err, store.ErrNotFound) {
+		t.Errorf("container row survived teardown: err=%v", err)
+	}
+}
+
+func TestReconcile_PsSuccess_UpsertsRows(t *testing.T) {
+	withTempDir(t)
+	deps := plugin.Deps{Store: testStore(t)}
+	yamlText := "services:\n  web:\n    image: nginx\n  db:\n    image: postgres\n"
+	wid := seedWorkload(t, deps.Store, "rec", yamlText)
+
+	fake := &fakeRunner{
+		psResults: [][]stack.Service{{
+			{Service: "web", State: "running"},
+			{Service: "db", State: "running"},
+		}},
+	}
+	withFakeRunner(t, fake)
+
+	src := &source{}
+	w := plugin.Workload{
+		ID:           wid,
+		Name:         "rec",
+		SourceKind:   "compose",
+		SourceConfig: mustMarshalConfig(t, Config{ComposeYAML: yamlText}),
+	}
+	if err := src.Reconcile(context.Background(), deps, w); err != nil {
+		t.Fatalf("Reconcile: %v", err)
+	}
+
+	for _, role := range []string{"web", "db"} {
+		row, err := deps.Store.GetContainerByID(wid + ":" + role)
+		if err != nil {
+			t.Errorf("row %q missing after reconcile: %v", role, err)
+			continue
+		}
+		if row.State != "running" {
+			t.Errorf("row %q state = %q, want \"running\"", role, row.State)
+		}
+	}
+}
+
+func TestReconcile_PsFailure_MarksExistingRowsMissing(t *testing.T) {
+	// When compose ps fails (project unknown to Docker), the reconciler
+	// flips existing rows to "missing" rather than deleting them — the UI
+	// surfaces the desync to the operator.
+	withTempDir(t)
+	deps := plugin.Deps{Store: testStore(t)}
+	yamlText := "services:\n  web:\n    image: nginx\n"
+	wid := seedWorkload(t, deps.Store, "missing", yamlText)
+
+	if err := deps.Store.UpsertContainer(store.Container{
+		ID:           wid + ":web",
+		WorkloadID:   wid,
+		WorkloadKind: "compose",
+		Role:         "web",
+		Host:         "local",
+		State:        "running",
+	}); err != nil {
+		t.Fatalf("seed: %v", err)
+	}
+
+	fake := &fakeRunner{psErrs: []error{errors.New("no such project")}}
+	withFakeRunner(t, fake)
+
+	src := &source{}
+	w := plugin.Workload{
+		ID:           wid,
+		Name:         "missing",
+		SourceKind:   "compose",
+		SourceConfig: mustMarshalConfig(t, Config{ComposeYAML: yamlText}),
+	}
+	if err := src.Reconcile(context.Background(), deps, w); err != nil {
+		t.Fatalf("Reconcile returned %v; should be nil even on Ps failure", err)
+	}
+
+	row, err := deps.Store.GetContainerByID(wid + ":web")
+	if err != nil {
+		t.Fatalf("row missing entirely (should be marked, not deleted): %v", err)
+	}
+	if row.State != "missing" {
+		t.Errorf("row.State = %q, want \"missing\"", row.State)
+	}
+}
+
+func TestReconcile_FallsBackToStatusWhenStateEmpty(t *testing.T) {
+	// Some compose versions populate Status (human string) but not State
+	// (enum) for non-running services. upsertServiceRow falls back to
+	// Status; verify that here.
+	withTempDir(t)
+	deps := plugin.Deps{Store: testStore(t)}
+	yamlText := "services:\n  worker:\n    image: alpine\n"
+	wid := seedWorkload(t, deps.Store, "fallback", yamlText)
+
+	fake := &fakeRunner{
+		psResults: [][]stack.Service{{
+			{Service: "worker", State: "", Status: "Exit 0"},
+		}},
+	}
+	withFakeRunner(t, fake)
+
+	src := &source{}
+	w := plugin.Workload{
+		ID:           wid,
+		Name:         "fallback",
+		SourceKind:   "compose",
+		SourceConfig: mustMarshalConfig(t, Config{ComposeYAML: yamlText}),
+	}
+	if err := src.Reconcile(context.Background(), deps, w); err != nil {
+		t.Fatalf("Reconcile: %v", err)
+	}
+
+	row, err := deps.Store.GetContainerByID(wid + ":worker")
+	if err != nil {
+		t.Fatalf("get row: %v", err)
+	}
+	if row.State != "Exit 0" {
+		t.Errorf("row.State = %q, want \"Exit 0\" (Status fallback)", row.State)
+	}
+}
+
+// mustMarshalConfig is a small helper that converts a Config to the
+// raw-JSON shape SourceConfig expects. Tests use it instead of
+// hand-rolling the string so a Config field rename can't drift the test
+// fixture from the production decoder.
+func mustMarshalConfig(t *testing.T, cfg Config) json.RawMessage {
+	t.Helper()
+	b, err := json.Marshal(cfg)
+	if err != nil {
+		t.Fatalf("marshal config: %v", err)
+	}
+	return json.RawMessage(b)
+}
+
+// Compile-time guards: *stack.Compose must continue to satisfy
+// composeRunner so the production path keeps building, and the fake
+// must continue to satisfy it too so a drift in the interface shape
+// fails the build here rather than at runtime.
+var (
+	_ composeRunner = (*stack.Compose)(nil)
+	_ composeRunner = (*fakeRunner)(nil)
+)
@@ -0,0 +1,574 @@
+package dockerfile
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"io"
+	"log/slog"
+	"os"
+	"strconv"
+	"strings"
+	"time"
+
+	"github.com/alexei/tinyforge/internal/crypto"
+	"github.com/alexei/tinyforge/internal/docker"
+	"github.com/alexei/tinyforge/internal/events"
+	"github.com/alexei/tinyforge/internal/notify"
+	"github.com/alexei/tinyforge/internal/proxy"
+	"github.com/alexei/tinyforge/internal/staticsite"
+	"github.com/alexei/tinyforge/internal/store"
+	"github.com/alexei/tinyforge/internal/workload/plugin"
+)
+
+// healthCheckDelay is the grace window deploy waits after
+// StartContainer before it probes IsContainerRunning. Mirrors the
+// static plugin's window — short enough not to slow happy-path
+// deploys, long enough to catch crash-on-boot failures (missing env,
+// bad CMD, port conflict). The wait is abandoned early if the deploy
+// context is cancelled.
+const healthCheckDelay = 3 * time.Second
+
+// deploy runs one end-to-end sync of a dockerfile workload:
+//
+//  1. fetch the latest commit SHA from the configured git provider
+//  2. skip if SHA + container + proxy are all still healthy
+//  3. clone the repo into a temp dir
+//  4. resolve the build context + Dockerfile location
+//  5. `docker build -t <tag> -f <dockerfile> <context>`
+//  6. recreate the container with the new image
+//  7. health-probe the container, surface logs on failure
+//  8. reconfigure the proxy route
+//  9. tear down the previous container (different ID) once we're sure
+//     the new one is healthy and proxied
+//
+// Each step writes its own status update so the dashboard's runtime-
+// state panel can show a useful intermediate state when the deploy
+// stalls on the slow step (almost always the build).
+//
+// intent.Reason selects between a forced rebuild ("" / "manual" /
+// "promote") and the unchanged-SHA short-circuit (anything else).
+//
+// Returns nil on a successful deploy or when nothing needed to change.
+// On failure the persisted status is set to "failed" with a sanitized
+// message and the underlying error is returned (wrapped with the step
+// name). A proxy configuration failure is only logged: the deploy is
+// still recorded as "deployed".
+func deploy(ctx context.Context, deps plugin.Deps, w plugin.Workload, intent plugin.DeploymentIntent) error {
+	cfg, err := plugin.SourceConfigOf[Config](w)
+	if err != nil {
+		return fmt.Errorf("dockerfile source: decode config: %w", err)
+	}
+
+	prev, prevContainer, err := loadState(deps, w)
+	if err != nil {
+		return err
+	}
+
+	// Force a full rebuild on manual / promote / first-time deploys
+	// (no Reason at all also implies manual). Schedule / git triggers
+	// honour the unchanged-SHA short-circuit so cron polling does not
+	// rebuild minute-by-minute when nothing changed.
+	force := intent.Reason == "" || intent.Reason == "manual" || intent.Reason == "promote"
+
+	// Decrypt the access token if present. Token never escapes this
+	// frame: any error message routes through sanitizeError(_, token)
+	// which redacts the literal substring. A decrypt failure is logged
+	// and the deploy continues without a token.
+	token := ""
+	if cfg.AccessToken != "" {
+		decrypted, derr := crypto.Decrypt(deps.EncKey, cfg.AccessToken)
+		if derr != nil {
+			slog.Warn("dockerfile source: failed to decrypt access token",
+				"workload", w.Name, "error", derr)
+		} else {
+			token = decrypted
+		}
+	}
+
+	provider, err := staticsite.NewGitProvider(staticsite.ProviderType(cfg.Provider), cfg.BaseURL, token)
+	if err != nil {
+		updateStatus(deps, w, "failed", prev.LastCommitSHA,
+			sanitizeError(fmt.Sprintf("create provider: %v", err), token))
+		return fmt.Errorf("create provider: %w", err)
+	}
+
+	latestSHA, err := provider.GetLatestCommitSHA(ctx, cfg.RepoOwner, cfg.RepoName, cfg.Branch)
+	if err != nil {
+		updateStatus(deps, w, "failed", prev.LastCommitSHA,
+			sanitizeError(fmt.Sprintf("fetch commit SHA: %v", err), token))
+		return fmt.Errorf("get latest commit: %w", err)
+	}
+
+	domain := primaryDomain(deps, w)
+
+	prevContainerID := ""
+	prevProxyRouteID := ""
+	if prevContainer != nil {
+		prevContainerID = prevContainer.ContainerID
+		prevProxyRouteID = prevContainer.ProxyRouteID
+	}
+	// Short-circuit: SHA unchanged AND container is still running AND
+	// (if there's a public face) the proxy route still exists. Manual
+	// deploys skip this entirely.
+	//
+	// We deliberately do NOT gate this on prev.Status == "deployed". A
+	// transient failure (e.g. a one-off proxy-check error) leaves the
+	// persisted status as "failed"; if we required "deployed" here, every
+	// subsequent cron/git poll with the same SHA would fall through to a
+	// full clone + docker build despite a perfectly healthy running
+	// container — a rebuild storm that burns CPU/disk until a new commit
+	// lands. Instead we trust the live container/proxy state and heal the
+	// stale status via healUnchanged.
+	if !force && latestSHA == prev.LastCommitSHA && prevContainerID != "" {
+		// An IsContainerRunning error is treated the same as "not
+		// running": running stays false and we fall through to a redeploy.
+		running, _ := deps.Docker.IsContainerRunning(ctx, prevContainerID)
+		switch {
+		case !running:
+			slog.Info("dockerfile: container not running, forcing redeploy", "workload", w.Name)
+		case domain != "":
+			proxyOK, perr := deps.Proxy.RouteExists(ctx, domain)
+			switch {
+			case perr != nil:
+				slog.Warn("dockerfile: proxy check failed, forcing redeploy",
+					"workload", w.Name, "error", perr)
+			case !proxyOK:
+				slog.Info("dockerfile: proxy route missing, forcing redeploy", "workload", w.Name)
+			default:
+				return healUnchanged(deps, w, prev, latestSHA)
+			}
+		default:
+			return healUnchanged(deps, w, prev, latestSHA)
+		}
+	}
+
+	updateStatus(deps, w, "syncing", prev.LastCommitSHA, "")
+	publishEvent(deps, w, "syncing")
+
+	// Clone the repo into a temp dir. We always download the entire
+	// repo tree (folderPath = ""); a ContextPath subset is applied
+	// at build time, not at download time, so a Dockerfile in
+	// `./docker/Dockerfile` with `ContextPath=""` still works.
+	cloneDir, err := os.MkdirTemp("", "tf-build-"+idShort(w)+"-*")
+	if err != nil {
+		updateStatus(deps, w, "failed", prev.LastCommitSHA,
+			sanitizeError(fmt.Sprintf("create clone dir: %v", err), token))
+		return fmt.Errorf("create clone dir: %w", err)
+	}
+	defer os.RemoveAll(cloneDir)
+
+	if err := provider.DownloadFolder(ctx, cfg.RepoOwner, cfg.RepoName, cfg.Branch, "", cloneDir); err != nil {
+		updateStatus(deps, w, "failed", prev.LastCommitSHA,
+			sanitizeError(fmt.Sprintf("download repo: %v", err), token))
+		return fmt.Errorf("download repo: %w", err)
+	}
+
+	// Resolve the build context (with symlink-aware escape check) and
+	// verify the Dockerfile is actually present before sending the
+	// build off to the daemon.
+	contextDir, err := resolveContextDir(cloneDir, cfg.ContextPath)
+	if err != nil {
+		updateStatus(deps, w, "failed", latestSHA,
+			sanitizeError(fmt.Sprintf("resolve context: %v", err), token))
+		return fmt.Errorf("resolve context: %w", err)
+	}
+	if err := verifyDockerfileExists(contextDir, cfg.DockerfilePath); err != nil {
+		updateStatus(deps, w, "failed", latestSHA,
+			sanitizeError(err.Error(), token))
+		return err
+	}
+
+	imageTag := imageTagFor(w)
+	updateStatus(deps, w, "building", latestSHA, "")
+	publishEvent(deps, w, "building")
+	// Bridge per-line build output onto the event bus so /api/events
+	// subscribers (the dashboard's live tail) can show progress while
+	// the daemon chugs. The bus is non-blocking — slow subscribers drop
+	// events rather than backpressure the build — so this is safe to
+	// call from the hot scan loop.
+	logFn := func(line string) {
+		publishBuildLog(deps, w, line)
+	}
+	if err := deps.Docker.BuildImageAt(ctx, contextDir, cfg.DockerfilePath, imageTag, logFn); err != nil {
+		updateStatus(deps, w, "failed", latestSHA,
+			sanitizeError(fmt.Sprintf("docker build: %v", err), token))
+		return fmt.Errorf("docker build: %w", err)
+	}
+
+	env := buildEnv(deps, w.ID)
+	containerPort := strconv.Itoa(cfg.Port)
+
+	settings, err := deps.Store.GetSettings()
+	if err != nil {
+		updateStatus(deps, w, "failed", latestSHA,
+			sanitizeError(fmt.Sprintf("get settings: %v", err), token))
+		return fmt.Errorf("get settings: %w", err)
+	}
+
+	networkName := settings.Network
+	networkID, err := deps.Docker.EnsureNetwork(ctx, networkName)
+	if err != nil {
+		updateStatus(deps, w, "failed", latestSHA,
+			sanitizeError(fmt.Sprintf("ensure network: %v", err), token))
+		return fmt.Errorf("ensure network: %w", err)
+	}
+
+	containerName := containerNameFor(w)
+
+	// Per-face proxy labels (Traefik consumes these; NPM ignores them).
+	labels := map[string]string{}
+	if domain != "" {
+		if l := deps.Proxy.ContainerLabels(domain, cfg.Port); l != nil {
+			for k, v := range l {
+				labels[k] = v
+			}
+		}
+	}
+
+	cc := docker.ContainerConfig{
+		Name:         containerName,
+		Image:        imageTag,
+		Env:          env,
+		ExposedPorts: []string{containerPort + "/tcp"},
+		NetworkName:  networkName,
+		NetworkID:    networkID,
+		Labels:       labels,
+		WorkloadID:   w.ID,
+		// Dockerfile workloads are tagged as "build" so the dashboard
+		// and any filtered query can distinguish them from static sites
+		// (which serve files) and image-source containers (which pull
+		// pre-built images from a registry).
+		WorkloadKind: string(store.WorkloadKindBuild),
+		Role:         "",
+	}
+
+	containerID, err := deps.Docker.CreateContainer(ctx, cc)
+	if err != nil {
+		// Name conflict — best-effort cleanup of any prior container
+		// (by ID first; by name as a fallback) and one retry.
+		if prevContainerID != "" {
+			deps.Docker.StopContainer(ctx, prevContainerID, 10)
+			deps.Docker.RemoveContainer(ctx, prevContainerID, true)
+		}
+		removeContainerByName(ctx, deps, containerName)
+
+		containerID, err = deps.Docker.CreateContainer(ctx, cc)
+		if err != nil {
+			updateStatus(deps, w, "failed", latestSHA,
+				sanitizeError(fmt.Sprintf("create container: %v", err), token))
+			return fmt.Errorf("create container: %w", err)
+		}
+	}
+
+	if err := deps.Docker.StartContainer(ctx, containerID); err != nil {
+		deps.Docker.RemoveContainer(ctx, containerID, true)
+		updateStatus(deps, w, "failed", latestSHA,
+			sanitizeError(fmt.Sprintf("start container: %v", err), token))
+		return fmt.Errorf("start container: %w", err)
+	}
+
+	// Brief health-check window — catch crash-on-boot. ctx-aware so a
+	// cancelled deploy returns promptly. On failure surface the tail
+	// of the container's logs as the error reason; that's almost
+	// always what the operator needs to debug.
+	select {
+	case <-ctx.Done():
+		// ctx is already cancelled on this branch, so passing it to the
+		// Docker client would be expected to abort the removal right
+		// away and leave the freshly started container behind. Detach
+		// from the cancellation (keeping ctx's values) and bound the
+		// cleanup with its own short timeout instead.
+		cleanupCtx, cancel := context.WithTimeout(context.WithoutCancel(ctx), 15*time.Second)
+		deps.Docker.RemoveContainer(cleanupCtx, containerID, true)
+		cancel()
+		updateStatus(deps, w, "failed", latestSHA, "deploy cancelled before health check")
+		return ctx.Err()
+	case <-time.After(healthCheckDelay):
+	}
+	running, runErr := deps.Docker.IsContainerRunning(ctx, containerID)
+	if runErr != nil || !running {
+		logMsg := "container exited immediately after start"
+		if logs, logErr := deps.Docker.ContainerLogs(ctx, containerID, false, "40"); logErr == nil {
+			buf, _ := io.ReadAll(logs)
+			logs.Close()
+			if len(buf) > 0 {
+				// Pass `env` so any decrypted KEY=VALUE pair that the
+				// container's startup output happens to echo (think
+				// `RUN echo $DB_PASSWORD` in a debug Dockerfile) is
+				// redacted before it lands in the operator-visible
+				// last_error field.
+				logMsg = sanitizeErrorWithSecrets(string(buf), token, env)
+			}
+		}
+		deps.Docker.RemoveContainer(ctx, containerID, true)
+		updateStatus(deps, w, "failed", latestSHA, logMsg)
+		return fmt.Errorf("container not running: %s", logMsg)
+	}
+
+	// Resolve proxy target: in-network DNS by default, NPM-remote
+	// override uses (settings.ServerIP, hostPort).
+	forwardHost := containerName
+	forwardPort := cfg.Port
+	if settings.NpmRemote && settings.ProxyProvider == "npm" {
+		if settings.ServerIP != "" {
+			hostPort, hpErr := deps.Docker.InspectContainerPort(ctx, containerID, containerPort+"/tcp")
+			if hpErr != nil {
+				slog.Warn("dockerfile: could not get host port for remote NPM",
+					"workload", w.Name, "error", hpErr)
+			} else {
+				forwardHost = settings.ServerIP
+				forwardPort = int(hostPort)
+			}
+		}
+	}
+
+	// Configure proxy if a domain is set. Replace any prior route
+	// in-place so traffic shifts atomically over to the new container.
+	proxyRouteID := prevProxyRouteID
+	if domain != "" {
+		if prevProxyRouteID != "" {
+			deps.Proxy.DeleteRoute(ctx, prevProxyRouteID)
+		}
+		routeID, rerr := deps.Proxy.ConfigureRoute(ctx, domain, forwardHost, forwardPort, proxy.RouteOptions{
+			SSLCertificateID: settings.SSLCertificateID,
+		})
+		if rerr != nil {
+			slog.Warn("dockerfile: failed to configure proxy",
+				"workload", w.Name, "domain", domain,
+				"target", fmt.Sprintf("%s:%d", forwardHost, forwardPort), "error", rerr)
+		} else {
+			proxyRouteID = routeID
+			slog.Info("dockerfile: proxy configured",
+				"workload", w.Name, "domain", domain,
+				"target", fmt.Sprintf("%s:%d", forwardHost, forwardPort), "routeID", routeID)
+		}
+	}
+
+	// Drop the previous container only after the new one is healthy
+	// + routed. Different-ID-than-previous tells us we created a
+	// fresh one (vs returning the same ID via UpsertContainer reuse).
+	if prevContainerID != "" && prevContainerID != containerID {
+		deps.Docker.StopContainer(ctx, prevContainerID, 10)
+		deps.Docker.RemoveContainer(ctx, prevContainerID, true)
+	}
+
+	// Single transactional write of new state + container metadata.
+	// On failure: tear down the just-created container + proxy route
+	// so we don't leave orphans behind for the next deploy to trip
+	// over.
+	if err := saveState(deps, w, func(rs *runtimeState, c *store.Container) {
+		rs.LastCommitSHA = latestSHA
+		rs.LastSyncAt = store.Now()
+		rs.LastError = ""
+		rs.Status = "deployed"
+
+		c.ContainerID = containerID
+		c.ProxyRouteID = proxyRouteID
+		c.Subdomain = domain
+		c.State = "running"
+		c.Port = cfg.Port
+		c.ImageRef = imageTag
+	}); err != nil {
+		slog.Error("dockerfile: failed to persist deploy state — rolling back",
+			"workload", w.Name, "error", err)
+		if proxyRouteID != "" {
+			deps.Proxy.DeleteRoute(ctx, proxyRouteID)
+		}
+		deps.Docker.StopContainer(ctx, containerID, 10)
+		deps.Docker.RemoveContainer(ctx, containerID, true)
+		updateStatus(deps, w, "failed", latestSHA,
+			sanitizeError(fmt.Sprintf("persist deploy state: %v", err), token))
+		return fmt.Errorf("persist deploy state: %w", err)
+	}
+
+	publishEvent(deps, w, "deployed")
+	dispatchBuildNotification(deps, w, domain, "deployed", "")
+
+	slog.Info("dockerfile deployed",
+		"workload", w.Name,
+		"sha", shortSHA(latestSHA),
+		"image", imageTag)
+	return nil
+}
+
+// updateStatus persists a status transition (status, last error and,
+// when non-empty, the commit SHA) and then runs the side effects tied
+// to terminal states: a "failed" transition is written to the event
+// log, and both "deployed" and "failed" dispatch an outbound
+// notification.
+//
+// The deploy success path does not go through here — it calls
+// saveState itself with the full container metadata. This helper is
+// for failure and intermediate transitions where only state moves. A
+// persistence error is logged, not returned; the side effects still
+// run.
+func updateStatus(deps plugin.Deps, w plugin.Workload, status, commitSHA, errMsg string) {
+	mutate := func(rs *runtimeState, c *store.Container) {
+		rs.Status = status
+		rs.LastError = errMsg
+		if commitSHA != "" {
+			rs.LastCommitSHA = commitSHA
+		}
+		// Only terminal statuses touch the container row. In-progress
+		// ones ("syncing", "building") keep whatever the previous
+		// deploy left there.
+		switch status {
+		case "deployed":
+			c.State = "running"
+		case "stopped", "failed":
+			c.State = status
+		}
+	}
+	if saveErr := saveState(deps, w, mutate); saveErr != nil {
+		slog.Error("dockerfile: failed to update status",
+			"id", w.ID, "status", status, "error", saveErr)
+	}
+
+	switch status {
+	case "failed":
+		publishEvent(deps, w, "failed: "+errMsg)
+		dispatchBuildNotification(deps, w, primaryDomain(deps, w), status, errMsg)
+	case "deployed":
+		dispatchBuildNotification(deps, w, primaryDomain(deps, w), status, errMsg)
+	}
+}
+
+// dispatchBuildNotification hands a build event for the workload to
+// plugin.DispatchNotificationForWorkload, which owns the fan-out to
+// the configured notification routes (workload_notifications rows +
+// legacy single URL + global settings fallback) so routing rules stay
+// identical across source kinds.
+//
+// The event type is "build_failure" when status is "failed" and
+// "build_success" for anything else. The URL is "https://<domain>",
+// or left empty when the workload has no domain.
+func dispatchBuildNotification(deps plugin.Deps, w plugin.Workload, domain, status, errMsg string) {
+	evt := notify.Event{
+		Type:    "build_success",
+		Project: w.Name,
+		Error:   errMsg,
+	}
+	if status == "failed" {
+		evt.Type = "build_failure"
+	}
+	if domain != "" {
+		evt.URL = "https://" + domain
+	}
+	plugin.DispatchNotificationForWorkload(deps, w, evt)
+}
+
+// publishEvent records a status change twice: as a persisted event_log
+// row and as a live event on the bus. The message has the form
+// `Build "<name>": <status>`, mirroring the static plugin so the
+// dashboard's audit feed reads consistently across both kinds.
+//
+// Severity is "error" when status begins with "failed", otherwise
+// "info". If the row cannot be persisted the failure is logged and
+// nothing is published on the bus.
+func publishEvent(deps plugin.Deps, w plugin.Workload, status string) {
+	level := "info"
+	if strings.HasPrefix(status, "failed") {
+		level = "error"
+	}
+	text := fmt.Sprintf("Build %q: %s", w.Name, status)
+
+	// Metadata falls back to an empty JSON object if marshalling fails.
+	meta := "{}"
+	if raw, err := json.Marshal(map[string]string{
+		"workload_id":   w.ID,
+		"workload_name": w.Name,
+		"status":        status,
+	}); err != nil {
+		slog.Error("dockerfile: marshal event metadata", "error", err)
+	} else {
+		meta = string(raw)
+	}
+
+	row, err := deps.Store.InsertEvent(store.EventLog{
+		Source:   "dockerfile",
+		Severity: level,
+		Message:  text,
+		Metadata: meta,
+	})
+	if err != nil {
+		slog.Error("dockerfile: failed to persist event log", "error", err)
+		return
+	}
+
+	// The bus payload reuses the ID and timestamp of the stored row.
+	payload := events.EventLogPayload{
+		ID:        row.ID,
+		Source:    "dockerfile",
+		Severity:  level,
+		Message:   text,
+		Metadata:  meta,
+		CreatedAt: row.CreatedAt,
+	}
+	deps.Events.Publish(events.Event{Type: events.EventLog, Payload: payload})
+}
+
+// publishBuildLog forwards one line of build output to the event bus
+// as an EventBuildLog tagged with the workload ID and stream "stdout".
+// Trailing CR/LF characters are stripped first so the UI can render
+// each event as its own row, and lines that are empty after stripping
+// are dropped. Strictly best-effort: the bus drops events under
+// backpressure (slow subscriber, no subscriber at all) and never
+// blocks the build loop.
+func publishBuildLog(deps plugin.Deps, w plugin.Workload, line string) {
+	text := strings.TrimRight(line, "\r\n")
+	if text != "" {
+		payload := events.BuildLogPayload{
+			WorkloadID: w.ID,
+			Line:       text,
+			Stream:     "stdout",
+		}
+		deps.Events.Publish(events.Event{Type: events.EventBuildLog, Payload: payload})
+	}
+}
+
+// healUnchanged is what deploy returns through when it short-circuits:
+// the SHA matches and the live container + proxy are healthy, so there
+// is nothing to build. If an earlier transient failure left the
+// persisted status at anything other than "deployed", the status is
+// repaired (status "deployed", error cleared, container state
+// "running") so the dashboard reflects reality.
+//
+// The repair goes through saveState directly rather than updateStatus
+// so that it does not fire a build-success notification on every poll.
+// It always returns nil; a failed repair is only logged.
+func healUnchanged(deps plugin.Deps, w plugin.Workload, prev runtimeState, latestSHA string) error {
+	slog.Info("dockerfile: no changes", "workload", w.Name, "sha", shortSHA(latestSHA))
+	if prev.Status != "deployed" {
+		repair := func(rs *runtimeState, c *store.Container) {
+			rs.Status = "deployed"
+			rs.LastError = ""
+			c.State = "running"
+		}
+		if healErr := saveState(deps, w, repair); healErr != nil {
+			slog.Warn("dockerfile: failed to heal stale status to deployed",
+				"workload", w.Name, "error", healErr)
+		}
+	}
+	return nil
+}
+
+// removeContainerByName lists the daemon's containers and stops +
+// force-removes every one whose name equals name, so that a name
+// conflict in CreateContainer can be recovered from. It deliberately
+// does not stop at the first match: the recovery path exists because
+// a conflict occurred, and a prior partial deploy can leave more than
+// one matching artifact. Best-effort throughout — a listing error ends
+// the call silently and stop/remove results are ignored. Mirrors the
+// static plugin's helper of the same name.
+func removeContainerByName(ctx context.Context, deps plugin.Deps, name string) {
+	listed, listErr := deps.Docker.ListContainers(ctx, nil)
+	if listErr != nil {
+		return
+	}
+	for _, entry := range listed {
+		if entry.Name != name {
+			continue
+		}
+		deps.Docker.StopContainer(ctx, entry.ID, 10)
+		deps.Docker.RemoveContainer(ctx, entry.ID, true)
+	}
+}
+
+// primaryDomain derives the workload's FQDN from the first public face
+// that has a subdomain or a domain set (mirrors the static plugin's
+// helper of the same name):
+//
+//   - subdomain and domain both set → "<subdomain>.<domain>"
+//   - domain only                   → "<domain>"
+//   - subdomain only                → "<subdomain>.<settings.Domain>",
+//     or the bare subdomain when settings cannot be loaded or carry no
+//     domain
+//
+// Returns "" when no face has either field set.
+func primaryDomain(deps plugin.Deps, w plugin.Workload) string {
+	for _, face := range w.PublicFaces {
+		sub, dom := face.Subdomain, face.Domain
+		if sub == "" {
+			if dom == "" {
+				continue
+			}
+			return dom
+		}
+		if dom != "" {
+			return sub + "." + dom
+		}
+		// Bare subdomain: complete it with the globally configured domain.
+		settings, err := deps.Store.GetSettings()
+		if err == nil && settings.Domain != "" {
+			return sub + "." + settings.Domain
+		}
+		return sub
+	}
+	return ""
+}
+
+// shortSHA truncates a commit SHA for log lines. Keeps the deploy log
+// readable without losing the "is this the same commit?" signal.
+func shortSHA(sha string) string {
+	if len(sha) > 8 {
+		return sha[:8]
+	}
+	return sha
+}
@@ -0,0 +1,131 @@
+// Package dockerfile implements the "dockerfile" source: a git-repo-backed
+// deployable that builds a Docker image from a user-supplied Dockerfile
+// and runs one container. This is the "self-hosted Vercel" Source —
+// users point at a Git repo containing a Dockerfile and Tinyforge
+// handles clone → build → run → proxy in one shot, with no external CI
+// pipeline.
+//
+// Architecturally the plugin sits between `static` (clones a Git repo,
+// builds an image, runs one container) and `image` (richer runtime
+// shape: ports, healthcheck, env, volumes). The deploy pipeline mirrors
+// static — same git-fetch, same image-tag/container-name shape, same
+// container-row state persistence — but the build step uses the
+// operator's Dockerfile instead of generating one.
+//
+// The full pipeline is implemented inline in this package
+// (deploy.go / teardown.go / reconcile.go) so a new dockerfile source
+// kind is usable immediately on init() — no separate registration step
+// in the deployer.
+package dockerfile
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"strings"
+
+	"github.com/alexei/tinyforge/internal/workload/plugin"
+)
+
+// Config is the per-workload source config blob. Mirrors the shape of
+// the static plugin's Config so the UI wizard can largely reuse the
+// existing Git-discovery + branch-picker + repo-picker components.
+//
+// Git-side fields (Provider, BaseURL, RepoOwner, RepoName, Branch,
+// AccessToken) identify the repository; deploy passes them to the git
+// provider to look up the latest commit and download the tree.
+//
+// Build-side fields:
+//
+//   - DockerfilePath: path to the Dockerfile *within the context*
+//     directory. Defaults to "Dockerfile". Use e.g. "docker/Dockerfile"
+//     when the operator's repo keeps Dockerfiles in a subfolder.
+//   - ContextPath: subfolder of the cloned repo to use as the build
+//     context. Defaults to "" (repo root). Use e.g. "./api" when the
+//     repo's Dockerfile lives next to a backend service in a monorepo.
+//
+// Runtime-side fields:
+//
+//   - Port: container port the workload listens on. Required.
+//   - Healthcheck: optional curl-style probe; empty disables.
+//     NOTE(review): deploy does not read this field — confirm where
+//     it is consumed.
+//
+// Env vars and volume mounts are handled out-of-band via the
+// workload_env and workload_volumes tables, mirroring the image source.
+type Config struct {
+	Provider       string `json:"provider"`         // "gitea" | "github" | "gitlab"; "" = autodetect
+	BaseURL        string `json:"base_url"`         // e.g. https://git.example.com
+	RepoOwner      string `json:"repo_owner"` // required (checked by Validate)
+	RepoName       string `json:"repo_name"` // required (checked by Validate)
+	Branch         string `json:"branch"` // branch whose latest commit is deployed
+	ContextPath    string `json:"context_path"`     // path within repo (root by default)
+	DockerfilePath string `json:"dockerfile_path"`  // relative to context_path; "Dockerfile" by default
+	AccessToken    string `json:"access_token"`     // encrypted; optional for public repos
+
+	Port        int    `json:"port"` // 1-65535 (checked by Validate)
+	Healthcheck string `json:"healthcheck,omitempty"`
+}
+
+// source is the dockerfile source implementation. It carries no
+// fields; everything it needs arrives through method arguments.
+type source struct{}
+
+// Eager registration — the deploy pipeline lives entirely inside this
+// package, so the kind is usable as soon as init() fires.
+func init() { plugin.RegisterSource(&source{}) }
+
+// Kind returns the source kind identifier, "dockerfile".
+func (*source) Kind() string {
+	return "dockerfile"
+}
+
+// SchemaSample returns an example Config: a Gitea-hosted repo on
+// branch "main", built from the repo root with the default Dockerfile
+// name and listening on port 8080. AccessToken and Healthcheck are
+// left at their zero values.
+func (*source) SchemaSample() any {
+	sample := Config{Port: 8080}
+	sample.Provider = "gitea"
+	sample.BaseURL = "https://git.example.com"
+	sample.RepoOwner = "owner"
+	sample.RepoName = "myservice"
+	sample.Branch = "main"
+	sample.ContextPath = ""
+	sample.DockerfilePath = "Dockerfile"
+	return sample
+}
+
+// Validate rejects obviously-malformed configs before the deploy
+// pipeline materializes a temp dir, downloads a repo, and burns
+// minutes of build time on input that was never going to work.
+//
+// It returns an error when:
+//   - cfg is empty or not valid JSON for Config
+//   - repo_owner or repo_name is blank (whitespace-only counts as blank)
+//   - port is outside 1..65535
+//   - dockerfile_path or context_path starts with "/" or has a ".."
+//     path segment
+//
+// Returns nil when the config passes all checks.
+func (*source) Validate(cfg json.RawMessage) error {
+	if len(cfg) == 0 {
+		return fmt.Errorf("dockerfile source: config is required")
+	}
+	var c Config
+	if err := json.Unmarshal(cfg, &c); err != nil {
+		return fmt.Errorf("dockerfile source: invalid json: %w", err)
+	}
+	if strings.TrimSpace(c.RepoOwner) == "" || strings.TrimSpace(c.RepoName) == "" {
+		return fmt.Errorf("dockerfile source: repo_owner and repo_name are required")
+	}
+	if c.Port <= 0 || c.Port > 65535 {
+		return fmt.Errorf("dockerfile source: port must be between 1 and 65535 (got %d)", c.Port)
+	}
+	// Defense in depth: a leading "/" or any ".." segment in
+	// DockerfilePath / ContextPath would escape the build context. The
+	// plugin's deploy() does its own normalization too; rejecting here
+	// gives the operator a clear error at save-time instead of a
+	// confusing "no such file" mid-build.
+	//
+	// The ".." check is per path segment, not per substring, so a
+	// legitimate name that merely contains two dots (e.g.
+	// "build..old/Dockerfile") is accepted. Both "/" and "\" count as
+	// separators so a backslash-spelled traversal is rejected as well.
+	isSeparator := func(r rune) bool { return r == '/' || r == '\\' }
+	for _, p := range []string{c.DockerfilePath, c.ContextPath} {
+		if p == "" {
+			continue
+		}
+		if strings.HasPrefix(p, "/") {
+			return fmt.Errorf("dockerfile source: %q must be relative", p)
+		}
+		for _, segment := range strings.FieldsFunc(p, isSeparator) {
+			if segment == ".." {
+				return fmt.Errorf("dockerfile source: %q must not contain '..'", p)
+			}
+		}
+	}
+	return nil
+}
+
+// Deploy runs the dockerfile deploy pipeline for w by delegating to
+// the package-level deploy function.
+func (*source) Deploy(ctx context.Context, deps plugin.Deps, w plugin.Workload, intent plugin.DeploymentIntent) error {
+	return deploy(ctx, deps, w, intent)
+}
+
+// Teardown delegates to the package-level teardown function.
+func (*source) Teardown(ctx context.Context, deps plugin.Deps, w plugin.Workload) error {
+	return teardown(ctx, deps, w)
+}
+
+// Reconcile delegates to the package-level reconcile function.
+func (*source) Reconcile(ctx context.Context, deps plugin.Deps, w plugin.Workload) error {
+	return reconcile(ctx, deps, w)
+}
@@ -0,0 +1,288 @@
+package dockerfile
+
+import (
+	"encoding/json"
+	"os"
+	"path/filepath"
+	"strings"
+	"testing"
+
+	"github.com/alexei/tinyforge/internal/workload/plugin"
+)
+
+// ── Source interface plumbing ───────────────────────────────────────
+
+// TestSource_Kind pins the kind identifier the source reports.
+func TestSource_Kind(t *testing.T) {
+	if kind := (&source{}).Kind(); kind != "dockerfile" {
+		t.Fatalf("Kind = %q, want \"dockerfile\"", kind)
+	}
+}
+
+func TestSource_Registered_AtInit(t *testing.T) {
+	// Importing the package has already run init(); all that is left
+	// to check is that the registry hands back a source of our kind.
+	// A failure points at plugin.RegisterSource or this package's
+	// init having regressed.
+	registered, err := plugin.GetSource("dockerfile")
+	switch {
+	case err != nil:
+		t.Fatalf("GetSource(dockerfile): %v", err)
+	case registered.Kind() != "dockerfile":
+		t.Fatalf("registered source has wrong kind: %q", registered.Kind())
+	}
+}
+
+// TestSource_SchemaSample_RoundTrips checks that the advertised sample
+// config, once JSON-encoded, passes the source's own validation.
+func TestSource_SchemaSample_RoundTrips(t *testing.T) {
+	src := &source{}
+	encoded, err := json.Marshal(src.SchemaSample())
+	if err != nil {
+		t.Fatalf("marshal sample: %v", err)
+	}
+	if verr := src.Validate(encoded); verr != nil {
+		t.Fatalf("Validate(sample) = %v, want nil", verr)
+	}
+}
+
+// ── Validate ────────────────────────────────────────────────────────
+
+// TestValidate_RejectsEmpty: a nil config must not validate.
+func TestValidate_RejectsEmpty(t *testing.T) {
+	err := (&source{}).Validate(nil)
+	if err == nil {
+		t.Fatal("expected error on empty config, got nil")
+	}
+}
+
+// TestValidate_RejectsMissingRepo: owner and name are both mandatory,
+// and a whitespace-only value counts as missing.
+func TestValidate_RejectsMissingRepo(t *testing.T) {
+	src := &source{}
+	for idx, cfg := range []Config{
+		{RepoName: "x", Port: 80},                  // owner missing
+		{RepoOwner: "y", Port: 80},                 // name missing
+		{RepoOwner: "  ", RepoName: "x", Port: 80}, // owner whitespace-only
+	} {
+		payload, _ := json.Marshal(cfg)
+		if src.Validate(payload) == nil {
+			t.Errorf("case %d: expected error, got nil", idx)
+		}
+	}
+}
+
+// TestValidate_RejectsBadPort: zero, negative and >65535 ports fail.
+func TestValidate_RejectsBadPort(t *testing.T) {
+	badPorts := []int{0, -1, 70000}
+	for _, p := range badPorts {
+		cfg := Config{RepoOwner: "a", RepoName: "b", Port: p}
+		payload, _ := json.Marshal(cfg)
+		if (&source{}).Validate(payload) != nil {
+			continue
+		}
+		t.Errorf("port %d: expected error, got nil", p)
+	}
+}
+
+// TestValidate_RejectsPathEscape: absolute paths and parent-directory
+// traversal are refused for both the Dockerfile and the context path.
+func TestValidate_RejectsPathEscape(t *testing.T) {
+	base := Config{RepoOwner: "a", RepoName: "b", Port: 80}
+	withDockerfile := func(p string) Config {
+		c := base
+		c.DockerfilePath = p
+		return c
+	}
+	withContext := func(p string) Config {
+		c := base
+		c.ContextPath = p
+		return c
+	}
+	escapes := []Config{
+		withDockerfile("/etc/passwd"),
+		withDockerfile("../../etc/passwd"),
+		withContext("../../"),
+		withContext("/etc"),
+	}
+	for idx, cfg := range escapes {
+		payload, _ := json.Marshal(cfg)
+		if (&source{}).Validate(payload) == nil {
+			t.Errorf("case %d: expected path-escape rejection, got nil", idx)
+		}
+	}
+}
+
+// TestValidate_AcceptsValid: a config with nested relative paths and
+// an in-range port validates cleanly.
+func TestValidate_AcceptsValid(t *testing.T) {
+	cfg := Config{
+		RepoOwner:      "owner",
+		RepoName:       "repo",
+		Port:           8080,
+		DockerfilePath: "docker/Dockerfile",
+		ContextPath:    "services/api",
+	}
+	payload, _ := json.Marshal(cfg)
+	err := (&source{}).Validate(payload)
+	if err != nil {
+		t.Fatalf("Validate(valid) = %v", err)
+	}
+}
+
+// ── Naming helpers ──────────────────────────────────────────────────
+
+// TestNaming_SameNameDifferentIDs_NoCollision: two workloads sharing a
+// name but not an ID must get distinct container names and image tags.
+func TestNaming_SameNameDifferentIDs_NoCollision(t *testing.T) {
+	first := plugin.Workload{ID: "aaaaaaaa-rest", Name: "svc"}
+	second := plugin.Workload{ID: "bbbbbbbb-rest", Name: "svc"}
+	if nameA, nameB := containerNameFor(first), containerNameFor(second); nameA == nameB {
+		t.Errorf("container names collide: %q", nameA)
+	}
+	if tagA, tagB := imageTagFor(first), imageTagFor(second); tagA == tagB {
+		t.Errorf("image tags collide: %q", tagA)
+	}
+}
+
+// TestNaming_ShortIDsPassThrough: a short ID ("abc") must still show
+// up in full as the container name's suffix.
+func TestNaming_ShortIDsPassThrough(t *testing.T) {
+	name := containerNameFor(plugin.Workload{ID: "abc", Name: "tiny"})
+	if strings.HasSuffix(name, "-abc") {
+		return
+	}
+	t.Errorf("container name lost short id: %q", name)
+}
+
+// ── Context + Dockerfile resolution ─────────────────────────────────
+
+// TestResolveContextDir_Empty_ReturnsRoot: an empty context path
+// resolves to the clone root, either verbatim or symlink-resolved.
+func TestResolveContextDir_Empty_ReturnsRoot(t *testing.T) {
+	root := t.TempDir()
+	got, err := resolveContextDir(root, "")
+	if err != nil {
+		t.Fatalf("resolveContextDir: %v", err)
+	}
+	resolved, _ := filepath.EvalSymlinks(root)
+	if got != resolved && got != root {
+		t.Errorf("got %q, want %q (or symlink-resolved equivalent)", got, root)
+	}
+}
+
+// TestResolveContextDir_Subfolder_OK: an existing subfolder resolves
+// to a path ending in that folder's name.
+func TestResolveContextDir_Subfolder_OK(t *testing.T) {
+	root := t.TempDir()
+	if mkErr := os.MkdirAll(filepath.Join(root, "api"), 0o755); mkErr != nil {
+		t.Fatalf("mkdir: %v", mkErr)
+	}
+	resolved, err := resolveContextDir(root, "api")
+	if err != nil {
+		t.Fatalf("resolveContextDir: %v", err)
+	}
+	if !strings.HasSuffix(resolved, "api") {
+		t.Errorf("got %q, expected suffix 'api'", resolved)
+	}
+}
+
+// TestResolveContextDir_NonexistentSubfolder: a context path that does
+// not exist under the clone root is an error.
+func TestResolveContextDir_NonexistentSubfolder(t *testing.T) {
+	_, err := resolveContextDir(t.TempDir(), "missing")
+	if err == nil {
+		t.Fatal("expected error for missing subfolder")
+	}
+}
+
+func TestResolveContextDir_RejectsEscape(t *testing.T) {
+	// Validate is the first wall against escapes; resolveContextDir is
+	// the second, and has to hold even when Validate was bypassed
+	// (e.g. by a direct DB edit). The escape is staged with a symlink
+	// inside the clone root that points at a directory outside it.
+	root := t.TempDir()
+	elsewhere := t.TempDir()
+	if linkErr := os.Symlink(elsewhere, filepath.Join(root, "escape")); linkErr != nil {
+		t.Skipf("symlink unsupported in this environment: %v", linkErr)
+	}
+	_, err := resolveContextDir(root, "escape")
+	if err == nil {
+		t.Fatal("expected escape-path rejection")
+	}
+}
+
+// TestVerifyDockerfileExists_Present: with an empty path argument, a
+// file named "Dockerfile" in the context directory is accepted.
+func TestVerifyDockerfileExists_Present(t *testing.T) {
+	contextDir := t.TempDir()
+	target := filepath.Join(contextDir, "Dockerfile")
+	if writeErr := os.WriteFile(target, []byte("FROM scratch\n"), 0o644); writeErr != nil {
+		t.Fatalf("write: %v", writeErr)
+	}
+	err := verifyDockerfileExists(contextDir, "")
+	if err != nil {
+		t.Fatalf("verifyDockerfileExists(default) = %v, want nil", err)
+	}
+}
+
+// TestVerifyDockerfileExists_Missing: an empty context directory has
+// no Dockerfile, which must be reported as an error.
+func TestVerifyDockerfileExists_Missing(t *testing.T) {
+	err := verifyDockerfileExists(t.TempDir(), "")
+	if err == nil {
+		t.Fatal("expected error for missing Dockerfile")
+	}
+}
+
+// TestVerifyDockerfileExists_CustomPath: a Dockerfile under a
+// subfolder with a non-default name is found via its relative path.
+func TestVerifyDockerfileExists_CustomPath(t *testing.T) {
+	contextDir := t.TempDir()
+	dockerDir := filepath.Join(contextDir, "docker")
+	if mkErr := os.MkdirAll(dockerDir, 0o755); mkErr != nil {
+		t.Fatalf("mkdir: %v", mkErr)
+	}
+	target := filepath.Join(contextDir, "docker", "Dockerfile.prod")
+	if writeErr := os.WriteFile(target, []byte("FROM scratch\n"), 0o644); writeErr != nil {
+		t.Fatalf("write: %v", writeErr)
+	}
+	err := verifyDockerfileExists(contextDir, "docker/Dockerfile.prod")
+	if err != nil {
+		t.Fatalf("verifyDockerfileExists(custom) = %v, want nil", err)
+	}
+}
+
+// TestVerifyDockerfileExists_RejectsAbsolutePath: an absolute
+// Dockerfile path must be refused.
+func TestVerifyDockerfileExists_RejectsAbsolutePath(t *testing.T) {
+	err := verifyDockerfileExists(t.TempDir(), "/etc/passwd")
+	if err == nil {
+		t.Fatal("expected error for absolute dockerfile path")
+	}
+}
+
+// ── Sanitiser ───────────────────────────────────────────────────────
+
+func TestSanitizeError_RedactsToken(t *testing.T) {
+	tok := "ghp_supersecret"
+	got := sanitizeError("401 from gitea token="+tok+" ok", tok)
+	if strings.Contains(got, tok) {
+		t.Errorf("token leaked: %q", got)
+	}
+	if !strings.Contains(got, "[REDACTED]") {
+		t.Errorf("missing [REDACTED] marker: %q", got)
+	}
+}
+
+// TestSanitizeError_CollapsesWhitespace: newlines, carriage returns
+// and tabs must not survive sanitization.
+func TestSanitizeError_CollapsesWhitespace(t *testing.T) {
+	sanitized := sanitizeError("a\nb\rc\td", "")
+	if !strings.ContainsAny(sanitized, "\n\r\t") {
+		return
+	}
+	t.Errorf("did not collapse: %q", sanitized)
+}
+
+func TestSanitizeError_TruncatesUTF8Safe(t *testing.T) {
+	// A 2-byte rune repeated 1000 times gives 2000 bytes of input,
+	// far past the 240 cap. The truncated output has to end in an
+	// ellipsis and must not cut a rune in half at the cap.
+	input := strings.Repeat("é", 1000)
+	sanitized := sanitizeError(input, "")
+	if !strings.HasSuffix(sanitized, "…") {
+		t.Errorf("missing ellipsis: %q", sanitized)
+	}
+	// The result must still decode as UTF-8 from start to finish.
+	if valid := isValidUTF8Slice([]byte(sanitized)); !valid {
+		t.Errorf("truncation produced broken UTF-8: %q", sanitized)
+	}
+}
+
+// isValidUTF8Slice reports whether b consists entirely of well-formed
+// UTF-8 sequences. An empty or nil slice is valid.
+//
+// strings.ToValidUTF8 returns its input unchanged exactly when the
+// input contains no invalid byte sequence; with an empty replacement
+// any invalid run is dropped, so the result differs from the input.
+// Comparing the two therefore gives a full validity check — including
+// continuation bytes and illegal lead bytes — without a hand-rolled
+// decoder.
+func isValidUTF8Slice(b []byte) bool {
+	s := string(b)
+	return strings.ToValidUTF8(s, "") == s
+}
+
+// ── State row ID ────────────────────────────────────────────────────
+
+// TestContainerRowID_Deterministic: the row ID is stable across calls
+// for the same workload and ends in ":dockerfile".
+func TestContainerRowID_Deterministic(t *testing.T) {
+	w := plugin.Workload{ID: "abcd1234-rest"}
+	first, second := containerRowID(w), containerRowID(w)
+	if first != second {
+		t.Errorf("containerRowID not deterministic: %q vs %q", first, second)
+	}
+	if !strings.HasSuffix(first, ":dockerfile") {
+		t.Errorf("containerRowID missing suffix: %q", first)
+	}
+}
@@ -0,0 +1,37 @@
+package dockerfile
+
+import (
+	"log/slog"
+
+	"github.com/alexei/tinyforge/internal/crypto"
+	"github.com/alexei/tinyforge/internal/workload/plugin"
+)
+
+// buildEnv returns the workload's stored environment as the KEY=VALUE
+// strings handed to Docker. Entries flagged Encrypted are decrypted
+// with deps.EncKey first. Failures are deliberately soft: a listing
+// error yields nil and an undecryptable entry is logged and skipped,
+// so one stale ciphertext leaves a single variable unset instead of
+// blocking the whole deploy.
+func buildEnv(deps plugin.Deps, workloadID string) []string {
+	entries, listErr := deps.Store.ListWorkloadEnv(workloadID)
+	if listErr != nil {
+		slog.Warn("dockerfile source: list workload env", "workload", workloadID, "error", listErr)
+		return nil
+	}
+	env := make([]string, 0, len(entries))
+	for _, entry := range entries {
+		if !entry.Encrypted {
+			env = append(env, entry.Key+"="+entry.Value)
+			continue
+		}
+		plain, decErr := crypto.Decrypt(deps.EncKey, entry.Value)
+		if decErr != nil {
+			slog.Warn("dockerfile source: decrypt env value",
+				"workload", workloadID, "key", entry.Key, "error", decErr)
+			continue
+		}
+		env = append(env, entry.Key+"="+plain)
+	}
+	return env
+}
@@ -0,0 +1,141 @@
+package dockerfile
+
+import (
+	"errors"
+	"fmt"
+	"os"
+	"path/filepath"
+	"strings"
+)
+
+// resolveContextDir maps the configured context path onto a real
+// directory inside the clone and returns its absolute path.
+//
+// cloneRoot is made absolute and, when possible, symlink-resolved.
+// ctx may be "", "." or "./" (the clone root itself) or a
+// slash-separated subpath such as "./api" or "services/api". The
+// result is rejected if, after symlink resolution, it is neither the
+// root nor below it, or if it is missing or not a directory.
+func resolveContextDir(cloneRoot, ctx string) (string, error) {
+	root, err := filepath.Abs(cloneRoot)
+	if err != nil {
+		return "", fmt.Errorf("abs cloneRoot: %w", err)
+	}
+	if resolved, evalErr := filepath.EvalSymlinks(root); evalErr == nil {
+		root = resolved
+	}
+	switch ctx {
+	case "", ".", "./":
+		return root, nil
+	}
+	target, err := filepath.Abs(filepath.Join(root, filepath.FromSlash(ctx)))
+	if err != nil {
+		return "", fmt.Errorf("abs candidate: %w", err)
+	}
+	// Symlinks are resolved BEFORE the containment test so a planted link cannot escape.
+	if resolved, evalErr := filepath.EvalSymlinks(target); evalErr == nil {
+		target = resolved
+	}
+	inside := target == root || strings.HasPrefix(target, root+string(filepath.Separator))
+	if !inside {
+		return "", fmt.Errorf("context path %q escapes clone root", ctx)
+	}
+	fi, err := os.Stat(target)
+	switch {
+	case err != nil:
+		return "", fmt.Errorf("stat context_path %q: %w", ctx, err)
+	case !fi.IsDir():
+		return "", fmt.Errorf("context_path %q is not a directory", ctx)
+	}
+	return target, nil
+}
+
+// verifyDockerfileExists reports an error unless dockerfilePath names
+// an existing non-directory entry below contextDir, so a typo surfaces
+// as a focused message before the build is attempted.
+//
+// dockerfilePath is slash-separated and relative to contextDir; ""
+// means "Dockerfile". A leading "/" and any ".." are rejected.
+func verifyDockerfileExists(contextDir, dockerfilePath string) error {
+	if dockerfilePath == "" {
+		dockerfilePath = "Dockerfile"
+	}
+	absolute, climbs := strings.HasPrefix(dockerfilePath, "/"), strings.Contains(dockerfilePath, "..")
+	if absolute || climbs {
+		return fmt.Errorf("dockerfile_path %q must be relative and contain no '..'", dockerfilePath)
+	}
+	fi, err := os.Stat(filepath.Join(contextDir, filepath.FromSlash(dockerfilePath)))
+	switch {
+	case err == nil && fi.IsDir():
+		return fmt.Errorf("dockerfile_path %q points at a directory, not a file", dockerfilePath)
+	case err == nil:
+		return nil
+	case errors.Is(err, os.ErrNotExist):
+		return fmt.Errorf("Dockerfile not found at %s/%s", filepath.Base(contextDir), dockerfilePath)
+	default:
+		return fmt.Errorf("stat Dockerfile %q: %w", dockerfilePath, err)
+	}
+}
+
+// sanitizeError clamps an error string before it is stored as
+// last_error or echoed through an outbound notification webhook.
+// It is sanitizeErrorWithSecrets with only accessToken to redact
+// and no env values.
+func sanitizeError(msg, accessToken string) string {
+	var noEnv []string
+	return sanitizeErrorWithSecrets(msg, accessToken, noEnv)
+}
+
+// sanitizeErrorWithSecrets prepares msg for operator-visible storage:
+// it replaces accessToken and every env value from envKV ("KEY=VALUE"
+// entries, split on the first '=') with "[REDACTED]", turns newlines,
+// carriage returns and tabs into spaces, and truncates to 240 bytes
+// plus "…". Env values shorter than 4 bytes are left alone because
+// they would match ordinary words far too often.
+func sanitizeErrorWithSecrets(msg, accessToken string, envKV []string) string {
+	if msg == "" {
+		return ""
+	}
+	secrets := make([]string, 0, len(envKV)+1)
+	if accessToken != "" {
+		secrets = append(secrets, accessToken)
+	}
+	for _, kv := range envKV {
+		if _, value, ok := strings.Cut(kv, "="); ok && len(value) >= 4 {
+			secrets = append(secrets, value)
+		}
+	}
+	// Redact longest-first: if a short secret were replaced inside a
+	// longer one (a password inside a DSN), the rest would stay visible.
+	for i := 1; i < len(secrets); i++ {
+		for j := i; j > 0 && len(secrets[j]) > len(secrets[j-1]); j-- {
+			secrets[j], secrets[j-1] = secrets[j-1], secrets[j]
+		}
+	}
+	for _, s := range secrets {
+		msg = strings.ReplaceAll(msg, s, "[REDACTED]")
+	}
+	msg = strings.Map(func(r rune) rune {
+		switch r {
+		case '\n', '\r', '\t':
+			return ' '
+		}
+		return r
+	}, msg)
+	const maxLen = 240
+	if len(msg) > maxLen {
+		// Back up to a rune start so a multi-byte char is never split.
+		cut := maxLen
+		for cut > 0 && !isRuneStart(msg[cut]) {
+			cut--
+		}
+		msg = msg[:cut] + "…"
+	}
+	return msg
+}
+
+// isRuneStart reports whether b is a leading byte of a UTF-8 sequence.
+// Used to walk back from a byte-offset cut to a rune boundary.
+func isRuneStart(b byte) bool {
+	return b&0xC0 != 0x80
+}
@@ -0,0 +1,32 @@
+package dockerfile
+
+import (
+	"fmt"
+
+	"github.com/alexei/tinyforge/internal/workload/plugin"
+)
+
+// idShort returns the first 8 bytes of the workload ID, or the whole
+// ID when it is shorter. Workload names are not UNIQUE in the schema,
+// so this suffix is what keeps two same-named workloads from
+// clobbering each other's container/image artifacts.
+func idShort(w plugin.Workload) string {
+	const n = 8
+	if id := w.ID; len(id) >= n {
+		return id[:n]
+	}
+	return w.ID
+}
+
+// containerNameFor builds the deterministic container name,
+// "tf-build-<name>-<short id>", for a dockerfile-built workload.
+func containerNameFor(w plugin.Workload) string {
+	short := idShort(w)
+	return fmt.Sprintf("tf-build-%s-%s", w.Name, short)
+}
+
+// imageTagFor returns the build's image tag: the container name plus
+// ":latest". NOTE(review): w.Name is embedded verbatim — confirm it is validated as a lowercase repo name.
+func imageTagFor(w plugin.Workload) string {
+	repo := containerNameFor(w)
+	return repo + ":latest"
+}
@@ -0,0 +1,72 @@
+package dockerfile
+
+import (
+	"context"
+	"log/slog"
+
+	"github.com/alexei/tinyforge/internal/store"
+	"github.com/alexei/tinyforge/internal/workload/plugin"
+)
+
+// reconcile compares the stored container row with what Docker reports
+// and records drift: the row's state becomes running, stopped or
+// missing, and a workload that was "deployed" but is no longer running
+// is marked "failed" with an event published. It never rebuilds or
+// restarts anything; redeploying is left to the operator. A cancelled
+// or expired ctx aborts the check and is returned as the error.
+func reconcile(ctx context.Context, deps plugin.Deps, w plugin.Workload) error {
+	st, prevContainer, err := loadState(deps, w)
+	if err != nil {
+		return err
+	}
+	if prevContainer == nil || prevContainer.ContainerID == "" {
+		return nil
+	}
+
+	running, err := deps.Docker.IsContainerRunning(ctx, prevContainer.ContainerID)
+	if err != nil {
+		// A cancelled or expired ctx says nothing about the container:
+		// bail out rather than flag a healthy workload as failed.
+		if ctxErr := ctx.Err(); ctxErr != nil {
+			return ctxErr
+		}
+		// Otherwise assume "no such container": mark the row missing
+		// and fail the runtime status so the dashboard shows it.
+		if uerr := deps.Store.UpdateContainerState(prevContainer.ID, "missing"); uerr != nil {
+			slog.Warn("dockerfile: mark missing", "workload", w.Name, "error", uerr)
+		}
+		if st.Status == "deployed" {
+			if uerr := saveState(deps, w, func(rs *runtimeState, c *store.Container) {
+				rs.Status = "failed"
+				rs.LastError = "container not found"
+				c.State = "missing"
+			}); uerr != nil {
+				slog.Warn("dockerfile: persist missing-state", "workload", w.Name, "error", uerr)
+			}
+			publishEvent(deps, w, "failed: container not found")
+		}
+		return nil
+	}
+
+	desired := "running"
+	if !running {
+		desired = "stopped"
+	}
+	if prevContainer.State != desired {
+		if err := deps.Store.UpdateContainerState(prevContainer.ID, desired); err != nil {
+			slog.Warn("dockerfile: state sync", "workload", w.Name, "error", err)
+		}
+	}
+
+	if !running && st.Status == "deployed" {
+		if err := saveState(deps, w, func(rs *runtimeState, c *store.Container) {
+			rs.Status = "failed"
+			rs.LastError = "container stopped unexpectedly"
+			c.State = "stopped"
+		}); err != nil {
+			slog.Warn("dockerfile: persist crashed-state", "workload", w.Name, "error", err)
+		}
+		publishEvent(deps, w, "failed: container stopped unexpectedly")
+	}
+	return nil
+}
@@ -0,0 +1,179 @@
+package dockerfile
+
+import (
+	"encoding/json"
+	"errors"
+	"fmt"
+	"log/slog"
+	"sync"
+
+	"github.com/alexei/tinyforge/internal/store"
+	"github.com/alexei/tinyforge/internal/workload/plugin"
+)
+
+// runtimeState is the per-workload state we persist inside the
+// container row's extra_json blob. Mirrors the static plugin's
+// runtimeState shape so anyone reading the DB can interpret the two
+// kinds identically. Zero-valued fields are omitted from the JSON.
+//
+// LastImageDigest is the build's image ID — distinct from a registry
+// digest (we never push) but useful for "did the build actually
+// produce a different artifact?" diffing when we add caching later.
+type runtimeState struct {
+	LastCommitSHA   string `json:"last_commit_sha,omitempty"`
+	LastImageDigest string `json:"last_image_digest,omitempty"`
+	LastSyncAt      string `json:"last_sync_at,omitempty"`
+	LastError       string `json:"last_error,omitempty"` // reconcile sets this when the container is gone or stopped
+	Status          string `json:"status,omitempty"`     // reconcile flips "deployed" to "failed" on drift
+}
+
+// runtimeStateKeys lists every JSON key owned by runtimeState and must
+// stay in sync with its struct tags. saveState deletes these from the
+// generic map before overlaying the freshly marshalled state, so a
+// field cleared to "" (dropped by omitempty) cannot keep a stale value.
+var runtimeStateKeys = []string{
+	"last_commit_sha", "last_image_digest", "last_sync_at", "last_error", "status",
+}
+
+// containerRowID is the workload's fixed container row ID, so saveState upserts in place across redeploys.
+func containerRowID(w plugin.Workload) string {
+	const kindSuffix = ":dockerfile"
+	return w.ID + kindSuffix
+}
+
+// loadState fetches the workload's container row and decodes its extra_json into a runtimeState.
+// A missing row is not an error: it yields a zero state and a nil row. Decode errors are only logged.
+func loadState(deps plugin.Deps, w plugin.Workload) (runtimeState, *store.Container, error) {
+	var st runtimeState
+	row, err := deps.Store.GetContainerByID(containerRowID(w))
+	switch {
+	case errors.Is(err, store.ErrNotFound):
+		return st, nil, nil
+	case err != nil:
+		return st, nil, fmt.Errorf("dockerfile source: load state: %w", err)
+	}
+	if blob := row.ExtraJSON; blob != "" && blob != "{}" {
+		if decErr := json.Unmarshal([]byte(blob), &st); decErr != nil {
+			slog.Debug("dockerfile source: decode extra_json", "workload", w.ID, "error", decErr)
+		}
+	}
+	return st, &row, nil
+}
+
+// saveLocks serializes per-workload RMW of the container row. Same
+// pattern as the static plugin — SQLite's MaxOpenConns=1 serializes
+// statements but not the caller's read-then-write intent, so two
+// concurrent deploys for the same workload could stomp each other's
+// container_id / proxy_route_id without this mutex.
+//
+// Entries are reference-counted and removed only when the last holder
+// releases. That bounds memory without deleting a live entry: if an
+// entry vanished while another saveState held (or was about to lock)
+// it, a later caller would mint a SECOND mutex for the same workload
+// and the serialization would be lost.
+var saveLocks struct {
+	mu    sync.Mutex           // guards locks and every saveLock.refs
+	locks map[string]*saveLock // keyed by workload ID; nil until the first acquire
+}
+
+// saveLock is one workload's mutex plus its holder/waiter count.
+type saveLock struct {
+	mu   sync.Mutex // taken by acquireSaveLock, dropped by releaseSaveLock
+	refs int        // callers holding or waiting on mu; guarded by saveLocks.mu
+}
+
+// acquireSaveLock looks up (or creates) the lock entry for workloadID,
+// counts the caller as a holder, then blocks until it owns the entry's
+// mutex. Every call must be paired with releaseSaveLock. The outer
+// saveLocks.mu is held only for the map/refcount bookkeeping.
+func acquireSaveLock(workloadID string) *saveLock {
+	saveLocks.mu.Lock()
+	if saveLocks.locks == nil {
+		saveLocks.locks = map[string]*saveLock{} // lazy init: the zero-value map is nil
+	}
+	l, ok := saveLocks.locks[workloadID]
+	if !ok {
+		l = &saveLock{}
+		saveLocks.locks[workloadID] = l
+	}
+	l.refs++ // counted before the outer lock drops, so release cannot delete the entry under us
+	saveLocks.mu.Unlock()
+	l.mu.Lock() // may block on another holder; the outer mutex is already released
+	return l
+}
+
+// releaseSaveLock unlocks l and drops the caller's reference, deleting
+// the map entry once no holder or waiter remains. acquireSaveLock bumps
+// refs under saveLocks.mu before it blocks, so an entry that still has
+// a pending acquirer never reaches zero here and is never deleted.
+func releaseSaveLock(workloadID string, l *saveLock) {
+	l.mu.Unlock() // release the workload lock before touching the bookkeeping
+	saveLocks.mu.Lock()
+	l.refs--
+	if l.refs == 0 {
+		delete(saveLocks.locks, workloadID)
+	}
+	saveLocks.mu.Unlock()
+}
+
+// saveState loads the workload's container row (or starts a fresh one),
+// lets mutate adjust the typed runtime state and the row's first-class
+// fields, then upserts the row. The per-workload save lock — not a DB
+// transaction — serializes this read-modify-write. Keys in extra_json
+// that runtimeState does not own survive the round-trip.
+func saveState(deps plugin.Deps, w plugin.Workload, mutate func(*runtimeState, *store.Container)) error {
+	lk := acquireSaveLock(w.ID)
+	defer releaseSaveLock(w.ID, lk)
+
+	prev, prevRow, err := loadState(deps, w)
+	if err != nil {
+		return err
+	}
+
+	row := store.Container{ // defaults used when no row exists yet
+		ID:           containerRowID(w),
+		WorkloadID:   w.ID,
+		WorkloadKind: string(store.WorkloadKindBuild),
+		Host:         "local",
+	}
+	if prevRow != nil {
+		row = *prevRow // an existing row replaces the defaults wholesale
+	}
+
+	generic := map[string]json.RawMessage{}
+	if row.ExtraJSON != "" && row.ExtraJSON != "{}" {
+		if err := json.Unmarshal([]byte(row.ExtraJSON), &generic); err != nil {
+			slog.Debug("dockerfile source: decode extra_json (generic)", "workload", w.ID, "error", err) // best-effort: only logged
+		}
+	}
+	for _, k := range runtimeStateKeys {
+		delete(generic, k) // owned keys are rewritten from the typed state below
+	}
+
+	state := prev
+	mutate(&state, &row)
+
+	typedBytes, err := json.Marshal(state)
+	if err != nil {
+		return fmt.Errorf("dockerfile source: marshal state: %w", err)
+	}
+	typedMap := map[string]json.RawMessage{} // typed state re-read as raw key/value pairs
+	if err := json.Unmarshal(typedBytes, &typedMap); err != nil {
+		return fmt.Errorf("dockerfile source: re-decode typed state: %w", err)
+	}
+	for k, v := range typedMap {
+		generic[k] = v // overlay onto the preserved unknown keys
+	}
+
+	merged, err := json.Marshal(generic)
+	if err != nil {
+		return fmt.Errorf("dockerfile source: marshal merged state: %w", err)
+	}
+	row.ExtraJSON = string(merged)
+	row.LastSeenAt = store.Now()
+
+	if err := deps.Store.UpsertContainer(row); err != nil {
+		return fmt.Errorf("dockerfile source: upsert container row: %w", err)
+	}
+	return nil
+}
@@ -0,0 +1,51 @@
+package dockerfile
+
+import (
+	"context"
+	"errors"
+	"log/slog"
+
+	"github.com/alexei/tinyforge/internal/store"
+	"github.com/alexei/tinyforge/internal/workload/plugin"
+)
+
+// teardown removes what a deploy left behind, in this order: the
+// proxy route, the container, then the container index row. Each
+// step is best-effort — a failure is logged and the next step still
+// runs — and a workload without a stored row is a no-op.
+//
+// The built image is deliberately kept: removing it would discard the
+// docker build cache and force the next deploy to rebuild from
+// scratch. Operators prune images via Settings → Prune Images.
+func teardown(ctx context.Context, deps plugin.Deps, w plugin.Workload) error {
+	_, row, err := loadState(deps, w)
+	switch {
+	case err != nil:
+		return err
+	case row == nil:
+		return nil // never deployed: nothing to undo
+	}
+
+	// Step 1: drop the route so no request is sent to a container
+	// that is about to go away.
+	if routeID := row.ProxyRouteID; routeID != "" {
+		if rmErr := deps.Proxy.DeleteRoute(ctx, routeID); rmErr != nil {
+			slog.Warn("dockerfile: failed to remove proxy route", "workload", w.Name, "error", rmErr)
+		}
+	}
+
+	// Step 2: remove the container itself.
+	if containerID := row.ContainerID; containerID != "" {
+		if rmErr := deps.Docker.RemoveContainer(ctx, containerID, true); rmErr != nil {
+			slog.Warn("dockerfile: failed to remove container", "workload", w.Name, "error", rmErr)
+		}
+	}
+
+	// Step 3: forget the index row. The per-workload save lock is
+	// reference-counted and frees itself, so it is not touched here.
+	delErr := deps.Store.DeleteContainer(row.ID)
+	if delErr != nil && !errors.Is(delErr, store.ErrNotFound) {
+		slog.Warn("dockerfile: failed to delete container row", "workload", w.Name, "error", delErr)
+	}
+	return nil
+}
@@ -444,22 +444,12 @@ func updateStatus(deps plugin.Deps, w plugin.Workload, status, commitSHA, errMsg
 }

 // dispatchSiteNotification fires a site_sync_success or
-// site_sync_failure event to the configured outbound webhook.
-// Resolution: per-workload URL+secret first, then fall through to
-// settings.notification_url/secret. Always best-effort.
+// site_sync_failure event for the workload via the shared multi-route
+// dispatcher in plugin.DispatchNotificationForWorkload. Resolution
+// order (workload_notifications → legacy single URL → settings global)
+// is identical to the dockerfile plugin's path so receivers see
+// consistent fan-out behaviour across source kinds.
 func dispatchSiteNotification(deps plugin.Deps, w plugin.Workload, domain, status, errMsg string) {
-	if deps.Notifier == nil {
-		return
-	}
-	settings, err := deps.Store.GetSettings()
-	if err != nil {
-		slog.Warn("static site: notify settings lookup failed", "site", w.ID, "error", err)
-		return
-	}
-	url, secret, tier := resolveSiteTarget(w, settings)
-	if url == "" {
-		return
-	}
 	eventType := "site_sync_success"
 	if status == "failed" {
 		eventType = "site_sync_failure"
@@ -468,7 +458,7 @@ func dispatchSiteNotification(deps plugin.Deps, w plugin.Workload, domain, statu
 	if domain != "" {
 		siteURL = "https://" + domain
 	}
-	deps.Notifier.SendSigned(url, secret, tier, notify.Event{
+	plugin.DispatchNotificationForWorkload(deps, w, notify.Event{
 		Type:    eventType,
 		Project: w.Name,
 		URL:     siteURL,
@@ -476,16 +466,6 @@ func dispatchSiteNotification(deps plugin.Deps, w plugin.Workload, domain, statu
 	})
 }

-// resolveSiteTarget mirrors the legacy resolveSiteTarget helper but
-// reads notification config off the workload row (where it now lives
-// post-refactor) rather than the static_sites row.
-func resolveSiteTarget(w plugin.Workload, settings store.Settings) (string, string, notify.Tier) {
-	if w.NotificationURL != "" {
-		return w.NotificationURL, w.NotificationSecret, notify.TierSite
-	}
-	return settings.NotificationURL, settings.NotificationSecret, notify.TierSettings
-}
-
 // publishEvent emits a static_site_status event on the bus AND
 // persists an event_log row so the dashboard's audit trail picks it
 // up. Message format ("Static site \"%s\": %s") is preserved verbatim
@@ -165,30 +165,42 @@ func TestContainerRowID_Deterministic(t *testing.T) {
 	}
 }

-func TestLockFor_ReturnsSameLockForSameWorkload(t *testing.T) {
-	// Suffix by t.Name() so the package-global saveLocks map cannot
-	// bleed key state between tests (or between -count=N runs).
+func TestSaveLock_FreedWhenIdle(t *testing.T) {
+	// After the last holder releases, the reference-counted entry must be
+	// removed from the map so the lock table cannot grow without bound.
+	// Suffix by t.Name() so the package-global saveLocks map cannot bleed
+	// key state between tests (or between -count=N runs).
 	key := t.Name() + "-wid"
-	a := lockFor(key)
-	b := lockFor(key)
-	if a != b {
-		t.Fatalf("lockFor returned distinct locks for same workload: %p vs %p", a, b)
+	lk := acquireSaveLock(key)
+	saveLocks.mu.Lock()
+	_, present := saveLocks.locks[key]
+	saveLocks.mu.Unlock()
+	if !present {
+		t.Fatal("acquireSaveLock did not register the entry while held")
+	}
+	releaseSaveLock(key, lk)
+	saveLocks.mu.Lock()
+	_, stillPresent := saveLocks.locks[key]
+	saveLocks.mu.Unlock()
+	if stillPresent {
+		t.Fatal("releaseSaveLock left the entry behind after the last holder released")
 	}
 }

-func TestLockFor_ReturnsDistinctLocksForDifferentWorkloads(t *testing.T) {
-	a := lockFor(t.Name() + "-a")
-	b := lockFor(t.Name() + "-b")
-	if a == b {
-		t.Fatalf("lockFor returned same lock for different workloads: %p", a)
-	}
+func TestSaveLock_DistinctWorkloadsDoNotSerialize(t *testing.T) {
+	// Two different workloads must be lockable at the same time. If they
+	// shared a mutex the second acquire would block forever (deadlock).
+	a := acquireSaveLock(t.Name() + "-a")
+	b := acquireSaveLock(t.Name() + "-b")
+	releaseSaveLock(t.Name()+"-b", b)
+	releaseSaveLock(t.Name()+"-a", a)
 }

-func TestLockFor_SerializesConcurrentAcquisitions(t *testing.T) {
-	// Two goroutines holding the same lock must run sequentially. The
-	// counter would race past 2 if locking were broken; with the lock,
-	// the increment is observed monotonically.
-	lk := lockFor(t.Name() + "-wid")
+func TestSaveLock_SerializesConcurrentAcquisitions(t *testing.T) {
+	// Goroutines acquiring the same workload's lock must run sequentially.
+	// The counter would race past 1 if locking were broken; with the lock,
+	// peak in-flight stays at 1.
+	key := t.Name() + "-wid"
 	var (
 		wg      sync.WaitGroup
 		mu      sync.Mutex
@@ -199,8 +211,8 @@ func TestLockFor_SerializesConcurrentAcquisitions(t *testing.T) {
 		wg.Add(1)
 		go func() {
 			defer wg.Done()
-			lk.Lock()
-			defer lk.Unlock()
+			lk := acquireSaveLock(key)
+			defer releaseSaveLock(key, lk)

 			mu.Lock()
 			counter++
@@ -216,15 +228,15 @@ func TestLockFor_SerializesConcurrentAcquisitions(t *testing.T) {
 	}
 	wg.Wait()
 	if peak != 1 {
-		t.Fatalf("lockFor failed to serialize: peak in-flight = %d, want 1", peak)
+		t.Fatalf("acquireSaveLock failed to serialize: peak in-flight = %d, want 1", peak)
 	}
 }

-func TestLockFor_ConcurrentMapAccessIsSafe(t *testing.T) {
-	// Distinct workloads acquired in parallel must not panic on map
-	// access — exercises the outer-mutex protection inside lockFor.
-	// Each iteration uses a unique key so the test stresses the
-	// insertion path (the common case for "first deploy" callers).
+func TestSaveLock_ConcurrentMapAccessIsSafe(t *testing.T) {
+	// Distinct workloads acquired+released in parallel must not panic on map
+	// access — exercises the outer-mutex protection inside acquire/release.
+	// Each iteration uses a unique key so the test stresses the insertion +
+	// refcount-cleanup paths (the common case for "first deploy" callers).
 	prefix := t.Name() + "-"
 	var wg sync.WaitGroup
 	for i := 0; i < 50; i++ {
@@ -232,9 +244,9 @@ func TestLockFor_ConcurrentMapAccessIsSafe(t *testing.T) {
 		wg.Add(1)
 		go func() {
 			defer wg.Done()
-			lk := lockFor(prefix + strconv.Itoa(i))
-			lk.Lock()
-			lk.Unlock()
+			key := prefix + strconv.Itoa(i)
+			lk := acquireSaveLock(key)
+			releaseSaveLock(key, lk)
 		}()
 	}
 	wg.Wait()
@@ -80,26 +80,55 @@ func loadState(deps plugin.Deps, w plugin.Workload) (runtimeState, *store.Contai
 // container_id / proxy_route_id and orphaning Docker resources. The
 // mutex caps the concurrency at 1 per workload; cross-workload
 // parallelism is unaffected.
+//
+// Entries are reference-counted and removed only when the last holder
+// releases. This bounds memory (no per-workload-ID leak) WITHOUT the
+// use-after-delete hazard of deleting an entry on teardown: deleting a
+// live entry while a concurrent saveState still holds (or is about to
+// lock) it would let a fresh saveState mint a SECOND mutex for the same
+// workload, losing the RMW serialization the lock exists to provide.
 var saveLocks struct {
 	mu    sync.Mutex
-	locks map[string]*sync.Mutex
+	locks map[string]*saveLock
 }

-// lockFor returns the per-workload mutex, creating it on first use.
-// The outer mutex is held only briefly during map lookup; the returned
-// per-workload lock is what callers actually contend on.
-func lockFor(workloadID string) *sync.Mutex {
+type saveLock struct {
+	mu   sync.Mutex
+	refs int
+}
+
+// acquireSaveLock returns the per-workload lock (creating it on first use),
+// registers this caller as a holder, and takes the lock. Pair with
+// releaseSaveLock. The outer mutex is held only for the bookkeeping; callers
+// contend on the returned per-workload lock.
+func acquireSaveLock(workloadID string) *saveLock {
 	saveLocks.mu.Lock()
-	defer saveLocks.mu.Unlock()
 	if saveLocks.locks == nil {
-		saveLocks.locks = map[string]*sync.Mutex{}
+		saveLocks.locks = map[string]*saveLock{}
 	}
-	m, ok := saveLocks.locks[workloadID]
+	l, ok := saveLocks.locks[workloadID]
 	if !ok {
-		m = &sync.Mutex{}
-		saveLocks.locks[workloadID] = m
+		l = &saveLock{}
+		saveLocks.locks[workloadID] = l
 	}
-	return m
+	l.refs++
+	saveLocks.mu.Unlock()
+	l.mu.Lock()
+	return l
+}
+
+// releaseSaveLock unlocks and drops the caller's reference, removing the map
+// entry once no holders remain. Because refs is incremented under saveLocks.mu
+// before the entry can be observed for deletion, an entry with a pending
+// acquirer is never deleted.
+func releaseSaveLock(workloadID string, l *saveLock) {
+	l.mu.Unlock()
+	saveLocks.mu.Lock()
+	l.refs--
+	if l.refs == 0 {
+		delete(saveLocks.locks, workloadID)
+	}
+	saveLocks.mu.Unlock()
 }

 // saveState upserts the container row, calling mutate so callers can
@@ -115,9 +144,8 @@ func lockFor(workloadID string) *sync.Mutex {
 // Per-workload mutex serializes concurrent callers so two parallel
 // Deploys can't read the same prior state and race their writes.
 func saveState(deps plugin.Deps, w plugin.Workload, mutate func(*runtimeState, *store.Container)) error {
-	lk := lockFor(w.ID)
-	lk.Lock()
-	defer lk.Unlock()
+	lk := acquireSaveLock(w.ID)
+	defer releaseSaveLock(w.ID, lk)

 	prev, prevRow, err := loadState(deps, w)
 	if err != nil {
@@ -185,14 +185,23 @@ func TestSaveState_RecoversFromInvalidExtraJSON(t *testing.T) {
 	deps, _ := testDeps(t)
 	w := plugin.Workload{ID: t.Name() + "-wid", Name: "site"}

+	// UpsertContainer now validates extra_json at the boundary, so this
+	// test seeds a valid row first and corrupts it via raw SQL to
+	// simulate a pre-existing bad row from an upgrade / external edit.
 	if err := deps.Store.UpsertContainer(store.Container{
 		ID:           containerRowID(w),
 		WorkloadID:   w.ID,
 		WorkloadKind: string(store.WorkloadKindSite),
 		Host:         "local",
-		ExtraJSON:    `{not json`,
+		ExtraJSON:    `{}`,
 	}); err != nil {
-		t.Fatalf("seed bad row: %v", err)
+		t.Fatalf("seed row: %v", err)
+	}
+	if _, err := deps.Store.DB().Exec(
+		`UPDATE containers SET extra_json = ? WHERE id = ?`,
+		`{not json`, containerRowID(w),
+	); err != nil {
+		t.Fatalf("corrupt extra_json: %v", err)
 	}

 	err := saveState(deps, w, func(state *runtimeState, _ *store.Container) {
@@ -66,5 +66,8 @@ func teardown(ctx context.Context, deps plugin.Deps, w plugin.Workload) error {
 	if err := deps.Store.DeleteContainer(prevContainer.ID); err != nil && !errors.Is(err, store.ErrNotFound) {
 		slog.Warn("static site: failed to delete container row", "site", w.Name, "error", err)
 	}
+	// The per-workload save-mutex is reference-counted (see state.go) and
+	// frees itself when the last holder releases, so teardown no longer
+	// deletes it explicitly — doing so could race a concurrent saveState.
 	return nil
 }
@@ -18,11 +18,19 @@ import (
 // match the event repo). Mode controls whether branch pushes or tag
 // pushes fire the deploy. Branch is exact-matched when Mode=="push";
 // TagPattern is glob-matched when Mode=="tag".
+//
+// BranchPattern is the preview-deploy escape hatch: when non-empty in
+// "push" mode the event branch is glob-matched against it with path.Match
+// (`feat/*`, `release-*`; `*` never crosses "/"), alongside an exact Branch
+// match. A matched non-baseline branch is reported in the intent's
+// Metadata["preview_branch"], which the dispatcher uses to materialize an
+// ephemeral per-branch child workload rather than redeploying the parent.
 type Config struct {
-	Repo       string `json:"repo"`
-	Mode       string `json:"mode"` // "push" | "tag"
-	Branch     string `json:"branch"`
-	TagPattern string `json:"tag_pattern"`
+	Repo          string `json:"repo"`
+	Mode          string `json:"mode"` // "push" | "tag"
+	Branch        string `json:"branch"`
+	BranchPattern string `json:"branch_pattern"`
+	TagPattern    string `json:"tag_pattern"`
 }

 type trigger struct{}
@@ -49,7 +57,15 @@ func (*trigger) Validate(cfg json.RawMessage) error {
 	}
 	switch c.Mode {
 	case "push":
-		// Branch is optional ("" means any branch).
+		// Branch is optional ("" means any branch). BranchPattern is
+		// validated as a path.Match glob if present; misconfigured
+		// patterns are rejected at the boundary rather than letting them
+		// fail silently inside Match.
+		if c.BranchPattern != "" {
+			if _, err := path.Match(c.BranchPattern, "probe"); err != nil {
+				return fmt.Errorf("git trigger: invalid branch_pattern %q: %w", c.BranchPattern, err)
+			}
+		}
 	case "tag":
 		pattern := c.TagPattern
 		if pattern == "" {
@@ -90,8 +106,24 @@ func (*trigger) Match(ctx context.Context, deps plugin.Deps, w plugin.Workload,
 	if evt.Git.Tag != "" {
 		meta["tag"] = evt.Git.Tag
 	}
+	// Preview-deploy signal: when BranchPattern is set AND the matched
+	// branch is NOT the configured baseline Branch, flag this dispatch
+	// for materialization as a per-branch child workload. The dispatcher
+	// reads preview_branch and decides whether to spawn a preview row;
+	// a baseline-branch push falls through to a normal redeploy of the
+	// template itself.
+	if cfg.Mode == "push" && cfg.BranchPattern != "" && evt.Git.Branch != "" && evt.Git.Branch != cfg.Branch {
+		meta["preview_branch"] = evt.Git.Branch
+		if evt.Git.Deleted {
+			meta["preview_deleted"] = "1"
+		}
+	}
+	reason := "git-push"
+	if meta["preview_deleted"] == "1" {
+		reason = "git-branch-deleted"
+	}
 	return &plugin.DeploymentIntent{
-		Reason:      "git-push",
+		Reason:      reason,
 		Reference:   evt.Git.CommitSHA,
 		Metadata:    meta,
 		TriggeredAt: time.Now().UTC(),
@@ -106,6 +138,17 @@ func refMatches(cfg Config, ref string) bool {
 		if !ok {
 			return false
 		}
+		// Pattern-mode preview filter: any branch whose name matches the
+		// glob is in scope. The baseline `cfg.Branch` is also allowed so
+		// pushes to the template's primary branch keep redeploying the
+		// template itself.
+		if cfg.BranchPattern != "" {
+			if cfg.Branch != "" && cfg.Branch == branch {
+				return true
+			}
+			matched, err := path.Match(cfg.BranchPattern, branch)
+			return err == nil && matched
+		}
 		return cfg.Branch == "" || cfg.Branch == branch
 	case "tag":
 		tag, ok := strings.CutPrefix(ref, "refs/tags/")
@@ -56,14 +56,21 @@ type ImagePushEvent struct {

 // GitEvent covers both push (commits) and tag-create flavors. Vendor is
 // "gitea" | "github" | "gitlab" | "" (autodetected).
+//
+// Deleted is true when the push event reports a branch / tag was deleted.
+// Used by the preview-deploy flow to tear down ephemeral per-branch
+// workloads when a feature branch is removed upstream. Inferred from
+// GitHub-style `deleted: true` and Gitea's identical convention; GitLab
+// signals deletion via after-SHA zeros (parsed at vendor level).
 type GitEvent struct {
-	Vendor   string
-	Repo     string // owner/name
-	Ref      string // refs/heads/main or refs/tags/v1.2.3
-	Branch   string // populated for branch refs
-	Tag      string // populated for tag refs
+	Vendor    string
+	Repo      string // owner/name
+	Ref       string // refs/heads/main or refs/tags/v1.2.3
+	Branch    string // populated for branch refs
+	Tag       string // populated for tag refs
 	CommitSHA string
 	Pusher    string
+	Deleted   bool
 }

 // ManualEvent represents a user-initiated deploy from the UI or API.
@@ -0,0 +1,239 @@
+// Package preview implements branch-pattern preview deploys. A "template"
+// workload is one whose git trigger has a BranchPattern configured; when
+// an inbound push event names a branch other than the template's primary
+// Branch, the dispatcher materializes (or reuses) a child workload via
+// MaterializeForBranch and dispatches the deploy against the child. The
+// child is then torn down on a matching branch-delete event.
+//
+// The package is intentionally narrow:
+//   - it does not know about Docker, the proxy, or any plugin internals
+//   - it operates over a Store interface so the webhook handler can mock
+//     it in tests
+//   - it owns the per-branch naming + subdomain mangling so the wiring
+//     code (trigger fan-out) stays a pure dispatch path
+package preview
+
+import (
+	"encoding/json"
+	"fmt"
+	"regexp"
+	"strings"
+
+	"github.com/alexei/tinyforge/internal/store"
+)
+
+// Store is the narrow slice of the persistence layer this package depends
+// on, declared at the consumer so tests can supply an in-memory fake.
+type Store interface {
+	GetWorkloadByID(id string) (store.Workload, error)
+	ListChildrenByParent(parentID string) ([]store.Workload, error)
+	CreateWorkload(w store.Workload) (store.Workload, error)
+	DeleteWorkload(id string) error
+}
+
+// branchSlugPattern matches every run of characters outside [a-z0-9-],
+// i.e. anything unsafe in a container name, hostname label, or path.
+var branchSlugPattern = regexp.MustCompile(`[^a-z0-9-]+`)
+
+// slugifyBranch turns a git branch name into a lowercase, hyphen-separated
+// slug of at most 32 bytes, so name + slug fit the 63-char Docker
+// container-name and DNS-label limits with room for the `tf-build-`
+// prefix. Input that reduces to nothing yields the placeholder "branch".
+func slugifyBranch(branch string) string {
+	const (
+		maxLen      = 32
+		placeholder = "branch"
+	)
+	slug := strings.ReplaceAll(strings.ToLower(branch), "/", "-")
+	slug = strings.Trim(branchSlugPattern.ReplaceAllString(slug, "-"), "-")
+	if len(slug) > maxLen {
+		// Re-trim: the cut may land directly after a hyphen.
+		slug = strings.Trim(slug[:maxLen], "-")
+	}
+	if slug == "" {
+		return placeholder
+	}
+	return slug
+}
+
+// findExistingPreview returns the first child whose source_config "branch"
+// equals branch, or (zero Workload, false) when none does. Decode errors are
+// ignored (best-effort); the linear scan suits a template's few previews.
+func findExistingPreview(children []store.Workload, branch string) (store.Workload, bool) {
+	type branchOnly struct {
+		Branch string `json:"branch"`
+	}
+	for i := range children {
+		var decoded branchOnly
+		if raw := children[i].SourceConfig; raw != "" {
+			_ = json.Unmarshal([]byte(raw), &decoded)
+		}
+		if decoded.Branch == branch {
+			return children[i], true
+		}
+	}
+	return store.Workload{}, false
+}
+
+// patchSourceConfigBranch returns a copy of the template's source_config
+// with the `branch` field replaced. Unknown keys round-trip so plugin-
+// specific config (port, dockerfile path, ...) survives. A malformed or
+// JSON-null source_config is replaced with a clean baseline rather than
+// propagated (a decoded null would nil the map and panic on the write).
+func patchSourceConfigBranch(sourceConfig, branch string) (string, error) {
+	if branch == "" {
+		return "", fmt.Errorf("preview: branch is empty")
+	}
+	m := map[string]json.RawMessage{}
+	if sourceConfig != "" && sourceConfig != "{}" {
+		if err := json.Unmarshal([]byte(sourceConfig), &m); err != nil || m == nil {
+			m = map[string]json.RawMessage{}
+		}
+	}
+	enc, err := json.Marshal(branch)
+	if err != nil {
+		return "", fmt.Errorf("preview: encode branch: %w", err)
+	}
+	m["branch"] = enc
+	out, err := json.Marshal(m)
+	if err != nil {
+		return "", fmt.Errorf("preview: encode source_config: %w", err)
+	}
+	return string(out), nil
+}
+
+// patchPublicFacesSubdomain rewrites a JSON array of public faces so every
+// non-empty string "subdomain" becomes "<slug>-<subdomain>", keeping preview
+// hosts distinct from each other and from the template. Faces without a
+// subdomain pass through; empty or "[]" input is returned unchanged.
+func patchPublicFacesSubdomain(publicFaces, slug string) (string, error) {
+	switch publicFaces {
+	case "", "[]":
+		return publicFaces, nil
+	}
+	var faces []map[string]any
+	if err := json.Unmarshal([]byte(publicFaces), &faces); err != nil {
+		// Fail loudly on malformed faces. Handing back the template's faces
+		// untouched would leave the preview on the template's own
+		// subdomains, so its proxy route would overwrite the template's —
+		// the very collision the slug prefix is there to prevent.
+		return "", fmt.Errorf("preview: parse public_faces: %w", err)
+	}
+	for i := range faces {
+		// Only non-empty string subdomains get the per-branch prefix.
+		if sub, isStr := faces[i]["subdomain"].(string); isStr && sub != "" {
+			faces[i]["subdomain"] = slug + "-" + sub
+		}
+	}
+	encoded, err := json.Marshal(faces)
+	if err != nil {
+		return "", fmt.Errorf("preview: re-encode public_faces: %w", err)
+	}
+	return string(encoded), nil
+}
+
+// IsPreviewChild reports whether child is a branch preview materialized from
+// template, as opposed to an operator-created stage-chain member that only
+// shares the parent_workload_id link. It re-derives the name
+// MaterializeForBranch would have assigned — template.Name + "/" +
+// slugifyBranch(child's branch) — and requires an exact match, so a
+// hand-named stage workload is never cascade-deleted as a preview.
+func IsPreviewChild(template, child store.Workload) bool {
+	if child.ParentWorkloadID != template.ID || child.SourceConfig == "" {
+		return false
+	}
+	var src struct {
+		Branch string `json:"branch"`
+	}
+	// Decode errors are deliberately ignored; no branch means no preview.
+	_ = json.Unmarshal([]byte(child.SourceConfig), &src)
+	if src.Branch == "" {
+		return false
+	}
+	wantName := template.Name + "/" + slugifyBranch(src.Branch)
+	return child.Name == wantName
+}
+
+// ListPreviewChildren returns template's children that IsPreviewChild accepts,
+// so the delete path can cascade-teardown previews instead of orphaning them.
+func ListPreviewChildren(s Store, template store.Workload) ([]store.Workload, error) {
+	all, err := s.ListChildrenByParent(template.ID)
+	if err != nil {
+		return nil, fmt.Errorf("preview: list children: %w", err)
+	}
+	previews := make([]store.Workload, 0, len(all))
+	for i := range all {
+		if !IsPreviewChild(template, all[i]) {
+			continue
+		}
+		previews = append(previews, all[i])
+	}
+	return previews, nil
+}
+
+// MaterializeForBranch returns the preview workload of template for branch,
+// creating it on first use; repeat calls return the row created earlier. A
+// new preview copies the template's kind, app, source kind and trigger
+// settings, with `branch` swapped into source_config and the branch slug
+// prefixed onto each public-face subdomain. NOTE(review): the lookup accepts
+// any child on this branch, not only IsPreviewChild matches — confirm.
+func MaterializeForBranch(s Store, template store.Workload, branch string) (store.Workload, error) {
+	var none store.Workload
+	if branch == "" {
+		return none, fmt.Errorf("preview: branch is required")
+	}
+
+	siblings, err := s.ListChildrenByParent(template.ID)
+	if err != nil {
+		return none, fmt.Errorf("preview: list children: %w", err)
+	}
+	if found, ok := findExistingPreview(siblings, branch); ok {
+		return found, nil
+	}
+
+	branchCfg, err := patchSourceConfigBranch(template.SourceConfig, branch)
+	if err != nil {
+		return none, err
+	}
+	slug := slugifyBranch(branch)
+	branchFaces, err := patchPublicFacesSubdomain(template.PublicFaces, slug)
+	if err != nil {
+		return none, err
+	}
+
+	// Webhook and notification secrets stay unset on the preview. Deploys
+	// reach it through the parent's trigger binding rather than a
+	// per-preview inbound webhook, so it needs no signing secret of its
+	// own, and leaving them empty keeps the preview from posing as a
+	// first-class workload in webhook routes.
+	preview, err := s.CreateWorkload(store.Workload{
+		Kind:             template.Kind,
+		Name:             template.Name + "/" + slug,
+		AppID:            template.AppID,
+		SourceKind:       template.SourceKind,
+		SourceConfig:     branchCfg,
+		TriggerKind:      template.TriggerKind,
+		TriggerConfig:    template.TriggerConfig,
+		PublicFaces:      branchFaces,
+		ParentWorkloadID: template.ID,
+	})
+	if err != nil {
+		return none, fmt.Errorf("preview: create child: %w", err)
+	}
+	return preview, nil
+}
+
+// FindPreviewForBranch reports the existing preview for (templateID, branch)
+// and never creates one. Empty arguments or no match give (Workload{}, false,
+// nil); the only error source is a failing store lookup.
+func FindPreviewForBranch(s Store, templateID, branch string) (store.Workload, bool, error) {
+	if templateID == "" || branch == "" {
+		return store.Workload{}, false, nil
+	}
+	siblings, err := s.ListChildrenByParent(templateID)
+	if err != nil {
+		return store.Workload{}, false, fmt.Errorf("preview: list children: %w", err)
+	}
+	preview, found := findExistingPreview(siblings, branch)
+	return preview, found, nil
+}
@@ -0,0 +1,200 @@
+package preview
+
+import (
+	"encoding/json"
+	"errors"
+	"strings"
+	"testing"
+
+	"github.com/alexei/tinyforge/internal/store"
+)
+
+// fakeStore is an in-memory stand-in for the preview.Store interface so the
+// tests can exercise the package logic without the SQLite-backed store.
+type fakeStore struct {
+	workloads map[string]store.Workload
+	createErr error
+}
+
+func newFakeStore() *fakeStore {
+	return &fakeStore{workloads: make(map[string]store.Workload)}
+}
+
+// GetWorkloadByID returns the stored row or a "not found" error.
+func (f *fakeStore) GetWorkloadByID(id string) (store.Workload, error) {
+	if found, ok := f.workloads[id]; ok {
+		return found, nil
+	}
+	return store.Workload{}, errors.New("not found")
+}
+
+func (f *fakeStore) ListChildrenByParent(parentID string) ([]store.Workload, error) {
+	children := []store.Workload{}
+	for _, candidate := range f.workloads {
+		if candidate.ParentWorkloadID == parentID {
+			children = append(children, candidate)
+		}
+	}
+	return children, nil
+}
+
+func (f *fakeStore) CreateWorkload(w store.Workload) (store.Workload, error) {
+	switch {
+	case f.createErr != nil:
+		return store.Workload{}, f.createErr
+	case w.ID == "":
+		w.ID = "preview-" + w.Name
+	}
+	f.workloads[w.ID] = w
+	return w, nil
+}
+
+func (f *fakeStore) DeleteWorkload(id string) error {
+	delete(f.workloads, id)
+	return nil
+}
+
+func TestSlugifyBranch_StripsUnsafeChars(t *testing.T) {
+	cases := []struct{ in, want string }{
+		{"main", "main"},
+		{"Feature/User-Auth", "feature-user-auth"},
+		{"PR#42", "pr-42"},
+		{"release/v1.2.3", "release-v1-2-3"},
+		{"___", "branch"},
+		{strings.Repeat("a", 50), strings.Repeat("a", 32)},
+		// Edge cases: empty input, doubled separator, cut landing on a hyphen.
+		{"", "branch"},
+		{"a//b", "a--b"},
+		{strings.Repeat("a", 31) + "/b", strings.Repeat("a", 31)},
+	}
+	for _, c := range cases {
+		if got := slugifyBranch(c.in); got != c.want {
+			t.Errorf("slugifyBranch(%q) = %q, want %q", c.in, got, c.want)
+		}
+	}
+}
+
+func TestPatchSourceConfigBranch_PreservesUnknownKeys(t *testing.T) {
+	const templateCfg = `{"port":3000,"dockerfile_path":"Dockerfile","branch":"main","provider":"github"}`
+	patched, err := patchSourceConfigBranch(templateCfg, "feat/x")
+	if err != nil {
+		t.Fatalf("patch: %v", err)
+	}
+	decoded := map[string]any{}
+	if err := json.Unmarshal([]byte(patched), &decoded); err != nil {
+		t.Fatalf("decode: %v", err)
+	}
+	if branch := decoded["branch"]; branch != "feat/x" {
+		t.Errorf("branch = %v, want feat/x", branch)
+	}
+	if decoded["port"] == nil || decoded["dockerfile_path"] == nil || decoded["provider"] == nil {
+		t.Errorf("unknown keys dropped: %+v", decoded)
+	}
+}
+
+func TestPatchPublicFacesSubdomain_PrefixesSubdomains(t *testing.T) {
+	const input = `[{"subdomain":"app","domain":"example.com"},{"subdomain":"","domain":"raw.example.com"}]`
+	patched, err := patchPublicFacesSubdomain(input, "feat-x")
+	if err != nil {
+		t.Fatalf("patch: %v", err)
+	}
+	var decoded []map[string]any
+	if err := json.Unmarshal([]byte(patched), &decoded); err != nil {
+		t.Fatalf("decode: %v", err)
+	}
+	if first := decoded[0]["subdomain"]; first != "feat-x-app" {
+		t.Errorf("first subdomain = %v, want feat-x-app", first)
+	}
+	if second := decoded[1]["subdomain"]; second != "" {
+		t.Errorf("empty subdomain must stay empty, got %v", second)
+	}
+}
+
+func TestMaterializeForBranch_CreatesNewWhenMissing(t *testing.T) {
+	fs := newFakeStore()
+	template := store.Workload{
+		ID:           "tmpl-1",
+		Kind:         "project",
+		Name:         "myapp",
+		AppID:        "app-1",
+		SourceKind:   "dockerfile",
+		SourceConfig: `{"branch":"main","port":3000}`,
+		TriggerKind:  "git",
+		PublicFaces:  `[{"subdomain":"www","domain":"x.test"}]`,
+	}
+	fs.workloads[template.ID] = template
+
+	child, err := MaterializeForBranch(fs, template, "feat/login")
+	if err != nil {
+		t.Fatalf("materialize: %v", err)
+	}
+	if child.ParentWorkloadID != template.ID {
+		t.Errorf("parent = %q, want %q", child.ParentWorkloadID, template.ID)
+	}
+	if want := "myapp/feat-login"; child.Name != want { // formula IsPreviewChild reverses
+		t.Errorf("name = %q, want %q", child.Name, want)
+	}
+	var cfg map[string]any
+	if err := json.Unmarshal([]byte(child.SourceConfig), &cfg); err != nil {
+		t.Fatalf("decode child source_config: %v", err)
+	}
+	if cfg["branch"] != "feat/login" {
+		t.Errorf("child branch = %v, want feat/login", cfg["branch"])
+	}
+	if cfg["port"] == nil {
+		t.Errorf("child should inherit template port; got %+v", cfg)
+	}
+	var faces []map[string]any
+	if err := json.Unmarshal([]byte(child.PublicFaces), &faces); err != nil {
+		t.Fatalf("decode child faces: %v", err)
+	}
+	if len(faces) != 1 || faces[0]["subdomain"] != "feat-login-www" { // no blind index/assert: fail, don't panic
+		t.Errorf("faces = %+v, want one face with subdomain feat-login-www", faces)
+	}
+}
+
+func TestMaterializeForBranch_ReusesExisting(t *testing.T) {
+	st := newFakeStore()
+	tmpl := store.Workload{
+		ID:           "tmpl-1",
+		Kind:         "project",
+		Name:         "myapp",
+		SourceKind:   "dockerfile",
+		SourceConfig: `{"branch":"main"}`,
+	}
+	st.workloads[tmpl.ID] = tmpl
+
+	created, err := MaterializeForBranch(st, tmpl, "feat/x")
+	if err != nil {
+		t.Fatalf("first materialize: %v", err)
+	}
+	reused, err := MaterializeForBranch(st, tmpl, "feat/x")
+	if err != nil {
+		t.Fatalf("second materialize: %v", err)
+	}
+	if reused.ID != created.ID {
+		t.Errorf("expected idempotence: got %q then %q", created.ID, reused.ID)
+	}
+	if rows := len(st.workloads); rows != 2 {
+		t.Errorf("expected exactly one preview created, store has %d", rows)
+	}
+}
+
+func TestMaterializeForBranch_RejectsEmptyBranch(t *testing.T) {
+	tmpl := store.Workload{ID: "tmpl"}
+	_, err := MaterializeForBranch(newFakeStore(), tmpl, "")
+	if err == nil {
+		t.Fatal("expected error for empty branch")
+	}
+}
+
+func TestFindPreviewForBranch_MissingReturnsFalse(t *testing.T) {
+	// An empty store has no children at all, so the lookup must miss.
+	_, found, err := FindPreviewForBranch(newFakeStore(), "tmpl", "feat/x")
+	switch {
+	case err != nil:
+		t.Fatalf("find: %v", err)
+	case found:
+		t.Error("expected ok=false for missing preview")
+	}
+}