chore: post-merge cleanup — remove merged plan folders (volume-snapshot-restore, gitops)

feat(volsnap): volume snapshot restore (backlog #6)
Restore a captured volume snapshot onto an image workload's live host-bind data volumes, then redeploy — the most destructive workload action, built to the adversarially-reviewed design (C1–C6) with all data-loss guards. - Engine.Restore (engine-owned): all-or-nothing pre-flight re-resolution from the workload's CURRENT config (never the tamperable manifest), per-filesystem disk pre-check, per-workload lock, container quiesce, extract-to-tmp, durable pre-restore snapshot, write-ahead journal, atomic rename swap, redeploy, and crash-recovery sweep (RecoverInterruptedRestores) wired before serving. - internal/keyedmutex: shared per-key lock; deployer now serializes every deploy entrypoint per workload via DispatchPlugin (+ LockWorkload/RedeployLocked for the restore re-dispatch, no deadlock). - Untrusted-archive extractor: zip-slip containment, type allow-list (reg/dir only), decompression-bomb cap, manifest-index bounds. - POST /api/workloads/{id}/snapshots/{sid}/restore: admin, X-Confirm-Restore header (CSRF), per-workload single-flight (409). - WebUI: Restore button + danger ConfirmDialog + busy state + i18n (en/ru). Scope: image-source only; scopes absolute/stage/project (driven off the same supportedScopes constant capture uses). Plan-reviewed before coding; per-phase go/security/ts reviews; final review READY TO MERGE. Security review caught + fixed a CRITICAL manifest-Source path traversal (re-derive target from current config + base containment). Plan: plans/volume-snapshot-restore/
2026-06-22 17:25:34 +03:00 · 2026-06-22 17:23:52 +03:00 · 2026-06-22 16:04:28 +03:00 · 2026-06-21 23:32:02 +03:00 · 2026-06-19 17:09:17 +03:00 · 2026-06-19 16:51:20 +03:00
246 changed files with 33942 additions and 3744 deletions
@@ -1,9 +1,47 @@
+# VCS / tooling
 .git
-node_modules
-web/node_modules
-web/build
-data
-*.md
-plans/
-.claude/
+.gitignore
 .dockerignore
+.gitea/
+.github/
+.claude/
+.code-review-graph/
+.vex.toml
+.facts-sync.json
+.facts-suggestions.md
+
+# Node / frontend build artifacts (frontend stage rebuilds web/build)
+node_modules/
+web/node_modules/
+web/build/
+web/.svelte-kit/
+
+# Runtime / local data
+data/
+.env
+.env.*
+*.log
+
+# Compiled binaries (rebuilt inside the image)
+tinyforge
+tinyforge.exe
+tinyforge-server.exe
+server.exe
+docker-watcher
+docker-watcher.exe
+docker-watcher.exe~
+/cli
+/cli.exe
+
+# Build/orchestration files not needed inside the image
+Dockerfile
+docker-compose.yml
+Makefile
+*.example.yaml
+
+# Docs / planning / design (not needed at runtime)
+*.md
+docs/
+plans/
+design-mockups/
+test-data/
@@ -0,0 +1,27 @@
+# Facts Repo Suggestions
+
+Pending suggestions to push back to claude-code-facts.
+
+---
+
+## 2026-06-21: Buildx + registry buildcache DOES work on the TrueNAS Gitea runner
+
+**Target file:** gitea-python-ci-cd.md
+**Section:** "## 7. Docker Build" and "## 9. Gitea vs GitHub Actions Differences"
+**Reason:** The doc's compatibility table says "Docker Buildx — May not work (runner networking)" and the Docker section uses plain `docker build` + `docker push --all-tags`. In practice, `docker/setup-buildx-action@v3` + `docker/build-push-action@v5` with `cache-from/to: type=registry,ref=$REGISTRY:buildcache,mode=max` (and `type=gha` for no-push CI builds) works on the current `git.dolgolyov-family.by` runner — verified in the notify-bridge and tiny-forge pipelines. Recommend adding a "buildx path (preferred when it works)" variant alongside the conservative plain-`docker build` path, and softening the row to "Usually works; falls back to plain `docker build`."
+
+---
+
+## 2026-06-21: Quote `if:` expressions that contain a colon
+
+**Target file:** gitea-python-ci-cd.md
+**Section:** "## 9. Gitea vs GitHub Actions Differences" (or a new "Workflow gotchas")
+**Reason:** A common skip-guard `if: ${{ !startsWith(gitea.event.head_commit.message, 'chore: release v') }}` contains `: ` inside the literal, which makes strict YAML parsers (PyYAML and most validators) treat it as a nested mapping and fail with "mapping values are not allowed here". Gitea's parser is lenient and accepts the unquoted form, but the file then fails any standard YAML lint. Fix: wrap the whole expression in double quotes — `if: "${{ ... 'chore: release v' ... }}"`.
+
+---
+
+## 2026-06-21: Add a "Go on Gitea" CI/CD note
+
+**Target file:** gitea-python-ci-cd.md (or a new gitea-go-ci-cd.md)
+**Section:** new
+**Reason:** The doc is Python-only. The same release/Docker patterns apply to Go services with these deltas: pin `setup-go` to match the `go` directive in `go.mod` (a mismatch silently triggers a slow `GOTOOLCHAIN=auto` toolchain download); gate on `go vet ./...` + `go test ./internal/...`; multi-stage Dockerfile with `--mount=type=cache,target=/go/pkg/mod` and `target=/root/.cache/go-build` (requires `# syntax=docker/dockerfile:1.7`); `CGO_ENABLED=0 -ldflags="-s -w"` static binary on an `alpine` runtime with a non-root user and a `wget --spider` HEALTHCHECK.
@@ -5,34 +5,70 @@ on:
    branches: [main]
  pull_request:
    branches: [main]
+  workflow_dispatch:

 jobs:
-  build:
+  frontend:
+    # Skip the build on release-bump commits — the tag push runs release.yml.
+    if: "${{ !startsWith(gitea.event.head_commit.message, 'chore: release v') }}"
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - uses: actions/setup-node@v4
+        with:
+          node-version: '20'
+          cache: npm
+          cache-dependency-path: web/package-lock.json
+
+      - name: Install frontend dependencies
+        working-directory: web
+        run: npm ci --no-audit
+
+      - name: Svelte check
+        working-directory: web
+        run: npm run check
+
+      - name: Unit tests (vitest)
+        working-directory: web
+        run: npm run test
+
+      - name: Build frontend
+        working-directory: web
+        run: npm run build
+
+  backend:
+    if: "${{ !startsWith(gitea.event.head_commit.message, 'chore: release v') }}"
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - uses: actions/setup-go@v5
        with:
-          go-version: '1.24'
-
-      - uses: actions/setup-node@v4
-        with:
-          node-version: '20'
-
-      - name: Install frontend dependencies
-        working-directory: web
-        run: npm ci --no-audit
-
-      - name: Build frontend
-        working-directory: web
-        run: npm run build
+          go-version: '1.25'
+          cache-dependency-path: go.sum

      - name: Vet Go code
        run: go vet ./...

-      - name: Build Go binary
-        run: CGO_ENABLED=0 go build -ldflags="-s -w" -o tinyforge ./cmd/server
+      - name: Run Go tests
+        run: go test ./internal/... -count=1

-      - name: Build Docker image
-        run: docker build -t tinyforge:dev .
+  build-image:
+    if: "${{ !startsWith(gitea.event.head_commit.message, 'chore: release v') }}"
+    needs: [frontend, backend]
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+
+      - name: Build Docker image (no push)
+        uses: docker/build-push-action@v5
+        with:
+          context: .
+          push: false
+          tags: tinyforge:ci-${{ gitea.sha }}
+          cache-from: type=gha
+          cache-to: type=gha,mode=max
@@ -10,19 +10,109 @@ env:
  REGISTRY: git.dolgolyov-family.by/alexei.dolgolyov/tiny-forge

 jobs:
-  create-release:
+  # ───────────────────────────────────────────────────────────────────────
+  # Gate the release on a passing test suite. A tagged release must never
+  # ship code that fails `go vet` / `go test`.
+  # ───────────────────────────────────────────────────────────────────────
+  test:
    runs-on: ubuntu-latest
-    outputs:
-      release_id: ${{ steps.create.outputs.release_id }}
    steps:
-      - name: Fetch RELEASE_NOTES.md only
+      - uses: actions/checkout@v4
+
+      - uses: actions/setup-go@v5
+        with:
+          go-version: '1.25'
+          cache-dependency-path: go.sum
+
+      - name: Vet Go code
+        run: go vet ./...
+
+      - name: Run Go tests
+        run: go test ./internal/... -count=1
+
+  # ───────────────────────────────────────────────────────────────────────
+  # Build + push the image FIRST. If this fails, no release is created
+  # (create-release depends on it) — so we never leave an orphan release
+  # pointing at a tag with no published image.
+  # ───────────────────────────────────────────────────────────────────────
+  build-docker:
+    needs: test
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Compute tags
+        id: meta
+        run: |
+          TAG="${{ gitea.ref_name }}"
+          VERSION="${TAG#v}"
+          echo "tag=$TAG" >> "$GITHUB_OUTPUT"
+          echo "version=$VERSION" >> "$GITHUB_OUTPUT"
+          # Detect pre-release (alpha/beta/rc) — these do NOT get :latest.
+          if echo "$TAG" | grep -qE '(alpha|beta|rc)'; then
+            echo "is_pre=true" >> "$GITHUB_OUTPUT"
+          else
+            echo "is_pre=false" >> "$GITHUB_OUTPUT"
+          fi
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+
+      - name: Login to Gitea Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ${{ env.SERVER_HOST }}
+          username: ${{ gitea.actor }}
+          password: ${{ secrets.DEPLOY_TOKEN }}
+
+      - name: Build and push image
+        uses: docker/build-push-action@v5
+        with:
+          context: .
+          push: true
+          tags: |
+            ${{ env.REGISTRY }}:${{ steps.meta.outputs.tag }}
+            ${{ env.REGISTRY }}:${{ steps.meta.outputs.version }}
+            ${{ env.REGISTRY }}:sha-${{ gitea.sha }}
+            ${{ steps.meta.outputs.is_pre == 'false' && format('{0}:latest', env.REGISTRY) || '' }}
+          cache-from: type=registry,ref=${{ env.REGISTRY }}:buildcache
+          cache-to: type=registry,ref=${{ env.REGISTRY }}:buildcache,mode=max
+
+      - name: Trigger redeploy webhook
+        if: steps.meta.outputs.is_pre == 'false'
+        continue-on-error: true
+        run: |
+          if [ -n "${{ secrets.DOCKER_REDEPLOY_WEBHOOK_URL }}" ]; then
+            echo "Triggering redeploy webhook..."
+            curl -sf -X POST "${{ secrets.DOCKER_REDEPLOY_WEBHOOK_URL }}" \
+              --max-time 30 || echo "::warning::Redeploy webhook failed"
+          else
+            echo "DOCKER_REDEPLOY_WEBHOOK_URL not set — skipping auto-deploy"
+          fi
+
+  # ───────────────────────────────────────────────────────────────────────
+  # Create the Gitea release LAST — body = RELEASE_NOTES.md + auto-changelog.
+  # ───────────────────────────────────────────────────────────────────────
+  create-release:
+    needs: build-docker
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout (full history for changelog)
        uses: actions/checkout@v4
        with:
-          sparse-checkout: RELEASE_NOTES.md
-          sparse-checkout-cone-mode: false
+          fetch-depth: 0
+
+      - name: Generate changelog
+        id: changelog
+        run: |
+          PREV_TAG=$(git tag --sort=-v:refname | head -2 | tail -1)
+          if [ -z "$PREV_TAG" ] || [ "$PREV_TAG" = "${{ gitea.ref_name }}" ]; then
+            git log --oneline --no-decorate -n 20 > /tmp/changelog.txt
+          else
+            git log --oneline --no-decorate "${PREV_TAG}..HEAD" > /tmp/changelog.txt
+          fi

      - name: Create Gitea release
-        id: create
        env:
          DEPLOY_TOKEN: ${{ secrets.DEPLOY_TOKEN }}
        run: |
@@ -42,74 +132,49 @@ jobs:
            echo "Found RELEASE_NOTES.md"
          else
            export RELEASE_NOTES=""
-            echo "No RELEASE_NOTES.md found — release will have no body"
+            echo "No RELEASE_NOTES.md found — release body = changelog only"
          fi

-          BODY_JSON=$(python3 -c "
+          # Build release body (notes + changelog) via Python to avoid shell
+          # escaping and CLI length limits.
+          export TAG VERSION IS_PRE
+          python3 <<'PY'
          import json, os
-          notes = os.environ.get('RELEASE_NOTES', '')
-          print(json.dumps(notes.strip()))
-          ")

-          # Create release via Gitea API
-          RELEASE=$(curl -s -X POST "$BASE_URL/releases" \
+          notes = os.environ.get('RELEASE_NOTES', '')
+          changelog = open('/tmp/changelog.txt').read().strip()
+
+          sections = []
+          if notes.strip():
+              sections.append(notes.strip())
+          if changelog:
+              sections.append('## Changelog\n\n' + changelog)
+
+          payload = {
+              'tag_name': os.environ['TAG'],
+              'name': os.environ['VERSION'],
+              'body': '\n\n'.join(sections),
+              'draft': False,
+              'prerelease': os.environ['IS_PRE'] == 'true',
+          }
+          with open('/tmp/release-payload.json', 'w') as f:
+              json.dump(payload, f)
+          PY
+
+          HTTP=$(curl -s -o /tmp/release-resp.json -w "%{http_code}" \
+            -X POST "$BASE_URL/releases" \
            -H "Authorization: token $DEPLOY_TOKEN" \
            -H "Content-Type: application/json" \
-            -d "{
-              \"tag_name\": \"$TAG\",
-              \"name\": \"$VERSION\",
-              \"body\": $BODY_JSON,
-              \"draft\": false,
-              \"prerelease\": $IS_PRE
-            }")
+            --data-binary @/tmp/release-payload.json)

-          # Fallback: if release already exists for this tag, reuse it
-          RELEASE_ID=$(echo "$RELEASE" | python3 -c "import sys,json; print(json.load(sys.stdin)['id'])" 2>/dev/null)
-          if [ -z "$RELEASE_ID" ]; then
-            echo "::warning::Release already exists for tag $TAG — reusing existing release"
-            RELEASE=$(curl -s "$BASE_URL/releases/tags/$TAG" \
-              -H "Authorization: token $DEPLOY_TOKEN")
-            RELEASE_ID=$(echo "$RELEASE" | python3 -c "import sys,json; print(json.load(sys.stdin)['id'])")
-          fi
-          echo "release_id=$RELEASE_ID" >> "$GITHUB_OUTPUT"
-          echo "Created release $RELEASE_ID for $TAG"
-
-  build-docker:
-    needs: create-release
-    runs-on: ubuntu-latest
-    steps:
-      - uses: actions/checkout@v4
-
-      - name: Login to Gitea Container Registry
-        id: docker-login
-        continue-on-error: true
-        run: |
-          echo "${{ secrets.DEPLOY_TOKEN }}" | docker login \
-            "$SERVER_HOST" -u "${{ gitea.actor }}" --password-stdin
-
-      - name: Build and tag
-        if: steps.docker-login.outcome == 'success'
-        run: |
-          TAG="${{ gitea.ref_name }}"
-          VERSION="${TAG#v}"
-          docker build -t "$REGISTRY:$TAG" -t "$REGISTRY:$VERSION" .
-          # Tag as 'latest' only for stable releases
-          if ! echo "$TAG" | grep -qE '(alpha|beta|rc)'; then
-            docker tag "$REGISTRY:$TAG" "$REGISTRY:latest"
-          fi
-
-      - name: Push
-        if: steps.docker-login.outcome == 'success'
-        run: docker push "$REGISTRY" --all-tags
-
-      - name: Trigger Portainer redeploy
-        if: steps.docker-login.outcome == 'success'
-        continue-on-error: true
-        run: |
-          if [ -n "${{ secrets.DOCKER_REDEPLOY_WEBHOOK_URL }}" ]; then
-            echo "Triggering Portainer redeploy..."
-            curl -sf -X POST "${{ secrets.DOCKER_REDEPLOY_WEBHOOK_URL }}" \
-              --max-time 30 || echo "::warning::Portainer webhook failed"
+          echo "POST /releases → HTTP $HTTP"
+          if [ "$HTTP" = "201" ]; then
+            RELEASE_ID=$(python3 -c "import json; print(json.load(open('/tmp/release-resp.json'))['id'])")
+            echo "Created release $RELEASE_ID for $TAG"
+          elif [ "$HTTP" = "409" ] || grep -q "already exists" /tmp/release-resp.json; then
+            echo "::warning::Release already exists for tag $TAG — reusing"
          else
-            echo "DOCKER_REDEPLOY_WEBHOOK_URL not set — skipping auto-deploy"
+            echo "::error::Failed to create release for $TAG (HTTP $HTTP)"
+            head -c 2000 /tmp/release-resp.json; echo
+            exit 1
          fi
@@ -6,7 +6,10 @@ data/
 .env
 tinyforge
 tinyforge.exe
+/cli
+/cli.exe
 server.exe
+tinyforge-server.exe
 docker-watcher
 docker-watcher.exe
 docker-watcher.exe~
@@ -0,0 +1,57 @@
+# vex configuration — https://github.com/tenatarika/vex
+#
+# Place this file in your project root as .vex.toml
+
+# Glob patterns to exclude from indexing (gitignore syntax, on top of .gitignore)
+# exclude = [
+#     "vendor/**",
+#     "node_modules/**",
+#     "*.generated.go",
+#     "dist/**",
+# ]
+
+# Default output format: "text", "json", or "compact"
+# format = "text"
+
+# Enable semantic embeddings by default (slower indexing, enables meaning-based search)
+semantic = true
+
+# Automatically run `vex update` before search if the index is stale
+auto_update = true
+
+# Embedder used for semantic indexing. Known IDs: minilm-l6-v2 (default).
+# Changing the embedder requires a full reindex.
+# embedder = "minilm-l6-v2"
+
+# Cache directory override. Defaults to the platform cache location.
+#   macOS:   ~/Library/Caches/vex
+#   Linux:   $XDG_CACHE_HOME/vex   (fallback: ~/.cache/vex)
+#   Windows: %LOCALAPPDATA%\vex    (fallback: %USERPROFILE%\AppData\Local\vex)
+# Accepts absolute paths, "~/..." or paths relative to this file (e.g. "./.vex/cache").
+# Can also be overridden per-invocation with --cache-dir or $VEX_CACHE_DIR.
+# cache_dir = "./.vex/cache"
+
+# Store the index inside the project as `<project>/.vex_cache/`. Useful when
+# the cache should travel with the project (e.g. on a moved or renamed
+# directory). vex writes a `.gitignore` inside it so contents are not
+# committed. Overridden by `cache_dir`, `--cache-dir`, or $VEX_CACHE_DIR.
+# local_cache = false
+
+# Thread count for parallel indexing (index/update/watch).
+#   * unset  — 80% of available cores, rounded up (default, leaves headroom)
+#   * 0      — use all cores (explicit opt-in to max throughput)
+#   * N      — exactly N workers
+# Overridable per-invocation with `-j/--jobs` or $VEX_JOBS.
+# jobs = 4
+
+# Build the persistent call-graph section. Disabling falls back to live-scan
+# for `vex callers`/`vex callees` (slower per-query, but saves indexing
+# time on large monorepos). The opt-out is persisted in the manifest so
+# `vex update` does not silently re-add the section.
+# Per-invocation override: `vex index --no-call-graph`.
+# call_graph = true
+
+# Build the BM25 channel. Disabling drops the third RRF channel and keeps
+# only structural (+ semantic). Same persistence rules as `call_graph`.
+# Per-invocation override: `vex index --no-bm25`.
+# bm25 = true
@@ -12,3 +12,33 @@ Start/restart with: `./scripts/dev-server.sh`
 ## Frontend

 - **Boolean inputs use `ToggleSwitch`** (`$lib/components/ToggleSwitch.svelte`) — the slide-style switch is the unified control across the WebUI. Do not introduce raw `<input type="checkbox">` elements; place a `<ToggleSwitch>` next to a label/help block instead.
+- **Confirmations & destructive actions use `ConfirmDialog`** (`$lib/components/ConfirmDialog.svelte`) — never native `window.confirm` / `alert`. For navigation guards (e.g. the unsaved-changes prompt on `/apps/new`), `cancel()` the navigation in `beforeNavigate`, open `ConfirmDialog`, and re-issue the navigation with a bypass flag on confirm. Native `beforeunload` is acceptable only for hard tab-close/reload, where the browser forbids custom UI.
+- **Source-config shape: `$lib/workload/sourceForms.ts`** is the single source of truth (seed/serialize/validity for image/compose/static/dockerfile), consumed by both `/apps/new` and `/apps/[id]`. Don't re-inline seed/serialize logic.
+- **"App" = workload with `source_kind !== ''`.** Triggers are first-class bindings (`workload_trigger_bindings`), NOT on the workload row — never gate app lists/counts on `trigger_kind` (it's empty for plugin workloads). Legacy pre-cutover `kind:project/stack/site` rows have an empty `source_kind` and must be excluded everywhere.
+- **i18n parity is mandatory** — every key in BOTH `web/src/lib/i18n/{en,ru}.json`. A missing key is NOT a build error (`$t` returns the key string), so verify parity manually.
+
+## Backend
+
+- **Per-workload deploy lock.** Every deploy entrypoint (API deploy, rollback, promote,
+  generic-hooks, webhook trigger dispatch) funnels through `deployer.DispatchPlugin`, which
+  holds a per-workload `keyedmutex` lock (`internal/keyedmutex`) for the whole dispatch;
+  `DispatchTeardown` takes it too. This serializes all container/volume mutation per workload.
+  Do NOT add a deploy/teardown path that bypasses `DispatchPlugin`. Operations that must run
+  a deploy *while already holding* the lock (volume-snapshot restore) use
+  `Deployer.LockWorkload` + `RedeployLocked` (the unlocked dispatch) — calling `DispatchPlugin`
+  under the held lock would deadlock (Go mutexes are not reentrant). `activeWg` is a global
+  drain barrier for shutdown, NOT a per-workload lock.
+- **Volume snapshot restore** lives in `volsnap.Engine.Restore` (engine-owned, not the API
+  handler): preflight re-resolves volumes from the workload's CURRENT config (never the
+  snapshot manifest — that's tamper-influenceable) → lock → stop → extract-to-tmp →
+  pre-restore snapshot → journal → atomic rename swap → redeploy. A startup
+  `RecoverInterruptedRestores` sweep replays the journal after a crash; it MUST be wired (with
+  `SetLifecycle`) before the API serves. The archive extractor treats the tar as untrusted
+  (zip-slip/type-allowlist/bomb-cap); the endpoint requires an `X-Confirm-Restore: <sid>`
+  header (CSRF), like the DB restore.
+
+## Build & Test
+
+- Frontend (from `web/`): `npm run check` (svelte-check — expect 0 errors), `npm run build`, `npm run test` (vitest; pure-logic units like `sourceForms.test.ts`).
+- Backend (repo root): `go build ./...`, `go vet ./internal/...`, `go test ./internal/...`.
+- `./scripts/dev-server.sh` rebuilds the SPA + restarts the Go server on :8090; it kills the prior process, so a previous background dev-server task reporting **exit 1 is expected**, not a failure.
@@ -1,3 +1,4 @@
+# syntax=docker/dockerfile:1.7
 # Stage 1: Build frontend
 FROM node:20-alpine AS frontend-builder

@@ -9,25 +10,33 @@ COPY web/ ./
 RUN npm run build

 # Stage 2: Build Go binary
-FROM golang:1.24-alpine AS backend-builder
+FROM golang:1.25-alpine AS backend-builder

 RUN apk add --no-cache git ca-certificates

 WORKDIR /build
 COPY go.mod go.sum ./
 ENV GOTOOLCHAIN=auto
-RUN go mod download
+# Cache mounts persist the module + build caches across rebuilds (BuildKit).
+RUN --mount=type=cache,target=/go/pkg/mod \
+    go mod download

 COPY . .
 # Copy built frontend into the expected embed location.
 COPY --from=frontend-builder /build/web/build ./web/build

-RUN CGO_ENABLED=0 GOOS=linux go build -ldflags="-s -w" -o /tinyforge ./cmd/server
+RUN --mount=type=cache,target=/go/pkg/mod \
+    --mount=type=cache,target=/root/.cache/go-build \
+    CGO_ENABLED=0 GOOS=linux go build -ldflags="-s -w" -o /tinyforge ./cmd/server

 # Stage 3: Minimal runtime image
 FROM alpine:3.19

-RUN apk add --no-cache ca-certificates tzdata
+LABEL org.opencontainers.image.source="https://git.dolgolyov-family.by/alexei.dolgolyov/tiny-forge"
+LABEL org.opencontainers.image.title="Tinyforge"
+LABEL org.opencontainers.image.description="Self-hosted Docker deployment + mini-CI platform"
+
+RUN apk add --no-cache ca-certificates tzdata wget

 # Create non-root user.
 RUN addgroup -g 1000 -S app && adduser -u 1000 -S app -G app
@@ -46,4 +55,10 @@ EXPOSE 8080
 ENV DATA_DIR=/app/data
 ENV LISTEN_ADDR=:8080

+VOLUME /app/data
+
+# /readyz is the public readiness probe (pings the DB); /livez is liveness.
+HEALTHCHECK --interval=30s --timeout=5s --retries=3 --start-period=10s \
+    CMD wget --no-verbose --tries=1 --spider http://localhost:8080/readyz || exit 1
+
 ENTRYPOINT ["/app/tinyforge"]
@@ -11,6 +11,15 @@ Self-hosted deployment platform with a web dashboard. Deploy Docker containers f
 - **Multi-stage projects** (dev, staging, prod) with tag pattern matching
 - **Real-time deploy logs** via SSE streaming

+### Branch Preview Environments
+
+Get an isolated, throwaway deploy for every feature branch:
+
+- Add a **branch pattern** (e.g. `feat/*`) to a workload's **git trigger** (Triggers panel → git trigger → *Branch pattern*).
+- Pushing to any branch matching the pattern deploys an **isolated per-branch preview** — a child workload that inherits the source config, served at a **slug-prefixed subdomain** (`feat-login-app.example.com`) so previews never collide with each other or the main deploy.
+- Previews are **automatically torn down** when the branch is deleted upstream.
+- Manage live previews from the app's **Preview environments** panel (`/apps/[id]`): open each branch's URL or tear it down manually. A torn-down preview is recreated on the next push to its branch.
+
 ### Static Sites

 Deploy static sites and Deno-powered APIs directly from Git repositories:
@@ -106,6 +115,46 @@ curl -X POST https://your-domain/api/webhook/<secret> \
 3. Enter your provider's Issuer URL, Client ID, and Client Secret
 4. Set the Redirect URL to `https://your-domain/api/auth/oidc/callback`

+## CLI
+
+`tinyforge` is a terminal client for driving a server from the shell, built on the same HTTP API as the web UI.
+
+### Build
+
+```bash
+go build -o tinyforge ./cmd/cli      # writes ./tinyforge (on Windows pass -o tinyforge.exe)
+```
+
+### Usage
+
+```bash
+# Log in once — caches a 24h token in ~/.tinyforge/config.json (mode 0600)
+tinyforge login --base-url http://localhost:8090
+# ...or non-interactively (no password echo / shell-history leak):
+TINYFORGE_PASSWORD=… tinyforge login --base-url http://localhost:8090 --user admin
+
+tinyforge apps                              # list apps + container state
+tinyforge deploy my-app                     # deploy and wait for completion
+tinyforge deploy my-app --ref v1.2.3 --note "hotfix"
+tinyforge logs my-app -f                    # follow logs (Ctrl-C to stop)
+tinyforge status                            # server health + current user
+tinyforge status my-app                     # one app's containers
+tinyforge logout                            # revoke + clear the cached token
+```
+
+### Server & token resolution
+
+| Setting  | Flag         | Env               | Default                  |
+| -------- | ------------ | ----------------- | ------------------------ |
+| Base URL | `--base-url` | `TINYFORGE_URL`   | `http://localhost:8080`  |
+| Token    | `--token`    | `TINYFORGE_TOKEN` | cached by `login`        |
+| Config   | `--config`   | `TINYFORGE_CONFIG`| `~/.tinyforge/config.json` |
+
+### Notes
+
+- Login returns a **24h JWT** — there is no long-lived API token yet, so unattended use must log in again once the token expires. `deploy` / `stop` / `start` require an **admin** account.
+- The token is sent as an `Authorization: Bearer` header (never placed in the URL) and the config file is written with `0600` permissions.
+
 ## Development

 ```bash
@@ -0,0 +1,149 @@
+package main
+
+import (
+	"context"
+	"flag"
+	"fmt"
+	"os"
+	"sort"
+	"strings"
+	"text/tabwriter"
+	"time"
+)
+
+func runApps(args []string) error {
+	// Entry point for `tinyforge apps`. A leading "list" word is optional and dropped, so `tinyforge apps` == `tinyforge apps list`.
+	if len(args) > 0 && args[0] == "list" {
+		args = args[1:]
+	}
+	fs := flag.NewFlagSet("apps", flag.ExitOnError) // ExitOnError: Parse exits the process itself on a bad flag
+	g := addGlobalFlags(fs)
+	fs.Usage = func() {
+		fmt.Fprint(os.Stderr, "Usage: tinyforge apps [list] [--base-url URL]\n\nList apps (workloads with a source) and their container state.\n")
+		fs.PrintDefaults()
+	}
+	if err := fs.Parse(args); err != nil {
+		return err
+	}
+
+	sess, err := newSession(g)
+	if err != nil {
+		return err
+	}
+	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) // one 30s budget shared by both API calls below
+	defer cancel()
+
+	var workloads []Workload
+	if err := sess.client.doJSON(ctx, "GET", "/api/workloads", nil, &workloads); err != nil {
+		return err
+	}
+
+	// A single extra request fetches every container up front, so per-app state is
+	// read from the byWorkload index below instead of issuing an N+1 per-app request.
+	var containers []Container
+	if err := sess.client.doJSON(ctx, "GET", "/api/containers", nil, &containers); err != nil {
+		return err
+	}
+	byWorkload := map[string][]Container{} // workload id -> its containers
+	for _, c := range containers {
+		byWorkload[c.WorkloadID] = append(byWorkload[c.WorkloadID], c)
+	}
+
+	apps := make([]Workload, 0, len(workloads)) // only workloads accepted by isApp() are listed
+	for _, w := range workloads {
+		if w.isApp() {
+			apps = append(apps, w)
+		}
+	}
+	sort.Slice(apps, func(i, j int) bool { return apps[i].Name < apps[j].Name }) // name-ordered listing
+
+	if len(apps) == 0 {
+		fmt.Println("No apps yet. Create one in the web UI, then deploy with 'tinyforge deploy <app>'.")
+		return nil
+	}
+
+	tw := tabwriter.NewWriter(os.Stdout, 0, 2, 2, ' ', 0) // space-padded, column-aligned table on stdout
+	fmt.Fprintln(tw, "NAME\tSOURCE\tSTATE\tID")
+	for _, w := range apps {
+		fmt.Fprintf(tw, "%s\t%s\t%s\t%s\n", w.Name, w.SourceKind, stateSummary(byWorkload[w.ID]), idShort(w.ID))
+	}
+	return tw.Flush() // the table is only written out on Flush; its error is the command's result
+}
+
+// stateSummary condenses a workload's containers into one status string: "—" for none, "running" when all run, the first container's State when none run, otherwise "N/M running".
+func stateSummary(cs []Container) string {
+	if len(cs) == 0 {
+		return "—"
+	}
+	running := 0
+	for _, c := range cs {
+		if c.State == "running" {
+			running++
+		}
+	}
+	switch {
+	case running == len(cs):
+		return "running"
+	case running == 0:
+		return cs[0].State // none running: the first container's state stands in for all (e.g. stopped / failed / missing)
+	default:
+		return fmt.Sprintf("%d/%d running", running, len(cs))
+	}
+}
+
+// resolveApp maps a user-supplied reference (full id, name, or id prefix of at least 6 characters) to a single app workload.
+// Precedence: exact id, then case-insensitive name, then a unique id prefix; an ambiguous or missing match returns an error.
+func resolveApp(ctx context.Context, c *Client, ref string) (Workload, error) {
+	var workloads []Workload
+	if err := c.doJSON(ctx, "GET", "/api/workloads", nil, &workloads); err != nil {
+		return Workload{}, err
+	}
+
+	var byID, byName, byPrefix []Workload
+	for _, w := range workloads {
+		if !w.isApp() { // non-app workloads are never candidates
+			continue
+		}
+		switch { // each workload lands in at most one bucket, the first case that matches
+		case w.ID == ref:
+			byID = append(byID, w)
+		case strings.EqualFold(w.Name, ref):
+			byName = append(byName, w)
+		case len(ref) >= 6 && strings.HasPrefix(w.ID, ref):
+			byPrefix = append(byPrefix, w)
+		}
+	}
+
+	if len(byID) == 1 {
+		return byID[0], nil
+	}
+	if len(byName) == 1 {
+		return byName[0], nil
+	}
+	if len(byName) > 1 { // several names match: report them rather than falling through to prefixes
+		return Workload{}, ambiguousErr(ref, byName)
+	}
+	if len(byPrefix) == 1 {
+		return byPrefix[0], nil
+	}
+	if len(byPrefix) > 1 {
+		return Workload{}, ambiguousErr(ref, byPrefix)
+	}
+	return Workload{}, fmt.Errorf("no app matching %q (try 'tinyforge apps list')", ref)
+}
+
+func ambiguousErr(ref string, matches []Workload) error { // ambiguousErr builds a multi-line error listing each candidate's short id and name
+	var b strings.Builder
+	fmt.Fprintf(&b, "%q matches multiple apps; use the id:\n", ref)
+	for _, w := range matches {
+		fmt.Fprintf(&b, "  %s  %s\n", idShort(w.ID), w.Name)
+	}
+	return fmt.Errorf("%s", strings.TrimRight(b.String(), "\n")) // trailing newline trimmed; "%s" keeps the text from being parsed as a format string
+}
+
+func idShort(id string) string { // idShort returns the first 8 bytes of id, or id unchanged when it is 8 bytes or shorter
+	if len(id) > 8 {
+		return id[:8]
+	}
+	return id
+}
@@ -0,0 +1,232 @@
+package main
+
+import (
+	"bufio"
+	"bytes"
+	"context"
+	"encoding/json"
+	"errors"
+	"fmt"
+	"io"
+	"net/http"
+	"strings"
+)
+
+// apiError carries the server's error message plus the HTTP status, so callers
+// can distinguish auth failures (401) from other errors without losing the
+// server's message (e.g. "invalid credentials" vs "invalid or expired token").
+type apiError struct {
+	status int    // HTTP status code of the failed response
+	msg    string // message shown to the user; returned verbatim by Error
+}
+
+// Error implements the error interface by returning the stored message as-is.
+func (e *apiError) Error() string { return e.msg }
+
+// isAuthError reports whether err is, or wraps, an *apiError whose HTTP status
+// is 401 Unauthorized.
+func isAuthError(err error) bool {
+	var apiErr *apiError
+	if !errors.As(err, &apiErr) {
+		return false
+	}
+	return apiErr.status == http.StatusUnauthorized
+}
+
+// Client talks to the Tinyforge HTTP API. It has no global timeout so that
+// long synchronous deploys and follow streams work; callers pass a context
+// with the appropriate deadline.
+type Client struct {
+	baseURL string       // server URL that request paths are appended to
+	token   string       // bearer token; empty means requests go out unauthenticated
+	http    *http.Client // underlying HTTP client used for every request
+}
+
+// newClient builds a Client for baseURL and token. Trailing slashes are
+// stripped from baseURL so request paths can be appended to it directly.
+func newClient(baseURL, token string) *Client {
+	c := new(Client)
+	c.baseURL = strings.TrimRight(baseURL, "/")
+	c.token = token
+	c.http = &http.Client{}
+	return c
+}
+
+// apiEnvelope mirrors the server's response wrapper. The server's struct is
+// unexported, so the CLI defines its own matching shape. Data is deferred so a
+// single decode path serves every endpoint.
+type apiEnvelope struct {
+	Success bool            `json:"success"` // false marks a failed request even on a 2xx status
+	Data    json.RawMessage `json:"data"`    // endpoint-specific payload, decoded later into the caller's type
+	Error   string          `json:"error"`   // server-provided error message, if any
+}
+
+// SessionToken is the data payload of POST /api/auth/login.
+type SessionToken struct {
+	Token     string `json:"token"`      // bearer token to send on later requests
+	ExpiresAt string `json:"expires_at"` // expiry timestamp as sent by the server, kept as a string
+}
+
+// User is the data payload of GET /api/auth/me. Only the fields the CLI
+// decodes are listed here.
+type User struct {
+	ID       string `json:"id"`
+	Username string `json:"username"`
+	Email    string `json:"email"`
+	Role     string `json:"role"`
+}
+
+// Workload is the subset of the workload row the CLI needs. An "app" is a
+// workload with a non-empty SourceKind.
+type Workload struct {
+	ID         string `json:"id"`
+	Name       string `json:"name"`
+	Kind       string `json:"kind"`
+	AppID      string `json:"app_id"`
+	SourceKind string `json:"source_kind"` // empty for workloads that are not apps
+	CreatedAt  string `json:"created_at"`
+}
+
+// isApp reports whether the workload is an app, i.e. has a non-empty SourceKind.
+func (w Workload) isApp() bool { return w.SourceKind != "" }
+
+// Container is the subset of a container row the CLI needs. State is one of
+// running|stopped|failed|missing|starting|created|restarting|paused|...
+type Container struct {
+	ID          string `json:"id"` // row id; used in API paths and shown shortened in listings
+	WorkloadID  string `json:"workload_id"`
+	Role        string `json:"role"` // may be empty; listings render that as "(default)"
+	ContainerID string `json:"container_id"`
+	ImageRef    string `json:"image_ref"`
+	State       string `json:"state"`
+	Port        int    `json:"port"` // 0 is rendered as blank in listings
+	Subdomain   string `json:"subdomain"`
+	CreatedAt   string `json:"created_at"`
+}
+
+// DeployResult is the data payload of POST /api/workloads/{id}/deploy.
+type DeployResult struct {
+	WorkloadID  string `json:"workload_id"`
+	Reference   string `json:"reference"`
+	TriggeredBy string `json:"triggered_by"` // printed in the deploy completion message
+}
+
+// doJSON performs a JSON request and unwraps the response envelope. body may be
+// nil. out may be nil when the caller does not need the data payload.
+//
+// Any response with status >= 400, or whose envelope reports success=false, is
+// returned as an *apiError carrying the HTTP status (so isAuthError can detect
+// a 401) and the server's error message when one was sent. Transport, encode
+// and decode failures are returned as plain wrapped errors.
+func (c *Client) doJSON(ctx context.Context, method, path string, body, out any) error {
+	var reqBody io.Reader
+	if body != nil {
+		buf, err := json.Marshal(body)
+		if err != nil {
+			return fmt.Errorf("encode request: %w", err)
+		}
+		reqBody = bytes.NewReader(buf)
+	}
+
+	req, err := http.NewRequestWithContext(ctx, method, c.baseURL+path, reqBody)
+	if err != nil {
+		return fmt.Errorf("build request: %w", err)
+	}
+	if body != nil {
+		req.Header.Set("Content-Type", "application/json")
+	}
+	c.authorize(req)
+
+	resp, err := c.http.Do(req)
+	if err != nil {
+		return fmt.Errorf("%s %s: %w", method, path, err)
+	}
+	defer resp.Body.Close()
+
+	// Read at most 8 MiB of the response body.
+	raw, err := io.ReadAll(io.LimitReader(resp.Body, 8<<20))
+	if err != nil {
+		return fmt.Errorf("read response: %w", err)
+	}
+
+	var env apiEnvelope
+	if jsonErr := json.Unmarshal(raw, &env); jsonErr != nil {
+		// Non-JSON body (e.g. a proxy error page). Surface status + a snippet,
+		// preserving auth-error typing for 401s with an unparseable body.
+		if resp.StatusCode >= 400 {
+			return &apiError{status: resp.StatusCode, msg: fmt.Sprintf(
+				"%s %s: unexpected response (status %d): %s", method, path, resp.StatusCode, snippet(raw))}
+		}
+		return fmt.Errorf("%s %s: decode response: %w", method, path, jsonErr)
+	}
+	if resp.StatusCode >= 400 || !env.Success {
+		// Prefer the server's message; fall back to a generic one with the status.
+		msg := env.Error
+		if msg == "" {
+			msg = fmt.Sprintf("%s %s: request failed (status %d)", method, path, resp.StatusCode)
+		}
+		return &apiError{status: resp.StatusCode, msg: msg}
+	}
+	// The payload is decoded only when the caller asked for it and one was sent.
+	if out != nil && len(env.Data) > 0 {
+		if err := json.Unmarshal(env.Data, out); err != nil {
+			return fmt.Errorf("decode response data: %w", err)
+		}
+	}
+	return nil
+}
+
+// authorize sets the Authorization: Bearer header on req when the client has a
+// token, and leaves the request untouched otherwise. Sending the JWT in a
+// header (rather than a ?token= query param) keeps it out of server and proxy
+// logs.
+func (c *Client) authorize(req *http.Request) {
+	if c.token == "" {
+		return
+	}
+	req.Header.Set("Authorization", "Bearer "+c.token)
+}
+
+// streamSSE opens an SSE stream and invokes onData for each `data:` payload.
+// Comment lines (heartbeats, beginning with ':') and blanks are skipped. The
+// stream ends on EOF, context cancellation, or when onData returns an error.
+//
+// A non-200 response is returned as an *apiError carrying the status and, when
+// the body is a JSON envelope with an error field, the server's message. Each
+// data line is delivered on its own; consecutive data lines are not joined.
+func (c *Client) streamSSE(ctx context.Context, path string, onData func(payload []byte) error) error {
+	req, err := http.NewRequestWithContext(ctx, http.MethodGet, c.baseURL+path, nil)
+	if err != nil {
+		return fmt.Errorf("build request: %w", err)
+	}
+	req.Header.Set("Accept", "text/event-stream")
+	c.authorize(req)
+
+	resp, err := c.http.Do(req)
+	if err != nil {
+		return fmt.Errorf("GET %s: %w", path, err)
+	}
+	defer resp.Body.Close()
+
+	if resp.StatusCode != http.StatusOK {
+		// Best-effort read of up to 1 MiB to recover the server's error message;
+		// a read failure just leaves the generic message in place.
+		raw, _ := io.ReadAll(io.LimitReader(resp.Body, 1<<20))
+		var env apiEnvelope
+		msg := fmt.Sprintf("GET %s: stream failed (status %d)", path, resp.StatusCode)
+		if json.Unmarshal(raw, &env) == nil && env.Error != "" {
+			msg = env.Error
+		}
+		return &apiError{status: resp.StatusCode, msg: msg}
+	}
+
+	scanner := bufio.NewScanner(resp.Body)
+	scanner.Buffer(make([]byte, 0, 64<<10), 2<<20) // tolerate long log lines
+	for scanner.Scan() {
+		line := scanner.Text()
+		if line == "" || strings.HasPrefix(line, ":") {
+			continue // blank separator or SSE comment/heartbeat
+		}
+		data, ok := strings.CutPrefix(line, "data:")
+		if !ok {
+			continue // ignore event:/id: fields — the API uses default events
+		}
+		// Strip the single optional space that may follow "data:".
+		if err := onData([]byte(strings.TrimPrefix(data, " "))); err != nil {
+			return err
+		}
+	}
+	// Cancellation is a normal way to end the stream, not a read failure.
+	if err := scanner.Err(); err != nil && !errors.Is(err, context.Canceled) {
+		return fmt.Errorf("read stream: %w", err)
+	}
+	return nil
+}
+
+// snippet returns a short, single-line view of an unexpected response body:
+// surrounding whitespace trimmed, newlines replaced by spaces, and anything
+// beyond maxLen bytes cut off with a trailing "…". The cut is moved back to
+// the nearest rune boundary so a multi-byte UTF-8 character is never split.
+// An empty result is rendered as "(empty body)".
+func snippet(b []byte) string {
+	const maxLen = 200 // not named "max", which would shadow the builtin
+	s := strings.TrimSpace(string(b))
+	s = strings.ReplaceAll(s, "\n", " ")
+	if len(s) > maxLen {
+		// Ranging over a string yields the byte offset of each rune start;
+		// keep the last offset that does not exceed maxLen.
+		cut := 0
+		for i := range s {
+			if i > maxLen {
+				break
+			}
+			cut = i
+		}
+		s = s[:cut] + "…"
+	}
+	if s == "" {
+		s = "(empty body)"
+	}
+	return s
+}
@@ -0,0 +1,148 @@
+package main
+
+import (
+	"bytes"
+	"encoding/json"
+	"flag"
+	"fmt"
+	"os"
+	"path/filepath"
+)
+
+// defaultBaseURL matches the server's default LISTEN_ADDR (:8080). The dev
+// server runs on :8090; point at it with --base-url or $TINYFORGE_URL.
+const defaultBaseURL = "http://localhost:8080"
+
+// Config is the persisted CLI state at ~/.tinyforge/config.json.
+type Config struct {
+	BaseURL   string `json:"base_url"`   // server URL; used when no flag or env var overrides it
+	Token     string `json:"token"`      // cached bearer token; empty when logged out
+	ExpiresAt string `json:"expires_at"` // token expiry as reported by the server at login
+}
+
+// globals holds the cross-cutting flags every command accepts. Each field
+// points at the flag's parsed value; an empty string means the flag was not
+// set and the next source in the precedence chain applies.
+type globals struct {
+	baseURL    *string // --base-url
+	token      *string // --token
+	configPath *string // --config
+}
+
+// addGlobalFlags registers --base-url, --token and --config on fs and returns
+// the handles through which their parsed values are read.
+func addGlobalFlags(fs *flag.FlagSet) *globals {
+	g := &globals{}
+	g.baseURL = fs.String("base-url", "", "Tinyforge server URL (default $TINYFORGE_URL or "+defaultBaseURL+")")
+	g.token = fs.String("token", "", "auth token (default $TINYFORGE_TOKEN or cached config)")
+	g.configPath = fs.String("config", "", "config file path (default $TINYFORGE_CONFIG or ~/.tinyforge/config.json)")
+	return g
+}
+
+// configFilePath picks the config file location. The --config flag wins, then
+// $TINYFORGE_CONFIG, and finally ~/.tinyforge/config.json; only that last
+// fallback can fail, when the home directory cannot be determined.
+func configFilePath(g *globals) (string, error) {
+	if g != nil {
+		if explicit := *g.configPath; explicit != "" {
+			return explicit, nil
+		}
+	}
+	if fromEnv := os.Getenv("TINYFORGE_CONFIG"); fromEnv != "" {
+		return fromEnv, nil
+	}
+
+	homeDir, err := os.UserHomeDir()
+	if err != nil {
+		return "", fmt.Errorf("locate home directory: %w", err)
+	}
+	return filepath.Join(homeDir, ".tinyforge", "config.json"), nil
+}
+
+// loadConfig reads and decodes the config file at path. A file that does not
+// exist, or that is empty or whitespace-only (e.g. freshly touched), counts as
+// "no config yet" and yields a zero Config with no error. Any other read or
+// parse failure is returned with the path in the message.
+func loadConfig(path string) (Config, error) {
+	raw, err := os.ReadFile(path)
+	switch {
+	case err == nil:
+		// fall through to decoding
+	case os.IsNotExist(err):
+		return Config{}, nil
+	default:
+		return Config{}, fmt.Errorf("read config %s: %w", path, err)
+	}
+
+	if len(bytes.TrimSpace(raw)) == 0 {
+		return Config{}, nil
+	}
+
+	var cfg Config
+	if err := json.Unmarshal(raw, &cfg); err != nil {
+		return cfg, fmt.Errorf("parse config %s: %w", path, err)
+	}
+	return cfg, nil
+}
+
+// saveConfig writes cfg to path as indented JSON with 0600 permissions, since
+// the file holds a bearer token. The parent directory is created (0700) if
+// absent.
+//
+// A pre-existing file is tightened to 0600 BEFORE the new content is written:
+// os.WriteFile applies its mode only when it creates the file, so writing
+// first would leave the fresh token readable under the old, looser mode until
+// a later chmod.
+func saveConfig(path string, cfg Config) error {
+	// filepath.Dir never returns "" (a bare filename yields "."), so the
+	// directory is created unconditionally.
+	if err := os.MkdirAll(filepath.Dir(path), 0o700); err != nil {
+		return fmt.Errorf("create config dir: %w", err)
+	}
+	data, err := json.MarshalIndent(cfg, "", "  ")
+	if err != nil {
+		return fmt.Errorf("encode config: %w", err)
+	}
+	// A missing file is fine here: WriteFile will create it with 0600 below.
+	if err := os.Chmod(path, 0o600); err != nil && !os.IsNotExist(err) {
+		return fmt.Errorf("secure config %s: %w", path, err)
+	}
+	if err := os.WriteFile(path, append(data, '\n'), 0o600); err != nil {
+		return fmt.Errorf("write config %s: %w", path, err)
+	}
+	return nil
+}
+
+// resolveBaseURL applies precedence: --base-url > $TINYFORGE_URL > config > default.
+func resolveBaseURL(g *globals, cfg Config) string {
+	if g != nil && *g.baseURL != "" {
+		return *g.baseURL
+	}
+	if env := os.Getenv("TINYFORGE_URL"); env != "" {
+		return env
+	}
+	if cfg.BaseURL != "" {
+		return cfg.BaseURL
+	}
+	return defaultBaseURL
+}
+
+// resolveToken applies precedence: --token > $TINYFORGE_TOKEN > config.
+func resolveToken(g *globals, cfg Config) string {
+	if g != nil && *g.token != "" {
+		return *g.token
+	}
+	if env := os.Getenv("TINYFORGE_TOKEN"); env != "" {
+		return env
+	}
+	return cfg.Token
+}
+
+// session bundles the resolved client with the loaded config and its path, so
+// commands can both make requests and persist updates (e.g. login).
+type session struct {
+	client     *Client // built from the resolved base URL and token
+	cfg        Config  // config as loaded from disk (zero value when no file exists)
+	configPath string  // where cfg was loaded from and where updates are saved
+}
+
+// newSession resolves the config path, loads the config from it, and returns
+// a session whose client uses the base URL and token resolved from g and that
+// config.
+func newSession(g *globals) (*session, error) {
+	cfgPath, err := configFilePath(g)
+	if err != nil {
+		return nil, err
+	}
+
+	cfg, err := loadConfig(cfgPath)
+	if err != nil {
+		return nil, err
+	}
+
+	baseURL, token := resolveBaseURL(g, cfg), resolveToken(g, cfg)
+	s := &session{cfg: cfg, configPath: cfgPath}
+	s.client = newClient(baseURL, token)
+	return s, nil
+}
@@ -0,0 +1,73 @@
+package main
+
+import (
+	"context"
+	"flag"
+	"fmt"
+	"os"
+	"time"
+)
+
+// runDeploy implements "tinyforge deploy <app>": it resolves the app, POSTs a
+// deploy request (optionally carrying --ref and --note) and waits up to
+// --timeout for the server to answer.
+//
+// Flags may appear before or after <app>. The flag package stops parsing at
+// the first non-flag argument, so the documented "deploy <app> --ref TAG" form
+// would otherwise leave --ref unparsed and be rejected as extra arguments.
+func runDeploy(args []string) error {
+	fs := flag.NewFlagSet("deploy", flag.ExitOnError)
+	g := addGlobalFlags(fs)
+	ref := fs.String("ref", "", "image tag / git ref / source-specific deploy target")
+	note := fs.String("note", "", "free-text note recorded with the deploy")
+	timeout := fs.Duration("timeout", 15*time.Minute, "max time to wait for the deploy to finish")
+	fs.Usage = func() {
+		fmt.Fprint(os.Stderr, "Usage: tinyforge deploy <app> [--ref TAG] [--note TEXT] [--timeout DUR]\n\n"+
+			"Trigger a deploy and wait for it to finish. Requires an admin token.\n")
+		fs.PrintDefaults()
+	}
+
+	// Parse, peel off one positional, and parse the remainder again until
+	// nothing is left, so flags and positionals can be interspersed.
+	var positional []string
+	rest := args
+	for {
+		if err := fs.Parse(rest); err != nil {
+			return err
+		}
+		if fs.NArg() == 0 {
+			break
+		}
+		positional = append(positional, fs.Arg(0))
+		rest = fs.Args()[1:]
+	}
+	if len(positional) != 1 {
+		fs.Usage()
+		return fmt.Errorf("expected exactly one app (name or id)")
+	}
+
+	sess, err := newSession(g)
+	if err != nil {
+		return err
+	}
+
+	// Resolve the app on a short deadline; the deploy itself gets the full one.
+	resolveCtx, cancelResolve := context.WithTimeout(context.Background(), 30*time.Second)
+	defer cancelResolve()
+	app, err := resolveApp(resolveCtx, sess.client, positional[0])
+	if err != nil {
+		return err
+	}
+
+	// Only send the fields the user actually set.
+	body := map[string]string{}
+	if *ref != "" {
+		body["reference"] = *ref
+	}
+	if *note != "" {
+		body["note"] = *note
+	}
+
+	fmt.Printf("Deploying %s%s…\n", app.Name, refSuffix(*ref))
+
+	// The endpoint returns 202 but blocks until the deploy completes, so a
+	// success here means it finished; allow plenty of time for pull/build.
+	ctx, cancel := context.WithTimeout(context.Background(), *timeout)
+	defer cancel()
+
+	var result DeployResult
+	if err := sess.client.doJSON(ctx, "POST", "/api/workloads/"+app.ID+"/deploy", body, &result); err != nil {
+		return err
+	}
+
+	fmt.Printf("Deploy of %s completed (triggered by %s).\n", app.Name, result.TriggeredBy)
+	// The flag goes before the app name so the hint works with plain
+	// flag-package parsing too.
+	fmt.Printf("Follow with: tinyforge logs -f %s\n", app.Name)
+	return nil
+}
+
+// refSuffix renders the " @ <ref>" tail of the "Deploying…" message, or
+// nothing when no ref was given.
+func refSuffix(ref string) string {
+	if ref != "" {
+		return " @ " + ref
+	}
+	return ""
+}
@@ -0,0 +1,136 @@
+package main
+
+import (
+	"bufio"
+	"context"
+	"flag"
+	"fmt"
+	"os"
+	"strings"
+	"time"
+)
+
+// runLogin implements "tinyforge login": it gathers a username and password,
+// POSTs them to /api/auth/login, and on success saves the returned token, its
+// expiry and the resolved base URL to the config file.
+//
+// The username comes from --user or an interactive prompt. The password comes
+// from --password, then $TINYFORGE_PASSWORD, then a prompt.
+func runLogin(args []string) error {
+	fs := flag.NewFlagSet("login", flag.ExitOnError)
+	g := addGlobalFlags(fs)
+	user := fs.String("user", "", "username (prompted if omitted)")
+	pass := fs.String("password", "", "password (insecure; prefer $TINYFORGE_PASSWORD or the prompt)")
+	fs.Usage = func() {
+		fmt.Fprint(os.Stderr, "Usage: tinyforge login [--user U] [--password P] [--base-url URL]\n\n"+
+			"Authenticate against the server and cache the token.\n")
+		fs.PrintDefaults()
+	}
+	if err := fs.Parse(args); err != nil {
+		return err
+	}
+
+	sess, err := newSession(g)
+	if err != nil {
+		return err
+	}
+
+	username := *user
+	if username == "" {
+		username, err = promptLine("Username: ")
+		if err != nil {
+			return err
+		}
+	}
+
+	password := *pass
+	if password == "" {
+		password = os.Getenv("TINYFORGE_PASSWORD")
+	}
+	if password == "" {
+		password, err = promptPassword("Password: ")
+		if err != nil {
+			return err
+		}
+	}
+	// A prompt answered with an empty line still leaves the value empty.
+	if username == "" || password == "" {
+		return fmt.Errorf("username and password are required")
+	}
+
+	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
+	defer cancel()
+
+	var tok SessionToken
+	body := map[string]string{"username": username, "password": password}
+	if err := sess.client.doJSON(ctx, "POST", "/api/auth/login", body, &tok); err != nil {
+		return err
+	}
+
+	// Persist the resolved base URL alongside the token so later commands need
+	// no flags. The token file is written 0600 by saveConfig.
+	sess.cfg.BaseURL = sess.client.baseURL
+	sess.cfg.Token = tok.Token
+	sess.cfg.ExpiresAt = tok.ExpiresAt
+	if err := saveConfig(sess.configPath, sess.cfg); err != nil {
+		return err
+	}
+
+	fmt.Printf("Logged in to %s as %s.\n", sess.client.baseURL, username)
+	if exp := friendlyExpiry(tok.ExpiresAt); exp != "" {
+		fmt.Printf("Token valid until %s.\n", exp)
+	}
+	return nil
+}
+
+// runLogout implements "tinyforge logout": it asks the server to revoke the
+// current token and then clears the token and expiry from the config file.
+// The local token is cleared even when the server call fails; in that case
+// the failure is reported in the output but the command still succeeds.
+func runLogout(args []string) error {
+	fs := flag.NewFlagSet("logout", flag.ExitOnError)
+	g := addGlobalFlags(fs)
+	if err := fs.Parse(args); err != nil {
+		return err
+	}
+
+	sess, err := newSession(g)
+	if err != nil {
+		return err
+	}
+	// The client token is the resolved one (flag, env or config), so an empty
+	// value means there is nothing to revoke or clear.
+	if sess.client.token == "" {
+		fmt.Println("Not logged in.")
+		return nil
+	}
+
+	ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second)
+	defer cancel()
+
+	// Best-effort server-side revocation; clear the local token regardless.
+	revokeErr := sess.client.doJSON(ctx, "POST", "/api/auth/logout", nil, nil)
+
+	sess.cfg.Token = ""
+	sess.cfg.ExpiresAt = ""
+	if err := saveConfig(sess.configPath, sess.cfg); err != nil {
+		return err
+	}
+
+	if revokeErr != nil {
+		fmt.Printf("Cleared local token (server revocation skipped: %v).\n", revokeErr)
+		return nil
+	}
+	fmt.Println("Logged out.")
+	return nil
+}
+
+// promptLine prints label to stderr and reads a single line from stdin,
+// returning it with surrounding whitespace trimmed. A read error is reported
+// only when no input at all was read; a final line without a trailing newline
+// is still returned.
+//
+// NOTE(review): a fresh bufio.Reader is created on every call, so any input it
+// buffers beyond the first line is lost when the function returns. With piped
+// stdin, a later read through a different reader may therefore see EOF.
+func promptLine(label string) (string, error) {
+	fmt.Fprint(os.Stderr, label)
+	r := bufio.NewReader(os.Stdin)
+	line, err := r.ReadString('\n')
+	if err != nil && line == "" {
+		return "", fmt.Errorf("read input: %w", err)
+	}
+	return strings.TrimSpace(line), nil
+}
+
+// friendlyExpiry renders an RFC3339 expiry timestamp in the local time zone
+// as "2006-01-02 15:04 MST". An empty input stays empty, and a value that
+// does not parse is handed back unchanged.
+func friendlyExpiry(s string) string {
+	if s == "" {
+		return ""
+	}
+	parsed, err := time.Parse(time.RFC3339, s)
+	if err == nil {
+		return parsed.Local().Format("2006-01-02 15:04 MST")
+	}
+	return s
+}
@@ -0,0 +1,143 @@
+package main
+
+import (
+	"context"
+	"encoding/json"
+	"flag"
+	"fmt"
+	"net/url"
+	"os"
+	"os/signal"
+	"strings"
+	"time"
+)
+
+// runLogs implements "tinyforge logs <app>": it resolves the app, picks one of
+// its containers (see --container), and either prints the last --tail lines
+// or, with -f, follows the log stream until EOF or Ctrl-C.
+//
+// Flags may appear before or after <app>. The flag package stops parsing at
+// the first non-flag argument, so the documented "logs <app> -f" form would
+// otherwise leave -f unparsed and be rejected as extra arguments.
+func runLogs(args []string) error {
+	fs := flag.NewFlagSet("logs", flag.ExitOnError)
+	g := addGlobalFlags(fs)
+	follow := fs.Bool("f", false, "follow the log stream (Ctrl-C to stop)")
+	tail := fs.Int("tail", 200, "number of trailing lines to show (max 5000)")
+	container := fs.String("container", "", "container row id/prefix or role (when an app has several)")
+	fs.Usage = func() {
+		fmt.Fprint(os.Stderr, "Usage: tinyforge logs <app> [-f] [--tail N] [--container CID]\n\nPrint or follow a container's logs.\n")
+		fs.PrintDefaults()
+	}
+
+	// Parse, peel off one positional, and parse the remainder again until
+	// nothing is left, so flags and positionals can be interspersed.
+	var positional []string
+	rest := args
+	for {
+		if err := fs.Parse(rest); err != nil {
+			return err
+		}
+		if fs.NArg() == 0 {
+			break
+		}
+		positional = append(positional, fs.Arg(0))
+		rest = fs.Args()[1:]
+	}
+	if len(positional) != 1 {
+		fs.Usage()
+		return fmt.Errorf("expected exactly one app (name or id)")
+	}
+
+	sess, err := newSession(g)
+	if err != nil {
+		return err
+	}
+
+	// One short deadline covers app resolution, the container listing and the
+	// non-follow log fetch; the follow stream gets its own context below.
+	resolveCtx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
+	defer cancel()
+
+	app, err := resolveApp(resolveCtx, sess.client, positional[0])
+	if err != nil {
+		return err
+	}
+
+	var containers []Container
+	if err := sess.client.doJSON(resolveCtx, "GET", "/api/workloads/"+app.ID+"/containers", nil, &containers); err != nil {
+		return err
+	}
+	target, err := chooseContainer(containers, *container)
+	if err != nil {
+		return err
+	}
+
+	q := url.Values{}
+	q.Set("tail", fmt.Sprintf("%d", *tail))
+	base := "/api/workloads/" + app.ID + "/containers/" + target.ID + "/logs"
+
+	if !*follow {
+		var lines []string
+		if err := sess.client.doJSON(resolveCtx, "GET", base+"?"+q.Encode(), nil, &lines); err != nil {
+			return err
+		}
+		for _, line := range lines {
+			fmt.Println(line)
+		}
+		return nil
+	}
+
+	// Follow: stream until EOF or Ctrl-C.
+	q.Set("follow", "true")
+	ctx, stop := signal.NotifyContext(context.Background(), os.Interrupt)
+	defer stop()
+
+	err = sess.client.streamSSE(ctx, base+"?"+q.Encode(), func(payload []byte) error {
+		var frame struct {
+			Line string `json:"line"`
+		}
+		if json.Unmarshal(payload, &frame) != nil {
+			return nil // ignore frames we can't parse
+		}
+		fmt.Println(frame.Line)
+		return nil
+	})
+	if ctx.Err() != nil { // user interrupted — clean exit
+		return nil
+	}
+	return err
+}
+
+// chooseContainer decides which container to read logs from.
+//
+// With a selector, a container matches when its row id equals the selector,
+// its role equals it case-insensitively, or (for selectors of six or more
+// characters) its row id starts with it; exactly one match is required.
+// Without a selector, the sole container is used, or else the sole running
+// one. Every failure carries a listing of the app's containers.
+func chooseContainer(cs []Container, selector string) (Container, error) {
+	if len(cs) == 0 {
+		return Container{}, fmt.Errorf("app has no containers yet — deploy it first")
+	}
+
+	if selector == "" {
+		if len(cs) == 1 {
+			return cs[0], nil
+		}
+		var onlyRunning Container
+		runningCount := 0
+		for i := range cs {
+			if cs[i].State != "running" {
+				continue
+			}
+			if runningCount == 0 {
+				onlyRunning = cs[i]
+			}
+			runningCount++
+		}
+		if runningCount == 1 {
+			return onlyRunning, nil
+		}
+		return Container{}, fmt.Errorf("app has %d containers; pick one with --container:\n%s", len(cs), containerList(cs))
+	}
+
+	prefixAllowed := len(selector) >= 6
+	var hits []Container
+	for i := range cs {
+		switch {
+		case cs[i].ID == selector,
+			strings.EqualFold(cs[i].Role, selector),
+			prefixAllowed && strings.HasPrefix(cs[i].ID, selector):
+			hits = append(hits, cs[i])
+		}
+	}
+	if len(hits) == 1 {
+		return hits[0], nil
+	}
+	if len(hits) == 0 {
+		return Container{}, fmt.Errorf("no container matching %q\n%s", selector, containerList(cs))
+	}
+	return Container{}, fmt.Errorf("%q matches multiple containers\n%s", selector, containerList(cs))
+}
+
+func containerList(cs []Container) string {
+	var b strings.Builder
+	for _, c := range cs {
+		role := c.Role
+		if role == "" {
+			role = "(default)"
+		}
+		fmt.Fprintf(&b, "  %s  %-12s %s\n", idShort(c.ID), role, c.State)
+	}
+	return strings.TrimRight(b.String(), "\n")
+}
@@ -0,0 +1,95 @@
+// Command tinyforge is a terminal client for a Tinyforge server.
+//
+// It drives the existing HTTP API: log in to obtain a 24h JWT, then list
+// apps, trigger deploys, stream logs, and check status. The token is cached
+// in ~/.tinyforge/config.json (mode 0600) so subsequent commands reuse it.
+//
+// Usage:
+//
+//	tinyforge login [--user U] [--password P]
+//	tinyforge apps [list]
+//	tinyforge deploy <app> [--ref TAG] [--note TEXT]
+//	tinyforge logs <app> [-f] [--tail N] [--container CID]
+//	tinyforge status [<app>]
+//	tinyforge logout
+//	tinyforge version
+//
+// The target server is resolved from --base-url, then $TINYFORGE_URL, then the
+// saved config, then http://localhost:8080.
+package main
+
+import (
+	"fmt"
+	"os"
+)
+
+// version is the CLI build version. Overridable at build time via
+// -ldflags "-X main.version=...".
+var version = "dev"
+
+// main dispatches the first command-line argument to the matching run*
+// function, passing it the remaining arguments. With no command, or an
+// unknown one, it prints usage to stderr and exits with status 2. A command
+// that returns an error has it printed to stderr and exits with status 1.
+func main() {
+	if len(os.Args) < 2 {
+		usage(os.Stderr)
+		os.Exit(2)
+	}
+
+	cmd, args := os.Args[1], os.Args[2:]
+
+	var err error
+	switch cmd {
+	case "login":
+		err = runLogin(args)
+	case "logout":
+		err = runLogout(args)
+	case "apps":
+		err = runApps(args)
+	case "deploy":
+		err = runDeploy(args)
+	case "logs":
+		err = runLogs(args)
+	case "status":
+		err = runStatus(args)
+	case "version", "--version", "-v":
+		fmt.Printf("tinyforge %s\n", version)
+	case "help", "-h", "--help":
+		usage(os.Stdout)
+	default:
+		fmt.Fprintf(os.Stderr, "tinyforge: unknown command %q\n\n", cmd)
+		usage(os.Stderr)
+		os.Exit(2)
+	}
+
+	if err != nil {
+		// Authenticated commands that hit a 401 get a re-login hint; the login
+		// command itself surfaces the server message ("invalid credentials").
+		if cmd != "login" && isAuthError(err) {
+			err = fmt.Errorf("%w — run 'tinyforge login'", err)
+		}
+		fmt.Fprintf(os.Stderr, "tinyforge: %v\n", err)
+		os.Exit(1)
+	}
+}
+
+// usage writes the top-level help text (command list and global flags) to w.
+// Callers pass os.Stdout for an explicit help request and os.Stderr when the
+// invocation was invalid.
+func usage(w *os.File) {
+	fmt.Fprint(w, `tinyforge — terminal client for a Tinyforge server
+
+Usage:
+  tinyforge <command> [flags]
+
+Commands:
+  login              Authenticate and cache a token
+  logout             Revoke the cached token and clear it
+  apps [list]        List your apps (workloads with a source)
+  deploy <app>       Trigger a deploy (waits for completion)
+  logs <app>         Print container logs (use -f to follow)
+  status [<app>]     Show server health, or one app's containers
+  version            Print the CLI version
+
+Global flags (accepted by any command):
+  --base-url URL     Server URL (default $TINYFORGE_URL or http://localhost:8080)
+  --token TOKEN      Auth token (default $TINYFORGE_TOKEN or cached config)
+  --config PATH      Config file (default $TINYFORGE_CONFIG or ~/.tinyforge/config.json)
+
+Run "tinyforge <command> -h" for command-specific flags.
+`)
+}
@@ -0,0 +1,38 @@
+//go:build !windows
+
+package main
+
+import (
+	"bufio"
+	"fmt"
+	"os"
+	"os/exec"
+	"strings"
+)
+
+// promptPassword reads a password from stdin with echo disabled via stty. If
+// stty is unavailable (no tty, missing binary), it falls back to an echoed
+// read so the command still works in pipes/CI.
+//
+// The label is written to stderr. The returned password has its trailing
+// CR/LF removed but is otherwise untrimmed. A read error is reported only
+// when no input at all was read.
+func promptPassword(label string) (string, error) {
+	fmt.Fprint(os.Stderr, label)
+
+	// Echo counts as disabled only when the stty call itself succeeded.
+	echoDisabled := stty("-echo") == nil
+	if echoDisabled {
+		// Restore echo on every return path, including read errors.
+		defer func() {
+			_ = stty("echo")
+			fmt.Fprintln(os.Stderr) // the Enter keystroke was not echoed
+		}()
+	}
+
+	line, err := bufio.NewReader(os.Stdin).ReadString('\n')
+	if err != nil && line == "" {
+		return "", fmt.Errorf("read password: %w", err)
+	}
+	return strings.TrimRight(line, "\r\n"), nil
+}
+
+// stty runs the stty binary with a single argument, wiring this process's
+// stdin to it so the setting applies to the same terminal. It returns the
+// command's run error, if any.
+func stty(arg string) error {
+	sttyCmd := exec.Command("stty", arg)
+	sttyCmd.Stdin = os.Stdin
+	err := sttyCmd.Run()
+	return err
+}
@@ -0,0 +1,45 @@
+//go:build windows
+
+package main
+
+import (
+	"bufio"
+	"fmt"
+	"os"
+	"strings"
+	"syscall"
+	"unsafe"
+)
+
+// enableEchoInput is the Windows console mode bit that echoes typed input.
+const enableEchoInput = 0x0004
+
+// promptPassword reads a password from the console with echo disabled, using
+// kernel32 directly so no third-party dependency is needed. If the console
+// mode cannot be changed (e.g. piped stdin), it falls back to an echoed read.
+//
+// The label is written to stderr. The returned password has its trailing
+// CR/LF removed but is otherwise untrimmed. A read error is reported only
+// when no input at all was read.
+func promptPassword(label string) (string, error) {
+	fmt.Fprint(os.Stderr, label)
+
+	kernel32 := syscall.NewLazyDLL("kernel32.dll")
+	getConsoleMode := kernel32.NewProc("GetConsoleMode")
+	setConsoleMode := kernel32.NewProc("SetConsoleMode")
+	handle := syscall.Handle(os.Stdin.Fd())
+
+	var mode uint32
+	echoDisabled := false
+	// Both calls return non-zero on success. Echo is turned off by clearing
+	// the echo bit from the current mode; the saved mode is restored on return.
+	if r, _, _ := getConsoleMode.Call(uintptr(handle), uintptr(unsafe.Pointer(&mode))); r != 0 {
+		if ret, _, _ := setConsoleMode.Call(uintptr(handle), uintptr(mode&^enableEchoInput)); ret != 0 {
+			echoDisabled = true
+			defer setConsoleMode.Call(uintptr(handle), uintptr(mode))
+		}
+	}
+
+	line, err := bufio.NewReader(os.Stdin).ReadString('\n')
+	if echoDisabled {
+		fmt.Fprintln(os.Stderr) // the Enter keystroke was not echoed
+	}
+	if err != nil && line == "" {
+		return "", fmt.Errorf("read password: %w", err)
+	}
+	return strings.TrimRight(line, "\r\n"), nil
+}
@@ -0,0 +1,122 @@
+package main
+
+import (
+	"context"
+	"flag"
+	"fmt"
+	"os"
+	"text/tabwriter"
+	"time"
+)
+
+func runStatus(args []string) error {
+	fs := flag.NewFlagSet("status", flag.ExitOnError)
+	g := addGlobalFlags(fs)
+	fs.Usage = func() {
+		fmt.Fprint(os.Stderr, "Usage: tinyforge status [<app>]\n\nWith no app: server health and the logged-in user.\nWith an app: that app's containers.\n")
+		fs.PrintDefaults()
+	}
+	if err := fs.Parse(args); err != nil {
+		return err
+	}
+
+	sess, err := newSession(g)
+	if err != nil {
+		return err
+	}
+	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
+	defer cancel()
+
+	if fs.NArg() == 0 {
+		return serverStatus(ctx, sess)
+	}
+	return appStatus(ctx, sess.client, fs.Arg(0))
+}
+
+// serverStatus prints the server URL, the logged-in user (or why the lookup
+// failed), the cached token expiry when one is stored, and the database,
+// Docker and (if reported) proxy connection states from /api/health. Only a
+// failure of the health request itself is returned as an error.
+func serverStatus(ctx context.Context, sess *session) error {
+	fmt.Printf("Server:  %s\n", sess.client.baseURL)
+
+	var me User
+	// A failed /me call is shown inline rather than aborting the command, so
+	// the health section below is still printed.
+	if err := sess.client.doJSON(ctx, "GET", "/api/auth/me", nil, &me); err != nil {
+		fmt.Printf("User:    not logged in (%v)\n", err)
+	} else {
+		fmt.Printf("User:    %s (%s)\n", me.Username, me.Role)
+	}
+	if exp := friendlyExpiry(sess.cfg.ExpiresAt); exp != "" {
+		fmt.Printf("Token:   valid until %s\n", exp)
+	}
+
+	var health map[string]any
+	if err := sess.client.doJSON(ctx, "GET", "/api/health", nil, &health); err != nil {
+		return err
+	}
+	fmt.Printf("DB:      %s\n", connState(health, "database"))
+	docker := connState(health, "docker")
+	if v := nestedString(health, "docker", "version"); v != "" {
+		docker += " (v" + v + ")"
+	}
+	fmt.Printf("Docker:  %s\n", docker)
+	// The proxy line is printed only when the server reports a proxy section.
+	if _, ok := health["proxy"]; ok {
+		fmt.Printf("Proxy:   %s\n", connState(health, "proxy"))
+	}
+	return nil
+}
+
+// appStatus resolves ref to an app, fetches its containers, and prints a
+// header line followed by a table with one row per container. An app without
+// containers gets a "not deployed yet" note instead of a table.
+func appStatus(ctx context.Context, c *Client, ref string) error {
+	app, err := resolveApp(ctx, c, ref)
+	if err != nil {
+		return err
+	}
+
+	var rows []Container
+	containersPath := "/api/workloads/" + app.ID + "/containers"
+	if err := c.doJSON(ctx, "GET", containersPath, nil, &rows); err != nil {
+		return err
+	}
+
+	fmt.Printf("%s  (%s, %s)\n", app.Name, app.SourceKind, idShort(app.ID))
+	if len(rows) == 0 {
+		fmt.Println("No containers — not deployed yet.")
+		return nil
+	}
+
+	table := tabwriter.NewWriter(os.Stdout, 0, 2, 2, ' ', 0)
+	fmt.Fprintln(table, "ROLE\tSTATE\tIMAGE\tPORT\tSUBDOMAIN\tCONTAINER")
+	for _, ctr := range rows {
+		// An empty role and a zero port get placeholder / blank cells.
+		role, port := ctr.Role, ""
+		if role == "" {
+			role = "(default)"
+		}
+		if ctr.Port != 0 {
+			port = fmt.Sprintf("%d", ctr.Port)
+		}
+		fmt.Fprintf(table, "%s\t%s\t%s\t%s\t%s\t%s\n",
+			role, ctr.State, ctr.ImageRef, port, ctr.Subdomain, idShort(ctr.ID))
+	}
+	return table.Flush()
+}
+
+// connState summarizes health[section] as "connected" or "disconnected",
+// adding the section's error string in parentheses when it is disconnected
+// and a non-empty error is present. A section that is missing or not an
+// object is reported as "unknown".
+func connState(health map[string]any, section string) string {
+	sec, isObject := health[section].(map[string]any)
+	if !isObject {
+		return "unknown"
+	}
+	if up, _ := sec["connected"].(bool); up {
+		return "connected"
+	}
+	reason, _ := sec["error"].(string)
+	if reason == "" {
+		return "disconnected"
+	}
+	return "disconnected (" + reason + ")"
+}
+
+// nestedString returns m[section][key] when section is an object and the
+// value under key is a string; in every other case it returns "".
+func nestedString(m map[string]any, section, key string) string {
+	if inner, isObject := m[section].(map[string]any); isObject {
+		if text, isString := inner[key].(string); isString {
+			return text
+		}
+	}
+	return ""
+}
@@ -28,6 +28,7 @@ import (
 	"github.com/alexei/tinyforge/internal/health"
 	"github.com/alexei/tinyforge/internal/logging"
 	"github.com/alexei/tinyforge/internal/logscanner"
+	"github.com/alexei/tinyforge/internal/metricalert"
 	"github.com/alexei/tinyforge/internal/notify"
 	"github.com/alexei/tinyforge/internal/npm"
 	"github.com/alexei/tinyforge/internal/proxy"
@@ -36,6 +37,7 @@ import (
 	"github.com/alexei/tinyforge/internal/stale"
 	"github.com/alexei/tinyforge/internal/stats"
 	"github.com/alexei/tinyforge/internal/store"
+	"github.com/alexei/tinyforge/internal/volsnap"
 	"github.com/alexei/tinyforge/internal/webhook"
 	"github.com/alexei/tinyforge/internal/workload/plugin"

@@ -43,6 +45,7 @@ import (
 	// itself with internal/workload/plugin. Adding a new Source or Trigger
 	// is a matter of dropping a new package and adding it to this list.
 	_ "github.com/alexei/tinyforge/internal/workload/plugin/source/compose"
+	_ "github.com/alexei/tinyforge/internal/workload/plugin/source/dockerfile"
 	_ "github.com/alexei/tinyforge/internal/workload/plugin/source/image"
 	_ "github.com/alexei/tinyforge/internal/workload/plugin/source/static"
 	_ "github.com/alexei/tinyforge/internal/workload/plugin/trigger/git"
@@ -62,6 +65,20 @@ func main() {
 		os.Exit(1)
 	}

+	// Acquire single-instance lockfile BEFORE opening the DB. SQLite +
+	// SetMaxOpenConns(1) does not protect against two Tinyforge processes
+	// sharing a data directory; without this guard a misconfigured
+	// systemd unit, container restart race, or `tinyforge` shell typo can
+	// silently double-fire schedulers, double-poll registries, and
+	// corrupt `extra_json` RMW. The lockfile is a PID file under
+	// $DATA_DIR/tinyforge.lock — collisions with dead PIDs are reclaimed.
+	releaseLock, err := store.AcquireLockfile(dataDir)
+	if err != nil {
+		slog.Error("could not acquire data-dir lock", "data_dir", dataDir, "error", err)
+		os.Exit(1)
+	}
+	defer releaseLock()
+
 	// Open database.
 	dbPath := filepath.Join(dataDir, "tinyforge.db")
 	db, err := store.New(dbPath)
@@ -78,6 +95,21 @@ func main() {
 		os.Exit(1)
 	}

+	// One-shot migration: rewrite every legacy unprefixed-hex secret
+	// in the DB into the new tf1: envelope form. Idempotent (gated by
+	// schema_versions version 2). Lets the rest of the codebase treat
+	// envelope-presence as a stable invariant for future key rotations.
+	// Failures here are logged but non-fatal: a partial migration just
+	// means some columns keep working through Decrypt's legacy
+	// fallback until the next manual save re-encrypts them.
+	if err := db.MigrateSecretsToEnvelope(store.EnvelopeMigrator{
+		HasEnvelope: crypto.HasEnvelope,
+		Decrypt:     func(v string) (string, error) { return crypto.Decrypt(encKey, v) },
+		Encrypt:     func(v string) (string, error) { return crypto.Encrypt(encKey, v) },
+	}); err != nil {
+		slog.Warn("secrets envelope migration", "error", err)
+	}
+
 	// Import seed config on first launch (idempotent).
 	seedPath := envOrDefault("SEED_FILE", "./tinyforge.yaml")
 	if err := config.ImportSeed(db, seedPath); err != nil {
@@ -197,7 +229,8 @@ func main() {
 			switch {
 			case r.Deployed:
 				deployed++
-			case r.Reason == webhook.ReasonBindingDisabled, r.Reason == webhook.ReasonNoMatch:
+			case r.Reason == webhook.ReasonBindingDisabled, r.Reason == webhook.ReasonNoMatch,
+				r.Reason == webhook.ReasonPreviewNoop:
 				// not a failure — silent
 			default:
 				errored++
@@ -291,6 +324,19 @@ func main() {
 	}
 	dep.SetPreDeployBackuper(backupEngine)

+	// Initialize volume-snapshot engine (per-workload data-volume archives).
+	snapshotEngine, err := volsnap.New(db, dataDir)
+	if err != nil {
+		slog.Error("create snapshot engine", "error", err)
+		os.Exit(1)
+	}
+	// Reclaim snapshot files orphaned by workload deletes (rows CASCADE, files don't).
+	if cleaned, err := snapshotEngine.CleanOrphans(); err != nil {
+		slog.Warn("snapshots: clean orphans on startup", "error", err)
+	} else if cleaned > 0 {
+		slog.Info("snapshots: cleaned orphan files on startup", "count", cleaned)
+	}
+
 	// Clean orphaned backup files and prune on startup.
 	if cleaned, err := backupEngine.CleanOrphans(); err != nil {
 		slog.Warn("backup: clean orphans on startup", "error", err)
@@ -359,11 +405,30 @@ func main() {
 	}
 	defer logScanMgr.Stop()

+	// Metric-alert manager: evaluates threshold rules against recent
+	// container stats samples and emits event_log entries on breach.
+	// The store satisfies RuleSource/SampleSource/EventSink; the event
+	// bus is the Publisher.
+	metricAlertMgr := metricalert.New(db, db, db, eventBus)
+	metricAlertMgr.Start()
+	defer metricAlertMgr.Stop()
+
 	// Build API server.
 	apiServer := api.NewServer(db, dockerClient, npmClient, proxyProvider, dep, notifier, webhookHandler, eventBus, encKey)
 	apiServer.SetStaleScanner(staleScanner)
 	apiServer.SetLogScanReloader(logScanMgr)
 	apiServer.SetBackupEngine(backupEngine)
+	apiServer.SetSnapshotEngine(snapshotEngine)
+	// Wire the restore lifecycle seam and reconcile any restore interrupted by a
+	// crash, BEFORE the HTTP server starts serving — so a half-applied restore is
+	// completed/reverted first and the restore endpoint is never reachable
+	// without its safety net.
+	snapshotEngine.SetLifecycle(&restoreLifecycle{dep: dep, docker: dockerClient, store: db})
+	if n, err := snapshotEngine.RecoverInterruptedRestores(); err != nil {
+		slog.Warn("snapshots: recover interrupted restores on startup", "error", err)
+	} else if n > 0 {
+		slog.Info("snapshots: recovered interrupted restores on startup", "count", n)
+	}
 	apiServer.SetDBPath(dbPath)
 	apiServer.SetBackupSettingsChangedCallback(scheduleAutobackup)
 	apiServer.SetDNSProvider(dnsProvider)
@@ -420,6 +485,7 @@ func main() {
 	eventBus.Unsubscribe(notifySub)
 	staleScanner.Stop()
 	statsCollector.Stop()
+	metricAlertMgr.Stop()

 	// Drain in-progress deploys and notifications.
 	dep.Drain()
@@ -0,0 +1,70 @@
+package main
+
+import (
+	"context"
+	"fmt"
+	"log/slog"
+	"time"
+
+	"github.com/alexei/tinyforge/internal/deployer"
+	"github.com/alexei/tinyforge/internal/docker"
+	"github.com/alexei/tinyforge/internal/store"
+	"github.com/alexei/tinyforge/internal/workload/plugin"
+)
+
+// restoreStopTimeoutSeconds is the per-container stop timeout, in seconds, handed
+// to the Docker client during a restore quiesce (the grace window before a kill).
+const restoreStopTimeoutSeconds = 10
+
+// restoreLifecycle adapts the deployer + Docker client + store to the
+// volsnap.Lifecycle seam the volume-snapshot restore flow needs. It lives in the
+// composition root so the volsnap package stays decoupled from deployer/docker.
+type restoreLifecycle struct {
+	dep    *deployer.Deployer // per-workload lock and the locked redeploy
+	docker *docker.Client     // stops containers during the quiesce
+	store  *store.Store       // container rows: listing and state updates
+}
+
+// Lock takes the deployer's per-workload deploy lock so the restore serializes against
+// every deploy entrypoint (C1). It returns LockWorkload's func() — presumably the unlock.
+func (l *restoreLifecycle) Lock(workloadID string) func() { return l.dep.LockWorkload(workloadID) }
+
+// StopContainers stops each stored container row that is "running" and has a
+// container ID (quiesce before the volume swap, C4) and returns the first
+// non-empty ImageTag among them so the redeploy brings the SAME version back up.
+// That tag is the newest one only if ListContainersByWorkload sorts newest-first.
+func (l *restoreLifecycle) StopContainers(ctx context.Context, workloadID string) (string, error) {
+	rows, err := l.store.ListContainersByWorkload(workloadID)
+	if err != nil {
+		return "", fmt.Errorf("list containers: %w", err)
+	}
+	tag := ""
+	for _, c := range rows {
+		if c.State != "running" || c.ContainerID == "" {
+			continue // nothing to stop: not running, or no container ID recorded
+		}
+		if tag == "" && c.ImageTag != "" {
+			tag = c.ImageTag // first running row that carries a tag wins
+		}
+		if err := l.docker.StopContainer(ctx, c.ContainerID, restoreStopTimeoutSeconds); err != nil {
+			return "", fmt.Errorf("stop container %s: %w", c.ContainerID, err) // abort; containers stopped so far stay stopped
+		}
+		if err := l.store.UpdateContainerState(c.ID, "stopped"); err != nil {
+			slog.Warn("restore: mark container stopped", "container", c.ID, "error", err) // best-effort: the stop itself succeeded
+		}
+	}
+	return tag, nil
+}
+
+// Redeploy re-dispatches w via dep.RedeployLocked, which does not take the per-workload
+// lock itself (the restore already holds it). reference is passed as intent.Reference.
+func (l *restoreLifecycle) Redeploy(ctx context.Context, w store.Workload, reference string) error {
+	intent := plugin.DeploymentIntent{
+		Reason:      "restore",
+		Reference:   reference,
+		Metadata:    map[string]string{"note": "redeploy after volume snapshot restore"},
+		TriggeredAt: time.Now().UTC(),
+		TriggeredBy: "restore",
+	}
+	return l.dep.RedeployLocked(ctx, plugin.WorkloadFromStore(w), intent)
+}
@@ -1,7 +1,13 @@
 services:
  tinyforge:
+    # Default: build from source so a fresh clone works out of the box.
    build: .
-    image: tinyforge:latest
+    # Image name doubles as the Gitea registry tag. To DEPLOY the pre-built
+    # image instead of building (e.g. Portainer pulling on a webhook), comment
+    # out `build:` above — compose will then pull this tag. `:latest` is pushed
+    # only for stable (non pre-release) releases, and the registry may require
+    # `docker login git.dolgolyov-family.by` first if the package is private.
+    image: git.dolgolyov-family.by/alexei.dolgolyov/tiny-forge:latest
    container_name: tinyforge
    restart: unless-stopped
    ports:
@@ -31,7 +37,10 @@ services:
    networks:
      - staging-net
    healthcheck:
-      test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:8080/api/auth/login"]
+      # /readyz is the public readiness probe (pings the DB, rate-limited).
+      # The previous target (/api/auth/login) is POST-only, so a GET/spider
+      # request returned 405 and the container was always reported unhealthy.
+      test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:8080/readyz"]
      interval: 30s
      timeout: 5s
      retries: 3
@@ -1,6 +1,6 @@
 # Tinyforge Codemaps — Index

-**Last Updated:** 2026-05-16
+**Last Updated:** 2026-05-16 (added `container-extra-json` policy doc)

 This directory contains architectural maps of key Tinyforge subsystems. Each codemap focuses on one major area: core data types, contract surfaces, integration points, and recipes for extending the system.

@@ -8,6 +8,7 @@ This directory contains architectural maps of key Tinyforge subsystems. Each cod

 - **[Workload Plugin](./workload-plugin.md)** — Source × Trigger plugin contracts; registry lookups; webhook fan-out; how to add new kinds.
 - **[Discovery & Runtime API](./discovery-and-runtime.md)** — `/api/discovery/*` helpers (Git provider probe, repo/branch/tree pickers, image conflicts); `/api/workloads/{id}/runtime-state` + `/storage` + `/stop` + `/start`; SSRF-safe HTTP client in `internal/staticsite`.
+- **[`containers.extra_json` Evolution Policy](./container-extra-json.md)** — Ownership model, reader/writer rules, wholesale-overwrite vs preserve-unknown-keys patterns, concurrency invariants; checklist for adding a new field without breaking older deployers.

 ## Cross-References

@@ -0,0 +1,105 @@
+# `containers.extra_json` — Evolution Policy
+
+**Last Updated:** 2026-05-16
+
+`extra_json` is a TEXT column on the `containers` table that source plugins use to persist source-specific runtime state that hasn't been promoted to a first-class column. It is the single forward-compatibility seam between the canonical container row and per-source needs that arise after a schema is in production.
+
+This doc captures the rules every reader and writer must follow so new sources can extend the blob without breaking older ones.
+
+## Schema position
+
+- Column: `containers.extra_json TEXT NOT NULL DEFAULT '{}'` ([`internal/store/store.go:233`](../../internal/store/store.go#L233)).
+- All four write paths (`CreateContainer`, `UpsertContainer`, `ReconcileContainer`, `UpdateContainer`) normalize `""` → `'{}'` before the SQL exec — readers can assume a non-empty JSON object string and never need to handle SQL `NULL` or the empty-string edge.
+- Defined on the `Container` model: [`internal/store/models.go:342-347`](../../internal/store/models.go#L342-L347).
+
+## Ownership model
+
+**One container row → one owning source.** Sources never write to a row that belongs to another source. In practice:
+
+| Source kind | Row key                                | Number of rows per workload | Writes `extra_json` today?  |
+| ----------- | -------------------------------------- | --------------------------- | --------------------------- |
+| `static`    | deterministic `<workloadID>:site`      | exactly 1                   | yes (preserve-unknown-keys) |
+| `image`     | UUID per deployed container            | 1 + N (blue-green rolls)    | yes (wholesale-overwrite)   |
+| `compose`   | deterministic `<workloadID>:<service>` | N (one per compose service) | no — left at `'{}'` default |
+
+Two sources cannot contend on the same row, so the policy below is concerned with **forward compatibility across versions of the same source**, not cross-source contention. When compose (or any future source) starts writing `extra_json`, the same rules apply.
+
+## Reader rules — ALL readers
+
+1. **Tolerate unknown keys.** Decode into a typed struct using `encoding/json`; Go's default unmarshaller silently drops unknown keys, which is the desired behaviour. Never use `json.Decoder.DisallowUnknownFields()` on `extra_json`.
+2. **Tolerate decode failure as non-fatal where the row's first-class columns are useful.** A corrupted `extra_json` is debug-logged and the reader falls back to zero state — see `workload_runtime.go:118-133` for the canonical pattern. The container's `ContainerID`, `State`, `ProxyRouteID`, etc. live in their own columns and are still trustworthy.
+3. **Tolerate `''` and `'{}'`.** Both are equivalent to "no extras yet". Readers must short-circuit before json.Unmarshal to avoid `unexpected end of JSON input` on the empty case.
+
+## Writer rules — by mutation style
+
+Two distinct write patterns live in the codebase today. Pick the one that matches your source's needs.
+
+### Wholesale-overwrite (image source pattern)
+
+When the writer owns 100% of the blob's shape and discards old contents on every write:
+
+```go
+// internal/workload/plugin/source/image/image.go:341-343
+extra := containerExtra{ProxyRoutes: faceRoutes}
+if b, err := json.Marshal(extra); err == nil {
+    created.ExtraJSON = string(b)
+}
+```
+
+- Cheap and simple.
+- **Loses unknown keys written by future versions of the same source.** Only use when you are certain no other writer (including a future version of this code) needs to round-trip an unknown key.
+- The `containerExtra` struct must be **additive-only**: never rename or remove a field once shipped, and never change its JSON type. Mark new fields with `omitempty` so that readers in an older build (e.g. after a downgrade) don't see surprise nulls.
+
+### Preserve-unknown-keys (static source pattern)
+
+When future versions of the source (or sibling writers) may add fields and the current writer must round-trip them:
+
+```go
+// internal/workload/plugin/source/static/state.go saveState
+//   1. Decode existing blob into map[string]json.RawMessage.
+//   2. Strip every key the current typed-state struct owns
+//      (runtimeStateKeys) so a cleared field actually drops.
+//   3. Apply caller's mutate() to the typed state.
+//   4. Re-marshal typed state, splice its keys back into the
+//      generic map (overwriting any historical sibling).
+//   5. Marshal the merged map back into extra_json.
+```
+
+- Slightly more expensive (two round-trips through `json`).
+- Preserves keys the current writer doesn't know about — required for safe rolling deploys where a newer instance writes a new key, an older instance then reads, mutates, and writes back.
+- Must declare the typed key set explicitly (`runtimeStateKeys`) so step 2 can strip them. This invariant is fenced by `TestRuntimeState_JSONTagsRoundTrip` in [`state_integration_test.go`](../../internal/workload/plugin/source/static/state_integration_test.go).
+
+**Default to preserve-unknown-keys for any new source.** Wholesale-overwrite is acceptable for the image source today because the row's lifetime is short (replaced on every blue-green roll) and only one writer touches it. Sources whose container rows are long-lived (static, future compose-with-stateful-services) should preserve unknown keys.
+
+## Concurrency
+
+`UpsertContainer` is atomic at the SQL layer — SQLite serializes statements through one connection ([`internal/store/store.go:55`](../../internal/store/store.go#L55) `SetMaxOpenConns(1)`) with WAL mode enabled ([`store.go:60`](../../internal/store/store.go#L60)). That guarantees no torn write on a single row, and concurrent readers see a consistent snapshot — they read either the pre- or post-write state, never a half-applied one.
+
+What that does **not** guarantee is atomic read-modify-write across two Go goroutines. The static source serializes its RMW through a per-workload `sync.Mutex` keyed by workload ID (`internal/workload/plugin/source/static/state.go` `lockFor` + `saveState`). Any source that does its own read-modify-write on `extra_json` must do the same — verified in `TestSaveState_ConcurrentWritesDoNotLoseUpdates` (which loses 15+ markers per 20-writer run when the mutex is disabled, as confirmed in commit `ef62a41`).
+
+If a future source is purely wholesale-overwrite from a single writer, no lock is needed.
+
+## What `extra_json` is NOT for
+
+- **Workload-level config.** Workload config goes in `workloads.source_config` and is the operator's surface.
+- **Cross-source state.** If two sources need the same data, promote it to a column.
+- **Anything queryable.** SQLite can JSON-path `extra_json` but no index supports it; readers always pull the column wholesale and parse in Go.
+- **Secrets.** Anything sensitive lives in `workload_env` (per-entry encrypt flag) or another encrypted table.
+
+## Adding a new field — checklist
+
+1. Add the field to your source's typed struct with `omitempty` and a stable `json:"snake_case"` tag.
+2. If you use the **preserve-unknown-keys** pattern, add the JSON key to your `*Keys` slice (the equivalent of `runtimeStateKeys`).
+3. Confirm older readers (older builds of the same source plugin) still parse the blob — `encoding/json` should drop the unknown key silently. Add a regression test if there's any doubt.
+4. Document the new field in this codemap if it's load-bearing for cross-source code (e.g., the proxy_routes map drives `ListProxyRoutes`).
+
+## Pointers
+
+- Container model + `ExtraJSON` comment: [`internal/store/models.go:342-347`](../../internal/store/models.go#L342-L347)
+- Schema declaration: [`internal/store/store.go:233`](../../internal/store/store.go#L233)
+- Store-level normalization (`'{}'` default) across all four write paths: [`internal/store/containers.go:42-43`](../../internal/store/containers.go#L42-L43) (CreateContainer), `:77-78` (UpsertContainer), `:129-130` (ReconcileContainer), `:321-322` (UpdateContainer).
+- Wholesale-overwrite writer + struct: [`image.go:341-343`](../../internal/workload/plugin/source/image/image.go#L341-L343) writes; [`image.go:481-487`](../../internal/workload/plugin/source/image/image.go#L481-L487) defines `containerExtra`; [`image.go:449-456`](../../internal/workload/plugin/source/image/image.go#L449-L456) reads it back in Teardown.
+- Preserve-unknown-keys example + concurrency lock: [`internal/workload/plugin/source/static/state.go`](../../internal/workload/plugin/source/static/state.go).
+- Canonical "decode-and-tolerate" consumer (the only cross-source reader in tree today): [`internal/api/workload_runtime.go:118-133`](../../internal/api/workload_runtime.go#L118-L133) decodes the static-only typed fields and falls back to first-class columns when the blob is empty, missing keys, or malformed.
+
+Note: no cross-source consumer reads `extra_json` in `internal/store/`. The proxy/route data exposed by `ListProxyRoutes` ([`containers.go:196`](../../internal/store/containers.go#L196)) comes from first-class columns (`proxy_route_id`, `subdomain`, `port`); the `proxy_routes` map inside `extra_json` is read only by the image source's own Teardown for cleanup.
@@ -500,13 +500,15 @@ covers the use case — `promote-from` works, the UI shows the relationship.
 Probably can leave the legacy `stages` table dropped entirely once cutover
 proceeds.

-### `Container.extra_json` evolution
+### ~~`Container.extra_json` evolution~~ — DONE (2026-05-16)

-Currently only the image source uses it (per-face proxy route IDs). If
-other sources gain similar needs (compose service health metadata, static
-build SHAs), the schema there should stay versionless and additive — every
-reader must tolerate unknown keys. Document this in the source plugin
-guide alongside the codemap entry.
+Both writer patterns now have an active example in-tree (image source
+clobbers, static source preserves) and the policy is documented in
+[`docs/CODEMAPS/container-extra-json.md`](CODEMAPS/container-extra-json.md):
+ownership model, wholesale-overwrite vs preserve-unknown-keys, reader
+tolerance for unknown keys + decode failure, the per-workload mutex
+requirement for any read-modify-write writer, and a checklist for adding
+a new field without breaking older deployers.

 ## File pointers for the next session

@@ -0,0 +1,77 @@
+# GitOps: config-as-code with `.tinyforge.yml`
+
+A **dockerfile** or **static** workload can read part of its deploy config from a
+`.tinyforge.yml` file in its own repo. Tinyforge fetches the file, shows you how it
+differs from the live config (**drift**), and applies it when you click **Sync** — so the
+repo becomes the source of truth for the declared fields.
+
+This is opt-in per workload and **manual-sync only** in v1: nothing is applied automatically
+on deploy, and a sync never runs without an explicit admin action.
+
+## Enabling it
+
+1. Open the workload (Apps → your app).
+2. In the **GitOps** panel, toggle it on. The default file path is `.tinyforge.yml` at the
+   repo root; change it if your file lives elsewhere (e.g. `deploy/.tinyforge.yml`).
+3. Add a `.tinyforge.yml` to the repo (schema below) and push.
+4. The panel shows the parsed file and any drift vs. the live config. Click **Sync now** to
+   apply the repo's values to the workload.
+
+Only **dockerfile** and **static** sources are eligible — they're the git-backed sources.
+`image` and `compose` workloads don't show the panel.
+
+## `.tinyforge.yml` schema (v1)
+
+```yaml
+version: 1            # required, must be 1
+deploy:
+  # dockerfile only:
+  port: 8080          # container port the app listens on
+  healthcheck: /healthz   # HTTP path probed before a blue-green cutover ("" to disable)
+  # dockerfile + static:
+  deploy_strategy: blue-green   # "" | recreate | blue-green
+```
+
+Notes:
+
+- **Only the fields above are honored.** Unknown keys are rejected with an error (so a typo
+  surfaces instead of being silently ignored).
+- Fields you omit are **left untouched** — the file overlays only what it declares; it never
+  clears the rest of your config.
+- The file is **source-aware**: a `static` workload only honors `deploy_strategy` (a static
+  site has no port/healthcheck); `port`/`healthcheck` in a static site's file are ignored.
+- `deploy_strategy: ""` and `recreate` are equivalent (both are the default for dockerfile
+  and static), so they never show as drift against each other.
+
+## What `.tinyforge.yml` does **not** contain
+
+- **No repo location** (provider / owner / repo / branch) and **no access token** — those
+  stay in Tinyforge's encrypted database. This is deliberate: it keeps credentials out of
+  your repo. (You need the repo coords to find the file in the first place, so they can't
+  live in it.)
+
+## Drift and sync
+
+- **Drift** is computed only over the fields the file declares, after normalization (so a
+  defaulted strategy or a YAML-int vs stored-number difference isn't a false positive).
+- **Sync** fetches the file, merges the declared fields onto a copy of the live config,
+  **validates the merged result** with the source's own rules, and only persists it if it
+  passes — a bad file is rejected as a whole and never leaves a partial config. The sync is
+  recorded to the workload's activity log (not the deploy ledger — it changes config, it
+  isn't a deploy).
+- While GitOps is enabled, the edit form shows a banner noting which fields the repo manages;
+  editing them in the UI works, but the next Sync overwrites them with the repo's values.
+
+## Not in v1 (planned)
+
+These are intentionally out of scope for the first version; the design leaves clean seams
+for them:
+
+- **`env` and `faces` (public subdomains)** — they live in separate stores and (for `env`)
+  would re-introduce a secrets-in-repo risk; deferred to a typed multi-target apply.
+- **Auto-apply on deploy** — applying the repo config automatically on every push. v1 keeps
+  a human in the loop with the drift view + manual Sync. When added, it will read the file
+  at the exact deployed commit (a source-plugin concern), not at dispatch time.
+- **Multi-workload reconcile** — one repo declaring/creating/deleting many workloads
+  (the full Flux/Argo model). v1 is per-workload, config-only, with no create/delete.
+- **`image` / `compose` sources** — not git-backed / overlapping config surface.
@@ -0,0 +1,223 @@
+# Deploy History + One-Click Rollback — Implementation Plan
+
+**Status:** planned (review incorporated) · **Feature rank:** #1 · **Date:** 2026-06-19
+
+## Review findings incorporated (adversarial pass)
+
+- **BLOCKER — never persist the raw deploy error** (it can carry registry-auth bytes /
+  compose stdout — see `compose.go` SECURITY comment + `workloads_plugin.go:198`).
+  `deploy_history.error` only ever gets a **fixed generic marker**
+  (`"deploy failed (see server logs)"`) on failure; the raw error goes to `slog` only.
+  `capDeployStatus(err.Error())` is rejected.
+- **BLOCKER — don't double-count metrics.** `DispatchPlugin` already calls
+  `metrics.DeploysTotal.Inc(...)`; history recording slots into the **existing**
+  outcome block and must not add a second metrics line.
+- **FIX — no runtime-state store getter exists.** static/dockerfile `LastCommitSHA`
+  lives in `containers.extra_json` on a deterministic-ID row
+  (`GetContainerByID(w.ID+":site")` / `+":dockerfile"`, decode `ExtraJSON`). Moot for
+  Phase-1 rollback (image-only) but the resolver must use this, not a fictional getter.
+- **FIX — cascade is distrusted here.** `DeleteWorkload` explicitly deletes containers
+  rather than relying on the FK. Match that: add `DELETE FROM deploy_history WHERE
+  workload_id = ?` inside the `DeleteWorkload` transaction, and make the cascade test a
+  hard gate.
+- **FIX — keep recording off the hot path's tail.** `DispatchPlugin` runs synchronously
+  on the request goroutine; the INSERT is cheap enough to stay inline, but
+  `PruneDeployHistory` runs in a goroutine. Draining-rejected attempts (beginDispatch
+  fail) record nothing — correct, a never-run deploy must not appear as a rollback target.
+- **FIX — pagination:** use `parseLimit(raw, 50, 200)` (not the unclamped
+  `listWorkloadEvents` style); parse `offset` separately, clamp negatives to 0.
+
+
+## Problem
+
+Tinyforge has *failure* rollback (a failed deploy unwinds its own new container —
+[image.go:258](../../internal/workload/plugin/source/image/image.go)), but **no way to
+revert a *successful* deploy to a prior version.** Blue-green's `enforceMaxInstances`
+deletes the old container rows after cutover, so once `v3` replaces `v2` there is no
+record of `v2` and nothing to roll back to. The only "history" is free-text
+`event_log` rows (`"deployed"`) — not structured, not version-pinned, not replayable.
+
+This is the single most-requested capability for any deploy tool, and the plumbing is
+90% there: every deploy flows through one choke point, and the manual-deploy endpoint
+already accepts a `reference` override.
+
+## Key architectural facts (verified against current code)
+
+- **Single dispatch choke point:** `Deployer.DispatchPlugin(ctx, w, intent)` in
+  [internal/deployer/dispatch.go](../../internal/deployer/dispatch.go) routes *every*
+  source kind and already computes a success/failure `outcome`. This is where history
+  is recorded.
+- **`intent.Reference` is the version handle:** image source resolves
+  `tag := intent.Reference` (falling back to `DefaultTag`/`latest`). The manual deploy
+  endpoint ([workloads_plugin.go](../../internal/api/workloads_plugin.go)) already accepts
+  `{reference, note}` and builds a `manual` intent. **Rollback = deploy with a pinned
+  reference + a distinct reason.**
+- **Effective vs requested reference:** for a *manual* image deploy `intent.Reference`
+  is often `""` (means `DefaultTag`). The *effective* deployed tag is written onto the
+  freshest container row (`store.Container.ImageTag`). For static/dockerfile the
+  effective version is `runtime_state.LastCommitSHA`, resolved inside the source.
+- **Built-from-source sources don't honor a SHA reference on Deploy** — static and
+  dockerfile clone `cfg.Branch` HEAD and capture `latestSHA`; they cannot yet check out
+  an arbitrary commit. So **SHA-pinned rollback for them needs a source change (later
+  phase).** Image-tag rollback works today.
+- **Migration pattern:** additive statements in `runMigrations()` /
+  `workloadTables` in [store.go](../../internal/store/store.go); workload-scoped tables
+  use `REFERENCES workloads(id) ON DELETE CASCADE`. Per-table CRUD lives in its own
+  `internal/store/<table>.go`, model in `models.go`.
+- **Idempotency note:** the image source's same-tag short-circuit returns *before* it
+  arms its `EmitDeployEvent` defer, so a no-op deploy emits no timeline event. History
+  recorded at `DispatchPlugin` will still log it as a `success` attempt — acceptable
+  (history = ledger of attempts), but called out so the divergence is intentional.
+
+## Scope
+
+### Phase 1 (this plan)
+1. Persistent, structured **deploy-history ledger** for **all** source kinds (success
+   *and* failure) — powers an audit timeline and the rollback action.
+2. **One-click rollback** for the **image** source (redeploy a pinned tag).
+3. Read-only history panel on `/apps/[id]`; rollback button shown only for entries that
+   are `success` + have a non-empty reference + a rollback-capable source kind.
+
+### Explicitly out of scope (future phases, table already supports them)
+- SHA-pinned rebuild rollback for static/dockerfile (needs source checkout-by-commit).
+- Config-snapshot rollback for compose (no artifact reference).
+- Promotion (dev→staging→prod) — separate feature, will reuse this ledger.
+
+## Data model
+
+New table `deploy_history` (added to `workloadTables` in `runMigrations`):
+
+```sql
+CREATE TABLE IF NOT EXISTS deploy_history (
+    id            INTEGER PRIMARY KEY AUTOINCREMENT,
+    workload_id   TEXT NOT NULL REFERENCES workloads(id) ON DELETE CASCADE,
+    source_kind   TEXT NOT NULL DEFAULT '',
+    reference     TEXT NOT NULL DEFAULT '',   -- effective artifact: image tag | commit sha | ''
+    reason        TEXT NOT NULL DEFAULT '',   -- manual|registry-push|git-push|cron|rollback|promote
+    triggered_by  TEXT NOT NULL DEFAULT '',
+    note          TEXT NOT NULL DEFAULT '',
+    outcome       TEXT NOT NULL DEFAULT '',   -- success | failure
+    error         TEXT NOT NULL DEFAULT '',   -- fixed generic marker on failure; never the raw error
+    started_at    TEXT NOT NULL DEFAULT '',
+    finished_at   TEXT NOT NULL DEFAULT ''
+);
+CREATE INDEX IF NOT EXISTS idx_deploy_history_workload
+    ON deploy_history(workload_id, id DESC);
+```
+
+**Why a dedicated table (not `event_log`):** structured + queryable, version-pinned,
+carries the replayable `reference`, and its retention is independent of the human event
+feed. `event_log` stays the free-text timeline; `deploy_history` is the version ledger.
+
+Go model in `models.go` (`DeployHistoryEntry`, mirrors `MetricAlertRule` style).
+
+## Backend changes
+
+### 1. Store — `internal/store/deploy_history.go` (new) + `models.go` + `store.go`
+- `DeployHistoryEntry` struct.
+- `InsertDeployHistory(e DeployHistoryEntry) (DeployHistoryEntry, error)`.
+- `ListDeployHistory(workloadID string, limit, offset int) ([]DeployHistoryEntry, error)`
+  — ordered `id DESC`; default/clamped limit (e.g. 50, max 200) via existing `parseLimit`
+  conventions at the API layer.
+- `GetDeployHistory(id int64) (DeployHistoryEntry, error)` — for rollback lookup;
+  `ErrNotFound` on miss.
+- `PruneDeployHistory(workloadID string, keep int) error` — keep newest `keep` per
+  workload (mirror the stats-prune pattern). Called best-effort after insert.
+- Migration: append `CREATE TABLE` + index to `workloadTables`.
+- Table test `deploy_history_test.go` (insert/list/get/prune, cascade-on-workload-delete).
+
+### 2. Deployer — record at the choke point (`internal/deployer/dispatch.go`)
+Wrap the existing `src.Deploy(...)` call:
+```go
+started := store.Now()
+err = src.Deploy(ctx, d.PluginDeps(), w, intent)
+outcome := "success"; if err != nil { outcome = "failure" }
+metrics.DeploysTotal.Inc(w.SourceKind, outcome)
+d.recordDeployHistory(w, intent, outcome, err, started) // best-effort, never blocks
+return err
+```
+- `recordDeployHistory` resolves the **effective reference** and inserts a row.
+  Best-effort: a store failure is logged, never propagated (same contract as
+  `maybeBackupBeforeDeploy` and `EmitDeployEvent`).
+- **Effective-reference resolver** (`internal/deployer/deploy_ref.go`, unit-tested):
+  1. start from `intent.Reference`;
+  2. `image`: read newest `ListContainersByWorkload(w.ID)` row (by `CreatedAt`), prefer
+     its `ImageTag` when non-empty — captures the `DefaultTag`/`latest` resolution;
+  3. `static`/`dockerfile`: when still empty, decode `LastCommitSHA` from `ExtraJSON` of
+     `GetContainerByID(w.ID+":site")` / `+":dockerfile"` (no runtime-state getter exists);
+  4. `compose`/unknown: leave as-is (may be `""`).
+- **Error sanitization:** reuse the `capDeployStatus` cap (256 runes) idea — store a
+  short, secret-free `error`. The raw error keeps going to `slog` only. (The deploy
+  error already carries a generic client message; the wrapped detail must not be
+  persisted verbatim because it can echo registry-auth / compose-stdout bytes — same
+  caller contract documented on `EmitDeployEvent`.)
+- Recording does **not** run for `DispatchReconcile` (periodic, not a deploy) or
+  `DispatchTeardown`.
+
+### 3. API — `internal/api/deploy_history.go` (new) + `router.go`
+- `GET /api/workloads/{id}/deploys?limit=&offset=` → `listWorkloadDeploys` (read; any
+  authenticated user — mirrors `listWorkloadEvents`). Uses `parseLimit`.
+- `POST /api/workloads/{id}/rollback` → `rollbackWorkload` (`auth.AdminOnly`), body
+  `{deploy_id}`:
+  1. load workload (404 if missing; 400 if `source_kind == ""`);
+  2. `GetDeployHistory(deploy_id)`; 404 if missing, 400 if its `workload_id` ≠ path id
+     (no cross-workload replay);
+  3. guard: `outcome == "success"`, `reference != ""`, and `source_kind` is
+     rollback-capable (`image` in Phase 1) → else 400 with a clear message;
+  4. build `manual`-shaped intent `{Reason: "rollback", Reference: row.reference,
+     Metadata: {"note": "rollback to " + row.reference, "rollback_of": <id>},
+     TriggeredBy: actor}`;
+  5. `deployer.DispatchPlugin(...)`; 202 on accept (same shape as deploy).
+- Register both routes inside the existing `r.Route("/workloads/{id}", …)` block in
+  [router.go](../../internal/api/router.go), next to `/deploy` and `/events`.
+- A `RollbackCapable(sourceKind) bool` helper (single source of truth, shared with the
+  list response so the frontend can render the button state without hardcoding kinds).
+- The list response includes a per-entry `rollbackable bool` computed server-side.
+
+## Frontend changes (`web/`)
+
+- **`DeployHistoryPanel.svelte`** (new, in `lib/components/`): table of entries —
+  short reference, reason badge, `outcome` `StatusBadge` (ok/bad), `triggered_by`,
+  relative time. For `rollbackable` rows a **Roll back** button → `ConfirmDialog`
+  ("Roll back <name> to <reference>?") → `POST …/rollback {deploy_id}` → `Toast` +
+  refresh history and container state. Loading via `Skeleton`; `EmptyState` when no
+  rows. Reuses existing components only.
+- Mount the panel on **`/apps/[id]`** alongside the activity timeline (it is the
+  *structured, actionable* sibling of the free-text timeline).
+- **i18n:** add keys under a `deployHistory.*` namespace to **both**
+  `web/src/lib/i18n/en.json` and `ru.json` (parity is mandatory, but a mismatch is
+  not a build error — verify manually per CLAUDE.md).
+- API client: add `listDeploys(id, params)` and `rollback(id, deployId)` to the existing
+  workload API module.
+
+## Testing
+
+- **Store:** `deploy_history_test.go` — insert/list ordering, get, prune-keeps-newest,
+  cascade delete with workload.
+- **Deployer:** extend `deployer` tests — `DispatchPlugin` writes one `success` row and
+  one `failure` row (with sanitized error); reconcile/teardown write none. Resolver unit
+  test (`deploy_ref_test.go`) for the image read-back + empty fallbacks.
+- **API:** rollback guards — cross-workload id → 400; non-success/empty-ref/
+  non-image → 400; happy path → 202 and a `rollback`-reason history row appears.
+- **Web:** keep it light (the panel is mostly presentational); a `sourceForms`-style
+  pure-logic unit only if a non-trivial helper emerges.
+- Gates: `go build ./...`, `go vet ./internal/...`, `go test ./internal/...`,
+  `cd web && npm run check && npm run test`, then `./scripts/dev-server.sh`.
+
+## Risks / mitigations
+
+- **Recording must never break a deploy** → best-effort insert, errors only logged
+  (matches existing `EmitDeployEvent` / pre-deploy-backup contracts).
+- **Secret leakage via `error`** → store only a capped, generic reason; raw error to
+  `slog` only.
+- **Unbounded growth** → `PruneDeployHistory` keeps newest N per workload.
+- **Rollback to a vanished image tag** → the image source's `PullImage` fails and its
+  own failure-rollback leaves the live container untouched; the rollback attempt is
+  recorded as `failure`. No special handling needed.
+- **No-op rollback (target already running, `MaxInstances>1`)** → image short-circuit
+  returns `nil`; recorded as `success`. Acceptable.
+
+## Rollout
+
+Single PR. Additive migration (no destructive DDL). No settings changes. Backward
+compatible: existing workloads simply start accumulating history on their next deploy.
@@ -0,0 +1,98 @@
+# Configurable Deploy Strategy — Implementation Plan
+
+**Status:** planned (workflow-designed + adversarially reviewed) · **Feature rank:** #3 · **Date:** 2026-06-19
+
+## Problem
+
+`image` does zero-downtime blue-green; `dockerfile` and `static` **stop+remove the old
+container before creating the new one** on every redeploy (a real downtime window).
+`compose` is stack-managed. Give operators a per-workload **deploy strategy** and bring
+blue-green to the built-from-source sources.
+
+## Design (chosen via a 3-proposal judge panel; "minimal" won, 9/10)
+
+Per-source `deploy_strategy` field **inside each source's `SourceConfig` JSON blob** —
+**no new DB column, no migration, no `dispatch.go` change**. Values: `""` (back-compat
+default), `"recreate"`, `"blue-green"`. Round-trips opaquely through
+`plugin.WorkloadFromStore` / `SourceConfigOf[Config]`; validated in each source's existing
+`Validate(json.RawMessage)` (runs on create **and** update at `workloads_plugin.go:291`).
+
+**Per-source default (load-bearing):** a single shared default would silently flip
+image's native blue-green to recreate, so each source has a tiny `effectiveStrategy`:
+- `image`: `""` → **blue-green**
+- `dockerfile` / `static` / `compose`: `""` → **recreate**
+
+The blue-green branch for dockerfile/static uses a **transient two-container / single-row
+swap** so `state.go`, `teardown.go`, and `reconcile.go` (which read one deterministic row)
+stay **untouched** — the lowest-risk way to ship gap-free cutover.
+
+## Review fixes folded in (adversarial pass)
+
+1. **BLOCKER — ordering / crash-safety.** Blue-green order MUST be: create+start green →
+   readiness-gate green → `ConfigureRoute(green)` (upsert) → **`saveState(green)` into the
+   single row FIRST** → only THEN stop+remove blue (captured before saveState). The single
+   row must always point at a running container; reaping blue before persisting green
+   orphans green and makes the reconciler flip a healthy workload to `failed`.
+2. **Unique green name is load-bearing.** dockerfile/static names are deterministic
+   (`tf-build-<name>-<id>` / `dw-site-<name>-<id>`) and double as the proxy `forwardHost`.
+   The green container needs a genuinely unique name (`…-<ms-hex>`, lifted from
+   `image.buildContainerName`) set in **both** `cc.Name` **and** the `ConfigureRoute`
+   `forwardHost`.
+3. **Readiness, not liveness.** Before cutover, use
+   `deps.Health.Check(ctx, http://<green>:<port><healthcheck>)` when a healthcheck path
+   is configured (dockerfile has `Healthcheck`); fall back to the existing 3s liveness
+   gate otherwise. Don't advertise "zero-downtime" on the liveness-only path.
+4. **Pure upsert.** Drop the pre-`DeleteRoute`; call only `ConfigureRoute`, which
+   upserts by FQDN (for NPM this repoints the route in place; Traefik is label-driven).
+   **Traefik caveat:** blue+green briefly carry the same host-rule labels → momentary
+   dual-serve; documented as a Traefik-only phase-1 limitation (NPM, the common case,
+   is gap-free).
+5. **deno + storage → force recreate.** When `static` has `StorageEnabled && mode==deno`,
+   `effectiveStrategy` forces `recreate` — blue-green would mount the same RW named volume
+   into both containers (a concurrent-writer window recreate never had).
+6. **image `recreate` gets its own shape.** Don't reuse `rollbackNew` (assumes blue
+   survives). image `recreate` = reap existing running containers **after** a successful
+   pull, then create green; on green failure the downtime is the accepted recreate
+   contract (logged distinctly, not as a non-disruptive rollback).
+7. Image tag `:latest` shared by blue/green is **safe** — containers pin image-by-id at
+   create (no fix needed).
+
+## Files (phase 1, backend-only)
+
+- **NEW** `internal/workload/plugin/strategy.go` — `StrategyRecreate`/`StrategyBlueGreen`
+  consts, `ValidateStrategy(value string, allowBlueGreen bool) error`,
+  `BuildGreenName(name, id string, ts time.Time) string` (lifted unique-suffix scheme).
+  `+ strategy_test.go`.
+- `image/image.go` — `DeployStrategy` on Config; `effectiveStrategy` (""→blue-green);
+  Validate; honor `recreate` (reap-after-pull + dedicated log).
+- `dockerfile/dockerfile.go` (Config + Validate) + `dockerfile/deploy.go` (blue-green
+  branch, fixes 1–4) + `dockerfile/deploy_test.go`.
+- `static/static.go` (Config + Validate) + `static/deploy.go` (blue-green branch + deno
+  gate, fixes 1–5) + `static/deploy_test.go`.
+- `compose/compose.go` — Config field + Validate rejects `blue-green` (allowBlueGreen=false)
+  + test.
+
+## Phase 1 backward-compat lock (mandatory, unit-tested)
+`ValidateStrategy("", …)` returns nil; every `effectiveStrategy("")` returns the source's
+historical default. Existing rows (no `deploy_strategy` key) decode `""` → today's exact
+behavior, byte-for-byte.
+
+## Later phases (deferred)
+- **P2 (UI):** `sourceForms.ts` seed/serialize + `/apps/new` & `/apps/[id]` select +
+  en/ru i18n (hide blue-green for compose).
+- **P3 (harden):** mandatory HTTP readiness probe for static; connection draining before
+  blue removal; Traefik label suppression at cutover.
+- **P4 (architecture):** extract image's proven sequence into a shared
+  `plugin.DeploySingleContainer`; migrate dockerfile/static to the multi-row model
+  (crash-safe mid-swap; unlocks `MaxInstances>1`).
+- **P5:** true `rolling` (needs a backend-pool primitive on `proxy.Provider`) + compose
+  green-project blue-green.
+
+## Test plan
+Table-driven, TDD: `ValidateStrategy` accept/reject matrix (incl. `allowBlueGreen=false`,
+reserved `rolling` rejected, `""` accepted); per-source `effectiveStrategy` defaults +
+deno-storage→recreate; dockerfile/static blue-green deploy tests asserting (a) green named
+≠ deterministic name, (b) collision teardown NOT run, (c) `ConfigureRoute` called with
+`forwardHost==green` and NO preceding `DeleteRoute`, (d) `saveState(green)` **before**
+`RemoveContainer(blue)`, (e) single row ends at green; failure path: green fails gate →
+green removed, blue + route untouched; compose rejects blue-green. Gates: `go build`,
+`go vet`, `go test ./internal/...`, `npm run check/test`, `./scripts/dev-server.sh`.
@@ -0,0 +1,84 @@
+# Per-Workload Metrics Graph — Implementation Plan
+
+**Status:** planned · **Feature rank:** #2 · **Date:** 2026-06-19
+
+## Problem
+
+Stats are collected per container (`container_stats_samples`, CPU/mem/net/disk) and
+charted **globally** on the dashboard (`SystemResourcesCard` + `ResourceChart`), but
+`/apps/[id]` shows only live snapshots — there's no per-workload "is my app leaking
+memory / pegging CPU over the last few hours" view. This is a daily question and the
+data already exists; we just need a per-workload query + a panel that reuses the chart.
+
+## Verified facts
+
+- `ContainerStatsSample.OwnerID` == the **container row id** (`containers.id`), confirmed
+  by `lookupInstanceName` → `GetContainerByID(sm.OwnerID)` in
+  [stats_history.go](../../internal/api/stats_history.go). `OwnerType` ∈ {instance, site}.
+- Each sample's `ts` is that container's own Docker-stats `Timestamp.Unix()`
+  ([collector.go](../../internal/stats/collector.go)) — NOT one shared tick stamp. In a
+  multi-container tick the per-second truncation usually collapses them to the same
+  integer `ts`, so per-`ts` aggregation works; a ±1s split at a second boundary is
+  cosmetic for a trend line. (Reviewer-corrected.) The handler 404s on an unknown
+  workload id but returns `[]` for a known workload with no samples yet.
+- `ResourceChart.svelte` takes a fully-built `EChartsOption` from the parent; the parent
+  owns series/axes (see `SystemResourcesCard`). Reads stay available when Docker is down
+  (samples come from SQLite, not the daemon).
+- Per-workload reads (`/events`, `/runtime-state`) are open to any authenticated user;
+  this endpoint follows suit (no `AdminOnly`).
+
+## Backend
+
+1. **Store** — `ListContainerStatsSamplesByWorkload(workloadID string, sinceTS int64)`:
+   ```sql
+   SELECT cs.container_id, cs.owner_type, cs.owner_id, cs.ts,
+          cs.cpu_percent, cs.memory_usage, cs.memory_limit,
+          cs.network_rx, cs.network_tx, cs.block_read, cs.block_write
+   FROM container_stats_samples cs
+   JOIN containers c ON c.id = cs.owner_id
+   WHERE c.workload_id = ? AND cs.ts >= ?
+   ORDER BY cs.ts ASC
+   ```
+   Returns `[]ContainerStatsSample`.
+
+2. **API** — `getWorkloadStatsHistory` (GET `/api/workloads/{id}/stats/history?window=`):
+   reuse `parseWindow`/`sinceTimestamp`; aggregate samples **per ts** into a compact
+   series so multi-container workloads (compose) sum correctly:
+   ```go
+   type workloadStatsPoint struct {
+       TS          int64   `json:"ts"`
+       CPUPercent  float64 `json:"cpu_percent"`   // sum across the workload's containers
+       MemoryUsage int64   `json:"memory_usage"`  // sum bytes
+       MemoryLimit int64   `json:"memory_limit"`  // max (effective ceiling)
+   }
+   ```
+   Always returns `[]` (never 503) — empty when stats are disabled / Docker was down /
+   the workload is new. Register in the `/workloads/{id}` route block.
+
+3. **Tests** — store: join scopes to the right workload (A's samples ≠ B's); API:
+   per-ts aggregation sums two containers at the same tick.
+
+## Frontend
+
+4. **api.ts** — `WorkloadStatsPoint` type + `fetchWorkloadStatsHistory(id, window, signal)`.
+5. **`WorkloadMetricsPanel.svelte`** — window selector (30m / 2h / 6h), fetch + 15s poll
+   (mirror `SystemResourcesCard`), build an `EChartsOption` with **two series**: CPU %
+   on the left axis, Memory (MiB) on the right axis (absolute bytes, because
+   `memory_limit` is often 0/unlimited so a % would divide by zero). Show an
+   `EmptyState`/hint when there are no samples. Render via `ResourceChart`. Mount on `/apps/[id]` near the
+   deploy-history panel.
+6. **i18n** — `apps.detail.metrics.*` in both en.json and ru.json (parity mandatory).
+
+## Risks / mitigations
+
+- **Docker down / stats disabled** → empty series, friendly hint (no error). SQLite read
+  path is independent of the daemon.
+- **memory_limit = 0 (unlimited)** → plot absolute MiB, not %, to avoid div-by-zero.
+- **Sparse sampling** → chart shows whatever ticks exist; window selector lets the user
+  widen. No interpolation.
+- **Auth** → read-only, any authenticated user (consistent with other per-workload reads).
+
+## Rollout
+
+Single change set, additive, no migration. Reuses the existing `echarts` dependency and
+`ResourceChart` component.
@@ -10,8 +10,11 @@ require (
 	github.com/moby/moby/api v1.54.0
 	github.com/moby/moby/client v0.3.0
 	github.com/robfig/cron/v3 v3.0.1
+	github.com/yuin/goldmark v1.8.2
 	golang.org/x/crypto v0.28.0
 	golang.org/x/oauth2 v0.25.0
+	golang.org/x/sync v0.20.0
+	golang.org/x/sys v0.33.0
 	gopkg.in/yaml.v3 v3.0.1
 	modernc.org/sqlite v1.34.5
 )
@@ -34,15 +37,12 @@ require (
 	github.com/opencontainers/go-digest v1.0.0 // indirect
 	github.com/opencontainers/image-spec v1.1.1 // indirect
 	github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec // indirect
-	github.com/yuin/goldmark v1.8.2 // indirect
 	go.opentelemetry.io/auto/sdk v1.1.0 // indirect
 	go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.60.0 // indirect
 	go.opentelemetry.io/otel v1.35.0 // indirect
 	go.opentelemetry.io/otel/metric v1.35.0 // indirect
 	go.opentelemetry.io/otel/trace v1.35.0 // indirect
 	golang.org/x/mod v0.18.0 // indirect
-	golang.org/x/sync v0.20.0 // indirect
-	golang.org/x/sys v0.33.0 // indirect
 	golang.org/x/tools v0.22.0 // indirect
 	modernc.org/libc v1.55.3 // indirect
 	modernc.org/mathutil v1.6.0 // indirect
@@ -85,8 +85,6 @@ golang.org/x/mod v0.18.0 h1:5+9lSbEzPSdWkH32vYPBwEpX8KwDbM52Ud9xBUvNlb0=
 golang.org/x/mod v0.18.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c=
 golang.org/x/oauth2 v0.25.0 h1:CY4y7XT9v0cRI9oupztF8AgiIu99L/ksR/Xp/6jrZ70=
 golang.org/x/oauth2 v0.25.0/go.mod h1:XYTD2NtWslqkgxebSiOHnXEap4TF09sJSc7H1sXbhtI=
-golang.org/x/sync v0.7.0 h1:YsImfSBoP9QPYL0xyKJPq0gcaJdG3rInoqxTWbfQu9M=
-golang.org/x/sync v0.7.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
 golang.org/x/sync v0.20.0 h1:e0PTpb7pjO8GAtTs2dQ6jYa5BWYlMuX047Dco/pItO4=
 golang.org/x/sync v0.20.0/go.mod h1:9xrNwdLfx4jkKbNva9FpL6vEN7evnE43NNNJQ2LF3+0=
 golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
@@ -16,13 +16,12 @@ import (
 )

 // rateLimitedLogin wraps the login handler with per-IP rate limiting.
+// Uses clientIP() so X-Forwarded-For is honored only when the request
+// arrives from a configured trusted-proxy CIDR — preventing remote
+// attackers from spoofing the header to bypass the per-IP login limiter.
 func (s *Server) rateLimitedLogin(rl *rateLimiter) http.HandlerFunc {
 	return func(w http.ResponseWriter, r *http.Request) {
-		ip := r.RemoteAddr
-		if fwd := r.Header.Get("X-Forwarded-For"); fwd != "" {
-			ip = fwd
-		}
-		if !rl.allow(ip) {
+		if !rl.allow(clientIP(r)) {
 			respondError(w, http.StatusTooManyRequests, "too many login attempts, try again later")
 			return
 		}
@@ -1,7 +1,6 @@
 package api

 import (
-	"io"
 	"log/slog"
 	"net/http"
 	"os"
@@ -118,7 +117,22 @@ func (s *Server) deleteBackup(w http.ResponseWriter, r *http.Request) {
 }

 // restoreBackup handles POST /api/backups/{id}/restore.
-// This replaces the current database with the backup and triggers a graceful shutdown.
+//
+// Restore happens in three documented stages so a failure at any stage
+// leaves the live DB intact:
+//
+//  1. PRE-FLIGHT (sync, before the HTTP response): PrepareRestore opens
+//     the candidate read-only and runs `PRAGMA integrity_check`. If it
+//     fails the live DB is untouched and we return 400 with the reason.
+//
+//  2. SAFETY NET: a pre-restore backup of the LIVE DB is created so the
+//     operator can roll back even if the candidate is later discovered
+//     to be missing data.
+//
+//  3. SWAP (async, after the response is flushed): close the live DB,
+//     atomic-rename the candidate over the live path, wipe WAL/SHM,
+//     trigger graceful shutdown. supervisord / systemd / docker
+//     restart=on-failure brings the process back with the new DB.
 func (s *Server) restoreBackup(w http.ResponseWriter, r *http.Request) {
 	if s.backupEngine == nil {
 		respondError(w, http.StatusServiceUnavailable, "backup engine not initialized")
@@ -126,13 +140,44 @@ func (s *Server) restoreBackup(w http.ResponseWriter, r *http.Request) {
 	}

 	id := chi.URLParam(r, "id")
-	restorePath, err := s.backupEngine.RestorePath(id)
-	if err != nil {
-		respondError(w, http.StatusNotFound, "backup not found: "+err.Error())
+
+	// CSRF / accidental-fire guard: the restore endpoint is the most
+	// destructive surface in the API (replaces the whole DB). Even
+	// though it sits behind AdminOnly + Bearer JWT, a blind cross-site
+	// POST or a misclicked button in any open admin tab can fire it.
+	// Require the operator's client to echo X-Confirm-Restore: <id>
+	// — matching the path param — so a CSRF post-form / image-src
+	// trick can't trigger restore (browsers don't let cross-origin
+	// requests set custom headers without a preflight).
+	if confirm := r.Header.Get("X-Confirm-Restore"); confirm != id {
+		respondError(w, http.StatusBadRequest,
+			"missing or mismatched X-Confirm-Restore header (must equal backup id)")
 		return
 	}

-	// Create a safety backup before restore so the user can undo if needed.
+	// Single-flight guard: a rapid double-click would otherwise spawn
+	// two goroutines racing s.store.Close() and the candidate-over-
+	// live rename. CAS to true here; if someone else won, return 409.
+	if !s.restoreInFlight.CompareAndSwap(false, true) {
+		respondError(w, http.StatusConflict, "a restore is already in progress")
+		return
+	}
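+	// Do NOT release the flag on the swap path — it triggers shutdown, and
+	// a failed swap is also terminal (the DB may be closed); a fresh
+	// process boot is the recovery path.
+	// NOTE(review): the PRE-FLIGHT failure return below also leaves the
+	// flag set while the process keeps serving, so every later restore
+	// request gets 409 until a restart — consider resetting the flag on
+	// that path.
+	//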
+	// PRE-FLIGHT: refuse before touching anything if the candidate is
+	// not a valid SQLite database or fails integrity_check. This is the
+	// guard the prior code lacked — a corrupt backup would silently
+	// overwrite a healthy live DB.
+	restorePath, err := s.backupEngine.PrepareRestore(id)
+	if err != nil {
+		respondError(w, http.StatusBadRequest, err.Error())
+		return
+	}
+
+	// SAFETY NET: pre-restore snapshot of the live DB. A failure here
+	// is logged but does not abort — the integrity-checked candidate
+	// is still safer than refusing to restore.
 	if _, err := s.backupEngine.CreateBackup("pre-restore"); err != nil {
 		slog.Warn("failed to create pre-restore backup", "error", err)
 	}
@@ -153,41 +198,37 @@ func (s *Server) restoreBackup(w http.ResponseWriter, r *http.Request) {
 	go func() {
 		time.Sleep(500 * time.Millisecond)

-		// Close the current database to release locks.
+		// Once we begin closing the live DB the process can no longer serve
+		// requests against a sane store, so EVERY exit path from here must
+		// trigger shutdown. Returning early would leave the server limping
+		// on a closed/half-swapped database with no path to recovery except
+		// an external kill. shutdownFunc → graceful shutdown → main returns
+		// → deferred releaseLock()/db.Close() run, and the supervisor reopens
+		// whatever DB is on disk on the next boot.
+		triggerShutdown := func() {
+			if s.shutdownFunc != nil {
+				s.shutdownFunc()
+			}
+		}
+
+		// Close the current database to release locks. AtomicReplaceDB
+		// expects the live file to be unmapped before swap (especially
+		// important on Windows where open files cannot be renamed over).
 		if err := s.store.Close(); err != nil {
-			slog.Error("restore: failed to close database", "error", err)
+			slog.Error("restore: failed to close database, restarting", "error", err)
+			triggerShutdown()
 			return
 		}

-		// Copy the backup file over the main database using streaming (no full read into memory).
-		src, err := os.Open(restorePath)
-		if err != nil {
-			slog.Error("restore: failed to open backup file", "error", err)
+		if err := s.backupEngine.AtomicReplaceDB(restorePath, s.dbPath); err != nil {
+			slog.Error("restore: atomic replace failed, restarting", "error", err)
+			triggerShutdown()
 			return
 		}
-		defer src.Close()
-
-		dst, err := os.Create(s.dbPath)
-		if err != nil {
-			slog.Error("restore: failed to create database file", "error", err)
-			return
-		}
-		defer dst.Close()
-
-		if _, err := io.Copy(dst, src); err != nil {
-			slog.Error("restore: failed to copy backup to database", "error", err)
-			return
-		}
-
-		// Remove WAL and SHM files to ensure clean state.
-		os.Remove(s.dbPath + "-wal")
-		os.Remove(s.dbPath + "-shm")

 		slog.Info("restore: database replaced, triggering shutdown")

 		// Signal the server to shut down gracefully so it can be restarted.
-		if s.shutdownFunc != nil {
-			s.shutdownFunc()
-		}
+		triggerShutdown()
 	}()
 }
@@ -0,0 +1,151 @@
+package api
+
+import (
+	"errors"
+	"log/slog"
+	"net/http"
+	"strconv"
+	"time"
+
+	"github.com/go-chi/chi/v5"
+
+	"github.com/alexei/tinyforge/internal/auth"
+	"github.com/alexei/tinyforge/internal/store"
+	"github.com/alexei/tinyforge/internal/workload/plugin"
+)
+
+// parseOffset turns the raw `offset` query value into a pagination offset.
+// A value that is not a valid integer, or that is negative, yields 0.
+// parseLimit (secrets.go) is the counterpart for the limit parameter.
+func parseOffset(raw string) int {
+	if n, err := strconv.Atoi(raw); err == nil && n >= 0 {
+		return n
+	}
+	return 0
+}
+
+// rollbackCapableKinds lists the source kinds that can redeploy a pinned
+// reference; it is the single source of truth for that decision. Only the
+// image source qualifies today: it resolves intent.Reference as the tag, so
+// replaying an earlier tag is a genuine rollback. static/dockerfile clone
+// branch HEAD and cannot yet check out an arbitrary commit (a later phase);
+// compose has no single artifact handle.
+var rollbackCapableKinds = map[string]bool{
+	"image": true,
+}
+
+// RollbackCapable reports whether sourceKind supports one-click rollback.
+// Kinds absent from rollbackCapableKinds (including "") report false. Both
+// the list response (per-row `rollbackable` flag) and the rollback guard
+// call it so the UI and the server never disagree.
+func RollbackCapable(sourceKind string) bool {
+	capable, listed := rollbackCapableKinds[sourceKind]
+	return listed && capable
+}
+
+// listWorkloadDeploys handles GET /api/workloads/{id}/deploys. It is a
+// read-only endpoint, open to any authenticated user (mirrors the
+// per-workload events feed).
+//
+// Query parameters: `limit` (default 50, capped at 200 via parseLimit) and
+// `offset` (invalid or negative values become 0). The response is the
+// deploy ledger as returned by the store, with a server-computed
+// `rollbackable` flag on every row: true only for a successful deploy with
+// a non-empty reference whose source kind is rollback-capable.
+func (s *Server) listWorkloadDeploys(w http.ResponseWriter, r *http.Request) {
+	workloadID := chi.URLParam(r, "id")
+	if workloadID == "" {
+		respondError(w, http.StatusBadRequest, "workload id is required")
+		return
+	}
+
+	query := r.URL.Query()
+	entries, err := s.store.ListDeployHistory(
+		workloadID,
+		parseLimit(query.Get("limit"), 50, 200),
+		parseOffset(query.Get("offset")),
+	)
+	if err != nil {
+		slog.Error("failed to list deploy history", "workload", workloadID, "error", err)
+		respondError(w, http.StatusInternalServerError, "failed to list deploy history")
+		return
+	}
+
+	// Decorate each row in place; the flag is derived, never stored.
+	for i := range entries {
+		succeeded := entries[i].Outcome == "success"
+		pinned := entries[i].Reference != ""
+		entries[i].Rollbackable = succeeded && pinned && RollbackCapable(entries[i].SourceKind)
+	}
+	respondJSON(w, http.StatusOK, entries)
+}
+
+// rollbackWorkload handles POST /api/workloads/{id}/rollback. Admin-only
+// (same gate as /deploy). Body: {"deploy_id": <id>}.
+//
+// It loads the ledger row named by deploy_id and replays its pinned
+// reference as a `rollback`-reason deploy. The request is refused unless
+// the row belongs to the path workload, recorded a successful deploy, has
+// a non-empty reference, and both the workload's current source kind and
+// the kind the row was recorded under are rollback-capable.
+//
+// Responses:
+//   - 404 when the workload or the ledger row does not exist;
+//   - 400 when the body is invalid or any guard above fails;
+//   - 500 when a store lookup or the dispatch fails (detail is logged,
+//     never returned to the client);
+//   - 202 with the replayed reference once the dispatch returns nil.
+func (s *Server) rollbackWorkload(w http.ResponseWriter, r *http.Request) {
+	id := chi.URLParam(r, "id")
+
+	row, err := s.store.GetWorkloadByID(id)
+	if err != nil {
+		if errors.Is(err, store.ErrNotFound) {
+			respondNotFound(w, "workload")
+			return
+		}
+		// Keep the cause in the server log; the client only sees a generic 500.
+		slog.Error("rollback: failed to get workload", "workload", id, "error", err)
+		respondError(w, http.StatusInternalServerError, "get workload")
+		return
+	}
+	if row.SourceKind == "" {
+		respondError(w, http.StatusBadRequest, "workload has no source_kind; cannot roll back")
+		return
+	}
+
+	var body struct {
+		DeployID int64 `json:"deploy_id"`
+	}
+	if !decodeJSONStrict(w, r, &body) {
+		return
+	}
+	// A missing deploy_id decodes to 0, so this also covers the absent case.
+	if body.DeployID <= 0 {
+		respondError(w, http.StatusBadRequest, "deploy_id is required")
+		return
+	}
+
+	entry, err := s.store.GetDeployHistory(body.DeployID)
+	if err != nil {
+		if errors.Is(err, store.ErrNotFound) {
+			respondNotFound(w, "deploy history entry")
+			return
+		}
+		slog.Error("rollback: failed to get deploy history entry",
+			"workload", id, "deploy_id", body.DeployID, "error", err)
+		respondError(w, http.StatusInternalServerError, "get deploy history")
+		return
+	}
+	// No cross-workload replay: the entry must belong to the path workload.
+	if entry.WorkloadID != id {
+		respondError(w, http.StatusBadRequest, "deploy entry does not belong to this workload")
+		return
+	}
+	if entry.Outcome != "success" {
+		respondError(w, http.StatusBadRequest, "cannot roll back to a failed deploy")
+		return
+	}
+	// Check the entry's recorded kind as well as the workload's current
+	// kind. The list endpoint derives `rollbackable` from the entry's own
+	// SourceKind, so this keeps the guard in agreement with the button
+	// state, and it stops a reference recorded under another source kind
+	// from being replayed through the current one.
+	if entry.Reference == "" ||
+		!RollbackCapable(row.SourceKind) ||
+		!RollbackCapable(entry.SourceKind) {
+		respondError(w, http.StatusBadRequest, "this deploy is not rollback-capable")
+		return
+	}
+
+	// Attribute the deploy to the authenticated user when one is known.
+	actor := "manual"
+	if claims, ok := auth.ClaimsFromContext(r.Context()); ok && claims.Username != "" {
+		actor = claims.Username
+	}
+	intent := plugin.DeploymentIntent{
+		Reason:    "rollback",
+		Reference: entry.Reference,
+		Metadata: map[string]string{
+			"note":        "rollback to " + entry.Reference,
+			"rollback_of": strconv.FormatInt(entry.ID, 10),
+		},
+		TriggeredAt: time.Now().UTC(),
+		TriggeredBy: actor,
+	}
+	if err := s.deployer.DispatchPlugin(r.Context(), toPluginWorkload(row), intent); err != nil {
+		// Raw error stays in the server log; client gets a generic message
+		// (the wrapped error can carry registry-auth bytes).
+		slog.Warn("rollback dispatch failed", "workload", id, "actor", actor,
+			"reference", entry.Reference, "error", err)
+		respondError(w, http.StatusInternalServerError, "rollback failed; see server logs")
+		return
+	}
+	respondJSON(w, http.StatusAccepted, map[string]any{
+		"workload_id":  id,
+		"reference":    entry.Reference,
+		"rollback_of":  entry.ID,
+		"triggered_by": actor,
+	})
+}
@@ -0,0 +1,126 @@
+package api
+
+import (
+	"net/http"
+	"testing"
+
+	"github.com/alexei/tinyforge/internal/store"
+	"github.com/alexei/tinyforge/internal/workload/plugin"
+)
+
+// createImageWorkload creates an image-source workload through the API so
+// source_kind is persisted exactly as production does, and returns the new
+// workload's id. Any non-201 response or envelope error fails the test
+// immediately, with the server's error message included in the failure.
+func createImageWorkload(t *testing.T, e *apiTestEnv, name string) string {
+	t.Helper()
+	resp := e.do(t, http.MethodPost, "/api/workloads", pluginWorkloadRequest{
+		Name: name, SourceKind: "image", SourceConfig: validImageSourceConfig(),
+	})
+	if resp.StatusCode != http.StatusCreated {
+		// Surface the envelope error instead of discarding it, so a failed
+		// setup explains itself.
+		errMsg := decodeEnvelope(t, resp, nil)
+		t.Fatalf("create workload %q: status = %d, want 201 (err=%q)",
+			name, resp.StatusCode, errMsg)
+	}
+	var got plugin.Workload
+	if errMsg := decodeEnvelope(t, resp, &got); errMsg != "" {
+		t.Fatalf("create workload envelope error: %q", errMsg)
+	}
+	return got.ID
+}
+
+// TestListWorkloadDeploys_ComputesRollbackable seeds three ledger rows for
+// one image workload and checks that the list endpoint flags only the
+// successful row with a non-empty reference as rollbackable.
+func TestListWorkloadDeploys_ComputesRollbackable(t *testing.T) {
+	e := newAPITestEnv(t)
+	id := createImageWorkload(t, e, "app")
+
+	// Inserted oldest-first; the endpoint is expected to return them
+	// newest-first.
+	seed := []store.DeployHistoryEntry{
+		// success + reference + image  => rollbackable
+		{WorkloadID: id, SourceKind: "image", Reference: "v1", Outcome: "success"},
+		// failure                      => not rollbackable
+		{WorkloadID: id, SourceKind: "image", Reference: "v2", Outcome: "failure"},
+		// success but empty reference  => not rollbackable
+		{WorkloadID: id, SourceKind: "image", Reference: "", Outcome: "success"},
+	}
+	for i, entry := range seed {
+		// A silently failed insert would surface later as a confusing
+		// row-count mismatch, so stop here instead.
+		if _, err := e.store.InsertDeployHistory(entry); err != nil {
+			t.Fatalf("seed deploy history row %d: %v", i, err)
+		}
+	}
+
+	resp := e.do(t, http.MethodGet, "/api/workloads/"+id+"/deploys", nil)
+	if resp.StatusCode != http.StatusOK {
+		t.Fatalf("status = %d, want 200", resp.StatusCode)
+	}
+	var rows []store.DeployHistoryEntry
+	if errMsg := decodeEnvelope(t, resp, &rows); errMsg != "" {
+		t.Fatalf("envelope error: %q", errMsg)
+	}
+	if len(rows) != 3 {
+		t.Fatalf("expected 3 rows, got %d", len(rows))
+	}
+	// Newest-first: empty-ref success, failure, then v1 success.
+	if rows[2].Reference != "v1" {
+		t.Fatalf("expected oldest row (v1) last, got %+v", rows[2])
+	}
+	if !rows[2].Rollbackable {
+		t.Fatalf("v1 success row should be rollbackable: %+v", rows[2])
+	}
+	if rows[1].Rollbackable || rows[0].Rollbackable {
+		t.Fatalf("failure / empty-ref rows must not be rollbackable")
+	}
+}
+
+// TestRollback_HappyPath_DispatchesRollbackIntent rolls an image workload
+// back to a successful ledger row and checks that the endpoint answers 202
+// and dispatches exactly one deploy whose intent carries the `rollback`
+// reason and the row's reference.
+func TestRollback_HappyPath_DispatchesRollbackIntent(t *testing.T) {
+	e := newAPITestEnv(t)
+	id := createImageWorkload(t, e, "app")
+	entry, err := e.store.InsertDeployHistory(store.DeployHistoryEntry{
+		WorkloadID: id, SourceKind: "image", Reference: "v1", Outcome: "success",
+	})
+	if err != nil {
+		// Without the seed row the request below would fail on a zero
+		// deploy_id and hide the real cause.
+		t.Fatalf("seed deploy history: %v", err)
+	}
+
+	before := e.dispatcher.deployCount.Load()
+	resp := e.do(t, http.MethodPost, "/api/workloads/"+id+"/rollback",
+		map[string]any{"deploy_id": entry.ID})
+	if resp.StatusCode != http.StatusAccepted {
+		errMsg := decodeEnvelope(t, resp, nil)
+		t.Fatalf("status = %d, want 202 (err=%q)", resp.StatusCode, errMsg)
+	}
+	if got := e.dispatcher.deployCount.Load(); got != before+1 {
+		t.Fatalf("expected one dispatch, got delta %d", got-before)
+	}
+	intent := e.dispatcher.lastIntent.Load()
+	if intent == nil || intent.Reason != "rollback" || intent.Reference != "v1" {
+		t.Fatalf("expected rollback intent for v1, got %+v", intent)
+	}
+}
+
+// TestRollback_Guards drives the rollback endpoint through every refusal
+// path that can be set up with image workloads: bad or unknown deploy_id,
+// unknown workload, a failed deploy, a successful deploy with no pinned
+// reference, and a ledger row that belongs to a different workload.
+func TestRollback_Guards(t *testing.T) {
+	e := newAPITestEnv(t)
+	imageID := createImageWorkload(t, e, "img")
+	otherID := createImageWorkload(t, e, "other")
+
+	// seed inserts one image ledger row and aborts the test if the store
+	// rejects it; an ignored failure would leave a zero ID and turn the
+	// case into a misleading "deploy_id is required" 400.
+	seed := func(workloadID, reference, outcome string) store.DeployHistoryEntry {
+		t.Helper()
+		entry, err := e.store.InsertDeployHistory(store.DeployHistoryEntry{
+			WorkloadID: workloadID, SourceKind: "image", Reference: reference, Outcome: outcome,
+		})
+		if err != nil {
+			t.Fatalf("seed deploy history (ref=%q outcome=%q): %v", reference, outcome, err)
+		}
+		return entry
+	}
+
+	success := seed(imageID, "v1", "success")
+	failed := seed(imageID, "v2", "failure")
+	emptyRef := seed(imageID, "", "success")
+	otherWL := seed(otherID, "v1", "success")
+
+	cases := []struct {
+		name     string
+		workload string
+		body     any
+		wantCode int
+	}{
+		{"missing deploy_id", imageID, map[string]any{}, http.StatusBadRequest},
+		{"zero deploy_id", imageID, map[string]any{"deploy_id": 0}, http.StatusBadRequest},
+		{"unknown deploy_id", imageID, map[string]any{"deploy_id": 999999}, http.StatusNotFound},
+		{"unknown workload", "nope", map[string]any{"deploy_id": success.ID}, http.StatusNotFound},
+		{"failed deploy", imageID, map[string]any{"deploy_id": failed.ID}, http.StatusBadRequest},
+		{"empty reference", imageID, map[string]any{"deploy_id": emptyRef.ID}, http.StatusBadRequest},
+		{"cross-workload entry", imageID, map[string]any{"deploy_id": otherWL.ID}, http.StatusBadRequest},
+	}
+	for _, c := range cases {
+		t.Run(c.name, func(t *testing.T) {
+			resp := e.do(t, http.MethodPost, "/api/workloads/"+c.workload+"/rollback", c.body)
+			if resp.StatusCode != c.wantCode {
+				errMsg := decodeEnvelope(t, resp, nil)
+				t.Fatalf("status = %d, want %d (err=%q)", resp.StatusCode, c.wantCode, errMsg)
+			}
+		})
+	}
+}
@@ -9,6 +9,7 @@ import (
 	"strings"
 	"time"

+	"github.com/alexei/tinyforge/internal/docker"
 	"github.com/alexei/tinyforge/internal/staticsite"
 )

@@ -350,6 +351,54 @@ func (s *Server) listImageConflicts(w http.ResponseWriter, r *http.Request) {
 	respondJSON(w, http.StatusOK, conflicts)
 }

+// inspectImageRequest is the JSON body for POST /api/discovery/image/inspect.
+// The handler trims surrounding whitespace from Image and rejects an empty
+// value with a 400.
+type inspectImageRequest struct {
+	Image string `json:"image"`
+}
+
+// inspectImageResponse mirrors the frontend InspectResult shape the
+// new-app wizard pre-fills from: the first exposed port (parsed to int,
+// 0 when none) and the image's HEALTHCHECK command string. Port is produced
+// by docker.ExtractPort over the inspected image's exposed ports; Healthcheck
+// is copied verbatim from the inspect result.
+type inspectImageResponse struct {
+	Port        int    `json:"port"`
+	Healthcheck string `json:"healthcheck"`
+}
+
+// inspectImageMetadata handles POST /api/discovery/image/inspect. It inspects
+// a LOCAL image and returns its first exposed port + healthcheck so the wizard
+// can pre-fill those fields.
+//
+// This inspects local images only — it does not pull. When the image is
+// not present locally the docker call fails; we return a generic,
+// non-leaky 400 rather than the git-specific upstreamError so a raw
+// docker daemon string (which may echo the ref) never reaches the client.
+//
+// Responses: 503 when no Docker client is configured, 400 for an empty image
+// or a failed inspect (decodeJSON writes its own response for a bad body), and
+// 200 with an inspectImageResponse on success.
+func (s *Server) inspectImageMetadata(w http.ResponseWriter, r *http.Request) {
+	// Guard the optional Docker client the same way pruneBuildCache does, so a
+	// host without Docker gets a clean 503 instead of a nil dereference.
+	if s.docker == nil {
+		respondError(w, http.StatusServiceUnavailable, "Docker is not available")
+		return
+	}
+
+	var req inspectImageRequest
+	if !decodeJSON(w, r, &req) {
+		return
+	}
+	image := strings.TrimSpace(req.Image)
+	if image == "" {
+		respondError(w, http.StatusBadRequest, "image is required")
+		return
+	}
+
+	// Bound the daemon call so a wedged Docker cannot hold the request open.
+	ctx, cancel := context.WithTimeout(r.Context(), discoveryTimeout)
+	defer cancel()
+
+	info, err := s.docker.InspectImage(ctx, image)
+	if err != nil {
+		// The raw error goes to the log only; the client sees a fixed message.
+		slog.Warn("inspect image metadata failed", "error", err)
+		respondError(w, http.StatusBadRequest, "could not inspect image — make sure it is pulled locally and the reference is correct")
+		return
+	}
+
+	respondJSON(w, http.StatusOK, inspectImageResponse{
+		Port:        docker.ExtractPort(info.ExposedPorts),
+		Healthcheck: info.Healthcheck,
+	})
+}
+
 // stripImageTag returns the image reference with the trailing :tag
 // removed, taking care to leave a registry port (e.g. registry:5000/foo)
 // intact. Digest references (image@sha256:...) are returned unchanged.
@@ -348,3 +348,32 @@ func (s *Server) pruneImages(w http.ResponseWriter, r *http.Request) {
 		"space_reclaimed_mb": reclaimedBytes / (1024 * 1024),
 	})
 }
+
+// pruneBuildCache serves POST /api/docker/prune-build-cache. It asks the
+// Docker client to prune build-cache records with all=false and reports how
+// many records were deleted and how much space (in MiB) was reclaimed. The
+// build cache is regenerable by definition — pruning only forces slower
+// rebuilds, never data loss. Responds 503 when no Docker client is configured
+// and a generic 500 when the prune call fails (the real error is logged).
+func (s *Server) pruneBuildCache(w http.ResponseWriter, r *http.Request) {
+	if s.docker == nil {
+		respondError(w, http.StatusServiceUnavailable, "Docker is not available")
+		return
+	}
+
+	pruned, pruneErr := s.docker.PruneBuildCache(r.Context(), false)
+	if pruneErr != nil {
+		slog.Error("prune: build cache", "error", pruneErr)
+		respondError(w, http.StatusInternalServerError, "internal server error")
+		return
+	}
+
+	// Compute the reported figures once; they feed both the log line and the
+	// response body.
+	deleted := pruned.CachesDeleted
+	reclaimedMB := pruned.SpaceReclaimed / (1024 * 1024)
+
+	slog.Info("prune: build cache",
+		"caches_deleted", deleted,
+		"space_reclaimed_mb", reclaimedMB)
+
+	payload := map[string]any{
+		"caches_deleted":     deleted,
+		"space_reclaimed_mb": reclaimedMB,
+	}
+	respondJSON(w, http.StatusOK, payload)
+}
@@ -37,6 +37,36 @@ func (s *Server) listEventLog(w http.ResponseWriter, r *http.Request) {
 	respondJSON(w, http.StatusOK, events)
 }

+// listWorkloadEvents serves GET /api/workloads/{id}/events — the per-app
+// activity/deploy timeline. The workload id always comes from the URL path
+// and is the only workload filter passed to the store, so query parameters
+// cannot widen the scope. Accepts the severity, limit and offset query params.
+func (s *Server) listWorkloadEvents(w http.ResponseWriter, r *http.Request) {
+	workloadID := chi.URLParam(r, "id")
+	if workloadID == "" {
+		respondError(w, http.StatusBadRequest, "workload id is required")
+		return
+	}
+
+	params := r.URL.Query()
+	filter := store.EventLogFilter{
+		WorkloadID: workloadID,
+		Severity:   params.Get("severity"),
+	}
+	// The Atoi errors are discarded, so a missing or non-numeric limit/offset
+	// reaches the store as 0.
+	filter.Limit, _ = strconv.Atoi(params.Get("limit"))
+	filter.Offset, _ = strconv.Atoi(params.Get("offset"))
+
+	events, listErr := s.store.ListEvents(filter)
+	if listErr != nil {
+		slog.Error("failed to list workload events", "workload", workloadID, "error", listErr)
+		respondError(w, http.StatusInternalServerError, "failed to list events")
+		return
+	}
+
+	respondJSON(w, http.StatusOK, events)
+}
+
 // getEventLogStats handles GET /api/events/log/stats.
 func (s *Server) getEventLogStats(w http.ResponseWriter, r *http.Request) {
 	stats, err := s.store.GetEventStats()
@@ -0,0 +1,364 @@
+package api
+
+import (
+	"encoding/json"
+	"errors"
+	"fmt"
+	"log/slog"
+	"net/http"
+	"sort"
+	"strings"
+	"sync"
+
+	"github.com/go-chi/chi/v5"
+
+	"github.com/alexei/tinyforge/internal/auth"
+	"github.com/alexei/tinyforge/internal/crypto"
+	"github.com/alexei/tinyforge/internal/gitops"
+	"github.com/alexei/tinyforge/internal/store"
+	"github.com/alexei/tinyforge/internal/workload/plugin"
+)
+
+// keyedMutex hands out one mutex per string key, creating each on first use.
+// It serializes a critical section per workload id (the GitOps sync) without
+// a global lock. The zero value is ready to use. Entries are never removed
+// from the map.
+type keyedMutex struct {
+	mu sync.Mutex
+	m  map[string]*sync.Mutex
+}
+
+// lock blocks until the mutex for key is held and returns the function that
+// releases it.
+func (k *keyedMutex) lock(key string) func() {
+	// Look up (or create) the per-key mutex under the registry lock only; the
+	// registry lock is released before blocking on the per-key mutex.
+	perKey := func() *sync.Mutex {
+		k.mu.Lock()
+		defer k.mu.Unlock()
+
+		if k.m == nil {
+			k.m = make(map[string]*sync.Mutex)
+		}
+		if existing, found := k.m[key]; found {
+			return existing
+		}
+		created := &sync.Mutex{}
+		k.m[key] = created
+		return created
+	}()
+
+	perKey.Lock()
+	return perKey.Unlock
+}
+
+// gitOpsStatusResponse is the single rich payload the GitOps panel reads — it
+// folds the file preview, parsed status, and drift into one response so the UI
+// makes a single call (no separate /drift round-trip). It is the JSON body of
+// GET /api/workloads/{id}/gitops; DriftCount is always len(Drift), and Drift is
+// sent as an empty array rather than null when nothing drifts.
+type gitOpsStatusResponse struct {
+	Eligible   bool                `json:"eligible"`     // source kind supports GitOps
+	Enabled    bool                `json:"enabled"`      // opt-in flag on the workload
+	Path       string              `json:"path"`         // repo-relative config path
+	Status     string              `json:"status"`       // disabled|ok|no_file|fetch_failed|invalid
+	Raw        string              `json:"raw"`          // the .tinyforge.yml text, when present
+	Message    string              `json:"message"`      // token-redacted detail for non-ok
+	CommitSHA  string              `json:"commit_sha"`   // ref the file was read at
+	LastSyncAt string              `json:"last_sync_at"` // last successful sync ("" = never)
+	Drift      []gitops.DriftEntry `json:"drift"`        // declared fields that differ from live
+	DriftCount int                 `json:"drift_count"`
+	// ManagedFields lists every source_config key the repo overlay declares
+	// (not just the drifting ones) so the UI can lock exactly those fields on
+	// the edit form. Populated only when the file parsed (status ok).
+	ManagedFields []string `json:"managed_fields"`
+}
+
+// getWorkloadGitOps serves GET /api/workloads/{id}/gitops. It is read-only and
+// open to any authenticated user. With GitOps disabled (or the source kind
+// ineligible) it returns the stored settings with status "disabled". Otherwise
+// it fetches the repo's .tinyforge.yml live and, when the file parses,
+// computes drift against the stored source_config plus the list of managed
+// fields. Always answers 200 once the workload is found; fetch problems are
+// reported through the payload's status/message.
+func (s *Server) getWorkloadGitOps(w http.ResponseWriter, r *http.Request) {
+	row, ok := s.loadWorkload(w, chi.URLParam(r, "id"))
+	if !ok {
+		return
+	}
+
+	configPath := row.GitOpsPath
+	if configPath == "" {
+		configPath = ".tinyforge.yml"
+	}
+	eligible := gitops.IsEligibleSource(row.SourceKind)
+	out := gitOpsStatusResponse{
+		Eligible:   eligible,
+		Enabled:    row.GitOpsEnabled,
+		Path:       configPath,
+		Status:     "disabled",
+		LastSyncAt: row.GitOpsLastSyncAt,
+		CommitSHA:  row.GitOpsCommitSHA,
+		Drift:      []gitops.DriftEntry{},
+	}
+
+	// Never touch the repo unless GitOps is switched on for an eligible source.
+	if !row.GitOpsEnabled || !eligible {
+		respondJSON(w, http.StatusOK, out)
+		return
+	}
+
+	ref, refErr := s.gitOpsRepoRef(row)
+	if refErr != nil {
+		// A decode/decrypt failure is reported as fetch_failed with a fixed
+		// message; the raw error (it can carry the token / config bytes) is
+		// only logged.
+		slog.Warn("gitops: build repo ref", "workload", row.ID, "error", refErr)
+		out.Status = string(gitops.StatusFetchFailed)
+		out.Message = "could not read repo settings for this workload"
+		respondJSON(w, http.StatusOK, out)
+		return
+	}
+
+	fetched := gitops.Fetch(r.Context(), ref)
+	out.Status = string(fetched.Status)
+	out.CommitSHA = firstNonEmpty(fetched.CommitSHA, row.GitOpsCommitSHA)
+	out.Message = fetched.Message
+	if len(fetched.Raw) > 0 {
+		out.Raw = string(fetched.Raw)
+	}
+	if fetched.Status != gitops.StatusOK {
+		respondJSON(w, http.StatusOK, out)
+		return
+	}
+
+	drift, driftErr := gitops.Drift(fetched.Spec, json.RawMessage(row.SourceConfig), row.SourceKind)
+	switch {
+	case driftErr != nil:
+		// A drift failure is non-fatal: the payload keeps its empty drift list.
+		slog.Warn("gitops: drift", "workload", row.ID, "error", driftErr)
+	case drift != nil:
+		out.Drift = drift
+	}
+	out.DriftCount = len(out.Drift)
+	out.ManagedFields = planFields(gitops.BuildPlan(fetched.Spec, row.SourceKind))
+
+	respondJSON(w, http.StatusOK, out)
+}
+
+// setWorkloadGitOps serves PUT /api/workloads/{id}/gitops. Admin-only.
+// Body: {"enabled": bool, "path": string}. Enabling is refused (400) for
+// source kinds that are not GitOps-eligible, and a non-empty path must pass
+// validGitOpsPath. On success the response echoes the enabled flag and the
+// effective path, substituting ".tinyforge.yml" when the stored path is empty.
+func (s *Server) setWorkloadGitOps(w http.ResponseWriter, r *http.Request) {
+	row, ok := s.loadWorkload(w, chi.URLParam(r, "id"))
+	if !ok {
+		return
+	}
+
+	var req struct {
+		Enabled bool   `json:"enabled"`
+		Path    string `json:"path"`
+	}
+	if !decodeJSONStrict(w, r, &req) {
+		return
+	}
+
+	if req.Enabled && !gitops.IsEligibleSource(row.SourceKind) {
+		respondError(w, http.StatusBadRequest,
+			"GitOps is only available for dockerfile and static sources")
+		return
+	}
+
+	// An empty path is allowed and stored as-is; only a supplied path is
+	// validated.
+	storedPath := strings.TrimSpace(req.Path)
+	if storedPath != "" && !validGitOpsPath(storedPath) {
+		respondError(w, http.StatusBadRequest,
+			"invalid path: must be a repo-relative file (no \"..\", no leading slash)")
+		return
+	}
+
+	if err := s.store.SetWorkloadGitOps(row.ID, req.Enabled, storedPath); err != nil {
+		slog.Error("gitops: set", "workload", row.ID, "error", err)
+		respondError(w, http.StatusInternalServerError, "failed to update GitOps settings")
+		return
+	}
+
+	effectivePath := storedPath
+	if effectivePath == "" {
+		effectivePath = ".tinyforge.yml"
+	}
+	respondJSON(w, http.StatusOK, map[string]any{"enabled": req.Enabled, "path": effectivePath})
+}
+
+// syncWorkloadGitOps handles POST /api/workloads/{id}/gitops/sync. Admin-only.
+// It fetches the repo's .tinyforge.yml, merges the declared overlay onto the
+// live source_config (validate-then-commit), persists it, and records the sync.
+// Explicit action only — there is no auto-apply on deploy in v1.
+//
+// Responses: 400 for a missing id, an ineligible source kind, GitOps not
+// enabled, a missing or invalid config file, or a merged config the source's
+// Validate rejects; 502 when the repo settings cannot be read or the fetch
+// fails; 500 for an unknown source kind or a failed persist; 200 with the
+// applied fields, commit SHA and actor on success.
+func (s *Server) syncWorkloadGitOps(w http.ResponseWriter, r *http.Request) {
+	id := chi.URLParam(r, "id")
+	if id == "" {
+		respondError(w, http.StatusBadRequest, "workload id is required")
+		return
+	}
+	// Serialize the whole read→merge→write per workload so two concurrent
+	// syncs can't clobber each other (review S5). Load the row INSIDE the lock
+	// so each sync merges off the latest persisted config.
+	unlock := s.gitopsSync.lock(id)
+	defer unlock()
+
+	row, ok := s.loadWorkload(w, id)
+	if !ok {
+		return
+	}
+	if !gitops.IsEligibleSource(row.SourceKind) {
+		respondError(w, http.StatusBadRequest,
+			"GitOps is only available for dockerfile and static sources")
+		return
+	}
+	if !row.GitOpsEnabled {
+		respondError(w, http.StatusBadRequest, "enable GitOps for this workload first")
+		return
+	}
+
+	ref, err := s.gitOpsRepoRef(row)
+	if err != nil {
+		// Log the cause; the client only gets a fixed message.
+		slog.Warn("gitops: build repo ref", "workload", row.ID, "error", err)
+		respondError(w, http.StatusBadGateway, "could not read repo settings for this workload")
+		return
+	}
+
+	res := gitops.Fetch(r.Context(), ref)
+	switch res.Status {
+	case gitops.StatusOK:
+		// proceed
+	case gitops.StatusNoFile:
+		respondError(w, http.StatusBadRequest, "no "+ref.Path+" found on branch "+ref.Branch)
+		return
+	case gitops.StatusInvalid:
+		respondError(w, http.StatusBadRequest, "invalid "+ref.Path+": "+res.Message)
+		return
+	default: // fetch_failed
+		// The fetch detail is logged but not returned to the client.
+		slog.Warn("gitops: fetch failed", "workload", row.ID, "detail", res.Message)
+		respondError(w, http.StatusBadGateway, "could not fetch "+ref.Path+" from the repo")
+		return
+	}
+
+	src, err := plugin.GetSource(row.SourceKind)
+	if err != nil {
+		respondError(w, http.StatusInternalServerError, "unknown source kind")
+		return
+	}
+	plan := gitops.BuildPlan(res.Spec, row.SourceKind)
+	merged, err := gitops.MergeAndValidate(json.RawMessage(row.SourceConfig), plan, src.Validate)
+	if err != nil {
+		// The merged config failed the source's own Validate — the file
+		// declares something this workload can't accept. Safe to surface (it
+		// describes config shape, not secrets).
+		respondError(w, http.StatusBadRequest, "the repo config was rejected: "+err.Error())
+		return
+	}
+
+	// Persist via a full-row update off the row we loaded (single read →
+	// merge → write). A per-workload sync lock that closes the remaining
+	// edit-vs-sync window is a Phase 4 hardening item.
+	// NOTE(review): the lock taken at the top of this handler is only visibly
+	// acquired here, so it serializes sync-vs-sync; confirm whether the edit
+	// handlers also take it.
+	row.SourceConfig = string(merged)
+	if err := s.store.UpdateWorkload(row); err != nil {
+		slog.Error("gitops: persist merged config", "workload", row.ID, "error", err)
+		respondError(w, http.StatusInternalServerError, "failed to apply the repo config")
+		return
+	}
+	// A failure to record the sync bookkeeping is logged but does not fail the
+	// request: the merged config is already persisted at this point.
+	if err := s.store.RecordGitOpsSync(row.ID, res.CommitSHA, store.Now()); err != nil {
+		slog.Warn("gitops: record sync", "workload", row.ID, "error", err)
+	}
+
+	// Attribute the sync to the authenticated username when one is present,
+	// otherwise to the literal "manual".
+	actor := "manual"
+	if claims, ok := auth.ClaimsFromContext(r.Context()); ok && claims.Username != "" {
+		actor = claims.Username
+	}
+	appliedFields := planFields(plan)
+	s.recordGitOpsEvent(row.ID, res.CommitSHA, actor, appliedFields)
+
+	respondJSON(w, http.StatusOK, map[string]any{
+		"status":         "applied",
+		"commit_sha":     res.CommitSHA,
+		"applied_fields": appliedFields,
+		"triggered_by":   actor,
+	})
+}
+
+// loadWorkload looks a workload up by id for the GitOps handlers. On failure
+// it writes the error response itself — 400 for an empty id, 404 when the
+// store reports ErrNotFound, 500 for any other store error — and returns
+// ok=false with a zero Workload.
+func (s *Server) loadWorkload(w http.ResponseWriter, id string) (store.Workload, bool) {
+	if id == "" {
+		respondError(w, http.StatusBadRequest, "workload id is required")
+		return store.Workload{}, false
+	}
+
+	row, err := s.store.GetWorkloadByID(id)
+	switch {
+	case err == nil:
+		return row, true
+	case errors.Is(err, store.ErrNotFound):
+		respondNotFound(w, "workload")
+	default:
+		respondError(w, http.StatusInternalServerError, "get workload")
+	}
+	return store.Workload{}, false
+}
+
+// gitOpsRepoRef turns a workload row into a gitops.RepoRef. It decodes the git
+// coordinates from the row's source_config JSON, decrypts the access token
+// with the server's encryption key when one is stored, and applies the
+// defaults: branch "main" and path ".tinyforge.yml". The gitops package stays
+// decoupled from the store/crypto by taking the plain coords. Returns a
+// wrapped error when the JSON cannot be decoded or the token cannot be
+// decrypted.
+func (s *Server) gitOpsRepoRef(row store.Workload) (gitops.RepoRef, error) {
+	var coords struct {
+		Provider    string `json:"provider"`
+		BaseURL     string `json:"base_url"`
+		RepoOwner   string `json:"repo_owner"`
+		RepoName    string `json:"repo_name"`
+		Branch      string `json:"branch"`
+		AccessToken string `json:"access_token"`
+	}
+	if err := json.Unmarshal([]byte(row.SourceConfig), &coords); err != nil {
+		return gitops.RepoRef{}, fmt.Errorf("decode source_config: %w", err)
+	}
+
+	ref := gitops.RepoRef{
+		Provider: coords.Provider,
+		BaseURL:  coords.BaseURL,
+		Owner:    coords.RepoOwner,
+		Repo:     coords.RepoName,
+		Branch:   coords.Branch,
+		Path:     row.GitOpsPath,
+	}
+
+	// An empty stored token means anonymous access: Token stays "".
+	if coords.AccessToken != "" {
+		plain, err := crypto.Decrypt(s.encKey, coords.AccessToken)
+		if err != nil {
+			return gitops.RepoRef{}, fmt.Errorf("decrypt access token: %w", err)
+		}
+		ref.Token = plain
+	}
+
+	if ref.Branch == "" {
+		ref.Branch = "main"
+	}
+	if ref.Path == "" {
+		ref.Path = ".tinyforge.yml"
+	}
+	return ref, nil
+}
+
+// recordGitOpsEvent appends an info-level "gitops" entry to the per-workload
+// event log — the audit trail for a config-only sync, kept OUT of
+// deploy_history (which the rollback feature treats as redeployable rows).
+// The commit SHA, actor and applied fields travel as JSON metadata. Best
+// effort: an insert failure is logged and otherwise ignored.
+func (s *Server) recordGitOpsEvent(workloadID, sha, actor string, fields []string) {
+	// The Marshal error is discarded, as in the original.
+	encoded, _ := json.Marshal(map[string]any{
+		"commit_sha": sha,
+		"by":         actor,
+		"fields":     fields,
+	})
+
+	entry := store.EventLog{
+		Source:     "gitops",
+		WorkloadID: workloadID,
+		Severity:   "info",
+		Message:    "GitOps config synced from repo",
+		Metadata:   string(encoded),
+	}
+	_, err := s.store.InsertEvent(entry)
+	if err == nil {
+		return
+	}
+	slog.Warn("gitops: record event", "workload", workloadID, "error", err)
+}
+
+// validGitOpsPath reports whether p is acceptable as a stored repo-relative
+// config path. It rejects the empty string, anything longer than 255 bytes,
+// a leading "/" or "\", any occurrence of "..", and any control character,
+// DEL, space, '?', '#' or '\' — so a stored path can't escape the repo
+// (review M2) or smuggle a query/fragment onto the provider's raw-file URL
+// (review LOW-1).
+func validGitOpsPath(p string) bool {
+	switch {
+	case p == "", len(p) > 255:
+		return false
+	case p[0] == '/', p[0] == '\\':
+		return false
+	case strings.Contains(p, ".."):
+		return false
+	}
+
+	forbidden := func(r rune) bool {
+		switch r {
+		case 0x7f, '?', '#', ' ', '\\':
+			return true
+		}
+		return r < 0x20
+	}
+	return strings.IndexFunc(p, forbidden) < 0
+}
+
+// planFields returns the source_config keys an apply plan touches, sorted
+// alphabetically. Sorting makes the output deterministic: Go randomizes map
+// iteration order, and this list is sent to the UI and written to the audit
+// event, so an unsorted result would differ between identical requests.
+// The result is never nil; an empty plan yields an empty slice.
+func planFields(plan gitops.ApplyPlan) []string {
+	fields := make([]string, 0, len(plan.SourceConfigPatch))
+	for k := range plan.SourceConfigPatch {
+		fields = append(fields, k)
+	}
+	sort.Strings(fields)
+	return fields
+}
@@ -0,0 +1,48 @@
+package api
+
+import (
+	"sort"
+	"testing"
+
+	"github.com/alexei/tinyforge/internal/gitops"
+)
+
+func TestValidGitOpsPath(t *testing.T) {
+	cases := []struct {
+		path string
+		ok   bool
+	}{
+		{".tinyforge.yml", true},
+		{"deploy/.tinyforge.yml", true},
+		{"config/app.yaml", true},
+		{"/etc/passwd", false},      // absolute
+		{"\\windows\\path", false},  // absolute (backslash)
+		{"../../etc/passwd", false}, // traversal
+		{"deploy/../../x", false},   // traversal mid-path
+		{"foo?ref=evil", false},     // query-param injection (LOW-1)
+		{"foo#frag", false},         // fragment injection
+		{"with space.yml", false},   // whitespace
+		{"", false},                 // empty
+	}
+	for _, c := range cases {
+		if got := validGitOpsPath(c.path); got != c.ok {
+			t.Errorf("validGitOpsPath(%q) = %v, want %v", c.path, got, c.ok)
+		}
+	}
+}
+
+// TestPlanFields checks that planFields reports exactly the source_config keys
+// a dockerfile plan declares for a spec that sets port and deploy strategy.
+// The result is sorted before comparison so the check does not depend on the
+// order planFields returns.
+func TestPlanFields(t *testing.T) {
+	deploy := gitops.DeploySpec{
+		Port:           ptrInt(8080),
+		DeployStrategy: ptrStr("blue-green"),
+	}
+	spec := gitops.Spec{Version: 1, Deploy: deploy}
+
+	got := planFields(gitops.BuildPlan(spec, gitops.SourceDockerfile))
+	sort.Strings(got)
+
+	want := []string{"deploy_strategy", "port"}
+	mismatch := len(got) != len(want)
+	for i := 0; !mismatch && i < len(want); i++ {
+		mismatch = got[i] != want[i]
+	}
+	if mismatch {
+		t.Fatalf("planFields = %v, want %v", got, want)
+	}
+}
+
+// ptrInt returns a pointer to a fresh copy of i.
+func ptrInt(i int) *int {
+	v := i
+	return &v
+}
+
+// ptrStr returns a pointer to a fresh copy of s.
+func ptrStr(s string) *string {
+	v := s
+	return &v
+}
@@ -0,0 +1,64 @@
+package api
+
+import (
+	"context"
+	"log/slog"
+	"net/http"
+	"time"
+
+	"github.com/alexei/tinyforge/internal/metrics"
+)
+
+// livez always returns 200 if the process is up. Used by container
+// orchestrators / load balancers / Docker HEALTHCHECK as the "is the
+// binary alive" probe. Intentionally does NOT touch the DB or Docker —
+// a slow DB must not cause restart loops.
+//
+// The body is the plain-text line "ok"; the request is not inspected at all.
+func (s *Server) livez(w http.ResponseWriter, _ *http.Request) {
+	w.Header().Set("Content-Type", "text/plain; charset=utf-8")
+	// No explicit WriteHeader: the first Write sends the implicit 200. The
+	// write error is ignored because nothing useful can be done about it here.
+	_, _ = w.Write([]byte("ok\n"))
+}
+
+// readyz returns 200 only when the process can actually serve traffic:
+// SQLite is reachable, the encryption key is loaded, the deployer is
+// not draining. The response body is intentionally minimal — the
+// specific failing probe name is recorded in slog (operator-visible)
+// rather than returned to unauthenticated callers. This avoids handing
+// reconnaissance to an attacker who can hit /readyz during an outage
+// ("DB down" vs "encryption key missing" leaks operational state).
+func (s *Server) readyz(w http.ResponseWriter, r *http.Request) {
+	ctx, cancel := context.WithTimeout(r.Context(), 2*time.Second)
+	defer cancel()
+
+	// DB ping: cheap and exact — exercises the connection pool, file
+	// lock, and busy-timeout. A failing ping means SQLite WAL is wedged
+	// or the data dir is gone.
+	if err := s.store.DB().PingContext(ctx); err != nil {
+		slog.Warn("readyz: db ping failed", "error", err)
+		w.Header().Set("Content-Type", "text/plain; charset=utf-8")
+		w.WriteHeader(http.StatusServiceUnavailable)
+		_, _ = w.Write([]byte("not ready\n"))
+		return
+	}
+
+	// Encryption key sanity: if it's zero we cannot decrypt any stored
+	// secret, so the deployer paths will all explode at first use.
+	if s.encKey == ([32]byte{}) {
+		slog.Warn("readyz: encryption key not loaded")
+		w.Header().Set("Content-Type", "text/plain; charset=utf-8")
+		w.WriteHeader(http.StatusServiceUnavailable)
+		_, _ = w.Write([]byte("not ready\n"))
+		return
+	}
+
+	w.Header().Set("Content-Type", "text/plain; charset=utf-8")
+	_, _ = w.Write([]byte("ready\n"))
+}
+
+// metricsExport writes the process-wide metrics registry in Prometheus
+// text format. Admin-only by router placement; surface is intentionally
+// thin (no histograms / quantiles, only counters) to keep the binary
+// dependency-free.
+//
+// The request is not inspected. The Content-Type advertises Prometheus text
+// exposition format version 0.0.4.
+func (s *Server) metricsExport(w http.ResponseWriter, _ *http.Request) {
+	w.Header().Set("Content-Type", "text/plain; version=0.0.4; charset=utf-8")
+	// The write error is deliberately discarded: by the time it could fail the
+	// response has already started, so no error status can be sent.
+	_ = metrics.DefaultRegistry.WritePrometheus(w)
+}
@@ -0,0 +1,235 @@
+// Package api: metric-alert rule HTTP handlers. The evaluator lives in
+// internal/metricalert; this file is the REST surface that lets
+// operators create, edit, and delete threshold rules. Mirrors the
+// log-scan rule handlers.
+package api
+
+import (
+	"errors"
+	"net/http"
+	"strconv"
+	"strings"
+
+	"github.com/go-chi/chi/v5"
+
+	"github.com/alexei/tinyforge/internal/store"
+)
+
+// metricAlertRuleInput is the JSON shape accepted by POST + PATCH.
+// Pointers distinguish "absent" from explicit empty/zero. WorkloadID is
+// immutable on update (per store.UpdateMetricAlertRule) so it only takes
+// effect on create.
+//
+// On create, an absent severity defaults to warn, an absent cooldown to 300
+// seconds, and an absent enabled flag to true. On PATCH, an absent field
+// leaves the stored value untouched; an explicitly empty metric, comparator
+// or severity is also treated as "no change".
+type metricAlertRuleInput struct {
+	WorkloadID      *string  `json:"workload_id"`
+	Name            *string  `json:"name"`
+	Metric          *string  `json:"metric"`
+	Comparator      *string  `json:"comparator"`
+	Threshold       *float64 `json:"threshold"`
+	Severity        *string  `json:"severity"`
+	CooldownSeconds *int     `json:"cooldown_seconds"`
+	Enabled         *bool    `json:"enabled"`
+}
+
+// listMetricAlertRules serves GET /api/metric-alert-rules. Without a query
+// filter it returns every rule; with `workload_id=...` it returns the rules
+// the store reports for that workload (its own rows plus globals). Any store
+// error becomes a generic 500.
+func (s *Server) listMetricAlertRules(w http.ResponseWriter, r *http.Request) {
+	fail := func() {
+		respondError(w, http.StatusInternalServerError, "list metric alert rules")
+	}
+
+	workloadID := r.URL.Query().Get("workload_id")
+	if workloadID == "" {
+		all, err := s.store.ListMetricAlertRules()
+		if err != nil {
+			fail()
+			return
+		}
+		respondJSON(w, http.StatusOK, all)
+		return
+	}
+
+	scoped, err := s.store.ListMetricAlertRulesByWorkload(workloadID)
+	if err != nil {
+		fail()
+		return
+	}
+	respondJSON(w, http.StatusOK, scoped)
+}
+
+// getMetricAlertRule serves GET /api/metric-alert-rules/{id}: it parses the
+// numeric id from the path, loads the rule, and returns it as JSON. Store
+// errors are translated by mapStoreError.
+func (s *Server) getMetricAlertRule(w http.ResponseWriter, r *http.Request) {
+	ruleID, valid := parseMetricAlertRuleID(w, r)
+	if !valid {
+		return
+	}
+
+	found, lookupErr := s.store.GetMetricAlertRule(ruleID)
+	if lookupErr != nil {
+		mapStoreError(w, lookupErr, "metric alert rule")
+		return
+	}
+
+	respondJSON(w, http.StatusOK, found)
+}
+
+// createMetricAlertRule serves POST /api/metric-alert-rules. Absent fields
+// take their defaults (severity warn, cooldown 300s, enabled true), the
+// result is validated at the boundary, and the stored rule is returned with
+// 201. A validation error from the store is surfaced as 400; any other store
+// error is a generic 500.
+func (s *Server) createMetricAlertRule(w http.ResponseWriter, r *http.Request) {
+	var payload metricAlertRuleInput
+	if !decodeJSON(w, r, &payload) {
+		return
+	}
+
+	// Rules are enabled unless the caller explicitly says otherwise.
+	enabled := true
+	if payload.Enabled != nil {
+		enabled = *payload.Enabled
+	}
+	severity := firstNonEmpty(derefString(payload.Severity), store.LogScanSeverityWarn)
+
+	candidate := store.MetricAlertRule{
+		WorkloadID:      derefString(payload.WorkloadID),
+		Name:            derefString(payload.Name),
+		Metric:          derefString(payload.Metric),
+		Comparator:      derefString(payload.Comparator),
+		Threshold:       derefFloat64(payload.Threshold),
+		Severity:        severity,
+		CooldownSeconds: derefIntDefault(payload.CooldownSeconds, 300),
+		Enabled:         enabled,
+	}
+	if problem := validateMetricAlertInput(candidate); problem != "" {
+		respondError(w, http.StatusBadRequest, problem)
+		return
+	}
+
+	created, err := s.store.CreateMetricAlertRule(candidate)
+	switch {
+	case err == nil:
+		respondJSON(w, http.StatusCreated, created)
+	case isMetricAlertValidationErr(err):
+		respondError(w, http.StatusBadRequest, err.Error())
+	default:
+		respondError(w, http.StatusInternalServerError, "create metric alert rule")
+	}
+}
+
+// updateMetricAlertRule serves PATCH /api/metric-alert-rules/{id}. It loads
+// the stored rule, overlays the fields present in the body, re-validates the
+// result and persists it. workload_id is immutable and never overlaid. An
+// empty metric, comparator or severity is ignored; name, threshold, cooldown
+// and enabled are applied whenever present (so an empty name fails
+// validation).
+func (s *Server) updateMetricAlertRule(w http.ResponseWriter, r *http.Request) {
+	ruleID, valid := parseMetricAlertRuleID(w, r)
+	if !valid {
+		return
+	}
+	rule, err := s.store.GetMetricAlertRule(ruleID)
+	if err != nil {
+		mapStoreError(w, err, "metric alert rule")
+		return
+	}
+
+	var patch metricAlertRuleInput
+	if !decodeJSON(w, r, &patch) {
+		return
+	}
+
+	// overrideNonEmpty copies src into dst only when src is present and
+	// non-empty.
+	overrideNonEmpty := func(dst *string, src *string) {
+		if src != nil && *src != "" {
+			*dst = *src
+		}
+	}
+	if patch.Name != nil {
+		rule.Name = *patch.Name
+	}
+	overrideNonEmpty(&rule.Metric, patch.Metric)
+	overrideNonEmpty(&rule.Comparator, patch.Comparator)
+	if patch.Threshold != nil {
+		rule.Threshold = *patch.Threshold
+	}
+	overrideNonEmpty(&rule.Severity, patch.Severity)
+	if patch.CooldownSeconds != nil {
+		rule.CooldownSeconds = *patch.CooldownSeconds
+	}
+	if patch.Enabled != nil {
+		rule.Enabled = *patch.Enabled
+	}
+
+	if problem := validateMetricAlertInput(rule); problem != "" {
+		respondError(w, http.StatusBadRequest, problem)
+		return
+	}
+
+	updated, err := s.store.UpdateMetricAlertRule(rule)
+	switch {
+	case err == nil:
+		respondJSON(w, http.StatusOK, updated)
+	case errors.Is(err, store.ErrNotFound):
+		respondNotFound(w, "metric alert rule")
+	case isMetricAlertValidationErr(err):
+		respondError(w, http.StatusBadRequest, err.Error())
+	default:
+		respondError(w, http.StatusInternalServerError, "update metric alert rule")
+	}
+}
+
+// deleteMetricAlertRule serves DELETE /api/metric-alert-rules/{id}. It
+// answers 204 with no body on success; store errors are translated by
+// mapStoreError.
+func (s *Server) deleteMetricAlertRule(w http.ResponseWriter, r *http.Request) {
+	ruleID, valid := parseMetricAlertRuleID(w, r)
+	if !valid {
+		return
+	}
+
+	deleteErr := s.store.DeleteMetricAlertRule(ruleID)
+	if deleteErr != nil {
+		mapStoreError(w, deleteErr, "metric alert rule")
+		return
+	}
+
+	w.WriteHeader(http.StatusNoContent)
+}
+
+// validateMetricAlertInput performs boundary validation so the handlers can
+// return a clear 400 before hitting the store (which re-validates the same
+// invariants as a backstop). It returns the first problem found as a
+// client-facing message, or "" when the rule is acceptable. An empty severity
+// is accepted here.
+func validateMetricAlertInput(rule store.MetricAlertRule) string {
+	if strings.TrimSpace(rule.Name) == "" {
+		return "name is required"
+	}
+
+	if m := rule.Metric; m != store.MetricCPUPercent &&
+		m != store.MetricMemoryPercent &&
+		m != store.MetricMemoryBytes {
+		return "invalid metric: must be cpu_percent, memory_percent, or memory_bytes"
+	}
+
+	if c := rule.Comparator; c != store.MetricComparatorGT && c != store.MetricComparatorLT {
+		return "invalid comparator: must be gt or lt"
+	}
+
+	if sev := rule.Severity; sev != "" &&
+		sev != store.LogScanSeverityInfo &&
+		sev != store.LogScanSeverityWarn &&
+		sev != store.LogScanSeverityError {
+		return "invalid severity: must be info, warn, or error"
+	}
+
+	if rule.CooldownSeconds < 0 {
+		return "cooldown_seconds must be >= 0"
+	}
+	return ""
+}
+
+// isMetricAlertValidationErr reports whether err looks like one of the
+// store's rule-validation failures, so the handlers can answer 400 rather
+// than 500 without leaking driver text. Matching is by substring on the error
+// message; a nil error is never a validation error.
+func isMetricAlertValidationErr(err error) bool {
+	if err == nil {
+		return false
+	}
+
+	markers := [...]string{
+		"name is required",
+		"invalid metric",
+		"invalid comparator",
+		"invalid severity",
+		"cooldown_seconds must be",
+	}
+	text := err.Error()
+	for i := range markers {
+		if strings.Contains(text, markers[i]) {
+			return true
+		}
+	}
+	return false
+}
+
+// parseMetricAlertRuleID reads the {id} path parameter as a positive base-10
+// int64. On a malformed, zero or negative id it writes a 400 "invalid rule id"
+// response and returns ok=false.
+func parseMetricAlertRuleID(w http.ResponseWriter, r *http.Request) (int64, bool) {
+	parsed, parseErr := strconv.ParseInt(chi.URLParam(r, "id"), 10, 64)
+	if parseErr == nil && parsed > 0 {
+		return parsed, true
+	}
+	respondError(w, http.StatusBadRequest, "invalid rule id")
+	return 0, false
+}
+
+func derefFloat64(p *float64) float64 {
+	if p == nil {
+		return 0
+	}
+	return *p
+}
@@ -1,14 +1,119 @@
 package api

 import (
+	"context"
+	"crypto/rand"
+	"encoding/hex"
 	"log/slog"
+	"net"
 	"net/http"
+	"os"
 	"runtime/debug"
 	"strings"
 	"sync"
 	"time"
+
+	"github.com/alexei/tinyforge/internal/metrics"
 )

+// requestIDKeyType is the private type of the context key under which the
+// generated/forwarded X-Request-ID is stored. A dedicated unexported type
+// keeps the key from colliding with context values set by other packages.
+type requestIDKeyType struct{}
+
+// requestIDKey is the context key itself. It is reachable from outside the
+// package only through RequestIDFromContext, so handlers and services
+// downstream of the API layer can thread the ID into their own slog calls
+// without re-extracting it from headers.
+var requestIDKey = requestIDKeyType{}
+
+// RequestIDFromContext returns the correlation ID stored in ctx, or "" when
+// none is present (for example outside the API request path).
+func RequestIDFromContext(ctx context.Context) string {
+	// A missing value or a non-string value both leave rid as "".
+	rid, _ := ctx.Value(requestIDKey).(string)
+	return rid
+}
+
+// requestID is middleware that guarantees every request carries a
+// correlation ID. A caller-supplied X-Request-ID is honored only when it is
+// non-empty, the peer is a trusted proxy, AND the value passes
+// isValidRequestID; in every other case a fresh ID is generated. The chosen
+// ID is echoed back in the X-Request-ID response header and stored in the
+// request context for RequestIDFromContext.
+//
+// Format clamp: a compromised reverse proxy (or one that mis-parses an
+// untrusted header) could forward an ID containing newlines, semicolons,
+// or other separator characters. Those would corrupt structured log
+// parsers that assume one record per line / key-value. Restricting to
+// `[A-Za-z0-9._-]{1,64}` covers UUIDs, hex IDs, and trace-context IDs
+// without any sharp edges.
+func requestID(next http.Handler) http.Handler {
+	handle := func(w http.ResponseWriter, r *http.Request) {
+		rid := r.Header.Get("X-Request-ID")
+		honored := rid != "" && isTrustedPeer(r) && isValidRequestID(rid)
+		if !honored {
+			rid = newRequestID()
+		}
+
+		w.Header().Set("X-Request-ID", rid)
+		withID := context.WithValue(r.Context(), requestIDKey, rid)
+		next.ServeHTTP(w, r.WithContext(withID))
+	}
+	return http.HandlerFunc(handle)
+}
+
+// isValidRequestID reports whether s matches `[A-Za-z0-9._-]{1,64}`. It is a
+// single byte-wise scan with no regex compiled on the request path, so any
+// non-ASCII byte is rejected.
+func isValidRequestID(s string) bool {
+	if n := len(s); n < 1 || n > 64 {
+		return false
+	}
+	for _, c := range []byte(s) {
+		upper := 'A' <= c && c <= 'Z'
+		lower := 'a' <= c && c <= 'z'
+		digit := '0' <= c && c <= '9'
+		punct := c == '.' || c == '_' || c == '-'
+		if !(upper || lower || digit || punct) {
+			return false
+		}
+	}
+	return true
+}
+
+// isTrustedPeer reports whether the request's direct peer address falls
+// inside one of the TRUSTED_PROXY_CIDRS networks — we honor a forwarded
+// request-id only from upstreams we already trust for X-Forwarded-For.
+// Otherwise an internet client could spam log files with attacker-chosen
+// IDs. Returns false when the allow-list is empty or the peer address does
+// not parse as an IP.
+func isTrustedPeer(r *http.Request) bool {
+	if len(trustedProxyCIDRs) == 0 {
+		return false
+	}
+
+	// RemoteAddr is normally "host:port"; when it does not split, the raw
+	// value is used as the host.
+	host := r.RemoteAddr
+	if h, _, splitErr := net.SplitHostPort(host); splitErr == nil {
+		host = h
+	}
+
+	peerIP := net.ParseIP(host)
+	if peerIP == nil {
+		return false
+	}
+	for i := range trustedProxyCIDRs {
+		if trustedProxyCIDRs[i].Contains(peerIP) {
+			return true
+		}
+	}
+	return false
+}
+
+func newRequestID() string {
+	var b [16]byte
+	if _, err := rand.Read(b[:]); err != nil {
+		// Fall back to time-based suffix if crypto/rand is unavailable
+		// — extremely unlikely outside of broken environments, but the
+		// ID is for tracing not security, so a deterministic fallback
+		// is preferable to a panic.
+		return "ts-" + time.Now().UTC().Format("20060102T150405.000000000")
+	}
+	return hex.EncodeToString(b[:])
+}
+
 // logging is an HTTP middleware that logs every request with method, path,
 // status code, and duration. Webhook URLs are redacted before being logged
 // because the secret is the only authenticator — leaking it to log
@@ -20,15 +125,58 @@ func logging(next http.Handler) http.Handler {

 		next.ServeHTTP(wrapped, r)

-		slog.Info("http request",
+		fields := []any{
 			"method", r.Method,
 			"path", redactPath(r.URL.Path),
 			"status", wrapped.status,
 			"duration", time.Since(start).String(),
-		)
+		}
+		if rq := redactQuery(r.URL.RawQuery); rq != "" {
+			fields = append(fields, "query", rq)
+		}
+		if rid := RequestIDFromContext(r.Context()); rid != "" {
+			fields = append(fields, "request_id", rid)
+		}
+		slog.Info("http request", fields...)
+
+		// Lightweight per-request counter. Bucket by status class so
+		// the cardinality stays at 5 × #methods regardless of how many
+		// distinct response codes we emit.
+		metrics.HTTPRequestsTotal.Inc(bucketMethod(r.Method), statusClass(wrapped.status))
 	})
 }

+// bucketMethod maps an HTTP method onto a bounded label set for metrics.
+// The nine standard methods pass through unchanged (exact, case-sensitive
+// match); every other token — RFC 7230 allows arbitrary ones — collapses
+// to "other" so a client cannot inflate the metrics map.
+func bucketMethod(m string) string {
+	for _, known := range [...]string{
+		"GET", "POST", "PUT", "PATCH", "DELETE", "HEAD", "OPTIONS", "CONNECT", "TRACE",
+	} {
+		if m == known {
+			return known
+		}
+	}
+	return "other"
+}
+
+// statusClass collapses an HTTP status code into its class label
+// ("1xx" through "5xx"). Codes outside 100–599 yield "other". Bucketing
+// keeps metrics cardinality bounded no matter how many distinct response
+// codes are emitted.
+func statusClass(code int) string {
+	if code < 100 || code >= 600 {
+		return "other"
+	}
+	// code/100 is 1..5 here; shift to a zero-based index.
+	classes := [...]string{"1xx", "2xx", "3xx", "4xx", "5xx"}
+	return classes[code/100-1]
+}
+
 // redactPath strips secrets from URL paths that carry them in segments.
 // Only the canonical /api/webhook/triggers/{secret} surface remains after
 // the hard cutover.
@@ -40,6 +188,45 @@ func redactPath(path string) string {
 	return path
 }

+// redactQueryKeys is the case-insensitive set of query-parameter names whose
+// values are masked before a URL lands in the request log. `token` is used by
+// SSE/EventSource when a custom header can't be set; the rest are
+// defence-in-depth against sensitive values ever appearing in a query string.
+// Keys are stored lowercase; lookups must lowercase the candidate first.
+var redactQueryKeys = map[string]struct{}{
+	"token":         {},
+	"secret":        {},
+	"password":      {},
+	"passwd":        {},
+	"api_key":       {},
+	"apikey":        {},
+	"access_token":  {},
+	"client_secret": {},
+	"sig":           {},
+	"signature":     {},
+}
+
+// redactQuery masks the values of sensitive query parameters (see
+// redactQueryKeys) in a URL's raw query before it lands in the request log.
+//
+// Key matching is case-insensitive and is performed on the percent-DECODED
+// key: query parsing decodes `tok%65n` to `token`, so matching the raw
+// bytes alone would let an encoded key carry its secret into the log. The
+// output keeps the key exactly as the client sent it and replaces only the
+// value with "***". Pairs without '=' and keys with malformed escapes are
+// left untouched, so a malformed URL surfaces naturally.
+func redactQuery(rawQuery string) string {
+	if rawQuery == "" {
+		return ""
+	}
+	parts := strings.Split(rawQuery, "&")
+	for i, p := range parts {
+		eq := strings.IndexByte(p, '=')
+		if eq < 0 {
+			continue
+		}
+		key := strings.ToLower(unescapeQueryKey(p[:eq]))
+		if _, ok := redactQueryKeys[key]; ok {
+			parts[i] = p[:eq+1] + "***"
+		}
+	}
+	return strings.Join(parts, "&")
+}
+
+// unescapeQueryKey decodes %XX escapes and '+' (as space) in a raw query
+// key. A key containing a malformed escape is returned unchanged.
+func unescapeQueryKey(raw string) string {
+	if !strings.ContainsAny(raw, "%+") {
+		return raw
+	}
+	out := make([]byte, 0, len(raw))
+	for i := 0; i < len(raw); i++ {
+		switch c := raw[i]; c {
+		case '+':
+			out = append(out, ' ')
+		case '%':
+			// Need two hex digits after the '%'.
+			if i+2 >= len(raw) {
+				return raw
+			}
+			hi, okHi := queryHexNibble(raw[i+1])
+			lo, okLo := queryHexNibble(raw[i+2])
+			if !okHi || !okLo {
+				return raw
+			}
+			out = append(out, hi<<4|lo)
+			i += 2
+		default:
+			out = append(out, c)
+		}
+	}
+	return string(out)
+}
+
+// queryHexNibble converts one ASCII hex digit to its value; ok is false
+// for any other byte.
+func queryHexNibble(c byte) (v byte, ok bool) {
+	switch {
+	case '0' <= c && c <= '9':
+		return c - '0', true
+	case 'a' <= c && c <= 'f':
+		return c - 'a' + 10, true
+	case 'A' <= c && c <= 'F':
+		return c - 'A' + 10, true
+	}
+	return 0, false
+}
+
 // recovery is an HTTP middleware that catches panics and returns a 500 response.
 func recovery(next http.Handler) http.Handler {
 	return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
@@ -54,16 +241,49 @@ func recovery(next http.Handler) http.Handler {
 }

 // securityHeaders sets standard security headers on all responses.
+//
+// Strict-Transport-Security is emitted only when the request arrived
+// over HTTPS (direct TLS or forwarded). Emitting HSTS over plain HTTP
+// is harmless to compliant browsers but flags as an issue in scanners
+// and confuses some reverse proxies.
+//
+// The CSP keeps `'unsafe-inline'` for now because SvelteKit injects
+// inline boot scripts and styles; removing it requires a nonce-based
+// strategy threaded through the SvelteKit handle hook. Tracked as a
+// follow-up; documented in the security report.
 func securityHeaders(next http.Handler) http.Handler {
 	return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
 		w.Header().Set("X-Content-Type-Options", "nosniff")
 		w.Header().Set("X-Frame-Options", "DENY")
 		w.Header().Set("Referrer-Policy", "strict-origin-when-cross-origin")
-		w.Header().Set("Content-Security-Policy", "default-src 'self'; script-src 'self' 'unsafe-inline'; style-src 'self' 'unsafe-inline'; img-src 'self' data:; connect-src 'self'; font-src 'self'")
+		w.Header().Set("Permissions-Policy", "camera=(), microphone=(), geolocation=(), payment=()")
+		w.Header().Set("Content-Security-Policy",
+			"default-src 'self'; "+
+				"script-src 'self' 'unsafe-inline'; "+
+				"style-src 'self' 'unsafe-inline'; "+
+				"img-src 'self' data:; "+
+				"connect-src 'self'; "+
+				"font-src 'self'; "+
+				"frame-ancestors 'none'; "+
+				"base-uri 'self'; "+
+				"form-action 'self'")
+		if isHTTPS(r) {
+			w.Header().Set("Strict-Transport-Security", "max-age=31536000; includeSubDomains")
+		}
 		next.ServeHTTP(w, r)
 	})
 }

+// isHTTPS reports whether the request arrived over HTTPS: either the
+// connection itself carries TLS state, or the X-Forwarded-Proto header is
+// exactly "https".
+func isHTTPS(r *http.Request) bool {
+	return r.TLS != nil || r.Header.Get("X-Forwarded-Proto") == "https"
+}
+
 // cors is an HTTP middleware that handles CORS for same-origin requests.
 // The frontend is served from the same origin, so cross-origin requests are not expected.
 func cors(next http.Handler) http.Handler {
@@ -164,10 +384,7 @@ func jsonContentType(next http.Handler) http.Handler {
 func rateLimitMiddleware(rl *rateLimiter) func(http.Handler) http.Handler {
 	return func(next http.Handler) http.Handler {
 		return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
-			ip := r.RemoteAddr
-			if fwd := r.Header.Get("X-Forwarded-For"); fwd != "" {
-				ip = fwd
-			}
+			ip := clientIP(r)
 			if !rl.allow(ip) {
 				respondError(w, http.StatusTooManyRequests, "rate limit exceeded")
 				return
@@ -177,6 +394,100 @@ func rateLimitMiddleware(rl *rateLimiter) func(http.Handler) http.Handler {
 	}
 }

+// trustedProxyCIDRs is the parsed allow-list of upstream proxy networks
+// whose X-Forwarded-For header we honor. Set TRUSTED_PROXY_CIDRS to a
+// comma-separated list of CIDRs (e.g. "127.0.0.1/32,10.0.0.0/8") to
+// enable. When unset (the default) X-Forwarded-For is ignored entirely
+// and rate limiting + audit logging use r.RemoteAddr — preventing a
+// remote attacker from spoofing the header to bypass per-IP limiters.
+var trustedProxyCIDRs = parseTrustedProxyCIDRs(os.Getenv("TRUSTED_PROXY_CIDRS"))
+
+// parseTrustedProxyCIDRs parses a comma-separated list of CIDRs or bare
+// IP addresses into networks. Whitespace around entries is ignored and
+// empty entries are skipped. A bare IP becomes a single-host network
+// (/32 for IPv4, /128 for IPv6). Entries that fail to parse are logged at
+// warn level and dropped. An empty or all-whitespace input yields nil.
+func parseTrustedProxyCIDRs(raw string) []*net.IPNet {
+	raw = strings.TrimSpace(raw)
+	if raw == "" {
+		return nil
+	}
+	var nets []*net.IPNet
+	for _, p := range strings.Split(raw, ",") {
+		p = strings.TrimSpace(p)
+		if p == "" {
+			continue
+		}
+		// Allow bare IPs as /32 (IPv4) or /128 (IPv6).
+		if !strings.Contains(p, "/") {
+			if ip := net.ParseIP(p); ip != nil {
+				if ip4 := ip.To4(); ip4 != nil {
+					// Re-render in dotted-quad form before adding the
+					// prefix. An IPv4-mapped IPv6 literal such as
+					// "::ffff:192.0.2.1" also has a non-nil To4(), and
+					// appending "/32" to that text would parse as the
+					// IPv6 network ::/32, which never matches the
+					// intended host.
+					p = ip4.String() + "/32"
+				} else {
+					p += "/128"
+				}
+			}
+		}
+		_, n, err := net.ParseCIDR(p)
+		if err != nil {
+			slog.Warn("ignoring invalid TRUSTED_PROXY_CIDRS entry", "value", p, "error", err)
+			continue
+		}
+		nets = append(nets, n)
+	}
+	return nets
+}
+
+// clientIP returns the per-request "client" address used for rate-limit
+// keying and audit attribution. X-Forwarded-For is honored ONLY when the
+// direct peer (r.RemoteAddr) belongs to a configured trusted-proxy CIDR;
+// otherwise the header is ignored to prevent header-spoofing bypasses.
+//
+// The returned value is r.RemoteAddr with the port stripped, or — when the
+// peer is trusted — the rightmost X-Forwarded-For entry that parses as an
+// IP and is not itself a trusted proxy.
+func clientIP(r *http.Request) string {
+	peer := r.RemoteAddr
+	if host, _, err := net.SplitHostPort(peer); err == nil {
+		peer = host
+	}
+	if len(trustedProxyCIDRs) == 0 {
+		return peer
+	}
+	peerIP := net.ParseIP(peer)
+	if peerIP == nil || !isTrustedProxy(peerIP) {
+		return peer
+	}
+	// Combine every X-Forwarded-For header line, in order, into one list.
+	// Header.Get would return only the FIRST line; if a proxy adds its own
+	// line instead of appending to the client's, the first line is the
+	// client-supplied one and the proxy's entry would never be seen.
+	fwd := strings.Join(r.Header.Values("X-Forwarded-For"), ",")
+	if fwd == "" {
+		return peer
+	}
+	// Walk X-Forwarded-For from the RIGHTMOST entry (the address closest to
+	// us, appended by our trusted peer) leftward, skipping entries that are
+	// themselves trusted proxies, and return the first untrusted address.
+	// The LEFTMOST entry is fully client-controlled — trusting it (as a
+	// naive `fwd[:firstComma]` does) lets an attacker spoof their rate-limit
+	// and audit identity by prepending a forged value, defeating the per-IP
+	// login limiter.
+	parts := strings.Split(fwd, ",")
+	for i := len(parts) - 1; i >= 0; i-- {
+		candidate := strings.TrimSpace(parts[i])
+		ip := net.ParseIP(candidate)
+		if ip == nil {
+			continue
+		}
+		if isTrustedProxy(ip) {
+			continue
+		}
+		return candidate
+	}
+	// Every forwarded entry was a trusted proxy (or unparseable) — fall back
+	// to the direct peer.
+	return peer
+}
+
+// isTrustedProxy reports whether ip is contained in at least one network
+// of the trustedProxyCIDRs allow-list. An empty allow-list matches
+// nothing.
+func isTrustedProxy(ip net.IP) bool {
+	for i := range trustedProxyCIDRs {
+		if trustedProxyCIDRs[i].Contains(ip) {
+			return true
+		}
+	}
+	return false
+}
+
 // statusRecorder wraps http.ResponseWriter to capture the status code.
 type statusRecorder struct {
 	http.ResponseWriter
@@ -4,6 +4,7 @@ import (
 	"context"
 	"log/slog"
 	"sync"
+	"sync/atomic"

 	"github.com/go-chi/chi/v5"

@@ -13,11 +14,13 @@ import (
 	"github.com/alexei/tinyforge/internal/dns"
 	"github.com/alexei/tinyforge/internal/docker"
 	"github.com/alexei/tinyforge/internal/events"
+	"github.com/alexei/tinyforge/internal/keyedmutex"
 	"github.com/alexei/tinyforge/internal/notify"
 	"github.com/alexei/tinyforge/internal/npm"
 	"github.com/alexei/tinyforge/internal/proxy"
 	"github.com/alexei/tinyforge/internal/stale"
 	"github.com/alexei/tinyforge/internal/store"
+	"github.com/alexei/tinyforge/internal/volsnap"
 	"github.com/alexei/tinyforge/internal/webhook"
 	"github.com/alexei/tinyforge/internal/workload/plugin"
 )
@@ -50,17 +53,34 @@ type Server struct {
 	oidcProvider  *auth.OIDCProvider
 	staleScanner  *stale.Scanner

+	// gitopsSync serializes the GitOps sync (read→merge→write) per workload so
+	// two concurrent syncs can't race on source_config (review S5).
+	gitopsSync keyedMutex
+
+	// volRestoreInFlight is a per-workload single-flight guard for volume
+	// snapshot restore: a concurrent restore of the same workload is rejected
+	// fast with 409 (TryLock) rather than queuing behind the deployer lock.
+	volRestoreInFlight keyedmutex.Mutex
+
 	dnsProviderMu        sync.RWMutex
 	dnsProvider          dns.Provider
 	onDNSProviderChanged DNSProviderChangedFunc

 	backupEngine            *backup.Engine
+	snapshotEngine          *volsnap.Engine
 	sseGate                 *sseGate
 	logScanReloader         LogScanReloader
 	dbPath                  string
 	shutdownFunc            func()                                // called after restore to trigger graceful shutdown
 	onBackupSettingsChanged func(enabled bool, intervalHours int) // called when backup settings change
 	onProxyProviderChanged  func(provider proxy.Provider)         // called when proxy provider changes
+
+	// restoreInFlight is a process-wide guard against double-firing
+	// the restore endpoint. A rapid double-click would otherwise
+	// schedule two goroutines racing s.store.Close() and the
+	// candidate-over-live rename. CAS to true at the entry point;
+	// reject the second caller with 409 Conflict.
+	restoreInFlight atomic.Bool
 }

 // NewServer creates a new API Server with all required dependencies.
@@ -111,6 +131,11 @@ func (s *Server) SetBackupEngine(engine *backup.Engine) {
 	s.backupEngine = engine
 }

+// SetSnapshotEngine sets the volume-snapshot engine on the server by
+// storing engine in s.snapshotEngine. The assignment is unsynchronized.
+// NOTE(review): presumably called once during startup wiring, before the
+// router serves requests — confirm against the caller.
+func (s *Server) SetSnapshotEngine(engine *volsnap.Engine) {
+	s.snapshotEngine = engine
+}
+
 // SetDBPath sets the database file path (needed for restore).
 func (s *Server) SetDBPath(path string) {
 	s.dbPath = path
@@ -157,13 +182,32 @@ func (s *Server) SetDNSProviderChangedCallback(fn DNSProviderChangedFunc) {

 // initOIDCProvider creates an OIDC provider from settings. Errors are logged, not fatal.
 func (s *Server) initOIDCProvider(ctx context.Context, as store.AuthSettings) {
-	// Decrypt the OIDC client secret if it's encrypted.
+	// Decrypt the OIDC client secret. The prior code did a try-decrypt
+	// and silently treated failures as plaintext — under a rotated key
+	// that sent ciphertext upstream to the OP. Now:
+	//   - If the value carries the tf1: envelope → fail loud on
+	//     decrypt failure (rotated key / corrupted ciphertext).
+	//   - If the value is unprefixed (legacy ciphertext from v0 or true
+	//     plaintext from an old migration) → try decrypt; on failure
+	//     accept as plaintext (the only safe legacy interpretation).
 	clientSecret := as.OIDCClientSecret
 	if clientSecret != "" {
-		if decrypted, err := crypto.Decrypt(s.encKey, clientSecret); err == nil {
+		switch {
+		case crypto.HasEnvelope(clientSecret):
+			decrypted, err := crypto.Decrypt(s.encKey, clientSecret)
+			if err != nil {
+				slog.Error("OIDC client secret could not be decrypted — refusing to initialize provider",
+					"error", err,
+					"hint", "rotate ENCRYPTION_KEY back, OR re-save OIDC settings to re-encrypt with the current key")
+				return
+			}
 			clientSecret = decrypted
+		default:
+			// Legacy v0 value: try decrypt; on failure assume plaintext.
+			if decrypted, err := crypto.Decrypt(s.encKey, clientSecret); err == nil {
+				clientSecret = decrypted
+			}
 		}
-		// If decrypt fails, assume it's already plaintext (migration scenario).
 	}
 	provider, err := auth.NewOIDCProvider(ctx, auth.OIDCConfig{
 		IssuerURL:    as.OIDCIssuerURL,
@@ -183,12 +227,29 @@ func (s *Server) initOIDCProvider(ctx context.Context, as store.AuthSettings) {
 func (s *Server) Router() chi.Router {
 	r := chi.NewRouter()

-	// Global middleware.
+	// Global middleware. requestID runs first so every downstream log
+	// line (and the access log emitted by `logging`) carries the same
+	// correlation id, plus the response carries it back on the
+	// X-Request-ID header for the operator to grep across services.
+	r.Use(requestID)
 	r.Use(recovery)
 	r.Use(securityHeaders)
 	r.Use(logging)
 	r.Use(cors)

+	// Unauthenticated health probes — mounted at the root so container
+	// orchestrators / load balancers can hit them without knowing about
+	// the /api prefix. /livez intentionally does no work and stays
+	// unbounded; /readyz pings the DB and is rate-limited to keep an
+	// unauthenticated flood from serialising behind SQLite's single
+	// writer connection (busy-timeout = 5s) and log-amplifying every
+	// request via the structured access log. The 10-per-minute budget
+	// is the existing rateLimiter default — generous for k8s readiness
+	// probes (typically every 5-10s), restrictive for an attacker.
+	r.Get("/livez", s.livez)
+	readyLimiter := newRateLimiter()
+	r.With(rateLimitMiddleware(readyLimiter)).Get("/readyz", s.readyz)
+
 	loginLimiter := newRateLimiter()
 	webhookLimiter := newRateLimiter()

@@ -232,6 +293,7 @@ func (s *Server) Router() chi.Router {
 				r.Post("/discovery/git/branches", s.listGitBranches)
 				r.Post("/discovery/git/tree", s.listGitTree)
 				r.Get("/discovery/image/conflicts", s.listImageConflicts)
+				r.Post("/discovery/image/inspect", s.inspectImageMetadata)
 			})

 			// Read-only endpoints (any authenticated user).
@@ -245,16 +307,18 @@ func (s *Server) Router() chi.Router {
 			r.Get("/events/log/stats", s.getEventLogStats)
 			r.Get("/registries", s.listRegistries)
 			r.Route("/registries/{id}", func(r chi.Router) {
+				// All registry probes are admin-gated. The /tags and
+				// /images endpoints used to be open to any authenticated
+				// user, but they make outbound requests using the
+				// admin-encrypted registry token — a viewer could
+				// effectively drive arbitrary requests against a private
+				// registry under admin credentials.
+				r.Use(auth.AdminOnly)
 				r.Get("/tags/*", s.listRegistryTags)
 				r.Get("/images", s.listRegistryImages)
-
-				// Admin-only registry mutations.
-				r.Group(func(r chi.Router) {
-					r.Use(auth.AdminOnly)
-					r.Put("/", s.updateRegistry)
-					r.Delete("/", s.deleteRegistry)
-					r.Post("/test", s.testRegistry)
-				})
+				r.Put("/", s.updateRegistry)
+				r.Delete("/", s.deleteRegistry)
+				r.Post("/test", s.testRegistry)
 			})
 			r.Get("/settings", s.getSettings)
 			r.Get("/settings/npm-certificates", s.listNpmCertificates)
@@ -282,11 +346,44 @@ func (s *Server) Router() chi.Router {
 				r.With(auth.AdminOnly).Post("/start", s.startPluginWorkload)
 				r.With(auth.AdminOnly).Delete("/", s.deletePluginWorkload)

+				// Deploy ledger + rollback. The history feed is read-only
+				// (any authenticated user); rollback is a redeploy, so it is
+				// admin-gated like /deploy.
+				r.Get("/deploys", s.listWorkloadDeploys)
+				r.With(auth.AdminOnly).Post("/rollback", s.rollbackWorkload)
+
+				// GitOps config-as-code (dockerfile/static). The status read
+				// (incl. live drift) is open to any authenticated user; enable/
+				// disable and sync mutate config, so they are admin-gated.
+				r.Get("/gitops", s.getWorkloadGitOps)
+				r.With(auth.AdminOnly).Put("/gitops", s.setWorkloadGitOps)
+				r.With(auth.AdminOnly).Post("/gitops/sync", s.syncWorkloadGitOps)
+
+				// Volume snapshots (admin-only). Capture/list a workload's
+				// host-bind data volumes; {sid}-scoped download/delete live
+				// in the global admin group alongside backups.
+				r.With(auth.AdminOnly).Get("/snapshots", s.listWorkloadSnapshots)
+				r.With(auth.AdminOnly).Get("/snapshotable", s.getWorkloadSnapshotable)
+				r.With(auth.AdminOnly).Post("/snapshots", s.createWorkloadSnapshot)
+				// Restore overwrites live volume data and restarts the app — the
+				// most destructive workload action. Admin-gated + X-Confirm-Restore
+				// header (CSRF) + per-workload single-flight, mirroring DB restore.
+				r.With(auth.AdminOnly).Post("/snapshots/{sid}/restore", s.restoreWorkloadSnapshot)
+
 				// Runtime view: per-source persisted state + storage usage.
 				// Read-only; safe for any authenticated user.
 				r.Get("/runtime-state", s.getWorkloadRuntimeState)
 				r.Get("/storage", s.getWorkloadStorage)

+				// Per-workload metrics history (CPU/memory time-series),
+				// aggregated across the workload's containers. Read-only.
+				r.Get("/stats/history", s.getWorkloadStatsHistory)
+
+				// Per-workload activity / deploy timeline (read-only). Scoped
+				// to this workload's event-log rows; the global feed lives at
+				// /events/log.
+				r.Get("/events", s.listWorkloadEvents)
+
 				// Per-workload env vars. Listing open to authenticated readers;
 				// mutations admin-gated. Encrypted values are write-only after store.
 				r.Get("/env", s.listWorkloadEnv)
@@ -312,6 +409,15 @@ func (s *Server) Router() chi.Router {
 				// of /triggers/{id}/bindings keyed on the workload side.
 				r.Get("/triggers", s.listBindingsForWorkload)
 				r.With(auth.AdminOnly).Post("/triggers", s.bindTriggerToWorkload)
+
+				// Per-workload notification routes — multi-destination
+				// fan-out (Slack channel + Discord webhook + ...). When
+				// zero rows are configured the dispatcher falls back to
+				// the legacy single-URL columns on the workload row.
+				r.Get("/notifications", s.listWorkloadNotifications)
+				r.With(auth.AdminOnly).Post("/notifications", s.createWorkloadNotification)
+				r.With(auth.AdminOnly).Put("/notifications/{nid}", s.updateWorkloadNotification)
+				r.With(auth.AdminOnly).Delete("/notifications/{nid}", s.deleteWorkloadNotification)
 			})

 			// Global container index, joined to workload + app names.
@@ -370,6 +476,26 @@ func (s *Server) Router() chi.Router {
 				r.Post("/log-scan-rules/{id}/test", s.testLogScanRule)
 			})

+			// Metric-alert rules.
+			r.Get("/metric-alert-rules", s.listMetricAlertRules)
+			r.Get("/metric-alert-rules/{id}", s.getMetricAlertRule)
+			r.Group(func(r chi.Router) {
+				r.Use(auth.AdminOnly)
+				r.Post("/metric-alert-rules", s.createMetricAlertRule)
+				r.Patch("/metric-alert-rules/{id}", s.updateMetricAlertRule)
+				r.Delete("/metric-alert-rules/{id}", s.deleteMetricAlertRule)
+			})
+
+			// Shared secrets (env vars shared across workloads by scope).
+			r.Get("/shared-secrets", s.listSharedSecrets)
+			r.Get("/shared-secrets/{id}", s.getSharedSecret)
+			r.Group(func(r chi.Router) {
+				r.Use(auth.AdminOnly)
+				r.Post("/shared-secrets", s.createSharedSecret)
+				r.Patch("/shared-secrets/{id}", s.updateSharedSecret)
+				r.Delete("/shared-secrets/{id}", s.deleteSharedSecret)
+			})
+
 			// System resources (read-only).
 			r.Get("/system/stats", s.getSystemStats)
 			r.Get("/system/stats/history", s.getSystemStatsHistory)
@@ -379,6 +505,12 @@ func (s *Server) Router() chi.Router {
 			r.Group(func(r chi.Router) {
 				r.Use(auth.AdminOnly)

+				// Prometheus-format metrics export. Admin-only so the
+				// counter cardinality cannot be enumerated by a low-trust
+				// viewer to map internal endpoints / sources / outcomes.
+				// Scrape with bearer auth from your Prometheus job.
+				r.Get("/metrics", s.metricsExport)
+
 				// Config export (reveals registry/global details).
 				r.Get("/config/export", s.exportConfig)

@@ -414,6 +546,7 @@ func (s *Server) Router() chi.Router {

 				// Docker management.
 				r.Post("/docker/prune-images", s.pruneImages)
+				r.Post("/docker/prune-build-cache", s.pruneBuildCache)

 				// NPM connection test.
 				r.Post("/settings/npm/test", s.testNpmConnection)
@@ -431,6 +564,11 @@ func (s *Server) Router() chi.Router {
 				r.Get("/backups/{id}/download", s.downloadBackup)
 				r.Delete("/backups/{id}", s.deleteBackup)
 				r.Post("/backups/{id}/restore", s.restoreBackup)
+
+				// Volume-snapshot download/delete (workload-scoped capture +
+				// list live under /workloads/{id}/snapshots).
+				r.Get("/snapshots/{sid}/download", s.downloadSnapshot)
+				r.Delete("/snapshots/{sid}", s.deleteSnapshot)
 			})
 		})
 	})
@@ -0,0 +1,272 @@
+package api
+
+import (
+	"errors"
+	"log/slog"
+	"net/http"
+	"strings"
+
+	"github.com/go-chi/chi/v5"
+
+	"github.com/alexei/tinyforge/internal/crypto"
+	"github.com/alexei/tinyforge/internal/store"
+)
+
+// sharedSecretRow is the JSON shape returned to clients. The secret value is
+// NEVER returned — once stored it is write-only (mirroring workload_env). The
+// has_value flag lets the UI show whether a value is set without exposing it;
+// to rotate, the operator submits a new value.
+//
+// The struct has no field for the value itself, so the value cannot leak
+// through JSON encoding of this type.
+type sharedSecretRow struct {
+	ID          string `json:"id"`
+	Name        string `json:"name"`
+	HasValue    bool   `json:"has_value"` // true when the stored value is non-empty
+	Encrypted   bool   `json:"encrypted"`
+	Scope       string `json:"scope"`  // "global" or "app"
+	AppID       string `json:"app_id"` // set for app-scoped secrets
+	Description string `json:"description"`
+	Enabled     bool   `json:"enabled"`
+	CreatedAt   string `json:"created_at"` // copied verbatim from the store row
+	UpdatedAt   string `json:"updated_at"` // copied verbatim from the store row
+}
+
+// toSharedSecretRow projects a stored secret onto its client-facing shape.
+// The value is reduced to a has-value boolean; every other field is
+// copied across unchanged.
+func toSharedSecretRow(sec store.SharedSecret) sharedSecretRow {
+	var row sharedSecretRow
+	row.ID = sec.ID
+	row.Name = sec.Name
+	row.HasValue = len(sec.Value) > 0
+	row.Encrypted = sec.Encrypted
+	row.Scope = sec.Scope
+	row.AppID = sec.AppID
+	row.Description = sec.Description
+	row.Enabled = sec.Enabled
+	row.CreatedAt = sec.CreatedAt
+	row.UpdatedAt = sec.UpdatedAt
+	return row
+}
+
+// listSharedSecrets handles GET /api/shared-secrets. It responds with a
+// JSON array of redacted rows (an empty array when there are none), or
+// 500 when the store lookup fails.
+func (s *Server) listSharedSecrets(w http.ResponseWriter, r *http.Request) {
+	secrets, err := s.store.ListSharedSecrets()
+	if err != nil {
+		respondError(w, http.StatusInternalServerError, "list shared secrets")
+		return
+	}
+	// Sized up front; a zero-length make still encodes as [] rather than null.
+	payload := make([]sharedSecretRow, len(secrets))
+	for i := range secrets {
+		payload[i] = toSharedSecretRow(secrets[i])
+	}
+	respondJSON(w, http.StatusOK, payload)
+}
+
+// getSharedSecret handles GET /api/shared-secrets/{id}. It responds with
+// the redacted row, 404 when the store reports ErrNotFound, or 500 for
+// any other store error.
+func (s *Server) getSharedSecret(w http.ResponseWriter, r *http.Request) {
+	sec, err := s.store.GetSharedSecret(chi.URLParam(r, "id"))
+	switch {
+	case err == nil:
+		respondJSON(w, http.StatusOK, toSharedSecretRow(sec))
+	case errors.Is(err, store.ErrNotFound):
+		respondNotFound(w, "shared secret")
+	default:
+		respondError(w, http.StatusInternalServerError, "get shared secret")
+	}
+}
+
+// createSharedSecretRequest is the POST body. Encrypted defaults to true
+// when omitted; with Encrypted=true a non-empty value is encrypted at rest
+// with the global key before it ever reaches the store. An empty value is
+// stored as-is (no value set) regardless of the flag. Encrypted and
+// Enabled are pointers so an omitted field can be told apart from an
+// explicit false.
+type createSharedSecretRequest struct {
+	Name        string `json:"name"`
+	Value       string `json:"value"`
+	Encrypted   *bool  `json:"encrypted"` // defaults true
+	Scope       string `json:"scope"`     // global | app
+	AppID       string `json:"app_id"`    // required when scope == app
+	Description string `json:"description"`
+	Enabled     *bool  `json:"enabled"` // defaults true
+}
+
+// createSharedSecret handles the POST that creates a shared secret.
+//
+// The name is trimmed and must be a valid env key; scope must be 'global'
+// or 'app', and 'app' requires a non-blank app_id. Encrypted and Enabled
+// default to true when omitted. The response is the redacted row (201).
+// Validation failures produce 400, a store uniqueness violation 409, and
+// encryption or other store failures 500.
+func (s *Server) createSharedSecret(w http.ResponseWriter, r *http.Request) {
+	var req createSharedSecretRequest
+	if !decodeJSONStrict(w, r, &req) {
+		return
+	}
+	req.Name = strings.TrimSpace(req.Name)
+	if !validEnvKey(req.Name) {
+		respondError(w, http.StatusBadRequest, "name must be a valid env key [A-Za-z_][A-Za-z0-9_]*")
+		return
+	}
+	req.AppID = strings.TrimSpace(req.AppID)
+	if msg := validateSharedSecretScope(req.Scope, req.AppID); msg != "" {
+		respondError(w, http.StatusBadRequest, msg)
+		return
+	}
+	// app_id only has meaning for app-scoped secrets. Drop a stray value on
+	// a global secret so the row never carries a global scope paired with
+	// an app id.
+	// NOTE(review): the store-side scope/app_id invariant is not visible
+	// here — confirm it expects an empty app_id for global rows.
+	if req.Scope == store.SharedSecretScopeGlobal {
+		req.AppID = ""
+	}
+
+	encrypted := true
+	if req.Encrypted != nil {
+		encrypted = *req.Encrypted
+	}
+	enabled := true
+	if req.Enabled != nil {
+		enabled = *req.Enabled
+	}
+
+	value, err := s.encryptSecretValue(req.Value, encrypted)
+	if err != nil {
+		respondError(w, http.StatusInternalServerError, "encrypt value")
+		return
+	}
+
+	sec, err := s.store.CreateSharedSecret(store.SharedSecret{
+		Name:        req.Name,
+		Value:       value,
+		Encrypted:   encrypted,
+		Scope:       req.Scope,
+		AppID:       req.AppID,
+		Description: req.Description,
+		Enabled:     enabled,
+	})
+	if err != nil {
+		if errors.Is(err, store.ErrUnique) {
+			respondError(w, http.StatusConflict, "a shared secret with this scope and name already exists")
+			return
+		}
+		respondError(w, http.StatusInternalServerError, "create shared secret")
+		return
+	}
+	respondJSON(w, http.StatusCreated, toSharedSecretRow(sec))
+}
+
+// updateSharedSecretRequest is the PATCH body. Every field is optional; nil
+// means "leave unchanged". A nil Value preserves the stored ciphertext (so a
+// metadata-only edit can't accidentally blank a secret); a non-nil Value
+// rotates it (re-encrypted under the effective Encrypted flag).
+//
+// All fields are pointers so that an omitted field (nil) can be told apart
+// from an explicit zero value such as "" or false.
+type updateSharedSecretRequest struct {
+	Name        *string `json:"name"`
+	Value       *string `json:"value"`
+	Encrypted   *bool   `json:"encrypted"`
+	Scope       *string `json:"scope"`
+	AppID       *string `json:"app_id"`
+	Description *string `json:"description"`
+	Enabled     *bool   `json:"enabled"`
+}
+
+// updateSharedSecret handles the PATCH that edits a shared secret.
+//
+// The stored row is loaded first, then each non-nil request field is
+// overlaid on it and the merged result is validated and written back.
+// The response is the redacted row (200). Unknown ids produce 404,
+// validation failures 400, a store uniqueness violation 409, and
+// encryption or other store failures 500.
+func (s *Server) updateSharedSecret(w http.ResponseWriter, r *http.Request) {
+	id := chi.URLParam(r, "id")
+	existing, err := s.store.GetSharedSecret(id)
+	if err != nil {
+		if errors.Is(err, store.ErrNotFound) {
+			respondNotFound(w, "shared secret")
+			return
+		}
+		respondError(w, http.StatusInternalServerError, "get shared secret")
+		return
+	}
+
+	var req updateSharedSecretRequest
+	if !decodeJSONStrict(w, r, &req) {
+		return
+	}
+
+	merged := existing
+	if req.Name != nil {
+		merged.Name = strings.TrimSpace(*req.Name)
+		if !validEnvKey(merged.Name) {
+			respondError(w, http.StatusBadRequest, "name must be a valid env key [A-Za-z_][A-Za-z0-9_]*")
+			return
+		}
+	}
+	if req.Encrypted != nil {
+		merged.Encrypted = *req.Encrypted
+	}
+	if req.Scope != nil {
+		merged.Scope = *req.Scope
+	}
+	if req.AppID != nil {
+		merged.AppID = strings.TrimSpace(*req.AppID)
+	}
+	if req.Description != nil {
+		merged.Description = *req.Description
+	}
+	if req.Enabled != nil {
+		merged.Enabled = *req.Enabled
+	}
+	if msg := validateSharedSecretScope(merged.Scope, merged.AppID); msg != "" {
+		respondError(w, http.StatusBadRequest, msg)
+		return
+	}
+	// app_id only has meaning for app-scoped secrets. Without this, a PATCH
+	// that switches scope from 'app' to 'global' keeps the previous app_id
+	// on the merged row unless the caller also blanks it explicitly.
+	// NOTE(review): the store-side scope/app_id invariant is not visible
+	// here — confirm it expects an empty app_id for global rows.
+	if merged.Scope == store.SharedSecretScopeGlobal {
+		merged.AppID = ""
+	}
+
+	// Value handling: only (re)encrypt when the caller supplied a new value.
+	// Otherwise keep the stored ciphertext untouched — but if the Encrypted
+	// flag flipped without a new value we cannot transcode the opaque stored
+	// bytes, so reject that ambiguous request rather than corrupting the row.
+	if req.Value != nil {
+		v, encErr := s.encryptSecretValue(*req.Value, merged.Encrypted)
+		if encErr != nil {
+			respondError(w, http.StatusInternalServerError, "encrypt value")
+			return
+		}
+		merged.Value = v
+	} else if req.Encrypted != nil && *req.Encrypted != existing.Encrypted {
+		respondError(w, http.StatusBadRequest, "changing 'encrypted' requires resubmitting 'value'")
+		return
+	}
+
+	sec, err := s.store.UpdateSharedSecret(merged)
+	if err != nil {
+		if errors.Is(err, store.ErrNotFound) {
+			respondNotFound(w, "shared secret")
+			return
+		}
+		if errors.Is(err, store.ErrUnique) {
+			respondError(w, http.StatusConflict, "a shared secret with this scope and name already exists")
+			return
+		}
+		respondError(w, http.StatusInternalServerError, "update shared secret")
+		return
+	}
+	respondJSON(w, http.StatusOK, toSharedSecretRow(sec))
+}
+
+// deleteSharedSecret handles the DELETE for a single shared secret. On
+// success it echoes the removed id as {"deleted": id}; the store's
+// ErrNotFound maps to 404 and any other store error to 500.
+func (s *Server) deleteSharedSecret(w http.ResponseWriter, r *http.Request) {
+	id := chi.URLParam(r, "id")
+	err := s.store.DeleteSharedSecret(id)
+	switch {
+	case err == nil:
+		respondJSON(w, http.StatusOK, map[string]string{"deleted": id})
+	case errors.Is(err, store.ErrNotFound):
+		respondNotFound(w, "shared secret")
+	default:
+		respondError(w, http.StatusInternalServerError, "delete shared secret")
+	}
+}
+
+// encryptSecretValue returns value encrypted under s.encKey when encrypted
+// is true and value is non-empty. In every other case the input comes
+// back untouched, so an empty value stays empty (no value set) whatever
+// the flag says. An encryption failure is logged here and returned with
+// an empty string.
+func (s *Server) encryptSecretValue(value string, encrypted bool) (string, error) {
+	if encrypted && value != "" {
+		sealed, err := crypto.Encrypt(s.encKey, value)
+		if err != nil {
+			slog.Error("encrypt shared secret value", "error", err)
+			return "", err
+		}
+		return sealed, nil
+	}
+	return value, nil
+}
+
+// validateSharedSecretScope checks a scope / app_id pairing and returns
+// the 400 message to send when it is invalid, or "" when it is
+// acceptable. Global scope is always accepted; app scope needs an app_id
+// that is non-blank after trimming; any other scope is rejected. Mirrors
+// the store-side invariant so the API rejects with a clear message before
+// hitting the store.
+func validateSharedSecretScope(scope, appID string) string {
+	if scope == store.SharedSecretScopeGlobal {
+		return ""
+	}
+	if scope != store.SharedSecretScopeApp {
+		return "scope must be 'global' or 'app'"
+	}
+	if strings.TrimSpace(appID) == "" {
+		return "app_id is required when scope is 'app'"
+	}
+	return ""
+}
@@ -32,9 +32,26 @@ func (s *Server) streamEvents(w http.ResponseWriter, r *http.Request) {
 	w.WriteHeader(http.StatusOK)
 	flusher.Flush()

-	// Subscribe to instance status, deploy status, and persistent event log events.
+	// Build logs are high-volume: a single verbose `docker build` can emit
+	// thousands of lines. Streaming them to EVERY connection would flood each
+	// subscriber's bounded bus buffer and evict status/log events for ALL
+	// clients. So build logs are delivered ONLY to connections that opt in
+	// with ?workload_id=<id>, and only for that workload. Connections without
+	// the param (e.g. the global dashboard) never receive build-log frames.
+	buildLogWorkloadID := r.URL.Query().Get("workload_id")
 	sub := s.eventBus.Subscribe(func(evt events.Event) bool {
-		return evt.Type == events.EventInstanceStatus || evt.Type == events.EventDeployStatus || evt.Type == events.EventLog
+		switch evt.Type {
+		case events.EventInstanceStatus, events.EventDeployStatus, events.EventLog:
+			return true
+		case events.EventBuildLog:
+			if buildLogWorkloadID == "" {
+				return false
+			}
+			p, ok := evt.Payload.(events.BuildLogPayload)
+			return ok && p.WorkloadID == buildLogWorkloadID
+		default:
+			return false
+		}
 	})
 	defer s.eventBus.Unsubscribe(sub)

@@ -1,12 +1,15 @@
 package api

 import (
+	"errors"
 	"log/slog"
 	"net/http"
 	"sort"
 	"strconv"
 	"time"

+	"github.com/go-chi/chi/v5"
+
 	"github.com/alexei/tinyforge/internal/auth"
 	"github.com/alexei/tinyforge/internal/store"
 )
@@ -85,6 +88,76 @@ func (s *Server) getSystemStatsHistory(w http.ResponseWriter, r *http.Request) {
 	respondJSON(w, http.StatusOK, samples)
 }

+// workloadStatsPoint is one aggregated time bucket for a workload's metrics
+// graph: every container the workload owns is summed at each timestamp so a
+// multi-container (compose) workload reads as a single line. MemoryLimit is
+// the max across containers — the effective ceiling — though the UI plots
+// absolute MiB because the limit is often 0 (unlimited).
+//
+// Field by field, as produced by aggregateWorkloadStats:
+//   - TS is the sample timestamp shared by every container folded into the
+//     bucket (copied verbatim from the stored sample).
+//   - CPUPercent and MemoryUsage are sums over those containers.
+//   - MemoryLimit is the largest limit seen among them.
+type workloadStatsPoint struct {
+	TS          int64   `json:"ts"`
+	CPUPercent  float64 `json:"cpu_percent"`
+	MemoryUsage int64   `json:"memory_usage"`
+	MemoryLimit int64   `json:"memory_limit"`
+}
+
+// getWorkloadStatsHistory handles GET /api/workloads/{id}/stats/history?window=1h.
+// Read-only and open to any authenticated user (mirrors the per-workload
+// events/runtime-state feeds). Always returns a (possibly empty) array — never
+// 503 — because samples come from SQLite, which is available even when the
+// Docker daemon is down or stats collection is disabled. Unknown workload id
+// 404s; a known workload with no samples yet returns []. A store failure on
+// either lookup yields a generic 500 with the detail kept in the server log.
+func (s *Server) getWorkloadStatsHistory(w http.ResponseWriter, r *http.Request) {
+	id := chi.URLParam(r, "id")
+	if id == "" {
+		respondError(w, http.StatusBadRequest, "workload id is required")
+		return
+	}
+	if _, err := s.store.GetWorkloadByID(id); err != nil {
+		if errors.Is(err, store.ErrNotFound) {
+			respondNotFound(w, "workload")
+			return
+		}
+		// Anything other than "not found" is a store failure. Log it here,
+		// because the client only ever sees the generic message below.
+		slog.Error("failed to get workload", "workload", id, "error", err)
+		respondError(w, http.StatusInternalServerError, "get workload")
+		return
+	}
+
+	samples, err := s.store.ListContainerStatsSamplesByWorkload(id, sinceTimestamp(parseWindow(r)))
+	if err != nil {
+		slog.Error("failed to list workload stats samples", "workload", id, "error", err)
+		respondError(w, http.StatusInternalServerError, "failed to list samples")
+		return
+	}
+
+	respondJSON(w, http.StatusOK, aggregateWorkloadStats(samples))
+}
+
+// aggregateWorkloadStats collapses per-container samples into one point per
+// timestamp. Within a bucket CPU% and memory usage accumulate, while the
+// memory limit keeps the largest value seen. Buckets are emitted in the order
+// their timestamp first appears, so ts-ascending input yields ts-ascending
+// output with no sort. The result is never nil, so it encodes as a JSON [].
+func aggregateWorkloadStats(samples []store.ContainerStatsSample) []workloadStatsPoint {
+	buckets := []workloadStatsPoint{}
+	posByTS := map[int64]int{} // timestamp → position of its bucket
+	for _, sample := range samples {
+		pos, seen := posByTS[sample.TS]
+		if !seen {
+			// First sample for this timestamp opens a new bucket.
+			posByTS[sample.TS] = len(buckets)
+			buckets = append(buckets, workloadStatsPoint{
+				TS:          sample.TS,
+				CPUPercent:  sample.CPUPercent,
+				MemoryUsage: sample.MemoryUsage,
+				MemoryLimit: sample.MemoryLimit,
+			})
+			continue
+		}
+		bucket := &buckets[pos]
+		bucket.CPUPercent += sample.CPUPercent
+		bucket.MemoryUsage += sample.MemoryUsage
+		if bucket.MemoryLimit < sample.MemoryLimit {
+			bucket.MemoryLimit = sample.MemoryLimit
+		}
+	}
+	return buckets
+}
+
 // listTopContainers handles GET /api/system/stats/top?limit=5&by=cpu.
 // Returns the top-N most recent samples across containers, sorted by CPU or
 // memory. Container IDs are stripped for non-admins so a low-privilege viewer
@@ -0,0 +1,64 @@
+package api
+
+import (
+	"testing"
+
+	"github.com/alexei/tinyforge/internal/store"
+)
+
+// TestAggregateWorkloadStats_SumsPerTimestamp feeds two samples sharing ts=100
+// plus one at ts=200 and expects two buckets: the first holding the summed
+// CPU/memory and the larger memory limit, the second holding the lone sample.
+func TestAggregateWorkloadStats_SumsPerTimestamp(t *testing.T) {
+	input := []store.ContainerStatsSample{
+		{TS: 100, CPUPercent: 10, MemoryUsage: 1000, MemoryLimit: 4000},
+		{TS: 100, CPUPercent: 5, MemoryUsage: 500, MemoryLimit: 8000},
+		{TS: 200, CPUPercent: 20, MemoryUsage: 2000, MemoryLimit: 4000},
+	}
+	got := aggregateWorkloadStats(input)
+	if n := len(got); n != 2 {
+		t.Fatalf("expected 2 buckets, got %d", n)
+	}
+
+	first, second := got[0], got[1]
+	firstOK := first.TS == 100 && first.CPUPercent == 15 && first.MemoryUsage == 1500
+	if !firstOK {
+		t.Fatalf("ts=100 bucket wrong: %+v", first)
+	}
+	// The limit is not summed: the bucket keeps the highest one.
+	if first.MemoryLimit != 8000 {
+		t.Fatalf("expected max memory limit 8000, got %d", first.MemoryLimit)
+	}
+	secondOK := second.TS == 200 && second.CPUPercent == 20
+	if !secondOK {
+		t.Fatalf("ts=200 bucket wrong: %+v", second)
+	}
+}
+
+// TestAggregateWorkloadStats_Empty pins the nil-input contract: the result is
+// an allocated, zero-length slice so the JSON encoding is [] rather than null.
+func TestAggregateWorkloadStats_Empty(t *testing.T) {
+	got := aggregateWorkloadStats(nil)
+	switch {
+	case got == nil:
+		t.Fatal("expected non-nil empty slice for clean JSON []")
+	case len(got) != 0:
+		t.Fatalf("expected 0 points, got %d", len(got))
+	}
+}
+
+// TestWorkloadStatsHistory_UnknownWorkload404 requests the stats history of a
+// workload id that was never created and expects a 404.
+func TestWorkloadStatsHistory_UnknownWorkload404(t *testing.T) {
+	e := newAPITestEnv(t)
+	resp := e.do(t, "GET", "/api/workloads/nope/stats/history", nil)
+	// Only the status is inspected, so the body must be released explicitly
+	// or the connection stays open.
+	defer resp.Body.Close()
+	if resp.StatusCode != 404 {
+		t.Fatalf("expected 404 for unknown workload, got %d", resp.StatusCode)
+	}
+}
+
+// TestWorkloadStatsHistory_KnownWorkloadEmpty creates a workload that has no
+// stats samples and expects the history endpoint to answer 200 with an empty
+// series inside an error-free envelope.
+func TestWorkloadStatsHistory_KnownWorkloadEmpty(t *testing.T) {
+	e := newAPITestEnv(t)
+	workloadID := createImageWorkload(t, e, "metrics-app")
+	historyPath := "/api/workloads/" + workloadID + "/stats/history"
+
+	resp := e.do(t, "GET", historyPath, nil)
+	if status := resp.StatusCode; status != 200 {
+		t.Fatalf("expected 200, got %d", status)
+	}
+
+	var series []workloadStatsPoint
+	errMsg := decodeEnvelope(t, resp, &series)
+	if errMsg != "" {
+		t.Fatalf("envelope error: %q", errMsg)
+	}
+	if count := len(series); count != 0 {
+		t.Fatalf("expected empty series for app with no samples, got %d", count)
+	}
+}
@@ -89,12 +89,16 @@ func toTriggerViewWithCount(row store.TriggerWithBindingCount) triggerView {
 // triggerRequest is the create/update body. Config is opaque per kind.
 // Auto-generates a webhook secret on create when WebhookEnabled is true;
 // the secret is exposed only via the /webhook subresource.
+//
+// WebhookRequireSignature is a *bool so we can distinguish "field omitted
+// by client" (nil → apply secure default of true when webhook is enabled)
+// from an explicit opt-out (false → respected).
 type triggerRequest struct {
 	Kind                    string          `json:"kind"`
 	Name                    string          `json:"name"`
 	Config                  json.RawMessage `json:"config"`
 	WebhookEnabled          bool            `json:"webhook_enabled"`
-	WebhookRequireSignature bool            `json:"webhook_require_signature"`
+	WebhookRequireSignature *bool           `json:"webhook_require_signature,omitempty"`
 }

 // Same per-blob caps used on the workload pluginWorkloadRequest path —
@@ -134,12 +138,26 @@ func (s *Server) getTrigger(w http.ResponseWriter, r *http.Request) {
 // buildTriggerFromRequest assembles a store.Trigger ready for insert.
 // Centralized so the standalone create endpoint and the inline-bind
 // endpoint cannot drift on secret-generation defaults.
+//
+// SECURITY: a new trigger with webhook enabled defaults to require_signature
+// = true. Operators can opt out at create time for receivers that do not
+// support HMAC, but the safer default avoids the "freshly-created trigger
+// accepts unsigned posts to its URL" footgun.
 func buildTriggerFromRequest(req triggerRequest) store.Trigger {
+	// Secure default: if webhook is enabled and the operator did NOT
+	// explicitly set require_signature, force it on. Explicit false is
+	// preserved (legacy receivers without HMAC support still work).
+	requireSig := false
+	if req.WebhookRequireSignature != nil {
+		requireSig = *req.WebhookRequireSignature
+	} else if req.WebhookEnabled {
+		requireSig = true
+	}
 	t := store.Trigger{
 		Kind:                    req.Kind,
 		Name:                    strings.TrimSpace(req.Name),
 		Config:                  string(req.Config),
-		WebhookRequireSignature: req.WebhookRequireSignature,
+		WebhookRequireSignature: requireSig,
 	}
 	if req.WebhookEnabled {
 		t.WebhookSecret = generateWebhookSecret()
@@ -199,7 +217,13 @@ func (s *Server) updateTrigger(w http.ResponseWriter, r *http.Request) {
 	if len(req.Config) > 0 {
 		existing.Config = string(req.Config)
 	}
-	existing.WebhookRequireSignature = req.WebhookRequireSignature
+	if req.WebhookRequireSignature != nil {
+		existing.WebhookRequireSignature = *req.WebhookRequireSignature
+	} else if req.WebhookEnabled && !existing.WebhookRequireSignature {
+		// Re-enabling webhook without specifying the signature flag —
+		// take the secure default.
+		existing.WebhookRequireSignature = true
+	}
 	wasEnabled := existing.WebhookSecret != ""
 	if req.WebhookEnabled && !wasEnabled {
 		// false→true transition: rotate both secrets so re-enabling
@@ -0,0 +1,243 @@
+package api
+
+import (
+	"encoding/json"
+	"errors"
+	"io"
+	"log/slog"
+	"net/http"
+	"os"
+	"path/filepath"
+
+	"github.com/go-chi/chi/v5"
+
+	"github.com/alexei/tinyforge/internal/store"
+	"github.com/alexei/tinyforge/internal/volsnap"
+)
+
+// listWorkloadSnapshots handles GET /api/workloads/{id}/snapshots. It answers
+// 503 while no snapshot engine is wired, a generic 500 (detail logged only)
+// when the engine cannot list, and otherwise 200 with whatever List returned.
+func (s *Server) listWorkloadSnapshots(w http.ResponseWriter, r *http.Request) {
+	if s.snapshotEngine == nil {
+		respondError(w, http.StatusServiceUnavailable, "snapshot engine not initialized")
+		return
+	}
+	workloadID := chi.URLParam(r, "id")
+	switch snapshots, listErr := s.snapshotEngine.List(workloadID); {
+	case listErr != nil:
+		slog.Error("snapshots: list", "workload", workloadID, "error", listErr)
+		respondError(w, http.StatusInternalServerError, "internal server error")
+	default:
+		respondJSON(w, http.StatusOK, snapshots)
+	}
+}
+
+// snapshotableVolume is the sanitized view of a volume in the snapshotable
+// response — it omits the resolved host path so internal layout is not leaked.
+// getWorkloadSnapshotable fills Target, Scope and Source by copying the
+// same-named fields of each volume reference returned by
+// volsnap.SnapshotableVolumes; nothing else from the reference is exposed.
+type snapshotableVolume struct {
+	Target string `json:"target"`
+	Scope  string `json:"scope"`
+	Source string `json:"source"`
+}
+
+// getWorkloadSnapshotable handles GET /api/workloads/{id}/snapshotable. It
+// tells the UI which volumes can be snapshotted and which are skipped (and
+// why), so users are never misled about coverage.
+//
+// Responses: 503 while no snapshot engine is wired; 404 when the workload id
+// is unknown; a generic 500 (detail logged only) when the store or the volume
+// enumeration fails; otherwise 200 with {"volumes": [...], "skipped": [...]},
+// both always arrays.
+func (s *Server) getWorkloadSnapshotable(w http.ResponseWriter, r *http.Request) {
+	if s.snapshotEngine == nil {
+		respondError(w, http.StatusServiceUnavailable, "snapshot engine not initialized")
+		return
+	}
+	id := chi.URLParam(r, "id")
+	workload, err := s.store.GetWorkloadByID(id)
+	if err != nil {
+		// Only a genuine miss is a 404. A failing store must not present
+		// itself as "workload not found".
+		if errors.Is(err, store.ErrNotFound) {
+			respondError(w, http.StatusNotFound, "workload not found")
+			return
+		}
+		slog.Error("snapshots: load workload", "workload", id, "error", err)
+		respondError(w, http.StatusInternalServerError, "internal server error")
+		return
+	}
+	settings, err := s.store.GetSettings()
+	if err != nil {
+		slog.Error("snapshots: load settings", "workload", id, "error", err)
+		respondError(w, http.StatusInternalServerError, "internal server error")
+		return
+	}
+	refs, skipped, err := volsnap.SnapshotableVolumes(s.store, workload, settings)
+	if err != nil {
+		slog.Error("snapshots: enumerate", "workload", id, "error", err)
+		respondError(w, http.StatusInternalServerError, "internal server error")
+		return
+	}
+
+	// Project each reference onto the sanitized DTO.
+	volumes := make([]snapshotableVolume, 0, len(refs))
+	for _, ref := range refs {
+		volumes = append(volumes, snapshotableVolume{Target: ref.Target, Scope: ref.Scope, Source: ref.Source})
+	}
+	// A nil slice would encode as JSON null; the UI expects an array.
+	if skipped == nil {
+		skipped = []volsnap.SkippedVolume{}
+	}
+	respondJSON(w, http.StatusOK, map[string]any{
+		"volumes": volumes,
+		"skipped": skipped,
+	})
+}
+
+// createWorkloadSnapshot handles POST /api/workloads/{id}/snapshots.
+//
+// The optional JSON body may carry a "label"; an absent or empty body is
+// accepted. Responses: 503 while no snapshot engine is wired; 404 for an
+// unknown workload; 400 for malformed JSON or when the engine reports
+// volsnap.ErrNoSnapshotData; a generic 500 (detail logged only) for any other
+// failure; 201 with the created snapshot on success.
+func (s *Server) createWorkloadSnapshot(w http.ResponseWriter, r *http.Request) {
+	if s.snapshotEngine == nil {
+		respondError(w, http.StatusServiceUnavailable, "snapshot engine not initialized")
+		return
+	}
+	id := chi.URLParam(r, "id")
+	workload, err := s.store.GetWorkloadByID(id)
+	if err != nil {
+		// Only a genuine miss is a 404. A failing store must not present
+		// itself as "workload not found".
+		if errors.Is(err, store.ErrNotFound) {
+			respondError(w, http.StatusNotFound, "workload not found")
+			return
+		}
+		slog.Error("snapshots: load workload", "workload", id, "error", err)
+		respondError(w, http.StatusInternalServerError, "internal server error")
+		return
+	}
+	settings, err := s.store.GetSettings()
+	if err != nil {
+		slog.Error("snapshots: load settings", "workload", id, "error", err)
+		respondError(w, http.StatusInternalServerError, "internal server error")
+		return
+	}
+
+	var body struct {
+		Label string `json:"label"`
+	}
+	// ContentLength is 0 for a bodyless request. io.EOF covers a body that
+	// turns out to be empty once read. Reads are capped at 1 MiB.
+	if r.ContentLength != 0 {
+		if err := json.NewDecoder(io.LimitReader(r.Body, 1<<20)).Decode(&body); err != nil && !errors.Is(err, io.EOF) {
+			respondError(w, http.StatusBadRequest, "invalid JSON body")
+			return
+		}
+	}
+
+	snap, err := s.snapshotEngine.Create(workload, settings, body.Label)
+	if err != nil {
+		// "no snapshottable volume data" is client-actionable (400, safe to
+		// echo). Any other error is server-side: log the detail, return a
+		// generic 500 so internal paths / DB text never reach the client.
+		if errors.Is(err, volsnap.ErrNoSnapshotData) {
+			respondError(w, http.StatusBadRequest, err.Error())
+			return
+		}
+		slog.Error("snapshots: create", "workload", id, "error", err)
+		respondError(w, http.StatusInternalServerError, "internal server error")
+		return
+	}
+	respondJSON(w, http.StatusCreated, snap)
+}
+
+// deleteSnapshot handles DELETE /api/snapshots/{sid}.
+//
+// Responses: 503 while no snapshot engine is wired; 404 when the engine
+// reports store.ErrNotFound; 500 (detail logged only) for any other failure;
+// 200 with {"status":"deleted"} on success.
+func (s *Server) deleteSnapshot(w http.ResponseWriter, r *http.Request) {
+	if s.snapshotEngine == nil {
+		respondError(w, http.StatusServiceUnavailable, "snapshot engine not initialized")
+		return
+	}
+	sid := chi.URLParam(r, "sid")
+	if err := s.snapshotEngine.Delete(sid); err != nil {
+		if errors.Is(err, store.ErrNotFound) {
+			respondError(w, http.StatusNotFound, "snapshot not found")
+			return
+		}
+		// The client gets a generic message, so the cause has to be logged
+		// here or it is lost.
+		slog.Error("snapshots: delete", "snapshot", sid, "error", err)
+		respondError(w, http.StatusInternalServerError, "failed to delete snapshot")
+		return
+	}
+	respondJSON(w, http.StatusOK, map[string]string{"status": "deleted"})
+}
+
+// restoreWorkloadSnapshot handles POST /api/workloads/{id}/snapshots/{sid}/restore.
+//
+// This is the most destructive workload action: it overwrites the app's live
+// volume data with the snapshot and recreates its containers. It is guarded like
+// the DB restore — admin-only, an X-Confirm-Restore header that must echo the
+// snapshot id (defeats CSRF form/img posts, which can't set custom headers), and
+// a per-workload single-flight so a double-click can't stack two restores. All
+// the dangerous lock/stop/swap/redeploy logic lives in Engine.Restore; this
+// handler only validates and delegates.
+//
+// Responses: 503 without an engine; 400 for a missing/mismatched confirm
+// header, a snapshot owned by another workload, or a non-image workload; 404
+// for an unknown snapshot; 409 while another restore of the same workload is
+// running; 500 (detail logged only) when the engine fails; 200 on success.
+func (s *Server) restoreWorkloadSnapshot(w http.ResponseWriter, r *http.Request) {
+	if s.snapshotEngine == nil {
+		respondError(w, http.StatusServiceUnavailable, "snapshot engine not initialized")
+		return
+	}
+	id := chi.URLParam(r, "id")
+	sid := chi.URLParam(r, "sid")
+
+	// The header must carry a non-empty echo of the snapshot id. Without the
+	// explicit empty check, an absent header would "match" an empty sid and
+	// slip past the confirmation.
+	if confirm := r.Header.Get("X-Confirm-Restore"); confirm == "" || confirm != sid {
+		respondError(w, http.StatusBadRequest,
+			"missing or mismatched X-Confirm-Restore header (must equal snapshot id)")
+		return
+	}
+
+	// Up-front validation for precise client errors (Engine.Restore re-checks
+	// ownership + source kind under the lock).
+	snap, err := s.snapshotEngine.Get(sid)
+	if err != nil {
+		respondError(w, http.StatusNotFound, "snapshot not found")
+		return
+	}
+	if snap.WorkloadID != id {
+		respondError(w, http.StatusBadRequest, "snapshot does not belong to this workload")
+		return
+	}
+	// loadWorkload writes its own error response when it reports !ok.
+	row, ok := s.loadWorkload(w, id)
+	if !ok {
+		return
+	}
+	if row.SourceKind != "image" {
+		respondError(w, http.StatusBadRequest, "restore is only supported for image-source workloads")
+		return
+	}
+
+	// Per-workload single-flight: reject a concurrent restore of the SAME
+	// workload with 409 rather than queuing it behind the deployer lock.
+	release, ok := s.volRestoreInFlight.TryLock(id)
+	if !ok {
+		respondError(w, http.StatusConflict, "a restore is already in progress for this workload")
+		return
+	}
+	defer release()
+
+	if err := s.snapshotEngine.Restore(r.Context(), sid, id); err != nil {
+		// Raw error (which can carry resolved host paths) stays in the log; the
+		// client gets a generic message.
+		slog.Error("snapshots: restore failed", "workload", id, "snapshot", sid, "error", err)
+		respondError(w, http.StatusInternalServerError, "restore failed; see server logs")
+		return
+	}
+	respondJSON(w, http.StatusOK, map[string]any{
+		"status":      "restored",
+		"workload_id": id,
+		"snapshot_id": sid,
+	})
+}
+
+// downloadSnapshot handles GET /api/snapshots/{sid}/download, streaming the
+// tar.gz archive. The resolved path is containment-checked against the
+// snapshot directory.
+//
+// Responses: 503 without an engine; 404 for an unknown snapshot or an archive
+// missing from disk; 403 when the engine refuses to resolve the path; 500
+// (detail logged only) when the archive exists but cannot be opened or
+// stat'ed; otherwise the file is served via http.ServeContent as an
+// application/gzip attachment.
+func (s *Server) downloadSnapshot(w http.ResponseWriter, r *http.Request) {
+	if s.snapshotEngine == nil {
+		respondError(w, http.StatusServiceUnavailable, "snapshot engine not initialized")
+		return
+	}
+	sid := chi.URLParam(r, "sid")
+	snap, err := s.snapshotEngine.Get(sid)
+	if err != nil {
+		respondError(w, http.StatusNotFound, "snapshot not found")
+		return
+	}
+	path, err := s.snapshotEngine.FilePath(snap)
+	if err != nil {
+		// A refused path is worth a trace for operators even though the
+		// client only learns "access denied".
+		slog.Warn("snapshots: download path rejected", "snapshot", sid, "error", err)
+		respondError(w, http.StatusForbidden, "access denied")
+		return
+	}
+	f, err := os.Open(path)
+	if err != nil {
+		// Only a missing file is a 404. Permission or I/O errors are server
+		// faults and must not be reported as "not found".
+		if errors.Is(err, os.ErrNotExist) {
+			respondError(w, http.StatusNotFound, "snapshot file not found on disk")
+			return
+		}
+		slog.Error("snapshots: open archive", "snapshot", sid, "error", err)
+		respondError(w, http.StatusInternalServerError, "failed to read snapshot file")
+		return
+	}
+	defer f.Close()
+	stat, err := f.Stat()
+	if err != nil {
+		slog.Error("snapshots: stat archive", "snapshot", sid, "error", err)
+		respondError(w, http.StatusInternalServerError, "failed to read snapshot file")
+		return
+	}
+	// Base() drops any directory component so only the bare file name is
+	// offered to the client.
+	name := filepath.Base(snap.Filename)
+	w.Header().Set("Content-Type", "application/gzip")
+	w.Header().Set("Content-Disposition", "attachment; filename=\""+name+"\"")
+	http.ServeContent(w, r, name, stat.ModTime(), f)
+}
@@ -0,0 +1,385 @@
+package api
+
+import (
+	"context"
+	"io"
+	"net/http"
+	"net/http/httptest"
+	"os"
+	"path/filepath"
+	"sync"
+	"testing"
+	"time"
+
+	"github.com/alexei/tinyforge/internal/auth"
+	"github.com/alexei/tinyforge/internal/store"
+	"github.com/alexei/tinyforge/internal/volsnap"
+	"github.com/alexei/tinyforge/internal/webhook"
+)
+
+// newSnapshotEnv builds an API test env with the volume-snapshot engine wired
+// (the shared newAPITestEnv does not wire it). Snapshot archives live in a
+// per-test temp dir handed to volsnap.New; the returned string is a second
+// temp dir stored as settings.BaseVolumePath, where host-bind volume
+// directories resolve. The store and HTTP server are closed via t.Cleanup.
+func newSnapshotEnv(t *testing.T) (*apiTestEnv, string) {
+	t.Helper()
+	st, err := store.New(":memory:")
+	if err != nil {
+		t.Fatalf("create store: %v", err)
+	}
+	t.Cleanup(func() { st.Close() })
+
+	encKey := [32]byte{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}
+	dispatcher := &fakeAPIDispatcher{}
+	wh := webhook.NewHandler(st)
+	wh.SetPluginDispatcher(dispatcher)
+	srv := NewServer(st, nil, nil, nil, dispatcher, nil, wh, nil, encKey)
+
+	snapEng, err := volsnap.New(st, t.TempDir())
+	if err != nil {
+		t.Fatalf("snapshot engine: %v", err)
+	}
+	srv.SetSnapshotEngine(snapEng)
+
+	httpsrv := httptest.NewServer(srv.Router())
+	t.Cleanup(httpsrv.Close)
+
+	la := auth.NewLocalAuth(encKey)
+	tok, err := la.GenerateToken(auth.Claims{UserID: "u-admin", Username: "admin", Role: "admin"})
+	if err != nil {
+		t.Fatalf("mint token: %v", err)
+	}
+
+	baseVol := t.TempDir()
+	// Fail fast on a settings read error instead of writing back a
+	// zero-valued settings row.
+	settings, err := st.GetSettings()
+	if err != nil {
+		t.Fatalf("get settings: %v", err)
+	}
+	settings.BaseVolumePath = baseVol
+	if err := st.UpdateSettings(settings); err != nil {
+		t.Fatalf("update settings: %v", err)
+	}
+
+	return &apiTestEnv{srv: httpsrv, store: st, dispatcher: dispatcher, adminToken: tok.Token, encKey: encKey, snapEngine: snapEng}, baseVol
+}
+
+// doRestore POSTs to the restore endpoint of workloadID/sid using the env's
+// admin bearer token. A non-empty confirm is sent as X-Confirm-Restore; an
+// empty one leaves the header off. The caller owns (and must close) the
+// returned response body. Request failures abort the test via t.Fatalf.
+func (e *apiTestEnv) doRestore(t *testing.T, workloadID, sid, confirm string) *http.Response {
+	t.Helper()
+	target := e.srv.URL + "/api/workloads/" + workloadID + "/snapshots/" + sid + "/restore"
+	req, reqErr := http.NewRequest(http.MethodPost, target, nil)
+	if reqErr != nil {
+		t.Fatalf("new request: %v", reqErr)
+	}
+	if confirm != "" {
+		req.Header.Set("X-Confirm-Restore", confirm)
+	}
+	req.Header.Set("Authorization", "Bearer "+e.adminToken)
+
+	resp, doErr := http.DefaultClient.Do(req)
+	if doErr != nil {
+		t.Fatalf("do request: %v", doErr)
+	}
+	return resp
+}
+
+// okLifecycle is a volsnap.Lifecycle stub that always succeeds, used by the
+// HTTP-layer happy-path tests; the real restore behavior is exercised by the
+// volsnap engine tests. tag is the string StopContainers hands back.
+type okLifecycle struct {
+	tag string
+}
+
+// Lock takes no lock and returns a no-op unlock.
+func (l *okLifecycle) Lock(string) func() {
+	return func() {}
+}
+
+// StopContainers stops nothing and reports the configured tag.
+func (l *okLifecycle) StopContainers(context.Context, string) (string, error) {
+	return l.tag, nil
+}
+
+// Redeploy does nothing and reports success.
+func (l *okLifecycle) Redeploy(context.Context, store.Workload, string) error {
+	return nil
+}
+
+// TestRestoreSnapshot_RequiresConfirmHeader checks that a restore without the
+// X-Confirm-Restore header, or with one that does not equal the snapshot id,
+// is rejected with 400.
+func TestRestoreSnapshot_RequiresConfirmHeader(t *testing.T) {
+	e, _ := newSnapshotEnv(t)
+	// Seed failures must abort: with empty ids the 400s below would be
+	// produced for the wrong reason and the test would pass vacuously.
+	w, err := e.store.CreateWorkload(store.Workload{Name: "a", Kind: "project", SourceKind: "image", SourceConfig: `{"image":"x","port":80}`})
+	if err != nil {
+		t.Fatalf("create workload: %v", err)
+	}
+	snap, err := e.store.CreateVolumeSnapshot(store.VolumeSnapshot{WorkloadID: w.ID, Filename: "f.tar.gz", Manifest: "[]"})
+	if err != nil {
+		t.Fatalf("create snapshot: %v", err)
+	}
+
+	// Missing header → 400.
+	resp := e.doRestore(t, w.ID, snap.ID, "")
+	if resp.StatusCode != http.StatusBadRequest {
+		t.Fatalf("missing header status = %d, want 400", resp.StatusCode)
+	}
+	resp.Body.Close()
+	// Mismatched header → 400.
+	resp = e.doRestore(t, w.ID, snap.ID, "not-the-sid")
+	if resp.StatusCode != http.StatusBadRequest {
+		t.Fatalf("mismatched header status = %d, want 400", resp.StatusCode)
+	}
+	resp.Body.Close()
+}
+
+// TestRestoreSnapshot_WrongWorkload restores a snapshot through the URL of a
+// different workload id and expects 400: the handler compares the snapshot's
+// owner with the path id before it looks the workload up.
+func TestRestoreSnapshot_WrongWorkload(t *testing.T) {
+	e, _ := newSnapshotEnv(t)
+	// Abort on seed failure so the assertion really exercises the
+	// ownership check.
+	w, err := e.store.CreateWorkload(store.Workload{Name: "a", Kind: "project", SourceKind: "image", SourceConfig: `{"image":"x","port":80}`})
+	if err != nil {
+		t.Fatalf("create workload: %v", err)
+	}
+	snap, err := e.store.CreateVolumeSnapshot(store.VolumeSnapshot{WorkloadID: w.ID, Filename: "f.tar.gz", Manifest: "[]"})
+	if err != nil {
+		t.Fatalf("create snapshot: %v", err)
+	}
+
+	resp := e.doRestore(t, "some-other-workload", snap.ID, snap.ID)
+	if resp.StatusCode != http.StatusBadRequest {
+		t.Fatalf("cross-workload restore status = %d, want 400", resp.StatusCode)
+	}
+	resp.Body.Close()
+}
+
+// TestRestoreSnapshot_NonImageWorkload expects 400 when the snapshot belongs
+// to a workload whose SourceKind is not "image" (here "static").
+func TestRestoreSnapshot_NonImageWorkload(t *testing.T) {
+	e, _ := newSnapshotEnv(t)
+	// Abort on seed failure so the 400 comes from the source-kind check
+	// rather than from a missing row.
+	w, err := e.store.CreateWorkload(store.Workload{Name: "site", Kind: "project", SourceKind: "static", SourceConfig: `{}`})
+	if err != nil {
+		t.Fatalf("create workload: %v", err)
+	}
+	snap, err := e.store.CreateVolumeSnapshot(store.VolumeSnapshot{WorkloadID: w.ID, Filename: "f.tar.gz", Manifest: "[]"})
+	if err != nil {
+		t.Fatalf("create snapshot: %v", err)
+	}
+
+	resp := e.doRestore(t, w.ID, snap.ID, snap.ID)
+	if resp.StatusCode != http.StatusBadRequest {
+		t.Fatalf("non-image restore status = %d, want 400", resp.StatusCode)
+	}
+	resp.Body.Close()
+}
+
+// TestRestoreSnapshot_NotFound restores a snapshot id that was never created
+// (with a matching confirm header) against a real workload and expects 404.
+func TestRestoreSnapshot_NotFound(t *testing.T) {
+	e, _ := newSnapshotEnv(t)
+	w, err := e.store.CreateWorkload(store.Workload{Name: "a", Kind: "project", SourceKind: "image", SourceConfig: `{"image":"x","port":80}`})
+	if err != nil {
+		t.Fatalf("create workload: %v", err)
+	}
+
+	resp := e.doRestore(t, w.ID, "missing-sid", "missing-sid")
+	if resp.StatusCode != http.StatusNotFound {
+		t.Fatalf("unknown snapshot status = %d, want 404", resp.StatusCode)
+	}
+	resp.Body.Close()
+}
+
+// TestRestoreSnapshot_HappyPath snapshots a project-scoped volume holding
+// "ORIGINAL", overwrites the live file with "CHANGED", restores through the
+// HTTP endpoint (with a no-op lifecycle) and expects the file to read
+// "ORIGINAL" again.
+func TestRestoreSnapshot_HappyPath(t *testing.T) {
+	e, baseVol := newSnapshotEnv(t)
+	e.snapEngine.SetLifecycle(&okLifecycle{tag: "v1"})
+
+	w, err := e.store.CreateWorkload(store.Workload{
+		Name: "data-app", Kind: "project", SourceKind: "image",
+		SourceConfig: `{"image":"reg/app","port":80,"volumes":[{"source":"data","target":"/data","scope":"project"}]}`,
+	})
+	if err != nil {
+		t.Fatalf("create workload: %v", err)
+	}
+	if _, err := e.store.SetWorkloadVolume(store.WorkloadVolume{WorkloadID: w.ID, Target: "/data", Source: "data", Scope: "project"}); err != nil {
+		t.Fatalf("set volume: %v", err)
+	}
+	// The host directory is <baseVol>/<name>-<first 8 chars of id>/<source>.
+	id8 := w.ID
+	if len(id8) > 8 {
+		id8 = id8[:8]
+	}
+	hostDir := filepath.Join(baseVol, "data-app-"+id8, "data")
+	if err := os.MkdirAll(hostDir, 0o755); err != nil {
+		t.Fatal(err)
+	}
+	payload := filepath.Join(hostDir, "payload.txt")
+	if err := os.WriteFile(payload, []byte("ORIGINAL"), 0o644); err != nil {
+		t.Fatal(err)
+	}
+	settings, err := e.store.GetSettings()
+	if err != nil {
+		t.Fatalf("get settings: %v", err)
+	}
+	snap, err := e.snapEngine.Create(w, settings, "base")
+	if err != nil {
+		t.Fatalf("create snapshot: %v", err)
+	}
+	// Drift the live data, then restore.
+	if err := os.WriteFile(payload, []byte("CHANGED"), 0o644); err != nil {
+		t.Fatal(err)
+	}
+
+	resp := e.doRestore(t, w.ID, snap.ID, snap.ID)
+	if resp.StatusCode != http.StatusOK {
+		body, _ := io.ReadAll(resp.Body)
+		resp.Body.Close()
+		t.Fatalf("restore status = %d, body=%s", resp.StatusCode, body)
+	}
+	resp.Body.Close()
+	// Report a read failure as such rather than as a content mismatch
+	// against an empty string.
+	got, err := os.ReadFile(payload)
+	if err != nil {
+		t.Fatalf("read restored payload: %v", err)
+	}
+	if string(got) != "ORIGINAL" {
+		t.Errorf("payload.txt = %q, want ORIGINAL (restored)", got)
+	}
+}
+
+// blockingLifecycle is a volsnap.Lifecycle stub whose Lock parks the caller
+// until the release channel is closed. The first Lock call closes entered,
+// letting a test know one restore is in flight before it fires a second and
+// asserts the 409.
+type blockingLifecycle struct {
+	entered chan struct{}
+	release chan struct{}
+	once    sync.Once
+}
+
+// Lock announces entry (once), waits for release, then returns a no-op unlock.
+func (l *blockingLifecycle) Lock(string) func() {
+	announce := func() { close(l.entered) }
+	l.once.Do(announce)
+	<-l.release
+	return func() {}
+}
+
+// StopContainers stops nothing and reports an empty tag.
+func (l *blockingLifecycle) StopContainers(context.Context, string) (string, error) {
+	return "", nil
+}
+
+// Redeploy does nothing and reports success.
+func (l *blockingLifecycle) Redeploy(context.Context, store.Workload, string) error {
+	return nil
+}
+
+// seedRestorable creates an image workload with a project volume + live data and
+// a captured snapshot, returning the workload and snapshot ids. Any setup
+// failure aborts the calling test.
+func seedRestorable(t *testing.T, e *apiTestEnv, baseVol string) (workloadID, snapshotID string) {
+	t.Helper()
+	w, err := e.store.CreateWorkload(store.Workload{
+		Name: "sf-app", Kind: "project", SourceKind: "image",
+		SourceConfig: `{"image":"reg/app","port":80,"volumes":[{"source":"data","target":"/data","scope":"project"}]}`,
+	})
+	if err != nil {
+		t.Fatalf("create workload: %v", err)
+	}
+	// The host directory is <baseVol>/<name>-<first 8 chars of id>/<source>.
+	id8 := w.ID
+	if len(id8) > 8 {
+		id8 = id8[:8]
+	}
+	hostDir := filepath.Join(baseVol, "sf-app-"+id8, "data")
+	if err := os.MkdirAll(hostDir, 0o755); err != nil {
+		t.Fatal(err)
+	}
+	if err := os.WriteFile(filepath.Join(hostDir, "f.txt"), []byte("data"), 0o644); err != nil {
+		t.Fatal(err)
+	}
+	// Surface a settings read failure directly instead of letting it show
+	// up later as a confusing snapshot error.
+	settings, err := e.store.GetSettings()
+	if err != nil {
+		t.Fatalf("get settings: %v", err)
+	}
+	snap, err := e.snapEngine.Create(w, settings, "base")
+	if err != nil {
+		t.Fatalf("create snapshot: %v", err)
+	}
+	return w.ID, snap.ID
+}
+
+func TestRestoreSnapshot_SingleFlight409(t *testing.T) {
+	e, baseVol := newSnapshotEnv(t)
+	wid, sid := seedRestorable(t, e, baseVol)
+	bl := &blockingLifecycle{entered: make(chan struct{}), release: make(chan struct{})}
+	e.snapEngine.SetLifecycle(bl)
+
+	// Restore #1: passes validation, takes the single-flight, then blocks inside
+	// the engine's Lock.
+	go func() {
+		resp := e.doRestore(t, wid, sid, sid)
+		resp.Body.Close()
+	}()
+
+	select {
+	case <-bl.entered:
+	case <-time.After(3 * time.Second):
+		t.Fatal("first restore never reached the lifecycle lock")
+	}
+
+	// Restore #2 for the same workload must be rejected fast with 409.
+	resp := e.doRestore(t, wid, sid, sid)
+	got := resp.StatusCode
+	resp.Body.Close()
+	close(bl.release) // let #1 finish
+	if got != http.StatusConflict {
+		t.Fatalf("concurrent restore status = %d, want 409", got)
+	}
+}
+
+// TestVolumeSnapshots_EndToEnd walks the snapshot HTTP surface for one
+// workload: snapshotable listing → create → list → download → delete → list.
+func TestVolumeSnapshots_EndToEnd(t *testing.T) {
+	e, baseVol := newSnapshotEnv(t)
+
+	w, err := e.store.CreateWorkload(store.Workload{
+		Name:         "data-app",
+		Kind:         "project",
+		SourceKind:   "image",
+		SourceConfig: `{"image":"registry.example.com/owner/app","port":8080}`,
+	})
+	if err != nil {
+		t.Fatalf("create workload: %v", err)
+	}
+	if _, err := e.store.SetWorkloadVolume(store.WorkloadVolume{
+		WorkloadID: w.ID, Target: "/data", Source: "data", Scope: "project",
+	}); err != nil {
+		t.Fatalf("set volume: %v", err)
+	}
+
+	// Materialize the resolved host-bind dir with a file so there is data to
+	// capture. Layout mirrors ResolveWorkloadPath for project scope:
+	// <baseVol>/<name>-<id8>/<source>.
+	id8 := w.ID
+	if len(id8) > 8 {
+		id8 = id8[:8]
+	}
+	hostDir := filepath.Join(baseVol, "data-app-"+id8, "data")
+	if err := os.MkdirAll(hostDir, 0o755); err != nil {
+		t.Fatal(err)
+	}
+	if err := os.WriteFile(filepath.Join(hostDir, "payload.txt"), []byte("important"), 0o644); err != nil {
+		t.Fatal(err)
+	}
+
+	// snapshotable lists the one host-bind volume.
+	resp := e.do(t, http.MethodGet, "/api/workloads/"+w.ID+"/snapshotable", nil)
+	if resp.StatusCode != http.StatusOK {
+		t.Fatalf("snapshotable status = %d", resp.StatusCode)
+	}
+	var snapable struct {
+		Volumes []map[string]string `json:"volumes"`
+		Skipped []map[string]string `json:"skipped"`
+	}
+	decodeEnvelope(t, resp, &snapable)
+	if len(snapable.Volumes) != 1 || snapable.Volumes[0]["target"] != "/data" {
+		t.Fatalf("expected 1 snapshotable volume /data, got %+v", snapable)
+	}
+
+	// Create a snapshot.
+	resp = e.do(t, http.MethodPost, "/api/workloads/"+w.ID+"/snapshots", map[string]string{"label": "before upgrade"})
+	if resp.StatusCode != http.StatusCreated {
+		t.Fatalf("create snapshot status = %d", resp.StatusCode)
+	}
+	var snap store.VolumeSnapshot
+	decodeEnvelope(t, resp, &snap)
+	if snap.ID == "" || snap.SizeBytes == 0 || snap.Label != "before upgrade" {
+		t.Fatalf("unexpected snapshot: %+v", snap)
+	}
+
+	// It appears in the list.
+	resp = e.do(t, http.MethodGet, "/api/workloads/"+w.ID+"/snapshots", nil)
+	var list []store.VolumeSnapshot
+	decodeEnvelope(t, resp, &list)
+	if len(list) != 1 || list[0].ID != snap.ID {
+		t.Fatalf("expected 1 snapshot in list, got %+v", list)
+	}
+
+	// Download streams a non-empty gzip archive (not the JSON envelope).
+	resp = e.do(t, http.MethodGet, "/api/snapshots/"+snap.ID+"/download", nil)
+	if resp.StatusCode != http.StatusOK {
+		t.Fatalf("download status = %d", resp.StatusCode)
+	}
+	if ct := resp.Header.Get("Content-Type"); ct != "application/gzip" {
+		t.Errorf("download content-type = %q, want application/gzip", ct)
+	}
+	// A truncated or failed stream must not pass as a short but valid body.
+	data, readErr := io.ReadAll(resp.Body)
+	resp.Body.Close()
+	if readErr != nil {
+		t.Fatalf("read download body: %v", readErr)
+	}
+	if len(data) == 0 {
+		t.Error("download body is empty")
+	}
+
+	// Delete removes it.
+	resp = e.do(t, http.MethodDelete, "/api/snapshots/"+snap.ID, nil)
+	if resp.StatusCode != http.StatusOK {
+		t.Fatalf("delete status = %d", resp.StatusCode)
+	}
+	// Only the status is inspected here, so close the body before resp is
+	// reassigned.
+	resp.Body.Close()
+	resp = e.do(t, http.MethodGet, "/api/workloads/"+w.ID+"/snapshots", nil)
+	var after []store.VolumeSnapshot
+	decodeEnvelope(t, resp, &after)
+	if len(after) != 0 {
+		t.Fatalf("expected 0 snapshots after delete, got %d", len(after))
+	}
+}
+
+// TestCreateSnapshot_NoVolumeData_Returns400 asks for a snapshot of an image
+// workload that declares no volumes and expects the create endpoint to
+// answer 400.
+func TestCreateSnapshot_NoVolumeData_Returns400(t *testing.T) {
+	e, _ := newSnapshotEnv(t)
+	spec := store.Workload{
+		Name:         "no-vol-app",
+		Kind:         "project",
+		SourceKind:   "image",
+		SourceConfig: `{"image":"x","port":80}`,
+	}
+	w, err := e.store.CreateWorkload(spec)
+	if err != nil {
+		t.Fatalf("create workload: %v", err)
+	}
+
+	endpoint := "/api/workloads/" + w.ID + "/snapshots"
+	resp := e.do(t, http.MethodPost, endpoint, nil)
+	if status := resp.StatusCode; status != http.StatusBadRequest {
+		t.Fatalf("expected 400 for an app with no snapshottable volumes, got %d", status)
+	}
+	resp.Body.Close()
+}
+
+func TestSnapshotEndpoints_RequireWorkload(t *testing.T) {
+	e, _ := newSnapshotEnv(t)
+	// snapshotable on an unknown workload → 404.
+	resp := e.do(t, http.MethodGet, "/api/workloads/does-not-exist/snapshotable", nil)
+	if resp.StatusCode != http.StatusNotFound {
+		t.Fatalf("snapshotable unknown workload = %d, want 404", resp.StatusCode)
+	}
+	resp.Body.Close()
+}
@@ -13,18 +13,29 @@ import (
 	"github.com/alexei/tinyforge/internal/auth"
 	"github.com/alexei/tinyforge/internal/store"
 	"github.com/alexei/tinyforge/internal/workload/plugin"
+	"github.com/alexei/tinyforge/internal/workload/preview"
 )

 // chainNode is the lightweight shape returned by /chain — we deliberately
 // don't return full plugin.Workload values for ancestor/descendant rows
 // because the secret fields don't belong in a chain-traversal response.
+//
+// IsPreview / PreviewBranch surface branch-preview children to the UI so it
+// can render them in a dedicated "Preview environments" panel rather than as
+// undistinguished stage children. They are computed against the chain's
+// `self` workload via preview.IsPreviewChild — the canonical "this child is a
+// branch preview" test that reverses the MaterializeForBranch naming formula.
+// Both are zero-valued (false / "") for the parent and self nodes and for
+// operator-created stage children.
 type chainNode struct {
-	ID         string `json:"id"`
-	Name       string `json:"name"`
-	SourceKind string `json:"source_kind"`
-	TriggerKind string `json:"trigger_kind"`
-	CreatedAt  string `json:"created_at"`
-	UpdatedAt  string `json:"updated_at"`
+	ID            string `json:"id"`
+	Name          string `json:"name"`
+	SourceKind    string `json:"source_kind"`
+	TriggerKind   string `json:"trigger_kind"`
+	IsPreview     bool   `json:"is_preview"`
+	PreviewBranch string `json:"preview_branch,omitempty"`
+	CreatedAt     string `json:"created_at"`
+	UpdatedAt     string `json:"updated_at"`
 }

 func chainNodeOf(w store.Workload) chainNode {
@@ -38,6 +49,32 @@ func chainNodeOf(w store.Workload) chainNode {
 	}
 }

+// previewBranchOf reads the `branch` key out of a workload's source_config
+// JSON. An empty, missing, or undecodable config yields "". Callers invoke
+// this only for rows preview.IsPreviewChild has already accepted, so a blank
+// result simply means the JSON could not be decoded.
+func previewBranchOf(w store.Workload) string {
+	var decoded struct {
+		Branch string `json:"branch"`
+	}
+	if w.SourceConfig == "" {
+		return decoded.Branch
+	}
+	// Decode failures are deliberately ignored: the zero value ("") is the
+	// documented answer for malformed input.
+	_ = json.Unmarshal([]byte(w.SourceConfig), &decoded)
+	return decoded.Branch
+}
+
+// childChainNode converts a child row into its chainNode DTO. When the child
+// is a branch preview of `self` (per preview.IsPreviewChild) the node is
+// flagged IsPreview and carries the branch read from the child's config;
+// otherwise the plain chainNodeOf result is returned untouched.
+func childChainNode(self, child store.Workload) chainNode {
+	dto := chainNodeOf(child)
+	if !preview.IsPreviewChild(self, child) {
+		return dto
+	}
+	dto.IsPreview = true
+	dto.PreviewBranch = previewBranchOf(child)
+	return dto
+}
+
 // getWorkloadChain handles GET /api/workloads/{id}/chain.
 //
 // Returns the workload's parent (or nil), itself, and its direct children
@@ -76,7 +113,7 @@ func (s *Server) getWorkloadChain(w http.ResponseWriter, r *http.Request) {
 	}
 	children := make([]chainNode, 0, len(childRows))
 	for _, c := range childRows {
-		children = append(children, chainNodeOf(c))
+		children = append(children, childChainNode(self, c))
 	}

 	respondJSON(w, http.StatusOK, map[string]any{
@@ -0,0 +1,147 @@
+package api
+
+import (
+	"testing"
+
+	"github.com/alexei/tinyforge/internal/store"
+)
+
+// TestChildChainNode_MarksPreviewChildren exercises childChainNode with a
+// table of child rows and checks that only genuine branch previews of the
+// template are flagged. Operator-created stage children share the parent
+// link but must stay unmarked. Per the cases below a preview needs the right
+// parent, a branch in source_config, and a name equal to
+// template.Name + "/" + slug(branch).
+func TestChildChainNode_MarksPreviewChildren(t *testing.T) {
+	parent := store.Workload{
+		ID:         "tmpl-1",
+		Name:       "myapp",
+		SourceKind: "dockerfile",
+	}
+
+	cases := []struct {
+		name       string
+		child      store.Workload
+		wantPrev   bool
+		wantBranch string
+	}{
+		{
+			name: "preview child is marked with its branch",
+			child: store.Workload{
+				ID:               "child-prev",
+				Name:             "myapp/feat-login",
+				SourceKind:       "dockerfile",
+				SourceConfig:     `{"branch":"feat/login","port":3000}`,
+				ParentWorkloadID: "tmpl-1",
+			},
+			wantPrev:   true,
+			wantBranch: "feat/login",
+		},
+		{
+			name: "operator-named stage child sharing the parent is not a preview",
+			child: store.Workload{
+				ID:               "child-stage",
+				Name:             "myapp-staging",
+				SourceKind:       "dockerfile",
+				SourceConfig:     `{"branch":"main"}`,
+				ParentWorkloadID: "tmpl-1",
+			},
+			wantPrev:   false,
+			wantBranch: "",
+		},
+		{
+			name: "child of a different parent is not a preview of self",
+			child: store.Workload{
+				ID:               "child-other",
+				Name:             "myapp/feat-login",
+				SourceKind:       "dockerfile",
+				SourceConfig:     `{"branch":"feat/login"}`,
+				ParentWorkloadID: "some-other-template",
+			},
+			wantPrev:   false,
+			wantBranch: "",
+		},
+		{
+			name: "child with no branch in source_config is not a preview",
+			child: store.Workload{
+				ID:               "child-nobranch",
+				Name:             "myapp/feat-login",
+				SourceKind:       "dockerfile",
+				SourceConfig:     `{}`,
+				ParentWorkloadID: "tmpl-1",
+			},
+			wantPrev:   false,
+			wantBranch: "",
+		},
+		{
+			// Parent and branch are both valid, but the name has a trailing
+			// suffix, so only the slug-equality check fails (expected
+			// "myapp/feat-login", got "myapp/feat-login-staging"). A branch
+			// on its own must never be sufficient to mark a preview.
+			name: "valid branch but name fails the slug match is not a preview",
+			child: store.Workload{
+				ID:               "child-slugmiss",
+				Name:             "myapp/feat-login-staging",
+				SourceKind:       "dockerfile",
+				SourceConfig:     `{"branch":"feat/login","port":3000}`,
+				ParentWorkloadID: "tmpl-1",
+			},
+			wantPrev:   false,
+			wantBranch: "",
+		},
+		{
+			// Mixed-case branch containing a slash: the slug is expected to
+			// be lowercased with "/" mapped to "-", so "Feature/Login"
+			// matches the name "myapp/feature-login". PreviewBranch must
+			// report the RAW source_config branch, not the slug.
+			name: "uppercase slash branch matches and keeps raw branch",
+			child: store.Workload{
+				ID:               "child-upper",
+				Name:             "myapp/feature-login",
+				SourceKind:       "dockerfile",
+				SourceConfig:     `{"branch":"Feature/Login","port":8080}`,
+				ParentWorkloadID: "tmpl-1",
+			},
+			wantPrev:   true,
+			wantBranch: "Feature/Login",
+		},
+	}
+
+	for _, tt := range cases {
+		t.Run(tt.name, func(t *testing.T) {
+			got := childChainNode(parent, tt.child)
+			if got.IsPreview != tt.wantPrev {
+				t.Errorf("IsPreview = %v, want %v", got.IsPreview, tt.wantPrev)
+			}
+			if got.PreviewBranch != tt.wantBranch {
+				t.Errorf("PreviewBranch = %q, want %q", got.PreviewBranch, tt.wantBranch)
+			}
+			// The identity fields are copied verbatim whether or not the
+			// child turned out to be a preview.
+			idOK := got.ID == tt.child.ID
+			nameOK := got.Name == tt.child.Name
+			if !idOK || !nameOK {
+				t.Errorf("base fields mangled: got id=%q name=%q", got.ID, got.Name)
+			}
+		})
+	}
+}
+
+// TestPreviewBranchOf_ToleratesMalformedConfig feeds previewBranchOf valid,
+// empty, and broken source_config strings and expects the branch for the
+// valid one and "" (never a panic) for the rest.
+func TestPreviewBranchOf_ToleratesMalformedConfig(t *testing.T) {
+	table := []struct {
+		name string
+		cfg  string
+		want string
+	}{
+		{name: "valid branch", cfg: `{"branch":"release/v1"}`, want: "release/v1"},
+		{name: "empty config", cfg: ``, want: ""},
+		{name: "empty object", cfg: `{}`, want: ""},
+		{name: "malformed json", cfg: `{not-json`, want: ""},
+	}
+	for _, tt := range table {
+		t.Run(tt.name, func(t *testing.T) {
+			if got := previewBranchOf(store.Workload{SourceConfig: tt.cfg}); got != tt.want {
+				t.Errorf("previewBranchOf(%q) = %q, want %q", tt.cfg, got, tt.want)
+			}
+		})
+	}
+}
@@ -2,48 +2,17 @@ package api

 import (
 	"encoding/json"
-	"log/slog"

 	"github.com/alexei/tinyforge/internal/store"
 	"github.com/alexei/tinyforge/internal/workload/plugin"
 )

-// toPluginWorkload converts a persisted store.Workload row into the value
-// shape that Source / Trigger plugins consume. Lives in the api package
-// (rather than store or plugin) to keep plugin's dependency graph free of
-// store imports and avoid the cycle that would form otherwise.
-//
-// SourceConfig / TriggerConfig are passed through as raw JSON; the matching
-// plugin decodes them with plugin.SourceConfigOf[T] / TriggerConfigOf[T].
-// PublicFaces is decoded eagerly because every consumer needs the parsed
-// slice (proxy registration, UI, validation).
+// toPluginWorkload is a local alias for the shared plugin.WorkloadFromStore
+// converter, kept so the api package's many call sites read tersely and pair
+// visually with fromPluginWorkload below. The conversion logic lives in the
+// plugin package (the single home shared with reconciler / webhook).
 func toPluginWorkload(w store.Workload) plugin.Workload {
-	var faces []plugin.PublicFace
-	if w.PublicFaces != "" {
-		if err := json.Unmarshal([]byte(w.PublicFaces), &faces); err != nil {
-			slog.Warn("workload: invalid public_faces JSON, treating as empty",
-				"workload", w.ID, "error", err)
-			faces = nil
-		}
-	}
-	return plugin.Workload{
-		ID:                      w.ID,
-		Name:                    w.Name,
-		GroupID:                 w.AppID,
-		ParentWorkloadID:        w.ParentWorkloadID,
-		SourceKind:              w.SourceKind,
-		SourceConfig:            json.RawMessage(w.SourceConfig),
-		TriggerKind:             w.TriggerKind,
-		TriggerConfig:           json.RawMessage(w.TriggerConfig),
-		PublicFaces:             faces,
-		NotificationURL:         w.NotificationURL,
-		NotificationSecret:      w.NotificationSecret,
-		WebhookSecret:           w.WebhookSecret,
-		WebhookSigningSecret:    w.WebhookSigningSecret,
-		WebhookRequireSignature: w.WebhookRequireSignature,
-		CreatedAt:               w.CreatedAt,
-		UpdatedAt:               w.UpdatedAt,
-	}
+	return plugin.WorkloadFromStore(w)
 }

 // fromPluginWorkload is the symmetric direction — used by /api/workloads
@@ -0,0 +1,231 @@
+package api
+
+import (
+	"errors"
+	"log/slog"
+	"net/http"
+	"strings"
+
+	"github.com/go-chi/chi/v5"
+
+	"github.com/alexei/tinyforge/internal/crypto"
+	"github.com/alexei/tinyforge/internal/store"
+)
+
+// workloadNotificationRow is the JSON shape returned to clients. The
+// `secret_set` boolean replaces the actual ciphertext: once stored a
+// secret is write-only, mirroring how workload_env hides encrypted
+// values. Rotating means submitting a new value.
+type workloadNotificationRow struct {
+	ID         string `json:"id"`
+	WorkloadID string `json:"workload_id"`
+	Name       string `json:"name"`
+	URL        string `json:"url"`
+	// SecretSet reports only whether the stored row has a non-empty secret;
+	// this struct has no field that carries the secret value itself.
+	SecretSet  bool   `json:"secret_set"`
+	EventTypes string `json:"event_types"`
+	Enabled    bool   `json:"enabled"`
+	SortOrder  int    `json:"sort_order"`
+	CreatedAt  string `json:"created_at"`
+	UpdatedAt  string `json:"updated_at"`
+}
+
+func toWorkloadNotificationRow(n store.WorkloadNotification) workloadNotificationRow {
+	return workloadNotificationRow{
+		ID:         n.ID,
+		WorkloadID: n.WorkloadID,
+		Name:       n.Name,
+		URL:        n.URL,
+		SecretSet:  n.Secret != "",
+		EventTypes: n.EventTypes,
+		Enabled:    n.Enabled,
+		SortOrder:  n.SortOrder,
+		CreatedAt:  n.CreatedAt,
+		UpdatedAt:  n.UpdatedAt,
+	}
+}
+
+// listWorkloadNotifications returns every notification row of the workload
+// named by the `id` route parameter. It answers 404 when the workload does
+// not exist and 500 when either store lookup fails; on success the body is
+// a JSON array (empty, never null) of workloadNotificationRow values.
+func (s *Server) listWorkloadNotifications(w http.ResponseWriter, r *http.Request) {
+	workloadID := chi.URLParam(r, "id")
+
+	_, lookupErr := s.store.GetWorkloadByID(workloadID)
+	switch {
+	case errors.Is(lookupErr, store.ErrNotFound):
+		respondNotFound(w, "workload")
+		return
+	case lookupErr != nil:
+		respondError(w, http.StatusInternalServerError, "get workload")
+		return
+	}
+
+	stored, err := s.store.ListWorkloadNotifications(workloadID)
+	if err != nil {
+		respondError(w, http.StatusInternalServerError, "list workload notifications")
+		return
+	}
+
+	rows := make([]workloadNotificationRow, len(stored))
+	for i := range stored {
+		rows[i] = toWorkloadNotificationRow(stored[i])
+	}
+	respondJSON(w, http.StatusOK, rows)
+}
+
+// workloadNotificationRequest is the POST/PUT body. Secret is the raw
+// plaintext webhook signing key; the server encrypts it at rest with
+// the global encryption key before INSERT. An empty Secret on UPDATE
+// leaves the stored secret untouched so the operator can edit the URL
+// or event filter without re-entering the secret each time.
+type workloadNotificationRequest struct {
+	// Name and URL are whitespace-trimmed by the handlers; URL is required.
+	Name       string `json:"name"`
+	URL        string `json:"url"`
+	Secret     string `json:"secret"`
+	EventTypes string `json:"event_types"`
+	// Enabled is a pointer so "omitted" is distinguishable from false:
+	// nil means enabled on create and "leave unchanged" on update.
+	Enabled    *bool  `json:"enabled"`
+	SortOrder  int    `json:"sort_order"`
+}
+
+// createWorkloadNotification handles creation of a notification under the
+// workload named by the `id` route parameter. It verifies the workload
+// exists, strictly decodes the body, trims name/url, requires a non-empty
+// url, encrypts a supplied secret with the server key, and defaults the row
+// to enabled unless the body says otherwise. Responds 201 with the created
+// row (secret reduced to `secret_set`).
+func (s *Server) createWorkloadNotification(w http.ResponseWriter, r *http.Request) {
+	workloadID := chi.URLParam(r, "id")
+
+	_, lookupErr := s.store.GetWorkloadByID(workloadID)
+	switch {
+	case errors.Is(lookupErr, store.ErrNotFound):
+		respondNotFound(w, "workload")
+		return
+	case lookupErr != nil:
+		respondError(w, http.StatusInternalServerError, "get workload")
+		return
+	}
+
+	var body workloadNotificationRequest
+	if ok := decodeJSONStrict(w, r, &body); !ok {
+		return
+	}
+	body.URL = strings.TrimSpace(body.URL)
+	body.Name = strings.TrimSpace(body.Name)
+	if body.URL == "" {
+		respondError(w, http.StatusBadRequest, "url is required")
+		return
+	}
+
+	// The plaintext secret is optional; when absent the row stores "".
+	ciphertext := ""
+	if body.Secret != "" {
+		sealed, err := crypto.Encrypt(s.encKey, body.Secret)
+		if err != nil {
+			slog.Error("workload notifications: encrypt secret", "workload", workloadID, "error", err)
+			respondError(w, http.StatusInternalServerError, "encrypt secret")
+			return
+		}
+		ciphertext = sealed
+	}
+
+	notification := store.WorkloadNotification{
+		WorkloadID: workloadID,
+		Name:       body.Name,
+		URL:        body.URL,
+		Secret:     ciphertext,
+		EventTypes: body.EventTypes,
+		Enabled:    true, // default when the body omits `enabled`
+		SortOrder:  body.SortOrder,
+	}
+	if body.Enabled != nil {
+		notification.Enabled = *body.Enabled
+	}
+
+	created, err := s.store.CreateWorkloadNotification(notification)
+	if err != nil {
+		slog.Error("workload notifications: create", "workload", workloadID, "error", err)
+		respondError(w, http.StatusInternalServerError, "create workload notification")
+		return
+	}
+	respondJSON(w, http.StatusCreated, toWorkloadNotificationRow(created))
+}
+
+// updateWorkloadNotification replaces the editable fields of notification
+// `nid` under workload `id`. It answers 404 when the workload or the
+// notification is missing, or when the notification belongs to a different
+// workload; 400 when the trimmed url is empty; 500 on store or encryption
+// failures. Name, URL, EventTypes and SortOrder are always overwritten from
+// the body; Enabled and Secret change only when the body supplies them.
+func (s *Server) updateWorkloadNotification(w http.ResponseWriter, r *http.Request) {
+	id := chi.URLParam(r, "id")
+	nid := chi.URLParam(r, "nid")
+	if _, err := s.store.GetWorkloadByID(id); err != nil {
+		if errors.Is(err, store.ErrNotFound) {
+			respondNotFound(w, "workload")
+			return
+		}
+		respondError(w, http.StatusInternalServerError, "get workload")
+		return
+	}
+	existing, err := s.store.GetWorkloadNotification(nid)
+	if err != nil {
+		if errors.Is(err, store.ErrNotFound) {
+			respondNotFound(w, "workload_notification")
+			return
+		}
+		respondError(w, http.StatusInternalServerError, "get workload_notification")
+		return
+	}
+	if existing.WorkloadID != id {
+		// Route mismatch — the row exists but under a different workload.
+		// Return 404 rather than 403 so we don't leak the existence of
+		// foreign rows to an unauthorised caller.
+		respondNotFound(w, "workload_notification")
+		return
+	}
+
+	var req workloadNotificationRequest
+	if !decodeJSONStrict(w, r, &req) {
+		return
+	}
+	req.URL = strings.TrimSpace(req.URL)
+	req.Name = strings.TrimSpace(req.Name)
+	if req.URL == "" {
+		respondError(w, http.StatusBadRequest, "url is required")
+		return
+	}
+
+	existing.Name = req.Name
+	existing.URL = req.URL
+	existing.EventTypes = req.EventTypes
+	existing.SortOrder = req.SortOrder
+	if req.Enabled != nil {
+		existing.Enabled = *req.Enabled
+	}
+	// Empty Secret on UPDATE preserves the stored ciphertext — explicit
+	// rotation requires sending the new plaintext. This avoids forcing
+	// the operator to re-enter their secret on every URL edit.
+	if req.Secret != "" {
+		v, err := crypto.Encrypt(s.encKey, req.Secret)
+		if err != nil {
+			slog.Error("workload notifications: encrypt secret", "workload", id, "error", err)
+			respondError(w, http.StatusInternalServerError, "encrypt secret")
+			return
+		}
+		existing.Secret = v
+	}
+
+	if err := s.store.UpdateWorkloadNotification(existing); err != nil {
+		if errors.Is(err, store.ErrNotFound) {
+			respondNotFound(w, "workload_notification")
+			return
+		}
+		slog.Error("workload notifications: update", "workload", id, "error", err)
+		respondError(w, http.StatusInternalServerError, "update workload notification")
+		return
+	}
+	// NOTE(review): the response is built from the in-memory `existing`
+	// value, not re-read from the store, so any column the store sets
+	// during the update (presumably updated_at) is echoed with its
+	// pre-update value — confirm this is acceptable to clients.
+	respondJSON(w, http.StatusOK, toWorkloadNotificationRow(existing))
+}
+
+// deleteWorkloadNotification removes notification `nid`, but only when it
+// belongs to the workload `id` named in the route. A missing row and a row
+// owned by another workload both answer 404; store failures answer 500.
+// Success returns {"success": true}.
+func (s *Server) deleteWorkloadNotification(w http.ResponseWriter, r *http.Request) {
+	workloadID := chi.URLParam(r, "id")
+	notificationID := chi.URLParam(r, "nid")
+
+	row, err := s.store.GetWorkloadNotification(notificationID)
+	switch {
+	case errors.Is(err, store.ErrNotFound):
+		respondNotFound(w, "workload_notification")
+		return
+	case err != nil:
+		respondError(w, http.StatusInternalServerError, "get workload_notification")
+		return
+	}
+	// Ownership check: a row under a different workload is reported as
+	// not found rather than exposed.
+	if row.WorkloadID != workloadID {
+		respondNotFound(w, "workload_notification")
+		return
+	}
+
+	err = s.store.DeleteWorkloadNotification(notificationID)
+	switch {
+	case err == nil:
+		respondJSON(w, http.StatusOK, map[string]any{"success": true})
+	case errors.Is(err, store.ErrNotFound):
+		respondNotFound(w, "workload_notification")
+	default:
+		slog.Error("workload notifications: delete", "workload", workloadID, "error", err)
+		respondError(w, http.StatusInternalServerError, "delete workload notification")
+	}
+}
@@ -82,16 +82,27 @@ func (s *Server) getWorkloadRuntimeState(w http.ResponseWriter, r *http.Request)

 	payload := runtimeStatePayload{SourceKind: workload.SourceKind}

-	if workload.SourceKind != "static" {
+	// Both static and dockerfile sources persist their runtime state into
+	// containers.extra_json under a deterministic row id. The shapes
+	// match (status / last_commit_sha / last_sync_at / last_error) so the
+	// handler can decode them identically. The suffix differs per source
+	// kind: static uses ":site", dockerfile uses ":dockerfile".
+	var rowSuffix string
+	switch workload.SourceKind {
+	case "static":
+		rowSuffix = ":site"
+	case "dockerfile":
+		rowSuffix = ":dockerfile"
+	default:
 		respondJSON(w, http.StatusOK, payload)
 		return
 	}

-	// The static plugin owns one container row per workload at the
-	// deterministic ID <workloadID>:site. A missing row means the
-	// workload has never been deployed — return HasState=false so the
-	// UI can prompt the operator to deploy.
-	row, err := s.store.GetContainerByID(id + ":site")
+	// The owning plugin maintains one container row per workload at the
+	// deterministic ID. A missing row means the workload has never been
+	// deployed — return HasState=false so the UI can prompt the operator
+	// to deploy.
+	row, err := s.store.GetContainerByID(id + rowSuffix)
 	if err != nil {
 		if errors.Is(err, store.ErrNotFound) {
 			respondJSON(w, http.StatusOK, payload)
@@ -130,6 +130,13 @@ func TestGetWorkloadRuntimeState_MalformedExtraJSON_ReturnsContainerFieldsOnly(t
 		SourceKind:   "static",
 		SourceConfig: `{"provider":"gitea"}`,
 	})
+	// Seed a row with a valid extra_json first, then corrupt it via raw
+	// SQL. Prior to the write-side validateExtraJSON guard this test
+	// could pass a malformed string straight to UpsertContainer; the
+	// guard now rejects that at the boundary, which is the correct
+	// behaviour. The reader resilience this test verifies remains
+	// relevant for pre-existing bad rows from upgrades or external
+	// manipulation, so we still produce one via direct SQL.
 	if err := e.store.UpsertContainer(store.Container{
 		ID:           wl.ID + ":site",
 		WorkloadID:   wl.ID,
@@ -137,10 +144,16 @@ func TestGetWorkloadRuntimeState_MalformedExtraJSON_ReturnsContainerFieldsOnly(t
 		Host:         "local",
 		ContainerID:  "abc",
 		State:        "running",
-		ExtraJSON:    `{this is not json`,
+		ExtraJSON:    `{}`,
 	}); err != nil {
 		t.Fatalf("seed: %v", err)
 	}
+	if _, err := e.store.DB().Exec(
+		`UPDATE containers SET extra_json = ? WHERE id = ?`,
+		`{this is not json`, wl.ID+":site",
+	); err != nil {
+		t.Fatalf("corrupt extra_json: %v", err)
+	}
 	resp := e.do(t, http.MethodGet, "/api/workloads/"+wl.ID+"/runtime-state", nil)
 	if resp.StatusCode != http.StatusOK {
 		t.Fatalf("status = %d, want 200 (decode is non-fatal)", resp.StatusCode)
@@ -155,6 +168,57 @@ func TestGetWorkloadRuntimeState_MalformedExtraJSON_ReturnsContainerFieldsOnly(t
 	}
 }

+// TestGetWorkloadRuntimeState_DockerfileSourceDeployed_DecodesExtraJSON seeds
+// a dockerfile-source workload plus its "<id>:dockerfile" container row and
+// verifies the runtime-state endpoint returns both the container fields and
+// the status / commit values decoded from the row's extra_json.
+func TestGetWorkloadRuntimeState_DockerfileSourceDeployed_DecodesExtraJSON(t *testing.T) {
+	e := newAPITestEnv(t)
+	wl, err := e.store.CreateWorkload(store.Workload{
+		Kind:         string(store.WorkloadKindProject),
+		Name:         "build-app",
+		SourceKind:   "dockerfile",
+		SourceConfig: `{"provider":"gitea","port":3000}`,
+	})
+	if err != nil {
+		t.Fatalf("seed workload: %v", err)
+	}
+	// Fail loudly instead of discarding the error: an empty payload would
+	// otherwise surface later as a confusing seed or assertion failure.
+	extra, err := json.Marshal(map[string]any{
+		"status":          "deployed",
+		"last_commit_sha": "deadbeef",
+		"last_sync_at":    "2026-05-23T10:00:00Z",
+		"last_error":      "",
+	})
+	if err != nil {
+		t.Fatalf("marshal extra_json: %v", err)
+	}
+	if err := e.store.UpsertContainer(store.Container{
+		ID:           wl.ID + ":dockerfile",
+		WorkloadID:   wl.ID,
+		WorkloadKind: string(store.WorkloadKindBuild),
+		Host:         "local",
+		ContainerID:  "ffeeddcc",
+		State:        "running",
+		ExtraJSON:    string(extra),
+	}); err != nil {
+		t.Fatalf("seed container: %v", err)
+	}
+
+	resp := e.do(t, http.MethodGet, "/api/workloads/"+wl.ID+"/runtime-state", nil)
+	if resp.StatusCode != http.StatusOK {
+		t.Fatalf("status = %d, want 200", resp.StatusCode)
+	}
+	var got runtimeStatePayload
+	if errMsg := decodeEnvelope(t, resp, &got); errMsg != "" {
+		t.Fatalf("envelope error: %q", errMsg)
+	}
+	if !got.HasState {
+		t.Fatalf("HasState = false, want true")
+	}
+	if got.SourceKind != "dockerfile" {
+		t.Errorf("SourceKind = %q, want dockerfile", got.SourceKind)
+	}
+	if got.ContainerID != "ffeeddcc" || got.State != "running" {
+		t.Errorf("container fields = (%q,%q), want (ffeeddcc, running)", got.ContainerID, got.State)
+	}
+	if got.Status != "deployed" || got.LastCommitSHA != "deadbeef" {
+		t.Errorf("runtime fields = %+v, want deployed/deadbeef", got)
+	}
+}
+
 // =============================================================================
 // GET /api/workloads/{id}/storage
 // =============================================================================
@@ -14,6 +14,7 @@ import (
 	"github.com/alexei/tinyforge/internal/auth"
 	"github.com/alexei/tinyforge/internal/store"
 	"github.com/alexei/tinyforge/internal/workload/plugin"
+	"github.com/alexei/tinyforge/internal/workload/preview"
 )

 // pluginWorkloadRequest is the JSON body accepted by create + update.
@@ -227,6 +228,28 @@ func (s *Server) deletePluginWorkload(w http.ResponseWriter, r *http.Request) {
 		return
 	}

+	// Cascade-teardown any branch previews materialized from this workload
+	// so deleting a template does not orphan their containers, proxy routes,
+	// and rows. Operator-managed stage-chain children (which share the same
+	// parent link) are deliberately left alone — only previews are auto-owned
+	// by the template (see preview.IsPreviewChild).
+	if previews, err := preview.ListPreviewChildren(s.store, row); err != nil {
+		slog.Warn("delete workload: list preview children", "workload", id, "error", err)
+	} else {
+		for _, child := range previews {
+			if child.SourceKind != "" {
+				if err := s.deployer.DispatchTeardown(r.Context(), toPluginWorkload(child)); err != nil {
+					slog.Warn("delete workload: preview child teardown error",
+						"workload", id, "child", child.ID, "error", err)
+				}
+			}
+			if err := s.store.DeleteWorkload(child.ID); err != nil && !errors.Is(err, store.ErrNotFound) {
+				slog.Warn("delete workload: preview child delete error",
+					"workload", id, "child", child.ID, "error", err)
+			}
+		}
+	}
+
 	if row.SourceKind != "" {
 		if err := s.deployer.DispatchTeardown(r.Context(), toPluginWorkload(row)); err != nil {
 			slog.Warn("delete workload: teardown error",
@@ -15,6 +15,7 @@ import (
 	"github.com/alexei/tinyforge/internal/auth"
 	"github.com/alexei/tinyforge/internal/crypto"
 	"github.com/alexei/tinyforge/internal/store"
+	"github.com/alexei/tinyforge/internal/volsnap"
 	"github.com/alexei/tinyforge/internal/webhook"
 	"github.com/alexei/tinyforge/internal/workload/plugin"

@@ -75,6 +76,7 @@ type apiTestEnv struct {
 	dispatcher *fakeAPIDispatcher
 	adminToken string
 	encKey     [32]byte
+	snapEngine *volsnap.Engine // set by newSnapshotEnv; nil otherwise
 }

 func (e *apiTestEnv) close() { e.srv.Close() }
@@ -670,9 +672,9 @@ func TestGetWorkloadChain_ParentSelfChildren(t *testing.T) {

 	resp := e.do(t, http.MethodGet, "/api/workloads/"+parentID+"/chain", nil)
 	var got struct {
-		Parent   *map[string]any   `json:"parent"`
-		Self     map[string]any    `json:"self"`
-		Children []map[string]any  `json:"children"`
+		Parent   *map[string]any  `json:"parent"`
+		Self     map[string]any   `json:"self"`
+		Children []map[string]any `json:"children"`
 	}
 	if errMsg := decodeEnvelope(t, resp, &got); errMsg != "" {
 		t.Fatalf("envelope error: %q", errMsg)
@@ -85,9 +85,15 @@ func (la *LocalAuth) cleanBlacklist() {
 	}
 }

+// bcryptCost is the work factor used for new password hashes. Raised from
+// the library default (10) to 12 so hashing stays expensive as hardware
+// gets faster. Existing hashes at lower costs still verify — bcrypt
+// encodes the cost in the stored hash itself.
+const bcryptCost = 12
+
 // HashPassword hashes a plaintext password using bcrypt.
 func HashPassword(password string) (string, error) {
-	hash, err := bcrypt.GenerateFromPassword([]byte(password), bcrypt.DefaultCost)
+	hash, err := bcrypt.GenerateFromPassword([]byte(password), bcryptCost)
 	if err != nil {
 		return "", fmt.Errorf("hash password: %w", err)
 	}
@@ -1,13 +1,17 @@
 package backup

 import (
+	"database/sql"
 	"fmt"
+	"io"
 	"log/slog"
 	"os"
 	"path/filepath"
 	"sync"
 	"time"

+	_ "modernc.org/sqlite" // read-only candidate inspection via PRAGMA integrity_check
+
 	"github.com/alexei/tinyforge/internal/store"
 )

@@ -129,6 +133,17 @@ func (e *Engine) RestorePath(id string) (string, error) {
 		return "", fmt.Errorf("get backup: %w", err)
 	}

+	// Filename comes from a DB row. Defence-in-depth: a backup file must live
+	// directly under backupDir, so reject any value carrying a path separator
+	// or traversal before joining. A poisoned row (future import path, manual
+	// insert) must never let restore read — and then atomically copy over the
+	// live DB — an arbitrary file. CreateBackup builds safe base names; this
+	// enforces the same invariant on read.
+	if backup.Filename == "" || backup.Filename == "." || backup.Filename == ".." ||
+		backup.Filename != filepath.Base(backup.Filename) {
+		return "", fmt.Errorf("backup: invalid filename %q", backup.Filename)
+	}
+
 	filePath := filepath.Join(e.backupDir, backup.Filename)
 	if _, err := os.Stat(filePath); err != nil {
 		return "", fmt.Errorf("backup file not found: %w", err)
@@ -137,6 +152,153 @@ func (e *Engine) RestorePath(id string) (string, error) {
 	return filePath, nil
 }

+// PrepareRestore validates a backup candidate before the caller swaps it
+// over the live DB. The candidate path is resolved through RestorePath(id),
+// then three checks run in order:
+//
+//  1. The candidate file can be stat'ed and is at least 100 bytes long
+//     (smaller files are rejected as "suspiciously small").
+//  2. The first 16 bytes equal the SQLite header magic
+//     "SQLite format 3\x00" (catches corrupted or partial downloads).
+//  3. `PRAGMA integrity_check`, run by integrityCheck, reports "ok"
+//     (catches WAL/page corruption that the header check misses).
+//
+// On success returns the candidate path. On failure returns a wrapped
+// error describing which probe rejected the file, so the operator can
+// see exactly why a "restore" was refused rather than getting a corrupt
+// DB at next boot.
+//
+// No temp copy is made: integrityCheck opens the candidate file itself,
+// read-only (mode=ro&immutable=1), so the check does not write to it.
+func (e *Engine) PrepareRestore(id string) (string, error) {
+	path, err := e.RestorePath(id)
+	if err != nil {
+		return "", err
+	}
+
+	info, err := os.Stat(path)
+	if err != nil {
+		return "", fmt.Errorf("restore: stat candidate: %w", err)
+	}
+	if info.Size() < 100 {
+		return "", fmt.Errorf("restore: candidate %s is suspiciously small (%d bytes)", path, info.Size())
+	}
+
+	// SQLite file header: "SQLite format 3\x00" (16 bytes).
+	hdr, err := readHead(path, 16)
+	if err != nil {
+		return "", fmt.Errorf("restore: read header: %w", err)
+	}
+	if string(hdr) != "SQLite format 3\x00" {
+		return "", fmt.Errorf("restore: candidate %s is not a SQLite database (header mismatch)", path)
+	}
+
+	if err := integrityCheck(path); err != nil {
+		return "", fmt.Errorf("restore: integrity check failed: %w", err)
+	}
+
+	return path, nil
+}
+
+// readHead returns exactly the first n bytes of the file at path. A file
+// that cannot be opened, or that holds fewer than n bytes, yields an error
+// and a nil slice.
+func readHead(path string, n int) ([]byte, error) {
+	file, err := os.Open(path)
+	if err != nil {
+		return nil, err
+	}
+	defer file.Close()
+
+	// io.ReadFull either fills the whole slice or fails; a plain Read may
+	// legally return fewer bytes, which would corrupt the header comparison
+	// done by the caller.
+	head := make([]byte, n)
+	_, err = io.ReadFull(file, head)
+	if err != nil {
+		return nil, err
+	}
+	return head, nil
+}
+
+// integrityCheck opens the candidate read-only and runs
+// `PRAGMA integrity_check`. We use immutable=1 so the driver does not
+// try to create WAL/SHM sidecars or upgrade the journal mode on the
+// candidate — both of which fail with "attempt to write a readonly
+// database" against a backup file. Anything other than a first row of
+// `"ok"` is treated as corruption.
+//
+// Returns nil when the check passes; otherwise an error naming the step
+// that failed (open, query, row iteration, scan) or carrying the first
+// problem line reported by SQLite.
+//
+// NOTE(review): path is concatenated into the DSN unescaped, so a path
+// containing '?' or '#' would presumably be misparsed — confirm that
+// backup file names can never contain those characters.
+func integrityCheck(path string) error {
+	db, err := sql.Open("sqlite", "file:"+path+"?mode=ro&immutable=1")
+	if err != nil {
+		return fmt.Errorf("open candidate: %w", err)
+	}
+	defer db.Close()
+
+	rows, err := db.Query("PRAGMA integrity_check")
+	if err != nil {
+		return fmt.Errorf("pragma integrity_check: %w", err)
+	}
+	defer rows.Close()
+
+	if !rows.Next() {
+		// Next returns false both for an empty result and for a failure
+		// while stepping the statement; Err tells the two apart so a read
+		// error on a damaged file is not misreported as "no rows".
+		if err := rows.Err(); err != nil {
+			return fmt.Errorf("read integrity_check: %w", err)
+		}
+		return fmt.Errorf("integrity_check returned no rows")
+	}
+	var result string
+	if err := rows.Scan(&result); err != nil {
+		return fmt.Errorf("scan integrity_check: %w", err)
+	}
+	if result != "ok" {
+		return fmt.Errorf("integrity_check: %s", result)
+	}
+	return nil
+}
+
+// AtomicReplaceDB writes a backup candidate into place atomically.
+// The caller is expected to:
+//  1. Call PrepareRestore(id) → candidatePath.
+//  2. Take a "pre-restore" backup of the current DB via CreateBackup.
+//  3. Close the live *sql.DB.
+//  4. Call AtomicReplaceDB(candidatePath, livePath).
+//  5. Trigger graceful shutdown; main() will re-open on next start.
+//
+// AtomicReplaceDB also wipes WAL/SHM sidecar files so the new DB starts
+// from a clean checkpoint state. Failure to remove sidecars is logged
+// but non-fatal — SQLite recreates them on open.
+//
+// Returns an error when the candidate cannot be copied next to the live
+// DB or when the final rename fails; in both cases the temporary
+// "<livePath>.restore.tmp" file is removed (best-effort) so a failed
+// restore does not leave a partial copy beside the database.
+func (e *Engine) AtomicReplaceDB(candidatePath, livePath string) error {
+	// Copy candidate to a tmp file next to the live DB, then rename
+	// atomically. On Windows os.Rename across volumes fails, so we
+	// keep tmp on the same dir as the destination.
+	tmp := livePath + ".restore.tmp"
+	if err := copyFile(candidatePath, tmp); err != nil {
+		// A failed copy may have created or truncated tmp already; drop
+		// it so no half-written file lingers. The removal error is
+		// ignored on purpose: the copy error is the one worth reporting.
+		_ = os.Remove(tmp)
+		return fmt.Errorf("copy candidate to %s: %w", tmp, err)
+	}
+	// Best-effort: remove WAL/SHM so SQLite re-checkpoints from the
+	// restored main file rather than a stale WAL pointing at the old
+	// DB's pages.
+	for _, sidecar := range []string{livePath + "-wal", livePath + "-shm"} {
+		if err := os.Remove(sidecar); err != nil && !os.IsNotExist(err) {
+			slog.Warn("restore: remove sidecar", "path", sidecar, "error", err)
+		}
+	}
+	if err := os.Rename(tmp, livePath); err != nil {
+		// Clean up tmp on rename failure so we don't leak a partial file.
+		_ = os.Remove(tmp)
+		return fmt.Errorf("rename %s → %s: %w", tmp, livePath, err)
+	}
+	slog.Info("restore: database file replaced atomically", "live", livePath)
+	return nil
+}
+
+// copyFile copies src to dst, creating dst with mode 0o600 or truncating
+// it if it already exists. The data is flushed to stable storage with
+// Sync before the file is closed, so a caller that renames dst into place
+// afterwards is not renaming a file whose contents are still only in the
+// page cache. If copying, syncing, or closing fails, the partial dst is
+// removed (best-effort) and the original error is returned.
+func copyFile(src, dst string) error {
+	in, err := os.Open(src)
+	if err != nil {
+		return err
+	}
+	defer in.Close()
+	out, err := os.OpenFile(dst, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0o600)
+	if err != nil {
+		return err
+	}
+	if _, err := io.Copy(out, in); err != nil {
+		// Close before Remove: an open handle blocks deletion on Windows.
+		_ = out.Close()
+		_ = os.Remove(dst)
+		return err
+	}
+	if err := out.Sync(); err != nil {
+		_ = out.Close()
+		_ = os.Remove(dst)
+		return err
+	}
+	if err := out.Close(); err != nil {
+		_ = os.Remove(dst)
+		return err
+	}
+	return nil
+}
+
 // Prune removes old backups exceeding the retention count.
 // Returns the number of backups pruned.
 func (e *Engine) Prune(retentionCount int) (int, error) {
@@ -0,0 +1,113 @@
+package backup
+
+import (
+	"errors"
+	"os"
+	"path/filepath"
+	"strings"
+	"testing"
+
+	"github.com/alexei/tinyforge/internal/store"
+)
+
+// newTestEngine builds a store and a backup Engine that share one fresh
+// t.TempDir(), so files written by one test never collide with another's.
+// It returns the engine, the store, and the path of the store's database
+// file. The store is closed automatically through t.Cleanup.
+func newTestEngine(t *testing.T) (*Engine, *store.Store, string) {
+	t.Helper()
+
+	root := t.TempDir()
+	dbFile := filepath.Join(root, "tinyforge.db")
+
+	db, openErr := store.New(dbFile)
+	if openErr != nil {
+		t.Fatalf("store.New: %v", openErr)
+	}
+	t.Cleanup(func() { _ = db.Close() })
+
+	engine, newErr := New(db, dbFile, root)
+	if newErr != nil {
+		t.Fatalf("backup.New: %v", newErr)
+	}
+	return engine, db, dbFile
+}
+
+// TestPrepareRestore_RejectsTinyFile registers a two-byte file as a backup
+// and expects PrepareRestore to refuse it with a "suspiciously small" error.
+func TestPrepareRestore_RejectsTinyFile(t *testing.T) {
+	eng, st, _ := newTestEngine(t)
+
+	// Plant the undersized file first, then the row that points at it.
+	const name = "tinyforge-manual-junk.db"
+	target := filepath.Join(eng.BackupDir(), name)
+	if writeErr := os.WriteFile(target, []byte("hi"), 0o600); writeErr != nil {
+		t.Fatalf("write tiny: %v", writeErr)
+	}
+	row, rowErr := st.CreateBackup(store.Backup{
+		Filename:   name,
+		SizeBytes:  2,
+		BackupType: "manual",
+	})
+	if rowErr != nil {
+		t.Fatalf("CreateBackup row: %v", rowErr)
+	}
+
+	_, err := eng.PrepareRestore(row.ID)
+	switch {
+	case err == nil:
+		t.Fatal("expected PrepareRestore to reject tiny file, got nil")
+	case !strings.Contains(err.Error(), "suspiciously small"):
+		t.Errorf("error = %v, want 'suspiciously small'", err)
+	}
+}
+
+// TestPrepareRestore_RejectsNonSQLite registers a 200-byte file of 'x'
+// characters as a backup and expects PrepareRestore to fail with an error
+// that mentions the header.
+func TestPrepareRestore_RejectsNonSQLite(t *testing.T) {
+	eng, st, _ := newTestEngine(t)
+
+	// 200 bytes of non-SQLite garbage: big enough to get past the size
+	// check, but without a valid header.
+	const name = "tinyforge-manual-bogus.db"
+	junk := []byte(strings.Repeat("x", 200))
+	if writeErr := os.WriteFile(filepath.Join(eng.BackupDir(), name), junk, 0o600); writeErr != nil {
+		t.Fatalf("write junk: %v", writeErr)
+	}
+
+	row, rowErr := st.CreateBackup(store.Backup{
+		Filename:   name,
+		SizeBytes:  int64(len(junk)),
+		BackupType: "manual",
+	})
+	if rowErr != nil {
+		t.Fatalf("CreateBackup row: %v", rowErr)
+	}
+
+	_, err := eng.PrepareRestore(row.ID)
+	switch {
+	case err == nil:
+		t.Fatal("expected PrepareRestore to reject non-SQLite blob, got nil")
+	case !strings.Contains(err.Error(), "header"):
+		t.Errorf("error = %v, want header mismatch", err)
+	}
+}
+
+// TestPrepareRestore_AcceptsValidVacuumInto checks the happy path: a backup
+// produced by the engine's own CreateBackup must pass PrepareRestore and
+// yield a non-empty path.
+func TestPrepareRestore_AcceptsValidVacuumInto(t *testing.T) {
+	eng, _, _ := newTestEngine(t)
+
+	// The engine's own backup is expected to be a well-formed SQLite file,
+	// so validation should have nothing to object to.
+	created, createErr := eng.CreateBackup("manual")
+	if createErr != nil {
+		t.Fatalf("CreateBackup: %v", createErr)
+	}
+
+	switch restorePath, restoreErr := eng.PrepareRestore(created.ID); {
+	case restoreErr != nil:
+		t.Fatalf("PrepareRestore on valid backup: %v", restoreErr)
+	case restorePath == "":
+		t.Errorf("PrepareRestore returned empty path")
+	}
+}
+
+// TestPrepareRestore_UnknownID verifies that PrepareRestore returns an error
+// for an id that was never inserted into the freshly created test store.
+func TestPrepareRestore_UnknownID(t *testing.T) {
+	eng, _, _ := newTestEngine(t)
+
+	_, err := eng.PrepareRestore("nonexistent-id")
+	if err == nil {
+		t.Fatal("expected error for unknown id, got nil")
+	}
+	// The exact error chain is not asserted here, so an error that does not
+	// wrap store.ErrNotFound is only logged (visible with -v) rather than
+	// failed on. This replaces an empty if-branch that checked nothing.
+	if !errors.Is(err, store.ErrNotFound) {
+		t.Logf("PrepareRestore(unknown id) error does not wrap store.ErrNotFound: %v", err)
+	}
+}
@@ -10,11 +10,26 @@ import (
 	"fmt"
 	"io"
 	"os"
+	"strings"
 )

 // ErrNoKey is returned when ENCRYPTION_KEY is not set.
 var ErrNoKey = errors.New("ENCRYPTION_KEY environment variable is not set")

+// ErrDecryptFailed wraps any cipher.Open / decoder failure. Callers
+// upgrading from the silent-fallback pattern (treat-as-plaintext when
+// decrypt errored) MUST instead surface this — a rotated key would
+// otherwise silently leak ciphertext to upstream services as if it
+// were plaintext.
+var ErrDecryptFailed = errors.New("crypto: decrypt failed (wrong key, corrupted ciphertext, or unversioned legacy value)")
+
+// envelopeV1Prefix tags ciphertext produced by Encrypt going forward.
+// Older databases may carry unprefixed hex blobs from the v0 era; those
+// are still readable via Decrypt for backward compatibility, but every
+// new write goes through Encrypt and emits the prefix so a future key
+// rotation has a clean fail-loud signal.
+const envelopeV1Prefix = "tf1:"
+
 // DeriveKey computes a 32-byte AES-256 key from the given passphrase using SHA-256.
 // This is acceptable when ENCRYPTION_KEY is a high-entropy random string (e.g., 32+ hex chars).
 // For human-chosen passphrases, consider Argon2id or PBKDF2 with a salt instead.
@@ -35,7 +50,8 @@ func KeyFromEnv() ([32]byte, error) {
 }

 // Encrypt encrypts plaintext using AES-256-GCM with a random nonce.
-// The returned ciphertext is hex-encoded: nonce || ciphertext+tag.
+// Returns a versioned envelope (tf1:<hex>) so downstream readers can
+// distinguish ciphertext from accidentally-stored plaintext.
 func Encrypt(key [32]byte, plaintext string) (string, error) {
 	block, err := aes.NewCipher(key[:])
 	if err != nil {
@@ -53,14 +69,34 @@ func Encrypt(key [32]byte, plaintext string) (string, error) {
 	}

 	sealed := gcm.Seal(nonce, nonce, []byte(plaintext), nil)
-	return hex.EncodeToString(sealed), nil
+	return envelopeV1Prefix + hex.EncodeToString(sealed), nil
 }

-// Decrypt decrypts a hex-encoded ciphertext produced by Encrypt.
-func Decrypt(key [32]byte, ciphertextHex string) (string, error) {
-	data, err := hex.DecodeString(ciphertextHex)
+// HasEnvelope reports whether value starts with the v1 envelope prefix,
+// i.e. whether it looks like ciphertext emitted by Encrypt. It lets
+// "decrypt only if encrypted" call sites decide up front instead of
+// attempting a decrypt and reading `err == nil` as the answer — a pattern
+// that silently masked rotated-key failures.
+func HasEnvelope(value string) bool {
+	tagged := strings.HasPrefix(value, envelopeV1Prefix)
+	return tagged
+}
+
+// Decrypt decrypts an envelope (tf1:<hex>). For backward compatibility
+// it also accepts unprefixed hex from the v0 era — but only when the
+// resulting plaintext is valid; a wrong key for legacy data now returns
+// ErrDecryptFailed instead of silently treating ciphertext as
+// plaintext.
+//
+// Callers MUST NOT swallow the error and fall back to "use as-is".
+// That pattern is the exact footgun the envelope versioning removes.
+func Decrypt(key [32]byte, ciphertext string) (string, error) {
+	hexBlob := ciphertext
+	if strings.HasPrefix(hexBlob, envelopeV1Prefix) {
+		hexBlob = hexBlob[len(envelopeV1Prefix):]
+	}
+
+	data, err := hex.DecodeString(hexBlob)
 	if err != nil {
-		return "", fmt.Errorf("decode hex: %w", err)
+		return "", fmt.Errorf("%w: decode hex: %v", ErrDecryptFailed, err)
 	}

 	block, err := aes.NewCipher(key[:])
@@ -75,15 +111,15 @@ func Decrypt(key [32]byte, ciphertextHex string) (string, error) {

 	nonceSize := gcm.NonceSize()
 	if len(data) < nonceSize {
-		return "", errors.New("ciphertext too short")
+		return "", fmt.Errorf("%w: ciphertext too short", ErrDecryptFailed)
 	}

 	nonce := data[:nonceSize]
-	ciphertext := data[nonceSize:]
+	body := data[nonceSize:]

-	plaintext, err := gcm.Open(nil, nonce, ciphertext, nil)
+	plaintext, err := gcm.Open(nil, nonce, body, nil)
 	if err != nil {
-		return "", fmt.Errorf("decrypt: %w", err)
+		return "", fmt.Errorf("%w: %v", ErrDecryptFailed, err)
 	}

 	return string(plaintext), nil
@@ -0,0 +1,76 @@
+package deployer
+
+import (
+	"log/slog"
+
+	"github.com/alexei/tinyforge/internal/store"
+	"github.com/alexei/tinyforge/internal/workload/plugin"
+)
+
+// deployHistoryKeepPerWorkload bounds the ledger per workload. Newer rows
+// always have larger ids, so pruning keeps the most recent N — enough for a
+// useful rollback menu without unbounded growth on hot workloads.
+const deployHistoryKeepPerWorkload = 50
+
+// recordDeployHistory writes one deploy-ledger row describing a finished
+// dispatch of w, then trims the workload's ledger to the newest
+// deployHistoryKeepPerWorkload rows.
+//
+// It is best-effort: with a nil store it does nothing, and insert/prune
+// failures are logged at warn level and swallowed, so recording can never
+// turn a deploy result into an error. deployErr itself is never persisted —
+// it may carry registry-auth bytes or compose output — only a fixed,
+// secret-free marker is stored when it is non-nil. startedAt is the caller's
+// timestamp for the start of the deploy; the finish time is taken here.
+// Called only from the DispatchPlugin path; reconcile/teardown ticks are not
+// deploys and must not appear in the ledger.
+func (d *Deployer) recordDeployHistory(w plugin.Workload, intent plugin.DeploymentIntent, outcome string, deployErr error, startedAt string) {
+	if d.store == nil {
+		return
+	}
+
+	row := store.DeployHistoryEntry{
+		WorkloadID:  w.ID,
+		SourceKind:  w.SourceKind,
+		Reference:   d.effectiveReference(w, intent),
+		Reason:      intent.Reason,
+		TriggeredBy: intent.TriggeredBy,
+		Note:        intent.Metadata["note"], // reading a nil map yields ""
+		Outcome:     outcome,
+		StartedAt:   startedAt,
+		FinishedAt:  store.Now(),
+	}
+	// Constant marker only; the raw error text stays out of the database.
+	if deployErr != nil {
+		row.Error = "deploy failed (see server logs)"
+	}
+
+	_, insertErr := d.store.InsertDeployHistory(row)
+	if insertErr != nil {
+		slog.Warn("deploy history: insert failed", "workload", w.ID, "error", insertErr)
+		return
+	}
+
+	// Pruning runs inline rather than on an untracked goroutine so it cannot
+	// outrace graceful shutdown's db.Close(); it is cheap next to a deploy.
+	pruneErr := d.store.PruneDeployHistory(w.ID, deployHistoryKeepPerWorkload)
+	if pruneErr != nil {
+		slog.Warn("deploy history: prune failed", "workload", w.ID, "error", pruneErr)
+	}
+}
+
+// effectiveReference picks the artifact handle to store in the deploy ledger
+// (and, for rollback-capable sources, to replay later).
+//
+// The default is the trigger-supplied intent.Reference. For the image source
+// with a non-nil store, the ImageTag on the first row returned by
+// ListContainersByWorkload wins when it is non-empty; that row is expected
+// to be the newest container, so this captures the DefaultTag / "latest"
+// resolution the source performs when intent.Reference was empty (e.g. a
+// manual deploy with no override). A lookup error or an empty result falls
+// back to intent.Reference.
+//
+// For static/dockerfile the git trigger already supplies the commit SHA as
+// intent.Reference; a manual deploy of those may record an empty reference
+// (acceptable — they are not rollback-capable in this phase). compose has no
+// single artifact handle.
+func (d *Deployer) effectiveReference(w plugin.Workload, intent plugin.DeploymentIntent) string {
+	if w.SourceKind != "image" || d.store == nil {
+		return intent.Reference
+	}
+	containers, err := d.store.ListContainersByWorkload(w.ID)
+	if err != nil || len(containers) == 0 {
+		return intent.Reference
+	}
+	if newest := containers[0].ImageTag; newest != "" {
+		return newest
+	}
+	return intent.Reference
+}
@@ -5,6 +5,7 @@
 package deployer

 import (
+	"context"
 	"fmt"
 	"log/slog"
 	"sync"
@@ -14,9 +15,11 @@ import (
 	"github.com/alexei/tinyforge/internal/docker"
 	"github.com/alexei/tinyforge/internal/events"
 	"github.com/alexei/tinyforge/internal/health"
+	"github.com/alexei/tinyforge/internal/keyedmutex"
 	"github.com/alexei/tinyforge/internal/notify"
 	"github.com/alexei/tinyforge/internal/proxy"
 	"github.com/alexei/tinyforge/internal/store"
+	"github.com/alexei/tinyforge/internal/workload/plugin"
 )

 // Deployer owns the dependency bundle each Source plugin needs at deploy
@@ -34,9 +37,44 @@ type Deployer struct {
 	dnsMu    sync.RWMutex
 	dns      dns.Provider // nil when wildcard DNS is active

+	// proxyMu protects hot-swap of d.proxy from runtime settings updates
+	// (SetProxyProvider) racing with PluginDeps() reads on the deploy path.
+	proxyMu sync.RWMutex
+
 	// Graceful shutdown: tracks in-progress deploys.
+	//
+	// drainMu serializes the "is-draining check + activeWg.Add(1)" in
+	// beginDispatch against the "set shuttingDown + Wait()" in Drain. Without
+	// it, a dispatch could pass the draining check, Drain could then flip the
+	// flag and start Wait() with a zero counter, and the dispatch could call
+	// Add(1) concurrently with Wait — a documented sync.WaitGroup misuse
+	// (panic risk) that also lets a deploy slip past the drain barrier.
+	drainMu      sync.Mutex
 	activeWg     sync.WaitGroup
 	shuttingDown atomic.Bool
+
+	// workloadLocks serializes deploy-class operations per workload id so two
+	// concurrent mutators of the same workload (a manual deploy, a webhook/
+	// trigger dispatch, a rollback, a promote, OR a volume-snapshot restore)
+	// can never interleave their container/volume changes. Every deploy
+	// entrypoint funnels through DispatchPlugin, so locking there gates them
+	// all at one choke point. This is the per-workload lock activeWg is NOT
+	// (activeWg is a global drain barrier for graceful shutdown).
+	workloadLocks keyedmutex.Mutex
+}
+
+// LockWorkload takes the per-workload deploy lock for id on behalf of an
+// external critical section (volume-snapshot restore) and hands back the
+// func that releases it. The restore flow keeps the lock across
+// stop→swap→redeploy and redeploys through RedeployLocked, which does NOT
+// acquire it again.
+func (d *Deployer) LockWorkload(id string) func() {
+	release := d.workloadLocks.Lock(id)
+	return release
+}
+
+// RedeployLocked runs a deploy of w without taking the per-workload lock.
+// It exists for the restore flow, which already holds that lock through
+// LockWorkload: going through DispatchPlugin from there would block forever
+// because Go mutexes are not reentrant. Not for general use.
+func (d *Deployer) RedeployLocked(ctx context.Context, w plugin.Workload, intent plugin.DeploymentIntent) error {
+	// The lock is owned by the caller; go straight to the shared body.
+	err := d.dispatchLocked(ctx, w, intent)
+	return err
 }

 // EventPublisher is the interface for publishing events to the event bus.
@@ -73,7 +111,11 @@ func New(
 }

 // SetProxyProvider updates the proxy provider at runtime (e.g., when settings change).
+// Guarded by proxyMu so concurrent deploys that read d.proxy via PluginDeps()
+// observe a coherent value (previously a torn-pointer race under -race).
 func (d *Deployer) SetProxyProvider(provider proxy.Provider) {
+	// Exclusive lock: PluginDeps snapshots d.proxy under the matching RLock.
+	d.proxyMu.Lock()
+	defer d.proxyMu.Unlock()
 	d.proxy = provider
 }

@@ -84,20 +126,34 @@ func (d *Deployer) SetPreDeployBackuper(b PreDeployBackuper) {
 	d.backuper = b
 }

-// MaybeBackupBeforeDeploy creates a "pre-deploy" Tinyforge DB snapshot when
-// the setting is enabled. Failures are logged but do not abort the deploy:
-// missing a backup is preferable to refusing to ship a fix. Exposed so
-// Source plugins can opt into the same behaviour.
-func (d *Deployer) MaybeBackupBeforeDeploy(deployID string, settings store.Settings) {
-	if !settings.AutoBackupBeforeDeploy || d.backuper == nil {
+// maybeBackupBeforeDeploy takes a "pre-deploy" Tinyforge DB snapshot before a
+// deploy when the operator enabled auto_backup_before_deploy. It is called on
+// the unified deploy path (DispatchPlugin) so the setting actually fires — its
+// predecessor was orphaned when the legacy executeDeploy pipeline (its only
+// caller) was removed in the workload-first cutover, silently disabling the
+// setting.
+//
+// Fail-open: a nil backuper, a nil store, a settings-load error, or a backup
+// failure all skip the snapshot without blocking the deploy — missing a
+// backup is preferable to refusing to ship a fix. workloadID is used only to
+// tag the log lines.
+func (d *Deployer) maybeBackupBeforeDeploy(workloadID string) {
+	// Without a store the setting cannot be read; skip instead of
+	// dereferencing a nil store (same guard the deploy-history helpers use).
+	if d.backuper == nil || d.store == nil {
+		return
+	}
+	settings, err := d.store.GetSettings()
+	if err != nil {
+		slog.Warn("pre-deploy backup: load settings", "workload", workloadID, "error", err)
+		return
+	}
+	if !settings.AutoBackupBeforeDeploy {
 		return
 	}
 	backup, err := d.backuper.CreateBackup("pre-deploy")
 	if err != nil {
-		slog.Warn("pre-deploy backup failed", "deploy_id", deployID, "error", err)
+		slog.Warn("pre-deploy backup failed", "workload", workloadID, "error", err)
 		return
 	}
-	slog.Info("pre-deploy backup created", "deploy_id", deployID, "backup_id", backup.ID, "filename", backup.Filename)
+	slog.Info("pre-deploy backup created", "workload", workloadID, "backup_id", backup.ID, "filename", backup.Filename)
 }

 // SetDNSProvider sets the DNS provider for managing DNS records during deployments.
@@ -110,8 +166,11 @@ func (d *Deployer) SetDNSProvider(provider dns.Provider) {

 // Drain waits for all in-progress deploys to complete. Call this during graceful shutdown.
 func (d *Deployer) Drain() {
-	if !d.shuttingDown.CompareAndSwap(false, true) {
-		// Already draining.
+	d.drainMu.Lock()
+	already := d.shuttingDown.Swap(true)
+	d.drainMu.Unlock()
+	if already {
+		slog.Info("deployer: drain already in progress")
 	}
 	slog.Info("deployer: draining in-progress deploys")
 	d.activeWg.Wait()
@@ -121,11 +180,17 @@ func (d *Deployer) Drain() {
 // ShuttingDown reports whether Drain() has been called.
+// It is a single atomic load of the shuttingDown flag and does not take drainMu.
 func (d *Deployer) ShuttingDown() bool { return d.shuttingDown.Load() }

-// rejectIfDraining is exposed in case any plugin wants the same hard-stop
-// behaviour the legacy pipeline used.
-func (d *Deployer) rejectIfDraining() error {
+// beginDispatch atomically rejects when draining and otherwise registers the
+// in-flight unit on activeWg. The shuttingDown check and the Add(1) MUST be
+// done together under drainMu (see the field comment): Drain sets the flag
+// under the same mutex before Wait(), so once Wait() observes a zero counter
+// no further Add can race it. Callers must defer d.activeWg.Done() on success.
+//
+// Returns a non-nil error, leaving activeWg untouched, when the deployer is
+// shutting down; returns nil after a successful Add(1).
+func (d *Deployer) beginDispatch() error {
+	d.drainMu.Lock()
+	defer d.drainMu.Unlock()
 	if d.shuttingDown.Load() {
 		return fmt.Errorf("deployer is shutting down, rejecting new deploy")
 	}
+	// Still under drainMu: register the in-flight unit before Drain can flip
+	// the flag and reach Wait().
+	d.activeWg.Add(1)
 	return nil
 }
@@ -4,26 +4,76 @@ import (
 	"context"
 	"fmt"

+	"github.com/alexei/tinyforge/internal/metrics"
+	"github.com/alexei/tinyforge/internal/store"
 	"github.com/alexei/tinyforge/internal/workload/plugin"
 )

 // DispatchPlugin routes a DeploymentIntent for w to the matching Source
-// plugin. This is the new unified deploy path; the legacy executeDeploy
-// remains in place until Phase 6 ports image-deploy logic into
-// source/image. While both exist, callers must pick: webhook/registry
-// triggers + image deploys still go through the legacy path, while
-// /api/hooks/generic + the unified webhook ingress go through here.
+// plugin. Every source kind deploys through here (the legacy executeDeploy
+// pipeline was removed in the workload-first cutover). With
+// auto_backup_before_deploy enabled, a pre-deploy Tinyforge DB snapshot is
+// taken on this path, after the source resolves and before it runs.
 func (d *Deployer) DispatchPlugin(ctx context.Context, w plugin.Workload, intent plugin.DeploymentIntent) error {
+	// C1: one deploy-class operation per workload at a time. Lock(w.ID) runs
+	// right here; only the release func it returns is deferred, so the lock
+	// spans the whole dispatch and a concurrent deploy/rollback/promote/
+	// trigger can never interleave container changes for the same workload.
+	// A volume restore holds this same lock and redeploys through
+	// RedeployLocked instead of re-entering here.
+	defer d.workloadLocks.Lock(w.ID)()
+	return d.dispatchLocked(ctx, w, intent)
+}
+
+// dispatchLocked is DispatchPlugin's body, assuming the per-workload lock is
+// already held. RedeployLocked calls it directly during restore.
+//
+// Returns the drain-rejection error from beginDispatch, a wrapped GetSource
+// error for an unregistered source kind, or whatever the source's Deploy
+// returns. Only the Deploy outcome is written to the deploy ledger; the two
+// early rejections are counted in metrics but not recorded.
+func (d *Deployer) dispatchLocked(ctx context.Context, w plugin.Workload, intent plugin.DeploymentIntent) error {
+	if err := d.beginDispatch(); err != nil {
+		metrics.DeploysTotal.Inc(w.SourceKind, "rejected_draining")
+		return err
+	}
+	// Balances the activeWg.Add(1) that beginDispatch performed on success.
+	defer d.activeWg.Done()
 	src, err := plugin.GetSource(w.SourceKind)
 	if err != nil {
+		// Unknown source: use the constant "unknown" sentinel for the
+		// label so a typo-spam attack can't grow the metrics map with
+		// one series per bogus source_kind. The actual user-supplied
+		// value still surfaces via the wrapped error / event log.
+		metrics.DeploysTotal.Inc("unknown", "unknown_source")
 		return fmt.Errorf("dispatch %s: %w", w.Name, err)
 	}
-	return src.Deploy(ctx, d.PluginDeps(), w, intent)
+	// Optional operator-enabled pre-deploy DB snapshot. Fail-open: never
+	// blocks shipping a deploy. Runs before any source-internal idempotency
+	// check (e.g. the image source's same-tag short-circuit), so a same-tag
+	// redeploy still snapshots — "backup before every deploy attempt".
+	d.maybeBackupBeforeDeploy(w.ID)
+	// Stamped after the backup attempt, so the recorded start excludes any
+	// time spent taking the snapshot.
+	startedAt := store.Now()
+	err = src.Deploy(ctx, d.PluginDeps(), w, intent)
+	outcome := "success"
+	if err != nil {
+		outcome = "failure"
+	}
+	metrics.DeploysTotal.Inc(w.SourceKind, outcome)
+	// Append to the structured deploy ledger (powers the per-app history
+	// panel + rollback). Best-effort and secret-free; see recordDeployHistory.
+	// Only DispatchPlugin records — reconcile/teardown are not deploys.
+	d.recordDeployHistory(w, intent, outcome, err, startedAt)
+	return err
 }

 // DispatchTeardown routes a teardown call to the matching Source plugin.
-// Used when a workload is deleted.
+// Used when a workload is deleted. Tracked via activeWg so Drain() honours
+// in-progress teardowns just like deploys.
 func (d *Deployer) DispatchTeardown(ctx context.Context, w plugin.Workload) error {
+	// Teardown mutates the same containers/routes a deploy does, so it takes the
+	// per-workload lock too (C1). Callers tear down distinct workload ids
+	// sequentially (e.g. preview children then parent), never nested, so no
+	// self-deadlock.
+	unlock := d.workloadLocks.Lock(w.ID)
+	defer unlock()
+	if err := d.beginDispatch(); err != nil {
+		return err
+	}
+	defer d.activeWg.Done()
 	src, err := plugin.GetSource(w.SourceKind)
 	if err != nil {
 		return fmt.Errorf("dispatch teardown %s: %w", w.Name, err)
@@ -33,8 +83,17 @@ func (d *Deployer) DispatchTeardown(ctx context.Context, w plugin.Workload) erro

 // DispatchReconcile routes a Reconcile call. Periodic reconciler iterates
 // every Workload and calls this; idle Sources should make it a cheap
-// no-op.
+// no-op. Tracked via activeWg so a long-running reconcile blocks Drain().
 func (d *Deployer) DispatchReconcile(ctx context.Context, w plugin.Workload) error {
+	if err := d.beginDispatch(); err != nil {
+		// Silent skip — reconcile is a periodic tick, not a user-initiated
+		// action, so we don't want to surface "draining" errors back to the
+		// reconciler loop. The next tick after restart will catch up. Routing
+		// through beginDispatch keeps the activeWg.Add atomic with the drain
+		// check (see Drain) instead of a bare shuttingDown.Load + Add race.
+		return nil
+	}
+	defer d.activeWg.Done()
 	src, err := plugin.GetSource(w.SourceKind)
 	if err != nil {
 		return fmt.Errorf("dispatch reconcile %s: %w", w.Name, err)
@@ -52,10 +111,13 @@ func (d *Deployer) PluginDeps() plugin.Deps {
 	d.dnsMu.RLock()
 	dnsProvider := d.dns
 	d.dnsMu.RUnlock()
+	d.proxyMu.RLock()
+	proxyProvider := d.proxy
+	d.proxyMu.RUnlock()
 	return plugin.Deps{
 		Store:    d.store,
 		Docker:   d.docker,
-		Proxy:    d.proxy,
+		Proxy:    proxyProvider,
 		DNS:      dnsProvider,
 		Health:   d.health,
 		Notifier: d.notifier,
@@ -21,9 +21,9 @@ import (
 type fakeSource struct {
 	kind string

-	mu          sync.Mutex
-	deployErr   error
-	teardownErr error
+	mu           sync.Mutex
+	deployErr    error
+	teardownErr  error
 	reconcileErr error

 	deployCount    atomic.Int32
@@ -34,8 +34,8 @@ type fakeSource struct {
 	lastDeps   plugin.Deps
 }

-func (f *fakeSource) Kind() string                 { return f.kind }
-func (f *fakeSource) SchemaSample() any            { return struct{}{} }
+// Kind, SchemaSample and Validate are minimal stubs: Kind echoes the
+// configured kind string, SchemaSample returns an empty struct, and Validate
+// accepts any config.
+func (f *fakeSource) Kind() string                   { return f.kind }
+func (f *fakeSource) SchemaSample() any              { return struct{}{} }
 func (f *fakeSource) Validate(json.RawMessage) error { return nil }

 func (f *fakeSource) Deploy(_ context.Context, deps plugin.Deps, _ plugin.Workload, intent plugin.DeploymentIntent) error {
@@ -250,6 +250,84 @@ func TestDispatchReconcile_PropagatesSourceError(t *testing.T) {
 	}
 }

+// ---- Deploy history recording ----------------------------------------------
+
+// seedDispatchWorkload creates a real workloads row through the deployer's
+// store — deploy_history has a foreign key on workload_id, so history
+// inserts need one — and returns a plugin.Workload with that row's ID whose
+// SourceKind points at the fake test source.
+func seedDispatchWorkload(t *testing.T, d *Deployer) plugin.Workload {
+	t.Helper()
+
+	const name = "dh"
+	seed := store.Workload{Kind: "project", RefID: name, Name: name}
+	created, err := d.store.CreateWorkload(seed)
+	if err != nil {
+		t.Fatalf("CreateWorkload: %v", err)
+	}
+	return plugin.Workload{ID: created.ID, Name: name, SourceKind: "dispatchertest"}
+}
+
+// TestDispatchPlugin_RecordsSuccessHistory dispatches one successful deploy
+// and checks that exactly one ledger row is written, carrying the intent's
+// reason, reference, trigger identity and note, with an empty error field.
+func TestDispatchPlugin_RecordsSuccessHistory(t *testing.T) {
+	resetFake(t)
+	d := newTestDeployer(t)
+	w := seedDispatchWorkload(t, d)
+
+	intent := plugin.DeploymentIntent{
+		Reason:      "manual",
+		Reference:   "v9",
+		TriggeredBy: "alice",
+		Metadata:    map[string]string{"note": "ship it"},
+	}
+	dispatchErr := d.DispatchPlugin(context.Background(), w, intent)
+	if dispatchErr != nil {
+		t.Fatalf("DispatchPlugin: %v", dispatchErr)
+	}
+
+	history, listErr := d.store.ListDeployHistory(w.ID, 10, 0)
+	if listErr != nil {
+		t.Fatalf("ListDeployHistory: %v", listErr)
+	}
+	if n := len(history); n != 1 {
+		t.Fatalf("expected 1 history row, got %d", n)
+	}
+
+	// Each case aborts the test, so only the first mismatch is reported.
+	row := history[0]
+	switch {
+	case row.Outcome != "success", row.Reason != "manual", row.Reference != "v9":
+		t.Fatalf("unexpected row: %+v", row)
+	case row.TriggeredBy != "alice", row.Note != "ship it":
+		t.Fatalf("intent fields not recorded: %+v", row)
+	case row.Error != "":
+		t.Fatalf("success row must have empty error, got %q", row.Error)
+	}
+}
+
+// TestDispatchPlugin_RecordsFailureWithoutLeakingError makes the fake source
+// fail with an error that embeds a secret-looking string, then checks that
+// the dispatch reports the failure, that one "failure" ledger row exists,
+// and that the row's error field contains none of the raw error text.
+func TestDispatchPlugin_RecordsFailureWithoutLeakingError(t *testing.T) {
+	resetFake(t)
+	d := newTestDeployer(t)
+	w := seedDispatchWorkload(t, d)
+
+	// A deploy error carrying a "secret" must never reach the persisted row.
+	dispatchTestSource.setDeployErr(errors.New("compose up failed (output: SUPER_SECRET=hunter2)"))
+	// Assert the failure actually propagated; otherwise the checks below
+	// would not be exercising the failure path at all.
+	if err := d.DispatchPlugin(context.Background(), w, plugin.DeploymentIntent{Reason: "manual"}); err == nil {
+		t.Fatal("expected DispatchPlugin to return the source's deploy error, got nil")
+	}
+
+	rows, err := d.store.ListDeployHistory(w.ID, 10, 0)
+	if err != nil {
+		t.Fatalf("ListDeployHistory: %v", err)
+	}
+	if len(rows) != 1 {
+		t.Fatalf("expected 1 history row, got %d", len(rows))
+	}
+	if rows[0].Outcome != "failure" {
+		t.Fatalf("expected failure outcome, got %q", rows[0].Outcome)
+	}
+	if strings.Contains(rows[0].Error, "hunter2") || strings.Contains(rows[0].Error, "SECRET") {
+		t.Fatalf("raw error leaked into history: %q", rows[0].Error)
+	}
+}
+
+// TestDispatchReconcile_RecordsNoHistory runs one reconcile and checks that
+// the deploy ledger stays empty: only DispatchPlugin records history.
+func TestDispatchReconcile_RecordsNoHistory(t *testing.T) {
+	resetFake(t)
+	d := newTestDeployer(t)
+	w := seedDispatchWorkload(t, d)
+
+	if err := d.DispatchReconcile(context.Background(), w); err != nil {
+		t.Fatalf("DispatchReconcile: %v", err)
+	}
+	// The query error must be checked: a failed lookup also yields zero
+	// rows, which would let the "no history" assertion pass vacuously.
+	rows, err := d.store.ListDeployHistory(w.ID, 10, 0)
+	if err != nil {
+		t.Fatalf("ListDeployHistory: %v", err)
+	}
+	if len(rows) != 0 {
+		t.Fatalf("reconcile must not write history, got %d rows", len(rows))
+	}
+}
+
 // ---- PluginDeps -------------------------------------------------------------

 func TestPluginDeps_PassesStoreAndEncKey(t *testing.T) {
@@ -0,0 +1,107 @@
+package deployer
+
+import (
+	"context"
+	"errors"
+	"sync/atomic"
+	"testing"
+
+	"github.com/alexei/tinyforge/internal/store"
+	"github.com/alexei/tinyforge/internal/workload/plugin"
+)
+
+// fakeBackuper records pre-deploy backup calls so the dispatch wiring can be
+// asserted. err (when set) simulates a backup failure.
+type fakeBackuper struct {
+	// count is incremented once per CreateBackup call, before err is
+	// consulted, so failed attempts are counted too.
+	count    atomic.Int32
+	// lastType holds the backupType argument of the most recent call.
+	lastType atomic.Value // string
+	// err, when non-nil, is returned by CreateBackup together with a zero
+	// store.Backup.
+	err      error
+}
+
+// CreateBackup counts the call and remembers backupType, then either returns
+// a fixed store.Backup (ID "b1") or, when f.err is set, a zero Backup plus
+// that error.
+func (f *fakeBackuper) CreateBackup(backupType string) (store.Backup, error) {
+	f.count.Add(1)
+	f.lastType.Store(backupType)
+
+	if f.err == nil {
+		return store.Backup{ID: "b1", Filename: "tinyforge-pre-deploy.db"}, nil
+	}
+	return store.Backup{}, f.err
+}
+
+// setAutoBackup loads the current settings from the deployer's store, sets
+// AutoBackupBeforeDeploy to enabled, and saves them back. Any store error
+// fails the test immediately.
+func setAutoBackup(t *testing.T, d *Deployer, enabled bool) {
+	t.Helper()
+
+	current, getErr := d.store.GetSettings()
+	if getErr != nil {
+		t.Fatalf("get settings: %v", getErr)
+	}
+
+	current.AutoBackupBeforeDeploy = enabled
+	saveErr := d.store.UpdateSettings(current)
+	if saveErr != nil {
+		t.Fatalf("update settings: %v", saveErr)
+	}
+}
+
+// Regression guard: after the cutover nothing on the DispatchPlugin path
+// called the pre-deploy backup hook, so auto_backup_before_deploy silently
+// did nothing. With the setting on and a backuper wired, one dispatch must
+// produce exactly one "pre-deploy" backup and exactly one Deploy call.
+func TestDispatchPlugin_PreDeployBackup_FiresWhenEnabled(t *testing.T) {
+	resetFake(t)
+	d := newTestDeployer(t)
+	backuper := new(fakeBackuper)
+	d.SetPreDeployBackuper(backuper)
+	setAutoBackup(t, d, true)
+
+	err := d.DispatchPlugin(context.Background(), sampleWorkload(), plugin.DeploymentIntent{})
+	if err != nil {
+		t.Fatalf("dispatch: %v", err)
+	}
+
+	if calls := backuper.count.Load(); calls != 1 {
+		t.Fatalf("CreateBackup called %d times, want 1", calls)
+	}
+	kind, _ := backuper.lastType.Load().(string)
+	if kind != "pre-deploy" {
+		t.Fatalf("backup type = %q, want pre-deploy", kind)
+	}
+	if deploys := dispatchTestSource.deployCount.Load(); deploys != 1 {
+		t.Fatalf("Deploy ran %d times, want 1", deploys)
+	}
+}
+
+// TestDispatchPlugin_PreDeployBackup_SkippedWhenDisabled wires a backuper
+// but turns the setting off; a dispatch must then make no backup calls.
+func TestDispatchPlugin_PreDeployBackup_SkippedWhenDisabled(t *testing.T) {
+	resetFake(t)
+	d := newTestDeployer(t)
+	backuper := new(fakeBackuper)
+	d.SetPreDeployBackuper(backuper)
+	setAutoBackup(t, d, false)
+
+	err := d.DispatchPlugin(context.Background(), sampleWorkload(), plugin.DeploymentIntent{})
+	if err != nil {
+		t.Fatalf("dispatch: %v", err)
+	}
+
+	if calls := backuper.count.Load(); calls != 0 {
+		t.Fatalf("CreateBackup called %d times, want 0 (setting off)", calls)
+	}
+}
+
+// TestDispatchPlugin_PreDeployBackup_NilBackuperNoPanic turns the setting on
+// without wiring any backuper; the dispatch must still succeed and run
+// Deploy exactly once.
+func TestDispatchPlugin_PreDeployBackup_NilBackuperNoPanic(t *testing.T) {
+	resetFake(t)
+	d := newTestDeployer(t)
+	// Setting enabled, but SetPreDeployBackuper is deliberately never called.
+	setAutoBackup(t, d, true)
+
+	err := d.DispatchPlugin(context.Background(), sampleWorkload(), plugin.DeploymentIntent{})
+	if err != nil {
+		t.Fatalf("dispatch must not panic/fail with a nil backuper: %v", err)
+	}
+
+	if deploys := dispatchTestSource.deployCount.Load(); deploys != 1 {
+		t.Fatalf("Deploy ran %d times, want 1", deploys)
+	}
+}
+
+// TestDispatchPlugin_PreDeployBackup_FailOpen makes the backuper fail and
+// checks the fail-open contract: the dispatch still succeeds and Deploy
+// still runs exactly once.
+func TestDispatchPlugin_PreDeployBackup_FailOpen(t *testing.T) {
+	resetFake(t)
+	d := newTestDeployer(t)
+	failing := &fakeBackuper{err: errors.New("disk full")}
+	d.SetPreDeployBackuper(failing)
+	setAutoBackup(t, d, true)
+
+	// The backup error may be logged, but it must never reach the caller.
+	err := d.DispatchPlugin(context.Background(), sampleWorkload(), plugin.DeploymentIntent{})
+	if err != nil {
+		t.Fatalf("deploy must succeed when backup fails (fail-open): %v", err)
+	}
+
+	if deploys := dispatchTestSource.deployCount.Load(); deploys != 1 {
+		t.Fatalf("Deploy ran %d times, want 1 (despite backup failure)", deploys)
+	}
+}
@@ -2,20 +2,58 @@ package docker

 import (
 	"archive/tar"
+	"bufio"
 	"context"
+	"encoding/json"
 	"fmt"
 	"io"
 	"os"
 	"path/filepath"
 	"strings"

+	"github.com/moby/moby/api/types/build"
 	"github.com/moby/moby/client"
 )

-// BuildImage builds a Docker image from a directory containing a Dockerfile.
-// The directory is packaged as a tar archive and sent to the Docker daemon.
-// The tag parameter is the image name:tag to apply (e.g., "dw-site-myapp:latest").
+// BuildImage builds a Docker image from contextDir using the Dockerfile at
+// the context root and tags it as tag. It is a thin wrapper around
+// BuildImageAt with the Dockerfile path fixed to "Dockerfile" and no log
+// callback, kept for the static-site plugin which always emits its generated
+// Dockerfile at the context root. New code should call BuildImageAt so the
+// Dockerfile path is explicit.
 func (c *Client) BuildImage(ctx context.Context, contextDir, tag string) error {
+	const rootDockerfile = "Dockerfile"
+	return c.BuildImageAt(ctx, contextDir, rootDockerfile, tag, nil)
+}
+
+// BuildImageAt builds a Docker image from a tar of contextDir, using the
+// Dockerfile at `dockerfile` *inside* the context (typically "Dockerfile"
+// but may be e.g. "docker/Dockerfile" when the user-supplied repo layout
+// keeps Dockerfiles in a subfolder).
+//
+// The dockerfile argument is the path *relative to contextDir*. Empty
+// strings are normalised to "Dockerfile" so callers can pass through a
+// user config value without sanitising twice.
+//
+// logFn, if non-nil, is invoked for every non-empty `stream` line the
+// daemon emits during the build. Callers use this to forward live build
+// progress (e.g. SSE bus). Errors from the daemon are NOT delivered via
+// logFn — they surface as the returned error so the caller's failure
+// path stays the single source of truth.
+func (c *Client) BuildImageAt(ctx context.Context, contextDir, dockerfile, tag string, logFn func(line string)) error {
+	if dockerfile == "" {
+		dockerfile = "Dockerfile"
+	}
+	// Normalise to forward slashes — the tar entry names use them and the
+	// Docker daemon expects the same.
+	dockerfile = filepath.ToSlash(dockerfile)
+	// Defence-in-depth: the dockerfile path is relative to contextDir and
+	// is increasingly user/config-supplied (subfolder Dockerfiles). Reject
+	// absolute paths and any `..` traversal at the boundary so a value like
+	// "../../etc/passwd" can never be handed to the daemon's build options,
+	// regardless of which builder backend resolves it.
+	if filepath.IsAbs(dockerfile) || strings.HasPrefix(dockerfile, "/") ||
+		dockerfile == ".." || strings.HasPrefix(dockerfile, "../") || strings.Contains(dockerfile, "/../") {
+		return fmt.Errorf("docker build: invalid dockerfile path %q (must be relative to the build context, no traversal)", dockerfile)
+	}
 	// Create tar archive of the build context.
 	pr, pw := io.Pipe()

@@ -50,16 +88,14 @@ func (c *Client) BuildImage(ctx context.Context, contextDir, tag string) error {
 				return nil
 			}

-			file, err := os.Open(path)
-			if err != nil {
-				return fmt.Errorf("open %s: %w", path, err)
+			// Open, copy and close each file inside a helper so the handle
+			// is released as soon as its bytes are in the tar stream and a
+			// Close error is reported instead of discarded. (A `defer` in
+			// this walk callback would run when the callback returns for
+			// that file, not at the end of the walk.)
+			if err := streamFileIntoTar(tw, path, relPath); err != nil {
+				return err
 			}
-			defer file.Close()
-
-			if _, err := io.Copy(tw, file); err != nil {
-				return fmt.Errorf("copy %s to tar: %w", relPath, err)
-			}
-
 			return nil
 		})

@@ -69,8 +105,16 @@ func (c *Client) BuildImage(ctx context.Context, contextDir, tag string) error {
 		pw.CloseWithError(err)
 	}()

+	// Pin the legacy builder explicitly. On Docker Engine 23+ BuildKit
+	// is the default for the CLI, but the daemon honours the explicit
+	// Version field on ImageBuildOptions. Legacy builder does NOT support
+	// `RUN --mount=type=bind,source=/host` so a malicious Dockerfile
+	// cannot mount host paths into the build context. Switching to
+	// BuildKit later requires (a) Dockerfile-content validation to
+	// reject bind-mount hints, or (b) an explicit per-workload opt-in.
 	resp, err := c.api.ImageBuild(ctx, pr, client.ImageBuildOptions{
-		Dockerfile:  "Dockerfile",
+		Version:     build.BuilderV1,
+		Dockerfile:  dockerfile,
 		Tags:        []string{tag},
 		Remove:      true,
 		ForceRemove: true,
@@ -80,16 +124,71 @@ func (c *Client) BuildImage(ctx context.Context, contextDir, tag string) error {
 	}
 	defer resp.Body.Close()

-	// Read the build output to completion (required for the build to finish).
-	output, err := io.ReadAll(resp.Body)
-	if err != nil {
+	// Drain the daemon's NDJSON stream to completion. The stream MUST
+	// be read for the build to finish — closing the body early aborts
+	// the build. We parse line-by-line into the {Stream, Error} shape
+	// the daemon emits so an honest `{"error":"..."}` line surfaces
+	// without false positives from informational `{"stream":"error
+	// handling: retrying..."}` chatter that the old strings.Contains
+	// path would have flagged.
+	type buildLine struct {
+		Stream string `json:"stream,omitempty"`
+		Error  string `json:"error,omitempty"`
+	}
+	scanner := bufio.NewScanner(resp.Body)
+	// Some build steps emit single lines exceeding the default 64 KiB
+	// token limit (e.g. a fat go-mod-download dump), which would stop the
+	// scan with bufio.ErrTooLong. Allow lines up to 1 MiB instead.
+	scanner.Buffer(make([]byte, 64*1024), 1024*1024)
+	var firstErr string
+	for scanner.Scan() {
+		line := scanner.Bytes()
+		if len(line) == 0 {
+			continue
+		}
+		var bl buildLine
+		if err := json.Unmarshal(line, &bl); err != nil {
+			// Non-JSON line — daemon shouldn't produce these, but
+			// don't fail the build over a parse hiccup.
+			continue
+		}
+		if bl.Error != "" && firstErr == "" {
+			firstErr = bl.Error
+		}
+		if logFn != nil && bl.Stream != "" {
+			logFn(bl.Stream)
+		}
+	}
+	if err := scanner.Err(); err != nil {
 		return fmt.Errorf("read build output for %s: %w", tag, err)
 	}
-
-	// Check for error in build output.
-	if strings.Contains(string(output), `"error"`) {
-		return fmt.Errorf("build image %s: build errors in output", tag)
+	if firstErr != "" {
+		return fmt.Errorf("build image %s: %s", tag, firstErr)
 	}

 	return nil
 }
+
+// streamFileIntoTar copies the contents of the file at path into tw and
+// closes the file again before returning, so a walk over a large build
+// context never holds more than one source file open through this helper.
+// It writes file data only; the caller is expected to have emitted the
+// matching tar header already. relPath is used solely to label copy errors.
+//
+// An open failure is reported as "open <path>: ...". A copy failure — or,
+// when the copy succeeded, a Close failure — is reported as
+// "copy <relPath> to tar: ...".
+func streamFileIntoTar(tw *tar.Writer, path, relPath string) error {
+	src, openErr := os.Open(path)
+	if openErr != nil {
+		return fmt.Errorf("open %s: %w", path, openErr)
+	}
+
+	// Close explicitly rather than via defer so its error can be inspected;
+	// it is only surfaced when the copy itself did not already fail.
+	_, err := io.Copy(tw, src)
+	closeErr := src.Close()
+	if err == nil {
+		err = closeErr
+	}
+	if err == nil {
+		return nil
+	}
+	return fmt.Errorf("copy %s to tar: %w", relPath, err)
+}
@@ -108,3 +108,29 @@ func (c *Client) GetSystemStats(ctx context.Context) (SystemStats, error) {

 	return stats, nil
 }
+
+// BuildCachePruneResult reports the outcome of a build-cache prune. It is the
+// JSON-serialisable summary PruneBuildCache derives from the daemon's report.
+type BuildCachePruneResult struct {
+	CachesDeleted  int   `json:"caches_deleted"`  // number of cache records removed
+	SpaceReclaimed int64 `json:"space_reclaimed"` // bytes reclaimed
+}
+
+// PruneBuildCache asks the Docker daemon to prune its build cache and
+// summarises the daemon's report as a count of removed cache records plus
+// the number of bytes reclaimed. The build-cache API only prunes by filter —
+// individual records cannot be evicted — so this is always a daemon-wide
+// operation.
+//
+// With all=false (the default) only build cache that is not currently in use
+// is removed, so an app's next rebuild still hits its warm cache. With
+// all=true every build-cache record is removed regardless of use, forcing a
+// cold rebuild for every app.
+//
+// A daemon error is returned wrapped, together with a zero result.
+func (c *Client) PruneBuildCache(ctx context.Context, all bool) (BuildCachePruneResult, error) {
+	opts := client.BuildCachePruneOptions{All: all}
+	pruned, err := c.api.BuildCachePrune(ctx, opts)
+	if err != nil {
+		return BuildCachePruneResult{}, fmt.Errorf("prune build cache: %w", err)
+	}
+
+	var out BuildCachePruneResult
+	out.CachesDeleted = len(pruned.Report.CachesDeleted)
+	out.SpaceReclaimed = int64(pruned.Report.SpaceReclaimed)
+	return out, nil
+}
@@ -27,6 +27,13 @@ const (

 	// EventStackStatus is emitted when a compose stack status changes.
 	EventStackStatus EventType = "stack_status"
+
+	// EventBuildLog is emitted for each line of a streaming image build.
+	// Per-line events are ephemeral (not persisted to the event_log) — they
+	// exist to drive a live tail UI during the slow "building" phase of a
+	// dockerfile-source deploy. Subscribers should filter by WorkloadID
+	// because every dockerfile deploy on the box publishes on the same bus.
+	EventBuildLog EventType = "build_log"
 )

 // Event is a single event published on the bus.
@@ -62,12 +69,13 @@ type DeployStatusPayload struct {

 // EventLogPayload is the payload for EventLog events (audit trail).
 type EventLogPayload struct {
-	ID        int64  `json:"id"`
-	Source    string `json:"source"`
-	Severity  string `json:"severity"`
-	Message   string `json:"message"`
-	Metadata  string `json:"metadata"`
-	CreatedAt string `json:"created_at"`
+	ID         int64  `json:"id"`
+	Source     string `json:"source"`
+	WorkloadID string `json:"workload_id"`
+	Severity   string `json:"severity"`
+	Message    string `json:"message"`
+	Metadata   string `json:"metadata"`
+	CreatedAt  string `json:"created_at"`
 }

 // StaticSiteStatusPayload is the payload for EventStaticSiteStatus events.
@@ -77,6 +85,14 @@ type StaticSiteStatusPayload struct {
 	Status string `json:"status"`
 }

+// BuildLogPayload is the payload for EventBuildLog events. One event
+// per non-empty line read off the daemon's NDJSON build stream.
+type BuildLogPayload struct {
+	WorkloadID string `json:"workload_id"`      // workload the line belongs to; subscribers filter on it
+	Line       string `json:"line"`             // one line of build output
+	Stream     string `json:"stream,omitempty"` // optional, dropped from JSON when empty. NOTE(review): meaning not visible here — confirm with the publisher
+}
+
 // StackStatusPayload is the payload for EventStackStatus events.
 type StackStatusPayload struct {
 	StackID string `json:"stack_id"`
@@ -0,0 +1,83 @@
+package gitops
+
+// source_config JSON keys this package can overlay. Kept as constants so the
+// apply, merge, and drift paths agree on the exact key strings.
+const (
+	keyPort           = "port"
+	keyHealthcheck    = "healthcheck"
+	keyDeployStrategy = "deploy_strategy"
+)
+
+// Source kinds eligible for GitOps in v1 (git-backed sources only).
+const (
+	SourceDockerfile = "dockerfile"
+	SourceStatic     = "static"
+)
+
+// supportedKeys returns, as a set, the source_config keys that a
+// .tinyforge.yml overlay may manage for the given source kind. A field the
+// file declares outside this set is ignored — neither applied nor
+// drift-compared — so one shared file can serve either source kind without
+// dead keys or false drift.
+//
+//	dockerfile: port, healthcheck, deploy_strategy (its real run knobs)
+//	static:     deploy_strategy only (a static site has no port/healthcheck)
+//
+// Any other source kind yields nil, which callers treat as "not eligible".
+// A fresh map is built on every call, so callers may mutate the result.
+func supportedKeys(sourceKind string) map[string]bool {
+	var overlayable []string
+	switch sourceKind {
+	case SourceDockerfile:
+		overlayable = []string{keyPort, keyHealthcheck, keyDeployStrategy}
+	case SourceStatic:
+		overlayable = []string{keyDeployStrategy}
+	}
+	if overlayable == nil {
+		return nil
+	}
+
+	set := make(map[string]bool, len(overlayable))
+	for _, key := range overlayable {
+		set[key] = true
+	}
+	return set
+}
+
+// IsEligibleSource reports whether GitOps may be enabled for a source kind,
+// i.e. whether supportedKeys knows any overlayable keys for it.
+func IsEligibleSource(sourceKind string) bool {
+	allowed := supportedKeys(sourceKind)
+	return allowed != nil
+}
+
+// ApplyPlan is the typed, multi-target plan for applying an overlay. In v1 only
+// SourceConfigPatch is populated; EnvUpserts/Faces are reserved so env (the
+// workload_env table) and faces (the public_faces column) can be added later
+// without reshaping the apply path — they are NOT in v1 (env would re-open the
+// secrets-in-repo hole; faces live in a sibling store).
+type ApplyPlan struct {
+	// SourceConfigPatch holds the source_config keys to overlay onto the live
+	// config. Only keys supported by the target source are present. Values
+	// keep the Go types decoded from the overlay file (int for port, string
+	// for healthcheck and deploy_strategy). BuildPlan always sets a non-nil,
+	// possibly empty, map.
+	SourceConfigPatch map[string]any
+
+	// reserved for future phases — see package doc.
+	// EnvUpserts []store.WorkloadEnv
+	// Faces      []plugin.PublicFace
+}
+
+// declaredValues collects the overlay fields the file actually declared
+// (non-nil pointers in spec.Deploy), keyed by their source_config JSON key and
+// dereferenced to plain values. No per-source filtering happens here; BuildPlan
+// and Drift both start from this map so they agree on what was declared.
+// The result is never nil.
+func declaredValues(spec Spec) map[string]any {
+	deploy := spec.Deploy
+	declared := map[string]any{}
+	if port := deploy.Port; port != nil {
+		declared[keyPort] = *port
+	}
+	if check := deploy.Healthcheck; check != nil {
+		declared[keyHealthcheck] = *check
+	}
+	if strategy := deploy.DeployStrategy; strategy != nil {
+		declared[keyDeployStrategy] = *strategy
+	}
+	return declared
+}
+
+// BuildPlan turns a parsed overlay into the patch to apply for one source
+// kind: every declared field whose key that source supports is copied into
+// SourceConfigPatch, and everything else (absent or unsupported) is dropped.
+// For a source kind with no supported keys the patch is empty but non-nil.
+func BuildPlan(spec Spec, sourceKind string) ApplyPlan {
+	declared := declaredValues(spec)
+	allowed := supportedKeys(sourceKind)
+
+	patch := map[string]any{}
+	for key, value := range declared {
+		if !allowed[key] {
+			continue
+		}
+		patch[key] = value
+	}
+	return ApplyPlan{SourceConfigPatch: patch}
+}
@@ -0,0 +1,122 @@
+package gitops
+
+import (
+	"encoding/json"
+	"fmt"
+	"strconv"
+)
+
+// DriftEntry is one field where the repo-declared value differs from the live
+// stored value. Values are display strings; comparison is done on normalized
+// forms so cosmetic differences (default coercion, YAML int vs JSON number)
+// don't register as drift.
+type DriftEntry struct {
+	Field     string `json:"field"`      // source_config key that drifted
+	RepoValue string `json:"repo_value"` // declared value, rendered for display
+	LiveValue string `json:"live_value"` // live value, rendered for display; "(unset)" when the key is absent
+}
+
+// driftFieldOrder is the stable order drift entries are reported in.
+var driftFieldOrder = []string{keyPort, keyHealthcheck, keyDeployStrategy}
+
+// Drift reports which declared overlay fields differ from the live
+// source_config.
+//
+// Only fields that the file declares AND the source kind supports are
+// examined; a key the file omits is "unmanaged" — neither drift nor clean
+// (review C5). Values are compared after normalizeField, and entries come back
+// in driftFieldOrder. The result is nil when nothing drifted.
+//
+// An empty live blob is treated as an empty config. The only error is a live
+// blob that is not valid JSON for an object.
+func Drift(spec Spec, live json.RawMessage, sourceKind string) ([]DriftEntry, error) {
+	liveCfg := map[string]any{}
+	if len(live) > 0 {
+		if err := json.Unmarshal(live, &liveCfg); err != nil {
+			return nil, fmt.Errorf("gitops: decode live source_config: %w", err)
+		}
+	}
+
+	declared, allowed := declaredValues(spec), supportedKeys(sourceKind)
+
+	var drifted []DriftEntry
+	for _, field := range driftFieldOrder {
+		want, isDeclared := declared[field]
+		if !(isDeclared && allowed[field]) {
+			continue
+		}
+		have, inLive := liveCfg[field]
+		if normalizeField(field, want) != normalizeField(field, have) {
+			drifted = append(drifted, DriftEntry{
+				Field:     field,
+				RepoValue: displayField(field, want, true),
+				LiveValue: displayField(field, have, inLive),
+			})
+		}
+	}
+	return drifted, nil
+}
+
+// normalizeField returns the canonical string used to compare a field's
+// repo-declared and live values:
+//
+//   - port is reduced to a base-10 number string via canonInt;
+//   - deploy_strategy maps "" (and a missing value) to "recreate", because
+//     both mean the same effective strategy for dockerfile and static (see
+//     each source's effectiveStrategy);
+//   - every other key is compared as its plain string form.
+func normalizeField(key string, v any) string {
+	if key == keyPort {
+		return canonInt(v)
+	}
+	text := toStr(v)
+	if key == keyDeployStrategy && text == "" {
+		return "recreate"
+	}
+	return text
+}
+
+// displayField renders a value for the drift report shown in the UI.
+//
+// present=false means the key is absent from the live config and renders as
+// "(unset)", as does a nil value. An empty deploy_strategy renders as
+// "recreate (default)". Numbers decoded from JSON (float64) are printed as
+// plain integers when they are whole, and in shortest decimal form otherwise,
+// so a fractional or oversized live value is shown as stored rather than
+// silently truncated. Everything else is formatted with fmt.Sprint.
+func displayField(key string, v any, present bool) string {
+	if !present {
+		return "(unset)"
+	}
+	if key == keyDeployStrategy && toStr(v) == "" {
+		return "recreate (default)"
+	}
+	switch n := v.(type) {
+	case float64:
+		// Integers up to 2^53 in magnitude are exact in a float64, so the
+		// int64 round-trip below is lossless for them. NaN fails every
+		// comparison and falls through to FormatFloat.
+		const maxExact = 1 << 53
+		if n >= -maxExact && n <= maxExact && n == float64(int64(n)) {
+			return strconv.FormatInt(int64(n), 10)
+		}
+		return strconv.FormatFloat(n, 'f', -1, 64)
+	case nil:
+		return "(unset)"
+	default:
+		return fmt.Sprint(v)
+	}
+}
+
+// canonInt coerces a numeric value (YAML int, JSON float64, json.Number, ...)
+// to a base-10 string for value-equality comparison. nil — an absent or null
+// field — compares as "0". Non-numeric values fall back to fmt.Sprint.
+//
+// A float64 is rendered as an integer only when it is a whole number; a
+// fractional or out-of-range value keeps its decimal form, so it can never
+// compare equal to the integer it would truncate to and hide real drift.
+func canonInt(v any) string {
+	switch n := v.(type) {
+	case nil:
+		return "0"
+	case int:
+		return strconv.Itoa(n)
+	case int64:
+		return strconv.FormatInt(n, 10)
+	case float64:
+		// Integers up to 2^53 in magnitude are exact in a float64, so the
+		// int64 round-trip below is lossless for them. NaN fails every
+		// comparison and falls through to FormatFloat.
+		const maxExact = 1 << 53
+		if n >= -maxExact && n <= maxExact && n == float64(int64(n)) {
+			return strconv.FormatInt(int64(n), 10)
+		}
+		return strconv.FormatFloat(n, 'f', -1, 64)
+	case json.Number:
+		return n.String()
+	default:
+		return fmt.Sprint(v)
+	}
+}
+
+// toStr renders a decoded config value as a string for comparison and
+// display: nil becomes "", a string is returned unchanged, and any other
+// value is formatted with fmt.Sprint.
+func toStr(v any) string {
+	switch s := v.(type) {
+	case nil:
+		return ""
+	case string:
+		return s
+	default:
+		return fmt.Sprint(v)
+	}
+}
@@ -0,0 +1,96 @@
+package gitops
+
+import (
+	"context"
+	"errors"
+	"strings"
+
+	"github.com/alexei/tinyforge/internal/staticsite"
+)
+
+// maxConfigBytes caps the .tinyforge.yml fetch. The file is tiny; the cap
+// stops a hostile/misconfigured repo from streaming an unbounded body.
+const maxConfigBytes = 64 * 1024
+
+// Status is the outcome of a Fetch. All outcomes are values (not errors) so a
+// caller always has something to show: an absent file or a provider blip is a
+// normal state, not a 500.
+type Status string
+
+const (
+	StatusOK          Status = "ok"           // file present and parsed
+	StatusNoFile      Status = "no_file"      // GitOps enabled, no file at path
+	StatusFetchFailed Status = "fetch_failed" // transport/auth/5xx error
+	StatusInvalid     Status = "invalid"      // file present but failed to parse
+)
+
+// RepoRef is the minimal repo locator Fetch needs. The caller (API layer)
+// extracts these from the workload's source_config and decrypts the token —
+// this package stays decoupled from the store and source plugins.
+type RepoRef struct {
+	Provider string // "gitea" | "github" | "gitlab" | "" (autodetect from BaseURL)
+	BaseURL  string
+	Owner    string
+	Repo     string
+	Branch   string
+	Token    string // decrypted; "" for public repos
+	Path     string // repo-relative file path; defaults to .tinyforge.yml
+}
+
+// Result carries everything the API/UI needs about a fetch. Message is a
+// human-safe, token-redacted detail for non-ok statuses.
+type Result struct {
+	Status    Status // outcome of the fetch; always set by Fetch
+	Raw       []byte // file bytes as downloaded; set for ok and invalid, nil otherwise
+	Spec      Spec   // parsed overlay; only meaningful when Status is ok
+	CommitSHA string // best-effort commit SHA lookup result; empty when provider construction failed
+	Message   string // detail for fetch_failed (token-redacted) and invalid (parse error); empty for ok and no_file
+}
+
+// Fetch downloads the workload's .tinyforge.yml (ref.Path, defaulting to
+// ".tinyforge.yml") and parses it. It never returns an error: every outcome is
+// encoded in Result.Status, with detail in Result.Message.
+//
+//   - provider cannot be constructed, or the download fails → StatusFetchFailed
+//     (message has the token redacted);
+//   - the provider reports ErrFileNotFound → StatusNoFile, which is a normal
+//     state and never a reason to block or clear config;
+//   - the file fails to parse → StatusInvalid, with the raw bytes attached;
+//   - otherwise → StatusOK with Raw and Spec populated.
+func Fetch(ctx context.Context, ref RepoRef) Result {
+	provider, err := staticsite.NewGitProvider(staticsite.ProviderType(ref.Provider), ref.BaseURL, ref.Token)
+	if err != nil {
+		return Result{Status: StatusFetchFailed, Message: redact(err, ref.Token)}
+	}
+
+	// The SHA only tells the UI which commit the file came from, so a lookup
+	// failure is deliberately ignored — the file read below is what matters.
+	sha, _ := provider.GetLatestCommitSHA(ctx, ref.Owner, ref.Repo, ref.Branch)
+	res := Result{CommitSHA: sha}
+
+	filePath := ref.Path
+	if filePath == "" {
+		filePath = ".tinyforge.yml"
+	}
+
+	data, err := provider.DownloadFile(ctx, ref.Owner, ref.Repo, ref.Branch, filePath, maxConfigBytes)
+	switch {
+	case err == nil:
+		// fall through to parsing
+	case errors.Is(err, staticsite.ErrFileNotFound):
+		res.Status = StatusNoFile
+		return res
+	default:
+		res.Status = StatusFetchFailed
+		res.Message = redact(err, ref.Token)
+		return res
+	}
+
+	res.Raw = data
+	spec, err := ParseSpec(data)
+	if err != nil {
+		// Parse errors describe YAML structure (line/col), not the token.
+		res.Status = StatusInvalid
+		res.Message = err.Error()
+		return res
+	}
+	res.Status = StatusOK
+	res.Spec = spec
+	return res
+}
+
+// redact returns err's message with every occurrence of token replaced by
+// "[redacted]", so a fetch failure can be surfaced or persisted without
+// leaking the credential (mirrors the sanitizeError convention in the
+// static/dockerfile sources). A nil error yields ""; an empty token leaves the
+// message untouched.
+func redact(err error, token string) string {
+	switch {
+	case err == nil:
+		return ""
+	case token == "":
+		return err.Error()
+	default:
+		return strings.ReplaceAll(err.Error(), token, "[redacted]")
+	}
+}
@@ -0,0 +1,162 @@
+package gitops
+
+import (
+	"encoding/json"
+	"errors"
+	"strings"
+	"testing"
+)
+
+// strp and intp return a pointer to a copy of their argument, for filling the
+// optional (pointer-typed) DeploySpec fields in test literals.
+func strp(s string) *string { return &s }
+func intp(i int) *int       { return &i }
+
+// TestParseSpec checks that a valid v1 file parses with omitted fields left
+// nil, and that unknown keys, an unsupported version and empty input are all
+// rejected.
+func TestParseSpec(t *testing.T) {
+	s, err := ParseSpec([]byte("version: 1\ndeploy:\n  port: 8080\n  deploy_strategy: blue-green\n"))
+	if err != nil {
+		t.Fatalf("valid parse: %v", err)
+	}
+	if s.Version != 1 || s.Deploy.Port == nil || *s.Deploy.Port != 8080 {
+		t.Fatalf("unexpected spec: %+v", s)
+	}
+	if s.Deploy.Healthcheck != nil {
+		t.Fatalf("omitted healthcheck must stay nil")
+	}
+
+	// Unknown keys are rejected — incl. an attempt to declare env (out of v1).
+	if _, err := ParseSpec([]byte("version: 1\ndeploy:\n  env:\n    FOO: bar\n")); err == nil {
+		t.Fatalf("expected unknown-field error for deploy.env")
+	}
+	if _, err := ParseSpec([]byte("version: 1\nworkloads: []\n")); err == nil {
+		t.Fatalf("expected unknown-field error for top-level workloads")
+	}
+	if _, err := ParseSpec([]byte("version: 2\n")); err == nil {
+		t.Fatalf("expected unsupported-version error")
+	}
+	if _, err := ParseSpec(nil); err == nil {
+		t.Fatalf("expected empty-file error")
+	}
+}
+
+// TestBuildPlan_SourceAware checks that the same fully-declared spec yields a
+// three-key patch for dockerfile but only deploy_strategy for static, and that
+// only those two source kinds are GitOps-eligible.
+func TestBuildPlan_SourceAware(t *testing.T) {
+	spec := Spec{Version: 1, Deploy: DeploySpec{
+		Port: intp(8080), Healthcheck: strp("/h"), DeployStrategy: strp("blue-green"),
+	}}
+
+	df := BuildPlan(spec, SourceDockerfile).SourceConfigPatch
+	if df[keyPort] != 8080 || df[keyHealthcheck] != "/h" || df[keyDeployStrategy] != "blue-green" {
+		t.Fatalf("dockerfile patch wrong: %+v", df)
+	}
+
+	// static has no port/healthcheck — they must NOT leak into its patch.
+	st := BuildPlan(spec, SourceStatic).SourceConfigPatch
+	if _, ok := st[keyPort]; ok {
+		t.Fatalf("static patch must not contain port")
+	}
+	if _, ok := st[keyHealthcheck]; ok {
+		t.Fatalf("static patch must not contain healthcheck")
+	}
+	if st[keyDeployStrategy] != "blue-green" {
+		t.Fatalf("static should keep deploy_strategy: %+v", st)
+	}
+
+	if IsEligibleSource("image") || IsEligibleSource("compose") {
+		t.Fatalf("only dockerfile/static are GitOps-eligible in v1")
+	}
+	if !IsEligibleSource(SourceDockerfile) || !IsEligibleSource(SourceStatic) {
+		t.Fatalf("dockerfile + static must be eligible")
+	}
+}
+
+// TestMergeAndValidate_PreservesOmittedFields checks that a spec declaring
+// only port overwrites port and leaves every other live key untouched. The
+// validator passed in accepts everything.
+func TestMergeAndValidate_PreservesOmittedFields(t *testing.T) {
+	live := json.RawMessage(`{"repo_owner":"o","repo_name":"r","port":3000,"healthcheck":"/old","deploy_strategy":""}`)
+	spec := Spec{Version: 1, Deploy: DeploySpec{Port: intp(8080)}} // only port declared
+	merged, err := MergeAndValidate(live, BuildPlan(spec, SourceDockerfile), func(json.RawMessage) error { return nil })
+	if err != nil {
+		t.Fatal(err)
+	}
+	var m map[string]any
+	if err := json.Unmarshal(merged, &m); err != nil {
+		t.Fatal(err)
+	}
+	// Unchecked assertion: JSON numbers decode to float64 in a map[string]any.
+	if m["port"].(float64) != 8080 {
+		t.Fatalf("declared port not applied: %v", m["port"])
+	}
+	if m["healthcheck"] != "/old" {
+		t.Fatalf("undeclared healthcheck must be preserved, got %v", m["healthcheck"])
+	}
+	if m["repo_owner"] != "o" {
+		t.Fatalf("untouched repo_owner lost")
+	}
+}
+
+// TestMergeAndValidate_RejectsInvalidMergedConfig checks that MergeAndValidate
+// returns an error when the supplied validator rejects the merged config. The
+// stub validator refuses deploy_strategy "rolling".
+func TestMergeAndValidate_RejectsInvalidMergedConfig(t *testing.T) {
+	live := json.RawMessage(`{"port":3000}`)
+	spec := Spec{Version: 1, Deploy: DeploySpec{DeployStrategy: strp("rolling")}}
+	_, err := MergeAndValidate(live, BuildPlan(spec, SourceDockerfile), func(c json.RawMessage) error {
+		var x struct {
+			DeployStrategy string `json:"deploy_strategy"`
+		}
+		_ = json.Unmarshal(c, &x)
+		if x.DeployStrategy == "rolling" {
+			return errors.New("invalid deploy_strategy")
+		}
+		return nil
+	})
+	if err == nil {
+		t.Fatalf("expected the merged config to be rejected as a whole")
+	}
+}
+
+// TestDrift_DeclaredOnly_WithNormalization checks that only declared fields
+// are compared (healthcheck is omitted, so it is ignored) and that
+// deploy_strategy "recreate" equals a live "" after normalization, leaving
+// port as the single drift entry.
+func TestDrift_DeclaredOnly_WithNormalization(t *testing.T) {
+	// live: port 3000, healthcheck "/h", strategy "" (== recreate effective).
+	live := json.RawMessage(`{"port":3000,"healthcheck":"/h","deploy_strategy":"","registry_name":"x"}`)
+	// declare: port (changed) + deploy_strategy "recreate" (equal to "" -> no drift).
+	spec := Spec{Version: 1, Deploy: DeploySpec{Port: intp(8080), DeployStrategy: strp("recreate")}}
+	d, err := Drift(spec, live, SourceDockerfile)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if len(d) != 1 {
+		t.Fatalf("want exactly 1 drift (port), got %d: %+v", len(d), d)
+	}
+	if d[0].Field != keyPort || d[0].RepoValue != "8080" || d[0].LiveValue != "3000" {
+		t.Fatalf("port drift wrong: %+v", d[0])
+	}
+}
+
+// TestDrift_StaticIgnoresUnsupportedFields checks that for a static source a
+// declared port is ignored and only the deploy_strategy difference is
+// reported.
+func TestDrift_StaticIgnoresUnsupportedFields(t *testing.T) {
+	live := json.RawMessage(`{"deploy_strategy":"recreate","mode":"static"}`)
+	// port declared but unsupported for static -> ignored; strategy differs -> drift.
+	spec := Spec{Version: 1, Deploy: DeploySpec{Port: intp(8080), DeployStrategy: strp("blue-green")}}
+	d, err := Drift(spec, live, SourceStatic)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if len(d) != 1 || d[0].Field != keyDeployStrategy {
+		t.Fatalf("static should only drift on deploy_strategy: %+v", d)
+	}
+}
+
+// TestDrift_UnsetLiveValue checks that a declared field missing from the live
+// config is reported as drift with the live side rendered as "(unset)".
+func TestDrift_UnsetLiveValue(t *testing.T) {
+	spec := Spec{Version: 1, Deploy: DeploySpec{Healthcheck: strp("/up")}}
+	d, err := Drift(spec, json.RawMessage(`{}`), SourceDockerfile)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if len(d) != 1 || d[0].RepoValue != "/up" || d[0].LiveValue != "(unset)" {
+		t.Fatalf("unset live should render as (unset): %+v", d)
+	}
+}
+
+// TestRedact_StripsToken checks that redact removes the token from an error
+// message, inserts the "[redacted]" marker, and maps a nil error to "".
+func TestRedact_StripsToken(t *testing.T) {
+	msg := redact(errors.New("execute request: token ghp_SECRET rejected"), "ghp_SECRET")
+	if strings.Contains(msg, "ghp_SECRET") {
+		t.Fatalf("token leaked: %s", msg)
+	}
+	if !strings.Contains(msg, "[redacted]") {
+		t.Fatalf("expected redaction marker: %s", msg)
+	}
+	if redact(nil, "x") != "" {
+		t.Fatalf("nil error should redact to empty string")
+	}
+}
@@ -0,0 +1,48 @@
+package gitops
+
+import (
+	"encoding/json"
+	"fmt"
+)
+
+// MergeAndValidate overlays the plan's SourceConfigPatch onto a copy of the
+// live source_config and returns the merged JSON — but only after the target
+// source's own Validate accepts the *merged* result. This is the hard apply
+// gate (review C4):
+//
+//   - omitted-field-preserving: keys the file doesn't declare are untouched, so
+//     a partial .tinyforge.yml never clears live config;
+//   - validate-then-commit: a patch that would produce an invalid config (e.g.
+//     deploy_strategy "blue-green" on a source that rejects it, or a bad port)
+//     is refused as a whole — the function never returns a partial/empty config;
+//   - pure: it does not write anything; the caller persists the returned bytes.
+//
+// live may be empty or the JSON literal null; both are treated as an empty
+// object. Any other live value that does not decode into a JSON object is an
+// error.
+//
+// validate is the matching Source.Validate (passed in to keep this package
+// decoupled from the source plugins). A nil validate skips validation.
+func MergeAndValidate(live json.RawMessage, plan ApplyPlan, validate func(json.RawMessage) error) (json.RawMessage, error) {
+	// Decode the live config into a generic map we can overlay.
+	var merged map[string]any
+	if len(live) > 0 {
+		if err := json.Unmarshal(live, &merged); err != nil {
+			return nil, fmt.Errorf("gitops: decode live source_config: %w", err)
+		}
+	}
+	// Empty input leaves merged nil, and so does a JSON null (Unmarshal sets
+	// the map to nil). Writing to a nil map panics, so start both cases from
+	// an empty object instead.
+	if merged == nil {
+		merged = make(map[string]any, len(plan.SourceConfigPatch))
+	}
+
+	// Overlay only the declared patch keys — everything else is preserved.
+	for k, v := range plan.SourceConfigPatch {
+		merged[k] = v
+	}
+
+	out, err := json.Marshal(merged)
+	if err != nil {
+		return nil, fmt.Errorf("gitops: encode merged source_config: %w", err)
+	}
+
+	if validate != nil {
+		if err := validate(out); err != nil {
+			return nil, fmt.Errorf("gitops: merged config rejected: %w", err)
+		}
+	}
+	return out, nil
+}
@@ -0,0 +1,57 @@
+// Package gitops implements config-as-code for repo-backed workloads: a
+// dockerfile/static workload can read a small .tinyforge.yml from its own repo
+// that declares a subset of its deploy config. The package is deliberately
+// decoupled from the store and source plugins — it takes a RepoRef (repo
+// coords + a decrypted token) and a live source_config blob, and returns a
+// validated merged config + a field-level drift report. It never writes to the
+// database and never decides to deploy.
+//
+// v1 scope (see plans/gitops/PLAN.md): only source_config-resident fields are
+// overlayable, and the set is source-aware (dockerfile: port/healthcheck/
+// deploy_strategy; static: deploy_strategy). env/faces live in separate stores
+// and are intentionally out of v1; the typed ApplyPlan reserves their slots.
+package gitops
+
+import (
+	"bytes"
+	"errors"
+	"fmt"
+	"io"
+
+	"gopkg.in/yaml.v3"
+)
+
+// Spec is the parsed shape of a .tinyforge.yml file (v1).
+type Spec struct {
+	Version int        `yaml:"version"` // schema version; ParseSpec accepts only 1
+	Deploy  DeploySpec `yaml:"deploy"`  // overlayable deploy settings
+}
+
+// DeploySpec carries the overlayable deploy fields. Pointers so an omitted key
+// is distinguishable from a zero value — only present (non-nil) fields are
+// applied or drift-compared, so an absent key never clears live config.
+type DeploySpec struct {
+	Port           *int    `yaml:"port"`
+	Healthcheck    *string `yaml:"healthcheck"`
+	DeployStrategy *string `yaml:"deploy_strategy"`
+}
+
+// ParseSpec decodes the body of a .tinyforge.yml into a Spec.
+//
+// Decoding is strict (KnownFields): a typo or an unsupported field — e.g.
+// someone trying to declare env/faces in v1 — is an error rather than being
+// silently dropped. Input with no YAML document is reported as an empty file,
+// and any version other than 1 is rejected. On error the returned Spec is the
+// zero value.
+func ParseSpec(data []byte) (Spec, error) {
+	decoder := yaml.NewDecoder(bytes.NewReader(data))
+	decoder.KnownFields(true)
+
+	var parsed Spec
+	err := decoder.Decode(&parsed)
+	switch {
+	case errors.Is(err, io.EOF):
+		return Spec{}, errors.New("gitops: empty .tinyforge.yml")
+	case err != nil:
+		return Spec{}, fmt.Errorf("gitops: parse .tinyforge.yml: %w", err)
+	case parsed.Version != 1:
+		return Spec{}, fmt.Errorf("gitops: unsupported version %d (want 1)", parsed.Version)
+	}
+	return parsed, nil
+}
@@ -0,0 +1,48 @@
+// Package keyedmutex provides a lazily-populated per-key mutex, so a critical
+// section can be serialized per key (e.g. per workload id) without a global
+// lock. It is the shared form of the pattern that originated inline in the
+// GitOps sync handler; the deployer (per-workload deploy serialization) and the
+// volume-snapshot restore single-flight both use it.
+package keyedmutex
+
+import "sync"
+
+// Mutex is a set of independent locks addressed by string key: each distinct
+// key gets its own *sync.Mutex, created the first time the key is used. The
+// zero value is ready to use. Entries are never removed, so the internal map
+// holds one lock per distinct key ever seen — bounded in practice by the
+// number of workloads.
+type Mutex struct {
+	mu sync.Mutex
+	m  map[string]*sync.Mutex
+}
+
+// get returns the lock for key, creating it (and the map itself) on first use.
+// The lookup is guarded by k.mu, so concurrent callers always receive the same
+// lock for the same key.
+func (k *Mutex) get(key string) *sync.Mutex {
+	k.mu.Lock()
+	defer k.mu.Unlock()
+
+	// Indexing a nil map is safe and simply reports "not found".
+	if existing, found := k.m[key]; found {
+		return existing
+	}
+	if k.m == nil {
+		k.m = make(map[string]*sync.Mutex)
+	}
+	fresh := &sync.Mutex{}
+	k.m[key] = fresh
+	return fresh
+}
+
+// Lock blocks until the mutex for key is acquired, then returns the function
+// that releases it.
+func (k *Mutex) Lock(key string) func() {
+	perKey := k.get(key)
+	perKey.Lock()
+	return perKey.Unlock
+}
+
+// TryLock attempts to acquire the mutex for key without blocking. On success
+// it returns the release function and true. If the key is already locked it
+// returns nil and false, so the caller can reject (e.g. HTTP 409) instead of
+// queuing.
+func (k *Mutex) TryLock(key string) (func(), bool) {
+	if perKey := k.get(key); perKey.TryLock() {
+		return perKey.Unlock, true
+	}
+	return nil, false
+}
@@ -0,0 +1,83 @@
+package keyedmutex
+
+import (
+	"sync"
+	"testing"
+	"time"
+)
+
+// TestLockSerializesSameKey checks that a second Lock on a held key blocks
+// (no acquisition within 50ms) and then succeeds within a second of release.
+func TestLockSerializesSameKey(t *testing.T) {
+	var m Mutex
+	unlock := m.Lock("a")
+
+	acquired := make(chan struct{})
+	go func() {
+		u := m.Lock("a")
+		close(acquired)
+		u()
+	}()
+
+	select {
+	case <-acquired:
+		t.Fatal("second Lock on the same key acquired while the first was held")
+	case <-time.After(50 * time.Millisecond):
+		// expected: blocked
+	}
+	unlock()
+	select {
+	case <-acquired:
+		// expected: now acquired
+	case <-time.After(time.Second):
+		t.Fatal("second Lock did not acquire after release")
+	}
+}
+
+// TestLockIndependentKeys checks that holding key "a" does not prevent another
+// goroutine from locking and unlocking key "b" within a second.
+func TestLockIndependentKeys(t *testing.T) {
+	var m Mutex
+	unlockA := m.Lock("a")
+	defer unlockA()
+	// A different key must not block.
+	done := make(chan struct{})
+	go func() { u := m.Lock("b"); u(); close(done) }()
+	select {
+	case <-done:
+	case <-time.After(time.Second):
+		t.Fatal("Lock on an independent key blocked")
+	}
+}
+
+// TestTryLock walks one key through free -> held -> released and checks that
+// TryLock succeeds, fails, and succeeds again respectively.
+func TestTryLock(t *testing.T) {
+	var m Mutex
+	unlock, ok := m.TryLock("a")
+	if !ok {
+		t.Fatal("TryLock should succeed on a free key")
+	}
+	if _, ok := m.TryLock("a"); ok {
+		t.Fatal("TryLock should fail while the key is held")
+	}
+	unlock()
+	u2, ok := m.TryLock("a")
+	if !ok {
+		t.Fatal("TryLock should succeed after release")
+	}
+	u2()
+}
+
+// TestConcurrentLockNoRace has 50 goroutines increment a plain int under the
+// same key and expects no lost updates. The counter is deliberately not
+// atomic, so the keyed lock is its only protection.
+func TestConcurrentLockNoRace(t *testing.T) {
+	var m Mutex
+	var wg sync.WaitGroup
+	counter := 0
+	for i := 0; i < 50; i++ {
+		wg.Add(1)
+		go func() {
+			defer wg.Done()
+			u := m.Lock("shared")
+			counter++ // protected by the keyed lock
+			u()
+		}()
+	}
+	wg.Wait()
+	if counter != 50 {
+		t.Errorf("counter = %d, want 50 (lost updates ⇒ lock not serializing)", counter)
+	}
+}
@@ -0,0 +1,349 @@
+// Package metricalert implements a background goroutine that
+// periodically evaluates operator-configured metric-threshold rules
+// against recent container stats samples. On breach (subject to a
+// per-rule-per-workload cooldown) it emits an event into the existing
+// event_log + event-bus pipeline — the same fan-out used by the
+// log-scanner — instead of building any new notification plumbing.
+package metricalert
+
+import (
+	"encoding/json"
+	"fmt"
+	"log/slog"
+	"sync"
+	"time"
+
+	"github.com/alexei/tinyforge/internal/events"
+	"github.com/alexei/tinyforge/internal/store"
+)
+
+// EvalInterval is how often the evaluator tick fires.
+const EvalInterval = 30 * time.Second
+
+// lookbackSeconds bounds how far back we pull samples each tick. Stats
+// are collected at most every few seconds (see internal/stats), so a
+// 120s window comfortably captures the latest reading per container
+// even if collection briefly stalls.
+const lookbackSeconds = 120
+
+// RuleSource is the read-side seam for fetching the current rule rows.
+// Real callers pass *store.Store; tests pass a fake. The evaluator calls
+// it once at the start of every pass.
+type RuleSource interface {
+	ListMetricAlertRules() ([]store.MetricAlertRule, error)
+}
+
+// SampleSource fetches the recent container stats samples to evaluate.
+// The evaluator passes sinceTS as a Unix-seconds cutoff (now minus the
+// lookback window).
+type SampleSource interface {
+	ListAllRecentContainerStatsSamples(sinceTS int64) ([]store.ContainerStatsSample, error)
+}
+
+// EventSink writes a breach into event_log. The returned row's ID and
+// CreatedAt are copied into the event published on the bus.
+type EventSink interface {
+	InsertEvent(store.EventLog) (store.EventLog, error)
+}
+
+// Publisher fans the breach out on the event bus. Matches *events.Bus.
+// A nil Publisher is tolerated: the breach is persisted but not published.
+type Publisher interface {
+	Publish(events.Event)
+}
+
+// eventSource identifies metric-alert events in event_log + the bus.
+const eventSource = "metric_alert"
+
+// Manager owns the evaluation loop lifecycle. It mirrors
+// stats.Collector: a once-guarded Start/Stop pair with stop/done
+// channels and a single-goroutine run loop.
+type Manager struct {
+	// Injected dependencies: where rules and samples are read from, and
+	// where breaches are persisted and published.
+	rules   RuleSource
+	samples SampleSource
+	sink    EventSink
+	pub     Publisher
+
+	// now is swappable in tests so cooldown windows can be exercised
+	// deterministically. Defaults to time.Now.
+	now func() time.Time
+
+	// mu guards lastFired. The run loop is single-goroutine today, but
+	// Start/Stop and a future ReloadRules may touch shared state; the
+	// mutex is cheap insurance.
+	mu        sync.Mutex
+	lastFired map[string]time.Time // "ruleID:ownerID" -> last emit time
+
+	// startOnce/stopOnce make Start and Stop idempotent.
+	startOnce sync.Once
+	stopOnce  sync.Once
+	// started is set by Start before the loop goroutine is launched; Stop
+	// reads it to decide whether it must close done itself.
+	started bool
+	// stop is closed by Stop to ask the loop to exit; done is closed when
+	// the loop has exited (or by Stop when the loop was never started).
+	stop chan struct{}
+	done chan struct{}
+}
+
+// New builds a Manager around the given rule/sample sources, event sink
+// and publisher. The clock defaults to time.Now, the cooldown table
+// starts empty, and the stop/done channels are allocated up front.
+// Nothing runs until Start is called.
+func New(rules RuleSource, samples SampleSource, sink EventSink, pub Publisher) *Manager {
+	m := new(Manager)
+	m.rules, m.samples = rules, samples
+	m.sink, m.pub = sink, pub
+	m.now = time.Now
+	m.lastFired = make(map[string]time.Time)
+	m.stop = make(chan struct{})
+	m.done = make(chan struct{})
+	return m
+}
+
+// Start launches the background loop. Returns immediately. The loop
+// exits when Stop is called. Safe to call multiple times — only the
+// first call has an effect. Calling Start after Stop is a no-op: Stop has
+// already closed done on the loop's behalf, so launching run then would
+// close it a second time and panic.
+func (m *Manager) Start() {
+	m.startOnce.Do(func() {
+		// mu orders this against a concurrent Stop so that exactly one
+		// side ends up responsible for closing done.
+		m.mu.Lock()
+		defer m.mu.Unlock()
+		select {
+		case <-m.stop:
+			// Already stopped; never launch the loop.
+			return
+		default:
+		}
+		m.started = true
+		go m.run()
+	})
+}
+
+// Stop signals the loop to exit and blocks until it has finished the
+// in-flight tick. If Start was never called, Stop returns immediately.
+// Safe to call multiple times and from multiple goroutines; every call
+// returns only once the loop is gone.
+func (m *Manager) Stop() {
+	m.stopOnce.Do(func() {
+		// Same lock as Start: the started flag is read consistently with
+		// the decision to launch run, so done is closed exactly once.
+		m.mu.Lock()
+		defer m.mu.Unlock()
+		close(m.stop)
+		if !m.started {
+			close(m.done)
+		}
+	})
+	<-m.done
+}
+
+// run is the main loop. It evaluates once shortly after start, then on
+// every EvalInterval tick, until Stop is called. It closes done on exit
+// so Stop can unblock.
+func (m *Manager) run() {
+	defer close(m.done)
+
+	// Settle delay so the app + first stats samples exist before the
+	// first evaluation. A Stop during the delay exits without evaluating.
+	select {
+	case <-time.After(3 * time.Second):
+	case <-m.stop:
+		return
+	}
+
+	// The ticker is created before the first evaluation, so the first
+	// tick is measured from here rather than from the end of that pass.
+	ticker := time.NewTicker(EvalInterval)
+	defer ticker.Stop()
+	m.evaluate(m.now())
+	for {
+		select {
+		case <-m.stop:
+			return
+		case <-ticker.C:
+			m.evaluate(m.now())
+		}
+	}
+}
+
+// evaluate runs one pass: load rules + recent samples, reduce to the
+// freshest sample per (owner, container), and emit on breach subject to
+// cooldown. Best-effort throughout — a bad rule or sample never crashes
+// the loop.
+//
+// now is the evaluation instant: it sets the sample lookback cutoff and
+// is the timestamp compared against and stored in the cooldown table.
+func (m *Manager) evaluate(now time.Time) {
+	rules, err := m.rules.ListMetricAlertRules()
+	if err != nil {
+		slog.Warn("metricalert: list rules", "error", err)
+		return
+	}
+	// No rules: return before touching the sample source at all.
+	if len(rules) == 0 {
+		return
+	}
+
+	since := now.Unix() - lookbackSeconds
+	samples, err := m.samples.ListAllRecentContainerStatsSamples(since)
+	if err != nil {
+		slog.Warn("metricalert: list samples", "error", err)
+		return
+	}
+	latest := latestPerContainer(samples)
+	if len(latest) == 0 {
+		return
+	}
+
+	for _, rule := range rules {
+		if !rule.Enabled {
+			continue
+		}
+		for _, sample := range latest {
+			// Per-workload rules only match their workload; "" matches all.
+			if rule.WorkloadID != "" && rule.WorkloadID != sample.OwnerID {
+				continue
+			}
+			value, ok := metricValue(rule.Metric, sample)
+			if !ok {
+				continue // e.g. memory_percent with a zero limit
+			}
+			if !breached(rule.Comparator, value, rule.Threshold) {
+				continue
+			}
+			// The cooldown is keyed by rule + owner, not by container, so
+			// with a positive cooldown a second breaching container of the
+			// same owner is suppressed within this pass.
+			if m.coolingDown(rule, sample.OwnerID, now) {
+				continue
+			}
+			m.emit(rule, sample, value)
+			// NOTE(review): emit reports no result, so the cooldown starts
+			// even when persisting the event failed.
+			m.recordFire(rule, sample.OwnerID, now)
+		}
+	}
+}
+
+// latestPerContainer collapses the input to one sample per
+// (OwnerID, ContainerID) pair — the one with the greatest TS — so a
+// container is judged on its newest reading only. When two samples share
+// a TS the earlier one in the input wins. Output order is unspecified
+// (it follows map iteration).
+func latestPerContainer(samples []store.ContainerStatsSample) []store.ContainerStatsSample {
+	freshest := make(map[string]store.ContainerStatsSample)
+	for _, cur := range samples {
+		// NUL separates the two IDs in the composite map key.
+		id := cur.OwnerID + "\x00" + cur.ContainerID
+		seen, found := freshest[id]
+		if found && cur.TS <= seen.TS {
+			continue
+		}
+		freshest[id] = cur
+	}
+
+	result := make([]store.ContainerStatsSample, 0, len(freshest))
+	for id := range freshest {
+		result = append(result, freshest[id])
+	}
+	return result
+}
+
+// metricValue reads the number a rule's metric refers to out of a sample.
+// The second result is false when no value can be produced: the metric
+// name is not one of the known constants, or memory_percent is requested
+// for a sample whose MemoryLimit is zero or negative (which would
+// otherwise divide by zero).
+func metricValue(metric string, s store.ContainerStatsSample) (float64, bool) {
+	if metric == store.MetricCPUPercent {
+		return s.CPUPercent, true
+	}
+	if metric == store.MetricMemoryBytes {
+		return float64(s.MemoryUsage), true
+	}
+	if metric != store.MetricMemoryPercent || s.MemoryLimit <= 0 {
+		return 0, false
+	}
+	return float64(s.MemoryUsage) / float64(s.MemoryLimit) * 100, true
+}
+
+// breached reports whether value is strictly beyond threshold in the
+// direction the comparator names. An unrecognised comparator never
+// breaches.
+func breached(comparator string, value, threshold float64) bool {
+	above := comparator == store.MetricComparatorGT && value > threshold
+	below := comparator == store.MetricComparatorLT && value < threshold
+	return above || below
+}
+
+// cooldownKey builds the cooldown-table key for one rule on one workload,
+// in the form "<ruleID>:<ownerID>".
+func cooldownKey(ruleID int64, ownerID string) string {
+	return fmt.Sprint(ruleID) + ":" + ownerID
+}
+
+// coolingDown reports whether the rule fired for ownerID so recently that
+// a new emit must be suppressed. A rule with a non-positive
+// CooldownSeconds never cools down, and neither does a rule/owner pair
+// that has not fired yet.
+func (m *Manager) coolingDown(rule store.MetricAlertRule, ownerID string, now time.Time) bool {
+	if rule.CooldownSeconds <= 0 {
+		return false
+	}
+	key := cooldownKey(rule.ID, ownerID)
+
+	m.mu.Lock()
+	firedAt, seen := m.lastFired[key]
+	m.mu.Unlock()
+
+	if !seen {
+		return false
+	}
+	window := time.Duration(rule.CooldownSeconds) * time.Second
+	return now.Sub(firedAt) < window
+}
+
+// recordFire stamps now as the last emit time for this rule/owner pair,
+// which is what coolingDown measures the cooldown window from.
+func (m *Manager) recordFire(rule store.MetricAlertRule, ownerID string, now time.Time) {
+	key := cooldownKey(rule.ID, ownerID)
+	m.mu.Lock()
+	defer m.mu.Unlock()
+	m.lastFired[key] = now
+}
+
+// emit persists the breach as an event_log row and publishes it on the
+// bus. WorkloadID routes the alert to that app's activity timeline.
+// Metadata is JSON-marshalled (never string-concatenated). Any
+// marshal/insert failure is logged and skipped — emitting must never
+// crash the loop.
+//
+// value is the measured metric that breached rule.Threshold. Nothing is
+// returned, so the caller cannot tell a persisted event from a skipped one.
+func (m *Manager) emit(rule store.MetricAlertRule, sample store.ContainerStatsSample, value float64) {
+	message := formatMessage(rule, value)
+	meta := map[string]any{
+		"workload_id": sample.OwnerID,
+		"rule":        rule.Name,
+		"metric":      rule.Metric,
+		"value":       value,
+		"threshold":   rule.Threshold,
+		"comparator":  rule.Comparator,
+	}
+	metaJSON, err := json.Marshal(meta)
+	if err != nil {
+		slog.Error("metricalert: marshal metadata", "rule", rule.Name, "error", err)
+		return
+	}
+	// A rule without an explicit severity is reported at warn level.
+	severity := rule.Severity
+	if severity == "" {
+		severity = store.LogScanSeverityWarn
+	}
+	evt, err := m.sink.InsertEvent(store.EventLog{
+		Source:     eventSource,
+		Severity:   severity,
+		Message:    message,
+		WorkloadID: sample.OwnerID,
+		Metadata:   string(metaJSON),
+	})
+	if err != nil {
+		slog.Error("metricalert: persist event", "rule", rule.Name, "error", err)
+		return
+	}
+	// Publish only after a successful insert, so the bus payload can carry
+	// the ID and CreatedAt of the stored row.
+	if m.pub != nil {
+		m.pub.Publish(events.Event{
+			Type: events.EventLog,
+			Payload: events.EventLogPayload{
+				ID:         evt.ID,
+				Source:     eventSource,
+				WorkloadID: sample.OwnerID,
+				Severity:   severity,
+				Message:    message,
+				Metadata:   string(metaJSON),
+				CreatedAt:  evt.CreatedAt,
+			},
+		})
+	}
+}
+
+// formatMessage builds a concise, human, secret-free breach line. The
+// only operator-supplied text is rule.Name; the rest are numbers and
+// fixed labels.
+//
+// The shape is "<name>: <label> is <value><unit> (threshold <cmp>
+// <threshold><unit>)". Numbers go through formatAlertNumber so a
+// fractional threshold or reading is not rounded away: a threshold of 0.5
+// no longer prints as "0", and 90.4 against 90 no longer prints as
+// "90% (threshold > 90%)".
+func formatMessage(rule store.MetricAlertRule, value float64) string {
+	label, unit := metricLabelUnit(rule.Metric)
+	word := comparatorWord(rule.Comparator)
+	return fmt.Sprintf("%s: %s is %s%s (threshold %s %s%s)",
+		rule.Name, label, formatAlertNumber(value), unit,
+		word, formatAlertNumber(rule.Threshold), unit)
+}
+
+// formatAlertNumber renders whole numbers without decimals (unchanged
+// from the previous "%.0f" output) and everything else with two decimals.
+func formatAlertNumber(v float64) string {
+	if v == float64(int64(v)) {
+		return fmt.Sprintf("%.0f", v)
+	}
+	return fmt.Sprintf("%.2f", v)
+}
+
+// metricLabelUnit maps a metric constant to the display label and unit
+// suffix used in breach messages. An unknown metric is shown under its
+// raw name with no unit.
+func metricLabelUnit(metric string) (label, unit string) {
+	if metric == store.MetricCPUPercent {
+		return "CPU", "%"
+	}
+	if metric == store.MetricMemoryPercent {
+		return "Memory", "%"
+	}
+	if metric == store.MetricMemoryBytes {
+		return "Memory", " bytes"
+	}
+	return metric, ""
+}
+
+// comparatorWord maps a comparator constant to the symbol shown in breach
+// messages; an unknown comparator is passed through unchanged.
+func comparatorWord(comparator string) string {
+	if comparator == store.MetricComparatorGT {
+		return ">"
+	}
+	if comparator == store.MetricComparatorLT {
+		return "<"
+	}
+	return comparator
+}
@@ -0,0 +1,284 @@
+package metricalert
+
+import (
+	"testing"
+	"time"
+
+	"github.com/alexei/tinyforge/internal/events"
+	"github.com/alexei/tinyforge/internal/store"
+)
+
+// --- fakes -----------------------------------------------------------
+
+// fakeRules is a RuleSource that returns a canned rule list and error.
+type fakeRules struct {
+	rules []store.MetricAlertRule
+	err   error
+}
+
+// ListMetricAlertRules returns the canned values as-is.
+func (f *fakeRules) ListMetricAlertRules() ([]store.MetricAlertRule, error) {
+	return f.rules, f.err
+}
+
+// fakeSamples is a SampleSource that returns a canned sample list and
+// error regardless of the requested cutoff.
+type fakeSamples struct {
+	samples []store.ContainerStatsSample
+	err     error
+	since   int64 // captured arg of the last call
+}
+
+// ListAllRecentContainerStatsSamples records sinceTS and returns the
+// canned values; sinceTS is not used to filter them.
+func (f *fakeSamples) ListAllRecentContainerStatsSamples(sinceTS int64) ([]store.ContainerStatsSample, error) {
+	f.since = sinceTS
+	return f.samples, f.err
+}
+
+// recordedEvent wraps one event row captured by fakeSink.
+type recordedEvent struct {
+	evt store.EventLog
+}
+
+// fakeSink is an EventSink that keeps inserted events in memory.
+type fakeSink struct {
+	events []recordedEvent
+	err    error
+	nextID int64
+}
+
+// InsertEvent fails with f.err when it is set; otherwise it assigns the
+// next sequential ID and a fixed CreatedAt, records the row, and returns it.
+func (f *fakeSink) InsertEvent(e store.EventLog) (store.EventLog, error) {
+	if f.err != nil {
+		return store.EventLog{}, f.err
+	}
+	f.nextID++
+	e.ID = f.nextID
+	e.CreatedAt = "2026-05-29T00:00:00Z"
+	f.events = append(f.events, recordedEvent{evt: e})
+	return e, nil
+}
+
+// fakePublisher is a Publisher that records every published event.
+type fakePublisher struct {
+	published []events.Event
+}
+
+// Publish appends e to the recorded list.
+func (f *fakePublisher) Publish(e events.Event) {
+	f.published = append(f.published, e)
+}
+
+// newManager builds a Manager over fakes seeded with the given rules and
+// samples, and returns the sink and publisher so tests can inspect what
+// was emitted.
+func newManager(rules []store.MetricAlertRule, samples []store.ContainerStatsSample) (*Manager, *fakeSink, *fakePublisher) {
+	sink := &fakeSink{}
+	pub := &fakePublisher{}
+	m := New(&fakeRules{rules: rules}, &fakeSamples{samples: samples}, sink, pub)
+	return m, sink, pub
+}
+
+// --- tests -----------------------------------------------------------
+
+// TestEvaluate_BreachEmits checks the happy path: one breaching sample
+// produces exactly one persisted event and one published event, both
+// carrying the rule's severity, the metric_alert source and the sample's
+// owner as workload.
+func TestEvaluate_BreachEmits(t *testing.T) {
+	rules := []store.MetricAlertRule{{
+		ID: 1, Name: "cpu-hot", Metric: store.MetricCPUPercent,
+		Comparator: store.MetricComparatorGT, Threshold: 80, Severity: "error",
+		CooldownSeconds: 300, Enabled: true,
+	}}
+	samples := []store.ContainerStatsSample{{
+		ContainerID: "c1", OwnerID: "w1", OwnerType: "instance", TS: 100, CPUPercent: 95,
+	}}
+	m, sink, pub := newManager(rules, samples)
+
+	m.evaluate(time.Unix(200, 0))
+
+	if len(sink.events) != 1 {
+		t.Fatalf("expected 1 event, got %d", len(sink.events))
+	}
+	got := sink.events[0].evt
+	if got.Source != "metric_alert" {
+		t.Errorf("source = %q, want metric_alert", got.Source)
+	}
+	if got.Severity != "error" {
+		t.Errorf("severity = %q, want error", got.Severity)
+	}
+	if got.WorkloadID != "w1" {
+		t.Errorf("workload_id = %q, want w1", got.WorkloadID)
+	}
+	if got.Metadata == "" || got.Metadata == "{}" {
+		t.Errorf("metadata should be populated JSON, got %q", got.Metadata)
+	}
+	if len(pub.published) != 1 {
+		t.Fatalf("expected 1 published event, got %d", len(pub.published))
+	}
+	payload, ok := pub.published[0].Payload.(events.EventLogPayload)
+	if !ok {
+		t.Fatalf("published payload is not EventLogPayload")
+	}
+	if payload.WorkloadID != "w1" || payload.Source != "metric_alert" {
+		t.Errorf("payload workload/source mismatch: %+v", payload)
+	}
+}
+
+// TestEvaluate_NoBreachNoEmit checks that a reading below a gt threshold
+// produces no event.
+func TestEvaluate_NoBreachNoEmit(t *testing.T) {
+	rules := []store.MetricAlertRule{{
+		ID: 1, Name: "cpu-hot", Metric: store.MetricCPUPercent,
+		Comparator: store.MetricComparatorGT, Threshold: 80, Enabled: true,
+	}}
+	samples := []store.ContainerStatsSample{{
+		ContainerID: "c1", OwnerID: "w1", TS: 100, CPUPercent: 10,
+	}}
+	m, sink, _ := newManager(rules, samples)
+
+	m.evaluate(time.Unix(200, 0))
+
+	if len(sink.events) != 0 {
+		t.Fatalf("expected no events for non-breach, got %d", len(sink.events))
+	}
+}
+
+// TestEvaluate_DisabledRuleSkipped checks that a rule with Enabled=false
+// emits nothing even when the sample would breach it.
+func TestEvaluate_DisabledRuleSkipped(t *testing.T) {
+	rules := []store.MetricAlertRule{{
+		ID: 1, Name: "cpu-hot", Metric: store.MetricCPUPercent,
+		Comparator: store.MetricComparatorGT, Threshold: 80, Enabled: false,
+	}}
+	samples := []store.ContainerStatsSample{{ContainerID: "c1", OwnerID: "w1", TS: 100, CPUPercent: 95}}
+	m, sink, _ := newManager(rules, samples)
+
+	m.evaluate(time.Unix(200, 0))
+
+	if len(sink.events) != 0 {
+		t.Fatalf("disabled rule should not emit, got %d", len(sink.events))
+	}
+}
+
+// TestEvaluate_PerWorkloadScoping checks that a rule bound to workload w2
+// ignores a breaching sample owned by w1 and fires only for w2.
+func TestEvaluate_PerWorkloadScoping(t *testing.T) {
+	rules := []store.MetricAlertRule{{
+		ID: 1, Name: "w2-only", WorkloadID: "w2", Metric: store.MetricCPUPercent,
+		Comparator: store.MetricComparatorGT, Threshold: 80, Enabled: true,
+	}}
+	samples := []store.ContainerStatsSample{
+		{ContainerID: "c1", OwnerID: "w1", TS: 100, CPUPercent: 95}, // breach but wrong workload
+		{ContainerID: "c2", OwnerID: "w2", TS: 100, CPUPercent: 95}, // breach, correct workload
+	}
+	m, sink, _ := newManager(rules, samples)
+
+	m.evaluate(time.Unix(200, 0))
+
+	if len(sink.events) != 1 {
+		t.Fatalf("expected 1 event (only w2), got %d", len(sink.events))
+	}
+	if sink.events[0].evt.WorkloadID != "w2" {
+		t.Errorf("event should be scoped to w2, got %q", sink.events[0].evt.WorkloadID)
+	}
+}
+
+// TestEvaluate_GlobalRuleMatchesAll checks that a rule with an empty
+// WorkloadID fires once for each of two breaching workloads.
+func TestEvaluate_GlobalRuleMatchesAll(t *testing.T) {
+	rules := []store.MetricAlertRule{{
+		ID: 1, Name: "global", WorkloadID: "", Metric: store.MetricCPUPercent,
+		Comparator: store.MetricComparatorGT, Threshold: 80, Enabled: true,
+	}}
+	samples := []store.ContainerStatsSample{
+		{ContainerID: "c1", OwnerID: "w1", TS: 100, CPUPercent: 95},
+		{ContainerID: "c2", OwnerID: "w2", TS: 100, CPUPercent: 95},
+	}
+	m, sink, _ := newManager(rules, samples)
+
+	m.evaluate(time.Unix(200, 0))
+
+	if len(sink.events) != 2 {
+		t.Fatalf("global rule should fire for both workloads, got %d", len(sink.events))
+	}
+}
+
+// TestEvaluate_MemoryPercentDivByZeroSkip checks that a memory_percent
+// rule skips a sample whose MemoryLimit is zero instead of emitting.
+func TestEvaluate_MemoryPercentDivByZeroSkip(t *testing.T) {
+	rules := []store.MetricAlertRule{{
+		ID: 1, Name: "mem", Metric: store.MetricMemoryPercent,
+		Comparator: store.MetricComparatorGT, Threshold: 50, Enabled: true,
+	}}
+	samples := []store.ContainerStatsSample{{
+		ContainerID: "c1", OwnerID: "w1", TS: 100, MemoryUsage: 1000, MemoryLimit: 0,
+	}}
+	m, sink, _ := newManager(rules, samples)
+
+	m.evaluate(time.Unix(200, 0))
+
+	if len(sink.events) != 0 {
+		t.Fatalf("zero memory limit should be skipped for percent rule, got %d", len(sink.events))
+	}
+}
+
+// TestEvaluate_MemoryPercentBreaches checks that usage/limit is turned
+// into a percentage: 950 of 1000 breaches a gt-90 rule.
+func TestEvaluate_MemoryPercentBreaches(t *testing.T) {
+	rules := []store.MetricAlertRule{{
+		ID: 1, Name: "mem", Metric: store.MetricMemoryPercent,
+		Comparator: store.MetricComparatorGT, Threshold: 90, Enabled: true,
+	}}
+	samples := []store.ContainerStatsSample{{
+		ContainerID: "c1", OwnerID: "w1", TS: 100, MemoryUsage: 950, MemoryLimit: 1000, // 95%
+	}}
+	m, sink, _ := newManager(rules, samples)
+
+	m.evaluate(time.Unix(200, 0))
+
+	if len(sink.events) != 1 {
+		t.Fatalf("95%% should breach 90%% threshold, got %d events", len(sink.events))
+	}
+}
+
+// TestEvaluate_CooldownSuppressesSecondEmit drives three passes over the
+// same breaching sample with a 300s cooldown: the pass 10s after the
+// first is suppressed, the pass 301s after it fires again.
+func TestEvaluate_CooldownSuppressesSecondEmit(t *testing.T) {
+	rules := []store.MetricAlertRule{{
+		ID: 1, Name: "cpu-hot", Metric: store.MetricCPUPercent,
+		Comparator: store.MetricComparatorGT, Threshold: 80, CooldownSeconds: 300, Enabled: true,
+	}}
+	samples := []store.ContainerStatsSample{{ContainerID: "c1", OwnerID: "w1", TS: 100, CPUPercent: 95}}
+	m, sink, _ := newManager(rules, samples)
+
+	base := time.Unix(1000, 0)
+	m.evaluate(base)
+	// 10s later — still inside the 300s cooldown window.
+	m.evaluate(base.Add(10 * time.Second))
+
+	if len(sink.events) != 1 {
+		t.Fatalf("cooldown should suppress second emit, got %d events", len(sink.events))
+	}
+
+	// Past the window — should fire again.
+	m.evaluate(base.Add(301 * time.Second))
+	if len(sink.events) != 2 {
+		t.Fatalf("should re-fire after cooldown elapses, got %d events", len(sink.events))
+	}
+}
+
+// TestEvaluate_LatestSamplePerContainer checks that only the newest
+// sample of a container is evaluated, yielding exactly one event.
+func TestEvaluate_LatestSamplePerContainer(t *testing.T) {
+	// Two samples for the same container: an old non-breaching reading
+	// and a newer breaching one. Only the freshest should be judged.
+	rules := []store.MetricAlertRule{{
+		ID: 1, Name: "cpu-hot", Metric: store.MetricCPUPercent,
+		Comparator: store.MetricComparatorGT, Threshold: 80, Enabled: true,
+	}}
+	samples := []store.ContainerStatsSample{
+		{ContainerID: "c1", OwnerID: "w1", TS: 100, CPUPercent: 10},
+		{ContainerID: "c1", OwnerID: "w1", TS: 150, CPUPercent: 95},
+	}
+	m, sink, _ := newManager(rules, samples)
+
+	m.evaluate(time.Unix(200, 0))
+
+	if len(sink.events) != 1 {
+		t.Fatalf("expected exactly 1 event from freshest sample, got %d", len(sink.events))
+	}
+}
+
+// TestEvaluate_LessThanComparator checks the lt direction: a 1% reading
+// breaches a "below 5%" rule.
+func TestEvaluate_LessThanComparator(t *testing.T) {
+	rules := []store.MetricAlertRule{{
+		ID: 1, Name: "cpu-idle", Metric: store.MetricCPUPercent,
+		Comparator: store.MetricComparatorLT, Threshold: 5, Enabled: true,
+	}}
+	samples := []store.ContainerStatsSample{{ContainerID: "c1", OwnerID: "w1", TS: 100, CPUPercent: 1}}
+	m, sink, _ := newManager(rules, samples)
+
+	m.evaluate(time.Unix(200, 0))
+
+	if len(sink.events) != 1 {
+		t.Fatalf("1%% < 5%% threshold should breach lt rule, got %d events", len(sink.events))
+	}
+}
+
+// TestEvaluate_NoRulesNoFetch checks that an empty rule list short-circuits
+// the pass. The fake's captured cutoff staying at its zero value is the
+// signal that the sample source was never called (a call at t=200 would
+// have stored a non-zero cutoff).
+func TestEvaluate_NoRulesNoFetch(t *testing.T) {
+	// With no rules there's nothing to do; we shouldn't even query samples.
+	samplesSrc := &fakeSamples{samples: nil}
+	m := New(&fakeRules{rules: nil}, samplesSrc, &fakeSink{}, &fakePublisher{})
+	m.evaluate(time.Unix(200, 0))
+	if samplesSrc.since != 0 {
+		t.Errorf("samples should not be queried when there are no rules")
+	}
+}
@@ -0,0 +1,250 @@
+// Package metrics provides a minimal Prometheus text-format exposition
+// of Tinyforge's operational counters. We deliberately do NOT import the
+// official client_golang library: the metrics set here is small, the text
+// format is simple, and avoiding the dependency keeps `tinyforge` a fast
+// single-binary install.
+//
+// Every counter is a sync/atomic.Int64 — cheap, lock-free, and safe to
+// touch from any goroutine. Histograms / gauges aren't modeled yet; the
+// few we need (request latency p50/p99) live downstream of slog and can
+// be added when the operator actually wants them.
+package metrics
+
+import (
+	"fmt"
+	"io"
+	"log/slog"
+	"sort"
+	"strings"
+	"sync"
+	"sync/atomic"
+)
+
+// Registry holds a set of named counters. newRegistry allocates the
+// counters map; DefaultRegistry below is the process-wide instance built
+// that way and is the recommended handle.
+type Registry struct {
+	// mu guards the counters map itself (registration and lookup), not
+	// the per-counter series.
+	mu       sync.RWMutex
+	counters map[string]*counter
+}
+
+// counter is the registry-owned state behind a Counter handle: its
+// metadata plus one atomic cell per distinct label-value tuple.
+type counter struct {
+	name   string
+	help   string
+	labels []string // label names, ordered as declared at registration
+	// series maps an encoded label-value tuple to its running total.
+	series map[string]*atomic.Int64
+	// seriesMu guards the series map. Add holds it only while looking up
+	// or inserting a tuple's cell; the increment itself is done on the
+	// atomic after the lock is released.
+	seriesMu sync.Mutex
+}
+
+// DefaultRegistry is the process-wide registry. All Tinyforge metrics
+// register against it. Tests can instantiate their own Registry.
+var DefaultRegistry = newRegistry()
+
+// newRegistry returns an empty Registry with its counter map allocated.
+func newRegistry() *Registry {
+	reg := new(Registry)
+	reg.counters = map[string]*counter{}
+	return reg
+}
+
+// NewCounter declares a counter on the default registry. Call once at
+// package init or during NewServer; subsequent calls with the same name
+// return the existing counter so re-registration is safe (the help text
+// and labels of the first registration are kept).
+//
+// label names define the dimensions; calls to Inc must pass values in
+// the same order. Pass no label names for a label-less counter.
+func NewCounter(name, help string, labels ...string) *Counter {
+	return DefaultRegistry.NewCounter(name, help, labels...)
+}
+
+// NewCounter declares a counter on a specific Registry — useful in tests.
+//
+// If a counter with this name already exists, a handle to it is returned
+// and help/labels are ignored. Otherwise a new counter is registered with
+// a private copy of labels, so later changes to the caller's slice cannot
+// affect it. The counters map is allocated on first use, so a zero-value
+// Registry (the only kind code outside this package can construct) works
+// instead of panicking on a nil-map write.
+func (r *Registry) NewCounter(name, help string, labels ...string) *Counter {
+	r.mu.Lock()
+	defer r.mu.Unlock()
+	if r.counters == nil {
+		r.counters = make(map[string]*counter)
+	}
+	if c, ok := r.counters[name]; ok {
+		return &Counter{c: c}
+	}
+	c := &counter{
+		name:   name,
+		help:   help,
+		labels: append([]string(nil), labels...),
+		series: make(map[string]*atomic.Int64),
+	}
+	r.counters[name] = c
+	return &Counter{c: c}
+}
+
+// Counter is the public handle returned by NewCounter. Pass it around as
+// a value — the underlying state lives on the registry, so copies of a
+// handle all update the same series.
+type Counter struct {
+	c *counter // shared registry-owned state
+}
+
+// Inc atomically increments the counter for the given label values; it is
+// shorthand for Add(1, labelValues...). Passing the wrong number of
+// values is a programmer error: Add logs it and drops the sample rather
+// than aggregating into a bogus series.
+func (c Counter) Inc(labelValues ...string) {
+	c.Add(1, labelValues...)
+}
+
+// Add atomically adds delta to the series selected by labelValues,
+// creating that series on first use. A negative delta is ignored, because
+// counters only ever go up.
+func (c Counter) Add(delta int64, labelValues ...string) {
+	switch {
+	case delta < 0:
+		return
+	case len(labelValues) != len(c.c.labels):
+		// A label-count mismatch is a bug at the call site. It is logged
+		// and the sample dropped instead of panicking: Add sits on hot
+		// paths, some of them off the request goroutine, where a panic
+		// would bring down the whole process.
+		slog.Error("metrics: label count mismatch — dropping sample",
+			"counter", c.c.name, "want", len(c.c.labels), "got", len(labelValues))
+		return
+	}
+
+	state := c.c
+	tuple := encodeKey(labelValues)
+
+	// Hold the lock only to find or create the cell for this tuple.
+	state.seriesMu.Lock()
+	cell, exists := state.series[tuple]
+	if !exists {
+		cell = new(atomic.Int64)
+		state.series[tuple] = cell
+	}
+	state.seriesMu.Unlock()
+
+	cell.Add(delta)
+}
+
+// encodeKey joins label values with a 0x1f separator. Prometheus label
+// values may contain anything except `"` and `\n`, which we escape on
+// exposition only — the key here is just a map index.
+func encodeKey(values []string) string {
+	return strings.Join(values, "\x1f")
+}
+
+// WritePrometheus dumps the registry in the text exposition format
+// Prometheus / VictoriaMetrics / OpenMetrics understands. Stable
+// ordering: counters alphabetical by name; series alphabetical by
+// encoded label tuple.
+//
+// The counters are snapshotted under a single read-lock acquisition
+// instead of re-taking the lock once per counter; no registry lock is
+// held while writing to w. The first write error aborts the dump and is
+// returned.
+func (r *Registry) WritePrometheus(w io.Writer) error {
+	r.mu.RLock()
+	counters := make([]*counter, 0, len(r.counters))
+	for _, c := range r.counters {
+		counters = append(counters, c)
+	}
+	r.mu.RUnlock()
+
+	// Each counter's name is its registry key, so names are unique and
+	// this ordering matches sorting the map keys.
+	sort.Slice(counters, func(i, j int) bool { return counters[i].name < counters[j].name })
+
+	for _, c := range counters {
+		if err := writeCounter(w, c); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+// writeCounter emits one counter: its HELP and TYPE header lines followed
+// by one sample line per series, sorted by encoded label tuple. The
+// header is written even when the counter has no series yet. The first
+// write error is returned.
+func writeCounter(w io.Writer, c *counter) error {
+	if _, err := fmt.Fprintf(w, "# HELP %s %s\n# TYPE %s counter\n", c.name, escapeHelp(c.help), c.name); err != nil {
+		return err
+	}
+	// Snapshot the series map under a SINGLE lock acquisition. The
+	// previous shape acquired+released seriesMu twice per emitted
+	// series (once for the key list, once per Load), contending with
+	// every hot-path Inc on the HTTP request path. The *atomic.Int64
+	// pointers are stable for the lifetime of the registry (we never
+	// delete entries), so reading them after the unlock is safe.
+	type sample struct {
+		key string
+		val *atomic.Int64
+	}
+	c.seriesMu.Lock()
+	samples := make([]sample, 0, len(c.series))
+	for k, v := range c.series {
+		samples = append(samples, sample{k, v})
+	}
+	c.seriesMu.Unlock()
+
+	sort.Slice(samples, func(i, j int) bool { return samples[i].key < samples[j].key })
+
+	for _, s := range samples {
+		val := s.val.Load()
+		labels := decodeKey(s.key, c.labels)
+		// An empty label string (label-less counter, empty key, or a key
+		// that does not split into the declared labels) is written as a
+		// bare "name value" line.
+		if labels == "" {
+			if _, err := fmt.Fprintf(w, "%s %d\n", c.name, val); err != nil {
+				return err
+			}
+			continue
+		}
+		if _, err := fmt.Fprintf(w, "%s{%s} %d\n", c.name, labels, val); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+func decodeKey(key string, names []string) string {
+	if key == "" || len(names) == 0 {
+		return ""
+	}
+	values := strings.Split(key, "\x1f")
+	if len(values) != len(names) {
+		// Should not happen — encodeKey/decode are symmetric.
+		return ""
+	}
+	parts := make([]string, len(names))
+	for i, n := range names {
+		parts[i] = fmt.Sprintf(`%s="%s"`, n, escapeLabelValue(values[i]))
+	}
+	return strings.Join(parts, ",")
+}
+
+func escapeHelp(s string) string {
+	r := strings.NewReplacer("\\", "\\\\", "\n", "\\n")
+	return r.Replace(s)
+}
+
+func escapeLabelValue(s string) string {
+	r := strings.NewReplacer("\\", "\\\\", "\n", "\\n", `"`, `\"`)
+	return r.Replace(s)
+}
+
+// ── Pre-declared counters ────────────────────────────────────────────
+//
+// These are the counters Tinyforge surfaces to operators. Adding more is
+// a one-line NewCounter call at the call site — no central catalogue,
+// just keep names lowercase_snake with the `tinyforge_` prefix.
+
+var (
+	HTTPRequestsTotal = NewCounter(
+		"tinyforge_http_requests_total",
+		"Total HTTP requests handled, partitioned by method and outcome class.",
+		"method", "status_class",
+	)
+	DeploysTotal = NewCounter(
+		"tinyforge_deploys_total",
+		"Total deploys dispatched, partitioned by source kind and outcome.",
+		"source_kind", "outcome",
+	)
+	WebhookDeliveriesTotal = NewCounter(
+		"tinyforge_webhook_deliveries_total",
+		"Total inbound webhook deliveries, partitioned by outcome.",
+		"outcome",
+	)
+	SchedulerTicksTotal = NewCounter(
+		"tinyforge_scheduler_ticks_total",
+		"Total scheduler ticks. The dispatched counter is the success measure.",
+	)
+	SchedulerDispatchedTotal = NewCounter(
+		"tinyforge_scheduler_dispatched_total",
+		"Triggers actually dispatched by the scheduler.",
+	)
+	OutboundNotifyTotal = NewCounter(
+		"tinyforge_outbound_notify_total",
+		"Outbound notification dispatch attempts, partitioned by outcome.",
+		"outcome",
+	)
+)
@@ -16,6 +16,8 @@ import (
 	"time"

 	"github.com/google/uuid"
+
+	"github.com/alexei/tinyforge/internal/metrics"
 )

 // Event represents a deployment / site-sync notification payload.
@@ -83,17 +85,68 @@ type TestResult struct {
 // Notifications are fire-and-forget by default — failures are logged but do
 // not propagate. SendSyncForTest is the exception, used only by the manual
 // test endpoint.
+//
+// outboundSem caps the number of in-flight outbound notifications. Without
+// it a single burst (e.g. 1000 event triggers firing on a noisy log scan)
+// would spawn 1000 simultaneous TCP connections, which both DoSes the
+// receiver and exhausts local FDs.
 type Notifier struct {
-	httpClient *http.Client
-	wg         sync.WaitGroup
+	httpClient  *http.Client
+	wg          sync.WaitGroup
+	outboundSem chan struct{}
 }

+// maxOutboundNotifications bounds the in-flight outbound webhook fan-out.
+// Sized to keep small bursts non-blocking while preventing a runaway storm
+// from starving the rest of the process. Tunable later via settings if any
+// operator legitimately needs more concurrency.
+const maxOutboundNotifications = 32
+
 // New creates a Notifier with sensible defaults.
 func New() *Notifier {
+	// Transport with bounded host pooling so a slow receiver cannot pin
+	// arbitrarily many sockets open. MaxConnsPerHost mirrors the worker
+	// pool size; idle pruning keeps long-lived processes from holding
+	// stale TCP entries indefinitely.
+	//
+	// NOTE: we deliberately do NOT apply the staticsite SSRF dialer here.
+	// Notification URLs are admin-configured, and an admin already has
+	// Docker-socket (host-root-equivalent) access, so the SSRF surface adds
+	// nothing they couldn't already reach. Blocking loopback/private targets
+	// would instead break the common self-hosted pattern of notifying a
+	// same-host sidecar/bridge (e.g. service-to-notification-bridge on
+	// 127.0.0.1). See the security review (rated LOW / out of trust boundary).
+	tr := &http.Transport{
+		MaxIdleConns:        64,
+		MaxIdleConnsPerHost: 8,
+		MaxConnsPerHost:     maxOutboundNotifications,
+		IdleConnTimeout:     90 * time.Second,
+	}
 	return &Notifier{
 		httpClient: &http.Client{
-			Timeout: 10 * time.Second,
+			Timeout:   10 * time.Second,
+			Transport: tr,
 		},
+		outboundSem: make(chan struct{}, maxOutboundNotifications),
+	}
+}
+
+// acquireSlot reserves an outbound slot, respecting ctx so a backed-up
+// queue cannot starve a request that already has its own deadline.
+//
+// It returns true once a slot has been reserved (the caller must then
+// call releaseSlot), or false if ctx is done before a slot frees up.
+func (n *Notifier) acquireSlot(ctx context.Context) bool {
+	select {
+	// The send blocks while the buffered semaphore channel is full.
+	case n.outboundSem <- struct{}{}:
+		return true
+	case <-ctx.Done():
+		return false
+	}
+}
+
+func (n *Notifier) releaseSlot() {
+	select {
+	case <-n.outboundSem:
+	default:
+		// Drained during shutdown — never block.
 	}
 }

@@ -128,8 +181,15 @@ func (n *Notifier) SendSigned(webhookURL, secret string, tier Tier, event Event)
 	n.wg.Add(1)
 	go func() {
 		defer n.wg.Done()
-		ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
+		ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second)
 		defer cancel()
+		if !n.acquireSlot(ctx) {
+			slog.Warn("notify: dropped — outbound queue saturated",
+				"tier", tier, "host", safeHost(webhookURL), "delivery", delivery, "event", event.Type)
+			metrics.OutboundNotifyTotal.Inc("dropped")
+			return
+		}
+		defer n.releaseSlot()

 		_, err := n.doSend(ctx, webhookURL, secret, tier, delivery, event)
 		// URL host only — never log the secret or full URL with user-info.
@@ -138,11 +198,13 @@ func (n *Notifier) SendSigned(webhookURL, secret string, tier Tier, event Event)
 			slog.Warn("notify: webhook send failed",
 				"tier", tier, "host", host, "delivery", delivery,
 				"event", event.Type, "signed", secret != "", "error", err)
+			metrics.OutboundNotifyTotal.Inc("failure")
 			return
 		}
 		slog.Info("notify: webhook dispatched",
 			"tier", tier, "host", host, "delivery", delivery,
 			"event", event.Type, "signed", secret != "")
+		metrics.OutboundNotifyTotal.Inc("success")
 	}()
 }

@@ -166,8 +228,15 @@ func (n *Notifier) SendPayload(webhookURL, secret, eventType string, payload any
 	n.wg.Add(1)
 	go func() {
 		defer n.wg.Done()
-		ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
+		ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second)
 		defer cancel()
+		if !n.acquireSlot(ctx) {
+			slog.Warn("notify: dropped trigger payload — outbound queue saturated",
+				"tier", TierEventTrigger, "host", safeHost(webhookURL), "delivery", delivery, "event", eventType)
+			metrics.OutboundNotifyTotal.Inc("dropped")
+			return
+		}
+		defer n.releaseSlot()

 		_, err := n.doSendRaw(ctx, webhookURL, secret, TierEventTrigger, delivery, eventType, timestamp, payload)
 		host := safeHost(webhookURL)
@@ -175,11 +244,13 @@ func (n *Notifier) SendPayload(webhookURL, secret, eventType string, payload any
 			slog.Warn("notify: trigger webhook send failed",
 				"tier", TierEventTrigger, "host", host, "delivery", delivery,
 				"event", eventType, "signed", secret != "", "error", err)
+			metrics.OutboundNotifyTotal.Inc("failure")
 			return
 		}
 		slog.Info("notify: trigger webhook dispatched",
 			"tier", TierEventTrigger, "host", host, "delivery", delivery,
 			"event", eventType, "signed", secret != "")
+		metrics.OutboundNotifyTotal.Inc("success")
 	}()
 }

@@ -15,8 +15,8 @@ package reconciler

 import (
 	"context"
-	"encoding/json"
 	"errors"
+	"fmt"
 	"log/slog"
 	"sync"
 	"time"
@@ -110,17 +110,37 @@ func (r *Reconciler) ReconcileOnce(ctx context.Context) error {
 	if err != nil {
 		return err
 	}
+
+	// Load every workload ONCE per tick and index by ID. This replaces both
+	// the former N+1 GetWorkloadByID (one DB read per container) in the
+	// upsert loop and the second ListWorkloads("") in the plugin pass: net 1
+	// query per tick, 0 GetWorkloadByID.
+	//
+	// On error we return BEFORE the upsert loop and leave state untouched
+	// this tick (the next tick retries). We must NOT proceed with an empty
+	// map and fall through to markMissingRows: with no container resolving,
+	// `seen` would be empty and markMissingRows would flip EVERY live row to
+	// 'missing'. Aborting early is the safe choice.
+	rows, err := r.store.ListWorkloads("")
+	if err != nil {
+		return fmt.Errorf("reconciler: list workloads: %w", err)
+	}
+	byID := make(map[string]store.Workload, len(rows))
+	for _, w := range rows {
+		byID[w.ID] = w
+	}
+
 	seen := make(map[string]struct{}, len(items)) // container row IDs we touched

 	for _, item := range items {
-		rowID := r.upsertFromItem(item)
+		rowID := r.upsertFromItem(item, byID)
 		if rowID != "" {
 			seen[rowID] = struct{}{}
 		}
 	}

 	r.markMissingRows(seen)
-	r.reconcilePluginWorkloads(ctx)
+	r.reconcilePluginWorkloads(ctx, rows)
 	return nil
 }

@@ -137,20 +157,18 @@ func (r *Reconciler) ReconcileOnce(ctx context.Context) error {
 //
 // No-op when the plugin dispatcher hasn't been wired (boot-time race,
 // disabled deployments, tests).
-func (r *Reconciler) reconcilePluginWorkloads(ctx context.Context) {
+//
+// rows is the workload set already loaded once by ReconcileOnce — passed
+// through rather than re-queried so a tick costs a single ListWorkloads.
+func (r *Reconciler) reconcilePluginWorkloads(ctx context.Context, rows []store.Workload) {
 	if r.plugins == nil {
 		return
 	}
-	rows, err := r.store.ListWorkloads("")
-	if err != nil {
-		slog.Warn("reconciler: list workloads for plugin pass", "error", err)
-		return
-	}
 	for _, w := range rows {
 		if w.SourceKind == "" {
 			continue
 		}
-		pw := toPluginWorkload(w)
+		pw := plugin.WorkloadFromStore(w)
 		if err := r.plugins.DispatchReconcile(ctx, pw); err != nil {
 			slog.Warn("reconciler: plugin reconcile failed",
 				"workload", w.ID, "kind", w.SourceKind, "error", err)
@@ -158,33 +176,6 @@ func (r *Reconciler) reconcilePluginWorkloads(ctx context.Context) {
 	}
 }

-// toPluginWorkload mirrors the api / webhook converters; kept local to
-// avoid an import dependency between those packages.
-func toPluginWorkload(w store.Workload) plugin.Workload {
-	var faces []plugin.PublicFace
-	if w.PublicFaces != "" {
-		_ = json.Unmarshal([]byte(w.PublicFaces), &faces)
-	}
-	return plugin.Workload{
-		ID:                      w.ID,
-		Name:                    w.Name,
-		GroupID:                 w.AppID,
-		ParentWorkloadID:        w.ParentWorkloadID,
-		SourceKind:              w.SourceKind,
-		SourceConfig:            json.RawMessage(w.SourceConfig),
-		TriggerKind:             w.TriggerKind,
-		TriggerConfig:           json.RawMessage(w.TriggerConfig),
-		PublicFaces:             faces,
-		NotificationURL:         w.NotificationURL,
-		NotificationSecret:      w.NotificationSecret,
-		WebhookSecret:           w.WebhookSecret,
-		WebhookSigningSecret:    w.WebhookSigningSecret,
-		WebhookRequireSignature: w.WebhookRequireSignature,
-		CreatedAt:               w.CreatedAt,
-		UpdatedAt:               w.UpdatedAt,
-	}
-}
-
 func (r *Reconciler) loop(ctx context.Context) {
 	defer r.wg.Done()

@@ -214,9 +205,9 @@ func (r *Reconciler) loop(ctx context.Context) {
 // After the hard cutover only the canonical tinyforge.workload.id label
 // path is honored — every Source plugin labels its containers with the
 // workload identity at create time.
-func (r *Reconciler) upsertFromItem(item docker.ReconcileItem) string {
+func (r *Reconciler) upsertFromItem(item docker.ReconcileItem, byID map[string]store.Workload) string {
 	if id := item.Labels[docker.LabelWorkloadID]; id != "" {
-		return r.upsertByWorkloadLabel(item, id)
+		return r.upsertByWorkloadLabel(item, id, byID)
 	}
 	return ""
 }
@@ -233,9 +224,9 @@ func (r *Reconciler) upsertFromItem(item docker.ReconcileItem) string {
 // known workload row is silently ignored. Anyone with Docker socket access
 // could otherwise spawn a container with a forged label and steal the
 // canonical slot for an existing workload.
-func (r *Reconciler) upsertByWorkloadLabel(item docker.ReconcileItem, workloadID string) string {
-	w, err := r.store.GetWorkloadByID(workloadID)
-	if err != nil {
+func (r *Reconciler) upsertByWorkloadLabel(item docker.ReconcileItem, workloadID string, byID map[string]store.Workload) string {
+	w, ok := byID[workloadID]
+	if !ok {
 		// Forged or stale label — log once at debug; tick rate keeps logs quiet.
 		slog.Debug("reconciler: unknown workload_id label", "workload_id", workloadID, "container_id", item.ID)
 		return ""
@@ -257,6 +257,138 @@ func TestReconcileSkipsProjectInsertWithoutDeployerRow(t *testing.T) {
 	}
 }

+// TestReconcileBatchingPreservesBehavior pins Fix A. ReconcileOnce now loads
+// every workload once per tick and resolves container labels from that
+// in-memory map rather than one GetWorkloadByID per container; the outcome
+// must match the per-container lookup. The fixture mixes two workloads'
+// containers, a forged label and a stale row. After one ReconcileOnce the
+// known-workload containers carry the snapshot State, the forged-label
+// container is skipped, and the vanished stale row is flipped to missing.
+func TestReconcileBatchingPreservesBehavior(t *testing.T) {
+	db := newTestStore(t)
+
+	wlA := makeWorkload(t, db, "batch-a", "stack")
+	wlB := makeWorkload(t, db, "batch-b", "stack")
+
+	// Seed a row for wlB whose container no longer exists — expect "missing".
+	if err := db.UpsertContainer(store.Container{
+		ID: wlB.ID + ":old", WorkloadID: wlB.ID, WorkloadKind: "stack",
+		Role: "old", ContainerID: "docker-vanished", State: "running",
+	}); err != nil {
+		t.Fatalf("seed stale row: %v", err)
+	}
+
+	dockerFake := &fakeDocker{items: []docker.ReconcileItem{
+		{
+			ID: "docker-a1", Name: "batch-a-web-1", Image: "nginx:1.27", State: "running",
+			Labels: map[string]string{
+				docker.LabelManaged:      "true",
+				docker.LabelWorkloadID:   wlA.ID,
+				docker.LabelWorkloadKind: "stack",
+				docker.LabelRole:         "web",
+			},
+			Ports: []uint16{8080},
+		},
+		{
+			ID: "docker-b1", Name: "batch-b-api-1", Image: "redis:7", State: "exited",
+			Labels: map[string]string{
+				docker.LabelManaged:      "true",
+				docker.LabelWorkloadID:   wlB.ID,
+				docker.LabelWorkloadKind: "stack",
+				docker.LabelRole:         "api",
+			},
+		},
+		{
+			// Label points at a workload that does not exist — must be ignored.
+			ID: "docker-evil", Name: "evil", Image: "nginx", State: "running",
+			Labels: map[string]string{
+				docker.LabelManaged:      "true",
+				docker.LabelWorkloadID:   "wl-forged",
+				docker.LabelWorkloadKind: "stack",
+				docker.LabelRole:         "web",
+			},
+		},
+	}}
+
+	rec := New(db, dockerFake, 0)
+	if err := rec.ReconcileOnce(context.Background()); err != nil {
+		t.Fatalf("ReconcileOnce: %v", err)
+	}
+
+	// wlA: exactly one row, bound to docker-a1 and running.
+	rowsA, _ := db.ListContainersByWorkload(wlA.ID)
+	if len(rowsA) != 1 {
+		t.Fatalf("w1: expected 1 row, got %d", len(rowsA))
+	}
+	if a := rowsA[0]; a.ContainerID != "docker-a1" || a.State != "running" || a.Role != "web" {
+		t.Fatalf("w1 row wrong: %+v", a)
+	}
+
+	// wlB: the api container was adopted (exited→stopped); the seeded row went missing.
+	apiRow, _ := db.GetContainerByID(wlB.ID + ":api")
+	if apiRow.ContainerID != "docker-b1" || apiRow.State != "stopped" {
+		t.Fatalf("w2 api row wrong: %+v", apiRow)
+	}
+	staleRow, _ := db.GetContainerByID(wlB.ID + ":old")
+	if staleRow.State != "missing" {
+		t.Fatalf("w2 stale row should be missing, got %q", staleRow.State)
+	}
+
+	// The forged label must not have produced a row under any workload.
+	every, _ := db.ListContainers(store.ContainerFilter{})
+	for _, row := range every {
+		if row.ContainerID == "docker-evil" {
+			t.Fatalf("forged-label container was adopted: %+v", row)
+		}
+	}
+}
+
+// TestReconcileSyncsImageContainerState pins the Fix B coupling: an image
+// container's State is synced from the snapshot by the generic reconciler
+// upsert pass, NOT by image.Reconcile. Image containers are labelled with
+// workload_id / kind=image / role=image at create time, so the row of a
+// present container has its State written by this pass — which is what makes
+// the per-container inspect formerly done in image.Reconcile redundant.
+func TestReconcileSyncsImageContainerState(t *testing.T) {
+	db := newTestStore(t)
+	wl := makeWorkload(t, db, "img", "image")
+
+	// The deployer already wrote the image container row as running; Docker
+	// now says the container exited, so the generic pass must store stopped.
+	if err := db.UpsertContainer(store.Container{
+		ID: "img-deploy-uuid", WorkloadID: wl.ID, WorkloadKind: "image",
+		Role: "image", ContainerID: "docker-img", State: "running",
+	}); err != nil {
+		t.Fatalf("seed image row: %v", err)
+	}
+
+	dockerFake := &fakeDocker{items: []docker.ReconcileItem{{
+		ID: "docker-img", Image: "ghcr.io/owner/app:v1", State: "exited",
+		Labels: map[string]string{
+			docker.LabelManaged:      "true",
+			docker.LabelWorkloadID:   wl.ID,
+			docker.LabelWorkloadKind: "image",
+			docker.LabelRole:         "image",
+		},
+		Ports: []uint16{3000},
+	}}}
+
+	// The plugin reconciler is deliberately left unwired, so any state sync
+	// observed below can only come from the generic upsert pass.
+	rec := New(db, dockerFake, 0)
+	if err := rec.ReconcileOnce(context.Background()); err != nil {
+		t.Fatalf("ReconcileOnce: %v", err)
+	}
+
+	row, _ := db.GetContainerByID("img-deploy-uuid")
+	if row.State != "stopped" {
+		t.Fatalf("image container state not synced by generic pass: got %q want stopped", row.State)
+	}
+	if row.Port != 3000 || row.ImageRef != "ghcr.io/owner/app:v1" {
+		t.Fatalf("image container docker fields not synced: %+v", row)
+	}
+}
+
 func TestReconcileNormalizesState(t *testing.T) {
 	st := newTestStore(t)
 	w := makeWorkload(t, st, "norm", "stack")
@@ -27,6 +27,7 @@ import (
 	"sync"
 	"time"

+	"github.com/alexei/tinyforge/internal/metrics"
 	"github.com/alexei/tinyforge/internal/store"
 	"github.com/alexei/tinyforge/internal/workload/plugin"
 	"github.com/alexei/tinyforge/internal/workload/plugin/trigger/schedule"
@@ -124,6 +125,7 @@ func (s *Scheduler) loop(ctx context.Context) {
 // TickOnce runs a single sweep. Exposed for tests and for the boot
 // kick. On error per-trigger the loop continues with the next row.
 func (s *Scheduler) TickOnce(ctx context.Context) {
+	metrics.SchedulerTicksTotal.Inc()
 	rows, err := s.store.ListTriggers("schedule")
 	if err != nil {
 		slog.Warn("scheduler: list triggers", "error", err)
@@ -226,5 +228,6 @@ func (s *Scheduler) fire(ctx context.Context, t store.Trigger, now time.Time) {
 		slog.Warn("scheduler: dispatch", "trigger", t.Name, "error", err)
 		return
 	}
+	metrics.SchedulerDispatchedTotal.Inc()
 	slog.Info("scheduler: fired", "trigger", t.Name, "kind", t.Kind, "at", ts)
 }
@@ -92,17 +92,27 @@ func (c *Compose) Ps(ctx context.Context, projectName, yamlPath string) ([]Servi
 }

 // Logs runs `docker compose -p <projectName> logs --no-color --tail=<n> <service>`.
-// If service is empty, logs for all services are returned.
+// If service is empty, logs for all services are returned. The service arg
+// is preceded by `--` so a service name that begins with `-` cannot be
+// re-parsed as a flag by the docker CLI (flag-injection guard).
 func (c *Compose) Logs(ctx context.Context, projectName, service string, tail int) (string, error) {
 	args := []string{"logs", "--no-color", fmt.Sprintf("--tail=%d", tail)}
 	if service != "" {
-		args = append(args, service)
+		args = append(args, "--", service)
 	}
 	return c.run(ctx, projectName, args...)
 }

-// run executes `docker compose -p <projectName> <args...>` and returns combined output.
+// run executes `docker compose -p <projectName> <args...>` and returns
+// combined output. projectName is verified not to begin with `-` because
+// `docker compose -p '--foo'` would otherwise be re-parsed as a flag —
+// the callers already sanitize project names through projectNameSanitizer,
+// but a belt-and-braces refusal here means any future caller cannot
+// accidentally bypass the sanitizer.
 func (c *Compose) run(ctx context.Context, projectName string, args ...string) (string, error) {
+	if projectName == "" || strings.HasPrefix(projectName, "-") {
+		return "", fmt.Errorf("docker compose: refusing project name %q", projectName)
+	}
 	full := append([]string{"compose", "-p", projectName}, args...)
 	cmd := exec.CommandContext(ctx, c.binary, full...)
 	var buf bytes.Buffer
@@ -2,6 +2,7 @@ package stack

 import (
 	"fmt"
+	"strings"

 	"gopkg.in/yaml.v3"
 )
@@ -15,11 +16,25 @@ type ComposeSpec struct {
 }

 // ServiceSpec captures the subset of compose service fields we inspect.
+//
+// All host-escape-adjacent fields are decoded here even though Tinyforge
+// itself never reads them at runtime — surfacing them to Validate() is the
+// only way to *reject* them. Add new fields here when blocking a new
+// escape vector.
 type ServiceSpec struct {
-	Image      string            `yaml:"image,omitempty"`
-	Ports      []any             `yaml:"ports,omitempty"`
-	Labels     map[string]string `yaml:"labels,omitempty"`
-	Privileged bool              `yaml:"privileged,omitempty"`
+	Image       string            `yaml:"image,omitempty"`
+	Build       any               `yaml:"build,omitempty"` // banned — see Validate
+	Ports       []any             `yaml:"ports,omitempty"`
+	Labels      map[string]string `yaml:"labels,omitempty"`
+	Privileged  bool              `yaml:"privileged,omitempty"`
+	Volumes     []any             `yaml:"volumes,omitempty"`
+	NetworkMode string            `yaml:"network_mode,omitempty"`
+	Pid         string            `yaml:"pid,omitempty"`
+	Ipc         string            `yaml:"ipc,omitempty"`
+	UsernsMode  string            `yaml:"userns_mode,omitempty"`
+	CapAdd      []string          `yaml:"cap_add,omitempty"`
+	Devices     []any             `yaml:"devices,omitempty"`
+	SecurityOpt []string          `yaml:"security_opt,omitempty"`
 }

 // Parse decodes YAML into a ComposeSpec. Returns a descriptive error on failure.
@@ -35,10 +50,20 @@ func Parse(yamlText string) (ComposeSpec, error) {
 }

 // Validate enforces Tinyforge-level constraints beyond compose schema validity.
+// All blocked fields below are documented host-escape vectors: any one of
+// them on its own gives the container root on the host. Tinyforge already
+// owns the docker socket, so the threat model is "any admin == host root,"
+// and these blocks raise the bar for any *future* viewer-to-admin
+// escalation as well as honest-mistake guardrails.
+//
 // Current rules:
 //   - No service may set `privileged: true`.
-//   - Every service must declare an image (compose supports build: too, but
-//     Tinyforge v1 disallows building from context to avoid arbitrary-code exec).
+//   - Every service must declare an image (build contexts disallowed).
+//   - No host networking/PID/IPC/userns; no container:/service: joins (net/pid/ipc).
+//   - No `cap_add`, `security_opt`, `devices`.
+//   - `volumes` may not bind-mount relative or `~` paths, the docker socket,
+//     /, /etc, /var, /proc, /sys, /root, /home, /boot, or /dev — conservative;
+//     real bind-mount needs belong in a Source plugin or a dedicated wizard.
 func Validate(spec ComposeSpec) error {
 	for name, svc := range spec.Services {
 		if svc.Privileged {
@@ -47,6 +72,121 @@ func Validate(spec ComposeSpec) error {
 		if svc.Image == "" {
 			return fmt.Errorf("service %q: image is required (build contexts not supported)", name)
 		}
+		if svc.Build != nil {
+			return fmt.Errorf("service %q: build: is not supported (use image:)", name)
+		}
+		if isBlockedNamespaceMode(svc.NetworkMode) {
+			return fmt.Errorf("service %q: network_mode %q is not allowed", name, svc.NetworkMode)
+		}
+		if isBlockedNamespaceMode(svc.Pid) {
+			return fmt.Errorf("service %q: pid: %q is not allowed", name, svc.Pid)
+		}
+		if isBlockedNamespaceMode(svc.Ipc) {
+			return fmt.Errorf("service %q: ipc: %q is not allowed", name, svc.Ipc)
+		}
+		if isHostMode(svc.UsernsMode) {
+			return fmt.Errorf("service %q: userns_mode %q is not allowed", name, svc.UsernsMode)
+		}
+		if len(svc.CapAdd) > 0 {
+			return fmt.Errorf("service %q: cap_add is not allowed", name)
+		}
+		if len(svc.SecurityOpt) > 0 {
+			return fmt.Errorf("service %q: security_opt is not allowed", name)
+		}
+		if len(svc.Devices) > 0 {
+			return fmt.Errorf("service %q: devices is not allowed", name)
+		}
+		for _, v := range svc.Volumes {
+			if host, ok := bindMountHostPath(v); ok {
+				if isBlockedBindMount(host) {
+					return fmt.Errorf("service %q: bind-mounting %q is not allowed", name, host)
+				}
+			}
+		}
 	}
 	return nil
 }
+
+// isHostMode reports whether v is exactly "host" — the network_mode / pid /
+// ipc / userns_mode value that shares a host namespace. "host-gateway" is an
+// extra_hosts value, not a namespace mode, and is deliberately NOT matched.
+func isHostMode(v string) bool {
+	const hostNamespace = "host"
+	return v == hostNamespace
+}
+
+// isBlockedNamespaceMode reports whether v is a namespace mode that Validate
+// refuses for network_mode / pid / ipc: host sharing ("host"), or a join into
+// another container's or compose service's namespace ("container:<id>",
+// "service:<name>"). Such joins let a malicious service attach to a victim's
+// network or PID namespace — a lateral-movement and sandbox-escape vector.
+func isBlockedNamespaceMode(v string) bool {
+	if isHostMode(v) {
+		return true
+	}
+	return strings.HasPrefix(v, "container:") || strings.HasPrefix(v, "service:")
+}
+
+// bindMountHostPath extracts the host-side path from a compose volume entry.
+// Compose accepts a short string "[src:]dst[:mode]" and a long-form map with
+// "type" / "source" keys. ok=false means the entry has no host source: a named
+// volume, an anonymous volume (short form without ':'), or a non-bind type.
+func bindMountHostPath(v any) (string, bool) {
+	var src string
+	switch t := v.(type) {
+	case string:
+		// A lone path such as "/var/lib/mysql" is the CONTAINER target of an
+		// anonymous volume, not a host path — only "src:dst" can be a bind.
+		host, _, hasTarget := strings.Cut(t, ":")
+		if !hasTarget {
+			return "", false
+		}
+		src = host
+	case map[string]any:
+		// An omitted type is still inspected as a possible bind (conservative).
+		if typ, _ := t["type"].(string); typ != "" && typ != "bind" {
+			return "", false
+		}
+		src, _ = t["source"].(string)
+	}
+	// Host paths are absolute, relative ("./", "../") or home-relative ("~");
+	// anything else in the source slot names a volume.
+	if strings.HasPrefix(src, "/") || strings.HasPrefix(src, ".") || strings.HasPrefix(src, "~") {
+		return src, true
+	}
+	return "", false
+}
+
+// isBlockedBindMount returns true for host paths that obviously escape the
+// container's intended sandbox. Conservative deny-list — operators with
+// legitimate bind-mount needs should write a dedicated Source plugin rather
+// than tunnel them through compose. Matching is lexical; symlinks are not resolved.
+func isBlockedBindMount(host string) bool {
+	// Empty, relative ("./x", "../x", ".") and home-relative ("~/...") sources
+	// resolve against the compose working directory or stay unexpanded, and
+	// "../" can climb out entirely — reject them; other bare names pass.
+	if !strings.HasPrefix(host, "/") {
+		return host == "" || strings.HasPrefix(host, ".") || strings.HasPrefix(host, "~")
+	}
+	// Collapse "", "." and ".." segments before matching so "//etc", "/./var"
+	// or "/srv/../run/docker.sock" cannot slip past the deny-list.
+	var segs []string
+	for _, s := range strings.Split(host, "/") {
+		switch {
+		case s == "" || s == ".": // "//" and "/./" add nothing
+		case s != "..":
+			segs = append(segs, s)
+		case len(segs) > 0: // ".." pops a level but never climbs above "/"
+			segs = segs[:len(segs)-1]
+		}
+	}
+	if len(segs) == 0 {
+		return true // the filesystem root itself
+	}
+	// Blocked top-level trees (sub-paths included); /var covers its docker.sock.
+	switch segs[0] {
+	case "etc", "var", "proc", "sys", "root", "home", "boot", "dev":
+		return true
+	}
+	return len(segs) == 2 && segs[0] == "run" && segs[1] == "docker.sock"
+}
@@ -0,0 +1,55 @@
+package staticsite
+
+import (
+	"context"
+	"log/slog"
+)
+
+// CommitStatusReporter relays deploy outcomes to the git provider as commit
+// statuses, subject to the per-workload report_commit_status flag. Reporting
+// is best-effort by design: Report logs a failed provider call at Warn and
+// returns normally, so reporting can never fail or block the deploy.
+//
+// The provider and commit identifiers are captured once, at deploy start, so
+// each transition point (pending/success/failure) stays a one-line call.
+// Report is also a no-op on a nil receiver, which spares callers a guard at
+// every transition when reporting is disabled.
+//
+// The type sits in the staticsite package, next to GitProvider and
+// CommitStatus, instead of the plugin package: source plugins can share it
+// without staticsite importing plugin. For the same reason it is built from
+// primitives rather than from plugin.Workload.
+type CommitStatusReporter struct {
+	enabled   bool
+	provider  GitProvider
+	owner     string
+	repo      string
+	sha       string
+	targetURL string
+}
+
+// NewCommitStatusReporter captures the resolved deploy inputs in a reporter.
+// provider receives the statuses; owner, repo and sha identify the commit;
+// targetURL is passed along with every status; enabled carries the
+// report_commit_status setting. Nothing is validated here: the returned
+// reporter's Report method is simply inert when enabled is false or the SHA
+// is empty.
+func NewCommitStatusReporter(provider GitProvider, owner, repo, sha, targetURL string, enabled bool) *CommitStatusReporter {
+	r := new(CommitStatusReporter)
+	r.provider, r.enabled = provider, enabled
+	r.owner, r.repo = owner, repo
+	r.sha, r.targetURL = sha, targetURL
+	return r
+}
+
+// Report forwards one commit status to the provider; a provider error is
+// logged and dropped. No-op for a nil/disabled reporter, nil provider or empty SHA.
+func (r *CommitStatusReporter) Report(ctx context.Context, workloadName, workloadID string, status CommitStatus, description string) {
+	if r == nil || !r.enabled || r.provider == nil || r.sha == "" {
+		return
+	}
+	err := r.provider.SetCommitStatus(ctx, r.owner, r.repo, r.sha, status, r.targetURL, description)
+	if err != nil {
+		slog.Warn("commit-status report failed (ignored)", "workload", workloadName, "workload_id", workloadID, "status", string(status), "error", err)
+	}
+}
@@ -0,0 +1,125 @@
+package staticsite
+
+import (
+	"context"
+	"errors"
+	"testing"
+)
+
+// fakeReporterProvider is a GitProvider stub that logs every SetCommitStatus
+// call it receives. The remaining interface methods exist only to satisfy
+// GitProvider and panic when invoked, so a mis-wired test fails loudly.
+type fakeReporterProvider struct {
+	calls   []reporterStatusCall
+	failErr error // handed back by SetCommitStatus (best-effort path); nil = success
+}
+
+type reporterStatusCall struct {
+	owner, repo, sha string
+	status           CommitStatus
+	targetURL, descr string
+}
+
+func (f *fakeReporterProvider) SetCommitStatus(_ context.Context, owner, repo, sha string, status CommitStatus, targetURL, description string) error {
+	f.calls = append(f.calls, reporterStatusCall{owner: owner, repo: repo, sha: sha, status: status, targetURL: targetURL, descr: description})
+	return f.failErr
+}
+
+func (*fakeReporterProvider) Name() string { return "fake" }
+func (*fakeReporterProvider) TestConnection(context.Context, string, string) error {
+	panic("unused")
+}
+func (*fakeReporterProvider) ListRepos(context.Context, string) ([]RepoInfo, error) {
+	panic("unused")
+}
+func (*fakeReporterProvider) ListBranches(context.Context, string, string) ([]string, error) {
+	panic("unused")
+}
+func (*fakeReporterProvider) GetLatestCommitSHA(context.Context, string, string, string) (string, error) {
+	panic("unused")
+}
+func (*fakeReporterProvider) ListTree(context.Context, string, string, string) ([]FolderEntry, error) {
+	panic("unused")
+}
+func (*fakeReporterProvider) DownloadFolder(context.Context, string, string, string, string, string) error {
+	panic("unused")
+}
+func (*fakeReporterProvider) DownloadFile(context.Context, string, string, string, string, int64) ([]byte, error) {
+	panic("unused")
+}
+
+// Enabled: each Report reaches the provider with the captured identifiers + target.
+func TestCommitStatusReporter_Enabled_Calls(t *testing.T) {
+	provider := &fakeReporterProvider{}
+	reporter := NewCommitStatusReporter(provider, "owner", "pages", "abc123", "https://app.example.com", true)
+
+	reporter.Report(context.Background(), "site", "wid-1", CommitStatusPending, "Tinyforge: deploying")
+	reporter.Report(context.Background(), "site", "wid-1", CommitStatusSuccess, "Tinyforge: deployed")
+
+	if n := len(provider.calls); n != 2 {
+		t.Fatalf("calls = %d, want 2", n)
+	}
+	pending, success := provider.calls[0], provider.calls[1]
+	if pending.owner != "owner" || pending.repo != "pages" || pending.sha != "abc123" {
+		t.Errorf("identifiers wrong: %+v", pending)
+	}
+	if pending.status != CommitStatusPending {
+		t.Errorf("first status = %q, want pending", pending.status)
+	}
+	if pending.targetURL != "https://app.example.com" {
+		t.Errorf("targetURL = %q", pending.targetURL)
+	}
+	if success.status != CommitStatusSuccess {
+		t.Errorf("second status = %q, want success", success.status)
+	}
+}
+
+// Disabled: Report must never reach the provider.
+func TestCommitStatusReporter_Disabled_NoCalls(t *testing.T) {
+	provider := &fakeReporterProvider{}
+	reporter := NewCommitStatusReporter(provider, "owner", "pages", "abc123", "", false)
+
+	reporter.Report(context.Background(), "site", "wid-1", CommitStatusSuccess, "x")
+	if n := len(provider.calls); n != 0 {
+		t.Fatalf("expected no calls when disabled, got %d", n)
+	}
+}
+
+// With no SHA to attach a status to (e.g. the provider could not resolve the
+// branch), an enabled reporter must still stay silent.
+func TestCommitStatusReporter_EmptySHA_NoCalls(t *testing.T) {
+	provider := &fakeReporterProvider{}
+	reporter := NewCommitStatusReporter(provider, "owner", "pages", "", "", true)
+
+	reporter.Report(context.Background(), "site", "wid-1", CommitStatusPending, "x")
+	if n := len(provider.calls); n != 0 {
+		t.Fatalf("expected no calls with empty SHA, got %d", n)
+	}
+}
+
+// Reporting is best-effort: when the provider fails, Report must neither
+// panic nor propagate. Returning normally after the failing call is the check.
+func TestCommitStatusReporter_ProviderError_Swallowed(t *testing.T) {
+	provider := &fakeReporterProvider{failErr: errors.New("boom")}
+	reporter := NewCommitStatusReporter(provider, "owner", "pages", "abc123", "", true)
+
+	// Reaching the assertion below means the error was swallowed.
+	reporter.Report(context.Background(), "site", "wid-1", CommitStatusFailure, "Tinyforge: deploy failed")
+	if n := len(provider.calls); n != 1 {
+		t.Fatalf("expected the failing call to still be recorded, got %d", n)
+	}
+}
+
+// Some call paths only construct a reporter when needed; Report on nil is safe.
+func TestCommitStatusReporter_NilSafe(t *testing.T) {
+	var reporter *CommitStatusReporter
+	// The test passes as long as this call does not panic.
+	reporter.Report(context.Background(), "site", "wid-1", CommitStatusSuccess, "x")
+}
+
+// Defensive guard: an enabled reporter without a provider is a no-op too.
+func TestCommitStatusReporter_NilProvider_NoPanic(t *testing.T) {
+	reporter := NewCommitStatusReporter(nil, "owner", "pages", "abc123", "", true)
+	// The test passes as long as this call does not panic.
+	reporter.Report(context.Background(), "site", "wid-1", CommitStatusSuccess, "x")
+}
@@ -0,0 +1,331 @@
+package staticsite
+
+import (
+	"context"
+	"encoding/json"
+	"io"
+	"net/http"
+	"net/http/httptest"
+	"strings"
+	"testing"
+	"time"
+)
+
+// ── State mapping (pure) ────────────────────────────────────────────
+//
+// Each provider maps the provider-agnostic CommitStatus onto its own API
+// vocabulary. Gitea/GitHub accept the same four words; GitLab collapses
+// failure+error into "failed".
+
+// TestGiteaState_Mapping pins the CommitStatus -> Gitea state vocabulary,
+// including the "pending" fallback for values Tinyforge does not define.
+func TestGiteaState_Mapping(t *testing.T) {
+	table := []struct {
+		in   CommitStatus
+		want string
+	}{
+		{CommitStatusPending, "pending"},
+		{CommitStatusSuccess, "success"},
+		{CommitStatusFailure, "failure"},
+		{CommitStatusError, "error"},
+		{CommitStatus("???"), "pending"}, // unknown -> pending fallback
+	}
+	for _, tc := range table {
+		got := giteaState(tc.in)
+		if got != tc.want {
+			t.Errorf("giteaState(%q) = %q, want %q", tc.in, got, tc.want)
+		}
+	}
+}
+
+// TestGitHubState_Mapping pins the CommitStatus -> GitHub state vocabulary,
+// including the "pending" fallback for unrecognised values.
+func TestGitHubState_Mapping(t *testing.T) {
+	table := []struct {
+		in   CommitStatus
+		want string
+	}{
+		{CommitStatusPending, "pending"},
+		{CommitStatusSuccess, "success"},
+		{CommitStatusFailure, "failure"},
+		{CommitStatusError, "error"},
+		{CommitStatus("???"), "pending"},
+	}
+	for _, tc := range table {
+		got := githubState(tc.in)
+		if got != tc.want {
+			t.Errorf("githubState(%q) = %q, want %q", tc.in, got, tc.want)
+		}
+	}
+}
+
+// TestGitLabState_Mapping pins the CommitStatus -> GitLab state vocabulary:
+// failure and error both collapse to "failed", unknown values to "pending".
+func TestGitLabState_Mapping(t *testing.T) {
+	table := []struct {
+		in   CommitStatus
+		want string
+	}{
+		{CommitStatusPending, "pending"},
+		{CommitStatusSuccess, "success"},
+		{CommitStatusFailure, "failed"}, // GitLab has no "failure"
+		{CommitStatusError, "failed"},   // error also collapses to "failed"
+		{CommitStatus("???"), "pending"},
+	}
+	for _, tc := range table {
+		got := gitlabState(tc.in)
+		if got != tc.want {
+			t.Errorf("gitlabState(%q) = %q, want %q", tc.in, got, tc.want)
+		}
+	}
+}
+
+// TestTruncateDescription checks that short descriptions pass through
+// untouched and that over-long ones are clamped to the cap (measured in
+// runes) and end with an ellipsis.
+func TestTruncateDescription(t *testing.T) {
+	const short = "Tinyforge: deploying"
+	if out := truncateDescription(short); out != short {
+		t.Errorf("short description mutated: %q", out)
+	}
+
+	truncated := truncateDescription(strings.Repeat("x", 200))
+	if n := len([]rune(truncated)); n > maxCommitStatusDescription {
+		t.Errorf("truncated length = %d runes, want <= %d", n, maxCommitStatusDescription)
+	}
+	if !strings.HasSuffix(truncated, "…") {
+		t.Errorf("missing ellipsis on truncation: %q", truncated)
+	}
+}
+
+// ── Endpoint + body construction (httptest) ─────────────────────────
+//
+// The SSRF-safe client refuses loopback, so for these tests we swap the
+// provider's httpClient for a plain one pointed at httptest. This still
+// exercises the real URL/body construction inside each SetCommitStatus.
+
+// capturedRequest records the parts of one inbound HTTP request that the
+// provider tests assert on. It is filled in by the handler that
+// newCaptureServer installs.
+type capturedRequest struct {
+	method string            // HTTP method of the request
+	path   string            // r.URL.EscapedPath() — preserves %2F so GitLab's encoded project path is observable
+	rawQ   string            // raw (still-encoded) query string
+	body   map[string]string // JSON-decoded request body; left untouched when the body is empty
+	auth   string            // Authorization header value
+	token  string            // PRIVATE-TOKEN (GitLab)
+}
+
+// newCaptureServer starts an httptest server that copies the method, escaped
+// path, raw query, auth headers and JSON body of each request into capture,
+// then answers 201 Created with an empty JSON object. The caller closes it.
+func newCaptureServer(t *testing.T, capture *capturedRequest) *httptest.Server {
+	t.Helper()
+	record := func(w http.ResponseWriter, r *http.Request) {
+		capture.method, capture.path, capture.rawQ = r.Method, r.URL.EscapedPath(), r.URL.RawQuery
+		capture.auth = r.Header.Get("Authorization")
+		capture.token = r.Header.Get("PRIVATE-TOKEN")
+		// Read/decode failures are deliberately ignored: a body that did not
+		// decode leaves capture.body unset and the caller's assertions fail.
+		if payload, _ := io.ReadAll(r.Body); len(payload) > 0 {
+			_ = json.Unmarshal(payload, &capture.body)
+		}
+		w.WriteHeader(http.StatusCreated)
+		_, _ = w.Write([]byte(`{}`))
+	}
+	return httptest.NewServer(http.HandlerFunc(record))
+}
+
+// TestGitea_SetCommitStatus_Request verifies the method, path, JSON body and
+// auth header that the Gitea provider sends for a commit status.
+func TestGitea_SetCommitStatus_Request(t *testing.T) {
+	var got capturedRequest // named "got" so the builtin cap() is not shadowed
+	srv := newCaptureServer(t, &got)
+	defer srv.Close()
+
+	fetcher := NewGiteaContentFetcher(srv.URL, "tok123")
+	fetcher.httpClient = srv.Client() // bypass SSRF guard for loopback test server
+
+	if err := fetcher.SetCommitStatus(context.Background(), "owner", "repo", "abc123",
+		CommitStatusSuccess, "https://app.example.com", "deployed"); err != nil {
+		t.Fatalf("SetCommitStatus: %v", err)
+	}
+
+	if got.method != http.MethodPost {
+		t.Errorf("method = %q, want POST", got.method)
+	}
+	if want := "/api/v1/repos/owner/repo/statuses/abc123"; got.path != want {
+		t.Errorf("path = %q, want %q", got.path, want)
+	}
+	if state := got.body["state"]; state != "success" {
+		t.Errorf("state = %q, want success", state)
+	}
+	if ctxName := got.body["context"]; ctxName != "tinyforge" {
+		t.Errorf("context = %q, want tinyforge", ctxName)
+	}
+	if target := got.body["target_url"]; target != "https://app.example.com" {
+		t.Errorf("target_url = %q", target)
+	}
+	if desc := got.body["description"]; desc != "deployed" {
+		t.Errorf("description = %q, want deployed", desc)
+	}
+	if got.auth != "token tok123" {
+		t.Errorf("auth = %q, want 'token tok123'", got.auth)
+	}
+}
+
+// TestGitHub_SetCommitStatus_Request verifies the path, JSON body and bearer
+// auth header that the GitHub provider sends for a commit status.
+func TestGitHub_SetCommitStatus_Request(t *testing.T) {
+	var got capturedRequest // named "got" so the builtin cap() is not shadowed
+	srv := newCaptureServer(t, &got)
+	defer srv.Close()
+
+	// Force GHE-style apiBase so we hit the server's path; the github.com
+	// branch hard-codes api.github.com which the SSRF client would block.
+	provider := NewGitHubProvider(srv.URL, "ghp_tok")
+	provider.apiBase = srv.URL + "/api/v3"
+	provider.httpClient = srv.Client()
+
+	if err := provider.SetCommitStatus(context.Background(), "octo", "cat", "deadbeef",
+		CommitStatusFailure, "", "failed"); err != nil {
+		t.Fatalf("SetCommitStatus: %v", err)
+	}
+
+	if want := "/api/v3/repos/octo/cat/statuses/deadbeef"; got.path != want {
+		t.Errorf("path = %q, want %q", got.path, want)
+	}
+	if state := got.body["state"]; state != "failure" {
+		t.Errorf("state = %q, want failure", state)
+	}
+	if ctxName := got.body["context"]; ctxName != "tinyforge" {
+		t.Errorf("context = %q, want tinyforge", ctxName)
+	}
+	if got.auth != "Bearer ghp_tok" {
+		t.Errorf("auth = %q, want 'Bearer ghp_tok'", got.auth)
+	}
+}
+
+// TestGitLab_SetCommitStatus_Request verifies the encoded project path, the
+// query-string status fields and the PRIVATE-TOKEN header that the GitLab
+// provider sends for a commit status.
+func TestGitLab_SetCommitStatus_Request(t *testing.T) {
+	var got capturedRequest // named "got" so the builtin cap() is not shadowed
+	srv := newCaptureServer(t, &got)
+	defer srv.Close()
+
+	provider := NewGitLabProvider(srv.URL, "glpat-xyz")
+	provider.httpClient = srv.Client()
+
+	if err := provider.SetCommitStatus(context.Background(), "grp", "proj", "cafe01",
+		CommitStatusError, "https://app.example.com", "boom"); err != nil {
+		t.Fatalf("SetCommitStatus: %v", err)
+	}
+
+	// GitLab uses the URL-encoded project path + sha in the path, and the
+	// status metadata as query params.
+	if want := "/api/v4/projects/grp%2Fproj/statuses/cafe01"; got.path != want {
+		t.Errorf("path = %q, want %q", got.path, want)
+	}
+	params, err := parseQuery(got.rawQ)
+	if err != nil {
+		t.Fatalf("parse query %q: %v", got.rawQ, err)
+	}
+	if state := params["state"]; state != "failed" { // error -> failed
+		t.Errorf("state = %q, want failed", state)
+	}
+	if name := params["name"]; name != "tinyforge" {
+		t.Errorf("name = %q, want tinyforge", name)
+	}
+	if target := params["target_url"]; target != "https://app.example.com" {
+		t.Errorf("target_url = %q", target)
+	}
+	if desc := params["description"]; desc != "boom" {
+		t.Errorf("description = %q, want boom", desc)
+	}
+	if got.token != "glpat-xyz" {
+		t.Errorf("PRIVATE-TOKEN = %q, want glpat-xyz", got.token)
+	}
+}
+
+// parseQuery splits a raw query string into a key -> first-value map so the
+// assertions can index it directly without pulling in net/url. Keys and
+// values are decoded with urlDecode; when a key repeats, only its first
+// occurrence is kept. The returned error is always nil.
+func parseQuery(raw string) (map[string]string, error) {
+	out := map[string]string{}
+	if raw == "" {
+		return out, nil
+	}
+	for _, pair := range strings.Split(raw, "&") {
+		key, val := pair, ""
+		// Split on the first '=' only; a pair without '=' has an empty value.
+		if eq := strings.IndexByte(pair, '='); eq >= 0 {
+			key, val = pair[:eq], urlDecode(pair[eq+1:])
+		}
+		key = urlDecode(key)
+		if _, seen := out[key]; seen {
+			continue
+		}
+		out[key] = val
+	}
+	return out, nil
+}
+
+// urlDecode decodes one query component, falling back to the input verbatim
+// when it contains malformed percent-encoding.
+func urlDecode(s string) string {
+	if dec, err := decodeQueryComponent(s); err == nil {
+		return dec
+	}
+	return s
+}
+
+// decodeQueryComponent decodes a single application/x-www-form-urlencoded
+// component: '+' becomes a space and %XX becomes the byte it encodes. It is
+// hand-rolled so this test file does not need net/url. On a truncated or
+// non-hex escape it returns the original string together with errPercent.
+func decodeQueryComponent(s string) (string, error) {
+	var out strings.Builder
+	for i := 0; i < len(s); i++ {
+		c := s[i]
+		if c == '+' {
+			out.WriteByte(' ')
+			continue
+		}
+		if c != '%' {
+			out.WriteByte(c)
+			continue
+		}
+		// A '%' must be followed by two more bytes.
+		if len(s)-i <= 2 {
+			return s, errPercent
+		}
+		hi, lo := fromHex(s[i+1]), fromHex(s[i+2])
+		if hi < 0 || lo < 0 {
+			return s, errPercent
+		}
+		out.WriteByte(byte(hi<<4 | lo))
+		i += 2 // skip the two hex digits just consumed
+	}
+	return out.String(), nil
+}
+
+// errPercent is the single error value decodeQueryComponent returns for a
+// truncated or non-hexadecimal percent escape.
+var errPercent = &decodeErr{}
+
+// decodeErr is the stateless error type behind errPercent.
+type decodeErr struct{}
+
+// Error implements the error interface with a fixed message.
+func (*decodeErr) Error() string { return "bad percent-encoding" }
+
+// fromHex returns the numeric value (0-15) of one hexadecimal digit, in
+// either case, or -1 when c is not a hex digit.
+func fromHex(c byte) int {
+	if '0' <= c && c <= '9' {
+		return int(c - '0')
+	}
+	if 'a' <= c && c <= 'f' {
+		return 10 + int(c-'a')
+	}
+	if 'A' <= c && c <= 'F' {
+		return 10 + int(c-'A')
+	}
+	return -1
+}
+
+// TestSetCommitStatus_NonOK_ReturnsError checks that a non-2xx answer from
+// the provider comes back as an error carrying the status code. The deploy
+// hook logs and swallows that error, but the provider method must report it.
+func TestSetCommitStatus_NonOK_ReturnsError(t *testing.T) {
+	reject := func(w http.ResponseWriter, r *http.Request) {
+		w.WriteHeader(http.StatusUnauthorized)
+		_, _ = w.Write([]byte(`{"message":"bad token"}`))
+	}
+	srv := httptest.NewServer(http.HandlerFunc(reject))
+	defer srv.Close()
+
+	fetcher := NewGiteaContentFetcher(srv.URL, "tok")
+	fetcher.httpClient = srv.Client()
+
+	err := fetcher.SetCommitStatus(context.Background(), "o", "r", "sha", CommitStatusPending, "", "x")
+	switch {
+	case err == nil:
+		t.Fatal("expected error on 401, got nil")
+	case !strings.Contains(err.Error(), "401"):
+		t.Errorf("error missing status code: %v", err)
+	}
+}
+
+// TestSetCommitStatus_RespectsContext checks that the call gives up when its
+// context expires (defensive — the deploy hook passes the deploy ctx). The
+// server stalls for 200ms while the caller only allows 10ms.
+func TestSetCommitStatus_RespectsContext(t *testing.T) {
+	slow := func(w http.ResponseWriter, r *http.Request) {
+		time.Sleep(200 * time.Millisecond)
+		w.WriteHeader(http.StatusCreated)
+	}
+	srv := httptest.NewServer(http.HandlerFunc(slow))
+	defer srv.Close()
+
+	fetcher := NewGiteaContentFetcher(srv.URL, "")
+	fetcher.httpClient = srv.Client()
+
+	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Millisecond)
+	defer cancel()
+
+	err := fetcher.SetCommitStatus(ctx, "o", "r", "sha", CommitStatusPending, "", "x")
+	if err == nil {
+		t.Fatal("expected context-deadline error, got nil")
+	}
+}
@@ -6,6 +6,7 @@ import (
 	"fmt"
 	"io"
 	"net/http"
+	"net/url"
 	"os"
 	"path/filepath"
 	"strings"
@@ -294,6 +295,15 @@ func (f *GiteaContentFetcher) DownloadFolder(ctx context.Context, owner, repo, b
 	return nil
 }

+// DownloadFile fetches a single file's raw bytes via Gitea's raw endpoint
+// (also serves Forgejo/Gogs), capped at maxBytes. Returns ErrFileNotFound on
+// a 404 so an absent config file reads as a non-error state.
+//
+// owner, repo and every segment of path are path-escaped and ref is
+// query-escaped, matching SetCommitStatus. Without this, a '#', '?', '&',
+// '%' or space in any of them would truncate the URL or inject extra query
+// parameters. The '/' separators inside path are preserved; a leading '/'
+// is dropped.
+func (f *GiteaContentFetcher) DownloadFile(ctx context.Context, owner, repo, ref, path string, maxBytes int64) ([]byte, error) {
+	// Escape segment by segment so the directory structure survives.
+	segments := strings.Split(strings.TrimPrefix(path, "/"), "/")
+	for i, seg := range segments {
+		segments[i] = url.PathEscape(seg)
+	}
+	fileURL := fmt.Sprintf("%s/api/v1/repos/%s/%s/raw/%s?ref=%s",
+		f.baseURL, url.PathEscape(owner), url.PathEscape(repo),
+		strings.Join(segments, "/"), url.QueryEscape(ref))
+	return getFileBytes(ctx, f.httpClient, fileURL, maxBytes, f.setAuth)
+}
+
 // TestConnection verifies that the repository is accessible.
 func (f *GiteaContentFetcher) TestConnection(ctx context.Context, owner, repo string) error {
 	url := fmt.Sprintf("%s/api/v1/repos/%s/%s", f.baseURL, owner, repo)
@@ -304,6 +314,54 @@ func (f *GiteaContentFetcher) TestConnection(ctx context.Context, owner, repo st
 	return nil
 }

+// SetCommitStatus reports a deploy status on a commit via Gitea's commit-
+// status API (also serves Forgejo/Gogs). The "context" field is fixed to
+// "tinyforge" so repeated deploys update one status row.
+//
+// targetURL and description are optional: an empty value is left out of the
+// JSON body instead of being sent as "", as the GitProvider contract
+// describes and as the GitLab implementation already does. A non-empty
+// description is clamped by truncateDescription. Any transport failure or
+// non-2xx response is returned wrapped as "set commit status: ...".
+func (f *GiteaContentFetcher) SetCommitStatus(ctx context.Context, owner, repo, sha string, status CommitStatus, targetURL, description string) error {
+	payload := map[string]string{
+		"state":   giteaState(status),
+		"context": commitStatusContext,
+	}
+	if targetURL != "" {
+		payload["target_url"] = targetURL
+	}
+	if description != "" {
+		payload["description"] = truncateDescription(description)
+	}
+	body, err := json.Marshal(payload)
+	if err != nil {
+		return fmt.Errorf("marshal status: %w", err)
+	}
+	// Path-escape each identifier so the URL shape matches the other
+	// provider methods and a hostile owner/repo/sha can't break out of
+	// the intended path. The SSRF-safe client guards the host.
+	apiURL := fmt.Sprintf("%s/api/v1/repos/%s/%s/statuses/%s",
+		f.baseURL, url.PathEscape(owner), url.PathEscape(repo), url.PathEscape(sha))
+	if err := postJSON(ctx, f.httpClient, apiURL, body, f.setAuth); err != nil {
+		return fmt.Errorf("set commit status: %w", err)
+	}
+	return nil
+}
+
+// setAuth attaches the Gitea "Authorization: token <value>" header to req.
+// With no token configured the request is left unchanged.
+func (f *GiteaContentFetcher) setAuth(req *http.Request) {
+	if f.token == "" {
+		return
+	}
+	req.Header.Set("Authorization", "token "+f.token)
+}
+
+// giteaState translates a provider-agnostic CommitStatus into the word
+// Gitea's API expects. The vocabularies coincide, so success/failure/error
+// map to themselves; everything else (pending and any unknown value) is
+// reported as "pending".
+func giteaState(status CommitStatus) string {
+	if status == CommitStatusSuccess {
+		return "success"
+	}
+	if status == CommitStatusFailure {
+		return "failure"
+	}
+	if status == CommitStatusError {
+		return "error"
+	}
+	return "pending"
+}
+
 // doGet performs an authenticated GET request and returns the response body.
 func (f *GiteaContentFetcher) doGet(ctx context.Context, url string) ([]byte, error) {
 	req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
@@ -6,6 +6,7 @@ import (
 	"fmt"
 	"io"
 	"net/http"
+	"net/url"
 	"os"
 	"path/filepath"
 	"strings"
@@ -115,6 +116,43 @@ func (g *GitHubProvider) TestConnection(ctx context.Context, owner, repo string)
 	return err
 }

+// SetCommitStatus reports a deploy status on a commit via GitHub's commit-
+// status API (works for github.com and GitHub Enterprise — apiBase already
+// carries the /api/v3 suffix for GHE). The "context" field is fixed to
+// "tinyforge" so repeated deploys update one status row.
+//
+// targetURL and description are optional: an empty value is left out of the
+// JSON body instead of being sent as "", as the GitProvider contract
+// describes and as the GitLab implementation already does. A non-empty
+// description is clamped by truncateDescription. owner, repo and sha are
+// path-escaped. Any transport failure or non-2xx response is returned
+// wrapped as "set commit status: ...".
+func (g *GitHubProvider) SetCommitStatus(ctx context.Context, owner, repo, sha string, status CommitStatus, targetURL, description string) error {
+	payload := map[string]string{
+		"state":   githubState(status),
+		"context": commitStatusContext,
+	}
+	if targetURL != "" {
+		payload["target_url"] = targetURL
+	}
+	if description != "" {
+		payload["description"] = truncateDescription(description)
+	}
+	body, err := json.Marshal(payload)
+	if err != nil {
+		return fmt.Errorf("marshal status: %w", err)
+	}
+	apiURL := fmt.Sprintf("%s/repos/%s/%s/statuses/%s",
+		g.apiBase, url.PathEscape(owner), url.PathEscape(repo), url.PathEscape(sha))
+	if err := postJSON(ctx, g.httpClient, apiURL, body, g.setAuth); err != nil {
+		return fmt.Errorf("set commit status: %w", err)
+	}
+	return nil
+}
+
+// githubState translates a provider-agnostic CommitStatus into the word
+// GitHub's API expects. success/failure/error map to themselves; everything
+// else (pending and any unknown value) is reported as "pending".
+func githubState(status CommitStatus) string {
+	if status == CommitStatusSuccess {
+		return "success"
+	}
+	if status == CommitStatusFailure {
+		return "failure"
+	}
+	if status == CommitStatusError {
+		return "error"
+	}
+	return "pending"
+}
+
 func (g *GitHubProvider) ListBranches(ctx context.Context, owner, repo string) ([]string, error) {
 	var allBranches []string
 	page := 1
@@ -250,6 +288,19 @@ func (g *GitHubProvider) DownloadFolder(ctx context.Context, owner, repo, branch
 	return nil
 }

+// DownloadFile fetches a single file's raw bytes via the GitHub contents API
+// using the raw media type (works for both github.com and GHE), capped at
+// maxBytes. Returns ErrFileNotFound on a 404.
+//
+// owner, repo and every segment of path are path-escaped and ref is
+// query-escaped, matching SetCommitStatus. Without this, a '#', '?', '&',
+// '%' or space in any of them would truncate the URL or inject extra query
+// parameters. The '/' separators inside path are preserved; a leading '/'
+// is dropped.
+func (g *GitHubProvider) DownloadFile(ctx context.Context, owner, repo, ref, path string, maxBytes int64) ([]byte, error) {
+	// Escape segment by segment so the directory structure survives.
+	segments := strings.Split(strings.TrimPrefix(path, "/"), "/")
+	for i, seg := range segments {
+		segments[i] = url.PathEscape(seg)
+	}
+	fileURL := fmt.Sprintf("%s/repos/%s/%s/contents/%s?ref=%s",
+		g.apiBase, url.PathEscape(owner), url.PathEscape(repo),
+		strings.Join(segments, "/"), url.QueryEscape(ref))
+	// Ask for the raw media type so the response body is the file itself
+	// rather than a JSON envelope.
+	auth := func(r *http.Request) {
+		g.setAuth(r)
+		r.Header.Set("Accept", "application/vnd.github.raw+json")
+	}
+	return getFileBytes(ctx, g.httpClient, fileURL, maxBytes, auth)
+}
+
 func (g *GitHubProvider) doGet(ctx context.Context, url string) ([]byte, error) {
 	req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
 	if err != nil {
@@ -95,6 +95,45 @@ func (g *GitLabProvider) TestConnection(ctx context.Context, owner, repo string)
 	return err
 }

+// SetCommitStatus reports a deploy status on a commit through GitLab's
+// commit-status API. GitLab's vocabulary (pending/running/success/failed/
+// canceled) has no failure/error split, so gitlabState maps both to
+// "failed". All status fields travel as query parameters on
+// POST .../statuses/{sha}; target_url and description are only included when
+// non-empty, and the description is clamped by truncateDescription.
+func (g *GitLabProvider) SetCommitStatus(ctx context.Context, owner, repo, sha string, status CommitStatus, targetURL, description string) error {
+	params := url.Values{
+		"state": {gitlabState(status)},
+		"name":  {commitStatusContext},
+	}
+	if targetURL != "" {
+		params.Set("target_url", targetURL)
+	}
+	if description != "" {
+		params.Set("description", truncateDescription(description))
+	}
+
+	endpoint := fmt.Sprintf("%s/projects/%s/statuses/%s?%s",
+		g.apiBase, projectPath(owner, repo), url.PathEscape(sha), params.Encode())
+
+	// There is no JSON body here — postJSON is reused purely for its POST +
+	// 2xx handling, and an empty body is valid for this endpoint.
+	err := postJSON(ctx, g.httpClient, endpoint, nil, g.setAuth)
+	if err != nil {
+		return fmt.Errorf("set commit status: %w", err)
+	}
+	return nil
+}
+
+// gitlabState translates a provider-agnostic CommitStatus into GitLab's
+// vocabulary. GitLab does not distinguish failure from error, so both become
+// "failed"; success maps to itself and everything else (pending and any
+// unknown value) is reported as "pending".
+func gitlabState(status CommitStatus) string {
+	if status == CommitStatusSuccess {
+		return "success"
+	}
+	if status == CommitStatusFailure || status == CommitStatusError {
+		return "failed"
+	}
+	return "pending"
+}
+
 func (g *GitLabProvider) ListBranches(ctx context.Context, owner, repo string) ([]string, error) {
 	var allBranches []string
 	page := 1
@@ -234,6 +273,22 @@ func (g *GitLabProvider) DownloadFolder(ctx context.Context, owner, repo, branch
 	return nil
 }

+// DownloadFile fetches a single file's raw bytes via GitLab's raw endpoint,
+// capped at maxBytes. Returns ErrFileNotFound on a 404. owner/repo/ref are
+// path-escaped. The file path is escaped one segment at a time, so its `/`
+// separators survive while a '#', '?', '%' or space inside a segment can no
+// longer truncate or corrupt the URL. A `..` segment is passed through
+// unchanged and is harmless locally — the bytes are only parsed in memory,
+// never written to disk, so there is no local-traversal sink.
+func (g *GitLabProvider) DownloadFile(ctx context.Context, owner, repo, ref, path string, maxBytes int64) ([]byte, error) {
+	segments := strings.Split(strings.TrimPrefix(path, "/"), "/")
+	for i, seg := range segments {
+		segments[i] = url.PathEscape(seg)
+	}
+	fileURL := fmt.Sprintf("%s/%s/%s/-/raw/%s/%s",
+		g.rawBase,
+		url.PathEscape(owner),
+		url.PathEscape(repo),
+		url.PathEscape(ref),
+		strings.Join(segments, "/"))
+	return getFileBytes(ctx, g.httpClient, fileURL, maxBytes, g.setAuth)
+}
+
 func (g *GitLabProvider) doGet(ctx context.Context, apiURL string) ([]byte, error) {
 	req, err := http.NewRequestWithContext(ctx, http.MethodGet, apiURL, nil)
 	if err != nil {
@@ -1,7 +1,9 @@
 package staticsite

 import (
+	"bytes"
 	"context"
+	"errors"
 	"fmt"
 	"io"
 	"net/http"
@@ -11,6 +13,11 @@ import (
 	"time"
 )

+// ErrFileNotFound is returned by GitProvider.DownloadFile when the file is
+// absent (HTTP 404). Callers use it to distinguish "no file" (a normal,
+// non-error state for GitOps) from a genuine fetch failure. It is a plain
+// sentinel value, so compare with errors.Is.
+var ErrFileNotFound = errors.New("file not found")
+
 // RepoInfo represents a repository returned by the provider's list/search API.
 type RepoInfo struct {
 	Owner       string `json:"owner"`
@@ -21,6 +28,40 @@ type RepoInfo struct {
 	HTMLURL     string `json:"html_url"`
 }

+// CommitStatus is the deploy outcome reported back to the git provider as
+// a commit status. The values are provider-agnostic; each implementation
+// maps them onto its own API vocabulary (Gitea/GitHub use the same four
+// words, GitLab collapses failure/error into "failed"). A value outside the
+// four constants below is reported as "pending" by every provider mapping.
+type CommitStatus string
+
+// The four statuses Tinyforge can report.
+const (
+	CommitStatusPending CommitStatus = "pending"
+	CommitStatusSuccess CommitStatus = "success"
+	CommitStatusFailure CommitStatus = "failure"
+	CommitStatusError   CommitStatus = "error"
+)
+
+// commitStatusContext is the status "context"/"name" key reported to every
+// provider so repeated deploys update the same status row rather than
+// piling up new ones. Gitea and GitHub receive it as the JSON "context"
+// field, GitLab as the "name" query parameter.
+const commitStatusContext = "tinyforge"
+
+// maxCommitStatusDescription caps the human-readable description, measured
+// in characters (runes), so a provider can't reject the request for an
+// over-long field.
+const maxCommitStatusDescription = 140
+
+// truncateDescription clamps a status description to at most
+// maxCommitStatusDescription runes, replacing the tail with a single "…"
+// when it has to cut. Strings at or under the cap are returned unchanged.
+//
+// The cut is made on a rune boundary, never a byte boundary, so a
+// multi-byte description (for example Cyrillic) is never split mid-character
+// into invalid UTF-8. Such a description is also not truncated merely
+// because its byte length exceeds the cap.
+func truncateDescription(s string) string {
+	// Fast path: the byte length is an upper bound on the rune count.
+	if len(s) <= maxCommitStatusDescription {
+		return s
+	}
+	runes := []rune(s)
+	if len(runes) <= maxCommitStatusDescription {
+		return s
+	}
+	// Keep cap-1 runes and spend the last slot on the ellipsis.
+	return string(runes[:maxCommitStatusDescription-1]) + "…"
+}
+
 // GitProvider abstracts Git hosting API operations.
 // Implementations exist for Gitea/Forgejo/Gogs, GitHub, and GitLab.
 type GitProvider interface {
@@ -45,6 +86,18 @@ type GitProvider interface {

 	// DownloadFolder downloads all files from a folder path to a local directory.
 	DownloadFolder(ctx context.Context, owner, repo, branch, folderPath, destDir string) error
+
+	// DownloadFile fetches a single file's bytes from a ref (branch/sha),
+	// capped at maxBytes. Returns ErrFileNotFound on a 404 so callers can
+	// treat an absent file as a non-error state. Used to read a small in-repo
+	// config file (e.g. .tinyforge.yml) without materializing a whole tree.
+	DownloadFile(ctx context.Context, owner, repo, ref, path string, maxBytes int64) ([]byte, error)
+
+	// SetCommitStatus reports a deploy status on a commit. Best-effort;
+	// callers ignore errors beyond logging. targetURL and description are
+	// optional (pass "" to omit); description is truncated to a provider-
+	// safe length by the implementation.
+	SetCommitStatus(ctx context.Context, owner, repo, sha string, status CommitStatus, targetURL, description string) error
 }

 // ProviderType identifies a Git hosting provider.
@@ -135,6 +188,74 @@ func httpGet(ctx context.Context, client *http.Client, url string) (int, error)
 	return resp.StatusCode, nil
 }

+// postJSON is a shared helper for POSTing a JSON body to a provider API
+// endpoint with the caller's auth applied. It accepts any 2xx as success
+// (status APIs return 201 Created on Gitea/GitHub, 200/201 on GitLab) and
+// returns a status-code-only error on non-2xx — it must NOT echo the
+// response body: the deploy hook logs this error best-effort, and a
+// hostile/misconfigured provider could reflect the request's auth token
+// back in its body. The body bytes must already be marshalled by the caller;
+// a nil body sends an empty request body.
+//
+// The response body is never inspected, but a bounded amount of it is read
+// and discarded before closing so the transport can return the keep-alive
+// connection to its pool instead of tearing it down after every call.
+func postJSON(ctx context.Context, client *http.Client, url string, body []byte, authHeader func(r *http.Request)) error {
+	// Upper bound on how much of an ignored response is drained; anything
+	// larger is simply closed (and the connection dropped).
+	const drainLimit = 64 << 10
+
+	req, err := http.NewRequestWithContext(ctx, http.MethodPost, url, bytes.NewReader(body))
+	if err != nil {
+		return fmt.Errorf("create request: %w", err)
+	}
+	if authHeader != nil {
+		authHeader(req)
+	}
+	req.Header.Set("Content-Type", "application/json")
+	req.Header.Set("Accept", "application/json")
+
+	resp, err := client.Do(req)
+	if err != nil {
+		return fmt.Errorf("execute request: %w", err)
+	}
+	defer func() {
+		// Best-effort drain + close; failures here cannot affect the
+		// outcome, which is decided by the status code alone.
+		_, _ = io.Copy(io.Discard, io.LimitReader(resp.Body, drainLimit))
+		_ = resp.Body.Close()
+	}()
+
+	if resp.StatusCode < 200 || resp.StatusCode >= 300 {
+		return fmt.Errorf("unexpected status %d", resp.StatusCode)
+	}
+	return nil
+}
+
+// getFileBytes issues a GET for fileURL (after letting authHeader, if any,
+// decorate the request) and returns the response body, refusing anything
+// larger than maxBytes. A 404 yields ErrFileNotFound; any other non-200
+// status yields an error that names only the status code — the response
+// body is never echoed, because a hostile/misconfigured provider could
+// reflect the request's auth token back in it.
+func getFileBytes(ctx context.Context, client *http.Client, fileURL string, maxBytes int64, authHeader func(r *http.Request)) ([]byte, error) {
+	req, err := http.NewRequestWithContext(ctx, http.MethodGet, fileURL, nil)
+	if err != nil {
+		return nil, fmt.Errorf("create request: %w", err)
+	}
+	if authHeader != nil {
+		authHeader(req)
+	}
+
+	resp, err := client.Do(req)
+	if err != nil {
+		return nil, fmt.Errorf("execute request: %w", err)
+	}
+	defer resp.Body.Close()
+
+	if resp.StatusCode == http.StatusNotFound {
+		return nil, ErrFileNotFound
+	}
+	if resp.StatusCode != http.StatusOK {
+		return nil, fmt.Errorf("unexpected status %d", resp.StatusCode)
+	}
+
+	// Allow exactly one byte beyond the cap: if it arrives, the file is too
+	// big and we reject it instead of handing back a silently clipped copy.
+	capped := io.LimitReader(resp.Body, maxBytes+1)
+	payload, err := io.ReadAll(capped)
+	if err != nil {
+		return nil, fmt.Errorf("read response: %w", err)
+	}
+	if int64(len(payload)) > maxBytes {
+		return nil, fmt.Errorf("file exceeds %d byte cap", maxBytes)
+	}
+	return payload, nil
+}
+
 // downloadFileHTTP is a shared helper for downloading a file from a URL.
 func downloadFileHTTP(ctx context.Context, client *http.Client, url, localPath string, authHeader func(r *http.Request)) error {
 	req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
@@ -50,34 +50,7 @@ func ValidateBaseURL(raw string) error {
 func NewSafeHTTPClient(timeout time.Duration) *http.Client {
 	dialer := &net.Dialer{Timeout: 10 * time.Second, KeepAlive: 30 * time.Second}
 	transport := &http.Transport{
-		DialContext: func(ctx context.Context, network, addr string) (net.Conn, error) {
-			host, port, err := net.SplitHostPort(addr)
-			if err != nil {
-				return nil, err
-			}
-			// If the caller passed a literal IP, skip the DNS round-trip.
-			if literal := net.ParseIP(host); literal != nil {
-				if reason := blockReason(literal); reason != "" {
-					return nil, fmt.Errorf("%w: %s (%s)", ErrBlockedAddress, literal, reason)
-				}
-				return dialer.DialContext(ctx, network, addr)
-			}
-			ips, err := net.DefaultResolver.LookupIPAddr(ctx, host)
-			if err != nil {
-				return nil, err
-			}
-			if len(ips) == 0 {
-				return nil, fmt.Errorf("no addresses for %s", host)
-			}
-			for _, ip := range ips {
-				if reason := blockReason(ip.IP); reason != "" {
-					return nil, fmt.Errorf("%w: %s (%s)", ErrBlockedAddress, ip.IP, reason)
-				}
-			}
-			// Bind to the first resolved IP so a rebind between resolution
-			// and connect cannot redirect the request to a blocked address.
-			return dialer.DialContext(ctx, network, net.JoinHostPort(ips[0].IP.String(), port))
-		},
+		DialContext:         SafeDialContext(dialer),
 		MaxIdleConns:        16,
 		IdleConnTimeout:     30 * time.Second,
 		TLSHandshakeTimeout: 10 * time.Second,
@@ -85,6 +58,43 @@ func NewSafeHTTPClient(timeout time.Duration) *http.Client {
 	return &http.Client{Timeout: timeout, Transport: transport}
 }

+// SafeDialContext returns a DialContext that refuses, at connect time, any
+// address blockReason rejects (loopback, link-local, multicast, cloud-
+// metadata, ...). Hostnames are resolved here and the connection is made to
+// the resolved IP itself, so a DNS rebind between resolution and connect
+// cannot slip through. Exposed so other transports (e.g. the outbound
+// notification client) can apply the same SSRF policy without duplicating
+// it or losing their own connection-pool tuning.
+//
+// Policy is all-or-nothing: if ANY resolved address is blocked the dial
+// fails with ErrBlockedAddress. Once every address has passed, they are
+// dialled in resolver order and the first successful connection wins, so a
+// dual-stack host whose first record is unreachable (e.g. AAAA on an
+// IPv4-only network) still connects. When all attempts fail, the last dial
+// error is returned.
+func SafeDialContext(dialer *net.Dialer) func(ctx context.Context, network, addr string) (net.Conn, error) {
+	return func(ctx context.Context, network, addr string) (net.Conn, error) {
+		host, port, err := net.SplitHostPort(addr)
+		if err != nil {
+			return nil, err
+		}
+		// If the caller passed a literal IP, skip the DNS round-trip.
+		if literal := net.ParseIP(host); literal != nil {
+			if reason := blockReason(literal); reason != "" {
+				return nil, fmt.Errorf("%w: %s (%s)", ErrBlockedAddress, literal, reason)
+			}
+			return dialer.DialContext(ctx, network, addr)
+		}
+		ips, err := net.DefaultResolver.LookupIPAddr(ctx, host)
+		if err != nil {
+			return nil, err
+		}
+		if len(ips) == 0 {
+			return nil, fmt.Errorf("no addresses for %s", host)
+		}
+		// Validate the whole answer before dialling anything.
+		for _, ip := range ips {
+			if reason := blockReason(ip.IP); reason != "" {
+				return nil, fmt.Errorf("%w: %s (%s)", ErrBlockedAddress, ip.IP, reason)
+			}
+		}
+		// Dial the validated IPs directly (never the hostname) so a rebind
+		// between resolution and connect cannot redirect the request to a
+		// blocked address. Fall through to the next IP on failure.
+		var lastErr error
+		for _, ip := range ips {
+			conn, dialErr := dialer.DialContext(ctx, network, net.JoinHostPort(ip.IP.String(), port))
+			if dialErr == nil {
+				return conn, nil
+			}
+			lastErr = dialErr
+			// No point trying further addresses once the caller gave up.
+			if ctx.Err() != nil {
+				break
+			}
+		}
+		return nil, lastErr
+	}
+}
+
 // blockReason returns a human label for why an IP is rejected, or ""
 // if the IP is allowed. Centralized so all callers share the same
 // policy.
@@ -92,6 +102,13 @@ func blockReason(ip net.IP) string {
 	if ip == nil {
 		return "nil address"
 	}
+	// Normalize IPv4-mapped IPv6 (::ffff:x.x.x.x) so the loopback / link-local
+	// classifiers below catch them. net.IP.To4() returns the 4-byte form for
+	// IPv4-mapped addresses; net's IsLoopback already handles this, but pin
+	// the conversion to avoid future surprises if the std-lib semantics drift.
+	if v4 := ip.To4(); v4 != nil {
+		ip = v4
+	}
 	switch {
 	case ip.IsLoopback():
 		return "loopback"
@@ -104,5 +121,22 @@ func blockReason(ip net.IP) string {
 	case ip.IsMulticast():
 		return "multicast"
 	}
+	// Cloud metadata endpoints — AWS / GCP / Azure are covered by the
+	// link-local block (169.254.169.254). The rest must be enumerated.
+	if metadataIPSet[ip.String()] {
+		return "cloud metadata endpoint"
+	}
 	return ""
 }
+
+// metadataIPSet enumerates well-known cloud metadata IPs that are NOT
+// covered by net.IP.IsLinkLocalUnicast. Updating this set is the lightest
+// way to keep up with new providers without changing the policy shape.
+//
+// blockReason looks addresses up by their net.IP.String() form, so every
+// key must be written exactly as String() renders it (dotted quad for IPv4,
+// compressed lower-case form for IPv6).
+var metadataIPSet = map[string]bool{
+	// Alibaba Cloud ECS metadata.
+	"100.100.100.200": true,
+	// Oracle Cloud Infrastructure metadata.
+	"192.0.0.192": true,
+	// AWS IMDS over IPv6 (ULA — not link-local, must be listed).
+	"fd00:ec2::254": true,
+}
@@ -234,17 +234,17 @@ func (c *Collector) sampleAll(ctx context.Context, targets []target) []store.Con
 	found := make([]bool, len(targets))

 	var wg sync.WaitGroup
+loop:
 	for i, t := range targets {
 		// Acquire the semaphore in the parent loop so ctx cancellation
 		// short-circuits the queue rather than spawning goroutines that
-		// block on an unreachable slot.
+		// block on an unreachable slot. The labelled break exits the for
+		// loop directly; a bare `break` inside `select` would only break
+		// the select and let the loop continue.
 		select {
 		case sem <- struct{}{}:
 		case <-ctx.Done():
-			break
-		}
-		if ctx.Err() != nil {
-			break
+			break loop
 		}
 		wg.Add(1)
 		go func(i int, t target) {
@@ -2,6 +2,7 @@ package store

 import (
 	"database/sql"
+	"encoding/json"
 	"errors"
 	"fmt"
 	"strings"
@@ -9,6 +10,22 @@ import (
 	"github.com/google/uuid"
 )

+// validateExtraJSON guards the extra_json column against malformed
+// documents. The codemap (docs/CODEMAPS/container-extra-json.md) says
+// readers tolerate unknown keys, but that only holds when the stored value
+// parses as JSON in the first place: a buggy plugin writing something like
+// `{broken` would silently break every reader, and no schema-level check
+// would catch it. Checking at the store boundary keeps the invariant cheap
+// and obvious.
+//
+// The empty string is accepted (callers substitute "{}" before storing).
+// Any well-formed JSON value passes — the check is syntactic only and does
+// not require the document to be an object.
+func validateExtraJSON(v string) error {
+	if v == "" || json.Valid([]byte(v)) {
+		return nil
+	}
+	return fmt.Errorf("extra_json: not valid JSON (%d bytes)", len(v))
+}
+
 // containerColumns is the canonical column list for `containers` queries.
 // stage_id is populated by the deployer for project containers (so ListProxyRoutes
 // survives stage renames) and left empty for stacks and sites.
@@ -42,6 +59,9 @@ func (s *Store) CreateContainer(c Container) (Container, error) {
 	if c.ExtraJSON == "" {
 		c.ExtraJSON = "{}"
 	}
+	if err := validateExtraJSON(c.ExtraJSON); err != nil {
+		return Container{}, err
+	}

 	_, err := s.db.Exec(
 		`INSERT INTO containers (`+containerColumns+`)
@@ -77,6 +97,9 @@ func (s *Store) UpsertContainer(c Container) error {
 	if c.ExtraJSON == "" {
 		c.ExtraJSON = "{}"
 	}
+	if err := validateExtraJSON(c.ExtraJSON); err != nil {
+		return err
+	}

 	// SQLite UPSERT — INSERT...ON CONFLICT(id) DO UPDATE.
 	_, err := s.db.Exec(
@@ -129,6 +152,9 @@ func (s *Store) ReconcileContainer(c Container) error {
 	if c.ExtraJSON == "" {
 		c.ExtraJSON = "{}"
 	}
+	if err := validateExtraJSON(c.ExtraJSON); err != nil {
+		return err
+	}

 	// extra_json is deliberately NOT in the ON CONFLICT SET clause: the
 	// reconciler can't observe per-face route IDs from Docker, and
@@ -321,6 +347,9 @@ func (s *Store) UpdateContainer(c Container) error {
 	if c.ExtraJSON == "" {
 		c.ExtraJSON = "{}"
 	}
+	if err := validateExtraJSON(c.ExtraJSON); err != nil {
+		return err
+	}
 	result, err := s.db.Exec(
 		`UPDATE containers SET workload_id=?, workload_kind=?, role=?, stage_id=?, container_id=?,
 			image_ref=?, image_tag=?, host=?, state=?, port=?,
@@ -0,0 +1,123 @@
+package store
+
+import (
+	"database/sql"
+	"errors"
+	"fmt"
+)
+
+// InsertDeployHistory writes a single entry into the deploy_history table
+// and returns that entry with the database-assigned ID filled in. StartedAt
+// and FinishedAt are each stamped with Now() when the caller left them empty.
+//
+// The deployer choke point calls this on a best-effort basis: a ledger
+// failure must never turn an otherwise-successful deploy into a failed one.
+// The Error field is expected to hold a fixed, secret-free marker, not the
+// raw source error.
+func (s *Store) InsertDeployHistory(e DeployHistoryEntry) (DeployHistoryEntry, error) {
+	const insertSQL = `INSERT INTO deploy_history
+		   (workload_id, source_kind, reference, reason, triggered_by,
+		    note, outcome, error, started_at, finished_at)
+		 VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`
+
+	// Default whichever timestamps were not supplied.
+	if e.StartedAt == "" {
+		e.StartedAt = Now()
+	}
+	if e.FinishedAt == "" {
+		e.FinishedAt = Now()
+	}
+
+	result, err := s.db.Exec(insertSQL,
+		e.WorkloadID, e.SourceKind, e.Reference, e.Reason, e.TriggeredBy,
+		e.Note, e.Outcome, e.Error, e.StartedAt, e.FinishedAt,
+	)
+	if err != nil {
+		return DeployHistoryEntry{}, fmt.Errorf("insert deploy history: %w", err)
+	}
+
+	newID, err := result.LastInsertId()
+	if err != nil {
+		return DeployHistoryEntry{}, fmt.Errorf("get deploy history id: %w", err)
+	}
+	e.ID = newID
+	return e, nil
+}
+
+// ListDeployHistory returns one page of a workload's ledger, newest-first
+// (ordered by id descending).
+//
+// limit and offset are normally clamped by the API layer, but the store does
+// not rely on that: a non-positive limit falls back to 50 so a bad query
+// cannot return the whole table, a negative offset is treated as 0, and the
+// result slice is pre-sized to at most a small fixed capacity so an
+// unclamped limit cannot force a huge up-front allocation.
+//
+// On success the returned slice is non-nil (possibly empty). On any error,
+// including one that surfaces mid-iteration, the slice is nil.
+func (s *Store) ListDeployHistory(workloadID string, limit, offset int) ([]DeployHistoryEntry, error) {
+	if limit <= 0 {
+		limit = 50
+	}
+	if offset < 0 {
+		offset = 0
+	}
+	rows, err := s.db.Query(
+		`SELECT id, workload_id, source_kind, reference, reason, triggered_by,
+		        note, outcome, error, started_at, finished_at
+		 FROM deploy_history
+		 WHERE workload_id = ?
+		 ORDER BY id DESC
+		 LIMIT ? OFFSET ?`,
+		workloadID, limit, offset,
+	)
+	if err != nil {
+		return nil, fmt.Errorf("query deploy history: %w", err)
+	}
+	defer rows.Close()
+
+	// The capacity is only a hint. Bounding it keeps memory proportional to
+	// the rows actually read rather than to the caller-supplied limit;
+	// append grows the slice if a larger page really exists.
+	const maxPrealloc = 256
+	capHint := limit
+	if capHint > maxPrealloc {
+		capHint = maxPrealloc
+	}
+	out := make([]DeployHistoryEntry, 0, capHint)
+	for rows.Next() {
+		var e DeployHistoryEntry
+		if err := rows.Scan(
+			&e.ID, &e.WorkloadID, &e.SourceKind, &e.Reference, &e.Reason,
+			&e.TriggeredBy, &e.Note, &e.Outcome, &e.Error, &e.StartedAt, &e.FinishedAt,
+		); err != nil {
+			return nil, fmt.Errorf("scan deploy history: %w", err)
+		}
+		out = append(out, e)
+	}
+	// An iteration error means the page is incomplete; do not hand back a
+	// truncated result alongside the error.
+	if err := rows.Err(); err != nil {
+		return nil, fmt.Errorf("iterate deploy history: %w", err)
+	}
+	return out, nil
+}
+
+// GetDeployHistory looks up a single ledger row by its id. A missing row is
+// reported as an error wrapping ErrNotFound; any other scan failure is
+// wrapped and returned as-is. The rollback handler relies on this lookup to
+// resolve the pinned reference it should replay.
+func (s *Store) GetDeployHistory(id int64) (DeployHistoryEntry, error) {
+	const selectSQL = `SELECT id, workload_id, source_kind, reference, reason, triggered_by,
+		        note, outcome, error, started_at, finished_at
+		 FROM deploy_history WHERE id = ?`
+
+	var entry DeployHistoryEntry
+	err := s.db.QueryRow(selectSQL, id).Scan(
+		&entry.ID, &entry.WorkloadID, &entry.SourceKind, &entry.Reference, &entry.Reason,
+		&entry.TriggeredBy, &entry.Note, &entry.Outcome, &entry.Error, &entry.StartedAt, &entry.FinishedAt,
+	)
+	switch {
+	case errors.Is(err, sql.ErrNoRows):
+		return DeployHistoryEntry{}, fmt.Errorf("deploy history %d: %w", id, ErrNotFound)
+	case err != nil:
+		return DeployHistoryEntry{}, fmt.Errorf("scan deploy history: %w", err)
+	}
+	return entry, nil
+}
+
+// PruneDeployHistory trims a workload's ledger down to its newest `keep`
+// rows so that frequently deployed workloads cannot grow the table without
+// bound. "Newest" is decided by id: the DELETE spares the `keep` highest ids
+// for the workload and removes every other row that belongs to it. Intended
+// as a best-effort maintenance step. A non-positive keep falls back to 50
+// rather than wiping the whole history.
+func (s *Store) PruneDeployHistory(workloadID string, keep int) error {
+	const pruneSQL = `DELETE FROM deploy_history
+		 WHERE workload_id = ?
+		   AND id NOT IN (
+		       SELECT id FROM deploy_history
+		       WHERE workload_id = ?
+		       ORDER BY id DESC
+		       LIMIT ?
+		   )`
+
+	if keep <= 0 {
+		keep = 50
+	}
+	if _, err := s.db.Exec(pruneSQL, workloadID, workloadID, keep); err != nil {
+		return fmt.Errorf("prune deploy history: %w", err)
+	}
+	return nil
+}
@@ -0,0 +1,133 @@
+package store
+
+import (
+	"errors"
+	"testing"
+)
+
+// seedWorkload creates a "project" workload whose RefID and Name are both
+// set to name, and aborts the calling test if the insert fails.
+func seedWorkload(t *testing.T, s *Store, name string) Workload {
+	t.Helper()
+	spec := Workload{Name: name, RefID: name, Kind: "project"}
+	created, err := s.CreateWorkload(spec)
+	if err != nil {
+		t.Fatalf("CreateWorkload(%s): %v", name, err)
+	}
+	return created
+}
+
+// TestDeployHistory_InsertListGet covers the basic ledger round trip: insert
+// assigns an id and default timestamps, list returns rows newest-first, and
+// get returns the stored fields for a known id.
+func TestDeployHistory_InsertListGet(t *testing.T) {
+	s := newTestStore(t)
+	w := seedWorkload(t, s, "app1")
+
+	first, err := s.InsertDeployHistory(DeployHistoryEntry{
+		WorkloadID: w.ID, SourceKind: "image", Reference: "v1",
+		Reason: "manual", TriggeredBy: "admin", Outcome: "success",
+	})
+	if err != nil {
+		t.Fatalf("InsertDeployHistory: %v", err)
+	}
+	if first.ID == 0 {
+		t.Fatal("expected non-zero id")
+	}
+	if first.StartedAt == "" || first.FinishedAt == "" {
+		t.Fatal("expected timestamps to be defaulted")
+	}
+
+	// The second row's id anchors the ordering assertion below, so a failed
+	// insert must surface here rather than as a bogus mismatch against id 0.
+	second, err := s.InsertDeployHistory(DeployHistoryEntry{
+		WorkloadID: w.ID, SourceKind: "image", Reference: "v2",
+		Reason: "registry-push", Outcome: "success",
+	})
+	if err != nil {
+		t.Fatalf("InsertDeployHistory (second): %v", err)
+	}
+
+	list, err := s.ListDeployHistory(w.ID, 10, 0)
+	if err != nil {
+		t.Fatalf("ListDeployHistory: %v", err)
+	}
+	if len(list) != 2 {
+		t.Fatalf("expected 2 rows, got %d", len(list))
+	}
+	// Newest-first ordering.
+	if list[0].ID != second.ID || list[1].ID != first.ID {
+		t.Fatalf("expected newest-first ordering, got %d then %d", list[0].ID, list[1].ID)
+	}
+
+	got, err := s.GetDeployHistory(first.ID)
+	if err != nil {
+		t.Fatalf("GetDeployHistory: %v", err)
+	}
+	if got.Reference != "v1" || got.SourceKind != "image" {
+		t.Fatalf("unexpected row: %+v", got)
+	}
+}
+
+// TestDeployHistory_GetNotFound checks that looking up an id that was never
+// inserted yields an error matching ErrNotFound.
+func TestDeployHistory_GetNotFound(t *testing.T) {
+	s := newTestStore(t)
+	if _, err := s.GetDeployHistory(999); !errors.Is(err, ErrNotFound) {
+		t.Fatalf("expected ErrNotFound, got %v", err)
+	}
+}
+
+// TestDeployHistory_ListScopedToWorkload checks that listing one workload's
+// ledger never returns rows that belong to another workload.
+func TestDeployHistory_ListScopedToWorkload(t *testing.T) {
+	s := newTestStore(t)
+	a := seedWorkload(t, s, "a")
+	b := seedWorkload(t, s, "b")
+
+	// Both inserts must succeed: if b's row were silently missing, the
+	// scoping assertion below would pass without testing anything.
+	for _, w := range []Workload{a, b} {
+		if _, err := s.InsertDeployHistory(DeployHistoryEntry{WorkloadID: w.ID, Outcome: "success"}); err != nil {
+			t.Fatalf("InsertDeployHistory(%v): %v", w.ID, err)
+		}
+	}
+
+	list, err := s.ListDeployHistory(a.ID, 10, 0)
+	if err != nil {
+		t.Fatalf("ListDeployHistory: %v", err)
+	}
+	if len(list) != 1 || list[0].WorkloadID != a.ID {
+		t.Fatalf("expected only workload a's rows, got %+v", list)
+	}
+}
+
+// TestDeployHistory_Pagination walks a five-row ledger in pages of two and
+// checks that the pages are the right size, do not overlap, and together
+// reproduce the full history newest-first.
+func TestDeployHistory_Pagination(t *testing.T) {
+	s := newTestStore(t)
+	w := seedWorkload(t, s, "paged")
+
+	// Remember the rows in insertion (oldest-first) order so the pages can
+	// be compared against the exact expected sequence.
+	const total = 5
+	inserted := make([]DeployHistoryEntry, 0, total)
+	for i := 0; i < total; i++ {
+		e, err := s.InsertDeployHistory(DeployHistoryEntry{WorkloadID: w.ID, Outcome: "success"})
+		if err != nil {
+			t.Fatalf("InsertDeployHistory #%d: %v", i, err)
+		}
+		inserted = append(inserted, e)
+	}
+
+	pages := []struct{ offset, wantLen int }{
+		{offset: 0, wantLen: 2},
+		{offset: 2, wantLen: 2},
+		{offset: 4, wantLen: 1}, // short final page
+	}
+	var seen []DeployHistoryEntry
+	for _, p := range pages {
+		page, err := s.ListDeployHistory(w.ID, 2, p.offset)
+		if err != nil {
+			t.Fatalf("ListDeployHistory(offset=%d): %v", p.offset, err)
+		}
+		if len(page) != p.wantLen {
+			t.Fatalf("offset %d: expected %d rows, got %d", p.offset, p.wantLen, len(page))
+		}
+		seen = append(seen, page...)
+	}
+
+	// Concatenated pages must be the insertion order reversed.
+	for i, got := range seen {
+		want := inserted[total-1-i]
+		if got.ID != want.ID {
+			t.Fatalf("position %d: expected id %d, got %d", i, want.ID, got.ID)
+		}
+	}
+}
+
+// TestDeployHistory_Prune inserts ten rows, prunes down to three, and checks
+// that the survivors are exactly the three most recent inserts, listed
+// newest-first.
+func TestDeployHistory_Prune(t *testing.T) {
+	s := newTestStore(t)
+	w := seedWorkload(t, s, "noisy")
+
+	const total, keep = 10, 3
+	inserted := make([]DeployHistoryEntry, 0, total)
+	for i := 0; i < total; i++ {
+		e, err := s.InsertDeployHistory(DeployHistoryEntry{WorkloadID: w.ID, Outcome: "success"})
+		if err != nil {
+			t.Fatalf("InsertDeployHistory #%d: %v", i, err)
+		}
+		inserted = append(inserted, e)
+	}
+
+	if err := s.PruneDeployHistory(w.ID, keep); err != nil {
+		t.Fatalf("PruneDeployHistory: %v", err)
+	}
+
+	list, err := s.ListDeployHistory(w.ID, 100, 0)
+	if err != nil {
+		t.Fatalf("ListDeployHistory: %v", err)
+	}
+	if len(list) != keep {
+		t.Fatalf("expected %d rows after prune, got %d", keep, len(list))
+	}
+	// Prune keeps the newest rows: compare against the tail of the insert
+	// sequence, since an ordering-only check would pass for any subset.
+	for i, got := range list {
+		want := inserted[total-1-i]
+		if got.ID != want.ID {
+			t.Fatalf("position %d: expected surviving id %d, got %d", i, want.ID, got.ID)
+		}
+	}
+}
+
+// TestDeployHistory_CascadeOnWorkloadDelete checks that deleting a workload
+// also removes its deploy history rows.
+func TestDeployHistory_CascadeOnWorkloadDelete(t *testing.T) {
+	s := newTestStore(t)
+	w := seedWorkload(t, s, "doomed")
+
+	entries := []DeployHistoryEntry{
+		{WorkloadID: w.ID, Outcome: "success"},
+		{WorkloadID: w.ID, Outcome: "failure"},
+	}
+	for i, e := range entries {
+		if _, err := s.InsertDeployHistory(e); err != nil {
+			t.Fatalf("InsertDeployHistory #%d: %v", i, err)
+		}
+	}
+
+	// Confirm the rows exist first; otherwise an empty list after the
+	// delete would prove nothing about the cascade.
+	before, err := s.ListDeployHistory(w.ID, 100, 0)
+	if err != nil {
+		t.Fatalf("ListDeployHistory (before delete): %v", err)
+	}
+	if len(before) != len(entries) {
+		t.Fatalf("expected %d rows before delete, got %d", len(entries), len(before))
+	}
+
+	if err := s.DeleteWorkload(w.ID); err != nil {
+		t.Fatalf("DeleteWorkload: %v", err)
+	}
+
+	after, err := s.ListDeployHistory(w.ID, 100, 0)
+	if err != nil {
+		t.Fatalf("ListDeployHistory (after delete): %v", err)
+	}
+	if len(after) != 0 {
+		t.Fatalf("expected history removed with workload, got %d rows", len(after))
+	}
+}
--- a/Show More
+++ b/Show More