ci: align Gitea CI/CD + Docker with the notify-bridge template

Adopt the proven notify-bridge pipeline pattern and fix deployment bugs.

Workflows:
- build.yml: split into parallel frontend / backend / build-image jobs. Run svelte-check + vitest + `go vet ./...` + `go test ./internal/...` (tests were never executed in CI). Use buildx with GHA layer cache and pin Go to 1.25. Quote the `if:` skip-guard so it is valid YAML.
- release.yml: gate the release on a passing test job, then build & push the image, then create the Gitea release LAST so a failed image build can no longer leave an orphan release. Use buildx + registry buildcache, a hard registry login (a push failure now fails the release), and auto-generate a changelog between tags.

Docker:
- Dockerfile: pin golang to 1.25 (matches go.mod's `go 1.25.0`), add BuildKit cache mounts for the module + build caches, an OCI source label, VOLUME /app/data, and a HEALTHCHECK on /readyz.
- docker-compose.yml: fix the healthcheck — it targeted POST-only /api/auth/login (405 -> always unhealthy); now /readyz. Point the image name at the Gitea registry tag with build-from-source as the default.
- .dockerignore: exclude ~95 MB of stray binaries, logs, env, and CI/doc files from the build context.
fix(web): keep the image-ref conflict indicator from reflowing the form
2026-06-21 20:51:13 +03:00 · 2026-06-08 16:13:30 +03:00 · 2026-06-08 16:13:30 +03:00 · 2026-06-08 16:06:37 +03:00 · 2026-06-08 15:39:25 +03:00 · 2026-06-02 14:56:10 +03:00
404 changed files with 82960 additions and 9329 deletions
@@ -1,7 +1,24 @@
 {
  "permissions": {
    "allow": [
-      "Bash(npm install:*)"
+      "Bash(npm install:*)",
+      "Bash(go build:*)",
+      "Bash(npx svelte-check:*)",
+      "Bash(curl -s -o /dev/null -w \"%{http_code}\" http://localhost:8090/api/settings)",
+      "Bash(npm run:*)",
+      "Bash(curl -s -o /dev/null -w \"%{http_code}\" http://localhost:8090/)",
+      "Bash(go vet:*)",
+      "Bash(git checkout:*)",
+      "Bash(git stash:*)",
+      "Bash(echo \"EXIT: $?\")",
+      "Bash(./scripts/dev-server.sh)",
+      "Bash(go doc:*)",
+      "Bash(ls -la /c/Users/Alexei/Documents/docker-watcher/internal/*/)",
+      "Bash(go get:*)"
+    ],
+    "additionalDirectories": [
+      "C:\\Users\\Alexei\\Documents\\docker-watcher\\internal",
+      "C:\\Users\\Alexei\\Documents\\docker-watcher\\web\\src\\routes\\projects\\[id]\\volumes\\[volId]"
    ]
  }
 }
@@ -1,9 +1,47 @@
+# VCS / tooling
 .git
-node_modules
-web/node_modules
-web/build
-data
-*.md
-plans/
-.claude/
+.gitignore
 .dockerignore
+.gitea/
+.github/
+.claude/
+.code-review-graph/
+.vex.toml
+.facts-sync.json
+.facts-suggestions.md
+
+# Node / frontend build artifacts (frontend stage rebuilds web/build)
+node_modules/
+web/node_modules/
+web/build/
+web/.svelte-kit/
+
+# Runtime / local data
+data/
+.env
+.env.*
+*.log
+
+# Compiled binaries (rebuilt inside the image)
+tinyforge
+tinyforge.exe
+tinyforge-server.exe
+server.exe
+docker-watcher
+docker-watcher.exe
+docker-watcher.exe~
+/cli
+/cli.exe
+
+# Build/orchestration files not needed inside the image
+Dockerfile
+docker-compose.yml
+Makefile
+*.example.yaml
+
+# Docs / planning / design (not needed at runtime)
+*.md
+docs/
+plans/
+design-mockups/
+test-data/
@@ -0,0 +1,7 @@
+# Required: protects all credentials stored in the database (AES-256).
+# Generate with: openssl rand -hex 32
+ENCRYPTION_KEY=
+
+# Required on first launch: password for the default admin user.
+# After initial setup, this can be removed.
+ADMIN_PASSWORD=
@@ -0,0 +1,74 @@
+name: Build
+
+on:
+  push:
+    branches: [main]
+  pull_request:
+    branches: [main]
+  workflow_dispatch:
+
+jobs:
+  frontend:
+    # Skip the build on release-bump commits — the tag push runs release.yml.
+    if: "${{ !startsWith(gitea.event.head_commit.message, 'chore: release v') }}"
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - uses: actions/setup-node@v4
+        with:
+          node-version: '20'
+          cache: npm
+          cache-dependency-path: web/package-lock.json
+
+      - name: Install frontend dependencies
+        working-directory: web
+        run: npm ci --no-audit
+
+      - name: Svelte check
+        working-directory: web
+        run: npm run check
+
+      - name: Unit tests (vitest)
+        working-directory: web
+        run: npm run test
+
+      - name: Build frontend
+        working-directory: web
+        run: npm run build
+
+  backend:
+    if: "${{ !startsWith(gitea.event.head_commit.message, 'chore: release v') }}"
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - uses: actions/setup-go@v5
+        with:
+          go-version: '1.25'
+          cache-dependency-path: go.sum
+
+      - name: Vet Go code
+        run: go vet ./...
+
+      - name: Run Go tests
+        run: go test ./internal/... -count=1
+
+  build-image:
+    if: "${{ !startsWith(gitea.event.head_commit.message, 'chore: release v') }}"
+    needs: [frontend, backend]
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+
+      - name: Build Docker image (no push)
+        uses: docker/build-push-action@v5
+        with:
+          context: .
+          push: false
+          tags: tinyforge:ci-${{ gitea.sha }}
+          cache-from: type=gha
+          cache-to: type=gha,mode=max
@@ -0,0 +1,180 @@
+name: Release
+
+on:
+  push:
+    tags:
+      - 'v*'
+
+env:
+  SERVER_HOST: git.dolgolyov-family.by
+  REGISTRY: git.dolgolyov-family.by/alexei.dolgolyov/tiny-forge
+
+jobs:
+  # ───────────────────────────────────────────────────────────────────────
+  # Gate the release on a passing test suite. A tagged release must never
+  # ship code that fails `go vet` / `go test`.
+  # ───────────────────────────────────────────────────────────────────────
+  test:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - uses: actions/setup-go@v5
+        with:
+          go-version: '1.25'
+          cache-dependency-path: go.sum
+
+      - name: Vet Go code
+        run: go vet ./...
+
+      - name: Run Go tests
+        run: go test ./internal/... -count=1
+
+  # ───────────────────────────────────────────────────────────────────────
+  # Build + push the image BEFORE creating the release. If this fails, no
+  # release is created (create-release depends on it) — so we never leave an
+  # orphan release pointing at a tag with no published image.
+  # ───────────────────────────────────────────────────────────────────────
+  build-docker:
+    needs: test
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Compute tags
+        id: meta
+        run: |
+          TAG="${{ gitea.ref_name }}"
+          VERSION="${TAG#v}"
+          echo "tag=$TAG" >> "$GITHUB_OUTPUT"
+          echo "version=$VERSION" >> "$GITHUB_OUTPUT"
+          # Detect pre-release (alpha/beta/rc) — these do NOT get :latest.
+          if echo "$TAG" | grep -qE '(alpha|beta|rc)'; then
+            echo "is_pre=true" >> "$GITHUB_OUTPUT"
+          else
+            echo "is_pre=false" >> "$GITHUB_OUTPUT"
+          fi
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+
+      - name: Login to Gitea Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ${{ env.SERVER_HOST }}
+          username: ${{ gitea.actor }}
+          password: ${{ secrets.DEPLOY_TOKEN }}
+
+      - name: Build and push image
+        uses: docker/build-push-action@v5
+        with:
+          context: .
+          push: true
+          tags: |
+            ${{ env.REGISTRY }}:${{ steps.meta.outputs.tag }}
+            ${{ env.REGISTRY }}:${{ steps.meta.outputs.version }}
+            ${{ env.REGISTRY }}:sha-${{ gitea.sha }}
+            ${{ steps.meta.outputs.is_pre == 'false' && format('{0}:latest', env.REGISTRY) || '' }}
+          cache-from: type=registry,ref=${{ env.REGISTRY }}:buildcache
+          cache-to: type=registry,ref=${{ env.REGISTRY }}:buildcache,mode=max
+
+      - name: Trigger redeploy webhook
+        if: steps.meta.outputs.is_pre == 'false'
+        continue-on-error: true
+        run: |
+          if [ -n "${{ secrets.DOCKER_REDEPLOY_WEBHOOK_URL }}" ]; then
+            echo "Triggering redeploy webhook..."
+            curl -sf -X POST "${{ secrets.DOCKER_REDEPLOY_WEBHOOK_URL }}" \
+              --max-time 30 || echo "::warning::Redeploy webhook failed"
+          else
+            echo "DOCKER_REDEPLOY_WEBHOOK_URL not set — skipping auto-deploy"
+          fi
+
+  # ───────────────────────────────────────────────────────────────────────
+  # Create the Gitea release LAST — body = RELEASE_NOTES.md + auto-changelog.
+  # ───────────────────────────────────────────────────────────────────────
+  create-release:
+    needs: build-docker
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout (full history for changelog)
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - name: Generate changelog
+        id: changelog
+        run: |
+          PREV_TAG=$(git tag --sort=-v:refname | head -2 | tail -1)
+          if [ -z "$PREV_TAG" ] || [ "$PREV_TAG" = "${{ gitea.ref_name }}" ]; then
+            git log --oneline --no-decorate -n 20 > /tmp/changelog.txt
+          else
+            git log --oneline --no-decorate "${PREV_TAG}..HEAD" > /tmp/changelog.txt
+          fi
+
+      - name: Create Gitea release
+        env:
+          DEPLOY_TOKEN: ${{ secrets.DEPLOY_TOKEN }}
+        run: |
+          TAG="${{ gitea.ref_name }}"
+          VERSION="${TAG#v}"
+          BASE_URL="${{ gitea.server_url }}/api/v1/repos/${{ gitea.repository }}"
+
+          # Detect pre-release (alpha/beta/rc)
+          IS_PRE="false"
+          if echo "$TAG" | grep -qE '(alpha|beta|rc)'; then
+            IS_PRE="true"
+          fi
+
+          # Read release notes if present
+          if [ -f RELEASE_NOTES.md ]; then
+            export RELEASE_NOTES=$(cat RELEASE_NOTES.md)
+            echo "Found RELEASE_NOTES.md"
+          else
+            export RELEASE_NOTES=""
+            echo "No RELEASE_NOTES.md found — release body = changelog only"
+          fi
+
+          # Build release body (notes + changelog) via Python to avoid shell
+          # escaping and CLI length limits.
+          export TAG VERSION IS_PRE
+          python3 <<'PY'
+          import json, os
+
+          notes = os.environ.get('RELEASE_NOTES', '')
+          changelog = open('/tmp/changelog.txt').read().strip()
+
+          sections = []
+          if notes.strip():
+              sections.append(notes.strip())
+          if changelog:
+              sections.append('## Changelog\n\n' + changelog)
+
+          payload = {
+              'tag_name': os.environ['TAG'],
+              'name': os.environ['VERSION'],
+              'body': '\n\n'.join(sections),
+              'draft': False,
+              'prerelease': os.environ['IS_PRE'] == 'true',
+          }
+          with open('/tmp/release-payload.json', 'w') as f:
+              json.dump(payload, f)
+          PY
+
+          HTTP=$(curl -s -o /tmp/release-resp.json -w "%{http_code}" \
+            -X POST "$BASE_URL/releases" \
+            -H "Authorization: token $DEPLOY_TOKEN" \
+            -H "Content-Type: application/json" \
+            --data-binary @/tmp/release-payload.json)
+
+          echo "POST /releases → HTTP $HTTP"
+          if [ "$HTTP" = "201" ]; then
+            RELEASE_ID=$(python3 -c "import json; print(json.load(open('/tmp/release-resp.json'))['id'])")
+            echo "Created release $RELEASE_ID for $TAG"
+          elif [ "$HTTP" = "409" ] || grep -q "already exists" /tmp/release-resp.json; then
+            echo "::warning::Release already exists for tag $TAG — reusing"
+          else
+            echo "::error::Failed to create release for $TAG (HTTP $HTTP)"
+            head -c 2000 /tmp/release-resp.json; echo
+            exit 1
+          fi
@@ -1,5 +1,17 @@
 node_modules/
 web/node_modules/
 web/build/
+web/.svelte-kit/
 data/
 .env
+tinyforge
+tinyforge.exe
+/cli
+/cli.exe
+server.exe
+tinyforge-server.exe
+docker-watcher
+docker-watcher.exe
+docker-watcher.exe~
+.claude/worktrees/
+.facts-sync.json
@@ -0,0 +1,57 @@
+# vex configuration — https://github.com/tenatarika/vex
+#
+# Place this file in your project root as .vex.toml
+
+# Glob patterns to exclude from indexing (gitignore syntax, on top of .gitignore)
+# exclude = [
+#     "vendor/**",
+#     "node_modules/**",
+#     "*.generated.go",
+#     "dist/**",
+# ]
+
+# Default output format: "text", "json", or "compact"
+# format = "text"
+
+# Enable semantic embeddings by default (slower indexing, enables meaning-based search)
+semantic = true
+
+# Automatically run `vex update` before search if the index is stale
+auto_update = true
+
+# Embedder used for semantic indexing. Known IDs: minilm-l6-v2 (default).
+# Changing the embedder requires a full reindex.
+# embedder = "minilm-l6-v2"
+
+# Cache directory override. Defaults to the platform cache location.
+#   macOS:   ~/Library/Caches/vex
+#   Linux:   $XDG_CACHE_HOME/vex   (fallback: ~/.cache/vex)
+#   Windows: %LOCALAPPDATA%\vex    (fallback: %USERPROFILE%\AppData\Local\vex)
+# Accepts absolute paths, "~/..." or paths relative to this file (e.g. "./.vex/cache").
+# Can also be overridden per-invocation with --cache-dir or $VEX_CACHE_DIR.
+# cache_dir = "./.vex/cache"
+
+# Store the index inside the project as `<project>/.vex_cache/`. Useful when
+# the cache should travel with the project (e.g. on a moved or renamed
+# directory). vex writes a `.gitignore` inside it so contents are not
+# committed. Overridden by `cache_dir`, `--cache-dir`, or $VEX_CACHE_DIR.
+# local_cache = false
+
+# Thread count for parallel indexing (index/update/watch).
+#   * unset  — 80% of available cores, rounded up (default, leaves headroom)
+#   * 0      — use all cores (explicit opt-in to max throughput)
+#   * N      — exactly N workers
+# Overridable per-invocation with `-j/--jobs` or $VEX_JOBS.
+# jobs = 4
+
+# Build the persistent call-graph section. Disabling falls back to live-scan
+# for `vex callers`/`vex callees` (slower per-query, but saves indexing
+# time on large monorepos). The opt-out is persisted in the manifest so
+# `vex update` does not silently re-add the section.
+# Per-invocation override: `vex index --no-call-graph`.
+# call_graph = true
+
+# Build the BM25 channel. Disabling drops the third RRF channel and keeps
+# only structural (+ semantic). Same persistence rules as `call_graph`.
+# Per-invocation override: `vex index --no-bm25`.
+# bm25 = true
@@ -0,0 +1,24 @@
+# Tinyforge
+
+## Dev Server
+
+Start/restart with: `./scripts/dev-server.sh`
+
+- Runs on port **8090** (avoids 8080 conflict with other local services)
+- Auto-generates `ENCRYPTION_KEY` if not set
+- Default login: `admin` / `admin123`
+- Override port: `LISTEN_ADDR=:9000 ./scripts/dev-server.sh`
+
+## Frontend
+
+- **Boolean inputs use `ToggleSwitch`** (`$lib/components/ToggleSwitch.svelte`) — the slide-style switch is the unified control across the WebUI. Do not introduce raw `<input type="checkbox">` elements; place a `<ToggleSwitch>` next to a label/help block instead.
+- **Confirmations & destructive actions use `ConfirmDialog`** (`$lib/components/ConfirmDialog.svelte`) — never native `window.confirm` / `alert`. For navigation guards (e.g. the unsaved-changes prompt on `/apps/new`), `cancel()` the navigation in `beforeNavigate`, open `ConfirmDialog`, and re-issue the navigation with a bypass flag on confirm. Native `beforeunload` is acceptable only for hard tab-close/reload, where the browser forbids custom UI.
+- **Source-config shape: `$lib/workload/sourceForms.ts`** is the single source of truth (seed/serialize/validity for image/compose/static/dockerfile), consumed by both `/apps/new` and `/apps/[id]`. Don't re-inline seed/serialize logic.
+- **"App" = workload with `source_kind !== ''`.** Triggers are first-class bindings (`workload_trigger_bindings`), NOT on the workload row — never gate app lists/counts on `trigger_kind` (it's empty for plugin workloads). Legacy pre-cutover `kind:project/stack/site` rows have an empty `source_kind` and must be excluded everywhere.
+- **i18n parity is mandatory** — every key in BOTH `web/src/lib/i18n/{en,ru}.json`. A missing key is NOT a build error (`$t` returns the key string), so verify parity manually.
+
+## Build & Test
+
+- Frontend (from `web/`): `npm run check` (svelte-check — expect 0 errors), `npm run build`, `npm run test` (vitest; pure-logic units like `sourceForms.test.ts`).
+- Backend (repo root): `go build ./...`, `go vet ./...`, `go test ./internal/...` (the same vet/test commands CI runs).
+- `./scripts/dev-server.sh` rebuilds the SPA + restarts the Go server on :8090; it kills the prior process, so a previous background dev-server task reporting **exit 1 is expected**, not a failure.
@@ -1,3 +1,4 @@
+# syntax=docker/dockerfile:1.7
 # Stage 1: Build frontend
 FROM node:20-alpine AS frontend-builder

@@ -9,32 +10,40 @@ COPY web/ ./
 RUN npm run build

 # Stage 2: Build Go binary
-FROM golang:1.24-alpine AS backend-builder
+FROM golang:1.25-alpine AS backend-builder

 RUN apk add --no-cache git ca-certificates

 WORKDIR /build
 COPY go.mod go.sum ./
 ENV GOTOOLCHAIN=auto
-RUN go mod download
+# Cache mounts persist the module + build caches across rebuilds (BuildKit).
+RUN --mount=type=cache,target=/go/pkg/mod \
+    go mod download

 COPY . .
 # Copy built frontend into the expected embed location.
 COPY --from=frontend-builder /build/web/build ./web/build

-RUN CGO_ENABLED=0 GOOS=linux go build -ldflags="-s -w" -o /docker-watcher ./cmd/server
+RUN --mount=type=cache,target=/go/pkg/mod \
+    --mount=type=cache,target=/root/.cache/go-build \
+    CGO_ENABLED=0 GOOS=linux go build -ldflags="-s -w" -o /tinyforge ./cmd/server

 # Stage 3: Minimal runtime image
 FROM alpine:3.19

-RUN apk add --no-cache ca-certificates tzdata
+LABEL org.opencontainers.image.source="https://git.dolgolyov-family.by/alexei.dolgolyov/tiny-forge"
+LABEL org.opencontainers.image.title="Tinyforge"
+LABEL org.opencontainers.image.description="Self-hosted Docker deployment + mini-CI platform"
+
+RUN apk add --no-cache ca-certificates tzdata wget

 # Create non-root user.
 RUN addgroup -g 1000 -S app && adduser -u 1000 -S app -G app

 WORKDIR /app

-COPY --from=backend-builder /docker-watcher /app/docker-watcher
+COPY --from=backend-builder /tinyforge /app/tinyforge

 # Data directory for SQLite database.
 RUN mkdir -p /app/data && chown -R app:app /app
@@ -46,4 +55,10 @@ EXPOSE 8080
 ENV DATA_DIR=/app/data
 ENV LISTEN_ADDR=:8080

-ENTRYPOINT ["/app/docker-watcher"]
+VOLUME /app/data
+
+# /readyz is the public readiness probe (pings the DB); /livez is liveness.
+HEALTHCHECK --interval=30s --timeout=5s --retries=3 --start-period=10s \
+    CMD wget --no-verbose --tries=1 --spider http://localhost:8080/readyz || exit 1
+
+ENTRYPOINT ["/app/tinyforge"]
@@ -0,0 +1,39 @@
+# Mini CI Feature Ideas
+
+Feature ideas for evolving the project from a Docker container watcher into a self-hosted mini CI/deployment platform for local developers.
+
+
+## Build Pipeline
+
+- **Build from source** — clone a repo, run a `Dockerfile` or `docker-compose.yml`, build the image locally, then deploy it. Closes the loop from source to running container.
+- **Build logs streaming** — SSE stream of `docker build` output, reusing the existing container logs streaming pattern.
+- **Build cache management** — show Docker layer cache stats, allow selective cache invalidation.
+
+## Git Integration
+
+- **Webhook receiver for push events** — Gitea/GitHub/GitLab sends a push webhook, the platform rebuilds and redeploys automatically. Reuses existing webhook infra from registry polling.
+- **Branch preview environments** — push to `feature/foo`, get a temporary deployment at `foo.preview.local`. Auto-cleanup when the branch is deleted.
+- **Commit status reporting** — push deploy status back to Gitea/GitHub as commit statuses (green check / red X).
+
+## Developer Experience
+
+- **CLI tool** — `tinyforge deploy`, `tinyforge logs`, `tinyforge status` from the terminal for developers who prefer the shell.
+- **`.tinyforge.yml` project config** — a declarative file in the repo root that defines how to build, which env vars to inject, health check paths, proxy rules. One file, full deploy config.
+- **Environment promotion** — one-click promote from `dev` to `staging` to `prod`. Builds on the existing multi-stage project model by adding a promotion workflow.
+
+## Observability
+
+- **Resource dashboard** — CPU/memory/disk per container over time (not just a snapshot). Use Docker stats API with a small ring buffer in SQLite.
+- **Deploy timeline** — a visual timeline showing deploys, rollbacks, and incidents across all projects. "What happened in my infra today?"
+- **Alerting** — container OOM, high CPU, health check failures pushed to Telegram/Discord/email/webhook.
+
+## Multi-Service Orchestration
+
+- **Compose support** — import a `docker-compose.yml` and manage the entire stack as one project. Deploy/rollback the stack atomically.
+- **Service dependency graph** — visualize which services depend on which. Block deploys if a dependency is unhealthy.
+- **Shared secrets** — secrets scoped to a project or global, injected into any service that needs them. Extends the existing encrypted secrets model from static sites.
+
+## Database / Persistence
+
+- **Database snapshots** — one-click snapshot/restore of database volumes before risky deploys.
+- **Automatic pre-deploy backup** — snapshot the data volume before every deploy, auto-prune old snapshots.
@@ -9,7 +9,7 @@ build-frontend:

 # Build the Go binary (embeds web/build/ via go:embed).
 build-backend:
-	go build -o docker-watcher ./cmd/server
+	go build -o tinyforge ./cmd/server

 # Run in development mode with hot reload.
 # Requires air (go install github.com/air-verse/air@latest).
@@ -18,4 +18,4 @@ dev:

 # Clean build artifacts.
 clean:
-	rm -rf web/build web/node_modules/.vite docker-watcher
+	rm -rf web/build web/node_modules/.vite tinyforge
@@ -1,512 +0,0 @@
-# Docker Watcher — Implementation Plan
-
-## Overview
-
-A self-hosted tool that automates Docker container deployment with Nginx Proxy Manager integration. Detects new images from Gitea/GitHub registries, deploys containers, and configures reverse proxy routing — all from a web dashboard. Supports multiple simultaneous versions of the same project. DNS is handled by a Cloudflare wildcard record (`*.dolgolyov-family.by`) — no per-project DNS management needed.
-
-## Architecture
-
-```text
-Gitea CI → pushes image → Registry
-     │                        ↓
-     │              Docker Watcher (Go)
-     │              ├── Secret webhook URL (instant)
-     │              └── Registry poller (fallback)
-     │                        ↓
-     └── or: POST /api/webhook/<secret-uuid>
-              with {"image": "registry/org/app:tag"}
-                              ↓
-                  Known project? ──────────────────┐
-                  ↓ yes                             ↓ no
-           Match tag → stage              Auto-create project
-                  ↓                       with defaults from
-           auto_deploy?                   image inspection
-           ↓ yes       ↓ no              (EXPOSE, labels)
-     Deploy now    Notify, wait                ↓
-           ↓       for UI trigger         Deploy with defaults
-           ↓           ↓
-     Pull image
-     Start new container on shared network
-     (old container stays if multi-instance)
-           ↓
-     NPM API: create proxy host (if first deploy for this subdomain)
-     (DNS already handled by Cloudflare wildcard *.domain)
-           ↓
-     Health check
-     → success: done, notify
-     → failure: remove new container, alert
-```
-
-## Decisions
-
-| Decision | Choice | Rationale |
-|----------|--------|-----------|
-| Language | Go | Single binary, excellent Docker SDK, low resource usage |
-| Web UI | SvelteKit (embedded in Go binary) | User's existing stack, lightweight |
-| Reverse proxy | Nginx Proxy Manager | Already deployed, API available |
-| DNS | Cloudflare wildcard `*.{domain}` | One-time setup, all subdomains auto-resolve |
-| Routing | Subdomain-based | No sub-path issues with SPAs |
-| Image detection | Secret webhook URL + polling | Webhook for speed, polling as fallback |
-| Config storage | SQLite (YAML for initial seed only) | Editable via UI, no manual file editing |
-| Credentials | Encrypted in SQLite (AES-256) | Single ENCRYPTION_KEY env var |
-| Webhook auth | Secret UUID in URL | No tokens needed, simple CI integration |
-| Multi-instance | Yes | Multiple tags of same project can run simultaneously |
-| Deployment target | Same TrueNAS host | Docker socket mounted |
-
-## Subdomain Convention
-
-| Type | Pattern | Example |
-|------|---------|---------|
-| Dev (default) | `stage-dev-{project}.{domain}` | `stage-dev-web-app-launcher.dolgolyov-family.by` |
-| Dev (specific tag) | `stage-dev-{project}-{tag}.{domain}` | `stage-dev-web-app-launcher-abc123.dolgolyov-family.by` |
-| Release (default) | `stage-rel-{project}.{domain}` | `stage-rel-web-app-launcher.dolgolyov-family.by` |
-| Release (specific tag) | `stage-rel-{project}-{tag}.{domain}` | `stage-rel-web-app-launcher-v1-2-0.dolgolyov-family.by` |
-| Production | `{custom}.{domain}` | `launcher.dolgolyov-family.by` |
-
-Tags are sanitized for DNS: dots → dashes, lowercase, truncated to fit DNS limits.
-
-## Configuration
-
-### First Launch
-
-```text
-YAML seed file exists? → import into SQLite → done
-No YAML?               → empty state, configure everything via UI
-```
-
-After import, all configuration lives in SQLite and is managed via the Web UI.
-YAML is never read again unless user clicks "Re-import config" or "Export config".
-
-### Seed Config Format (optional)
-
-```yaml
-global:
-  domain: dolgolyov-family.by
-  server_ip: 93.84.96.191
-  network: staging-net
-  subdomain_pattern: "stage-{stage}-{project}"
-  notification_url: https://notify.dolgolyov-family.by/webhook
-  npm:
-    url: http://npm:81
-    email: docker-watcher@dolgolyov-family.by
-    password: "npm-password-here"
-registries:
-  gitea:
-    url: https://git.dolgolyov-family.by
-    type: gitea
-    token: "gitea-token-here"
-
-projects:
-  web-app-launcher:
-    registry: gitea
-    image: git.dolgolyov-family.by/alexei/web-app-launcher
-    port: 3000
-    healthcheck: /api/health
-    env:
-      NODE_ENV: production
-    stages:
-      dev:
-        tag_pattern: "dev-*"
-        auto_deploy: true
-        max_instances: 5
-      rel:
-        tag_pattern: "v*"
-        auto_deploy: false
-        max_instances: 2
-      prod:
-        tag_pattern: "v*"
-        auto_deploy: false
-        confirm: true
-        promote_from: rel
-        max_instances: 2
-        subdomain: launcher
-```
-
-## Web UI Sections
-
-### Dashboard
-
-Overview of all projects with their running instances:
- Project name, running instance count, latest activity
- Quick status indicators (healthy / stopped / failing)
- "Quick Deploy" button for ad-hoc image deployment
-
-### Project Detail
-
-Per-project view with stages and instances:
- Each stage shows all running instances with: tag, status, URL, uptime
- Controls per instance: Stop, Start, Restart, Remove
- "Deploy new version" dropdown — lists available tags from registry
- Deploy history log
-
-### Quick Deploy
-
-For deploying images not yet configured as projects:
-1. Paste image URL (e.g., `git.dolgolyov-family.by/alexei/my-app:dev-abc123`)
-2. Docker Watcher pulls and inspects image (EXPOSE port, HEALTHCHECK, labels)
-3. Pre-fills form with sensible defaults (project name, port, stage, subdomain)
-4. User reviews, tweaks, clicks "Deploy"
-5. Project is auto-created in the DB for future use
-
-### Settings
-
- **Registries** — add/edit/delete registries, test connection
- **Credentials** — NPM, registry tokens (encrypted, shown as `••••••••`)
- **Global** — domain, server IP, Docker network, subdomain pattern, polling interval
- **Notifications** — webhook URL
- **Webhook URL** — shows the secret deploy URL, "Regenerate" button
-
-### Projects Config
-
- Add / edit / delete projects via UI
- Configure image, port, healthcheck, env vars, volumes per project
- Add / remove stages, set tag patterns, auto-deploy, subdomain overrides, max instances
-
-## Project Structure
-
-```text
-docker-watcher/
-├── cmd/
-│   └── server/
-│       └── main.go                 # Entry point
-├── internal/
-│   ├── config/
-│   │   ├── config.go               # YAML seed parsing
-│   │   └── config_test.go
-│   ├── docker/
-│   │   ├── client.go               # Docker Engine API wrapper
-│   │   ├── container.go            # Create, start, stop, remove, inspect
-│   │   └── client_test.go
-│   ├── npm/
-│   │   ├── client.go               # NPM API client (auth, CRUD proxy hosts)
-│   │   └── client_test.go
-│   ├── registry/
-│   │   ├── registry.go             # Interface
-│   │   ├── gitea.go                # Gitea registry implementation
-│   │   ├── github.go               # GitHub Container Registry (future)
-│   │   ├── poller.go               # Periodic tag polling
-│   │   └── registry_test.go
-│   ├── deployer/
-│   │   ├── deployer.go             # Orchestrates full deploy flow
-│   │   ├── rollback.go             # Rollback on failure
-│   │   └── deployer_test.go
-│   ├── health/
-│   │   ├── checker.go              # HTTP health checks with retries
-│   │   └── checker_test.go
-│   ├── notify/
-│   │   ├── notifier.go             # Webhook notifications
-│   │   └── notifier_test.go
-│   ├── webhook/
-│   │   ├── handler.go              # Secret URL webhook receiver
-│   │   └── handler_test.go
-│   ├── api/
-│   │   ├── router.go               # HTTP API for web UI
-│   │   ├── projects.go             # Project CRUD endpoints
-│   │   ├── registries.go           # Registry CRUD endpoints
-│   │   ├── settings.go             # Global settings endpoints
-│   │   ├── instances.go            # Instance start/stop/restart/remove
-│   │   ├── deploys.go              # Deploy + quick deploy endpoints
-│   │   └── middleware.go           # Auth, logging, CORS
-│   ├── store/
-│   │   ├── store.go                # SQLite schema, migrations
-│   │   ├── projects.go             # Project queries
-│   │   ├── instances.go            # Instance queries
-│   │   ├── registries.go           # Registry queries
-│   │   ├── settings.go             # Settings queries
-│   │   ├── deploys.go              # Deploy history queries
-│   │   └── store_test.go
-│   └── crypto/
-│       └── crypto.go               # AES-256 encrypt/decrypt for credentials
-├── web/                            # SvelteKit frontend
-│   ├── src/
-│   │   ├── routes/
-│   │   │   ├── +page.svelte        # Dashboard
-│   │   │   ├── projects/
-│   │   │   │   ├── +page.svelte    # Projects list + add
-│   │   │   │   └── [id]/
-│   │   │   │       └── +page.svelte # Project detail + instances
-│   │   │   ├── deploy/
-│   │   │   │   └── +page.svelte    # Quick deploy
-│   │   │   └── settings/
-│   │   │       ├── +page.svelte    # Global settings
-│   │   │       ├── registries/
-│   │   │       │   └── +page.svelte
-│   │   │       └── credentials/
-│   │   │           └── +page.svelte
-│   │   ├── lib/
-│   │   │   ├── api.ts              # API client
-│   │   │   ├── types.ts            # Shared types
-│   │   │   └── components/         # Reusable UI components
-│   │   └── app.html
-│   ├── package.json
-│   ├── svelte.config.js
-│   └── vite.config.ts
-├── docker-watcher.example.yaml     # Example seed config
-├── Dockerfile
-├── docker-compose.yml
-├── go.mod
-└── go.sum
-```
-
-## Implementation Phases
-
-### Phase 1: Foundation ✅
-
-Core infrastructure — store, config import, Docker client, NPM client.
-
-1. **Go project init** — go.mod, directory structure, dependencies
-2. **SQLite store** — schema, migrations, CRUD for projects/registries/settings/instances/deploys
-3. **Crypto** — AES-256 encrypt/decrypt for credential storage
-4. **Config seed loader** — parse YAML, import into SQLite on first launch
-5. **Docker client** — connect to socket, pull image, inspect image, list/start/stop/remove containers, manage networks
-6. **NPM client** — authenticate (JWT), create/update/delete proxy hosts, list existing hosts
-
-### Phase 2: Detection & Deployment (Registry & Poller ✅, Webhook ✅, Deployer ✅)
-
-The core loop — detecting new images and deploying them.
-
-8. **Registry client** ✅ — Gitea registry API: list tags for an image, detect new tags
-9. **Poller** ✅ — periodic check for new tags matching configured patterns
-10. **Secret webhook handler** ✅ — UUID-based URL, receives image push notifications, auto-creates unknown projects
-11. **Deployer** ✅ — orchestrate: pull → start container → NPM proxy → health check
-12. **Multi-instance support** ✅ — multiple versions per project/stage, tag-based subdomains, max_instances limit
-13. **Health checker** ✅ — HTTP GET with retries and timeout (3 retries, 5s interval, 10s timeout)
-14. **Rollback** ✅ — on health check failure: remove new container, clean up NPM, alert
-15. **Notifications** ✅ — send webhook on deploy success/failure (fire-and-forget)
-
-### Phase 3: Web UI
-
-Full dashboard for visibility, manual control, and configuration.
-
-16. **API layer** — REST endpoints for all CRUD operations + deploy/control actions
-17. **SvelteKit dashboard** — project overview, instance status, quick status indicators
-18. **Project detail view** — stages, instances, controls (stop/start/restart/remove), deploy history
-19. **Quick Deploy page** — paste image URL, auto-inspect, pre-fill form, one-click deploy
-20. **Settings pages** — registries, credentials, global settings, webhook URL management
-21. **Project config pages** — add/edit/delete projects and stages via UI
-22. **Embed in Go** ✅ — build SvelteKit to static, embed with `go:embed`, serve from Go
-23. **Real-time updates** ✅ — SSE for deploy progress and instance status changes
-
-### Phase 4: Volumes & Environment (Phase 13) -- COMPLETED
-
-Persistent storage and app-specific configuration for deployed containers.
-
-24. **Environment variables per project** — key/value pairs stored in SQLite, sensitive values encrypted
-25. **Per-stage env overrides** — e.g., `NODE_ENV=development` for dev, `NODE_ENV=production` for prod
-26. **Volume mounts per project** — configurable source/target paths with shared/isolated modes
-27. **Shared volumes** — all instances of a project mount the same host path (for stateless apps or shared uploads)
-28. **Isolated volumes** — each instance gets its own subdirectory: `{source}/{stage}-{tag}/` → `{target}` (for stateful apps with local DBs/files)
-29. **UI for volumes & env** — project settings page with key/value editor, volume list, shared/isolated toggle, per-stage override support
-
-#### Phase 13 Handoff Notes
-
- New tables: `stage_env` (id, stage_id, key, value, encrypted, timestamps), `volumes` (id, project_id, source, target, mode, timestamps)
- `stage_env` has UNIQUE(stage_id, key) constraint to prevent duplicate keys per stage
- Volume mode is either "shared" or "isolated"; default is "shared"
- Encrypted env values are encrypted with `crypto.Encrypt` before storage and decrypted at deploy time
- API masks encrypted env values as "••••••••" in responses
- Env merge order in deployer: project-level JSON `env` field parsed first, then stage-level `stage_env` records overlay (stage wins on key conflict)
- `computeVolumeMounts` appends `/{stage}-{tag}/` to source for isolated volumes
- Docker `ContainerConfig` now has `Mounts []mount.Mount` field, passed to `HostConfig.Mounts`
- Both `executeDeploy` and `blueGreenDeploy` updated to use `mergeEnvVars` and `computeVolumeMounts`
- API routes: GET/POST `/api/projects/{id}/stages/{stage}/env`, PUT/DELETE `.../env/{envId}`, GET/POST `/api/projects/{id}/volumes`, PUT/DELETE `.../volumes/{volId}`
- Frontend pages: `/projects/[id]/env` (per-stage env editor with inherited/overridden indicators), `/projects/[id]/volumes` (volume editor with shared/isolated toggle)
- Project detail page now has navigation links to env and volumes pages
-
-Volume config per project:
-```yaml
-env:
-  NODE_ENV: production
-  DATABASE_URL: postgres://db:5432/myapp    # shared external DB
-  SECRET_KEY: "..."                          # encrypted in SQLite
-volumes:
-  - source: /data/my-app/uploads
-    target: /app/uploads
-    mode: shared        # all instances share this path
-  - source: /data/my-app/data
-    target: /app/data
-    mode: isolated      # auto-appends /{stage}-{tag}/ to source
-```
-
-Stage-level env overrides:
-```yaml
-stages:
-  dev:
-    env:
-      NODE_ENV: development       # overrides project-level
-      DATABASE_URL: postgres://db:5432/myapp_dev
-  prod:
-    env:
-      NODE_ENV: production        # uses project-level default
-```
-
-### Phase 5: Hardening (Phase 12) -- COMPLETED
-
-30. **Blue-green deploys** -- start new, health check, swap, stop old (zero downtime)
-31. **Promote flow** -- enforce `promote_from` for production deploys
-32. **Auth on dashboard** -- two modes, configurable via settings:
-    - **Local auth** -- username/password stored in SQLite (bcrypt hashed), JWT session tokens
-    - **OAuth2 / OpenID Connect** -- integration with any OIDC provider (configurable client ID/secret/discovery URL)
-33. **Graceful shutdown** -- drain in-progress deploys on SIGTERM, close DB, stop poller
-34. **Structured logging** -- JSON logs via `log/slog` with deploy context
-35. **Config export** -- download current SQLite state as YAML
-36. **Dockerfile** -- multi-stage build (Node.js 20 + Go 1.23 build, alpine runtime)
-37. **docker-compose.yml** -- production-ready compose with volumes, network, env
-38. **Auth middleware** -- protects all /api/* routes except webhook and auth endpoints
-39. **Auth settings UI** -- settings page to toggle auth mode, configure OIDC, manage users
-40. **Login page** -- username/password form with OIDC SSO option
-41. **Final wiring** -- all services properly initialized and shut down in main.go
-
-#### Phase 12 Handoff Notes
-
- Auth: `auth.LocalAuth` handles JWT generation/validation, `auth.OIDCProvider` handles OIDC flow
- Default admin user created on first launch (ADMIN_PASSWORD env var, default: "admin")
- JWT secret derived from ENCRYPTION_KEY via HMAC-SHA256
- Blue-green: triggered automatically when stage has `max_instances=1`; otherwise standard deploy
- Promote: validated in `TriggerDeploy` before deploy begins
- Graceful shutdown: `deployer.Drain()` waits for in-progress deploys; poller stopped; HTTP server drained; DB closed
- Structured logging: all API, deployer, and main.go use `log/slog` JSON handler
- New dependencies: `github.com/golang-jwt/jwt/v5`, `golang.org/x/crypto/bcrypt`, `github.com/coreos/go-oidc/v3`, `golang.org/x/oauth2`
- New tables: `users` (id, username, password_hash, email, role, timestamps), `auth_settings` (single-row: auth_mode, OIDC config)
- Auth middleware applied to all `/api/*` routes except `/api/auth/login`, `/api/auth/oidc/*`, `/api/webhook/*`, `/api/config/export`
- Frontend: token stored in `localStorage`, sent as `Authorization: Bearer` header
- Run `go mod tidy` after checkout to resolve transitive dependencies
-
-## Key Dependencies (Go)
-
- `github.com/docker/docker` — Docker Engine API
- `github.com/go-chi/chi` or `net/http` — HTTP routing
- `gopkg.in/yaml.v3` — YAML seed config
- `modernc.org/sqlite` — SQLite (CGo-free)
- `github.com/robfig/cron` — Polling scheduler
- `github.com/google/uuid` — Webhook secret URL generation
-
-## Docker Compose (self-deployment)
-
-```yaml
-services:
-  docker-watcher:
-    image: docker-watcher:latest
-    container_name: docker-watcher
-    restart: unless-stopped
-    ports:
-      - "8080:8080"
-    volumes:
-      - /var/run/docker.sock:/var/run/docker.sock
-      - ./docker-watcher.yaml:/app/seed.yaml:ro  # optional, first launch only
-      - ./data:/app/data                          # SQLite DB
-    environment:
-      - ENCRYPTION_KEY=${ENCRYPTION_KEY}           # protects all credentials in DB
-    networks:
-      - staging-net
-
-networks:
-  staging-net:
-    external: true
-```
-
-## API Endpoints
-
-```text
-# Projects
-GET    /api/projects                                    — list all projects with instance counts
-POST   /api/projects                                    — create project
-GET    /api/projects/:id                                — project detail + stages + instances
-PUT    /api/projects/:id                                — update project config
-DELETE /api/projects/:id                                — delete project + all instances
-
-# Stages
-POST   /api/projects/:id/stages                         — add stage to project
-PUT    /api/projects/:id/stages/:stage                   — update stage config
-DELETE /api/projects/:id/stages/:stage                   — delete stage + its instances
-
-# Stage Env Overrides
-GET    /api/projects/:id/stages/:stage/env                 — list stage env vars (secrets masked)
-POST   /api/projects/:id/stages/:stage/env                 — create stage env var
-PUT    /api/projects/:id/stages/:stage/env/:envId          — update stage env var
-DELETE /api/projects/:id/stages/:stage/env/:envId          — delete stage env var
-
-# Project Volumes
-GET    /api/projects/:id/volumes                           — list project volumes
-POST   /api/projects/:id/volumes                           — create project volume
-PUT    /api/projects/:id/volumes/:volId                    — update project volume
-DELETE /api/projects/:id/volumes/:volId                    — delete project volume
-
-# Instances (running containers)
-GET    /api/projects/:id/stages/:stage/instances          — list instances for stage
-POST   /api/projects/:id/stages/:stage/instances          — deploy new instance (pick tag)
-DELETE /api/projects/:id/stages/:stage/instances/:iid     — remove instance (container + NPM proxy)
-POST   /api/projects/:id/stages/:stage/instances/:iid/stop    — stop container
-POST   /api/projects/:id/stages/:stage/instances/:iid/start   — start stopped container
-POST   /api/projects/:id/stages/:stage/instances/:iid/restart — restart container
-
-# Quick Deploy
-POST   /api/deploy/inspect                               — pull + inspect image, return defaults
-POST   /api/deploy/quick                                 — create project + deploy in one step
-
-# Registry
-GET    /api/registries                                   — list registries
-POST   /api/registries                                   — add registry
-PUT    /api/registries/:id                               — update registry
-DELETE /api/registries/:id                               — delete registry
-POST   /api/registries/:id/test                          — test connection
-GET    /api/registries/:id/tags/:image                   — list available tags
-
-# Settings
-GET    /api/settings                                     — get global settings
-PUT    /api/settings                                     — update global settings
-GET    /api/settings/webhook-url                          — get secret webhook URL
-POST   /api/settings/webhook-url/regenerate               — regenerate webhook URL
-
-# Deploy history
-GET    /api/deploys                                      — recent deploys across all projects
-GET    /api/deploys/:id/logs                             — deploy log stream (SSE)
-
-# Webhook (secret URL — no auth needed)
-POST   /api/webhook/:secret-uuid                         — receive image push notification
-```
-
-## User Workflows
-
-### Auto-Deploy (zero effort)
-
-```text
-Push code → CI builds → pushes tag → Docker Watcher detects →
-auto_deploy: true → deployed → notification with URL
-```
-
-### Manual Deploy via UI (one click)
-
-```text
-Open dashboard → project → stage → "Deploy new version" →
-pick tag from dropdown → click Deploy
-```
-
-### Quick Deploy (new project, paste image URL)
-
-```text
-Open dashboard → "Quick Deploy" → paste image URL →
-review auto-filled defaults → click Deploy →
-project auto-created + deployed
-```
-
-### Deploy via CI Webhook (zero effort after CI setup)
-
-```text
-# In .gitea/workflows/build.yml
- name: Notify Docker Watcher
-  run: |
-    curl -X POST https://watcher.dolgolyov-family.by/api/webhook/d8f2a1e9-... \
-      -d '{"image": "git.dolgolyov-family.by/alexei/my-app:dev-${{ github.sha }}"}'
-```
-
-Known project → deploys per stage config.
-Unknown project → auto-creates with defaults from image inspection, deploys.
-
-### Production Deploy (two clicks)
-
-```text
-Open dashboard → project → prod stage → "Deploy new version" →
-dropdown shows only tags running in "rel" stage (promote_from) →
-pick tag → confirmation dialog → Deploy
-```
@@ -0,0 +1,184 @@
+# Tinyforge
+
+Self-hosted deployment platform with a web dashboard. Deploy Docker containers from registries with zero-downtime blue-green strategy, host static sites and Deno APIs directly from Git repositories, and manage reverse proxy configuration — all from a single binary.
+
+## Features
+
+### Container Deployments
+
+- **Registry polling** and **webhook receiver** for automatic deployments
+- **Blue-green deploys** with health checks and automatic rollback
+- **Multi-stage projects** (dev, staging, prod) with tag pattern matching
+- **Real-time deploy logs** via SSE streaming
+
+### Branch Preview Environments
+
+Get an isolated, throwaway deploy for every feature branch:
+
+- Add a **branch pattern** (e.g. `feat/*`) to a workload's **git trigger** (Triggers panel → git trigger → *Branch pattern*).
+- Pushing to any branch matching the pattern deploys an **isolated per-branch preview** — a child workload that inherits the source config, served at a **slug-prefixed subdomain** (`feat-login-app.example.com`) so previews never collide with each other or the main deploy.
+- Previews are **automatically torn down** when the branch is deleted upstream.
+- Manage live previews from the app's **Preview environments** panel (`/apps/[id]`): open each branch's URL or tear it down manually. A torn-down preview is recreated on the next push to its branch.
+
+### Static Sites
+
+Deploy static sites and Deno-powered APIs directly from Git repositories:
+
+- **Git providers**: Gitea/Forgejo, GitHub, and GitLab (public and private repos)
+- **Static mode**: Serves HTML/CSS/JS via nginx container
+- **Deno mode**: Full-stack with TypeScript API backend + static frontend — API routes are auto-discovered from `/api` folder using a naming convention (`API_get_users`, `API_post_items`, etc.)
+- **Markdown rendering**: Optionally converts `.md` files to styled HTML
+- **Branch & folder picker**: Select any branch and subfolder as the deployment root
+- **Auto-sync**: Trigger redeployment on push or tag events, or manually
+- **Per-site secrets**: Encrypted environment variables injected at runtime
+
+### Infrastructure
+
+- **NPM / Traefik integration** for automatic reverse proxy and SSL configuration
+- **Cloudflare DNS** sync for automatic DNS record management
+- **Volume management**: Create, browse, upload, and download Docker volumes
+- **Stale container cleanup**: Detect and remove unused containers
+- **Image management**: List and prune unused Docker images
+- **Database backups**: Scheduled and manual backups with one-click restore
+- **Config export/import**: YAML-based seed configuration for reproducible setups
+
+### Auth & Security
+
+- **Local auth** with bcrypt password hashing
+- **OIDC/SSO** support for single sign-on
+- **Encrypted credential storage** (AES-256-GCM)
+- **Role-based access**: Admin and user roles
+
+## Prerequisites
+
+- Docker with Docker Compose
+- A Docker network for deployed containers (e.g. `staging-net`)
+- Nginx Proxy Manager (optional, for automatic proxy configuration)
+- Wildcard DNS pointing to your server (for subdomain-based routing)
+
+## Quick Start
+
+1. **Create the Docker network** (containers will be attached to this):
+
+   ```bash
+   docker network create staging-net
+   ```
+
+2. **Create a `.env` file** (see `.env.example`):
+
+   ```bash
+   cp .env.example .env
+   # Edit .env and set ENCRYPTION_KEY and ADMIN_PASSWORD
+   # Generate a key: openssl rand -hex 32
+   ```
+
+3. **Start Tinyforge**:
+
+   ```bash
+   docker compose up -d
+   ```
+
+4. **Open the dashboard** at `http://localhost:8080` and log in with `admin` / your `ADMIN_PASSWORD`.
+
+## Configuration
+
+### Environment Variables
+
+| Variable           | Required            | Description                                                                      |
+| ------------------ | ------------------- | -------------------------------------------------------------------------------- |
+| `ENCRYPTION_KEY`   | Yes                 | AES-256 key for encrypting stored credentials. Use `openssl rand -hex 32`        |
+| `ADMIN_PASSWORD`   | Yes (first launch)  | Password for the default admin user                                              |
+| `SEED_FILE`        | No                  | Path to YAML seed config (default: `./tinyforge.yaml`)                           |
+| `DATA_DIR`         | No                  | SQLite database directory (default: `./data`)                                    |
+| `LISTEN_ADDR`      | No                  | HTTP listen address (default: `:8080`)                                           |
+| `NPM_URL`          | No                  | Override NPM API URL (otherwise uses value from settings)                        |
+| `POLLING_INTERVAL` | No                  | Registry polling interval, Go duration string e.g. `5m` (default from settings) |
+
+### Seed Config
+
+On first launch, Tinyforge imports a YAML seed file to pre-configure registries, projects, and settings. See `tinyforge.example.yaml` for the full format.
+
+### Webhook Integration
+
+After setup, find your webhook URL at **Settings > Webhook URL** in the dashboard. Configure your CI/CD (Gitea Actions, GitHub Actions) to POST to this URL on image push:
+
+```bash
+curl -X POST https://your-domain/api/webhook/<secret> \
+  -H "Content-Type: application/json" \
+  -d '{"image": "registry.example.com/org/app:v1.2.3"}'
+```
+
+### OIDC Setup
+
+1. Go to **Settings > Auth** in the dashboard
+2. Switch auth mode to **OIDC**
+3. Enter your provider's Issuer URL, Client ID, and Client Secret
+4. Set the Redirect URL to `https://your-domain/api/auth/oidc/callback`
+
+## CLI
+
+`tinyforge` is a terminal client for driving a server from the shell, built on the same HTTP API as the web UI.
+
+### Build
+
+```bash
+go build -o tinyforge ./cmd/cli      # ./tinyforge (tinyforge.exe on Windows)
+```
+
+### Usage
+
+```bash
+# Log in once — caches a 24h token in ~/.tinyforge/config.json (mode 0600)
+tinyforge login --base-url http://localhost:8090
+# ...or non-interactively (no password echo / shell-history leak):
+TINYFORGE_PASSWORD=… tinyforge login --base-url http://localhost:8090 --user admin
+
+tinyforge apps                              # list apps + container state
+tinyforge deploy my-app                     # deploy and wait for completion
+tinyforge deploy my-app --ref v1.2.3 --note "hotfix"
+tinyforge logs my-app -f                    # follow logs (Ctrl-C to stop)
+tinyforge status                            # server health + current user
+tinyforge status my-app                     # one app's containers
+tinyforge logout                            # revoke + clear the cached token
+```
+
+### Server & token resolution
+
+| Setting  | Flag         | Env               | Default                  |
+| -------- | ------------ | ----------------- | ------------------------ |
+| Base URL | `--base-url` | `TINYFORGE_URL`   | `http://localhost:8080`  |
+| Token    | `--token`    | `TINYFORGE_TOKEN` | cached by `login`        |
+| Config   | `--config`   | `TINYFORGE_CONFIG`| `~/.tinyforge/config.json` |
+
+### Notes
+
+- Login returns a **24h JWT**. There is no long-lived API token yet, so unattended scripts must log in again once the token expires. `deploy` / `stop` / `start` require an **admin** account.
+- The token is sent as an `Authorization: Bearer` header (never placed in the URL) and the config file is written with `0600` permissions.
+
+## Development
+
+```bash
+# Build frontend
+cd web && npm install && npm run build && cd ..
+
+# Run backend (requires ENCRYPTION_KEY and ADMIN_PASSWORD env vars)
+go run ./cmd/server
+
+# Or use Make
+make build
+make dev
+```
+
+## Architecture
+
+```text
+CI/Registry --> Webhook/Poller --> Deployer --> Docker + NPM
+                                      |
+Git Repo ----> Static Sites -------> Docker + NPM
+                                      |
+                                  Event Bus --> SSE --> Web Dashboard
+```
+
+- **Backend**: Go 1.25, chi router, SQLite (pure Go), Docker SDK
+- **Frontend**: SvelteKit 2, Tailwind CSS 4, TypeScript
+- **Deployment**: Single binary with embedded SPA, multi-stage Dockerfile
@@ -0,0 +1,149 @@
+package main
+
+import (
+	"context"
+	"flag"
+	"fmt"
+	"os"
+	"sort"
+	"strings"
+	"text/tabwriter"
+	"time"
+)
+
+// runApps implements `tinyforge apps [list]`. It fetches every workload and
+// every container, keeps the workloads that are apps, and prints one table row
+// per app (sorted by name) with a condensed container state.
+func runApps(args []string) error {
+	// `tinyforge apps list` is an alias for `tinyforge apps`; drop the word.
+	if len(args) != 0 && args[0] == "list" {
+		args = args[1:]
+	}
+
+	flags := flag.NewFlagSet("apps", flag.ExitOnError)
+	global := addGlobalFlags(flags)
+	flags.Usage = func() {
+		fmt.Fprint(os.Stderr, "Usage: tinyforge apps [list] [--base-url URL]\n\nList apps (workloads with a source) and their container state.\n")
+		flags.PrintDefaults()
+	}
+	if err := flags.Parse(args); err != nil {
+		return err
+	}
+
+	session, err := newSession(global)
+	if err != nil {
+		return err
+	}
+
+	// Both API calls share one 30-second budget.
+	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
+	defer cancel()
+
+	var all []Workload
+	if err = session.client.doJSON(ctx, "GET", "/api/workloads", nil, &all); err != nil {
+		return err
+	}
+
+	// A single bulk container fetch avoids one request per app.
+	var containers []Container
+	if err = session.client.doJSON(ctx, "GET", "/api/containers", nil, &containers); err != nil {
+		return err
+	}
+	grouped := make(map[string][]Container)
+	for i := range containers {
+		owner := containers[i].WorkloadID
+		grouped[owner] = append(grouped[owner], containers[i])
+	}
+
+	apps := make([]Workload, 0, len(all))
+	for i := range all {
+		if !all[i].isApp() {
+			continue
+		}
+		apps = append(apps, all[i])
+	}
+	sort.Slice(apps, func(i, j int) bool { return apps[i].Name < apps[j].Name })
+
+	if len(apps) == 0 {
+		fmt.Println("No apps yet. Create one in the web UI, then deploy with 'tinyforge deploy <app>'.")
+		return nil
+	}
+
+	table := tabwriter.NewWriter(os.Stdout, 0, 2, 2, ' ', 0)
+	fmt.Fprintln(table, "NAME\tSOURCE\tSTATE\tID")
+	for _, app := range apps {
+		fmt.Fprintf(table, "%s\t%s\t%s\t%s\n", app.Name, app.SourceKind, stateSummary(grouped[app.ID]), idShort(app.ID))
+	}
+	return table.Flush()
+}
+
+// stateSummary condenses a workload's containers into one status word.
+func stateSummary(cs []Container) string {
+	if len(cs) == 0 {
+		return "—"
+	}
+	running := 0
+	for _, c := range cs {
+		if c.State == "running" {
+			running++
+		}
+	}
+	switch {
+	case running == len(cs):
+		return "running"
+	case running == 0:
+		return cs[0].State // e.g. stopped / failed / missing
+	default:
+		return fmt.Sprintf("%d/%d running", running, len(cs))
+	}
+}
+
+// resolveApp maps a user-supplied reference (name, full id, or id prefix) to a
+// single app workload. Workloads that are not apps are ignored. Precedence:
+//
+//  1. exact id
+//  2. name, compared case-insensitively; when several apps differ only by
+//     case, the one whose name equals ref byte-for-byte wins
+//  3. unique id prefix (ref must be at least 6 characters long)
+//
+// An ambiguous name or prefix yields an error listing the candidates; no match
+// at all yields a "no app matching" error. Errors from the workload listing
+// request are returned unchanged.
+func resolveApp(ctx context.Context, c *Client, ref string) (Workload, error) {
+	var workloads []Workload
+	if err := c.doJSON(ctx, "GET", "/api/workloads", nil, &workloads); err != nil {
+		return Workload{}, err
+	}
+
+	var byID, byExactName, byName, byPrefix []Workload
+	for _, w := range workloads {
+		if !w.isApp() {
+			continue
+		}
+		switch {
+		case w.ID == ref:
+			byID = append(byID, w)
+		case strings.EqualFold(w.Name, ref):
+			byName = append(byName, w)
+			if w.Name == ref {
+				byExactName = append(byExactName, w)
+			}
+		case len(ref) >= 6 && strings.HasPrefix(w.ID, ref):
+			byPrefix = append(byPrefix, w)
+		}
+	}
+
+	switch {
+	case len(byID) == 1:
+		return byID[0], nil
+	case len(byName) == 1:
+		return byName[0], nil
+	case len(byExactName) == 1:
+		// Several names fold to ref, but only one is spelled exactly like it,
+		// so the reference is not actually ambiguous.
+		return byExactName[0], nil
+	case len(byName) > 1:
+		return Workload{}, ambiguousErr(ref, byName)
+	case len(byPrefix) == 1:
+		return byPrefix[0], nil
+	case len(byPrefix) > 1:
+		return Workload{}, ambiguousErr(ref, byPrefix)
+	}
+	return Workload{}, fmt.Errorf("no app matching %q (try 'tinyforge apps list')", ref)
+}
+
+func ambiguousErr(ref string, matches []Workload) error {
+	var b strings.Builder
+	fmt.Fprintf(&b, "%q matches multiple apps; use the id:\n", ref)
+	for _, w := range matches {
+		fmt.Fprintf(&b, "  %s  %s\n", idShort(w.ID), w.Name)
+	}
+	return fmt.Errorf("%s", strings.TrimRight(b.String(), "\n"))
+}
+
+func idShort(id string) string {
+	if len(id) > 8 {
+		return id[:8]
+	}
+	return id
+}
@@ -0,0 +1,232 @@
+package main
+
+import (
+	"bufio"
+	"bytes"
+	"context"
+	"encoding/json"
+	"errors"
+	"fmt"
+	"io"
+	"net/http"
+	"strings"
+)
+
+// apiError carries the server's error message plus the HTTP status, so callers
+// can distinguish auth failures (401) from other errors without losing the
+// server's message (e.g. "invalid credentials" vs "invalid or expired token").
+type apiError struct {
+	status int    // HTTP status code of the failed response
+	msg    string // text returned by Error
+}
+
+// Error implements the error interface. It returns msg only; the status is
+// not part of the text.
+func (e *apiError) Error() string { return e.msg }
+
+// isAuthError reports whether err is, or wraps, an *apiError whose HTTP status
+// is 401 Unauthorized.
+func isAuthError(err error) bool {
+	var apiErr *apiError
+	if !errors.As(err, &apiErr) {
+		return false
+	}
+	return apiErr.status == http.StatusUnauthorized
+}
+
+// Client talks to the Tinyforge HTTP API. It has no global timeout so that
+// long synchronous deploys and follow streams work; callers pass a context
+// with the appropriate deadline.
+type Client struct {
+	baseURL string       // server root; newClient strips trailing slashes
+	token   string       // bearer token; when empty no Authorization header is sent
+	http    *http.Client // underlying client; newClient creates it without a Timeout
+}
+
+// newClient builds a Client for baseURL (trailing slashes removed) that sends
+// token as its bearer credential. The underlying http.Client has no Timeout.
+func newClient(baseURL, token string) *Client {
+	c := new(Client)
+	c.baseURL = strings.TrimRight(baseURL, "/")
+	c.token = token
+	c.http = &http.Client{}
+	return c
+}
+
+// apiEnvelope mirrors the server's response wrapper. The server's struct is
+// unexported, so the CLI defines its own matching shape. Data is deferred so a
+// single decode path serves every endpoint.
+type apiEnvelope struct {
+	Success bool            `json:"success"` // false makes doJSON report a failure
+	Data    json.RawMessage `json:"data"`    // endpoint-specific payload, decoded later into the caller's target
+	Error   string          `json:"error"`   // server message, used as the error text when non-empty
+}
+
+// SessionToken is the data payload of POST /api/auth/login.
+type SessionToken struct {
+	Token     string `json:"token"`
+	ExpiresAt string `json:"expires_at"`
+}
+
+// User is the data payload of GET /api/auth/me.
+type User struct {
+	ID       string `json:"id"`
+	Username string `json:"username"`
+	Email    string `json:"email"`
+	Role     string `json:"role"`
+}
+
+// Workload is the subset of the workload row the CLI needs. An "app" is a
+// workload with a non-empty SourceKind.
+type Workload struct {
+	ID         string `json:"id"`
+	Name       string `json:"name"`
+	Kind       string `json:"kind"`
+	AppID      string `json:"app_id"`
+	SourceKind string `json:"source_kind"`
+	CreatedAt  string `json:"created_at"`
+}
+
+// isApp reports whether the workload counts as an app, i.e. whether its
+// SourceKind is non-empty.
+func (w Workload) isApp() bool { return w.SourceKind != "" }
+
+// Container is the subset of a container row the CLI needs. State is one of
+// running|stopped|failed|missing|starting|created|restarting|paused|...
+type Container struct {
+	ID          string `json:"id"`
+	WorkloadID  string `json:"workload_id"`
+	Role        string `json:"role"`
+	ContainerID string `json:"container_id"`
+	ImageRef    string `json:"image_ref"`
+	State       string `json:"state"`
+	Port        int    `json:"port"`
+	Subdomain   string `json:"subdomain"`
+	CreatedAt   string `json:"created_at"`
+}
+
+// DeployResult is the data payload of POST /api/workloads/{id}/deploy.
+type DeployResult struct {
+	WorkloadID  string `json:"workload_id"`
+	Reference   string `json:"reference"`
+	TriggeredBy string `json:"triggered_by"`
+}
+
+// doJSON performs a JSON request and unwraps the response envelope.
+//
+// body, when non-nil, is marshalled and sent as the JSON request body. out,
+// when non-nil, receives the envelope's data payload; it is left untouched if
+// the server sent no data.
+//
+// Failures reported by the server (HTTP status >= 400, or an envelope with
+// success=false) are returned as *apiError carrying the status and the
+// server's message, so isAuthError can recognise a 401. Transport, encoding
+// and decoding problems are returned as plain wrapped errors. A 2xx response
+// with an empty body (e.g. 204 No Content) is treated as success without data.
+func (c *Client) doJSON(ctx context.Context, method, path string, body, out any) error {
+	var reqBody io.Reader
+	if body != nil {
+		buf, err := json.Marshal(body)
+		if err != nil {
+			return fmt.Errorf("encode request: %w", err)
+		}
+		reqBody = bytes.NewReader(buf)
+	}
+
+	req, err := http.NewRequestWithContext(ctx, method, c.baseURL+path, reqBody)
+	if err != nil {
+		return fmt.Errorf("build request: %w", err)
+	}
+	if body != nil {
+		req.Header.Set("Content-Type", "application/json")
+	}
+	c.authorize(req)
+
+	resp, err := c.http.Do(req)
+	if err != nil {
+		return fmt.Errorf("%s %s: %w", method, path, err)
+	}
+	defer resp.Body.Close()
+
+	// Cap the read at 8 MiB so a misbehaving server cannot exhaust memory.
+	raw, err := io.ReadAll(io.LimitReader(resp.Body, 8<<20))
+	if err != nil {
+		return fmt.Errorf("read response: %w", err)
+	}
+
+	// A 2xx with no body carries no envelope to decode; that is success
+	// without data rather than a malformed response.
+	if resp.StatusCode >= 200 && resp.StatusCode < 300 && len(bytes.TrimSpace(raw)) == 0 {
+		return nil
+	}
+
+	var env apiEnvelope
+	if jsonErr := json.Unmarshal(raw, &env); jsonErr != nil {
+		// Non-JSON body (e.g. a proxy error page). Surface status + a snippet,
+		// preserving auth-error typing for 401s with an unparseable body.
+		if resp.StatusCode >= 400 {
+			return &apiError{status: resp.StatusCode, msg: fmt.Sprintf(
+				"%s %s: unexpected response (status %d): %s", method, path, resp.StatusCode, snippet(raw))}
+		}
+		return fmt.Errorf("%s %s: decode response: %w", method, path, jsonErr)
+	}
+	if resp.StatusCode >= 400 || !env.Success {
+		msg := env.Error
+		if msg == "" {
+			msg = fmt.Sprintf("%s %s: request failed (status %d)", method, path, resp.StatusCode)
+		}
+		return &apiError{status: resp.StatusCode, msg: msg}
+	}
+	if out != nil && len(env.Data) > 0 {
+		if err := json.Unmarshal(env.Data, out); err != nil {
+			return fmt.Errorf("decode response data: %w", err)
+		}
+	}
+	return nil
+}
+
+// authorize adds the bearer token to req when one is configured; without a
+// token the request is left untouched. Sending the JWT in the Authorization
+// header (instead of a ?token= query parameter) keeps it out of server and
+// proxy logs.
+func (c *Client) authorize(req *http.Request) {
+	if c.token == "" {
+		return
+	}
+	req.Header.Set("Authorization", "Bearer "+c.token)
+}
+
+// streamSSE issues a GET for path with an event-stream Accept header and
+// hands the payload of every `data:` line to onData. Blank lines, comment
+// lines (starting with ':', used for heartbeats) and other SSE fields are
+// skipped. A non-200 response is returned as *apiError, using the server's
+// envelope message when the body contains one. The stream ends on EOF, on
+// context cancellation (both yield nil), or with the first error returned by
+// onData.
+func (c *Client) streamSSE(ctx context.Context, path string, onData func(payload []byte) error) error {
+	req, err := http.NewRequestWithContext(ctx, http.MethodGet, c.baseURL+path, nil)
+	if err != nil {
+		return fmt.Errorf("build request: %w", err)
+	}
+	req.Header.Set("Accept", "text/event-stream")
+	c.authorize(req)
+
+	resp, err := c.http.Do(req)
+	if err != nil {
+		return fmt.Errorf("GET %s: %w", path, err)
+	}
+	defer resp.Body.Close()
+
+	if resp.StatusCode != http.StatusOK {
+		failure := &apiError{
+			status: resp.StatusCode,
+			msg:    fmt.Sprintf("GET %s: stream failed (status %d)", path, resp.StatusCode),
+		}
+		// Best effort: a read or decode problem just keeps the generic message.
+		body, _ := io.ReadAll(io.LimitReader(resp.Body, 1<<20))
+		var env apiEnvelope
+		if decodeErr := json.Unmarshal(body, &env); decodeErr == nil && env.Error != "" {
+			failure.msg = env.Error
+		}
+		return failure
+	}
+
+	lines := bufio.NewScanner(resp.Body)
+	lines.Buffer(make([]byte, 0, 64<<10), 2<<20) // tolerate long log lines
+	for lines.Scan() {
+		text := lines.Text()
+		switch {
+		case text == "", strings.HasPrefix(text, ":"):
+			continue // blank separator or SSE comment/heartbeat
+		case !strings.HasPrefix(text, "data:"):
+			continue // ignore event:/id: fields — the API uses default events
+		}
+		// Drop the field name and the single optional space that follows it.
+		payload := strings.TrimPrefix(strings.TrimPrefix(text, "data:"), " ")
+		if cbErr := onData([]byte(payload)); cbErr != nil {
+			return cbErr
+		}
+	}
+
+	err = lines.Err()
+	if err == nil || errors.Is(err, context.Canceled) {
+		return nil
+	}
+	return fmt.Errorf("read stream: %w", err)
+}
+
+// snippet returns a short, single-line view of an unexpected response body for
+// use in error messages. Surrounding whitespace is trimmed, newlines become
+// spaces, and anything beyond 200 bytes is replaced by "…". The cut is moved
+// back to a UTF-8 character boundary so a multi-byte character is never split.
+// A body that is empty after trimming is reported as "(empty body)".
+func snippet(b []byte) string {
+	const limit = 200 // maximum number of body bytes kept
+	s := strings.TrimSpace(string(b))
+	s = strings.ReplaceAll(s, "\n", " ")
+	if len(s) > limit {
+		cut := limit
+		// Bytes of the form 10xxxxxx continue a multi-byte character; step
+		// back to the byte that starts it so the output stays valid UTF-8.
+		for cut > 0 && s[cut]&0xC0 == 0x80 {
+			cut--
+		}
+		s = s[:cut] + "…"
+	}
+	if s == "" {
+		s = "(empty body)"
+	}
+	return s
+}
@@ -0,0 +1,148 @@
+package main
+
+import (
+	"bytes"
+	"encoding/json"
+	"flag"
+	"fmt"
+	"os"
+	"path/filepath"
+)
+
+// defaultBaseURL matches the server's default LISTEN_ADDR (:8080). The dev
+// server runs on :8090; point at it with --base-url or $TINYFORGE_URL.
+// It is the last fallback when neither a flag, the environment, nor the saved
+// config supplies a server URL.
+const defaultBaseURL = "http://localhost:8080"
+
+// Config is the persisted CLI state at ~/.tinyforge/config.json.
+type Config struct {
+	// BaseURL is the saved server URL; it is consulted only when neither
+	// --base-url nor $TINYFORGE_URL is set.
+	BaseURL string `json:"base_url"`
+	// Token is the cached auth token; it is consulted only when neither
+	// --token nor $TINYFORGE_TOKEN is set.
+	Token string `json:"token"`
+	// ExpiresAt is the token expiry as reported by the server at login. It is
+	// displayed best-effort as an RFC3339 timestamp and otherwise shown verbatim.
+	ExpiresAt string `json:"expires_at"`
+}
+
+// globals holds the cross-cutting flags every command accepts. Each field is
+// the pointer handed back by the flag set when the flag was registered, so its
+// value is only meaningful once that flag set has been parsed.
+type globals struct {
+	baseURL    *string // --base-url
+	token      *string // --token
+	configPath *string // --config
+}
+
+// addGlobalFlags defines --base-url, --token and --config on fs and returns
+// the handles through which their parsed values can be read.
+func addGlobalFlags(fs *flag.FlagSet) *globals {
+	g := new(globals)
+	g.baseURL = fs.String("base-url", "", "Tinyforge server URL (default $TINYFORGE_URL or "+defaultBaseURL+")")
+	g.token = fs.String("token", "", "auth token (default $TINYFORGE_TOKEN or cached config)")
+	g.configPath = fs.String("config", "", "config file path (default $TINYFORGE_CONFIG or ~/.tinyforge/config.json)")
+	return g
+}
+
+// configFilePath picks the config file location. The first non-empty source
+// wins: the --config flag, then $TINYFORGE_CONFIG, then
+// ~/.tinyforge/config.json under the user's home directory. A nil g skips the
+// flag. An error is returned only when the home directory cannot be found.
+func configFilePath(g *globals) (string, error) {
+	if g != nil {
+		if fromFlag := *g.configPath; fromFlag != "" {
+			return fromFlag, nil
+		}
+	}
+	if fromEnv := os.Getenv("TINYFORGE_CONFIG"); fromEnv != "" {
+		return fromEnv, nil
+	}
+
+	homeDir, err := os.UserHomeDir()
+	if err != nil {
+		return "", fmt.Errorf("locate home directory: %w", err)
+	}
+	return filepath.Join(homeDir, ".tinyforge", "config.json"), nil
+}
+
+// loadConfig reads and decodes the config file at path. A file that does not
+// exist, or that holds nothing but whitespace, is the first-run case and
+// produces a zero Config with a nil error. Any other read failure or a JSON
+// syntax problem is returned wrapped with the path.
+func loadConfig(path string) (Config, error) {
+	var cfg Config
+
+	raw, err := os.ReadFile(path)
+	switch {
+	case err == nil:
+		// fall through to decoding
+	case os.IsNotExist(err):
+		return cfg, nil
+	default:
+		return cfg, fmt.Errorf("read config %s: %w", path, err)
+	}
+
+	// A freshly touched file has no JSON in it yet; that is not a parse error.
+	if len(bytes.TrimSpace(raw)) == 0 {
+		return cfg, nil
+	}
+	if err := json.Unmarshal(raw, &cfg); err != nil {
+		return cfg, fmt.Errorf("parse config %s: %w", path, err)
+	}
+	return cfg, nil
+}
+
+// saveConfig writes cfg as indented JSON to path with 0600 permissions, since
+// it holds a bearer token. The parent directory is created (0700) if absent.
+//
+// The mode passed when opening a file only applies when the file is created,
+// so a pre-existing config with looser permissions is tightened through the
+// open descriptor BEFORE any token bytes are written. The secret is therefore
+// never on disk in a file that other users can read.
+func saveConfig(path string, cfg Config) error {
+	// filepath.Dir never returns an empty string ("." for a bare file name),
+	// so the directory can be created unconditionally.
+	if err := os.MkdirAll(filepath.Dir(path), 0o700); err != nil {
+		return fmt.Errorf("create config dir: %w", err)
+	}
+	data, err := json.MarshalIndent(cfg, "", "  ")
+	if err != nil {
+		return fmt.Errorf("encode config: %w", err)
+	}
+
+	f, err := os.OpenFile(path, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o600)
+	if err != nil {
+		return fmt.Errorf("write config %s: %w", path, err)
+	}
+	// Tighten an existing file while it is still empty (just truncated).
+	if err := f.Chmod(0o600); err != nil {
+		_ = f.Close() // the chmod failure is the error worth reporting
+		return fmt.Errorf("secure config %s: %w", path, err)
+	}
+	if _, err := f.Write(append(data, '\n')); err != nil {
+		_ = f.Close() // the write failure is the error worth reporting
+		return fmt.Errorf("write config %s: %w", path, err)
+	}
+	if err := f.Close(); err != nil {
+		return fmt.Errorf("write config %s: %w", path, err)
+	}
+	return nil
+}
+
+// resolveBaseURL returns the server URL from the first non-empty source, in
+// order: the --base-url flag, $TINYFORGE_URL, the saved config, and finally
+// defaultBaseURL. A nil g skips the flag.
+func resolveBaseURL(g *globals, cfg Config) string {
+	var fromFlag string
+	if g != nil {
+		fromFlag = *g.baseURL
+	}
+	for _, candidate := range []string{fromFlag, os.Getenv("TINYFORGE_URL"), cfg.BaseURL} {
+		if candidate != "" {
+			return candidate
+		}
+	}
+	return defaultBaseURL
+}
+
+// resolveToken returns the auth token from the first non-empty source, in
+// order: the --token flag, $TINYFORGE_TOKEN, then the saved config (which may
+// itself be empty). A nil g skips the flag.
+func resolveToken(g *globals, cfg Config) string {
+	if g != nil {
+		if fromFlag := *g.token; fromFlag != "" {
+			return fromFlag
+		}
+	}
+	if fromEnv := os.Getenv("TINYFORGE_TOKEN"); fromEnv != "" {
+		return fromEnv
+	}
+	return cfg.Token
+}
+
+// session bundles the resolved client with the loaded config and its path, so
+// commands can both make requests and persist updates (e.g. login).
+type session struct {
+	client     *Client // API client built from the resolved base URL and token
+	cfg        Config  // config as loaded from configPath (zero value on first run)
+	configPath string  // file the config was loaded from and is saved back to
+}
+
+// newSession locates and loads the config file, resolves the base URL and
+// token from flags, environment and config, and returns a session wrapping a
+// client built from them. It fails when the config path cannot be determined
+// or the config file cannot be read or parsed.
+func newSession(g *globals) (*session, error) {
+	cfgPath, err := configFilePath(g)
+	if err != nil {
+		return nil, err
+	}
+	loaded, err := loadConfig(cfgPath)
+	if err != nil {
+		return nil, err
+	}
+
+	baseURL := resolveBaseURL(g, loaded)
+	token := resolveToken(g, loaded)
+	s := &session{
+		client:     newClient(baseURL, token),
+		cfg:        loaded,
+		configPath: cfgPath,
+	}
+	return s, nil
+}
@@ -0,0 +1,73 @@
+package main
+
+import (
+	"context"
+	"flag"
+	"fmt"
+	"os"
+	"time"
+)
+
+// runDeploy implements `tinyforge deploy <app>`: it resolves the app, POSTs a
+// deploy request (optionally carrying a reference and a note) and waits for
+// the server's response, which arrives only once the deploy has finished.
+//
+// Flags may appear before or after <app>, matching the documented usage
+// `deploy <app> [--ref TAG]`. Exactly one positional argument is required.
+func runDeploy(args []string) error {
+	fs := flag.NewFlagSet("deploy", flag.ExitOnError)
+	g := addGlobalFlags(fs)
+	ref := fs.String("ref", "", "image tag / git ref / source-specific deploy target")
+	note := fs.String("note", "", "free-text note recorded with the deploy")
+	timeout := fs.Duration("timeout", 15*time.Minute, "max time to wait for the deploy to finish")
+	fs.Usage = func() {
+		fmt.Fprint(os.Stderr, "Usage: tinyforge deploy <app> [--ref TAG] [--note TEXT] [--timeout DUR]\n\n"+
+			"Trigger a deploy and wait for it to finish. Requires an admin token.\n")
+		fs.PrintDefaults()
+	}
+
+	// The flag package stops at the first non-flag argument, which would turn
+	// everything after <app> into extra positionals. Re-parse after each
+	// positional so flags are honoured wherever they appear.
+	var positional []string
+	rest := args
+	for {
+		if err := fs.Parse(rest); err != nil {
+			return err
+		}
+		if fs.NArg() == 0 {
+			break
+		}
+		positional = append(positional, fs.Arg(0))
+		rest = fs.Args()[1:]
+	}
+	if len(positional) != 1 {
+		fs.Usage()
+		return fmt.Errorf("expected exactly one app (name or id)")
+	}
+
+	sess, err := newSession(g)
+	if err != nil {
+		return err
+	}
+
+	// Resolve the app on a short deadline; the deploy itself gets the full one.
+	resolveCtx, cancelResolve := context.WithTimeout(context.Background(), 30*time.Second)
+	defer cancelResolve()
+	app, err := resolveApp(resolveCtx, sess.client, positional[0])
+	if err != nil {
+		return err
+	}
+
+	// Only send the fields the user actually supplied.
+	body := map[string]string{}
+	if *ref != "" {
+		body["reference"] = *ref
+	}
+	if *note != "" {
+		body["note"] = *note
+	}
+
+	fmt.Printf("Deploying %s%s…\n", app.Name, refSuffix(*ref))
+
+	// The endpoint returns 202 but blocks until the deploy completes, so a
+	// success here means it finished; allow plenty of time for pull/build.
+	ctx, cancel := context.WithTimeout(context.Background(), *timeout)
+	defer cancel()
+
+	var result DeployResult
+	if err := sess.client.doJSON(ctx, "POST", "/api/workloads/"+app.ID+"/deploy", body, &result); err != nil {
+		return err
+	}
+
+	fmt.Printf("Deploy of %s completed (triggered by %s).\n", app.Name, result.TriggeredBy)
+	fmt.Printf("Follow with: tinyforge logs %s -f\n", app.Name)
+	return nil
+}
+
+// refSuffix renders the " @ <ref>" tail appended to the "Deploying …" line,
+// or nothing when no ref was given.
+func refSuffix(ref string) string {
+	if ref != "" {
+		return fmt.Sprintf(" @ %s", ref)
+	}
+	return ""
+}
@@ -0,0 +1,136 @@
+package main
+
+import (
+	"bufio"
+	"context"
+	"flag"
+	"fmt"
+	"os"
+	"strings"
+	"time"
+)
+
+// runLogin implements `tinyforge login`. The username comes from --user or an
+// interactive prompt; the password from --password, then
+// $TINYFORGE_PASSWORD, then a prompt. On success the token, its expiry and
+// the resolved base URL are saved to the config file.
+func runLogin(args []string) error {
+	fs := flag.NewFlagSet("login", flag.ExitOnError)
+	g := addGlobalFlags(fs)
+	userFlag := fs.String("user", "", "username (prompted if omitted)")
+	passFlag := fs.String("password", "", "password (insecure; prefer $TINYFORGE_PASSWORD or the prompt)")
+	fs.Usage = func() {
+		fmt.Fprint(os.Stderr, "Usage: tinyforge login [--user U] [--password P] [--base-url URL]\n\n"+
+			"Authenticate against the server and cache the token.\n")
+		fs.PrintDefaults()
+	}
+	if err := fs.Parse(args); err != nil {
+		return err
+	}
+
+	sess, err := newSession(g)
+	if err != nil {
+		return err
+	}
+
+	username := *userFlag
+	if username == "" {
+		if username, err = promptLine("Username: "); err != nil {
+			return err
+		}
+	}
+
+	// Password sources, most explicit first: flag, environment, prompt.
+	password := *passFlag
+	if password == "" {
+		password = os.Getenv("TINYFORGE_PASSWORD")
+	}
+	if password == "" {
+		if password, err = promptPassword("Password: "); err != nil {
+			return err
+		}
+	}
+	if username == "" || password == "" {
+		return fmt.Errorf("username and password are required")
+	}
+
+	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
+	defer cancel()
+
+	credentials := map[string]string{"username": username, "password": password}
+	var issued SessionToken
+	if err := sess.client.doJSON(ctx, "POST", "/api/auth/login", credentials, &issued); err != nil {
+		return err
+	}
+
+	// Store the base URL next to the token so later commands work without
+	// flags. saveConfig is responsible for the file's 0600 mode.
+	server := sess.client.baseURL
+	sess.cfg.BaseURL = server
+	sess.cfg.Token = issued.Token
+	sess.cfg.ExpiresAt = issued.ExpiresAt
+	if err := saveConfig(sess.configPath, sess.cfg); err != nil {
+		return err
+	}
+
+	fmt.Printf("Logged in to %s as %s.\n", server, username)
+	if until := friendlyExpiry(issued.ExpiresAt); until != "" {
+		fmt.Printf("Token valid until %s.\n", until)
+	}
+	return nil
+}
+
+// runLogout implements `tinyforge logout`. With no token in effect it only
+// reports "Not logged in.". Otherwise it asks the server to revoke the token
+// and, whether or not that succeeds, clears the token and expiry from the
+// saved config.
+func runLogout(args []string) error {
+	fs := flag.NewFlagSet("logout", flag.ExitOnError)
+	g := addGlobalFlags(fs)
+	if err := fs.Parse(args); err != nil {
+		return err
+	}
+
+	sess, err := newSession(g)
+	if err != nil {
+		return err
+	}
+	if sess.client.token == "" {
+		fmt.Println("Not logged in.")
+		return nil
+	}
+
+	ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second)
+	defer cancel()
+
+	// Revocation is best-effort: remember the outcome, but always wipe the
+	// local copy of the token.
+	revokeErr := sess.client.doJSON(ctx, "POST", "/api/auth/logout", nil, nil)
+
+	sess.cfg.Token, sess.cfg.ExpiresAt = "", ""
+	if err := saveConfig(sess.configPath, sess.cfg); err != nil {
+		return err
+	}
+
+	if revokeErr == nil {
+		fmt.Println("Logged out.")
+	} else {
+		fmt.Printf("Cleared local token (server revocation skipped: %v).\n", revokeErr)
+	}
+	return nil
+}
+
+// promptByteReader hands stdin to a buffered reader one byte per Read call.
+// The buffered reader re-checks for its delimiter after every Read, so it
+// never pulls input past the newline it is looking for.
+type promptByteReader struct{ f *os.File }
+
+// Read fills at most the first byte of p from the wrapped file.
+func (r promptByteReader) Read(p []byte) (int, error) {
+	if len(p) == 0 {
+		return 0, nil
+	}
+	return r.f.Read(p[:1])
+}
+
+// promptLine prints label to stderr and reads a single trimmed line from
+// stdin. A final line without a trailing newline is accepted; an error is
+// returned only when nothing at all could be read.
+//
+// Exactly one line is consumed. A plain buffered reader would read ahead and,
+// being discarded on return, silently swallow whatever follows on piped input
+// (for example the password line that comes after the username).
+func promptLine(label string) (string, error) {
+	fmt.Fprint(os.Stderr, label)
+	r := bufio.NewReader(promptByteReader{os.Stdin})
+	line, err := r.ReadString('\n')
+	if err != nil && line == "" {
+		return "", fmt.Errorf("read input: %w", err)
+	}
+	return strings.TrimSpace(line), nil
+}
+
+// friendlyExpiry renders an RFC3339 expiry timestamp in the local time zone
+// as "2006-01-02 15:04 MST". An empty input yields an empty string; a value
+// that is not valid RFC3339 is returned as-is.
+func friendlyExpiry(s string) string {
+	if s == "" {
+		return ""
+	}
+	parsed, err := time.Parse(time.RFC3339, s)
+	if err != nil {
+		return s
+	}
+	return parsed.In(time.Local).Format("2006-01-02 15:04 MST")
+}
@@ -0,0 +1,143 @@
+package main
+
+import (
+	"context"
+	"encoding/json"
+	"flag"
+	"fmt"
+	"net/url"
+	"os"
+	"os/signal"
+	"strings"
+	"time"
+)
+
+// runLogs implements `tinyforge logs <app>`: it resolves the app, picks one
+// of its containers and either prints the trailing log lines once or, with
+// -f, follows the SSE log stream until it ends or the user presses Ctrl-C.
+//
+// Flags may appear before or after <app>, matching the documented usage
+// `logs <app> [-f]`. Exactly one positional argument is required.
+func runLogs(args []string) error {
+	fs := flag.NewFlagSet("logs", flag.ExitOnError)
+	g := addGlobalFlags(fs)
+	follow := fs.Bool("f", false, "follow the log stream (Ctrl-C to stop)")
+	tail := fs.Int("tail", 200, "number of trailing lines to show (max 5000)")
+	container := fs.String("container", "", "container row id/prefix or role (when an app has several)")
+	fs.Usage = func() {
+		fmt.Fprint(os.Stderr, "Usage: tinyforge logs <app> [-f] [--tail N] [--container CID]\n\nPrint or follow a container's logs.\n")
+		fs.PrintDefaults()
+	}
+
+	// The flag package stops at the first non-flag argument, which would turn
+	// everything after <app> into extra positionals. Re-parse after each
+	// positional so flags are honoured wherever they appear.
+	var positional []string
+	rest := args
+	for {
+		if err := fs.Parse(rest); err != nil {
+			return err
+		}
+		if fs.NArg() == 0 {
+			break
+		}
+		positional = append(positional, fs.Arg(0))
+		rest = fs.Args()[1:]
+	}
+	if len(positional) != 1 {
+		fs.Usage()
+		return fmt.Errorf("expected exactly one app (name or id)")
+	}
+
+	sess, err := newSession(g)
+	if err != nil {
+		return err
+	}
+
+	// One short deadline covers app lookup, container listing and the
+	// non-follow log fetch.
+	resolveCtx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
+	defer cancel()
+
+	app, err := resolveApp(resolveCtx, sess.client, positional[0])
+	if err != nil {
+		return err
+	}
+
+	var containers []Container
+	if err := sess.client.doJSON(resolveCtx, "GET", "/api/workloads/"+app.ID+"/containers", nil, &containers); err != nil {
+		return err
+	}
+	target, err := chooseContainer(containers, *container)
+	if err != nil {
+		return err
+	}
+
+	q := url.Values{}
+	q.Set("tail", fmt.Sprintf("%d", *tail))
+	base := "/api/workloads/" + app.ID + "/containers/" + target.ID + "/logs"
+
+	if !*follow {
+		var lines []string
+		if err := sess.client.doJSON(resolveCtx, "GET", base+"?"+q.Encode(), nil, &lines); err != nil {
+			return err
+		}
+		for _, line := range lines {
+			fmt.Println(line)
+		}
+		return nil
+	}
+
+	// Follow: stream until EOF or Ctrl-C. No deadline — the stream is
+	// open-ended and only the interrupt signal cancels it.
+	q.Set("follow", "true")
+	ctx, stop := signal.NotifyContext(context.Background(), os.Interrupt)
+	defer stop()
+
+	err = sess.client.streamSSE(ctx, base+"?"+q.Encode(), func(payload []byte) error {
+		var frame struct {
+			Line string `json:"line"`
+		}
+		if json.Unmarshal(payload, &frame) != nil {
+			return nil // ignore frames we can't parse
+		}
+		fmt.Println(frame.Line)
+		return nil
+	})
+	if ctx.Err() != nil { // user interrupted — clean exit
+		return nil
+	}
+	return err
+}
+
+// chooseContainer decides which container's logs to read.
+//
+// Without a selector it returns the only container, or failing that the only
+// one whose state is "running"; anything else is ambiguous. With a selector, a
+// container matches when its id equals the selector, its role equals it
+// case-insensitively, or (for selectors of six or more characters) its id
+// starts with it; exactly one match is required. Every failure message ends
+// with the list of available containers, except for an app that has none.
+func chooseContainer(cs []Container, selector string) (Container, error) {
+	var none Container
+	if len(cs) == 0 {
+		return none, fmt.Errorf("app has no containers yet — deploy it first")
+	}
+
+	if selector == "" {
+		if len(cs) == 1 {
+			return cs[0], nil
+		}
+		var live []Container
+		for i := range cs {
+			if cs[i].State == "running" {
+				live = append(live, cs[i])
+			}
+		}
+		if len(live) == 1 {
+			return live[0], nil
+		}
+		return none, fmt.Errorf("app has %d containers; pick one with --container:\n%s", len(cs), containerList(cs))
+	}
+
+	// Short selectors are too ambiguous to be treated as id prefixes.
+	prefixAllowed := len(selector) >= 6
+	var hits []Container
+	for _, candidate := range cs {
+		switch {
+		case candidate.ID == selector,
+			strings.EqualFold(candidate.Role, selector),
+			prefixAllowed && strings.HasPrefix(candidate.ID, selector):
+			hits = append(hits, candidate)
+		}
+	}
+
+	if len(hits) == 1 {
+		return hits[0], nil
+	}
+	if len(hits) == 0 {
+		return none, fmt.Errorf("no container matching %q\n%s", selector, containerList(cs))
+	}
+	return none, fmt.Errorf("%q matches multiple containers\n%s", selector, containerList(cs))
+}
+
+// containerList renders one indented row per container (short id, role,
+// state) for use in error messages. A container with no role is shown as
+// "(default)". The result carries no trailing newline.
+func containerList(cs []Container) string {
+	rows := make([]string, 0, len(cs))
+	for _, entry := range cs {
+		label := entry.Role
+		if label == "" {
+			label = "(default)"
+		}
+		rows = append(rows, fmt.Sprintf("  %s  %-12s %s", idShort(entry.ID), label, entry.State))
+	}
+	return strings.TrimRight(strings.Join(rows, "\n"), "\n")
+}
@@ -0,0 +1,95 @@
+// Command tinyforge is a terminal client for a Tinyforge server.
+//
+// It drives the existing HTTP API: log in to obtain a 24h JWT, then list
+// apps, trigger deploys, stream logs, and check status. The token is cached
+// in ~/.tinyforge/config.json (mode 0600) so subsequent commands reuse it.
+//
+// Usage:
+//
+//	tinyforge login [--user U] [--password P]
+//	tinyforge apps [list]
+//	tinyforge deploy <app> [--ref TAG] [--note TEXT]
+//	tinyforge logs <app> [-f] [--tail N] [--container CID]
+//	tinyforge status [<app>]
+//	tinyforge logout
+//	tinyforge version
+//
+// The target server is resolved from --base-url, then $TINYFORGE_URL, then the
+// saved config, then http://localhost:8080.
+package main
+
+import (
+	"fmt"
+	"os"
+)
+
+// version is the CLI build version, printed by `tinyforge version`. It is
+// "dev" unless overridden at build time via
+// -ldflags "-X main.version=...".
+var version = "dev"
+
+// main dispatches os.Args[1] to the matching subcommand. It exits with status
+// 2 for a missing or unknown command and with status 1 when a command returns
+// an error, which is printed to stderr prefixed with "tinyforge: ".
+func main() {
+	if len(os.Args) < 2 {
+		usage(os.Stderr)
+		os.Exit(2)
+	}
+
+	cmd, args := os.Args[1], os.Args[2:]
+
+	// Subcommands that take arguments and can fail.
+	commands := map[string]func([]string) error{
+		"login":  runLogin,
+		"logout": runLogout,
+		"apps":   runApps,
+		"deploy": runDeploy,
+		"logs":   runLogs,
+		"status": runStatus,
+	}
+
+	var err error
+	if run, known := commands[cmd]; known {
+		err = run(args)
+	} else {
+		switch cmd {
+		case "version", "--version", "-v":
+			fmt.Printf("tinyforge %s\n", version)
+		case "help", "-h", "--help":
+			usage(os.Stdout)
+		default:
+			fmt.Fprintf(os.Stderr, "tinyforge: unknown command %q\n\n", cmd)
+			usage(os.Stderr)
+			os.Exit(2)
+		}
+	}
+
+	if err == nil {
+		return
+	}
+	// Authenticated commands that hit a 401 get a re-login hint; the login
+	// command itself surfaces the server message ("invalid credentials").
+	if cmd != "login" && isAuthError(err) {
+		err = fmt.Errorf("%w — run 'tinyforge login'", err)
+	}
+	fmt.Fprintf(os.Stderr, "tinyforge: %v\n", err)
+	os.Exit(1)
+}
+
+// usage writes the top-level help text (command list and global flags) to w.
+func usage(w *os.File) {
+	const help = `tinyforge — terminal client for a Tinyforge server
+
+Usage:
+  tinyforge <command> [flags]
+
+Commands:
+  login              Authenticate and cache a token
+  logout             Revoke the cached token and clear it
+  apps [list]        List your apps (workloads with a source)
+  deploy <app>       Trigger a deploy (waits for completion)
+  logs <app>         Print container logs (use -f to follow)
+  status [<app>]     Show server health, or one app's containers
+  version            Print the CLI version
+
+Global flags (accepted by any command):
+  --base-url URL     Server URL (default $TINYFORGE_URL or http://localhost:8080)
+  --token TOKEN      Auth token (default $TINYFORGE_TOKEN or cached config)
+  --config PATH      Config file (default $TINYFORGE_CONFIG or ~/.tinyforge/config.json)
+
+Run "tinyforge <command> -h" for command-specific flags.
+`
+	fmt.Fprint(w, help)
+}
@@ -0,0 +1,38 @@
+//go:build !windows
+
+package main
+
+import (
+	"bufio"
+	"fmt"
+	"os"
+	"os/exec"
+	"strings"
+)
+
+// promptPassword prints label to stderr and reads one line from stdin with
+// terminal echo switched off through stty. When stty fails (no tty, missing
+// binary) the read still happens, just with echo left as it was, so the
+// command keeps working in pipes/CI. Trailing CR/LF is stripped from the
+// result; an error is returned only when nothing could be read.
+func promptPassword(label string) (string, error) {
+	fmt.Fprint(os.Stderr, label)
+
+	if err := stty("-echo"); err == nil {
+		// Echo was turned off: restore it on the way out and emit the line
+		// break that the user's Enter keystroke did not produce.
+		defer func() {
+			_ = stty("echo")
+			fmt.Fprintln(os.Stderr)
+		}()
+	}
+
+	stdin := bufio.NewReader(os.Stdin)
+	raw, readErr := stdin.ReadString('\n')
+	if raw == "" && readErr != nil {
+		return "", fmt.Errorf("read password: %w", readErr)
+	}
+	return strings.TrimRight(raw, "\r\n"), nil
+}
+
+// stty runs the external `stty` program with a single argument, attached to
+// this process's stdin so it acts on the same terminal. It returns the
+// command's run error, if any.
+func stty(arg string) error {
+	sttyCmd := exec.Command("stty", arg)
+	sttyCmd.Stdin = os.Stdin
+	return sttyCmd.Run()
+}
@@ -0,0 +1,45 @@
+//go:build windows
+
+package main
+
+import (
+	"bufio"
+	"fmt"
+	"os"
+	"strings"
+	"syscall"
+	"unsafe"
+)
+
+// enableEchoInput is the Windows console mode bit that echoes typed input
+// (ENABLE_ECHO_INPUT in the Win32 console API). Clearing it in the mode passed
+// to SetConsoleMode hides what the user types.
+const enableEchoInput = 0x0004
+
+// promptPassword reads a password from the console with echo disabled, using
+// kernel32 directly so no third-party dependency is needed. If the console
+// mode cannot be changed (e.g. piped stdin), it falls back to an echoed read.
+//
+// The label is written to stderr. Trailing CR/LF is stripped from the result;
+// an error is returned only when nothing could be read.
+func promptPassword(label string) (string, error) {
+	fmt.Fprint(os.Stderr, label)
+
+	kernel32 := syscall.NewLazyDLL("kernel32.dll")
+	getConsoleMode := kernel32.NewProc("GetConsoleMode")
+	setConsoleMode := kernel32.NewProc("SetConsoleMode")
+	handle := syscall.Handle(os.Stdin.Fd())
+
+	var mode uint32
+	echoDisabled := false
+	// A non-zero return from either call is treated as success. Echo is only
+	// considered disabled when both reading the current mode and writing the
+	// mode with the echo bit cleared succeed.
+	if r, _, _ := getConsoleMode.Call(uintptr(handle), uintptr(unsafe.Pointer(&mode))); r != 0 {
+		if ret, _, _ := setConsoleMode.Call(uintptr(handle), uintptr(mode&^enableEchoInput)); ret != 0 {
+			echoDisabled = true
+			// Put the original console mode back when the function returns.
+			defer setConsoleMode.Call(uintptr(handle), uintptr(mode))
+		}
+	}
+
+	line, err := bufio.NewReader(os.Stdin).ReadString('\n')
+	if echoDisabled {
+		fmt.Fprintln(os.Stderr) // the Enter keystroke was not echoed
+	}
+	// A partial line followed by EOF is still accepted as the password.
+	if err != nil && line == "" {
+		return "", fmt.Errorf("read password: %w", err)
+	}
+	return strings.TrimRight(line, "\r\n"), nil
+}
@@ -0,0 +1,122 @@
+package main
+
+import (
+	"context"
+	"flag"
+	"fmt"
+	"os"
+	"text/tabwriter"
+	"time"
+)
+
+// runStatus implements `tinyforge status [<app>]`. With no app it reports
+// server health and the logged-in user; with an app it lists that app's
+// containers.
+//
+// Global flags may appear before or after <app>. More than one positional
+// argument is rejected rather than silently ignored.
+func runStatus(args []string) error {
+	fs := flag.NewFlagSet("status", flag.ExitOnError)
+	g := addGlobalFlags(fs)
+	fs.Usage = func() {
+		fmt.Fprint(os.Stderr, "Usage: tinyforge status [<app>]\n\nWith no app: server health and the logged-in user.\nWith an app: that app's containers.\n")
+		fs.PrintDefaults()
+	}
+
+	// The flag package stops at the first non-flag argument, so a flag placed
+	// after <app> would otherwise be dropped without notice. Re-parse after
+	// each positional so flags are honoured wherever they appear.
+	var positional []string
+	rest := args
+	for {
+		if err := fs.Parse(rest); err != nil {
+			return err
+		}
+		if fs.NArg() == 0 {
+			break
+		}
+		positional = append(positional, fs.Arg(0))
+		rest = fs.Args()[1:]
+	}
+	if len(positional) > 1 {
+		fs.Usage()
+		return fmt.Errorf("expected at most one app (name or id)")
+	}
+
+	sess, err := newSession(g)
+	if err != nil {
+		return err
+	}
+	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
+	defer cancel()
+
+	if len(positional) == 0 {
+		return serverStatus(ctx, sess)
+	}
+	return appStatus(ctx, sess.client, positional[0])
+}
+
+// serverStatus prints the server URL, the current user (or why it could not
+// be fetched), the cached token expiry when one is saved, and the database,
+// Docker and — when the health payload has one — proxy connection states. A
+// failed user lookup is only reported in the output; a failed health request
+// is returned as the error.
+func serverStatus(ctx context.Context, sess *session) error {
+	api := sess.client
+	fmt.Printf("Server:  %s\n", api.baseURL)
+
+	var me User
+	if meErr := api.doJSON(ctx, "GET", "/api/auth/me", nil, &me); meErr == nil {
+		fmt.Printf("User:    %s (%s)\n", me.Username, me.Role)
+	} else {
+		fmt.Printf("User:    not logged in (%v)\n", meErr)
+	}
+	if until := friendlyExpiry(sess.cfg.ExpiresAt); until != "" {
+		fmt.Printf("Token:   valid until %s\n", until)
+	}
+
+	var health map[string]any
+	if err := api.doJSON(ctx, "GET", "/api/health", nil, &health); err != nil {
+		return err
+	}
+
+	fmt.Printf("DB:      %s\n", connState(health, "database"))
+
+	dockerLine := connState(health, "docker")
+	if ver := nestedString(health, "docker", "version"); ver != "" {
+		dockerLine = dockerLine + " (v" + ver + ")"
+	}
+	fmt.Printf("Docker:  %s\n", dockerLine)
+
+	// The proxy line is shown only when the server reports a proxy section.
+	if _, hasProxy := health["proxy"]; hasProxy {
+		fmt.Printf("Proxy:   %s\n", connState(health, "proxy"))
+	}
+	return nil
+}
+
+// appStatus resolves ref to an app and prints a header line followed by a
+// table of its containers (role, state, image, port, subdomain, short id).
+// An app without containers gets a "not deployed yet" note instead of a table.
+func appStatus(ctx context.Context, c *Client, ref string) error {
+	app, err := resolveApp(ctx, c, ref)
+	if err != nil {
+		return err
+	}
+
+	var containers []Container
+	listPath := "/api/workloads/" + app.ID + "/containers"
+	if err := c.doJSON(ctx, "GET", listPath, nil, &containers); err != nil {
+		return err
+	}
+
+	fmt.Printf("%s  (%s, %s)\n", app.Name, app.SourceKind, idShort(app.ID))
+	if len(containers) == 0 {
+		fmt.Println("No containers — not deployed yet.")
+		return nil
+	}
+
+	table := tabwriter.NewWriter(os.Stdout, 0, 2, 2, ' ', 0)
+	fmt.Fprintln(table, "ROLE\tSTATE\tIMAGE\tPORT\tSUBDOMAIN\tCONTAINER")
+	for _, ctr := range containers {
+		roleCol := ctr.Role
+		if roleCol == "" {
+			roleCol = "(default)"
+		}
+		// A zero port is left blank rather than printed as "0".
+		var portCol string
+		if ctr.Port != 0 {
+			portCol = fmt.Sprintf("%d", ctr.Port)
+		}
+		fmt.Fprintf(table, "%s\t%s\t%s\t%s\t%s\t%s\n",
+			roleCol, ctr.State, ctr.ImageRef, portCol, ctr.Subdomain, idShort(ctr.ID))
+	}
+	return table.Flush()
+}
+
+// connState summarises health[section] as "connected" or "disconnected",
+// based on the section's boolean "connected" field. A disconnected section
+// with a non-empty "error" string has it appended in parentheses. A section
+// that is missing or is not a JSON object yields "unknown".
+func connState(health map[string]any, section string) string {
+	entry, ok := health[section].(map[string]any)
+	if !ok {
+		return "unknown"
+	}
+	if up, _ := entry["connected"].(bool); up {
+		return "connected"
+	}
+	detail, _ := entry["error"].(string)
+	if detail == "" {
+		return "disconnected"
+	}
+	return "disconnected (" + detail + ")"
+}
+
+// nestedString returns m[section][key] when the section is a JSON object and
+// the value under key is a string; in every other case it returns "".
+func nestedString(m map[string]any, section, key string) string {
+	if sub, ok := m[section].(map[string]any); ok {
+		if value, isString := sub[key].(string); isString {
+			return value
+		}
+	}
+	return ""
+}
@@ -3,6 +3,7 @@ package main
 import (
 	"context"
 	"errors"
+	"fmt"
 	"io/fs"
 	"log/slog"
 	"net/http"
@@ -12,21 +13,45 @@ import (
 	"syscall"
 	"time"

-	dockerwatcher "github.com/alexei/docker-watcher"
-	"github.com/alexei/docker-watcher/internal/api"
-	"github.com/alexei/docker-watcher/internal/auth"
-	"github.com/alexei/docker-watcher/internal/config"
-	"github.com/alexei/docker-watcher/internal/crypto"
-	"github.com/alexei/docker-watcher/internal/deployer"
-	"github.com/alexei/docker-watcher/internal/docker"
-	"github.com/alexei/docker-watcher/internal/events"
-	"github.com/alexei/docker-watcher/internal/health"
-	"github.com/alexei/docker-watcher/internal/logging"
-	"github.com/alexei/docker-watcher/internal/notify"
-	"github.com/alexei/docker-watcher/internal/npm"
-	"github.com/alexei/docker-watcher/internal/registry"
-	"github.com/alexei/docker-watcher/internal/store"
-	"github.com/alexei/docker-watcher/internal/webhook"
+	"github.com/robfig/cron/v3"
+
+	tinyforge "github.com/alexei/tinyforge"
+	"github.com/alexei/tinyforge/internal/api"
+	"github.com/alexei/tinyforge/internal/auth"
+	"github.com/alexei/tinyforge/internal/backup"
+	"github.com/alexei/tinyforge/internal/config"
+	"github.com/alexei/tinyforge/internal/crypto"
+	"github.com/alexei/tinyforge/internal/deployer"
+	"github.com/alexei/tinyforge/internal/dns"
+	"github.com/alexei/tinyforge/internal/docker"
+	"github.com/alexei/tinyforge/internal/events"
+	"github.com/alexei/tinyforge/internal/health"
+	"github.com/alexei/tinyforge/internal/logging"
+	"github.com/alexei/tinyforge/internal/logscanner"
+	"github.com/alexei/tinyforge/internal/metricalert"
+	"github.com/alexei/tinyforge/internal/notify"
+	"github.com/alexei/tinyforge/internal/npm"
+	"github.com/alexei/tinyforge/internal/proxy"
+	"github.com/alexei/tinyforge/internal/reconciler"
+	"github.com/alexei/tinyforge/internal/scheduler"
+	"github.com/alexei/tinyforge/internal/stale"
+	"github.com/alexei/tinyforge/internal/stats"
+	"github.com/alexei/tinyforge/internal/store"
+	"github.com/alexei/tinyforge/internal/volsnap"
+	"github.com/alexei/tinyforge/internal/webhook"
+	"github.com/alexei/tinyforge/internal/workload/plugin"
+
+	// Plugin registrations: each blank-import runs its init() and registers
+	// itself with internal/workload/plugin. Adding a new Source or Trigger
+	// is a matter of dropping a new package and adding it to this list.
+	_ "github.com/alexei/tinyforge/internal/workload/plugin/source/compose"
+	_ "github.com/alexei/tinyforge/internal/workload/plugin/source/dockerfile"
+	_ "github.com/alexei/tinyforge/internal/workload/plugin/source/image"
+	_ "github.com/alexei/tinyforge/internal/workload/plugin/source/static"
+	_ "github.com/alexei/tinyforge/internal/workload/plugin/trigger/git"
+	_ "github.com/alexei/tinyforge/internal/workload/plugin/trigger/manual"
+	_ "github.com/alexei/tinyforge/internal/workload/plugin/trigger/registry"
+	_ "github.com/alexei/tinyforge/internal/workload/plugin/trigger/schedule"
 )

 func main() {
@@ -40,8 +65,22 @@ func main() {
 		os.Exit(1)
 	}

+	// Acquire single-instance lockfile BEFORE opening the DB. SQLite +
+	// SetMaxOpenConns(1) does not protect against two Tinyforge processes
+	// sharing a data directory; without this guard a misconfigured
+	// systemd unit, container restart race, or `tinyforge` shell typo can
+	// silently double-fire schedulers, double-poll registries, and
+	// corrupt `extra_json` RMW. The lockfile is a PID file under
+	// $DATA_DIR/tinyforge.lock — collisions with dead PIDs are reclaimed.
+	releaseLock, err := store.AcquireLockfile(dataDir)
+	if err != nil {
+		slog.Error("could not acquire data-dir lock", "data_dir", dataDir, "error", err)
+		os.Exit(1)
+	}
+	defer releaseLock()
+
 	// Open database.
-	dbPath := filepath.Join(dataDir, "docker-watcher.db")
+	dbPath := filepath.Join(dataDir, "tinyforge.db")
 	db, err := store.New(dbPath)
 	if err != nil {
 		slog.Error("open store", "error", err)
@@ -49,18 +88,33 @@ func main() {
 	}
 	defer db.Close()

-	// Import seed config on first launch (idempotent).
-	seedPath := envOrDefault("SEED_FILE", "./docker-watcher.yaml")
-	if err := config.ImportSeed(db, seedPath); err != nil {
-		slog.Error("seed import", "error", err)
+	// Derive encryption key from environment (required).
+	encKey, err := crypto.KeyFromEnv()
+	if err != nil {
+		slog.Error("ENCRYPTION_KEY is required — set it to a random 32+ character string")
 		os.Exit(1)
 	}

-	// Derive encryption key from environment.
-	encKey, err := crypto.KeyFromEnv()
-	if err != nil {
-		slog.Warn("encryption key not set, using default", "warning", err.Error())
-		encKey = crypto.DeriveKey("docker-watcher-default-key")
+	// One-shot migration: rewrite every legacy unprefixed-hex secret
+	// in the DB into the new tf1: envelope form. Idempotent (gated by
+	// schema_versions version 2). Lets the rest of the codebase treat
+	// envelope-presence as a stable invariant for future key rotations.
+	// Failures here are logged but non-fatal: a partial migration just
+	// means some columns keep working through Decrypt's legacy
+	// fallback until the next manual save re-encrypts them.
+	if err := db.MigrateSecretsToEnvelope(store.EnvelopeMigrator{
+		HasEnvelope: crypto.HasEnvelope,
+		Decrypt:     func(v string) (string, error) { return crypto.Decrypt(encKey, v) },
+		Encrypt:     func(v string) (string, error) { return crypto.Encrypt(encKey, v) },
+	}); err != nil {
+		slog.Warn("secrets envelope migration", "error", err)
+	}
+
+	// Import seed config on first launch (idempotent).
+	seedPath := envOrDefault("SEED_FILE", "./tinyforge.yaml")
+	if err := config.ImportSeed(db, seedPath); err != nil {
+		slog.Error("seed import", "error", err)
+		os.Exit(1)
 	}

 	// Ensure default admin user exists on first launch.
@@ -77,6 +131,14 @@ func main() {
 	}
 	defer dockerClient.Close()

+	// Start the container index reconciler. Runs one boot pass and then
+	// ticks every 30s. Boot pass populates the containers table from any
+	// running containers that predate the workload refactor; subsequent
+	// ticks catch state drift the deployer didn't witness.
+	rec := reconciler.New(db, dockerClient, 30*time.Second)
+	rec.Start(context.Background())
+	defer rec.Stop()
+
 	// Read settings for NPM URL and polling interval.
 	settings, err := db.GetSettings()
 	if err != nil {
@@ -84,49 +146,296 @@ func main() {
 		os.Exit(1)
 	}

-	// Initialize NPM client.
+	// Initialize NPM client (used for NPM-specific endpoints like certificates).
 	npmURL := envOrDefault("NPM_URL", settings.NpmURL)
 	npmClient := npm.New(npmURL)

+	// Build proxy provider based on settings.
+	var proxyProvider proxy.Provider
+	switch settings.ProxyProvider {
+	case "none":
+		proxyProvider = proxy.NewNoneProvider()
+		slog.Info("proxy provider: none")
+	case "traefik":
+		proxyProvider = proxy.NewTraefikProvider(
+			settings.TraefikEntrypoint,
+			settings.TraefikCertResolver,
+			settings.TraefikNetwork,
+			settings.TraefikAPIURL,
+		)
+		slog.Info("proxy provider: traefik", "entrypoint", settings.TraefikEntrypoint)
+	default:
+		// Default to NPM for backward compatibility (including "npm" and empty string).
+		npmPassword := ""
+		if settings.NpmPassword != "" {
+			decrypted, err := crypto.Decrypt(encKey, settings.NpmPassword)
+			if err != nil {
+				slog.Warn("failed to decrypt NPM password for proxy provider", "error", err)
+			} else {
+				npmPassword = decrypted
+			}
+		}
+		proxyProvider = proxy.NewNpmProvider(npmClient, settings.NpmEmail, npmPassword)
+		slog.Info("proxy provider: npm", "url", npmURL)
+	}
+
 	// Initialize services.
 	healthChecker := health.New()
 	notifier := notify.New()
 	eventBus := events.New()

-	dep := deployer.New(dockerClient, npmClient, db, healthChecker, notifier, eventBus, encKey)
+	// Auto-persist warn/error events from the event bus to the database.
+	stopLogger := eventBus.RegisterPersistentLogger(func(source, severity, message, metadata string) (int64, string, error) {
+		evt, err := db.InsertEvent(store.EventLog{
+			Source:   source,
+			Severity: severity,
+			Message:  message,
+			Metadata: metadata,
+		})
+		if err != nil {
+			return 0, "", err
+		}
+		return evt.ID, evt.CreatedAt, nil
+	})
+	defer stopLogger()

-	// Initialize webhook handler.
-	webhookHandler := webhook.NewHandler(db, dep, dockerClient)
+	// Event-trigger dispatcher: consume EventLog publishes off the bus
+	// and fan out to operator-configured webhook actions.
+	stopTriggerDispatcher := events.RegisterEventTriggerDispatcher(eventBus, db, notifier)
+	defer stopTriggerDispatcher()

-	// Ensure webhook secret exists.
-	_, err = webhook.EnsureWebhookSecret(db)
+	dep := deployer.New(dockerClient, proxyProvider, db, healthChecker, notifier, eventBus, encKey)
+	rec.SetPluginReconciler(dep)
+
+	// Initialize webhook handler. The single inbound surface is
+	// /api/webhook/triggers/{secret}; the plugin dispatcher wires the
+	// trigger fan-out to the deployer.
+	webhookHandler := webhook.NewHandler(db)
+	webhookHandler.SetPluginDispatcher(dep)
+
+	// Scheduler ticks every 30s and dispatches "schedule"-kind triggers
+	// through the same FanOutForTrigger path as the inbound webhook. Boot
+	// runs one sweep immediately so a daily schedule does not idle 24h
+	// after a restart before catching up.
+	sched := scheduler.New(db, func(ctx context.Context, trg store.Trigger, evt plugin.InboundEvent) error {
+		results, err := webhookHandler.FanOutForTrigger(ctx, trg, evt)
+		if err != nil {
+			return err
+		}
+		// Log per-fire summary so a schedule that quietly fails on N
+		// of M bindings is visible without parsing per-binding rows.
+		var deployed, errored int
+		for _, r := range results {
+			switch {
+			case r.Deployed:
+				deployed++
+			case r.Reason == webhook.ReasonBindingDisabled, r.Reason == webhook.ReasonNoMatch,
+				r.Reason == webhook.ReasonPreviewNoop:
+				// not a failure — silent
+			default:
+				errored++
+			}
+		}
+		slog.Info("scheduler dispatch summary",
+			"trigger", trg.Name, "bindings", len(results),
+			"deployed", deployed, "errored", errored)
+		return nil
+	}, 30*time.Second)
+	sched.Start(context.Background())
+	defer sched.Stop()
+
+	// Initialize stale container scanner.
+	staleScanner := stale.New(db, dockerClient, eventBus)
+	if err := staleScanner.Start("1h"); err != nil {
+		slog.Warn("failed to start stale scanner", "error", err)
+	}
+
+	// Start daily event log pruning cron job.
+	cronScheduler := cron.New()
+	if _, err := cronScheduler.AddFunc("@daily", func() {
+		pruned, err := db.PruneEvents(30)
+		if err != nil {
+			slog.Error("event log prune failed", "error", err)
+			return
+		}
+		if pruned > 0 {
+			slog.Info("pruned old event log entries", "count", pruned)
+		}
+	}); err != nil {
+		slog.Warn("failed to schedule event prune cron", "error", err)
+	}
+	// Webhook delivery log: keep 14 days of audit trail.
+	if _, err := cronScheduler.AddFunc("@daily", func() {
+		cutoff := time.Now().UTC().AddDate(0, 0, -14).Format("2006-01-02 15:04:05")
+		pruned, err := db.PruneWebhookDeliveriesBefore(cutoff)
+		if err != nil {
+			slog.Error("webhook delivery prune failed", "error", err)
+			return
+		}
+		if pruned > 0 {
+			slog.Info("pruned old webhook deliveries", "count", pruned)
+		}
+	}); err != nil {
+		slog.Warn("failed to schedule webhook delivery prune cron", "error", err)
+	}
+	cronScheduler.Start()
+
+	// Subscribe to error events and forward notifications.
+	notifySub := eventBus.Subscribe(func(evt events.Event) bool {
+		if evt.Type != events.EventLog {
+			return false
+		}
+		p, ok := evt.Payload.(events.EventLogPayload)
+		if !ok {
+			return false
+		}
+		return p.Severity == "error"
+	})
+	go func() {
+		for evt := range notifySub {
+			p, ok := evt.Payload.(events.EventLogPayload)
+			if !ok {
+				continue
+			}
+			currentSettings, err := db.GetSettings()
+			if err != nil || currentSettings.NotificationURL == "" {
+				continue
+			}
+			notifier.SendSigned(currentSettings.NotificationURL, currentSettings.NotificationSecret, notify.TierSettings, notify.Event{
+				Type:    p.Source + "_error",
+				Project: p.Source,
+				Error:   p.Message,
+			})
+		}
+	}()
+
+	// Initialize DNS provider from settings (nil for wildcard mode).
+	dnsProvider := initDNSProvider(settings, encKey)
+	if dnsProvider != nil {
+		dep.SetDNSProvider(dnsProvider)
+		slog.Info("DNS provider initialized", "provider", settings.DNSProvider)
+	}
+
+	// Initialize backup engine.
+	backupEngine, err := backup.New(db, dbPath, dataDir)
 	if err != nil {
-		slog.Error("ensure webhook secret", "error", err)
+		slog.Error("create backup engine", "error", err)
 		os.Exit(1)
 	}
-	slog.Info("webhook secret configured (use /api/settings/webhook-url to retrieve)")
+	dep.SetPreDeployBackuper(backupEngine)

-	// Initialize registry poller.
-	poller := registry.NewPoller(db, dep, encKey)
-	pollingInterval := envOrDefault("POLLING_INTERVAL", settings.PollingInterval)
-	if pollingInterval != "" {
-		if err := poller.Start(pollingInterval); err != nil {
-			slog.Warn("failed to start poller", "error", err)
+	// Initialize volume-snapshot engine (per-workload data-volume archives).
+	snapshotEngine, err := volsnap.New(db, dataDir)
+	if err != nil {
+		slog.Error("create snapshot engine", "error", err)
+		os.Exit(1)
+	}
+	// Reclaim snapshot files orphaned by workload deletes (rows CASCADE, files don't).
+	if cleaned, err := snapshotEngine.CleanOrphans(); err != nil {
+		slog.Warn("snapshots: clean orphans on startup", "error", err)
+	} else if cleaned > 0 {
+		slog.Info("snapshots: cleaned orphan files on startup", "count", cleaned)
+	}
+
+	// Clean orphaned backup files and prune on startup.
+	if cleaned, err := backupEngine.CleanOrphans(); err != nil {
+		slog.Warn("backup: clean orphans on startup", "error", err)
+	} else if cleaned > 0 {
+		slog.Info("backup: cleaned orphaned files on startup", "count", cleaned)
+	}
+	if settings.BackupRetentionCount > 0 {
+		if pruned, err := backupEngine.Prune(settings.BackupRetentionCount); err != nil {
+			slog.Warn("backup: prune on startup", "error", err)
+		} else if pruned > 0 {
+			slog.Info("backup: pruned old backups on startup", "count", pruned)
 		}
 	}

+	// Schedule autobackup if enabled. Track entry ID for rescheduling.
+	var backupCronID cron.EntryID
+	scheduleAutobackup := func(enabled bool, intervalHours int) {
+		// Remove existing schedule if any.
+		if backupCronID != 0 {
+			cronScheduler.Remove(backupCronID)
+			backupCronID = 0
+			slog.Info("autobackup: removed previous schedule")
+		}
+		if !enabled || intervalHours <= 0 {
+			return
+		}
+		interval := fmt.Sprintf("@every %dh", intervalHours)
+		id, err := cronScheduler.AddFunc(interval, func() {
+			b, err := backupEngine.CreateBackup("auto")
+			if err != nil {
+				slog.Error("autobackup failed", "error", err)
+				return
+			}
+			slog.Info("autobackup completed", "id", b.ID, "filename", b.Filename)
+
+			currentSettings, err := db.GetSettings()
+			if err == nil && currentSettings.BackupRetentionCount > 0 {
+				backupEngine.Prune(currentSettings.BackupRetentionCount)
+			}
+		})
+		if err != nil {
+			slog.Warn("failed to schedule autobackup", "error", err)
+		} else {
+			backupCronID = id
+			slog.Info("autobackup scheduled", "interval_hours", intervalHours)
+		}
+	}
+	scheduleAutobackup(settings.BackupEnabled, settings.BackupIntervalHours)
+
+	// Initialize resource stats collector.
+	statsCollector := stats.New(db, dockerClient)
+	statsCollector.Start()
+
+	// Log-scan manager: tails running containers and emits event_log
+	// entries when log lines match operator-configured regex rules.
+	logScanMgr := logscanner.NewManager(logscanner.Config{
+		Rules:        db,
+		Containers:   db,
+		Docker:       dockerClient,
+		Events:       db,
+		Bus:          eventBus,
+		PollInterval: 5 * time.Second,
+	})
+	if err := logScanMgr.Start(context.Background()); err != nil {
+		slog.Warn("logscanner: initial rule load failed", "error", err)
+	}
+	defer logScanMgr.Stop()
+
+	// Metric-alert manager: evaluates threshold rules against recent
+	// container stats samples and emits event_log entries on breach.
+	// The store satisfies RuleSource/SampleSource/EventSink; the event
+	// bus is the Publisher.
+	metricAlertMgr := metricalert.New(db, db, db, eventBus)
+	metricAlertMgr.Start()
+	defer metricAlertMgr.Stop()
+
 	// Build API server.
-	apiServer := api.NewServer(db, dockerClient, dep, webhookHandler, eventBus, encKey)
+	apiServer := api.NewServer(db, dockerClient, npmClient, proxyProvider, dep, notifier, webhookHandler, eventBus, encKey)
+	apiServer.SetStaleScanner(staleScanner)
+	apiServer.SetLogScanReloader(logScanMgr)
+	apiServer.SetBackupEngine(backupEngine)
+	apiServer.SetSnapshotEngine(snapshotEngine)
+	apiServer.SetDBPath(dbPath)
+	apiServer.SetBackupSettingsChangedCallback(scheduleAutobackup)
+	apiServer.SetDNSProvider(dnsProvider)
+	apiServer.SetDNSProviderChangedCallback(func(provider dns.Provider) {
+		dep.SetDNSProvider(provider)
+	})
+	apiServer.SetProxyProviderChangedCallback(func(provider proxy.Provider) {
+		dep.SetProxyProvider(provider)
+	})
 	router := apiServer.Router()

 	// Serve embedded static files for the SPA frontend.
-	// The embed.FS has "web/build" as a prefix, so we sub it to get the root.
-	webBuildFS, err := fs.Sub(dockerwatcher.WebBuildFS, "web/build")
+	webBuildFS, err := fs.Sub(tinyforge.WebBuildFS, "web/build")
 	if err != nil {
 		slog.Warn("embedded frontend not available", "error", err)
 	} else {
 		staticHandler := api.StaticHandler(webBuildFS)
-		// Handle all non-API routes with the static file server.
 		router.NotFound(staticHandler.ServeHTTP)
 	}

@@ -137,7 +446,6 @@ func main() {
 		Handler:     router,
 		ReadTimeout: 30 * time.Second,
 		// WriteTimeout is disabled (0) to support SSE long-lived connections.
-		// Individual non-SSE handlers should use context timeouts as needed.
 		WriteTimeout: 0,
 		IdleTimeout:  120 * time.Second,
 	}
@@ -146,8 +454,13 @@ func main() {
 	done := make(chan os.Signal, 1)
 	signal.Notify(done, os.Interrupt, syscall.SIGTERM)

+	// Allow restore to trigger shutdown.
+	apiServer.SetShutdownFunc(func() {
+		done <- syscall.SIGTERM
+	})
+
 	go func() {
-		slog.Info("Docker Watcher started", "addr", addr)
+		slog.Info("Tinyforge started", "addr", addr)
 		if err := httpServer.ListenAndServe(); err != nil && err != http.ErrServerClosed {
 			slog.Error("HTTP server error", "error", err)
 			os.Exit(1)
@@ -158,10 +471,16 @@ func main() {
 	slog.Info("shutting down...")

 	// Stop accepting new work.
-	poller.Stop()
+	cronScheduler.Stop()
+	eventBus.Unsubscribe(notifySub)
+	staleScanner.Stop()
+	statsCollector.Stop()
+	metricAlertMgr.Stop()

-	// Drain in-progress deploys.
+	// Drain in-progress deploys and notifications.
 	dep.Drain()
+	webhookHandler.Drain()
+	notifier.Drain()

 	// Shut down HTTP server.
 	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
@@ -176,7 +495,7 @@ func main() {
 		slog.Error("database close error", "error", err)
 	}

-	slog.Info("Docker Watcher stopped")
+	slog.Info("Tinyforge stopped")
 }

 // envOrDefault reads an environment variable or returns the fallback value.
@@ -188,7 +507,6 @@ func envOrDefault(key, fallback string) string {
 }

 // ensureDefaultAdmin creates a default admin user on first launch if no users exist.
-// The password comes from ADMIN_PASSWORD env var, defaulting to "admin".
 func ensureDefaultAdmin(db *store.Store) error {
 	count, err := db.UserCount()
 	if err != nil {
@@ -198,7 +516,11 @@ func ensureDefaultAdmin(db *store.Store) error {
 		return nil // Users already exist, skip.
 	}

-	password := envOrDefault("ADMIN_PASSWORD", "admin")
+	password := os.Getenv("ADMIN_PASSWORD")
+	if password == "" {
+		slog.Error("ADMIN_PASSWORD is required on first launch — set it to a secure password")
+		os.Exit(1)
+	}
 	hash, err := auth.HashPassword(password)
 	if err != nil {
 		return err
@@ -221,3 +543,30 @@ func ensureDefaultAdmin(db *store.Store) error {
 	slog.Info("default admin user created", "username", "admin")
 	return nil
 }
+
+// initDNSProvider builds a dns.Provider from settings; it returns nil in wildcard mode, when no provider is configured, or when token decryption or provider creation fails (each failure is logged).
+func initDNSProvider(settings store.Settings, encKey [32]byte) dns.Provider {
+	if settings.WildcardDNS || settings.DNSProvider == "" { // wildcard mode or no provider name: nothing to build
+		return nil
+	}
+
+	token := settings.CloudflareAPIToken
+	if token != "" { // an empty token is passed through as-is; only a stored value is decrypted with encKey
+		decrypted, err := crypto.Decrypt(encKey, token)
+		if err != nil {
+			slog.Error("dns: failed to decrypt API token", "error", err)
+			return nil
+		}
+		token = decrypted
+	}
+
+	provider, err := dns.NewProvider(settings.DNSProvider, dns.Config{
+		Token:  token,
+		ZoneID: settings.CloudflareZoneID,
+	})
+	if err != nil {
+		slog.Error("dns: failed to create provider", "error", err)
+		return nil
+	}
+	return provider
+}
@@ -1,8 +1,14 @@
 services:
-  docker-watcher:
+  tinyforge:
+    # Default: build from source so a fresh clone works out of the box.
    build: .
-    image: docker-watcher:latest
-    container_name: docker-watcher
+    # Image name doubles as the Gitea registry tag. To DEPLOY the pre-built
+    # image instead of building (e.g. Portainer pulling on a webhook), comment
+    # out `build:` above — compose will then pull this tag. `:latest` is pushed
+    # only for stable (non pre-release) releases, and the registry may require
+    # `docker login git.dolgolyov-family.by` first if the package is private.
+    image: git.dolgolyov-family.by/alexei.dolgolyov/tiny-forge:latest
+    container_name: tinyforge
    restart: unless-stopped
    ports:
      - "8080:8080"
@@ -10,16 +16,16 @@ services:
      # Mount Docker socket for container management.
      - /var/run/docker.sock:/var/run/docker.sock
      # Persistent data (SQLite database).
-      - docker-watcher-data:/app/data
+      - tinyforge-data:/app/data
      # Optional seed config (read on first launch only).
-      - ./docker-watcher.yaml:/app/docker-watcher.yaml:ro
+      - ./tinyforge.yaml:/app/tinyforge.yaml:ro
    environment:
      # Required: protects all credentials stored in the database.
      - ENCRYPTION_KEY=${ENCRYPTION_KEY:?Set ENCRYPTION_KEY in .env}
-      # Optional: default admin password on first launch (default: "admin").
-      - ADMIN_PASSWORD=${ADMIN_PASSWORD:-admin}
+      # Required on first launch: password for the default admin user.
+      - ADMIN_PASSWORD=${ADMIN_PASSWORD:?Set ADMIN_PASSWORD in .env}
      # Optional: override seed file location.
-      - SEED_FILE=/app/docker-watcher.yaml
+      - SEED_FILE=/app/tinyforge.yaml
      # Optional: override data directory.
      - DATA_DIR=/app/data
      # Optional: override listen address.
@@ -31,16 +37,21 @@ services:
    networks:
      - staging-net
    healthcheck:
-      test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:8080/api/auth/login"]
+      # /readyz is the public readiness probe (pings the DB, rate-limited).
+      # The previous target (/api/auth/login) is POST-only, so a GET/spider
+      # request returned 405 and the container was always reported unhealthy.
+      test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:8080/readyz"]
      interval: 30s
      timeout: 5s
      retries: 3
      start_period: 10s

 volumes:
-  docker-watcher-data:
+  tinyforge-data:
    driver: local

+# NOTE: The staging-net network must exist before starting.
+# Create it with: docker network create staging-net
 networks:
  staging-net:
    external: true
@@ -0,0 +1,42 @@
+# Tinyforge Codemaps — Index
+
+**Last Updated:** 2026-05-16 (added `container-extra-json` policy doc)
+
+This directory contains architectural maps of key Tinyforge subsystems. Each codemap focuses on one major area: core data types, contract surfaces, integration points, and recipes for extending the system.
+
+## Codemaps
+
+- **[Workload Plugin](./workload-plugin.md)** — Source × Trigger plugin contracts; registry lookups; webhook fan-out; how to add new kinds.
+- **[Discovery & Runtime API](./discovery-and-runtime.md)** — `/api/discovery/*` helpers (Git provider probe, repo/branch/tree pickers, image conflicts); `/api/workloads/{id}/runtime-state` + `/storage` + `/stop` + `/start`; SSRF-safe HTTP client in `internal/staticsite`.
+- **[`containers.extra_json` Evolution Policy](./container-extra-json.md)** — Ownership model, reader/writer rules, wholesale-overwrite vs preserve-unknown-keys patterns, concurrency invariants; checklist for adding a new field without breaking older deployers.
+
+## Cross-References
+
+- **Workload Refactor Handoff** — [`docs/WORKLOAD_REFACTOR_TODO.md`](../WORKLOAD_REFACTOR_TODO.md) — Full status of the trigger-split, legacy cutover, and remaining Priority items
+- **Webhook Documentation** — [`docs/webhooks.md`](../webhooks.md) — Outgoing webhook events, signature scheme, receiver code samples
+- **Observability + Event Triggers** — [`docs/LOGSCAN_AND_TRIGGERS_TODO.md`](../LOGSCAN_AND_TRIGGERS_TODO.md) — Log scanning rules, event triggers, related infrastructure
+
+## How to Use These Codemaps
+
+1. **Starting a new feature** in an existing area? Read the relevant codemap first to understand the contract surface and integration seams.
+2. **Adding a new plugin kind** (Source or Trigger)? See the recipes in [`workload-plugin.md`](./workload-plugin.md) — "How to Add a New Source Kind" / "How to Add a New Trigger Kind".
+3. **Debugging a plugin dispatch failure** (deploy, webhook, reconcile)? The "Data Flow" and "Integration Points" sections map out each path end-to-end.
+4. **Reviewing someone else's plugin PR**? Check the contracts (`Source.Deploy()`, `Trigger.Match()`, etc.) against the descriptions here.
+
+## Coverage
+
+These codemaps are automatically generated from the codebase structure. If a key file or area is missing, it usually means one of the following:
+
+- The area is under active refactor (see [`WORKLOAD_REFACTOR_TODO.md`](../WORKLOAD_REFACTOR_TODO.md) for priority order)
+- The area is legacy code scheduled for deprecation
+- The area is simple enough to document inline (JSDoc + comments in the source)
+
+## Freshness
+
+Codemaps are updated whenever:
+- A new plugin kind is added
+- The contract surface changes (new Source/Trigger method, new Deps field, etc.)
+- Integration points shift (new API endpoint, new reconciler behavior, etc.)
+- A major refactor lands (see workload-refactor status for examples)
+
+When you land a change that affects these areas, please update the relevant codemap and the `Last Updated` timestamp.
@@ -0,0 +1,105 @@
+# `containers.extra_json` — Evolution Policy
+
+**Last Updated:** 2026-05-16
+
+`extra_json` is a TEXT column on the `containers` table that source plugins use to persist source-specific runtime state that hasn't been promoted to a first-class column. It is the single forward-compatibility seam between the canonical container row and per-source needs that arise after a schema is in production.
+
+This doc captures the rules every reader and writer must follow so new sources can extend the blob without breaking older ones.
+
+## Schema position
+
+- Column: `containers.extra_json TEXT NOT NULL DEFAULT '{}'` ([`internal/store/store.go:233`](../../internal/store/store.go#L233)).
+- All four write paths (`CreateContainer`, `UpsertContainer`, `ReconcileContainer`, `UpdateContainer`) normalize `""` → `'{}'` before the SQL exec — readers can assume a non-empty JSON object string and never need to handle SQL `NULL` or the empty-string edge.
+- Defined on the `Container` model: [`internal/store/models.go:342-347`](../../internal/store/models.go#L342-L347).
+
+## Ownership model
+
+**One container row → one owning source.** Sources never write to a row that belongs to another source. In practice:
+
+| Source kind | Row key                                | Number of rows per workload | Writes `extra_json` today?  |
+| ----------- | -------------------------------------- | --------------------------- | --------------------------- |
+| `static`    | deterministic `<workloadID>:site`      | exactly 1                   | yes (preserve-unknown-keys) |
+| `image`     | UUID per deployed container            | 1 + N (blue-green rolls)    | yes (wholesale-overwrite)   |
+| `compose`   | deterministic `<workloadID>:<service>` | N (one per compose service) | no — left at `'{}'` default |
+
+Two sources cannot contend on the same row, so the policy below is concerned with **forward compatibility across versions of the same source**, not cross-source contention. When compose (or any future source) starts writing `extra_json`, the same rules apply.
+
+## Reader rules — ALL readers
+
+1. **Tolerate unknown keys.** Decode into a typed struct using `encoding/json`; Go's default unmarshaller silently drops unknown keys, which is the desired behaviour. Never use `json.Decoder.DisallowUnknownFields()` on `extra_json`.
+2. **Treat decode failure as non-fatal when the row's first-class columns are still useful.** A corrupted `extra_json` is debug-logged and the reader falls back to zero state — see `workload_runtime.go:118-133` for the canonical pattern. The container's `ContainerID`, `State`, `ProxyRouteID`, etc. live in their own columns and are still trustworthy.
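+3. **Tolerate `''` and `'{}'`.** Both are equivalent to "no extras yet". The store normalizes `''` to `'{}'` on every write path, but as a defensive measure readers must still short-circuit on either value before `json.Unmarshal`, so an empty string never surfaces as `unexpected end of JSON input`.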
+
+## Writer rules — by mutation style
+
+Two distinct write patterns live in the codebase today. Pick the one that matches your source's needs.
+
+### Wholesale-overwrite (image source pattern)
+
+When the writer owns 100% of the blob's shape and discards old contents on every write:
+
+```go
+// internal/workload/plugin/source/image/image.go:341-343
+extra := containerExtra{ProxyRoutes: faceRoutes}
+if b, err := json.Marshal(extra); err == nil {
+    created.ExtraJSON = string(b)
+}
+```
+
+- Cheap and simple.
+- **Loses unknown keys written by future versions of the same source.** Only use when you are certain no other writer (including a future version of this code) needs to round-trip an unknown key.
+- The `containerExtra` struct must be **additive-only**: never rename or remove a field once shipped, and never change its JSON type. Mark new fields with `omitempty` so that, after a downgrade to an older codebase, readers don't see surprise nulls.
+
+### Preserve-unknown-keys (static source pattern)
+
+When future versions of the source (or sibling writers) may add fields and the current writer must round-trip them:
+
+```go
+// internal/workload/plugin/source/static/state.go saveState
+//   1. Decode existing blob into map[string]json.RawMessage.
+//   2. Strip every key the current typed-state struct owns
+//      (runtimeStateKeys) so a cleared field actually drops.
+//   3. Apply caller's mutate() to the typed state.
+//   4. Re-marshal typed state, splice its keys back into the
+//      generic map (overwriting any historical sibling).
+//   5. Marshal the merged map back into extra_json.
+```
+
+- Slightly more expensive (two round-trips through `json`).
+- Preserves keys the current writer doesn't know about — required for safe rolling deploys where a newer instance writes a new key, an older instance then reads, mutates, and writes back.
+- Must declare the typed key set explicitly (`runtimeStateKeys`) so step 2 can strip them. This invariant is fenced by `TestRuntimeState_JSONTagsRoundTrip` in [`state_integration_test.go`](../../internal/workload/plugin/source/static/state_integration_test.go).
+
+**Default to preserve-unknown-keys for any new source.** Wholesale-overwrite is acceptable for the image source today because the row's lifetime is short (replaced on every blue-green roll) and only one writer touches it. Sources whose container rows are long-lived (static, future compose-with-stateful-services) should preserve unknown keys.
+
+## Concurrency
+
+`UpsertContainer` is atomic at the SQL layer — SQLite serializes statements through one connection ([`internal/store/store.go:55`](../../internal/store/store.go#L55) `SetMaxOpenConns(1)`) with WAL mode enabled ([`store.go:60`](../../internal/store/store.go#L60)). That guarantees no torn write on a single row, and concurrent readers see a consistent snapshot — they read either the pre- or post-write state, never a half-applied one.
+
+What that does **not** guarantee is atomic read-modify-write across two Go goroutines. The static source serializes its RMW through a per-workload `sync.Mutex` keyed by workload ID (`internal/workload/plugin/source/static/state.go` `lockFor` + `saveState`). Any source that does its own read-modify-write on `extra_json` must do the same — verified in `TestSaveState_ConcurrentWritesDoNotLoseUpdates` (which loses 15+ markers per 20-writer run when the mutex is disabled, as confirmed in commit `ef62a41`).
+
+If a future source is purely wholesale-overwrite from a single writer, no lock is needed.
+
+## What `extra_json` is NOT for
+
+- **Workload-level config.** Workload config goes in `workloads.source_config` and is the operator's surface.
+- **Cross-source state.** If two sources need the same data, promote it to a column.
+- **Anything queryable.** SQLite can JSON-path `extra_json` but no index supports it; readers always pull the column wholesale and parse in Go.
+- **Secrets.** Anything sensitive lives in `workload_env` (per-entry encrypt flag) or another encrypted table.
+
+## Adding a new field — checklist
+
+1. Add the field to your source's typed struct with `omitempty` and a stable `json:"snake_case"` tag.
+2. If you use the **preserve-unknown-keys** pattern, add the JSON key to your `*Keys` slice (the equivalent of `runtimeStateKeys`).
+3. Confirm older readers (older builds of the binary that are still deployed) still parse the blob — `encoding/json` should drop the unknown key silently. Add a regression test if there's any doubt.
+4. Document the new field in this codemap if it's load-bearing for cross-source code (e.g., the proxy_routes map drives `ListProxyRoutes`).
+
+## Pointers
+
+- Container model + `ExtraJSON` comment: [`internal/store/models.go:342-347`](../../internal/store/models.go#L342-L347)
+- Schema declaration: [`internal/store/store.go:233`](../../internal/store/store.go#L233)
+- Store-level normalization (`'{}'` default) across all four write paths: [`internal/store/containers.go:42-43`](../../internal/store/containers.go#L42-L43) (CreateContainer), `:77-78` (UpsertContainer), `:129-130` (ReconcileContainer), `:321-322` (UpdateContainer).
+- Wholesale-overwrite writer + struct: [`image.go:341-343`](../../internal/workload/plugin/source/image/image.go#L341-L343) writes; [`image.go:481-487`](../../internal/workload/plugin/source/image/image.go#L481-L487) defines `containerExtra`; [`image.go:449-456`](../../internal/workload/plugin/source/image/image.go#L449-L456) reads it back in Teardown.
+- Preserve-unknown-keys example + concurrency lock: [`internal/workload/plugin/source/static/state.go`](../../internal/workload/plugin/source/static/state.go).
+- Canonical "decode-and-tolerate" consumer (the only cross-source reader in tree today): [`internal/api/workload_runtime.go:118-133`](../../internal/api/workload_runtime.go#L118-L133) decodes the static-only typed fields and falls back to first-class columns when the blob is empty, missing keys, or malformed.
+
+Note: no cross-source consumer reads `extra_json` in `internal/store/`. The proxy/route data exposed by `ListProxyRoutes` ([`containers.go:196`](../../internal/store/containers.go#L196)) comes from first-class columns (`proxy_route_id`, `subdomain`, `port`); the `proxy_routes` map inside `extra_json` is read only by the image source's own Teardown for cleanup.
@@ -0,0 +1,88 @@
+# Discovery & Runtime API — Codemap
+
+**Last Updated:** 2026-05-16
+
+Surfaces added during the static-site discovery restoration + workload runtime panel work. All endpoints sit inside the existing `/api` group (auth-middleware enforced); admin-gated routes are noted per-endpoint.
+
+## Files
+
+### Backend
+
+- [`internal/api/discovery.go`](../../internal/api/discovery.go) — six admin-gated handlers wrapping `staticsite.GitProvider` + an image-source conflict scanner.
+- [`internal/api/workload_runtime.go`](../../internal/api/workload_runtime.go) — runtime-state read, storage-usage probe (with 30s in-memory cache), and stop/start mutation handlers.
+- [`internal/staticsite/safehttp.go`](../../internal/staticsite/safehttp.go) — `NewSafeHTTPClient` + `ValidateBaseURL` + `blockReason` (loopback / link-local / multicast / unspecified blocked at dial time; RFC1918 / ULA explicitly allowed).
+- [`internal/api/discovery_test.go`](../../internal/api/discovery_test.go) — 26 table cases (image-tag stripping, source-config decoding, conflict scenarios, validator boundaries, scheme rejection).
+- [`internal/api/workload_runtime_test.go`](../../internal/api/workload_runtime_test.go) — 14 cases (404, source-kind branching, never-deployed path, malformed extra-json, nil-docker-client 503, probe cache short-circuit).
+- [`internal/staticsite/safehttp_test.go`](../../internal/staticsite/safehttp_test.go) — 16 cases (URL validation matrix, block-reason policy matrix, live dial against loopback + AWS metadata literals).
+
+### Frontend
+
+- [`web/src/lib/api.ts`](../../web/src/lib/api.ts) — typed wrappers for every endpoint, signal-aware (`AbortSignal` threaded through `post()`); `ApiError` exported so callers can narrow on `e.status`.
+- [`web/src/routes/apps/new/+page.svelte`](../../web/src/routes/apps/new/+page.svelte) — static-form discovery controls (auto-detect provider, test connection, repo / branch / folder pickers, Deno auto-detect); image-form conflict panel + Inspect button.
+- [`web/src/routes/apps/[id]/+page.svelte`](../../web/src/routes/apps/[id]/+page.svelte) — runtime-state panel, storage panel, Stop / Start / Open-site toolbar; live-state badge in hero; ContainerStats panel; webhook bindings card; responsive toolbar overflow.
+
+## Endpoint reference
+
+### Discovery (admin-only)
+
+| Method | Path                                       | Returns                          |
+| ------ | ------------------------------------------ | -------------------------------- |
+| POST   | `/api/discovery/git/detect-provider`       | `{provider: DetectedGitProvider}`|
+| POST   | `/api/discovery/git/test-connection`       | `{status: "ok"}` or 502          |
+| POST   | `/api/discovery/git/repos`                 | `RepoInfo[]`                     |
+| POST   | `/api/discovery/git/branches`              | `string[]`                       |
+| POST   | `/api/discovery/git/tree`                  | `FolderEntry[]`                  |
+| GET    | `/api/discovery/image/conflicts?image=...` | `ImageConflict[]`                |
+
+All Git endpoints accept the shared `gitProviderRequest` shape: `{provider, base_url, access_token, repo_owner, repo_name, branch, query}`. Token is plaintext over HTTPS and never persisted server-side. `provider` may be empty to trigger `staticsite.DetectProviderWithProbe`.
+
+### Workload runtime
+
+| Method | Path                                  | Auth         | Returns                         |
+| ------ | ------------------------------------- | ------------ | ------------------------------- |
+| GET    | `/api/workloads/{id}/runtime-state`   | Any auth     | `WorkloadRuntimeState`          |
+| GET    | `/api/workloads/{id}/storage`         | Any auth     | `WorkloadStorageUsage`          |
+| POST   | `/api/workloads/{id}/stop`            | Admin        | `{touched, failed}` / 409 / 502 |
+| POST   | `/api/workloads/{id}/start`           | Admin        | `{touched, failed}` / 409 / 502 |
+
+`runtime-state` decodes `containers.extra_json` for `<workloadID>:site` (the deterministic container row the static plugin maintains). Returns `{source_kind, has_state: false}` for non-static workloads or never-deployed static workloads.
+
+`storage` returns `{enabled: false}` for non-static or storage-disabled workloads. When enabled, execs `du -sb /app/data` (15s budget) via `docker.InspectSiteStorageUsage`. Results memoized for 30s in the `storageProbeCache` package-level map.
+
+`stop` / `start` iterate `store.ListContainersByWorkload` and call `docker.StopContainer(ctx, id, 10)` / `StartContainer`. Returns 409 when no container row exists ("nothing to act on"), 502 when every container failed, 200 with `{touched, failed}` counts otherwise.
+
+## Security posture
+
+- **SSRF defense** — every outbound HTTP call from `staticsite/{gitea,github,gitlab}_provider.go` and the discovery probe uses `NewSafeHTTPClient`. The `DialContext` re-resolves the host and refuses loopback / link-local / multicast / unspecified addresses. RFC1918 + ULA are intentionally allowed (self-hosted Gitea on LAN is the dominant deployment pattern).
+- **Identifier validation** — `validateGitIdent` (regex `^[A-Za-z0-9][A-Za-z0-9._-]*$`) and `validateGitBranch` (allows `/`, rejects `..`) run at the API boundary so provider URL interpolation cannot be hijacked.
+- **Error scrubbing** — upstream Git provider errors are never echoed verbatim. `upstreamError(w, op, err)` logs the detail server-side and returns a generic 502 to the client (mitigates token-reflection-in-error-page).
+- **Token handling** — tokens are plaintext in request bodies (HTTPS assumed) and never persisted. Discovery endpoints accept them per-call; nothing is stored.
+- **Auth model** — workload read endpoints (`runtime-state`, `storage`) are open to any authenticated user; the workload mutation endpoints (`stop`, `start`) and every `/discovery/*` endpoint (POST and GET alike) are admin-only.
+
+## Frontend integration patterns
+
+- All long-running requests accept an optional `AbortSignal` and are cancelled on `onDestroy` via per-call AbortController plus a sequence token (`reqSeq`) so a slow earlier response cannot overwrite a faster later one. Mirror this pattern when adding new probes — see `loadRuntimeState` / `loadStorage` / `inspectImageRef` for the canonical shape.
+- The wizard's English error fallbacks live under `apps.new.errors.*` in en + ru. Parity is maintained at 1413 keys; verify with the inline `node -e ...` script in the repo root (or `npm run check`).
+- `ApiError` narrowing (`e instanceof api.ApiError && e.status === N`) replaces the older regex-over-`Error.message` pattern.
+
+## Recipes
+
+### Add a new probe endpoint
+1. Handler in `internal/api/workload_runtime.go` following the established 404-vs-409-vs-502 pattern. Log detail server-side, return generic messages.
+2. Route registration in [`internal/api/router.go`](../../internal/api/router.go) under the `/workloads/{id}` group.
+3. Typed wrapper in `web/src/lib/api.ts` with `signal?: AbortSignal` parameter.
+4. UI consumer mirrors the `loadRuntimeState` pattern: per-call seq token + AbortController stored in module scope + cancelled in `onDestroy`.
+5. Tests: table-driven with `newAPITestEnv` from [`internal/api/workloads_test.go`](../../internal/api/workloads_test.go).
+
+### Extend Git discovery to a new provider
+1. Add a new `staticsite.GitProvider` implementation (see `gitea_content.go` for the smallest reference). Use `NewSafeHTTPClient(60 * time.Second)` for outbound calls — do not introduce a raw `&http.Client{}`.
+2. Register in `staticsite.NewGitProvider` switch.
+3. Apply `url.PathEscape` (from `net/url`) to every interpolated `{owner}/{repo}/{branch}` segment in URL construction.
+4. Update `DetectProviderWithProbe` if the new provider has a known API signature worth probing for unknown hosts.
+5. Update `DetectedGitProvider` union in `web/src/lib/api.ts`.
+
+## Cross-references
+
+- **Memory** — Project memory under `[[project_discovery_restoration]]` tracks what shipped vs deferred.
+- **Workload Plugin** — [`workload-plugin.md`](./workload-plugin.md) — Source × Trigger contracts that the runtime endpoints read from.
+- **Webhook Documentation** — [`docs/webhooks.md`](../webhooks.md) — Outgoing webhook events the static plugin fires (`site_sync_success`, `site_sync_failure`).
@@ -0,0 +1,238 @@
+# Workload Plugin Architecture Codemap
+
+**Last Updated:** 2026-05-16  
+**Status:** Core contract for Source × Trigger plugin system (post-trigger-split refactor)
+
+## Abstract
+
+`internal/workload/plugin/` defines the **Source × Trigger plugin contracts** that decouple the deployer pipeline from specific deployable shapes (image, compose, static) and redeploy signals (registry push, git push, manual, cron). A Workload carries opaque config blobs; registry lookups route each to the matching plugin. New plugin kinds are added only via registration from init() — no changes needed to the API, deployer, or webhook handler.
+
+## Key Files
+
+| Path | Role |
+|------|------|
+| `internal/workload/plugin/plugin.go` | Package doc; `Deps` bundle (Store, Docker, Proxy, DNS, Health, Notifier, Events, EncKey) |
+| `internal/workload/plugin/types.go` | `Workload`, `DeploymentIntent`, `PublicFace`, `InboundEvent`, `ImagePushEvent`, `GitEvent`, `ManualEvent`; helpers `SourceConfigOf[T]`, `TriggerConfigOf[T]` |
+| `internal/workload/plugin/source.go` | `Source` interface (Kind / Validate / Deploy / Teardown / Reconcile); registries `RegisterSource` / `GetSource`; `Schemaer` optional interface; helpers `SourceKinds` / `SchemaSampleFor` |
+| `internal/workload/plugin/trigger.go` | `Trigger` interface (Kind / Validate / Match); registries `RegisterTrigger` / `GetTrigger`; helper `TriggerKinds` |
+| `internal/workload/plugin/binding.go` | `MergeJSONConfig` (top-level JSON merge for trigger + binding override); `WithEffectiveTrigger` (used by webhook fan-out to compose merged config) |
+| `internal/workload/plugin/registry.go` | `AllSources` / `AllTriggers` snapshot helpers (used by `/api/workloads/source-kinds` and `/api/workloads/trigger-kinds`) |
+
+## Architecture Overview
+
+### Contract Surface: Source vs Trigger
+
+```
+Workload (unifying user entity)
+├── SourceKind + SourceConfig (JSON blob)
+│   └── Source.Deploy()      ← routes to image, compose, or static
+│       Source.Teardown()
+│       Source.Reconcile()
+│
+└── TriggerKind + TriggerConfig (JSON blob)
+    └── Trigger.Match(InboundEvent)  ← routes to registry, git, or manual
+        returns DeploymentIntent
+```
+
+- **Source** (stateless, 5 methods): owns full container lifecycle (deploy, tear down, reconcile state)
+- **Trigger** (stateless, 3 methods): given an inbound event + workload config, decide whether to fire a deploy intent
+
+### Dispatch Seam: Deployer → Plugins
+
+```
+deployer/dispatch.go
+├── DispatchPlugin(w, intent) → plugin.GetSource(w.SourceKind) → Source.Deploy()
+├── DispatchTeardown(w)       → plugin.GetSource(w.SourceKind) → Source.Teardown()
+└── DispatchReconcile(w)      → plugin.GetSource(w.SourceKind) → Source.Reconcile()
+
+PluginDeps() assembles:
+├── Store (workload / container / webhook tables)
+├── Docker (container orchestration)
+├── Proxy (route manager)
+├── DNS (DNS provider, nil for wildcard)
+├── Health (status checker)
+├── Notifier (webhook client)
+├── Events (event bus for deploy lifecycle)
+└── EncKey (for crypto.Encrypt/Decrypt of config secrets)
+```
+
+### Webhook Fan-Out Path: Trigger → Bindings
+
+```
+webhook/trigger_handler.go: POST /api/webhook/triggers/{secret}
+├── Resolve secret → Trigger record
+├── Parse body → InboundEvent (auto-detects image-push, git-push, git-tag, manual, cron-tick)
+├── plugin.GetTrigger(trg.Kind) → Trigger plugin
+└── For each enabled workload_trigger_binding (bounded concurrency = 4):
+    ├── plugin.WithEffectiveTrigger()
+    │   └── MergeJSONConfig(trigger.config, binding.binding_config)
+    │       returns Workload copy with merged TriggerConfig
+    ├── Trigger.Match(evt, merged_workload)
+    │   returns DeploymentIntent or nil
+    └── If intent returned: DispatchPlugin(w, intent) → Source.Deploy()
+```
+
+**Key design point**: MergeJSONConfig always returns freshly allocated slices (defensive copy) so binding fan-out never risks aliasing across goroutines.
+
+## Concrete Implementations
+
+### Sources
+
+| Kind | Package | Files | Purpose |
+|------|---------|-------|---------|
+| `image` | `internal/workload/plugin/source/image/` | `image.go` + deps | Docker image deploys (blue-green multi-face proxy, registry auth) |
+| `compose` | `internal/workload/plugin/source/compose/` | `compose.go` + deps | docker-compose stacks via `internal/stack/` helpers |
+| `static` | `internal/workload/plugin/source/static/` | `deploy.go`, `teardown.go`, `reconcile.go`, `state.go`, `env.go`, `build.go`, `naming.go`, `static.go` | Git-folder-backed static site (nginx or Deno) via `internal/staticsite/` helpers |
+
+**Static source inline port note:** The legacy `/api/sites/*` HTTP surface still exists for backwards compat; the static plugin operates directly on containers + workload_env tables without synthetic static_sites rows.
+
+### Triggers
+
+| Kind | Package | Files | Purpose |
+|------|---------|-------|---------|
+| `registry` | `internal/workload/plugin/trigger/registry/` | `registry.go` | Image push events (registry webhook or watcher) |
+| `git` | `internal/workload/plugin/trigger/git/` | `git.go` | Git push / tag-create (Gitea / GitHub / GitLab) |
+| `manual` | `internal/workload/plugin/trigger/manual/` | `manual.go` | Manual-only (no auto-fire) |
+
+**Trigger lifecycle note:** Since the trigger-split refactor, triggers are first-class records bound to workloads via the `workload_trigger_bindings` join table. Each binding carries an optional `binding_config` (merged with the trigger's `config` before Match is called).
+
+## Data Flow: Example Webhook Dispatch
+
+```
+1. Inbound POST /api/webhook/triggers/{secret}
+   ↓
+2. Lookup Trigger by secret
+   ↓
+3. Parse body → InboundEvent (detects kind: image-push, git-push, manual, ...)
+   ↓
+4. Load Trigger plugin (e.g. plugin.GetTrigger("git"))
+   ↓
+5. Load all bindings for this trigger
+   ↓
+6. For each binding (concurrent, max 4):
+   a. Merge trigger.config + binding.binding_config
+   b. Build Workload copy with merged TriggerConfig
+   c. Call Trigger.Match(evt, merged_workload)
+   d. If Match returns DeploymentIntent:
+      - Call DispatchPlugin(w, intent)
+      - Source.Deploy executes (e.g. pull image, build container)
+   e. If Match returns (nil, nil): skip silently
+   f. If Match returns error: log at warn level, continue to next binding
+   ↓
+7. Aggregate results (deployed count, skip reason counts)
+   ↓
+8. Return 200 with result summary
+```
+
+## Integration Points
+
+### API Layer (`internal/api/workloads.go`)
+
+- `/api/workloads/{id}` — GET returns Workload with SourceKind + SourceConfig
+- `/api/workloads/{id}` — PUT/POST routes to Validate (source plugin checks config schema)
+- `/api/workloads/source-kinds` — GET calls `plugin.SourceKinds()` + `plugin.SchemaSampleFor()` per kind
+- `/api/workloads/trigger-kinds` — GET calls `plugin.TriggerKinds()` + `plugin.SchemaSampleFor()` per kind
+- `/api/workloads/{id}/deploy` — POST manual deploy: builds ManualEvent, calls webhook handler
+
+### Webhook Ingress (`internal/webhook/`)
+
+- `trigger_handler.go` — POST `/api/webhook/triggers/{secret}` implements the fan-out dispatcher (see Data Flow above)
+- `parse.go` — `buildInboundEvent()` normalizes vendor-specific payloads (Gitea / GitHub / GitLab / Docker Hub / generic registry) into `InboundEvent`
+
+### Reconciler (`internal/reconciler/reconciler.go`)
+
+- `reconcilePluginWorkloads()` — iterates every workload with `SourceKind != ""` and calls `DispatchReconcile(w)` on a fixed schedule (e.g. every 5 minutes)
+- Keeps the containers index in sync with deployed reality (garbage-collects orphaned containers, restarts crashed services)
+
+### Deployer (`internal/deployer/dispatch.go`)
+
+- `DispatchPlugin()` / `DispatchTeardown()` / `DispatchReconcile()` — route calls to the matching Source plugin
+- `PluginDeps()` — assembles the stateless dependency bundle (called per-Deploy, per-Trigger.Match)
+
+## External Dependencies
+
+| Package | Version | Used For |
+|---------|---------|----------|
+| `encoding/json` | stdlib | Config marshaling / unmarshaling |
+| `sync` (RWMutex) | stdlib | Registry thread-safety (SourceKinds, TriggerKinds, Schemaer lookup) |
+| `context` | stdlib | Timeout control in Deploy / Teardown / Reconcile / Match |
+| `internal/store` | local | Workload / container / binding / trigger table access |
+| `internal/docker` | local | Container orchestration (Sources use this) |
+| `internal/proxy` | local | Route registration (Sources use this) |
+| `internal/dns` | local | DNS record creation (Sources use this, nil for wildcard DNS) |
+| `internal/health` | local | Status checks (available to plugin Deps) |
+| `internal/notify` | local | Webhook client (available to plugin Deps) |
+| `internal/events` | local | Event bus (Sources publish lifecycle events) |
+
+## How to Add a New Source Kind
+
+1. Create `internal/workload/plugin/source/{kind}/{kind}.go` with a struct implementing `Source`:
+
+```go
+type source struct{}
+
+func init() { plugin.RegisterSource(&source{}) }
+
+func (s *source) Kind() string { return "k8s" }
+
+func (s *source) Validate(cfg json.RawMessage) error { /* ... */ }
+
+func (s *source) Deploy(ctx context.Context, deps plugin.Deps, w plugin.Workload, intent plugin.DeploymentIntent) error { /* ... */ }
+
+func (s *source) Teardown(ctx context.Context, deps plugin.Deps, w plugin.Workload) error { /* ... */ }
+
+func (s *source) Reconcile(ctx context.Context, deps plugin.Deps, w plugin.Workload) error { /* ... */ }
+
+// Optional: implement Schemaer for JSON schema on /api/workloads/source-kinds
+func (s *source) SchemaSample() any { return Config{ /* sample fields */ } }
+```
+
+2. Blank-import the sub-package from `cmd/server/main.go` so `init()` fires at boot:
+
+```go
+import (
+    _ "github.com/alexei/tinyforge/internal/workload/plugin/source/k8s"
+)
+```
+
+3. Optionally ship a hand-rolled form in `web/src/routes/apps/{new,[id]}/+page.svelte` (per workload-first UX rule). The JSON editor remains a fallback for power users.
+
+## How to Add a New Trigger Kind
+
+1. Create `internal/workload/plugin/trigger/{kind}/{kind}.go` with a struct implementing `Trigger`:
+
+```go
+type trigger struct{}
+
+func init() { plugin.RegisterTrigger(&trigger{}) }
+
+func (t *trigger) Kind() string { return "cron" }
+
+func (t *trigger) Validate(cfg json.RawMessage) error { /* ... */ }
+
+func (t *trigger) Match(ctx context.Context, deps plugin.Deps, w plugin.Workload, evt plugin.InboundEvent) (*plugin.DeploymentIntent, error) {
+    // Decide whether this trigger fires for the given event + workload config
+    // Return (nil, nil) to skip silently, (*intent, nil) to deploy, (nil, err) for config errors
+}
+
+// Optional: implement Schemaer for JSON schema on /api/workloads/trigger-kinds
+func (t *trigger) SchemaSample() any { return Config{ /* sample fields */ } }
+```
+
+2. Blank-import from `cmd/server/main.go`.
+
+3. Ship a form variant in `web/src/lib/components/TriggerKindForm.svelte` so the `/triggers/new` page and workload bindings panel can author kind-specific config.
+
+4. **Important**: Triggers that need to handle inbound webhooks should register a route in `internal/webhook/` for their vendor-specific payload format. The webhook ingress will auto-detect the kind and call `buildInboundEvent()` to normalize it into a standard `InboundEvent` before calling Match. Manual triggers do not need a webhook handler (they fire from the UI only).
+
+## Related Areas
+
+- **Workload Refactor** — Full context on the trigger-split, hard legacy cutover, and UI migration: [`docs/WORKLOAD_REFACTOR_TODO.md`](../WORKLOAD_REFACTOR_TODO.md)
+- **Webhook Signing** — HMAC-SHA256 verification, per-tier secret resolution, receiver code samples: [`docs/webhooks.md`](../webhooks.md)
+- **Log Scanning + Event Triggers** — Observability features that build on the trigger infrastructure: [`docs/LOGSCAN_AND_TRIGGERS_TODO.md`](../LOGSCAN_AND_TRIGGERS_TODO.md)
+
+## Registry Details
+
+Both Source and Trigger registries use sync.RWMutex for thread-safe lookup. Duplicate registration panics at init() (indicating a bug, not a runtime failure). Lookup errors surface missing kinds clearly so operators can diagnose when a workload references a kind whose package was not blank-imported.
+
+**Lazy note**: Registries are not lazy — all kinds must be registered at boot before the HTTP server starts. This ensures consistent behavior across handlers and reconciler tasks.
@@ -0,0 +1,385 @@
+# Log Scanner + Event Triggers — Design Handoff
+
+Two related features. They can ship independently, but were designed together
+because they share the event_log seam.
+
+- **A. Log scanner** — tail container logs, match against rules, emit event_log
+  entries. Producer of events.
+- **B. Event triggers** — turn event_log entries into webhook / notification
+  dispatches. Consumer of events. Generalizes the existing
+  `RegisterPersistentLogger` pattern.
+
+Either half is useful alone:
+- A without B = errors get surfaced in the events UI, no external delivery.
+- B without A = manual + reconciler + deploy events can drive notifications.
+
+Recommended ship order: B first (smaller, self-contained generalization), then
+A (more moving parts, depends on container-lifecycle hooks).
+
+---
+
+## A. Log scanner — BACKEND LANDED
+
+Status:
+
+- **Schema + store CRUD** — `internal/store/log_scan_rules.go` +
+  `log_scan_rules` table added to the `observabilityTables` block.
+  Includes the `EffectiveLogScanRules(workloadID)` helper that
+  resolves global rules minus per-workload overrides plus workload-
+  only additions in one Go-side pass.
+- **Stream-selectable docker reads** — `internal/docker/container.go`
+  `ContainerLogsOpts` accepts a `ContainerLogOptions{ShowStdout,
+  ShowStderr, Follow, Tail}` so the scanner can subscribe to one
+  stream when a rule scopes itself to stdout or stderr. The legacy
+  `ContainerLogs` is preserved as a thin wrapper for back-compat.
+- **Engine** — `internal/logscanner/engine.go`: per-rule cooldown
+  (keyed on container+rule), per-container token bucket (default 10
+  events / 60s, override-able), regex match per line, hits returned
+  for the manager to persist. Pure logic, fully unit-tested.
+- **Tail goroutine** — `internal/logscanner/tail.go`: per-container
+  loop reading docker's multiplexed log frames (with TTY fallback),
+  strips the prepended RFC3339 timestamp, runs every line through the
+  engine + snapshot. Exits on container stop or context cancel.
+- **Manager** — `internal/logscanner/manager.go`: 5s polling diff
+  against `ListContainers(state=running)`, atomic.Pointer[Snapshot]
+  hot-reload, structural HitEmitter that writes event_log rows AND
+  publishes `EventLog` on the bus (so event-trigger dispatchers can
+  pick them up immediately).
+- **API** — `internal/api/log_scan_rules.go`: full CRUD,
+  `/test` endpoint accepting `{"sample_line": "..."}` and returning
+  matched/captures, plus
+  `GET /api/workloads/{id}/effective-rules` for the workload detail
+  page's future Log Rules tab. Admin-gated mutations.
+- **Wired in main.go** before the API server is constructed so the
+  reload callback is plugged via `apiServer.SetLogScanReloader`.
+- **Loop-prevention** — Same boundary as feature B: scanner publishes
+  EventLog events, dispatcher consumes them, neither writes to
+  event_log on the consume side.
+- **Tests** — `internal/logscanner/{engine,rules}_test.go` cover
+  cooldown isolation, token bucket refill, stream filtering,
+  override-replaces-global, disabled-override-suppresses-global,
+  compile-error reporting. `internal/store/log_scan_rules_test.go`
+  covers validation + cascade delete.
+
+**Frontend still pending** — `/log-scan-rules` pages, regex test box
+component, Log Rules tab on `/apps/[id]`, i18n keys. Not touched this
+turn.
+
+### Where it plugs in
+
+[internal/docker/container.go:362](../internal/docker/container.go#L362) already
+exposes `ContainerLogs(ctx, id, follow=true, tail)`. The existing SSE handler at
+[internal/api/workloads.go:43](../internal/api/workloads.go#L43)
+(`streamWorkloadContainerLogs`) is per-viewer and dies on browser disconnect —
+**do not hook the scanner there**. The scanner is a separate long-lived
+subsystem owned by the server process.
+
+Minor required change to `ContainerLogs`: expose `ShowStdout` / `ShowStderr` as
+caller-controlled. Currently hardcoded to `true`/`true`. Single existing caller
+passes "both" → no friction. Add an options struct or two booleans.
+
+### New package: `internal/logscanner/`
+
+```
+internal/logscanner/
+  manager.go    — Manager: map[containerID]*tail, lifecycle hooks
+  tail.go       — per-container goroutine; reads logs, fans to engine
+  engine.go     — rule evaluation + cooldown + rate limit
+  rules.go      — Rule struct, regex compile cache, effective-set resolver
+```
+
+**Manager lifecycle.** Subscribes to container start/stop signals. Options for
+the signal source:
+1. Add a `ContainerStarted` / `ContainerStopped` event type to the bus and
+   publish from the reconciler + deployer. Cleanest, but adds two event types.
+2. Manager polls `docker.ListContainers` every N seconds and diffs. Lazier,
+   robust to missed signals, slightly higher idle CPU. Probably fine.
+
+Pick (1) if you want zero-latency start, (2) if you want fewer moving parts.
+Defaulting to **(2) with 5s poll** — Docker container starts already take
+seconds; sub-second matching is not a requirement.
+
+**Tail goroutine.** On container start: open `ContainerLogs(follow=true,
+tail="0")` with stdout/stderr filters per rules in scope. Read line-by-line via
+`bufio.Scanner`. For each line: run through engine. On container stop or ctx
+cancel: drain and exit.
+
+**Engine.** Holds compiled regexes per rule. For each line:
+- Walk effective ruleset for this workload (see schema below).
+- For each matching rule: check cooldown (`map[ruleID]time.Time`, mutex guarded).
+  If the cooldown has elapsed, insert event_log row + publish + update timestamp.
+- Per-container token bucket (default: 10 events/min/container) to prevent
+  catastrophic event_log floods if a regex is too greedy.
+
+### Schema
+
+Single table, global + override pattern. No separate "overrides" table.
+
+```sql
+CREATE TABLE log_scan_rules (
+  id               INTEGER PRIMARY KEY AUTOINCREMENT,
+  workload_id      TEXT,                  -- NULL = global rule
+  overrides_id     INTEGER,               -- if set, this row overrides a global rule for one workload
+  name             TEXT NOT NULL,
+  pattern          TEXT NOT NULL,         -- regex, compiled at load
+  severity         TEXT NOT NULL,         -- info|warn|error
+  streams          TEXT NOT NULL DEFAULT 'all',  -- all|stdout|stderr
+  cooldown_seconds INTEGER NOT NULL DEFAULT 60,
+  enabled          INTEGER NOT NULL DEFAULT 1,
+  created_at       TEXT NOT NULL,
+  FOREIGN KEY (workload_id) REFERENCES workloads(id) ON DELETE CASCADE,
+  FOREIGN KEY (overrides_id) REFERENCES log_scan_rules(id) ON DELETE CASCADE
+);
+CREATE INDEX idx_log_scan_rules_workload ON log_scan_rules(workload_id);
+CREATE INDEX idx_log_scan_rules_overrides ON log_scan_rules(overrides_id);
+```
+
+**Effective ruleset for workload X:**
+1. All rows where `workload_id IS NULL AND overrides_id IS NULL` (pure globals),
+   *minus* any global that has a row with `workload_id = X AND overrides_id = global.id`.
+2. Plus all rows where `workload_id = X AND overrides_id IS NULL` (workload-only additions).
+3. Plus all override rows where `workload_id = X AND overrides_id IS NOT NULL`
+   (substitute for the global; their fields win, including `enabled=false` to
+   disable the global for this workload).
+
+A pure SQL implementation is doable with a `LEFT JOIN ... WHERE override.id IS
+NULL` for step 1 plus a `UNION ALL` for steps 2 and 3. Or compute in Go after
+two simpler queries — fine since rule counts will be small.
+
+### Output
+
+Scanner calls `store.InsertEvent` with:
+- `Source = "logscan"`
+- `Severity` from the matched rule
+- `Message` = raw matched line (truncated to ~500 chars)
+- `Metadata` JSON = `{"workload_id": ..., "container_id": ..., "rule_id": ..., "rule_name": ..., "captures": {...}}`
+
+Then `bus.Publish(EventLog, payload)`. This reuses exactly the path
+[internal/events/bus.go:158](../internal/events/bus.go#L158)
+(`RegisterPersistentLogger`) already established. SSE clients see it live, and
+the dispatcher from feature B picks it up.
+
+### Hot-reload
+
+When a rule is created/updated/deleted via the API, the manager must rebuild
+the effective ruleset for affected containers. Cheapest path: a single
+`*atomic.Pointer[ruleSnapshot]` shared across tails, replaced wholesale on any
+rule change. Each tail dereferences the snapshot per line — no locking on the
+hot path.
+
+---
+
+## B. Event triggers — BACKEND LANDED
+
+Status:
+
+- **Schema + store CRUD** — `internal/store/event_triggers.go` + table
+  creation in `internal/store/store.go` `observabilityTables`. Model:
+  `EventTrigger` in `internal/store/models.go`.
+- **Dispatcher** — `internal/events/dispatcher.go`
+  `RegisterEventTriggerDispatcher(bus, triggerSource, notifier)`.
+  Filter eval is AND-composed across severity (CSV), source (CSV), and
+  optional message regex. Compiled regexes are memoized.
+- **Webhook delivery** — extended `notify.Notifier` with
+  `SendPayload(url, secret, eventType, payload)` which reuses the
+  existing HMAC + headers infra (`X-Hub-Signature-256`, etc.). New
+  `TierEventTrigger` tier is recorded for telemetry / audit.
+- **Loop-prevention** — dispatcher does **not** call `InsertEvent`.
+  Delivery outcomes go through the notifier's existing logging only.
+- **API** — `internal/api/event_triggers.go` with admin-gated mutations:
+
+```http
+GET    /api/event-triggers
+POST   /api/event-triggers
+GET    /api/event-triggers/{id}
+PATCH  /api/event-triggers/{id}
+DELETE /api/event-triggers/{id}
+POST   /api/event-triggers/{id}/test     — synthetic event_log → notifier.SendSyncForTest
+```
+
+- **Wired in main.go** next to `RegisterPersistentLogger`.
+- **Tests** — `internal/events/dispatcher_test.go`: 10 cases covering
+  filter eval, regex caching, dispatcher fan-out, unsupported
+  action_type, trigger-source errors. CSV filter helper has dedicated
+  table-driven coverage.
+
+**Frontend still pending** — `/event-triggers` list + detail + new
+pages, the Send-test UX, i18n keys. Not touched this turn.
+
+### Where it plugs in
+
+Mirrors the `RegisterPersistentLogger` shape at
+[internal/events/bus.go:158](../internal/events/bus.go#L158):
+
+```go
+func RegisterEventTriggerDispatcher(b *Bus, triggers TriggerSource, notifier Notifier) func() {
+    sub := b.Subscribe(func(evt Event) bool { return evt.Type == EventLog })
+    go func() {
+        for evt := range sub {
+            payload, ok := evt.Payload.(EventLogPayload)
+            if !ok { continue }
+            for _, t := range triggers.Enabled() {
+                if t.matches(payload) {
+                    notifier.Send(t.ActionTarget, buildBody(t, payload))
+                }
+            }
+        }
+    }()
+    return func() { b.Unsubscribe(sub) }
+}
+```
+
+Reuses the existing notifier at
+[internal/notify/notifier.go](../internal/notify/notifier.go) — including the
+signed-delivery and `webhook_deliveries` audit trail.
+
+### Schema
+
+```sql
+CREATE TABLE event_triggers (
+  id                    INTEGER PRIMARY KEY AUTOINCREMENT,
+  name                  TEXT NOT NULL,
+  filter_severity       TEXT,            -- nullable; comma-list like 'warn,error'
+  filter_source         TEXT,            -- nullable; comma-list like 'logscan,deploy'
+  filter_message_regex  TEXT,            -- nullable; matched against message
+  action_type           TEXT NOT NULL,   -- 'webhook' | 'notification_channel'
+  action_target         TEXT NOT NULL,   -- URL or channel ID
+  enabled               INTEGER NOT NULL DEFAULT 1,
+  created_at            TEXT NOT NULL
+);
+```
+
+Filters AND together. Empty filters match all.
+
+### Loop-prevention
+
+**Critical constraint: the dispatcher must not write to event_log.** All
+delivery successes / failures land in `webhook_deliveries` (existing table) so
+the audit trail is preserved without risking trigger recursion. Keeps the
+boundary crisp:
+
+- `event_log` = system observing itself
+- `webhook_deliveries` = system talking to the outside
+
+If a user-visible "trigger fired" entry is desired in the events UI, add a
+*read-only join* from `webhook_deliveries` into the events page rather than
+writing event_log rows.
+
+---
+
+## What to defer
+
+| Item | Why | Add when |
+|---|---|---|
+| Multi-line stack trace coalescing | Real rabbit hole (which lines belong together?). | Real user pain. |
+| Capture-group templating in messages (`{{.captures.code}}`) | v1 stores captures in metadata, displays raw line. | Once real rules exist and patterns emerge. |
+| Backfilling history search | This is Loki/Grafana scope-creep. | Never (push to Loki instead if it comes up). |
+| Per-rule alert routing | v1 fans out by `(severity, source)` filter on trigger side. | When users want one rule → one channel. |
+| YAML config-as-code | Tinyforge is UI-driven everywhere else. | Probably never. |
+| Retry / backoff on trigger delivery failure | Notifier already handles delivery; whether *triggers* retry is a separate question. | If trigger reliability becomes an SLO. |
+
+---
+
+## UI footprint
+
+All boolean inputs use `ToggleSwitch` per project CLAUDE.md. All destructive
+actions use `ConfirmDialog` per memory note (no inline Yes/No strips).
+
+### New pages
+
+- **`/log-scan-rules`** — list with severity / workload filter, "+ New rule" button.
+  - Detail page: name, pattern (regex with live test box that takes a sample log line), severity, streams, cooldown, enabled toggle, scope picker (global / workload).
+- **`/event-triggers`** — list, "+ New trigger" button.
+  - Detail page: name, filters (severity multiselect, source multiselect, optional message regex), action type, action target, enabled toggle.
+
+### Augmentations
+
+- **Workload detail page** (`/apps/[id]`): new "Log Rules" tab/panel listing
+  effective rules for this workload. Each global shows an "Override for this
+  workload" button. Each override / workload-only shows edit + delete.
+- **Events page** (`/events`): entries with `source=logscan` get a small icon
+  + tooltip showing rule name. Click → jumps to rule detail.
+- **Settings sidebar**: links to `/log-scan-rules` and `/event-triggers` under
+  a new "Observability" group.
+
+### i18n keys to add
+
+Roughly 40–60 keys across `en.json` + `ru.json`. Namespace: `logscan.*` and
+`triggers.*`.
+
+---
+
+## API surface
+
+```http
+GET    /api/log-scan-rules                 — list (filter: ?workload_id=, ?global=true)
+POST   /api/log-scan-rules                 — create
+GET    /api/log-scan-rules/{id}            — detail
+PATCH  /api/log-scan-rules/{id}            — update
+DELETE /api/log-scan-rules/{id}            — delete
+POST   /api/log-scan-rules/{id}/test       — body: {sample_line}; returns matched: bool, captures
+GET    /api/workloads/{id}/effective-rules — computed effective ruleset for a workload
+
+GET    /api/event-triggers                 — list
+POST   /api/event-triggers                 — create
+GET    /api/event-triggers/{id}            — detail
+PATCH  /api/event-triggers/{id}            — update
+DELETE /api/event-triggers/{id}            — delete
+POST   /api/event-triggers/{id}/test       — dispatches a synthetic event to verify the action target
+```
+
+`POST .../test` endpoints are worth shipping in v1 — they make the rule /
+trigger editing UX dramatically nicer and avoid "did I get the regex right?"
+deploy-and-pray cycles.
+
+---
+
+## File pointers (when work starts)
+
+**Backend, new:**
+- `internal/logscanner/{manager,tail,engine,rules}.go`
+- `internal/api/log_scan_rules.go`
+- `internal/api/event_triggers.go`
+- `internal/store/log_scan_rules.go`
+- `internal/store/event_triggers.go`
+- `internal/events/dispatcher.go` (or extend `bus.go` with `RegisterEventTriggerDispatcher`)
+
+**Backend, modified:**
+- [internal/docker/container.go:362](../internal/docker/container.go#L362) — expose stream selection on `ContainerLogs`
+- [internal/api/router.go](../internal/api/router.go) — register new routes
+- [cmd/server/main.go](../cmd/server/main.go) — wire `RegisterEventTriggerDispatcher` next to `RegisterPersistentLogger`, start `logscanner.Manager`
+- migrations: `internal/store/migrations/00XX_log_scan_rules.sql`, `00XX_event_triggers.sql`
+
+**Frontend, new:**
+- `web/src/routes/log-scan-rules/+page.svelte`, `[id]/+page.svelte`, `new/+page.svelte`
+- `web/src/routes/event-triggers/+page.svelte`, `[id]/+page.svelte`, `new/+page.svelte`
+- `web/src/lib/components/LogRulePanel.svelte` (workload detail tab)
+- `web/src/lib/components/RegexTestBox.svelte` (reusable)
+
+**Frontend, modified:**
+- `web/src/routes/apps/[id]/+page.svelte` — add Log Rules tab
+- `web/src/routes/events/+page.svelte` — logscan source icon + rule tooltip
+- `web/src/routes/+layout.svelte` — Observability nav group
+- `web/src/lib/i18n/{en,ru}.json` — new key namespaces
+- `web/src/lib/api.ts`, `web/src/lib/types.ts` — typed clients
+
+---
+
+## Open questions to revisit before coding
+
+1. **Container start/stop signal source** — bus events (low latency, two new
+   event types) vs polling (simpler, ~5s latency). Tentative: polling.
+2. **Trigger delivery retry** — does the dispatcher retry on webhook failure,
+   or is one shot enough since `webhook_deliveries` records failures? Tentative:
+   one shot v1; revisit if reliability complaints surface.
+3. **Where does the "logscan source icon" link go on the events page** — rule
+   detail page, or the workload's effective-rules tab? Latter is probably more
+   useful since it shows context.
+
+---
+
+## Memory pointer
+
+Add a memory after this lands describing the event_log = observe-self,
+webhook_deliveries = talk-to-outside boundary — it's the kind of invariant
+that's easy to violate accidentally when adding new event types later.
@@ -0,0 +1,530 @@
+# Workload-First Refactor — Remaining Work
+
+Handoff for resuming the refactor. The plugin architecture (Source × Trigger),
+`/api/workloads` surface, `/apps` UI, env/volume/webhook/logs/chain panels,
+multi-face proxy routes, blue-green image deploys, schema-driven wizard, and
+test coverage on triggers / image helpers / webhook parser / store upserts are
+**already landed and live**. What follows is what's still pending, in priority
+order.
+
+> ## Current focus (read this first)
+>
+> **Workload-first arc is complete (2026-05-16).** Priority 1 (trigger
+> split, static inline port, hard cutover), Priority 3 polish (`apps.*`
+> i18n namespace — 276 keys EN+RU; codemap for
+> `internal/workload/plugin/`), and Priority 4 tests (`/api/workloads/*`
+> integration tests, dispatcher coverage, compose helper coverage) all
+> shipped. The legacy `/api/{projects,stages,stacks,sites,deploys,
+> instances}/*` HTTP surface, every backing table, the project-deploy
+> pipeline, the legacy webhook routes, and the legacy frontend
+> (`/projects`, `/stacks`, `/sites`, `/deploy`) are gone.
+>
+> Coverage delta on the workload-plugin path: `internal/api`
+> 1.1% → 16.0%, `internal/deployer` 0% → 54.1%,
+> `internal/workload/plugin/source/compose` 0% → 38.5%. Trigger plugins
+> already had ≥87% coverage from the trigger-split work.
+>
+> **What's next** is open — the remaining items in the doc are
+> nice-to-haves (a `/triggers` deep-link from the proxies page; more
+> compose-source coverage that needs a `compose` exec seam). Pick from
+> the task list or close the arc.
+>
+> **Trigger kind expansion (2026-05-16):** added the fourth trigger
+> kind, **schedule** — interval-based recurring trigger driven by the
+> new `internal/scheduler` tick loop (default 30s, ≤5m). v1 takes a
+> Go-duration interval ("24h", "1h", "168h") with a 1-minute floor;
+> dispatches through the same `webhook.Handler.FanOutForTrigger` seam
+> the inbound HTTP webhook uses, so per-binding concurrency / outcome
+> accounting / config-merge semantics are identical. `triggers` gained
+> a `last_fired_at` column; the scheduler persists it BEFORE dispatch
+> so a panicking Match cannot wedge a tight loop. The frontend
+> picker grid grew to four columns and `/triggers/[id]` surfaces
+> "last fired" on schedule rows.
+
+## Status at a glance
+
+| Item | Priority | Status |
+| ---- | -------- | ------ |
+| Triggers as first-class reusable entities | 1 | **DONE** (2026-05-16) |
+| Static source inline port | 1 | **DONE** (2026-05-16) |
+| Hard legacy cutover | 1 | **DONE** (2026-05-16) |
+| Generalized volume scopes | 2 | DONE |
+| Kind-aware editors (compose / image / static) | 2 | DONE |
+| Vendor-specific webhook parsing | 2 | DONE |
+| Chain-panel CSS | 3 | DONE |
+| Log Rules panel on `/apps/[id]` | adjacent | DONE — uses `getEffectiveLogScanRules` + per-workload override action |
+| Docs / codemap entries for `internal/workload/plugin/` | 3 | **DONE** (2026-05-16) |
+| API-handler / dispatcher / compose-source tests | 4 | **DONE** (2026-05-16) |
+| i18n for `/apps/*` page strings | 3 | **DONE** (2026-05-16) — 276 keys added under `apps.list.*` / `apps.new.*` / `apps.detail.*` |
+
+Cross-references to the adjacent Observability work (Event Triggers + Log
+Scanner backend + drop-counter stats panel) live in
+[docs/LOGSCAN_AND_TRIGGERS_TODO.md](LOGSCAN_AND_TRIGGERS_TODO.md).
+
+## Priority 1 — Architecture unlock
+
+### ~~Triggers as first-class reusable entities~~ — DONE (2026-05-16)
+
+Trigger config used to live embedded in the workload row
+(`workload.trigger_kind` + `workload.trigger_config`). One workload owned
+exactly one trigger; one trigger served exactly one workload. The split
+makes a Trigger its own record so one inbound webhook / registry watcher /
+schedule / git-push filter fans out to many workloads.
+
+**Schema + store** — `triggers` + `workload_trigger_bindings` tables with
+`ON DELETE CASCADE`. `binding_config` JSON merges on top of `trigger.config`
+(top-level merge, binding wins). Boot-time backfill lifts every existing
+embedded trigger into a standalone trigger row + binding inside a
+per-workload transaction so a partial failure rolls back cleanly. Trigger
+names are id-suffixed unconditionally to dodge the (name, kind) collision
+race. `store.ErrUnique` sentinel translates SQLite UNIQUE violations at
+the store boundary; API handlers use `errors.Is` instead of substring
+match. `MergeJSONConfig` always returns a freshly allocated slice (no
+aliasing under fan-out).
+
+**Webhook fan-out** — new `POST /api/webhook/triggers/{secret}` resolves
+to one Trigger and fans out to every enabled binding via a bounded worker
+pool (`maxTriggerFanOutConcurrency = 4`). Per-binding errors are isolated
+(one broken workload doesn't block siblings). Outcome accounting splits
+deployed / skipped / no-match / errored cleanly. Legacy
+`POST /api/webhook/workloads/{secret}` route dropped (clean break per the
+workload-first memory; the boot backfill kept secrets resolvable at the
+new path).
+
+**API** — `/api/triggers` CRUD, `/api/triggers/{id}/webhook`,
+`/api/triggers/{id}/bindings` (list + bind), `/api/bindings/{id}` for
+update and delete, and `/api/workloads/{id}/triggers` (list + bind,
+accepts either `trigger_id` or inline `{kind, name, config, ...}`).
+Inline-create path
+runs trigger insert + binding insert inside one transaction
+(`CreateTriggerWithBindingTx`) so a binding failure can't leak an orphan
+trigger. `validateBindingConfig` enforces 8 KiB cap and runs the trigger
+plugin's `Validate()` against the merged shape on every bind/update.
+List endpoints use `LEFT JOIN ... GROUP BY` (`ListTriggersWithBindingCount`,
+`ListBindingsForTriggerWithNames`, `ListBindingsForWorkloadWithNames`) —
+no per-row N+1.
+
+**Plugin contract unchanged** — `Trigger.Match` still takes `(Workload,
+InboundEvent)`. The fan-out path uses `plugin.WithEffectiveTrigger` to
+stuff the merged config into a copied workload before the call, so the
+existing `registry`, `git`, `manual` plugins work unchanged.
+
+**Reconciler** — gate dropped from `(SourceKind != "" && TriggerKind != "")`
+to `SourceKind != ""`. A workload with a Source but no triggers still
+gets `Source.Reconcile` called every tick (manual-only deploys are
+common during early setup).
+
+**Frontend** — new pages under `web/src/routes/triggers/`:
+
+- `+page.svelte` — list with kind chips, binding count, webhook status,
+  empty state.
+- `new/+page.svelte` — wizard with kind picker (cards), name, kind-aware
+  config form (registry / git / manual + JSON fallback), webhook toggles.
+- `[id]/+page.svelte` — editable per-kind form, webhook URL panel
+  (origin-prefixed, copy + ConfirmDialog-gated rotate), bindings list
+  with per-row enabled `<ToggleSwitch>` + ConfirmDialog-gated unbind,
+  danger-zone delete.
+
+**Workload UI** — embedded trigger fields removed.
+
+- `apps/new/+page.svelte` — wizard now has Trigger step with NEW / PICK /
+  SKIP modes; bind happens after `createPluginWorkload` succeeds.
+- `apps/[id]/+page.svelte` — Bindings panel above Containers, "Add trigger"
+  modal with Inline / Pick-existing tabs, **per-binding override editor**
+  (inline disclosure with read-only base config, editable JSON override,
+  merged preview, 8 KiB byte cap, save / reset-to-inherit). Per-row
+  "OVERRIDES n FIELDS" badge surfaces deviation from the trigger.
+
+**Shared component** — `web/src/lib/components/TriggerKindForm.svelte`
+hosts the kind picker + name + per-kind config + JSON fallback + webhook
+toggles. Reused on both `/triggers/new` and the workload Add-trigger modal.
+
+**i18n** — full EN + RU coverage under `redeployTriggers.*` (standalone
+pages), `apps.detail.bindings.*` (workload bindings panel including
+`override.*`), `apps.new.triggers.*` (wizard mode picker), `nav.triggers`.
+The existing `/event-triggers` nav label was disambiguated to "Event
+Triggers" to coexist with the new `/triggers` entry.
+
+**Compliance** — three pre-existing raw `<input type="checkbox">`
+instances in `apps/new` + `apps/[id]` (render-markdown, env-encrypted)
+replaced with `<ToggleSwitch>` to honor the project rule.
+
+**Touch points (final):**
+
+- `internal/store/triggers.go`, `workload_trigger_bindings.go`, `models.go`,
+  `store.go` (schema + backfill + `translateSQLError`).
+- `internal/workload/plugin/binding.go` (`MergeJSONConfig`,
+  `WithEffectiveTrigger`).
+- `internal/webhook/trigger_handler.go` + `handler.go` (route mount,
+  legacy route removed).
+- `internal/reconciler/reconciler.go` (trigger gate dropped).
+- `internal/api/triggers.go` + `router.go` (REST surface).
+- `web/src/routes/triggers/`, `web/src/routes/apps/{new,[id]}`,
+  `web/src/lib/components/TriggerKindForm.svelte`, `web/src/lib/api.ts`,
+  `web/src/lib/i18n/{en,ru}.json`, `web/src/routes/+layout.svelte`.
+
+**Reviews shipped through go-reviewer + security-reviewer +
+typescript-reviewer subagents** — 0 CRITICAL; 5 HIGH and 4 MEDIUM
+findings addressed inline before merge.
+
+### ~~Static source inline port~~ — DONE (2026-05-16)
+
+The phantom-row adapter (`cmd/server/static_backend.go`) is deleted; the
+static plugin now operates directly on `plugin.Workload`, the `containers`
+table, and `workload_env`. The deploy pipeline body lives inline in
+`internal/workload/plugin/source/static/{deploy,teardown,reconcile,
+state,env,build,naming,static}.go`.
+
+**State migration:** the legacy `static_sites` columns
+(`last_commit_sha`, `last_sync_at`, `last_error`, `status`,
+`container_id`, `proxy_route_id`) are now persisted on the container
+row keyed `<workloadID>:site` — deterministic ID, single row per
+workload. First-class fields (`container_id`, `proxy_route_id`,
+`subdomain`, `state`, `port`, `image_ref`) move into their dedicated
+columns on the `containers` table; the rest live in
+`containers.extra_json` via a typed `runtimeState` struct that
+preserves unknown keys on round-trip (so future writers can extend
+`extra_json` without forcing this struct to grow). `workload_env`
+replaces `static_site_secrets` for plugin-native workloads.
+
+**Reused helpers:** `internal/staticsite/{provider,gitea_content,
+github_provider,gitlab_provider,markdown,deno}` stay alive (and
+exported) as helpers — providers are still imported via
+`staticsite.NewGitProvider`. The `staticsite.Manager` itself stays
+alive only to service the legacy `/api/sites/*` HTTP routes; once
+those drop in the cutover the package can be deleted entirely.
+
+**Hardening landed alongside the port** (from `go-reviewer` +
+`security-reviewer` subagent passes — 1 CRITICAL, 5 HIGH, 3 MEDIUM
+addressed before merge):
+
+- **Path-traversal defense:** providers (`gitea_content.go`,
+  `github_provider.go`, `gitlab_provider.go`) reject any tree entry
+  whose resolved local path escapes `destDir`; the static plugin's
+  `verifyDownloadInsideRoot` walks the build dir post-download as a
+  second line of defense; `copyDir` uses `filepath.WalkDir` + `Lstat`
+  to refuse symlinks and non-regular files.
+- **Error sanitization:** a `sanitizeError` helper redacts the
+  decrypted access token, collapses to one line, and clamps to 240
+  bytes before any error string lands in `runtimeState.LastError`
+  (persisted in `extra_json`) or fans out to the notification
+  webhook.
+- **Resource naming with workload-ID short suffix:** container,
+  image, and storage volume names all carry `idShort(w)` so two
+  workloads sharing a name can't clobber each other's resources
+  (workload `name` is not UNIQUE in the schema).
+- **Per-workload mutex on `saveState`:** serializes the
+  read-modify-write of `containers.extra_json` so two parallel deploys
+  for the same workload can't race to clobber each other's
+  `container_id` / `proxy_route_id`.
+- **`saveState` failure on the success path is fatal:** rolls back
+  the just-created container + proxy route and writes a "failed"
+  state, so we don't leak a running container with no row pointing
+  at it.
+- **`primaryDomain` reads `settings.Domain`** to complete a bare
+  subdomain face into a full FQDN (matches legacy Manager behavior).
+- **`time.Sleep` honors `ctx.Done()`** during the post-start health
+  window.
+- **`json.Marshal` for event metadata + `strings.HasPrefix` for
+  failed-status detection** — replaces the prior fmt.Sprintf JSON
+  template + brittle slice expression.
+
+**Touch points (final):**
+
+- `internal/workload/plugin/source/static/{static,deploy,teardown,
+  reconcile,state,env,build,naming}.go` — the inline plugin.
+- `internal/staticsite/{gitea_content,github_provider,
+  gitlab_provider}.go` — added the path-traversal guards.
+- `cmd/server/main.go` — `wireStaticBackend(...)` call removed; the
+  existing blank import `_ "internal/workload/plugin/source/static"`
+  now drives `init()` registration.
+- `cmd/server/static_backend.go` — deleted.
+
+**Behavioral notes for operators:**
+
+- Plugin-native static workloads no longer write to the `static_sites`
+  table at all — anything querying that table for plugin-native
+  workloads (operator dashboards, ad-hoc SQL) sees stale or absent
+  values. The legacy `/api/sites/*` routes still serve original rows
+  unchanged.
+- Container labels `tinyforge.static-site` / `tinyforge.static-site-name`
+  are no longer set on plugin-native deploys; the canonical
+  `tinyforge.workload.id` / `.kind` labels (added by
+  `docker.ContainerConfig`) cover ownership.
+- Container, image, and volume names all gained an 8-char ID suffix
+  (e.g. `dw-site-mysite-a1b2c3d4`). Existing legacy-deployed sites
+  keep their old `dw-site-mysite` shape until they're redeployed
+  through the plugin path.
+
+### ~~Hard legacy cutover~~ — DONE (2026-05-16)
+
+The clean-break delete that closed the workload-first arc. Net diff:
+~30 files deleted, ~20 modified, ~12k LOC removed.
+
+**Backend deletions:**
+
+- API handlers: `internal/api/{projects,stages,stage_env,stacks,
+  static_sites,deploys,instances,volume_browser}.go`.
+- Store CRUD + tests: `internal/store/{projects,stages,stage_env,
+  stacks,static_sites,static_site_secrets,deploys,poll_state,volumes,
+  workload_sync}.go` + their `_test.go`.
+- Deployer pipeline: `internal/deployer/{bluegreen,promote,rollback,
+  subdomain,resolver_test}.go`; `deployer.go` trimmed to just the
+  dispatch surface.
+- `internal/staticsite/{manager,healthcheck}.go` and
+  `internal/stack/manager.go` (the rest of those packages are still
+  imported by the static + compose plugins as helpers).
+- Webhook routes: `handleWebhook` (project) + `handleSiteWebhook`
+  (site) handlers gone; `/api/webhook/triggers/{secret}` is the only
+  inbound surface left. The workload-side webhook URL handlers
+  (`getWorkloadWebhook` + `regenerateWorkloadWebhook`) were removed
+  in the cutover-followup pass when a security review caught them
+  minting URLs that 404'd.
+- `internal/registry/poller.go` (legacy registry poller).
+- `internal/volume/ResolvePath` (legacy resolver; the workload
+  resolver `ResolveWorkloadPath` stays).
+- `cmd/server/main.go`: dropped `staticsite.Manager`,
+  `stack.Manager`, `staticsite.HealthChecker`, registry poller,
+  `SetSiteSyncTriggerer`, `SetStaticSiteManager`, `SetStackManager`.
+
+**Schema migrations:** `internal/store/store.go` ends with
+idempotent `DROP TABLE IF EXISTS` for every legacy table
+(`projects`, `stages`, `stage_env`, `volumes`, `deploys`,
+`deploy_logs`, `poll_states`, `stacks`, `stack_revisions`,
+`stack_deploys`, `static_sites`, `static_site_secrets`). FK order is
+children-then-parents.
+
+**Frontend deletions:** `web/src/routes/{projects,stacks,sites,
+deploy}/` (entire trees); legacy components
+(`ProjectCard.svelte`, `InstanceCard.svelte`,
+`StaleContainerCard.svelte`); `api.ts` legacy functions + types
+(`Project`, `Stage`, `Stack`, `StaticSite`, `Deploy`, `Instance`,
+plus their helpers); i18n namespaces (`projects.*`, `projectDetail.*`,
+`envEditor.*`, `volumeEditor.*`, `volumeBrowser.*`, `quickDeploy.*`,
+`sites.*`, `stacks.*`, `instance.*`, `confirm.*`); nav entries.
+Dashboard rewritten to read `listWorkloads()` + `listContainers()`
+only.
+
+**Helper extractions** (to keep deletions atomic):
+`internal/store/helpers.go` (`BoolToInt`, `rowScanner`,
+`GenerateWebhookSecret`); `internal/api/secrets.go` (api shim that
+forwards to the store helper so the api + store paths share one
+secret-generation impl, no panic-vs-UUID-fallback divergence).
+
+**Reviews shipped through go-reviewer + security-reviewer +
+typescript-reviewer subagents** — 0 CRITICAL across all three; 1
+HIGH (dead-end workload webhook surface) + ~12 MEDIUMs all
+addressed inline before commit.
+
+**Behavioral notes for operators upgrading from a pre-cutover
+build:**
+
+- Existing rows in `projects` / `stages` / `stacks` / `static_sites`
+  / `static_site_secrets` / `deploys` / `deploy_logs` / `volumes`
+  / `poll_states` / `stage_env` / `stack_revisions` / `stack_deploys`
+  are dropped on first boot.
+- The legacy webhook URLs at `/api/webhook/{secret}` and
+  `/api/webhook/sites/{secret}` return 404 — operators with old CI
+  configs must repoint to `/api/webhook/triggers/{secret}` (the boot
+  backfill from the trigger-split refactor lifted any embedded
+  workload secret onto a Trigger row, so the secret value itself
+  carries over).
+- Frontend routes `/projects`, `/stacks`, `/sites`, `/deploy` are
+  gone. Nav links replaced with `/apps` (+ `/triggers` from the
+  prior arc).
+
+## Priority 2 — Behavior gaps
+
+### ~~Generalized volume scopes~~ — DONE
+
+Landed: `internal/volume.ResolveWorkloadPath` (workload-keyed; sits next to the
+legacy `ResolvePath` so legacy code paths keep working) plus the wired-through
+`computeMounts` in `internal/workload/plugin/source/image/image.go`. All
+`VolumeScope` values are now honored at deploy time:
+
+- `absolute` — host bind, validated against `settings.AllowedVolumePaths`.
+- `ephemeral` — tmpfs.
+- `instance` — per-tag dir under `<base>/<workload>-<idShort>/instance-<tag>/<source>`.
+- `stage`, `project` — both collapse to `<base>/<workload>-<idShort>/<source>`.
+- `project_named` — Docker named volume prefixed `tf-<idShort>-<name>`.
+- `named` — Docker named volume by raw name.
+
+Test coverage: `internal/volume/resolver_test.go` (table-driven, portable
+Linux/Windows). The legacy `ResolvePath` stayed in place for legacy deployer +
+volume-browser callers until the hard cutover deleted it.
+
+### ~~Kind-aware editors on `/apps/new` and `/apps/[id]` edit~~ — DONE
+
+All three Source plugins now have hand-rolled forms on both pages, with
+an "Advanced JSON" toggle preserved as the power-user escape hatch.
+Submit logic marshals form fields back into the same JSON shape the
+backend already expects — no API or store changes required.
+
+**Principle:** the plugin contract makes new Source / Trigger kinds cheap
+on the backend, but the UI is not cheap by default — every kind needs a
+paired hand-rolled form to be daily-driver usable. The shared JSON
+editor is the fallback for power users and brand-new plugins, not the
+end state. New Source / Trigger merge requests should treat "ship the
+kind-aware form" as part of done, not a follow-up.
+
+**Landed:**
+
+- `compose`: YAML textarea + project_name input on both `/apps/new`
+  and `/apps/[id]`.
+- `image`: form fields for image / port / healthcheck / default_tag /
+  registry_name / cpu_limit / memory_limit / max_instances on both
+  pages. Registry name is a select populated from `/api/registries`
+  (with text-input fallback when the list is empty). env + volumes
+  stay in their detail-page panels and round-trip through the form
+  via `imageFormBody` so manual edits aren't clobbered.
+- `static`: provider select (gitea / github / gitlab), base URL,
+  repo_owner / repo_name (both required), branch (default "main"),
+  folder_path, access_token (password input, for private repos),
+  mode radio (static / deno), render_markdown toggle (`<ToggleSwitch>`). The
+  storage_enabled / storage_limit_mb fields aren't surfaced as
+  form controls yet, but they round-trip through `staticFormBody`
+  so values set via the raw JSON editor survive form edits.
+
+**Still pending forms:** none — all three Source plugins now have
+hand-rolled forms on both `/apps/new` and `/apps/[id]`.
+
+The raw JSON editor stays available behind the "Advanced JSON" toggle
+(shipped with compose) so the plugin's full sample is still reachable
+for power users and for any new plugin kind without a hand-rolled form.
+
+Effort: per-kind form roughly half a turn each; can land incrementally.
+Touches `web/src/routes/apps/new/+page.svelte` and the edit block in
+`web/src/routes/apps/[id]/+page.svelte`. The Svelte side keeps
+serializing into the same `source_config` JSON shape the backend
+already expects — no API or store change required.
+
+### ~~Vendor-specific webhook parsing for `/api/webhook/workloads/{secret}`~~ — DONE
+
+Landed: `internal/webhook/vendor_parsers.go` plus rewrites in
+`internal/webhook/handler.go` `buildInboundEvent`. The dispatch order is now:
+
+1. Empty body → manual event.
+2. Vendor-specific parsers, short-circuit on a recognized `X-*-Event`
+   header — Gitea package, GitHub `package` / `registry_package`, GitHub
+   push, Gitea push, GitLab `Push Hook` / `Tag Push Hook`.
+3. Generic simple-body fallback: top-level `image` or top-level `ref` —
+   what the legacy CI integrations already send.
+
+Vendor parsers can populate fields the generic parser cannot: image
+digest, `GitEvent.Vendor`, registry host. When a vendor parser claims a
+request (header matches) it is authoritative — a malformed Gitea
+package payload surfaces as an error rather than silently falling
+through to the generic parser. Test coverage:
+`internal/webhook/vendor_parsers_test.go` covers each vendor branch +
+the routed-via-`buildInboundEvent` integration cases.
+
+Open follow-ups deferred to future turns:
+
+- GitLab Container Registry events use a custom envelope outside the
+  webhook event surface — handle if a user reports needing it.
+- Docker Hub webhook (push event) uses `{"push_data": {"tag": ...}, "repository": {...}}` — add when there's a user request.
+
+## Priority 3 — Polish
+
+### ~~Chain-panel CSS~~ — DONE
+
+Landed: rules for `.chain-row`, `.chain-card` (with hover/transform on
+anchors), `.chain-self` (brand-tinted highlight), `.chain-name`,
+`.chain-label` (70px fixed-width mono column), `.chain-children-list`
+(flex-wrap), plus a sub-600px stack to keep the panel usable on narrow
+screens. Appended at the end of the `<style>` block in
+`web/src/routes/apps/[id]/+page.svelte`.
+
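+### ~~Docs / codemap entries~~ — DONE (2026-05-16)
+
+The codemap for `internal/workload/plugin/` has shipped (see the status table); before it landed there was nothing under `docs/CODEMAPS/` for that package. Scope it was written to cover: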
+
+- The Source × Trigger contract + registry pattern (`init()` + blank-import in
+  `cmd/server/main.go`).
+- How a new Source kind is added (write `init()` registration, blank-import,
+  add to wizard via `SchemaSample`).
+- The dispatcher seam: `deployer.DispatchPlugin` / `DispatchTeardown` /
+  `DispatchReconcile` and how the reconciler / webhook ingress / API
+  handlers all flow through it.
+
+`README.md` should mention `/apps` as the new user surface and that the legacy
+`/projects` / `/sites` / `/stacks` surfaces are gone after the hard cutover (they carried `Deprecation: true` headers before it).
+
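+### ~~i18n: page-level strings~~ — DONE (2026-05-16)
+
+Shipped: 276 keys under `apps.list.*` / `apps.new.*` / `apps.detail.*` (see the status table). The breakdown below is the pre-extraction snapshot, kept for history. Already i18n'd at that point: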
+
+- `nav.apps`, `nav.eventTriggers`, `nav.logScanRules` — top nav labels.
+- Log Rules panel on `/apps/[id]` reuses `logscan.panel.*` keys
+  (shipped with the Observability work).
+- All `/event-triggers/*` and `/log-scan-rules/*` page strings — keys
+  live under `triggers.*` and `logscan.*` namespaces in
+  `web/src/lib/i18n/{en,ru}.json`.
+
+Still hardcoded English:
+
+- `/apps/+page.svelte` — list page (hero, lede, stats, empty state,
+  table headers, status pills).
+- `/apps/new/+page.svelte` — wizard labels, form copy, kind-aware
+  form rows (compose / image / static all hardcoded English today).
+- `/apps/[id]/+page.svelte` — detail page sections (chain, env,
+  volumes, webhook, manual deploy, danger zone) — the Log Rules
+  panel embedded inside it is the only i18n'd section.
+
+Roughly 80–100 keys across the three `/apps/*` pages once extracted.
+Namespace: `apps.*` (with sub-namespaces `apps.list.*`, `apps.new.*`,
+`apps.detail.*`, `apps.form.*`).
+
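+## ~~Priority 4 — Tests we still don't have~~ — DONE (2026-05-16)
+
+Solid pure-function coverage landed first; the status table marks the API-handler / dispatcher / compose-source tests as shipped too. The gaps as originally listed: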
+
+- **API-handler integration tests** for `/api/workloads/*` (CRUD, deploy,
+  env, volumes, webhook, chain, promote-from). Pattern: in-memory store +
+  fake deployer + fake docker / proxy / dns providers, exercise via
+  `httptest`.
+- **Deployer dispatcher**: `DispatchPlugin` / `DispatchTeardown` /
+  `DispatchReconcile` with a fake Source registered.
+- **Compose source**: `composeProjectName` sanitizer, `writeYAMLIfChanged`
+  short-circuit. (Both pure; just need fixtures.)
+- **Static source Backend adapter** in `cmd/server/static_backend.go` — moot: the adapter was deleted with the static inline port.
+
+## Open architectural questions
+
+### Stages chain vs explicit Stage entity
+
+`parent_workload_id` is now the canonical mechanism for stage chains
+(dev → staging → prod). Decision deferred: do we need a separate `Stage`
+entity at all, or is the chain sufficient? Currently feels like the chain
+covers the use case — `promote-from` works, the UI shows the relationship.
+The legacy `stages` table was already dropped in the hard cutover and can
+probably stay gone.
+
+### ~~`Container.extra_json` evolution~~ — DONE (2026-05-16)
+
+Both writer patterns now have an active example in-tree (image source
+clobbers, static source preserves) and the policy is documented in
+[`docs/CODEMAPS/container-extra-json.md`](CODEMAPS/container-extra-json.md):
+ownership model, wholesale-overwrite vs preserve-unknown-keys, reader
+tolerance for unknown keys + decode failure, the per-workload mutex
+requirement for any read-modify-write writer, and a checklist for adding
+a new field without breaking older deployers.
+
+## File pointers for the next session
+
+- Plugin contracts: `internal/workload/plugin/{plugin,source,trigger,types,registry}.go`
+- Source implementations: `internal/workload/plugin/source/{image,compose,static}/`
+- Trigger implementations: `internal/workload/plugin/trigger/{registry,git,manual}/`
+- Dispatcher: `internal/deployer/dispatch.go`
+- Webhook ingress (plugin path): `internal/webhook/handler.go` `handlePluginWorkloadWebhook`
+- Reconciler hook: `internal/reconciler/reconciler.go` `reconcilePluginWorkloads`
+- Static backend adapter: `cmd/server/static_backend.go` — deleted in the static inline port; the plugin now lives in `internal/workload/plugin/source/static/`.
+- Frontend pages: `web/src/routes/apps/+page.svelte`, `web/src/routes/apps/new/+page.svelte`, `web/src/routes/apps/[id]/+page.svelte`
+- Tests: `internal/workload/plugin/trigger/*/*_test.go`, `internal/workload/plugin/source/image/image_helpers_test.go`, `internal/webhook/inbound_event_test.go`, `internal/store/workload_env_test.go`
+
+## Memory pointer
+
+Memory at
+`C:/Users/Alexei/.claude/projects/c--Users-Alexei-Documents-docker-watcher/memory/`
+already covers the Workload-first decision and the no-migration constraint.
+Refresh as the cutover lands.
@@ -0,0 +1,101 @@
+## Feature: Docker Diagnostic Hints on Disconnection
+
+**Problem:** When Docker is unreachable, the UI shows a generic "Docker disconnected" label with no actionable guidance. Users (especially on Windows/macOS where Docker Desktop must be running) have no idea what's wrong or how to fix it.
+
+**Goal:** Enrich the health-check response with a structured diagnostic object so the frontend can display platform-aware, actionable hints.
+
+---
+
+### Backend Changes
+
+**1. Enhance `GET /api/health` response** ([health.go](../internal/api/health.go))
+
+Currently returns `{ "docker": true|false }`. Change to:
+
+```json
+{
+  "docker": {
+    "connected": false,
+    "error": "open //./pipe/docker_engine: The system cannot find the file specified.",
+    "category": "socket_not_found",
+    "hints": [
+      "Docker Desktop does not appear to be running.",
+      "Start Docker Desktop and wait for it to finish initializing.",
+      "If using a custom socket path, check DOCKER_HOST env variable."
+    ],
+    "platform": "windows",
+    "checked_at": "2026-03-30T12:34:56Z"
+  }
+}
+```
+
+**2. Create a Docker diagnostics module** (new file, e.g. `internal/docker/diagnostics.go`)
+
+Classify the Ping error into a diagnostic category and generate platform-specific hints. Follow the pattern already established in [hints.go](../internal/proxy/hints.go) for proxy validation.
+
+Error categories to handle:
+
+| Category | Error signature | Windows hints | Linux hints | macOS hints |
+|---|---|---|---|---|
+| `socket_not_found` | `no such file or directory`, `The system cannot find the file specified` | Docker Desktop not running; start it from Start Menu or system tray | Docker daemon not running; `sudo systemctl start docker` | Docker Desktop not running; start from Applications or `open -a Docker` |
+| `connection_refused` | `connection refused` | Docker Desktop is starting up — wait ~30s and retry | Docker daemon is starting; `sudo systemctl status docker` | Docker Desktop is starting; check the whale icon in the menu bar |
+| `permission_denied` | `permission denied` | Run the application as Administrator, or add your user to the `docker-users` group | Add your user to the `docker` group: `sudo usermod -aG docker $USER` then re-login | Check Docker Desktop settings -> Resources -> File Sharing |
+| `timeout` | `context deadline exceeded`, `i/o timeout` | Docker Desktop may be overloaded or hanging — restart it | Docker daemon may be overloaded; check `journalctl -u docker` | Docker Desktop may be unresponsive; restart from menu bar |
+| `tls_error` | `tls:`, `certificate` | Check Docker TLS cert configuration and `DOCKER_TLS_VERIFY` | Verify certs in `~/.docker/` match daemon config | Check `~/.docker/` TLS configuration |
+| `unknown` | (fallback) | Show raw error with link to Docker Desktop troubleshooting docs | Show raw error with `dockerd` docs link | Show raw error with Docker Desktop docs link |
+
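+Detect the platform via `runtime.GOOS` in the diagnostics module. This is accurate when the binary runs directly on the host; when it runs inside a container, `runtime.GOOS` reports `linux` regardless of the host OS, so the Linux hints may not match what the user has to do. Note that macOS is reported as `darwin`.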
+
+**3. Expose `runtime.GOOS` once** in diagnostics; don't scatter it through handlers.
+
+**4. Preserve backward compat** — if any external consumer depends on the old `"docker": bool` shape, consider a migration path or version the health endpoint. Internal-only API can break freely.
+
+---
+
+### Frontend Changes
+
+**5. Update the API type** ([api.ts](../web/src/lib/api.ts))
+
+```typescript
+interface DockerHealth {
+  connected: boolean;
+  error?: string;
+  category?: string;
+  hints?: string[];
+  platform?: string;
+  checked_at?: string;
+}
+
+export function getHealth(): Promise<{ docker: DockerHealth }> {
+  return get<{ docker: DockerHealth }>('/api/health');
+}
+```
+
+**6. Enhance the health indicator** ([+layout.svelte](../web/src/routes/+layout.svelte))
+
+When `dockerConnected === false`:
+- Show a clickable/expandable area (tooltip, popover, or collapsible panel) below the red dot.
+- Display the `hints` array as a bulleted list.
+- Optionally show the raw `error` in a `<details>` collapse for advanced users.
+- Show `checked_at` as relative time ("last checked 15s ago").
+- Add a manual "Retry now" button that triggers an immediate health check instead of waiting for the 30s poll.
+
+**7. Add i18n keys** ([en.json](../web/src/lib/i18n/en.json), [ru.json](../web/src/lib/i18n/ru.json))
+
+Add keys for each hint category so hints can be translated. The backend should return `category` + `platform` identifiers; the frontend can use them to look up localized hint text instead of displaying raw English strings from the backend. This keeps i18n centralized in the frontend.
+
+---
+
+### Architecture Notes
+
+- The proxy validator ([validator.go](../internal/proxy/validator.go), [hints.go](../internal/proxy/hints.go)) already implements a similar pattern: classifying errors by substring match and returning human-readable hints. Reuse that approach for consistency.
+- Keep diagnostics pure — a function that takes an `error` and `runtime.GOOS` and returns `(category string, hints []string)`. No side effects, easy to unit-test.
+- Consider caching the diagnostic result for a few seconds to avoid spamming Docker if the frontend retries rapidly.
+
+---
+
+### Testing
+
+- Unit-test the diagnostics function with synthetic errors for each category × platform combination.
+- Integration-test the health endpoint with a mock Docker client that returns each error type.
+- Frontend: test that the hint UI renders correctly for each category and collapses/expands properly.
@@ -0,0 +1,224 @@
+# Workload Refactor — Compressed Plan
+
+Status: Shipped (with explicit deferrals — see "What actually shipped" at the bottom)
+Owner: alexei.dolgolyov
+Date: 2026-05-07
+Last updated: 2026-05-09 (post multi-agent review fixes)
+
+## Goal
+
+Unify `Project`, `Stack`, and `StaticSite` under a single `Workload` primitive, and introduce a normalized `containers` index so every Tinyforge-managed container has one canonical row. This unblocks a global Containers view today and lets future workload kinds (cron jobs, one-shot tasks, databases-as-resource, functions) plug in without another tab/store/deployer branch.
+
+## Why this is the compressed plan
+
+The original 8-PR plan was designed for a live system with dual-writes and soak periods. Tinyforge has no production users yet, so all defenses against live runtime state collapse: no external label consumers, no third-party CI hitting webhook URLs, no orphaned containers to recover. Everything ships in 3 PRs against a clean slate. Solo-dev reversibility is preserved by branching, not by dual-write gymnastics.
+
+## Target architecture
+
+- `Workload` is the unifying primitive with `kind ∈ {project, stack, site, …}`. Each existing Project/Stack/StaticSite becomes a Workload row.
+- `containers` is a normalized index: every Tinyforge-managed container has one row with `workload_id`, `workload_kind`, `role`, Docker container ID, host, state, last_seen.
+- Optional `apps` table (thin nullable `app_id` on Workload) added empty; UI gated behind a feature flag and deferred indefinitely until there is real demand (pull) for it.
+- Stable Docker labels: `tinyforge.workload.id`, `tinyforge.workload.kind`, `tinyforge.role`, `tinyforge.managed`. Legacy `tinyforge.project` / `tinyforge.stage` / `tinyforge.instance-id` are removed in the same wave.
+- Global `/containers` UI route; per-workload container panel becomes a shared `<WorkloadContainers>` component reused by project, stack, and site detail pages.
+
+## Schema
+
+Appended to `internal/store/store.go::runMigrations()` as additive `CREATE TABLE` statements (idempotent via `CREATE TABLE IF NOT EXISTS`).
+
+```sql
+CREATE TABLE IF NOT EXISTS workloads (
+    id              TEXT PRIMARY KEY,
+    kind            TEXT NOT NULL,                 -- 'project' | 'stack' | 'site'
+    ref_id          TEXT NOT NULL,                 -- FK into projects/stacks/static_sites by kind
+    name            TEXT NOT NULL,
+    app_id          TEXT,                          -- nullable FK into apps.id
+    notification_url        TEXT NOT NULL DEFAULT '',
+    notification_secret     TEXT NOT NULL DEFAULT '',
+    webhook_secret          TEXT NOT NULL DEFAULT '',
+    webhook_signing_secret  TEXT NOT NULL DEFAULT '',
+    webhook_require_signature INTEGER NOT NULL DEFAULT 0,
+    created_at      TEXT NOT NULL,
+    updated_at      TEXT NOT NULL,
+    UNIQUE(kind, ref_id)
+);
+CREATE INDEX IF NOT EXISTS idx_workloads_app_id ON workloads(app_id);
+CREATE INDEX IF NOT EXISTS idx_workloads_kind   ON workloads(kind);
+
+CREATE TABLE IF NOT EXISTS containers (
+    id              TEXT PRIMARY KEY,
+    workload_id     TEXT NOT NULL,
+    workload_kind   TEXT NOT NULL,                 -- denormalized for filtered queries
+    role            TEXT NOT NULL,                 -- stage name (project), service name (stack), '' (site)
+    container_id    TEXT NOT NULL DEFAULT '',      -- Docker ID, '' between create+start
+    image_ref       TEXT NOT NULL DEFAULT '',
+    host            TEXT NOT NULL DEFAULT 'local',
+    state           TEXT NOT NULL DEFAULT '',      -- running | stopped | failed | removing | missing
+    port            INTEGER NOT NULL DEFAULT 0,
+    last_seen_at    TEXT NOT NULL DEFAULT '',
+    extra_json      TEXT NOT NULL DEFAULT '{}',    -- {subdomain, npm_proxy_id, proxy_route_id, ...}
+    created_at      TEXT NOT NULL,
+    updated_at      TEXT NOT NULL
+);
+CREATE INDEX IF NOT EXISTS idx_containers_workload     ON containers(workload_id);
+CREATE INDEX IF NOT EXISTS idx_containers_state        ON containers(state);
+CREATE INDEX IF NOT EXISTS idx_containers_container_id ON containers(container_id);
+
+CREATE TABLE IF NOT EXISTS apps (
+    id          TEXT PRIMARY KEY,
+    name        TEXT NOT NULL,
+    description TEXT NOT NULL DEFAULT '',
+    created_at  TEXT NOT NULL,
+    updated_at  TEXT NOT NULL
+);
+```
+
+`extra_json` carries kind-specific fields (`subdomain`, `npm_proxy_id`, `proxy_route_id`) so the spine stays narrow. SQLite JSON1 is required for queries against `extra_json`; verify the driver in `go.mod` supports it before committing — fall back to dedicated columns if not.
+
+## PR 1 — Spine: schema, Workload package, reconciler
+
+Single PR, lands the data layer end-to-end. No dual-writes; project/stack/site CRUD writes directly to `workloads`.
+
+### New files
+
+- `internal/store/workloads.go` — `CreateWorkload`, `GetWorkloadByID`, `GetWorkloadByRef(kind, refID)`, `ListWorkloads`, `UpdateWorkload`, `DeleteWorkload`.
+- `internal/store/containers.go` — `UpsertContainer`, `GetContainerByDockerID`, `ListContainersByWorkload`, `ListContainers(filter)`, `MarkContainerMissing`, new `ListProxyRoutes` (mirrors the join shape from `internal/store/instances.go::ListProxyRoutes`, reading `extra_json` via `json_extract`).
+- `internal/store/apps.go` — minimal CRUD; not wired anywhere yet.
+- `internal/workload/workload.go` — `Workload` interface (`ID`, `Kind`, `Name`, `Deploy`, `Stop`, `Start`, `Delete`, `Containers`).
+- `internal/workload/adapters/project_adapter.go` — wraps `internal/deployer`.
+- `internal/workload/adapters/stack_adapter.go` — wraps `internal/stack/manager.go`.
+- `internal/workload/adapters/site_adapter.go` — wraps `internal/staticsite/manager.go`.
+- `internal/reconciler/reconciler.go` — single writer to `containers`. Reads `docker ps --filter label=tinyforge.managed`, groups by `(workload.id, role)`, upserts rows, marks absent rows `state='missing'`. Boot-time one-shot run + 30s tick.
+- `internal/reconciler/reconciler_test.go` — table-driven tests with a fake Docker client.
+
+### Modified files
+
+- `internal/store/store.go::runMigrations` — append the three `CREATE TABLE` statements (after line ~165 where the existing migrations end).
+- `internal/store/models.go` — add `Workload`, `Container`, `App` structs.
+- `internal/store/projects.go` — `CreateProject`, `UpdateProject`, `DeleteProject` wrap the write in `s.db.Begin()` and also write the matching `workloads` row. Webhook/notification secret setters update `workloads.webhook_secret` / `webhook_signing_secret` / `notification_secret` directly.
+- `internal/store/stacks.go` — same Workload write on `CreateStack` / `UpdateStack` / `DeleteStack`.
+- `internal/store/static_sites.go` — same.
+- `internal/docker/client.go` — add label constants `LabelWorkloadID`, `LabelWorkloadKind`, `LabelRole`, `LabelManaged`. **Remove** the old `LabelProject`, `LabelStage`, `LabelInstanceID` writes from the deployer.
+- `internal/deployer/deployer.go` (label injection ~line 388) — emit only the new labels.
+- `internal/deployer/bluegreen.go` (~line 97) — same.
+- `internal/stack/manager.go` — after `docker compose up`, stamp new labels on each compose-managed container via `docker container update --label-add`. Compose's own `com.docker.compose.service` becomes `role`.
+- `internal/staticsite/manager.go` — stamp new labels at container start.
+- `internal/store/instances.go` — **delete this file**. The deployer no longer creates instance rows; reconciler owns container state.
+- `internal/api/instances.go` — **delete or alias** to `/api/containers` filtered by workload. Solo dev → delete is cleaner.
+- `internal/api/proxies.go` — switch the `ListProxyRoutes` import to `containers.ListProxyRoutes`.
+- `internal/api/docker.go::buildActiveImagesSet` (~line 251) — replace the `ListAllInstances` walk with a single `containers.image_ref` query.
+- `internal/api/stale.go`, `internal/stale/scanner.go` — read from `containers` instead of `instances`.
+- `internal/webhook/matcher.go` — query `workloads.webhook_secret` directly.
+- `cmd/server/main.go` — start the reconciler goroutine after `store.New`. Drop any startup code that touched `instances`.
+
+### Tests
+
+- Extend `internal/store/store_test.go` with `TestCreateProjectAlsoCreatesWorkload`, `TestDeleteProjectCascadesWorkload`, `TestUpsertContainerIdempotent`, `TestListProxyRoutesShape`.
+- New `internal/reconciler/reconciler_test.go` with a `dockerClient` interface and a fake — assert that a slice of `types.Container` produces the expected `containers` upserts.
+- Run the existing test suite under `-race`.
+
+### Deliverable
+
+System builds, deploys a project end-to-end, deploys a stack end-to-end, deploys a static site end-to-end. `containers` table reflects reality after each deploy and after a 30s reconciler tick. The legacy `instances` table is gone.
+
+## PR 2 — API + frontend
+
+### New files
+
+- `internal/api/workloads.go` — `GET /api/workloads`, `GET /api/workloads/{id}`, `GET /api/workloads/{id}/containers`, `PATCH /api/workloads/{id}` (sets `app_id` and notification/webhook config).
+- `internal/api/containers.go` — `GET /api/containers?workload_id=&kind=&state=&app_id=`, `GET /api/containers/{id}`.
+- `internal/api/apps.go` — `GET /api/apps`, `POST /api/apps`, `PATCH /api/apps/{id}`, `DELETE /api/apps/{id}` (gated by settings flag `features.apps_grouping=true`).
+- `web/src/routes/containers/+page.svelte` — global filterable table. Reuses table patterns from `web/src/routes/proxies/+page.svelte` and `web/src/routes/containers/stale/+page.svelte` (the existing `stale/` route stays untouched).
+- `web/src/lib/components/WorkloadContainers.svelte` — shared container panel. Takes `workloadId` prop, hits `/api/workloads/{id}/containers`. Handles 1..N container rows.
+
+### Modified files
+
+- `internal/api/router.go` — register the new endpoints. Remove `/api/instances` registration.
+- `web/src/routes/projects/[id]/+page.svelte` — replace the inline instance list with `<WorkloadContainers workloadId={...}/>`.
+- `web/src/routes/stacks/[id]/+page.svelte` — same.
+- `web/src/routes/sites/[id]/+page.svelte` — same.
+- Top nav component (find under `web/src/lib/components/`) — insert a "Containers" tab between "Projects" and "Stacks". Existing tabs stay.
+- `web/src/lib/api.ts` (or wherever API client functions live) — add `listWorkloads`, `getWorkload`, `listContainers`, `getContainer`, `listApps`. Remove instance-shaped helpers.
+- `web/src/lib/types.ts` — add `Workload`, `Container`, `App` types. Remove `Instance` once unreferenced.
+
+### Deliverable
+
+User-visible: a `Containers` tab in the top nav showing every running container with kind/state/workload filters, links into the owning project/stack/site detail page, and a per-workload container panel that looks identical on all three detail pages.
+
+## PR 3 — Polish + optional Apps UI
+
+Defer indefinitely if there is no demand (pull) for it. Lands as a single PR when wanted.
+
+### Scope
+
+- Apps UI: `web/src/routes/apps/+page.svelte`, `[id]/+page.svelte`. Workload detail pages get an "App" dropdown to assign `app_id`. Gated by `features.apps_grouping=true` in settings.
+- Drop any leftover dead code referencing `Instance` types.
+- Documentation: update `CLAUDE.md` and `README.md` to describe the Workload model.
+- Optional: consolidate `internal/deployer` and `internal/stack/manager` into a single orchestrator. **Out of scope for this refactor** — adapters wrap the existing kind-specific code and that's fine. Revisit only if the duplication starts hurting.
+
+## What's explicitly deferred
+
+- Deployer + stack-manager consolidation.
+- Apps UI (schema added in PR 1, UI in PR 3 behind flag).
+- Multi-host containers (`containers.host` exists but is always `'local'`).
+- Workload-kind plugin model — the adapter registry has three hardcoded entries.
+- Webhook secret handling for old per-project URLs that may already be in CI configs (no users yet → don't care).
+
+## Risks (compressed)
+
+- **SQLite JSON1 availability.** Verify the driver in `go.mod` supports `json_extract` before committing to `extra_json`. If not, hoist `subdomain`, `npm_proxy_id`, `proxy_route_id` to dedicated columns on `containers`.
+- **`ListProxyRoutes` shape regression.** The new query reads from `containers` + `workloads` instead of `instances` + `projects` + `stages`. Worth a golden-output test before flipping `internal/api/proxies.go` over.
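+- **Stack containers and label stamping.** The plan assumes `docker container update --label-add` can label compose-managed containers post-up. Verify this before building on it: `docker container update` documents only resource-limit and restart-policy flags, and container labels are normally fixed at creation time (`--label-add` is a `docker service update` / `docker node update` flag). If the local Docker engine doesn't support it, fall back to relying on `com.docker.compose.project` + `com.docker.compose.service` for reconciler joins.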
+- **Boot-time backfill from `docker ps`.** First run needs to populate `containers` from currently-running containers using the legacy `tinyforge.instance-id` and `com.docker.compose.project` labels (since pre-refactor containers don't have the new labels). Solo-dev workaround: `docker compose down` test workloads, run the new binary against an empty Docker host, redeploy.
+
+## Concrete file paths
+
+Modified:
+- `internal/store/store.go` (migrations at line ~75–165)
+- `internal/store/projects.go`, `stacks.go`, `static_sites.go`, `models.go`, `store_test.go`
+- `internal/docker/client.go`
+- `internal/deployer/deployer.go` (~line 388), `internal/deployer/bluegreen.go` (~line 97)
+- `internal/stack/manager.go`, `internal/staticsite/manager.go`
+- `internal/api/router.go`, `proxies.go`, `docker.go` (`buildActiveImagesSet` at line 251), `stale.go`
+- `internal/stale/scanner.go`, `internal/webhook/matcher.go`
+- `cmd/server/main.go`
+- `web/src/routes/projects/[id]/+page.svelte`, `stacks/[id]/+page.svelte`, `sites/[id]/+page.svelte`
+- `web/src/lib/api.ts`, `web/src/lib/types.ts`
+- Top nav component in `web/src/lib/components/`
+
+Created:
+- `internal/store/workloads.go`, `containers.go`, `apps.go`
+- `internal/workload/workload.go`, `adapters/project_adapter.go`, `adapters/stack_adapter.go`, `adapters/site_adapter.go`
+- `internal/reconciler/reconciler.go`, `reconciler_test.go`
+- `internal/api/workloads.go`, `containers.go`, `apps.go`
+- `web/src/routes/containers/+page.svelte`
+- `web/src/lib/components/WorkloadContainers.svelte`
+
+Deleted:
+- `internal/store/instances.go`
+- `internal/api/instances.go`
+
+## What actually shipped (2026-05-09)
+
+After a multi-agent code review caught several issues, the refactor landed with the following deviations from the original plan. They are documented here so a future reader doesn't have to reconstruct them from git log.
+
+### Deferred / dropped
+
+- **`internal/workload/` package + adapters.** The plan called for a `Workload` interface (`Deploy`, `Stop`, `Start`, `Delete`, `Containers`) with `project_adapter.go`, `stack_adapter.go`, `site_adapter.go`. **Not built.** The adapters would have been thin pass-throughs to the existing kind-specific code; the duplication is real but small and the per-kind paths still type-check cleanly. The data-layer "Workload" (DB row) is the only Workload primitive today. Revisit if the per-kind branching becomes painful.
+- **`internal/api/instances.go` URL space.** Plan said "delete or alias to /api/containers." **Kept alive** but every handler that mutates a container now calls `resolveAndAuthorizeInstance` to verify the row's `(workload_id, role)` match the URL's `(project_id, stage_name)` — closes the cross-project hijack the security review flagged. URL renaming deferred until the frontend `InstanceCard` is renamed too (next refactor wave).
+- **`InstanceCard.svelte` rename.** The component is now generic enough to be `ContainerCard`, but the rename would touch 3+ call sites and i18n keys. Deferred.
+- **`extra_json` SQL column.** Schema still has the column (NOT NULL DEFAULT '{}'); Go code no longer references it (struct field, scan, INSERT, UPDATE all dropped). When/if a kind-specific need surfaces, hoist a dedicated column rather than re-introducing JSON1.
+
+### Built but not in the original plan
+
+- **`Container.stage_id` column** + index + ListProxyRoutes / ListContainersByStageID join. Survives stage renames; the original plan joined on `stages.name = containers.role` which would orphan rows on rename. The deployer populates `stage_id` for project containers; stack/site rows leave it empty.
+- **`store.ReconcileContainer`** — separate write path for the reconciler. The original `UpsertContainer` ON CONFLICT clause overwrote `subdomain`, `proxy_route_id`, `npm_proxy_id`, `image_tag` from the reconciler's empty values on every 30s tick, silently wiping deployer state. `ReconcileContainer` only updates Docker-derived fields on conflict (`container_id`, `image_ref`, `state`, `port`, `last_seen_at`, `updated_at`).
+- **Workload-existence check in the reconciler** — a `tinyforge.workload.id` label that doesn't resolve to a known workload is now rejected. Anyone with Docker socket access could otherwise spawn a container with a forged label and steal the canonical row for an existing workload.
+- **Project-kind row invention skipped.** When the reconciler sees a container with `tinyforge.workload.kind=project` and no existing row matches the docker container ID, it skips the upsert (deployer is the authoritative writer for project rows). Inventing a deterministic-key row would race with `MaxInstances > 1` deploys.
+- **Reconciler shutdown ordering** — `Stop()` cancels its child context before `wg.Wait()` so a hung `docker ps` doesn't block process shutdown.
+- **Transactional CRUD + workload sync.** Every `Create*`, `Update*`, `Delete*`, and `Set*Secret` path on `projects` / `stacks` / `static_sites` now wraps the parent UPDATE and the workload row sync in a single transaction. Closes the rotation-durability gap the security review flagged.
+- **Workload-only webhook lookup.** The legacy fallback (`GetProjectByWebhookSecret`, `GetStaticSiteByWebhookSecret`) is gone — webhook routing reads exclusively through `workloads.webhook_secret`, so a rotation that didn't commit doesn't get silently accepted.
+- **`store.GetStackByComposeProjectName`** + indexed lookup. Reconciler used to do a full-table stack scan per compose container per tick.
+- **`store.ListMissingSweepRows`** — filtered query (`container_id != '' AND state != 'missing'`) so the missing-sweep reads only candidate rows instead of the whole index.
+- **`web/src/lib/components/WorkloadContainers.svelte`** — generic detail-page panel reusable by stack and site detail pages. Project detail keeps its stage-grouped `InstanceCard` layout (containers there are sharded per-stage, not flat).
+- **Containers page polish** — kind/state filters now apply client-side over an unfiltered fetch (so tab counters reflect the whole population), URL-synced filters (`?kind=stack&state=running`) for shareable links, race-safe loads via a sequence number, full i18n with EN+RU strings, and a counter badge in the sidebar via `navCounts.containers`.
+- **`stage_id` migration.** New rows get `stage_id` from the deployer; legacy rows fall back to the (project_id, role=stage_name) join inside `ListContainersByStageID`.
@@ -0,0 +1,464 @@
+# Functionality Review — 2026-05-07
+
+Last 5 commits reviewed:
+
+1. `05440a5` feat(stats): resource metrics dashboard + sites logs/stats
+2. `0632f51` feat(webhook): per-project and per-site webhook URLs
+3. `e08acf5` refactor(settings): split General into focused pages
+4. `03d58a0` fix: treat naive backend timestamps as UTC for relative labels
+5. `90e6e59` feat: daemon health panel, brand-rail status chips, user timezone selector
+
+Method: desk review of `git diff HEAD~5 HEAD` plus targeted reads of large
+new components. No dev-server execution. Citations use absolute paths.
+
+## TL;DR
+
+- **Stats dashboard, daemon panel, timezone selector, settings split, and
+  per-entity webhooks all wire end-to-end** — every Go endpoint added in
+  these commits has a Svelte caller, every new field on the settings/health
+  shapes is rendered, and i18n is parallel-keyed in `en.json` and `ru.json`.
+- **One real flow gap:** the `WebhookPanel` confirm button (Project/Site
+  detail) does not auto-close when regenerate succeeds in the "no current
+  URL" case — it stays open until the user manually cancels. Minor.
+- **i18n is 99 % complete but three hardcoded English fallbacks slipped in:**
+  `'Docker daemon is not reachable.'` in `SystemDaemonsCard.svelte:98`, and
+  `'Service status'` / `'Close sidebar'` aria-labels plus `'Docker daemon · …
+  reachable'` / `'Proxy unreachable'` tooltips in `+layout.svelte` (lines
+  194, 201, 208, 225). All three are user-visible.
+- **Stats collector skips ticks when Docker is unreachable** but still calls
+  `prune` — confirmed safe, but the very first sample after a Docker outage
+  will show no system row for the outage window. Acceptable; documented in
+  code.
+- **Naive-UTC fix has full reach:** the fix lives in `toDate()` inside
+  `web/src/lib/format/datetime.ts:34-46`, so every one of the 15 components
+  that goes through `$fmt.*` benefits. `InstanceCard` was the only file
+  that had its own ad-hoc parser; that parser is removed.
+
+## Feature: Resource Metrics Dashboard (05440a5)
+
+**What it claims:** background CPU/memory/network/block I/O collector with
+configurable interval (5–300s, default 15) and retention (0–24h, default
+2h). New host snapshot/history/top-N API endpoints, ECharts visualisation,
+sites logs/stats reuse instance components, Docker-down 503 handling.
+
+**What works**
+
+- Collector lives in `internal/stats/collector.go:50-309`. It re-reads
+  settings every tick (`run`/`readConfig`), so `/settings/maintenance`
+  changes propagate within one tick. `interval=0` legitimately disables
+  collection (`run` polls settings every minute in that branch).
+- API endpoints and routing are wired: `internal/api/router.go:222,289-291,341-343`
+  mounts `/api/system/stats`, `/api/system/stats/history`,
+  `/api/system/stats/top`, plus the per-instance and per-site
+  `/stats/history` endpoints, all behind the auth middleware.
+- Frontend has matching helpers in `web/src/lib/api.ts:683-731`
+  (`fetchSystemStats`, `fetchSystemStatsHistory`, `fetchTopContainers`,
+  `fetchInstanceStatsHistory`, `fetchStaticSiteStats(s)History`,
+  `fetchStaticSiteLogs`).
+- `SystemResourcesCard.svelte:33-52` uses `Promise.allSettled` so a 503 on
+  the live snapshot does not blank out history (which is read from SQLite
+  and remains valid). Docker-unavailable detection at line 67 produces an
+  amber banner with the i18n key `resources.dockerUnavailable`.
+- `ContainerStats.svelte:13-15` and `ContainerLogs.svelte:14-16` define the
+  `StatsSource`/`LogSource` discriminated unions exactly as the commit
+  message describes; the site detail page uses both at
+  `web/src/routes/sites/[id]/+page.svelte:255-279`.
+- 30 m / 2 h / 6 h / 24 h window picker exists at
+  `SystemResourcesCard.svelte:213-220`. `parseWindow` in
+  `internal/api/stats_history.go:21-37` clamps any value to ≤ 24 h, so a
+  hand-crafted `?window=999h` query returns the maxed window (good).
+- History persistence survives backend restart — samples live in SQLite
+  (`container_stats_samples`, `system_stats_samples`); migrations in
+  `internal/store/store.go:128-180` create them additively with
+  `IF NOT EXISTS`.
+
+**Gaps / broken flows**
+
+- **Top-consumer rows are unlabelled by name.** `SystemResourcesCard.svelte:259-264`
+  shows only `s.container_id.slice(0,12)` plus an `instance | site` chip.
+  No project/site name, so identifying the offender requires manual lookup.
+  Backend already knows `owner_id`; resolving to a friendly name would be a
+  one-extra-fetch fix.
+- **No "stats off" UI hint.** When `stats_interval_seconds=0`, the
+  collector idles and history endpoints return `[]`. Frontend just shows
+  the "no samples yet" empty state with the *default* interval (15s)
+  hardcoded in the message (`resources.noSamples` in `en.json:51`,
+  `ru.json:51`) — it does not detect that collection is disabled. Users
+  who toggle stats off will see a confusing "samples every 15s" message
+  forever.
+- **Stats settings live on Maintenance page, not on a dedicated card.**
+  `web/src/routes/settings/maintenance/+page.svelte:117-132` has 4 fields
+  (stale, prune, stats interval, stats retention) sharing one Save button.
+  Not broken, but "Stats collection" is *not* maintenance — it's a runtime
+  observability feature. Worth a follow-up split.
+- **Top endpoint silently filters to last 2 minutes** (`stats_history.go:178`).
+  If the collector interval is 300 s, the 2-minute window holds no sample for
+  about three minutes out of every five, so the top widget will usually look
+  empty. The window should grow with the interval, e.g. `max(2*interval, 2m)`.
+
+**API/UI consistency**
+
+- All snake_case ↔ snake_case (Go `json:"…"` tags match the TS types in
+  `web/src/lib/types.ts:464-516`). Spot-checked
+  `ContainerStatsSample`, `SystemStats`, `SystemStatsSample` — perfect
+  alignment.
+- One subtle naming asymmetry: in `SystemStats` (live snapshot) the field
+  is `disk_total_bytes` and category breakdowns are `disk_images_bytes` etc.;
+  in `SystemStatsSample` (history row) the field is just `disk_total_bytes`
+  with no breakdown. The chart only uses workload CPU/memory percent, so
+  this is fine, but a future "disk over time" chart would have to either
+  query the live snapshot or the schema would have to grow.
+
+**i18n**
+
+- Full coverage. New keys live under `dashboard`, `resources`, and
+  `statsSettings` namespaces, mirrored in `ru.json:42-87`. No untranslated
+  strings in the touched files.
+
+## Feature: Per-Project and Per-Site Webhook URLs (0632f51)
+
+**What it claims:** replace global `settings.webhook_secret` with per-row
+secrets on `projects` and `static_sites`; remove webhook-driven autocreate;
+make site `sync_trigger=push|tag` actually trigger a sync.
+
+**What works**
+
+- Migration is additive and safe:
+  `internal/store/store.go:131-138` adds `webhook_secret TEXT NOT NULL DEFAULT ''`
+  to both tables and creates **partial unique indexes** (`WHERE webhook_secret != ''`)
+  at `store.go:240-241`, so multiple legacy rows with empty secrets do not
+  collide.
+- Lazy backfill via `EnsureProjectWebhookSecret` /
+  `EnsureStaticSiteWebhookSecret` (`internal/store/projects.go:158-171`,
+  `internal/store/static_sites.go:296-308`). UI calls `GET /webhook` first,
+  which triggers backfill — old projects "just work" the first time you
+  open them.
+- Routing in `internal/webhook/handler.go:127-133`:
+  `POST /api/webhook/{secret}` for projects, `POST /api/webhook/sites/{secret}`
+  for sites. Both return 404 for unknown/empty secrets (no information leak).
+  The order (`/sites/{secret}` first, then `/{secret}`) is correct chi-wise
+  because the literal `sites` segment beats the catch-all.
+- `siteRefMatches` (`internal/webhook/matcher.go:46-90`) implements push and
+  tag separately, with empty-Branch ⇒ accept-any-heads, and empty-TagPattern
+  ⇒ `*`. Manual sites short-circuit at `handler.go:295-303`.
+- Tests cover both happy and sad paths:
+  - `internal/webhook/matcher_test.go` (push, tag, manual, empty branch,
+    `ParseImageRef` cases)
+  - `internal/webhook/handler_test.go` (unknown-secret 404, image mismatch,
+    no-stage-match 200/skip, site push match, site manual skip,
+    site branch mismatch).
+- `WebhookPanel.svelte` is generic, used by both detail pages
+  (`projects/[id]/+page.svelte:771-776`, `sites/[id]/+page.svelte:283-288`).
+  Absolutises the URL with `window.location.origin` at line 30 so users can
+  copy a working URL.
+- Old global routes removed: no `/api/settings/webhook-url` or
+  `/api/settings/webhook-url/regenerate` in the diff (router.go:387-388
+  shows the deletion).
+
+**Gaps / broken flows**
+
+- **WebhookPanel race / minor UX**: `handleRegenerate` (lines 47-57) hides
+  the confirm strip *before* the network call. If the call fails, the user
+  sees the toast but the regenerate button reappears with no inline state.
+  Acceptable, but a "retry" affordance would help.
+- **Project image guardrail bypass when `project.Image` is empty.**
+  `handler.go:206-214`: the check is `if project.Image != "" && !imageMatches(...)`.
+  A project with an unset image accepts *any* image. Fine if treated as
+  intentional (commit message says guardrail is misconfig protection, not
+  security), but worth flagging.
+- **No "test webhook" button anywhere.** With per-entity URLs, users have
+  no way to verify before pointing CI at it. The git diff doesn't add a
+  ping endpoint either. Follow-up.
+- **Settings › Integrations page has a dead-end card** for incoming
+  webhooks (`integrations/+page.svelte:91-94`): just text saying "go to
+  the project page". No link, no list of projects. Adds friction.
+
+**API/UI consistency**
+
+- `WebhookUrlResponse` shape matches between Go (`internal/api/webhooks.go:17-20`)
+  and TS (`web/src/lib/api.ts:325-328`).
+- `Project.WebhookSecret` and `StaticSite.WebhookSecret` use `json:"-"`
+  (`internal/store/models.go:14, 253`) — secrets never leak through the
+  general project/site list endpoints. Good.
+
+**i18n**
+
+- New keys `projectDetail.webhookTitle/webhookDesc`, `sites.webhookTitle/webhookDesc`,
+  `webhookPanel.*`, `settingsIntegrations.*` exist in both `en.json` and
+  `ru.json`. Verified parallel structure.
+
+## Feature: Settings Page Split (e08acf5)
+
+**What it claims:** split the 547-line `settings/+page.svelte` into
+focused pages; group the sidebar; each page does its own partial PUT.
+
+**Sidebar groups** (from `+layout.svelte:32-50` and `64-72`):
+
+- *Overview*: General, Integrations
+- *Routing*: Registries, NPM/Traefik (conditional), DNS
+- *System*: Maintenance, Backups
+- *Security*: Authentication
+
+**Old setting → new page mapping**
+
+| Old setting (HEAD~5 `+page.svelte`) | New location | Status |
+|---|---|---|
+| Domain / Server IP / Public IP | `/settings` (Overview) | ✓ kept |
+| Network / Subdomain pattern | `/settings` | ✓ kept |
+| Polling interval / Base volume path | `/settings` | ✓ kept |
+| Notification URL | `/settings/integrations` | ✓ moved |
+| Stale threshold | `/settings/maintenance` | ✓ moved |
+| Image prune threshold | `/settings/maintenance` (Danger zone card) | ✓ moved |
+| Prune Images button | `/settings/maintenance` | ✓ moved into separate Danger card |
+| Wildcard DNS / Cloudflare token / Zone | `/settings/dns` | ✓ moved |
+| Test DNS connection | `/settings/dns` | ✓ moved |
+| Proxy provider radio | `/settings` | ✓ kept (with link to `/settings/npm` or `/settings/traefik`) |
+| **Global webhook URL** | n/a — feature removed (per-entity now) | ✓ intentional |
+| Stats interval / retention (NEW) | `/settings/maintenance` | ✓ added in same commit's diff |
+
+**Verdict:** every setting from the old page is reachable. Nothing
+orphaned. Credentials page (`/settings/credentials/+page.svelte`) was
+deleted and the sidebar entry was already gone at HEAD~5, so no broken
+link. Checked by code reading: the sidebar's `provider`-conditional NPM /
+Traefik items are still wired (`+layout.svelte:54-55`).
+
+**Gaps / broken flows**
+
+- **Each page issues an independent `getSettings()` on mount.** Navigating
+  through the sidebar reloads the entire 30-field settings blob each time.
+  Not broken, but a shared cache or layout-level fetch would halve the
+  payload. Follow-up.
+- **Save scoping is correct** — each page builds a `Partial<Settings>` of
+  only its own keys (e.g. `maintenance/+page.svelte:54-59`). Confirmed by
+  reading all four split pages.
+- **DNS page does not have an inline link to fall back from "test failed"**
+  to the General/proxy page. Minor.
+
+**i18n**
+
+- New `settings.groupMain/groupProxy/groupSystem/groupSecurity`,
+  `settingsDns.*`, `settingsIntegrations.*`, `settingsMaintenance.*`,
+  `statsSettings.*`, `settingsGeneral.globalConfigDesc/configureNpm/...`
+  all present in both locales.
+
+## Fix: Naive UTC Timestamp Handling (03d58a0)
+
+**Reach:** the fix is in `toDate()` (`web/src/lib/format/datetime.ts:34-46`)
+via `normalizeIsoUtc`. **Every** consumer of `$fmt.*` therefore inherits
+the fix:
+
+```
+web/src/routes/+layout.svelte
+web/src/routes/+page.svelte
+web/src/routes/projects/+page.svelte
+web/src/routes/projects/[id]/+page.svelte
+web/src/routes/projects/[id]/volumes/[volId]/browse/+page.svelte
+web/src/routes/sites/+page.svelte
+web/src/routes/sites/[id]/+page.svelte
+web/src/routes/stacks/+page.svelte
+web/src/routes/stacks/[id]/+page.svelte
+web/src/routes/settings/backup/+page.svelte
+web/src/lib/components/EventLogEntry.svelte
+web/src/lib/components/InstanceCard.svelte
+web/src/lib/components/StaleContainerCard.svelte
+web/src/lib/components/TimezoneSelector.svelte
+```
+
+**Audit for stragglers:** `Grep new Date(` across the frontend returns 5
+files. Two are inside `format/datetime.ts` and `stores/timezone.ts` (the
+fix itself); two are in the `TimezoneSelector` and `+layout.svelte` clock
+ticker (`new Date()` with no input — current time, not affected); one is
+`routes/events/+page.svelte:55` building a `since` *query parameter* that
+is sent to the backend, never displayed. Conclusion: **fix has 100 % reach
+for displayed timestamps**.
+
+`InstanceCard.svelte` lost its private `timeSinceCreated` parser
+(commit diff lines 32-43); now uses `$fmt.relative(instance.created_at)`.
+
+## Feature: Daemon Health Panel + Timezone Selector (90e6e59)
+
+### Daemon health panel
+
+**What it claims:** rich Docker /info + /version + NPM aggregates exposed
+via `/api/health`; status chips moved into the brand block; new
+`SystemDaemonsCard` on the dashboard; shared health store de-duplicates
+the 30 s poll.
+
+**What works**
+
+- `GET /api/health` (`internal/api/health.go:6-39`) now returns
+  `database`, `docker` (+ rich info), and conditionally `proxy` (with
+  NPM aggregates). 8 s timeout, NPM fields fetched only when ping succeeds
+  so an offline proxy doesn't amplify latency.
+- `health.ts:38-66` shared store with single 30 s poll; the layout
+  consumes it via `$health.docker/proxy/checked` (`+layout.svelte:53-56`)
+  and `SystemDaemonsCard.svelte:13-19` does the same. No duplicate
+  fetches — verified by the `inFlight` guard at `health.ts:37`.
+- Both panels render the rich payload: container running/paused/stopped
+  stacked bar, version/api/platform/kernel/cpu/memory/storage/images,
+  latency, root dir. Proxy panel shows total vs managed proxy hosts (with
+  proportion meter), access lists, certificates.
+- Brand-rail chips at `+layout.svelte:201-242` show DKR + NPM/TRF, with
+  pulse animation classes (`chip-live`/`chip-down`), running container
+  count, and proxy host count. Click on a down chip toggles `hintsExpanded`.
+
+**Daemons checked, by name:**
+
+- **Docker Engine** — connected via socket; "unhealthy" means the ping
+  failed (text from `Ping`) or the client wasn't initialised. The user
+  hint is `daemons.dockerHint` ("Check that the Docker daemon is running…").
+- **Proxy provider** — only checked when one is configured (NPM or Traefik).
+  "Unhealthy" means `Ping` failed; the panel surfaces `proxy.error` and
+  the configured URL. If proxy_provider=`none`, panel shows
+  "Not configured" with a CTA link to `/settings`.
+- **Database** — included in the JSON response but not surfaced on the
+  daemons card. The brand-rail also does not show a DB chip; if SQLite
+  is unreachable the chip rail goes "BOOT" forever (since
+  `health.ts:50-57` falls back to `prev.docker ?? {connected:false}` and
+  drops `database`). Minor — but a permanently-unreachable SQLite would
+  leave the user wondering why everything is dead with no indicator.
+
+**Gaps / broken flows**
+
+- **Hardcoded English fallbacks** (i18n leak):
+  - `web/src/routes/+layout.svelte:194` `aria-label="Close sidebar"` (was already English)
+  - `web/src/routes/+layout.svelte:201` `aria-label="Service status"` (new in this commit)
+  - `web/src/routes/+layout.svelte:208` tooltip
+    `` `Docker daemon · ${dockerHealth?.version ?? 'reachable'}` `` —
+    "Docker daemon" and "reachable" are English literals; commit added this code
+  - `web/src/routes/+layout.svelte:208` fallback `'Docker unreachable'`
+  - `web/src/routes/+layout.svelte:225` fallback `'Proxy unreachable'`
+  - `web/src/lib/components/SystemDaemonsCard.svelte:98` fallback
+    `'Docker daemon is not reachable.'`
+- **Refresh button has no debounce window**, only an in-flight guard
+  (`SystemDaemonsCard.svelte:53-61`). Spamming it triggers serial calls.
+  Acceptable.
+- **No DB-down indicator** anywhere visible to the user. Edge case but
+  worth noting.
+
+**API/UI consistency**
+
+- All Docker fields the frontend consumes (`web/src/lib/types.ts:258-285`)
+  are emitted by `dockerHealth` in `internal/api/health.go:60-100`. Cross-checked
+  every key (version, api_version, os, arch, kernel, storage_driver, root_dir,
+  ncpu, memory_total, containers, running, paused, stopped, images,
+  latency_ms). Matches.
+- `ProxyHealth` TS shape (`types.ts:289-296`) matches Go fields:
+  `provider`, `connected`, `error`, `latency_ms`, `url`, `proxy_hosts`,
+  `proxy_hosts_managed`, `access_lists`, `certificates`. Matches.
+
+**i18n**
+
+- `daemons.*` namespace fully translated in both `en.json:917-953` and
+  `ru.json:917-953` (parallel keys verified). The hardcoded strings above
+  are the only gaps.
+
+### Timezone selector
+
+**What it claims:** user IANA timezone preference with auto-detect,
+applied across all `$fmt.*` rendering, persisted in localStorage.
+
+**Persistence**
+
+- Stored at `localStorage.dw_timezone` via subscriber on the `timezonePreference`
+  writable (`web/src/lib/stores/timezone.ts:12,55-59`). Re-read on next page
+  load by `getInitialPreference` (lines 44-50). Validates the IANA string
+  before accepting it, falling back to `auto`.
+- "Auto" is a sentinel; `effectiveTimezone` derives a concrete IANA zone
+  from `Intl.DateTimeFormat().resolvedOptions().timeZone` on every read
+  (lines 66-69), so changing browser zone with auto enabled re-resolves.
+
+**Application reach**
+
+- `effectiveTimezone` is consumed by `makeFormatters` in `datetime.ts:117-119`,
+  which is the single source for the entire `$fmt` reactive store. Every
+  `$fmt.dateTime`, `$fmt.date`, `$fmt.relative` etc. respects the user
+  zone. **Verified across all 15 consumers listed under the naive-UTC fix
+  section.**
+- One subtle case: `$fmt.relative` is timezone-independent (`datetime.ts:142-156`),
+  which is correct — "5 m ago" doesn't depend on display zone.
+
+**Gaps / broken flows**
+
+- **Selector lives only on `/settings`.** Reasonable home, but no quick
+  "switch zone" affordance from the brand rail or top bar; you have to
+  navigate. Minor.
+- **No backend record.** The preference is browser-local, so a fresh device
+  falls back to `auto` (the browser's detected zone), not the saved choice.
+  Commit message acknowledges this ("purely client-side preference"). Acceptable.
+
+**i18n**
+
+- Full `timezone.*` namespace in both locales (`en.json:1117-1136`,
+  `ru.json:1117-1136`). Picker placeholder is translated.
+
+## Cross-cutting Issues
+
+### i18n leaks
+
+Three runtime strings in user-visible places are still English-only:
+
+1. `web/src/routes/+layout.svelte:201` `aria-label="Service status"` (new)
+2. `web/src/routes/+layout.svelte:208,225` chip tooltips include
+   English literals (`'Docker daemon'`, `'reachable'`, `'Docker unreachable'`,
+   `'Proxy unreachable'`).
+3. `web/src/lib/components/SystemDaemonsCard.svelte:98` fallback message
+   when `docker.error` is empty.
+
+`+layout.svelte:194` (`Close sidebar`) was already English at HEAD~5; not a
+regression but worth fixing while in the area.
+
+### Naming consistency
+
+- Backend uses `snake_case` JSON tags everywhere (`disk_total_bytes`,
+  `latency_ms`, `proxy_hosts_managed`). TypeScript interfaces use the same.
+  No drift detected.
+- One naming asymmetry: `Settings.WebhookSecret` was deleted from the
+  Go struct — clean removal. `internal/store/static_sites.go:233`,
+  `projects.go:53` use new column. SQLite column `webhook_secret` on
+  `settings` table is left alone (per the migration comment); no row
+  emits it, so it's dead weight but harmless.
+
+### Dashboard polling
+
+`SystemResourcesCard` polls every 15 s on its own (`SystemResourcesCard.svelte:79`).
+`ContainerStats` polls every 30 s. `health` store polls every 30 s.
+`navCounts` store polls separately. Multiple uncoordinated timers; OK in
+practice, but a future optimisation candidate.
+
+### Confirm dialog UX
+
+Both `WebhookPanel` and the maintenance "Prune Images" Danger zone use
+inline confirms / `ConfirmDialog`. Consistent. The brand-rail "click a down
+chip to expand hints" is a third confirm-ish pattern, fine but not
+discoverable.
+
+## Suggested Follow-ups (prioritized)
+
+1. **Localise the three hardcoded English strings** in
+   `web/src/routes/+layout.svelte:194,201,208,225` and
+   `SystemDaemonsCard.svelte:98`. ~15 min, replaces 5 literals with
+   `$t('daemons.…')` keys (which already exist for most cases — e.g.
+   `daemons.docker`, `daemons.offline`).
+2. **Add owner-name resolution to the "top consumers" widget**
+   (`SystemResourcesCard.svelte:259-264`). Currently only a 12-char ID +
+   `instance|site` chip; users have no way to know which container is
+   spiking.
+3. **Detect "stats collection disabled" (`stats_interval_seconds=0`) and
+   tailor the empty-state message** in `SystemResourcesCard.svelte`
+   instead of always saying "samples every 15 s".
+4. **Remove the dead `webhook_secret` column on `settings`** in a future
+   destructive migration window, OR officially document it as deprecated
+   in the schema comment.
+5. **Add a "Test webhook" button to `WebhookPanel.svelte`** — POSTs a
+   minimal payload to the URL and surfaces the response. Replaces
+   guesswork when wiring CI.
+6. **Add a DB-down indicator** to the brand rail (a 3rd chip "DB"). The
+   data is already in `/api/health`; only the UI needs the chip.
+7. **Top-N samples 2-minute window** in `internal/api/stats_history.go:178`
+   should scale with collector interval (`max(2*interval, 2m)`) so users
+   on slow intervals don't see a falsely-empty widget.
+8. **Settings › Integrations dead-end card** — link to the Projects and
+   Sites lists rather than just text saying "go look there".
+9. **Auto-close the WebhookPanel confirm strip on success** (it already
+   resets, but the strip stays visible until the user clicks Cancel).
@@ -0,0 +1,159 @@
+# Outgoing webhooks
+
+Tinyforge posts JSON events to a configured URL when deploys and static-site
+syncs finish. Receivers can verify each request was sent by Tinyforge and
+not tampered with by checking the **HMAC-SHA256** signature on the body.
+
+## Tiers and resolution
+
+A single global URL is rarely enough — different teams own different
+projects, and operators often want to route prod failures somewhere noisier
+than dev failures. Tinyforge supports four tiers:
+
+| Tier      | Where set                              | Used for                |
+|-----------|----------------------------------------|-------------------------|
+| `stage`   | Stage edit form                         | Per-stage deploys       |
+| `project` | Project edit form                       | All stages of a project |
+| `site`    | Static-site detail page                 | Static-site sync events |
+| `settings`| Settings → Integrations                 | Global fallback         |
+
+Resolution order:
+
+* **Deploys**: `stage → project → settings`
+* **Sites**:   `site → settings`
+
+The most-specific tier with a non-empty URL wins. The signing secret
+travels with the URL that sourced it: a stage can sign even when the
+project and global URLs are unsigned.
+
+## Signature scheme
+
+Every request includes:
+
+```
+POST /your/handler HTTP/1.1
+Content-Type: application/json
+User-Agent: Tinyforge-Webhook/1
+X-Hub-Signature-256: sha256=<hex>
+X-Tinyforge-Event: deploy_success
+X-Tinyforge-Delivery: 0f3a…-uuid
+X-Tinyforge-Timestamp: 2026-05-07T12:34:56Z
+X-Tinyforge-Tier: stage
+```
+
+The signature is `HMAC-SHA256(secret, raw_body)`, hex-encoded, with the
+`sha256=` prefix — the GitHub `X-Hub-Signature-256` format. Receivers
+already built for GitHub-style webhooks (Gitea, Forgejo, n8n, Hookdeck,
+the service-to-notification-bridge generic webhook provider) verify it
+without modification.
+
+When no signing secret is configured for the resolved tier, the
+signature header is omitted and the request goes out unsigned. This is
+intentional back-compat for receivers that don't speak HMAC.
+
+## Receiver requirements
+
+A correct verifier:
+
+1. **Reads the raw body bytes** before any JSON parse / re-serialise.
+2. Computes `HMAC-SHA256(secret, body)` and compares to the value after
+   `sha256=` in `X-Hub-Signature-256`.
+3. Uses a **constant-time** comparator (`hmac.compare_digest` /
+   `crypto.timingSafeEqual` / `hmac.Equal`).
+4. Returns 401/403 on mismatch — Tinyforge surfaces the receiver's
+   status code in the UI when the operator clicks **Send test**.
+
+### Node / TypeScript
+
+```ts
+import { createHmac, timingSafeEqual } from 'crypto';
+
+// Returns true only when `header` carries the hex HMAC-SHA256 of `rawBody`
+// under `secret`; the optional `sha256=` prefix is stripped first.
+// The hex *text* is compared rather than the decoded bytes: Buffer.from(x, 'hex')
+// silently truncates at the first non-hex character, which would make
+// timingSafeEqual throw on a malformed 64-character header instead of
+// letting this function return false.
+export function verify(secret: string, rawBody: Buffer, header: string): boolean {
+  const hex = header.startsWith('sha256=') ? header.slice(7) : header;
+  // Lower-case so upper-case hex from a sender is still accepted.
+  const got = Buffer.from(hex.toLowerCase(), 'utf8');
+  const want = Buffer.from(createHmac('sha256', secret).update(rawBody).digest('hex'), 'utf8');
+  // timingSafeEqual requires equal byte lengths, so guard before calling it.
+  return got.length === want.length && timingSafeEqual(got, want);
+}
+```
+
+### Python
+
+```python
+import hmac
+import hashlib
+
+def verify(secret: str, raw_body: bytes, header: str) -> bool:
+    got = header[7:] if header.startswith("sha256=") else header
+    want = hmac.new(secret.encode(), raw_body, hashlib.sha256).hexdigest()
+    return hmac.compare_digest(got, want)
+```
+
+### Go
+
+```go
+import (
+    "crypto/hmac"
+    "crypto/sha256"
+    "encoding/hex"
+    "strings"
+)
+
+// verify reports whether header (with or without the "sha256=" prefix)
+// equals the hex-encoded HMAC-SHA256 of body keyed with secret.
+func verify(secret string, body []byte, header string) bool {
+    h := hmac.New(sha256.New, []byte(secret))
+    h.Write(body) // hash.Hash writes never return an error
+    expected := hex.EncodeToString(h.Sum(nil))
+    provided := strings.TrimPrefix(header, "sha256=")
+    // hmac.Equal compares without leaking timing information.
+    return hmac.Equal([]byte(provided), []byte(expected))
+}
+```
+
+## Event payload
+
+```json
+{
+  "type": "deploy_success",
+  "project": "demo-app",
+  "stage": "prod",
+  "image_tag": "v1.4.2",
+  "subdomain": "stage-prod-demo-app",
+  "url": "https://stage-prod-demo-app.example.com",
+  "timestamp": "2026-05-07T12:34:56Z"
+}
+```
+
+`type` values:
+- `deploy_success`, `deploy_failure` — sent by the deployer.
+- `site_sync_success`, `site_sync_failure` — sent by the static-site manager.
+  Use the `project` field as the site name; `stage` and `image_tag` are empty.
+- `test` — sent by the **Send test** button. Treat it as a no-op or
+  surface it in your operator log; never as a real deploy event.
+
+## Configuring the service-to-notification-bridge
+
+If you're sending Tinyforge events to the
+[service-to-notification-bridge](https://github.com/) generic webhook
+provider:
+
+1. Create a **Generic Webhook** provider.
+2. Set `auth_mode = hmac_sha256`.
+3. Paste the **same secret** Tinyforge generated (revealed via the
+   Outgoing webhook panel).
+4. Set `event_type_path = type` so deploys and site syncs map to
+   distinct event types in the bridge.
+5. Add `payload_mappings` for `project`, `stage`, `image_tag`, `url`,
+   `error` and reference them as `{{ extra.project }}` in your
+   notification templates.
+
+The bridge accepts `X-Hub-Signature-256` natively (no header rename
+needed) and reads the raw body before parsing, so step 1 of the
+receiver requirements is already met.
+
+## Rotating secrets
+
+Click **Regenerate** in the Outgoing webhook panel to rotate. The old
+secret is invalidated immediately — update receivers in lock-step or
+expect a brief window of 401s. There is no soft rollover today.
+
+To send unsigned events to a legacy receiver that can't verify, click
+**Disable signing**. Tinyforge will keep dispatching events without the
+`X-Hub-Signature-256` header until you regenerate.
@@ -1,8 +1,6 @@
-module github.com/alexei/docker-watcher
+module github.com/alexei/tinyforge

-go 1.24.0
-
-toolchain go1.25.0
+go 1.25.0

 require (
 	github.com/coreos/go-oidc/v3 v3.11.0
@@ -36,12 +34,14 @@ require (
 	github.com/opencontainers/go-digest v1.0.0 // indirect
 	github.com/opencontainers/image-spec v1.1.1 // indirect
 	github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec // indirect
+	github.com/yuin/goldmark v1.8.2 // indirect
 	go.opentelemetry.io/auto/sdk v1.1.0 // indirect
 	go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.60.0 // indirect
 	go.opentelemetry.io/otel v1.35.0 // indirect
 	go.opentelemetry.io/otel/metric v1.35.0 // indirect
 	go.opentelemetry.io/otel/trace v1.35.0 // indirect
 	golang.org/x/mod v0.18.0 // indirect
+	golang.org/x/sync v0.20.0 // indirect
 	golang.org/x/sys v0.33.0 // indirect
 	golang.org/x/tools v0.22.0 // indirect
 	modernc.org/libc v1.55.3 // indirect
@@ -63,6 +63,8 @@ github.com/rogpeppe/go-internal v1.13.1 h1:KvO1DLK/DRN07sQ1LQKScxyZJuNnedQ5/wKSR
 github.com/rogpeppe/go-internal v1.13.1/go.mod h1:uMEvuHeurkdAXX61udpOXGD/AzZDWNMNyH2VO9fmH0o=
 github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA=
 github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
+github.com/yuin/goldmark v1.8.2 h1:kEGpgqJXdgbkhcOgBxkC0X0PmoPG1ZyoZ117rDVp4zE=
+github.com/yuin/goldmark v1.8.2/go.mod h1:ip/1k0VRfGynBgxOz0yCqHrbZXhcjxyuS66Brc7iBKg=
 go.opentelemetry.io/auto/sdk v1.1.0 h1:cH53jehLUN6UFLY71z+NDOiNJqDdPRaXzTel0sJySYA=
 go.opentelemetry.io/auto/sdk v1.1.0/go.mod h1:3wSPjt5PWp2RhlCcmmOial7AvC4DQqZb7a7wCow3W8A=
 go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.60.0 h1:sbiXRNDSWJOTobXh5HyQKjq6wUC5tNybqjIqDpAY4CU=
@@ -85,6 +87,8 @@ golang.org/x/oauth2 v0.25.0 h1:CY4y7XT9v0cRI9oupztF8AgiIu99L/ksR/Xp/6jrZ70=
 golang.org/x/oauth2 v0.25.0/go.mod h1:XYTD2NtWslqkgxebSiOHnXEap4TF09sJSc7H1sXbhtI=
 golang.org/x/sync v0.7.0 h1:YsImfSBoP9QPYL0xyKJPq0gcaJdG3rInoqxTWbfQu9M=
 golang.org/x/sync v0.7.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
+golang.org/x/sync v0.20.0 h1:e0PTpb7pjO8GAtTs2dQ6jYa5BWYlMuX047Dco/pItO4=
+golang.org/x/sync v0.20.0/go.mod h1:9xrNwdLfx4jkKbNva9FpL6vEN7evnE43NNNJQ2LF3+0=
 golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
 golang.org/x/sys v0.33.0 h1:q3i8TbbEz+JRD9ywIRlyRAQbM0qF7hu24q3teo2hbuw=
 golang.org/x/sys v0.33.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k=
@@ -0,0 +1,105 @@
+package api
+
+import (
+	"errors"
+	"net/http"
+	"strings"
+
+	"github.com/alexei/tinyforge/internal/store"
+	"github.com/go-chi/chi/v5"
+)
+
+// listApps serves GET /api/apps. It writes every app returned by the store
+// as a 200 JSON body, or a 500 with a fixed message when the store call fails.
+func (s *Server) listApps(w http.ResponseWriter, r *http.Request) {
+	apps, listErr := s.store.ListApps()
+	if listErr == nil {
+		respondJSON(w, http.StatusOK, apps)
+		return
+	}
+	respondError(w, http.StatusInternalServerError, "list apps")
+}
+
+// getApp serves GET /api/apps/{id}. The "id" route parameter is looked up in
+// the store; store.ErrNotFound maps to a not-found response, any other error
+// to a 500, and success to a 200 JSON body with the app.
+func (s *Server) getApp(w http.ResponseWriter, r *http.Request) {
+	app, err := s.store.GetAppByID(chi.URLParam(r, "id"))
+	switch {
+	case err == nil:
+		respondJSON(w, http.StatusOK, app)
+	case errors.Is(err, store.ErrNotFound):
+		respondNotFound(w, "app")
+	default:
+		respondError(w, http.StatusInternalServerError, "get app")
+	}
+}
+
+// createApp serves POST /api/apps. Body: {"name": "...", "description": "..."}.
+// The name is trimmed of surrounding whitespace and must be non-empty.
+// A store error is returned to the client verbatim as a 400; success is a
+// 201 carrying the created app.
+func (s *Server) createApp(w http.ResponseWriter, r *http.Request) {
+	var body struct {
+		Name        string `json:"name"`
+		Description string `json:"description"`
+	}
+	if ok := decodeJSON(w, r, &body); !ok {
+		return
+	}
+
+	name := strings.TrimSpace(body.Name)
+	if name == "" {
+		respondError(w, http.StatusBadRequest, "name is required")
+		return
+	}
+
+	created, err := s.store.CreateApp(store.App{
+		Name:        name,
+		Description: body.Description,
+	})
+	if err != nil {
+		respondError(w, http.StatusBadRequest, err.Error())
+		return
+	}
+	respondJSON(w, http.StatusCreated, created)
+}
+
+// updateApp serves PUT /api/apps/{id}. The body is decoded first, then the
+// existing app is loaded. A blank (or whitespace-only) name keeps the current
+// name; the description is always replaced with the submitted value, even
+// when that value is empty. Responds 200 with the updated app.
+func (s *Server) updateApp(w http.ResponseWriter, r *http.Request) {
+	var body struct {
+		Name        string `json:"name"`
+		Description string `json:"description"`
+	}
+	if !decodeJSON(w, r, &body) {
+		return
+	}
+
+	app, err := s.store.GetAppByID(chi.URLParam(r, "id"))
+	switch {
+	case errors.Is(err, store.ErrNotFound):
+		respondNotFound(w, "app")
+		return
+	case err != nil:
+		respondError(w, http.StatusInternalServerError, "get app")
+		return
+	}
+
+	if trimmed := strings.TrimSpace(body.Name); trimmed != "" {
+		app.Name = trimmed
+	}
+	app.Description = body.Description
+
+	if saveErr := s.store.UpdateApp(app); saveErr != nil {
+		respondError(w, http.StatusInternalServerError, "update app")
+		return
+	}
+	respondJSON(w, http.StatusOK, app)
+}
+
+// deleteApp serves DELETE /api/apps/{id}. Workloads previously assigned to
+// this app become unassigned (app_id cleared); they are NOT deleted.
+// Responds 204 on success, not-found for store.ErrNotFound, 500 otherwise.
+func (s *Server) deleteApp(w http.ResponseWriter, r *http.Request) {
+	err := s.store.DeleteApp(chi.URLParam(r, "id"))
+	switch {
+	case err == nil:
+		w.WriteHeader(http.StatusNoContent)
+	case errors.Is(err, store.ErrNotFound):
+		respondNotFound(w, "app")
+	default:
+		respondError(w, http.StatusInternalServerError, "delete app")
+	}
+}
@@ -4,16 +4,41 @@ import (
 	"crypto/rand"
 	"encoding/hex"
 	"errors"
+	"fmt"
 	"log/slog"
 	"net/http"

 	"github.com/go-chi/chi/v5"

-	"github.com/alexei/docker-watcher/internal/auth"
-	"github.com/alexei/docker-watcher/internal/crypto"
-	"github.com/alexei/docker-watcher/internal/store"
+	"github.com/alexei/tinyforge/internal/auth"
+	"github.com/alexei/tinyforge/internal/crypto"
+	"github.com/alexei/tinyforge/internal/store"
 )

+// rateLimitedLogin returns a handler that applies per-IP rate limiting in
+// front of the login handler. The limiter key comes from clientIP(), so
+// X-Forwarded-For is honored only when the request arrives from a configured
+// trusted-proxy CIDR — remote attackers cannot spoof the header to dodge the
+// limiter. Requests over the limit get a 429 and never reach s.login.
+func (s *Server) rateLimitedLogin(rl *rateLimiter) http.HandlerFunc {
+	return func(w http.ResponseWriter, r *http.Request) {
+		if rl.allow(clientIP(r)) {
+			s.login(w, r)
+			return
+		}
+		respondError(w, http.StatusTooManyRequests, "too many login attempts, try again later")
+	}
+}
+
+// authMode serves GET /api/auth/mode — a public endpoint reporting the
+// configured auth mode. If the settings cannot be loaded it still answers
+// 200, falling back to "local".
+func (s *Server) authMode(w http.ResponseWriter, r *http.Request) {
+	mode := "local"
+	if settings, err := s.store.GetAuthSettings(); err == nil {
+		mode = settings.AuthMode
+	}
+	respondJSON(w, http.StatusOK, map[string]string{"auth_mode": mode})
+}
+
 // login handles POST /api/auth/login.
 func (s *Server) login(w http.ResponseWriter, r *http.Request) {
 	var req auth.LoginRequest
@@ -32,7 +57,8 @@ func (s *Server) login(w http.ResponseWriter, r *http.Request) {
 			respondError(w, http.StatusUnauthorized, "invalid credentials")
 			return
 		}
-		respondError(w, http.StatusInternalServerError, "failed to get user: "+err.Error())
+		slog.Error("failed to get user", "error", err)
+		respondError(w, http.StatusInternalServerError, "internal server error")
 		return
 	}

@@ -47,7 +73,8 @@ func (s *Server) login(w http.ResponseWriter, r *http.Request) {
 		Role:     user.Role,
 	})
 	if err != nil {
-		respondError(w, http.StatusInternalServerError, "failed to generate token: "+err.Error())
+		slog.Error("failed to generate token", "error", err)
+		respondError(w, http.StatusInternalServerError, "internal server error")
 		return
 	}

@@ -64,7 +91,8 @@ func (s *Server) currentUser(w http.ResponseWriter, r *http.Request) {

 	user, err := s.store.GetUserByID(claims.UserID)
 	if err != nil {
-		respondError(w, http.StatusInternalServerError, "failed to get user: "+err.Error())
+		slog.Error("failed to get user", "error", err)
+		respondError(w, http.StatusInternalServerError, "internal server error")
 		return
 	}

@@ -93,6 +121,7 @@ func (s *Server) oidcLogin(w http.ResponseWriter, r *http.Request) {
 		Path:     "/api/auth/oidc",
 		MaxAge:   300, // 5 minutes
 		HttpOnly: true,
+		Secure:   true,
 		SameSite: http.SameSiteLaxMode,
 	})

@@ -158,11 +187,13 @@ func (s *Server) oidcCallback(w http.ResponseWriter, r *http.Request) {
 				Role:     "viewer", // OIDC users default to viewer; admin promotes via settings
 			})
 			if err != nil {
-				respondError(w, http.StatusInternalServerError, "failed to create user: "+err.Error())
+				slog.Error("failed to create user", "error", err)
+				respondError(w, http.StatusInternalServerError, "internal server error")
 				return
 			}
 		} else {
-			respondError(w, http.StatusInternalServerError, "failed to get user: "+err.Error())
+			slog.Error("failed to get user", "error", err)
+			respondError(w, http.StatusInternalServerError, "internal server error")
 			return
 		}
 	}
@@ -173,20 +204,53 @@ func (s *Server) oidcCallback(w http.ResponseWriter, r *http.Request) {
 		Role:     user.Role,
 	})
 	if err != nil {
-		respondError(w, http.StatusInternalServerError, "failed to generate token: "+err.Error())
+		slog.Error("failed to generate token", "error", err)
+		respondError(w, http.StatusInternalServerError, "internal server error")
 		return
 	}

-	// Redirect to frontend with token in query parameter.
-	// The frontend extracts the token and stores it in localStorage.
-	http.Redirect(w, r, "/?token="+token.Token, http.StatusFound)
+	// Pass token via short-lived httpOnly cookie. The frontend reads it via
+	// a dedicated /api/auth/oidc/token endpoint and then the cookie is cleared.
+	http.SetCookie(w, &http.Cookie{
+		Name:     "auth_token",
+		Value:    token.Token,
+		Path:     "/api/auth/oidc",
+		MaxAge:   60,
+		HttpOnly: true,
+		Secure:   true,
+		SameSite: http.SameSiteLaxMode,
+	})
+	http.Redirect(w, r, "/?oidc=success", http.StatusFound)
+}
+
+// oidcExchangeToken serves POST /api/auth/oidc/token. It trades the
+// "auth_token" httpOnly cookie for a JSON body {"token": ...} and expires the
+// cookie in the same response. A missing or empty cookie yields a 401.
+func (s *Server) oidcExchangeToken(w http.ResponseWriter, r *http.Request) {
+	token := ""
+	if c, err := r.Cookie("auth_token"); err == nil {
+		token = c.Value
+	}
+	if token == "" {
+		respondError(w, http.StatusUnauthorized, "no OIDC token available")
+		return
+	}
+
+	// Expire the cookie right away; attributes mirror the ones it was set
+	// with so the browser matches and removes it.
+	expired := &http.Cookie{
+		Name:     "auth_token",
+		Value:    "",
+		Path:     "/api/auth/oidc",
+		MaxAge:   -1,
+		HttpOnly: true,
+		Secure:   true,
+		SameSite: http.SameSiteLaxMode,
+	}
+	http.SetCookie(w, expired)
+
+	respondJSON(w, http.StatusOK, map[string]string{"token": token})
 }

 // getAuthSettings handles GET /api/auth/settings.
 func (s *Server) getAuthSettings(w http.ResponseWriter, r *http.Request) {
 	as, err := s.store.GetAuthSettings()
 	if err != nil {
-		respondError(w, http.StatusInternalServerError, "failed to get auth settings: "+err.Error())
+		slog.Error("failed to get auth settings", "error", err)
+		respondError(w, http.StatusInternalServerError, "internal server error")
 		return
 	}
 	// Mask the client secret for the response.
@@ -228,7 +292,8 @@ func (s *Server) updateAuthSettings(w http.ResponseWriter, r *http.Request) {
 	}

 	if err := s.store.UpdateAuthSettings(req); err != nil {
-		respondError(w, http.StatusInternalServerError, "failed to update auth settings: "+err.Error())
+		slog.Error("failed to update auth settings", "error", err)
+		respondError(w, http.StatusInternalServerError, "internal server error")
 		return
 	}

@@ -244,7 +309,8 @@ func (s *Server) updateAuthSettings(w http.ResponseWriter, r *http.Request) {
 func (s *Server) listUsers(w http.ResponseWriter, r *http.Request) {
 	users, err := s.store.GetAllUsers()
 	if err != nil {
-		respondError(w, http.StatusInternalServerError, "failed to list users: "+err.Error())
+		slog.Error("failed to list users", "error", err)
+		respondError(w, http.StatusInternalServerError, "internal server error")
 		return
 	}
 	respondJSON(w, http.StatusOK, users)
@@ -266,14 +332,23 @@ func (s *Server) createUser(w http.ResponseWriter, r *http.Request) {
 		respondError(w, http.StatusBadRequest, "username and password are required")
 		return
 	}
+	if err := validatePassword(req.Password); err != nil {
+		respondError(w, http.StatusBadRequest, err.Error())
+		return
+	}

 	if req.Role == "" {
 		req.Role = "viewer"
 	}
+	if req.Role != "admin" && req.Role != "viewer" {
+		respondError(w, http.StatusBadRequest, "role must be 'admin' or 'viewer'")
+		return
+	}

 	hash, err := auth.HashPassword(req.Password)
 	if err != nil {
-		respondError(w, http.StatusInternalServerError, "failed to hash password: "+err.Error())
+		slog.Error("failed to hash password", "error", err)
+		respondError(w, http.StatusInternalServerError, "internal server error")
 		return
 	}

@@ -284,7 +359,8 @@ func (s *Server) createUser(w http.ResponseWriter, r *http.Request) {
 		Role:         req.Role,
 	})
 	if err != nil {
-		respondError(w, http.StatusInternalServerError, "failed to create user: "+err.Error())
+		slog.Error("failed to create user", "error", err)
+		respondError(w, http.StatusInternalServerError, "internal server error")
 		return
 	}

@@ -295,6 +371,13 @@ func (s *Server) createUser(w http.ResponseWriter, r *http.Request) {
 func (s *Server) deleteUser(w http.ResponseWriter, r *http.Request) {
 	id := chi.URLParam(r, "uid")

+	// Prevent deleting your own account.
+	claims, ok := auth.ClaimsFromContext(r.Context())
+	if ok && claims.UserID == id {
+		respondError(w, http.StatusBadRequest, "cannot delete your own account")
+		return
+	}
+
 	// Prevent deleting the last admin.
 	user, err := s.store.GetUserByID(id)
 	if err != nil {
@@ -302,7 +385,8 @@ func (s *Server) deleteUser(w http.ResponseWriter, r *http.Request) {
 			respondNotFound(w, "user")
 			return
 		}
-		respondError(w, http.StatusInternalServerError, "failed to get user: "+err.Error())
+		slog.Error("failed to get user", "error", err)
+		respondError(w, http.StatusInternalServerError, "internal server error")
 		return
 	}

@@ -323,9 +407,128 @@ func (s *Server) deleteUser(w http.ResponseWriter, r *http.Request) {
 	}

 	if err := s.store.DeleteUser(id); err != nil {
-		respondError(w, http.StatusInternalServerError, "failed to delete user: "+err.Error())
+		slog.Error("failed to delete user", "error", err)
+		respondError(w, http.StatusInternalServerError, "internal server error")
 		return
 	}

 	respondJSON(w, http.StatusOK, map[string]string{"deleted": id})
 }
+
+// validatePassword checks that a password meets the minimum length policy.
+//
+// The length is measured in Unicode code points (runes), not bytes, so the
+// "8 characters" promised by the error message also holds for non-ASCII
+// passwords: six Cyrillic letters occupy 12 bytes but are still too short.
+// Returns nil when the password is acceptable.
+func validatePassword(password string) error {
+	if len([]rune(password)) < 8 {
+		return fmt.Errorf("password must be at least 8 characters long")
+	}
+	return nil
+}
+
+// logout serves POST /api/auth/logout. When the request carries a token it
+// is revoked through the local auth provider; the response is always a 200
+// with {"status": "logged out"}, token or not.
+func (s *Server) logout(w http.ResponseWriter, r *http.Request) {
+	if presented := auth.ExtractToken(r); presented != "" {
+		s.localAuth.RevokeToken(presented)
+	}
+	respondJSON(w, http.StatusOK, map[string]string{"status": "logged out"})
+}
+
+// changePassword serves PUT /api/auth/users/{uid}/password. The body must
+// carry a non-empty "password" that passes validatePassword; it is hashed and
+// stored for the user named by the "uid" route parameter. Unknown users get a
+// not-found response; internal failures are logged and answered with a
+// generic 500.
+func (s *Server) changePassword(w http.ResponseWriter, r *http.Request) {
+	var body struct {
+		Password string `json:"password"`
+	}
+	if !decodeJSON(w, r, &body) {
+		return
+	}
+
+	// An absent password gets its own message before the policy check runs.
+	if body.Password == "" {
+		respondError(w, http.StatusBadRequest, "password is required")
+		return
+	}
+	if policyErr := validatePassword(body.Password); policyErr != nil {
+		respondError(w, http.StatusBadRequest, policyErr.Error())
+		return
+	}
+
+	hashed, hashErr := auth.HashPassword(body.Password)
+	if hashErr != nil {
+		slog.Error("failed to hash password", "error", hashErr)
+		respondError(w, http.StatusInternalServerError, "internal server error")
+		return
+	}
+
+	saveErr := s.store.UpdateUserPassword(chi.URLParam(r, "uid"), hashed)
+	switch {
+	case saveErr == nil:
+		respondJSON(w, http.StatusOK, map[string]string{"status": "password updated"})
+	case errors.Is(saveErr, store.ErrNotFound):
+		respondNotFound(w, "user")
+	default:
+		slog.Error("failed to update password", "error", saveErr)
+		respondError(w, http.StatusInternalServerError, "internal server error")
+	}
+}
+
+// updateUser handles PUT /api/auth/users/{uid}.
+//
+// Body: {"email": "...", "role": "..."}. Empty fields leave the stored value
+// unchanged. A non-empty role must be "admin" or "viewer". Demoting an admin
+// to viewer is refused when that user is the only admin left.
+//
+// Responses: 400 for an invalid role or a last-admin demotion, not-found for
+// an unknown uid, 500 (logged, generic message) for store failures, and 200
+// with the updated user on success.
+func (s *Server) updateUser(w http.ResponseWriter, r *http.Request) {
+	uid := chi.URLParam(r, "uid")
+
+	var req struct {
+		Email string `json:"email"`
+		Role  string `json:"role"`
+	}
+	if !decodeJSON(w, r, &req) {
+		return
+	}
+
+	if req.Role != "" && req.Role != "admin" && req.Role != "viewer" {
+		respondError(w, http.StatusBadRequest, "role must be 'admin' or 'viewer'")
+		return
+	}
+
+	existing, err := s.store.GetUserByID(uid)
+	if err != nil {
+		if errors.Is(err, store.ErrNotFound) {
+			respondNotFound(w, "user")
+			return
+		}
+		slog.Error("failed to get user", "error", err)
+		respondError(w, http.StatusInternalServerError, "internal server error")
+		return
+	}
+
+	// If demoting from admin, check we're not removing the last admin.
+	if existing.Role == "admin" && req.Role == "viewer" {
+		users, err := s.store.GetAllUsers()
+		if err != nil {
+			// Fail closed: without the user list we cannot prove another
+			// admin exists, so refuse rather than risk locking everyone out.
+			slog.Error("failed to list users", "error", err)
+			respondError(w, http.StatusInternalServerError, "internal server error")
+			return
+		}
+		adminCount := 0
+		for _, u := range users {
+			if u.Role == "admin" {
+				adminCount++
+			}
+		}
+		if adminCount <= 1 {
+			respondError(w, http.StatusBadRequest, "cannot demote the last admin user")
+			return
+		}
+	}
+
+	if req.Email != "" {
+		existing.Email = req.Email
+	}
+	if req.Role != "" {
+		existing.Role = req.Role
+	}
+
+	if err := s.store.UpdateUser(existing); err != nil {
+		slog.Error("failed to update user", "error", err)
+		respondError(w, http.StatusInternalServerError, "internal server error")
+		return
+	}
+
+	respondJSON(w, http.StatusOK, existing)
+}
@@ -0,0 +1,234 @@
+package api
+
+import (
+	"log/slog"
+	"net/http"
+	"os"
+	"path/filepath"
+	"strings"
+	"time"
+
+	"github.com/alexei/tinyforge/internal/store"
+	"github.com/go-chi/chi/v5"
+)
+
+// listBackups serves GET /api/backups. It answers 503 while no backup engine
+// is attached, 500 (including the engine's error text) when listing fails,
+// and otherwise 200 with the list of backups.
+func (s *Server) listBackups(w http.ResponseWriter, r *http.Request) {
+	if s.backupEngine == nil {
+		respondError(w, http.StatusServiceUnavailable, "backup engine not initialized")
+		return
+	}
+
+	found, listErr := s.backupEngine.ListBackups()
+	if listErr != nil {
+		respondError(w, http.StatusInternalServerError, "failed to list backups: "+listErr.Error())
+		return
+	}
+
+	// Swap a nil slice for an empty one — presumably so the JSON body is
+	// [] rather than null.
+	if found == nil {
+		found = []store.Backup{}
+	}
+	respondJSON(w, http.StatusOK, found)
+}
+
+// triggerBackup serves POST /api/backups. It creates a "manual" backup and
+// then, when settings load successfully and BackupRetentionCount is positive,
+// prunes old backups down to that count. Responds 201 with the new backup,
+// 503 without an engine, or 500 when creation fails.
+func (s *Server) triggerBackup(w http.ResponseWriter, r *http.Request) {
+	if s.backupEngine == nil {
+		respondError(w, http.StatusServiceUnavailable, "backup engine not initialized")
+		return
+	}
+
+	created, createErr := s.backupEngine.CreateBackup("manual")
+	if createErr != nil {
+		respondError(w, http.StatusInternalServerError, "failed to create backup: "+createErr.Error())
+		return
+	}
+
+	// Retention applies to manual backups as well; a settings read failure
+	// just skips pruning.
+	if cfg, cfgErr := s.store.GetSettings(); cfgErr == nil && cfg.BackupRetentionCount > 0 {
+		s.backupEngine.Prune(cfg.BackupRetentionCount)
+	}
+
+	respondJSON(w, http.StatusCreated, created)
+}
+
+// downloadBackup handles GET /api/backups/{id}/download.
+//
+// It looks up the backup record, resolves its file path, verifies the
+// resolved path lies inside the backup directory, and streams the file as an
+// attachment via http.ServeContent.
+//
+// Responses: 503 without an engine, 404 when the record or the file is
+// missing, 403 when the path escapes the backup directory, and 500 when a
+// path cannot be resolved or the file cannot be stat'ed.
+func (s *Server) downloadBackup(w http.ResponseWriter, r *http.Request) {
+	if s.backupEngine == nil {
+		respondError(w, http.StatusServiceUnavailable, "backup engine not initialized")
+		return
+	}
+
+	id := chi.URLParam(r, "id")
+	backup, err := s.backupEngine.GetBackup(id)
+	if err != nil {
+		respondError(w, http.StatusNotFound, "backup not found")
+		return
+	}
+
+	filePath := s.backupEngine.FilePath(backup)
+
+	// Validate the resolved path stays within the backup directory to prevent path traversal.
+	absPath, err := filepath.Abs(filePath)
+	if err != nil {
+		respondError(w, http.StatusInternalServerError, "failed to resolve backup path")
+		return
+	}
+	// The directory must resolve too: on failure Abs returns "", the prefix
+	// below would collapse to just the path separator, and every absolute
+	// path would pass the containment check.
+	absBackupDir, err := filepath.Abs(s.backupEngine.BackupDir())
+	if err != nil {
+		respondError(w, http.StatusInternalServerError, "failed to resolve backup path")
+		return
+	}
+	if !strings.HasPrefix(absPath, absBackupDir+string(filepath.Separator)) {
+		respondError(w, http.StatusForbidden, "access denied")
+		return
+	}
+
+	f, err := os.Open(absPath)
+	if err != nil {
+		respondError(w, http.StatusNotFound, "backup file not found on disk")
+		return
+	}
+	defer f.Close()
+
+	stat, err := f.Stat()
+	if err != nil {
+		respondError(w, http.StatusInternalServerError, "failed to read backup file")
+		return
+	}
+
+	// Only the base name is exposed to the client, never a directory part.
+	w.Header().Set("Content-Type", "application/octet-stream")
+	w.Header().Set("Content-Disposition", "attachment; filename=\""+filepath.Base(backup.Filename)+"\"")
+	http.ServeContent(w, r, filepath.Base(backup.Filename), stat.ModTime(), f)
+}
+
+// deleteBackup serves DELETE /api/backups/{id}. It asks the backup engine to
+// delete the backup named by the "id" route parameter and answers 200 with
+// {"status": "deleted"}; any engine error becomes a 500 carrying its text.
+func (s *Server) deleteBackup(w http.ResponseWriter, r *http.Request) {
+	if s.backupEngine == nil {
+		respondError(w, http.StatusServiceUnavailable, "backup engine not initialized")
+		return
+	}
+
+	delErr := s.backupEngine.DeleteBackup(chi.URLParam(r, "id"))
+	if delErr == nil {
+		respondJSON(w, http.StatusOK, map[string]string{"status": "deleted"})
+		return
+	}
+	respondError(w, http.StatusInternalServerError, "failed to delete backup: "+delErr.Error())
+}
+
+// restoreBackup handles POST /api/backups/{id}/restore.
+//
+// Restore happens in three documented stages so a failure at any stage
+// leaves the live DB intact:
+//
+//  1. PRE-FLIGHT (sync, before the HTTP response): PrepareRestore opens
+//     the candidate read-only and runs `PRAGMA integrity_check`. If it
+//     fails the live DB is untouched and we return 400 with the reason.
+//
+//  2. SAFETY NET: a pre-restore backup of the LIVE DB is created so the
+//     operator can roll back even if the candidate is later discovered
+//     to be missing data.
+//
+//  3. SWAP (async, after the response is flushed): close the live DB,
+//     atomic-rename the candidate over the live path, wipe WAL/SHM,
+//     trigger graceful shutdown. supervisord / systemd / docker
+//     restart=on-failure brings the process back with the new DB.
+//
+// The request must echo the backup id in the X-Confirm-Restore header, and
+// only one restore may be in flight at a time (409 otherwise). The in-flight
+// flag is released only when pre-flight rejects the candidate; after that
+// point every path ends in a shutdown, so the flag is deliberately left set.
+func (s *Server) restoreBackup(w http.ResponseWriter, r *http.Request) {
+	if s.backupEngine == nil {
+		respondError(w, http.StatusServiceUnavailable, "backup engine not initialized")
+		return
+	}
+
+	id := chi.URLParam(r, "id")
+
+	// CSRF / accidental-fire guard: the restore endpoint is the most
+	// destructive surface in the API (replaces the whole DB). Even
+	// though it sits behind AdminOnly + Bearer JWT, a blind cross-site
+	// POST or a misclicked button in any open admin tab can fire it.
+	// Require the operator's client to echo X-Confirm-Restore: <id>
+	// — matching the path param — so a CSRF post-form / image-src
+	// trick can't trigger restore (browsers don't let cross-origin
+	// requests set custom headers without a preflight).
+	if confirm := r.Header.Get("X-Confirm-Restore"); confirm != id {
+		respondError(w, http.StatusBadRequest,
+			"missing or mismatched X-Confirm-Restore header (must equal backup id)")
+		return
+	}
+
+	// Single-flight guard: a rapid double-click would otherwise spawn
+	// two goroutines racing s.store.Close() and the candidate-over-
+	// live rename. CAS to true here; if someone else won, return 409.
+	if !s.restoreInFlight.CompareAndSwap(false, true) {
+		respondError(w, http.StatusConflict, "a restore is already in progress")
+		return
+	}
+
+	// PRE-FLIGHT: refuse before touching anything if the candidate is
+	// not a valid SQLite database or fails integrity_check. This is the
+	// guard the prior code lacked — a corrupt backup would silently
+	// overwrite a healthy live DB.
+	restorePath, err := s.backupEngine.PrepareRestore(id)
+	if err != nil {
+		// Nothing destructive has happened and no shutdown follows, so
+		// release the single-flight flag. Otherwise one rejected candidate
+		// would make every later restore attempt fail with 409 until the
+		// process is restarted.
+		s.restoreInFlight.Store(false)
+		respondError(w, http.StatusBadRequest, err.Error())
+		return
+	}
+
+	// From here on the flag is NOT released — the restore path triggers
+	// shutdown. A failed swap is also terminal (the DB may be closed); a
+	// fresh process boot is the recovery path.
+
+	// SAFETY NET: pre-restore snapshot of the live DB. A failure here
+	// is logged but does not abort — the integrity-checked candidate
+	// is still safer than refusing to restore.
+	if _, err := s.backupEngine.CreateBackup("pre-restore"); err != nil {
+		slog.Warn("failed to create pre-restore backup", "error", err)
+	}
+
+	// Send the response BEFORE closing the DB so the client gets confirmation.
+	respondJSON(w, http.StatusOK, map[string]any{
+		"status":  "restoring",
+		"message": "Database restore initiated. The server will restart shortly.",
+	})
+
+	// Flush the response.
+	if f, ok := w.(http.Flusher); ok {
+		f.Flush()
+	}
+
+	// Perform the destructive restore in a goroutine with a brief delay
+	// to allow the HTTP response to be fully sent.
+	go func() {
+		time.Sleep(500 * time.Millisecond)
+
+		// Once we begin closing the live DB the process can no longer serve
+		// requests against a sane store, so EVERY exit path from here must
+		// trigger shutdown. Returning early would leave the server limping
+		// on a closed/half-swapped database with no path to recovery except
+		// an external kill. shutdownFunc → graceful shutdown → main returns
+		// → deferred releaseLock()/db.Close() run, and the supervisor reopens
+		// whatever DB is on disk on the next boot.
+		triggerShutdown := func() {
+			if s.shutdownFunc != nil {
+				s.shutdownFunc()
+			}
+		}
+
+		// Close the current database to release locks. AtomicReplaceDB
+		// expects the live file to be unmapped before swap (especially
+		// important on Windows where open files cannot be renamed over).
+		if err := s.store.Close(); err != nil {
+			slog.Error("restore: failed to close database, restarting", "error", err)
+			triggerShutdown()
+			return
+		}
+
+		if err := s.backupEngine.AtomicReplaceDB(restorePath, s.dbPath); err != nil {
+			slog.Error("restore: atomic replace failed, restarting", "error", err)
+			triggerShutdown()
+			return
+		}
+
+		slog.Info("restore: database replaced, triggering shutdown")
+
+		// Signal the server to shut down gracefully so it can be restarted.
+		triggerShutdown()
+	}()
+}
@@ -1,21 +1,23 @@
 package api

 import (
+	"log/slog"
 	"net/http"

-	"github.com/alexei/docker-watcher/internal/config"
+	"github.com/alexei/tinyforge/internal/config"
 )

 // exportConfig handles GET /api/config/export — downloads current state as YAML.
 func (s *Server) exportConfig(w http.ResponseWriter, r *http.Request) {
 	data, err := config.ExportConfig(s.store)
 	if err != nil {
-		respondError(w, http.StatusInternalServerError, "failed to export config: "+err.Error())
+		slog.Error("failed to export config", "error", err)
+		respondError(w, http.StatusInternalServerError, "internal server error")
 		return
 	}

 	w.Header().Set("Content-Type", "application/x-yaml")
-	w.Header().Set("Content-Disposition", "attachment; filename=docker-watcher.yaml")
+	w.Header().Set("Content-Disposition", "attachment; filename=tinyforge.yaml")
 	w.WriteHeader(http.StatusOK)
 	w.Write(data)
 }
@@ -0,0 +1,91 @@
+package api
+
+import (
+	"errors"
+	"net/http"
+
+	"github.com/alexei/tinyforge/internal/store"
+	"github.com/go-chi/chi/v5"
+)
+
+// containerView decorates a stored Container row with the human-readable
+// names the global Containers table needs (workload name, app name).
+// Decorating server-side avoids N+1 fetches on the frontend.
+//
+// store.Container is embedded, so its JSON fields appear at the top level of
+// the encoded object next to the extra fields below.
+type containerView struct {
+	store.Container
+	// WorkloadName is the name of the workload matching the container's
+	// WorkloadID; it stays empty when no such workload is found.
+	WorkloadName string `json:"workload_name"`
+	// AppID is the owning workload's app id; omitted from JSON when the
+	// workload is unassigned or unknown.
+	AppID        string `json:"app_id,omitempty"`
+	// AppName is the name of the app identified by AppID; omitted from JSON
+	// when there is no app id or the app is not found.
+	AppName      string `json:"app_name,omitempty"`
+}
+
+// listAllContainers serves GET /api/containers.
+// Optional query params workload_id, kind, state and app_id are passed to the
+// store as a single filter (AND-combined). The response is the global
+// container index, newest first, with each row decorated with its workload
+// name and, when the workload belongs to an app, the app id and name.
+func (s *Server) listAllContainers(w http.ResponseWriter, r *http.Request) {
+	params := r.URL.Query()
+	containers, err := s.store.ListContainers(store.ContainerFilter{
+		WorkloadID:   params.Get("workload_id"),
+		WorkloadKind: params.Get("kind"),
+		State:        params.Get("state"),
+		AppID:        params.Get("app_id"),
+	})
+	if err != nil {
+		respondError(w, http.StatusInternalServerError, "list containers")
+		return
+	}
+
+	// Load all workloads and apps once so decoration is a map lookup per
+	// row instead of a store query per row.
+	allWorkloads, err := s.store.ListWorkloads("")
+	if err != nil {
+		respondError(w, http.StatusInternalServerError, "list workloads")
+		return
+	}
+	allApps, err := s.store.ListApps()
+	if err != nil {
+		respondError(w, http.StatusInternalServerError, "list apps")
+		return
+	}
+
+	workloadIndex := make(map[string]store.Workload, len(allWorkloads))
+	for _, wl := range allWorkloads {
+		workloadIndex[wl.ID] = wl
+	}
+	appIndex := make(map[string]store.App, len(allApps))
+	for _, a := range allApps {
+		appIndex[a.ID] = a
+	}
+
+	views := make([]containerView, 0, len(containers))
+	for _, c := range containers {
+		view := containerView{Container: c}
+		owner, known := workloadIndex[c.WorkloadID]
+		if !known {
+			views = append(views, view)
+			continue
+		}
+		view.WorkloadName = owner.Name
+		// Only workloads with an app id are matched against the app index.
+		if owner.AppID != "" {
+			view.AppID = owner.AppID
+			if app, found := appIndex[owner.AppID]; found {
+				view.AppName = app.Name
+			}
+		}
+		views = append(views, view)
+	}
+	respondJSON(w, http.StatusOK, views)
+}
+
+// getContainer serves GET /api/containers/{id}. It returns the stored
+// container row as JSON (without the workload/app decoration used by the
+// list endpoint), a not-found response for store.ErrNotFound, or a 500.
+func (s *Server) getContainer(w http.ResponseWriter, r *http.Request) {
+	row, err := s.store.GetContainerByID(chi.URLParam(r, "id"))
+	switch {
+	case err == nil:
+		respondJSON(w, http.StatusOK, row)
+	case errors.Is(err, store.ErrNotFound):
+		respondNotFound(w, "container")
+	default:
+		respondError(w, http.StatusInternalServerError, "get container")
+	}
+}
@@ -1,180 +0,0 @@
-package api
-
-import (
-	"log/slog"
-	"net/http"
-	"strconv"
-	"strings"
-
-	"github.com/alexei/docker-watcher/internal/store"
-)
-
-// listDeploys handles GET /api/deploys.
-func (s *Server) listDeploys(w http.ResponseWriter, r *http.Request) {
-	limitStr := r.URL.Query().Get("limit")
-	limit := 50
-	if limitStr != "" {
-		if parsed, err := strconv.Atoi(limitStr); err == nil && parsed > 0 {
-			limit = parsed
-		}
-	}
-
-	deploys, err := s.store.GetRecentDeploys(limit)
-	if err != nil {
-		respondError(w, http.StatusInternalServerError, "failed to list deploys: "+err.Error())
-		return
-	}
-	respondJSON(w, http.StatusOK, deploys)
-}
-
-// NOTE: getDeployLogs has been replaced by streamDeployLogs in sse.go.
-// The new handler supports both SSE streaming and JSON fallback via Accept header.
-
-// inspectRequest is the expected JSON body for POST /api/deploy/inspect.
-type inspectRequest struct {
-	Image string `json:"image"`
-}
-
-// inspectResponse is the response body for POST /api/deploy/inspect.
-type inspectResponse struct {
-	Image       string `json:"image"`
-	Port        int    `json:"port"`
-	Healthcheck string `json:"healthcheck"`
-}
-
-// inspectImage handles POST /api/deploy/inspect.
-// Pulls the image and inspects it for EXPOSE ports and healthcheck config.
-func (s *Server) inspectImage(w http.ResponseWriter, r *http.Request) {
-	var req inspectRequest
-	if !decodeJSON(w, r, &req) {
-		return
-	}
-
-	if req.Image == "" {
-		respondError(w, http.StatusBadRequest, "image is required")
-		return
-	}
-
-	ctx := r.Context()
-
-	// Pull the image first so it's available locally for inspection.
-	// Split image:tag for the pull call.
-	imageRef, tag := splitImageTag(req.Image)
-	if err := s.docker.PullImage(ctx, imageRef, tag, ""); err != nil {
-		slog.Warn("pull image for inspect", "image", req.Image, "error", err)
-		// Try to inspect anyway in case the image is already local.
-	}
-
-	info, err := s.docker.InspectImage(ctx, req.Image)
-	if err != nil {
-		respondError(w, http.StatusInternalServerError, "failed to inspect image: "+err.Error())
-		return
-	}
-
-	port := extractPort(info.ExposedPorts)
-
-	respondJSON(w, http.StatusOK, inspectResponse{
-		Image:       req.Image,
-		Port:        port,
-		Healthcheck: info.Healthcheck,
-	})
-}
-
-// quickDeployRequest is the expected JSON body for POST /api/deploy/quick.
-type quickDeployRequest struct {
-	Name     string `json:"name"`
-	Image    string `json:"image"`
-	Tag      string `json:"tag"`
-	Registry string `json:"registry"`
-	Port     int    `json:"port"`
-}
-
-// quickDeploy handles POST /api/deploy/quick.
-// Creates a project, a default stage, and triggers a deploy in one call.
-func (s *Server) quickDeploy(w http.ResponseWriter, r *http.Request) {
-	var req quickDeployRequest
-	if !decodeJSON(w, r, &req) {
-		return
-	}
-
-	if req.Image == "" {
-		respondError(w, http.StatusBadRequest, "image is required")
-		return
-	}
-	if req.Tag == "" {
-		req.Tag = "latest"
-	}
-	if req.Name == "" {
-		// Derive name from image.
-		parts := strings.Split(req.Image, "/")
-		req.Name = parts[len(parts)-1]
-	}
-
-	// Create project.
-	project, err := s.store.CreateProject(store.Project{
-		Name:     req.Name,
-		Image:    req.Image,
-		Registry: req.Registry,
-		Port:     req.Port,
-		Env:      "{}",
-		Volumes:  "{}",
-	})
-	if err != nil {
-		respondError(w, http.StatusInternalServerError, "failed to create project: "+err.Error())
-		return
-	}
-
-	// Create default stage.
-	stage, err := s.store.CreateStage(store.Stage{
-		ProjectID:    project.ID,
-		Name:         "dev",
-		TagPattern:   "*",
-		AutoDeploy:   true,
-		MaxInstances: 1,
-	})
-	if err != nil {
-		respondError(w, http.StatusInternalServerError, "failed to create stage: "+err.Error())
-		return
-	}
-
-	// Trigger deploy asynchronously.
-	deployID, err := s.deployer.AsyncTriggerDeploy(r.Context(), project.ID, stage.ID, req.Tag)
-	if err != nil {
-		respondError(w, http.StatusInternalServerError, "failed to trigger deploy: "+err.Error())
-		return
-	}
-
-	respondJSON(w, http.StatusAccepted, map[string]any{
-		"project":   project,
-		"stage":     stage,
-		"tag":       req.Tag,
-		"deploy_id": deployID,
-		"status":  "deploying",
-	})
-}
-
-// splitImageTag splits "image:tag" into image and tag parts.
-// Returns the full string and empty tag if no colon separator is found.
-func splitImageTag(ref string) (string, string) {
-	if idx := strings.LastIndex(ref, ":"); idx != -1 {
-		afterColon := ref[idx+1:]
-		if !strings.Contains(afterColon, "/") {
-			return ref[:idx], afterColon
-		}
-	}
-	return ref, ""
-}
-
-// extractPort parses the first exposed port from Docker EXPOSE entries.
-// Entries are in the form "8080/tcp" or "8080". Returns 0 if none found.
-func extractPort(exposedPorts []string) int {
-	if len(exposedPorts) == 0 {
-		return 0
-	}
-	raw := exposedPorts[0]
-	if idx := strings.Index(raw, "/"); idx != -1 {
-		raw = raw[:idx]
-	}
-	port, _ := strconv.Atoi(raw)
-	return port
-}
@@ -0,0 +1,452 @@
+package api
+
+import (
+	"context"
+	"encoding/json"
+	"log/slog"
+	"net/http"
+	"regexp"
+	"strings"
+	"time"
+
+	"github.com/alexei/tinyforge/internal/docker"
+	"github.com/alexei/tinyforge/internal/staticsite"
+)
+
+// Discovery endpoints feed the /apps/new wizard's auto-discovery and
+// connection-test flow. They wrap staticsite.GitProvider so the form
+// can validate a repo + token before the workload is created, browse
+// repos/branches/folders without leaving the page, and warn the operator
+// when an image is already in use by another workload.
+//
+// The endpoints are workload-agnostic on purpose — they are scoped under
+// /api/discovery rather than tied to the static_sites table the cutover
+// dropped. Any future Git-driven source plugin can reuse them.
+
+// discoveryTimeout is the per-request budget for outbound discovery
+// calls. Short enough that a malicious or stuck upstream cannot pin a
+// worker for long; long enough for slow self-hosted Gitea instances to
+// respond. The handlers in this file apply it with context.WithTimeout
+// on top of the incoming request context.
+const discoveryTimeout = 15 * time.Second
+
+// gitProviderRequest is the shared request body for the four Git
+// discovery endpoints (test-connection, repos, branches, tree). Token
+// is plaintext over HTTPS — the wizard has not yet persisted it, so
+// there is nothing to decrypt server-side.
+// Empty Provider triggers DetectProviderWithProbe.
+//
+// Field usage, as seen in the handlers of this file:
+//   - Provider: "github", "gitlab" or "gitea", compared after trimming
+//     and lower-casing; any other value maps to the empty provider type.
+//   - BaseURL: checked with staticsite.ValidateBaseURL before a
+//     provider is constructed.
+//   - AccessToken: handed to staticsite.NewGitProvider as-is.
+//   - RepoOwner / RepoName: checked with validateGitIdent by the
+//     test-connection, branches and tree handlers.
+//   - Branch: checked with validateGitBranch by the tree handler.
+//   - Query: forwarded to ListRepos by the repos handler.
+type gitProviderRequest struct {
+	Provider    string `json:"provider"`
+	BaseURL     string `json:"base_url"`
+	AccessToken string `json:"access_token"`
+	RepoOwner   string `json:"repo_owner"`
+	RepoName    string `json:"repo_name"`
+	Branch      string `json:"branch"`
+	Query       string `json:"query"`
+}
+
+// gitIdentRe accepts Git owner / repo identifiers as the major hosts
+// (GitHub, GitLab, Gitea/Forgejo) accept them: alphanumeric plus dot,
+// underscore, hyphen. The first character must be alphanumeric, so a
+// leading dot or dash is rejected, and `/` is never allowed. Rejecting
+// other characters at the API boundary prevents `..` traversal and URL
+// injection in the provider code that interpolates these segments into
+// request paths.
+var gitIdentRe = regexp.MustCompile(`^[A-Za-z0-9][A-Za-z0-9._-]*$`)
+
+// gitBranchRe is more permissive than gitIdentRe: branches may contain
+// `/` (e.g. `feature/foo`) but still cannot contain whitespace or
+// control characters. Note that the pattern on its own DOES admit `..`
+// (both `.` and `/` are in the character class), so validateGitBranch
+// pairs this regex with an explicit `..` reject to keep a value such
+// as `feature/../admin` from slipping through.
+var gitBranchRe = regexp.MustCompile(`^[A-Za-z0-9][A-Za-z0-9._/-]*$`)
+
+// validateGitIdent guards owner / repo path segments at the boundary
+// so the provider code can interpolate them with fmt.Sprintf without
+// risking traversal. field names the offending input so the error
+// message is actionable; a blank value (empty or whitespace-only) is
+// reported as "<field> is required".
+//
+// The pattern is matched against the value exactly as received rather
+// than a trimmed copy: the handlers forward the raw request field to
+// the provider, so accepting " owner\n" here would let the surrounding
+// whitespace reach the interpolated URL unvalidated.
+//
+// Returns nil when value is acceptable, otherwise an *apiError.
+func validateGitIdent(field, value string) error {
+	if strings.TrimSpace(value) == "" {
+		return &apiError{msg: field + " is required"}
+	}
+	if !gitIdentRe.MatchString(value) {
+		return &apiError{msg: field + " contains invalid characters"}
+	}
+	return nil
+}
+
+// validateGitBranch is the branch-shaped variant of validateGitIdent.
+// Branches legitimately contain `/`; the extra `..` reject covers the
+// one traversal vector the regex still admits.
+//
+// As with validateGitIdent, only the emptiness check looks at a
+// trimmed copy. The `..` and pattern checks run on the value exactly
+// as received, because that raw value is what the handler passes on to
+// the provider — leading or trailing whitespace must not be waved
+// through.
+//
+// Returns nil when value is acceptable, otherwise an *apiError.
+func validateGitBranch(value string) error {
+	if strings.TrimSpace(value) == "" {
+		return &apiError{msg: "branch is required"}
+	}
+	if strings.Contains(value, "..") {
+		return &apiError{msg: "branch contains invalid sequence '..'"}
+	}
+	if !gitBranchRe.MatchString(value) {
+		return &apiError{msg: "branch contains invalid characters"}
+	}
+	return nil
+}
+
+// apiError is a small typed error so handlers can distinguish a
+// validation failure (→ 400) from any other error (→ 500/502). The
+// type lives in this file because nothing outside discovery uses it
+// yet — promote to response.go if other handlers need the same shape.
+// msg is the client-facing text returned verbatim by Error.
+type apiError struct{ msg string }
+
+// Error implements the error interface by returning the stored message.
+func (e *apiError) Error() string { return e.msg }
+
+// providerType maps the free-form provider string of the request onto
+// the typed enum expected by staticsite.NewGitProvider. Matching is
+// done on the trimmed, lower-cased value. Anything that is not one of
+// the three known names — including the empty string — yields the
+// empty provider type, leaving provider selection to NewGitProvider.
+func (req gitProviderRequest) providerType() staticsite.ProviderType {
+	normalized := strings.ToLower(strings.TrimSpace(req.Provider))
+	if normalized == "github" {
+		return staticsite.ProviderGitHub
+	}
+	if normalized == "gitlab" {
+		return staticsite.ProviderGitLab
+	}
+	if normalized == "gitea" {
+		return staticsite.ProviderGitea
+	}
+	return ""
+}
+
+// newProvider builds the GitProvider described by the request. When
+// either the base URL check or the provider construction fails it
+// writes a 400 carrying the error text to w and returns nil, so callers
+// only need a nil check before bailing out. BaseURL is fully validated
+// here (scheme + host shape); connect-time IP filtering is enforced
+// inside the safe-HTTP transport the provider receives.
+func (req gitProviderRequest) newProvider(w http.ResponseWriter) staticsite.GitProvider {
+	err := staticsite.ValidateBaseURL(req.BaseURL)
+	if err == nil {
+		var built staticsite.GitProvider
+		built, err = staticsite.NewGitProvider(req.providerType(), req.BaseURL, req.AccessToken)
+		if err == nil {
+			return built
+		}
+	}
+	// Both failure paths answer identically: 400 with the error text.
+	respondError(w, http.StatusBadRequest, err.Error())
+	return nil
+}
+
+// upstreamError records the detailed upstream failure in the server
+// log (tagged with the operation name op) and answers the client with
+// a fixed, generic 502. The raw error string is deliberately not
+// echoed: a misconfigured or attacker-controlled upstream could
+// reflect the access token into it, and that must not reach the
+// response body.
+func upstreamError(w http.ResponseWriter, op string, err error) {
+	const clientMessage = "upstream git provider returned an error"
+
+	slog.Warn("discovery upstream call failed", "op", op, "error", err)
+	respondError(w, http.StatusBadGateway, clientMessage)
+}
+
+// detectGitProviderRequest is the body for POST /api/discovery/git/detect-provider.
+// BaseURL is the only input; the handler checks it with
+// staticsite.ValidateBaseURL before probing.
+type detectGitProviderRequest struct {
+	BaseURL string `json:"base_url"`
+}
+
+// detectGitProvider probes the supplied base URL for known Git
+// provider API signatures so the wizard can pre-select the provider
+// dropdown. An invalid base URL is answered with 400; otherwise the
+// detected provider is returned under the "provider" key.
+// POST /api/discovery/git/detect-provider.
+func (s *Server) detectGitProvider(w http.ResponseWriter, r *http.Request) {
+	var body detectGitProviderRequest
+	if ok := decodeJSON(w, r, &body); !ok {
+		return
+	}
+	if err := staticsite.ValidateBaseURL(body.BaseURL); err != nil {
+		respondError(w, http.StatusBadRequest, err.Error())
+		return
+	}
+
+	// Bound the probe so a slow or hostile host cannot hold the request.
+	probeCtx, stop := context.WithTimeout(r.Context(), discoveryTimeout)
+	defer stop()
+
+	detected := string(staticsite.DetectProviderWithProbe(probeCtx, body.BaseURL))
+	respondJSON(w, http.StatusOK, map[string]string{"provider": detected})
+}
+
+// testGitConnection checks that the supplied base URL, token and repo
+// actually reach the provider, so the wizard can fail fast. Invalid
+// owner / repo / base URL input is answered with 400, a failing
+// provider call with the generic 502 from upstreamError.
+// POST /api/discovery/git/test-connection.
+func (s *Server) testGitConnection(w http.ResponseWriter, r *http.Request) {
+	var body gitProviderRequest
+	if ok := decodeJSON(w, r, &body); !ok {
+		return
+	}
+	// Owner is checked before name; the first failing field decides
+	// the error message.
+	idents := [...]struct{ field, value string }{
+		{"repo_owner", body.RepoOwner},
+		{"repo_name", body.RepoName},
+	}
+	for _, id := range idents {
+		if err := validateGitIdent(id.field, id.value); err != nil {
+			respondError(w, http.StatusBadRequest, err.Error())
+			return
+		}
+	}
+	git := body.newProvider(w)
+	if git == nil {
+		return
+	}
+
+	callCtx, stop := context.WithTimeout(r.Context(), discoveryTimeout)
+	defer stop()
+
+	err := git.TestConnection(callCtx, body.RepoOwner, body.RepoName)
+	if err != nil {
+		upstreamError(w, "test_connection", err)
+		return
+	}
+	respondJSON(w, http.StatusOK, map[string]string{"status": "ok"})
+}
+
+// listGitRepos lists the repositories reachable with the supplied
+// token, passing the request's Query through to the provider as a
+// filter. A nil result is replaced with an empty slice before it is
+// written so the client always receives a list.
+// POST /api/discovery/git/repos.
+func (s *Server) listGitRepos(w http.ResponseWriter, r *http.Request) {
+	var body gitProviderRequest
+	if ok := decodeJSON(w, r, &body); !ok {
+		return
+	}
+	git := body.newProvider(w)
+	if git == nil {
+		return
+	}
+
+	callCtx, stop := context.WithTimeout(r.Context(), discoveryTimeout)
+	defer stop()
+
+	found, err := git.ListRepos(callCtx, body.Query)
+	switch {
+	case err != nil:
+		upstreamError(w, "list_repos", err)
+		return
+	case found == nil:
+		found = []staticsite.RepoInfo{}
+	}
+	respondJSON(w, http.StatusOK, found)
+}
+
+// listGitBranches returns the branch names of a repository. Owner and
+// repo are validated first (400 on failure); a failing provider call
+// yields the generic 502, and a nil result is replaced with an empty
+// slice before it is written.
+// POST /api/discovery/git/branches.
+func (s *Server) listGitBranches(w http.ResponseWriter, r *http.Request) {
+	var body gitProviderRequest
+	if ok := decodeJSON(w, r, &body); !ok {
+		return
+	}
+	// Owner is checked before name; the first failing field decides
+	// the error message.
+	idents := [...]struct{ field, value string }{
+		{"repo_owner", body.RepoOwner},
+		{"repo_name", body.RepoName},
+	}
+	for _, id := range idents {
+		if err := validateGitIdent(id.field, id.value); err != nil {
+			respondError(w, http.StatusBadRequest, err.Error())
+			return
+		}
+	}
+	git := body.newProvider(w)
+	if git == nil {
+		return
+	}
+
+	callCtx, stop := context.WithTimeout(r.Context(), discoveryTimeout)
+	defer stop()
+
+	names, err := git.ListBranches(callCtx, body.RepoOwner, body.RepoName)
+	switch {
+	case err != nil:
+		upstreamError(w, "list_branches", err)
+		return
+	case names == nil:
+		names = []string{}
+	}
+	respondJSON(w, http.StatusOK, names)
+}
+
+// listGitTree returns the directory tree of a branch so the wizard can
+// render its folder picker. Owner, repo and branch are validated in
+// that order (400 on the first failure); a failing provider call
+// yields the generic 502, and a nil result is replaced with an empty
+// slice before it is written.
+// POST /api/discovery/git/tree.
+func (s *Server) listGitTree(w http.ResponseWriter, r *http.Request) {
+	var body gitProviderRequest
+	if ok := decodeJSON(w, r, &body); !ok {
+		return
+	}
+	idents := [...]struct{ field, value string }{
+		{"repo_owner", body.RepoOwner},
+		{"repo_name", body.RepoName},
+	}
+	for _, id := range idents {
+		if err := validateGitIdent(id.field, id.value); err != nil {
+			respondError(w, http.StatusBadRequest, err.Error())
+			return
+		}
+	}
+	// The branch is only looked at once both identifiers have passed.
+	if err := validateGitBranch(body.Branch); err != nil {
+		respondError(w, http.StatusBadRequest, err.Error())
+		return
+	}
+	git := body.newProvider(w)
+	if git == nil {
+		return
+	}
+
+	callCtx, stop := context.WithTimeout(r.Context(), discoveryTimeout)
+	defer stop()
+
+	entries, err := git.ListTree(callCtx, body.RepoOwner, body.RepoName, body.Branch)
+	switch {
+	case err != nil:
+		upstreamError(w, "list_tree", err)
+		return
+	case entries == nil:
+		entries = []staticsite.FolderEntry{}
+	}
+	respondJSON(w, http.StatusOK, entries)
+}
+
+// imageConflict is a slim projection of Workload, scoped to what the
+// /apps/new conflict dialog needs to render. Image carries the
+// workload's configured reference as stored (tag included), not the
+// tag-stripped form used for matching. AppID is omitted from the JSON
+// when empty.
+type imageConflict struct {
+	ID    string `json:"id"`
+	Name  string `json:"name"`
+	Image string `json:"image"`
+	AppID string `json:"app_id,omitempty"`
+}
+
+// listImageConflicts reports the existing image-source workloads whose
+// configured image refers to the same repository as the supplied ref.
+// GET /api/discovery/image/conflicts?image=<ref>.
+//
+// Both sides are compared after stripImageTag, mirroring the legacy
+// quickDeploy behavior: nginx:1.25 surfaces nginx, nginx:latest and
+// nginx:1.26 as conflicts. This is intentionally permissive — the
+// wizard surfaces matches but lets the operator decide. Workloads with
+// another source kind, or without a readable image field, are skipped.
+// The result is always a list, possibly empty.
+func (s *Server) listImageConflicts(w http.ResponseWriter, r *http.Request) {
+	requested := strings.TrimSpace(r.URL.Query().Get("image"))
+	if requested == "" {
+		respondError(w, http.StatusBadRequest, "image query parameter is required")
+		return
+	}
+	repo := stripImageTag(requested)
+	if repo == "" {
+		respondError(w, http.StatusBadRequest, "image is empty after tag strip")
+		return
+	}
+
+	all, err := s.store.ListWorkloads("")
+	if err != nil {
+		// Details stay in the log; the client only sees a generic 500.
+		slog.Error("list workloads for conflict check", "error", err)
+		respondError(w, http.StatusInternalServerError, "internal server error")
+		return
+	}
+
+	matches := make([]imageConflict, 0)
+	for _, wl := range all {
+		if wl.SourceKind != "image" {
+			continue
+		}
+		existing := imageRefFromSourceConfig(wl.SourceConfig)
+		if existing == "" || stripImageTag(existing) != repo {
+			continue
+		}
+		matches = append(matches, imageConflict{
+			ID:    wl.ID,
+			Name:  wl.Name,
+			Image: existing,
+			AppID: wl.AppID,
+		})
+	}
+	respondJSON(w, http.StatusOK, matches)
+}
+
+// inspectImageRequest is the body for POST /api/discovery/image/inspect.
+// Image is the reference to inspect; the handler trims it and rejects
+// an empty value with 400.
+type inspectImageRequest struct {
+	Image string `json:"image"`
+}
+
+// inspectImageResponse mirrors the frontend InspectResult shape the
+// new-app wizard pre-fills from: the first exposed port (parsed to int,
+// 0 when none) and the image's HEALTHCHECK command string. The handler
+// fills Port via docker.ExtractPort and copies Healthcheck from the
+// inspect result.
+type inspectImageResponse struct {
+	Port        int    `json:"port"`
+	Healthcheck string `json:"healthcheck"`
+}
+
+// inspectImageMetadata looks up a LOCAL image and reports its first
+// exposed port and healthcheck so the wizard can pre-fill those fields.
+// POST /api/discovery/image/inspect.
+//
+// Only local images are inspected — nothing is pulled. If the image is
+// absent the docker call fails; the failure is logged and the client
+// gets a generic, non-leaky 400 instead of the git-specific
+// upstreamError, so a raw docker daemon string (which may echo the
+// ref) never reaches the client.
+func (s *Server) inspectImageMetadata(w http.ResponseWriter, r *http.Request) {
+	var body inspectImageRequest
+	if ok := decodeJSON(w, r, &body); !ok {
+		return
+	}
+	ref := strings.TrimSpace(body.Image)
+	if ref == "" {
+		respondError(w, http.StatusBadRequest, "image is required")
+		return
+	}
+
+	inspectCtx, stop := context.WithTimeout(r.Context(), discoveryTimeout)
+	defer stop()
+
+	meta, err := s.docker.InspectImage(inspectCtx, ref)
+	if err != nil {
+		slog.Warn("inspect image metadata failed", "error", err)
+		respondError(w, http.StatusBadRequest, "could not inspect image — make sure it is pulled locally and the reference is correct")
+		return
+	}
+
+	result := inspectImageResponse{
+		Port:        docker.ExtractPort(meta.ExposedPorts),
+		Healthcheck: meta.Healthcheck,
+	}
+	respondJSON(w, http.StatusOK, result)
+}
+
+// stripImageTag reduces an image reference to its repository part so
+// two references to the same repository compare equal regardless of
+// how they are pinned:
+//
+//   - surrounding whitespace is trimmed;
+//   - an @digest suffix is dropped (nginx@sha256:… → nginx);
+//   - a trailing :tag is dropped (nginx:1.25 → nginx), including when
+//     it precedes a digest (nginx:1.25@sha256:… → nginx);
+//   - a registry port (registry:5000/foo) is left intact, because only
+//     a colon in the final path segment is treated as a tag separator.
+//
+// A colon with nothing after it ("nginx:") is not considered a tag and
+// the reference is returned as-is. Empty input yields "".
+func stripImageTag(ref string) string {
+	ref = strings.TrimSpace(ref)
+
+	// Drop the digest first and keep going: a reference may carry both
+	// a tag and a digest, and the tag in front of the '@' still has to
+	// be removed for the repository comparison to work.
+	if at := strings.Index(ref, "@"); at >= 0 {
+		ref = ref[:at]
+	}
+
+	// The tag separator is the last colon, but only if it sits after
+	// the last slash — earlier colons belong to a registry host:port.
+	// With no slash lastSlash is -1, so any colon qualifies. The colon
+	// must also be followed by at least one character.
+	lastSlash := strings.LastIndex(ref, "/")
+	colon := strings.LastIndex(ref, ":")
+	if colon > lastSlash && colon < len(ref)-1 {
+		return ref[:colon]
+	}
+	return ref
+}
+
+// imageRefFromSourceConfig pulls the "image" field out of a workload's
+// source_config JSON and returns it with surrounding whitespace
+// removed. An empty blob, a blob that fails to decode, or one without
+// an image field all produce "" — such workloads simply do not take
+// part in conflict detection.
+func imageRefFromSourceConfig(raw string) string {
+	type sourceConfig struct {
+		Image string `json:"image"`
+	}
+	var parsed sourceConfig
+	if len(raw) == 0 || json.Unmarshal([]byte(raw), &parsed) != nil {
+		return ""
+	}
+	return strings.TrimSpace(parsed.Image)
+}
+
@@ -0,0 +1,355 @@
+package api
+
+import (
+	"encoding/json"
+	"net/http"
+	"testing"
+
+	"github.com/alexei/tinyforge/internal/store"
+)
+
+// =============================================================================
+// stripImageTag — pure helper, no fixtures needed
+// =============================================================================
+
+// TestStripImageTag runs stripImageTag over a table of bare, tagged,
+// registry-port, digest and whitespace-padded references.
+func TestStripImageTag(t *testing.T) {
+	type testCase struct {
+		name string
+		in   string
+		want string
+	}
+	table := []testCase{
+		{name: "empty", in: "", want: ""},
+		{name: "bare", in: "nginx", want: "nginx"},
+		{name: "tagged", in: "nginx:1.25", want: "nginx"},
+		{name: "latest", in: "nginx:latest", want: "nginx"},
+		{name: "owner_tagged", in: "library/nginx:1.25", want: "library/nginx"},
+		{name: "registry_tagged", in: "registry.example.com/owner/app:v1", want: "registry.example.com/owner/app"},
+		{name: "registry_port_no_tag", in: "registry.example.com:5000/owner/app", want: "registry.example.com:5000/owner/app"},
+		{name: "registry_port_with_tag", in: "registry.example.com:5000/owner/app:v1", want: "registry.example.com:5000/owner/app"},
+		{name: "digest", in: "nginx@sha256:abcd", want: "nginx"},
+		{name: "digest_with_owner", in: "library/nginx@sha256:abcd", want: "library/nginx"},
+		{name: "trailing_whitespace", in: "  nginx:1.25  ", want: "nginx"},
+	}
+	for _, tc := range table {
+		t.Run(tc.name, func(t *testing.T) {
+			if got := stripImageTag(tc.in); got != tc.want {
+				t.Errorf("stripImageTag(%q) = %q, want %q", tc.in, got, tc.want)
+			}
+		})
+	}
+}
+
+// =============================================================================
+// imageRefFromSourceConfig — pure helper
+// =============================================================================
+
+// TestImageRefFromSourceConfig covers empty, malformed and well-formed
+// source_config blobs, including whitespace trimming of the image.
+func TestImageRefFromSourceConfig(t *testing.T) {
+	type testCase struct {
+		name string
+		raw  string
+		want string
+	}
+	table := []testCase{
+		{name: "empty", raw: "", want: ""},
+		{name: "malformed", raw: "{not json", want: ""},
+		{name: "no_image_field", raw: `{"port":8080}`, want: ""},
+		{name: "basic", raw: `{"image":"nginx:1.25"}`, want: "nginx:1.25"},
+		{name: "whitespace_trim", raw: `{"image":"  nginx:1.25  "}`, want: "nginx:1.25"},
+		{name: "with_extras", raw: `{"image":"nginx","port":8080,"env":{"K":"v"}}`, want: "nginx"},
+	}
+	for _, tc := range table {
+		t.Run(tc.name, func(t *testing.T) {
+			if got := imageRefFromSourceConfig(tc.raw); got != tc.want {
+				t.Errorf("imageRefFromSourceConfig(%q) = %q, want %q", tc.raw, got, tc.want)
+			}
+		})
+	}
+}
+
+// =============================================================================
+// GET /api/discovery/image/conflicts
+// =============================================================================
+
+// seedImageWorkload writes an image-source workload straight into the
+// store, with a source_config holding imageRef and a fixed port of
+// 8080. Going around the API keeps every test's starting fixture
+// independent of /api/workloads create-path behaviour. Any failure
+// aborts the calling test.
+func seedImageWorkload(t *testing.T, st *store.Store, name, imageRef string) {
+	t.Helper()
+	raw, err := json.Marshal(map[string]any{"image": imageRef, "port": 8080})
+	if err != nil {
+		t.Fatalf("marshal source_config: %v", err)
+	}
+	fixture := store.Workload{
+		Kind:         string(store.WorkloadKindProject),
+		Name:         name,
+		SourceKind:   "image",
+		SourceConfig: string(raw),
+	}
+	if _, err = st.CreateWorkload(fixture); err != nil {
+		t.Fatalf("seed workload %q: %v", name, err)
+	}
+}
+
+// A ref whose repository matches no seeded workload must produce a 200
+// with an empty conflict list.
+func TestListImageConflicts_NoMatches_ReturnsEmpty(t *testing.T) {
+	env := newAPITestEnv(t)
+	seedImageWorkload(t, env.store, "alpha", "nginx:1.25")
+	seedImageWorkload(t, env.store, "beta", "registry.example.com/owner/web:v2")
+
+	res := env.do(t, http.MethodGet, "/api/discovery/image/conflicts?image=postgres:16", nil)
+	if code := res.StatusCode; code != http.StatusOK {
+		_ = decodeEnvelope(t, res, nil)
+		t.Fatalf("status = %d, want 200", code)
+	}
+	var conflicts []imageConflict
+	if msg := decodeEnvelope(t, res, &conflicts); msg != "" {
+		t.Fatalf("envelope error: %q", msg)
+	}
+	if n := len(conflicts); n != 0 {
+		t.Errorf("expected 0 conflicts, got %d: %+v", n, conflicts)
+	}
+}
+
+// The legacy quickDeploy collided on the repository without its tag,
+// so a query for nginx:1.26 must still surface nginx:1.25 and
+// nginx:latest — this pins that behaviour.
+func TestListImageConflicts_TagMismatch_StillCollides(t *testing.T) {
+	env := newAPITestEnv(t)
+	seedImageWorkload(t, env.store, "nginx-prod", "nginx:1.25")
+	seedImageWorkload(t, env.store, "nginx-latest", "nginx:latest")
+
+	res := env.do(t, http.MethodGet, "/api/discovery/image/conflicts?image=nginx:1.26", nil)
+	if code := res.StatusCode; code != http.StatusOK {
+		t.Fatalf("status = %d, want 200", code)
+	}
+	var conflicts []imageConflict
+	if msg := decodeEnvelope(t, res, &conflicts); msg != "" {
+		t.Fatalf("envelope error: %q", msg)
+	}
+	if n := len(conflicts); n != 2 {
+		t.Fatalf("expected 2 conflicts, got %d: %+v", n, conflicts)
+	}
+	seen := make(map[string]struct{}, len(conflicts))
+	for _, c := range conflicts {
+		seen[c.Name] = struct{}{}
+	}
+	_, hasProd := seen["nginx-prod"]
+	_, hasLatest := seen["nginx-latest"]
+	if !(hasProd && hasLatest) {
+		t.Errorf("expected both nginx-prod and nginx-latest in conflicts, got %+v", conflicts)
+	}
+}
+
+// stripImageTag must keep a registry port in the host segment:
+// registry.example.com:5000/owner/app:v1 may only collide with refs
+// whose repository is registry.example.com:5000/owner/app, never with
+// a plain owner/app.
+func TestListImageConflicts_RegistryPortPreserved(t *testing.T) {
+	env := newAPITestEnv(t)
+	seedImageWorkload(t, env.store, "with-port", "registry.example.com:5000/owner/app:v1")
+	seedImageWorkload(t, env.store, "no-port", "owner/app:v1")
+
+	res := env.do(t, http.MethodGet, "/api/discovery/image/conflicts?image=registry.example.com:5000/owner/app:v2", nil)
+	if code := res.StatusCode; code != http.StatusOK {
+		t.Fatalf("status = %d, want 200", code)
+	}
+	var conflicts []imageConflict
+	if msg := decodeEnvelope(t, res, &conflicts); msg != "" {
+		t.Fatalf("envelope error: %q", msg)
+	}
+	soleMatch := len(conflicts) == 1 && conflicts[0].Name == "with-port"
+	if !soleMatch {
+		t.Errorf("expected sole conflict on with-port, got %+v", conflicts)
+	}
+}
+
+// Static-source workloads must never show up as image conflicts, even
+// when their JSON happens to carry a stray "image" key — this guards
+// the source_kind != "image" filter.
+func TestListImageConflicts_NonImageSourceIgnored(t *testing.T) {
+	env := newAPITestEnv(t)
+	staticFixture := store.Workload{
+		Kind:         string(store.WorkloadKindProject),
+		Name:         "static-with-image-key",
+		SourceKind:   "static",
+		SourceConfig: `{"image":"nginx:1.25","provider":"gitea"}`,
+	}
+	if _, err := env.store.CreateWorkload(staticFixture); err != nil {
+		t.Fatalf("seed static workload: %v", err)
+	}
+
+	res := env.do(t, http.MethodGet, "/api/discovery/image/conflicts?image=nginx:1.25", nil)
+	if code := res.StatusCode; code != http.StatusOK {
+		t.Fatalf("status = %d, want 200", code)
+	}
+	var conflicts []imageConflict
+	if msg := decodeEnvelope(t, res, &conflicts); msg != "" {
+		t.Fatalf("envelope error: %q", msg)
+	}
+	if len(conflicts) != 0 {
+		t.Errorf("expected 0 conflicts (static source filtered out), got %+v", conflicts)
+	}
+}
+
+// Omitting the image query parameter must be rejected with 400.
+func TestListImageConflicts_MissingImageParam_400(t *testing.T) {
+	env := newAPITestEnv(t)
+	res := env.do(t, http.MethodGet, "/api/discovery/image/conflicts", nil)
+	if code := res.StatusCode; code != http.StatusBadRequest {
+		_ = decodeEnvelope(t, res, nil)
+		t.Fatalf("status = %d, want 400", code)
+	}
+}
+
+// =============================================================================
+// POST /api/discovery/git/* — input validation
+// =============================================================================
+//
+// These tests only assert request-shape validation. The provider
+// implementations themselves are exercised by their own tests in
+// internal/staticsite; we don't reach upstream Git in unit tests.
+
+// An empty body (no base_url) must be rejected with 400.
+func TestDetectGitProvider_MissingBaseURL_400(t *testing.T) {
+	env := newAPITestEnv(t)
+	payload := map[string]string{}
+	res := env.do(t, http.MethodPost, "/api/discovery/git/detect-provider", payload)
+	if code := res.StatusCode; code != http.StatusBadRequest {
+		_ = decodeEnvelope(t, res, nil)
+		t.Fatalf("status = %d, want 400", code)
+	}
+}
+
+// A request carrying only base_url (no owner / repo) must get a 400.
+func TestTestGitConnection_MissingRepo_400(t *testing.T) {
+	env := newAPITestEnv(t)
+	payload := map[string]string{
+		"base_url": "https://git.example.com",
+	}
+	res := env.do(t, http.MethodPost, "/api/discovery/git/test-connection", payload)
+	if code := res.StatusCode; code != http.StatusBadRequest {
+		_ = decodeEnvelope(t, res, nil)
+		t.Fatalf("status = %d, want 400", code)
+	}
+}
+
+// Listing branches without owner / repo must be rejected with 400.
+func TestListGitBranches_MissingRepo_400(t *testing.T) {
+	env := newAPITestEnv(t)
+	payload := map[string]string{
+		"base_url": "https://git.example.com",
+	}
+	res := env.do(t, http.MethodPost, "/api/discovery/git/branches", payload)
+	if code := res.StatusCode; code != http.StatusBadRequest {
+		t.Fatalf("status = %d, want 400", code)
+	}
+}
+
+// Valid owner and repo but no branch must be rejected with 400.
+func TestListGitTree_MissingBranch_400(t *testing.T) {
+	env := newAPITestEnv(t)
+	payload := map[string]string{
+		"base_url":   "https://git.example.com",
+		"repo_owner": "owner",
+		"repo_name":  "repo",
+	}
+	res := env.do(t, http.MethodPost, "/api/discovery/git/tree", payload)
+	if code := res.StatusCode; code != http.StatusBadRequest {
+		t.Fatalf("status = %d, want 400", code)
+	}
+}
+
+// Listing repos with an empty body (no base_url) must get a 400.
+func TestListGitRepos_MissingBaseURL_400(t *testing.T) {
+	env := newAPITestEnv(t)
+	payload := map[string]string{}
+	res := env.do(t, http.MethodPost, "/api/discovery/git/repos", payload)
+	if code := res.StatusCode; code != http.StatusBadRequest {
+		t.Fatalf("status = %d, want 400", code)
+	}
+}
+
+// =============================================================================
+// Validators added during security hardening — boundary checks the
+// providers depend on for safe URL interpolation.
+// =============================================================================
+
+// TestValidateGitIdent checks which owner / repo identifiers are
+// accepted and which are rejected (blank, leading dot or dash, slash,
+// traversal, whitespace, shell metacharacters).
+func TestValidateGitIdent(t *testing.T) {
+	type testCase struct {
+		name      string
+		input     string
+		wantError bool
+	}
+	table := []testCase{
+		{name: "ok_simple", input: "owner", wantError: false},
+		{name: "ok_with_dash", input: "my-org", wantError: false},
+		{name: "ok_with_dot", input: "user.name", wantError: false},
+		{name: "ok_with_underscore", input: "my_repo", wantError: false},
+		{name: "empty", input: "", wantError: true},
+		{name: "whitespace_only", input: "   ", wantError: true},
+		{name: "leading_dot", input: ".hidden", wantError: true},
+		{name: "leading_dash", input: "-flag", wantError: true},
+		{name: "slash", input: "owner/repo", wantError: true},
+		{name: "traversal", input: "..", wantError: true},
+		{name: "path_traversal", input: "../admin", wantError: true},
+		{name: "with_space", input: "my org", wantError: true},
+		{name: "with_special", input: "owner;rm", wantError: true},
+	}
+	for _, tc := range table {
+		t.Run(tc.name, func(t *testing.T) {
+			err := validateGitIdent("test", tc.input)
+			switch {
+			case tc.wantError && err == nil:
+				t.Errorf("validateGitIdent(%q) = nil, want error", tc.input)
+			case !tc.wantError && err != nil:
+				t.Errorf("validateGitIdent(%q) = %v, want nil", tc.input, err)
+			}
+		})
+	}
+}
+
+// TestValidateGitBranch checks that slash-separated branch names pass
+// while blank values, `..` sequences, a leading dash and embedded
+// spaces are rejected.
+func TestValidateGitBranch(t *testing.T) {
+	type testCase struct {
+		name      string
+		input     string
+		wantError bool
+	}
+	table := []testCase{
+		{name: "ok_main", input: "main", wantError: false},
+		{name: "ok_master", input: "master", wantError: false},
+		{name: "ok_with_slash", input: "feature/foo", wantError: false},
+		{name: "ok_release_tag", input: "release/v1.2.3", wantError: false},
+		{name: "empty", input: "", wantError: true},
+		{name: "traversal", input: "feature/..", wantError: true},
+		{name: "hidden_traversal", input: "feature/../admin", wantError: true},
+		{name: "leading_dash", input: "-flag", wantError: true},
+		{name: "with_space", input: "feature/my branch", wantError: true},
+	}
+	for _, tc := range table {
+		t.Run(tc.name, func(t *testing.T) {
+			err := validateGitBranch(tc.input)
+			switch {
+			case tc.wantError && err == nil:
+				t.Errorf("validateGitBranch(%q) = nil, want error", tc.input)
+			case !tc.wantError && err != nil:
+				t.Errorf("validateGitBranch(%q) = %v, want nil", tc.input, err)
+			}
+		})
+	}
+}
+
+// A traversal-shaped repo_owner must be rejected at the API boundary
+// with 400.
+func TestTestGitConnection_InvalidOwner_400(t *testing.T) {
+	env := newAPITestEnv(t)
+	payload := map[string]string{
+		"base_url":   "https://git.example.com",
+		"repo_owner": "../admin",
+		"repo_name":  "repo",
+	}
+	res := env.do(t, http.MethodPost, "/api/discovery/git/test-connection", payload)
+	if code := res.StatusCode; code != http.StatusBadRequest {
+		_ = decodeEnvelope(t, res, nil)
+		t.Fatalf("status = %d, want 400 (traversal rejected)", code)
+	}
+}
+
+// A branch containing a `..` segment must be rejected with 400.
+func TestListGitTree_InvalidBranch_400(t *testing.T) {
+	env := newAPITestEnv(t)
+	payload := map[string]string{
+		"base_url":   "https://git.example.com",
+		"repo_owner": "owner",
+		"repo_name":  "repo",
+		"branch":     "feature/../admin",
+	}
+	res := env.do(t, http.MethodPost, "/api/discovery/git/tree", payload)
+	if code := res.StatusCode; code != http.StatusBadRequest {
+		t.Fatalf("status = %d, want 400 (branch traversal rejected)", code)
+	}
+}
+
+// A base_url with a non-http scheme must be rejected with 400.
+func TestDetectGitProvider_InvalidScheme_400(t *testing.T) {
+	env := newAPITestEnv(t)
+	payload := map[string]string{
+		"base_url": "ftp://git.example.com",
+	}
+	res := env.do(t, http.MethodPost, "/api/discovery/git/detect-provider", payload)
+	if code := res.StatusCode; code != http.StatusBadRequest {
+		t.Fatalf("status = %d, want 400 (non-http scheme rejected)", code)
+	}
+}
@@ -0,0 +1,375 @@
+package api
+
+import (
+	"fmt"
+	"log/slog"
+	"net/http"
+	"strings"
+
+	"github.com/alexei/tinyforge/internal/crypto"
+	"github.com/alexei/tinyforge/internal/dns"
+	"github.com/alexei/tinyforge/internal/store"
+	"github.com/go-chi/chi/v5"
+)
+
+// dnsTargetIP returns the IP address to publish in DNS A records.
+// It prefers settings.PublicIP (the proxy/NPM host) and falls back to
+// settings.ServerIP when no public IP is configured.
+func dnsTargetIP(settings store.Settings) string {
+	if settings.PublicIP != "" {
+		return settings.PublicIP
+	}
+	// Return the fallback field directly. Calling dnsTargetIP again here
+	// would recurse without bound whenever PublicIP is empty.
+	return settings.ServerIP
+}
+
+// dnsRecordView is the JSON response shape for one DNS record, annotated with
+// the consumer (type, display name, ID) the record belongs to. Orphaned
+// records carry empty consumer fields.
+type dnsRecordView struct {
+	FQDN         string `json:"fqdn"`
+	Type         string `json:"type"`
+	Content      string `json:"content"`
+	ConsumerType string `json:"consumer_type"`
+	ConsumerName string `json:"consumer_name"`
+	ConsumerID   string `json:"consumer_id"`
+	Status       string `json:"status"` // "synced", "orphaned", "missing", or "wildcard"
+}
+
+// listDNSRecords handles GET /api/dns/records.
+//
+// Wildcard mode: every expected FQDN derived from active consumers is listed
+// with status "wildcard"; the DNS provider is not consulted.
+//
+// Managed mode: locally tracked records are compared against the provider's
+// records. A tracked record that exists at the provider is "synced",
+// otherwise "missing"; a provider record without a local tracking row is
+// "orphaned". When the provider listing fails the error is logged and the
+// response degrades to a local-only view.
+func (s *Server) listDNSRecords(w http.ResponseWriter, r *http.Request) {
+	settings, err := s.store.GetSettings()
+	if err != nil {
+		respondError(w, http.StatusInternalServerError, "failed to get settings: "+err.Error())
+		return
+	}
+
+	displayNames := s.buildConsumerNameMap()
+	// labelFor resolves the display name for a "type:id" key and falls back
+	// to the bare consumer ID when no name is known.
+	labelFor := func(key, fallbackID string) string {
+		if label := displayNames[key]; label != "" {
+			return label
+		}
+		return fallbackID
+	}
+
+	// Wildcard mode is informational only: list what consumers expect.
+	if settings.WildcardDNS {
+		expected, err := s.computeExpectedFQDNs(settings)
+		if err != nil {
+			respondError(w, http.StatusInternalServerError, "failed to compute expected records: "+err.Error())
+			return
+		}
+		rows := make([]dnsRecordView, 0, len(expected))
+		for fqdn, key := range expected {
+			kind, id, _ := strings.Cut(key, ":")
+			rows = append(rows, dnsRecordView{
+				FQDN:         fqdn,
+				Type:         "A",
+				Content:      dnsTargetIP(settings),
+				ConsumerType: kind,
+				ConsumerName: labelFor(key, id),
+				ConsumerID:   id,
+				Status:       "wildcard",
+			})
+		}
+		respondJSON(w, http.StatusOK, rows)
+		return
+	}
+
+	// Managed mode: start from the locally tracked rows.
+	tracked, err := s.store.ListDNSRecords()
+	if err != nil {
+		respondError(w, http.StatusInternalServerError, "failed to list local records: "+err.Error())
+		return
+	}
+
+	// Ask the provider for its view; a failure is logged, not fatal.
+	var remote []dns.Record
+	if provider := s.getOrCreateDNSProvider(settings); provider != nil {
+		remote, err = provider.ListRecords(r.Context())
+		if err != nil {
+			slog.Warn("dns records: failed to list provider records", "error", err)
+		}
+	}
+
+	remoteByFQDN := make(map[string]dns.Record, len(remote))
+	for _, rec := range remote {
+		remoteByFQDN[rec.FQDN] = rec
+	}
+	trackedFQDNs := make(map[string]bool, len(tracked))
+	for _, rec := range tracked {
+		trackedFQDNs[rec.FQDN] = true
+	}
+
+	rows := make([]dnsRecordView, 0, len(tracked)+len(remote))
+
+	// Tracked rows: synced when the provider has them, missing otherwise.
+	for _, rec := range tracked {
+		row := dnsRecordView{
+			FQDN:         rec.FQDN,
+			Type:         "A",
+			Content:      dnsTargetIP(settings),
+			ConsumerType: rec.ConsumerType,
+			ConsumerName: labelFor(rec.ConsumerType+":"+rec.ConsumerID, rec.ConsumerID),
+			ConsumerID:   rec.ConsumerID,
+			Status:       "missing",
+		}
+		if match, found := remoteByFQDN[rec.FQDN]; found {
+			row.Status = "synced"
+			row.Content = match.Content
+		}
+		rows = append(rows, row)
+	}
+
+	// Provider rows with no local tracking are orphans (no consumer info).
+	for _, rec := range remote {
+		if trackedFQDNs[rec.FQDN] {
+			continue
+		}
+		rows = append(rows, dnsRecordView{
+			FQDN:    rec.FQDN,
+			Type:    rec.Type,
+			Content: rec.Content,
+			Status:  "orphaned",
+		})
+	}
+
+	respondJSON(w, http.StatusOK, rows)
+}
+
+// deleteDNSRecord handles DELETE /api/dns/records/{fqdn}.
+// The record is removed from the DNS provider first (when one is available);
+// a provider failure aborts with 502. The local tracking row is then removed
+// on a best-effort basis: a failure there is logged but still reported to the
+// client as a successful delete.
+func (s *Server) deleteDNSRecord(w http.ResponseWriter, r *http.Request) {
+	name := chi.URLParam(r, "fqdn")
+	if name == "" {
+		respondError(w, http.StatusBadRequest, "fqdn is required")
+		return
+	}
+
+	settings, err := s.store.GetSettings()
+	if err != nil {
+		respondError(w, http.StatusInternalServerError, "failed to get settings: "+err.Error())
+		return
+	}
+
+	if provider := s.getOrCreateDNSProvider(settings); provider != nil {
+		if delErr := provider.DeleteRecord(r.Context(), name); delErr != nil {
+			respondError(w, http.StatusBadGateway, "failed to delete DNS record: "+delErr.Error())
+			return
+		}
+	}
+
+	// Drop the local tracking row; failure here is non-fatal.
+	if trackErr := s.store.DeleteDNSRecord(name); trackErr != nil {
+		slog.Warn("delete dns tracking record", "fqdn", name, "error", trackErr)
+	}
+
+	respondJSON(w, http.StatusOK, map[string]string{"status": "deleted"})
+}
+
+// buildConsumerNameMap returns a lookup from "instance:<containerID>" to a
+// display label of the form "<workload>[/<role>][:<imageTag>]", built from
+// the containers index. Workload names are fetched once per workload ID; a
+// failed workload lookup leaves that part of the label empty. If the
+// containers themselves cannot be listed an empty map is returned.
+func (s *Server) buildConsumerNameMap() map[string]string {
+	labels := make(map[string]string)
+	containers, err := s.store.ListContainers(store.ContainerFilter{})
+	if err != nil {
+		return labels
+	}
+
+	// Per-call cache so each workload is looked up at most once.
+	nameByWorkload := make(map[string]string)
+	for _, ctr := range containers {
+		workload, cached := nameByWorkload[ctr.WorkloadID]
+		if !cached {
+			if wl, lookupErr := s.store.GetWorkloadByID(ctr.WorkloadID); lookupErr == nil {
+				workload = wl.Name
+			}
+			nameByWorkload[ctr.WorkloadID] = workload
+		}
+
+		display := workload
+		if ctr.Role != "" {
+			display += "/" + ctr.Role
+		}
+		if ctr.ImageTag != "" {
+			display += ":" + ctr.ImageTag
+		}
+		labels["instance:"+ctr.ID] = display
+	}
+	return labels
+}
+
+// getOrCreateDNSProvider returns the server's existing DNS provider when one
+// is set; otherwise it builds a temporary provider from settings. It returns
+// nil when wildcard mode is on, when no provider or API token is configured,
+// or when decrypting the token / constructing the provider fails (both
+// failures are logged).
+func (s *Server) getOrCreateDNSProvider(settings store.Settings) dns.Provider {
+	if existing := s.getDNSProviderLocked(); existing != nil {
+		return existing
+	}
+
+	configured := !settings.WildcardDNS &&
+		settings.DNSProvider != "" &&
+		settings.CloudflareAPIToken != ""
+	if !configured {
+		return nil
+	}
+
+	// The API token is stored encrypted; decrypt it with the server key.
+	plainToken, err := crypto.Decrypt(s.encKey, settings.CloudflareAPIToken)
+	if err != nil {
+		slog.Warn("dns: failed to decrypt token for provider creation", "error", err)
+		return nil
+	}
+
+	cfg := dns.Config{
+		Token:  plainToken,
+		ZoneID: settings.CloudflareZoneID,
+	}
+	created, err := dns.NewProvider(settings.DNSProvider, cfg)
+	if err != nil {
+		slog.Warn("dns: failed to create provider", "error", err)
+		return nil
+	}
+	return created
+}
+
+// syncDNSRecords handles POST /api/dns/sync.
+//
+// It reconciles the DNS provider with the set of FQDNs expected by active
+// consumers:
+//   - expected FQDNs absent from the provider are created and tracked locally;
+//   - locally tracked FQDNs that are no longer expected are deleted from the
+//     provider and from local tracking.
+//
+// Per-record provider failures are logged and skipped so one bad record does
+// not abort the whole sync. The response reports the number of records
+// created, deleted, and already in sync. Sync is rejected with 400 in
+// wildcard mode or when no provider is configured.
+func (s *Server) syncDNSRecords(w http.ResponseWriter, r *http.Request) {
+	settings, err := s.store.GetSettings()
+	if err != nil {
+		respondError(w, http.StatusInternalServerError, "failed to get settings: "+err.Error())
+		return
+	}
+
+	if settings.WildcardDNS {
+		respondError(w, http.StatusBadRequest, "DNS sync is disabled in wildcard mode")
+		return
+	}
+
+	provider := s.getOrCreateDNSProvider(settings)
+	if provider == nil {
+		respondError(w, http.StatusBadRequest, "DNS provider not configured")
+		return
+	}
+
+	// Compute expected FQDNs from active consumers.
+	expectedFQDNs, err := s.computeExpectedFQDNs(settings)
+	if err != nil {
+		respondError(w, http.StatusInternalServerError, "failed to compute expected records: "+err.Error())
+		return
+	}
+
+	// Get actual provider records.
+	providerRecords, err := provider.ListRecords(r.Context())
+	if err != nil {
+		respondError(w, http.StatusBadGateway, "failed to list DNS records: "+err.Error())
+		return
+	}
+
+	providerByFQDN := make(map[string]dns.Record, len(providerRecords))
+	for _, rec := range providerRecords {
+		providerByFQDN[rec.FQDN] = rec
+	}
+
+	// Get local tracking records; they drive the orphan cleanup below.
+	localRecords, err := s.store.ListDNSRecords()
+	if err != nil {
+		respondError(w, http.StatusInternalServerError, "failed to list local records: "+err.Error())
+		return
+	}
+
+	created := 0
+	deleted := 0
+	alreadySynced := 0
+
+	// Create missing records.
+	for fqdn, consumer := range expectedFQDNs {
+		if _, exists := providerByFQDN[fqdn]; exists {
+			alreadySynced++
+			continue
+		}
+
+		recordID, err := provider.EnsureRecord(r.Context(), fqdn, dnsTargetIP(settings))
+		if err != nil {
+			slog.Warn("dns sync: failed to create record", "fqdn", fqdn, "error", err)
+			continue
+		}
+
+		// Track locally. The consumer key has the form "type:id".
+		consumerType, consumerID, _ := strings.Cut(consumer, ":")
+		if _, err := s.store.CreateDNSRecord(store.DNSRecord{
+			FQDN:             fqdn,
+			ProviderRecordID: recordID,
+			ConsumerType:     consumerType,
+			ConsumerID:       consumerID,
+		}); err != nil {
+			// Creating the tracking row failed (presumably because one
+			// already exists for this FQDN), so refresh its provider ID
+			// instead.
+			s.store.UpdateDNSRecordProviderID(fqdn, recordID)
+		}
+		created++
+	}
+
+	// Delete orphaned records (tracked locally, but no active consumer).
+	for _, local := range localRecords {
+		if _, expected := expectedFQDNs[local.FQDN]; expected {
+			continue
+		}
+		if err := provider.DeleteRecord(r.Context(), local.FQDN); err != nil {
+			slog.Warn("dns sync: failed to delete orphaned record", "fqdn", local.FQDN, "error", err)
+			continue
+		}
+		// The provider record is gone; a stale tracking row is logged but
+		// does not undo the deletion count.
+		if err := s.store.DeleteDNSRecord(local.FQDN); err != nil {
+			slog.Warn("dns sync: failed to delete tracking record", "fqdn", local.FQDN, "error", err)
+		}
+		deleted++
+	}
+
+	respondJSON(w, http.StatusOK, map[string]int{
+		"created":        created,
+		"deleted":        deleted,
+		"already_synced": alreadySynced,
+	})
+}
+
+// computeExpectedFQDNs maps each expected FQDN to its consumer key
+// ("instance:<containerID>"). A container contributes an entry only when it
+// has a subdomain, its state is "running", and it has a proxy route (a
+// non-zero NpmProxyID or a non-empty ProxyRouteID). The FQDN is
+// "<subdomain>.<settings.Domain>".
+func (s *Server) computeExpectedFQDNs(settings store.Settings) (map[string]string, error) {
+	result := make(map[string]string)
+	containers, err := s.store.ListContainers(store.ContainerFilter{})
+	if err != nil {
+		return nil, fmt.Errorf("list containers: %w", err)
+	}
+	for _, ctr := range containers {
+		routable := ctr.NpmProxyID != 0 || ctr.ProxyRouteID != ""
+		if ctr.Subdomain != "" && ctr.State == "running" && routable {
+			result[ctr.Subdomain+"."+settings.Domain] = "instance:" + ctr.ID
+		}
+	}
+	return result, nil
+}
@@ -0,0 +1,379 @@
+package api
+
+import (
+	"bufio"
+	"encoding/json"
+	"fmt"
+	"io"
+	"log/slog"
+	"net/http"
+	"regexp"
+	"strconv"
+	"strings"
+	"sync"
+	"time"
+
+	"github.com/alexei/tinyforge/internal/store"
+)
+
+// Limits and constants for the log endpoints.
+const (
+	// defaultLogTail is the tail value used when ?tail= is empty or invalid.
+	defaultLogTail     = 200
+	// maxLogTail is the upper bound that larger ?tail= values are clamped to.
+	maxLogTail         = 5000
+	maxJSONLogBytes    = 4 << 20 // 4 MiB cap for non-streaming log responses
+	maxLogLineBytes    = 1 << 20 // 1 MiB max line length for the bufio.Scanner
+	// logHeartbeatPeriod is the interval between SSE keep-alive comments.
+	logHeartbeatPeriod = 20 * time.Second
+)
+
+// ANSI escape sequence patterns. Stripped from streamed log lines so a
+// hostile container cannot inject terminal control sequences (cursor moves,
+// hyperlink escapes, screen clears) into operator displays or pasted output.
+var (
+	// ansiCSIPattern matches CSI sequences: ESC '[', parameter bytes,
+	// intermediate bytes, and one final byte.
+	ansiCSIPattern = regexp.MustCompile(`\x1b\[[0-9;?]*[ -/]*[@-~]`)
+	// ansiOSCPattern matches OSC sequences: ESC ']' up to a BEL or ESC '\'
+	// terminator. An unterminated OSC does not match.
+	ansiOSCPattern = regexp.MustCompile(`\x1b\][^\x07\x1b]*(?:\x07|\x1b\\)`)
+	// ctlBytePattern matches single control bytes 0x00-0x1f and 0x7f, except
+	// tab (0x09), line feed (0x0a), and ESC (0x1b).
+	ctlBytePattern = regexp.MustCompile(`[\x00-\x08\x0b-\x1a\x1c-\x1f\x7f]`)
+)
+
+// streamLogsForContainer serves the logs of containerID in one of two modes,
+// selected by the request's Accept header:
+//
+//   - JSON (default): reads at most maxJSONLogBytes of log output and
+//     responds with a JSON array of sanitized, non-empty lines.
+//   - SSE (Accept contains "text/event-stream"): streams each sanitized line
+//     as a `data: {"line": ...}` event, following the log when ?follow=true,
+//     with periodic comment heartbeats.
+//
+// The ?tail= query value is validated by parseTailParam. Owner-specific
+// handlers (workload-container) should validate ownership and then delegate
+// here.
+func (s *Server) streamLogsForContainer(w http.ResponseWriter, r *http.Request, containerID string) {
+	if s.docker == nil {
+		respondError(w, http.StatusServiceUnavailable, "Docker is not available")
+		return
+	}
+
+	tail := parseTailParam(r.URL.Query().Get("tail"))
+	follow := r.URL.Query().Get("follow") == "true"
+
+	// Check if client accepts SSE.
+	accept := r.Header.Get("Accept")
+	isSSE := strings.Contains(accept, "text/event-stream")
+
+	// Following only makes sense for a streaming response.
+	logReader, err := s.docker.ContainerLogs(r.Context(), containerID, follow && isSSE, tail)
+	if err != nil {
+		slog.Error("failed to get container logs", "container", containerID, "error", err)
+		respondError(w, http.StatusInternalServerError, "failed to get container logs")
+		return
+	}
+	defer logReader.Close()
+
+	if !isSSE {
+		// JSON mode: cap the total bytes read so a chatty container with
+		// tail=large cannot exhaust server memory.
+		scanner := bufio.NewScanner(io.LimitReader(logReader, maxJSONLogBytes))
+		scanner.Buffer(make([]byte, 0, 64*1024), maxLogLineBytes)
+		var lines []string
+		for scanner.Scan() {
+			line := sanitizeDockerLogLine(scanner.Text())
+			if line != "" {
+				lines = append(lines, line)
+			}
+		}
+		// A scan error (e.g. a line longer than maxLogLineBytes) truncates
+		// the output; record it instead of dropping it silently. The lines
+		// collected so far are still returned.
+		if scanErr := scanner.Err(); scanErr != nil {
+			slog.Warn("container logs: scan stopped early", "container", containerID, "error", scanErr)
+		}
+		if lines == nil {
+			// Encode as [] rather than null.
+			lines = []string{}
+		}
+		respondJSON(w, http.StatusOK, lines)
+		return
+	}
+
+	// SSE mode: stream lines as they arrive.
+	release, ok := acquireSSESlot(w, s.sseGate)
+	if !ok {
+		return
+	}
+	defer release()
+
+	flusher, ok := w.(http.Flusher)
+	if !ok {
+		respondError(w, http.StatusInternalServerError, "streaming not supported")
+		return
+	}
+
+	w.Header().Set("Content-Type", "text/event-stream")
+	w.Header().Set("Cache-Control", "no-cache")
+	w.Header().Set("Connection", "keep-alive")
+
+	// Heartbeat keeps the connection warm through proxies that close idle
+	// streams. Sent as an SSE comment which the EventSource API ignores.
+	heartbeat := time.NewTicker(logHeartbeatPeriod)
+	defer heartbeat.Stop()
+	heartbeatDone := make(chan struct{})
+	var hbMu sync.Mutex // serializes writes from the heartbeat and the line loop
+	var hbWG sync.WaitGroup
+	hbWG.Add(1)
+	// Stop the heartbeat goroutine AND wait for it to exit before the
+	// handler returns, so it can never write to w after ServeHTTP is done.
+	defer func() {
+		close(heartbeatDone)
+		hbWG.Wait()
+	}()
+	go func() {
+		defer hbWG.Done()
+		for {
+			select {
+			case <-heartbeat.C:
+				hbMu.Lock()
+				_, _ = io.WriteString(w, ": ping\n\n")
+				flusher.Flush()
+				hbMu.Unlock()
+			case <-heartbeatDone:
+				return
+			case <-r.Context().Done():
+				return
+			}
+		}
+	}()
+
+	scanner := bufio.NewScanner(logReader)
+	scanner.Buffer(make([]byte, 0, 64*1024), maxLogLineBytes)
+	for scanner.Scan() {
+		line := sanitizeDockerLogLine(scanner.Text())
+		if line == "" {
+			continue
+		}
+
+		// Marshalling a map[string]string is not expected to fail, so the
+		// error is deliberately ignored.
+		data, _ := json.Marshal(map[string]string{"line": line})
+		hbMu.Lock()
+		fmt.Fprintf(w, "data: %s\n\n", data)
+		flusher.Flush()
+		hbMu.Unlock()
+
+		// Check if client disconnected.
+		select {
+		case <-r.Context().Done():
+			return
+		default:
+		}
+	}
+	// Report a real read failure; errors seen after the request context
+	// ended are skipped as ordinary client disconnects.
+	if scanErr := scanner.Err(); scanErr != nil && r.Context().Err() == nil {
+		slog.Warn("container logs: stream ended with error", "container", containerID, "error", scanErr)
+	}
+}
+
+// parseTailParam normalizes the ?tail= query value into the decimal string
+// passed on to the log reader. Empty, non-numeric, zero, and negative inputs
+// yield defaultLogTail; values above maxLogTail are clamped to maxLogTail.
+// Non-numeric values such as "all" therefore fall back to the default, which
+// keeps callers from requesting unbounded log history.
+func parseTailParam(raw string) string {
+	limit := defaultLogTail
+	if parsed, err := strconv.Atoi(raw); err == nil && parsed > 0 {
+		limit = parsed
+		if limit > maxLogTail {
+			limit = maxLogTail
+		}
+	}
+	return strconv.Itoa(limit)
+}
+
+// sanitizeDockerLogLine strips the Docker log stream header (8-byte prefix)
+// that Docker adds to non-TTY container logs, and removes terminal control
+// sequences so a hostile container cannot inject ANSI escapes that hijack an
+// operator's terminal when log output is pasted or rendered raw.
+//
+// Complete OSC and CSI sequences are removed first, then individual control
+// bytes, and finally any ESC byte that is still left (a bare or unterminated
+// escape sequence). Tabs are preserved.
+func sanitizeDockerLogLine(line string) string {
+	// Docker multiplexed stream: first 8 bytes are header (stream type + size).
+	// If the line starts with stream type 1 or 2 followed by 0x00 0x00 0x00, strip 8 bytes.
+	if len(line) > 8 && (line[0] == 1 || line[0] == 2) && line[1] == 0 && line[2] == 0 && line[3] == 0 {
+		line = line[8:]
+	}
+	line = ansiOSCPattern.ReplaceAllString(line, "")
+	line = ansiCSIPattern.ReplaceAllString(line, "")
+	line = ctlBytePattern.ReplaceAllString(line, "")
+	// ctlBytePattern does not cover ESC (0x1b), so an escape that is not part
+	// of a complete CSI/OSC sequence would otherwise reach the operator's
+	// terminal intact. Drop every ESC byte that remains.
+	line = strings.ReplaceAll(line, "\x1b", "")
+	return line
+}
+
+// buildActiveImagesSet returns the set of non-empty image references
+// (container ImageRef values) used by any container in the containers index,
+// gathered in one listing pass. A listing failure is returned to the caller
+// rather than swallowed, so prune logic never mistakes a transient DB error
+// for "nothing is active".
+func buildActiveImagesSet(st *store.Store) (map[string]bool, error) {
+	containers, err := st.ListContainers(store.ContainerFilter{})
+	if err != nil {
+		return nil, fmt.Errorf("list containers: %w", err)
+	}
+	inUse := make(map[string]bool, len(containers))
+	for _, ctr := range containers {
+		if ref := ctr.ImageRef; ref != "" {
+			inUse[ref] = true
+		}
+	}
+	return inUse, nil
+}
+
+// workloadImageBases returns the set of image names with the tag removed,
+// derived from every container's image_ref via splitImageTag. Containers
+// without an image reference are ignored. This replaces the legacy "list all
+// projects → projects[].Image" view after the workload-first cutover.
+func workloadImageBases(st *store.Store) (map[string]bool, error) {
+	containers, err := st.ListContainers(store.ContainerFilter{})
+	if err != nil {
+		return nil, fmt.Errorf("list containers: %w", err)
+	}
+	result := make(map[string]bool, len(containers))
+	for _, ctr := range containers {
+		if ctr.ImageRef == "" {
+			continue
+		}
+		if base, _ := splitImageTag(ctr.ImageRef); base != "" {
+			result[base] = true
+		}
+	}
+	return result, nil
+}
+
+// splitImageTag splits "image:tag" at the last colon into image and tag.
+// When there is no colon, or the text after the last colon contains a "/"
+// (the colon belongs to a registry host:port, not a tag), the whole string
+// is returned as the image with an empty tag. Inlined here because the
+// legacy deploys.go that owned it was removed.
+func splitImageTag(ref string) (string, string) {
+	colon := strings.LastIndex(ref, ":")
+	if colon < 0 {
+		return ref, ""
+	}
+	tag := ref[colon+1:]
+	if strings.Contains(tag, "/") {
+		return ref, ""
+	}
+	return ref[:colon], tag
+}
+
+// unusedImageStats handles GET /api/docker/unused-images. It reports the
+// number and total size (in MiB) of images that belong to a workload image
+// base but are not referenced by any container, plus the configured prune
+// threshold and whether that threshold is met or exceeded. A threshold of 0
+// never counts as exceeded. When Docker is unavailable an all-zero response
+// is returned.
+func (s *Server) unusedImageStats(w http.ResponseWriter, r *http.Request) {
+	if s.docker == nil {
+		respondJSON(w, http.StatusOK, map[string]any{
+			"total_size_mb": 0, "count": 0, "threshold_mb": 0, "exceeded": false,
+		})
+		return
+	}
+
+	settings, err := s.store.GetSettings()
+	if err != nil {
+		slog.Error("unused images: get settings", "error", err)
+		respondError(w, http.StatusInternalServerError, "internal server error")
+		return
+	}
+
+	imageBases, err := workloadImageBases(s.store)
+	if err != nil {
+		slog.Error("unused images: list workload images", "error", err)
+		respondError(w, http.StatusInternalServerError, "internal server error")
+		return
+	}
+
+	activeImages, err := buildActiveImagesSet(s.store)
+	if err != nil {
+		slog.Error("unused images: build active set", "error", err)
+		respondError(w, http.StatusInternalServerError, "internal server error")
+		return
+	}
+
+	ctx := r.Context()
+	var totalSize int64
+	var count int
+	for base := range imageBases {
+		images, err := s.docker.ListImagesByRef(ctx, base)
+		if err != nil {
+			// Skip this base but leave a trace: a silent skip would make
+			// the reported totals look complete when they are not.
+			slog.Warn("unused images: list images", "image", base, "error", err)
+			continue
+		}
+		for _, img := range images {
+			if !activeImages[img.Ref] {
+				totalSize += img.Size
+				count++
+			}
+		}
+	}
+
+	totalMB := totalSize / (1024 * 1024)
+	exceeded := settings.ImagePruneThresholdMB > 0 && int(totalMB) >= settings.ImagePruneThresholdMB
+
+	respondJSON(w, http.StatusOK, map[string]any{
+		"total_size_mb": totalMB,
+		"count":         count,
+		"threshold_mb":  settings.ImagePruneThresholdMB,
+		"exceeded":      exceeded,
+	})
+}
+
+// pruneImages handles POST /api/docker/prune-images. For every workload
+// image base (derived from container.image_ref) it removes the images that no
+// container currently references; arbitrary host images are never touched.
+// Failures to list or remove an individual image are logged and skipped. The
+// response reports how many images were removed and the space reclaimed in
+// MiB.
+func (s *Server) pruneImages(w http.ResponseWriter, r *http.Request) {
+	if s.docker == nil {
+		respondError(w, http.StatusServiceUnavailable, "Docker is not available")
+		return
+	}
+
+	bases, err := workloadImageBases(s.store)
+	if err != nil {
+		slog.Error("prune: list workload images", "error", err)
+		respondError(w, http.StatusInternalServerError, "internal server error")
+		return
+	}
+
+	inUse, err := buildActiveImagesSet(s.store)
+	if err != nil {
+		slog.Error("prune: build active set", "error", err)
+		respondError(w, http.StatusInternalServerError, "internal server error")
+		return
+	}
+
+	// Nothing is managed by workloads, so there is nothing to prune.
+	if len(bases) == 0 {
+		respondJSON(w, http.StatusOK, map[string]any{
+			"images_removed":     0,
+			"space_reclaimed_mb": 0,
+			"message":            "No workload images to clean up",
+		})
+		return
+	}
+
+	var (
+		ctx        = r.Context()
+		removedCnt int
+		freedBytes int64
+	)
+	for base := range bases {
+		candidates, listErr := s.docker.ListImagesByRef(ctx, base)
+		if listErr != nil {
+			slog.Warn("prune: list images", "image", base, "error", listErr)
+			continue
+		}
+
+		for _, candidate := range candidates {
+			if inUse[candidate.Ref] {
+				continue
+			}
+			if rmErr := s.docker.RemoveImage(ctx, candidate.ID); rmErr != nil {
+				slog.Warn("prune: remove image", "image", candidate.Ref, "error", rmErr)
+				continue
+			}
+			removedCnt++
+			freedBytes += candidate.Size
+			slog.Info("prune: removed image", "ref", candidate.Ref, "size_mb", candidate.Size/(1024*1024))
+		}
+	}
+
+	respondJSON(w, http.StatusOK, map[string]any{
+		"images_removed":     removedCnt,
+		"space_reclaimed_mb": freedBytes / (1024 * 1024),
+	})
+}
+
+// pruneBuildCache handles POST /api/docker/prune-build-cache. It asks the
+// Docker client to prune build-cache records with all=false, so only unused
+// records are targeted and an app's next rebuild can still hit its warm
+// cache. The build cache is regenerable by definition — pruning only forces
+// slower rebuilds, never data loss — and the dockerfile/static deploy paths
+// never reclaim it on teardown, so it grows monotonically until pruned here.
+// The response reports the number of cache records deleted and the space
+// reclaimed in MiB.
+func (s *Server) pruneBuildCache(w http.ResponseWriter, r *http.Request) {
+	if s.docker == nil {
+		respondError(w, http.StatusServiceUnavailable, "Docker is not available")
+		return
+	}
+
+	report, err := s.docker.PruneBuildCache(r.Context(), false)
+	if err != nil {
+		slog.Error("prune: build cache", "error", err)
+		respondError(w, http.StatusInternalServerError, "internal server error")
+		return
+	}
+
+	reclaimedMB := report.SpaceReclaimed / (1024 * 1024)
+	slog.Info("prune: build cache",
+		"caches_deleted", report.CachesDeleted,
+		"space_reclaimed_mb", reclaimedMB)
+
+	respondJSON(w, http.StatusOK, map[string]any{
+		"caches_deleted":     report.CachesDeleted,
+		"space_reclaimed_mb": reclaimedMB,
+	})
+}
@@ -0,0 +1,325 @@
+// Package api: event-trigger HTTP handlers. The dispatcher itself
+// lives in internal/events; this file is the REST surface that lets
+// operators create, edit, and test triggers from the UI.
+package api
+
+import (
+	"context"
+	"errors"
+	"net"
+	"net/http"
+	"net/url"
+	"regexp"
+	"strconv"
+	"time"
+
+	"github.com/go-chi/chi/v5"
+
+	"github.com/alexei/tinyforge/internal/events"
+	"github.com/alexei/tinyforge/internal/notify"
+	"github.com/alexei/tinyforge/internal/store"
+)
+
+// triggerInput is the JSON shape accepted by POST + PATCH. Pointers
+// distinguish "absent" from a zero/empty value so PATCH can leave a
+// field unchanged. Required fields on POST are validated explicitly.
+// On POST, an absent field is treated as its zero value, except Enabled
+// (absent means true) and ActionType (empty defaults to the webhook action).
+type triggerInput struct {
+	Name               *string `json:"name"`
+	FilterSeverity     *string `json:"filter_severity"`
+	FilterSource       *string `json:"filter_source"`
+	FilterMessageRegex *string `json:"filter_message_regex"`
+	ActionType         *string `json:"action_type"`
+	ActionTarget       *string `json:"action_target"`
+	ActionSecret       *string `json:"action_secret"` // omit = leave unchanged; "" = clear
+	Enabled            *bool   `json:"enabled"`
+}
+
+// actionSecretPlaceholder is what we return on read to signal "a secret
+// is configured" without exposing the actual value. The edit page
+// preserves this placeholder verbatim (or replaces it with a new value)
+// — the API treats the placeholder as "no change" on PATCH. This is
+// the same shape Stripe / GitHub use for their secret read APIs.
+const actionSecretPlaceholder = "********"
+
+// listEventTriggers handles GET /api/event-triggers. Every trigger is passed
+// through redactTriggerSecret before it is returned, so a configured secret
+// is shown only as the placeholder and never as its real value.
+func (s *Server) listEventTriggers(w http.ResponseWriter, r *http.Request) {
+	triggers, err := s.store.ListEventTriggers()
+	if err != nil {
+		respondError(w, http.StatusInternalServerError, "list event triggers")
+		return
+	}
+	for idx, trig := range triggers {
+		triggers[idx] = redactTriggerSecret(trig)
+	}
+	respondJSON(w, http.StatusOK, triggers)
+}
+
+// getEventTrigger handles GET /api/event-triggers/{id}. The trigger is
+// returned with its secret redacted; store errors are translated by
+// mapStoreError.
+func (s *Server) getEventTrigger(w http.ResponseWriter, r *http.Request) {
+	id, valid := parseTriggerID(w, r)
+	if !valid {
+		return
+	}
+	trig, err := s.store.GetEventTrigger(id)
+	if err != nil {
+		mapStoreError(w, err, "event trigger")
+		return
+	}
+	respondJSON(w, http.StatusOK, redactTriggerSecret(trig))
+}
+
+// createEventTrigger handles POST /api/event-triggers. Absent fields take
+// their zero value, except that action_type defaults to the webhook action
+// and enabled defaults to true. The merged trigger is validated before it is
+// stored; the created row is returned with its secret redacted.
+func (s *Server) createEventTrigger(w http.ResponseWriter, r *http.Request) {
+	var req triggerInput
+	if !decodeJSON(w, r, &req) {
+		return
+	}
+
+	// A trigger is enabled unless the client explicitly says otherwise.
+	enabled := true
+	if req.Enabled != nil {
+		enabled = *req.Enabled
+	}
+
+	candidate := store.EventTrigger{
+		Name:               derefString(req.Name),
+		FilterSeverity:     derefString(req.FilterSeverity),
+		FilterSource:       derefString(req.FilterSource),
+		FilterMessageRegex: derefString(req.FilterMessageRegex),
+		ActionType:         firstNonEmpty(derefString(req.ActionType), store.EventTriggerActionWebhook),
+		ActionTarget:       derefString(req.ActionTarget),
+		ActionSecret:       derefString(req.ActionSecret),
+		Enabled:            enabled,
+	}
+	if problem := validateTrigger(candidate); problem != "" {
+		respondError(w, http.StatusBadRequest, problem)
+		return
+	}
+
+	created, err := s.store.CreateEventTrigger(candidate)
+	if err != nil {
+		// Validation already passed above, so a failure here is treated as
+		// a server-side problem: answer 500 with a generic message and do
+		// not echo driver text to the client.
+		respondError(w, http.StatusInternalServerError, "create event trigger")
+		return
+	}
+	respondJSON(w, http.StatusCreated, redactTriggerSecret(created))
+}
+
+// updateEventTrigger handles PATCH /api/event-triggers/{id}. The stored
+// trigger is loaded first and the supplied fields are merged onto it; absent
+// (nil) fields keep their stored value. Two fields have extra rules:
+//   - action_type is only applied when it is present and non-empty;
+//   - action_secret is ignored when it equals the read-side placeholder, so
+//     a client that echoes the redacted value back does not overwrite the
+//     real secret. Any other value, including "", replaces it.
+//
+// The merged trigger is validated before it is saved.
+func (s *Server) updateEventTrigger(w http.ResponseWriter, r *http.Request) {
+	id, valid := parseTriggerID(w, r)
+	if !valid {
+		return
+	}
+	current, err := s.store.GetEventTrigger(id)
+	if err != nil {
+		mapStoreError(w, err, "event trigger")
+		return
+	}
+
+	var patch triggerInput
+	if !decodeJSON(w, r, &patch) {
+		return
+	}
+
+	// assign copies *src into *dst when the field was present in the body.
+	assign := func(dst *string, src *string) {
+		if src != nil {
+			*dst = *src
+		}
+	}
+	assign(&current.Name, patch.Name)
+	assign(&current.FilterSeverity, patch.FilterSeverity)
+	assign(&current.FilterSource, patch.FilterSource)
+	assign(&current.FilterMessageRegex, patch.FilterMessageRegex)
+	if patch.ActionType != nil && *patch.ActionType != "" {
+		current.ActionType = *patch.ActionType
+	}
+	assign(&current.ActionTarget, patch.ActionTarget)
+	if secret := patch.ActionSecret; secret != nil && *secret != actionSecretPlaceholder {
+		current.ActionSecret = *secret
+	}
+	if patch.Enabled != nil {
+		current.Enabled = *patch.Enabled
+	}
+
+	if problem := validateTrigger(current); problem != "" {
+		respondError(w, http.StatusBadRequest, problem)
+		return
+	}
+
+	saved, err := s.store.UpdateEventTrigger(current)
+	switch {
+	case err == nil:
+		respondJSON(w, http.StatusOK, redactTriggerSecret(saved))
+	case errors.Is(err, store.ErrNotFound):
+		respondNotFound(w, "event trigger")
+	default:
+		respondError(w, http.StatusInternalServerError, "update event trigger")
+	}
+}
+
+// deleteEventTrigger handles DELETE /api/event-triggers/{id}. It answers
+// 204 No Content on success; store errors are translated by mapStoreError.
+func (s *Server) deleteEventTrigger(w http.ResponseWriter, r *http.Request) {
+	id, valid := parseTriggerID(w, r)
+	if !valid {
+		return
+	}
+	err := s.store.DeleteEventTrigger(id)
+	if err != nil {
+		mapStoreError(w, err, "event trigger")
+		return
+	}
+	w.WriteHeader(http.StatusNoContent)
+}
+
+// testEventTrigger handles POST /api/event-triggers/{id}/test. It builds a
+// synthetic TriggerWebhookPayload (event ID -1, source "test", severity
+// "info") so receivers see the same shape they'll see at runtime, and sends
+// it to the trigger's action target through SendSyncForTestPayload with a
+// 10-second timeout. The notifier's result is returned as the response body.
+// Only webhook triggers are testable; any other action type answers 400.
+func (s *Server) testEventTrigger(w http.ResponseWriter, r *http.Request) {
+	id, valid := parseTriggerID(w, r)
+	if !valid {
+		return
+	}
+	trig, err := s.store.GetEventTrigger(id)
+	if err != nil {
+		mapStoreError(w, err, "event trigger")
+		return
+	}
+	if trig.ActionType != store.EventTriggerActionWebhook {
+		respondError(w, http.StatusBadRequest, "action_type not testable")
+		return
+	}
+
+	// One timestamp is shared by the event and the envelope.
+	stamp := time.Now().UTC().Format(time.RFC3339)
+	synthetic := events.EventLogPayload{
+		ID:        -1,
+		Source:    "test",
+		Severity:  "info",
+		Message:   "Test event from Tinyforge — trigger=" + trig.Name,
+		Metadata:  `{"synthetic":true}`,
+		CreatedAt: stamp,
+	}
+	payload := events.TriggerWebhookPayload{
+		Type:      "event_trigger",
+		TriggerID: trig.ID,
+		Trigger:   trig.Name,
+		Event:     synthetic,
+		Timestamp: stamp,
+	}
+
+	ctx, cancel := context.WithTimeout(r.Context(), 10*time.Second)
+	defer cancel()
+	outcome := s.notifier.SendSyncForTestPayload(ctx, trig.ActionTarget, trig.ActionSecret,
+		notify.TierEventTrigger, "event_trigger", payload)
+	respondJSON(w, http.StatusOK, outcome)
+}
+
+// validateTrigger checks a fully-merged trigger row and returns the first
+// problem found as a client-facing message, or "" when the trigger is valid.
+// Checks run in this order: name present, action type is empty or webhook,
+// action target present, action target passes validateWebhookURL, and the
+// message regex (when set) compiles. Shared by create and update so both
+// enforce the same contract.
+func validateTrigger(t store.EventTrigger) string {
+	switch {
+	case t.Name == "":
+		return "name is required"
+	case t.ActionType != "" && t.ActionType != store.EventTriggerActionWebhook:
+		return "action_type must be 'webhook'"
+	case t.ActionTarget == "":
+		return "action_target is required"
+	}
+	if problem := validateWebhookURL(t.ActionTarget); problem != "" {
+		return problem
+	}
+	if t.FilterMessageRegex == "" {
+		return ""
+	}
+	if _, err := regexp.Compile(t.FilterMessageRegex); err != nil {
+		return "filter_message_regex invalid: " + err.Error()
+	}
+	return ""
+}
+
+// validateWebhookURL guards against the most common SSRF vectors that
+// admin-controlled webhook URLs enable. It returns a client-facing message
+// describing the first problem, or "" when the URL is acceptable:
+//   - the URL must parse;
+//   - the scheme must be http or https;
+//   - a host must be present;
+//   - a literal-IP host must not be loopback, link-local (unicast or
+//     multicast), or unspecified.
+//
+// RFC1918 private ranges are intentionally allowed since same-LAN receivers
+// are a legitimate Tinyforge deployment pattern. Hostnames are NOT resolved
+// here, so only literal IPs are checked — DNS rebinding is out of scope and
+// would require enforcement at dispatch time too. Admin gating remains the
+// primary control; this is defense-in-depth.
+func validateWebhookURL(raw string) string {
+	parsed, err := url.Parse(raw)
+	if err != nil {
+		return "action_target invalid URL: " + err.Error()
+	}
+	switch parsed.Scheme {
+	case "http", "https":
+		// Accepted schemes.
+	default:
+		return "action_target must be http:// or https://"
+	}
+	hostname := parsed.Hostname()
+	if hostname == "" {
+		return "action_target missing host"
+	}
+	literal := net.ParseIP(hostname)
+	if literal == nil {
+		// Not a literal IP: hostnames pass without resolution.
+		return ""
+	}
+	if literal.IsLoopback() || literal.IsLinkLocalUnicast() || literal.IsLinkLocalMulticast() || literal.IsUnspecified() {
+		return "action_target points at a reserved/loopback address"
+	}
+	return ""
+}
+
+// redactTriggerSecret returns a copy of t whose ActionSecret is replaced by
+// actionSecretPlaceholder when a secret is set. An empty secret is left
+// empty so the UI can distinguish "no signing" from "signing configured."
+func redactTriggerSecret(t store.EventTrigger) store.EventTrigger {
+	if t.ActionSecret == "" {
+		return t
+	}
+	t.ActionSecret = actionSecretPlaceholder
+	return t
+}
+
+// mapStoreError writes the HTTP response for a store-layer error.
+// store.ErrNotFound becomes a not-found response for resource; every other
+// error becomes a 500 with the generic message "get <resource>", so driver
+// text (schema details, transient error states) is never echoed to API
+// consumers.
+func mapStoreError(w http.ResponseWriter, err error, resource string) {
+	if !errors.Is(err, store.ErrNotFound) {
+		respondError(w, http.StatusInternalServerError, "get "+resource)
+		return
+	}
+	respondNotFound(w, resource)
+}
+
+// parseTriggerID reads the {id} URL parameter as a positive base-10 int64.
+// On success it returns (id, true). For a non-numeric, zero, or negative
+// value it writes a 400 response and returns (0, false); the caller should
+// then return without writing anything further.
+func parseTriggerID(w http.ResponseWriter, r *http.Request) (int64, bool) {
+	id, err := strconv.ParseInt(chi.URLParam(r, "id"), 10, 64)
+	if err == nil && id > 0 {
+		return id, true
+	}
+	respondError(w, http.StatusBadRequest, "invalid event trigger id")
+	return 0, false
+}
+
+// derefString returns the string p points to, or "" when p is nil.
+func derefString(p *string) string {
+	if p != nil {
+		return *p
+	}
+	return ""
+}
+
+// firstNonEmpty returns a unless it is empty, in which case it returns b.
+func firstNonEmpty(a, b string) string {
+	if a == "" {
+		return b
+	}
+	return a
+}
@@ -0,0 +1,143 @@
+package api
+
+import (
+	"strings"
+	"testing"
+
+	"github.com/alexei/tinyforge/internal/store"
+)
+
+// TestValidateWebhookURL is a table test over validateWebhookURL. Each
+// case either expects acceptance (empty wantErr, so the returned message
+// must be "") or expects the returned message to contain wantErr.
+func TestValidateWebhookURL(t *testing.T) {
+	cases := []struct {
+		name    string
+		url     string
+		wantErr string // substring; empty = pass
+	}{
+		{"https valid", "https://example.com/hook", ""},
+		{"http valid", "http://example.com:8080/hook", ""},
+		{"RFC1918 private LAN allowed", "http://192.168.1.50:9090/hook", ""},
+		{"loopback rejected", "http://127.0.0.1:8090/hook", "loopback"},
+		{"ipv6 loopback rejected", "http://[::1]:9000/hook", "loopback"},
+		{"link-local rejected", "http://169.254.169.254/latest/meta-data", "reserved"},
+		{"unspecified rejected", "http://0.0.0.0:9000/hook", "reserved"},
+		{"file scheme rejected", "file:///etc/passwd", "http:// or https://"},
+		{"missing host rejected", "https://", "missing host"},
+		{"malformed url rejected", "://nope", "invalid URL"},
+	}
+	for _, c := range cases {
+		t.Run(c.name, func(t *testing.T) {
+			got := validateWebhookURL(c.url)
+			if c.wantErr == "" {
+				if got != "" {
+					t.Fatalf("expected pass, got error: %q", got)
+				}
+				return
+			}
+			// Substring match keeps the test independent of exact wording.
+			if !strings.Contains(got, c.wantErr) {
+				t.Fatalf("error mismatch:\n  got:  %q\n  want substring: %q", got, c.wantErr)
+			}
+		})
+	}
+}
+
+// TestValidateTrigger is a table test over validateTrigger. A case with
+// an empty want must be accepted (validateTrigger returns ""); any other
+// case must produce a message containing want.
+func TestValidateTrigger(t *testing.T) {
+	cases := []struct {
+		name string
+		in   store.EventTrigger
+		want string // substring of error; empty = pass
+	}{
+		{
+			name: "missing name",
+			in:   store.EventTrigger{ActionTarget: "https://x.example.com/h"},
+			want: "name is required",
+		},
+		{
+			name: "missing target",
+			in:   store.EventTrigger{Name: "n"},
+			want: "action_target is required",
+		},
+		{
+			name: "bad scheme",
+			in:   store.EventTrigger{Name: "n", ActionTarget: "ftp://x.example.com/h"},
+			want: "http:// or https://",
+		},
+		{
+			name: "loopback target",
+			in:   store.EventTrigger{Name: "n", ActionTarget: "http://127.0.0.1/hook"},
+			want: "loopback",
+		},
+		{
+			name: "unsupported action_type",
+			in:   store.EventTrigger{Name: "n", ActionType: "email", ActionTarget: "https://x.example.com/h"},
+			want: "action_type must be",
+		},
+		{
+			name: "invalid regex",
+			in: store.EventTrigger{
+				Name: "n", ActionTarget: "https://x.example.com/h",
+				FilterMessageRegex: "([unclosed",
+			},
+			want: "filter_message_regex invalid",
+		},
+		{
+			name: "all valid",
+			in: store.EventTrigger{
+				Name:               "n",
+				ActionTarget:       "https://x.example.com/h",
+				FilterSeverity:     "warn,error",
+				FilterMessageRegex: `\bpanic\b`,
+			},
+			want: "",
+		},
+	}
+	for _, c := range cases {
+		t.Run(c.name, func(t *testing.T) {
+			got := validateTrigger(c.in)
+			if c.want == "" {
+				if got != "" {
+					t.Fatalf("expected pass, got error: %q", got)
+				}
+				return
+			}
+			// Substring match keeps the test independent of exact wording.
+			if !strings.Contains(got, c.want) {
+				t.Fatalf("error mismatch:\n  got:  %q\n  want substring: %q", got, c.want)
+			}
+		})
+	}
+}
+
+// TestRedactTriggerSecret checks three things: a configured secret is
+// replaced by the placeholder, the value passed in is not mutated, and
+// an empty secret stays empty.
+func TestRedactTriggerSecret(t *testing.T) {
+	withSecret := store.EventTrigger{Name: "n", ActionSecret: "shh-real-secret"}
+	got := redactTriggerSecret(withSecret)
+	if got.ActionSecret != actionSecretPlaceholder {
+		t.Errorf("expected placeholder, got %q", got.ActionSecret)
+	}
+	// The argument is passed by value, so the original must keep its secret.
+	if withSecret.ActionSecret != "shh-real-secret" {
+		t.Errorf("original mutated: %q", withSecret.ActionSecret)
+	}
+
+	noSecret := store.EventTrigger{Name: "n", ActionSecret: ""}
+	got2 := redactTriggerSecret(noSecret)
+	if got2.ActionSecret != "" {
+		t.Errorf("empty secret should stay empty, got %q", got2.ActionSecret)
+	}
+}
+
+// TestDerefString covers both branches of derefString: a nil pointer
+// yields "" and a non-nil pointer yields the pointed-to value.
+func TestDerefString(t *testing.T) {
+	if derefString(nil) != "" {
+		t.Error("nil should deref to empty string")
+	}
+	s := "value"
+	if derefString(&s) != "value" {
+		t.Error("non-nil should deref to value")
+	}
+}
+
+// TestFirstNonEmpty covers both branches of firstNonEmpty: the first
+// argument wins when non-empty, otherwise the second is returned.
+func TestFirstNonEmpty(t *testing.T) {
+	if firstNonEmpty("a", "b") != "a" {
+		t.Error("non-empty first wins")
+	}
+	if firstNonEmpty("", "b") != "b" {
+		t.Error("fallback when first empty")
+	}
+}
@@ -0,0 +1,106 @@
+package api
+
+import (
+	"log/slog"
+	"net/http"
+	"strconv"
+
+	"github.com/go-chi/chi/v5"
+
+	"github.com/alexei/tinyforge/internal/store"
+)
+
+// listEventLog handles GET /api/events/log.
+// Supports query parameters: severity, source, since, until, limit, offset.
+//
+// limit and offset are parsed leniently: a missing or non-numeric value
+// becomes 0, and negative values are clamped to 0 so the store never
+// receives a negative page size or offset from the query string.
+func (s *Server) listEventLog(w http.ResponseWriter, r *http.Request) {
+	q := r.URL.Query()
+
+	// Atoi errors are deliberately ignored; missing or non-numeric input
+	// yields 0.
+	limit, _ := strconv.Atoi(q.Get("limit"))
+	offset, _ := strconv.Atoi(q.Get("offset"))
+
+	filter := store.EventLogFilter{
+		Severity: q.Get("severity"),
+		Source:   q.Get("source"),
+		Since:    q.Get("since"),
+		Until:    q.Get("until"),
+		Limit:    max(limit, 0),
+		Offset:   max(offset, 0),
+	}
+
+	events, err := s.store.ListEvents(filter)
+	if err != nil {
+		// The cause goes to the server log only; the client gets a fixed message.
+		slog.Error("failed to list events", "error", err)
+		respondError(w, http.StatusInternalServerError, "failed to list events")
+		return
+	}
+
+	respondJSON(w, http.StatusOK, events)
+}
+
+// listWorkloadEvents handles GET /api/workloads/{id}/events — the per-app
+// activity/deploy timeline. The workload id is pinned from the path, so a
+// client cannot widen the scope to other workloads or the global feed.
+// Supports the same severity/limit/offset query params as listEventLog.
+//
+// limit and offset are parsed leniently: a missing or non-numeric value
+// becomes 0, and negative values are clamped to 0 so the store never
+// receives a negative page size or offset from the query string.
+func (s *Server) listWorkloadEvents(w http.ResponseWriter, r *http.Request) {
+	id := chi.URLParam(r, "id")
+	if id == "" {
+		respondError(w, http.StatusBadRequest, "workload id is required")
+		return
+	}
+
+	q := r.URL.Query()
+	// Atoi errors are deliberately ignored; missing or non-numeric input
+	// yields 0.
+	limit, _ := strconv.Atoi(q.Get("limit"))
+	offset, _ := strconv.Atoi(q.Get("offset"))
+
+	events, err := s.store.ListEvents(store.EventLogFilter{
+		WorkloadID: id,
+		Severity:   q.Get("severity"),
+		Limit:      max(limit, 0),
+		Offset:     max(offset, 0),
+	})
+	if err != nil {
+		// The cause goes to the server log only; the client gets a fixed message.
+		slog.Error("failed to list workload events", "workload", id, "error", err)
+		respondError(w, http.StatusInternalServerError, "failed to list events")
+		return
+	}
+
+	respondJSON(w, http.StatusOK, events)
+}
+
+// getEventLogStats handles GET /api/events/log/stats.
+// It reads no request parameters and returns the value produced by
+// s.store.GetEventStats as JSON. A store failure is logged and reported
+// to the client as a generic 500.
+func (s *Server) getEventLogStats(w http.ResponseWriter, r *http.Request) {
+	stats, err := s.store.GetEventStats()
+	if err != nil {
+		slog.Error("failed to get event stats", "error", err)
+		respondError(w, http.StatusInternalServerError, "failed to get event stats")
+		return
+	}
+
+	respondJSON(w, http.StatusOK, stats)
+}
+
+// deleteEvent handles DELETE /api/events/log/{id}.
+//
+// The id must be a base-10 integer greater than zero; anything else is
+// rejected with 400 before the store is touched. Store failures are
+// logged and reported to the client as a generic 500.
+func (s *Server) deleteEvent(w http.ResponseWriter, r *http.Request) {
+	id, err := strconv.ParseInt(chi.URLParam(r, "id"), 10, 64)
+	if err != nil || id <= 0 {
+		respondError(w, http.StatusBadRequest, "invalid event ID")
+		return
+	}
+	if err := s.store.DeleteEvent(id); err != nil {
+		slog.Error("failed to delete event", "id", id, "error", err)
+		respondError(w, http.StatusInternalServerError, "internal server error")
+		return
+	}
+	respondJSON(w, http.StatusOK, map[string]string{"status": "deleted"})
+}
+
+// clearEvents handles DELETE /api/events/log.
+// On success the response carries the value returned by
+// s.store.ClearAllEvents under "count". A store failure is logged and
+// reported to the client as a generic 500.
+func (s *Server) clearEvents(w http.ResponseWriter, r *http.Request) {
+	cleared, err := s.store.ClearAllEvents()
+	if err != nil {
+		slog.Error("failed to clear events", "error", err)
+		respondError(w, http.StatusInternalServerError, "internal server error")
+		return
+	}
+	respondJSON(w, http.StatusOK, map[string]any{"status": "cleared", "count": cleared})
+}
@@ -0,0 +1,251 @@
+package api
+
+import (
+	"context"
+	"net/http"
+	"time"
+
+	"github.com/alexei/tinyforge/internal/auth"
+	"github.com/alexei/tinyforge/internal/proxy"
+)
+
+// healthProbeTimeout caps one /api/health request so a stuck dependency
+// does not hold the polling endpoint open. getHealth applies it as a single
+// context deadline shared by the database ping and the Docker and proxy
+// probes, not as a per-probe budget. The UI polls every 30 s, so 8 s leaves
+// headroom for the ping + Info + NPM list calls.
+const healthProbeTimeout = 8 * time.Second
+
+// nonAdminDockerFields enumerates the fields any authenticated user is
+// allowed to see — version + connectivity + container counts. Host-detail
+// fields (kernel, root_dir, hostname, OS, storage driver) are admin-only to
+// avoid recon information leaks.
+var nonAdminDockerFields = map[string]bool{
+	"connected":    true,
+	"latency_ms":   true,
+	"error":        true,
+	"version":      true,
+	"api_version":  true,
+	"containers":   true,
+	"running":      true,
+	"paused":       true,
+	"stopped":      true,
+	"images":       true,
+	"ncpu":         true,
+	"memory_total": true,
+}
+
+// nonAdminProxyFields are the proxy fields safe to share with non-admins.
+// Configured URLs and aggregate counts of internal lists/certs are stripped:
+// of the keys proxyHealth emits, "url", "proxy_hosts", "access_lists" and
+// "certificates" are absent here and are therefore dropped by filterFields.
+var nonAdminProxyFields = map[string]bool{
+	"provider":            true,
+	"connected":           true,
+	"latency_ms":          true,
+	"error":               true,
+	"proxy_hosts_managed": true,
+}
+
+// getHealth handles GET /api/health.
+//
+// Returns the connectivity state and (when connected) diagnostics for the
+// Docker daemon and the active proxy provider. Detailed host information
+// (kernel, root_dir, internal NPM URL, …) is stripped for non-admin users to
+// avoid leaking infrastructure details to read-only viewers.
+//
+// Non-admins also receive a generic "error" string. The raw probe errors
+// are built from err.Error() and may embed the daemon endpoint or the
+// proxy URL, which would undo the field filtering described above.
+func (s *Server) getHealth(w http.ResponseWriter, r *http.Request) {
+	// One deadline is shared by every probe below.
+	ctx, cancel := context.WithTimeout(r.Context(), healthProbeTimeout)
+	defer cancel()
+
+	claims, _ := auth.ClaimsFromContext(r.Context())
+	isAdmin := claims.Role == "admin"
+
+	now := time.Now().UTC().Format(time.RFC3339)
+	result := map[string]any{
+		"checked_at": now,
+	}
+
+	// ── Database ─────────────────────────────────────────────────────
+	if err := s.store.DB().PingContext(ctx); err != nil {
+		result["database"] = map[string]any{"connected": false, "error": "database unreachable"}
+	} else {
+		result["database"] = map[string]any{"connected": true}
+	}
+
+	// ── Docker daemon ────────────────────────────────────────────────
+	docker := s.dockerHealth(ctx)
+	if !isAdmin {
+		docker = filterFields(docker, nonAdminDockerFields)
+		// filterFields returned a fresh map, so overwriting it is safe.
+		if _, failed := docker["error"]; failed {
+			docker["error"] = "docker daemon unreachable"
+		}
+	}
+	result["docker"] = docker
+
+	// ── Proxy provider ───────────────────────────────────────────────
+	if s.proxyProvider != nil {
+		proxyInfo := s.proxyHealth(ctx)
+		if !isAdmin {
+			proxyInfo = filterFields(proxyInfo, nonAdminProxyFields)
+			if _, failed := proxyInfo["error"]; failed {
+				proxyInfo["error"] = "proxy provider unreachable"
+			}
+		}
+		result["proxy"] = proxyInfo
+	}
+
+	respondJSON(w, http.StatusOK, result)
+}
+
+// filterFields returns a copy of m containing only the keys present in allow.
+// m is never modified. A key listed in allow with a false value is treated
+// as not allowed, because the lookup result itself is the test.
+func filterFields(m map[string]any, allow map[string]bool) map[string]any {
+	out := make(map[string]any, len(allow))
+	for k, v := range m {
+		if allow[k] {
+			out[k] = v
+		}
+	}
+	return out
+}
+
+// dockerHealth probes the Docker daemon and, if reachable, attaches a full
+// DaemonInfo snapshot. The caller does not need to error-check the Info()
+// call — if it fails, the connected flag remains true (ping succeeded) but
+// the detail fields are simply omitted.
+//
+// The returned map always has "connected". When the ping fails, "error"
+// carries the raw err.Error() text and "latency_ms" the time spent in
+// the failed ping. A nil s.docker yields a fixed error and no latency.
+func (s *Server) dockerHealth(ctx context.Context) map[string]any {
+	if s.docker == nil {
+		return map[string]any{
+			"connected": false,
+			"error":     "docker client not initialized",
+		}
+	}
+
+	// latency_ms measures the Ping round trip only, not the Info call below.
+	start := time.Now()
+	if err := s.docker.Ping(ctx); err != nil {
+		return map[string]any{
+			"connected":  false,
+			"error":      err.Error(),
+			"latency_ms": time.Since(start).Milliseconds(),
+		}
+	}
+
+	out := map[string]any{
+		"connected":  true,
+		"latency_ms": time.Since(start).Milliseconds(),
+	}
+
+	// Info enriches the payload; failures are non-fatal.
+	info, err := s.docker.Info(ctx)
+	if err == nil {
+		// String and size fields are emitted only when non-empty / positive.
+		if info.Version != "" {
+			out["version"] = info.Version
+		}
+		if info.APIVersion != "" {
+			out["api_version"] = info.APIVersion
+		}
+		if info.OS != "" {
+			out["os"] = info.OS
+		}
+		if info.Arch != "" {
+			out["arch"] = info.Arch
+		}
+		if info.Kernel != "" {
+			out["kernel"] = info.Kernel
+		}
+		if info.OperatingSystem != "" {
+			out["operating_system"] = info.OperatingSystem
+		}
+		if info.StorageDriver != "" {
+			out["storage_driver"] = info.StorageDriver
+		}
+		if info.RootDir != "" {
+			out["root_dir"] = info.RootDir
+		}
+		if info.Name != "" {
+			out["name"] = info.Name
+		}
+		if info.NCPU > 0 {
+			out["ncpu"] = info.NCPU
+		}
+		if info.MemoryTotal > 0 {
+			out["memory_total"] = info.MemoryTotal
+		}
+		// Counts are always emitted, even when zero.
+		out["containers"] = info.Containers
+		out["running"] = info.Running
+		out["paused"] = info.Paused
+		out["stopped"] = info.Stopped
+		out["images"] = info.Images
+	}
+
+	return out
+}
+
+// proxyHealth probes the configured proxy provider. For NPM, attaches
+// aggregate counts (proxy hosts, access lists, certificates) which the
+// dashboard surfaces alongside the connection indicator.
+//
+// s.proxyProvider is dereferenced immediately, so the caller must have
+// checked it for nil. Everything after the ping is best effort: a failed
+// settings read, NPM call or route count just leaves its key out of the
+// returned map.
+func (s *Server) proxyHealth(ctx context.Context) map[string]any {
+	providerName := s.proxyProvider.Name()
+
+	// latency_ms measures the Ping round trip only.
+	start := time.Now()
+	err := s.proxyProvider.Ping(ctx)
+	latency := time.Since(start).Milliseconds()
+
+	if err != nil {
+		// The message includes the raw error text from the provider.
+		return map[string]any{
+			"provider":   providerName,
+			"connected":  false,
+			"error":      providerName + " unreachable: " + err.Error(),
+			"latency_ms": latency,
+		}
+	}
+
+	out := map[string]any{
+		"provider":   providerName,
+		"connected":  true,
+		"latency_ms": latency,
+	}
+
+	// Attach configured URL from settings for both NPM and Traefik.
+	if settings, serr := s.store.GetSettings(); serr == nil {
+		switch providerName {
+		case "npm":
+			if settings.NpmURL != "" {
+				out["url"] = settings.NpmURL
+			}
+		case "traefik":
+			if settings.TraefikAPIURL != "" {
+				out["url"] = settings.TraefikAPIURL
+			}
+		}
+	}
+
+	// NPM-specific aggregates — a quick glance at route/list/cert counts.
+	// These calls require an authenticated NPM session, so we trigger the
+	// provider's auth step first (it's cheap: cached JWT is reused for 1h).
+	if providerName == "npm" && s.npm != nil {
+		if np, ok := s.proxyProvider.(*proxy.NpmProvider); ok {
+			if err := np.Authenticate(ctx); err == nil {
+				// Each list call is independent; one failing does not skip the others.
+				if hosts, herr := s.npm.ListProxyHosts(ctx); herr == nil {
+					out["proxy_hosts"] = len(hosts)
+				}
+				if lists, lerr := s.npm.ListAccessLists(ctx); lerr == nil {
+					out["access_lists"] = len(lists)
+				}
+				if certs, cerr := s.npm.ListCertificates(ctx); cerr == nil {
+					out["certificates"] = len(certs)
+				}
+			}
+		}
+	}
+
+	// Managed-route count — how many of the proxy's routes were deployed
+	// by Tinyforge itself, counting both Docker instances and static sites.
+	// This works for every provider (NPM, Traefik, …) because it reads from
+	// our own store, not the external proxy API.
+	if managed, merr := s.managedRouteCount(); merr == nil {
+		out["proxy_hosts_managed"] = managed
+	}
+
+	return out
+}
+
+// managedRouteCount returns the number of proxy routes Tinyforge manages,
+// reading from the unified containers index. The domain argument doesn't
+// affect the count so we pass an empty string to skip FQDN rendering.
+// A store error is returned unchanged together with a zero count.
+func (s *Server) managedRouteCount() (int, error) {
+	routes, err := s.store.ListProxyRoutes("")
+	if err != nil {
+		return 0, err
+	}
+	return len(routes), nil
+}
@@ -0,0 +1,64 @@
+package api
+
+import (
+	"context"
+	"log/slog"
+	"net/http"
+	"time"
+
+	"github.com/alexei/tinyforge/internal/metrics"
+)
+
+// livez always returns 200 if the process is up. Used by container
+// orchestrators / load balancers / Docker HEALTHCHECK as the "is the
+// binary alive" probe. Intentionally does NOT touch the DB or Docker —
+// a slow DB must not cause restart loops.
+func (s *Server) livez(w http.ResponseWriter, _ *http.Request) {
+	w.Header().Set("Content-Type", "text/plain; charset=utf-8")
+	// No explicit WriteHeader: the first Write sends the implicit 200.
+	// A write error means the client went away; there is nothing to do.
+	_, _ = w.Write([]byte("ok\n"))
+}
+
+// readyz returns 200 only when the process can actually serve traffic.
+// Two probes run, in order: SQLite must answer a ping within 2 s, and the
+// encryption key must be loaded (non-zero). Deployer draining is not
+// checked here. The response body is intentionally minimal — the
+// specific failing probe name is recorded in slog (operator-visible)
+// rather than returned to unauthenticated callers. This avoids handing
+// reconnaissance to an attacker who can hit /readyz during an outage
+// ("DB down" vs "encryption key missing" leaks operational state).
+func (s *Server) readyz(w http.ResponseWriter, r *http.Request) {
+	ctx, cancel := context.WithTimeout(r.Context(), 2*time.Second)
+	defer cancel()
+
+	// Every outcome is plain text, so the header is set once up front.
+	w.Header().Set("Content-Type", "text/plain; charset=utf-8")
+
+	// notReady sends the same opaque 503 whichever probe failed.
+	notReady := func() {
+		w.WriteHeader(http.StatusServiceUnavailable)
+		_, _ = w.Write([]byte("not ready\n"))
+	}
+
+	// DB ping: cheap and exact — exercises the connection pool, file
+	// lock, and busy-timeout. A failing ping means SQLite WAL is wedged
+	// or the data dir is gone.
+	if err := s.store.DB().PingContext(ctx); err != nil {
+		slog.Warn("readyz: db ping failed", "error", err)
+		notReady()
+		return
+	}
+
+	// Encryption key sanity: if it's zero we cannot decrypt any stored
+	// secret, so the deployer paths will all explode at first use.
+	if s.encKey == ([32]byte{}) {
+		slog.Warn("readyz: encryption key not loaded")
+		notReady()
+		return
+	}
+
+	_, _ = w.Write([]byte("ready\n"))
+}
+
+// metricsExport writes the process-wide metrics registry in Prometheus
+// text format. Admin-only by router placement; surface is intentionally
+// thin (no histograms / quantiles, only counters) to keep the binary
+// dependency-free.
+func (s *Server) metricsExport(w http.ResponseWriter, _ *http.Request) {
+	w.Header().Set("Content-Type", "text/plain; version=0.0.4; charset=utf-8")
+	// The write error is discarded on purpose via the blank identifier.
+	_ = metrics.DefaultRegistry.WritePrometheus(w)
+}
@@ -0,0 +1,154 @@
+package api
+
+import (
+	"encoding/json"
+	"errors"
+	"io"
+	"log/slog"
+	"net/http"
+	"time"
+
+	"github.com/go-chi/chi/v5"
+
+	"github.com/alexei/tinyforge/internal/workload/plugin"
+)
+
+// getHookKindSchema returns the sample config shape for one registered
+// plugin kind. Used by /apps/new and the edit form so the JSON editor's
+// initial body is derived from the plugin itself rather than hardcoded
+// in the frontend.
+//
+//	GET /api/hooks/kinds/{kind}/schema
+//
+// Responds 404 when plugin.SchemaSampleFor does not know the kind.
+func (s *Server) getHookKindSchema(w http.ResponseWriter, r *http.Request) {
+	kind := chi.URLParam(r, "kind")
+	sample, ok := plugin.SchemaSampleFor(kind)
+	if !ok {
+		respondNotFound(w, "plugin kind")
+		return
+	}
+	// The requested kind is echoed back alongside its sample.
+	respondJSON(w, http.StatusOK, map[string]any{
+		"kind":   kind,
+		"sample": sample,
+	})
+}
+
+// listHookKinds reports every registered Source and Trigger so operators
+// can verify the plugin registry is wired correctly without writing
+// a workload.
+//
+//	GET /api/hooks/kinds
+//
+// The request is not inspected; the body is built only from
+// plugin.SourceKinds and plugin.TriggerKinds.
+func (s *Server) listHookKinds(w http.ResponseWriter, r *http.Request) {
+	respondJSON(w, http.StatusOK, map[string]any{
+		"sources":  plugin.SourceKinds(),
+		"triggers": plugin.TriggerKinds(),
+	})
+}
+
+// dispatchGeneric accepts a pre-normalized InboundEvent payload and fans
+// it out across registered triggers. The body shape mirrors
+// plugin.InboundEvent — vendor-specific webhook parsing (Gitea / GitHub /
+// generic registry) stays in the legacy /api/webhook/* handlers until
+// Phase 5 of the refactor migrates them into trigger-specific ingress.
+//
+//	POST /api/hooks/generic
+//	{
+//	  "kind": "image-push",
+//	  "image": { "registry": "...", "repo": "owner/app", "tag": "v1" }
+//	}
+//
+// Workloads come from listPluginWorkloads, which only returns rows that
+// have both source_kind and trigger_kind set. When no row has opted in,
+// the response reports zero matches; this still exercises the registry
+// path so operators can curl it and confirm wiring.
+//
+// The body is capped at 1 MiB. A larger body is rejected with 413
+// instead of being truncated and parsed. Always answers 202 once the
+// event is accepted; per-workload outcomes are listed under "matches".
+func (s *Server) dispatchGeneric(w http.ResponseWriter, r *http.Request) {
+	// MaxBytesReader (unlike io.LimitReader) reports an error once the cap
+	// is exceeded instead of silently truncating the payload.
+	body, err := io.ReadAll(http.MaxBytesReader(w, r.Body, 1<<20))
+	if err != nil {
+		var tooBig *http.MaxBytesError
+		if errors.As(err, &tooBig) {
+			respondError(w, http.StatusRequestEntityTooLarge, "request body too large")
+			return
+		}
+		respondError(w, http.StatusBadRequest, "read body: "+err.Error())
+		return
+	}
+	var evt plugin.InboundEvent
+	if err := json.Unmarshal(body, &evt); err != nil {
+		respondError(w, http.StatusBadRequest, "invalid InboundEvent: "+err.Error())
+		return
+	}
+	if evt.Kind == "" {
+		respondError(w, http.StatusBadRequest, "kind is required")
+		return
+	}
+
+	ctx := r.Context()
+	triggers := plugin.AllTriggers()
+	workloads := listPluginWorkloads(s)
+	deps := s.deployer.PluginDeps()
+
+	type matchReport struct {
+		WorkloadID    string `json:"workload_id"`
+		TriggerKind   string `json:"trigger_kind"`
+		Reference     string `json:"reference"`
+		Dispatched    bool   `json:"dispatched"`
+		DispatchError string `json:"dispatch_error,omitempty"`
+	}
+	// Non-nil empty slice so the JSON field is [] rather than null.
+	matches := []matchReport{}
+
+	for _, wl := range workloads {
+		// Workloads whose trigger kind is not registered are skipped silently.
+		trig, ok := triggers[wl.TriggerKind]
+		if !ok {
+			continue
+		}
+		intent, err := trig.Match(ctx, deps, wl, evt)
+		if err != nil {
+			slog.Warn("hooks: trigger match error",
+				"trigger", wl.TriggerKind, "workload", wl.ID, "error", err)
+			continue
+		}
+		// A nil intent means the trigger did not match this event.
+		if intent == nil {
+			continue
+		}
+		if intent.TriggeredAt.IsZero() {
+			intent.TriggeredAt = time.Now().UTC()
+		}
+		report := matchReport{
+			WorkloadID:  wl.ID,
+			TriggerKind: wl.TriggerKind,
+			Reference:   intent.Reference,
+		}
+		if err := s.deployer.DispatchPlugin(ctx, wl, *intent); err != nil {
+			// Wrapped error can carry registry-auth bytes / compose stdout
+			// (i.e. user secrets baked into the YAML); keep it server-side
+			// only and return a generic flag to the client.
+			slog.Warn("hooks: dispatch failed",
+				"workload", wl.ID, "trigger", wl.TriggerKind, "error", err)
+			report.DispatchError = "dispatch failed; see server logs"
+		} else {
+			report.Dispatched = true
+		}
+		matches = append(matches, report)
+	}
+
+	respondJSON(w, http.StatusAccepted, map[string]any{
+		"event_kind":         evt.Kind,
+		"examined_triggers":  len(triggers),
+		"examined_workloads": len(workloads),
+		"matches":            matches,
+	})
+}
+
+// listPluginWorkloads returns every workload row whose source_kind and
+// trigger_kind are both set — these are the rows that opted into the new
+// plugin pipeline. Legacy rows (kind/ref_id pointing at project/stack/site
+// with empty source_kind) are skipped so the ingress only fires intents
+// for workloads that have a registered Source + Trigger to dispatch them.
+//
+// A store error is logged and nil is returned, so the caller cannot tell
+// a failed listing apart from an empty one.
+func listPluginWorkloads(s *Server) []plugin.Workload {
+	rows, err := s.store.ListWorkloads("")
+	if err != nil {
+		slog.Warn("hooks: list workloads failed", "error", err)
+		return nil
+	}
+	// Capacity is an upper bound; skipped legacy rows leave it partly unused.
+	out := make([]plugin.Workload, 0, len(rows))
+	for _, w := range rows {
+		if w.SourceKind == "" || w.TriggerKind == "" {
+			continue
+		}
+		out = append(out, toPluginWorkload(w))
+	}
+	return out
+}
@@ -1,194 +0,0 @@
-package api
-
-import (
-	"context"
-	"errors"
-	"fmt"
-	"log/slog"
-	"net/http"
-
-	"github.com/go-chi/chi/v5"
-
-	"github.com/alexei/docker-watcher/internal/store"
-)
-
-// listInstances handles GET /api/projects/{id}/stages/{stage}/instances.
-func (s *Server) listInstances(w http.ResponseWriter, r *http.Request) {
-	stageID := chi.URLParam(r, "stage")
-
-	// Verify stage exists.
-	if _, err := s.store.GetStageByID(stageID); err != nil {
-		if errors.Is(err, store.ErrNotFound) {
-			respondNotFound(w, "stage")
-			return
-		}
-		respondError(w, http.StatusInternalServerError, "failed to get stage: "+err.Error())
-		return
-	}
-
-	instances, err := s.store.GetInstancesByStageID(stageID)
-	if err != nil {
-		respondError(w, http.StatusInternalServerError, "failed to list instances: "+err.Error())
-		return
-	}
-	respondJSON(w, http.StatusOK, instances)
-}
-
-// deployRequest is the expected JSON body for triggering a deploy.
-type deployRequest struct {
-	ImageTag string `json:"image_tag"`
-}
-
-// deployInstance handles POST /api/projects/{id}/stages/{stage}/instances (trigger deploy).
-func (s *Server) deployInstance(w http.ResponseWriter, r *http.Request) {
-	projectID := chi.URLParam(r, "id")
-	stageID := chi.URLParam(r, "stage")
-
-	// Verify project exists.
-	if _, err := s.store.GetProjectByID(projectID); err != nil {
-		if errors.Is(err, store.ErrNotFound) {
-			respondNotFound(w, "project")
-			return
-		}
-		respondError(w, http.StatusInternalServerError, "failed to get project: "+err.Error())
-		return
-	}
-
-	// Verify stage exists.
-	if _, err := s.store.GetStageByID(stageID); err != nil {
-		if errors.Is(err, store.ErrNotFound) {
-			respondNotFound(w, "stage")
-			return
-		}
-		respondError(w, http.StatusInternalServerError, "failed to get stage: "+err.Error())
-		return
-	}
-
-	var req deployRequest
-	if !decodeJSON(w, r, &req) {
-		return
-	}
-
-	if req.ImageTag == "" {
-		respondError(w, http.StatusBadRequest, "image_tag is required")
-		return
-	}
-
-	deployID, err := s.deployer.AsyncTriggerDeploy(r.Context(), projectID, stageID, req.ImageTag)
-	if err != nil {
-		respondError(w, http.StatusInternalServerError, "failed to trigger deploy: "+err.Error())
-		return
-	}
-	respondJSON(w, http.StatusAccepted, map[string]string{
-		"status":     "deploying",
-		"deploy_id":  deployID,
-		"project_id": projectID,
-		"stage_id":   stageID,
-		"image_tag":  req.ImageTag,
-	})
-}
-
-// removeInstance handles DELETE /api/projects/{id}/stages/{stage}/instances/{iid}.
-func (s *Server) removeInstance(w http.ResponseWriter, r *http.Request) {
-	instanceID := chi.URLParam(r, "iid")
-
-	inst, err := s.store.GetInstanceByID(instanceID)
-	if err != nil {
-		if errors.Is(err, store.ErrNotFound) {
-			respondNotFound(w, "instance")
-			return
-		}
-		respondError(w, http.StatusInternalServerError, "failed to get instance: "+err.Error())
-		return
-	}
-
-	// Remove the Docker container if it has one.
-	if inst.ContainerID != "" {
-		if err := s.docker.RemoveContainer(r.Context(), inst.ContainerID, true); err != nil {
-			slog.Error("remove container", "container_id", inst.ContainerID, "error", err)
-		}
-	}
-
-	// Delete instance record.
-	if err := s.store.DeleteInstance(instanceID); err != nil {
-		respondError(w, http.StatusInternalServerError, "failed to delete instance: "+err.Error())
-		return
-	}
-	respondJSON(w, http.StatusOK, map[string]string{"deleted": instanceID})
-}
-
-// stopInstance handles POST /api/projects/{id}/stages/{stage}/instances/{iid}/stop.
-func (s *Server) stopInstance(w http.ResponseWriter, r *http.Request) {
-	s.controlInstance(w, r, "stop")
-}
-
-// startInstance handles POST /api/projects/{id}/stages/{stage}/instances/{iid}/start.
-func (s *Server) startInstance(w http.ResponseWriter, r *http.Request) {
-	s.controlInstance(w, r, "start")
-}
-
-// restartInstance handles POST /api/projects/{id}/stages/{stage}/instances/{iid}/restart.
-func (s *Server) restartInstance(w http.ResponseWriter, r *http.Request) {
-	s.controlInstance(w, r, "restart")
-}
-
-// controlInstance performs a stop/start/restart action on an instance's container.
-func (s *Server) controlInstance(w http.ResponseWriter, r *http.Request, action string) {
-	instanceID := chi.URLParam(r, "iid")
-
-	inst, err := s.store.GetInstanceByID(instanceID)
-	if err != nil {
-		if errors.Is(err, store.ErrNotFound) {
-			respondNotFound(w, "instance")
-			return
-		}
-		respondError(w, http.StatusInternalServerError, "failed to get instance: "+err.Error())
-		return
-	}
-
-	if inst.ContainerID == "" {
-		respondError(w, http.StatusBadRequest, "instance has no container")
-		return
-	}
-
-	ctx := r.Context()
-	var controlErr error
-	var newStatus string
-
-	switch action {
-	case "stop":
-		controlErr = s.docker.StopContainer(ctx, inst.ContainerID, 10)
-		newStatus = "stopped"
-	case "start":
-		controlErr = s.docker.StartContainer(ctx, inst.ContainerID)
-		newStatus = "running"
-	case "restart":
-		controlErr = s.docker.RestartContainer(ctx, inst.ContainerID, 10)
-		newStatus = "running"
-	default:
-		respondError(w, http.StatusBadRequest, fmt.Sprintf("unknown action: %s", action))
-		return
-	}
-
-	if controlErr != nil {
-		respondError(w, http.StatusInternalServerError, fmt.Sprintf("failed to %s instance: %v", action, controlErr))
-		return
-	}
-
-	// Update status in store.
-	if err := s.store.UpdateInstanceStatus(instanceID, newStatus); err != nil {
-		slog.Error("update instance status", "instance_id", instanceID, "status", newStatus, "error", err)
-	}
-
-	respondJSON(w, http.StatusOK, map[string]string{
-		"instance_id": instanceID,
-		"action":      action,
-		"status":      newStatus,
-	})
-}
-
-// DeployTriggerer is the interface for triggering deployments.
-type DeployTriggerer interface {
-	TriggerDeploy(ctx context.Context, projectID, stageID, imageTag string) error
-	AsyncTriggerDeploy(ctx context.Context, projectID, stageID, imageTag string) (string, error)
-}
@@ -0,0 +1,350 @@
+// Package api: log-scan rule HTTP handlers. The scanner manager
+// lives in internal/logscanner; this file is the REST surface that
+// lets operators create, edit, and test rules from the UI.
+package api
+
+import (
+	"errors"
+	"log/slog"
+	"net/http"
+	"regexp"
+	"strconv"
+	"strings"
+
+	"github.com/go-chi/chi/v5"
+
+	"github.com/alexei/tinyforge/internal/logscanner"
+	"github.com/alexei/tinyforge/internal/store"
+)
+
+// LogScanReloader is what the API calls after any rule CRUD so the
+// scanner manager swaps its snapshot, and what the /stats endpoint
+// queries for runtime counters. Implemented by *logscanner.Manager;
+// nil-tolerant on the API side so the routes still work in a
+// scanner-disabled deployment.
+type LogScanReloader interface {
+	// ReloadRules is invoked by reloadLogScan after a successful
+	// create, update, or delete. A returned error is logged at warn
+	// level and is not surfaced to the HTTP caller.
+	ReloadRules() error
+	// Stats supplies the payload served by getLogScanStats.
+	Stats() logscanner.Stats
+}
+
+// SetLogScanReloader wires the API → manager reload signal. Called
+// from main after both subsystems are constructed.
+//
+// It stores r on the server; reloadLogScan and getLogScanStats read
+// that field and both treat a nil value as "scanner not wired".
+func (s *Server) SetLogScanReloader(r LogScanReloader) {
+	s.logScanReloader = r
+}
+
+// ruleInput is the JSON shape accepted by POST + PATCH. Pointers
+// distinguish "absent" from explicit empty/zero. WorkloadID and
+// OverridesID are immutable on update (per store.UpdateLogScanRule)
+// so they only appear here for create.
+type ruleInput struct {
+	// Scope fields: copied into the new rule on create; the update
+	// handler never reads them.
+	WorkloadID  *string `json:"workload_id"`
+	OverridesID *int64  `json:"overrides_id"`
+	// Name and Pattern overwrite the stored value on update whenever
+	// they are present, even when empty.
+	Name    *string `json:"name"`
+	Pattern *string `json:"pattern"`
+	// Severity and Streams: on update an empty string leaves the
+	// stored value untouched.
+	Severity *string `json:"severity"`
+	Streams  *string `json:"streams"`
+	// CooldownSeconds defaults to 60 on create when absent.
+	CooldownSeconds *int `json:"cooldown_seconds"`
+	// Enabled is treated as true on create when absent.
+	Enabled *bool `json:"enabled"`
+}
+
+// listLogScanRules serves GET /api/log-scan-rules. Without a query
+// filter every rule is returned; with `workload_id=...` only the
+// rules scoped to that workload are returned (workload-only +
+// override rows, NOT globals).
+func (s *Server) listLogScanRules(w http.ResponseWriter, r *http.Request) {
+	workloadID := r.URL.Query().Get("workload_id")
+	if workloadID == "" {
+		all, err := s.store.ListLogScanRules()
+		if err != nil {
+			respondError(w, http.StatusInternalServerError, "list log scan rules")
+			return
+		}
+		respondJSON(w, http.StatusOK, all)
+		return
+	}
+
+	scoped, err := s.store.ListLogScanRulesByWorkload(workloadID)
+	if err != nil {
+		respondError(w, http.StatusInternalServerError, "list log scan rules")
+		return
+	}
+	respondJSON(w, http.StatusOK, scoped)
+}
+
+// getLogScanRule serves GET /api/log-scan-rules/{id}: it parses the
+// path id, loads the rule, and writes it back as JSON. Store errors
+// are translated by mapStoreError.
+func (s *Server) getLogScanRule(w http.ResponseWriter, r *http.Request) {
+	ruleID, valid := parseRuleID(w, r)
+	if !valid {
+		return
+	}
+	found, lookupErr := s.store.GetLogScanRule(ruleID)
+	if lookupErr == nil {
+		respondJSON(w, http.StatusOK, found)
+		return
+	}
+	mapStoreError(w, lookupErr, "log scan rule")
+}
+
+// createLogScanRule serves POST /api/log-scan-rules. Absent optional
+// fields get defaults (severity warn, all streams, 60s cooldown,
+// enabled), the pattern is validated up front, and the scanner is
+// asked to reload once the row is stored.
+func (s *Server) createLogScanRule(w http.ResponseWriter, r *http.Request) {
+	var payload ruleInput
+	if ok := decodeJSON(w, r, &payload); !ok {
+		return
+	}
+
+	// An omitted "enabled" means the rule starts active.
+	enabled := true
+	if payload.Enabled != nil {
+		enabled = *payload.Enabled
+	}
+	candidate := store.LogScanRule{
+		WorkloadID:      derefString(payload.WorkloadID),
+		OverridesID:     derefInt64(payload.OverridesID),
+		Name:            derefString(payload.Name),
+		Pattern:         derefString(payload.Pattern),
+		Severity:        firstNonEmpty(derefString(payload.Severity), store.LogScanSeverityWarn),
+		Streams:         firstNonEmpty(derefString(payload.Streams), store.LogScanStreamAll),
+		CooldownSeconds: derefIntDefault(payload.CooldownSeconds, 60),
+		Enabled:         enabled,
+	}
+	if problem := validateRulePattern(candidate.Pattern); problem != "" {
+		respondError(w, http.StatusBadRequest, problem)
+		return
+	}
+
+	created, err := s.store.CreateLogScanRule(candidate)
+	switch {
+	case err == nil:
+		s.reloadLogScan()
+		respondJSON(w, http.StatusCreated, created)
+	case isClientValidationErr(err):
+		// Store-side validation failures are the caller's fault.
+		respondError(w, http.StatusBadRequest, err.Error())
+	default:
+		// Anything else is reported generically so raw driver text
+		// never reaches the client.
+		respondError(w, http.StatusInternalServerError, "create log scan rule")
+	}
+}
+
+// updateLogScanRule serves PATCH /api/log-scan-rules/{id}. The stored
+// rule is loaded first and only the fields present in the body are
+// overlaid on it. Scope fields (workload_id, overrides_id) are never
+// touched; name, pattern, severity, streams, cooldown, and enabled
+// can each be changed independently.
+func (s *Server) updateLogScanRule(w http.ResponseWriter, r *http.Request) {
+	ruleID, valid := parseRuleID(w, r)
+	if !valid {
+		return
+	}
+	current, err := s.store.GetLogScanRule(ruleID)
+	if err != nil {
+		mapStoreError(w, err, "log scan rule")
+		return
+	}
+
+	var patch ruleInput
+	if !decodeJSON(w, r, &patch) {
+		return
+	}
+
+	// Free-form fields: presence alone is enough to overwrite.
+	if v := patch.Name; v != nil {
+		current.Name = *v
+	}
+	if v := patch.Pattern; v != nil {
+		current.Pattern = *v
+	}
+	// Enum-like fields: an empty string keeps the stored value.
+	if v := patch.Severity; v != nil && *v != "" {
+		current.Severity = *v
+	}
+	if v := patch.Streams; v != nil && *v != "" {
+		current.Streams = *v
+	}
+	if v := patch.CooldownSeconds; v != nil {
+		current.CooldownSeconds = *v
+	}
+	if v := patch.Enabled; v != nil {
+		current.Enabled = *v
+	}
+
+	if problem := validateRulePattern(current.Pattern); problem != "" {
+		respondError(w, http.StatusBadRequest, problem)
+		return
+	}
+
+	saved, err := s.store.UpdateLogScanRule(current)
+	switch {
+	case err == nil:
+		s.reloadLogScan()
+		respondJSON(w, http.StatusOK, saved)
+	case errors.Is(err, store.ErrNotFound):
+		respondNotFound(w, "log scan rule")
+	case isClientValidationErr(err):
+		respondError(w, http.StatusBadRequest, err.Error())
+	default:
+		respondError(w, http.StatusInternalServerError, "update log scan rule")
+	}
+}
+
+// deleteLogScanRule serves DELETE /api/log-scan-rules/{id}. Override
+// rows that reference this id are cascade-deleted by the store layer.
+// On success the scanner is asked to reload and the response is an
+// empty 204.
+func (s *Server) deleteLogScanRule(w http.ResponseWriter, r *http.Request) {
+	ruleID, valid := parseRuleID(w, r)
+	if !valid {
+		return
+	}
+	delErr := s.store.DeleteLogScanRule(ruleID)
+	if delErr != nil {
+		mapStoreError(w, delErr, "log scan rule")
+		return
+	}
+	s.reloadLogScan()
+	w.WriteHeader(http.StatusNoContent)
+}
+
+// testLogScanRule serves POST /api/log-scan-rules/{id}/test. Given a
+// body of `{"sample_line": "..."}` it reports whether the stored
+// rule's pattern matches that line, plus any captured subgroups, so
+// operators can iterate on regexes from the UI without generating
+// real container traffic.
+func (s *Server) testLogScanRule(w http.ResponseWriter, r *http.Request) {
+	ruleID, valid := parseRuleID(w, r)
+	if !valid {
+		return
+	}
+	stored, lookupErr := s.store.GetLogScanRule(ruleID)
+	if lookupErr != nil {
+		mapStoreError(w, lookupErr, "log scan rule")
+		return
+	}
+
+	var req struct {
+		SampleLine string `json:"sample_line"`
+	}
+	if ok := decodeJSON(w, r, &req); !ok {
+		return
+	}
+	verdict := testRuleAgainstLine(stored, req.SampleLine)
+	respondJSON(w, http.StatusOK, verdict)
+}
+
+// getEffectiveLogScanRules serves GET /api/workloads/{id}/effective-rules.
+// It returns the resolved rule set for the workload (globals minus
+// overrides + workload-only + override-substitutes), i.e. what the
+// scanner applies to this workload's containers.
+func (s *Server) getEffectiveLogScanRules(w http.ResponseWriter, r *http.Request) {
+	id := chi.URLParam(r, "id")
+	if len(id) == 0 {
+		respondError(w, http.StatusBadRequest, "workload id required")
+		return
+	}
+	effective, err := s.store.EffectiveLogScanRules(id)
+	if err == nil {
+		respondJSON(w, http.StatusOK, effective)
+		return
+	}
+	respondError(w, http.StatusInternalServerError, "compute effective rules")
+}
+
+// ruleTestResult is the shape returned by /test. Keeping it focused —
+// caller wants a yes/no + captures so they can iterate, nothing more.
+type ruleTestResult struct {
+	// Matched is true when the rule pattern matched the sample line.
+	Matched bool `json:"matched"`
+	// Captures maps each subgroup to its text, keyed by the group's
+	// name or by "$N" (1-based) for unnamed groups.
+	Captures map[string]string `json:"captures,omitempty"`
+	// Error is set when the stored pattern fails to compile.
+	Error string `json:"error,omitempty"`
+}
+
+// testRuleAgainstLine compiles the rule's pattern and runs it against
+// one sample line. A compile failure is reported through the Error
+// field; a non-match yields the zero result. On a match every
+// subgroup is returned, keyed by its name or by "$N" when unnamed.
+func testRuleAgainstLine(rule store.LogScanRule, line string) ruleTestResult {
+	compiled, compileErr := regexp.Compile(rule.Pattern)
+	if compileErr != nil {
+		return ruleTestResult{Error: "rule pattern is invalid: " + compileErr.Error()}
+	}
+
+	match := compiled.FindStringSubmatch(line)
+	if match == nil {
+		return ruleTestResult{}
+	}
+
+	// Index 0 is the whole match; subgroups start at 1.
+	groupNames := compiled.SubexpNames()
+	captured := make(map[string]string, len(match)-1)
+	for idx := 1; idx < len(match); idx++ {
+		label := groupNames[idx]
+		if label == "" {
+			label = "$" + strconv.Itoa(idx)
+		}
+		captured[label] = match[idx]
+	}
+	return ruleTestResult{Matched: true, Captures: captured}
+}
+
+// validateRulePattern returns a human-readable problem description
+// when pattern is blank or is not a valid regular expression, and ""
+// when the pattern is usable.
+func validateRulePattern(pattern string) string {
+	if len(strings.TrimSpace(pattern)) == 0 {
+		return "pattern is required"
+	}
+	_, compileErr := regexp.Compile(pattern)
+	if compileErr == nil {
+		return ""
+	}
+	return "pattern invalid: " + compileErr.Error()
+}
+
+// isClientValidationErr reports whether err carries one of the
+// validation messages raised by CreateLogScanRule / UpdateLogScanRule
+// (name/pattern required, invalid enum, negative cooldown, override
+// without workload). Callers use it to answer 400 instead of 500
+// without exposing driver text. Matching is by substring on the
+// error message.
+func isClientValidationErr(err error) bool {
+	if err == nil {
+		return false
+	}
+	text := err.Error()
+	switch {
+	case strings.Contains(text, "name is required"),
+		strings.Contains(text, "pattern is required"),
+		strings.Contains(text, "invalid severity"),
+		strings.Contains(text, "invalid streams"),
+		strings.Contains(text, "cooldown_seconds must be"),
+		strings.Contains(text, "override row requires workload_id"):
+		return true
+	}
+	return false
+}
+
+// parseRuleID reads the {id} path parameter as a positive base-10
+// int64. On failure it writes a 400 response itself and returns
+// (0, false), so callers only need to bail out.
+func parseRuleID(w http.ResponseWriter, r *http.Request) (int64, bool) {
+	parsed, err := strconv.ParseInt(chi.URLParam(r, "id"), 10, 64)
+	if err == nil && parsed > 0 {
+		return parsed, true
+	}
+	respondError(w, http.StatusBadRequest, "invalid rule id")
+	return 0, false
+}
+
+// derefInt64 returns the value p points at, or 0 when p is nil.
+func derefInt64(p *int64) int64 {
+	var value int64
+	if p != nil {
+		value = *p
+	}
+	return value
+}
+
+// derefIntDefault returns the value p points at, or def when p is nil.
+// An explicit zero is returned as zero, not replaced by def.
+func derefIntDefault(p *int, def int) int {
+	result := def
+	if p != nil {
+		result = *p
+	}
+	return result
+}
+
+// getLogScanStats serves GET /api/log-scan-rules/stats with the
+// scanner's runtime counters (engine drops, last-snapshot compile
+// errors, active tails) so operators can spot greedy or broken
+// patterns. When no scanner manager is wired the response is a
+// zero-valued Stats with status 200 rather than a 404, letting the
+// frontend render the panel uniformly.
+func (s *Server) getLogScanStats(w http.ResponseWriter, r *http.Request) {
+	snapshot := logscanner.Stats{}
+	if s.logScanReloader != nil {
+		snapshot = s.logScanReloader.Stats()
+	}
+	respondJSON(w, http.StatusOK, snapshot)
+}
+
+// reloadLogScan asks the scanner manager to rebuild its rule
+// snapshot. It is a no-op while no manager is wired (early startup or
+// a scanner-disabled deployment). A reload failure is only logged at
+// warn: the CRUD request that triggered it has already succeeded, but
+// operators still need a hint when a rule unexpectedly does not fire.
+func (s *Server) reloadLogScan() {
+	reloader := s.logScanReloader
+	if reloader == nil {
+		return
+	}
+	err := reloader.ReloadRules()
+	if err == nil {
+		return
+	}
+	slog.Warn("log-scan reload failed; manager snapshot may be stale",
+		"error", err)
+}
@@ -0,0 +1,235 @@
+// Package api: metric-alert rule HTTP handlers. The evaluator lives in
+// internal/metricalert; this file is the REST surface that lets
+// operators create, edit, and delete threshold rules. Mirrors the
+// log-scan rule handlers.
+package api
+
+import (
+	"errors"
+	"net/http"
+	"strconv"
+	"strings"
+
+	"github.com/go-chi/chi/v5"
+
+	"github.com/alexei/tinyforge/internal/store"
+)
+
+// metricAlertRuleInput is the JSON shape accepted by POST + PATCH.
+// Pointers distinguish "absent" from explicit empty/zero. WorkloadID is
+// immutable on update (per store.UpdateMetricAlertRule) so it only takes
+// effect on create.
+type metricAlertRuleInput struct {
+	// WorkloadID is copied into the new rule on create; the update
+	// handler never reads it.
+	WorkloadID *string `json:"workload_id"`
+	// Name overwrites the stored value on update whenever present.
+	Name *string `json:"name"`
+	// Metric, Comparator, Severity: on update an empty string leaves
+	// the stored value untouched.
+	Metric     *string `json:"metric"`
+	Comparator *string `json:"comparator"`
+	// Threshold becomes 0 on create when absent.
+	Threshold *float64 `json:"threshold"`
+	Severity  *string  `json:"severity"`
+	// CooldownSeconds defaults to 300 on create when absent.
+	CooldownSeconds *int `json:"cooldown_seconds"`
+	// Enabled is treated as true on create when absent.
+	Enabled *bool `json:"enabled"`
+}
+
+// listMetricAlertRules serves GET /api/metric-alert-rules. Without a
+// query filter every rule is returned; with `workload_id=...` only
+// the rules applying to that workload are returned (its own rows plus
+// globals).
+func (s *Server) listMetricAlertRules(w http.ResponseWriter, r *http.Request) {
+	workloadID := r.URL.Query().Get("workload_id")
+	if workloadID == "" {
+		all, err := s.store.ListMetricAlertRules()
+		if err != nil {
+			respondError(w, http.StatusInternalServerError, "list metric alert rules")
+			return
+		}
+		respondJSON(w, http.StatusOK, all)
+		return
+	}
+
+	scoped, err := s.store.ListMetricAlertRulesByWorkload(workloadID)
+	if err != nil {
+		respondError(w, http.StatusInternalServerError, "list metric alert rules")
+		return
+	}
+	respondJSON(w, http.StatusOK, scoped)
+}
+
+// getMetricAlertRule serves GET /api/metric-alert-rules/{id}: it
+// parses the path id, loads the rule, and writes it back as JSON.
+// Store errors are translated by mapStoreError.
+func (s *Server) getMetricAlertRule(w http.ResponseWriter, r *http.Request) {
+	ruleID, valid := parseMetricAlertRuleID(w, r)
+	if !valid {
+		return
+	}
+	found, lookupErr := s.store.GetMetricAlertRule(ruleID)
+	if lookupErr == nil {
+		respondJSON(w, http.StatusOK, found)
+		return
+	}
+	mapStoreError(w, lookupErr, "metric alert rule")
+}
+
+// createMetricAlertRule serves POST /api/metric-alert-rules. Absent
+// optional fields get defaults (severity warn, 300s cooldown,
+// enabled, threshold 0) and the assembled rule is validated before it
+// reaches the store.
+func (s *Server) createMetricAlertRule(w http.ResponseWriter, r *http.Request) {
+	var payload metricAlertRuleInput
+	if ok := decodeJSON(w, r, &payload); !ok {
+		return
+	}
+
+	// An omitted "enabled" means the rule starts active.
+	enabled := true
+	if payload.Enabled != nil {
+		enabled = *payload.Enabled
+	}
+	candidate := store.MetricAlertRule{
+		WorkloadID:      derefString(payload.WorkloadID),
+		Name:            derefString(payload.Name),
+		Metric:          derefString(payload.Metric),
+		Comparator:      derefString(payload.Comparator),
+		Threshold:       derefFloat64(payload.Threshold),
+		Severity:        firstNonEmpty(derefString(payload.Severity), store.LogScanSeverityWarn),
+		CooldownSeconds: derefIntDefault(payload.CooldownSeconds, 300),
+		Enabled:         enabled,
+	}
+	if problem := validateMetricAlertInput(candidate); problem != "" {
+		respondError(w, http.StatusBadRequest, problem)
+		return
+	}
+
+	created, err := s.store.CreateMetricAlertRule(candidate)
+	switch {
+	case err == nil:
+		respondJSON(w, http.StatusCreated, created)
+	case isMetricAlertValidationErr(err):
+		respondError(w, http.StatusBadRequest, err.Error())
+	default:
+		respondError(w, http.StatusInternalServerError, "create metric alert rule")
+	}
+}
+
+// updateMetricAlertRule serves PATCH /api/metric-alert-rules/{id}.
+// The stored rule is loaded first and only the fields present in the
+// body are overlaid on it. workload_id is never touched; name,
+// metric, comparator, threshold, severity, cooldown, and enabled can
+// each be changed independently.
+func (s *Server) updateMetricAlertRule(w http.ResponseWriter, r *http.Request) {
+	ruleID, valid := parseMetricAlertRuleID(w, r)
+	if !valid {
+		return
+	}
+	current, err := s.store.GetMetricAlertRule(ruleID)
+	if err != nil {
+		mapStoreError(w, err, "metric alert rule")
+		return
+	}
+
+	var patch metricAlertRuleInput
+	if !decodeJSON(w, r, &patch) {
+		return
+	}
+
+	if v := patch.Name; v != nil {
+		current.Name = *v
+	}
+	// Enum-like fields: an empty string keeps the stored value.
+	if v := patch.Metric; v != nil && *v != "" {
+		current.Metric = *v
+	}
+	if v := patch.Comparator; v != nil && *v != "" {
+		current.Comparator = *v
+	}
+	if v := patch.Threshold; v != nil {
+		current.Threshold = *v
+	}
+	if v := patch.Severity; v != nil && *v != "" {
+		current.Severity = *v
+	}
+	if v := patch.CooldownSeconds; v != nil {
+		current.CooldownSeconds = *v
+	}
+	if v := patch.Enabled; v != nil {
+		current.Enabled = *v
+	}
+
+	if problem := validateMetricAlertInput(current); problem != "" {
+		respondError(w, http.StatusBadRequest, problem)
+		return
+	}
+
+	saved, err := s.store.UpdateMetricAlertRule(current)
+	switch {
+	case err == nil:
+		respondJSON(w, http.StatusOK, saved)
+	case errors.Is(err, store.ErrNotFound):
+		respondNotFound(w, "metric alert rule")
+	case isMetricAlertValidationErr(err):
+		respondError(w, http.StatusBadRequest, err.Error())
+	default:
+		respondError(w, http.StatusInternalServerError, "update metric alert rule")
+	}
+}
+
+// deleteMetricAlertRule serves DELETE /api/metric-alert-rules/{id}
+// and answers with an empty 204 on success.
+func (s *Server) deleteMetricAlertRule(w http.ResponseWriter, r *http.Request) {
+	ruleID, valid := parseMetricAlertRuleID(w, r)
+	if !valid {
+		return
+	}
+	delErr := s.store.DeleteMetricAlertRule(ruleID)
+	if delErr != nil {
+		mapStoreError(w, delErr, "metric alert rule")
+		return
+	}
+	w.WriteHeader(http.StatusNoContent)
+}
+
+// validateMetricAlertInput checks a rule at the API boundary so the
+// caller gets a clear 400 before the store is involved; the store
+// re-validates the same invariants as a backstop. It returns the
+// first problem found, or "" when the rule is acceptable. An empty
+// severity is accepted.
+func validateMetricAlertInput(rule store.MetricAlertRule) string {
+	if strings.TrimSpace(rule.Name) == "" {
+		return "name is required"
+	}
+
+	metricOK := rule.Metric == store.MetricCPUPercent ||
+		rule.Metric == store.MetricMemoryPercent ||
+		rule.Metric == store.MetricMemoryBytes
+	if !metricOK {
+		return "invalid metric: must be cpu_percent, memory_percent, or memory_bytes"
+	}
+
+	comparatorOK := rule.Comparator == store.MetricComparatorGT ||
+		rule.Comparator == store.MetricComparatorLT
+	if !comparatorOK {
+		return "invalid comparator: must be gt or lt"
+	}
+
+	severityOK := rule.Severity == "" ||
+		rule.Severity == store.LogScanSeverityInfo ||
+		rule.Severity == store.LogScanSeverityWarn ||
+		rule.Severity == store.LogScanSeverityError
+	if !severityOK {
+		return "invalid severity: must be info, warn, or error"
+	}
+
+	if rule.CooldownSeconds < 0 {
+		return "cooldown_seconds must be >= 0"
+	}
+	return ""
+}
+
+// isMetricAlertValidationErr reports whether err carries one of the
+// store's validation messages, so handlers can answer 400 instead of
+// 500 without leaking driver text. Matching is by substring on the
+// error message.
+func isMetricAlertValidationErr(err error) bool {
+	if err == nil {
+		return false
+	}
+	text := err.Error()
+	switch {
+	case strings.Contains(text, "name is required"),
+		strings.Contains(text, "invalid metric"),
+		strings.Contains(text, "invalid comparator"),
+		strings.Contains(text, "invalid severity"),
+		strings.Contains(text, "cooldown_seconds must be"):
+		return true
+	}
+	return false
+}
+
+// parseMetricAlertRuleID reads the {id} path parameter as a positive
+// base-10 int64. On failure it writes a 400 response itself and
+// returns (0, false), so callers only need to bail out.
+func parseMetricAlertRuleID(w http.ResponseWriter, r *http.Request) (int64, bool) {
+	parsed, err := strconv.ParseInt(chi.URLParam(r, "id"), 10, 64)
+	if err == nil && parsed > 0 {
+		return parsed, true
+	}
+	respondError(w, http.StatusBadRequest, "invalid rule id")
+	return 0, false
+}
+
+// derefFloat64 returns the value p points at, or 0 when p is nil.
+func derefFloat64(p *float64) float64 {
+	var value float64
+	if p != nil {
+		value = *p
+	}
+	return value
+}
@@ -1,14 +1,123 @@
 package api

 import (
+	"context"
+	"crypto/rand"
+	"encoding/hex"
 	"log/slog"
+	"net"
 	"net/http"
+	"os"
 	"runtime/debug"
+	"strings"
+	"sync"
 	"time"
+
+	"github.com/alexei/tinyforge/internal/metrics"
 )

+// requestIDKey is the context key under which the generated/forwarded
+// X-Request-ID is stored. Exported indirectly via RequestIDFromContext
+// so handlers and services downstream of the API layer can thread it
+// into their own slog calls without re-extracting from headers.
+//
+// The key has its own unexported struct type so that no other package
+// can construct an equal context key.
+type requestIDKeyType struct{}
+
+// requestIDKey is the single value of requestIDKeyType; the requestID
+// middleware writes under it and RequestIDFromContext reads from it.
+var requestIDKey = requestIDKeyType{}
+
+// RequestIDFromContext returns the correlation ID attached to ctx by
+// the requestID middleware. It returns "" when ctx carries no ID,
+// i.e. when called outside the API request path.
+func RequestIDFromContext(ctx context.Context) string {
+	// A failed assertion leaves rid at its zero value, the empty string.
+	rid, _ := ctx.Value(requestIDKey).(string)
+	return rid
+}
+
+// requestID is middleware that guarantees every request carries a
+// stable correlation ID. A caller-supplied X-Request-ID is kept only
+// when the direct peer is a trusted proxy AND the value passes
+// isValidRequestID; in every other case a fresh ID is generated. The
+// ID is echoed in the X-Request-ID response header and stored in the
+// request context, where the `logging` middleware and downstream
+// code pick it up.
+//
+// Why the format clamp: a compromised or mis-parsing reverse proxy
+// could forward an ID containing newlines, semicolons, or other
+// separators, which would corrupt structured-log parsers that assume
+// one record per line / key-value. `[A-Za-z0-9._-]{1,64}` still
+// covers UUIDs, hex IDs, and trace-context IDs.
+func requestID(next http.Handler) http.Handler {
+	handle := func(w http.ResponseWriter, r *http.Request) {
+		rid := r.Header.Get("X-Request-ID")
+		usable := rid != "" && isTrustedPeer(r) && isValidRequestID(rid)
+		if !usable {
+			rid = newRequestID()
+		}
+		w.Header().Set("X-Request-ID", rid)
+		tagged := r.WithContext(context.WithValue(r.Context(), requestIDKey, rid))
+		next.ServeHTTP(w, tagged)
+	}
+	return http.HandlerFunc(handle)
+}
+
+// isValidRequestID reports whether s matches `[A-Za-z0-9._-]{1,64}`.
+// It is a single byte-wise scan rather than a regex, so nothing is
+// compiled on the request path.
+func isValidRequestID(s string) bool {
+	if n := len(s); n < 1 || n > 64 {
+		return false
+	}
+	for _, ch := range []byte(s) {
+		letter := ('A' <= ch && ch <= 'Z') || ('a' <= ch && ch <= 'z')
+		digit := '0' <= ch && ch <= '9'
+		punct := ch == '.' || ch == '_' || ch == '-'
+		if !(letter || digit || punct) {
+			return false
+		}
+	}
+	return true
+}
+
+// isTrustedPeer reports whether the direct peer (r.RemoteAddr) falls
+// inside one of the TRUSTED_PROXY_CIDRS networks. A forwarded
+// request ID is honored only from upstreams already trusted for
+// X-Forwarded-For; otherwise any internet client could fill the logs
+// with IDs of its choosing. With an empty allow-list nobody is
+// trusted.
+func isTrustedPeer(r *http.Request) bool {
+	if len(trustedProxyCIDRs) == 0 {
+		return false
+	}
+	// RemoteAddr is normally "host:port"; fall back to the raw value
+	// when it does not split.
+	host := r.RemoteAddr
+	if h, _, err := net.SplitHostPort(host); err == nil {
+		host = h
+	}
+	addr := net.ParseIP(host)
+	if addr == nil {
+		return false
+	}
+	for i := range trustedProxyCIDRs {
+		if trustedProxyCIDRs[i].Contains(addr) {
+			return true
+		}
+	}
+	return false
+}
+
+// newRequestID returns a fresh correlation ID: 16 random bytes
+// rendered as 32 lowercase hex characters. If crypto/rand fails it
+// falls back to a "ts-" prefixed UTC timestamp. The ID exists for
+// tracing, not security, so a predictable fallback is preferable to
+// a panic.
+func newRequestID() string {
+	raw := make([]byte, 16)
+	_, err := rand.Read(raw)
+	if err == nil {
+		return hex.EncodeToString(raw)
+	}
+	return "ts-" + time.Now().UTC().Format("20060102T150405.000000000")
+}
+
 // logging is an HTTP middleware that logs every request with method, path,
-// status code, and duration.
+// status code, and duration. Webhook URLs are redacted before being logged
+// because the secret is the only authenticator — leaking it to log
+// aggregators is equivalent to leaking the credential.
 func logging(next http.Handler) http.Handler {
 	return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
 		start := time.Now()
@@ -16,15 +125,108 @@ func logging(next http.Handler) http.Handler {

 		next.ServeHTTP(wrapped, r)

-		slog.Info("http request",
+		fields := []any{
 			"method", r.Method,
-			"path", r.URL.Path,
+			"path", redactPath(r.URL.Path),
 			"status", wrapped.status,
 			"duration", time.Since(start).String(),
-		)
+		}
+		if rq := redactQuery(r.URL.RawQuery); rq != "" {
+			fields = append(fields, "query", rq)
+		}
+		if rid := RequestIDFromContext(r.Context()); rid != "" {
+			fields = append(fields, "request_id", rid)
+		}
+		slog.Info("http request", fields...)
+
+		// Lightweight per-request counter. Bucket by status class so
+		// the cardinality stays at 5 × #methods regardless of how many
+		// distinct response codes we emit.
+		metrics.HTTPRequestsTotal.Inc(bucketMethod(r.Method), statusClass(wrapped.status))
 	})
 }

+// bucketMethod maps an HTTP method onto a fixed label set for
+// metrics. RFC 7230 allows any token as a method, so without this a
+// malicious client could invent methods and inflate the metrics map;
+// anything outside the standard set collapses to "other".
+func bucketMethod(m string) string {
+	switch m {
+	case http.MethodGet,
+		http.MethodPost,
+		http.MethodPut,
+		http.MethodPatch,
+		http.MethodDelete,
+		http.MethodHead,
+		http.MethodOptions,
+		http.MethodConnect,
+		http.MethodTrace:
+		return m
+	default:
+		return "other"
+	}
+}
+
+// statusClass reduces a status code to its class label, "1xx"
+// through "5xx", or "other" for anything outside 100-599. This keeps
+// metrics cardinality bounded: a chatty endpoint cannot create one
+// series per distinct response code.
+func statusClass(code int) string {
+	// Integer division truncates toward zero, so codes below 100
+	// (including negatives) and codes of 600 or more never hit a case.
+	switch code / 100 {
+	case 1:
+		return "1xx"
+	case 2:
+		return "2xx"
+	case 3:
+		return "3xx"
+	case 4:
+		return "4xx"
+	case 5:
+		return "5xx"
+	}
+	return "other"
+}
+
+// redactPath masks secrets that travel as URL path segments. The
+// only such surface is /api/webhook/triggers/{secret}: everything
+// after that prefix is replaced with "***". All other paths are
+// returned untouched.
+func redactPath(path string) string {
+	const triggerPrefix = "/api/webhook/triggers/"
+	if !strings.HasPrefix(path, triggerPrefix) {
+		return path
+	}
+	return triggerPrefix + "***"
+}
+
+// redactQueryKeys is the case-insensitive set of query-parameter names whose
+// values are masked before a URL lands in the request log. `token` is used by
+// SSE/EventSource when a custom header can't be set; the rest are
+// defence-in-depth against sensitive values ever appearing in a query string.
+//
+// Entries must be lower-case: redactQuery lower-cases the incoming
+// parameter name before looking it up here.
+var redactQueryKeys = map[string]struct{}{
+	"token":         {},
+	"secret":        {},
+	"password":      {},
+	"passwd":        {},
+	"api_key":       {},
+	"apikey":        {},
+	"access_token":  {},
+	"client_secret": {},
+	"sig":           {},
+	"signature":     {},
+}
+
+// redactQuery masks the value of every sensitive query parameter
+// (see redactQueryKeys) in a raw query string before it is logged.
+// Parameter names are compared case-insensitively and exactly as
+// written, without percent-decoding. Pairs without "=" and
+// non-sensitive pairs pass through verbatim, so a query with nothing
+// to redact comes back unchanged and a malformed URL surfaces
+// naturally.
+func redactQuery(rawQuery string) string {
+	if rawQuery == "" {
+		return ""
+	}
+	segments := strings.Split(rawQuery, "&")
+	for idx := range segments {
+		name, _, hasValue := strings.Cut(segments[idx], "=")
+		if !hasValue {
+			continue
+		}
+		if _, sensitive := redactQueryKeys[strings.ToLower(name)]; sensitive {
+			segments[idx] = name + "=" + "***"
+		}
+	}
+	return strings.Join(segments, "&")
+}
+
 // recovery is an HTTP middleware that catches panics and returns a 500 response.
 func recovery(next http.Handler) http.Handler {
 	return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
@@ -38,12 +240,59 @@ func recovery(next http.Handler) http.Handler {
 	})
 }

-// cors is an HTTP middleware that sets permissive CORS headers for development.
+// securityHeaders is middleware that stamps a fixed set of security
+// headers on every response before handing off to next.
+//
+// Strict-Transport-Security is added only when the request arrived
+// over HTTPS (direct TLS or forwarded). Sending HSTS over plain HTTP
+// is harmless to compliant browsers, but scanners flag it and some
+// reverse proxies get confused by it.
+//
+// The CSP still allows `'unsafe-inline'` because SvelteKit injects
+// inline boot scripts and styles; dropping it needs a nonce-based
+// strategy threaded through the SvelteKit handle hook. Tracked as a
+// follow-up; documented in the security report.
+func securityHeaders(next http.Handler) http.Handler {
+	const contentSecurityPolicy = "default-src 'self'; " +
+		"script-src 'self' 'unsafe-inline'; " +
+		"style-src 'self' 'unsafe-inline'; " +
+		"img-src 'self' data:; " +
+		"connect-src 'self'; " +
+		"font-src 'self'; " +
+		"frame-ancestors 'none'; " +
+		"base-uri 'self'; " +
+		"form-action 'self'"
+
+	handle := func(w http.ResponseWriter, r *http.Request) {
+		hdr := w.Header()
+		hdr.Set("X-Content-Type-Options", "nosniff")
+		hdr.Set("X-Frame-Options", "DENY")
+		hdr.Set("Referrer-Policy", "strict-origin-when-cross-origin")
+		hdr.Set("Permissions-Policy", "camera=(), microphone=(), geolocation=(), payment=()")
+		hdr.Set("Content-Security-Policy", contentSecurityPolicy)
+		if isHTTPS(r) {
+			hdr.Set("Strict-Transport-Security", "max-age=31536000; includeSubDomains")
+		}
+		next.ServeHTTP(w, r)
+	}
+	return http.HandlerFunc(handle)
+}
+
+// isHTTPS reports whether the request reached the client-facing edge
+// over TLS: either this server terminated TLS itself (r.TLS is set)
+// or an upstream proxy announced it via X-Forwarded-Proto.
+//
+// The header is compared case-insensitively with surrounding
+// whitespace ignored. When several values have been appended
+// ("https, http") only the first entry is consulted — presumably the
+// hop closest to the client, mirroring X-Forwarded-For ordering.
+//
+// NOTE(review): the header is honored from any peer, not only the
+// TRUSTED_PROXY_CIDRS allow-list — confirm that is acceptable for
+// every caller of this helper.
+func isHTTPS(r *http.Request) bool {
+	if r.TLS != nil {
+		return true
+	}
+	proto, _, _ := strings.Cut(r.Header.Get("X-Forwarded-Proto"), ",")
+	return strings.EqualFold(strings.TrimSpace(proto), "https")
+}
+
+// cors is an HTTP middleware that handles CORS for same-origin requests.
+// The frontend is served from the same origin, so cross-origin requests are not expected.
 func cors(next http.Handler) http.Handler {
 	return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
-		w.Header().Set("Access-Control-Allow-Origin", "*")
-		w.Header().Set("Access-Control-Allow-Methods", "GET, POST, PUT, DELETE, OPTIONS")
-		w.Header().Set("Access-Control-Allow-Headers", "Content-Type, Authorization")
+		// The frontend is served from the same origin, so cross-origin
+		// requests are not expected. We do NOT reflect the Origin header
+		// back, as that would allow any website to make credentialed requests.
+		// If cross-origin support is needed in the future, maintain an
+		// explicit allowlist of trusted origins here.

 		if r.Method == http.MethodOptions {
 			w.WriteHeader(http.StatusNoContent)
@@ -54,6 +303,74 @@ func cors(next http.Handler) http.Handler {
 	})
 }

+// maxBodySize is the request-body ceiling enforced by limitBody, to
+// prevent memory exhaustion.
+const maxBodySize = 1 << 20 // 1 MB
+
+// limitBody is middleware that wraps every request body in
+// http.MaxBytesReader, so reads beyond maxBodySize fail instead of
+// consuming unbounded memory.
+func limitBody(next http.Handler) http.Handler {
+	capped := func(w http.ResponseWriter, r *http.Request) {
+		r.Body = http.MaxBytesReader(w, r.Body, maxBodySize)
+		next.ServeHTTP(w, r)
+	}
+	return http.HandlerFunc(capped)
+}
+
+// rateLimiter provides per-IP rate limiting for login endpoints. It
+// keeps, per key, the timestamps of recently admitted requests; all
+// fields are guarded by mu.
+type rateLimiter struct {
+	mu          sync.Mutex
+	attempts    map[string][]time.Time // admitted request times per IP
+	lastCleanup time.Time              // last full sweep of attempts
+}
+
+// newRateLimiter returns an empty limiter whose cleanup clock starts
+// now.
+func newRateLimiter() *rateLimiter {
+	rl := new(rateLimiter)
+	rl.attempts = make(map[string][]time.Time)
+	rl.lastCleanup = time.Now()
+	return rl
+}
+
+// allow reports whether ip may make another request and, if so,
+// records it. At most 10 requests are admitted per IP within any
+// trailing one-minute window; rejected requests are not recorded.
+func (rl *rateLimiter) allow(ip string) bool {
+	rl.mu.Lock()
+	defer rl.mu.Unlock()
+
+	now := time.Now()
+	cutoff := now.Add(-time.Minute)
+
+	// prune keeps only the timestamps newer than cutoff, reusing the
+	// slice's backing array.
+	prune := func(stamps []time.Time) []time.Time {
+		kept := stamps[:0]
+		for _, ts := range stamps {
+			if ts.After(cutoff) {
+				kept = append(kept, ts)
+			}
+		}
+		return kept
+	}
+
+	// Every five minutes sweep the whole map so IPs that stopped
+	// sending requests do not accumulate forever.
+	if now.Sub(rl.lastCleanup) > 5*time.Minute {
+		for key, stamps := range rl.attempts {
+			if live := prune(stamps); len(live) > 0 {
+				rl.attempts[key] = live
+			} else {
+				delete(rl.attempts, key)
+			}
+		}
+		rl.lastCleanup = now
+	}
+
+	recent := prune(rl.attempts[ip])
+	if len(recent) >= 10 {
+		rl.attempts[ip] = recent
+		return false
+	}
+	rl.attempts[ip] = append(recent, now)
+	return true
+}
+
 // jsonContentType is an HTTP middleware that sets the default Content-Type to JSON.
 func jsonContentType(next http.Handler) http.Handler {
 	return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
@@ -62,6 +379,115 @@ func jsonContentType(next http.Handler) http.Handler {
 	})
 }

+// rateLimitMiddleware builds a middleware that charges every request
+// against rl, keyed by clientIP(r). A request that rl.allow rejects is
+// answered with 429 Too Many Requests and never reaches the wrapped handler.
+func rateLimitMiddleware(rl *rateLimiter) func(http.Handler) http.Handler {
+	return func(next http.Handler) http.Handler {
+		guarded := func(w http.ResponseWriter, r *http.Request) {
+			if allowed := rl.allow(clientIP(r)); allowed {
+				next.ServeHTTP(w, r)
+				return
+			}
+			respondError(w, http.StatusTooManyRequests, "rate limit exceeded")
+		}
+		return http.HandlerFunc(guarded)
+	}
+}
+
+// trustedProxyCIDRs is the parsed allow-list of upstream proxy networks
+// whose X-Forwarded-For header we honor. Set TRUSTED_PROXY_CIDRS to a
+// comma-separated list of CIDRs (e.g. "127.0.0.1/32,10.0.0.0/8") to
+// enable. When unset (the default) X-Forwarded-For is ignored entirely
+// and rate limiting + audit logging use r.RemoteAddr — preventing a
+// remote attacker from spoofing the header to bypass per-IP limiters.
+// The environment variable is read once, at package initialisation.
+var trustedProxyCIDRs = parseTrustedProxyCIDRs(os.Getenv("TRUSTED_PROXY_CIDRS"))
+
+// parseTrustedProxyCIDRs turns a comma-separated list of CIDRs and bare IP
+// addresses into networks. Surrounding whitespace and empty items are
+// ignored; an item that is neither a valid CIDR nor a valid IP is logged at
+// warn level and skipped. It returns nil when raw is blank.
+func parseTrustedProxyCIDRs(raw string) []*net.IPNet {
+	raw = strings.TrimSpace(raw)
+	if raw == "" {
+		return nil
+	}
+	var nets []*net.IPNet
+	for _, p := range strings.Split(raw, ",") {
+		p = strings.TrimSpace(p)
+		if p == "" {
+			continue
+		}
+		// A bare IP becomes a single-host network. Build the IPNet directly
+		// instead of appending "/32" or "/128" to the text: an IPv4-mapped
+		// IPv6 literal such as "::ffff:10.0.0.1" satisfies To4(), and
+		// net.ParseCIDR reads "::ffff:10.0.0.1/32" as the IPv6 network
+		// ::/32, which never matches the host that was meant.
+		if !strings.Contains(p, "/") {
+			if ip := net.ParseIP(p); ip != nil {
+				if ip4 := ip.To4(); ip4 != nil {
+					nets = append(nets, &net.IPNet{IP: ip4, Mask: net.CIDRMask(32, 32)})
+				} else {
+					nets = append(nets, &net.IPNet{IP: ip, Mask: net.CIDRMask(128, 128)})
+				}
+				continue
+			}
+		}
+		_, n, err := net.ParseCIDR(p)
+		if err != nil {
+			slog.Warn("ignoring invalid TRUSTED_PROXY_CIDRS entry", "value", p, "error", err)
+			continue
+		}
+		nets = append(nets, n)
+	}
+	return nets
+}
+
+// clientIP returns the per-request "client" address used for rate-limit
+// keying and audit attribution. X-Forwarded-For is honored ONLY when the
+// direct peer (r.RemoteAddr) belongs to a configured trusted-proxy CIDR;
+// otherwise the header is ignored to prevent header-spoofing bypasses.
+//
+// The result is the host part of r.RemoteAddr, or the canonical text form
+// of the first untrusted address found walking X-Forwarded-For from the
+// right.
+func clientIP(r *http.Request) string {
+	peer := r.RemoteAddr
+	if host, _, err := net.SplitHostPort(peer); err == nil {
+		peer = host
+	}
+	if len(trustedProxyCIDRs) == 0 {
+		return peer
+	}
+	peerIP := net.ParseIP(peer)
+	if peerIP == nil || !isTrustedProxy(peerIP) {
+		return peer
+	}
+	// A request may carry several X-Forwarded-For field lines, which are
+	// equivalent to one comma-joined list in arrival order. Header.Get only
+	// returns the FIRST line, so a client-supplied line could shadow the one
+	// our proxy appended; join all of them before walking.
+	fwd := strings.Join(r.Header.Values("X-Forwarded-For"), ",")
+	if strings.TrimSpace(fwd) == "" {
+		return peer
+	}
+	// Walk X-Forwarded-For from the RIGHTMOST entry (the address closest to
+	// us, appended by our trusted peer) leftward, skipping entries that are
+	// themselves trusted proxies, and return the first untrusted address.
+	// The LEFTMOST entry is fully client-controlled — trusting it (as a
+	// naive `fwd[:firstComma]` does) lets an attacker spoof their rate-limit
+	// and audit identity by prepending a forged value, defeating the per-IP
+	// login limiter.
+	parts := strings.Split(fwd, ",")
+	for i := len(parts) - 1; i >= 0; i-- {
+		candidate := strings.TrimSpace(parts[i])
+		if candidate == "" {
+			// Empty list elements carry no information.
+			continue
+		}
+		ip := parseForwardedIP(candidate)
+		if ip == nil {
+			// An entry we cannot interpret breaks the chain of trust:
+			// everything to its left may be client-controlled, so stop
+			// here rather than skip over it.
+			return peer
+		}
+		if isTrustedProxy(ip) {
+			continue
+		}
+		// Canonical form, so one address spelled two ways (IPv4-mapped
+		// IPv6, hex case, zero compression) maps to a single limiter key.
+		return ip.String()
+	}
+	// Every forwarded entry was a trusted proxy (or empty) — fall back to
+	// the direct peer.
+	return peer
+}
+
+// parseForwardedIP parses one X-Forwarded-For list element. Besides a plain
+// IP it accepts "ip:port", "[ipv6]:port" and "[ipv6]". It returns nil when
+// the element is not an IP address.
+func parseForwardedIP(s string) net.IP {
+	if ip := net.ParseIP(s); ip != nil {
+		return ip
+	}
+	if host, _, err := net.SplitHostPort(s); err == nil {
+		return net.ParseIP(host)
+	}
+	return net.ParseIP(strings.Trim(s, "[]"))
+}
+
+// isTrustedProxy reports whether ip lies inside at least one network of
+// trustedProxyCIDRs. With no networks configured it is always false.
+func isTrustedProxy(ip net.IP) bool {
+	trusted := false
+	for i := range trustedProxyCIDRs {
+		if trustedProxyCIDRs[i].Contains(ip) {
+			trusted = true
+			break
+		}
+	}
+	return trusted
+}
+
 // statusRecorder wraps http.ResponseWriter to capture the status code.
 type statusRecorder struct {
 	http.ResponseWriter
@@ -0,0 +1,106 @@
+package api
+
+// Outgoing-webhook signing-secret + send-test endpoints. After the hard
+// cutover only the settings tier survives at the API surface; per-workload
+// notification settings live on the workload row itself and are accessed
+// via the workload endpoints.
+
+import (
+	"context"
+	"log/slog"
+	"net/http"
+	"time"
+
+	"github.com/alexei/tinyforge/internal/notify"
+)
+
+// notificationSecretResponse is the JSON payload returned by the GET,
+// regenerate and disable endpoints for the settings notification secret.
+// Secret carries the secret in cleartext (empty after disable) and
+// HasSecret tells whether a non-empty secret is configured. The secret is
+// revealed in cleartext on each request — UI is expected to copy or hash it
+// for display, not store it long-term.
+type notificationSecretResponse struct {
+	Secret    string `json:"secret"`
+	HasSecret bool   `json:"has_secret"`
+}
+
+// testEventTimeout bounds how long a "send test" request waits for the
+// receiver before the attempt is abandoned.
+// NOTE(review): chosen to mirror the production notifier's per-request
+// timeout (10s) so test results predict real sends — confirm against
+// internal/notify, which is not visible here.
+const testEventTimeout = 10 * time.Second
+
+// buildTestEvent returns the synthetic event used by "send test"
+// endpoints: Type is always "test", Project and Stage are taken from the
+// arguments, and every other field keeps its zero value. The distinct type
+// lets a receiver tell a wiring check apart from a real deploy event.
+func buildTestEvent(project, stage string) notify.Event {
+	var ev notify.Event
+	ev.Type = "test"
+	ev.Project = project
+	ev.Stage = stage
+	return ev
+}
+
+// ---------------------------------------------------------------------------
+// Global / settings tier
+// ---------------------------------------------------------------------------
+
+// getSettingsNotificationSecret handles GET /api/settings/notification-secret.
+// It asks the store for the settings-tier secret through
+// EnsureSettingsNotificationSecret — which, per the original note, lazily
+// generates one if none was ever set (typical for sites upgrading from a
+// pre-signing build) — and returns it in cleartext. A store failure is
+// logged and answered with a generic 500.
+func (s *Server) getSettingsNotificationSecret(w http.ResponseWriter, r *http.Request) {
+	secret, err := s.store.EnsureSettingsNotificationSecret()
+	if err == nil {
+		payload := notificationSecretResponse{
+			Secret:    secret,
+			HasSecret: secret != "",
+		}
+		respondJSON(w, http.StatusOK, payload)
+		return
+	}
+	slog.Error("get settings notification secret", "error", err)
+	respondError(w, http.StatusInternalServerError, "failed to load secret")
+}
+
+// regenerateSettingsNotificationSecret handles POST
+// /api/settings/notification-secret/regenerate. It stores a freshly
+// generated secret in place of the current one — so signatures made with
+// the old secret no longer verify — logs the rotation, and returns the new
+// secret in cleartext. A store failure is logged and answered with 500.
+func (s *Server) regenerateSettingsNotificationSecret(w http.ResponseWriter, r *http.Request) {
+	fresh := generateWebhookSecret()
+	err := s.store.SetSettingsNotificationSecret(fresh)
+	if err != nil {
+		slog.Error("regenerate settings notification secret", "error", err)
+		respondError(w, http.StatusInternalServerError, "failed to rotate secret")
+		return
+	}
+	slog.Info("settings notification secret rotated")
+	payload := notificationSecretResponse{Secret: fresh, HasSecret: true}
+	respondJSON(w, http.StatusOK, payload)
+}
+
+// disableSettingsNotificationSigning handles POST
+// /api/settings/notification-secret/disable. It stores an empty secret so
+// that subsequent outgoing notifications are unsigned — useful for
+// receivers that cannot verify HMAC signatures — and answers with an empty
+// secret and has_secret=false. A store failure is logged and answered
+// with 500.
+func (s *Server) disableSettingsNotificationSigning(w http.ResponseWriter, r *http.Request) {
+	err := s.store.SetSettingsNotificationSecret("")
+	if err != nil {
+		slog.Error("disable settings notification signing", "error", err)
+		respondError(w, http.StatusInternalServerError, "failed to disable signing")
+		return
+	}
+	cleared := notificationSecretResponse{Secret: "", HasSecret: false}
+	respondJSON(w, http.StatusOK, cleared)
+}
+
+// settingsNotificationTest handles POST /api/settings/notification-test.
+// Sends a synthetic test event to the global webhook URL using the global
+// secret.
+//
+// Responses: 500 when settings cannot be loaded (the cause is logged), 400
+// when no global notification URL is configured, otherwise 200 carrying
+// whatever result the notifier reports for the attempt. The send is bounded
+// by testEventTimeout and is cancelled if the client disconnects, because
+// the context derives from the request.
+func (s *Server) settingsNotificationTest(w http.ResponseWriter, r *http.Request) {
+	settings, err := s.store.GetSettings()
+	if err != nil {
+		// Log the cause like the sibling handlers do; the client only gets
+		// a generic message.
+		slog.Error("load settings for notification test", "error", err)
+		respondError(w, http.StatusInternalServerError, "failed to load settings")
+		return
+	}
+	if settings.NotificationURL == "" {
+		respondError(w, http.StatusBadRequest, "no global notification URL configured")
+		return
+	}
+	ctx, cancel := context.WithTimeout(r.Context(), testEventTimeout)
+	defer cancel()
+	result := s.notifier.SendSyncForTest(
+		ctx, settings.NotificationURL, settings.NotificationSecret, notify.TierSettings,
+		buildTestEvent("__tinyforge__", ""),
+	)
+	respondJSON(w, http.StatusOK, result)
+}
@@ -1,153 +0,0 @@
-package api
-
-import (
-	"errors"
-	"net/http"
-
-	"github.com/go-chi/chi/v5"
-
-	"github.com/alexei/docker-watcher/internal/store"
-)
-
-// projectRequest is the expected JSON body for creating/updating a project.
-type projectRequest struct {
-	Name        string `json:"name"`
-	Registry    string `json:"registry"`
-	Image       string `json:"image"`
-	Port        int    `json:"port"`
-	Healthcheck string `json:"healthcheck"`
-	Env         string `json:"env"`
-	Volumes     string `json:"volumes"`
-}
-
-// listProjects handles GET /api/projects.
-func (s *Server) listProjects(w http.ResponseWriter, r *http.Request) {
-	projects, err := s.store.GetAllProjects()
-	if err != nil {
-		respondError(w, http.StatusInternalServerError, "failed to list projects: "+err.Error())
-		return
-	}
-	respondJSON(w, http.StatusOK, projects)
-}
-
-// createProject handles POST /api/projects.
-func (s *Server) createProject(w http.ResponseWriter, r *http.Request) {
-	var req projectRequest
-	if !decodeJSON(w, r, &req) {
-		return
-	}
-
-	if req.Name == "" {
-		respondError(w, http.StatusBadRequest, "name is required")
-		return
-	}
-	if req.Image == "" {
-		respondError(w, http.StatusBadRequest, "image is required")
-		return
-	}
-	if req.Env == "" {
-		req.Env = "{}"
-	}
-	if req.Volumes == "" {
-		req.Volumes = "{}"
-	}
-
-	project, err := s.store.CreateProject(store.Project{
-		Name:        req.Name,
-		Registry:    req.Registry,
-		Image:       req.Image,
-		Port:        req.Port,
-		Healthcheck: req.Healthcheck,
-		Env:         req.Env,
-		Volumes:     req.Volumes,
-	})
-	if err != nil {
-		respondError(w, http.StatusInternalServerError, "failed to create project: "+err.Error())
-		return
-	}
-	respondJSON(w, http.StatusCreated, project)
-}
-
-// getProject handles GET /api/projects/{id}.
-func (s *Server) getProject(w http.ResponseWriter, r *http.Request) {
-	id := chi.URLParam(r, "id")
-	project, err := s.store.GetProjectByID(id)
-	if err != nil {
-		if errors.Is(err, store.ErrNotFound) {
-			respondNotFound(w, "project")
-			return
-		}
-		respondError(w, http.StatusInternalServerError, "failed to get project: "+err.Error())
-		return
-	}
-
-	// Also fetch stages for this project.
-	stages, err := s.store.GetStagesByProjectID(id)
-	if err != nil {
-		respondError(w, http.StatusInternalServerError, "failed to get stages: "+err.Error())
-		return
-	}
-
-	respondJSON(w, http.StatusOK, map[string]any{
-		"project": project,
-		"stages":  stages,
-	})
-}
-
-// updateProject handles PUT /api/projects/{id}.
-func (s *Server) updateProject(w http.ResponseWriter, r *http.Request) {
-	id := chi.URLParam(r, "id")
-
-	existing, err := s.store.GetProjectByID(id)
-	if err != nil {
-		if errors.Is(err, store.ErrNotFound) {
-			respondNotFound(w, "project")
-			return
-		}
-		respondError(w, http.StatusInternalServerError, "failed to get project: "+err.Error())
-		return
-	}
-
-	var req projectRequest
-	if !decodeJSON(w, r, &req) {
-		return
-	}
-
-	// Apply updates to existing project, preserving fields not provided.
-	updated := existing
-	if req.Name != "" {
-		updated.Name = req.Name
-	}
-	if req.Image != "" {
-		updated.Image = req.Image
-	}
-	updated.Registry = req.Registry
-	updated.Port = req.Port
-	updated.Healthcheck = req.Healthcheck
-	if req.Env != "" {
-		updated.Env = req.Env
-	}
-	if req.Volumes != "" {
-		updated.Volumes = req.Volumes
-	}
-
-	if err := s.store.UpdateProject(updated); err != nil {
-		respondError(w, http.StatusInternalServerError, "failed to update project: "+err.Error())
-		return
-	}
-	respondJSON(w, http.StatusOK, updated)
-}
-
-// deleteProject handles DELETE /api/projects/{id}.
-func (s *Server) deleteProject(w http.ResponseWriter, r *http.Request) {
-	id := chi.URLParam(r, "id")
-	if err := s.store.DeleteProject(id); err != nil {
-		if errors.Is(err, store.ErrNotFound) {
-			respondNotFound(w, "project")
-			return
-		}
-		respondError(w, http.StatusInternalServerError, "failed to delete project: "+err.Error())
-		return
-	}
-	respondJSON(w, http.StatusOK, map[string]string{"deleted": id})
-}
@@ -0,0 +1,35 @@
+package api
+
+import (
+	"log/slog"
+	"net/http"
+	"sort"
+)
+
+// listProxyRoutes handles GET /api/proxies. Returns proxy routes derived
+// from the containers index — the legacy static-site / project split is
+// gone; any workload whose container carries a proxy route ID is listed.
+//
+// Routes are returned sorted by Domain, then by ProjectName. When there are
+// no routes the response data is an empty JSON array, never null. Store
+// failures are logged and answered with a generic 500.
+func (s *Server) listProxyRoutes(w http.ResponseWriter, r *http.Request) {
+	settings, err := s.store.GetSettings()
+	if err != nil {
+		slog.Error("failed to get settings for proxy routes", "error", err)
+		respondError(w, http.StatusInternalServerError, "internal server error")
+		return
+	}
+
+	routes, err := s.store.ListProxyRoutes(settings.Domain)
+	if err != nil {
+		slog.Error("failed to list proxy routes", "error", err)
+		respondError(w, http.StatusInternalServerError, "internal server error")
+		return
+	}
+
+	if len(routes) == 0 {
+		// respondJSON encodes its argument as-is, so a nil slice would be
+		// sent as JSON null; send an explicit empty array for list clients.
+		respondJSON(w, http.StatusOK, []any{})
+		return
+	}
+
+	sort.SliceStable(routes, func(i, j int) bool {
+		if routes[i].Domain == routes[j].Domain {
+			return routes[i].ProjectName < routes[j].ProjectName
+		}
+		return routes[i].Domain < routes[j].Domain
+	})
+
+	respondJSON(w, http.StatusOK, routes)
+}
@@ -8,9 +8,9 @@ import (

 	"github.com/go-chi/chi/v5"

-	"github.com/alexei/docker-watcher/internal/crypto"
-	"github.com/alexei/docker-watcher/internal/registry"
-	"github.com/alexei/docker-watcher/internal/store"
+	"github.com/alexei/tinyforge/internal/crypto"
+	"github.com/alexei/tinyforge/internal/registry"
+	"github.com/alexei/tinyforge/internal/store"
 )

 // registryRequest is the expected JSON body for creating/updating a registry.
@@ -26,7 +26,8 @@ type registryRequest struct {
 func (s *Server) listRegistries(w http.ResponseWriter, r *http.Request) {
 	registries, err := s.store.GetAllRegistries()
 	if err != nil {
-		respondError(w, http.StatusInternalServerError, "failed to list registries: "+err.Error())
+		slog.Error("failed to list registries", "error", err)
+		respondError(w, http.StatusInternalServerError, "internal server error")
 		return
 	}

@@ -80,7 +81,8 @@ func (s *Server) createRegistry(w http.ResponseWriter, r *http.Request) {
 	// Encrypt the token if provided.
 	encToken, err := crypto.EncryptIfNotEmpty(s.encKey, req.Token)
 	if err != nil {
-		respondError(w, http.StatusInternalServerError, "failed to encrypt token: "+err.Error())
+		// Log the cause server-side; the client only gets a generic message.
+		slog.Error("failed to encrypt token", "error", err)
+		respondError(w, http.StatusInternalServerError, "internal server error")
 		return
 	}

@@ -92,7 +94,8 @@ func (s *Server) createRegistry(w http.ResponseWriter, r *http.Request) {
 		Owner: req.Owner,
 	})
 	if err != nil {
-		respondError(w, http.StatusInternalServerError, "failed to create registry: "+err.Error())
+		slog.Error("failed to create registry", "error", err)
+		respondError(w, http.StatusInternalServerError, "internal server error")
 		return
 	}

@@ -112,7 +115,8 @@ func (s *Server) updateRegistry(w http.ResponseWriter, r *http.Request) {
 			respondNotFound(w, "registry")
 			return
 		}
-		respondError(w, http.StatusInternalServerError, "failed to get registry: "+err.Error())
+		// Log the cause server-side; the client only gets a generic message.
+		slog.Error("failed to get registry", "error", err)
+		respondError(w, http.StatusInternalServerError, "internal server error")
 		return
 	}

@@ -138,14 +142,16 @@ func (s *Server) updateRegistry(w http.ResponseWriter, r *http.Request) {
 	if req.Token != "" {
 		encToken, err := crypto.EncryptIfNotEmpty(s.encKey, req.Token)
 		if err != nil {
-			respondError(w, http.StatusInternalServerError, "failed to encrypt token: "+err.Error())
+			slog.Error("failed to encrypt token", "error", err)
+			respondError(w, http.StatusInternalServerError, "internal server error")
 			return
 		}
 		updated.Token = encToken
 	}

 	if err := s.store.UpdateRegistry(updated); err != nil {
-		respondError(w, http.StatusInternalServerError, "failed to update registry: "+err.Error())
+		slog.Error("failed to update registry", "error", err)
+		respondError(w, http.StatusInternalServerError, "internal server error")
 		return
 	}
 	respondJSON(w, http.StatusOK, map[string]string{
@@ -162,7 +168,8 @@ func (s *Server) deleteRegistry(w http.ResponseWriter, r *http.Request) {
 			respondNotFound(w, "registry")
 			return
 		}
-		respondError(w, http.StatusInternalServerError, "failed to delete registry: "+err.Error())
+		// Log the cause server-side; the client only gets a generic message.
+		slog.Error("failed to delete registry", "error", err)
+		respondError(w, http.StatusInternalServerError, "internal server error")
 		return
 	}
 	respondJSON(w, http.StatusOK, map[string]string{"deleted": id})
@@ -184,7 +191,8 @@ func (s *Server) testRegistry(w http.ResponseWriter, r *http.Request) {
 			respondNotFound(w, "registry")
 			return
 		}
-		respondError(w, http.StatusInternalServerError, "failed to get registry: "+err.Error())
+		// Log the cause server-side; the client only gets a generic message.
+		slog.Error("failed to get registry", "error", err)
+		respondError(w, http.StatusInternalServerError, "internal server error")
 		return
 	}

@@ -244,7 +252,8 @@ func (s *Server) listRegistryTags(w http.ResponseWriter, r *http.Request) {
 			respondNotFound(w, "registry")
 			return
 		}
-		respondError(w, http.StatusInternalServerError, "failed to get registry: "+err.Error())
+		// Log the cause server-side; the client only gets a generic message.
+		slog.Error("failed to get registry", "error", err)
+		respondError(w, http.StatusInternalServerError, "internal server error")
 		return
 	}

@@ -285,7 +294,8 @@ func (s *Server) listRegistryImages(w http.ResponseWriter, r *http.Request) {
 			respondNotFound(w, "registry")
 			return
 		}
-		respondError(w, http.StatusInternalServerError, "failed to get registry: "+err.Error())
+		// Log the cause server-side; the client only gets a generic message.
+		slog.Error("failed to get registry", "error", err)
+		respondError(w, http.StatusInternalServerError, "internal server error")
 		return
 	}

@@ -4,7 +4,6 @@ import (
 	"encoding/json"
 	"log/slog"
 	"net/http"
-	"reflect"
 )

 // envelope is the standard API response wrapper.
@@ -15,15 +14,7 @@ type envelope struct {
 }

 // respondJSON writes a JSON success response with the given status code and data.
-// Nil slices are converted to empty arrays to avoid "null" in JSON output.
 func respondJSON(w http.ResponseWriter, status int, data any) {
-	// Convert nil slices to empty arrays so JSON encodes as [] not null.
-	if data != nil {
-		v := reflect.ValueOf(data)
-		if v.Kind() == reflect.Slice && v.IsNil() {
-			data = reflect.MakeSlice(v.Type(), 0, 0).Interface()
-		}
-	}
 	w.Header().Set("Content-Type", "application/json")
 	w.WriteHeader(status)
 	if err := json.NewEncoder(w).Encode(envelope{Success: true, Data: data}); err != nil {
@@ -47,6 +38,10 @@ func respondNotFound(w http.ResponseWriter, entity string) {

 // decodeJSON reads and decodes the request body into the given value.
 // Returns false and writes a 400 error response if decoding fails.
+//
+// Lenient: unknown fields are silently dropped to keep legacy clients
+// compatible. New endpoints that take opaque user-controlled JSON should
+// use decodeJSONStrict instead.
 func decodeJSON(w http.ResponseWriter, r *http.Request, v any) bool {
 	if err := json.NewDecoder(r.Body).Decode(v); err != nil {
 		respondError(w, http.StatusBadRequest, "invalid JSON: "+err.Error())
@@ -54,3 +49,17 @@ func decodeJSON(w http.ResponseWriter, r *http.Request, v any) bool {
 	}
 	return true
 }
+
+// decodeJSONStrict decodes the request body into v like decodeJSON, but
+// with DisallowUnknownFields enabled, so a field the target type does not
+// declare is an error instead of being dropped. Intended for endpoints
+// whose request shape is opaque (e.g. workload source/trigger config
+// blobs), where surfacing a typo beats silently ignoring it.
+//
+// Returns true on success. On failure it writes a 400 response containing
+// the decoder's error text and returns false.
+// NOTE(review): only the first JSON value in the body is decoded; data
+// following it is not rejected.
+func decodeJSONStrict(w http.ResponseWriter, r *http.Request, v any) bool {
+	strict := json.NewDecoder(r.Body)
+	strict.DisallowUnknownFields()
+	err := strict.Decode(v)
+	if err == nil {
+		return true
+	}
+	respondError(w, http.StatusBadRequest, "invalid JSON: "+err.Error())
+	return false
+}
@@ -3,34 +3,84 @@ package api
 import (
 	"context"
 	"log/slog"
+	"sync"
+	"sync/atomic"

 	"github.com/go-chi/chi/v5"

-	"github.com/alexei/docker-watcher/internal/auth"
-	"github.com/alexei/docker-watcher/internal/crypto"
-	"github.com/alexei/docker-watcher/internal/docker"
-	"github.com/alexei/docker-watcher/internal/events"
-	"github.com/alexei/docker-watcher/internal/store"
-	"github.com/alexei/docker-watcher/internal/webhook"
+	"github.com/alexei/tinyforge/internal/auth"
+	"github.com/alexei/tinyforge/internal/backup"
+	"github.com/alexei/tinyforge/internal/crypto"
+	"github.com/alexei/tinyforge/internal/dns"
+	"github.com/alexei/tinyforge/internal/docker"
+	"github.com/alexei/tinyforge/internal/events"
+	"github.com/alexei/tinyforge/internal/notify"
+	"github.com/alexei/tinyforge/internal/npm"
+	"github.com/alexei/tinyforge/internal/proxy"
+	"github.com/alexei/tinyforge/internal/stale"
+	"github.com/alexei/tinyforge/internal/store"
+	"github.com/alexei/tinyforge/internal/volsnap"
+	"github.com/alexei/tinyforge/internal/webhook"
+	"github.com/alexei/tinyforge/internal/workload/plugin"
 )

+// DNSProviderChangedFunc is called when DNS settings change so the caller can
+// update the provider on the deployer. It is the callback type the server
+// stores through SetDNSProviderChangedCallback; the argument is the
+// dns.Provider the caller should switch to.
+type DNSProviderChangedFunc func(provider dns.Provider)
+
+// PluginDispatcher is the subset of the deployer the API layer uses for the
+// plugin-native dispatch surface (generic-hooks endpoint + workload teardown
+// + future surfaces). Defined here so the API does not import the deployer
+// package directly.
+//
+// It embeds webhook.PluginDispatcher, so any implementation must satisfy
+// that interface too, and adds DispatchTeardown for a single workload.
+type PluginDispatcher interface {
+	webhook.PluginDispatcher
+	DispatchTeardown(ctx context.Context, w plugin.Workload) error
+}
+
 // Server holds all dependencies for the API layer.
+//
+// Fields in the first group are supplied to NewServer, except oidcProvider
+// and staleScanner. The remaining fields are injected afterwards through
+// the Set* methods. Of all the fields, only dnsProvider is guarded by a
+// mutex; the other setters write their field without synchronization.
 type Server struct {
-	store        *store.Store
-	docker       *docker.Client
-	deployer     DeployTriggerer
-	webhook      *webhook.Handler
-	eventBus     *events.Bus
-	encKey       [32]byte
-	localAuth    *auth.LocalAuth
-	oidcProvider *auth.OIDCProvider
+	store         *store.Store
+	docker        *docker.Client
+	npm           *npm.Client // optional: only for NPM-specific endpoints (certificates)
+	proxyProvider proxy.Provider
+	deployer      PluginDispatcher
+	notifier      *notify.Notifier
+	webhook       *webhook.Handler
+	eventBus      *events.Bus
+	encKey        [32]byte
+	localAuth     *auth.LocalAuth
+	oidcProvider  *auth.OIDCProvider
+	staleScanner  *stale.Scanner
+
+	// dnsProviderMu guards dnsProvider (see SetDNSProvider and
+	// getDNSProviderLocked).
+	dnsProviderMu        sync.RWMutex
+	dnsProvider          dns.Provider
+	onDNSProviderChanged DNSProviderChangedFunc
+
+	backupEngine            *backup.Engine
+	snapshotEngine          *volsnap.Engine
+	sseGate                 *sseGate
+	logScanReloader         LogScanReloader
+	dbPath                  string
+	shutdownFunc            func()                                // called after restore to trigger graceful shutdown
+	onBackupSettingsChanged func(enabled bool, intervalHours int) // called when backup settings change
+	onProxyProviderChanged  func(provider proxy.Provider)         // called when proxy provider changes
+
+	// restoreInFlight is a process-wide guard against double-firing
+	// the restore endpoint. A rapid double-click would otherwise
+	// schedule two goroutines racing s.store.Close() and the
+	// candidate-over-live rename. CAS to true at the entry point;
+	// reject the second caller with 409 Conflict.
+	restoreInFlight atomic.Bool
 }

 // NewServer creates a new API Server with all required dependencies.
 func NewServer(
 	st *store.Store,
 	dockerClient *docker.Client,
-	deployer DeployTriggerer,
+	npmClient *npm.Client,
+	proxyProvider proxy.Provider,
+	deployer PluginDispatcher,
+	notifier *notify.Notifier,
 	webhookHandler *webhook.Handler,
 	eventBus *events.Bus,
 	encKey [32]byte,
@@ -38,13 +88,17 @@ func NewServer(
 	localAuth := auth.NewLocalAuth(encKey)

 	s := &Server{
-		store:     st,
-		docker:    dockerClient,
-		deployer:  deployer,
-		webhook:   webhookHandler,
-		eventBus:  eventBus,
-		encKey:    encKey,
-		localAuth: localAuth,
+		store:         st,
+		docker:        dockerClient,
+		npm:           npmClient,
+		proxyProvider: proxyProvider,
+		deployer:      deployer,
+		notifier:      notifier,
+		webhook:       webhookHandler,
+		eventBus:      eventBus,
+		encKey:        encKey,
+		localAuth:     localAuth,
+		sseGate:       newSSEGate(maxConcurrentSSEStreams),
 	}

 	// Try to initialize OIDC provider from stored settings.
@@ -56,15 +110,94 @@ func NewServer(
 	return s
 }

+// SetStaleScanner sets the stale scanner on the server.
+// Called after both the API server and scanner are initialized.
+// The field is written without a lock.
+func (s *Server) SetStaleScanner(scanner *stale.Scanner) {
+	s.staleScanner = scanner
+}
+
+// SetBackupEngine sets the backup engine on the server, replacing any
+// engine set earlier. The field is written without a lock.
+func (s *Server) SetBackupEngine(engine *backup.Engine) {
+	s.backupEngine = engine
+}
+
+// SetSnapshotEngine sets the volume-snapshot engine on the server,
+// replacing any engine set earlier. The field is written without a lock.
+func (s *Server) SetSnapshotEngine(engine *volsnap.Engine) {
+	s.snapshotEngine = engine
+}
+
+// SetDBPath sets the database file path (needed for restore). The path is
+// stored as given; it is not validated here.
+func (s *Server) SetDBPath(path string) {
+	s.dbPath = path
+}
+
+// SetShutdownFunc sets the function called after a restore to trigger graceful shutdown.
+// The function is only stored here, not invoked.
+func (s *Server) SetShutdownFunc(fn func()) {
+	s.shutdownFunc = fn
+}
+
+// SetBackupSettingsChangedCallback sets the callback for when backup settings change.
+// The callback receives the new enabled flag and the interval in hours; it
+// is only stored here, not invoked.
+func (s *Server) SetBackupSettingsChangedCallback(fn func(enabled bool, intervalHours int)) {
+	s.onBackupSettingsChanged = fn
+}
+
+// SetProxyProviderChangedCallback sets the callback for when the proxy provider changes.
+// The callback receives a proxy.Provider; it is only stored here, not
+// invoked.
+func (s *Server) SetProxyProviderChangedCallback(fn func(provider proxy.Provider)) {
+	s.onProxyProviderChanged = fn
+}
+
+// SetProxyProvider updates the proxy provider at runtime.
+// Unlike SetDNSProvider, this write is not guarded by a mutex.
+// NOTE(review): confirm that no handler can read proxyProvider while a
+// runtime swap is in progress.
+func (s *Server) SetProxyProvider(provider proxy.Provider) {
+	s.proxyProvider = provider
+}
+
+// SetDNSProvider replaces the server's current DNS provider. The write
+// happens under the dnsProviderMu write lock, so it may be called while
+// other goroutines read the provider through getDNSProviderLocked.
+func (s *Server) SetDNSProvider(provider dns.Provider) {
+	s.dnsProviderMu.Lock()
+	s.dnsProvider = provider
+	s.dnsProviderMu.Unlock()
+}
+
+// getDNSProviderLocked returns the current DNS provider. Despite the
+// "Locked" suffix the caller must NOT already hold dnsProviderMu: the
+// method takes the read lock itself for the duration of the read.
+func (s *Server) getDNSProviderLocked() dns.Provider {
+	s.dnsProviderMu.RLock()
+	current := s.dnsProvider
+	s.dnsProviderMu.RUnlock()
+	return current
+}
+
+// SetDNSProviderChangedCallback sets the callback for when DNS settings change.
+// The callback is only stored here, not invoked, and the write does not
+// take dnsProviderMu.
+func (s *Server) SetDNSProviderChangedCallback(fn DNSProviderChangedFunc) {
+	s.onDNSProviderChanged = fn
+}
+
 // initOIDCProvider creates an OIDC provider from settings. Errors are logged, not fatal.
 func (s *Server) initOIDCProvider(ctx context.Context, as store.AuthSettings) {
-	// Decrypt the OIDC client secret if it's encrypted.
+	// Decrypt the OIDC client secret. The prior code did a try-decrypt
+	// and silently treated failures as plaintext — under a rotated key
+	// that sent ciphertext upstream to the OP. Now:
+	//   - If the value carries the tf1: envelope → fail loud on
+	//     decrypt failure (rotated key / corrupted ciphertext).
+	//   - If the value is unprefixed (legacy ciphertext from v0 or true
+	//     plaintext from an old migration) → try decrypt; on failure
+	//     accept as plaintext (the only safe legacy interpretation).
 	clientSecret := as.OIDCClientSecret
 	if clientSecret != "" {
-		if decrypted, err := crypto.Decrypt(s.encKey, clientSecret); err == nil {
+		switch {
+		case crypto.HasEnvelope(clientSecret):
+			decrypted, err := crypto.Decrypt(s.encKey, clientSecret)
+			if err != nil {
+				slog.Error("OIDC client secret could not be decrypted — refusing to initialize provider",
+					"error", err,
+					"hint", "rotate ENCRYPTION_KEY back, OR re-save OIDC settings to re-encrypt with the current key")
+				return
+			}
 			clientSecret = decrypted
+		default:
+			// Legacy v0 value: try decrypt; on failure assume plaintext.
+			if decrypted, err := crypto.Decrypt(s.encKey, clientSecret); err == nil {
+				clientSecret = decrypted
+			}
 		}
-		// If decrypt fails, assume it's already plaintext (migration scenario).
 	}
 	provider, err := auth.NewOIDCProvider(ctx, auth.OIDCConfig{
 		IssuerURL:    as.OIDCIssuerURL,
@@ -84,101 +217,328 @@ func (s *Server) initOIDCProvider(ctx context.Context, as store.AuthSettings) {
 func (s *Server) Router() chi.Router {
 	r := chi.NewRouter()

-	// Global middleware.
+	// Global middleware. requestID runs first so every downstream log
+	// line (and the access log emitted by `logging`) carries the same
+	// correlation id, plus the response carries it back on the
+	// X-Request-ID header for the operator to grep across services.
+	r.Use(requestID)
 	r.Use(recovery)
+	r.Use(securityHeaders)
 	r.Use(logging)
 	r.Use(cors)

+	// Unauthenticated health probes — mounted at the root so container
+	// orchestrators / load balancers can hit them without knowing about
+	// the /api prefix. /livez intentionally does no work and stays
+	// unbounded; /readyz pings the DB and is rate-limited to keep an
+	// unauthenticated flood from serialising behind SQLite's single
+	// writer connection (busy-timeout = 5s) and log-amplifying every
+	// request via the structured access log. The 10-per-minute budget
+	// is the existing rateLimiter default — generous for k8s readiness
+	// probes (typically every 5-10s), restrictive for an attacker.
+	r.Get("/livez", s.livez)
+	readyLimiter := newRateLimiter()
+	r.With(rateLimitMiddleware(readyLimiter)).Get("/readyz", s.readyz)
+
+	loginLimiter := newRateLimiter()
+	webhookLimiter := newRateLimiter()
+
 	r.Route("/api", func(r chi.Router) {
-		// JSON content type only for API routes (not static files).
+		// JSON content type and body size limit for API routes.
 		r.Use(jsonContentType)
+		r.Use(limitBody)

 		// Public auth endpoints (no auth required).
-		r.Post("/auth/login", s.login)
+		r.Get("/auth/mode", s.authMode)
+		r.Post("/auth/login", s.rateLimitedLogin(loginLimiter))
 		r.Get("/auth/oidc/login", s.oidcLogin)
 		r.Get("/auth/oidc/callback", s.oidcCallback)
+		r.Post("/auth/oidc/token", s.oidcExchangeToken)

 		// Webhook handler (uses its own secret-based auth).
-		r.Mount("/webhook", s.webhook.Route())
+		// Per-IP rate limit prevents an attacker who has guessed (or leaked)
+		// a secret from triggering a deploy storm, and rejects unauthenticated
+		// brute-force probes over the secret URL space.
+		r.With(rateLimitMiddleware(webhookLimiter)).Mount("/webhook", s.webhook.Route())

 		// Protected routes: require valid JWT.
 		r.Group(func(r chi.Router) {
 			r.Use(auth.Middleware(s.localAuth))

-			// Config export (protected — reveals project/infra details).
-			r.Get("/config/export", s.exportConfig)
+			// Plugin registry inspection + unified ingress.
+			r.Get("/hooks/kinds", s.listHookKinds)
+			r.Get("/hooks/kinds/{kind}/schema", s.getHookKindSchema)
+			r.With(auth.AdminOnly).Post("/hooks/generic", s.dispatchGeneric)

-			// Auth management.
-			r.Get("/auth/me", s.currentUser)
-			r.Get("/auth/settings", s.getAuthSettings)
-			r.Put("/auth/settings", s.updateAuthSettings)
-			r.Get("/auth/users", s.listUsers)
-			r.Post("/auth/users", s.createUser)
-			r.Delete("/auth/users/{uid}", s.deleteUser)
-
-			// Project endpoints.
-			r.Get("/projects", s.listProjects)
-			r.Post("/projects", s.createProject)
-			r.Route("/projects/{id}", func(r chi.Router) {
-				r.Get("/", s.getProject)
-				r.Put("/", s.updateProject)
-				r.Delete("/", s.deleteProject)
-
-				// Stage endpoints.
-				r.Post("/stages", s.createStage)
-				r.Put("/stages/{stage}", s.updateStage)
-				r.Delete("/stages/{stage}", s.deleteStage)
-
-				// Stage env override endpoints.
-				r.Get("/stages/{stage}/env", s.listStageEnv)
-				r.Post("/stages/{stage}/env", s.createStageEnv)
-				r.Put("/stages/{stage}/env/{envId}", s.updateStageEnv)
-				r.Delete("/stages/{stage}/env/{envId}", s.deleteStageEnv)
-
-				// Instance endpoints.
-				r.Get("/stages/{stage}/instances", s.listInstances)
-				r.Post("/stages/{stage}/instances", s.deployInstance)
-				r.Delete("/stages/{stage}/instances/{iid}", s.removeInstance)
-
-				// Instance control endpoints.
-				r.Post("/stages/{stage}/instances/{iid}/stop", s.stopInstance)
-				r.Post("/stages/{stage}/instances/{iid}/start", s.startInstance)
-				r.Post("/stages/{stage}/instances/{iid}/restart", s.restartInstance)
-
-				// Volume endpoints.
-				r.Get("/volumes", s.listVolumes)
-				r.Post("/volumes", s.createVolume)
-				r.Put("/volumes/{volId}", s.updateVolume)
-				r.Delete("/volumes/{volId}", s.deleteVolume)
+			// Workload-creation discovery helpers: provider probe,
+			// connection test, repo / branch / tree browsers, and
+			// image-source conflict detection. Admin-gated because
+			// they accept an access token + can enumerate other
+			// workloads' images.
+			r.Group(func(r chi.Router) {
+				r.Use(auth.AdminOnly)
+				r.Post("/discovery/git/detect-provider", s.detectGitProvider)
+				r.Post("/discovery/git/test-connection", s.testGitConnection)
+				r.Post("/discovery/git/repos", s.listGitRepos)
+				r.Post("/discovery/git/branches", s.listGitBranches)
+				r.Post("/discovery/git/tree", s.listGitTree)
+				r.Get("/discovery/image/conflicts", s.listImageConflicts)
+				r.Post("/discovery/image/inspect", s.inspectImageMetadata)
 			})

-			// Deploy endpoints.
-			r.Get("/deploys", s.listDeploys)
-			r.Get("/deploys/{id}/logs", s.streamDeployLogs)
-
-			// SSE endpoint for real-time instance status and deploy events.
+			// Read-only endpoints (any authenticated user).
+			r.Get("/health", s.getHealth)
+			r.Get("/auth/me", s.currentUser)
+			r.Post("/auth/logout", s.logout)
+			r.Get("/proxies", s.listProxyRoutes)
+			r.Get("/docker/unused-images", s.unusedImageStats)
 			r.Get("/events", s.streamEvents)
-
-			// Quick deploy endpoints.
-			r.Post("/deploy/inspect", s.inspectImage)
-			r.Post("/deploy/quick", s.quickDeploy)
-
-			// Registry endpoints.
+			r.Get("/events/log", s.listEventLog)
+			r.Get("/events/log/stats", s.getEventLogStats)
 			r.Get("/registries", s.listRegistries)
-			r.Post("/registries", s.createRegistry)
 			r.Route("/registries/{id}", func(r chi.Router) {
+				// All registry probes are admin-gated. The /tags and
+				// /images endpoints used to be open to any authenticated
+				// user, but they make outbound requests using the
+				// admin-encrypted registry token — a viewer could
+				// effectively drive arbitrary requests against a private
+				// registry under admin credentials.
+				r.Use(auth.AdminOnly)
+				r.Get("/tags/*", s.listRegistryTags)
+				r.Get("/images", s.listRegistryImages)
 				r.Put("/", s.updateRegistry)
 				r.Delete("/", s.deleteRegistry)
 				r.Post("/test", s.testRegistry)
-				r.Get("/tags/*", s.listRegistryTags)
-				r.Get("/images", s.listRegistryImages)
+			})
+			r.Get("/settings", s.getSettings)
+			r.Get("/settings/npm-certificates", s.listNpmCertificates)
+			r.Get("/settings/npm-access-lists", s.listNpmAccessLists)
+
+			// Volume scope metadata (read-only).
+			r.Get("/volumes/scopes", s.listVolumeScopes)
+
+			// Stale container endpoints (read).
+			r.Get("/containers/stale", s.listStaleContainers)
+
+			// Workload-shaped endpoints — the canonical surface after the
+			// hard cutover. Reads open to any authenticated user; mutations
+			// admin-gated.
+			r.Get("/workloads", s.listWorkloads)
+			r.With(auth.AdminOnly).Post("/workloads", s.createPluginWorkload)
+			r.Route("/workloads/{id}", func(r chi.Router) {
+				r.Get("/", s.getWorkload)
+				r.Get("/containers", s.listWorkloadContainers)
+				r.Get("/containers/{cid}/logs", s.streamWorkloadContainerLogs)
+				r.With(auth.AdminOnly).Patch("/app", s.updateWorkloadAppID)
+				r.With(auth.AdminOnly).Put("/plugin", s.updatePluginWorkload)
+				r.With(auth.AdminOnly).Post("/deploy", s.deployPluginWorkload)
+				r.With(auth.AdminOnly).Post("/stop", s.stopPluginWorkload)
+				r.With(auth.AdminOnly).Post("/start", s.startPluginWorkload)
+				r.With(auth.AdminOnly).Delete("/", s.deletePluginWorkload)
+
+				// Volume snapshots (admin-only). Capture/list a workload's
+				// host-bind data volumes; {sid}-scoped download/delete live
+				// in the global admin group alongside backups.
+				r.With(auth.AdminOnly).Get("/snapshots", s.listWorkloadSnapshots)
+				r.With(auth.AdminOnly).Get("/snapshotable", s.getWorkloadSnapshotable)
+				r.With(auth.AdminOnly).Post("/snapshots", s.createWorkloadSnapshot)
+
+				// Runtime view: per-source persisted state + storage usage.
+				// Read-only; safe for any authenticated user.
+				r.Get("/runtime-state", s.getWorkloadRuntimeState)
+				r.Get("/storage", s.getWorkloadStorage)
+
+				// Per-workload activity / deploy timeline (read-only). Scoped
+				// to this workload's event-log rows; the global feed lives at
+				// /events/log.
+				r.Get("/events", s.listWorkloadEvents)
+
+				// Per-workload env vars. Listing open to authenticated readers;
+				// mutations admin-gated. Encrypted values are write-only after store.
+				r.Get("/env", s.listWorkloadEnv)
+				r.With(auth.AdminOnly).Put("/env", s.setWorkloadEnv)
+				r.With(auth.AdminOnly).Delete("/env/{envID}", s.deleteWorkloadEnv)
+
+				// Per-workload inbound webhook URL handlers were dropped in
+				// the hard legacy cutover; inbound webhooks are now first-
+				// class Triggers reachable via /api/triggers/{id}/webhook.
+
+				// Per-workload volume mounts.
+				r.Get("/volumes", s.listWorkloadVolumes)
+				r.With(auth.AdminOnly).Put("/volumes", s.setWorkloadVolume)
+				r.With(auth.AdminOnly).Delete("/volumes/{volID}", s.deleteWorkloadVolume)
+
+				// Stages chain: parent + self + direct children, plus a
+				// promote-from action that copies the source workload's
+				// running image tag onto this workload's default_tag.
+				r.Get("/chain", s.getWorkloadChain)
+				r.With(auth.AdminOnly).Post("/promote-from/{sourceID}", s.promoteFromWorkload)
+
+				// Trigger bindings on this workload — the symmetric view
+				// of /triggers/{id}/bindings keyed on the workload side.
+				r.Get("/triggers", s.listBindingsForWorkload)
+				r.With(auth.AdminOnly).Post("/triggers", s.bindTriggerToWorkload)
+
+				// Per-workload notification routes — multi-destination
+				// fan-out (Slack channel + Discord webhook + ...). When
+				// zero rows are configured the dispatcher falls back to
+				// the legacy single-URL columns on the workload row.
+				r.Get("/notifications", s.listWorkloadNotifications)
+				r.With(auth.AdminOnly).Post("/notifications", s.createWorkloadNotification)
+				r.With(auth.AdminOnly).Put("/notifications/{nid}", s.updateWorkloadNotification)
+				r.With(auth.AdminOnly).Delete("/notifications/{nid}", s.deleteWorkloadNotification)
 			})

-			// Settings endpoints.
-			r.Get("/settings", s.getSettings)
-			r.Put("/settings", s.updateSettings)
-			r.Get("/settings/webhook-url", s.getWebhookURL)
-			r.Post("/settings/webhook-url/regenerate", s.regenerateWebhookSecret)
+			// Global container index, joined to workload + app names.
+			r.Get("/containers", s.listAllContainers)
+			r.Get("/containers/{id}", s.getContainer)
+
+			// App grouping (optional UI; admin-gated mutations).
+			r.Get("/apps", s.listApps)
+			r.Get("/apps/{id}", s.getApp)
+			r.Group(func(r chi.Router) {
+				r.Use(auth.AdminOnly)
+				r.Post("/apps", s.createApp)
+				r.Put("/apps/{id}", s.updateApp)
+				r.Delete("/apps/{id}", s.deleteApp)
+			})
+
+			// First-class Triggers (redeploy signal sources). One trigger
+			// fans out to many workloads via workload_trigger_bindings.
+			r.Get("/triggers", s.listTriggers)
+			r.Get("/triggers/{id}", s.getTrigger)
+			r.Get("/triggers/{id}/bindings", s.listBindingsForTrigger)
+			r.Group(func(r chi.Router) {
+				r.Use(auth.AdminOnly)
+				r.Post("/triggers", s.createTrigger)
+				r.Put("/triggers/{id}", s.updateTrigger)
+				r.Delete("/triggers/{id}", s.deleteTrigger)
+				r.Get("/triggers/{id}/webhook", s.getTriggerWebhook)
+				r.Post("/triggers/{id}/webhook/regenerate", s.regenerateTriggerWebhook)
+				r.Post("/triggers/{id}/fire", s.fireTriggerNow)
+				r.Post("/triggers/{id}/bindings", s.bindWorkloadToTrigger)
+				r.Put("/bindings/{bid}", s.updateBinding)
+				r.Delete("/bindings/{bid}", s.deleteBinding)
+			})
+
+			// Event triggers: filter+action rules over the event_log stream.
+			r.Get("/event-triggers", s.listEventTriggers)
+			r.Get("/event-triggers/{id}", s.getEventTrigger)
+			r.Group(func(r chi.Router) {
+				r.Use(auth.AdminOnly)
+				r.Post("/event-triggers", s.createEventTrigger)
+				r.Patch("/event-triggers/{id}", s.updateEventTrigger)
+				r.Delete("/event-triggers/{id}", s.deleteEventTrigger)
+				r.Post("/event-triggers/{id}/test", s.testEventTrigger)
+			})
+
+			// Log-scan rules.
+			r.Get("/log-scan-rules", s.listLogScanRules)
+			r.Get("/log-scan-rules/stats", s.getLogScanStats)
+			r.Get("/log-scan-rules/{id}", s.getLogScanRule)
+			r.Get("/workloads/{id}/effective-rules", s.getEffectiveLogScanRules)
+			r.Group(func(r chi.Router) {
+				r.Use(auth.AdminOnly)
+				r.Post("/log-scan-rules", s.createLogScanRule)
+				r.Patch("/log-scan-rules/{id}", s.updateLogScanRule)
+				r.Delete("/log-scan-rules/{id}", s.deleteLogScanRule)
+				r.Post("/log-scan-rules/{id}/test", s.testLogScanRule)
+			})
+
+			// Metric-alert rules.
+			r.Get("/metric-alert-rules", s.listMetricAlertRules)
+			r.Get("/metric-alert-rules/{id}", s.getMetricAlertRule)
+			r.Group(func(r chi.Router) {
+				r.Use(auth.AdminOnly)
+				r.Post("/metric-alert-rules", s.createMetricAlertRule)
+				r.Patch("/metric-alert-rules/{id}", s.updateMetricAlertRule)
+				r.Delete("/metric-alert-rules/{id}", s.deleteMetricAlertRule)
+			})
+
+			// Shared secrets (env vars shared across workloads by scope).
+			r.Get("/shared-secrets", s.listSharedSecrets)
+			r.Get("/shared-secrets/{id}", s.getSharedSecret)
+			r.Group(func(r chi.Router) {
+				r.Use(auth.AdminOnly)
+				r.Post("/shared-secrets", s.createSharedSecret)
+				r.Patch("/shared-secrets/{id}", s.updateSharedSecret)
+				r.Delete("/shared-secrets/{id}", s.deleteSharedSecret)
+			})
+
+			// System resources (read-only).
+			r.Get("/system/stats", s.getSystemStats)
+			r.Get("/system/stats/history", s.getSystemStatsHistory)
+			r.Get("/system/stats/top", s.listTopContainers)
+
+			// Admin-only routes: require admin role.
+			r.Group(func(r chi.Router) {
+				r.Use(auth.AdminOnly)
+
+				// Prometheus-format metrics export. Admin-only so the
+				// counter cardinality cannot be enumerated by a low-trust
+				// viewer to map internal endpoints / sources / outcomes.
+				// Scrape with bearer auth from your Prometheus job.
+				r.Get("/metrics", s.metricsExport)
+
+				// Config export (reveals registry/global details).
+				r.Get("/config/export", s.exportConfig)
+
+				// Event log management.
+				r.Delete("/events/log/{id}", s.deleteEvent)
+				r.Delete("/events/log", s.clearEvents)
+
+				// Auth management.
+				r.Get("/auth/settings", s.getAuthSettings)
+				r.Put("/auth/settings", s.updateAuthSettings)
+				r.Get("/auth/users", s.listUsers)
+				r.Post("/auth/users", s.createUser)
+				r.Put("/auth/users/{uid}", s.updateUser)
+				r.Put("/auth/users/{uid}/password", s.changePassword)
+				r.Delete("/auth/users/{uid}", s.deleteUser)
+
+				// Registry creation.
+				r.Post("/registries", s.createRegistry)
+
+				// Stale container cleanup endpoints.
+				// Bulk route must be registered before parameterized route.
+				r.Post("/containers/stale/cleanup", s.bulkCleanupStaleContainers)
+				r.Post("/containers/stale/{id}/cleanup", s.cleanupStaleContainer)
+
+				// Settings endpoints.
+				r.Put("/settings", s.updateSettings)
+
+				// Global outgoing-webhook signing & test.
+				r.Get("/settings/notification-secret", s.getSettingsNotificationSecret)
+				r.Post("/settings/notification-secret/regenerate", s.regenerateSettingsNotificationSecret)
+				r.Post("/settings/notification-secret/disable", s.disableSettingsNotificationSigning)
+				r.Post("/settings/notification-test", s.settingsNotificationTest)
+
+				// Docker management.
+				r.Post("/docker/prune-images", s.pruneImages)
+				r.Post("/docker/prune-build-cache", s.pruneBuildCache)
+
+				// NPM connection test.
+				r.Post("/settings/npm/test", s.testNpmConnection)
+
+				// DNS management endpoints.
+				r.Post("/settings/dns/test", s.testDNSConnection)
+				r.Post("/settings/dns/zones", s.listDNSZones)
+				r.Get("/dns/records", s.listDNSRecords)
+				r.Post("/dns/sync", s.syncDNSRecords)
+				r.Delete("/dns/records/{fqdn}", s.deleteDNSRecord)
+
+				// Backup endpoints.
+				r.Get("/backups", s.listBackups)
+				r.Post("/backups", s.triggerBackup)
+				r.Get("/backups/{id}/download", s.downloadBackup)
+				r.Delete("/backups/{id}", s.deleteBackup)
+				r.Post("/backups/{id}/restore", s.restoreBackup)
+
+				// Volume-snapshot download/delete (workload-scoped capture +
+				// list live under /workloads/{id}/snapshots).
+				r.Get("/snapshots/{sid}/download", s.downloadSnapshot)
+				r.Delete("/snapshots/{sid}", s.deleteSnapshot)
+			})
 		})
 	})

@@ -0,0 +1,43 @@
+package api
+
+import (
+	"strconv"
+
+	"github.com/alexei/tinyforge/internal/store"
+)
+
+// generateWebhookSecret returns a new secret by delegating directly to
+// store.GenerateWebhookSecret, so the api handlers and the store CRUD share
+// one secret-generation path and cannot diverge (panic vs. UUID fallback).
+func generateWebhookSecret() string { return store.GenerateWebhookSecret() }
+
+// webhookURLResponse is the common JSON payload returned by every webhook
+// endpoint. Raw secrets reach clients only through these fields, and only at
+// issue/rotate time. The URL is origin-less ("/api/webhook/..."), so callers
+// prepend their own origin to obtain an absolute URL.
+type webhookURLResponse struct {
+	WebhookURL              string `json:"webhook_url"`
+	WebhookSecret           string `json:"webhook_secret"`
+	HasSigningSecret        bool   `json:"has_signing_secret"`
+	WebhookRequireSignature bool   `json:"webhook_require_signature"`
+}
+
+// signingSecretResponse is the JSON body returned when a signing secret is issued or rotated; it carries only that secret.
+type signingSecretResponse struct {
+	SigningSecret string `json:"signing_secret"`
+}
+
+// parseLimit parses raw as a decimal limit: empty, non-numeric, or non-positive input yields def (returned as-is, not clamped); values above max are capped at max.
+func parseLimit(raw string, def, max int) int { // NOTE(review): the max parameter shadows the Go 1.21+ builtin inside this function
+	if raw == "" {
+		return def
+	}
+	n, err := strconv.Atoi(raw)
+	if err != nil || n <= 0 {
+		return def // malformed, zero, or negative input falls back to the default instead of reporting an error
+	}
+	if n > max {
+		return max
+	}
+	return n
+}
@@ -1,17 +1,26 @@
 package api

 import (
-	"fmt"
+	"context"
+	"log/slog"
 	"net/http"
+	"path/filepath"
+	"strings"

-	"github.com/alexei/docker-watcher/internal/crypto"
-	"github.com/alexei/docker-watcher/internal/webhook"
+	"github.com/alexei/tinyforge/internal/crypto"
+	"github.com/alexei/tinyforge/internal/dns"
+	"github.com/alexei/tinyforge/internal/docker"
+	"github.com/alexei/tinyforge/internal/npm"
+	"github.com/alexei/tinyforge/internal/proxy"
+	"github.com/alexei/tinyforge/internal/store"
+	"github.com/alexei/tinyforge/internal/volume"
 )

 // settingsRequest is the expected JSON body for updating settings.
 type settingsRequest struct {
 	Domain           string `json:"domain"`
 	ServerIP         string `json:"server_ip"`
+	PublicIP         string `json:"public_ip"`
 	Network          string `json:"network"`
 	SubdomainPattern string `json:"subdomain_pattern"`
 	NotificationURL  string `json:"notification_url"`
@@ -19,6 +28,27 @@ type settingsRequest struct {
 	NpmEmail         string `json:"npm_email"`
 	NpmPassword      string `json:"npm_password"`
 	PollingInterval  string `json:"polling_interval"`
+	SSLCertificateID   *int    `json:"ssl_certificate_id,omitempty"`
+	StaleThresholdDays *int    `json:"stale_threshold_days,omitempty"`
+	AllowedVolumePaths *string `json:"allowed_volume_paths,omitempty"`
+	WildcardDNS          *bool   `json:"wildcard_dns,omitempty"`
+	DNSProvider          *string `json:"dns_provider,omitempty"`
+	CloudflareAPIToken   string  `json:"cloudflare_api_token"`
+	CloudflareZoneID     *string `json:"cloudflare_zone_id,omitempty"`
+	NpmAccessListID      *int    `json:"npm_access_list_id,omitempty"`
+	ImagePruneThresholdMB *int   `json:"image_prune_threshold_mb,omitempty"`
+	NpmRemote            *bool   `json:"npm_remote,omitempty"`
+	ProxyProvider        *string `json:"proxy_provider,omitempty"`
+	TraefikEntrypoint    *string `json:"traefik_entrypoint,omitempty"`
+	TraefikCertResolver  *string `json:"traefik_cert_resolver,omitempty"`
+	TraefikNetwork       *string `json:"traefik_network,omitempty"`
+	TraefikAPIURL        *string `json:"traefik_api_url,omitempty"`
+	BackupEnabled          *bool `json:"backup_enabled,omitempty"`
+	BackupIntervalHours    *int  `json:"backup_interval_hours,omitempty"`
+	BackupRetentionCount   *int  `json:"backup_retention_count,omitempty"`
+	AutoBackupBeforeDeploy *bool `json:"auto_backup_before_deploy,omitempty"`
+	StatsIntervalSeconds   *int  `json:"stats_interval_seconds,omitempty"`
+	StatsRetentionHours    *int  `json:"stats_retention_hours,omitempty"`
 }

 // getSettings handles GET /api/settings.
@@ -31,16 +61,39 @@ func (s *Server) getSettings(w http.ResponseWriter, r *http.Request) {

 	// Return settings without sensitive fields.
 	respondJSON(w, http.StatusOK, map[string]any{
-		"domain":            settings.Domain,
-		"server_ip":         settings.ServerIP,
-		"network":           settings.Network,
-		"subdomain_pattern": settings.SubdomainPattern,
-		"notification_url":  settings.NotificationURL,
-		"npm_url":           settings.NpmURL,
-		"npm_email":         settings.NpmEmail,
-		"has_npm_password":  settings.NpmPassword != "",
-		"polling_interval":  settings.PollingInterval,
-		"updated_at":        settings.UpdatedAt,
+		"domain":                    settings.Domain,
+		"server_ip":                 settings.ServerIP,
+		"public_ip":                 settings.PublicIP,
+		"network":                   settings.Network,
+		"subdomain_pattern":         settings.SubdomainPattern,
+		"notification_url":          settings.NotificationURL,
+		"has_notification_secret":   settings.NotificationSecret != "",
+		"npm_url":                   settings.NpmURL,
+		"npm_email":                 settings.NpmEmail,
+		"has_npm_password":          settings.NpmPassword != "",
+		"npm_remote":               settings.NpmRemote,
+		"image_prune_threshold_mb": settings.ImagePruneThresholdMB,
+		"npm_access_list_id":       settings.NpmAccessListID,
+		"polling_interval":          settings.PollingInterval,
+		"ssl_certificate_id":        settings.SSLCertificateID,
+		"stale_threshold_days":      settings.StaleThresholdDays,
+		"allowed_volume_paths":      settings.AllowedVolumePaths,
+		"wildcard_dns":              settings.WildcardDNS,
+		"dns_provider":              settings.DNSProvider,
+		"has_cloudflare_api_token":  settings.CloudflareAPIToken != "",
+		"cloudflare_zone_id":        settings.CloudflareZoneID,
+		"proxy_provider":            settings.ProxyProvider,
+		"traefik_entrypoint":        settings.TraefikEntrypoint,
+		"traefik_cert_resolver":     settings.TraefikCertResolver,
+		"traefik_network":           settings.TraefikNetwork,
+		"traefik_api_url":           settings.TraefikAPIURL,
+		"backup_enabled":             settings.BackupEnabled,
+		"backup_interval_hours":      settings.BackupIntervalHours,
+		"backup_retention_count":     settings.BackupRetentionCount,
+		"auto_backup_before_deploy":  settings.AutoBackupBeforeDeploy,
+		"stats_interval_seconds":    settings.StatsIntervalSeconds,
+		"stats_retention_hours":     settings.StatsRetentionHours,
+		"updated_at":                settings.UpdatedAt,
 	})
 }

@@ -64,6 +117,9 @@ func (s *Server) updateSettings(w http.ResponseWriter, r *http.Request) {
 	if req.ServerIP != "" {
 		updated.ServerIP = req.ServerIP
 	}
+	if req.PublicIP != "" {
+		updated.PublicIP = req.PublicIP
+	}
 	if req.Network != "" {
 		updated.Network = req.Network
 	}
@@ -89,54 +145,609 @@ func (s *Server) updateSettings(w http.ResponseWriter, r *http.Request) {
 	if req.PollingInterval != "" {
 		updated.PollingInterval = req.PollingInterval
 	}
+	sslChanged := false
+	if req.SSLCertificateID != nil && *req.SSLCertificateID != updated.SSLCertificateID {
+		updated.SSLCertificateID = *req.SSLCertificateID
+		sslChanged = true
+	}
+	if req.StaleThresholdDays != nil {
+		if *req.StaleThresholdDays < 1 {
+			respondError(w, http.StatusBadRequest, "stale_threshold_days must be at least 1")
+			return
+		}
+		updated.StaleThresholdDays = *req.StaleThresholdDays
+	}
+	if req.AllowedVolumePaths != nil {
+		// Validate it's valid JSON array of strings.
+		paths, err := volume.ParseAllowedPaths(*req.AllowedVolumePaths)
+		if err != nil {
+			respondError(w, http.StatusBadRequest, "allowed_volume_paths must be a JSON array of strings")
+			return
+		}
+		// Validate each path is absolute.
+		for _, p := range paths {
+			if !filepath.IsAbs(p) {
+				respondError(w, http.StatusBadRequest, "each allowed volume path must be absolute")
+				return
+			}
+		}
+		updated.AllowedVolumePaths = *req.AllowedVolumePaths
+		_ = paths // validated
+	}
+
+	// DNS settings.
+	if req.WildcardDNS != nil {
+		updated.WildcardDNS = *req.WildcardDNS
+	}
+	if req.DNSProvider != nil {
+		updated.DNSProvider = *req.DNSProvider
+	}
+	if req.CloudflareAPIToken != "" {
+		encToken, err := crypto.Encrypt(s.encKey, req.CloudflareAPIToken)
+		if err != nil {
+			respondError(w, http.StatusInternalServerError, "failed to encrypt cloudflare api token: "+err.Error())
+			return
+		}
+		updated.CloudflareAPIToken = encToken
+	}
+	if req.CloudflareZoneID != nil {
+		updated.CloudflareZoneID = *req.CloudflareZoneID
+	}
+
+	// Proxy provider setting.
+	if req.ProxyProvider != nil {
+		prov := *req.ProxyProvider
+		if prov != "" && prov != "none" && prov != "npm" && prov != "traefik" {
+			respondError(w, http.StatusBadRequest, "proxy_provider must be 'none', 'npm', or 'traefik'")
+			return
+		}
+		updated.ProxyProvider = prov
+	}
+	if req.ImagePruneThresholdMB != nil {
+		updated.ImagePruneThresholdMB = *req.ImagePruneThresholdMB
+	}
+	if req.NpmRemote != nil {
+		updated.NpmRemote = *req.NpmRemote
+	}
+	if req.NpmAccessListID != nil {
+		updated.NpmAccessListID = *req.NpmAccessListID
+	}
+
+	// Traefik provider settings.
+	if req.TraefikEntrypoint != nil {
+		updated.TraefikEntrypoint = *req.TraefikEntrypoint
+	}
+	if req.TraefikCertResolver != nil {
+		updated.TraefikCertResolver = *req.TraefikCertResolver
+	}
+	if req.TraefikNetwork != nil {
+		updated.TraefikNetwork = *req.TraefikNetwork
+	}
+	if req.TraefikAPIURL != nil {
+		updated.TraefikAPIURL = *req.TraefikAPIURL
+	}
+
+	// Backup settings.
+	if req.BackupEnabled != nil {
+		updated.BackupEnabled = *req.BackupEnabled
+	}
+	if req.BackupIntervalHours != nil {
+		if *req.BackupIntervalHours < 1 {
+			respondError(w, http.StatusBadRequest, "backup_interval_hours must be at least 1")
+			return
+		}
+		updated.BackupIntervalHours = *req.BackupIntervalHours
+	}
+	if req.BackupRetentionCount != nil {
+		if *req.BackupRetentionCount < 1 {
+			respondError(w, http.StatusBadRequest, "backup_retention_count must be at least 1")
+			return
+		}
+		updated.BackupRetentionCount = *req.BackupRetentionCount
+	}
+	if req.AutoBackupBeforeDeploy != nil {
+		updated.AutoBackupBeforeDeploy = *req.AutoBackupBeforeDeploy
+	}
+	if req.StatsIntervalSeconds != nil {
+		v := *req.StatsIntervalSeconds
+		if v != 0 && (v < 5 || v > 300) {
+			respondError(w, http.StatusBadRequest, "stats_interval_seconds must be 0 (disabled) or between 5 and 300")
+			return
+		}
+		updated.StatsIntervalSeconds = v
+	}
+	if req.StatsRetentionHours != nil {
+		v := *req.StatsRetentionHours
+		if v < 0 || v > 24 {
+			respondError(w, http.StatusBadRequest, "stats_retention_hours must be between 0 and 24")
+			return
+		}
+		updated.StatsRetentionHours = v
+	}

 	if err := s.store.UpdateSettings(updated); err != nil {
 		respondError(w, http.StatusInternalServerError, "failed to update settings: "+err.Error())
 		return
 	}
+
+	// If proxy-affecting settings changed, re-sync all proxy routes in the background.
+	proxyChanged := existing.Domain != updated.Domain ||
+		existing.ProxyProvider != updated.ProxyProvider ||
+		existing.NpmRemote != updated.NpmRemote ||
+		existing.NpmAccessListID != updated.NpmAccessListID ||
+		sslChanged
+	if proxyChanged {
+		go s.resyncAllProxies(existing, updated)
+	}
+
+	// Handle DNS provider changes.
+	dnsChanged := existing.WildcardDNS != updated.WildcardDNS ||
+		existing.DNSProvider != updated.DNSProvider ||
+		existing.CloudflareZoneID != updated.CloudflareZoneID ||
+		(req.CloudflareAPIToken != "" && req.CloudflareAPIToken != "unchanged")
+	if dnsChanged {
+		oldProvider := s.getDNSProviderLocked()
+		go s.handleDNSSettingsChange(oldProvider, existing, updated)
+	}
+
+	// Handle backup settings changes.
+	backupChanged := existing.BackupEnabled != updated.BackupEnabled ||
+		existing.BackupIntervalHours != updated.BackupIntervalHours
+	if backupChanged && s.onBackupSettingsChanged != nil {
+		s.onBackupSettingsChanged(updated.BackupEnabled, updated.BackupIntervalHours)
+	}
+
 	respondJSON(w, http.StatusOK, map[string]string{"status": "updated"})
 }

-// getWebhookURL handles GET /api/settings/webhook-url.
-func (s *Server) getWebhookURL(w http.ResponseWriter, r *http.Request) {
+// listNpmCertificates handles GET /api/settings/npm-certificates.
+// It authenticates to NPM using the stored credentials and returns only wildcard certificates.
+func (s *Server) listNpmCertificates(w http.ResponseWriter, r *http.Request) {
 	settings, err := s.store.GetSettings()
 	if err != nil {
 		respondError(w, http.StatusInternalServerError, "failed to get settings: "+err.Error())
 		return
 	}

-	webhookURL := ""
-	if settings.WebhookSecret != "" && settings.Domain != "" {
-		webhookURL = fmt.Sprintf("https://%s/api/webhook/%s", settings.Domain, settings.WebhookSecret)
-	}
-
-	respondJSON(w, http.StatusOK, map[string]string{
-		"webhook_url": webhookURL,
-	})
-}
-
-// regenerateWebhookSecret handles POST /api/settings/regenerate.
-func (s *Server) regenerateWebhookSecret(w http.ResponseWriter, r *http.Request) {
-	secret, err := webhook.RegenerateWebhookSecret(s.store)
-	if err != nil {
-		respondError(w, http.StatusInternalServerError, "failed to regenerate webhook secret: "+err.Error())
+	if settings.NpmURL == "" || settings.NpmEmail == "" || settings.NpmPassword == "" {
+		respondError(w, http.StatusBadRequest, "NPM credentials not configured")
 		return
 	}

+	npmPassword, err := crypto.Decrypt(s.encKey, settings.NpmPassword)
+	if err != nil {
+		respondError(w, http.StatusInternalServerError, "failed to decrypt npm password: "+err.Error())
+		return
+	}
+
+	client := npm.New(settings.NpmURL)
+	if err := client.Authenticate(r.Context(), settings.NpmEmail, npmPassword); err != nil {
+		respondError(w, http.StatusBadGateway, "failed to authenticate to NPM: "+err.Error())
+		return
+	}
+
+	certs, err := client.ListCertificates(r.Context())
+	if err != nil {
+		respondError(w, http.StatusBadGateway, "failed to list certificates: "+err.Error())
+		return
+	}
+
+	// Filter to wildcard certificates only.
+	var wildcards []npm.Certificate
+	for _, cert := range certs {
+		if isWildcardCert(cert) {
+			wildcards = append(wildcards, cert)
+		}
+	}
+
+	if wildcards == nil {
+		wildcards = []npm.Certificate{}
+	}
+
+	respondJSON(w, http.StatusOK, wildcards)
+}
+
+// listNpmAccessLists handles GET /api/settings/npm-access-lists.
+// It authenticates to NPM using the stored credentials and returns all access lists.
+func (s *Server) listNpmAccessLists(w http.ResponseWriter, r *http.Request) {
 	settings, err := s.store.GetSettings()
 	if err != nil {
 		respondError(w, http.StatusInternalServerError, "failed to get settings: "+err.Error())
 		return
 	}

-	webhookURL := ""
-	if settings.Domain != "" {
-		webhookURL = fmt.Sprintf("https://%s/api/webhook/%s", settings.Domain, secret)
+	if settings.NpmURL == "" || settings.NpmEmail == "" || settings.NpmPassword == "" {
+		respondError(w, http.StatusBadRequest, "NPM credentials not configured")
+		return
 	}

-	respondJSON(w, http.StatusOK, map[string]string{
-		"webhook_url":    webhookURL,
-		"webhook_secret": secret,
+	npmPassword, err := crypto.Decrypt(s.encKey, settings.NpmPassword)
+	if err != nil {
+		respondError(w, http.StatusInternalServerError, "failed to decrypt npm password: "+err.Error())
+		return
+	}
+
+	client := npm.New(settings.NpmURL)
+	if err := client.Authenticate(r.Context(), settings.NpmEmail, npmPassword); err != nil {
+		respondError(w, http.StatusBadGateway, "failed to authenticate to NPM: "+err.Error())
+		return
+	}
+
+	lists, err := client.ListAccessLists(r.Context())
+	if err != nil {
+		respondError(w, http.StatusBadGateway, "failed to list access lists: "+err.Error())
+		return
+	}
+
+	if lists == nil {
+		lists = []npm.AccessList{}
+	}
+
+	respondJSON(w, http.StatusOK, lists)
+}
+
+// isWildcardCert reports whether any entry in cert.DomainNames contains a "*".
+func isWildcardCert(cert npm.Certificate) bool {
+	for _, name := range cert.DomainNames {
+		if strings.ContainsRune(name, '*') {
+			return true
+		}
+	}
+	return false
+}
+
+// createProxyProvider maps settings.ProxyProvider to a proxy.Provider. Anything but "traefik" or a
+// fully configured, decryptable "npm" setup yields the provider from proxy.NewNoneProvider.
+func (s *Server) createProxyProvider(settings store.Settings) proxy.Provider {
+	if settings.ProxyProvider == "traefik" {
+		return proxy.NewTraefikProvider(
+			settings.TraefikEntrypoint,
+			settings.TraefikCertResolver,
+			settings.TraefikNetwork,
+			settings.TraefikAPIURL,
+		)
+	}
+	if settings.ProxyProvider != "npm" {
+		return proxy.NewNoneProvider()
+	}
+	if settings.NpmURL == "" || settings.NpmEmail == "" || settings.NpmPassword == "" {
+		slog.Warn("proxy resync: NPM credentials incomplete, falling back to none")
+		return proxy.NewNoneProvider()
+	}
+	plainPassword, decErr := crypto.Decrypt(s.encKey, settings.NpmPassword)
+	if decErr != nil {
+		slog.Error("proxy resync: decrypt npm password", "error", decErr)
+		return proxy.NewNoneProvider()
+	}
+	return proxy.NewNpmProvider(npm.New(settings.NpmURL), settings.NpmEmail, plainPassword)
+}
+
+// resyncAllProxies re-configures or removes the proxy routes returned by
+// ListProxyRoutes(oldSettings.Domain) after proxy-affecting settings (domain,
+// SSL cert, proxy provider) change. Failures are logged per route, never returned.
+func (s *Server) resyncAllProxies(oldSettings, newSettings store.Settings) {
+	ctx := context.Background()
+
+	// Collect all proxy-enabled instances.
+	routes, err := s.store.ListProxyRoutes(oldSettings.Domain)
+	if err != nil {
+		slog.Error("proxy resync: list routes", "error", err)
+		return
+	}
+
+	if len(routes) == 0 {
+		slog.Info("proxy resync: no proxy routes to update")
+		return
+	}
+
+	providerChanged := oldSettings.ProxyProvider != newSettings.ProxyProvider
+	domainChanged := oldSettings.Domain != newSettings.Domain
+
+	// Step 1: If provider changed, delete old routes from the OLD provider, then switch.
+	if providerChanged {
+		slog.Info("proxy resync: provider changed", "old", oldSettings.ProxyProvider, "new", newSettings.ProxyProvider)
+		oldProvider := s.proxyProvider
+		for _, route := range routes {
+			if route.ProxyRouteID != "" {
+				if err := oldProvider.DeleteRoute(ctx, route.ProxyRouteID); err != nil {
+					slog.Warn("proxy resync: delete old route", "route_id", route.ProxyRouteID, "error", err)
+				}
+			}
+		}
+
+		// Create and install the new provider.
+		newProvider := s.createProxyProvider(newSettings)
+		s.SetProxyProvider(newProvider)
+		if s.onProxyProviderChanged != nil {
+			s.onProxyProviderChanged(newProvider)
+		}
+	}
+
+	// Step 2: If new provider is "none", clear all proxy route IDs and we're done.
+	if newSettings.ProxyProvider == "none" {
+		for _, route := range routes {
+			c, err := s.store.GetContainerByID(route.InstanceID)
+			if err != nil {
+				slog.Warn("proxy resync: get container", "container", route.InstanceID, "error", err)
+				continue
+			}
+			c.ProxyRouteID = ""
+			c.NpmProxyID = 0
+			if err := s.store.UpdateContainer(c); err != nil {
+				slog.Warn("proxy resync: clear route ID", "container", route.InstanceID, "error", err)
+			}
+		}
+		slog.Info("proxy resync: cleared all proxy routes (provider set to none)", "count", len(routes))
+		return
+	}
+
+	// Step 3: Re-create/update routes with the current provider and new settings.
+	updated := 0
+	for _, route := range routes {
+		if route.Subdomain == "" {
+			continue
+		}
+		fqdn := route.Subdomain + "." + newSettings.Domain
+		// Reconstruct the container name (Docker DNS name) from project/stage/tag.
+		containerName := docker.ContainerName(route.ProjectName, route.StageName, route.ImageTag)
+
+		routeID, err := s.proxyProvider.ConfigureRoute(ctx, fqdn, containerName, route.Port, proxy.RouteOptions{
+			SSLCertificateID: newSettings.SSLCertificateID,
+		})
+		if err != nil {
+			slog.Warn("proxy resync: configure route failed", "domain", fqdn, "instance", route.InstanceID, "error", err)
+			continue
+		}
+
+		// Persist the new route ID; a route only counts as updated once its row is saved.
+		c, err := s.store.GetContainerByID(route.InstanceID)
+		if err != nil {
+			slog.Warn("proxy resync: get container", "container", route.InstanceID, "route_id", routeID, "error", err)
+			continue
+		}
+		c.ProxyRouteID = routeID
+		if err := s.store.UpdateContainer(c); err != nil {
+			slog.Warn("proxy resync: update container", "container", route.InstanceID, "error", err)
+			continue
+		}
+		if domainChanged {
+			slog.Info("proxy resync: domain updated", "container", route.InstanceID, "domain", fqdn)
+		}
+		updated++
+	}
+
+	slog.Info("proxy resync: completed", "updated", updated, "total", len(routes))
+}
+
+// handleDNSSettingsChange applies a DNS settings change. If the OLD settings
+// used a managed provider, every tracked record is deleted from it and from the
+// store; then a provider for the NEW settings (nil in wildcard mode) is installed.
+func (s *Server) handleDNSSettingsChange(oldProvider dns.Provider, oldSettings, newSettings store.Settings) {
+	ctx := context.Background()
+
+	// Step 1: If there was an old provider, remove all managed DNS records from it (best effort, failures are logged).
+	if !oldSettings.WildcardDNS && oldSettings.DNSProvider != "" && oldProvider != nil {
+		records, err := s.store.ListDNSRecords()
+		if err != nil {
+			slog.Error("dns settings change: list records for cleanup", "error", err)
+		} else {
+			for _, rec := range records {
+				if err := oldProvider.DeleteRecord(ctx, rec.FQDN); err != nil {
+					slog.Warn("dns settings change: delete old record", "fqdn", rec.FQDN, "error", err)
+				}
+				if err := s.store.DeleteDNSRecord(rec.FQDN); err != nil {
+					slog.Warn("dns settings change: remove tracking record", "fqdn", rec.FQDN, "error", err)
+				}
+			}
+			slog.Info("dns settings change: cleaned up old records", "count", len(records))
+		}
+	}
+
+	// Step 2: Create new provider (or nil for wildcard mode). On failure we return with the previous provider still installed.
+	var newProvider dns.Provider
+	if !newSettings.WildcardDNS && newSettings.DNSProvider != "" {
+		token := newSettings.CloudflareAPIToken
+		if token != "" {
+			decrypted, err := crypto.Decrypt(s.encKey, token)
+			if err != nil {
+				slog.Error("dns settings change: decrypt token", "error", err)
+				return
+			}
+			token = decrypted
+		}
+
+		provider, err := dns.NewProvider(newSettings.DNSProvider, dns.Config{
+			Token:  token,
+			ZoneID: newSettings.CloudflareZoneID,
+		})
+		if err != nil {
+			slog.Error("dns settings change: create provider", "error", err)
+			return
+		}
+		newProvider = provider
+	}
+
+	// Step 3: Update the server's DNS provider and notify dependents via onDNSProviderChanged (when set).
+	s.SetDNSProvider(newProvider)
+	if s.onDNSProviderChanged != nil {
+		s.onDNSProviderChanged(newProvider)
+	}
+
+	slog.Info("dns settings change: provider updated",
+		"wildcard", newSettings.WildcardDNS,
+		"provider", newSettings.DNSProvider)
+}
+
+// dnsTestRequest is the expected JSON body for testing DNS provider credentials.
+type dnsTestRequest struct {
+	Provider string `json:"provider"` // testDNSConnection accepts only "cloudflare"
+	Token    string `json:"token"`    // optional; when empty the stored Cloudflare token is used
+	ZoneID   string `json:"zone_id"`  // passed to dns.NewCloudflare together with the token
+}
+
+// testNpmConnection handles POST /api/settings/npm/test. It pings NPM and then authenticates,
+// using the request's URL/email/password and falling back per field to the stored settings.
+func (s *Server) testNpmConnection(w http.ResponseWriter, r *http.Request) {
+	var req struct {
+		URL      string `json:"npm_url"`
+		Email    string `json:"npm_email"`
+		Password string `json:"npm_password"`
+	}
+	if !decodeJSON(w, r, &req) {
+		return
+	}
+
+	// Use provided values, fall back to stored settings.
+	settings, err := s.store.GetSettings()
+	if err != nil {
+		slog.Error("failed to get settings", "error", err)
+		respondError(w, http.StatusInternalServerError, "internal server error")
+		return
+	}
+
+	npmURL := req.URL
+	if npmURL == "" {
+		npmURL = settings.NpmURL
+	}
+	if npmURL == "" {
+		respondError(w, http.StatusBadRequest, "NPM URL is required")
+		return
+	}
+
+	email := req.Email
+	if email == "" {
+		email = settings.NpmEmail
+	}
+
+	password := req.Password
+	if password == "" && settings.NpmPassword != "" {
+		decrypted, err := crypto.Decrypt(s.encKey, settings.NpmPassword)
+		if err != nil {
+			// The stored secret is unreadable: a server-side fault, so log it and answer 500 rather than 400.
+			slog.Error("npm test: decrypt stored password", "error", err)
+			respondError(w, http.StatusInternalServerError, "failed to decrypt stored NPM password")
+			return
+		}
+		password = decrypted
+	}
+
+	if email == "" || password == "" {
+		respondError(w, http.StatusBadRequest, "NPM email and password are required")
+		return
+	}
+
+	client := npm.New(npmURL)
+	ctx := r.Context()
+	if err := client.Ping(ctx); err != nil {
+		slog.Warn("npm test: ping failed", "url", npmURL, "error", err)
+		respondError(w, http.StatusBadGateway, "Cannot reach NPM at "+npmURL)
+		return
+	}
+
+	// Test authentication.
+	if err := client.Authenticate(ctx, email, password); err != nil {
+		slog.Warn("npm test: auth failed", "url", npmURL, "error", err)
+		respondError(w, http.StatusBadGateway, "NPM authentication failed — check email and password")
+		return
+	}
+
+	respondJSON(w, http.StatusOK, map[string]string{"status": "connected"})
+}
+
+// testDNSConnection handles POST /api/settings/dns/test. A failing provider check is answered with 200 and {"success": false}.
+func (s *Server) testDNSConnection(w http.ResponseWriter, r *http.Request) {
+	var req dnsTestRequest
+	if !decodeJSON(w, r, &req) {
+		return
+	}
+
+	if req.Provider != "cloudflare" {
+		respondError(w, http.StatusBadRequest, "unsupported DNS provider: "+req.Provider)
+		return
+	}
+
+	token := req.Token
+	// No token in the request: fall back to the stored (encrypted) Cloudflare token.
+	if token == "" {
+		settings, err := s.store.GetSettings()
+		if err != nil {
+			respondError(w, http.StatusInternalServerError, "failed to get settings: "+err.Error())
+			return
+		}
+		if settings.CloudflareAPIToken == "" {
+			respondError(w, http.StatusBadRequest, "no Cloudflare API token configured")
+			return
+		}
+		decrypted, err := crypto.Decrypt(s.encKey, settings.CloudflareAPIToken)
+		if err != nil {
+			respondError(w, http.StatusInternalServerError, "failed to decrypt token: "+err.Error())
+			return
+		}
+		token = decrypted
+	}
+
+	provider, err := dns.NewCloudflare(token, req.ZoneID)
+	if err != nil {
+		respondError(w, http.StatusBadRequest, "invalid configuration: "+err.Error())
+		return
+	}
+
+	if err := provider.TestConnection(r.Context()); err != nil {
+		respondJSON(w, http.StatusOK, map[string]any{
+			"success": false,
+			"error":   err.Error(),
+		})
+		return
+	}
+
+	respondJSON(w, http.StatusOK, map[string]any{
+		"success": true,
 	})
 }

+// dnsZonesRequest is the expected JSON body for listing DNS zones.
+type dnsZonesRequest struct {
+	Token string `json:"token"` // optional; when empty listDNSZones uses the stored Cloudflare token
+}
+
+// listDNSZones handles POST /api/settings/dns/zones. The body may carry a
+// token; without one the stored Cloudflare token is decrypted and used.
+func (s *Server) listDNSZones(w http.ResponseWriter, r *http.Request) {
+	var body dnsZonesRequest
+	if ok := decodeJSON(w, r, &body); !ok {
+		return
+	}
+	apiToken := body.Token
+	if apiToken == "" {
+		stored, getErr := s.store.GetSettings()
+		if getErr != nil {
+			respondError(w, http.StatusInternalServerError, "failed to get settings: "+getErr.Error())
+			return
+		}
+		if stored.CloudflareAPIToken == "" {
+			respondError(w, http.StatusBadRequest, "no Cloudflare API token configured")
+			return
+		}
+		plain, decErr := crypto.Decrypt(s.encKey, stored.CloudflareAPIToken)
+		if decErr != nil {
+			respondError(w, http.StatusInternalServerError, "failed to decrypt token: "+decErr.Error())
+			return
+		}
+		apiToken = plain
+	}
+
+	cf, cfErr := dns.NewCloudflare(apiToken, "")
+	if cfErr != nil {
+		respondError(w, http.StatusBadRequest, "invalid configuration: "+cfErr.Error())
+		return
+	}
+
+	zoneList, listErr := cf.ListZones(r.Context())
+	if listErr != nil {
+		respondError(w, http.StatusBadGateway, "failed to list zones: "+listErr.Error())
+		return
+	}
+
+	respondJSON(w, http.StatusOK, zoneList)
+}
+
@@ -0,0 +1,272 @@
+package api
+
+import (
+	"errors"
+	"log/slog"
+	"net/http"
+	"strings"
+
+	"github.com/go-chi/chi/v5"
+
+	"github.com/alexei/tinyforge/internal/crypto"
+	"github.com/alexei/tinyforge/internal/store"
+)
+
+// sharedSecretRow is the JSON shape returned to clients. The secret value is
+// NEVER returned — once stored it is write-only (mirroring workload_env). The
+// has_value flag lets the UI show whether a value is set without exposing it;
+// to rotate, the operator submits a new value.
+type sharedSecretRow struct {
+	ID          string `json:"id"`
+	Name        string `json:"name"`
+	HasValue    bool   `json:"has_value"` // true when the stored value is non-empty
+	Encrypted   bool   `json:"encrypted"`
+	Scope       string `json:"scope"`
+	AppID       string `json:"app_id"`
+	Description string `json:"description"`
+	Enabled     bool   `json:"enabled"`
+	CreatedAt   string `json:"created_at"`
+	UpdatedAt   string `json:"updated_at"`
+}
+
+// toSharedSecretRow converts a stored secret into its client-facing row. The
+// value itself is left out; HasValue only records whether one is stored.
+func toSharedSecretRow(sec store.SharedSecret) sharedSecretRow {
+	row := sharedSecretRow{ID: sec.ID, Name: sec.Name}
+	row.HasValue = sec.Value != ""
+	row.Encrypted = sec.Encrypted
+	row.Scope = sec.Scope
+	row.AppID = sec.AppID
+	row.Description = sec.Description
+	row.Enabled = sec.Enabled
+	row.CreatedAt = sec.CreatedAt
+	row.UpdatedAt = sec.UpdatedAt
+	return row
+}
+
+// listSharedSecrets handles GET /api/shared-secrets; each row passes through toSharedSecretRow, so no value is returned.
+func (s *Server) listSharedSecrets(w http.ResponseWriter, r *http.Request) {
+	secrets, err := s.store.ListSharedSecrets()
+	if err != nil {
+		respondError(w, http.StatusInternalServerError, "list shared secrets")
+		return
+	}
+	redacted := make([]sharedSecretRow, len(secrets))
+	for i, sec := range secrets {
+		redacted[i] = toSharedSecretRow(sec)
+	}
+	respondJSON(w, http.StatusOK, redacted)
+}
+
+// getSharedSecret handles GET /api/shared-secrets/{id}; the row is returned without its value.
+func (s *Server) getSharedSecret(w http.ResponseWriter, r *http.Request) {
+	secretID := chi.URLParam(r, "id")
+	sec, err := s.store.GetSharedSecret(secretID)
+	switch {
+	case err == nil:
+		respondJSON(w, http.StatusOK, toSharedSecretRow(sec))
+	case errors.Is(err, store.ErrNotFound):
+		respondNotFound(w, "shared secret")
+	default:
+		respondError(w, http.StatusInternalServerError, "get shared secret")
+	}
+}
+
+// createSharedSecretRequest is the POST body. Encrypted and Enabled are
+// pointers so an omitted field can default to true; with Encrypted set, a
+// non-empty value is encrypted with the global key before it reaches the store.
+type createSharedSecretRequest struct {
+	Name        string `json:"name"`
+	Value       string `json:"value"`
+	Encrypted   *bool  `json:"encrypted"` // defaults true
+	Scope       string `json:"scope"`     // global | app
+	AppID       string `json:"app_id"`    // required when scope == app
+	Description string `json:"description"`
+	Enabled     *bool  `json:"enabled"` // defaults true
+}
+
+// createSharedSecret creates a shared secret from a createSharedSecretRequest
+// body: it validates name and scope, encrypts the value when requested, and
+// responds 201 with the stored row (409 when scope and name already exist).
+func (s *Server) createSharedSecret(w http.ResponseWriter, r *http.Request) {
+	var body createSharedSecretRequest
+	if ok := decodeJSONStrict(w, r, &body); !ok {
+		return
+	}
+
+	body.Name = strings.TrimSpace(body.Name)
+	if !validEnvKey(body.Name) {
+		respondError(w, http.StatusBadRequest, "name must be a valid env key [A-Za-z_][A-Za-z0-9_]*")
+		return
+	}
+	if problem := validateSharedSecretScope(body.Scope, body.AppID); problem != "" {
+		respondError(w, http.StatusBadRequest, problem)
+		return
+	}
+
+	// Both flags are optional in the body and default to true when omitted.
+	encrypt := body.Encrypted == nil || *body.Encrypted
+	enable := body.Enabled == nil || *body.Enabled
+
+	stored, err := s.encryptSecretValue(body.Value, encrypt)
+	if err != nil {
+		respondError(w, http.StatusInternalServerError, "encrypt value")
+		return
+	}
+
+	draft := store.SharedSecret{
+		Name:        body.Name,
+		Value:       stored,
+		Encrypted:   encrypt,
+		Scope:       body.Scope,
+		AppID:       strings.TrimSpace(body.AppID),
+		Description: body.Description,
+		Enabled:     enable,
+	}
+
+	created, err := s.store.CreateSharedSecret(draft)
+	switch {
+	case err == nil:
+		respondJSON(w, http.StatusCreated, toSharedSecretRow(created))
+	case errors.Is(err, store.ErrUnique):
+		respondError(w, http.StatusConflict, "a shared secret with this scope and name already exists")
+	default:
+		respondError(w, http.StatusInternalServerError, "create shared secret")
+	}
+}
+
+// updateSharedSecretRequest is the PATCH body. Every field is optional; nil
+// means "leave unchanged". A nil Value preserves the stored ciphertext (so a
+// metadata-only edit can't accidentally blank a secret); a non-nil Value
+// rotates it (re-encrypted under the effective Encrypted flag).
+type updateSharedSecretRequest struct {
+	Name        *string `json:"name"`
+	Value       *string `json:"value"`
+	Encrypted   *bool   `json:"encrypted"` // changing it without also sending value is rejected with 400
+	Scope       *string `json:"scope"`
+	AppID       *string `json:"app_id"`
+	Description *string `json:"description"`
+	Enabled     *bool   `json:"enabled"`
+}
+
+// updateSharedSecret applies an updateSharedSecretRequest to the secret named by {id}; nil fields keep the stored values.
+func (s *Server) updateSharedSecret(w http.ResponseWriter, r *http.Request) {
+	id := chi.URLParam(r, "id")
+	existing, err := s.store.GetSharedSecret(id)
+	if err != nil {
+		if errors.Is(err, store.ErrNotFound) {
+			respondNotFound(w, "shared secret")
+			return
+		}
+		respondError(w, http.StatusInternalServerError, "get shared secret")
+		return
+	}
+
+	var req updateSharedSecretRequest
+	if !decodeJSONStrict(w, r, &req) {
+		return
+	}
+
+	merged := existing
+	if req.Name != nil {
+		merged.Name = strings.TrimSpace(*req.Name)
+		if !validEnvKey(merged.Name) {
+			respondError(w, http.StatusBadRequest, "name must be a valid env key [A-Za-z_][A-Za-z0-9_]*")
+			return
+		}
+	}
+	if req.Encrypted != nil {
+		merged.Encrypted = *req.Encrypted
+	}
+	if req.Scope != nil {
+		merged.Scope = *req.Scope
+	}
+	if req.AppID != nil {
+		merged.AppID = strings.TrimSpace(*req.AppID)
+	}
+	if req.Description != nil {
+		merged.Description = *req.Description
+	}
+	if req.Enabled != nil {
+		merged.Enabled = *req.Enabled
+	}
+	if msg := validateSharedSecretScope(merged.Scope, merged.AppID); msg != "" {
+		respondError(w, http.StatusBadRequest, msg)
+		return
+	}
+
+	// Value handling: (re)encrypt only when the caller supplied a new value;
+	// otherwise the stored bytes stay untouched. Flipping Encrypted without a
+	// new value is rejected because the opaque stored bytes cannot be transcoded.
+	if req.Value != nil {
+		v, encErr := s.encryptSecretValue(*req.Value, merged.Encrypted)
+		if encErr != nil {
+			respondError(w, http.StatusInternalServerError, "encrypt value")
+			return
+		}
+		merged.Value = v
+	} else if req.Encrypted != nil && *req.Encrypted != existing.Encrypted {
+		respondError(w, http.StatusBadRequest, "changing 'encrypted' requires resubmitting 'value'")
+		return
+	}
+
+	sec, err := s.store.UpdateSharedSecret(merged)
+	if err != nil {
+		if errors.Is(err, store.ErrNotFound) {
+			respondNotFound(w, "shared secret")
+			return
+		}
+		if errors.Is(err, store.ErrUnique) {
+			respondError(w, http.StatusConflict, "a shared secret with this scope and name already exists")
+			return
+		}
+		respondError(w, http.StatusInternalServerError, "update shared secret")
+		return
+	}
+	respondJSON(w, http.StatusOK, toSharedSecretRow(sec))
+}
+
+// deleteSharedSecret removes the shared secret named by the {id} URL parameter and echoes the deleted id.
+func (s *Server) deleteSharedSecret(w http.ResponseWriter, r *http.Request) {
+	id := chi.URLParam(r, "id")
+	switch err := s.store.DeleteSharedSecret(id); {
+	case err == nil:
+		respondJSON(w, http.StatusOK, map[string]string{"deleted": id})
+	case errors.Is(err, store.ErrNotFound):
+		respondNotFound(w, "shared secret")
+	default:
+		respondError(w, http.StatusInternalServerError, "delete shared secret")
+	}
+}
+
+// encryptSecretValue prepares value for storage. It is returned unchanged when
+// encrypted is false or the value is empty (an empty value stays empty whatever
+// the flag says); otherwise it is encrypted with s.encKey via crypto.Encrypt.
+func (s *Server) encryptSecretValue(value string, encrypted bool) (string, error) {
+	if encrypted && value != "" {
+		sealed, encErr := crypto.Encrypt(s.encKey, value)
+		if encErr == nil {
+			return sealed, nil
+		}
+		slog.Error("encrypt shared secret value", "error", encErr)
+		return "", encErr
+	}
+	return value, nil
+}
+
+// validateSharedSecretScope checks the scope / app_id pairing up front, so the
+// API can answer 400 with a clear message before the store is involved. It
+// returns "" for a valid pairing and the error message otherwise.
+func validateSharedSecretScope(scope, appID string) string {
+	if scope == store.SharedSecretScopeGlobal {
+		return ""
+	}
+	if scope != store.SharedSecretScopeApp {
+		return "scope must be 'global' or 'app'"
+	}
+	// App scope: a non-blank app_id must accompany it.
+	if strings.TrimSpace(appID) == "" {
+		return "app_id is required when scope is 'app'"
+	}
+	return ""
+}
@@ -2,56 +2,23 @@ package api

 import (
 	"encoding/json"
-	"errors"
 	"fmt"
 	"log/slog"
 	"net/http"
-	"strings"
+	"time"

-	"github.com/go-chi/chi/v5"
-
-	"github.com/alexei/docker-watcher/internal/events"
-	"github.com/alexei/docker-watcher/internal/store"
+	"github.com/alexei/tinyforge/internal/events"
 )

-// streamDeployLogs handles GET /api/deploys/{id}/logs.
-// It supports both SSE streaming and JSON fallback based on the Accept header.
-//
-// SSE mode (Accept: text/event-stream):
-//
-//	Streams deploy log events in real-time. Existing logs are sent first,
-//	then new logs are pushed as they arrive via the event bus.
-//
-// JSON mode (default):
-//
-//	Returns all existing deploy logs as a JSON array.
-func (s *Server) streamDeployLogs(w http.ResponseWriter, r *http.Request) {
-	deployID := chi.URLParam(r, "id")
-
-	// Verify deploy exists.
-	deploy, err := s.store.GetDeployByID(deployID)
-	if err != nil {
-		if errors.Is(err, store.ErrNotFound) {
-			respondNotFound(w, "deploy")
-			return
-		}
-		respondError(w, http.StatusInternalServerError, "failed to get deploy: "+err.Error())
+// streamEvents handles GET /api/events.
+// It streams instance status changes and deploy status changes via SSE.
+func (s *Server) streamEvents(w http.ResponseWriter, r *http.Request) {
+	release, ok := acquireSSESlot(w, s.sseGate)
+	if !ok {
 		return
 	}
+	defer release()

-	// JSON fallback: return existing logs as array.
-	accept := r.Header.Get("Accept")
-	if !strings.Contains(accept, "text/event-stream") {
-		logs, err := s.store.GetDeployLogs(deployID)
-		if err != nil {
-			respondError(w, http.StatusInternalServerError, "failed to get deploy logs: "+err.Error())
-			return
-		}
-		respondJSON(w, http.StatusOK, logs)
-		return
-	}
-
-	// SSE mode.
 	flusher, ok := w.(http.Flusher)
 	if !ok {
 		respondError(w, http.StatusInternalServerError, "streaming not supported")
@@ -65,96 +32,32 @@ func (s *Server) streamDeployLogs(w http.ResponseWriter, r *http.Request) {
 	w.WriteHeader(http.StatusOK)
 	flusher.Flush()

-	// Send existing logs first.
-	existingLogs, err := s.store.GetDeployLogs(deployID)
-	if err != nil {
-		slog.Error("get existing deploy logs", "error", err)
-	} else {
-		for _, entry := range existingLogs {
-			writeSSE(w, flusher, events.Event{
-				Type: events.EventDeployLog,
-				Payload: events.DeployLogPayload{
-					DeployID: deployID,
-					Message:  entry.Message,
-					Level:    entry.Level,
-				},
-			})
-		}
-	}
-
-	// If deploy is already finished, send completion and close.
-	if isTerminalStatus(deploy.Status) {
-		writeSSE(w, flusher, events.Event{
-			Type: events.EventDeployStatus,
-			Payload: events.DeployStatusPayload{
-				DeployID:  deployID,
-				ProjectID: deploy.ProjectID,
-				StageID:   deploy.StageID,
-				ImageTag:  deploy.ImageTag,
-				Status:    deploy.Status,
-				Error:     deploy.Error,
-			},
-		})
-		return
-	}
-
-	// Subscribe to new deploy log events for this deploy.
+	// Build logs are high-volume: a single verbose `docker build` can emit
+	// thousands of lines. Streaming them to EVERY connection would flood each
+	// subscriber's bounded bus buffer and evict status/log events for ALL
+	// clients. So build logs are delivered ONLY to connections that opt in
+	// with ?workload_id=<id>, and only for that workload. Connections without
+	// the param (e.g. the global dashboard) never receive build-log frames.
+	buildLogWorkloadID := r.URL.Query().Get("workload_id")
 	sub := s.eventBus.Subscribe(func(evt events.Event) bool {
-		switch payload := evt.Payload.(type) {
-		case events.DeployLogPayload:
-			return payload.DeployID == deployID
-		case events.DeployStatusPayload:
-			return payload.DeployID == deployID
+		switch evt.Type {
+		case events.EventInstanceStatus, events.EventDeployStatus, events.EventLog:
+			return true
+		case events.EventBuildLog:
+			if buildLogWorkloadID == "" {
+				return false
+			}
+			p, ok := evt.Payload.(events.BuildLogPayload)
+			return ok && p.WorkloadID == buildLogWorkloadID
 		default:
 			return false
 		}
 	})
 	defer s.eventBus.Unsubscribe(sub)

-	ctx := r.Context()
-	for {
-		select {
-		case <-ctx.Done():
-			return
-		case evt, ok := <-sub:
-			if !ok {
-				return
-			}
-			writeSSE(w, flusher, evt)
-
-			// Close stream when deploy reaches terminal status.
-			if evt.Type == events.EventDeployStatus {
-				if payload, ok := evt.Payload.(events.DeployStatusPayload); ok {
-					if isTerminalStatus(payload.Status) {
-						return
-					}
-				}
-			}
-		}
-	}
-}
-
-// streamEvents handles GET /api/events.
-// It streams instance status changes and deploy status changes via SSE.
-func (s *Server) streamEvents(w http.ResponseWriter, r *http.Request) {
-	flusher, ok := w.(http.Flusher)
-	if !ok {
-		respondError(w, http.StatusInternalServerError, "streaming not supported")
-		return
-	}
-
-	w.Header().Set("Content-Type", "text/event-stream")
-	w.Header().Set("Cache-Control", "no-cache")
-	w.Header().Set("Connection", "keep-alive")
-	w.Header().Set("X-Accel-Buffering", "no")
-	w.WriteHeader(http.StatusOK)
-	flusher.Flush()
-
-	// Subscribe to instance status and deploy status events.
-	sub := s.eventBus.Subscribe(func(evt events.Event) bool {
-		return evt.Type == events.EventInstanceStatus || evt.Type == events.EventDeployStatus
-	})
-	defer s.eventBus.Unsubscribe(sub)
+	// Periodic heartbeat so the browser detects dead connections.
+	heartbeat := time.NewTicker(30 * time.Second)
+	defer heartbeat.Stop()

 	ctx := r.Context()
 	for {
@@ -166,6 +69,10 @@ func (s *Server) streamEvents(w http.ResponseWriter, r *http.Request) {
 				return
 			}
 			writeSSE(w, flusher, evt)
+		case <-heartbeat.C:
+			// SSE comment line — keeps the connection alive without triggering onmessage.
+			fmt.Fprintf(w, ": heartbeat\n\n")
+			flusher.Flush()
 		}
 	}
 }
@@ -180,13 +87,3 @@ func writeSSE(w http.ResponseWriter, flusher http.Flusher, evt events.Event) {
 	fmt.Fprintf(w, "data: %s\n\n", data)
 	flusher.Flush()
 }
-
-// isTerminalStatus returns true if the deploy status is final.
-func isTerminalStatus(status string) bool {
-	switch status {
-	case "success", "failed", "rolled_back":
-		return true
-	default:
-		return false
-	}
-}
@@ -0,0 +1,40 @@
+package api
+
+import (
+	"net/http"
+	"sync/atomic"
+)
+
+// maxConcurrentSSEStreams caps the global number of in-flight SSE
+// connections. Each stream holds a goroutine, an event-bus subscription, and
+// (for log streams) a Docker daemon TCP socket; a single tab opening
+// thousands of EventSources would otherwise exhaust file descriptors.
+const maxConcurrentSSEStreams = 256
+
+// sseGate is a counting gate that limits concurrent SSE streams; newSSEGate sets its capacity.
+type sseGate struct {
+	cap int64        // ceiling that enter compares the in-flight count against
+	cur atomic.Int64 // in-flight count: incremented by enter, decremented by the release func
+}
+
+func newSSEGate(cap int) *sseGate { return &sseGate{cap: int64(cap)} }
+
+// enter optimistically claims a slot; it returns the func that frees it, or nil (after rolling back) when the gate is full.
+func (g *sseGate) enter() func() {
+	if inFlight := g.cur.Add(1); inFlight <= g.cap {
+		return func() { g.cur.Add(-1) }
+	}
+	g.cur.Add(-1)
+	return nil
+}
+
+// acquireSSESlot reserves a stream slot on gate for an SSE handler. When the
+// gate is full it writes a 503 to w and returns (nil, false).
+func acquireSSESlot(w http.ResponseWriter, gate *sseGate) (release func(), ok bool) {
+	if release = gate.enter(); release != nil {
+		return release, true
+	}
+	// Gate is full: reject the request instead of opening another stream.
+	respondError(w, http.StatusServiceUnavailable, "stream limit reached")
+	return nil, false
+}
@@ -1,176 +0,0 @@
-package api
-
-import (
-	"errors"
-	"net/http"
-
-	"github.com/go-chi/chi/v5"
-
-	"github.com/alexei/docker-watcher/internal/crypto"
-	"github.com/alexei/docker-watcher/internal/store"
-)
-
-// stageEnvRequest is the expected JSON body for creating/updating a stage env override.
-type stageEnvRequest struct {
-	Key       string `json:"key"`
-	Value     string `json:"value"`
-	Encrypted *bool  `json:"encrypted"`
-}
-
-// listStageEnv handles GET /api/projects/{id}/stages/{stage}/env.
-func (s *Server) listStageEnv(w http.ResponseWriter, r *http.Request) {
-	stageID := chi.URLParam(r, "stage")
-
-	// Verify stage exists.
-	if _, err := s.store.GetStageByID(stageID); err != nil {
-		if errors.Is(err, store.ErrNotFound) {
-			respondNotFound(w, "stage")
-			return
-		}
-		respondError(w, http.StatusInternalServerError, "failed to get stage: "+err.Error())
-		return
-	}
-
-	envs, err := s.store.GetStageEnvByStageID(stageID)
-	if err != nil {
-		respondError(w, http.StatusInternalServerError, "failed to list stage env: "+err.Error())
-		return
-	}
-
-	// Mask encrypted values in the response.
-	masked := make([]store.StageEnv, len(envs))
-	for i, env := range envs {
-		masked[i] = env
-		if env.Encrypted {
-			masked[i].Value = "••••••••"
-		}
-	}
-
-	respondJSON(w, http.StatusOK, masked)
-}
-
-// createStageEnv handles POST /api/projects/{id}/stages/{stage}/env.
-func (s *Server) createStageEnv(w http.ResponseWriter, r *http.Request) {
-	stageID := chi.URLParam(r, "stage")
-
-	// Verify stage exists.
-	if _, err := s.store.GetStageByID(stageID); err != nil {
-		if errors.Is(err, store.ErrNotFound) {
-			respondNotFound(w, "stage")
-			return
-		}
-		respondError(w, http.StatusInternalServerError, "failed to get stage: "+err.Error())
-		return
-	}
-
-	var req stageEnvRequest
-	if !decodeJSON(w, r, &req) {
-		return
-	}
-
-	if req.Key == "" {
-		respondError(w, http.StatusBadRequest, "key is required")
-		return
-	}
-
-	encrypted := false
-	if req.Encrypted != nil {
-		encrypted = *req.Encrypted
-	}
-
-	value := req.Value
-	if encrypted && value != "" {
-		enc, err := crypto.Encrypt(s.encKey, value)
-		if err != nil {
-			respondError(w, http.StatusInternalServerError, "failed to encrypt value: "+err.Error())
-			return
-		}
-		value = enc
-	}
-
-	env, err := s.store.CreateStageEnv(store.StageEnv{
-		StageID:   stageID,
-		Key:       req.Key,
-		Value:     value,
-		Encrypted: encrypted,
-	})
-	if err != nil {
-		respondError(w, http.StatusInternalServerError, "failed to create stage env: "+err.Error())
-		return
-	}
-
-	// Mask encrypted value in the response.
-	if env.Encrypted {
-		env.Value = "••••••••"
-	}
-
-	respondJSON(w, http.StatusCreated, env)
-}
-
-// updateStageEnv handles PUT /api/projects/{id}/stages/{stage}/env/{envId}.
-func (s *Server) updateStageEnv(w http.ResponseWriter, r *http.Request) {
-	envID := chi.URLParam(r, "envId")
-
-	existing, err := s.store.GetStageEnvByID(envID)
-	if err != nil {
-		if errors.Is(err, store.ErrNotFound) {
-			respondNotFound(w, "stage env")
-			return
-		}
-		respondError(w, http.StatusInternalServerError, "failed to get stage env: "+err.Error())
-		return
-	}
-
-	var req stageEnvRequest
-	if !decodeJSON(w, r, &req) {
-		return
-	}
-
-	updated := existing
-	if req.Key != "" {
-		updated.Key = req.Key
-	}
-	if req.Encrypted != nil {
-		updated.Encrypted = *req.Encrypted
-	}
-
-	// Only update value if provided (allows updating key/encrypted without changing the value).
-	if req.Value != "" {
-		value := req.Value
-		if updated.Encrypted {
-			enc, err := crypto.Encrypt(s.encKey, value)
-			if err != nil {
-				respondError(w, http.StatusInternalServerError, "failed to encrypt value: "+err.Error())
-				return
-			}
-			value = enc
-		}
-		updated.Value = value
-	}
-
-	if err := s.store.UpdateStageEnv(updated); err != nil {
-		respondError(w, http.StatusInternalServerError, "failed to update stage env: "+err.Error())
-		return
-	}
-
-	// Mask encrypted value in the response.
-	if updated.Encrypted {
-		updated.Value = "••••••••"
-	}
-
-	respondJSON(w, http.StatusOK, updated)
-}
-
-// deleteStageEnv handles DELETE /api/projects/{id}/stages/{stage}/env/{envId}.
-func (s *Server) deleteStageEnv(w http.ResponseWriter, r *http.Request) {
-	envID := chi.URLParam(r, "envId")
-	if err := s.store.DeleteStageEnv(envID); err != nil {
-		if errors.Is(err, store.ErrNotFound) {
-			respondNotFound(w, "stage env")
-			return
-		}
-		respondError(w, http.StatusInternalServerError, "failed to delete stage env: "+err.Error())
-		return
-	}
-	respondJSON(w, http.StatusOK, map[string]string{"deleted": envID})
-}
@@ -1,137 +0,0 @@
-package api
-
-import (
-	"errors"
-	"net/http"
-
-	"github.com/go-chi/chi/v5"
-
-	"github.com/alexei/docker-watcher/internal/store"
-)
-
-// stageRequest is the expected JSON body for creating/updating a stage.
-type stageRequest struct {
-	Name         string `json:"name"`
-	TagPattern   string `json:"tag_pattern"`
-	AutoDeploy   *bool  `json:"auto_deploy"`
-	MaxInstances *int   `json:"max_instances"`
-	Confirm      *bool  `json:"confirm"`
-	PromoteFrom  string `json:"promote_from"`
-	Subdomain    string `json:"subdomain"`
-}
-
-// createStage handles POST /api/projects/{id}/stages.
-func (s *Server) createStage(w http.ResponseWriter, r *http.Request) {
-	projectID := chi.URLParam(r, "id")
-
-	// Verify project exists.
-	if _, err := s.store.GetProjectByID(projectID); err != nil {
-		if errors.Is(err, store.ErrNotFound) {
-			respondNotFound(w, "project")
-			return
-		}
-		respondError(w, http.StatusInternalServerError, "failed to get project: "+err.Error())
-		return
-	}
-
-	var req stageRequest
-	if !decodeJSON(w, r, &req) {
-		return
-	}
-
-	if req.Name == "" {
-		respondError(w, http.StatusBadRequest, "name is required")
-		return
-	}
-	if req.TagPattern == "" {
-		req.TagPattern = "*"
-	}
-
-	autoDeploy := false
-	if req.AutoDeploy != nil {
-		autoDeploy = *req.AutoDeploy
-	}
-	maxInstances := 1
-	if req.MaxInstances != nil {
-		maxInstances = *req.MaxInstances
-	}
-	confirm := false
-	if req.Confirm != nil {
-		confirm = *req.Confirm
-	}
-
-	stage, err := s.store.CreateStage(store.Stage{
-		ProjectID:    projectID,
-		Name:         req.Name,
-		TagPattern:   req.TagPattern,
-		AutoDeploy:   autoDeploy,
-		MaxInstances: maxInstances,
-		Confirm:      confirm,
-		PromoteFrom:  req.PromoteFrom,
-		Subdomain:    req.Subdomain,
-	})
-	if err != nil {
-		respondError(w, http.StatusInternalServerError, "failed to create stage: "+err.Error())
-		return
-	}
-	respondJSON(w, http.StatusCreated, stage)
-}
-
-// updateStage handles PUT /api/projects/{id}/stages/{stage}.
-func (s *Server) updateStage(w http.ResponseWriter, r *http.Request) {
-	stageID := chi.URLParam(r, "stage")
-
-	existing, err := s.store.GetStageByID(stageID)
-	if err != nil {
-		if errors.Is(err, store.ErrNotFound) {
-			respondNotFound(w, "stage")
-			return
-		}
-		respondError(w, http.StatusInternalServerError, "failed to get stage: "+err.Error())
-		return
-	}
-
-	var req stageRequest
-	if !decodeJSON(w, r, &req) {
-		return
-	}
-
-	updated := existing
-	if req.Name != "" {
-		updated.Name = req.Name
-	}
-	if req.TagPattern != "" {
-		updated.TagPattern = req.TagPattern
-	}
-	if req.AutoDeploy != nil {
-		updated.AutoDeploy = *req.AutoDeploy
-	}
-	if req.MaxInstances != nil {
-		updated.MaxInstances = *req.MaxInstances
-	}
-	if req.Confirm != nil {
-		updated.Confirm = *req.Confirm
-	}
-	updated.PromoteFrom = req.PromoteFrom
-	updated.Subdomain = req.Subdomain
-
-	if err := s.store.UpdateStage(updated); err != nil {
-		respondError(w, http.StatusInternalServerError, "failed to update stage: "+err.Error())
-		return
-	}
-	respondJSON(w, http.StatusOK, updated)
-}
-
-// deleteStage handles DELETE /api/projects/{id}/stages/{stage}.
-func (s *Server) deleteStage(w http.ResponseWriter, r *http.Request) {
-	stageID := chi.URLParam(r, "stage")
-	if err := s.store.DeleteStage(stageID); err != nil {
-		if errors.Is(err, store.ErrNotFound) {
-			respondNotFound(w, "stage")
-			return
-		}
-		respondError(w, http.StatusInternalServerError, "failed to delete stage: "+err.Error())
-		return
-	}
-	respondJSON(w, http.StatusOK, map[string]string{"deleted": stageID})
-}
@@ -0,0 +1,176 @@
+package api
+
+import (
+	"encoding/json"
+	"errors"
+	"log/slog"
+	"net/http"
+
+	"github.com/go-chi/chi/v5"
+
+	"github.com/alexei/tinyforge/internal/events"
+	"github.com/alexei/tinyforge/internal/stale"
+	"github.com/alexei/tinyforge/internal/store"
+)
+
+// listStaleContainers serves GET /api/containers/stale. It answers 503 when
+// no stale scanner is wired into the server, 500 when the scan fails, and
+// otherwise 200 with the scan result encoded as a JSON array.
+func (s *Server) listStaleContainers(w http.ResponseWriter, r *http.Request) {
+	if s.staleScanner == nil {
+		respondError(w, http.StatusServiceUnavailable, "stale scanner not initialized")
+		return
+	}
+
+	found, scanErr := s.staleScanner.FindStaleContainers(r.Context())
+	switch {
+	case scanErr != nil:
+		slog.Error("failed to find stale containers", "error", scanErr)
+		respondError(w, http.StatusInternalServerError, "failed to find stale containers")
+		return
+	case found == nil:
+		// A nil slice would be encoded as JSON null; respond with [] instead.
+		found = []stale.StaleContainer{}
+	}
+	respondJSON(w, http.StatusOK, found)
+}
+
+// cleanupStaleContainer serves POST /api/containers/stale/{id}/cleanup, where
+// {id} is the container row ID. It loads the row, refuses with 409 when the
+// row is already in the "removing" state, and otherwise delegates the actual
+// teardown to cleanupContainer.
+//
+// NOTE(review): the handler does not itself check that the row is stale — any
+// existing container row ID is accepted. Confirm that is intended.
+func (s *Server) cleanupStaleContainer(w http.ResponseWriter, r *http.Request) {
+	rowID := chi.URLParam(r, "id")
+
+	row, err := s.store.GetContainerByID(rowID)
+	switch {
+	case errors.Is(err, store.ErrNotFound):
+		respondNotFound(w, "container")
+		return
+	case err != nil:
+		slog.Error("failed to get container", "id", rowID, "error", err)
+		respondError(w, http.StatusInternalServerError, "failed to get container")
+		return
+	}
+
+	if row.State == "removing" {
+		respondError(w, http.StatusConflict, "container is already being removed")
+		return
+	}
+
+	if cleanupErr := s.cleanupContainer(r, row); cleanupErr != nil {
+		slog.Error("failed to cleanup container", "id", rowID, "error", cleanupErr)
+		respondError(w, http.StatusInternalServerError, "failed to cleanup container")
+		return
+	}
+
+	respondJSON(w, http.StatusOK, map[string]string{"cleaned": rowID})
+}
+
+// bulkCleanupStaleContainers handles POST /api/containers/stale/cleanup.
+func (s *Server) bulkCleanupStaleContainers(w http.ResponseWriter, r *http.Request) {
+	if s.staleScanner == nil {
+		respondError(w, http.StatusServiceUnavailable, "stale scanner not initialized")
+		return
+	}
+
+	staleRows, err := s.staleScanner.FindStaleContainers(r.Context())
+	if err != nil {
+		slog.Error("failed to find stale containers for bulk cleanup", "error", err)
+		respondError(w, http.StatusInternalServerError, "failed to find stale containers")
+		return
+	}
+
+	var cleaned []string
+	var failed []string
+
+	for _, sc := range staleRows {
+		if sc.Container.State == "removing" {
+			continue
+		}
+		if err := s.cleanupContainer(r, sc.Container); err != nil {
+			slog.Error("bulk stale cleanup failed",
+				"id", sc.Container.ID, "error", err)
+			failed = append(failed, sc.Container.ID)
+			continue
+		}
+		cleaned = append(cleaned, sc.Container.ID)
+	}
+
+	respondJSON(w, http.StatusOK, map[string]any{
+		"cleaned": cleaned,
+		"failed":  failed,
+	})
+}
+
+// cleanupContainer stops a Docker container, removes its proxy route,
+// deletes the container row, and emits an event.
+//
+// Every step before the row deletion is best-effort: a failure to mark the
+// row "removing", to stop/remove the Docker container, or to delete the proxy
+// route is logged at warn level and the sequence continues. The only error
+// returned to the caller is a failure to delete the container row; in that
+// case no cleanup event is emitted.
+//
+// NOTE(review): all external calls use the request context, so a client
+// disconnect mid-cleanup presumably cancels the Docker/proxy calls while the
+// row is still deleted afterwards — confirm whether that is acceptable.
+func (s *Server) cleanupContainer(r *http.Request, c store.Container) error {
+	ctx := r.Context()
+
+	// Flag the row first so concurrent cleanup requests see "removing".
+	if err := s.store.UpdateContainerState(c.ID, "removing"); err != nil {
+		slog.Warn("stale cleanup: update state to removing", "id", c.ID, "error", err)
+	}
+
+	// Rows without a bound Docker container ID have nothing to stop or remove.
+	if c.ContainerID != "" {
+		// presumably 10 is the stop timeout in seconds — verify against the docker client
+		if err := s.docker.StopContainer(ctx, c.ContainerID, 10); err != nil {
+			slog.Warn("stale cleanup: stop container", "container_id", c.ContainerID, "error", err)
+		}
+		// presumably true requests a forced removal — verify against the docker client
+		if err := s.docker.RemoveContainer(ctx, c.ContainerID, true); err != nil {
+			slog.Warn("stale cleanup: remove container", "container_id", c.ContainerID, "error", err)
+		}
+	}
+
+	if c.ProxyRouteID != "" {
+		if err := s.proxyProvider.DeleteRoute(ctx, c.ProxyRouteID); err != nil {
+			slog.Warn("stale cleanup: delete proxy route", "route_id", c.ProxyRouteID, "error", err)
+		}
+	}
+
+	// The row deletion is the one step whose failure is reported to the caller.
+	if err := s.store.DeleteContainer(c.ID); err != nil {
+		return err
+	}
+
+	s.emitStaleCleanupEvent(c)
+
+	return nil
+}
+
+// emitStaleCleanupEvent publishes an event when a stale container is cleaned up.
+func (s *Server) emitStaleCleanupEvent(c store.Container) {
+	msg := "Stale container cleaned up: " + c.ID + " (tag: " + c.ImageTag + ")"
+
+	// Use json.Marshal — c.Role is reconciler-derived from a Docker label and
+	// could contain quotes / control chars that break a hand-built JSON string.
+	metaBytes, err := json.Marshal(map[string]string{
+		"container_id": c.ID,
+		"workload_id":  c.WorkloadID,
+		"role":         c.Role,
+	})
+	if err != nil {
+		// json.Marshal on a flat string map can only fail in pathological
+		// circumstances (memory exhaustion); fall back to an empty object so
+		// the event still records.
+		metaBytes = []byte(`{}`)
+	}
+
+	evt, err := s.store.InsertEvent(store.EventLog{
+		Source:   "stale_cleanup",
+		Severity: "info",
+		Message:  msg,
+		Metadata: string(metaBytes),
+	})
+	if err != nil {
+		slog.Error("stale cleanup: failed to persist event", "error", err)
+		return
+	}
+
+	s.eventBus.Publish(events.Event{
+		Type: events.EventLog,
+		Payload: events.EventLogPayload{
+			ID:        evt.ID,
+			Source:    "stale_cleanup",
+			Severity:  "info",
+			Message:   msg,
+			Metadata:  evt.Metadata,
+			CreatedAt: evt.CreatedAt,
+		},
+	})
+}
@@ -0,0 +1,42 @@
+package api
+
+import (
+	"errors"
+	"log/slog"
+	"net/http"
+
+	"github.com/go-chi/chi/v5"
+
+	"github.com/alexei/tinyforge/internal/store"
+)
+
+// getInstanceStats handles GET /api/projects/{id}/stages/{stage}/instances/{iid}/stats.
+// {iid} is the container row ID (same UUID as the legacy instance ID).
+//
+// Responses: 404 when the row does not exist, 400 when the row has no Docker
+// container bound, 503 when the server has no Docker client, 500 on store or
+// Docker failures, and 200 with the stats returned by the Docker client.
+func (s *Server) getInstanceStats(w http.ResponseWriter, r *http.Request) {
+	id := chi.URLParam(r, "iid")
+
+	c, err := s.store.GetContainerByID(id)
+	if err != nil {
+		if errors.Is(err, store.ErrNotFound) {
+			respondNotFound(w, "container")
+			return
+		}
+		slog.Error("failed to get container", "id", id, "error", err)
+		respondError(w, http.StatusInternalServerError, "failed to get container")
+		return
+	}
+
+	if c.ContainerID == "" {
+		respondError(w, http.StatusBadRequest, "container row has no docker container bound")
+		return
+	}
+
+	// getSystemStats treats a nil Docker client as a normal "unavailable"
+	// state; mirror that here instead of dereferencing a nil client.
+	if s.docker == nil {
+		respondError(w, http.StatusServiceUnavailable, "Docker is not available")
+		return
+	}
+
+	stats, err := s.docker.GetContainerStats(r.Context(), c.ContainerID)
+	if err != nil {
+		slog.Error("failed to get container stats", "container_id", c.ContainerID, "error", err)
+		respondError(w, http.StatusInternalServerError, "failed to get container stats")
+		return
+	}
+
+	respondJSON(w, http.StatusOK, stats)
+}
@@ -0,0 +1,189 @@
+package api
+
+import (
+	"log/slog"
+	"net/http"
+	"sort"
+	"strconv"
+	"time"
+
+	"github.com/alexei/tinyforge/internal/auth"
+	"github.com/alexei/tinyforge/internal/store"
+)
+
+// topConsumerMinWindow is the minimum look-back used when selecting samples
+// for the "top consumers" list. The constant itself is fixed; listTopContainers
+// widens the window to twice the collector interval (read from settings) when
+// that is larger, so the list stays meaningful even when sampling is sparse.
+const topConsumerMinWindow = 2 * time.Minute
+
+// TopContainerSample augments a stats sample with the human-readable owner
+// name so the UI can show "workload/role" without an extra round-trip per row.
+// OwnerName is empty when the owner could not be resolved.
+type TopContainerSample struct {
+	store.ContainerStatsSample
+	OwnerName string `json:"owner_name"`
+}
+
+const (
+	// defaultHistoryWindow is used when no ?window= param is provided, when
+	// the value fails to parse, or when it is not a positive duration.
+	// Matches the default retention so the "last 2h" view always has data
+	// when collection is enabled.
+	defaultHistoryWindow = 2 * time.Hour
+	// maxHistoryWindow is the upper bound parseWindow clamps requests to.
+	maxHistoryWindow     = 24 * time.Hour
+)
+
+// parseWindow extracts the ?window= query parameter (a Go duration string
+// such as "1h" or "30m"). Missing, unparsable, and non-positive values yield
+// defaultHistoryWindow; values above maxHistoryWindow are clamped to it.
+func parseWindow(r *http.Request) time.Duration {
+	// An absent parameter is the empty string, which ParseDuration rejects,
+	// so it lands in the same default branch as any other bad value.
+	requested, err := time.ParseDuration(r.URL.Query().Get("window"))
+	switch {
+	case err != nil, requested <= 0:
+		return defaultHistoryWindow
+	case requested > maxHistoryWindow:
+		return maxHistoryWindow
+	default:
+		return requested
+	}
+}
+
+// sinceTimestamp returns the Unix-seconds timestamp that lies window before
+// the current time; samples at or after it fall inside the window.
+func sinceTimestamp(window time.Duration) int64 {
+	cutoff := time.Now().UTC().Add(-window)
+	return cutoff.Unix()
+}
+
+// getSystemStats handles GET /api/system/stats — current host snapshot.
+// When the Docker daemon is unreachable (e.g. Docker Desktop stopped) the
+// handler returns 503 so the frontend can show a dedicated unavailable
+// state instead of treating it as a generic 5xx failure.
+func (s *Server) getSystemStats(w http.ResponseWriter, r *http.Request) {
+	if s.docker == nil {
+		respondError(w, http.StatusServiceUnavailable, "Docker is not available")
+		return
+	}
+	sys, err := s.docker.GetSystemStats(r.Context())
+	if err != nil {
+		slog.Warn("system stats unavailable", "error", err)
+		respondError(w, http.StatusServiceUnavailable, "Docker is not available")
+		return
+	}
+	respondJSON(w, http.StatusOK, sys)
+}
+
+// getSystemStatsHistory serves GET /api/system/stats/history?window=1h. It
+// returns the stored system samples newer than the requested (bounded) window
+// as a JSON array, which is empty rather than null when nothing was recorded.
+func (s *Server) getSystemStatsHistory(w http.ResponseWriter, r *http.Request) {
+	cutoff := sinceTimestamp(parseWindow(r))
+
+	history, err := s.store.ListSystemStatsSamples(cutoff)
+	if err != nil {
+		slog.Error("failed to list system stats samples", "error", err)
+		respondError(w, http.StatusInternalServerError, "failed to list samples")
+		return
+	}
+
+	if history == nil {
+		history = []store.SystemStatsSample{}
+	}
+	respondJSON(w, http.StatusOK, history)
+}
+
+// listTopContainers handles GET /api/system/stats/top?limit=5&by=cpu.
+// Returns the latest sample of each recently-sampled container, sorted by
+// CPU (default) or memory (by=memory) in descending order and truncated to
+// limit entries. limit defaults to 5; values outside 1..50 or unparsable
+// values fall back to the default. Container IDs are stripped for non-admins
+// so a low-privilege viewer cannot enumerate workloads outside their scope.
+func (s *Server) listTopContainers(w http.ResponseWriter, r *http.Request) {
+	query := r.URL.Query()
+
+	limit := 5
+	if raw := query.Get("limit"); raw != "" {
+		if n, err := strconv.Atoi(raw); err == nil && n > 0 && n <= 50 {
+			limit = n
+		}
+	}
+	byMemory := query.Get("by") == "memory"
+
+	// Samples must be at least as recent as max(2*interval, 2 minutes) so the
+	// list reflects near-current load even when collection is sparse. A
+	// settings read failure simply keeps the fixed minimum window.
+	window := topConsumerMinWindow
+	if settings, err := s.store.GetSettings(); err == nil && settings.StatsIntervalSeconds > 0 {
+		if scaled := time.Duration(settings.StatsIntervalSeconds*2) * time.Second; scaled > window {
+			window = scaled
+		}
+	}
+
+	samples, err := s.store.ListAllRecentContainerStatsSamples(sinceTimestamp(window))
+	if err != nil {
+		slog.Error("failed to list container samples for top", "error", err)
+		respondError(w, http.StatusInternalServerError, "failed to list samples")
+		return
+	}
+
+	// Keep only the latest sample per container.
+	latest := make(map[string]store.ContainerStatsSample, len(samples))
+	for _, sm := range samples {
+		if prev, ok := latest[sm.ContainerID]; !ok || sm.TS > prev.TS {
+			latest[sm.ContainerID] = sm
+		}
+	}
+
+	top := make([]store.ContainerStatsSample, 0, len(latest))
+	for _, sm := range latest {
+		top = append(top, sm)
+	}
+
+	// Descending by the chosen metric. Ties fall back to the container ID:
+	// the slice is filled from a map (random order) and sort.Slice is not
+	// stable, so without a tie-break equal-load rows would shuffle between
+	// requests and could flip in and out of the truncated list.
+	sort.Slice(top, func(i, j int) bool {
+		if byMemory {
+			if top[i].MemoryUsage != top[j].MemoryUsage {
+				return top[i].MemoryUsage > top[j].MemoryUsage
+			}
+		} else if top[i].CPUPercent != top[j].CPUPercent {
+			return top[i].CPUPercent > top[j].CPUPercent
+		}
+		return top[i].ContainerID < top[j].ContainerID
+	})
+	if len(top) > limit {
+		top = top[:limit]
+	}
+
+	enriched := s.enrichWithOwnerNames(top)
+
+	// Scrub container IDs for non-admins. The owner name is the actionable
+	// identifier; the container ID is a host-level handle that reveals
+	// workload existence to viewers who shouldn't have it. Missing claims are
+	// treated as non-admin (fail closed) and are never dereferenced.
+	claims, ok := auth.ClaimsFromContext(r.Context())
+	if !ok || claims.Role != "admin" {
+		for i := range enriched {
+			enriched[i].ContainerID = ""
+		}
+	}
+
+	respondJSON(w, http.StatusOK, enriched)
+}
+
+// enrichWithOwnerNames wraps every sample in a TopContainerSample, filling
+// OwnerName via lookupInstanceName on the sample's OwnerID. Order and length
+// of the input are preserved.
+func (s *Server) enrichWithOwnerNames(samples []store.ContainerStatsSample) []TopContainerSample {
+	enriched := make([]TopContainerSample, 0, len(samples))
+	for _, sample := range samples {
+		enriched = append(enriched, TopContainerSample{
+			ContainerStatsSample: sample,
+			OwnerName:            s.lookupInstanceName(sample.OwnerID),
+		})
+	}
+	return enriched
+}
+
+// lookupInstanceName resolves a container row ID to a display name:
+// "workload/role" when both are known, the workload name alone when the row
+// has no role, the role alone when the workload lookup fails, and "" when the
+// container row itself cannot be loaded. Lookup errors are never surfaced so
+// a transient miss does not break the response.
+func (s *Server) lookupInstanceName(instanceID string) string {
+	row, err := s.store.GetContainerByID(instanceID)
+	if err != nil {
+		return ""
+	}
+
+	workload, err := s.store.GetWorkloadByID(row.WorkloadID)
+	switch {
+	case err != nil:
+		// No workload name available: the role (possibly empty) is all we have.
+		return row.Role
+	case row.Role == "":
+		return workload.Name
+	default:
+		return workload.Name + "/" + row.Role
+	}
+}
+
@@ -0,0 +1,794 @@
+package api
+
+import (
+	"encoding/json"
+	"errors"
+	"fmt"
+	"log/slog"
+	"net/http"
+	"strings"
+	"sync"
+	"time"
+
+	"github.com/go-chi/chi/v5"
+
+	"github.com/alexei/tinyforge/internal/auth"
+	"github.com/alexei/tinyforge/internal/store"
+	"github.com/alexei/tinyforge/internal/webhook"
+	"github.com/alexei/tinyforge/internal/workload/plugin"
+)
+
+// fireInFlight tracks trigger IDs that have a fire-now request actively
+// running so a runaway script or rapid double-click doesn't queue
+// duplicate deploys. Keyed by trigger ID; entries are added under the
+// mutex and removed by the handler's defer. Sufficient for an admin
+// gate — a real rate limiter belongs at the middleware layer, not here.
+//
+// The set is package-level in-memory state, so it only guards requests
+// handled by this process.
+var (
+	// fireInFlightMu guards every read and write of fireInFlight.
+	fireInFlightMu sync.Mutex
+	fireInFlight   = map[string]struct{}{}
+)
+
+// triggerView is the response shape for /api/triggers. Webhook secrets
+// are never serialized — read them via the dedicated /webhook subresource
+// where the canonical URL is composed.
+type triggerView struct {
+	ID                      string          `json:"id"`
+	Kind                    string          `json:"kind"`
+	Name                    string          `json:"name"`
+	// Config is the stored per-kind configuration, passed through verbatim.
+	Config                  json.RawMessage `json:"config"`
+	// WebhookEnabled is derived: true when the stored trigger has a
+	// non-empty webhook secret.
+	WebhookEnabled          bool            `json:"webhook_enabled"`
+	WebhookRequireSignature bool            `json:"webhook_require_signature"`
+	// BindingCount is the number of bindings attached to the trigger.
+	BindingCount            int             `json:"binding_count"`
+	// LastFiredAt is the RFC3339 wall-clock the scheduler last
+	// dispatched this trigger. Always present in the response shape;
+	// empty for triggers that have never fired or are not scheduler-
+	// driven. The detail page renders it as "last fired" on schedule
+	// triggers; other kinds ignore it.
+	LastFiredAt string `json:"last_fired_at"`
+	CreatedAt   string `json:"created_at"`
+	UpdatedAt   string `json:"updated_at"`
+}
+
+func (s *Server) toTriggerView(t store.Trigger) triggerView {
+	count, err := s.store.CountBindingsForTrigger(t.ID)
+	if err != nil {
+		slog.Warn("triggerView: count bindings", "trigger", t.ID, "error", err)
+	}
+	return triggerView{
+		ID:                      t.ID,
+		Kind:                    t.Kind,
+		Name:                    t.Name,
+		Config:                  json.RawMessage(t.Config),
+		WebhookEnabled:          t.WebhookSecret != "",
+		WebhookRequireSignature: t.WebhookRequireSignature,
+		BindingCount:            count,
+		LastFiredAt:             t.LastFiredAt,
+		CreatedAt:               t.CreatedAt,
+		UpdatedAt:               t.UpdatedAt,
+	}
+}
+
+// toTriggerViewWithCount is the join-aware variant used by listTriggers
+// to avoid one COUNT(*) per row. Kept distinct from toTriggerView so
+// single-row paths (get/create/update) keep the simple call shape.
+func toTriggerViewWithCount(row store.TriggerWithBindingCount) triggerView {
+	return triggerView{
+		ID:                      row.ID,
+		Kind:                    row.Kind,
+		Name:                    row.Name,
+		Config:                  json.RawMessage(row.Config),
+		WebhookEnabled:          row.WebhookSecret != "",
+		WebhookRequireSignature: row.WebhookRequireSignature,
+		BindingCount:            row.BindingCount,
+		LastFiredAt:             row.LastFiredAt,
+		CreatedAt:               row.CreatedAt,
+		UpdatedAt:               row.UpdatedAt,
+	}
+}
+
+// triggerRequest is the create/update body. Config is opaque per kind.
+// On create, webhook secrets are auto-generated when WebhookEnabled is true;
+// the secret is exposed only via the /webhook subresource.
+//
+// WebhookRequireSignature is a *bool so we can distinguish "field omitted
+// by client" (nil → apply secure default of true when webhook is enabled)
+// from an explicit opt-out (false → respected).
+type triggerRequest struct {
+	// Kind selects the trigger plugin; updateTrigger overwrites it with the
+	// stored kind, so it is effectively create-only.
+	Kind                    string          `json:"kind"`
+	Name                    string          `json:"name"`
+	Config                  json.RawMessage `json:"config"`
+	// WebhookEnabled is a plain bool: omitting it from the body reads as false.
+	WebhookEnabled          bool            `json:"webhook_enabled"`
+	WebhookRequireSignature *bool           `json:"webhook_require_signature,omitempty"`
+}
+
+// maxTriggerStandaloneConfigBytes caps a standalone trigger's config blob at
+// 16 KiB. Same per-blob cap used on the workload pluginWorkloadRequest path —
+// triggers and workload trigger configs share the same plugin Validate()
+// call, so the byte budget should match.
+const maxTriggerStandaloneConfigBytes = 16 << 10
+
+// listTriggers serves the trigger listing, optionally filtered by the ?kind=
+// query parameter. Binding counts come from the same store call, so no
+// per-row count query is issued. The response is always a JSON array.
+func (s *Server) listTriggers(w http.ResponseWriter, r *http.Request) {
+	kindFilter := r.URL.Query().Get("kind")
+
+	rows, err := s.store.ListTriggersWithBindingCount(kindFilter)
+	if err != nil {
+		slog.Error("list triggers", "error", err)
+		respondError(w, http.StatusInternalServerError, "list triggers")
+		return
+	}
+
+	views := make([]triggerView, len(rows))
+	for i := range rows {
+		views[i] = toTriggerViewWithCount(rows[i])
+	}
+	respondJSON(w, http.StatusOK, views)
+}
+
+// getTrigger returns a single trigger, addressed by the {id} URL parameter,
+// as a triggerView. Responds 404 when the trigger does not exist and 500 on
+// any other store failure.
+func (s *Server) getTrigger(w http.ResponseWriter, r *http.Request) {
+	id := chi.URLParam(r, "id")
+	t, err := s.store.GetTriggerByID(id)
+	if err != nil {
+		if errors.Is(err, store.ErrNotFound) {
+			respondNotFound(w, "trigger")
+			return
+		}
+		// The client only sees a generic message; keep the cause in the
+		// server log so a 500 here can actually be diagnosed.
+		slog.Error("get trigger", "trigger", id, "error", err)
+		respondError(w, http.StatusInternalServerError, "get trigger")
+		return
+	}
+	respondJSON(w, http.StatusOK, s.toTriggerView(t))
+}
+
+// buildTriggerFromRequest turns a validated request into a store.Trigger
+// ready for insert. It is the single place where create-time secret
+// generation and signature defaults are decided, so the standalone create
+// endpoint and the inline-bind endpoint cannot drift apart.
+//
+// SECURITY: with the webhook enabled and require_signature omitted, the
+// trigger is created with require_signature = true, so a freshly-created
+// trigger never accepts unsigned posts by accident. An explicit value from
+// the client — including false, for receivers without HMAC support — is
+// always honoured.
+func buildTriggerFromRequest(req triggerRequest) store.Trigger {
+	// Default: signatures are required exactly when the webhook is enabled.
+	// An explicit client choice overrides that either way.
+	requireSig := req.WebhookEnabled
+	if req.WebhookRequireSignature != nil {
+		requireSig = *req.WebhookRequireSignature
+	}
+
+	trigger := store.Trigger{
+		Kind:                    req.Kind,
+		Name:                    strings.TrimSpace(req.Name),
+		Config:                  string(req.Config),
+		WebhookRequireSignature: requireSig,
+	}
+	if !req.WebhookEnabled {
+		return trigger
+	}
+
+	// Two independent secrets: one for the URL, one for signing.
+	trigger.WebhookSecret = generateWebhookSecret()
+	trigger.WebhookSigningSecret = generateWebhookSecret()
+	return trigger
+}
+
+// createTrigger decodes and validates a triggerRequest, persists the new
+// trigger, and responds 201 with its view. Validation problems yield 400, a
+// uniqueness violation yields 409, and any other store failure yields 500.
+func (s *Server) createTrigger(w http.ResponseWriter, r *http.Request) {
+	var body triggerRequest
+	if ok := decodeJSONStrict(w, r, &body); !ok {
+		return
+	}
+	if err := validateTriggerRequest(body); err != nil {
+		respondError(w, http.StatusBadRequest, err.Error())
+		return
+	}
+
+	created, err := s.store.CreateTrigger(buildTriggerFromRequest(body))
+	if err == nil {
+		respondJSON(w, http.StatusCreated, s.toTriggerView(created))
+		return
+	}
+
+	slog.Error("create trigger", "error", err)
+	// A UNIQUE name collision is the most common user-facing failure, so it
+	// gets its own status and message.
+	if errors.Is(err, store.ErrUnique) {
+		respondError(w, http.StatusConflict, "a trigger with this name already exists")
+		return
+	}
+	respondError(w, http.StatusInternalServerError, "create trigger")
+}
+
+// updateTrigger applies a triggerRequest to an existing trigger and responds
+// with the updated view.
+//
+// Field semantics:
+//   - kind is immutable; the stored kind is used for validation.
+//   - name is required by validation; config is replaced only when provided.
+//   - webhook_enabled is authoritative: false clears both secrets, and a
+//     false→true transition generates fresh ones.
+//   - webhook_require_signature: an explicit value always wins. When omitted,
+//     the secure default (true) is applied only on a false→true webhook
+//     transition; otherwise the stored value is kept, so an operator's earlier
+//     explicit opt-out is not silently reverted by an unrelated update.
+//
+// Responds 404 for an unknown trigger, 400 on validation failure, 409 on a
+// name collision, and 500 on other store failures.
+func (s *Server) updateTrigger(w http.ResponseWriter, r *http.Request) {
+	id := chi.URLParam(r, "id")
+	existing, err := s.store.GetTriggerByID(id)
+	if err != nil {
+		if errors.Is(err, store.ErrNotFound) {
+			respondNotFound(w, "trigger")
+			return
+		}
+		slog.Error("update trigger: load existing", "trigger", id, "error", err)
+		respondError(w, http.StatusInternalServerError, "get trigger")
+		return
+	}
+	var req triggerRequest
+	if !decodeJSONStrict(w, r, &req) {
+		return
+	}
+	// Kind is immutable on update. Mirror the value from the existing
+	// row so validateTriggerRequest can still verify the config blob.
+	req.Kind = existing.Kind
+	if err := validateTriggerRequest(req); err != nil {
+		respondError(w, http.StatusBadRequest, err.Error())
+		return
+	}
+	if req.Name != "" {
+		existing.Name = strings.TrimSpace(req.Name)
+	}
+	if len(req.Config) > 0 {
+		existing.Config = string(req.Config)
+	}
+
+	// Capture the pre-update webhook state before any secret is touched; both
+	// the signature default and the secret rotation key off this transition.
+	wasEnabled := existing.WebhookSecret != ""
+	enabling := req.WebhookEnabled && !wasEnabled
+
+	switch {
+	case req.WebhookRequireSignature != nil:
+		existing.WebhookRequireSignature = *req.WebhookRequireSignature
+	case enabling:
+		// Re-enabling webhook without specifying the signature flag —
+		// take the secure default.
+		existing.WebhookRequireSignature = true
+	}
+
+	if enabling {
+		// false→true transition: rotate both secrets so re-enabling
+		// after a disable does not silently revive an old leaked URL.
+		existing.WebhookSecret = generateWebhookSecret()
+		existing.WebhookSigningSecret = generateWebhookSecret()
+	}
+	if !req.WebhookEnabled {
+		existing.WebhookSecret = ""
+		existing.WebhookSigningSecret = ""
+	}
+	if err := s.store.UpdateTrigger(existing); err != nil {
+		slog.Error("update trigger", "error", err)
+		if errors.Is(err, store.ErrUnique) {
+			respondError(w, http.StatusConflict, "a trigger with this name already exists")
+			return
+		}
+		respondError(w, http.StatusInternalServerError, "update trigger")
+		return
+	}
+	respondJSON(w, http.StatusOK, s.toTriggerView(existing))
+}
+
+// deleteTrigger removes the trigger addressed by the {id} URL parameter and
+// responds 200 with {"deleted": id}. Responds 404 when the trigger does not
+// exist and 500 on any other store failure.
+func (s *Server) deleteTrigger(w http.ResponseWriter, r *http.Request) {
+	id := chi.URLParam(r, "id")
+	if err := s.store.DeleteTrigger(id); err != nil {
+		if errors.Is(err, store.ErrNotFound) {
+			respondNotFound(w, "trigger")
+			return
+		}
+		// The client only sees a generic message; keep the cause in the
+		// server log so a failed delete can be diagnosed.
+		slog.Error("delete trigger", "trigger", id, "error", err)
+		respondError(w, http.StatusInternalServerError, "delete trigger")
+		return
+	}
+	respondJSON(w, http.StatusOK, map[string]string{"deleted": id})
+}
+
+// triggerWebhookView surfaces the inbound URL for a trigger. URL and
+// Secret are both empty when the trigger has webhook ingress disabled
+// (no stored webhook secret).
+type triggerWebhookView struct {
+	// URL is the server-relative path "/api/webhook/triggers/<secret>".
+	URL                     string `json:"url"`
+	Secret                  string `json:"secret"`
+	WebhookRequireSignature bool   `json:"webhook_require_signature"`
+}
+
+// getTriggerWebhook returns the webhook secret, the derived inbound URL, and
+// the signature requirement for the trigger addressed by {id}. URL stays
+// empty when the trigger has no webhook secret. Responds 404 when the trigger
+// does not exist and 500 on any other store failure.
+func (s *Server) getTriggerWebhook(w http.ResponseWriter, r *http.Request) {
+	id := chi.URLParam(r, "id")
+	t, err := s.store.GetTriggerByID(id)
+	if err != nil {
+		if errors.Is(err, store.ErrNotFound) {
+			respondNotFound(w, "trigger")
+			return
+		}
+		// The client only sees a generic message; keep the cause in the
+		// server log so a 500 here can actually be diagnosed.
+		slog.Error("get trigger webhook", "trigger", id, "error", err)
+		respondError(w, http.StatusInternalServerError, "get trigger")
+		return
+	}
+	view := triggerWebhookView{
+		Secret:                  t.WebhookSecret,
+		WebhookRequireSignature: t.WebhookRequireSignature,
+	}
+	// Only compose a URL when ingress is enabled; an empty secret would
+	// otherwise yield a dangling ".../triggers/" path.
+	if t.WebhookSecret != "" {
+		view.URL = "/api/webhook/triggers/" + t.WebhookSecret
+	}
+	respondJSON(w, http.StatusOK, view)
+}
+
+// fireTriggerNow dispatches a trigger immediately without waiting for
+// its next natural fire window. Used by the /triggers/[id] "Fire now"
+// button so an operator can re-test a fixed broken deploy without
+// waiting one full schedule interval.
+//
+// Scope: schedule triggers only. Other kinds (registry / git / manual)
+// already have their own dispatch paths — registry/git fire on real
+// inbound events, manual fires from the workload Deploy button. Adding
+// "fire-now" for those would duplicate those flows without adding new
+// capability.
+//
+// Side effect: updates last_fired_at to "now" (same persist-before-
+// dispatch ordering the scheduler uses) so the natural next-fire
+// window shifts forward by exactly the interval. This is the
+// principle-of-least-surprise behavior — an operator who fires now
+// is intentionally resetting the cadence.
+//
+// Responses: 429 when a fire for the same trigger is already running,
+// 404 for an unknown trigger, 400 for a non-schedule trigger, 500 when
+// auth claims are missing, last_fired_at cannot be persisted, or the
+// fan-out fails, and 202 with a summary (trigger, fired_at, bindings,
+// deployed, errored) otherwise.
+func (s *Server) fireTriggerNow(w http.ResponseWriter, r *http.Request) {
+	id := chi.URLParam(r, "id")
+
+	// Per-trigger in-flight guard. AdminOnly + UI throttle is the only
+	// gate against rapid double-clicks; without this guard a runaway
+	// script could queue parallel fans-out of the same schedule, each
+	// holding up to maxTriggerFanOutConcurrency deployer slots.
+	// Returning 429 lets the client distinguish "already running" from
+	// a real validation error.
+	fireInFlightMu.Lock()
+	if _, busy := fireInFlight[id]; busy {
+		fireInFlightMu.Unlock()
+		respondError(w, http.StatusTooManyRequests,
+			"a fire is already in progress for this trigger")
+		return
+	}
+	fireInFlight[id] = struct{}{}
+	fireInFlightMu.Unlock()
+	// The entry is released on every exit path, including the early
+	// error returns below.
+	defer func() {
+		fireInFlightMu.Lock()
+		delete(fireInFlight, id)
+		fireInFlightMu.Unlock()
+	}()
+
+	trg, err := s.store.GetTriggerByID(id)
+	if err != nil {
+		if errors.Is(err, store.ErrNotFound) {
+			respondNotFound(w, "trigger")
+			return
+		}
+		respondError(w, http.StatusInternalServerError, "failed to load trigger")
+		return
+	}
+	if trg.Kind != "schedule" {
+		respondError(w, http.StatusBadRequest,
+			"fire-now is only supported for schedule triggers")
+		return
+	}
+
+	// AdminOnly middleware guarantees claims; treat their absence as a
+	// boot-time wiring bug rather than fall back to an unattributable
+	// "manual" string that collides with the `manual` trigger kind in
+	// audit logs.
+	claims, ok := auth.ClaimsFromContext(r.Context())
+	if !ok || claims.Username == "" {
+		slog.Error("fire-now: missing claims under AdminOnly", "trigger", trg.Name)
+		respondError(w, http.StatusInternalServerError, "missing auth context")
+		return
+	}
+	actor := claims.Username
+
+	// Persist last_fired_at before dispatching; the same timestamp is
+	// reused for the event and the response.
+	now := time.Now().UTC()
+	if err := s.store.SetTriggerLastFired(trg.ID, now.Format(time.RFC3339)); err != nil {
+		respondError(w, http.StatusInternalServerError, "persist last_fired_at")
+		return
+	}
+
+	evt := plugin.InboundEvent{
+		Kind:     "schedule",
+		Schedule: &plugin.ScheduleEvent{FiredAt: now},
+	}
+	results, err := s.webhook.FanOutForTrigger(r.Context(), trg, evt)
+	if err != nil {
+		slog.Warn("fire-now: fan-out failed",
+			"trigger", trg.Name, "actor", actor, "error", err)
+		// Don't expose the raw error — it can carry registry-auth or
+		// compose-stdout bytes (matches the manual-deploy handler).
+		respondError(w, http.StatusInternalServerError, "fire failed; see server logs")
+		return
+	}
+
+	// Tally outcomes: a binding counts as deployed or errored; disabled
+	// and non-matching bindings are deliberately counted as neither.
+	var deployed, errored int
+	for _, b := range results {
+		switch {
+		case b.Deployed:
+			deployed++
+		case b.Reason == webhook.ReasonBindingDisabled, b.Reason == webhook.ReasonNoMatch:
+			// silent
+		default:
+			errored++
+		}
+	}
+	// Empty fan-out (no bindings) is almost certainly an operator
+	// mistake — the UI button is gated on binding_count>0, but the
+	// counts can change between page load and click. Warn so the
+	// no-op shows up in audit logs.
+	if len(results) == 0 {
+		slog.Warn("fire-now: no bindings to fire",
+			"trigger", trg.Name, "actor", actor)
+	} else {
+		slog.Info("fire-now dispatched",
+			"trigger", trg.Name, "actor", actor,
+			"bindings", len(results), "deployed", deployed, "errored", errored)
+	}
+
+	respondJSON(w, http.StatusAccepted, map[string]any{
+		"trigger":  trg.Name,
+		"fired_at": now.Format(time.RFC3339),
+		"bindings": len(results),
+		"deployed": deployed,
+		"errored":  errored,
+	})
+}
+
+// regenerateTriggerWebhook rotates the webhook URL secret of the trigger
+// addressed by {id} and responds with the new secret and the inbound URL
+// derived from it. Responds 404 when the trigger does not exist and 500 on
+// any other store failure.
+//
+// NOTE(review): only the URL secret is rotated here; the signing secret that
+// create/update generate alongside it is left as-is — confirm that is intended.
+func (s *Server) regenerateTriggerWebhook(w http.ResponseWriter, r *http.Request) {
+	id := chi.URLParam(r, "id")
+	secret := generateWebhookSecret()
+	if err := s.store.SetTriggerWebhookSecret(id, secret); err != nil {
+		if errors.Is(err, store.ErrNotFound) {
+			respondNotFound(w, "trigger")
+			return
+		}
+		// Log the cause (never the secret) so a failed rotation can be
+		// diagnosed; the client only sees a generic message.
+		slog.Error("rotate webhook secret", "trigger", id, "error", err)
+		respondError(w, http.StatusInternalServerError, "rotate webhook secret")
+		return
+	}
+	respondJSON(w, http.StatusOK, map[string]string{
+		"secret": secret,
+		"url":    "/api/webhook/triggers/" + secret,
+	})
+}
+
+// maxBindingConfigBytes caps a per-binding override blob at 8 KiB. Smaller
+// than the full trigger config — bindings should be lightweight tweaks
+// (tag pattern, branch filter), not whole replacement configs.
+const maxBindingConfigBytes = 8 << 10
+
+// validateBindingConfig checks a binding override before it is persisted.
+//
+// It rejects blobs larger than maxBindingConfigBytes, merges the override on
+// top of the trigger's own config, and hands the merged result to the
+// Validate method of the plugin registered for trg.Kind. Catching a bad
+// override here keeps it from silently breaking webhook fan-out at deploy
+// time.
+func validateBindingConfig(trg store.Trigger, bindingConfig json.RawMessage) error {
+	if size := len(bindingConfig); size > maxBindingConfigBytes {
+		return fmt.Errorf("binding_config exceeds %d bytes", maxBindingConfigBytes)
+	}
+
+	effective, mergeErr := plugin.MergeJSONConfig(json.RawMessage(trg.Config), bindingConfig)
+	if mergeErr != nil {
+		return fmt.Errorf("binding_config: %w", mergeErr)
+	}
+
+	kindPlugin, lookupErr := plugin.GetTrigger(trg.Kind)
+	if lookupErr != nil {
+		return lookupErr
+	}
+	return kindPlugin.Validate(effective)
+}
+
+// validateTriggerRequest performs the write-time checks on a trigger request:
+// kind and name must be non-blank, the config must fit within
+// maxTriggerStandaloneConfigBytes, and the plugin registered for the kind must
+// accept the config. Whether an empty config is acceptable is decided by that
+// plugin's Validate (e.g. the manual trigger allows it).
+func validateTriggerRequest(req triggerRequest) error {
+	switch {
+	case strings.TrimSpace(req.Kind) == "":
+		return fmt.Errorf("kind is required")
+	case strings.TrimSpace(req.Name) == "":
+		return fmt.Errorf("name is required")
+	case len(req.Config) > maxTriggerStandaloneConfigBytes:
+		return fmt.Errorf("config exceeds %d bytes", maxTriggerStandaloneConfigBytes)
+	}
+
+	kindPlugin, lookupErr := plugin.GetTrigger(req.Kind)
+	if lookupErr != nil {
+		return lookupErr
+	}
+	return kindPlugin.Validate(req.Config)
+}
+
+// bindingView is the JSON shape of one workload↔trigger binding as returned
+// by the binding endpoints (e.g. the /api/triggers/{id}/bindings listing).
+// It carries the workload's name next to its ID so the frontend does not
+// need an N+1 round-trip to label each row. BindingConfig is emitted as raw
+// JSON rather than as a quoted string.
+type bindingView struct {
+	ID            string          `json:"id"`
+	WorkloadID    string          `json:"workload_id"`
+	WorkloadName  string          `json:"workload_name"`
+	TriggerID     string          `json:"trigger_id"`
+	BindingConfig json.RawMessage `json:"binding_config"`
+	Enabled       bool            `json:"enabled"`
+	SortOrder     int             `json:"sort_order"`
+	CreatedAt     string          `json:"created_at"`
+	UpdatedAt     string          `json:"updated_at"`
+}
+
+// toBindingView converts a stored binding into its API view. The workload
+// name is resolved with a best-effort store lookup: when the lookup fails the
+// name is left empty rather than failing the whole response.
+func (s *Server) toBindingView(b store.WorkloadTriggerBinding) bindingView {
+	view := bindingView{
+		ID:            b.ID,
+		WorkloadID:    b.WorkloadID,
+		TriggerID:     b.TriggerID,
+		BindingConfig: json.RawMessage(b.BindingConfig),
+		Enabled:       b.Enabled,
+		SortOrder:     b.SortOrder,
+		CreatedAt:     b.CreatedAt,
+		UpdatedAt:     b.UpdatedAt,
+	}
+	if wl, err := s.store.GetWorkloadByID(b.WorkloadID); err == nil {
+		view.WorkloadName = wl.Name
+	}
+	return view
+}
+
+// listBindingsForTrigger returns every binding attached to the trigger named
+// by the {id} URL parameter, with workload names already joined in by the
+// store query. Responds 404 for an unknown trigger and 500 on store failures.
+func (s *Server) listBindingsForTrigger(w http.ResponseWriter, r *http.Request) {
+	triggerID := chi.URLParam(r, "id")
+
+	// Existence check first so an unknown trigger yields 404, not an empty list.
+	_, err := s.store.GetTriggerByID(triggerID)
+	switch {
+	case errors.Is(err, store.ErrNotFound):
+		respondNotFound(w, "trigger")
+		return
+	case err != nil:
+		respondError(w, http.StatusInternalServerError, "get trigger")
+		return
+	}
+
+	rows, err := s.store.ListBindingsForTriggerWithNames(triggerID)
+	if err != nil {
+		respondError(w, http.StatusInternalServerError, "list bindings")
+		return
+	}
+
+	views := make([]bindingView, len(rows))
+	for i := range rows {
+		row := rows[i]
+		views[i] = bindingView{
+			ID:            row.ID,
+			WorkloadID:    row.WorkloadID,
+			WorkloadName:  row.WorkloadName,
+			TriggerID:     row.TriggerID,
+			BindingConfig: json.RawMessage(row.BindingConfig),
+			Enabled:       row.Enabled,
+			SortOrder:     row.SortOrder,
+			CreatedAt:     row.CreatedAt,
+			UpdatedAt:     row.UpdatedAt,
+		}
+	}
+	respondJSON(w, http.StatusOK, views)
+}
+
+// bindingRequest is the JSON body accepted by the trigger-side bind handler
+// (bindWorkloadToTrigger) and by updateBinding. The workload-side bind
+// handler uses workloadBindRequest instead.
+//
+// Enabled is a pointer so the handlers can tell "omitted" apart from an
+// explicit false: on create an omitted value defaults to true, on update it
+// leaves the stored value untouched.
+type bindingRequest struct {
+	WorkloadID    string          `json:"workload_id"`
+	TriggerID     string          `json:"trigger_id"`
+	BindingConfig json.RawMessage `json:"binding_config"`
+	Enabled       *bool           `json:"enabled"`
+	SortOrder     int             `json:"sort_order"`
+}
+
+// bindWorkloadToTrigger is the trigger-side bind handler: it attaches the
+// workload named in the request body to the trigger named by the {id} URL
+// parameter.
+//
+// Responses: 400 for a missing workload_id or an invalid binding_config, 404
+// when the trigger or the workload does not exist, 409 when the pair is
+// already bound, 500 on store failures, and 201 with the created binding view
+// on success. An omitted "enabled" defaults to true.
+func (s *Server) bindWorkloadToTrigger(w http.ResponseWriter, r *http.Request) {
+	triggerID := chi.URLParam(r, "id")
+
+	var body bindingRequest
+	if ok := decodeJSONStrict(w, r, &body); !ok {
+		return
+	}
+	if body.WorkloadID == "" {
+		respondError(w, http.StatusBadRequest, "workload_id is required")
+		return
+	}
+
+	trg, err := s.store.GetTriggerByID(triggerID)
+	switch {
+	case errors.Is(err, store.ErrNotFound):
+		respondNotFound(w, "trigger")
+		return
+	case err != nil:
+		respondError(w, http.StatusInternalServerError, "get trigger")
+		return
+	}
+
+	_, err = s.store.GetWorkloadByID(body.WorkloadID)
+	switch {
+	case errors.Is(err, store.ErrNotFound):
+		respondNotFound(w, "workload")
+		return
+	case err != nil:
+		respondError(w, http.StatusInternalServerError, "get workload")
+		return
+	}
+
+	if verr := validateBindingConfig(trg, body.BindingConfig); verr != nil {
+		respondError(w, http.StatusBadRequest, verr.Error())
+		return
+	}
+
+	// Bindings are enabled unless the caller explicitly says otherwise.
+	enabled := body.Enabled == nil || *body.Enabled
+
+	created, err := s.store.CreateBinding(store.WorkloadTriggerBinding{
+		WorkloadID:    body.WorkloadID,
+		TriggerID:     triggerID,
+		BindingConfig: string(body.BindingConfig),
+		Enabled:       enabled,
+		SortOrder:     body.SortOrder,
+	})
+	switch {
+	case errors.Is(err, store.ErrUnique):
+		respondError(w, http.StatusConflict, "this workload is already bound to this trigger")
+		return
+	case err != nil:
+		slog.Error("create binding", "error", err)
+		respondError(w, http.StatusInternalServerError, "create binding")
+		return
+	}
+	respondJSON(w, http.StatusCreated, s.toBindingView(created))
+}
+
+// updateBinding applies a partial update to the binding named by the {bid}
+// URL parameter. Every field is optional: an omitted binding_config, enabled
+// or sort_order leaves the stored value unchanged. A supplied binding_config
+// is validated against the owning trigger before it is stored.
+//
+// Responses: 404 for an unknown binding, 400 for an invalid binding_config,
+// 500 on store failures, 200 with the updated binding view on success.
+func (s *Server) updateBinding(w http.ResponseWriter, r *http.Request) {
+	bid := chi.URLParam(r, "bid")
+	existing, err := s.store.GetBindingByID(bid)
+	if err != nil {
+		if errors.Is(err, store.ErrNotFound) {
+			respondNotFound(w, "binding")
+			return
+		}
+		respondError(w, http.StatusInternalServerError, "get binding")
+		return
+	}
+
+	// The outer SortOrder shadows bindingRequest.SortOrder for JSON decoding
+	// (the shallower field wins), which lets us distinguish an omitted
+	// sort_order from an explicit 0. Previously a body such as
+	// {"enabled":false} silently reset the binding's sort order to 0.
+	var req struct {
+		bindingRequest
+		SortOrder *int `json:"sort_order"`
+	}
+	if !decodeJSONStrict(w, r, &req) {
+		return
+	}
+	if len(req.BindingConfig) > 0 {
+		trg, terr := s.store.GetTriggerByID(existing.TriggerID)
+		if terr != nil {
+			slog.Error("update binding: trigger lookup", "trigger", existing.TriggerID, "error", terr)
+			respondError(w, http.StatusInternalServerError, "trigger lookup")
+			return
+		}
+		if err := validateBindingConfig(trg, req.BindingConfig); err != nil {
+			respondError(w, http.StatusBadRequest, err.Error())
+			return
+		}
+		existing.BindingConfig = string(req.BindingConfig)
+	}
+	if req.Enabled != nil {
+		existing.Enabled = *req.Enabled
+	}
+	if req.SortOrder != nil {
+		existing.SortOrder = *req.SortOrder
+	}
+	if err := s.store.UpdateBinding(existing); err != nil {
+		slog.Error("update binding", "binding", bid, "error", err)
+		respondError(w, http.StatusInternalServerError, "update binding")
+		return
+	}
+	respondJSON(w, http.StatusOK, s.toBindingView(existing))
+}
+
+// listBindingsForWorkload is the workload-side mirror of
+// listBindingsForTrigger. It returns every trigger bound to the workload
+// named by the {id} URL parameter, in the order the store query yields them,
+// so the detail page can render them inline. Each item is a bindingView
+// extended with the trigger's kind and name.
+//
+// Responds 404 for an unknown workload and 500 on store failures.
+func (s *Server) listBindingsForWorkload(w http.ResponseWriter, r *http.Request) {
+	wid := chi.URLParam(r, "id")
+	wl, err := s.store.GetWorkloadByID(wid)
+	if err != nil {
+		if errors.Is(err, store.ErrNotFound) {
+			respondNotFound(w, "workload")
+			return
+		}
+		respondError(w, http.StatusInternalServerError, "get workload")
+		return
+	}
+	rows, err := s.store.ListBindingsForWorkloadWithNames(wid)
+	if err != nil {
+		respondError(w, http.StatusInternalServerError, "list bindings")
+		return
+	}
+	type item struct {
+		bindingView
+		TriggerKind string `json:"trigger_kind"`
+		TriggerName string `json:"trigger_name"`
+	}
+	out := make([]item, 0, len(rows))
+	for _, b := range rows {
+		out = append(out, item{
+			bindingView: bindingView{
+				ID:         b.ID,
+				WorkloadID: b.WorkloadID,
+				// Every row belongs to the workload fetched above, so its
+				// name is filled in here instead of emitting an empty
+				// "workload_name" on each item.
+				WorkloadName:  wl.Name,
+				TriggerID:     b.TriggerID,
+				BindingConfig: json.RawMessage(b.BindingConfig),
+				Enabled:       b.Enabled,
+				SortOrder:     b.SortOrder,
+				CreatedAt:     b.CreatedAt,
+				UpdatedAt:     b.UpdatedAt,
+			},
+			TriggerKind: b.TriggerKind,
+			TriggerName: b.TriggerName,
+		})
+	}
+	respondJSON(w, http.StatusOK, out)
+}
+
+// workloadBindRequest is the JSON body of the workload-side bind endpoint.
+// It covers two UX flows: bind an existing trigger (TriggerID present) or
+// inline-create one in the same call (TriggerID empty + Inline populated).
+// The inline form keeps the 1:1 case feeling unchanged from the
+// embedded-trigger era.
+//
+// Enabled is a pointer so an omitted value can default to true. When both
+// TriggerID and Inline are supplied the handler takes the existing-trigger
+// path and Inline is not used.
+type workloadBindRequest struct {
+	TriggerID     string          `json:"trigger_id"`
+	BindingConfig json.RawMessage `json:"binding_config"`
+	Enabled       *bool           `json:"enabled"`
+	SortOrder     int             `json:"sort_order"`
+	Inline        *triggerRequest `json:"inline"`
+}
+
+// bindTriggerToWorkload is the workload-side bind handler for the workload
+// named by the {id} URL parameter. It either binds an existing trigger
+// (trigger_id set) or creates a trigger and its binding in one store
+// transaction (inline set, trigger_id empty).
+//
+// Responses: 404 for an unknown workload or trigger, 400 for a body that
+// names neither a trigger_id nor an inline trigger or that fails validation,
+// 409 on a uniqueness conflict, 500 on store failures, and 201 with the
+// created binding view on success. An omitted "enabled" defaults to true.
+func (s *Server) bindTriggerToWorkload(w http.ResponseWriter, r *http.Request) {
+	wid := chi.URLParam(r, "id")
+	if _, err := s.store.GetWorkloadByID(wid); err != nil {
+		if errors.Is(err, store.ErrNotFound) {
+			respondNotFound(w, "workload")
+			return
+		}
+		respondError(w, http.StatusInternalServerError, "get workload")
+		return
+	}
+	var req workloadBindRequest
+	if !decodeJSONStrict(w, r, &req) {
+		return
+	}
+	if req.TriggerID == "" && req.Inline == nil {
+		respondError(w, http.StatusBadRequest, "either trigger_id or inline trigger is required")
+		return
+	}
+
+	enabled := true
+	if req.Enabled != nil {
+		enabled = *req.Enabled
+	}
+
+	// Inline path: create trigger + binding atomically so a binding
+	// failure cannot leak a half-built trigger row.
+	if req.TriggerID == "" {
+		if err := validateTriggerRequest(*req.Inline); err != nil {
+			respondError(w, http.StatusBadRequest, err.Error())
+			return
+		}
+		inlineTrg := buildTriggerFromRequest(*req.Inline)
+		// Apply the same size cap and merged-config validation the
+		// existing-trigger path applies; previously an inline bind could
+		// persist an oversized or malformed override. With no override
+		// there is nothing to check beyond validateTriggerRequest above.
+		if len(req.BindingConfig) > 0 {
+			if err := validateBindingConfig(inlineTrg, req.BindingConfig); err != nil {
+				respondError(w, http.StatusBadRequest, err.Error())
+				return
+			}
+		}
+		_, b, err := s.store.CreateTriggerWithBindingTx(
+			inlineTrg,
+			store.WorkloadTriggerBinding{
+				WorkloadID:    wid,
+				BindingConfig: string(req.BindingConfig),
+				Enabled:       enabled,
+				SortOrder:     req.SortOrder,
+			},
+		)
+		if err != nil {
+			if errors.Is(err, store.ErrUnique) {
+				respondError(w, http.StatusConflict, "a trigger with this name already exists")
+				return
+			}
+			slog.Error("inline trigger+binding tx", "error", err)
+			respondError(w, http.StatusInternalServerError, "create inline trigger+binding")
+			return
+		}
+		respondJSON(w, http.StatusCreated, s.toBindingView(b))
+		return
+	}
+
+	// Existing-trigger path: just bind.
+	trg, err := s.store.GetTriggerByID(req.TriggerID)
+	if err != nil {
+		if errors.Is(err, store.ErrNotFound) {
+			respondNotFound(w, "trigger")
+			return
+		}
+		respondError(w, http.StatusInternalServerError, "get trigger")
+		return
+	}
+	if err := validateBindingConfig(trg, req.BindingConfig); err != nil {
+		respondError(w, http.StatusBadRequest, err.Error())
+		return
+	}
+	b, err := s.store.CreateBinding(store.WorkloadTriggerBinding{
+		WorkloadID:    wid,
+		TriggerID:     req.TriggerID,
+		BindingConfig: string(req.BindingConfig),
+		Enabled:       enabled,
+		SortOrder:     req.SortOrder,
+	})
+	if err != nil {
+		if errors.Is(err, store.ErrUnique) {
+			respondError(w, http.StatusConflict, "this workload is already bound to this trigger")
+			return
+		}
+		slog.Error("create binding from workload side", "error", err)
+		respondError(w, http.StatusInternalServerError, "create binding")
+		return
+	}
+	respondJSON(w, http.StatusCreated, s.toBindingView(b))
+}
+
+// deleteBinding removes the binding named by the {bid} URL parameter.
+// Responds 404 when the store reports it missing, 500 on any other store
+// failure, and 200 with {"deleted": <bid>} on success.
+func (s *Server) deleteBinding(w http.ResponseWriter, r *http.Request) {
+	bindingID := chi.URLParam(r, "bid")
+
+	switch err := s.store.DeleteBinding(bindingID); {
+	case err == nil:
+		respondJSON(w, http.StatusOK, map[string]string{"deleted": bindingID})
+	case errors.Is(err, store.ErrNotFound):
+		respondNotFound(w, "binding")
+	default:
+		respondError(w, http.StatusInternalServerError, "delete binding")
+	}
+}
@@ -0,0 +1,177 @@
+package api
+
+import (
+	"encoding/json"
+	"errors"
+	"io"
+	"log/slog"
+	"net/http"
+	"os"
+	"path/filepath"
+
+	"github.com/go-chi/chi/v5"
+
+	"github.com/alexei/tinyforge/internal/store"
+	"github.com/alexei/tinyforge/internal/volsnap"
+)
+
+// listWorkloadSnapshots handles GET /api/workloads/{id}/snapshots.
+//
+// It responds 503 while no snapshot engine is wired, 500 (with the detail
+// logged server-side only) when the engine cannot list, and 200 with the
+// engine's result otherwise.
+func (s *Server) listWorkloadSnapshots(w http.ResponseWriter, r *http.Request) {
+	engine := s.snapshotEngine
+	if engine == nil {
+		respondError(w, http.StatusServiceUnavailable, "snapshot engine not initialized")
+		return
+	}
+
+	workloadID := chi.URLParam(r, "id")
+	snapshots, listErr := engine.List(workloadID)
+	if listErr != nil {
+		slog.Error("snapshots: list", "workload", workloadID, "error", listErr)
+		respondError(w, http.StatusInternalServerError, "internal server error")
+		return
+	}
+	respondJSON(w, http.StatusOK, snapshots)
+}
+
+// snapshotableVolume is the sanitized view of a volume in the snapshotable
+// response. It exposes only the volume's target, scope and source and omits
+// the resolved host path, so internal filesystem layout is not leaked to
+// API clients.
+type snapshotableVolume struct {
+	Target string `json:"target"`
+	Scope  string `json:"scope"`
+	Source string `json:"source"`
+}
+
+// getWorkloadSnapshotable handles GET /api/workloads/{id}/snapshotable. It
+// tells the UI which volumes can be snapshotted and which are skipped (and
+// why), so users are never misled about coverage.
+//
+// Responses: 503 while no snapshot engine is wired, 404 for an unknown
+// workload, 500 (detail logged server-side only) on store or enumeration
+// failures, 200 with {"volumes": [...], "skipped": [...]} otherwise. Both
+// arrays are always present, never null.
+func (s *Server) getWorkloadSnapshotable(w http.ResponseWriter, r *http.Request) {
+	if s.snapshotEngine == nil {
+		respondError(w, http.StatusServiceUnavailable, "snapshot engine not initialized")
+		return
+	}
+	id := chi.URLParam(r, "id")
+	workload, err := s.store.GetWorkloadByID(id)
+	if err != nil {
+		// Only a genuinely missing row is a 404; a store failure must not
+		// masquerade as "workload not found".
+		if errors.Is(err, store.ErrNotFound) {
+			respondError(w, http.StatusNotFound, "workload not found")
+			return
+		}
+		slog.Error("snapshots: get workload", "workload", id, "error", err)
+		respondError(w, http.StatusInternalServerError, "internal server error")
+		return
+	}
+	settings, err := s.store.GetSettings()
+	if err != nil {
+		slog.Error("snapshots: get settings", "workload", id, "error", err)
+		respondError(w, http.StatusInternalServerError, "internal server error")
+		return
+	}
+	refs, skipped, err := volsnap.SnapshotableVolumes(s.store, workload, settings)
+	if err != nil {
+		slog.Error("snapshots: enumerate", "workload", id, "error", err)
+		respondError(w, http.StatusInternalServerError, "internal server error")
+		return
+	}
+
+	volumes := make([]snapshotableVolume, 0, len(refs))
+	for _, ref := range refs {
+		volumes = append(volumes, snapshotableVolume{Target: ref.Target, Scope: ref.Scope, Source: ref.Source})
+	}
+	// A nil slice would encode as JSON null; keep the contract "always an array".
+	if skipped == nil {
+		skipped = []volsnap.SkippedVolume{}
+	}
+	respondJSON(w, http.StatusOK, map[string]any{
+		"volumes": volumes,
+		"skipped": skipped,
+	})
+}
+
+// createWorkloadSnapshot handles POST /api/workloads/{id}/snapshots.
+//
+// The request body is optional; when present it is a JSON object with an
+// optional "label". At most 1 MiB of the body is read.
+//
+// Responses: 503 while no snapshot engine is wired, 404 for an unknown
+// workload, 400 for an undecodable body or when the engine reports
+// volsnap.ErrNoSnapshotData, 500 (detail logged server-side only) on any
+// other failure, 201 with the created snapshot on success.
+func (s *Server) createWorkloadSnapshot(w http.ResponseWriter, r *http.Request) {
+	if s.snapshotEngine == nil {
+		respondError(w, http.StatusServiceUnavailable, "snapshot engine not initialized")
+		return
+	}
+	id := chi.URLParam(r, "id")
+	workload, err := s.store.GetWorkloadByID(id)
+	if err != nil {
+		// Only a genuinely missing row is a 404; a store failure must not
+		// masquerade as "workload not found".
+		if errors.Is(err, store.ErrNotFound) {
+			respondError(w, http.StatusNotFound, "workload not found")
+			return
+		}
+		slog.Error("snapshots: get workload", "workload", id, "error", err)
+		respondError(w, http.StatusInternalServerError, "internal server error")
+		return
+	}
+	settings, err := s.store.GetSettings()
+	if err != nil {
+		slog.Error("snapshots: get settings", "workload", id, "error", err)
+		respondError(w, http.StatusInternalServerError, "internal server error")
+		return
+	}
+
+	var body struct {
+		Label string `json:"label"`
+	}
+	// ContentLength is -1 when the length is unknown (e.g. chunked), so "!= 0"
+	// still attempts a decode there; io.EOF then just means "no body".
+	if r.ContentLength != 0 {
+		if err := json.NewDecoder(io.LimitReader(r.Body, 1<<20)).Decode(&body); err != nil && !errors.Is(err, io.EOF) {
+			respondError(w, http.StatusBadRequest, "invalid JSON body")
+			return
+		}
+	}
+
+	snap, err := s.snapshotEngine.Create(workload, settings, body.Label)
+	if err != nil {
+		// "no snapshottable volume data" is client-actionable (400, safe to
+		// echo). Any other error is server-side: log the detail, return a
+		// generic 500 so internal paths / DB text never reach the client.
+		if errors.Is(err, volsnap.ErrNoSnapshotData) {
+			respondError(w, http.StatusBadRequest, err.Error())
+			return
+		}
+		slog.Error("snapshots: create", "workload", id, "error", err)
+		respondError(w, http.StatusInternalServerError, "internal server error")
+		return
+	}
+	respondJSON(w, http.StatusCreated, snap)
+}
+
+// deleteSnapshot handles DELETE /api/snapshots/{sid}.
+//
+// Responses: 503 while no snapshot engine is wired, 404 when the engine
+// reports store.ErrNotFound, 500 (detail logged server-side only) on any
+// other failure, 200 with {"status": "deleted"} on success.
+func (s *Server) deleteSnapshot(w http.ResponseWriter, r *http.Request) {
+	if s.snapshotEngine == nil {
+		respondError(w, http.StatusServiceUnavailable, "snapshot engine not initialized")
+		return
+	}
+	sid := chi.URLParam(r, "sid")
+	if err := s.snapshotEngine.Delete(sid); err != nil {
+		if errors.Is(err, store.ErrNotFound) {
+			respondError(w, http.StatusNotFound, "snapshot not found")
+			return
+		}
+		// Log like the list/create handlers do, so a failed delete is
+		// diagnosable from the server logs.
+		slog.Error("snapshots: delete", "snapshot", sid, "error", err)
+		respondError(w, http.StatusInternalServerError, "failed to delete snapshot")
+		return
+	}
+	respondJSON(w, http.StatusOK, map[string]string{"status": "deleted"})
+}
+
+// downloadSnapshot handles GET /api/snapshots/{sid}/download, streaming the
+// tar.gz archive. The path comes from the engine's FilePath, which is
+// expected to containment-check it against the snapshot directory; a
+// FilePath error is answered with 403.
+//
+// Responses: 503 while no snapshot engine is wired, 404 when the snapshot
+// record or its file is missing, 403 when FilePath rejects the snapshot, 500
+// (detail logged server-side only) on other failures. On success the file is
+// served via http.ServeContent as an attachment.
+func (s *Server) downloadSnapshot(w http.ResponseWriter, r *http.Request) {
+	if s.snapshotEngine == nil {
+		respondError(w, http.StatusServiceUnavailable, "snapshot engine not initialized")
+		return
+	}
+	sid := chi.URLParam(r, "sid")
+	snap, err := s.snapshotEngine.Get(sid)
+	if err != nil {
+		// Only a missing record is a 404; other engine/store failures are
+		// server errors and must not be reported as "not found".
+		if errors.Is(err, store.ErrNotFound) {
+			respondError(w, http.StatusNotFound, "snapshot not found")
+			return
+		}
+		slog.Error("snapshots: get", "snapshot", sid, "error", err)
+		respondError(w, http.StatusInternalServerError, "internal server error")
+		return
+	}
+	path, err := s.snapshotEngine.FilePath(snap)
+	if err != nil {
+		slog.Warn("snapshots: file path rejected", "snapshot", sid, "error", err)
+		respondError(w, http.StatusForbidden, "access denied")
+		return
+	}
+	f, err := os.Open(path)
+	if err != nil {
+		if errors.Is(err, os.ErrNotExist) {
+			respondError(w, http.StatusNotFound, "snapshot file not found on disk")
+			return
+		}
+		// Permission or I/O problems are not "not found".
+		slog.Error("snapshots: open archive", "snapshot", sid, "error", err)
+		respondError(w, http.StatusInternalServerError, "failed to read snapshot file")
+		return
+	}
+	defer f.Close()
+	stat, err := f.Stat()
+	if err != nil {
+		slog.Error("snapshots: stat archive", "snapshot", sid, "error", err)
+		respondError(w, http.StatusInternalServerError, "failed to read snapshot file")
+		return
+	}
+	// Base() strips any directory component before the name is echoed back.
+	name := filepath.Base(snap.Filename)
+	w.Header().Set("Content-Type", "application/gzip")
+	w.Header().Set("Content-Disposition", "attachment; filename=\""+name+"\"")
+	http.ServeContent(w, r, name, stat.ModTime(), f)
+}
@@ -0,0 +1,178 @@
+package api
+
+import (
+	"io"
+	"net/http"
+	"net/http/httptest"
+	"os"
+	"path/filepath"
+	"testing"
+
+	"github.com/alexei/tinyforge/internal/auth"
+	"github.com/alexei/tinyforge/internal/store"
+	"github.com/alexei/tinyforge/internal/volsnap"
+	"github.com/alexei/tinyforge/internal/webhook"
+)
+
+// newSnapshotEnv builds an API test env with the volume-snapshot engine wired
+// (the shared newAPITestEnv does not wire it). Snapshot archives are written
+// to one temp dir; the returned baseVol is a second temp dir stored as
+// settings.BaseVolumePath, i.e. where host-bind volume directories resolve.
+func newSnapshotEnv(t *testing.T) (*apiTestEnv, string) {
+	t.Helper()
+	st, err := store.New(":memory:")
+	if err != nil {
+		t.Fatalf("create store: %v", err)
+	}
+	t.Cleanup(func() { st.Close() })
+
+	encKey := [32]byte{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}
+	dispatcher := &fakeAPIDispatcher{}
+	wh := webhook.NewHandler(st)
+	wh.SetPluginDispatcher(dispatcher)
+	srv := NewServer(st, nil, nil, nil, dispatcher, nil, wh, nil, encKey)
+
+	snapEng, err := volsnap.New(st, t.TempDir())
+	if err != nil {
+		t.Fatalf("snapshot engine: %v", err)
+	}
+	srv.SetSnapshotEngine(snapEng)
+
+	httpsrv := httptest.NewServer(srv.Router())
+	t.Cleanup(httpsrv.Close)
+
+	la := auth.NewLocalAuth(encKey)
+	tok, err := la.GenerateToken(auth.Claims{UserID: "u-admin", Username: "admin", Role: "admin"})
+	if err != nil {
+		t.Fatalf("mint token: %v", err)
+	}
+
+	baseVol := t.TempDir()
+	// Fail loudly here: writing back a zero-valued settings struct after a
+	// failed read would hide the real cause of later test failures.
+	settings, err := st.GetSettings()
+	if err != nil {
+		t.Fatalf("get settings: %v", err)
+	}
+	settings.BaseVolumePath = baseVol
+	if err := st.UpdateSettings(settings); err != nil {
+		t.Fatalf("update settings: %v", err)
+	}
+
+	return &apiTestEnv{srv: httpsrv, store: st, dispatcher: dispatcher, adminToken: tok.Token, encKey: encKey}, baseVol
+}
+
+// TestVolumeSnapshots_EndToEnd walks the snapshot API through its whole
+// lifecycle for one workload with a single project-scoped volume:
+// snapshotable → create → list → download → delete → list (empty).
+func TestVolumeSnapshots_EndToEnd(t *testing.T) {
+	e, baseVol := newSnapshotEnv(t)
+
+	w, err := e.store.CreateWorkload(store.Workload{
+		Name:         "data-app",
+		Kind:         "project",
+		SourceKind:   "image",
+		SourceConfig: `{"image":"registry.example.com/owner/app","port":8080}`,
+	})
+	if err != nil {
+		t.Fatalf("create workload: %v", err)
+	}
+	if _, err := e.store.SetWorkloadVolume(store.WorkloadVolume{
+		WorkloadID: w.ID, Target: "/data", Source: "data", Scope: "project",
+	}); err != nil {
+		t.Fatalf("set volume: %v", err)
+	}
+
+	// Materialize the resolved host-bind dir with a file so there is data to
+	// capture. Layout mirrors ResolveWorkloadPath for project scope:
+	// <baseVol>/<name>-<id8>/<source>.
+	id8 := w.ID
+	if len(id8) > 8 {
+		id8 = id8[:8]
+	}
+	hostDir := filepath.Join(baseVol, "data-app-"+id8, "data")
+	if err := os.MkdirAll(hostDir, 0o755); err != nil {
+		t.Fatal(err)
+	}
+	if err := os.WriteFile(filepath.Join(hostDir, "payload.txt"), []byte("important"), 0o644); err != nil {
+		t.Fatal(err)
+	}
+
+	// snapshotable lists the one host-bind volume.
+	resp := e.do(t, http.MethodGet, "/api/workloads/"+w.ID+"/snapshotable", nil)
+	if resp.StatusCode != http.StatusOK {
+		t.Fatalf("snapshotable status = %d", resp.StatusCode)
+	}
+	var snapable struct {
+		Volumes []map[string]string `json:"volumes"`
+		Skipped []map[string]string `json:"skipped"`
+	}
+	decodeEnvelope(t, resp, &snapable)
+	if len(snapable.Volumes) != 1 || snapable.Volumes[0]["target"] != "/data" {
+		t.Fatalf("expected 1 snapshotable volume /data, got %+v", snapable)
+	}
+
+	// Create a snapshot.
+	resp = e.do(t, http.MethodPost, "/api/workloads/"+w.ID+"/snapshots", map[string]string{"label": "before upgrade"})
+	if resp.StatusCode != http.StatusCreated {
+		t.Fatalf("create snapshot status = %d", resp.StatusCode)
+	}
+	var snap store.VolumeSnapshot
+	decodeEnvelope(t, resp, &snap)
+	if snap.ID == "" || snap.SizeBytes == 0 || snap.Label != "before upgrade" {
+		t.Fatalf("unexpected snapshot: %+v", snap)
+	}
+
+	// It appears in the list.
+	resp = e.do(t, http.MethodGet, "/api/workloads/"+w.ID+"/snapshots", nil)
+	if resp.StatusCode != http.StatusOK {
+		t.Fatalf("list status = %d", resp.StatusCode)
+	}
+	var list []store.VolumeSnapshot
+	decodeEnvelope(t, resp, &list)
+	if len(list) != 1 || list[0].ID != snap.ID {
+		t.Fatalf("expected 1 snapshot in list, got %+v", list)
+	}
+
+	// Download streams a non-empty gzip archive (not the JSON envelope).
+	resp = e.do(t, http.MethodGet, "/api/snapshots/"+snap.ID+"/download", nil)
+	if resp.StatusCode != http.StatusOK {
+		t.Fatalf("download status = %d", resp.StatusCode)
+	}
+	if ct := resp.Header.Get("Content-Type"); ct != "application/gzip" {
+		t.Errorf("download content-type = %q, want application/gzip", ct)
+	}
+	data, err := io.ReadAll(resp.Body)
+	resp.Body.Close()
+	if err != nil {
+		t.Fatalf("read download body: %v", err)
+	}
+	if len(data) == 0 {
+		t.Error("download body is empty")
+	}
+
+	// Delete removes it.
+	resp = e.do(t, http.MethodDelete, "/api/snapshots/"+snap.ID, nil)
+	if resp.StatusCode != http.StatusOK {
+		t.Fatalf("delete status = %d", resp.StatusCode)
+	}
+	resp.Body.Close()
+	resp = e.do(t, http.MethodGet, "/api/workloads/"+w.ID+"/snapshots", nil)
+	if resp.StatusCode != http.StatusOK {
+		t.Fatalf("list-after-delete status = %d", resp.StatusCode)
+	}
+	var after []store.VolumeSnapshot
+	decodeEnvelope(t, resp, &after)
+	if len(after) != 0 {
+		t.Fatalf("expected 0 snapshots after delete, got %d", len(after))
+	}
+}
+
+// TestCreateSnapshot_NoVolumeData_Returns400 verifies that requesting a
+// snapshot for a workload with no volumes configured is answered with 400.
+func TestCreateSnapshot_NoVolumeData_Returns400(t *testing.T) {
+	e, _ := newSnapshotEnv(t)
+	w, err := e.store.CreateWorkload(store.Workload{
+		Name:         "no-vol-app",
+		Kind:         "project",
+		SourceKind:   "image",
+		SourceConfig: `{"image":"x","port":80}`,
+	})
+	if err != nil {
+		t.Fatalf("create workload: %v", err)
+	}
+	resp := e.do(t, http.MethodPost, "/api/workloads/"+w.ID+"/snapshots", nil)
+	// Deferred so the body is released on the failing path too.
+	defer resp.Body.Close()
+	if resp.StatusCode != http.StatusBadRequest {
+		t.Fatalf("expected 400 for an app with no snapshottable volumes, got %d", resp.StatusCode)
+	}
+}
+
+// TestSnapshotEndpoints_RequireWorkload verifies that the snapshotable
+// endpoint answers 404 for a workload ID that does not exist.
+func TestSnapshotEndpoints_RequireWorkload(t *testing.T) {
+	e, _ := newSnapshotEnv(t)
+	// snapshotable on an unknown workload → 404.
+	resp := e.do(t, http.MethodGet, "/api/workloads/does-not-exist/snapshotable", nil)
+	// Deferred so the body is released on the failing path too.
+	defer resp.Body.Close()
+	if resp.StatusCode != http.StatusNotFound {
+		t.Fatalf("snapshotable unknown workload = %d, want 404", resp.StatusCode)
+	}
+}
@@ -1,143 +0,0 @@
-package api
-
-import (
-	"errors"
-	"net/http"
-
-	"github.com/go-chi/chi/v5"
-
-	"github.com/alexei/docker-watcher/internal/store"
-)
-
-// volumeRequest is the expected JSON body for creating/updating a volume.
-type volumeRequest struct {
-	Source string `json:"source"`
-	Target string `json:"target"`
-	Mode   string `json:"mode"`
-}
-
-// listVolumes handles GET /api/projects/{id}/volumes.
-func (s *Server) listVolumes(w http.ResponseWriter, r *http.Request) {
-	projectID := chi.URLParam(r, "id")
-
-	// Verify project exists.
-	if _, err := s.store.GetProjectByID(projectID); err != nil {
-		if errors.Is(err, store.ErrNotFound) {
-			respondNotFound(w, "project")
-			return
-		}
-		respondError(w, http.StatusInternalServerError, "failed to get project: "+err.Error())
-		return
-	}
-
-	vols, err := s.store.GetVolumesByProjectID(projectID)
-	if err != nil {
-		respondError(w, http.StatusInternalServerError, "failed to list volumes: "+err.Error())
-		return
-	}
-
-	respondJSON(w, http.StatusOK, vols)
-}
-
-// createVolume handles POST /api/projects/{id}/volumes.
-func (s *Server) createVolume(w http.ResponseWriter, r *http.Request) {
-	projectID := chi.URLParam(r, "id")
-
-	// Verify project exists.
-	if _, err := s.store.GetProjectByID(projectID); err != nil {
-		if errors.Is(err, store.ErrNotFound) {
-			respondNotFound(w, "project")
-			return
-		}
-		respondError(w, http.StatusInternalServerError, "failed to get project: "+err.Error())
-		return
-	}
-
-	var req volumeRequest
-	if !decodeJSON(w, r, &req) {
-		return
-	}
-
-	if req.Source == "" {
-		respondError(w, http.StatusBadRequest, "source is required")
-		return
-	}
-	if req.Target == "" {
-		respondError(w, http.StatusBadRequest, "target is required")
-		return
-	}
-	if req.Mode == "" {
-		req.Mode = "shared"
-	}
-	if req.Mode != "shared" && req.Mode != "isolated" {
-		respondError(w, http.StatusBadRequest, "mode must be 'shared' or 'isolated'")
-		return
-	}
-
-	vol, err := s.store.CreateVolume(store.Volume{
-		ProjectID: projectID,
-		Source:    req.Source,
-		Target:   req.Target,
-		Mode:     req.Mode,
-	})
-	if err != nil {
-		respondError(w, http.StatusInternalServerError, "failed to create volume: "+err.Error())
-		return
-	}
-	respondJSON(w, http.StatusCreated, vol)
-}
-
-// updateVolume handles PUT /api/projects/{id}/volumes/{volId}.
-func (s *Server) updateVolume(w http.ResponseWriter, r *http.Request) {
-	volID := chi.URLParam(r, "volId")
-
-	existing, err := s.store.GetVolumeByID(volID)
-	if err != nil {
-		if errors.Is(err, store.ErrNotFound) {
-			respondNotFound(w, "volume")
-			return
-		}
-		respondError(w, http.StatusInternalServerError, "failed to get volume: "+err.Error())
-		return
-	}
-
-	var req volumeRequest
-	if !decodeJSON(w, r, &req) {
-		return
-	}
-
-	updated := existing
-	if req.Source != "" {
-		updated.Source = req.Source
-	}
-	if req.Target != "" {
-		updated.Target = req.Target
-	}
-	if req.Mode != "" {
-		if req.Mode != "shared" && req.Mode != "isolated" {
-			respondError(w, http.StatusBadRequest, "mode must be 'shared' or 'isolated'")
-			return
-		}
-		updated.Mode = req.Mode
-	}
-
-	if err := s.store.UpdateVolume(updated); err != nil {
-		respondError(w, http.StatusInternalServerError, "failed to update volume: "+err.Error())
-		return
-	}
-	respondJSON(w, http.StatusOK, updated)
-}
-
-// deleteVolume handles DELETE /api/projects/{id}/volumes/{volId}.
-func (s *Server) deleteVolume(w http.ResponseWriter, r *http.Request) {
-	volID := chi.URLParam(r, "volId")
-	if err := s.store.DeleteVolume(volID); err != nil {
-		if errors.Is(err, store.ErrNotFound) {
-			respondNotFound(w, "volume")
-			return
-		}
-		respondError(w, http.StatusInternalServerError, "failed to delete volume: "+err.Error())
-		return
-	}
-	respondJSON(w, http.StatusOK, map[string]string{"deleted": volID})
-}
@@ -0,0 +1,250 @@
+package api
+
+import (
+	"encoding/json"
+	"errors"
+	"log/slog"
+	"net/http"
+	"strings"
+	"time"
+
+	"github.com/go-chi/chi/v5"
+
+	"github.com/alexei/tinyforge/internal/auth"
+	"github.com/alexei/tinyforge/internal/store"
+	"github.com/alexei/tinyforge/internal/workload/plugin"
+	"github.com/alexei/tinyforge/internal/workload/preview"
+)
+
+// chainNode is the lightweight shape returned by /chain — we deliberately
+// don't return full plugin.Workload values for ancestor/descendant rows
+// because the secret fields don't belong in a chain-traversal response.
+//
+// IsPreview / PreviewBranch surface branch-preview children to the UI so it
+// can render them in a dedicated "Preview environments" panel rather than as
+// undistinguished stage children. They are computed against the chain's
+// `self` workload via preview.IsPreviewChild — the canonical "this child is a
+// branch preview" test that reverses the MaterializeForBranch naming formula.
+// Both are zero-valued (false / "") for the parent and self nodes and for
+// operator-created stage children.
+//
+// On the wire, is_preview is always present, while preview_branch is dropped
+// when empty (omitempty).
+type chainNode struct {
+	ID            string `json:"id"`
+	Name          string `json:"name"`
+	SourceKind    string `json:"source_kind"`
+	TriggerKind   string `json:"trigger_kind"`
+	IsPreview     bool   `json:"is_preview"`
+	PreviewBranch string `json:"preview_branch,omitempty"`
+	CreatedAt     string `json:"created_at"`
+	UpdatedAt     string `json:"updated_at"`
+}
+
+func chainNodeOf(w store.Workload) chainNode {
+	return chainNode{
+		ID:          w.ID,
+		Name:        w.Name,
+		SourceKind:  w.SourceKind,
+		TriggerKind: w.TriggerKind,
+		CreatedAt:   w.CreatedAt,
+		UpdatedAt:   w.UpdatedAt,
+	}
+}
+
+// previewBranchOf reads the `branch` key out of a workload's source_config
+// (the key MaterializeForBranch wrote when it materialized the preview
+// child). It returns "" when the config is empty, cannot be decoded, or has
+// no branch. Callers only invoke it for rows preview.IsPreviewChild already
+// confirmed, so a blank result just means the JSON was unusable.
+func previewBranchOf(w store.Workload) string {
+	if w.SourceConfig == "" {
+		return ""
+	}
+	var decoded struct {
+		Branch string `json:"branch"`
+	}
+	// Decode errors are ignored on purpose: "" is the documented fallback.
+	_ = json.Unmarshal([]byte(w.SourceConfig), &decoded)
+	return decoded.Branch
+}
+
+// childChainNode returns the chainNode for a child row. When
+// preview.IsPreviewChild says the child was materialized from `self`, the
+// node is flagged as a preview and carries the branch read from the child's
+// source_config; otherwise the preview fields stay zero-valued.
+func childChainNode(self, child store.Workload) chainNode {
+	out := chainNodeOf(child)
+	if !preview.IsPreviewChild(self, child) {
+		return out
+	}
+	out.IsPreview, out.PreviewBranch = true, previewBranchOf(child)
+	return out
+}
+
+// getWorkloadChain handles GET /api/workloads/{id}/chain.
+//
+// The response holds the workload's parent (null when there is none or it
+// could not be loaded), the workload itself, and its direct children — one
+// hop in each direction along the parent_workload_id graph. Deeper traversal
+// is the client's job: the chain is a tree the user builds incrementally,
+// and a recursive server-side walk would surprise operators with O(N) loads
+// on big graphs.
+//
+// A failed parent lookup never fails the request; anything other than
+// "not found" is logged as a warning.
+func (s *Server) getWorkloadChain(w http.ResponseWriter, r *http.Request) {
+	id := chi.URLParam(r, "id")
+
+	current, err := s.store.GetWorkloadByID(id)
+	switch {
+	case errors.Is(err, store.ErrNotFound):
+		respondNotFound(w, "workload")
+		return
+	case err != nil:
+		respondError(w, http.StatusInternalServerError, "get workload")
+		return
+	}
+
+	var parent *chainNode
+	if parentID := current.ParentWorkloadID; parentID != "" {
+		row, lookupErr := s.store.GetWorkloadByID(parentID)
+		switch {
+		case lookupErr == nil:
+			node := chainNodeOf(row)
+			parent = &node
+		case !errors.Is(lookupErr, store.ErrNotFound):
+			slog.Warn("chain: parent lookup failed", "workload", id, "parent", parentID, "error", lookupErr)
+		}
+	}
+
+	kids, err := s.store.ListChildrenByParent(current.ID)
+	if err != nil {
+		respondError(w, http.StatusInternalServerError, "list children")
+		return
+	}
+	children := make([]chainNode, len(kids))
+	for i, kid := range kids {
+		children[i] = childChainNode(current, kid)
+	}
+
+	respondJSON(w, http.StatusOK, map[string]any{
+		"parent":   parent,
+		"self":     chainNodeOf(current),
+		"children": children,
+	})
+}
+
+// promoteFromRequest is the optional JSON body of /promote-from.
+//
+// ImageTag is optional — when blank the server falls back to the image tag
+// of the first running container listed for the source workload. Deploy
+// asks for a deploy to be dispatched right after the tag is written; when
+// false (the default) only the config is updated.
+//
+// The endpoint is intentionally non-destructive: it updates the target's
+// SourceConfig.default_tag and does not change parent_workload_id.
+type promoteFromRequest struct {
+	ImageTag string `json:"image_tag"`
+	Deploy   bool   `json:"deploy"`
+}
+
+// promoteFromWorkload handles POST /api/workloads/{id}/promote-from/{sourceID}.
+//
+// Copies the source workload's currently-running image tag (or the explicit
+// image_tag from the body) into the target's SourceConfig.default_tag,
+// optionally triggering an immediate deploy. The target's existing config
+// blob is preserved aside from the promoted field. Both workloads must use
+// the same source_kind (image) — promoting across kinds is undefined and
+// rejected.
+//
+// Responses: 400 when target == source, when either workload is not an image
+// workload, or when no tag can be resolved; 404 for an unknown target or
+// source; 500 on store, encoding or dispatch failures; 200 with the promoted
+// tag and whether a deploy was queued on success. Note that the config
+// update is already persisted when a requested dispatch fails.
+func (s *Server) promoteFromWorkload(w http.ResponseWriter, r *http.Request) {
+	targetID := chi.URLParam(r, "id")
+	sourceID := chi.URLParam(r, "sourceID")
+	if targetID == sourceID {
+		respondError(w, http.StatusBadRequest, "target and source must differ")
+		return
+	}
+
+	target, err := s.store.GetWorkloadByID(targetID)
+	if err != nil {
+		if errors.Is(err, store.ErrNotFound) {
+			respondNotFound(w, "workload")
+			return
+		}
+		respondError(w, http.StatusInternalServerError, "get target workload")
+		return
+	}
+	source, err := s.store.GetWorkloadByID(sourceID)
+	if err != nil {
+		if errors.Is(err, store.ErrNotFound) {
+			respondNotFound(w, "source workload")
+			return
+		}
+		respondError(w, http.StatusInternalServerError, "get source workload")
+		return
+	}
+	if target.SourceKind != "image" || source.SourceKind != "image" {
+		respondError(w, http.StatusBadRequest, "promote-from is only defined for image source workloads on both ends")
+		return
+	}
+
+	// The body is optional. NOTE(review): a body sent without a declared
+	// length (ContentLength == -1, e.g. chunked) is treated as absent here.
+	var req promoteFromRequest
+	if r.ContentLength > 0 {
+		if !decodeJSONStrict(w, r, &req) {
+			return
+		}
+	}
+
+	// Resolve the tag: explicit override wins; otherwise pick the running
+	// container's image_tag on the source workload.
+	tag := strings.TrimSpace(req.ImageTag)
+	if tag == "" {
+		rows, err := s.store.ListContainersByWorkload(sourceID)
+		if err != nil {
+			respondError(w, http.StatusInternalServerError, "list source containers")
+			return
+		}
+		for _, c := range rows {
+			if c.State == "running" && c.ImageTag != "" {
+				tag = c.ImageTag
+				break
+			}
+		}
+		if tag == "" {
+			respondError(w, http.StatusBadRequest, "source workload has no running container; specify image_tag explicitly")
+			return
+		}
+	}
+
+	// Decode target source_config, patch default_tag, re-encode.
+	cfg := map[string]any{}
+	if target.SourceConfig != "" && target.SourceConfig != "{}" {
+		if err := json.Unmarshal([]byte(target.SourceConfig), &cfg); err != nil {
+			respondError(w, http.StatusInternalServerError, "decode target source_config")
+			return
+		}
+	}
+	// json.Unmarshal sets the map to nil for a JSON `null` document; writing
+	// to a nil map panics, so start from an empty object in that case.
+	if cfg == nil {
+		cfg = map[string]any{}
+	}
+	cfg["default_tag"] = tag
+	patched, err := json.Marshal(cfg)
+	if err != nil {
+		respondError(w, http.StatusInternalServerError, "encode target source_config")
+		return
+	}
+	target.SourceConfig = string(patched)
+	if err := s.store.UpdateWorkload(target); err != nil {
+		slog.Error("promote: update target", "target", targetID, "error", err)
+		respondError(w, http.StatusInternalServerError, "update target workload")
+		return
+	}
+
+	// Attribute the deploy to the authenticated user when there is one.
+	actor := "promote"
+	if claims, ok := auth.ClaimsFromContext(r.Context()); ok && claims.Username != "" {
+		actor = claims.Username
+	}
+	resp := map[string]any{
+		"workload_id":   targetID,
+		"source_id":     sourceID,
+		"promoted_tag":  tag,
+		"deploy_queued": false,
+	}
+	if req.Deploy {
+		intent := plugin.DeploymentIntent{
+			Reason:      "promote",
+			Reference:   tag,
+			Metadata:    map[string]string{"source_workload_id": sourceID},
+			TriggeredAt: time.Now().UTC(),
+			TriggeredBy: actor,
+		}
+		if err := s.deployer.DispatchPlugin(r.Context(), toPluginWorkload(target), intent); err != nil {
+			// Keep the raw error out of the response; it is logged instead.
+			slog.Warn("promote: dispatch failed", "target", targetID, "error", err)
+			respondError(w, http.StatusInternalServerError, "dispatch failed; see server logs")
+			return
+		}
+		resp["deploy_queued"] = true
+	}
+	respondJSON(w, http.StatusOK, resp)
+}
+
@@ -0,0 +1,147 @@
+package api
+
+import (
+	"testing"
+
+	"github.com/alexei/tinyforge/internal/store"
+)
+
+// TestChildChainNode_MarksPreviewChildren feeds childChainNode one template
+// plus a table of children and checks the IsPreview / PreviewBranch fields on
+// the node it returns. The cases separate branch-preview children from
+// operator-created stage children that merely share the parent link: a
+// preview is expected to be parented by the template, carry a branch in its
+// source_config, and be named template.Name + "/" + slug-of-branch.
+// NOTE(review): the original note attributes the discriminator to
+// preview.IsPreviewChild / MaterializeForBranch; neither is visible here.
+func TestChildChainNode_MarksPreviewChildren(t *testing.T) {
+	parent := store.Workload{
+		ID:         "tmpl-1",
+		Name:       "myapp",
+		SourceKind: "dockerfile",
+	}
+
+	// mkChild builds a dockerfile-sourced child row; only the fields that
+	// differ between cases are parameters.
+	mkChild := func(id, name, sourceConfig, parentID string) store.Workload {
+		return store.Workload{
+			ID:               id,
+			Name:             name,
+			SourceKind:       "dockerfile",
+			SourceConfig:     sourceConfig,
+			ParentWorkloadID: parentID,
+		}
+	}
+
+	cases := []struct {
+		name        string
+		child       store.Workload
+		wantPreview bool
+		wantBranch  string
+	}{
+		{
+			name:        "preview child is marked with its branch",
+			child:       mkChild("child-prev", "myapp/feat-login", `{"branch":"feat/login","port":3000}`, "tmpl-1"),
+			wantPreview: true,
+			wantBranch:  "feat/login",
+		},
+		{
+			name:        "operator-named stage child sharing the parent is not a preview",
+			child:       mkChild("child-stage", "myapp-staging", `{"branch":"main"}`, "tmpl-1"),
+			wantPreview: false,
+		},
+		{
+			name:        "child of a different parent is not a preview of self",
+			child:       mkChild("child-other", "myapp/feat-login", `{"branch":"feat/login"}`, "some-other-template"),
+			wantPreview: false,
+		},
+		{
+			name:        "child with no branch in source_config is not a preview",
+			child:       mkChild("child-nobranch", "myapp/feat-login", `{}`, "tmpl-1"),
+			wantPreview: false,
+		},
+		{
+			// Right parent and a valid branch, but the name has an extra
+			// suffix, so only the slug-equality check fails (expected
+			// "myapp/feat-login", got "myapp/feat-login-staging"). A branch
+			// on its own must not be enough to mark a preview.
+			name:        "valid branch but name fails the slug match is not a preview",
+			child:       mkChild("child-slugmiss", "myapp/feat-login-staging", `{"branch":"feat/login","port":3000}`, "tmpl-1"),
+			wantPreview: false,
+		},
+		{
+			// Mixed-case branch containing a slash: the expected slug is
+			// lowercased with "/" mapped to "-", so "Feature/Login" lines up
+			// with the name "myapp/feature-login". PreviewBranch must carry
+			// the RAW branch from source_config, not the slug.
+			name:        "uppercase slash branch matches and keeps raw branch",
+			child:       mkChild("child-upper", "myapp/feature-login", `{"branch":"Feature/Login","port":8080}`, "tmpl-1"),
+			wantPreview: true,
+			wantBranch:  "Feature/Login",
+		},
+	}
+
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			got := childChainNode(parent, tc.child)
+			if got.IsPreview != tc.wantPreview {
+				t.Errorf("IsPreview = %v, want %v", got.IsPreview, tc.wantPreview)
+			}
+			if got.PreviewBranch != tc.wantBranch {
+				t.Errorf("PreviewBranch = %q, want %q", got.PreviewBranch, tc.wantBranch)
+			}
+			// ID and Name must be copied through whether or not the child
+			// is a preview.
+			sameID := got.ID == tc.child.ID
+			sameName := got.Name == tc.child.Name
+			if !sameID || !sameName {
+				t.Errorf("base fields mangled: got id=%q name=%q", got.ID, got.Name)
+			}
+		})
+	}
+}
+
+// TestPreviewBranchOf_ToleratesMalformedConfig checks that previewBranchOf
+// yields the "branch" value from a well-formed source_config and falls back
+// to "" (without panicking) when the config is empty, an empty object, or
+// not valid JSON.
+func TestPreviewBranchOf_ToleratesMalformedConfig(t *testing.T) {
+	type scenario struct {
+		label        string
+		sourceConfig string
+		wantBranch   string
+	}
+	scenarios := []scenario{
+		{label: "valid branch", sourceConfig: `{"branch":"release/v1"}`, wantBranch: "release/v1"},
+		{label: "empty config", sourceConfig: ``, wantBranch: ""},
+		{label: "empty object", sourceConfig: `{}`, wantBranch: ""},
+		{label: "malformed json", sourceConfig: `{not-json`, wantBranch: ""},
+	}
+	for _, sc := range scenarios {
+		t.Run(sc.label, func(t *testing.T) {
+			branch := previewBranchOf(store.Workload{SourceConfig: sc.sourceConfig})
+			if branch == sc.wantBranch {
+				return
+			}
+			t.Errorf("previewBranchOf(%q) = %q, want %q", sc.sourceConfig, branch, sc.wantBranch)
+		})
+	}
+}
@@ -0,0 +1,89 @@
+package api
+
+import (
+	"encoding/json"
+	"log/slog"
+
+	"github.com/alexei/tinyforge/internal/store"
+	"github.com/alexei/tinyforge/internal/workload/plugin"
+)
+
+// toPluginWorkload maps a persisted store.Workload row onto the
+// plugin.Workload value handed to Source / Trigger plugins. Per the original
+// design note it lives in the api package so the plugin package does not
+// have to import store.
+//
+// SourceConfig and TriggerConfig are forwarded untouched as raw JSON; the
+// store's AppID becomes the plugin's GroupID. PublicFaces is the one column
+// decoded here: an empty column, or one that fails to parse, yields a nil
+// slice (a parse failure is logged at warn level rather than returned).
+func toPluginWorkload(w store.Workload) plugin.Workload {
+	out := plugin.Workload{
+		ID:                      w.ID,
+		Name:                    w.Name,
+		GroupID:                 w.AppID,
+		ParentWorkloadID:        w.ParentWorkloadID,
+		SourceKind:              w.SourceKind,
+		SourceConfig:            json.RawMessage(w.SourceConfig),
+		TriggerKind:             w.TriggerKind,
+		TriggerConfig:           json.RawMessage(w.TriggerConfig),
+		NotificationURL:         w.NotificationURL,
+		NotificationSecret:      w.NotificationSecret,
+		WebhookSecret:           w.WebhookSecret,
+		WebhookSigningSecret:    w.WebhookSigningSecret,
+		WebhookRequireSignature: w.WebhookRequireSignature,
+		CreatedAt:               w.CreatedAt,
+		UpdatedAt:               w.UpdatedAt,
+	}
+	if w.PublicFaces == "" {
+		return out
+	}
+
+	var parsed []plugin.PublicFace
+	if err := json.Unmarshal([]byte(w.PublicFaces), &parsed); err != nil {
+		// Discard whatever was partially decoded: PublicFaces stays nil.
+		slog.Warn("workload: invalid public_faces JSON, treating as empty",
+			"workload", w.ID, "error", err)
+		return out
+	}
+	out.PublicFaces = parsed
+	return out
+}
+
+// fromPluginWorkload is the reverse of toPluginWorkload: it turns a
+// plugin.Workload into a store.Workload row (the original note names the
+// /api/workloads create + update handlers as its callers).
+//
+// PublicFaces is re-encoded to JSON here so the column's shape is produced
+// in one place; an empty slice is stored as "[]". Empty SourceConfig /
+// TriggerConfig blobs are stored as "{}". The only error returned is a
+// failure to marshal PublicFaces, in which case the row is the zero value.
+func fromPluginWorkload(p plugin.Workload) (store.Workload, error) {
+	encodedFaces := "[]"
+	if len(p.PublicFaces) != 0 {
+		raw, err := json.Marshal(p.PublicFaces)
+		if err != nil {
+			return store.Workload{}, err
+		}
+		encodedFaces = string(raw)
+	}
+
+	// orEmptyObject substitutes "{}" for a missing config blob.
+	orEmptyObject := func(raw json.RawMessage) string {
+		if len(raw) == 0 {
+			return "{}"
+		}
+		return string(raw)
+	}
+
+	row := store.Workload{
+		ID:                      p.ID,
+		Name:                    p.Name,
+		AppID:                   p.GroupID,
+		ParentWorkloadID:        p.ParentWorkloadID,
+		SourceKind:              p.SourceKind,
+		SourceConfig:            orEmptyObject(p.SourceConfig),
+		TriggerKind:             p.TriggerKind,
+		TriggerConfig:           orEmptyObject(p.TriggerConfig),
+		PublicFaces:             encodedFaces,
+		NotificationURL:         p.NotificationURL,
+		NotificationSecret:      p.NotificationSecret,
+		WebhookSecret:           p.WebhookSecret,
+		WebhookSigningSecret:    p.WebhookSigningSecret,
+		WebhookRequireSignature: p.WebhookRequireSignature,
+		CreatedAt:               p.CreatedAt,
+		UpdatedAt:               p.UpdatedAt,
+	}
+	return row, nil
+}
@@ -0,0 +1,165 @@
+package api
+
+import (
+	"errors"
+	"log/slog"
+	"net/http"
+	"strings"
+
+	"github.com/go-chi/chi/v5"
+
+	"github.com/alexei/tinyforge/internal/crypto"
+	"github.com/alexei/tinyforge/internal/store"
+)
+
+// workloadEnvRow is the JSON shape returned to clients for one workload
+// environment variable. Plaintext is redacted for encrypted entries — once
+// a value is encrypted, the server treats it as write-only. To rotate, the
+// operator submits a new value; to read, they have to look at the running
+// container.
+//
+// Value is left empty by listWorkloadEnv for rows with Encrypted=true and is
+// always empty in setWorkloadEnv's response. The remaining fields are copied
+// from the store row as-is.
+type workloadEnvRow struct {
+	ID         string `json:"id"`
+	WorkloadID string `json:"workload_id"`
+	Key        string `json:"key"`
+	Value      string `json:"value"`
+	Encrypted  bool   `json:"encrypted"`
+	CreatedAt  string `json:"created_at"`
+	UpdatedAt  string `json:"updated_at"`
+}
+
+// listWorkloadEnv returns the env rows of the workload named by the "id"
+// URL parameter as a JSON array. It answers 404 when the workload does not
+// exist and 500 when either store call fails. Values of encrypted rows are
+// blanked before they leave the server; plaintext rows are returned as
+// stored.
+func (s *Server) listWorkloadEnv(w http.ResponseWriter, r *http.Request) {
+	workloadID := chi.URLParam(r, "id")
+
+	_, err := s.store.GetWorkloadByID(workloadID)
+	switch {
+	case errors.Is(err, store.ErrNotFound):
+		respondNotFound(w, "workload")
+		return
+	case err != nil:
+		respondError(w, http.StatusInternalServerError, "get workload")
+		return
+	}
+
+	entries, err := s.store.ListWorkloadEnv(workloadID)
+	if err != nil {
+		respondError(w, http.StatusInternalServerError, "list workload env")
+		return
+	}
+
+	// Sized up front so an empty listing still encodes as [] rather than null.
+	rows := make([]workloadEnvRow, len(entries))
+	for i, entry := range entries {
+		visible := entry.Value
+		if entry.Encrypted {
+			visible = "" // write-only after encryption
+		}
+		rows[i] = workloadEnvRow{
+			ID:         entry.ID,
+			WorkloadID: entry.WorkloadID,
+			Key:        entry.Key,
+			Value:      visible,
+			Encrypted:  entry.Encrypted,
+			CreatedAt:  entry.CreatedAt,
+			UpdatedAt:  entry.UpdatedAt,
+		}
+	}
+	respondJSON(w, http.StatusOK, rows)
+}
+
+// setWorkloadEnvRequest is the POST/PUT body accepted by setWorkloadEnv.
+// Key is trimmed and must pass validEnvKey. Encrypted=true causes the
+// server to encrypt a non-empty Value at rest with the global encryption
+// key; an empty Value is stored as-is even when Encrypted is set.
+type setWorkloadEnvRequest struct {
+	Key       string `json:"key"`
+	Value     string `json:"value"`
+	Encrypted bool   `json:"encrypted"`
+}
+
+// setWorkloadEnv stores one environment variable for the workload named by
+// the "id" URL parameter and responds with the stored row, minus its value.
+//
+// Responses: 404 when the workload does not exist; 400 when the key is
+// empty after trimming or fails validEnvKey; 500 when the workload lookup,
+// encryption, or the store write fails. When the request asks for
+// encryption and the value is non-empty, the value is encrypted with the
+// server's key before it reaches the store.
+func (s *Server) setWorkloadEnv(w http.ResponseWriter, r *http.Request) {
+	id := chi.URLParam(r, "id")
+	if _, err := s.store.GetWorkloadByID(id); err != nil {
+		if errors.Is(err, store.ErrNotFound) {
+			respondNotFound(w, "workload")
+			return
+		}
+		respondError(w, http.StatusInternalServerError, "get workload")
+		return
+	}
+	var req setWorkloadEnvRequest
+	if !decodeJSONStrict(w, r, &req) {
+		return
+	}
+	req.Key = strings.TrimSpace(req.Key)
+	if req.Key == "" {
+		respondError(w, http.StatusBadRequest, "key is required")
+		return
+	}
+	if !validEnvKey(req.Key) {
+		respondError(w, http.StatusBadRequest, "key must match [A-Za-z_][A-Za-z0-9_]*")
+		return
+	}
+	value := req.Value
+	if req.Encrypted && value != "" {
+		enc, err := crypto.Encrypt(s.encKey, value)
+		if err != nil {
+			// Record the cause server-side (never the value itself) so a
+			// 500 here is diagnosable, matching the store-failure branch
+			// below.
+			slog.Error("set workload env: encrypt value", "workload", id, "key", req.Key, "error", err)
+			respondError(w, http.StatusInternalServerError, "encrypt value")
+			return
+		}
+		value = enc
+	}
+	row, err := s.store.SetWorkloadEnv(store.WorkloadEnv{
+		WorkloadID: id,
+		Key:        req.Key,
+		Value:      value,
+		Encrypted:  req.Encrypted,
+	})
+	if err != nil {
+		slog.Error("set workload env", "workload", id, "key", req.Key, "error", err)
+		respondError(w, http.StatusInternalServerError, "set workload env")
+		return
+	}
+	respondJSON(w, http.StatusOK, workloadEnvRow{
+		ID:         row.ID,
+		WorkloadID: row.WorkloadID,
+		Key:        row.Key,
+		Value:      "", // never echo even fresh writes — caller already has it
+		Encrypted:  row.Encrypted,
+		CreatedAt:  row.CreatedAt,
+		UpdatedAt:  row.UpdatedAt,
+	})
+}
+
+// deleteWorkloadEnv removes the env row named by the "envID" URL parameter
+// and responds with {"deleted": envID}.
+//
+// When the route also carries a workload "id" parameter, the row must
+// belong to that workload; otherwise the handler answers 404 instead of
+// deleting a row through another workload's path (the same rule
+// deleteWorkloadNotification applies). If the route has no "id" parameter
+// the ownership check is skipped and the row is deleted by envID alone.
+// NOTE(review): the route definition is not visible from this file —
+// confirm it is /api/workloads/{id}/env/{envID}.
+//
+// Responses: 404 when the row is missing or owned by a different workload;
+// 500 when the store fails.
+func (s *Server) deleteWorkloadEnv(w http.ResponseWriter, r *http.Request) {
+	envID := chi.URLParam(r, "envID")
+	if id := chi.URLParam(r, "id"); id != "" {
+		rows, err := s.store.ListWorkloadEnv(id)
+		if err != nil {
+			respondError(w, http.StatusInternalServerError, "list workload env")
+			return
+		}
+		owned := false
+		for _, e := range rows {
+			if e.ID == envID {
+				owned = true
+				break
+			}
+		}
+		if !owned {
+			// 404 rather than 403 so foreign rows are not revealed.
+			respondNotFound(w, "workload env")
+			return
+		}
+	}
+	if err := s.store.DeleteWorkloadEnv(envID); err != nil {
+		if errors.Is(err, store.ErrNotFound) {
+			respondNotFound(w, "workload env")
+			return
+		}
+		respondError(w, http.StatusInternalServerError, "delete workload env")
+		return
+	}
+	respondJSON(w, http.StatusOK, map[string]string{"deleted": envID})
+}
+
+// Workload-level webhook URL handlers were dropped in the hard legacy
+// cutover: the old `/api/webhook/workloads/{secret}` route is gone, so
+// minting a workload secret would hand operators a URL that 404s. The
+// inbound webhook surface is now exclusively first-class triggers
+// (`/api/webhook/triggers/{secret}`); use the trigger CRUD + bindings
+// endpoints to wire a workload to inbound deploys.
+
+// validEnvKey reports whether k is an acceptable environment variable name:
+// 1 to 256 bytes, made only of ASCII letters, digits and underscores, and
+// not starting with a digit (i.e. [A-Za-z_][A-Za-z0-9_]*). Everything else
+// — "=", spaces, control characters, any non-ASCII byte — is rejected.
+func validEnvKey(k string) bool {
+	if k == "" || len(k) > 256 {
+		return false
+	}
+	// Byte-wise scan is sufficient: every byte of a multi-byte UTF-8
+	// sequence is >= 0x80 and therefore outside all accepted ranges.
+	for pos := 0; pos < len(k); pos++ {
+		b := k[pos]
+		isLetter := (b >= 'A' && b <= 'Z') || (b >= 'a' && b <= 'z') || b == '_'
+		isDigit := b >= '0' && b <= '9'
+		if isLetter || (isDigit && pos > 0) {
+			continue
+		}
+		return false
+	}
+	return true
+}
@@ -0,0 +1,231 @@
+package api
+
+import (
+	"errors"
+	"log/slog"
+	"net/http"
+	"strings"
+
+	"github.com/go-chi/chi/v5"
+
+	"github.com/alexei/tinyforge/internal/crypto"
+	"github.com/alexei/tinyforge/internal/store"
+)
+
+// workloadNotificationRow is the JSON shape returned to clients for one
+// workload notification target. The `secret_set` boolean replaces the
+// actual ciphertext: once stored a secret is write-only, mirroring how
+// workload_env hides encrypted values. Rotating means submitting a new
+// value.
+//
+// SecretSet is true when the stored secret is non-empty (see
+// toWorkloadNotificationRow); all other fields are copied from the store
+// row unchanged.
+type workloadNotificationRow struct {
+	ID         string `json:"id"`
+	WorkloadID string `json:"workload_id"`
+	Name       string `json:"name"`
+	URL        string `json:"url"`
+	SecretSet  bool   `json:"secret_set"`
+	EventTypes string `json:"event_types"`
+	Enabled    bool   `json:"enabled"`
+	SortOrder  int    `json:"sort_order"`
+	CreatedAt  string `json:"created_at"`
+	UpdatedAt  string `json:"updated_at"`
+}
+
+// toWorkloadNotificationRow converts a store row into its client-facing
+// shape. The stored secret is never copied out; it is reduced to a
+// "secret is present" flag.
+func toWorkloadNotificationRow(n store.WorkloadNotification) workloadNotificationRow {
+	hasSecret := len(n.Secret) > 0
+
+	var row workloadNotificationRow
+	row.ID = n.ID
+	row.WorkloadID = n.WorkloadID
+	row.Name = n.Name
+	row.URL = n.URL
+	row.SecretSet = hasSecret
+	row.EventTypes = n.EventTypes
+	row.Enabled = n.Enabled
+	row.SortOrder = n.SortOrder
+	row.CreatedAt = n.CreatedAt
+	row.UpdatedAt = n.UpdatedAt
+	return row
+}
+
+// listWorkloadNotifications returns the notification targets of the
+// workload named by the "id" URL parameter as a JSON array of
+// workloadNotificationRow. It answers 404 for an unknown workload and 500
+// when either store call fails.
+func (s *Server) listWorkloadNotifications(w http.ResponseWriter, r *http.Request) {
+	workloadID := chi.URLParam(r, "id")
+
+	_, err := s.store.GetWorkloadByID(workloadID)
+	switch {
+	case errors.Is(err, store.ErrNotFound):
+		respondNotFound(w, "workload")
+		return
+	case err != nil:
+		respondError(w, http.StatusInternalServerError, "get workload")
+		return
+	}
+
+	stored, err := s.store.ListWorkloadNotifications(workloadID)
+	if err != nil {
+		respondError(w, http.StatusInternalServerError, "list workload notifications")
+		return
+	}
+
+	// Sized up front so an empty listing still encodes as [] rather than null.
+	rows := make([]workloadNotificationRow, len(stored))
+	for i := range stored {
+		rows[i] = toWorkloadNotificationRow(stored[i])
+	}
+	respondJSON(w, http.StatusOK, rows)
+}
+
+// workloadNotificationRequest is the POST/PUT body for the notification
+// create and update handlers. Secret is the raw plaintext webhook signing
+// key; the server encrypts it at rest with the global encryption key before
+// it is stored. An empty Secret on UPDATE leaves the stored secret
+// untouched so the operator can edit the URL or event filter without
+// re-entering the secret each time.
+//
+// Enabled is a pointer so an omitted field can be told apart from an
+// explicit false: create defaults it to true, update leaves the stored
+// value unchanged. Name and URL are trimmed by the handlers, and URL must
+// be non-empty.
+type workloadNotificationRequest struct {
+	Name       string `json:"name"`
+	URL        string `json:"url"`
+	Secret     string `json:"secret"`
+	EventTypes string `json:"event_types"`
+	Enabled    *bool  `json:"enabled"`
+	SortOrder  int    `json:"sort_order"`
+}
+
+// createWorkloadNotification adds a notification target to the workload
+// named by the "id" URL parameter and answers 201 with the created row.
+//
+// Responses: 404 for an unknown workload; 400 when the trimmed URL is
+// empty; 500 when the workload lookup, secret encryption, or the store
+// insert fails. A non-empty secret is encrypted before storage; an omitted
+// "enabled" field defaults to true.
+func (s *Server) createWorkloadNotification(w http.ResponseWriter, r *http.Request) {
+	workloadID := chi.URLParam(r, "id")
+
+	_, err := s.store.GetWorkloadByID(workloadID)
+	switch {
+	case errors.Is(err, store.ErrNotFound):
+		respondNotFound(w, "workload")
+		return
+	case err != nil:
+		respondError(w, http.StatusInternalServerError, "get workload")
+		return
+	}
+
+	var body workloadNotificationRequest
+	if ok := decodeJSONStrict(w, r, &body); !ok {
+		return
+	}
+	target := strings.TrimSpace(body.URL)
+	label := strings.TrimSpace(body.Name)
+	if target == "" {
+		respondError(w, http.StatusBadRequest, "url is required")
+		return
+	}
+
+	// Only a supplied secret is encrypted; "" is stored as "no secret".
+	var sealed string
+	if body.Secret != "" {
+		sealed, err = crypto.Encrypt(s.encKey, body.Secret)
+		if err != nil {
+			slog.Error("workload notifications: encrypt secret", "workload", workloadID, "error", err)
+			respondError(w, http.StatusInternalServerError, "encrypt secret")
+			return
+		}
+	}
+
+	// Targets are enabled unless the caller explicitly says otherwise.
+	active := body.Enabled == nil || *body.Enabled
+
+	record := store.WorkloadNotification{
+		WorkloadID: workloadID,
+		Name:       label,
+		URL:        target,
+		Secret:     sealed,
+		EventTypes: body.EventTypes,
+		Enabled:    active,
+		SortOrder:  body.SortOrder,
+	}
+	saved, err := s.store.CreateWorkloadNotification(record)
+	if err != nil {
+		slog.Error("workload notifications: create", "workload", workloadID, "error", err)
+		respondError(w, http.StatusInternalServerError, "create workload notification")
+		return
+	}
+	respondJSON(w, http.StatusCreated, toWorkloadNotificationRow(saved))
+}
+
+// updateWorkloadNotification overwrites the notification target "nid" of
+// workload "id" with the request body and answers 200 with the resulting
+// row.
+//
+// Responses: 404 when the workload or the target is missing, or when the
+// target belongs to a different workload; 400 when the trimmed URL is
+// empty; 500 on lookup, encryption, or store failures. Name, URL,
+// EventTypes and SortOrder are always replaced; Enabled only when present
+// in the body; Secret only when non-empty.
+func (s *Server) updateWorkloadNotification(w http.ResponseWriter, r *http.Request) {
+	workloadID := chi.URLParam(r, "id")
+	notificationID := chi.URLParam(r, "nid")
+
+	_, err := s.store.GetWorkloadByID(workloadID)
+	switch {
+	case errors.Is(err, store.ErrNotFound):
+		respondNotFound(w, "workload")
+		return
+	case err != nil:
+		respondError(w, http.StatusInternalServerError, "get workload")
+		return
+	}
+
+	current, err := s.store.GetWorkloadNotification(notificationID)
+	switch {
+	case errors.Is(err, store.ErrNotFound):
+		respondNotFound(w, "workload_notification")
+		return
+	case err != nil:
+		respondError(w, http.StatusInternalServerError, "get workload_notification")
+		return
+	}
+	if current.WorkloadID != workloadID {
+		// The row exists, but under another workload. Answer 404 (not 403)
+		// so the existence of foreign rows is not revealed.
+		respondNotFound(w, "workload_notification")
+		return
+	}
+
+	var body workloadNotificationRequest
+	if ok := decodeJSONStrict(w, r, &body); !ok {
+		return
+	}
+	target := strings.TrimSpace(body.URL)
+	label := strings.TrimSpace(body.Name)
+	if target == "" {
+		respondError(w, http.StatusBadRequest, "url is required")
+		return
+	}
+
+	current.Name = label
+	current.URL = target
+	current.EventTypes = body.EventTypes
+	current.SortOrder = body.SortOrder
+	if body.Enabled != nil {
+		current.Enabled = *body.Enabled
+	}
+	// An empty secret keeps the stored ciphertext, so editing the URL does
+	// not force the operator to re-enter it; rotation requires sending the
+	// new plaintext.
+	if body.Secret != "" {
+		sealed, err := crypto.Encrypt(s.encKey, body.Secret)
+		if err != nil {
+			slog.Error("workload notifications: encrypt secret", "workload", workloadID, "error", err)
+			respondError(w, http.StatusInternalServerError, "encrypt secret")
+			return
+		}
+		current.Secret = sealed
+	}
+
+	err = s.store.UpdateWorkloadNotification(current)
+	switch {
+	case errors.Is(err, store.ErrNotFound):
+		respondNotFound(w, "workload_notification")
+		return
+	case err != nil:
+		slog.Error("workload notifications: update", "workload", workloadID, "error", err)
+		respondError(w, http.StatusInternalServerError, "update workload notification")
+		return
+	}
+	respondJSON(w, http.StatusOK, toWorkloadNotificationRow(current))
+}
+
+// deleteWorkloadNotification removes notification target "nid", provided
+// it belongs to workload "id", and answers 200 with {"success": true}.
+//
+// Responses: 404 when the target is missing or owned by another workload;
+// 500 when the lookup or the delete fails.
+func (s *Server) deleteWorkloadNotification(w http.ResponseWriter, r *http.Request) {
+	workloadID := chi.URLParam(r, "id")
+	notificationID := chi.URLParam(r, "nid")
+
+	current, err := s.store.GetWorkloadNotification(notificationID)
+	switch {
+	case errors.Is(err, store.ErrNotFound):
+		respondNotFound(w, "workload_notification")
+		return
+	case err != nil:
+		respondError(w, http.StatusInternalServerError, "get workload_notification")
+		return
+	}
+	// A row under a different workload is reported as missing.
+	if current.WorkloadID != workloadID {
+		respondNotFound(w, "workload_notification")
+		return
+	}
+
+	err = s.store.DeleteWorkloadNotification(notificationID)
+	switch {
+	case errors.Is(err, store.ErrNotFound):
+		respondNotFound(w, "workload_notification")
+		return
+	case err != nil:
+		slog.Error("workload notifications: delete", "workload", workloadID, "error", err)
+		respondError(w, http.StatusInternalServerError, "delete workload notification")
+		return
+	}
+	respondJSON(w, http.StatusOK, map[string]any{"success": true})
+}
@@ -0,0 +1,388 @@
+package api
+
+import (
+	"context"
+	"encoding/json"
+	"errors"
+	"log/slog"
+	"net/http"
+	"sync"
+	"time"
+
+	"github.com/go-chi/chi/v5"
+
+	"github.com/alexei/tinyforge/internal/store"
+)
+
+// storageProbeCache memoizes the storage probe result per workload ID for
+// a short window so a tight polling loop on /storage cannot turn into one
+// `docker exec du` per request. The TTL is intentionally short — the
+// panel is a coarse usage indicator, not a real-time meter.
+//
+// storageProbeMu guards every read and write of storageProbeCache.
+// Entries are overwritten on the next probe after they expire; nothing in
+// this file removes them.
+var (
+	storageProbeCacheTTL = 30 * time.Second
+	storageProbeMu       sync.Mutex
+	storageProbeCache    = map[string]storageProbeEntry{}
+)
+
+// storageProbeEntry is one cached probe outcome: when it was recorded
+// (at), the bytes reported (usage), and whether any container probe
+// succeeded (probeOk). A false probeOk is served back to clients as
+// "storage probe unavailable".
+type storageProbeEntry struct {
+	at      time.Time
+	usage   int64
+	probeOk bool
+}
+
+// Runtime endpoints surface what the legacy /api/sites/* surface used
+// to expose on the static-site detail page: the last commit SHA / last
+// sync timestamp / status persisted by the static plugin in
+// containers.extra_json, the data-volume disk usage, and stop / start
+// controls that don't require a full re-deploy.
+//
+// The handlers are deliberately decoupled from the plugin interface so
+// they work uniformly across source kinds: stop/start operate on the
+// Docker container IDs stored in the containers index regardless of
+// kind; runtime-state reads what the source persisted (the handler
+// currently decodes it for the "static" and "dockerfile" kinds);
+// storage usage is static-only today but the endpoint shape allows
+// future sources to opt in.
+
+// runtimeStatePayload is the JSON shape returned by
+// GET /api/workloads/{id}/runtime-state.
+//
+// SourceKind is always present so the UI can decide whether to render
+// the sync-specific fields (last_commit_sha, last_sync_at, ...).
+// HasState is true only when a container row was found for the workload.
+// The container-row fields (ContainerID, State) come from the row that
+// getWorkloadRuntimeState looks up under the deterministic ID
+// `<workloadID>:site` for static sources or `<workloadID>:dockerfile` for
+// dockerfile sources. Status, LastCommitSHA, LastSyncAt and LastError are
+// decoded from that row's extra_json and stay empty when it is absent or
+// fails to parse.
+type runtimeStatePayload struct {
+	SourceKind    string `json:"source_kind"`
+	HasState      bool   `json:"has_state"`
+	ContainerID   string `json:"container_id,omitempty"`
+	State         string `json:"state,omitempty"`
+	Status        string `json:"status,omitempty"`
+	LastCommitSHA string `json:"last_commit_sha,omitempty"`
+	LastSyncAt    string `json:"last_sync_at,omitempty"`
+	LastError     string `json:"last_error,omitempty"`
+}
+
+// getWorkloadRuntimeState handles GET /api/workloads/{id}/runtime-state.
+//
+// For "static" and "dockerfile" workloads it reads the container row stored
+// under the deterministic ID <workloadID>:site / <workloadID>:dockerfile
+// and reports that row's container ID and state, plus the status / commit /
+// sync / error fields decoded from its extra_json. Every other source kind,
+// and a workload whose row does not exist yet, gets 200 with only
+// SourceKind set and HasState=false, so the panel can hide itself instead
+// of the endpoint 404ing.
+//
+// Responses: 404 for an unknown workload; 500 when a store lookup fails.
+func (s *Server) getWorkloadRuntimeState(w http.ResponseWriter, r *http.Request) {
+	workloadID := chi.URLParam(r, "id")
+
+	wl, err := s.store.GetWorkloadByID(workloadID)
+	switch {
+	case errors.Is(err, store.ErrNotFound):
+		respondNotFound(w, "workload")
+		return
+	case err != nil:
+		slog.Error("get workload for runtime-state", "workload", workloadID, "error", err)
+		respondError(w, http.StatusInternalServerError, "internal server error")
+		return
+	}
+
+	out := runtimeStatePayload{SourceKind: wl.SourceKind}
+
+	// Both supported kinds keep their state in one container row whose ID
+	// is the workload ID plus a kind-specific suffix.
+	var rowID string
+	switch wl.SourceKind {
+	case "static":
+		rowID = workloadID + ":site"
+	case "dockerfile":
+		rowID = workloadID + ":dockerfile"
+	default:
+		respondJSON(w, http.StatusOK, out)
+		return
+	}
+
+	row, err := s.store.GetContainerByID(rowID)
+	switch {
+	case errors.Is(err, store.ErrNotFound):
+		// No row yet: report HasState=false so the UI can prompt the
+		// operator to deploy.
+		respondJSON(w, http.StatusOK, out)
+		return
+	case err != nil:
+		slog.Error("get container row for runtime-state", "workload", workloadID, "error", err)
+		respondError(w, http.StatusInternalServerError, "internal server error")
+		return
+	}
+
+	out.HasState = true
+	out.ContainerID = row.ContainerID
+	out.State = row.State
+
+	// A blob that fails to decode is not fatal: container_id / state are
+	// still reported, just without the sync history.
+	if blob := row.ExtraJSON; blob != "" && blob != "{}" {
+		var persisted struct {
+			Status        string `json:"status"`
+			LastCommitSHA string `json:"last_commit_sha"`
+			LastSyncAt    string `json:"last_sync_at"`
+			LastError     string `json:"last_error"`
+		}
+		if err := json.Unmarshal([]byte(blob), &persisted); err == nil {
+			out.Status = persisted.Status
+			out.LastCommitSHA = persisted.LastCommitSHA
+			out.LastSyncAt = persisted.LastSyncAt
+			out.LastError = persisted.LastError
+		} else {
+			slog.Debug("decode extra_json for runtime-state", "workload", workloadID, "error", err)
+		}
+	}
+
+	respondJSON(w, http.StatusOK, out)
+}
+
+// storageUsagePayload is the JSON shape returned by
+// GET /api/workloads/{id}/storage. ProbeError surfaces a non-fatal
+// failure to compute used_bytes (du timed out, exec returned non-zero,
+// etc.) so the UI can render "usage unavailable" instead of an
+// always-zero number.
+//
+// Enabled and LimitMB mirror storage_enabled / storage_limit_mb from the
+// workload's source_config; UsedBytes is the probe result and stays 0 when
+// storage is disabled or no probe succeeded.
+type storageUsagePayload struct {
+	SourceKind string `json:"source_kind"`
+	Enabled    bool   `json:"enabled"`
+	UsedBytes  int64  `json:"used_bytes"`
+	LimitMB    int    `json:"limit_mb,omitempty"`
+	ProbeError string `json:"probe_error,omitempty"`
+}
+
+// getWorkloadStorage handles GET /api/workloads/{id}/storage.
+//
+// For static workloads with storage enabled it asks the Docker client for
+// the data volume's footprint (per the original note, a `du -sb /app/data`
+// exec inside the running container) and reports it as used_bytes.
+// Workloads without storage, non-static source kinds, and servers without
+// a Docker client get Enabled / LimitMB from the config and zero usage so
+// the UI can hide the panel.
+//
+// Probe outcomes — including failures — are cached per workload for
+// storageProbeCacheTTL. A failure caused by the caller's own request being
+// cancelled is NOT cached: it says nothing about the container and would
+// otherwise be served to every other client as "storage probe unavailable"
+// until the TTL expires.
+//
+// Responses: 404 for an unknown workload; 500 when a store call fails;
+// 200 otherwise, with ProbeError set when no container could be probed.
+func (s *Server) getWorkloadStorage(w http.ResponseWriter, r *http.Request) {
+	id := chi.URLParam(r, "id")
+	workload, err := s.store.GetWorkloadByID(id)
+	if err != nil {
+		if errors.Is(err, store.ErrNotFound) {
+			respondNotFound(w, "workload")
+			return
+		}
+		slog.Error("get workload for storage", "workload", id, "error", err)
+		respondError(w, http.StatusInternalServerError, "internal server error")
+		return
+	}
+
+	payload := storageUsagePayload{SourceKind: workload.SourceKind}
+
+	if workload.SourceKind != "static" {
+		respondJSON(w, http.StatusOK, payload)
+		return
+	}
+
+	// Decode storage knobs from source_config. Missing / malformed blobs
+	// are treated as storage-disabled rather than erroring.
+	var cfg struct {
+		StorageEnabled bool `json:"storage_enabled"`
+		StorageLimitMB int  `json:"storage_limit_mb"`
+	}
+	if workload.SourceConfig != "" {
+		if err := json.Unmarshal([]byte(workload.SourceConfig), &cfg); err != nil {
+			// Unexpected for a stored row — log so a drifted row is
+			// traceable.
+			slog.Debug("decode source_config for storage", "workload", id, "error", err)
+		}
+	}
+	payload.Enabled = cfg.StorageEnabled
+	payload.LimitMB = cfg.StorageLimitMB
+
+	if !cfg.StorageEnabled || s.docker == nil {
+		respondJSON(w, http.StatusOK, payload)
+		return
+	}
+
+	// Cache hit short-circuits the docker exec entirely so a polling
+	// frontend cannot turn this into a per-request probe.
+	storageProbeMu.Lock()
+	if cached, ok := storageProbeCache[id]; ok && time.Since(cached.at) < storageProbeCacheTTL {
+		storageProbeMu.Unlock()
+		payload.UsedBytes = cached.usage
+		if !cached.probeOk {
+			payload.ProbeError = "storage probe unavailable"
+		}
+		respondJSON(w, http.StatusOK, payload)
+		return
+	}
+	storageProbeMu.Unlock()
+
+	// Scan every container row of the workload and use the first one that
+	// can be probed.
+	containers, err := s.store.ListContainersByWorkload(id)
+	if err != nil {
+		slog.Error("list containers for storage", "workload", id, "error", err)
+		respondError(w, http.StatusInternalServerError, "internal server error")
+		return
+	}
+	probeOk := false
+	for _, c := range containers {
+		if c.ContainerID == "" {
+			continue
+		}
+		// 15s budget per container; the cache above keeps the amortized
+		// cost of a slow probe small.
+		ctx, cancel := context.WithTimeout(r.Context(), 15*time.Second)
+		usage, err := s.docker.InspectSiteStorageUsage(ctx, c.ContainerID)
+		cancel()
+		if err != nil {
+			slog.Debug("storage usage probe failed", "workload", id, "container", c.ContainerID, "error", err)
+			if r.Context().Err() != nil {
+				// The caller is gone; every further probe would fail
+				// on the same cancelled context.
+				break
+			}
+			continue
+		}
+		payload.UsedBytes = usage.UsedBytes
+		probeOk = true
+		break
+	}
+	if !probeOk {
+		payload.ProbeError = "storage probe unavailable"
+	}
+
+	// Cache successes always; cache failures only when they were not
+	// induced by the request itself being cancelled.
+	if probeOk || r.Context().Err() == nil {
+		storageProbeMu.Lock()
+		storageProbeCache[id] = storageProbeEntry{at: time.Now(), usage: payload.UsedBytes, probeOk: probeOk}
+		storageProbeMu.Unlock()
+	}
+
+	respondJSON(w, http.StatusOK, payload)
+}
+
+// stopStartResult is the JSON shape returned by both stop and start
+// handlers — counts so the UI can show "1 of 2 containers stopped".
+// Touched counts containers whose Docker call succeeded; Failed counts
+// those whose call returned an error. Rows without a container ID are
+// counted in neither.
+type stopStartResult struct {
+	Touched int `json:"touched"`
+	Failed  int `json:"failed"`
+}
+
+// stopPluginWorkload handles POST /api/workloads/{id}/stop.
+//
+// It asks Docker to stop every container row of the workload that has a
+// container ID. It neither removes containers nor writes runtime state
+// itself (per the original note, the reconciler picks up the new state on
+// its next pass).
+//
+// Responses: 404 for an unknown workload; 503 when no Docker client is
+// configured; 500 when a store call fails; 409 when there was no container
+// to act on; 502 when every stop failed; otherwise 200 with a
+// {touched, failed} envelope — also on partial failure, so the UI can show
+// "2 of 3 stopped" rather than treating the whole call as red.
+func (s *Server) stopPluginWorkload(w http.ResponseWriter, r *http.Request) {
+	workloadID := chi.URLParam(r, "id")
+
+	_, err := s.store.GetWorkloadByID(workloadID)
+	switch {
+	case errors.Is(err, store.ErrNotFound):
+		respondNotFound(w, "workload")
+		return
+	case err != nil:
+		slog.Error("get workload for stop", "workload", workloadID, "error", err)
+		respondError(w, http.StatusInternalServerError, "internal server error")
+		return
+	}
+	if s.docker == nil {
+		respondError(w, http.StatusServiceUnavailable, "docker client unavailable")
+		return
+	}
+
+	rows, err := s.store.ListContainersByWorkload(workloadID)
+	if err != nil {
+		slog.Error("list containers for stop", "workload", workloadID, "error", err)
+		respondError(w, http.StatusInternalServerError, "internal server error")
+		return
+	}
+
+	// stopOne gives each container its own 30s budget. The trailing 10 is
+	// StopContainer's third argument — per the original note, the SIGTERM
+	// grace period before SIGKILL.
+	stopOne := func(containerID string) error {
+		ctx, cancel := context.WithTimeout(r.Context(), 30*time.Second)
+		defer cancel()
+		return s.docker.StopContainer(ctx, containerID, 10)
+	}
+
+	var tally stopStartResult
+	for _, row := range rows {
+		if row.ContainerID == "" {
+			continue
+		}
+		if err := stopOne(row.ContainerID); err != nil {
+			slog.Warn("stop container failed", "workload", workloadID, "container", row.ContainerID, "error", err)
+			tally.Failed++
+			continue
+		}
+		tally.Touched++
+	}
+
+	switch {
+	case tally.Touched == 0 && tally.Failed == 0:
+		// Nothing to act on — kept distinct from success so the UI can
+		// say "nothing to stop" instead of showing a green toast.
+		respondError(w, http.StatusConflict, "no running container to stop")
+	case tally.Touched == 0:
+		respondError(w, http.StatusBadGateway, "all containers failed to stop")
+	default:
+		respondJSON(w, http.StatusOK, tally)
+	}
+}
+
+// startPluginWorkload handles POST /api/workloads/{id}/start.
+//
+// It asks Docker to start every container row of the workload that has a
+// container ID. Nothing is redeployed or recreated: a container that no
+// longer exists simply counts as a failure, and the operator should deploy
+// again.
+//
+// Responses: 404 for an unknown workload; 503 when no Docker client is
+// configured; 500 when a store call fails; 409 when there was no container
+// to act on; 502 when every start failed; otherwise 200 with a
+// {touched, failed} envelope, including on partial failure.
+func (s *Server) startPluginWorkload(w http.ResponseWriter, r *http.Request) {
+	workloadID := chi.URLParam(r, "id")
+
+	_, err := s.store.GetWorkloadByID(workloadID)
+	switch {
+	case errors.Is(err, store.ErrNotFound):
+		respondNotFound(w, "workload")
+		return
+	case err != nil:
+		slog.Error("get workload for start", "workload", workloadID, "error", err)
+		respondError(w, http.StatusInternalServerError, "internal server error")
+		return
+	}
+	if s.docker == nil {
+		respondError(w, http.StatusServiceUnavailable, "docker client unavailable")
+		return
+	}
+
+	rows, err := s.store.ListContainersByWorkload(workloadID)
+	if err != nil {
+		slog.Error("list containers for start", "workload", workloadID, "error", err)
+		respondError(w, http.StatusInternalServerError, "internal server error")
+		return
+	}
+
+	// startOne gives each container its own 30s budget.
+	startOne := func(containerID string) error {
+		ctx, cancel := context.WithTimeout(r.Context(), 30*time.Second)
+		defer cancel()
+		return s.docker.StartContainer(ctx, containerID)
+	}
+
+	var tally stopStartResult
+	for _, row := range rows {
+		if row.ContainerID == "" {
+			continue
+		}
+		if err := startOne(row.ContainerID); err != nil {
+			slog.Warn("start container failed", "workload", workloadID, "container", row.ContainerID, "error", err)
+			tally.Failed++
+			continue
+		}
+		tally.Touched++
+	}
+
+	switch {
+	case tally.Touched == 0 && tally.Failed == 0:
+		// No persisted container — deploy first to materialize one.
+		respondError(w, http.StatusConflict, "no container to start; deploy first")
+	case tally.Touched == 0:
+		respondError(w, http.StatusBadGateway, "all containers failed to start")
+	default:
+		respondJSON(w, http.StatusOK, tally)
+	}
+}
@@ -0,0 +1,359 @@
+package api
+
+import (
+	"encoding/json"
+	"net/http"
+	"testing"
+
+	"github.com/alexei/tinyforge/internal/store"
+)
+
+// =============================================================================
+// GET /api/workloads/{id}/runtime-state
+// =============================================================================
+
+// TestGetWorkloadRuntimeState_NotFound_404 requests the runtime state of
+// a workload ID that was never created and expects a 404.
+func TestGetWorkloadRuntimeState_NotFound_404(t *testing.T) {
+	env := newAPITestEnv(t)
+	resp := env.do(t, http.MethodGet, "/api/workloads/does-not-exist/runtime-state", nil)
+	if resp.StatusCode == http.StatusNotFound {
+		return
+	}
+	_ = decodeEnvelope(t, resp, nil)
+	t.Fatalf("status = %d, want 404", resp.StatusCode)
+}
+
+// TestGetWorkloadRuntimeState_NonStaticSource_ReturnsBareKind seeds an
+// image-sourced workload with no container row and expects a 200 whose
+// payload echoes the source kind and leaves HasState false.
+func TestGetWorkloadRuntimeState_NonStaticSource_ReturnsBareKind(t *testing.T) {
+	env := newAPITestEnv(t)
+
+	seed := store.Workload{
+		Kind:         string(store.WorkloadKindProject),
+		Name:         "img-app",
+		SourceKind:   "image",
+		SourceConfig: `{"image":"nginx:1.25"}`,
+	}
+	wl, err := env.store.CreateWorkload(seed)
+	if err != nil {
+		t.Fatalf("seed: %v", err)
+	}
+
+	resp := env.do(t, http.MethodGet, "/api/workloads/"+wl.ID+"/runtime-state", nil)
+	if code := resp.StatusCode; code != http.StatusOK {
+		t.Fatalf("status = %d, want 200", code)
+	}
+
+	var payload runtimeStatePayload
+	if msg := decodeEnvelope(t, resp, &payload); msg != "" {
+		t.Fatalf("envelope error: %q", msg)
+	}
+	if kind := payload.SourceKind; kind != "image" {
+		t.Errorf("SourceKind = %q, want image", kind)
+	}
+	if payload.HasState {
+		t.Errorf("HasState = true, want false for non-static source")
+	}
+}
+
+// TestGetWorkloadRuntimeState_StaticSourceNeverDeployed_HasStateFalse
+// seeds a static-source workload without any container row and expects
+// the runtime-state payload to report HasState = false.
+func TestGetWorkloadRuntimeState_StaticSourceNeverDeployed_HasStateFalse(t *testing.T) {
+	env := newAPITestEnv(t)
+
+	seed := store.Workload{
+		Kind:         string(store.WorkloadKindSite),
+		Name:         "pages",
+		SourceKind:   "static",
+		SourceConfig: `{"provider":"gitea"}`,
+	}
+	wl, err := env.store.CreateWorkload(seed)
+	if err != nil {
+		t.Fatalf("seed: %v", err)
+	}
+
+	resp := env.do(t, http.MethodGet, "/api/workloads/"+wl.ID+"/runtime-state", nil)
+	if code := resp.StatusCode; code != http.StatusOK {
+		t.Fatalf("status = %d, want 200", code)
+	}
+
+	var payload runtimeStatePayload
+	if msg := decodeEnvelope(t, resp, &payload); msg != "" {
+		t.Fatalf("envelope error: %q", msg)
+	}
+	if payload.HasState {
+		t.Errorf("HasState = true, want false (never deployed)")
+	}
+}
+
+// TestGetWorkloadRuntimeState_StaticSourceDeployed_DecodesExtraJSON seeds
+// a static-source workload plus a container row whose extra_json carries
+// the typed runtime fields (and one unknown key), then checks that the
+// endpoint surfaces both the container columns and the decoded fields.
+func TestGetWorkloadRuntimeState_StaticSourceDeployed_DecodesExtraJSON(t *testing.T) {
+	e := newAPITestEnv(t)
+	wl, err := e.store.CreateWorkload(store.Workload{
+		Kind:         string(store.WorkloadKindSite),
+		Name:         "pages",
+		SourceKind:   "static",
+		SourceConfig: `{"provider":"gitea"}`,
+	})
+	if err != nil {
+		t.Fatalf("seed workload: %v", err)
+	}
+	extra, err := json.Marshal(map[string]any{
+		"status":          "deployed",
+		"last_commit_sha": "abc1234",
+		"last_sync_at":    "2026-05-16T10:00:00Z",
+		"last_error":      "",
+		// An unknown key — confirms decoding is lenient.
+		"unknown_future_field": "ignored",
+	})
+	if err != nil {
+		// Fail at the fixture instead of seeding an empty extra_json.
+		t.Fatalf("marshal extra_json: %v", err)
+	}
+	if err := e.store.UpsertContainer(store.Container{
+		ID:           wl.ID + ":site",
+		WorkloadID:   wl.ID,
+		WorkloadKind: string(store.WorkloadKindSite),
+		Host:         "local",
+		ContainerID:  "abcdef1234",
+		State:        "running",
+		ExtraJSON:    string(extra),
+	}); err != nil {
+		t.Fatalf("seed container: %v", err)
+	}
+
+	resp := e.do(t, http.MethodGet, "/api/workloads/"+wl.ID+"/runtime-state", nil)
+	if resp.StatusCode != http.StatusOK {
+		t.Fatalf("status = %d, want 200", resp.StatusCode)
+	}
+	var got runtimeStatePayload
+	if errMsg := decodeEnvelope(t, resp, &got); errMsg != "" {
+		t.Fatalf("envelope error: %q", errMsg)
+	}
+	if !got.HasState {
+		t.Fatalf("HasState = false, want true")
+	}
+	if got.ContainerID != "abcdef1234" || got.State != "running" {
+		t.Errorf("container fields = (%q,%q), want (abcdef1234, running)", got.ContainerID, got.State)
+	}
+	if got.Status != "deployed" || got.LastCommitSHA != "abc1234" || got.LastSyncAt == "" {
+		t.Errorf("runtime fields = %+v, want deployed/abc1234/non-empty", got)
+	}
+}
+
+// TestGetWorkloadRuntimeState_MalformedExtraJSON_ReturnsContainerFieldsOnly
+// verifies reader resilience: a container row whose extra_json is not
+// valid JSON must still yield a 200 with the container columns set and
+// the typed runtime fields left empty.
+func TestGetWorkloadRuntimeState_MalformedExtraJSON_ReturnsContainerFieldsOnly(t *testing.T) {
+	e := newAPITestEnv(t)
+	wl, err := e.store.CreateWorkload(store.Workload{
+		Kind:         string(store.WorkloadKindSite),
+		Name:         "pages",
+		SourceKind:   "static",
+		SourceConfig: `{"provider":"gitea"}`,
+	})
+	if err != nil {
+		// Without this check a failed seed surfaces later as a confusing
+		// status mismatch on a URL built from an empty ID.
+		t.Fatalf("seed workload: %v", err)
+	}
+	// Seed a row with a valid extra_json first, then corrupt it via raw
+	// SQL. Prior to the write-side validateExtraJSON guard this test
+	// could pass a malformed string straight to UpsertContainer; the
+	// guard now rejects that at the boundary, which is the correct
+	// behaviour. The reader resilience this test verifies remains
+	// relevant for pre-existing bad rows from upgrades or external
+	// manipulation, so we still produce one via direct SQL.
+	if err := e.store.UpsertContainer(store.Container{
+		ID:           wl.ID + ":site",
+		WorkloadID:   wl.ID,
+		WorkloadKind: string(store.WorkloadKindSite),
+		Host:         "local",
+		ContainerID:  "abc",
+		State:        "running",
+		ExtraJSON:    `{}`,
+	}); err != nil {
+		t.Fatalf("seed: %v", err)
+	}
+	if _, err := e.store.DB().Exec(
+		`UPDATE containers SET extra_json = ? WHERE id = ?`,
+		`{this is not json`, wl.ID+":site",
+	); err != nil {
+		t.Fatalf("corrupt extra_json: %v", err)
+	}
+	resp := e.do(t, http.MethodGet, "/api/workloads/"+wl.ID+"/runtime-state", nil)
+	if resp.StatusCode != http.StatusOK {
+		t.Fatalf("status = %d, want 200 (decode is non-fatal)", resp.StatusCode)
+	}
+	var got runtimeStatePayload
+	if errMsg := decodeEnvelope(t, resp, &got); errMsg != "" {
+		t.Fatalf("envelope error: %q", errMsg)
+	}
+	if !got.HasState || got.ContainerID != "abc" {
+		t.Errorf("expected HasState + container id present, got %+v", got)
+	}
+	if got.Status != "" || got.LastCommitSHA != "" {
+		t.Errorf("expected typed fields empty on decode failure, got %+v", got)
+	}
+}
+
+// TestGetWorkloadRuntimeState_DockerfileSourceDeployed_DecodesExtraJSON
+// seeds a dockerfile-source workload with a container row carrying typed
+// runtime fields in extra_json, and checks that the endpoint reports the
+// source kind, the container columns, and the decoded fields.
+func TestGetWorkloadRuntimeState_DockerfileSourceDeployed_DecodesExtraJSON(t *testing.T) {
+	e := newAPITestEnv(t)
+	wl, err := e.store.CreateWorkload(store.Workload{
+		Kind:         string(store.WorkloadKindProject),
+		Name:         "build-app",
+		SourceKind:   "dockerfile",
+		SourceConfig: `{"provider":"gitea","port":3000}`,
+	})
+	if err != nil {
+		t.Fatalf("seed workload: %v", err)
+	}
+	extra, err := json.Marshal(map[string]any{
+		"status":          "deployed",
+		"last_commit_sha": "deadbeef",
+		"last_sync_at":    "2026-05-23T10:00:00Z",
+		"last_error":      "",
+	})
+	if err != nil {
+		// Fail at the fixture instead of seeding an empty extra_json.
+		t.Fatalf("marshal extra_json: %v", err)
+	}
+	if err := e.store.UpsertContainer(store.Container{
+		ID:           wl.ID + ":dockerfile",
+		WorkloadID:   wl.ID,
+		WorkloadKind: string(store.WorkloadKindBuild),
+		Host:         "local",
+		ContainerID:  "ffeeddcc",
+		State:        "running",
+		ExtraJSON:    string(extra),
+	}); err != nil {
+		t.Fatalf("seed container: %v", err)
+	}
+
+	resp := e.do(t, http.MethodGet, "/api/workloads/"+wl.ID+"/runtime-state", nil)
+	if resp.StatusCode != http.StatusOK {
+		t.Fatalf("status = %d, want 200", resp.StatusCode)
+	}
+	var got runtimeStatePayload
+	if errMsg := decodeEnvelope(t, resp, &got); errMsg != "" {
+		t.Fatalf("envelope error: %q", errMsg)
+	}
+	if !got.HasState {
+		t.Fatalf("HasState = false, want true")
+	}
+	if got.SourceKind != "dockerfile" {
+		t.Errorf("SourceKind = %q, want dockerfile", got.SourceKind)
+	}
+	if got.ContainerID != "ffeeddcc" || got.State != "running" {
+		t.Errorf("container fields = (%q,%q), want (ffeeddcc, running)", got.ContainerID, got.State)
+	}
+	if got.Status != "deployed" || got.LastCommitSHA != "deadbeef" {
+		t.Errorf("runtime fields = %+v, want deployed/deadbeef", got)
+	}
+}
+
+// =============================================================================
+// GET /api/workloads/{id}/storage
+// =============================================================================
+
+// TestGetWorkloadStorage_NonStaticSource_EmptyPayload expects the storage
+// endpoint to answer 200 with a zero-valued payload for an image-sourced
+// workload.
+func TestGetWorkloadStorage_NonStaticSource_EmptyPayload(t *testing.T) {
+	e := newAPITestEnv(t)
+	wl, err := e.store.CreateWorkload(store.Workload{
+		Kind:         string(store.WorkloadKindProject),
+		Name:         "img-app",
+		SourceKind:   "image",
+		SourceConfig: `{"image":"nginx"}`,
+	})
+	if err != nil {
+		t.Fatalf("seed: %v", err)
+	}
+	resp := e.do(t, http.MethodGet, "/api/workloads/"+wl.ID+"/storage", nil)
+	if resp.StatusCode != http.StatusOK {
+		t.Fatalf("status = %d, want 200", resp.StatusCode)
+	}
+	var got storageUsagePayload
+	// The assertions below expect zero values, so an unchecked decode
+	// failure would make this test pass vacuously.
+	if errMsg := decodeEnvelope(t, resp, &got); errMsg != "" {
+		t.Fatalf("envelope error: %q", errMsg)
+	}
+	if got.Enabled || got.UsedBytes != 0 {
+		t.Errorf("expected empty payload for non-static, got %+v", got)
+	}
+}
+
+// TestGetWorkloadStorage_StaticDisabled_ReturnsLimitButNoUsage seeds a
+// static-source workload with storage disabled and expects a 200 payload
+// that reports Enabled = false and no usage.
+func TestGetWorkloadStorage_StaticDisabled_ReturnsLimitButNoUsage(t *testing.T) {
+	e := newAPITestEnv(t)
+	wl, err := e.store.CreateWorkload(store.Workload{
+		Kind:         string(store.WorkloadKindSite),
+		Name:         "pages",
+		SourceKind:   "static",
+		SourceConfig: `{"provider":"gitea","storage_enabled":false,"storage_limit_mb":0}`,
+	})
+	if err != nil {
+		t.Fatalf("seed: %v", err)
+	}
+	resp := e.do(t, http.MethodGet, "/api/workloads/"+wl.ID+"/storage", nil)
+	if resp.StatusCode != http.StatusOK {
+		t.Fatalf("status = %d, want 200", resp.StatusCode)
+	}
+	var got storageUsagePayload
+	// Enabled = false is also the zero value, so a silent decode failure
+	// would otherwise look like a pass.
+	if errMsg := decodeEnvelope(t, resp, &got); errMsg != "" {
+		t.Fatalf("envelope error: %q", errMsg)
+	}
+	if got.Enabled {
+		t.Errorf("Enabled = true, want false")
+	}
+	// The test env has no docker client, so no usage can be measured.
+	if got.UsedBytes != 0 {
+		t.Errorf("UsedBytes = %d, want 0 (storage disabled)", got.UsedBytes)
+	}
+}
+
+// TestGetWorkloadStorage_StaticEnabledNoDockerClient_ReturnsZeroUsage
+// checks the storage endpoint when storage is enabled but the server has
+// no docker client: the configured limit is echoed and usage stays 0.
+func TestGetWorkloadStorage_StaticEnabledNoDockerClient_ReturnsZeroUsage(t *testing.T) {
+	// docker is nil in the test env — the handler must still return
+	// a valid payload (enabled + limit) without panicking.
+	e := newAPITestEnv(t)
+	wl, err := e.store.CreateWorkload(store.Workload{
+		Kind:         string(store.WorkloadKindSite),
+		Name:         "pages",
+		SourceKind:   "static",
+		SourceConfig: `{"provider":"gitea","storage_enabled":true,"storage_limit_mb":512}`,
+	})
+	if err != nil {
+		t.Fatalf("seed: %v", err)
+	}
+	resp := e.do(t, http.MethodGet, "/api/workloads/"+wl.ID+"/storage", nil)
+	if resp.StatusCode != http.StatusOK {
+		t.Fatalf("status = %d, want 200", resp.StatusCode)
+	}
+	var got storageUsagePayload
+	if errMsg := decodeEnvelope(t, resp, &got); errMsg != "" {
+		t.Fatalf("envelope error: %q", errMsg)
+	}
+	if !got.Enabled || got.LimitMB != 512 {
+		t.Errorf("got %+v, want enabled=true limit=512", got)
+	}
+	if got.UsedBytes != 0 {
+		t.Errorf("UsedBytes = %d, want 0 (no docker client)", got.UsedBytes)
+	}
+}
+
+// =============================================================================
+// POST /api/workloads/{id}/{stop,start}
+// =============================================================================
+
+// TestStopPluginWorkload_NotFound_404 posts a stop for a workload ID that
+// was never created and expects a 404.
+func TestStopPluginWorkload_NotFound_404(t *testing.T) {
+	env := newAPITestEnv(t)
+	resp := env.do(t, http.MethodPost, "/api/workloads/missing/stop", nil)
+	if resp.StatusCode == http.StatusNotFound {
+		return
+	}
+	_ = decodeEnvelope(t, resp, nil)
+	t.Fatalf("status = %d, want 404", resp.StatusCode)
+}
+
+// TestStopPluginWorkload_NoDockerClient_503 expects the stop endpoint to
+// refuse with 503 when the server was built without a docker client.
+func TestStopPluginWorkload_NoDockerClient_503(t *testing.T) {
+	// The test env passes a nil dockerClient. The handler must refuse
+	// with 503 rather than panicking on a nil deref.
+	e := newAPITestEnv(t)
+	wl, err := e.store.CreateWorkload(store.Workload{
+		Kind: string(store.WorkloadKindSite), Name: "x", SourceKind: "static",
+	})
+	if err != nil {
+		// A failed seed would otherwise show up as a misleading 404.
+		t.Fatalf("seed: %v", err)
+	}
+	resp := e.do(t, http.MethodPost, "/api/workloads/"+wl.ID+"/stop", nil)
+	if resp.StatusCode != http.StatusServiceUnavailable {
+		t.Fatalf("status = %d, want 503", resp.StatusCode)
+	}
+}
+
+// TestStartPluginWorkload_NoDockerClient_503 expects the start endpoint
+// to refuse with 503 when the server was built without a docker client.
+func TestStartPluginWorkload_NoDockerClient_503(t *testing.T) {
+	e := newAPITestEnv(t)
+	wl, err := e.store.CreateWorkload(store.Workload{
+		Kind: string(store.WorkloadKindSite), Name: "x", SourceKind: "static",
+	})
+	if err != nil {
+		// A failed seed would otherwise show up as a misleading 404.
+		t.Fatalf("seed: %v", err)
+	}
+	resp := e.do(t, http.MethodPost, "/api/workloads/"+wl.ID+"/start", nil)
+	if resp.StatusCode != http.StatusServiceUnavailable {
+		t.Fatalf("status = %d, want 503", resp.StatusCode)
+	}
+}
+
+// =============================================================================
+// Storage probe cache — with a nil docker client the handler returns before
+// the probe runs, so repeated calls must simply stay well-behaved (200).
+// =============================================================================
+
+// TestStorageProbeCache_SecondCallSkipsProbe calls the storage endpoint
+// twice for the same storage-enabled workload and expects 200 both times.
+// The test env has no docker client, so this exercises the no-docker
+// path rather than an actual cache hit.
+func TestStorageProbeCache_SecondCallSkipsProbe(t *testing.T) {
+	// Clear the cache so a different test order doesn't pre-warm.
+	storageProbeMu.Lock()
+	storageProbeCache = map[string]storageProbeEntry{}
+	storageProbeMu.Unlock()
+
+	e := newAPITestEnv(t)
+	wl, err := e.store.CreateWorkload(store.Workload{
+		Kind:         string(store.WorkloadKindSite),
+		Name:         "pages",
+		SourceKind:   "static",
+		SourceConfig: `{"provider":"gitea","storage_enabled":true,"storage_limit_mb":256}`,
+	})
+	if err != nil {
+		t.Fatalf("seed: %v", err)
+	}
+
+	// First call: docker is nil, so the handler short-circuits before
+	// the probe and never writes a cache entry — this test is asserting
+	// that the no-docker path is well-behaved.
+	resp := e.do(t, http.MethodGet, "/api/workloads/"+wl.ID+"/storage", nil)
+	if resp.StatusCode != http.StatusOK {
+		t.Fatalf("first call status = %d, want 200", resp.StatusCode)
+	}
+	resp.Body.Close()
+
+	// Second call should also return 200 — the path is idempotent.
+	resp = e.do(t, http.MethodGet, "/api/workloads/"+wl.ID+"/storage", nil)
+	if resp.StatusCode != http.StatusOK {
+		t.Fatalf("second call status = %d, want 200", resp.StatusCode)
+	}
+	resp.Body.Close()
+}
@@ -0,0 +1,174 @@
+package api
+
+import (
+	"errors"
+	"log/slog"
+	"net/http"
+	"strings"
+
+	"github.com/go-chi/chi/v5"
+
+	"github.com/alexei/tinyforge/internal/store"
+)
+
+// workloadVolumeRequest is the JSON body decoded by setWorkloadVolume.
+// An empty Scope is treated as the absolute scope by that handler.
+type workloadVolumeRequest struct {
+	Source string `json:"source"` // must be non-blank when the scope is absolute
+	Target string `json:"target"` // container path; must be non-empty and start with "/"
+	Scope  string `json:"scope"`  // checked with store.IsValidVolumeScope
+	Name   string `json:"name"`   // volume name, stored with the row
+}
+
+// scopeInfo carries one volume scope plus its operator-facing description.
+// The UI uses NeedsName to decide whether to show the name input.
+type scopeInfo struct {
+	Scope       string `json:"scope"`        // scope identifier, e.g. "instance" or "absolute"
+	Description string `json:"description"`  // help text shown to the operator
+	NeedsName   bool   `json:"needs_name"`   // true for the named scopes
+	PathExample string `json:"path_example"` // illustrative host path template
+}
+
+// listVolumeScopes handles GET /api/volumes/scopes. It answers with the
+// fixed catalogue of volume scopes, in display order, so the
+// workload-volume editor can render per-scope help text without
+// hard-coding the list in the frontend. The request is not inspected.
+func (s *Server) listVolumeScopes(w http.ResponseWriter, _ *http.Request) {
+	// NeedsName is left at its zero value (false) unless the scope takes
+	// an operator-chosen name.
+	catalogue := []scopeInfo{
+		{
+			Scope:       "instance",
+			Description: "Each deploy gets its own isolated directory keyed by image tag.",
+			PathExample: "{base}/{workload}/instance-{tag}/{source}",
+		},
+		{
+			Scope:       "stage",
+			Description: "Shared across all instances of this workload (alias of project scope).",
+			PathExample: "{base}/{workload}/{source}",
+		},
+		{
+			Scope:       "project",
+			Description: "Shared across all instances of this workload.",
+			PathExample: "{base}/{workload}/{source}",
+		},
+		{
+			Scope:       "project_named",
+			Description: "A named volume within the workload — multiple mounts can share the name.",
+			NeedsName:   true,
+			PathExample: "{base}/{workload}/_named/{name}/{source}",
+		},
+		{
+			Scope:       "named",
+			Description: "Globally named volume shared across workloads (e.g. shared databases).",
+			NeedsName:   true,
+			PathExample: "{base}/_named/{name}/{source}",
+		},
+		{
+			Scope:       "ephemeral",
+			Description: "In-memory tmpfs mount. Fast but data is lost when the container stops.",
+			PathExample: "(tmpfs — no host path)",
+		},
+		{
+			Scope:       "absolute",
+			Description: "Direct host path. Must be under an allowed path configured in settings.",
+			PathExample: "/mnt/nfs/data (must match allowed paths)",
+		},
+	}
+	respondJSON(w, http.StatusOK, catalogue)
+}
+
+// listWorkloadVolumes returns the stored volume rows of the workload
+// named by the {id} URL parameter. It responds 404 when the workload
+// does not exist, 500 on a store failure, and 200 with the rows
+// otherwise.
+func (s *Server) listWorkloadVolumes(w http.ResponseWriter, r *http.Request) {
+	id := chi.URLParam(r, "id")
+	if _, err := s.store.GetWorkloadByID(id); err != nil {
+		if errors.Is(err, store.ErrNotFound) {
+			respondNotFound(w, "workload")
+			return
+		}
+		// The client only sees a generic message, so keep the cause in
+		// the server log.
+		slog.Error("get workload for volume list", "workload", id, "error", err)
+		respondError(w, http.StatusInternalServerError, "get workload")
+		return
+	}
+	rows, err := s.store.ListWorkloadVolumes(id)
+	if err != nil {
+		slog.Error("list workload volumes", "workload", id, "error", err)
+		respondError(w, http.StatusInternalServerError, "list workload volumes")
+		return
+	}
+	respondJSON(w, http.StatusOK, rows)
+}
+
+// setWorkloadVolume upserts one volume mount for the workload named by
+// the {id} URL parameter.
+//
+// Validation, in order: the workload must exist (404); target must be a
+// non-empty absolute container path without ".." (400); scope defaults
+// to absolute and must pass store.IsValidVolumeScope (400); source and
+// name may not contain ".." for any scope, because both become host
+// path components for the non-absolute scopes; absolute scope also
+// requires a non-blank source. Source and target are stored trimmed.
+// Responds 200 with the stored row.
+func (s *Server) setWorkloadVolume(w http.ResponseWriter, r *http.Request) {
+	id := chi.URLParam(r, "id")
+	if _, err := s.store.GetWorkloadByID(id); err != nil {
+		if errors.Is(err, store.ErrNotFound) {
+			respondNotFound(w, "workload")
+			return
+		}
+		slog.Error("get workload for volume upsert", "workload", id, "error", err)
+		respondError(w, http.StatusInternalServerError, "get workload")
+		return
+	}
+	var req workloadVolumeRequest
+	if !decodeJSONStrict(w, r, &req) {
+		return
+	}
+	req.Target = strings.TrimSpace(req.Target)
+	// Persist the same value that is validated below; previously the
+	// check used the trimmed source but the row kept the padding.
+	req.Source = strings.TrimSpace(req.Source)
+	if req.Target == "" {
+		respondError(w, http.StatusBadRequest, "target is required")
+		return
+	}
+	if !strings.HasPrefix(req.Target, "/") {
+		respondError(w, http.StatusBadRequest, "target must be an absolute container path")
+		return
+	}
+	if strings.Contains(req.Target, "..") {
+		respondError(w, http.StatusBadRequest, "target may not contain path traversal segments")
+		return
+	}
+	scope := req.Scope
+	if scope == "" {
+		scope = string(store.VolumeScopeAbsolute)
+	}
+	if !store.IsValidVolumeScope(scope) {
+		respondError(w, http.StatusBadRequest, "invalid scope")
+		return
+	}
+	// Source and name are joined into host paths for the relative scopes
+	// (e.g. {base}/_named/{name}/{source}), so traversal is rejected for
+	// every scope, not only absolute.
+	if strings.Contains(req.Source, "..") {
+		respondError(w, http.StatusBadRequest, "source may not contain path traversal segments")
+		return
+	}
+	if strings.Contains(req.Name, "..") {
+		respondError(w, http.StatusBadRequest, "name may not contain path traversal segments")
+		return
+	}
+	// Absolute-scope mounts must reference a real host path; allow-list
+	// enforcement happens at deploy time against settings.AllowedVolumePaths.
+	if scope == string(store.VolumeScopeAbsolute) && req.Source == "" {
+		respondError(w, http.StatusBadRequest, "source is required for absolute scope")
+		return
+	}
+	row, err := s.store.SetWorkloadVolume(store.WorkloadVolume{
+		WorkloadID: id,
+		Source:     req.Source,
+		Target:     req.Target,
+		Scope:      scope,
+		Name:       req.Name,
+	})
+	if err != nil {
+		slog.Error("set workload volume", "workload", id, "target", req.Target, "error", err)
+		respondError(w, http.StatusInternalServerError, "set workload volume")
+		return
+	}
+	respondJSON(w, http.StatusOK, row)
+}
+
+// deleteWorkloadVolume removes the volume row named by the {volID} URL
+// parameter. Responds 404 when the row does not exist, 500 on a store
+// failure, and 200 with {"deleted": volID} on success.
+//
+// NOTE(review): the row is deleted by volID alone. If this route is
+// nested under a workload ID, ownership is not verified here — confirm
+// the router or the store enforces it.
+func (s *Server) deleteWorkloadVolume(w http.ResponseWriter, r *http.Request) {
+	volID := chi.URLParam(r, "volID")
+	if err := s.store.DeleteWorkloadVolume(volID); err != nil {
+		if errors.Is(err, store.ErrNotFound) {
+			respondNotFound(w, "workload volume")
+			return
+		}
+		// The client only sees a generic message, so keep the cause in
+		// the server log.
+		slog.Error("delete workload volume", "volume", volID, "error", err)
+		respondError(w, http.StatusInternalServerError, "delete workload volume")
+		return
+	}
+	respondJSON(w, http.StatusOK, map[string]string{"deleted": volID})
+}
@@ -0,0 +1,110 @@
+package api
+
+import (
+	"errors"
+	"net/http"
+
+	"github.com/alexei/tinyforge/internal/store"
+	"github.com/go-chi/chi/v5"
+)
+
+// listWorkloads handles GET /api/workloads. The optional ?kind= query
+// parameter (project|stack|site) is handed to the store as the filter;
+// an absent parameter yields the empty kind. Clients use the result to
+// render the global Workloads view.
+func (s *Server) listWorkloads(w http.ResponseWriter, r *http.Request) {
+	filter := store.WorkloadKind(r.URL.Query().Get("kind"))
+	rows, err := s.store.ListWorkloads(filter)
+	if err == nil {
+		respondJSON(w, http.StatusOK, rows)
+		return
+	}
+	respondError(w, http.StatusInternalServerError, "list workloads")
+}
+
+// getWorkload handles GET /api/workloads/{id}: 200 with the row, 404
+// when the store reports ErrNotFound, 500 for any other store error.
+func (s *Server) getWorkload(w http.ResponseWriter, r *http.Request) {
+	row, err := s.store.GetWorkloadByID(chi.URLParam(r, "id"))
+	switch {
+	case err == nil:
+		respondJSON(w, http.StatusOK, row)
+	case errors.Is(err, store.ErrNotFound):
+		respondNotFound(w, "workload")
+	default:
+		respondError(w, http.StatusInternalServerError, "get workload")
+	}
+}
+
+// streamWorkloadContainerLogs handles GET /api/workloads/{id}/containers/{cid}/logs.
+// The container row is loaded by {cid} and must carry the workload ID
+// from the URL before the shared log streamer is invoked, so logs of a
+// foreign container cannot be streamed by guessing row IDs under the
+// wrong workload URL.
+func (s *Server) streamWorkloadContainerLogs(w http.ResponseWriter, r *http.Request) {
+	wantWorkload := chi.URLParam(r, "id")
+	rowID := chi.URLParam(r, "cid")
+
+	row, err := s.store.GetContainerByID(rowID)
+	switch {
+	case err == nil:
+		// Row found; ownership is checked next.
+	case errors.Is(err, store.ErrNotFound):
+		respondNotFound(w, "container")
+		return
+	default:
+		respondError(w, http.StatusInternalServerError, "internal server error")
+		return
+	}
+
+	switch {
+	case row.WorkloadID != wantWorkload:
+		// Returning 404 (not 403) so the existence of a container under
+		// another workload is not confirmed.
+		respondNotFound(w, "container")
+	case row.ContainerID == "":
+		respondError(w, http.StatusBadRequest, "container row has no docker container bound")
+	default:
+		s.streamLogsForContainer(w, r, row.ContainerID)
+	}
+}
+
+// listWorkloadContainers handles GET /api/workloads/{id}/containers.
+// It returns whatever the store lists for the workload ID (documented
+// as newest first). The frontend's <WorkloadContainers> component calls
+// this from every kind-specific detail page (project, stack, site) so
+// the table shape stays uniform. The workload's existence is not
+// checked here.
+func (s *Server) listWorkloadContainers(w http.ResponseWriter, r *http.Request) {
+	rows, err := s.store.ListContainersByWorkload(chi.URLParam(r, "id"))
+	if err == nil {
+		respondJSON(w, http.StatusOK, rows)
+		return
+	}
+	respondError(w, http.StatusInternalServerError, "list workload containers")
+}
+
+// updateWorkloadAppID handles PATCH /api/workloads/{id}/app with a body
+// of the form {"app_id": "..."}. An empty string clears the assignment.
+// The body is decoded before the workload is looked up, so a malformed
+// body is rejected even for an unknown ID. Used by the (optional) Apps UI.
+func (s *Server) updateWorkloadAppID(w http.ResponseWriter, r *http.Request) {
+	workloadID := chi.URLParam(r, "id")
+
+	var body struct {
+		AppID string `json:"app_id"`
+	}
+	if ok := decodeJSON(w, r, &body); !ok {
+		return
+	}
+
+	row, err := s.store.GetWorkloadByID(workloadID)
+	switch {
+	case err == nil:
+		// Found; apply the change below.
+	case errors.Is(err, store.ErrNotFound):
+		respondNotFound(w, "workload")
+		return
+	default:
+		respondError(w, http.StatusInternalServerError, "get workload")
+		return
+	}
+
+	row.AppID = body.AppID
+	if saveErr := s.store.UpdateWorkload(row); saveErr != nil {
+		respondError(w, http.StatusInternalServerError, "update workload")
+		return
+	}
+	respondJSON(w, http.StatusOK, row)
+}
@@ -0,0 +1,306 @@
+package api
+
+import (
+	"encoding/json"
+	"errors"
+	"fmt"
+	"log/slog"
+	"net/http"
+	"strings"
+	"time"
+
+	"github.com/go-chi/chi/v5"
+
+	"github.com/alexei/tinyforge/internal/auth"
+	"github.com/alexei/tinyforge/internal/store"
+	"github.com/alexei/tinyforge/internal/workload/plugin"
+	"github.com/alexei/tinyforge/internal/workload/preview"
+)
+
+// pluginWorkloadRequest is the JSON body accepted by create + update.
+// SourceConfig / TriggerConfig are raw JSON blobs; validatePluginKinds
+// size-checks them and hands each to the matching plugin's Validate().
+type pluginWorkloadRequest struct {
+	Name                    string              `json:"name"`
+	GroupID                 string              `json:"group_id"` // written to the row's AppID on update
+	ParentWorkloadID        string              `json:"parent_workload_id"`
+	SourceKind              string              `json:"source_kind"`
+	SourceConfig            json.RawMessage     `json:"source_config"` // capped at maxSourceConfigBytes
+	TriggerKind             string              `json:"trigger_kind"`
+	TriggerConfig           json.RawMessage     `json:"trigger_config"` // capped at maxTriggerConfigBytes
+	PublicFaces             []plugin.PublicFace `json:"public_faces"`   // at most maxPublicFaces entries
+	NotificationURL         string              `json:"notification_url"`
+	WebhookRequireSignature bool                `json:"webhook_require_signature"`
+}
+
+// Size caps for the two opaque JSON blobs, enforced by validatePluginKinds.
+// The route already caps the whole request body, but a single blob that
+// approaches that cap (a 1 MiB SourceConfig) is unreasonable for any source.
+const (
+	maxSourceConfigBytes  = 64 << 10 // 64 KiB cap on source_config
+	maxTriggerConfigBytes = 16 << 10 // 16 KiB cap on trigger_config
+	// Hard upper bound on public faces per workload. Multi-face is supported
+	// (route IDs are stored per-fqdn in container.extra_json so teardown is
+	// clean), but a workload with hundreds of public faces is almost
+	// certainly a bug in the caller, not legitimate config.
+	maxPublicFaces = 16
+)
+
+// createPluginWorkload handles POST /api/workloads.
+//
+// The body is decoded strictly, a non-blank name is required, and
+// validatePluginKinds checks the source/trigger kinds and their config
+// blobs. The request is then converted to a store row via
+// fromPluginWorkload, tagged kind="plugin", and persisted. Responds 201
+// with the created workload in plugin shape.
+func (s *Server) createPluginWorkload(w http.ResponseWriter, r *http.Request) {
+	var req pluginWorkloadRequest
+	if ok := decodeJSONStrict(w, r, &req); !ok {
+		return
+	}
+	if name := strings.TrimSpace(req.Name); name == "" {
+		respondError(w, http.StatusBadRequest, "name is required")
+		return
+	}
+	if kindErr := validatePluginKinds(req); kindErr != nil {
+		respondError(w, http.StatusBadRequest, kindErr.Error())
+		return
+	}
+
+	row, encodeErr := fromPluginWorkload(plugin.Workload{
+		Name:                    req.Name,
+		GroupID:                 req.GroupID,
+		ParentWorkloadID:        req.ParentWorkloadID,
+		SourceKind:              req.SourceKind,
+		SourceConfig:            req.SourceConfig,
+		TriggerKind:             req.TriggerKind,
+		TriggerConfig:           req.TriggerConfig,
+		PublicFaces:             req.PublicFaces,
+		NotificationURL:         req.NotificationURL,
+		WebhookRequireSignature: req.WebhookRequireSignature,
+	})
+	if encodeErr != nil {
+		respondError(w, http.StatusBadRequest, "encode workload: "+encodeErr.Error())
+		return
+	}
+
+	// Plugin-native rows are flagged with kind="plugin"; ref_id is
+	// self-referenced to the row's own ID inside CreateWorkload so the
+	// UNIQUE(kind, ref_id) index can hold many sibling plugin workloads.
+	row.Kind = "plugin"
+
+	saved, saveErr := s.store.CreateWorkload(row)
+	if saveErr != nil {
+		slog.Error("create plugin workload", "error", saveErr)
+		respondError(w, http.StatusInternalServerError, "create workload")
+		return
+	}
+	respondJSON(w, http.StatusCreated, toPluginWorkload(saved))
+}
+
+// updatePluginWorkload handles PUT /api/workloads/{id}/plugin. Only the
+// fields that belong to the plugin model are mutable here; legacy
+// project/stack/site fields are edited through their own endpoints during
+// the cutover.
+//
+// Field semantics: a blank (empty or whitespace-only) name keeps the
+// stored name; empty source_config / trigger_config and a nil
+// public_faces keep the stored values; every other field is overwritten
+// with the request value, including empty ones. Responds 404 for an
+// unknown workload, 400 for validation or encoding failures, and 200
+// with the updated workload in plugin shape.
+func (s *Server) updatePluginWorkload(w http.ResponseWriter, r *http.Request) {
+	id := chi.URLParam(r, "id")
+
+	existing, err := s.store.GetWorkloadByID(id)
+	if err != nil {
+		if errors.Is(err, store.ErrNotFound) {
+			respondNotFound(w, "workload")
+			return
+		}
+		slog.Error("get workload for update", "workload", id, "error", err)
+		respondError(w, http.StatusInternalServerError, "get workload")
+		return
+	}
+
+	var req pluginWorkloadRequest
+	if !decodeJSONStrict(w, r, &req) {
+		return
+	}
+	if err := validatePluginKinds(req); err != nil {
+		respondError(w, http.StatusBadRequest, err.Error())
+		return
+	}
+
+	// Encode public faces before mutating the row so an encoding failure
+	// cannot persist an empty or partial value.
+	if req.PublicFaces != nil {
+		b, err := json.Marshal(req.PublicFaces)
+		if err != nil {
+			respondError(w, http.StatusBadRequest, "encode public faces: "+err.Error())
+			return
+		}
+		existing.PublicFaces = string(b)
+	}
+
+	// Create rejects blank names, so a whitespace-only name must not be
+	// able to overwrite a valid one here either.
+	if strings.TrimSpace(req.Name) != "" {
+		existing.Name = req.Name
+	}
+	existing.AppID = req.GroupID
+	existing.ParentWorkloadID = req.ParentWorkloadID
+	existing.SourceKind = req.SourceKind
+	if len(req.SourceConfig) > 0 {
+		existing.SourceConfig = string(req.SourceConfig)
+	}
+	existing.TriggerKind = req.TriggerKind
+	if len(req.TriggerConfig) > 0 {
+		existing.TriggerConfig = string(req.TriggerConfig)
+	}
+	existing.NotificationURL = req.NotificationURL
+	existing.WebhookRequireSignature = req.WebhookRequireSignature
+
+	if err := s.store.UpdateWorkload(existing); err != nil {
+		slog.Error("update plugin workload", "workload", id, "error", err)
+		respondError(w, http.StatusInternalServerError, "update workload")
+		return
+	}
+	respondJSON(w, http.StatusOK, toPluginWorkload(existing))
+}
+
+// deployPluginWorkload handles POST /api/workloads/{id}/deploy.
+//
+// A manual DeploymentIntent is built and handed to the deployer's
+// DispatchPlugin, regardless of the TriggerKind configured on the
+// workload. The JSON body is optional; a `reference` in it is forwarded
+// as the intent's Reference (e.g. to force a specific image tag) and
+// `note` lands in the intent metadata. Responds 202 on success.
+func (s *Server) deployPluginWorkload(w http.ResponseWriter, r *http.Request) {
+	workloadID := chi.URLParam(r, "id")
+
+	row, err := s.store.GetWorkloadByID(workloadID)
+	switch {
+	case err == nil:
+		// Found; continue.
+	case errors.Is(err, store.ErrNotFound):
+		respondNotFound(w, "workload")
+		return
+	default:
+		respondError(w, http.StatusInternalServerError, "get workload")
+		return
+	}
+	if row.SourceKind == "" {
+		respondError(w, http.StatusBadRequest, "workload has no source_kind; cannot dispatch")
+		return
+	}
+
+	var payload struct {
+		Reference string `json:"reference"`
+		Note      string `json:"note"`
+	}
+	// NOTE(review): ContentLength is -1 when the length is unknown (e.g.
+	// chunked uploads), so such a body is skipped here — confirm that no
+	// client sends the deploy body that way.
+	if r.ContentLength > 0 && !decodeJSONStrict(w, r, &payload) {
+		return
+	}
+
+	// Attribute the deploy to the authenticated user when one is known.
+	triggeredBy := "manual"
+	claims, authenticated := auth.ClaimsFromContext(r.Context())
+	if authenticated && claims.Username != "" {
+		triggeredBy = claims.Username
+	}
+
+	intent := plugin.DeploymentIntent{
+		Reason:      "manual",
+		Reference:   payload.Reference,
+		Metadata:    map[string]string{"note": payload.Note},
+		TriggeredAt: time.Now().UTC(),
+		TriggeredBy: triggeredBy,
+	}
+	dispatchErr := s.deployer.DispatchPlugin(r.Context(), toPluginWorkload(row), intent)
+	if dispatchErr != nil {
+		// Full error stays in the server log; the client gets a generic
+		// message because the wrapped error can carry registry-auth bytes
+		// or compose-stdout secrets.
+		slog.Warn("manual dispatch failed", "workload", workloadID, "actor", triggeredBy, "error", dispatchErr)
+		respondError(w, http.StatusInternalServerError, "dispatch failed; see server logs")
+		return
+	}
+
+	respondJSON(w, http.StatusAccepted, map[string]any{
+		"workload_id":  workloadID,
+		"reference":    intent.Reference,
+		"triggered_by": triggeredBy,
+	})
+}
+
+// deletePluginWorkload handles DELETE /api/workloads/{id}.
+//
+// Order of operations: preview children are torn down and deleted, then
+// the workload's own Source teardown runs, then the row is dropped.
+// Teardown failures are logged but never block the row delete — the row
+// must not outlive the things it owns even when the cleanup is partial.
+func (s *Server) deletePluginWorkload(w http.ResponseWriter, r *http.Request) {
+	id := chi.URLParam(r, "id")
+
+	row, err := s.store.GetWorkloadByID(id)
+	switch {
+	case err == nil:
+		// Found; continue with the cascade.
+	case errors.Is(err, store.ErrNotFound):
+		respondNotFound(w, "workload")
+		return
+	default:
+		respondError(w, http.StatusInternalServerError, "get workload")
+		return
+	}
+
+	// Cascade-teardown any branch previews materialized from this workload
+	// so deleting a template does not orphan their containers, proxy routes,
+	// and rows. Operator-managed stage-chain children (which share the same
+	// parent link) are deliberately left alone — only previews are auto-owned
+	// by the template (see preview.IsPreviewChild).
+	switch previews, listErr := preview.ListPreviewChildren(s.store, row); {
+	case listErr != nil:
+		slog.Warn("delete workload: list preview children", "workload", id, "error", listErr)
+	default:
+		for _, child := range previews {
+			if child.SourceKind != "" {
+				tearErr := s.deployer.DispatchTeardown(r.Context(), toPluginWorkload(child))
+				if tearErr != nil {
+					slog.Warn("delete workload: preview child teardown error",
+						"workload", id, "child", child.ID, "error", tearErr)
+				}
+			}
+			// A child that is already gone is not worth a warning.
+			dropErr := s.store.DeleteWorkload(child.ID)
+			if dropErr != nil && !errors.Is(dropErr, store.ErrNotFound) {
+				slog.Warn("delete workload: preview child delete error",
+					"workload", id, "child", child.ID, "error", dropErr)
+			}
+		}
+	}
+
+	if row.SourceKind != "" {
+		tearErr := s.deployer.DispatchTeardown(r.Context(), toPluginWorkload(row))
+		if tearErr != nil {
+			slog.Warn("delete workload: teardown error",
+				"workload", id, "kind", row.SourceKind, "error", tearErr)
+		}
+	}
+
+	switch dropErr := s.store.DeleteWorkload(id); {
+	case dropErr == nil:
+		respondJSON(w, http.StatusOK, map[string]string{"deleted": id})
+	case errors.Is(dropErr, store.ErrNotFound):
+		respondNotFound(w, "workload")
+	default:
+		respondError(w, http.StatusInternalServerError, "delete workload")
+	}
+}
+
+// validatePluginKinds enforces the per-blob byte caps and the
+// maxPublicFaces bound, then resolves source_kind and trigger_kind to
+// registered plugins and asks each plugin to validate its own config
+// blob. Empty kinds are allowed (legacy rows or partial setup) and skip
+// the plugin lookup. The first failing check is returned.
+func validatePluginKinds(req pluginWorkloadRequest) error {
+	switch {
+	case len(req.SourceConfig) > maxSourceConfigBytes:
+		return fmt.Errorf("source_config exceeds %d bytes", maxSourceConfigBytes)
+	case len(req.TriggerConfig) > maxTriggerConfigBytes:
+		return fmt.Errorf("trigger_config exceeds %d bytes", maxTriggerConfigBytes)
+	case len(req.PublicFaces) > maxPublicFaces:
+		return fmt.Errorf("at most %d public faces per workload", maxPublicFaces)
+	}
+
+	if kind := req.SourceKind; kind != "" {
+		source, lookupErr := plugin.GetSource(kind)
+		if lookupErr != nil {
+			return lookupErr
+		}
+		if cfgErr := source.Validate(req.SourceConfig); cfgErr != nil {
+			return cfgErr
+		}
+	}
+
+	if kind := req.TriggerKind; kind != "" {
+		trigger, lookupErr := plugin.GetTrigger(kind)
+		if lookupErr != nil {
+			return lookupErr
+		}
+		if cfgErr := trigger.Validate(req.TriggerConfig); cfgErr != nil {
+			return cfgErr
+		}
+	}
+	return nil
+}
+
@@ -0,0 +1,995 @@
+package api
+
+import (
+	"bytes"
+	"context"
+	"encoding/json"
+	"fmt"
+	"io"
+	"net/http"
+	"net/http/httptest"
+	"strings"
+	"sync/atomic"
+	"testing"
+
+	"github.com/alexei/tinyforge/internal/auth"
+	"github.com/alexei/tinyforge/internal/crypto"
+	"github.com/alexei/tinyforge/internal/store"
+	"github.com/alexei/tinyforge/internal/webhook"
+	"github.com/alexei/tinyforge/internal/workload/plugin"
+
+	// Blank-imports register the source/trigger plugins the tests assert
+	// against. Mirrors cmd/server/main.go's set.
+	_ "github.com/alexei/tinyforge/internal/workload/plugin/source/image"
+	_ "github.com/alexei/tinyforge/internal/workload/plugin/trigger/git"
+	_ "github.com/alexei/tinyforge/internal/workload/plugin/trigger/manual"
+	_ "github.com/alexei/tinyforge/internal/workload/plugin/trigger/registry"
+)
+
+// =============================================================================
+// Test helpers
+// =============================================================================
+
+// fakeAPIDispatcher is the minimum PluginDispatcher the API needs. It
+// counts Deploy / Teardown calls so handlers can be observed end-to-end.
+// Returning nil errors keeps the tests focused on HTTP/store behaviour;
+// the per-test deployErrFn / teardownErrFn overrides flip that on demand.
+type fakeAPIDispatcher struct {
+	// Number of DispatchPlugin / DispatchTeardown invocations so far.
+	deployCount   atomic.Int32
+	teardownCount atomic.Int32
+
+	// Intent from the most recent DispatchPlugin call, and the workload ID
+	// from the most recent DispatchPlugin or DispatchTeardown call.
+	lastIntent atomic.Pointer[plugin.DeploymentIntent]
+	lastWorkID atomic.Value // string
+
+	// Optional hooks: when non-nil, the matching dispatch method returns
+	// the hook's result instead of nil.
+	deployErrFn   func() error
+	teardownErrFn func() error
+}
+
+// DispatchPlugin records one deploy call: it bumps deployCount and
+// remembers the intent and workload ID. It then returns whatever
+// deployErrFn yields, or nil when no hook is installed.
+func (f *fakeAPIDispatcher) DispatchPlugin(_ context.Context, w plugin.Workload, intent plugin.DeploymentIntent) error {
+	f.deployCount.Add(1)
+	f.lastIntent.Store(&intent)
+	f.lastWorkID.Store(w.ID)
+	if f.deployErrFn == nil {
+		return nil
+	}
+	return f.deployErrFn()
+}
+
+// DispatchTeardown records one teardown call: it bumps teardownCount and
+// remembers the workload ID. It then returns whatever teardownErrFn
+// yields, or nil when no hook is installed.
+func (f *fakeAPIDispatcher) DispatchTeardown(_ context.Context, w plugin.Workload) error {
+	f.teardownCount.Add(1)
+	f.lastWorkID.Store(w.ID)
+	if f.teardownErrFn == nil {
+		return nil
+	}
+	return f.teardownErrFn()
+}
+
+// PluginDeps returns a zero-value plugin.Deps.
+func (f *fakeAPIDispatcher) PluginDeps() plugin.Deps {
+	var deps plugin.Deps
+	return deps
+}
+
+// apiTestEnv bundles everything a test needs: a live test server, the
+// underlying store for asserting persistence, the fake dispatcher for
+// observing dispatch calls, and an admin token for hitting protected routes.
+// encKey is the key the server was constructed with; tests reuse it to mint
+// additional tokens and to decrypt values stored at rest.
+type apiTestEnv struct {
+	srv        *httptest.Server
+	store      *store.Store
+	dispatcher *fakeAPIDispatcher
+	adminToken string
+	encKey     [32]byte
+}
+
+// close shuts down the test HTTP server. newAPITestEnv already registers
+// the same shutdown through t.Cleanup, so calling this is optional.
+func (e *apiTestEnv) close() { e.srv.Close() }
+
+// newAPITestEnv spins up an in-memory store, a fake dispatcher, a webhook
+// handler bound to the dispatcher, and the API server. A valid admin JWT
+// is minted so authenticated routes can be exercised. The store and the
+// HTTP server are both torn down automatically via t.Cleanup.
+func newAPITestEnv(t *testing.T) *apiTestEnv {
+	t.Helper()
+
+	st, err := store.New(":memory:")
+	if err != nil {
+		t.Fatalf("create store: %v", err)
+	}
+	t.Cleanup(func() { st.Close() })
+
+	// Fixed key (remaining bytes are zero) so tokens and ciphertext are
+	// reproducible across helpers that rebuild auth/crypto from encKey.
+	encKey := [32]byte{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}
+	dispatcher := &fakeAPIDispatcher{}
+	wh := webhook.NewHandler(st)
+	wh.SetPluginDispatcher(dispatcher)
+
+	srv := NewServer(
+		st,
+		nil, // dockerClient — unused on the routes under test
+		nil, // npmClient
+		nil, // proxyProvider
+		dispatcher,
+		nil, // notifier
+		wh,
+		nil, // eventBus
+		encKey,
+	)
+
+	httpsrv := httptest.NewServer(srv.Router())
+	t.Cleanup(httpsrv.Close)
+
+	// Mint an admin token with a separate auth.LocalAuth built from the same
+	// encKey. The router constructs its own LocalAuth from encKey internally;
+	// because the signing key is derived from encKey, a token produced here
+	// is accepted by the server's middleware.
+	la := auth.NewLocalAuth(encKey)
+	tok, err := la.GenerateToken(auth.Claims{
+		UserID: "u-admin", Username: "admin", Role: "admin",
+	})
+	if err != nil {
+		t.Fatalf("mint token: %v", err)
+	}
+
+	return &apiTestEnv{
+		srv:        httpsrv,
+		store:      st,
+		dispatcher: dispatcher,
+		adminToken: tok.Token,
+		encKey:     encKey,
+	}
+}
+
+// do sends an admin-authenticated request to the test server and hands
+// back the raw response. A non-nil body is JSON-encoded and sent with a
+// JSON Content-Type. Any failure to build or send the request is fatal:
+// it indicates a bug in the test, not in the system under test.
+func (e *apiTestEnv) do(t *testing.T, method, path string, body any) *http.Response {
+	t.Helper()
+
+	var payload io.Reader
+	hasBody := body != nil
+	if hasBody {
+		encoded, err := json.Marshal(body)
+		if err != nil {
+			t.Fatalf("marshal body: %v", err)
+		}
+		payload = bytes.NewReader(encoded)
+	}
+
+	req, err := http.NewRequest(method, e.srv.URL+path, payload)
+	if err != nil {
+		t.Fatalf("new request: %v", err)
+	}
+	req.Header.Set("Authorization", "Bearer "+e.adminToken)
+	if hasBody {
+		req.Header.Set("Content-Type", "application/json")
+	}
+
+	resp, err := http.DefaultClient.Do(req)
+	if err != nil {
+		t.Fatalf("do request: %v", err)
+	}
+	return resp
+}
+
+// decodeEnvelope consumes and closes resp.Body, parses it as the standard
+// {success,data,error} envelope, and — when out is non-nil and data is
+// present — decodes data into out. The envelope's error string is
+// returned. Read or parse failures are fatal; callers are expected to
+// assert the status code themselves.
+func decodeEnvelope(t *testing.T, resp *http.Response, out any) string {
+	t.Helper()
+	defer resp.Body.Close()
+
+	type envelope struct {
+		Success bool            `json:"success"`
+		Data    json.RawMessage `json:"data"`
+		Error   string          `json:"error"`
+	}
+
+	raw, err := io.ReadAll(resp.Body)
+	if err != nil {
+		t.Fatalf("read body: %v", err)
+	}
+
+	var parsed envelope
+	if err := json.Unmarshal(raw, &parsed); err != nil {
+		t.Fatalf("unmarshal envelope: %v\nbody=%s", err, string(raw))
+	}
+
+	if out == nil || len(parsed.Data) == 0 {
+		return parsed.Error
+	}
+	if err := json.Unmarshal(parsed.Data, out); err != nil {
+		t.Fatalf("unmarshal data: %v\ndata=%s", err, string(parsed.Data))
+	}
+	return parsed.Error
+}
+
+// validImageSourceConfig yields the canonical image-source config blob
+// shared by every create-success test, so they all submit the same
+// payload.
+func validImageSourceConfig() json.RawMessage {
+	const cfg = `{"image":"registry.example.com/owner/app","port":8080,"default_tag":"latest"}`
+	return json.RawMessage(cfg)
+}
+
+// =============================================================================
+// POST /api/workloads — create
+// =============================================================================
+
+// TestCreateWorkload_HappyPath_ReturnsCreatedRow posts a valid image-source
+// workload and checks both the response payload and the persisted row.
+func TestCreateWorkload_HappyPath_ReturnsCreatedRow(t *testing.T) {
+	e := newAPITestEnv(t)
+
+	body := pluginWorkloadRequest{
+		Name:         "my-app",
+		SourceKind:   "image",
+		SourceConfig: validImageSourceConfig(),
+	}
+	resp := e.do(t, http.MethodPost, "/api/workloads", body)
+	if resp.StatusCode != http.StatusCreated {
+		// decodeEnvelope drains and closes the body; the string it returns
+		// is the server's error message, which is the useful failure detail.
+		t.Fatalf("status = %d, want 201 (error=%q)", resp.StatusCode, decodeEnvelope(t, resp, nil))
+	}
+	var got plugin.Workload
+	if errMsg := decodeEnvelope(t, resp, &got); errMsg != "" {
+		t.Fatalf("envelope error: %q", errMsg)
+	}
+	if got.ID == "" {
+		t.Fatal("expected ID to be assigned")
+	}
+	if got.Name != "my-app" {
+		t.Fatalf("Name = %q, want my-app", got.Name)
+	}
+	// Sanity: the row is persisted in the store with the same ID and the
+	// kind="plugin" sentinel so legacy filters continue to skip it.
+	row, err := e.store.GetWorkloadByID(got.ID)
+	if err != nil {
+		t.Fatalf("GetWorkloadByID: %v", err)
+	}
+	if row.Kind != "plugin" {
+		t.Fatalf("row.Kind = %q, want plugin", row.Kind)
+	}
+	// CreateWorkload self-references RefID to ID for plugin-native rows
+	// so the UNIQUE(kind, ref_id) constraint can hold many siblings.
+	if row.RefID != row.ID {
+		t.Fatalf("RefID = %q, want self-reference to ID %q", row.RefID, row.ID)
+	}
+}
+
+// TestCreateWorkload_ValidationErrors drives POST /api/workloads with a
+// table of malformed requests. Each must be rejected with the expected
+// status code and a response body containing the expected fragment.
+func TestCreateWorkload_ValidationErrors(t *testing.T) {
+	type testCase struct {
+		name     string
+		req      pluginWorkloadRequest
+		wantCode int
+		wantSub  string
+	}
+
+	// A source config that is guaranteed to exceed the byte cap.
+	oversized := json.RawMessage(`{"image":"x","junk":"` + strings.Repeat("A", maxSourceConfigBytes+10) + `"}`)
+
+	table := []testCase{
+		{
+			name:     "empty name",
+			req:      pluginWorkloadRequest{Name: "   ", SourceKind: "image", SourceConfig: validImageSourceConfig()},
+			wantCode: http.StatusBadRequest,
+			wantSub:  "name is required",
+		},
+		{
+			name:     "unknown source kind",
+			req:      pluginWorkloadRequest{Name: "x", SourceKind: "no-such-kind", SourceConfig: json.RawMessage(`{}`)},
+			wantCode: http.StatusBadRequest,
+			wantSub:  "no source registered",
+		},
+		{
+			name:     "unknown trigger kind via inline binding (validateTrigger)",
+			req:      pluginWorkloadRequest{Name: "x", SourceKind: "image", SourceConfig: validImageSourceConfig(), TriggerKind: "no-such-trigger", TriggerConfig: json.RawMessage(`{}`)},
+			wantCode: http.StatusBadRequest,
+			wantSub:  "no trigger registered",
+		},
+		{
+			name:     "oversized source config",
+			req:      pluginWorkloadRequest{Name: "x", SourceKind: "image", SourceConfig: oversized},
+			wantCode: http.StatusBadRequest,
+			wantSub:  "source_config exceeds",
+		},
+	}
+
+	for _, tc := range table {
+		t.Run(tc.name, func(t *testing.T) {
+			e := newAPITestEnv(t)
+			resp := e.do(t, http.MethodPost, "/api/workloads", tc.req)
+			defer resp.Body.Close()
+
+			// Read once; both assertions report against the same text.
+			raw, _ := io.ReadAll(resp.Body)
+			got := string(raw)
+			if resp.StatusCode != tc.wantCode {
+				t.Fatalf("status = %d, want %d (body=%s)", resp.StatusCode, tc.wantCode, got)
+			}
+			if !strings.Contains(got, tc.wantSub) {
+				t.Fatalf("body %q missing substring %q", got, tc.wantSub)
+			}
+		})
+	}
+}
+
+// =============================================================================
+// GET /api/workloads — list
+// =============================================================================
+
+// TestListWorkloads_Empty verifies that a fresh store lists zero workloads.
+func TestListWorkloads_Empty(t *testing.T) {
+	e := newAPITestEnv(t)
+	resp := e.do(t, http.MethodGet, "/api/workloads", nil)
+	if resp.StatusCode != http.StatusOK {
+		// decodeEnvelope closes the body and surfaces the server's error.
+		t.Fatalf("status = %d, want 200 (error=%q)", resp.StatusCode, decodeEnvelope(t, resp, nil))
+	}
+	var got []plugin.Workload
+	if errMsg := decodeEnvelope(t, resp, &got); errMsg != "" {
+		t.Fatalf("envelope error: %q", errMsg)
+	}
+	if len(got) != 0 {
+		t.Fatalf("expected empty list, got %d rows", len(got))
+	}
+}
+
+// TestListWorkloads_Populated seeds two workloads and verifies the list
+// endpoint returns exactly those two.
+func TestListWorkloads_Populated(t *testing.T) {
+	e := newAPITestEnv(t)
+	// Seed through seedWorkload, which goes through the real
+	// POST /api/workloads handler.
+	alphaID := seedWorkload(t, e, "alpha")
+	betaID := seedWorkload(t, e, "beta")
+
+	resp := e.do(t, http.MethodGet, "/api/workloads", nil)
+	if resp.StatusCode != http.StatusOK {
+		t.Fatalf("status = %d, want 200 (error=%q)", resp.StatusCode, decodeEnvelope(t, resp, nil))
+	}
+	var got []plugin.Workload
+	if errMsg := decodeEnvelope(t, resp, &got); errMsg != "" {
+		t.Fatalf("envelope error: %q", errMsg)
+	}
+	// Assert membership, not just count — a regression that dropped a row
+	// while inserting a duplicate would pass a bare len() check.
+	seen := map[string]bool{}
+	for _, w := range got {
+		seen[w.ID] = true
+	}
+	if !seen[alphaID] || !seen[betaID] {
+		t.Fatalf("list missing seeded ids; got=%v want both %s and %s", got, alphaID, betaID)
+	}
+	if len(got) != 2 {
+		t.Fatalf("expected 2 rows, got %d", len(got))
+	}
+}
+
+// =============================================================================
+// GET /api/workloads/{id}
+// =============================================================================
+
+// TestGetWorkload_NotFound verifies that an unknown workload ID yields 404.
+func TestGetWorkload_NotFound(t *testing.T) {
+	e := newAPITestEnv(t)
+	resp := e.do(t, http.MethodGet, "/api/workloads/no-such-id", nil)
+	defer resp.Body.Close()
+	if resp.StatusCode != http.StatusNotFound {
+		t.Fatalf("status = %d, want 404", resp.StatusCode)
+	}
+}
+
+// TestGetWorkload_Found verifies that a seeded workload can be fetched by ID.
+func TestGetWorkload_Found(t *testing.T) {
+	e := newAPITestEnv(t)
+	id := seedWorkload(t, e, "fetch-me")
+	resp := e.do(t, http.MethodGet, "/api/workloads/"+id, nil)
+	defer resp.Body.Close()
+	if resp.StatusCode != http.StatusOK {
+		t.Fatalf("status = %d, want 200", resp.StatusCode)
+	}
+}
+
+// =============================================================================
+// PUT /api/workloads/{id}/plugin
+// =============================================================================
+
+// TestUpdatePluginWorkload_PreservesKindAndUpdatesName renames a seeded
+// workload through PUT .../plugin and confirms the stored row carries the
+// new name while its Kind stays "plugin".
+func TestUpdatePluginWorkload_PreservesKindAndUpdatesName(t *testing.T) {
+	e := newAPITestEnv(t)
+	id := seedWorkload(t, e, "before")
+
+	update := pluginWorkloadRequest{
+		Name:         "after",
+		SourceKind:   "image",
+		SourceConfig: validImageSourceConfig(),
+	}
+	resp := e.do(t, http.MethodPut, "/api/workloads/"+id+"/plugin", update)
+	if resp.StatusCode != http.StatusOK {
+		raw, _ := io.ReadAll(resp.Body)
+		t.Fatalf("status = %d, want 200 (body=%s)", resp.StatusCode, string(raw))
+	}
+	resp.Body.Close()
+
+	stored, err := e.store.GetWorkloadByID(id)
+	if err != nil {
+		t.Fatalf("GetWorkloadByID: %v", err)
+	}
+	if stored.Name != "after" {
+		t.Fatalf("Name = %q, want after", stored.Name)
+	}
+	if stored.Kind != "plugin" {
+		t.Fatalf("Kind mutated unexpectedly: %q", stored.Kind)
+	}
+}
+
+// =============================================================================
+// POST /api/workloads/{id}/deploy
+// =============================================================================
+
+// TestDeployPluginWorkload_DispatchesManualIntent triggers a deploy over
+// HTTP and inspects the intent the fake dispatcher captured: exactly one
+// dispatch, reason "manual", the requested reference, and the admin user
+// as the trigger source.
+func TestDeployPluginWorkload_DispatchesManualIntent(t *testing.T) {
+	e := newAPITestEnv(t)
+	id := seedWorkload(t, e, "deploy-me")
+
+	payload := map[string]string{
+		"reference": "v1.2.3",
+		"note":      "test deploy",
+	}
+	resp := e.do(t, http.MethodPost, "/api/workloads/"+id+"/deploy", payload)
+	if resp.StatusCode != http.StatusAccepted {
+		raw, _ := io.ReadAll(resp.Body)
+		t.Fatalf("status = %d, want 202 (body=%s)", resp.StatusCode, string(raw))
+	}
+	resp.Body.Close()
+
+	if calls := e.dispatcher.deployCount.Load(); calls != 1 {
+		t.Fatalf("Deploy called %d times, want 1", calls)
+	}
+	captured := e.dispatcher.lastIntent.Load()
+	if captured == nil {
+		t.Fatal("dispatcher did not capture intent")
+	}
+	if captured.Reason != "manual" {
+		t.Fatalf("intent.Reason = %q, want manual", captured.Reason)
+	}
+	if captured.Reference != "v1.2.3" {
+		t.Fatalf("intent.Reference = %q, want v1.2.3", captured.Reference)
+	}
+	if captured.TriggeredBy != "admin" {
+		t.Fatalf("intent.TriggeredBy = %q, want admin", captured.TriggeredBy)
+	}
+}
+
+// TestDeployPluginWorkload_RejectsWorkloadWithoutSourceKind inserts a row
+// with no SourceKind straight into the store (the API would not allow it)
+// and expects the deploy endpoint to answer 400.
+func TestDeployPluginWorkload_RejectsWorkloadWithoutSourceKind(t *testing.T) {
+	e := newAPITestEnv(t)
+
+	seed := store.Workload{Kind: "plugin", Name: "no-kind"}
+	created, err := e.store.CreateWorkload(seed)
+	if err != nil {
+		t.Fatalf("seed: %v", err)
+	}
+
+	resp := e.do(t, http.MethodPost, "/api/workloads/"+created.ID+"/deploy", nil)
+	if resp.StatusCode != http.StatusBadRequest {
+		t.Fatalf("status = %d, want 400", resp.StatusCode)
+	}
+	resp.Body.Close()
+}
+
+// =============================================================================
+// DELETE /api/workloads/{id}
+// =============================================================================
+
+// TestDeleteWorkload_CallsTeardownAndDeletes deletes a seeded workload and
+// confirms the dispatcher saw one teardown and the row is gone.
+func TestDeleteWorkload_CallsTeardownAndDeletes(t *testing.T) {
+	e := newAPITestEnv(t)
+	id := seedWorkload(t, e, "delete-me")
+
+	resp := e.do(t, http.MethodDelete, "/api/workloads/"+id, nil)
+	if resp.StatusCode != http.StatusOK {
+		t.Fatalf("status = %d, want 200", resp.StatusCode)
+	}
+	resp.Body.Close()
+
+	if calls := e.dispatcher.teardownCount.Load(); calls != 1 {
+		t.Fatalf("Teardown called %d times, want 1", calls)
+	}
+	_, lookupErr := e.store.GetWorkloadByID(id)
+	if lookupErr == nil {
+		t.Fatal("expected workload to be deleted from store")
+	}
+}
+
+// TestDeleteWorkload_TeardownErrorDoesNotBlockDelete makes the dispatcher's
+// teardown fail and verifies the workload row is removed regardless.
+func TestDeleteWorkload_TeardownErrorDoesNotBlockDelete(t *testing.T) {
+	e := newAPITestEnv(t)
+	e.dispatcher.teardownErrFn = func() error { return fmt.Errorf("teardown blew up") }
+	id := seedWorkload(t, e, "stubborn")
+
+	resp := e.do(t, http.MethodDelete, "/api/workloads/"+id, nil)
+	if resp.StatusCode != http.StatusOK {
+		body, _ := io.ReadAll(resp.Body)
+		t.Fatalf("status = %d, want 200 (body=%s)", resp.StatusCode, string(body))
+	}
+	resp.Body.Close()
+
+	// Guard against a vacuous pass: the failing teardown must actually have
+	// been invoked, otherwise the error path was never exercised.
+	if got := e.dispatcher.teardownCount.Load(); got != 1 {
+		t.Fatalf("Teardown called %d times, want 1", got)
+	}
+	if _, err := e.store.GetWorkloadByID(id); err == nil {
+		t.Fatal("workload row must be deleted even when teardown fails")
+	}
+}
+
+// =============================================================================
+// PATCH /api/workloads/{id}/app
+// =============================================================================
+
+// TestUpdateWorkloadAppID_SetsAppID patches a workload's app_id and
+// verifies the stored row reflects it.
+func TestUpdateWorkloadAppID_SetsAppID(t *testing.T) {
+	e := newAPITestEnv(t)
+	id := seedWorkload(t, e, "with-app")
+
+	resp := e.do(t, http.MethodPatch, "/api/workloads/"+id+"/app", map[string]string{
+		"app_id": "app-123",
+	})
+	if resp.StatusCode != http.StatusOK {
+		body, _ := io.ReadAll(resp.Body)
+		t.Fatalf("status = %d, want 200 (body=%s)", resp.StatusCode, string(body))
+	}
+	resp.Body.Close()
+
+	// Check the lookup error so a store failure is reported as such rather
+	// than as a confusing AppID mismatch on an unusable row.
+	row, err := e.store.GetWorkloadByID(id)
+	if err != nil {
+		t.Fatalf("GetWorkloadByID: %v", err)
+	}
+	if row.AppID != "app-123" {
+		t.Fatalf("AppID = %q, want app-123", row.AppID)
+	}
+}
+
+// =============================================================================
+// /api/workloads/{id}/env  CRUD
+// =============================================================================
+
+// TestWorkloadEnv_PutListDelete walks one plaintext env var through its
+// lifecycle: create via PUT, observe via GET, remove via DELETE, and
+// confirm the list is empty afterwards.
+func TestWorkloadEnv_PutListDelete(t *testing.T) {
+	e := newAPITestEnv(t)
+	id := seedWorkload(t, e, "with-env")
+
+	// PUT (plaintext)
+	put := e.do(t, http.MethodPut, "/api/workloads/"+id+"/env", map[string]any{
+		"key":       "DATABASE_URL",
+		"value":     "postgres://plain",
+		"encrypted": false,
+	})
+	if put.StatusCode != http.StatusOK {
+		body, _ := io.ReadAll(put.Body)
+		t.Fatalf("PUT status = %d (body=%s)", put.StatusCode, string(body))
+	}
+	put.Body.Close()
+
+	// LIST
+	listResp := e.do(t, http.MethodGet, "/api/workloads/"+id+"/env", nil)
+	var rows []workloadEnvRow
+	if errMsg := decodeEnvelope(t, listResp, &rows); errMsg != "" {
+		t.Fatalf("list envelope error: %q", errMsg)
+	}
+	if len(rows) != 1 {
+		t.Fatalf("expected 1 row, got %d", len(rows))
+	}
+	if rows[0].Value != "postgres://plain" {
+		t.Fatalf("plaintext value missing in list: got %q", rows[0].Value)
+	}
+
+	// DELETE
+	delResp := e.do(t, http.MethodDelete, "/api/workloads/"+id+"/env/"+rows[0].ID, nil)
+	if delResp.StatusCode != http.StatusOK {
+		t.Fatalf("DELETE status = %d", delResp.StatusCode)
+	}
+	delResp.Body.Close()
+
+	// An error envelope decodes to zero rows, so the error string must be
+	// checked or a failing list endpoint would pass the emptiness check.
+	listResp2 := e.do(t, http.MethodGet, "/api/workloads/"+id+"/env", nil)
+	var rows2 []workloadEnvRow
+	if errMsg := decodeEnvelope(t, listResp2, &rows2); errMsg != "" {
+		t.Fatalf("list-after-delete envelope error: %q", errMsg)
+	}
+	if len(rows2) != 0 {
+		t.Fatalf("expected 0 rows after delete, got %d", len(rows2))
+	}
+}
+
+// TestWorkloadEnv_EncryptedValueNotEchoed stores an encrypted env var and
+// verifies two things: the list response never contains the plaintext,
+// and the value persisted at rest decrypts back to it with the env's key.
+func TestWorkloadEnv_EncryptedValueNotEchoed(t *testing.T) {
+	e := newAPITestEnv(t)
+	id := seedWorkload(t, e, "secret-env")
+
+	plain := "super-secret-value-12345"
+	put := e.do(t, http.MethodPut, "/api/workloads/"+id+"/env", map[string]any{
+		"key":       "API_KEY",
+		"value":     plain,
+		"encrypted": true,
+	})
+	put.Body.Close()
+	if put.StatusCode != http.StatusOK {
+		t.Fatalf("PUT status = %d", put.StatusCode)
+	}
+
+	listResp := e.do(t, http.MethodGet, "/api/workloads/"+id+"/env", nil)
+	defer listResp.Body.Close()
+	// Without a status check the leak assertion below would pass trivially
+	// on an error response that never lists the row at all.
+	if listResp.StatusCode != http.StatusOK {
+		t.Fatalf("LIST status = %d, want 200", listResp.StatusCode)
+	}
+	body, err := io.ReadAll(listResp.Body)
+	if err != nil {
+		t.Fatalf("read list body: %v", err)
+	}
+
+	if strings.Contains(string(body), plain) {
+		t.Fatalf("encrypted plaintext leaked in response body: %s", string(body))
+	}
+
+	// Cross-check: the stored ciphertext must decrypt back to the plain value.
+	rows, err := e.store.ListWorkloadEnv(id)
+	if err != nil {
+		t.Fatalf("ListWorkloadEnv: %v", err)
+	}
+	if len(rows) != 1 || !rows[0].Encrypted {
+		t.Fatalf("expected 1 encrypted row, got %+v", rows)
+	}
+	dec, err := crypto.Decrypt(e.encKey, rows[0].Value)
+	if err != nil {
+		t.Fatalf("decrypt at-rest value: %v", err)
+	}
+	if dec != plain {
+		t.Fatalf("decrypted = %q, want %q", dec, plain)
+	}
+}
+
+// TestWorkloadEnv_RejectsInvalidKey submits the env key "1BAD-KEY" and
+// expects the endpoint to refuse it with 400.
+func TestWorkloadEnv_RejectsInvalidKey(t *testing.T) {
+	e := newAPITestEnv(t)
+	id := seedWorkload(t, e, "bad-env-key")
+
+	payload := map[string]any{
+		"key":   "1BAD-KEY",
+		"value": "x",
+	}
+	resp := e.do(t, http.MethodPut, "/api/workloads/"+id+"/env", payload)
+	defer resp.Body.Close()
+
+	if got := resp.StatusCode; got != http.StatusBadRequest {
+		t.Fatalf("status = %d, want 400", got)
+	}
+}
+
+// =============================================================================
+// /api/workloads/{id}/volumes  CRUD
+// =============================================================================
+
+// TestWorkloadVolumes_PutListDelete walks one volume mount through its
+// lifecycle: create via PUT, observe via GET, remove via DELETE, and
+// confirm the store holds no volumes afterwards.
+func TestWorkloadVolumes_PutListDelete(t *testing.T) {
+	e := newAPITestEnv(t)
+	id := seedWorkload(t, e, "with-vols")
+
+	put := e.do(t, http.MethodPut, "/api/workloads/"+id+"/volumes", map[string]any{
+		"source": "/srv/data",
+		"target": "/data",
+		"scope":  "absolute",
+	})
+	if put.StatusCode != http.StatusOK {
+		body, _ := io.ReadAll(put.Body)
+		t.Fatalf("PUT status = %d (body=%s)", put.StatusCode, string(body))
+	}
+	put.Body.Close()
+
+	listResp := e.do(t, http.MethodGet, "/api/workloads/"+id+"/volumes", nil)
+	var rows []store.WorkloadVolume
+	if errMsg := decodeEnvelope(t, listResp, &rows); errMsg != "" {
+		t.Fatalf("list envelope error: %q", errMsg)
+	}
+	if len(rows) != 1 {
+		t.Fatalf("expected 1 volume, got %d", len(rows))
+	}
+
+	delResp := e.do(t, http.MethodDelete, "/api/workloads/"+id+"/volumes/"+rows[0].ID, nil)
+	if delResp.StatusCode != http.StatusOK {
+		t.Fatalf("DELETE status = %d", delResp.StatusCode)
+	}
+	delResp.Body.Close()
+
+	// A store error would otherwise surface as an empty slice and let the
+	// emptiness assertion pass without proving anything.
+	rowsAfter, err := e.store.ListWorkloadVolumes(id)
+	if err != nil {
+		t.Fatalf("ListWorkloadVolumes: %v", err)
+	}
+	if len(rowsAfter) != 0 {
+		t.Fatalf("expected 0 volumes after delete, got %d", len(rowsAfter))
+	}
+}
+
+// TestWorkloadVolumes_RejectsTraversal submits volume mounts whose source
+// or target contains "..", or whose target is relative or empty, and
+// expects each to be refused with 400.
+func TestWorkloadVolumes_RejectsTraversal(t *testing.T) {
+	e := newAPITestEnv(t)
+	id := seedWorkload(t, e, "no-traversal")
+
+	// Every case uses the "absolute" scope; only the paths vary.
+	mount := func(source, target string) map[string]any {
+		return map[string]any{"source": source, "target": target, "scope": "absolute"}
+	}
+	table := []struct {
+		name string
+		body map[string]any
+	}{
+		{name: "target with ..", body: mount("/srv/data", "/data/../etc")},
+		{name: "source with ..", body: mount("/srv/../etc/shadow", "/d")},
+		{name: "target not absolute", body: mount("/srv/data", "data")},
+		{name: "target empty", body: mount("/srv/data", "")},
+	}
+
+	for _, tc := range table {
+		t.Run(tc.name, func(t *testing.T) {
+			resp := e.do(t, http.MethodPut, "/api/workloads/"+id+"/volumes", tc.body)
+			defer resp.Body.Close()
+			if resp.StatusCode == http.StatusBadRequest {
+				return
+			}
+			raw, _ := io.ReadAll(resp.Body)
+			t.Fatalf("status = %d, want 400 (body=%s)", resp.StatusCode, string(raw))
+		})
+	}
+}
+
+// =============================================================================
+// GET /api/workloads/{id}/chain
+// =============================================================================
+
+// TestGetWorkloadChain_ParentSelfChildren requests the chain of a root
+// workload that has one child and verifies the response: no parent, self
+// is the root, and children contains exactly the seeded child.
+func TestGetWorkloadChain_ParentSelfChildren(t *testing.T) {
+	e := newAPITestEnv(t)
+	parentID := seedWorkload(t, e, "parent")
+	childID := seedWorkloadWithParent(t, e, "child", parentID)
+
+	resp := e.do(t, http.MethodGet, "/api/workloads/"+parentID+"/chain", nil)
+	var got struct {
+		Parent   *map[string]any  `json:"parent"`
+		Self     map[string]any   `json:"self"`
+		Children []map[string]any `json:"children"`
+	}
+	if errMsg := decodeEnvelope(t, resp, &got); errMsg != "" {
+		t.Fatalf("envelope error: %q", errMsg)
+	}
+	if got.Parent != nil {
+		t.Fatalf("parent should be nil for root, got %+v", *got.Parent)
+	}
+	if got.Self["id"] != parentID {
+		t.Fatalf("self.id = %v, want %s", got.Self["id"], parentID)
+	}
+	if len(got.Children) != 1 || got.Children[0]["id"] != childID {
+		t.Fatalf("children = %+v", got.Children)
+	}
+}
+
+// =============================================================================
+// POST /api/workloads/{id}/promote-from/{sourceID}
+// =============================================================================
+
+// TestPromoteFrom_CopiesRunningTagToTarget seeds a running container with
+// image tag v9.9.9 on the source workload, promotes it onto the target,
+// and verifies the target's source_config.default_tag now carries that tag.
+func TestPromoteFrom_CopiesRunningTagToTarget(t *testing.T) {
+	e := newAPITestEnv(t)
+	sourceID := seedWorkload(t, e, "stage-prod")
+	targetID := seedWorkload(t, e, "stage-staging")
+
+	// Seed a "running" container with an image_tag on the source workload
+	// so the promote endpoint has something to copy.
+	if err := e.store.UpsertContainer(store.Container{
+		ID:         sourceID + ":web",
+		WorkloadID: sourceID,
+		Role:       "web",
+		ImageTag:   "v9.9.9",
+		State:      "running",
+		LastSeenAt: store.Now(),
+	}); err != nil {
+		t.Fatalf("seed container: %v", err)
+	}
+
+	resp := e.do(t, http.MethodPost,
+		fmt.Sprintf("/api/workloads/%s/promote-from/%s", targetID, sourceID),
+		map[string]any{},
+	)
+	if resp.StatusCode != http.StatusOK {
+		body, _ := io.ReadAll(resp.Body)
+		t.Fatalf("status = %d (body=%s)", resp.StatusCode, string(body))
+	}
+	resp.Body.Close()
+
+	// Target workload's source_config.default_tag must now equal v9.9.9.
+	// The lookup error is checked so a store failure is not misreported as
+	// a JSON decode problem on an unusable row.
+	row, err := e.store.GetWorkloadByID(targetID)
+	if err != nil {
+		t.Fatalf("GetWorkloadByID(target): %v", err)
+	}
+	var cfg map[string]any
+	if err := json.Unmarshal([]byte(row.SourceConfig), &cfg); err != nil {
+		t.Fatalf("decode target source_config: %v", err)
+	}
+	if cfg["default_tag"] != "v9.9.9" {
+		t.Fatalf("default_tag = %v, want v9.9.9", cfg["default_tag"])
+	}
+}
+
+// =============================================================================
+// /api/workloads/{id}/triggers — list + inline create+bind
+// =============================================================================
+
+// TestListBindingsForWorkload_EmptyByDefault verifies that a freshly
+// created workload has no trigger bindings.
+func TestListBindingsForWorkload_EmptyByDefault(t *testing.T) {
+	e := newAPITestEnv(t)
+	id := seedWorkload(t, e, "no-bindings")
+
+	resp := e.do(t, http.MethodGet, "/api/workloads/"+id+"/triggers", nil)
+	var rows []map[string]any
+	// An error envelope decodes to zero rows, so the error string must be
+	// checked or a failing endpoint would look like "no bindings".
+	if errMsg := decodeEnvelope(t, resp, &rows); errMsg != "" {
+		t.Fatalf("envelope error: %q", errMsg)
+	}
+	if len(rows) != 0 {
+		t.Fatalf("expected 0 bindings, got %d", len(rows))
+	}
+}
+
+// TestBindTriggerToWorkload_InlineManualTrigger creates and binds a manual
+// trigger in a single POST and then checks the store for exactly one
+// binding of kind "manual" on that workload.
+func TestBindTriggerToWorkload_InlineManualTrigger(t *testing.T) {
+	e := newAPITestEnv(t)
+	id := seedWorkload(t, e, "inline-bind")
+
+	inline := map[string]any{
+		"kind":   "manual",
+		"name":   "manual-trigger-for-inline",
+		"config": json.RawMessage(`{}`),
+	}
+	payload := map[string]any{
+		"binding_config": json.RawMessage(`{}`),
+		"inline":         inline,
+	}
+	resp := e.do(t, http.MethodPost, "/api/workloads/"+id+"/triggers", payload)
+	if resp.StatusCode != http.StatusCreated {
+		raw, _ := io.ReadAll(resp.Body)
+		t.Fatalf("status = %d (body=%s)", resp.StatusCode, string(raw))
+	}
+	resp.Body.Close()
+
+	// Verify a binding row exists for this workload.
+	bound, err := e.store.ListBindingsForWorkloadWithNames(id)
+	if err != nil {
+		t.Fatalf("list bindings: %v", err)
+	}
+	if n := len(bound); n != 1 {
+		t.Fatalf("expected 1 binding, got %d", n)
+	}
+	if kind := bound[0].TriggerKind; kind != "manual" {
+		t.Fatalf("expected manual trigger, got %q", kind)
+	}
+}
+
+// TestBindTriggerToWorkload_RequiresEitherTriggerIDOrInline posts a bind
+// request carrying only binding_config and expects 400.
+func TestBindTriggerToWorkload_RequiresEitherTriggerIDOrInline(t *testing.T) {
+	e := newAPITestEnv(t)
+	id := seedWorkload(t, e, "bind-validation")
+
+	payload := map[string]any{
+		"binding_config": json.RawMessage(`{}`),
+	}
+	resp := e.do(t, http.MethodPost, "/api/workloads/"+id+"/triggers", payload)
+	defer resp.Body.Close()
+
+	if got := resp.StatusCode; got != http.StatusBadRequest {
+		t.Fatalf("status = %d, want 400", got)
+	}
+}
+
+// =============================================================================
+// Standalone /api/triggers CRUD
+// =============================================================================
+
+// TestCreateTrigger_HappyPath creates a standalone manual trigger and
+// checks the echoed kind/name as well as its presence in the store.
+func TestCreateTrigger_HappyPath(t *testing.T) {
+	e := newAPITestEnv(t)
+
+	payload := map[string]any{
+		"kind":            "manual",
+		"name":            "standalone-manual",
+		"config":          json.RawMessage(`{}`),
+		"webhook_enabled": false,
+	}
+	resp := e.do(t, http.MethodPost, "/api/triggers", payload)
+	if resp.StatusCode != http.StatusCreated {
+		raw, _ := io.ReadAll(resp.Body)
+		t.Fatalf("status = %d (body=%s)", resp.StatusCode, string(raw))
+	}
+
+	var created map[string]any
+	if errMsg := decodeEnvelope(t, resp, &created); errMsg != "" {
+		t.Fatalf("envelope error %q", errMsg)
+	}
+	kindOK := created["kind"] == "manual"
+	nameOK := created["name"] == "standalone-manual"
+	if !kindOK || !nameOK {
+		t.Fatalf("wrong shape: %v", created)
+	}
+
+	// Sanity: the row landed in the store.
+	_, err := e.store.GetTriggerByName("standalone-manual")
+	if err != nil {
+		t.Fatalf("trigger missing from store: %v", err)
+	}
+}
+
+// TestCreateTrigger_DuplicateNameReturns409 creates the same trigger twice
+// and expects the second attempt to be rejected with 409 Conflict.
+func TestCreateTrigger_DuplicateNameReturns409(t *testing.T) {
+	e := newAPITestEnv(t)
+	body := map[string]any{
+		"kind":   "manual",
+		"name":   "dup-trigger",
+		"config": json.RawMessage(`{}`),
+	}
+	// Close the first response unconditionally; only its status matters.
+	first := e.do(t, http.MethodPost, "/api/triggers", body)
+	first.Body.Close()
+	if first.StatusCode != http.StatusCreated {
+		t.Fatalf("first create status = %d", first.StatusCode)
+	}
+	r2 := e.do(t, http.MethodPost, "/api/triggers", body)
+	defer r2.Body.Close()
+	if r2.StatusCode != http.StatusConflict {
+		t.Fatalf("dup status = %d, want 409", r2.StatusCode)
+	}
+}
+
+// TestListTriggers_PopulatedKindFilter seeds two manual, one registry and
+// one git trigger, then checks that the unfiltered list has four entries
+// and that ?kind=manual returns only the two manual ones.
+func TestListTriggers_PopulatedKindFilter(t *testing.T) {
+	e := newAPITestEnv(t)
+
+	// Per-kind config blobs; kinds not listed here get an empty object.
+	configs := map[string]string{
+		"registry": `{"image":"registry.example.com/o/a","tag_pattern":"*"}`,
+		"git":      `{"repo":"o/r","mode":"push","branch":"main"}`,
+	}
+	payload := func(kind, name string) map[string]any {
+		raw, ok := configs[kind]
+		if !ok {
+			raw = `{}`
+		}
+		return map[string]any{"kind": kind, "name": name, "config": json.RawMessage(raw)}
+	}
+
+	seeds := []struct{ kind, name string }{
+		{"manual", "m1"}, {"manual", "m2"}, {"registry", "r1"}, {"git", "g1"},
+	}
+	for _, seed := range seeds {
+		r := e.do(t, http.MethodPost, "/api/triggers", payload(seed.kind, seed.name))
+		if r.StatusCode != http.StatusCreated {
+			raw, _ := io.ReadAll(r.Body)
+			t.Fatalf("seed %s/%s: status %d (%s)", seed.kind, seed.name, r.StatusCode, raw)
+		}
+		r.Body.Close()
+	}
+
+	var all []map[string]any
+	_ = decodeEnvelope(t, e.do(t, http.MethodGet, "/api/triggers", nil), &all)
+	if len(all) != 4 {
+		t.Fatalf("all triggers = %d, want 4", len(all))
+	}
+
+	var manuals []map[string]any
+	_ = decodeEnvelope(t, e.do(t, http.MethodGet, "/api/triggers?kind=manual", nil), &manuals)
+	if len(manuals) != 2 {
+		t.Fatalf("manual triggers = %d, want 2", len(manuals))
+	}
+	for _, entry := range manuals {
+		if entry["kind"] != "manual" {
+			t.Fatalf("kind filter leaked: %v", entry)
+		}
+	}
+}
+
+// TestDeleteTrigger_RemovesRow creates a manual trigger, deletes it by the
+// ID from the create response, and confirms the store no longer has it.
+func TestDeleteTrigger_RemovesRow(t *testing.T) {
+	e := newAPITestEnv(t)
+
+	payload := map[string]any{
+		"kind": "manual", "name": "del-me", "config": json.RawMessage(`{}`),
+	}
+	var created map[string]any
+	_ = decodeEnvelope(t, e.do(t, http.MethodPost, "/api/triggers", payload), &created)
+
+	id, _ := created["id"].(string)
+	if id == "" {
+		t.Fatal("no id in create response")
+	}
+
+	delResp := e.do(t, http.MethodDelete, "/api/triggers/"+id, nil)
+	delResp.Body.Close()
+	if delResp.StatusCode != http.StatusOK {
+		t.Fatalf("delete status = %d", delResp.StatusCode)
+	}
+
+	_, lookupErr := e.store.GetTriggerByID(id)
+	if lookupErr == nil {
+		t.Fatal("trigger still in store after delete")
+	}
+}
+
+// =============================================================================
+// Auth gating
+// =============================================================================
+
+// TestAdminOnlyRoutes_RejectViewerToken sends a create-workload request
+// authenticated as a "viewer" and expects 403 Forbidden. It builds the
+// request by hand because e.do always attaches the admin token.
+func TestAdminOnlyRoutes_RejectViewerToken(t *testing.T) {
+	e := newAPITestEnv(t)
+
+	// Mint a viewer token using a fresh LocalAuth bound to the same key.
+	la := auth.NewLocalAuth(e.encKey)
+	tok, err := la.GenerateToken(auth.Claims{
+		UserID: "u-viewer", Username: "viewer", Role: "viewer",
+	})
+	if err != nil {
+		t.Fatalf("mint viewer token: %v", err)
+	}
+	body := pluginWorkloadRequest{
+		Name: "x", SourceKind: "image", SourceConfig: validImageSourceConfig(),
+	}
+	b, err := json.Marshal(body)
+	if err != nil {
+		t.Fatalf("marshal body: %v", err)
+	}
+	// A nil request would panic inside Do, so fail loudly here instead.
+	req, err := http.NewRequest(http.MethodPost, e.srv.URL+"/api/workloads", bytes.NewReader(b))
+	if err != nil {
+		t.Fatalf("new request: %v", err)
+	}
+	req.Header.Set("Authorization", "Bearer "+tok.Token)
+	req.Header.Set("Content-Type", "application/json")
+	resp, err := http.DefaultClient.Do(req)
+	if err != nil {
+		t.Fatalf("do: %v", err)
+	}
+	defer resp.Body.Close()
+	if resp.StatusCode != http.StatusForbidden {
+		t.Fatalf("viewer status = %d, want 403", resp.StatusCode)
+	}
+}
+
+// TestUnauthenticatedRoutes_RejectMissingToken sends a request with no
+// Authorization header and expects 401 Unauthorized.
+func TestUnauthenticatedRoutes_RejectMissingToken(t *testing.T) {
+	e := newAPITestEnv(t)
+	req, err := http.NewRequest(http.MethodGet, e.srv.URL+"/api/workloads", nil)
+	if err != nil {
+		t.Fatalf("new request: %v", err)
+	}
+	resp, err := http.DefaultClient.Do(req)
+	if err != nil {
+		t.Fatalf("do: %v", err)
+	}
+	defer resp.Body.Close()
+	if resp.StatusCode != http.StatusUnauthorized {
+		t.Fatalf("status = %d, want 401", resp.StatusCode)
+	}
+}
+
+// =============================================================================
+// Test-only fixtures
+// =============================================================================
+
+// seedWorkload creates a minimal valid image-source workload via the
+// real POST /api/workloads handler and returns its ID. Going through
+// the handler exercises the same validation + ref_id self-reference
+// path that production callers hit. Any non-201 response or envelope
+// error is fatal.
+func seedWorkload(t *testing.T, e *apiTestEnv, name string) string {
+	t.Helper()
+	body := pluginWorkloadRequest{
+		Name:         name,
+		SourceKind:   "image",
+		SourceConfig: validImageSourceConfig(),
+	}
+	resp := e.do(t, http.MethodPost, "/api/workloads", body)
+	if resp.StatusCode != http.StatusCreated {
+		// decodeEnvelope drains and closes the body; include the server's
+		// error message so a seeding failure is diagnosable.
+		t.Fatalf("seedWorkload(%s): status = %d (error=%q)", name, resp.StatusCode, decodeEnvelope(t, resp, nil))
+	}
+	var got plugin.Workload
+	if errMsg := decodeEnvelope(t, resp, &got); errMsg != "" {
+		t.Fatalf("seedWorkload(%s): envelope error %q", name, errMsg)
+	}
+	return got.ID
+}
+
+// seedWorkloadWithParent creates a minimal valid image-source workload
+// whose ParentWorkloadID is parentID, via the real POST /api/workloads
+// handler, and returns the new workload's ID. Any non-201 response or
+// envelope error is fatal.
+func seedWorkloadWithParent(t *testing.T, e *apiTestEnv, name, parentID string) string {
+	t.Helper()
+	body := pluginWorkloadRequest{
+		Name:             name,
+		ParentWorkloadID: parentID,
+		SourceKind:       "image",
+		SourceConfig:     validImageSourceConfig(),
+	}
+	resp := e.do(t, http.MethodPost, "/api/workloads", body)
+	if resp.StatusCode != http.StatusCreated {
+		// decodeEnvelope drains and closes the body; include the server's
+		// error message so a seeding failure is diagnosable.
+		t.Fatalf("seedWorkloadWithParent(%s): status = %d (error=%q)", name, resp.StatusCode, decodeEnvelope(t, resp, nil))
+	}
+	var got plugin.Workload
+	if errMsg := decodeEnvelope(t, resp, &got); errMsg != "" {
+		t.Fatalf("seedWorkloadWithParent(%s): envelope error %q", name, errMsg)
+	}
+	return got.ID
+}
@@ -3,8 +3,10 @@ package auth
 import (
 	"crypto/hmac"
 	"crypto/sha256"
+	"encoding/hex"
 	"errors"
 	"fmt"
+	"sync"
 	"time"

 	"github.com/golang-jwt/jwt/v5"
@@ -31,21 +33,67 @@ type jwtClaims struct {
 // LocalAuth handles password hashing and JWT token management for local auth mode.
+//
+// It additionally keeps an in-memory blacklist of revoked tokens, keyed by
+// the hex-encoded SHA-256 digest of the token string and guarded by mu.
+// Because it contains a mutex, a LocalAuth must be used through a pointer
+// and not copied.
 type LocalAuth struct {
 	jwtSecret []byte
+	mu        sync.RWMutex
+	blacklist map[string]time.Time // token hash -> expiry time
 }

 // NewLocalAuth creates a LocalAuth deriving the JWT signing key from the encryption key
 // using HMAC-SHA256.
+//
+// It also allocates the revocation blacklist and starts cleanBlacklist in a
+// background goroutine. Nothing in this constructor provides a way to stop
+// that goroutine, and each call starts a new one.
 func NewLocalAuth(encKey [32]byte) *LocalAuth {
 	mac := hmac.New(sha256.New, encKey[:])
-	mac.Write([]byte("docker-watcher-jwt-secret"))
-	return &LocalAuth{
+	// NOTE(review): the HMAC label is part of the key derivation, so renaming
+	// it changes jwtSecret for the same encKey — presumably tokens signed
+	// before the rename stop validating; confirm that is intended.
+	mac.Write([]byte("tinyforge-jwt-secret"))
+	la := &LocalAuth{
 		jwtSecret: mac.Sum(nil),
+		blacklist: make(map[string]time.Time),
+	}
+	// Periodically clean expired blacklist entries.
+	go la.cleanBlacklist()
+	return la
+}
+
+// RevokeToken blacklists tokenString so that IsRevoked reports it as
+// revoked. Only the hex-encoded SHA-256 digest of the token is stored; the
+// entry's expiry is set to TokenExpiry after the moment of revocation, at
+// which point the periodic cleanup may drop it.
+func (la *LocalAuth) RevokeToken(tokenString string) {
+	digest := sha256.Sum256([]byte(tokenString))
+	entry := hex.EncodeToString(digest[:])
+
+	la.mu.Lock()
+	defer la.mu.Unlock()
+	la.blacklist[entry] = time.Now().Add(TokenExpiry)
+}
+
+// IsRevoked reports whether tokenString is currently on the blacklist. The
+// lookup uses the same hex-encoded SHA-256 digest that RevokeToken stores;
+// the stored expiry time is not consulted here.
+func (la *LocalAuth) IsRevoked(tokenString string) bool {
+	digest := sha256.Sum256([]byte(tokenString))
+	entry := hex.EncodeToString(digest[:])
+
+	la.mu.RLock()
+	_, revoked := la.blacklist[entry]
+	la.mu.RUnlock()
+	return revoked
+}
+
+// cleanBlacklist wakes once an hour and drops every blacklist entry whose
+// expiry time has passed. It is meant to run in its own goroutine: the loop
+// has no exit condition, so the call never returns.
+func (la *LocalAuth) cleanBlacklist() {
+	sweep := time.NewTicker(time.Hour)
+	for range sweep.C {
+		la.mu.Lock()
+		cutoff := time.Now()
+		for hash, expiresAt := range la.blacklist {
+			if expiresAt.Before(cutoff) {
+				delete(la.blacklist, hash)
+			}
+		}
+		la.mu.Unlock()
 	}
 }

+// bcryptCost is the work factor used for new password hashes. Bumped from
+// the library default (10) to 12 so cost grows with hardware. Existing
+// hashes at lower costs still verify — bcrypt encodes the cost in the
+// stored hash itself.
+const bcryptCost = 12
+
 // HashPassword hashes a plaintext password using bcrypt.
 func HashPassword(password string) (string, error) {
-	hash, err := bcrypt.GenerateFromPassword([]byte(password), bcrypt.DefaultCost)
+	hash, err := bcrypt.GenerateFromPassword([]byte(password), bcryptCost)
 	if err != nil {
 		return "", fmt.Errorf("hash password: %w", err)
 	}
@@ -68,7 +116,7 @@ func (la *LocalAuth) GenerateToken(claims Claims) (SessionToken, error) {
 		RegisteredClaims: jwt.RegisteredClaims{
 			ExpiresAt: jwt.NewNumericDate(expiresAt),
 			IssuedAt:  jwt.NewNumericDate(time.Now()),
-			Issuer:    "docker-watcher",
+			Issuer:    "tinyforge",
 		},
 		UserID:   claims.UserID,
 		Username: claims.Username,
@@ -0,0 +1,132 @@
+package auth
+
+import (
+	"testing"
+)
+
+// TestHashAndCheckPassword covers the hash/verify round trip: the hash must
+// differ from the plaintext, the right password must verify, and a wrong
+// password must be rejected.
+func TestHashAndCheckPassword(t *testing.T) {
+	const password = "my-password-123"
+
+	hashed, err := HashPassword(password)
+	if err != nil {
+		t.Fatalf("HashPassword failed: %v", err)
+	}
+	if hashed == password {
+		t.Fatal("hash equals plaintext")
+	}
+
+	// The matching password must verify.
+	err = CheckPassword(hashed, password)
+	if err != nil {
+		t.Fatalf("CheckPassword rejected correct password: %v", err)
+	}
+
+	// Any other password must not.
+	err = CheckPassword(hashed, "wrong-password")
+	if err == nil {
+		t.Fatal("CheckPassword accepted wrong password")
+	}
+}
+
+// TestGenerateAndValidateToken issues a token and checks that validating it
+// with the same LocalAuth yields the claims it was issued for.
+func TestGenerateAndValidateToken(t *testing.T) {
+	var encKey [32]byte
+	copy(encKey[:], "test-jwt-secret-32-bytes-needed!")
+	localAuth := NewLocalAuth(encKey)
+
+	issued, err := localAuth.GenerateToken(Claims{UserID: "u1", Username: "admin", Role: "admin"})
+	if err != nil {
+		t.Fatalf("GenerateToken failed: %v", err)
+	}
+	if issued.Token == "" {
+		t.Fatal("generated empty token")
+	}
+
+	// Round-trip the token through validation.
+	got, err := localAuth.ValidateToken(issued.Token)
+	if err != nil {
+		t.Fatalf("ValidateToken failed: %v", err)
+	}
+	matches := got.UserID == "u1" && got.Username == "admin" && got.Role == "admin"
+	if !matches {
+		t.Fatalf("claims mismatch: %+v", got)
+	}
+}
+
+// TestValidateInvalidToken checks that a string which is not a JWT at all is
+// rejected by ValidateToken.
+func TestValidateInvalidToken(t *testing.T) {
+	var encKey [32]byte
+	copy(encKey[:], "test-jwt-secret-32-bytes-needed!")
+	localAuth := NewLocalAuth(encKey)
+
+	if _, err := localAuth.ValidateToken("invalid-token-string"); err == nil {
+		t.Fatal("ValidateToken should reject invalid token")
+	}
+}
+
+// TestValidateTokenFromDifferentKey checks that a token issued by one
+// LocalAuth is rejected by a LocalAuth built from a different encryption key.
+func TestValidateTokenFromDifferentKey(t *testing.T) {
+	// newAuth builds a LocalAuth from a 32-byte key seeded with the given text.
+	newAuth := func(seed string) *LocalAuth {
+		var k [32]byte
+		copy(k[:], seed)
+		return NewLocalAuth(k)
+	}
+	signer := newAuth("first-jwt-secret-32-bytes-needed")
+	verifier := newAuth("other-jwt-secret-32-bytes-needed")
+
+	issued, err := signer.GenerateToken(Claims{UserID: "u1", Username: "admin", Role: "admin"})
+	if err != nil {
+		t.Fatalf("GenerateToken failed: %v", err)
+	}
+
+	// The second instance derives a different signing key, so it must refuse.
+	if _, err = verifier.ValidateToken(issued.Token); err == nil {
+		t.Fatal("ValidateToken should reject token signed with different key")
+	}
+}
+
+// TestHashPasswordDifferentOutputs hashes the same password twice and checks
+// that the two hashes differ (random salt) while both still verify.
+func TestHashPasswordDifferentOutputs(t *testing.T) {
+	const password = "same-password"
+
+	var hashes [2]string
+	for i := range hashes {
+		h, err := HashPassword(password)
+		if err != nil {
+			t.Fatalf("HashPassword %d failed: %v", i+1, err)
+		}
+		hashes[i] = h
+	}
+
+	if hashes[0] == hashes[1] {
+		t.Fatal("bcrypt should produce different hashes for same input (random salt)")
+	}
+
+	// Each hash must verify against the original password on its own.
+	for i, h := range hashes {
+		if CheckPassword(h, password) != nil {
+			t.Fatalf("hash%d should verify", i+1)
+		}
+	}
+}
+
+// TestTokenContainsClaims checks each claim field individually after a
+// generate/validate round trip, reporting the first field that differs.
+func TestTokenContainsClaims(t *testing.T) {
+	var encKey [32]byte
+	copy(encKey[:], "test-jwt-secret-32-bytes-needed!")
+	localAuth := NewLocalAuth(encKey)
+
+	issued, err := localAuth.GenerateToken(Claims{UserID: "user-42", Username: "testuser", Role: "viewer"})
+	if err != nil {
+		t.Fatalf("GenerateToken failed: %v", err)
+	}
+
+	got, err := localAuth.ValidateToken(issued.Token)
+	if err != nil {
+		t.Fatalf("ValidateToken failed: %v", err)
+	}
+
+	switch {
+	case got.UserID != "user-42":
+		t.Fatalf("UserID mismatch: got %q, want %q", got.UserID, "user-42")
+	case got.Username != "testuser":
+		t.Fatalf("Username mismatch: got %q, want %q", got.Username, "testuser")
+	case got.Role != "viewer":
+		t.Fatalf("Role mismatch: got %q, want %q", got.Role, "viewer")
+	}
+}
@@ -18,7 +18,7 @@ const claimsKey contextKey = "auth_claims"
 func Middleware(la *LocalAuth) func(http.Handler) http.Handler {
 	return func(next http.Handler) http.Handler {
 		return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
-			tokenStr := extractToken(r)
+			tokenStr := ExtractToken(r)
 			if tokenStr == "" {
 				http.Error(w, `{"success":false,"error":"authentication required"}`, http.StatusUnauthorized)
 				return
@@ -30,6 +30,11 @@ func Middleware(la *LocalAuth) func(http.Handler) http.Handler {
 				return
 			}

+			if la.IsRevoked(tokenStr) {
+				http.Error(w, `{"success":false,"error":"token has been revoked"}`, http.StatusUnauthorized)
+				return
+			}
+
 			ctx := context.WithValue(r.Context(), claimsKey, claims)
 			next.ServeHTTP(w, r.WithContext(ctx))
 		})
@@ -55,8 +60,8 @@ func ClaimsFromContext(ctx context.Context) (Claims, bool) {
 	return claims, ok
 }

-// extractToken gets the JWT from the Authorization header or "token" query param.
-func extractToken(r *http.Request) string {
+// ExtractToken gets the JWT from the Authorization header or "token" query param.
+func ExtractToken(r *http.Request) string {
 	// Try Authorization: Bearer <token>
 	authHeader := r.Header.Get("Authorization")
 	if strings.HasPrefix(authHeader, "Bearer ") {
@@ -2,26 +2,6 @@ package auth

 import "time"

-// User represents an authenticated user stored in the database.
-type User struct {
-	ID           string `json:"id"`
-	Username     string `json:"username"`
-	PasswordHash string `json:"-"`
-	Email        string `json:"email"`
-	Role         string `json:"role"` // admin, viewer
-	CreatedAt    string `json:"created_at"`
-	UpdatedAt    string `json:"updated_at"`
-}
-
-// AuthSettings holds the authentication configuration (single-row pattern).
-type AuthSettings struct {
-	AuthMode         string `json:"auth_mode"` // local, oidc
-	OIDCClientID     string `json:"oidc_client_id"`
-	OIDCClientSecret string `json:"-"`
-	OIDCIssuerURL    string `json:"oidc_issuer_url"`
-	OIDCRedirectURL  string `json:"oidc_redirect_url"`
-}
-
 // Claims represents the JWT token claims.
 type Claims struct {
 	UserID   string `json:"user_id"`
@@ -0,0 +1,372 @@
+package backup
+
+import (
+	"database/sql"
+	"fmt"
+	"io"
+	"log/slog"
+	"os"
+	"path/filepath"
+	"sync"
+	"time"
+
+	_ "modernc.org/sqlite" // read-only candidate inspection via PRAGMA integrity_check
+
+	"github.com/alexei/tinyforge/internal/store"
+)
+
+// Engine manages database backup operations: creating, listing, validating,
+// restoring, pruning and deleting backup files plus their metadata rows.
+type Engine struct {
+	mu        sync.Mutex   // locked by CreateBackup for the duration of a backup
+	store     *store.Store // backup metadata rows; its DB() handle is what VACUUM INTO runs on
+	dbPath    string       // live database path as passed to New
+	backupDir string       // directory backup files are written to (dataDir/backups)
+}
+
+// New builds a backup Engine whose files live in the "backups" subdirectory
+// of dataDir. The directory is created (including parents, mode 0o755) when
+// it is missing; a failure to create it is returned wrapped.
+func New(st *store.Store, dbPath, dataDir string) (*Engine, error) {
+	dir := filepath.Join(dataDir, "backups")
+	err := os.MkdirAll(dir, 0o755)
+	if err != nil {
+		return nil, fmt.Errorf("create backup directory: %w", err)
+	}
+
+	eng := &Engine{store: st, dbPath: dbPath, backupDir: dir}
+	return eng, nil
+}
+
+// BackupDir returns the directory backup files are stored in, i.e. the
+// "backups" subdirectory of the data directory that New was given.
+func (e *Engine) BackupDir() string {
+	return e.backupDir
+}
+
+// CreateBackup writes a standalone copy of the live database into the
+// backup directory using VACUUM INTO and records it in the store.
+//
+// backupType must be one of "manual", "auto", "pre-restore" or
+// "pre-deploy"; it becomes part of the filename, so anything else is
+// rejected to rule out path traversal. Calls are serialised by e.mu.
+//
+// Returns the stored metadata record. If the backup file was written but
+// could not be stat'ed or recorded, the file is removed again (best effort,
+// failures are logged) so no untracked backup is left behind.
+func (e *Engine) CreateBackup(backupType string) (store.Backup, error) {
+	// Validate backup type to prevent path traversal via filename.
+	switch backupType {
+	case "manual", "auto", "pre-restore", "pre-deploy":
+		// valid
+	default:
+		return store.Backup{}, fmt.Errorf("invalid backup type: %q", backupType)
+	}
+
+	e.mu.Lock()
+	defer e.mu.Unlock()
+
+	timestamp := time.Now().UTC().Format("20060102-150405")
+	filename := fmt.Sprintf("tinyforge-%s-%s.db", backupType, timestamp)
+	destPath := filepath.Join(e.backupDir, filename)
+
+	// VACUUM INTO creates a clean, standalone copy of the database.
+	// It is safe to use while the database is open and in WAL mode.
+	if _, err := e.store.DB().Exec(`VACUUM INTO ?`, destPath); err != nil {
+		// Deliberately no cleanup here: VACUUM INTO refuses to overwrite an
+		// existing non-empty file, so on failure destPath may be an earlier
+		// backup taken within the same second rather than our own output.
+		return store.Backup{}, fmt.Errorf("vacuum into %s: %w", destPath, err)
+	}
+
+	// From here on destPath is a file this call created; drop it again if
+	// the backup cannot be completed.
+	discard := func() {
+		if rmErr := os.Remove(destPath); rmErr != nil && !os.IsNotExist(rmErr) {
+			slog.Warn("backup: remove incomplete backup file", "path", destPath, "error", rmErr)
+		}
+	}
+
+	// Get file size.
+	info, err := os.Stat(destPath)
+	if err != nil {
+		discard()
+		return store.Backup{}, fmt.Errorf("stat backup file: %w", err)
+	}
+
+	// Store metadata.
+	backup, err := e.store.CreateBackup(store.Backup{
+		Filename:   filename,
+		SizeBytes:  info.Size(),
+		BackupType: backupType,
+	})
+	if err != nil {
+		discard()
+		return store.Backup{}, fmt.Errorf("store backup metadata: %w", err)
+	}
+
+	slog.Info("backup created", "id", backup.ID, "filename", filename, "size", info.Size(), "type", backupType)
+	return backup, nil
+}
+
+// ListBackups returns all backup records. It delegates directly to the
+// store's ListBackups and does not look at the backup directory.
+func (e *Engine) ListBackups() ([]store.Backup, error) {
+	return e.store.ListBackups()
+}
+
+// GetBackup returns the backup record with the given id. It delegates
+// directly to the store's GetBackup, returning its error unchanged.
+func (e *Engine) GetBackup(id string) (store.Backup, error) {
+	return e.store.GetBackup(id)
+}
+
+// FilePath returns the full filesystem path for a backup: backup.Filename
+// joined onto the backup directory. Unlike RestorePath it neither validates
+// the filename nor checks that the file exists.
+func (e *Engine) FilePath(backup store.Backup) string {
+	return filepath.Join(e.backupDir, backup.Filename)
+}
+
+// DeleteBackup removes a backup's file from the backup directory and then
+// its metadata record.
+//
+// A file that is already missing is not an error. The filename comes from a
+// DB row, so — matching the invariant RestorePath enforces — it is only used
+// when it is a plain base name. A row whose filename is empty, "." / ".." or
+// carries a path separator still has its metadata removed, but no file is
+// touched: a poisoned row must never delete something outside the backup
+// directory (an empty filename would otherwise resolve to the directory
+// itself).
+func (e *Engine) DeleteBackup(id string) error {
+	backup, err := e.store.GetBackup(id)
+	if err != nil {
+		return fmt.Errorf("get backup: %w", err)
+	}
+
+	// Remove file.
+	name := backup.Filename
+	if name == "" || name == "." || name == ".." || name != filepath.Base(name) {
+		slog.Warn("backup: skipping file removal for invalid filename", "id", id, "filename", name)
+	} else if err := os.Remove(filepath.Join(e.backupDir, name)); err != nil && !os.IsNotExist(err) {
+		return fmt.Errorf("remove backup file: %w", err)
+	}
+
+	// Remove metadata.
+	if err := e.store.DeleteBackup(id); err != nil {
+		return fmt.Errorf("delete backup metadata: %w", err)
+	}
+
+	slog.Info("backup deleted", "id", id, "filename", backup.Filename)
+	return nil
+}
+
+// RestorePath resolves a backup id to the path of its file for restore
+// operations. It only locates and sanity-checks the file; replacing the
+// database is left to the caller.
+//
+// Errors are returned when the record cannot be loaded, when its filename
+// is not a plain base name, or when the file cannot be stat'ed.
+func (e *Engine) RestorePath(id string) (string, error) {
+	rec, err := e.store.GetBackup(id)
+	if err != nil {
+		return "", fmt.Errorf("get backup: %w", err)
+	}
+
+	// The filename is read back from a DB row. As defence-in-depth a backup
+	// file must sit directly under backupDir, so anything carrying a path
+	// separator or traversal is refused before it is joined. A poisoned row
+	// (future import path, manual insert) must never let restore read — and
+	// then atomically copy over the live DB — an arbitrary file. CreateBackup
+	// only produces safe base names; this applies the same invariant on read.
+	name := rec.Filename
+	switch {
+	case name == "", name == ".", name == "..", filepath.Base(name) != name:
+		return "", fmt.Errorf("backup: invalid filename %q", name)
+	}
+
+	candidate := filepath.Join(e.backupDir, name)
+	if _, statErr := os.Stat(candidate); statErr != nil {
+		return "", fmt.Errorf("backup file not found: %w", statErr)
+	}
+	return candidate, nil
+}
+
+// PrepareRestore validates a backup candidate before the caller swaps it
+// over the live DB. Runs these checks in order:
+//
+//  1. The backup row resolves to an existing file (via RestorePath) that
+//     is at least 100 bytes long.
+//  2. The first 16 bytes match the SQLite header magic (catches corrupted
+//     or partial downloads).
+//  3. `PRAGMA integrity_check` returns "ok" (catches page corruption that
+//     the header check misses).
+//
+// On success returns the candidate path. On failure returns a wrapped
+// error describing which probe rejected the file, so the operator can
+// see exactly why a "restore" was refused rather than getting a corrupt
+// DB at next boot.
+//
+// The integrity check runs against the candidate file itself, opened
+// read-only with immutable=1 (see integrityCheck); no temp copy is made
+// and the candidate is not attached to the live database connection.
+func (e *Engine) PrepareRestore(id string) (string, error) {
+	path, err := e.RestorePath(id)
+	if err != nil {
+		return "", err
+	}
+
+	info, err := os.Stat(path)
+	if err != nil {
+		return "", fmt.Errorf("restore: stat candidate: %w", err)
+	}
+	// Anything under 100 bytes is rejected outright as too small.
+	if info.Size() < 100 {
+		return "", fmt.Errorf("restore: candidate %s is suspiciously small (%d bytes)", path, info.Size())
+	}
+
+	// SQLite file header: "SQLite format 3\x00" (16 bytes).
+	hdr, err := readHead(path, 16)
+	if err != nil {
+		return "", fmt.Errorf("restore: read header: %w", err)
+	}
+	if string(hdr) != "SQLite format 3\x00" {
+		return "", fmt.Errorf("restore: candidate %s is not a SQLite database (header mismatch)", path)
+	}
+
+	if err := integrityCheck(path); err != nil {
+		return "", fmt.Errorf("restore: integrity check failed: %w", err)
+	}
+
+	return path, nil
+}
+
+// readHead returns exactly the first n bytes of the file at path. A file
+// shorter than n bytes yields the error from io.ReadFull rather than a
+// short slice.
+func readHead(path string, n int) ([]byte, error) {
+	file, openErr := os.Open(path)
+	if openErr != nil {
+		return nil, openErr
+	}
+	defer file.Close()
+
+	// io.ReadFull (not file.Read) guarantees the buffer is filled. A bare
+	// Read can short-return on some filesystems / on small files, which
+	// would skew the SQLite-header magic check done by the caller.
+	head := make([]byte, n)
+	_, readErr := io.ReadFull(file, head)
+	if readErr != nil {
+		return nil, readErr
+	}
+	return head, nil
+}
+
+// integrityCheck opens the candidate read-only and runs
+// `PRAGMA integrity_check`. We use immutable=1 so the driver does not
+// try to create WAL/SHM sidecars or upgrade the journal mode on the
+// candidate — both of which fail with "attempt to write a readonly
+// database" against a backup file. Anything other than the single row
+// `"ok"` is treated as corruption.
+//
+// Returns nil when the first result row is "ok"; otherwise a wrapped error
+// naming the step that failed (open, query, iteration, scan) or carrying
+// the first problem reported by SQLite.
+func integrityCheck(path string) error {
+	db, err := sql.Open("sqlite", "file:"+path+"?mode=ro&immutable=1")
+	if err != nil {
+		return fmt.Errorf("open candidate: %w", err)
+	}
+	defer db.Close()
+
+	rows, err := db.Query("PRAGMA integrity_check")
+	if err != nil {
+		return fmt.Errorf("pragma integrity_check: %w", err)
+	}
+	defer rows.Close()
+
+	if !rows.Next() {
+		// Next also returns false when iteration failed; surface that error
+		// instead of reporting a misleading "no rows".
+		if err := rows.Err(); err != nil {
+			return fmt.Errorf("pragma integrity_check: %w", err)
+		}
+		return fmt.Errorf("integrity_check returned no rows")
+	}
+	var result string
+	if err := rows.Scan(&result); err != nil {
+		return fmt.Errorf("scan integrity_check: %w", err)
+	}
+	if result != "ok" {
+		return fmt.Errorf("integrity_check: %s", result)
+	}
+	return nil
+}
+
+// AtomicReplaceDB writes a backup candidate into place atomically.
+// The caller is expected to:
+//  1. Call PrepareRestore(id) → candidatePath.
+//  2. Take a "pre-restore" backup of the current DB via CreateBackup.
+//  3. Close the live *sql.DB.
+//  4. Call AtomicReplaceDB(candidatePath, livePath).
+//  5. Trigger graceful shutdown; main() will re-open on next start.
+//
+// AtomicReplaceDB also wipes WAL/SHM sidecar files so the new DB starts
+// from a clean checkpoint state. Failure to remove sidecars is logged
+// but non-fatal — SQLite recreates them on open.
+//
+// The candidate is first copied to livePath+".restore.tmp" and then renamed
+// over livePath. If either the copy or the rename fails, the temporary file
+// is removed and a wrapped error is returned.
+func (e *Engine) AtomicReplaceDB(candidatePath, livePath string) error {
+	// Copy candidate to a tmp file next to the live DB, then rename
+	// atomically. On Windows os.Rename across volumes fails, so we
+	// keep tmp on the same dir as the destination.
+	tmp := livePath + ".restore.tmp"
+	if err := copyFile(candidatePath, tmp); err != nil {
+		// A failed copy can leave a truncated tmp behind; remove it so a
+		// partial file never lingers next to the live DB.
+		_ = os.Remove(tmp)
+		return fmt.Errorf("copy candidate to %s: %w", tmp, err)
+	}
+	// Best-effort: remove WAL/SHM so SQLite re-checkpoints from the
+	// restored main file rather than a stale WAL pointing at the old
+	// DB's pages.
+	for _, sidecar := range []string{livePath + "-wal", livePath + "-shm"} {
+		if err := os.Remove(sidecar); err != nil && !os.IsNotExist(err) {
+			slog.Warn("restore: remove sidecar", "path", sidecar, "error", err)
+		}
+	}
+	if err := os.Rename(tmp, livePath); err != nil {
+		// Clean up tmp on rename failure so we don't leak a partial file.
+		_ = os.Remove(tmp)
+		return fmt.Errorf("rename %s → %s: %w", tmp, livePath, err)
+	}
+	slog.Info("restore: database file replaced atomically", "live", livePath)
+	return nil
+}
+
+// copyFile copies the contents of src into dst, creating dst with mode
+// 0o600 or truncating it if it already exists.
+//
+// The data is synced to stable storage before the file is closed, so a
+// caller that renames dst into place afterwards does not publish a file
+// whose contents could still be lost in a crash. Returns the first error
+// from open, copy, sync or close.
+func copyFile(src, dst string) error {
+	in, err := os.Open(src)
+	if err != nil {
+		return err
+	}
+	defer in.Close()
+	out, err := os.OpenFile(dst, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0o600)
+	if err != nil {
+		return err
+	}
+	if _, err := io.Copy(out, in); err != nil {
+		_ = out.Close()
+		return err
+	}
+	// Flush to disk before reporting success; Close alone does not
+	// guarantee the bytes are durable.
+	if err := out.Sync(); err != nil {
+		_ = out.Close()
+		return err
+	}
+	return out.Close()
+}
+
+// Prune deletes the oldest backups until at most retentionCount remain and
+// returns how many were actually deleted.
+//
+// A retentionCount of zero or less disables pruning. A backup that fails to
+// delete is logged and skipped; it does not abort the run or produce an
+// error. Errors are returned only when counting or listing backups fails.
+func (e *Engine) Prune(retentionCount int) (int, error) {
+	if retentionCount <= 0 {
+		return 0, nil
+	}
+
+	total, err := e.store.CountBackups()
+	if err != nil {
+		return 0, fmt.Errorf("count backups: %w", err)
+	}
+
+	surplus := total - retentionCount
+	if surplus <= 0 {
+		return 0, nil
+	}
+
+	victims, err := e.store.GetOldestBackups(surplus)
+	if err != nil {
+		return 0, fmt.Errorf("get oldest backups: %w", err)
+	}
+
+	var removed int
+	for _, victim := range victims {
+		delErr := e.DeleteBackup(victim.ID)
+		if delErr == nil {
+			removed++
+			continue
+		}
+		slog.Warn("prune: failed to delete backup", "id", victim.ID, "error", delErr)
+	}
+
+	if removed > 0 {
+		slog.Info("backups pruned", "pruned", removed, "retention", retentionCount)
+	}
+	return removed, nil
+}
+
+// CleanOrphans removes files in the backup directory that have no metadata
+// record and returns how many were removed.
+//
+// Subdirectories are ignored. A file that cannot be removed is logged and
+// skipped. Errors are returned only when the directory cannot be read or
+// the backup records cannot be listed.
+//
+// The engine mutex is held for the whole sweep. CreateBackup writes the
+// file first and inserts its metadata row afterwards, both under the same
+// mutex; without it a sweep running in between would see a brand-new
+// backup as an orphan and delete it.
+func (e *Engine) CleanOrphans() (int, error) {
+	e.mu.Lock()
+	defer e.mu.Unlock()
+
+	entries, err := os.ReadDir(e.backupDir)
+	if err != nil {
+		return 0, fmt.Errorf("read backup directory: %w", err)
+	}
+
+	backups, err := e.store.ListBackups()
+	if err != nil {
+		return 0, fmt.Errorf("list backups: %w", err)
+	}
+
+	tracked := make(map[string]bool, len(backups))
+	for _, b := range backups {
+		tracked[b.Filename] = true
+	}
+
+	cleaned := 0
+	for _, entry := range entries {
+		if entry.IsDir() || tracked[entry.Name()] {
+			continue
+		}
+		filePath := filepath.Join(e.backupDir, entry.Name())
+		if err := os.Remove(filePath); err != nil {
+			slog.Warn("clean orphan: failed to remove file", "file", entry.Name(), "error", err)
+			continue
+		}
+		slog.Info("removed orphaned backup file", "file", entry.Name())
+		cleaned++
+	}
+	return cleaned, nil
+}
@@ -0,0 +1,113 @@
+package backup
+
+import (
+	"errors"
+	"os"
+	"path/filepath"
+	"strings"
+	"testing"
+
+	"github.com/alexei/tinyforge/internal/store"
+)
+
+// newTestEngine builds a store and a backup engine that share a fresh
+// temporary directory, so backup files from different tests never collide.
+// It returns the engine, the store, and the database path; the store is
+// closed automatically when the test finishes.
+func newTestEngine(t *testing.T) (*Engine, *store.Store, string) {
+	t.Helper()
+
+	root := t.TempDir()
+	dbFile := filepath.Join(root, "tinyforge.db")
+
+	st, err := store.New(dbFile)
+	if err != nil {
+		t.Fatalf("store.New: %v", err)
+	}
+	t.Cleanup(func() { _ = st.Close() })
+
+	engine, err := New(st, dbFile, root)
+	if err != nil {
+		t.Fatalf("backup.New: %v", err)
+	}
+	return engine, st, dbFile
+}
+
+// TestPrepareRestore_RejectsTinyFile checks that a two-byte file registered
+// as a backup is refused by the size probe.
+func TestPrepareRestore_RejectsTinyFile(t *testing.T) {
+	eng, st, _ := newTestEngine(t)
+
+	// Drop a tiny file into the backup dir and register a row pointing at it.
+	const name = "tinyforge-manual-junk.db"
+	if err := os.WriteFile(filepath.Join(eng.BackupDir(), name), []byte("hi"), 0o600); err != nil {
+		t.Fatalf("write tiny: %v", err)
+	}
+	row, err := st.CreateBackup(store.Backup{Filename: name, SizeBytes: 2, BackupType: "manual"})
+	if err != nil {
+		t.Fatalf("CreateBackup row: %v", err)
+	}
+
+	_, err = eng.PrepareRestore(row.ID)
+	if err == nil {
+		t.Fatal("expected PrepareRestore to reject tiny file, got nil")
+	}
+	if !strings.Contains(err.Error(), "suspiciously small") {
+		t.Errorf("error = %v, want 'suspiciously small'", err)
+	}
+}
+
+// TestPrepareRestore_RejectsNonSQLite checks that a file large enough to
+// pass the size probe but lacking the SQLite magic is refused by the header
+// probe.
+func TestPrepareRestore_RejectsNonSQLite(t *testing.T) {
+	eng, st, _ := newTestEngine(t)
+
+	// 200 bytes of 'x': big enough for the size check, wrong header magic.
+	const name = "tinyforge-manual-bogus.db"
+	junk := []byte(strings.Repeat("x", 200))
+	if err := os.WriteFile(filepath.Join(eng.BackupDir(), name), junk, 0o600); err != nil {
+		t.Fatalf("write junk: %v", err)
+	}
+	row, err := st.CreateBackup(store.Backup{
+		Filename:   name,
+		SizeBytes:  int64(len(junk)),
+		BackupType: "manual",
+	})
+	if err != nil {
+		t.Fatalf("CreateBackup row: %v", err)
+	}
+
+	_, err = eng.PrepareRestore(row.ID)
+	if err == nil {
+		t.Fatal("expected PrepareRestore to reject non-SQLite blob, got nil")
+	}
+	if !strings.Contains(err.Error(), "header") {
+		t.Errorf("error = %v, want header mismatch", err)
+	}
+}
+
+// TestPrepareRestore_AcceptsValidVacuumInto checks the happy path: a backup
+// produced by the engine itself passes every PrepareRestore probe.
+func TestPrepareRestore_AcceptsValidVacuumInto(t *testing.T) {
+	eng, _, _ := newTestEngine(t)
+
+	// CreateBackup uses VACUUM INTO, which by construction yields a clean,
+	// valid SQLite database.
+	created, err := eng.CreateBackup("manual")
+	if err != nil {
+		t.Fatalf("CreateBackup: %v", err)
+	}
+
+	candidate, err := eng.PrepareRestore(created.ID)
+	if err != nil {
+		t.Fatalf("PrepareRestore on valid backup: %v", err)
+	}
+	if len(candidate) == 0 {
+		t.Errorf("PrepareRestore returned empty path")
+	}
+}
+
+// TestPrepareRestore_UnknownID checks that an id with no backup row makes
+// PrepareRestore fail.
+func TestPrepareRestore_UnknownID(t *testing.T) {
+	eng, _, _ := newTestEngine(t)
+
+	_, err := eng.PrepareRestore("nonexistent-id")
+	if err == nil {
+		t.Fatal("expected error for unknown id, got nil")
+	}
+	// Wrapping store.ErrNotFound (through RestorePath) is expected but not a
+	// hard requirement, so a different error is recorded rather than failed.
+	if !errors.Is(err, store.ErrNotFound) {
+		t.Logf("PrepareRestore error does not wrap store.ErrNotFound: %v", err)
+	}
+}
@@ -7,11 +7,12 @@ import (
 	"gopkg.in/yaml.v3"
 )

-// SeedConfig represents the top-level YAML seed configuration.
+// SeedConfig represents the top-level YAML seed configuration. After the
+// hard cutover only global settings + registries are supported; workloads
+// are created through the API.
 type SeedConfig struct {
-	Global     GlobalConfig            `yaml:"global"`
-	Registries map[string]RegistryDef  `yaml:"registries"`
-	Projects   map[string]ProjectDef   `yaml:"projects"`
+	Global     GlobalConfig           `yaml:"global"`
+	Registries map[string]RegistryDef `yaml:"registries"`
 }

 // GlobalConfig holds domain-wide settings from the seed file.
@@ -38,27 +39,6 @@ type RegistryDef struct {
 	Token string `yaml:"token"`
 }

-// ProjectDef defines a project from the seed file.
-type ProjectDef struct {
-	Registry    string            `yaml:"registry"`
-	Image       string            `yaml:"image"`
-	Port        int               `yaml:"port"`
-	Healthcheck string            `yaml:"healthcheck"`
-	Env         map[string]string `yaml:"env"`
-	Volumes     map[string]string `yaml:"volumes"`
-	Stages      map[string]StageDef `yaml:"stages"`
-}
-
-// StageDef defines a deployment stage from the seed file.
-type StageDef struct {
-	TagPattern   string `yaml:"tag_pattern"`
-	AutoDeploy   bool   `yaml:"auto_deploy"`
-	MaxInstances int    `yaml:"max_instances"`
-	Confirm      bool   `yaml:"confirm"`
-	PromoteFrom  string `yaml:"promote_from"`
-	Subdomain    string `yaml:"subdomain"`
-}
-
 // LoadSeedFile reads and parses the YAML seed config from the given path.
 func LoadSeedFile(path string) (SeedConfig, error) {
 	data, err := os.ReadFile(path)
@@ -88,25 +68,5 @@ func validate(cfg SeedConfig) error {
 	if cfg.Global.Domain == "" {
 		return fmt.Errorf("global.domain is required")
 	}
-
-	for name, proj := range cfg.Projects {
-		if proj.Image == "" {
-			return fmt.Errorf("project %q: image is required", name)
-		}
-		if proj.Registry != "" {
-			if _, ok := cfg.Registries[proj.Registry]; !ok {
-				return fmt.Errorf("project %q: references unknown registry %q", name, proj.Registry)
-			}
-		}
-		for stageName, stage := range proj.Stages {
-			if stage.TagPattern == "" {
-				return fmt.Errorf("project %q stage %q: tag_pattern is required", name, stageName)
-			}
-			if stage.MaxInstances < 0 {
-				return fmt.Errorf("project %q stage %q: max_instances must be >= 0", name, stageName)
-			}
-		}
-	}
-
 	return nil
 }
@@ -1,16 +1,17 @@
 package config

 import (
-	"encoding/json"
 	"fmt"

-	"github.com/alexei/docker-watcher/internal/store"
+	"github.com/alexei/tinyforge/internal/store"
 	"gopkg.in/yaml.v3"
 )

 // ExportConfig reads the current database state and produces a SeedConfig YAML
-// representation. Credential fields (tokens, passwords) are exported as placeholder
-// strings since they are encrypted in the database.
+// representation. Credential fields (tokens, passwords) are exported as
+// placeholder strings since they are encrypted in the database. After the hard
+// cutover, only global settings + registries are exported — workloads and
+// triggers are created through the API, not via seed files.
 func ExportConfig(db *store.Store) ([]byte, error) {
 	cfg, err := buildSeedConfig(db)
 	if err != nil {
@@ -25,7 +26,6 @@ func ExportConfig(db *store.Store) ([]byte, error) {
 	return data, nil
 }

-// buildSeedConfig constructs a SeedConfig from the current database state.
 func buildSeedConfig(db *store.Store) (SeedConfig, error) {
 	settings, err := db.GetSettings()
 	if err != nil {
@@ -37,11 +37,6 @@ func buildSeedConfig(db *store.Store) (SeedConfig, error) {
 		return SeedConfig{}, fmt.Errorf("get registries: %w", err)
 	}

-	projects, err := db.GetAllProjects()
-	if err != nil {
-		return SeedConfig{}, fmt.Errorf("get projects: %w", err)
-	}
-
 	cfg := SeedConfig{
 		Global: GlobalConfig{
 			Domain:           settings.Domain,
@@ -56,7 +51,6 @@ func buildSeedConfig(db *store.Store) (SeedConfig, error) {
 			},
 		},
 		Registries: make(map[string]RegistryDef),
-		Projects:   make(map[string]ProjectDef),
 	}

 	for _, reg := range registries {
@@ -67,52 +61,5 @@ func buildSeedConfig(db *store.Store) (SeedConfig, error) {
 		}
 	}

-	for _, proj := range projects {
-		stages, err := db.GetStagesByProjectID(proj.ID)
-		if err != nil {
-			return SeedConfig{}, fmt.Errorf("get stages for project %s: %w", proj.Name, err)
-		}
-
-		stageDefs := make(map[string]StageDef)
-		for _, st := range stages {
-			stageDefs[st.Name] = StageDef{
-				TagPattern:   st.TagPattern,
-				AutoDeploy:   st.AutoDeploy,
-				MaxInstances: st.MaxInstances,
-				Confirm:      st.Confirm,
-				PromoteFrom:  st.PromoteFrom,
-				Subdomain:    st.Subdomain,
-			}
-		}
-
-		envMap := parseJSONMap(proj.Env)
-		volMap := parseJSONMap(proj.Volumes)
-
-		cfg.Projects[proj.Name] = ProjectDef{
-			Registry:    proj.Registry,
-			Image:       proj.Image,
-			Port:        proj.Port,
-			Healthcheck: proj.Healthcheck,
-			Env:         envMap,
-			Volumes:     volMap,
-			Stages:      stageDefs,
-		}
-	}
-
 	return cfg, nil
 }
-
-// parseJSONMap safely parses a JSON-encoded map string. Returns nil on failure.
-func parseJSONMap(jsonStr string) map[string]string {
-	if jsonStr == "" || jsonStr == "{}" {
-		return nil
-	}
-	var m map[string]string
-	if err := json.Unmarshal([]byte(jsonStr), &m); err != nil {
-		return nil
-	}
-	if len(m) == 0 {
-		return nil
-	}
-	return m
-}
@@ -1,23 +1,26 @@
+// Package config loads and exports seed configuration. After the hard
+// cutover the seed shape covers only what survives the workload-first
+// refactor: global settings and registries. Project / stage / volume
+// seeding is gone; the new way to bootstrap a workload is the plugin
+// pipeline (POST /api/workloads).
 package config

 import (
-	"encoding/json"
 	"fmt"
-	"log"
+	"log/slog"
 	"os"
-	"time"

-	"github.com/alexei/docker-watcher/internal/crypto"
-	"github.com/alexei/docker-watcher/internal/store"
+	"github.com/alexei/tinyforge/internal/crypto"
+	"github.com/alexei/tinyforge/internal/store"
 	"github.com/google/uuid"
 )

 // ImportSeed loads the seed YAML file and imports its contents into the store.
-// Import is idempotent: it is skipped if any projects or registries already exist.
+// Import is idempotent: it is skipped if any registries already exist.
 // Credential fields (registry tokens, NPM password) are encrypted before storage.
 func ImportSeed(db *store.Store, seedPath string) error {
 	if _, err := os.Stat(seedPath); os.IsNotExist(err) {
-		log.Printf("No seed file at %s, skipping import", seedPath)
+		slog.Info("no seed file, skipping import", "path", seedPath)
 		return nil
 	}

@@ -26,7 +29,7 @@ func ImportSeed(db *store.Store, seedPath string) error {
 		return fmt.Errorf("check if db is populated: %w", err)
 	}
 	if populated {
-		log.Println("Database already has data, skipping seed import")
+		slog.Info("database already has data, skipping seed import")
 		return nil
 	}

@@ -44,20 +47,14 @@ func ImportSeed(db *store.Store, seedPath string) error {
 		return fmt.Errorf("import seed: %w", err)
 	}

-	log.Printf("Seed config imported from %s", seedPath)
+	slog.Info("seed config imported", "path", seedPath)
 	return nil
 }

-// isPopulated returns true if the store already contains projects or registries.
+// isPopulated returns true if the store already contains any registries.
+// Workloads / apps are intentionally not consulted — they get created
+// through the API, not seeded.
 func isPopulated(db *store.Store) (bool, error) {
-	projects, err := db.GetAllProjects()
-	if err != nil {
-		return false, fmt.Errorf("get projects: %w", err)
-	}
-	if len(projects) > 0 {
-		return true, nil
-	}
-
 	registries, err := db.GetAllRegistries()
 	if err != nil {
 		return false, fmt.Errorf("get registries: %w", err)
@@ -65,13 +62,7 @@ func isPopulated(db *store.Store) (bool, error) {
 	return len(registries) > 0, nil
 }

-// now returns the current time formatted for SQLite storage.
-func now() string {
-	return time.Now().UTC().Format("2006-01-02 15:04:05")
-}
-
-// importAll runs the full seed import inside a database transaction.
-// Uses raw SQL within the transaction so all inserts are atomic.
+// importAll runs the seed import inside a database transaction.
 func importAll(db *store.Store, cfg SeedConfig, encKey [32]byte) error {
 	tx, err := db.DB().Begin()
 	if err != nil {
@@ -79,9 +70,8 @@ func importAll(db *store.Store, cfg SeedConfig, encKey [32]byte) error {
 	}
 	defer tx.Rollback() //nolint:errcheck // rollback after commit is a no-op

-	timestamp := now()
+	timestamp := store.Now()

-	// Import registries first — projects reference them by name.
 	for name, regDef := range cfg.Registries {
 		encToken, err := crypto.EncryptIfNotEmpty(encKey, regDef.Token)
 		if err != nil {
@@ -99,50 +89,6 @@ func importAll(db *store.Store, cfg SeedConfig, encKey [32]byte) error {
 		}
 	}

-	// Import projects and their stages.
-	for name, projDef := range cfg.Projects {
-		envJSON, err := mapToJSON(projDef.Env)
-		if err != nil {
-			return fmt.Errorf("encode env for project %q: %w", name, err)
-		}
-		volJSON, err := mapToJSON(projDef.Volumes)
-		if err != nil {
-			return fmt.Errorf("encode volumes for project %q: %w", name, err)
-		}
-
-		projectID := uuid.New().String()
-		_, err = tx.Exec(
-			`INSERT INTO projects (id, name, registry, image, port, healthcheck, env, volumes, created_at, updated_at)
-			 VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`,
-			projectID, name, projDef.Registry, projDef.Image, projDef.Port,
-			projDef.Healthcheck, envJSON, volJSON, timestamp, timestamp,
-		)
-		if err != nil {
-			return fmt.Errorf("insert project %q: %w", name, err)
-		}
-
-		for stageName, stageDef := range projDef.Stages {
-			maxInstances := stageDef.MaxInstances
-			if maxInstances == 0 {
-				maxInstances = 1
-			}
-
-			stageID := uuid.New().String()
-			_, err = tx.Exec(
-				`INSERT INTO stages (id, project_id, name, tag_pattern, auto_deploy, max_instances, confirm, promote_from, subdomain, created_at, updated_at)
-				 VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`,
-				stageID, projectID, stageName, stageDef.TagPattern,
-				boolToInt(stageDef.AutoDeploy), maxInstances,
-				boolToInt(stageDef.Confirm), stageDef.PromoteFrom,
-				stageDef.Subdomain, timestamp, timestamp,
-			)
-			if err != nil {
-				return fmt.Errorf("insert stage %q for project %q: %w", stageName, name, err)
-			}
-		}
-	}
-
-	// Import global settings — encrypt NPM password.
 	encNpmPassword, err := crypto.EncryptIfNotEmpty(encKey, cfg.Global.Npm.Password)
 	if err != nil {
 		return fmt.Errorf("encrypt npm password: %w", err)
@@ -172,23 +118,3 @@ func importAll(db *store.Store, cfg SeedConfig, encKey [32]byte) error {

 	return nil
 }
-
-// boolToInt converts a bool to an integer for SQLite storage.
-func boolToInt(b bool) int {
-	if b {
-		return 1
-	}
-	return 0
-}
-
-// mapToJSON encodes a string map to JSON. Returns "{}" for nil maps.
-func mapToJSON(m map[string]string) (string, error) {
-	if m == nil {
-		return "{}", nil
-	}
-	b, err := json.Marshal(m)
-	if err != nil {
-		return "", err
-	}
-	return string(b), nil
-}
@@ -10,11 +10,26 @@ import (
 	"fmt"
 	"io"
 	"os"
+	"strings"
 )

 // ErrNoKey is returned when ENCRYPTION_KEY is not set.
 var ErrNoKey = errors.New("ENCRYPTION_KEY environment variable is not set")

+// ErrDecryptFailed is the sentinel Decrypt wraps when hex decoding, the
+// length check, or AEAD Open fails. Callers upgrading from the
+// silent-fallback pattern (treat-as-plaintext when decrypt errored)
+// MUST surface this instead — after a key rotation the fallback would
+// otherwise pass ciphertext to upstream services as if it were plaintext.
+var ErrDecryptFailed = errors.New("crypto: decrypt failed (wrong key, corrupted ciphertext, or unversioned legacy value)")
+
+// envelopeV1Prefix tags ciphertext produced by Encrypt going forward.
+// Older databases may carry unprefixed hex blobs from the v0 era; those
+// are still readable via Decrypt for backward compatibility, but every
+// new write goes through Encrypt and emits the prefix so a future key
+// rotation has a clean fail-loud signal.
+const envelopeV1Prefix = "tf1:"
+
 // DeriveKey computes a 32-byte AES-256 key from the given passphrase using SHA-256.
 // This is acceptable when ENCRYPTION_KEY is a high-entropy random string (e.g., 32+ hex chars).
 // For human-chosen passphrases, consider Argon2id or PBKDF2 with a salt instead.
@@ -28,11 +43,15 @@ func KeyFromEnv() ([32]byte, error) {
 	if raw == "" {
 		return [32]byte{}, ErrNoKey
 	}
+	if len(raw) < 32 {
+		return [32]byte{}, fmt.Errorf("ENCRYPTION_KEY must be at least 32 characters long (got %d)", len(raw))
+	}
 	return DeriveKey(raw), nil
 }

 // Encrypt encrypts plaintext using AES-256-GCM with a random nonce.
-// The returned ciphertext is hex-encoded: nonce || ciphertext+tag.
+// Returns a versioned envelope (tf1:<hex of nonce || ciphertext+tag>) so
+// downstream readers can distinguish ciphertext from accidentally-stored plaintext.
 func Encrypt(key [32]byte, plaintext string) (string, error) {
 	block, err := aes.NewCipher(key[:])
 	if err != nil {
@@ -50,14 +69,34 @@ func Encrypt(key [32]byte, plaintext string) (string, error) {
 	}

 	sealed := gcm.Seal(nonce, nonce, []byte(plaintext), nil)
-	return hex.EncodeToString(sealed), nil
+	return envelopeV1Prefix + hex.EncodeToString(sealed), nil
 }

-// Decrypt decrypts a hex-encoded ciphertext produced by Encrypt.
-func Decrypt(key [32]byte, ciphertextHex string) (string, error) {
-	data, err := hex.DecodeString(ciphertextHex)
+// HasEnvelope reports whether the value is a v1-prefixed ciphertext.
+// Useful for router-level "decrypt only if encrypted" decision points
+// that previously relied on `err == nil` from a try-decrypt — that
+// pattern silently masked rotated-key failures.
+func HasEnvelope(value string) bool {
+	return strings.HasPrefix(value, envelopeV1Prefix)
+}
+
+// Decrypt decrypts an envelope (tf1:<hex>). For backward compatibility
+// it also accepts unprefixed hex from the v0 era — but only when GCM
+// authentication succeeds; a wrong key for legacy data returns
+// ErrDecryptFailed instead of silently treating ciphertext as
+// plaintext.
+//
+// Callers MUST NOT swallow the error and fall back to "use as-is".
+// That pattern is the exact footgun the envelope versioning removes.
+func Decrypt(key [32]byte, ciphertext string) (string, error) {
+	hexBlob := ciphertext
+	if strings.HasPrefix(hexBlob, envelopeV1Prefix) {
+		hexBlob = hexBlob[len(envelopeV1Prefix):]
+	}
+
+	data, err := hex.DecodeString(hexBlob)
 	if err != nil {
-		return "", fmt.Errorf("decode hex: %w", err)
+		return "", fmt.Errorf("%w: decode hex: %v", ErrDecryptFailed, err)
 	}

 	block, err := aes.NewCipher(key[:])
@@ -72,15 +111,15 @@ func Decrypt(key [32]byte, ciphertextHex string) (string, error) {

 	nonceSize := gcm.NonceSize()
 	if len(data) < nonceSize {
-		return "", errors.New("ciphertext too short")
+		return "", fmt.Errorf("%w: ciphertext too short", ErrDecryptFailed)
 	}

 	nonce := data[:nonceSize]
-	ciphertext := data[nonceSize:]
+	body := data[nonceSize:]

-	plaintext, err := gcm.Open(nil, nonce, ciphertext, nil)
+	plaintext, err := gcm.Open(nil, nonce, body, nil)
 	if err != nil {
-		return "", fmt.Errorf("decrypt: %w", err)
+		return "", fmt.Errorf("%w: %v", ErrDecryptFailed, err)
 	}

 	return string(plaintext), nil
@@ -0,0 +1,136 @@
+package crypto
+
+import (
+	"testing"
+)
+
+func TestDeriveKey(t *testing.T) {
+	key := DeriveKey("test-passphrase-that-is-long-enough")
+	if key == [32]byte{} {
+		t.Fatal("DeriveKey returned zero key")
+	}
+	// Same input produces same output
+	key2 := DeriveKey("test-passphrase-that-is-long-enough")
+	if key != key2 {
+		t.Fatal("DeriveKey is not deterministic")
+	}
+	// Different input produces different output
+	key3 := DeriveKey("different-passphrase-also-long-enough")
+	if key == key3 {
+		t.Fatal("DeriveKey produced same key for different inputs")
+	}
+}
+
+func TestEncryptDecryptRoundTrip(t *testing.T) {
+	key := DeriveKey("test-key-for-encryption-testing-1234")
+	plaintext := "super-secret-value"
+
+	encrypted, err := Encrypt(key, plaintext)
+	if err != nil {
+		t.Fatalf("Encrypt failed: %v", err)
+	}
+	if encrypted == plaintext {
+		t.Fatal("Encrypt returned plaintext")
+	}
+	if encrypted == "" {
+		t.Fatal("Encrypt returned empty string")
+	}
+
+	decrypted, err := Decrypt(key, encrypted)
+	if err != nil {
+		t.Fatalf("Decrypt failed: %v", err)
+	}
+	if decrypted != plaintext {
+		t.Fatalf("Decrypt mismatch: got %q, want %q", decrypted, plaintext)
+	}
+}
+
+func TestDecryptWithWrongKey(t *testing.T) {
+	key1 := DeriveKey("key-one-for-testing-encryption-1234")
+	key2 := DeriveKey("key-two-for-testing-encryption-5678")
+
+	encrypted, err := Encrypt(key1, "secret")
+	if err != nil {
+		t.Fatalf("Encrypt failed: %v", err)
+	}
+
+	_, err = Decrypt(key2, encrypted)
+	if err == nil {
+		t.Fatal("Decrypt with wrong key should have failed")
+	}
+}
+
+func TestEncryptIfNotEmpty(t *testing.T) {
+	key := DeriveKey("test-key-for-encryption-testing-1234")
+
+	result, err := EncryptIfNotEmpty(key, "")
+	if err != nil {
+		t.Fatalf("EncryptIfNotEmpty with empty input failed: %v", err)
+	}
+	if result != "" {
+		t.Fatal("EncryptIfNotEmpty should return empty for empty input")
+	}
+
+	result, err = EncryptIfNotEmpty(key, "value")
+	if err != nil {
+		t.Fatalf("EncryptIfNotEmpty failed: %v", err)
+	}
+	if result == "" || result == "value" {
+		t.Fatal("EncryptIfNotEmpty should encrypt non-empty input")
+	}
+}
+
+func TestKeyFromEnv(t *testing.T) {
+	// Test with no key set
+	t.Setenv("ENCRYPTION_KEY", "")
+	_, err := KeyFromEnv()
+	if err == nil {
+		t.Fatal("KeyFromEnv should fail with empty ENCRYPTION_KEY")
+	}
+
+	// Test with valid key
+	t.Setenv("ENCRYPTION_KEY", "a-very-long-encryption-key-that-is-definitely-over-32-chars")
+	key, err := KeyFromEnv()
+	if err != nil {
+		t.Fatalf("KeyFromEnv failed with valid key: %v", err)
+	}
+	if key == [32]byte{} {
+		t.Fatal("KeyFromEnv returned zero key")
+	}
+}
+
+func TestEncryptProducesDifferentCiphertexts(t *testing.T) {
+	key := DeriveKey("test-key-for-nonce-uniqueness-1234")
+
+	enc1, err := Encrypt(key, "same-plaintext")
+	if err != nil {
+		t.Fatalf("Encrypt 1 failed: %v", err)
+	}
+
+	enc2, err := Encrypt(key, "same-plaintext")
+	if err != nil {
+		t.Fatalf("Encrypt 2 failed: %v", err)
+	}
+
+	if enc1 == enc2 {
+		t.Fatal("Two encryptions of the same plaintext should produce different ciphertexts (random nonce)")
+	}
+}
+
+func TestDecryptInvalidHex(t *testing.T) {
+	key := DeriveKey("test-key-for-invalid-hex-testing")
+
+	_, err := Decrypt(key, "not-valid-hex!!!")
+	if err == nil {
+		t.Fatal("Decrypt should fail with invalid hex input")
+	}
+}
+
+func TestDecryptTooShort(t *testing.T) {
+	key := DeriveKey("test-key-for-short-ciphertext-test")
+
+	_, err := Decrypt(key, "aabb")
+	if err == nil {
+		t.Fatal("Decrypt should fail with ciphertext shorter than nonce")
+	}
+}
@@ -1,175 +0,0 @@
-package deployer
-
-import (
-	"context"
-	"fmt"
-	"log/slog"
-
-	"github.com/alexei/docker-watcher/internal/docker"
-	"github.com/alexei/docker-watcher/internal/store"
-	"github.com/google/uuid"
-)
-
-// blueGreenDeploy performs a zero-downtime deployment:
-// 1. Start new container (green)
-// 2. Health check green
-// 3. Swap NPM proxy to point to green
-// 4. Stop old container (blue)
-//
-// If the new container fails health check, it is removed and the old one stays.
-func (d *Deployer) blueGreenDeploy(
-	ctx context.Context,
-	project store.Project,
-	stage store.Stage,
-	settings store.Settings,
-	deployID string,
-	imageTag string,
-) (string, int, string, error) {
-	// Find existing running instance for this stage (the "blue" instance).
-	existingInstances, err := d.store.GetInstancesByStageID(stage.ID)
-	if err != nil {
-		return "", 0, "", fmt.Errorf("get existing instances: %w", err)
-	}
-
-	var blueInstance *store.Instance
-	for _, inst := range existingInstances {
-		if inst.Status == "running" {
-			instCopy := inst
-			blueInstance = &instCopy
-			break
-		}
-	}
-
-	// Step 1: Pull image.
-	if err := d.store.UpdateDeployStatus(deployID, "pulling", ""); err != nil {
-		slog.Warn("update deploy status", "error", err)
-	}
-	d.publishDeployStatus(deployID, project.ID, stage.ID, imageTag, "pulling", "")
-	d.logDeploy(deployID, fmt.Sprintf("Blue-green: pulling image %s:%s", project.Image, imageTag), "info")
-
-	authConfig, err := d.buildRegistryAuth(project)
-	if err != nil {
-		return "", 0, "", fmt.Errorf("build registry auth: %w", err)
-	}
-
-	if err := d.docker.PullImage(ctx, project.Image, imageTag, authConfig); err != nil {
-		return "", 0, "", fmt.Errorf("pull image: %w", err)
-	}
-	d.logDeploy(deployID, "Image pulled successfully", "info")
-
-	// Step 2: Ensure network.
-	networkID, err := d.docker.EnsureNetwork(ctx, settings.Network)
-	if err != nil {
-		return "", 0, "", fmt.Errorf("ensure network: %w", err)
-	}
-
-	// Step 3: Create and start green container.
-	if err := d.store.UpdateDeployStatus(deployID, "starting", ""); err != nil {
-		slog.Warn("update deploy status", "error", err)
-	}
-	d.publishDeployStatus(deployID, project.ID, stage.ID, imageTag, "starting", "")
-
-	instanceID := uuid.New().String()
-	subdomain := d.buildSubdomain(project, stage, settings, imageTag)
-	containerName := docker.ContainerName(project.Name, stage.Name, imageTag)
-	portStr := fmt.Sprintf("%d/tcp", project.Port)
-	envVars := d.mergeEnvVars(project, stage.ID)
-	mounts := d.computeVolumeMounts(project.ID, stage.Name, imageTag)
-
-	containerCfg := docker.ContainerConfig{
-		Name:         containerName,
-		Image:        project.Image + ":" + imageTag,
-		Env:          envVars,
-		ExposedPorts: []string{portStr},
-		NetworkName:  settings.Network,
-		NetworkID:    networkID,
-		Project:      project.Name,
-		Stage:        stage.Name,
-		InstanceID:   instanceID,
-		Mounts:       mounts,
-	}
-
-	d.logDeploy(deployID, fmt.Sprintf("Blue-green: creating green container %s", containerName), "info")
-	containerID, err := d.docker.CreateContainer(ctx, containerCfg)
-	if err != nil {
-		return "", 0, instanceID, fmt.Errorf("create container: %w", err)
-	}
-
-	// Create instance record.
-	inst, err := d.store.CreateInstanceWithID(store.Instance{
-		ID:          instanceID,
-		StageID:     stage.ID,
-		ProjectID:   project.ID,
-		ContainerID: containerID,
-		ImageTag:    imageTag,
-		Subdomain:   subdomain,
-		Status:      "stopped",
-		Port:        project.Port,
-	})
-	if err != nil {
-		return containerID, 0, instanceID, fmt.Errorf("create instance record: %w", err)
-	}
-	instanceID = inst.ID
-
-	if err := d.store.SetDeployInstanceID(deployID, instanceID); err != nil {
-		slog.Warn("link deploy to instance", "error", err)
-	}
-
-	d.logDeploy(deployID, fmt.Sprintf("Blue-green: starting green container %s", containerName), "info")
-	if err := d.docker.StartContainer(ctx, containerID); err != nil {
-		return containerID, 0, instanceID, fmt.Errorf("start container: %w", err)
-	}
-
-	if err := d.store.UpdateInstanceStatus(instanceID, "running"); err != nil {
-		slog.Warn("update instance status", "error", err)
-	}
-	d.publishInstanceStatus(instanceID, project.ID, stage.ID, "running")
-
-	// Step 4: Health check the green container.
-	if project.Healthcheck != "" {
-		if err := d.store.UpdateDeployStatus(deployID, "health_checking", ""); err != nil {
-			slog.Warn("update deploy status", "error", err)
-		}
-		d.publishDeployStatus(deployID, project.ID, stage.ID, imageTag, "health_checking", "")
-
-		healthURL := fmt.Sprintf("http://%s:%d%s", containerName, project.Port, project.Healthcheck)
-		d.logDeploy(deployID, fmt.Sprintf("Blue-green: health checking green at %s", healthURL), "info")
-
-		if err := d.health.Check(ctx, healthURL); err != nil {
-			return containerID, 0, instanceID, fmt.Errorf("health check green: %w", err)
-		}
-		d.logDeploy(deployID, "Blue-green: green health check passed", "info")
-	}
-
-	// Step 5: Swap NPM proxy to green.
-	if err := d.store.UpdateDeployStatus(deployID, "configuring_proxy", ""); err != nil {
-		slog.Warn("update deploy status", "error", err)
-	}
-	d.publishDeployStatus(deployID, project.ID, stage.ID, imageTag, "configuring_proxy", "")
-
-	npmProxyID, err := d.configureProxy(ctx, deployID, settings, containerName, project.Port, subdomain)
-	if err != nil {
-		return containerID, 0, instanceID, fmt.Errorf("configure proxy: %w", err)
-	}
-
-	inst.NpmProxyID = npmProxyID
-	inst.Subdomain = subdomain
-	if err := d.store.UpdateInstance(inst); err != nil {
-		slog.Warn("update instance with proxy ID", "error", err)
-	}
-
-	d.logDeploy(deployID, "Blue-green: proxy swapped to green container", "info")
-
-	// Step 6: Stop the blue container.
-	if blueInstance != nil {
-		d.logDeploy(deployID, fmt.Sprintf("Blue-green: stopping blue instance %s (tag: %s)", blueInstance.ID, blueInstance.ImageTag), "info")
-		if err := d.removeInstance(ctx, *blueInstance, settings); err != nil {
-			// Non-fatal: log but continue. Green is already serving traffic.
-			d.logDeploy(deployID, fmt.Sprintf("Blue-green: warning: failed to remove blue instance: %v", err), "warn")
-		} else {
-			d.logDeploy(deployID, "Blue-green: blue instance removed", "info")
-		}
-	}
-
-	return containerID, npmProxyID, instanceID, nil
-}
--- a/Show More
+++ b/Show More