Compare commits
28 Commits
ea55d31177
..
main
| Author | SHA1 | Date | |
|---|---|---|---|
| 7641c0de12 | |||
| 1c47030854 | |||
| 8a5f69af87 | |||
| 7733e64b08 | |||
| 5b51bbbd7f | |||
| e3d140c57a | |||
| 0c4c338bfe | |||
| c8e71a0c34 | |||
| 6492944c8f | |||
| c2ca6c0b73 | |||
| ec8c0cd891 | |||
| 192204a51c | |||
| 6b45ed62bb | |||
| 2ba49b9bb6 | |||
| 00503b4c0a | |||
| 97f338fba3 | |||
| 15e5b186cd | |||
| fa6d5bd3ba | |||
| bd7a11d4e7 | |||
| 7576f54e76 | |||
| 2e26f555c5 | |||
| cdb9fd57d1 | |||
| 5c17885197 | |||
| 93b6911b34 | |||
| 3071cda512 | |||
| 410a131cec | |||
| 956943edbb | |||
| 279f373f80 |
+45
-7
@@ -1,9 +1,47 @@
|
||||
# VCS / tooling
|
||||
.git
|
||||
node_modules
|
||||
web/node_modules
|
||||
web/build
|
||||
data
|
||||
*.md
|
||||
plans/
|
||||
.claude/
|
||||
.gitignore
|
||||
.dockerignore
|
||||
.gitea/
|
||||
.github/
|
||||
.claude/
|
||||
.code-review-graph/
|
||||
.vex.toml
|
||||
.facts-sync.json
|
||||
.facts-suggestions.md
|
||||
|
||||
# Node / frontend build artifacts (frontend stage rebuilds web/build)
|
||||
node_modules/
|
||||
web/node_modules/
|
||||
web/build/
|
||||
web/.svelte-kit/
|
||||
|
||||
# Runtime / local data
|
||||
data/
|
||||
.env
|
||||
.env.*
|
||||
*.log
|
||||
|
||||
# Compiled binaries (rebuilt inside the image)
|
||||
tinyforge
|
||||
tinyforge.exe
|
||||
tinyforge-server.exe
|
||||
server.exe
|
||||
docker-watcher
|
||||
docker-watcher.exe
|
||||
docker-watcher.exe~
|
||||
/cli
|
||||
/cli.exe
|
||||
|
||||
# Build/orchestration files not needed inside the image
|
||||
Dockerfile
|
||||
docker-compose.yml
|
||||
Makefile
|
||||
*.example.yaml
|
||||
|
||||
# Docs / planning / design (not needed at runtime)
|
||||
*.md
|
||||
docs/
|
||||
plans/
|
||||
design-mockups/
|
||||
test-data/
|
||||
|
||||
@@ -0,0 +1,27 @@
|
||||
# Facts Repo Suggestions
|
||||
|
||||
Pending suggestions to push back to claude-code-facts.
|
||||
|
||||
---
|
||||
|
||||
## 2026-06-21: Buildx + registry buildcache DOES work on the TrueNAS Gitea runner
|
||||
|
||||
**Target file:** gitea-python-ci-cd.md
|
||||
**Section:** "## 7. Docker Build" and "## 9. Gitea vs GitHub Actions Differences"
|
||||
**Reason:** The doc's compatibility table says "Docker Buildx — May not work (runner networking)" and the Docker section uses plain `docker build` + `docker push --all-tags`. In practice, `docker/setup-buildx-action@v3` + `docker/build-push-action@v5` with `cache-from/to: type=registry,ref=$REGISTRY:buildcache,mode=max` (and `type=gha` for no-push CI builds) works on the current `git.dolgolyov-family.by` runner — verified in the notify-bridge and tiny-forge pipelines. Recommend adding a "buildx path (preferred when it works)" variant alongside the conservative plain-`docker build` path, and softening the row to "Usually works; falls back to plain `docker build`."
|
||||
|
||||
---
|
||||
|
||||
## 2026-06-21: Quote `if:` expressions that contain a colon
|
||||
|
||||
**Target file:** gitea-python-ci-cd.md
|
||||
**Section:** "## 9. Gitea vs GitHub Actions Differences" (or a new "Workflow gotchas")
|
||||
**Reason:** A common skip-guard `if: ${{ !startsWith(gitea.event.head_commit.message, 'chore: release v') }}` contains `: ` inside the literal, which makes strict YAML parsers (PyYAML, and validators) treat it as a nested mapping and error with "mapping values are not allowed here". Gitea's parser is lenient and accepts the unquoted form, but it fails any standard YAML lint. Fix: wrap the whole expression in double quotes — `if: "${{ ... 'chore: release v' ... }}"`.
|
||||
|
||||
---
|
||||
|
||||
## 2026-06-21: Add a "Go on Gitea" CI/CD note
|
||||
|
||||
**Target file:** gitea-python-ci-cd.md (or a new gitea-go-ci-cd.md)
|
||||
**Section:** new
|
||||
**Reason:** The doc is Python-only. The same release/Docker patterns apply to Go services with these deltas: pin `setup-go` to match the `go` directive in `go.mod` (a mismatch silently triggers a slow `GOTOOLCHAIN=auto` toolchain download); gate on `go vet ./...` + `go test ./internal/...`; multi-stage Dockerfile with `--mount=type=cache,target=/go/pkg/mod` and `target=/root/.cache/go-build` (requires `# syntax=docker/dockerfile:1.7`); `CGO_ENABLED=0 -ldflags="-s -w"` static binary on an `alpine` runtime with a non-root user and a `wget --spider` HEALTHCHECK.
|
||||
+54
-18
@@ -5,34 +5,70 @@ on:
|
||||
branches: [main]
|
||||
pull_request:
|
||||
branches: [main]
|
||||
workflow_dispatch:
|
||||
|
||||
jobs:
|
||||
build:
|
||||
frontend:
|
||||
# Skip the build on release-bump commits — the tag push runs release.yml.
|
||||
if: "${{ !startsWith(gitea.event.head_commit.message, 'chore: release v') }}"
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- uses: actions/setup-node@v4
|
||||
with:
|
||||
node-version: '20'
|
||||
cache: npm
|
||||
cache-dependency-path: web/package-lock.json
|
||||
|
||||
- name: Install frontend dependencies
|
||||
working-directory: web
|
||||
run: npm ci --no-audit
|
||||
|
||||
- name: Svelte check
|
||||
working-directory: web
|
||||
run: npm run check
|
||||
|
||||
- name: Unit tests (vitest)
|
||||
working-directory: web
|
||||
run: npm run test
|
||||
|
||||
- name: Build frontend
|
||||
working-directory: web
|
||||
run: npm run build
|
||||
|
||||
backend:
|
||||
if: "${{ !startsWith(gitea.event.head_commit.message, 'chore: release v') }}"
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- uses: actions/setup-go@v5
|
||||
with:
|
||||
go-version: '1.24'
|
||||
|
||||
- uses: actions/setup-node@v4
|
||||
with:
|
||||
node-version: '20'
|
||||
|
||||
- name: Install frontend dependencies
|
||||
working-directory: web
|
||||
run: npm ci --no-audit
|
||||
|
||||
- name: Build frontend
|
||||
working-directory: web
|
||||
run: npm run build
|
||||
go-version: '1.25'
|
||||
cache-dependency-path: go.sum
|
||||
|
||||
- name: Vet Go code
|
||||
run: go vet ./...
|
||||
|
||||
- name: Build Go binary
|
||||
run: CGO_ENABLED=0 go build -ldflags="-s -w" -o tinyforge ./cmd/server
|
||||
- name: Run Go tests
|
||||
run: go test ./internal/... -count=1
|
||||
|
||||
- name: Build Docker image
|
||||
run: docker build -t tinyforge:dev .
|
||||
build-image:
|
||||
if: "${{ !startsWith(gitea.event.head_commit.message, 'chore: release v') }}"
|
||||
needs: [frontend, backend]
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v3
|
||||
|
||||
- name: Build Docker image (no push)
|
||||
uses: docker/build-push-action@v5
|
||||
with:
|
||||
context: .
|
||||
push: false
|
||||
tags: tinyforge:ci-${{ gitea.sha }}
|
||||
cache-from: type=gha
|
||||
cache-to: type=gha,mode=max
|
||||
|
||||
+134
-69
@@ -10,19 +10,109 @@ env:
|
||||
REGISTRY: git.dolgolyov-family.by/alexei.dolgolyov/tiny-forge
|
||||
|
||||
jobs:
|
||||
create-release:
|
||||
# ───────────────────────────────────────────────────────────────────────
|
||||
# Gate the release on a passing test suite. A tagged release must never
|
||||
# ship code that fails `go vet` / `go test`.
|
||||
# ───────────────────────────────────────────────────────────────────────
|
||||
test:
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
release_id: ${{ steps.create.outputs.release_id }}
|
||||
steps:
|
||||
- name: Fetch RELEASE_NOTES.md only
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- uses: actions/setup-go@v5
|
||||
with:
|
||||
go-version: '1.25'
|
||||
cache-dependency-path: go.sum
|
||||
|
||||
- name: Vet Go code
|
||||
run: go vet ./...
|
||||
|
||||
- name: Run Go tests
|
||||
run: go test ./internal/... -count=1
|
||||
|
||||
# ───────────────────────────────────────────────────────────────────────
|
||||
# Build + push the image FIRST. If this fails, no release is created
|
||||
# (create-release depends on it) — so we never leave an orphan release
|
||||
# pointing at a tag with no published image.
|
||||
# ───────────────────────────────────────────────────────────────────────
|
||||
build-docker:
|
||||
needs: test
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- name: Compute tags
|
||||
id: meta
|
||||
run: |
|
||||
TAG="${{ gitea.ref_name }}"
|
||||
VERSION="${TAG#v}"
|
||||
echo "tag=$TAG" >> "$GITHUB_OUTPUT"
|
||||
echo "version=$VERSION" >> "$GITHUB_OUTPUT"
|
||||
# Detect pre-release (alpha/beta/rc) — these do NOT get :latest.
|
||||
if echo "$TAG" | grep -qE '(alpha|beta|rc)'; then
|
||||
echo "is_pre=true" >> "$GITHUB_OUTPUT"
|
||||
else
|
||||
echo "is_pre=false" >> "$GITHUB_OUTPUT"
|
||||
fi
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v3
|
||||
|
||||
- name: Login to Gitea Container Registry
|
||||
uses: docker/login-action@v3
|
||||
with:
|
||||
registry: ${{ env.SERVER_HOST }}
|
||||
username: ${{ gitea.actor }}
|
||||
password: ${{ secrets.DEPLOY_TOKEN }}
|
||||
|
||||
- name: Build and push image
|
||||
uses: docker/build-push-action@v5
|
||||
with:
|
||||
context: .
|
||||
push: true
|
||||
tags: |
|
||||
${{ env.REGISTRY }}:${{ steps.meta.outputs.tag }}
|
||||
${{ env.REGISTRY }}:${{ steps.meta.outputs.version }}
|
||||
${{ env.REGISTRY }}:sha-${{ gitea.sha }}
|
||||
${{ steps.meta.outputs.is_pre == 'false' && format('{0}:latest', env.REGISTRY) || '' }}
|
||||
cache-from: type=registry,ref=${{ env.REGISTRY }}:buildcache
|
||||
cache-to: type=registry,ref=${{ env.REGISTRY }}:buildcache,mode=max
|
||||
|
||||
- name: Trigger redeploy webhook
|
||||
if: steps.meta.outputs.is_pre == 'false'
|
||||
continue-on-error: true
|
||||
run: |
|
||||
if [ -n "${{ secrets.DOCKER_REDEPLOY_WEBHOOK_URL }}" ]; then
|
||||
echo "Triggering redeploy webhook..."
|
||||
curl -sf -X POST "${{ secrets.DOCKER_REDEPLOY_WEBHOOK_URL }}" \
|
||||
--max-time 30 || echo "::warning::Redeploy webhook failed"
|
||||
else
|
||||
echo "DOCKER_REDEPLOY_WEBHOOK_URL not set — skipping auto-deploy"
|
||||
fi
|
||||
|
||||
# ───────────────────────────────────────────────────────────────────────
|
||||
# Create the Gitea release LAST — body = RELEASE_NOTES.md + auto-changelog.
|
||||
# ───────────────────────────────────────────────────────────────────────
|
||||
create-release:
|
||||
needs: build-docker
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout (full history for changelog)
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
sparse-checkout: RELEASE_NOTES.md
|
||||
sparse-checkout-cone-mode: false
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Generate changelog
|
||||
id: changelog
|
||||
run: |
|
||||
PREV_TAG=$(git tag --sort=-v:refname | head -2 | tail -1)
|
||||
if [ -z "$PREV_TAG" ] || [ "$PREV_TAG" = "${{ gitea.ref_name }}" ]; then
|
||||
git log --oneline --no-decorate -n 20 > /tmp/changelog.txt
|
||||
else
|
||||
git log --oneline --no-decorate "${PREV_TAG}..HEAD" > /tmp/changelog.txt
|
||||
fi
|
||||
|
||||
- name: Create Gitea release
|
||||
id: create
|
||||
env:
|
||||
DEPLOY_TOKEN: ${{ secrets.DEPLOY_TOKEN }}
|
||||
run: |
|
||||
@@ -42,74 +132,49 @@ jobs:
|
||||
echo "Found RELEASE_NOTES.md"
|
||||
else
|
||||
export RELEASE_NOTES=""
|
||||
echo "No RELEASE_NOTES.md found — release will have no body"
|
||||
echo "No RELEASE_NOTES.md found — release body = changelog only"
|
||||
fi
|
||||
|
||||
BODY_JSON=$(python3 -c "
|
||||
# Build release body (notes + changelog) via Python to avoid shell
|
||||
# escaping and CLI length limits.
|
||||
export TAG VERSION IS_PRE
|
||||
python3 <<'PY'
|
||||
import json, os
|
||||
notes = os.environ.get('RELEASE_NOTES', '')
|
||||
print(json.dumps(notes.strip()))
|
||||
")
|
||||
|
||||
# Create release via Gitea API
|
||||
RELEASE=$(curl -s -X POST "$BASE_URL/releases" \
|
||||
notes = os.environ.get('RELEASE_NOTES', '')
|
||||
changelog = open('/tmp/changelog.txt').read().strip()
|
||||
|
||||
sections = []
|
||||
if notes.strip():
|
||||
sections.append(notes.strip())
|
||||
if changelog:
|
||||
sections.append('## Changelog\n\n' + changelog)
|
||||
|
||||
payload = {
|
||||
'tag_name': os.environ['TAG'],
|
||||
'name': os.environ['VERSION'],
|
||||
'body': '\n\n'.join(sections),
|
||||
'draft': False,
|
||||
'prerelease': os.environ['IS_PRE'] == 'true',
|
||||
}
|
||||
with open('/tmp/release-payload.json', 'w') as f:
|
||||
json.dump(payload, f)
|
||||
PY
|
||||
|
||||
HTTP=$(curl -s -o /tmp/release-resp.json -w "%{http_code}" \
|
||||
-X POST "$BASE_URL/releases" \
|
||||
-H "Authorization: token $DEPLOY_TOKEN" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d "{
|
||||
\"tag_name\": \"$TAG\",
|
||||
\"name\": \"$VERSION\",
|
||||
\"body\": $BODY_JSON,
|
||||
\"draft\": false,
|
||||
\"prerelease\": $IS_PRE
|
||||
}")
|
||||
--data-binary @/tmp/release-payload.json)
|
||||
|
||||
# Fallback: if release already exists for this tag, reuse it
|
||||
RELEASE_ID=$(echo "$RELEASE" | python3 -c "import sys,json; print(json.load(sys.stdin)['id'])" 2>/dev/null)
|
||||
if [ -z "$RELEASE_ID" ]; then
|
||||
echo "::warning::Release already exists for tag $TAG — reusing existing release"
|
||||
RELEASE=$(curl -s "$BASE_URL/releases/tags/$TAG" \
|
||||
-H "Authorization: token $DEPLOY_TOKEN")
|
||||
RELEASE_ID=$(echo "$RELEASE" | python3 -c "import sys,json; print(json.load(sys.stdin)['id'])")
|
||||
fi
|
||||
echo "release_id=$RELEASE_ID" >> "$GITHUB_OUTPUT"
|
||||
echo "Created release $RELEASE_ID for $TAG"
|
||||
|
||||
build-docker:
|
||||
needs: create-release
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- name: Login to Gitea Container Registry
|
||||
id: docker-login
|
||||
continue-on-error: true
|
||||
run: |
|
||||
echo "${{ secrets.DEPLOY_TOKEN }}" | docker login \
|
||||
"$SERVER_HOST" -u "${{ gitea.actor }}" --password-stdin
|
||||
|
||||
- name: Build and tag
|
||||
if: steps.docker-login.outcome == 'success'
|
||||
run: |
|
||||
TAG="${{ gitea.ref_name }}"
|
||||
VERSION="${TAG#v}"
|
||||
docker build -t "$REGISTRY:$TAG" -t "$REGISTRY:$VERSION" .
|
||||
# Tag as 'latest' only for stable releases
|
||||
if ! echo "$TAG" | grep -qE '(alpha|beta|rc)'; then
|
||||
docker tag "$REGISTRY:$TAG" "$REGISTRY:latest"
|
||||
fi
|
||||
|
||||
- name: Push
|
||||
if: steps.docker-login.outcome == 'success'
|
||||
run: docker push "$REGISTRY" --all-tags
|
||||
|
||||
- name: Trigger Portainer redeploy
|
||||
if: steps.docker-login.outcome == 'success'
|
||||
continue-on-error: true
|
||||
run: |
|
||||
if [ -n "${{ secrets.DOCKER_REDEPLOY_WEBHOOK_URL }}" ]; then
|
||||
echo "Triggering Portainer redeploy..."
|
||||
curl -sf -X POST "${{ secrets.DOCKER_REDEPLOY_WEBHOOK_URL }}" \
|
||||
--max-time 30 || echo "::warning::Portainer webhook failed"
|
||||
echo "POST /releases → HTTP $HTTP"
|
||||
if [ "$HTTP" = "201" ]; then
|
||||
RELEASE_ID=$(python3 -c "import json; print(json.load(open('/tmp/release-resp.json'))['id'])")
|
||||
echo "Created release $RELEASE_ID for $TAG"
|
||||
elif [ "$HTTP" = "409" ] || grep -q "already exists" /tmp/release-resp.json; then
|
||||
echo "::warning::Release already exists for tag $TAG — reusing"
|
||||
else
|
||||
echo "DOCKER_REDEPLOY_WEBHOOK_URL not set — skipping auto-deploy"
|
||||
echo "::error::Failed to create release for $TAG (HTTP $HTTP)"
|
||||
head -c 2000 /tmp/release-resp.json; echo
|
||||
exit 1
|
||||
fi
|
||||
|
||||
@@ -6,7 +6,10 @@ data/
|
||||
.env
|
||||
tinyforge
|
||||
tinyforge.exe
|
||||
/cli
|
||||
/cli.exe
|
||||
server.exe
|
||||
tinyforge-server.exe
|
||||
docker-watcher
|
||||
docker-watcher.exe
|
||||
docker-watcher.exe~
|
||||
|
||||
@@ -0,0 +1,57 @@
|
||||
# vex configuration — https://github.com/tenatarika/vex
|
||||
#
|
||||
# Place this file in your project root as .vex.toml
|
||||
|
||||
# Glob patterns to exclude from indexing (gitignore syntax, on top of .gitignore)
|
||||
# exclude = [
|
||||
# "vendor/**",
|
||||
# "node_modules/**",
|
||||
# "*.generated.go",
|
||||
# "dist/**",
|
||||
# ]
|
||||
|
||||
# Default output format: "text", "json", or "compact"
|
||||
# format = "text"
|
||||
|
||||
# Enable semantic embeddings by default (slower indexing, enables meaning-based search)
|
||||
semantic = true
|
||||
|
||||
# Automatically run `vex update` before search if the index is stale
|
||||
auto_update = true
|
||||
|
||||
# Embedder used for semantic indexing. Known IDs: minilm-l6-v2 (default).
|
||||
# Changing the embedder requires a full reindex.
|
||||
# embedder = "minilm-l6-v2"
|
||||
|
||||
# Cache directory override. Defaults to the platform cache location.
|
||||
# macOS: ~/Library/Caches/vex
|
||||
# Linux: $XDG_CACHE_HOME/vex (fallback: ~/.cache/vex)
|
||||
# Windows: %LOCALAPPDATA%\vex (fallback: %USERPROFILE%\AppData\Local\vex)
|
||||
# Accepts absolute paths, "~/..." or paths relative to this file (e.g. "./.vex/cache").
|
||||
# Can also be overridden per-invocation with --cache-dir or $VEX_CACHE_DIR.
|
||||
# cache_dir = "./.vex/cache"
|
||||
|
||||
# Store the index inside the project as `<project>/.vex_cache/`. Useful when
|
||||
# the cache should travel with the project (e.g. on a moved or renamed
|
||||
# directory). vex writes a `.gitignore` inside it so contents are not
|
||||
# committed. Overridden by `cache_dir`, `--cache-dir`, or $VEX_CACHE_DIR.
|
||||
# local_cache = false
|
||||
|
||||
# Thread count for parallel indexing (index/update/watch).
|
||||
# * unset — 80% of available cores, rounded up (default, leaves headroom)
|
||||
# * 0 — use all cores (explicit opt-in to max throughput)
|
||||
# * N — exactly N workers
|
||||
# Overridable per-invocation with `-j/--jobs` or $VEX_JOBS.
|
||||
# jobs = 4
|
||||
|
||||
# Build the persistent call-graph section. Disabling falls back to live-scan
|
||||
# for `vex callers`/`vex callees` (slower per-query, but saves indexing
|
||||
# time on large monorepos). The opt-out is persisted in the manifest so
|
||||
# `vex update` does not silently re-add the section.
|
||||
# Per-invocation override: `vex index --no-call-graph`.
|
||||
# call_graph = true
|
||||
|
||||
# Build the BM25 channel. Disabling drops the third RRF channel and keeps
|
||||
# only structural (+ semantic). Same persistence rules as `call_graph`.
|
||||
# Per-invocation override: `vex index --no-bm25`.
|
||||
# bm25 = true
|
||||
@@ -12,3 +12,33 @@ Start/restart with: `./scripts/dev-server.sh`
|
||||
## Frontend
|
||||
|
||||
- **Boolean inputs use `ToggleSwitch`** (`$lib/components/ToggleSwitch.svelte`) — the slide-style switch is the unified control across the WebUI. Do not introduce raw `<input type="checkbox">` elements; place a `<ToggleSwitch>` next to a label/help block instead.
|
||||
- **Confirmations & destructive actions use `ConfirmDialog`** (`$lib/components/ConfirmDialog.svelte`) — never native `window.confirm` / `alert`. For navigation guards (e.g. the unsaved-changes prompt on `/apps/new`), `cancel()` the navigation in `beforeNavigate`, open `ConfirmDialog`, and re-issue the navigation with a bypass flag on confirm. Native `beforeunload` is acceptable only for hard tab-close/reload, where the browser forbids custom UI.
|
||||
- **Source-config shape: `$lib/workload/sourceForms.ts`** is the single source of truth (seed/serialize/validity for image/compose/static/dockerfile), consumed by both `/apps/new` and `/apps/[id]`. Don't re-inline seed/serialize logic.
|
||||
- **"App" = workload with `source_kind !== ''`.** Triggers are first-class bindings (`workload_trigger_bindings`), NOT on the workload row — never gate app lists/counts on `trigger_kind` (it's empty for plugin workloads). Legacy pre-cutover `kind:project/stack/site` rows have an empty `source_kind` and must be excluded everywhere.
|
||||
- **i18n parity is mandatory** — every key in BOTH `web/src/lib/i18n/{en,ru}.json`. A missing key is NOT a build error (`$t` returns the key string), so verify parity manually.
|
||||
|
||||
## Backend
|
||||
|
||||
- **Per-workload deploy lock.** Every deploy entrypoint (API deploy, rollback, promote,
|
||||
generic-hooks, webhook trigger dispatch) funnels through `deployer.DispatchPlugin`, which
|
||||
holds a per-workload `keyedmutex` lock (`internal/keyedmutex`) for the whole dispatch;
|
||||
`DispatchTeardown` takes it too. This serializes all container/volume mutation per workload.
|
||||
Do NOT add a deploy/teardown path that bypasses `DispatchPlugin`. Operations that must run
|
||||
a deploy *while already holding* the lock (volume-snapshot restore) use
|
||||
`Deployer.LockWorkload` + `RedeployLocked` (the unlocked dispatch) — calling `DispatchPlugin`
|
||||
under the held lock would deadlock (Go mutexes are not reentrant). `activeWg` is a global
|
||||
drain barrier for shutdown, NOT a per-workload lock.
|
||||
- **Volume snapshot restore** lives in `volsnap.Engine.Restore` (engine-owned, not the API
|
||||
handler): preflight re-resolves volumes from the workload's CURRENT config (never the
|
||||
snapshot manifest — that's tamper-influenceable) → lock → stop → extract-to-tmp →
|
||||
pre-restore snapshot → journal → atomic rename swap → redeploy. A startup
|
||||
`RecoverInterruptedRestores` sweep replays the journal after a crash; it MUST be wired (with
|
||||
`SetLifecycle`) before the API serves. The archive extractor treats the tar as untrusted
|
||||
(zip-slip/type-allowlist/bomb-cap); the endpoint requires an `X-Confirm-Restore: <sid>`
|
||||
header (CSRF), like the DB restore.
|
||||
|
||||
## Build & Test
|
||||
|
||||
- Frontend (from `web/`): `npm run check` (svelte-check — expect 0 errors), `npm run build`, `npm run test` (vitest; pure-logic units like `sourceForms.test.ts`).
|
||||
- Backend (repo root): `go build ./...`, `go vet ./internal/...`, `go test ./internal/...`.
|
||||
- `./scripts/dev-server.sh` rebuilds the SPA + restarts the Go server on :8090; it kills the prior process, so a previous background dev-server task reporting **exit 1 is expected**, not a failure.
|
||||
|
||||
+19
-4
@@ -1,3 +1,4 @@
|
||||
# syntax=docker/dockerfile:1.7
|
||||
# Stage 1: Build frontend
|
||||
FROM node:20-alpine AS frontend-builder
|
||||
|
||||
@@ -9,25 +10,33 @@ COPY web/ ./
|
||||
RUN npm run build
|
||||
|
||||
# Stage 2: Build Go binary
|
||||
FROM golang:1.24-alpine AS backend-builder
|
||||
FROM golang:1.25-alpine AS backend-builder
|
||||
|
||||
RUN apk add --no-cache git ca-certificates
|
||||
|
||||
WORKDIR /build
|
||||
COPY go.mod go.sum ./
|
||||
ENV GOTOOLCHAIN=auto
|
||||
RUN go mod download
|
||||
# Cache mounts persist the module + build caches across rebuilds (BuildKit).
|
||||
RUN --mount=type=cache,target=/go/pkg/mod \
|
||||
go mod download
|
||||
|
||||
COPY . .
|
||||
# Copy built frontend into the expected embed location.
|
||||
COPY --from=frontend-builder /build/web/build ./web/build
|
||||
|
||||
RUN CGO_ENABLED=0 GOOS=linux go build -ldflags="-s -w" -o /tinyforge ./cmd/server
|
||||
RUN --mount=type=cache,target=/go/pkg/mod \
|
||||
--mount=type=cache,target=/root/.cache/go-build \
|
||||
CGO_ENABLED=0 GOOS=linux go build -ldflags="-s -w" -o /tinyforge ./cmd/server
|
||||
|
||||
# Stage 3: Minimal runtime image
|
||||
FROM alpine:3.19
|
||||
|
||||
RUN apk add --no-cache ca-certificates tzdata
|
||||
LABEL org.opencontainers.image.source="https://git.dolgolyov-family.by/alexei.dolgolyov/tiny-forge"
|
||||
LABEL org.opencontainers.image.title="Tinyforge"
|
||||
LABEL org.opencontainers.image.description="Self-hosted Docker deployment + mini-CI platform"
|
||||
|
||||
RUN apk add --no-cache ca-certificates tzdata wget
|
||||
|
||||
# Create non-root user.
|
||||
RUN addgroup -g 1000 -S app && adduser -u 1000 -S app -G app
|
||||
@@ -46,4 +55,10 @@ EXPOSE 8080
|
||||
ENV DATA_DIR=/app/data
|
||||
ENV LISTEN_ADDR=:8080
|
||||
|
||||
VOLUME /app/data
|
||||
|
||||
# /readyz is the public readiness probe (pings the DB); /livez is liveness.
|
||||
HEALTHCHECK --interval=30s --timeout=5s --retries=3 --start-period=10s \
|
||||
CMD wget --no-verbose --tries=1 --spider http://localhost:8080/readyz || exit 1
|
||||
|
||||
ENTRYPOINT ["/app/tinyforge"]
|
||||
|
||||
@@ -11,6 +11,15 @@ Self-hosted deployment platform with a web dashboard. Deploy Docker containers f
|
||||
- **Multi-stage projects** (dev, staging, prod) with tag pattern matching
|
||||
- **Real-time deploy logs** via SSE streaming
|
||||
|
||||
### Branch Preview Environments
|
||||
|
||||
Get an isolated, throwaway deploy for every feature branch:
|
||||
|
||||
- Add a **branch pattern** (e.g. `feat/*`) to a workload's **git trigger** (Triggers panel → git trigger → *Branch pattern*).
|
||||
- Pushing to any branch matching the pattern deploys an **isolated per-branch preview** — a child workload that inherits the source config, served at a **slug-prefixed subdomain** (`feat-login-app.example.com`) so previews never collide with each other or the main deploy.
|
||||
- Previews are **automatically torn down** when the branch is deleted upstream.
|
||||
- Manage live previews from the app's **Preview environments** panel (`/apps/[id]`): open each branch's URL or tear it down manually. A torn-down preview is recreated on the next push to its branch.
|
||||
|
||||
### Static Sites
|
||||
|
||||
Deploy static sites and Deno-powered APIs directly from Git repositories:
|
||||
@@ -106,6 +115,46 @@ curl -X POST https://your-domain/api/webhook/<secret> \
|
||||
3. Enter your provider's Issuer URL, Client ID, and Client Secret
|
||||
4. Set the Redirect URL to `https://your-domain/api/auth/oidc/callback`
|
||||
|
||||
## CLI
|
||||
|
||||
`tinyforge` is a terminal client for driving a server from the shell, built on the same HTTP API as the web UI.
|
||||
|
||||
### Build
|
||||
|
||||
```bash
|
||||
go build -o tinyforge ./cmd/cli # ./tinyforge (tinyforge.exe on Windows)
|
||||
```
|
||||
|
||||
### Usage
|
||||
|
||||
```bash
|
||||
# Log in once — caches a 24h token in ~/.tinyforge/config.json (mode 0600)
|
||||
tinyforge login --base-url http://localhost:8090
|
||||
# ...or non-interactively (no password echo / shell-history leak):
|
||||
TINYFORGE_PASSWORD=… tinyforge login --base-url http://localhost:8090 --user admin
|
||||
|
||||
tinyforge apps # list apps + container state
|
||||
tinyforge deploy my-app # deploy and wait for completion
|
||||
tinyforge deploy my-app --ref v1.2.3 --note "hotfix"
|
||||
tinyforge logs my-app -f # follow logs (Ctrl-C to stop)
|
||||
tinyforge status # server health + current user
|
||||
tinyforge status my-app # one app's containers
|
||||
tinyforge logout # revoke + clear the cached token
|
||||
```
|
||||
|
||||
### Server & token resolution
|
||||
|
||||
| Setting | Flag | Env | Default |
|
||||
| -------- | ------------ | ----------------- | ------------------------ |
|
||||
| Base URL | `--base-url` | `TINYFORGE_URL` | `http://localhost:8080` |
|
||||
| Token | `--token` | `TINYFORGE_TOKEN` | cached by `login` |
|
||||
| Config | `--config` | `TINYFORGE_CONFIG`| `~/.tinyforge/config.json` |
|
||||
|
||||
### Notes
|
||||
|
||||
- Login returns a **24h JWT** — there is no long-lived API token yet, so unattended scripts must log in again once the token expires. `deploy` / `stop` / `start` require an **admin** account.
|
||||
- The token is sent as an `Authorization: Bearer` header (never placed in the URL) and the config file is written with `0600` permissions.
|
||||
|
||||
## Development
|
||||
|
||||
```bash
|
||||
|
||||
+149
@@ -0,0 +1,149 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"flag"
|
||||
"fmt"
|
||||
"os"
|
||||
"sort"
|
||||
"strings"
|
||||
"text/tabwriter"
|
||||
"time"
|
||||
)
|
||||
|
||||
func runApps(args []string) error {
|
||||
// Accept an optional "list" subcommand: `tinyforge apps` == `tinyforge apps list`.
|
||||
if len(args) > 0 && args[0] == "list" {
|
||||
args = args[1:]
|
||||
}
|
||||
fs := flag.NewFlagSet("apps", flag.ExitOnError)
|
||||
g := addGlobalFlags(fs)
|
||||
fs.Usage = func() {
|
||||
fmt.Fprint(os.Stderr, "Usage: tinyforge apps [list] [--base-url URL]\n\nList apps (workloads with a source) and their container state.\n")
|
||||
fs.PrintDefaults()
|
||||
}
|
||||
if err := fs.Parse(args); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
sess, err := newSession(g)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
|
||||
defer cancel()
|
||||
|
||||
var workloads []Workload
|
||||
if err := sess.client.doJSON(ctx, "GET", "/api/workloads", nil, &workloads); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// One extra call fetches every container so state can be shown without an
|
||||
// N+1 per-app request.
|
||||
var containers []Container
|
||||
if err := sess.client.doJSON(ctx, "GET", "/api/containers", nil, &containers); err != nil {
|
||||
return err
|
||||
}
|
||||
byWorkload := map[string][]Container{}
|
||||
for _, c := range containers {
|
||||
byWorkload[c.WorkloadID] = append(byWorkload[c.WorkloadID], c)
|
||||
}
|
||||
|
||||
apps := make([]Workload, 0, len(workloads))
|
||||
for _, w := range workloads {
|
||||
if w.isApp() {
|
||||
apps = append(apps, w)
|
||||
}
|
||||
}
|
||||
sort.Slice(apps, func(i, j int) bool { return apps[i].Name < apps[j].Name })
|
||||
|
||||
if len(apps) == 0 {
|
||||
fmt.Println("No apps yet. Create one in the web UI, then deploy with 'tinyforge deploy <app>'.")
|
||||
return nil
|
||||
}
|
||||
|
||||
tw := tabwriter.NewWriter(os.Stdout, 0, 2, 2, ' ', 0)
|
||||
fmt.Fprintln(tw, "NAME\tSOURCE\tSTATE\tID")
|
||||
for _, w := range apps {
|
||||
fmt.Fprintf(tw, "%s\t%s\t%s\t%s\n", w.Name, w.SourceKind, stateSummary(byWorkload[w.ID]), idShort(w.ID))
|
||||
}
|
||||
return tw.Flush()
|
||||
}
|
||||
|
||||
// stateSummary condenses a workload's containers into one status word.
|
||||
func stateSummary(cs []Container) string {
|
||||
if len(cs) == 0 {
|
||||
return "—"
|
||||
}
|
||||
running := 0
|
||||
for _, c := range cs {
|
||||
if c.State == "running" {
|
||||
running++
|
||||
}
|
||||
}
|
||||
switch {
|
||||
case running == len(cs):
|
||||
return "running"
|
||||
case running == 0:
|
||||
return cs[0].State // e.g. stopped / failed / missing
|
||||
default:
|
||||
return fmt.Sprintf("%d/%d running", running, len(cs))
|
||||
}
|
||||
}
|
||||
|
||||
// resolveApp maps a user-supplied reference (name, full id, or id prefix) to a
|
||||
// single app workload. Exact id wins, then exact name, then a unique id prefix.
|
||||
func resolveApp(ctx context.Context, c *Client, ref string) (Workload, error) {
|
||||
var workloads []Workload
|
||||
if err := c.doJSON(ctx, "GET", "/api/workloads", nil, &workloads); err != nil {
|
||||
return Workload{}, err
|
||||
}
|
||||
|
||||
var byID, byName, byPrefix []Workload
|
||||
for _, w := range workloads {
|
||||
if !w.isApp() {
|
||||
continue
|
||||
}
|
||||
switch {
|
||||
case w.ID == ref:
|
||||
byID = append(byID, w)
|
||||
case strings.EqualFold(w.Name, ref):
|
||||
byName = append(byName, w)
|
||||
case len(ref) >= 6 && strings.HasPrefix(w.ID, ref):
|
||||
byPrefix = append(byPrefix, w)
|
||||
}
|
||||
}
|
||||
|
||||
if len(byID) == 1 {
|
||||
return byID[0], nil
|
||||
}
|
||||
if len(byName) == 1 {
|
||||
return byName[0], nil
|
||||
}
|
||||
if len(byName) > 1 {
|
||||
return Workload{}, ambiguousErr(ref, byName)
|
||||
}
|
||||
if len(byPrefix) == 1 {
|
||||
return byPrefix[0], nil
|
||||
}
|
||||
if len(byPrefix) > 1 {
|
||||
return Workload{}, ambiguousErr(ref, byPrefix)
|
||||
}
|
||||
return Workload{}, fmt.Errorf("no app matching %q (try 'tinyforge apps list')", ref)
|
||||
}
|
||||
|
||||
func ambiguousErr(ref string, matches []Workload) error {
|
||||
var b strings.Builder
|
||||
fmt.Fprintf(&b, "%q matches multiple apps; use the id:\n", ref)
|
||||
for _, w := range matches {
|
||||
fmt.Fprintf(&b, " %s %s\n", idShort(w.ID), w.Name)
|
||||
}
|
||||
return fmt.Errorf("%s", strings.TrimRight(b.String(), "\n"))
|
||||
}
|
||||
|
||||
// idShort abbreviates an id to its first 8 bytes for display. Ids that are
// already 8 bytes or shorter are returned unchanged.
func idShort(id string) string {
	const width = 8
	if len(id) <= width {
		return id
	}
	return id[:width]
}
|
||||
@@ -0,0 +1,232 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"bytes"
|
||||
"context"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// apiError is an error reported by the API together with the HTTP status it
// arrived with. Keeping the status lets callers recognise authentication
// failures (401) while still showing the server's own wording, e.g.
// "invalid credentials" versus "invalid or expired token".
type apiError struct {
	status int
	msg    string
}

// Error returns the message verbatim; the status is not part of the text.
func (e *apiError) Error() string {
	return e.msg
}

// isAuthError reports whether err is, or wraps, an apiError with status 401.
func isAuthError(err error) bool {
	var apiErr *apiError
	if !errors.As(err, &apiErr) {
		return false
	}
	return apiErr.status == http.StatusUnauthorized
}
|
||||
|
||||
// Client is the HTTP client for the Tinyforge API. The underlying http.Client
// deliberately has no global timeout, because synchronous deploys and followed
// log streams can run for a long time; every call takes a context and the
// caller chooses the deadline there.
type Client struct {
	baseURL string       // server root, stored without trailing slashes
	token   string       // bearer token; empty means unauthenticated
	http    *http.Client // transport used for every request
}

// newClient builds a Client for baseURL. Trailing slashes are stripped so that
// request paths, which start with "/", can be appended directly.
func newClient(baseURL, token string) *Client {
	c := new(Client)
	c.baseURL = strings.TrimRight(baseURL, "/")
	c.token = token
	c.http = &http.Client{}
	return c
}
|
||||
|
||||
// apiEnvelope mirrors the server's response wrapper. The server's struct is
// unexported, so the CLI defines its own matching shape. Data is kept as raw
// JSON and decoded later, so a single decode path serves every endpoint.
type apiEnvelope struct {
	// Success is false when the server reports a failure.
	Success bool `json:"success"`
	// Data is the endpoint-specific payload, left undecoded.
	Data json.RawMessage `json:"data"`
	// Error is the server's error message; may be empty.
	Error string `json:"error"`
}
|
||||
|
||||
// SessionToken is the data payload of POST /api/auth/login.
type SessionToken struct {
	// Token is the bearer token to send on later requests.
	Token string `json:"token"`
	// ExpiresAt is the expiry timestamp as sent by the server; the CLI tries
	// to parse it as RFC3339 when displaying it.
	ExpiresAt string `json:"expires_at"`
}
|
||||
|
||||
// User is the data payload of GET /api/auth/me, describing the account the
// current token belongs to.
type User struct {
	ID       string `json:"id"`
	Username string `json:"username"`
	Email    string `json:"email"`
	Role     string `json:"role"`
}
|
||||
|
||||
// Workload holds the fields of a workload row that the CLI uses. A workload
// counts as an "app" when its SourceKind is set.
type Workload struct {
	ID         string `json:"id"`
	Name       string `json:"name"`
	Kind       string `json:"kind"`
	AppID      string `json:"app_id"`
	SourceKind string `json:"source_kind"`
	CreatedAt  string `json:"created_at"`
}

// isApp reports whether the workload has a source, i.e. is an app.
func (w Workload) isApp() bool {
	return len(w.SourceKind) > 0
}
|
||||
|
||||
// Container is the subset of a container row the CLI needs. State is one of
// running|stopped|failed|missing|starting|created|restarting|paused|...
type Container struct {
	// ID is the row id; it is what the CLI puts in log URLs and matches
	// against the --container selector.
	ID         string `json:"id"`
	WorkloadID string `json:"workload_id"`
	// Role may be empty; the CLI then displays it as "(default)".
	Role        string `json:"role"`
	ContainerID string `json:"container_id"`
	ImageRef    string `json:"image_ref"`
	State       string `json:"state"`
	// Port is omitted from status output when zero.
	Port      int    `json:"port"`
	Subdomain string `json:"subdomain"`
	CreatedAt string `json:"created_at"`
}
|
||||
|
||||
// DeployResult is the data payload of POST /api/workloads/{id}/deploy.
type DeployResult struct {
	WorkloadID string `json:"workload_id"`
	// Reference echoes the deploy target (image tag / git ref) if any.
	Reference string `json:"reference"`
	// TriggeredBy is printed by the deploy command once it completes.
	TriggeredBy string `json:"triggered_by"`
}
|
||||
|
||||
// doJSON performs a JSON request and unwraps the response envelope. body may be
|
||||
// nil. out may be nil when the caller does not need the data payload. A 401
|
||||
// maps to errNotAuthenticated; any other non-success surfaces the server's
|
||||
// error message.
|
||||
func (c *Client) doJSON(ctx context.Context, method, path string, body, out any) error {
|
||||
var reqBody io.Reader
|
||||
if body != nil {
|
||||
buf, err := json.Marshal(body)
|
||||
if err != nil {
|
||||
return fmt.Errorf("encode request: %w", err)
|
||||
}
|
||||
reqBody = bytes.NewReader(buf)
|
||||
}
|
||||
|
||||
req, err := http.NewRequestWithContext(ctx, method, c.baseURL+path, reqBody)
|
||||
if err != nil {
|
||||
return fmt.Errorf("build request: %w", err)
|
||||
}
|
||||
if body != nil {
|
||||
req.Header.Set("Content-Type", "application/json")
|
||||
}
|
||||
c.authorize(req)
|
||||
|
||||
resp, err := c.http.Do(req)
|
||||
if err != nil {
|
||||
return fmt.Errorf("%s %s: %w", method, path, err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
raw, err := io.ReadAll(io.LimitReader(resp.Body, 8<<20))
|
||||
if err != nil {
|
||||
return fmt.Errorf("read response: %w", err)
|
||||
}
|
||||
|
||||
var env apiEnvelope
|
||||
if jsonErr := json.Unmarshal(raw, &env); jsonErr != nil {
|
||||
// Non-JSON body (e.g. a proxy error page). Surface status + a snippet,
|
||||
// preserving auth-error typing for 401s with an unparseable body.
|
||||
if resp.StatusCode >= 400 {
|
||||
return &apiError{status: resp.StatusCode, msg: fmt.Sprintf(
|
||||
"%s %s: unexpected response (status %d): %s", method, path, resp.StatusCode, snippet(raw))}
|
||||
}
|
||||
return fmt.Errorf("%s %s: decode response: %w", method, path, jsonErr)
|
||||
}
|
||||
if resp.StatusCode >= 400 || !env.Success {
|
||||
msg := env.Error
|
||||
if msg == "" {
|
||||
msg = fmt.Sprintf("%s %s: request failed (status %d)", method, path, resp.StatusCode)
|
||||
}
|
||||
return &apiError{status: resp.StatusCode, msg: msg}
|
||||
}
|
||||
if out != nil && len(env.Data) > 0 {
|
||||
if err := json.Unmarshal(env.Data, out); err != nil {
|
||||
return fmt.Errorf("decode response data: %w", err)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// authorize attaches the bearer token. Using the Authorization header (rather
|
||||
// than a ?token= query param) keeps the JWT out of server and proxy logs.
|
||||
func (c *Client) authorize(req *http.Request) {
|
||||
if c.token != "" {
|
||||
req.Header.Set("Authorization", "Bearer "+c.token)
|
||||
}
|
||||
}
|
||||
|
||||
// streamSSE opens an SSE stream and invokes onData for each `data:` payload.
|
||||
// Comment lines (heartbeats, beginning with ':') and blanks are skipped. The
|
||||
// stream ends on EOF, context cancellation, or when onData returns an error.
|
||||
func (c *Client) streamSSE(ctx context.Context, path string, onData func(payload []byte) error) error {
|
||||
req, err := http.NewRequestWithContext(ctx, http.MethodGet, c.baseURL+path, nil)
|
||||
if err != nil {
|
||||
return fmt.Errorf("build request: %w", err)
|
||||
}
|
||||
req.Header.Set("Accept", "text/event-stream")
|
||||
c.authorize(req)
|
||||
|
||||
resp, err := c.http.Do(req)
|
||||
if err != nil {
|
||||
return fmt.Errorf("GET %s: %w", path, err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
raw, _ := io.ReadAll(io.LimitReader(resp.Body, 1<<20))
|
||||
var env apiEnvelope
|
||||
msg := fmt.Sprintf("GET %s: stream failed (status %d)", path, resp.StatusCode)
|
||||
if json.Unmarshal(raw, &env) == nil && env.Error != "" {
|
||||
msg = env.Error
|
||||
}
|
||||
return &apiError{status: resp.StatusCode, msg: msg}
|
||||
}
|
||||
|
||||
scanner := bufio.NewScanner(resp.Body)
|
||||
scanner.Buffer(make([]byte, 0, 64<<10), 2<<20) // tolerate long log lines
|
||||
for scanner.Scan() {
|
||||
line := scanner.Text()
|
||||
if line == "" || strings.HasPrefix(line, ":") {
|
||||
continue // blank separator or SSE comment/heartbeat
|
||||
}
|
||||
data, ok := strings.CutPrefix(line, "data:")
|
||||
if !ok {
|
||||
continue // ignore event:/id: fields — the API uses default events
|
||||
}
|
||||
if err := onData([]byte(strings.TrimPrefix(data, " "))); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
if err := scanner.Err(); err != nil && !errors.Is(err, context.Canceled) {
|
||||
return fmt.Errorf("read stream: %w", err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// snippet returns a short, single-line view of an unexpected response body for
// use in error messages.
//
// Surrounding whitespace is trimmed and every line break (LF, CR or CRLF)
// becomes one space. Bodies longer than 200 bytes are cut and suffixed with
// "…"; the cut is moved back to a rune boundary so a multi-byte UTF-8
// character is never split. An empty result is reported as "(empty body)".
func snippet(b []byte) string {
	const limit = 200 // maximum number of body bytes kept

	s := strings.TrimSpace(string(b))
	s = strings.NewReplacer("\r\n", " ", "\n", " ", "\r", " ").Replace(s)
	if len(s) > limit {
		cut := limit
		// s[cut] is the first byte that would be dropped. If it is a UTF-8
		// continuation byte (10xxxxxx) the cut lies inside a rune, so step
		// back to the rune's first byte. A rune has at most 3 continuation
		// bytes, which also bounds the loop on malformed input.
		for back := 0; back < 3 && cut > 0 && s[cut]&0xC0 == 0x80; back++ {
			cut--
		}
		s = s[:cut] + "…"
	}
	if s == "" {
		s = "(empty body)"
	}
	return s
}
|
||||
@@ -0,0 +1,148 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"encoding/json"
|
||||
"flag"
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
)
|
||||
|
||||
// defaultBaseURL matches the server's default LISTEN_ADDR (:8080). The dev
// server runs on :8090; point at it with --base-url or $TINYFORGE_URL.
// It is the last fallback when neither the flag, the environment nor the
// saved config supplies a URL.
const defaultBaseURL = "http://localhost:8080"
|
||||
|
||||
// Config is the persisted CLI state at ~/.tinyforge/config.json.
type Config struct {
	// BaseURL is the server URL saved at login.
	BaseURL string `json:"base_url"`
	// Token is the cached bearer token; empty when logged out.
	Token string `json:"token"`
	// ExpiresAt is the token expiry as reported by the server at login.
	ExpiresAt string `json:"expires_at"`
}
|
||||
|
||||
// globals holds the cross-cutting flags every command accepts. Each field
// points at the value registered on the command's flag set, so it is only
// meaningful after that flag set has been parsed.
type globals struct {
	baseURL    *string // --base-url
	token      *string // --token
	configPath *string // --config
}
|
||||
|
||||
// addGlobalFlags registers the shared flags on a command's flag set.
|
||||
func addGlobalFlags(fs *flag.FlagSet) *globals {
|
||||
return &globals{
|
||||
baseURL: fs.String("base-url", "", "Tinyforge server URL (default $TINYFORGE_URL or "+defaultBaseURL+")"),
|
||||
token: fs.String("token", "", "auth token (default $TINYFORGE_TOKEN or cached config)"),
|
||||
configPath: fs.String("config", "", "config file path (default $TINYFORGE_CONFIG or ~/.tinyforge/config.json)"),
|
||||
}
|
||||
}
|
||||
|
||||
// configFilePath resolves the config file location with precedence:
|
||||
// --config flag > $TINYFORGE_CONFIG > ~/.tinyforge/config.json.
|
||||
func configFilePath(g *globals) (string, error) {
|
||||
if g != nil && *g.configPath != "" {
|
||||
return *g.configPath, nil
|
||||
}
|
||||
if env := os.Getenv("TINYFORGE_CONFIG"); env != "" {
|
||||
return env, nil
|
||||
}
|
||||
home, err := os.UserHomeDir()
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("locate home directory: %w", err)
|
||||
}
|
||||
return filepath.Join(home, ".tinyforge", "config.json"), nil
|
||||
}
|
||||
|
||||
// loadConfig reads the config file. A missing file yields a zero Config and no
|
||||
// error — first run is not a failure.
|
||||
func loadConfig(path string) (Config, error) {
|
||||
var cfg Config
|
||||
data, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
if os.IsNotExist(err) {
|
||||
return cfg, nil
|
||||
}
|
||||
return cfg, fmt.Errorf("read config %s: %w", path, err)
|
||||
}
|
||||
// An empty or whitespace-only file (e.g. freshly touched) is treated as
|
||||
// "no config yet" rather than a parse error.
|
||||
if len(bytes.TrimSpace(data)) == 0 {
|
||||
return cfg, nil
|
||||
}
|
||||
if err := json.Unmarshal(data, &cfg); err != nil {
|
||||
return cfg, fmt.Errorf("parse config %s: %w", path, err)
|
||||
}
|
||||
return cfg, nil
|
||||
}
|
||||
|
||||
// saveConfig writes the config file with 0600 permissions, since it holds a
|
||||
// bearer token. The parent directory is created if absent.
|
||||
func saveConfig(path string, cfg Config) error {
|
||||
if dir := filepath.Dir(path); dir != "" {
|
||||
if err := os.MkdirAll(dir, 0o700); err != nil {
|
||||
return fmt.Errorf("create config dir: %w", err)
|
||||
}
|
||||
}
|
||||
data, err := json.MarshalIndent(cfg, "", " ")
|
||||
if err != nil {
|
||||
return fmt.Errorf("encode config: %w", err)
|
||||
}
|
||||
if err := os.WriteFile(path, append(data, '\n'), 0o600); err != nil {
|
||||
return fmt.Errorf("write config %s: %w", path, err)
|
||||
}
|
||||
// os.WriteFile only applies the mode when creating the file; Chmod ensures
|
||||
// 0600 even when overwriting a pre-existing, looser-permissioned config.
|
||||
if err := os.Chmod(path, 0o600); err != nil {
|
||||
return fmt.Errorf("secure config %s: %w", path, err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// resolveBaseURL applies precedence: --base-url > $TINYFORGE_URL > config > default.
|
||||
func resolveBaseURL(g *globals, cfg Config) string {
|
||||
if g != nil && *g.baseURL != "" {
|
||||
return *g.baseURL
|
||||
}
|
||||
if env := os.Getenv("TINYFORGE_URL"); env != "" {
|
||||
return env
|
||||
}
|
||||
if cfg.BaseURL != "" {
|
||||
return cfg.BaseURL
|
||||
}
|
||||
return defaultBaseURL
|
||||
}
|
||||
|
||||
// resolveToken applies precedence: --token > $TINYFORGE_TOKEN > config.
|
||||
func resolveToken(g *globals, cfg Config) string {
|
||||
if g != nil && *g.token != "" {
|
||||
return *g.token
|
||||
}
|
||||
if env := os.Getenv("TINYFORGE_TOKEN"); env != "" {
|
||||
return env
|
||||
}
|
||||
return cfg.Token
|
||||
}
|
||||
|
||||
// session bundles the resolved client with the loaded config and its path, so
// commands can both make requests and persist updates (e.g. login).
type session struct {
	client     *Client // built from the resolved base URL and token
	cfg        Config  // config as loaded from disk
	configPath string  // where cfg was loaded from and is saved back to
}
|
||||
|
||||
// newSession loads config and builds a client with resolved base URL + token.
|
||||
func newSession(g *globals) (*session, error) {
|
||||
path, err := configFilePath(g)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
cfg, err := loadConfig(path)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return &session{
|
||||
client: newClient(resolveBaseURL(g, cfg), resolveToken(g, cfg)),
|
||||
cfg: cfg,
|
||||
configPath: path,
|
||||
}, nil
|
||||
}
|
||||
@@ -0,0 +1,73 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"flag"
|
||||
"fmt"
|
||||
"os"
|
||||
"time"
|
||||
)
|
||||
|
||||
func runDeploy(args []string) error {
|
||||
fs := flag.NewFlagSet("deploy", flag.ExitOnError)
|
||||
g := addGlobalFlags(fs)
|
||||
ref := fs.String("ref", "", "image tag / git ref / source-specific deploy target")
|
||||
note := fs.String("note", "", "free-text note recorded with the deploy")
|
||||
timeout := fs.Duration("timeout", 15*time.Minute, "max time to wait for the deploy to finish")
|
||||
fs.Usage = func() {
|
||||
fmt.Fprint(os.Stderr, "Usage: tinyforge deploy <app> [--ref TAG] [--note TEXT] [--timeout DUR]\n\n"+
|
||||
"Trigger a deploy and wait for it to finish. Requires an admin token.\n")
|
||||
fs.PrintDefaults()
|
||||
}
|
||||
if err := fs.Parse(args); err != nil {
|
||||
return err
|
||||
}
|
||||
if fs.NArg() != 1 {
|
||||
fs.Usage()
|
||||
return fmt.Errorf("expected exactly one app (name or id)")
|
||||
}
|
||||
|
||||
sess, err := newSession(g)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// Resolve the app on a short deadline; the deploy itself gets the full one.
|
||||
resolveCtx, cancelResolve := context.WithTimeout(context.Background(), 30*time.Second)
|
||||
defer cancelResolve()
|
||||
app, err := resolveApp(resolveCtx, sess.client, fs.Arg(0))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
body := map[string]string{}
|
||||
if *ref != "" {
|
||||
body["reference"] = *ref
|
||||
}
|
||||
if *note != "" {
|
||||
body["note"] = *note
|
||||
}
|
||||
|
||||
fmt.Printf("Deploying %s%s…\n", app.Name, refSuffix(*ref))
|
||||
|
||||
// The endpoint returns 202 but blocks until the deploy completes, so a
|
||||
// success here means it finished; allow plenty of time for pull/build.
|
||||
ctx, cancel := context.WithTimeout(context.Background(), *timeout)
|
||||
defer cancel()
|
||||
|
||||
var result DeployResult
|
||||
if err := sess.client.doJSON(ctx, "POST", "/api/workloads/"+app.ID+"/deploy", body, &result); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
fmt.Printf("Deploy of %s completed (triggered by %s).\n", app.Name, result.TriggeredBy)
|
||||
fmt.Printf("Follow with: tinyforge logs %s -f\n", app.Name)
|
||||
return nil
|
||||
}
|
||||
|
||||
// refSuffix renders the " @ <ref>" tail of the "Deploying …" message, or
// nothing when no ref was given.
func refSuffix(ref string) string {
	if len(ref) > 0 {
		return " @ " + ref
	}
	return ""
}
|
||||
@@ -0,0 +1,136 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"context"
|
||||
"flag"
|
||||
"fmt"
|
||||
"os"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
func runLogin(args []string) error {
|
||||
fs := flag.NewFlagSet("login", flag.ExitOnError)
|
||||
g := addGlobalFlags(fs)
|
||||
user := fs.String("user", "", "username (prompted if omitted)")
|
||||
pass := fs.String("password", "", "password (insecure; prefer $TINYFORGE_PASSWORD or the prompt)")
|
||||
fs.Usage = func() {
|
||||
fmt.Fprint(os.Stderr, "Usage: tinyforge login [--user U] [--password P] [--base-url URL]\n\n"+
|
||||
"Authenticate against the server and cache the token.\n")
|
||||
fs.PrintDefaults()
|
||||
}
|
||||
if err := fs.Parse(args); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
sess, err := newSession(g)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
username := *user
|
||||
if username == "" {
|
||||
username, err = promptLine("Username: ")
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
password := *pass
|
||||
if password == "" {
|
||||
password = os.Getenv("TINYFORGE_PASSWORD")
|
||||
}
|
||||
if password == "" {
|
||||
password, err = promptPassword("Password: ")
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
if username == "" || password == "" {
|
||||
return fmt.Errorf("username and password are required")
|
||||
}
|
||||
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
|
||||
defer cancel()
|
||||
|
||||
var tok SessionToken
|
||||
body := map[string]string{"username": username, "password": password}
|
||||
if err := sess.client.doJSON(ctx, "POST", "/api/auth/login", body, &tok); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// Persist the resolved base URL alongside the token so later commands need
|
||||
// no flags. The token file is written 0600 by saveConfig.
|
||||
sess.cfg.BaseURL = sess.client.baseURL
|
||||
sess.cfg.Token = tok.Token
|
||||
sess.cfg.ExpiresAt = tok.ExpiresAt
|
||||
if err := saveConfig(sess.configPath, sess.cfg); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
fmt.Printf("Logged in to %s as %s.\n", sess.client.baseURL, username)
|
||||
if exp := friendlyExpiry(tok.ExpiresAt); exp != "" {
|
||||
fmt.Printf("Token valid until %s.\n", exp)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func runLogout(args []string) error {
|
||||
fs := flag.NewFlagSet("logout", flag.ExitOnError)
|
||||
g := addGlobalFlags(fs)
|
||||
if err := fs.Parse(args); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
sess, err := newSession(g)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if sess.client.token == "" {
|
||||
fmt.Println("Not logged in.")
|
||||
return nil
|
||||
}
|
||||
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second)
|
||||
defer cancel()
|
||||
|
||||
// Best-effort server-side revocation; clear the local token regardless.
|
||||
revokeErr := sess.client.doJSON(ctx, "POST", "/api/auth/logout", nil, nil)
|
||||
|
||||
sess.cfg.Token = ""
|
||||
sess.cfg.ExpiresAt = ""
|
||||
if err := saveConfig(sess.configPath, sess.cfg); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if revokeErr != nil {
|
||||
fmt.Printf("Cleared local token (server revocation skipped: %v).\n", revokeErr)
|
||||
return nil
|
||||
}
|
||||
fmt.Println("Logged out.")
|
||||
return nil
|
||||
}
|
||||
|
||||
// stdinByteReader delivers at most one byte per Read call. Wrapping a file in
// it stops bufio from reading ahead of the line it was asked for.
type stdinByteReader struct{ f *os.File }

// Read fills at most the first byte of p from the underlying file.
func (r stdinByteReader) Read(p []byte) (int, error) {
	if len(p) == 0 {
		return 0, nil
	}
	return r.f.Read(p[:1])
}

// promptLine prints label to stderr and reads a single line from stdin,
// returning it with surrounding whitespace trimmed.
//
// Nothing beyond the line's terminating newline is consumed. This matters when
// stdin is a pipe carrying several answers (e.g. username then password): a
// plain buffered reader would swallow the following lines into a buffer that
// is thrown away, and the next prompt would see EOF.
//
// An error is returned only when reading fails before any input was received;
// a final line without a trailing newline is still accepted.
func promptLine(label string) (string, error) {
	fmt.Fprint(os.Stderr, label)
	r := bufio.NewReader(stdinByteReader{os.Stdin})
	line, err := r.ReadString('\n')
	if err != nil && line == "" {
		return "", fmt.Errorf("read input: %w", err)
	}
	return strings.TrimSpace(line), nil
}
|
||||
|
||||
// friendlyExpiry renders an RFC3339 expiry timestamp in the local time zone as
// "YYYY-MM-DD HH:MM ZONE". An empty input gives an empty result, and a value
// that is not valid RFC3339 is returned as-is.
func friendlyExpiry(s string) string {
	if len(s) == 0 {
		return ""
	}
	parsed, err := time.Parse(time.RFC3339, s)
	if err != nil {
		return s
	}
	return parsed.In(time.Local).Format("2006-01-02 15:04 MST")
}
|
||||
+143
@@ -0,0 +1,143 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"flag"
|
||||
"fmt"
|
||||
"net/url"
|
||||
"os"
|
||||
"os/signal"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
func runLogs(args []string) error {
|
||||
fs := flag.NewFlagSet("logs", flag.ExitOnError)
|
||||
g := addGlobalFlags(fs)
|
||||
follow := fs.Bool("f", false, "follow the log stream (Ctrl-C to stop)")
|
||||
tail := fs.Int("tail", 200, "number of trailing lines to show (max 5000)")
|
||||
container := fs.String("container", "", "container row id/prefix or role (when an app has several)")
|
||||
fs.Usage = func() {
|
||||
fmt.Fprint(os.Stderr, "Usage: tinyforge logs <app> [-f] [--tail N] [--container CID]\n\nPrint or follow a container's logs.\n")
|
||||
fs.PrintDefaults()
|
||||
}
|
||||
if err := fs.Parse(args); err != nil {
|
||||
return err
|
||||
}
|
||||
if fs.NArg() != 1 {
|
||||
fs.Usage()
|
||||
return fmt.Errorf("expected exactly one app (name or id)")
|
||||
}
|
||||
|
||||
sess, err := newSession(g)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
resolveCtx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
|
||||
defer cancel()
|
||||
|
||||
app, err := resolveApp(resolveCtx, sess.client, fs.Arg(0))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
var containers []Container
|
||||
if err := sess.client.doJSON(resolveCtx, "GET", "/api/workloads/"+app.ID+"/containers", nil, &containers); err != nil {
|
||||
return err
|
||||
}
|
||||
target, err := chooseContainer(containers, *container)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
q := url.Values{}
|
||||
q.Set("tail", fmt.Sprintf("%d", *tail))
|
||||
base := "/api/workloads/" + app.ID + "/containers/" + target.ID + "/logs"
|
||||
|
||||
if !*follow {
|
||||
var lines []string
|
||||
if err := sess.client.doJSON(resolveCtx, "GET", base+"?"+q.Encode(), nil, &lines); err != nil {
|
||||
return err
|
||||
}
|
||||
for _, line := range lines {
|
||||
fmt.Println(line)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// Follow: stream until EOF or Ctrl-C.
|
||||
q.Set("follow", "true")
|
||||
ctx, stop := signal.NotifyContext(context.Background(), os.Interrupt)
|
||||
defer stop()
|
||||
|
||||
err = sess.client.streamSSE(ctx, base+"?"+q.Encode(), func(payload []byte) error {
|
||||
var frame struct {
|
||||
Line string `json:"line"`
|
||||
}
|
||||
if json.Unmarshal(payload, &frame) != nil {
|
||||
return nil // ignore frames we can't parse
|
||||
}
|
||||
fmt.Println(frame.Line)
|
||||
return nil
|
||||
})
|
||||
if ctx.Err() != nil { // user interrupted — clean exit
|
||||
return nil
|
||||
}
|
||||
return err
|
||||
}
|
||||
|
||||
// chooseContainer selects which container to read. With an explicit selector,
|
||||
// it matches the row id (exact or prefix) or the role. Otherwise it uses the
|
||||
// sole container, or the sole running one, and errors with a list when the
|
||||
// choice is ambiguous.
|
||||
func chooseContainer(cs []Container, selector string) (Container, error) {
|
||||
if len(cs) == 0 {
|
||||
return Container{}, fmt.Errorf("app has no containers yet — deploy it first")
|
||||
}
|
||||
|
||||
if selector != "" {
|
||||
var matches []Container
|
||||
for _, c := range cs {
|
||||
if c.ID == selector || strings.EqualFold(c.Role, selector) ||
|
||||
(len(selector) >= 6 && strings.HasPrefix(c.ID, selector)) {
|
||||
matches = append(matches, c)
|
||||
}
|
||||
}
|
||||
switch len(matches) {
|
||||
case 1:
|
||||
return matches[0], nil
|
||||
case 0:
|
||||
return Container{}, fmt.Errorf("no container matching %q\n%s", selector, containerList(cs))
|
||||
default:
|
||||
return Container{}, fmt.Errorf("%q matches multiple containers\n%s", selector, containerList(cs))
|
||||
}
|
||||
}
|
||||
|
||||
if len(cs) == 1 {
|
||||
return cs[0], nil
|
||||
}
|
||||
var running []Container
|
||||
for _, c := range cs {
|
||||
if c.State == "running" {
|
||||
running = append(running, c)
|
||||
}
|
||||
}
|
||||
if len(running) == 1 {
|
||||
return running[0], nil
|
||||
}
|
||||
return Container{}, fmt.Errorf("app has %d containers; pick one with --container:\n%s", len(cs), containerList(cs))
|
||||
}
|
||||
|
||||
func containerList(cs []Container) string {
|
||||
var b strings.Builder
|
||||
for _, c := range cs {
|
||||
role := c.Role
|
||||
if role == "" {
|
||||
role = "(default)"
|
||||
}
|
||||
fmt.Fprintf(&b, " %s %-12s %s\n", idShort(c.ID), role, c.State)
|
||||
}
|
||||
return strings.TrimRight(b.String(), "\n")
|
||||
}
|
||||
@@ -0,0 +1,95 @@
|
||||
// Command tinyforge is a terminal client for a Tinyforge server.
|
||||
//
|
||||
// It drives the existing HTTP API: log in to obtain a 24h JWT, then list
|
||||
// apps, trigger deploys, stream logs, and check status. The token is cached
|
||||
// in ~/.tinyforge/config.json (mode 0600) so subsequent commands reuse it.
|
||||
//
|
||||
// Usage:
|
||||
//
|
||||
// tinyforge login [--user U] [--password P]
|
||||
// tinyforge apps [list]
|
||||
// tinyforge deploy <app> [--ref TAG] [--note TEXT]
|
||||
// tinyforge logs <app> [-f] [--tail N] [--container CID]
|
||||
// tinyforge status [<app>]
|
||||
// tinyforge logout
|
||||
// tinyforge version
|
||||
//
|
||||
// The target server is resolved from --base-url, then $TINYFORGE_URL, then the
|
||||
// saved config, then http://localhost:8080.
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
)
|
||||
|
||||
// version is the CLI build version. Overridable at build time via
|
||||
// -ldflags "-X main.version=...".
|
||||
var version = "dev"
|
||||
|
||||
func main() {
|
||||
if len(os.Args) < 2 {
|
||||
usage(os.Stderr)
|
||||
os.Exit(2)
|
||||
}
|
||||
|
||||
cmd, args := os.Args[1], os.Args[2:]
|
||||
|
||||
var err error
|
||||
switch cmd {
|
||||
case "login":
|
||||
err = runLogin(args)
|
||||
case "logout":
|
||||
err = runLogout(args)
|
||||
case "apps":
|
||||
err = runApps(args)
|
||||
case "deploy":
|
||||
err = runDeploy(args)
|
||||
case "logs":
|
||||
err = runLogs(args)
|
||||
case "status":
|
||||
err = runStatus(args)
|
||||
case "version", "--version", "-v":
|
||||
fmt.Printf("tinyforge %s\n", version)
|
||||
case "help", "-h", "--help":
|
||||
usage(os.Stdout)
|
||||
default:
|
||||
fmt.Fprintf(os.Stderr, "tinyforge: unknown command %q\n\n", cmd)
|
||||
usage(os.Stderr)
|
||||
os.Exit(2)
|
||||
}
|
||||
|
||||
if err != nil {
|
||||
// Authenticated commands that hit a 401 get a re-login hint; the login
|
||||
// command itself surfaces the server message ("invalid credentials").
|
||||
if cmd != "login" && isAuthError(err) {
|
||||
err = fmt.Errorf("%w — run 'tinyforge login'", err)
|
||||
}
|
||||
fmt.Fprintf(os.Stderr, "tinyforge: %v\n", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
}
|
||||
|
||||
// usage writes the top-level help text (command list and global flags) to w.
// main passes os.Stdout for an explicit help request and os.Stderr when the
// command is missing or unknown.
func usage(w *os.File) {
	fmt.Fprint(w, `tinyforge — terminal client for a Tinyforge server

Usage:
tinyforge <command> [flags]

Commands:
login Authenticate and cache a token
logout Revoke the cached token and clear it
apps [list] List your apps (workloads with a source)
deploy <app> Trigger a deploy (waits for completion)
logs <app> Print container logs (use -f to follow)
status [<app>] Show server health, or one app's containers
version Print the CLI version

Global flags (accepted by any command):
--base-url URL Server URL (default $TINYFORGE_URL or http://localhost:8080)
--token TOKEN Auth token (default $TINYFORGE_TOKEN or cached config)
--config PATH Config file (default $TINYFORGE_CONFIG or ~/.tinyforge/config.json)

Run "tinyforge <command> -h" for command-specific flags.
`)
}
|
||||
@@ -0,0 +1,38 @@
|
||||
//go:build !windows
|
||||
|
||||
package main
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"fmt"
|
||||
"os"
|
||||
"os/exec"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// promptPassword reads a password from stdin with echo disabled via stty. If
|
||||
// stty is unavailable (no tty, missing binary), it falls back to an echoed
|
||||
// read so the command still works in pipes/CI.
|
||||
func promptPassword(label string) (string, error) {
|
||||
fmt.Fprint(os.Stderr, label)
|
||||
|
||||
echoDisabled := stty("-echo") == nil
|
||||
if echoDisabled {
|
||||
defer func() {
|
||||
_ = stty("echo")
|
||||
fmt.Fprintln(os.Stderr) // the Enter keystroke was not echoed
|
||||
}()
|
||||
}
|
||||
|
||||
line, err := bufio.NewReader(os.Stdin).ReadString('\n')
|
||||
if err != nil && line == "" {
|
||||
return "", fmt.Errorf("read password: %w", err)
|
||||
}
|
||||
return strings.TrimRight(line, "\r\n"), nil
|
||||
}
|
||||
|
||||
// stty runs the external stty program with a single argument, giving it this
// process's stdin, and returns the error from running it.
func stty(arg string) error {
	sttyCmd := exec.Command("stty", arg)
	sttyCmd.Stdin = os.Stdin
	return sttyCmd.Run()
}
|
||||
@@ -0,0 +1,45 @@
|
||||
//go:build windows
|
||||
|
||||
package main
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"fmt"
|
||||
"os"
|
||||
"strings"
|
||||
"syscall"
|
||||
"unsafe"
|
||||
)
|
||||
|
||||
// enableEchoInput is the Windows console mode bit that echoes typed input.
const enableEchoInput = 0x0004

// promptPassword reads a password from the console with echo disabled, using
// kernel32 directly so no third-party dependency is needed. If the console
// mode cannot be changed (e.g. piped stdin), it falls back to an echoed read.
//
// label is printed to stderr first. Trailing CR/LF characters are removed from
// the result. An error is returned only when reading fails before any input
// was received.
func promptPassword(label string) (string, error) {
	fmt.Fprint(os.Stderr, label)

	kernel32 := syscall.NewLazyDLL("kernel32.dll")
	getConsoleMode := kernel32.NewProc("GetConsoleMode")
	setConsoleMode := kernel32.NewProc("SetConsoleMode")
	handle := syscall.Handle(os.Stdin.Fd())

	var mode uint32
	echoDisabled := false
	// Both calls are treated as successful when their first result is non-zero.
	if r, _, _ := getConsoleMode.Call(uintptr(handle), uintptr(unsafe.Pointer(&mode))); r != 0 {
		// Clear only the echo bit, leaving the rest of the mode untouched.
		if ret, _, _ := setConsoleMode.Call(uintptr(handle), uintptr(mode&^enableEchoInput)); ret != 0 {
			echoDisabled = true
			// Put the original mode back when the function returns.
			defer setConsoleMode.Call(uintptr(handle), uintptr(mode))
		}
	}

	line, err := bufio.NewReader(os.Stdin).ReadString('\n')
	if echoDisabled {
		fmt.Fprintln(os.Stderr) // the Enter keystroke was not echoed
	}
	if err != nil && line == "" {
		return "", fmt.Errorf("read password: %w", err)
	}
	return strings.TrimRight(line, "\r\n"), nil
}
|
||||
@@ -0,0 +1,122 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"flag"
|
||||
"fmt"
|
||||
"os"
|
||||
"text/tabwriter"
|
||||
"time"
|
||||
)
|
||||
|
||||
func runStatus(args []string) error {
|
||||
fs := flag.NewFlagSet("status", flag.ExitOnError)
|
||||
g := addGlobalFlags(fs)
|
||||
fs.Usage = func() {
|
||||
fmt.Fprint(os.Stderr, "Usage: tinyforge status [<app>]\n\nWith no app: server health and the logged-in user.\nWith an app: that app's containers.\n")
|
||||
fs.PrintDefaults()
|
||||
}
|
||||
if err := fs.Parse(args); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
sess, err := newSession(g)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
|
||||
defer cancel()
|
||||
|
||||
if fs.NArg() == 0 {
|
||||
return serverStatus(ctx, sess)
|
||||
}
|
||||
return appStatus(ctx, sess.client, fs.Arg(0))
|
||||
}
|
||||
|
||||
func serverStatus(ctx context.Context, sess *session) error {
|
||||
fmt.Printf("Server: %s\n", sess.client.baseURL)
|
||||
|
||||
var me User
|
||||
if err := sess.client.doJSON(ctx, "GET", "/api/auth/me", nil, &me); err != nil {
|
||||
fmt.Printf("User: not logged in (%v)\n", err)
|
||||
} else {
|
||||
fmt.Printf("User: %s (%s)\n", me.Username, me.Role)
|
||||
}
|
||||
if exp := friendlyExpiry(sess.cfg.ExpiresAt); exp != "" {
|
||||
fmt.Printf("Token: valid until %s\n", exp)
|
||||
}
|
||||
|
||||
var health map[string]any
|
||||
if err := sess.client.doJSON(ctx, "GET", "/api/health", nil, &health); err != nil {
|
||||
return err
|
||||
}
|
||||
fmt.Printf("DB: %s\n", connState(health, "database"))
|
||||
docker := connState(health, "docker")
|
||||
if v := nestedString(health, "docker", "version"); v != "" {
|
||||
docker += " (v" + v + ")"
|
||||
}
|
||||
fmt.Printf("Docker: %s\n", docker)
|
||||
if _, ok := health["proxy"]; ok {
|
||||
fmt.Printf("Proxy: %s\n", connState(health, "proxy"))
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func appStatus(ctx context.Context, c *Client, ref string) error {
|
||||
app, err := resolveApp(ctx, c, ref)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
var containers []Container
|
||||
if err := c.doJSON(ctx, "GET", "/api/workloads/"+app.ID+"/containers", nil, &containers); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
fmt.Printf("%s (%s, %s)\n", app.Name, app.SourceKind, idShort(app.ID))
|
||||
if len(containers) == 0 {
|
||||
fmt.Println("No containers — not deployed yet.")
|
||||
return nil
|
||||
}
|
||||
|
||||
tw := tabwriter.NewWriter(os.Stdout, 0, 2, 2, ' ', 0)
|
||||
fmt.Fprintln(tw, "ROLE\tSTATE\tIMAGE\tPORT\tSUBDOMAIN\tCONTAINER")
|
||||
for _, c := range containers {
|
||||
role := c.Role
|
||||
if role == "" {
|
||||
role = "(default)"
|
||||
}
|
||||
port := ""
|
||||
if c.Port != 0 {
|
||||
port = fmt.Sprintf("%d", c.Port)
|
||||
}
|
||||
fmt.Fprintf(tw, "%s\t%s\t%s\t%s\t%s\t%s\n",
|
||||
role, c.State, c.ImageRef, port, c.Subdomain, idShort(c.ID))
|
||||
}
|
||||
return tw.Flush()
|
||||
}
|
||||
|
||||
// connState renders the connectivity of one section of the health payload.
//
// It returns "unknown" when health[section] is not a JSON object,
// "connected" when that object's "connected" field is the boolean true, and
// otherwise "disconnected" — with the object's non-empty "error" string
// appended in parentheses when one is present.
func connState(health map[string]any, section string) string {
	entry, isObject := health[section].(map[string]any)
	if !isObject {
		return "unknown"
	}

	if up, _ := entry["connected"].(bool); up {
		return "connected"
	}

	// A missing, non-string, or empty "error" all collapse to the bare label.
	detail, _ := entry["error"].(string)
	if detail == "" {
		return "disconnected"
	}
	return "disconnected (" + detail + ")"
}
|
||||
|
||||
// nestedString returns m[section][key] when m[section] is a JSON object and
// the value under key is a string. In every other case — section missing or
// not an object, key missing, or value of another type — it returns "".
func nestedString(m map[string]any, section, key string) string {
	if inner, isObject := m[section].(map[string]any); isObject {
		if text, isString := inner[key].(string); isString {
			return text
		}
	}
	return ""
}
|
||||
+67
-1
@@ -28,6 +28,7 @@ import (
|
||||
"github.com/alexei/tinyforge/internal/health"
|
||||
"github.com/alexei/tinyforge/internal/logging"
|
||||
"github.com/alexei/tinyforge/internal/logscanner"
|
||||
"github.com/alexei/tinyforge/internal/metricalert"
|
||||
"github.com/alexei/tinyforge/internal/notify"
|
||||
"github.com/alexei/tinyforge/internal/npm"
|
||||
"github.com/alexei/tinyforge/internal/proxy"
|
||||
@@ -36,6 +37,7 @@ import (
|
||||
"github.com/alexei/tinyforge/internal/stale"
|
||||
"github.com/alexei/tinyforge/internal/stats"
|
||||
"github.com/alexei/tinyforge/internal/store"
|
||||
"github.com/alexei/tinyforge/internal/volsnap"
|
||||
"github.com/alexei/tinyforge/internal/webhook"
|
||||
"github.com/alexei/tinyforge/internal/workload/plugin"
|
||||
|
||||
@@ -43,6 +45,7 @@ import (
|
||||
// itself with internal/workload/plugin. Adding a new Source or Trigger
|
||||
// is a matter of dropping a new package and adding it to this list.
|
||||
_ "github.com/alexei/tinyforge/internal/workload/plugin/source/compose"
|
||||
_ "github.com/alexei/tinyforge/internal/workload/plugin/source/dockerfile"
|
||||
_ "github.com/alexei/tinyforge/internal/workload/plugin/source/image"
|
||||
_ "github.com/alexei/tinyforge/internal/workload/plugin/source/static"
|
||||
_ "github.com/alexei/tinyforge/internal/workload/plugin/trigger/git"
|
||||
@@ -62,6 +65,20 @@ func main() {
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
// Acquire single-instance lockfile BEFORE opening the DB. SQLite +
|
||||
// SetMaxOpenConns(1) does not protect against two Tinyforge processes
|
||||
// sharing a data directory; without this guard a misconfigured
|
||||
// systemd unit, container restart race, or `tinyforge` shell typo can
|
||||
// silently double-fire schedulers, double-poll registries, and
|
||||
// corrupt `extra_json` RMW. The lockfile is a PID file under
|
||||
// $DATA_DIR/tinyforge.lock — collisions with dead PIDs are reclaimed.
|
||||
releaseLock, err := store.AcquireLockfile(dataDir)
|
||||
if err != nil {
|
||||
slog.Error("could not acquire data-dir lock", "data_dir", dataDir, "error", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
defer releaseLock()
|
||||
|
||||
// Open database.
|
||||
dbPath := filepath.Join(dataDir, "tinyforge.db")
|
||||
db, err := store.New(dbPath)
|
||||
@@ -78,6 +95,21 @@ func main() {
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
// One-shot migration: rewrite every legacy unprefixed-hex secret
|
||||
// in the DB into the new tf1: envelope form. Idempotent (gated by
|
||||
// schema_versions version 2). Lets the rest of the codebase treat
|
||||
// envelope-presence as a stable invariant for future key rotations.
|
||||
// Failures here are logged but non-fatal: a partial migration just
|
||||
// means some columns keep working through Decrypt's legacy
|
||||
// fallback until the next manual save re-encrypts them.
|
||||
if err := db.MigrateSecretsToEnvelope(store.EnvelopeMigrator{
|
||||
HasEnvelope: crypto.HasEnvelope,
|
||||
Decrypt: func(v string) (string, error) { return crypto.Decrypt(encKey, v) },
|
||||
Encrypt: func(v string) (string, error) { return crypto.Encrypt(encKey, v) },
|
||||
}); err != nil {
|
||||
slog.Warn("secrets envelope migration", "error", err)
|
||||
}
|
||||
|
||||
// Import seed config on first launch (idempotent).
|
||||
seedPath := envOrDefault("SEED_FILE", "./tinyforge.yaml")
|
||||
if err := config.ImportSeed(db, seedPath); err != nil {
|
||||
@@ -197,7 +229,8 @@ func main() {
|
||||
switch {
|
||||
case r.Deployed:
|
||||
deployed++
|
||||
case r.Reason == webhook.ReasonBindingDisabled, r.Reason == webhook.ReasonNoMatch:
|
||||
case r.Reason == webhook.ReasonBindingDisabled, r.Reason == webhook.ReasonNoMatch,
|
||||
r.Reason == webhook.ReasonPreviewNoop:
|
||||
// not a failure — silent
|
||||
default:
|
||||
errored++
|
||||
@@ -291,6 +324,19 @@ func main() {
|
||||
}
|
||||
dep.SetPreDeployBackuper(backupEngine)
|
||||
|
||||
// Initialize volume-snapshot engine (per-workload data-volume archives).
|
||||
snapshotEngine, err := volsnap.New(db, dataDir)
|
||||
if err != nil {
|
||||
slog.Error("create snapshot engine", "error", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
// Reclaim snapshot files orphaned by workload deletes (rows CASCADE, files don't).
|
||||
if cleaned, err := snapshotEngine.CleanOrphans(); err != nil {
|
||||
slog.Warn("snapshots: clean orphans on startup", "error", err)
|
||||
} else if cleaned > 0 {
|
||||
slog.Info("snapshots: cleaned orphan files on startup", "count", cleaned)
|
||||
}
|
||||
|
||||
// Clean orphaned backup files and prune on startup.
|
||||
if cleaned, err := backupEngine.CleanOrphans(); err != nil {
|
||||
slog.Warn("backup: clean orphans on startup", "error", err)
|
||||
@@ -359,11 +405,30 @@ func main() {
|
||||
}
|
||||
defer logScanMgr.Stop()
|
||||
|
||||
// Metric-alert manager: evaluates threshold rules against recent
|
||||
// container stats samples and emits event_log entries on breach.
|
||||
// The store satisfies RuleSource/SampleSource/EventSink; the event
|
||||
// bus is the Publisher.
|
||||
metricAlertMgr := metricalert.New(db, db, db, eventBus)
|
||||
metricAlertMgr.Start()
|
||||
defer metricAlertMgr.Stop()
|
||||
|
||||
// Build API server.
|
||||
apiServer := api.NewServer(db, dockerClient, npmClient, proxyProvider, dep, notifier, webhookHandler, eventBus, encKey)
|
||||
apiServer.SetStaleScanner(staleScanner)
|
||||
apiServer.SetLogScanReloader(logScanMgr)
|
||||
apiServer.SetBackupEngine(backupEngine)
|
||||
apiServer.SetSnapshotEngine(snapshotEngine)
|
||||
// Wire the restore lifecycle seam and reconcile any restore interrupted by a
|
||||
// crash, BEFORE the HTTP server starts serving — so a half-applied restore is
|
||||
// completed/reverted first and the restore endpoint is never reachable
|
||||
// without its safety net.
|
||||
snapshotEngine.SetLifecycle(&restoreLifecycle{dep: dep, docker: dockerClient, store: db})
|
||||
if n, err := snapshotEngine.RecoverInterruptedRestores(); err != nil {
|
||||
slog.Warn("snapshots: recover interrupted restores on startup", "error", err)
|
||||
} else if n > 0 {
|
||||
slog.Info("snapshots: recovered interrupted restores on startup", "count", n)
|
||||
}
|
||||
apiServer.SetDBPath(dbPath)
|
||||
apiServer.SetBackupSettingsChangedCallback(scheduleAutobackup)
|
||||
apiServer.SetDNSProvider(dnsProvider)
|
||||
@@ -420,6 +485,7 @@ func main() {
|
||||
eventBus.Unsubscribe(notifySub)
|
||||
staleScanner.Stop()
|
||||
statsCollector.Stop()
|
||||
metricAlertMgr.Stop()
|
||||
|
||||
// Drain in-progress deploys and notifications.
|
||||
dep.Drain()
|
||||
|
||||
@@ -0,0 +1,70 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"time"
|
||||
|
||||
"github.com/alexei/tinyforge/internal/deployer"
|
||||
"github.com/alexei/tinyforge/internal/docker"
|
||||
"github.com/alexei/tinyforge/internal/store"
|
||||
"github.com/alexei/tinyforge/internal/workload/plugin"
|
||||
)
|
||||
|
||||
// restoreStopTimeoutSeconds bounds the graceful-stop window per container during
|
||||
// a restore quiesce before Docker kills it.
|
||||
const restoreStopTimeoutSeconds = 10
|
||||
|
||||
// restoreLifecycle adapts the deployer + Docker client + store to the
|
||||
// volsnap.Lifecycle seam the volume-snapshot restore flow needs. It lives in the
|
||||
// composition root so the volsnap package stays decoupled from deployer/docker.
|
||||
type restoreLifecycle struct {
	// dep provides the per-workload deploy lock (LockWorkload) and the
	// redeploy path used after a restore (RedeployLocked).
	dep *deployer.Deployer
	// docker stops the workload's running containers during quiesce.
	docker *docker.Client
	// store lists the workload's container rows and records their new state.
	store *store.Store
}
|
||||
|
||||
// Lock takes the deployer's per-workload deploy lock so the restore serializes
|
||||
// against every deploy entrypoint (C1).
|
||||
func (l *restoreLifecycle) Lock(workloadID string) func() { return l.dep.LockWorkload(workloadID) }
|
||||
|
||||
// StopContainers stops every running container for the workload (quiesce before
|
||||
// the volume swap, C4) and returns the image tag the newest running container
|
||||
// was on, so the redeploy brings the SAME version back up. ListContainersByWorkload
|
||||
// returns rows newest-first, so the first running row is the newest.
|
||||
func (l *restoreLifecycle) StopContainers(ctx context.Context, workloadID string) (string, error) {
|
||||
rows, err := l.store.ListContainersByWorkload(workloadID)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("list containers: %w", err)
|
||||
}
|
||||
tag := ""
|
||||
for _, c := range rows {
|
||||
if c.State != "running" || c.ContainerID == "" {
|
||||
continue
|
||||
}
|
||||
if tag == "" && c.ImageTag != "" {
|
||||
tag = c.ImageTag // newest running container's tag
|
||||
}
|
||||
if err := l.docker.StopContainer(ctx, c.ContainerID, restoreStopTimeoutSeconds); err != nil {
|
||||
return "", fmt.Errorf("stop container %s: %w", c.ContainerID, err)
|
||||
}
|
||||
if err := l.store.UpdateContainerState(c.ID, "stopped"); err != nil {
|
||||
slog.Warn("restore: mark container stopped", "container", c.ID, "error", err)
|
||||
}
|
||||
}
|
||||
return tag, nil
|
||||
}
|
||||
|
||||
// Redeploy re-dispatches the workload via the deployer's unlocked path (the
|
||||
// restore already holds the per-workload lock). reference pins the image tag.
|
||||
func (l *restoreLifecycle) Redeploy(ctx context.Context, w store.Workload, reference string) error {
|
||||
intent := plugin.DeploymentIntent{
|
||||
Reason: "restore",
|
||||
Reference: reference,
|
||||
Metadata: map[string]string{"note": "redeploy after volume snapshot restore"},
|
||||
TriggeredAt: time.Now().UTC(),
|
||||
TriggeredBy: "restore",
|
||||
}
|
||||
return l.dep.RedeployLocked(ctx, plugin.WorkloadFromStore(w), intent)
|
||||
}
|
||||
+11
-2
@@ -1,7 +1,13 @@
|
||||
services:
|
||||
tinyforge:
|
||||
# Default: build from source so a fresh clone works out of the box.
|
||||
build: .
|
||||
image: tinyforge:latest
|
||||
# Image name doubles as the Gitea registry tag. To DEPLOY the pre-built
|
||||
# image instead of building (e.g. Portainer pulling on a webhook), comment
|
||||
# out `build:` above — compose will then pull this tag. `:latest` is pushed
|
||||
# only for stable (non pre-release) releases, and the registry may require
|
||||
# `docker login git.dolgolyov-family.by` first if the package is private.
|
||||
image: git.dolgolyov-family.by/alexei.dolgolyov/tiny-forge:latest
|
||||
container_name: tinyforge
|
||||
restart: unless-stopped
|
||||
ports:
|
||||
@@ -31,7 +37,10 @@ services:
|
||||
networks:
|
||||
- staging-net
|
||||
healthcheck:
|
||||
test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:8080/api/auth/login"]
|
||||
# /readyz is the public readiness probe (pings the DB, rate-limited).
|
||||
# The previous target (/api/auth/login) is POST-only, so a GET/spider
|
||||
# request returned 405 and the container was always reported unhealthy.
|
||||
test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:8080/readyz"]
|
||||
interval: 30s
|
||||
timeout: 5s
|
||||
retries: 3
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
# Tinyforge Codemaps — Index
|
||||
|
||||
**Last Updated:** 2026-05-16
|
||||
**Last Updated:** 2026-05-16 (added `container-extra-json` policy doc)
|
||||
|
||||
This directory contains architectural maps of key Tinyforge subsystems. Each codemap focuses on one major area: core data types, contract surfaces, integration points, and recipes for extending the system.
|
||||
|
||||
@@ -8,6 +8,7 @@ This directory contains architectural maps of key Tinyforge subsystems. Each cod
|
||||
|
||||
- **[Workload Plugin](./workload-plugin.md)** — Source × Trigger plugin contracts; registry lookups; webhook fan-out; how to add new kinds.
|
||||
- **[Discovery & Runtime API](./discovery-and-runtime.md)** — `/api/discovery/*` helpers (Git provider probe, repo/branch/tree pickers, image conflicts); `/api/workloads/{id}/runtime-state` + `/storage` + `/stop` + `/start`; SSRF-safe HTTP client in `internal/staticsite`.
|
||||
- **[`containers.extra_json` Evolution Policy](./container-extra-json.md)** — Ownership model, reader/writer rules, wholesale-overwrite vs preserve-unknown-keys patterns, concurrency invariants; checklist for adding a new field without breaking older deployers.
|
||||
|
||||
## Cross-References
|
||||
|
||||
|
||||
@@ -0,0 +1,105 @@
|
||||
# `containers.extra_json` — Evolution Policy
|
||||
|
||||
**Last Updated:** 2026-05-16
|
||||
|
||||
`extra_json` is a TEXT column on the `containers` table that source plugins use to persist source-specific runtime state that hasn't been promoted to a first-class column. It is the single forward-compatibility seam between the canonical container row and per-source needs that arise after a schema is in production.
|
||||
|
||||
This doc captures the rules every reader and writer must follow so new sources can extend the blob without breaking older ones.
|
||||
|
||||
## Schema position
|
||||
|
||||
- Column: `containers.extra_json TEXT NOT NULL DEFAULT '{}'` ([`internal/store/store.go:233`](../../internal/store/store.go#L233)).
|
||||
- All four write paths (`CreateContainer`, `UpsertContainer`, `ReconcileContainer`, `UpdateContainer`) normalize `""` → `'{}'` before the SQL exec — readers can assume a non-empty JSON object string and never need to handle SQL `NULL` or the empty-string edge.
|
||||
- Defined on the `Container` model: [`internal/store/models.go:342-347`](../../internal/store/models.go#L342-L347).
|
||||
|
||||
## Ownership model
|
||||
|
||||
**One container row → one owning source.** Sources never write to a row that belongs to another source. In practice:
|
||||
|
||||
| Source kind | Row key | Number of rows per workload | Writes `extra_json` today? |
|
||||
| ----------- | -------------------------------------- | --------------------------- | --------------------------- |
|
||||
| `static` | deterministic `<workloadID>:site` | exactly 1 | yes (preserve-unknown-keys) |
|
||||
| `image` | UUID per deployed container | 1 + N (blue-green rolls) | yes (wholesale-overwrite) |
|
||||
| `compose` | deterministic `<workloadID>:<service>` | N (one per compose service) | no — left at `'{}'` default |
|
||||
|
||||
Two sources cannot contend on the same row, so the policy below is concerned with **forward compatibility across versions of the same source**, not cross-source contention. When compose (or any future source) starts writing `extra_json`, the same rules apply.
|
||||
|
||||
## Reader rules — ALL readers
|
||||
|
||||
1. **Tolerate unknown keys.** Decode into a typed struct using `encoding/json`; Go's default unmarshaller silently drops unknown keys, which is the desired behaviour. Never use `json.Decoder.DisallowUnknownFields()` on `extra_json`.
|
||||
2. **Tolerate decode failure as non-fatal where the row's first-class columns are useful.** A corrupted `extra_json` is debug-logged and the reader falls back to zero state — see `workload_runtime.go:118-133` for the canonical pattern. The container's `ContainerID`, `State`, `ProxyRouteID`, etc. live in their own columns and are still trustworthy.
|
||||
3. **Tolerate `''` and `'{}'`.** Both are equivalent to "no extras yet". Readers must short-circuit before json.Unmarshal to avoid `unexpected end of JSON input` on the empty case.
|
||||
|
||||
## Writer rules — by mutation style
|
||||
|
||||
Two distinct write patterns live in the codebase today. Pick the one that matches your source's needs.
|
||||
|
||||
### Wholesale-overwrite (image source pattern)
|
||||
|
||||
When the writer owns 100% of the blob's shape and discards old contents on every write:
|
||||
|
||||
```go
|
||||
// internal/workload/plugin/source/image/image.go:341-343
|
||||
extra := containerExtra{ProxyRoutes: faceRoutes}
|
||||
if b, err := json.Marshal(extra); err == nil {
|
||||
created.ExtraJSON = string(b)
|
||||
}
|
||||
```
|
||||
|
||||
- Cheap and simple.
|
||||
- **Loses unknown keys written by future versions of the same source.** Only use when you are certain no other writer (including a future version of this code) needs to round-trip an unknown key.
|
||||
- The `containerExtra` struct must be **additive-only**: never rename or remove a field once shipped, and never change its JSON type. Mark new fields with `omitempty` so that, after a downgrade to an older codebase, readers don't encounter surprise nulls for fields they don't know about.
|
||||
|
||||
### Preserve-unknown-keys (static source pattern)
|
||||
|
||||
When future versions of the source (or sibling writers) may add fields and the current writer must round-trip them:
|
||||
|
||||
```go
|
||||
// internal/workload/plugin/source/static/state.go saveState
|
||||
// 1. Decode existing blob into map[string]json.RawMessage.
|
||||
// 2. Strip every key the current typed-state struct owns
|
||||
// (runtimeStateKeys) so a cleared field actually drops.
|
||||
// 3. Apply caller's mutate() to the typed state.
|
||||
// 4. Re-marshal typed state, splice its keys back into the
|
||||
// generic map (overwriting any historical sibling).
|
||||
// 5. Marshal the merged map back into extra_json.
|
||||
```
|
||||
|
||||
- Slightly more expensive (two round-trips through `json`).
|
||||
- Preserves keys the current writer doesn't know about — required for safe rolling deploys where a newer instance writes a new key, an older instance then reads, mutates, and writes back.
|
||||
- Must declare the typed key set explicitly (`runtimeStateKeys`) so step 2 can strip them. This invariant is fenced by `TestRuntimeState_JSONTagsRoundTrip` in [`state_integration_test.go`](../../internal/workload/plugin/source/static/state_integration_test.go).
|
||||
|
||||
**Default to preserve-unknown-keys for any new source.** Wholesale-overwrite is acceptable for the image source today because the row's lifetime is short (replaced on every blue-green roll) and only one writer touches it. Sources whose container rows are long-lived (static, future compose-with-stateful-services) should preserve unknown keys.
|
||||
|
||||
## Concurrency
|
||||
|
||||
`UpsertContainer` is atomic at the SQL layer — SQLite serializes statements through one connection ([`internal/store/store.go:55`](../../internal/store/store.go#L55) `SetMaxOpenConns(1)`) with WAL mode enabled ([`store.go:60`](../../internal/store/store.go#L60)). That guarantees no torn write on a single row, and concurrent readers see a consistent snapshot — they read either the pre- or post-write state, never a half-applied one.
|
||||
|
||||
What that does **not** guarantee is atomic read-modify-write across two Go goroutines. The static source serializes its RMW through a per-workload `sync.Mutex` keyed by workload ID (`internal/workload/plugin/source/static/state.go` `lockFor` + `saveState`). Any source that does its own read-modify-write on `extra_json` must do the same — verified in `TestSaveState_ConcurrentWritesDoNotLoseUpdates` (which loses 15+ markers per 20-writer run when the mutex is disabled, as confirmed in commit `ef62a41`).
|
||||
|
||||
If a future source is purely wholesale-overwrite from a single writer, no lock is needed.
|
||||
|
||||
## What `extra_json` is NOT for
|
||||
|
||||
- **Workload-level config.** Workload config goes in `workloads.source_config` and is the operator's surface.
|
||||
- **Cross-source state.** If two sources need the same data, promote it to a column.
|
||||
- **Anything queryable.** SQLite can JSON-path `extra_json` but no index supports it; readers always pull the column wholesale and parse in Go.
|
||||
- **Secrets.** Anything sensitive lives in `workload_env` (per-entry encrypt flag) or another encrypted table.
|
||||
|
||||
## Adding a new field — checklist
|
||||
|
||||
1. Add the field to your source's typed struct with `omitempty` and a stable `json:"snake_case"` tag.
|
||||
2. If you use the **preserve-unknown-keys** pattern, add the JSON key to your `*Keys` slice (the equivalent of `runtimeStateKeys`).
|
||||
3. Confirm older readers (older deploys of the same binary) still parse the blob — `encoding/json` should drop the unknown key silently. Add a regression test if there's any doubt.
|
||||
4. Document the new field in this codemap if it's load-bearing for cross-source code (e.g., the proxy_routes map drives `ListProxyRoutes`).
|
||||
|
||||
## Pointers
|
||||
|
||||
- Container model + `ExtraJSON` comment: [`internal/store/models.go:342-347`](../../internal/store/models.go#L342-L347)
|
||||
- Schema declaration: [`internal/store/store.go:233`](../../internal/store/store.go#L233)
|
||||
- Store-level normalization (`'{}'` default) across all four write paths: [`internal/store/containers.go:42-43`](../../internal/store/containers.go#L42-L43) (CreateContainer), `:77-78` (UpsertContainer), `:129-130` (ReconcileContainer), `:321-322` (UpdateContainer).
|
||||
- Wholesale-overwrite writer + struct: [`image.go:341-343`](../../internal/workload/plugin/source/image/image.go#L341-L343) writes; [`image.go:481-487`](../../internal/workload/plugin/source/image/image.go#L481-L487) defines `containerExtra`; [`image.go:449-456`](../../internal/workload/plugin/source/image/image.go#L449-L456) reads it back in Teardown.
|
||||
- Preserve-unknown-keys example + concurrency lock: [`internal/workload/plugin/source/static/state.go`](../../internal/workload/plugin/source/static/state.go).
|
||||
- Canonical "decode-and-tolerate" consumer (the only cross-source reader in tree today): [`internal/api/workload_runtime.go:118-133`](../../internal/api/workload_runtime.go#L118-L133) decodes the static-only typed fields and falls back to first-class columns when the blob is empty, missing keys, or malformed.
|
||||
|
||||
Note: no cross-source consumer reads `extra_json` in `internal/store/`. The proxy/route data exposed by `ListProxyRoutes` ([`containers.go:196`](../../internal/store/containers.go#L196)) comes from first-class columns (`proxy_route_id`, `subdomain`, `port`); the `proxy_routes` map inside `extra_json` is read only by the image source's own Teardown for cleanup.
|
||||
@@ -500,13 +500,15 @@ covers the use case — `promote-from` works, the UI shows the relationship.
|
||||
Probably can leave the legacy `stages` table dropped entirely once cutover
|
||||
proceeds.
|
||||
|
||||
### `Container.extra_json` evolution
|
||||
### ~~`Container.extra_json` evolution~~ — DONE (2026-05-16)
|
||||
|
||||
Currently only the image source uses it (per-face proxy route IDs). If
|
||||
other sources gain similar needs (compose service health metadata, static
|
||||
build SHAs), the schema there should stay versionless and additive — every
|
||||
reader must tolerate unknown keys. Document this in the source plugin
|
||||
guide alongside the codemap entry.
|
||||
Both writer patterns now have an active example in-tree (image source
|
||||
clobbers, static source preserves) and the policy is documented in
|
||||
[`docs/CODEMAPS/container-extra-json.md`](CODEMAPS/container-extra-json.md):
|
||||
ownership model, wholesale-overwrite vs preserve-unknown-keys, reader
|
||||
tolerance for unknown keys + decode failure, the per-workload mutex
|
||||
requirement for any read-modify-write writer, and a checklist for adding
|
||||
a new field without breaking older deployers.
|
||||
|
||||
## File pointers for the next session
|
||||
|
||||
|
||||
@@ -0,0 +1,77 @@
|
||||
# GitOps: config-as-code with `.tinyforge.yml`
|
||||
|
||||
A **dockerfile** or **static** workload can read part of its deploy config from a
|
||||
`.tinyforge.yml` file in its own repo. Tinyforge fetches the file, shows you how it
|
||||
differs from the live config (**drift**), and applies it when you click **Sync** — so the
|
||||
repo becomes the source of truth for the declared fields.
|
||||
|
||||
This is opt-in per workload and **manual-sync only** in v1: nothing is applied automatically
|
||||
on deploy, and a sync never runs without an explicit admin action.
|
||||
|
||||
## Enabling it
|
||||
|
||||
1. Open the workload (Apps → your app).
|
||||
2. In the **GitOps** panel, toggle it on. The default file path is `.tinyforge.yml` at the
|
||||
repo root; change it if your file lives elsewhere (e.g. `deploy/.tinyforge.yml`).
|
||||
3. Add a `.tinyforge.yml` to the repo (schema below) and push.
|
||||
4. The panel shows the parsed file and any drift vs. the live config. Click **Sync now** to
|
||||
apply the repo's values to the workload.
|
||||
|
||||
Only **dockerfile** and **static** sources are eligible — they're the git-backed sources.
|
||||
`image` and `compose` workloads don't show the panel.
|
||||
|
||||
## `.tinyforge.yml` schema (v1)
|
||||
|
||||
```yaml
|
||||
version: 1 # required, must be 1
|
||||
deploy:
|
||||
# dockerfile only:
|
||||
port: 8080 # container port the app listens on
|
||||
healthcheck: /healthz # HTTP path probed before a blue-green cutover ("" to disable)
|
||||
# dockerfile + static:
|
||||
deploy_strategy: blue-green # "" | recreate | blue-green
|
||||
```
|
||||
|
||||
Notes:
|
||||
|
||||
- **Only the fields above are honored.** Unknown keys are rejected with an error (so a typo
|
||||
surfaces instead of being silently ignored).
|
||||
- Fields you omit are **left untouched** — the file overlays only what it declares; it never
|
||||
clears the rest of your config.
|
||||
- The file is **source-aware**: a `static` workload only honors `deploy_strategy` (a static
|
||||
site has no port/healthcheck); `port`/`healthcheck` in a static site's file are ignored.
|
||||
- `deploy_strategy: ""` and `recreate` are equivalent (both are the default for dockerfile
|
||||
and static), so they never show as drift against each other.
|
||||
|
||||
## What `.tinyforge.yml` does **not** contain
|
||||
|
||||
- **No repo location** (provider / owner / repo / branch) and **no access token** — those
|
||||
stay in Tinyforge's encrypted database. This is deliberate: it keeps credentials out of
|
||||
your repo. (You need the repo coords to find the file in the first place, so they can't
|
||||
live in it.)
|
||||
|
||||
## Drift and sync
|
||||
|
||||
- **Drift** is computed only over the fields the file declares, after normalization (so a
|
||||
defaulted strategy or a YAML-int vs stored-number difference isn't a false positive).
|
||||
- **Sync** fetches the file, merges the declared fields onto a copy of the live config,
|
||||
**validates the merged result** with the source's own rules, and only persists it if it
|
||||
passes — a bad file is rejected as a whole and never leaves a partial config. The sync is
|
||||
recorded to the workload's activity log (not the deploy ledger — it changes config, it
|
||||
isn't a deploy).
|
||||
- While GitOps is enabled, the edit form shows a banner noting which fields the repo manages;
|
||||
editing them in the UI works, but the next Sync overwrites them with the repo's values.
|
||||
|
||||
## Not in v1 (planned)
|
||||
|
||||
These are intentionally out of scope for the first version; the design leaves clean seams
|
||||
for them:
|
||||
|
||||
- **`env` and `faces` (public subdomains)** — they live in separate stores and (for `env`)
|
||||
would re-introduce a secrets-in-repo risk; deferred to a typed multi-target apply.
|
||||
- **Auto-apply on deploy** — applying the repo config automatically on every push. v1 keeps
|
||||
a human in the loop with the drift view + manual Sync. When added, it will read the file
|
||||
at the exact deployed commit (a source-plugin concern), not at dispatch time.
|
||||
- **Multi-workload reconcile** — one repo declaring/creating/deleting many workloads
|
||||
(the full Flux/Argo model). v1 is per-workload, config-only, with no create/delete.
|
||||
- **`image` / `compose` sources** — not git-backed / overlapping config surface.
|
||||
@@ -0,0 +1,223 @@
|
||||
# Deploy History + One-Click Rollback — Implementation Plan
|
||||
|
||||
**Status:** planned (review incorporated) · **Feature rank:** #1 · **Date:** 2026-06-19
|
||||
|
||||
## Review findings incorporated (adversarial pass)
|
||||
|
||||
- **BLOCKER — never persist the raw deploy error** (it can carry registry-auth bytes /
|
||||
compose stdout — see `compose.go` SECURITY comment + `workloads_plugin.go:198`).
|
||||
`deploy_history.error` only ever gets a **fixed generic marker**
|
||||
(`"deploy failed (see server logs)"`) on failure; the raw error goes to `slog` only.
|
||||
`capDeployStatus(err.Error())` is rejected.
|
||||
- **BLOCKER — don't double-count metrics.** `DispatchPlugin` already calls
|
||||
`metrics.DeploysTotal.Inc(...)`; recording slots into the **existing** outcome block,
|
||||
not a re-added metrics line.
|
||||
- **FIX — no runtime-state store getter exists.** static/dockerfile `LastCommitSHA`
|
||||
lives in `containers.extra_json` on a deterministic-ID row
|
||||
(`GetContainerByID(w.ID+":site")` / `+":dockerfile"`, decode `ExtraJSON`). Moot for
|
||||
Phase-1 rollback (image-only) but the resolver must use this, not a fictional getter.
|
||||
- **FIX — cascade is distrusted here.** `DeleteWorkload` explicitly deletes containers
|
||||
rather than relying on the FK. Match that: add `DELETE FROM deploy_history WHERE
|
||||
workload_id = ?` inside the `DeleteWorkload` transaction, and make the cascade test a
|
||||
hard gate.
|
||||
- **FIX — keep recording off the hot path's tail.** `DispatchPlugin` runs synchronously
|
||||
on the request goroutine; the INSERT is cheap and stays inline, but `PruneDeployHistory` runs in a
|
||||
goroutine. Draining-rejected attempts (beginDispatch fail) record nothing — correct,
|
||||
a never-run deploy must not appear as a rollback target.
|
||||
- **FIX — pagination:** use `parseLimit(raw, 50, 200)` (not the unclamped
|
||||
`listWorkloadEvents` style); parse `offset` separately, clamp negatives to 0.
|
||||
|
||||
|
||||
## Problem
|
||||
|
||||
Tinyforge has *failure* rollback (a failed deploy unwinds its own new container —
|
||||
[image.go:258](../../internal/workload/plugin/source/image/image.go)), but **no way to
|
||||
revert a *successful* deploy to a prior version.** Blue-green's `enforceMaxInstances`
|
||||
deletes the old container rows after cutover, so once `v3` replaces `v2` there is no
|
||||
record of `v2` and nothing to roll back to. The only "history" is free-text
|
||||
`event_log` rows (`"deployed"`) — not structured, not version-pinned, not replayable.
|
||||
|
||||
This is the single most-requested capability for any deploy tool, and the plumbing is
|
||||
90% there: every deploy flows through one choke point, and the manual-deploy endpoint
|
||||
already accepts a `reference` override.
|
||||
|
||||
## Key architectural facts (verified against current code)
|
||||
|
||||
- **Single dispatch choke point:** `Deployer.DispatchPlugin(ctx, w, intent)` in
|
||||
[internal/deployer/dispatch.go](../../internal/deployer/dispatch.go) routes *every*
|
||||
source kind and already computes a success/failure `outcome`. This is where history
|
||||
is recorded.
|
||||
- **`intent.Reference` is the version handle:** image source resolves
|
||||
`tag := intent.Reference` (falling back to `DefaultTag`/`latest`). The manual deploy
|
||||
endpoint ([workloads_plugin.go](../../internal/api/workloads_plugin.go)) already accepts
|
||||
`{reference, note}` and builds a `manual` intent. **Rollback = deploy with a pinned
|
||||
reference + a distinct reason.**
|
||||
- **Effective vs requested reference:** for a *manual* image deploy `intent.Reference`
|
||||
is often `""` (means `DefaultTag`). The *effective* deployed tag is written onto the
|
||||
freshest container row (`store.Container.ImageTag`). For static/dockerfile the
|
||||
effective version is `runtime_state.LastCommitSHA`, resolved inside the source.
|
||||
- **Built-from-source sources don't honor a SHA reference on Deploy** — static and
|
||||
dockerfile clone `cfg.Branch` HEAD and capture `latestSHA`; they cannot yet check out
|
||||
an arbitrary commit. So **SHA-pinned rollback for them needs a source change (later
|
||||
phase).** Image-tag rollback works today.
|
||||
- **Migration pattern:** additive statements in `runMigrations()` /
|
||||
`workloadTables` in [store.go](../../internal/store/store.go); workload-scoped tables
|
||||
use `REFERENCES workloads(id) ON DELETE CASCADE`. Per-table CRUD lives in its own
|
||||
`internal/store/<table>.go`, model in `models.go`.
|
||||
- **Idempotency note:** the image source's same-tag short-circuit returns *before* it
|
||||
arms its `EmitDeployEvent` defer, so a no-op deploy emits no timeline event. History
|
||||
recorded at `DispatchPlugin` will still log it as a `success` attempt — acceptable
|
||||
(history = ledger of attempts), but called out so the divergence is intentional.
|
||||
|
||||
## Scope
|
||||
|
||||
### Phase 1 (this plan)
|
||||
1. Persistent, structured **deploy-history ledger** for **all** source kinds (success
|
||||
*and* failure) — powers an audit timeline and the rollback action.
|
||||
2. **One-click rollback** for the **image** source (redeploy a pinned tag).
|
||||
3. Read-only history panel on `/apps/[id]`; rollback button shown only for entries that
|
||||
are `success` + have a non-empty reference + a rollback-capable source kind.
|
||||
|
||||
### Explicitly out of scope (future phases, table already supports them)
|
||||
- SHA-pinned rebuild rollback for static/dockerfile (needs source checkout-by-commit).
|
||||
- Config-snapshot rollback for compose (no artifact reference).
|
||||
- Promotion (dev→staging→prod) — separate feature, will reuse this ledger.
|
||||
|
||||
## Data model
|
||||
|
||||
New table `deploy_history` (added to `workloadTables` in `runMigrations`):
|
||||
|
||||
```sql
|
||||
CREATE TABLE IF NOT EXISTS deploy_history (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
workload_id TEXT NOT NULL REFERENCES workloads(id) ON DELETE CASCADE,
|
||||
source_kind TEXT NOT NULL DEFAULT '',
|
||||
reference TEXT NOT NULL DEFAULT '', -- effective artifact: image tag | commit sha | ''
|
||||
reason TEXT NOT NULL DEFAULT '', -- manual|registry-push|git-push|cron|rollback|promote
|
||||
triggered_by TEXT NOT NULL DEFAULT '',
|
||||
note TEXT NOT NULL DEFAULT '',
|
||||
outcome TEXT NOT NULL DEFAULT '', -- success | failure
|
||||
error TEXT NOT NULL DEFAULT '', -- fixed generic marker on failure; never the raw error
|
||||
started_at TEXT NOT NULL DEFAULT '',
|
||||
finished_at TEXT NOT NULL DEFAULT ''
|
||||
);
|
||||
CREATE INDEX IF NOT EXISTS idx_deploy_history_workload
|
||||
ON deploy_history(workload_id, id DESC);
|
||||
```
|
||||
|
||||
**Why a dedicated table (not `event_log`):** structured + queryable, version-pinned,
|
||||
carries the replayable `reference`, and its retention is independent of the human event
|
||||
feed. `event_log` stays the free-text timeline; `deploy_history` is the version ledger.
|
||||
|
||||
Go model in `models.go` (`DeployHistoryEntry`, mirrors `MetricAlertRule` style).
|
||||
|
||||
## Backend changes
|
||||
|
||||
### 1. Store — `internal/store/deploy_history.go` (new) + `models.go` + `store.go`
|
||||
- `DeployHistoryEntry` struct.
|
||||
- `InsertDeployHistory(e DeployHistoryEntry) (DeployHistoryEntry, error)`.
|
||||
- `ListDeployHistory(workloadID string, limit, offset int) ([]DeployHistoryEntry, error)`
|
||||
— ordered `id DESC`; default/clamped limit (e.g. 50, max 200) via existing `parseLimit`
|
||||
conventions at the API layer.
|
||||
- `GetDeployHistory(id int64) (DeployHistoryEntry, error)` — for rollback lookup;
|
||||
`ErrNotFound` on miss.
|
||||
- `PruneDeployHistory(workloadID string, keep int) error` — keep newest `keep` per
|
||||
workload (mirror the stats-prune pattern). Called best-effort after insert.
|
||||
- Migration: append `CREATE TABLE` + index to `workloadTables`.
|
||||
- Table test `deploy_history_test.go` (insert/list/get/prune, cascade-on-workload-delete).
|
||||
|
||||
### 2. Deployer — record at the choke point (`internal/deployer/dispatch.go`)
|
||||
Wrap the existing `src.Deploy(...)` call:
|
||||
```go
|
||||
started := store.Now()
|
||||
err = src.Deploy(ctx, d.PluginDeps(), w, intent)
|
||||
outcome := "success"; if err != nil { outcome = "failure" }
|
||||
metrics.DeploysTotal.Inc(w.SourceKind, outcome)
|
||||
d.recordDeployHistory(w, intent, outcome, err, started) // best-effort, never blocks
|
||||
return err
|
||||
```
|
||||
- `recordDeployHistory` resolves the **effective reference** and inserts a row.
|
||||
Best-effort: a store failure is logged, never propagated (same contract as
|
||||
`maybeBackupBeforeDeploy` and `EmitDeployEvent`).
|
||||
- **Effective-reference resolver** (`internal/deployer/deploy_ref.go`, unit-tested):
|
||||
1. start from `intent.Reference`;
|
||||
2. `image`: read newest `ListContainersByWorkload(w.ID)` row (by `CreatedAt`), prefer
|
||||
its `ImageTag` when non-empty — captures the `DefaultTag`/`latest` resolution;
|
||||
3. `static`/`dockerfile`: when still empty, read persisted runtime state
|
||||
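`LastCommitSHA` from `containers.extra_json` (`GetContainerByID(w.ID+":site")` / `+":dockerfile"`, decode `ExtraJSON` — there is no dedicated runtime-state getter);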
|
||||
4. `compose`/unknown: leave as-is (may be `""`).
|
||||
- **Error sanitization:** never persist `err.Error()`, even capped via `capDeployStatus` (256 runes) — store only a fixed,
|
||||
short, secret-free `error`. The raw error keeps going to `slog` only. (The deploy
|
||||
error already carries a generic client message; the wrapped detail must not be
|
||||
persisted verbatim because it can echo registry-auth / compose-stdout bytes — same
|
||||
caller contract documented on `EmitDeployEvent`.)
|
||||
- Recording does **not** run for `DispatchReconcile` (periodic, not a deploy) or
|
||||
`DispatchTeardown`.
|
||||
|
||||
### 3. API — `internal/api/deploy_history.go` (new) + `router.go`
|
||||
- `GET /api/workloads/{id}/deploys?limit=&offset=` → `listWorkloadDeploys` (read; any
|
||||
authenticated user — mirrors `listWorkloadEvents`). Uses `parseLimit`.
|
||||
- `POST /api/workloads/{id}/rollback` → `rollbackWorkload` (`auth.AdminOnly`), body
|
||||
`{deploy_id}`:
|
||||
1. load workload (404 if missing; 400 if `source_kind == ""`);
|
||||
2. `GetDeployHistory(deploy_id)`; 404 if missing, 400 if its `workload_id` ≠ path id
|
||||
(no cross-workload replay);
|
||||
3. guard: `outcome == "success"`, `reference != ""`, and `source_kind` is
|
||||
rollback-capable (`image` in Phase 1) → else 400 with a clear message;
|
||||
4. build `manual`-shaped intent `{Reason: "rollback", Reference: row.reference,
|
||||
Metadata: {"note": "rollback to " + row.reference, "rollback_of": <id>},
|
||||
TriggeredBy: actor}`;
|
||||
5. `deployer.DispatchPlugin(...)`; 202 on accept (same shape as deploy).
|
||||
- Register both routes inside the existing `r.Route("/workloads/{id}", …)` block in
|
||||
[router.go](../../internal/api/router.go), next to `/deploy` and `/events`.
|
||||
- A `RollbackCapable(sourceKind) bool` helper (single source of truth, shared with the
|
||||
list response so the frontend can render the button state without hardcoding kinds).
|
||||
- The list response includes a per-entry `rollbackable bool` computed server-side.
|
||||
|
||||
## Frontend changes (`web/`)
|
||||
|
||||
- **`DeployHistoryPanel.svelte`** (new, in `lib/components/`): table of entries —
|
||||
short reference, reason badge, `outcome` `StatusBadge` (ok/bad), `triggered_by`,
|
||||
relative time. For `rollbackable` rows a **Roll back** button → `ConfirmDialog`
|
||||
("Roll back <name> to <reference>?") → `POST …/rollback {deploy_id}` → `Toast` +
|
||||
refresh history and container state. Loading via `Skeleton`; `EmptyState` when no
|
||||
rows. Reuses existing components only.
|
||||
- Mount the panel on **`/apps/[id]`** alongside the activity timeline (it is the
|
||||
*structured, actionable* sibling of the free-text timeline).
|
||||
- **i18n:** add keys under a `deployHistory.*` namespace to **both**
|
||||
`web/src/lib/i18n/en.json` and `ru.json` (parity is mandatory but is not enforced by the build —
|
||||
verify manually per CLAUDE.md).
|
||||
- API client: add `listDeploys(id, params)` and `rollback(id, deployId)` to the existing
|
||||
workload API module.
|
||||
|
||||
## Testing
|
||||
|
||||
- **Store:** `deploy_history_test.go` — insert/list ordering, get, prune-keeps-newest,
|
||||
cascade delete with workload.
|
||||
- **Deployer:** extend `deployer` tests — `DispatchPlugin` writes one `success` row and
|
||||
one `failure` row (with sanitized error); reconcile/teardown write none. Resolver unit
|
||||
test (`deploy_ref_test.go`) for the image read-back + empty fallbacks.
|
||||
- **API:** rollback guards — cross-workload id → 400; non-success/empty-ref/
|
||||
non-image → 400; happy path → 202 and a `rollback`-reason history row appears.
|
||||
- **Web:** keep it light (the panel is mostly presentational); a `sourceForms`-style
|
||||
pure-logic unit only if a non-trivial helper emerges.
|
||||
- Gates: `go build ./...`, `go vet ./internal/...`, `go test ./internal/...`,
|
||||
`cd web && npm run check && npm run test`, then `./scripts/dev-server.sh`.
|
||||
|
||||
## Risks / mitigations
|
||||
|
||||
- **Recording must never break a deploy** → best-effort insert, errors only logged
|
||||
(matches existing `EmitDeployEvent` / pre-deploy-backup contracts).
|
||||
- **Secret leakage via `error`** → store only a fixed generic marker; raw error to
|
||||
`slog` only.
|
||||
- **Unbounded growth** → `PruneDeployHistory` keeps newest N per workload.
|
||||
- **Rollback to a vanished image tag** → the image source's `PullImage` fails and its
|
||||
own failure-rollback leaves the live container untouched; the rollback attempt is
|
||||
recorded as `failure`. No special handling needed.
|
||||
- **No-op rollback (target already running, `MaxInstances>1`)** → image short-circuit
|
||||
returns `nil`; recorded as `success`. Acceptable.
|
||||
|
||||
## Rollout
|
||||
|
||||
Single PR. Additive migration (no destructive DDL). No settings changes. Backward
|
||||
compatible: existing workloads simply start accumulating history on their next deploy.
|
||||
@@ -0,0 +1,98 @@
|
||||
# Configurable Deploy Strategy — Implementation Plan
|
||||
|
||||
**Status:** planned (workflow-designed + adversarially reviewed) · **Feature rank:** #3 · **Date:** 2026-06-19
|
||||
|
||||
## Problem
|
||||
|
||||
`image` does zero-downtime blue-green; `dockerfile` and `static` **stop+remove the old
|
||||
container before creating the new one** on every redeploy (a real downtime window).
|
||||
`compose` is stack-managed. Give operators a per-workload **deploy strategy** and bring
|
||||
blue-green to the built-from-source sources.
|
||||
|
||||
## Design (chosen via a 3-proposal judge panel; "minimal" won, 9/10)
|
||||
|
||||
Per-source `deploy_strategy` field **inside each source's `SourceConfig` JSON blob** —
|
||||
**no new DB column, no migration, no `dispatch.go` change**. Values: `""` (back-compat
|
||||
default), `"recreate"`, `"blue-green"`. Round-trips opaquely through
|
||||
`plugin.WorkloadFromStore` / `SourceConfigOf[Config]`; validated in each source's existing
|
||||
`Validate(json.RawMessage)` (runs on create **and** update at `workloads_plugin.go:291`).
|
||||
|
||||
**Per-source default (load-bearing):** a single shared default would silently flip
|
||||
image's native blue-green to recreate, so each source has a tiny `effectiveStrategy`:
|
||||
- `image`: `""` → **blue-green**
|
||||
- `dockerfile` / `static` / `compose`: `""` → **recreate**
|
||||
|
||||
The blue-green branch for dockerfile/static uses a **transient two-container / single-row
|
||||
swap** so `state.go`, `teardown.go`, and `reconcile.go` (which read one deterministic row)
|
||||
stay **untouched** — the lowest-risk way to ship gap-free cutover.
|
||||
|
||||
## Review fixes folded in (adversarial pass)
|
||||
|
||||
1. **BLOCKER — ordering / crash-safety.** Blue-green order MUST be: create+start green →
|
||||
readiness-gate green → `ConfigureRoute(green)` (upsert) → **`saveState(green)` into the
|
||||
single row FIRST** → only THEN stop+remove blue (captured before saveState). The single
|
||||
row must always point at a running container; reaping blue before persisting green
|
||||
orphans green and makes the reconciler flip a healthy workload to `failed`.
|
||||
2. **Unique green name is load-bearing.** dockerfile/static names are deterministic
|
||||
(`tf-build-<name>-<id>` / `dw-site-<name>-<id>`) and double as the proxy `forwardHost`.
|
||||
The green container needs a genuinely unique name (`…-<ms-hex>`, lifted from
|
||||
`image.buildContainerName`) set in **both** `cc.Name` **and** the `ConfigureRoute`
|
||||
`forwardHost`.
|
||||
3. **Readiness, not liveness.** Before cutover, use `deps.Health.Check(ctx, http://<green>:
|
||||
<port><healthcheck>)` when a healthcheck path is configured (dockerfile has `Healthcheck`);
|
||||
fall back to the existing 3s liveness gate otherwise. Don't advertise "zero-downtime" on
|
||||
the liveness-only path.
|
||||
4. **Pure upsert.** Drop the pre-`DeleteRoute`; call only `ConfigureRoute` (upsert-by-FQDN
|
||||
semantics: NPM repoints the existing host in place; Traefik is label-driven). **Traefik caveat:** blue+green
|
||||
briefly carry the same host-rule labels → momentary dual-serve; documented as a
|
||||
Traefik-only phase-1 limitation (NPM, the common case, is gap-free).
|
||||
5. **deno + storage → force recreate.** When `static` has `StorageEnabled && mode==deno`,
|
||||
`effectiveStrategy` forces `recreate` — blue-green would mount the same RW named volume
|
||||
into both containers (a concurrent-writer window recreate never had).
|
||||
6. **image `recreate` gets its own shape.** Don't reuse `rollbackNew` (assumes blue
|
||||
survives). image `recreate` = reap existing running containers **after** a successful
|
||||
pull, then create green; on green failure the downtime is the accepted recreate
|
||||
contract (logged distinctly, not as a non-disruptive rollback).
|
||||
7. Image tag `:latest` shared by blue/green is **safe** — containers pin image-by-id at
|
||||
create (no fix needed).
|
||||
|
||||
## Files (phase 1, backend-only)
|
||||
|
||||
- **NEW** `internal/workload/plugin/strategy.go` — `StrategyRecreate`/`StrategyBlueGreen`
|
||||
consts, `ValidateStrategy(value string, allowBlueGreen bool) error`,
|
||||
`BuildGreenName(name, id string, ts time.Time) string` (lifted unique-suffix scheme).
|
||||
`+ strategy_test.go`.
|
||||
- `image/image.go` — `DeployStrategy` on Config; `effectiveStrategy` (""→blue-green);
|
||||
Validate; honor `recreate` (reap-after-pull + dedicated log).
|
||||
- `dockerfile/dockerfile.go` (Config + Validate) + `dockerfile/deploy.go` (blue-green
|
||||
branch, fixes 1–4) + `dockerfile/deploy_test.go`.
|
||||
- `static/static.go` (Config + Validate) + `static/deploy.go` (blue-green branch + deno
|
||||
gate, fixes 1–5) + `static/deploy_test.go`.
|
||||
- `compose/compose.go` — Config field + Validate rejects `blue-green` (allowBlueGreen=false)
|
||||
+ test.
|
||||
|
||||
## Phase 1 backward-compat lock (mandatory, unit-tested)
|
||||
`ValidateStrategy("", …)` returns nil; every `effectiveStrategy("")` returns the source's
|
||||
historical default. Existing rows (no `deploy_strategy` key) decode `""` → today's exact
|
||||
behavior, byte-for-byte.
|
||||
|
||||
## Later phases (deferred)
|
||||
- **P2 (UI):** `sourceForms.ts` seed/serialize + `/apps/new` & `/apps/[id]` select +
|
||||
en/ru i18n (hide blue-green for compose).
|
||||
- **P3 (harden):** mandatory HTTP readiness probe for static; connection draining before
|
||||
blue removal; Traefik label suppression at cutover.
|
||||
- **P4 (architecture):** extract image's proven sequence into a shared
|
||||
`plugin.DeploySingleContainer`; migrate dockerfile/static to the multi-row model
|
||||
(crash-safe mid-swap; unlocks `MaxInstances>1`).
|
||||
- **P5:** true `rolling` (needs a backend-pool primitive on `proxy.Provider`) + compose
|
||||
green-project blue-green.
|
||||
|
||||
## Test plan
|
||||
Table-driven, TDD: `ValidateStrategy` accept/reject matrix (incl. `allowBlueGreen=false`,
|
||||
reserved `rolling` rejected, `""` accepted); per-source `effectiveStrategy` defaults +
|
||||
deno-storage→recreate; dockerfile/static blue-green deploy tests asserting (a) green named
|
||||
≠ deterministic name, (b) collision teardown NOT run, (c) `ConfigureRoute` called with
|
||||
`forwardHost==green` and NO preceding `DeleteRoute`, (d) `saveState(green)` **before**
|
||||
`RemoveContainer(blue)`, (e) single row ends at green; failure path: green fails gate →
|
||||
green removed, blue + route untouched; compose rejects blue-green. Gates: `go build`,
|
||||
`go vet`, `go test ./internal/...`, `npm run check/test`, `./scripts/dev-server.sh`.
|
||||
@@ -0,0 +1,84 @@
|
||||
# Per-Workload Metrics Graph — Implementation Plan
|
||||
|
||||
**Status:** planned · **Feature rank:** #2 · **Date:** 2026-06-19
|
||||
|
||||
## Problem
|
||||
|
||||
Stats are collected per container (`container_stats_samples`, CPU/mem/net/disk) and
|
||||
charted **globally** on the dashboard (`SystemResourcesCard` + `ResourceChart`), but
|
||||
`/apps/[id]` shows only live snapshots — there's no per-workload "is my app leaking
|
||||
memory / pegging CPU over the last few hours" view. This is a daily question and the
|
||||
data already exists; we just need a per-workload query + a panel that reuses the chart.
|
||||
|
||||
## Verified facts
|
||||
|
||||
- `ContainerStatsSample.OwnerID` == the **container row id** (`containers.id`), confirmed
|
||||
by `lookupInstanceName` → `GetContainerByID(sm.OwnerID)` in
|
||||
[stats_history.go](../../internal/api/stats_history.go). `OwnerType` ∈ {instance, site}.
|
||||
- Each sample's `ts` is that container's own Docker-stats `Timestamp.Unix()`
|
||||
([collector.go](../../internal/stats/collector.go)) — NOT one shared tick stamp. In a
|
||||
multi-container tick the per-second truncation usually collapses them to the same
|
||||
integer `ts`, so per-`ts` aggregation works; a ±1s split at a second boundary is
|
||||
cosmetic for a trend line. (Reviewer-corrected.) The handler 404s on an unknown
|
||||
workload id but returns `[]` for a known workload with no samples yet.
|
||||
- `ResourceChart.svelte` takes a fully-built `EChartsOption` from the parent; the parent
|
||||
owns series/axes (see `SystemResourcesCard`). Reads stay available when Docker is down
|
||||
(samples come from SQLite, not the daemon).
|
||||
- Per-workload reads (`/events`, `/runtime-state`) are open to any authenticated user;
|
||||
this endpoint follows suit (no `AdminOnly`).
|
||||
|
||||
## Backend
|
||||
|
||||
1. **Store** — `ListContainerStatsSamplesByWorkload(workloadID string, sinceTS int64)`:
|
||||
```sql
|
||||
SELECT cs.container_id, cs.owner_type, cs.owner_id, cs.ts,
|
||||
cs.cpu_percent, cs.memory_usage, cs.memory_limit,
|
||||
cs.network_rx, cs.network_tx, cs.block_read, cs.block_write
|
||||
FROM container_stats_samples cs
|
||||
JOIN containers c ON c.id = cs.owner_id
|
||||
WHERE c.workload_id = ? AND cs.ts >= ?
|
||||
ORDER BY cs.ts ASC
|
||||
```
|
||||
Returns `[]ContainerStatsSample`.
|
||||
|
||||
2. **API** — `getWorkloadStatsHistory` (GET `/api/workloads/{id}/stats/history?window=`):
|
||||
reuse `parseWindow`/`sinceTimestamp`; aggregate samples **per ts** into a compact
|
||||
series so multi-container workloads (compose) sum correctly:
|
||||
```go
|
||||
type workloadStatsPoint struct {
|
||||
TS int64 `json:"ts"`
|
||||
CPUPercent float64 `json:"cpu_percent"` // sum across the workload's containers
|
||||
MemoryUsage int64 `json:"memory_usage"` // sum bytes
|
||||
MemoryLimit int64 `json:"memory_limit"` // max (effective ceiling)
|
||||
}
|
||||
```
|
||||
Always returns `[]` (never 503) — empty when stats are disabled / Docker was down /
|
||||
the workload is new. Register in the `/workloads/{id}` route block.
|
||||
|
||||
3. **Tests** — store: join scopes to the right workload (A's samples ≠ B's); API:
|
||||
per-ts aggregation sums two containers at the same tick.
|
||||
|
||||
## Frontend
|
||||
|
||||
4. **api.ts** — `WorkloadStatsPoint` type + `fetchWorkloadStatsHistory(id, window, signal)`.
|
||||
5. **`WorkloadMetricsPanel.svelte`** — window selector (30m / 2h / 6h), fetch + 15s poll
|
||||
(mirror `SystemResourcesCard`), build an `EChartsOption` with **two series**: CPU %
|
||||
on the left axis, Memory (MiB) on the right axis (absolute usage rather than a percentage, because
|
||||
`memory_limit` is often 0/unlimited so a % would divide by zero). `EmptyState`/hint
|
||||
when there are no samples. Render via `ResourceChart`. Mount on `/apps/[id]` near the
|
||||
deploy-history panel.
|
||||
6. **i18n** — `apps.detail.metrics.*` in both en.json and ru.json (parity mandatory).
|
||||
|
||||
## Risks / mitigations
|
||||
|
||||
- **Docker down / stats disabled** → empty series, friendly hint (no error). SQLite read
|
||||
path is independent of the daemon.
|
||||
- **memory_limit = 0 (unlimited)** → plot absolute MiB, not %, to avoid div-by-zero.
|
||||
- **Sparse sampling** → chart shows whatever ticks exist; window selector lets the user
|
||||
widen. No interpolation.
|
||||
- **Auth** → read-only, any authenticated user (consistent with other per-workload reads).
|
||||
|
||||
## Rollout
|
||||
|
||||
Single change set, additive, no migration. Reuses the existing `echarts` dependency and
|
||||
`ResourceChart` component.
|
||||
@@ -10,8 +10,11 @@ require (
|
||||
github.com/moby/moby/api v1.54.0
|
||||
github.com/moby/moby/client v0.3.0
|
||||
github.com/robfig/cron/v3 v3.0.1
|
||||
github.com/yuin/goldmark v1.8.2
|
||||
golang.org/x/crypto v0.28.0
|
||||
golang.org/x/oauth2 v0.25.0
|
||||
golang.org/x/sync v0.20.0
|
||||
golang.org/x/sys v0.33.0
|
||||
gopkg.in/yaml.v3 v3.0.1
|
||||
modernc.org/sqlite v1.34.5
|
||||
)
|
||||
@@ -34,15 +37,12 @@ require (
|
||||
github.com/opencontainers/go-digest v1.0.0 // indirect
|
||||
github.com/opencontainers/image-spec v1.1.1 // indirect
|
||||
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec // indirect
|
||||
github.com/yuin/goldmark v1.8.2 // indirect
|
||||
go.opentelemetry.io/auto/sdk v1.1.0 // indirect
|
||||
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.60.0 // indirect
|
||||
go.opentelemetry.io/otel v1.35.0 // indirect
|
||||
go.opentelemetry.io/otel/metric v1.35.0 // indirect
|
||||
go.opentelemetry.io/otel/trace v1.35.0 // indirect
|
||||
golang.org/x/mod v0.18.0 // indirect
|
||||
golang.org/x/sync v0.20.0 // indirect
|
||||
golang.org/x/sys v0.33.0 // indirect
|
||||
golang.org/x/tools v0.22.0 // indirect
|
||||
modernc.org/libc v1.55.3 // indirect
|
||||
modernc.org/mathutil v1.6.0 // indirect
|
||||
|
||||
@@ -85,8 +85,6 @@ golang.org/x/mod v0.18.0 h1:5+9lSbEzPSdWkH32vYPBwEpX8KwDbM52Ud9xBUvNlb0=
|
||||
golang.org/x/mod v0.18.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c=
|
||||
golang.org/x/oauth2 v0.25.0 h1:CY4y7XT9v0cRI9oupztF8AgiIu99L/ksR/Xp/6jrZ70=
|
||||
golang.org/x/oauth2 v0.25.0/go.mod h1:XYTD2NtWslqkgxebSiOHnXEap4TF09sJSc7H1sXbhtI=
|
||||
golang.org/x/sync v0.7.0 h1:YsImfSBoP9QPYL0xyKJPq0gcaJdG3rInoqxTWbfQu9M=
|
||||
golang.org/x/sync v0.7.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
|
||||
golang.org/x/sync v0.20.0 h1:e0PTpb7pjO8GAtTs2dQ6jYa5BWYlMuX047Dco/pItO4=
|
||||
golang.org/x/sync v0.20.0/go.mod h1:9xrNwdLfx4jkKbNva9FpL6vEN7evnE43NNNJQ2LF3+0=
|
||||
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
|
||||
@@ -16,13 +16,12 @@ import (
|
||||
)
|
||||
|
||||
// rateLimitedLogin wraps the login handler with per-IP rate limiting.
|
||||
// Uses clientIP() so X-Forwarded-For is honored only when the request
|
||||
// arrives from a configured trusted-proxy CIDR — preventing remote
|
||||
// attackers from spoofing the header to bypass the per-IP login limiter.
|
||||
func (s *Server) rateLimitedLogin(rl *rateLimiter) http.HandlerFunc {
|
||||
return func(w http.ResponseWriter, r *http.Request) {
|
||||
ip := r.RemoteAddr
|
||||
if fwd := r.Header.Get("X-Forwarded-For"); fwd != "" {
|
||||
ip = fwd
|
||||
}
|
||||
if !rl.allow(ip) {
|
||||
if !rl.allow(clientIP(r)) {
|
||||
respondError(w, http.StatusTooManyRequests, "too many login attempts, try again later")
|
||||
return
|
||||
}
|
||||
|
||||
+73
-32
@@ -1,7 +1,6 @@
|
||||
package api
|
||||
|
||||
import (
|
||||
"io"
|
||||
"log/slog"
|
||||
"net/http"
|
||||
"os"
|
||||
@@ -118,7 +117,22 @@ func (s *Server) deleteBackup(w http.ResponseWriter, r *http.Request) {
|
||||
}
|
||||
|
||||
// restoreBackup handles POST /api/backups/{id}/restore.
|
||||
// This replaces the current database with the backup and triggers a graceful shutdown.
|
||||
//
|
||||
// Restore happens in three documented stages so a failure at any stage
|
||||
// leaves the live DB intact:
|
||||
//
|
||||
// 1. PRE-FLIGHT (sync, before the HTTP response): PrepareRestore opens
|
||||
// the candidate read-only and runs `PRAGMA integrity_check`. If it
|
||||
// fails the live DB is untouched and we return 400 with the reason.
|
||||
//
|
||||
// 2. SAFETY NET: a pre-restore backup of the LIVE DB is created so the
|
||||
// operator can roll back even if the candidate is later discovered
|
||||
// to be missing data.
|
||||
//
|
||||
// 3. SWAP (async, after the response is flushed): close the live DB,
|
||||
// atomic-rename the candidate over the live path, wipe WAL/SHM,
|
||||
// trigger graceful shutdown. supervisord / systemd / docker
|
||||
// restart=on-failure brings the process back with the new DB.
|
||||
func (s *Server) restoreBackup(w http.ResponseWriter, r *http.Request) {
|
||||
if s.backupEngine == nil {
|
||||
respondError(w, http.StatusServiceUnavailable, "backup engine not initialized")
|
||||
@@ -126,13 +140,44 @@ func (s *Server) restoreBackup(w http.ResponseWriter, r *http.Request) {
|
||||
}
|
||||
|
||||
id := chi.URLParam(r, "id")
|
||||
restorePath, err := s.backupEngine.RestorePath(id)
|
||||
if err != nil {
|
||||
respondError(w, http.StatusNotFound, "backup not found: "+err.Error())
|
||||
|
||||
// CSRF / accidental-fire guard: the restore endpoint is the most
|
||||
// destructive surface in the API (replaces the whole DB). Even
|
||||
// though it sits behind AdminOnly + Bearer JWT, a blind cross-site
|
||||
// POST or a misclicked button in any open admin tab can fire it.
|
||||
// Require the operator's client to echo X-Confirm-Restore: <id>
|
||||
// — matching the path param — so a CSRF post-form / image-src
|
||||
// trick can't trigger restore (browsers don't let cross-origin
|
||||
// requests set custom headers without a preflight).
|
||||
if confirm := r.Header.Get("X-Confirm-Restore"); confirm != id {
|
||||
respondError(w, http.StatusBadRequest,
|
||||
"missing or mismatched X-Confirm-Restore header (must equal backup id)")
|
||||
return
|
||||
}
|
||||
|
||||
// Create a safety backup before restore so the user can undo if needed.
|
||||
// Single-flight guard: a rapid double-click would otherwise spawn
|
||||
// two goroutines racing s.store.Close() and the candidate-over-
|
||||
// live rename. CAS to true here; if someone else won, return 409.
|
||||
if !s.restoreInFlight.CompareAndSwap(false, true) {
|
||||
respondError(w, http.StatusConflict, "a restore is already in progress")
|
||||
return
|
||||
}
|
||||
// Do NOT release the flag — the restore path triggers shutdown.
|
||||
// A failed restore is also terminal (the DB may be closed); a
|
||||
// fresh process boot is the recovery path.
|
||||
// PRE-FLIGHT: refuse before touching anything if the candidate is
|
||||
// not a valid SQLite database or fails integrity_check. This is the
|
||||
// guard the prior code lacked — a corrupt backup would silently
|
||||
// overwrite a healthy live DB.
|
||||
restorePath, err := s.backupEngine.PrepareRestore(id)
|
||||
if err != nil {
|
||||
respondError(w, http.StatusBadRequest, err.Error())
|
||||
return
|
||||
}
|
||||
|
||||
// SAFETY NET: pre-restore snapshot of the live DB. A failure here
|
||||
// is logged but does not abort — the integrity-checked candidate
|
||||
// is still safer than refusing to restore.
|
||||
if _, err := s.backupEngine.CreateBackup("pre-restore"); err != nil {
|
||||
slog.Warn("failed to create pre-restore backup", "error", err)
|
||||
}
|
||||
@@ -153,41 +198,37 @@ func (s *Server) restoreBackup(w http.ResponseWriter, r *http.Request) {
|
||||
go func() {
|
||||
time.Sleep(500 * time.Millisecond)
|
||||
|
||||
// Close the current database to release locks.
|
||||
// Once we begin closing the live DB the process can no longer serve
|
||||
// requests against a sane store, so EVERY exit path from here must
|
||||
// trigger shutdown. Returning early would leave the server limping
|
||||
// on a closed/half-swapped database with no path to recovery except
|
||||
// an external kill. shutdownFunc → graceful shutdown → main returns
|
||||
// → deferred releaseLock()/db.Close() run, and the supervisor reopens
|
||||
// whatever DB is on disk on the next boot.
|
||||
triggerShutdown := func() {
|
||||
if s.shutdownFunc != nil {
|
||||
s.shutdownFunc()
|
||||
}
|
||||
}
|
||||
|
||||
// Close the current database to release locks. AtomicReplaceDB
|
||||
// expects the live file to be unmapped before swap (especially
|
||||
// important on Windows where open files cannot be renamed over).
|
||||
if err := s.store.Close(); err != nil {
|
||||
slog.Error("restore: failed to close database", "error", err)
|
||||
slog.Error("restore: failed to close database, restarting", "error", err)
|
||||
triggerShutdown()
|
||||
return
|
||||
}
|
||||
|
||||
// Copy the backup file over the main database using streaming (no full read into memory).
|
||||
src, err := os.Open(restorePath)
|
||||
if err != nil {
|
||||
slog.Error("restore: failed to open backup file", "error", err)
|
||||
if err := s.backupEngine.AtomicReplaceDB(restorePath, s.dbPath); err != nil {
|
||||
slog.Error("restore: atomic replace failed, restarting", "error", err)
|
||||
triggerShutdown()
|
||||
return
|
||||
}
|
||||
defer src.Close()
|
||||
|
||||
dst, err := os.Create(s.dbPath)
|
||||
if err != nil {
|
||||
slog.Error("restore: failed to create database file", "error", err)
|
||||
return
|
||||
}
|
||||
defer dst.Close()
|
||||
|
||||
if _, err := io.Copy(dst, src); err != nil {
|
||||
slog.Error("restore: failed to copy backup to database", "error", err)
|
||||
return
|
||||
}
|
||||
|
||||
// Remove WAL and SHM files to ensure clean state.
|
||||
os.Remove(s.dbPath + "-wal")
|
||||
os.Remove(s.dbPath + "-shm")
|
||||
|
||||
slog.Info("restore: database replaced, triggering shutdown")
|
||||
|
||||
// Signal the server to shut down gracefully so it can be restarted.
|
||||
if s.shutdownFunc != nil {
|
||||
s.shutdownFunc()
|
||||
}
|
||||
triggerShutdown()
|
||||
}()
|
||||
}
|
||||
|
||||
@@ -0,0 +1,151 @@
|
||||
package api
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"log/slog"
|
||||
"net/http"
|
||||
"strconv"
|
||||
"time"
|
||||
|
||||
"github.com/go-chi/chi/v5"
|
||||
|
||||
"github.com/alexei/tinyforge/internal/auth"
|
||||
"github.com/alexei/tinyforge/internal/store"
|
||||
"github.com/alexei/tinyforge/internal/workload/plugin"
|
||||
)
|
||||
|
||||
// parseOffset converts a raw pagination offset into a usable value. Input
// that does not parse as an integer, or that parses to a negative number,
// yields 0. The companion parseLimit (secrets.go) covers the limit half.
func parseOffset(raw string) int {
	offset, parseErr := strconv.Atoi(raw)
	switch {
	case parseErr != nil:
		return 0
	case offset < 0:
		return 0
	default:
		return offset
	}
}
|
||||
|
||||
// rollbackCapableKinds lists the source kinds that support reference-pinned
// redeploy; it is the one table both the list view and the rollback guard
// consult. Only "image" qualifies today: its intent.Reference is resolved as
// the tag, so replaying an earlier tag is a genuine rollback. static and
// dockerfile clone branch HEAD and cannot yet check out an arbitrary commit
// (planned for a later phase), and compose has no single artifact handle.
var rollbackCapableKinds = map[string]bool{
	"image": true,
}

// RollbackCapable reports whether the given source kind supports one-click
// rollback. Unknown kinds (including the empty string) are not capable. The
// list response's per-row `rollbackable` flag and the rollback guard both go
// through this function so the UI and the server share one definition.
func RollbackCapable(sourceKind string) bool {
	capable, known := rollbackCapableKinds[sourceKind]
	return known && capable
}
|
||||
|
||||
// listWorkloadDeploys handles GET /api/workloads/{id}/deploys. Read-only,
|
||||
// open to any authenticated user (mirrors the per-workload events feed).
|
||||
// Returns the structured deploy ledger newest-first with a server-computed
|
||||
// `rollbackable` flag per row.
|
||||
func (s *Server) listWorkloadDeploys(w http.ResponseWriter, r *http.Request) {
|
||||
id := chi.URLParam(r, "id")
|
||||
if id == "" {
|
||||
respondError(w, http.StatusBadRequest, "workload id is required")
|
||||
return
|
||||
}
|
||||
|
||||
q := r.URL.Query()
|
||||
limit := parseLimit(q.Get("limit"), 50, 200)
|
||||
offset := parseOffset(q.Get("offset"))
|
||||
|
||||
rows, err := s.store.ListDeployHistory(id, limit, offset)
|
||||
if err != nil {
|
||||
slog.Error("failed to list deploy history", "workload", id, "error", err)
|
||||
respondError(w, http.StatusInternalServerError, "failed to list deploy history")
|
||||
return
|
||||
}
|
||||
for i := range rows {
|
||||
rows[i].Rollbackable = rows[i].Outcome == "success" &&
|
||||
rows[i].Reference != "" &&
|
||||
RollbackCapable(rows[i].SourceKind)
|
||||
}
|
||||
respondJSON(w, http.StatusOK, rows)
|
||||
}
|
||||
|
||||
// rollbackWorkload handles POST /api/workloads/{id}/rollback. Admin-only
|
||||
// (same gate as /deploy). Body: {"deploy_id": <id>}. It resolves the pinned
|
||||
// reference from a prior successful, rollback-capable ledger row belonging
|
||||
// to this workload and replays it as a `rollback`-reason deploy.
|
||||
func (s *Server) rollbackWorkload(w http.ResponseWriter, r *http.Request) {
|
||||
id := chi.URLParam(r, "id")
|
||||
|
||||
row, err := s.store.GetWorkloadByID(id)
|
||||
if err != nil {
|
||||
if errors.Is(err, store.ErrNotFound) {
|
||||
respondNotFound(w, "workload")
|
||||
return
|
||||
}
|
||||
respondError(w, http.StatusInternalServerError, "get workload")
|
||||
return
|
||||
}
|
||||
if row.SourceKind == "" {
|
||||
respondError(w, http.StatusBadRequest, "workload has no source_kind; cannot roll back")
|
||||
return
|
||||
}
|
||||
|
||||
var body struct {
|
||||
DeployID int64 `json:"deploy_id"`
|
||||
}
|
||||
if !decodeJSONStrict(w, r, &body) {
|
||||
return
|
||||
}
|
||||
if body.DeployID <= 0 {
|
||||
respondError(w, http.StatusBadRequest, "deploy_id is required")
|
||||
return
|
||||
}
|
||||
|
||||
entry, err := s.store.GetDeployHistory(body.DeployID)
|
||||
if err != nil {
|
||||
if errors.Is(err, store.ErrNotFound) {
|
||||
respondNotFound(w, "deploy history entry")
|
||||
return
|
||||
}
|
||||
respondError(w, http.StatusInternalServerError, "get deploy history")
|
||||
return
|
||||
}
|
||||
// No cross-workload replay: the entry must belong to the path workload.
|
||||
if entry.WorkloadID != id {
|
||||
respondError(w, http.StatusBadRequest, "deploy entry does not belong to this workload")
|
||||
return
|
||||
}
|
||||
if entry.Outcome != "success" {
|
||||
respondError(w, http.StatusBadRequest, "cannot roll back to a failed deploy")
|
||||
return
|
||||
}
|
||||
if entry.Reference == "" || !RollbackCapable(row.SourceKind) {
|
||||
respondError(w, http.StatusBadRequest, "this deploy is not rollback-capable")
|
||||
return
|
||||
}
|
||||
|
||||
actor := "manual"
|
||||
if claims, ok := auth.ClaimsFromContext(r.Context()); ok && claims.Username != "" {
|
||||
actor = claims.Username
|
||||
}
|
||||
intent := plugin.DeploymentIntent{
|
||||
Reason: "rollback",
|
||||
Reference: entry.Reference,
|
||||
Metadata: map[string]string{
|
||||
"note": "rollback to " + entry.Reference,
|
||||
"rollback_of": strconv.FormatInt(entry.ID, 10),
|
||||
},
|
||||
TriggeredAt: time.Now().UTC(),
|
||||
TriggeredBy: actor,
|
||||
}
|
||||
if err := s.deployer.DispatchPlugin(r.Context(), toPluginWorkload(row), intent); err != nil {
|
||||
// Raw error stays in the server log; client gets a generic message
|
||||
// (the wrapped error can carry registry-auth bytes).
|
||||
slog.Warn("rollback dispatch failed", "workload", id, "actor", actor,
|
||||
"reference", entry.Reference, "error", err)
|
||||
respondError(w, http.StatusInternalServerError, "rollback failed; see server logs")
|
||||
return
|
||||
}
|
||||
respondJSON(w, http.StatusAccepted, map[string]any{
|
||||
"workload_id": id,
|
||||
"reference": entry.Reference,
|
||||
"rollback_of": entry.ID,
|
||||
"triggered_by": actor,
|
||||
})
|
||||
}
|
||||
@@ -0,0 +1,126 @@
|
||||
package api
|
||||
|
||||
import (
|
||||
"net/http"
|
||||
"testing"
|
||||
|
||||
"github.com/alexei/tinyforge/internal/store"
|
||||
"github.com/alexei/tinyforge/internal/workload/plugin"
|
||||
)
|
||||
|
||||
// createImageWorkload creates an image-source workload through the API so
|
||||
// source_kind is persisted exactly as production does, returning its id.
|
||||
func createImageWorkload(t *testing.T, e *apiTestEnv, name string) string {
|
||||
t.Helper()
|
||||
resp := e.do(t, http.MethodPost, "/api/workloads", pluginWorkloadRequest{
|
||||
Name: name, SourceKind: "image", SourceConfig: validImageSourceConfig(),
|
||||
})
|
||||
if resp.StatusCode != http.StatusCreated {
|
||||
_ = decodeEnvelope(t, resp, nil)
|
||||
t.Fatalf("create workload: status %d", resp.StatusCode)
|
||||
}
|
||||
var got plugin.Workload
|
||||
if errMsg := decodeEnvelope(t, resp, &got); errMsg != "" {
|
||||
t.Fatalf("create workload envelope error: %q", errMsg)
|
||||
}
|
||||
return got.ID
|
||||
}
|
||||
|
||||
func TestListWorkloadDeploys_ComputesRollbackable(t *testing.T) {
|
||||
e := newAPITestEnv(t)
|
||||
id := createImageWorkload(t, e, "app")
|
||||
|
||||
// success + reference + image => rollbackable
|
||||
e.store.InsertDeployHistory(store.DeployHistoryEntry{
|
||||
WorkloadID: id, SourceKind: "image", Reference: "v1", Outcome: "success",
|
||||
})
|
||||
// failure => not rollbackable
|
||||
e.store.InsertDeployHistory(store.DeployHistoryEntry{
|
||||
WorkloadID: id, SourceKind: "image", Reference: "v2", Outcome: "failure",
|
||||
})
|
||||
// success but empty reference => not rollbackable
|
||||
e.store.InsertDeployHistory(store.DeployHistoryEntry{
|
||||
WorkloadID: id, SourceKind: "image", Reference: "", Outcome: "success",
|
||||
})
|
||||
|
||||
resp := e.do(t, http.MethodGet, "/api/workloads/"+id+"/deploys", nil)
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
t.Fatalf("status = %d, want 200", resp.StatusCode)
|
||||
}
|
||||
var rows []store.DeployHistoryEntry
|
||||
if errMsg := decodeEnvelope(t, resp, &rows); errMsg != "" {
|
||||
t.Fatalf("envelope error: %q", errMsg)
|
||||
}
|
||||
if len(rows) != 3 {
|
||||
t.Fatalf("expected 3 rows, got %d", len(rows))
|
||||
}
|
||||
// Newest-first: empty-ref success, failure, then v1 success.
|
||||
if !rows[2].Rollbackable {
|
||||
t.Fatalf("v1 success row should be rollbackable: %+v", rows[2])
|
||||
}
|
||||
if rows[1].Rollbackable || rows[0].Rollbackable {
|
||||
t.Fatalf("failure / empty-ref rows must not be rollbackable")
|
||||
}
|
||||
}
|
||||
|
||||
func TestRollback_HappyPath_DispatchesRollbackIntent(t *testing.T) {
|
||||
e := newAPITestEnv(t)
|
||||
id := createImageWorkload(t, e, "app")
|
||||
entry, _ := e.store.InsertDeployHistory(store.DeployHistoryEntry{
|
||||
WorkloadID: id, SourceKind: "image", Reference: "v1", Outcome: "success",
|
||||
})
|
||||
|
||||
before := e.dispatcher.deployCount.Load()
|
||||
resp := e.do(t, http.MethodPost, "/api/workloads/"+id+"/rollback",
|
||||
map[string]any{"deploy_id": entry.ID})
|
||||
if resp.StatusCode != http.StatusAccepted {
|
||||
errMsg := decodeEnvelope(t, resp, nil)
|
||||
t.Fatalf("status = %d, want 202 (err=%q)", resp.StatusCode, errMsg)
|
||||
}
|
||||
if got := e.dispatcher.deployCount.Load(); got != before+1 {
|
||||
t.Fatalf("expected one dispatch, got delta %d", got-before)
|
||||
}
|
||||
intent := e.dispatcher.lastIntent.Load()
|
||||
if intent == nil || intent.Reason != "rollback" || intent.Reference != "v1" {
|
||||
t.Fatalf("expected rollback intent for v1, got %+v", intent)
|
||||
}
|
||||
}
|
||||
|
||||
func TestRollback_Guards(t *testing.T) {
|
||||
e := newAPITestEnv(t)
|
||||
imageID := createImageWorkload(t, e, "img")
|
||||
otherID := createImageWorkload(t, e, "other")
|
||||
|
||||
success, _ := e.store.InsertDeployHistory(store.DeployHistoryEntry{
|
||||
WorkloadID: imageID, SourceKind: "image", Reference: "v1", Outcome: "success",
|
||||
})
|
||||
failed, _ := e.store.InsertDeployHistory(store.DeployHistoryEntry{
|
||||
WorkloadID: imageID, SourceKind: "image", Reference: "v2", Outcome: "failure",
|
||||
})
|
||||
otherWL, _ := e.store.InsertDeployHistory(store.DeployHistoryEntry{
|
||||
WorkloadID: otherID, SourceKind: "image", Reference: "v1", Outcome: "success",
|
||||
})
|
||||
|
||||
cases := []struct {
|
||||
name string
|
||||
workload string
|
||||
body any
|
||||
wantCode int
|
||||
}{
|
||||
{"missing deploy_id", imageID, map[string]any{}, http.StatusBadRequest},
|
||||
{"zero deploy_id", imageID, map[string]any{"deploy_id": 0}, http.StatusBadRequest},
|
||||
{"unknown deploy_id", imageID, map[string]any{"deploy_id": 999999}, http.StatusNotFound},
|
||||
{"unknown workload", "nope", map[string]any{"deploy_id": success.ID}, http.StatusNotFound},
|
||||
{"failed deploy", imageID, map[string]any{"deploy_id": failed.ID}, http.StatusBadRequest},
|
||||
{"cross-workload entry", imageID, map[string]any{"deploy_id": otherWL.ID}, http.StatusBadRequest},
|
||||
}
|
||||
for _, c := range cases {
|
||||
t.Run(c.name, func(t *testing.T) {
|
||||
resp := e.do(t, http.MethodPost, "/api/workloads/"+c.workload+"/rollback", c.body)
|
||||
if resp.StatusCode != c.wantCode {
|
||||
errMsg := decodeEnvelope(t, resp, nil)
|
||||
t.Fatalf("status = %d, want %d (err=%q)", resp.StatusCode, c.wantCode, errMsg)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -9,6 +9,7 @@ import (
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/alexei/tinyforge/internal/docker"
|
||||
"github.com/alexei/tinyforge/internal/staticsite"
|
||||
)
|
||||
|
||||
@@ -350,6 +351,54 @@ func (s *Server) listImageConflicts(w http.ResponseWriter, r *http.Request) {
|
||||
respondJSON(w, http.StatusOK, conflicts)
|
||||
}
|
||||
|
||||
// inspectImageRequest is the body for POST /api/discovery/image/inspect.
|
||||
type inspectImageRequest struct {
|
||||
Image string `json:"image"`
|
||||
}
|
||||
|
||||
// inspectImageResponse mirrors the frontend InspectResult shape the
|
||||
// new-app wizard pre-fills from: the first exposed port (parsed to int,
|
||||
// 0 when none) and the image's HEALTHCHECK command string.
|
||||
type inspectImageResponse struct {
|
||||
Port int `json:"port"`
|
||||
Healthcheck string `json:"healthcheck"`
|
||||
}
|
||||
|
||||
// inspectImageMetadata inspects a LOCAL image and returns its first
|
||||
// exposed port + healthcheck so the wizard can pre-fill those fields.
|
||||
// POST /api/discovery/image/inspect.
|
||||
//
|
||||
// This inspects local images only — it does not pull. When the image is
|
||||
// not present locally the docker call fails; we return a generic,
|
||||
// non-leaky 400 rather than the git-specific upstreamError so a raw
|
||||
// docker daemon string (which may echo the ref) never reaches the client.
|
||||
func (s *Server) inspectImageMetadata(w http.ResponseWriter, r *http.Request) {
|
||||
var req inspectImageRequest
|
||||
if !decodeJSON(w, r, &req) {
|
||||
return
|
||||
}
|
||||
image := strings.TrimSpace(req.Image)
|
||||
if image == "" {
|
||||
respondError(w, http.StatusBadRequest, "image is required")
|
||||
return
|
||||
}
|
||||
|
||||
ctx, cancel := context.WithTimeout(r.Context(), discoveryTimeout)
|
||||
defer cancel()
|
||||
|
||||
info, err := s.docker.InspectImage(ctx, image)
|
||||
if err != nil {
|
||||
slog.Warn("inspect image metadata failed", "error", err)
|
||||
respondError(w, http.StatusBadRequest, "could not inspect image — make sure it is pulled locally and the reference is correct")
|
||||
return
|
||||
}
|
||||
|
||||
respondJSON(w, http.StatusOK, inspectImageResponse{
|
||||
Port: docker.ExtractPort(info.ExposedPorts),
|
||||
Healthcheck: info.Healthcheck,
|
||||
})
|
||||
}
|
||||
|
||||
// stripImageTag returns the image reference with the trailing :tag
|
||||
// removed, taking care to leave a registry port (e.g. registry:5000/foo)
|
||||
// intact. Digest references (image@sha256:...) are returned unchanged.
|
||||
|
||||
@@ -348,3 +348,32 @@ func (s *Server) pruneImages(w http.ResponseWriter, r *http.Request) {
|
||||
"space_reclaimed_mb": reclaimedBytes / (1024 * 1024),
|
||||
})
|
||||
}
|
||||
|
||||
// pruneBuildCache handles POST /api/docker/prune-build-cache. It removes
|
||||
// unused Docker build-cache records daemon-wide (all=false), so an app's next
|
||||
// rebuild still hits its warm cache. The build cache is regenerable by
|
||||
// definition — pruning only forces slower rebuilds, never data loss — and the
|
||||
// dockerfile/static deploy paths never reclaim it on teardown, so it grows
|
||||
// monotonically until pruned here.
|
||||
func (s *Server) pruneBuildCache(w http.ResponseWriter, r *http.Request) {
|
||||
if s.docker == nil {
|
||||
respondError(w, http.StatusServiceUnavailable, "Docker is not available")
|
||||
return
|
||||
}
|
||||
|
||||
result, err := s.docker.PruneBuildCache(r.Context(), false)
|
||||
if err != nil {
|
||||
slog.Error("prune: build cache", "error", err)
|
||||
respondError(w, http.StatusInternalServerError, "internal server error")
|
||||
return
|
||||
}
|
||||
|
||||
slog.Info("prune: build cache",
|
||||
"caches_deleted", result.CachesDeleted,
|
||||
"space_reclaimed_mb", result.SpaceReclaimed/(1024*1024))
|
||||
|
||||
respondJSON(w, http.StatusOK, map[string]any{
|
||||
"caches_deleted": result.CachesDeleted,
|
||||
"space_reclaimed_mb": result.SpaceReclaimed / (1024 * 1024),
|
||||
})
|
||||
}
|
||||
|
||||
@@ -37,6 +37,36 @@ func (s *Server) listEventLog(w http.ResponseWriter, r *http.Request) {
|
||||
respondJSON(w, http.StatusOK, events)
|
||||
}
|
||||
|
||||
// listWorkloadEvents handles GET /api/workloads/{id}/events — the per-app
|
||||
// activity/deploy timeline. The workload id is pinned from the path, so a
|
||||
// client cannot widen the scope to other workloads or the global feed.
|
||||
// Supports the same severity/limit/offset query params as listEventLog.
|
||||
func (s *Server) listWorkloadEvents(w http.ResponseWriter, r *http.Request) {
|
||||
id := chi.URLParam(r, "id")
|
||||
if id == "" {
|
||||
respondError(w, http.StatusBadRequest, "workload id is required")
|
||||
return
|
||||
}
|
||||
|
||||
q := r.URL.Query()
|
||||
limit, _ := strconv.Atoi(q.Get("limit"))
|
||||
offset, _ := strconv.Atoi(q.Get("offset"))
|
||||
|
||||
events, err := s.store.ListEvents(store.EventLogFilter{
|
||||
WorkloadID: id,
|
||||
Severity: q.Get("severity"),
|
||||
Limit: limit,
|
||||
Offset: offset,
|
||||
})
|
||||
if err != nil {
|
||||
slog.Error("failed to list workload events", "workload", id, "error", err)
|
||||
respondError(w, http.StatusInternalServerError, "failed to list events")
|
||||
return
|
||||
}
|
||||
|
||||
respondJSON(w, http.StatusOK, events)
|
||||
}
|
||||
|
||||
// getEventLogStats handles GET /api/events/log/stats.
|
||||
func (s *Server) getEventLogStats(w http.ResponseWriter, r *http.Request) {
|
||||
stats, err := s.store.GetEventStats()
|
||||
|
||||
@@ -0,0 +1,364 @@
|
||||
package api
|
||||
|
||||
import (
	"encoding/json"
	"errors"
	"fmt"
	"log/slog"
	"net/http"
	"sort"
	"strings"
	"sync"

	"github.com/go-chi/chi/v5"

	"github.com/alexei/tinyforge/internal/auth"
	"github.com/alexei/tinyforge/internal/crypto"
	"github.com/alexei/tinyforge/internal/gitops"
	"github.com/alexei/tinyforge/internal/store"
	"github.com/alexei/tinyforge/internal/workload/plugin"
)
|
||||
|
||||
// keyedMutex is a lazily-populated per-key lock. Used to serialize a critical
// section per workload id (the GitOps sync) without a global lock. The zero
// value is ready to use.
//
// Entries are reference-counted and removed once the last holder or waiter
// for a key is done, so the map does not grow with every key ever locked
// (callers may lock ids before validating that they exist).
type keyedMutex struct {
	mu sync.Mutex // guards m and every entry's refs counter
	m  map[string]*keyedMutexEntry
}

// keyedMutexEntry is the lock for one key plus the number of goroutines that
// currently hold it or are waiting for it.
type keyedMutexEntry struct {
	mu   sync.Mutex
	refs int // guarded by keyedMutex.mu
}

// lock acquires the mutex for key, blocking while another goroutine holds it,
// and returns the func that releases it. The returned func must be called
// exactly once.
func (k *keyedMutex) lock(key string) func() {
	k.mu.Lock()
	if k.m == nil {
		k.m = make(map[string]*keyedMutexEntry)
	}
	entry, ok := k.m[key]
	if !ok {
		entry = &keyedMutexEntry{}
		k.m[key] = entry
	}
	// Register interest before releasing k.mu so the entry cannot be
	// deleted out from under us while we wait for entry.mu.
	entry.refs++
	k.mu.Unlock()

	entry.mu.Lock()
	return func() {
		entry.mu.Unlock()

		k.mu.Lock()
		entry.refs--
		if entry.refs == 0 {
			// Nobody holds or waits on this key any more; drop the entry.
			delete(k.m, key)
		}
		k.mu.Unlock()
	}
}
|
||||
|
||||
// gitOpsStatusResponse is the single rich payload the GitOps panel reads — it
|
||||
// folds the file preview, parsed status, and drift into one response so the UI
|
||||
// makes a single call (no separate /drift round-trip).
|
||||
type gitOpsStatusResponse struct {
|
||||
Eligible bool `json:"eligible"` // source kind supports GitOps
|
||||
Enabled bool `json:"enabled"` // opt-in flag on the workload
|
||||
Path string `json:"path"` // repo-relative config path
|
||||
Status string `json:"status"` // disabled|ok|no_file|fetch_failed|invalid
|
||||
Raw string `json:"raw"` // the .tinyforge.yml text, when present
|
||||
Message string `json:"message"` // token-redacted detail for non-ok
|
||||
CommitSHA string `json:"commit_sha"` // ref the file was read at
|
||||
LastSyncAt string `json:"last_sync_at"` // last successful sync ("" = never)
|
||||
Drift []gitops.DriftEntry `json:"drift"` // declared fields that differ from live
|
||||
DriftCount int `json:"drift_count"`
|
||||
// ManagedFields lists every source_config key the repo overlay declares
|
||||
// (not just the drifting ones) so the UI can lock exactly those fields on
|
||||
// the edit form. Populated only when the file parsed (status ok).
|
||||
ManagedFields []string `json:"managed_fields"`
|
||||
}
|
||||
|
||||
// getWorkloadGitOps handles GET /api/workloads/{id}/gitops. Read-only; open to
|
||||
// any authenticated user. When GitOps is enabled it fetches the repo's
|
||||
// .tinyforge.yml live and computes drift against the stored source_config.
|
||||
func (s *Server) getWorkloadGitOps(w http.ResponseWriter, r *http.Request) {
|
||||
row, ok := s.loadWorkload(w, chi.URLParam(r, "id"))
|
||||
if !ok {
|
||||
return
|
||||
}
|
||||
|
||||
resp := gitOpsStatusResponse{
|
||||
Eligible: gitops.IsEligibleSource(row.SourceKind),
|
||||
Enabled: row.GitOpsEnabled,
|
||||
Path: row.GitOpsPath,
|
||||
Status: "disabled",
|
||||
LastSyncAt: row.GitOpsLastSyncAt,
|
||||
CommitSHA: row.GitOpsCommitSHA,
|
||||
Drift: []gitops.DriftEntry{},
|
||||
}
|
||||
if resp.Path == "" {
|
||||
resp.Path = ".tinyforge.yml"
|
||||
}
|
||||
|
||||
// Only reach out to the repo when GitOps is actually on.
|
||||
if row.GitOpsEnabled && resp.Eligible {
|
||||
ref, err := s.gitOpsRepoRef(row)
|
||||
if err != nil {
|
||||
// Decoding/decrypt failure: surface as fetch_failed, never the raw
|
||||
// error (it can carry the token / config bytes).
|
||||
slog.Warn("gitops: build repo ref", "workload", row.ID, "error", err)
|
||||
resp.Status = string(gitops.StatusFetchFailed)
|
||||
resp.Message = "could not read repo settings for this workload"
|
||||
respondJSON(w, http.StatusOK, resp)
|
||||
return
|
||||
}
|
||||
res := gitops.Fetch(r.Context(), ref)
|
||||
resp.Status = string(res.Status)
|
||||
resp.CommitSHA = firstNonEmpty(res.CommitSHA, row.GitOpsCommitSHA)
|
||||
resp.Message = res.Message
|
||||
if len(res.Raw) > 0 {
|
||||
resp.Raw = string(res.Raw)
|
||||
}
|
||||
if res.Status == gitops.StatusOK {
|
||||
drift, derr := gitops.Drift(res.Spec, json.RawMessage(row.SourceConfig), row.SourceKind)
|
||||
if derr != nil {
|
||||
slog.Warn("gitops: drift", "workload", row.ID, "error", derr)
|
||||
} else if drift != nil {
|
||||
resp.Drift = drift
|
||||
}
|
||||
resp.DriftCount = len(resp.Drift)
|
||||
resp.ManagedFields = planFields(gitops.BuildPlan(res.Spec, row.SourceKind))
|
||||
}
|
||||
}
|
||||
|
||||
respondJSON(w, http.StatusOK, resp)
|
||||
}
|
||||
|
||||
// setWorkloadGitOps handles PUT /api/workloads/{id}/gitops. Admin-only.
|
||||
// Body: {"enabled": bool, "path": string}. Enabling is refused for source
|
||||
// kinds that aren't git-backed; the path is validated against traversal.
|
||||
func (s *Server) setWorkloadGitOps(w http.ResponseWriter, r *http.Request) {
|
||||
row, ok := s.loadWorkload(w, chi.URLParam(r, "id"))
|
||||
if !ok {
|
||||
return
|
||||
}
|
||||
|
||||
var body struct {
|
||||
Enabled bool `json:"enabled"`
|
||||
Path string `json:"path"`
|
||||
}
|
||||
if !decodeJSONStrict(w, r, &body) {
|
||||
return
|
||||
}
|
||||
|
||||
if body.Enabled && !gitops.IsEligibleSource(row.SourceKind) {
|
||||
respondError(w, http.StatusBadRequest,
|
||||
"GitOps is only available for dockerfile and static sources")
|
||||
return
|
||||
}
|
||||
|
||||
path := strings.TrimSpace(body.Path)
|
||||
if path != "" && !validGitOpsPath(path) {
|
||||
respondError(w, http.StatusBadRequest,
|
||||
"invalid path: must be a repo-relative file (no \"..\", no leading slash)")
|
||||
return
|
||||
}
|
||||
|
||||
if err := s.store.SetWorkloadGitOps(row.ID, body.Enabled, path); err != nil {
|
||||
slog.Error("gitops: set", "workload", row.ID, "error", err)
|
||||
respondError(w, http.StatusInternalServerError, "failed to update GitOps settings")
|
||||
return
|
||||
}
|
||||
if path == "" {
|
||||
path = ".tinyforge.yml"
|
||||
}
|
||||
respondJSON(w, http.StatusOK, map[string]any{"enabled": body.Enabled, "path": path})
|
||||
}
|
||||
|
||||
// syncWorkloadGitOps handles POST /api/workloads/{id}/gitops/sync. Admin-only.
|
||||
// It fetches the repo's .tinyforge.yml, merges the declared overlay onto the
|
||||
// live source_config (validate-then-commit), persists it, and records the sync.
|
||||
// Explicit action only — there is no auto-apply on deploy in v1.
|
||||
func (s *Server) syncWorkloadGitOps(w http.ResponseWriter, r *http.Request) {
|
||||
id := chi.URLParam(r, "id")
|
||||
if id == "" {
|
||||
respondError(w, http.StatusBadRequest, "workload id is required")
|
||||
return
|
||||
}
|
||||
// Serialize the whole read→merge→write per workload so two concurrent
|
||||
// syncs can't clobber each other (review S5). Load the row INSIDE the lock
|
||||
// so each sync merges off the latest persisted config.
|
||||
unlock := s.gitopsSync.lock(id)
|
||||
defer unlock()
|
||||
|
||||
row, ok := s.loadWorkload(w, id)
|
||||
if !ok {
|
||||
return
|
||||
}
|
||||
if !gitops.IsEligibleSource(row.SourceKind) {
|
||||
respondError(w, http.StatusBadRequest,
|
||||
"GitOps is only available for dockerfile and static sources")
|
||||
return
|
||||
}
|
||||
if !row.GitOpsEnabled {
|
||||
respondError(w, http.StatusBadRequest, "enable GitOps for this workload first")
|
||||
return
|
||||
}
|
||||
|
||||
ref, err := s.gitOpsRepoRef(row)
|
||||
if err != nil {
|
||||
slog.Warn("gitops: build repo ref", "workload", row.ID, "error", err)
|
||||
respondError(w, http.StatusBadGateway, "could not read repo settings for this workload")
|
||||
return
|
||||
}
|
||||
|
||||
res := gitops.Fetch(r.Context(), ref)
|
||||
switch res.Status {
|
||||
case gitops.StatusOK:
|
||||
// proceed
|
||||
case gitops.StatusNoFile:
|
||||
respondError(w, http.StatusBadRequest, "no "+ref.Path+" found on branch "+ref.Branch)
|
||||
return
|
||||
case gitops.StatusInvalid:
|
||||
respondError(w, http.StatusBadRequest, "invalid "+ref.Path+": "+res.Message)
|
||||
return
|
||||
default: // fetch_failed
|
||||
slog.Warn("gitops: fetch failed", "workload", row.ID, "detail", res.Message)
|
||||
respondError(w, http.StatusBadGateway, "could not fetch "+ref.Path+" from the repo")
|
||||
return
|
||||
}
|
||||
|
||||
src, err := plugin.GetSource(row.SourceKind)
|
||||
if err != nil {
|
||||
respondError(w, http.StatusInternalServerError, "unknown source kind")
|
||||
return
|
||||
}
|
||||
plan := gitops.BuildPlan(res.Spec, row.SourceKind)
|
||||
merged, err := gitops.MergeAndValidate(json.RawMessage(row.SourceConfig), plan, src.Validate)
|
||||
if err != nil {
|
||||
// The merged config failed the source's own Validate — the file
|
||||
// declares something this workload can't accept. Safe to surface (it
|
||||
// describes config shape, not secrets).
|
||||
respondError(w, http.StatusBadRequest, "the repo config was rejected: "+err.Error())
|
||||
return
|
||||
}
|
||||
|
||||
// Persist via a full-row update off the row we loaded (single read →
|
||||
// merge → write). A per-workload sync lock that closes the remaining
|
||||
// edit-vs-sync window is a Phase 4 hardening item.
|
||||
row.SourceConfig = string(merged)
|
||||
if err := s.store.UpdateWorkload(row); err != nil {
|
||||
slog.Error("gitops: persist merged config", "workload", row.ID, "error", err)
|
||||
respondError(w, http.StatusInternalServerError, "failed to apply the repo config")
|
||||
return
|
||||
}
|
||||
if err := s.store.RecordGitOpsSync(row.ID, res.CommitSHA, store.Now()); err != nil {
|
||||
slog.Warn("gitops: record sync", "workload", row.ID, "error", err)
|
||||
}
|
||||
|
||||
actor := "manual"
|
||||
if claims, ok := auth.ClaimsFromContext(r.Context()); ok && claims.Username != "" {
|
||||
actor = claims.Username
|
||||
}
|
||||
appliedFields := planFields(plan)
|
||||
s.recordGitOpsEvent(row.ID, res.CommitSHA, actor, appliedFields)
|
||||
|
||||
respondJSON(w, http.StatusOK, map[string]any{
|
||||
"status": "applied",
|
||||
"commit_sha": res.CommitSHA,
|
||||
"applied_fields": appliedFields,
|
||||
"triggered_by": actor,
|
||||
})
|
||||
}
|
||||
|
||||
// loadWorkload fetches a workload by id, writing the appropriate error response
|
||||
// and returning ok=false on miss. Shared by the GitOps handlers.
|
||||
func (s *Server) loadWorkload(w http.ResponseWriter, id string) (store.Workload, bool) {
|
||||
if id == "" {
|
||||
respondError(w, http.StatusBadRequest, "workload id is required")
|
||||
return store.Workload{}, false
|
||||
}
|
||||
row, err := s.store.GetWorkloadByID(id)
|
||||
if err != nil {
|
||||
if errors.Is(err, store.ErrNotFound) {
|
||||
respondNotFound(w, "workload")
|
||||
return store.Workload{}, false
|
||||
}
|
||||
respondError(w, http.StatusInternalServerError, "get workload")
|
||||
return store.Workload{}, false
|
||||
}
|
||||
return row, true
|
||||
}
|
||||
|
||||
// gitOpsRepoRef builds a gitops.RepoRef from a workload's source_config: it
|
||||
// decodes the common git coords (identical keys across dockerfile + static)
|
||||
// and decrypts the access token. The gitops package stays decoupled from the
|
||||
// store/crypto by taking the plain coords.
|
||||
func (s *Server) gitOpsRepoRef(row store.Workload) (gitops.RepoRef, error) {
|
||||
var c struct {
|
||||
Provider string `json:"provider"`
|
||||
BaseURL string `json:"base_url"`
|
||||
RepoOwner string `json:"repo_owner"`
|
||||
RepoName string `json:"repo_name"`
|
||||
Branch string `json:"branch"`
|
||||
AccessToken string `json:"access_token"`
|
||||
}
|
||||
if err := json.Unmarshal([]byte(row.SourceConfig), &c); err != nil {
|
||||
return gitops.RepoRef{}, fmt.Errorf("decode source_config: %w", err)
|
||||
}
|
||||
token := ""
|
||||
if c.AccessToken != "" {
|
||||
dec, err := crypto.Decrypt(s.encKey, c.AccessToken)
|
||||
if err != nil {
|
||||
return gitops.RepoRef{}, fmt.Errorf("decrypt access token: %w", err)
|
||||
}
|
||||
token = dec
|
||||
}
|
||||
branch := c.Branch
|
||||
if branch == "" {
|
||||
branch = "main"
|
||||
}
|
||||
path := row.GitOpsPath
|
||||
if path == "" {
|
||||
path = ".tinyforge.yml"
|
||||
}
|
||||
return gitops.RepoRef{
|
||||
Provider: c.Provider,
|
||||
BaseURL: c.BaseURL,
|
||||
Owner: c.RepoOwner,
|
||||
Repo: c.RepoName,
|
||||
Branch: branch,
|
||||
Token: token,
|
||||
Path: path,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// recordGitOpsEvent writes a sync to the per-workload event log — the audit
|
||||
// trail for a config-only sync, kept OUT of deploy_history (which the rollback
|
||||
// feature treats as redeployable rows).
|
||||
func (s *Server) recordGitOpsEvent(workloadID, sha, actor string, fields []string) {
|
||||
meta, _ := json.Marshal(map[string]any{"commit_sha": sha, "by": actor, "fields": fields})
|
||||
if _, err := s.store.InsertEvent(store.EventLog{
|
||||
Source: "gitops",
|
||||
WorkloadID: workloadID,
|
||||
Severity: "info",
|
||||
Message: "GitOps config synced from repo",
|
||||
Metadata: string(meta),
|
||||
}); err != nil {
|
||||
slog.Warn("gitops: record event", "workload", workloadID, "error", err)
|
||||
}
|
||||
}
|
||||
|
||||
// validGitOpsPath reports whether p is acceptable as a stored GitOps config
// path. It refuses:
//   - the empty string and anything longer than 255 bytes;
//   - absolute paths (leading "/" or "\");
//   - any occurrence of "..", so the path can't escape the repo (review M2);
//   - control characters, DEL, space, backslash, and the URL-significant "?"
//     and "#", so nothing can smuggle a query/fragment onto the provider's
//     raw-file URL (review LOW-1).
func validGitOpsPath(p string) bool {
	switch {
	case p == "", len(p) > 255:
		return false
	case p[0] == '/', p[0] == '\\':
		return false
	case strings.Contains(p, ".."):
		return false
	}

	forbidden := func(r rune) bool {
		switch r {
		case 0x7f, '?', '#', ' ', '\\':
			return true
		}
		return r < 0x20
	}
	return strings.IndexFunc(p, forbidden) < 0
}
|
||||
|
||||
// planFields returns the source_config keys an apply plan touches.
|
||||
func planFields(plan gitops.ApplyPlan) []string {
|
||||
fields := make([]string, 0, len(plan.SourceConfigPatch))
|
||||
for k := range plan.SourceConfigPatch {
|
||||
fields = append(fields, k)
|
||||
}
|
||||
return fields
|
||||
}
|
||||
@@ -0,0 +1,48 @@
|
||||
package api
|
||||
|
||||
import (
|
||||
"sort"
|
||||
"testing"
|
||||
|
||||
"github.com/alexei/tinyforge/internal/gitops"
|
||||
)
|
||||
|
||||
func TestValidGitOpsPath(t *testing.T) {
|
||||
cases := []struct {
|
||||
path string
|
||||
ok bool
|
||||
}{
|
||||
{".tinyforge.yml", true},
|
||||
{"deploy/.tinyforge.yml", true},
|
||||
{"config/app.yaml", true},
|
||||
{"/etc/passwd", false}, // absolute
|
||||
{"\\windows\\path", false}, // absolute (backslash)
|
||||
{"../../etc/passwd", false}, // traversal
|
||||
{"deploy/../../x", false}, // traversal mid-path
|
||||
{"foo?ref=evil", false}, // query-param injection (LOW-1)
|
||||
{"foo#frag", false}, // fragment injection
|
||||
{"with space.yml", false}, // whitespace
|
||||
{"", false}, // empty
|
||||
}
|
||||
for _, c := range cases {
|
||||
if got := validGitOpsPath(c.path); got != c.ok {
|
||||
t.Errorf("validGitOpsPath(%q) = %v, want %v", c.path, got, c.ok)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestPlanFields(t *testing.T) {
|
||||
spec := gitops.Spec{Version: 1, Deploy: gitops.DeploySpec{
|
||||
Port: ptrInt(8080),
|
||||
DeployStrategy: ptrStr("blue-green"),
|
||||
}}
|
||||
got := planFields(gitops.BuildPlan(spec, gitops.SourceDockerfile))
|
||||
sort.Strings(got)
|
||||
want := []string{"deploy_strategy", "port"}
|
||||
if len(got) != len(want) || got[0] != want[0] || got[1] != want[1] {
|
||||
t.Fatalf("planFields = %v, want %v", got, want)
|
||||
}
|
||||
}
|
||||
|
||||
// ptrInt returns a pointer to a copy of i; the test uses it to populate
// pointer-typed spec fields inline.
func ptrInt(i int) *int { return &i }

// ptrStr returns a pointer to a copy of s; the test uses it to populate
// pointer-typed spec fields inline.
func ptrStr(s string) *string { return &s }
|
||||
@@ -0,0 +1,64 @@
|
||||
package api
|
||||
|
||||
import (
|
||||
"context"
|
||||
"log/slog"
|
||||
"net/http"
|
||||
"time"
|
||||
|
||||
"github.com/alexei/tinyforge/internal/metrics"
|
||||
)
|
||||
|
||||
// livez always returns 200 if the process is up. Used by container
|
||||
// orchestrators / load balancers / Docker HEALTHCHECK as the "is the
|
||||
// binary alive" probe. Intentionally does NOT touch the DB or Docker —
|
||||
// a slow DB must not cause restart loops.
|
||||
func (s *Server) livez(w http.ResponseWriter, _ *http.Request) {
|
||||
w.Header().Set("Content-Type", "text/plain; charset=utf-8")
|
||||
_, _ = w.Write([]byte("ok\n"))
|
||||
}
|
||||
|
||||
// readyz returns 200 only when the process can actually serve traffic:
|
||||
// SQLite is reachable, the encryption key is loaded, the deployer is
|
||||
// not draining. The response body is intentionally minimal — the
|
||||
// specific failing probe name is recorded in slog (operator-visible)
|
||||
// rather than returned to unauthenticated callers. This avoids handing
|
||||
// reconnaissance to an attacker who can hit /readyz during an outage
|
||||
// ("DB down" vs "encryption key missing" leaks operational state).
|
||||
func (s *Server) readyz(w http.ResponseWriter, r *http.Request) {
|
||||
ctx, cancel := context.WithTimeout(r.Context(), 2*time.Second)
|
||||
defer cancel()
|
||||
|
||||
// DB ping: cheap and exact — exercises the connection pool, file
|
||||
// lock, and busy-timeout. A failing ping means SQLite WAL is wedged
|
||||
// or the data dir is gone.
|
||||
if err := s.store.DB().PingContext(ctx); err != nil {
|
||||
slog.Warn("readyz: db ping failed", "error", err)
|
||||
w.Header().Set("Content-Type", "text/plain; charset=utf-8")
|
||||
w.WriteHeader(http.StatusServiceUnavailable)
|
||||
_, _ = w.Write([]byte("not ready\n"))
|
||||
return
|
||||
}
|
||||
|
||||
// Encryption key sanity: if it's zero we cannot decrypt any stored
|
||||
// secret, so the deployer paths will all explode at first use.
|
||||
if s.encKey == ([32]byte{}) {
|
||||
slog.Warn("readyz: encryption key not loaded")
|
||||
w.Header().Set("Content-Type", "text/plain; charset=utf-8")
|
||||
w.WriteHeader(http.StatusServiceUnavailable)
|
||||
_, _ = w.Write([]byte("not ready\n"))
|
||||
return
|
||||
}
|
||||
|
||||
w.Header().Set("Content-Type", "text/plain; charset=utf-8")
|
||||
_, _ = w.Write([]byte("ready\n"))
|
||||
}
|
||||
|
||||
// metricsExport writes the process-wide metrics registry in Prometheus
|
||||
// text format. Admin-only by router placement; surface is intentionally
|
||||
// thin (no histograms / quantiles, only counters) to keep the binary
|
||||
// dependency-free.
|
||||
func (s *Server) metricsExport(w http.ResponseWriter, _ *http.Request) {
|
||||
w.Header().Set("Content-Type", "text/plain; version=0.0.4; charset=utf-8")
|
||||
_ = metrics.DefaultRegistry.WritePrometheus(w)
|
||||
}
|
||||
@@ -0,0 +1,235 @@
|
||||
// Package api: metric-alert rule HTTP handlers. The evaluator lives in
|
||||
// internal/metricalert; this file is the REST surface that lets
|
||||
// operators create, edit, and delete threshold rules. Mirrors the
|
||||
// log-scan rule handlers.
|
||||
package api
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"net/http"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
"github.com/go-chi/chi/v5"
|
||||
|
||||
"github.com/alexei/tinyforge/internal/store"
|
||||
)
|
||||
|
||||
// metricAlertRuleInput is the JSON shape accepted by POST + PATCH.
// Pointers distinguish "absent" from explicit empty/zero. WorkloadID is
// immutable on update (per store.UpdateMetricAlertRule) so it only takes
// effect on create.
type metricAlertRuleInput struct {
	// WorkloadID is read by the create handler only; the update handler
	// never consults it.
	WorkloadID *string `json:"workload_id"`
	// Name must be non-blank after trimming (see validateMetricAlertInput).
	Name *string `json:"name"`
	// Metric and Comparator are checked against the store's allowed values.
	// On update an empty string is treated like an absent field.
	Metric     *string `json:"metric"`
	Comparator *string `json:"comparator"`
	// Threshold is taken as-is; an absent value becomes 0 on create.
	Threshold *float64 `json:"threshold"`
	// Severity falls back to store.LogScanSeverityWarn on create when
	// absent or empty.
	Severity *string `json:"severity"`
	// CooldownSeconds must be >= 0; create passes 300 as the default to
	// derefIntDefault when the field is absent.
	CooldownSeconds *int `json:"cooldown_seconds"`
	// Enabled is treated as true on create when absent.
	Enabled *bool `json:"enabled"`
}
|
||||
|
||||
// listMetricAlertRules handles GET /api/metric-alert-rules. Optional
|
||||
// query filter `workload_id=...` returns rules applying to that workload
|
||||
// (its own rows plus globals).
|
||||
func (s *Server) listMetricAlertRules(w http.ResponseWriter, r *http.Request) {
|
||||
if wlID := r.URL.Query().Get("workload_id"); wlID != "" {
|
||||
out, err := s.store.ListMetricAlertRulesByWorkload(wlID)
|
||||
if err != nil {
|
||||
respondError(w, http.StatusInternalServerError, "list metric alert rules")
|
||||
return
|
||||
}
|
||||
respondJSON(w, http.StatusOK, out)
|
||||
return
|
||||
}
|
||||
out, err := s.store.ListMetricAlertRules()
|
||||
if err != nil {
|
||||
respondError(w, http.StatusInternalServerError, "list metric alert rules")
|
||||
return
|
||||
}
|
||||
respondJSON(w, http.StatusOK, out)
|
||||
}
|
||||
|
||||
// getMetricAlertRule handles GET /api/metric-alert-rules/{id}.
|
||||
func (s *Server) getMetricAlertRule(w http.ResponseWriter, r *http.Request) {
|
||||
id, ok := parseMetricAlertRuleID(w, r)
|
||||
if !ok {
|
||||
return
|
||||
}
|
||||
rule, err := s.store.GetMetricAlertRule(id)
|
||||
if err != nil {
|
||||
mapStoreError(w, err, "metric alert rule")
|
||||
return
|
||||
}
|
||||
respondJSON(w, http.StatusOK, rule)
|
||||
}
|
||||
|
||||
// createMetricAlertRule handles POST /api/metric-alert-rules.
|
||||
func (s *Server) createMetricAlertRule(w http.ResponseWriter, r *http.Request) {
|
||||
var in metricAlertRuleInput
|
||||
if !decodeJSON(w, r, &in) {
|
||||
return
|
||||
}
|
||||
rule := store.MetricAlertRule{
|
||||
WorkloadID: derefString(in.WorkloadID),
|
||||
Name: derefString(in.Name),
|
||||
Metric: derefString(in.Metric),
|
||||
Comparator: derefString(in.Comparator),
|
||||
Threshold: derefFloat64(in.Threshold),
|
||||
Severity: firstNonEmpty(derefString(in.Severity), store.LogScanSeverityWarn),
|
||||
CooldownSeconds: derefIntDefault(in.CooldownSeconds, 300),
|
||||
Enabled: in.Enabled == nil || *in.Enabled,
|
||||
}
|
||||
if msg := validateMetricAlertInput(rule); msg != "" {
|
||||
respondError(w, http.StatusBadRequest, msg)
|
||||
return
|
||||
}
|
||||
out, err := s.store.CreateMetricAlertRule(rule)
|
||||
if err != nil {
|
||||
if isMetricAlertValidationErr(err) {
|
||||
respondError(w, http.StatusBadRequest, err.Error())
|
||||
return
|
||||
}
|
||||
respondError(w, http.StatusInternalServerError, "create metric alert rule")
|
||||
return
|
||||
}
|
||||
respondJSON(w, http.StatusCreated, out)
|
||||
}
|
||||
|
||||
// updateMetricAlertRule handles PATCH /api/metric-alert-rules/{id}.
|
||||
// workload_id is immutable; name/metric/comparator/threshold/severity/
|
||||
// cooldown/enabled are individually overridable.
|
||||
func (s *Server) updateMetricAlertRule(w http.ResponseWriter, r *http.Request) {
|
||||
id, ok := parseMetricAlertRuleID(w, r)
|
||||
if !ok {
|
||||
return
|
||||
}
|
||||
existing, err := s.store.GetMetricAlertRule(id)
|
||||
if err != nil {
|
||||
mapStoreError(w, err, "metric alert rule")
|
||||
return
|
||||
}
|
||||
var in metricAlertRuleInput
|
||||
if !decodeJSON(w, r, &in) {
|
||||
return
|
||||
}
|
||||
if in.Name != nil {
|
||||
existing.Name = *in.Name
|
||||
}
|
||||
if in.Metric != nil && *in.Metric != "" {
|
||||
existing.Metric = *in.Metric
|
||||
}
|
||||
if in.Comparator != nil && *in.Comparator != "" {
|
||||
existing.Comparator = *in.Comparator
|
||||
}
|
||||
if in.Threshold != nil {
|
||||
existing.Threshold = *in.Threshold
|
||||
}
|
||||
if in.Severity != nil && *in.Severity != "" {
|
||||
existing.Severity = *in.Severity
|
||||
}
|
||||
if in.CooldownSeconds != nil {
|
||||
existing.CooldownSeconds = *in.CooldownSeconds
|
||||
}
|
||||
if in.Enabled != nil {
|
||||
existing.Enabled = *in.Enabled
|
||||
}
|
||||
if msg := validateMetricAlertInput(existing); msg != "" {
|
||||
respondError(w, http.StatusBadRequest, msg)
|
||||
return
|
||||
}
|
||||
out, err := s.store.UpdateMetricAlertRule(existing)
|
||||
if err != nil {
|
||||
if errors.Is(err, store.ErrNotFound) {
|
||||
respondNotFound(w, "metric alert rule")
|
||||
return
|
||||
}
|
||||
if isMetricAlertValidationErr(err) {
|
||||
respondError(w, http.StatusBadRequest, err.Error())
|
||||
return
|
||||
}
|
||||
respondError(w, http.StatusInternalServerError, "update metric alert rule")
|
||||
return
|
||||
}
|
||||
respondJSON(w, http.StatusOK, out)
|
||||
}
|
||||
|
||||
// deleteMetricAlertRule handles DELETE /api/metric-alert-rules/{id}.
|
||||
func (s *Server) deleteMetricAlertRule(w http.ResponseWriter, r *http.Request) {
|
||||
id, ok := parseMetricAlertRuleID(w, r)
|
||||
if !ok {
|
||||
return
|
||||
}
|
||||
if err := s.store.DeleteMetricAlertRule(id); err != nil {
|
||||
mapStoreError(w, err, "metric alert rule")
|
||||
return
|
||||
}
|
||||
w.WriteHeader(http.StatusNoContent)
|
||||
}
|
||||
|
||||
// validateMetricAlertInput does boundary validation so we return a
|
||||
// clear 400 before hitting the store. The store re-validates the same
|
||||
// invariants as a backstop.
|
||||
func validateMetricAlertInput(rule store.MetricAlertRule) string {
|
||||
if strings.TrimSpace(rule.Name) == "" {
|
||||
return "name is required"
|
||||
}
|
||||
switch rule.Metric {
|
||||
case store.MetricCPUPercent, store.MetricMemoryPercent, store.MetricMemoryBytes:
|
||||
default:
|
||||
return "invalid metric: must be cpu_percent, memory_percent, or memory_bytes"
|
||||
}
|
||||
switch rule.Comparator {
|
||||
case store.MetricComparatorGT, store.MetricComparatorLT:
|
||||
default:
|
||||
return "invalid comparator: must be gt or lt"
|
||||
}
|
||||
switch rule.Severity {
|
||||
case store.LogScanSeverityInfo, store.LogScanSeverityWarn, store.LogScanSeverityError, "":
|
||||
default:
|
||||
return "invalid severity: must be info, warn, or error"
|
||||
}
|
||||
if rule.CooldownSeconds < 0 {
|
||||
return "cooldown_seconds must be >= 0"
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
// isMetricAlertValidationErr reports whether err is one of the store's
// validation errors, recognised by a known message fragment. Handlers use it
// to answer 400 rather than 500 without leaking driver text for unrelated
// failures. A nil error is never a validation error.
func isMetricAlertValidationErr(err error) bool {
	if err == nil {
		return false
	}
	text := err.Error()
	return strings.Contains(text, "name is required") ||
		strings.Contains(text, "invalid metric") ||
		strings.Contains(text, "invalid comparator") ||
		strings.Contains(text, "invalid severity") ||
		strings.Contains(text, "cooldown_seconds must be")
}
|
||||
|
||||
func parseMetricAlertRuleID(w http.ResponseWriter, r *http.Request) (int64, bool) {
|
||||
raw := chi.URLParam(r, "id")
|
||||
id, err := strconv.ParseInt(raw, 10, 64)
|
||||
if err != nil || id <= 0 {
|
||||
respondError(w, http.StatusBadRequest, "invalid rule id")
|
||||
return 0, false
|
||||
}
|
||||
return id, true
|
||||
}
|
||||
|
||||
// derefFloat64 returns the value p points to, or 0 when p is nil.
func derefFloat64(p *float64) float64 {
	if p != nil {
		return *p
	}
	return 0
}
|
||||
+318
-7
@@ -1,14 +1,119 @@
|
||||
package api
|
||||
|
||||
import (
|
||||
"context"
|
||||
"crypto/rand"
|
||||
"encoding/hex"
|
||||
"log/slog"
|
||||
"net"
|
||||
"net/http"
|
||||
"os"
|
||||
"runtime/debug"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/alexei/tinyforge/internal/metrics"
|
||||
)
|
||||
|
||||
// requestIDKeyType is the unexported type of the context key under which the
// generated/forwarded X-Request-ID is stored. A dedicated empty struct type
// means the key cannot collide with context keys defined by other packages.
type requestIDKeyType struct{}

// requestIDKey is the context key value itself. It is exposed only indirectly,
// via RequestIDFromContext, so handlers and services downstream of the API
// layer can thread the ID into their own slog calls without re-extracting it
// from headers.
var requestIDKey = requestIDKeyType{}
|
||||
|
||||
// RequestIDFromContext returns the correlation ID for the request, or
|
||||
// "" when called outside the API request path.
|
||||
func RequestIDFromContext(ctx context.Context) string {
|
||||
if v, ok := ctx.Value(requestIDKey).(string); ok {
|
||||
return v
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
// requestID middleware ensures every request has a stable correlation
|
||||
// ID. Honors a caller-supplied X-Request-ID when the request comes from
|
||||
// a trusted proxy AND the value matches a safe character set; otherwise
|
||||
// generates a fresh 128-bit ID. The ID is echoed back as X-Request-ID
|
||||
// and stitched into every subsequent slog call via the context value
|
||||
// the `logging` middleware reads.
|
||||
//
|
||||
// Format clamp: a compromised reverse proxy (or one that mis-parses an
|
||||
// untrusted header) could forward an ID containing newlines, semicolons,
|
||||
// or other separator characters. Those would corrupt structured log
|
||||
// parsers that assume one record per line / key-value. Restricting to
|
||||
// `[A-Za-z0-9._-]{1,64}` covers UUIDs, hex IDs, and trace-context IDs
|
||||
// without any sharp edges.
|
||||
func requestID(next http.Handler) http.Handler {
|
||||
return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
rid := r.Header.Get("X-Request-ID")
|
||||
if rid == "" || !isTrustedPeer(r) || !isValidRequestID(rid) {
|
||||
rid = newRequestID()
|
||||
}
|
||||
w.Header().Set("X-Request-ID", rid)
|
||||
ctx := context.WithValue(r.Context(), requestIDKey, rid)
|
||||
next.ServeHTTP(w, r.WithContext(ctx))
|
||||
})
|
||||
}
|
||||
|
||||
// isValidRequestID reports whether s matches `[A-Za-z0-9._-]{1,64}`. It is a
// hand-written byte scan so no regex is compiled or run on the request path.
// Any non-ASCII byte fails the check.
func isValidRequestID(s string) bool {
	if n := len(s); n < 1 || n > 64 {
		return false
	}
	for i := 0; i < len(s); i++ {
		c := s[i]
		letter := (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')
		digit := c >= '0' && c <= '9'
		punct := c == '.' || c == '_' || c == '-'
		if !(letter || digit || punct) {
			return false
		}
	}
	return true
}
|
||||
|
||||
// isTrustedPeer is a thin wrapper around the TRUSTED_PROXY_CIDRS allow-
|
||||
// list — we honor a forwarded request-id only from upstreams we already
|
||||
// trust for X-Forwarded-For. Otherwise an internet client could spam
|
||||
// log files with attacker-chosen IDs.
|
||||
func isTrustedPeer(r *http.Request) bool {
|
||||
peer := r.RemoteAddr
|
||||
if host, _, err := net.SplitHostPort(peer); err == nil {
|
||||
peer = host
|
||||
}
|
||||
if len(trustedProxyCIDRs) == 0 {
|
||||
return false
|
||||
}
|
||||
ip := net.ParseIP(peer)
|
||||
if ip == nil {
|
||||
return false
|
||||
}
|
||||
for _, n := range trustedProxyCIDRs {
|
||||
if n.Contains(ip) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// newRequestID returns a fresh correlation ID: 16 random bytes from
// crypto/rand, rendered as 32 lowercase hex characters.
//
// If crypto/rand is unavailable (extremely unlikely outside of broken
// environments) it falls back to "ts-" plus the current UTC time at nanosecond
// resolution. The ID is for tracing, not security, so a deterministic
// fallback is preferable to a panic.
func newRequestID() string {
	raw := make([]byte, 16)
	if _, err := rand.Read(raw); err == nil {
		return hex.EncodeToString(raw)
	}
	return "ts-" + time.Now().UTC().Format("20060102T150405.000000000")
}
|
||||
|
||||
// logging is an HTTP middleware that logs every request with method, path,
|
||||
// status code, and duration. Webhook URLs are redacted before being logged
|
||||
// because the secret is the only authenticator — leaking it to log
|
||||
@@ -20,15 +125,58 @@ func logging(next http.Handler) http.Handler {
|
||||
|
||||
next.ServeHTTP(wrapped, r)
|
||||
|
||||
slog.Info("http request",
|
||||
fields := []any{
|
||||
"method", r.Method,
|
||||
"path", redactPath(r.URL.Path),
|
||||
"status", wrapped.status,
|
||||
"duration", time.Since(start).String(),
|
||||
)
|
||||
}
|
||||
if rq := redactQuery(r.URL.RawQuery); rq != "" {
|
||||
fields = append(fields, "query", rq)
|
||||
}
|
||||
if rid := RequestIDFromContext(r.Context()); rid != "" {
|
||||
fields = append(fields, "request_id", rid)
|
||||
}
|
||||
slog.Info("http request", fields...)
|
||||
|
||||
// Lightweight per-request counter. Bucket by status class so
|
||||
// the cardinality stays at 5 × #methods regardless of how many
|
||||
// distinct response codes we emit.
|
||||
metrics.HTTPRequestsTotal.Inc(bucketMethod(r.Method), statusClass(wrapped.status))
|
||||
})
|
||||
}
|
||||
|
||||
// bucketMethod maps an HTTP method onto a bounded label set for metrics. The
// nine standard methods are returned unchanged; anything else collapses to
// "other". RFC 7230 allows any token as a method, so without this a malicious
// client could spam arbitrary method names and inflate the metrics map.
// Matching is case-sensitive.
func bucketMethod(m string) string {
	switch m {
	case http.MethodGet, http.MethodPost, http.MethodPut, http.MethodPatch,
		http.MethodDelete, http.MethodHead, http.MethodOptions,
		http.MethodConnect, http.MethodTrace:
		return m
	default:
		return "other"
	}
}
|
||||
|
||||
// statusClass buckets a status code into "1xx".."5xx", or "other" for
// anything outside 100-599. This keeps metrics cardinality bounded so a chatty
// endpoint can't create one series per distinct response code.
func statusClass(code int) string {
	if code < 100 || code >= 600 {
		return "other"
	}
	// code/100 is 1..5 here, so it indexes the label table directly.
	labels := [...]string{"1xx", "2xx", "3xx", "4xx", "5xx"}
	return labels[code/100-1]
}
|
||||
|
||||
// redactPath strips secrets from URL paths that carry them in segments.
|
||||
// Only the canonical /api/webhook/triggers/{secret} surface remains after
|
||||
// the hard cutover.
|
||||
@@ -40,6 +188,45 @@ func redactPath(path string) string {
|
||||
return path
|
||||
}
|
||||
|
||||
// redactQueryKeys is the set of query-parameter names whose values are masked
// before a URL lands in the request log. Entries are lower-case because
// redactQuery lower-cases each key before the lookup, which makes matching
// case-insensitive. `token` is used by SSE/EventSource when a custom header
// can't be set; the rest are defence-in-depth against sensitive values ever
// appearing in a query string.
var redactQueryKeys = map[string]struct{}{
	"token":         {},
	"secret":        {},
	"password":      {},
	"passwd":        {},
	"api_key":       {},
	"apikey":        {},
	"access_token":  {},
	"client_secret": {},
	"sig":           {},
	"signature":     {},
}

// redactQuery returns rawQuery with the value of every sensitive parameter
// (see redactQueryKeys) replaced by "***". The query is split on "&" only and
// keys are compared as written (no percent-decoding), case-insensitively.
// Segments without "=" and parameters with non-sensitive names pass through
// untouched, so a malformed query surfaces in the log unchanged.
func redactQuery(rawQuery string) string {
	if rawQuery == "" {
		return ""
	}

	pairs := strings.Split(rawQuery, "&")
	for i := range pairs {
		// Cut at the first "=": everything after it is the value, even if
		// it contains further "=" characters.
		name, _, hasValue := strings.Cut(pairs[i], "=")
		if !hasValue {
			continue
		}
		if _, sensitive := redactQueryKeys[strings.ToLower(name)]; !sensitive {
			continue
		}
		pairs[i] = name + "=***"
	}
	return strings.Join(pairs, "&")
}
|
||||
|
||||
// recovery is an HTTP middleware that catches panics and returns a 500 response.
|
||||
func recovery(next http.Handler) http.Handler {
|
||||
return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
@@ -54,16 +241,49 @@ func recovery(next http.Handler) http.Handler {
|
||||
}
|
||||
|
||||
// securityHeaders sets standard security headers on all responses.
|
||||
//
|
||||
// Strict-Transport-Security is emitted only when the request arrived
|
||||
// over HTTPS (direct TLS or forwarded). Emitting HSTS over plain HTTP
|
||||
// is harmless to compliant browsers but flags as an issue in scanners
|
||||
// and confuses some reverse proxies.
|
||||
//
|
||||
// The CSP keeps `'unsafe-inline'` for now because SvelteKit injects
|
||||
// inline boot scripts and styles; removing it requires a nonce-based
|
||||
// strategy threaded through the SvelteKit handle hook. Tracked as a
|
||||
// follow-up; documented in the security report.
|
||||
func securityHeaders(next http.Handler) http.Handler {
|
||||
return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
w.Header().Set("X-Content-Type-Options", "nosniff")
|
||||
w.Header().Set("X-Frame-Options", "DENY")
|
||||
w.Header().Set("Referrer-Policy", "strict-origin-when-cross-origin")
|
||||
w.Header().Set("Content-Security-Policy", "default-src 'self'; script-src 'self' 'unsafe-inline'; style-src 'self' 'unsafe-inline'; img-src 'self' data:; connect-src 'self'; font-src 'self'")
|
||||
w.Header().Set("Permissions-Policy", "camera=(), microphone=(), geolocation=(), payment=()")
|
||||
w.Header().Set("Content-Security-Policy",
|
||||
"default-src 'self'; "+
|
||||
"script-src 'self' 'unsafe-inline'; "+
|
||||
"style-src 'self' 'unsafe-inline'; "+
|
||||
"img-src 'self' data:; "+
|
||||
"connect-src 'self'; "+
|
||||
"font-src 'self'; "+
|
||||
"frame-ancestors 'none'; "+
|
||||
"base-uri 'self'; "+
|
||||
"form-action 'self'")
|
||||
if isHTTPS(r) {
|
||||
w.Header().Set("Strict-Transport-Security", "max-age=31536000; includeSubDomains")
|
||||
}
|
||||
next.ServeHTTP(w, r)
|
||||
})
|
||||
}
|
||||
|
||||
// isHTTPS reports whether the request reached us over HTTPS: either the
// connection itself is TLS, or the X-Forwarded-Proto header is exactly
// "https". The header is taken at face value regardless of who sent it.
func isHTTPS(r *http.Request) bool {
	return r.TLS != nil || r.Header.Get("X-Forwarded-Proto") == "https"
}
|
||||
|
||||
// cors is an HTTP middleware that handles CORS for same-origin requests.
|
||||
// The frontend is served from the same origin, so cross-origin requests are not expected.
|
||||
func cors(next http.Handler) http.Handler {
|
||||
@@ -164,10 +384,7 @@ func jsonContentType(next http.Handler) http.Handler {
|
||||
func rateLimitMiddleware(rl *rateLimiter) func(http.Handler) http.Handler {
|
||||
return func(next http.Handler) http.Handler {
|
||||
return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
ip := r.RemoteAddr
|
||||
if fwd := r.Header.Get("X-Forwarded-For"); fwd != "" {
|
||||
ip = fwd
|
||||
}
|
||||
ip := clientIP(r)
|
||||
if !rl.allow(ip) {
|
||||
respondError(w, http.StatusTooManyRequests, "rate limit exceeded")
|
||||
return
|
||||
@@ -177,6 +394,100 @@ func rateLimitMiddleware(rl *rateLimiter) func(http.Handler) http.Handler {
|
||||
}
|
||||
}
|
||||
|
||||
// trustedProxyCIDRs is the parsed allow-list of upstream proxy networks
// whose X-Forwarded-For header we honor. Set TRUSTED_PROXY_CIDRS to a
// comma-separated list of CIDRs (e.g. "127.0.0.1/32,10.0.0.0/8") to
// enable. When unset (the default) X-Forwarded-For is ignored entirely
// and rate limiting + audit logging use r.RemoteAddr — preventing a
// remote attacker from spoofing the header to bypass per-IP limiters.
//
// The environment variable is read once, when this package-level variable is
// initialised; changing it afterwards requires a process restart.
var trustedProxyCIDRs = parseTrustedProxyCIDRs(os.Getenv("TRUSTED_PROXY_CIDRS"))
|
||||
|
||||
// parseTrustedProxyCIDRs turns a comma-separated list of CIDRs and/or bare IP
// addresses into networks. Surrounding whitespace and empty entries are
// ignored. A bare IPv4 address becomes a /32 and a bare IPv6 address a /128.
// Entries that still fail to parse are logged with a warning and skipped, so
// one typo does not disable the rest of the list. A blank input yields nil.
//
// A bare IPv4-mapped IPv6 address (e.g. "::ffff:10.0.0.1") is normalised to
// its dotted-quad form before "/32" is appended. Appending "/32" to the IPv6
// text form would instead be parsed as a 32-bit prefix of a 128-bit address,
// i.e. the network "::/32", which is not the single host the operator meant
// and which contains unrelated addresses such as ::1.
func parseTrustedProxyCIDRs(raw string) []*net.IPNet {
	if strings.TrimSpace(raw) == "" {
		return nil
	}

	var nets []*net.IPNet
	for _, entry := range strings.Split(raw, ",") {
		entry = strings.TrimSpace(entry)
		if entry == "" {
			continue
		}

		// Allow bare IPs as single-host networks.
		if !strings.Contains(entry, "/") {
			if ip := net.ParseIP(entry); ip != nil {
				if v4 := ip.To4(); v4 != nil {
					entry = v4.String() + "/32"
				} else {
					entry += "/128"
				}
			}
		}

		_, network, err := net.ParseCIDR(entry)
		if err != nil {
			slog.Warn("ignoring invalid TRUSTED_PROXY_CIDRS entry", "value", entry, "error", err)
			continue
		}
		nets = append(nets, network)
	}
	return nets
}
|
||||
|
||||
// clientIP returns the per-request "client" address used for rate-limit
|
||||
// keying and audit attribution. X-Forwarded-For is honored ONLY when the
|
||||
// direct peer (r.RemoteAddr) belongs to a configured trusted-proxy CIDR;
|
||||
// otherwise the header is ignored to prevent header-spoofing bypasses.
|
||||
func clientIP(r *http.Request) string {
|
||||
peer := r.RemoteAddr
|
||||
if host, _, err := net.SplitHostPort(peer); err == nil {
|
||||
peer = host
|
||||
}
|
||||
if len(trustedProxyCIDRs) == 0 {
|
||||
return peer
|
||||
}
|
||||
peerIP := net.ParseIP(peer)
|
||||
if peerIP == nil || !isTrustedProxy(peerIP) {
|
||||
return peer
|
||||
}
|
||||
fwd := r.Header.Get("X-Forwarded-For")
|
||||
if fwd == "" {
|
||||
return peer
|
||||
}
|
||||
// Walk X-Forwarded-For from the RIGHTMOST entry (the address closest to
|
||||
// us, appended by our trusted peer) leftward, skipping entries that are
|
||||
// themselves trusted proxies, and return the first untrusted address.
|
||||
// The LEFTMOST entry is fully client-controlled — trusting it (as a
|
||||
// naive `fwd[:firstComma]` does) lets an attacker spoof their rate-limit
|
||||
// and audit identity by prepending a forged value, defeating the per-IP
|
||||
// login limiter.
|
||||
parts := strings.Split(fwd, ",")
|
||||
for i := len(parts) - 1; i >= 0; i-- {
|
||||
candidate := strings.TrimSpace(parts[i])
|
||||
ip := net.ParseIP(candidate)
|
||||
if ip == nil {
|
||||
continue
|
||||
}
|
||||
if isTrustedProxy(ip) {
|
||||
continue
|
||||
}
|
||||
return candidate
|
||||
}
|
||||
// Every forwarded entry was a trusted proxy (or unparseable) — fall back
|
||||
// to the direct peer.
|
||||
return peer
|
||||
}
|
||||
|
||||
// isTrustedProxy reports whether ip falls within a configured
|
||||
// trusted-proxy CIDR.
|
||||
func isTrustedProxy(ip net.IP) bool {
|
||||
for _, n := range trustedProxyCIDRs {
|
||||
if n.Contains(ip) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// statusRecorder wraps http.ResponseWriter to capture the status code.
|
||||
type statusRecorder struct {
|
||||
http.ResponseWriter
|
||||
|
||||
+150
-12
@@ -4,6 +4,7 @@ import (
|
||||
"context"
|
||||
"log/slog"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
|
||||
"github.com/go-chi/chi/v5"
|
||||
|
||||
@@ -13,11 +14,13 @@ import (
|
||||
"github.com/alexei/tinyforge/internal/dns"
|
||||
"github.com/alexei/tinyforge/internal/docker"
|
||||
"github.com/alexei/tinyforge/internal/events"
|
||||
"github.com/alexei/tinyforge/internal/keyedmutex"
|
||||
"github.com/alexei/tinyforge/internal/notify"
|
||||
"github.com/alexei/tinyforge/internal/npm"
|
||||
"github.com/alexei/tinyforge/internal/proxy"
|
||||
"github.com/alexei/tinyforge/internal/stale"
|
||||
"github.com/alexei/tinyforge/internal/store"
|
||||
"github.com/alexei/tinyforge/internal/volsnap"
|
||||
"github.com/alexei/tinyforge/internal/webhook"
|
||||
"github.com/alexei/tinyforge/internal/workload/plugin"
|
||||
)
|
||||
@@ -50,17 +53,34 @@ type Server struct {
|
||||
oidcProvider *auth.OIDCProvider
|
||||
staleScanner *stale.Scanner
|
||||
|
||||
// gitopsSync serializes the GitOps sync (read→merge→write) per workload so
|
||||
// two concurrent syncs can't race on source_config (review S5).
|
||||
gitopsSync keyedMutex
|
||||
|
||||
// volRestoreInFlight is a per-workload single-flight guard for volume
|
||||
// snapshot restore: a concurrent restore of the same workload is rejected
|
||||
// fast with 409 (TryLock) rather than queuing behind the deployer lock.
|
||||
volRestoreInFlight keyedmutex.Mutex
|
||||
|
||||
dnsProviderMu sync.RWMutex
|
||||
dnsProvider dns.Provider
|
||||
onDNSProviderChanged DNSProviderChangedFunc
|
||||
|
||||
backupEngine *backup.Engine
|
||||
snapshotEngine *volsnap.Engine
|
||||
sseGate *sseGate
|
||||
logScanReloader LogScanReloader
|
||||
dbPath string
|
||||
shutdownFunc func() // called after restore to trigger graceful shutdown
|
||||
onBackupSettingsChanged func(enabled bool, intervalHours int) // called when backup settings change
|
||||
onProxyProviderChanged func(provider proxy.Provider) // called when proxy provider changes
|
||||
|
||||
// restoreInFlight is a process-wide guard against double-firing
|
||||
// the restore endpoint. A rapid double-click would otherwise
|
||||
// schedule two goroutines racing s.store.Close() and the
|
||||
// candidate-over-live rename. CAS to true at the entry point;
|
||||
// reject the second caller with 409 Conflict.
|
||||
restoreInFlight atomic.Bool
|
||||
}
|
||||
|
||||
// NewServer creates a new API Server with all required dependencies.
|
||||
@@ -111,6 +131,11 @@ func (s *Server) SetBackupEngine(engine *backup.Engine) {
|
||||
s.backupEngine = engine
|
||||
}
|
||||
|
||||
// SetSnapshotEngine sets the volume-snapshot engine on the server.
|
||||
func (s *Server) SetSnapshotEngine(engine *volsnap.Engine) {
|
||||
s.snapshotEngine = engine
|
||||
}
|
||||
|
||||
// SetDBPath sets the database file path (needed for restore).
|
||||
func (s *Server) SetDBPath(path string) {
|
||||
s.dbPath = path
|
||||
@@ -157,13 +182,32 @@ func (s *Server) SetDNSProviderChangedCallback(fn DNSProviderChangedFunc) {
|
||||
|
||||
// initOIDCProvider creates an OIDC provider from settings. Errors are logged, not fatal.
|
||||
func (s *Server) initOIDCProvider(ctx context.Context, as store.AuthSettings) {
|
||||
// Decrypt the OIDC client secret if it's encrypted.
|
||||
// Decrypt the OIDC client secret. The prior code did a try-decrypt
|
||||
// and silently treated failures as plaintext — under a rotated key
|
||||
// that sent ciphertext upstream to the OP. Now:
|
||||
// - If the value carries the tf1: envelope → fail loud on
|
||||
// decrypt failure (rotated key / corrupted ciphertext).
|
||||
// - If the value is unprefixed (legacy ciphertext from v0 or true
|
||||
// plaintext from an old migration) → try decrypt; on failure
|
||||
// accept as plaintext (the only safe legacy interpretation).
|
||||
clientSecret := as.OIDCClientSecret
|
||||
if clientSecret != "" {
|
||||
if decrypted, err := crypto.Decrypt(s.encKey, clientSecret); err == nil {
|
||||
switch {
|
||||
case crypto.HasEnvelope(clientSecret):
|
||||
decrypted, err := crypto.Decrypt(s.encKey, clientSecret)
|
||||
if err != nil {
|
||||
slog.Error("OIDC client secret could not be decrypted — refusing to initialize provider",
|
||||
"error", err,
|
||||
"hint", "rotate ENCRYPTION_KEY back, OR re-save OIDC settings to re-encrypt with the current key")
|
||||
return
|
||||
}
|
||||
clientSecret = decrypted
|
||||
default:
|
||||
// Legacy v0 value: try decrypt; on failure assume plaintext.
|
||||
if decrypted, err := crypto.Decrypt(s.encKey, clientSecret); err == nil {
|
||||
clientSecret = decrypted
|
||||
}
|
||||
}
|
||||
// If decrypt fails, assume it's already plaintext (migration scenario).
|
||||
}
|
||||
provider, err := auth.NewOIDCProvider(ctx, auth.OIDCConfig{
|
||||
IssuerURL: as.OIDCIssuerURL,
|
||||
@@ -183,12 +227,29 @@ func (s *Server) initOIDCProvider(ctx context.Context, as store.AuthSettings) {
|
||||
func (s *Server) Router() chi.Router {
|
||||
r := chi.NewRouter()
|
||||
|
||||
// Global middleware.
|
||||
// Global middleware. requestID runs first so every downstream log
|
||||
// line (and the access log emitted by `logging`) carries the same
|
||||
// correlation id, plus the response carries it back on the
|
||||
// X-Request-ID header for the operator to grep across services.
|
||||
r.Use(requestID)
|
||||
r.Use(recovery)
|
||||
r.Use(securityHeaders)
|
||||
r.Use(logging)
|
||||
r.Use(cors)
|
||||
|
||||
// Unauthenticated health probes — mounted at the root so container
|
||||
// orchestrators / load balancers can hit them without knowing about
|
||||
// the /api prefix. /livez intentionally does no work and stays
|
||||
// unbounded; /readyz pings the DB and is rate-limited to keep an
|
||||
// unauthenticated flood from serialising behind SQLite's single
|
||||
// writer connection (busy-timeout = 5s) and log-amplifying every
|
||||
// request via the structured access log. The 10-per-minute budget
|
||||
// is the existing rateLimiter default — generous for k8s readiness
|
||||
// probes (typically every 5-10s), restrictive for an attacker.
|
||||
r.Get("/livez", s.livez)
|
||||
readyLimiter := newRateLimiter()
|
||||
r.With(rateLimitMiddleware(readyLimiter)).Get("/readyz", s.readyz)
|
||||
|
||||
loginLimiter := newRateLimiter()
|
||||
webhookLimiter := newRateLimiter()
|
||||
|
||||
@@ -232,6 +293,7 @@ func (s *Server) Router() chi.Router {
|
||||
r.Post("/discovery/git/branches", s.listGitBranches)
|
||||
r.Post("/discovery/git/tree", s.listGitTree)
|
||||
r.Get("/discovery/image/conflicts", s.listImageConflicts)
|
||||
r.Post("/discovery/image/inspect", s.inspectImageMetadata)
|
||||
})
|
||||
|
||||
// Read-only endpoints (any authenticated user).
|
||||
@@ -245,16 +307,18 @@ func (s *Server) Router() chi.Router {
|
||||
r.Get("/events/log/stats", s.getEventLogStats)
|
||||
r.Get("/registries", s.listRegistries)
|
||||
r.Route("/registries/{id}", func(r chi.Router) {
|
||||
// All registry probes are admin-gated. The /tags and
|
||||
// /images endpoints used to be open to any authenticated
|
||||
// user, but they make outbound requests using the
|
||||
// admin-encrypted registry token — a viewer could
|
||||
// effectively drive arbitrary requests against a private
|
||||
// registry under admin credentials.
|
||||
r.Use(auth.AdminOnly)
|
||||
r.Get("/tags/*", s.listRegistryTags)
|
||||
r.Get("/images", s.listRegistryImages)
|
||||
|
||||
// Admin-only registry mutations.
|
||||
r.Group(func(r chi.Router) {
|
||||
r.Use(auth.AdminOnly)
|
||||
r.Put("/", s.updateRegistry)
|
||||
r.Delete("/", s.deleteRegistry)
|
||||
r.Post("/test", s.testRegistry)
|
||||
})
|
||||
r.Put("/", s.updateRegistry)
|
||||
r.Delete("/", s.deleteRegistry)
|
||||
r.Post("/test", s.testRegistry)
|
||||
})
|
||||
r.Get("/settings", s.getSettings)
|
||||
r.Get("/settings/npm-certificates", s.listNpmCertificates)
|
||||
@@ -282,11 +346,44 @@ func (s *Server) Router() chi.Router {
|
||||
r.With(auth.AdminOnly).Post("/start", s.startPluginWorkload)
|
||||
r.With(auth.AdminOnly).Delete("/", s.deletePluginWorkload)
|
||||
|
||||
// Deploy ledger + rollback. The history feed is read-only
|
||||
// (any authenticated user); rollback is a redeploy, so it is
|
||||
// admin-gated like /deploy.
|
||||
r.Get("/deploys", s.listWorkloadDeploys)
|
||||
r.With(auth.AdminOnly).Post("/rollback", s.rollbackWorkload)
|
||||
|
||||
// GitOps config-as-code (dockerfile/static). The status read
|
||||
// (incl. live drift) is open to any authenticated user; enable/
|
||||
// disable and sync mutate config, so they are admin-gated.
|
||||
r.Get("/gitops", s.getWorkloadGitOps)
|
||||
r.With(auth.AdminOnly).Put("/gitops", s.setWorkloadGitOps)
|
||||
r.With(auth.AdminOnly).Post("/gitops/sync", s.syncWorkloadGitOps)
|
||||
|
||||
// Volume snapshots (admin-only). Capture/list a workload's
|
||||
// host-bind data volumes; {sid}-scoped download/delete live
|
||||
// in the global admin group alongside backups.
|
||||
r.With(auth.AdminOnly).Get("/snapshots", s.listWorkloadSnapshots)
|
||||
r.With(auth.AdminOnly).Get("/snapshotable", s.getWorkloadSnapshotable)
|
||||
r.With(auth.AdminOnly).Post("/snapshots", s.createWorkloadSnapshot)
|
||||
// Restore overwrites live volume data and restarts the app — the
|
||||
// most destructive workload action. Admin-gated + X-Confirm-Restore
|
||||
// header (CSRF) + per-workload single-flight, mirroring DB restore.
|
||||
r.With(auth.AdminOnly).Post("/snapshots/{sid}/restore", s.restoreWorkloadSnapshot)
|
||||
|
||||
// Runtime view: per-source persisted state + storage usage.
|
||||
// Read-only; safe for any authenticated user.
|
||||
r.Get("/runtime-state", s.getWorkloadRuntimeState)
|
||||
r.Get("/storage", s.getWorkloadStorage)
|
||||
|
||||
// Per-workload metrics history (CPU/memory time-series),
|
||||
// aggregated across the workload's containers. Read-only.
|
||||
r.Get("/stats/history", s.getWorkloadStatsHistory)
|
||||
|
||||
// Per-workload activity / deploy timeline (read-only). Scoped
|
||||
// to this workload's event-log rows; the global feed lives at
|
||||
// /events/log.
|
||||
r.Get("/events", s.listWorkloadEvents)
|
||||
|
||||
// Per-workload env vars. Listing open to authenticated readers;
|
||||
// mutations admin-gated. Encrypted values are write-only after store.
|
||||
r.Get("/env", s.listWorkloadEnv)
|
||||
@@ -312,6 +409,15 @@ func (s *Server) Router() chi.Router {
|
||||
// of /triggers/{id}/bindings keyed on the workload side.
|
||||
r.Get("/triggers", s.listBindingsForWorkload)
|
||||
r.With(auth.AdminOnly).Post("/triggers", s.bindTriggerToWorkload)
|
||||
|
||||
// Per-workload notification routes — multi-destination
|
||||
// fan-out (Slack channel + Discord webhook + ...). When
|
||||
// zero rows are configured the dispatcher falls back to
|
||||
// the legacy single-URL columns on the workload row.
|
||||
r.Get("/notifications", s.listWorkloadNotifications)
|
||||
r.With(auth.AdminOnly).Post("/notifications", s.createWorkloadNotification)
|
||||
r.With(auth.AdminOnly).Put("/notifications/{nid}", s.updateWorkloadNotification)
|
||||
r.With(auth.AdminOnly).Delete("/notifications/{nid}", s.deleteWorkloadNotification)
|
||||
})
|
||||
|
||||
// Global container index, joined to workload + app names.
|
||||
@@ -370,6 +476,26 @@ func (s *Server) Router() chi.Router {
|
||||
r.Post("/log-scan-rules/{id}/test", s.testLogScanRule)
|
||||
})
|
||||
|
||||
// Metric-alert rules.
|
||||
r.Get("/metric-alert-rules", s.listMetricAlertRules)
|
||||
r.Get("/metric-alert-rules/{id}", s.getMetricAlertRule)
|
||||
r.Group(func(r chi.Router) {
|
||||
r.Use(auth.AdminOnly)
|
||||
r.Post("/metric-alert-rules", s.createMetricAlertRule)
|
||||
r.Patch("/metric-alert-rules/{id}", s.updateMetricAlertRule)
|
||||
r.Delete("/metric-alert-rules/{id}", s.deleteMetricAlertRule)
|
||||
})
|
||||
|
||||
// Shared secrets (env vars shared across workloads by scope).
|
||||
r.Get("/shared-secrets", s.listSharedSecrets)
|
||||
r.Get("/shared-secrets/{id}", s.getSharedSecret)
|
||||
r.Group(func(r chi.Router) {
|
||||
r.Use(auth.AdminOnly)
|
||||
r.Post("/shared-secrets", s.createSharedSecret)
|
||||
r.Patch("/shared-secrets/{id}", s.updateSharedSecret)
|
||||
r.Delete("/shared-secrets/{id}", s.deleteSharedSecret)
|
||||
})
|
||||
|
||||
// System resources (read-only).
|
||||
r.Get("/system/stats", s.getSystemStats)
|
||||
r.Get("/system/stats/history", s.getSystemStatsHistory)
|
||||
@@ -379,6 +505,12 @@ func (s *Server) Router() chi.Router {
|
||||
r.Group(func(r chi.Router) {
|
||||
r.Use(auth.AdminOnly)
|
||||
|
||||
// Prometheus-format metrics export. Admin-only so the
|
||||
// counter cardinality cannot be enumerated by a low-trust
|
||||
// viewer to map internal endpoints / sources / outcomes.
|
||||
// Scrape with bearer auth from your Prometheus job.
|
||||
r.Get("/metrics", s.metricsExport)
|
||||
|
||||
// Config export (reveals registry/global details).
|
||||
r.Get("/config/export", s.exportConfig)
|
||||
|
||||
@@ -414,6 +546,7 @@ func (s *Server) Router() chi.Router {
|
||||
|
||||
// Docker management.
|
||||
r.Post("/docker/prune-images", s.pruneImages)
|
||||
r.Post("/docker/prune-build-cache", s.pruneBuildCache)
|
||||
|
||||
// NPM connection test.
|
||||
r.Post("/settings/npm/test", s.testNpmConnection)
|
||||
@@ -431,6 +564,11 @@ func (s *Server) Router() chi.Router {
|
||||
r.Get("/backups/{id}/download", s.downloadBackup)
|
||||
r.Delete("/backups/{id}", s.deleteBackup)
|
||||
r.Post("/backups/{id}/restore", s.restoreBackup)
|
||||
|
||||
// Volume-snapshot download/delete (workload-scoped capture +
|
||||
// list live under /workloads/{id}/snapshots).
|
||||
r.Get("/snapshots/{sid}/download", s.downloadSnapshot)
|
||||
r.Delete("/snapshots/{sid}", s.deleteSnapshot)
|
||||
})
|
||||
})
|
||||
})
|
||||
|
||||
@@ -0,0 +1,272 @@
|
||||
package api
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"log/slog"
|
||||
"net/http"
|
||||
"strings"
|
||||
|
||||
"github.com/go-chi/chi/v5"
|
||||
|
||||
"github.com/alexei/tinyforge/internal/crypto"
|
||||
"github.com/alexei/tinyforge/internal/store"
|
||||
)
|
||||
|
||||
// sharedSecretRow is the client-facing JSON view of a shared secret.
//
// The secret value itself is deliberately absent: after it is stored it is
// write-only (the same policy as workload_env). HasValue only reports whether
// a value is present so the UI can render that state; rotating the secret
// means submitting a new value.
type sharedSecretRow struct {
	ID          string `json:"id"`
	Name        string `json:"name"`
	HasValue    bool   `json:"has_value"` // true when a value is stored; the value is never echoed
	Encrypted   bool   `json:"encrypted"`
	Scope       string `json:"scope"`
	AppID       string `json:"app_id"`
	Description string `json:"description"`
	Enabled     bool   `json:"enabled"`
	CreatedAt   string `json:"created_at"`
	UpdatedAt   string `json:"updated_at"`
}
|
||||
|
||||
func toSharedSecretRow(sec store.SharedSecret) sharedSecretRow {
|
||||
return sharedSecretRow{
|
||||
ID: sec.ID,
|
||||
Name: sec.Name,
|
||||
HasValue: sec.Value != "",
|
||||
Encrypted: sec.Encrypted,
|
||||
Scope: sec.Scope,
|
||||
AppID: sec.AppID,
|
||||
Description: sec.Description,
|
||||
Enabled: sec.Enabled,
|
||||
CreatedAt: sec.CreatedAt,
|
||||
UpdatedAt: sec.UpdatedAt,
|
||||
}
|
||||
}
|
||||
|
||||
// listSharedSecrets handles GET /api/shared-secrets. Values are redacted.
|
||||
func (s *Server) listSharedSecrets(w http.ResponseWriter, r *http.Request) {
|
||||
rows, err := s.store.ListSharedSecrets()
|
||||
if err != nil {
|
||||
respondError(w, http.StatusInternalServerError, "list shared secrets")
|
||||
return
|
||||
}
|
||||
out := make([]sharedSecretRow, 0, len(rows))
|
||||
for _, sec := range rows {
|
||||
out = append(out, toSharedSecretRow(sec))
|
||||
}
|
||||
respondJSON(w, http.StatusOK, out)
|
||||
}
|
||||
|
||||
// getSharedSecret handles GET /api/shared-secrets/{id}. Value is redacted.
|
||||
func (s *Server) getSharedSecret(w http.ResponseWriter, r *http.Request) {
|
||||
sec, err := s.store.GetSharedSecret(chi.URLParam(r, "id"))
|
||||
if err != nil {
|
||||
if errors.Is(err, store.ErrNotFound) {
|
||||
respondNotFound(w, "shared secret")
|
||||
return
|
||||
}
|
||||
respondError(w, http.StatusInternalServerError, "get shared secret")
|
||||
return
|
||||
}
|
||||
respondJSON(w, http.StatusOK, toSharedSecretRow(sec))
|
||||
}
|
||||
|
||||
// createSharedSecretRequest is the body accepted by POST /api/shared-secrets.
//
// Encrypted and Enabled are pointers so that an omitted field can be told
// apart from an explicit false; both are treated as true when omitted. With
// Encrypted in effect, a non-empty value is encrypted at rest with the global
// key before it reaches the store.
type createSharedSecretRequest struct {
	Name        string `json:"name"`
	Value       string `json:"value"`
	Encrypted   *bool  `json:"encrypted"` // nil → true
	Scope       string `json:"scope"`     // "global" or "app"
	AppID       string `json:"app_id"`    // mandatory for scope "app"
	Description string `json:"description"`
	Enabled     *bool  `json:"enabled"` // nil → true
}
|
||||
|
||||
func (s *Server) createSharedSecret(w http.ResponseWriter, r *http.Request) {
|
||||
var req createSharedSecretRequest
|
||||
if !decodeJSONStrict(w, r, &req) {
|
||||
return
|
||||
}
|
||||
req.Name = strings.TrimSpace(req.Name)
|
||||
if !validEnvKey(req.Name) {
|
||||
respondError(w, http.StatusBadRequest, "name must be a valid env key [A-Za-z_][A-Za-z0-9_]*")
|
||||
return
|
||||
}
|
||||
if msg := validateSharedSecretScope(req.Scope, req.AppID); msg != "" {
|
||||
respondError(w, http.StatusBadRequest, msg)
|
||||
return
|
||||
}
|
||||
|
||||
encrypted := true
|
||||
if req.Encrypted != nil {
|
||||
encrypted = *req.Encrypted
|
||||
}
|
||||
enabled := true
|
||||
if req.Enabled != nil {
|
||||
enabled = *req.Enabled
|
||||
}
|
||||
|
||||
value, err := s.encryptSecretValue(req.Value, encrypted)
|
||||
if err != nil {
|
||||
respondError(w, http.StatusInternalServerError, "encrypt value")
|
||||
return
|
||||
}
|
||||
|
||||
sec, err := s.store.CreateSharedSecret(store.SharedSecret{
|
||||
Name: req.Name,
|
||||
Value: value,
|
||||
Encrypted: encrypted,
|
||||
Scope: req.Scope,
|
||||
AppID: strings.TrimSpace(req.AppID),
|
||||
Description: req.Description,
|
||||
Enabled: enabled,
|
||||
})
|
||||
if err != nil {
|
||||
if errors.Is(err, store.ErrUnique) {
|
||||
respondError(w, http.StatusConflict, "a shared secret with this scope and name already exists")
|
||||
return
|
||||
}
|
||||
respondError(w, http.StatusInternalServerError, "create shared secret")
|
||||
return
|
||||
}
|
||||
respondJSON(w, http.StatusCreated, toSharedSecretRow(sec))
|
||||
}
|
||||
|
||||
// updateSharedSecretRequest is the body accepted by
// PATCH /api/shared-secrets/{id}.
//
// All fields are pointers: nil means "keep the stored setting". In particular
// a nil Value leaves the stored value alone, so editing only metadata can never
// blank a secret, while a non-nil Value rotates it and is re-encrypted
// according to the Encrypted flag in effect after the merge.
type updateSharedSecretRequest struct {
	Name        *string `json:"name"`
	Value       *string `json:"value"`
	Encrypted   *bool   `json:"encrypted"`
	Scope       *string `json:"scope"`
	AppID       *string `json:"app_id"`
	Description *string `json:"description"`
	Enabled     *bool   `json:"enabled"`
}
|
||||
|
||||
func (s *Server) updateSharedSecret(w http.ResponseWriter, r *http.Request) {
|
||||
id := chi.URLParam(r, "id")
|
||||
existing, err := s.store.GetSharedSecret(id)
|
||||
if err != nil {
|
||||
if errors.Is(err, store.ErrNotFound) {
|
||||
respondNotFound(w, "shared secret")
|
||||
return
|
||||
}
|
||||
respondError(w, http.StatusInternalServerError, "get shared secret")
|
||||
return
|
||||
}
|
||||
|
||||
var req updateSharedSecretRequest
|
||||
if !decodeJSONStrict(w, r, &req) {
|
||||
return
|
||||
}
|
||||
|
||||
merged := existing
|
||||
if req.Name != nil {
|
||||
merged.Name = strings.TrimSpace(*req.Name)
|
||||
if !validEnvKey(merged.Name) {
|
||||
respondError(w, http.StatusBadRequest, "name must be a valid env key [A-Za-z_][A-Za-z0-9_]*")
|
||||
return
|
||||
}
|
||||
}
|
||||
if req.Encrypted != nil {
|
||||
merged.Encrypted = *req.Encrypted
|
||||
}
|
||||
if req.Scope != nil {
|
||||
merged.Scope = *req.Scope
|
||||
}
|
||||
if req.AppID != nil {
|
||||
merged.AppID = strings.TrimSpace(*req.AppID)
|
||||
}
|
||||
if req.Description != nil {
|
||||
merged.Description = *req.Description
|
||||
}
|
||||
if req.Enabled != nil {
|
||||
merged.Enabled = *req.Enabled
|
||||
}
|
||||
if msg := validateSharedSecretScope(merged.Scope, merged.AppID); msg != "" {
|
||||
respondError(w, http.StatusBadRequest, msg)
|
||||
return
|
||||
}
|
||||
|
||||
// Value handling: only (re)encrypt when the caller supplied a new value.
|
||||
// Otherwise keep the stored ciphertext untouched — but if the Encrypted
|
||||
// flag flipped without a new value we cannot transcode the opaque stored
|
||||
// bytes, so reject that ambiguous request rather than corrupting the row.
|
||||
if req.Value != nil {
|
||||
v, encErr := s.encryptSecretValue(*req.Value, merged.Encrypted)
|
||||
if encErr != nil {
|
||||
respondError(w, http.StatusInternalServerError, "encrypt value")
|
||||
return
|
||||
}
|
||||
merged.Value = v
|
||||
} else if req.Encrypted != nil && *req.Encrypted != existing.Encrypted {
|
||||
respondError(w, http.StatusBadRequest, "changing 'encrypted' requires resubmitting 'value'")
|
||||
return
|
||||
}
|
||||
|
||||
sec, err := s.store.UpdateSharedSecret(merged)
|
||||
if err != nil {
|
||||
if errors.Is(err, store.ErrNotFound) {
|
||||
respondNotFound(w, "shared secret")
|
||||
return
|
||||
}
|
||||
if errors.Is(err, store.ErrUnique) {
|
||||
respondError(w, http.StatusConflict, "a shared secret with this scope and name already exists")
|
||||
return
|
||||
}
|
||||
respondError(w, http.StatusInternalServerError, "update shared secret")
|
||||
return
|
||||
}
|
||||
respondJSON(w, http.StatusOK, toSharedSecretRow(sec))
|
||||
}
|
||||
|
||||
func (s *Server) deleteSharedSecret(w http.ResponseWriter, r *http.Request) {
|
||||
id := chi.URLParam(r, "id")
|
||||
if err := s.store.DeleteSharedSecret(id); err != nil {
|
||||
if errors.Is(err, store.ErrNotFound) {
|
||||
respondNotFound(w, "shared secret")
|
||||
return
|
||||
}
|
||||
respondError(w, http.StatusInternalServerError, "delete shared secret")
|
||||
return
|
||||
}
|
||||
respondJSON(w, http.StatusOK, map[string]string{"deleted": id})
|
||||
}
|
||||
|
||||
// encryptSecretValue encrypts value with the global key when encrypted is set
|
||||
// and the value is non-empty; otherwise it returns the value unchanged. An
|
||||
// empty value stays empty (no value set) regardless of the flag.
|
||||
func (s *Server) encryptSecretValue(value string, encrypted bool) (string, error) {
|
||||
if !encrypted || value == "" {
|
||||
return value, nil
|
||||
}
|
||||
enc, err := crypto.Encrypt(s.encKey, value)
|
||||
if err != nil {
|
||||
slog.Error("encrypt shared secret value", "error", err)
|
||||
return "", err
|
||||
}
|
||||
return enc, nil
|
||||
}
|
||||
|
||||
// validateSharedSecretScope returns a non-empty 400 message when the scope /
|
||||
// app_id pairing is invalid; "" when valid. Mirrors the store-side invariant
|
||||
// so the API rejects with a clear message before hitting the store.
|
||||
func validateSharedSecretScope(scope, appID string) string {
|
||||
switch scope {
|
||||
case store.SharedSecretScopeGlobal:
|
||||
return ""
|
||||
case store.SharedSecretScopeApp:
|
||||
if strings.TrimSpace(appID) == "" {
|
||||
return "app_id is required when scope is 'app'"
|
||||
}
|
||||
return ""
|
||||
default:
|
||||
return "scope must be 'global' or 'app'"
|
||||
}
|
||||
}
|
||||
+19
-2
@@ -32,9 +32,26 @@ func (s *Server) streamEvents(w http.ResponseWriter, r *http.Request) {
|
||||
w.WriteHeader(http.StatusOK)
|
||||
flusher.Flush()
|
||||
|
||||
// Subscribe to instance status, deploy status, and persistent event log events.
|
||||
// Build logs are high-volume: a single verbose `docker build` can emit
|
||||
// thousands of lines. Streaming them to EVERY connection would flood each
|
||||
// subscriber's bounded bus buffer and evict status/log events for ALL
|
||||
// clients. So build logs are delivered ONLY to connections that opt in
|
||||
// with ?workload_id=<id>, and only for that workload. Connections without
|
||||
// the param (e.g. the global dashboard) never receive build-log frames.
|
||||
buildLogWorkloadID := r.URL.Query().Get("workload_id")
|
||||
sub := s.eventBus.Subscribe(func(evt events.Event) bool {
|
||||
return evt.Type == events.EventInstanceStatus || evt.Type == events.EventDeployStatus || evt.Type == events.EventLog
|
||||
switch evt.Type {
|
||||
case events.EventInstanceStatus, events.EventDeployStatus, events.EventLog:
|
||||
return true
|
||||
case events.EventBuildLog:
|
||||
if buildLogWorkloadID == "" {
|
||||
return false
|
||||
}
|
||||
p, ok := evt.Payload.(events.BuildLogPayload)
|
||||
return ok && p.WorkloadID == buildLogWorkloadID
|
||||
default:
|
||||
return false
|
||||
}
|
||||
})
|
||||
defer s.eventBus.Unsubscribe(sub)
|
||||
|
||||
|
||||
@@ -1,12 +1,15 @@
|
||||
package api
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"log/slog"
|
||||
"net/http"
|
||||
"sort"
|
||||
"strconv"
|
||||
"time"
|
||||
|
||||
"github.com/go-chi/chi/v5"
|
||||
|
||||
"github.com/alexei/tinyforge/internal/auth"
|
||||
"github.com/alexei/tinyforge/internal/store"
|
||||
)
|
||||
@@ -85,6 +88,76 @@ func (s *Server) getSystemStatsHistory(w http.ResponseWriter, r *http.Request) {
|
||||
respondJSON(w, http.StatusOK, samples)
|
||||
}
|
||||
|
||||
// workloadStatsPoint is a single timestamp bucket in a workload's metrics
// series. The samples of every container owned by the workload are folded
// into one point per timestamp, so a multi-container (compose) workload plots
// as one line.
type workloadStatsPoint struct {
	TS          int64   `json:"ts"`
	CPUPercent  float64 `json:"cpu_percent"`  // summed across containers
	MemoryUsage int64   `json:"memory_usage"` // summed across containers
	// MemoryLimit is the largest limit among the containers — the effective
	// ceiling. The UI plots absolute MiB instead because the limit is often
	// 0 (unlimited).
	MemoryLimit int64 `json:"memory_limit"`
}
|
||||
|
||||
// getWorkloadStatsHistory handles GET /api/workloads/{id}/stats/history?window=1h.
|
||||
// Read-only and open to any authenticated user (mirrors the per-workload
|
||||
// events/runtime-state feeds). Always returns a (possibly empty) array — never
|
||||
// 503 — because samples come from SQLite, which is available even when the
|
||||
// Docker daemon is down or stats collection is disabled. Unknown workload id
|
||||
// 404s; a known workload with no samples yet returns [].
|
||||
func (s *Server) getWorkloadStatsHistory(w http.ResponseWriter, r *http.Request) {
|
||||
id := chi.URLParam(r, "id")
|
||||
if id == "" {
|
||||
respondError(w, http.StatusBadRequest, "workload id is required")
|
||||
return
|
||||
}
|
||||
if _, err := s.store.GetWorkloadByID(id); err != nil {
|
||||
if errors.Is(err, store.ErrNotFound) {
|
||||
respondNotFound(w, "workload")
|
||||
return
|
||||
}
|
||||
respondError(w, http.StatusInternalServerError, "get workload")
|
||||
return
|
||||
}
|
||||
|
||||
samples, err := s.store.ListContainerStatsSamplesByWorkload(id, sinceTimestamp(parseWindow(r)))
|
||||
if err != nil {
|
||||
slog.Error("failed to list workload stats samples", "workload", id, "error", err)
|
||||
respondError(w, http.StatusInternalServerError, "failed to list samples")
|
||||
return
|
||||
}
|
||||
|
||||
respondJSON(w, http.StatusOK, aggregateWorkloadStats(samples))
|
||||
}
|
||||
|
||||
// aggregateWorkloadStats folds per-container samples into one series keyed by
|
||||
// timestamp: CPU% and memory usage are summed across the workload's containers,
|
||||
// memory limit takes the max. Samples arrive ts-ascending, so the output keeps
|
||||
// that order without an extra sort.
|
||||
func aggregateWorkloadStats(samples []store.ContainerStatsSample) []workloadStatsPoint {
|
||||
points := make([]workloadStatsPoint, 0)
|
||||
idx := make(map[int64]int) // ts → index in points
|
||||
for _, sm := range samples {
|
||||
if i, ok := idx[sm.TS]; ok {
|
||||
points[i].CPUPercent += sm.CPUPercent
|
||||
points[i].MemoryUsage += sm.MemoryUsage
|
||||
if sm.MemoryLimit > points[i].MemoryLimit {
|
||||
points[i].MemoryLimit = sm.MemoryLimit
|
||||
}
|
||||
continue
|
||||
}
|
||||
idx[sm.TS] = len(points)
|
||||
points = append(points, workloadStatsPoint{
|
||||
TS: sm.TS,
|
||||
CPUPercent: sm.CPUPercent,
|
||||
MemoryUsage: sm.MemoryUsage,
|
||||
MemoryLimit: sm.MemoryLimit,
|
||||
})
|
||||
}
|
||||
return points
|
||||
}
|
||||
|
||||
// listTopContainers handles GET /api/system/stats/top?limit=5&by=cpu.
|
||||
// Returns the top-N most recent samples across containers, sorted by CPU or
|
||||
// memory. Container IDs are stripped for non-admins so a low-privilege viewer
|
||||
|
||||
@@ -0,0 +1,64 @@
|
||||
package api
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
"github.com/alexei/tinyforge/internal/store"
|
||||
)
|
||||
|
||||
func TestAggregateWorkloadStats_SumsPerTimestamp(t *testing.T) {
|
||||
// Two containers reporting at the same two ticks → summed per ts.
|
||||
samples := []store.ContainerStatsSample{
|
||||
{TS: 100, CPUPercent: 10, MemoryUsage: 1000, MemoryLimit: 4000},
|
||||
{TS: 100, CPUPercent: 5, MemoryUsage: 500, MemoryLimit: 8000},
|
||||
{TS: 200, CPUPercent: 20, MemoryUsage: 2000, MemoryLimit: 4000},
|
||||
}
|
||||
pts := aggregateWorkloadStats(samples)
|
||||
if len(pts) != 2 {
|
||||
t.Fatalf("expected 2 buckets, got %d", len(pts))
|
||||
}
|
||||
if pts[0].TS != 100 || pts[0].CPUPercent != 15 || pts[0].MemoryUsage != 1500 {
|
||||
t.Fatalf("ts=100 bucket wrong: %+v", pts[0])
|
||||
}
|
||||
// Memory limit takes the max across containers.
|
||||
if pts[0].MemoryLimit != 8000 {
|
||||
t.Fatalf("expected max memory limit 8000, got %d", pts[0].MemoryLimit)
|
||||
}
|
||||
if pts[1].TS != 200 || pts[1].CPUPercent != 20 {
|
||||
t.Fatalf("ts=200 bucket wrong: %+v", pts[1])
|
||||
}
|
||||
}
|
||||
|
||||
func TestAggregateWorkloadStats_Empty(t *testing.T) {
|
||||
pts := aggregateWorkloadStats(nil)
|
||||
if pts == nil {
|
||||
t.Fatal("expected non-nil empty slice for clean JSON []")
|
||||
}
|
||||
if len(pts) != 0 {
|
||||
t.Fatalf("expected 0 points, got %d", len(pts))
|
||||
}
|
||||
}
|
||||
|
||||
func TestWorkloadStatsHistory_UnknownWorkload404(t *testing.T) {
|
||||
e := newAPITestEnv(t)
|
||||
resp := e.do(t, "GET", "/api/workloads/nope/stats/history", nil)
|
||||
if resp.StatusCode != 404 {
|
||||
t.Fatalf("expected 404 for unknown workload, got %d", resp.StatusCode)
|
||||
}
|
||||
}
|
||||
|
||||
func TestWorkloadStatsHistory_KnownWorkloadEmpty(t *testing.T) {
|
||||
e := newAPITestEnv(t)
|
||||
id := createImageWorkload(t, e, "metrics-app")
|
||||
resp := e.do(t, "GET", "/api/workloads/"+id+"/stats/history", nil)
|
||||
if resp.StatusCode != 200 {
|
||||
t.Fatalf("expected 200, got %d", resp.StatusCode)
|
||||
}
|
||||
var pts []workloadStatsPoint
|
||||
if errMsg := decodeEnvelope(t, resp, &pts); errMsg != "" {
|
||||
t.Fatalf("envelope error: %q", errMsg)
|
||||
}
|
||||
if len(pts) != 0 {
|
||||
t.Fatalf("expected empty series for app with no samples, got %d", len(pts))
|
||||
}
|
||||
}
|
||||
@@ -89,12 +89,16 @@ func toTriggerViewWithCount(row store.TriggerWithBindingCount) triggerView {
|
||||
// triggerRequest is the create/update body. Config is opaque per kind.
|
||||
// Auto-generates a webhook secret on create when WebhookEnabled is true;
|
||||
// the secret is exposed only via the /webhook subresource.
|
||||
//
|
||||
// WebhookRequireSignature is a *bool so we can distinguish "field omitted
|
||||
// by client" (nil → apply secure default of true when webhook is enabled)
|
||||
// from an explicit opt-out (false → respected).
|
||||
type triggerRequest struct {
|
||||
Kind string `json:"kind"`
|
||||
Name string `json:"name"`
|
||||
Config json.RawMessage `json:"config"`
|
||||
WebhookEnabled bool `json:"webhook_enabled"`
|
||||
WebhookRequireSignature bool `json:"webhook_require_signature"`
|
||||
WebhookRequireSignature *bool `json:"webhook_require_signature,omitempty"`
|
||||
}
|
||||
|
||||
// Same per-blob caps used on the workload pluginWorkloadRequest path —
|
||||
@@ -134,12 +138,26 @@ func (s *Server) getTrigger(w http.ResponseWriter, r *http.Request) {
|
||||
// buildTriggerFromRequest assembles a store.Trigger ready for insert.
|
||||
// Centralized so the standalone create endpoint and the inline-bind
|
||||
// endpoint cannot drift on secret-generation defaults.
|
||||
//
|
||||
// SECURITY: a new trigger with webhook enabled defaults to require_signature
|
||||
// = true. Operators can opt out at create time for receivers that do not
|
||||
// support HMAC, but the safer default avoids the "freshly-created trigger
|
||||
// accepts unsigned posts to its URL" footgun.
|
||||
func buildTriggerFromRequest(req triggerRequest) store.Trigger {
|
||||
// Secure default: if webhook is enabled and the operator did NOT
|
||||
// explicitly set require_signature, force it on. Explicit false is
|
||||
// preserved (legacy receivers without HMAC support still work).
|
||||
requireSig := false
|
||||
if req.WebhookRequireSignature != nil {
|
||||
requireSig = *req.WebhookRequireSignature
|
||||
} else if req.WebhookEnabled {
|
||||
requireSig = true
|
||||
}
|
||||
t := store.Trigger{
|
||||
Kind: req.Kind,
|
||||
Name: strings.TrimSpace(req.Name),
|
||||
Config: string(req.Config),
|
||||
WebhookRequireSignature: req.WebhookRequireSignature,
|
||||
WebhookRequireSignature: requireSig,
|
||||
}
|
||||
if req.WebhookEnabled {
|
||||
t.WebhookSecret = generateWebhookSecret()
|
||||
@@ -199,7 +217,13 @@ func (s *Server) updateTrigger(w http.ResponseWriter, r *http.Request) {
|
||||
if len(req.Config) > 0 {
|
||||
existing.Config = string(req.Config)
|
||||
}
|
||||
existing.WebhookRequireSignature = req.WebhookRequireSignature
|
||||
if req.WebhookRequireSignature != nil {
|
||||
existing.WebhookRequireSignature = *req.WebhookRequireSignature
|
||||
} else if req.WebhookEnabled && !existing.WebhookRequireSignature {
|
||||
// Re-enabling webhook without specifying the signature flag —
|
||||
// take the secure default.
|
||||
existing.WebhookRequireSignature = true
|
||||
}
|
||||
wasEnabled := existing.WebhookSecret != ""
|
||||
if req.WebhookEnabled && !wasEnabled {
|
||||
// false→true transition: rotate both secrets so re-enabling
|
||||
|
||||
@@ -0,0 +1,243 @@
|
||||
package api
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"io"
|
||||
"log/slog"
|
||||
"net/http"
|
||||
"os"
|
||||
"path/filepath"
|
||||
|
||||
"github.com/go-chi/chi/v5"
|
||||
|
||||
"github.com/alexei/tinyforge/internal/store"
|
||||
"github.com/alexei/tinyforge/internal/volsnap"
|
||||
)
|
||||
|
||||
// listWorkloadSnapshots handles GET /api/workloads/{id}/snapshots.
|
||||
func (s *Server) listWorkloadSnapshots(w http.ResponseWriter, r *http.Request) {
|
||||
if s.snapshotEngine == nil {
|
||||
respondError(w, http.StatusServiceUnavailable, "snapshot engine not initialized")
|
||||
return
|
||||
}
|
||||
id := chi.URLParam(r, "id")
|
||||
snaps, err := s.snapshotEngine.List(id)
|
||||
if err != nil {
|
||||
slog.Error("snapshots: list", "workload", id, "error", err)
|
||||
respondError(w, http.StatusInternalServerError, "internal server error")
|
||||
return
|
||||
}
|
||||
respondJSON(w, http.StatusOK, snaps)
|
||||
}
|
||||
|
||||
// snapshotableVolume is the client-facing shape of one volume in the
// snapshotable response. Only the declared mount fields are exposed; the
// resolved host path is intentionally left out so the server's on-disk layout
// is not leaked.
type snapshotableVolume struct {
	// Target is copied from the volume reference's Target field.
	Target string `json:"target"`
	// Scope is copied from the volume reference's Scope field.
	Scope string `json:"scope"`
	// Source is copied from the volume reference's Source field.
	Source string `json:"source"`
}
|
||||
|
||||
// getWorkloadSnapshotable handles GET /api/workloads/{id}/snapshotable. It
|
||||
// tells the UI which volumes can be snapshotted and which are skipped (and
|
||||
// why), so users are never misled about coverage.
|
||||
func (s *Server) getWorkloadSnapshotable(w http.ResponseWriter, r *http.Request) {
|
||||
if s.snapshotEngine == nil {
|
||||
respondError(w, http.StatusServiceUnavailable, "snapshot engine not initialized")
|
||||
return
|
||||
}
|
||||
id := chi.URLParam(r, "id")
|
||||
workload, err := s.store.GetWorkloadByID(id)
|
||||
if err != nil {
|
||||
respondError(w, http.StatusNotFound, "workload not found")
|
||||
return
|
||||
}
|
||||
settings, err := s.store.GetSettings()
|
||||
if err != nil {
|
||||
respondError(w, http.StatusInternalServerError, "internal server error")
|
||||
return
|
||||
}
|
||||
refs, skipped, err := volsnap.SnapshotableVolumes(s.store, workload, settings)
|
||||
if err != nil {
|
||||
slog.Error("snapshots: enumerate", "workload", id, "error", err)
|
||||
respondError(w, http.StatusInternalServerError, "internal server error")
|
||||
return
|
||||
}
|
||||
|
||||
volumes := make([]snapshotableVolume, 0, len(refs))
|
||||
for _, ref := range refs {
|
||||
volumes = append(volumes, snapshotableVolume{Target: ref.Target, Scope: ref.Scope, Source: ref.Source})
|
||||
}
|
||||
if skipped == nil {
|
||||
skipped = []volsnap.SkippedVolume{}
|
||||
}
|
||||
respondJSON(w, http.StatusOK, map[string]any{
|
||||
"volumes": volumes,
|
||||
"skipped": skipped,
|
||||
})
|
||||
}
|
||||
|
||||
// createWorkloadSnapshot handles POST /api/workloads/{id}/snapshots.
|
||||
func (s *Server) createWorkloadSnapshot(w http.ResponseWriter, r *http.Request) {
|
||||
if s.snapshotEngine == nil {
|
||||
respondError(w, http.StatusServiceUnavailable, "snapshot engine not initialized")
|
||||
return
|
||||
}
|
||||
id := chi.URLParam(r, "id")
|
||||
workload, err := s.store.GetWorkloadByID(id)
|
||||
if err != nil {
|
||||
respondError(w, http.StatusNotFound, "workload not found")
|
||||
return
|
||||
}
|
||||
settings, err := s.store.GetSettings()
|
||||
if err != nil {
|
||||
respondError(w, http.StatusInternalServerError, "internal server error")
|
||||
return
|
||||
}
|
||||
|
||||
var body struct {
|
||||
Label string `json:"label"`
|
||||
}
|
||||
if r.ContentLength != 0 {
|
||||
if err := json.NewDecoder(io.LimitReader(r.Body, 1<<20)).Decode(&body); err != nil && !errors.Is(err, io.EOF) {
|
||||
respondError(w, http.StatusBadRequest, "invalid JSON body")
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
snap, err := s.snapshotEngine.Create(workload, settings, body.Label)
|
||||
if err != nil {
|
||||
// "no snapshottable volume data" is client-actionable (400, safe to
|
||||
// echo). Any other error is server-side: log the detail, return a
|
||||
// generic 500 so internal paths / DB text never reach the client.
|
||||
if errors.Is(err, volsnap.ErrNoSnapshotData) {
|
||||
respondError(w, http.StatusBadRequest, err.Error())
|
||||
return
|
||||
}
|
||||
slog.Error("snapshots: create", "workload", id, "error", err)
|
||||
respondError(w, http.StatusInternalServerError, "internal server error")
|
||||
return
|
||||
}
|
||||
respondJSON(w, http.StatusCreated, snap)
|
||||
}
|
||||
|
||||
// deleteSnapshot handles DELETE /api/snapshots/{sid}.
|
||||
func (s *Server) deleteSnapshot(w http.ResponseWriter, r *http.Request) {
|
||||
if s.snapshotEngine == nil {
|
||||
respondError(w, http.StatusServiceUnavailable, "snapshot engine not initialized")
|
||||
return
|
||||
}
|
||||
sid := chi.URLParam(r, "sid")
|
||||
if err := s.snapshotEngine.Delete(sid); err != nil {
|
||||
if errors.Is(err, store.ErrNotFound) {
|
||||
respondError(w, http.StatusNotFound, "snapshot not found")
|
||||
return
|
||||
}
|
||||
respondError(w, http.StatusInternalServerError, "failed to delete snapshot")
|
||||
return
|
||||
}
|
||||
respondJSON(w, http.StatusOK, map[string]string{"status": "deleted"})
|
||||
}
|
||||
|
||||
// restoreWorkloadSnapshot handles POST /api/workloads/{id}/snapshots/{sid}/restore.
|
||||
//
|
||||
// This is the most destructive workload action: it overwrites the app's live
|
||||
// volume data with the snapshot and recreates its containers. It is guarded like
|
||||
// the DB restore — admin-only, an X-Confirm-Restore header that must echo the
|
||||
// snapshot id (defeats CSRF form/img posts, which can't set custom headers), and
|
||||
// a per-workload single-flight so a double-click can't stack two restores. All
|
||||
// the dangerous lock/stop/swap/redeploy logic lives in Engine.Restore; this
|
||||
// handler only validates and delegates.
|
||||
func (s *Server) restoreWorkloadSnapshot(w http.ResponseWriter, r *http.Request) {
|
||||
if s.snapshotEngine == nil {
|
||||
respondError(w, http.StatusServiceUnavailable, "snapshot engine not initialized")
|
||||
return
|
||||
}
|
||||
id := chi.URLParam(r, "id")
|
||||
sid := chi.URLParam(r, "sid")
|
||||
|
||||
if confirm := r.Header.Get("X-Confirm-Restore"); confirm != sid {
|
||||
respondError(w, http.StatusBadRequest,
|
||||
"missing or mismatched X-Confirm-Restore header (must equal snapshot id)")
|
||||
return
|
||||
}
|
||||
|
||||
// Up-front validation for precise client errors (Engine.Restore re-checks
|
||||
// ownership + source kind under the lock).
|
||||
snap, err := s.snapshotEngine.Get(sid)
|
||||
if err != nil {
|
||||
respondError(w, http.StatusNotFound, "snapshot not found")
|
||||
return
|
||||
}
|
||||
if snap.WorkloadID != id {
|
||||
respondError(w, http.StatusBadRequest, "snapshot does not belong to this workload")
|
||||
return
|
||||
}
|
||||
row, ok := s.loadWorkload(w, id)
|
||||
if !ok {
|
||||
return
|
||||
}
|
||||
if row.SourceKind != "image" {
|
||||
respondError(w, http.StatusBadRequest, "restore is only supported for image-source workloads")
|
||||
return
|
||||
}
|
||||
|
||||
// Per-workload single-flight: reject a concurrent restore of the SAME
|
||||
// workload with 409 rather than queuing it behind the deployer lock.
|
||||
release, ok := s.volRestoreInFlight.TryLock(id)
|
||||
if !ok {
|
||||
respondError(w, http.StatusConflict, "a restore is already in progress for this workload")
|
||||
return
|
||||
}
|
||||
defer release()
|
||||
|
||||
if err := s.snapshotEngine.Restore(r.Context(), sid, id); err != nil {
|
||||
// Raw error (which can carry resolved host paths) stays in the log; the
|
||||
// client gets a generic message.
|
||||
slog.Error("snapshots: restore failed", "workload", id, "snapshot", sid, "error", err)
|
||||
respondError(w, http.StatusInternalServerError, "restore failed; see server logs")
|
||||
return
|
||||
}
|
||||
respondJSON(w, http.StatusOK, map[string]any{
|
||||
"status": "restored",
|
||||
"workload_id": id,
|
||||
"snapshot_id": sid,
|
||||
})
|
||||
}
|
||||
|
||||
// downloadSnapshot handles GET /api/snapshots/{sid}/download, streaming the
|
||||
// tar.gz archive. The resolved path is containment-checked against the
|
||||
// snapshot directory.
|
||||
func (s *Server) downloadSnapshot(w http.ResponseWriter, r *http.Request) {
|
||||
if s.snapshotEngine == nil {
|
||||
respondError(w, http.StatusServiceUnavailable, "snapshot engine not initialized")
|
||||
return
|
||||
}
|
||||
sid := chi.URLParam(r, "sid")
|
||||
snap, err := s.snapshotEngine.Get(sid)
|
||||
if err != nil {
|
||||
respondError(w, http.StatusNotFound, "snapshot not found")
|
||||
return
|
||||
}
|
||||
path, err := s.snapshotEngine.FilePath(snap)
|
||||
if err != nil {
|
||||
respondError(w, http.StatusForbidden, "access denied")
|
||||
return
|
||||
}
|
||||
f, err := os.Open(path)
|
||||
if err != nil {
|
||||
respondError(w, http.StatusNotFound, "snapshot file not found on disk")
|
||||
return
|
||||
}
|
||||
defer f.Close()
|
||||
stat, err := f.Stat()
|
||||
if err != nil {
|
||||
respondError(w, http.StatusInternalServerError, "failed to read snapshot file")
|
||||
return
|
||||
}
|
||||
name := filepath.Base(snap.Filename)
|
||||
w.Header().Set("Content-Type", "application/gzip")
|
||||
w.Header().Set("Content-Disposition", "attachment; filename=\""+name+"\"")
|
||||
http.ServeContent(w, r, name, stat.ModTime(), f)
|
||||
}
|
||||
@@ -0,0 +1,385 @@
|
||||
package api
|
||||
|
||||
import (
|
||||
"context"
|
||||
"io"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"sync"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/alexei/tinyforge/internal/auth"
|
||||
"github.com/alexei/tinyforge/internal/store"
|
||||
"github.com/alexei/tinyforge/internal/volsnap"
|
||||
"github.com/alexei/tinyforge/internal/webhook"
|
||||
)
|
||||
|
||||
// newSnapshotEnv builds an API test env with the volume-snapshot engine wired
|
||||
// (the shared newAPITestEnv does not wire it). dataDir holds the snapshot
|
||||
// archives; baseVol is where host-bind volume directories resolve.
|
||||
func newSnapshotEnv(t *testing.T) (*apiTestEnv, string) {
|
||||
t.Helper()
|
||||
st, err := store.New(":memory:")
|
||||
if err != nil {
|
||||
t.Fatalf("create store: %v", err)
|
||||
}
|
||||
t.Cleanup(func() { st.Close() })
|
||||
|
||||
encKey := [32]byte{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}
|
||||
dispatcher := &fakeAPIDispatcher{}
|
||||
wh := webhook.NewHandler(st)
|
||||
wh.SetPluginDispatcher(dispatcher)
|
||||
srv := NewServer(st, nil, nil, nil, dispatcher, nil, wh, nil, encKey)
|
||||
|
||||
snapEng, err := volsnap.New(st, t.TempDir())
|
||||
if err != nil {
|
||||
t.Fatalf("snapshot engine: %v", err)
|
||||
}
|
||||
srv.SetSnapshotEngine(snapEng)
|
||||
|
||||
httpsrv := httptest.NewServer(srv.Router())
|
||||
t.Cleanup(httpsrv.Close)
|
||||
|
||||
la := auth.NewLocalAuth(encKey)
|
||||
tok, err := la.GenerateToken(auth.Claims{UserID: "u-admin", Username: "admin", Role: "admin"})
|
||||
if err != nil {
|
||||
t.Fatalf("mint token: %v", err)
|
||||
}
|
||||
|
||||
baseVol := t.TempDir()
|
||||
settings, _ := st.GetSettings()
|
||||
settings.BaseVolumePath = baseVol
|
||||
if err := st.UpdateSettings(settings); err != nil {
|
||||
t.Fatalf("update settings: %v", err)
|
||||
}
|
||||
|
||||
return &apiTestEnv{srv: httpsrv, store: st, dispatcher: dispatcher, adminToken: tok.Token, encKey: encKey, snapEngine: snapEng}, baseVol
|
||||
}
|
||||
|
||||
// doRestore issues an authenticated restore POST, optionally setting the
|
||||
// X-Confirm-Restore header (pass confirm="" to omit it).
|
||||
func (e *apiTestEnv) doRestore(t *testing.T, workloadID, sid, confirm string) *http.Response {
|
||||
t.Helper()
|
||||
req, err := http.NewRequest(http.MethodPost,
|
||||
e.srv.URL+"/api/workloads/"+workloadID+"/snapshots/"+sid+"/restore", nil)
|
||||
if err != nil {
|
||||
t.Fatalf("new request: %v", err)
|
||||
}
|
||||
req.Header.Set("Authorization", "Bearer "+e.adminToken)
|
||||
if confirm != "" {
|
||||
req.Header.Set("X-Confirm-Restore", confirm)
|
||||
}
|
||||
resp, err := http.DefaultClient.Do(req)
|
||||
if err != nil {
|
||||
t.Fatalf("do request: %v", err)
|
||||
}
|
||||
return resp
|
||||
}
|
||||
|
||||
// okLifecycle is a no-op volsnap.Lifecycle for HTTP-layer happy-path tests; the
|
||||
// deep restore behavior is covered by the volsnap engine tests.
|
||||
type okLifecycle struct{ tag string }
|
||||
|
||||
func (l *okLifecycle) Lock(string) func() { return func() {} }
|
||||
func (l *okLifecycle) StopContainers(context.Context, string) (string, error) { return l.tag, nil }
|
||||
func (l *okLifecycle) Redeploy(context.Context, store.Workload, string) error { return nil }
|
||||
|
||||
func TestRestoreSnapshot_RequiresConfirmHeader(t *testing.T) {
|
||||
e, _ := newSnapshotEnv(t)
|
||||
w, _ := e.store.CreateWorkload(store.Workload{Name: "a", Kind: "project", SourceKind: "image", SourceConfig: `{"image":"x","port":80}`})
|
||||
snap, _ := e.store.CreateVolumeSnapshot(store.VolumeSnapshot{WorkloadID: w.ID, Filename: "f.tar.gz", Manifest: "[]"})
|
||||
|
||||
// Missing header → 400.
|
||||
resp := e.doRestore(t, w.ID, snap.ID, "")
|
||||
if resp.StatusCode != http.StatusBadRequest {
|
||||
t.Fatalf("missing header status = %d, want 400", resp.StatusCode)
|
||||
}
|
||||
resp.Body.Close()
|
||||
// Mismatched header → 400.
|
||||
resp = e.doRestore(t, w.ID, snap.ID, "not-the-sid")
|
||||
if resp.StatusCode != http.StatusBadRequest {
|
||||
t.Fatalf("mismatched header status = %d, want 400", resp.StatusCode)
|
||||
}
|
||||
resp.Body.Close()
|
||||
}
|
||||
|
||||
func TestRestoreSnapshot_WrongWorkload(t *testing.T) {
|
||||
e, _ := newSnapshotEnv(t)
|
||||
w, _ := e.store.CreateWorkload(store.Workload{Name: "a", Kind: "project", SourceKind: "image", SourceConfig: `{"image":"x","port":80}`})
|
||||
snap, _ := e.store.CreateVolumeSnapshot(store.VolumeSnapshot{WorkloadID: w.ID, Filename: "f.tar.gz", Manifest: "[]"})
|
||||
|
||||
resp := e.doRestore(t, "some-other-workload", snap.ID, snap.ID)
|
||||
if resp.StatusCode != http.StatusBadRequest {
|
||||
t.Fatalf("cross-workload restore status = %d, want 400", resp.StatusCode)
|
||||
}
|
||||
resp.Body.Close()
|
||||
}
|
||||
|
||||
func TestRestoreSnapshot_NonImageWorkload(t *testing.T) {
|
||||
e, _ := newSnapshotEnv(t)
|
||||
w, _ := e.store.CreateWorkload(store.Workload{Name: "site", Kind: "project", SourceKind: "static", SourceConfig: `{}`})
|
||||
snap, _ := e.store.CreateVolumeSnapshot(store.VolumeSnapshot{WorkloadID: w.ID, Filename: "f.tar.gz", Manifest: "[]"})
|
||||
|
||||
resp := e.doRestore(t, w.ID, snap.ID, snap.ID)
|
||||
if resp.StatusCode != http.StatusBadRequest {
|
||||
t.Fatalf("non-image restore status = %d, want 400", resp.StatusCode)
|
||||
}
|
||||
resp.Body.Close()
|
||||
}
|
||||
|
||||
func TestRestoreSnapshot_NotFound(t *testing.T) {
|
||||
e, _ := newSnapshotEnv(t)
|
||||
w, _ := e.store.CreateWorkload(store.Workload{Name: "a", Kind: "project", SourceKind: "image", SourceConfig: `{"image":"x","port":80}`})
|
||||
|
||||
resp := e.doRestore(t, w.ID, "missing-sid", "missing-sid")
|
||||
if resp.StatusCode != http.StatusNotFound {
|
||||
t.Fatalf("unknown snapshot status = %d, want 404", resp.StatusCode)
|
||||
}
|
||||
resp.Body.Close()
|
||||
}
|
||||
|
||||
func TestRestoreSnapshot_HappyPath(t *testing.T) {
|
||||
e, baseVol := newSnapshotEnv(t)
|
||||
e.snapEngine.SetLifecycle(&okLifecycle{tag: "v1"})
|
||||
|
||||
w, err := e.store.CreateWorkload(store.Workload{
|
||||
Name: "data-app", Kind: "project", SourceKind: "image",
|
||||
SourceConfig: `{"image":"reg/app","port":80,"volumes":[{"source":"data","target":"/data","scope":"project"}]}`,
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("create workload: %v", err)
|
||||
}
|
||||
if _, err := e.store.SetWorkloadVolume(store.WorkloadVolume{WorkloadID: w.ID, Target: "/data", Source: "data", Scope: "project"}); err != nil {
|
||||
t.Fatalf("set volume: %v", err)
|
||||
}
|
||||
id8 := w.ID
|
||||
if len(id8) > 8 {
|
||||
id8 = id8[:8]
|
||||
}
|
||||
hostDir := filepath.Join(baseVol, "data-app-"+id8, "data")
|
||||
if err := os.MkdirAll(hostDir, 0o755); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := os.WriteFile(filepath.Join(hostDir, "payload.txt"), []byte("ORIGINAL"), 0o644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
settings, _ := e.store.GetSettings()
|
||||
snap, err := e.snapEngine.Create(w, settings, "base")
|
||||
if err != nil {
|
||||
t.Fatalf("create snapshot: %v", err)
|
||||
}
|
||||
// Drift the live data, then restore.
|
||||
if err := os.WriteFile(filepath.Join(hostDir, "payload.txt"), []byte("CHANGED"), 0o644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
resp := e.doRestore(t, w.ID, snap.ID, snap.ID)
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
body, _ := io.ReadAll(resp.Body)
|
||||
resp.Body.Close()
|
||||
t.Fatalf("restore status = %d, body=%s", resp.StatusCode, body)
|
||||
}
|
||||
resp.Body.Close()
|
||||
if got, _ := os.ReadFile(filepath.Join(hostDir, "payload.txt")); string(got) != "ORIGINAL" {
|
||||
t.Errorf("payload.txt = %q, want ORIGINAL (restored)", got)
|
||||
}
|
||||
}
|
||||
|
||||
// blockingLifecycle blocks in Lock until released, signaling when entered — so
|
||||
// a test can hold one restore in-flight and assert a second is rejected 409.
|
||||
type blockingLifecycle struct {
|
||||
entered chan struct{}
|
||||
release chan struct{}
|
||||
once sync.Once
|
||||
}
|
||||
|
||||
func (l *blockingLifecycle) Lock(string) func() {
|
||||
l.once.Do(func() { close(l.entered) })
|
||||
<-l.release
|
||||
return func() {}
|
||||
}
|
||||
func (l *blockingLifecycle) StopContainers(context.Context, string) (string, error) { return "", nil }
|
||||
func (l *blockingLifecycle) Redeploy(context.Context, store.Workload, string) error { return nil }
|
||||
|
||||
// seedRestorable creates an image workload with a project volume + live data and
|
||||
// a captured snapshot, returning the workload and snapshot ids.
|
||||
func seedRestorable(t *testing.T, e *apiTestEnv, baseVol string) (workloadID, snapshotID string) {
|
||||
t.Helper()
|
||||
w, err := e.store.CreateWorkload(store.Workload{
|
||||
Name: "sf-app", Kind: "project", SourceKind: "image",
|
||||
SourceConfig: `{"image":"reg/app","port":80,"volumes":[{"source":"data","target":"/data","scope":"project"}]}`,
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("create workload: %v", err)
|
||||
}
|
||||
id8 := w.ID
|
||||
if len(id8) > 8 {
|
||||
id8 = id8[:8]
|
||||
}
|
||||
hostDir := filepath.Join(baseVol, "sf-app-"+id8, "data")
|
||||
if err := os.MkdirAll(hostDir, 0o755); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := os.WriteFile(filepath.Join(hostDir, "f.txt"), []byte("data"), 0o644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
settings, _ := e.store.GetSettings()
|
||||
snap, err := e.snapEngine.Create(w, settings, "base")
|
||||
if err != nil {
|
||||
t.Fatalf("create snapshot: %v", err)
|
||||
}
|
||||
return w.ID, snap.ID
|
||||
}
|
||||
|
||||
func TestRestoreSnapshot_SingleFlight409(t *testing.T) {
|
||||
e, baseVol := newSnapshotEnv(t)
|
||||
wid, sid := seedRestorable(t, e, baseVol)
|
||||
bl := &blockingLifecycle{entered: make(chan struct{}), release: make(chan struct{})}
|
||||
e.snapEngine.SetLifecycle(bl)
|
||||
|
||||
// Restore #1: passes validation, takes the single-flight, then blocks inside
|
||||
// the engine's Lock.
|
||||
go func() {
|
||||
resp := e.doRestore(t, wid, sid, sid)
|
||||
resp.Body.Close()
|
||||
}()
|
||||
|
||||
select {
|
||||
case <-bl.entered:
|
||||
case <-time.After(3 * time.Second):
|
||||
t.Fatal("first restore never reached the lifecycle lock")
|
||||
}
|
||||
|
||||
// Restore #2 for the same workload must be rejected fast with 409.
|
||||
resp := e.doRestore(t, wid, sid, sid)
|
||||
got := resp.StatusCode
|
||||
resp.Body.Close()
|
||||
close(bl.release) // let #1 finish
|
||||
if got != http.StatusConflict {
|
||||
t.Fatalf("concurrent restore status = %d, want 409", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestVolumeSnapshots_EndToEnd(t *testing.T) {
|
||||
e, baseVol := newSnapshotEnv(t)
|
||||
|
||||
w, err := e.store.CreateWorkload(store.Workload{
|
||||
Name: "data-app",
|
||||
Kind: "project",
|
||||
SourceKind: "image",
|
||||
SourceConfig: `{"image":"registry.example.com/owner/app","port":8080}`,
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("create workload: %v", err)
|
||||
}
|
||||
if _, err := e.store.SetWorkloadVolume(store.WorkloadVolume{
|
||||
WorkloadID: w.ID, Target: "/data", Source: "data", Scope: "project",
|
||||
}); err != nil {
|
||||
t.Fatalf("set volume: %v", err)
|
||||
}
|
||||
|
||||
// Materialize the resolved host-bind dir with a file so there is data to
|
||||
// capture. Layout mirrors ResolveWorkloadPath for project scope:
|
||||
// <baseVol>/<name>-<id8>/<source>.
|
||||
id8 := w.ID
|
||||
if len(id8) > 8 {
|
||||
id8 = id8[:8]
|
||||
}
|
||||
hostDir := filepath.Join(baseVol, "data-app-"+id8, "data")
|
||||
if err := os.MkdirAll(hostDir, 0o755); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := os.WriteFile(filepath.Join(hostDir, "payload.txt"), []byte("important"), 0o644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
// snapshotable lists the one host-bind volume.
|
||||
resp := e.do(t, http.MethodGet, "/api/workloads/"+w.ID+"/snapshotable", nil)
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
t.Fatalf("snapshotable status = %d", resp.StatusCode)
|
||||
}
|
||||
var snapable struct {
|
||||
Volumes []map[string]string `json:"volumes"`
|
||||
Skipped []map[string]string `json:"skipped"`
|
||||
}
|
||||
decodeEnvelope(t, resp, &snapable)
|
||||
if len(snapable.Volumes) != 1 || snapable.Volumes[0]["target"] != "/data" {
|
||||
t.Fatalf("expected 1 snapshotable volume /data, got %+v", snapable)
|
||||
}
|
||||
|
||||
// Create a snapshot.
|
||||
resp = e.do(t, http.MethodPost, "/api/workloads/"+w.ID+"/snapshots", map[string]string{"label": "before upgrade"})
|
||||
if resp.StatusCode != http.StatusCreated {
|
||||
t.Fatalf("create snapshot status = %d", resp.StatusCode)
|
||||
}
|
||||
var snap store.VolumeSnapshot
|
||||
decodeEnvelope(t, resp, &snap)
|
||||
if snap.ID == "" || snap.SizeBytes == 0 || snap.Label != "before upgrade" {
|
||||
t.Fatalf("unexpected snapshot: %+v", snap)
|
||||
}
|
||||
|
||||
// It appears in the list.
|
||||
resp = e.do(t, http.MethodGet, "/api/workloads/"+w.ID+"/snapshots", nil)
|
||||
var list []store.VolumeSnapshot
|
||||
decodeEnvelope(t, resp, &list)
|
||||
if len(list) != 1 || list[0].ID != snap.ID {
|
||||
t.Fatalf("expected 1 snapshot in list, got %+v", list)
|
||||
}
|
||||
|
||||
// Download streams a non-empty gzip archive (not the JSON envelope).
|
||||
resp = e.do(t, http.MethodGet, "/api/snapshots/"+snap.ID+"/download", nil)
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
t.Fatalf("download status = %d", resp.StatusCode)
|
||||
}
|
||||
if ct := resp.Header.Get("Content-Type"); ct != "application/gzip" {
|
||||
t.Errorf("download content-type = %q, want application/gzip", ct)
|
||||
}
|
||||
data, _ := io.ReadAll(resp.Body)
|
||||
resp.Body.Close()
|
||||
if len(data) == 0 {
|
||||
t.Error("download body is empty")
|
||||
}
|
||||
|
||||
// Delete removes it.
|
||||
resp = e.do(t, http.MethodDelete, "/api/snapshots/"+snap.ID, nil)
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
t.Fatalf("delete status = %d", resp.StatusCode)
|
||||
}
|
||||
resp = e.do(t, http.MethodGet, "/api/workloads/"+w.ID+"/snapshots", nil)
|
||||
var after []store.VolumeSnapshot
|
||||
decodeEnvelope(t, resp, &after)
|
||||
if len(after) != 0 {
|
||||
t.Fatalf("expected 0 snapshots after delete, got %d", len(after))
|
||||
}
|
||||
}
|
||||
|
||||
func TestCreateSnapshot_NoVolumeData_Returns400(t *testing.T) {
|
||||
e, _ := newSnapshotEnv(t)
|
||||
w, err := e.store.CreateWorkload(store.Workload{
|
||||
Name: "no-vol-app",
|
||||
Kind: "project",
|
||||
SourceKind: "image",
|
||||
SourceConfig: `{"image":"x","port":80}`,
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("create workload: %v", err)
|
||||
}
|
||||
resp := e.do(t, http.MethodPost, "/api/workloads/"+w.ID+"/snapshots", nil)
|
||||
if resp.StatusCode != http.StatusBadRequest {
|
||||
t.Fatalf("expected 400 for an app with no snapshottable volumes, got %d", resp.StatusCode)
|
||||
}
|
||||
resp.Body.Close()
|
||||
}
|
||||
|
||||
func TestSnapshotEndpoints_RequireWorkload(t *testing.T) {
|
||||
e, _ := newSnapshotEnv(t)
|
||||
// snapshotable on an unknown workload → 404.
|
||||
resp := e.do(t, http.MethodGet, "/api/workloads/does-not-exist/snapshotable", nil)
|
||||
if resp.StatusCode != http.StatusNotFound {
|
||||
t.Fatalf("snapshotable unknown workload = %d, want 404", resp.StatusCode)
|
||||
}
|
||||
resp.Body.Close()
|
||||
}
|
||||
@@ -13,18 +13,29 @@ import (
|
||||
"github.com/alexei/tinyforge/internal/auth"
|
||||
"github.com/alexei/tinyforge/internal/store"
|
||||
"github.com/alexei/tinyforge/internal/workload/plugin"
|
||||
"github.com/alexei/tinyforge/internal/workload/preview"
|
||||
)
|
||||
|
||||
// chainNode is the lightweight shape returned by /chain — we deliberately
// don't return full plugin.Workload values for ancestor/descendant rows
// because the secret fields don't belong in a chain-traversal response.
//
// IsPreview / PreviewBranch surface branch-preview children to the UI so it
// can render them in a dedicated "Preview environments" panel rather than as
// undistinguished stage children. They are computed against the chain's
// `self` workload via preview.IsPreviewChild — the canonical "this child is a
// branch preview" test that reverses the MaterializeForBranch naming formula.
// Both are zero-valued (false / "") for the parent and self nodes and for
// operator-created stage children.
//
// Each field is declared exactly once; PreviewBranch is omitted from the JSON
// when empty, while IsPreview is always emitted.
type chainNode struct {
	ID            string `json:"id"`
	Name          string `json:"name"`
	SourceKind    string `json:"source_kind"`
	TriggerKind   string `json:"trigger_kind"`
	IsPreview     bool   `json:"is_preview"`
	PreviewBranch string `json:"preview_branch,omitempty"`
	CreatedAt     string `json:"created_at"`
	UpdatedAt     string `json:"updated_at"`
}
|
||||
|
||||
func chainNodeOf(w store.Workload) chainNode {
|
||||
@@ -38,6 +49,32 @@ func chainNodeOf(w store.Workload) chainNode {
|
||||
}
|
||||
}
|
||||
|
||||
// previewBranchOf extracts the branch a preview child was materialized for
|
||||
// from its source_config (the `branch` key MaterializeForBranch wrote).
|
||||
// Returns "" on a missing/malformed config — the caller only calls this for
|
||||
// rows preview.IsPreviewChild already confirmed, so a blank result just means
|
||||
// the JSON couldn't be decoded.
|
||||
func previewBranchOf(w store.Workload) string {
|
||||
var cfg struct {
|
||||
Branch string `json:"branch"`
|
||||
}
|
||||
if w.SourceConfig != "" {
|
||||
_ = json.Unmarshal([]byte(w.SourceConfig), &cfg)
|
||||
}
|
||||
return cfg.Branch
|
||||
}
|
||||
|
||||
// childChainNode builds a chainNode for a child row, marking it as a branch
|
||||
// preview (and attaching its branch) when it was materialized from `self`.
|
||||
func childChainNode(self, child store.Workload) chainNode {
|
||||
node := chainNodeOf(child)
|
||||
if preview.IsPreviewChild(self, child) {
|
||||
node.IsPreview = true
|
||||
node.PreviewBranch = previewBranchOf(child)
|
||||
}
|
||||
return node
|
||||
}
|
||||
|
||||
// getWorkloadChain handles GET /api/workloads/{id}/chain.
|
||||
//
|
||||
// Returns the workload's parent (or nil), itself, and its direct children
|
||||
@@ -76,7 +113,7 @@ func (s *Server) getWorkloadChain(w http.ResponseWriter, r *http.Request) {
|
||||
}
|
||||
children := make([]chainNode, 0, len(childRows))
|
||||
for _, c := range childRows {
|
||||
children = append(children, chainNodeOf(c))
|
||||
children = append(children, childChainNode(self, c))
|
||||
}
|
||||
|
||||
respondJSON(w, http.StatusOK, map[string]any{
|
||||
|
||||
@@ -0,0 +1,147 @@
|
||||
package api
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
"github.com/alexei/tinyforge/internal/store"
|
||||
)
|
||||
|
||||
// TestChildChainNode_MarksPreviewChildren verifies the /chain DTO builder
|
||||
// distinguishes branch-preview children (materialized by the preview package)
|
||||
// from operator-created stage children that merely share the parent link.
|
||||
// The discriminator is preview.IsPreviewChild, which reverses the
|
||||
// MaterializeForBranch naming formula: name == template.Name + "/" + slug.
|
||||
func TestChildChainNode_MarksPreviewChildren(t *testing.T) {
|
||||
template := store.Workload{
|
||||
ID: "tmpl-1",
|
||||
Name: "myapp",
|
||||
SourceKind: "dockerfile",
|
||||
}
|
||||
|
||||
tests := []struct {
|
||||
name string
|
||||
child store.Workload
|
||||
wantPrev bool
|
||||
wantBranch string
|
||||
}{
|
||||
{
|
||||
name: "preview child is marked with its branch",
|
||||
child: store.Workload{
|
||||
ID: "child-prev",
|
||||
Name: "myapp/feat-login",
|
||||
SourceKind: "dockerfile",
|
||||
SourceConfig: `{"branch":"feat/login","port":3000}`,
|
||||
ParentWorkloadID: "tmpl-1",
|
||||
},
|
||||
wantPrev: true,
|
||||
wantBranch: "feat/login",
|
||||
},
|
||||
{
|
||||
name: "operator-named stage child sharing the parent is not a preview",
|
||||
child: store.Workload{
|
||||
ID: "child-stage",
|
||||
Name: "myapp-staging",
|
||||
SourceKind: "dockerfile",
|
||||
SourceConfig: `{"branch":"main"}`,
|
||||
ParentWorkloadID: "tmpl-1",
|
||||
},
|
||||
wantPrev: false,
|
||||
wantBranch: "",
|
||||
},
|
||||
{
|
||||
name: "child of a different parent is not a preview of self",
|
||||
child: store.Workload{
|
||||
ID: "child-other",
|
||||
Name: "myapp/feat-login",
|
||||
SourceKind: "dockerfile",
|
||||
SourceConfig: `{"branch":"feat/login"}`,
|
||||
ParentWorkloadID: "some-other-template",
|
||||
},
|
||||
wantPrev: false,
|
||||
wantBranch: "",
|
||||
},
|
||||
{
|
||||
name: "child with no branch in source_config is not a preview",
|
||||
child: store.Workload{
|
||||
ID: "child-nobranch",
|
||||
Name: "myapp/feat-login",
|
||||
SourceKind: "dockerfile",
|
||||
SourceConfig: `{}`,
|
||||
ParentWorkloadID: "tmpl-1",
|
||||
},
|
||||
wantPrev: false,
|
||||
wantBranch: "",
|
||||
},
|
||||
{
|
||||
// Same parent + a valid branch, but the name carries an extra
|
||||
// suffix so it fails ONLY the slug-equality check (expected
|
||||
// "myapp/feat-login", got "myapp/feat-login-staging"). The
|
||||
// branch alone must not be enough to mark a preview.
|
||||
name: "valid branch but name fails the slug match is not a preview",
|
||||
child: store.Workload{
|
||||
ID: "child-slugmiss",
|
||||
Name: "myapp/feat-login-staging",
|
||||
SourceKind: "dockerfile",
|
||||
SourceConfig: `{"branch":"feat/login","port":3000}`,
|
||||
ParentWorkloadID: "tmpl-1",
|
||||
},
|
||||
wantPrev: false,
|
||||
wantBranch: "",
|
||||
},
|
||||
{
|
||||
// Uppercase + slash branch: slugifyBranch lowercases and maps
|
||||
// "/" -> "-", so "Feature/Login" -> "feature-login" and the name
|
||||
// "myapp/feature-login" matches. PreviewBranch must echo the RAW
|
||||
// branch from source_config ("Feature/Login"), not the slug.
|
||||
name: "uppercase slash branch matches and keeps raw branch",
|
||||
child: store.Workload{
|
||||
ID: "child-upper",
|
||||
Name: "myapp/feature-login",
|
||||
SourceKind: "dockerfile",
|
||||
SourceConfig: `{"branch":"Feature/Login","port":8080}`,
|
||||
ParentWorkloadID: "tmpl-1",
|
||||
},
|
||||
wantPrev: true,
|
||||
wantBranch: "Feature/Login",
|
||||
},
|
||||
}
|
||||
|
||||
for _, tc := range tests {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
node := childChainNode(template, tc.child)
|
||||
if node.IsPreview != tc.wantPrev {
|
||||
t.Errorf("IsPreview = %v, want %v", node.IsPreview, tc.wantPrev)
|
||||
}
|
||||
if node.PreviewBranch != tc.wantBranch {
|
||||
t.Errorf("PreviewBranch = %q, want %q", node.PreviewBranch, tc.wantBranch)
|
||||
}
|
||||
// Base fields must always round-trip regardless of preview status.
|
||||
if node.ID != tc.child.ID || node.Name != tc.child.Name {
|
||||
t.Errorf("base fields mangled: got id=%q name=%q", node.ID, node.Name)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// TestPreviewBranchOf_ToleratesMalformedConfig confirms the branch extractor
|
||||
// returns "" rather than panicking on a missing or invalid source_config.
|
||||
func TestPreviewBranchOf_ToleratesMalformedConfig(t *testing.T) {
|
||||
cases := []struct {
|
||||
name string
|
||||
cfg string
|
||||
want string
|
||||
}{
|
||||
{"valid branch", `{"branch":"release/v1"}`, "release/v1"},
|
||||
{"empty config", ``, ""},
|
||||
{"empty object", `{}`, ""},
|
||||
{"malformed json", `{not-json`, ""},
|
||||
}
|
||||
for _, c := range cases {
|
||||
t.Run(c.name, func(t *testing.T) {
|
||||
got := previewBranchOf(store.Workload{SourceConfig: c.cfg})
|
||||
if got != c.want {
|
||||
t.Errorf("previewBranchOf(%q) = %q, want %q", c.cfg, got, c.want)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -2,48 +2,17 @@ package api
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"log/slog"
|
||||
|
||||
"github.com/alexei/tinyforge/internal/store"
|
||||
"github.com/alexei/tinyforge/internal/workload/plugin"
|
||||
)
|
||||
|
||||
// toPluginWorkload converts a persisted store.Workload row into the value
|
||||
// shape that Source / Trigger plugins consume. Lives in the api package
|
||||
// (rather than store or plugin) to keep plugin's dependency graph free of
|
||||
// store imports and avoid the cycle that would form otherwise.
|
||||
//
|
||||
// SourceConfig / TriggerConfig are passed through as raw JSON; the matching
|
||||
// plugin decodes them with plugin.SourceConfigOf[T] / TriggerConfigOf[T].
|
||||
// PublicFaces is decoded eagerly because every consumer needs the parsed
|
||||
// slice (proxy registration, UI, validation).
|
||||
// toPluginWorkload is a local alias for the shared plugin.WorkloadFromStore
|
||||
// converter, kept so the api package's many call sites read tersely and pair
|
||||
// visually with fromPluginWorkload below. The conversion logic lives in the
|
||||
// plugin package (the single home shared with reconciler / webhook).
|
||||
func toPluginWorkload(w store.Workload) plugin.Workload {
|
||||
var faces []plugin.PublicFace
|
||||
if w.PublicFaces != "" {
|
||||
if err := json.Unmarshal([]byte(w.PublicFaces), &faces); err != nil {
|
||||
slog.Warn("workload: invalid public_faces JSON, treating as empty",
|
||||
"workload", w.ID, "error", err)
|
||||
faces = nil
|
||||
}
|
||||
}
|
||||
return plugin.Workload{
|
||||
ID: w.ID,
|
||||
Name: w.Name,
|
||||
GroupID: w.AppID,
|
||||
ParentWorkloadID: w.ParentWorkloadID,
|
||||
SourceKind: w.SourceKind,
|
||||
SourceConfig: json.RawMessage(w.SourceConfig),
|
||||
TriggerKind: w.TriggerKind,
|
||||
TriggerConfig: json.RawMessage(w.TriggerConfig),
|
||||
PublicFaces: faces,
|
||||
NotificationURL: w.NotificationURL,
|
||||
NotificationSecret: w.NotificationSecret,
|
||||
WebhookSecret: w.WebhookSecret,
|
||||
WebhookSigningSecret: w.WebhookSigningSecret,
|
||||
WebhookRequireSignature: w.WebhookRequireSignature,
|
||||
CreatedAt: w.CreatedAt,
|
||||
UpdatedAt: w.UpdatedAt,
|
||||
}
|
||||
return plugin.WorkloadFromStore(w)
|
||||
}
|
||||
|
||||
// fromPluginWorkload is the symmetric direction — used by /api/workloads
|
||||
|
||||
@@ -0,0 +1,231 @@
|
||||
package api
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"log/slog"
|
||||
"net/http"
|
||||
"strings"
|
||||
|
||||
"github.com/go-chi/chi/v5"
|
||||
|
||||
"github.com/alexei/tinyforge/internal/crypto"
|
||||
"github.com/alexei/tinyforge/internal/store"
|
||||
)
|
||||
|
||||
// workloadNotificationRow is the client-facing JSON representation of a
// workload notification. The stored secret is never serialized: clients
// only learn whether one exists through `secret_set`, so a secret is
// effectively write-only once saved (the same approach workload_env takes
// for encrypted values). Rotating a secret means submitting a new value.
type workloadNotificationRow struct {
	ID         string `json:"id"`
	WorkloadID string `json:"workload_id"`
	Name       string `json:"name"`
	URL        string `json:"url"`
	SecretSet  bool   `json:"secret_set"` // true when a secret is stored
	EventTypes string `json:"event_types"`
	Enabled    bool   `json:"enabled"`
	SortOrder  int    `json:"sort_order"`
	CreatedAt  string `json:"created_at"`
	UpdatedAt  string `json:"updated_at"`
}
|
||||
|
||||
func toWorkloadNotificationRow(n store.WorkloadNotification) workloadNotificationRow {
|
||||
return workloadNotificationRow{
|
||||
ID: n.ID,
|
||||
WorkloadID: n.WorkloadID,
|
||||
Name: n.Name,
|
||||
URL: n.URL,
|
||||
SecretSet: n.Secret != "",
|
||||
EventTypes: n.EventTypes,
|
||||
Enabled: n.Enabled,
|
||||
SortOrder: n.SortOrder,
|
||||
CreatedAt: n.CreatedAt,
|
||||
UpdatedAt: n.UpdatedAt,
|
||||
}
|
||||
}
|
||||
|
||||
func (s *Server) listWorkloadNotifications(w http.ResponseWriter, r *http.Request) {
|
||||
id := chi.URLParam(r, "id")
|
||||
if _, err := s.store.GetWorkloadByID(id); err != nil {
|
||||
if errors.Is(err, store.ErrNotFound) {
|
||||
respondNotFound(w, "workload")
|
||||
return
|
||||
}
|
||||
respondError(w, http.StatusInternalServerError, "get workload")
|
||||
return
|
||||
}
|
||||
rows, err := s.store.ListWorkloadNotifications(id)
|
||||
if err != nil {
|
||||
respondError(w, http.StatusInternalServerError, "list workload notifications")
|
||||
return
|
||||
}
|
||||
out := make([]workloadNotificationRow, 0, len(rows))
|
||||
for _, n := range rows {
|
||||
out = append(out, toWorkloadNotificationRow(n))
|
||||
}
|
||||
respondJSON(w, http.StatusOK, out)
|
||||
}
|
||||
|
||||
// workloadNotificationRequest is the request body shared by the create and
// update handlers.
//
// Secret carries the plaintext webhook signing key; the handlers encrypt it
// with the server's encryption key before it is persisted. On update an
// empty Secret means "keep the stored one", so the operator can change the
// URL or event filter without re-entering it.
//
// Enabled is a pointer so an omitted field can be told apart from an
// explicit false: create defaults it to true, update leaves the stored
// value unchanged.
type workloadNotificationRequest struct {
	Name       string `json:"name"`
	URL        string `json:"url"`
	Secret     string `json:"secret"`
	EventTypes string `json:"event_types"`
	Enabled    *bool  `json:"enabled"`
	SortOrder  int    `json:"sort_order"`
}
|
||||
|
||||
func (s *Server) createWorkloadNotification(w http.ResponseWriter, r *http.Request) {
|
||||
id := chi.URLParam(r, "id")
|
||||
if _, err := s.store.GetWorkloadByID(id); err != nil {
|
||||
if errors.Is(err, store.ErrNotFound) {
|
||||
respondNotFound(w, "workload")
|
||||
return
|
||||
}
|
||||
respondError(w, http.StatusInternalServerError, "get workload")
|
||||
return
|
||||
}
|
||||
var req workloadNotificationRequest
|
||||
if !decodeJSONStrict(w, r, &req) {
|
||||
return
|
||||
}
|
||||
req.URL = strings.TrimSpace(req.URL)
|
||||
req.Name = strings.TrimSpace(req.Name)
|
||||
if req.URL == "" {
|
||||
respondError(w, http.StatusBadRequest, "url is required")
|
||||
return
|
||||
}
|
||||
encSecret := ""
|
||||
if req.Secret != "" {
|
||||
v, err := crypto.Encrypt(s.encKey, req.Secret)
|
||||
if err != nil {
|
||||
slog.Error("workload notifications: encrypt secret", "workload", id, "error", err)
|
||||
respondError(w, http.StatusInternalServerError, "encrypt secret")
|
||||
return
|
||||
}
|
||||
encSecret = v
|
||||
}
|
||||
enabled := true
|
||||
if req.Enabled != nil {
|
||||
enabled = *req.Enabled
|
||||
}
|
||||
created, err := s.store.CreateWorkloadNotification(store.WorkloadNotification{
|
||||
WorkloadID: id,
|
||||
Name: req.Name,
|
||||
URL: req.URL,
|
||||
Secret: encSecret,
|
||||
EventTypes: req.EventTypes,
|
||||
Enabled: enabled,
|
||||
SortOrder: req.SortOrder,
|
||||
})
|
||||
if err != nil {
|
||||
slog.Error("workload notifications: create", "workload", id, "error", err)
|
||||
respondError(w, http.StatusInternalServerError, "create workload notification")
|
||||
return
|
||||
}
|
||||
respondJSON(w, http.StatusCreated, toWorkloadNotificationRow(created))
|
||||
}
|
||||
|
||||
func (s *Server) updateWorkloadNotification(w http.ResponseWriter, r *http.Request) {
|
||||
id := chi.URLParam(r, "id")
|
||||
nid := chi.URLParam(r, "nid")
|
||||
if _, err := s.store.GetWorkloadByID(id); err != nil {
|
||||
if errors.Is(err, store.ErrNotFound) {
|
||||
respondNotFound(w, "workload")
|
||||
return
|
||||
}
|
||||
respondError(w, http.StatusInternalServerError, "get workload")
|
||||
return
|
||||
}
|
||||
existing, err := s.store.GetWorkloadNotification(nid)
|
||||
if err != nil {
|
||||
if errors.Is(err, store.ErrNotFound) {
|
||||
respondNotFound(w, "workload_notification")
|
||||
return
|
||||
}
|
||||
respondError(w, http.StatusInternalServerError, "get workload_notification")
|
||||
return
|
||||
}
|
||||
if existing.WorkloadID != id {
|
||||
// Route mismatch — the row exists but under a different workload.
|
||||
// Return 404 rather than 403 so we don't leak the existence of
|
||||
// foreign rows to an unauthorised caller.
|
||||
respondNotFound(w, "workload_notification")
|
||||
return
|
||||
}
|
||||
|
||||
var req workloadNotificationRequest
|
||||
if !decodeJSONStrict(w, r, &req) {
|
||||
return
|
||||
}
|
||||
req.URL = strings.TrimSpace(req.URL)
|
||||
req.Name = strings.TrimSpace(req.Name)
|
||||
if req.URL == "" {
|
||||
respondError(w, http.StatusBadRequest, "url is required")
|
||||
return
|
||||
}
|
||||
|
||||
existing.Name = req.Name
|
||||
existing.URL = req.URL
|
||||
existing.EventTypes = req.EventTypes
|
||||
existing.SortOrder = req.SortOrder
|
||||
if req.Enabled != nil {
|
||||
existing.Enabled = *req.Enabled
|
||||
}
|
||||
// Empty Secret on UPDATE preserves the stored ciphertext — explicit
|
||||
// rotation requires sending the new plaintext. This avoids forcing
|
||||
// the operator to re-enter their secret on every URL edit.
|
||||
if req.Secret != "" {
|
||||
v, err := crypto.Encrypt(s.encKey, req.Secret)
|
||||
if err != nil {
|
||||
slog.Error("workload notifications: encrypt secret", "workload", id, "error", err)
|
||||
respondError(w, http.StatusInternalServerError, "encrypt secret")
|
||||
return
|
||||
}
|
||||
existing.Secret = v
|
||||
}
|
||||
|
||||
if err := s.store.UpdateWorkloadNotification(existing); err != nil {
|
||||
if errors.Is(err, store.ErrNotFound) {
|
||||
respondNotFound(w, "workload_notification")
|
||||
return
|
||||
}
|
||||
slog.Error("workload notifications: update", "workload", id, "error", err)
|
||||
respondError(w, http.StatusInternalServerError, "update workload notification")
|
||||
return
|
||||
}
|
||||
respondJSON(w, http.StatusOK, toWorkloadNotificationRow(existing))
|
||||
}
|
||||
|
||||
func (s *Server) deleteWorkloadNotification(w http.ResponseWriter, r *http.Request) {
|
||||
id := chi.URLParam(r, "id")
|
||||
nid := chi.URLParam(r, "nid")
|
||||
existing, err := s.store.GetWorkloadNotification(nid)
|
||||
if err != nil {
|
||||
if errors.Is(err, store.ErrNotFound) {
|
||||
respondNotFound(w, "workload_notification")
|
||||
return
|
||||
}
|
||||
respondError(w, http.StatusInternalServerError, "get workload_notification")
|
||||
return
|
||||
}
|
||||
if existing.WorkloadID != id {
|
||||
respondNotFound(w, "workload_notification")
|
||||
return
|
||||
}
|
||||
if err := s.store.DeleteWorkloadNotification(nid); err != nil {
|
||||
if errors.Is(err, store.ErrNotFound) {
|
||||
respondNotFound(w, "workload_notification")
|
||||
return
|
||||
}
|
||||
slog.Error("workload notifications: delete", "workload", id, "error", err)
|
||||
respondError(w, http.StatusInternalServerError, "delete workload notification")
|
||||
return
|
||||
}
|
||||
respondJSON(w, http.StatusOK, map[string]any{"success": true})
|
||||
}
|
||||
@@ -82,16 +82,27 @@ func (s *Server) getWorkloadRuntimeState(w http.ResponseWriter, r *http.Request)
|
||||
|
||||
payload := runtimeStatePayload{SourceKind: workload.SourceKind}
|
||||
|
||||
if workload.SourceKind != "static" {
|
||||
// Both static and dockerfile sources persist their runtime state into
|
||||
// containers.extra_json under a deterministic row id. The shapes
|
||||
// match (status / last_commit_sha / last_sync_at / last_error) so the
|
||||
// handler can decode them identically. The suffix differs per source
|
||||
// kind: static uses ":site", dockerfile uses ":dockerfile".
|
||||
var rowSuffix string
|
||||
switch workload.SourceKind {
|
||||
case "static":
|
||||
rowSuffix = ":site"
|
||||
case "dockerfile":
|
||||
rowSuffix = ":dockerfile"
|
||||
default:
|
||||
respondJSON(w, http.StatusOK, payload)
|
||||
return
|
||||
}
|
||||
|
||||
// The static plugin owns one container row per workload at the
|
||||
// deterministic ID <workloadID>:site. A missing row means the
|
||||
// workload has never been deployed — return HasState=false so the
|
||||
// UI can prompt the operator to deploy.
|
||||
row, err := s.store.GetContainerByID(id + ":site")
|
||||
// The owning plugin maintains one container row per workload at the
|
||||
// deterministic ID. A missing row means the workload has never been
|
||||
// deployed — return HasState=false so the UI can prompt the operator
|
||||
// to deploy.
|
||||
row, err := s.store.GetContainerByID(id + rowSuffix)
|
||||
if err != nil {
|
||||
if errors.Is(err, store.ErrNotFound) {
|
||||
respondJSON(w, http.StatusOK, payload)
|
||||
|
||||
@@ -130,6 +130,13 @@ func TestGetWorkloadRuntimeState_MalformedExtraJSON_ReturnsContainerFieldsOnly(t
|
||||
SourceKind: "static",
|
||||
SourceConfig: `{"provider":"gitea"}`,
|
||||
})
|
||||
// Seed a row with a valid extra_json first, then corrupt it via raw
|
||||
// SQL. Prior to the write-side validateExtraJSON guard this test
|
||||
// could pass a malformed string straight to UpsertContainer; the
|
||||
// guard now rejects that at the boundary, which is the correct
|
||||
// behaviour. The reader resilience this test verifies remains
|
||||
// relevant for pre-existing bad rows from upgrades or external
|
||||
// manipulation, so we still produce one via direct SQL.
|
||||
if err := e.store.UpsertContainer(store.Container{
|
||||
ID: wl.ID + ":site",
|
||||
WorkloadID: wl.ID,
|
||||
@@ -137,10 +144,16 @@ func TestGetWorkloadRuntimeState_MalformedExtraJSON_ReturnsContainerFieldsOnly(t
|
||||
Host: "local",
|
||||
ContainerID: "abc",
|
||||
State: "running",
|
||||
ExtraJSON: `{this is not json`,
|
||||
ExtraJSON: `{}`,
|
||||
}); err != nil {
|
||||
t.Fatalf("seed: %v", err)
|
||||
}
|
||||
if _, err := e.store.DB().Exec(
|
||||
`UPDATE containers SET extra_json = ? WHERE id = ?`,
|
||||
`{this is not json`, wl.ID+":site",
|
||||
); err != nil {
|
||||
t.Fatalf("corrupt extra_json: %v", err)
|
||||
}
|
||||
resp := e.do(t, http.MethodGet, "/api/workloads/"+wl.ID+"/runtime-state", nil)
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
t.Fatalf("status = %d, want 200 (decode is non-fatal)", resp.StatusCode)
|
||||
@@ -155,6 +168,57 @@ func TestGetWorkloadRuntimeState_MalformedExtraJSON_ReturnsContainerFieldsOnly(t
|
||||
}
|
||||
}
|
||||
|
||||
func TestGetWorkloadRuntimeState_DockerfileSourceDeployed_DecodesExtraJSON(t *testing.T) {
|
||||
e := newAPITestEnv(t)
|
||||
wl, err := e.store.CreateWorkload(store.Workload{
|
||||
Kind: string(store.WorkloadKindProject),
|
||||
Name: "build-app",
|
||||
SourceKind: "dockerfile",
|
||||
SourceConfig: `{"provider":"gitea","port":3000}`,
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("seed workload: %v", err)
|
||||
}
|
||||
extra, _ := json.Marshal(map[string]any{
|
||||
"status": "deployed",
|
||||
"last_commit_sha": "deadbeef",
|
||||
"last_sync_at": "2026-05-23T10:00:00Z",
|
||||
"last_error": "",
|
||||
})
|
||||
if err := e.store.UpsertContainer(store.Container{
|
||||
ID: wl.ID + ":dockerfile",
|
||||
WorkloadID: wl.ID,
|
||||
WorkloadKind: string(store.WorkloadKindBuild),
|
||||
Host: "local",
|
||||
ContainerID: "ffeeddcc",
|
||||
State: "running",
|
||||
ExtraJSON: string(extra),
|
||||
}); err != nil {
|
||||
t.Fatalf("seed container: %v", err)
|
||||
}
|
||||
|
||||
resp := e.do(t, http.MethodGet, "/api/workloads/"+wl.ID+"/runtime-state", nil)
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
t.Fatalf("status = %d, want 200", resp.StatusCode)
|
||||
}
|
||||
var got runtimeStatePayload
|
||||
if errMsg := decodeEnvelope(t, resp, &got); errMsg != "" {
|
||||
t.Fatalf("envelope error: %q", errMsg)
|
||||
}
|
||||
if !got.HasState {
|
||||
t.Fatalf("HasState = false, want true")
|
||||
}
|
||||
if got.SourceKind != "dockerfile" {
|
||||
t.Errorf("SourceKind = %q, want dockerfile", got.SourceKind)
|
||||
}
|
||||
if got.ContainerID != "ffeeddcc" || got.State != "running" {
|
||||
t.Errorf("container fields = (%q,%q), want (ffeeddcc, running)", got.ContainerID, got.State)
|
||||
}
|
||||
if got.Status != "deployed" || got.LastCommitSHA != "deadbeef" {
|
||||
t.Errorf("runtime fields = %+v, want deployed/deadbeef", got)
|
||||
}
|
||||
}
|
||||
|
||||
// =============================================================================
|
||||
// GET /api/workloads/{id}/storage
|
||||
// =============================================================================
|
||||
|
||||
@@ -14,6 +14,7 @@ import (
|
||||
"github.com/alexei/tinyforge/internal/auth"
|
||||
"github.com/alexei/tinyforge/internal/store"
|
||||
"github.com/alexei/tinyforge/internal/workload/plugin"
|
||||
"github.com/alexei/tinyforge/internal/workload/preview"
|
||||
)
|
||||
|
||||
// pluginWorkloadRequest is the JSON body accepted by create + update.
|
||||
@@ -227,6 +228,28 @@ func (s *Server) deletePluginWorkload(w http.ResponseWriter, r *http.Request) {
|
||||
return
|
||||
}
|
||||
|
||||
// Cascade-teardown any branch previews materialized from this workload
|
||||
// so deleting a template does not orphan their containers, proxy routes,
|
||||
// and rows. Operator-managed stage-chain children (which share the same
|
||||
// parent link) are deliberately left alone — only previews are auto-owned
|
||||
// by the template (see preview.IsPreviewChild).
|
||||
if previews, err := preview.ListPreviewChildren(s.store, row); err != nil {
|
||||
slog.Warn("delete workload: list preview children", "workload", id, "error", err)
|
||||
} else {
|
||||
for _, child := range previews {
|
||||
if child.SourceKind != "" {
|
||||
if err := s.deployer.DispatchTeardown(r.Context(), toPluginWorkload(child)); err != nil {
|
||||
slog.Warn("delete workload: preview child teardown error",
|
||||
"workload", id, "child", child.ID, "error", err)
|
||||
}
|
||||
}
|
||||
if err := s.store.DeleteWorkload(child.ID); err != nil && !errors.Is(err, store.ErrNotFound) {
|
||||
slog.Warn("delete workload: preview child delete error",
|
||||
"workload", id, "child", child.ID, "error", err)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if row.SourceKind != "" {
|
||||
if err := s.deployer.DispatchTeardown(r.Context(), toPluginWorkload(row)); err != nil {
|
||||
slog.Warn("delete workload: teardown error",
|
||||
|
||||
@@ -15,6 +15,7 @@ import (
|
||||
"github.com/alexei/tinyforge/internal/auth"
|
||||
"github.com/alexei/tinyforge/internal/crypto"
|
||||
"github.com/alexei/tinyforge/internal/store"
|
||||
"github.com/alexei/tinyforge/internal/volsnap"
|
||||
"github.com/alexei/tinyforge/internal/webhook"
|
||||
"github.com/alexei/tinyforge/internal/workload/plugin"
|
||||
|
||||
@@ -75,6 +76,7 @@ type apiTestEnv struct {
|
||||
dispatcher *fakeAPIDispatcher
|
||||
adminToken string
|
||||
encKey [32]byte
|
||||
snapEngine *volsnap.Engine // set by newSnapshotEnv; nil otherwise
|
||||
}
|
||||
|
||||
func (e *apiTestEnv) close() { e.srv.Close() }
|
||||
@@ -670,9 +672,9 @@ func TestGetWorkloadChain_ParentSelfChildren(t *testing.T) {
|
||||
|
||||
resp := e.do(t, http.MethodGet, "/api/workloads/"+parentID+"/chain", nil)
|
||||
var got struct {
|
||||
Parent *map[string]any `json:"parent"`
|
||||
Self map[string]any `json:"self"`
|
||||
Children []map[string]any `json:"children"`
|
||||
Parent *map[string]any `json:"parent"`
|
||||
Self map[string]any `json:"self"`
|
||||
Children []map[string]any `json:"children"`
|
||||
}
|
||||
if errMsg := decodeEnvelope(t, resp, &got); errMsg != "" {
|
||||
t.Fatalf("envelope error: %q", errMsg)
|
||||
|
||||
@@ -85,9 +85,15 @@ func (la *LocalAuth) cleanBlacklist() {
|
||||
}
|
||||
}
|
||||
|
||||
// bcryptCost is the work factor used for new password hashes. Bumped from
|
||||
// the library default (10) to 12 so cost grows with hardware. Existing
|
||||
// hashes at lower costs still verify — bcrypt encodes the cost in the
|
||||
// stored hash itself.
|
||||
const bcryptCost = 12
|
||||
|
||||
// HashPassword hashes a plaintext password using bcrypt.
|
||||
func HashPassword(password string) (string, error) {
|
||||
hash, err := bcrypt.GenerateFromPassword([]byte(password), bcrypt.DefaultCost)
|
||||
hash, err := bcrypt.GenerateFromPassword([]byte(password), bcryptCost)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("hash password: %w", err)
|
||||
}
|
||||
|
||||
@@ -1,13 +1,17 @@
|
||||
package backup
|
||||
|
||||
import (
|
||||
"database/sql"
|
||||
"fmt"
|
||||
"io"
|
||||
"log/slog"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
_ "modernc.org/sqlite" // read-only candidate inspection via PRAGMA integrity_check
|
||||
|
||||
"github.com/alexei/tinyforge/internal/store"
|
||||
)
|
||||
|
||||
@@ -129,6 +133,17 @@ func (e *Engine) RestorePath(id string) (string, error) {
|
||||
return "", fmt.Errorf("get backup: %w", err)
|
||||
}
|
||||
|
||||
// Filename comes from a DB row. Defence-in-depth: a backup file must live
|
||||
// directly under backupDir, so reject any value carrying a path separator
|
||||
// or traversal before joining. A poisoned row (future import path, manual
|
||||
// insert) must never let restore read — and then atomically copy over the
|
||||
// live DB — an arbitrary file. CreateBackup builds safe base names; this
|
||||
// enforces the same invariant on read.
|
||||
if backup.Filename == "" || backup.Filename == "." || backup.Filename == ".." ||
|
||||
backup.Filename != filepath.Base(backup.Filename) {
|
||||
return "", fmt.Errorf("backup: invalid filename %q", backup.Filename)
|
||||
}
|
||||
|
||||
filePath := filepath.Join(e.backupDir, backup.Filename)
|
||||
if _, err := os.Stat(filePath); err != nil {
|
||||
return "", fmt.Errorf("backup file not found: %w", err)
|
||||
@@ -137,6 +152,153 @@ func (e *Engine) RestorePath(id string) (string, error) {
|
||||
return filePath, nil
|
||||
}
|
||||
|
||||
// PrepareRestore validates a backup candidate before the caller swaps it
|
||||
// over the live DB. Runs three checks in order:
|
||||
//
|
||||
// 1. The candidate file exists and is non-empty.
|
||||
// 2. SQLite header magic matches (catches corrupted or partial downloads).
|
||||
// 3. `PRAGMA integrity_check` against a temp copy returns "ok"
|
||||
// (catches WAL/page corruption that the header check misses).
|
||||
//
|
||||
// On success returns the candidate path. On failure returns a wrapped
|
||||
// error describing which probe rejected the file, so the operator can
|
||||
// see exactly why a "restore" was refused rather than getting a corrupt
|
||||
// DB at next boot.
|
||||
//
|
||||
// We use a *temp copy* for integrity_check because attaching the
|
||||
// candidate read-only into the live process would still hold a file
|
||||
// handle SQLite considers writable on Windows.
|
||||
func (e *Engine) PrepareRestore(id string) (string, error) {
|
||||
path, err := e.RestorePath(id)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
info, err := os.Stat(path)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("restore: stat candidate: %w", err)
|
||||
}
|
||||
if info.Size() < 100 {
|
||||
return "", fmt.Errorf("restore: candidate %s is suspiciously small (%d bytes)", path, info.Size())
|
||||
}
|
||||
|
||||
// SQLite file header: "SQLite format 3\x00" (16 bytes).
|
||||
hdr, err := readHead(path, 16)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("restore: read header: %w", err)
|
||||
}
|
||||
if string(hdr) != "SQLite format 3\x00" {
|
||||
return "", fmt.Errorf("restore: candidate %s is not a SQLite database (header mismatch)", path)
|
||||
}
|
||||
|
||||
if err := integrityCheck(path); err != nil {
|
||||
return "", fmt.Errorf("restore: integrity check failed: %w", err)
|
||||
}
|
||||
|
||||
return path, nil
|
||||
}
|
||||
|
||||
// readHead returns exactly the first n bytes of the file at path. It fails
// if the file cannot be opened or holds fewer than n bytes.
func readHead(path string, n int) ([]byte, error) {
	file, err := os.Open(path)
	if err != nil {
		return nil, err
	}
	defer file.Close()

	// io.ReadFull either fills the whole buffer or reports an error; a
	// single Read call may legitimately return fewer bytes, which would
	// make the caller's header comparison unreliable.
	head := make([]byte, n)
	_, err = io.ReadFull(file, head)
	if err != nil {
		return nil, err
	}
	return head, nil
}
|
||||
|
||||
// integrityCheck opens the SQLite file at path read-only and runs
// `PRAGMA integrity_check` against it. It returns nil only when the first
// result row is exactly "ok"; any other outcome is reported as an error.
//
// The connection uses mode=ro&immutable=1 so the driver does not try to
// create WAL/SHM sidecars or change the journal mode of the candidate —
// both of which fail with "attempt to write a readonly database" against a
// backup file.
func integrityCheck(path string) error {
	db, err := sql.Open("sqlite", "file:"+path+"?mode=ro&immutable=1")
	if err != nil {
		return fmt.Errorf("open candidate: %w", err)
	}
	defer db.Close()

	rows, err := db.Query("PRAGMA integrity_check")
	if err != nil {
		return fmt.Errorf("pragma integrity_check: %w", err)
	}
	defer rows.Close()

	if !rows.Next() {
		// Next returns false both for an empty result and for an iteration
		// error; surface the real cause instead of "no rows" when there is one.
		if err := rows.Err(); err != nil {
			return fmt.Errorf("read integrity_check: %w", err)
		}
		return fmt.Errorf("integrity_check returned no rows")
	}

	var result string
	if err := rows.Scan(&result); err != nil {
		return fmt.Errorf("scan integrity_check: %w", err)
	}
	if result != "ok" {
		return fmt.Errorf("integrity_check: %s", result)
	}
	return nil
}
|
||||
|
||||
// AtomicReplaceDB writes a backup candidate into place atomically.
|
||||
// The caller is expected to:
|
||||
// 1. Call PrepareRestore(id) → candidatePath.
|
||||
// 2. Take a "pre-restore" backup of the current DB via CreateBackup.
|
||||
// 3. Close the live *sql.DB.
|
||||
// 4. Call AtomicReplaceDB(candidatePath, livePath).
|
||||
// 5. Trigger graceful shutdown; main() will re-open on next start.
|
||||
//
|
||||
// AtomicReplaceDB also wipes WAL/SHM sidecar files so the new DB starts
|
||||
// from a clean checkpoint state. Failure to remove sidecars is logged
|
||||
// but non-fatal — SQLite recreates them on open.
|
||||
func (e *Engine) AtomicReplaceDB(candidatePath, livePath string) error {
|
||||
// Copy candidate to a tmp file next to the live DB, then rename
|
||||
// atomically. On Windows os.Rename across volumes fails, so we
|
||||
// keep tmp on the same dir as the destination.
|
||||
tmp := livePath + ".restore.tmp"
|
||||
if err := copyFile(candidatePath, tmp); err != nil {
|
||||
return fmt.Errorf("copy candidate to %s: %w", tmp, err)
|
||||
}
|
||||
// Best-effort: remove WAL/SHM so SQLite re-checkpoints from the
|
||||
// restored main file rather than a stale WAL pointing at the old
|
||||
// DB's pages.
|
||||
for _, sidecar := range []string{livePath + "-wal", livePath + "-shm"} {
|
||||
if err := os.Remove(sidecar); err != nil && !os.IsNotExist(err) {
|
||||
slog.Warn("restore: remove sidecar", "path", sidecar, "error", err)
|
||||
}
|
||||
}
|
||||
if err := os.Rename(tmp, livePath); err != nil {
|
||||
// Clean up tmp on rename failure so we don't leak a partial file.
|
||||
_ = os.Remove(tmp)
|
||||
return fmt.Errorf("rename %s → %s: %w", tmp, livePath, err)
|
||||
}
|
||||
slog.Info("restore: database file replaced atomically", "live", livePath)
|
||||
return nil
|
||||
}
|
||||
|
||||
// copyFile copies the contents of src to dst, creating dst with mode 0600
// or truncating it if it already exists.
//
// The data is fsynced before the file is closed, so a caller that renames
// dst into place afterwards is not renaming a file whose contents are still
// only in the page cache. If the copy, sync or close fails, the partial dst
// is removed. Nothing is created when src cannot be opened.
func copyFile(src, dst string) (err error) {
	in, err := os.Open(src)
	if err != nil {
		return err
	}
	defer in.Close()

	out, err := os.OpenFile(dst, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0o600)
	if err != nil {
		return err
	}
	// From here on dst exists (possibly truncated); never leave a partial
	// copy behind on failure. The extra Close is harmless if the file was
	// already closed by the final return.
	defer func() {
		if err != nil {
			_ = out.Close()
			_ = os.Remove(dst)
		}
	}()

	if _, err = io.Copy(out, in); err != nil {
		return err
	}
	if err = out.Sync(); err != nil {
		return err
	}
	return out.Close()
}
|
||||
|
||||
// Prune removes old backups exceeding the retention count.
|
||||
// Returns the number of backups pruned.
|
||||
func (e *Engine) Prune(retentionCount int) (int, error) {
|
||||
|
||||
@@ -0,0 +1,113 @@
|
||||
package backup
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"github.com/alexei/tinyforge/internal/store"
|
||||
)
|
||||
|
||||
// newTestEngine spins up an isolated store + engine pair for tests.
|
||||
// Each test gets its own tempdir so backup files do not collide.
|
||||
func newTestEngine(t *testing.T) (*Engine, *store.Store, string) {
|
||||
t.Helper()
|
||||
dir := t.TempDir()
|
||||
dbPath := filepath.Join(dir, "tinyforge.db")
|
||||
st, err := store.New(dbPath)
|
||||
if err != nil {
|
||||
t.Fatalf("store.New: %v", err)
|
||||
}
|
||||
t.Cleanup(func() { _ = st.Close() })
|
||||
|
||||
eng, err := New(st, dbPath, dir)
|
||||
if err != nil {
|
||||
t.Fatalf("backup.New: %v", err)
|
||||
}
|
||||
return eng, st, dbPath
|
||||
}
|
||||
|
||||
func TestPrepareRestore_RejectsTinyFile(t *testing.T) {
|
||||
eng, st, _ := newTestEngine(t)
|
||||
|
||||
// Plant a backup row with a tiny file masquerading as a backup.
|
||||
tinyPath := filepath.Join(eng.BackupDir(), "tinyforge-manual-junk.db")
|
||||
if err := os.WriteFile(tinyPath, []byte("hi"), 0o600); err != nil {
|
||||
t.Fatalf("write tiny: %v", err)
|
||||
}
|
||||
bk, err := st.CreateBackup(store.Backup{
|
||||
Filename: "tinyforge-manual-junk.db",
|
||||
SizeBytes: 2,
|
||||
BackupType: "manual",
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("CreateBackup row: %v", err)
|
||||
}
|
||||
|
||||
if _, err := eng.PrepareRestore(bk.ID); err == nil {
|
||||
t.Fatal("expected PrepareRestore to reject tiny file, got nil")
|
||||
} else if !strings.Contains(err.Error(), "suspiciously small") {
|
||||
t.Errorf("error = %v, want 'suspiciously small'", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestPrepareRestore_RejectsNonSQLite(t *testing.T) {
|
||||
eng, st, _ := newTestEngine(t)
|
||||
|
||||
// 200 bytes of non-SQLite garbage: passes the size check, fails
|
||||
// the header magic check.
|
||||
garbagePath := filepath.Join(eng.BackupDir(), "tinyforge-manual-bogus.db")
|
||||
junk := make([]byte, 200)
|
||||
for i := range junk {
|
||||
junk[i] = byte('x')
|
||||
}
|
||||
if err := os.WriteFile(garbagePath, junk, 0o600); err != nil {
|
||||
t.Fatalf("write junk: %v", err)
|
||||
}
|
||||
bk, err := st.CreateBackup(store.Backup{
|
||||
Filename: "tinyforge-manual-bogus.db",
|
||||
SizeBytes: int64(len(junk)),
|
||||
BackupType: "manual",
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("CreateBackup row: %v", err)
|
||||
}
|
||||
|
||||
if _, err := eng.PrepareRestore(bk.ID); err == nil {
|
||||
t.Fatal("expected PrepareRestore to reject non-SQLite blob, got nil")
|
||||
} else if !strings.Contains(err.Error(), "header") {
|
||||
t.Errorf("error = %v, want header mismatch", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestPrepareRestore_AcceptsValidVacuumInto(t *testing.T) {
|
||||
eng, _, _ := newTestEngine(t)
|
||||
|
||||
// A fresh CreateBackup from the engine itself is, by construction,
|
||||
// a valid SQLite database — VACUUM INTO produces a clean copy.
|
||||
bk, err := eng.CreateBackup("manual")
|
||||
if err != nil {
|
||||
t.Fatalf("CreateBackup: %v", err)
|
||||
}
|
||||
path, err := eng.PrepareRestore(bk.ID)
|
||||
if err != nil {
|
||||
t.Fatalf("PrepareRestore on valid backup: %v", err)
|
||||
}
|
||||
if path == "" {
|
||||
t.Errorf("PrepareRestore returned empty path")
|
||||
}
|
||||
}
|
||||
|
||||
func TestPrepareRestore_UnknownID(t *testing.T) {
|
||||
eng, _, _ := newTestEngine(t)
|
||||
|
||||
_, err := eng.PrepareRestore("nonexistent-id")
|
||||
if err == nil {
|
||||
t.Fatal("expected error for unknown id, got nil")
|
||||
}
|
||||
if errors.Is(err, store.ErrNotFound) {
|
||||
// fine — wrapped through RestorePath
|
||||
}
|
||||
}
|
||||
+46
-10
@@ -10,11 +10,26 @@ import (
|
||||
"fmt"
|
||||
"io"
|
||||
"os"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// ErrNoKey is returned when ENCRYPTION_KEY is not set.
|
||||
var ErrNoKey = errors.New("ENCRYPTION_KEY environment variable is not set")
|
||||
|
||||
// ErrDecryptFailed wraps any cipher.Open / decoder failure. Callers
|
||||
// upgrading from the silent-fallback pattern (treat-as-plaintext when
|
||||
// decrypt errored) MUST instead surface this — a rotated key would
|
||||
// otherwise silently leak ciphertext to upstream services as if it
|
||||
// were plaintext.
|
||||
var ErrDecryptFailed = errors.New("crypto: decrypt failed (wrong key, corrupted ciphertext, or unversioned legacy value)")
|
||||
|
||||
// envelopeV1Prefix tags ciphertext produced by Encrypt going forward.
|
||||
// Older databases may carry unprefixed hex blobs from the v0 era; those
|
||||
// are still readable via Decrypt for backward compatibility, but every
|
||||
// new write goes through Encrypt and emits the prefix so a future key
|
||||
// rotation has a clean fail-loud signal.
|
||||
const envelopeV1Prefix = "tf1:"
|
||||
|
||||
// DeriveKey computes a 32-byte AES-256 key from the given passphrase using SHA-256.
|
||||
// This is acceptable when ENCRYPTION_KEY is a high-entropy random string (e.g., 32+ hex chars).
|
||||
// For human-chosen passphrases, consider Argon2id or PBKDF2 with a salt instead.
|
||||
@@ -35,7 +50,8 @@ func KeyFromEnv() ([32]byte, error) {
|
||||
}
|
||||
|
||||
// Encrypt encrypts plaintext using AES-256-GCM with a random nonce.
|
||||
// The returned ciphertext is hex-encoded: nonce || ciphertext+tag.
|
||||
// Returns a versioned envelope (tf1:<hex>) so downstream readers can
|
||||
// distinguish ciphertext from accidentally-stored plaintext.
|
||||
func Encrypt(key [32]byte, plaintext string) (string, error) {
|
||||
block, err := aes.NewCipher(key[:])
|
||||
if err != nil {
|
||||
@@ -53,14 +69,34 @@ func Encrypt(key [32]byte, plaintext string) (string, error) {
|
||||
}
|
||||
|
||||
sealed := gcm.Seal(nonce, nonce, []byte(plaintext), nil)
|
||||
return hex.EncodeToString(sealed), nil
|
||||
return envelopeV1Prefix + hex.EncodeToString(sealed), nil
|
||||
}
|
||||
|
||||
// Decrypt decrypts a hex-encoded ciphertext produced by Encrypt.
|
||||
func Decrypt(key [32]byte, ciphertextHex string) (string, error) {
|
||||
data, err := hex.DecodeString(ciphertextHex)
|
||||
// HasEnvelope reports whether the value is a v1-prefixed ciphertext.
|
||||
// Useful for router-level "decrypt only if encrypted" decision points
|
||||
// that previously relied on `err == nil` from a try-decrypt — that
|
||||
// pattern silently masked rotated-key failures.
|
||||
func HasEnvelope(value string) bool {
|
||||
return strings.HasPrefix(value, envelopeV1Prefix)
|
||||
}
|
||||
|
||||
// Decrypt decrypts an envelope (tf1:<hex>). For backward compatibility
|
||||
// it also accepts unprefixed hex from the v0 era — but only when the
|
||||
// resulting plaintext is valid; a wrong key for legacy data now returns
|
||||
// ErrDecryptFailed instead of silently treating ciphertext as
|
||||
// plaintext.
|
||||
//
|
||||
// Callers MUST NOT swallow the error and fall back to "use as-is".
|
||||
// That pattern is the exact footgun the envelope versioning removes.
|
||||
func Decrypt(key [32]byte, ciphertext string) (string, error) {
|
||||
hexBlob := ciphertext
|
||||
if strings.HasPrefix(hexBlob, envelopeV1Prefix) {
|
||||
hexBlob = hexBlob[len(envelopeV1Prefix):]
|
||||
}
|
||||
|
||||
data, err := hex.DecodeString(hexBlob)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("decode hex: %w", err)
|
||||
return "", fmt.Errorf("%w: decode hex: %v", ErrDecryptFailed, err)
|
||||
}
|
||||
|
||||
block, err := aes.NewCipher(key[:])
|
||||
@@ -75,15 +111,15 @@ func Decrypt(key [32]byte, ciphertextHex string) (string, error) {
|
||||
|
||||
nonceSize := gcm.NonceSize()
|
||||
if len(data) < nonceSize {
|
||||
return "", errors.New("ciphertext too short")
|
||||
return "", fmt.Errorf("%w: ciphertext too short", ErrDecryptFailed)
|
||||
}
|
||||
|
||||
nonce := data[:nonceSize]
|
||||
ciphertext := data[nonceSize:]
|
||||
body := data[nonceSize:]
|
||||
|
||||
plaintext, err := gcm.Open(nil, nonce, ciphertext, nil)
|
||||
plaintext, err := gcm.Open(nil, nonce, body, nil)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("decrypt: %w", err)
|
||||
return "", fmt.Errorf("%w: %v", ErrDecryptFailed, err)
|
||||
}
|
||||
|
||||
return string(plaintext), nil
|
||||
|
||||
@@ -0,0 +1,76 @@
|
||||
package deployer
|
||||
|
||||
import (
|
||||
"log/slog"
|
||||
|
||||
"github.com/alexei/tinyforge/internal/store"
|
||||
"github.com/alexei/tinyforge/internal/workload/plugin"
|
||||
)
|
||||
|
||||
// deployHistoryKeepPerWorkload bounds the ledger per workload. Newer rows
|
||||
// always have larger ids, so pruning keeps the most recent N — enough for a
|
||||
// useful rollback menu without unbounded growth on hot workloads.
|
||||
const deployHistoryKeepPerWorkload = 50
|
||||
|
||||
// recordDeployHistory appends one ledger row for a completed dispatch.
|
||||
//
|
||||
// Best-effort: a store failure is logged and swallowed — recording must
|
||||
// never turn a successful deploy into a failed request (same contract as
|
||||
// EmitDeployEvent and the pre-deploy backup). The raw deploy error is NEVER
|
||||
// persisted: it can carry registry-auth bytes or compose stdout, so only a
|
||||
// fixed, secret-free marker lands in the row (raw detail goes to slog at the
|
||||
// call site). Called only from DispatchPlugin — reconcile/teardown ticks are
|
||||
// not deploys and must not appear in the ledger.
|
||||
func (d *Deployer) recordDeployHistory(w plugin.Workload, intent plugin.DeploymentIntent, outcome string, deployErr error, startedAt string) {
|
||||
if d.store == nil {
|
||||
return
|
||||
}
|
||||
entry := store.DeployHistoryEntry{
|
||||
WorkloadID: w.ID,
|
||||
SourceKind: w.SourceKind,
|
||||
Reference: d.effectiveReference(w, intent),
|
||||
Reason: intent.Reason,
|
||||
TriggeredBy: intent.TriggeredBy,
|
||||
Note: intent.Metadata["note"], // nil map read is safe
|
||||
Outcome: outcome,
|
||||
StartedAt: startedAt,
|
||||
FinishedAt: store.Now(),
|
||||
}
|
||||
if deployErr != nil {
|
||||
entry.Error = "deploy failed (see server logs)"
|
||||
}
|
||||
if _, err := d.store.InsertDeployHistory(entry); err != nil {
|
||||
slog.Warn("deploy history: insert failed", "workload", w.ID, "error", err)
|
||||
return
|
||||
}
|
||||
// Cheap indexed DELETE — negligible next to a multi-second deploy, so it
|
||||
// stays inline rather than on an untracked goroutine that could outrace
|
||||
// graceful shutdown's db.Close().
|
||||
if err := d.store.PruneDeployHistory(w.ID, deployHistoryKeepPerWorkload); err != nil {
|
||||
slog.Warn("deploy history: prune failed", "workload", w.ID, "error", err)
|
||||
}
|
||||
}
|
||||
|
||||
// effectiveReference resolves the artifact handle to record (and, for
|
||||
// rollback-capable sources, to replay). It starts from the trigger-supplied
|
||||
// intent.Reference and, for the image source, prefers the tag actually
|
||||
// written onto the freshest container row — capturing the DefaultTag /
|
||||
// "latest" resolution the source performs when intent.Reference is empty
|
||||
// (e.g. a manual deploy with no override). ListContainersByWorkload returns
|
||||
// newest-first, so rows[0] is the just-deployed container on success.
|
||||
//
|
||||
// For static/dockerfile the git trigger already supplies the commit SHA as
|
||||
// intent.Reference; a manual deploy of those may record an empty reference
|
||||
// (acceptable — they are not rollback-capable in this phase). compose has no
|
||||
// single artifact handle.
|
||||
func (d *Deployer) effectiveReference(w plugin.Workload, intent plugin.DeploymentIntent) string {
|
||||
ref := intent.Reference
|
||||
if w.SourceKind == "image" && d.store != nil {
|
||||
if rows, err := d.store.ListContainersByWorkload(w.ID); err == nil && len(rows) > 0 {
|
||||
if tag := rows[0].ImageTag; tag != "" {
|
||||
ref = tag
|
||||
}
|
||||
}
|
||||
}
|
||||
return ref
|
||||
}
|
||||
@@ -5,6 +5,7 @@
|
||||
package deployer
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"sync"
|
||||
@@ -14,9 +15,11 @@ import (
|
||||
"github.com/alexei/tinyforge/internal/docker"
|
||||
"github.com/alexei/tinyforge/internal/events"
|
||||
"github.com/alexei/tinyforge/internal/health"
|
||||
"github.com/alexei/tinyforge/internal/keyedmutex"
|
||||
"github.com/alexei/tinyforge/internal/notify"
|
||||
"github.com/alexei/tinyforge/internal/proxy"
|
||||
"github.com/alexei/tinyforge/internal/store"
|
||||
"github.com/alexei/tinyforge/internal/workload/plugin"
|
||||
)
|
||||
|
||||
// Deployer owns the dependency bundle each Source plugin needs at deploy
|
||||
@@ -34,9 +37,44 @@ type Deployer struct {
|
||||
dnsMu sync.RWMutex
|
||||
dns dns.Provider // nil when wildcard DNS is active
|
||||
|
||||
// proxyMu protects hot-swap of d.proxy from runtime settings updates
|
||||
// (SetProxyProvider) racing with PluginDeps() reads on the deploy path.
|
||||
proxyMu sync.RWMutex
|
||||
|
||||
// Graceful shutdown: tracks in-progress deploys.
|
||||
//
|
||||
// drainMu serializes the "is-draining check + activeWg.Add(1)" in
|
||||
// beginDispatch against the "set shuttingDown + Wait()" in Drain. Without
|
||||
// it, a dispatch could pass the draining check, Drain could then flip the
|
||||
// flag and start Wait() with a zero counter, and the dispatch could call
|
||||
// Add(1) concurrently with Wait — a documented sync.WaitGroup misuse
|
||||
// (panic risk) that also lets a deploy slip past the drain barrier.
|
||||
drainMu sync.Mutex
|
||||
activeWg sync.WaitGroup
|
||||
shuttingDown atomic.Bool
|
||||
|
||||
// workloadLocks serializes deploy-class operations per workload id so two
|
||||
// concurrent mutators of the same workload (a manual deploy, a webhook/
|
||||
// trigger dispatch, a rollback, a promote, OR a volume-snapshot restore)
|
||||
// can never interleave their container/volume changes. Every deploy
|
||||
// entrypoint funnels through DispatchPlugin, so locking there gates them
|
||||
// all at one choke point. This is the per-workload lock activeWg is NOT
|
||||
// (activeWg is a global drain barrier for graceful shutdown).
|
||||
workloadLocks keyedmutex.Mutex
|
||||
}
|
||||
|
||||
// LockWorkload acquires the per-workload deploy lock for an external critical
|
||||
// section (volume-snapshot restore) and returns the release func. The restore
|
||||
// flow holds this across stop→swap→redeploy and redeploys via RedeployLocked
|
||||
// (which does NOT re-acquire it).
|
||||
func (d *Deployer) LockWorkload(id string) func() { return d.workloadLocks.Lock(id) }
|
||||
|
||||
// RedeployLocked re-dispatches w WITHOUT acquiring the per-workload lock,
|
||||
// because the caller (restore) already holds it via LockWorkload. Calling the
|
||||
// normal DispatchPlugin here would deadlock — Go mutexes are not reentrant.
|
||||
// Not for general use.
|
||||
func (d *Deployer) RedeployLocked(ctx context.Context, w plugin.Workload, intent plugin.DeploymentIntent) error {
|
||||
return d.dispatchLocked(ctx, w, intent)
|
||||
}
|
||||
|
||||
// EventPublisher is the interface for publishing events to the event bus.
|
||||
@@ -73,7 +111,11 @@ func New(
|
||||
}
|
||||
|
||||
// SetProxyProvider updates the proxy provider at runtime (e.g., when settings change).
|
||||
// Guarded by proxyMu so concurrent deploys that read d.proxy via PluginDeps()
|
||||
// observe a coherent value (previously a torn-pointer race under -race).
|
||||
func (d *Deployer) SetProxyProvider(provider proxy.Provider) {
|
||||
d.proxyMu.Lock()
|
||||
defer d.proxyMu.Unlock()
|
||||
d.proxy = provider
|
||||
}
|
||||
|
||||
@@ -84,20 +126,34 @@ func (d *Deployer) SetPreDeployBackuper(b PreDeployBackuper) {
|
||||
d.backuper = b
|
||||
}
|
||||
|
||||
// MaybeBackupBeforeDeploy creates a "pre-deploy" Tinyforge DB snapshot when
|
||||
// the setting is enabled. Failures are logged but do not abort the deploy:
|
||||
// missing a backup is preferable to refusing to ship a fix. Exposed so
|
||||
// Source plugins can opt into the same behaviour.
|
||||
func (d *Deployer) MaybeBackupBeforeDeploy(deployID string, settings store.Settings) {
|
||||
if !settings.AutoBackupBeforeDeploy || d.backuper == nil {
|
||||
// maybeBackupBeforeDeploy takes a "pre-deploy" Tinyforge DB snapshot before a
|
||||
// deploy when the operator enabled auto_backup_before_deploy. It is called on
|
||||
// the unified deploy path (DispatchPlugin) so the setting actually fires — its
|
||||
// predecessor was orphaned when the legacy executeDeploy pipeline (its only
|
||||
// caller) was removed in the workload-first cutover, silently disabling the
|
||||
// setting.
|
||||
//
|
||||
// Fail-open: a nil backuper, a settings-load error, or a backup failure all
|
||||
// skip the snapshot without blocking the deploy — missing a backup is
|
||||
// preferable to refusing to ship a fix.
|
||||
func (d *Deployer) maybeBackupBeforeDeploy(workloadID string) {
|
||||
if d.backuper == nil {
|
||||
return
|
||||
}
|
||||
settings, err := d.store.GetSettings()
|
||||
if err != nil {
|
||||
slog.Warn("pre-deploy backup: load settings", "workload", workloadID, "error", err)
|
||||
return
|
||||
}
|
||||
if !settings.AutoBackupBeforeDeploy {
|
||||
return
|
||||
}
|
||||
backup, err := d.backuper.CreateBackup("pre-deploy")
|
||||
if err != nil {
|
||||
slog.Warn("pre-deploy backup failed", "deploy_id", deployID, "error", err)
|
||||
slog.Warn("pre-deploy backup failed", "workload", workloadID, "error", err)
|
||||
return
|
||||
}
|
||||
slog.Info("pre-deploy backup created", "deploy_id", deployID, "backup_id", backup.ID, "filename", backup.Filename)
|
||||
slog.Info("pre-deploy backup created", "workload", workloadID, "backup_id", backup.ID, "filename", backup.Filename)
|
||||
}
|
||||
|
||||
// SetDNSProvider sets the DNS provider for managing DNS records during deployments.
|
||||
@@ -110,8 +166,11 @@ func (d *Deployer) SetDNSProvider(provider dns.Provider) {
|
||||
|
||||
// Drain waits for all in-progress deploys to complete. Call this during graceful shutdown.
|
||||
func (d *Deployer) Drain() {
|
||||
if !d.shuttingDown.CompareAndSwap(false, true) {
|
||||
// Already draining.
|
||||
d.drainMu.Lock()
|
||||
already := d.shuttingDown.Swap(true)
|
||||
d.drainMu.Unlock()
|
||||
if already {
|
||||
slog.Info("deployer: drain already in progress")
|
||||
}
|
||||
slog.Info("deployer: draining in-progress deploys")
|
||||
d.activeWg.Wait()
|
||||
@@ -121,11 +180,17 @@ func (d *Deployer) Drain() {
|
||||
// ShuttingDown reports whether Drain() has been called.
|
||||
func (d *Deployer) ShuttingDown() bool { return d.shuttingDown.Load() }
|
||||
|
||||
// rejectIfDraining is exposed in case any plugin wants the same hard-stop
|
||||
// behaviour the legacy pipeline used.
|
||||
func (d *Deployer) rejectIfDraining() error {
|
||||
// beginDispatch atomically rejects when draining and otherwise registers the
|
||||
// in-flight unit on activeWg. The shuttingDown check and the Add(1) MUST be
|
||||
// done together under drainMu (see the field comment): Drain sets the flag
|
||||
// under the same mutex before Wait(), so once Wait() observes a zero counter
|
||||
// no further Add can race it. Callers must defer d.activeWg.Done() on success.
|
||||
func (d *Deployer) beginDispatch() error {
|
||||
d.drainMu.Lock()
|
||||
defer d.drainMu.Unlock()
|
||||
if d.shuttingDown.Load() {
|
||||
return fmt.Errorf("deployer is shutting down, rejecting new deploy")
|
||||
}
|
||||
d.activeWg.Add(1)
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -4,26 +4,76 @@ import (
|
||||
"context"
|
||||
"fmt"
|
||||
|
||||
"github.com/alexei/tinyforge/internal/metrics"
|
||||
"github.com/alexei/tinyforge/internal/store"
|
||||
"github.com/alexei/tinyforge/internal/workload/plugin"
|
||||
)
|
||||
|
||||
// DispatchPlugin routes a DeploymentIntent for w to the matching Source
|
||||
// plugin. This is the new unified deploy path; the legacy executeDeploy
|
||||
// remains in place until Phase 6 ports image-deploy logic into
|
||||
// source/image. While both exist, callers must pick: webhook/registry
|
||||
// triggers + image deploys still go through the legacy path, while
|
||||
// /api/hooks/generic + the unified webhook ingress go through here.
|
||||
// plugin. This is the unified deploy path for every source kind (the legacy
|
||||
// executeDeploy pipeline was removed in the workload-first cutover). When the
|
||||
// operator enables auto_backup_before_deploy, a pre-deploy Tinyforge DB
|
||||
// snapshot is taken here, after the source resolves and before it runs.
|
||||
func (d *Deployer) DispatchPlugin(ctx context.Context, w plugin.Workload, intent plugin.DeploymentIntent) error {
|
||||
// C1: serialize all deploy-class work per workload. Held across the whole
|
||||
// deploy so a concurrent deploy/rollback/promote/trigger — or a volume
|
||||
// restore (which redeploys via RedeployLocked while holding this) — can
|
||||
// never interleave container changes for the same workload.
|
||||
unlock := d.workloadLocks.Lock(w.ID)
|
||||
defer unlock()
|
||||
return d.dispatchLocked(ctx, w, intent)
|
||||
}
|
||||
|
||||
// dispatchLocked is DispatchPlugin's body, assuming the per-workload lock is
|
||||
// already held. RedeployLocked calls it directly during restore.
|
||||
func (d *Deployer) dispatchLocked(ctx context.Context, w plugin.Workload, intent plugin.DeploymentIntent) error {
|
||||
if err := d.beginDispatch(); err != nil {
|
||||
metrics.DeploysTotal.Inc(w.SourceKind, "rejected_draining")
|
||||
return err
|
||||
}
|
||||
defer d.activeWg.Done()
|
||||
src, err := plugin.GetSource(w.SourceKind)
|
||||
if err != nil {
|
||||
// Unknown source: use the constant "unknown" sentinel for the
|
||||
// label so a typo-spam attack can't grow the metrics map with
|
||||
// one series per bogus source_kind. The actual user-supplied
|
||||
// value still surfaces via the wrapped error / event log.
|
||||
metrics.DeploysTotal.Inc("unknown", "unknown_source")
|
||||
return fmt.Errorf("dispatch %s: %w", w.Name, err)
|
||||
}
|
||||
return src.Deploy(ctx, d.PluginDeps(), w, intent)
|
||||
// Optional operator-enabled pre-deploy DB snapshot. Fail-open: never
|
||||
// blocks shipping a deploy. Runs before any source-internal idempotency
|
||||
// check (e.g. the image source's same-tag short-circuit), so a same-tag
|
||||
// redeploy still snapshots — "backup before every deploy attempt".
|
||||
d.maybeBackupBeforeDeploy(w.ID)
|
||||
startedAt := store.Now()
|
||||
err = src.Deploy(ctx, d.PluginDeps(), w, intent)
|
||||
outcome := "success"
|
||||
if err != nil {
|
||||
outcome = "failure"
|
||||
}
|
||||
metrics.DeploysTotal.Inc(w.SourceKind, outcome)
|
||||
// Append to the structured deploy ledger (powers the per-app history
|
||||
// panel + rollback). Best-effort and secret-free; see recordDeployHistory.
|
||||
// Only DispatchPlugin records — reconcile/teardown are not deploys.
|
||||
d.recordDeployHistory(w, intent, outcome, err, startedAt)
|
||||
return err
|
||||
}
|
||||
|
||||
// DispatchTeardown routes a teardown call to the matching Source plugin.
|
||||
// Used when a workload is deleted.
|
||||
// Used when a workload is deleted. Tracked via activeWg so Drain() honours
|
||||
// in-progress teardowns just like deploys.
|
||||
func (d *Deployer) DispatchTeardown(ctx context.Context, w plugin.Workload) error {
|
||||
// Teardown mutates the same containers/routes a deploy does, so it takes the
|
||||
// per-workload lock too (C1). Callers tear down distinct workload ids
|
||||
// sequentially (e.g. preview children then parent), never nested, so no
|
||||
// self-deadlock.
|
||||
unlock := d.workloadLocks.Lock(w.ID)
|
||||
defer unlock()
|
||||
if err := d.beginDispatch(); err != nil {
|
||||
return err
|
||||
}
|
||||
defer d.activeWg.Done()
|
||||
src, err := plugin.GetSource(w.SourceKind)
|
||||
if err != nil {
|
||||
return fmt.Errorf("dispatch teardown %s: %w", w.Name, err)
|
||||
@@ -33,8 +83,17 @@ func (d *Deployer) DispatchTeardown(ctx context.Context, w plugin.Workload) erro
|
||||
|
||||
// DispatchReconcile routes a Reconcile call. Periodic reconciler iterates
|
||||
// every Workload and calls this; idle Sources should make it a cheap
|
||||
// no-op.
|
||||
// no-op. Tracked via activeWg so a long-running reconcile blocks Drain().
|
||||
func (d *Deployer) DispatchReconcile(ctx context.Context, w plugin.Workload) error {
|
||||
if err := d.beginDispatch(); err != nil {
|
||||
// Silent skip — reconcile is a periodic tick, not a user-initiated
|
||||
// action, so we don't want to surface "draining" errors back to the
|
||||
// reconciler loop. The next tick after restart will catch up. Routing
|
||||
// through beginDispatch keeps the activeWg.Add atomic with the drain
|
||||
// check (see Drain) instead of a bare shuttingDown.Load + Add race.
|
||||
return nil
|
||||
}
|
||||
defer d.activeWg.Done()
|
||||
src, err := plugin.GetSource(w.SourceKind)
|
||||
if err != nil {
|
||||
return fmt.Errorf("dispatch reconcile %s: %w", w.Name, err)
|
||||
@@ -52,10 +111,13 @@ func (d *Deployer) PluginDeps() plugin.Deps {
|
||||
d.dnsMu.RLock()
|
||||
dnsProvider := d.dns
|
||||
d.dnsMu.RUnlock()
|
||||
d.proxyMu.RLock()
|
||||
proxyProvider := d.proxy
|
||||
d.proxyMu.RUnlock()
|
||||
return plugin.Deps{
|
||||
Store: d.store,
|
||||
Docker: d.docker,
|
||||
Proxy: d.proxy,
|
||||
Proxy: proxyProvider,
|
||||
DNS: dnsProvider,
|
||||
Health: d.health,
|
||||
Notifier: d.notifier,
|
||||
|
||||
@@ -21,9 +21,9 @@ import (
|
||||
type fakeSource struct {
|
||||
kind string
|
||||
|
||||
mu sync.Mutex
|
||||
deployErr error
|
||||
teardownErr error
|
||||
mu sync.Mutex
|
||||
deployErr error
|
||||
teardownErr error
|
||||
reconcileErr error
|
||||
|
||||
deployCount atomic.Int32
|
||||
@@ -34,8 +34,8 @@ type fakeSource struct {
|
||||
lastDeps plugin.Deps
|
||||
}
|
||||
|
||||
func (f *fakeSource) Kind() string { return f.kind }
|
||||
func (f *fakeSource) SchemaSample() any { return struct{}{} }
|
||||
func (f *fakeSource) Kind() string { return f.kind }
|
||||
func (f *fakeSource) SchemaSample() any { return struct{}{} }
|
||||
func (f *fakeSource) Validate(json.RawMessage) error { return nil }
|
||||
|
||||
func (f *fakeSource) Deploy(_ context.Context, deps plugin.Deps, _ plugin.Workload, intent plugin.DeploymentIntent) error {
|
||||
@@ -250,6 +250,84 @@ func TestDispatchReconcile_PropagatesSourceError(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
// ---- Deploy history recording ----------------------------------------------
|
||||
|
||||
// seedDispatchWorkload inserts a real workloads row so deploy_history's FK
|
||||
// (workload_id REFERENCES workloads) is satisfied, then returns a plugin
|
||||
// workload pointing at the fake source.
|
||||
func seedDispatchWorkload(t *testing.T, d *Deployer) plugin.Workload {
|
||||
t.Helper()
|
||||
row, err := d.store.CreateWorkload(store.Workload{Kind: "project", RefID: "dh", Name: "dh"})
|
||||
if err != nil {
|
||||
t.Fatalf("CreateWorkload: %v", err)
|
||||
}
|
||||
return plugin.Workload{ID: row.ID, Name: "dh", SourceKind: "dispatchertest"}
|
||||
}
|
||||
|
||||
func TestDispatchPlugin_RecordsSuccessHistory(t *testing.T) {
|
||||
resetFake(t)
|
||||
d := newTestDeployer(t)
|
||||
w := seedDispatchWorkload(t, d)
|
||||
|
||||
intent := plugin.DeploymentIntent{Reason: "manual", Reference: "v9", TriggeredBy: "alice",
|
||||
Metadata: map[string]string{"note": "ship it"}}
|
||||
if err := d.DispatchPlugin(context.Background(), w, intent); err != nil {
|
||||
t.Fatalf("DispatchPlugin: %v", err)
|
||||
}
|
||||
rows, err := d.store.ListDeployHistory(w.ID, 10, 0)
|
||||
if err != nil {
|
||||
t.Fatalf("ListDeployHistory: %v", err)
|
||||
}
|
||||
if len(rows) != 1 {
|
||||
t.Fatalf("expected 1 history row, got %d", len(rows))
|
||||
}
|
||||
got := rows[0]
|
||||
if got.Outcome != "success" || got.Reason != "manual" || got.Reference != "v9" {
|
||||
t.Fatalf("unexpected row: %+v", got)
|
||||
}
|
||||
if got.TriggeredBy != "alice" || got.Note != "ship it" {
|
||||
t.Fatalf("intent fields not recorded: %+v", got)
|
||||
}
|
||||
if got.Error != "" {
|
||||
t.Fatalf("success row must have empty error, got %q", got.Error)
|
||||
}
|
||||
}
|
||||
|
||||
func TestDispatchPlugin_RecordsFailureWithoutLeakingError(t *testing.T) {
|
||||
resetFake(t)
|
||||
d := newTestDeployer(t)
|
||||
w := seedDispatchWorkload(t, d)
|
||||
|
||||
// A deploy error carrying a "secret" must never reach the persisted row.
|
||||
dispatchTestSource.setDeployErr(errors.New("compose up failed (output: SUPER_SECRET=hunter2)"))
|
||||
_ = d.DispatchPlugin(context.Background(), w, plugin.DeploymentIntent{Reason: "manual"})
|
||||
|
||||
rows, _ := d.store.ListDeployHistory(w.ID, 10, 0)
|
||||
if len(rows) != 1 {
|
||||
t.Fatalf("expected 1 history row, got %d", len(rows))
|
||||
}
|
||||
if rows[0].Outcome != "failure" {
|
||||
t.Fatalf("expected failure outcome, got %q", rows[0].Outcome)
|
||||
}
|
||||
if strings.Contains(rows[0].Error, "hunter2") || strings.Contains(rows[0].Error, "SECRET") {
|
||||
t.Fatalf("raw error leaked into history: %q", rows[0].Error)
|
||||
}
|
||||
}
|
||||
|
||||
func TestDispatchReconcile_RecordsNoHistory(t *testing.T) {
|
||||
resetFake(t)
|
||||
d := newTestDeployer(t)
|
||||
w := seedDispatchWorkload(t, d)
|
||||
|
||||
if err := d.DispatchReconcile(context.Background(), w); err != nil {
|
||||
t.Fatalf("DispatchReconcile: %v", err)
|
||||
}
|
||||
rows, _ := d.store.ListDeployHistory(w.ID, 10, 0)
|
||||
if len(rows) != 0 {
|
||||
t.Fatalf("reconcile must not write history, got %d rows", len(rows))
|
||||
}
|
||||
}
|
||||
|
||||
// ---- PluginDeps -------------------------------------------------------------
|
||||
|
||||
func TestPluginDeps_PassesStoreAndEncKey(t *testing.T) {
|
||||
|
||||
@@ -0,0 +1,107 @@
|
||||
package deployer
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"sync/atomic"
|
||||
"testing"
|
||||
|
||||
"github.com/alexei/tinyforge/internal/store"
|
||||
"github.com/alexei/tinyforge/internal/workload/plugin"
|
||||
)
|
||||
|
||||
// fakeBackuper records pre-deploy backup calls so the dispatch wiring can be
|
||||
// asserted. err (when set) simulates a backup failure.
|
||||
type fakeBackuper struct {
|
||||
count atomic.Int32
|
||||
lastType atomic.Value // string
|
||||
err error
|
||||
}
|
||||
|
||||
func (f *fakeBackuper) CreateBackup(backupType string) (store.Backup, error) {
|
||||
f.count.Add(1)
|
||||
f.lastType.Store(backupType)
|
||||
if f.err != nil {
|
||||
return store.Backup{}, f.err
|
||||
}
|
||||
return store.Backup{ID: "b1", Filename: "tinyforge-pre-deploy.db"}, nil
|
||||
}
|
||||
|
||||
func setAutoBackup(t *testing.T, d *Deployer, enabled bool) {
|
||||
t.Helper()
|
||||
s, err := d.store.GetSettings()
|
||||
if err != nil {
|
||||
t.Fatalf("get settings: %v", err)
|
||||
}
|
||||
s.AutoBackupBeforeDeploy = enabled
|
||||
if err := d.store.UpdateSettings(s); err != nil {
|
||||
t.Fatalf("update settings: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// Regression: the pre-deploy backup hook was orphaned after the cutover (no
|
||||
// caller on DispatchPlugin), making auto_backup_before_deploy a silent no-op.
|
||||
func TestDispatchPlugin_PreDeployBackup_FiresWhenEnabled(t *testing.T) {
|
||||
resetFake(t)
|
||||
d := newTestDeployer(t)
|
||||
b := &fakeBackuper{}
|
||||
d.SetPreDeployBackuper(b)
|
||||
setAutoBackup(t, d, true)
|
||||
|
||||
if err := d.DispatchPlugin(context.Background(), sampleWorkload(), plugin.DeploymentIntent{}); err != nil {
|
||||
t.Fatalf("dispatch: %v", err)
|
||||
}
|
||||
if got := b.count.Load(); got != 1 {
|
||||
t.Fatalf("CreateBackup called %d times, want 1", got)
|
||||
}
|
||||
if bt, _ := b.lastType.Load().(string); bt != "pre-deploy" {
|
||||
t.Fatalf("backup type = %q, want pre-deploy", bt)
|
||||
}
|
||||
if got := dispatchTestSource.deployCount.Load(); got != 1 {
|
||||
t.Fatalf("Deploy ran %d times, want 1", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestDispatchPlugin_PreDeployBackup_SkippedWhenDisabled(t *testing.T) {
|
||||
resetFake(t)
|
||||
d := newTestDeployer(t)
|
||||
b := &fakeBackuper{}
|
||||
d.SetPreDeployBackuper(b)
|
||||
setAutoBackup(t, d, false)
|
||||
|
||||
if err := d.DispatchPlugin(context.Background(), sampleWorkload(), plugin.DeploymentIntent{}); err != nil {
|
||||
t.Fatalf("dispatch: %v", err)
|
||||
}
|
||||
if got := b.count.Load(); got != 0 {
|
||||
t.Fatalf("CreateBackup called %d times, want 0 (setting off)", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestDispatchPlugin_PreDeployBackup_NilBackuperNoPanic(t *testing.T) {
|
||||
resetFake(t)
|
||||
d := newTestDeployer(t)
|
||||
setAutoBackup(t, d, true) // enabled, but no backuper wired
|
||||
|
||||
if err := d.DispatchPlugin(context.Background(), sampleWorkload(), plugin.DeploymentIntent{}); err != nil {
|
||||
t.Fatalf("dispatch must not panic/fail with a nil backuper: %v", err)
|
||||
}
|
||||
if got := dispatchTestSource.deployCount.Load(); got != 1 {
|
||||
t.Fatalf("Deploy ran %d times, want 1", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestDispatchPlugin_PreDeployBackup_FailOpen(t *testing.T) {
|
||||
resetFake(t)
|
||||
d := newTestDeployer(t)
|
||||
b := &fakeBackuper{err: errors.New("disk full")}
|
||||
d.SetPreDeployBackuper(b)
|
||||
setAutoBackup(t, d, true)
|
||||
|
||||
// A failed backup is logged but must NOT block the deploy.
|
||||
if err := d.DispatchPlugin(context.Background(), sampleWorkload(), plugin.DeploymentIntent{}); err != nil {
|
||||
t.Fatalf("deploy must succeed when backup fails (fail-open): %v", err)
|
||||
}
|
||||
if got := dispatchTestSource.deployCount.Load(); got != 1 {
|
||||
t.Fatalf("Deploy ran %d times, want 1 (despite backup failure)", got)
|
||||
}
|
||||
}
|
||||
+119
-20
@@ -2,20 +2,58 @@ package docker
|
||||
|
||||
import (
|
||||
"archive/tar"
|
||||
"bufio"
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
|
||||
"github.com/moby/moby/api/types/build"
|
||||
"github.com/moby/moby/client"
|
||||
)
|
||||
|
||||
// BuildImage builds a Docker image from a directory containing a Dockerfile.
|
||||
// The directory is packaged as a tar archive and sent to the Docker daemon.
|
||||
// The tag parameter is the image name:tag to apply (e.g., "dw-site-myapp:latest").
|
||||
// BuildImage builds a Docker image from a directory containing a Dockerfile
// at the context root. Kept as a thin wrapper around BuildImageAt for the
// static-site plugin which always emits its generated Dockerfile at the
// context root. New code should prefer BuildImageAt so the Dockerfile path
// is explicit.
//
// contextDir is the build-context directory and tag is the image name:tag to
// apply. The call delegates to BuildImageAt with the fixed Dockerfile path
// "Dockerfile" and a nil log callback, and returns its error unchanged.
func (c *Client) BuildImage(ctx context.Context, contextDir, tag string) error {
	return c.BuildImageAt(ctx, contextDir, "Dockerfile", tag, nil)
}
|
||||
|
||||
// BuildImageAt builds a Docker image from a tar of contextDir, using the
|
||||
// Dockerfile at `dockerfile` *inside* the context (typically "Dockerfile"
|
||||
// but may be e.g. "docker/Dockerfile" when the user-supplied repo layout
|
||||
// keeps Dockerfiles in a subfolder).
|
||||
//
|
||||
// The dockerfile argument is the path *relative to contextDir*. Empty
|
||||
// strings are normalised to "Dockerfile" so callers can pass through a
|
||||
// user config value without sanitising twice.
|
||||
//
|
||||
// logFn, if non-nil, is invoked for every non-empty `stream` line the
|
||||
// daemon emits during the build. Callers use this to forward live build
|
||||
// progress (e.g. SSE bus). Errors from the daemon are NOT delivered via
|
||||
// logFn — they surface as the returned error so the caller's failure
|
||||
// path stays the single source of truth.
|
||||
func (c *Client) BuildImageAt(ctx context.Context, contextDir, dockerfile, tag string, logFn func(line string)) error {
|
||||
if dockerfile == "" {
|
||||
dockerfile = "Dockerfile"
|
||||
}
|
||||
// Normalise to forward slashes — the tar entry names use them and the
|
||||
// Docker daemon expects the same.
|
||||
dockerfile = filepath.ToSlash(dockerfile)
|
||||
// Defence-in-depth: the dockerfile path is relative to contextDir and
|
||||
// is increasingly user/config-supplied (subfolder Dockerfiles). Reject
|
||||
// absolute paths and any `..` traversal at the boundary so a value like
|
||||
// "../../etc/passwd" can never be handed to the daemon's build options,
|
||||
// regardless of which builder backend resolves it.
|
||||
if filepath.IsAbs(dockerfile) || strings.HasPrefix(dockerfile, "/") ||
|
||||
dockerfile == ".." || strings.HasPrefix(dockerfile, "../") || strings.Contains(dockerfile, "/../") {
|
||||
return fmt.Errorf("docker build: invalid dockerfile path %q (must be relative to the build context, no traversal)", dockerfile)
|
||||
}
|
||||
// Create tar archive of the build context.
|
||||
pr, pw := io.Pipe()
|
||||
|
||||
@@ -50,16 +88,14 @@ func (c *Client) BuildImage(ctx context.Context, contextDir, tag string) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
file, err := os.Open(path)
|
||||
if err != nil {
|
||||
return fmt.Errorf("open %s: %w", path, err)
|
||||
// Per-file close, NOT defer. `defer file.Close()` inside the
|
||||
// WalkFunc only runs when the outer goroutine returns — for a
|
||||
// build context with thousands of files (node_modules-heavy
|
||||
// repo) that leaks one fd per file until the walk completes
|
||||
// and trips EMFILE on default ulimit=1024 systems.
|
||||
if err := streamFileIntoTar(tw, path, relPath); err != nil {
|
||||
return err
|
||||
}
|
||||
defer file.Close()
|
||||
|
||||
if _, err := io.Copy(tw, file); err != nil {
|
||||
return fmt.Errorf("copy %s to tar: %w", relPath, err)
|
||||
}
|
||||
|
||||
return nil
|
||||
})
|
||||
|
||||
@@ -69,8 +105,16 @@ func (c *Client) BuildImage(ctx context.Context, contextDir, tag string) error {
|
||||
pw.CloseWithError(err)
|
||||
}()
|
||||
|
||||
// Pin the legacy builder explicitly. On Docker Engine 23+ BuildKit
|
||||
// is the default for the CLI, but the daemon honours the explicit
|
||||
// Version field on ImageBuildOptions. Legacy builder does NOT support
|
||||
// `RUN --mount=type=bind,source=/host` so a malicious Dockerfile
|
||||
// cannot mount host paths into the build context. Switching to
|
||||
// BuildKit later requires (a) Dockerfile-content validation to
|
||||
// reject bind-mount hints, or (b) an explicit per-workload opt-in.
|
||||
resp, err := c.api.ImageBuild(ctx, pr, client.ImageBuildOptions{
|
||||
Dockerfile: "Dockerfile",
|
||||
Version: build.BuilderV1,
|
||||
Dockerfile: dockerfile,
|
||||
Tags: []string{tag},
|
||||
Remove: true,
|
||||
ForceRemove: true,
|
||||
@@ -80,16 +124,71 @@ func (c *Client) BuildImage(ctx context.Context, contextDir, tag string) error {
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
// Read the build output to completion (required for the build to finish).
|
||||
output, err := io.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
// Drain the daemon's NDJSON stream to completion. The stream MUST
|
||||
// be read for the build to finish — closing the body early aborts
|
||||
// the build. We parse line-by-line into the {Stream, Error} shape
|
||||
// the daemon emits so an honest `{"error":"..."}` line surfaces
|
||||
// without false positives from informational `{"stream":"error
|
||||
// handling: retrying..."}` chatter that the old strings.Contains
|
||||
// path would have flagged.
|
||||
type buildLine struct {
|
||||
Stream string `json:"stream,omitempty"`
|
||||
Error string `json:"error,omitempty"`
|
||||
}
|
||||
scanner := bufio.NewScanner(resp.Body)
|
||||
// Some build steps emit single lines exceeding the default 64 KiB
|
||||
// (e.g. a fat go-mod-download dump). Bump to 1 MiB so we don't
|
||||
// silently truncate and miss the trailing error line.
|
||||
scanner.Buffer(make([]byte, 64*1024), 1024*1024)
|
||||
var firstErr string
|
||||
for scanner.Scan() {
|
||||
line := scanner.Bytes()
|
||||
if len(line) == 0 {
|
||||
continue
|
||||
}
|
||||
var bl buildLine
|
||||
if err := json.Unmarshal(line, &bl); err != nil {
|
||||
// Non-JSON line — daemon shouldn't produce these, but
|
||||
// don't fail the build over a parse hiccup.
|
||||
continue
|
||||
}
|
||||
if bl.Error != "" && firstErr == "" {
|
||||
firstErr = bl.Error
|
||||
}
|
||||
if logFn != nil && bl.Stream != "" {
|
||||
logFn(bl.Stream)
|
||||
}
|
||||
}
|
||||
if err := scanner.Err(); err != nil {
|
||||
return fmt.Errorf("read build output for %s: %w", tag, err)
|
||||
}
|
||||
|
||||
// Check for error in build output.
|
||||
if strings.Contains(string(output), `"error"`) {
|
||||
return fmt.Errorf("build image %s: build errors in output", tag)
|
||||
if firstErr != "" {
|
||||
return fmt.Errorf("build image %s: %s", tag, firstErr)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// streamFileIntoTar copies the contents of the file at path into tw and
// closes the file before it returns, so the descriptor is held for one file
// at a time rather than for the whole directory walk. The tar header for the
// entry must already have been written by the caller; relPath is only used
// to label errors.
//
// An open failure is reported as "open <path>: ...". A copy failure, or a
// close failure when the copy itself succeeded, is reported as
// "copy <relPath> to tar: ..."; the copy error wins when both occur.
func streamFileIntoTar(tw *tar.Writer, path, relPath string) error {
	src, err := os.Open(path)
	if err != nil {
		return fmt.Errorf("open %s: %w", path, err)
	}

	_, err = io.Copy(tw, src)
	// Close explicitly (no defer) so the release point is visible here and
	// happens whether or not the copy succeeded.
	closeErr := src.Close()

	switch {
	case err != nil:
		return fmt.Errorf("copy %s to tar: %w", relPath, err)
	case closeErr != nil:
		return fmt.Errorf("copy %s to tar: %w", relPath, closeErr)
	}
	return nil
}
|
||||
|
||||
@@ -108,3 +108,29 @@ func (c *Client) GetSystemStats(ctx context.Context) (SystemStats, error) {
|
||||
|
||||
return stats, nil
|
||||
}
|
||||
|
||||
// BuildCachePruneResult reports the outcome of a build-cache prune. It is the
// value returned by PruneBuildCache and serialises to JSON with snake_case
// keys.
type BuildCachePruneResult struct {
	// CachesDeleted is the number of cache records removed.
	CachesDeleted int `json:"caches_deleted"`
	// SpaceReclaimed is the number of bytes reclaimed.
	SpaceReclaimed int64 `json:"space_reclaimed"`
}
|
||||
|
||||
// PruneBuildCache deletes unused Docker build-cache records and returns the
|
||||
// number of records removed and bytes reclaimed. Docker's build-cache API is
|
||||
// prune-by-filter only — there is no surgical per-record eviction — so this
|
||||
// is the daemon-wide "prune unused" operation.
|
||||
//
|
||||
// When all is false (the default), only build cache not currently in use is
|
||||
// removed, so an app's next rebuild still hits its warm cache. When all is
|
||||
// true, every build-cache record is removed regardless of use, forcing a cold
|
||||
// rebuild for every app.
|
||||
func (c *Client) PruneBuildCache(ctx context.Context, all bool) (BuildCachePruneResult, error) {
|
||||
res, err := c.api.BuildCachePrune(ctx, client.BuildCachePruneOptions{All: all})
|
||||
if err != nil {
|
||||
return BuildCachePruneResult{}, fmt.Errorf("prune build cache: %w", err)
|
||||
}
|
||||
return BuildCachePruneResult{
|
||||
CachesDeleted: len(res.Report.CachesDeleted),
|
||||
SpaceReclaimed: int64(res.Report.SpaceReclaimed),
|
||||
}, nil
|
||||
}
|
||||
|
||||
+22
-6
@@ -27,6 +27,13 @@ const (
|
||||
|
||||
// EventStackStatus is emitted when a compose stack status changes.
|
||||
EventStackStatus EventType = "stack_status"
|
||||
|
||||
// EventBuildLog is emitted for each line of a streaming image build.
|
||||
// Per-line events are ephemeral (not persisted to the event_log) — they
|
||||
// exist to drive a live tail UI during the slow "building" phase of a
|
||||
// dockerfile-source deploy. Subscribers should filter by WorkloadID
|
||||
// because every dockerfile deploy on the box publishes on the same bus.
|
||||
EventBuildLog EventType = "build_log"
|
||||
)
|
||||
|
||||
// Event is a single event published on the bus.
|
||||
@@ -62,12 +69,13 @@ type DeployStatusPayload struct {
|
||||
|
||||
// EventLogPayload is the payload for EventLog events (audit trail).
|
||||
type EventLogPayload struct {
|
||||
ID int64 `json:"id"`
|
||||
Source string `json:"source"`
|
||||
Severity string `json:"severity"`
|
||||
Message string `json:"message"`
|
||||
Metadata string `json:"metadata"`
|
||||
CreatedAt string `json:"created_at"`
|
||||
ID int64 `json:"id"`
|
||||
Source string `json:"source"`
|
||||
WorkloadID string `json:"workload_id"`
|
||||
Severity string `json:"severity"`
|
||||
Message string `json:"message"`
|
||||
Metadata string `json:"metadata"`
|
||||
CreatedAt string `json:"created_at"`
|
||||
}
|
||||
|
||||
// StaticSiteStatusPayload is the payload for EventStaticSiteStatus events.
|
||||
@@ -77,6 +85,14 @@ type StaticSiteStatusPayload struct {
|
||||
Status string `json:"status"`
|
||||
}
|
||||
|
||||
// BuildLogPayload is the payload for EventBuildLog events. One event
// per non-empty line read off the daemon's NDJSON build stream.
type BuildLogPayload struct {
	// WorkloadID identifies the workload whose build produced the line.
	WorkloadID string `json:"workload_id"`
	// Line is the build output line carried by this event.
	Line string `json:"line"`
	// Stream is left out of the JSON when empty.
	// NOTE(review): presumably names the output stream the line came from —
	// confirm against the publisher.
	Stream string `json:"stream,omitempty"`
}
|
||||
|
||||
// StackStatusPayload is the payload for EventStackStatus events.
|
||||
type StackStatusPayload struct {
|
||||
StackID string `json:"stack_id"`
|
||||
|
||||
@@ -0,0 +1,83 @@
|
||||
package gitops
|
||||
|
||||
// source_config JSON keys this package can overlay. Kept as constants so the
// apply, merge, and drift paths agree on the exact key strings.
const (
	// keyPort is the source_config key for the overlayable port.
	keyPort = "port"
	// keyHealthcheck is the source_config key for the overlayable healthcheck.
	keyHealthcheck = "healthcheck"
	// keyDeployStrategy is the source_config key for the deploy strategy.
	keyDeployStrategy = "deploy_strategy"
)
|
||||
|
||||
// Source kinds eligible for GitOps in v1 (git-backed sources only). These are
// the only values supportedKeys returns a key set for.
const (
	// SourceDockerfile is the dockerfile source kind.
	SourceDockerfile = "dockerfile"
	// SourceStatic is the static-site source kind.
	SourceStatic = "static"
)
|
||||
|
||||
// supportedKeys returns the source_config keys a given source kind accepts
|
||||
// from a .tinyforge.yml overlay. A field declared in the file but not in this
|
||||
// set is ignored (not applied, not drift-compared) so a shared file can target
|
||||
// either source without producing dead keys or false drift.
|
||||
//
|
||||
// dockerfile: port + healthcheck + deploy_strategy (its real run knobs).
|
||||
// static: deploy_strategy only (a static site has no port/healthcheck).
|
||||
func supportedKeys(sourceKind string) map[string]bool {
|
||||
switch sourceKind {
|
||||
case SourceDockerfile:
|
||||
return map[string]bool{keyPort: true, keyHealthcheck: true, keyDeployStrategy: true}
|
||||
case SourceStatic:
|
||||
return map[string]bool{keyDeployStrategy: true}
|
||||
default:
|
||||
return nil
|
||||
}
|
||||
}
|
||||
|
||||
// IsEligibleSource reports whether GitOps may be enabled for a source kind.
|
||||
func IsEligibleSource(sourceKind string) bool {
|
||||
return supportedKeys(sourceKind) != nil
|
||||
}
|
||||
|
||||
// ApplyPlan is the typed, multi-target plan for applying an overlay. In v1 only
// SourceConfigPatch is populated; EnvUpserts/Faces are reserved so env (the
// workload_env table) and faces (the public_faces column) can be added later
// without reshaping the apply path — they are NOT in v1 (env would re-open the
// secrets-in-repo hole; faces live in a sibling store).
type ApplyPlan struct {
	// SourceConfigPatch holds the source_config keys to overlay onto the live
	// config. Only keys supported by the target source are present. BuildPlan
	// always sets it to a non-nil (possibly empty) map.
	SourceConfigPatch map[string]any

	// Reserved for future phases (kept commented out until then):
	// EnvUpserts []store.WorkloadEnv
	// Faces []plugin.PublicFace
}
|
||||
|
||||
// declaredValues returns the present (non-nil) overlay fields keyed by their
|
||||
// source_config JSON key, before the per-source filter. Shared by BuildPlan and
|
||||
// Drift so they agree on what the file declared.
|
||||
func declaredValues(spec Spec) map[string]any {
|
||||
out := map[string]any{}
|
||||
if spec.Deploy.Port != nil {
|
||||
out[keyPort] = *spec.Deploy.Port
|
||||
}
|
||||
if spec.Deploy.Healthcheck != nil {
|
||||
out[keyHealthcheck] = *spec.Deploy.Healthcheck
|
||||
}
|
||||
if spec.Deploy.DeployStrategy != nil {
|
||||
out[keyDeployStrategy] = *spec.Deploy.DeployStrategy
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
// BuildPlan maps the present, source-supported overlay fields to a patch for
|
||||
// the given source kind. Unsupported/absent fields are dropped.
|
||||
func BuildPlan(spec Spec, sourceKind string) ApplyPlan {
|
||||
allowed := supportedKeys(sourceKind)
|
||||
patch := map[string]any{}
|
||||
for k, v := range declaredValues(spec) {
|
||||
if allowed[k] {
|
||||
patch[k] = v
|
||||
}
|
||||
}
|
||||
return ApplyPlan{SourceConfigPatch: patch}
|
||||
}
|
||||
@@ -0,0 +1,122 @@
|
||||
package gitops
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"strconv"
|
||||
)
|
||||
|
||||
// DriftEntry is one field where the repo-declared value differs from the live
// stored value. Values are display strings; comparison is done on normalized
// forms so cosmetic differences (default coercion, YAML int vs JSON number)
// don't register as drift.
type DriftEntry struct {
	// Field is the source_config key that drifted.
	Field string `json:"field"`
	// RepoValue is the display form of the value declared in the repo file.
	RepoValue string `json:"repo_value"`
	// LiveValue is the display form of the live value; "(unset)" when the key
	// is absent from the live config.
	LiveValue string `json:"live_value"`
}
|
||||
|
||||
// driftFieldOrder is the stable order drift entries are reported in. Drift
// iterates this slice instead of a map so its output order is deterministic.
var driftFieldOrder = []string{keyPort, keyHealthcheck, keyDeployStrategy}
|
||||
|
||||
// Drift compares the declared overlay (the present, source-supported fields)
|
||||
// against the live source_config and returns the fields that differ. Only
|
||||
// declared fields are considered — a key the file omits is "unmanaged",
|
||||
// neither drift nor clean (review C5). Comparison is post-normalization.
|
||||
func Drift(spec Spec, live json.RawMessage, sourceKind string) ([]DriftEntry, error) {
|
||||
liveMap := map[string]any{}
|
||||
if len(live) > 0 {
|
||||
if err := json.Unmarshal(live, &liveMap); err != nil {
|
||||
return nil, fmt.Errorf("gitops: decode live source_config: %w", err)
|
||||
}
|
||||
}
|
||||
allowed := supportedKeys(sourceKind)
|
||||
declared := declaredValues(spec)
|
||||
|
||||
var entries []DriftEntry
|
||||
for _, k := range driftFieldOrder {
|
||||
repoVal, ok := declared[k]
|
||||
if !ok || !allowed[k] {
|
||||
continue
|
||||
}
|
||||
liveVal, livePresent := liveMap[k]
|
||||
if normalizeField(k, repoVal) == normalizeField(k, liveVal) {
|
||||
continue
|
||||
}
|
||||
entries = append(entries, DriftEntry{
|
||||
Field: k,
|
||||
RepoValue: displayField(k, repoVal, true),
|
||||
LiveValue: displayField(k, liveVal, livePresent),
|
||||
})
|
||||
}
|
||||
return entries, nil
|
||||
}
|
||||
|
||||
// normalizeField returns the canonical comparison form of a field value.
|
||||
func normalizeField(key string, v any) string {
|
||||
switch key {
|
||||
case keyDeployStrategy:
|
||||
// "" and "recreate" are the same effective strategy for dockerfile and
|
||||
// static (see each source's effectiveStrategy).
|
||||
s := toStr(v)
|
||||
if s == "" || s == "recreate" {
|
||||
return "recreate"
|
||||
}
|
||||
return s
|
||||
case keyPort:
|
||||
return canonInt(v)
|
||||
default:
|
||||
return toStr(v)
|
||||
}
|
||||
}
|
||||
|
||||
// displayField renders a value for the UI. present=false means the key is
|
||||
// absent from the live config.
|
||||
func displayField(key string, v any, present bool) string {
|
||||
if !present {
|
||||
return "(unset)"
|
||||
}
|
||||
if key == keyDeployStrategy {
|
||||
if s := toStr(v); s == "" {
|
||||
return "recreate (default)"
|
||||
}
|
||||
}
|
||||
switch n := v.(type) {
|
||||
case float64:
|
||||
// JSON numbers decode as float64; show whole numbers without ".0".
|
||||
return strconv.FormatInt(int64(n), 10)
|
||||
case nil:
|
||||
return "(unset)"
|
||||
default:
|
||||
return fmt.Sprint(v)
|
||||
}
|
||||
}
|
||||
|
||||
// canonInt coerces a numeric representation (YAML int, JSON float64, etc.)
// to a canonical base-10 string for value-equality comparison.
//
// Whole-number float64 values that fit in an int64 are rendered as integers,
// so a YAML `8080` and a JSON `8080` compare equal. Fractional or out-of-range
// floats keep their decimal form rather than being truncated: truncation
// would make e.g. 3000.5 compare equal to 3000 and hide real drift. A nil
// value (key absent) canonicalises to "0". Non-numeric values fall back to
// fmt's default formatting.
func canonInt(v any) string {
	switch n := v.(type) {
	case int:
		return strconv.Itoa(n)
	case int64:
		return strconv.FormatInt(n, 10)
	case float64:
		// The range guard keeps the int64 conversion well-defined; the
		// round-trip comparison rejects fractional values (and NaN).
		if n >= -(1<<63) && n < 1<<63 && n == float64(int64(n)) {
			return strconv.FormatInt(int64(n), 10)
		}
		return strconv.FormatFloat(n, 'f', -1, 64)
	case json.Number:
		return n.String()
	case nil:
		return "0"
	default:
		return fmt.Sprint(v)
	}
}
|
||||
|
||||
// toStr converts a value to its string form: a nil interface becomes "", a
// string is returned as-is, and anything else uses fmt's default formatting.
func toStr(v any) string {
	switch s := v.(type) {
	case nil:
		return ""
	case string:
		return s
	default:
		return fmt.Sprint(v)
	}
}
|
||||
@@ -0,0 +1,96 @@
|
||||
package gitops
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"strings"
|
||||
|
||||
"github.com/alexei/tinyforge/internal/staticsite"
|
||||
)
|
||||
|
||||
// maxConfigBytes caps the .tinyforge.yml fetch at 64 KiB. The file is tiny;
// the cap stops a hostile/misconfigured repo from streaming an unbounded
// body. Fetch passes it as the size limit to the provider's DownloadFile.
const maxConfigBytes = 64 * 1024
|
||||
|
||||
// Status is the outcome of a Fetch. All outcomes are values (not errors) so a
// caller always has something to show: an absent file or a provider blip is a
// normal state, not a 500.
type Status string

// The Status values Fetch can produce.
const (
	// StatusOK: the file is present and parsed.
	StatusOK Status = "ok"
	// StatusNoFile: GitOps is enabled but there is no file at the path.
	StatusNoFile Status = "no_file"
	// StatusFetchFailed: transport/auth/5xx error while reaching the repo.
	StatusFetchFailed Status = "fetch_failed"
	// StatusInvalid: the file is present but failed to parse.
	StatusInvalid Status = "invalid"
)
|
||||
|
||||
// RepoRef is the minimal repo locator Fetch needs. The caller (API layer)
// extracts these from the workload's source_config and decrypts the token —
// this package stays decoupled from the store and source plugins.
type RepoRef struct {
	// Provider is "gitea", "github", "gitlab", or "" to autodetect from BaseURL.
	Provider string
	// BaseURL is the base URL handed to the git provider.
	BaseURL string
	// Owner and Repo identify the repository on the provider.
	Owner string
	Repo  string
	// Branch is the branch the file (and the reported commit SHA) is read from.
	Branch string
	// Token is the decrypted access token; "" for public repos.
	Token string
	// Path is the repo-relative file path; "" defaults to .tinyforge.yml.
	Path string
}
|
||||
|
||||
// Result carries everything the API/UI needs about a fetch. Message is a
// human-safe, token-redacted detail for non-ok statuses.
type Result struct {
	// Status is the outcome of the fetch.
	Status Status
	// Raw is the downloaded file body; Fetch sets it for StatusOK and
	// StatusInvalid only.
	Raw []byte
	// Spec is the parsed file; only meaningful for StatusOK.
	Spec Spec
	// CommitSHA is the best-effort latest commit SHA of the branch; it is ""
	// when the lookup failed or the provider could not be created.
	CommitSHA string
	// Message holds the failure detail for StatusFetchFailed (token-redacted)
	// and StatusInvalid (the parse error).
	Message string
}
|
||||
|
||||
// Fetch reads the .tinyforge.yml from a workload's repo and parses it. Every
|
||||
// failure mode is encoded in Result.Status (never a returned error), with any
|
||||
// detail token-redacted in Result.Message. A missing file is StatusNoFile, not
|
||||
// a failure — never a reason to block or clear config.
|
||||
func Fetch(ctx context.Context, ref RepoRef) Result {
|
||||
provider, err := staticsite.NewGitProvider(staticsite.ProviderType(ref.Provider), ref.BaseURL, ref.Token)
|
||||
if err != nil {
|
||||
return Result{Status: StatusFetchFailed, Message: redact(err, ref.Token)}
|
||||
}
|
||||
|
||||
// Best-effort: the SHA lets the UI show which ref the file came from. A
|
||||
// failure here doesn't sink the fetch — the file read below is what matters.
|
||||
sha, _ := provider.GetLatestCommitSHA(ctx, ref.Owner, ref.Repo, ref.Branch)
|
||||
|
||||
path := ref.Path
|
||||
if path == "" {
|
||||
path = ".tinyforge.yml"
|
||||
}
|
||||
data, err := provider.DownloadFile(ctx, ref.Owner, ref.Repo, ref.Branch, path, maxConfigBytes)
|
||||
if err != nil {
|
||||
if errors.Is(err, staticsite.ErrFileNotFound) {
|
||||
return Result{Status: StatusNoFile, CommitSHA: sha}
|
||||
}
|
||||
return Result{Status: StatusFetchFailed, CommitSHA: sha, Message: redact(err, ref.Token)}
|
||||
}
|
||||
|
||||
spec, err := ParseSpec(data)
|
||||
if err != nil {
|
||||
// Parse errors describe YAML structure (line/col), not the token.
|
||||
return Result{Status: StatusInvalid, Raw: data, CommitSHA: sha, Message: err.Error()}
|
||||
}
|
||||
return Result{Status: StatusOK, Raw: data, Spec: spec, CommitSHA: sha}
|
||||
}
|
||||
|
||||
// redact returns err's message with every occurrence of token replaced by
// "[redacted]", so a fetch failure can be surfaced or persisted without
// leaking the credential (mirrors the sanitizeError convention in the
// static/dockerfile sources). A nil error yields ""; an empty token leaves
// the message untouched.
func redact(err error, token string) string {
	switch {
	case err == nil:
		return ""
	case token == "":
		return err.Error()
	default:
		return strings.ReplaceAll(err.Error(), token, "[redacted]")
	}
}
|
||||
@@ -0,0 +1,162 @@
|
||||
package gitops
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"strings"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// strp returns a pointer to a copy of s, for filling optional string fields
// in test specs.
func strp(s string) *string {
	value := s
	return &value
}
|
||||
// intp returns a pointer to a copy of i, for filling optional int fields in
// test specs.
func intp(i int) *int {
	value := i
	return &value
}
|
||||
|
||||
func TestParseSpec(t *testing.T) {
|
||||
s, err := ParseSpec([]byte("version: 1\ndeploy:\n port: 8080\n deploy_strategy: blue-green\n"))
|
||||
if err != nil {
|
||||
t.Fatalf("valid parse: %v", err)
|
||||
}
|
||||
if s.Version != 1 || s.Deploy.Port == nil || *s.Deploy.Port != 8080 {
|
||||
t.Fatalf("unexpected spec: %+v", s)
|
||||
}
|
||||
if s.Deploy.Healthcheck != nil {
|
||||
t.Fatalf("omitted healthcheck must stay nil")
|
||||
}
|
||||
|
||||
// Unknown keys are rejected — incl. an attempt to declare env (out of v1).
|
||||
if _, err := ParseSpec([]byte("version: 1\ndeploy:\n env:\n FOO: bar\n")); err == nil {
|
||||
t.Fatalf("expected unknown-field error for deploy.env")
|
||||
}
|
||||
if _, err := ParseSpec([]byte("version: 1\nworkloads: []\n")); err == nil {
|
||||
t.Fatalf("expected unknown-field error for top-level workloads")
|
||||
}
|
||||
if _, err := ParseSpec([]byte("version: 2\n")); err == nil {
|
||||
t.Fatalf("expected unsupported-version error")
|
||||
}
|
||||
if _, err := ParseSpec(nil); err == nil {
|
||||
t.Fatalf("expected empty-file error")
|
||||
}
|
||||
}
|
||||
|
||||
func TestBuildPlan_SourceAware(t *testing.T) {
|
||||
spec := Spec{Version: 1, Deploy: DeploySpec{
|
||||
Port: intp(8080), Healthcheck: strp("/h"), DeployStrategy: strp("blue-green"),
|
||||
}}
|
||||
|
||||
df := BuildPlan(spec, SourceDockerfile).SourceConfigPatch
|
||||
if df[keyPort] != 8080 || df[keyHealthcheck] != "/h" || df[keyDeployStrategy] != "blue-green" {
|
||||
t.Fatalf("dockerfile patch wrong: %+v", df)
|
||||
}
|
||||
|
||||
// static has no port/healthcheck — they must NOT leak into its patch.
|
||||
st := BuildPlan(spec, SourceStatic).SourceConfigPatch
|
||||
if _, ok := st[keyPort]; ok {
|
||||
t.Fatalf("static patch must not contain port")
|
||||
}
|
||||
if _, ok := st[keyHealthcheck]; ok {
|
||||
t.Fatalf("static patch must not contain healthcheck")
|
||||
}
|
||||
if st[keyDeployStrategy] != "blue-green" {
|
||||
t.Fatalf("static should keep deploy_strategy: %+v", st)
|
||||
}
|
||||
|
||||
if IsEligibleSource("image") || IsEligibleSource("compose") {
|
||||
t.Fatalf("only dockerfile/static are GitOps-eligible in v1")
|
||||
}
|
||||
if !IsEligibleSource(SourceDockerfile) || !IsEligibleSource(SourceStatic) {
|
||||
t.Fatalf("dockerfile + static must be eligible")
|
||||
}
|
||||
}
|
||||
|
||||
func TestMergeAndValidate_PreservesOmittedFields(t *testing.T) {
|
||||
live := json.RawMessage(`{"repo_owner":"o","repo_name":"r","port":3000,"healthcheck":"/old","deploy_strategy":""}`)
|
||||
spec := Spec{Version: 1, Deploy: DeploySpec{Port: intp(8080)}} // only port declared
|
||||
merged, err := MergeAndValidate(live, BuildPlan(spec, SourceDockerfile), func(json.RawMessage) error { return nil })
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
var m map[string]any
|
||||
if err := json.Unmarshal(merged, &m); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if m["port"].(float64) != 8080 {
|
||||
t.Fatalf("declared port not applied: %v", m["port"])
|
||||
}
|
||||
if m["healthcheck"] != "/old" {
|
||||
t.Fatalf("undeclared healthcheck must be preserved, got %v", m["healthcheck"])
|
||||
}
|
||||
if m["repo_owner"] != "o" {
|
||||
t.Fatalf("untouched repo_owner lost")
|
||||
}
|
||||
}
|
||||
|
||||
func TestMergeAndValidate_RejectsInvalidMergedConfig(t *testing.T) {
|
||||
live := json.RawMessage(`{"port":3000}`)
|
||||
spec := Spec{Version: 1, Deploy: DeploySpec{DeployStrategy: strp("rolling")}}
|
||||
_, err := MergeAndValidate(live, BuildPlan(spec, SourceDockerfile), func(c json.RawMessage) error {
|
||||
var x struct {
|
||||
DeployStrategy string `json:"deploy_strategy"`
|
||||
}
|
||||
_ = json.Unmarshal(c, &x)
|
||||
if x.DeployStrategy == "rolling" {
|
||||
return errors.New("invalid deploy_strategy")
|
||||
}
|
||||
return nil
|
||||
})
|
||||
if err == nil {
|
||||
t.Fatalf("expected the merged config to be rejected as a whole")
|
||||
}
|
||||
}
|
||||
|
||||
func TestDrift_DeclaredOnly_WithNormalization(t *testing.T) {
|
||||
// live: port 3000, healthcheck "/h", strategy "" (== recreate effective).
|
||||
live := json.RawMessage(`{"port":3000,"healthcheck":"/h","deploy_strategy":"","registry_name":"x"}`)
|
||||
// declare: port (changed) + deploy_strategy "recreate" (equal to "" -> no drift).
|
||||
spec := Spec{Version: 1, Deploy: DeploySpec{Port: intp(8080), DeployStrategy: strp("recreate")}}
|
||||
d, err := Drift(spec, live, SourceDockerfile)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if len(d) != 1 {
|
||||
t.Fatalf("want exactly 1 drift (port), got %d: %+v", len(d), d)
|
||||
}
|
||||
if d[0].Field != keyPort || d[0].RepoValue != "8080" || d[0].LiveValue != "3000" {
|
||||
t.Fatalf("port drift wrong: %+v", d[0])
|
||||
}
|
||||
}
|
||||
|
||||
func TestDrift_StaticIgnoresUnsupportedFields(t *testing.T) {
|
||||
live := json.RawMessage(`{"deploy_strategy":"recreate","mode":"static"}`)
|
||||
// port declared but unsupported for static -> ignored; strategy differs -> drift.
|
||||
spec := Spec{Version: 1, Deploy: DeploySpec{Port: intp(8080), DeployStrategy: strp("blue-green")}}
|
||||
d, err := Drift(spec, live, SourceStatic)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if len(d) != 1 || d[0].Field != keyDeployStrategy {
|
||||
t.Fatalf("static should only drift on deploy_strategy: %+v", d)
|
||||
}
|
||||
}
|
||||
|
||||
func TestDrift_UnsetLiveValue(t *testing.T) {
|
||||
spec := Spec{Version: 1, Deploy: DeploySpec{Healthcheck: strp("/up")}}
|
||||
d, err := Drift(spec, json.RawMessage(`{}`), SourceDockerfile)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if len(d) != 1 || d[0].RepoValue != "/up" || d[0].LiveValue != "(unset)" {
|
||||
t.Fatalf("unset live should render as (unset): %+v", d)
|
||||
}
|
||||
}
|
||||
|
||||
func TestRedact_StripsToken(t *testing.T) {
|
||||
msg := redact(errors.New("execute request: token ghp_SECRET rejected"), "ghp_SECRET")
|
||||
if strings.Contains(msg, "ghp_SECRET") {
|
||||
t.Fatalf("token leaked: %s", msg)
|
||||
}
|
||||
if !strings.Contains(msg, "[redacted]") {
|
||||
t.Fatalf("expected redaction marker: %s", msg)
|
||||
}
|
||||
if redact(nil, "x") != "" {
|
||||
t.Fatalf("nil error should redact to empty string")
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,48 @@
|
||||
package gitops
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
)
|
||||
|
||||
// MergeAndValidate overlays the plan's SourceConfigPatch onto a copy of the
|
||||
// live source_config and returns the merged JSON — but only after the target
|
||||
// source's own Validate accepts the *merged* result. This is the hard apply
|
||||
// gate (review C4):
|
||||
//
|
||||
// - omitted-field-preserving: keys the file doesn't declare are untouched, so
|
||||
// a partial .tinyforge.yml never clears live config;
|
||||
// - validate-then-commit: a patch that would produce an invalid config (e.g.
|
||||
// deploy_strategy "blue-green" on a source that rejects it, or a bad port)
|
||||
// is refused as a whole — the function never returns a partial/empty config;
|
||||
// - pure: it does not write anything; the caller persists the returned bytes.
|
||||
//
|
||||
// validate is the matching Source.Validate (passed in to keep this package
|
||||
// decoupled from the source plugins).
|
||||
func MergeAndValidate(live json.RawMessage, plan ApplyPlan, validate func(json.RawMessage) error) (json.RawMessage, error) {
|
||||
// Decode the live config into a generic map we can overlay. An empty/null
|
||||
// live config starts from an empty object rather than failing.
|
||||
merged := map[string]any{}
|
||||
if len(live) > 0 {
|
||||
if err := json.Unmarshal(live, &merged); err != nil {
|
||||
return nil, fmt.Errorf("gitops: decode live source_config: %w", err)
|
||||
}
|
||||
}
|
||||
|
||||
// Overlay only the declared patch keys — everything else is preserved.
|
||||
for k, v := range plan.SourceConfigPatch {
|
||||
merged[k] = v
|
||||
}
|
||||
|
||||
out, err := json.Marshal(merged)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("gitops: encode merged source_config: %w", err)
|
||||
}
|
||||
|
||||
if validate != nil {
|
||||
if err := validate(out); err != nil {
|
||||
return nil, fmt.Errorf("gitops: merged config rejected: %w", err)
|
||||
}
|
||||
}
|
||||
return out, nil
|
||||
}
|
||||
@@ -0,0 +1,57 @@
|
||||
// Package gitops implements config-as-code for repo-backed workloads: a
|
||||
// dockerfile/static workload can read a small .tinyforge.yml from its own repo
|
||||
// that declares a subset of its deploy config. The package is deliberately
|
||||
// decoupled from the store and source plugins — it takes a RepoRef (repo
|
||||
// coords + a decrypted token) and a live source_config blob, and returns a
|
||||
// validated merged config + a field-level drift report. It never writes to the
|
||||
// database and never decides to deploy.
|
||||
//
|
||||
// v1 scope (see plans/gitops/PLAN.md): only source_config-resident fields are
|
||||
// overlayable, and the set is source-aware (dockerfile: port/healthcheck/
|
||||
// deploy_strategy; static: deploy_strategy). env/faces live in separate stores
|
||||
// and are intentionally out of v1; the typed ApplyPlan reserves their slots.
|
||||
package gitops
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
|
||||
"gopkg.in/yaml.v3"
|
||||
)
|
||||
|
||||
// Spec is the parsed shape of a .tinyforge.yml file (v1). It is the value
// ParseSpec decodes into and returns.
|
||||
type Spec struct {
|
||||
// Version is the schema version declared by the file; ParseSpec rejects
// any value other than 1.
Version int `yaml:"version"`
|
||||
// Deploy holds the deploy fields the file declares.
Deploy DeploySpec `yaml:"deploy"`
|
||||
}
|
||||
|
||||
// DeploySpec carries the overlayable deploy fields. Every field is a pointer
|
||||
// so that an omitted key is distinguishable from a zero value — only present
|
||||
// (non-nil) fields are applied or drift-compared, so an absent key never
// clears live config.
|
||||
type DeploySpec struct {
|
||||
// Port is the declared port; nil when the key is omitted.
Port *int `yaml:"port"`
|
||||
// Healthcheck is the declared healthcheck; nil when the key is omitted.
Healthcheck *string `yaml:"healthcheck"`
|
||||
// DeployStrategy is the declared deploy strategy; nil when the key is
// omitted.
DeployStrategy *string `yaml:"deploy_strategy"`
|
||||
}
|
||||
|
||||
// ParseSpec decodes a .tinyforge.yml body. Unknown keys are rejected
|
||||
// (KnownFields) so a typo or an unsupported field — e.g. someone trying to
|
||||
// declare env/faces in v1 — surfaces as an error instead of being silently
|
||||
// dropped. Only version 1 is accepted.
|
||||
func ParseSpec(data []byte) (Spec, error) {
|
||||
var s Spec
|
||||
dec := yaml.NewDecoder(bytes.NewReader(data))
|
||||
dec.KnownFields(true)
|
||||
if err := dec.Decode(&s); err != nil {
|
||||
if errors.Is(err, io.EOF) {
|
||||
return Spec{}, fmt.Errorf("gitops: empty .tinyforge.yml")
|
||||
}
|
||||
return Spec{}, fmt.Errorf("gitops: parse .tinyforge.yml: %w", err)
|
||||
}
|
||||
if s.Version != 1 {
|
||||
return Spec{}, fmt.Errorf("gitops: unsupported version %d (want 1)", s.Version)
|
||||
}
|
||||
return s, nil
|
||||
}
|
||||
@@ -0,0 +1,48 @@
|
||||
// Package keyedmutex provides a lazily-populated per-key mutex, so a critical
|
||||
// section can be serialized per key (e.g. per workload id) without a global
|
||||
// lock. It is the shared form of the pattern that originated inline in the
|
||||
// GitOps sync handler; the deployer (per-workload deploy serialization) and the
|
||||
// volume-snapshot restore single-flight both use it.
|
||||
package keyedmutex
|
||||
|
||||
import "sync"
|
||||
|
||||
// Mutex is a set of mutexes addressed by string key, created on first use.
// The zero value is ready to use. Entries are never removed, so the internal
// map holds one mutex per distinct key ever requested (bounded in practice by
// the number of workloads).
type Mutex struct {
	mu sync.Mutex             // guards m
	m  map[string]*sync.Mutex // key -> its mutex, allocated lazily
}

// get returns the mutex registered for key, registering a new one (and
// allocating the map itself) the first time a key is seen.
func (k *Mutex) get(key string) *sync.Mutex {
	k.mu.Lock()
	defer k.mu.Unlock()

	if k.m == nil {
		k.m = make(map[string]*sync.Mutex)
	}
	if existing, found := k.m[key]; found {
		return existing
	}
	fresh := new(sync.Mutex)
	k.m[key] = fresh
	return fresh
}

// Lock waits until the mutex for key is held by the caller and returns the
// function that releases it.
func (k *Mutex) Lock(key string) func() {
	held := k.get(key)
	held.Lock()
	return held.Unlock
}

// TryLock makes a single non-blocking attempt to take the mutex for key. It
// returns the release function and true on success, or nil and false when the
// key is already locked, so the caller can reject (e.g. HTTP 409) rather than
// queue.
func (k *Mutex) TryLock(key string) (func(), bool) {
	if candidate := k.get(key); candidate.TryLock() {
		return candidate.Unlock, true
	}
	return nil, false
}
|
||||
@@ -0,0 +1,83 @@
|
||||
package keyedmutex
|
||||
|
||||
import (
|
||||
"sync"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
func TestLockSerializesSameKey(t *testing.T) {
|
||||
var m Mutex
|
||||
unlock := m.Lock("a")
|
||||
|
||||
acquired := make(chan struct{})
|
||||
go func() {
|
||||
u := m.Lock("a")
|
||||
close(acquired)
|
||||
u()
|
||||
}()
|
||||
|
||||
select {
|
||||
case <-acquired:
|
||||
t.Fatal("second Lock on the same key acquired while the first was held")
|
||||
case <-time.After(50 * time.Millisecond):
|
||||
// expected: blocked
|
||||
}
|
||||
unlock()
|
||||
select {
|
||||
case <-acquired:
|
||||
// expected: now acquired
|
||||
case <-time.After(time.Second):
|
||||
t.Fatal("second Lock did not acquire after release")
|
||||
}
|
||||
}
|
||||
|
||||
func TestLockIndependentKeys(t *testing.T) {
|
||||
var m Mutex
|
||||
unlockA := m.Lock("a")
|
||||
defer unlockA()
|
||||
// A different key must not block.
|
||||
done := make(chan struct{})
|
||||
go func() { u := m.Lock("b"); u(); close(done) }()
|
||||
select {
|
||||
case <-done:
|
||||
case <-time.After(time.Second):
|
||||
t.Fatal("Lock on an independent key blocked")
|
||||
}
|
||||
}
|
||||
|
||||
func TestTryLock(t *testing.T) {
|
||||
var m Mutex
|
||||
unlock, ok := m.TryLock("a")
|
||||
if !ok {
|
||||
t.Fatal("TryLock should succeed on a free key")
|
||||
}
|
||||
if _, ok := m.TryLock("a"); ok {
|
||||
t.Fatal("TryLock should fail while the key is held")
|
||||
}
|
||||
unlock()
|
||||
u2, ok := m.TryLock("a")
|
||||
if !ok {
|
||||
t.Fatal("TryLock should succeed after release")
|
||||
}
|
||||
u2()
|
||||
}
|
||||
|
||||
func TestConcurrentLockNoRace(t *testing.T) {
|
||||
var m Mutex
|
||||
var wg sync.WaitGroup
|
||||
counter := 0
|
||||
for i := 0; i < 50; i++ {
|
||||
wg.Add(1)
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
u := m.Lock("shared")
|
||||
counter++ // protected by the keyed lock
|
||||
u()
|
||||
}()
|
||||
}
|
||||
wg.Wait()
|
||||
if counter != 50 {
|
||||
t.Errorf("counter = %d, want 50 (lost updates ⇒ lock not serializing)", counter)
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,349 @@
|
||||
// Package metricalert implements a background goroutine that
|
||||
// periodically evaluates operator-configured metric-threshold rules
|
||||
// against recent container stats samples. On breach (subject to a
|
||||
// per-rule-per-workload cooldown) it emits an event into the existing
|
||||
// event_log + event-bus pipeline — the same fan-out used by the
|
||||
// log-scanner — instead of building any new notification plumbing.
|
||||
package metricalert
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/alexei/tinyforge/internal/events"
|
||||
"github.com/alexei/tinyforge/internal/store"
|
||||
)
|
||||
|
||||
// EvalInterval is the period of the ticker that drives evaluation in run.
|
||||
const EvalInterval = 30 * time.Second
|
||||
|
||||
// lookbackSeconds bounds how far back we pull samples each tick: evaluate
// asks the SampleSource for samples since now minus this many seconds. Stats
|
||||
// are collected at most every few seconds (see internal/stats), so a
|
||||
// 120s window comfortably captures the latest reading per container
|
||||
// even if collection briefly stalls.
|
||||
const lookbackSeconds = 120
|
||||
|
||||
// RuleSource is the read-side seam through which evaluate loads the current
// rule rows on every pass.
|
||||
// Real callers pass *store.Store; tests pass a fake.
|
||||
type RuleSource interface {
|
||||
// ListMetricAlertRules returns the rules to evaluate. Disabled rules may
// be included: evaluate skips any rule whose Enabled flag is false.
ListMetricAlertRules() ([]store.MetricAlertRule, error)
|
||||
}
|
||||
|
||||
// SampleSource fetches the recent container stats samples to evaluate.
|
||||
type SampleSource interface {
|
||||
// ListAllRecentContainerStatsSamples is called by evaluate with sinceTS set
// to now.Unix() - lookbackSeconds, i.e. a Unix timestamp in seconds.
// NOTE(review): whether the bound is inclusive is up to the implementation —
// confirm against the store.
ListAllRecentContainerStatsSamples(sinceTS int64) ([]store.ContainerStatsSample, error)
|
||||
}
|
||||
|
||||
// EventSink writes a breach into event_log.
|
||||
type EventSink interface {
|
||||
// InsertEvent stores the row and returns it; emit copies the returned ID
// and CreatedAt into the payload it publishes on the bus.
InsertEvent(store.EventLog) (store.EventLog, error)
|
||||
}
|
||||
|
||||
// Publisher fans the breach out on the event bus. Matches *events.Bus.
// A nil Publisher is tolerated: emit skips publishing when it is nil.
|
||||
type Publisher interface {
|
||||
// Publish is called once per persisted breach with an events.EventLog
// event.
Publish(events.Event)
|
||||
}
|
||||
|
||||
// eventSource is the Source value stamped on every metric-alert row written
// to event_log and on the matching event-bus payload.
|
||||
const eventSource = "metric_alert"
|
||||
|
||||
// Manager owns the evaluation loop lifecycle. It mirrors
|
||||
// stats.Collector: a once-guarded Start/Stop pair with stop/done
|
||||
// channels and a single-goroutine run loop.
|
||||
type Manager struct {
|
||||
// rules and samples are the read-side inputs of each evaluation pass;
// sink persists a breach and pub (may be nil) fans it out on the bus.
rules RuleSource
|
||||
samples SampleSource
|
||||
sink EventSink
|
||||
pub Publisher
|
||||
|
||||
// now is swappable in tests so cooldown windows can be exercised
|
||||
// deterministically. Defaults to time.Now.
|
||||
now func() time.Time
|
||||
|
||||
// mu guards lastFired. The run loop is single-goroutine today, but
|
||||
// Start/Stop and a future ReloadRules may touch shared state; the
|
||||
// mutex is cheap insurance.
|
||||
mu sync.Mutex
|
||||
lastFired map[string]time.Time // "ruleID:ownerID" -> last emit time
|
||||
|
||||
// startOnce and stopOnce make Start and Stop idempotent. started is set
// by Start and read by Stop to decide whether Stop must close done itself.
// stop is closed by Stop to ask run to exit; done is closed when run
// returns, or by Stop when run was never launched.
startOnce sync.Once
|
||||
stopOnce sync.Once
|
||||
started bool
|
||||
stop chan struct{}
|
||||
done chan struct{}
|
||||
}
|
||||
|
||||
// New wires a manager with the supplied dependencies. Call Start to
|
||||
// begin evaluating.
|
||||
func New(rules RuleSource, samples SampleSource, sink EventSink, pub Publisher) *Manager {
|
||||
return &Manager{
|
||||
rules: rules,
|
||||
samples: samples,
|
||||
sink: sink,
|
||||
pub: pub,
|
||||
now: time.Now,
|
||||
lastFired: map[string]time.Time{},
|
||||
stop: make(chan struct{}),
|
||||
done: make(chan struct{}),
|
||||
}
|
||||
}
|
||||
|
||||
// Start launches the background loop. Returns immediately. The loop
|
||||
// exits when Stop is called. Safe to call multiple times — only the
|
||||
// first call has an effect.
|
||||
func (m *Manager) Start() {
|
||||
m.startOnce.Do(func() {
|
||||
m.started = true
|
||||
go m.run()
|
||||
})
|
||||
}
|
||||
|
||||
// Stop signals the loop to exit and blocks until it has finished the
|
||||
// in-flight tick. If Start was never called, Stop returns immediately.
|
||||
func (m *Manager) Stop() {
|
||||
m.stopOnce.Do(func() {
|
||||
close(m.stop)
|
||||
if !m.started {
|
||||
close(m.done)
|
||||
}
|
||||
})
|
||||
<-m.done
|
||||
}
|
||||
|
||||
// run is the main loop. It evaluates once shortly after start, then on
|
||||
// every EvalInterval tick, until Stop is called.
|
||||
func (m *Manager) run() {
|
||||
defer close(m.done)
|
||||
|
||||
// Settle delay so the app + first stats samples exist before the
|
||||
// first evaluation.
|
||||
select {
|
||||
case <-time.After(3 * time.Second):
|
||||
case <-m.stop:
|
||||
return
|
||||
}
|
||||
|
||||
ticker := time.NewTicker(EvalInterval)
|
||||
defer ticker.Stop()
|
||||
m.evaluate(m.now())
|
||||
for {
|
||||
select {
|
||||
case <-m.stop:
|
||||
return
|
||||
case <-ticker.C:
|
||||
m.evaluate(m.now())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// evaluate runs one pass: load rules + recent samples, reduce to the
|
||||
// freshest sample per (owner, container), and emit on breach subject to
|
||||
// cooldown. Best-effort throughout — a bad rule or sample never crashes
|
||||
// the loop.
|
||||
func (m *Manager) evaluate(now time.Time) {
|
||||
rules, err := m.rules.ListMetricAlertRules()
|
||||
if err != nil {
|
||||
slog.Warn("metricalert: list rules", "error", err)
|
||||
return
|
||||
}
|
||||
if len(rules) == 0 {
|
||||
return
|
||||
}
|
||||
|
||||
since := now.Unix() - lookbackSeconds
|
||||
samples, err := m.samples.ListAllRecentContainerStatsSamples(since)
|
||||
if err != nil {
|
||||
slog.Warn("metricalert: list samples", "error", err)
|
||||
return
|
||||
}
|
||||
latest := latestPerContainer(samples)
|
||||
if len(latest) == 0 {
|
||||
return
|
||||
}
|
||||
|
||||
for _, rule := range rules {
|
||||
if !rule.Enabled {
|
||||
continue
|
||||
}
|
||||
for _, sample := range latest {
|
||||
// Per-workload rules only match their workload; "" matches all.
|
||||
if rule.WorkloadID != "" && rule.WorkloadID != sample.OwnerID {
|
||||
continue
|
||||
}
|
||||
value, ok := metricValue(rule.Metric, sample)
|
||||
if !ok {
|
||||
continue // e.g. memory_percent with a zero limit
|
||||
}
|
||||
if !breached(rule.Comparator, value, rule.Threshold) {
|
||||
continue
|
||||
}
|
||||
if m.coolingDown(rule, sample.OwnerID, now) {
|
||||
continue
|
||||
}
|
||||
m.emit(rule, sample, value)
|
||||
m.recordFire(rule, sample.OwnerID, now)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// latestPerContainer keeps only the most recent sample per
|
||||
// (OwnerID, ContainerID), so each container is judged on its freshest
|
||||
// reading rather than every historical row in the window.
|
||||
func latestPerContainer(samples []store.ContainerStatsSample) []store.ContainerStatsSample {
|
||||
newest := map[string]store.ContainerStatsSample{}
|
||||
for _, s := range samples {
|
||||
key := s.OwnerID + "\x00" + s.ContainerID
|
||||
if prev, ok := newest[key]; !ok || s.TS > prev.TS {
|
||||
newest[key] = s
|
||||
}
|
||||
}
|
||||
out := make([]store.ContainerStatsSample, 0, len(newest))
|
||||
for _, s := range newest {
|
||||
out = append(out, s)
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
// metricValue resolves a rule's metric against a sample. The bool is
|
||||
// false when the sample can't be judged for that metric (memory_percent
|
||||
// with a zero/unknown limit) so the caller skips it instead of dividing
|
||||
// by zero.
|
||||
func metricValue(metric string, s store.ContainerStatsSample) (float64, bool) {
|
||||
switch metric {
|
||||
case store.MetricCPUPercent:
|
||||
return s.CPUPercent, true
|
||||
case store.MetricMemoryPercent:
|
||||
if s.MemoryLimit <= 0 {
|
||||
return 0, false
|
||||
}
|
||||
return float64(s.MemoryUsage) / float64(s.MemoryLimit) * 100, true
|
||||
case store.MetricMemoryBytes:
|
||||
return float64(s.MemoryUsage), true
|
||||
default:
|
||||
return 0, false
|
||||
}
|
||||
}
|
||||
|
||||
// breached returns whether value crosses threshold per the comparator.
|
||||
func breached(comparator string, value, threshold float64) bool {
|
||||
switch comparator {
|
||||
case store.MetricComparatorGT:
|
||||
return value > threshold
|
||||
case store.MetricComparatorLT:
|
||||
return value < threshold
|
||||
default:
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
// cooldownKey builds the cooldown-table key for one rule on one workload:
// the decimal rule ID, a colon, then the owner ID.
func cooldownKey(ruleID int64, ownerID string) string {
	return fmt.Sprint(ruleID) + ":" + ownerID
}
|
||||
|
||||
func (m *Manager) coolingDown(rule store.MetricAlertRule, ownerID string, now time.Time) bool {
|
||||
if rule.CooldownSeconds <= 0 {
|
||||
return false
|
||||
}
|
||||
m.mu.Lock()
|
||||
defer m.mu.Unlock()
|
||||
last, ok := m.lastFired[cooldownKey(rule.ID, ownerID)]
|
||||
if !ok {
|
||||
return false
|
||||
}
|
||||
return now.Sub(last) < time.Duration(rule.CooldownSeconds)*time.Second
|
||||
}
|
||||
|
||||
func (m *Manager) recordFire(rule store.MetricAlertRule, ownerID string, now time.Time) {
|
||||
m.mu.Lock()
|
||||
m.lastFired[cooldownKey(rule.ID, ownerID)] = now
|
||||
m.mu.Unlock()
|
||||
}
|
||||
|
||||
// emit persists the breach as an event_log row and publishes it on the
|
||||
// bus. WorkloadID routes the alert to that app's activity timeline.
|
||||
// Metadata is JSON-marshalled (never string-concatenated). Any
|
||||
// marshal/insert failure is logged and skipped — emitting must never
|
||||
// crash the loop.
|
||||
func (m *Manager) emit(rule store.MetricAlertRule, sample store.ContainerStatsSample, value float64) {
|
||||
message := formatMessage(rule, value)
|
||||
meta := map[string]any{
|
||||
"workload_id": sample.OwnerID,
|
||||
"rule": rule.Name,
|
||||
"metric": rule.Metric,
|
||||
"value": value,
|
||||
"threshold": rule.Threshold,
|
||||
"comparator": rule.Comparator,
|
||||
}
|
||||
metaJSON, err := json.Marshal(meta)
|
||||
if err != nil {
|
||||
slog.Error("metricalert: marshal metadata", "rule", rule.Name, "error", err)
|
||||
return
|
||||
}
|
||||
severity := rule.Severity
|
||||
if severity == "" {
|
||||
severity = store.LogScanSeverityWarn
|
||||
}
|
||||
evt, err := m.sink.InsertEvent(store.EventLog{
|
||||
Source: eventSource,
|
||||
Severity: severity,
|
||||
Message: message,
|
||||
WorkloadID: sample.OwnerID,
|
||||
Metadata: string(metaJSON),
|
||||
})
|
||||
if err != nil {
|
||||
slog.Error("metricalert: persist event", "rule", rule.Name, "error", err)
|
||||
return
|
||||
}
|
||||
if m.pub != nil {
|
||||
m.pub.Publish(events.Event{
|
||||
Type: events.EventLog,
|
||||
Payload: events.EventLogPayload{
|
||||
ID: evt.ID,
|
||||
Source: eventSource,
|
||||
WorkloadID: sample.OwnerID,
|
||||
Severity: severity,
|
||||
Message: message,
|
||||
Metadata: string(metaJSON),
|
||||
CreatedAt: evt.CreatedAt,
|
||||
},
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// formatMessage builds a concise, human, secret-free breach line. The
|
||||
// only operator-supplied text is rule.Name; the rest are numbers and
|
||||
// fixed labels.
|
||||
func formatMessage(rule store.MetricAlertRule, value float64) string {
|
||||
label, unit := metricLabelUnit(rule.Metric)
|
||||
word := comparatorWord(rule.Comparator)
|
||||
return fmt.Sprintf("%s: %s is %.0f%s (threshold %s %.0f%s)",
|
||||
rule.Name, label, value, unit, word, rule.Threshold, unit)
|
||||
}
|
||||
|
||||
func metricLabelUnit(metric string) (label, unit string) {
|
||||
switch metric {
|
||||
case store.MetricCPUPercent:
|
||||
return "CPU", "%"
|
||||
case store.MetricMemoryPercent:
|
||||
return "Memory", "%"
|
||||
case store.MetricMemoryBytes:
|
||||
return "Memory", " bytes"
|
||||
default:
|
||||
return metric, ""
|
||||
}
|
||||
}
|
||||
|
||||
func comparatorWord(comparator string) string {
|
||||
switch comparator {
|
||||
case store.MetricComparatorGT:
|
||||
return ">"
|
||||
case store.MetricComparatorLT:
|
||||
return "<"
|
||||
default:
|
||||
return comparator
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,284 @@
|
||||
package metricalert
|
||||
|
||||
import (
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/alexei/tinyforge/internal/events"
|
||||
"github.com/alexei/tinyforge/internal/store"
|
||||
)
|
||||
|
||||
// --- fakes -----------------------------------------------------------
|
||||
|
||||
type fakeRules struct {
|
||||
rules []store.MetricAlertRule
|
||||
err error
|
||||
}
|
||||
|
||||
func (f *fakeRules) ListMetricAlertRules() ([]store.MetricAlertRule, error) {
|
||||
return f.rules, f.err
|
||||
}
|
||||
|
||||
type fakeSamples struct {
|
||||
samples []store.ContainerStatsSample
|
||||
err error
|
||||
since int64 // captured arg of the last call
|
||||
}
|
||||
|
||||
func (f *fakeSamples) ListAllRecentContainerStatsSamples(sinceTS int64) ([]store.ContainerStatsSample, error) {
|
||||
f.since = sinceTS
|
||||
return f.samples, f.err
|
||||
}
|
||||
|
||||
// recordedEvent wraps one row captured by fakeSink.InsertEvent, as returned
// to the caller (ID and CreatedAt already filled in).
type recordedEvent struct {
|
||||
evt store.EventLog
|
||||
}
|
||||
|
||||
type fakeSink struct {
|
||||
events []recordedEvent
|
||||
err error
|
||||
nextID int64
|
||||
}
|
||||
|
||||
func (f *fakeSink) InsertEvent(e store.EventLog) (store.EventLog, error) {
|
||||
if f.err != nil {
|
||||
return store.EventLog{}, f.err
|
||||
}
|
||||
f.nextID++
|
||||
e.ID = f.nextID
|
||||
e.CreatedAt = "2026-05-29T00:00:00Z"
|
||||
f.events = append(f.events, recordedEvent{evt: e})
|
||||
return e, nil
|
||||
}
|
||||
|
||||
type fakePublisher struct {
|
||||
published []events.Event
|
||||
}
|
||||
|
||||
func (f *fakePublisher) Publish(e events.Event) {
|
||||
f.published = append(f.published, e)
|
||||
}
|
||||
|
||||
func newManager(rules []store.MetricAlertRule, samples []store.ContainerStatsSample) (*Manager, *fakeSink, *fakePublisher) {
|
||||
sink := &fakeSink{}
|
||||
pub := &fakePublisher{}
|
||||
m := New(&fakeRules{rules: rules}, &fakeSamples{samples: samples}, sink, pub)
|
||||
return m, sink, pub
|
||||
}
|
||||
|
||||
// --- tests -----------------------------------------------------------
|
||||
|
||||
func TestEvaluate_BreachEmits(t *testing.T) {
|
||||
rules := []store.MetricAlertRule{{
|
||||
ID: 1, Name: "cpu-hot", Metric: store.MetricCPUPercent,
|
||||
Comparator: store.MetricComparatorGT, Threshold: 80, Severity: "error",
|
||||
CooldownSeconds: 300, Enabled: true,
|
||||
}}
|
||||
samples := []store.ContainerStatsSample{{
|
||||
ContainerID: "c1", OwnerID: "w1", OwnerType: "instance", TS: 100, CPUPercent: 95,
|
||||
}}
|
||||
m, sink, pub := newManager(rules, samples)
|
||||
|
||||
m.evaluate(time.Unix(200, 0))
|
||||
|
||||
if len(sink.events) != 1 {
|
||||
t.Fatalf("expected 1 event, got %d", len(sink.events))
|
||||
}
|
||||
got := sink.events[0].evt
|
||||
if got.Source != "metric_alert" {
|
||||
t.Errorf("source = %q, want metric_alert", got.Source)
|
||||
}
|
||||
if got.Severity != "error" {
|
||||
t.Errorf("severity = %q, want error", got.Severity)
|
||||
}
|
||||
if got.WorkloadID != "w1" {
|
||||
t.Errorf("workload_id = %q, want w1", got.WorkloadID)
|
||||
}
|
||||
if got.Metadata == "" || got.Metadata == "{}" {
|
||||
t.Errorf("metadata should be populated JSON, got %q", got.Metadata)
|
||||
}
|
||||
if len(pub.published) != 1 {
|
||||
t.Fatalf("expected 1 published event, got %d", len(pub.published))
|
||||
}
|
||||
payload, ok := pub.published[0].Payload.(events.EventLogPayload)
|
||||
if !ok {
|
||||
t.Fatalf("published payload is not EventLogPayload")
|
||||
}
|
||||
if payload.WorkloadID != "w1" || payload.Source != "metric_alert" {
|
||||
t.Errorf("payload workload/source mismatch: %+v", payload)
|
||||
}
|
||||
}
|
||||
|
||||
func TestEvaluate_NoBreachNoEmit(t *testing.T) {
|
||||
rules := []store.MetricAlertRule{{
|
||||
ID: 1, Name: "cpu-hot", Metric: store.MetricCPUPercent,
|
||||
Comparator: store.MetricComparatorGT, Threshold: 80, Enabled: true,
|
||||
}}
|
||||
samples := []store.ContainerStatsSample{{
|
||||
ContainerID: "c1", OwnerID: "w1", TS: 100, CPUPercent: 10,
|
||||
}}
|
||||
m, sink, _ := newManager(rules, samples)
|
||||
|
||||
m.evaluate(time.Unix(200, 0))
|
||||
|
||||
if len(sink.events) != 0 {
|
||||
t.Fatalf("expected no events for non-breach, got %d", len(sink.events))
|
||||
}
|
||||
}
|
||||
|
||||
func TestEvaluate_DisabledRuleSkipped(t *testing.T) {
|
||||
rules := []store.MetricAlertRule{{
|
||||
ID: 1, Name: "cpu-hot", Metric: store.MetricCPUPercent,
|
||||
Comparator: store.MetricComparatorGT, Threshold: 80, Enabled: false,
|
||||
}}
|
||||
samples := []store.ContainerStatsSample{{ContainerID: "c1", OwnerID: "w1", TS: 100, CPUPercent: 95}}
|
||||
m, sink, _ := newManager(rules, samples)
|
||||
|
||||
m.evaluate(time.Unix(200, 0))
|
||||
|
||||
if len(sink.events) != 0 {
|
||||
t.Fatalf("disabled rule should not emit, got %d", len(sink.events))
|
||||
}
|
||||
}
|
||||
|
||||
func TestEvaluate_PerWorkloadScoping(t *testing.T) {
|
||||
rules := []store.MetricAlertRule{{
|
||||
ID: 1, Name: "w2-only", WorkloadID: "w2", Metric: store.MetricCPUPercent,
|
||||
Comparator: store.MetricComparatorGT, Threshold: 80, Enabled: true,
|
||||
}}
|
||||
samples := []store.ContainerStatsSample{
|
||||
{ContainerID: "c1", OwnerID: "w1", TS: 100, CPUPercent: 95}, // breach but wrong workload
|
||||
{ContainerID: "c2", OwnerID: "w2", TS: 100, CPUPercent: 95}, // breach, correct workload
|
||||
}
|
||||
m, sink, _ := newManager(rules, samples)
|
||||
|
||||
m.evaluate(time.Unix(200, 0))
|
||||
|
||||
if len(sink.events) != 1 {
|
||||
t.Fatalf("expected 1 event (only w2), got %d", len(sink.events))
|
||||
}
|
||||
if sink.events[0].evt.WorkloadID != "w2" {
|
||||
t.Errorf("event should be scoped to w2, got %q", sink.events[0].evt.WorkloadID)
|
||||
}
|
||||
}
|
||||
|
||||
func TestEvaluate_GlobalRuleMatchesAll(t *testing.T) {
|
||||
rules := []store.MetricAlertRule{{
|
||||
ID: 1, Name: "global", WorkloadID: "", Metric: store.MetricCPUPercent,
|
||||
Comparator: store.MetricComparatorGT, Threshold: 80, Enabled: true,
|
||||
}}
|
||||
samples := []store.ContainerStatsSample{
|
||||
{ContainerID: "c1", OwnerID: "w1", TS: 100, CPUPercent: 95},
|
||||
{ContainerID: "c2", OwnerID: "w2", TS: 100, CPUPercent: 95},
|
||||
}
|
||||
m, sink, _ := newManager(rules, samples)
|
||||
|
||||
m.evaluate(time.Unix(200, 0))
|
||||
|
||||
if len(sink.events) != 2 {
|
||||
t.Fatalf("global rule should fire for both workloads, got %d", len(sink.events))
|
||||
}
|
||||
}
|
||||
|
||||
func TestEvaluate_MemoryPercentDivByZeroSkip(t *testing.T) {
|
||||
rules := []store.MetricAlertRule{{
|
||||
ID: 1, Name: "mem", Metric: store.MetricMemoryPercent,
|
||||
Comparator: store.MetricComparatorGT, Threshold: 50, Enabled: true,
|
||||
}}
|
||||
samples := []store.ContainerStatsSample{{
|
||||
ContainerID: "c1", OwnerID: "w1", TS: 100, MemoryUsage: 1000, MemoryLimit: 0,
|
||||
}}
|
||||
m, sink, _ := newManager(rules, samples)
|
||||
|
||||
m.evaluate(time.Unix(200, 0))
|
||||
|
||||
if len(sink.events) != 0 {
|
||||
t.Fatalf("zero memory limit should be skipped for percent rule, got %d", len(sink.events))
|
||||
}
|
||||
}
|
||||
|
||||
func TestEvaluate_MemoryPercentBreaches(t *testing.T) {
|
||||
rules := []store.MetricAlertRule{{
|
||||
ID: 1, Name: "mem", Metric: store.MetricMemoryPercent,
|
||||
Comparator: store.MetricComparatorGT, Threshold: 90, Enabled: true,
|
||||
}}
|
||||
samples := []store.ContainerStatsSample{{
|
||||
ContainerID: "c1", OwnerID: "w1", TS: 100, MemoryUsage: 950, MemoryLimit: 1000, // 95%
|
||||
}}
|
||||
m, sink, _ := newManager(rules, samples)
|
||||
|
||||
m.evaluate(time.Unix(200, 0))
|
||||
|
||||
if len(sink.events) != 1 {
|
||||
t.Fatalf("95%% should breach 90%% threshold, got %d events", len(sink.events))
|
||||
}
|
||||
}
|
||||
|
||||
func TestEvaluate_CooldownSuppressesSecondEmit(t *testing.T) {
|
||||
rules := []store.MetricAlertRule{{
|
||||
ID: 1, Name: "cpu-hot", Metric: store.MetricCPUPercent,
|
||||
Comparator: store.MetricComparatorGT, Threshold: 80, CooldownSeconds: 300, Enabled: true,
|
||||
}}
|
||||
samples := []store.ContainerStatsSample{{ContainerID: "c1", OwnerID: "w1", TS: 100, CPUPercent: 95}}
|
||||
m, sink, _ := newManager(rules, samples)
|
||||
|
||||
base := time.Unix(1000, 0)
|
||||
m.evaluate(base)
|
||||
// 10s later — still inside the 300s cooldown window.
|
||||
m.evaluate(base.Add(10 * time.Second))
|
||||
|
||||
if len(sink.events) != 1 {
|
||||
t.Fatalf("cooldown should suppress second emit, got %d events", len(sink.events))
|
||||
}
|
||||
|
||||
// Past the window — should fire again.
|
||||
m.evaluate(base.Add(301 * time.Second))
|
||||
if len(sink.events) != 2 {
|
||||
t.Fatalf("should re-fire after cooldown elapses, got %d events", len(sink.events))
|
||||
}
|
||||
}
|
||||
|
||||
func TestEvaluate_LatestSamplePerContainer(t *testing.T) {
|
||||
// Two samples for the same container: an old non-breaching reading
|
||||
// and a newer breaching one. Only the freshest should be judged.
|
||||
rules := []store.MetricAlertRule{{
|
||||
ID: 1, Name: "cpu-hot", Metric: store.MetricCPUPercent,
|
||||
Comparator: store.MetricComparatorGT, Threshold: 80, Enabled: true,
|
||||
}}
|
||||
samples := []store.ContainerStatsSample{
|
||||
{ContainerID: "c1", OwnerID: "w1", TS: 100, CPUPercent: 10},
|
||||
{ContainerID: "c1", OwnerID: "w1", TS: 150, CPUPercent: 95},
|
||||
}
|
||||
m, sink, _ := newManager(rules, samples)
|
||||
|
||||
m.evaluate(time.Unix(200, 0))
|
||||
|
||||
if len(sink.events) != 1 {
|
||||
t.Fatalf("expected exactly 1 event from freshest sample, got %d", len(sink.events))
|
||||
}
|
||||
}
|
||||
|
||||
func TestEvaluate_LessThanComparator(t *testing.T) {
|
||||
rules := []store.MetricAlertRule{{
|
||||
ID: 1, Name: "cpu-idle", Metric: store.MetricCPUPercent,
|
||||
Comparator: store.MetricComparatorLT, Threshold: 5, Enabled: true,
|
||||
}}
|
||||
samples := []store.ContainerStatsSample{{ContainerID: "c1", OwnerID: "w1", TS: 100, CPUPercent: 1}}
|
||||
m, sink, _ := newManager(rules, samples)
|
||||
|
||||
m.evaluate(time.Unix(200, 0))
|
||||
|
||||
if len(sink.events) != 1 {
|
||||
t.Fatalf("1%% < 5%% threshold should breach lt rule, got %d events", len(sink.events))
|
||||
}
|
||||
}
|
||||
|
||||
func TestEvaluate_NoRulesNoFetch(t *testing.T) {
|
||||
// With no rules there's nothing to do; we shouldn't even query samples.
|
||||
samplesSrc := &fakeSamples{samples: nil}
|
||||
m := New(&fakeRules{rules: nil}, samplesSrc, &fakeSink{}, &fakePublisher{})
|
||||
m.evaluate(time.Unix(200, 0))
|
||||
if samplesSrc.since != 0 {
|
||||
t.Errorf("samples should not be queried when there are no rules")
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,250 @@
|
||||
// Package metrics provides a minimal Prometheus text-format exposition
|
||||
// of Tinyforge's operational counters. We deliberately do NOT import the
|
||||
// official client_golang library: the metrics set here is small, the text
|
||||
// format is simple, and avoiding the dependency keeps `tinyforge` a fast
|
||||
// single-binary install.
|
||||
//
|
||||
// Every counter is a sync/atomic.Int64 — cheap, lock-free, and safe to
|
||||
// touch from any goroutine. Histograms / gauges aren't modeled yet; the
|
||||
// few we need (request latency p50/p99) live downstream of slog and can
|
||||
// be added when the operator actually wants them.
|
||||
package metrics
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"io"
|
||||
"log/slog"
|
||||
"sort"
|
||||
"strings"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
)
|
||||
|
||||
// Registry holds a named set of counters. newRegistry builds one with its
|
||||
// counters map allocated — see DefaultRegistry below for the
|
||||
// recommended way to grab the global handle.
|
||||
type Registry struct {
|
||||
// mu guards counters.
mu sync.RWMutex
|
||||
// counters maps a counter name to its state.
counters map[string]*counter
|
||||
}
|
||||
|
||||
// counter is the registry-owned state behind a Counter handle: its metadata
// plus one atomic value per distinct tuple of label values.
type counter struct {
|
||||
name string
|
||||
help string
|
||||
labels []string // label names, ordered as declared at registration
|
||||
// series maps an encoded label-value tuple (see encodeKey) to its value.
series map[string]*atomic.Int64
|
||||
// seriesMu guards the series map. Add holds it for every lookup and for
|
||||
// inserting a new tuple; the increment itself runs on the atomic after
// the lock has been released.
|
||||
seriesMu sync.Mutex
|
||||
}
|
||||
|
||||
// DefaultRegistry is the process-wide registry used by the package-level
// NewCounter. All Tinyforge metrics
|
||||
// register against it. Tests can instantiate their own Registry.
|
||||
var DefaultRegistry = newRegistry()
|
||||
|
||||
func newRegistry() *Registry {
|
||||
return &Registry{counters: make(map[string]*counter)}
|
||||
}
|
||||
|
||||
// NewCounter declares a counter on the default registry. Call once at
|
||||
// package init or during NewServer; subsequent calls with the same name
|
||||
// return the existing counter so re-registration is safe.
|
||||
//
|
||||
// label names define the dimensions; calls to Inc must pass values in
|
||||
// the same order. Use the empty slice for label-less counters.
|
||||
func NewCounter(name, help string, labels ...string) *Counter {
|
||||
return DefaultRegistry.NewCounter(name, help, labels...)
|
||||
}
|
||||
|
||||
// NewCounter on a specific Registry — useful in tests.
|
||||
func (r *Registry) NewCounter(name, help string, labels ...string) *Counter {
|
||||
r.mu.Lock()
|
||||
defer r.mu.Unlock()
|
||||
if c, ok := r.counters[name]; ok {
|
||||
return &Counter{c: c}
|
||||
}
|
||||
c := &counter{
|
||||
name: name,
|
||||
help: help,
|
||||
labels: append([]string(nil), labels...),
|
||||
series: make(map[string]*atomic.Int64),
|
||||
}
|
||||
r.counters[name] = c
|
||||
return &Counter{c: c}
|
||||
}
|
||||
|
||||
// Counter is the public handle returned by NewCounter. Pass it around as
|
||||
// a value — the underlying state lives on the registry, so every copy of a
// handle updates the same series.
|
||||
type Counter struct {
|
||||
// c points at the registry-owned state shared by all handles.
c *counter
|
||||
}
|
||||
|
||||
// Inc atomically increments the counter for the given label values.
|
||||
// It is shorthand for Add(1, labelValues...). Passing the wrong number of
|
||||
// values is a programmer error: Add logs it and drops the sample rather
|
||||
// than panicking or aggregating into a bogus series.
|
||||
func (c Counter) Inc(labelValues ...string) {
|
||||
c.Add(1, labelValues...)
|
||||
}
|
||||
|
||||
// Add atomically adds delta. Negative delta is rejected (counters are
|
||||
// monotonic by definition).
|
||||
func (c Counter) Add(delta int64, labelValues ...string) {
|
||||
if delta < 0 {
|
||||
return
|
||||
}
|
||||
if len(labelValues) != len(c.c.labels) {
|
||||
// Programmer error. This used to panic to surface the bug, but Add
|
||||
// runs on hot paths (HTTP middleware, deploy dispatch) and several
|
||||
// callers are off the request goroutine, where a panic would take
|
||||
// down the whole process rather than a single request. Log loudly
|
||||
// and drop the sample so a mislabeled call site can never crash the
|
||||
// server; the bug still shows up immediately in the logs and in
|
||||
// tests via the error output.
|
||||
slog.Error("metrics: label count mismatch — dropping sample",
|
||||
"counter", c.c.name, "want", len(c.c.labels), "got", len(labelValues))
|
||||
return
|
||||
}
|
||||
key := encodeKey(labelValues)
|
||||
c.c.seriesMu.Lock()
|
||||
v, ok := c.c.series[key]
|
||||
if !ok {
|
||||
v = new(atomic.Int64)
|
||||
c.c.series[key] = v
|
||||
}
|
||||
c.c.seriesMu.Unlock()
|
||||
v.Add(delta)
|
||||
}
|
||||
|
||||
// encodeKey flattens a tuple of label values into a single map key by placing
// the ASCII unit separator (0x1f) between consecutive values. The result is
// used purely as a map index; escaping for the exposition format happens
// later, when the key is decoded again.
//
// NOTE(review): a value that itself contains 0x1f makes the key ambiguous —
// decodeKey would then see a value-count mismatch and render no labels.
func encodeKey(values []string) string {
	const unitSeparator = "\x1f"

	var key strings.Builder
	for i, value := range values {
		if i > 0 {
			key.WriteString(unitSeparator)
		}
		key.WriteString(value)
	}
	return key.String()
}
|
||||
|
||||
// WritePrometheus dumps the registry in the text exposition format
|
||||
// Prometheus / VictoriaMetrics / OpenMetrics understands. Stable
|
||||
// ordering: counters alphabetical by name; series alphabetical by
|
||||
// encoded label tuple.
|
||||
func (r *Registry) WritePrometheus(w io.Writer) error {
|
||||
r.mu.RLock()
|
||||
names := make([]string, 0, len(r.counters))
|
||||
for n := range r.counters {
|
||||
names = append(names, n)
|
||||
}
|
||||
r.mu.RUnlock()
|
||||
sort.Strings(names)
|
||||
|
||||
for _, name := range names {
|
||||
r.mu.RLock()
|
||||
c := r.counters[name]
|
||||
r.mu.RUnlock()
|
||||
if err := writeCounter(w, c); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func writeCounter(w io.Writer, c *counter) error {
|
||||
if _, err := fmt.Fprintf(w, "# HELP %s %s\n# TYPE %s counter\n", c.name, escapeHelp(c.help), c.name); err != nil {
|
||||
return err
|
||||
}
|
||||
// Snapshot the series map under a SINGLE lock acquisition. The
|
||||
// previous shape acquired+released seriesMu twice per emitted
|
||||
// series (once for the key list, once per Load), contending with
|
||||
// every hot-path Inc on the HTTP request path. The *atomic.Int64
|
||||
// pointers are stable for the lifetime of the registry (we never
|
||||
// delete entries), so reading them after the unlock is safe.
|
||||
type sample struct {
|
||||
key string
|
||||
val *atomic.Int64
|
||||
}
|
||||
c.seriesMu.Lock()
|
||||
samples := make([]sample, 0, len(c.series))
|
||||
for k, v := range c.series {
|
||||
samples = append(samples, sample{k, v})
|
||||
}
|
||||
c.seriesMu.Unlock()
|
||||
|
||||
sort.Slice(samples, func(i, j int) bool { return samples[i].key < samples[j].key })
|
||||
|
||||
for _, s := range samples {
|
||||
val := s.val.Load()
|
||||
labels := decodeKey(s.key, c.labels)
|
||||
if labels == "" {
|
||||
if _, err := fmt.Fprintf(w, "%s %d\n", c.name, val); err != nil {
|
||||
return err
|
||||
}
|
||||
continue
|
||||
}
|
||||
if _, err := fmt.Fprintf(w, "%s{%s} %d\n", c.name, labels, val); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func decodeKey(key string, names []string) string {
|
||||
if key == "" || len(names) == 0 {
|
||||
return ""
|
||||
}
|
||||
values := strings.Split(key, "\x1f")
|
||||
if len(values) != len(names) {
|
||||
// Should not happen — encodeKey/decode are symmetric.
|
||||
return ""
|
||||
}
|
||||
parts := make([]string, len(names))
|
||||
for i, n := range names {
|
||||
parts[i] = fmt.Sprintf(`%s="%s"`, n, escapeLabelValue(values[i]))
|
||||
}
|
||||
return strings.Join(parts, ",")
|
||||
}
|
||||
|
||||
// helpEscaper escapes HELP text for the exposition format. It is built once at
// package init instead of on every call: a strings.Replacer is safe for
// concurrent use, and constructing one per scrape line is pure waste.
var helpEscaper = strings.NewReplacer("\\", "\\\\", "\n", "\\n")

// escapeHelp returns s with backslashes doubled and newlines rewritten as the
// two-character sequence `\n`, so the text fits on a single "# HELP" line.
func escapeHelp(s string) string {
	return helpEscaper.Replace(s)
}
|
||||
|
||||
// labelValueEscaper escapes label values for the exposition format. It is
// built once at package init instead of once per emitted label: a
// strings.Replacer is safe for concurrent use, so sharing it is free.
var labelValueEscaper = strings.NewReplacer("\\", "\\\\", "\n", "\\n", `"`, `\"`)

// escapeLabelValue returns s with backslashes doubled, newlines rewritten as
// the two-character sequence `\n`, and double quotes backslash-escaped, so the
// value can be placed between the quotes of a `name="value"` pair.
func escapeLabelValue(s string) string {
	return labelValueEscaper.Replace(s)
}
|
||||
|
||||
// ── Pre-declared counters ────────────────────────────────────────────
|
||||
//
|
||||
// These are the counters Tinyforge surfaces to operators. Adding more is
|
||||
// a one-line NewCounter call at the call site — no central catalogue,
|
||||
// just keep names lowercase_snake with the `tinyforge_` prefix.
|
||||
|
||||
var (
|
||||
HTTPRequestsTotal = NewCounter(
|
||||
"tinyforge_http_requests_total",
|
||||
"Total HTTP requests handled, partitioned by method and outcome class.",
|
||||
"method", "status_class",
|
||||
)
|
||||
DeploysTotal = NewCounter(
|
||||
"tinyforge_deploys_total",
|
||||
"Total deploys dispatched, partitioned by source kind and outcome.",
|
||||
"source_kind", "outcome",
|
||||
)
|
||||
WebhookDeliveriesTotal = NewCounter(
|
||||
"tinyforge_webhook_deliveries_total",
|
||||
"Total inbound webhook deliveries, partitioned by outcome.",
|
||||
"outcome",
|
||||
)
|
||||
SchedulerTicksTotal = NewCounter(
|
||||
"tinyforge_scheduler_ticks_total",
|
||||
"Total scheduler ticks. The dispatched counter is the success measure.",
|
||||
)
|
||||
SchedulerDispatchedTotal = NewCounter(
|
||||
"tinyforge_scheduler_dispatched_total",
|
||||
"Triggers actually dispatched by the scheduler.",
|
||||
)
|
||||
OutboundNotifyTotal = NewCounter(
|
||||
"tinyforge_outbound_notify_total",
|
||||
"Outbound notification dispatch attempts, partitioned by outcome.",
|
||||
"outcome",
|
||||
)
|
||||
)
|
||||
@@ -16,6 +16,8 @@ import (
|
||||
"time"
|
||||
|
||||
"github.com/google/uuid"
|
||||
|
||||
"github.com/alexei/tinyforge/internal/metrics"
|
||||
)
|
||||
|
||||
// Event represents a deployment / site-sync notification payload.
|
||||
@@ -83,17 +85,68 @@ type TestResult struct {
|
||||
// Notifications are fire-and-forget by default — failures are logged but do
|
||||
// not propagate. SendSyncForTest is the exception, used only by the manual
|
||||
// test endpoint.
|
||||
//
|
||||
// outboundSem caps the number of in-flight outbound notifications. Without
|
||||
// it a single burst (e.g. 1000 event triggers firing on a noisy log scan)
|
||||
// would spawn 1000 simultaneous TCP connections, which both DoSes the
|
||||
// receiver and exhausts local FDs.
|
||||
type Notifier struct {
|
||||
httpClient *http.Client
|
||||
wg sync.WaitGroup
|
||||
httpClient *http.Client
|
||||
wg sync.WaitGroup
|
||||
outboundSem chan struct{}
|
||||
}
|
||||
|
||||
// maxOutboundNotifications bounds the in-flight outbound webhook fan-out.
|
||||
// Sized to keep small bursts non-blocking while preventing a runaway storm
|
||||
// from starving the rest of the process. Tunable later via settings if any
|
||||
// operator legitimately needs more concurrency.
|
||||
const maxOutboundNotifications = 32
|
||||
|
||||
// New creates a Notifier with sensible defaults.
|
||||
func New() *Notifier {
|
||||
// Transport with bounded host pooling so a slow receiver cannot pin
|
||||
// arbitrarily many sockets open. MaxConnsPerHost mirrors the worker
|
||||
// pool size; idle pruning keeps long-lived processes from holding
|
||||
// stale TCP entries indefinitely.
|
||||
//
|
||||
// NOTE: we deliberately do NOT apply the staticsite SSRF dialer here.
|
||||
// Notification URLs are admin-configured, and an admin already has
|
||||
// Docker-socket (host-root-equivalent) access, so the SSRF surface adds
|
||||
// nothing they couldn't already reach. Blocking loopback/private targets
|
||||
// would instead break the common self-hosted pattern of notifying a
|
||||
// same-host sidecar/bridge (e.g. service-to-notification-bridge on
|
||||
// 127.0.0.1). See the security review (rated LOW / out of trust boundary).
|
||||
tr := &http.Transport{
|
||||
MaxIdleConns: 64,
|
||||
MaxIdleConnsPerHost: 8,
|
||||
MaxConnsPerHost: maxOutboundNotifications,
|
||||
IdleConnTimeout: 90 * time.Second,
|
||||
}
|
||||
return &Notifier{
|
||||
httpClient: &http.Client{
|
||||
Timeout: 10 * time.Second,
|
||||
Timeout: 10 * time.Second,
|
||||
Transport: tr,
|
||||
},
|
||||
outboundSem: make(chan struct{}, maxOutboundNotifications),
|
||||
}
|
||||
}
|
||||
|
||||
// acquireSlot reserves an outbound slot, respecting ctx so a backed-up
|
||||
// queue cannot starve a request that already has its own deadline.
|
||||
func (n *Notifier) acquireSlot(ctx context.Context) bool {
|
||||
select {
|
||||
case n.outboundSem <- struct{}{}:
|
||||
return true
|
||||
case <-ctx.Done():
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
func (n *Notifier) releaseSlot() {
|
||||
select {
|
||||
case <-n.outboundSem:
|
||||
default:
|
||||
// Drained during shutdown — never block.
|
||||
}
|
||||
}
|
||||
|
||||
@@ -128,8 +181,15 @@ func (n *Notifier) SendSigned(webhookURL, secret string, tier Tier, event Event)
|
||||
n.wg.Add(1)
|
||||
go func() {
|
||||
defer n.wg.Done()
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second)
|
||||
defer cancel()
|
||||
if !n.acquireSlot(ctx) {
|
||||
slog.Warn("notify: dropped — outbound queue saturated",
|
||||
"tier", tier, "host", safeHost(webhookURL), "delivery", delivery, "event", event.Type)
|
||||
metrics.OutboundNotifyTotal.Inc("dropped")
|
||||
return
|
||||
}
|
||||
defer n.releaseSlot()
|
||||
|
||||
_, err := n.doSend(ctx, webhookURL, secret, tier, delivery, event)
|
||||
// URL host only — never log the secret or full URL with user-info.
|
||||
@@ -138,11 +198,13 @@ func (n *Notifier) SendSigned(webhookURL, secret string, tier Tier, event Event)
|
||||
slog.Warn("notify: webhook send failed",
|
||||
"tier", tier, "host", host, "delivery", delivery,
|
||||
"event", event.Type, "signed", secret != "", "error", err)
|
||||
metrics.OutboundNotifyTotal.Inc("failure")
|
||||
return
|
||||
}
|
||||
slog.Info("notify: webhook dispatched",
|
||||
"tier", tier, "host", host, "delivery", delivery,
|
||||
"event", event.Type, "signed", secret != "")
|
||||
metrics.OutboundNotifyTotal.Inc("success")
|
||||
}()
|
||||
}
|
||||
|
||||
@@ -166,8 +228,15 @@ func (n *Notifier) SendPayload(webhookURL, secret, eventType string, payload any
|
||||
n.wg.Add(1)
|
||||
go func() {
|
||||
defer n.wg.Done()
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second)
|
||||
defer cancel()
|
||||
if !n.acquireSlot(ctx) {
|
||||
slog.Warn("notify: dropped trigger payload — outbound queue saturated",
|
||||
"tier", TierEventTrigger, "host", safeHost(webhookURL), "delivery", delivery, "event", eventType)
|
||||
metrics.OutboundNotifyTotal.Inc("dropped")
|
||||
return
|
||||
}
|
||||
defer n.releaseSlot()
|
||||
|
||||
_, err := n.doSendRaw(ctx, webhookURL, secret, TierEventTrigger, delivery, eventType, timestamp, payload)
|
||||
host := safeHost(webhookURL)
|
||||
@@ -175,11 +244,13 @@ func (n *Notifier) SendPayload(webhookURL, secret, eventType string, payload any
|
||||
slog.Warn("notify: trigger webhook send failed",
|
||||
"tier", TierEventTrigger, "host", host, "delivery", delivery,
|
||||
"event", eventType, "signed", secret != "", "error", err)
|
||||
metrics.OutboundNotifyTotal.Inc("failure")
|
||||
return
|
||||
}
|
||||
slog.Info("notify: trigger webhook dispatched",
|
||||
"tier", TierEventTrigger, "host", host, "delivery", delivery,
|
||||
"event", eventType, "signed", secret != "")
|
||||
metrics.OutboundNotifyTotal.Inc("success")
|
||||
}()
|
||||
}
|
||||
|
||||
|
||||
@@ -15,8 +15,8 @@ package reconciler
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"sync"
|
||||
"time"
|
||||
@@ -110,17 +110,37 @@ func (r *Reconciler) ReconcileOnce(ctx context.Context) error {
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// Load every workload ONCE per tick and index by ID. This replaces both
|
||||
// the former N+1 GetWorkloadByID (one DB read per container) in the
|
||||
// upsert loop and the second ListWorkloads("") in the plugin pass: net 1
|
||||
// query per tick, 0 GetWorkloadByID.
|
||||
//
|
||||
// On error we return BEFORE the upsert loop and leave state untouched
|
||||
// this tick (the next tick retries). We must NOT proceed with an empty
|
||||
// map and fall through to markMissingRows: with no container resolving,
|
||||
// `seen` would be empty and markMissingRows would flip EVERY live row to
|
||||
// 'missing'. Aborting early is the safe choice.
|
||||
rows, err := r.store.ListWorkloads("")
|
||||
if err != nil {
|
||||
return fmt.Errorf("reconciler: list workloads: %w", err)
|
||||
}
|
||||
byID := make(map[string]store.Workload, len(rows))
|
||||
for _, w := range rows {
|
||||
byID[w.ID] = w
|
||||
}
|
||||
|
||||
seen := make(map[string]struct{}, len(items)) // container row IDs we touched
|
||||
|
||||
for _, item := range items {
|
||||
rowID := r.upsertFromItem(item)
|
||||
rowID := r.upsertFromItem(item, byID)
|
||||
if rowID != "" {
|
||||
seen[rowID] = struct{}{}
|
||||
}
|
||||
}
|
||||
|
||||
r.markMissingRows(seen)
|
||||
r.reconcilePluginWorkloads(ctx)
|
||||
r.reconcilePluginWorkloads(ctx, rows)
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -137,20 +157,18 @@ func (r *Reconciler) ReconcileOnce(ctx context.Context) error {
|
||||
//
|
||||
// No-op when the plugin dispatcher hasn't been wired (boot-time race,
|
||||
// disabled deployments, tests).
|
||||
func (r *Reconciler) reconcilePluginWorkloads(ctx context.Context) {
|
||||
//
|
||||
// rows is the workload set already loaded once by ReconcileOnce — passed
|
||||
// through rather than re-queried so a tick costs a single ListWorkloads.
|
||||
func (r *Reconciler) reconcilePluginWorkloads(ctx context.Context, rows []store.Workload) {
|
||||
if r.plugins == nil {
|
||||
return
|
||||
}
|
||||
rows, err := r.store.ListWorkloads("")
|
||||
if err != nil {
|
||||
slog.Warn("reconciler: list workloads for plugin pass", "error", err)
|
||||
return
|
||||
}
|
||||
for _, w := range rows {
|
||||
if w.SourceKind == "" {
|
||||
continue
|
||||
}
|
||||
pw := toPluginWorkload(w)
|
||||
pw := plugin.WorkloadFromStore(w)
|
||||
if err := r.plugins.DispatchReconcile(ctx, pw); err != nil {
|
||||
slog.Warn("reconciler: plugin reconcile failed",
|
||||
"workload", w.ID, "kind", w.SourceKind, "error", err)
|
||||
@@ -158,33 +176,6 @@ func (r *Reconciler) reconcilePluginWorkloads(ctx context.Context) {
|
||||
}
|
||||
}
|
||||
|
||||
// toPluginWorkload mirrors the api / webhook converters; kept local to
|
||||
// avoid an import dependency between those packages.
|
||||
func toPluginWorkload(w store.Workload) plugin.Workload {
|
||||
var faces []plugin.PublicFace
|
||||
if w.PublicFaces != "" {
|
||||
_ = json.Unmarshal([]byte(w.PublicFaces), &faces)
|
||||
}
|
||||
return plugin.Workload{
|
||||
ID: w.ID,
|
||||
Name: w.Name,
|
||||
GroupID: w.AppID,
|
||||
ParentWorkloadID: w.ParentWorkloadID,
|
||||
SourceKind: w.SourceKind,
|
||||
SourceConfig: json.RawMessage(w.SourceConfig),
|
||||
TriggerKind: w.TriggerKind,
|
||||
TriggerConfig: json.RawMessage(w.TriggerConfig),
|
||||
PublicFaces: faces,
|
||||
NotificationURL: w.NotificationURL,
|
||||
NotificationSecret: w.NotificationSecret,
|
||||
WebhookSecret: w.WebhookSecret,
|
||||
WebhookSigningSecret: w.WebhookSigningSecret,
|
||||
WebhookRequireSignature: w.WebhookRequireSignature,
|
||||
CreatedAt: w.CreatedAt,
|
||||
UpdatedAt: w.UpdatedAt,
|
||||
}
|
||||
}
|
||||
|
||||
func (r *Reconciler) loop(ctx context.Context) {
|
||||
defer r.wg.Done()
|
||||
|
||||
@@ -214,9 +205,9 @@ func (r *Reconciler) loop(ctx context.Context) {
|
||||
// After the hard cutover only the canonical tinyforge.workload.id label
|
||||
// path is honored — every Source plugin labels its containers with the
|
||||
// workload identity at create time.
|
||||
func (r *Reconciler) upsertFromItem(item docker.ReconcileItem) string {
|
||||
func (r *Reconciler) upsertFromItem(item docker.ReconcileItem, byID map[string]store.Workload) string {
|
||||
if id := item.Labels[docker.LabelWorkloadID]; id != "" {
|
||||
return r.upsertByWorkloadLabel(item, id)
|
||||
return r.upsertByWorkloadLabel(item, id, byID)
|
||||
}
|
||||
return ""
|
||||
}
|
||||
@@ -233,9 +224,9 @@ func (r *Reconciler) upsertFromItem(item docker.ReconcileItem) string {
|
||||
// known workload row is silently ignored. Anyone with Docker socket access
|
||||
// could otherwise spawn a container with a forged label and steal the
|
||||
// canonical slot for an existing workload.
|
||||
func (r *Reconciler) upsertByWorkloadLabel(item docker.ReconcileItem, workloadID string) string {
|
||||
w, err := r.store.GetWorkloadByID(workloadID)
|
||||
if err != nil {
|
||||
func (r *Reconciler) upsertByWorkloadLabel(item docker.ReconcileItem, workloadID string, byID map[string]store.Workload) string {
|
||||
w, ok := byID[workloadID]
|
||||
if !ok {
|
||||
// Forged or stale label — log once at debug; tick rate keeps logs quiet.
|
||||
slog.Debug("reconciler: unknown workload_id label", "workload_id", workloadID, "container_id", item.ID)
|
||||
return ""
|
||||
|
||||
@@ -257,6 +257,138 @@ func TestReconcileSkipsProjectInsertWithoutDeployerRow(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
// TestReconcileBatchingPreservesBehavior locks Fix A: loading all workloads
|
||||
// once per tick (and resolving labels from that in-memory map instead of an
|
||||
// N+1 GetWorkloadByID) must produce the same outcome as the per-container
|
||||
// lookup did. With multiple containers across multiple workloads plus a forged
|
||||
// label and a stale row, after one ReconcileOnce: known-workload containers
|
||||
// are upserted with the snapshot State, the forged-label container is skipped,
|
||||
// and the absent stale row is flipped to missing.
|
||||
func TestReconcileBatchingPreservesBehavior(t *testing.T) {
|
||||
st := newTestStore(t)
|
||||
|
||||
w1 := makeWorkload(t, st, "batch-a", "stack")
|
||||
w2 := makeWorkload(t, st, "batch-b", "stack")
|
||||
|
||||
// A stale row for w2 whose container is gone — must be marked missing.
|
||||
if err := st.UpsertContainer(store.Container{
|
||||
ID: w2.ID + ":old", WorkloadID: w2.ID, WorkloadKind: "stack",
|
||||
Role: "old", ContainerID: "docker-vanished", State: "running",
|
||||
}); err != nil {
|
||||
t.Fatalf("seed stale row: %v", err)
|
||||
}
|
||||
|
||||
fake := &fakeDocker{items: []docker.ReconcileItem{
|
||||
{
|
||||
ID: "docker-a1", Name: "batch-a-web-1", Image: "nginx:1.27", State: "running",
|
||||
Labels: map[string]string{
|
||||
docker.LabelManaged: "true",
|
||||
docker.LabelWorkloadID: w1.ID,
|
||||
docker.LabelWorkloadKind: "stack",
|
||||
docker.LabelRole: "web",
|
||||
},
|
||||
Ports: []uint16{8080},
|
||||
},
|
||||
{
|
||||
ID: "docker-b1", Name: "batch-b-api-1", Image: "redis:7", State: "exited",
|
||||
Labels: map[string]string{
|
||||
docker.LabelManaged: "true",
|
||||
docker.LabelWorkloadID: w2.ID,
|
||||
docker.LabelWorkloadKind: "stack",
|
||||
docker.LabelRole: "api",
|
||||
},
|
||||
},
|
||||
{
|
||||
// Forged label — no such workload. Must be skipped entirely.
|
||||
ID: "docker-evil", Name: "evil", Image: "nginx", State: "running",
|
||||
Labels: map[string]string{
|
||||
docker.LabelManaged: "true",
|
||||
docker.LabelWorkloadID: "wl-forged",
|
||||
docker.LabelWorkloadKind: "stack",
|
||||
docker.LabelRole: "web",
|
||||
},
|
||||
},
|
||||
}}
|
||||
|
||||
r := New(st, fake, 0)
|
||||
if err := r.ReconcileOnce(context.Background()); err != nil {
|
||||
t.Fatalf("ReconcileOnce: %v", err)
|
||||
}
|
||||
|
||||
// w1: one row, bound to docker-a1, running.
|
||||
w1Rows, _ := st.ListContainersByWorkload(w1.ID)
|
||||
if len(w1Rows) != 1 {
|
||||
t.Fatalf("w1: expected 1 row, got %d", len(w1Rows))
|
||||
}
|
||||
if w1Rows[0].ContainerID != "docker-a1" || w1Rows[0].State != "running" || w1Rows[0].Role != "web" {
|
||||
t.Fatalf("w1 row wrong: %+v", w1Rows[0])
|
||||
}
|
||||
|
||||
// w2: the new api container is present (exited→stopped); the stale row is missing.
|
||||
api, _ := st.GetContainerByID(w2.ID + ":api")
|
||||
if api.ContainerID != "docker-b1" || api.State != "stopped" {
|
||||
t.Fatalf("w2 api row wrong: %+v", api)
|
||||
}
|
||||
old, _ := st.GetContainerByID(w2.ID + ":old")
|
||||
if old.State != "missing" {
|
||||
t.Fatalf("w2 stale row should be missing, got %q", old.State)
|
||||
}
|
||||
|
||||
// Forged label produced no row anywhere.
|
||||
all, _ := st.ListContainers(store.ContainerFilter{})
|
||||
for _, c := range all {
|
||||
if c.ContainerID == "docker-evil" {
|
||||
t.Fatalf("forged-label container was adopted: %+v", c)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TestReconcileSyncsImageContainerState locks the Fix B coupling: the generic
|
||||
// reconciler upsert pass — NOT image.Reconcile — is what syncs an image
|
||||
// container's State from the snapshot. An image container carries the
|
||||
// workload_id / kind=image / role=image labels at create time, so a present
|
||||
// container's row gets its State written here, proving the per-container
|
||||
// inspect formerly in image.Reconcile is redundant.
|
||||
func TestReconcileSyncsImageContainerState(t *testing.T) {
|
||||
st := newTestStore(t)
|
||||
w := makeWorkload(t, st, "img", "image")
|
||||
|
||||
// Deployer pre-created the image container row (running). Docker now
|
||||
// reports it exited — the generic pass must sync it to stopped.
|
||||
if err := st.UpsertContainer(store.Container{
|
||||
ID: "img-deploy-uuid", WorkloadID: w.ID, WorkloadKind: "image",
|
||||
Role: "image", ContainerID: "docker-img", State: "running",
|
||||
}); err != nil {
|
||||
t.Fatalf("seed image row: %v", err)
|
||||
}
|
||||
|
||||
fake := &fakeDocker{items: []docker.ReconcileItem{{
|
||||
ID: "docker-img", Image: "ghcr.io/owner/app:v1", State: "exited",
|
||||
Labels: map[string]string{
|
||||
docker.LabelManaged: "true",
|
||||
docker.LabelWorkloadID: w.ID,
|
||||
docker.LabelWorkloadKind: "image",
|
||||
docker.LabelRole: "image",
|
||||
},
|
||||
Ports: []uint16{3000},
|
||||
}}}
|
||||
|
||||
// No plugin reconciler wired — proves the state sync comes from the
|
||||
// generic upsert pass, not from image.Reconcile.
|
||||
r := New(st, fake, 0)
|
||||
if err := r.ReconcileOnce(context.Background()); err != nil {
|
||||
t.Fatalf("ReconcileOnce: %v", err)
|
||||
}
|
||||
|
||||
got, _ := st.GetContainerByID("img-deploy-uuid")
|
||||
if got.State != "stopped" {
|
||||
t.Fatalf("image container state not synced by generic pass: got %q want stopped", got.State)
|
||||
}
|
||||
if got.Port != 3000 || got.ImageRef != "ghcr.io/owner/app:v1" {
|
||||
t.Fatalf("image container docker fields not synced: %+v", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestReconcileNormalizesState(t *testing.T) {
|
||||
st := newTestStore(t)
|
||||
w := makeWorkload(t, st, "norm", "stack")
|
||||
|
||||
@@ -27,6 +27,7 @@ import (
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/alexei/tinyforge/internal/metrics"
|
||||
"github.com/alexei/tinyforge/internal/store"
|
||||
"github.com/alexei/tinyforge/internal/workload/plugin"
|
||||
"github.com/alexei/tinyforge/internal/workload/plugin/trigger/schedule"
|
||||
@@ -124,6 +125,7 @@ func (s *Scheduler) loop(ctx context.Context) {
|
||||
// TickOnce runs a single sweep. Exposed for tests and for the boot
|
||||
// kick. On error per-trigger the loop continues with the next row.
|
||||
func (s *Scheduler) TickOnce(ctx context.Context) {
|
||||
metrics.SchedulerTicksTotal.Inc()
|
||||
rows, err := s.store.ListTriggers("schedule")
|
||||
if err != nil {
|
||||
slog.Warn("scheduler: list triggers", "error", err)
|
||||
@@ -226,5 +228,6 @@ func (s *Scheduler) fire(ctx context.Context, t store.Trigger, now time.Time) {
|
||||
slog.Warn("scheduler: dispatch", "trigger", t.Name, "error", err)
|
||||
return
|
||||
}
|
||||
metrics.SchedulerDispatchedTotal.Inc()
|
||||
slog.Info("scheduler: fired", "trigger", t.Name, "kind", t.Kind, "at", ts)
|
||||
}
|
||||
|
||||
@@ -92,17 +92,27 @@ func (c *Compose) Ps(ctx context.Context, projectName, yamlPath string) ([]Servi
|
||||
}
|
||||
|
||||
// Logs runs `docker compose -p <projectName> logs --no-color --tail=<n> <service>`.
|
||||
// If service is empty, logs for all services are returned.
|
||||
// If service is empty, logs for all services are returned. The service arg
|
||||
// is preceded by `--` so a service name that begins with `-` cannot be
|
||||
// re-parsed as a flag by the docker CLI (flag-injection guard).
|
||||
func (c *Compose) Logs(ctx context.Context, projectName, service string, tail int) (string, error) {
|
||||
args := []string{"logs", "--no-color", fmt.Sprintf("--tail=%d", tail)}
|
||||
if service != "" {
|
||||
args = append(args, service)
|
||||
args = append(args, "--", service)
|
||||
}
|
||||
return c.run(ctx, projectName, args...)
|
||||
}
|
||||
|
||||
// run executes `docker compose -p <projectName> <args...>` and returns combined output.
|
||||
// run executes `docker compose -p <projectName> <args...>` and returns
|
||||
// combined output. projectName is verified not to begin with `-` because
|
||||
// `docker compose -p '--foo'` would otherwise be re-parsed as a flag —
|
||||
// the callers already sanitize project names through projectNameSanitizer,
|
||||
// but a belt-and-braces refusal here means any future caller cannot
|
||||
// accidentally bypass the sanitizer.
|
||||
func (c *Compose) run(ctx context.Context, projectName string, args ...string) (string, error) {
|
||||
if projectName == "" || strings.HasPrefix(projectName, "-") {
|
||||
return "", fmt.Errorf("docker compose: refusing project name %q", projectName)
|
||||
}
|
||||
full := append([]string{"compose", "-p", projectName}, args...)
|
||||
cmd := exec.CommandContext(ctx, c.binary, full...)
|
||||
var buf bytes.Buffer
|
||||
|
||||
+146
-6
@@ -2,6 +2,7 @@ package stack
|
||||
|
||||
import (
	"fmt"
	"path"
	"strings"

	"gopkg.in/yaml.v3"
)
|
||||
@@ -15,11 +16,25 @@ type ComposeSpec struct {
|
||||
}
|
||||
|
||||
// ServiceSpec captures the subset of compose service fields we inspect.
|
||||
//
|
||||
// All host-escape-adjacent fields are decoded here even though Tinyforge
|
||||
// itself never reads them at runtime — surfacing them to Validate() is the
|
||||
// only way to *reject* them. Add new fields here when blocking a new
|
||||
// escape vector.
|
||||
type ServiceSpec struct {
|
||||
Image string `yaml:"image,omitempty"`
|
||||
Ports []any `yaml:"ports,omitempty"`
|
||||
Labels map[string]string `yaml:"labels,omitempty"`
|
||||
Privileged bool `yaml:"privileged,omitempty"`
|
||||
Image string `yaml:"image,omitempty"`
|
||||
Build any `yaml:"build,omitempty"` // banned — see Validate
|
||||
Ports []any `yaml:"ports,omitempty"`
|
||||
Labels map[string]string `yaml:"labels,omitempty"`
|
||||
Privileged bool `yaml:"privileged,omitempty"`
|
||||
Volumes []any `yaml:"volumes,omitempty"`
|
||||
NetworkMode string `yaml:"network_mode,omitempty"`
|
||||
Pid string `yaml:"pid,omitempty"`
|
||||
Ipc string `yaml:"ipc,omitempty"`
|
||||
UsernsMode string `yaml:"userns_mode,omitempty"`
|
||||
CapAdd []string `yaml:"cap_add,omitempty"`
|
||||
Devices []any `yaml:"devices,omitempty"`
|
||||
SecurityOpt []string `yaml:"security_opt,omitempty"`
|
||||
}
|
||||
|
||||
// Parse decodes YAML into a ComposeSpec. Returns a descriptive error on failure.
|
||||
@@ -35,10 +50,20 @@ func Parse(yamlText string) (ComposeSpec, error) {
|
||||
}
|
||||
|
||||
// Validate enforces Tinyforge-level constraints beyond compose schema validity.
|
||||
// All blocked fields below are documented host-escape vectors: any one of
|
||||
// them on its own gives the container root on the host. Tinyforge already
|
||||
// owns the docker socket, so the threat model is "any admin == host root,"
|
||||
// and these blocks raise the bar for any *future* viewer-to-admin
|
||||
// escalation as well as honest-mistake guardrails.
|
||||
//
|
||||
// Current rules:
|
||||
// - No service may set `privileged: true`.
|
||||
// - Every service must declare an image (compose supports build: too, but
|
||||
// Tinyforge v1 disallows building from context to avoid arbitrary-code exec).
|
||||
// - Every service must declare an image (build contexts disallowed).
|
||||
// - No host-IPC / host-PID / host-userns / host networking.
|
||||
// - No `cap_add`, `security_opt`, `devices`.
|
||||
// - `volumes` may not bind-mount the docker socket, /, /etc, /var, /proc,
|
||||
// /sys, /root, or /home — list is conservative; operators with real
|
||||
// bind-mount needs should ship a Source plugin or a dedicated wizard.
|
||||
func Validate(spec ComposeSpec) error {
|
||||
for name, svc := range spec.Services {
|
||||
if svc.Privileged {
|
||||
@@ -47,6 +72,121 @@ func Validate(spec ComposeSpec) error {
|
||||
if svc.Image == "" {
|
||||
return fmt.Errorf("service %q: image is required (build contexts not supported)", name)
|
||||
}
|
||||
if svc.Build != nil {
|
||||
return fmt.Errorf("service %q: build: is not supported (use image:)", name)
|
||||
}
|
||||
if isBlockedNamespaceMode(svc.NetworkMode) {
|
||||
return fmt.Errorf("service %q: network_mode %q is not allowed", name, svc.NetworkMode)
|
||||
}
|
||||
if isBlockedNamespaceMode(svc.Pid) {
|
||||
return fmt.Errorf("service %q: pid: %q is not allowed", name, svc.Pid)
|
||||
}
|
||||
if isBlockedNamespaceMode(svc.Ipc) {
|
||||
return fmt.Errorf("service %q: ipc: %q is not allowed", name, svc.Ipc)
|
||||
}
|
||||
if isHostMode(svc.UsernsMode) {
|
||||
return fmt.Errorf("service %q: userns_mode %q is not allowed", name, svc.UsernsMode)
|
||||
}
|
||||
if len(svc.CapAdd) > 0 {
|
||||
return fmt.Errorf("service %q: cap_add is not allowed", name)
|
||||
}
|
||||
if len(svc.SecurityOpt) > 0 {
|
||||
return fmt.Errorf("service %q: security_opt is not allowed", name)
|
||||
}
|
||||
if len(svc.Devices) > 0 {
|
||||
return fmt.Errorf("service %q: devices is not allowed", name)
|
||||
}
|
||||
for _, v := range svc.Volumes {
|
||||
if host, ok := bindMountHostPath(v); ok {
|
||||
if isBlockedBindMount(host) {
|
||||
return fmt.Errorf("service %q: bind-mounting %q is not allowed", name, host)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// isHostMode reports whether v requests sharing a host namespace, i.e. a
// network_mode / pid / ipc / userns_mode value of exactly "host".
//
// The comparison is an exact match on purpose: "host-gateway" is an
// extra_hosts value rather than a namespace mode, and matching it here only
// produced misleading rejections.
func isHostMode(v string) bool {
	const hostNamespace = "host"
	return v == hostNamespace
}
|
||||
|
||||
// isBlockedNamespaceMode reports a namespace mode that must be rejected for
|
||||
// network_mode / pid / ipc: either host sharing ("host") or joining another
|
||||
// container's / compose service's namespace ("container:<id>",
|
||||
// "service:<name>"). The container/service joins are a lateral-movement and
|
||||
// sandbox-escape vector — a malicious service could attach to a victim
|
||||
// container's network or PID namespace.
|
||||
func isBlockedNamespaceMode(v string) bool {
|
||||
return isHostMode(v) ||
|
||||
strings.HasPrefix(v, "container:") ||
|
||||
strings.HasPrefix(v, "service:")
|
||||
}
|
||||
|
||||
// bindMountHostPath extracts the host-side path from one compose `volumes`
// entry. It returns ok=false when the entry does not bind-mount a host path
// (named volume, anonymous volume, non-bind long-form type, unsupported
// shape).
//
// Compose accepts two shapes:
//
//   - Short string "[SOURCE:]TARGET[:MODE]". A string WITHOUT a colon is a
//     container path only (anonymous volume), not a host bind, so something
//     like "/var/lib/mysql" must not be mistaken for a host path. With a colon,
//     SOURCE is a host path when it starts with "/", "." or "~"; anything else
//     is a named volume.
//   - Long-form map with "type" / "source" keys. An explicit `type: bind`
//     always binds its source — even a bare relative one like "x/../../etc",
//     which compose resolves against the project directory — so any non-empty
//     source is returned for validation. When "type" is omitted, the same
//     prefix heuristic as the short form is applied.
func bindMountHostPath(v any) (string, bool) {
	// looksLikePath mirrors how the short syntax tells a path from a volume name.
	looksLikePath := func(s string) bool {
		return strings.HasPrefix(s, "/") || strings.HasPrefix(s, ".") || strings.HasPrefix(s, "~")
	}

	switch t := v.(type) {
	case string:
		src, _, hasTarget := strings.Cut(t, ":")
		if !hasTarget {
			// "" or a lone container path: nothing from the host is mounted.
			return "", false
		}
		if looksLikePath(src) {
			return src, true
		}
		return "", false // "named:/in/container"

	case map[string]any:
		typ, _ := t["type"].(string)
		src, _ := t["source"].(string)
		switch {
		case typ != "" && typ != "bind":
			return "", false // volume / tmpfs / npipe / ...
		case src == "":
			return "", false
		case typ == "bind", looksLikePath(src):
			return src, true
		}
	}
	return "", false
}
|
||||
|
||||
// isBlockedBindMount reports whether host is a bind-mount source that
// obviously escapes the container's intended sandbox. It is a conservative
// deny-list — operators with legitimate bind-mount needs should write a
// dedicated Source plugin rather than tunnel them through compose.
//
// A path is blocked when it is:
//
//   - empty or the filesystem root;
//   - relative ("./x", "../x", ".") or home-relative ("~/..."): these are
//     resolved against the compose working directory or left unexpanded,
//     and "../" can climb out of that directory entirely;
//   - the Docker socket ("/var/run/docker.sock", "/run/docker.sock");
//   - one of /etc, /var, /proc, /sys, /root, /home, /boot, /dev, or anything
//     beneath them.
//
// Absolute paths are normalized lexically before matching (empty and "."
// segments dropped, ".." resolved), so spellings such as "/tmp/../etc" or
// "//etc" cannot slip past the prefix comparison. Symlinks are not resolved;
// this is a string-level check only.
func isBlockedBindMount(host string) bool {
	// Drop trailing slashes so "/var" and "/var/" both match. A path made
	// only of slashes (the root) collapses to "" here.
	trimmed := strings.TrimRight(host, "/")
	if trimmed == "" {
		return true
	}
	// The absolute-prefix deny-list below cannot see relative or
	// home-relative sources, so reject them outright rather than give a
	// false sense of coverage.
	if strings.HasPrefix(trimmed, ".") || strings.HasPrefix(trimmed, "~") {
		return true
	}

	clean := trimmed
	if strings.HasPrefix(trimmed, "/") {
		// Lexically resolve the path so the comparisons below see the
		// location the string actually names.
		segments := make([]string, 0, strings.Count(trimmed, "/"))
		for _, seg := range strings.Split(trimmed, "/") {
			switch seg {
			case "", ".":
				// Redundant separator or self-reference: skip.
			case "..":
				// Step up one level; ".." at the root stays at the root.
				if n := len(segments); n > 0 {
					segments = segments[:n-1]
				}
			default:
				segments = append(segments, seg)
			}
		}
		if len(segments) == 0 {
			// Everything cancelled out: the path is the root.
			return true
		}
		clean = "/" + strings.Join(segments, "/")
	}

	// Specific blocked files / sockets.
	switch clean {
	case "/var/run/docker.sock", "/run/docker.sock":
		return true
	}

	// Blocked prefixes (cover sub-paths too).
	blocked := []string{"/etc", "/var", "/proc", "/sys", "/root", "/home", "/boot", "/dev"}
	for _, p := range blocked {
		if clean == p || strings.HasPrefix(clean, p+"/") {
			return true
		}
	}
	return false
}
|
||||
|
||||
@@ -0,0 +1,55 @@
|
||||
package staticsite
|
||||
|
||||
import (
|
||||
"context"
|
||||
"log/slog"
|
||||
)
|
||||
|
||||
// CommitStatusReporter pushes deploy outcomes back to the git provider as a
|
||||
// commit status, gated on the per-workload report_commit_status flag. It is
|
||||
// strictly best-effort: every call is wrapped so a reporting failure is logged
|
||||
// at Warn and NEVER propagates to fail or block the deploy.
|
||||
//
|
||||
// The provider + identifiers are captured once at deploy start so the hot
|
||||
// transition points (pending/success/failure) read as one-liners. A nil
|
||||
// receiver (reporting disabled) makes Report a no-op, so callers don't have to
|
||||
// guard each transition.
|
||||
//
|
||||
// It lives in the staticsite package (alongside GitProvider / CommitStatus)
|
||||
// rather than the plugin package so the source plugins can share it without
|
||||
// staticsite taking a dependency on plugin. It is parameterized on primitives
|
||||
// (not plugin.Workload) for the same reason.
|
||||
type CommitStatusReporter struct {
|
||||
provider GitProvider
|
||||
owner string
|
||||
repo string
|
||||
sha string
|
||||
targetURL string
|
||||
enabled bool
|
||||
}
|
||||
|
||||
// NewCommitStatusReporter builds a reporter from the resolved deploy inputs.
|
||||
// When enabled is false (report_commit_status off) or the SHA is empty, the
|
||||
// returned reporter's Report method is inert.
|
||||
func NewCommitStatusReporter(provider GitProvider, owner, repo, sha, targetURL string, enabled bool) *CommitStatusReporter {
|
||||
return &CommitStatusReporter{
|
||||
provider: provider,
|
||||
owner: owner,
|
||||
repo: repo,
|
||||
sha: sha,
|
||||
targetURL: targetURL,
|
||||
enabled: enabled,
|
||||
}
|
||||
}
|
||||
|
||||
// Report sends one commit status, swallowing (and logging) any error. Safe to
|
||||
// call on a nil/disabled reporter or with a nil provider/empty SHA.
|
||||
func (r *CommitStatusReporter) Report(ctx context.Context, workloadName, workloadID string, status CommitStatus, description string) {
|
||||
if r == nil || !r.enabled || r.provider == nil || r.sha == "" {
|
||||
return
|
||||
}
|
||||
if err := r.provider.SetCommitStatus(ctx, r.owner, r.repo, r.sha, status, r.targetURL, description); err != nil {
|
||||
slog.Warn("commit-status report failed (ignored)",
|
||||
"workload", workloadName, "workload_id", workloadID, "status", string(status), "error", err)
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,125 @@
|
||||
package staticsite
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// fakeReporterProvider is a stub GitProvider that records SetCommitStatus
|
||||
// calls. Only the methods the reporter exercises are meaningful; the rest
|
||||
// satisfy the interface and panic if ever hit so a mis-wired test is loud.
|
||||
type fakeReporterProvider struct {
|
||||
calls []reporterStatusCall
|
||||
failErr error // when set, SetCommitStatus returns it (best-effort path)
|
||||
}
|
||||
|
||||
type reporterStatusCall struct {
|
||||
owner, repo, sha string
|
||||
status CommitStatus
|
||||
targetURL, descr string
|
||||
}
|
||||
|
||||
func (f *fakeReporterProvider) SetCommitStatus(_ context.Context, owner, repo, sha string, status CommitStatus, targetURL, description string) error {
|
||||
f.calls = append(f.calls, reporterStatusCall{owner, repo, sha, status, targetURL, description})
|
||||
return f.failErr
|
||||
}
|
||||
|
||||
func (*fakeReporterProvider) Name() string { return "fake" }
|
||||
func (*fakeReporterProvider) TestConnection(context.Context, string, string) error {
|
||||
panic("unused")
|
||||
}
|
||||
func (*fakeReporterProvider) ListRepos(context.Context, string) ([]RepoInfo, error) {
|
||||
panic("unused")
|
||||
}
|
||||
func (*fakeReporterProvider) ListBranches(context.Context, string, string) ([]string, error) {
|
||||
panic("unused")
|
||||
}
|
||||
func (*fakeReporterProvider) GetLatestCommitSHA(context.Context, string, string, string) (string, error) {
|
||||
panic("unused")
|
||||
}
|
||||
func (*fakeReporterProvider) ListTree(context.Context, string, string, string) ([]FolderEntry, error) {
|
||||
panic("unused")
|
||||
}
|
||||
func (*fakeReporterProvider) DownloadFolder(context.Context, string, string, string, string, string) error {
|
||||
panic("unused")
|
||||
}
|
||||
func (*fakeReporterProvider) DownloadFile(context.Context, string, string, string, string, int64) ([]byte, error) {
|
||||
panic("unused")
|
||||
}
|
||||
|
||||
// Enabled: forwards to the provider with the captured identifiers + target.
|
||||
func TestCommitStatusReporter_Enabled_Calls(t *testing.T) {
|
||||
fp := &fakeReporterProvider{}
|
||||
r := NewCommitStatusReporter(fp, "owner", "pages", "abc123", "https://app.example.com", true)
|
||||
|
||||
r.Report(context.Background(), "site", "wid-1", CommitStatusPending, "Tinyforge: deploying")
|
||||
r.Report(context.Background(), "site", "wid-1", CommitStatusSuccess, "Tinyforge: deployed")
|
||||
|
||||
if len(fp.calls) != 2 {
|
||||
t.Fatalf("calls = %d, want 2", len(fp.calls))
|
||||
}
|
||||
first := fp.calls[0]
|
||||
if first.owner != "owner" || first.repo != "pages" || first.sha != "abc123" {
|
||||
t.Errorf("identifiers wrong: %+v", first)
|
||||
}
|
||||
if first.status != CommitStatusPending {
|
||||
t.Errorf("first status = %q, want pending", first.status)
|
||||
}
|
||||
if first.targetURL != "https://app.example.com" {
|
||||
t.Errorf("targetURL = %q", first.targetURL)
|
||||
}
|
||||
if fp.calls[1].status != CommitStatusSuccess {
|
||||
t.Errorf("second status = %q, want success", fp.calls[1].status)
|
||||
}
|
||||
}
|
||||
|
||||
// Disabled: the reporter is inert.
|
||||
func TestCommitStatusReporter_Disabled_NoCalls(t *testing.T) {
|
||||
fp := &fakeReporterProvider{}
|
||||
r := NewCommitStatusReporter(fp, "owner", "pages", "abc123", "", false)
|
||||
|
||||
r.Report(context.Background(), "site", "wid-1", CommitStatusSuccess, "x")
|
||||
if len(fp.calls) != 0 {
|
||||
t.Fatalf("expected no calls when disabled, got %d", len(fp.calls))
|
||||
}
|
||||
}
|
||||
|
||||
// An empty SHA (e.g. a provider that couldn't resolve the branch) must not
|
||||
// produce a status call even when reporting is enabled.
|
||||
func TestCommitStatusReporter_EmptySHA_NoCalls(t *testing.T) {
|
||||
fp := &fakeReporterProvider{}
|
||||
r := NewCommitStatusReporter(fp, "owner", "pages", "", "", true)
|
||||
|
||||
r.Report(context.Background(), "site", "wid-1", CommitStatusPending, "x")
|
||||
if len(fp.calls) != 0 {
|
||||
t.Fatalf("expected no calls with empty SHA, got %d", len(fp.calls))
|
||||
}
|
||||
}
|
||||
|
||||
// A provider error must be swallowed (best-effort) — Report never panics or
|
||||
// propagates. We assert it returns normally after a failing provider call.
|
||||
func TestCommitStatusReporter_ProviderError_Swallowed(t *testing.T) {
|
||||
fp := &fakeReporterProvider{failErr: errors.New("boom")}
|
||||
r := NewCommitStatusReporter(fp, "owner", "pages", "abc123", "", true)
|
||||
|
||||
// Should not panic / propagate.
|
||||
r.Report(context.Background(), "site", "wid-1", CommitStatusFailure, "Tinyforge: deploy failed")
|
||||
if len(fp.calls) != 1 {
|
||||
t.Fatalf("expected the failing call to still be recorded, got %d", len(fp.calls))
|
||||
}
|
||||
}
|
||||
|
||||
// A nil reporter (constructed only when needed in some call paths) is safe.
|
||||
func TestCommitStatusReporter_NilSafe(t *testing.T) {
|
||||
var r *CommitStatusReporter
|
||||
// Must not panic.
|
||||
r.Report(context.Background(), "site", "wid-1", CommitStatusSuccess, "x")
|
||||
}
|
||||
|
||||
// A nil provider on an enabled reporter is also a no-op (defensive guard).
|
||||
func TestCommitStatusReporter_NilProvider_NoPanic(t *testing.T) {
|
||||
r := NewCommitStatusReporter(nil, "owner", "pages", "abc123", "", true)
|
||||
// Must not panic.
|
||||
r.Report(context.Background(), "site", "wid-1", CommitStatusSuccess, "x")
|
||||
}
|
||||
@@ -0,0 +1,331 @@
|
||||
package staticsite
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"io"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
// ── State mapping (pure) ────────────────────────────────────────────
|
||||
//
|
||||
// Each provider maps the provider-agnostic CommitStatus onto its own API
|
||||
// vocabulary. Gitea/GitHub accept the same four words; GitLab collapses
|
||||
// failure+error into "failed".
|
||||
|
||||
func TestGiteaState_Mapping(t *testing.T) {
|
||||
cases := map[CommitStatus]string{
|
||||
CommitStatusPending: "pending",
|
||||
CommitStatusSuccess: "success",
|
||||
CommitStatusFailure: "failure",
|
||||
CommitStatusError: "error",
|
||||
CommitStatus("???"): "pending", // unknown -> pending fallback
|
||||
}
|
||||
for in, want := range cases {
|
||||
if got := giteaState(in); got != want {
|
||||
t.Errorf("giteaState(%q) = %q, want %q", in, got, want)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestGitHubState_Mapping(t *testing.T) {
|
||||
cases := map[CommitStatus]string{
|
||||
CommitStatusPending: "pending",
|
||||
CommitStatusSuccess: "success",
|
||||
CommitStatusFailure: "failure",
|
||||
CommitStatusError: "error",
|
||||
CommitStatus("???"): "pending",
|
||||
}
|
||||
for in, want := range cases {
|
||||
if got := githubState(in); got != want {
|
||||
t.Errorf("githubState(%q) = %q, want %q", in, got, want)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestGitLabState_Mapping(t *testing.T) {
|
||||
cases := map[CommitStatus]string{
|
||||
CommitStatusPending: "pending",
|
||||
CommitStatusSuccess: "success",
|
||||
CommitStatusFailure: "failed", // GitLab has no "failure"
|
||||
CommitStatusError: "failed", // error also collapses to "failed"
|
||||
CommitStatus("???"): "pending",
|
||||
}
|
||||
for in, want := range cases {
|
||||
if got := gitlabState(in); got != want {
|
||||
t.Errorf("gitlabState(%q) = %q, want %q", in, got, want)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestTruncateDescription(t *testing.T) {
|
||||
short := "Tinyforge: deploying"
|
||||
if got := truncateDescription(short); got != short {
|
||||
t.Errorf("short description mutated: %q", got)
|
||||
}
|
||||
long := strings.Repeat("x", 200)
|
||||
got := truncateDescription(long)
|
||||
if len([]rune(got)) > maxCommitStatusDescription {
|
||||
t.Errorf("truncated length = %d runes, want <= %d", len([]rune(got)), maxCommitStatusDescription)
|
||||
}
|
||||
if !strings.HasSuffix(got, "…") {
|
||||
t.Errorf("missing ellipsis on truncation: %q", got)
|
||||
}
|
||||
}
|
||||
|
||||
// ── Endpoint + body construction (httptest) ─────────────────────────
|
||||
//
|
||||
// The SSRF-safe client refuses loopback, so for these tests we swap the
|
||||
// provider's httpClient for a plain one pointed at httptest. This still
|
||||
// exercises the real URL/body construction inside each SetCommitStatus.
|
||||
|
||||
// capturedRequest is what the capture server recorded about the most recent
// request it handled.
type capturedRequest struct {
	// method is the HTTP method of the request.
	method string

	// path is r.URL.EscapedPath() — it preserves %2F so GitLab's encoded
	// project path is observable.
	path string

	// rawQ is the undecoded query string.
	rawQ string

	// body is the JSON request body decoded into a flat string map; it
	// stays nil when the request carried no body.
	body map[string]string

	// auth is the Authorization header value.
	auth string

	// token is the PRIVATE-TOKEN header value (GitLab).
	token string
}
|
||||
|
||||
func newCaptureServer(t *testing.T, capture *capturedRequest) *httptest.Server {
|
||||
t.Helper()
|
||||
return httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
capture.method = r.Method
|
||||
capture.path = r.URL.EscapedPath()
|
||||
capture.rawQ = r.URL.RawQuery
|
||||
capture.auth = r.Header.Get("Authorization")
|
||||
capture.token = r.Header.Get("PRIVATE-TOKEN")
|
||||
raw, _ := io.ReadAll(r.Body)
|
||||
if len(raw) > 0 {
|
||||
_ = json.Unmarshal(raw, &capture.body)
|
||||
}
|
||||
w.WriteHeader(http.StatusCreated)
|
||||
_, _ = w.Write([]byte(`{}`))
|
||||
}))
|
||||
}
|
||||
|
||||
func TestGitea_SetCommitStatus_Request(t *testing.T) {
|
||||
var cap capturedRequest
|
||||
srv := newCaptureServer(t, &cap)
|
||||
defer srv.Close()
|
||||
|
||||
f := NewGiteaContentFetcher(srv.URL, "tok123")
|
||||
f.httpClient = srv.Client() // bypass SSRF guard for loopback test server
|
||||
|
||||
err := f.SetCommitStatus(context.Background(), "owner", "repo", "abc123",
|
||||
CommitStatusSuccess, "https://app.example.com", "deployed")
|
||||
if err != nil {
|
||||
t.Fatalf("SetCommitStatus: %v", err)
|
||||
}
|
||||
|
||||
if cap.method != http.MethodPost {
|
||||
t.Errorf("method = %q, want POST", cap.method)
|
||||
}
|
||||
if want := "/api/v1/repos/owner/repo/statuses/abc123"; cap.path != want {
|
||||
t.Errorf("path = %q, want %q", cap.path, want)
|
||||
}
|
||||
if cap.body["state"] != "success" {
|
||||
t.Errorf("state = %q, want success", cap.body["state"])
|
||||
}
|
||||
if cap.body["context"] != "tinyforge" {
|
||||
t.Errorf("context = %q, want tinyforge", cap.body["context"])
|
||||
}
|
||||
if cap.body["target_url"] != "https://app.example.com" {
|
||||
t.Errorf("target_url = %q", cap.body["target_url"])
|
||||
}
|
||||
if cap.body["description"] != "deployed" {
|
||||
t.Errorf("description = %q, want deployed", cap.body["description"])
|
||||
}
|
||||
if cap.auth != "token tok123" {
|
||||
t.Errorf("auth = %q, want 'token tok123'", cap.auth)
|
||||
}
|
||||
}
|
||||
|
||||
func TestGitHub_SetCommitStatus_Request(t *testing.T) {
|
||||
var cap capturedRequest
|
||||
srv := newCaptureServer(t, &cap)
|
||||
defer srv.Close()
|
||||
|
||||
// Force GHE-style apiBase so we hit the server's path; the github.com
|
||||
// branch hard-codes api.github.com which the SSRF client would block.
|
||||
g := NewGitHubProvider(srv.URL, "ghp_tok")
|
||||
g.apiBase = srv.URL + "/api/v3"
|
||||
g.httpClient = srv.Client()
|
||||
|
||||
err := g.SetCommitStatus(context.Background(), "octo", "cat", "deadbeef",
|
||||
CommitStatusFailure, "", "failed")
|
||||
if err != nil {
|
||||
t.Fatalf("SetCommitStatus: %v", err)
|
||||
}
|
||||
|
||||
if want := "/api/v3/repos/octo/cat/statuses/deadbeef"; cap.path != want {
|
||||
t.Errorf("path = %q, want %q", cap.path, want)
|
||||
}
|
||||
if cap.body["state"] != "failure" {
|
||||
t.Errorf("state = %q, want failure", cap.body["state"])
|
||||
}
|
||||
if cap.body["context"] != "tinyforge" {
|
||||
t.Errorf("context = %q, want tinyforge", cap.body["context"])
|
||||
}
|
||||
if cap.auth != "Bearer ghp_tok" {
|
||||
t.Errorf("auth = %q, want 'Bearer ghp_tok'", cap.auth)
|
||||
}
|
||||
}
|
||||
|
||||
func TestGitLab_SetCommitStatus_Request(t *testing.T) {
|
||||
var cap capturedRequest
|
||||
srv := newCaptureServer(t, &cap)
|
||||
defer srv.Close()
|
||||
|
||||
g := NewGitLabProvider(srv.URL, "glpat-xyz")
|
||||
g.httpClient = srv.Client()
|
||||
|
||||
err := g.SetCommitStatus(context.Background(), "grp", "proj", "cafe01",
|
||||
CommitStatusError, "https://app.example.com", "boom")
|
||||
if err != nil {
|
||||
t.Fatalf("SetCommitStatus: %v", err)
|
||||
}
|
||||
|
||||
// GitLab uses the URL-encoded project path + sha in the path, and the
|
||||
// status metadata as query params.
|
||||
if want := "/api/v4/projects/grp%2Fproj/statuses/cafe01"; cap.path != want {
|
||||
t.Errorf("path = %q, want %q", cap.path, want)
|
||||
}
|
||||
q, err := parseQuery(cap.rawQ)
|
||||
if err != nil {
|
||||
t.Fatalf("parse query %q: %v", cap.rawQ, err)
|
||||
}
|
||||
if q["state"] != "failed" { // error -> failed
|
||||
t.Errorf("state = %q, want failed", q["state"])
|
||||
}
|
||||
if q["name"] != "tinyforge" {
|
||||
t.Errorf("name = %q, want tinyforge", q["name"])
|
||||
}
|
||||
if q["target_url"] != "https://app.example.com" {
|
||||
t.Errorf("target_url = %q", q["target_url"])
|
||||
}
|
||||
if q["description"] != "boom" {
|
||||
t.Errorf("description = %q, want boom", q["description"])
|
||||
}
|
||||
if cap.token != "glpat-xyz" {
|
||||
t.Errorf("PRIVATE-TOKEN = %q, want glpat-xyz", cap.token)
|
||||
}
|
||||
}
|
||||
|
||||
// parseQuery is a tiny wrapper so the test reads the first value of each
|
||||
// query key without dragging net/url into every assertion.
|
||||
func parseQuery(raw string) (map[string]string, error) {
|
||||
out := map[string]string{}
|
||||
if raw == "" {
|
||||
return out, nil
|
||||
}
|
||||
for _, pair := range strings.Split(raw, "&") {
|
||||
kv := strings.SplitN(pair, "=", 2)
|
||||
k := urlDecode(kv[0])
|
||||
v := ""
|
||||
if len(kv) == 2 {
|
||||
v = urlDecode(kv[1])
|
||||
}
|
||||
if _, ok := out[k]; !ok {
|
||||
out[k] = v
|
||||
}
|
||||
}
|
||||
return out, nil
|
||||
}
|
||||
|
||||
func urlDecode(s string) string {
|
||||
dec, err := decodeQueryComponent(s)
|
||||
if err != nil {
|
||||
return s
|
||||
}
|
||||
return dec
|
||||
}
|
||||
|
||||
// decodeQueryComponent decodes one application/x-www-form-urlencoded
|
||||
// component (handles %XX and '+'-as-space) without importing net/url here.
|
||||
func decodeQueryComponent(s string) (string, error) {
|
||||
var b strings.Builder
|
||||
for i := 0; i < len(s); i++ {
|
||||
switch s[i] {
|
||||
case '+':
|
||||
b.WriteByte(' ')
|
||||
case '%':
|
||||
if i+2 >= len(s) {
|
||||
return s, errPercent
|
||||
}
|
||||
hi, lo := fromHex(s[i+1]), fromHex(s[i+2])
|
||||
if hi < 0 || lo < 0 {
|
||||
return s, errPercent
|
||||
}
|
||||
b.WriteByte(byte(hi<<4 | lo))
|
||||
i += 2
|
||||
default:
|
||||
b.WriteByte(s[i])
|
||||
}
|
||||
}
|
||||
return b.String(), nil
|
||||
}
|
||||
|
||||
// decodeErr is the error type behind errPercent.
type decodeErr struct{}

// Error implements the error interface.
func (*decodeErr) Error() string { return "bad percent-encoding" }

// errPercent is returned by decodeQueryComponent for a malformed %XX escape.
var errPercent = new(decodeErr)
|
||||
|
||||
// fromHex returns the numeric value (0-15) of a single hexadecimal digit,
// accepting both upper and lower case, or -1 when c is not a hex digit.
func fromHex(c byte) int {
	if '0' <= c && c <= '9' {
		return int(c - '0')
	}
	if 'a' <= c && c <= 'f' {
		return int(c-'a') + 10
	}
	if 'A' <= c && c <= 'F' {
		return int(c-'A') + 10
	}
	return -1
}
|
||||
|
||||
// TestSetCommitStatus_NonOK_ReturnsError verifies a non-2xx provider
|
||||
// response surfaces as an error (the deploy hook logs + swallows it, but
|
||||
// the provider method itself must report it).
|
||||
func TestSetCommitStatus_NonOK_ReturnsError(t *testing.T) {
|
||||
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
w.WriteHeader(http.StatusUnauthorized)
|
||||
_, _ = w.Write([]byte(`{"message":"bad token"}`))
|
||||
}))
|
||||
defer srv.Close()
|
||||
|
||||
f := NewGiteaContentFetcher(srv.URL, "tok")
|
||||
f.httpClient = srv.Client()
|
||||
|
||||
err := f.SetCommitStatus(context.Background(), "o", "r", "sha", CommitStatusPending, "", "x")
|
||||
if err == nil {
|
||||
t.Fatal("expected error on 401, got nil")
|
||||
}
|
||||
if !strings.Contains(err.Error(), "401") {
|
||||
t.Errorf("error missing status code: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// TestSetCommitStatus_RespectsContext ensures the call honours context
|
||||
// cancellation (defensive — the deploy hook passes the deploy ctx).
|
||||
func TestSetCommitStatus_RespectsContext(t *testing.T) {
|
||||
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
time.Sleep(200 * time.Millisecond)
|
||||
w.WriteHeader(http.StatusCreated)
|
||||
}))
|
||||
defer srv.Close()
|
||||
|
||||
f := NewGiteaContentFetcher(srv.URL, "")
|
||||
f.httpClient = srv.Client()
|
||||
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Millisecond)
|
||||
defer cancel()
|
||||
if err := f.SetCommitStatus(ctx, "o", "r", "sha", CommitStatusPending, "", "x"); err == nil {
|
||||
t.Fatal("expected context-deadline error, got nil")
|
||||
}
|
||||
}
|
||||
@@ -6,6 +6,7 @@ import (
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
@@ -294,6 +295,15 @@ func (f *GiteaContentFetcher) DownloadFolder(ctx context.Context, owner, repo, b
|
||||
return nil
|
||||
}
|
||||
|
||||
// DownloadFile fetches a single file's raw bytes via Gitea's raw endpoint
|
||||
// (also serves Forgejo/Gogs), capped at maxBytes. Returns ErrFileNotFound on
|
||||
// a 404 so an absent config file reads as a non-error state.
|
||||
func (f *GiteaContentFetcher) DownloadFile(ctx context.Context, owner, repo, ref, path string, maxBytes int64) ([]byte, error) {
|
||||
p := strings.TrimPrefix(path, "/")
|
||||
fileURL := fmt.Sprintf("%s/api/v1/repos/%s/%s/raw/%s?ref=%s", f.baseURL, owner, repo, p, ref)
|
||||
return getFileBytes(ctx, f.httpClient, fileURL, maxBytes, f.setAuth)
|
||||
}
|
||||
|
||||
// TestConnection verifies that the repository is accessible.
|
||||
func (f *GiteaContentFetcher) TestConnection(ctx context.Context, owner, repo string) error {
|
||||
url := fmt.Sprintf("%s/api/v1/repos/%s/%s", f.baseURL, owner, repo)
|
||||
@@ -304,6 +314,54 @@ func (f *GiteaContentFetcher) TestConnection(ctx context.Context, owner, repo st
|
||||
return nil
|
||||
}
|
||||
|
||||
// SetCommitStatus reports a deploy status on a commit via Gitea's commit-
|
||||
// status API (also serves Forgejo/Gogs). The "context" field is fixed to
|
||||
// "tinyforge" so repeated deploys update one status row.
|
||||
func (f *GiteaContentFetcher) SetCommitStatus(ctx context.Context, owner, repo, sha string, status CommitStatus, targetURL, description string) error {
|
||||
state := giteaState(status)
|
||||
body, err := json.Marshal(map[string]string{
|
||||
"state": state,
|
||||
"target_url": targetURL,
|
||||
"description": truncateDescription(description),
|
||||
"context": commitStatusContext,
|
||||
})
|
||||
if err != nil {
|
||||
return fmt.Errorf("marshal status: %w", err)
|
||||
}
|
||||
// Path-escape each identifier so the URL shape matches the other
|
||||
// provider methods and a hostile owner/repo/sha can't break out of
|
||||
// the intended path. The SSRF-safe client guards the host.
|
||||
apiURL := fmt.Sprintf("%s/api/v1/repos/%s/%s/statuses/%s",
|
||||
f.baseURL, url.PathEscape(owner), url.PathEscape(repo), url.PathEscape(sha))
|
||||
if err := postJSON(ctx, f.httpClient, apiURL, body, f.setAuth); err != nil {
|
||||
return fmt.Errorf("set commit status: %w", err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// setAuth applies the Gitea token header (no-op when the token is empty).
|
||||
func (f *GiteaContentFetcher) setAuth(req *http.Request) {
|
||||
if f.token != "" {
|
||||
req.Header.Set("Authorization", "token "+f.token)
|
||||
}
|
||||
}
|
||||
|
||||
// giteaState maps a provider-agnostic CommitStatus onto Gitea's API
|
||||
// vocabulary. Gitea accepts the same four words Tinyforge uses, so this is
|
||||
// a 1:1 mapping with a "pending" fallback for any unknown value.
|
||||
func giteaState(status CommitStatus) string {
|
||||
switch status {
|
||||
case CommitStatusSuccess:
|
||||
return "success"
|
||||
case CommitStatusFailure:
|
||||
return "failure"
|
||||
case CommitStatusError:
|
||||
return "error"
|
||||
default:
|
||||
return "pending"
|
||||
}
|
||||
}
|
||||
|
||||
// doGet performs an authenticated GET request and returns the response body.
|
||||
func (f *GiteaContentFetcher) doGet(ctx context.Context, url string) ([]byte, error) {
|
||||
req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
|
||||
|
||||
@@ -6,6 +6,7 @@ import (
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
@@ -115,6 +116,43 @@ func (g *GitHubProvider) TestConnection(ctx context.Context, owner, repo string)
|
||||
return err
|
||||
}
|
||||
|
||||
// SetCommitStatus reports a deploy status on a commit via GitHub's commit-
|
||||
// status API (works for github.com and GitHub Enterprise — apiBase already
|
||||
// carries the /api/v3 suffix for GHE). The "context" field is fixed to
|
||||
// "tinyforge" so repeated deploys update one status row.
|
||||
func (g *GitHubProvider) SetCommitStatus(ctx context.Context, owner, repo, sha string, status CommitStatus, targetURL, description string) error {
|
||||
body, err := json.Marshal(map[string]string{
|
||||
"state": githubState(status),
|
||||
"target_url": targetURL,
|
||||
"description": truncateDescription(description),
|
||||
"context": commitStatusContext,
|
||||
})
|
||||
if err != nil {
|
||||
return fmt.Errorf("marshal status: %w", err)
|
||||
}
|
||||
apiURL := fmt.Sprintf("%s/repos/%s/%s/statuses/%s",
|
||||
g.apiBase, url.PathEscape(owner), url.PathEscape(repo), url.PathEscape(sha))
|
||||
if err := postJSON(ctx, g.httpClient, apiURL, body, g.setAuth); err != nil {
|
||||
return fmt.Errorf("set commit status: %w", err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// githubState maps a provider-agnostic CommitStatus onto GitHub's API
|
||||
// vocabulary. GitHub accepts the same four words Tinyforge uses.
|
||||
func githubState(status CommitStatus) string {
|
||||
switch status {
|
||||
case CommitStatusSuccess:
|
||||
return "success"
|
||||
case CommitStatusFailure:
|
||||
return "failure"
|
||||
case CommitStatusError:
|
||||
return "error"
|
||||
default:
|
||||
return "pending"
|
||||
}
|
||||
}
|
||||
|
||||
func (g *GitHubProvider) ListBranches(ctx context.Context, owner, repo string) ([]string, error) {
|
||||
var allBranches []string
|
||||
page := 1
|
||||
@@ -250,6 +288,19 @@ func (g *GitHubProvider) DownloadFolder(ctx context.Context, owner, repo, branch
|
||||
return nil
|
||||
}
|
||||
|
||||
// DownloadFile fetches a single file's raw bytes via the GitHub contents API
|
||||
// using the raw media type (works for both github.com and GHE), capped at
|
||||
// maxBytes. Returns ErrFileNotFound on a 404.
|
||||
func (g *GitHubProvider) DownloadFile(ctx context.Context, owner, repo, ref, path string, maxBytes int64) ([]byte, error) {
|
||||
p := strings.TrimPrefix(path, "/")
|
||||
fileURL := fmt.Sprintf("%s/repos/%s/%s/contents/%s?ref=%s", g.apiBase, owner, repo, p, ref)
|
||||
auth := func(r *http.Request) {
|
||||
g.setAuth(r)
|
||||
r.Header.Set("Accept", "application/vnd.github.raw+json")
|
||||
}
|
||||
return getFileBytes(ctx, g.httpClient, fileURL, maxBytes, auth)
|
||||
}
|
||||
|
||||
func (g *GitHubProvider) doGet(ctx context.Context, url string) ([]byte, error) {
|
||||
req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
|
||||
if err != nil {
|
||||
|
||||
@@ -95,6 +95,45 @@ func (g *GitLabProvider) TestConnection(ctx context.Context, owner, repo string)
|
||||
return err
|
||||
}
|
||||
|
||||
// SetCommitStatus reports a deploy status on a commit via GitLab's commit-
|
||||
// status API. GitLab's state vocabulary differs (pending/running/success/
|
||||
// failed/canceled), so failure AND error both map to "failed". The status
|
||||
// metadata (name/target_url/description) is passed as query parameters,
|
||||
// which is how GitLab's POST .../statuses/{sha} endpoint accepts them.
|
||||
func (g *GitLabProvider) SetCommitStatus(ctx context.Context, owner, repo, sha string, status CommitStatus, targetURL, description string) error {
|
||||
q := url.Values{}
|
||||
q.Set("state", gitlabState(status))
|
||||
q.Set("name", commitStatusContext)
|
||||
if targetURL != "" {
|
||||
q.Set("target_url", targetURL)
|
||||
}
|
||||
if description != "" {
|
||||
q.Set("description", truncateDescription(description))
|
||||
}
|
||||
apiURL := fmt.Sprintf("%s/projects/%s/statuses/%s?%s",
|
||||
g.apiBase, projectPath(owner, repo), url.PathEscape(sha), q.Encode())
|
||||
// No JSON body — all fields ride as query params. Reuse postJSON for
|
||||
// the SSRF-safe POST + 2xx handling; an empty body is valid here.
|
||||
if err := postJSON(ctx, g.httpClient, apiURL, nil, g.setAuth); err != nil {
|
||||
return fmt.Errorf("set commit status: %w", err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// gitlabState maps a provider-agnostic CommitStatus onto GitLab's API
|
||||
// vocabulary. GitLab has no "failure"/"error" split — both map to
|
||||
// "failed".
|
||||
func gitlabState(status CommitStatus) string {
|
||||
switch status {
|
||||
case CommitStatusSuccess:
|
||||
return "success"
|
||||
case CommitStatusFailure, CommitStatusError:
|
||||
return "failed"
|
||||
default:
|
||||
return "pending"
|
||||
}
|
||||
}
|
||||
|
||||
func (g *GitLabProvider) ListBranches(ctx context.Context, owner, repo string) ([]string, error) {
|
||||
var allBranches []string
|
||||
page := 1
|
||||
@@ -234,6 +273,22 @@ func (g *GitLabProvider) DownloadFolder(ctx context.Context, owner, repo, branch
|
||||
return nil
|
||||
}
|
||||
|
||||
// DownloadFile fetches a single file's raw bytes via GitLab's raw endpoint,
|
||||
// capped at maxBytes. Returns ErrFileNotFound on a 404. owner/repo/ref are
|
||||
// path-escaped; the file path is passed through verbatim to preserve its `/`
|
||||
// separators (a `..` segment is harmless — the bytes are only parsed in
|
||||
// memory, never written to disk, so there is no local-traversal sink).
|
||||
func (g *GitLabProvider) DownloadFile(ctx context.Context, owner, repo, ref, path string, maxBytes int64) ([]byte, error) {
|
||||
p := strings.TrimPrefix(path, "/")
|
||||
fileURL := fmt.Sprintf("%s/%s/%s/-/raw/%s/%s",
|
||||
g.rawBase,
|
||||
url.PathEscape(owner),
|
||||
url.PathEscape(repo),
|
||||
url.PathEscape(ref),
|
||||
p)
|
||||
return getFileBytes(ctx, g.httpClient, fileURL, maxBytes, g.setAuth)
|
||||
}
|
||||
|
||||
func (g *GitLabProvider) doGet(ctx context.Context, apiURL string) ([]byte, error) {
|
||||
req, err := http.NewRequestWithContext(ctx, http.MethodGet, apiURL, nil)
|
||||
if err != nil {
|
||||
|
||||
@@ -1,7 +1,9 @@
|
||||
package staticsite
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
@@ -11,6 +13,11 @@ import (
|
||||
"time"
|
||||
)
|
||||
|
||||
// ErrFileNotFound is returned by GitProvider.DownloadFile when the file is
|
||||
// absent (HTTP 404). Callers use it to distinguish "no file" (a normal,
|
||||
// non-error state for GitOps) from a genuine fetch failure.
|
||||
var ErrFileNotFound = errors.New("file not found")
|
||||
|
||||
// RepoInfo represents a repository returned by the provider's list/search API.
|
||||
type RepoInfo struct {
|
||||
Owner string `json:"owner"`
|
||||
@@ -21,6 +28,40 @@ type RepoInfo struct {
|
||||
HTMLURL string `json:"html_url"`
|
||||
}
|
||||
|
||||
// CommitStatus is the provider-agnostic deploy outcome that gets reported
// back to the git host as a commit status. Each provider implementation
// translates these values into its own API vocabulary (Gitea/GitHub use the
// same four words, GitLab collapses failure/error into "failed").
type CommitStatus string

// The four deploy outcomes a provider can be told about.
const (
	CommitStatusPending = CommitStatus("pending")
	CommitStatusSuccess = CommitStatus("success")
	CommitStatusFailure = CommitStatus("failure")
	CommitStatusError   = CommitStatus("error")
)
|
||||
|
||||
// commitStatusContext is the "context"/"name" key sent with every commit
// status, for all providers. Using one fixed key means a later deploy
// overwrites the earlier status row instead of adding another one.
const commitStatusContext string = "tinyforge"
|
||||
|
||||
// maxCommitStatusDescription caps the human-readable description, measured
// in characters (runes), so a provider can't reject the request for an
// over-long field.
const maxCommitStatusDescription = 140

// truncateDescription clamps a status description to at most
// maxCommitStatusDescription characters. When it has to cut, the last kept
// character is replaced by a single ellipsis so the result is exactly at the
// cap. The cut is made on rune boundaries, so multi-byte UTF-8 text is never
// split mid-character and a string that fits the cap is returned unchanged.
func truncateDescription(s string) string {
	// Fast path: the rune count can never exceed the byte length.
	if len(s) <= maxCommitStatusDescription {
		return s
	}
	runes := []rune(s)
	if len(runes) <= maxCommitStatusDescription {
		return s
	}
	// Reserve one character of room for the ellipsis.
	return string(runes[:maxCommitStatusDescription-1]) + "…"
}
|
||||
|
||||
// GitProvider abstracts Git hosting API operations.
|
||||
// Implementations exist for Gitea/Forgejo/Gogs, GitHub, and GitLab.
|
||||
type GitProvider interface {
|
||||
@@ -45,6 +86,18 @@ type GitProvider interface {
|
||||
|
||||
// DownloadFolder downloads all files from a folder path to a local directory.
|
||||
DownloadFolder(ctx context.Context, owner, repo, branch, folderPath, destDir string) error
|
||||
|
||||
// DownloadFile fetches a single file's bytes from a ref (branch/sha),
|
||||
// capped at maxBytes. Returns ErrFileNotFound on a 404 so callers can
|
||||
// treat an absent file as a non-error state. Used to read a small in-repo
|
||||
// config file (e.g. .tinyforge.yml) without materializing a whole tree.
|
||||
DownloadFile(ctx context.Context, owner, repo, ref, path string, maxBytes int64) ([]byte, error)
|
||||
|
||||
// SetCommitStatus reports a deploy status on a commit. Best-effort;
|
||||
// callers ignore errors beyond logging. targetURL and description are
|
||||
// optional (pass "" to omit); description is truncated to a provider-
|
||||
// safe length by the implementation.
|
||||
SetCommitStatus(ctx context.Context, owner, repo, sha string, status CommitStatus, targetURL, description string) error
|
||||
}
|
||||
|
||||
// ProviderType identifies a Git hosting provider.
|
||||
@@ -135,6 +188,74 @@ func httpGet(ctx context.Context, client *http.Client, url string) (int, error)
|
||||
return resp.StatusCode, nil
|
||||
}
|
||||
|
||||
// postJSON POSTs an already-marshalled JSON body to a provider API endpoint.
//
// authHeader, when non-nil, is invoked on the request before the JSON
// Content-Type and Accept headers are set. Any 2xx response counts as success
// (status APIs return 201 Created on Gitea/GitHub, 200/201 on GitLab).
//
// On a non-2xx response the error carries the status code only. It must NOT
// echo the response body: the deploy hook logs this error best-effort, and a
// hostile/misconfigured provider could reflect the request's auth token back
// in its body.
func postJSON(ctx context.Context, client *http.Client, url string, body []byte, authHeader func(r *http.Request)) error {
	request, err := http.NewRequestWithContext(ctx, http.MethodPost, url, bytes.NewReader(body))
	if err != nil {
		return fmt.Errorf("create request: %w", err)
	}
	if authHeader != nil {
		authHeader(request)
	}
	for _, name := range []string{"Content-Type", "Accept"} {
		request.Header.Set(name, "application/json")
	}

	response, err := client.Do(request)
	if err != nil {
		return fmt.Errorf("execute request: %w", err)
	}
	defer response.Body.Close()

	if code := response.StatusCode; code/100 != 2 {
		return fmt.Errorf("unexpected status %d", code)
	}
	return nil
}
|
||||
|
||||
// getFileBytes GETs fileURL with the caller's auth applied and returns the
|
||||
// body, enforcing a maxBytes cap. Returns ErrFileNotFound on 404; a
|
||||
// status-code-only error otherwise (it must NOT echo the response body — a
|
||||
// hostile/misconfigured provider could reflect the request's auth token back).
|
||||
func getFileBytes(ctx context.Context, client *http.Client, fileURL string, maxBytes int64, authHeader func(r *http.Request)) ([]byte, error) {
|
||||
req, err := http.NewRequestWithContext(ctx, http.MethodGet, fileURL, nil)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("create request: %w", err)
|
||||
}
|
||||
if authHeader != nil {
|
||||
authHeader(req)
|
||||
}
|
||||
|
||||
resp, err := client.Do(req)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("execute request: %w", err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
switch {
|
||||
case resp.StatusCode == http.StatusNotFound:
|
||||
return nil, ErrFileNotFound
|
||||
case resp.StatusCode != http.StatusOK:
|
||||
return nil, fmt.Errorf("unexpected status %d", resp.StatusCode)
|
||||
}
|
||||
|
||||
// Read one byte past the cap so an over-size file is detected rather than
|
||||
// silently truncated.
|
||||
data, err := io.ReadAll(io.LimitReader(resp.Body, maxBytes+1))
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("read response: %w", err)
|
||||
}
|
||||
if int64(len(data)) > maxBytes {
|
||||
return nil, fmt.Errorf("file exceeds %d byte cap", maxBytes)
|
||||
}
|
||||
return data, nil
|
||||
}
|
||||
|
||||
// downloadFileHTTP is a shared helper for downloading a file from a URL.
|
||||
func downloadFileHTTP(ctx context.Context, client *http.Client, url, localPath string, authHeader func(r *http.Request)) error {
|
||||
req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
|
||||
|
||||
@@ -50,34 +50,7 @@ func ValidateBaseURL(raw string) error {
|
||||
func NewSafeHTTPClient(timeout time.Duration) *http.Client {
|
||||
dialer := &net.Dialer{Timeout: 10 * time.Second, KeepAlive: 30 * time.Second}
|
||||
transport := &http.Transport{
|
||||
DialContext: func(ctx context.Context, network, addr string) (net.Conn, error) {
|
||||
host, port, err := net.SplitHostPort(addr)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
// If the caller passed a literal IP, skip the DNS round-trip.
|
||||
if literal := net.ParseIP(host); literal != nil {
|
||||
if reason := blockReason(literal); reason != "" {
|
||||
return nil, fmt.Errorf("%w: %s (%s)", ErrBlockedAddress, literal, reason)
|
||||
}
|
||||
return dialer.DialContext(ctx, network, addr)
|
||||
}
|
||||
ips, err := net.DefaultResolver.LookupIPAddr(ctx, host)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if len(ips) == 0 {
|
||||
return nil, fmt.Errorf("no addresses for %s", host)
|
||||
}
|
||||
for _, ip := range ips {
|
||||
if reason := blockReason(ip.IP); reason != "" {
|
||||
return nil, fmt.Errorf("%w: %s (%s)", ErrBlockedAddress, ip.IP, reason)
|
||||
}
|
||||
}
|
||||
// Bind to the first resolved IP so a rebind between resolution
|
||||
// and connect cannot redirect the request to a blocked address.
|
||||
return dialer.DialContext(ctx, network, net.JoinHostPort(ips[0].IP.String(), port))
|
||||
},
|
||||
DialContext: SafeDialContext(dialer),
|
||||
MaxIdleConns: 16,
|
||||
IdleConnTimeout: 30 * time.Second,
|
||||
TLSHandshakeTimeout: 10 * time.Second,
|
||||
@@ -85,6 +58,43 @@ func NewSafeHTTPClient(timeout time.Duration) *http.Client {
|
||||
return &http.Client{Timeout: timeout, Transport: transport}
|
||||
}
|
||||
|
||||
// SafeDialContext returns a DialContext that rejects loopback, link-local,
|
||||
// multicast, unspecified, and cloud-metadata addresses at connect time,
|
||||
// re-resolving and binding to the resolved IP so a DNS rebind between
|
||||
// resolution and connect cannot slip through. Exposed so other transports
|
||||
// (e.g. the outbound notification client) can apply the same SSRF policy
|
||||
// without duplicating it or losing their own connection-pool tuning.
|
||||
func SafeDialContext(dialer *net.Dialer) func(ctx context.Context, network, addr string) (net.Conn, error) {
|
||||
return func(ctx context.Context, network, addr string) (net.Conn, error) {
|
||||
host, port, err := net.SplitHostPort(addr)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
// If the caller passed a literal IP, skip the DNS round-trip.
|
||||
if literal := net.ParseIP(host); literal != nil {
|
||||
if reason := blockReason(literal); reason != "" {
|
||||
return nil, fmt.Errorf("%w: %s (%s)", ErrBlockedAddress, literal, reason)
|
||||
}
|
||||
return dialer.DialContext(ctx, network, addr)
|
||||
}
|
||||
ips, err := net.DefaultResolver.LookupIPAddr(ctx, host)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if len(ips) == 0 {
|
||||
return nil, fmt.Errorf("no addresses for %s", host)
|
||||
}
|
||||
for _, ip := range ips {
|
||||
if reason := blockReason(ip.IP); reason != "" {
|
||||
return nil, fmt.Errorf("%w: %s (%s)", ErrBlockedAddress, ip.IP, reason)
|
||||
}
|
||||
}
|
||||
// Bind to the first resolved IP so a rebind between resolution
|
||||
// and connect cannot redirect the request to a blocked address.
|
||||
return dialer.DialContext(ctx, network, net.JoinHostPort(ips[0].IP.String(), port))
|
||||
}
|
||||
}
|
||||
|
||||
// blockReason returns a human label for why an IP is rejected, or ""
|
||||
// if the IP is allowed. Centralized so all callers share the same
|
||||
// policy.
|
||||
@@ -92,6 +102,13 @@ func blockReason(ip net.IP) string {
|
||||
if ip == nil {
|
||||
return "nil address"
|
||||
}
|
||||
// Normalize IPv4-mapped IPv6 (::ffff:x.x.x.x) so the loopback / link-local
|
||||
// classifiers below catch them. net.IP.To4() returns the 4-byte form for
|
||||
// IPv4-mapped addresses; net's IsLoopback already handles this, but pin
|
||||
// the conversion to avoid future surprises if the std-lib semantics drift.
|
||||
if v4 := ip.To4(); v4 != nil {
|
||||
ip = v4
|
||||
}
|
||||
switch {
|
||||
case ip.IsLoopback():
|
||||
return "loopback"
|
||||
@@ -104,5 +121,22 @@ func blockReason(ip net.IP) string {
|
||||
case ip.IsMulticast():
|
||||
return "multicast"
|
||||
}
|
||||
// Cloud metadata endpoints — AWS / GCP / Azure are covered by the
|
||||
// link-local block (169.254.169.254). The rest must be enumerated.
|
||||
if metadataIPSet[ip.String()] {
|
||||
return "cloud metadata endpoint"
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
// metadataIPSet is the deny-list of well-known cloud metadata addresses that
// net.IP.IsLinkLocalUnicast does NOT already cover. Keys are the textual IP
// form. Supporting a new provider is a one-line addition here; the policy
// shape stays unchanged.
var metadataIPSet = map[string]bool{
	"100.100.100.200": true, // Alibaba Cloud ECS metadata
	"192.0.0.192":     true, // Oracle Cloud Infrastructure metadata
	"fd00:ec2::254":   true, // AWS IMDS over IPv6 (ULA, not link-local, so it must be listed)
}
|
||||
|
||||
@@ -234,17 +234,17 @@ func (c *Collector) sampleAll(ctx context.Context, targets []target) []store.Con
|
||||
found := make([]bool, len(targets))
|
||||
|
||||
var wg sync.WaitGroup
|
||||
loop:
|
||||
for i, t := range targets {
|
||||
// Acquire the semaphore in the parent loop so ctx cancellation
|
||||
// short-circuits the queue rather than spawning goroutines that
|
||||
// block on an unreachable slot.
|
||||
// block on an unreachable slot. The labelled break exits the for
|
||||
// loop directly; a bare `break` inside `select` would only break
|
||||
// the select and let the loop continue.
|
||||
select {
|
||||
case sem <- struct{}{}:
|
||||
case <-ctx.Done():
|
||||
break
|
||||
}
|
||||
if ctx.Err() != nil {
|
||||
break
|
||||
break loop
|
||||
}
|
||||
wg.Add(1)
|
||||
go func(i int, t target) {
|
||||
|
||||
@@ -2,6 +2,7 @@ package store
|
||||
|
||||
import (
|
||||
"database/sql"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"strings"
|
||||
@@ -9,6 +10,22 @@ import (
|
||||
"github.com/google/uuid"
|
||||
)
|
||||
|
||||
// validateExtraJSON guards the extra_json column against documents that are
// not syntactically valid JSON. The codemap
// (docs/CODEMAPS/container-extra-json.md) says readers tolerate unknown keys,
// but that only holds when the value parses at all. A buggy plugin writing
// garbage such as `not json` would silently break every reader, with no
// schema-level check to catch it. Checking at the store boundary keeps the
// invariant cheap and obvious.
//
// The empty string is accepted, and so is any well-formed JSON value; the
// check is on syntax only, not on the document being an object.
func validateExtraJSON(v string) error {
	if v == "" || json.Valid([]byte(v)) {
		return nil
	}
	return fmt.Errorf("extra_json: not valid JSON (%d bytes)", len(v))
}
|
||||
|
||||
// containerColumns is the canonical column list for `containers` queries.
|
||||
// stage_id is populated by the deployer for project containers (so ListProxyRoutes
|
||||
// survives stage renames) and left empty for stacks and sites.
|
||||
@@ -42,6 +59,9 @@ func (s *Store) CreateContainer(c Container) (Container, error) {
|
||||
if c.ExtraJSON == "" {
|
||||
c.ExtraJSON = "{}"
|
||||
}
|
||||
if err := validateExtraJSON(c.ExtraJSON); err != nil {
|
||||
return Container{}, err
|
||||
}
|
||||
|
||||
_, err := s.db.Exec(
|
||||
`INSERT INTO containers (`+containerColumns+`)
|
||||
@@ -77,6 +97,9 @@ func (s *Store) UpsertContainer(c Container) error {
|
||||
if c.ExtraJSON == "" {
|
||||
c.ExtraJSON = "{}"
|
||||
}
|
||||
if err := validateExtraJSON(c.ExtraJSON); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// SQLite UPSERT — INSERT...ON CONFLICT(id) DO UPDATE.
|
||||
_, err := s.db.Exec(
|
||||
@@ -129,6 +152,9 @@ func (s *Store) ReconcileContainer(c Container) error {
|
||||
if c.ExtraJSON == "" {
|
||||
c.ExtraJSON = "{}"
|
||||
}
|
||||
if err := validateExtraJSON(c.ExtraJSON); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// extra_json is deliberately NOT in the ON CONFLICT SET clause: the
|
||||
// reconciler can't observe per-face route IDs from Docker, and
|
||||
@@ -321,6 +347,9 @@ func (s *Store) UpdateContainer(c Container) error {
|
||||
if c.ExtraJSON == "" {
|
||||
c.ExtraJSON = "{}"
|
||||
}
|
||||
if err := validateExtraJSON(c.ExtraJSON); err != nil {
|
||||
return err
|
||||
}
|
||||
result, err := s.db.Exec(
|
||||
`UPDATE containers SET workload_id=?, workload_kind=?, role=?, stage_id=?, container_id=?,
|
||||
image_ref=?, image_tag=?, host=?, state=?, port=?,
|
||||
|
||||
@@ -0,0 +1,123 @@
|
||||
package store
|
||||
|
||||
import (
|
||||
"database/sql"
|
||||
"errors"
|
||||
"fmt"
|
||||
)
|
||||
|
||||
// InsertDeployHistory appends one row to the per-workload deploy ledger.
|
||||
// Callers (the deployer choke point) treat this as best-effort: a failure
|
||||
// here must never fail an otherwise-successful deploy. Error is expected to
|
||||
// be a fixed, secret-free marker — never the raw source error.
|
||||
func (s *Store) InsertDeployHistory(e DeployHistoryEntry) (DeployHistoryEntry, error) {
|
||||
if e.StartedAt == "" {
|
||||
e.StartedAt = Now()
|
||||
}
|
||||
if e.FinishedAt == "" {
|
||||
e.FinishedAt = Now()
|
||||
}
|
||||
res, err := s.db.Exec(
|
||||
`INSERT INTO deploy_history
|
||||
(workload_id, source_kind, reference, reason, triggered_by,
|
||||
note, outcome, error, started_at, finished_at)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`,
|
||||
e.WorkloadID, e.SourceKind, e.Reference, e.Reason, e.TriggeredBy,
|
||||
e.Note, e.Outcome, e.Error, e.StartedAt, e.FinishedAt,
|
||||
)
|
||||
if err != nil {
|
||||
return DeployHistoryEntry{}, fmt.Errorf("insert deploy history: %w", err)
|
||||
}
|
||||
id, err := res.LastInsertId()
|
||||
if err != nil {
|
||||
return DeployHistoryEntry{}, fmt.Errorf("get deploy history id: %w", err)
|
||||
}
|
||||
e.ID = id
|
||||
return e, nil
|
||||
}
|
||||
|
||||
// ListDeployHistory returns a workload's ledger newest-first. limit/offset
|
||||
// are assumed pre-clamped by the API layer; a non-positive limit falls back
|
||||
// to a sane default so a bad query can't return the whole table.
|
||||
func (s *Store) ListDeployHistory(workloadID string, limit, offset int) ([]DeployHistoryEntry, error) {
|
||||
if limit <= 0 {
|
||||
limit = 50
|
||||
}
|
||||
if offset < 0 {
|
||||
offset = 0
|
||||
}
|
||||
rows, err := s.db.Query(
|
||||
`SELECT id, workload_id, source_kind, reference, reason, triggered_by,
|
||||
note, outcome, error, started_at, finished_at
|
||||
FROM deploy_history
|
||||
WHERE workload_id = ?
|
||||
ORDER BY id DESC
|
||||
LIMIT ? OFFSET ?`,
|
||||
workloadID, limit, offset,
|
||||
)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("query deploy history: %w", err)
|
||||
}
|
||||
defer rows.Close()
|
||||
|
||||
out := make([]DeployHistoryEntry, 0, limit)
|
||||
for rows.Next() {
|
||||
var e DeployHistoryEntry
|
||||
if err := rows.Scan(
|
||||
&e.ID, &e.WorkloadID, &e.SourceKind, &e.Reference, &e.Reason,
|
||||
&e.TriggeredBy, &e.Note, &e.Outcome, &e.Error, &e.StartedAt, &e.FinishedAt,
|
||||
); err != nil {
|
||||
return nil, fmt.Errorf("scan deploy history: %w", err)
|
||||
}
|
||||
out = append(out, e)
|
||||
}
|
||||
return out, rows.Err()
|
||||
}
|
||||
|
||||
// GetDeployHistory fetches one ledger row by id, or ErrNotFound. The
|
||||
// rollback handler uses this to resolve the pinned reference to replay.
|
||||
func (s *Store) GetDeployHistory(id int64) (DeployHistoryEntry, error) {
|
||||
row := s.db.QueryRow(
|
||||
`SELECT id, workload_id, source_kind, reference, reason, triggered_by,
|
||||
note, outcome, error, started_at, finished_at
|
||||
FROM deploy_history WHERE id = ?`, id,
|
||||
)
|
||||
var e DeployHistoryEntry
|
||||
err := row.Scan(
|
||||
&e.ID, &e.WorkloadID, &e.SourceKind, &e.Reference, &e.Reason,
|
||||
&e.TriggeredBy, &e.Note, &e.Outcome, &e.Error, &e.StartedAt, &e.FinishedAt,
|
||||
)
|
||||
if errors.Is(err, sql.ErrNoRows) {
|
||||
return DeployHistoryEntry{}, fmt.Errorf("deploy history %d: %w", id, ErrNotFound)
|
||||
}
|
||||
if err != nil {
|
||||
return DeployHistoryEntry{}, fmt.Errorf("scan deploy history: %w", err)
|
||||
}
|
||||
return e, nil
|
||||
}
|
||||
|
||||
// PruneDeployHistory keeps only the newest `keep` rows for a workload,
|
||||
// deleting older ones. Bounds unbounded growth on hot workloads. Best-
|
||||
// effort and id-monotonic (newer rows always have larger ids), so it
|
||||
// deletes everything below the keep-th id. A non-positive keep is treated
|
||||
// as "keep a sane default" rather than "delete everything".
|
||||
func (s *Store) PruneDeployHistory(workloadID string, keep int) error {
|
||||
if keep <= 0 {
|
||||
keep = 50
|
||||
}
|
||||
_, err := s.db.Exec(
|
||||
`DELETE FROM deploy_history
|
||||
WHERE workload_id = ?
|
||||
AND id NOT IN (
|
||||
SELECT id FROM deploy_history
|
||||
WHERE workload_id = ?
|
||||
ORDER BY id DESC
|
||||
LIMIT ?
|
||||
)`,
|
||||
workloadID, workloadID, keep,
|
||||
)
|
||||
if err != nil {
|
||||
return fmt.Errorf("prune deploy history: %w", err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
@@ -0,0 +1,133 @@
|
||||
package store
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func seedWorkload(t *testing.T, s *Store, name string) Workload {
|
||||
t.Helper()
|
||||
w, err := s.CreateWorkload(Workload{Kind: "project", RefID: name, Name: name})
|
||||
if err != nil {
|
||||
t.Fatalf("CreateWorkload(%s): %v", name, err)
|
||||
}
|
||||
return w
|
||||
}
|
||||
|
||||
func TestDeployHistory_InsertListGet(t *testing.T) {
|
||||
s := newTestStore(t)
|
||||
w := seedWorkload(t, s, "app1")
|
||||
|
||||
first, err := s.InsertDeployHistory(DeployHistoryEntry{
|
||||
WorkloadID: w.ID, SourceKind: "image", Reference: "v1",
|
||||
Reason: "manual", TriggeredBy: "admin", Outcome: "success",
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("InsertDeployHistory: %v", err)
|
||||
}
|
||||
if first.ID == 0 {
|
||||
t.Fatal("expected non-zero id")
|
||||
}
|
||||
if first.StartedAt == "" || first.FinishedAt == "" {
|
||||
t.Fatal("expected timestamps to be defaulted")
|
||||
}
|
||||
|
||||
second, _ := s.InsertDeployHistory(DeployHistoryEntry{
|
||||
WorkloadID: w.ID, SourceKind: "image", Reference: "v2",
|
||||
Reason: "registry-push", Outcome: "success",
|
||||
})
|
||||
|
||||
list, err := s.ListDeployHistory(w.ID, 10, 0)
|
||||
if err != nil {
|
||||
t.Fatalf("ListDeployHistory: %v", err)
|
||||
}
|
||||
if len(list) != 2 {
|
||||
t.Fatalf("expected 2 rows, got %d", len(list))
|
||||
}
|
||||
// Newest-first ordering.
|
||||
if list[0].ID != second.ID || list[1].ID != first.ID {
|
||||
t.Fatalf("expected newest-first ordering, got %d then %d", list[0].ID, list[1].ID)
|
||||
}
|
||||
|
||||
got, err := s.GetDeployHistory(first.ID)
|
||||
if err != nil {
|
||||
t.Fatalf("GetDeployHistory: %v", err)
|
||||
}
|
||||
if got.Reference != "v1" || got.SourceKind != "image" {
|
||||
t.Fatalf("unexpected row: %+v", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestDeployHistory_GetNotFound(t *testing.T) {
|
||||
s := newTestStore(t)
|
||||
_, err := s.GetDeployHistory(999)
|
||||
if !errors.Is(err, ErrNotFound) {
|
||||
t.Fatalf("expected ErrNotFound, got %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestDeployHistory_ListScopedToWorkload(t *testing.T) {
|
||||
s := newTestStore(t)
|
||||
a := seedWorkload(t, s, "a")
|
||||
b := seedWorkload(t, s, "b")
|
||||
s.InsertDeployHistory(DeployHistoryEntry{WorkloadID: a.ID, Outcome: "success"})
|
||||
s.InsertDeployHistory(DeployHistoryEntry{WorkloadID: b.ID, Outcome: "success"})
|
||||
|
||||
list, _ := s.ListDeployHistory(a.ID, 10, 0)
|
||||
if len(list) != 1 || list[0].WorkloadID != a.ID {
|
||||
t.Fatalf("expected only workload a's rows, got %+v", list)
|
||||
}
|
||||
}
|
||||
|
||||
func TestDeployHistory_Pagination(t *testing.T) {
|
||||
s := newTestStore(t)
|
||||
w := seedWorkload(t, s, "paged")
|
||||
for i := 0; i < 5; i++ {
|
||||
s.InsertDeployHistory(DeployHistoryEntry{WorkloadID: w.ID, Outcome: "success"})
|
||||
}
|
||||
page1, _ := s.ListDeployHistory(w.ID, 2, 0)
|
||||
page2, _ := s.ListDeployHistory(w.ID, 2, 2)
|
||||
if len(page1) != 2 || len(page2) != 2 {
|
||||
t.Fatalf("expected 2 per page, got %d and %d", len(page1), len(page2))
|
||||
}
|
||||
if page1[0].ID == page2[0].ID {
|
||||
t.Fatal("expected distinct rows across pages")
|
||||
}
|
||||
}
|
||||
|
||||
func TestDeployHistory_Prune(t *testing.T) {
|
||||
s := newTestStore(t)
|
||||
w := seedWorkload(t, s, "noisy")
|
||||
for i := 0; i < 10; i++ {
|
||||
s.InsertDeployHistory(DeployHistoryEntry{WorkloadID: w.ID, Outcome: "success"})
|
||||
}
|
||||
if err := s.PruneDeployHistory(w.ID, 3); err != nil {
|
||||
t.Fatalf("PruneDeployHistory: %v", err)
|
||||
}
|
||||
list, _ := s.ListDeployHistory(w.ID, 100, 0)
|
||||
if len(list) != 3 {
|
||||
t.Fatalf("expected 3 rows after prune, got %d", len(list))
|
||||
}
|
||||
// Prune keeps the newest rows.
|
||||
all, _ := s.ListDeployHistory(w.ID, 100, 0)
|
||||
for i := 1; i < len(all); i++ {
|
||||
if all[i-1].ID < all[i].ID {
|
||||
t.Fatal("expected newest-first after prune")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestDeployHistory_CascadeOnWorkloadDelete(t *testing.T) {
|
||||
s := newTestStore(t)
|
||||
w := seedWorkload(t, s, "doomed")
|
||||
s.InsertDeployHistory(DeployHistoryEntry{WorkloadID: w.ID, Outcome: "success"})
|
||||
s.InsertDeployHistory(DeployHistoryEntry{WorkloadID: w.ID, Outcome: "failure"})
|
||||
|
||||
if err := s.DeleteWorkload(w.ID); err != nil {
|
||||
t.Fatalf("DeleteWorkload: %v", err)
|
||||
}
|
||||
list, _ := s.ListDeployHistory(w.ID, 100, 0)
|
||||
if len(list) != 0 {
|
||||
t.Fatalf("expected history removed with workload, got %d rows", len(list))
|
||||
}
|
||||
}
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user