refactor(workload): finalize containers index + post-review hardening
Wraps up the workload refactor with the fixes that came out of the multi-agent code review (see docs/plans/workload-refactor.md "What actually shipped"). Backend: - store.ReconcileContainer: separate write path so the 30s reconciler tick no longer overwrites deployer-owned fields (subdomain, proxy_route_id, npm_proxy_id, image_tag). - Container.stage_id column + index; ListProxyRoutes / ListContainersByStageID join via stage_id (survives stage rename), with legacy fallback to (project_id, role=stage_name). - Reconciler: workload-existence check (rejects forged tinyforge.workload.id labels), skips inventing project-kind rows, child-context cancel before wg.Wait() on shutdown. - Transactional CRUD across projects / stacks / static_sites: parent UPDATE and workload sync land in one transaction so secret rotations are durable. - Webhook routing reads exclusively through workloads.webhook_secret; legacy GetProjectByWebhookSecret / GetStaticSiteByWebhookSecret fallback removed. - store.GetStackByComposeProjectName + indexed lookup (no more full-table stack scan per compose container per tick). - store.ListMissingSweepRows: filtered query for the missing-sweep. - /api/instances/* handlers verify (workload_id, role) match URL (project_id, stage_name) before mutating — closes the cross-project hijack the security review flagged. - extra_json no longer referenced from Go (column kept on disk for now). Frontend: - WorkloadContainers.svelte: generic detail-page panel reusable by stack and site detail pages. - Containers page polish: client-side kind/state filters over an unfiltered fetch, URL-synced filters, race-safe loads via sequence number, EN+RU i18n, sidebar counter via navCounts.containers. Misc: - scripts/dev-server.sh: tolerate empty netstat grep result. - .gitignore: ignore docker-watcher binaries, .claude/worktrees/, .facts-sync.json.
This commit is contained in:
@@ -1,10 +1,14 @@
|
||||
// Package reconciler keeps the normalized containers index in sync with the
|
||||
// Docker daemon. It runs on a tick (and one-shot at boot) — for every
|
||||
// Tinyforge-managed container in `docker ps`, it dispatches to a workload by
|
||||
// labels and upserts a Container row. Rows whose Docker container ID is no
|
||||
// longer present are flipped to state='missing'.
|
||||
// labels and writes a Container row through ReconcileContainer (which only
|
||||
// touches Docker-derived fields on conflict, never deployer-owned columns
|
||||
// like subdomain / proxy_route_id / npm_proxy_id / image_tag / stage_id).
|
||||
// Rows whose Docker container ID is no longer present are flipped to
|
||||
// state='missing'.
|
||||
//
|
||||
// Dispatch precedence:
|
||||
// Dispatch precedence (a container with multiple matching labels is dispatched
|
||||
// by the first match in this order):
|
||||
// 1. tinyforge.workload.id label (canonical, new)
|
||||
// 2. tinyforge.static-site label (legacy site — joins via static_sites)
|
||||
// 3. com.docker.compose.project (stack — joins via Stack.ComposeProjectName)
|
||||
@@ -16,6 +20,7 @@ package reconciler
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"log/slog"
|
||||
"strings"
|
||||
"sync"
|
||||
@@ -38,8 +43,9 @@ type Reconciler struct {
|
||||
docker DockerLister
|
||||
interval time.Duration
|
||||
|
||||
stop chan struct{}
|
||||
wg sync.WaitGroup
|
||||
stop chan struct{}
|
||||
cancel context.CancelFunc // populated in Start; invoked by Stop so an in-flight tick is unblocked.
|
||||
wg sync.WaitGroup
|
||||
}
|
||||
|
||||
// New constructs a Reconciler. interval is the tick period; values <=0 fall
|
||||
@@ -62,15 +68,28 @@ func New(st *store.Store, dockerClient DockerLister, interval time.Duration) *Re
|
||||
|
||||
// Start kicks off the background reconciliation loop. Runs one tick
|
||||
// immediately so startup populates the index without waiting for the first
|
||||
// timer fire. Idempotent: calling Start twice is a programming error.
|
||||
// timer fire. The provided context is wrapped with a child cancel func so
|
||||
// Stop() can unblock an in-flight Docker call.
|
||||
func (r *Reconciler) Start(ctx context.Context) {
|
||||
ctx, cancel := context.WithCancel(ctx)
|
||||
r.cancel = cancel
|
||||
r.wg.Add(1)
|
||||
go r.loop(ctx)
|
||||
}
|
||||
|
||||
// Stop signals the loop to exit and waits for the in-flight tick to finish.
|
||||
// Stop signals the loop to exit. Cancels the child context FIRST so any
|
||||
// in-flight `docker ps` (which can hang on a stuck daemon) returns promptly,
|
||||
// then waits for the goroutine to finish. Idempotent.
|
||||
func (r *Reconciler) Stop() {
|
||||
close(r.stop)
|
||||
if r.cancel != nil {
|
||||
r.cancel()
|
||||
}
|
||||
select {
|
||||
case <-r.stop:
|
||||
// already closed
|
||||
default:
|
||||
close(r.stop)
|
||||
}
|
||||
r.wg.Wait()
|
||||
}
|
||||
|
||||
@@ -84,8 +103,12 @@ func (r *Reconciler) ReconcileOnce(ctx context.Context) error {
|
||||
}
|
||||
seen := make(map[string]struct{}, len(items)) // container row IDs we touched
|
||||
|
||||
// Build a per-pass cache of compose project name → stack ID so we don't
|
||||
// hit the DB once per compose container.
|
||||
stackByCompose := map[string]store.Stack{}
|
||||
|
||||
for _, item := range items {
|
||||
rowID := r.upsertFromItem(ctx, item)
|
||||
rowID := r.upsertFromItem(item, stackByCompose)
|
||||
if rowID != "" {
|
||||
seen[rowID] = struct{}{}
|
||||
}
|
||||
@@ -121,7 +144,7 @@ func (r *Reconciler) loop(ctx context.Context) {
|
||||
|
||||
// upsertFromItem dispatches one container to its workload and writes the
|
||||
// Container row. Returns the row ID on success or "" if no dispatch matched.
|
||||
func (r *Reconciler) upsertFromItem(ctx context.Context, item docker.ReconcileItem) string {
|
||||
func (r *Reconciler) upsertFromItem(item docker.ReconcileItem, stackCache map[string]store.Stack) string {
|
||||
if id := item.Labels[docker.LabelWorkloadID]; id != "" {
|
||||
return r.upsertByWorkloadLabel(item, id)
|
||||
}
|
||||
@@ -129,28 +152,86 @@ func (r *Reconciler) upsertFromItem(ctx context.Context, item docker.ReconcileIt
|
||||
return r.upsertBySiteLabel(item, siteID)
|
||||
}
|
||||
if cp := item.Labels["com.docker.compose.project"]; cp != "" && strings.HasPrefix(cp, "tinyforge-") {
|
||||
return r.upsertByComposeProject(item, cp)
|
||||
return r.upsertByComposeProject(item, cp, stackCache)
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
// upsertByWorkloadLabel — canonical path. The row may already exist with a
|
||||
// deployer-assigned UUID (project deploys do this so each blue-green slot
|
||||
// has a stable handle); look it up by docker container ID first and fall
|
||||
// back to the deterministic workloadID:role key.
|
||||
// upsertByWorkloadLabel — canonical path. Project containers are owned by the
|
||||
// deployer: the deployer pre-creates the row with a per-instance UUID and
|
||||
// proxy/subdomain metadata. The reconciler resolves the existing row by
|
||||
// docker container ID and only touches Docker-derived fields. If no existing
|
||||
// row matches and the kind is project, we skip the upsert — inventing a
|
||||
// deterministic-ID row would race with the deployer's UUID rows for stages
|
||||
// with MaxInstances > 1, leaving ghost rows behind.
|
||||
//
|
||||
// Untrusted-label defense: a workload_id label that doesn't resolve to a
|
||||
// known workload row is silently ignored. Anyone with Docker socket access
|
||||
// could otherwise spawn a container with a forged label and steal the
|
||||
// canonical slot for an existing workload.
|
||||
func (r *Reconciler) upsertByWorkloadLabel(item docker.ReconcileItem, workloadID string) string {
|
||||
w, err := r.store.GetWorkloadByID(workloadID)
|
||||
if err != nil {
|
||||
// Forged or stale label — log once at debug; tick rate keeps logs quiet.
|
||||
slog.Debug("reconciler: unknown workload_id label", "workload_id", workloadID, "container_id", item.ID)
|
||||
return ""
|
||||
}
|
||||
role := item.Labels[docker.LabelRole]
|
||||
kind := item.Labels[docker.LabelWorkloadKind]
|
||||
rowID := workloadIDRow(workloadID, kind, role, item.ID)
|
||||
if existing, err := r.store.GetContainerByDockerID(item.ID); err == nil {
|
||||
rowID = existing.ID
|
||||
if kind != "" && kind != w.Kind {
|
||||
slog.Warn("reconciler: workload kind mismatch", "label_kind", kind, "stored_kind", w.Kind, "workload_id", workloadID)
|
||||
return ""
|
||||
}
|
||||
if kind == "" {
|
||||
kind = w.Kind
|
||||
}
|
||||
|
||||
// Resolve to existing row by Docker container ID.
|
||||
existing, lookupErr := r.store.GetContainerByDockerID(item.ID)
|
||||
if lookupErr == nil {
|
||||
port := 0
|
||||
if len(item.Ports) > 0 {
|
||||
port = int(item.Ports[0])
|
||||
}
|
||||
if err := r.store.ReconcileContainer(store.Container{
|
||||
ID: existing.ID,
|
||||
WorkloadID: workloadID,
|
||||
WorkloadKind: kind,
|
||||
Role: role,
|
||||
ContainerID: item.ID,
|
||||
ImageRef: item.Image,
|
||||
Host: "local",
|
||||
State: normalizeState(item.State),
|
||||
Port: port,
|
||||
LastSeenAt: store.Now(),
|
||||
}); err != nil {
|
||||
slog.Warn("reconciler: reconcile by workload label", "container_id", item.ID, "error", err)
|
||||
return ""
|
||||
}
|
||||
return existing.ID
|
||||
}
|
||||
if !errors.Is(lookupErr, store.ErrNotFound) {
|
||||
slog.Warn("reconciler: lookup container by docker id", "container_id", item.ID, "error", lookupErr)
|
||||
return ""
|
||||
}
|
||||
|
||||
// No row yet. For project workloads, the deployer is the authoritative
|
||||
// writer — wait for the deployer to create the row rather than
|
||||
// inventing one with a deterministic key (which would collide with
|
||||
// MaxInstances > 1 deploys).
|
||||
if kind == string(store.WorkloadKindProject) {
|
||||
return ""
|
||||
}
|
||||
|
||||
// Site/stack reach this branch only when their kind-specific dispatcher
|
||||
// hasn't run yet (e.g. boot tick before site row is registered). The
|
||||
// site/stack dispatchers below own their own deterministic IDs.
|
||||
rowID := workloadIDRow(workloadID, kind, role, item.ID)
|
||||
port := 0
|
||||
if len(item.Ports) > 0 {
|
||||
port = int(item.Ports[0])
|
||||
}
|
||||
if err := r.store.UpsertContainer(store.Container{
|
||||
if err := r.store.ReconcileContainer(store.Container{
|
||||
ID: rowID,
|
||||
WorkloadID: workloadID,
|
||||
WorkloadKind: kind,
|
||||
@@ -162,7 +243,7 @@ func (r *Reconciler) upsertByWorkloadLabel(item docker.ReconcileItem, workloadID
|
||||
Port: port,
|
||||
LastSeenAt: store.Now(),
|
||||
}); err != nil {
|
||||
slog.Warn("reconciler: upsert by workload label", "container_id", item.ID, "error", err)
|
||||
slog.Warn("reconciler: reconcile by workload label (insert)", "container_id", item.ID, "error", err)
|
||||
return ""
|
||||
}
|
||||
return rowID
|
||||
@@ -178,7 +259,7 @@ func (r *Reconciler) upsertBySiteLabel(item docker.ReconcileItem, siteID string)
|
||||
if len(item.Ports) > 0 {
|
||||
port = int(item.Ports[0])
|
||||
}
|
||||
if err := r.store.UpsertContainer(store.Container{
|
||||
if err := r.store.ReconcileContainer(store.Container{
|
||||
ID: rowID,
|
||||
WorkloadID: w.ID,
|
||||
WorkloadKind: string(store.WorkloadKindSite),
|
||||
@@ -190,15 +271,24 @@ func (r *Reconciler) upsertBySiteLabel(item docker.ReconcileItem, siteID string)
|
||||
Port: port,
|
||||
LastSeenAt: store.Now(),
|
||||
}); err != nil {
|
||||
slog.Warn("reconciler: upsert by site label", "container_id", item.ID, "error", err)
|
||||
slog.Warn("reconciler: reconcile by site label", "container_id", item.ID, "error", err)
|
||||
return ""
|
||||
}
|
||||
return rowID
|
||||
}
|
||||
|
||||
func (r *Reconciler) upsertByComposeProject(item docker.ReconcileItem, composeProject string) string {
|
||||
stack, err := r.findStackByComposeProject(composeProject)
|
||||
if err != nil {
|
||||
func (r *Reconciler) upsertByComposeProject(item docker.ReconcileItem, composeProject string, cache map[string]store.Stack) string {
|
||||
stack, ok := cache[composeProject]
|
||||
if !ok {
|
||||
st, err := r.store.GetStackByComposeProjectName(composeProject)
|
||||
if err != nil {
|
||||
cache[composeProject] = store.Stack{} // negative cache for the rest of the pass
|
||||
return ""
|
||||
}
|
||||
stack = st
|
||||
cache[composeProject] = st
|
||||
}
|
||||
if stack.ID == "" {
|
||||
return ""
|
||||
}
|
||||
w, err := r.store.GetWorkloadByRef(store.WorkloadKindStack, stack.ID)
|
||||
@@ -214,7 +304,7 @@ func (r *Reconciler) upsertByComposeProject(item docker.ReconcileItem, composePr
|
||||
if len(item.Ports) > 0 {
|
||||
port = int(item.Ports[0])
|
||||
}
|
||||
if err := r.store.UpsertContainer(store.Container{
|
||||
if err := r.store.ReconcileContainer(store.Container{
|
||||
ID: rowID,
|
||||
WorkloadID: w.ID,
|
||||
WorkloadKind: string(store.WorkloadKindStack),
|
||||
@@ -226,66 +316,42 @@ func (r *Reconciler) upsertByComposeProject(item docker.ReconcileItem, composePr
|
||||
Port: port,
|
||||
LastSeenAt: store.Now(),
|
||||
}); err != nil {
|
||||
slog.Warn("reconciler: upsert by compose project", "container_id", item.ID, "error", err)
|
||||
slog.Warn("reconciler: reconcile by compose project", "container_id", item.ID, "error", err)
|
||||
return ""
|
||||
}
|
||||
return rowID
|
||||
}
|
||||
|
||||
// findStackByComposeProject scans all stacks for a matching ComposeProjectName.
|
||||
// Linear; the stack count is small in practice.
|
||||
func (r *Reconciler) findStackByComposeProject(composeProject string) (store.Stack, error) {
|
||||
stacks, err := r.store.GetAllStacks()
|
||||
if err != nil {
|
||||
return store.Stack{}, err
|
||||
}
|
||||
for _, s := range stacks {
|
||||
if s.ComposeProjectName == composeProject {
|
||||
return s, nil
|
||||
}
|
||||
}
|
||||
return store.Stack{}, store.ErrNotFound
|
||||
}
|
||||
|
||||
// markMissingRows flips state to 'missing' for any container row whose Docker
|
||||
// container ID was not seen in this pass. Rows with empty container_id are
|
||||
// skipped — the deployer creates them ahead of `docker create` so they're
|
||||
// transient and shouldn't be marked missing on a tick that races the deploy.
|
||||
// container ID was not seen in this pass. Uses ListMissingSweepRows to scan
|
||||
// only rows that are bound to a real container and not already missing.
|
||||
func (r *Reconciler) markMissingRows(seen map[string]struct{}) {
|
||||
rows, err := r.store.ListContainers(store.ContainerFilter{})
|
||||
rows, err := r.store.ListMissingSweepRows()
|
||||
if err != nil {
|
||||
slog.Warn("reconciler: list containers for missing-sweep", "error", err)
|
||||
slog.Warn("reconciler: list rows for missing-sweep", "error", err)
|
||||
return
|
||||
}
|
||||
for _, row := range rows {
|
||||
if _, ok := seen[row.ID]; ok {
|
||||
continue
|
||||
}
|
||||
if row.ContainerID == "" {
|
||||
continue // never bound to a real container yet
|
||||
}
|
||||
if row.State == "missing" {
|
||||
continue // already marked
|
||||
}
|
||||
if err := r.store.MarkContainerMissing(row.ID); err != nil {
|
||||
slog.Warn("reconciler: mark missing", "row_id", row.ID, "error", err)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// workloadIDRow picks the row ID for a workload-labelled container.
|
||||
// Stack rows use the deterministic workloadID:role pattern; sites use
|
||||
// workloadID:site. Project rows have a per-deploy UUID assigned by the
|
||||
// deployer and ALSO carry the role label (= stage name), so the same
|
||||
// pattern resolves to the same row across deployer + reconciler upserts.
|
||||
// workloadIDRow picks the row ID for a non-project workload-labelled
|
||||
// container that has no existing row. Stack rows use workloadID:role; sites
|
||||
// use workloadID:site. Project rows are never invented here — see
|
||||
// upsertByWorkloadLabel for the rationale.
|
||||
func workloadIDRow(workloadID, kind, role, containerID string) string {
|
||||
if role != "" {
|
||||
return workloadID + ":" + role
|
||||
}
|
||||
if kind == string(store.WorkloadKindSite) {
|
||||
return workloadID + ":site"
|
||||
}
|
||||
// Last-resort fallback: container ID. Uncommon path.
|
||||
if role != "" {
|
||||
return workloadID + ":" + role
|
||||
}
|
||||
return workloadID + ":" + containerID
|
||||
}
|
||||
|
||||
|
||||
@@ -188,6 +188,122 @@ func TestReconcileIgnoresUnmanagedContainers(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
// TestReconcileDoesNotClobberDeployerFields guards against the regression where
|
||||
// the reconciler's upsert wiped subdomain / proxy_route_id / npm_proxy_id /
|
||||
// image_tag / stage_id on every tick because those columns were included in
|
||||
// the ON CONFLICT DO UPDATE SET clause but never populated by the reconciler.
|
||||
func TestReconcileDoesNotClobberDeployerFields(t *testing.T) {
|
||||
st := newTestStore(t)
|
||||
|
||||
// Project workload — exercises the path most affected by the regression
|
||||
// (proxies, blue-green slots, image-tag-based stale detection).
|
||||
project, err := st.CreateProject(store.Project{Name: "p", Image: "nginx"})
|
||||
if err != nil {
|
||||
t.Fatalf("CreateProject: %v", err)
|
||||
}
|
||||
w, _ := st.GetWorkloadByRef(store.WorkloadKindProject, project.ID)
|
||||
|
||||
// Deployer wrote the row with proxy / subdomain / image_tag / stage_id.
|
||||
deployerRow := store.Container{
|
||||
ID: "deploy-uuid-1", WorkloadID: w.ID, WorkloadKind: "project",
|
||||
Role: "prod", StageID: "stage-prod-id", ContainerID: "docker-aaa",
|
||||
ImageRef: "nginx:1.27", ImageTag: "1.27", State: "running", Port: 8080,
|
||||
Subdomain: "prod-p", ProxyRouteID: "route-42", NpmProxyID: 7,
|
||||
}
|
||||
if err := st.UpsertContainer(deployerRow); err != nil {
|
||||
t.Fatalf("seed deployer row: %v", err)
|
||||
}
|
||||
|
||||
// Reconciler sees the same docker container — no proxy fields in labels.
|
||||
fake := &fakeDocker{items: []docker.ReconcileItem{{
|
||||
ID: "docker-aaa", Image: "nginx:1.27", State: "running",
|
||||
Labels: map[string]string{
|
||||
docker.LabelManaged: "true",
|
||||
docker.LabelWorkloadID: w.ID,
|
||||
docker.LabelWorkloadKind: "project",
|
||||
docker.LabelRole: "prod",
|
||||
},
|
||||
Ports: []uint16{8080},
|
||||
}}}
|
||||
r := New(st, fake, 0)
|
||||
if err := r.ReconcileOnce(context.Background()); err != nil {
|
||||
t.Fatalf("ReconcileOnce: %v", err)
|
||||
}
|
||||
|
||||
got, _ := st.GetContainerByID("deploy-uuid-1")
|
||||
if got.Subdomain != "prod-p" {
|
||||
t.Fatalf("subdomain wiped: %q", got.Subdomain)
|
||||
}
|
||||
if got.ProxyRouteID != "route-42" {
|
||||
t.Fatalf("proxy_route_id wiped: %q", got.ProxyRouteID)
|
||||
}
|
||||
if got.NpmProxyID != 7 {
|
||||
t.Fatalf("npm_proxy_id wiped: %d", got.NpmProxyID)
|
||||
}
|
||||
if got.ImageTag != "1.27" {
|
||||
t.Fatalf("image_tag wiped: %q", got.ImageTag)
|
||||
}
|
||||
if got.StageID != "stage-prod-id" {
|
||||
t.Fatalf("stage_id wiped: %q", got.StageID)
|
||||
}
|
||||
}
|
||||
|
||||
// TestReconcileRejectsForgedWorkloadLabel guards C2 — a Docker container
|
||||
// claiming a non-existent workload_id must be ignored, not adopted into the
|
||||
// containers index.
|
||||
func TestReconcileRejectsForgedWorkloadLabel(t *testing.T) {
|
||||
st := newTestStore(t)
|
||||
fake := &fakeDocker{items: []docker.ReconcileItem{{
|
||||
ID: "docker-evil",
|
||||
Labels: map[string]string{
|
||||
docker.LabelManaged: "true",
|
||||
docker.LabelWorkloadID: "wl-does-not-exist",
|
||||
docker.LabelWorkloadKind: "project",
|
||||
docker.LabelRole: "prod",
|
||||
},
|
||||
}}}
|
||||
r := New(st, fake, 0)
|
||||
if err := r.ReconcileOnce(context.Background()); err != nil {
|
||||
t.Fatalf("ReconcileOnce: %v", err)
|
||||
}
|
||||
rows, _ := st.ListContainers(store.ContainerFilter{})
|
||||
if len(rows) != 0 {
|
||||
t.Fatalf("forged label should produce no row, got %d", len(rows))
|
||||
}
|
||||
}
|
||||
|
||||
// TestReconcileSkipsProjectInsertWithoutDeployerRow guards H3 — the reconciler
|
||||
// must not invent a project container row, since the deployer is the
|
||||
// authoritative writer and inventing rows races with MaxInstances > 1 deploys.
|
||||
func TestReconcileSkipsProjectInsertWithoutDeployerRow(t *testing.T) {
|
||||
st := newTestStore(t)
|
||||
project, err := st.CreateProject(store.Project{Name: "p2", Image: "nginx"})
|
||||
if err != nil {
|
||||
t.Fatalf("CreateProject: %v", err)
|
||||
}
|
||||
w, _ := st.GetWorkloadByRef(store.WorkloadKindProject, project.ID)
|
||||
|
||||
// Reconciler sees a real container with project labels but no deployer
|
||||
// row exists yet (race during deploy).
|
||||
fake := &fakeDocker{items: []docker.ReconcileItem{{
|
||||
ID: "docker-race", Image: "nginx", State: "running",
|
||||
Labels: map[string]string{
|
||||
docker.LabelManaged: "true",
|
||||
docker.LabelWorkloadID: w.ID,
|
||||
docker.LabelWorkloadKind: "project",
|
||||
docker.LabelRole: "prod",
|
||||
},
|
||||
}}}
|
||||
r := New(st, fake, 0)
|
||||
if err := r.ReconcileOnce(context.Background()); err != nil {
|
||||
t.Fatalf("ReconcileOnce: %v", err)
|
||||
}
|
||||
rows, _ := st.ListContainersByWorkload(w.ID)
|
||||
if len(rows) != 0 {
|
||||
t.Fatalf("project insert without deployer row should be skipped, got %d rows", len(rows))
|
||||
}
|
||||
}
|
||||
|
||||
func TestReconcileNormalizesState(t *testing.T) {
|
||||
st := newTestStore(t)
|
||||
stack, _ := st.CreateStack(store.Stack{
|
||||
|
||||
Reference in New Issue
Block a user