Files
tiny-forge/internal/reconciler/reconciler.go
T
alexei.dolgolyov 8d6a527a2b refactor(workload): plugin architecture wave + apps UI + volume scopes
Completes the workload-first refactor's plugin layer:

- internal/workload/plugin/ — Source/Trigger plugin contract,
  registry, types (Workload, DeploymentIntent, InboundEvent,
  PublicFace). Self-registering init() pattern + blank-import
  in cmd/server/main.go.
- Source plugins: image (blue-green with multi-face proxy routing),
  compose, static. Trigger plugins: registry, git, manual.
- internal/deployer/dispatch.go — DispatchPlugin/Teardown/Reconcile
  seam routing the legacy deployer through plugins.
- internal/api/workload_*.go — REST surface: workloads, env,
  volumes, chain (parent/children), promote-from. hooks.go
  serves /api/hooks/kinds/{kind}/schema for the wizard.
- internal/store: workload_env (encrypt-at-rest secrets) and
  workload_volumes tables, keyed on workload_id.
- cmd/server/static_backend.go — phantom-row adapter delegating
  the static source plugin to the legacy staticsite.Manager
  (deleted at hard cutover once the static inline port lands).
- web/src/routes/apps/ — /apps list + /apps/new wizard +
  /apps/[id] detail with kind-aware compose / image / static
  forms (Advanced JSON toggle), env panel, volumes panel,
  webhook panel, chain panel, manual deploy.

Volume scope generalization (v2 resolver):

- internal/volume.ResolveWorkloadPath (workload-keyed, sits
  next to legacy ResolvePath). Honors all VolumeScope values:
  absolute, ephemeral, instance, stage, project, project_named,
  named. internal/workload/plugin/source/image/image.go
  computeMounts wires settings + imageTag through. Coverage in
  internal/volume/resolver_test.go (portable Linux/Windows via
  t.TempDir).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-11 22:17:41 +03:00

445 lines
14 KiB
Go

// Package reconciler keeps the normalized containers index in sync with the
// Docker daemon. It runs on a tick (and one-shot at boot) — for every
// Tinyforge-managed container in `docker ps`, it dispatches to a workload by
// labels and writes a Container row through ReconcileContainer (which only
// touches Docker-derived fields on conflict, never deployer-owned columns
// like subdomain / proxy_route_id / npm_proxy_id / image_tag / stage_id).
// Rows whose Docker container ID is no longer present are flipped to
// state='missing'.
//
// Dispatch precedence (a container with multiple matching labels is dispatched
// by the first match in this order):
// 1. tinyforge.workload.id label (canonical, new)
// 2. tinyforge.static-site label (legacy site — joins via static_sites)
// 3. com.docker.compose.project (stack — joins via Stack.ComposeProjectName;
//    only considered when the project name starts with "tinyforge-")
//
// The legacy tinyforge.instance-id path was removed when the deployer was
// rewritten to use Container natively — every Tinyforge-managed project
// container now carries the workload labels at create time.
package reconciler
import (
"context"
"encoding/json"
"errors"
"log/slog"
"strings"
"sync"
"time"
"github.com/alexei/tinyforge/internal/docker"
"github.com/alexei/tinyforge/internal/store"
"github.com/alexei/tinyforge/internal/workload/plugin"
)
// DockerLister is the subset of docker.Client the reconciler depends on.
// Defined here (where it's used) so tests can substitute a fake instead of
// constructing a full docker client.
type DockerLister interface {
// ListAllForReconciler returns the containers to examine in one
// reconciliation pass as docker.ReconcileItem values. A non-nil error makes
// ReconcileOnce abort the pass and return that error to its caller.
ListAllForReconciler(ctx context.Context) ([]docker.ReconcileItem, error)
}
// PluginReconciler is the optional dispatch surface for per-workload
// Source.Reconcile calls. Nil-safe — when unset, the reconciler skips
// the plugin pass and only refreshes the containers index from Docker.
type PluginReconciler interface {
// DispatchReconcile is invoked once per eligible workload on every pass.
// A returned error is logged by the reconciler and does not stop the
// remaining workloads from being processed.
DispatchReconcile(ctx context.Context, w plugin.Workload) error
}
// Reconciler is the background worker that syncs the containers index.
// Build one with New, launch the loop with Start, and shut it down with Stop.
type Reconciler struct {
// store is the persistence layer holding workload, stack and container rows.
store *store.Store
// docker lists the containers examined on every pass.
docker DockerLister
// interval is the tick period of the background loop (normalized by New).
interval time.Duration
plugins PluginReconciler // optional; nil disables the per-workload Source.Reconcile pass.
// stop is closed by Stop to tell the loop to exit.
stop chan struct{}
cancel context.CancelFunc // populated in Start; invoked by Stop so an in-flight tick is unblocked.
// wg tracks the loop goroutine so Stop can wait for it to finish.
wg sync.WaitGroup
}
// New builds a Reconciler around the given store and Docker lister.
//
// interval is the period between background passes. It is normalized before
// use: a non-positive value becomes 30s, and anything above 5m is capped at
// 5m so a misconfigured value cannot silently delay state updates for long.
// The returned Reconciler is idle until Start is called.
func New(st *store.Store, dockerClient DockerLister, interval time.Duration) *Reconciler {
	const (
		fallbackInterval = 30 * time.Second
		ceilingInterval  = 5 * time.Minute
	)

	switch {
	case interval <= 0:
		interval = fallbackInterval
	case interval > ceilingInterval:
		interval = ceilingInterval
	}

	rec := &Reconciler{stop: make(chan struct{})}
	rec.store = st
	rec.docker = dockerClient
	rec.interval = interval
	return rec
}
// SetPluginReconciler installs (or, with nil, removes) the dispatcher used
// for the per-workload Source.Reconcile pass. Each pass uses whatever value
// it observes when it reaches the plugin step.
//
// NOTE(review): the field is assigned without any synchronization, while the
// loop goroutine started by Start reads it on every pass. Wiring the
// dispatcher before Start avoids that unsynchronized access — confirm that
// callers do so.
func (r *Reconciler) SetPluginReconciler(p PluginReconciler) {
	r.plugins = p
}
// Start launches the background reconciliation goroutine. The goroutine
// performs one pass right away, so the index is populated at startup instead
// of after the first timer fire, and then one pass per interval.
//
// ctx is wrapped in a cancellable child whose cancel func is stored on the
// Reconciler; Stop calls it to unblock a pass that is waiting on Docker.
func (r *Reconciler) Start(ctx context.Context) {
	loopCtx, stopLoop := context.WithCancel(ctx)
	r.cancel = stopLoop

	r.wg.Add(1)
	go r.loop(loopCtx)
}
// Stop shuts the background loop down and blocks until it has exited.
//
// The child context created by Start is cancelled first so that an in-flight
// `docker ps` (which can hang on a stuck daemon) returns promptly; the stop
// channel is then closed unless an earlier call already closed it, which is
// what makes repeated sequential calls harmless.
func (r *Reconciler) Stop() {
	if cancel := r.cancel; cancel != nil {
		cancel()
	}

	alreadyClosed := false
	select {
	case <-r.stop:
		alreadyClosed = true
	default:
	}
	if !alreadyClosed {
		close(r.stop)
	}

	r.wg.Wait()
}
// ReconcileOnce performs one full reconciliation pass:
//
//  1. list containers from Docker (an error here aborts the pass and is
//     returned),
//  2. upsert a container row for every item that dispatches to a workload,
//  3. mark rows that were not touched in this pass as missing,
//  4. run the per-workload plugin reconcile pass.
//
// It is exported for tests and for callers that want to force a sync right
// after a known mutation (for example a successful deploy) instead of waiting
// for the next tick.
func (r *Reconciler) ReconcileOnce(ctx context.Context) error {
	containers, err := r.docker.ListAllForReconciler(ctx)
	if err != nil {
		return err
	}

	// Row IDs written during this pass; everything else is a sweep candidate.
	touched := make(map[string]struct{}, len(containers))
	// Per-pass cache of compose project name → stack, so the store is queried
	// at most once per project rather than once per compose container.
	stacks := make(map[string]store.Stack)

	for i := range containers {
		if id := r.upsertFromItem(containers[i], stacks); id != "" {
			touched[id] = struct{}{}
		}
	}

	r.markMissingRows(touched)
	r.reconcilePluginWorkloads(ctx)
	return nil
}
// reconcilePluginWorkloads iterates every workload row that opted into
// the plugin pipeline (source_kind + trigger_kind both set) and asks the
// dispatcher to invoke Source.Reconcile. Failures are logged per-workload
// — one workload's broken state must not stop sweeping the rest.
//
// The sweep ends early once ctx is done: after Stop (or the parent context)
// cancels the pass there is no point dispatching to the remaining workloads,
// and doing so would only delay shutdown and log one failure per workload.
//
// No-op when the plugin dispatcher hasn't been wired (boot-time race,
// disabled deployments, tests). A failure to list workloads is logged and
// ends the pass.
func (r *Reconciler) reconcilePluginWorkloads(ctx context.Context) {
	if r.plugins == nil {
		return
	}
	rows, err := r.store.ListWorkloads("")
	if err != nil {
		slog.Warn("reconciler: list workloads for plugin pass", "error", err)
		return
	}
	for _, w := range rows {
		// Respect cancellation between workloads.
		if ctx.Err() != nil {
			return
		}
		// Rows without both kinds have not opted into the plugin pipeline.
		if w.SourceKind == "" || w.TriggerKind == "" {
			continue
		}
		pw := toPluginWorkload(w)
		if err := r.plugins.DispatchReconcile(ctx, pw); err != nil {
			slog.Warn("reconciler: plugin reconcile failed",
				"workload", w.ID, "kind", w.SourceKind, "error", err)
		}
	}
}
// toPluginWorkload converts a stored workload row into the plugin-facing
// plugin.Workload value. It mirrors the api / webhook converters; kept local
// to avoid an import dependency between those packages.
//
// SourceConfig and TriggerConfig are handed over as raw JSON without being
// decoded. PublicFaces is decoded from its JSON text when non-empty; a decode
// failure is logged rather than silently dropped, and the conversion still
// proceeds with whatever faces were decoded (possibly none), so one malformed
// row cannot block the reconcile pass.
func toPluginWorkload(w store.Workload) plugin.Workload {
	var faces []plugin.PublicFace
	if w.PublicFaces != "" {
		if err := json.Unmarshal([]byte(w.PublicFaces), &faces); err != nil {
			// Best-effort: keep going, but make the bad row visible.
			slog.Warn("reconciler: decode workload public faces",
				"workload", w.ID, "error", err)
		}
	}
	return plugin.Workload{
		ID:                      w.ID,
		Name:                    w.Name,
		GroupID:                 w.AppID,
		ParentWorkloadID:        w.ParentWorkloadID,
		SourceKind:              w.SourceKind,
		SourceConfig:            json.RawMessage(w.SourceConfig),
		TriggerKind:             w.TriggerKind,
		TriggerConfig:           json.RawMessage(w.TriggerConfig),
		PublicFaces:             faces,
		NotificationURL:         w.NotificationURL,
		NotificationSecret:      w.NotificationSecret,
		WebhookSecret:           w.WebhookSecret,
		WebhookSigningSecret:    w.WebhookSigningSecret,
		WebhookRequireSignature: w.WebhookRequireSignature,
		CreatedAt:               w.CreatedAt,
		UpdatedAt:               w.UpdatedAt,
	}
}
// loop is the body of the goroutine launched by Start. It runs one pass
// immediately and then one pass per interval, logging (never propagating)
// pass errors. It returns when ctx is done or when the stop channel is
// closed, and signals the WaitGroup on the way out so Stop can return.
func (r *Reconciler) loop(ctx context.Context) {
	defer r.wg.Done()

	// pass runs a single reconciliation and logs a failure under msg.
	pass := func(msg string) {
		if err := r.ReconcileOnce(ctx); err != nil {
			slog.Warn(msg, "error", err)
		}
	}

	// Boot pass, before the first timer fire.
	pass("reconciler: initial pass")

	ticker := time.NewTicker(r.interval)
	defer ticker.Stop()

	for {
		select {
		case <-ticker.C:
			pass("reconciler: tick")
		case <-r.stop:
			return
		case <-ctx.Done():
			return
		}
	}
}
// upsertFromItem routes one container to the dispatcher selected by its
// labels and returns the ID of the container row that was written, or "" when
// no label matched or the chosen dispatcher wrote nothing.
//
// The first matching label wins, in this order: the workload-ID label, the
// "tinyforge.static-site" label, then "com.docker.compose.project" — the last
// only when the project name starts with "tinyforge-".
func (r *Reconciler) upsertFromItem(item docker.ReconcileItem, stackCache map[string]store.Stack) string {
	labels := item.Labels
	workloadID := labels[docker.LabelWorkloadID]
	siteID := labels["tinyforge.static-site"]
	composeProject := labels["com.docker.compose.project"]

	switch {
	case workloadID != "":
		return r.upsertByWorkloadLabel(item, workloadID)
	case siteID != "":
		return r.upsertBySiteLabel(item, siteID)
	case strings.HasPrefix(composeProject, "tinyforge-"):
		// A non-empty prefix match implies the label itself is non-empty.
		return r.upsertByComposeProject(item, composeProject, stackCache)
	default:
		return ""
	}
}
// upsertByWorkloadLabel is the canonical dispatch path: the container names
// its workload directly through the workload-ID label.
//
// Row resolution:
//   - If a row already exists for this Docker container ID, that row is
//     refreshed. This is the normal case for project containers, whose rows
//     are pre-created by the deployer with a per-instance UUID and
//     proxy/subdomain metadata.
//   - If no row exists and the kind is project, nothing is written. Inventing
//     a deterministic-ID row would race with the deployer's UUID rows for
//     stages with MaxInstances > 1 and leave ghost rows behind.
//   - If no row exists for any other kind, a row is written under the
//     deterministic ID computed by workloadIDRow (for example a boot tick
//     that runs before the site/stack dispatcher has registered its row).
//
// Untrusted-label defense: a label that does not resolve to a stored workload
// is ignored, and so is a kind label that disagrees with the stored kind.
// Otherwise anyone with Docker socket access could start a container with a
// forged label and take over the canonical slot of an existing workload.
//
// Returns the row ID that was written, or "" when nothing was written.
func (r *Reconciler) upsertByWorkloadLabel(item docker.ReconcileItem, workloadID string) string {
	w, err := r.store.GetWorkloadByID(workloadID)
	if err != nil {
		// Forged or stale label — debug level keeps per-tick logs quiet.
		slog.Debug("reconciler: unknown workload_id label", "workload_id", workloadID, "container_id", item.ID)
		return ""
	}

	role := item.Labels[docker.LabelRole]
	kind := item.Labels[docker.LabelWorkloadKind]
	switch {
	case kind == "":
		// No kind label: trust the stored workload.
		kind = w.Kind
	case kind != w.Kind:
		slog.Warn("reconciler: workload kind mismatch", "label_kind", kind, "stored_kind", w.Kind, "workload_id", workloadID)
		return ""
	}

	// Decide which row to write and which message to log if the write fails.
	var rowID string
	failMsg := "reconciler: reconcile by workload label"
	existing, lookupErr := r.store.GetContainerByDockerID(item.ID)
	switch {
	case lookupErr == nil:
		rowID = existing.ID
	case !errors.Is(lookupErr, store.ErrNotFound):
		slog.Warn("reconciler: lookup container by docker id", "container_id", item.ID, "error", lookupErr)
		return ""
	case kind == string(store.WorkloadKindProject):
		// The deployer is the authoritative writer for project rows; wait
		// for it to create the row.
		return ""
	default:
		rowID = workloadIDRow(workloadID, kind, role, item.ID)
		failMsg = "reconciler: reconcile by workload label (insert)"
	}

	var firstPort int
	if len(item.Ports) != 0 {
		firstPort = int(item.Ports[0])
	}
	row := store.Container{
		ID:           rowID,
		WorkloadID:   workloadID,
		WorkloadKind: kind,
		Role:         role,
		ContainerID:  item.ID,
		ImageRef:     item.Image,
		Host:         "local",
		State:        normalizeState(item.State),
		Port:         firstPort,
		LastSeenAt:   store.Now(),
	}
	if err := r.store.ReconcileContainer(row); err != nil {
		slog.Warn(failMsg, "container_id", item.ID, "error", err)
		return ""
	}
	return rowID
}
// upsertBySiteLabel handles containers carrying the legacy static-site label.
// siteID is resolved to its site workload, and the container is written to
// the deterministic row "<workloadID>:site" with an empty role.
//
// Returns the row ID on success, or "" when the workload lookup fails (not
// logged) or the row write fails (logged).
func (r *Reconciler) upsertBySiteLabel(item docker.ReconcileItem, siteID string) string {
	site, err := r.store.GetWorkloadByRef(store.WorkloadKindSite, siteID)
	if err != nil {
		return ""
	}

	var firstPort int
	if len(item.Ports) != 0 {
		firstPort = int(item.Ports[0])
	}

	row := store.Container{
		ID:           site.ID + ":site",
		WorkloadID:   site.ID,
		WorkloadKind: string(store.WorkloadKindSite),
		Role:         "",
		ContainerID:  item.ID,
		ImageRef:     item.Image,
		Host:         "local",
		State:        normalizeState(item.State),
		Port:         firstPort,
		LastSeenAt:   store.Now(),
	}
	if err := r.store.ReconcileContainer(row); err != nil {
		slog.Warn("reconciler: reconcile by site label", "container_id", item.ID, "error", err)
		return ""
	}
	return row.ID
}
// upsertByComposeProject handles containers that belong to a compose project.
// The project name is resolved to a stack (through the per-pass cache), the
// stack to its workload, and the container is written to the row
// "<workloadID>:<role>", where role is the compose service label or, when
// that label is empty, the container name.
//
// cache is read and written here: a failed stack lookup is remembered as an
// empty Stack so the store is not queried again for that project during the
// same pass.
//
// Returns the row ID on success, or "" when the stack or workload cannot be
// resolved (not logged) or the row write fails (logged).
func (r *Reconciler) upsertByComposeProject(item docker.ReconcileItem, composeProject string, cache map[string]store.Stack) string {
	stack, cached := cache[composeProject]
	if !cached {
		found, err := r.store.GetStackByComposeProjectName(composeProject)
		if err != nil {
			// Negative cache entry for the rest of the pass.
			found = store.Stack{}
		}
		cache[composeProject] = found
		stack = found
	}
	if stack.ID == "" {
		return ""
	}

	owner, err := r.store.GetWorkloadByRef(store.WorkloadKindStack, stack.ID)
	if err != nil {
		return ""
	}

	service := item.Labels["com.docker.compose.service"]
	if service == "" {
		service = item.Name
	}

	var firstPort int
	if len(item.Ports) != 0 {
		firstPort = int(item.Ports[0])
	}

	row := store.Container{
		ID:           owner.ID + ":" + service,
		WorkloadID:   owner.ID,
		WorkloadKind: string(store.WorkloadKindStack),
		Role:         service,
		ContainerID:  item.ID,
		ImageRef:     item.Image,
		Host:         "local",
		State:        normalizeState(item.State),
		Port:         firstPort,
		LastSeenAt:   store.Now(),
	}
	if err := r.store.ReconcileContainer(row); err != nil {
		slog.Warn("reconciler: reconcile by compose project", "container_id", item.ID, "error", err)
		return ""
	}
	return row.ID
}
// markMissingRows marks as missing every sweep candidate whose row ID is not
// in seen, i.e. every row that the current pass did not write. Candidates
// come from ListMissingSweepRows.
//
// Errors are logged and never returned: a listing failure skips the sweep
// entirely, while a failure to mark one row does not prevent the remaining
// rows from being processed.
func (r *Reconciler) markMissingRows(seen map[string]struct{}) {
	candidates, err := r.store.ListMissingSweepRows()
	if err != nil {
		slog.Warn("reconciler: list rows for missing-sweep", "error", err)
		return
	}

	for _, candidate := range candidates {
		_, touched := seen[candidate.ID]
		if touched {
			continue
		}
		markErr := r.store.MarkContainerMissing(candidate.ID)
		if markErr != nil {
			slog.Warn("reconciler: mark missing", "row_id", candidate.ID, "error", markErr)
		}
	}
}
// workloadIDRow derives the deterministic row ID for a workload-labelled
// container that has no existing row. The ID is "<workloadID>:<suffix>",
// where suffix is:
//
//   - "site" for site workloads,
//   - the role when one is set,
//   - the Docker container ID otherwise.
//
// Project rows are never derived here; upsertByWorkloadLabel returns before
// calling this function for project workloads.
func workloadIDRow(workloadID, kind, role, containerID string) string {
	suffix := containerID
	switch {
	case kind == string(store.WorkloadKindSite):
		suffix = "site"
	case role != "":
		suffix = role
	}
	return workloadID + ":" + suffix
}
// normalizeState condenses a Docker container state into the value stored on
// a container row. The terminal states "exited", "dead" and "stopped" all
// collapse to "stopped". Every other value — "running", "removing",
// "created", "restarting", "paused", and anything unrecognized, including the
// empty string — is returned unchanged.
func normalizeState(dockerState string) string {
	if dockerState == "exited" || dockerState == "dead" || dockerState == "stopped" {
		return "stopped"
	}
	return dockerState
}