// tiny-forge/internal/reconciler/reconciler.go

// Package reconciler keeps the normalized containers index in sync with the
// Docker daemon. It runs on a tick (and one-shot at boot) — for every
// Tinyforge-managed container in `docker ps`, it dispatches to a workload by
// labels and upserts a Container row. Rows whose Docker container ID is no
// longer present are flipped to state='missing'.
//
// Dispatch precedence:
//  1. tinyforge.workload.id label  (canonical, new)
//  2. tinyforge.static-site label  (legacy site — joins via static_sites)
//  3. com.docker.compose.project   (stack — joins via Stack.ComposeProjectName)
//
// The legacy tinyforge.instance-id path was removed when the deployer was
// rewritten to use Container natively — every Tinyforge-managed project
// container now carries the workload labels at create time.
package reconciler

import (
	"context"
	"log/slog"
	"strings"
	"sync"
	"time"

	"github.com/alexei/tinyforge/internal/docker"
	"github.com/alexei/tinyforge/internal/store"
)

// DockerLister is the subset of docker.Client the reconciler depends on.
// Defined here (where it's used) so tests can substitute a fake in place of
// a real Docker client; only the docker.ReconcileItem value type is shared.
type DockerLister interface {
	// ListAllForReconciler returns the containers to consider in one
	// reconciliation pass. When it returns an error, ReconcileOnce returns
	// that error without upserting or sweeping any rows.
	ListAllForReconciler(ctx context.Context) ([]docker.ReconcileItem, error)
}

// Reconciler is the background worker that syncs the containers index.
// Build one with New, which allocates the stop channel and normalizes the
// tick interval.
type Reconciler struct {
	store    *store.Store  // where Container rows are looked up, upserted, and marked missing
	docker   DockerLister  // source of the container list for each pass
	interval time.Duration // tick period of the background loop, as normalized by New

	stop chan struct{}  // closed by Stop to make the loop return
	wg   sync.WaitGroup // counts the loop goroutine so Stop can wait for it to exit
}

// New builds a Reconciler that lists containers through dockerClient and
// writes rows to st. interval is the tick period and is normalized before
// use: a zero or negative value becomes 30s, and anything above 5m is capped
// at 5m so a manual misconfiguration can't silently disable timely state
// updates.
func New(st *store.Store, dockerClient DockerLister, interval time.Duration) *Reconciler {
	const (
		fallback = 30 * time.Second
		ceiling  = 5 * time.Minute
	)

	// The two cases are mutually exclusive: the fallback never exceeds the
	// ceiling, so at most one adjustment applies.
	switch {
	case interval <= 0:
		interval = fallback
	case interval > ceiling:
		interval = ceiling
	}

	return &Reconciler{
		stop:     make(chan struct{}),
		interval: interval,
		docker:   dockerClient,
		store:    st,
	}
}

// Start kicks off the background reconciliation loop on its own goroutine
// and returns immediately. The loop runs one pass right away so startup
// populates the index without waiting for the first timer fire, then one
// pass per interval until ctx is cancelled or Stop is called.
//
// Start is NOT idempotent: calling it twice is a programming error, since
// each call launches another loop goroutine.
func (r *Reconciler) Start(ctx context.Context) {
	// Register the goroutine before launching it so a Stop that follows
	// immediately still waits for the loop to exit.
	r.wg.Add(1)
	go r.loop(ctx)
}

// Stop signals the loop to exit and waits for the in-flight tick to finish.
//
// Stop only closes the stop channel; it does not cancel the context given to
// Start, so a pass that is already running completes before Stop returns.
// Calling Stop without a prior Start returns immediately.
//
// A repeated Stop is a no-op instead of a "close of closed channel" panic.
// The check-then-close below is not atomic, so Stop must still not be called
// from several goroutines at the same time.
func (r *Reconciler) Stop() {
	select {
	case <-r.stop:
		// Already closed by an earlier Stop; closing again would panic.
	default:
		close(r.stop)
	}
	r.wg.Wait()
}

// ReconcileOnce runs a single reconciliation pass: list containers, upsert a
// row for each one that dispatches to a workload, then sweep every other
// bound row to 'missing'. Exposed for tests and for callers that want to
// force a sync after a known mutation (e.g., right after a deploy succeeds,
// before the next tick).
//
// Only a failure to list containers is returned as an error; in that case no
// rows are touched. Per-container upsert failures are handled inside the
// upsert helpers and do not abort the pass.
//
// NOTE: a container whose upsert fails yields no row ID, so its existing row
// is not recorded as touched and is eligible for the missing sweep in the
// same pass.
func (r *Reconciler) ReconcileOnce(ctx context.Context) error {
	items, err := r.docker.ListAllForReconciler(ctx)
	if err != nil {
		return err
	}

	// Set of container row IDs written during this pass.
	touched := make(map[string]struct{}, len(items))
	for i := range items {
		rowID := r.upsertFromItem(ctx, items[i])
		if rowID == "" {
			continue // no dispatch matched, or the upsert failed
		}
		touched[rowID] = struct{}{}
	}

	r.markMissingRows(touched)
	return nil
}

// loop is the body of the background goroutine launched by Start. It runs one
// pass immediately, then one pass per tick, and returns when ctx is cancelled
// or the stop channel is closed. A failed pass is logged and the loop keeps
// going. The WaitGroup is released on exit so Stop can return.
func (r *Reconciler) loop(ctx context.Context) {
	defer r.wg.Done()

	// pass runs one reconciliation and logs a failure under the given message.
	pass := func(msg string) {
		if err := r.ReconcileOnce(ctx); err != nil {
			slog.Warn(msg, "error", err)
		}
	}

	// Boot pass, before the first timer fire.
	pass("reconciler: initial pass")

	ticker := time.NewTicker(r.interval)
	defer ticker.Stop()

	for {
		select {
		case <-ticker.C:
			pass("reconciler: tick")
		case <-r.stop:
			return
		case <-ctx.Done():
			return
		}
	}
}

// upsertFromItem routes one container to the upsert path selected by its
// labels and returns the ID of the Container row that was written.
//
// Precedence: the workload-ID label first, then the legacy static-site
// label, then a compose project whose name starts with "tinyforge-". The
// result is "" when none of these match, or when the chosen path could not
// write a row.
func (r *Reconciler) upsertFromItem(ctx context.Context, item docker.ReconcileItem) string {
	workloadID := item.Labels[docker.LabelWorkloadID]
	siteID := item.Labels["tinyforge.static-site"]
	project := item.Labels["com.docker.compose.project"]

	switch {
	case workloadID != "":
		return r.upsertByWorkloadLabel(item, workloadID)
	case siteID != "":
		return r.upsertBySiteLabel(item, siteID)
	case strings.HasPrefix(project, "tinyforge-"):
		// An empty project name can never carry the prefix, so no separate
		// emptiness check is needed.
		return r.upsertByComposeProject(item, project)
	default:
		return ""
	}
}

// upsertByWorkloadLabel is the canonical path for containers that carry the
// workload-ID label. It writes (or refreshes) the Container row for item and
// returns the row ID, or "" if the store rejects the upsert.
//
// Row identity: a row may already exist under a deployer-assigned UUID
// (project deploys do this so each blue-green slot has a stable handle), so
// the row already bound to this Docker container ID wins. Only when that
// lookup fails is the deterministic key from workloadIDRow used.
func (r *Reconciler) upsertByWorkloadLabel(item docker.ReconcileItem, workloadID string) string {
	kind := item.Labels[docker.LabelWorkloadKind]
	role := item.Labels[docker.LabelRole]

	var rowID string
	if bound, err := r.store.GetContainerByDockerID(item.ID); err == nil {
		rowID = bound.ID
	} else {
		rowID = workloadIDRow(workloadID, kind, role, item.ID)
	}

	// Only the first listed port is recorded; 0 means none was listed.
	port := 0
	if ports := item.Ports; len(ports) > 0 {
		port = int(ports[0])
	}

	row := store.Container{
		ID:           rowID,
		WorkloadID:   workloadID,
		WorkloadKind: kind,
		Role:         role,
		ContainerID:  item.ID,
		ImageRef:     item.Image,
		Host:         "local",
		State:        normalizeState(item.State),
		Port:         port,
		LastSeenAt:   store.Now(),
	}
	err := r.store.UpsertContainer(row)
	if err != nil {
		slog.Warn("reconciler: upsert by workload label", "container_id", item.ID, "error", err)
		return ""
	}
	return rowID
}

// upsertBySiteLabel is the legacy static-site path: siteID is the value of
// the tinyforge.static-site label. It resolves the site's workload, then
// writes the Container row under the deterministic ID "<workloadID>:site"
// with an empty role.
//
// Returns the row ID, or "" when the workload cannot be resolved or the
// upsert fails. Both failures are logged, because a container that yields no
// row ID may have its existing row flipped to 'missing' by the sweep that
// follows.
func (r *Reconciler) upsertBySiteLabel(item docker.ReconcileItem, siteID string) string {
	w, err := r.store.GetWorkloadByRef(store.WorkloadKindSite, siteID)
	if err != nil {
		// Debug rather than Warn: a failure that persists (presumably a
		// container left behind by a deleted site) would repeat on every
		// tick.
		slog.Debug("reconciler: resolve site workload", "container_id", item.ID, "site_id", siteID, "error", err)
		return ""
	}
	rowID := w.ID + ":site"

	// Only the first listed port is recorded; 0 means none was listed.
	port := 0
	if len(item.Ports) > 0 {
		port = int(item.Ports[0])
	}
	if err := r.store.UpsertContainer(store.Container{
		ID:           rowID,
		WorkloadID:   w.ID,
		WorkloadKind: string(store.WorkloadKindSite),
		Role:         "",
		ContainerID:  item.ID,
		ImageRef:     item.Image,
		Host:         "local",
		State:        normalizeState(item.State),
		Port:         port,
		LastSeenAt:   store.Now(),
	}); err != nil {
		slog.Warn("reconciler: upsert by site label", "container_id", item.ID, "error", err)
		return ""
	}
	return rowID
}

// upsertByComposeProject is the stack path: composeProject is the value of
// the com.docker.compose.project label. It finds the stack with that compose
// project name, resolves the stack's workload, and writes the Container row
// under the deterministic ID "<workloadID>:<role>". The role is the compose
// service label, falling back to the container name when that label is
// empty.
//
// Returns the row ID, or "" when the stack or workload cannot be resolved or
// the upsert fails. Every failure is logged, because a container that yields
// no row ID may have its existing row flipped to 'missing' by the sweep that
// follows.
func (r *Reconciler) upsertByComposeProject(item docker.ReconcileItem, composeProject string) string {
	stack, err := r.findStackByComposeProject(composeProject)
	if err != nil {
		// Debug rather than Warn: a project with no matching stack would
		// otherwise be reported on every tick.
		slog.Debug("reconciler: resolve stack by compose project", "container_id", item.ID, "compose_project", composeProject, "error", err)
		return ""
	}
	w, err := r.store.GetWorkloadByRef(store.WorkloadKindStack, stack.ID)
	if err != nil {
		slog.Debug("reconciler: resolve stack workload", "container_id", item.ID, "stack_id", stack.ID, "error", err)
		return ""
	}

	role := item.Labels["com.docker.compose.service"]
	if role == "" {
		role = item.Name
	}
	rowID := w.ID + ":" + role

	// Only the first listed port is recorded; 0 means none was listed.
	port := 0
	if len(item.Ports) > 0 {
		port = int(item.Ports[0])
	}
	if err := r.store.UpsertContainer(store.Container{
		ID:           rowID,
		WorkloadID:   w.ID,
		WorkloadKind: string(store.WorkloadKindStack),
		Role:         role,
		ContainerID:  item.ID,
		ImageRef:     item.Image,
		Host:         "local",
		State:        normalizeState(item.State),
		Port:         port,
		LastSeenAt:   store.Now(),
	}); err != nil {
		slog.Warn("reconciler: upsert by compose project", "container_id", item.ID, "error", err)
		return ""
	}
	return rowID
}

// findStackByComposeProject returns the first stack whose ComposeProjectName
// equals composeProject. It loads every stack and scans them linearly; the
// stack count is small in practice.
//
// A failure to load the stacks is returned as-is. When no stack matches, the
// error is store.ErrNotFound. The returned Stack is the zero value whenever
// the error is non-nil.
func (r *Reconciler) findStackByComposeProject(composeProject string) (store.Stack, error) {
	all, err := r.store.GetAllStacks()
	if err != nil {
		return store.Stack{}, err
	}

	for i := range all {
		if all[i].ComposeProjectName != composeProject {
			continue
		}
		return all[i], nil
	}
	return store.Stack{}, store.ErrNotFound
}

// markMissingRows flips state to 'missing' for every container row that was
// not written during the current pass. seen holds the row IDs that were.
//
// Two kinds of unseen rows are left alone:
//   - rows with an empty container_id — the deployer creates them ahead of
//     `docker create`, so they are transient and must not be marked missing
//     by a tick that races the deploy;
//   - rows already in state 'missing', to avoid a redundant write.
//
// Failures are logged and never returned: a failed listing abandons the
// sweep, a failed update skips just that row.
func (r *Reconciler) markMissingRows(seen map[string]struct{}) {
	rows, err := r.store.ListContainers(store.ContainerFilter{})
	if err != nil {
		slog.Warn("reconciler: list containers for missing-sweep", "error", err)
		return
	}

	for _, row := range rows {
		_, touched := seen[row.ID]
		if touched || row.ContainerID == "" || row.State == "missing" {
			continue
		}

		markErr := r.store.MarkContainerMissing(row.ID)
		if markErr != nil {
			slog.Warn("reconciler: mark missing", "row_id", row.ID, "error", markErr)
		}
	}
}

// workloadIDRow derives the deterministic row ID for a workload-labelled
// container. The ID is always "<workloadID>:<suffix>", with the suffix chosen
// in this order:
//
//  1. role, when the container carries a non-empty role label — this is the
//     pattern stack rows use, and project containers carry their stage name
//     as the role so deployer and reconciler upserts resolve to the same key;
//  2. the literal "site", when kind is the site workload kind;
//  3. containerID, as an uncommon last resort.
func workloadIDRow(workloadID, kind, role, containerID string) string {
	suffix := containerID
	switch {
	case role != "":
		suffix = role
	case kind == string(store.WorkloadKindSite):
		suffix = "site"
	}
	return workloadID + ":" + suffix
}

// normalizeState maps a Docker container state onto the value stored in a
// Container row. The terminal states "exited", "dead" and "stopped" all
// collapse to "stopped"; every other value — "running", "created",
// "restarting", "paused", "removing", and anything unrecognized — is stored
// exactly as Docker reported it.
func normalizeState(dockerState string) string {
	if dockerState == "exited" || dockerState == "dead" || dockerState == "stopped" {
		return "stopped"
	}
	return dockerState
}