tiny-forge/internal/api/workload_runtime.go

package api

import (
	"context"
	"encoding/json"
	"errors"
	"log/slog"
	"net/http"
	"sync"
	"time"

	"github.com/go-chi/chi/v5"

	"github.com/alexei/tinyforge/internal/store"
)

// storageProbeEntry is a single memoized storage-probe outcome.
type storageProbeEntry struct {
	at      time.Time // when the outcome was recorded
	usage   int64     // bytes reported by the probe (zero when it failed)
	probeOk bool      // false when no container could be probed
}

// Storage-probe memoization, keyed by workload ID. Results are reused
// for a short window so that a client polling /storage in a tight loop
// does not trigger one `docker exec du` per request. The window is kept
// deliberately small: the panel is a coarse usage indicator, not a
// real-time meter.
var (
	// storageProbeCacheTTL is how long a recorded outcome stays valid.
	storageProbeCacheTTL = 30 * time.Second

	// storageProbeMu guards every access to storageProbeCache.
	storageProbeMu sync.Mutex

	// storageProbeCache maps workload ID to its latest probe outcome.
	storageProbeCache = make(map[string]storageProbeEntry)
)

// Runtime endpoints surface what the legacy /api/sites/* surface used
// to expose on the static-site detail page: the last commit SHA / last
// sync timestamp / status persisted by the static plugin in
// containers.extra_json, the data-volume disk usage, and stop / start
// controls that don't require a full re-deploy.
//
// The handlers are deliberately decoupled from the plugin interface so
// they work uniformly across source kinds: stop/start operate on the
// Docker container IDs stored in the containers index regardless of
// kind; runtime-state reads what the source persisted (currently the
// "static" and "dockerfile" sources write a structured blob); storage
// usage is static-only today but the endpoint shape allows future
// sources to opt in.

// runtimeStatePayload is the JSON shape returned by
// GET /api/workloads/{id}/runtime-state.
//
// SourceKind is always present so the UI can decide whether to render
// the source-specific fields (last_commit_sha, last_sync_at, ...). The
// container-row fields (ContainerID, State) come from the canonical
// containers row stored under a deterministic ID: `<workloadID>:site`
// for static workloads and `<workloadID>:dockerfile` for dockerfile
// workloads. Every field other than SourceKind and HasState is omitted
// from the JSON when empty.
type runtimeStatePayload struct {
	// SourceKind echoes the workload's source kind.
	SourceKind    string `json:"source_kind"`
	// HasState is true once the canonical container row was found.
	HasState      bool   `json:"has_state"`
	// ContainerID and State are copied from the container row.
	ContainerID   string `json:"container_id,omitempty"`
	State         string `json:"state,omitempty"`
	// Status, LastCommitSHA, LastSyncAt and LastError are decoded from
	// the row's extra_json blob; they stay empty when the blob is
	// missing or cannot be decoded.
	Status        string `json:"status,omitempty"`
	LastCommitSHA string `json:"last_commit_sha,omitempty"`
	LastSyncAt    string `json:"last_sync_at,omitempty"`
	LastError     string `json:"last_error,omitempty"`
}

// getWorkloadRuntimeState handles GET /api/workloads/{id}/runtime-state.
//
// It reports the runtime state that the static and dockerfile sources
// persist in containers.extra_json. Responses:
//   - 404 when the workload does not exist.
//   - 500 when the store fails for any other reason.
//   - 200 with only SourceKind set (HasState=false) for source kinds
//     that persist no runtime state, or when the workload has no
//     container row yet; the panel hides itself instead of the
//     endpoint 404ing.
//   - 200 with the container fields, plus the decoded sync fields when
//     extra_json holds a decodable blob.
func (s *Server) getWorkloadRuntimeState(w http.ResponseWriter, r *http.Request) {
	workloadID := chi.URLParam(r, "id")

	wl, err := s.store.GetWorkloadByID(workloadID)
	switch {
	case errors.Is(err, store.ErrNotFound):
		respondNotFound(w, "workload")
		return
	case err != nil:
		slog.Error("get workload for runtime-state", "workload", workloadID, "error", err)
		respondError(w, http.StatusInternalServerError, "internal server error")
		return
	}

	resp := runtimeStatePayload{SourceKind: wl.SourceKind}

	// Static and dockerfile sources both keep their runtime state in
	// containers.extra_json with the same field set (status /
	// last_commit_sha / last_sync_at / last_error); only the suffix of
	// the deterministic row ID differs. Any other source kind has no
	// such row, which leaves the suffix empty.
	var suffix string
	switch wl.SourceKind {
	case "static":
		suffix = ":site"
	case "dockerfile":
		suffix = ":dockerfile"
	}
	if suffix == "" {
		respondJSON(w, http.StatusOK, resp)
		return
	}

	// One row per workload lives at the deterministic ID. If it is
	// absent the workload was never deployed, so answer with
	// HasState=false and let the UI prompt for a deploy.
	row, err := s.store.GetContainerByID(workloadID + suffix)
	switch {
	case errors.Is(err, store.ErrNotFound):
		respondJSON(w, http.StatusOK, resp)
		return
	case err != nil:
		slog.Error("get container row for runtime-state", "workload", workloadID, "error", err)
		respondError(w, http.StatusInternalServerError, "internal server error")
		return
	}

	resp.HasState = true
	resp.ContainerID = row.ContainerID
	resp.State = row.State

	// The typed runtime fields are owned by extra_json. Failing to
	// decode it is tolerated: container_id / state are still reported,
	// only the sync history is dropped.
	//
	// The read is deliberately lock-free. The writer (state.go
	// saveState) serializes its read-modify-write with a per-workload
	// mutex, but SQLite hands back ExtraJSON as one complete string
	// from a single SELECT, so this reader observes either the old or
	// the new snapshot and never a torn value.
	switch row.ExtraJSON {
	case "", "{}":
		// Nothing persisted yet.
	default:
		var persisted struct {
			Status        string `json:"status"`
			LastCommitSHA string `json:"last_commit_sha"`
			LastSyncAt    string `json:"last_sync_at"`
			LastError     string `json:"last_error"`
		}
		if decodeErr := json.Unmarshal([]byte(row.ExtraJSON), &persisted); decodeErr == nil {
			resp.Status = persisted.Status
			resp.LastCommitSHA = persisted.LastCommitSHA
			resp.LastSyncAt = persisted.LastSyncAt
			resp.LastError = persisted.LastError
		} else {
			slog.Debug("decode extra_json for runtime-state", "workload", workloadID, "error", decodeErr)
		}
	}

	respondJSON(w, http.StatusOK, resp)
}

// storageUsagePayload is the JSON shape returned by
// GET /api/workloads/{id}/storage. ProbeError surfaces a non-fatal
// failure to compute used_bytes (du timed out, exec returned non-zero,
// etc.) so the UI can render "usage unavailable" instead of an
// always-zero number.
type storageUsagePayload struct {
	// SourceKind echoes the workload's source kind.
	SourceKind string `json:"source_kind"`
	// Enabled mirrors storage_enabled from the workload's source_config.
	Enabled    bool   `json:"enabled"`
	// UsedBytes is the probed usage; it stays zero when no probe succeeded.
	UsedBytes  int64  `json:"used_bytes"`
	// LimitMB mirrors storage_limit_mb from source_config; omitted when zero.
	LimitMB    int    `json:"limit_mb,omitempty"`
	// ProbeError is set when no container could be probed; omitted otherwise.
	ProbeError string `json:"probe_error,omitempty"`
}

// getWorkloadStorage handles GET /api/workloads/{id}/storage.
//
// For static workloads with storage enabled, it probes the data
// volume's footprint inside the workload's container (via
// InspectSiteStorageUsage) and reports it as used_bytes. Responses:
//   - 404 when the workload does not exist.
//   - 500 when the store fails for any other reason.
//   - 200 with Enabled=false and zero usage for non-static source
//     kinds and for workloads without storage, so the UI can hide the
//     panel. A source_config that fails to decode is treated as
//     storage-disabled.
//   - 200 with used_bytes on a successful probe, or with probe_error
//     set when no container could be probed.
//
// Probe outcomes are memoized per workload for storageProbeCacheTTL.
// An outcome produced while the request itself was already cancelled
// is not memoized, because it says nothing about the container.
func (s *Server) getWorkloadStorage(w http.ResponseWriter, r *http.Request) {
	id := chi.URLParam(r, "id")
	workload, err := s.store.GetWorkloadByID(id)
	if err != nil {
		if errors.Is(err, store.ErrNotFound) {
			respondNotFound(w, "workload")
			return
		}
		slog.Error("get workload for storage", "workload", id, "error", err)
		respondError(w, http.StatusInternalServerError, "internal server error")
		return
	}

	payload := storageUsagePayload{SourceKind: workload.SourceKind}

	if workload.SourceKind != "static" {
		respondJSON(w, http.StatusOK, payload)
		return
	}

	// Decode storage knobs from source_config. Missing / malformed
	// blobs are treated as storage-disabled rather than erroring; the
	// validator that runs on workload create already rejects invalid
	// configs at the source.
	var cfg struct {
		StorageEnabled bool `json:"storage_enabled"`
		StorageLimitMB int  `json:"storage_limit_mb"`
	}
	if workload.SourceConfig != "" {
		if err := json.Unmarshal([]byte(workload.SourceConfig), &cfg); err != nil {
			// Validator catches malformed configs at create-time, so
			// this is unexpected — log so a drifted row is traceable.
			slog.Debug("decode source_config for storage", "workload", id, "error", err)
			// json.Unmarshal keeps going after a type mismatch, so cfg
			// can be half-populated here (e.g. storage_enabled=true
			// next to an unparsable limit). Reset it so a malformed
			// blob really does mean storage-disabled.
			cfg.StorageEnabled = false
			cfg.StorageLimitMB = 0
		}
	}
	payload.Enabled = cfg.StorageEnabled
	payload.LimitMB = cfg.StorageLimitMB

	if !cfg.StorageEnabled || s.docker == nil {
		respondJSON(w, http.StatusOK, payload)
		return
	}

	// Cache hit short-circuits the docker exec entirely so a polling
	// frontend cannot turn this into a per-request `du`.
	storageProbeMu.Lock()
	if cached, ok := storageProbeCache[id]; ok && time.Since(cached.at) < storageProbeCacheTTL {
		storageProbeMu.Unlock()
		payload.UsedBytes = cached.usage
		if !cached.probeOk {
			payload.ProbeError = "storage probe unavailable"
		}
		respondJSON(w, http.StatusOK, payload)
		return
	}
	storageProbeMu.Unlock()

	// Find the running container. The static plugin's canonical row is
	// at <id>:site; we also tolerate workloads whose plugin produced
	// multiple containers by scanning the index.
	containers, err := s.store.ListContainersByWorkload(id)
	if err != nil {
		slog.Error("list containers for storage", "workload", id, "error", err)
		respondError(w, http.StatusInternalServerError, "internal server error")
		return
	}
	probeOk := false
	for _, c := range containers {
		if c.ContainerID == "" {
			continue
		}
		// 15s budget — `du` on a Hugo-style `public/` with tens of
		// thousands of files and a cold page cache can run several
		// seconds. The cache above keeps the amortized cost small.
		ctx, cancel := context.WithTimeout(r.Context(), 15*time.Second)
		usage, err := s.docker.InspectSiteStorageUsage(ctx, c.ContainerID)
		cancel()
		if err != nil {
			slog.Debug("storage usage probe failed", "workload", id, "container", c.ContainerID, "error", err)
			if r.Context().Err() != nil {
				// The request itself is gone; every further probe
				// would fail the same way, so stop trying.
				break
			}
			continue
		}
		payload.UsedBytes = usage.UsedBytes
		probeOk = true
		break
	}
	if !probeOk {
		payload.ProbeError = "storage probe unavailable"
	}

	// Memoize the outcome, except a failure observed after the request
	// context was cancelled (e.g. the client disconnected): that
	// failure reflects the caller, not the container, and caching it
	// would serve "unavailable" to every other caller for a full TTL.
	if probeOk || r.Context().Err() == nil {
		now := time.Now()
		storageProbeMu.Lock()
		// Drop expired entries while the lock is held so rows for
		// deleted workloads do not accumulate for the process lifetime.
		// Expired entries are already treated as misses above.
		for key, entry := range storageProbeCache {
			if now.Sub(entry.at) >= storageProbeCacheTTL {
				delete(storageProbeCache, key)
			}
		}
		storageProbeCache[id] = storageProbeEntry{at: now, usage: payload.UsedBytes, probeOk: probeOk}
		storageProbeMu.Unlock()
	}

	respondJSON(w, http.StatusOK, payload)
}

// stopStartResult is the JSON shape returned by both stop and start
// handlers — counts so the UI can show "1 of 2 containers stopped".
type stopStartResult struct {
	// Touched counts containers whose Docker stop/start call succeeded.
	Touched int `json:"touched"`
	// Failed counts containers whose Docker stop/start call returned an error.
	Failed  int `json:"failed"`
}

// stopPluginWorkload handles POST /api/workloads/{id}/stop.
//
// Every container row of the workload that carries a Docker container
// ID is stopped through Docker. Containers are neither removed nor is
// any runtime state rewritten here: the reconciler
// (internal/workload/plugin/source/static/reconcile.go) moves the state
// to "stopped"/"failed" on its next pass, and the fresh Docker state is
// immediately visible through /api/workloads/{id}/containers.
//
// Responses:
//   - 404 when the workload does not exist, 500 on other store errors.
//   - 503 when no Docker client is configured.
//   - 409 when there was no container to act on.
//   - 502 when every stop attempt failed.
//   - 200 with a `{touched, failed}` envelope otherwise, including
//     partial failures, so the UI can say "2 of 3 stopped" instead of
//     painting the whole call red.
func (s *Server) stopPluginWorkload(w http.ResponseWriter, r *http.Request) {
	workloadID := chi.URLParam(r, "id")

	_, lookupErr := s.store.GetWorkloadByID(workloadID)
	switch {
	case errors.Is(lookupErr, store.ErrNotFound):
		respondNotFound(w, "workload")
		return
	case lookupErr != nil:
		slog.Error("get workload for stop", "workload", workloadID, "error", lookupErr)
		respondError(w, http.StatusInternalServerError, "internal server error")
		return
	}

	if s.docker == nil {
		respondError(w, http.StatusServiceUnavailable, "docker client unavailable")
		return
	}

	rows, err := s.store.ListContainersByWorkload(workloadID)
	if err != nil {
		slog.Error("list containers for stop", "workload", workloadID, "error", err)
		respondError(w, http.StatusInternalServerError, "internal server error")
		return
	}

	var tally stopStartResult
	for _, row := range rows {
		if row.ContainerID == "" {
			continue
		}
		// Each container gets its own 30s context budget. The last
		// argument to StopContainer is the SIGTERM grace period inside
		// the container before SIGKILL.
		ctx, cancel := context.WithTimeout(r.Context(), 30*time.Second)
		stopErr := s.docker.StopContainer(ctx, row.ContainerID, 10)
		cancel()
		if stopErr != nil {
			slog.Warn("stop container failed", "workload", workloadID, "container", row.ContainerID, "error", stopErr)
			tally.Failed++
			continue
		}
		tally.Touched++
	}

	switch {
	case tally.Touched > 0:
		respondJSON(w, http.StatusOK, tally)
	case tally.Failed > 0:
		respondError(w, http.StatusBadGateway, "all containers failed to stop")
	default:
		// Nothing was attempted at all. Report that separately from a
		// successful stop so the UI shows "nothing to stop" rather than
		// a misleading green toast.
		respondError(w, http.StatusConflict, "no running container to stop")
	}
}

// startPluginWorkload handles POST /api/workloads/{id}/start.
//
// Issues a Docker start for every container row of the workload that
// carries a Docker container ID. Nothing is redeployed or recreated: a
// container that was removed externally makes start fail, and the
// operator is expected to click Deploy instead.
//
// Responses follow the same envelope as stop:
//   - 404 when the workload does not exist, 500 on other store errors.
//   - 503 when no Docker client is configured.
//   - 409 when there was no container to act on.
//   - 502 when every start attempt failed.
//   - 200 with `{touched, failed}` otherwise, including partial failures.
func (s *Server) startPluginWorkload(w http.ResponseWriter, r *http.Request) {
	workloadID := chi.URLParam(r, "id")

	_, lookupErr := s.store.GetWorkloadByID(workloadID)
	switch {
	case errors.Is(lookupErr, store.ErrNotFound):
		respondNotFound(w, "workload")
		return
	case lookupErr != nil:
		slog.Error("get workload for start", "workload", workloadID, "error", lookupErr)
		respondError(w, http.StatusInternalServerError, "internal server error")
		return
	}

	if s.docker == nil {
		respondError(w, http.StatusServiceUnavailable, "docker client unavailable")
		return
	}

	rows, err := s.store.ListContainersByWorkload(workloadID)
	if err != nil {
		slog.Error("list containers for start", "workload", workloadID, "error", err)
		respondError(w, http.StatusInternalServerError, "internal server error")
		return
	}

	var tally stopStartResult
	for _, row := range rows {
		if row.ContainerID == "" {
			continue
		}
		// Each container gets its own 30s context budget.
		ctx, cancel := context.WithTimeout(r.Context(), 30*time.Second)
		startErr := s.docker.StartContainer(ctx, row.ContainerID)
		cancel()
		if startErr != nil {
			slog.Warn("start container failed", "workload", workloadID, "container", row.ContainerID, "error", startErr)
			tally.Failed++
			continue
		}
		tally.Touched++
	}

	switch {
	case tally.Touched > 0:
		respondJSON(w, http.StatusOK, tally)
	case tally.Failed > 0:
		respondError(w, http.StatusBadGateway, "all containers failed to start")
	default:
		// No persisted container exists yet; a deploy has to create one.
		respondError(w, http.StatusConflict, "no container to start; deploy first")
	}
}