feat(apps): per-workload deploy history, rollback, and resource metrics

Two additions to the app detail page, each backed by a per-workload endpoint. Deploy history + rollback: - New deploy_history table — a structured, version-pinned ledger of every dispatch (success AND failure), distinct from the free-text event_log. Recorded at the single DispatchPlugin choke point so every source kind is covered. The raw deploy error is never persisted (it can carry registry-auth / compose-stdout secrets) — only a generic marker, with detail going to slog. Pruned to the newest N per workload; cascade-deleted with the workload. - GET /api/workloads/{id}/deploys lists the ledger; POST .../rollback (admin) replays a prior successful deploy's pinned reference as a rollback-reason dispatch. Phase 1 is image-source only (RollbackCapable); git-built sources need checkout-by-commit, a later phase. - DeployHistoryPanel.svelte renders the ledger with confirm-gated rollback. Per-workload metrics: - ListContainerStatsSamplesByWorkload joins the existing container stats samples through the containers index; GET /api/workloads/{id}/stats/history aggregates CPU/memory per timestamp across the workload's containers. - WorkloadMetricsPanel.svelte reuses ResourceChart (CPU% + memory MiB, windowed, 15s poll). en/ru i18n added with parity. Tests: store CRUD + cascade + workload-scoped join, deployer recording (incl. secret-non-leak on failure), API rollback guards, and per-timestamp aggregation. Plans under docs/plans/.
2026-06-19 16:22:12 +03:00
parent c8e71a0c34
commit 0c4c338bfe
23 changed files with 1828 additions and 0 deletions
@@ -1,12 +1,15 @@
 package api

 import (
+	"errors"
 	"log/slog"
 	"net/http"
 	"sort"
 	"strconv"
 	"time"

+	"github.com/go-chi/chi/v5"
+
 	"github.com/alexei/tinyforge/internal/auth"
 	"github.com/alexei/tinyforge/internal/store"
 )
@@ -85,6 +88,76 @@ func (s *Server) getSystemStatsHistory(w http.ResponseWriter, r *http.Request) {
 	respondJSON(w, http.StatusOK, samples)
 }

+// workloadStatsPoint is one aggregated time bucket of a workload's metrics
+// graph. aggregateWorkloadStats sums CPU% and memory usage over every container
+// sample sharing a timestamp, so a multi-container (compose) workload reads as a
+// single line. MemoryLimit is the max across those samples — the effective
+// ceiling — though the UI plots absolute MiB because the limit is often 0.
+type workloadStatsPoint struct {
+	TS          int64   `json:"ts"`           // bucket key, copied from the samples; unit not visible here — TODO confirm
+	CPUPercent  float64 `json:"cpu_percent"`  // sum of CPU% over the samples at TS
+	MemoryUsage int64   `json:"memory_usage"` // sum of memory usage over the samples at TS (presumably bytes — confirm)
+	MemoryLimit int64   `json:"memory_limit"` // largest limit among the samples at TS; 0 means unlimited
+}
+
+// getWorkloadStatsHistory handles GET /api/workloads/{id}/stats/history?window=1h.
+// Read-only and open to any authenticated user (mirrors the per-workload
+// events/runtime-state feeds). Responds 400 for an empty id, 404 for an unknown
+// workload, 500 (with the cause logged) when the store fails, and otherwise 200
+// with a possibly empty array — never 503, because samples come from SQLite,
+// which stays available when the Docker daemon is down or stats are disabled.
+func (s *Server) getWorkloadStatsHistory(w http.ResponseWriter, r *http.Request) {
+	id := chi.URLParam(r, "id")
+	if id == "" {
+		respondError(w, http.StatusBadRequest, "workload id is required")
+		return
+	}
+	if _, err := s.store.GetWorkloadByID(id); err != nil {
+		if errors.Is(err, store.ErrNotFound) {
+			respondNotFound(w, "workload")
+			return
+		}
+		slog.Error("failed to get workload", "workload", id, "error", err)
+		respondError(w, http.StatusInternalServerError, "get workload")
+		return
+	}
+
+	samples, err := s.store.ListContainerStatsSamplesByWorkload(id, sinceTimestamp(parseWindow(r)))
+	if err != nil {
+		slog.Error("failed to list workload stats samples", "workload", id, "error", err)
+		respondError(w, http.StatusInternalServerError, "failed to list samples")
+		return
+	}
+	respondJSON(w, http.StatusOK, aggregateWorkloadStats(samples))
+}
+
+// aggregateWorkloadStats collapses per-container samples into one point per
+// timestamp: CPU% and memory usage add up across containers, memory limit keeps
+// the largest value seen. Points appear in first-seen timestamp order, so input
+// that arrives ts-ascending yields ts-ascending output with no extra sort.
+func aggregateWorkloadStats(samples []store.ContainerStatsSample) []workloadStatsPoint {
+	series := make([]workloadStatsPoint, 0) // non-nil even when there are no samples
+	slotByTS := make(map[int64]int)         // timestamp → position in series
+	for _, sample := range samples {
+		slot, seen := slotByTS[sample.TS]
+		if !seen {
+			slotByTS[sample.TS] = len(series)
+			series = append(series, workloadStatsPoint{
+				TS:          sample.TS,
+				CPUPercent:  sample.CPUPercent,
+				MemoryUsage: sample.MemoryUsage,
+				MemoryLimit: sample.MemoryLimit,
+			})
+			continue
+		}
+		bucket := &series[slot] // fold this sample into the existing bucket in place
+		bucket.CPUPercent += sample.CPUPercent
+		bucket.MemoryUsage += sample.MemoryUsage
+		bucket.MemoryLimit = max(bucket.MemoryLimit, sample.MemoryLimit)
+	}
+	return series
+}
+
 // listTopContainers handles GET /api/system/stats/top?limit=5&by=cpu.
 // Returns the top-N most recent samples across containers, sorted by CPU or
 // memory. Container IDs are stripped for non-admins so a low-privilege viewer