Files
alexei.dolgolyov 0c4c338bfe feat(apps): per-workload deploy history, rollback, and resource metrics
Two additions to the app detail page, each backed by a per-workload
endpoint.

Deploy history + rollback:
- New deploy_history table — a structured, version-pinned ledger of every
  dispatch (success AND failure), distinct from the free-text event_log.
  Recorded at the single DispatchPlugin choke point so every source kind
  is covered. The raw deploy error is never persisted (it can carry
  registry-auth / compose-stdout secrets) — only a generic marker, with
  detail going to slog. Pruned to the newest N per workload; cascade-
  deleted with the workload.
- GET /api/workloads/{id}/deploys lists the ledger; POST .../rollback
  (admin) replays a prior successful deploy's pinned reference as a
  rollback-reason dispatch. Phase 1 is image-source only (RollbackCapable);
  git-built sources need checkout-by-commit, a later phase.
- DeployHistoryPanel.svelte renders the ledger with confirm-gated rollback.

Per-workload metrics:
- ListContainerStatsSamplesByWorkload joins the existing container stats
  samples through the containers index; GET /api/workloads/{id}/stats/history
  aggregates CPU/memory per timestamp across the workload's containers.
- WorkloadMetricsPanel.svelte reuses ResourceChart (CPU% + memory MiB,
  windowed, 15s poll).

en/ru i18n added with parity. Tests: store CRUD + cascade + workload-scoped
join, deployer recording (incl. secret-non-leak on failure), API rollback
guards, and per-timestamp aggregation. Plans under docs/plans/.
2026-06-19 16:22:12 +03:00

263 lines
8.6 KiB
Go

package api
import (
"errors"
"log/slog"
"net/http"
"sort"
"strconv"
"time"
"github.com/go-chi/chi/v5"
"github.com/alexei/tinyforge/internal/auth"
"github.com/alexei/tinyforge/internal/store"
)
// topConsumerMinWindow is the minimum look-back window for the "top
// consumers" list: a container sample must be at least this recent to count.
// The constant itself is fixed; listTopContainers widens the window to twice
// the collector interval (read from settings) when that is larger, so the
// list stays meaningful even when sampling is sparse.
const topConsumerMinWindow = 2 * time.Minute
// TopContainerSample augments a stats sample with the human-readable owner
// name so the UI can show "workload/role" without an extra round-trip per row.
type TopContainerSample struct {
// The raw sample is embedded, so its fields are promoted onto this type.
store.ContainerStatsSample
// OwnerName is the label resolved by lookupInstanceName for the sample's
// OwnerID; it may be empty when that lookup fails.
OwnerName string `json:"owner_name"`
}
const (
// defaultHistoryWindow is used when no ?window= param is provided, or when
// the value fails to parse or is not positive. Matches the default
// retention so the "last 2h" view always has data when collection is
// enabled.
defaultHistoryWindow = 2 * time.Hour
// maxHistoryWindow is the upper bound: parseWindow clamps any larger
// ?window= value down to it.
maxHistoryWindow = 24 * time.Hour
)
// parseWindow extracts the ?window= query parameter (a Go duration string
// such as "1h" or "30m") and returns it bounded to a usable range.
//
// A missing, unparseable, or non-positive value yields defaultHistoryWindow;
// a value above maxHistoryWindow is clamped to maxHistoryWindow.
func parseWindow(r *http.Request) time.Duration {
	// An absent parameter is the empty string, which ParseDuration rejects,
	// so it falls into the same default branch as any other malformed value.
	requested, err := time.ParseDuration(r.URL.Query().Get("window"))
	switch {
	case err != nil, requested <= 0:
		return defaultHistoryWindow
	case requested > maxHistoryWindow:
		return maxHistoryWindow
	default:
		return requested
	}
}
// sinceTimestamp returns the Unix-seconds timestamp that lies `window` before
// the current time, for use as a "samples newer than" cutoff.
func sinceTimestamp(window time.Duration) int64 {
	cutoff := time.Now().UTC().Add(-window)
	return cutoff.Unix()
}
// getSystemStats handles GET /api/system/stats and responds with the current
// host snapshot.
//
// Both "no Docker client configured" and "the Docker call failed" (e.g.
// Docker Desktop stopped) are reported as 503 with the same message, so the
// frontend can render a dedicated unavailable state rather than a generic
// 5xx failure.
func (s *Server) getSystemStats(w http.ResponseWriter, r *http.Request) {
	const unavailable = "Docker is not available"

	if s.docker == nil {
		respondError(w, http.StatusServiceUnavailable, unavailable)
		return
	}

	snapshot, err := s.docker.GetSystemStats(r.Context())
	if err != nil {
		// Warn rather than Error: an unreachable daemon is an expected state.
		slog.Warn("system stats unavailable", "error", err)
		respondError(w, http.StatusServiceUnavailable, unavailable)
		return
	}

	respondJSON(w, http.StatusOK, snapshot)
}
// getSystemStatsHistory handles GET /api/system/stats/history?window=1h.
// It returns the stored system samples newer than the requested window; the
// response body is always a JSON array, empty when nothing was recorded.
func (s *Server) getSystemStatsHistory(w http.ResponseWriter, r *http.Request) {
	cutoff := sinceTimestamp(parseWindow(r))

	history, err := s.store.ListSystemStatsSamples(cutoff)
	if err != nil {
		slog.Error("failed to list system stats samples", "error", err)
		respondError(w, http.StatusInternalServerError, "failed to list samples")
		return
	}

	// A nil slice would be encoded as JSON null; swap in an empty slice so
	// clients always receive [].
	if history == nil {
		history = []store.SystemStatsSample{}
	}
	respondJSON(w, http.StatusOK, history)
}
// workloadStatsPoint is one aggregated time bucket for a workload's metrics
// graph: every container the workload owns is summed at each timestamp so a
// multi-container (compose) workload reads as a single line. MemoryLimit is
// the max across containers — the effective ceiling — though the UI plots
// absolute MiB because the limit is often 0 (unlimited).
type workloadStatsPoint struct {
// TS is the sample timestamp the bucket is keyed by (presumably Unix
// seconds, matching the sinceTimestamp cutoff — confirm against the store).
TS int64 `json:"ts"`
// CPUPercent is the sum of CPU% over all samples sharing TS.
CPUPercent float64 `json:"cpu_percent"`
// MemoryUsage is the sum of memory usage over all samples sharing TS.
MemoryUsage int64 `json:"memory_usage"`
// MemoryLimit is the largest memory limit among the samples sharing TS.
MemoryLimit int64 `json:"memory_limit"`
}
// getWorkloadStatsHistory handles GET /api/workloads/{id}/stats/history?window=1h.
// Read-only and open to any authenticated user (mirrors the per-workload
// events/runtime-state feeds). Always returns a (possibly empty) array — never
// 503 — because samples come from SQLite, which is available even when the
// Docker daemon is down or stats collection is disabled.
//
// Responses:
//   - 400 when the id URL parameter is empty;
//   - 404 when the workload id is unknown;
//   - 500 when the workload lookup or the sample query fails (both logged);
//   - 200 with the per-timestamp aggregate otherwise ([] when the workload
//     has no samples in the window).
func (s *Server) getWorkloadStatsHistory(w http.ResponseWriter, r *http.Request) {
	id := chi.URLParam(r, "id")
	if id == "" {
		respondError(w, http.StatusBadRequest, "workload id is required")
		return
	}

	// Existence check only: the workload row itself is not needed, but an
	// unknown id must 404 instead of silently returning an empty series.
	if _, err := s.store.GetWorkloadByID(id); err != nil {
		if errors.Is(err, store.ErrNotFound) {
			respondNotFound(w, "workload")
			return
		}
		// Log the underlying store error like the other 500 paths in this
		// file; the client only receives the generic message.
		slog.Error("failed to get workload for stats history", "workload", id, "error", err)
		respondError(w, http.StatusInternalServerError, "get workload")
		return
	}

	samples, err := s.store.ListContainerStatsSamplesByWorkload(id, sinceTimestamp(parseWindow(r)))
	if err != nil {
		slog.Error("failed to list workload stats samples", "workload", id, "error", err)
		respondError(w, http.StatusInternalServerError, "failed to list samples")
		return
	}

	respondJSON(w, http.StatusOK, aggregateWorkloadStats(samples))
}
// aggregateWorkloadStats collapses per-container samples into a single series
// with one point per distinct timestamp. Within a timestamp, CPU% and memory
// usage are added together and the memory limit is the largest one seen.
//
// Points appear in the order their timestamp is first encountered, so input
// that arrives ts-ascending produces ts-ascending output with no extra sort.
// The result is never nil, so it encodes as a JSON array even when empty.
func aggregateWorkloadStats(samples []store.ContainerStatsSample) []workloadStatsPoint {
	series := make([]workloadStatsPoint, 0)
	slotByTS := make(map[int64]int) // timestamp → position in series

	for _, sample := range samples {
		slot, seen := slotByTS[sample.TS]
		if !seen {
			// First sample for this timestamp opens a new bucket.
			slotByTS[sample.TS] = len(series)
			series = append(series, workloadStatsPoint{
				TS:          sample.TS,
				CPUPercent:  sample.CPUPercent,
				MemoryUsage: sample.MemoryUsage,
				MemoryLimit: sample.MemoryLimit,
			})
			continue
		}

		// Later samples for the same timestamp fold into the open bucket.
		bucket := &series[slot]
		bucket.CPUPercent += sample.CPUPercent
		bucket.MemoryUsage += sample.MemoryUsage
		if bucket.MemoryLimit < sample.MemoryLimit {
			bucket.MemoryLimit = sample.MemoryLimit
		}
	}
	return series
}
// listTopContainers handles GET /api/system/stats/top?limit=5&by=cpu.
// It takes the most recent sample of every container seen inside the
// look-back window, ranks them by CPU (default) or memory (?by=memory), and
// returns the top N enriched with the owner name.
//
// Query parameters:
//   - limit: 1..50; anything else (missing, non-numeric, out of range) falls
//     back to 5.
//   - by: "memory" ranks by memory usage; any other value ranks by CPU%.
//
// Ranking is deterministic: containers with an equal metric are ordered by
// container ID, so the list (and the cut at limit) does not reshuffle between
// requests when several containers tie, e.g. idle ones at 0% CPU.
//
// Container IDs are stripped for non-admins so a low-privilege viewer
// cannot enumerate workloads outside their scope.
func (s *Server) listTopContainers(w http.ResponseWriter, r *http.Request) {
	query := r.URL.Query()

	limit := 5
	if raw := query.Get("limit"); raw != "" {
		if n, err := strconv.Atoi(raw); err == nil && n > 0 && n <= 50 {
			limit = n
		}
	}
	byMemory := query.Get("by") == "memory"

	// Samples must be at least as recent as max(2*interval, 2 minutes) so the
	// list reflects near-current load even when collection is sparse. A
	// settings read failure is tolerated: the fixed floor is used instead.
	window := topConsumerMinWindow
	if settings, err := s.store.GetSettings(); err == nil && settings.StatsIntervalSeconds > 0 {
		if scaled := time.Duration(settings.StatsIntervalSeconds*2) * time.Second; scaled > window {
			window = scaled
		}
	}

	samples, err := s.store.ListAllRecentContainerStatsSamples(sinceTimestamp(window))
	if err != nil {
		slog.Error("failed to list container samples for top", "error", err)
		respondError(w, http.StatusInternalServerError, "failed to list samples")
		return
	}

	// Keep only the latest sample per container.
	latest := make(map[string]store.ContainerStatsSample, len(samples))
	for _, sm := range samples {
		if prev, ok := latest[sm.ContainerID]; !ok || sm.TS > prev.TS {
			latest[sm.ContainerID] = sm
		}
	}

	top := make([]store.ContainerStatsSample, 0, len(latest))
	for _, sm := range latest {
		top = append(top, sm)
	}

	// Map iteration order is random and sort.Slice is not stable, so without
	// a tie-break equal-valued containers would come out in a different order
	// on every request. Container IDs are unique here (they are the map
	// keys), which makes the ordering total.
	sort.Slice(top, func(i, j int) bool {
		if byMemory {
			if top[i].MemoryUsage != top[j].MemoryUsage {
				return top[i].MemoryUsage > top[j].MemoryUsage
			}
		} else if top[i].CPUPercent != top[j].CPUPercent {
			return top[i].CPUPercent > top[j].CPUPercent
		}
		return top[i].ContainerID < top[j].ContainerID
	})
	if len(top) > limit {
		top = top[:limit]
	}

	enriched := s.enrichWithOwnerNames(top)

	// Scrub container IDs for non-admins. The owner name is the actionable
	// identifier; the container ID is a host-level handle that reveals
	// workload existence to viewers who shouldn't have it.
	// NOTE(review): the second return value is discarded; this assumes the
	// route is always behind auth so claims are present — confirm.
	claims, _ := auth.ClaimsFromContext(r.Context())
	if claims.Role != "admin" {
		for i := range enriched {
			enriched[i].ContainerID = ""
		}
	}

	respondJSON(w, http.StatusOK, enriched)
}
// enrichWithOwnerNames wraps every sample in a TopContainerSample carrying
// the human-readable owner name. The name is resolved from the sample's
// OwnerID via lookupInstanceName (containers index → workloads), which after
// the cutover is the only available lookup path. Input order is preserved.
func (s *Server) enrichWithOwnerNames(samples []store.ContainerStatsSample) []TopContainerSample {
	enriched := make([]TopContainerSample, 0, len(samples))
	for _, sample := range samples {
		enriched = append(enriched, TopContainerSample{
			ContainerStatsSample: sample,
			OwnerName:            s.lookupInstanceName(sample.OwnerID),
		})
	}
	return enriched
}
// lookupInstanceName builds the display label for a container row. Lookup
// errors never propagate, so a transient miss cannot break the response.
//
// Result by case:
//   - container lookup fails: ""
//   - workload lookup fails:  the container's role (which may be "")
//   - role is empty:          the workload name
//   - otherwise:              "workload/role"
func (s *Server) lookupInstanceName(instanceID string) string {
	container, err := s.store.GetContainerByID(instanceID)
	if err != nil {
		return ""
	}

	workload, err := s.store.GetWorkloadByID(container.WorkloadID)
	switch {
	case err != nil:
		// No workload name available; the role alone is the best label left.
		return container.Role
	case container.Role == "":
		return workload.Name
	default:
		return workload.Name + "/" + container.Role
	}
}