0c4c338bfe
Two additions to the app detail page, each backed by a per-workload
endpoint.
Deploy history + rollback:
- New deploy_history table — a structured, version-pinned ledger of every
dispatch (success AND failure), distinct from the free-text event_log.
Recorded at the single DispatchPlugin choke point so every source kind
is covered. The raw deploy error is never persisted (it can carry
registry-auth / compose-stdout secrets) — only a generic marker, with
detail going to slog. Pruned to the newest N per workload; cascade-
deleted with the workload.
- GET /api/workloads/{id}/deploys lists the ledger; POST .../rollback
(admin) replays a prior successful deploy's pinned reference as a
rollback-reason dispatch. Phase 1 is image-source only (RollbackCapable);
git-built sources need checkout-by-commit, a later phase.
- DeployHistoryPanel.svelte renders the ledger with confirm-gated rollback.
Per-workload metrics:
- ListContainerStatsSamplesByWorkload joins the existing container stats
samples through the containers index; GET /api/workloads/{id}/stats/history
aggregates CPU/memory per timestamp across the workload's containers.
- WorkloadMetricsPanel.svelte reuses ResourceChart (CPU% + memory MiB,
windowed, 15s poll).
en/ru i18n added with parity. Tests: store CRUD + cascade + workload-scoped
join, deployer recording (incl. secret-non-leak on failure), API rollback
guards, and per-timestamp aggregation. Plans under docs/plans/.
263 lines
8.6 KiB
Go
263 lines
8.6 KiB
Go
package api
|
|
|
|
import (
|
|
"errors"
|
|
"log/slog"
|
|
"net/http"
|
|
"sort"
|
|
"strconv"
|
|
"time"
|
|
|
|
"github.com/go-chi/chi/v5"
|
|
|
|
"github.com/alexei/tinyforge/internal/auth"
|
|
"github.com/alexei/tinyforge/internal/store"
|
|
)
|
|
|
|
// topConsumerMinWindow is the minimum look-back used when deciding whether a
// container sample is recent enough to count toward the "top consumers" list.
// The constant itself is fixed; code that knows the collector interval may
// widen the window beyond this floor so the list stays meaningful when
// sampling is sparse.
const topConsumerMinWindow = 2 * time.Minute
|
|
|
|
// TopContainerSample augments a stats sample with the human-readable owner
|
|
// name so the UI can show "workload/role" without an extra round-trip per row.
|
|
type TopContainerSample struct {
|
|
store.ContainerStatsSample
|
|
OwnerName string `json:"owner_name"`
|
|
}
|
|
|
|
// Bounds applied to the ?window= query parameter of the history endpoints.
const (
	// defaultHistoryWindow is the fallback when ?window= is absent or does not
	// parse to a positive duration. It matches the default retention so the
	// "last 2h" view always has data when collection is enabled.
	defaultHistoryWindow = 2 * time.Hour

	// maxHistoryWindow is the largest window a caller may request; longer
	// values are clamped down to it.
	maxHistoryWindow = 24 * time.Hour
)

// parseWindow extracts the ?window= query value (a Go duration string such as
// "1h" or "30m") from r and returns it bounded to (0, maxHistoryWindow].
//
// A missing, malformed, zero, or negative value yields defaultHistoryWindow;
// a value above maxHistoryWindow yields maxHistoryWindow.
func parseWindow(r *http.Request) time.Duration {
	// time.ParseDuration rejects the empty string, so an absent parameter
	// lands in the same fallback branch as a malformed one.
	requested, err := time.ParseDuration(r.URL.Query().Get("window"))
	switch {
	case err != nil, requested <= 0:
		return defaultHistoryWindow
	case requested > maxHistoryWindow:
		return maxHistoryWindow
	default:
		return requested
	}
}
|
|
|
|
// sinceTimestamp returns the Unix-seconds instant lying window before the
// current time, for use as a lower bound when listing samples.
func sinceTimestamp(window time.Duration) int64 {
	// Unix() is independent of the Time's location, so no UTC conversion is
	// needed before taking it.
	cutoff := time.Now().Add(-window)
	return cutoff.Unix()
}
|
|
|
|
// getSystemStats handles GET /api/system/stats — current host snapshot.
|
|
// When the Docker daemon is unreachable (e.g. Docker Desktop stopped) the
|
|
// handler returns 503 so the frontend can show a dedicated unavailable
|
|
// state instead of treating it as a generic 5xx failure.
|
|
func (s *Server) getSystemStats(w http.ResponseWriter, r *http.Request) {
|
|
if s.docker == nil {
|
|
respondError(w, http.StatusServiceUnavailable, "Docker is not available")
|
|
return
|
|
}
|
|
sys, err := s.docker.GetSystemStats(r.Context())
|
|
if err != nil {
|
|
slog.Warn("system stats unavailable", "error", err)
|
|
respondError(w, http.StatusServiceUnavailable, "Docker is not available")
|
|
return
|
|
}
|
|
respondJSON(w, http.StatusOK, sys)
|
|
}
|
|
|
|
// getSystemStatsHistory handles GET /api/system/stats/history?window=1h.
|
|
func (s *Server) getSystemStatsHistory(w http.ResponseWriter, r *http.Request) {
|
|
samples, err := s.store.ListSystemStatsSamples(sinceTimestamp(parseWindow(r)))
|
|
if err != nil {
|
|
slog.Error("failed to list system stats samples", "error", err)
|
|
respondError(w, http.StatusInternalServerError, "failed to list samples")
|
|
return
|
|
}
|
|
if samples == nil {
|
|
samples = []store.SystemStatsSample{}
|
|
}
|
|
respondJSON(w, http.StatusOK, samples)
|
|
}
|
|
|
|
// workloadStatsPoint is a single time bucket in a workload's metrics graph.
// Samples from every container the workload owns are folded together at each
// timestamp, so a multi-container (compose) workload reads as one line.
type workloadStatsPoint struct {
	// TS is the sample timestamp the bucket is keyed on.
	TS int64 `json:"ts"`
	// CPUPercent is the sum of CPU% across the workload's containers.
	CPUPercent float64 `json:"cpu_percent"`
	// MemoryUsage is the sum of memory usage across the workload's containers.
	MemoryUsage int64 `json:"memory_usage"`
	// MemoryLimit is the max limit across containers — the effective ceiling.
	// The UI plots absolute MiB because the limit is often 0 (unlimited).
	MemoryLimit int64 `json:"memory_limit"`
}
|
|
|
|
// getWorkloadStatsHistory handles GET /api/workloads/{id}/stats/history?window=1h.
|
|
// Read-only and open to any authenticated user (mirrors the per-workload
|
|
// events/runtime-state feeds). Always returns a (possibly empty) array — never
|
|
// 503 — because samples come from SQLite, which is available even when the
|
|
// Docker daemon is down or stats collection is disabled. Unknown workload id
|
|
// 404s; a known workload with no samples yet returns [].
|
|
func (s *Server) getWorkloadStatsHistory(w http.ResponseWriter, r *http.Request) {
|
|
id := chi.URLParam(r, "id")
|
|
if id == "" {
|
|
respondError(w, http.StatusBadRequest, "workload id is required")
|
|
return
|
|
}
|
|
if _, err := s.store.GetWorkloadByID(id); err != nil {
|
|
if errors.Is(err, store.ErrNotFound) {
|
|
respondNotFound(w, "workload")
|
|
return
|
|
}
|
|
respondError(w, http.StatusInternalServerError, "get workload")
|
|
return
|
|
}
|
|
|
|
samples, err := s.store.ListContainerStatsSamplesByWorkload(id, sinceTimestamp(parseWindow(r)))
|
|
if err != nil {
|
|
slog.Error("failed to list workload stats samples", "workload", id, "error", err)
|
|
respondError(w, http.StatusInternalServerError, "failed to list samples")
|
|
return
|
|
}
|
|
|
|
respondJSON(w, http.StatusOK, aggregateWorkloadStats(samples))
|
|
}
|
|
|
|
// aggregateWorkloadStats folds per-container samples into one series keyed by
|
|
// timestamp: CPU% and memory usage are summed across the workload's containers,
|
|
// memory limit takes the max. Samples arrive ts-ascending, so the output keeps
|
|
// that order without an extra sort.
|
|
func aggregateWorkloadStats(samples []store.ContainerStatsSample) []workloadStatsPoint {
|
|
points := make([]workloadStatsPoint, 0)
|
|
idx := make(map[int64]int) // ts → index in points
|
|
for _, sm := range samples {
|
|
if i, ok := idx[sm.TS]; ok {
|
|
points[i].CPUPercent += sm.CPUPercent
|
|
points[i].MemoryUsage += sm.MemoryUsage
|
|
if sm.MemoryLimit > points[i].MemoryLimit {
|
|
points[i].MemoryLimit = sm.MemoryLimit
|
|
}
|
|
continue
|
|
}
|
|
idx[sm.TS] = len(points)
|
|
points = append(points, workloadStatsPoint{
|
|
TS: sm.TS,
|
|
CPUPercent: sm.CPUPercent,
|
|
MemoryUsage: sm.MemoryUsage,
|
|
MemoryLimit: sm.MemoryLimit,
|
|
})
|
|
}
|
|
return points
|
|
}
|
|
|
|
// listTopContainers handles GET /api/system/stats/top?limit=5&by=cpu.
|
|
// Returns the top-N most recent samples across containers, sorted by CPU or
|
|
// memory. Container IDs are stripped for non-admins so a low-privilege viewer
|
|
// cannot enumerate workloads outside their scope.
|
|
func (s *Server) listTopContainers(w http.ResponseWriter, r *http.Request) {
|
|
limit := 5
|
|
if raw := r.URL.Query().Get("limit"); raw != "" {
|
|
if n, err := strconv.Atoi(raw); err == nil && n > 0 && n <= 50 {
|
|
limit = n
|
|
}
|
|
}
|
|
by := r.URL.Query().Get("by")
|
|
if by != "memory" {
|
|
by = "cpu"
|
|
}
|
|
|
|
// Samples must be at least as recent as max(2*interval, 2 minutes) so the
|
|
// list reflects near-current load even when collection is sparse.
|
|
window := topConsumerMinWindow
|
|
if settings, err := s.store.GetSettings(); err == nil && settings.StatsIntervalSeconds > 0 {
|
|
if w := time.Duration(settings.StatsIntervalSeconds*2) * time.Second; w > window {
|
|
window = w
|
|
}
|
|
}
|
|
|
|
samples, err := s.store.ListAllRecentContainerStatsSamples(sinceTimestamp(window))
|
|
if err != nil {
|
|
slog.Error("failed to list container samples for top", "error", err)
|
|
respondError(w, http.StatusInternalServerError, "failed to list samples")
|
|
return
|
|
}
|
|
|
|
// Keep only the latest sample per container.
|
|
latest := make(map[string]store.ContainerStatsSample, len(samples))
|
|
for _, sm := range samples {
|
|
if prev, ok := latest[sm.ContainerID]; !ok || sm.TS > prev.TS {
|
|
latest[sm.ContainerID] = sm
|
|
}
|
|
}
|
|
|
|
top := make([]store.ContainerStatsSample, 0, len(latest))
|
|
for _, sm := range latest {
|
|
top = append(top, sm)
|
|
}
|
|
|
|
sort.Slice(top, func(i, j int) bool {
|
|
if by == "memory" {
|
|
return top[i].MemoryUsage > top[j].MemoryUsage
|
|
}
|
|
return top[i].CPUPercent > top[j].CPUPercent
|
|
})
|
|
if len(top) > limit {
|
|
top = top[:limit]
|
|
}
|
|
|
|
enriched := s.enrichWithOwnerNames(top)
|
|
|
|
// Scrub container IDs for non-admins. The owner name is the actionable
|
|
// identifier; the container ID is a host-level handle that reveals
|
|
// workload existence to viewers who shouldn't have it.
|
|
claims, _ := auth.ClaimsFromContext(r.Context())
|
|
if claims.Role != "admin" {
|
|
for i := range enriched {
|
|
enriched[i].ContainerID = ""
|
|
}
|
|
}
|
|
|
|
respondJSON(w, http.StatusOK, enriched)
|
|
}
|
|
|
|
// enrichWithOwnerNames attaches a human-readable owner name to each sample.
|
|
// Names are resolved through the containers index → workloads, which after
|
|
// the cutover is the only available lookup path.
|
|
func (s *Server) enrichWithOwnerNames(samples []store.ContainerStatsSample) []TopContainerSample {
|
|
out := make([]TopContainerSample, len(samples))
|
|
for i, sm := range samples {
|
|
out[i] = TopContainerSample{ContainerStatsSample: sm}
|
|
out[i].OwnerName = s.lookupInstanceName(sm.OwnerID)
|
|
}
|
|
return out
|
|
}
|
|
|
|
// lookupInstanceName returns "workload/role" for a container row, or empty
|
|
// on any lookup error so a transient miss does not break the response.
|
|
func (s *Server) lookupInstanceName(instanceID string) string {
|
|
c, err := s.store.GetContainerByID(instanceID)
|
|
if err != nil {
|
|
return ""
|
|
}
|
|
w, err := s.store.GetWorkloadByID(c.WorkloadID)
|
|
if err != nil {
|
|
if c.Role != "" {
|
|
return c.Role
|
|
}
|
|
return ""
|
|
}
|
|
if c.Role != "" {
|
|
return w.Name + "/" + c.Role
|
|
}
|
|
return w.Name
|
|
}
|
|
|