feat(apps): per-workload deploy history, rollback, and resource metrics
Two additions to the app detail page, each backed by a per-workload
endpoint.
Deploy history + rollback:
- New deploy_history table — a structured, version-pinned ledger of every
dispatch (success AND failure), distinct from the free-text event_log.
Recorded at the single DispatchPlugin choke point so every source kind
is covered. The raw deploy error is never persisted (it can carry
registry-auth / compose-stdout secrets) — only a generic marker, with
detail going to slog. Pruned to the newest N per workload; cascade-
deleted with the workload.
- GET /api/workloads/{id}/deploys lists the ledger; POST .../rollback
(admin) replays a prior successful deploy's pinned reference as a
rollback-reason dispatch. Phase 1 is image-source only (RollbackCapable);
git-built sources need checkout-by-commit, a later phase.
- DeployHistoryPanel.svelte renders the ledger with confirm-gated rollback.
Per-workload metrics:
- ListContainerStatsSamplesByWorkload joins the existing container stats
samples through the containers index; GET /api/workloads/{id}/stats/history
aggregates CPU/memory per timestamp across the workload's containers.
- WorkloadMetricsPanel.svelte reuses ResourceChart (CPU% + memory MiB,
windowed, 15s poll).
en/ru i18n added with parity. Tests: store CRUD + cascade + workload-scoped
join, deployer recording (incl. secret-non-leak on failure), API rollback
guards, and per-timestamp aggregation. Plans under docs/plans/.
This commit is contained in:
@@ -1,12 +1,15 @@
|
||||
package api
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"log/slog"
|
||||
"net/http"
|
||||
"sort"
|
||||
"strconv"
|
||||
"time"
|
||||
|
||||
"github.com/go-chi/chi/v5"
|
||||
|
||||
"github.com/alexei/tinyforge/internal/auth"
|
||||
"github.com/alexei/tinyforge/internal/store"
|
||||
)
|
||||
@@ -85,6 +88,76 @@ func (s *Server) getSystemStatsHistory(w http.ResponseWriter, r *http.Request) {
|
||||
respondJSON(w, http.StatusOK, samples)
|
||||
}
|
||||
|
||||
// workloadStatsPoint is one aggregated time bucket for a workload's metrics
// graph: every container the workload owns is summed at each timestamp so a
// multi-container (compose) workload reads as a single line. MemoryLimit is
// the max across containers — the effective ceiling — though the UI plots
// absolute MiB because the limit is often 0 (unlimited).
//
// Values are produced by aggregateWorkloadStats and serialized as-is under
// the JSON keys given in the field tags.
type workloadStatsPoint struct {
	// TS is the grouping key, copied verbatim from the samples' TS field.
	// NOTE(review): unit/epoch is not visible here — presumably Unix time; confirm.
	TS int64 `json:"ts"`
	// CPUPercent is the sum of CPUPercent over all samples sharing this TS.
	CPUPercent float64 `json:"cpu_percent"`
	// MemoryUsage is the sum of MemoryUsage over all samples sharing this TS.
	// NOTE(review): presumably bytes (the UI converts to MiB) — confirm.
	MemoryUsage int64 `json:"memory_usage"`
	// MemoryLimit is the largest MemoryLimit among samples sharing this TS;
	// per the note above, 0 is taken to mean "unlimited".
	MemoryLimit int64 `json:"memory_limit"`
}
|
||||
|
||||
// getWorkloadStatsHistory handles GET /api/workloads/{id}/stats/history?window=1h.
|
||||
// Read-only and open to any authenticated user (mirrors the per-workload
|
||||
// events/runtime-state feeds). Always returns a (possibly empty) array — never
|
||||
// 503 — because samples come from SQLite, which is available even when the
|
||||
// Docker daemon is down or stats collection is disabled. Unknown workload id
|
||||
// 404s; a known workload with no samples yet returns [].
|
||||
func (s *Server) getWorkloadStatsHistory(w http.ResponseWriter, r *http.Request) {
|
||||
id := chi.URLParam(r, "id")
|
||||
if id == "" {
|
||||
respondError(w, http.StatusBadRequest, "workload id is required")
|
||||
return
|
||||
}
|
||||
if _, err := s.store.GetWorkloadByID(id); err != nil {
|
||||
if errors.Is(err, store.ErrNotFound) {
|
||||
respondNotFound(w, "workload")
|
||||
return
|
||||
}
|
||||
respondError(w, http.StatusInternalServerError, "get workload")
|
||||
return
|
||||
}
|
||||
|
||||
samples, err := s.store.ListContainerStatsSamplesByWorkload(id, sinceTimestamp(parseWindow(r)))
|
||||
if err != nil {
|
||||
slog.Error("failed to list workload stats samples", "workload", id, "error", err)
|
||||
respondError(w, http.StatusInternalServerError, "failed to list samples")
|
||||
return
|
||||
}
|
||||
|
||||
respondJSON(w, http.StatusOK, aggregateWorkloadStats(samples))
|
||||
}
|
||||
|
||||
// aggregateWorkloadStats folds per-container samples into one series keyed by
|
||||
// timestamp: CPU% and memory usage are summed across the workload's containers,
|
||||
// memory limit takes the max. Samples arrive ts-ascending, so the output keeps
|
||||
// that order without an extra sort.
|
||||
func aggregateWorkloadStats(samples []store.ContainerStatsSample) []workloadStatsPoint {
|
||||
points := make([]workloadStatsPoint, 0)
|
||||
idx := make(map[int64]int) // ts → index in points
|
||||
for _, sm := range samples {
|
||||
if i, ok := idx[sm.TS]; ok {
|
||||
points[i].CPUPercent += sm.CPUPercent
|
||||
points[i].MemoryUsage += sm.MemoryUsage
|
||||
if sm.MemoryLimit > points[i].MemoryLimit {
|
||||
points[i].MemoryLimit = sm.MemoryLimit
|
||||
}
|
||||
continue
|
||||
}
|
||||
idx[sm.TS] = len(points)
|
||||
points = append(points, workloadStatsPoint{
|
||||
TS: sm.TS,
|
||||
CPUPercent: sm.CPUPercent,
|
||||
MemoryUsage: sm.MemoryUsage,
|
||||
MemoryLimit: sm.MemoryLimit,
|
||||
})
|
||||
}
|
||||
return points
|
||||
}
|
||||
|
||||
// listTopContainers handles GET /api/system/stats/top?limit=5&by=cpu.
|
||||
// Returns the top-N most recent samples across containers, sorted by CPU or
|
||||
// memory. Container IDs are stripped for non-admins so a low-privilege viewer
|
||||
|
||||
Reference in New Issue
Block a user