feat(apps): per-workload deploy history, rollback, and resource metrics
Two additions to the app detail page, each backed by a per-workload
endpoint.
Deploy history + rollback:
- New deploy_history table — a structured, version-pinned ledger of every
dispatch (success AND failure), distinct from the free-text event_log.
Recorded at the single DispatchPlugin choke point so every source kind
is covered. The raw deploy error is never persisted (it can carry
registry-auth / compose-stdout secrets) — only a generic marker, with
detail going to slog. Pruned to the newest N per workload; cascade-
deleted with the workload.
- GET /api/workloads/{id}/deploys lists the ledger; POST .../rollback
(admin) replays a prior successful deploy's pinned reference as a
rollback-reason dispatch. Phase 1 is image-source only (RollbackCapable);
git-built sources need checkout-by-commit, a later phase.
- DeployHistoryPanel.svelte renders the ledger with confirm-gated rollback.
Per-workload metrics:
- ListContainerStatsSamplesByWorkload joins the existing container stats
samples through the containers index; GET /api/workloads/{id}/stats/history
aggregates CPU/memory per timestamp across the workload's containers.
- WorkloadMetricsPanel.svelte reuses ResourceChart (CPU% + memory MiB,
windowed, 15s poll).
en/ru i18n added with parity. Tests: store CRUD + cascade + workload-scoped
join, deployer recording (incl. secret-non-leak on failure), API rollback
guards, and per-timestamp aggregation. Plans under docs/plans/.
This commit is contained in:
@@ -0,0 +1,151 @@
|
||||
package api
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"log/slog"
|
||||
"net/http"
|
||||
"strconv"
|
||||
"time"
|
||||
|
||||
"github.com/go-chi/chi/v5"
|
||||
|
||||
"github.com/alexei/tinyforge/internal/auth"
|
||||
"github.com/alexei/tinyforge/internal/store"
|
||||
"github.com/alexei/tinyforge/internal/workload/plugin"
|
||||
)
|
||||
|
||||
// parseOffset converts the raw `offset` query value into a usable pagination
// offset. Input that does not parse as an integer, or that parses to a
// negative number, yields 0. The limit half of pagination is handled by
// parseLimit (secrets.go).
func parseOffset(raw string) int {
	offset, err := strconv.Atoi(raw)
	switch {
	case err != nil:
		// Empty or non-numeric input: fall back to the first page.
		return 0
	case offset < 0:
		return 0
	default:
		return offset
	}
}
|
||||
|
||||
// rollbackCapableKinds is the one place that decides which source kinds can
// be redeployed at a pinned reference. Only "image" qualifies today: the
// image source resolves intent.Reference as the tag, so replaying an earlier
// tag is a genuine rollback. static/dockerfile sources clone branch HEAD and
// cannot yet check out an arbitrary commit (planned for a later phase), and
// compose has no single artifact handle to pin.
var rollbackCapableKinds = map[string]bool{
	"image": true,
}

// RollbackCapable reports whether sourceKind supports one-click rollback.
// Kinds absent from rollbackCapableKinds (including the empty string) report
// false. Both the list response (per-row `rollbackable` flag) and the
// rollback guard consult it, so the UI and the server share one definition.
func RollbackCapable(sourceKind string) bool {
	capable := rollbackCapableKinds[sourceKind]
	return capable
}
|
||||
|
||||
// listWorkloadDeploys handles GET /api/workloads/{id}/deploys. Read-only,
|
||||
// open to any authenticated user (mirrors the per-workload events feed).
|
||||
// Returns the structured deploy ledger newest-first with a server-computed
|
||||
// `rollbackable` flag per row.
|
||||
func (s *Server) listWorkloadDeploys(w http.ResponseWriter, r *http.Request) {
|
||||
id := chi.URLParam(r, "id")
|
||||
if id == "" {
|
||||
respondError(w, http.StatusBadRequest, "workload id is required")
|
||||
return
|
||||
}
|
||||
|
||||
q := r.URL.Query()
|
||||
limit := parseLimit(q.Get("limit"), 50, 200)
|
||||
offset := parseOffset(q.Get("offset"))
|
||||
|
||||
rows, err := s.store.ListDeployHistory(id, limit, offset)
|
||||
if err != nil {
|
||||
slog.Error("failed to list deploy history", "workload", id, "error", err)
|
||||
respondError(w, http.StatusInternalServerError, "failed to list deploy history")
|
||||
return
|
||||
}
|
||||
for i := range rows {
|
||||
rows[i].Rollbackable = rows[i].Outcome == "success" &&
|
||||
rows[i].Reference != "" &&
|
||||
RollbackCapable(rows[i].SourceKind)
|
||||
}
|
||||
respondJSON(w, http.StatusOK, rows)
|
||||
}
|
||||
|
||||
// rollbackWorkload handles POST /api/workloads/{id}/rollback. Admin-only
|
||||
// (same gate as /deploy). Body: {"deploy_id": <id>}. It resolves the pinned
|
||||
// reference from a prior successful, rollback-capable ledger row belonging
|
||||
// to this workload and replays it as a `rollback`-reason deploy.
|
||||
func (s *Server) rollbackWorkload(w http.ResponseWriter, r *http.Request) {
|
||||
id := chi.URLParam(r, "id")
|
||||
|
||||
row, err := s.store.GetWorkloadByID(id)
|
||||
if err != nil {
|
||||
if errors.Is(err, store.ErrNotFound) {
|
||||
respondNotFound(w, "workload")
|
||||
return
|
||||
}
|
||||
respondError(w, http.StatusInternalServerError, "get workload")
|
||||
return
|
||||
}
|
||||
if row.SourceKind == "" {
|
||||
respondError(w, http.StatusBadRequest, "workload has no source_kind; cannot roll back")
|
||||
return
|
||||
}
|
||||
|
||||
var body struct {
|
||||
DeployID int64 `json:"deploy_id"`
|
||||
}
|
||||
if !decodeJSONStrict(w, r, &body) {
|
||||
return
|
||||
}
|
||||
if body.DeployID <= 0 {
|
||||
respondError(w, http.StatusBadRequest, "deploy_id is required")
|
||||
return
|
||||
}
|
||||
|
||||
entry, err := s.store.GetDeployHistory(body.DeployID)
|
||||
if err != nil {
|
||||
if errors.Is(err, store.ErrNotFound) {
|
||||
respondNotFound(w, "deploy history entry")
|
||||
return
|
||||
}
|
||||
respondError(w, http.StatusInternalServerError, "get deploy history")
|
||||
return
|
||||
}
|
||||
// No cross-workload replay: the entry must belong to the path workload.
|
||||
if entry.WorkloadID != id {
|
||||
respondError(w, http.StatusBadRequest, "deploy entry does not belong to this workload")
|
||||
return
|
||||
}
|
||||
if entry.Outcome != "success" {
|
||||
respondError(w, http.StatusBadRequest, "cannot roll back to a failed deploy")
|
||||
return
|
||||
}
|
||||
if entry.Reference == "" || !RollbackCapable(row.SourceKind) {
|
||||
respondError(w, http.StatusBadRequest, "this deploy is not rollback-capable")
|
||||
return
|
||||
}
|
||||
|
||||
actor := "manual"
|
||||
if claims, ok := auth.ClaimsFromContext(r.Context()); ok && claims.Username != "" {
|
||||
actor = claims.Username
|
||||
}
|
||||
intent := plugin.DeploymentIntent{
|
||||
Reason: "rollback",
|
||||
Reference: entry.Reference,
|
||||
Metadata: map[string]string{
|
||||
"note": "rollback to " + entry.Reference,
|
||||
"rollback_of": strconv.FormatInt(entry.ID, 10),
|
||||
},
|
||||
TriggeredAt: time.Now().UTC(),
|
||||
TriggeredBy: actor,
|
||||
}
|
||||
if err := s.deployer.DispatchPlugin(r.Context(), toPluginWorkload(row), intent); err != nil {
|
||||
// Raw error stays in the server log; client gets a generic message
|
||||
// (the wrapped error can carry registry-auth bytes).
|
||||
slog.Warn("rollback dispatch failed", "workload", id, "actor", actor,
|
||||
"reference", entry.Reference, "error", err)
|
||||
respondError(w, http.StatusInternalServerError, "rollback failed; see server logs")
|
||||
return
|
||||
}
|
||||
respondJSON(w, http.StatusAccepted, map[string]any{
|
||||
"workload_id": id,
|
||||
"reference": entry.Reference,
|
||||
"rollback_of": entry.ID,
|
||||
"triggered_by": actor,
|
||||
})
|
||||
}
|
||||
@@ -0,0 +1,126 @@
|
||||
package api
|
||||
|
||||
import (
|
||||
"net/http"
|
||||
"testing"
|
||||
|
||||
"github.com/alexei/tinyforge/internal/store"
|
||||
"github.com/alexei/tinyforge/internal/workload/plugin"
|
||||
)
|
||||
|
||||
// createImageWorkload creates an image-source workload through the API so
|
||||
// source_kind is persisted exactly as production does, returning its id.
|
||||
func createImageWorkload(t *testing.T, e *apiTestEnv, name string) string {
|
||||
t.Helper()
|
||||
resp := e.do(t, http.MethodPost, "/api/workloads", pluginWorkloadRequest{
|
||||
Name: name, SourceKind: "image", SourceConfig: validImageSourceConfig(),
|
||||
})
|
||||
if resp.StatusCode != http.StatusCreated {
|
||||
_ = decodeEnvelope(t, resp, nil)
|
||||
t.Fatalf("create workload: status %d", resp.StatusCode)
|
||||
}
|
||||
var got plugin.Workload
|
||||
if errMsg := decodeEnvelope(t, resp, &got); errMsg != "" {
|
||||
t.Fatalf("create workload envelope error: %q", errMsg)
|
||||
}
|
||||
return got.ID
|
||||
}
|
||||
|
||||
func TestListWorkloadDeploys_ComputesRollbackable(t *testing.T) {
|
||||
e := newAPITestEnv(t)
|
||||
id := createImageWorkload(t, e, "app")
|
||||
|
||||
// success + reference + image => rollbackable
|
||||
e.store.InsertDeployHistory(store.DeployHistoryEntry{
|
||||
WorkloadID: id, SourceKind: "image", Reference: "v1", Outcome: "success",
|
||||
})
|
||||
// failure => not rollbackable
|
||||
e.store.InsertDeployHistory(store.DeployHistoryEntry{
|
||||
WorkloadID: id, SourceKind: "image", Reference: "v2", Outcome: "failure",
|
||||
})
|
||||
// success but empty reference => not rollbackable
|
||||
e.store.InsertDeployHistory(store.DeployHistoryEntry{
|
||||
WorkloadID: id, SourceKind: "image", Reference: "", Outcome: "success",
|
||||
})
|
||||
|
||||
resp := e.do(t, http.MethodGet, "/api/workloads/"+id+"/deploys", nil)
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
t.Fatalf("status = %d, want 200", resp.StatusCode)
|
||||
}
|
||||
var rows []store.DeployHistoryEntry
|
||||
if errMsg := decodeEnvelope(t, resp, &rows); errMsg != "" {
|
||||
t.Fatalf("envelope error: %q", errMsg)
|
||||
}
|
||||
if len(rows) != 3 {
|
||||
t.Fatalf("expected 3 rows, got %d", len(rows))
|
||||
}
|
||||
// Newest-first: empty-ref success, failure, then v1 success.
|
||||
if !rows[2].Rollbackable {
|
||||
t.Fatalf("v1 success row should be rollbackable: %+v", rows[2])
|
||||
}
|
||||
if rows[1].Rollbackable || rows[0].Rollbackable {
|
||||
t.Fatalf("failure / empty-ref rows must not be rollbackable")
|
||||
}
|
||||
}
|
||||
|
||||
func TestRollback_HappyPath_DispatchesRollbackIntent(t *testing.T) {
|
||||
e := newAPITestEnv(t)
|
||||
id := createImageWorkload(t, e, "app")
|
||||
entry, _ := e.store.InsertDeployHistory(store.DeployHistoryEntry{
|
||||
WorkloadID: id, SourceKind: "image", Reference: "v1", Outcome: "success",
|
||||
})
|
||||
|
||||
before := e.dispatcher.deployCount.Load()
|
||||
resp := e.do(t, http.MethodPost, "/api/workloads/"+id+"/rollback",
|
||||
map[string]any{"deploy_id": entry.ID})
|
||||
if resp.StatusCode != http.StatusAccepted {
|
||||
errMsg := decodeEnvelope(t, resp, nil)
|
||||
t.Fatalf("status = %d, want 202 (err=%q)", resp.StatusCode, errMsg)
|
||||
}
|
||||
if got := e.dispatcher.deployCount.Load(); got != before+1 {
|
||||
t.Fatalf("expected one dispatch, got delta %d", got-before)
|
||||
}
|
||||
intent := e.dispatcher.lastIntent.Load()
|
||||
if intent == nil || intent.Reason != "rollback" || intent.Reference != "v1" {
|
||||
t.Fatalf("expected rollback intent for v1, got %+v", intent)
|
||||
}
|
||||
}
|
||||
|
||||
func TestRollback_Guards(t *testing.T) {
|
||||
e := newAPITestEnv(t)
|
||||
imageID := createImageWorkload(t, e, "img")
|
||||
otherID := createImageWorkload(t, e, "other")
|
||||
|
||||
success, _ := e.store.InsertDeployHistory(store.DeployHistoryEntry{
|
||||
WorkloadID: imageID, SourceKind: "image", Reference: "v1", Outcome: "success",
|
||||
})
|
||||
failed, _ := e.store.InsertDeployHistory(store.DeployHistoryEntry{
|
||||
WorkloadID: imageID, SourceKind: "image", Reference: "v2", Outcome: "failure",
|
||||
})
|
||||
otherWL, _ := e.store.InsertDeployHistory(store.DeployHistoryEntry{
|
||||
WorkloadID: otherID, SourceKind: "image", Reference: "v1", Outcome: "success",
|
||||
})
|
||||
|
||||
cases := []struct {
|
||||
name string
|
||||
workload string
|
||||
body any
|
||||
wantCode int
|
||||
}{
|
||||
{"missing deploy_id", imageID, map[string]any{}, http.StatusBadRequest},
|
||||
{"zero deploy_id", imageID, map[string]any{"deploy_id": 0}, http.StatusBadRequest},
|
||||
{"unknown deploy_id", imageID, map[string]any{"deploy_id": 999999}, http.StatusNotFound},
|
||||
{"unknown workload", "nope", map[string]any{"deploy_id": success.ID}, http.StatusNotFound},
|
||||
{"failed deploy", imageID, map[string]any{"deploy_id": failed.ID}, http.StatusBadRequest},
|
||||
{"cross-workload entry", imageID, map[string]any{"deploy_id": otherWL.ID}, http.StatusBadRequest},
|
||||
}
|
||||
for _, c := range cases {
|
||||
t.Run(c.name, func(t *testing.T) {
|
||||
resp := e.do(t, http.MethodPost, "/api/workloads/"+c.workload+"/rollback", c.body)
|
||||
if resp.StatusCode != c.wantCode {
|
||||
errMsg := decodeEnvelope(t, resp, nil)
|
||||
t.Fatalf("status = %d, want %d (err=%q)", resp.StatusCode, c.wantCode, errMsg)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -336,6 +336,12 @@ func (s *Server) Router() chi.Router {
|
||||
r.With(auth.AdminOnly).Post("/start", s.startPluginWorkload)
|
||||
r.With(auth.AdminOnly).Delete("/", s.deletePluginWorkload)
|
||||
|
||||
// Deploy ledger + rollback. The history feed is read-only
|
||||
// (any authenticated user); rollback is a redeploy, so it is
|
||||
// admin-gated like /deploy.
|
||||
r.Get("/deploys", s.listWorkloadDeploys)
|
||||
r.With(auth.AdminOnly).Post("/rollback", s.rollbackWorkload)
|
||||
|
||||
// Volume snapshots (admin-only). Capture/list a workload's
|
||||
// host-bind data volumes; {sid}-scoped download/delete live
|
||||
// in the global admin group alongside backups.
|
||||
@@ -348,6 +354,10 @@ func (s *Server) Router() chi.Router {
|
||||
r.Get("/runtime-state", s.getWorkloadRuntimeState)
|
||||
r.Get("/storage", s.getWorkloadStorage)
|
||||
|
||||
// Per-workload metrics history (CPU/memory time-series),
|
||||
// aggregated across the workload's containers. Read-only.
|
||||
r.Get("/stats/history", s.getWorkloadStatsHistory)
|
||||
|
||||
// Per-workload activity / deploy timeline (read-only). Scoped
|
||||
// to this workload's event-log rows; the global feed lives at
|
||||
// /events/log.
|
||||
|
||||
@@ -1,12 +1,15 @@
|
||||
package api
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"log/slog"
|
||||
"net/http"
|
||||
"sort"
|
||||
"strconv"
|
||||
"time"
|
||||
|
||||
"github.com/go-chi/chi/v5"
|
||||
|
||||
"github.com/alexei/tinyforge/internal/auth"
|
||||
"github.com/alexei/tinyforge/internal/store"
|
||||
)
|
||||
@@ -85,6 +88,76 @@ func (s *Server) getSystemStatsHistory(w http.ResponseWriter, r *http.Request) {
|
||||
respondJSON(w, http.StatusOK, samples)
|
||||
}
|
||||
|
||||
// workloadStatsPoint is a single aggregated time bucket on a workload's
// metrics graph. All containers owned by the workload are folded together at
// each timestamp, so a multi-container (compose) workload plots as one line.
type workloadStatsPoint struct {
	// TS is the sample timestamp the bucket is keyed by.
	TS int64 `json:"ts"`
	// CPUPercent is the sum of CPU% across the workload's containers.
	CPUPercent float64 `json:"cpu_percent"`
	// MemoryUsage is the sum of memory usage across the workload's containers.
	MemoryUsage int64 `json:"memory_usage"`
	// MemoryLimit is the largest per-container limit, i.e. the effective
	// ceiling. The UI plots absolute MiB instead because the limit is often
	// 0 (unlimited).
	MemoryLimit int64 `json:"memory_limit"`
}
|
||||
|
||||
// getWorkloadStatsHistory handles GET /api/workloads/{id}/stats/history?window=1h.
|
||||
// Read-only and open to any authenticated user (mirrors the per-workload
|
||||
// events/runtime-state feeds). Always returns a (possibly empty) array — never
|
||||
// 503 — because samples come from SQLite, which is available even when the
|
||||
// Docker daemon is down or stats collection is disabled. Unknown workload id
|
||||
// 404s; a known workload with no samples yet returns [].
|
||||
func (s *Server) getWorkloadStatsHistory(w http.ResponseWriter, r *http.Request) {
|
||||
id := chi.URLParam(r, "id")
|
||||
if id == "" {
|
||||
respondError(w, http.StatusBadRequest, "workload id is required")
|
||||
return
|
||||
}
|
||||
if _, err := s.store.GetWorkloadByID(id); err != nil {
|
||||
if errors.Is(err, store.ErrNotFound) {
|
||||
respondNotFound(w, "workload")
|
||||
return
|
||||
}
|
||||
respondError(w, http.StatusInternalServerError, "get workload")
|
||||
return
|
||||
}
|
||||
|
||||
samples, err := s.store.ListContainerStatsSamplesByWorkload(id, sinceTimestamp(parseWindow(r)))
|
||||
if err != nil {
|
||||
slog.Error("failed to list workload stats samples", "workload", id, "error", err)
|
||||
respondError(w, http.StatusInternalServerError, "failed to list samples")
|
||||
return
|
||||
}
|
||||
|
||||
respondJSON(w, http.StatusOK, aggregateWorkloadStats(samples))
|
||||
}
|
||||
|
||||
// aggregateWorkloadStats folds per-container samples into one series keyed by
|
||||
// timestamp: CPU% and memory usage are summed across the workload's containers,
|
||||
// memory limit takes the max. Samples arrive ts-ascending, so the output keeps
|
||||
// that order without an extra sort.
|
||||
func aggregateWorkloadStats(samples []store.ContainerStatsSample) []workloadStatsPoint {
|
||||
points := make([]workloadStatsPoint, 0)
|
||||
idx := make(map[int64]int) // ts → index in points
|
||||
for _, sm := range samples {
|
||||
if i, ok := idx[sm.TS]; ok {
|
||||
points[i].CPUPercent += sm.CPUPercent
|
||||
points[i].MemoryUsage += sm.MemoryUsage
|
||||
if sm.MemoryLimit > points[i].MemoryLimit {
|
||||
points[i].MemoryLimit = sm.MemoryLimit
|
||||
}
|
||||
continue
|
||||
}
|
||||
idx[sm.TS] = len(points)
|
||||
points = append(points, workloadStatsPoint{
|
||||
TS: sm.TS,
|
||||
CPUPercent: sm.CPUPercent,
|
||||
MemoryUsage: sm.MemoryUsage,
|
||||
MemoryLimit: sm.MemoryLimit,
|
||||
})
|
||||
}
|
||||
return points
|
||||
}
|
||||
|
||||
// listTopContainers handles GET /api/system/stats/top?limit=5&by=cpu.
|
||||
// Returns the top-N most recent samples across containers, sorted by CPU or
|
||||
// memory. Container IDs are stripped for non-admins so a low-privilege viewer
|
||||
|
||||
@@ -0,0 +1,64 @@
|
||||
package api
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
"github.com/alexei/tinyforge/internal/store"
|
||||
)
|
||||
|
||||
func TestAggregateWorkloadStats_SumsPerTimestamp(t *testing.T) {
|
||||
// Two containers reporting at the same two ticks → summed per ts.
|
||||
samples := []store.ContainerStatsSample{
|
||||
{TS: 100, CPUPercent: 10, MemoryUsage: 1000, MemoryLimit: 4000},
|
||||
{TS: 100, CPUPercent: 5, MemoryUsage: 500, MemoryLimit: 8000},
|
||||
{TS: 200, CPUPercent: 20, MemoryUsage: 2000, MemoryLimit: 4000},
|
||||
}
|
||||
pts := aggregateWorkloadStats(samples)
|
||||
if len(pts) != 2 {
|
||||
t.Fatalf("expected 2 buckets, got %d", len(pts))
|
||||
}
|
||||
if pts[0].TS != 100 || pts[0].CPUPercent != 15 || pts[0].MemoryUsage != 1500 {
|
||||
t.Fatalf("ts=100 bucket wrong: %+v", pts[0])
|
||||
}
|
||||
// Memory limit takes the max across containers.
|
||||
if pts[0].MemoryLimit != 8000 {
|
||||
t.Fatalf("expected max memory limit 8000, got %d", pts[0].MemoryLimit)
|
||||
}
|
||||
if pts[1].TS != 200 || pts[1].CPUPercent != 20 {
|
||||
t.Fatalf("ts=200 bucket wrong: %+v", pts[1])
|
||||
}
|
||||
}
|
||||
|
||||
func TestAggregateWorkloadStats_Empty(t *testing.T) {
|
||||
pts := aggregateWorkloadStats(nil)
|
||||
if pts == nil {
|
||||
t.Fatal("expected non-nil empty slice for clean JSON []")
|
||||
}
|
||||
if len(pts) != 0 {
|
||||
t.Fatalf("expected 0 points, got %d", len(pts))
|
||||
}
|
||||
}
|
||||
|
||||
func TestWorkloadStatsHistory_UnknownWorkload404(t *testing.T) {
|
||||
e := newAPITestEnv(t)
|
||||
resp := e.do(t, "GET", "/api/workloads/nope/stats/history", nil)
|
||||
if resp.StatusCode != 404 {
|
||||
t.Fatalf("expected 404 for unknown workload, got %d", resp.StatusCode)
|
||||
}
|
||||
}
|
||||
|
||||
func TestWorkloadStatsHistory_KnownWorkloadEmpty(t *testing.T) {
|
||||
e := newAPITestEnv(t)
|
||||
id := createImageWorkload(t, e, "metrics-app")
|
||||
resp := e.do(t, "GET", "/api/workloads/"+id+"/stats/history", nil)
|
||||
if resp.StatusCode != 200 {
|
||||
t.Fatalf("expected 200, got %d", resp.StatusCode)
|
||||
}
|
||||
var pts []workloadStatsPoint
|
||||
if errMsg := decodeEnvelope(t, resp, &pts); errMsg != "" {
|
||||
t.Fatalf("envelope error: %q", errMsg)
|
||||
}
|
||||
if len(pts) != 0 {
|
||||
t.Fatalf("expected empty series for app with no samples, got %d", len(pts))
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user