feat(apps): per-workload deploy history, rollback, and resource metrics

Two additions to the app detail page, each backed by a per-workload endpoint. Deploy history + rollback: - New deploy_history table — a structured, version-pinned ledger of every dispatch (success AND failure), distinct from the free-text event_log. Recorded at the single DispatchPlugin choke point so every source kind is covered. The raw deploy error is never persisted (it can carry registry-auth / compose-stdout secrets) — only a generic marker, with detail going to slog. Pruned to the newest N per workload; cascade-deleted with the workload. - GET /api/workloads/{id}/deploys lists the ledger; POST .../rollback (admin) replays a prior successful deploy's pinned reference as a rollback-reason dispatch. Phase 1 is image-source only (RollbackCapable); git-built sources need checkout-by-commit, a later phase. - DeployHistoryPanel.svelte renders the ledger with confirm-gated rollback. Per-workload metrics: - ListContainerStatsSamplesByWorkload joins the existing container stats samples through the containers index; GET /api/workloads/{id}/stats/history aggregates CPU/memory per timestamp across the workload's containers. - WorkloadMetricsPanel.svelte reuses ResourceChart (CPU% + memory MiB, windowed, 15s poll). en/ru i18n added with parity. Tests: store CRUD + cascade + workload-scoped join, deployer recording (incl. secret-non-leak on failure), API rollback guards, and per-timestamp aggregation. Plans under docs/plans/.
2026-06-19 16:22:12 +03:00
parent c8e71a0c34
commit 0c4c338bfe
23 changed files with 1828 additions and 0 deletions
@@ -0,0 +1,151 @@
+package api
+
+import (
+	"errors"
+	"log/slog"
+	"net/http"
+	"strconv"
+	"time"
+
+	"github.com/go-chi/chi/v5"
+
+	"github.com/alexei/tinyforge/internal/auth"
+	"github.com/alexei/tinyforge/internal/store"
+	"github.com/alexei/tinyforge/internal/workload/plugin"
+)
+
+// parseOffset turns the raw `offset` query value into a usable pagination
+// offset. Input that does not parse as an integer, or that parses to a
+// negative number, collapses to 0. The limit half of pagination is handled
+// by parseLimit (secrets.go).
+func parseOffset(raw string) int {
+	if n, err := strconv.Atoi(raw); err == nil && n > 0 {
+		return n
+	}
+	return 0
+}
+
+// rollbackCapableKinds lists the source kinds that support reference-pinned
+// redeploy; it is the one place that decision is recorded. The image source
+// resolves intent.Reference as the tag, so replaying an earlier tag is a
+// genuine rollback. static/dockerfile clone branch HEAD and cannot yet check
+// out an arbitrary commit (planned for a later phase), and compose has no
+// single artifact handle to pin.
+var rollbackCapableKinds = map[string]bool{
+	"image": true,
+}
+
+// RollbackCapable reports whether the given source kind supports one-click
+// rollback. Unknown kinds report false. Both the list response (per-row
+// `rollbackable` flag) and the rollback guard consult it, which keeps the UI
+// and the server in agreement.
+func RollbackCapable(sourceKind string) bool {
+	capable, known := rollbackCapableKinds[sourceKind]
+	return known && capable
+}
+
+// listWorkloadDeploys serves GET /api/workloads/{id}/deploys: the structured
+// deploy ledger for one workload, newest-first. It is read-only and open to
+// any authenticated user (mirroring the per-workload events feed).
+//
+// Query parameters `limit` (default 50, capped at 200) and `offset` page the
+// result. Each returned row carries a server-computed `rollbackable` flag:
+// true only for a successful deploy with a non-empty reference whose source
+// kind is rollback-capable.
+func (s *Server) listWorkloadDeploys(w http.ResponseWriter, r *http.Request) {
+	workloadID := chi.URLParam(r, "id")
+	if workloadID == "" {
+		respondError(w, http.StatusBadRequest, "workload id is required")
+		return
+	}
+
+	query := r.URL.Query()
+	entries, err := s.store.ListDeployHistory(
+		workloadID,
+		parseLimit(query.Get("limit"), 50, 200),
+		parseOffset(query.Get("offset")),
+	)
+	if err != nil {
+		slog.Error("failed to list deploy history", "workload", workloadID, "error", err)
+		respondError(w, http.StatusInternalServerError, "failed to list deploy history")
+		return
+	}
+
+	// The flag is derived here rather than stored, so it always reflects the
+	// current set of rollback-capable source kinds.
+	for i := range entries {
+		e := &entries[i]
+		succeeded := e.Outcome == "success"
+		pinned := e.Reference != ""
+		e.Rollbackable = succeeded && pinned && RollbackCapable(e.SourceKind)
+	}
+	respondJSON(w, http.StatusOK, entries)
+}
+
+// rollbackWorkload handles POST /api/workloads/{id}/rollback. Admin-only
+// (same gate as /deploy). Body: {"deploy_id": <id>}.
+//
+// It loads the workload and the referenced ledger row, then refuses unless
+// the row (a) belongs to the path workload, (b) recorded a successful
+// deploy, (c) carries a non-empty reference, and (d) was recorded under the
+// same rollback-capable source kind the workload has today. Check (d) uses
+// the entry's own kind as well as the workload's: a reference only has
+// meaning to the source kind that produced it, and the list endpoint
+// computes its `rollbackable` flag from the entry's kind, so checking both
+// keeps the server guard from accepting a row the list reports as
+// non-rollbackable.
+//
+// On success the pinned reference is replayed through DispatchPlugin as a
+// `rollback`-reason intent and the handler answers 202. Responses otherwise:
+// 400 for a missing id / deploy_id or a failed guard, 404 for an unknown
+// workload or ledger row, 500 for store or dispatch failures (detail is
+// logged server-side only).
+func (s *Server) rollbackWorkload(w http.ResponseWriter, r *http.Request) {
+	id := chi.URLParam(r, "id")
+	if id == "" {
+		respondError(w, http.StatusBadRequest, "workload id is required")
+		return
+	}
+
+	row, err := s.store.GetWorkloadByID(id)
+	if err != nil {
+		if errors.Is(err, store.ErrNotFound) {
+			respondNotFound(w, "workload")
+			return
+		}
+		slog.Error("rollback: failed to load workload", "workload", id, "error", err)
+		respondError(w, http.StatusInternalServerError, "get workload")
+		return
+	}
+	if row.SourceKind == "" {
+		respondError(w, http.StatusBadRequest, "workload has no source_kind; cannot roll back")
+		return
+	}
+
+	var body struct {
+		DeployID int64 `json:"deploy_id"`
+	}
+	// decodeJSONStrict reports false when the body is unusable; presumably it
+	// has already written the error response in that case.
+	if !decodeJSONStrict(w, r, &body) {
+		return
+	}
+	if body.DeployID <= 0 {
+		respondError(w, http.StatusBadRequest, "deploy_id is required")
+		return
+	}
+
+	entry, err := s.store.GetDeployHistory(body.DeployID)
+	if err != nil {
+		if errors.Is(err, store.ErrNotFound) {
+			respondNotFound(w, "deploy history entry")
+			return
+		}
+		slog.Error("rollback: failed to load deploy history entry",
+			"workload", id, "deploy_id", body.DeployID, "error", err)
+		respondError(w, http.StatusInternalServerError, "get deploy history")
+		return
+	}
+	// No cross-workload replay: the entry must belong to the path workload.
+	if entry.WorkloadID != id {
+		respondError(w, http.StatusBadRequest, "deploy entry does not belong to this workload")
+		return
+	}
+	if entry.Outcome != "success" {
+		respondError(w, http.StatusBadRequest, "cannot roll back to a failed deploy")
+		return
+	}
+	// The entry must have been recorded under the workload's current source
+	// kind, and that kind must support pinned redeploy. Equality plus one
+	// capability check covers both sides.
+	if entry.Reference == "" ||
+		entry.SourceKind != row.SourceKind ||
+		!RollbackCapable(row.SourceKind) {
+		respondError(w, http.StatusBadRequest, "this deploy is not rollback-capable")
+		return
+	}
+
+	actor := "manual"
+	if claims, ok := auth.ClaimsFromContext(r.Context()); ok && claims.Username != "" {
+		actor = claims.Username
+	}
+	intent := plugin.DeploymentIntent{
+		Reason:    "rollback",
+		Reference: entry.Reference,
+		Metadata: map[string]string{
+			"note":        "rollback to " + entry.Reference,
+			"rollback_of": strconv.FormatInt(entry.ID, 10),
+		},
+		TriggeredAt: time.Now().UTC(),
+		TriggeredBy: actor,
+	}
+	if err := s.deployer.DispatchPlugin(r.Context(), toPluginWorkload(row), intent); err != nil {
+		// Raw error stays in the server log; client gets a generic message
+		// (the wrapped error can carry registry-auth bytes).
+		slog.Warn("rollback dispatch failed", "workload", id, "actor", actor,
+			"reference", entry.Reference, "error", err)
+		respondError(w, http.StatusInternalServerError, "rollback failed; see server logs")
+		return
+	}
+	respondJSON(w, http.StatusAccepted, map[string]any{
+		"workload_id":  id,
+		"reference":    entry.Reference,
+		"rollback_of":  entry.ID,
+		"triggered_by": actor,
+	})
+}
@@ -0,0 +1,126 @@
+package api
+
+import (
+	"net/http"
+	"testing"
+
+	"github.com/alexei/tinyforge/internal/store"
+	"github.com/alexei/tinyforge/internal/workload/plugin"
+)
+
+// createImageWorkload provisions an image-source workload by POSTing to the
+// API (rather than writing to the store directly) so source_kind is
+// persisted the same way production does it. It fails the test on any
+// non-201 response or envelope error and returns the new workload's id.
+func createImageWorkload(t *testing.T, e *apiTestEnv, name string) string {
+	t.Helper()
+
+	payload := pluginWorkloadRequest{
+		Name:         name,
+		SourceKind:   "image",
+		SourceConfig: validImageSourceConfig(),
+	}
+	resp := e.do(t, http.MethodPost, "/api/workloads", payload)
+	if status := resp.StatusCode; status != http.StatusCreated {
+		_ = decodeEnvelope(t, resp, nil)
+		t.Fatalf("create workload: status %d", status)
+	}
+
+	var created plugin.Workload
+	errMsg := decodeEnvelope(t, resp, &created)
+	if errMsg != "" {
+		t.Fatalf("create workload envelope error: %q", errMsg)
+	}
+	return created.ID
+}
+
+// TestListWorkloadDeploys_ComputesRollbackable seeds three ledger rows and
+// checks the server-computed `rollbackable` flag on each: only a successful
+// image deploy with a non-empty reference qualifies. Seed inserts are
+// checked so a store failure surfaces as such instead of as a confusing
+// row-count mismatch.
+func TestListWorkloadDeploys_ComputesRollbackable(t *testing.T) {
+	e := newAPITestEnv(t)
+	id := createImageWorkload(t, e, "app")
+
+	// Inserted oldest-first; the endpoint returns them newest-first.
+	seeds := []store.DeployHistoryEntry{
+		// success + reference + image  => rollbackable
+		{WorkloadID: id, SourceKind: "image", Reference: "v1", Outcome: "success"},
+		// failure                      => not rollbackable
+		{WorkloadID: id, SourceKind: "image", Reference: "v2", Outcome: "failure"},
+		// success but empty reference  => not rollbackable
+		{WorkloadID: id, SourceKind: "image", Reference: "", Outcome: "success"},
+	}
+	for _, seed := range seeds {
+		if _, err := e.store.InsertDeployHistory(seed); err != nil {
+			t.Fatalf("InsertDeployHistory(%+v): %v", seed, err)
+		}
+	}
+
+	resp := e.do(t, http.MethodGet, "/api/workloads/"+id+"/deploys", nil)
+	if resp.StatusCode != http.StatusOK {
+		t.Fatalf("status = %d, want 200", resp.StatusCode)
+	}
+	var rows []store.DeployHistoryEntry
+	if errMsg := decodeEnvelope(t, resp, &rows); errMsg != "" {
+		t.Fatalf("envelope error: %q", errMsg)
+	}
+	if len(rows) != 3 {
+		t.Fatalf("expected 3 rows, got %d", len(rows))
+	}
+	// Newest-first: empty-ref success, failure, then v1 success.
+	if !rows[2].Rollbackable {
+		t.Fatalf("v1 success row should be rollbackable: %+v", rows[2])
+	}
+	if rows[1].Rollbackable || rows[0].Rollbackable {
+		t.Fatalf("failure / empty-ref rows must not be rollbackable")
+	}
+}
+
+// TestRollback_HappyPath_DispatchesRollbackIntent rolls an image workload
+// back to a prior successful deploy and verifies exactly one dispatch
+// happened, carrying a `rollback` intent pinned to the recorded reference.
+func TestRollback_HappyPath_DispatchesRollbackIntent(t *testing.T) {
+	e := newAPITestEnv(t)
+	id := createImageWorkload(t, e, "app")
+	entry, err := e.store.InsertDeployHistory(store.DeployHistoryEntry{
+		WorkloadID: id, SourceKind: "image", Reference: "v1", Outcome: "success",
+	})
+	// Without this check a failed insert yields entry.ID == 0 and the test
+	// dies later on an unrelated-looking 400.
+	if err != nil {
+		t.Fatalf("InsertDeployHistory: %v", err)
+	}
+
+	before := e.dispatcher.deployCount.Load()
+	resp := e.do(t, http.MethodPost, "/api/workloads/"+id+"/rollback",
+		map[string]any{"deploy_id": entry.ID})
+	if resp.StatusCode != http.StatusAccepted {
+		errMsg := decodeEnvelope(t, resp, nil)
+		t.Fatalf("status = %d, want 202 (err=%q)", resp.StatusCode, errMsg)
+	}
+	if got := e.dispatcher.deployCount.Load(); got != before+1 {
+		t.Fatalf("expected one dispatch, got delta %d", got-before)
+	}
+	intent := e.dispatcher.lastIntent.Load()
+	if intent == nil || intent.Reason != "rollback" || intent.Reference != "v1" {
+		t.Fatalf("expected rollback intent for v1, got %+v", intent)
+	}
+}
+
+// TestRollback_Guards drives the rollback endpoint through each rejection
+// path and checks the status code. Seed rows are inserted through a helper
+// that fails on store errors: a silently failed insert would leave the id at
+// 0, and the "failed deploy" / "cross-workload entry" cases would then pass
+// on the deploy_id-required check rather than on the guard they are meant to
+// exercise.
+func TestRollback_Guards(t *testing.T) {
+	e := newAPITestEnv(t)
+	imageID := createImageWorkload(t, e, "img")
+	otherID := createImageWorkload(t, e, "other")
+
+	seed := func(workloadID, ref, outcome string) store.DeployHistoryEntry {
+		t.Helper()
+		row, err := e.store.InsertDeployHistory(store.DeployHistoryEntry{
+			WorkloadID: workloadID, SourceKind: "image", Reference: ref, Outcome: outcome,
+		})
+		if err != nil {
+			t.Fatalf("InsertDeployHistory(%s, %s): %v", ref, outcome, err)
+		}
+		if row.ID <= 0 {
+			t.Fatalf("InsertDeployHistory(%s, %s): non-positive id %d", ref, outcome, row.ID)
+		}
+		return row
+	}
+	success := seed(imageID, "v1", "success")
+	failed := seed(imageID, "v2", "failure")
+	otherWL := seed(otherID, "v1", "success")
+
+	cases := []struct {
+		name     string
+		workload string
+		body     any
+		wantCode int
+	}{
+		{"missing deploy_id", imageID, map[string]any{}, http.StatusBadRequest},
+		{"zero deploy_id", imageID, map[string]any{"deploy_id": 0}, http.StatusBadRequest},
+		{"unknown deploy_id", imageID, map[string]any{"deploy_id": 999999}, http.StatusNotFound},
+		{"unknown workload", "nope", map[string]any{"deploy_id": success.ID}, http.StatusNotFound},
+		{"failed deploy", imageID, map[string]any{"deploy_id": failed.ID}, http.StatusBadRequest},
+		{"cross-workload entry", imageID, map[string]any{"deploy_id": otherWL.ID}, http.StatusBadRequest},
+	}
+	for _, c := range cases {
+		t.Run(c.name, func(t *testing.T) {
+			resp := e.do(t, http.MethodPost, "/api/workloads/"+c.workload+"/rollback", c.body)
+			if resp.StatusCode != c.wantCode {
+				errMsg := decodeEnvelope(t, resp, nil)
+				t.Fatalf("status = %d, want %d (err=%q)", resp.StatusCode, c.wantCode, errMsg)
+			}
+		})
+	}
+}
@@ -336,6 +336,12 @@ func (s *Server) Router() chi.Router {
 				r.With(auth.AdminOnly).Post("/start", s.startPluginWorkload)
 				r.With(auth.AdminOnly).Delete("/", s.deletePluginWorkload)

+				// Deploy ledger + rollback. The history feed is read-only
+				// (any authenticated user); rollback is a redeploy, so it is
+				// admin-gated like /deploy.
+				r.Get("/deploys", s.listWorkloadDeploys)
+				r.With(auth.AdminOnly).Post("/rollback", s.rollbackWorkload)
+
 				// Volume snapshots (admin-only). Capture/list a workload's
 				// host-bind data volumes; {sid}-scoped download/delete live
 				// in the global admin group alongside backups.
@@ -348,6 +354,10 @@ func (s *Server) Router() chi.Router {
 				r.Get("/runtime-state", s.getWorkloadRuntimeState)
 				r.Get("/storage", s.getWorkloadStorage)

+				// Per-workload metrics history (CPU/memory time-series),
+				// aggregated across the workload's containers. Read-only.
+				r.Get("/stats/history", s.getWorkloadStatsHistory)
+
 				// Per-workload activity / deploy timeline (read-only). Scoped
 				// to this workload's event-log rows; the global feed lives at
 				// /events/log.
@@ -1,12 +1,15 @@
 package api

 import (
+	"errors"
 	"log/slog"
 	"net/http"
 	"sort"
 	"strconv"
 	"time"

+	"github.com/go-chi/chi/v5"
+
 	"github.com/alexei/tinyforge/internal/auth"
 	"github.com/alexei/tinyforge/internal/store"
 )
@@ -85,6 +88,76 @@ func (s *Server) getSystemStatsHistory(w http.ResponseWriter, r *http.Request) {
 	respondJSON(w, http.StatusOK, samples)
 }

+// workloadStatsPoint is one aggregated time bucket for a workload's metrics
+// graph: every container the workload owns is folded together at each
+// timestamp so a multi-container (compose) workload reads as a single line.
+//
+// Fields, as produced by aggregateWorkloadStats:
+//   - TS: the sample timestamp the bucket is keyed by (copied from the
+//     underlying samples; presumably epoch-based — confirm against the
+//     collector).
+//   - CPUPercent: sum of the containers' CPU percentages at TS.
+//   - MemoryUsage: sum of the containers' memory usage at TS.
+//   - MemoryLimit: the max limit across containers — the effective ceiling —
+//     though the UI plots absolute MiB because the limit is often 0
+//     (unlimited).
+type workloadStatsPoint struct {
+	TS          int64   `json:"ts"`
+	CPUPercent  float64 `json:"cpu_percent"`
+	MemoryUsage int64   `json:"memory_usage"`
+	MemoryLimit int64   `json:"memory_limit"`
+}
+
+// getWorkloadStatsHistory serves
+// GET /api/workloads/{id}/stats/history?window=1h.
+//
+// Read-only and available to any authenticated user, like the per-workload
+// events and runtime-state feeds. The response is always an array (possibly
+// empty) and never a 503: samples are read from SQLite, which stays
+// available even when the Docker daemon is down or stats collection is
+// disabled. An unknown workload id yields 404; a known workload without
+// samples yields [].
+func (s *Server) getWorkloadStatsHistory(w http.ResponseWriter, r *http.Request) {
+	workloadID := chi.URLParam(r, "id")
+	if workloadID == "" {
+		respondError(w, http.StatusBadRequest, "workload id is required")
+		return
+	}
+
+	// Existence check first, so "no such workload" and "no samples yet" are
+	// distinguishable to the client.
+	_, lookupErr := s.store.GetWorkloadByID(workloadID)
+	switch {
+	case errors.Is(lookupErr, store.ErrNotFound):
+		respondNotFound(w, "workload")
+		return
+	case lookupErr != nil:
+		respondError(w, http.StatusInternalServerError, "get workload")
+		return
+	}
+
+	since := sinceTimestamp(parseWindow(r))
+	samples, err := s.store.ListContainerStatsSamplesByWorkload(workloadID, since)
+	if err != nil {
+		slog.Error("failed to list workload stats samples", "workload", workloadID, "error", err)
+		respondError(w, http.StatusInternalServerError, "failed to list samples")
+		return
+	}
+
+	respondJSON(w, http.StatusOK, aggregateWorkloadStats(samples))
+}
+
+// aggregateWorkloadStats collapses per-container samples into a single
+// series with one point per distinct timestamp. Within a timestamp, CPU% and
+// memory usage are added up across containers and the memory limit is the
+// largest one seen. Points appear in the order their timestamp is first
+// encountered; since samples are expected to arrive ts-ascending, the result
+// is ts-ascending too and no sort is performed. The returned slice is never
+// nil, so it encodes as a JSON array even when empty.
+func aggregateWorkloadStats(samples []store.ContainerStatsSample) []workloadStatsPoint {
+	series := make([]workloadStatsPoint, 0)
+	position := make(map[int64]int) // ts → index of its bucket in series
+	for _, sample := range samples {
+		at, seen := position[sample.TS]
+		if !seen {
+			// First sample for this timestamp opens a new bucket.
+			position[sample.TS] = len(series)
+			series = append(series, workloadStatsPoint{
+				TS:          sample.TS,
+				CPUPercent:  sample.CPUPercent,
+				MemoryUsage: sample.MemoryUsage,
+				MemoryLimit: sample.MemoryLimit,
+			})
+			continue
+		}
+		bucket := &series[at]
+		bucket.CPUPercent += sample.CPUPercent
+		bucket.MemoryUsage += sample.MemoryUsage
+		if bucket.MemoryLimit < sample.MemoryLimit {
+			bucket.MemoryLimit = sample.MemoryLimit
+		}
+	}
+	return series
+}
+
 // listTopContainers handles GET /api/system/stats/top?limit=5&by=cpu.
 // Returns the top-N most recent samples across containers, sorted by CPU or
 // memory. Container IDs are stripped for non-admins so a low-privilege viewer
@@ -0,0 +1,64 @@
+package api
+
+import (
+	"testing"
+
+	"github.com/alexei/tinyforge/internal/store"
+)
+
+// TestAggregateWorkloadStats_SumsPerTimestamp feeds two containers' samples
+// sharing one tick plus a lone later tick, and expects one bucket per
+// timestamp with CPU/memory summed and the memory limit maxed.
+func TestAggregateWorkloadStats_SumsPerTimestamp(t *testing.T) {
+	input := []store.ContainerStatsSample{
+		// Two containers reporting at ts=100 …
+		{TS: 100, CPUPercent: 10, MemoryUsage: 1000, MemoryLimit: 4000},
+		{TS: 100, CPUPercent: 5, MemoryUsage: 500, MemoryLimit: 8000},
+		// … and a single container at ts=200.
+		{TS: 200, CPUPercent: 20, MemoryUsage: 2000, MemoryLimit: 4000},
+	}
+
+	buckets := aggregateWorkloadStats(input)
+	if n := len(buckets); n != 2 {
+		t.Fatalf("expected 2 buckets, got %d", n)
+	}
+
+	first, second := buckets[0], buckets[1]
+	firstOK := first.TS == 100 && first.CPUPercent == 15 && first.MemoryUsage == 1500
+	if !firstOK {
+		t.Fatalf("ts=100 bucket wrong: %+v", first)
+	}
+	// Memory limit takes the max across containers.
+	if first.MemoryLimit != 8000 {
+		t.Fatalf("expected max memory limit 8000, got %d", first.MemoryLimit)
+	}
+	secondOK := second.TS == 200 && second.CPUPercent == 20
+	if !secondOK {
+		t.Fatalf("ts=200 bucket wrong: %+v", second)
+	}
+}
+
+// TestAggregateWorkloadStats_Empty checks that no input produces an empty
+// but non-nil slice, so the endpoint serializes `[]` rather than `null`.
+func TestAggregateWorkloadStats_Empty(t *testing.T) {
+	got := aggregateWorkloadStats(nil)
+	switch {
+	case got == nil:
+		t.Fatal("expected non-nil empty slice for clean JSON []")
+	case len(got) != 0:
+		t.Fatalf("expected 0 points, got %d", len(got))
+	}
+}
+
+// TestWorkloadStatsHistory_UnknownWorkload404 requests metrics history for a
+// workload id that does not exist and expects a 404.
+func TestWorkloadStatsHistory_UnknownWorkload404(t *testing.T) {
+	env := newAPITestEnv(t)
+	resp := env.do(t, "GET", "/api/workloads/nope/stats/history", nil)
+	if status := resp.StatusCode; status != 404 {
+		t.Fatalf("expected 404 for unknown workload, got %d", status)
+	}
+}
+
+// TestWorkloadStatsHistory_KnownWorkloadEmpty requests metrics history for a
+// freshly created workload with no samples and expects 200 with an empty
+// series.
+func TestWorkloadStatsHistory_KnownWorkloadEmpty(t *testing.T) {
+	env := newAPITestEnv(t)
+	workloadID := createImageWorkload(t, env, "metrics-app")
+
+	resp := env.do(t, "GET", "/api/workloads/"+workloadID+"/stats/history", nil)
+	if status := resp.StatusCode; status != 200 {
+		t.Fatalf("expected 200, got %d", status)
+	}
+
+	var series []workloadStatsPoint
+	errMsg := decodeEnvelope(t, resp, &series)
+	if errMsg != "" {
+		t.Fatalf("envelope error: %q", errMsg)
+	}
+	if n := len(series); n != 0 {
+		t.Fatalf("expected empty series for app with no samples, got %d", n)
+	}
+}
@@ -0,0 +1,76 @@
+package deployer
+
+import (
+	"log/slog"
+
+	"github.com/alexei/tinyforge/internal/store"
+	"github.com/alexei/tinyforge/internal/workload/plugin"
+)
+
+// deployHistoryKeepPerWorkload is the number of ledger rows retained per
+// workload; recordDeployHistory passes it to PruneDeployHistory after every
+// insert. Pruning relies on newer rows always having larger ids, so the
+// most recent N survive — enough for a useful rollback menu without
+// unbounded growth on hot workloads.
+const deployHistoryKeepPerWorkload = 50
+
+// recordDeployHistory appends one ledger row for a completed dispatch and
+// then prunes the workload's ledger to deployHistoryKeepPerWorkload rows.
+//
+// Parameters: w and intent describe what was dispatched; outcome is the
+// "success"/"failure" label stored on the row; deployErr is the dispatch
+// error (nil on success); startedAt is the timestamp captured just before
+// the deploy ran.
+//
+// Best-effort: a store failure is logged and swallowed — recording must
+// never turn a successful deploy into a failed request (same contract as
+// EmitDeployEvent and the pre-deploy backup). The raw deploy error is NEVER
+// persisted: it can carry registry-auth bytes or compose stdout, so only a
+// fixed, secret-free marker lands in the row (raw detail goes to slog at the
+// call site). Called only from DispatchPlugin — reconcile/teardown ticks are
+// not deploys and must not appear in the ledger.
+func (d *Deployer) recordDeployHistory(w plugin.Workload, intent plugin.DeploymentIntent, outcome string, deployErr error, startedAt string) {
+	if d.store == nil {
+		return
+	}
+	// Only a successful deploy may take its reference from the newest
+	// container row. After a failure that row can still be the previous
+	// release, so the ledger would show the old tag against the failed
+	// attempt. For failures, record exactly what the trigger asked for.
+	reference := intent.Reference
+	if deployErr == nil {
+		reference = d.effectiveReference(w, intent)
+	}
+	entry := store.DeployHistoryEntry{
+		WorkloadID:  w.ID,
+		SourceKind:  w.SourceKind,
+		Reference:   reference,
+		Reason:      intent.Reason,
+		TriggeredBy: intent.TriggeredBy,
+		Note:        intent.Metadata["note"], // nil map read is safe
+		Outcome:     outcome,
+		StartedAt:   startedAt,
+		FinishedAt:  store.Now(),
+	}
+	if deployErr != nil {
+		entry.Error = "deploy failed (see server logs)"
+	}
+	if _, err := d.store.InsertDeployHistory(entry); err != nil {
+		slog.Warn("deploy history: insert failed", "workload", w.ID, "error", err)
+		return
+	}
+	// Cheap indexed DELETE — negligible next to a multi-second deploy, so it
+	// stays inline rather than on an untracked goroutine that could outrace
+	// graceful shutdown's db.Close().
+	if err := d.store.PruneDeployHistory(w.ID, deployHistoryKeepPerWorkload); err != nil {
+		slog.Warn("deploy history: prune failed", "workload", w.ID, "error", err)
+	}
+}
+
+// effectiveReference picks the artifact handle to record in the ledger (and,
+// for rollback-capable sources, to replay later).
+//
+// The default is the trigger-supplied intent.Reference. For the image
+// source only, a non-empty ImageTag on the first row returned by
+// ListContainersByWorkload takes precedence; that captures the DefaultTag /
+// "latest" resolution the source performs when intent.Reference is empty
+// (e.g. a manual deploy with no override). ListContainersByWorkload is
+// expected to return newest-first, which makes that first row the
+// just-deployed container after a successful deploy. A lookup error or an
+// empty result falls back to intent.Reference.
+//
+// For static/dockerfile the git trigger already supplies the commit SHA as
+// intent.Reference; a manual deploy of those may record an empty reference
+// (acceptable — they are not rollback-capable in this phase). compose has no
+// single artifact handle.
+func (d *Deployer) effectiveReference(w plugin.Workload, intent plugin.DeploymentIntent) string {
+	if w.SourceKind != "image" || d.store == nil {
+		return intent.Reference
+	}
+	containers, err := d.store.ListContainersByWorkload(w.ID)
+	if err != nil || len(containers) == 0 {
+		return intent.Reference
+	}
+	if newestTag := containers[0].ImageTag; newestTag != "" {
+		return newestTag
+	}
+	return intent.Reference
+}
@@ -5,6 +5,7 @@ import (
 	"fmt"

 	"github.com/alexei/tinyforge/internal/metrics"
+	"github.com/alexei/tinyforge/internal/store"
 	"github.com/alexei/tinyforge/internal/workload/plugin"
 )

@@ -33,12 +34,17 @@ func (d *Deployer) DispatchPlugin(ctx context.Context, w plugin.Workload, intent
 	// check (e.g. the image source's same-tag short-circuit), so a same-tag
 	// redeploy still snapshots — "backup before every deploy attempt".
 	d.maybeBackupBeforeDeploy(w.ID)
+	startedAt := store.Now()
 	err = src.Deploy(ctx, d.PluginDeps(), w, intent)
 	outcome := "success"
 	if err != nil {
 		outcome = "failure"
 	}
 	metrics.DeploysTotal.Inc(w.SourceKind, outcome)
+	// Append to the structured deploy ledger (powers the per-app history
+	// panel + rollback). Best-effort and secret-free; see recordDeployHistory.
+	// Only DispatchPlugin records — reconcile/teardown are not deploys.
+	d.recordDeployHistory(w, intent, outcome, err, startedAt)
 	return err
 }

@@ -250,6 +250,84 @@ func TestDispatchReconcile_PropagatesSourceError(t *testing.T) {
 	}
 }

+// ---- Deploy history recording ----------------------------------------------
+
+// seedDispatchWorkload creates a real workloads row, which deploy_history's
+// foreign key (workload_id REFERENCES workloads) requires, and hands back a
+// plugin workload with that id wired to the fake "dispatchertest" source.
+func seedDispatchWorkload(t *testing.T, d *Deployer) plugin.Workload {
+	t.Helper()
+
+	seed := store.Workload{Kind: "project", RefID: "dh", Name: "dh"}
+	created, err := d.store.CreateWorkload(seed)
+	if err != nil {
+		t.Fatalf("CreateWorkload: %v", err)
+	}
+	return plugin.Workload{
+		ID:         created.ID,
+		Name:       "dh",
+		SourceKind: "dispatchertest",
+	}
+}
+
+// TestDispatchPlugin_RecordsSuccessHistory dispatches a successful deploy
+// and expects exactly one ledger row mirroring the intent (reason,
+// reference, actor, note) with outcome "success" and no error marker.
+func TestDispatchPlugin_RecordsSuccessHistory(t *testing.T) {
+	resetFake(t)
+	d := newTestDeployer(t)
+	w := seedDispatchWorkload(t, d)
+
+	intent := plugin.DeploymentIntent{
+		Reason:      "manual",
+		Reference:   "v9",
+		TriggeredBy: "alice",
+		Metadata:    map[string]string{"note": "ship it"},
+	}
+	if err := d.DispatchPlugin(context.Background(), w, intent); err != nil {
+		t.Fatalf("DispatchPlugin: %v", err)
+	}
+
+	history, err := d.store.ListDeployHistory(w.ID, 10, 0)
+	if err != nil {
+		t.Fatalf("ListDeployHistory: %v", err)
+	}
+	if n := len(history); n != 1 {
+		t.Fatalf("expected 1 history row, got %d", n)
+	}
+
+	// Each case aborts the test, so the first mismatch is the one reported.
+	row := history[0]
+	switch {
+	case row.Outcome != "success", row.Reason != "manual", row.Reference != "v9":
+		t.Fatalf("unexpected row: %+v", row)
+	case row.TriggeredBy != "alice", row.Note != "ship it":
+		t.Fatalf("intent fields not recorded: %+v", row)
+	case row.Error != "":
+		t.Fatalf("success row must have empty error, got %q", row.Error)
+	}
+}
+
+// TestDispatchPlugin_RecordsFailureWithoutLeakingError makes the fake source
+// fail with an error that embeds a secret-looking string, then checks the
+// ledger holds one "failure" row whose Error field contains none of it.
+func TestDispatchPlugin_RecordsFailureWithoutLeakingError(t *testing.T) {
+	resetFake(t)
+	d := newTestDeployer(t)
+	w := seedDispatchWorkload(t, d)
+
+	// A deploy error carrying a "secret" must never reach the persisted row.
+	leaky := errors.New("compose up failed (output: SUPER_SECRET=hunter2)")
+	dispatchTestSource.setDeployErr(leaky)
+	// The dispatch error itself is expected and irrelevant here.
+	_ = d.DispatchPlugin(context.Background(), w, plugin.DeploymentIntent{Reason: "manual"})
+
+	history, _ := d.store.ListDeployHistory(w.ID, 10, 0)
+	if n := len(history); n != 1 {
+		t.Fatalf("expected 1 history row, got %d", n)
+	}
+	row := history[0]
+	if row.Outcome != "failure" {
+		t.Fatalf("expected failure outcome, got %q", row.Outcome)
+	}
+	for _, needle := range []string{"hunter2", "SECRET"} {
+		if strings.Contains(row.Error, needle) {
+			t.Fatalf("raw error leaked into history: %q", row.Error)
+		}
+	}
+}
+
+// TestDispatchReconcile_RecordsNoHistory verifies that a reconcile tick does
+// not write to the deploy ledger. The list error is checked explicitly:
+// otherwise a failing query would return no rows and the "no history"
+// assertion would pass without proving anything.
+func TestDispatchReconcile_RecordsNoHistory(t *testing.T) {
+	resetFake(t)
+	d := newTestDeployer(t)
+	w := seedDispatchWorkload(t, d)
+
+	if err := d.DispatchReconcile(context.Background(), w); err != nil {
+		t.Fatalf("DispatchReconcile: %v", err)
+	}
+	rows, err := d.store.ListDeployHistory(w.ID, 10, 0)
+	if err != nil {
+		t.Fatalf("ListDeployHistory: %v", err)
+	}
+	if len(rows) != 0 {
+		t.Fatalf("reconcile must not write history, got %d rows", len(rows))
+	}
+}
+
 // ---- PluginDeps -------------------------------------------------------------

 func TestPluginDeps_PassesStoreAndEncKey(t *testing.T) {
@@ -0,0 +1,123 @@
+package store
+
+import (
+	"database/sql"
+	"errors"
+	"fmt"
+)
+
+// InsertDeployHistory writes one row to the per-workload deploy ledger and
+// returns the entry with its generated ID filled in. Empty StartedAt /
+// FinishedAt values are defaulted to Now() before the insert.
+//
+// The deployer choke point treats this call as best-effort: a failure here
+// must never fail an otherwise-successful deploy. Error is expected to hold
+// a fixed, secret-free marker — never the raw source error.
+func (s *Store) InsertDeployHistory(e DeployHistoryEntry) (DeployHistoryEntry, error) {
+	// Default whichever timestamps the caller left blank.
+	for _, stamp := range []*string{&e.StartedAt, &e.FinishedAt} {
+		if *stamp == "" {
+			*stamp = Now()
+		}
+	}
+
+	result, err := s.db.Exec(
+		`INSERT INTO deploy_history
+		   (workload_id, source_kind, reference, reason, triggered_by,
+		    note, outcome, error, started_at, finished_at)
+		 VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`,
+		e.WorkloadID, e.SourceKind, e.Reference, e.Reason, e.TriggeredBy,
+		e.Note, e.Outcome, e.Error, e.StartedAt, e.FinishedAt,
+	)
+	if err != nil {
+		return DeployHistoryEntry{}, fmt.Errorf("insert deploy history: %w", err)
+	}
+
+	newID, err := result.LastInsertId()
+	if err != nil {
+		return DeployHistoryEntry{}, fmt.Errorf("get deploy history id: %w", err)
+	}
+	e.ID = newID
+	return e, nil
+}
+
+// ListDeployHistory returns one page of a workload's ledger, ordered by id
+// descending (newest-first). The API layer is expected to have clamped
+// limit/offset already; as a backstop, a non-positive limit becomes 50 so a
+// bad query can't return the whole table, and a negative offset becomes 0.
+// The returned slice is non-nil when the query succeeds, even with no rows.
+func (s *Store) ListDeployHistory(workloadID string, limit, offset int) ([]DeployHistoryEntry, error) {
+	if limit <= 0 {
+		limit = 50
+	}
+	if offset < 0 {
+		offset = 0
+	}
+
+	cursor, err := s.db.Query(
+		`SELECT id, workload_id, source_kind, reference, reason, triggered_by,
+		        note, outcome, error, started_at, finished_at
+		 FROM deploy_history
+		 WHERE workload_id = ?
+		 ORDER BY id DESC
+		 LIMIT ? OFFSET ?`,
+		workloadID, limit, offset,
+	)
+	if err != nil {
+		return nil, fmt.Errorf("query deploy history: %w", err)
+	}
+	defer cursor.Close()
+
+	page := make([]DeployHistoryEntry, 0, limit)
+	for cursor.Next() {
+		var entry DeployHistoryEntry
+		scanErr := cursor.Scan(
+			&entry.ID, &entry.WorkloadID, &entry.SourceKind, &entry.Reference, &entry.Reason,
+			&entry.TriggeredBy, &entry.Note, &entry.Outcome, &entry.Error, &entry.StartedAt, &entry.FinishedAt,
+		)
+		if scanErr != nil {
+			return nil, fmt.Errorf("scan deploy history: %w", scanErr)
+		}
+		page = append(page, entry)
+	}
+	// Surface any iteration error alongside whatever was read so far.
+	if iterErr := cursor.Err(); iterErr != nil {
+		return page, iterErr
+	}
+	return page, nil
+}
+
+// GetDeployHistory loads a single ledger row by id. A missing row is
+// reported as an error wrapping ErrNotFound; any other scan failure is
+// wrapped and returned. The rollback handler calls this to resolve the
+// pinned reference it will replay.
+func (s *Store) GetDeployHistory(id int64) (DeployHistoryEntry, error) {
+	var entry DeployHistoryEntry
+	err := s.db.QueryRow(
+		`SELECT id, workload_id, source_kind, reference, reason, triggered_by,
+		        note, outcome, error, started_at, finished_at
+		 FROM deploy_history WHERE id = ?`, id,
+	).Scan(
+		&entry.ID, &entry.WorkloadID, &entry.SourceKind, &entry.Reference, &entry.Reason,
+		&entry.TriggeredBy, &entry.Note, &entry.Outcome, &entry.Error, &entry.StartedAt, &entry.FinishedAt,
+	)
+	switch {
+	case errors.Is(err, sql.ErrNoRows):
+		return DeployHistoryEntry{}, fmt.Errorf("deploy history %d: %w", id, ErrNotFound)
+	case err != nil:
+		return DeployHistoryEntry{}, fmt.Errorf("scan deploy history: %w", err)
+	default:
+		return entry, nil
+	}
+}
+
+// PruneDeployHistory trims a workload's ledger to its `keep` highest-id
+// rows, deleting every other row for that workload. Because ids grow with
+// insertion order, the survivors are the most recent deploys; this caps
+// growth on hot workloads. A non-positive keep falls back to 50 rather than
+// being read as "delete everything".
+func (s *Store) PruneDeployHistory(workloadID string, keep int) error {
+	if keep <= 0 {
+		keep = 50
+	}
+	if _, err := s.db.Exec(
+		`DELETE FROM deploy_history
+		 WHERE workload_id = ?
+		   AND id NOT IN (
+		       SELECT id FROM deploy_history
+		       WHERE workload_id = ?
+		       ORDER BY id DESC
+		       LIMIT ?
+		   )`,
+		workloadID, workloadID, keep,
+	); err != nil {
+		return fmt.Errorf("prune deploy history: %w", err)
+	}
+	return nil
+}
@@ -0,0 +1,133 @@
+package store
+
+import (
+	"errors"
+	"testing"
+)
+
+// seedWorkload creates a "project" workload whose RefID and Name are both
+// `name`, failing the test if the store rejects it, and returns the stored
+// row.
+func seedWorkload(t *testing.T, s *Store, name string) Workload {
+	t.Helper()
+	created, err := s.CreateWorkload(Workload{
+		Kind:  "project",
+		RefID: name,
+		Name:  name,
+	})
+	if err != nil {
+		t.Fatalf("CreateWorkload(%s): %v", name, err)
+	}
+	return created
+}
+
+// TestDeployHistory_InsertListGet covers the basic round trip: inserting
+// assigns an id and default timestamps, listing returns rows newest-first,
+// and fetching by id returns the stored fields.
+func TestDeployHistory_InsertListGet(t *testing.T) {
+	s := newTestStore(t)
+	w := seedWorkload(t, s, "app1")
+
+	older, err := s.InsertDeployHistory(DeployHistoryEntry{
+		WorkloadID:  w.ID,
+		SourceKind:  "image",
+		Reference:   "v1",
+		Reason:      "manual",
+		TriggeredBy: "admin",
+		Outcome:     "success",
+	})
+	switch {
+	case err != nil:
+		t.Fatalf("InsertDeployHistory: %v", err)
+	case older.ID == 0:
+		t.Fatal("expected non-zero id")
+	case older.StartedAt == "" || older.FinishedAt == "":
+		t.Fatal("expected timestamps to be defaulted")
+	}
+
+	newer, _ := s.InsertDeployHistory(DeployHistoryEntry{
+		WorkloadID: w.ID,
+		SourceKind: "image",
+		Reference:  "v2",
+		Reason:     "registry-push",
+		Outcome:    "success",
+	})
+
+	listed, err := s.ListDeployHistory(w.ID, 10, 0)
+	if err != nil {
+		t.Fatalf("ListDeployHistory: %v", err)
+	}
+	if n := len(listed); n != 2 {
+		t.Fatalf("expected 2 rows, got %d", n)
+	}
+	// Newest-first ordering.
+	newestFirst := listed[0].ID == newer.ID && listed[1].ID == older.ID
+	if !newestFirst {
+		t.Fatalf("expected newest-first ordering, got %d then %d", listed[0].ID, listed[1].ID)
+	}
+
+	fetched, err := s.GetDeployHistory(older.ID)
+	if err != nil {
+		t.Fatalf("GetDeployHistory: %v", err)
+	}
+	if fetched.Reference != "v1" || fetched.SourceKind != "image" {
+		t.Fatalf("unexpected row: %+v", fetched)
+	}
+}
+
+// TestDeployHistory_GetNotFound expects a lookup of a non-existent id to
+// return an error that matches ErrNotFound.
+func TestDeployHistory_GetNotFound(t *testing.T) {
+	s := newTestStore(t)
+	if _, err := s.GetDeployHistory(999); !errors.Is(err, ErrNotFound) {
+		t.Fatalf("expected ErrNotFound, got %v", err)
+	}
+}
+
+// TestDeployHistory_ListScopedToWorkload inserts one row for each of two
+// workloads and expects listing the first to return only its own row.
+func TestDeployHistory_ListScopedToWorkload(t *testing.T) {
+	s := newTestStore(t)
+	a := seedWorkload(t, s, "a")
+	b := seedWorkload(t, s, "b")
+	for _, owner := range []Workload{a, b} {
+		s.InsertDeployHistory(DeployHistoryEntry{WorkloadID: owner.ID, Outcome: "success"})
+	}
+
+	rowsForA, _ := s.ListDeployHistory(a.ID, 10, 0)
+	onlyA := len(rowsForA) == 1 && rowsForA[0].WorkloadID == a.ID
+	if !onlyA {
+		t.Fatalf("expected only workload a's rows, got %+v", rowsForA)
+	}
+}
+
+// TestDeployHistory_Pagination inserts five rows and reads two pages of two,
+// expecting full pages that start on different rows.
+func TestDeployHistory_Pagination(t *testing.T) {
+	s := newTestStore(t)
+	w := seedWorkload(t, s, "paged")
+	for n := 0; n < 5; n++ {
+		s.InsertDeployHistory(DeployHistoryEntry{WorkloadID: w.ID, Outcome: "success"})
+	}
+
+	firstPage, _ := s.ListDeployHistory(w.ID, 2, 0)
+	secondPage, _ := s.ListDeployHistory(w.ID, 2, 2)
+	if len(firstPage) != 2 || len(secondPage) != 2 {
+		t.Fatalf("expected 2 per page, got %d and %d", len(firstPage), len(secondPage))
+	}
+	if firstPage[0].ID == secondPage[0].ID {
+		t.Fatal("expected distinct rows across pages")
+	}
+}
+
+// TestDeployHistory_Prune inserts ten rows, prunes to three, and verifies
+// the survivors are exactly the three most recently inserted rows, returned
+// newest-first. Checking ids (not just count and ordering) is what proves
+// prune keeps the newest rows rather than an arbitrary three.
+func TestDeployHistory_Prune(t *testing.T) {
+	s := newTestStore(t)
+	w := seedWorkload(t, s, "noisy")
+
+	const total, keep = 10, 3
+	ids := make([]int64, 0, total) // insertion order: oldest → newest
+	for i := 0; i < total; i++ {
+		row, err := s.InsertDeployHistory(DeployHistoryEntry{WorkloadID: w.ID, Outcome: "success"})
+		if err != nil {
+			t.Fatalf("InsertDeployHistory #%d: %v", i, err)
+		}
+		ids = append(ids, row.ID)
+	}
+
+	if err := s.PruneDeployHistory(w.ID, keep); err != nil {
+		t.Fatalf("PruneDeployHistory: %v", err)
+	}
+	list, err := s.ListDeployHistory(w.ID, 100, 0)
+	if err != nil {
+		t.Fatalf("ListDeployHistory: %v", err)
+	}
+	if len(list) != keep {
+		t.Fatalf("expected %d rows after prune, got %d", keep, len(list))
+	}
+	// Prune keeps the newest rows: list[i] must be the i-th most recent insert.
+	for i, row := range list {
+		want := ids[len(ids)-1-i]
+		if row.ID != want {
+			t.Fatalf("survivor %d: id = %d, want %d (newest rows, newest-first)", i, row.ID, want)
+		}
+	}
+}
+
+// TestDeployHistory_CascadeOnWorkloadDelete verifies that deleting a
+// workload removes its ledger rows. The seed inserts and the pre-delete row
+// count are asserted first; without that, a silently failed insert would
+// leave nothing to cascade and the test would pass without exercising the
+// cascade.
+func TestDeployHistory_CascadeOnWorkloadDelete(t *testing.T) {
+	s := newTestStore(t)
+	w := seedWorkload(t, s, "doomed")
+	for _, outcome := range []string{"success", "failure"} {
+		if _, err := s.InsertDeployHistory(DeployHistoryEntry{WorkloadID: w.ID, Outcome: outcome}); err != nil {
+			t.Fatalf("InsertDeployHistory(%s): %v", outcome, err)
+		}
+	}
+	before, err := s.ListDeployHistory(w.ID, 100, 0)
+	if err != nil {
+		t.Fatalf("ListDeployHistory before delete: %v", err)
+	}
+	if len(before) != 2 {
+		t.Fatalf("expected 2 seeded rows before delete, got %d", len(before))
+	}
+
+	if err := s.DeleteWorkload(w.ID); err != nil {
+		t.Fatalf("DeleteWorkload: %v", err)
+	}
+	list, err := s.ListDeployHistory(w.ID, 100, 0)
+	if err != nil {
+		t.Fatalf("ListDeployHistory after delete: %v", err)
+	}
+	if len(list) != 0 {
+		t.Fatalf("expected history removed with workload, got %d rows", len(list))
+	}
+}
@@ -507,3 +507,28 @@ type App struct {
 	CreatedAt   string `json:"created_at"`
 	UpdatedAt   string `json:"updated_at"`
 }
+
+// DeployHistoryEntry is one row of the deploy_history table: the structured,
+// version-pinned ledger of deploy dispatches (success AND failure) that the
+// rollback action replays from, as opposed to the free-text event_log
+// timeline. Reference is the effective deployed artifact handle (image tag
+// for image sources, commit sha for git-built sources, "" when none applies).
+// Error is NEVER the raw source error — that can carry registry-auth bytes or
+// compose stdout; it holds only a fixed, secret-free marker (detail → slog).
+type DeployHistoryEntry struct {
+	ID          int64  `json:"id"`
+	WorkloadID  string `json:"workload_id"`
+	SourceKind  string `json:"source_kind"`
+	Reference   string `json:"reference"` // effective tag | commit sha | ""
+	Reason      string `json:"reason"`    // manual|registry-push|git-push|cron|rollback|promote
+	TriggeredBy string `json:"triggered_by"`
+	Note        string `json:"note"`
+	Outcome     string `json:"outcome"` // success | failure
+	Error       string `json:"error"`   // generic, secret-free marker on failure
+	StartedAt   string `json:"started_at"`
+	FinishedAt  string `json:"finished_at"`
+	// Rollbackable has no deploy_history column; the API layer computes it:
+	// true only for a row that succeeded, has a non-empty Reference, and
+	// whose source kind supports reference-pinned redeploy.
+	Rollbackable bool `json:"rollbackable"`
+}
@@ -0,0 +1,56 @@
+package store
+
+import "testing"
+
+// TestListContainerStatsSamplesByWorkload_ScopedToWorkload verifies that the
+// workload-scoped stats query returns only samples owned by the requested
+// workload's containers, in ascending ts order, and honours the since cutoff.
+func TestListContainerStatsSamplesByWorkload_ScopedToWorkload(t *testing.T) {
+	s := newTestStore(t)
+	wa := seedWorkload(t, s, "wa")
+	wb := seedWorkload(t, s, "wb")
+
+	ca, err := s.CreateContainer(Container{WorkloadID: wa.ID, WorkloadKind: "image", ContainerID: "da", Host: "local", State: "running"})
+	if err != nil {
+		t.Fatalf("CreateContainer a: %v", err)
+	}
+	cb, err := s.CreateContainer(Container{WorkloadID: wb.ID, WorkloadKind: "image", ContainerID: "db", Host: "local", State: "running"})
+	if err != nil {
+		t.Fatalf("CreateContainer b: %v", err)
+	}
+
+	// owner_id is the container ROW id.
+	mustInsertSample(t, s, ca.ID, 100, 12.5, 2048)
+	mustInsertSample(t, s, ca.ID, 200, 15.0, 3072)
+	mustInsertSample(t, s, cb.ID, 150, 99.0, 9999)
+
+	got, err := s.ListContainerStatsSamplesByWorkload(wa.ID, 0)
+	if err != nil {
+		t.Fatalf("ListContainerStatsSamplesByWorkload: %v", err)
+	}
+	if len(got) != 2 {
+		t.Fatalf("expected 2 samples for workload a, got %d", len(got))
+	}
+	// ts ascending.
+	if got[0].TS != 100 || got[1].TS != 200 {
+		t.Fatalf("expected ts-ascending 100,200, got %d,%d", got[0].TS, got[1].TS)
+	}
+	for _, sm := range got {
+		if sm.OwnerID != ca.ID {
+			t.Fatalf("leaked a sample from another workload: %+v", sm)
+		}
+	}
+
+	// Since-cutoff filters older samples. Check the error first so a query
+	// failure is reported as such instead of as an empty-result mismatch.
+	recent, err := s.ListContainerStatsSamplesByWorkload(wa.ID, 150)
+	if err != nil {
+		t.Fatalf("ListContainerStatsSamplesByWorkload with cutoff: %v", err)
+	}
+	if len(recent) != 1 || recent[0].TS != 200 {
+		t.Fatalf("expected only ts=200 after cutoff, got %+v", recent)
+	}
+}
+
+// mustInsertSample stores a single container stats sample for ownerID and
+// aborts the calling test if the insert fails. The container id is derived
+// as "c-"+ownerID, the owner type is fixed to "instance", and the memory
+// limit is set to twice the given usage.
+func mustInsertSample(t *testing.T, s *Store, ownerID string, ts int64, cpu float64, mem int64) {
+	t.Helper()
+	sample := ContainerStatsSample{
+		ContainerID: "c-" + ownerID,
+		OwnerType:   "instance",
+		OwnerID:     ownerID,
+		TS:          ts,
+		CPUPercent:  cpu,
+		MemoryUsage: mem,
+		MemoryLimit: mem * 2,
+	}
+	err := s.InsertContainerStatsSample(sample)
+	if err != nil {
+		t.Fatalf("InsertContainerStatsSample: %v", err)
+	}
+}
@@ -74,6 +74,43 @@ func (s *Store) ListContainerStatsSamples(ownerType, ownerID string, sinceTS int
 	return out, rows.Err()
 }

+// ListContainerStatsSamplesByWorkload fetches the stats samples of every
+// container belonging to workloadID whose ts is at or after sinceTS, sorted
+// by ts ascending. A sample is tied to its workload via the containers index
+// (the sample's owner_id is matched against the container row id), hence the
+// join. The query does not filter on owner_type. The result is a nil slice
+// when nothing matches. Backs the per-workload metrics graph on /apps/[id].
+func (s *Store) ListContainerStatsSamplesByWorkload(workloadID string, sinceTS int64) ([]ContainerStatsSample, error) {
+	rows, err := s.db.Query(
+		`SELECT cs.container_id, cs.owner_type, cs.owner_id, cs.ts,
+		        cs.cpu_percent, cs.memory_usage, cs.memory_limit,
+		        cs.network_rx, cs.network_tx, cs.block_read, cs.block_write
+		 FROM container_stats_samples cs
+		 JOIN containers c ON c.id = cs.owner_id
+		 WHERE c.workload_id = ? AND cs.ts >= ?
+		 ORDER BY cs.ts ASC`,
+		workloadID, sinceTS,
+	)
+	if err != nil {
+		return nil, fmt.Errorf("list container stats samples by workload: %w", err)
+	}
+	defer rows.Close()
+
+	var samples []ContainerStatsSample
+	for rows.Next() {
+		// Named "sample" rather than "s" so the receiver is not shadowed.
+		var sample ContainerStatsSample
+		scanErr := rows.Scan(
+			&sample.ContainerID, &sample.OwnerType, &sample.OwnerID, &sample.TS,
+			&sample.CPUPercent, &sample.MemoryUsage, &sample.MemoryLimit,
+			&sample.NetworkRxBytes, &sample.NetworkTxBytes,
+			&sample.BlockReadBytes, &sample.BlockWriteBytes,
+		)
+		if scanErr != nil {
+			return nil, fmt.Errorf("scan container stats sample: %w", scanErr)
+		}
+		samples = append(samples, sample)
+	}
+	// Report any iteration error alongside whatever rows were read so far.
+	return samples, rows.Err()
+}
+
 // ListAllRecentContainerStatsSamples returns samples across every owner since
 // the given unix timestamp, ordered by ts ascending. Used by the system
 // dashboard "top containers" widget where the UI wants a mixed pool.
@@ -459,6 +459,28 @@ func (s *Store) runMigrations() error {
 		)`,
 		`CREATE UNIQUE INDEX IF NOT EXISTS idx_shared_secrets_scope_name ON shared_secrets(scope, app_id, name)`,
 		`CREATE INDEX IF NOT EXISTS idx_shared_secrets_app ON shared_secrets(app_id)`,
+		// deploy_history: structured, version-pinned ledger of every deploy
+		// dispatch (success AND failure) per workload. Distinct from the
+		// free-text event_log — this carries the replayable `reference` the
+		// rollback action redeploys from. `error` holds only a generic,
+		// secret-free marker (the raw source error can echo registry-auth /
+		// compose stdout, so it goes to slog only). FK cascade is backed by
+		// PRAGMA foreign_keys=ON, but DeleteWorkload also deletes these rows
+		// explicitly (matching the containers cleanup convention).
+		`CREATE TABLE IF NOT EXISTS deploy_history (
+			id            INTEGER PRIMARY KEY AUTOINCREMENT,
+			workload_id   TEXT NOT NULL REFERENCES workloads(id) ON DELETE CASCADE,
+			source_kind   TEXT NOT NULL DEFAULT '',
+			reference     TEXT NOT NULL DEFAULT '',
+			reason        TEXT NOT NULL DEFAULT '',
+			triggered_by  TEXT NOT NULL DEFAULT '',
+			note          TEXT NOT NULL DEFAULT '',
+			outcome       TEXT NOT NULL DEFAULT '',
+			error         TEXT NOT NULL DEFAULT '',
+			started_at    TEXT NOT NULL DEFAULT '',
+			finished_at   TEXT NOT NULL DEFAULT ''
+		)`,
+		`CREATE INDEX IF NOT EXISTS idx_deploy_history_workload ON deploy_history(workload_id, id DESC)`,
 	}
 	for _, t := range observabilityTables {
 		if _, err := s.db.Exec(t); err != nil {
@@ -190,6 +190,12 @@ func (s *Store) DeleteWorkload(id string) error {
 	if _, err := tx.Exec(`DELETE FROM containers WHERE workload_id = ?`, id); err != nil {
 		return fmt.Errorf("delete containers: %w", err)
 	}
+	// Deploy ledger rows are FK-cascaded, but we delete them explicitly in
+	// the same transaction — consistent with the containers cleanup above
+	// and robust even if the cascade is ever disabled.
+	if _, err := tx.Exec(`DELETE FROM deploy_history WHERE workload_id = ?`, id); err != nil {
+		return fmt.Errorf("delete deploy history: %w", err)
+	}
 	result, err := tx.Exec(`DELETE FROM workloads WHERE id = ?`, id)
 	if err != nil {
 		return fmt.Errorf("delete workload: %w", err)