feat(apps): per-workload deploy history, rollback, and resource metrics

Two additions to the app detail page, each backed by a per-workload endpoint. Deploy history + rollback: - New deploy_history table — a structured, version-pinned ledger of every dispatch (success AND failure), distinct from the free-text event_log. Recorded at the single DispatchPlugin choke point so every source kind is covered. The raw deploy error is never persisted (it can carry registry-auth / compose-stdout secrets) — only a generic marker, with detail going to slog. Pruned to the newest N per workload; cascade-deleted with the workload. - GET /api/workloads/{id}/deploys lists the ledger; POST .../rollback (admin) replays a prior successful deploy's pinned reference as a rollback-reason dispatch. Phase 1 is image-source only (RollbackCapable); git-built sources need checkout-by-commit, a later phase. - DeployHistoryPanel.svelte renders the ledger with confirm-gated rollback. Per-workload metrics: - ListContainerStatsSamplesByWorkload joins the existing container stats samples through the containers index; GET /api/workloads/{id}/stats/history aggregates CPU/memory per timestamp across the workload's containers. - WorkloadMetricsPanel.svelte reuses ResourceChart (CPU% + memory MiB, windowed, 15s poll). en/ru i18n added with parity. Tests: store CRUD + cascade + workload-scoped join, deployer recording (incl. secret-non-leak on failure), API rollback guards, and per-timestamp aggregation. Plans under docs/plans/.
2026-06-19 16:22:12 +03:00
parent c8e71a0c34
commit 0c4c338bfe
23 changed files with 1828 additions and 0 deletions
@@ -0,0 +1,123 @@
+package store
+
+import (
+	"database/sql"
+	"errors"
+	"fmt"
+)
+
+// InsertDeployHistory appends one row to the per-workload deploy ledger.
+// Callers (the deployer choke point) treat this as best-effort: a failure
+// here must never fail an otherwise-successful deploy. Error is expected to
+// be a fixed, secret-free marker — never the raw source error.
+func (s *Store) InsertDeployHistory(e DeployHistoryEntry) (DeployHistoryEntry, error) {
+	if e.StartedAt == "" {
+		e.StartedAt = Now()
+	}
+	if e.FinishedAt == "" {
+		e.FinishedAt = Now()
+	}
+	res, err := s.db.Exec(
+		`INSERT INTO deploy_history
+		   (workload_id, source_kind, reference, reason, triggered_by,
+		    note, outcome, error, started_at, finished_at)
+		 VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`,
+		e.WorkloadID, e.SourceKind, e.Reference, e.Reason, e.TriggeredBy,
+		e.Note, e.Outcome, e.Error, e.StartedAt, e.FinishedAt,
+	)
+	if err != nil {
+		return DeployHistoryEntry{}, fmt.Errorf("insert deploy history: %w", err)
+	}
+	id, err := res.LastInsertId()
+	if err != nil {
+		return DeployHistoryEntry{}, fmt.Errorf("get deploy history id: %w", err)
+	}
+	e.ID = id
+	return e, nil
+}
+
+// ListDeployHistory returns one page of a workload's deploy ledger, newest
+// first (descending id).
+//
+// limit and offset are expected to arrive pre-clamped from the API layer but
+// are still sanitised here: a non-positive limit falls back to 50 so a bad
+// query can't return the whole table, and a negative offset is treated as 0.
+//
+// On success the returned slice is always non-nil (empty when the workload
+// has no rows in the requested page). On any error the slice is nil.
+func (s *Store) ListDeployHistory(workloadID string, limit, offset int) ([]DeployHistoryEntry, error) {
+	const (
+		defaultLimit = 50
+		// maxPrealloc caps the up-front slice capacity. The capacity is
+		// only a hint, so an unexpectedly large limit must not translate
+		// into a large allocation before a single row has been read.
+		maxPrealloc = 64
+	)
+	if limit <= 0 {
+		limit = defaultLimit
+	}
+	if offset < 0 {
+		offset = 0
+	}
+	rows, err := s.db.Query(
+		`SELECT id, workload_id, source_kind, reference, reason, triggered_by,
+		        note, outcome, error, started_at, finished_at
+		 FROM deploy_history
+		 WHERE workload_id = ?
+		 ORDER BY id DESC
+		 LIMIT ? OFFSET ?`,
+		workloadID, limit, offset,
+	)
+	if err != nil {
+		return nil, fmt.Errorf("query deploy history: %w", err)
+	}
+	defer rows.Close()
+
+	capHint := limit
+	if capHint > maxPrealloc {
+		capHint = maxPrealloc
+	}
+	out := make([]DeployHistoryEntry, 0, capHint)
+	for rows.Next() {
+		var e DeployHistoryEntry
+		if err := rows.Scan(
+			&e.ID, &e.WorkloadID, &e.SourceKind, &e.Reference, &e.Reason,
+			&e.TriggeredBy, &e.Note, &e.Outcome, &e.Error, &e.StartedAt, &e.FinishedAt,
+		); err != nil {
+			return nil, fmt.Errorf("scan deploy history: %w", err)
+		}
+		out = append(out, e)
+	}
+	// An iteration error means the page may be truncated; report it without
+	// handing back a partial result.
+	if err := rows.Err(); err != nil {
+		return nil, fmt.Errorf("iterate deploy history: %w", err)
+	}
+	return out, nil
+}
+
+// GetDeployHistory looks up a single ledger row by its id. When no row has
+// that id the returned error wraps ErrNotFound. The rollback handler relies
+// on this to resolve the pinned reference it should replay.
+func (s *Store) GetDeployHistory(id int64) (DeployHistoryEntry, error) {
+	var entry DeployHistoryEntry
+	scanErr := s.db.QueryRow(
+		`SELECT id, workload_id, source_kind, reference, reason, triggered_by,
+		        note, outcome, error, started_at, finished_at
+		 FROM deploy_history WHERE id = ?`, id,
+	).Scan(
+		&entry.ID, &entry.WorkloadID, &entry.SourceKind, &entry.Reference, &entry.Reason,
+		&entry.TriggeredBy, &entry.Note, &entry.Outcome, &entry.Error, &entry.StartedAt, &entry.FinishedAt,
+	)
+	switch {
+	case errors.Is(scanErr, sql.ErrNoRows):
+		// Translate the driver's "no rows" into the store's own sentinel.
+		return DeployHistoryEntry{}, fmt.Errorf("deploy history %d: %w", id, ErrNotFound)
+	case scanErr != nil:
+		return DeployHistoryEntry{}, fmt.Errorf("scan deploy history: %w", scanErr)
+	}
+	return entry, nil
+}
+
+// PruneDeployHistory keeps only the newest `keep` rows for a workload,
+// deleting older ones. Bounds unbounded growth on hot workloads. Best-
+// effort and id-monotonic (newer rows always have larger ids), so it
+// deletes everything below the keep-th id. A non-positive keep is treated
+// as "keep a sane default" rather than "delete everything".
+func (s *Store) PruneDeployHistory(workloadID string, keep int) error {
+	if keep <= 0 {
+		keep = 50
+	}
+	_, err := s.db.Exec(
+		`DELETE FROM deploy_history
+		 WHERE workload_id = ?
+		   AND id NOT IN (
+		       SELECT id FROM deploy_history
+		       WHERE workload_id = ?
+		       ORDER BY id DESC
+		       LIMIT ?
+		   )`,
+		workloadID, workloadID, keep,
+	)
+	if err != nil {
+		return fmt.Errorf("prune deploy history: %w", err)
+	}
+	return nil
+}
@@ -0,0 +1,133 @@
+package store
+
+import (
+	"errors"
+	"testing"
+)
+
+// seedWorkload creates a "project" workload whose RefID and Name are both
+// name, and aborts the calling test if the store rejects it.
+func seedWorkload(t *testing.T, s *Store, name string) Workload {
+	t.Helper()
+	spec := Workload{Kind: "project", RefID: name, Name: name}
+	created, err := s.CreateWorkload(spec)
+	if err != nil {
+		t.Fatalf("CreateWorkload(%s): %v", name, err)
+	}
+	return created
+}
+
+// TestDeployHistory_InsertListGet covers the ledger's happy path: an insert
+// assigns an id and defaults both timestamps, listing returns rows newest
+// first, and a row can be fetched back by id.
+func TestDeployHistory_InsertListGet(t *testing.T) {
+	s := newTestStore(t)
+	w := seedWorkload(t, s, "app1")
+
+	first, err := s.InsertDeployHistory(DeployHistoryEntry{
+		WorkloadID: w.ID, SourceKind: "image", Reference: "v1",
+		Reason: "manual", TriggeredBy: "admin", Outcome: "success",
+	})
+	if err != nil {
+		t.Fatalf("InsertDeployHistory: %v", err)
+	}
+	if first.ID == 0 {
+		t.Fatal("expected non-zero id")
+	}
+	if first.StartedAt == "" || first.FinishedAt == "" {
+		t.Fatal("expected timestamps to be defaulted")
+	}
+
+	// Check this insert as well: a silent failure here would only surface
+	// later as a confusing row-count or ordering mismatch.
+	second, err := s.InsertDeployHistory(DeployHistoryEntry{
+		WorkloadID: w.ID, SourceKind: "image", Reference: "v2",
+		Reason: "registry-push", Outcome: "success",
+	})
+	if err != nil {
+		t.Fatalf("InsertDeployHistory (second): %v", err)
+	}
+
+	list, err := s.ListDeployHistory(w.ID, 10, 0)
+	if err != nil {
+		t.Fatalf("ListDeployHistory: %v", err)
+	}
+	if len(list) != 2 {
+		t.Fatalf("expected 2 rows, got %d", len(list))
+	}
+	// Newest-first ordering.
+	if list[0].ID != second.ID || list[1].ID != first.ID {
+		t.Fatalf("expected newest-first ordering, got %d then %d", list[0].ID, list[1].ID)
+	}
+
+	got, err := s.GetDeployHistory(first.ID)
+	if err != nil {
+		t.Fatalf("GetDeployHistory: %v", err)
+	}
+	if got.Reference != "v1" || got.SourceKind != "image" {
+		t.Fatalf("unexpected row: %+v", got)
+	}
+}
+
+// TestDeployHistory_GetNotFound checks that looking up an id that was never
+// inserted produces an error matching ErrNotFound.
+func TestDeployHistory_GetNotFound(t *testing.T) {
+	s := newTestStore(t)
+	if _, err := s.GetDeployHistory(999); !errors.Is(err, ErrNotFound) {
+		t.Fatalf("expected ErrNotFound, got %v", err)
+	}
+}
+
+// TestDeployHistory_ListScopedToWorkload inserts one row for each of two
+// workloads and checks that listing workload a returns only a's row.
+func TestDeployHistory_ListScopedToWorkload(t *testing.T) {
+	s := newTestStore(t)
+	a := seedWorkload(t, s, "a")
+	b := seedWorkload(t, s, "b")
+	for _, w := range []Workload{a, b} {
+		if _, err := s.InsertDeployHistory(DeployHistoryEntry{WorkloadID: w.ID, Outcome: "success"}); err != nil {
+			t.Fatalf("InsertDeployHistory(%s): %v", w.Name, err)
+		}
+	}
+
+	list, err := s.ListDeployHistory(a.ID, 10, 0)
+	if err != nil {
+		t.Fatalf("ListDeployHistory: %v", err)
+	}
+	if len(list) != 1 || list[0].WorkloadID != a.ID {
+		t.Fatalf("expected only workload a's rows, got %+v", list)
+	}
+}
+
+// TestDeployHistory_Pagination inserts five rows and reads the first two
+// pages of size two. Both pages must be full, and because every page is a
+// window onto the same newest-first ordering, ids must keep strictly
+// descending across the page boundary (which also rules out overlap).
+func TestDeployHistory_Pagination(t *testing.T) {
+	s := newTestStore(t)
+	w := seedWorkload(t, s, "paged")
+	for i := 0; i < 5; i++ {
+		if _, err := s.InsertDeployHistory(DeployHistoryEntry{WorkloadID: w.ID, Outcome: "success"}); err != nil {
+			t.Fatalf("InsertDeployHistory #%d: %v", i, err)
+		}
+	}
+	page1, err := s.ListDeployHistory(w.ID, 2, 0)
+	if err != nil {
+		t.Fatalf("ListDeployHistory page 1: %v", err)
+	}
+	page2, err := s.ListDeployHistory(w.ID, 2, 2)
+	if err != nil {
+		t.Fatalf("ListDeployHistory page 2: %v", err)
+	}
+	if len(page1) != 2 || len(page2) != 2 {
+		t.Fatalf("expected 2 per page, got %d and %d", len(page1), len(page2))
+	}
+	ids := []int64{page1[0].ID, page1[1].ID, page2[0].ID, page2[1].ID}
+	for i := 1; i < len(ids); i++ {
+		if ids[i-1] <= ids[i] {
+			t.Fatalf("expected distinct, strictly descending ids across pages, got %v", ids)
+		}
+	}
+}
+
+func TestDeployHistory_Prune(t *testing.T) {
+	s := newTestStore(t)
+	w := seedWorkload(t, s, "noisy")
+	for i := 0; i < 10; i++ {
+		s.InsertDeployHistory(DeployHistoryEntry{WorkloadID: w.ID, Outcome: "success"})
+	}
+	if err := s.PruneDeployHistory(w.ID, 3); err != nil {
+		t.Fatalf("PruneDeployHistory: %v", err)
+	}
+	list, _ := s.ListDeployHistory(w.ID, 100, 0)
+	if len(list) != 3 {
+		t.Fatalf("expected 3 rows after prune, got %d", len(list))
+	}
+	// Prune keeps the newest rows.
+	all, _ := s.ListDeployHistory(w.ID, 100, 0)
+	for i := 1; i < len(all); i++ {
+		if all[i-1].ID < all[i].ID {
+			t.Fatal("expected newest-first after prune")
+		}
+	}
+}
+
+// TestDeployHistory_CascadeOnWorkloadDelete checks that deleting a workload
+// leaves none of its ledger rows behind.
+func TestDeployHistory_CascadeOnWorkloadDelete(t *testing.T) {
+	s := newTestStore(t)
+	w := seedWorkload(t, s, "doomed")
+	for _, outcome := range []string{"success", "failure"} {
+		if _, err := s.InsertDeployHistory(DeployHistoryEntry{WorkloadID: w.ID, Outcome: outcome}); err != nil {
+			t.Fatalf("InsertDeployHistory(%s): %v", outcome, err)
+		}
+	}
+
+	if err := s.DeleteWorkload(w.ID); err != nil {
+		t.Fatalf("DeleteWorkload: %v", err)
+	}
+	// The error must be checked: a failed query returns a nil slice, which
+	// would otherwise look exactly like "all rows were removed".
+	list, err := s.ListDeployHistory(w.ID, 100, 0)
+	if err != nil {
+		t.Fatalf("ListDeployHistory: %v", err)
+	}
+	if len(list) != 0 {
+		t.Fatalf("expected history removed with workload, got %d rows", len(list))
+	}
+}
@@ -507,3 +507,28 @@ type App struct {
 	CreatedAt   string `json:"created_at"`
 	UpdatedAt   string `json:"updated_at"`
 }
+
+// DeployHistoryEntry is one row in the per-workload deploy ledger. Unlike
+// event_log (free-text human timeline), this is the structured, version-
+// pinned record the rollback action replays from. Reference is the
+// effective deployed artifact handle (image tag for image sources, commit
+// sha for git-built sources, "" when none applies). Error is NEVER the raw
+// source error — that can carry registry-auth bytes or compose stdout; it
+// holds only a fixed, secret-free marker. Raw detail goes to slog.
+type DeployHistoryEntry struct {
+	ID          int64  `json:"id"`           // autoincrement row id; newer rows have larger ids
+	WorkloadID  string `json:"workload_id"`  // owning workload (workloads.id)
+	SourceKind  string `json:"source_kind"`  // kind of source that was deployed, e.g. "image"
+	Reference   string `json:"reference"`    // effective tag | commit sha | ""
+	Reason      string `json:"reason"`       // manual|registry-push|git-push|cron|rollback|promote
+	TriggeredBy string `json:"triggered_by"` // initiator of the dispatch; may be empty (presumably a user name — confirm)
+	Note        string `json:"note"`         // NOTE(review): presumably an optional free-text annotation — confirm
+	Outcome     string `json:"outcome"`      // success | failure
+	Error       string `json:"error"`        // generic, secret-free marker on failure
+	StartedAt   string `json:"started_at"`   // set to Now() by InsertDeployHistory when left empty
+	FinishedAt  string `json:"finished_at"`  // set to Now() by InsertDeployHistory when left empty
+	// Rollbackable is computed at the API layer (not persisted): a row is
+	// rollbackable when it succeeded, has a non-empty Reference, and its
+	// source kind supports reference-pinned redeploy.
+	Rollbackable bool `json:"rollbackable"`
+}
@@ -0,0 +1,56 @@
+package store
+
+import "testing"
+
+// TestListContainerStatsSamplesByWorkload_ScopedToWorkload creates one
+// container for each of two workloads, records samples against both, and
+// checks that querying workload a returns only a's samples in ascending ts
+// order, and that the since-cutoff drops older samples.
+func TestListContainerStatsSamplesByWorkload_ScopedToWorkload(t *testing.T) {
+	s := newTestStore(t)
+	wa := seedWorkload(t, s, "wa")
+	wb := seedWorkload(t, s, "wb")
+
+	ca, err := s.CreateContainer(Container{WorkloadID: wa.ID, WorkloadKind: "image", ContainerID: "da", Host: "local", State: "running"})
+	if err != nil {
+		t.Fatalf("CreateContainer a: %v", err)
+	}
+	cb, err := s.CreateContainer(Container{WorkloadID: wb.ID, WorkloadKind: "image", ContainerID: "db", Host: "local", State: "running"})
+	if err != nil {
+		t.Fatalf("CreateContainer b: %v", err)
+	}
+
+	// owner_id is the container ROW id.
+	mustInsertSample(t, s, ca.ID, 100, 12.5, 2048)
+	mustInsertSample(t, s, ca.ID, 200, 15.0, 3072)
+	mustInsertSample(t, s, cb.ID, 150, 99.0, 9999)
+
+	got, err := s.ListContainerStatsSamplesByWorkload(wa.ID, 0)
+	if err != nil {
+		t.Fatalf("ListContainerStatsSamplesByWorkload: %v", err)
+	}
+	if len(got) != 2 {
+		t.Fatalf("expected 2 samples for workload a, got %d", len(got))
+	}
+	// ts ascending.
+	if got[0].TS != 100 || got[1].TS != 200 {
+		t.Fatalf("expected ts-ascending 100,200, got %d,%d", got[0].TS, got[1].TS)
+	}
+	for _, sm := range got {
+		if sm.OwnerID != ca.ID {
+			t.Fatalf("leaked a sample from another workload: %+v", sm)
+		}
+	}
+
+	// Since-cutoff filters older samples. The error is checked so a failing
+	// query is reported as such instead of as an unexpected result set.
+	recent, err := s.ListContainerStatsSamplesByWorkload(wa.ID, 150)
+	if err != nil {
+		t.Fatalf("ListContainerStatsSamplesByWorkload (since=150): %v", err)
+	}
+	if len(recent) != 1 || recent[0].TS != 200 {
+		t.Fatalf("expected only ts=200 after cutoff, got %+v", recent)
+	}
+}
+
+// mustInsertSample stores one stats sample with owner type "instance" for
+// the given owner id, failing the test if the insert errors. The sample's
+// ContainerID is "c-" + ownerID and its memory limit is twice mem.
+func mustInsertSample(t *testing.T, s *Store, ownerID string, ts int64, cpu float64, mem int64) {
+	t.Helper()
+	sample := ContainerStatsSample{
+		ContainerID: "c-" + ownerID,
+		OwnerType:   "instance",
+		OwnerID:     ownerID,
+		TS:          ts,
+		CPUPercent:  cpu,
+		MemoryUsage: mem,
+		MemoryLimit: mem * 2,
+	}
+	if err := s.InsertContainerStatsSample(sample); err != nil {
+		t.Fatalf("InsertContainerStatsSample: %v", err)
+	}
+}
@@ -74,6 +74,43 @@ func (s *Store) ListContainerStatsSamples(ownerType, ownerID string, sinceTS int
 	return out, rows.Err()
 }

+// ListContainerStatsSamplesByWorkload collects the stats samples of every
+// container belonging to a workload, restricted to ts >= sinceTS and sorted
+// by ts ascending. A sample is tied to its workload through the containers
+// index: the sample's owner_id is matched against the container row id, and
+// the container row carries the workload id. Feeds the per-workload metrics
+// graph on /apps/[id]. The result is nil when nothing matches.
+//
+// NOTE(review): the join matches on owner_id alone and does not filter on
+// owner_type — confirm that owner ids cannot collide across owner types.
+func (s *Store) ListContainerStatsSamplesByWorkload(workloadID string, sinceTS int64) ([]ContainerStatsSample, error) {
+	rows, err := s.db.Query(
+		`SELECT cs.container_id, cs.owner_type, cs.owner_id, cs.ts,
+		        cs.cpu_percent, cs.memory_usage, cs.memory_limit,
+		        cs.network_rx, cs.network_tx, cs.block_read, cs.block_write
+		 FROM container_stats_samples cs
+		 JOIN containers c ON c.id = cs.owner_id
+		 WHERE c.workload_id = ? AND cs.ts >= ?
+		 ORDER BY cs.ts ASC`,
+		workloadID, sinceTS,
+	)
+	if err != nil {
+		return nil, fmt.Errorf("list container stats samples by workload: %w", err)
+	}
+	defer rows.Close()
+
+	var samples []ContainerStatsSample
+	for rows.Next() {
+		// Named so it does not shadow the receiver.
+		var sample ContainerStatsSample
+		scanErr := rows.Scan(
+			&sample.ContainerID, &sample.OwnerType, &sample.OwnerID, &sample.TS,
+			&sample.CPUPercent, &sample.MemoryUsage, &sample.MemoryLimit,
+			&sample.NetworkRxBytes, &sample.NetworkTxBytes,
+			&sample.BlockReadBytes, &sample.BlockWriteBytes,
+		)
+		if scanErr != nil {
+			return nil, fmt.Errorf("scan container stats sample: %w", scanErr)
+		}
+		samples = append(samples, sample)
+	}
+	return samples, rows.Err()
+}
+
 // ListAllRecentContainerStatsSamples returns samples across every owner since
 // the given unix timestamp, ordered by ts ascending. Used by the system
 // dashboard "top containers" widget where the UI wants a mixed pool.
@@ -459,6 +459,28 @@ func (s *Store) runMigrations() error {
 		)`,
 		`CREATE UNIQUE INDEX IF NOT EXISTS idx_shared_secrets_scope_name ON shared_secrets(scope, app_id, name)`,
 		`CREATE INDEX IF NOT EXISTS idx_shared_secrets_app ON shared_secrets(app_id)`,
+		// deploy_history: structured, version-pinned ledger of every deploy
+		// dispatch (success AND failure) per workload. Distinct from the
+		// free-text event_log — this carries the replayable `reference` the
+		// rollback action redeploys from. `error` holds only a generic,
+		// secret-free marker (the raw source error can echo registry-auth /
+		// compose stdout, so it goes to slog only). FK cascade is backed by
+		// PRAGMA foreign_keys=ON, but DeleteWorkload also deletes these rows
+		// explicitly (matching the containers cleanup convention).
+		`CREATE TABLE IF NOT EXISTS deploy_history (
+			id            INTEGER PRIMARY KEY AUTOINCREMENT,
+			workload_id   TEXT NOT NULL REFERENCES workloads(id) ON DELETE CASCADE,
+			source_kind   TEXT NOT NULL DEFAULT '',
+			reference     TEXT NOT NULL DEFAULT '',
+			reason        TEXT NOT NULL DEFAULT '',
+			triggered_by  TEXT NOT NULL DEFAULT '',
+			note          TEXT NOT NULL DEFAULT '',
+			outcome       TEXT NOT NULL DEFAULT '',
+			error         TEXT NOT NULL DEFAULT '',
+			started_at    TEXT NOT NULL DEFAULT '',
+			finished_at   TEXT NOT NULL DEFAULT ''
+		)`,
+		`CREATE INDEX IF NOT EXISTS idx_deploy_history_workload ON deploy_history(workload_id, id DESC)`,
 	}
 	for _, t := range observabilityTables {
 		if _, err := s.db.Exec(t); err != nil {
@@ -190,6 +190,12 @@ func (s *Store) DeleteWorkload(id string) error {
 	if _, err := tx.Exec(`DELETE FROM containers WHERE workload_id = ?`, id); err != nil {
 		return fmt.Errorf("delete containers: %w", err)
 	}
+	// Deploy ledger rows are FK-cascaded, but we delete them explicitly in
+	// the same transaction — consistent with the containers cleanup above
+	// and robust even if the cascade is ever disabled.
+	if _, err := tx.Exec(`DELETE FROM deploy_history WHERE workload_id = ?`, id); err != nil {
+		return fmt.Errorf("delete deploy history: %w", err)
+	}
 	result, err := tx.Exec(`DELETE FROM workloads WHERE id = ?`, id)
 	if err != nil {
 		return fmt.Errorf("delete workload: %w", err)