feat(apps): per-workload deploy history, rollback, and resource metrics

Two additions to the app detail page, each backed by a per-workload
endpoint.

Deploy history + rollback:
- New deploy_history table — a structured, version-pinned ledger of every
  dispatch (success AND failure), distinct from the free-text event_log.
  Recorded at the single DispatchPlugin choke point so every source kind
  is covered. The raw deploy error is never persisted (it can carry
  registry-auth / compose-stdout secrets) — only a generic marker, with
  detail going to slog. Pruned to the newest N per workload; cascade-
  deleted with the workload.
- GET /api/workloads/{id}/deploys lists the ledger; POST .../rollback
  (admin) replays a prior successful deploy's pinned reference as a
  rollback-reason dispatch. Phase 1 is image-source only (RollbackCapable);
  git-built sources need checkout-by-commit, a later phase.
- DeployHistoryPanel.svelte renders the ledger with confirm-gated rollback.

Per-workload metrics:
- ListContainerStatsSamplesByWorkload joins the existing container stats
  samples through the containers index; GET /api/workloads/{id}/stats/history
  aggregates CPU/memory per timestamp across the workload's containers.
- WorkloadMetricsPanel.svelte reuses ResourceChart (CPU% + memory MiB,
  windowed, 15s poll).

en/ru i18n added with parity. Tests: store CRUD + cascade + workload-scoped
join, deployer recording (incl. secret-non-leak on failure), API rollback
guards, and per-timestamp aggregation. Plans under docs/plans/.
This commit is contained in:
2026-06-19 16:22:12 +03:00
parent c8e71a0c34
commit 0c4c338bfe
23 changed files with 1828 additions and 0 deletions
+123
View File
@@ -0,0 +1,123 @@
package store
import (
"database/sql"
"errors"
"fmt"
)
// InsertDeployHistory writes a single entry into the deploy_history ledger and
// returns that entry with its database-assigned ID filled in.
//
// An empty StartedAt or FinishedAt is stamped with Now() before the insert.
// The deployer choke point calls this on a best-effort basis: a failure here
// must never fail a deploy that otherwise succeeded. e.Error is stored
// verbatim, so callers are expected to pass only a fixed, secret-free marker —
// never the raw source error.
func (s *Store) InsertDeployHistory(e DeployHistoryEntry) (DeployHistoryEntry, error) {
	const insertSQL = `INSERT INTO deploy_history
(workload_id, source_kind, reference, reason, triggered_by,
note, outcome, error, started_at, finished_at)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`

	// Default whichever timestamps the caller left blank (start first, then
	// finish, each with its own Now() call).
	for _, stamp := range []*string{&e.StartedAt, &e.FinishedAt} {
		if *stamp == "" {
			*stamp = Now()
		}
	}

	result, execErr := s.db.Exec(
		insertSQL,
		e.WorkloadID, e.SourceKind, e.Reference, e.Reason, e.TriggeredBy,
		e.Note, e.Outcome, e.Error, e.StartedAt, e.FinishedAt,
	)
	if execErr != nil {
		return DeployHistoryEntry{}, fmt.Errorf("insert deploy history: %w", execErr)
	}

	rowID, idErr := result.LastInsertId()
	if idErr != nil {
		return DeployHistoryEntry{}, fmt.Errorf("get deploy history id: %w", idErr)
	}

	e.ID = rowID
	return e, nil
}
// ListDeployHistory returns a workload's ledger newest-first (highest id
// first), paginated by limit/offset.
//
// limit/offset are assumed pre-clamped by the API layer. A non-positive limit
// falls back to 50 so a bad query can't return the whole table, and a negative
// offset is treated as 0. The result is always a non-nil slice, even when no
// rows match.
//
// The slice's initial capacity is only a hint and is capped independently of
// limit: a caller that passes an oversized limit must not force a
// correspondingly huge up-front allocation when the workload has few rows.
func (s *Store) ListDeployHistory(workloadID string, limit, offset int) ([]DeployHistoryEntry, error) {
	const (
		defaultLimit = 50
		// maxPrealloc bounds the capacity hint only; it does not limit how
		// many rows are returned.
		maxPrealloc = 256
	)
	if limit <= 0 {
		limit = defaultLimit
	}
	if offset < 0 {
		offset = 0
	}
	rows, err := s.db.Query(
		`SELECT id, workload_id, source_kind, reference, reason, triggered_by,
note, outcome, error, started_at, finished_at
FROM deploy_history
WHERE workload_id = ?
ORDER BY id DESC
LIMIT ? OFFSET ?`,
		workloadID, limit, offset,
	)
	if err != nil {
		return nil, fmt.Errorf("query deploy history: %w", err)
	}
	defer rows.Close()

	capHint := limit
	if capHint > maxPrealloc {
		capHint = maxPrealloc
	}
	out := make([]DeployHistoryEntry, 0, capHint)
	for rows.Next() {
		var e DeployHistoryEntry
		if err := rows.Scan(
			&e.ID, &e.WorkloadID, &e.SourceKind, &e.Reference, &e.Reason,
			&e.TriggeredBy, &e.Note, &e.Outcome, &e.Error, &e.StartedAt, &e.FinishedAt,
		); err != nil {
			return nil, fmt.Errorf("scan deploy history: %w", err)
		}
		out = append(out, e)
	}
	return out, rows.Err()
}
// GetDeployHistory looks up a single ledger row by its id. When no row has
// that id the returned error wraps ErrNotFound; any other scan failure is
// wrapped and returned as-is. The rollback handler relies on this to resolve
// the pinned reference it replays.
func (s *Store) GetDeployHistory(id int64) (DeployHistoryEntry, error) {
	const selectSQL = `SELECT id, workload_id, source_kind, reference, reason, triggered_by,
note, outcome, error, started_at, finished_at
FROM deploy_history WHERE id = ?`

	var entry DeployHistoryEntry
	scanErr := s.db.QueryRow(selectSQL, id).Scan(
		&entry.ID, &entry.WorkloadID, &entry.SourceKind, &entry.Reference, &entry.Reason,
		&entry.TriggeredBy, &entry.Note, &entry.Outcome, &entry.Error, &entry.StartedAt, &entry.FinishedAt,
	)

	switch {
	case scanErr == nil:
		return entry, nil
	case errors.Is(scanErr, sql.ErrNoRows):
		return DeployHistoryEntry{}, fmt.Errorf("deploy history %d: %w", id, ErrNotFound)
	default:
		return DeployHistoryEntry{}, fmt.Errorf("scan deploy history: %w", scanErr)
	}
}
// PruneDeployHistory trims a workload's ledger down to its newest `keep` rows
// so hot workloads cannot grow the table without bound.
//
// "Newest" is decided by id: the statement selects the `keep` highest ids for
// the workload and deletes every other row belonging to it. A non-positive
// keep means "retain the default of 50", never "delete everything". Rows of
// other workloads are untouched.
func (s *Store) PruneDeployHistory(workloadID string, keep int) error {
	const pruneSQL = `DELETE FROM deploy_history
WHERE workload_id = ?
AND id NOT IN (
SELECT id FROM deploy_history
WHERE workload_id = ?
ORDER BY id DESC
LIMIT ?
)`

	retain := keep
	if retain <= 0 {
		retain = 50
	}

	// The workload id is bound twice: once for the outer DELETE, once for the
	// sub-select that picks the survivors.
	if _, execErr := s.db.Exec(pruneSQL, workloadID, workloadID, retain); execErr != nil {
		return fmt.Errorf("prune deploy history: %w", execErr)
	}
	return nil
}
+133
View File
@@ -0,0 +1,133 @@
package store
import (
"errors"
"testing"
)
// seedWorkload is a test helper that stores a "project" workload whose RefID
// and Name are both set to name, aborting the test if creation fails.
func seedWorkload(t *testing.T, s *Store, name string) Workload {
	t.Helper()

	spec := Workload{Kind: "project", RefID: name, Name: name}
	created, createErr := s.CreateWorkload(spec)
	if createErr != nil {
		t.Fatalf("CreateWorkload(%s): %v", name, createErr)
	}
	return created
}
// TestDeployHistory_InsertListGet covers the happy path of the ledger: an
// insert assigns an id and defaults the timestamps, List returns rows
// newest-first, and Get round-trips a stored row by id.
func TestDeployHistory_InsertListGet(t *testing.T) {
	s := newTestStore(t)
	w := seedWorkload(t, s, "app1")

	first, err := s.InsertDeployHistory(DeployHistoryEntry{
		WorkloadID: w.ID, SourceKind: "image", Reference: "v1",
		Reason: "manual", TriggeredBy: "admin", Outcome: "success",
	})
	if err != nil {
		t.Fatalf("InsertDeployHistory: %v", err)
	}
	if first.ID == 0 {
		t.Fatal("expected non-zero id")
	}
	if first.StartedAt == "" || first.FinishedAt == "" {
		t.Fatal("expected timestamps to be defaulted")
	}

	// Check this insert too: a silent failure would leave second.ID == 0 and
	// surface later as a misleading "ordering" failure.
	second, err := s.InsertDeployHistory(DeployHistoryEntry{
		WorkloadID: w.ID, SourceKind: "image", Reference: "v2",
		Reason: "registry-push", Outcome: "success",
	})
	if err != nil {
		t.Fatalf("InsertDeployHistory (second): %v", err)
	}

	list, err := s.ListDeployHistory(w.ID, 10, 0)
	if err != nil {
		t.Fatalf("ListDeployHistory: %v", err)
	}
	if len(list) != 2 {
		t.Fatalf("expected 2 rows, got %d", len(list))
	}
	// Newest-first ordering.
	if list[0].ID != second.ID || list[1].ID != first.ID {
		t.Fatalf("expected newest-first ordering, got %d then %d", list[0].ID, list[1].ID)
	}

	got, err := s.GetDeployHistory(first.ID)
	if err != nil {
		t.Fatalf("GetDeployHistory: %v", err)
	}
	if got.Reference != "v1" || got.SourceKind != "image" {
		t.Fatalf("unexpected row: %+v", got)
	}
}
// TestDeployHistory_GetNotFound asserts that looking up an id with no row in
// a fresh store yields an error matching ErrNotFound.
func TestDeployHistory_GetNotFound(t *testing.T) {
	store := newTestStore(t)

	if _, lookupErr := store.GetDeployHistory(999); !errors.Is(lookupErr, ErrNotFound) {
		t.Fatalf("expected ErrNotFound, got %v", lookupErr)
	}
}
// TestDeployHistory_ListScopedToWorkload verifies that listing one workload's
// ledger never returns rows that belong to another workload.
func TestDeployHistory_ListScopedToWorkload(t *testing.T) {
	s := newTestStore(t)
	a := seedWorkload(t, s, "a")
	b := seedWorkload(t, s, "b")

	// Both inserts must succeed: if b's row were silently missing, the
	// scoping assertion below would pass without proving anything.
	if _, err := s.InsertDeployHistory(DeployHistoryEntry{WorkloadID: a.ID, Outcome: "success"}); err != nil {
		t.Fatalf("InsertDeployHistory(a): %v", err)
	}
	if _, err := s.InsertDeployHistory(DeployHistoryEntry{WorkloadID: b.ID, Outcome: "success"}); err != nil {
		t.Fatalf("InsertDeployHistory(b): %v", err)
	}

	list, err := s.ListDeployHistory(a.ID, 10, 0)
	if err != nil {
		t.Fatalf("ListDeployHistory: %v", err)
	}
	if len(list) != 1 || list[0].WorkloadID != a.ID {
		t.Fatalf("expected only workload a's rows, got %+v", list)
	}
}
// TestDeployHistory_Pagination inserts five rows and checks that limit/offset
// slice the newest-first ledger into the exact expected pages.
func TestDeployHistory_Pagination(t *testing.T) {
	s := newTestStore(t)
	w := seedWorkload(t, s, "paged")

	// Remember ids in insertion order so each page can be checked row by row.
	ids := make([]int64, 0, 5)
	for i := 0; i < 5; i++ {
		e, err := s.InsertDeployHistory(DeployHistoryEntry{WorkloadID: w.ID, Outcome: "success"})
		if err != nil {
			t.Fatalf("InsertDeployHistory #%d: %v", i, err)
		}
		ids = append(ids, e.ID)
	}

	page1, err := s.ListDeployHistory(w.ID, 2, 0)
	if err != nil {
		t.Fatalf("ListDeployHistory page 1: %v", err)
	}
	page2, err := s.ListDeployHistory(w.ID, 2, 2)
	if err != nil {
		t.Fatalf("ListDeployHistory page 2: %v", err)
	}
	if len(page1) != 2 || len(page2) != 2 {
		t.Fatalf("expected 2 per page, got %d and %d", len(page1), len(page2))
	}

	// Newest-first: page 1 holds the 5th and 4th inserts, page 2 the 3rd and
	// 2nd. This also proves the pages do not overlap.
	want := [][]int64{{ids[4], ids[3]}, {ids[2], ids[1]}}
	for p, page := range [][]DeployHistoryEntry{page1, page2} {
		for i, e := range page {
			if e.ID != want[p][i] {
				t.Fatalf("page %d row %d: expected id %d, got %d", p+1, i, want[p][i], e.ID)
			}
		}
	}
}
// TestDeployHistory_Prune inserts ten rows, prunes to three, and checks that
// exactly the three most recently inserted rows survive.
func TestDeployHistory_Prune(t *testing.T) {
	const (
		total = 10
		keep  = 3
	)
	s := newTestStore(t)
	w := seedWorkload(t, s, "noisy")

	// Track ids in insertion order so the survivors can be identified, not
	// merely counted.
	ids := make([]int64, 0, total)
	for i := 0; i < total; i++ {
		e, err := s.InsertDeployHistory(DeployHistoryEntry{WorkloadID: w.ID, Outcome: "success"})
		if err != nil {
			t.Fatalf("InsertDeployHistory #%d: %v", i, err)
		}
		ids = append(ids, e.ID)
	}

	if err := s.PruneDeployHistory(w.ID, keep); err != nil {
		t.Fatalf("PruneDeployHistory: %v", err)
	}

	list, err := s.ListDeployHistory(w.ID, 100, 0)
	if err != nil {
		t.Fatalf("ListDeployHistory: %v", err)
	}
	if len(list) != keep {
		t.Fatalf("expected 3 rows after prune, got %d", len(list))
	}

	// Prune keeps the newest rows: the list is newest-first, so row i must be
	// the (i+1)-th most recent insert.
	for i, e := range list {
		if want := ids[total-1-i]; e.ID != want {
			t.Fatalf("row %d after prune: expected id %d, got %d", i, want, e.ID)
		}
	}
}
// TestDeployHistory_CascadeOnWorkloadDelete verifies that deleting a workload
// also removes every ledger row recorded for it.
func TestDeployHistory_CascadeOnWorkloadDelete(t *testing.T) {
	s := newTestStore(t)
	w := seedWorkload(t, s, "doomed")

	for _, outcome := range []string{"success", "failure"} {
		if _, err := s.InsertDeployHistory(DeployHistoryEntry{WorkloadID: w.ID, Outcome: outcome}); err != nil {
			t.Fatalf("InsertDeployHistory(%s): %v", outcome, err)
		}
	}

	// Confirm the rows exist first; otherwise an empty list after the delete
	// would prove nothing.
	before, err := s.ListDeployHistory(w.ID, 100, 0)
	if err != nil {
		t.Fatalf("ListDeployHistory before delete: %v", err)
	}
	if len(before) != 2 {
		t.Fatalf("expected 2 rows before delete, got %d", len(before))
	}

	if err := s.DeleteWorkload(w.ID); err != nil {
		t.Fatalf("DeleteWorkload: %v", err)
	}

	// The error must be checked: a failed query returns a nil slice, which
	// would otherwise look like a successful cascade.
	list, err := s.ListDeployHistory(w.ID, 100, 0)
	if err != nil {
		t.Fatalf("ListDeployHistory after delete: %v", err)
	}
	if len(list) != 0 {
		t.Fatalf("expected history removed with workload, got %d rows", len(list))
	}
}
+25
View File
@@ -507,3 +507,28 @@ type App struct {
CreatedAt string `json:"created_at"`
UpdatedAt string `json:"updated_at"`
}
// DeployHistoryEntry is one row in the per-workload deploy ledger. Unlike
// event_log (free-text human timeline), this is the structured, version-
// pinned record the rollback action replays from. Reference is the
// effective deployed artifact handle (image tag for image sources, commit
// sha for git-built sources, "" when none applies). Error is NEVER the raw
// source error — that can carry registry-auth bytes or compose stdout; it
// holds only a fixed, secret-free marker. Raw detail goes to slog.
//
// Every field except Rollbackable is read from and written to the
// deploy_history column of the corresponding snake_case name.
type DeployHistoryEntry struct {
// ID is the ledger row id. InsertDeployHistory fills it in from the
// database after the insert; rows are listed and pruned in id order.
ID int64 `json:"id"`
// WorkloadID is the id of the workload this deploy was dispatched for.
WorkloadID string `json:"workload_id"`
// SourceKind names the workload's source type (e.g. "image").
SourceKind string `json:"source_kind"`
Reference string `json:"reference"` // effective tag | commit sha | ""
Reason string `json:"reason"` // manual|registry-push|git-push|cron|rollback|promote
// TriggeredBy identifies who or what started the deploy.
// NOTE(review): presumably a username or automation name — confirm
// against the deployer.
TriggeredBy string `json:"triggered_by"`
// Note is stored and returned verbatim.
// NOTE(review): intended content is not visible here — confirm.
Note string `json:"note"`
Outcome string `json:"outcome"` // success | failure
Error string `json:"error"` // generic, secret-free marker on failure
// StartedAt and FinishedAt are stored as strings. InsertDeployHistory
// replaces an empty value with Now().
StartedAt string `json:"started_at"`
FinishedAt string `json:"finished_at"`
// Rollbackable is computed at the API layer (not persisted): a row is
// rollbackable when it succeeded, has a non-empty Reference, and its
// source kind supports reference-pinned redeploy.
Rollbackable bool `json:"rollbackable"`
}
+56
View File
@@ -0,0 +1,56 @@
package store
import "testing"
// TestListContainerStatsSamplesByWorkload_ScopedToWorkload checks that the
// workload-scoped stats query returns only samples owned by that workload's
// containers, in ascending ts order, and honours the since-cutoff.
func TestListContainerStatsSamplesByWorkload_ScopedToWorkload(t *testing.T) {
	s := newTestStore(t)
	wa := seedWorkload(t, s, "wa")
	wb := seedWorkload(t, s, "wb")

	ca, err := s.CreateContainer(Container{WorkloadID: wa.ID, WorkloadKind: "image", ContainerID: "da", Host: "local", State: "running"})
	if err != nil {
		t.Fatalf("CreateContainer a: %v", err)
	}
	cb, err := s.CreateContainer(Container{WorkloadID: wb.ID, WorkloadKind: "image", ContainerID: "db", Host: "local", State: "running"})
	if err != nil {
		t.Fatalf("CreateContainer b: %v", err)
	}

	// owner_id is the container ROW id.
	mustInsertSample(t, s, ca.ID, 100, 12.5, 2048)
	mustInsertSample(t, s, ca.ID, 200, 15.0, 3072)
	mustInsertSample(t, s, cb.ID, 150, 99.0, 9999)

	got, err := s.ListContainerStatsSamplesByWorkload(wa.ID, 0)
	if err != nil {
		t.Fatalf("ListContainerStatsSamplesByWorkload: %v", err)
	}
	if len(got) != 2 {
		t.Fatalf("expected 2 samples for workload a, got %d", len(got))
	}
	// ts ascending.
	if got[0].TS != 100 || got[1].TS != 200 {
		t.Fatalf("expected ts-ascending 100,200, got %d,%d", got[0].TS, got[1].TS)
	}
	for _, sm := range got {
		if sm.OwnerID != ca.ID {
			t.Fatalf("leaked a sample from another workload: %+v", sm)
		}
	}

	// Since-cutoff filters older samples. Report a query error as such
	// instead of letting it show up as a confusing "wrong rows" failure.
	recent, err := s.ListContainerStatsSamplesByWorkload(wa.ID, 150)
	if err != nil {
		t.Fatalf("ListContainerStatsSamplesByWorkload (since=150): %v", err)
	}
	if len(recent) != 1 || recent[0].TS != 200 {
		t.Fatalf("expected only ts=200 after cutoff, got %+v", recent)
	}
}
// mustInsertSample is a test helper that stores one "instance"-owned stats
// sample for ownerID at ts, failing the test if the insert errors. The
// sample's ContainerID is "c-"+ownerID and its MemoryLimit is twice mem.
func mustInsertSample(t *testing.T, s *Store, ownerID string, ts int64, cpu float64, mem int64) {
	t.Helper()

	sample := ContainerStatsSample{
		ContainerID: "c-" + ownerID,
		OwnerType:   "instance",
		OwnerID:     ownerID,
		TS:          ts,
		CPUPercent:  cpu,
		MemoryUsage: mem,
		MemoryLimit: mem * 2,
	}
	insertErr := s.InsertContainerStatsSample(sample)
	if insertErr != nil {
		t.Fatalf("InsertContainerStatsSample: %v", insertErr)
	}
}
+37
View File
@@ -74,6 +74,43 @@ func (s *Store) ListContainerStatsSamples(ownerType, ownerID string, sinceTS int
return out, rows.Err()
}
// ListContainerStatsSamplesByWorkload fetches the stats samples of every
// container that belongs to workloadID, restricted to samples with
// ts >= sinceTS and sorted by ts ascending.
//
// Samples carry no workload id of their own; they are matched to a workload
// by joining owner_id against the containers index (owner_id is the container
// row id). The result is nil when nothing matches. Powers the per-workload
// metrics graph on /apps/[id].
//
// NOTE(review): the join does not filter on owner_type, so it relies on
// owner_id values of other owner types never colliding with a container row
// id — confirm.
func (s *Store) ListContainerStatsSamplesByWorkload(workloadID string, sinceTS int64) ([]ContainerStatsSample, error) {
	const byWorkloadSQL = `SELECT cs.container_id, cs.owner_type, cs.owner_id, cs.ts,
cs.cpu_percent, cs.memory_usage, cs.memory_limit,
cs.network_rx, cs.network_tx, cs.block_read, cs.block_write
FROM container_stats_samples cs
JOIN containers c ON c.id = cs.owner_id
WHERE c.workload_id = ? AND cs.ts >= ?
ORDER BY cs.ts ASC`

	rows, queryErr := s.db.Query(byWorkloadSQL, workloadID, sinceTS)
	if queryErr != nil {
		return nil, fmt.Errorf("list container stats samples by workload: %w", queryErr)
	}
	defer rows.Close()

	var samples []ContainerStatsSample
	for rows.Next() {
		// Named "sample" rather than "s" so the receiver is not shadowed.
		var sample ContainerStatsSample
		scanErr := rows.Scan(
			&sample.ContainerID, &sample.OwnerType, &sample.OwnerID, &sample.TS,
			&sample.CPUPercent, &sample.MemoryUsage, &sample.MemoryLimit,
			&sample.NetworkRxBytes, &sample.NetworkTxBytes,
			&sample.BlockReadBytes, &sample.BlockWriteBytes,
		)
		if scanErr != nil {
			return nil, fmt.Errorf("scan container stats sample: %w", scanErr)
		}
		samples = append(samples, sample)
	}
	return samples, rows.Err()
}
// ListAllRecentContainerStatsSamples returns samples across every owner since
// the given unix timestamp, ordered by ts ascending. Used by the system
// dashboard "top containers" widget where the UI wants a mixed pool.
+22
View File
@@ -459,6 +459,28 @@ func (s *Store) runMigrations() error {
)`,
`CREATE UNIQUE INDEX IF NOT EXISTS idx_shared_secrets_scope_name ON shared_secrets(scope, app_id, name)`,
`CREATE INDEX IF NOT EXISTS idx_shared_secrets_app ON shared_secrets(app_id)`,
// deploy_history: structured, version-pinned ledger of every deploy
// dispatch (success AND failure) per workload. Distinct from the
// free-text event_log — this carries the replayable `reference` the
// rollback action redeploys from. `error` holds only a generic,
// secret-free marker (the raw source error can echo registry-auth /
// compose stdout, so it goes to slog only). FK cascade is backed by
// PRAGMA foreign_keys=ON, but DeleteWorkload also deletes these rows
// explicitly (matching the containers cleanup convention).
`CREATE TABLE IF NOT EXISTS deploy_history (
id INTEGER PRIMARY KEY AUTOINCREMENT,
workload_id TEXT NOT NULL REFERENCES workloads(id) ON DELETE CASCADE,
source_kind TEXT NOT NULL DEFAULT '',
reference TEXT NOT NULL DEFAULT '',
reason TEXT NOT NULL DEFAULT '',
triggered_by TEXT NOT NULL DEFAULT '',
note TEXT NOT NULL DEFAULT '',
outcome TEXT NOT NULL DEFAULT '',
error TEXT NOT NULL DEFAULT '',
started_at TEXT NOT NULL DEFAULT '',
finished_at TEXT NOT NULL DEFAULT ''
)`,
`CREATE INDEX IF NOT EXISTS idx_deploy_history_workload ON deploy_history(workload_id, id DESC)`,
}
for _, t := range observabilityTables {
if _, err := s.db.Exec(t); err != nil {
+6
View File
@@ -190,6 +190,12 @@ func (s *Store) DeleteWorkload(id string) error {
if _, err := tx.Exec(`DELETE FROM containers WHERE workload_id = ?`, id); err != nil {
return fmt.Errorf("delete containers: %w", err)
}
// Deploy ledger rows are FK-cascaded, but we delete them explicitly in
// the same transaction — consistent with the containers cleanup above
// and robust even if the cascade is ever disabled.
if _, err := tx.Exec(`DELETE FROM deploy_history WHERE workload_id = ?`, id); err != nil {
return fmt.Errorf("delete deploy history: %w", err)
}
result, err := tx.Exec(`DELETE FROM workloads WHERE id = ?`, id)
if err != nil {
return fmt.Errorf("delete workload: %w", err)