feat(apps): per-workload deploy history, rollback, and resource metrics
Two additions to the app detail page, each backed by a per-workload
endpoint.
Deploy history + rollback:
- New deploy_history table — a structured, version-pinned ledger of every
dispatch (success AND failure), distinct from the free-text event_log.
Recorded at the single DispatchPlugin choke point so every source kind
is covered. The raw deploy error is never persisted (it can carry
registry-auth / compose-stdout secrets) — only a generic marker, with
detail going to slog. Pruned to the newest N per workload; cascade-
deleted with the workload.
- GET /api/workloads/{id}/deploys lists the ledger; POST .../rollback
(admin) replays a prior successful deploy's pinned reference as a
rollback-reason dispatch. Phase 1 is image-source only (RollbackCapable);
git-built sources need checkout-by-commit, a later phase.
- DeployHistoryPanel.svelte renders the ledger with confirm-gated rollback.
Per-workload metrics:
- ListContainerStatsSamplesByWorkload joins the existing container stats
samples through the containers index; GET /api/workloads/{id}/stats/history
aggregates CPU/memory per timestamp across the workload's containers.
- WorkloadMetricsPanel.svelte reuses ResourceChart (CPU% + memory MiB,
windowed, 15s poll).
en/ru i18n added with parity. Tests: store CRUD + cascade + workload-scoped
join, deployer recording (incl. secret-non-leak on failure), API rollback
guards, and per-timestamp aggregation. Plans under docs/plans/.
This commit is contained in:
@@ -0,0 +1,123 @@
|
||||
package store
|
||||
|
||||
import (
|
||||
"database/sql"
|
||||
"errors"
|
||||
"fmt"
|
||||
)
|
||||
|
||||
// InsertDeployHistory appends one row to the per-workload deploy ledger.
|
||||
// Callers (the deployer choke point) treat this as best-effort: a failure
|
||||
// here must never fail an otherwise-successful deploy. Error is expected to
|
||||
// be a fixed, secret-free marker — never the raw source error.
|
||||
func (s *Store) InsertDeployHistory(e DeployHistoryEntry) (DeployHistoryEntry, error) {
|
||||
if e.StartedAt == "" {
|
||||
e.StartedAt = Now()
|
||||
}
|
||||
if e.FinishedAt == "" {
|
||||
e.FinishedAt = Now()
|
||||
}
|
||||
res, err := s.db.Exec(
|
||||
`INSERT INTO deploy_history
|
||||
(workload_id, source_kind, reference, reason, triggered_by,
|
||||
note, outcome, error, started_at, finished_at)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`,
|
||||
e.WorkloadID, e.SourceKind, e.Reference, e.Reason, e.TriggeredBy,
|
||||
e.Note, e.Outcome, e.Error, e.StartedAt, e.FinishedAt,
|
||||
)
|
||||
if err != nil {
|
||||
return DeployHistoryEntry{}, fmt.Errorf("insert deploy history: %w", err)
|
||||
}
|
||||
id, err := res.LastInsertId()
|
||||
if err != nil {
|
||||
return DeployHistoryEntry{}, fmt.Errorf("get deploy history id: %w", err)
|
||||
}
|
||||
e.ID = id
|
||||
return e, nil
|
||||
}
|
||||
|
||||
// ListDeployHistory returns a workload's ledger newest-first. limit/offset
|
||||
// are assumed pre-clamped by the API layer; a non-positive limit falls back
|
||||
// to a sane default so a bad query can't return the whole table.
|
||||
func (s *Store) ListDeployHistory(workloadID string, limit, offset int) ([]DeployHistoryEntry, error) {
|
||||
if limit <= 0 {
|
||||
limit = 50
|
||||
}
|
||||
if offset < 0 {
|
||||
offset = 0
|
||||
}
|
||||
rows, err := s.db.Query(
|
||||
`SELECT id, workload_id, source_kind, reference, reason, triggered_by,
|
||||
note, outcome, error, started_at, finished_at
|
||||
FROM deploy_history
|
||||
WHERE workload_id = ?
|
||||
ORDER BY id DESC
|
||||
LIMIT ? OFFSET ?`,
|
||||
workloadID, limit, offset,
|
||||
)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("query deploy history: %w", err)
|
||||
}
|
||||
defer rows.Close()
|
||||
|
||||
out := make([]DeployHistoryEntry, 0, limit)
|
||||
for rows.Next() {
|
||||
var e DeployHistoryEntry
|
||||
if err := rows.Scan(
|
||||
&e.ID, &e.WorkloadID, &e.SourceKind, &e.Reference, &e.Reason,
|
||||
&e.TriggeredBy, &e.Note, &e.Outcome, &e.Error, &e.StartedAt, &e.FinishedAt,
|
||||
); err != nil {
|
||||
return nil, fmt.Errorf("scan deploy history: %w", err)
|
||||
}
|
||||
out = append(out, e)
|
||||
}
|
||||
return out, rows.Err()
|
||||
}
|
||||
|
||||
// GetDeployHistory fetches one ledger row by id, or ErrNotFound. The
|
||||
// rollback handler uses this to resolve the pinned reference to replay.
|
||||
func (s *Store) GetDeployHistory(id int64) (DeployHistoryEntry, error) {
|
||||
row := s.db.QueryRow(
|
||||
`SELECT id, workload_id, source_kind, reference, reason, triggered_by,
|
||||
note, outcome, error, started_at, finished_at
|
||||
FROM deploy_history WHERE id = ?`, id,
|
||||
)
|
||||
var e DeployHistoryEntry
|
||||
err := row.Scan(
|
||||
&e.ID, &e.WorkloadID, &e.SourceKind, &e.Reference, &e.Reason,
|
||||
&e.TriggeredBy, &e.Note, &e.Outcome, &e.Error, &e.StartedAt, &e.FinishedAt,
|
||||
)
|
||||
if errors.Is(err, sql.ErrNoRows) {
|
||||
return DeployHistoryEntry{}, fmt.Errorf("deploy history %d: %w", id, ErrNotFound)
|
||||
}
|
||||
if err != nil {
|
||||
return DeployHistoryEntry{}, fmt.Errorf("scan deploy history: %w", err)
|
||||
}
|
||||
return e, nil
|
||||
}
|
||||
|
||||
// PruneDeployHistory keeps only the newest `keep` rows for a workload,
|
||||
// deleting older ones. Bounds unbounded growth on hot workloads. Best-
|
||||
// effort and id-monotonic (newer rows always have larger ids), so it
|
||||
// deletes everything below the keep-th id. A non-positive keep is treated
|
||||
// as "keep a sane default" rather than "delete everything".
|
||||
func (s *Store) PruneDeployHistory(workloadID string, keep int) error {
|
||||
if keep <= 0 {
|
||||
keep = 50
|
||||
}
|
||||
_, err := s.db.Exec(
|
||||
`DELETE FROM deploy_history
|
||||
WHERE workload_id = ?
|
||||
AND id NOT IN (
|
||||
SELECT id FROM deploy_history
|
||||
WHERE workload_id = ?
|
||||
ORDER BY id DESC
|
||||
LIMIT ?
|
||||
)`,
|
||||
workloadID, workloadID, keep,
|
||||
)
|
||||
if err != nil {
|
||||
return fmt.Errorf("prune deploy history: %w", err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
@@ -0,0 +1,133 @@
|
||||
package store
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func seedWorkload(t *testing.T, s *Store, name string) Workload {
|
||||
t.Helper()
|
||||
w, err := s.CreateWorkload(Workload{Kind: "project", RefID: name, Name: name})
|
||||
if err != nil {
|
||||
t.Fatalf("CreateWorkload(%s): %v", name, err)
|
||||
}
|
||||
return w
|
||||
}
|
||||
|
||||
func TestDeployHistory_InsertListGet(t *testing.T) {
|
||||
s := newTestStore(t)
|
||||
w := seedWorkload(t, s, "app1")
|
||||
|
||||
first, err := s.InsertDeployHistory(DeployHistoryEntry{
|
||||
WorkloadID: w.ID, SourceKind: "image", Reference: "v1",
|
||||
Reason: "manual", TriggeredBy: "admin", Outcome: "success",
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("InsertDeployHistory: %v", err)
|
||||
}
|
||||
if first.ID == 0 {
|
||||
t.Fatal("expected non-zero id")
|
||||
}
|
||||
if first.StartedAt == "" || first.FinishedAt == "" {
|
||||
t.Fatal("expected timestamps to be defaulted")
|
||||
}
|
||||
|
||||
second, _ := s.InsertDeployHistory(DeployHistoryEntry{
|
||||
WorkloadID: w.ID, SourceKind: "image", Reference: "v2",
|
||||
Reason: "registry-push", Outcome: "success",
|
||||
})
|
||||
|
||||
list, err := s.ListDeployHistory(w.ID, 10, 0)
|
||||
if err != nil {
|
||||
t.Fatalf("ListDeployHistory: %v", err)
|
||||
}
|
||||
if len(list) != 2 {
|
||||
t.Fatalf("expected 2 rows, got %d", len(list))
|
||||
}
|
||||
// Newest-first ordering.
|
||||
if list[0].ID != second.ID || list[1].ID != first.ID {
|
||||
t.Fatalf("expected newest-first ordering, got %d then %d", list[0].ID, list[1].ID)
|
||||
}
|
||||
|
||||
got, err := s.GetDeployHistory(first.ID)
|
||||
if err != nil {
|
||||
t.Fatalf("GetDeployHistory: %v", err)
|
||||
}
|
||||
if got.Reference != "v1" || got.SourceKind != "image" {
|
||||
t.Fatalf("unexpected row: %+v", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestDeployHistory_GetNotFound(t *testing.T) {
|
||||
s := newTestStore(t)
|
||||
_, err := s.GetDeployHistory(999)
|
||||
if !errors.Is(err, ErrNotFound) {
|
||||
t.Fatalf("expected ErrNotFound, got %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestDeployHistory_ListScopedToWorkload(t *testing.T) {
|
||||
s := newTestStore(t)
|
||||
a := seedWorkload(t, s, "a")
|
||||
b := seedWorkload(t, s, "b")
|
||||
s.InsertDeployHistory(DeployHistoryEntry{WorkloadID: a.ID, Outcome: "success"})
|
||||
s.InsertDeployHistory(DeployHistoryEntry{WorkloadID: b.ID, Outcome: "success"})
|
||||
|
||||
list, _ := s.ListDeployHistory(a.ID, 10, 0)
|
||||
if len(list) != 1 || list[0].WorkloadID != a.ID {
|
||||
t.Fatalf("expected only workload a's rows, got %+v", list)
|
||||
}
|
||||
}
|
||||
|
||||
func TestDeployHistory_Pagination(t *testing.T) {
|
||||
s := newTestStore(t)
|
||||
w := seedWorkload(t, s, "paged")
|
||||
for i := 0; i < 5; i++ {
|
||||
s.InsertDeployHistory(DeployHistoryEntry{WorkloadID: w.ID, Outcome: "success"})
|
||||
}
|
||||
page1, _ := s.ListDeployHistory(w.ID, 2, 0)
|
||||
page2, _ := s.ListDeployHistory(w.ID, 2, 2)
|
||||
if len(page1) != 2 || len(page2) != 2 {
|
||||
t.Fatalf("expected 2 per page, got %d and %d", len(page1), len(page2))
|
||||
}
|
||||
if page1[0].ID == page2[0].ID {
|
||||
t.Fatal("expected distinct rows across pages")
|
||||
}
|
||||
}
|
||||
|
||||
func TestDeployHistory_Prune(t *testing.T) {
|
||||
s := newTestStore(t)
|
||||
w := seedWorkload(t, s, "noisy")
|
||||
for i := 0; i < 10; i++ {
|
||||
s.InsertDeployHistory(DeployHistoryEntry{WorkloadID: w.ID, Outcome: "success"})
|
||||
}
|
||||
if err := s.PruneDeployHistory(w.ID, 3); err != nil {
|
||||
t.Fatalf("PruneDeployHistory: %v", err)
|
||||
}
|
||||
list, _ := s.ListDeployHistory(w.ID, 100, 0)
|
||||
if len(list) != 3 {
|
||||
t.Fatalf("expected 3 rows after prune, got %d", len(list))
|
||||
}
|
||||
// Prune keeps the newest rows.
|
||||
all, _ := s.ListDeployHistory(w.ID, 100, 0)
|
||||
for i := 1; i < len(all); i++ {
|
||||
if all[i-1].ID < all[i].ID {
|
||||
t.Fatal("expected newest-first after prune")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestDeployHistory_CascadeOnWorkloadDelete(t *testing.T) {
|
||||
s := newTestStore(t)
|
||||
w := seedWorkload(t, s, "doomed")
|
||||
s.InsertDeployHistory(DeployHistoryEntry{WorkloadID: w.ID, Outcome: "success"})
|
||||
s.InsertDeployHistory(DeployHistoryEntry{WorkloadID: w.ID, Outcome: "failure"})
|
||||
|
||||
if err := s.DeleteWorkload(w.ID); err != nil {
|
||||
t.Fatalf("DeleteWorkload: %v", err)
|
||||
}
|
||||
list, _ := s.ListDeployHistory(w.ID, 100, 0)
|
||||
if len(list) != 0 {
|
||||
t.Fatalf("expected history removed with workload, got %d rows", len(list))
|
||||
}
|
||||
}
|
||||
@@ -507,3 +507,28 @@ type App struct {
|
||||
CreatedAt string `json:"created_at"`
|
||||
UpdatedAt string `json:"updated_at"`
|
||||
}
|
||||
|
||||
// DeployHistoryEntry is one row in the per-workload deploy ledger. Unlike
// event_log (a free-text human timeline), this is the structured, version-
// pinned record that the rollback action replays from.
//
// Reference is the effective deployed artifact handle: the image tag for
// image sources, the commit sha for git-built sources, or "" when none
// applies.
//
// Error is NEVER the raw source error — that can carry registry-auth bytes
// or compose stdout. It holds only a fixed, secret-free marker; raw detail
// goes to slog.
type DeployHistoryEntry struct {
	ID          int64  `json:"id"`
	WorkloadID  string `json:"workload_id"`
	SourceKind  string `json:"source_kind"`
	Reference   string `json:"reference"` // effective tag | commit sha | ""
	Reason      string `json:"reason"`    // manual|registry-push|git-push|cron|rollback|promote
	TriggeredBy string `json:"triggered_by"`
	Note        string `json:"note"`
	Outcome     string `json:"outcome"` // success | failure
	Error       string `json:"error"`   // generic, secret-free marker on failure
	StartedAt   string `json:"started_at"`
	FinishedAt  string `json:"finished_at"`
	// Rollbackable is computed at the API layer and is not persisted. A
	// row is rollbackable when it succeeded, has a non-empty Reference,
	// and its source kind supports reference-pinned redeploy.
	Rollbackable bool `json:"rollbackable"`
}
|
||||
|
||||
@@ -0,0 +1,56 @@
|
||||
package store
|
||||
|
||||
import "testing"
|
||||
|
||||
func TestListContainerStatsSamplesByWorkload_ScopedToWorkload(t *testing.T) {
|
||||
s := newTestStore(t)
|
||||
wa := seedWorkload(t, s, "wa")
|
||||
wb := seedWorkload(t, s, "wb")
|
||||
|
||||
ca, err := s.CreateContainer(Container{WorkloadID: wa.ID, WorkloadKind: "image", ContainerID: "da", Host: "local", State: "running"})
|
||||
if err != nil {
|
||||
t.Fatalf("CreateContainer a: %v", err)
|
||||
}
|
||||
cb, err := s.CreateContainer(Container{WorkloadID: wb.ID, WorkloadKind: "image", ContainerID: "db", Host: "local", State: "running"})
|
||||
if err != nil {
|
||||
t.Fatalf("CreateContainer b: %v", err)
|
||||
}
|
||||
|
||||
// owner_id is the container ROW id.
|
||||
mustInsertSample(t, s, ca.ID, 100, 12.5, 2048)
|
||||
mustInsertSample(t, s, ca.ID, 200, 15.0, 3072)
|
||||
mustInsertSample(t, s, cb.ID, 150, 99.0, 9999)
|
||||
|
||||
got, err := s.ListContainerStatsSamplesByWorkload(wa.ID, 0)
|
||||
if err != nil {
|
||||
t.Fatalf("ListContainerStatsSamplesByWorkload: %v", err)
|
||||
}
|
||||
if len(got) != 2 {
|
||||
t.Fatalf("expected 2 samples for workload a, got %d", len(got))
|
||||
}
|
||||
// ts ascending.
|
||||
if got[0].TS != 100 || got[1].TS != 200 {
|
||||
t.Fatalf("expected ts-ascending 100,200, got %d,%d", got[0].TS, got[1].TS)
|
||||
}
|
||||
for _, sm := range got {
|
||||
if sm.OwnerID != ca.ID {
|
||||
t.Fatalf("leaked a sample from another workload: %+v", sm)
|
||||
}
|
||||
}
|
||||
|
||||
// Since-cutoff filters older samples.
|
||||
recent, _ := s.ListContainerStatsSamplesByWorkload(wa.ID, 150)
|
||||
if len(recent) != 1 || recent[0].TS != 200 {
|
||||
t.Fatalf("expected only ts=200 after cutoff, got %+v", recent)
|
||||
}
|
||||
}
|
||||
|
||||
func mustInsertSample(t *testing.T, s *Store, ownerID string, ts int64, cpu float64, mem int64) {
|
||||
t.Helper()
|
||||
if err := s.InsertContainerStatsSample(ContainerStatsSample{
|
||||
ContainerID: "c-" + ownerID, OwnerType: "instance", OwnerID: ownerID, TS: ts,
|
||||
CPUPercent: cpu, MemoryUsage: mem, MemoryLimit: mem * 2,
|
||||
}); err != nil {
|
||||
t.Fatalf("InsertContainerStatsSample: %v", err)
|
||||
}
|
||||
}
|
||||
@@ -74,6 +74,43 @@ func (s *Store) ListContainerStatsSamples(ownerType, ownerID string, sinceTS int
|
||||
return out, rows.Err()
|
||||
}
|
||||
|
||||
// ListContainerStatsSamplesByWorkload returns every container sample owned by
|
||||
// a workload since the given unix timestamp, ordered by ts ascending. Samples
|
||||
// are linked to their workload through the containers index (owner_id is the
|
||||
// container row id), so this joins through it. Powers the per-workload metrics
|
||||
// graph on /apps/[id].
|
||||
func (s *Store) ListContainerStatsSamplesByWorkload(workloadID string, sinceTS int64) ([]ContainerStatsSample, error) {
|
||||
rows, err := s.db.Query(
|
||||
`SELECT cs.container_id, cs.owner_type, cs.owner_id, cs.ts,
|
||||
cs.cpu_percent, cs.memory_usage, cs.memory_limit,
|
||||
cs.network_rx, cs.network_tx, cs.block_read, cs.block_write
|
||||
FROM container_stats_samples cs
|
||||
JOIN containers c ON c.id = cs.owner_id
|
||||
WHERE c.workload_id = ? AND cs.ts >= ?
|
||||
ORDER BY cs.ts ASC`,
|
||||
workloadID, sinceTS,
|
||||
)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("list container stats samples by workload: %w", err)
|
||||
}
|
||||
defer rows.Close()
|
||||
|
||||
var out []ContainerStatsSample
|
||||
for rows.Next() {
|
||||
var s ContainerStatsSample
|
||||
if err := rows.Scan(
|
||||
&s.ContainerID, &s.OwnerType, &s.OwnerID, &s.TS,
|
||||
&s.CPUPercent, &s.MemoryUsage, &s.MemoryLimit,
|
||||
&s.NetworkRxBytes, &s.NetworkTxBytes,
|
||||
&s.BlockReadBytes, &s.BlockWriteBytes,
|
||||
); err != nil {
|
||||
return nil, fmt.Errorf("scan container stats sample: %w", err)
|
||||
}
|
||||
out = append(out, s)
|
||||
}
|
||||
return out, rows.Err()
|
||||
}
|
||||
|
||||
// ListAllRecentContainerStatsSamples returns samples across every owner since
|
||||
// the given unix timestamp, ordered by ts ascending. Used by the system
|
||||
// dashboard "top containers" widget where the UI wants a mixed pool.
|
||||
|
||||
@@ -459,6 +459,28 @@ func (s *Store) runMigrations() error {
|
||||
)`,
|
||||
`CREATE UNIQUE INDEX IF NOT EXISTS idx_shared_secrets_scope_name ON shared_secrets(scope, app_id, name)`,
|
||||
`CREATE INDEX IF NOT EXISTS idx_shared_secrets_app ON shared_secrets(app_id)`,
|
||||
// deploy_history: structured, version-pinned ledger of every deploy
|
||||
// dispatch (success AND failure) per workload. Distinct from the
|
||||
// free-text event_log — this carries the replayable `reference` the
|
||||
// rollback action redeploys from. `error` holds only a generic,
|
||||
// secret-free marker (the raw source error can echo registry-auth /
|
||||
// compose stdout, so it goes to slog only). FK cascade is backed by
|
||||
// PRAGMA foreign_keys=ON, but DeleteWorkload also deletes these rows
|
||||
// explicitly (matching the containers cleanup convention).
|
||||
`CREATE TABLE IF NOT EXISTS deploy_history (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
workload_id TEXT NOT NULL REFERENCES workloads(id) ON DELETE CASCADE,
|
||||
source_kind TEXT NOT NULL DEFAULT '',
|
||||
reference TEXT NOT NULL DEFAULT '',
|
||||
reason TEXT NOT NULL DEFAULT '',
|
||||
triggered_by TEXT NOT NULL DEFAULT '',
|
||||
note TEXT NOT NULL DEFAULT '',
|
||||
outcome TEXT NOT NULL DEFAULT '',
|
||||
error TEXT NOT NULL DEFAULT '',
|
||||
started_at TEXT NOT NULL DEFAULT '',
|
||||
finished_at TEXT NOT NULL DEFAULT ''
|
||||
)`,
|
||||
`CREATE INDEX IF NOT EXISTS idx_deploy_history_workload ON deploy_history(workload_id, id DESC)`,
|
||||
}
|
||||
for _, t := range observabilityTables {
|
||||
if _, err := s.db.Exec(t); err != nil {
|
||||
|
||||
@@ -190,6 +190,12 @@ func (s *Store) DeleteWorkload(id string) error {
|
||||
if _, err := tx.Exec(`DELETE FROM containers WHERE workload_id = ?`, id); err != nil {
|
||||
return fmt.Errorf("delete containers: %w", err)
|
||||
}
|
||||
// Deploy ledger rows are FK-cascaded, but we delete them explicitly in
|
||||
// the same transaction — consistent with the containers cleanup above
|
||||
// and robust even if the cascade is ever disabled.
|
||||
if _, err := tx.Exec(`DELETE FROM deploy_history WHERE workload_id = ?`, id); err != nil {
|
||||
return fmt.Errorf("delete deploy history: %w", err)
|
||||
}
|
||||
result, err := tx.Exec(`DELETE FROM workloads WHERE id = ?`, id)
|
||||
if err != nil {
|
||||
return fmt.Errorf("delete workload: %w", err)
|
||||
|
||||
Reference in New Issue
Block a user