feat(alerts): metric-threshold alerting (backend + API)

Operators can define metric-threshold alert rules (cpu_percent, memory_percent, memory_bytes; gt/lt) per-workload or global via /api/metric-alert-rules. A periodic evaluator (internal/metricalert, 30s tick) checks the freshest container stats sample per container against enabled rules and, on breach (per-rule-per-workload cooldown), emits into the existing event_log + bus pipeline (source "metric_alert", workload_id set). Alerts therefore surface on the global events page, the per-app activity timeline, and any configured event-trigger webhook -- no new notification plumbing. Mirrors the log_scan_rules store/API/route patterns and the stats.Collector lifecycle. Rule CRUD reads are authed, mutations AdminOnly. Frontend rule-config UI is a follow-up phase. Reviewed: go APPROVE (0 CRITICAL/HIGH).
2026-05-29 14:06:23 +03:00
parent 5c17885197
commit cdb9fd57d1
11 changed files with 1299 additions and 0 deletions
@@ -0,0 +1,191 @@
+package store
+
+import (
+	"database/sql"
+	"errors"
+	"fmt"
+	"strings"
+)
+
+// CreateMetricAlertRule inserts a new rule row after validating its
+// metric/comparator/severity enums and rejecting negative cooldowns.
+func (s *Store) CreateMetricAlertRule(r MetricAlertRule) (MetricAlertRule, error) {
+	if err := validateMetricAlertRule(r); err != nil {
+		return MetricAlertRule{}, err
+	}
+	now := Now()
+	r.CreatedAt = now
+	r.UpdatedAt = now
+	res, err := s.db.Exec(
+		`INSERT INTO metric_alert_rules
+		   (workload_id, name, metric, comparator, threshold, severity,
+		    cooldown_seconds, enabled, created_at, updated_at)
+		 VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`,
+		r.WorkloadID, r.Name, r.Metric, r.Comparator, r.Threshold, r.Severity,
+		r.CooldownSeconds, boolToInt(r.Enabled), r.CreatedAt, r.UpdatedAt,
+	)
+	if err != nil {
+		return MetricAlertRule{}, fmt.Errorf("insert metric alert rule: %w", err)
+	}
+	id, err := res.LastInsertId()
+	if err != nil {
+		return MetricAlertRule{}, fmt.Errorf("get metric alert rule id: %w", err)
+	}
+	r.ID = id
+	return r, nil
+}
+
+// ListMetricAlertRules returns every rule, ordered by id for stable UI
+// rendering.
+func (s *Store) ListMetricAlertRules() ([]MetricAlertRule, error) {
+	return s.queryMetricAlertRules(
+		`SELECT id, workload_id, name, metric, comparator, threshold, severity,
+		        cooldown_seconds, enabled, created_at, updated_at
+		 FROM metric_alert_rules ORDER BY id`,
+	)
+}
+
+// ListMetricAlertRulesByWorkload returns rules that apply to the given
+// workload: rows explicitly scoped to it plus global rows (workload_id
+// = ""). Useful for the workload detail page.
+func (s *Store) ListMetricAlertRulesByWorkload(workloadID string) ([]MetricAlertRule, error) {
+	return s.queryMetricAlertRules(
+		`SELECT id, workload_id, name, metric, comparator, threshold, severity,
+		        cooldown_seconds, enabled, created_at, updated_at
+		 FROM metric_alert_rules WHERE workload_id = ? OR workload_id = '' ORDER BY id`,
+		workloadID,
+	)
+}
+
+// GetMetricAlertRule fetches one rule by id or returns ErrNotFound.
+func (s *Store) GetMetricAlertRule(id int64) (MetricAlertRule, error) {
+	row := s.db.QueryRow(
+		`SELECT id, workload_id, name, metric, comparator, threshold, severity,
+		        cooldown_seconds, enabled, created_at, updated_at
+		 FROM metric_alert_rules WHERE id = ?`, id,
+	)
+	r, err := scanMetricAlertRuleRow(row)
+	if errors.Is(err, sql.ErrNoRows) {
+		return MetricAlertRule{}, fmt.Errorf("metric alert rule %d: %w", id, ErrNotFound)
+	}
+	if err != nil {
+		return MetricAlertRule{}, fmt.Errorf("query metric alert rule: %w", err)
+	}
+	return r, nil
+}
+
+// UpdateMetricAlertRule overwrites the editable columns of the rule
+// identified by r.ID and returns the row as re-read from the database.
+//
+// id and workload_id are immutable on update — change the scope of a
+// rule by deleting + recreating, mirroring the log-scan store. r must
+// carry a non-zero ID and pass validateMetricAlertRule. A wrapped
+// ErrNotFound is returned when no row has that id.
+func (s *Store) UpdateMetricAlertRule(r MetricAlertRule) (MetricAlertRule, error) {
+	if r.ID == 0 {
+		return MetricAlertRule{}, fmt.Errorf("metric alert rule: id is required for update")
+	}
+	if err := validateMetricAlertRule(r); err != nil {
+		return MetricAlertRule{}, err
+	}
+	r.UpdatedAt = Now()
+	res, err := s.db.Exec(
+		`UPDATE metric_alert_rules
+		    SET name = ?, metric = ?, comparator = ?, threshold = ?, severity = ?,
+		        cooldown_seconds = ?, enabled = ?, updated_at = ?
+		  WHERE id = ?`,
+		r.Name, r.Metric, r.Comparator, r.Threshold, r.Severity,
+		r.CooldownSeconds, boolToInt(r.Enabled), r.UpdatedAt, r.ID,
+	)
+	if err != nil {
+		return MetricAlertRule{}, fmt.Errorf("update metric alert rule: %w", err)
+	}
+	// Report a RowsAffected failure as itself; otherwise the zero count
+	// it returns would be misreported as ErrNotFound.
+	n, err := res.RowsAffected()
+	if err != nil {
+		return MetricAlertRule{}, fmt.Errorf("update metric alert rule: rows affected: %w", err)
+	}
+	if n == 0 {
+		return MetricAlertRule{}, fmt.Errorf("metric alert rule %d: %w", r.ID, ErrNotFound)
+	}
+	// Re-read so the caller sees the stored row, including the columns
+	// this statement does not touch (workload_id, created_at).
+	return s.GetMetricAlertRule(r.ID)
+}
+
+// DeleteMetricAlertRule removes the rule with the given id. It returns
+// a wrapped ErrNotFound when no row matched, and a wrapped driver error
+// when the delete itself or the affected-row count fails.
+func (s *Store) DeleteMetricAlertRule(id int64) error {
+	res, err := s.db.Exec(`DELETE FROM metric_alert_rules WHERE id = ?`, id)
+	if err != nil {
+		return fmt.Errorf("delete metric alert rule: %w", err)
+	}
+	// Report a RowsAffected failure as itself; otherwise the zero count
+	// it returns would be misreported as ErrNotFound.
+	n, err := res.RowsAffected()
+	if err != nil {
+		return fmt.Errorf("delete metric alert rule: rows affected: %w", err)
+	}
+	if n == 0 {
+		return fmt.Errorf("metric alert rule %d: %w", id, ErrNotFound)
+	}
+	return nil
+}
+
+// queryMetricAlertRules runs a SELECT that yields the full
+// metric_alert_rules column list (in the order scanMetricAlertRuleRows
+// expects) and collects the rows. On success the slice is non-nil even
+// when no rows matched; on any error the result is nil so callers never
+// see a partially filled list.
+func (s *Store) queryMetricAlertRules(query string, args ...any) ([]MetricAlertRule, error) {
+	rows, err := s.db.Query(query, args...)
+	if err != nil {
+		return nil, fmt.Errorf("query metric alert rules: %w", err)
+	}
+	defer rows.Close()
+	out := []MetricAlertRule{}
+	for rows.Next() {
+		r, err := scanMetricAlertRuleRows(rows)
+		if err != nil {
+			return nil, err
+		}
+		out = append(out, r)
+	}
+	// rows.Next also returns false when iteration aborts mid-stream, so
+	// check for that before handing back what was collected.
+	if err := rows.Err(); err != nil {
+		return nil, fmt.Errorf("iterate metric alert rules: %w", err)
+	}
+	return out, nil
+}
+
+// scanMetricAlertRuleRows decodes the current row of a multi-row result
+// into a MetricAlertRule. The enabled column is read as an integer and
+// mapped to a bool (non-zero means enabled). Scan failures are wrapped
+// with context.
+func scanMetricAlertRuleRows(rows *sql.Rows) (MetricAlertRule, error) {
+	var (
+		rule        MetricAlertRule
+		enabledFlag int
+	)
+	// Destination order must match the SELECT column order.
+	dest := []any{
+		&rule.ID, &rule.WorkloadID, &rule.Name, &rule.Metric, &rule.Comparator,
+		&rule.Threshold, &rule.Severity, &rule.CooldownSeconds, &enabledFlag,
+		&rule.CreatedAt, &rule.UpdatedAt,
+	}
+	if err := rows.Scan(dest...); err != nil {
+		return MetricAlertRule{}, fmt.Errorf("scan metric alert rule: %w", err)
+	}
+	rule.Enabled = enabledFlag != 0
+	return rule, nil
+}
+
+// scanMetricAlertRuleRow decodes a single-row query result into a
+// MetricAlertRule. The scan error is returned unwrapped so the caller
+// can inspect it directly (GetMetricAlertRule checks for
+// sql.ErrNoRows). The enabled column is read as an integer and mapped
+// to a bool (non-zero means enabled).
+func scanMetricAlertRuleRow(row *sql.Row) (MetricAlertRule, error) {
+	var (
+		rule        MetricAlertRule
+		enabledFlag int
+	)
+	// Destination order must match the SELECT column order.
+	dest := []any{
+		&rule.ID, &rule.WorkloadID, &rule.Name, &rule.Metric, &rule.Comparator,
+		&rule.Threshold, &rule.Severity, &rule.CooldownSeconds, &enabledFlag,
+		&rule.CreatedAt, &rule.UpdatedAt,
+	}
+	if err := row.Scan(dest...); err != nil {
+		return MetricAlertRule{}, err
+	}
+	rule.Enabled = enabledFlag != 0
+	return rule, nil
+}
+
+// validateMetricAlertRule checks the per-row invariants in a fixed
+// order and reports the first violation: the name must be non-blank
+// after trimming, the metric and comparator must be known identifiers,
+// the severity must be a known level or empty (the caller is expected
+// to fill in a default), and the cooldown must not be negative.
+func validateMetricAlertRule(r MetricAlertRule) error {
+	// oneOf reports whether v equals any of the allowed identifiers.
+	oneOf := func(v string, allowed ...string) bool {
+		for _, candidate := range allowed {
+			if v == candidate {
+				return true
+			}
+		}
+		return false
+	}
+
+	if strings.TrimSpace(r.Name) == "" {
+		return fmt.Errorf("metric alert rule: name is required")
+	}
+	if !oneOf(r.Metric, MetricCPUPercent, MetricMemoryPercent, MetricMemoryBytes) {
+		return fmt.Errorf("metric alert rule: invalid metric %q", r.Metric)
+	}
+	if !oneOf(r.Comparator, MetricComparatorGT, MetricComparatorLT) {
+		return fmt.Errorf("metric alert rule: invalid comparator %q", r.Comparator)
+	}
+	// Blank severity is accepted here; the default is applied at the caller.
+	if r.Severity != "" && !oneOf(r.Severity, LogScanSeverityInfo, LogScanSeverityWarn, LogScanSeverityError) {
+		return fmt.Errorf("metric alert rule: invalid severity %q", r.Severity)
+	}
+	if r.CooldownSeconds < 0 {
+		return fmt.Errorf("metric alert rule: cooldown_seconds must be >= 0")
+	}
+	return nil
+}
@@ -0,0 +1,167 @@
+package store
+
+import (
+	"errors"
+	"strings"
+	"testing"
+)
+
+func TestCreateMetricAlertRule_Validates(t *testing.T) {
+	s := newTestStore(t)
+	cases := []struct {
+		name    string
+		in      MetricAlertRule
+		wantErr string
+	}{
+		{
+			name:    "missing name",
+			in:      MetricAlertRule{Metric: MetricCPUPercent, Comparator: MetricComparatorGT},
+			wantErr: "name is required",
+		},
+		{
+			name:    "bad metric",
+			in:      MetricAlertRule{Name: "n", Metric: "load_avg", Comparator: MetricComparatorGT},
+			wantErr: "invalid metric",
+		},
+		{
+			name:    "bad comparator",
+			in:      MetricAlertRule{Name: "n", Metric: MetricCPUPercent, Comparator: "eq"},
+			wantErr: "invalid comparator",
+		},
+		{
+			name:    "bad severity",
+			in:      MetricAlertRule{Name: "n", Metric: MetricCPUPercent, Comparator: MetricComparatorGT, Severity: "loud"},
+			wantErr: "invalid severity",
+		},
+		{
+			name:    "negative cooldown",
+			in:      MetricAlertRule{Name: "n", Metric: MetricCPUPercent, Comparator: MetricComparatorGT, CooldownSeconds: -1},
+			wantErr: "cooldown_seconds must be",
+		},
+	}
+	for _, c := range cases {
+		t.Run(c.name, func(t *testing.T) {
+			_, err := s.CreateMetricAlertRule(c.in)
+			if err == nil {
+				t.Fatalf("expected error containing %q, got nil", c.wantErr)
+			}
+			if !strings.Contains(err.Error(), c.wantErr) {
+				t.Fatalf("error mismatch: got %q want substring %q", err.Error(), c.wantErr)
+			}
+		})
+	}
+}
+
+// TestCreateAndGetMetricAlertRule creates a rule, reads it back by id,
+// and checks that the metric, comparator, threshold, and enabled flag
+// survive the round-trip.
+func TestCreateAndGetMetricAlertRule(t *testing.T) {
+	s := newTestStore(t)
+
+	input := MetricAlertRule{
+		Name:            "cpu-hot",
+		Metric:          MetricCPUPercent,
+		Comparator:      MetricComparatorGT,
+		Threshold:       80,
+		Severity:        "warn",
+		CooldownSeconds: 300,
+		Enabled:         true,
+	}
+	created, err := s.CreateMetricAlertRule(input)
+	if err != nil {
+		t.Fatalf("create: %v", err)
+	}
+	if created.ID == 0 {
+		t.Fatal("id should be set")
+	}
+
+	fetched, err := s.GetMetricAlertRule(created.ID)
+	if err != nil {
+		t.Fatalf("get: %v", err)
+	}
+	if fetched.Metric != MetricCPUPercent || fetched.Comparator != MetricComparatorGT {
+		t.Errorf("metric/comparator mismatch: %q %q", fetched.Metric, fetched.Comparator)
+	}
+	if fetched.Threshold != 80 {
+		t.Errorf("threshold mismatch: %v", fetched.Threshold)
+	}
+	if !fetched.Enabled {
+		t.Error("enabled lost on round-trip")
+	}
+}
+
+// TestGetMetricAlertRule_NotFound checks that looking up an id that was
+// never inserted yields an error wrapping ErrNotFound, not merely any
+// error.
+func TestGetMetricAlertRule_NotFound(t *testing.T) {
+	s := newTestStore(t)
+	if _, err := s.GetMetricAlertRule(999); !errors.Is(err, ErrNotFound) {
+		t.Fatalf("expected ErrNotFound for missing rule, got %v", err)
+	}
+}
+
+// TestListMetricAlertRulesByWorkload seeds one global rule and one rule
+// each for workloads w1 and w2, then checks that listing for w1 returns
+// exactly its own rule plus the global one.
+func TestListMetricAlertRulesByWorkload(t *testing.T) {
+	s := newTestStore(t)
+	seeds := []MetricAlertRule{
+		{
+			Name: "global", Metric: MetricCPUPercent, Comparator: MetricComparatorGT,
+			Threshold: 90, Severity: "warn", Enabled: true,
+		},
+		{
+			Name: "w1-mem", WorkloadID: "w1", Metric: MetricMemoryPercent, Comparator: MetricComparatorGT,
+			Threshold: 85, Severity: "error", Enabled: true,
+		},
+		{
+			Name: "w2-mem", WorkloadID: "w2", Metric: MetricMemoryBytes, Comparator: MetricComparatorGT,
+			Threshold: 1000, Severity: "info", Enabled: true,
+		},
+	}
+	// Fail at the seeding step so a broken insert is not misread as a
+	// filtering bug further down.
+	for _, seed := range seeds {
+		if _, err := s.CreateMetricAlertRule(seed); err != nil {
+			t.Fatalf("create %q: %v", seed.Name, err)
+		}
+	}
+
+	w1, err := s.ListMetricAlertRulesByWorkload("w1")
+	if err != nil {
+		t.Fatalf("by workload: %v", err)
+	}
+	// w1 sees its own rule + the global, but NOT w2's rule.
+	if len(w1) != 2 {
+		t.Fatalf("w1 should see 2 rules (own + global), got %d", len(w1))
+	}
+	for _, r := range w1 {
+		if r.WorkloadID == "w2" {
+			t.Errorf("w1 should not see w2's rule")
+		}
+	}
+}
+
+// TestUpdateMetricAlertRule creates a rule, changes its threshold,
+// comparator, and enabled flag, and checks that UpdateMetricAlertRule
+// returns the row with those changes applied.
+func TestUpdateMetricAlertRule(t *testing.T) {
+	s := newTestStore(t)
+	r, err := s.CreateMetricAlertRule(MetricAlertRule{
+		Name: "n", Metric: MetricCPUPercent, Comparator: MetricComparatorGT,
+		Threshold: 80, Severity: "warn", Enabled: true,
+	})
+	// Without this check a failed create leaves r.ID == 0 and the update
+	// below fails with an unrelated "id is required" error.
+	if err != nil {
+		t.Fatalf("create: %v", err)
+	}
+	r.Threshold = 95
+	r.Comparator = MetricComparatorLT
+	r.Enabled = false
+	got, err := s.UpdateMetricAlertRule(r)
+	if err != nil {
+		t.Fatalf("update: %v", err)
+	}
+	if got.Threshold != 95 {
+		t.Errorf("threshold not updated: %v", got.Threshold)
+	}
+	if got.Comparator != MetricComparatorLT {
+		t.Errorf("comparator not updated: %q", got.Comparator)
+	}
+	if got.Enabled {
+		t.Error("enabled=false not applied")
+	}
+}
+
+// TestUpdateMetricAlertRule_NotFound checks that updating an id with no
+// matching row yields an error wrapping ErrNotFound. The rule is
+// otherwise valid so the failure cannot come from validation.
+func TestUpdateMetricAlertRule_NotFound(t *testing.T) {
+	s := newTestStore(t)
+	_, err := s.UpdateMetricAlertRule(MetricAlertRule{
+		ID: 999, Name: "n", Metric: MetricCPUPercent, Comparator: MetricComparatorGT,
+	})
+	if !errors.Is(err, ErrNotFound) {
+		t.Fatalf("expected ErrNotFound updating missing rule, got %v", err)
+	}
+}
+
+// TestDeleteMetricAlertRule creates a rule, deletes it, and checks that
+// both a subsequent lookup and a second delete report ErrNotFound.
+func TestDeleteMetricAlertRule(t *testing.T) {
+	s := newTestStore(t)
+	r, err := s.CreateMetricAlertRule(MetricAlertRule{
+		Name: "n", Metric: MetricCPUPercent, Comparator: MetricComparatorGT,
+		Threshold: 80, Severity: "warn", Enabled: true,
+	})
+	// A failed create would leave r.ID == 0 and turn the first delete
+	// into a confusing not-found failure.
+	if err != nil {
+		t.Fatalf("create: %v", err)
+	}
+	if err := s.DeleteMetricAlertRule(r.ID); err != nil {
+		t.Fatalf("delete: %v", err)
+	}
+	if _, err := s.GetMetricAlertRule(r.ID); !errors.Is(err, ErrNotFound) {
+		t.Errorf("rule should be gone after delete: want ErrNotFound, got %v", err)
+	}
+	if err := s.DeleteMetricAlertRule(r.ID); !errors.Is(err, ErrNotFound) {
+		t.Errorf("expected ErrNotFound deleting already-deleted rule, got %v", err)
+	}
+}
@@ -277,6 +277,39 @@ const (
 	LogScanSeverityError = "error"
 )

+// MetricAlertRule fires an event when a container metric breaches a
+// threshold. Mirrors LogScanRule but evaluated against stats_samples
+// instead of log lines.
+//
+// Rules are persisted in the metric_alert_rules table. Metric,
+// Comparator, and Severity hold string identifiers that the store
+// validates on create/update; an empty Severity passes validation so the
+// caller can apply a default. CreatedAt and UpdatedAt are stamped by the
+// store (both on create, UpdatedAt again on update). The JSON tags give
+// the field names used when a rule is serialized.
+type MetricAlertRule struct {
+	ID              int64   `json:"id"`
+	WorkloadID      string  `json:"workload_id"` // "" = applies to all workloads
+	Name            string  `json:"name"`
+	Metric          string  `json:"metric"`     // cpu_percent | memory_percent | memory_bytes
+	Comparator      string  `json:"comparator"` // gt | lt
+	Threshold       float64 `json:"threshold"`
+	Severity        string  `json:"severity"`         // info | warn | error
+	CooldownSeconds int     `json:"cooldown_seconds"` // min seconds between fires per (rule,workload)
+	Enabled         bool    `json:"enabled"`
+	CreatedAt       string  `json:"created_at"`
+	UpdatedAt       string  `json:"updated_at"`
+}
+
+// Metric-alert metric identifiers, stored in MetricAlertRule.Metric.
+// cpu_percent and memory_percent are percentages on a 0–100 scale;
+// memory_bytes is an absolute usage figure. The store validates the
+// identifier on create/update but does not range-check the rule's
+// threshold against these scales.
+const (
+	MetricCPUPercent    = "cpu_percent"
+	MetricMemoryPercent = "memory_percent"
+	MetricMemoryBytes   = "memory_bytes"
+)
+
+// Metric-alert comparators, stored in MetricAlertRule.Comparator and
+// validated by the store on create/update. gt fires when the value
+// exceeds the threshold; lt when it falls below.
+const (
+	MetricComparatorGT = "gt"
+	MetricComparatorLT = "lt"
+)
+
 // WorkloadKind enumerates the legacy discriminator values written into
 // containers.workload_kind and workloads.kind. After the hard cutover the
 // backing project / stack / static_site tables are gone — these constants
@@ -408,6 +408,24 @@ func (s *Store) runMigrations() error {
 		)`,
 		`CREATE INDEX IF NOT EXISTS idx_log_scan_rules_workload ON log_scan_rules(workload_id)`,
 		`CREATE INDEX IF NOT EXISTS idx_log_scan_rules_overrides ON log_scan_rules(overrides_id)`,
+		// metric_alert_rules: threshold rules the metric-alert manager
+		// evaluates against recent container stats samples. workload_id is
+		// NOT NULL; the empty string is the "global" sentinel, so a rule
+		// with workload_id = '' applies to every workload, while a
+		// non-empty value scopes it to one workload.
+		`CREATE TABLE IF NOT EXISTS metric_alert_rules (
+			id                 INTEGER PRIMARY KEY AUTOINCREMENT,
+			workload_id        TEXT NOT NULL DEFAULT '',
+			name               TEXT NOT NULL DEFAULT '',
+			metric             TEXT NOT NULL,
+			comparator         TEXT NOT NULL,
+			threshold          REAL NOT NULL DEFAULT 0,
+			severity           TEXT NOT NULL DEFAULT 'warn',
+			cooldown_seconds   INTEGER NOT NULL DEFAULT 300,
+			enabled            INTEGER NOT NULL DEFAULT 1,
+			created_at         TEXT NOT NULL DEFAULT (datetime('now')),
+			updated_at         TEXT NOT NULL DEFAULT (datetime('now'))
+		)`,
+		`CREATE INDEX IF NOT EXISTS idx_metric_alert_rules_workload ON metric_alert_rules(workload_id)`,
 	}
 	for _, t := range observabilityTables {
 		if _, err := s.db.Exec(t); err != nil {