feat(alerts): metric-threshold alerting (backend + API)

Operators can define metric-threshold alert rules (cpu_percent,
memory_percent, memory_bytes; gt/lt) per-workload or global via
/api/metric-alert-rules. A periodic evaluator (internal/metricalert,
30s tick) checks the freshest container stats sample per container
against enabled rules and, on breach (per-rule-per-workload cooldown),
emits into the existing event_log + bus pipeline (source "metric_alert",
workload_id set). Alerts therefore surface on the global events page,
the per-app activity timeline, and any configured event-trigger webhook
-- no new notification plumbing.

Mirrors the log_scan_rules store/API/route patterns and the
stats.Collector lifecycle. Rule CRUD reads are authed, mutations
AdminOnly. Frontend rule-config UI is a follow-up phase.

Reviewed: go APPROVE (0 CRITICAL/HIGH).
This commit is contained in:
2026-05-29 14:06:23 +03:00
parent 5c17885197
commit cdb9fd57d1
11 changed files with 1299 additions and 0 deletions
+191
View File
@@ -0,0 +1,191 @@
package store
import (
"database/sql"
"errors"
"fmt"
"strings"
)
// CreateMetricAlertRule validates r and stores it as a new
// metric_alert_rules row. CreatedAt and UpdatedAt are both stamped with
// the same current timestamp, and the returned copy carries the id the
// database assigned. A rule that fails validation is rejected before any
// SQL is executed.
func (s *Store) CreateMetricAlertRule(r MetricAlertRule) (MetricAlertRule, error) {
	var none MetricAlertRule
	if err := validateMetricAlertRule(r); err != nil {
		return none, err
	}

	// One timestamp for both columns so a fresh row has created_at == updated_at.
	r.CreatedAt = Now()
	r.UpdatedAt = r.CreatedAt

	const insertSQL = `INSERT INTO metric_alert_rules
(workload_id, name, metric, comparator, threshold, severity,
cooldown_seconds, enabled, created_at, updated_at)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`
	result, err := s.db.Exec(insertSQL,
		r.WorkloadID, r.Name, r.Metric, r.Comparator, r.Threshold, r.Severity,
		r.CooldownSeconds, boolToInt(r.Enabled), r.CreatedAt, r.UpdatedAt,
	)
	if err != nil {
		return none, fmt.Errorf("insert metric alert rule: %w", err)
	}

	newID, err := result.LastInsertId()
	if err != nil {
		return none, fmt.Errorf("get metric alert rule id: %w", err)
	}
	r.ID = newID
	return r, nil
}
// ListMetricAlertRules returns all stored rules, global and
// workload-scoped alike, sorted by id so callers get a stable order.
func (s *Store) ListMetricAlertRules() ([]MetricAlertRule, error) {
	const listAllSQL = `SELECT id, workload_id, name, metric, comparator, threshold, severity,
cooldown_seconds, enabled, created_at, updated_at
FROM metric_alert_rules ORDER BY id`
	return s.queryMetricAlertRules(listAllSQL)
}
// ListMetricAlertRulesByWorkload returns the rules in effect for one
// workload, sorted by id: the rows whose workload_id equals workloadID
// together with the global rows, which are stored with an empty
// workload_id.
func (s *Store) ListMetricAlertRulesByWorkload(workloadID string) ([]MetricAlertRule, error) {
	const byWorkloadSQL = `SELECT id, workload_id, name, metric, comparator, threshold, severity,
cooldown_seconds, enabled, created_at, updated_at
FROM metric_alert_rules WHERE workload_id = ? OR workload_id = '' ORDER BY id`
	return s.queryMetricAlertRules(byWorkloadSQL, workloadID)
}
// GetMetricAlertRule loads the rule with the given id. When no such row
// exists the returned error wraps ErrNotFound; any other failure is
// wrapped as a query error.
func (s *Store) GetMetricAlertRule(id int64) (MetricAlertRule, error) {
	const getSQL = `SELECT id, workload_id, name, metric, comparator, threshold, severity,
cooldown_seconds, enabled, created_at, updated_at
FROM metric_alert_rules WHERE id = ?`

	rule, err := scanMetricAlertRuleRow(s.db.QueryRow(getSQL, id))
	switch {
	case err == nil:
		return rule, nil
	case errors.Is(err, sql.ErrNoRows):
		// Translate the driver-level "no rows" into the store's own sentinel.
		return MetricAlertRule{}, fmt.Errorf("metric alert rule %d: %w", id, ErrNotFound)
	default:
		return MetricAlertRule{}, fmt.Errorf("query metric alert rule: %w", err)
	}
}
// UpdateMetricAlertRule overwrites the editable columns (name, metric,
// comparator, threshold, severity, cooldown_seconds, enabled) of the
// rule identified by r.ID and refreshes updated_at.
//
// id and workload_id are immutable on update — change the scope of a
// rule by deleting + recreating, mirroring the log-scan store.
//
// The rule is validated before any SQL runs. The returned value is the
// row as re-read from the database after the update. The error wraps
// ErrNotFound when no row has the given id.
func (s *Store) UpdateMetricAlertRule(r MetricAlertRule) (MetricAlertRule, error) {
	if r.ID == 0 {
		return MetricAlertRule{}, fmt.Errorf("metric alert rule: id is required for update")
	}
	if err := validateMetricAlertRule(r); err != nil {
		return MetricAlertRule{}, err
	}
	r.UpdatedAt = Now()
	res, err := s.db.Exec(
		`UPDATE metric_alert_rules
SET name = ?, metric = ?, comparator = ?, threshold = ?, severity = ?,
cooldown_seconds = ?, enabled = ?, updated_at = ?
WHERE id = ?`,
		r.Name, r.Metric, r.Comparator, r.Threshold, r.Severity,
		r.CooldownSeconds, boolToInt(r.Enabled), r.UpdatedAt, r.ID,
	)
	if err != nil {
		return MetricAlertRule{}, fmt.Errorf("update metric alert rule: %w", err)
	}
	// Surface a RowsAffected failure as itself: discarding it would leave
	// the count at zero and misreport the problem as ErrNotFound.
	affected, err := res.RowsAffected()
	if err != nil {
		return MetricAlertRule{}, fmt.Errorf("update metric alert rule: rows affected: %w", err)
	}
	if affected == 0 {
		return MetricAlertRule{}, fmt.Errorf("metric alert rule %d: %w", r.ID, ErrNotFound)
	}
	// Re-read so the caller sees the stored row, including the columns
	// this statement does not touch (workload_id, created_at).
	return s.GetMetricAlertRule(r.ID)
}
// DeleteMetricAlertRule removes the rule with the given id. The returned
// error wraps ErrNotFound when no row matched, so deleting the same rule
// twice reports not-found on the second call.
func (s *Store) DeleteMetricAlertRule(id int64) error {
	res, err := s.db.Exec(`DELETE FROM metric_alert_rules WHERE id = ?`, id)
	if err != nil {
		return fmt.Errorf("delete metric alert rule: %w", err)
	}
	// Surface a RowsAffected failure as itself: discarding it would leave
	// the count at zero and misreport the problem as ErrNotFound.
	affected, err := res.RowsAffected()
	if err != nil {
		return fmt.Errorf("delete metric alert rule: rows affected: %w", err)
	}
	if affected == 0 {
		return fmt.Errorf("metric alert rule %d: %w", id, ErrNotFound)
	}
	return nil
}
// queryMetricAlertRules runs a SELECT whose column list matches
// scanMetricAlertRuleRows and collects every resulting row.
//
// On success the slice is non-nil even when no rows matched. On any
// failure (query, scan, or row iteration) the slice is nil and the error
// is non-nil, so callers never receive a partially filled result.
func (s *Store) queryMetricAlertRules(query string, args ...any) ([]MetricAlertRule, error) {
	rows, err := s.db.Query(query, args...)
	if err != nil {
		return nil, fmt.Errorf("query metric alert rules: %w", err)
	}
	defer rows.Close()

	// Deliberately an empty literal rather than a nil slice: an empty
	// result stays distinguishable from the nil returned on error.
	out := []MetricAlertRule{}
	for rows.Next() {
		r, err := scanMetricAlertRuleRows(rows)
		if err != nil {
			return nil, err
		}
		out = append(out, r)
	}
	// rows.Next returning false can mean "done" or "failed"; rows.Err
	// tells the two apart. Drop the partial slice on failure.
	if err := rows.Err(); err != nil {
		return nil, fmt.Errorf("iterate metric alert rules: %w", err)
	}
	return out, nil
}
// scanMetricAlertRuleRows decodes the current row of a multi-row result
// into a MetricAlertRule. The enabled column is read as an integer and
// converted to a bool (non-zero means enabled). Scan failures are
// wrapped with context.
func scanMetricAlertRuleRows(rows *sql.Rows) (MetricAlertRule, error) {
	var (
		rule        MetricAlertRule
		enabledFlag int
	)
	err := rows.Scan(
		&rule.ID, &rule.WorkloadID, &rule.Name, &rule.Metric, &rule.Comparator, &rule.Threshold, &rule.Severity,
		&rule.CooldownSeconds, &enabledFlag, &rule.CreatedAt, &rule.UpdatedAt,
	)
	if err != nil {
		return MetricAlertRule{}, fmt.Errorf("scan metric alert rule: %w", err)
	}
	rule.Enabled = enabledFlag != 0
	return rule, nil
}
// scanMetricAlertRuleRow decodes a single-row query result into a
// MetricAlertRule. Unlike scanMetricAlertRuleRows it returns the Scan
// error unwrapped, leaving it to the caller to interpret (for example
// sql.ErrNoRows). The enabled column is read as an integer and converted
// to a bool (non-zero means enabled).
func scanMetricAlertRuleRow(row *sql.Row) (MetricAlertRule, error) {
	var (
		rule        MetricAlertRule
		enabledFlag int
	)
	err := row.Scan(
		&rule.ID, &rule.WorkloadID, &rule.Name, &rule.Metric, &rule.Comparator, &rule.Threshold, &rule.Severity,
		&rule.CooldownSeconds, &enabledFlag, &rule.CreatedAt, &rule.UpdatedAt,
	)
	if err != nil {
		return MetricAlertRule{}, err
	}
	rule.Enabled = enabledFlag != 0
	return rule, nil
}
// validateMetricAlertRule checks the invariants every stored rule must
// satisfy and returns an error describing the first one that is broken.
// Checks run in this order: the name must contain a non-whitespace
// character, the metric and comparator must be known identifiers, the
// severity must be info/warn/error or blank, and the cooldown must not
// be negative.
func validateMetricAlertRule(r MetricAlertRule) error {
	if len(strings.TrimSpace(r.Name)) == 0 {
		return fmt.Errorf("metric alert rule: name is required")
	}

	knownMetric := r.Metric == MetricCPUPercent ||
		r.Metric == MetricMemoryPercent ||
		r.Metric == MetricMemoryBytes
	if !knownMetric {
		return fmt.Errorf("metric alert rule: invalid metric %q", r.Metric)
	}

	if r.Comparator != MetricComparatorGT && r.Comparator != MetricComparatorLT {
		return fmt.Errorf("metric alert rule: invalid comparator %q", r.Comparator)
	}

	// Blank is accepted on purpose: the caller is expected to fill in the
	// default severity.
	severityOK := r.Severity == "" ||
		r.Severity == LogScanSeverityInfo ||
		r.Severity == LogScanSeverityWarn ||
		r.Severity == LogScanSeverityError
	if !severityOK {
		return fmt.Errorf("metric alert rule: invalid severity %q", r.Severity)
	}

	if r.CooldownSeconds < 0 {
		return fmt.Errorf("metric alert rule: cooldown_seconds must be >= 0")
	}
	return nil
}
+167
View File
@@ -0,0 +1,167 @@
package store
import (
	"errors"
	"strings"
	"testing"
)
// TestCreateMetricAlertRule_Validates feeds CreateMetricAlertRule one
// malformed rule per validation check and expects each to be rejected
// with an error message containing the matching fragment.
func TestCreateMetricAlertRule_Validates(t *testing.T) {
	st := newTestStore(t)

	type testCase struct {
		name    string
		rule    MetricAlertRule
		wantErr string
	}
	tests := []testCase{
		{
			name:    "missing name",
			rule:    MetricAlertRule{Metric: MetricCPUPercent, Comparator: MetricComparatorGT},
			wantErr: "name is required",
		},
		{
			name:    "bad metric",
			rule:    MetricAlertRule{Name: "n", Metric: "load_avg", Comparator: MetricComparatorGT},
			wantErr: "invalid metric",
		},
		{
			name:    "bad comparator",
			rule:    MetricAlertRule{Name: "n", Metric: MetricCPUPercent, Comparator: "eq"},
			wantErr: "invalid comparator",
		},
		{
			name:    "bad severity",
			rule:    MetricAlertRule{Name: "n", Metric: MetricCPUPercent, Comparator: MetricComparatorGT, Severity: "loud"},
			wantErr: "invalid severity",
		},
		{
			name:    "negative cooldown",
			rule:    MetricAlertRule{Name: "n", Metric: MetricCPUPercent, Comparator: MetricComparatorGT, CooldownSeconds: -1},
			wantErr: "cooldown_seconds must be",
		},
	}

	for _, tc := range tests {
		t.Run(tc.name, func(t *testing.T) {
			_, err := st.CreateMetricAlertRule(tc.rule)
			switch {
			case err == nil:
				t.Fatalf("expected error containing %q, got nil", tc.wantErr)
			case !strings.Contains(err.Error(), tc.wantErr):
				t.Fatalf("error mismatch: got %q want substring %q", err.Error(), tc.wantErr)
			}
		})
	}
}
// TestCreateAndGetMetricAlertRule creates a rule and reads it back by
// id, checking that the id is assigned and that metric, comparator,
// threshold and the enabled flag survive the round-trip.
func TestCreateAndGetMetricAlertRule(t *testing.T) {
	st := newTestStore(t)

	input := MetricAlertRule{
		Name: "cpu-hot", Metric: MetricCPUPercent, Comparator: MetricComparatorGT,
		Threshold: 80, Severity: "warn", CooldownSeconds: 300, Enabled: true,
	}
	created, err := st.CreateMetricAlertRule(input)
	if err != nil {
		t.Fatalf("create: %v", err)
	}
	if created.ID == 0 {
		t.Fatal("id should be set")
	}

	fetched, err := st.GetMetricAlertRule(created.ID)
	if err != nil {
		t.Fatalf("get: %v", err)
	}
	if !(fetched.Metric == MetricCPUPercent && fetched.Comparator == MetricComparatorGT) {
		t.Errorf("metric/comparator mismatch: %q %q", fetched.Metric, fetched.Comparator)
	}
	if fetched.Threshold != 80 {
		t.Errorf("threshold mismatch: %v", fetched.Threshold)
	}
	if !fetched.Enabled {
		t.Error("enabled lost on round-trip")
	}
}
// TestGetMetricAlertRule_NotFound verifies that looking up an id with no
// row yields an error that wraps ErrNotFound, not merely any error.
func TestGetMetricAlertRule_NotFound(t *testing.T) {
	s := newTestStore(t)
	_, err := s.GetMetricAlertRule(999)
	if err == nil {
		t.Fatal("expected ErrNotFound for missing rule")
	}
	// Callers branch on the sentinel, so pin it rather than just "some error".
	if !errors.Is(err, ErrNotFound) {
		t.Fatalf("expected ErrNotFound for missing rule, got %v", err)
	}
}
// TestListMetricAlertRulesByWorkload seeds one global rule and one rule
// each for workloads w1 and w2, then checks that listing for w1 returns
// exactly its own rule plus the global one.
func TestListMetricAlertRulesByWorkload(t *testing.T) {
	s := newTestStore(t)
	seed := []MetricAlertRule{
		{
			Name: "global", Metric: MetricCPUPercent, Comparator: MetricComparatorGT,
			Threshold: 90, Severity: "warn", Enabled: true,
		},
		{
			Name: "w1-mem", WorkloadID: "w1", Metric: MetricMemoryPercent, Comparator: MetricComparatorGT,
			Threshold: 85, Severity: "error", Enabled: true,
		},
		{
			Name: "w2-mem", WorkloadID: "w2", Metric: MetricMemoryBytes, Comparator: MetricComparatorGT,
			Threshold: 1000, Severity: "info", Enabled: true,
		},
	}
	// Fail fast on a broken fixture; otherwise a failed insert would show
	// up later as a confusing count mismatch.
	for _, rule := range seed {
		if _, err := s.CreateMetricAlertRule(rule); err != nil {
			t.Fatalf("create %q: %v", rule.Name, err)
		}
	}
	w1, err := s.ListMetricAlertRulesByWorkload("w1")
	if err != nil {
		t.Fatalf("by workload: %v", err)
	}
	// w1 sees its own rule + the global, but NOT w2's rule.
	if len(w1) != 2 {
		t.Fatalf("w1 should see 2 rules (own + global), got %d", len(w1))
	}
	for _, r := range w1 {
		if r.WorkloadID == "w2" {
			t.Errorf("w1 should not see w2's rule")
		}
	}
}
// TestUpdateMetricAlertRule creates a rule, changes its threshold,
// comparator and enabled flag, and checks that the value returned by
// UpdateMetricAlertRule reflects all three changes.
func TestUpdateMetricAlertRule(t *testing.T) {
	s := newTestStore(t)
	r, err := s.CreateMetricAlertRule(MetricAlertRule{
		Name: "n", Metric: MetricCPUPercent, Comparator: MetricComparatorGT,
		Threshold: 80, Severity: "warn", Enabled: true,
	})
	// Without this check a failed insert leaves r.ID == 0 and the test
	// fails later with an unrelated "id is required" error.
	if err != nil {
		t.Fatalf("create: %v", err)
	}
	r.Threshold = 95
	r.Comparator = MetricComparatorLT
	r.Enabled = false
	got, err := s.UpdateMetricAlertRule(r)
	if err != nil {
		t.Fatalf("update: %v", err)
	}
	if got.Threshold != 95 {
		t.Errorf("threshold not updated: %v", got.Threshold)
	}
	if got.Comparator != MetricComparatorLT {
		t.Errorf("comparator not updated: %q", got.Comparator)
	}
	if got.Enabled {
		t.Error("enabled=false not applied")
	}
}
// TestUpdateMetricAlertRule_NotFound verifies that updating an otherwise
// valid rule whose id has no row yields an error wrapping ErrNotFound.
func TestUpdateMetricAlertRule_NotFound(t *testing.T) {
	s := newTestStore(t)
	_, err := s.UpdateMetricAlertRule(MetricAlertRule{
		ID: 999, Name: "n", Metric: MetricCPUPercent, Comparator: MetricComparatorGT,
	})
	if err == nil {
		t.Fatal("expected ErrNotFound updating missing rule")
	}
	// A validation or SQL error must not satisfy this test; only the
	// not-found sentinel should.
	if !errors.Is(err, ErrNotFound) {
		t.Fatalf("expected ErrNotFound updating missing rule, got %v", err)
	}
}
// TestDeleteMetricAlertRule creates a rule, deletes it, and checks that
// both a subsequent lookup and a second delete report ErrNotFound.
func TestDeleteMetricAlertRule(t *testing.T) {
	s := newTestStore(t)
	r, err := s.CreateMetricAlertRule(MetricAlertRule{
		Name: "n", Metric: MetricCPUPercent, Comparator: MetricComparatorGT,
		Threshold: 80, Severity: "warn", Enabled: true,
	})
	// Without this check a failed insert would surface as a misleading
	// delete failure below.
	if err != nil {
		t.Fatalf("create: %v", err)
	}
	if err := s.DeleteMetricAlertRule(r.ID); err != nil {
		t.Fatalf("delete: %v", err)
	}
	if _, err := s.GetMetricAlertRule(r.ID); !errors.Is(err, ErrNotFound) {
		t.Errorf("rule should be gone after delete, got err = %v", err)
	}
	if err := s.DeleteMetricAlertRule(r.ID); !errors.Is(err, ErrNotFound) {
		t.Errorf("expected ErrNotFound deleting already-deleted rule, got %v", err)
	}
}
+33
View File
@@ -277,6 +277,39 @@ const (
LogScanSeverityError = "error"
)
// MetricAlertRule fires an event when a container metric breaches a
// threshold. Mirrors LogScanRule but evaluated against stats_samples
// instead of log lines.
//
// A rule is either global (WorkloadID == "") or scoped to one workload.
// Metric, Comparator, Severity and CooldownSeconds are validated by the
// store on create and update; the store accepts a blank Severity and
// leaves defaulting it to the caller.
type MetricAlertRule struct {
ID int64 `json:"id"`
WorkloadID string `json:"workload_id"` // "" = global rule, applies to all workloads
Name string `json:"name"` // required; must contain a non-whitespace character
Metric string `json:"metric"` // cpu_percent | memory_percent | memory_bytes
Comparator string `json:"comparator"` // gt | lt
Threshold float64 `json:"threshold"` // value the metric is compared against
Severity string `json:"severity"` // info | warn | error
CooldownSeconds int `json:"cooldown_seconds"` // min seconds between fires per (rule,workload); must be >= 0
Enabled bool `json:"enabled"` // persisted as an integer column (0/1)
CreatedAt string `json:"created_at"` // set by the store on create
UpdatedAt string `json:"updated_at"` // set by the store on create and on every update
}
// Metric-alert metric identifiers, the accepted values of
// MetricAlertRule.Metric. cpu_percent and memory_percent are 0-100
// ratios; memory_bytes is an absolute usage figure. Validated in the
// store on create/update.
const (
MetricCPUPercent = "cpu_percent"
MetricMemoryPercent = "memory_percent"
MetricMemoryBytes = "memory_bytes"
)
// Metric-alert comparators, the accepted values of
// MetricAlertRule.Comparator. gt fires when the value exceeds the
// threshold; lt when it falls below. Validated in the store on
// create/update.
const (
MetricComparatorGT = "gt"
MetricComparatorLT = "lt"
)
// WorkloadKind enumerates the legacy discriminator values written into
// containers.workload_kind and workloads.kind. After the hard cutover the
// backing project / stack / static_site tables are gone — these constants
+18
View File
@@ -408,6 +408,24 @@ func (s *Store) runMigrations() error {
)`,
`CREATE INDEX IF NOT EXISTS idx_log_scan_rules_workload ON log_scan_rules(workload_id)`,
`CREATE INDEX IF NOT EXISTS idx_log_scan_rules_overrides ON log_scan_rules(overrides_id)`,
// metric_alert_rules: threshold rules the metric-alert manager
// evaluates against recent container stats samples. workload_id is
// NOT NULL and uses the empty string as its "no workload" sentinel:
// '' makes a rule global (it applies to every workload), while a
// non-empty value scopes it to that one workload.
`CREATE TABLE IF NOT EXISTS metric_alert_rules (
id INTEGER PRIMARY KEY AUTOINCREMENT,
workload_id TEXT NOT NULL DEFAULT '',
name TEXT NOT NULL DEFAULT '',
metric TEXT NOT NULL,
comparator TEXT NOT NULL,
threshold REAL NOT NULL DEFAULT 0,
severity TEXT NOT NULL DEFAULT 'warn',
cooldown_seconds INTEGER NOT NULL DEFAULT 300,
enabled INTEGER NOT NULL DEFAULT 1,
created_at TEXT NOT NULL DEFAULT (datetime('now')),
updated_at TEXT NOT NULL DEFAULT (datetime('now'))
)`,
`CREATE INDEX IF NOT EXISTS idx_metric_alert_rules_workload ON metric_alert_rules(workload_id)`,
}
for _, t := range observabilityTables {
if _, err := s.db.Exec(t); err != nil {