feat(alerts): metric-threshold alerting (backend + API)
Operators can define metric-threshold alert rules (cpu_percent, memory_percent, memory_bytes; gt/lt) per-workload or global via /api/metric-alert-rules. A periodic evaluator (internal/metricalert, 30s tick) checks the freshest container stats sample per container against enabled rules and, on breach (per-rule-per-workload cooldown), emits into the existing event_log + bus pipeline (source "metric_alert", workload_id set). Alerts therefore surface on the global events page, the per-app activity timeline, and any configured event-trigger webhook -- no new notification plumbing. Mirrors the log_scan_rules store/API/route patterns and the stats.Collector lifecycle. Rule CRUD reads are authed, mutations AdminOnly. Frontend rule-config UI is a follow-up phase. Reviewed: go APPROVE (0 CRITICAL/HIGH).
This commit is contained in:
@@ -0,0 +1,191 @@
|
||||
package store
|
||||
|
||||
import (
|
||||
"database/sql"
|
||||
"errors"
|
||||
"fmt"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// CreateMetricAlertRule inserts a new rule row after validating its
|
||||
// metric/comparator/severity enums and rejecting negative cooldowns.
|
||||
func (s *Store) CreateMetricAlertRule(r MetricAlertRule) (MetricAlertRule, error) {
|
||||
if err := validateMetricAlertRule(r); err != nil {
|
||||
return MetricAlertRule{}, err
|
||||
}
|
||||
now := Now()
|
||||
r.CreatedAt = now
|
||||
r.UpdatedAt = now
|
||||
res, err := s.db.Exec(
|
||||
`INSERT INTO metric_alert_rules
|
||||
(workload_id, name, metric, comparator, threshold, severity,
|
||||
cooldown_seconds, enabled, created_at, updated_at)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`,
|
||||
r.WorkloadID, r.Name, r.Metric, r.Comparator, r.Threshold, r.Severity,
|
||||
r.CooldownSeconds, boolToInt(r.Enabled), r.CreatedAt, r.UpdatedAt,
|
||||
)
|
||||
if err != nil {
|
||||
return MetricAlertRule{}, fmt.Errorf("insert metric alert rule: %w", err)
|
||||
}
|
||||
id, err := res.LastInsertId()
|
||||
if err != nil {
|
||||
return MetricAlertRule{}, fmt.Errorf("get metric alert rule id: %w", err)
|
||||
}
|
||||
r.ID = id
|
||||
return r, nil
|
||||
}
|
||||
|
||||
// ListMetricAlertRules returns every rule, ordered by id for stable UI
|
||||
// rendering.
|
||||
func (s *Store) ListMetricAlertRules() ([]MetricAlertRule, error) {
|
||||
return s.queryMetricAlertRules(
|
||||
`SELECT id, workload_id, name, metric, comparator, threshold, severity,
|
||||
cooldown_seconds, enabled, created_at, updated_at
|
||||
FROM metric_alert_rules ORDER BY id`,
|
||||
)
|
||||
}
|
||||
|
||||
// ListMetricAlertRulesByWorkload returns rules that apply to the given
|
||||
// workload: rows explicitly scoped to it plus global rows (workload_id
|
||||
// = ""). Useful for the workload detail page.
|
||||
func (s *Store) ListMetricAlertRulesByWorkload(workloadID string) ([]MetricAlertRule, error) {
|
||||
return s.queryMetricAlertRules(
|
||||
`SELECT id, workload_id, name, metric, comparator, threshold, severity,
|
||||
cooldown_seconds, enabled, created_at, updated_at
|
||||
FROM metric_alert_rules WHERE workload_id = ? OR workload_id = '' ORDER BY id`,
|
||||
workloadID,
|
||||
)
|
||||
}
|
||||
|
||||
// GetMetricAlertRule fetches one rule by id or returns ErrNotFound.
|
||||
func (s *Store) GetMetricAlertRule(id int64) (MetricAlertRule, error) {
|
||||
row := s.db.QueryRow(
|
||||
`SELECT id, workload_id, name, metric, comparator, threshold, severity,
|
||||
cooldown_seconds, enabled, created_at, updated_at
|
||||
FROM metric_alert_rules WHERE id = ?`, id,
|
||||
)
|
||||
r, err := scanMetricAlertRuleRow(row)
|
||||
if errors.Is(err, sql.ErrNoRows) {
|
||||
return MetricAlertRule{}, fmt.Errorf("metric alert rule %d: %w", id, ErrNotFound)
|
||||
}
|
||||
if err != nil {
|
||||
return MetricAlertRule{}, fmt.Errorf("query metric alert rule: %w", err)
|
||||
}
|
||||
return r, nil
|
||||
}
|
||||
|
||||
// UpdateMetricAlertRule overwrites the editable columns of a rule row.
|
||||
// id and workload_id are immutable on update — change the scope of a
|
||||
// rule by deleting + recreating, mirroring the log-scan store.
|
||||
func (s *Store) UpdateMetricAlertRule(r MetricAlertRule) (MetricAlertRule, error) {
|
||||
if r.ID == 0 {
|
||||
return MetricAlertRule{}, fmt.Errorf("metric alert rule: id is required for update")
|
||||
}
|
||||
if err := validateMetricAlertRule(r); err != nil {
|
||||
return MetricAlertRule{}, err
|
||||
}
|
||||
r.UpdatedAt = Now()
|
||||
res, err := s.db.Exec(
|
||||
`UPDATE metric_alert_rules
|
||||
SET name = ?, metric = ?, comparator = ?, threshold = ?, severity = ?,
|
||||
cooldown_seconds = ?, enabled = ?, updated_at = ?
|
||||
WHERE id = ?`,
|
||||
r.Name, r.Metric, r.Comparator, r.Threshold, r.Severity,
|
||||
r.CooldownSeconds, boolToInt(r.Enabled), r.UpdatedAt, r.ID,
|
||||
)
|
||||
if err != nil {
|
||||
return MetricAlertRule{}, fmt.Errorf("update metric alert rule: %w", err)
|
||||
}
|
||||
n, _ := res.RowsAffected()
|
||||
if n == 0 {
|
||||
return MetricAlertRule{}, fmt.Errorf("metric alert rule %d: %w", r.ID, ErrNotFound)
|
||||
}
|
||||
return s.GetMetricAlertRule(r.ID)
|
||||
}
|
||||
|
||||
// DeleteMetricAlertRule removes a rule by id, returning ErrNotFound when
|
||||
// no row matched.
|
||||
func (s *Store) DeleteMetricAlertRule(id int64) error {
|
||||
res, err := s.db.Exec(`DELETE FROM metric_alert_rules WHERE id = ?`, id)
|
||||
if err != nil {
|
||||
return fmt.Errorf("delete metric alert rule: %w", err)
|
||||
}
|
||||
n, _ := res.RowsAffected()
|
||||
if n == 0 {
|
||||
return fmt.Errorf("metric alert rule %d: %w", id, ErrNotFound)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (s *Store) queryMetricAlertRules(query string, args ...any) ([]MetricAlertRule, error) {
|
||||
rows, err := s.db.Query(query, args...)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("query metric alert rules: %w", err)
|
||||
}
|
||||
defer rows.Close()
|
||||
out := []MetricAlertRule{}
|
||||
for rows.Next() {
|
||||
r, err := scanMetricAlertRuleRows(rows)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
out = append(out, r)
|
||||
}
|
||||
return out, rows.Err()
|
||||
}
|
||||
|
||||
func scanMetricAlertRuleRows(rows *sql.Rows) (MetricAlertRule, error) {
|
||||
var r MetricAlertRule
|
||||
var enabled int
|
||||
if err := rows.Scan(
|
||||
&r.ID, &r.WorkloadID, &r.Name, &r.Metric, &r.Comparator, &r.Threshold, &r.Severity,
|
||||
&r.CooldownSeconds, &enabled, &r.CreatedAt, &r.UpdatedAt,
|
||||
); err != nil {
|
||||
return MetricAlertRule{}, fmt.Errorf("scan metric alert rule: %w", err)
|
||||
}
|
||||
r.Enabled = enabled != 0
|
||||
return r, nil
|
||||
}
|
||||
|
||||
func scanMetricAlertRuleRow(row *sql.Row) (MetricAlertRule, error) {
|
||||
var r MetricAlertRule
|
||||
var enabled int
|
||||
if err := row.Scan(
|
||||
&r.ID, &r.WorkloadID, &r.Name, &r.Metric, &r.Comparator, &r.Threshold, &r.Severity,
|
||||
&r.CooldownSeconds, &enabled, &r.CreatedAt, &r.UpdatedAt,
|
||||
); err != nil {
|
||||
return MetricAlertRule{}, err
|
||||
}
|
||||
r.Enabled = enabled != 0
|
||||
return r, nil
|
||||
}
|
||||
|
||||
// validateMetricAlertRule enforces the per-row invariants: a non-empty
|
||||
// name, a known metric/comparator, a valid severity (blank allowed so
|
||||
// the caller can default it), and a non-negative cooldown.
|
||||
func validateMetricAlertRule(r MetricAlertRule) error {
|
||||
if strings.TrimSpace(r.Name) == "" {
|
||||
return fmt.Errorf("metric alert rule: name is required")
|
||||
}
|
||||
switch r.Metric {
|
||||
case MetricCPUPercent, MetricMemoryPercent, MetricMemoryBytes:
|
||||
default:
|
||||
return fmt.Errorf("metric alert rule: invalid metric %q", r.Metric)
|
||||
}
|
||||
switch r.Comparator {
|
||||
case MetricComparatorGT, MetricComparatorLT:
|
||||
default:
|
||||
return fmt.Errorf("metric alert rule: invalid comparator %q", r.Comparator)
|
||||
}
|
||||
switch r.Severity {
|
||||
case LogScanSeverityInfo, LogScanSeverityWarn, LogScanSeverityError:
|
||||
case "":
|
||||
// Default applied at the caller; allow blank.
|
||||
default:
|
||||
return fmt.Errorf("metric alert rule: invalid severity %q", r.Severity)
|
||||
}
|
||||
if r.CooldownSeconds < 0 {
|
||||
return fmt.Errorf("metric alert rule: cooldown_seconds must be >= 0")
|
||||
}
|
||||
return nil
|
||||
}
|
||||
@@ -0,0 +1,167 @@
|
||||
package store
|
||||
|
||||
import (
|
||||
"strings"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestCreateMetricAlertRule_Validates(t *testing.T) {
|
||||
s := newTestStore(t)
|
||||
cases := []struct {
|
||||
name string
|
||||
in MetricAlertRule
|
||||
wantErr string
|
||||
}{
|
||||
{
|
||||
name: "missing name",
|
||||
in: MetricAlertRule{Metric: MetricCPUPercent, Comparator: MetricComparatorGT},
|
||||
wantErr: "name is required",
|
||||
},
|
||||
{
|
||||
name: "bad metric",
|
||||
in: MetricAlertRule{Name: "n", Metric: "load_avg", Comparator: MetricComparatorGT},
|
||||
wantErr: "invalid metric",
|
||||
},
|
||||
{
|
||||
name: "bad comparator",
|
||||
in: MetricAlertRule{Name: "n", Metric: MetricCPUPercent, Comparator: "eq"},
|
||||
wantErr: "invalid comparator",
|
||||
},
|
||||
{
|
||||
name: "bad severity",
|
||||
in: MetricAlertRule{Name: "n", Metric: MetricCPUPercent, Comparator: MetricComparatorGT, Severity: "loud"},
|
||||
wantErr: "invalid severity",
|
||||
},
|
||||
{
|
||||
name: "negative cooldown",
|
||||
in: MetricAlertRule{Name: "n", Metric: MetricCPUPercent, Comparator: MetricComparatorGT, CooldownSeconds: -1},
|
||||
wantErr: "cooldown_seconds must be",
|
||||
},
|
||||
}
|
||||
for _, c := range cases {
|
||||
t.Run(c.name, func(t *testing.T) {
|
||||
_, err := s.CreateMetricAlertRule(c.in)
|
||||
if err == nil {
|
||||
t.Fatalf("expected error containing %q, got nil", c.wantErr)
|
||||
}
|
||||
if !strings.Contains(err.Error(), c.wantErr) {
|
||||
t.Fatalf("error mismatch: got %q want substring %q", err.Error(), c.wantErr)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestCreateAndGetMetricAlertRule(t *testing.T) {
|
||||
s := newTestStore(t)
|
||||
r, err := s.CreateMetricAlertRule(MetricAlertRule{
|
||||
Name: "cpu-hot", Metric: MetricCPUPercent, Comparator: MetricComparatorGT,
|
||||
Threshold: 80, Severity: "warn", CooldownSeconds: 300, Enabled: true,
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("create: %v", err)
|
||||
}
|
||||
if r.ID == 0 {
|
||||
t.Fatal("id should be set")
|
||||
}
|
||||
got, err := s.GetMetricAlertRule(r.ID)
|
||||
if err != nil {
|
||||
t.Fatalf("get: %v", err)
|
||||
}
|
||||
if got.Metric != MetricCPUPercent || got.Comparator != MetricComparatorGT {
|
||||
t.Errorf("metric/comparator mismatch: %q %q", got.Metric, got.Comparator)
|
||||
}
|
||||
if got.Threshold != 80 {
|
||||
t.Errorf("threshold mismatch: %v", got.Threshold)
|
||||
}
|
||||
if !got.Enabled {
|
||||
t.Error("enabled lost on round-trip")
|
||||
}
|
||||
}
|
||||
|
||||
func TestGetMetricAlertRule_NotFound(t *testing.T) {
|
||||
s := newTestStore(t)
|
||||
if _, err := s.GetMetricAlertRule(999); err == nil {
|
||||
t.Fatal("expected ErrNotFound for missing rule")
|
||||
}
|
||||
}
|
||||
|
||||
func TestListMetricAlertRulesByWorkload(t *testing.T) {
|
||||
s := newTestStore(t)
|
||||
_, _ = s.CreateMetricAlertRule(MetricAlertRule{
|
||||
Name: "global", Metric: MetricCPUPercent, Comparator: MetricComparatorGT,
|
||||
Threshold: 90, Severity: "warn", Enabled: true,
|
||||
})
|
||||
_, _ = s.CreateMetricAlertRule(MetricAlertRule{
|
||||
Name: "w1-mem", WorkloadID: "w1", Metric: MetricMemoryPercent, Comparator: MetricComparatorGT,
|
||||
Threshold: 85, Severity: "error", Enabled: true,
|
||||
})
|
||||
_, _ = s.CreateMetricAlertRule(MetricAlertRule{
|
||||
Name: "w2-mem", WorkloadID: "w2", Metric: MetricMemoryBytes, Comparator: MetricComparatorGT,
|
||||
Threshold: 1000, Severity: "info", Enabled: true,
|
||||
})
|
||||
|
||||
w1, err := s.ListMetricAlertRulesByWorkload("w1")
|
||||
if err != nil {
|
||||
t.Fatalf("by workload: %v", err)
|
||||
}
|
||||
// w1 sees its own rule + the global, but NOT w2's rule.
|
||||
if len(w1) != 2 {
|
||||
t.Fatalf("w1 should see 2 rules (own + global), got %d", len(w1))
|
||||
}
|
||||
for _, r := range w1 {
|
||||
if r.WorkloadID == "w2" {
|
||||
t.Errorf("w1 should not see w2's rule")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestUpdateMetricAlertRule(t *testing.T) {
|
||||
s := newTestStore(t)
|
||||
r, _ := s.CreateMetricAlertRule(MetricAlertRule{
|
||||
Name: "n", Metric: MetricCPUPercent, Comparator: MetricComparatorGT,
|
||||
Threshold: 80, Severity: "warn", Enabled: true,
|
||||
})
|
||||
r.Threshold = 95
|
||||
r.Comparator = MetricComparatorLT
|
||||
r.Enabled = false
|
||||
got, err := s.UpdateMetricAlertRule(r)
|
||||
if err != nil {
|
||||
t.Fatalf("update: %v", err)
|
||||
}
|
||||
if got.Threshold != 95 {
|
||||
t.Errorf("threshold not updated: %v", got.Threshold)
|
||||
}
|
||||
if got.Comparator != MetricComparatorLT {
|
||||
t.Errorf("comparator not updated: %q", got.Comparator)
|
||||
}
|
||||
if got.Enabled {
|
||||
t.Error("enabled=false not applied")
|
||||
}
|
||||
}
|
||||
|
||||
func TestUpdateMetricAlertRule_NotFound(t *testing.T) {
|
||||
s := newTestStore(t)
|
||||
_, err := s.UpdateMetricAlertRule(MetricAlertRule{
|
||||
ID: 999, Name: "n", Metric: MetricCPUPercent, Comparator: MetricComparatorGT,
|
||||
})
|
||||
if err == nil {
|
||||
t.Fatal("expected ErrNotFound updating missing rule")
|
||||
}
|
||||
}
|
||||
|
||||
func TestDeleteMetricAlertRule(t *testing.T) {
|
||||
s := newTestStore(t)
|
||||
r, _ := s.CreateMetricAlertRule(MetricAlertRule{
|
||||
Name: "n", Metric: MetricCPUPercent, Comparator: MetricComparatorGT,
|
||||
Threshold: 80, Severity: "warn", Enabled: true,
|
||||
})
|
||||
if err := s.DeleteMetricAlertRule(r.ID); err != nil {
|
||||
t.Fatalf("delete: %v", err)
|
||||
}
|
||||
if _, err := s.GetMetricAlertRule(r.ID); err == nil {
|
||||
t.Error("rule should be gone after delete")
|
||||
}
|
||||
if err := s.DeleteMetricAlertRule(r.ID); err == nil {
|
||||
t.Error("expected ErrNotFound deleting already-deleted rule")
|
||||
}
|
||||
}
|
||||
@@ -277,6 +277,39 @@ const (
|
||||
LogScanSeverityError = "error"
|
||||
)
|
||||
|
||||
// MetricAlertRule describes a threshold rule that raises an event when
// a container metric breaches Threshold. It is the counterpart of
// LogScanRule, evaluated against stats_samples rather than log lines.
type MetricAlertRule struct {
	// ID is the rule's row id.
	ID int64 `json:"id"`
	// WorkloadID scopes the rule to one workload; "" means the rule
	// applies to all workloads.
	WorkloadID string `json:"workload_id"`
	// Name is the rule's label.
	Name string `json:"name"`
	// Metric is one of cpu_percent, memory_percent or memory_bytes.
	Metric string `json:"metric"`
	// Comparator is gt or lt.
	Comparator string `json:"comparator"`
	// Threshold is the value the metric is compared against.
	Threshold float64 `json:"threshold"`
	// Severity is info, warn or error.
	Severity string `json:"severity"`
	// CooldownSeconds is the minimum number of seconds between fires
	// for a given (rule, workload) pair.
	CooldownSeconds int `json:"cooldown_seconds"`
	// Enabled reports whether the rule is active.
	Enabled bool `json:"enabled"`
	// CreatedAt and UpdatedAt are the row timestamps, kept as strings.
	CreatedAt string `json:"created_at"`
	UpdatedAt string `json:"updated_at"`
}
|
||||
|
||||
// Identifiers accepted in MetricAlertRule.Metric. The two *_percent
// metrics are percentages, while memory_bytes is an absolute usage
// figure. The store validates the value on create and update.
const (
	// MetricCPUPercent selects CPU usage as a percentage.
	MetricCPUPercent = "cpu_percent"
	// MetricMemoryPercent selects memory usage as a percentage.
	MetricMemoryPercent = "memory_percent"
	// MetricMemoryBytes selects absolute memory usage.
	MetricMemoryBytes = "memory_bytes"
)
|
||||
|
||||
// Comparators accepted in MetricAlertRule.Comparator.
const (
	// MetricComparatorGT fires when the value exceeds the threshold.
	MetricComparatorGT = "gt"
	// MetricComparatorLT fires when the value falls below the threshold.
	MetricComparatorLT = "lt"
)
|
||||
|
||||
// WorkloadKind enumerates the legacy discriminator values written into
|
||||
// containers.workload_kind and workloads.kind. After the hard cutover the
|
||||
// backing project / stack / static_site tables are gone — these constants
|
||||
|
||||
@@ -408,6 +408,24 @@ func (s *Store) runMigrations() error {
|
||||
)`,
|
||||
`CREATE INDEX IF NOT EXISTS idx_log_scan_rules_workload ON log_scan_rules(workload_id)`,
|
||||
`CREATE INDEX IF NOT EXISTS idx_log_scan_rules_overrides ON log_scan_rules(overrides_id)`,
|
||||
// metric_alert_rules: threshold rules the metric-alert manager
// evaluates against recent container stats samples. workload_id is
// NOT NULL; the empty string is the sentinel for a global rule that
// applies to every workload, and a non-empty value scopes the rule
// to one workload.
|
||||
`CREATE TABLE IF NOT EXISTS metric_alert_rules (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
workload_id TEXT NOT NULL DEFAULT '',
|
||||
name TEXT NOT NULL DEFAULT '',
|
||||
metric TEXT NOT NULL,
|
||||
comparator TEXT NOT NULL,
|
||||
threshold REAL NOT NULL DEFAULT 0,
|
||||
severity TEXT NOT NULL DEFAULT 'warn',
|
||||
cooldown_seconds INTEGER NOT NULL DEFAULT 300,
|
||||
enabled INTEGER NOT NULL DEFAULT 1,
|
||||
created_at TEXT NOT NULL DEFAULT (datetime('now')),
|
||||
updated_at TEXT NOT NULL DEFAULT (datetime('now'))
|
||||
)`,
|
||||
`CREATE INDEX IF NOT EXISTS idx_metric_alert_rules_workload ON metric_alert_rules(workload_id)`,
|
||||
}
|
||||
for _, t := range observabilityTables {
|
||||
if _, err := s.db.Exec(t); err != nil {
|
||||
|
||||
Reference in New Issue
Block a user