feat(alerts): metric-threshold alerting (backend + API)
Operators can define metric-threshold alert rules (cpu_percent, memory_percent, memory_bytes; gt/lt) per-workload or global via /api/metric-alert-rules. A periodic evaluator (internal/metricalert, 30s tick) checks the freshest container stats sample per container against enabled rules and, on breach (per-rule-per-workload cooldown), emits into the existing event_log + bus pipeline (source "metric_alert", workload_id set). Alerts therefore surface on the global events page, the per-app activity timeline, and any configured event-trigger webhook -- no new notification plumbing. Mirrors the log_scan_rules store/API/route patterns and the stats.Collector lifecycle. Rule CRUD reads are authed, mutations AdminOnly. Frontend rule-config UI is a follow-up phase. Reviewed: go APPROVE (0 CRITICAL/HIGH).
This commit is contained in:
@@ -0,0 +1,167 @@
|
||||
package store
|
||||
|
||||
import (
|
||||
"strings"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestCreateMetricAlertRule_Validates(t *testing.T) {
|
||||
s := newTestStore(t)
|
||||
cases := []struct {
|
||||
name string
|
||||
in MetricAlertRule
|
||||
wantErr string
|
||||
}{
|
||||
{
|
||||
name: "missing name",
|
||||
in: MetricAlertRule{Metric: MetricCPUPercent, Comparator: MetricComparatorGT},
|
||||
wantErr: "name is required",
|
||||
},
|
||||
{
|
||||
name: "bad metric",
|
||||
in: MetricAlertRule{Name: "n", Metric: "load_avg", Comparator: MetricComparatorGT},
|
||||
wantErr: "invalid metric",
|
||||
},
|
||||
{
|
||||
name: "bad comparator",
|
||||
in: MetricAlertRule{Name: "n", Metric: MetricCPUPercent, Comparator: "eq"},
|
||||
wantErr: "invalid comparator",
|
||||
},
|
||||
{
|
||||
name: "bad severity",
|
||||
in: MetricAlertRule{Name: "n", Metric: MetricCPUPercent, Comparator: MetricComparatorGT, Severity: "loud"},
|
||||
wantErr: "invalid severity",
|
||||
},
|
||||
{
|
||||
name: "negative cooldown",
|
||||
in: MetricAlertRule{Name: "n", Metric: MetricCPUPercent, Comparator: MetricComparatorGT, CooldownSeconds: -1},
|
||||
wantErr: "cooldown_seconds must be",
|
||||
},
|
||||
}
|
||||
for _, c := range cases {
|
||||
t.Run(c.name, func(t *testing.T) {
|
||||
_, err := s.CreateMetricAlertRule(c.in)
|
||||
if err == nil {
|
||||
t.Fatalf("expected error containing %q, got nil", c.wantErr)
|
||||
}
|
||||
if !strings.Contains(err.Error(), c.wantErr) {
|
||||
t.Fatalf("error mismatch: got %q want substring %q", err.Error(), c.wantErr)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestCreateAndGetMetricAlertRule(t *testing.T) {
|
||||
s := newTestStore(t)
|
||||
r, err := s.CreateMetricAlertRule(MetricAlertRule{
|
||||
Name: "cpu-hot", Metric: MetricCPUPercent, Comparator: MetricComparatorGT,
|
||||
Threshold: 80, Severity: "warn", CooldownSeconds: 300, Enabled: true,
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("create: %v", err)
|
||||
}
|
||||
if r.ID == 0 {
|
||||
t.Fatal("id should be set")
|
||||
}
|
||||
got, err := s.GetMetricAlertRule(r.ID)
|
||||
if err != nil {
|
||||
t.Fatalf("get: %v", err)
|
||||
}
|
||||
if got.Metric != MetricCPUPercent || got.Comparator != MetricComparatorGT {
|
||||
t.Errorf("metric/comparator mismatch: %q %q", got.Metric, got.Comparator)
|
||||
}
|
||||
if got.Threshold != 80 {
|
||||
t.Errorf("threshold mismatch: %v", got.Threshold)
|
||||
}
|
||||
if !got.Enabled {
|
||||
t.Error("enabled lost on round-trip")
|
||||
}
|
||||
}
|
||||
|
||||
func TestGetMetricAlertRule_NotFound(t *testing.T) {
|
||||
s := newTestStore(t)
|
||||
if _, err := s.GetMetricAlertRule(999); err == nil {
|
||||
t.Fatal("expected ErrNotFound for missing rule")
|
||||
}
|
||||
}
|
||||
|
||||
func TestListMetricAlertRulesByWorkload(t *testing.T) {
|
||||
s := newTestStore(t)
|
||||
_, _ = s.CreateMetricAlertRule(MetricAlertRule{
|
||||
Name: "global", Metric: MetricCPUPercent, Comparator: MetricComparatorGT,
|
||||
Threshold: 90, Severity: "warn", Enabled: true,
|
||||
})
|
||||
_, _ = s.CreateMetricAlertRule(MetricAlertRule{
|
||||
Name: "w1-mem", WorkloadID: "w1", Metric: MetricMemoryPercent, Comparator: MetricComparatorGT,
|
||||
Threshold: 85, Severity: "error", Enabled: true,
|
||||
})
|
||||
_, _ = s.CreateMetricAlertRule(MetricAlertRule{
|
||||
Name: "w2-mem", WorkloadID: "w2", Metric: MetricMemoryBytes, Comparator: MetricComparatorGT,
|
||||
Threshold: 1000, Severity: "info", Enabled: true,
|
||||
})
|
||||
|
||||
w1, err := s.ListMetricAlertRulesByWorkload("w1")
|
||||
if err != nil {
|
||||
t.Fatalf("by workload: %v", err)
|
||||
}
|
||||
// w1 sees its own rule + the global, but NOT w2's rule.
|
||||
if len(w1) != 2 {
|
||||
t.Fatalf("w1 should see 2 rules (own + global), got %d", len(w1))
|
||||
}
|
||||
for _, r := range w1 {
|
||||
if r.WorkloadID == "w2" {
|
||||
t.Errorf("w1 should not see w2's rule")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestUpdateMetricAlertRule(t *testing.T) {
|
||||
s := newTestStore(t)
|
||||
r, _ := s.CreateMetricAlertRule(MetricAlertRule{
|
||||
Name: "n", Metric: MetricCPUPercent, Comparator: MetricComparatorGT,
|
||||
Threshold: 80, Severity: "warn", Enabled: true,
|
||||
})
|
||||
r.Threshold = 95
|
||||
r.Comparator = MetricComparatorLT
|
||||
r.Enabled = false
|
||||
got, err := s.UpdateMetricAlertRule(r)
|
||||
if err != nil {
|
||||
t.Fatalf("update: %v", err)
|
||||
}
|
||||
if got.Threshold != 95 {
|
||||
t.Errorf("threshold not updated: %v", got.Threshold)
|
||||
}
|
||||
if got.Comparator != MetricComparatorLT {
|
||||
t.Errorf("comparator not updated: %q", got.Comparator)
|
||||
}
|
||||
if got.Enabled {
|
||||
t.Error("enabled=false not applied")
|
||||
}
|
||||
}
|
||||
|
||||
func TestUpdateMetricAlertRule_NotFound(t *testing.T) {
|
||||
s := newTestStore(t)
|
||||
_, err := s.UpdateMetricAlertRule(MetricAlertRule{
|
||||
ID: 999, Name: "n", Metric: MetricCPUPercent, Comparator: MetricComparatorGT,
|
||||
})
|
||||
if err == nil {
|
||||
t.Fatal("expected ErrNotFound updating missing rule")
|
||||
}
|
||||
}
|
||||
|
||||
func TestDeleteMetricAlertRule(t *testing.T) {
|
||||
s := newTestStore(t)
|
||||
r, _ := s.CreateMetricAlertRule(MetricAlertRule{
|
||||
Name: "n", Metric: MetricCPUPercent, Comparator: MetricComparatorGT,
|
||||
Threshold: 80, Severity: "warn", Enabled: true,
|
||||
})
|
||||
if err := s.DeleteMetricAlertRule(r.ID); err != nil {
|
||||
t.Fatalf("delete: %v", err)
|
||||
}
|
||||
if _, err := s.GetMetricAlertRule(r.ID); err == nil {
|
||||
t.Error("rule should be gone after delete")
|
||||
}
|
||||
if err := s.DeleteMetricAlertRule(r.ID); err == nil {
|
||||
t.Error("expected ErrNotFound deleting already-deleted rule")
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user