cdb9fd57d1
Operators can define metric-threshold alert rules (cpu_percent, memory_percent, memory_bytes; gt/lt) per-workload or global via /api/metric-alert-rules. A periodic evaluator (internal/metricalert, 30s tick) checks the freshest container stats sample per container against enabled rules and, on breach (per-rule-per-workload cooldown), emits into the existing event_log + bus pipeline (source "metric_alert", workload_id set). Alerts therefore surface on the global events page, the per-app activity timeline, and any configured event-trigger webhook -- no new notification plumbing. Mirrors the log_scan_rules store/API/route patterns and the stats.Collector lifecycle. Rule CRUD reads are authed, mutations AdminOnly. Frontend rule-config UI is a follow-up phase. Reviewed: go APPROVE (0 CRITICAL/HIGH).
285 lines
8.7 KiB
Go
285 lines
8.7 KiB
Go
package metricalert
|
|
|
|
import (
|
|
"testing"
|
|
"time"
|
|
|
|
"github.com/alexei/tinyforge/internal/events"
|
|
"github.com/alexei/tinyforge/internal/store"
|
|
)
|
|
|
|
// --- fakes -----------------------------------------------------------
|
|
|
|
type fakeRules struct {
|
|
rules []store.MetricAlertRule
|
|
err error
|
|
}
|
|
|
|
func (f *fakeRules) ListMetricAlertRules() ([]store.MetricAlertRule, error) {
|
|
return f.rules, f.err
|
|
}
|
|
|
|
type fakeSamples struct {
|
|
samples []store.ContainerStatsSample
|
|
err error
|
|
since int64 // captured arg of the last call
|
|
}
|
|
|
|
func (f *fakeSamples) ListAllRecentContainerStatsSamples(sinceTS int64) ([]store.ContainerStatsSample, error) {
|
|
f.since = sinceTS
|
|
return f.samples, f.err
|
|
}
|
|
|
|
type recordedEvent struct {
|
|
evt store.EventLog
|
|
}
|
|
|
|
type fakeSink struct {
|
|
events []recordedEvent
|
|
err error
|
|
nextID int64
|
|
}
|
|
|
|
func (f *fakeSink) InsertEvent(e store.EventLog) (store.EventLog, error) {
|
|
if f.err != nil {
|
|
return store.EventLog{}, f.err
|
|
}
|
|
f.nextID++
|
|
e.ID = f.nextID
|
|
e.CreatedAt = "2026-05-29T00:00:00Z"
|
|
f.events = append(f.events, recordedEvent{evt: e})
|
|
return e, nil
|
|
}
|
|
|
|
type fakePublisher struct {
|
|
published []events.Event
|
|
}
|
|
|
|
func (f *fakePublisher) Publish(e events.Event) {
|
|
f.published = append(f.published, e)
|
|
}
|
|
|
|
func newManager(rules []store.MetricAlertRule, samples []store.ContainerStatsSample) (*Manager, *fakeSink, *fakePublisher) {
|
|
sink := &fakeSink{}
|
|
pub := &fakePublisher{}
|
|
m := New(&fakeRules{rules: rules}, &fakeSamples{samples: samples}, sink, pub)
|
|
return m, sink, pub
|
|
}
|
|
|
|
// --- tests -----------------------------------------------------------
|
|
|
|
func TestEvaluate_BreachEmits(t *testing.T) {
|
|
rules := []store.MetricAlertRule{{
|
|
ID: 1, Name: "cpu-hot", Metric: store.MetricCPUPercent,
|
|
Comparator: store.MetricComparatorGT, Threshold: 80, Severity: "error",
|
|
CooldownSeconds: 300, Enabled: true,
|
|
}}
|
|
samples := []store.ContainerStatsSample{{
|
|
ContainerID: "c1", OwnerID: "w1", OwnerType: "instance", TS: 100, CPUPercent: 95,
|
|
}}
|
|
m, sink, pub := newManager(rules, samples)
|
|
|
|
m.evaluate(time.Unix(200, 0))
|
|
|
|
if len(sink.events) != 1 {
|
|
t.Fatalf("expected 1 event, got %d", len(sink.events))
|
|
}
|
|
got := sink.events[0].evt
|
|
if got.Source != "metric_alert" {
|
|
t.Errorf("source = %q, want metric_alert", got.Source)
|
|
}
|
|
if got.Severity != "error" {
|
|
t.Errorf("severity = %q, want error", got.Severity)
|
|
}
|
|
if got.WorkloadID != "w1" {
|
|
t.Errorf("workload_id = %q, want w1", got.WorkloadID)
|
|
}
|
|
if got.Metadata == "" || got.Metadata == "{}" {
|
|
t.Errorf("metadata should be populated JSON, got %q", got.Metadata)
|
|
}
|
|
if len(pub.published) != 1 {
|
|
t.Fatalf("expected 1 published event, got %d", len(pub.published))
|
|
}
|
|
payload, ok := pub.published[0].Payload.(events.EventLogPayload)
|
|
if !ok {
|
|
t.Fatalf("published payload is not EventLogPayload")
|
|
}
|
|
if payload.WorkloadID != "w1" || payload.Source != "metric_alert" {
|
|
t.Errorf("payload workload/source mismatch: %+v", payload)
|
|
}
|
|
}
|
|
|
|
func TestEvaluate_NoBreachNoEmit(t *testing.T) {
|
|
rules := []store.MetricAlertRule{{
|
|
ID: 1, Name: "cpu-hot", Metric: store.MetricCPUPercent,
|
|
Comparator: store.MetricComparatorGT, Threshold: 80, Enabled: true,
|
|
}}
|
|
samples := []store.ContainerStatsSample{{
|
|
ContainerID: "c1", OwnerID: "w1", TS: 100, CPUPercent: 10,
|
|
}}
|
|
m, sink, _ := newManager(rules, samples)
|
|
|
|
m.evaluate(time.Unix(200, 0))
|
|
|
|
if len(sink.events) != 0 {
|
|
t.Fatalf("expected no events for non-breach, got %d", len(sink.events))
|
|
}
|
|
}
|
|
|
|
func TestEvaluate_DisabledRuleSkipped(t *testing.T) {
|
|
rules := []store.MetricAlertRule{{
|
|
ID: 1, Name: "cpu-hot", Metric: store.MetricCPUPercent,
|
|
Comparator: store.MetricComparatorGT, Threshold: 80, Enabled: false,
|
|
}}
|
|
samples := []store.ContainerStatsSample{{ContainerID: "c1", OwnerID: "w1", TS: 100, CPUPercent: 95}}
|
|
m, sink, _ := newManager(rules, samples)
|
|
|
|
m.evaluate(time.Unix(200, 0))
|
|
|
|
if len(sink.events) != 0 {
|
|
t.Fatalf("disabled rule should not emit, got %d", len(sink.events))
|
|
}
|
|
}
|
|
|
|
func TestEvaluate_PerWorkloadScoping(t *testing.T) {
|
|
rules := []store.MetricAlertRule{{
|
|
ID: 1, Name: "w2-only", WorkloadID: "w2", Metric: store.MetricCPUPercent,
|
|
Comparator: store.MetricComparatorGT, Threshold: 80, Enabled: true,
|
|
}}
|
|
samples := []store.ContainerStatsSample{
|
|
{ContainerID: "c1", OwnerID: "w1", TS: 100, CPUPercent: 95}, // breach but wrong workload
|
|
{ContainerID: "c2", OwnerID: "w2", TS: 100, CPUPercent: 95}, // breach, correct workload
|
|
}
|
|
m, sink, _ := newManager(rules, samples)
|
|
|
|
m.evaluate(time.Unix(200, 0))
|
|
|
|
if len(sink.events) != 1 {
|
|
t.Fatalf("expected 1 event (only w2), got %d", len(sink.events))
|
|
}
|
|
if sink.events[0].evt.WorkloadID != "w2" {
|
|
t.Errorf("event should be scoped to w2, got %q", sink.events[0].evt.WorkloadID)
|
|
}
|
|
}
|
|
|
|
func TestEvaluate_GlobalRuleMatchesAll(t *testing.T) {
|
|
rules := []store.MetricAlertRule{{
|
|
ID: 1, Name: "global", WorkloadID: "", Metric: store.MetricCPUPercent,
|
|
Comparator: store.MetricComparatorGT, Threshold: 80, Enabled: true,
|
|
}}
|
|
samples := []store.ContainerStatsSample{
|
|
{ContainerID: "c1", OwnerID: "w1", TS: 100, CPUPercent: 95},
|
|
{ContainerID: "c2", OwnerID: "w2", TS: 100, CPUPercent: 95},
|
|
}
|
|
m, sink, _ := newManager(rules, samples)
|
|
|
|
m.evaluate(time.Unix(200, 0))
|
|
|
|
if len(sink.events) != 2 {
|
|
t.Fatalf("global rule should fire for both workloads, got %d", len(sink.events))
|
|
}
|
|
}
|
|
|
|
func TestEvaluate_MemoryPercentDivByZeroSkip(t *testing.T) {
|
|
rules := []store.MetricAlertRule{{
|
|
ID: 1, Name: "mem", Metric: store.MetricMemoryPercent,
|
|
Comparator: store.MetricComparatorGT, Threshold: 50, Enabled: true,
|
|
}}
|
|
samples := []store.ContainerStatsSample{{
|
|
ContainerID: "c1", OwnerID: "w1", TS: 100, MemoryUsage: 1000, MemoryLimit: 0,
|
|
}}
|
|
m, sink, _ := newManager(rules, samples)
|
|
|
|
m.evaluate(time.Unix(200, 0))
|
|
|
|
if len(sink.events) != 0 {
|
|
t.Fatalf("zero memory limit should be skipped for percent rule, got %d", len(sink.events))
|
|
}
|
|
}
|
|
|
|
func TestEvaluate_MemoryPercentBreaches(t *testing.T) {
|
|
rules := []store.MetricAlertRule{{
|
|
ID: 1, Name: "mem", Metric: store.MetricMemoryPercent,
|
|
Comparator: store.MetricComparatorGT, Threshold: 90, Enabled: true,
|
|
}}
|
|
samples := []store.ContainerStatsSample{{
|
|
ContainerID: "c1", OwnerID: "w1", TS: 100, MemoryUsage: 950, MemoryLimit: 1000, // 95%
|
|
}}
|
|
m, sink, _ := newManager(rules, samples)
|
|
|
|
m.evaluate(time.Unix(200, 0))
|
|
|
|
if len(sink.events) != 1 {
|
|
t.Fatalf("95%% should breach 90%% threshold, got %d events", len(sink.events))
|
|
}
|
|
}
|
|
|
|
func TestEvaluate_CooldownSuppressesSecondEmit(t *testing.T) {
|
|
rules := []store.MetricAlertRule{{
|
|
ID: 1, Name: "cpu-hot", Metric: store.MetricCPUPercent,
|
|
Comparator: store.MetricComparatorGT, Threshold: 80, CooldownSeconds: 300, Enabled: true,
|
|
}}
|
|
samples := []store.ContainerStatsSample{{ContainerID: "c1", OwnerID: "w1", TS: 100, CPUPercent: 95}}
|
|
m, sink, _ := newManager(rules, samples)
|
|
|
|
base := time.Unix(1000, 0)
|
|
m.evaluate(base)
|
|
// 10s later — still inside the 300s cooldown window.
|
|
m.evaluate(base.Add(10 * time.Second))
|
|
|
|
if len(sink.events) != 1 {
|
|
t.Fatalf("cooldown should suppress second emit, got %d events", len(sink.events))
|
|
}
|
|
|
|
// Past the window — should fire again.
|
|
m.evaluate(base.Add(301 * time.Second))
|
|
if len(sink.events) != 2 {
|
|
t.Fatalf("should re-fire after cooldown elapses, got %d events", len(sink.events))
|
|
}
|
|
}
|
|
|
|
func TestEvaluate_LatestSamplePerContainer(t *testing.T) {
|
|
// Two samples for the same container: an old non-breaching reading
|
|
// and a newer breaching one. Only the freshest should be judged.
|
|
rules := []store.MetricAlertRule{{
|
|
ID: 1, Name: "cpu-hot", Metric: store.MetricCPUPercent,
|
|
Comparator: store.MetricComparatorGT, Threshold: 80, Enabled: true,
|
|
}}
|
|
samples := []store.ContainerStatsSample{
|
|
{ContainerID: "c1", OwnerID: "w1", TS: 100, CPUPercent: 10},
|
|
{ContainerID: "c1", OwnerID: "w1", TS: 150, CPUPercent: 95},
|
|
}
|
|
m, sink, _ := newManager(rules, samples)
|
|
|
|
m.evaluate(time.Unix(200, 0))
|
|
|
|
if len(sink.events) != 1 {
|
|
t.Fatalf("expected exactly 1 event from freshest sample, got %d", len(sink.events))
|
|
}
|
|
}
|
|
|
|
func TestEvaluate_LessThanComparator(t *testing.T) {
|
|
rules := []store.MetricAlertRule{{
|
|
ID: 1, Name: "cpu-idle", Metric: store.MetricCPUPercent,
|
|
Comparator: store.MetricComparatorLT, Threshold: 5, Enabled: true,
|
|
}}
|
|
samples := []store.ContainerStatsSample{{ContainerID: "c1", OwnerID: "w1", TS: 100, CPUPercent: 1}}
|
|
m, sink, _ := newManager(rules, samples)
|
|
|
|
m.evaluate(time.Unix(200, 0))
|
|
|
|
if len(sink.events) != 1 {
|
|
t.Fatalf("1%% < 5%% threshold should breach lt rule, got %d events", len(sink.events))
|
|
}
|
|
}
|
|
|
|
func TestEvaluate_NoRulesNoFetch(t *testing.T) {
|
|
// With no rules there's nothing to do; we shouldn't even query samples.
|
|
samplesSrc := &fakeSamples{samples: nil}
|
|
m := New(&fakeRules{rules: nil}, samplesSrc, &fakeSink{}, &fakePublisher{})
|
|
m.evaluate(time.Unix(200, 0))
|
|
if samplesSrc.since != 0 {
|
|
t.Errorf("samples should not be queried when there are no rules")
|
|
}
|
|
}
|