feat(alerts): metric-threshold alerting (backend + API)

Operators can define metric-threshold alert rules (cpu_percent,
memory_percent, memory_bytes; gt/lt) per-workload or global via
/api/metric-alert-rules. A periodic evaluator (internal/metricalert,
30s tick) checks the freshest container stats sample per container
against enabled rules and, on breach (per-rule-per-workload cooldown),
emits into the existing event_log + bus pipeline (source "metric_alert",
workload_id set). Alerts therefore surface on the global events page,
the per-app activity timeline, and any configured event-trigger webhook
-- no new notification plumbing.

Mirrors the log_scan_rules store/API/route patterns and the
stats.Collector lifecycle. Rule CRUD reads are authed, mutations
AdminOnly. Frontend rule-config UI is a follow-up phase.

Reviewed: go APPROVE (0 CRITICAL/HIGH).
This commit is contained in:
2026-05-29 14:06:23 +03:00
parent 5c17885197
commit cdb9fd57d1
11 changed files with 1299 additions and 0 deletions
+10
View File
@@ -28,6 +28,7 @@ import (
"github.com/alexei/tinyforge/internal/health"
"github.com/alexei/tinyforge/internal/logging"
"github.com/alexei/tinyforge/internal/logscanner"
"github.com/alexei/tinyforge/internal/metricalert"
"github.com/alexei/tinyforge/internal/notify"
"github.com/alexei/tinyforge/internal/npm"
"github.com/alexei/tinyforge/internal/proxy"
@@ -390,6 +391,14 @@ func main() {
}
defer logScanMgr.Stop()
// Metric-alert manager: evaluates threshold rules against recent
// container stats samples and emits event_log entries on breach.
// The store satisfies RuleSource/SampleSource/EventSink; the event
// bus is the Publisher.
metricAlertMgr := metricalert.New(db, db, db, eventBus)
metricAlertMgr.Start()
defer metricAlertMgr.Stop()
// Build API server.
apiServer := api.NewServer(db, dockerClient, npmClient, proxyProvider, dep, notifier, webhookHandler, eventBus, encKey)
apiServer.SetStaleScanner(staleScanner)
@@ -451,6 +460,7 @@ func main() {
eventBus.Unsubscribe(notifySub)
staleScanner.Stop()
statsCollector.Stop()
metricAlertMgr.Stop()
// Drain in-progress deploys and notifications.
dep.Drain()
+235
View File
@@ -0,0 +1,235 @@
// Package api: metric-alert rule HTTP handlers. The evaluator lives in
// internal/metricalert; this file is the REST surface that lets
// operators create, edit, and delete threshold rules. Mirrors the
// log-scan rule handlers.
package api
import (
"errors"
"net/http"
"strconv"
"strings"
"github.com/go-chi/chi/v5"
"github.com/alexei/tinyforge/internal/store"
)
// metricAlertRuleInput is the JSON body accepted by both POST (create)
// and PATCH (update). Every field is a pointer so the handlers can tell
// "absent from the request" apart from an explicit empty/zero value.
// WorkloadID is immutable on update (per store.UpdateMetricAlertRule) so
// it only takes effect on create.
type metricAlertRuleInput struct {
// WorkloadID scopes the rule to one workload; the evaluator treats an
// empty workload ID as a global rule that matches every workload.
WorkloadID *string `json:"workload_id"`
// Name is the operator-facing label; it must be non-blank.
Name *string `json:"name"`
// Metric is one of cpu_percent, memory_percent, or memory_bytes.
Metric *string `json:"metric"`
// Comparator is gt or lt.
Comparator *string `json:"comparator"`
// Threshold is the value the metric is compared against.
Threshold *float64 `json:"threshold"`
// Severity is info, warn, or error.
Severity *string `json:"severity"`
// CooldownSeconds must be >= 0; the evaluator applies no cooldown when it is 0.
CooldownSeconds *int `json:"cooldown_seconds"`
// Enabled toggles evaluation of the rule; create treats an absent value as true.
Enabled *bool `json:"enabled"`
}
// listMetricAlertRules serves GET /api/metric-alert-rules. Without a
// query string it returns every rule. With `workload_id=...` it returns
// only the rules that apply to that workload (its own rows plus the
// global ones). Any store failure is reported as a 500.
func (s *Server) listMetricAlertRules(w http.ResponseWriter, r *http.Request) {
	var (
		rules []store.MetricAlertRule
		err   error
	)
	if workloadID := r.URL.Query().Get("workload_id"); workloadID == "" {
		rules, err = s.store.ListMetricAlertRules()
	} else {
		rules, err = s.store.ListMetricAlertRulesByWorkload(workloadID)
	}
	if err != nil {
		respondError(w, http.StatusInternalServerError, "list metric alert rules")
		return
	}
	respondJSON(w, http.StatusOK, rules)
}
// getMetricAlertRule serves GET /api/metric-alert-rules/{id}. A
// malformed id is answered by parseMetricAlertRuleID; store errors are
// translated by mapStoreError.
func (s *Server) getMetricAlertRule(w http.ResponseWriter, r *http.Request) {
	ruleID, valid := parseMetricAlertRuleID(w, r)
	if !valid {
		return
	}

	found, err := s.store.GetMetricAlertRule(ruleID)
	if err == nil {
		respondJSON(w, http.StatusOK, found)
		return
	}
	mapStoreError(w, err, "metric alert rule")
}
// createMetricAlertRule handles POST /api/metric-alert-rules.
//
// Absent optional fields are defaulted: severity to
// store.LogScanSeverityWarn, cooldown_seconds to 300, and enabled to
// true. threshold is required: there is no meaningful default, and
// silently storing 0 would create a "gt" rule that fires on any
// non-zero reading, so a request without it is rejected with 400.
//
// Responses: 201 with the stored rule, 400 on validation failure, 500
// on any other store error.
func (s *Server) createMetricAlertRule(w http.ResponseWriter, r *http.Request) {
	var in metricAlertRuleInput
	if !decodeJSON(w, r, &in) {
		return
	}

	rule := store.MetricAlertRule{
		WorkloadID:      derefString(in.WorkloadID),
		Name:            derefString(in.Name),
		Metric:          derefString(in.Metric),
		Comparator:      derefString(in.Comparator),
		Threshold:       derefFloat64(in.Threshold),
		Severity:        firstNonEmpty(derefString(in.Severity), store.LogScanSeverityWarn),
		CooldownSeconds: derefIntDefault(in.CooldownSeconds, 300),
		Enabled:         in.Enabled == nil || *in.Enabled,
	}
	if msg := validateMetricAlertInput(rule); msg != "" {
		respondError(w, http.StatusBadRequest, msg)
		return
	}
	// Checked after the shared validation so the error precedence for the
	// other fields is unchanged.
	if in.Threshold == nil {
		respondError(w, http.StatusBadRequest, "threshold is required")
		return
	}

	out, err := s.store.CreateMetricAlertRule(rule)
	if err != nil {
		// The store re-validates as a backstop; surface those as 400s.
		if isMetricAlertValidationErr(err) {
			respondError(w, http.StatusBadRequest, err.Error())
			return
		}
		respondError(w, http.StatusInternalServerError, "create metric alert rule")
		return
	}
	respondJSON(w, http.StatusCreated, out)
}
// updateMetricAlertRule serves PATCH /api/metric-alert-rules/{id}. The
// stored rule is loaded, the fields present in the request body are
// overlaid on it, and the merged rule is validated and written back.
// workload_id is never touched. Empty strings for metric, comparator,
// and severity are treated as "not supplied"; name, threshold,
// cooldown_seconds, and enabled are applied whenever they are present.
func (s *Server) updateMetricAlertRule(w http.ResponseWriter, r *http.Request) {
	ruleID, valid := parseMetricAlertRuleID(w, r)
	if !valid {
		return
	}

	rule, err := s.store.GetMetricAlertRule(ruleID)
	if err != nil {
		mapStoreError(w, err, "metric alert rule")
		return
	}

	var patch metricAlertRuleInput
	if !decodeJSON(w, r, &patch) {
		return
	}

	// Overlay only what the caller actually sent.
	if v := patch.Name; v != nil {
		rule.Name = *v
	}
	if v := patch.Metric; v != nil && *v != "" {
		rule.Metric = *v
	}
	if v := patch.Comparator; v != nil && *v != "" {
		rule.Comparator = *v
	}
	if v := patch.Threshold; v != nil {
		rule.Threshold = *v
	}
	if v := patch.Severity; v != nil && *v != "" {
		rule.Severity = *v
	}
	if v := patch.CooldownSeconds; v != nil {
		rule.CooldownSeconds = *v
	}
	if v := patch.Enabled; v != nil {
		rule.Enabled = *v
	}

	if problem := validateMetricAlertInput(rule); problem != "" {
		respondError(w, http.StatusBadRequest, problem)
		return
	}

	updated, err := s.store.UpdateMetricAlertRule(rule)
	switch {
	case err == nil:
		respondJSON(w, http.StatusOK, updated)
	case errors.Is(err, store.ErrNotFound):
		// The row vanished between the read above and the write.
		respondNotFound(w, "metric alert rule")
	case isMetricAlertValidationErr(err):
		respondError(w, http.StatusBadRequest, err.Error())
	default:
		respondError(w, http.StatusInternalServerError, "update metric alert rule")
	}
}
// deleteMetricAlertRule serves DELETE /api/metric-alert-rules/{id} and
// answers 204 with no body on success. Store errors are translated by
// mapStoreError.
func (s *Server) deleteMetricAlertRule(w http.ResponseWriter, r *http.Request) {
	ruleID, valid := parseMetricAlertRuleID(w, r)
	if !valid {
		return
	}

	err := s.store.DeleteMetricAlertRule(ruleID)
	if err == nil {
		w.WriteHeader(http.StatusNoContent)
		return
	}
	mapStoreError(w, err, "metric alert rule")
}
// validateMetricAlertInput checks a rule at the HTTP boundary and
// returns a client-facing message for the first problem found, or ""
// when the rule is acceptable. Checks run in a fixed order: name,
// metric, comparator, severity (blank is allowed), cooldown. The store
// repeats the same checks as a backstop.
func validateMetricAlertInput(rule store.MetricAlertRule) string {
	knownMetric := rule.Metric == store.MetricCPUPercent ||
		rule.Metric == store.MetricMemoryPercent ||
		rule.Metric == store.MetricMemoryBytes
	knownComparator := rule.Comparator == store.MetricComparatorGT ||
		rule.Comparator == store.MetricComparatorLT
	knownSeverity := rule.Severity == "" ||
		rule.Severity == store.LogScanSeverityInfo ||
		rule.Severity == store.LogScanSeverityWarn ||
		rule.Severity == store.LogScanSeverityError

	switch {
	case strings.TrimSpace(rule.Name) == "":
		return "name is required"
	case !knownMetric:
		return "invalid metric: must be cpu_percent, memory_percent, or memory_bytes"
	case !knownComparator:
		return "invalid comparator: must be gt or lt"
	case !knownSeverity:
		return "invalid severity: must be info, warn, or error"
	case rule.CooldownSeconds < 0:
		return "cooldown_seconds must be >= 0"
	}
	return ""
}
// isMetricAlertValidationErr reports whether err looks like one of the
// store's rule-validation failures, judged by the phrases those errors
// contain. Handlers use it to answer 400 instead of 500 without leaking
// driver text. A nil error is never a validation error.
func isMetricAlertValidationErr(err error) bool {
	if err == nil {
		return false
	}

	phrases := [...]string{
		"name is required",
		"invalid metric",
		"invalid comparator",
		"invalid severity",
		"cooldown_seconds must be",
	}
	text := err.Error()
	for i := range phrases {
		if strings.Contains(text, phrases[i]) {
			return true
		}
	}
	return false
}
// parseMetricAlertRuleID reads the {id} URL parameter as a positive
// base-10 int64. On failure it writes a 400 response itself and returns
// (0, false), so callers only need to return.
func parseMetricAlertRuleID(w http.ResponseWriter, r *http.Request) (int64, bool) {
	id, err := strconv.ParseInt(chi.URLParam(r, "id"), 10, 64)
	if err == nil && id > 0 {
		return id, true
	}
	respondError(w, http.StatusBadRequest, "invalid rule id")
	return 0, false
}
// derefFloat64 returns the value p points at, or 0 when p is nil.
func derefFloat64(p *float64) float64 {
	if p != nil {
		return *p
	}
	return 0
}
+10
View File
@@ -431,6 +431,16 @@ func (s *Server) Router() chi.Router {
r.Post("/log-scan-rules/{id}/test", s.testLogScanRule)
})
// Metric-alert rules.
r.Get("/metric-alert-rules", s.listMetricAlertRules)
r.Get("/metric-alert-rules/{id}", s.getMetricAlertRule)
r.Group(func(r chi.Router) {
r.Use(auth.AdminOnly)
r.Post("/metric-alert-rules", s.createMetricAlertRule)
r.Patch("/metric-alert-rules/{id}", s.updateMetricAlertRule)
r.Delete("/metric-alert-rules/{id}", s.deleteMetricAlertRule)
})
// System resources (read-only).
r.Get("/system/stats", s.getSystemStats)
r.Get("/system/stats/history", s.getSystemStatsHistory)
+349
View File
@@ -0,0 +1,349 @@
// Package metricalert implements a background goroutine that
// periodically evaluates operator-configured metric-threshold rules
// against recent container stats samples. On breach (subject to a
// per-rule-per-workload cooldown) it emits an event into the existing
// event_log + event-bus pipeline — the same fan-out used by the
// log-scanner — instead of building any new notification plumbing.
package metricalert
import (
"encoding/json"
"fmt"
"log/slog"
"sync"
"time"
"github.com/alexei/tinyforge/internal/events"
"github.com/alexei/tinyforge/internal/store"
)
// EvalInterval is the period of the evaluator's ticker: after the first
// pass, one evaluation runs per EvalInterval until the manager stops.
const EvalInterval = 30 * time.Second
// lookbackSeconds bounds how far back we pull samples each tick: each
// pass asks the sample source for rows newer than now minus this many
// seconds. Stats are collected at most every few seconds (see
// internal/stats), so a 120s window comfortably captures the latest
// reading per container even if collection briefly stalls.
const lookbackSeconds = 120
// RuleSource is the read-side seam for fetching the current rule rows.
// It is queried once per evaluation pass. Real callers pass
// *store.Store; tests pass a fake.
type RuleSource interface {
ListMetricAlertRules() ([]store.MetricAlertRule, error)
}
// SampleSource fetches the recent container stats samples to evaluate.
// sinceTS is a Unix timestamp in seconds marking the start of the
// lookback window.
type SampleSource interface {
ListAllRecentContainerStatsSamples(sinceTS int64) ([]store.ContainerStatsSample, error)
}
// EventSink writes a breach into event_log. The returned row supplies
// the ID and CreatedAt that are echoed in the published bus payload.
type EventSink interface {
InsertEvent(store.EventLog) (store.EventLog, error)
}
// Publisher fans the breach out on the event bus. Matches *events.Bus.
// A nil Publisher is tolerated: the event is still persisted, just not
// published.
type Publisher interface {
Publish(events.Event)
}
// eventSource is the Source value stamped on every metric-alert event,
// both in the event_log row and in the bus payload.
const eventSource = "metric_alert"
// Manager owns the evaluation loop lifecycle. It mirrors
// stats.Collector: a once-guarded Start/Stop pair with stop/done
// channels and a single-goroutine run loop.
type Manager struct {
// rules, samples, sink, and pub are the injected dependencies: where
// rules and samples are read from, and where breaches are persisted
// and published.
rules RuleSource
samples SampleSource
sink EventSink
pub Publisher
// now is swappable in tests so cooldown windows can be exercised
// deterministically. Defaults to time.Now.
now func() time.Time
// mu guards lastFired. The run loop is single-goroutine today, but
// Start/Stop and a future ReloadRules may touch shared state; the
// mutex is cheap insurance.
mu sync.Mutex
lastFired map[string]time.Time // "ruleID:ownerID" -> last emit time
// startOnce and stopOnce make Start and Stop safe to call repeatedly.
startOnce sync.Once
stopOnce sync.Once
// started records that Start launched the run loop.
started bool
// stop is closed to ask the run loop to exit; done is closed once the
// loop has exited (or by Stop when the loop never ran).
stop chan struct{}
done chan struct{}
}
// New builds a Manager around the given rule source, sample source,
// event sink, and publisher. The clock defaults to time.Now and the
// cooldown table starts empty. Nothing runs until Start is called.
func New(rules RuleSource, samples SampleSource, sink EventSink, pub Publisher) *Manager {
	m := new(Manager)
	m.rules, m.samples = rules, samples
	m.sink, m.pub = sink, pub
	m.now = time.Now
	m.lastFired = make(map[string]time.Time)
	m.stop = make(chan struct{})
	m.done = make(chan struct{})
	return m
}
// Start spawns the evaluation goroutine and returns without waiting for
// it. The goroutine runs until Stop is called. Only the first call does
// anything; later calls are no-ops.
func (m *Manager) Start() {
	launch := func() {
		m.started = true
		go m.run()
	}
	m.startOnce.Do(launch)
}
// Stop signals the loop to exit and blocks until it has finished the
// in-flight tick. It is safe to call multiple times and from multiple
// goroutines; every call returns only after the loop is gone.
//
// If Start was never called, Stop returns immediately. It does so by
// claiming startOnce itself: whichever of Start and Stop reaches
// startOnce first decides who closes done. This means:
//   - there is no unsynchronised read of a "started" flag (sync.Once
//     orders the two sides), and
//   - a Start that arrives after Stop is a no-op, rather than launching
//     a loop that would close the already-closed done channel and panic.
func (m *Manager) Stop() {
	m.stopOnce.Do(func() {
		close(m.stop)
		// No-op when Start already ran (run closes done on exit);
		// otherwise the loop will never exist, so close done here.
		m.startOnce.Do(func() {
			close(m.done)
		})
	})
	<-m.done
}
// run is the body of the evaluation goroutine. It waits a 3s settle
// period (so the app and the first stats samples exist), evaluates once,
// then evaluates again on every EvalInterval tick. It returns as soon as
// the stop channel is closed, and always closes done on the way out.
func (m *Manager) run() {
	defer close(m.done)

	select {
	case <-m.stop:
		return
	case <-time.After(3 * time.Second):
		// Settle period elapsed; fall through to the first pass.
	}

	tick := time.NewTicker(EvalInterval)
	defer tick.Stop()

	m.evaluate(m.now())
	for {
		select {
		case <-tick.C:
			m.evaluate(m.now())
		case <-m.stop:
			return
		}
	}
}
// evaluate performs a single pass at the given instant: it loads the
// rules, pulls samples from the last lookbackSeconds, keeps the freshest
// sample per (owner, container), and for every enabled rule emits an
// event for each matching sample that breaches and is not cooling down.
// Failures to load rules or samples are logged and end the pass; with no
// rules the sample source is not queried at all.
func (m *Manager) evaluate(now time.Time) {
	rules, err := m.rules.ListMetricAlertRules()
	if err != nil {
		slog.Warn("metricalert: list rules", "error", err)
		return
	}
	if len(rules) == 0 {
		return
	}

	windowStart := now.Unix() - lookbackSeconds
	window, err := m.samples.ListAllRecentContainerStatsSamples(windowStart)
	if err != nil {
		slog.Warn("metricalert: list samples", "error", err)
		return
	}
	freshest := latestPerContainer(window)
	if len(freshest) == 0 {
		return
	}

	for _, rule := range rules {
		if !rule.Enabled {
			continue
		}
		// A blank WorkloadID makes the rule global.
		scoped := rule.WorkloadID != ""
		for _, sample := range freshest {
			if scoped && sample.OwnerID != rule.WorkloadID {
				continue
			}
			// !ok covers samples that can't be judged, e.g.
			// memory_percent with a zero limit.
			value, ok := metricValue(rule.Metric, sample)
			if !ok ||
				!breached(rule.Comparator, value, rule.Threshold) ||
				m.coolingDown(rule, sample.OwnerID, now) {
				continue
			}
			m.emit(rule, sample, value)
			m.recordFire(rule, sample.OwnerID, now)
		}
	}
}
// latestPerContainer keeps only the most recent sample per
// (OwnerID, ContainerID), so each container is judged on its freshest
// reading rather than every historical row in the window. When two
// samples for the same container share a TS, the one seen first wins.
//
// The result is ordered by each container's first appearance in the
// input. A deterministic order matters because the cooldown is keyed per
// workload: when several containers of one workload breach in the same
// pass, the first one evaluated emits and the rest are suppressed, so a
// random order would make the reported container vary between runs.
func latestPerContainer(samples []store.ContainerStatsSample) []store.ContainerStatsSample {
	// position maps a container key to its slot in out.
	position := map[string]int{}
	var out []store.ContainerStatsSample
	for _, s := range samples {
		// NUL separator keeps distinct (owner, container) pairs from
		// colliding after concatenation.
		key := s.OwnerID + "\x00" + s.ContainerID
		idx, seen := position[key]
		if !seen {
			position[key] = len(out)
			out = append(out, s)
			continue
		}
		if s.TS > out[idx].TS {
			out[idx] = s
		}
	}
	return out
}
// metricValue extracts the number a rule's metric refers to from a
// sample. The second result is false when no value can be produced: an
// unknown metric name, or memory_percent on a sample whose memory limit
// is zero or negative (which would otherwise divide by zero).
func metricValue(metric string, s store.ContainerStatsSample) (float64, bool) {
	if metric == store.MetricCPUPercent {
		return s.CPUPercent, true
	}
	if metric == store.MetricMemoryBytes {
		return float64(s.MemoryUsage), true
	}
	if metric == store.MetricMemoryPercent && s.MemoryLimit > 0 {
		return float64(s.MemoryUsage) / float64(s.MemoryLimit) * 100, true
	}
	return 0, false
}
// breached reports whether value is strictly beyond threshold in the
// direction the comparator names. An unrecognised comparator never
// breaches.
func breached(comparator string, value, threshold float64) bool {
	if comparator == store.MetricComparatorGT {
		return value > threshold
	}
	if comparator == store.MetricComparatorLT {
		return value < threshold
	}
	return false
}
// cooldownKey builds the lastFired map key for one rule on one
// workload, in the form "<ruleID>:<ownerID>".
func cooldownKey(ruleID int64, ownerID string) string {
	return fmt.Sprint(ruleID) + ":" + ownerID
}
// coolingDown reports whether the rule already fired for ownerID less
// than CooldownSeconds before now. Rules with a non-positive cooldown
// never cool down, and neither does a rule/workload pair that has not
// fired yet.
func (m *Manager) coolingDown(rule store.MetricAlertRule, ownerID string, now time.Time) bool {
	if rule.CooldownSeconds <= 0 {
		return false
	}

	key := cooldownKey(rule.ID, ownerID)
	m.mu.Lock()
	firedAt, fired := m.lastFired[key]
	m.mu.Unlock()

	if !fired {
		return false
	}
	window := time.Duration(rule.CooldownSeconds) * time.Second
	return now.Sub(firedAt) < window
}
// recordFire stamps now as the last time the rule fired for ownerID, so
// later coolingDown checks measure the cooldown from this instant.
func (m *Manager) recordFire(rule store.MetricAlertRule, ownerID string, now time.Time) {
	key := cooldownKey(rule.ID, ownerID)

	m.mu.Lock()
	defer m.mu.Unlock()
	m.lastFired[key] = now
}
// emit records one breach: it writes an event_log row through the sink
// and, when a publisher is configured, publishes the same event on the
// bus. The sample's OwnerID is used as the event's WorkloadID. Metadata
// is produced with json.Marshal. A blank rule severity is reported as
// warn. Marshal or insert failures are logged and the event is dropped;
// nothing is published for an event that was not persisted.
func (m *Manager) emit(rule store.MetricAlertRule, sample store.ContainerStatsSample, value float64) {
	rawMeta, err := json.Marshal(map[string]any{
		"workload_id": sample.OwnerID,
		"rule":        rule.Name,
		"metric":      rule.Metric,
		"value":       value,
		"threshold":   rule.Threshold,
		"comparator":  rule.Comparator,
	})
	if err != nil {
		slog.Error("metricalert: marshal metadata", "rule", rule.Name, "error", err)
		return
	}
	metadata := string(rawMeta)
	message := formatMessage(rule, value)

	severity := store.LogScanSeverityWarn
	if rule.Severity != "" {
		severity = rule.Severity
	}

	stored, err := m.sink.InsertEvent(store.EventLog{
		Source:     eventSource,
		Severity:   severity,
		Message:    message,
		WorkloadID: sample.OwnerID,
		Metadata:   metadata,
	})
	if err != nil {
		slog.Error("metricalert: persist event", "rule", rule.Name, "error", err)
		return
	}

	if m.pub == nil {
		return
	}
	payload := events.EventLogPayload{
		ID:         stored.ID,
		Source:     eventSource,
		WorkloadID: sample.OwnerID,
		Severity:   severity,
		Message:    message,
		Metadata:   metadata,
		CreatedAt:  stored.CreatedAt,
	}
	m.pub.Publish(events.Event{Type: events.EventLog, Payload: payload})
}
// formatMessage builds a concise, human, secret-free breach line of the
// form "<rule>: <label> is <value><unit> (threshold <cmp> <threshold><unit>)".
// The only operator-supplied text is rule.Name; the rest are numbers and
// fixed labels.
//
// Percent metrics are printed with one decimal place. Whole-number
// rounding made near-threshold breaches read as contradictions (80.4%
// against a threshold of 80 rendered as "80% (threshold > 80%)") and
// collapsed fractional thresholds such as 0.5. Byte counts stay whole.
func formatMessage(rule store.MetricAlertRule, value float64) string {
	label, unit := metricLabelUnit(rule.Metric)
	word := comparatorWord(rule.Comparator)

	precision := 1
	if rule.Metric == store.MetricMemoryBytes {
		precision = 0
	}
	// %.*f takes its precision from the preceding int argument.
	return fmt.Sprintf("%s: %s is %.*f%s (threshold %s %.*f%s)",
		rule.Name, label, precision, value, unit, word, precision, rule.Threshold, unit)
}
// metricLabelUnit returns the display label and unit suffix for a
// metric. An unrecognised metric is shown under its raw name with no
// unit.
func metricLabelUnit(metric string) (label, unit string) {
	if metric == store.MetricCPUPercent {
		return "CPU", "%"
	}
	if metric == store.MetricMemoryPercent {
		return "Memory", "%"
	}
	if metric == store.MetricMemoryBytes {
		return "Memory", " bytes"
	}
	return metric, ""
}
// comparatorWord returns the symbol shown for a comparator: ">" for gt
// and "<" for lt. Anything else is returned unchanged.
func comparatorWord(comparator string) string {
	if comparator == store.MetricComparatorGT {
		return ">"
	}
	if comparator == store.MetricComparatorLT {
		return "<"
	}
	return comparator
}
+284
View File
@@ -0,0 +1,284 @@
package metricalert
import (
"testing"
"time"
"github.com/alexei/tinyforge/internal/events"
"github.com/alexei/tinyforge/internal/store"
)
// --- fakes -----------------------------------------------------------
// fakeRules is a canned RuleSource: every call returns the configured
// rules and error unchanged.
type fakeRules struct {
rules []store.MetricAlertRule
err error
}
// ListMetricAlertRules implements RuleSource.
func (f *fakeRules) ListMetricAlertRules() ([]store.MetricAlertRule, error) {
return f.rules, f.err
}
// fakeSamples is a canned SampleSource. It ignores the requested window
// when choosing what to return, but remembers the argument so tests can
// tell whether (and with what) it was queried.
type fakeSamples struct {
samples []store.ContainerStatsSample
err error
since int64 // captured arg of the last call
}
// ListAllRecentContainerStatsSamples implements SampleSource.
func (f *fakeSamples) ListAllRecentContainerStatsSamples(sinceTS int64) ([]store.ContainerStatsSample, error) {
f.since = sinceTS
return f.samples, f.err
}
// recordedEvent wraps one event_log row captured by fakeSink.
type recordedEvent struct {
evt store.EventLog
}
// fakeSink is an in-memory EventSink. When err is set every insert
// fails with it; otherwise the event is given the next sequential ID and
// a fixed CreatedAt, then appended to events.
type fakeSink struct {
	events []recordedEvent
	err    error
	nextID int64
}

// InsertEvent implements EventSink.
func (f *fakeSink) InsertEvent(e store.EventLog) (store.EventLog, error) {
	if f.err == nil {
		f.nextID++
		e.ID = f.nextID
		e.CreatedAt = "2026-05-29T00:00:00Z"
		f.events = append(f.events, recordedEvent{evt: e})
		return e, nil
	}
	return store.EventLog{}, f.err
}
// fakePublisher is a Publisher that keeps every published event, in
// order, for later inspection.
type fakePublisher struct {
published []events.Event
}
// Publish implements Publisher.
func (f *fakePublisher) Publish(e events.Event) {
f.published = append(f.published, e)
}
// newManager builds a Manager over fakes seeded with the given rules
// and samples, and returns it with the sink and publisher so tests can
// inspect what was emitted.
func newManager(rules []store.MetricAlertRule, samples []store.ContainerStatsSample) (*Manager, *fakeSink, *fakePublisher) {
	var (
		sink = new(fakeSink)
		pub  = new(fakePublisher)
	)
	ruleSrc := &fakeRules{rules: rules}
	sampleSrc := &fakeSamples{samples: samples}
	return New(ruleSrc, sampleSrc, sink, pub), sink, pub
}
// --- tests -----------------------------------------------------------
// TestEvaluate_BreachEmits checks the happy path: one breaching sample
// produces exactly one persisted event and one published event, both
// carrying the metric_alert source, the rule's severity, and the
// sample's workload.
func TestEvaluate_BreachEmits(t *testing.T) {
	rule := store.MetricAlertRule{
		ID:              1,
		Name:            "cpu-hot",
		Metric:          store.MetricCPUPercent,
		Comparator:      store.MetricComparatorGT,
		Threshold:       80,
		Severity:        "error",
		CooldownSeconds: 300,
		Enabled:         true,
	}
	sample := store.ContainerStatsSample{
		ContainerID: "c1", OwnerID: "w1", OwnerType: "instance", TS: 100, CPUPercent: 95,
	}
	m, sink, pub := newManager(
		[]store.MetricAlertRule{rule},
		[]store.ContainerStatsSample{sample},
	)

	m.evaluate(time.Unix(200, 0))

	// Persisted side.
	if len(sink.events) != 1 {
		t.Fatalf("expected 1 event, got %d", len(sink.events))
	}
	stored := sink.events[0].evt
	if stored.Source != "metric_alert" {
		t.Errorf("source = %q, want metric_alert", stored.Source)
	}
	if stored.Severity != "error" {
		t.Errorf("severity = %q, want error", stored.Severity)
	}
	if stored.WorkloadID != "w1" {
		t.Errorf("workload_id = %q, want w1", stored.WorkloadID)
	}
	if stored.Metadata == "" || stored.Metadata == "{}" {
		t.Errorf("metadata should be populated JSON, got %q", stored.Metadata)
	}

	// Published side.
	if len(pub.published) != 1 {
		t.Fatalf("expected 1 published event, got %d", len(pub.published))
	}
	payload, isLogPayload := pub.published[0].Payload.(events.EventLogPayload)
	if !isLogPayload {
		t.Fatalf("published payload is not EventLogPayload")
	}
	if payload.WorkloadID != "w1" || payload.Source != "metric_alert" {
		t.Errorf("payload workload/source mismatch: %+v", payload)
	}
}
// TestEvaluate_NoBreachNoEmit checks that a reading below a gt
// threshold produces no event.
func TestEvaluate_NoBreachNoEmit(t *testing.T) {
	rule := store.MetricAlertRule{
		ID: 1, Name: "cpu-hot", Metric: store.MetricCPUPercent,
		Comparator: store.MetricComparatorGT, Threshold: 80, Enabled: true,
	}
	sample := store.ContainerStatsSample{
		ContainerID: "c1", OwnerID: "w1", TS: 100, CPUPercent: 10,
	}
	m, sink, _ := newManager(
		[]store.MetricAlertRule{rule},
		[]store.ContainerStatsSample{sample},
	)

	m.evaluate(time.Unix(200, 0))

	if n := len(sink.events); n != 0 {
		t.Fatalf("expected no events for non-breach, got %d", n)
	}
}
// TestEvaluate_DisabledRuleSkipped checks that a disabled rule stays
// silent even when the sample would breach it.
func TestEvaluate_DisabledRuleSkipped(t *testing.T) {
	rule := store.MetricAlertRule{
		ID: 1, Name: "cpu-hot", Metric: store.MetricCPUPercent,
		Comparator: store.MetricComparatorGT, Threshold: 80, Enabled: false,
	}
	sample := store.ContainerStatsSample{ContainerID: "c1", OwnerID: "w1", TS: 100, CPUPercent: 95}
	m, sink, _ := newManager(
		[]store.MetricAlertRule{rule},
		[]store.ContainerStatsSample{sample},
	)

	m.evaluate(time.Unix(200, 0))

	if n := len(sink.events); n != 0 {
		t.Fatalf("disabled rule should not emit, got %d", n)
	}
}
// TestEvaluate_PerWorkloadScoping checks that a rule bound to workload
// w2 ignores a breaching sample from w1 and fires only for w2.
func TestEvaluate_PerWorkloadScoping(t *testing.T) {
	rule := store.MetricAlertRule{
		ID: 1, Name: "w2-only", WorkloadID: "w2", Metric: store.MetricCPUPercent,
		Comparator: store.MetricComparatorGT, Threshold: 80, Enabled: true,
	}
	// Both samples breach; only the second belongs to the rule's workload.
	wrongWorkload := store.ContainerStatsSample{ContainerID: "c1", OwnerID: "w1", TS: 100, CPUPercent: 95}
	rightWorkload := store.ContainerStatsSample{ContainerID: "c2", OwnerID: "w2", TS: 100, CPUPercent: 95}
	m, sink, _ := newManager(
		[]store.MetricAlertRule{rule},
		[]store.ContainerStatsSample{wrongWorkload, rightWorkload},
	)

	m.evaluate(time.Unix(200, 0))

	if n := len(sink.events); n != 1 {
		t.Fatalf("expected 1 event (only w2), got %d", n)
	}
	if got := sink.events[0].evt.WorkloadID; got != "w2" {
		t.Errorf("event should be scoped to w2, got %q", got)
	}
}
// TestEvaluate_GlobalRuleMatchesAll checks that a rule with an empty
// WorkloadID fires once for each breaching workload.
func TestEvaluate_GlobalRuleMatchesAll(t *testing.T) {
	rule := store.MetricAlertRule{
		ID: 1, Name: "global", WorkloadID: "", Metric: store.MetricCPUPercent,
		Comparator: store.MetricComparatorGT, Threshold: 80, Enabled: true,
	}
	first := store.ContainerStatsSample{ContainerID: "c1", OwnerID: "w1", TS: 100, CPUPercent: 95}
	second := store.ContainerStatsSample{ContainerID: "c2", OwnerID: "w2", TS: 100, CPUPercent: 95}
	m, sink, _ := newManager(
		[]store.MetricAlertRule{rule},
		[]store.ContainerStatsSample{first, second},
	)

	m.evaluate(time.Unix(200, 0))

	if n := len(sink.events); n != 2 {
		t.Fatalf("global rule should fire for both workloads, got %d", n)
	}
}
// TestEvaluate_MemoryPercentDivByZeroSkip checks that a memory_percent
// rule skips a sample whose memory limit is zero instead of dividing by
// it.
func TestEvaluate_MemoryPercentDivByZeroSkip(t *testing.T) {
	rule := store.MetricAlertRule{
		ID: 1, Name: "mem", Metric: store.MetricMemoryPercent,
		Comparator: store.MetricComparatorGT, Threshold: 50, Enabled: true,
	}
	unlimited := store.ContainerStatsSample{
		ContainerID: "c1", OwnerID: "w1", TS: 100, MemoryUsage: 1000, MemoryLimit: 0,
	}
	m, sink, _ := newManager(
		[]store.MetricAlertRule{rule},
		[]store.ContainerStatsSample{unlimited},
	)

	m.evaluate(time.Unix(200, 0))

	if n := len(sink.events); n != 0 {
		t.Fatalf("zero memory limit should be skipped for percent rule, got %d", n)
	}
}
// TestEvaluate_MemoryPercentBreaches checks that usage/limit is turned
// into a percentage: 950 of 1000 is 95%, which breaches a 90% threshold.
func TestEvaluate_MemoryPercentBreaches(t *testing.T) {
	rule := store.MetricAlertRule{
		ID: 1, Name: "mem", Metric: store.MetricMemoryPercent,
		Comparator: store.MetricComparatorGT, Threshold: 90, Enabled: true,
	}
	nearlyFull := store.ContainerStatsSample{
		ContainerID: "c1", OwnerID: "w1", TS: 100, MemoryUsage: 950, MemoryLimit: 1000,
	}
	m, sink, _ := newManager(
		[]store.MetricAlertRule{rule},
		[]store.ContainerStatsSample{nearlyFull},
	)

	m.evaluate(time.Unix(200, 0))

	if n := len(sink.events); n != 1 {
		t.Fatalf("95%% should breach 90%% threshold, got %d events", n)
	}
}
// TestEvaluate_CooldownSuppressesSecondEmit checks the cooldown window:
// a repeat breach 10s after the first is suppressed, and one 301s after
// it (past the 300s cooldown) fires again.
func TestEvaluate_CooldownSuppressesSecondEmit(t *testing.T) {
	rule := store.MetricAlertRule{
		ID: 1, Name: "cpu-hot", Metric: store.MetricCPUPercent,
		Comparator: store.MetricComparatorGT, Threshold: 80, CooldownSeconds: 300, Enabled: true,
	}
	hot := store.ContainerStatsSample{ContainerID: "c1", OwnerID: "w1", TS: 100, CPUPercent: 95}
	m, sink, _ := newManager(
		[]store.MetricAlertRule{rule},
		[]store.ContainerStatsSample{hot},
	)
	start := time.Unix(1000, 0)

	// First breach fires; a second pass inside the window must not.
	m.evaluate(start)
	m.evaluate(start.Add(10 * time.Second))
	if n := len(sink.events); n != 1 {
		t.Fatalf("cooldown should suppress second emit, got %d events", n)
	}

	// Once the window has elapsed the rule is free to fire again.
	m.evaluate(start.Add(301 * time.Second))
	if n := len(sink.events); n != 2 {
		t.Fatalf("should re-fire after cooldown elapses, got %d events", n)
	}
}
// TestEvaluate_LatestSamplePerContainer checks that a container is
// judged only on its freshest sample in the window.
//
// The rule has no cooldown, so an implementation that judged every
// sample would emit once per breaching row. The "newest recovered"
// cases are what actually pin the behaviour: a stale breaching reading
// must not fire once a newer, healthy reading exists, regardless of the
// order the rows arrive in.
func TestEvaluate_LatestSamplePerContainer(t *testing.T) {
	rules := []store.MetricAlertRule{{
		ID: 1, Name: "cpu-hot", Metric: store.MetricCPUPercent,
		Comparator: store.MetricComparatorGT, Threshold: 80, Enabled: true,
	}}
	cases := []struct {
		name    string
		samples []store.ContainerStatsSample
		want    int
	}{
		{
			name: "newest breaches",
			samples: []store.ContainerStatsSample{
				{ContainerID: "c1", OwnerID: "w1", TS: 100, CPUPercent: 10},
				{ContainerID: "c1", OwnerID: "w1", TS: 150, CPUPercent: 95},
			},
			want: 1,
		},
		{
			name: "newest recovered",
			samples: []store.ContainerStatsSample{
				{ContainerID: "c1", OwnerID: "w1", TS: 100, CPUPercent: 95},
				{ContainerID: "c1", OwnerID: "w1", TS: 150, CPUPercent: 10},
			},
			want: 0,
		},
		{
			name: "newest recovered, listed first",
			samples: []store.ContainerStatsSample{
				{ContainerID: "c1", OwnerID: "w1", TS: 150, CPUPercent: 10},
				{ContainerID: "c1", OwnerID: "w1", TS: 100, CPUPercent: 95},
			},
			want: 0,
		},
		{
			name: "both breach, still one event",
			samples: []store.ContainerStatsSample{
				{ContainerID: "c1", OwnerID: "w1", TS: 100, CPUPercent: 90},
				{ContainerID: "c1", OwnerID: "w1", TS: 150, CPUPercent: 95},
			},
			want: 1,
		},
	}
	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			m, sink, _ := newManager(rules, tc.samples)
			m.evaluate(time.Unix(200, 0))
			if got := len(sink.events); got != tc.want {
				t.Fatalf("expected %d event(s) from freshest sample, got %d", tc.want, got)
			}
		})
	}
}
// TestEvaluate_LessThanComparator checks the lt direction: a 1% reading
// breaches a "less than 5" rule.
func TestEvaluate_LessThanComparator(t *testing.T) {
	rule := store.MetricAlertRule{
		ID: 1, Name: "cpu-idle", Metric: store.MetricCPUPercent,
		Comparator: store.MetricComparatorLT, Threshold: 5, Enabled: true,
	}
	idle := store.ContainerStatsSample{ContainerID: "c1", OwnerID: "w1", TS: 100, CPUPercent: 1}
	m, sink, _ := newManager(
		[]store.MetricAlertRule{rule},
		[]store.ContainerStatsSample{idle},
	)

	m.evaluate(time.Unix(200, 0))

	if n := len(sink.events); n != 1 {
		t.Fatalf("1%% < 5%% threshold should breach lt rule, got %d events", n)
	}
}
// TestEvaluate_NoRulesNoFetch checks the early exit: with no rules the
// sample source is never called. The fake records the window start it
// was asked for, which would be non-zero (200 - lookback) had a query
// happened.
func TestEvaluate_NoRulesNoFetch(t *testing.T) {
	sampleSrc := &fakeSamples{}
	m := New(&fakeRules{}, sampleSrc, &fakeSink{}, &fakePublisher{})

	m.evaluate(time.Unix(200, 0))

	if queriedSince := sampleSrc.since; queriedSince != 0 {
		t.Errorf("samples should not be queried when there are no rules")
	}
}
+191
View File
@@ -0,0 +1,191 @@
package store
import (
"database/sql"
"errors"
"fmt"
"strings"
)
// CreateMetricAlertRule validates r, stamps CreatedAt and UpdatedAt with
// the current time, inserts the row, and returns r with its new ID
// filled in. Validation errors are returned as-is; database errors are
// wrapped.
func (s *Store) CreateMetricAlertRule(r MetricAlertRule) (MetricAlertRule, error) {
	if err := validateMetricAlertRule(r); err != nil {
		return MetricAlertRule{}, err
	}

	stamp := Now()
	r.CreatedAt, r.UpdatedAt = stamp, stamp

	const insert = `INSERT INTO metric_alert_rules
(workload_id, name, metric, comparator, threshold, severity,
cooldown_seconds, enabled, created_at, updated_at)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`
	result, err := s.db.Exec(insert,
		r.WorkloadID, r.Name, r.Metric, r.Comparator, r.Threshold, r.Severity,
		r.CooldownSeconds, boolToInt(r.Enabled), r.CreatedAt, r.UpdatedAt,
	)
	if err != nil {
		return MetricAlertRule{}, fmt.Errorf("insert metric alert rule: %w", err)
	}

	newID, err := result.LastInsertId()
	if err != nil {
		return MetricAlertRule{}, fmt.Errorf("get metric alert rule id: %w", err)
	}
	r.ID = newID
	return r, nil
}
// ListMetricAlertRules returns all rules, global and workload-scoped,
// sorted by id so callers see a stable order.
func (s *Store) ListMetricAlertRules() ([]MetricAlertRule, error) {
	const allRules = `SELECT id, workload_id, name, metric, comparator, threshold, severity,
cooldown_seconds, enabled, created_at, updated_at
FROM metric_alert_rules ORDER BY id`
	return s.queryMetricAlertRules(allRules)
}
// ListMetricAlertRulesByWorkload returns the rules that apply to one
// workload, sorted by id: the rows whose workload_id equals workloadID
// plus the global rows (workload_id = ""). Useful for the workload
// detail page.
func (s *Store) ListMetricAlertRulesByWorkload(workloadID string) ([]MetricAlertRule, error) {
	const applicable = `SELECT id, workload_id, name, metric, comparator, threshold, severity,
cooldown_seconds, enabled, created_at, updated_at
FROM metric_alert_rules WHERE workload_id = ? OR workload_id = '' ORDER BY id`
	return s.queryMetricAlertRules(applicable, workloadID)
}
// GetMetricAlertRule loads a single rule by id. A missing row is
// reported as an error wrapping ErrNotFound; any other failure is
// wrapped as a query error.
func (s *Store) GetMetricAlertRule(id int64) (MetricAlertRule, error) {
	const byID = `SELECT id, workload_id, name, metric, comparator, threshold, severity,
cooldown_seconds, enabled, created_at, updated_at
FROM metric_alert_rules WHERE id = ?`

	rule, err := scanMetricAlertRuleRow(s.db.QueryRow(byID, id))
	switch {
	case err == nil:
		return rule, nil
	case errors.Is(err, sql.ErrNoRows):
		return MetricAlertRule{}, fmt.Errorf("metric alert rule %d: %w", id, ErrNotFound)
	default:
		return MetricAlertRule{}, fmt.Errorf("query metric alert rule: %w", err)
	}
}
// UpdateMetricAlertRule overwrites the editable columns of a rule row
// (name, metric, comparator, threshold, severity, cooldown_seconds,
// enabled) and refreshes updated_at. id and workload_id are immutable on
// update — change the scope of a rule by deleting + recreating,
// mirroring the log-scan store.
//
// It returns the row as re-read from the database. Errors: a zero ID or
// a validation failure is returned before touching the database; a
// non-existent ID yields an error wrapping ErrNotFound; database
// failures, including a failure to read the affected-row count, are
// wrapped and returned rather than being mistaken for "not found".
func (s *Store) UpdateMetricAlertRule(r MetricAlertRule) (MetricAlertRule, error) {
	if r.ID == 0 {
		return MetricAlertRule{}, fmt.Errorf("metric alert rule: id is required for update")
	}
	if err := validateMetricAlertRule(r); err != nil {
		return MetricAlertRule{}, err
	}
	r.UpdatedAt = Now()
	res, err := s.db.Exec(
		`UPDATE metric_alert_rules
SET name = ?, metric = ?, comparator = ?, threshold = ?, severity = ?,
cooldown_seconds = ?, enabled = ?, updated_at = ?
WHERE id = ?`,
		r.Name, r.Metric, r.Comparator, r.Threshold, r.Severity,
		r.CooldownSeconds, boolToInt(r.Enabled), r.UpdatedAt, r.ID,
	)
	if err != nil {
		return MetricAlertRule{}, fmt.Errorf("update metric alert rule: %w", err)
	}
	n, err := res.RowsAffected()
	if err != nil {
		return MetricAlertRule{}, fmt.Errorf("update metric alert rule: rows affected: %w", err)
	}
	if n == 0 {
		return MetricAlertRule{}, fmt.Errorf("metric alert rule %d: %w", r.ID, ErrNotFound)
	}
	return s.GetMetricAlertRule(r.ID)
}
// DeleteMetricAlertRule removes a rule by id. It returns an error
// wrapping ErrNotFound when no row matched. Database failures, including
// a failure to read the affected-row count, are wrapped and returned
// rather than being mistaken for "not found".
func (s *Store) DeleteMetricAlertRule(id int64) error {
	res, err := s.db.Exec(`DELETE FROM metric_alert_rules WHERE id = ?`, id)
	if err != nil {
		return fmt.Errorf("delete metric alert rule: %w", err)
	}
	n, err := res.RowsAffected()
	if err != nil {
		return fmt.Errorf("delete metric alert rule: rows affected: %w", err)
	}
	if n == 0 {
		return fmt.Errorf("metric alert rule %d: %w", id, ErrNotFound)
	}
	return nil
}
// queryMetricAlertRules runs a SELECT that yields the full rule column
// list and scans every row into a MetricAlertRule.
//
// On success the slice is never nil (an empty result is an empty slice,
// so it JSON-encodes as [] rather than null). On any error, including
// one surfaced by rows.Err after iteration, the slice is nil: callers
// never receive a partially populated result alongside an error.
func (s *Store) queryMetricAlertRules(query string, args ...any) ([]MetricAlertRule, error) {
	rows, err := s.db.Query(query, args...)
	if err != nil {
		return nil, fmt.Errorf("query metric alert rules: %w", err)
	}
	defer rows.Close()

	out := []MetricAlertRule{}
	for rows.Next() {
		r, err := scanMetricAlertRuleRows(rows)
		if err != nil {
			return nil, err
		}
		out = append(out, r)
	}
	// rows.Next returning false can mean "done" or "failed"; only
	// rows.Err tells the two apart.
	if err := rows.Err(); err != nil {
		return nil, fmt.Errorf("iterate metric alert rules: %w", err)
	}
	return out, nil
}
// scanMetricAlertRuleRows reads the current row of a multi-row result
// into a MetricAlertRule. The enabled column is stored as an integer and
// any non-zero value means true. Scan errors are wrapped.
func scanMetricAlertRuleRows(rows *sql.Rows) (MetricAlertRule, error) {
	var (
		rule        MetricAlertRule
		enabledFlag int
	)
	err := rows.Scan(
		&rule.ID, &rule.WorkloadID, &rule.Name, &rule.Metric, &rule.Comparator, &rule.Threshold, &rule.Severity,
		&rule.CooldownSeconds, &enabledFlag, &rule.CreatedAt, &rule.UpdatedAt,
	)
	if err != nil {
		return MetricAlertRule{}, fmt.Errorf("scan metric alert rule: %w", err)
	}
	rule.Enabled = enabledFlag != 0
	return rule, nil
}
func scanMetricAlertRuleRow(row *sql.Row) (MetricAlertRule, error) {
var r MetricAlertRule
var enabled int
if err := row.Scan(
&r.ID, &r.WorkloadID, &r.Name, &r.Metric, &r.Comparator, &r.Threshold, &r.Severity,
&r.CooldownSeconds, &enabled, &r.CreatedAt, &r.UpdatedAt,
); err != nil {
return MetricAlertRule{}, err
}
r.Enabled = enabled != 0
return r, nil
}
// validateMetricAlertRule checks the per-row invariants of r and returns
// the first violation found, in this order: name must contain a
// non-whitespace character, metric and comparator must be one of the
// known identifiers, severity must be info/warn/error or blank, and
// cooldown_seconds must not be negative. It returns nil for a valid rule.
func validateMetricAlertRule(r MetricAlertRule) error {
	if strings.TrimSpace(r.Name) == "" {
		return fmt.Errorf("metric alert rule: name is required")
	}

	if m := r.Metric; m != MetricCPUPercent && m != MetricMemoryPercent && m != MetricMemoryBytes {
		return fmt.Errorf("metric alert rule: invalid metric %q", m)
	}

	if c := r.Comparator; c != MetricComparatorGT && c != MetricComparatorLT {
		return fmt.Errorf("metric alert rule: invalid comparator %q", c)
	}

	// A blank severity is accepted here; applying a default is left to
	// the caller.
	sev := r.Severity
	knownSeverity := sev == LogScanSeverityInfo || sev == LogScanSeverityWarn || sev == LogScanSeverityError
	if sev != "" && !knownSeverity {
		return fmt.Errorf("metric alert rule: invalid severity %q", sev)
	}

	if r.CooldownSeconds < 0 {
		return fmt.Errorf("metric alert rule: cooldown_seconds must be >= 0")
	}
	return nil
}
+167
View File
@@ -0,0 +1,167 @@
package store
import (
	"errors"
	"strings"
	"testing"
)
func TestCreateMetricAlertRule_Validates(t *testing.T) {
s := newTestStore(t)
cases := []struct {
name string
in MetricAlertRule
wantErr string
}{
{
name: "missing name",
in: MetricAlertRule{Metric: MetricCPUPercent, Comparator: MetricComparatorGT},
wantErr: "name is required",
},
{
name: "bad metric",
in: MetricAlertRule{Name: "n", Metric: "load_avg", Comparator: MetricComparatorGT},
wantErr: "invalid metric",
},
{
name: "bad comparator",
in: MetricAlertRule{Name: "n", Metric: MetricCPUPercent, Comparator: "eq"},
wantErr: "invalid comparator",
},
{
name: "bad severity",
in: MetricAlertRule{Name: "n", Metric: MetricCPUPercent, Comparator: MetricComparatorGT, Severity: "loud"},
wantErr: "invalid severity",
},
{
name: "negative cooldown",
in: MetricAlertRule{Name: "n", Metric: MetricCPUPercent, Comparator: MetricComparatorGT, CooldownSeconds: -1},
wantErr: "cooldown_seconds must be",
},
}
for _, c := range cases {
t.Run(c.name, func(t *testing.T) {
_, err := s.CreateMetricAlertRule(c.in)
if err == nil {
t.Fatalf("expected error containing %q, got nil", c.wantErr)
}
if !strings.Contains(err.Error(), c.wantErr) {
t.Fatalf("error mismatch: got %q want substring %q", err.Error(), c.wantErr)
}
})
}
}
// TestCreateAndGetMetricAlertRule creates one rule and reads it back by
// id, checking that the id is assigned and that metric, comparator,
// threshold and the enabled flag survive the round-trip.
func TestCreateAndGetMetricAlertRule(t *testing.T) {
	store := newTestStore(t)

	input := MetricAlertRule{
		Name:            "cpu-hot",
		Metric:          MetricCPUPercent,
		Comparator:      MetricComparatorGT,
		Threshold:       80,
		Severity:        "warn",
		CooldownSeconds: 300,
		Enabled:         true,
	}
	created, err := store.CreateMetricAlertRule(input)
	if err != nil {
		t.Fatalf("create: %v", err)
	}
	if created.ID == 0 {
		t.Fatal("id should be set")
	}

	fetched, err := store.GetMetricAlertRule(created.ID)
	if err != nil {
		t.Fatalf("get: %v", err)
	}
	if !(fetched.Metric == MetricCPUPercent && fetched.Comparator == MetricComparatorGT) {
		t.Errorf("metric/comparator mismatch: %q %q", fetched.Metric, fetched.Comparator)
	}
	if fetched.Threshold != 80 {
		t.Errorf("threshold mismatch: %v", fetched.Threshold)
	}
	if !fetched.Enabled {
		t.Error("enabled lost on round-trip")
	}
}
// TestGetMetricAlertRule_NotFound looks up an id that was never created
// in a fresh store and requires the lookup to fail. It only asserts
// that some error is returned, not which one.
func TestGetMetricAlertRule_NotFound(t *testing.T) {
	const missingID = 999

	store := newTestStore(t)
	_, err := store.GetMetricAlertRule(missingID)
	if err == nil {
		t.Fatal("expected ErrNotFound for missing rule")
	}
}
// TestListMetricAlertRulesByWorkload seeds one global rule plus one rule
// each for workloads w1 and w2, then checks that listing for w1 yields
// exactly two rules and none of them belongs to w2.
func TestListMetricAlertRulesByWorkload(t *testing.T) {
	s := newTestStore(t)

	seed := []MetricAlertRule{
		{
			Name: "global", Metric: MetricCPUPercent, Comparator: MetricComparatorGT,
			Threshold: 90, Severity: "warn", Enabled: true,
		},
		{
			Name: "w1-mem", WorkloadID: "w1", Metric: MetricMemoryPercent, Comparator: MetricComparatorGT,
			Threshold: 85, Severity: "error", Enabled: true,
		},
		{
			Name: "w2-mem", WorkloadID: "w2", Metric: MetricMemoryBytes, Comparator: MetricComparatorGT,
			Threshold: 1000, Severity: "info", Enabled: true,
		},
	}
	// Fail at the broken fixture rather than at a confusing count
	// mismatch further down.
	for _, r := range seed {
		if _, err := s.CreateMetricAlertRule(r); err != nil {
			t.Fatalf("create %q: %v", r.Name, err)
		}
	}

	w1, err := s.ListMetricAlertRulesByWorkload("w1")
	if err != nil {
		t.Fatalf("by workload: %v", err)
	}
	// w1 sees its own rule + the global, but NOT w2's rule.
	if len(w1) != 2 {
		t.Fatalf("w1 should see 2 rules (own + global), got %d", len(w1))
	}
	for _, r := range w1 {
		if r.WorkloadID == "w2" {
			t.Errorf("w1 should not see w2's rule")
		}
	}
}
// TestUpdateMetricAlertRule creates a rule, changes its threshold,
// comparator and enabled flag, and checks that the rule returned by
// UpdateMetricAlertRule carries all three new values.
func TestUpdateMetricAlertRule(t *testing.T) {
	s := newTestStore(t)
	r, err := s.CreateMetricAlertRule(MetricAlertRule{
		Name: "n", Metric: MetricCPUPercent, Comparator: MetricComparatorGT,
		Threshold: 80, Severity: "warn", Enabled: true,
	})
	// Without this check a failed create leaves r.ID == 0 and the test
	// dies later with an unrelated "id is required" update error.
	if err != nil {
		t.Fatalf("create: %v", err)
	}

	r.Threshold = 95
	r.Comparator = MetricComparatorLT
	r.Enabled = false
	got, err := s.UpdateMetricAlertRule(r)
	if err != nil {
		t.Fatalf("update: %v", err)
	}
	if got.Threshold != 95 {
		t.Errorf("threshold not updated: %v", got.Threshold)
	}
	if got.Comparator != MetricComparatorLT {
		t.Errorf("comparator not updated: %q", got.Comparator)
	}
	if got.Enabled {
		t.Error("enabled=false not applied")
	}
}
// TestUpdateMetricAlertRule_NotFound updates an id that does not exist
// in a fresh store and requires the error to wrap ErrNotFound, so that
// an unrelated failure (e.g. a validation error) cannot satisfy the test.
func TestUpdateMetricAlertRule_NotFound(t *testing.T) {
	s := newTestStore(t)
	_, err := s.UpdateMetricAlertRule(MetricAlertRule{
		ID: 999, Name: "n", Metric: MetricCPUPercent, Comparator: MetricComparatorGT,
	})
	if !errors.Is(err, ErrNotFound) {
		t.Fatalf("expected ErrNotFound updating missing rule, got %v", err)
	}
}
// TestDeleteMetricAlertRule creates a rule, deletes it, and checks that
// a subsequent Get fails and that deleting the same id a second time
// reports ErrNotFound.
func TestDeleteMetricAlertRule(t *testing.T) {
	s := newTestStore(t)
	r, err := s.CreateMetricAlertRule(MetricAlertRule{
		Name: "n", Metric: MetricCPUPercent, Comparator: MetricComparatorGT,
		Threshold: 80, Severity: "warn", Enabled: true,
	})
	// Without this check a failed create surfaces as a misleading
	// "delete" failure on id 0.
	if err != nil {
		t.Fatalf("create: %v", err)
	}

	if err := s.DeleteMetricAlertRule(r.ID); err != nil {
		t.Fatalf("delete: %v", err)
	}
	if _, err := s.GetMetricAlertRule(r.ID); err == nil {
		t.Error("rule should be gone after delete")
	}
	// The second delete must fail specifically with the not-found
	// sentinel, not with any error at all.
	if err := s.DeleteMetricAlertRule(r.ID); !errors.Is(err, ErrNotFound) {
		t.Errorf("expected ErrNotFound deleting already-deleted rule, got %v", err)
	}
}
+33
View File
@@ -277,6 +277,39 @@ const (
LogScanSeverityError = "error"
)
// MetricAlertRule is a threshold rule that fires an event when a
// container metric breaches Threshold. Mirrors LogScanRule but evaluated
// against stats_samples instead of log lines. Name, Metric, Comparator,
// Severity and CooldownSeconds are checked by the store's
// validateMetricAlertRule.
type MetricAlertRule struct {
ID int64 `json:"id"` // row id; must be non-zero when updating
WorkloadID string `json:"workload_id"` // "" = applies to all workloads
Name string `json:"name"` // required; whitespace-only names are rejected
Metric string `json:"metric"` // cpu_percent | memory_percent | memory_bytes
Comparator string `json:"comparator"` // gt | lt
Threshold float64 `json:"threshold"` // value the metric is compared against
Severity string `json:"severity"` // info | warn | error
CooldownSeconds int `json:"cooldown_seconds"` // min seconds between fires per (rule,workload); must be >= 0
Enabled bool `json:"enabled"` // persisted as an integer flag by the store
CreatedAt string `json:"created_at"` // timestamp kept as text
UpdatedAt string `json:"updated_at"` // timestamp kept as text; re-stamped by the store on update
}
// Metric-alert metric identifiers: the accepted values of
// MetricAlertRule.Metric. cpu_percent + memory_percent are on a
// 0-100 scale; memory_bytes is an absolute usage figure. Validated in
// the store on create/update.
const (
MetricCPUPercent = "cpu_percent"
MetricMemoryPercent = "memory_percent"
MetricMemoryBytes = "memory_bytes"
)
// Metric-alert comparators: the accepted values of
// MetricAlertRule.Comparator. gt fires when the value exceeds the
// threshold; lt when it falls below.
const (
MetricComparatorGT = "gt" // breach when the value exceeds the threshold
MetricComparatorLT = "lt" // breach when the value falls below the threshold
)
// WorkloadKind enumerates the legacy discriminator values written into
// containers.workload_kind and workloads.kind. After the hard cutover the
// backing project / stack / static_site tables are gone — these constants
+18
View File
@@ -408,6 +408,24 @@ func (s *Store) runMigrations() error {
)`,
`CREATE INDEX IF NOT EXISTS idx_log_scan_rules_workload ON log_scan_rules(workload_id)`,
`CREATE INDEX IF NOT EXISTS idx_log_scan_rules_overrides ON log_scan_rules(overrides_id)`,
// metric_alert_rules: threshold rules the metric-alert manager
// evaluates against recent container stats samples. workload_id is
// NOT NULL but optional: the "" sentinel marks a global rule that
// applies to every workload; a non-empty value scopes it to one workload.
`CREATE TABLE IF NOT EXISTS metric_alert_rules (
id INTEGER PRIMARY KEY AUTOINCREMENT,
workload_id TEXT NOT NULL DEFAULT '',
name TEXT NOT NULL DEFAULT '',
metric TEXT NOT NULL,
comparator TEXT NOT NULL,
threshold REAL NOT NULL DEFAULT 0,
severity TEXT NOT NULL DEFAULT 'warn',
cooldown_seconds INTEGER NOT NULL DEFAULT 300,
enabled INTEGER NOT NULL DEFAULT 1,
created_at TEXT NOT NULL DEFAULT (datetime('now')),
updated_at TEXT NOT NULL DEFAULT (datetime('now'))
)`,
`CREATE INDEX IF NOT EXISTS idx_metric_alert_rules_workload ON metric_alert_rules(workload_id)`,
}
for _, t := range observabilityTables {
if _, err := s.db.Exec(t); err != nil {
+1
View File
@@ -551,6 +551,7 @@
"static_site": "Static Site",
"stale_scanner": "Stale Scanner",
"stale_cleanup": "Stale Cleanup",
"metric_alert": "Metric Alert",
"admin": "Admin"
},
"metadata": "Details"
+1
View File
@@ -551,6 +551,7 @@
"static_site": "Статический сайт",
"stale_scanner": "Сканер устаревших",
"stale_cleanup": "Очистка устаревших",
"metric_alert": "Метрика",
"admin": "Администратор"
},
"metadata": "Подробности"