feat(alerts): metric-threshold alerting (backend + API)
Operators can define metric-threshold alert rules (cpu_percent, memory_percent, memory_bytes; gt/lt) per-workload or global via /api/metric-alert-rules. A periodic evaluator (internal/metricalert, 30s tick) checks the freshest container stats sample per container against enabled rules and, on breach (per-rule-per-workload cooldown), emits into the existing event_log + bus pipeline (source "metric_alert", workload_id set). Alerts therefore surface on the global events page, the per-app activity timeline, and any configured event-trigger webhook -- no new notification plumbing. Mirrors the log_scan_rules store/API/route patterns and the stats.Collector lifecycle. Rule CRUD reads are authed, mutations AdminOnly. Frontend rule-config UI is a follow-up phase. Reviewed: go APPROVE (0 CRITICAL/HIGH).
This commit is contained in:
@@ -28,6 +28,7 @@ import (
|
||||
"github.com/alexei/tinyforge/internal/health"
|
||||
"github.com/alexei/tinyforge/internal/logging"
|
||||
"github.com/alexei/tinyforge/internal/logscanner"
|
||||
"github.com/alexei/tinyforge/internal/metricalert"
|
||||
"github.com/alexei/tinyforge/internal/notify"
|
||||
"github.com/alexei/tinyforge/internal/npm"
|
||||
"github.com/alexei/tinyforge/internal/proxy"
|
||||
@@ -390,6 +391,14 @@ func main() {
|
||||
}
|
||||
defer logScanMgr.Stop()
|
||||
|
||||
// Metric-alert manager: evaluates threshold rules against recent
|
||||
// container stats samples and emits event_log entries on breach.
|
||||
// The store satisfies RuleSource/SampleSource/EventSink; the event
|
||||
// bus is the Publisher.
|
||||
metricAlertMgr := metricalert.New(db, db, db, eventBus)
|
||||
metricAlertMgr.Start()
|
||||
defer metricAlertMgr.Stop()
|
||||
|
||||
// Build API server.
|
||||
apiServer := api.NewServer(db, dockerClient, npmClient, proxyProvider, dep, notifier, webhookHandler, eventBus, encKey)
|
||||
apiServer.SetStaleScanner(staleScanner)
|
||||
@@ -451,6 +460,7 @@ func main() {
|
||||
eventBus.Unsubscribe(notifySub)
|
||||
staleScanner.Stop()
|
||||
statsCollector.Stop()
|
||||
metricAlertMgr.Stop()
|
||||
|
||||
// Drain in-progress deploys and notifications.
|
||||
dep.Drain()
|
||||
|
||||
@@ -0,0 +1,235 @@
|
||||
// Package api: metric-alert rule HTTP handlers. The evaluator lives in
|
||||
// internal/metricalert; this file is the REST surface that lets
|
||||
// operators create, edit, and delete threshold rules. Mirrors the
|
||||
// log-scan rule handlers.
|
||||
package api
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"net/http"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
"github.com/go-chi/chi/v5"
|
||||
|
||||
"github.com/alexei/tinyforge/internal/store"
|
||||
)
|
||||
|
||||
// metricAlertRuleInput is the JSON shape accepted by POST + PATCH.
|
||||
// Pointers distinguish "absent" from explicit empty/zero. WorkloadID is
|
||||
// immutable on update (per store.UpdateMetricAlertRule) so it only takes
|
||||
// effect on create.
|
||||
type metricAlertRuleInput struct {
|
||||
WorkloadID *string `json:"workload_id"`
|
||||
Name *string `json:"name"`
|
||||
Metric *string `json:"metric"`
|
||||
Comparator *string `json:"comparator"`
|
||||
Threshold *float64 `json:"threshold"`
|
||||
Severity *string `json:"severity"`
|
||||
CooldownSeconds *int `json:"cooldown_seconds"`
|
||||
Enabled *bool `json:"enabled"`
|
||||
}
|
||||
|
||||
// listMetricAlertRules handles GET /api/metric-alert-rules. Optional
|
||||
// query filter `workload_id=...` returns rules applying to that workload
|
||||
// (its own rows plus globals).
|
||||
func (s *Server) listMetricAlertRules(w http.ResponseWriter, r *http.Request) {
|
||||
if wlID := r.URL.Query().Get("workload_id"); wlID != "" {
|
||||
out, err := s.store.ListMetricAlertRulesByWorkload(wlID)
|
||||
if err != nil {
|
||||
respondError(w, http.StatusInternalServerError, "list metric alert rules")
|
||||
return
|
||||
}
|
||||
respondJSON(w, http.StatusOK, out)
|
||||
return
|
||||
}
|
||||
out, err := s.store.ListMetricAlertRules()
|
||||
if err != nil {
|
||||
respondError(w, http.StatusInternalServerError, "list metric alert rules")
|
||||
return
|
||||
}
|
||||
respondJSON(w, http.StatusOK, out)
|
||||
}
|
||||
|
||||
// getMetricAlertRule handles GET /api/metric-alert-rules/{id}.
|
||||
func (s *Server) getMetricAlertRule(w http.ResponseWriter, r *http.Request) {
|
||||
id, ok := parseMetricAlertRuleID(w, r)
|
||||
if !ok {
|
||||
return
|
||||
}
|
||||
rule, err := s.store.GetMetricAlertRule(id)
|
||||
if err != nil {
|
||||
mapStoreError(w, err, "metric alert rule")
|
||||
return
|
||||
}
|
||||
respondJSON(w, http.StatusOK, rule)
|
||||
}
|
||||
|
||||
// createMetricAlertRule handles POST /api/metric-alert-rules.
|
||||
func (s *Server) createMetricAlertRule(w http.ResponseWriter, r *http.Request) {
|
||||
var in metricAlertRuleInput
|
||||
if !decodeJSON(w, r, &in) {
|
||||
return
|
||||
}
|
||||
rule := store.MetricAlertRule{
|
||||
WorkloadID: derefString(in.WorkloadID),
|
||||
Name: derefString(in.Name),
|
||||
Metric: derefString(in.Metric),
|
||||
Comparator: derefString(in.Comparator),
|
||||
Threshold: derefFloat64(in.Threshold),
|
||||
Severity: firstNonEmpty(derefString(in.Severity), store.LogScanSeverityWarn),
|
||||
CooldownSeconds: derefIntDefault(in.CooldownSeconds, 300),
|
||||
Enabled: in.Enabled == nil || *in.Enabled,
|
||||
}
|
||||
if msg := validateMetricAlertInput(rule); msg != "" {
|
||||
respondError(w, http.StatusBadRequest, msg)
|
||||
return
|
||||
}
|
||||
out, err := s.store.CreateMetricAlertRule(rule)
|
||||
if err != nil {
|
||||
if isMetricAlertValidationErr(err) {
|
||||
respondError(w, http.StatusBadRequest, err.Error())
|
||||
return
|
||||
}
|
||||
respondError(w, http.StatusInternalServerError, "create metric alert rule")
|
||||
return
|
||||
}
|
||||
respondJSON(w, http.StatusCreated, out)
|
||||
}
|
||||
|
||||
// updateMetricAlertRule handles PATCH /api/metric-alert-rules/{id}.
|
||||
// workload_id is immutable; name/metric/comparator/threshold/severity/
|
||||
// cooldown/enabled are individually overridable.
|
||||
func (s *Server) updateMetricAlertRule(w http.ResponseWriter, r *http.Request) {
|
||||
id, ok := parseMetricAlertRuleID(w, r)
|
||||
if !ok {
|
||||
return
|
||||
}
|
||||
existing, err := s.store.GetMetricAlertRule(id)
|
||||
if err != nil {
|
||||
mapStoreError(w, err, "metric alert rule")
|
||||
return
|
||||
}
|
||||
var in metricAlertRuleInput
|
||||
if !decodeJSON(w, r, &in) {
|
||||
return
|
||||
}
|
||||
if in.Name != nil {
|
||||
existing.Name = *in.Name
|
||||
}
|
||||
if in.Metric != nil && *in.Metric != "" {
|
||||
existing.Metric = *in.Metric
|
||||
}
|
||||
if in.Comparator != nil && *in.Comparator != "" {
|
||||
existing.Comparator = *in.Comparator
|
||||
}
|
||||
if in.Threshold != nil {
|
||||
existing.Threshold = *in.Threshold
|
||||
}
|
||||
if in.Severity != nil && *in.Severity != "" {
|
||||
existing.Severity = *in.Severity
|
||||
}
|
||||
if in.CooldownSeconds != nil {
|
||||
existing.CooldownSeconds = *in.CooldownSeconds
|
||||
}
|
||||
if in.Enabled != nil {
|
||||
existing.Enabled = *in.Enabled
|
||||
}
|
||||
if msg := validateMetricAlertInput(existing); msg != "" {
|
||||
respondError(w, http.StatusBadRequest, msg)
|
||||
return
|
||||
}
|
||||
out, err := s.store.UpdateMetricAlertRule(existing)
|
||||
if err != nil {
|
||||
if errors.Is(err, store.ErrNotFound) {
|
||||
respondNotFound(w, "metric alert rule")
|
||||
return
|
||||
}
|
||||
if isMetricAlertValidationErr(err) {
|
||||
respondError(w, http.StatusBadRequest, err.Error())
|
||||
return
|
||||
}
|
||||
respondError(w, http.StatusInternalServerError, "update metric alert rule")
|
||||
return
|
||||
}
|
||||
respondJSON(w, http.StatusOK, out)
|
||||
}
|
||||
|
||||
// deleteMetricAlertRule handles DELETE /api/metric-alert-rules/{id}.
|
||||
func (s *Server) deleteMetricAlertRule(w http.ResponseWriter, r *http.Request) {
|
||||
id, ok := parseMetricAlertRuleID(w, r)
|
||||
if !ok {
|
||||
return
|
||||
}
|
||||
if err := s.store.DeleteMetricAlertRule(id); err != nil {
|
||||
mapStoreError(w, err, "metric alert rule")
|
||||
return
|
||||
}
|
||||
w.WriteHeader(http.StatusNoContent)
|
||||
}
|
||||
|
||||
// validateMetricAlertInput does boundary validation so we return a
|
||||
// clear 400 before hitting the store. The store re-validates the same
|
||||
// invariants as a backstop.
|
||||
func validateMetricAlertInput(rule store.MetricAlertRule) string {
|
||||
if strings.TrimSpace(rule.Name) == "" {
|
||||
return "name is required"
|
||||
}
|
||||
switch rule.Metric {
|
||||
case store.MetricCPUPercent, store.MetricMemoryPercent, store.MetricMemoryBytes:
|
||||
default:
|
||||
return "invalid metric: must be cpu_percent, memory_percent, or memory_bytes"
|
||||
}
|
||||
switch rule.Comparator {
|
||||
case store.MetricComparatorGT, store.MetricComparatorLT:
|
||||
default:
|
||||
return "invalid comparator: must be gt or lt"
|
||||
}
|
||||
switch rule.Severity {
|
||||
case store.LogScanSeverityInfo, store.LogScanSeverityWarn, store.LogScanSeverityError, "":
|
||||
default:
|
||||
return "invalid severity: must be info, warn, or error"
|
||||
}
|
||||
if rule.CooldownSeconds < 0 {
|
||||
return "cooldown_seconds must be >= 0"
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
// isMetricAlertValidationErr reports whether err carries one of the
// known validation phrases, so handlers can answer 400 instead of 500
// for those errors. Matching is by substring on err.Error(); a nil error
// is never a validation error.
func isMetricAlertValidationErr(err error) bool {
	if err == nil {
		return false
	}
	text := err.Error()
	switch {
	case strings.Contains(text, "name is required"),
		strings.Contains(text, "invalid metric"),
		strings.Contains(text, "invalid comparator"),
		strings.Contains(text, "invalid severity"),
		strings.Contains(text, "cooldown_seconds must be"):
		return true
	default:
		return false
	}
}
|
||||
|
||||
func parseMetricAlertRuleID(w http.ResponseWriter, r *http.Request) (int64, bool) {
|
||||
raw := chi.URLParam(r, "id")
|
||||
id, err := strconv.ParseInt(raw, 10, 64)
|
||||
if err != nil || id <= 0 {
|
||||
respondError(w, http.StatusBadRequest, "invalid rule id")
|
||||
return 0, false
|
||||
}
|
||||
return id, true
|
||||
}
|
||||
|
||||
// derefFloat64 returns the value p points at, or 0 when p is nil.
func derefFloat64(p *float64) float64 {
	var v float64
	if p != nil {
		v = *p
	}
	return v
}
|
||||
@@ -431,6 +431,16 @@ func (s *Server) Router() chi.Router {
|
||||
r.Post("/log-scan-rules/{id}/test", s.testLogScanRule)
|
||||
})
|
||||
|
||||
// Metric-alert rules.
|
||||
r.Get("/metric-alert-rules", s.listMetricAlertRules)
|
||||
r.Get("/metric-alert-rules/{id}", s.getMetricAlertRule)
|
||||
r.Group(func(r chi.Router) {
|
||||
r.Use(auth.AdminOnly)
|
||||
r.Post("/metric-alert-rules", s.createMetricAlertRule)
|
||||
r.Patch("/metric-alert-rules/{id}", s.updateMetricAlertRule)
|
||||
r.Delete("/metric-alert-rules/{id}", s.deleteMetricAlertRule)
|
||||
})
|
||||
|
||||
// System resources (read-only).
|
||||
r.Get("/system/stats", s.getSystemStats)
|
||||
r.Get("/system/stats/history", s.getSystemStatsHistory)
|
||||
|
||||
@@ -0,0 +1,349 @@
|
||||
// Package metricalert implements a background goroutine that
|
||||
// periodically evaluates operator-configured metric-threshold rules
|
||||
// against recent container stats samples. On breach (subject to a
|
||||
// per-rule-per-workload cooldown) it emits an event into the existing
|
||||
// event_log + event-bus pipeline — the same fan-out used by the
|
||||
// log-scanner — instead of building any new notification plumbing.
|
||||
package metricalert
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/alexei/tinyforge/internal/events"
|
||||
"github.com/alexei/tinyforge/internal/store"
|
||||
)
|
||||
|
||||
// EvalInterval is how often the evaluator tick fires.
|
||||
const EvalInterval = 30 * time.Second
|
||||
|
||||
// lookbackSeconds bounds how far back we pull samples each tick. Stats
|
||||
// are collected at most every few seconds (see internal/stats), so a
|
||||
// 120s window comfortably captures the latest reading per container
|
||||
// even if collection briefly stalls.
|
||||
const lookbackSeconds = 120
|
||||
|
||||
// RuleSource is the read-side seam for fetching the current rule rows.
|
||||
// Real callers pass *store.Store; tests pass a fake.
|
||||
type RuleSource interface {
|
||||
ListMetricAlertRules() ([]store.MetricAlertRule, error)
|
||||
}
|
||||
|
||||
// SampleSource fetches the recent container stats samples to evaluate.
|
||||
type SampleSource interface {
|
||||
ListAllRecentContainerStatsSamples(sinceTS int64) ([]store.ContainerStatsSample, error)
|
||||
}
|
||||
|
||||
// EventSink writes a breach into event_log.
|
||||
type EventSink interface {
|
||||
InsertEvent(store.EventLog) (store.EventLog, error)
|
||||
}
|
||||
|
||||
// Publisher fans the breach out on the event bus. Matches *events.Bus.
|
||||
type Publisher interface {
|
||||
Publish(events.Event)
|
||||
}
|
||||
|
||||
// Source identifies metric-alert events in event_log + the bus.
|
||||
const eventSource = "metric_alert"
|
||||
|
||||
// Manager owns the evaluation loop lifecycle. It mirrors
|
||||
// stats.Collector: a once-guarded Start/Stop pair with stop/done
|
||||
// channels and a single-goroutine run loop.
|
||||
type Manager struct {
|
||||
rules RuleSource
|
||||
samples SampleSource
|
||||
sink EventSink
|
||||
pub Publisher
|
||||
|
||||
// now is swappable in tests so cooldown windows can be exercised
|
||||
// deterministically. Defaults to time.Now.
|
||||
now func() time.Time
|
||||
|
||||
// mu guards lastFired. The run loop is single-goroutine today, but
|
||||
// Start/Stop and a future ReloadRules may touch shared state; the
|
||||
// mutex is cheap insurance.
|
||||
mu sync.Mutex
|
||||
lastFired map[string]time.Time // "ruleID:ownerID" -> last emit time
|
||||
|
||||
startOnce sync.Once
|
||||
stopOnce sync.Once
|
||||
started bool
|
||||
stop chan struct{}
|
||||
done chan struct{}
|
||||
}
|
||||
|
||||
// New wires a manager with the supplied dependencies. Call Start to
|
||||
// begin evaluating.
|
||||
func New(rules RuleSource, samples SampleSource, sink EventSink, pub Publisher) *Manager {
|
||||
return &Manager{
|
||||
rules: rules,
|
||||
samples: samples,
|
||||
sink: sink,
|
||||
pub: pub,
|
||||
now: time.Now,
|
||||
lastFired: map[string]time.Time{},
|
||||
stop: make(chan struct{}),
|
||||
done: make(chan struct{}),
|
||||
}
|
||||
}
|
||||
|
||||
// Start launches the background loop. Returns immediately. The loop
|
||||
// exits when Stop is called. Safe to call multiple times — only the
|
||||
// first call has an effect.
|
||||
func (m *Manager) Start() {
|
||||
m.startOnce.Do(func() {
|
||||
m.started = true
|
||||
go m.run()
|
||||
})
|
||||
}
|
||||
|
||||
// Stop signals the loop to exit and blocks until it has finished the
|
||||
// in-flight tick. If Start was never called, Stop returns immediately.
|
||||
func (m *Manager) Stop() {
|
||||
m.stopOnce.Do(func() {
|
||||
close(m.stop)
|
||||
if !m.started {
|
||||
close(m.done)
|
||||
}
|
||||
})
|
||||
<-m.done
|
||||
}
|
||||
|
||||
// run is the main loop. It evaluates once shortly after start, then on
|
||||
// every EvalInterval tick, until Stop is called.
|
||||
func (m *Manager) run() {
|
||||
defer close(m.done)
|
||||
|
||||
// Settle delay so the app + first stats samples exist before the
|
||||
// first evaluation.
|
||||
select {
|
||||
case <-time.After(3 * time.Second):
|
||||
case <-m.stop:
|
||||
return
|
||||
}
|
||||
|
||||
ticker := time.NewTicker(EvalInterval)
|
||||
defer ticker.Stop()
|
||||
m.evaluate(m.now())
|
||||
for {
|
||||
select {
|
||||
case <-m.stop:
|
||||
return
|
||||
case <-ticker.C:
|
||||
m.evaluate(m.now())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// evaluate runs one pass: load rules + recent samples, reduce to the
|
||||
// freshest sample per (owner, container), and emit on breach subject to
|
||||
// cooldown. Best-effort throughout — a bad rule or sample never crashes
|
||||
// the loop.
|
||||
func (m *Manager) evaluate(now time.Time) {
|
||||
rules, err := m.rules.ListMetricAlertRules()
|
||||
if err != nil {
|
||||
slog.Warn("metricalert: list rules", "error", err)
|
||||
return
|
||||
}
|
||||
if len(rules) == 0 {
|
||||
return
|
||||
}
|
||||
|
||||
since := now.Unix() - lookbackSeconds
|
||||
samples, err := m.samples.ListAllRecentContainerStatsSamples(since)
|
||||
if err != nil {
|
||||
slog.Warn("metricalert: list samples", "error", err)
|
||||
return
|
||||
}
|
||||
latest := latestPerContainer(samples)
|
||||
if len(latest) == 0 {
|
||||
return
|
||||
}
|
||||
|
||||
for _, rule := range rules {
|
||||
if !rule.Enabled {
|
||||
continue
|
||||
}
|
||||
for _, sample := range latest {
|
||||
// Per-workload rules only match their workload; "" matches all.
|
||||
if rule.WorkloadID != "" && rule.WorkloadID != sample.OwnerID {
|
||||
continue
|
||||
}
|
||||
value, ok := metricValue(rule.Metric, sample)
|
||||
if !ok {
|
||||
continue // e.g. memory_percent with a zero limit
|
||||
}
|
||||
if !breached(rule.Comparator, value, rule.Threshold) {
|
||||
continue
|
||||
}
|
||||
if m.coolingDown(rule, sample.OwnerID, now) {
|
||||
continue
|
||||
}
|
||||
m.emit(rule, sample, value)
|
||||
m.recordFire(rule, sample.OwnerID, now)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// latestPerContainer keeps only the most recent sample per
|
||||
// (OwnerID, ContainerID), so each container is judged on its freshest
|
||||
// reading rather than every historical row in the window.
|
||||
func latestPerContainer(samples []store.ContainerStatsSample) []store.ContainerStatsSample {
|
||||
newest := map[string]store.ContainerStatsSample{}
|
||||
for _, s := range samples {
|
||||
key := s.OwnerID + "\x00" + s.ContainerID
|
||||
if prev, ok := newest[key]; !ok || s.TS > prev.TS {
|
||||
newest[key] = s
|
||||
}
|
||||
}
|
||||
out := make([]store.ContainerStatsSample, 0, len(newest))
|
||||
for _, s := range newest {
|
||||
out = append(out, s)
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
// metricValue resolves a rule's metric against a sample. The bool is
|
||||
// false when the sample can't be judged for that metric (memory_percent
|
||||
// with a zero/unknown limit) so the caller skips it instead of dividing
|
||||
// by zero.
|
||||
func metricValue(metric string, s store.ContainerStatsSample) (float64, bool) {
|
||||
switch metric {
|
||||
case store.MetricCPUPercent:
|
||||
return s.CPUPercent, true
|
||||
case store.MetricMemoryPercent:
|
||||
if s.MemoryLimit <= 0 {
|
||||
return 0, false
|
||||
}
|
||||
return float64(s.MemoryUsage) / float64(s.MemoryLimit) * 100, true
|
||||
case store.MetricMemoryBytes:
|
||||
return float64(s.MemoryUsage), true
|
||||
default:
|
||||
return 0, false
|
||||
}
|
||||
}
|
||||
|
||||
// breached returns whether value crosses threshold per the comparator.
|
||||
func breached(comparator string, value, threshold float64) bool {
|
||||
switch comparator {
|
||||
case store.MetricComparatorGT:
|
||||
return value > threshold
|
||||
case store.MetricComparatorLT:
|
||||
return value < threshold
|
||||
default:
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
// cooldownKey is the per-rule-per-workload cooldown key.
|
||||
func cooldownKey(ruleID int64, ownerID string) string {
|
||||
return fmt.Sprintf("%d:%s", ruleID, ownerID)
|
||||
}
|
||||
|
||||
func (m *Manager) coolingDown(rule store.MetricAlertRule, ownerID string, now time.Time) bool {
|
||||
if rule.CooldownSeconds <= 0 {
|
||||
return false
|
||||
}
|
||||
m.mu.Lock()
|
||||
defer m.mu.Unlock()
|
||||
last, ok := m.lastFired[cooldownKey(rule.ID, ownerID)]
|
||||
if !ok {
|
||||
return false
|
||||
}
|
||||
return now.Sub(last) < time.Duration(rule.CooldownSeconds)*time.Second
|
||||
}
|
||||
|
||||
func (m *Manager) recordFire(rule store.MetricAlertRule, ownerID string, now time.Time) {
|
||||
m.mu.Lock()
|
||||
m.lastFired[cooldownKey(rule.ID, ownerID)] = now
|
||||
m.mu.Unlock()
|
||||
}
|
||||
|
||||
// emit persists the breach as an event_log row and publishes it on the
|
||||
// bus. WorkloadID routes the alert to that app's activity timeline.
|
||||
// Metadata is JSON-marshalled (never string-concatenated). Any
|
||||
// marshal/insert failure is logged and skipped — emitting must never
|
||||
// crash the loop.
|
||||
func (m *Manager) emit(rule store.MetricAlertRule, sample store.ContainerStatsSample, value float64) {
|
||||
message := formatMessage(rule, value)
|
||||
meta := map[string]any{
|
||||
"workload_id": sample.OwnerID,
|
||||
"rule": rule.Name,
|
||||
"metric": rule.Metric,
|
||||
"value": value,
|
||||
"threshold": rule.Threshold,
|
||||
"comparator": rule.Comparator,
|
||||
}
|
||||
metaJSON, err := json.Marshal(meta)
|
||||
if err != nil {
|
||||
slog.Error("metricalert: marshal metadata", "rule", rule.Name, "error", err)
|
||||
return
|
||||
}
|
||||
severity := rule.Severity
|
||||
if severity == "" {
|
||||
severity = store.LogScanSeverityWarn
|
||||
}
|
||||
evt, err := m.sink.InsertEvent(store.EventLog{
|
||||
Source: eventSource,
|
||||
Severity: severity,
|
||||
Message: message,
|
||||
WorkloadID: sample.OwnerID,
|
||||
Metadata: string(metaJSON),
|
||||
})
|
||||
if err != nil {
|
||||
slog.Error("metricalert: persist event", "rule", rule.Name, "error", err)
|
||||
return
|
||||
}
|
||||
if m.pub != nil {
|
||||
m.pub.Publish(events.Event{
|
||||
Type: events.EventLog,
|
||||
Payload: events.EventLogPayload{
|
||||
ID: evt.ID,
|
||||
Source: eventSource,
|
||||
WorkloadID: sample.OwnerID,
|
||||
Severity: severity,
|
||||
Message: message,
|
||||
Metadata: string(metaJSON),
|
||||
CreatedAt: evt.CreatedAt,
|
||||
},
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// formatMessage builds a concise, human, secret-free breach line. The
|
||||
// only operator-supplied text is rule.Name; the rest are numbers and
|
||||
// fixed labels.
|
||||
func formatMessage(rule store.MetricAlertRule, value float64) string {
|
||||
label, unit := metricLabelUnit(rule.Metric)
|
||||
word := comparatorWord(rule.Comparator)
|
||||
return fmt.Sprintf("%s: %s is %.0f%s (threshold %s %.0f%s)",
|
||||
rule.Name, label, value, unit, word, rule.Threshold, unit)
|
||||
}
|
||||
|
||||
func metricLabelUnit(metric string) (label, unit string) {
|
||||
switch metric {
|
||||
case store.MetricCPUPercent:
|
||||
return "CPU", "%"
|
||||
case store.MetricMemoryPercent:
|
||||
return "Memory", "%"
|
||||
case store.MetricMemoryBytes:
|
||||
return "Memory", " bytes"
|
||||
default:
|
||||
return metric, ""
|
||||
}
|
||||
}
|
||||
|
||||
func comparatorWord(comparator string) string {
|
||||
switch comparator {
|
||||
case store.MetricComparatorGT:
|
||||
return ">"
|
||||
case store.MetricComparatorLT:
|
||||
return "<"
|
||||
default:
|
||||
return comparator
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,284 @@
|
||||
package metricalert
|
||||
|
||||
import (
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/alexei/tinyforge/internal/events"
|
||||
"github.com/alexei/tinyforge/internal/store"
|
||||
)
|
||||
|
||||
// --- fakes -----------------------------------------------------------
|
||||
|
||||
type fakeRules struct {
|
||||
rules []store.MetricAlertRule
|
||||
err error
|
||||
}
|
||||
|
||||
func (f *fakeRules) ListMetricAlertRules() ([]store.MetricAlertRule, error) {
|
||||
return f.rules, f.err
|
||||
}
|
||||
|
||||
type fakeSamples struct {
|
||||
samples []store.ContainerStatsSample
|
||||
err error
|
||||
since int64 // captured arg of the last call
|
||||
}
|
||||
|
||||
func (f *fakeSamples) ListAllRecentContainerStatsSamples(sinceTS int64) ([]store.ContainerStatsSample, error) {
|
||||
f.since = sinceTS
|
||||
return f.samples, f.err
|
||||
}
|
||||
|
||||
type recordedEvent struct {
|
||||
evt store.EventLog
|
||||
}
|
||||
|
||||
type fakeSink struct {
|
||||
events []recordedEvent
|
||||
err error
|
||||
nextID int64
|
||||
}
|
||||
|
||||
func (f *fakeSink) InsertEvent(e store.EventLog) (store.EventLog, error) {
|
||||
if f.err != nil {
|
||||
return store.EventLog{}, f.err
|
||||
}
|
||||
f.nextID++
|
||||
e.ID = f.nextID
|
||||
e.CreatedAt = "2026-05-29T00:00:00Z"
|
||||
f.events = append(f.events, recordedEvent{evt: e})
|
||||
return e, nil
|
||||
}
|
||||
|
||||
type fakePublisher struct {
|
||||
published []events.Event
|
||||
}
|
||||
|
||||
func (f *fakePublisher) Publish(e events.Event) {
|
||||
f.published = append(f.published, e)
|
||||
}
|
||||
|
||||
func newManager(rules []store.MetricAlertRule, samples []store.ContainerStatsSample) (*Manager, *fakeSink, *fakePublisher) {
|
||||
sink := &fakeSink{}
|
||||
pub := &fakePublisher{}
|
||||
m := New(&fakeRules{rules: rules}, &fakeSamples{samples: samples}, sink, pub)
|
||||
return m, sink, pub
|
||||
}
|
||||
|
||||
// --- tests -----------------------------------------------------------
|
||||
|
||||
func TestEvaluate_BreachEmits(t *testing.T) {
|
||||
rules := []store.MetricAlertRule{{
|
||||
ID: 1, Name: "cpu-hot", Metric: store.MetricCPUPercent,
|
||||
Comparator: store.MetricComparatorGT, Threshold: 80, Severity: "error",
|
||||
CooldownSeconds: 300, Enabled: true,
|
||||
}}
|
||||
samples := []store.ContainerStatsSample{{
|
||||
ContainerID: "c1", OwnerID: "w1", OwnerType: "instance", TS: 100, CPUPercent: 95,
|
||||
}}
|
||||
m, sink, pub := newManager(rules, samples)
|
||||
|
||||
m.evaluate(time.Unix(200, 0))
|
||||
|
||||
if len(sink.events) != 1 {
|
||||
t.Fatalf("expected 1 event, got %d", len(sink.events))
|
||||
}
|
||||
got := sink.events[0].evt
|
||||
if got.Source != "metric_alert" {
|
||||
t.Errorf("source = %q, want metric_alert", got.Source)
|
||||
}
|
||||
if got.Severity != "error" {
|
||||
t.Errorf("severity = %q, want error", got.Severity)
|
||||
}
|
||||
if got.WorkloadID != "w1" {
|
||||
t.Errorf("workload_id = %q, want w1", got.WorkloadID)
|
||||
}
|
||||
if got.Metadata == "" || got.Metadata == "{}" {
|
||||
t.Errorf("metadata should be populated JSON, got %q", got.Metadata)
|
||||
}
|
||||
if len(pub.published) != 1 {
|
||||
t.Fatalf("expected 1 published event, got %d", len(pub.published))
|
||||
}
|
||||
payload, ok := pub.published[0].Payload.(events.EventLogPayload)
|
||||
if !ok {
|
||||
t.Fatalf("published payload is not EventLogPayload")
|
||||
}
|
||||
if payload.WorkloadID != "w1" || payload.Source != "metric_alert" {
|
||||
t.Errorf("payload workload/source mismatch: %+v", payload)
|
||||
}
|
||||
}
|
||||
|
||||
func TestEvaluate_NoBreachNoEmit(t *testing.T) {
|
||||
rules := []store.MetricAlertRule{{
|
||||
ID: 1, Name: "cpu-hot", Metric: store.MetricCPUPercent,
|
||||
Comparator: store.MetricComparatorGT, Threshold: 80, Enabled: true,
|
||||
}}
|
||||
samples := []store.ContainerStatsSample{{
|
||||
ContainerID: "c1", OwnerID: "w1", TS: 100, CPUPercent: 10,
|
||||
}}
|
||||
m, sink, _ := newManager(rules, samples)
|
||||
|
||||
m.evaluate(time.Unix(200, 0))
|
||||
|
||||
if len(sink.events) != 0 {
|
||||
t.Fatalf("expected no events for non-breach, got %d", len(sink.events))
|
||||
}
|
||||
}
|
||||
|
||||
func TestEvaluate_DisabledRuleSkipped(t *testing.T) {
|
||||
rules := []store.MetricAlertRule{{
|
||||
ID: 1, Name: "cpu-hot", Metric: store.MetricCPUPercent,
|
||||
Comparator: store.MetricComparatorGT, Threshold: 80, Enabled: false,
|
||||
}}
|
||||
samples := []store.ContainerStatsSample{{ContainerID: "c1", OwnerID: "w1", TS: 100, CPUPercent: 95}}
|
||||
m, sink, _ := newManager(rules, samples)
|
||||
|
||||
m.evaluate(time.Unix(200, 0))
|
||||
|
||||
if len(sink.events) != 0 {
|
||||
t.Fatalf("disabled rule should not emit, got %d", len(sink.events))
|
||||
}
|
||||
}
|
||||
|
||||
func TestEvaluate_PerWorkloadScoping(t *testing.T) {
|
||||
rules := []store.MetricAlertRule{{
|
||||
ID: 1, Name: "w2-only", WorkloadID: "w2", Metric: store.MetricCPUPercent,
|
||||
Comparator: store.MetricComparatorGT, Threshold: 80, Enabled: true,
|
||||
}}
|
||||
samples := []store.ContainerStatsSample{
|
||||
{ContainerID: "c1", OwnerID: "w1", TS: 100, CPUPercent: 95}, // breach but wrong workload
|
||||
{ContainerID: "c2", OwnerID: "w2", TS: 100, CPUPercent: 95}, // breach, correct workload
|
||||
}
|
||||
m, sink, _ := newManager(rules, samples)
|
||||
|
||||
m.evaluate(time.Unix(200, 0))
|
||||
|
||||
if len(sink.events) != 1 {
|
||||
t.Fatalf("expected 1 event (only w2), got %d", len(sink.events))
|
||||
}
|
||||
if sink.events[0].evt.WorkloadID != "w2" {
|
||||
t.Errorf("event should be scoped to w2, got %q", sink.events[0].evt.WorkloadID)
|
||||
}
|
||||
}
|
||||
|
||||
func TestEvaluate_GlobalRuleMatchesAll(t *testing.T) {
|
||||
rules := []store.MetricAlertRule{{
|
||||
ID: 1, Name: "global", WorkloadID: "", Metric: store.MetricCPUPercent,
|
||||
Comparator: store.MetricComparatorGT, Threshold: 80, Enabled: true,
|
||||
}}
|
||||
samples := []store.ContainerStatsSample{
|
||||
{ContainerID: "c1", OwnerID: "w1", TS: 100, CPUPercent: 95},
|
||||
{ContainerID: "c2", OwnerID: "w2", TS: 100, CPUPercent: 95},
|
||||
}
|
||||
m, sink, _ := newManager(rules, samples)
|
||||
|
||||
m.evaluate(time.Unix(200, 0))
|
||||
|
||||
if len(sink.events) != 2 {
|
||||
t.Fatalf("global rule should fire for both workloads, got %d", len(sink.events))
|
||||
}
|
||||
}
|
||||
|
||||
func TestEvaluate_MemoryPercentDivByZeroSkip(t *testing.T) {
|
||||
rules := []store.MetricAlertRule{{
|
||||
ID: 1, Name: "mem", Metric: store.MetricMemoryPercent,
|
||||
Comparator: store.MetricComparatorGT, Threshold: 50, Enabled: true,
|
||||
}}
|
||||
samples := []store.ContainerStatsSample{{
|
||||
ContainerID: "c1", OwnerID: "w1", TS: 100, MemoryUsage: 1000, MemoryLimit: 0,
|
||||
}}
|
||||
m, sink, _ := newManager(rules, samples)
|
||||
|
||||
m.evaluate(time.Unix(200, 0))
|
||||
|
||||
if len(sink.events) != 0 {
|
||||
t.Fatalf("zero memory limit should be skipped for percent rule, got %d", len(sink.events))
|
||||
}
|
||||
}
|
||||
|
||||
func TestEvaluate_MemoryPercentBreaches(t *testing.T) {
|
||||
rules := []store.MetricAlertRule{{
|
||||
ID: 1, Name: "mem", Metric: store.MetricMemoryPercent,
|
||||
Comparator: store.MetricComparatorGT, Threshold: 90, Enabled: true,
|
||||
}}
|
||||
samples := []store.ContainerStatsSample{{
|
||||
ContainerID: "c1", OwnerID: "w1", TS: 100, MemoryUsage: 950, MemoryLimit: 1000, // 95%
|
||||
}}
|
||||
m, sink, _ := newManager(rules, samples)
|
||||
|
||||
m.evaluate(time.Unix(200, 0))
|
||||
|
||||
if len(sink.events) != 1 {
|
||||
t.Fatalf("95%% should breach 90%% threshold, got %d events", len(sink.events))
|
||||
}
|
||||
}
|
||||
|
||||
func TestEvaluate_CooldownSuppressesSecondEmit(t *testing.T) {
|
||||
rules := []store.MetricAlertRule{{
|
||||
ID: 1, Name: "cpu-hot", Metric: store.MetricCPUPercent,
|
||||
Comparator: store.MetricComparatorGT, Threshold: 80, CooldownSeconds: 300, Enabled: true,
|
||||
}}
|
||||
samples := []store.ContainerStatsSample{{ContainerID: "c1", OwnerID: "w1", TS: 100, CPUPercent: 95}}
|
||||
m, sink, _ := newManager(rules, samples)
|
||||
|
||||
base := time.Unix(1000, 0)
|
||||
m.evaluate(base)
|
||||
// 10s later — still inside the 300s cooldown window.
|
||||
m.evaluate(base.Add(10 * time.Second))
|
||||
|
||||
if len(sink.events) != 1 {
|
||||
t.Fatalf("cooldown should suppress second emit, got %d events", len(sink.events))
|
||||
}
|
||||
|
||||
// Past the window — should fire again.
|
||||
m.evaluate(base.Add(301 * time.Second))
|
||||
if len(sink.events) != 2 {
|
||||
t.Fatalf("should re-fire after cooldown elapses, got %d events", len(sink.events))
|
||||
}
|
||||
}
|
||||
|
||||
func TestEvaluate_LatestSamplePerContainer(t *testing.T) {
|
||||
// Two samples for the same container: an old non-breaching reading
|
||||
// and a newer breaching one. Only the freshest should be judged.
|
||||
rules := []store.MetricAlertRule{{
|
||||
ID: 1, Name: "cpu-hot", Metric: store.MetricCPUPercent,
|
||||
Comparator: store.MetricComparatorGT, Threshold: 80, Enabled: true,
|
||||
}}
|
||||
samples := []store.ContainerStatsSample{
|
||||
{ContainerID: "c1", OwnerID: "w1", TS: 100, CPUPercent: 10},
|
||||
{ContainerID: "c1", OwnerID: "w1", TS: 150, CPUPercent: 95},
|
||||
}
|
||||
m, sink, _ := newManager(rules, samples)
|
||||
|
||||
m.evaluate(time.Unix(200, 0))
|
||||
|
||||
if len(sink.events) != 1 {
|
||||
t.Fatalf("expected exactly 1 event from freshest sample, got %d", len(sink.events))
|
||||
}
|
||||
}
|
||||
|
||||
func TestEvaluate_LessThanComparator(t *testing.T) {
|
||||
rules := []store.MetricAlertRule{{
|
||||
ID: 1, Name: "cpu-idle", Metric: store.MetricCPUPercent,
|
||||
Comparator: store.MetricComparatorLT, Threshold: 5, Enabled: true,
|
||||
}}
|
||||
samples := []store.ContainerStatsSample{{ContainerID: "c1", OwnerID: "w1", TS: 100, CPUPercent: 1}}
|
||||
m, sink, _ := newManager(rules, samples)
|
||||
|
||||
m.evaluate(time.Unix(200, 0))
|
||||
|
||||
if len(sink.events) != 1 {
|
||||
t.Fatalf("1%% < 5%% threshold should breach lt rule, got %d events", len(sink.events))
|
||||
}
|
||||
}
|
||||
|
||||
func TestEvaluate_NoRulesNoFetch(t *testing.T) {
|
||||
// With no rules there's nothing to do; we shouldn't even query samples.
|
||||
samplesSrc := &fakeSamples{samples: nil}
|
||||
m := New(&fakeRules{rules: nil}, samplesSrc, &fakeSink{}, &fakePublisher{})
|
||||
m.evaluate(time.Unix(200, 0))
|
||||
if samplesSrc.since != 0 {
|
||||
t.Errorf("samples should not be queried when there are no rules")
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,191 @@
|
||||
package store
|
||||
|
||||
import (
|
||||
"database/sql"
|
||||
"errors"
|
||||
"fmt"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// CreateMetricAlertRule inserts a new rule row after validating its
|
||||
// metric/comparator/severity enums and rejecting negative cooldowns.
|
||||
func (s *Store) CreateMetricAlertRule(r MetricAlertRule) (MetricAlertRule, error) {
|
||||
if err := validateMetricAlertRule(r); err != nil {
|
||||
return MetricAlertRule{}, err
|
||||
}
|
||||
now := Now()
|
||||
r.CreatedAt = now
|
||||
r.UpdatedAt = now
|
||||
res, err := s.db.Exec(
|
||||
`INSERT INTO metric_alert_rules
|
||||
(workload_id, name, metric, comparator, threshold, severity,
|
||||
cooldown_seconds, enabled, created_at, updated_at)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`,
|
||||
r.WorkloadID, r.Name, r.Metric, r.Comparator, r.Threshold, r.Severity,
|
||||
r.CooldownSeconds, boolToInt(r.Enabled), r.CreatedAt, r.UpdatedAt,
|
||||
)
|
||||
if err != nil {
|
||||
return MetricAlertRule{}, fmt.Errorf("insert metric alert rule: %w", err)
|
||||
}
|
||||
id, err := res.LastInsertId()
|
||||
if err != nil {
|
||||
return MetricAlertRule{}, fmt.Errorf("get metric alert rule id: %w", err)
|
||||
}
|
||||
r.ID = id
|
||||
return r, nil
|
||||
}
|
||||
|
||||
// ListMetricAlertRules returns every rule, ordered by id for stable UI
|
||||
// rendering.
|
||||
func (s *Store) ListMetricAlertRules() ([]MetricAlertRule, error) {
|
||||
return s.queryMetricAlertRules(
|
||||
`SELECT id, workload_id, name, metric, comparator, threshold, severity,
|
||||
cooldown_seconds, enabled, created_at, updated_at
|
||||
FROM metric_alert_rules ORDER BY id`,
|
||||
)
|
||||
}
|
||||
|
||||
// ListMetricAlertRulesByWorkload returns rules that apply to the given
|
||||
// workload: rows explicitly scoped to it plus global rows (workload_id
|
||||
// = ""). Useful for the workload detail page.
|
||||
func (s *Store) ListMetricAlertRulesByWorkload(workloadID string) ([]MetricAlertRule, error) {
|
||||
return s.queryMetricAlertRules(
|
||||
`SELECT id, workload_id, name, metric, comparator, threshold, severity,
|
||||
cooldown_seconds, enabled, created_at, updated_at
|
||||
FROM metric_alert_rules WHERE workload_id = ? OR workload_id = '' ORDER BY id`,
|
||||
workloadID,
|
||||
)
|
||||
}
|
||||
|
||||
// GetMetricAlertRule fetches one rule by id or returns ErrNotFound.
|
||||
func (s *Store) GetMetricAlertRule(id int64) (MetricAlertRule, error) {
|
||||
row := s.db.QueryRow(
|
||||
`SELECT id, workload_id, name, metric, comparator, threshold, severity,
|
||||
cooldown_seconds, enabled, created_at, updated_at
|
||||
FROM metric_alert_rules WHERE id = ?`, id,
|
||||
)
|
||||
r, err := scanMetricAlertRuleRow(row)
|
||||
if errors.Is(err, sql.ErrNoRows) {
|
||||
return MetricAlertRule{}, fmt.Errorf("metric alert rule %d: %w", id, ErrNotFound)
|
||||
}
|
||||
if err != nil {
|
||||
return MetricAlertRule{}, fmt.Errorf("query metric alert rule: %w", err)
|
||||
}
|
||||
return r, nil
|
||||
}
|
||||
|
||||
// UpdateMetricAlertRule overwrites the editable columns of a rule row.
|
||||
// id and workload_id are immutable on update — change the scope of a
|
||||
// rule by deleting + recreating, mirroring the log-scan store.
|
||||
func (s *Store) UpdateMetricAlertRule(r MetricAlertRule) (MetricAlertRule, error) {
|
||||
if r.ID == 0 {
|
||||
return MetricAlertRule{}, fmt.Errorf("metric alert rule: id is required for update")
|
||||
}
|
||||
if err := validateMetricAlertRule(r); err != nil {
|
||||
return MetricAlertRule{}, err
|
||||
}
|
||||
r.UpdatedAt = Now()
|
||||
res, err := s.db.Exec(
|
||||
`UPDATE metric_alert_rules
|
||||
SET name = ?, metric = ?, comparator = ?, threshold = ?, severity = ?,
|
||||
cooldown_seconds = ?, enabled = ?, updated_at = ?
|
||||
WHERE id = ?`,
|
||||
r.Name, r.Metric, r.Comparator, r.Threshold, r.Severity,
|
||||
r.CooldownSeconds, boolToInt(r.Enabled), r.UpdatedAt, r.ID,
|
||||
)
|
||||
if err != nil {
|
||||
return MetricAlertRule{}, fmt.Errorf("update metric alert rule: %w", err)
|
||||
}
|
||||
n, _ := res.RowsAffected()
|
||||
if n == 0 {
|
||||
return MetricAlertRule{}, fmt.Errorf("metric alert rule %d: %w", r.ID, ErrNotFound)
|
||||
}
|
||||
return s.GetMetricAlertRule(r.ID)
|
||||
}
|
||||
|
||||
// DeleteMetricAlertRule removes a rule by id, returning ErrNotFound when
|
||||
// no row matched.
|
||||
func (s *Store) DeleteMetricAlertRule(id int64) error {
|
||||
res, err := s.db.Exec(`DELETE FROM metric_alert_rules WHERE id = ?`, id)
|
||||
if err != nil {
|
||||
return fmt.Errorf("delete metric alert rule: %w", err)
|
||||
}
|
||||
n, _ := res.RowsAffected()
|
||||
if n == 0 {
|
||||
return fmt.Errorf("metric alert rule %d: %w", id, ErrNotFound)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (s *Store) queryMetricAlertRules(query string, args ...any) ([]MetricAlertRule, error) {
|
||||
rows, err := s.db.Query(query, args...)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("query metric alert rules: %w", err)
|
||||
}
|
||||
defer rows.Close()
|
||||
out := []MetricAlertRule{}
|
||||
for rows.Next() {
|
||||
r, err := scanMetricAlertRuleRows(rows)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
out = append(out, r)
|
||||
}
|
||||
return out, rows.Err()
|
||||
}
|
||||
|
||||
func scanMetricAlertRuleRows(rows *sql.Rows) (MetricAlertRule, error) {
|
||||
var r MetricAlertRule
|
||||
var enabled int
|
||||
if err := rows.Scan(
|
||||
&r.ID, &r.WorkloadID, &r.Name, &r.Metric, &r.Comparator, &r.Threshold, &r.Severity,
|
||||
&r.CooldownSeconds, &enabled, &r.CreatedAt, &r.UpdatedAt,
|
||||
); err != nil {
|
||||
return MetricAlertRule{}, fmt.Errorf("scan metric alert rule: %w", err)
|
||||
}
|
||||
r.Enabled = enabled != 0
|
||||
return r, nil
|
||||
}
|
||||
|
||||
func scanMetricAlertRuleRow(row *sql.Row) (MetricAlertRule, error) {
|
||||
var r MetricAlertRule
|
||||
var enabled int
|
||||
if err := row.Scan(
|
||||
&r.ID, &r.WorkloadID, &r.Name, &r.Metric, &r.Comparator, &r.Threshold, &r.Severity,
|
||||
&r.CooldownSeconds, &enabled, &r.CreatedAt, &r.UpdatedAt,
|
||||
); err != nil {
|
||||
return MetricAlertRule{}, err
|
||||
}
|
||||
r.Enabled = enabled != 0
|
||||
return r, nil
|
||||
}
|
||||
|
||||
// validateMetricAlertRule enforces the per-row invariants: a non-empty
|
||||
// name, a known metric/comparator, a valid severity (blank allowed so
|
||||
// the caller can default it), and a non-negative cooldown.
|
||||
func validateMetricAlertRule(r MetricAlertRule) error {
|
||||
if strings.TrimSpace(r.Name) == "" {
|
||||
return fmt.Errorf("metric alert rule: name is required")
|
||||
}
|
||||
switch r.Metric {
|
||||
case MetricCPUPercent, MetricMemoryPercent, MetricMemoryBytes:
|
||||
default:
|
||||
return fmt.Errorf("metric alert rule: invalid metric %q", r.Metric)
|
||||
}
|
||||
switch r.Comparator {
|
||||
case MetricComparatorGT, MetricComparatorLT:
|
||||
default:
|
||||
return fmt.Errorf("metric alert rule: invalid comparator %q", r.Comparator)
|
||||
}
|
||||
switch r.Severity {
|
||||
case LogScanSeverityInfo, LogScanSeverityWarn, LogScanSeverityError:
|
||||
case "":
|
||||
// Default applied at the caller; allow blank.
|
||||
default:
|
||||
return fmt.Errorf("metric alert rule: invalid severity %q", r.Severity)
|
||||
}
|
||||
if r.CooldownSeconds < 0 {
|
||||
return fmt.Errorf("metric alert rule: cooldown_seconds must be >= 0")
|
||||
}
|
||||
return nil
|
||||
}
|
||||
@@ -0,0 +1,167 @@
|
||||
package store
|
||||
|
||||
import (
|
||||
"strings"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestCreateMetricAlertRule_Validates(t *testing.T) {
|
||||
s := newTestStore(t)
|
||||
cases := []struct {
|
||||
name string
|
||||
in MetricAlertRule
|
||||
wantErr string
|
||||
}{
|
||||
{
|
||||
name: "missing name",
|
||||
in: MetricAlertRule{Metric: MetricCPUPercent, Comparator: MetricComparatorGT},
|
||||
wantErr: "name is required",
|
||||
},
|
||||
{
|
||||
name: "bad metric",
|
||||
in: MetricAlertRule{Name: "n", Metric: "load_avg", Comparator: MetricComparatorGT},
|
||||
wantErr: "invalid metric",
|
||||
},
|
||||
{
|
||||
name: "bad comparator",
|
||||
in: MetricAlertRule{Name: "n", Metric: MetricCPUPercent, Comparator: "eq"},
|
||||
wantErr: "invalid comparator",
|
||||
},
|
||||
{
|
||||
name: "bad severity",
|
||||
in: MetricAlertRule{Name: "n", Metric: MetricCPUPercent, Comparator: MetricComparatorGT, Severity: "loud"},
|
||||
wantErr: "invalid severity",
|
||||
},
|
||||
{
|
||||
name: "negative cooldown",
|
||||
in: MetricAlertRule{Name: "n", Metric: MetricCPUPercent, Comparator: MetricComparatorGT, CooldownSeconds: -1},
|
||||
wantErr: "cooldown_seconds must be",
|
||||
},
|
||||
}
|
||||
for _, c := range cases {
|
||||
t.Run(c.name, func(t *testing.T) {
|
||||
_, err := s.CreateMetricAlertRule(c.in)
|
||||
if err == nil {
|
||||
t.Fatalf("expected error containing %q, got nil", c.wantErr)
|
||||
}
|
||||
if !strings.Contains(err.Error(), c.wantErr) {
|
||||
t.Fatalf("error mismatch: got %q want substring %q", err.Error(), c.wantErr)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestCreateAndGetMetricAlertRule(t *testing.T) {
|
||||
s := newTestStore(t)
|
||||
r, err := s.CreateMetricAlertRule(MetricAlertRule{
|
||||
Name: "cpu-hot", Metric: MetricCPUPercent, Comparator: MetricComparatorGT,
|
||||
Threshold: 80, Severity: "warn", CooldownSeconds: 300, Enabled: true,
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("create: %v", err)
|
||||
}
|
||||
if r.ID == 0 {
|
||||
t.Fatal("id should be set")
|
||||
}
|
||||
got, err := s.GetMetricAlertRule(r.ID)
|
||||
if err != nil {
|
||||
t.Fatalf("get: %v", err)
|
||||
}
|
||||
if got.Metric != MetricCPUPercent || got.Comparator != MetricComparatorGT {
|
||||
t.Errorf("metric/comparator mismatch: %q %q", got.Metric, got.Comparator)
|
||||
}
|
||||
if got.Threshold != 80 {
|
||||
t.Errorf("threshold mismatch: %v", got.Threshold)
|
||||
}
|
||||
if !got.Enabled {
|
||||
t.Error("enabled lost on round-trip")
|
||||
}
|
||||
}
|
||||
|
||||
func TestGetMetricAlertRule_NotFound(t *testing.T) {
|
||||
s := newTestStore(t)
|
||||
if _, err := s.GetMetricAlertRule(999); err == nil {
|
||||
t.Fatal("expected ErrNotFound for missing rule")
|
||||
}
|
||||
}
|
||||
|
||||
func TestListMetricAlertRulesByWorkload(t *testing.T) {
|
||||
s := newTestStore(t)
|
||||
_, _ = s.CreateMetricAlertRule(MetricAlertRule{
|
||||
Name: "global", Metric: MetricCPUPercent, Comparator: MetricComparatorGT,
|
||||
Threshold: 90, Severity: "warn", Enabled: true,
|
||||
})
|
||||
_, _ = s.CreateMetricAlertRule(MetricAlertRule{
|
||||
Name: "w1-mem", WorkloadID: "w1", Metric: MetricMemoryPercent, Comparator: MetricComparatorGT,
|
||||
Threshold: 85, Severity: "error", Enabled: true,
|
||||
})
|
||||
_, _ = s.CreateMetricAlertRule(MetricAlertRule{
|
||||
Name: "w2-mem", WorkloadID: "w2", Metric: MetricMemoryBytes, Comparator: MetricComparatorGT,
|
||||
Threshold: 1000, Severity: "info", Enabled: true,
|
||||
})
|
||||
|
||||
w1, err := s.ListMetricAlertRulesByWorkload("w1")
|
||||
if err != nil {
|
||||
t.Fatalf("by workload: %v", err)
|
||||
}
|
||||
// w1 sees its own rule + the global, but NOT w2's rule.
|
||||
if len(w1) != 2 {
|
||||
t.Fatalf("w1 should see 2 rules (own + global), got %d", len(w1))
|
||||
}
|
||||
for _, r := range w1 {
|
||||
if r.WorkloadID == "w2" {
|
||||
t.Errorf("w1 should not see w2's rule")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestUpdateMetricAlertRule(t *testing.T) {
|
||||
s := newTestStore(t)
|
||||
r, _ := s.CreateMetricAlertRule(MetricAlertRule{
|
||||
Name: "n", Metric: MetricCPUPercent, Comparator: MetricComparatorGT,
|
||||
Threshold: 80, Severity: "warn", Enabled: true,
|
||||
})
|
||||
r.Threshold = 95
|
||||
r.Comparator = MetricComparatorLT
|
||||
r.Enabled = false
|
||||
got, err := s.UpdateMetricAlertRule(r)
|
||||
if err != nil {
|
||||
t.Fatalf("update: %v", err)
|
||||
}
|
||||
if got.Threshold != 95 {
|
||||
t.Errorf("threshold not updated: %v", got.Threshold)
|
||||
}
|
||||
if got.Comparator != MetricComparatorLT {
|
||||
t.Errorf("comparator not updated: %q", got.Comparator)
|
||||
}
|
||||
if got.Enabled {
|
||||
t.Error("enabled=false not applied")
|
||||
}
|
||||
}
|
||||
|
||||
func TestUpdateMetricAlertRule_NotFound(t *testing.T) {
|
||||
s := newTestStore(t)
|
||||
_, err := s.UpdateMetricAlertRule(MetricAlertRule{
|
||||
ID: 999, Name: "n", Metric: MetricCPUPercent, Comparator: MetricComparatorGT,
|
||||
})
|
||||
if err == nil {
|
||||
t.Fatal("expected ErrNotFound updating missing rule")
|
||||
}
|
||||
}
|
||||
|
||||
func TestDeleteMetricAlertRule(t *testing.T) {
|
||||
s := newTestStore(t)
|
||||
r, _ := s.CreateMetricAlertRule(MetricAlertRule{
|
||||
Name: "n", Metric: MetricCPUPercent, Comparator: MetricComparatorGT,
|
||||
Threshold: 80, Severity: "warn", Enabled: true,
|
||||
})
|
||||
if err := s.DeleteMetricAlertRule(r.ID); err != nil {
|
||||
t.Fatalf("delete: %v", err)
|
||||
}
|
||||
if _, err := s.GetMetricAlertRule(r.ID); err == nil {
|
||||
t.Error("rule should be gone after delete")
|
||||
}
|
||||
if err := s.DeleteMetricAlertRule(r.ID); err == nil {
|
||||
t.Error("expected ErrNotFound deleting already-deleted rule")
|
||||
}
|
||||
}
|
||||
@@ -277,6 +277,39 @@ const (
|
||||
LogScanSeverityError = "error"
|
||||
)
|
||||
|
||||
// MetricAlertRule describes a threshold alert: an event fires when a
// container metric breaches Threshold in the direction given by
// Comparator. It mirrors LogScanRule but is evaluated against
// stats_samples instead of log lines.
type MetricAlertRule struct {
	ID int64 `json:"id"`
	// WorkloadID scopes the rule to one workload; "" means the rule
	// applies to all workloads.
	WorkloadID string `json:"workload_id"`
	Name       string `json:"name"`
	// Metric is one of cpu_percent, memory_percent or memory_bytes.
	Metric string `json:"metric"`
	// Comparator is gt or lt.
	Comparator string  `json:"comparator"`
	Threshold  float64 `json:"threshold"`
	// Severity is info, warn or error.
	Severity string `json:"severity"`
	// CooldownSeconds is the minimum number of seconds between fires
	// per (rule, workload) pair.
	CooldownSeconds int    `json:"cooldown_seconds"`
	Enabled         bool   `json:"enabled"`
	CreatedAt       string `json:"created_at"`
	UpdatedAt       string `json:"updated_at"`
}
|
||||
|
||||
// Metric identifiers accepted in MetricAlertRule.Metric. The two
// *_percent metrics are 0–100 ratios, while memory_bytes is an absolute
// usage figure. The store validates the value on create/update.
const (
	MetricCPUPercent    = "cpu_percent"
	MetricMemoryPercent = "memory_percent"
	MetricMemoryBytes   = "memory_bytes"
)
|
||||
|
||||
// Comparators accepted in MetricAlertRule.Comparator: gt fires when the
// value exceeds the threshold, lt when it falls below it.
const (
	MetricComparatorGT = "gt"
	MetricComparatorLT = "lt"
)
|
||||
|
||||
// WorkloadKind enumerates the legacy discriminator values written into
|
||||
// containers.workload_kind and workloads.kind. After the hard cutover the
|
||||
// backing project / stack / static_site tables are gone — these constants
|
||||
|
||||
@@ -408,6 +408,24 @@ func (s *Store) runMigrations() error {
|
||||
)`,
|
||||
`CREATE INDEX IF NOT EXISTS idx_log_scan_rules_workload ON log_scan_rules(workload_id)`,
|
||||
`CREATE INDEX IF NOT EXISTS idx_log_scan_rules_overrides ON log_scan_rules(overrides_id)`,
|
||||
// metric_alert_rules: threshold rules the metric-alert manager
// evaluates against recent container stats samples. workload_id is
// NOT NULL; the empty string is the sentinel for a global rule that
// applies to every workload, and a non-empty value scopes the rule
// to one workload.
|
||||
`CREATE TABLE IF NOT EXISTS metric_alert_rules (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
workload_id TEXT NOT NULL DEFAULT '',
|
||||
name TEXT NOT NULL DEFAULT '',
|
||||
metric TEXT NOT NULL,
|
||||
comparator TEXT NOT NULL,
|
||||
threshold REAL NOT NULL DEFAULT 0,
|
||||
severity TEXT NOT NULL DEFAULT 'warn',
|
||||
cooldown_seconds INTEGER NOT NULL DEFAULT 300,
|
||||
enabled INTEGER NOT NULL DEFAULT 1,
|
||||
created_at TEXT NOT NULL DEFAULT (datetime('now')),
|
||||
updated_at TEXT NOT NULL DEFAULT (datetime('now'))
|
||||
)`,
|
||||
`CREATE INDEX IF NOT EXISTS idx_metric_alert_rules_workload ON metric_alert_rules(workload_id)`,
|
||||
}
|
||||
for _, t := range observabilityTables {
|
||||
if _, err := s.db.Exec(t); err != nil {
|
||||
|
||||
@@ -551,6 +551,7 @@
|
||||
"static_site": "Static Site",
|
||||
"stale_scanner": "Stale Scanner",
|
||||
"stale_cleanup": "Stale Cleanup",
|
||||
"metric_alert": "Metric Alert",
|
||||
"admin": "Admin"
|
||||
},
|
||||
"metadata": "Details"
|
||||
|
||||
@@ -551,6 +551,7 @@
|
||||
"static_site": "Статический сайт",
|
||||
"stale_scanner": "Сканер устаревших",
|
||||
"stale_cleanup": "Очистка устаревших",
|
||||
"metric_alert": "Метрика",
|
||||
"admin": "Администратор"
|
||||
},
|
||||
"metadata": "Подробности"
|
||||
|
||||
Reference in New Issue
Block a user