Files
alexei.dolgolyov cdb9fd57d1 feat(alerts): metric-threshold alerting (backend + API)
Operators can define metric-threshold alert rules (cpu_percent,
memory_percent, memory_bytes; gt/lt) per-workload or global via
/api/metric-alert-rules. A periodic evaluator (internal/metricalert,
30s tick) checks the freshest container stats sample per container
against enabled rules and, on breach (per-rule-per-workload cooldown),
emits into the existing event_log + bus pipeline (source "metric_alert",
workload_id set). Alerts therefore surface on the global events page,
the per-app activity timeline, and any configured event-trigger webhook
-- no new notification plumbing.

Mirrors the log_scan_rules store/API/route patterns and the
stats.Collector lifecycle. Rule CRUD reads are authed, mutations
AdminOnly. Frontend rule-config UI is a follow-up phase.

Reviewed: go APPROVE (0 CRITICAL/HIGH).
2026-05-29 14:06:23 +03:00

192 lines
6.2 KiB
Go

package store
import (
"database/sql"
"errors"
"fmt"
"strings"
)
// CreateMetricAlertRule validates r, stamps created_at and updated_at
// with the same Now() value, and inserts it as a new row.
//
// On success the returned rule is r with ID set to the inserted row id
// and both timestamps filled in. Validation failures are returned as-is;
// insert and id-lookup failures are wrapped with context. Every error
// path returns the zero MetricAlertRule.
func (s *Store) CreateMetricAlertRule(r MetricAlertRule) (MetricAlertRule, error) {
	var zero MetricAlertRule

	if err := validateMetricAlertRule(r); err != nil {
		return zero, err
	}

	// One timestamp for both columns so a fresh row has created_at == updated_at.
	stamp := Now()
	r.CreatedAt, r.UpdatedAt = stamp, stamp

	const insertRule = `INSERT INTO metric_alert_rules
(workload_id, name, metric, comparator, threshold, severity,
cooldown_seconds, enabled, created_at, updated_at)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`

	result, err := s.db.Exec(
		insertRule,
		r.WorkloadID, r.Name, r.Metric, r.Comparator, r.Threshold, r.Severity,
		r.CooldownSeconds, boolToInt(r.Enabled), r.CreatedAt, r.UpdatedAt,
	)
	if err != nil {
		return zero, fmt.Errorf("insert metric alert rule: %w", err)
	}

	newID, err := result.LastInsertId()
	if err != nil {
		return zero, fmt.Errorf("get metric alert rule id: %w", err)
	}

	r.ID = newID
	return r, nil
}
// ListMetricAlertRules loads every metric alert rule, ordered by id so
// callers (e.g. the UI) see the rows in a stable order.
func (s *Store) ListMetricAlertRules() ([]MetricAlertRule, error) {
	const selectAll = `SELECT id, workload_id, name, metric, comparator, threshold, severity,
cooldown_seconds, enabled, created_at, updated_at
FROM metric_alert_rules ORDER BY id`

	return s.queryMetricAlertRules(selectAll)
}
// ListMetricAlertRulesByWorkload loads the rules that apply to
// workloadID: rows whose workload_id equals it, plus global rows (those
// stored with an empty workload_id). Results are ordered by id.
func (s *Store) ListMetricAlertRulesByWorkload(workloadID string) ([]MetricAlertRule, error) {
	const selectForWorkload = `SELECT id, workload_id, name, metric, comparator, threshold, severity,
cooldown_seconds, enabled, created_at, updated_at
FROM metric_alert_rules WHERE workload_id = ? OR workload_id = '' ORDER BY id`

	return s.queryMetricAlertRules(selectForWorkload, workloadID)
}
// GetMetricAlertRule looks up a single rule by its id.
//
// When no row matches, the returned error wraps ErrNotFound; any other
// scan/query failure is wrapped with "query metric alert rule". The
// zero MetricAlertRule accompanies every error.
func (s *Store) GetMetricAlertRule(id int64) (MetricAlertRule, error) {
	const selectByID = `SELECT id, workload_id, name, metric, comparator, threshold, severity,
cooldown_seconds, enabled, created_at, updated_at
FROM metric_alert_rules WHERE id = ?`

	rule, err := scanMetricAlertRuleRow(s.db.QueryRow(selectByID, id))
	switch {
	case errors.Is(err, sql.ErrNoRows):
		// Translate the driver's "no rows" into the store-level sentinel.
		return MetricAlertRule{}, fmt.Errorf("metric alert rule %d: %w", id, ErrNotFound)
	case err != nil:
		return MetricAlertRule{}, fmt.Errorf("query metric alert rule: %w", err)
	}
	return rule, nil
}
// UpdateMetricAlertRule overwrites the editable columns (name, metric,
// comparator, threshold, severity, cooldown_seconds, enabled) of the row
// identified by r.ID, refreshes updated_at, and returns the row as
// re-read from the database.
//
// id and workload_id are immutable on update — change the scope of a
// rule by deleting + recreating, mirroring the log-scan store.
//
// Errors: a zero r.ID or a rule rejected by validateMetricAlertRule is
// refused before touching the database; a failed statement or a failed
// rows-affected lookup is wrapped with context; an id that matches no
// row yields an error wrapping ErrNotFound.
func (s *Store) UpdateMetricAlertRule(r MetricAlertRule) (MetricAlertRule, error) {
	if r.ID == 0 {
		return MetricAlertRule{}, fmt.Errorf("metric alert rule: id is required for update")
	}
	if err := validateMetricAlertRule(r); err != nil {
		return MetricAlertRule{}, err
	}
	r.UpdatedAt = Now()
	res, err := s.db.Exec(
		`UPDATE metric_alert_rules
SET name = ?, metric = ?, comparator = ?, threshold = ?, severity = ?,
cooldown_seconds = ?, enabled = ?, updated_at = ?
WHERE id = ?`,
		r.Name, r.Metric, r.Comparator, r.Threshold, r.Severity,
		r.CooldownSeconds, boolToInt(r.Enabled), r.UpdatedAt, r.ID,
	)
	if err != nil {
		return MetricAlertRule{}, fmt.Errorf("update metric alert rule: %w", err)
	}
	// Surface a RowsAffected failure instead of discarding it: with the
	// error ignored, n would be 0 and the caller would be told the rule
	// does not exist even though the real problem is the driver.
	n, err := res.RowsAffected()
	if err != nil {
		return MetricAlertRule{}, fmt.Errorf("update metric alert rule: rows affected: %w", err)
	}
	if n == 0 {
		return MetricAlertRule{}, fmt.Errorf("metric alert rule %d: %w", r.ID, ErrNotFound)
	}
	// Re-read so the caller gets the stored row, including the columns
	// this statement does not write (workload_id, created_at).
	return s.GetMetricAlertRule(r.ID)
}
// DeleteMetricAlertRule removes the rule with the given id.
//
// Errors: a failed DELETE or a failed rows-affected lookup is wrapped
// with context; when the statement succeeds but matches no row, the
// returned error wraps ErrNotFound.
func (s *Store) DeleteMetricAlertRule(id int64) error {
	res, err := s.db.Exec(`DELETE FROM metric_alert_rules WHERE id = ?`, id)
	if err != nil {
		return fmt.Errorf("delete metric alert rule: %w", err)
	}
	// Do not discard this error: an ignored failure leaves n at 0 and
	// would be misreported to the caller as ErrNotFound.
	n, err := res.RowsAffected()
	if err != nil {
		return fmt.Errorf("delete metric alert rule: rows affected: %w", err)
	}
	if n == 0 {
		return fmt.Errorf("metric alert rule %d: %w", id, ErrNotFound)
	}
	return nil
}
// queryMetricAlertRules runs query with args and scans each result row
// with scanMetricAlertRuleRows, so query must select the rule columns in
// the order that helper expects.
//
// On success it returns a non-nil slice (empty when there are no rows).
// On any failure — running the query, scanning a row, or an error
// reported by the row iterator — it returns a nil slice and the error,
// never a partially filled result.
func (s *Store) queryMetricAlertRules(query string, args ...any) ([]MetricAlertRule, error) {
	rows, err := s.db.Query(query, args...)
	if err != nil {
		return nil, fmt.Errorf("query metric alert rules: %w", err)
	}
	defer rows.Close()

	// Deliberately an empty, non-nil slice: "no rules" stays
	// distinguishable from the nil returned on the error paths.
	out := []MetricAlertRule{}
	for rows.Next() {
		r, err := scanMetricAlertRuleRows(rows)
		if err != nil {
			return nil, err
		}
		out = append(out, r)
	}
	// rows.Next returning false can mean "exhausted" or "failed"; check
	// which, and drop the rows gathered so far if iteration broke off.
	if err := rows.Err(); err != nil {
		return nil, fmt.Errorf("iterate metric alert rules: %w", err)
	}
	return out, nil
}
// scanMetricAlertRuleRows decodes the current row of a multi-row result
// into a MetricAlertRule. The enabled column is read as an integer and
// mapped to a bool (non-zero means enabled). A scan failure is wrapped
// with "scan metric alert rule" and returned with the zero rule.
func scanMetricAlertRuleRows(rows *sql.Rows) (MetricAlertRule, error) {
	var (
		rule        MetricAlertRule
		enabledFlag int
	)

	// Destination order must match the column order of the SELECT.
	dest := []any{
		&rule.ID, &rule.WorkloadID, &rule.Name, &rule.Metric, &rule.Comparator,
		&rule.Threshold, &rule.Severity, &rule.CooldownSeconds, &enabledFlag,
		&rule.CreatedAt, &rule.UpdatedAt,
	}
	err := rows.Scan(dest...)
	if err != nil {
		return MetricAlertRule{}, fmt.Errorf("scan metric alert rule: %w", err)
	}

	rule.Enabled = enabledFlag != 0
	return rule, nil
}
// scanMetricAlertRuleRow decodes a single-row query result into a
// MetricAlertRule. The enabled column is read as an integer and mapped
// to a bool (non-zero means enabled).
//
// The Scan error is handed back exactly as received, without wrapping;
// GetMetricAlertRule inspects it for sql.ErrNoRows and adds the context.
func scanMetricAlertRuleRow(row *sql.Row) (MetricAlertRule, error) {
	var (
		rule        MetricAlertRule
		enabledFlag int
	)

	// Destination order must match the column order of the SELECT.
	targets := []any{
		&rule.ID, &rule.WorkloadID, &rule.Name, &rule.Metric, &rule.Comparator,
		&rule.Threshold, &rule.Severity, &rule.CooldownSeconds, &enabledFlag,
		&rule.CreatedAt, &rule.UpdatedAt,
	}
	err := row.Scan(targets...)
	if err != nil {
		return MetricAlertRule{}, err
	}

	rule.Enabled = enabledFlag != 0
	return rule, nil
}
// validateMetricAlertRule checks the per-row invariants of r and returns
// the first violation found, or nil when r is acceptable:
//
//   - name must contain something other than whitespace;
//   - metric must be one of the known Metric* values;
//   - comparator must be GT or LT;
//   - severity must be info, warn, error, or blank (blank is let through
//     so the caller can apply its default);
//   - cooldown_seconds must not be negative.
func validateMetricAlertRule(r MetricAlertRule) error {
	if trimmed := strings.TrimSpace(r.Name); trimmed == "" {
		return fmt.Errorf("metric alert rule: name is required")
	}

	if m := r.Metric; m != MetricCPUPercent && m != MetricMemoryPercent && m != MetricMemoryBytes {
		return fmt.Errorf("metric alert rule: invalid metric %q", r.Metric)
	}

	if c := r.Comparator; c != MetricComparatorGT && c != MetricComparatorLT {
		return fmt.Errorf("metric alert rule: invalid comparator %q", r.Comparator)
	}

	// A blank severity is allowed; the default is applied at the caller.
	if sev := r.Severity; sev != "" &&
		sev != LogScanSeverityInfo && sev != LogScanSeverityWarn && sev != LogScanSeverityError {
		return fmt.Errorf("metric alert rule: invalid severity %q", r.Severity)
	}

	if r.CooldownSeconds < 0 {
		return fmt.Errorf("metric alert rule: cooldown_seconds must be >= 0")
	}

	return nil
}