feat(alerts): metric-threshold alerting (backend + API)
Operators can define metric-threshold alert rules (cpu_percent, memory_percent, memory_bytes; gt/lt) per-workload or global via /api/metric-alert-rules. A periodic evaluator (internal/metricalert, 30s tick) checks the freshest container stats sample per container against enabled rules and, on breach (per-rule-per-workload cooldown), emits into the existing event_log + bus pipeline (source "metric_alert", workload_id set). Alerts therefore surface on the global events page, the per-app activity timeline, and any configured event-trigger webhook -- no new notification plumbing. Mirrors the log_scan_rules store/API/route patterns and the stats.Collector lifecycle. Rule CRUD reads are authed, mutations AdminOnly. Frontend rule-config UI is a follow-up phase. Reviewed: go APPROVE (0 CRITICAL/HIGH).
This commit is contained in:
@@ -0,0 +1,191 @@
|
||||
package store
|
||||
|
||||
import (
|
||||
"database/sql"
|
||||
"errors"
|
||||
"fmt"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// CreateMetricAlertRule inserts a new rule row after validating its
|
||||
// metric/comparator/severity enums and rejecting negative cooldowns.
|
||||
func (s *Store) CreateMetricAlertRule(r MetricAlertRule) (MetricAlertRule, error) {
|
||||
if err := validateMetricAlertRule(r); err != nil {
|
||||
return MetricAlertRule{}, err
|
||||
}
|
||||
now := Now()
|
||||
r.CreatedAt = now
|
||||
r.UpdatedAt = now
|
||||
res, err := s.db.Exec(
|
||||
`INSERT INTO metric_alert_rules
|
||||
(workload_id, name, metric, comparator, threshold, severity,
|
||||
cooldown_seconds, enabled, created_at, updated_at)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`,
|
||||
r.WorkloadID, r.Name, r.Metric, r.Comparator, r.Threshold, r.Severity,
|
||||
r.CooldownSeconds, boolToInt(r.Enabled), r.CreatedAt, r.UpdatedAt,
|
||||
)
|
||||
if err != nil {
|
||||
return MetricAlertRule{}, fmt.Errorf("insert metric alert rule: %w", err)
|
||||
}
|
||||
id, err := res.LastInsertId()
|
||||
if err != nil {
|
||||
return MetricAlertRule{}, fmt.Errorf("get metric alert rule id: %w", err)
|
||||
}
|
||||
r.ID = id
|
||||
return r, nil
|
||||
}
|
||||
|
||||
// ListMetricAlertRules returns every rule, ordered by id for stable UI
|
||||
// rendering.
|
||||
func (s *Store) ListMetricAlertRules() ([]MetricAlertRule, error) {
|
||||
return s.queryMetricAlertRules(
|
||||
`SELECT id, workload_id, name, metric, comparator, threshold, severity,
|
||||
cooldown_seconds, enabled, created_at, updated_at
|
||||
FROM metric_alert_rules ORDER BY id`,
|
||||
)
|
||||
}
|
||||
|
||||
// ListMetricAlertRulesByWorkload returns rules that apply to the given
|
||||
// workload: rows explicitly scoped to it plus global rows (workload_id
|
||||
// = ""). Useful for the workload detail page.
|
||||
func (s *Store) ListMetricAlertRulesByWorkload(workloadID string) ([]MetricAlertRule, error) {
|
||||
return s.queryMetricAlertRules(
|
||||
`SELECT id, workload_id, name, metric, comparator, threshold, severity,
|
||||
cooldown_seconds, enabled, created_at, updated_at
|
||||
FROM metric_alert_rules WHERE workload_id = ? OR workload_id = '' ORDER BY id`,
|
||||
workloadID,
|
||||
)
|
||||
}
|
||||
|
||||
// GetMetricAlertRule fetches one rule by id or returns ErrNotFound.
|
||||
func (s *Store) GetMetricAlertRule(id int64) (MetricAlertRule, error) {
|
||||
row := s.db.QueryRow(
|
||||
`SELECT id, workload_id, name, metric, comparator, threshold, severity,
|
||||
cooldown_seconds, enabled, created_at, updated_at
|
||||
FROM metric_alert_rules WHERE id = ?`, id,
|
||||
)
|
||||
r, err := scanMetricAlertRuleRow(row)
|
||||
if errors.Is(err, sql.ErrNoRows) {
|
||||
return MetricAlertRule{}, fmt.Errorf("metric alert rule %d: %w", id, ErrNotFound)
|
||||
}
|
||||
if err != nil {
|
||||
return MetricAlertRule{}, fmt.Errorf("query metric alert rule: %w", err)
|
||||
}
|
||||
return r, nil
|
||||
}
|
||||
|
||||
// UpdateMetricAlertRule overwrites the editable columns of a rule row.
|
||||
// id and workload_id are immutable on update — change the scope of a
|
||||
// rule by deleting + recreating, mirroring the log-scan store.
|
||||
func (s *Store) UpdateMetricAlertRule(r MetricAlertRule) (MetricAlertRule, error) {
|
||||
if r.ID == 0 {
|
||||
return MetricAlertRule{}, fmt.Errorf("metric alert rule: id is required for update")
|
||||
}
|
||||
if err := validateMetricAlertRule(r); err != nil {
|
||||
return MetricAlertRule{}, err
|
||||
}
|
||||
r.UpdatedAt = Now()
|
||||
res, err := s.db.Exec(
|
||||
`UPDATE metric_alert_rules
|
||||
SET name = ?, metric = ?, comparator = ?, threshold = ?, severity = ?,
|
||||
cooldown_seconds = ?, enabled = ?, updated_at = ?
|
||||
WHERE id = ?`,
|
||||
r.Name, r.Metric, r.Comparator, r.Threshold, r.Severity,
|
||||
r.CooldownSeconds, boolToInt(r.Enabled), r.UpdatedAt, r.ID,
|
||||
)
|
||||
if err != nil {
|
||||
return MetricAlertRule{}, fmt.Errorf("update metric alert rule: %w", err)
|
||||
}
|
||||
n, _ := res.RowsAffected()
|
||||
if n == 0 {
|
||||
return MetricAlertRule{}, fmt.Errorf("metric alert rule %d: %w", r.ID, ErrNotFound)
|
||||
}
|
||||
return s.GetMetricAlertRule(r.ID)
|
||||
}
|
||||
|
||||
// DeleteMetricAlertRule removes a rule by id, returning ErrNotFound when
|
||||
// no row matched.
|
||||
func (s *Store) DeleteMetricAlertRule(id int64) error {
|
||||
res, err := s.db.Exec(`DELETE FROM metric_alert_rules WHERE id = ?`, id)
|
||||
if err != nil {
|
||||
return fmt.Errorf("delete metric alert rule: %w", err)
|
||||
}
|
||||
n, _ := res.RowsAffected()
|
||||
if n == 0 {
|
||||
return fmt.Errorf("metric alert rule %d: %w", id, ErrNotFound)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (s *Store) queryMetricAlertRules(query string, args ...any) ([]MetricAlertRule, error) {
|
||||
rows, err := s.db.Query(query, args...)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("query metric alert rules: %w", err)
|
||||
}
|
||||
defer rows.Close()
|
||||
out := []MetricAlertRule{}
|
||||
for rows.Next() {
|
||||
r, err := scanMetricAlertRuleRows(rows)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
out = append(out, r)
|
||||
}
|
||||
return out, rows.Err()
|
||||
}
|
||||
|
||||
func scanMetricAlertRuleRows(rows *sql.Rows) (MetricAlertRule, error) {
|
||||
var r MetricAlertRule
|
||||
var enabled int
|
||||
if err := rows.Scan(
|
||||
&r.ID, &r.WorkloadID, &r.Name, &r.Metric, &r.Comparator, &r.Threshold, &r.Severity,
|
||||
&r.CooldownSeconds, &enabled, &r.CreatedAt, &r.UpdatedAt,
|
||||
); err != nil {
|
||||
return MetricAlertRule{}, fmt.Errorf("scan metric alert rule: %w", err)
|
||||
}
|
||||
r.Enabled = enabled != 0
|
||||
return r, nil
|
||||
}
|
||||
|
||||
func scanMetricAlertRuleRow(row *sql.Row) (MetricAlertRule, error) {
|
||||
var r MetricAlertRule
|
||||
var enabled int
|
||||
if err := row.Scan(
|
||||
&r.ID, &r.WorkloadID, &r.Name, &r.Metric, &r.Comparator, &r.Threshold, &r.Severity,
|
||||
&r.CooldownSeconds, &enabled, &r.CreatedAt, &r.UpdatedAt,
|
||||
); err != nil {
|
||||
return MetricAlertRule{}, err
|
||||
}
|
||||
r.Enabled = enabled != 0
|
||||
return r, nil
|
||||
}
|
||||
|
||||
// validateMetricAlertRule enforces the per-row invariants: a non-empty
|
||||
// name, a known metric/comparator, a valid severity (blank allowed so
|
||||
// the caller can default it), and a non-negative cooldown.
|
||||
func validateMetricAlertRule(r MetricAlertRule) error {
|
||||
if strings.TrimSpace(r.Name) == "" {
|
||||
return fmt.Errorf("metric alert rule: name is required")
|
||||
}
|
||||
switch r.Metric {
|
||||
case MetricCPUPercent, MetricMemoryPercent, MetricMemoryBytes:
|
||||
default:
|
||||
return fmt.Errorf("metric alert rule: invalid metric %q", r.Metric)
|
||||
}
|
||||
switch r.Comparator {
|
||||
case MetricComparatorGT, MetricComparatorLT:
|
||||
default:
|
||||
return fmt.Errorf("metric alert rule: invalid comparator %q", r.Comparator)
|
||||
}
|
||||
switch r.Severity {
|
||||
case LogScanSeverityInfo, LogScanSeverityWarn, LogScanSeverityError:
|
||||
case "":
|
||||
// Default applied at the caller; allow blank.
|
||||
default:
|
||||
return fmt.Errorf("metric alert rule: invalid severity %q", r.Severity)
|
||||
}
|
||||
if r.CooldownSeconds < 0 {
|
||||
return fmt.Errorf("metric alert rule: cooldown_seconds must be >= 0")
|
||||
}
|
||||
return nil
|
||||
}
|
||||
Reference in New Issue
Block a user