Files
tiny-forge/internal/api/metric_alert_rules.go
T
alexei.dolgolyov cdb9fd57d1 feat(alerts): metric-threshold alerting (backend + API)
Operators can define metric-threshold alert rules (cpu_percent,
memory_percent, memory_bytes; gt/lt) per-workload or global via
/api/metric-alert-rules. A periodic evaluator (internal/metricalert,
30s tick) checks the freshest container stats sample per container
against enabled rules and, on breach (per-rule-per-workload cooldown),
emits into the existing event_log + bus pipeline (source "metric_alert",
workload_id set). Alerts therefore surface on the global events page,
the per-app activity timeline, and any configured event-trigger webhook
-- no new notification plumbing.

Mirrors the log_scan_rules store/API/route patterns and the
stats.Collector lifecycle. Rule CRUD reads are authed, mutations
AdminOnly. Frontend rule-config UI is a follow-up phase.

Reviewed: go APPROVE (0 CRITICAL/HIGH).
2026-05-29 14:06:23 +03:00

236 lines
6.7 KiB
Go

// Package api: metric-alert rule HTTP handlers. The evaluator lives in
// internal/metricalert; this file is the REST surface that lets
// operators create, edit, and delete threshold rules. Mirrors the
// log-scan rule handlers.
package api
import (
"errors"
"net/http"
"strconv"
"strings"
"github.com/go-chi/chi/v5"
"github.com/alexei/tinyforge/internal/store"
)
// metricAlertRuleInput is the request body shared by the create (POST)
// and update (PATCH) handlers. Every field is a pointer so a key that
// was left out of the JSON (nil) can be told apart from one that was
// sent with an empty or zero value.
type metricAlertRuleInput struct {
	// WorkloadID is only read by the create handler; the update
	// handler never copies it onto the stored rule.
	WorkloadID *string `json:"workload_id"`
	// Name must be non-blank once whitespace is trimmed.
	Name *string `json:"name"`
	// Metric is one of cpu_percent, memory_percent, or memory_bytes.
	Metric *string `json:"metric"`
	// Comparator is either gt or lt.
	Comparator *string `json:"comparator"`
	// Threshold is the numeric threshold stored on the rule.
	Threshold *float64 `json:"threshold"`
	// Severity is info, warn, or error.
	Severity *string `json:"severity"`
	// CooldownSeconds must not be negative.
	CooldownSeconds *int `json:"cooldown_seconds"`
	// Enabled toggles the rule on or off.
	Enabled *bool `json:"enabled"`
}
// listMetricAlertRules serves GET /api/metric-alert-rules. Without a
// `workload_id` query parameter it returns every rule; with one it
// delegates to the store's per-workload listing (that workload's own
// rows plus globals). Any store failure is reported as a 500 with a
// generic message.
func (s *Server) listMetricAlertRules(w http.ResponseWriter, r *http.Request) {
	workloadID := r.URL.Query().Get("workload_id")

	// No filter requested: return the full rule set.
	if workloadID == "" {
		all, err := s.store.ListMetricAlertRules()
		if err != nil {
			respondError(w, http.StatusInternalServerError, "list metric alert rules")
			return
		}
		respondJSON(w, http.StatusOK, all)
		return
	}

	scoped, err := s.store.ListMetricAlertRulesByWorkload(workloadID)
	if err != nil {
		respondError(w, http.StatusInternalServerError, "list metric alert rules")
		return
	}
	respondJSON(w, http.StatusOK, scoped)
}
// getMetricAlertRule serves GET /api/metric-alert-rules/{id}. A
// malformed id is answered by parseMetricAlertRuleID; store errors are
// translated by mapStoreError; otherwise the rule is returned with 200.
func (s *Server) getMetricAlertRule(w http.ResponseWriter, r *http.Request) {
	ruleID, valid := parseMetricAlertRuleID(w, r)
	if !valid {
		// The helper has already written the error response.
		return
	}

	found, lookupErr := s.store.GetMetricAlertRule(ruleID)
	if lookupErr == nil {
		respondJSON(w, http.StatusOK, found)
		return
	}
	mapStoreError(w, lookupErr, "metric alert rule")
}
// createMetricAlertRule handles POST /api/metric-alert-rules.
//
// Defaults applied to omitted fields: severity falls back to
// store.LogScanSeverityWarn, cooldown_seconds to 300, and enabled to
// true. threshold has no sensible default and must be present in the
// request body (an explicit 0 is accepted).
//
// Responses:
//   - 201 with the stored rule on success;
//   - 400 when boundary validation fails, when threshold is missing, or
//     when the store rejects the rule with a validation error;
//   - 500 (generic message) for any other store failure.
func (s *Server) createMetricAlertRule(w http.ResponseWriter, r *http.Request) {
	var in metricAlertRuleInput
	if !decodeJSON(w, r, &in) {
		// decodeJSON has already written the error response.
		return
	}
	rule := store.MetricAlertRule{
		WorkloadID:      derefString(in.WorkloadID),
		Name:            derefString(in.Name),
		Metric:          derefString(in.Metric),
		Comparator:      derefString(in.Comparator),
		Threshold:       derefFloat64(in.Threshold),
		Severity:        firstNonEmpty(derefString(in.Severity), store.LogScanSeverityWarn),
		CooldownSeconds: derefIntDefault(in.CooldownSeconds, 300),
		// Rules are enabled unless the caller explicitly sends false.
		Enabled: in.Enabled == nil || *in.Enabled,
	}
	if msg := validateMetricAlertInput(rule); msg != "" {
		respondError(w, http.StatusBadRequest, msg)
		return
	}
	// An omitted threshold would otherwise be stored as 0, so e.g. a
	// "gt" rule would be breached by any positive sample. Reject the
	// request rather than persist a rule the operator did not specify.
	// Checked after the shared validation so existing error messages
	// keep their precedence.
	if in.Threshold == nil {
		respondError(w, http.StatusBadRequest, "threshold is required")
		return
	}
	out, err := s.store.CreateMetricAlertRule(rule)
	if err != nil {
		if isMetricAlertValidationErr(err) {
			respondError(w, http.StatusBadRequest, err.Error())
			return
		}
		// Do not leak driver text for unexpected failures.
		respondError(w, http.StatusInternalServerError, "create metric alert rule")
		return
	}
	respondJSON(w, http.StatusCreated, out)
}
// updateMetricAlertRule serves PATCH /api/metric-alert-rules/{id}.
// It loads the stored rule, overlays whichever fields the request
// supplied, re-validates the merged result, and persists it.
//
// workload_id in the body is ignored. name, threshold,
// cooldown_seconds and enabled are applied whenever present; metric,
// comparator and severity are applied only when present and non-empty.
func (s *Server) updateMetricAlertRule(w http.ResponseWriter, r *http.Request) {
	ruleID, valid := parseMetricAlertRuleID(w, r)
	if !valid {
		return
	}

	current, err := s.store.GetMetricAlertRule(ruleID)
	if err != nil {
		mapStoreError(w, err, "metric alert rule")
		return
	}

	var patch metricAlertRuleInput
	if !decodeJSON(w, r, &patch) {
		return
	}

	// Overlay the supplied fields onto the stored rule.
	if v := patch.Name; v != nil {
		current.Name = *v
	}
	if v := patch.Metric; v != nil && *v != "" {
		current.Metric = *v
	}
	if v := patch.Comparator; v != nil && *v != "" {
		current.Comparator = *v
	}
	if v := patch.Threshold; v != nil {
		current.Threshold = *v
	}
	if v := patch.Severity; v != nil && *v != "" {
		current.Severity = *v
	}
	if v := patch.CooldownSeconds; v != nil {
		current.CooldownSeconds = *v
	}
	if v := patch.Enabled; v != nil {
		current.Enabled = *v
	}

	if problem := validateMetricAlertInput(current); problem != "" {
		respondError(w, http.StatusBadRequest, problem)
		return
	}

	updated, err := s.store.UpdateMetricAlertRule(current)
	switch {
	case err == nil:
		respondJSON(w, http.StatusOK, updated)
	case errors.Is(err, store.ErrNotFound):
		// The rule disappeared between the read and the write.
		respondNotFound(w, "metric alert rule")
	case isMetricAlertValidationErr(err):
		respondError(w, http.StatusBadRequest, err.Error())
	default:
		respondError(w, http.StatusInternalServerError, "update metric alert rule")
	}
}
// deleteMetricAlertRule serves DELETE /api/metric-alert-rules/{id}.
// On success it replies 204 with no body; store errors are translated
// by mapStoreError.
func (s *Server) deleteMetricAlertRule(w http.ResponseWriter, r *http.Request) {
	ruleID, valid := parseMetricAlertRuleID(w, r)
	if !valid {
		return
	}

	err := s.store.DeleteMetricAlertRule(ruleID)
	if err == nil {
		w.WriteHeader(http.StatusNoContent)
		return
	}
	mapStoreError(w, err, "metric alert rule")
}
// validateMetricAlertInput checks a rule at the HTTP boundary so the
// handlers can answer with a precise 400 before the store is touched
// (the store is expected to enforce the same invariants again).
//
// It returns the first problem found as a human-readable message, or
// the empty string when the rule is acceptable. Checks run in this
// order: name, metric, comparator, severity, cooldown. An empty
// severity is accepted here.
func validateMetricAlertInput(rule store.MetricAlertRule) string {
	metricKnown := rule.Metric == store.MetricCPUPercent ||
		rule.Metric == store.MetricMemoryPercent ||
		rule.Metric == store.MetricMemoryBytes

	comparatorKnown := rule.Comparator == store.MetricComparatorGT ||
		rule.Comparator == store.MetricComparatorLT

	severityKnown := rule.Severity == "" ||
		rule.Severity == store.LogScanSeverityInfo ||
		rule.Severity == store.LogScanSeverityWarn ||
		rule.Severity == store.LogScanSeverityError

	switch {
	case strings.TrimSpace(rule.Name) == "":
		return "name is required"
	case !metricKnown:
		return "invalid metric: must be cpu_percent, memory_percent, or memory_bytes"
	case !comparatorKnown:
		return "invalid comparator: must be gt or lt"
	case !severityKnown:
		return "invalid severity: must be info, warn, or error"
	case rule.CooldownSeconds < 0:
		return "cooldown_seconds must be >= 0"
	}
	return ""
}
// isMetricAlertValidationErr reports whether err looks like one of the
// store's rule-validation failures, so handlers can answer 400 instead
// of 500. Detection is by substring match on the error text; a nil
// error is never a validation error.
func isMetricAlertValidationErr(err error) bool {
	if err == nil {
		return false
	}
	text := err.Error()
	switch {
	case strings.Contains(text, "name is required"),
		strings.Contains(text, "invalid metric"),
		strings.Contains(text, "invalid comparator"),
		strings.Contains(text, "invalid severity"),
		strings.Contains(text, "cooldown_seconds must be"):
		return true
	}
	return false
}
// parseMetricAlertRuleID reads the {id} route parameter and parses it
// as a base-10 int64. It returns the id and true when the value parses
// and is strictly positive. Otherwise it writes a 400 "invalid rule id"
// response itself and returns (0, false), so callers only need to
// return.
func parseMetricAlertRuleID(w http.ResponseWriter, r *http.Request) (int64, bool) {
	id, err := strconv.ParseInt(chi.URLParam(r, "id"), 10, 64)
	if err == nil && id > 0 {
		return id, true
	}
	respondError(w, http.StatusBadRequest, "invalid rule id")
	return 0, false
}
// derefFloat64 returns the value p points to, or 0 when p is nil.
func derefFloat64(p *float64) float64 {
	var value float64
	if p != nil {
		value = *p
	}
	return value
}