feat(alerts): metric-alert rule-management UI (Phase 2)

Completes metric-threshold alerting end-to-end: /metric-alert-rules list/new/edit routes (mirroring log-scan-rules) with metric/comparator/threshold fields, the workload scope picker, ToggleSwitch, and a ConfirmDialog delete flow; an api.ts MetricAlertRule CRUD client; an "Observe" nav entry; and a full metricalert.* i18n namespace (en/ru parity). Create-form cooldown defaults to 300s to match the server. Rules are now manageable in the WebUI; breaches already surface in the per-app activity timeline and fire any configured event-trigger webhook. Reviewed: typescript APPROVE (0 CRITICAL/HIGH).
2026-05-29 14:34:01 +03:00
parent 2e26f555c5
commit 7576f54e76
7 changed files with 1927 additions and 1 deletions
@@ -1375,3 +1375,56 @@ export function getLogScanStats(signal?: AbortSignal): Promise<LogScanStats> {
 	return get<LogScanStats>('/api/log-scan-rules/stats', signal);
 }

+// ── Metric alert rules ──────────────────────────────────────────────
+// Backend: internal/api/metric_alert_rules.go. Rules compare a sampled
+// container metric (cpu/memory) against a threshold using a comparator.
+// Scope model: workload_id="" → global; workload_id set → workload-only.
+// Unlike log-scan rules there is no override / test / effective-rules
+// concept — a metric-alert rule is a flat threshold check.
+
+export interface MetricAlertRule { // a stored rule; the result type of the list/get/create/update calls in this section
+	id: number; // interpolated into /api/metric-alert-rules/{id} by the get/update/delete calls
+	workload_id: string; // "" = global
+	name: string;
+	metric: 'cpu_percent' | 'memory_percent' | 'memory_bytes'; // which sampled container metric is checked
+	comparator: 'gt' | 'lt'; // direction of the comparison against `threshold`
+	threshold: number; // NOTE(review): unit presumably follows `metric` (percent vs. bytes) — confirm against backend
+	severity: 'info' | 'warn' | 'error';
+	cooldown_seconds: number; // NOTE(review): presumably limits how often a sustained breach re-fires — confirm
+	enabled: boolean;
+	created_at: string; // NOTE(review): timestamp format is not visible from this file — confirm
+	updated_at: string;
+}
+export interface MetricAlertRuleInput { // payload type accepted by createMetricAlertRule / updateMetricAlertRule
+	workload_id?: string; // "" = global; NOTE(review): omitting it presumably also means global — confirm
+	name: string;
+	metric: MetricAlertRule['metric']; // literal unions are derived from MetricAlertRule so the two cannot drift apart
+	comparator: MetricAlertRule['comparator'];
+	threshold: number;
+	severity?: MetricAlertRule['severity']; // NOTE(review): value used when omitted is chosen by the server — confirm
+	cooldown_seconds?: number;
+	enabled?: boolean;
+}
+export function listMetricAlertRules(opts?: {
+	workloadID?: string; // when truthy, sent URL-encoded as ?workload_id=…; absent or "" adds no query string
+	signal?: AbortSignal; // handed unchanged to the `get` helper
+}): Promise<MetricAlertRule[]> {
+	const query = opts?.workloadID ? `?workload_id=${encodeURIComponent(opts.workloadID)}` : '';
+	return get<MetricAlertRule[]>(`/api/metric-alert-rules${query}`, opts?.signal);
+}
+export function getMetricAlertRule(id: number, signal?: AbortSignal): Promise<MetricAlertRule> {
+	return get<MetricAlertRule>(`/api/metric-alert-rules/${id}`, signal); // one rule by id; `signal` goes to the `get` helper as-is
+}
+export function createMetricAlertRule(data: MetricAlertRuleInput): Promise<MetricAlertRule> {
+	return post<MetricAlertRule>('/api/metric-alert-rules', data); // hands `data` to the `post` helper; result is typed as the stored rule
+}
+export function updateMetricAlertRule(
+	id: number, // rule to modify; becomes the last URL path segment
+	data: MetricAlertRuleInput // NOTE(review): full input type, so name/metric/comparator/threshold are required even though `patch` is used — confirm intended
+): Promise<MetricAlertRule> {
+	return patch<MetricAlertRule>(`/api/metric-alert-rules/${id}`, data);
+}
+export function deleteMetricAlertRule(id: number): Promise<void> {
+	return del<void>(`/api/metric-alert-rules/${id}`); // calls the `del` helper for this rule's URL; the promise carries no value
+}
+
@@ -17,6 +17,7 @@
    "apps": "Apps",
    "eventTriggers": "Event Triggers",
    "logScanRules": "Log Rules",
+    "metricAlertRules": "Metric Alerts",
    "triggers": "Triggers",
    "proxies": "Proxies",
    "events": "Events",
@@ -887,6 +888,105 @@
      "disabled": "disabled"
    }
  },
+  "metricalert": {
+    "title": "Metric alert rules",
+    "titleNew": "Forge a new alert",
+    "titleSingular": "Alert rule",
+    "lede": "Threshold checks the watcher runs against each running container's sampled CPU and memory. When a sample crosses the threshold the rule fires into event_log with the rule's severity, where event triggers pick it up and fan out to operator-configured webhooks. {enabled} of {total} enabled.",
+    "ledeNew": "Pick a metric, a comparator, and a threshold. Leave the workload field empty to create a global rule that applies to every workload, or scope it to a single workload.",
+    "stat": {
+      "total": "TOTAL",
+      "global": "GLOBAL",
+      "workload": "WORKLOAD",
+      "enabled": "ENABLED"
+    },
+    "toolbar": {
+      "newButton": "New alert",
+      "backToList": "Back to alerts"
+    },
+    "filter": {
+      "all": "ALL",
+      "global": "GLOBAL",
+      "workload": "WORKLOAD"
+    },
+    "empty": {
+      "heading": "No alert rules yet",
+      "body": "Start with a global rule like CPU greater than 80%, then narrow per-workload by scoping a rule to a single workload.",
+      "cta": "Create the first alert"
+    },
+    "list": {
+      "name": "Name",
+      "condition": "Condition",
+      "scope": "Scope",
+      "severity": "Severity",
+      "status": "Status",
+      "open": "Open"
+    },
+    "detail": {
+      "config": "Configuration",
+      "configSub": "id #{id} · scope {scope}",
+      "dangerZone": "Danger zone",
+      "dangerZoneSub": "Deleting an alert rule removes it immediately and stops it from firing.",
+      "deleteButton": "Delete alert",
+      "deleteTitle": "Delete alert rule?",
+      "deleteMessage": "Rule \"{name}\" will be removed immediately and will stop firing."
+    },
+    "form": {
+      "name": "Name",
+      "namePlaceholder": "e.g. Worker CPU saturated",
+      "condition": "Condition",
+      "metric": "Metric",
+      "comparator": "Comparator",
+      "threshold": "Threshold",
+      "thresholdPlaceholder": "e.g. 80",
+      "thresholdHintPercent": "Percent of the limit (0–100). The rule fires when the sampled value crosses this threshold.",
+      "thresholdHintBytes": "Absolute bytes (e.g. 536870912 for 512 MiB). The rule fires when sampled memory crosses this threshold.",
+      "matchShape": "Match shape",
+      "matchShapeOpts": "SEVERITY · COOLDOWN",
+      "severity": "Severity",
+      "cooldown": "Cooldown (s)",
+      "cooldownHint": "Cooldown is per-rule per-container — the same rule firing on two containers stays independent. It caps how often a sustained breach re-emits to event_log.",
+      "scope": "Scope",
+      "scopeHint": "Workload-scoped rules apply only to that workload's containers. Leave empty to apply the rule to every workload.",
+      "scopeGlobal": "Global (applies to every workload)",
+      "scopePick": "Pick workload…",
+      "scopePickTitle": "Pick a workload",
+      "scopeClear": "Make global",
+      "scopeSelected": "Workload",
+      "scopeUnknown": "Unknown workload",
+      "enabled": "Enabled",
+      "enabledHint": "Disabled rules stay in the table but never fire.",
+      "required": "REQUIRED",
+      "optional": "OPTIONAL",
+      "submit": "Forge alert",
+      "submitting": "Forging…"
+    },
+    "metric": {
+      "cpu_percent": "CPU %",
+      "memory_percent": "Memory %",
+      "memory_bytes": "Memory (bytes)"
+    },
+    "metricShort": {
+      "cpu": "CPU",
+      "memory": "Memory"
+    },
+    "comparator": {
+      "gt": "greater than",
+      "lt": "less than"
+    },
+    "unit": {
+      "percent": "%",
+      "bytes": "bytes"
+    },
+    "scope": {
+      "global": "global",
+      "workload": "workload {id}"
+    },
+    "status": {
+      "enabled": "enabled",
+      "disabled": "disabled"
+    }
+  },
  "logscan": {
    "title": "Log scan rules",
    "titleNew": "Forge a new rule",
@@ -17,6 +17,7 @@
    "apps": "Приложения",
    "eventTriggers": "Триггеры событий",
    "logScanRules": "Лог-правила",
+    "metricAlertRules": "Метрик-алерты",
    "triggers": "Триггеры",
    "proxies": "Прокси",
    "events": "События",
@@ -887,6 +888,105 @@
      "disabled": "выключен"
    }
  },
+  "metricalert": {
+    "title": "Правила метрик-алертов",
+    "titleNew": "Создать новый алерт",
+    "titleSingular": "Правило алерта",
+    "lede": "Пороговые проверки, которые наблюдатель выполняет по выборкам CPU и памяти каждого запущенного контейнера. Когда выборка пересекает порог, правило записывается в event_log с указанной важностью, откуда триггеры событий подхватывают его и рассылают по настроенным вебхукам. Включено {enabled} из {total}.",
+    "ledeNew": "Выберите метрику, оператор сравнения и порог. Оставьте поле рабочей нагрузки пустым, чтобы создать глобальное правило для всех нагрузок, или ограничьте его одной нагрузкой.",
+    "stat": {
+      "total": "ВСЕГО",
+      "global": "ГЛОБАЛЬНЫЕ",
+      "workload": "НАГРУЗКА",
+      "enabled": "ВКЛЮЧЕНО"
+    },
+    "toolbar": {
+      "newButton": "Новый алерт",
+      "backToList": "К списку алертов"
+    },
+    "filter": {
+      "all": "ВСЕ",
+      "global": "ГЛОБАЛЬНЫЕ",
+      "workload": "НАГРУЗКА"
+    },
+    "empty": {
+      "heading": "Пока нет правил алертов",
+      "body": "Начните с глобального правила, например «CPU больше 80%», затем сузьте его, ограничив правило отдельной рабочей нагрузкой.",
+      "cta": "Создать первый алерт"
+    },
+    "list": {
+      "name": "Название",
+      "condition": "Условие",
+      "scope": "Область",
+      "severity": "Важность",
+      "status": "Статус",
+      "open": "Открыть"
+    },
+    "detail": {
+      "config": "Конфигурация",
+      "configSub": "id #{id} · область {scope}",
+      "dangerZone": "Опасная зона",
+      "dangerZoneSub": "Удаление правила алерта немедленно убирает его и прекращает срабатывания.",
+      "deleteButton": "Удалить алерт",
+      "deleteTitle": "Удалить правило алерта?",
+      "deleteMessage": "Правило «{name}» будет удалено немедленно и перестанет срабатывать."
+    },
+    "form": {
+      "name": "Название",
+      "namePlaceholder": "напр. Перегрузка CPU воркера",
+      "condition": "Условие",
+      "metric": "Метрика",
+      "comparator": "Оператор",
+      "threshold": "Порог",
+      "thresholdPlaceholder": "напр. 80",
+      "thresholdHintPercent": "Процент от лимита (0–100). Правило срабатывает, когда выборка пересекает этот порог.",
+      "thresholdHintBytes": "Абсолютные байты (напр. 536870912 для 512 МиБ). Правило срабатывает, когда выборка памяти пересекает этот порог.",
+      "matchShape": "Параметры срабатывания",
+      "matchShapeOpts": "ВАЖНОСТЬ · ЗАДЕРЖКА",
+      "severity": "Важность",
+      "cooldown": "Задержка (с)",
+      "cooldownHint": "Задержка действует на каждое правило и контейнер отдельно — одно правило на двух контейнерах работает независимо. Она ограничивает, как часто длительное превышение повторно пишется в event_log.",
+      "scope": "Область",
+      "scopeHint": "Правила, привязанные к нагрузке, применяются только к её контейнерам. Оставьте пустым, чтобы применить правило ко всем нагрузкам.",
+      "scopeGlobal": "Глобально (применяется ко всем нагрузкам)",
+      "scopePick": "Выбрать нагрузку…",
+      "scopePickTitle": "Выберите нагрузку",
+      "scopeClear": "Сделать глобальным",
+      "scopeSelected": "Нагрузка",
+      "scopeUnknown": "Неизвестная нагрузка",
+      "enabled": "Включено",
+      "enabledHint": "Отключённые правила остаются в таблице, но не срабатывают.",
+      "required": "ОБЯЗАТЕЛЬНО",
+      "optional": "НЕОБЯЗАТЕЛЬНО",
+      "submit": "Создать алерт",
+      "submitting": "Создаём…"
+    },
+    "metric": {
+      "cpu_percent": "CPU %",
+      "memory_percent": "Память %",
+      "memory_bytes": "Память (байты)"
+    },
+    "metricShort": {
+      "cpu": "CPU",
+      "memory": "Память"
+    },
+    "comparator": {
+      "gt": "больше чем",
+      "lt": "меньше чем"
+    },
+    "unit": {
+      "percent": "%",
+      "bytes": "байт"
+    },
+    "scope": {
+      "global": "глобально",
+      "workload": "нагрузка {id}"
+    },
+    "status": {
+      "enabled": "включено",
+      "disabled": "отключено"
+    }
+  },
  "logscan": {
    "title": "Правила сканирования логов",
    "titleNew": "Новое правило",