feat(alerts): metric-threshold alerting (backend + API)

Operators can define metric-threshold alert rules (cpu_percent, memory_percent, memory_bytes; gt/lt) per-workload or global via /api/metric-alert-rules. A periodic evaluator (internal/metricalert, 30s tick) checks the freshest container stats sample per container against enabled rules and, on breach (per-rule-per-workload cooldown), emits into the existing event_log + bus pipeline (source "metric_alert", workload_id set). Alerts therefore surface on the global events page, the per-app activity timeline, and any configured event-trigger webhook -- no new notification plumbing. Mirrors the log_scan_rules store/API/route patterns and the stats.Collector lifecycle. Rule CRUD reads are authed, mutations AdminOnly. Frontend rule-config UI is a follow-up phase. Reviewed: go APPROVE (0 CRITICAL/HIGH).
2026-05-29 14:06:23 +03:00
parent 5c17885197
commit cdb9fd57d1
11 changed files with 1299 additions and 0 deletions
@@ -28,6 +28,7 @@ import (
 	"github.com/alexei/tinyforge/internal/health"
 	"github.com/alexei/tinyforge/internal/logging"
 	"github.com/alexei/tinyforge/internal/logscanner"
+	"github.com/alexei/tinyforge/internal/metricalert"
 	"github.com/alexei/tinyforge/internal/notify"
 	"github.com/alexei/tinyforge/internal/npm"
 	"github.com/alexei/tinyforge/internal/proxy"
@@ -390,6 +391,14 @@ func main() {
 	}
 	defer logScanMgr.Stop()

+	// Metric-alert manager: evaluates threshold rules against recent
+	// container stats samples and emits event_log entries on breach.
+	// The store satisfies RuleSource/SampleSource/EventSink; the event
+	// bus is the Publisher.
+	metricAlertMgr := metricalert.New(db, db, db, eventBus)
+	metricAlertMgr.Start()
+	defer metricAlertMgr.Stop()
+
 	// Build API server.
 	apiServer := api.NewServer(db, dockerClient, npmClient, proxyProvider, dep, notifier, webhookHandler, eventBus, encKey)
 	apiServer.SetStaleScanner(staleScanner)
@@ -451,6 +460,7 @@ func main() {
 	eventBus.Unsubscribe(notifySub)
 	staleScanner.Stop()
 	statsCollector.Stop()
+	metricAlertMgr.Stop()

 	// Drain in-progress deploys and notifications.
 	dep.Drain()
@@ -0,0 +1,235 @@
+// Package api: metric-alert rule HTTP handlers. The evaluator lives in
+// internal/metricalert; this file is the REST surface that lets
+// operators create, edit, and delete threshold rules. Mirrors the
+// log-scan rule handlers.
+package api
+
+import (
+	"errors"
+	"net/http"
+	"strconv"
+	"strings"
+
+	"github.com/go-chi/chi/v5"
+
+	"github.com/alexei/tinyforge/internal/store"
+)
+
+// metricAlertRuleInput is the JSON request body shared by the create
+// (POST) and update (PATCH) handlers. Every field is a pointer so a
+// handler can tell an omitted key (nil) from an explicitly supplied
+// empty or zero value.
+//
+// How the handlers in this file treat each field:
+//   - WorkloadID is read only on create; the update handler never copies
+//     it onto the stored rule.
+//   - Severity, CooldownSeconds and Enabled receive defaults on create
+//     when omitted (warn severity, 300 seconds, enabled).
+//   - On update, an empty Metric, Comparator or Severity string is
+//     treated the same as an omitted one and leaves the stored value
+//     untouched.
+type metricAlertRuleInput struct {
+	WorkloadID      *string  `json:"workload_id"`
+	Name            *string  `json:"name"`
+	Metric          *string  `json:"metric"`
+	Comparator      *string  `json:"comparator"`
+	Threshold       *float64 `json:"threshold"`
+	Severity        *string  `json:"severity"`
+	CooldownSeconds *int     `json:"cooldown_seconds"`
+	Enabled         *bool    `json:"enabled"`
+}
+
+// listMetricAlertRules handles GET /api/metric-alert-rules. Optional
+// query filter `workload_id=...` returns rules applying to that workload
+// (its own rows plus globals).
+func (s *Server) listMetricAlertRules(w http.ResponseWriter, r *http.Request) {
+	if wlID := r.URL.Query().Get("workload_id"); wlID != "" {
+		out, err := s.store.ListMetricAlertRulesByWorkload(wlID)
+		if err != nil {
+			respondError(w, http.StatusInternalServerError, "list metric alert rules")
+			return
+		}
+		respondJSON(w, http.StatusOK, out)
+		return
+	}
+	out, err := s.store.ListMetricAlertRules()
+	if err != nil {
+		respondError(w, http.StatusInternalServerError, "list metric alert rules")
+		return
+	}
+	respondJSON(w, http.StatusOK, out)
+}
+
+// getMetricAlertRule serves GET /api/metric-alert-rules/{id}: it parses
+// the path id, loads the rule, and writes it as JSON. Lookup failures
+// are translated by mapStoreError.
+func (s *Server) getMetricAlertRule(w http.ResponseWriter, r *http.Request) {
+	ruleID, valid := parseMetricAlertRuleID(w, r)
+	if !valid {
+		// parseMetricAlertRuleID has already written the 400.
+		return
+	}
+	found, lookupErr := s.store.GetMetricAlertRule(ruleID)
+	if lookupErr != nil {
+		mapStoreError(w, lookupErr, "metric alert rule")
+		return
+	}
+	respondJSON(w, http.StatusOK, found)
+}
+
+// createMetricAlertRule handles POST /api/metric-alert-rules.
+//
+// Omitted optional fields get defaults: severity falls back to
+// store.LogScanSeverityWarn, cooldown_seconds to 300, and enabled to
+// true. threshold has no sensible default and must be supplied: a
+// silently assumed 0 would make a "gt" rule fire on nearly every sample
+// and an "lt" rule never fire.
+//
+// Responses: 201 with the stored rule, 400 for a malformed body or a
+// validation failure (handler-side or store-side), 500 for any other
+// store error.
+func (s *Server) createMetricAlertRule(w http.ResponseWriter, r *http.Request) {
+	var in metricAlertRuleInput
+	if !decodeJSON(w, r, &in) {
+		return
+	}
+	rule := store.MetricAlertRule{
+		WorkloadID:      derefString(in.WorkloadID),
+		Name:            derefString(in.Name),
+		Metric:          derefString(in.Metric),
+		Comparator:      derefString(in.Comparator),
+		Threshold:       derefFloat64(in.Threshold),
+		Severity:        firstNonEmpty(derefString(in.Severity), store.LogScanSeverityWarn),
+		CooldownSeconds: derefIntDefault(in.CooldownSeconds, 300),
+		Enabled:         in.Enabled == nil || *in.Enabled,
+	}
+	if msg := validateMetricAlertInput(rule); msg != "" {
+		respondError(w, http.StatusBadRequest, msg)
+		return
+	}
+	// Checked after the shared validation so the existing error
+	// precedence (name, metric, comparator, ...) is unchanged. An
+	// explicit `"threshold": 0` is still accepted; only an absent key is
+	// rejected.
+	if in.Threshold == nil {
+		respondError(w, http.StatusBadRequest, "threshold is required")
+		return
+	}
+	out, err := s.store.CreateMetricAlertRule(rule)
+	if err != nil {
+		if isMetricAlertValidationErr(err) {
+			respondError(w, http.StatusBadRequest, err.Error())
+			return
+		}
+		respondError(w, http.StatusInternalServerError, "create metric alert rule")
+		return
+	}
+	respondJSON(w, http.StatusCreated, out)
+}
+
+// updateMetricAlertRule serves PATCH /api/metric-alert-rules/{id}. It
+// loads the stored rule, overlays whichever fields the request body
+// supplies, re-validates the merged result, and persists it.
+// workload_id in the body is ignored. Empty metric/comparator/severity
+// strings are treated as "not supplied"; an empty name is applied and
+// then rejected by validation.
+func (s *Server) updateMetricAlertRule(w http.ResponseWriter, r *http.Request) {
+	ruleID, valid := parseMetricAlertRuleID(w, r)
+	if !valid {
+		return
+	}
+	rule, err := s.store.GetMetricAlertRule(ruleID)
+	if err != nil {
+		mapStoreError(w, err, "metric alert rule")
+		return
+	}
+	var patch metricAlertRuleInput
+	if !decodeJSON(w, r, &patch) {
+		return
+	}
+
+	// Overlay the supplied fields onto the stored rule.
+	if patch.Name != nil {
+		rule.Name = *patch.Name
+	}
+	if v := patch.Metric; v != nil && *v != "" {
+		rule.Metric = *v
+	}
+	if v := patch.Comparator; v != nil && *v != "" {
+		rule.Comparator = *v
+	}
+	if patch.Threshold != nil {
+		rule.Threshold = *patch.Threshold
+	}
+	if v := patch.Severity; v != nil && *v != "" {
+		rule.Severity = *v
+	}
+	if patch.CooldownSeconds != nil {
+		rule.CooldownSeconds = *patch.CooldownSeconds
+	}
+	if patch.Enabled != nil {
+		rule.Enabled = *patch.Enabled
+	}
+
+	if problem := validateMetricAlertInput(rule); problem != "" {
+		respondError(w, http.StatusBadRequest, problem)
+		return
+	}
+
+	saved, err := s.store.UpdateMetricAlertRule(rule)
+	switch {
+	case err == nil:
+		respondJSON(w, http.StatusOK, saved)
+	case errors.Is(err, store.ErrNotFound):
+		// The row vanished between the read above and this write.
+		respondNotFound(w, "metric alert rule")
+	case isMetricAlertValidationErr(err):
+		respondError(w, http.StatusBadRequest, err.Error())
+	default:
+		respondError(w, http.StatusInternalServerError, "update metric alert rule")
+	}
+}
+
+// deleteMetricAlertRule serves DELETE /api/metric-alert-rules/{id} and
+// answers 204 with no body on success. Store failures are translated by
+// mapStoreError.
+func (s *Server) deleteMetricAlertRule(w http.ResponseWriter, r *http.Request) {
+	ruleID, valid := parseMetricAlertRuleID(w, r)
+	if !valid {
+		return
+	}
+	deleteErr := s.store.DeleteMetricAlertRule(ruleID)
+	if deleteErr != nil {
+		mapStoreError(w, deleteErr, "metric alert rule")
+		return
+	}
+	w.WriteHeader(http.StatusNoContent)
+}
+
+// validateMetricAlertInput checks a rule at the HTTP boundary and
+// returns a human-readable problem description, or "" when the rule is
+// acceptable. Checks run in a fixed order (name, metric, comparator,
+// severity, cooldown) and the first failure wins. An empty severity is
+// allowed.
+func validateMetricAlertInput(rule store.MetricAlertRule) string {
+	if strings.TrimSpace(rule.Name) == "" {
+		return "name is required"
+	}
+	if m := rule.Metric; m != store.MetricCPUPercent &&
+		m != store.MetricMemoryPercent &&
+		m != store.MetricMemoryBytes {
+		return "invalid metric: must be cpu_percent, memory_percent, or memory_bytes"
+	}
+	if c := rule.Comparator; c != store.MetricComparatorGT && c != store.MetricComparatorLT {
+		return "invalid comparator: must be gt or lt"
+	}
+	if sev := rule.Severity; sev != "" &&
+		sev != store.LogScanSeverityInfo &&
+		sev != store.LogScanSeverityWarn &&
+		sev != store.LogScanSeverityError {
+		return "invalid severity: must be info, warn, or error"
+	}
+	if rule.CooldownSeconds < 0 {
+		return "cooldown_seconds must be >= 0"
+	}
+	return ""
+}
+
+// isMetricAlertValidationErr reports whether err looks like one of the
+// known rule-validation failures, judged by substring match on its
+// message. Callers use it to answer 400 instead of 500. A nil error is
+// never a validation error.
+func isMetricAlertValidationErr(err error) bool {
+	if err == nil {
+		return false
+	}
+	text := err.Error()
+	switch {
+	case strings.Contains(text, "name is required"),
+		strings.Contains(text, "invalid metric"),
+		strings.Contains(text, "invalid comparator"),
+		strings.Contains(text, "invalid severity"),
+		strings.Contains(text, "cooldown_seconds must be"):
+		return true
+	}
+	return false
+}
+
+// parseMetricAlertRuleID reads the {id} path parameter as a positive
+// base-10 int64. On failure it writes a 400 response itself and returns
+// (0, false), so the caller only needs to return.
+func parseMetricAlertRuleID(w http.ResponseWriter, r *http.Request) (int64, bool) {
+	parsed, parseErr := strconv.ParseInt(chi.URLParam(r, "id"), 10, 64)
+	if parseErr == nil && parsed > 0 {
+		return parsed, true
+	}
+	respondError(w, http.StatusBadRequest, "invalid rule id")
+	return 0, false
+}
+
+// derefFloat64 returns the value p points at, or 0 when p is nil.
+func derefFloat64(p *float64) float64 {
+	if p != nil {
+		return *p
+	}
+	return 0
+}
@@ -431,6 +431,16 @@ func (s *Server) Router() chi.Router {
 				r.Post("/log-scan-rules/{id}/test", s.testLogScanRule)
 			})

+			// Metric-alert rules.
+			r.Get("/metric-alert-rules", s.listMetricAlertRules)
+			r.Get("/metric-alert-rules/{id}", s.getMetricAlertRule)
+			r.Group(func(r chi.Router) {
+				r.Use(auth.AdminOnly)
+				r.Post("/metric-alert-rules", s.createMetricAlertRule)
+				r.Patch("/metric-alert-rules/{id}", s.updateMetricAlertRule)
+				r.Delete("/metric-alert-rules/{id}", s.deleteMetricAlertRule)
+			})
+
 			// System resources (read-only).
 			r.Get("/system/stats", s.getSystemStats)
 			r.Get("/system/stats/history", s.getSystemStatsHistory)
@@ -0,0 +1,349 @@
+// Package metricalert implements a background goroutine that
+// periodically evaluates operator-configured metric-threshold rules
+// against recent container stats samples. On breach (subject to a
+// per-rule-per-workload cooldown) it emits an event into the existing
+// event_log + event-bus pipeline — the same fan-out used by the
+// log-scanner — instead of building any new notification plumbing.
+package metricalert
+
+import (
+	"encoding/json"
+	"fmt"
+	"log/slog"
+	"sync"
+	"time"
+
+	"github.com/alexei/tinyforge/internal/events"
+	"github.com/alexei/tinyforge/internal/store"
+)
+
+// EvalInterval is the period of the evaluator's ticker: after the
+// initial pass, rules are re-evaluated once per interval.
+const EvalInterval = 30 * time.Second
+
+// lookbackSeconds bounds how far back samples are pulled on each pass:
+// evaluate asks the sample source for everything newer than
+// now - lookbackSeconds. Per the original author's note, stats are
+// collected at most every few seconds (see internal/stats, not visible
+// here), so a 120s window comfortably captures the latest reading per
+// container even if collection briefly stalls.
+const lookbackSeconds = 120
+
+// RuleSource is the read-side seam for fetching the current rule rows.
+// The evaluator calls it at the start of every pass, so rule edits are
+// picked up on the next pass. Real callers pass *store.Store; tests
+// pass a fake.
+type RuleSource interface {
+	ListMetricAlertRules() ([]store.MetricAlertRule, error)
+}
+
+// SampleSource fetches the recent container stats samples to evaluate.
+// The evaluator passes sinceTS as a Unix-seconds lower bound
+// (now - lookbackSeconds) and keeps only the newest sample per
+// container from whatever is returned.
+type SampleSource interface {
+	ListAllRecentContainerStatsSamples(sinceTS int64) ([]store.ContainerStatsSample, error)
+}
+
+// EventSink writes a breach into event_log. The evaluator copies the ID
+// and CreatedAt of the returned row into the event it then publishes on
+// the bus.
+type EventSink interface {
+	InsertEvent(store.EventLog) (store.EventLog, error)
+}
+
+// Publisher fans the breach out on the event bus. Matches *events.Bus.
+// The manager tolerates a nil Publisher: the breach is still persisted,
+// it is just not published.
+type Publisher interface {
+	Publish(events.Event)
+}
+
+// eventSource is the Source value stamped on every metric-alert event,
+// both in the event_log row and in the published bus payload.
+const eventSource = "metric_alert"
+
+// Manager owns the evaluation loop lifecycle: a once-guarded Start/Stop
+// pair with stop/done channels and a single-goroutine run loop. The
+// original author describes this as mirroring stats.Collector (that
+// type is not visible in this file).
+type Manager struct {
+	// Injected collaborators; see the interface declarations for what
+	// each one is used for.
+	rules   RuleSource
+	samples SampleSource
+	sink    EventSink
+	pub     Publisher
+
+	// now supplies the timestamp for each evaluation pass. It is
+	// swappable in tests so cooldown windows can be exercised
+	// deterministically. New sets it to time.Now.
+	now func() time.Time
+
+	// mu guards lastFired. The run loop is single-goroutine today, but
+	// Start/Stop and a future ReloadRules may touch shared state; the
+	// mutex is cheap insurance.
+	mu        sync.Mutex
+	lastFired map[string]time.Time // "ruleID:ownerID" -> last emit time
+
+	// Lifecycle state. started is set by Start and read by Stop. stop is
+	// closed by Stop to signal the loop. done is closed by the loop when
+	// it exits, or by Stop itself when the loop was never started.
+	startOnce sync.Once
+	stopOnce  sync.Once
+	started   bool
+	stop      chan struct{}
+	done      chan struct{}
+}
+
+// New builds a Manager around the supplied dependencies, using the real
+// clock, an empty cooldown table, and fresh lifecycle channels. Nothing
+// runs until Start is called.
+func New(rules RuleSource, samples SampleSource, sink EventSink, pub Publisher) *Manager {
+	m := new(Manager)
+	m.rules, m.samples = rules, samples
+	m.sink, m.pub = sink, pub
+	m.now = time.Now
+	m.lastFired = make(map[string]time.Time)
+	m.stop = make(chan struct{})
+	m.done = make(chan struct{})
+	return m
+}
+
+// Start launches the background loop and returns immediately. The loop
+// exits when Stop is called. Safe to call multiple times — only the
+// first call has an effect.
+//
+// Calling Start after Stop is a no-op. In that order Stop has already
+// closed the done channel on the loop's behalf, so launching the loop
+// anyway would make it close done a second time on exit and panic.
+func (m *Manager) Start() {
+	m.startOnce.Do(func() {
+		select {
+		case <-m.stop:
+			// Stop already ran; leave the manager stopped.
+			return
+		default:
+		}
+		m.started = true
+		go m.run()
+	})
+}
+
+// Stop asks the loop to exit and then waits for it to finish any pass
+// already in progress. When the loop was never started there is nothing
+// to wait for, so Stop closes the done channel itself and returns at
+// once. Repeated calls are safe: the signalling happens only once and
+// every call then waits on the same done channel.
+func (m *Manager) Stop() {
+	signal := func() {
+		close(m.stop)
+		if m.started {
+			// The run loop closes done on its way out.
+			return
+		}
+		close(m.done)
+	}
+	m.stopOnce.Do(signal)
+	<-m.done
+}
+
+// run is the body of the background goroutine. It waits a short settle
+// period, evaluates once, and then evaluates again on every
+// EvalInterval tick until the stop channel is closed. done is closed on
+// every exit path so Stop can unblock.
+func (m *Manager) run() {
+	defer close(m.done)
+
+	// Give the app and the first stats samples a moment to exist before
+	// the first pass; bail out early if Stop arrives meanwhile.
+	settled := time.After(3 * time.Second)
+	select {
+	case <-m.stop:
+		return
+	case <-settled:
+	}
+
+	ticker := time.NewTicker(EvalInterval)
+	defer ticker.Stop()
+
+	m.evaluate(m.now())
+	for {
+		select {
+		case <-ticker.C:
+			m.evaluate(m.now())
+		case <-m.stop:
+			return
+		}
+	}
+}
+
+// evaluate performs a single pass at time now:
+//
+//  1. load all rules (stop if that fails or there are none — samples
+//     are not fetched in that case);
+//  2. load samples newer than now - lookbackSeconds and keep the newest
+//     one per (owner, container);
+//  3. for every enabled rule and every such sample in scope, emit an
+//     event when the rule's threshold is breached and the
+//     rule/workload pair is not cooling down.
+//
+// Failures are logged and end the pass; they never propagate. The
+// cooldown timestamp is recorded after emit regardless of whether emit
+// managed to persist the event.
+func (m *Manager) evaluate(now time.Time) {
+	rules, err := m.rules.ListMetricAlertRules()
+	if err != nil {
+		slog.Warn("metricalert: list rules", "error", err)
+		return
+	}
+	if len(rules) == 0 {
+		return
+	}
+
+	cutoff := now.Unix() - lookbackSeconds
+	recent, err := m.samples.ListAllRecentContainerStatsSamples(cutoff)
+	if err != nil {
+		slog.Warn("metricalert: list samples", "error", err)
+		return
+	}
+	freshest := latestPerContainer(recent)
+	if len(freshest) == 0 {
+		return
+	}
+
+	for _, rule := range rules {
+		if !rule.Enabled {
+			continue
+		}
+		// An empty WorkloadID marks a global rule that matches every
+		// workload.
+		global := rule.WorkloadID == ""
+		for _, sample := range freshest {
+			if !global && sample.OwnerID != rule.WorkloadID {
+				continue
+			}
+			// judgeable is false when the metric cannot be computed for
+			// this sample (e.g. memory_percent with a zero limit).
+			value, judgeable := metricValue(rule.Metric, sample)
+			if !judgeable || !breached(rule.Comparator, value, rule.Threshold) {
+				continue
+			}
+			if m.coolingDown(rule, sample.OwnerID, now) {
+				continue
+			}
+			m.emit(rule, sample, value)
+			m.recordFire(rule, sample.OwnerID, now)
+		}
+	}
+}
+
+// latestPerContainer reduces samples to one entry per
+// (OwnerID, ContainerID) pair: the one with the highest TS. When two
+// samples for a pair share the same TS, the first one seen is kept. The
+// result comes from map iteration, so its order is unspecified.
+func latestPerContainer(samples []store.ContainerStatsSample) []store.ContainerStatsSample {
+	byContainer := make(map[string]store.ContainerStatsSample)
+	for _, candidate := range samples {
+		id := candidate.OwnerID + "\x00" + candidate.ContainerID
+		current, seen := byContainer[id]
+		if seen && candidate.TS <= current.TS {
+			continue
+		}
+		byContainer[id] = candidate
+	}
+
+	result := make([]store.ContainerStatsSample, 0, len(byContainer))
+	for id := range byContainer {
+		result = append(result, byContainer[id])
+	}
+	return result
+}
+
+// metricValue extracts the reading named by metric from sample s. The
+// boolean is false when no reading can be produced: the metric name is
+// unknown, or memory_percent is requested while the sample's memory
+// limit is zero or negative (which would otherwise divide by zero).
+func metricValue(metric string, s store.ContainerStatsSample) (float64, bool) {
+	if metric == store.MetricCPUPercent {
+		return s.CPUPercent, true
+	}
+	if metric == store.MetricMemoryBytes {
+		return float64(s.MemoryUsage), true
+	}
+	if metric != store.MetricMemoryPercent || s.MemoryLimit <= 0 {
+		return 0, false
+	}
+	return float64(s.MemoryUsage) / float64(s.MemoryLimit) * 100, true
+}
+
+// breached reports whether value is strictly beyond threshold in the
+// direction the comparator names. A value equal to the threshold is not
+// a breach, and an unrecognised comparator never breaches.
+func breached(comparator string, value, threshold float64) bool {
+	if comparator == store.MetricComparatorGT {
+		return value > threshold
+	}
+	return comparator == store.MetricComparatorLT && value < threshold
+}
+
+// cooldownKey builds the cooldown-table key for one rule/workload pair,
+// in the form "<ruleID>:<ownerID>".
+func cooldownKey(ruleID int64, ownerID string) string {
+	return fmt.Sprint(ruleID) + ":" + ownerID
+}
+
+// coolingDown reports whether rule fired for ownerID so recently that a
+// new event should be suppressed. A non-positive CooldownSeconds
+// disables the cooldown, and a pair that has never fired is never
+// cooling down.
+func (m *Manager) coolingDown(rule store.MetricAlertRule, ownerID string, now time.Time) bool {
+	if rule.CooldownSeconds <= 0 {
+		return false
+	}
+	key := cooldownKey(rule.ID, ownerID)
+
+	m.mu.Lock()
+	firedAt, seen := m.lastFired[key]
+	m.mu.Unlock()
+
+	if !seen {
+		return false
+	}
+	window := time.Duration(rule.CooldownSeconds) * time.Second
+	return now.Sub(firedAt) < window
+}
+
+// recordFire stores now as the most recent fire time for the
+// rule/workload pair, starting (or restarting) its cooldown window.
+func (m *Manager) recordFire(rule store.MetricAlertRule, ownerID string, now time.Time) {
+	key := cooldownKey(rule.ID, ownerID)
+
+	m.mu.Lock()
+	defer m.mu.Unlock()
+	m.lastFired[key] = now
+}
+
+// emit records one breach: it writes an event_log row through the sink
+// and, when a publisher is configured, publishes the same event on the
+// bus using the ID and CreatedAt of the stored row. The sample's
+// OwnerID becomes the event's WorkloadID. Metadata is produced with
+// json.Marshal rather than string concatenation. A marshal or insert
+// failure is logged and ends the call without publishing; nothing is
+// returned to the caller.
+func (m *Manager) emit(rule store.MetricAlertRule, sample store.ContainerStatsSample, value float64) {
+	encoded, err := json.Marshal(map[string]any{
+		"workload_id": sample.OwnerID,
+		"rule":        rule.Name,
+		"metric":      rule.Metric,
+		"value":       value,
+		"threshold":   rule.Threshold,
+		"comparator":  rule.Comparator,
+	})
+	if err != nil {
+		slog.Error("metricalert: marshal metadata", "rule", rule.Name, "error", err)
+		return
+	}
+	metadata := string(encoded)
+	text := formatMessage(rule, value)
+
+	// Rules stored without a severity are reported at warn level.
+	level := rule.Severity
+	if level == "" {
+		level = store.LogScanSeverityWarn
+	}
+
+	row, err := m.sink.InsertEvent(store.EventLog{
+		Source:     eventSource,
+		Severity:   level,
+		Message:    text,
+		WorkloadID: sample.OwnerID,
+		Metadata:   metadata,
+	})
+	if err != nil {
+		slog.Error("metricalert: persist event", "rule", rule.Name, "error", err)
+		return
+	}
+
+	if m.pub == nil {
+		return
+	}
+	payload := events.EventLogPayload{
+		ID:         row.ID,
+		Source:     eventSource,
+		WorkloadID: sample.OwnerID,
+		Severity:   level,
+		Message:    text,
+		Metadata:   metadata,
+		CreatedAt:  row.CreatedAt,
+	}
+	m.pub.Publish(events.Event{Type: events.EventLog, Payload: payload})
+}
+
+// formatMessage builds a concise, human, secret-free breach line such
+// as "cpu-hot: CPU is 80.4% (threshold > 80.0%)". The only
+// operator-supplied text is rule.Name; the rest are numbers and fixed
+// labels.
+//
+// Percent metrics are printed with one decimal place. Rounding them to
+// whole numbers could show the reading as equal to the threshold it
+// had just crossed (80.4 against 80 rendering as "80% ... > 80%").
+// Byte counts are printed without decimals.
+func formatMessage(rule store.MetricAlertRule, value float64) string {
+	label, unit := metricLabelUnit(rule.Metric)
+	word := comparatorWord(rule.Comparator)
+
+	precision := 0
+	switch rule.Metric {
+	case store.MetricCPUPercent, store.MetricMemoryPercent:
+		precision = 1
+	}
+	// %.*f takes its precision from the preceding int argument.
+	return fmt.Sprintf("%s: %s is %.*f%s (threshold %s %.*f%s)",
+		rule.Name, label, precision, value, unit, word, precision, rule.Threshold, unit)
+}
+
+// metricLabelUnit maps a metric name to the display label and unit
+// suffix used in breach messages. An unknown metric is shown under its
+// raw name with no unit.
+func metricLabelUnit(metric string) (label, unit string) {
+	switch metric {
+	case store.MetricCPUPercent:
+		label, unit = "CPU", "%"
+	case store.MetricMemoryPercent:
+		label, unit = "Memory", "%"
+	case store.MetricMemoryBytes:
+		label, unit = "Memory", " bytes"
+	default:
+		label = metric
+	}
+	return label, unit
+}
+
+// comparatorWord maps a comparator name to the symbol shown in breach
+// messages; an unknown comparator is returned unchanged.
+func comparatorWord(comparator string) string {
+	if comparator == store.MetricComparatorGT {
+		return ">"
+	}
+	if comparator == store.MetricComparatorLT {
+		return "<"
+	}
+	return comparator
+}
@@ -0,0 +1,284 @@
+package metricalert
+
+import (
+	"testing"
+	"time"
+
+	"github.com/alexei/tinyforge/internal/events"
+	"github.com/alexei/tinyforge/internal/store"
+)
+
+// --- fakes -----------------------------------------------------------
+
+// fakeRules is a canned RuleSource: it hands back the configured rules
+// and error on every call.
+type fakeRules struct {
+	rules []store.MetricAlertRule
+	err   error
+}
+
+// ListMetricAlertRules returns the canned rules and error unchanged.
+func (src *fakeRules) ListMetricAlertRules() ([]store.MetricAlertRule, error) {
+	return src.rules, src.err
+}
+
+// fakeSamples is a canned SampleSource that also remembers the sinceTS
+// argument of the most recent call.
+type fakeSamples struct {
+	samples []store.ContainerStatsSample
+	err     error
+	since   int64 // sinceTS passed to the most recent call
+}
+
+// ListAllRecentContainerStatsSamples records sinceTS and returns the
+// canned samples and error without filtering.
+func (src *fakeSamples) ListAllRecentContainerStatsSamples(sinceTS int64) ([]store.ContainerStatsSample, error) {
+	src.since = sinceTS
+	return src.samples, src.err
+}
+
+// recordedEvent wraps one event accepted by fakeSink.
+type recordedEvent struct {
+	evt store.EventLog
+}
+
+// fakeSink is an in-memory EventSink. It either fails every insert with
+// err or records the event after assigning it a sequential ID and a
+// fixed CreatedAt.
+type fakeSink struct {
+	events []recordedEvent
+	err    error
+	nextID int64
+}
+
+// InsertEvent stores e and returns it with ID and CreatedAt filled in;
+// when the sink is configured with an error it stores nothing.
+func (sink *fakeSink) InsertEvent(e store.EventLog) (store.EventLog, error) {
+	if sink.err != nil {
+		return store.EventLog{}, sink.err
+	}
+	sink.nextID++
+	e.ID, e.CreatedAt = sink.nextID, "2026-05-29T00:00:00Z"
+	sink.events = append(sink.events, recordedEvent{evt: e})
+	return e, nil
+}
+
+// fakePublisher is a Publisher that keeps every published event, in
+// order, for later inspection.
+type fakePublisher struct {
+	published []events.Event
+}
+
+// Publish appends e to the recorded events.
+func (bus *fakePublisher) Publish(e events.Event) {
+	bus.published = append(bus.published, e)
+}
+
+func newManager(rules []store.MetricAlertRule, samples []store.ContainerStatsSample) (*Manager, *fakeSink, *fakePublisher) {
+	sink := &fakeSink{}
+	pub := &fakePublisher{}
+	m := New(&fakeRules{rules: rules}, &fakeSamples{samples: samples}, sink, pub)
+	return m, sink, pub
+}
+
+// --- tests -----------------------------------------------------------
+
+// TestEvaluate_BreachEmits checks that a breaching sample produces one
+// persisted event and one published event carrying the rule's severity,
+// the sample's workload, and populated metadata.
+func TestEvaluate_BreachEmits(t *testing.T) {
+	hotRule := store.MetricAlertRule{
+		ID:              1,
+		Name:            "cpu-hot",
+		Metric:          store.MetricCPUPercent,
+		Comparator:      store.MetricComparatorGT,
+		Threshold:       80,
+		Severity:        "error",
+		CooldownSeconds: 300,
+		Enabled:         true,
+	}
+	reading := store.ContainerStatsSample{
+		ContainerID: "c1",
+		OwnerID:     "w1",
+		OwnerType:   "instance",
+		TS:          100,
+		CPUPercent:  95,
+	}
+	mgr, sink, bus := newManager(
+		[]store.MetricAlertRule{hotRule},
+		[]store.ContainerStatsSample{reading},
+	)
+
+	mgr.evaluate(time.Unix(200, 0))
+
+	if n := len(sink.events); n != 1 {
+		t.Fatalf("expected 1 event, got %d", n)
+	}
+	stored := sink.events[0].evt
+	if stored.Source != "metric_alert" {
+		t.Errorf("source = %q, want metric_alert", stored.Source)
+	}
+	if stored.Severity != "error" {
+		t.Errorf("severity = %q, want error", stored.Severity)
+	}
+	if stored.WorkloadID != "w1" {
+		t.Errorf("workload_id = %q, want w1", stored.WorkloadID)
+	}
+	if stored.Metadata == "" || stored.Metadata == "{}" {
+		t.Errorf("metadata should be populated JSON, got %q", stored.Metadata)
+	}
+
+	if n := len(bus.published); n != 1 {
+		t.Fatalf("expected 1 published event, got %d", n)
+	}
+	payload, isLogPayload := bus.published[0].Payload.(events.EventLogPayload)
+	if !isLogPayload {
+		t.Fatalf("published payload is not EventLogPayload")
+	}
+	if payload.WorkloadID != "w1" || payload.Source != "metric_alert" {
+		t.Errorf("payload workload/source mismatch: %+v", payload)
+	}
+}
+
+// TestEvaluate_NoBreachNoEmit checks that a reading below a "gt"
+// threshold produces no event.
+func TestEvaluate_NoBreachNoEmit(t *testing.T) {
+	rule := store.MetricAlertRule{
+		ID:         1,
+		Name:       "cpu-hot",
+		Metric:     store.MetricCPUPercent,
+		Comparator: store.MetricComparatorGT,
+		Threshold:  80,
+		Enabled:    true,
+	}
+	calm := store.ContainerStatsSample{ContainerID: "c1", OwnerID: "w1", TS: 100, CPUPercent: 10}
+	mgr, sink, _ := newManager(
+		[]store.MetricAlertRule{rule},
+		[]store.ContainerStatsSample{calm},
+	)
+
+	mgr.evaluate(time.Unix(200, 0))
+
+	if n := len(sink.events); n != 0 {
+		t.Fatalf("expected no events for non-breach, got %d", n)
+	}
+}
+
+// TestEvaluate_DisabledRuleSkipped checks that a disabled rule emits
+// nothing even when its threshold is breached.
+func TestEvaluate_DisabledRuleSkipped(t *testing.T) {
+	disabled := store.MetricAlertRule{
+		ID:         1,
+		Name:       "cpu-hot",
+		Metric:     store.MetricCPUPercent,
+		Comparator: store.MetricComparatorGT,
+		Threshold:  80,
+		Enabled:    false,
+	}
+	hot := store.ContainerStatsSample{ContainerID: "c1", OwnerID: "w1", TS: 100, CPUPercent: 95}
+	mgr, sink, _ := newManager(
+		[]store.MetricAlertRule{disabled},
+		[]store.ContainerStatsSample{hot},
+	)
+
+	mgr.evaluate(time.Unix(200, 0))
+
+	if n := len(sink.events); n != 0 {
+		t.Fatalf("disabled rule should not emit, got %d", n)
+	}
+}
+
+// TestEvaluate_PerWorkloadScoping checks that a rule bound to one
+// workload ignores a breaching sample that belongs to another.
+func TestEvaluate_PerWorkloadScoping(t *testing.T) {
+	scoped := store.MetricAlertRule{
+		ID:         1,
+		Name:       "w2-only",
+		WorkloadID: "w2",
+		Metric:     store.MetricCPUPercent,
+		Comparator: store.MetricComparatorGT,
+		Threshold:  80,
+		Enabled:    true,
+	}
+	readings := []store.ContainerStatsSample{
+		// Breaches, but belongs to a workload the rule does not cover.
+		{ContainerID: "c1", OwnerID: "w1", TS: 100, CPUPercent: 95},
+		// Breaches and belongs to the rule's workload.
+		{ContainerID: "c2", OwnerID: "w2", TS: 100, CPUPercent: 95},
+	}
+	mgr, sink, _ := newManager([]store.MetricAlertRule{scoped}, readings)
+
+	mgr.evaluate(time.Unix(200, 0))
+
+	if n := len(sink.events); n != 1 {
+		t.Fatalf("expected 1 event (only w2), got %d", n)
+	}
+	if owner := sink.events[0].evt.WorkloadID; owner != "w2" {
+		t.Errorf("event should be scoped to w2, got %q", owner)
+	}
+}
+
+// TestEvaluate_GlobalRuleMatchesAll checks that a rule with an empty
+// WorkloadID fires once for each breaching workload.
+func TestEvaluate_GlobalRuleMatchesAll(t *testing.T) {
+	global := store.MetricAlertRule{
+		ID:         1,
+		Name:       "global",
+		WorkloadID: "",
+		Metric:     store.MetricCPUPercent,
+		Comparator: store.MetricComparatorGT,
+		Threshold:  80,
+		Enabled:    true,
+	}
+	readings := []store.ContainerStatsSample{
+		{ContainerID: "c1", OwnerID: "w1", TS: 100, CPUPercent: 95},
+		{ContainerID: "c2", OwnerID: "w2", TS: 100, CPUPercent: 95},
+	}
+	mgr, sink, _ := newManager([]store.MetricAlertRule{global}, readings)
+
+	mgr.evaluate(time.Unix(200, 0))
+
+	if n := len(sink.events); n != 2 {
+		t.Fatalf("global rule should fire for both workloads, got %d", n)
+	}
+}
+
+// TestEvaluate_MemoryPercentDivByZeroSkip checks that a memory_percent
+// rule skips a sample whose memory limit is zero instead of dividing by
+// it.
+func TestEvaluate_MemoryPercentDivByZeroSkip(t *testing.T) {
+	memRule := store.MetricAlertRule{
+		ID:         1,
+		Name:       "mem",
+		Metric:     store.MetricMemoryPercent,
+		Comparator: store.MetricComparatorGT,
+		Threshold:  50,
+		Enabled:    true,
+	}
+	unlimited := store.ContainerStatsSample{
+		ContainerID: "c1",
+		OwnerID:     "w1",
+		TS:          100,
+		MemoryUsage: 1000,
+		MemoryLimit: 0,
+	}
+	mgr, sink, _ := newManager(
+		[]store.MetricAlertRule{memRule},
+		[]store.ContainerStatsSample{unlimited},
+	)
+
+	mgr.evaluate(time.Unix(200, 0))
+
+	if n := len(sink.events); n != 0 {
+		t.Fatalf("zero memory limit should be skipped for percent rule, got %d", n)
+	}
+}
+
+// TestEvaluate_MemoryPercentBreaches checks that usage of 950 against a
+// limit of 1000 (95%) breaches a 90% memory_percent threshold.
+func TestEvaluate_MemoryPercentBreaches(t *testing.T) {
+	memRule := store.MetricAlertRule{
+		ID:         1,
+		Name:       "mem",
+		Metric:     store.MetricMemoryPercent,
+		Comparator: store.MetricComparatorGT,
+		Threshold:  90,
+		Enabled:    true,
+	}
+	nearlyFull := store.ContainerStatsSample{
+		ContainerID: "c1",
+		OwnerID:     "w1",
+		TS:          100,
+		MemoryUsage: 950, // 95% of the limit below
+		MemoryLimit: 1000,
+	}
+	mgr, sink, _ := newManager(
+		[]store.MetricAlertRule{memRule},
+		[]store.ContainerStatsSample{nearlyFull},
+	)
+
+	mgr.evaluate(time.Unix(200, 0))
+
+	if n := len(sink.events); n != 1 {
+		t.Fatalf("95%% should breach 90%% threshold, got %d events", n)
+	}
+}
+
+// TestEvaluate_CooldownSuppressesSecondEmit checks that a second breach
+// inside the cooldown window is suppressed and that the rule fires
+// again once the window has elapsed.
+func TestEvaluate_CooldownSuppressesSecondEmit(t *testing.T) {
+	rule := store.MetricAlertRule{
+		ID:              1,
+		Name:            "cpu-hot",
+		Metric:          store.MetricCPUPercent,
+		Comparator:      store.MetricComparatorGT,
+		Threshold:       80,
+		CooldownSeconds: 300,
+		Enabled:         true,
+	}
+	hot := store.ContainerStatsSample{ContainerID: "c1", OwnerID: "w1", TS: 100, CPUPercent: 95}
+	mgr, sink, _ := newManager(
+		[]store.MetricAlertRule{rule},
+		[]store.ContainerStatsSample{hot},
+	)
+
+	start := time.Unix(1000, 0)
+	mgr.evaluate(start)
+	// Ten seconds on is still well inside the 300s window.
+	mgr.evaluate(start.Add(10 * time.Second))
+	if n := len(sink.events); n != 1 {
+		t.Fatalf("cooldown should suppress second emit, got %d events", n)
+	}
+
+	// One second past the window the rule may fire again.
+	mgr.evaluate(start.Add(301 * time.Second))
+	if n := len(sink.events); n != 2 {
+		t.Fatalf("should re-fire after cooldown elapses, got %d events", n)
+	}
+}
+
+// TestEvaluate_LatestSamplePerContainer feeds two samples for the same
+// container — an older calm reading and a newer breaching one — and
+// expects exactly one event, i.e. only the newest reading is judged.
+func TestEvaluate_LatestSamplePerContainer(t *testing.T) {
+	rule := store.MetricAlertRule{
+		ID:         1,
+		Name:       "cpu-hot",
+		Metric:     store.MetricCPUPercent,
+		Comparator: store.MetricComparatorGT,
+		Threshold:  80,
+		Enabled:    true,
+	}
+	history := []store.ContainerStatsSample{
+		{ContainerID: "c1", OwnerID: "w1", TS: 100, CPUPercent: 10},
+		{ContainerID: "c1", OwnerID: "w1", TS: 150, CPUPercent: 95},
+	}
+	mgr, sink, _ := newManager([]store.MetricAlertRule{rule}, history)
+
+	mgr.evaluate(time.Unix(200, 0))
+
+	if n := len(sink.events); n != 1 {
+		t.Fatalf("expected exactly 1 event from freshest sample, got %d", n)
+	}
+}
+
+// TestEvaluate_LessThanComparator checks that an "lt" rule fires when
+// the reading is below its threshold.
+func TestEvaluate_LessThanComparator(t *testing.T) {
+	idleRule := store.MetricAlertRule{
+		ID:         1,
+		Name:       "cpu-idle",
+		Metric:     store.MetricCPUPercent,
+		Comparator: store.MetricComparatorLT,
+		Threshold:  5,
+		Enabled:    true,
+	}
+	idle := store.ContainerStatsSample{ContainerID: "c1", OwnerID: "w1", TS: 100, CPUPercent: 1}
+	mgr, sink, _ := newManager(
+		[]store.MetricAlertRule{idleRule},
+		[]store.ContainerStatsSample{idle},
+	)
+
+	mgr.evaluate(time.Unix(200, 0))
+
+	if n := len(sink.events); n != 1 {
+		t.Fatalf("1%% < 5%% threshold should breach lt rule, got %d events", n)
+	}
+}
+
+// TestEvaluate_NoRulesNoFetch checks that a pass with no rules never
+// touches the sample source. The fake records the sinceTS of its last
+// call, so a value still at zero means it was never called.
+func TestEvaluate_NoRulesNoFetch(t *testing.T) {
+	source := &fakeSamples{}
+	mgr := New(&fakeRules{}, source, &fakeSink{}, &fakePublisher{})
+
+	mgr.evaluate(time.Unix(200, 0))
+
+	if source.since != 0 {
+		t.Errorf("samples should not be queried when there are no rules")
+	}
+}
@@ -0,0 +1,191 @@
+package store
+
+import (
+	"database/sql"
+	"errors"
+	"fmt"
+	"strings"
+)
+
+// CreateMetricAlertRule inserts a new rule row after validating its
+// metric/comparator/severity enums and rejecting negative cooldowns.
+func (s *Store) CreateMetricAlertRule(r MetricAlertRule) (MetricAlertRule, error) {
+	if err := validateMetricAlertRule(r); err != nil {
+		return MetricAlertRule{}, err
+	}
+	now := Now()
+	r.CreatedAt = now
+	r.UpdatedAt = now
+	res, err := s.db.Exec(
+		`INSERT INTO metric_alert_rules
+		   (workload_id, name, metric, comparator, threshold, severity,
+		    cooldown_seconds, enabled, created_at, updated_at)
+		 VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`,
+		r.WorkloadID, r.Name, r.Metric, r.Comparator, r.Threshold, r.Severity,
+		r.CooldownSeconds, boolToInt(r.Enabled), r.CreatedAt, r.UpdatedAt,
+	)
+	if err != nil {
+		return MetricAlertRule{}, fmt.Errorf("insert metric alert rule: %w", err)
+	}
+	id, err := res.LastInsertId()
+	if err != nil {
+		return MetricAlertRule{}, fmt.Errorf("get metric alert rule id: %w", err)
+	}
+	r.ID = id
+	return r, nil
+}
+
+// ListMetricAlertRules returns every rule, ordered by id for stable UI
+// rendering.
+func (s *Store) ListMetricAlertRules() ([]MetricAlertRule, error) {
+	return s.queryMetricAlertRules(
+		`SELECT id, workload_id, name, metric, comparator, threshold, severity,
+		        cooldown_seconds, enabled, created_at, updated_at
+		 FROM metric_alert_rules ORDER BY id`,
+	)
+}
+
+// ListMetricAlertRulesByWorkload returns rules that apply to the given
+// workload: rows explicitly scoped to it plus global rows (workload_id
+// = ""). Useful for the workload detail page.
+func (s *Store) ListMetricAlertRulesByWorkload(workloadID string) ([]MetricAlertRule, error) {
+	return s.queryMetricAlertRules(
+		`SELECT id, workload_id, name, metric, comparator, threshold, severity,
+		        cooldown_seconds, enabled, created_at, updated_at
+		 FROM metric_alert_rules WHERE workload_id = ? OR workload_id = '' ORDER BY id`,
+		workloadID,
+	)
+}
+
+// GetMetricAlertRule fetches one rule by id or returns ErrNotFound.
+func (s *Store) GetMetricAlertRule(id int64) (MetricAlertRule, error) {
+	row := s.db.QueryRow(
+		`SELECT id, workload_id, name, metric, comparator, threshold, severity,
+		        cooldown_seconds, enabled, created_at, updated_at
+		 FROM metric_alert_rules WHERE id = ?`, id,
+	)
+	r, err := scanMetricAlertRuleRow(row)
+	if errors.Is(err, sql.ErrNoRows) {
+		return MetricAlertRule{}, fmt.Errorf("metric alert rule %d: %w", id, ErrNotFound)
+	}
+	if err != nil {
+		return MetricAlertRule{}, fmt.Errorf("query metric alert rule: %w", err)
+	}
+	return r, nil
+}
+
+// UpdateMetricAlertRule overwrites the editable columns of a rule row
+// (name, metric, comparator, threshold, severity, cooldown_seconds,
+// enabled) and refreshes updated_at. id and workload_id are immutable on
+// update — change the scope of a rule by deleting + recreating,
+// mirroring the log-scan store.
+//
+// It returns the row as re-read from the database after the write. A
+// zero ID or a rule that fails validation is rejected before any SQL
+// runs; an id that matches no row yields an error wrapping ErrNotFound.
+func (s *Store) UpdateMetricAlertRule(r MetricAlertRule) (MetricAlertRule, error) {
+	if r.ID == 0 {
+		return MetricAlertRule{}, fmt.Errorf("metric alert rule: id is required for update")
+	}
+	if err := validateMetricAlertRule(r); err != nil {
+		return MetricAlertRule{}, err
+	}
+	r.UpdatedAt = Now()
+	res, err := s.db.Exec(
+		`UPDATE metric_alert_rules
+		    SET name = ?, metric = ?, comparator = ?, threshold = ?, severity = ?,
+		        cooldown_seconds = ?, enabled = ?, updated_at = ?
+		  WHERE id = ?`,
+		r.Name, r.Metric, r.Comparator, r.Threshold, r.Severity,
+		r.CooldownSeconds, boolToInt(r.Enabled), r.UpdatedAt, r.ID,
+	)
+	if err != nil {
+		return MetricAlertRule{}, fmt.Errorf("update metric alert rule: %w", err)
+	}
+	// Surface a failing RowsAffected instead of letting its zero count be
+	// mistaken for "no such rule".
+	n, err := res.RowsAffected()
+	if err != nil {
+		return MetricAlertRule{}, fmt.Errorf("update metric alert rule: rows affected: %w", err)
+	}
+	if n == 0 {
+		return MetricAlertRule{}, fmt.Errorf("metric alert rule %d: %w", r.ID, ErrNotFound)
+	}
+	return s.GetMetricAlertRule(r.ID)
+}
+
+// DeleteMetricAlertRule removes a rule by id. It returns an error
+// wrapping ErrNotFound when no row matched, and a wrapped database error
+// when the delete itself or the affected-row count fails.
+func (s *Store) DeleteMetricAlertRule(id int64) error {
+	res, err := s.db.Exec(`DELETE FROM metric_alert_rules WHERE id = ?`, id)
+	if err != nil {
+		return fmt.Errorf("delete metric alert rule: %w", err)
+	}
+	// Surface a failing RowsAffected instead of letting its zero count be
+	// mistaken for "no such rule".
+	n, err := res.RowsAffected()
+	if err != nil {
+		return fmt.Errorf("delete metric alert rule: rows affected: %w", err)
+	}
+	if n == 0 {
+		return fmt.Errorf("metric alert rule %d: %w", id, ErrNotFound)
+	}
+	return nil
+}
+
+// queryMetricAlertRules runs query with args and decodes every result
+// row into a MetricAlertRule. The query must select the eleven rule
+// columns in the order scanMetricAlertRuleRows expects. On any failure
+// (query, scan, or row iteration) it returns a nil slice and a wrapped
+// error; on success the slice is non-nil even when no rows matched.
+func (s *Store) queryMetricAlertRules(query string, args ...any) ([]MetricAlertRule, error) {
+	rows, err := s.db.Query(query, args...)
+	if err != nil {
+		return nil, fmt.Errorf("query metric alert rules: %w", err)
+	}
+	defer rows.Close()
+	// Deliberately non-nil: presumably so an empty result encodes as []
+	// rather than null in API responses — confirm against the handlers.
+	out := []MetricAlertRule{}
+	for rows.Next() {
+		r, err := scanMetricAlertRuleRows(rows)
+		if err != nil {
+			return nil, err
+		}
+		out = append(out, r)
+	}
+	// An iteration error means the result set is incomplete; do not hand
+	// back the partial slice alongside it.
+	if err := rows.Err(); err != nil {
+		return nil, fmt.Errorf("iterate metric alert rules: %w", err)
+	}
+	return out, nil
+}
+
+// metricAlertRuleScanner is the Scan method shared by *sql.Row and
+// *sql.Rows, letting both decode paths use one column list.
+type metricAlertRuleScanner interface {
+	Scan(dest ...any) error
+}
+
+// scanMetricAlertRuleFrom reads the eleven rule columns (id, workload_id,
+// name, metric, comparator, threshold, severity, cooldown_seconds,
+// enabled, created_at, updated_at — in that order) from sc. The enabled
+// column is stored as an integer and mapped to a bool (non-zero = true).
+// The Scan error is returned unwrapped so callers decide how to report it.
+func scanMetricAlertRuleFrom(sc metricAlertRuleScanner) (MetricAlertRule, error) {
+	var r MetricAlertRule
+	var enabled int
+	if err := sc.Scan(
+		&r.ID, &r.WorkloadID, &r.Name, &r.Metric, &r.Comparator, &r.Threshold, &r.Severity,
+		&r.CooldownSeconds, &enabled, &r.CreatedAt, &r.UpdatedAt,
+	); err != nil {
+		return MetricAlertRule{}, err
+	}
+	r.Enabled = enabled != 0
+	return r, nil
+}
+
+// scanMetricAlertRuleRows decodes the current row of a multi-row result
+// set, wrapping any scan failure with context.
+func scanMetricAlertRuleRows(rows *sql.Rows) (MetricAlertRule, error) {
+	r, err := scanMetricAlertRuleFrom(rows)
+	if err != nil {
+		return MetricAlertRule{}, fmt.Errorf("scan metric alert rule: %w", err)
+	}
+	return r, nil
+}
+
+// scanMetricAlertRuleRow decodes a single-row query result. The error is
+// passed through untouched so the caller can test for sql.ErrNoRows.
+func scanMetricAlertRuleRow(row *sql.Row) (MetricAlertRule, error) {
+	return scanMetricAlertRuleFrom(row)
+}
+
+// validateMetricAlertRule checks one rule against the per-row
+// invariants, in this order: the name must contain something other than
+// whitespace, the metric and comparator must be known identifiers, the
+// severity must be a known level or blank (blank is tolerated so the
+// caller can fill in a default), and the cooldown must not be negative.
+// The first violated invariant is returned as an error; nil means valid.
+func validateMetricAlertRule(r MetricAlertRule) error {
+	if name := strings.TrimSpace(r.Name); name == "" {
+		return fmt.Errorf("metric alert rule: name is required")
+	}
+
+	knownMetric := r.Metric == MetricCPUPercent ||
+		r.Metric == MetricMemoryPercent ||
+		r.Metric == MetricMemoryBytes
+	if !knownMetric {
+		return fmt.Errorf("metric alert rule: invalid metric %q", r.Metric)
+	}
+
+	if r.Comparator != MetricComparatorGT && r.Comparator != MetricComparatorLT {
+		return fmt.Errorf("metric alert rule: invalid comparator %q", r.Comparator)
+	}
+
+	// Blank severity is accepted here; the default is applied by the caller.
+	severityOK := r.Severity == "" ||
+		r.Severity == LogScanSeverityInfo ||
+		r.Severity == LogScanSeverityWarn ||
+		r.Severity == LogScanSeverityError
+	if !severityOK {
+		return fmt.Errorf("metric alert rule: invalid severity %q", r.Severity)
+	}
+
+	if r.CooldownSeconds < 0 {
+		return fmt.Errorf("metric alert rule: cooldown_seconds must be >= 0")
+	}
+	return nil
+}
@@ -0,0 +1,167 @@
+package store
+
+import (
+	"errors"
+	"strings"
+	"testing"
+)
+
+func TestCreateMetricAlertRule_Validates(t *testing.T) {
+	s := newTestStore(t)
+	cases := []struct {
+		name    string
+		in      MetricAlertRule
+		wantErr string
+	}{
+		{
+			name:    "missing name",
+			in:      MetricAlertRule{Metric: MetricCPUPercent, Comparator: MetricComparatorGT},
+			wantErr: "name is required",
+		},
+		{
+			name:    "bad metric",
+			in:      MetricAlertRule{Name: "n", Metric: "load_avg", Comparator: MetricComparatorGT},
+			wantErr: "invalid metric",
+		},
+		{
+			name:    "bad comparator",
+			in:      MetricAlertRule{Name: "n", Metric: MetricCPUPercent, Comparator: "eq"},
+			wantErr: "invalid comparator",
+		},
+		{
+			name:    "bad severity",
+			in:      MetricAlertRule{Name: "n", Metric: MetricCPUPercent, Comparator: MetricComparatorGT, Severity: "loud"},
+			wantErr: "invalid severity",
+		},
+		{
+			name:    "negative cooldown",
+			in:      MetricAlertRule{Name: "n", Metric: MetricCPUPercent, Comparator: MetricComparatorGT, CooldownSeconds: -1},
+			wantErr: "cooldown_seconds must be",
+		},
+	}
+	for _, c := range cases {
+		t.Run(c.name, func(t *testing.T) {
+			_, err := s.CreateMetricAlertRule(c.in)
+			if err == nil {
+				t.Fatalf("expected error containing %q, got nil", c.wantErr)
+			}
+			if !strings.Contains(err.Error(), c.wantErr) {
+				t.Fatalf("error mismatch: got %q want substring %q", err.Error(), c.wantErr)
+			}
+		})
+	}
+}
+
+// TestCreateAndGetMetricAlertRule stores one rule and reads it back by
+// id, checking that the metric, comparator, threshold and enabled flag
+// survive the round-trip.
+func TestCreateAndGetMetricAlertRule(t *testing.T) {
+	s := newTestStore(t)
+
+	input := MetricAlertRule{
+		Name: "cpu-hot", Metric: MetricCPUPercent, Comparator: MetricComparatorGT,
+		Threshold: 80, Severity: "warn", CooldownSeconds: 300, Enabled: true,
+	}
+	created, err := s.CreateMetricAlertRule(input)
+	if err != nil {
+		t.Fatalf("create: %v", err)
+	}
+	if created.ID == 0 {
+		t.Fatal("id should be set")
+	}
+
+	fetched, err := s.GetMetricAlertRule(created.ID)
+	if err != nil {
+		t.Fatalf("get: %v", err)
+	}
+	sameKind := fetched.Metric == MetricCPUPercent && fetched.Comparator == MetricComparatorGT
+	if !sameKind {
+		t.Errorf("metric/comparator mismatch: %q %q", fetched.Metric, fetched.Comparator)
+	}
+	if fetched.Threshold != 80 {
+		t.Errorf("threshold mismatch: %v", fetched.Threshold)
+	}
+	if !fetched.Enabled {
+		t.Error("enabled lost on round-trip")
+	}
+}
+
+// TestGetMetricAlertRule_NotFound checks that looking up an id with no
+// row yields an error that wraps ErrNotFound, not merely any error.
+func TestGetMetricAlertRule_NotFound(t *testing.T) {
+	s := newTestStore(t)
+	_, err := s.GetMetricAlertRule(999)
+	if !errors.Is(err, ErrNotFound) {
+		t.Fatalf("expected ErrNotFound for missing rule, got %v", err)
+	}
+}
+
+// TestListMetricAlertRulesByWorkload seeds one global rule and one rule
+// each for workloads w1 and w2, then checks that listing for w1 returns
+// exactly the global rule and w1's own rule.
+func TestListMetricAlertRulesByWorkload(t *testing.T) {
+	s := newTestStore(t)
+	seed := []MetricAlertRule{
+		{
+			Name: "global", Metric: MetricCPUPercent, Comparator: MetricComparatorGT,
+			Threshold: 90, Severity: "warn", Enabled: true,
+		},
+		{
+			Name: "w1-mem", WorkloadID: "w1", Metric: MetricMemoryPercent, Comparator: MetricComparatorGT,
+			Threshold: 85, Severity: "error", Enabled: true,
+		},
+		{
+			Name: "w2-mem", WorkloadID: "w2", Metric: MetricMemoryBytes, Comparator: MetricComparatorGT,
+			Threshold: 1000, Severity: "info", Enabled: true,
+		},
+	}
+	// Fail fast on a broken fixture; otherwise a failed insert would only
+	// show up later as a confusing count mismatch.
+	for _, r := range seed {
+		if _, err := s.CreateMetricAlertRule(r); err != nil {
+			t.Fatalf("seed rule %q: %v", r.Name, err)
+		}
+	}
+
+	w1, err := s.ListMetricAlertRulesByWorkload("w1")
+	if err != nil {
+		t.Fatalf("by workload: %v", err)
+	}
+	// w1 sees its own rule + the global, but NOT w2's rule.
+	if len(w1) != 2 {
+		t.Fatalf("w1 should see 2 rules (own + global), got %d", len(w1))
+	}
+	for _, r := range w1 {
+		switch r.WorkloadID {
+		case "", "w1":
+			// Expected scopes: global or w1's own.
+		case "w2":
+			t.Errorf("w1 should not see w2's rule")
+		default:
+			t.Errorf("unexpected workload scope %q in w1 listing", r.WorkloadID)
+		}
+	}
+}
+
+// TestUpdateMetricAlertRule creates a rule, changes its threshold,
+// comparator and enabled flag, and checks that the row returned by
+// UpdateMetricAlertRule reflects all three changes.
+func TestUpdateMetricAlertRule(t *testing.T) {
+	s := newTestStore(t)
+	r, err := s.CreateMetricAlertRule(MetricAlertRule{
+		Name: "n", Metric: MetricCPUPercent, Comparator: MetricComparatorGT,
+		Threshold: 80, Severity: "warn", Enabled: true,
+	})
+	// Without this check a failed create leaves r.ID == 0 and the update
+	// below fails with an unrelated "id is required" error.
+	if err != nil {
+		t.Fatalf("create: %v", err)
+	}
+	r.Threshold = 95
+	r.Comparator = MetricComparatorLT
+	r.Enabled = false
+	got, err := s.UpdateMetricAlertRule(r)
+	if err != nil {
+		t.Fatalf("update: %v", err)
+	}
+	if got.Threshold != 95 {
+		t.Errorf("threshold not updated: %v", got.Threshold)
+	}
+	if got.Comparator != MetricComparatorLT {
+		t.Errorf("comparator not updated: %q", got.Comparator)
+	}
+	if got.Enabled {
+		t.Error("enabled=false not applied")
+	}
+}
+
+// TestUpdateMetricAlertRule_NotFound updates an otherwise valid rule
+// whose id matches no row and checks that the error wraps ErrNotFound
+// (rather than, say, a validation error).
+func TestUpdateMetricAlertRule_NotFound(t *testing.T) {
+	s := newTestStore(t)
+	_, err := s.UpdateMetricAlertRule(MetricAlertRule{
+		ID: 999, Name: "n", Metric: MetricCPUPercent, Comparator: MetricComparatorGT,
+	})
+	if !errors.Is(err, ErrNotFound) {
+		t.Fatalf("expected ErrNotFound updating missing rule, got %v", err)
+	}
+}
+
+// TestDeleteMetricAlertRule creates a rule, deletes it, and checks that
+// both a later lookup and a second delete report ErrNotFound.
+func TestDeleteMetricAlertRule(t *testing.T) {
+	s := newTestStore(t)
+	r, err := s.CreateMetricAlertRule(MetricAlertRule{
+		Name: "n", Metric: MetricCPUPercent, Comparator: MetricComparatorGT,
+		Threshold: 80, Severity: "warn", Enabled: true,
+	})
+	if err != nil {
+		t.Fatalf("create: %v", err)
+	}
+	if err := s.DeleteMetricAlertRule(r.ID); err != nil {
+		t.Fatalf("delete: %v", err)
+	}
+	if _, err := s.GetMetricAlertRule(r.ID); !errors.Is(err, ErrNotFound) {
+		t.Errorf("rule should be gone after delete, got err=%v", err)
+	}
+	if err := s.DeleteMetricAlertRule(r.ID); !errors.Is(err, ErrNotFound) {
+		t.Errorf("expected ErrNotFound deleting already-deleted rule, got %v", err)
+	}
+}
@@ -277,6 +277,39 @@ const (
 	LogScanSeverityError = "error"
 )

+// MetricAlertRule fires an event when a container metric breaches a
+// threshold. Mirrors LogScanRule but evaluated against stats_samples
+// instead of log lines.
+//
+// ID, CreatedAt and UpdatedAt are assigned by the store: ID comes from
+// the insert, both timestamps are stamped on create, and UpdatedAt is
+// refreshed on every update. The store rejects rules with a blank Name,
+// an unknown Metric or Comparator, an unknown non-blank Severity, or a
+// negative CooldownSeconds. WorkloadID is not rewritten by updates.
+type MetricAlertRule struct {
+	ID              int64   `json:"id"`
+	WorkloadID      string  `json:"workload_id"` // "" = applies to all workloads
+	Name            string  `json:"name"`
+	Metric          string  `json:"metric"`     // cpu_percent | memory_percent | memory_bytes
+	Comparator      string  `json:"comparator"` // gt | lt
+	Threshold       float64 `json:"threshold"`
+	Severity        string  `json:"severity"`         // info | warn | error
+	CooldownSeconds int     `json:"cooldown_seconds"` // min seconds between fires per (rule,workload)
+	Enabled         bool    `json:"enabled"`
+	CreatedAt       string  `json:"created_at"`
+	UpdatedAt       string  `json:"updated_at"`
+}
+
+// Metric-alert metric identifiers: the only values the store accepts
+// for MetricAlertRule.Metric on create/update. cpu_percent and
+// memory_percent are expressed on a percent scale; memory_bytes is an
+// absolute usage figure.
+// NOTE(review): nothing here caps the percent metrics at 100 — confirm
+// whether the stats collector can report cpu_percent above 100 (e.g. on
+// multi-core containers) before relying on a 0–100 range.
+const (
+	MetricCPUPercent    = "cpu_percent"
+	MetricMemoryPercent = "memory_percent"
+	MetricMemoryBytes   = "memory_bytes"
+)
+
+// Metric-alert comparators: the only values the store accepts for
+// MetricAlertRule.Comparator on create/update. gt fires when the value
+// exceeds the threshold; lt when it falls below.
+const (
+	MetricComparatorGT = "gt"
+	MetricComparatorLT = "lt"
+)
+
 // WorkloadKind enumerates the legacy discriminator values written into
 // containers.workload_kind and workloads.kind. After the hard cutover the
 // backing project / stack / static_site tables are gone — these constants
@@ -408,6 +408,24 @@ func (s *Store) runMigrations() error {
 		)`,
 		`CREATE INDEX IF NOT EXISTS idx_log_scan_rules_workload ON log_scan_rules(workload_id)`,
 		`CREATE INDEX IF NOT EXISTS idx_log_scan_rules_overrides ON log_scan_rules(overrides_id)`,
+		// metric_alert_rules: threshold rules the metric-alert manager
+		// evaluates against recent container stats samples. workload_id is
+		// NOT NULL; the empty string is the sentinel for a global rule that
+		// applies to every workload, and a non-empty value scopes the rule
+		// to one workload.
+		`CREATE TABLE IF NOT EXISTS metric_alert_rules (
+			id                 INTEGER PRIMARY KEY AUTOINCREMENT,
+			workload_id        TEXT NOT NULL DEFAULT '',
+			name               TEXT NOT NULL DEFAULT '',
+			metric             TEXT NOT NULL,
+			comparator         TEXT NOT NULL,
+			threshold          REAL NOT NULL DEFAULT 0,
+			severity           TEXT NOT NULL DEFAULT 'warn',
+			cooldown_seconds   INTEGER NOT NULL DEFAULT 300,
+			enabled            INTEGER NOT NULL DEFAULT 1,
+			created_at         TEXT NOT NULL DEFAULT (datetime('now')),
+			updated_at         TEXT NOT NULL DEFAULT (datetime('now'))
+		)`,
+		`CREATE INDEX IF NOT EXISTS idx_metric_alert_rules_workload ON metric_alert_rules(workload_id)`,
 	}
 	for _, t := range observabilityTables {
 		if _, err := s.db.Exec(t); err != nil {
@@ -551,6 +551,7 @@
      "static_site": "Static Site",
      "stale_scanner": "Stale Scanner",
      "stale_cleanup": "Stale Cleanup",
+      "metric_alert": "Metric Alert",
      "admin": "Admin"
    },
    "metadata": "Details"
@@ -551,6 +551,7 @@
      "static_site": "Статический сайт",
      "stale_scanner": "Сканер устаревших",
      "stale_cleanup": "Очистка устаревших",
+      "metric_alert": "Метрика",
      "admin": "Администратор"
    },
    "metadata": "Подробности"