feat(observability): event triggers + log scanner backend

Two paired backends sharing the events.Bus seam: Event triggers (consumer-side): - internal/store/event_triggers.go — CRUD with action_secret redaction on read (placeholder echo treated as "no change" on PATCH so secrets aren't accidentally wiped). - internal/events/dispatcher.go — bus subscriber, AND-composed filters (severity CSV, source CSV, message regex with memoized compile cache). Structural loop-prevention: never writes to event_log. Sends via notifier.SendPayload. - internal/notify: SendPayload + SendSyncForTestPayload methods, TierEventTrigger constant, doSendRaw shared with the legacy Event-shaped path. - internal/api/event_triggers.go — admin-gated CRUD + /test sending the real TriggerWebhookPayload shape. SSRF guard rejects loopback / link-local / unspecified targets. PATCH uses pointer-typed DTO for partial updates. Log scanner (producer-side): - internal/logscanner/ — engine (per-rule cooldown + per-container token bucket, atomic drop counters), tail (multiplexed docker frame demuxer with TTY fallback + 16 MiB payload cap + 1 MiB reassembly cap + RFC3339Nano-validated timestamp strip + UTF-8-safe message truncation), manager (5s container polling, atomic.Pointer[Snapshot] hot-reload, HitEmitter writes event_log + publishes EventLog so the trigger dispatcher picks them up immediately). - internal/docker/container.go — ContainerLogsOpts exposes stream selection for stderr-only / stdout-only rules. - internal/store: log_scan_rules table + CRUD with EffectiveLogScanRules resolver (globals minus per-workload overrides plus workload-only additions). Transactional cascade-delete of overrides when a global rule is removed. - internal/api/log_scan_rules.go — admin-gated CRUD + /test (sample_line → matched/captures) + /stats (drop counters + active tail count + last-snapshot compile errors) + GET /api/workloads/{id}/effective-rules. cmd/server/main.go wires both subsystems next to the existing RegisterPersistentLogger. 
Coverage spans engine cooldown / bucket counter tests, snapshot effective-set semantics, manager compile-error capture, dispatcher matching, store validation + cascade-delete, API URL validator + secret redaction. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-11 22:18:11 +03:00
parent 82d32181ba
commit 7a9ff7ad54
23 changed files with 3974 additions and 19 deletions
@@ -0,0 +1,208 @@
+package store
+
+import (
+	"database/sql"
+	"errors"
+	"fmt"
+	"strings"
+)
+
+// CreateEventTrigger inserts a new trigger row. ID is assigned by the
+// auto-increment column and returned on the populated struct.
+func (s *Store) CreateEventTrigger(t EventTrigger) (EventTrigger, error) {
+	if strings.TrimSpace(t.Name) == "" {
+		return EventTrigger{}, fmt.Errorf("event_trigger: name is required")
+	}
+	if t.ActionType == "" {
+		t.ActionType = EventTriggerActionWebhook
+	}
+	if t.ActionType != EventTriggerActionWebhook {
+		return EventTrigger{}, fmt.Errorf("event_trigger: unsupported action_type %q", t.ActionType)
+	}
+	if strings.TrimSpace(t.ActionTarget) == "" {
+		return EventTrigger{}, fmt.Errorf("event_trigger: action_target is required")
+	}
+
+	now := Now()
+	t.CreatedAt = now
+	t.UpdatedAt = now
+
+	res, err := s.db.Exec(
+		`INSERT INTO event_triggers
+		   (name, filter_severity, filter_source, filter_message_regex,
+		    action_type, action_target, action_secret, enabled,
+		    created_at, updated_at)
+		 VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`,
+		t.Name, t.FilterSeverity, t.FilterSource, t.FilterMessageRegex,
+		t.ActionType, t.ActionTarget, t.ActionSecret, boolToInt(t.Enabled),
+		t.CreatedAt, t.UpdatedAt,
+	)
+	if err != nil {
+		return EventTrigger{}, fmt.Errorf("insert event trigger: %w", err)
+	}
+	id, err := res.LastInsertId()
+	if err != nil {
+		return EventTrigger{}, fmt.Errorf("get event trigger id: %w", err)
+	}
+	t.ID = id
+	return t, nil
+}
+
+// ListEventTriggers returns every trigger row, ordered by id so the UI
+// rendering is stable across requests. Trigger counts are expected to
+// be small (operator-curated), so unbounded listing is fine.
+func (s *Store) ListEventTriggers() ([]EventTrigger, error) {
+	rows, err := s.db.Query(
+		`SELECT id, name, filter_severity, filter_source, filter_message_regex,
+		        action_type, action_target, action_secret, enabled,
+		        created_at, updated_at
+		 FROM event_triggers ORDER BY id`,
+	)
+	if err != nil {
+		return nil, fmt.Errorf("query event triggers: %w", err)
+	}
+	defer rows.Close()
+
+	out := []EventTrigger{}
+	for rows.Next() {
+		t, err := scanEventTrigger(rows)
+		if err != nil {
+			return nil, err
+		}
+		out = append(out, t)
+	}
+	return out, rows.Err()
+}
+
+// ListEnabledEventTriggers returns only the rows with enabled=1. The
+// dispatcher hot path uses this so a disabled trigger costs nothing.
+func (s *Store) ListEnabledEventTriggers() ([]EventTrigger, error) {
+	rows, err := s.db.Query(
+		`SELECT id, name, filter_severity, filter_source, filter_message_regex,
+		        action_type, action_target, action_secret, enabled,
+		        created_at, updated_at
+		 FROM event_triggers WHERE enabled = 1 ORDER BY id`,
+	)
+	if err != nil {
+		return nil, fmt.Errorf("query enabled event triggers: %w", err)
+	}
+	defer rows.Close()
+
+	out := []EventTrigger{}
+	for rows.Next() {
+		t, err := scanEventTrigger(rows)
+		if err != nil {
+			return nil, err
+		}
+		out = append(out, t)
+	}
+	return out, rows.Err()
+}
+
+// GetEventTrigger returns one trigger by ID or ErrNotFound.
+func (s *Store) GetEventTrigger(id int64) (EventTrigger, error) {
+	row := s.db.QueryRow(
+		`SELECT id, name, filter_severity, filter_source, filter_message_regex,
+		        action_type, action_target, action_secret, enabled,
+		        created_at, updated_at
+		 FROM event_triggers WHERE id = ?`, id,
+	)
+	t, err := scanEventTriggerRow(row)
+	if errors.Is(err, sql.ErrNoRows) {
+		return EventTrigger{}, fmt.Errorf("event trigger %d: %w", id, ErrNotFound)
+	}
+	if err != nil {
+		return EventTrigger{}, fmt.Errorf("query event trigger: %w", err)
+	}
+	return t, nil
+}
+
+// UpdateEventTrigger overwrites the editable columns of the row
+// identified by t.ID and returns the row as re-read from the database.
+// The UPDATE does not touch created_at; updated_at is refreshed with
+// Now(). An empty ActionType defaults to the webhook action.
+//
+// Errors: a validation error when ID, Name or ActionTarget is missing
+// or ActionType is unsupported; an error wrapping ErrNotFound when no
+// row matches t.ID; otherwise the wrapped database error.
+func (s *Store) UpdateEventTrigger(t EventTrigger) (EventTrigger, error) {
+	if t.ID == 0 {
+		return EventTrigger{}, fmt.Errorf("event_trigger: id is required for update")
+	}
+	if strings.TrimSpace(t.Name) == "" {
+		return EventTrigger{}, fmt.Errorf("event_trigger: name is required")
+	}
+	if t.ActionType == "" {
+		t.ActionType = EventTriggerActionWebhook
+	}
+	if t.ActionType != EventTriggerActionWebhook {
+		return EventTrigger{}, fmt.Errorf("event_trigger: unsupported action_type %q", t.ActionType)
+	}
+	if strings.TrimSpace(t.ActionTarget) == "" {
+		return EventTrigger{}, fmt.Errorf("event_trigger: action_target is required")
+	}
+
+	t.UpdatedAt = Now()
+	res, err := s.db.Exec(
+		`UPDATE event_triggers
+		    SET name = ?, filter_severity = ?, filter_source = ?,
+		        filter_message_regex = ?, action_type = ?, action_target = ?,
+		        action_secret = ?, enabled = ?, updated_at = ?
+		  WHERE id = ?`,
+		t.Name, t.FilterSeverity, t.FilterSource, t.FilterMessageRegex,
+		t.ActionType, t.ActionTarget, t.ActionSecret, boolToInt(t.Enabled),
+		t.UpdatedAt, t.ID,
+	)
+	if err != nil {
+		return EventTrigger{}, fmt.Errorf("update event trigger: %w", err)
+	}
+	// Do not discard the RowsAffected error: a zero count from a failed
+	// call would otherwise be misreported as ErrNotFound.
+	n, err := res.RowsAffected()
+	if err != nil {
+		return EventTrigger{}, fmt.Errorf("update event trigger: rows affected: %w", err)
+	}
+	if n == 0 {
+		return EventTrigger{}, fmt.Errorf("event trigger %d: %w", t.ID, ErrNotFound)
+	}
+	// Re-read so the caller sees the stored row, including created_at.
+	return s.GetEventTrigger(t.ID)
+}
+
+// DeleteEventTrigger removes a trigger by ID. It is not idempotent:
+// deleting a row that is already gone returns an error wrapping
+// ErrNotFound, so that a double-click in the UI gives a clean error
+// rather than a 500. Any database failure is returned wrapped.
+func (s *Store) DeleteEventTrigger(id int64) error {
+	res, err := s.db.Exec(`DELETE FROM event_triggers WHERE id = ?`, id)
+	if err != nil {
+		return fmt.Errorf("delete event trigger: %w", err)
+	}
+	// Do not discard the RowsAffected error: a zero count from a failed
+	// call would otherwise be misreported as ErrNotFound.
+	n, err := res.RowsAffected()
+	if err != nil {
+		return fmt.Errorf("delete event trigger: rows affected: %w", err)
+	}
+	if n == 0 {
+		return fmt.Errorf("event trigger %d: %w", id, ErrNotFound)
+	}
+	return nil
+}
+
+// scanEventTrigger decodes the current row of a multi-row result into
+// an EventTrigger. The column order must match the SELECT lists used
+// by the List* queries. The integer enabled column is converted to a
+// bool (non-zero means enabled). Scan failures are wrapped.
+func scanEventTrigger(rows *sql.Rows) (EventTrigger, error) {
+	var (
+		trigger EventTrigger
+		flag    int
+	)
+	err := rows.Scan(
+		&trigger.ID, &trigger.Name, &trigger.FilterSeverity, &trigger.FilterSource, &trigger.FilterMessageRegex,
+		&trigger.ActionType, &trigger.ActionTarget, &trigger.ActionSecret, &flag,
+		&trigger.CreatedAt, &trigger.UpdatedAt,
+	)
+	if err != nil {
+		return EventTrigger{}, fmt.Errorf("scan event trigger: %w", err)
+	}
+	trigger.Enabled = flag != 0
+	return trigger, nil
+}
+
+// scanEventTriggerRow decodes a single-row query result into an
+// EventTrigger. Unlike scanEventTrigger it returns the Scan error
+// unwrapped, so the caller can test it against sql.ErrNoRows directly.
+func scanEventTriggerRow(row *sql.Row) (EventTrigger, error) {
+	var (
+		trigger EventTrigger
+		flag    int
+	)
+	err := row.Scan(
+		&trigger.ID, &trigger.Name, &trigger.FilterSeverity, &trigger.FilterSource, &trigger.FilterMessageRegex,
+		&trigger.ActionType, &trigger.ActionTarget, &trigger.ActionSecret, &flag,
+		&trigger.CreatedAt, &trigger.UpdatedAt,
+	)
+	if err != nil {
+		return EventTrigger{}, err
+	}
+	trigger.Enabled = flag != 0
+	return trigger, nil
+}
+
+// boolToInt maps a Go bool onto the 0/1 integer form used for the
+// enabled columns: true becomes 1, false becomes 0.
+func boolToInt(b bool) int {
+	n := 0
+	if b {
+		n = 1
+	}
+	return n
+}
@@ -0,0 +1,256 @@
+package store
+
+import (
+	"database/sql"
+	"errors"
+	"fmt"
+	"strings"
+)
+
+// CreateLogScanRule inserts a new rule row. Validates severity +
+// streams enum membership and rejects negative cooldowns.
+func (s *Store) CreateLogScanRule(r LogScanRule) (LogScanRule, error) {
+	if err := validateLogScanRule(r); err != nil {
+		return LogScanRule{}, err
+	}
+	now := Now()
+	r.CreatedAt = now
+	r.UpdatedAt = now
+	res, err := s.db.Exec(
+		`INSERT INTO log_scan_rules
+		   (workload_id, overrides_id, name, pattern, severity, streams,
+		    cooldown_seconds, enabled, created_at, updated_at)
+		 VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`,
+		r.WorkloadID, r.OverridesID, r.Name, r.Pattern, r.Severity, r.Streams,
+		r.CooldownSeconds, boolToInt(r.Enabled), r.CreatedAt, r.UpdatedAt,
+	)
+	if err != nil {
+		return LogScanRule{}, fmt.Errorf("insert log scan rule: %w", err)
+	}
+	id, err := res.LastInsertId()
+	if err != nil {
+		return LogScanRule{}, fmt.Errorf("get log scan rule id: %w", err)
+	}
+	r.ID = id
+	return r, nil
+}
+
+// ListLogScanRules returns every rule, ordered by id for stable UI
+// rendering.
+func (s *Store) ListLogScanRules() ([]LogScanRule, error) {
+	return s.queryLogScanRules(
+		`SELECT id, workload_id, overrides_id, name, pattern, severity, streams,
+		        cooldown_seconds, enabled, created_at, updated_at
+		 FROM log_scan_rules ORDER BY id`,
+	)
+}
+
+// ListLogScanRulesByWorkload returns all rows directly attached to
+// the workload (workload-only additions and per-workload overrides),
+// excluding global rules. Useful for the workload detail page.
+func (s *Store) ListLogScanRulesByWorkload(workloadID string) ([]LogScanRule, error) {
+	return s.queryLogScanRules(
+		`SELECT id, workload_id, overrides_id, name, pattern, severity, streams,
+		        cooldown_seconds, enabled, created_at, updated_at
+		 FROM log_scan_rules WHERE workload_id = ? ORDER BY id`,
+		workloadID,
+	)
+}
+
+// GetLogScanRule fetches one rule by id or returns ErrNotFound.
+func (s *Store) GetLogScanRule(id int64) (LogScanRule, error) {
+	row := s.db.QueryRow(
+		`SELECT id, workload_id, overrides_id, name, pattern, severity, streams,
+		        cooldown_seconds, enabled, created_at, updated_at
+		 FROM log_scan_rules WHERE id = ?`, id,
+	)
+	r, err := scanLogScanRuleRow(row)
+	if errors.Is(err, sql.ErrNoRows) {
+		return LogScanRule{}, fmt.Errorf("log scan rule %d: %w", id, ErrNotFound)
+	}
+	if err != nil {
+		return LogScanRule{}, fmt.Errorf("query log scan rule: %w", err)
+	}
+	return r, nil
+}
+
+// UpdateLogScanRule overwrites the editable columns of a rule row
+// (name, pattern, severity, streams, cooldown_seconds, enabled) and
+// returns the row as re-read from the database. updated_at is
+// refreshed with Now().
+//
+// id, workload_id, overrides_id are immutable on update — change the
+// scope of a rule by deleting + recreating, to keep the
+// hot-reload-snapshot semantics simple. Values supplied for
+// workload_id / overrides_id are still validated but never written.
+//
+// Errors: a validation error from validateLogScanRule or a missing ID;
+// an error wrapping ErrNotFound when no row matches r.ID; otherwise
+// the wrapped database error.
+func (s *Store) UpdateLogScanRule(r LogScanRule) (LogScanRule, error) {
+	if r.ID == 0 {
+		return LogScanRule{}, fmt.Errorf("log scan rule: id is required for update")
+	}
+	if err := validateLogScanRule(r); err != nil {
+		return LogScanRule{}, err
+	}
+	r.UpdatedAt = Now()
+	res, err := s.db.Exec(
+		`UPDATE log_scan_rules
+		    SET name = ?, pattern = ?, severity = ?, streams = ?,
+		        cooldown_seconds = ?, enabled = ?, updated_at = ?
+		  WHERE id = ?`,
+		r.Name, r.Pattern, r.Severity, r.Streams,
+		r.CooldownSeconds, boolToInt(r.Enabled), r.UpdatedAt, r.ID,
+	)
+	if err != nil {
+		return LogScanRule{}, fmt.Errorf("update log scan rule: %w", err)
+	}
+	// Do not discard the RowsAffected error: a zero count from a failed
+	// call would otherwise be misreported as ErrNotFound.
+	n, err := res.RowsAffected()
+	if err != nil {
+		return LogScanRule{}, fmt.Errorf("update log scan rule: rows affected: %w", err)
+	}
+	if n == 0 {
+		return LogScanRule{}, fmt.Errorf("log scan rule %d: %w", r.ID, ErrNotFound)
+	}
+	// Re-read so the caller sees the stored immutable columns too.
+	return s.GetLogScanRule(r.ID)
+}
+
+// DeleteLogScanRule removes a rule by id. Override rows referencing
+// this id are cascade-deleted at the application layer because we
+// don't enforce SQLite FK constraints repo-wide. The two DELETEs run
+// inside a single transaction so a mid-cascade failure can't leave
+// overrides orphaned by a vanished global.
+//
+// Returns an error wrapping ErrNotFound when no row has the given id;
+// in that case the transaction is rolled back, so any override rows
+// matched by the first DELETE are left untouched.
+func (s *Store) DeleteLogScanRule(id int64) error {
+	tx, err := s.db.Begin()
+	if err != nil {
+		return fmt.Errorf("begin delete tx: %w", err)
+	}
+	defer tx.Rollback() //nolint:errcheck // commit path returns nil; rollback after commit is a no-op
+	// Dependents first, then the rule itself.
+	if _, err := tx.Exec(`DELETE FROM log_scan_rules WHERE overrides_id = ?`, id); err != nil {
+		return fmt.Errorf("delete dependent log scan overrides: %w", err)
+	}
+	res, err := tx.Exec(`DELETE FROM log_scan_rules WHERE id = ?`, id)
+	if err != nil {
+		return fmt.Errorf("delete log scan rule: %w", err)
+	}
+	// Do not discard the RowsAffected error: a zero count from a failed
+	// call would otherwise be misreported as ErrNotFound.
+	n, err := res.RowsAffected()
+	if err != nil {
+		return fmt.Errorf("delete log scan rule: rows affected: %w", err)
+	}
+	if n == 0 {
+		return fmt.Errorf("log scan rule %d: %w", id, ErrNotFound)
+	}
+	if err := tx.Commit(); err != nil {
+		return fmt.Errorf("commit delete tx: %w", err)
+	}
+	return nil
+}
+
+// EffectiveLogScanRules resolves the rule set that applies to one
+// workload, following docs/LOGSCAN_AND_TRIGGERS_TODO.md:
+//
+//  1. Every global rule (workload_id == "" AND overrides_id == 0),
+//     in id order. Where the workload has an override row pointing at
+//     that global, the override row is emitted in its place — with
+//     its own enabled/pattern/severity, so enabled=false switches the
+//     global off for this workload.
+//  2. Followed by the workload-only rules (workload_id == X AND
+//     overrides_id == 0), in id order.
+//
+// An override whose target global is not present is dropped. The
+// resolution is done in Go over a single ListLogScanRules call since
+// rule counts will be small (operator-curated, dozens not thousands).
+// Disabled rules are returned too; filtering on Enabled is left to the
+// caller.
+func (s *Store) EffectiveLogScanRules(workloadID string) ([]LogScanRule, error) {
+	rules, err := s.ListLogScanRules()
+	if err != nil {
+		return nil, err
+	}
+
+	var (
+		globalRules   []LogScanRule
+		workloadRules []LogScanRule
+		overrideFor   = make(map[int64]LogScanRule) // global id -> override row
+	)
+	for _, rule := range rules {
+		mine := rule.WorkloadID == workloadID
+		switch {
+		case rule.OverridesID == 0 && rule.WorkloadID == "":
+			globalRules = append(globalRules, rule)
+		case rule.OverridesID == 0 && mine:
+			workloadRules = append(workloadRules, rule)
+		case rule.OverridesID != 0 && mine:
+			// Later rows (higher id) replace earlier ones for the same global.
+			overrideFor[rule.OverridesID] = rule
+		}
+	}
+
+	effective := make([]LogScanRule, 0, len(globalRules)+len(workloadRules))
+	for _, global := range globalRules {
+		chosen := global
+		if override, found := overrideFor[global.ID]; found {
+			chosen = override
+		}
+		effective = append(effective, chosen)
+	}
+	return append(effective, workloadRules...), nil
+}
+
+// queryLogScanRules runs a SELECT whose column list matches
+// scanLogScanRuleRows and collects every row into a slice. The
+// returned slice is non-nil even when the query yields no rows.
+func (s *Store) queryLogScanRules(query string, args ...any) ([]LogScanRule, error) {
+	cursor, err := s.db.Query(query, args...)
+	if err != nil {
+		return nil, fmt.Errorf("query log scan rules: %w", err)
+	}
+	defer cursor.Close()
+
+	rules := make([]LogScanRule, 0)
+	for cursor.Next() {
+		rule, scanErr := scanLogScanRuleRows(cursor)
+		if scanErr != nil {
+			return nil, scanErr
+		}
+		rules = append(rules, rule)
+	}
+	// Surface any iteration error alongside whatever was collected.
+	return rules, cursor.Err()
+}
+
+// scanLogScanRuleRows decodes the current row of a multi-row result
+// into a LogScanRule. The integer enabled column is converted to a
+// bool (non-zero means enabled). Scan failures are wrapped.
+func scanLogScanRuleRows(rows *sql.Rows) (LogScanRule, error) {
+	var (
+		rule LogScanRule
+		flag int
+	)
+	err := rows.Scan(
+		&rule.ID, &rule.WorkloadID, &rule.OverridesID, &rule.Name, &rule.Pattern, &rule.Severity, &rule.Streams,
+		&rule.CooldownSeconds, &flag, &rule.CreatedAt, &rule.UpdatedAt,
+	)
+	if err != nil {
+		return LogScanRule{}, fmt.Errorf("scan log scan rule: %w", err)
+	}
+	rule.Enabled = flag != 0
+	return rule, nil
+}
+
+// scanLogScanRuleRow decodes a single-row query result into a
+// LogScanRule. The Scan error is returned unwrapped so the caller can
+// test it against sql.ErrNoRows directly.
+func scanLogScanRuleRow(row *sql.Row) (LogScanRule, error) {
+	var (
+		rule LogScanRule
+		flag int
+	)
+	err := row.Scan(
+		&rule.ID, &rule.WorkloadID, &rule.OverridesID, &rule.Name, &rule.Pattern, &rule.Severity, &rule.Streams,
+		&rule.CooldownSeconds, &flag, &rule.CreatedAt, &rule.UpdatedAt,
+	)
+	if err != nil {
+		return LogScanRule{}, err
+	}
+	rule.Enabled = flag != 0
+	return rule, nil
+}
+
+// validateLogScanRule checks the per-row invariants and returns the
+// first violation found, in this order: name, pattern, severity,
+// streams, cooldown, override shape. Blank severity and blank streams
+// are accepted; the caller is expected to apply defaults.
+//
+// The pattern is deliberately not compiled here. Regex compilation is
+// a hot-path concern owned by the engine snapshot, and compile errors
+// surface there as engine-side warnings rather than store-side
+// rejections, which keeps the failure mode operator-debuggable.
+func validateLogScanRule(r LogScanRule) error {
+	severityOK := r.Severity == "" ||
+		r.Severity == LogScanSeverityInfo ||
+		r.Severity == LogScanSeverityWarn ||
+		r.Severity == LogScanSeverityError
+	streamsOK := r.Streams == "" ||
+		r.Streams == LogScanStreamAll ||
+		r.Streams == LogScanStreamStdout ||
+		r.Streams == LogScanStreamStderr
+
+	switch {
+	case strings.TrimSpace(r.Name) == "":
+		return fmt.Errorf("log scan rule: name is required")
+	case strings.TrimSpace(r.Pattern) == "":
+		return fmt.Errorf("log scan rule: pattern is required")
+	case !severityOK:
+		return fmt.Errorf("log scan rule: invalid severity %q", r.Severity)
+	case !streamsOK:
+		return fmt.Errorf("log scan rule: invalid streams %q", r.Streams)
+	case r.CooldownSeconds < 0:
+		return fmt.Errorf("log scan rule: cooldown_seconds must be >= 0")
+	case r.OverridesID != 0 && r.WorkloadID == "":
+		// An override must live under a specific workload. Whether the
+		// referenced global exists is not verified here (no PRAGMA
+		// foreign_keys); only the shape is sanity-checked.
+		return fmt.Errorf("log scan rule: override row requires workload_id")
+	}
+	return nil
+}
@@ -0,0 +1,155 @@
+package store
+
+import (
+	"strings"
+	"testing"
+)
+
+func TestCreateLogScanRule_Validates(t *testing.T) {
+	s := newTestStore(t)
+	cases := []struct {
+		name    string
+		in      LogScanRule
+		wantErr string
+	}{
+		{
+			name:    "missing name",
+			in:      LogScanRule{Pattern: "x"},
+			wantErr: "name is required",
+		},
+		{
+			name:    "missing pattern",
+			in:      LogScanRule{Name: "n"},
+			wantErr: "pattern is required",
+		},
+		{
+			name:    "bad severity",
+			in:      LogScanRule{Name: "n", Pattern: "x", Severity: "loud"},
+			wantErr: "invalid severity",
+		},
+		{
+			name:    "bad streams",
+			in:      LogScanRule{Name: "n", Pattern: "x", Streams: "both"},
+			wantErr: "invalid streams",
+		},
+		{
+			name:    "negative cooldown",
+			in:      LogScanRule{Name: "n", Pattern: "x", CooldownSeconds: -1},
+			wantErr: "cooldown_seconds must be",
+		},
+		{
+			name:    "override without workload",
+			in:      LogScanRule{Name: "n", Pattern: "x", OverridesID: 5},
+			wantErr: "override row requires workload_id",
+		},
+	}
+	for _, c := range cases {
+		t.Run(c.name, func(t *testing.T) {
+			_, err := s.CreateLogScanRule(c.in)
+			if err == nil {
+				t.Fatalf("expected error containing %q, got nil", c.wantErr)
+			}
+			if !strings.Contains(err.Error(), c.wantErr) {
+				t.Fatalf("error mismatch: got %q want substring %q", err.Error(), c.wantErr)
+			}
+		})
+	}
+}
+
+func TestCreateAndGetLogScanRule(t *testing.T) {
+	s := newTestStore(t)
+	r, err := s.CreateLogScanRule(LogScanRule{
+		Name: "panics", Pattern: `\bpanic\b`, Severity: "error", Streams: "stderr",
+		CooldownSeconds: 30, Enabled: true,
+	})
+	if err != nil {
+		t.Fatalf("create: %v", err)
+	}
+	if r.ID == 0 {
+		t.Fatal("id should be set")
+	}
+	got, err := s.GetLogScanRule(r.ID)
+	if err != nil {
+		t.Fatalf("get: %v", err)
+	}
+	if got.Pattern != `\bpanic\b` {
+		t.Errorf("pattern mismatch: %q", got.Pattern)
+	}
+	if !got.Enabled {
+		t.Error("enabled lost on round-trip")
+	}
+}
+
+// TestEffectiveLogScanRules seeds one global rule, one rule that only
+// applies to workload w1, and one w1 override of the global. It then
+// checks that w1 sees the override followed by its own rule, while an
+// unrelated workload (w2) sees only the unmodified global.
+func TestEffectiveLogScanRules(t *testing.T) {
+	s := newTestStore(t)
+	// Fail fast on fixture errors: a silently failed create would leave
+	// g.ID == 0 and surface later as a confusing count mismatch.
+	g, err := s.CreateLogScanRule(LogScanRule{
+		Name: "global", Pattern: "panic", Severity: "warn", Streams: "all", Enabled: true,
+	})
+	if err != nil {
+		t.Fatalf("create global: %v", err)
+	}
+	if _, err := s.CreateLogScanRule(LogScanRule{
+		Name: "w1-only", Pattern: "slow_query", WorkloadID: "w1", Severity: "info", Streams: "all", Enabled: true,
+	}); err != nil {
+		t.Fatalf("create w1-only: %v", err)
+	}
+	if _, err := s.CreateLogScanRule(LogScanRule{
+		Name: "override-for-w1", Pattern: "panic", WorkloadID: "w1", OverridesID: g.ID,
+		Severity: "error", Streams: "all", Enabled: true,
+	}); err != nil {
+		t.Fatalf("create override: %v", err)
+	}
+
+	w1, err := s.EffectiveLogScanRules("w1")
+	if err != nil {
+		t.Fatalf("effective w1: %v", err)
+	}
+	if len(w1) != 2 {
+		t.Fatalf("w1 effective should be 2 (override + addition), got %d", len(w1))
+	}
+	// First entry replaces the global with the override (error severity).
+	if w1[0].Severity != "error" {
+		t.Errorf("override severity not applied: %q", w1[0].Severity)
+	}
+	// Workload-only rules are appended after the (overridden) globals.
+	if w1[1].Name != "w1-only" {
+		t.Errorf("second entry should be the workload-only rule, got %q", w1[1].Name)
+	}
+
+	w2, err := s.EffectiveLogScanRules("w2")
+	if err != nil {
+		t.Fatalf("effective w2: %v", err)
+	}
+	if len(w2) != 1 {
+		t.Fatalf("w2 effective should be 1 (just the global), got %d", len(w2))
+	}
+	if w2[0].Severity != "warn" {
+		t.Errorf("w2 should see original severity: %q", w2[0].Severity)
+	}
+}
+
+// TestDeleteLogScanRule_CascadesOverrides checks that deleting a
+// global rule removes both the global and the override row that
+// references it.
+func TestDeleteLogScanRule_CascadesOverrides(t *testing.T) {
+	s := newTestStore(t)
+	// Fail fast on fixture errors: with g.ID == 0 the delete below would
+	// return ErrNotFound for the wrong reason.
+	g, err := s.CreateLogScanRule(LogScanRule{
+		Name: "global", Pattern: "panic", Severity: "warn", Streams: "all", Enabled: true,
+	})
+	if err != nil {
+		t.Fatalf("create global: %v", err)
+	}
+	ov, err := s.CreateLogScanRule(LogScanRule{
+		Name: "override", Pattern: "panic", WorkloadID: "w1", OverridesID: g.ID,
+		Severity: "error", Streams: "all", Enabled: true,
+	})
+	if err != nil {
+		t.Fatalf("create override: %v", err)
+	}
+
+	if err := s.DeleteLogScanRule(g.ID); err != nil {
+		t.Fatalf("delete: %v", err)
+	}
+	if _, err := s.GetLogScanRule(g.ID); err == nil {
+		t.Error("global should be deleted")
+	}
+	if _, err := s.GetLogScanRule(ov.ID); err == nil {
+		t.Error("override should be cascade-deleted with its global")
+	}
+}
+
+// TestUpdateLogScanRule creates a rule, changes its pattern and
+// disables it, and checks that UpdateLogScanRule returns the row with
+// both changes applied.
+func TestUpdateLogScanRule(t *testing.T) {
+	s := newTestStore(t)
+	// Fail fast on fixture errors: with r.ID == 0 the update would be
+	// rejected for a missing id rather than exercising the UPDATE.
+	r, err := s.CreateLogScanRule(LogScanRule{
+		Name: "n", Pattern: "x", Severity: "warn", Streams: "all", Enabled: true,
+	})
+	if err != nil {
+		t.Fatalf("create: %v", err)
+	}
+	r.Pattern = "y"
+	r.Enabled = false
+	got, err := s.UpdateLogScanRule(r)
+	if err != nil {
+		t.Fatalf("update: %v", err)
+	}
+	if got.Pattern != "y" {
+		t.Errorf("pattern not updated: %q", got.Pattern)
+	}
+	if got.Enabled {
+		t.Error("enabled=false not applied")
+	}
+}
@@ -197,6 +197,34 @@ type StageEnv struct {
 	UpdatedAt string `json:"updated_at"`
 }

+// WorkloadVolume is the plugin-shape equivalent of legacy Volume: a
+// per-workload mount declaration. The Scope enum matches the existing
+// VolumeScope contract so the legacy resolver can be reused once its
+// project_id assumption is loosened.
+//
+// Rows are stored in the workload_volumes table, which is unique on
+// (workload_id, target).
+//
+// NOTE(review): the workload_volumes migration defaults scope to
+// 'absolute', which is not among the scopes listed on VolumeScope —
+// confirm which set of values is authoritative.
+type WorkloadVolume struct {
+	ID         string `json:"id"`
+	WorkloadID string `json:"workload_id"` // owning workload
+	Source     string `json:"source"`
+	Target     string `json:"target"` // unique per workload
+	Scope      string `json:"scope"`
+	Name       string `json:"name"`
+	CreatedAt  string `json:"created_at"`
+	UpdatedAt  string `json:"updated_at"`
+}
+
+// WorkloadEnv is the plugin-shape equivalent of StageEnv: per-workload
+// environment variable overrides, optionally encrypted at rest. Read by
+// the Source plugin at deploy time, merged on top of source_config.env.
+//
+// Rows are stored in the workload_env table, which is unique on
+// (workload_id, key).
+type WorkloadEnv struct {
+	ID         string `json:"id"`
+	WorkloadID string `json:"workload_id"` // owning workload
+	Key        string `json:"key"`         // unique per workload
+	Value      string `json:"value"`       // NOTE(review): presumably ciphertext when Encrypted is true — confirm
+	Encrypted  bool   `json:"encrypted"`
+	CreatedAt  string `json:"created_at"`
+	UpdatedAt  string `json:"updated_at"`
+}
+
 // VolumeScope defines the sharing scope for a volume mount.
 // Valid scopes: instance, stage, project, project_named, named, ephemeral.
 type VolumeScope string
@@ -333,6 +361,82 @@ type EventLog struct {
 	CreatedAt string `json:"created_at"`
 }

+// EventTrigger is a filter+action rule evaluated against EventLog
+// entries published on the bus. When all non-empty filters match, the
+// trigger fires its configured action (webhook today, additional action
+// types extensible via the ActionType enum).
+//
+// Filter fields use a comma-separated list shape for multi-value
+// filters (severity, source) to keep the schema flat — empty string
+// means "no filter on this dimension." FilterMessageRegex is a single
+// regex evaluated against EventLog.Message.
+//
+// Loop-prevention: deliveries are recorded in webhook_deliveries (the
+// existing audit trail). The dispatcher MUST NOT write to event_log
+// or it will recurse.
+//
+// NOTE(review): ActionSecret is serialized under "action_secret"
+// (unlike the Workload secrets, which use json:"-"), so any code that
+// returns this struct to a client has to redact it first — confirm
+// where that redaction happens.
+type EventTrigger struct {
+	ID                 int64  `json:"id"`
+	Name               string `json:"name"`
+	FilterSeverity     string `json:"filter_severity"`      // comma list: "warn,error"; "" = any
+	FilterSource       string `json:"filter_source"`        // comma list: "logscan,deploy"; "" = any
+	FilterMessageRegex string `json:"filter_message_regex"` // "" = any
+	ActionType         string `json:"action_type"`          // "webhook" today
+	ActionTarget       string `json:"action_target"`        // URL for webhook
+	ActionSecret       string `json:"action_secret"`        // optional HMAC secret for signed delivery
+	Enabled            bool   `json:"enabled"`
+	CreatedAt          string `json:"created_at"`
+	UpdatedAt          string `json:"updated_at"`
+}
+
+// Supported values for EventTrigger.ActionType. These are untyped
+// string constants, not members of a dedicated Go type. Adding a new
+// action is additive — old triggers keep working, the dispatcher just
+// learns a new branch.
+const (
+	EventTriggerActionWebhook = "webhook"
+)
+
+// LogScanRule is one regex-based pattern the log scanner evaluates
+// against container log lines. The (workload_id, overrides_id) pair
+// implements the "global rule with optional per-workload override"
+// pattern documented in docs/LOGSCAN_AND_TRIGGERS_TODO.md:
+//
+//   - WorkloadID == "" && OverridesID == 0 → global rule, applies to
+//     every workload unless overridden.
+//   - WorkloadID != "" && OverridesID == 0 → workload-only addition.
+//   - WorkloadID != "" && OverridesID != 0 → override of the global
+//     rule whose ID is OverridesID, for that one workload (set
+//     Enabled=false to switch the global rule off for this workload
+//     only).
+type LogScanRule struct {
+	ID              int64  `json:"id"`
+	WorkloadID      string `json:"workload_id"`        // "" = global
+	OverridesID     int64  `json:"overrides_id"`       // 0 = not an override
+	Name            string `json:"name"`
+	Pattern         string `json:"pattern"`            // regex, compiled at load
+	Severity        string `json:"severity"`           // info|warn|error
+	Streams         string `json:"streams"`            // all|stdout|stderr
+	CooldownSeconds int    `json:"cooldown_seconds"`
+	Enabled         bool   `json:"enabled"`
+	CreatedAt       string `json:"created_at"`
+	UpdatedAt       string `json:"updated_at"`
+}
+
+// Log scan stream filter values for LogScanRule.Streams. "all" reads
+// both streams; "stdout" or "stderr" filter to one. Used both for
+// store validation and at docker-side log read time. Store validation
+// also accepts an empty string.
+const (
+	LogScanStreamAll    = "all"
+	LogScanStreamStdout = "stdout"
+	LogScanStreamStderr = "stderr"
+)
+
+// Log scan severity values for LogScanRule.Severity. They mirror the
+// event_log enum so a matched rule lands as an event_log row with the
+// rule's severity verbatim. Store validation also accepts an empty
+// string, leaving the default to the caller.
+const (
+	LogScanSeverityInfo  = "info"
+	LogScanSeverityWarn  = "warn"
+	LogScanSeverityError = "error"
+)
+
 // WorkloadKind enumerates the kinds of things that own containers.
 // Each kind has a corresponding row in projects/stacks/static_sites referenced via Workload.RefID.
 type WorkloadKind string
@@ -346,12 +450,24 @@ const (
 // Workload is the unifying primitive that abstracts Project, Stack, and StaticSite.
 // Each row is paired with exactly one project/stack/site via (Kind, RefID).
 // Notification + webhook config moves here so it lives in one place across kinds.
+//
+// SourceKind / SourceConfig / TriggerKind / TriggerConfig / PublicFaces /
+// ParentWorkloadID populate the unified plugin model from the Workload-first
+// refactor. Existing rows keep these empty until they are explicitly migrated
+// or replaced — the legacy Kind/RefID columns continue to point at
+// project/stack/site rows in parallel during the cutover.
 type Workload struct {
 	ID                      string `json:"id"`
-	Kind                    string `json:"kind"` // project | stack | site
+	Kind                    string `json:"kind"` // project | stack | site (legacy discriminator)
 	RefID                   string `json:"ref_id"`
 	Name                    string `json:"name"`
-	AppID                   string `json:"app_id"` // nullable; "" = unassigned
+	AppID                   string `json:"app_id"` // nullable; "" = unassigned (a.k.a. GroupID after rename)
+	SourceKind              string `json:"source_kind"`              // "" until plugin-mode populated
+	SourceConfig            string `json:"source_config"`            // JSON-encoded, decoded by the matching Source
+	TriggerKind             string `json:"trigger_kind"`
+	TriggerConfig           string `json:"trigger_config"`           // JSON-encoded, decoded by the matching Trigger
+	PublicFaces             string `json:"public_faces"`             // JSON-encoded []PublicFace
+	ParentWorkloadID        string `json:"parent_workload_id"`       // "" = root; non-empty = stage chain
 	NotificationURL         string `json:"notification_url"`
 	NotificationSecret      string `json:"-"` // never serialized
 	WebhookSecret           string `json:"-"` // URL-identifier secret; never serialized
@@ -384,8 +500,14 @@ type Container struct {
 	ProxyRouteID string `json:"proxy_route_id"`
 	NpmProxyID   int    `json:"npm_proxy_id"`
 	LastSeenAt   string `json:"last_seen_at"`
-	CreatedAt    string `json:"created_at"`
-	UpdatedAt    string `json:"updated_at"`
+	// ExtraJSON carries source-specific metadata that isn't promoted to a
+	// first-class column — currently per-face proxy route IDs for
+	// multi-face image deploys. Stored as a JSON object; '{}' on empty
+	// rows. Sources own the shape; consumers should tolerate unknown
+	// keys.
+	ExtraJSON string `json:"extra_json"`
+	CreatedAt string `json:"created_at"`
+	UpdatedAt string `json:"updated_at"`
 }

 // App is an optional grouping of workloads (e.g., "my-saas" = web project + worker stack + redis stack).
@@ -181,6 +181,15 @@ func (s *Store) runMigrations() error {
 		// re-write path; the LEFT JOIN in ListContainersByStageID falls back
 		// to (project_id, role=stage_name) so legacy rows still resolve.
 		`ALTER TABLE containers ADD COLUMN stage_id TEXT NOT NULL DEFAULT ''`,
+		// Workload-first refactor columns (2026-05-10). Land additively so
+		// the legacy kind/ref_id columns continue to serve existing
+		// project/stack/site rows during cutover.
+		`ALTER TABLE workloads ADD COLUMN source_kind TEXT NOT NULL DEFAULT ''`,
+		`ALTER TABLE workloads ADD COLUMN source_config TEXT NOT NULL DEFAULT '{}'`,
+		`ALTER TABLE workloads ADD COLUMN trigger_kind TEXT NOT NULL DEFAULT ''`,
+		`ALTER TABLE workloads ADD COLUMN trigger_config TEXT NOT NULL DEFAULT '{}'`,
+		`ALTER TABLE workloads ADD COLUMN public_faces TEXT NOT NULL DEFAULT '[]'`,
+		`ALTER TABLE workloads ADD COLUMN parent_workload_id TEXT NOT NULL DEFAULT ''`,
 	}

 	// Workload refactor tables (2026-05-09). Workload is the unifying primitive
@@ -195,6 +204,12 @@ func (s *Store) runMigrations() error {
 			ref_id                      TEXT NOT NULL,
 			name                        TEXT NOT NULL,
 			app_id                      TEXT NOT NULL DEFAULT '',
+			source_kind                 TEXT NOT NULL DEFAULT '',
+			source_config               TEXT NOT NULL DEFAULT '{}',
+			trigger_kind                TEXT NOT NULL DEFAULT '',
+			trigger_config              TEXT NOT NULL DEFAULT '{}',
+			public_faces                TEXT NOT NULL DEFAULT '[]',
+			parent_workload_id          TEXT NOT NULL DEFAULT '',
 			notification_url            TEXT NOT NULL DEFAULT '',
 			notification_secret         TEXT NOT NULL DEFAULT '',
 			webhook_secret              TEXT NOT NULL DEFAULT '',
@@ -231,6 +246,34 @@ func (s *Store) runMigrations() error {
 			created_at  TEXT NOT NULL DEFAULT (datetime('now')),
 			updated_at  TEXT NOT NULL DEFAULT (datetime('now'))
 		)`,
+		// workload_env: per-workload env overrides (encrypt-at-rest for
+		// secrets). Functional analog of stage_env. Workload deletion
+		// cascades through the FK (needs foreign_keys=ON), so no orphans.
+		`CREATE TABLE IF NOT EXISTS workload_env (
+			id          TEXT PRIMARY KEY,
+			workload_id TEXT NOT NULL REFERENCES workloads(id) ON DELETE CASCADE,
+			key         TEXT NOT NULL,
+			value       TEXT NOT NULL DEFAULT '',
+			encrypted   INTEGER NOT NULL DEFAULT 0,
+			created_at  TEXT NOT NULL DEFAULT (datetime('now')),
+			updated_at  TEXT NOT NULL DEFAULT (datetime('now')),
+			UNIQUE(workload_id, key)
+		)`,
+		// workload_volumes: per-workload mount declarations. Mirrors the
+		// legacy `volumes` table shape (source / target / scope / name)
+		// but keyed on workload_id. UNIQUE on (workload_id, target) lets a
+		// re-add upsert the existing row instead of duplicating it.
+		`CREATE TABLE IF NOT EXISTS workload_volumes (
+			id          TEXT PRIMARY KEY,
+			workload_id TEXT NOT NULL REFERENCES workloads(id) ON DELETE CASCADE,
+			source      TEXT NOT NULL DEFAULT '',
+			target      TEXT NOT NULL,
+			scope       TEXT NOT NULL DEFAULT 'absolute',
+			name        TEXT NOT NULL DEFAULT '',
+			created_at  TEXT NOT NULL DEFAULT (datetime('now')),
+			updated_at  TEXT NOT NULL DEFAULT (datetime('now')),
+			UNIQUE(workload_id, target)
+		)`,
 	}
 	for _, t := range workloadTables {
 		if _, err := s.db.Exec(t); err != nil {
@@ -312,6 +355,49 @@ func (s *Store) runMigrations() error {
 		}
 	}

+	// Observability tables. event_triggers consume EventLog entries off
+	// the bus and dispatch webhook actions. Schema kept flat (comma-list
+	// filters, single optional regex) — see LOGSCAN_AND_TRIGGERS_TODO.md.
+	observabilityTables := []string{
+		`CREATE TABLE IF NOT EXISTS event_triggers (
+			id                    INTEGER PRIMARY KEY AUTOINCREMENT,
+			name                  TEXT NOT NULL,
+			filter_severity       TEXT NOT NULL DEFAULT '',
+			filter_source         TEXT NOT NULL DEFAULT '',
+			filter_message_regex  TEXT NOT NULL DEFAULT '',
+			action_type           TEXT NOT NULL DEFAULT 'webhook',
+			action_target         TEXT NOT NULL DEFAULT '',
+			action_secret         TEXT NOT NULL DEFAULT '',
+			enabled               INTEGER NOT NULL DEFAULT 1,
+			created_at            TEXT NOT NULL DEFAULT (datetime('now')),
+			updated_at            TEXT NOT NULL DEFAULT (datetime('now'))
+		)`,
+		// log_scan_rules: regex patterns the log-scanner manager
+		// applies to container log lines. workload_id is NOT NULL; ''
+		// is the sentinel for a global rule (overrides_id = 0), and a
+		// per-workload override stores the global's id in overrides_id.
+		`CREATE TABLE IF NOT EXISTS log_scan_rules (
+			id                 INTEGER PRIMARY KEY AUTOINCREMENT,
+			workload_id        TEXT NOT NULL DEFAULT '',
+			overrides_id       INTEGER NOT NULL DEFAULT 0,
+			name               TEXT NOT NULL,
+			pattern            TEXT NOT NULL,
+			severity           TEXT NOT NULL DEFAULT 'warn',
+			streams            TEXT NOT NULL DEFAULT 'all',
+			cooldown_seconds   INTEGER NOT NULL DEFAULT 60,
+			enabled            INTEGER NOT NULL DEFAULT 1,
+			created_at         TEXT NOT NULL DEFAULT (datetime('now')),
+			updated_at         TEXT NOT NULL DEFAULT (datetime('now'))
+		)`,
+		`CREATE INDEX IF NOT EXISTS idx_log_scan_rules_workload ON log_scan_rules(workload_id)`,
+		`CREATE INDEX IF NOT EXISTS idx_log_scan_rules_overrides ON log_scan_rules(overrides_id)`,
+	}
+	for _, t := range observabilityTables {
+		if _, err := s.db.Exec(t); err != nil {
+			return fmt.Errorf("create observability table: %w", err)
+		}
+	}
+
 	for _, m := range migrations {
 		if _, err := s.db.Exec(m); err != nil {
 			// "duplicate column" / "already exists" are expected when a
@@ -366,6 +452,8 @@ func (s *Store) runMigrations() error {
 		`CREATE INDEX IF NOT EXISTS idx_containers_container_id ON containers(container_id) WHERE container_id != ''`,
 		`CREATE INDEX IF NOT EXISTS idx_containers_kind         ON containers(workload_kind)`,
 		`CREATE INDEX IF NOT EXISTS idx_containers_stage_id     ON containers(stage_id) WHERE stage_id != ''`,
+		`CREATE INDEX IF NOT EXISTS idx_workload_env_workload     ON workload_env(workload_id)`,
+		`CREATE INDEX IF NOT EXISTS idx_workload_volumes_workload ON workload_volumes(workload_id)`,
 	}
 	for _, idx := range indexes {
 		if _, err := s.db.Exec(idx); err != nil {