feat(observability): event triggers + log scanner backend

Two paired backends sharing the events.Bus seam:

Event triggers (consumer-side):
- internal/store/event_triggers.go — CRUD with action_secret
  redaction on read (placeholder echo treated as "no change" on
  PATCH so secrets aren't accidentally wiped).
- internal/events/dispatcher.go — bus subscriber, AND-composed
  filters (severity CSV, source CSV, message regex with memoized
  compile cache). Structural loop-prevention: never writes to
  event_log. Sends via notifier.SendPayload.
- internal/notify: SendPayload + SendSyncForTestPayload methods,
  TierEventTrigger constant, doSendRaw shared with the legacy
  Event-shaped path.
- internal/api/event_triggers.go — admin-gated CRUD + /test
  sending the real TriggerWebhookPayload shape. SSRF guard
  rejects loopback / link-local / unspecified targets. PATCH
  uses pointer-typed DTO for partial updates.

Log scanner (producer-side):
- internal/logscanner/ — engine (per-rule cooldown +
  per-container token bucket, atomic drop counters), tail
  (multiplexed docker frame demuxer with TTY fallback + 16 MiB
  payload cap + 1 MiB reassembly cap + RFC3339Nano-validated
  timestamp strip + UTF-8-safe message truncation), manager
  (5s container polling, atomic.Pointer[Snapshot] hot-reload,
  HitEmitter writes event_log + publishes EventLog so the
  trigger dispatcher picks them up immediately).
- internal/docker/container.go — ContainerLogsOpts exposes
  stream selection for stderr-only / stdout-only rules.
- internal/store: log_scan_rules table + CRUD with
  EffectiveLogScanRules resolver (globals minus per-workload
  overrides plus workload-only additions). Transactional
  cascade-delete of overrides when a global rule is removed.
- internal/api/log_scan_rules.go — admin-gated CRUD + /test
  (sample_line → matched/captures) + /stats (drop counters +
  active tail count + last-snapshot compile errors) +
  GET /api/workloads/{id}/effective-rules.

cmd/server/main.go wires both subsystems next to the existing
RegisterPersistentLogger. Coverage spans engine cooldown / bucket
counter tests, snapshot effective-set semantics, manager compile-
error capture, dispatcher matching, store validation +
cascade-delete, API URL validator + secret redaction.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-11 22:18:11 +03:00
parent 82d32181ba
commit 7a9ff7ad54
23 changed files with 3974 additions and 19 deletions
+88
View File
@@ -181,6 +181,15 @@ func (s *Store) runMigrations() error {
// re-write path; the LEFT JOIN in ListContainersByStageID falls back
// to (project_id, role=stage_name) so legacy rows still resolve.
`ALTER TABLE containers ADD COLUMN stage_id TEXT NOT NULL DEFAULT ''`,
// Workload-first refactor columns (2026-05-10). Land additively so
// the legacy kind/ref_id columns continue to serve existing
// project/stack/site rows during cutover.
`ALTER TABLE workloads ADD COLUMN source_kind TEXT NOT NULL DEFAULT ''`,
`ALTER TABLE workloads ADD COLUMN source_config TEXT NOT NULL DEFAULT '{}'`,
`ALTER TABLE workloads ADD COLUMN trigger_kind TEXT NOT NULL DEFAULT ''`,
`ALTER TABLE workloads ADD COLUMN trigger_config TEXT NOT NULL DEFAULT '{}'`,
`ALTER TABLE workloads ADD COLUMN public_faces TEXT NOT NULL DEFAULT '[]'`,
`ALTER TABLE workloads ADD COLUMN parent_workload_id TEXT NOT NULL DEFAULT ''`,
}
// Workload refactor tables (2026-05-09). Workload is the unifying primitive
@@ -195,6 +204,12 @@ func (s *Store) runMigrations() error {
ref_id TEXT NOT NULL,
name TEXT NOT NULL,
app_id TEXT NOT NULL DEFAULT '',
source_kind TEXT NOT NULL DEFAULT '',
source_config TEXT NOT NULL DEFAULT '{}',
trigger_kind TEXT NOT NULL DEFAULT '',
trigger_config TEXT NOT NULL DEFAULT '{}',
public_faces TEXT NOT NULL DEFAULT '[]',
parent_workload_id TEXT NOT NULL DEFAULT '',
notification_url TEXT NOT NULL DEFAULT '',
notification_secret TEXT NOT NULL DEFAULT '',
webhook_secret TEXT NOT NULL DEFAULT '',
@@ -231,6 +246,34 @@ func (s *Store) runMigrations() error {
created_at TEXT NOT NULL DEFAULT (datetime('now')),
updated_at TEXT NOT NULL DEFAULT (datetime('now'))
)`,
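// workload_env: per-workload env overrides (encrypt-at-rest for
// secrets). Functional analog of stage_env. Workload deletion
// cascades through the FK (ON DELETE CASCADE), so orphan rows
// cannot be left behind. NOTE(review): this relies on foreign-key
// enforcement being enabled on the connection — confirm.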
`CREATE TABLE IF NOT EXISTS workload_env (
id TEXT PRIMARY KEY,
workload_id TEXT NOT NULL REFERENCES workloads(id) ON DELETE CASCADE,
key TEXT NOT NULL,
value TEXT NOT NULL DEFAULT '',
encrypted INTEGER NOT NULL DEFAULT 0,
created_at TEXT NOT NULL DEFAULT (datetime('now')),
updated_at TEXT NOT NULL DEFAULT (datetime('now')),
UNIQUE(workload_id, key)
)`,
// workload_volumes: per-workload mount declarations. Mirrors the
// legacy `volumes` table shape (source / target / scope / name)
// but keyed on workload_id. UNIQUE on (workload_id, target) rules
// out duplicate mounts, so a re-add has to overwrite the existing
// row (e.g. via an upsert) rather than insert a second one.
`CREATE TABLE IF NOT EXISTS workload_volumes (
id TEXT PRIMARY KEY,
workload_id TEXT NOT NULL REFERENCES workloads(id) ON DELETE CASCADE,
source TEXT NOT NULL DEFAULT '',
target TEXT NOT NULL,
scope TEXT NOT NULL DEFAULT 'absolute',
name TEXT NOT NULL DEFAULT '',
created_at TEXT NOT NULL DEFAULT (datetime('now')),
updated_at TEXT NOT NULL DEFAULT (datetime('now')),
UNIQUE(workload_id, target)
)`,
}
for _, t := range workloadTables {
if _, err := s.db.Exec(t); err != nil {
@@ -312,6 +355,49 @@ func (s *Store) runMigrations() error {
}
}
// Observability: event_triggers — consume EventLog entries off the
// bus and dispatch webhook actions. Schema kept flat (comma-list
// filters, single optional regex) — see LOGSCAN_AND_TRIGGERS_TODO.md.
observabilityTables := []string{
`CREATE TABLE IF NOT EXISTS event_triggers (
id INTEGER PRIMARY KEY AUTOINCREMENT,
name TEXT NOT NULL,
filter_severity TEXT NOT NULL DEFAULT '',
filter_source TEXT NOT NULL DEFAULT '',
filter_message_regex TEXT NOT NULL DEFAULT '',
action_type TEXT NOT NULL DEFAULT 'webhook',
action_target TEXT NOT NULL DEFAULT '',
action_secret TEXT NOT NULL DEFAULT '',
enabled INTEGER NOT NULL DEFAULT 1,
created_at TEXT NOT NULL DEFAULT (datetime('now')),
updated_at TEXT NOT NULL DEFAULT (datetime('now'))
)`,
// log_scan_rules: regex patterns the log-scanner manager
// applies to container log lines. workload_id is NOT NULL and
// uses "" as the sentinel for a global rule. A global rule keeps
// overrides_id = 0, while a per-workload override stores the id
// of the global rule it overrides.
`CREATE TABLE IF NOT EXISTS log_scan_rules (
id INTEGER PRIMARY KEY AUTOINCREMENT,
workload_id TEXT NOT NULL DEFAULT '',
overrides_id INTEGER NOT NULL DEFAULT 0,
name TEXT NOT NULL,
pattern TEXT NOT NULL,
severity TEXT NOT NULL DEFAULT 'warn',
streams TEXT NOT NULL DEFAULT 'all',
cooldown_seconds INTEGER NOT NULL DEFAULT 60,
enabled INTEGER NOT NULL DEFAULT 1,
created_at TEXT NOT NULL DEFAULT (datetime('now')),
updated_at TEXT NOT NULL DEFAULT (datetime('now'))
)`,
`CREATE INDEX IF NOT EXISTS idx_log_scan_rules_workload ON log_scan_rules(workload_id)`,
`CREATE INDEX IF NOT EXISTS idx_log_scan_rules_overrides ON log_scan_rules(overrides_id)`,
}
for _, t := range observabilityTables {
if _, err := s.db.Exec(t); err != nil {
return fmt.Errorf("create observability table: %w", err)
}
}
for _, m := range migrations {
if _, err := s.db.Exec(m); err != nil {
// "duplicate column" / "already exists" are expected when a
@@ -366,6 +452,8 @@ func (s *Store) runMigrations() error {
`CREATE INDEX IF NOT EXISTS idx_containers_container_id ON containers(container_id) WHERE container_id != ''`,
`CREATE INDEX IF NOT EXISTS idx_containers_kind ON containers(workload_kind)`,
`CREATE INDEX IF NOT EXISTS idx_containers_stage_id ON containers(stage_id) WHERE stage_id != ''`,
`CREATE INDEX IF NOT EXISTS idx_workload_env_workload ON workload_env(workload_id)`,
`CREATE INDEX IF NOT EXISTS idx_workload_volumes_workload ON workload_volumes(workload_id)`,
}
for _, idx := range indexes {
if _, err := s.db.Exec(idx); err != nil {