feat(observability): event triggers + log scanner backend
Two paired backends sharing the events.Bus seam:
Event triggers (consumer-side):
- internal/store/event_triggers.go — CRUD with action_secret
redaction on read (placeholder echo treated as "no change" on
PATCH so secrets aren't accidentally wiped).
- internal/events/dispatcher.go — bus subscriber, AND-composed
filters (severity CSV, source CSV, message regex with memoized
compile cache). Structural loop-prevention: never writes to
event_log. Sends via notifier.SendPayload.
- internal/notify: SendPayload + SendSyncForTestPayload methods,
TierEventTrigger constant, doSendRaw shared with the legacy
Event-shaped path.
- internal/api/event_triggers.go — admin-gated CRUD + /test
sending the real TriggerWebhookPayload shape. SSRF guard
rejects loopback / link-local / unspecified targets. PATCH
uses pointer-typed DTO for partial updates.
Log scanner (producer-side):
- internal/logscanner/ — engine (per-rule cooldown +
per-container token bucket, atomic drop counters), tail
(multiplexed docker frame demuxer with TTY fallback + 16 MiB
payload cap + 1 MiB reassembly cap + RFC3339Nano-validated
timestamp strip + UTF-8-safe message truncation), manager
(5s container polling, atomic.Pointer[Snapshot] hot-reload,
HitEmitter writes event_log + publishes EventLog so the
trigger dispatcher picks them up immediately).
- internal/docker/container.go — ContainerLogsOpts exposes
stream selection for stderr-only / stdout-only rules.
- internal/store: log_scan_rules table + CRUD with
EffectiveLogScanRules resolver (globals, each replaced by its
per-workload override when one exists, plus workload-only
additions). Transactional cascade-delete of overrides when a
global rule is removed.
- internal/api/log_scan_rules.go — admin-gated CRUD + /test
(sample_line → matched/captures) + /stats (drop counters +
active tail count + last-snapshot compile errors) +
GET /api/workloads/{id}/effective-rules.
cmd/server/main.go wires both subsystems next to the existing
RegisterPersistentLogger. Coverage spans engine cooldown / bucket
counter tests, snapshot effective-set semantics, manager compile-
error capture, dispatcher matching, store validation +
cascade-delete, API URL validator + secret redaction.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,256 @@
|
||||
package store
|
||||
|
||||
import (
|
||||
"database/sql"
|
||||
"errors"
|
||||
"fmt"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// CreateLogScanRule inserts a new rule row. Validates severity +
|
||||
// streams enum membership and rejects negative cooldowns.
|
||||
func (s *Store) CreateLogScanRule(r LogScanRule) (LogScanRule, error) {
|
||||
if err := validateLogScanRule(r); err != nil {
|
||||
return LogScanRule{}, err
|
||||
}
|
||||
now := Now()
|
||||
r.CreatedAt = now
|
||||
r.UpdatedAt = now
|
||||
res, err := s.db.Exec(
|
||||
`INSERT INTO log_scan_rules
|
||||
(workload_id, overrides_id, name, pattern, severity, streams,
|
||||
cooldown_seconds, enabled, created_at, updated_at)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`,
|
||||
r.WorkloadID, r.OverridesID, r.Name, r.Pattern, r.Severity, r.Streams,
|
||||
r.CooldownSeconds, boolToInt(r.Enabled), r.CreatedAt, r.UpdatedAt,
|
||||
)
|
||||
if err != nil {
|
||||
return LogScanRule{}, fmt.Errorf("insert log scan rule: %w", err)
|
||||
}
|
||||
id, err := res.LastInsertId()
|
||||
if err != nil {
|
||||
return LogScanRule{}, fmt.Errorf("get log scan rule id: %w", err)
|
||||
}
|
||||
r.ID = id
|
||||
return r, nil
|
||||
}
|
||||
|
||||
// ListLogScanRules returns every rule, ordered by id for stable UI
|
||||
// rendering.
|
||||
func (s *Store) ListLogScanRules() ([]LogScanRule, error) {
|
||||
return s.queryLogScanRules(
|
||||
`SELECT id, workload_id, overrides_id, name, pattern, severity, streams,
|
||||
cooldown_seconds, enabled, created_at, updated_at
|
||||
FROM log_scan_rules ORDER BY id`,
|
||||
)
|
||||
}
|
||||
|
||||
// ListLogScanRulesByWorkload returns all rows directly attached to
|
||||
// the workload (workload-only additions and per-workload overrides),
|
||||
// excluding global rules. Useful for the workload detail page.
|
||||
func (s *Store) ListLogScanRulesByWorkload(workloadID string) ([]LogScanRule, error) {
|
||||
return s.queryLogScanRules(
|
||||
`SELECT id, workload_id, overrides_id, name, pattern, severity, streams,
|
||||
cooldown_seconds, enabled, created_at, updated_at
|
||||
FROM log_scan_rules WHERE workload_id = ? ORDER BY id`,
|
||||
workloadID,
|
||||
)
|
||||
}
|
||||
|
||||
// GetLogScanRule fetches one rule by id or returns ErrNotFound.
|
||||
func (s *Store) GetLogScanRule(id int64) (LogScanRule, error) {
|
||||
row := s.db.QueryRow(
|
||||
`SELECT id, workload_id, overrides_id, name, pattern, severity, streams,
|
||||
cooldown_seconds, enabled, created_at, updated_at
|
||||
FROM log_scan_rules WHERE id = ?`, id,
|
||||
)
|
||||
r, err := scanLogScanRuleRow(row)
|
||||
if errors.Is(err, sql.ErrNoRows) {
|
||||
return LogScanRule{}, fmt.Errorf("log scan rule %d: %w", id, ErrNotFound)
|
||||
}
|
||||
if err != nil {
|
||||
return LogScanRule{}, fmt.Errorf("query log scan rule: %w", err)
|
||||
}
|
||||
return r, nil
|
||||
}
|
||||
|
||||
// UpdateLogScanRule overwrites the editable columns of a rule row.
|
||||
// id, workload_id, overrides_id are immutable on update — change the
|
||||
// scope of a rule by deleting + recreating, to keep the
|
||||
// hot-reload-snapshot semantics simple.
|
||||
func (s *Store) UpdateLogScanRule(r LogScanRule) (LogScanRule, error) {
|
||||
if r.ID == 0 {
|
||||
return LogScanRule{}, fmt.Errorf("log scan rule: id is required for update")
|
||||
}
|
||||
if err := validateLogScanRule(r); err != nil {
|
||||
return LogScanRule{}, err
|
||||
}
|
||||
r.UpdatedAt = Now()
|
||||
res, err := s.db.Exec(
|
||||
`UPDATE log_scan_rules
|
||||
SET name = ?, pattern = ?, severity = ?, streams = ?,
|
||||
cooldown_seconds = ?, enabled = ?, updated_at = ?
|
||||
WHERE id = ?`,
|
||||
r.Name, r.Pattern, r.Severity, r.Streams,
|
||||
r.CooldownSeconds, boolToInt(r.Enabled), r.UpdatedAt, r.ID,
|
||||
)
|
||||
if err != nil {
|
||||
return LogScanRule{}, fmt.Errorf("update log scan rule: %w", err)
|
||||
}
|
||||
n, _ := res.RowsAffected()
|
||||
if n == 0 {
|
||||
return LogScanRule{}, fmt.Errorf("log scan rule %d: %w", r.ID, ErrNotFound)
|
||||
}
|
||||
return s.GetLogScanRule(r.ID)
|
||||
}
|
||||
|
||||
// DeleteLogScanRule removes a rule by id. Override rows referencing
|
||||
// this id are cascade-deleted at the application layer because we
|
||||
// don't enforce SQLite FK constraints repo-wide. The two DELETEs run
|
||||
// inside a single transaction so a mid-cascade failure can't leave
|
||||
// overrides orphaned by a vanished global.
|
||||
func (s *Store) DeleteLogScanRule(id int64) error {
|
||||
tx, err := s.db.Begin()
|
||||
if err != nil {
|
||||
return fmt.Errorf("begin delete tx: %w", err)
|
||||
}
|
||||
defer tx.Rollback() //nolint:errcheck // commit path returns nil; rollback after commit is a no-op
|
||||
if _, err := tx.Exec(`DELETE FROM log_scan_rules WHERE overrides_id = ?`, id); err != nil {
|
||||
return fmt.Errorf("delete dependent log scan overrides: %w", err)
|
||||
}
|
||||
res, err := tx.Exec(`DELETE FROM log_scan_rules WHERE id = ?`, id)
|
||||
if err != nil {
|
||||
return fmt.Errorf("delete log scan rule: %w", err)
|
||||
}
|
||||
n, _ := res.RowsAffected()
|
||||
if n == 0 {
|
||||
return fmt.Errorf("log scan rule %d: %w", id, ErrNotFound)
|
||||
}
|
||||
if err := tx.Commit(); err != nil {
|
||||
return fmt.Errorf("commit delete tx: %w", err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// EffectiveLogScanRules computes the effective rule set for one
|
||||
// workload according to the spec in docs/LOGSCAN_AND_TRIGGERS_TODO.md:
|
||||
//
|
||||
// 1. All global rules (workload_id == "" AND overrides_id == 0)
|
||||
// minus globals that have a per-workload override row.
|
||||
// 2. Plus workload-only rules (workload_id == X AND overrides_id == 0).
|
||||
// 3. Plus per-workload override rules (workload_id == X AND overrides_id != 0),
|
||||
// which carry the override's own enabled/pattern/severity.
|
||||
//
|
||||
// Computed in Go after two simple SELECTs since rule counts will be
|
||||
// small (operator-curated, dozens not thousands).
|
||||
func (s *Store) EffectiveLogScanRules(workloadID string) ([]LogScanRule, error) {
|
||||
all, err := s.ListLogScanRules()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
overrides := map[int64]LogScanRule{} // globalID -> override row
|
||||
var workloadOnly []LogScanRule
|
||||
var globals []LogScanRule
|
||||
for _, r := range all {
|
||||
switch {
|
||||
case r.WorkloadID == "" && r.OverridesID == 0:
|
||||
globals = append(globals, r)
|
||||
case r.WorkloadID == workloadID && r.OverridesID == 0:
|
||||
workloadOnly = append(workloadOnly, r)
|
||||
case r.WorkloadID == workloadID && r.OverridesID != 0:
|
||||
overrides[r.OverridesID] = r
|
||||
}
|
||||
}
|
||||
out := make([]LogScanRule, 0, len(globals)+len(workloadOnly))
|
||||
for _, g := range globals {
|
||||
if ov, ok := overrides[g.ID]; ok {
|
||||
// Override row's fields win — including enabled=false to
|
||||
// turn off the global for this workload.
|
||||
out = append(out, ov)
|
||||
} else {
|
||||
out = append(out, g)
|
||||
}
|
||||
}
|
||||
out = append(out, workloadOnly...)
|
||||
return out, nil
|
||||
}
|
||||
|
||||
func (s *Store) queryLogScanRules(query string, args ...any) ([]LogScanRule, error) {
|
||||
rows, err := s.db.Query(query, args...)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("query log scan rules: %w", err)
|
||||
}
|
||||
defer rows.Close()
|
||||
out := []LogScanRule{}
|
||||
for rows.Next() {
|
||||
r, err := scanLogScanRuleRows(rows)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
out = append(out, r)
|
||||
}
|
||||
return out, rows.Err()
|
||||
}
|
||||
|
||||
func scanLogScanRuleRows(rows *sql.Rows) (LogScanRule, error) {
|
||||
var r LogScanRule
|
||||
var enabled int
|
||||
if err := rows.Scan(
|
||||
&r.ID, &r.WorkloadID, &r.OverridesID, &r.Name, &r.Pattern, &r.Severity, &r.Streams,
|
||||
&r.CooldownSeconds, &enabled, &r.CreatedAt, &r.UpdatedAt,
|
||||
); err != nil {
|
||||
return LogScanRule{}, fmt.Errorf("scan log scan rule: %w", err)
|
||||
}
|
||||
r.Enabled = enabled != 0
|
||||
return r, nil
|
||||
}
|
||||
|
||||
func scanLogScanRuleRow(row *sql.Row) (LogScanRule, error) {
|
||||
var r LogScanRule
|
||||
var enabled int
|
||||
if err := row.Scan(
|
||||
&r.ID, &r.WorkloadID, &r.OverridesID, &r.Name, &r.Pattern, &r.Severity, &r.Streams,
|
||||
&r.CooldownSeconds, &enabled, &r.CreatedAt, &r.UpdatedAt,
|
||||
); err != nil {
|
||||
return LogScanRule{}, err
|
||||
}
|
||||
r.Enabled = enabled != 0
|
||||
return r, nil
|
||||
}
|
||||
|
||||
// validateLogScanRule enforces the per-row invariants. Regex
|
||||
// compilation is intentionally NOT done here — it's a hot-path
|
||||
// concern owned by the engine snapshot, and engine compile errors
|
||||
// become engine-side warnings rather than store-side rejections to
|
||||
// keep the failure mode operator-debuggable.
|
||||
func validateLogScanRule(r LogScanRule) error {
|
||||
if strings.TrimSpace(r.Name) == "" {
|
||||
return fmt.Errorf("log scan rule: name is required")
|
||||
}
|
||||
if strings.TrimSpace(r.Pattern) == "" {
|
||||
return fmt.Errorf("log scan rule: pattern is required")
|
||||
}
|
||||
switch r.Severity {
|
||||
case LogScanSeverityInfo, LogScanSeverityWarn, LogScanSeverityError:
|
||||
case "":
|
||||
// Default applied at the caller; allow blank.
|
||||
default:
|
||||
return fmt.Errorf("log scan rule: invalid severity %q", r.Severity)
|
||||
}
|
||||
switch r.Streams {
|
||||
case LogScanStreamAll, LogScanStreamStdout, LogScanStreamStderr:
|
||||
case "":
|
||||
default:
|
||||
return fmt.Errorf("log scan rule: invalid streams %q", r.Streams)
|
||||
}
|
||||
if r.CooldownSeconds < 0 {
|
||||
return fmt.Errorf("log scan rule: cooldown_seconds must be >= 0")
|
||||
}
|
||||
// An override row must reference an existing global id and live
|
||||
// under a specific workload. The store doesn't verify the FK
|
||||
// (no PRAGMA foreign_keys), but we can sanity-check the shape.
|
||||
if r.OverridesID != 0 && r.WorkloadID == "" {
|
||||
return fmt.Errorf("log scan rule: override row requires workload_id")
|
||||
}
|
||||
return nil
|
||||
}
|
||||
Reference in New Issue
Block a user