feat(observability): event triggers + log scanner backend

Two paired backends sharing the events.Bus seam:

Event triggers (consumer-side):
- internal/store/event_triggers.go — CRUD with action_secret
  redaction on read (placeholder echo treated as "no change" on
  PATCH so secrets aren't accidentally wiped).
- internal/events/dispatcher.go — bus subscriber, AND-composed
  filters (severity CSV, source CSV, message regex with memoized
  compile cache). Structural loop-prevention: never writes to
  event_log. Sends via notifier.SendPayload.
- internal/notify: SendPayload + SendSyncForTestPayload methods,
  TierEventTrigger constant, doSendRaw shared with the legacy
  Event-shaped path.
- internal/api/event_triggers.go — admin-gated CRUD + /test
  sending the real TriggerWebhookPayload shape. SSRF guard
  rejects loopback / link-local / unspecified targets. PATCH
  uses pointer-typed DTO for partial updates.

Log scanner (producer-side):
- internal/logscanner/ — engine (per-rule cooldown +
  per-container token bucket, atomic drop counters), tail
  (multiplexed docker frame demuxer with TTY fallback + 16 MiB
  payload cap + 1 MiB reassembly cap + RFC3339Nano-validated
  timestamp strip + UTF-8-safe message truncation), manager
  (5s container polling, atomic.Pointer[Snapshot] hot-reload,
  HitEmitter writes event_log + publishes EventLog so the
  trigger dispatcher picks them up immediately).
- internal/docker/container.go — ContainerLogsOpts exposes
  stream selection for stderr-only / stdout-only rules.
- internal/store: log_scan_rules table + CRUD with
  EffectiveLogScanRules resolver (globals minus per-workload
  overrides plus workload-only additions). Transactional
  cascade-delete of overrides when a global rule is removed.
- internal/api/log_scan_rules.go — admin-gated CRUD + /test
  (sample_line → matched/captures) + /stats (drop counters +
  active tail count + last-snapshot compile errors) +
  GET /api/workloads/{id}/effective-rules.

cmd/server/main.go wires both subsystems next to the existing
RegisterPersistentLogger. Coverage spans engine cooldown / bucket
counter tests, snapshot effective-set semantics, manager compile-
error capture, dispatcher matching, store validation +
cascade-delete, API URL validator + secret redaction.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-11 22:18:11 +03:00
parent 82d32181ba
commit 7a9ff7ad54
23 changed files with 3974 additions and 19 deletions
+208
View File
@@ -0,0 +1,208 @@
package store
import (
"database/sql"
"errors"
"fmt"
"strings"
)
// CreateEventTrigger validates t and persists it as a new event_triggers
// row. A blank Name or ActionTarget is rejected; an empty ActionType
// defaults to EventTriggerActionWebhook and any other unknown value is an
// error. CreatedAt and UpdatedAt are both stamped with Now(). The returned
// copy carries the ID reported by the database for the inserted row; on
// error the zero EventTrigger is returned.
func (s *Store) CreateEventTrigger(t EventTrigger) (EventTrigger, error) {
	var none EventTrigger

	if strings.TrimSpace(t.Name) == "" {
		return none, fmt.Errorf("event_trigger: name is required")
	}
	switch t.ActionType {
	case "":
		// Unset action type falls back to the only supported action.
		t.ActionType = EventTriggerActionWebhook
	case EventTriggerActionWebhook:
		// Supported as-is.
	default:
		return none, fmt.Errorf("event_trigger: unsupported action_type %q", t.ActionType)
	}
	if strings.TrimSpace(t.ActionTarget) == "" {
		return none, fmt.Errorf("event_trigger: action_target is required")
	}

	stamp := Now()
	t.CreatedAt, t.UpdatedAt = stamp, stamp

	const insertSQL = `INSERT INTO event_triggers
		(name, filter_severity, filter_source, filter_message_regex,
		action_type, action_target, action_secret, enabled,
		created_at, updated_at)
		VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`
	result, err := s.db.Exec(
		insertSQL,
		t.Name, t.FilterSeverity, t.FilterSource, t.FilterMessageRegex,
		t.ActionType, t.ActionTarget, t.ActionSecret, boolToInt(t.Enabled),
		t.CreatedAt, t.UpdatedAt,
	)
	if err != nil {
		return none, fmt.Errorf("insert event trigger: %w", err)
	}

	newID, err := result.LastInsertId()
	if err != nil {
		return none, fmt.Errorf("get event trigger id: %w", err)
	}
	t.ID = newID
	return t, nil
}
// ListEventTriggers loads every event_triggers row in ascending id order,
// giving callers a stable ordering across requests. The listing is
// unbounded on the expectation that the table stays small
// (operator-curated). The result is an empty, non-nil slice when the table
// has no rows.
func (s *Store) ListEventTriggers() ([]EventTrigger, error) {
	const listSQL = `SELECT id, name, filter_severity, filter_source, filter_message_regex,
		action_type, action_target, action_secret, enabled,
		created_at, updated_at
		FROM event_triggers ORDER BY id`

	rows, err := s.db.Query(listSQL)
	if err != nil {
		return nil, fmt.Errorf("query event triggers: %w", err)
	}
	defer rows.Close()

	triggers := []EventTrigger{}
	for rows.Next() {
		trig, scanErr := scanEventTrigger(rows)
		if scanErr != nil {
			return nil, scanErr
		}
		triggers = append(triggers, trig)
	}
	// rows.Err surfaces any iteration failure that ended the loop early.
	return triggers, rows.Err()
}
// ListEnabledEventTriggers loads only the event_triggers rows whose enabled
// column is 1, in ascending id order. Filtering happens in SQL so disabled
// triggers never reach the caller (the dispatcher hot path, per the
// original design note). The result is an empty, non-nil slice when nothing
// is enabled.
func (s *Store) ListEnabledEventTriggers() ([]EventTrigger, error) {
	const listSQL = `SELECT id, name, filter_severity, filter_source, filter_message_regex,
		action_type, action_target, action_secret, enabled,
		created_at, updated_at
		FROM event_triggers WHERE enabled = 1 ORDER BY id`

	rows, err := s.db.Query(listSQL)
	if err != nil {
		return nil, fmt.Errorf("query enabled event triggers: %w", err)
	}
	defer rows.Close()

	active := []EventTrigger{}
	for rows.Next() {
		trig, scanErr := scanEventTrigger(rows)
		if scanErr != nil {
			return nil, scanErr
		}
		active = append(active, trig)
	}
	// rows.Err surfaces any iteration failure that ended the loop early.
	return active, rows.Err()
}
// GetEventTrigger looks up a single trigger by primary key. A missing row
// is reported as an error wrapping ErrNotFound (so errors.Is works); any
// other failure is wrapped with query context.
func (s *Store) GetEventTrigger(id int64) (EventTrigger, error) {
	const getSQL = `SELECT id, name, filter_severity, filter_source, filter_message_regex,
		action_type, action_target, action_secret, enabled,
		created_at, updated_at
		FROM event_triggers WHERE id = ?`

	found, err := scanEventTriggerRow(s.db.QueryRow(getSQL, id))
	switch {
	case errors.Is(err, sql.ErrNoRows):
		return EventTrigger{}, fmt.Errorf("event trigger %d: %w", id, ErrNotFound)
	case err != nil:
		return EventTrigger{}, fmt.Errorf("query event trigger: %w", err)
	}
	return found, nil
}
// UpdateEventTrigger overwrites the editable columns of the row identified
// by t.ID and returns the row as re-read from the database.
//
// Validation mirrors CreateEventTrigger: ID and Name are required, an empty
// ActionType defaults to EventTriggerActionWebhook, unknown action types
// are rejected, and ActionTarget must be non-blank. created_at is not part
// of the UPDATE and is therefore preserved; updated_at is refreshed with
// Now().
//
// Returns an error wrapping ErrNotFound when no row has the given id.
func (s *Store) UpdateEventTrigger(t EventTrigger) (EventTrigger, error) {
	if t.ID == 0 {
		return EventTrigger{}, fmt.Errorf("event_trigger: id is required for update")
	}
	if strings.TrimSpace(t.Name) == "" {
		return EventTrigger{}, fmt.Errorf("event_trigger: name is required")
	}
	if t.ActionType == "" {
		t.ActionType = EventTriggerActionWebhook
	}
	if t.ActionType != EventTriggerActionWebhook {
		return EventTrigger{}, fmt.Errorf("event_trigger: unsupported action_type %q", t.ActionType)
	}
	if strings.TrimSpace(t.ActionTarget) == "" {
		return EventTrigger{}, fmt.Errorf("event_trigger: action_target is required")
	}
	t.UpdatedAt = Now()
	res, err := s.db.Exec(
		`UPDATE event_triggers
		SET name = ?, filter_severity = ?, filter_source = ?,
		filter_message_regex = ?, action_type = ?, action_target = ?,
		action_secret = ?, enabled = ?, updated_at = ?
		WHERE id = ?`,
		t.Name, t.FilterSeverity, t.FilterSource, t.FilterMessageRegex,
		t.ActionType, t.ActionTarget, t.ActionSecret, boolToInt(t.Enabled),
		t.UpdatedAt, t.ID,
	)
	if err != nil {
		return EventTrigger{}, fmt.Errorf("update event trigger: %w", err)
	}
	// A failed RowsAffected must not be mistaken for "zero rows": surface
	// it instead of falling through to a misleading ErrNotFound.
	n, err := res.RowsAffected()
	if err != nil {
		return EventTrigger{}, fmt.Errorf("update event trigger rows affected: %w", err)
	}
	if n == 0 {
		return EventTrigger{}, fmt.Errorf("event trigger %d: %w", t.ID, ErrNotFound)
	}
	// Re-read so the caller sees the stored row, including created_at.
	return s.GetEventTrigger(t.ID)
}
// DeleteEventTrigger removes the trigger with the given id.
//
// Deleting an id that does not exist (for example a repeated delete of the
// same row) returns an error wrapping ErrNotFound, which callers can map to
// a clean not-found response rather than a generic failure.
func (s *Store) DeleteEventTrigger(id int64) error {
	res, err := s.db.Exec(`DELETE FROM event_triggers WHERE id = ?`, id)
	if err != nil {
		return fmt.Errorf("delete event trigger: %w", err)
	}
	// A failed RowsAffected must not be mistaken for "zero rows": surface
	// it instead of falling through to a misleading ErrNotFound.
	n, err := res.RowsAffected()
	if err != nil {
		return fmt.Errorf("delete event trigger rows affected: %w", err)
	}
	if n == 0 {
		return fmt.Errorf("event trigger %d: %w", id, ErrNotFound)
	}
	return nil
}
// scanEventTrigger decodes the current row of a multi-row result into an
// EventTrigger. The column order must match the SELECT lists used by the
// list queries. The integer enabled column is mapped to a bool (non-zero
// means enabled). Scan failures are wrapped with context.
func scanEventTrigger(rows *sql.Rows) (EventTrigger, error) {
	var (
		trig        EventTrigger
		enabledFlag int
	)
	err := rows.Scan(
		&trig.ID, &trig.Name, &trig.FilterSeverity, &trig.FilterSource, &trig.FilterMessageRegex,
		&trig.ActionType, &trig.ActionTarget, &trig.ActionSecret, &enabledFlag,
		&trig.CreatedAt, &trig.UpdatedAt,
	)
	if err != nil {
		return EventTrigger{}, fmt.Errorf("scan event trigger: %w", err)
	}
	trig.Enabled = enabledFlag != 0
	return trig, nil
}
// scanEventTriggerRow decodes a single-row result into an EventTrigger.
// Unlike scanEventTrigger it returns the Scan error unwrapped, so callers
// can detect sql.ErrNoRows and translate it themselves.
func scanEventTriggerRow(row *sql.Row) (EventTrigger, error) {
	var (
		trig        EventTrigger
		enabledFlag int
	)
	err := row.Scan(
		&trig.ID, &trig.Name, &trig.FilterSeverity, &trig.FilterSource, &trig.FilterMessageRegex,
		&trig.ActionType, &trig.ActionTarget, &trig.ActionSecret, &enabledFlag,
		&trig.CreatedAt, &trig.UpdatedAt,
	)
	if err != nil {
		return EventTrigger{}, err
	}
	trig.Enabled = enabledFlag != 0
	return trig, nil
}
// boolToInt maps true to 1 and false to 0, the integer encoding used for
// the boolean "enabled" columns written by this package.
func boolToInt(b bool) int {
	n := 0
	if b {
		n = 1
	}
	return n
}
+256
View File
@@ -0,0 +1,256 @@
package store
import (
"database/sql"
"errors"
"fmt"
"strings"
)
// CreateLogScanRule persists r as a new log_scan_rules row after running
// validateLogScanRule on it (name/pattern presence, severity and streams
// enum membership, non-negative cooldown, override shape). CreatedAt and
// UpdatedAt are both stamped with Now(), and the returned copy carries the
// ID reported by the database.
func (s *Store) CreateLogScanRule(r LogScanRule) (LogScanRule, error) {
	var none LogScanRule

	if err := validateLogScanRule(r); err != nil {
		return none, err
	}

	stamp := Now()
	r.CreatedAt, r.UpdatedAt = stamp, stamp

	const insertSQL = `INSERT INTO log_scan_rules
		(workload_id, overrides_id, name, pattern, severity, streams,
		cooldown_seconds, enabled, created_at, updated_at)
		VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`
	result, err := s.db.Exec(
		insertSQL,
		r.WorkloadID, r.OverridesID, r.Name, r.Pattern, r.Severity, r.Streams,
		r.CooldownSeconds, boolToInt(r.Enabled), r.CreatedAt, r.UpdatedAt,
	)
	if err != nil {
		return none, fmt.Errorf("insert log scan rule: %w", err)
	}

	newID, err := result.LastInsertId()
	if err != nil {
		return none, fmt.Errorf("get log scan rule id: %w", err)
	}
	r.ID = newID
	return r, nil
}
// ListLogScanRules loads every log_scan_rules row (globals, workload-only
// rules and overrides alike) in ascending id order so the result is stable
// across calls.
func (s *Store) ListLogScanRules() ([]LogScanRule, error) {
	const listSQL = `SELECT id, workload_id, overrides_id, name, pattern, severity, streams,
		cooldown_seconds, enabled, created_at, updated_at
		FROM log_scan_rules ORDER BY id`
	return s.queryLogScanRules(listSQL)
}
// ListLogScanRulesByWorkload loads the rows whose workload_id equals
// workloadID, in ascending id order. That covers both workload-only
// additions and per-workload overrides; global rules (stored with an empty
// workload_id) are not included unless workloadID itself is empty.
func (s *Store) ListLogScanRulesByWorkload(workloadID string) ([]LogScanRule, error) {
	const listSQL = `SELECT id, workload_id, overrides_id, name, pattern, severity, streams,
		cooldown_seconds, enabled, created_at, updated_at
		FROM log_scan_rules WHERE workload_id = ? ORDER BY id`
	return s.queryLogScanRules(listSQL, workloadID)
}
// GetLogScanRule looks up a single rule by primary key. A missing row is
// reported as an error wrapping ErrNotFound (so errors.Is works); any other
// failure is wrapped with query context.
func (s *Store) GetLogScanRule(id int64) (LogScanRule, error) {
	const getSQL = `SELECT id, workload_id, overrides_id, name, pattern, severity, streams,
		cooldown_seconds, enabled, created_at, updated_at
		FROM log_scan_rules WHERE id = ?`

	found, err := scanLogScanRuleRow(s.db.QueryRow(getSQL, id))
	switch {
	case errors.Is(err, sql.ErrNoRows):
		return LogScanRule{}, fmt.Errorf("log scan rule %d: %w", id, ErrNotFound)
	case err != nil:
		return LogScanRule{}, fmt.Errorf("query log scan rule: %w", err)
	}
	return found, nil
}
// UpdateLogScanRule overwrites the editable columns (name, pattern,
// severity, streams, cooldown_seconds, enabled) of the row identified by
// r.ID and returns the row as re-read from the database.
//
// id, workload_id and overrides_id are not part of the UPDATE and are
// therefore immutable here — change the scope of a rule by deleting and
// recreating it, which keeps the hot-reload-snapshot semantics simple.
// updated_at is refreshed with Now().
//
// Returns an error wrapping ErrNotFound when no row has the given id.
func (s *Store) UpdateLogScanRule(r LogScanRule) (LogScanRule, error) {
	if r.ID == 0 {
		return LogScanRule{}, fmt.Errorf("log scan rule: id is required for update")
	}
	if err := validateLogScanRule(r); err != nil {
		return LogScanRule{}, err
	}
	r.UpdatedAt = Now()
	res, err := s.db.Exec(
		`UPDATE log_scan_rules
		SET name = ?, pattern = ?, severity = ?, streams = ?,
		cooldown_seconds = ?, enabled = ?, updated_at = ?
		WHERE id = ?`,
		r.Name, r.Pattern, r.Severity, r.Streams,
		r.CooldownSeconds, boolToInt(r.Enabled), r.UpdatedAt, r.ID,
	)
	if err != nil {
		return LogScanRule{}, fmt.Errorf("update log scan rule: %w", err)
	}
	// A failed RowsAffected must not be mistaken for "zero rows": surface
	// it instead of falling through to a misleading ErrNotFound.
	n, err := res.RowsAffected()
	if err != nil {
		return LogScanRule{}, fmt.Errorf("update log scan rule rows affected: %w", err)
	}
	if n == 0 {
		return LogScanRule{}, fmt.Errorf("log scan rule %d: %w", r.ID, ErrNotFound)
	}
	// Re-read so the caller sees the stored row, including the immutable
	// columns that were not sent in the UPDATE.
	return s.GetLogScanRule(r.ID)
}
// DeleteLogScanRule removes a rule by id together with any override rows
// that reference it.
//
// The cascade is done at the application layer because SQLite FK
// constraints are not enforced repo-wide. Both DELETEs run inside a single
// transaction so a mid-cascade failure can't leave overrides orphaned by a
// vanished global. When the rule itself does not exist the transaction is
// rolled back (via the deferred Rollback) and an error wrapping ErrNotFound
// is returned.
func (s *Store) DeleteLogScanRule(id int64) error {
	tx, err := s.db.Begin()
	if err != nil {
		return fmt.Errorf("begin delete tx: %w", err)
	}
	defer tx.Rollback() //nolint:errcheck // commit path returns nil; rollback after commit is a no-op

	// Dependents first: remove override rows pointing at this rule.
	if _, err := tx.Exec(`DELETE FROM log_scan_rules WHERE overrides_id = ?`, id); err != nil {
		return fmt.Errorf("delete dependent log scan overrides: %w", err)
	}
	res, err := tx.Exec(`DELETE FROM log_scan_rules WHERE id = ?`, id)
	if err != nil {
		return fmt.Errorf("delete log scan rule: %w", err)
	}
	// A failed RowsAffected must not be mistaken for "zero rows": surface
	// it (rolling back) instead of reporting a misleading ErrNotFound.
	n, err := res.RowsAffected()
	if err != nil {
		return fmt.Errorf("delete log scan rule rows affected: %w", err)
	}
	if n == 0 {
		return fmt.Errorf("log scan rule %d: %w", id, ErrNotFound)
	}
	if err := tx.Commit(); err != nil {
		return fmt.Errorf("commit delete tx: %w", err)
	}
	return nil
}
// EffectiveLogScanRules resolves the rule set that applies to one workload,
// following the spec in docs/LOGSCAN_AND_TRIGGERS_TODO.md:
//
//   - Every global rule (workload_id == "" and overrides_id == 0) is
//     included, except that a global with an override row for this
//     workload is replaced, in the same position, by that override row.
//     The override's own fields win, including enabled=false to switch the
//     global off for this workload.
//   - Workload-only rules (workload_id == workloadID and overrides_id == 0)
//     are appended after the globals.
//
// Rows belonging to other workloads are ignored, as are override rows whose
// global is not present. The resolution happens in Go over the full rule
// list from ListLogScanRules, on the expectation that rule counts stay
// small (operator-curated, dozens not thousands). The result is always a
// non-nil slice.
func (s *Store) EffectiveLogScanRules(workloadID string) ([]LogScanRule, error) {
	rules, err := s.ListLogScanRules()
	if err != nil {
		return nil, err
	}

	var (
		globalRules    []LogScanRule
		additions      []LogScanRule
		overrideByGlob = map[int64]LogScanRule{} // global rule id -> override row
	)
	for _, rule := range rules {
		isOverride := rule.OverridesID != 0
		if rule.WorkloadID == "" && !isOverride {
			globalRules = append(globalRules, rule)
			continue
		}
		if rule.WorkloadID != workloadID {
			continue
		}
		if isOverride {
			overrideByGlob[rule.OverridesID] = rule
		} else {
			additions = append(additions, rule)
		}
	}

	effective := make([]LogScanRule, 0, len(globalRules)+len(additions))
	for _, global := range globalRules {
		chosen := global
		if replacement, found := overrideByGlob[global.ID]; found {
			chosen = replacement
		}
		effective = append(effective, chosen)
	}
	return append(effective, additions...), nil
}
// queryLogScanRules runs a SELECT that yields log_scan_rules columns in the
// order expected by scanLogScanRuleRows and collects every row. The result
// is an empty, non-nil slice when the query matches nothing.
func (s *Store) queryLogScanRules(query string, args ...any) ([]LogScanRule, error) {
	rows, err := s.db.Query(query, args...)
	if err != nil {
		return nil, fmt.Errorf("query log scan rules: %w", err)
	}
	defer rows.Close()

	rules := []LogScanRule{}
	for rows.Next() {
		rule, scanErr := scanLogScanRuleRows(rows)
		if scanErr != nil {
			return nil, scanErr
		}
		rules = append(rules, rule)
	}
	// rows.Err surfaces any iteration failure that ended the loop early.
	return rules, rows.Err()
}
// scanLogScanRuleRows decodes the current row of a multi-row result into a
// LogScanRule. The column order must match the SELECT lists used by the
// list queries. The integer enabled column is mapped to a bool (non-zero
// means enabled). Scan failures are wrapped with context.
func scanLogScanRuleRows(rows *sql.Rows) (LogScanRule, error) {
	var (
		rule        LogScanRule
		enabledFlag int
	)
	err := rows.Scan(
		&rule.ID, &rule.WorkloadID, &rule.OverridesID, &rule.Name, &rule.Pattern, &rule.Severity, &rule.Streams,
		&rule.CooldownSeconds, &enabledFlag, &rule.CreatedAt, &rule.UpdatedAt,
	)
	if err != nil {
		return LogScanRule{}, fmt.Errorf("scan log scan rule: %w", err)
	}
	rule.Enabled = enabledFlag != 0
	return rule, nil
}
// scanLogScanRuleRow decodes a single-row result into a LogScanRule. Unlike
// scanLogScanRuleRows it returns the Scan error unwrapped, so callers can
// detect sql.ErrNoRows and translate it themselves.
func scanLogScanRuleRow(row *sql.Row) (LogScanRule, error) {
	var (
		rule        LogScanRule
		enabledFlag int
	)
	err := row.Scan(
		&rule.ID, &rule.WorkloadID, &rule.OverridesID, &rule.Name, &rule.Pattern, &rule.Severity, &rule.Streams,
		&rule.CooldownSeconds, &enabledFlag, &rule.CreatedAt, &rule.UpdatedAt,
	)
	if err != nil {
		return LogScanRule{}, err
	}
	rule.Enabled = enabledFlag != 0
	return rule, nil
}
// validateLogScanRule enforces the per-row invariants shared by create and
// update:
//
//   - Name and Pattern must be non-blank.
//   - Severity must be one of the LogScanSeverity* values, or blank (the
//     default is applied by the caller).
//   - Streams must be one of the LogScanStream* values, or blank.
//   - CooldownSeconds must be >= 0.
//   - An override row (OverridesID != 0) must carry a WorkloadID, and
//     OverridesID itself must not be negative.
//
// Regex compilation is intentionally NOT done here — it's a hot-path
// concern owned by the engine snapshot, and engine compile errors become
// engine-side warnings rather than store-side rejections to keep the
// failure mode operator-debuggable.
func validateLogScanRule(r LogScanRule) error {
	if strings.TrimSpace(r.Name) == "" {
		return fmt.Errorf("log scan rule: name is required")
	}
	if strings.TrimSpace(r.Pattern) == "" {
		return fmt.Errorf("log scan rule: pattern is required")
	}
	switch r.Severity {
	case LogScanSeverityInfo, LogScanSeverityWarn, LogScanSeverityError:
	case "":
		// Default applied at the caller; allow blank.
	default:
		return fmt.Errorf("log scan rule: invalid severity %q", r.Severity)
	}
	switch r.Streams {
	case LogScanStreamAll, LogScanStreamStdout, LogScanStreamStderr:
	case "":
		// Blank is accepted here as well.
	default:
		return fmt.Errorf("log scan rule: invalid streams %q", r.Streams)
	}
	if r.CooldownSeconds < 0 {
		return fmt.Errorf("log scan rule: cooldown_seconds must be >= 0")
	}
	// An override row must reference an existing global id and live
	// under a specific workload. The store doesn't verify the FK
	// (no PRAGMA foreign_keys), but we can sanity-check the shape.
	if r.OverridesID != 0 && r.WorkloadID == "" {
		return fmt.Errorf("log scan rule: override row requires workload_id")
	}
	// A negative overrides_id can never match a global rule's id, so the
	// row would be stored but silently never take effect. Reject it up
	// front. Checked after the workload_id rule so existing error
	// messages are unchanged.
	if r.OverridesID < 0 {
		return fmt.Errorf("log scan rule: overrides_id must be >= 0")
	}
	return nil
}
+155
View File
@@ -0,0 +1,155 @@
package store
import (
"strings"
"testing"
)
// TestCreateLogScanRule_Validates checks that CreateLogScanRule rejects
// each invalid rule shape with an error mentioning the violated invariant.
func TestCreateLogScanRule_Validates(t *testing.T) {
	s := newTestStore(t)

	type rejection struct {
		name    string
		in      LogScanRule
		wantErr string
	}
	table := []rejection{
		{name: "missing name", in: LogScanRule{Pattern: "x"}, wantErr: "name is required"},
		{name: "missing pattern", in: LogScanRule{Name: "n"}, wantErr: "pattern is required"},
		{name: "bad severity", in: LogScanRule{Name: "n", Pattern: "x", Severity: "loud"}, wantErr: "invalid severity"},
		{name: "bad streams", in: LogScanRule{Name: "n", Pattern: "x", Streams: "both"}, wantErr: "invalid streams"},
		{name: "negative cooldown", in: LogScanRule{Name: "n", Pattern: "x", CooldownSeconds: -1}, wantErr: "cooldown_seconds must be"},
		{name: "override without workload", in: LogScanRule{Name: "n", Pattern: "x", OverridesID: 5}, wantErr: "override row requires workload_id"},
	}

	for _, tc := range table {
		t.Run(tc.name, func(t *testing.T) {
			_, err := s.CreateLogScanRule(tc.in)
			switch {
			case err == nil:
				t.Fatalf("expected error containing %q, got nil", tc.wantErr)
			case !strings.Contains(err.Error(), tc.wantErr):
				t.Fatalf("error mismatch: got %q want substring %q", err.Error(), tc.wantErr)
			}
		})
	}
}
// TestCreateAndGetLogScanRule round-trips a rule through CreateLogScanRule
// and GetLogScanRule and checks that the id is assigned and that the
// pattern and enabled flag survive.
func TestCreateAndGetLogScanRule(t *testing.T) {
	s := newTestStore(t)

	input := LogScanRule{
		Name:            "panics",
		Pattern:         `\bpanic\b`,
		Severity:        "error",
		Streams:         "stderr",
		CooldownSeconds: 30,
		Enabled:         true,
	}
	created, err := s.CreateLogScanRule(input)
	if err != nil {
		t.Fatalf("create: %v", err)
	}
	if created.ID == 0 {
		t.Fatal("id should be set")
	}

	fetched, err := s.GetLogScanRule(created.ID)
	if err != nil {
		t.Fatalf("get: %v", err)
	}
	if fetched.Pattern != `\bpanic\b` {
		t.Errorf("pattern mismatch: %q", fetched.Pattern)
	}
	if !fetched.Enabled {
		t.Error("enabled lost on round-trip")
	}
}
// TestEffectiveLogScanRules seeds one global rule, one workload-only rule
// for w1 and one w1 override of the global, then checks that:
//   - w1 sees two rules, with the override (severity "error") standing in
//     for the global, followed by its workload-only addition;
//   - w2 sees only the untouched global (severity "warn").
//
// Fixture creation errors fail the test immediately so a broken setup
// cannot masquerade as a resolver bug.
func TestEffectiveLogScanRules(t *testing.T) {
	s := newTestStore(t)
	g, err := s.CreateLogScanRule(LogScanRule{
		Name: "global", Pattern: "panic", Severity: "warn", Streams: "all", Enabled: true,
	})
	if err != nil {
		t.Fatalf("create global: %v", err)
	}
	if _, err := s.CreateLogScanRule(LogScanRule{
		Name: "w1-only", Pattern: "slow_query", WorkloadID: "w1", Severity: "info", Streams: "all", Enabled: true,
	}); err != nil {
		t.Fatalf("create w1-only: %v", err)
	}
	if _, err := s.CreateLogScanRule(LogScanRule{
		Name: "override-for-w1", Pattern: "panic", WorkloadID: "w1", OverridesID: g.ID,
		Severity: "error", Streams: "all", Enabled: true,
	}); err != nil {
		t.Fatalf("create override: %v", err)
	}

	w1, err := s.EffectiveLogScanRules("w1")
	if err != nil {
		t.Fatalf("effective w1: %v", err)
	}
	if len(w1) != 2 {
		t.Fatalf("w1 effective should be 2 (override + addition), got %d", len(w1))
	}
	// First entry replaces the global with the override (error severity).
	if w1[0].Severity != "error" {
		t.Errorf("override severity not applied: %q", w1[0].Severity)
	}

	w2, err := s.EffectiveLogScanRules("w2")
	if err != nil {
		t.Fatalf("effective w2: %v", err)
	}
	if len(w2) != 1 {
		t.Fatalf("w2 effective should be 1 (just the global), got %d", len(w2))
	}
	if w2[0].Severity != "warn" {
		t.Errorf("w2 should see original severity: %q", w2[0].Severity)
	}
}
// TestDeleteLogScanRule_CascadesOverrides checks that deleting a global
// rule also removes the override row that references it. Fixture creation
// errors fail the test immediately: with a zero-valued global id the
// "override" would silently be stored as a workload-only rule and the test
// would exercise the wrong path.
func TestDeleteLogScanRule_CascadesOverrides(t *testing.T) {
	s := newTestStore(t)
	g, err := s.CreateLogScanRule(LogScanRule{
		Name: "global", Pattern: "panic", Severity: "warn", Streams: "all", Enabled: true,
	})
	if err != nil {
		t.Fatalf("create global: %v", err)
	}
	ov, err := s.CreateLogScanRule(LogScanRule{
		Name: "override", Pattern: "panic", WorkloadID: "w1", OverridesID: g.ID,
		Severity: "error", Streams: "all", Enabled: true,
	})
	if err != nil {
		t.Fatalf("create override: %v", err)
	}

	if err := s.DeleteLogScanRule(g.ID); err != nil {
		t.Fatalf("delete: %v", err)
	}
	if _, err := s.GetLogScanRule(ov.ID); err == nil {
		t.Error("override should be cascade-deleted with its global")
	}
}
// TestUpdateLogScanRule creates a rule, changes its pattern and enabled
// flag, and checks that UpdateLogScanRule returns the stored row with both
// changes applied. A failed fixture create aborts the test rather than
// surfacing later as a confusing "id is required" update error.
func TestUpdateLogScanRule(t *testing.T) {
	s := newTestStore(t)
	r, err := s.CreateLogScanRule(LogScanRule{
		Name: "n", Pattern: "x", Severity: "warn", Streams: "all", Enabled: true,
	})
	if err != nil {
		t.Fatalf("create: %v", err)
	}
	r.Pattern = "y"
	r.Enabled = false
	got, err := s.UpdateLogScanRule(r)
	if err != nil {
		t.Fatalf("update: %v", err)
	}
	if got.Pattern != "y" {
		t.Errorf("pattern not updated: %q", got.Pattern)
	}
	if got.Enabled {
		t.Error("enabled=false not applied")
	}
}
+126 -4
View File
@@ -197,6 +197,34 @@ type StageEnv struct {
UpdatedAt string `json:"updated_at"`
}
// WorkloadVolume is the plugin-shape equivalent of legacy Volume: a
// per-workload mount declaration. The Scope enum matches the existing
// VolumeScope contract so the legacy resolver can be reused once its
// project_id assumption is loosened.
//
// Rows live in the workload_volumes table, which is unique on
// (workload_id, target).
//
// NOTE(review): the workload_volumes migration defaults scope to
// 'absolute', which is not among the scopes listed on VolumeScope
// (instance, stage, project, project_named, named, ephemeral) — confirm
// which set is authoritative.
type WorkloadVolume struct {
	ID string `json:"id"`
	WorkloadID string `json:"workload_id"` // owning workload
	Source string `json:"source"`
	Target string `json:"target"` // unique per workload
	Scope string `json:"scope"` // see VolumeScope
	Name string `json:"name"`
	CreatedAt string `json:"created_at"`
	UpdatedAt string `json:"updated_at"`
}
// WorkloadEnv is the plugin-shape equivalent of StageEnv: per-workload
// environment variable overrides, optionally encrypted at rest. Read by
// the Source plugin at deploy time, merged on top of source_config.env.
//
// Rows live in the workload_env table, which is unique on
// (workload_id, key).
type WorkloadEnv struct {
	ID string `json:"id"`
	WorkloadID string `json:"workload_id"` // owning workload
	Key string `json:"key"` // unique per workload
	Value string `json:"value"`
	Encrypted bool `json:"encrypted"` // presumably marks Value as stored encrypted — confirm against the env store code
	CreatedAt string `json:"created_at"`
	UpdatedAt string `json:"updated_at"`
}
// VolumeScope defines the sharing scope for a volume mount.
// Valid scopes: instance, stage, project, project_named, named, ephemeral.
type VolumeScope string
@@ -333,6 +361,82 @@ type EventLog struct {
CreatedAt string `json:"created_at"`
}
// EventTrigger is a filter+action rule evaluated against EventLog
// entries published on the bus. When all non-empty filters match, the
// trigger fires its configured action (webhook today, additional action
// types extensible via the ActionType enum).
//
// Filter fields use a comma-separated list shape for multi-value
// filters (severity, source) to keep the schema flat — empty string
// means "no filter on this dimension." FilterMessageRegex is a single
// regex evaluated against EventLog.Message.
//
// Loop-prevention: deliveries are recorded in webhook_deliveries (the
// existing audit trail). The dispatcher MUST NOT write to event_log
// or it will recurse.
//
// NOTE(review): ActionSecret is serialized under "action_secret", unlike
// the Workload secrets which use json:"-". Anything that encodes this
// struct for a client must redact the secret first — confirm the API
// layer does so.
type EventTrigger struct {
	ID int64 `json:"id"`
	Name string `json:"name"`
	FilterSeverity string `json:"filter_severity"` // comma list: "warn,error"; "" = any
	FilterSource string `json:"filter_source"` // comma list: "logscan,deploy"; "" = any
	FilterMessageRegex string `json:"filter_message_regex"` // "" = any
	ActionType string `json:"action_type"` // "webhook" today
	ActionTarget string `json:"action_target"` // URL for webhook
	ActionSecret string `json:"action_secret"` // optional HMAC secret for signed delivery
	Enabled bool `json:"enabled"` // stored as an integer column (0/1)
	CreatedAt string `json:"created_at"`
	UpdatedAt string `json:"updated_at"`
}
// Supported values for EventTrigger.ActionType. Adding a new action is
// additive — old triggers keep working, the dispatcher just learns a new
// branch.
const (
	// EventTriggerActionWebhook delivers the event to the URL in
	// EventTrigger.ActionTarget. It is the only action type defined here.
	EventTriggerActionWebhook = "webhook"
)
// LogScanRule is one regex-based pattern the log scanner evaluates
// against container log lines. The (workload_id, overrides_id) pair
// implements the "global rule with optional per-workload override"
// pattern documented in docs/LOGSCAN_AND_TRIGGERS_TODO.md:
//
//   - WorkloadID == "" && OverridesID == 0 → global rule, applies to
//     every workload unless overridden.
//   - WorkloadID != "" && OverridesID == 0 → workload-only addition.
//   - WorkloadID != "" && OverridesID != 0 → override of the global rule
//     whose id is OverridesID, for that one workload (set Enabled=false
//     to disable the global rule for this workload only).
type LogScanRule struct {
	ID int64 `json:"id"`
	WorkloadID string `json:"workload_id"` // "" = global
	OverridesID int64 `json:"overrides_id"` // 0 = not an override
	Name string `json:"name"`
	Pattern string `json:"pattern"` // regex, compiled at load
	Severity string `json:"severity"` // info|warn|error
	Streams string `json:"streams"` // all|stdout|stderr
	CooldownSeconds int `json:"cooldown_seconds"` // must be >= 0 (store validation)
	Enabled bool `json:"enabled"` // stored as an integer column (0/1)
	CreatedAt string `json:"created_at"`
	UpdatedAt string `json:"updated_at"`
}
// Log scan stream filter values, the accepted contents of
// LogScanRule.Streams. "all" reads both streams; "stdout" or "stderr"
// filter to one. Used both for store validation and at docker-side log
// read time.
const (
	LogScanStreamAll = "all" // both stdout and stderr
	LogScanStreamStdout = "stdout" // stdout only
	LogScanStreamStderr = "stderr" // stderr only
)
// Log scan severity values, the accepted contents of
// LogScanRule.Severity. They mirror the event_log enum so a matched rule
// lands as an event_log row with the rule's severity verbatim.
const (
	LogScanSeverityInfo = "info"
	LogScanSeverityWarn = "warn"
	LogScanSeverityError = "error"
)
// WorkloadKind enumerates the kinds of things that own containers.
// Each kind has a corresponding row in projects/stacks/static_sites referenced via Workload.RefID.
type WorkloadKind string
@@ -346,12 +450,24 @@ const (
// Workload is the unifying primitive that abstracts Project, Stack, and StaticSite.
// Each row is paired with exactly one project/stack/site via (Kind, RefID).
// Notification + webhook config moves here so it lives in one place across kinds.
//
// SourceKind / SourceConfig / TriggerKind / TriggerConfig / PublicFaces /
// ParentWorkloadID populate the unified plugin model from the Workload-first
// refactor. Existing rows keep these empty until they are explicitly migrated
// or replaced — the legacy Kind/RefID columns continue to point at
// project/stack/site rows in parallel during the cutover.
type Workload struct {
ID string `json:"id"`
Kind string `json:"kind"` // project | stack | site
Kind string `json:"kind"` // project | stack | site (legacy discriminator)
RefID string `json:"ref_id"`
Name string `json:"name"`
AppID string `json:"app_id"` // nullable; "" = unassigned
AppID string `json:"app_id"` // nullable; "" = unassigned (a.k.a. GroupID after rename)
SourceKind string `json:"source_kind"` // "" until plugin-mode populated
SourceConfig string `json:"source_config"` // JSON-encoded, decoded by the matching Source
TriggerKind string `json:"trigger_kind"`
TriggerConfig string `json:"trigger_config"` // JSON-encoded, decoded by the matching Trigger
PublicFaces string `json:"public_faces"` // JSON-encoded []PublicFace
ParentWorkloadID string `json:"parent_workload_id"` // "" = root; non-empty = stage chain
NotificationURL string `json:"notification_url"`
NotificationSecret string `json:"-"` // never serialized
WebhookSecret string `json:"-"` // URL-identifier secret; never serialized
@@ -384,8 +500,14 @@ type Container struct {
ProxyRouteID string `json:"proxy_route_id"`
NpmProxyID int `json:"npm_proxy_id"`
LastSeenAt string `json:"last_seen_at"`
CreatedAt string `json:"created_at"`
UpdatedAt string `json:"updated_at"`
// ExtraJSON carries source-specific metadata that isn't promoted to a
// first-class column — currently per-face proxy route IDs for
// multi-face image deploys. Stored as a JSON object; '{}' on empty
// rows. Sources own the shape; consumers should tolerate unknown
// keys.
ExtraJSON string `json:"extra_json"`
CreatedAt string `json:"created_at"`
UpdatedAt string `json:"updated_at"`
}
// App is an optional grouping of workloads (e.g., "my-saas" = web project + worker stack + redis stack).
+88
View File
@@ -181,6 +181,15 @@ func (s *Store) runMigrations() error {
// re-write path; the LEFT JOIN in ListContainersByStageID falls back
// to (project_id, role=stage_name) so legacy rows still resolve.
`ALTER TABLE containers ADD COLUMN stage_id TEXT NOT NULL DEFAULT ''`,
// Workload-first refactor columns (2026-05-10). Land additively so
// the legacy kind/ref_id columns continue to serve existing
// project/stack/site rows during cutover.
`ALTER TABLE workloads ADD COLUMN source_kind TEXT NOT NULL DEFAULT ''`,
`ALTER TABLE workloads ADD COLUMN source_config TEXT NOT NULL DEFAULT '{}'`,
`ALTER TABLE workloads ADD COLUMN trigger_kind TEXT NOT NULL DEFAULT ''`,
`ALTER TABLE workloads ADD COLUMN trigger_config TEXT NOT NULL DEFAULT '{}'`,
`ALTER TABLE workloads ADD COLUMN public_faces TEXT NOT NULL DEFAULT '[]'`,
`ALTER TABLE workloads ADD COLUMN parent_workload_id TEXT NOT NULL DEFAULT ''`,
}
// Workload refactor tables (2026-05-09). Workload is the unifying primitive
@@ -195,6 +204,12 @@ func (s *Store) runMigrations() error {
ref_id TEXT NOT NULL,
name TEXT NOT NULL,
app_id TEXT NOT NULL DEFAULT '',
source_kind TEXT NOT NULL DEFAULT '',
source_config TEXT NOT NULL DEFAULT '{}',
trigger_kind TEXT NOT NULL DEFAULT '',
trigger_config TEXT NOT NULL DEFAULT '{}',
public_faces TEXT NOT NULL DEFAULT '[]',
parent_workload_id TEXT NOT NULL DEFAULT '',
notification_url TEXT NOT NULL DEFAULT '',
notification_secret TEXT NOT NULL DEFAULT '',
webhook_secret TEXT NOT NULL DEFAULT '',
@@ -231,6 +246,34 @@ func (s *Store) runMigrations() error {
created_at TEXT NOT NULL DEFAULT (datetime('now')),
updated_at TEXT NOT NULL DEFAULT (datetime('now'))
)`,
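// workload_env: per-workload env overrides (encrypt-at-rest for
// secrets). Functional analog of stage_env. The FK declares ON DELETE
// CASCADE, so workload deletion removes these rows when SQLite foreign-key
// enforcement is enabled. NOTE(review): log_scan_rules.go states FK
// constraints are not enforced repo-wide; if so this cascade never fires
// and orphan rows are possible — confirm.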
`CREATE TABLE IF NOT EXISTS workload_env (
id TEXT PRIMARY KEY,
workload_id TEXT NOT NULL REFERENCES workloads(id) ON DELETE CASCADE,
key TEXT NOT NULL,
value TEXT NOT NULL DEFAULT '',
encrypted INTEGER NOT NULL DEFAULT 0,
created_at TEXT NOT NULL DEFAULT (datetime('now')),
updated_at TEXT NOT NULL DEFAULT (datetime('now')),
UNIQUE(workload_id, key)
)`,
// workload_volumes: per-workload mount declarations. Mirrors the
// legacy `volumes` table shape (source / target / scope / name)
// but keyed on workload_id. UNIQUE on (workload_id, target) so a
// re-add overwrites instead of duplicating.
`CREATE TABLE IF NOT EXISTS workload_volumes (
id TEXT PRIMARY KEY,
workload_id TEXT NOT NULL REFERENCES workloads(id) ON DELETE CASCADE,
source TEXT NOT NULL DEFAULT '',
target TEXT NOT NULL,
scope TEXT NOT NULL DEFAULT 'absolute',
name TEXT NOT NULL DEFAULT '',
created_at TEXT NOT NULL DEFAULT (datetime('now')),
updated_at TEXT NOT NULL DEFAULT (datetime('now')),
UNIQUE(workload_id, target)
)`,
}
for _, t := range workloadTables {
if _, err := s.db.Exec(t); err != nil {
@@ -312,6 +355,49 @@ func (s *Store) runMigrations() error {
}
}
// Observability: event_triggers — consume EventLog entries off the
// bus and dispatch webhook actions. Schema kept flat (comma-list
// filters, single optional regex) — see LOGSCAN_AND_TRIGGERS_TODO.md.
observabilityTables := []string{
`CREATE TABLE IF NOT EXISTS event_triggers (
id INTEGER PRIMARY KEY AUTOINCREMENT,
name TEXT NOT NULL,
filter_severity TEXT NOT NULL DEFAULT '',
filter_source TEXT NOT NULL DEFAULT '',
filter_message_regex TEXT NOT NULL DEFAULT '',
action_type TEXT NOT NULL DEFAULT 'webhook',
action_target TEXT NOT NULL DEFAULT '',
action_secret TEXT NOT NULL DEFAULT '',
enabled INTEGER NOT NULL DEFAULT 1,
created_at TEXT NOT NULL DEFAULT (datetime('now')),
updated_at TEXT NOT NULL DEFAULT (datetime('now'))
)`,
// log_scan_rules: regex patterns the log-scanner manager
// applies to container log lines. workload_id is NOT NULL and uses
// the empty string as its "no workload" sentinel: a global rule has
// workload_id = '' and overrides_id = 0, while a per-workload override
// sets overrides_id to the id of the global rule it replaces.
`CREATE TABLE IF NOT EXISTS log_scan_rules (
id INTEGER PRIMARY KEY AUTOINCREMENT,
workload_id TEXT NOT NULL DEFAULT '',
overrides_id INTEGER NOT NULL DEFAULT 0,
name TEXT NOT NULL,
pattern TEXT NOT NULL,
severity TEXT NOT NULL DEFAULT 'warn',
streams TEXT NOT NULL DEFAULT 'all',
cooldown_seconds INTEGER NOT NULL DEFAULT 60,
enabled INTEGER NOT NULL DEFAULT 1,
created_at TEXT NOT NULL DEFAULT (datetime('now')),
updated_at TEXT NOT NULL DEFAULT (datetime('now'))
)`,
`CREATE INDEX IF NOT EXISTS idx_log_scan_rules_workload ON log_scan_rules(workload_id)`,
`CREATE INDEX IF NOT EXISTS idx_log_scan_rules_overrides ON log_scan_rules(overrides_id)`,
}
for _, t := range observabilityTables {
if _, err := s.db.Exec(t); err != nil {
return fmt.Errorf("create observability table: %w", err)
}
}
for _, m := range migrations {
if _, err := s.db.Exec(m); err != nil {
// "duplicate column" / "already exists" are expected when a
@@ -366,6 +452,8 @@ func (s *Store) runMigrations() error {
`CREATE INDEX IF NOT EXISTS idx_containers_container_id ON containers(container_id) WHERE container_id != ''`,
`CREATE INDEX IF NOT EXISTS idx_containers_kind ON containers(workload_kind)`,
`CREATE INDEX IF NOT EXISTS idx_containers_stage_id ON containers(stage_id) WHERE stage_id != ''`,
`CREATE INDEX IF NOT EXISTS idx_workload_env_workload ON workload_env(workload_id)`,
`CREATE INDEX IF NOT EXISTS idx_workload_volumes_workload ON workload_volumes(workload_id)`,
}
for _, idx := range indexes {
if _, err := s.db.Exec(idx); err != nil {