feat(observability): event triggers + log scanner backend

Two paired backends sharing the events.Bus seam:

Event triggers (consumer-side):
- internal/store/event_triggers.go — CRUD with action_secret
  redaction on read. A PATCH that echoes the redaction placeholder
  back is treated as "no change", so secrets aren't accidentally
  wiped.
- internal/events/dispatcher.go — bus subscriber, AND-composed
  filters (severity CSV, source CSV, message regex with memoized
  compile cache). Structural loop-prevention: never writes to
  event_log. Sends via notifier.SendPayload.
- internal/notify: SendPayload + SendSyncForTestPayload methods,
  TierEventTrigger constant, doSendRaw shared with the legacy
  Event-shaped path.
- internal/api/event_triggers.go — admin-gated CRUD + /test
  sending the real TriggerWebhookPayload shape. SSRF guard
  rejects loopback / link-local / unspecified targets. PATCH
  uses pointer-typed DTO for partial updates.

Log scanner (producer-side):
- internal/logscanner/ — engine (per-rule cooldown +
  per-container token bucket, atomic drop counters), tail
  (multiplexed docker frame demuxer with TTY fallback + 16 MiB
  payload cap + 1 MiB reassembly cap + RFC3339Nano-validated
  timestamp strip + UTF-8-safe message truncation), manager
  (5s container polling, atomic.Pointer[Snapshot] hot-reload,
  HitEmitter writes event_log + publishes EventLog so the
  trigger dispatcher picks up hits immediately).
- internal/docker/container.go — ContainerLogsOpts exposes
  stream selection for stderr-only / stdout-only rules.
- internal/store: log_scan_rules table + CRUD with
  EffectiveLogScanRules resolver (globals minus per-workload
  overrides plus workload-only additions). Transactional
  cascade-delete of overrides when a global rule is removed.
- internal/api/log_scan_rules.go — admin-gated CRUD + /test
  (sample_line → matched/captures) + /stats (drop counters +
  active tail count + last-snapshot compile errors) +
  GET /api/workloads/{id}/effective-rules.

cmd/server/main.go wires both subsystems next to the existing
RegisterPersistentLogger. Tests cover engine cooldown / bucket
counters, snapshot effective-set semantics, manager compile-error
capture, dispatcher matching, store validation + cascade-delete,
and the API URL validator + secret redaction.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-11 22:18:11 +03:00
parent 82d32181ba
commit 7a9ff7ad54
23 changed files with 3974 additions and 19 deletions
+52
View File
@@ -27,6 +27,7 @@ import (
"github.com/alexei/tinyforge/internal/events"
"github.com/alexei/tinyforge/internal/health"
"github.com/alexei/tinyforge/internal/logging"
"github.com/alexei/tinyforge/internal/logscanner"
"github.com/alexei/tinyforge/internal/notify"
"github.com/alexei/tinyforge/internal/npm"
"github.com/alexei/tinyforge/internal/proxy"
@@ -38,6 +39,16 @@ import (
"github.com/alexei/tinyforge/internal/staticsite"
"github.com/alexei/tinyforge/internal/store"
"github.com/alexei/tinyforge/internal/webhook"
// Plugin registrations: each blank-import runs its init() and registers
// itself with internal/workload/plugin. Adding a new Source or Trigger
// is a matter of dropping a new package and adding it to this list.
_ "github.com/alexei/tinyforge/internal/workload/plugin/source/compose"
_ "github.com/alexei/tinyforge/internal/workload/plugin/source/image"
_ "github.com/alexei/tinyforge/internal/workload/plugin/source/static"
_ "github.com/alexei/tinyforge/internal/workload/plugin/trigger/git"
_ "github.com/alexei/tinyforge/internal/workload/plugin/trigger/manual"
_ "github.com/alexei/tinyforge/internal/workload/plugin/trigger/registry"
)
func main() {
@@ -105,6 +116,9 @@ func main() {
rec := reconciler.New(db, dockerClient, 30*time.Second)
rec.Start(context.Background())
defer rec.Stop()
// The plugin pass is wired after the deployer is constructed (below);
// the reconciler tolerates a nil dispatcher until then. SetPluginReconciler
// is safe to call at any time, including mid-tick.
// Read settings for NPM URL and polling interval.
settings, err := db.GetSettings()
@@ -166,12 +180,24 @@ func main() {
})
defer stopLogger()
// Event-trigger dispatcher: consume EventLog publishes off the bus
// and fan out to operator-configured webhook actions. Loop-prevention
// is structural — the dispatcher never writes back to event_log; all
// delivery outcomes land in notifier audit logging.
stopTriggerDispatcher := events.RegisterEventTriggerDispatcher(eventBus, db, notifier)
defer stopTriggerDispatcher()
dep := deployer.New(dockerClient, proxyProvider, db, healthChecker, notifier, eventBus, encKey)
rec.SetPluginReconciler(dep)
// Initialize webhook handler. Per-project and per-site secrets are stored
// on their respective rows; the static-site triggerer is wired in below
// once the site manager has been constructed.
webhookHandler := webhook.NewHandler(db, dep, nil)
// Plugin-pipeline dispatcher for /api/webhook/workloads/{secret}.
// Wired here so the same *deployer.Deployer serves both legacy and
// plugin-native paths from one place.
webhookHandler.SetPluginDispatcher(dep)
// Initialize registry poller.
poller := registry.NewPoller(db, dep, encKey)
@@ -322,6 +348,11 @@ func main() {
// Initialize static site manager and health checker.
staticSiteMgr := staticsite.NewManager(db, dockerClient, proxyProvider, eventBus, notifier, encKey)
webhookHandler.SetSiteSyncTriggerer(staticSiteMgr)
// Wire the plugin static source's backend to the manager. After this
// call the "static" kind appears in /api/hooks/kinds and the /apps/new
// picker; before it, the source registers no kind, so the frontend
// silently omits it.
wireStaticBackend(db, staticSiteMgr)
staticSiteHealth := staticsite.NewHealthChecker(db, dockerClient, staticSiteMgr)
if err := staticSiteHealth.Start("2m"); err != nil {
slog.Warn("failed to start static site health checker", "error", err)
@@ -339,6 +370,26 @@ func main() {
stackMgr = nil
}
// Log-scan manager: tails running containers and emits event_log
// entries when log lines match operator-configured regex rules.
// Start before the API server is wired so the reload callback can
// be plugged in via SetLogScanReloader.
logScanMgr := logscanner.NewManager(logscanner.Config{
Rules: db,
Containers: db,
Docker: dockerClient,
Events: db,
Bus: eventBus,
PollInterval: 5 * time.Second,
})
// Manager owns its own cancellation; Stop() drives the loop and
// every tail to exit. Using Background here matches the
// reconciler + stale-scanner pattern elsewhere in this file.
if err := logScanMgr.Start(context.Background()); err != nil {
slog.Warn("logscanner: initial rule load failed", "error", err)
}
defer logScanMgr.Stop()
// Build API server.
apiServer := api.NewServer(db, dockerClient, npmClient, proxyProvider, dep, notifier, webhookHandler, eventBus, encKey)
apiServer.SetStaticSiteManager(staticSiteMgr)
@@ -346,6 +397,7 @@ func main() {
apiServer.SetStackManager(stackMgr)
}
apiServer.SetStaleScanner(staleScanner)
apiServer.SetLogScanReloader(logScanMgr)
apiServer.SetBackupEngine(backupEngine)
apiServer.SetDBPath(dbPath)
apiServer.SetBackupSettingsChangedCallback(scheduleAutobackup)