feat(observability): event triggers + log scanner backend

Two paired backends sharing the events.Bus seam:

Event triggers (consumer-side):
- internal/store/event_triggers.go — CRUD with action_secret
  redaction on read (placeholder echo treated as "no change" on
  PATCH so secrets aren't accidentally wiped).
- internal/events/dispatcher.go — bus subscriber, AND-composed
  filters (severity CSV, source CSV, message regex with memoized
  compile cache). Structural loop-prevention: the dispatcher never
  writes to event_log. Sends via notifier.SendPayload.
- internal/notify: SendPayload + SendSyncForTestPayload methods,
  TierEventTrigger constant, doSendRaw shared with the legacy
  Event-shaped path.
- internal/api/event_triggers.go — admin-gated CRUD + /test
  sending the real TriggerWebhookPayload shape. SSRF guard
  rejects loopback / link-local / unspecified targets. PATCH
  uses pointer-typed DTO for partial updates.

Log scanner (producer-side):
- internal/logscanner/ — three components: engine (per-rule
  cooldown + per-container token bucket, atomic drop counters);
  tail (multiplexed docker frame demuxer with TTY fallback + 16 MiB
  payload cap + 1 MiB reassembly cap + RFC3339Nano-validated
  timestamp strip + UTF-8-safe message truncation); manager
  (5s container polling, atomic.Pointer[Snapshot] hot-reload,
  HitEmitter writes event_log + publishes EventLog so the
  trigger dispatcher picks hits up immediately).
- internal/docker/container.go — ContainerLogsOpts exposes
  stream selection for stderr-only / stdout-only rules.
- internal/store: log_scan_rules table + CRUD with
  EffectiveLogScanRules resolver (globals minus per-workload
  overrides plus workload-only additions). Transactional
  cascade-delete of overrides when a global rule is removed.
- internal/api/log_scan_rules.go — admin-gated CRUD + /test
  (sample_line → matched/captures) + /stats (drop counters +
  active tail count + last-snapshot compile errors) +
  GET /api/workloads/{id}/effective-rules.

cmd/server/main.go wires both subsystems next to the existing
RegisterPersistentLogger. Coverage spans engine cooldown / bucket
counter tests, snapshot effective-set semantics, manager
compile-error capture, dispatcher matching, store validation +
cascade-delete, and API URL validator + secret redaction.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-11 22:18:11 +03:00
parent 82d32181ba
commit 7a9ff7ad54
23 changed files with 3974 additions and 19 deletions
+84 -5
View File
@@ -50,6 +50,7 @@ type Server struct {
stackManager *stack.Manager
backupEngine *backup.Engine
sseGate *sseGate
logScanReloader LogScanReloader
dbPath string
shutdownFunc func() // called after restore to trigger graceful shutdown
onBackupSettingsChanged func(enabled bool, intervalHours int) // called when backup settings change
@@ -217,13 +218,26 @@ func (s *Server) Router() chi.Router {
r.Group(func(r chi.Router) {
r.Use(auth.Middleware(s.localAuth))
// Plugin registry inspection + unified ingress (Workload refactor).
// /hooks/kinds is informational and visible to any authenticated
// caller. /hooks/generic dispatches deploys and is admin-gated —
// vendor-specific webhooks (with their own per-target HMAC
// secrets) live under /webhook/* and remain the only ingress
// reachable by external CI systems until Phase 5 consolidates them.
r.Get("/hooks/kinds", s.listHookKinds)
r.Get("/hooks/kinds/{kind}/schema", s.getHookKindSchema)
r.With(auth.AdminOnly).Post("/hooks/generic", s.dispatchGeneric)
// Read-only endpoints (any authenticated user).
r.Get("/health", s.getHealth)
r.Get("/auth/me", s.currentUser)
r.Post("/auth/logout", s.logout)
r.Get("/proxies", s.listProxyRoutes)
r.Get("/docker/unused-images", s.unusedImageStats)
r.Get("/projects", s.listProjects)
// Legacy project/stage/site/stack endpoints carry a Deprecation
// header pointing at /api/workloads. Functional behavior is
// unchanged until the hard cutover removes them.
r.With(deprecated("/api/workloads")).Get("/projects", s.listProjects)
r.Route("/projects/{id}", func(r chi.Router) {
r.Get("/", s.getProject)
r.Get("/stages/{stage}/env", s.listStageEnv)
@@ -290,7 +304,7 @@ func (s *Server) Router() chi.Router {
})
})
// Stacks (docker-compose).
r.Get("/stacks", s.listStacks)
r.With(deprecated("/api/workloads?kind=plugin&source_kind=compose")).Get("/stacks", s.listStacks)
r.Route("/stacks/{id}", func(r chi.Router) {
r.Get("/", s.getStack)
r.Get("/revisions", s.listStackRevisions)
@@ -311,7 +325,7 @@ func (s *Server) Router() chi.Router {
r.With(auth.AdminOnly).Post("/stacks", s.createStack)
// Static sites.
r.Get("/sites", s.listStaticSites)
r.With(deprecated("/api/workloads?kind=plugin&source_kind=static")).Get("/sites", s.listStaticSites)
r.Route("/sites/{id}", func(r chi.Router) {
r.Get("/", s.getStaticSite)
r.Get("/secrets", s.listStaticSiteSecrets)
@@ -375,13 +389,47 @@ func (s *Server) Router() chi.Router {
r.Get("/containers/stale", s.listStaleContainers)
// Workload-shaped endpoints (the unifying layer over project /
// stack / site). Read-only; mutations still go through the
// kind-specific endpoints (POST /projects, PUT /stacks/{id}, …).
// stack / site). Read endpoints are open to any authenticated
// user; create / update / deploy mutate state and are admin-gated.
// Plugin-native workloads (source_kind + trigger_kind set) are
// created here; legacy project / stack / site mutations remain at
// their dedicated endpoints during the cutover.
r.Get("/workloads", s.listWorkloads)
r.With(auth.AdminOnly).Post("/workloads", s.createPluginWorkload)
r.Route("/workloads/{id}", func(r chi.Router) {
r.Get("/", s.getWorkload)
r.Get("/containers", s.listWorkloadContainers)
r.Get("/containers/{cid}/logs", s.streamWorkloadContainerLogs)
r.With(auth.AdminOnly).Patch("/app", s.updateWorkloadAppID)
r.With(auth.AdminOnly).Put("/plugin", s.updatePluginWorkload)
r.With(auth.AdminOnly).Post("/deploy", s.deployPluginWorkload)
r.With(auth.AdminOnly).Delete("/", s.deletePluginWorkload)
// Per-workload env vars (analog of legacy stage_env).
// Listing is open to authenticated readers; mutations are
// admin-gated. Encrypted values are write-only after store.
r.Get("/env", s.listWorkloadEnv)
r.With(auth.AdminOnly).Put("/env", s.setWorkloadEnv)
r.With(auth.AdminOnly).Delete("/env/{envID}", s.deleteWorkloadEnv)
// Per-workload inbound webhook URL: rotate the secret + fetch
// the canonical URL. Mirrors the project / site webhook UX.
r.With(auth.AdminOnly).Get("/webhook", s.getWorkloadWebhook)
r.With(auth.AdminOnly).Post("/webhook/regenerate", s.regenerateWorkloadWebhook)
// Per-workload volume mounts (analog of legacy project volumes).
// Reads are open to authenticated users; mutations admin-gated.
// Source/target paths are validated for traversal safety here;
// host-path allow-listing happens at deploy time.
r.Get("/volumes", s.listWorkloadVolumes)
r.With(auth.AdminOnly).Put("/volumes", s.setWorkloadVolume)
r.With(auth.AdminOnly).Delete("/volumes/{volID}", s.deleteWorkloadVolume)
// Stages chain: parent + self + direct children, plus a
// promote-from action that copies the source workload's
// running image tag onto this workload's default_tag.
r.Get("/chain", s.getWorkloadChain)
r.With(auth.AdminOnly).Post("/promote-from/{sourceID}", s.promoteFromWorkload)
})
// Global container index, joined to workload + app names.
@@ -398,6 +446,37 @@ func (s *Server) Router() chi.Router {
r.Delete("/apps/{id}", s.deleteApp)
})
// Event triggers: filter+action rules over the event_log
// stream. Read endpoints are available to any authenticated
// user; mutations + test-dispatch are admin-gated since they
// can fire arbitrary outbound webhooks.
r.Get("/event-triggers", s.listEventTriggers)
r.Get("/event-triggers/{id}", s.getEventTrigger)
r.Group(func(r chi.Router) {
r.Use(auth.AdminOnly)
r.Post("/event-triggers", s.createEventTrigger)
r.Patch("/event-triggers/{id}", s.updateEventTrigger)
r.Delete("/event-triggers/{id}", s.deleteEventTrigger)
r.Post("/event-triggers/{id}/test", s.testEventTrigger)
})
// Log-scan rules: regex patterns the scanner manager
// applies to container log lines. Read endpoints are
// available to any authenticated user; mutations are
// admin-gated since they can change global observability
// behavior across every workload.
r.Get("/log-scan-rules", s.listLogScanRules)
r.Get("/log-scan-rules/stats", s.getLogScanStats)
r.Get("/log-scan-rules/{id}", s.getLogScanRule)
r.Get("/workloads/{id}/effective-rules", s.getEffectiveLogScanRules)
r.Group(func(r chi.Router) {
r.Use(auth.AdminOnly)
r.Post("/log-scan-rules", s.createLogScanRule)
r.Patch("/log-scan-rules/{id}", s.updateLogScanRule)
r.Delete("/log-scan-rules/{id}", s.deleteLogScanRule)
r.Post("/log-scan-rules/{id}/test", s.testLogScanRule)
})
// System resources (read-only).
r.Get("/system/stats", s.getSystemStats)
r.Get("/system/stats/history", s.getSystemStatsHistory)