fix: harden security, fix concurrency bugs, and address review findings
Build / build (push) Successful in 11m42s

Security:
- rate limit /api/webhook routes per-IP and cap concurrent site syncs
- global SSE connection cap (256) with new sse_gate
- validate ?tail= and cap JSON log responses at 4 MiB
- strip ANSI/CSI/OSC and control bytes from streamed log lines
- redact webhook secret from request log middleware
- scrub host details from /api/health for non-admin viewers
- drop container_id from /api/system/stats/top for non-admins
- generate webhook secrets via crypto/rand; require >=32 chars on insert
- verify iid path consistency in streamContainerLogs
- LimitReader on site webhook body; reject malformed non-empty bodies

Concurrency / correctness:
- stats collector: Stop() no longer hangs when Start() was never called; the
  semaphore is acquired in the parent loop so ctx cancellation short-circuits
  the queue; the in-flight tick is cancellable via a shared base context; a
  zero or negative sample timestamp falls back to the current UTC time
- webhook handler: replace fire-and-forget goroutine with WaitGroup-tracked
  workers + Drain() wired into graceful shutdown
- fix `$derived(() => ...)` misuse in ContainerStats / InstanceCard /
  ProjectCard (the expression yielded a function instead of the value)
- SystemResourcesCard: rename `window` and `t` locals to avoid shadowing
  globalThis.window and the i18n `t` import

Quality / performance:
- replace O(n^2) insertion sort with sort.Slice in stats top
- runMigrations only swallows duplicate-column / already-exists errors
- PruneStatsSamplesBefore wrapped in a transaction
- collapse N+1 in unusedImageStats / pruneImages to one ListAllInstances
  pass; surface DB errors instead of silently treating them as inactive
- run Docker Info + DiskUsage in parallel via errgroup
- container log SSE emits `: ping` heartbeat every 20 s
- imageMatches compares the registry host case-insensitively (host names are
  case-insensitive per RFC 3986)
- log warning on invalid stage tag pattern instead of silent skip
- reject malformed non-empty site webhook payloads

Frontend / i18n:
- shared formatBytes utility replaces three local copies
- statsInterval store drives dynamic "no samples / collection disabled"
  copy across ContainerStats and SystemResourcesCard
- top consumers row now shows owner_name (project/stage or site name)
- drop seven `as any` casts on the Settings type; add cloudflare_api_token
  write-only field
- move "Service status", "Docker daemon", "Docker unreachable",
  "Proxy unreachable", "reachable", and "Docker daemon is not reachable."
  strings into en/ru i18n bundles
This commit is contained in:
2026-05-07 00:56:14 +03:00
parent 05440a5f92
commit a4362b842d
39 changed files with 1249 additions and 213 deletions
+46 -12
View File
@@ -36,9 +36,11 @@ type Collector struct {
store *store.Store
docker *docker.Client
stopOnce sync.Once
stop chan struct{}
done chan struct{}
startOnce sync.Once
stopOnce sync.Once
started bool
stop chan struct{}
done chan struct{}
}
// New creates a new stats collector. Call Start to begin sampling.
@@ -52,15 +54,24 @@ func New(s *store.Store, d *docker.Client) *Collector {
}
// Start launches the background sampling loop and returns immediately. The
// loop exits when Stop is called.
//
// Start is safe to call multiple times: startOnce guarantees that only the
// first call spawns the loop, so run (which closes c.done when it returns)
// is never started twice.
func (c *Collector) Start() {
	c.startOnce.Do(func() {
		// Mark the collector as started before the loop goroutine is spawned.
		c.started = true
		go c.run()
	})
}
// Stop signals the collector to exit and blocks until the background loop has
// finished its in-flight tick. If Start was never called, Stop returns
// immediately.
//
// Stop is safe to call multiple times. Only the first call closes c.stop;
// every call then waits for c.done to be closed.
func (c *Collector) Stop() {
	c.stopOnce.Do(func() {
		close(c.stop)
		// Claim startOnce instead of reading c.started. If the loop was
		// already launched through startOnce this is a no-op and run closes
		// c.done when it exits. Otherwise no loop exists, so c.done is closed
		// here, and because startOnce is now consumed a later
		// startOnce-guarded launch cannot spawn a loop that would close
		// c.done a second time. sync.Once also orders this against a
		// concurrent Start, avoiding an unsynchronised read of c.started.
		c.startOnce.Do(func() { close(c.done) })
	})
	<-c.done
}
@@ -70,6 +81,15 @@ func (c *Collector) Stop() {
func (c *Collector) run() {
defer close(c.done)
// Derive a base context that's cancelled when Stop is called so in-flight
// Docker requests abort instead of waiting out their timeout.
baseCtx, cancel := context.WithCancel(context.Background())
defer cancel()
go func() {
<-c.stop
cancel()
}()
// Wait a few seconds before the first sample so the app has settled.
select {
case <-time.After(3 * time.Second):
@@ -90,7 +110,7 @@ func (c *Collector) run() {
}
}
c.tick(retention)
c.tick(baseCtx, retention)
select {
case <-time.After(time.Duration(interval) * time.Second):
@@ -126,8 +146,8 @@ func (c *Collector) readConfig() (intervalSeconds, retentionHours int) {
// persists samples, and prunes rows beyond the retention window. When
// the Docker daemon is unreachable the whole tick is skipped with a
// single debug log instead of one warning per container.
func (c *Collector) tick(retentionHours int) {
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
func (c *Collector) tick(parent context.Context, retentionHours int) {
ctx, cancel := context.WithTimeout(parent, 30*time.Second)
defer cancel()
pingCtx, pingCancel := context.WithTimeout(ctx, 2*time.Second)
@@ -224,10 +244,20 @@ func (c *Collector) sampleAll(ctx context.Context, targets []target) []store.Con
var wg sync.WaitGroup
for i, t := range targets {
// Acquire the semaphore in the parent loop so ctx cancellation
// short-circuits the queue rather than spawning goroutines that
// block on an unreachable slot.
select {
case sem <- struct{}{}:
case <-ctx.Done():
break
}
if ctx.Err() != nil {
break
}
wg.Add(1)
go func(i int, t target) {
defer wg.Done()
sem <- struct{}{}
defer func() { <-sem }()
sampleCtx, cancel := context.WithTimeout(ctx, 10*time.Second)
@@ -278,8 +308,12 @@ func (c *Collector) recordSystemSample(ctx context.Context, workloadCPU float64,
slog.Warn("stats collector: get system stats", "error", err)
return
}
ts := sysStats.Timestamp.Unix()
if ts <= 0 {
ts = time.Now().UTC().Unix()
}
sample := store.SystemStatsSample{
TS: sysStats.Timestamp.Unix(),
TS: ts,
NCPU: sysStats.NCPU,
MemoryTotal: sysStats.MemoryTotal,
WorkloadCPUPercent: workloadCPU,