fix: harden security, fix concurrency bugs, and address review findings
Build / build (push) Successful in 11m42s
Build / build (push) Successful in 11m42s
Security:
- rate limit /api/webhook routes per-IP and cap concurrent site syncs
- global SSE connection cap (256) with new sse_gate
- validate ?tail= and cap JSON log responses at 4 MiB
- strip ANSI/CSI/OSC and control bytes from streamed log lines
- redact webhook secret from request log middleware
- scrub host details from /api/health for non-admin viewers
- drop container_id from /api/system/stats/top for non-admins
- generate webhook secrets via crypto/rand; require >=32 chars on insert
- verify iid path consistency in streamContainerLogs
- LimitReader on site webhook body; reject malformed non-empty bodies

Concurrency / correctness:
- stats collector: Stop() no longer hangs without Start(), semaphore acquired in parent loop so ctx cancellation short-circuits the queue, in-flight tick cancellable via shared base context, zero-ts guard
- webhook handler: replace fire-and-forget goroutine with WaitGroup-tracked workers + Drain() wired into graceful shutdown
- $derived(() => ...) mis-idiom fixed in ContainerStats / InstanceCard / ProjectCard (returned function instead of value)
- SystemResourcesCard: rename `window` and `t` locals to avoid shadowing globalThis.window and the i18n `t` import

Quality / performance:
- replace O(n^2) insertion sort with sort.Slice in stats top
- runMigrations only swallows duplicate-column / already-exists errors
- PruneStatsSamplesBefore wrapped in a transaction
- collapse N+1 in unusedImageStats / pruneImages to one ListAllInstances pass; surface DB errors instead of silently treating them as inactive
- run Docker Info + DiskUsage in parallel via errgroup
- container log SSE emits `: ping` heartbeat every 20 s
- imageMatches case-insensitive on registry host (RFC behaviour)
- log warning on invalid stage tag pattern instead of silent skip
- reject malformed non-empty site webhook payloads

Frontend / i18n:
- shared formatBytes utility replaces three local copies
- statsInterval store drives dynamic "no samples / collection disabled" copy across ContainerStats and SystemResourcesCard
- top consumers row now shows owner_name (project/stage or site name)
- drop seven `as any` casts on the Settings type; add cloudflare_api_token write-only field
- move "Service status", "Docker daemon", "Docker unreachable", "Proxy unreachable", "reachable", and "Docker daemon is not reachable." strings into en/ru i18n bundles
This commit is contained in:
+46
-12
@@ -36,9 +36,11 @@ type Collector struct {
|
||||
store *store.Store
|
||||
docker *docker.Client
|
||||
|
||||
stopOnce sync.Once
|
||||
stop chan struct{}
|
||||
done chan struct{}
|
||||
startOnce sync.Once
|
||||
stopOnce sync.Once
|
||||
started bool
|
||||
stop chan struct{}
|
||||
done chan struct{}
|
||||
}
|
||||
|
||||
// New creates a new stats collector. Call Start to begin sampling.
|
||||
@@ -52,15 +54,24 @@ func New(s *store.Store, d *docker.Client) *Collector {
|
||||
}
|
||||
|
||||
// Start launches the background loop. Returns immediately. The loop exits
|
||||
// when Stop is called.
|
||||
// when Stop is called. Safe to call multiple times — only the first call has
|
||||
// an effect.
|
||||
func (c *Collector) Start() {
|
||||
go c.run()
|
||||
c.startOnce.Do(func() {
|
||||
c.started = true
|
||||
go c.run()
|
||||
})
|
||||
}
|
||||
|
||||
// Stop signals the collector to exit and blocks until it has finished the
|
||||
// in-flight tick.
|
||||
// in-flight tick. If Start was never called, Stop returns immediately.
|
||||
func (c *Collector) Stop() {
|
||||
c.stopOnce.Do(func() { close(c.stop) })
|
||||
c.stopOnce.Do(func() {
|
||||
close(c.stop)
|
||||
if !c.started {
|
||||
close(c.done)
|
||||
}
|
||||
})
|
||||
<-c.done
|
||||
}
|
||||
|
||||
@@ -70,6 +81,15 @@ func (c *Collector) Stop() {
|
||||
func (c *Collector) run() {
|
||||
defer close(c.done)
|
||||
|
||||
// Derive a base context that's cancelled when Stop is called so in-flight
|
||||
// Docker requests abort instead of waiting out their timeout.
|
||||
baseCtx, cancel := context.WithCancel(context.Background())
|
||||
defer cancel()
|
||||
go func() {
|
||||
<-c.stop
|
||||
cancel()
|
||||
}()
|
||||
|
||||
// Wait a few seconds before the first sample so the app has settled.
|
||||
select {
|
||||
case <-time.After(3 * time.Second):
|
||||
@@ -90,7 +110,7 @@ func (c *Collector) run() {
|
||||
}
|
||||
}
|
||||
|
||||
c.tick(retention)
|
||||
c.tick(baseCtx, retention)
|
||||
|
||||
select {
|
||||
case <-time.After(time.Duration(interval) * time.Second):
|
||||
@@ -126,8 +146,8 @@ func (c *Collector) readConfig() (intervalSeconds, retentionHours int) {
|
||||
// persists samples, and prunes rows beyond the retention window. When
|
||||
// the Docker daemon is unreachable the whole tick is skipped with a
|
||||
// single debug log instead of one warning per container.
|
||||
func (c *Collector) tick(retentionHours int) {
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
|
||||
func (c *Collector) tick(parent context.Context, retentionHours int) {
|
||||
ctx, cancel := context.WithTimeout(parent, 30*time.Second)
|
||||
defer cancel()
|
||||
|
||||
pingCtx, pingCancel := context.WithTimeout(ctx, 2*time.Second)
|
||||
@@ -224,10 +244,20 @@ func (c *Collector) sampleAll(ctx context.Context, targets []target) []store.Con
|
||||
|
||||
var wg sync.WaitGroup
|
||||
for i, t := range targets {
|
||||
// Acquire the semaphore in the parent loop so ctx cancellation
|
||||
// short-circuits the queue rather than spawning goroutines that
|
||||
// block on an unreachable slot.
|
||||
select {
|
||||
case sem <- struct{}{}:
|
||||
case <-ctx.Done():
|
||||
break
|
||||
}
|
||||
if ctx.Err() != nil {
|
||||
break
|
||||
}
|
||||
wg.Add(1)
|
||||
go func(i int, t target) {
|
||||
defer wg.Done()
|
||||
sem <- struct{}{}
|
||||
defer func() { <-sem }()
|
||||
|
||||
sampleCtx, cancel := context.WithTimeout(ctx, 10*time.Second)
|
||||
@@ -278,8 +308,12 @@ func (c *Collector) recordSystemSample(ctx context.Context, workloadCPU float64,
|
||||
slog.Warn("stats collector: get system stats", "error", err)
|
||||
return
|
||||
}
|
||||
ts := sysStats.Timestamp.Unix()
|
||||
if ts <= 0 {
|
||||
ts = time.Now().UTC().Unix()
|
||||
}
|
||||
sample := store.SystemStatsSample{
|
||||
TS: sysStats.Timestamp.Unix(),
|
||||
TS: ts,
|
||||
NCPU: sysStats.NCPU,
|
||||
MemoryTotal: sysStats.MemoryTotal,
|
||||
WorkloadCPUPercent: workloadCPU,
|
||||
|
||||
Reference in New Issue
Block a user