fix: harden security, fix concurrency bugs, and address review findings
Build / build (push) Successful in 11m42s
Build / build (push) Successful in 11m42s
Security: - rate limit /api/webhook routes per-IP and cap concurrent site syncs - global SSE connection cap (256) with new sse_gate - validate ?tail= and cap JSON log responses at 4 MiB - strip ANSI/CSI/OSC and control bytes from streamed log lines - redact webhook secret from request log middleware - scrub host details from /api/health for non-admin viewers - drop container_id from /api/system/stats/top for non-admins - generate webhook secrets via crypto/rand; require >=32 chars on insert - verify iid path consistency in streamContainerLogs - LimitReader on site webhook body; reject malformed non-empty bodies Concurrency / correctness: - stats collector: Stop() no longer hangs without Start(), semaphore acquired in parent loop so ctx cancellation short-circuits the queue, in-flight tick cancellable via shared base context, zero-ts guard - webhook handler: replace fire-and-forget goroutine with WaitGroup-tracked workers + Drain() wired into graceful shutdown - $derived(() => ...) mis-idiom fixed in ContainerStats / InstanceCard / ProjectCard (returned function instead of value) - SystemResourcesCard: rename `window` and `t` locals to avoid shadowing globalThis.window and the i18n `t` import Quality / performance: - replace O(n^2) insertion sort with sort.Slice in stats top - runMigrations only swallows duplicate-column / already-exists errors - PruneStatsSamplesBefore wrapped in a transaction - collapse N+1 in unusedImageStats / pruneImages to one ListAllInstances pass; surface DB errors instead of silently treating them as inactive - run Docker Info + DiskUsage in parallel via errgroup - container log SSE emits `: ping` heartbeat every 20 s - imageMatches case-insensitive on registry host (RFC behaviour) - log warning on invalid stage tag pattern instead of silent skip - reject malformed non-empty site webhook payloads Frontend / i18n: - shared formatBytes utility replaces three local copies - statsInterval store drives dynamic "no samples / collection disabled" 
copy across ContainerStats and SystemResourcesCard - top consumers row now shows owner_name (project/stage or site name) - drop seven `as any` casts on the Settings type; add cloudflare_api_token write-only field - move "Service status", "Docker daemon", "Docker unreachable", "Proxy unreachable", "reachable", and "Docker daemon is not reachable." strings into en/ru i18n bundles
This commit is contained in:
@@ -4,15 +4,30 @@ import (
|
||||
"errors"
|
||||
"log/slog"
|
||||
"net/http"
|
||||
"sort"
|
||||
"strconv"
|
||||
"time"
|
||||
|
||||
"github.com/go-chi/chi/v5"
|
||||
|
||||
"github.com/alexei/tinyforge/internal/auth"
|
||||
"github.com/alexei/tinyforge/internal/stats"
|
||||
"github.com/alexei/tinyforge/internal/store"
|
||||
)
|
||||
|
||||
// topConsumerMinWindow is the lower bound on how recent a container sample must be to count toward
|
||||
// the "top consumers" list. Scaled with the collector interval (read from
|
||||
// settings) so it stays meaningful even when sampling is sparse.
|
||||
const topConsumerMinWindow = 2 * time.Minute
|
||||
|
||||
// TopContainerSample augments a stats sample with the human-readable owner
|
||||
// name so the UI can show "project/stage" or the static-site name without an
|
||||
// extra round-trip per row.
|
||||
type TopContainerSample struct {
|
||||
store.ContainerStatsSample // embedded sample; its fields are promoted onto this type
|
||||
OwnerName string `json:"owner_name"` // resolved owner display name, serialized as "owner_name"
|
||||
}
|
||||
|
||||
const (
|
||||
// defaultHistoryWindow is used when no ?window= param is provided or the
|
||||
// value fails to parse. Matches the default retention so the "last 2h"
|
||||
@@ -175,11 +190,11 @@ func (s *Server) streamStaticSiteLogs(w http.ResponseWriter, r *http.Request) {
|
||||
s.streamLogsForContainer(w, r, site.ContainerID)
|
||||
}
|
||||
|
||||
// listTopContainersByCPU handles GET /api/system/stats/top?limit=5&by=cpu.
|
||||
// listTopContainers handles GET /api/system/stats/top?limit=5&by=cpu.
|
||||
// Returns the top-N most recent samples across containers, sorted by CPU or
|
||||
// memory. Useful for a system dashboard "top consumers" widget without
|
||||
// requiring the frontend to aggregate per-container history on its own.
|
||||
func (s *Server) listTopContainersByCPU(w http.ResponseWriter, r *http.Request) {
|
||||
// memory. Container IDs are stripped for non-admins so a low-privilege viewer
|
||||
// cannot enumerate workloads outside their scope.
|
||||
func (s *Server) listTopContainers(w http.ResponseWriter, r *http.Request) {
|
||||
limit := 5
|
||||
if raw := r.URL.Query().Get("limit"); raw != "" {
|
||||
if n, err := strconv.Atoi(raw); err == nil && n > 0 && n <= 50 {
|
||||
@@ -191,9 +206,16 @@ func (s *Server) listTopContainersByCPU(w http.ResponseWriter, r *http.Request)
|
||||
by = "cpu"
|
||||
}
|
||||
|
||||
// Samples from the last 2 minutes window so "top" reflects near-current
|
||||
// load, not long-dead rows.
|
||||
samples, err := s.store.ListAllRecentContainerStatsSamples(sinceTimestamp(2 * time.Minute))
|
||||
// Samples must be at least as recent as max(2*interval, 2 minutes) so the
|
||||
// list reflects near-current load even when collection is sparse.
|
||||
window := topConsumerMinWindow
|
||||
if settings, err := s.store.GetSettings(); err == nil && settings.StatsIntervalSeconds > 0 {
|
||||
if w := time.Duration(settings.StatsIntervalSeconds*2) * time.Second; w > window {
|
||||
window = w
|
||||
}
|
||||
}
|
||||
|
||||
samples, err := s.store.ListAllRecentContainerStatsSamples(sinceTimestamp(window))
|
||||
if err != nil {
|
||||
slog.Error("failed to list container samples for top", "error", err)
|
||||
respondError(w, http.StatusInternalServerError, "failed to list samples")
|
||||
@@ -213,33 +235,75 @@ func (s *Server) listTopContainersByCPU(w http.ResponseWriter, r *http.Request)
|
||||
top = append(top, sm)
|
||||
}
|
||||
|
||||
// Partial-sort by the requested metric, descending. For small N a simple
|
||||
// insertion-like approach is plenty.
|
||||
sortContainerSamples(top, by)
|
||||
sort.Slice(top, func(i, j int) bool {
|
||||
if by == "memory" {
|
||||
return top[i].MemoryUsage > top[j].MemoryUsage
|
||||
}
|
||||
return top[i].CPUPercent > top[j].CPUPercent
|
||||
})
|
||||
if len(top) > limit {
|
||||
top = top[:limit]
|
||||
}
|
||||
respondJSON(w, http.StatusOK, top)
|
||||
}
|
||||
|
||||
// sortContainerSamples sorts in place by CPU (or memory) descending.
|
||||
// Note: ListContainerStatsSamples with empty ownerID returns no rows — the
|
||||
// caller uses per-owner-type queries and merges; this helper is applied to
|
||||
// the already-merged slice.
|
||||
func sortContainerSamples(s []store.ContainerStatsSample, by string) {
|
||||
// O(n^2) is fine — N is small (bounded by the number of containers).
|
||||
for i := 1; i < len(s); i++ {
|
||||
for j := i; j > 0; j-- {
|
||||
var less bool
|
||||
if by == "memory" {
|
||||
less = s[j].MemoryUsage > s[j-1].MemoryUsage
|
||||
} else {
|
||||
less = s[j].CPUPercent > s[j-1].CPUPercent
|
||||
}
|
||||
if !less {
|
||||
break
|
||||
}
|
||||
s[j-1], s[j] = s[j], s[j-1]
|
||||
// Resolve owner names so the UI can show "project/stage" or the site name
|
||||
// without a per-row round trip.
|
||||
enriched := s.enrichWithOwnerNames(top)
|
||||
|
||||
// Scrub container IDs for non-admins. The owner name is the actionable
|
||||
// identifier; the container ID is a host-level handle that reveals
|
||||
// workload existence to viewers who shouldn't have it.
|
||||
claims, _ := auth.ClaimsFromContext(r.Context())
|
||||
if claims.Role != "admin" {
|
||||
for i := range enriched {
|
||||
enriched[i].ContainerID = ""
|
||||
}
|
||||
}
|
||||
|
||||
respondJSON(w, http.StatusOK, enriched)
|
||||
}
|
||||
|
||||
// enrichWithOwnerNames attaches a human-readable owner name to each sample.
|
||||
// Looks up instances and sites in batch so the cost is independent of the
|
||||
// number of samples (which is at most 'limit').
|
||||
func (s *Server) enrichWithOwnerNames(samples []store.ContainerStatsSample) []TopContainerSample {
|
||||
out := make([]TopContainerSample, len(samples))
|
||||
for i, sm := range samples {
|
||||
out[i] = TopContainerSample{ContainerStatsSample: sm}
|
||||
switch sm.OwnerType {
|
||||
case stats.OwnerTypeInstance:
|
||||
out[i].OwnerName = s.lookupInstanceName(sm.OwnerID)
|
||||
case stats.OwnerTypeSite:
|
||||
out[i].OwnerName = s.lookupSiteName(sm.OwnerID)
|
||||
}
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
// lookupInstanceName returns "project/stage" for an instance, or empty on
|
||||
// any lookup error so a transient miss does not break the response.
|
||||
func (s *Server) lookupInstanceName(instanceID string) string {
|
||||
inst, err := s.store.GetInstanceByID(instanceID)
|
||||
if err != nil {
|
||||
return ""
|
||||
}
|
||||
project, perr := s.store.GetProjectByID(inst.ProjectID)
|
||||
stage, serr := s.store.GetStageByID(inst.StageID)
|
||||
switch {
|
||||
case perr == nil && serr == nil:
|
||||
return project.Name + "/" + stage.Name
|
||||
case perr == nil:
|
||||
return project.Name
|
||||
case serr == nil:
|
||||
return stage.Name
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
// lookupSiteName returns the site's display name or empty on lookup error.
|
||||
func (s *Server) lookupSiteName(siteID string) string {
|
||||
site, err := s.store.GetStaticSiteByID(siteID)
|
||||
if err != nil {
|
||||
return ""
|
||||
}
|
||||
return site.Name
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user