fix: harden security, fix concurrency bugs, and address review findings
Build / build (push) Successful in 11m42s

Security:
- rate limit /api/webhook routes per-IP and cap concurrent site syncs
- global SSE connection cap (256) with new sse_gate
- validate ?tail= and cap JSON log responses at 4 MiB
- strip ANSI/CSI/OSC and control bytes from streamed log lines
- redact webhook secret from request log middleware
- scrub host details from /api/health for non-admin viewers
- drop container_id from /api/system/stats/top for non-admins
- generate webhook secrets via crypto/rand; require >=32 chars on insert
- verify iid path consistency in streamContainerLogs
- LimitReader on site webhook body; reject malformed non-empty bodies

Concurrency / correctness:
- stats collector: Stop() no longer hangs without Start(), semaphore
  acquired in parent loop so ctx cancellation short-circuits the queue,
  in-flight tick cancellable via shared base context, zero-ts guard
- webhook handler: replace fire-and-forget goroutine with WaitGroup-tracked
  workers + Drain() wired into graceful shutdown
- $derived(() => ...) mis-idiom fixed in ContainerStats / InstanceCard /
  ProjectCard (returned function instead of value)
- SystemResourcesCard: rename `window` and `t` locals to avoid shadowing
  globalThis.window and the i18n `t` import

Quality / performance:
- replace O(n^2) insertion sort with sort.Slice in stats top
- runMigrations only swallows duplicate-column / already-exists errors
- PruneStatsSamplesBefore wrapped in a transaction
- collapse N+1 in unusedImageStats / pruneImages to one ListAllInstances
  pass; surface DB errors instead of silently treating them as inactive
- run Docker Info + DiskUsage in parallel via errgroup
- container log SSE emits `: ping` heartbeat every 20 s
- imageMatches case-insensitive on registry host (RFC behaviour)
- log warning on invalid stage tag pattern instead of silent skip
- reject malformed non-empty site webhook payloads

Frontend / i18n:
- shared formatBytes utility replaces three local copies
- statsInterval store drives dynamic "no samples / collection disabled"
  copy across ContainerStats and SystemResourcesCard
- top consumers row now shows owner_name (project/stage or site name)
- drop seven `as any` casts on the Settings type; add cloudflare_api_token
  write-only field
- move "Service status", "Docker daemon", "Docker unreachable",
  "Proxy unreachable", "reachable", and "Docker daemon is not reachable."
  strings into en/ru i18n bundles
This commit is contained in:
2026-05-07 00:56:14 +03:00
parent 05440a5f92
commit a4362b842d
39 changed files with 1249 additions and 213 deletions
+138 -32
View File
@@ -5,15 +5,38 @@ import (
"encoding/json"
"errors"
"fmt"
"io"
"log/slog"
"net/http"
"regexp"
"strconv"
"strings"
"sync"
"time"
"github.com/go-chi/chi/v5"
"github.com/alexei/tinyforge/internal/store"
)
// Limits and constants for the log endpoints.
const (
defaultLogTail = 200
maxLogTail = 5000
maxJSONLogBytes = 4 << 20 // 4 MiB cap for non-streaming log responses
maxLogLineBytes = 1 << 20 // 1 MiB max line length for the bufio.Scanner
logHeartbeatPeriod = 20 * time.Second
)
// ANSI escape sequence patterns. Stripped from streamed log lines so a
// hostile container cannot inject terminal control sequences (cursor moves,
// hyperlink escapes, screen clears) into operator displays or pasted output.
var (
ansiCSIPattern = regexp.MustCompile(`\x1b\[[0-9;?]*[ -/]*[@-~]`)
ansiOSCPattern = regexp.MustCompile(`\x1b\][^\x07\x1b]*(?:\x07|\x1b\\)`)
ctlBytePattern = regexp.MustCompile(`[\x00-\x08\x0b-\x1a\x1c-\x1f\x7f]`)
)
// listProjectImages handles GET /api/projects/{id}/images.
// Returns all local Docker images matching the project's image reference.
func (s *Server) listProjectImages(w http.ResponseWriter, r *http.Request) {
@@ -50,6 +73,8 @@ func (s *Server) listProjectImages(w http.ResponseWriter, r *http.Request) {
// - tail: number of lines from end (default "200")
// - follow: "true" to stream new lines in real-time
func (s *Server) streamContainerLogs(w http.ResponseWriter, r *http.Request) {
projectID := chi.URLParam(r, "id")
stageID := chi.URLParam(r, "stage")
instanceID := chi.URLParam(r, "iid")
inst, err := s.store.GetInstanceByID(instanceID)
@@ -63,6 +88,14 @@ func (s *Server) streamContainerLogs(w http.ResponseWriter, r *http.Request) {
return
}
// Verify the instance actually belongs to the project/stage in the path.
// Without this, a user could stream logs for any instance ID by guessing
// it under the wrong project — defence-in-depth for future per-project ACLs.
if inst.ProjectID != projectID || inst.StageID != stageID {
respondNotFound(w, "instance")
return
}
if inst.ContainerID == "" {
respondError(w, http.StatusBadRequest, "instance has no container")
return
@@ -80,10 +113,7 @@ func (s *Server) streamLogsForContainer(w http.ResponseWriter, r *http.Request,
return
}
tail := r.URL.Query().Get("tail")
if tail == "" {
tail = "200"
}
tail := parseTailParam(r.URL.Query().Get("tail"))
follow := r.URL.Query().Get("follow") == "true"
// Check if client accepts SSE.
@@ -99,8 +129,10 @@ func (s *Server) streamLogsForContainer(w http.ResponseWriter, r *http.Request,
defer logReader.Close()
if !isSSE {
// JSON mode: read all lines and return as array.
scanner := bufio.NewScanner(logReader)
// JSON mode: cap the total bytes read so a chatty container with
// tail=large cannot exhaust server memory.
scanner := bufio.NewScanner(io.LimitReader(logReader, maxJSONLogBytes))
scanner.Buffer(make([]byte, 0, 64*1024), maxLogLineBytes)
var lines []string
for scanner.Scan() {
line := sanitizeDockerLogLine(scanner.Text())
@@ -116,6 +148,12 @@ func (s *Server) streamLogsForContainer(w http.ResponseWriter, r *http.Request,
}
// SSE mode: stream lines as they arrive.
release, ok := acquireSSESlot(w, s.sseGate)
if !ok {
return
}
defer release()
flusher, ok := w.(http.Flusher)
if !ok {
respondError(w, http.StatusInternalServerError, "streaming not supported")
@@ -126,7 +164,31 @@ func (s *Server) streamLogsForContainer(w http.ResponseWriter, r *http.Request,
w.Header().Set("Cache-Control", "no-cache")
w.Header().Set("Connection", "keep-alive")
// Heartbeat keeps the connection warm through proxies that close idle
// streams. Sent as an SSE comment which the EventSource API ignores.
heartbeat := time.NewTicker(logHeartbeatPeriod)
defer heartbeat.Stop()
heartbeatDone := make(chan struct{})
defer close(heartbeatDone)
var hbMu sync.Mutex
go func() {
for {
select {
case <-heartbeat.C:
hbMu.Lock()
_, _ = io.WriteString(w, ": ping\n\n")
flusher.Flush()
hbMu.Unlock()
case <-heartbeatDone:
return
case <-r.Context().Done():
return
}
}
}()
scanner := bufio.NewScanner(logReader)
scanner.Buffer(make([]byte, 0, 64*1024), maxLogLineBytes)
for scanner.Scan() {
line := sanitizeDockerLogLine(scanner.Text())
if line == "" {
@@ -134,8 +196,10 @@ func (s *Server) streamLogsForContainer(w http.ResponseWriter, r *http.Request,
}
data, _ := json.Marshal(map[string]string{"line": line})
hbMu.Lock()
fmt.Fprintf(w, "data: %s\n\n", data)
flusher.Flush()
hbMu.Unlock()
// Check if client disconnected.
select {
@@ -146,17 +210,67 @@ func (s *Server) streamLogsForContainer(w http.ResponseWriter, r *http.Request,
}
}
// parseTailParam validates and clamps the ?tail= query value. Empty/invalid
// inputs fall back to the default; values above the cap are clamped down.
// "all" is rejected — letting the caller request unbounded log history is a
// trivial DoS vector.
func parseTailParam(raw string) string {
if raw == "" {
return strconv.Itoa(defaultLogTail)
}
n, err := strconv.Atoi(raw)
if err != nil || n <= 0 {
return strconv.Itoa(defaultLogTail)
}
if n > maxLogTail {
n = maxLogTail
}
return strconv.Itoa(n)
}
// sanitizeDockerLogLine strips the Docker log stream header (8-byte prefix)
// that Docker adds to non-TTY container logs.
// that Docker adds to non-TTY container logs, and removes terminal control
// sequences so a hostile container cannot inject ANSI escapes that hijack an
// operator's terminal when log output is pasted or rendered raw.
func sanitizeDockerLogLine(line string) string {
// Docker multiplexed stream: first 8 bytes are header (stream type + size).
// If the line starts with a non-printable byte followed by 0x00 0x00 0x00, strip 8 bytes.
if len(line) > 8 && (line[0] == 1 || line[0] == 2) && line[1] == 0 && line[2] == 0 && line[3] == 0 {
return line[8:]
line = line[8:]
}
line = ansiOSCPattern.ReplaceAllString(line, "")
line = ansiCSIPattern.ReplaceAllString(line, "")
line = ctlBytePattern.ReplaceAllString(line, "")
return line
}
// buildActiveImagesSet returns the set of "image:tag" strings currently used
// by any instance, computed in a single DB pass instead of N×K queries.
// Returning an error (rather than swallowing) prevents prune logic from
// treating a transient DB failure as "nothing is active".
func buildActiveImagesSet(st *store.Store, projects []store.Project) (map[string]bool, error) {
imageByProject := make(map[string]string, len(projects))
for _, p := range projects {
imageByProject[p.ID] = p.Image
}
instances, err := st.ListAllInstances()
if err != nil {
return nil, fmt.Errorf("list instances: %w", err)
}
active := make(map[string]bool, len(instances))
for _, inst := range instances {
if inst.ImageTag == "" {
continue
}
image := imageByProject[inst.ProjectID]
if image == "" {
continue
}
active[image+":"+inst.ImageTag] = true
}
return active, nil
}
// unusedImageStats handles GET /api/docker/unused-images.
// Returns the total size of unused project images and whether the threshold is exceeded.
func (s *Server) unusedImageStats(w http.ResponseWriter, r *http.Request) {
@@ -181,18 +295,14 @@ func (s *Server) unusedImageStats(w http.ResponseWriter, r *http.Request) {
return
}
// Build set of active image refs.
activeImages := make(map[string]bool)
for _, p := range projects {
stages, _ := s.store.GetStagesByProjectID(p.ID)
for _, st := range stages {
instances, _ := s.store.GetInstancesByStageID(st.ID)
for _, inst := range instances {
if inst.ImageTag != "" {
activeImages[p.Image+":"+inst.ImageTag] = true
}
}
}
// Build set of active image refs in one DB pass instead of N×K queries.
// A flaky read here previously masqueraded as "no images are active",
// which on the prune endpoint would have deleted *running* images.
activeImages, err := buildActiveImagesSet(s.store, projects)
if err != nil {
slog.Error("unused images: build active set", "error", err)
respondError(w, http.StatusInternalServerError, "internal server error")
return
}
// Sum unused image sizes.
@@ -242,18 +352,14 @@ func (s *Server) pruneImages(w http.ResponseWriter, r *http.Request) {
return
}
// Build a set of image refs used by active instances.
activeImages := make(map[string]bool)
for _, p := range projects {
stages, _ := s.store.GetStagesByProjectID(p.ID)
for _, st := range stages {
instances, _ := s.store.GetInstancesByStageID(st.ID)
for _, inst := range instances {
if inst.ImageTag != "" {
activeImages[p.Image+":"+inst.ImageTag] = true
}
}
}
// Build a set of image refs used by active instances. Bail out on error
// — silently treating a DB blip as "no active images" would prune
// images currently in use by running containers.
activeImages, err := buildActiveImagesSet(s.store, projects)
if err != nil {
slog.Error("prune: build active set", "error", err)
respondError(w, http.StatusInternalServerError, "internal server error")
return
}
// Collect all unique image bases from projects (without tags).