Files
tiny-forge/internal/api/health.go
T
alexei.dolgolyov a4362b842d
Build / build (push) Successful in 11m42s
fix: harden security, fix concurrency bugs, and address review findings
Security:
- rate limit /api/webhook routes per-IP and cap concurrent site syncs
- global SSE connection cap (256) with new sse_gate
- validate ?tail= and cap JSON log responses at 4 MiB
- strip ANSI/CSI/OSC and control bytes from streamed log lines
- redact webhook secret from request log middleware
- scrub host details from /api/health for non-admin viewers
- drop container_id from /api/system/stats/top for non-admins
- generate webhook secrets via crypto/rand; require >=32 chars on insert
- verify iid path consistency in streamContainerLogs
- LimitReader on site webhook body; reject malformed non-empty bodies

Concurrency / correctness:
- stats collector: Stop() no longer hangs without Start(), semaphore
  acquired in parent loop so ctx cancellation short-circuits the queue,
  in-flight tick cancellable via shared base context, zero-ts guard
- webhook handler: replace fire-and-forget goroutine with WaitGroup-tracked
  workers + Drain() wired into graceful shutdown
- $derived(() => ...) mis-idiom fixed in ContainerStats / InstanceCard /
  ProjectCard (returned function instead of value)
- SystemResourcesCard: rename `window` and `t` locals to avoid shadowing
  globalThis.window and the i18n `t` import

Quality / performance:
- replace O(n^2) insertion sort with sort.Slice in stats top
- runMigrations only swallows duplicate-column / already-exists errors
- PruneStatsSamplesBefore wrapped in a transaction
- collapse N+1 in unusedImageStats / pruneImages to one ListAllInstances
  pass; surface DB errors instead of silently treating them as inactive
- run Docker Info + DiskUsage in parallel via errgroup
- container log SSE emits `: ping` heartbeat every 20 s
- imageMatches case-insensitive on registry host (RFC behaviour)
- log warning on invalid stage tag pattern instead of silent skip
- reject malformed non-empty site webhook payloads

Frontend / i18n:
- shared formatBytes utility replaces three local copies
- statsInterval store drives dynamic "no samples / collection disabled"
  copy across ContainerStats and SystemResourcesCard
- top consumers row now shows owner_name (project/stage or site name)
- drop seven `as any` casts on the Settings type; add cloudflare_api_token
  write-only field
- move "Service status", "Docker daemon", "Docker unreachable",
  "Proxy unreachable", "reachable", and "Docker daemon is not reachable."
  strings into en/ru i18n bundles
2026-05-07 00:56:14 +03:00

256 lines
7.6 KiB
Go

package api
import (
"context"
"net/http"
"time"
"github.com/alexei/tinyforge/internal/auth"
"github.com/alexei/tinyforge/internal/proxy"
)
// healthProbeTimeout bounds the context that getHealth derives from the
// incoming request, so a stuck dependency does not hold the polling endpoint
// open. The budget is shared by every probe made during one request (the
// database ping, the Docker ping + Info, the proxy ping and the NPM list
// calls); it is not granted to each probe separately. The UI polls every
// 30 s, so 8 s leaves headroom between polls.
const healthProbeTimeout = 8 * time.Second
// nonAdminDockerFields is the allow-list of Docker health keys that any
// authenticated user may see. getHealth passes it to filterFields when the
// caller is not an admin. Every key that dockerHealth can emit and that is
// not listed here (os, arch, kernel, operating_system, storage_driver,
// root_dir, name) is dropped for those callers to avoid recon information
// leaks.
var nonAdminDockerFields = map[string]bool{
	// Connectivity of the daemon probe.
	"connected":  true,
	"latency_ms": true,
	"error":      true,

	// Daemon and API versions.
	"version":     true,
	"api_version": true,

	// Container and image counts.
	"containers": true,
	"running":    true,
	"paused":     true,
	"stopped":    true,
	"images":     true,

	// Host capacity figures.
	"ncpu":         true,
	"memory_total": true,
}
// nonAdminProxyFields is the allow-list of proxy health keys that are safe
// to share with non-admin users. getHealth passes it to filterFields, which
// drops everything else proxyHealth may attach: the configured provider URL
// ("url") and the NPM aggregate counts ("proxy_hosts", "access_lists",
// "certificates").
var nonAdminProxyFields = map[string]bool{
	// Which provider is active and whether it answered.
	"provider":   true,
	"connected":  true,
	"latency_ms": true,
	"error":      true,

	// Count of routes deployed by Tinyforge itself.
	"proxy_hosts_managed": true,
}
// getHealth handles GET /api/health.
//
// It responds with a JSON object containing:
//   - "checked_at": the UTC time of the check, RFC 3339 formatted;
//   - "database":   the result of pinging the store's database;
//   - "docker":     the Docker daemon report built by dockerHealth;
//   - "proxy":      the proxy report built by proxyHealth, present only when
//     a proxy provider is configured.
//
// All probes share one context limited by healthProbeTimeout. The handler
// always answers 200 OK; failures are reported inside the individual
// sections through their "connected" and "error" fields.
//
// For callers whose role is not "admin" the docker and proxy sections are
// reduced to the allow-listed keys, and any "error" text is replaced by a
// generic message. The raw error strings come straight from the Docker and
// proxy clients and may embed socket paths or internal URLs — the very
// details the field filtering is meant to keep away from read-only viewers.
func (s *Server) getHealth(w http.ResponseWriter, r *http.Request) {
	ctx, cancel := context.WithTimeout(r.Context(), healthProbeTimeout)
	defer cancel()

	// The second return value is deliberately ignored: anything other than an
	// explicit "admin" role is treated as non-admin below.
	// NOTE(review): this presumes ClaimsFromContext never yields a nil pointer
	// on this route (i.e. the auth middleware always runs first) — confirm.
	claims, _ := auth.ClaimsFromContext(r.Context())
	isAdmin := claims.Role == "admin"

	result := map[string]any{
		"checked_at": time.Now().UTC().Format(time.RFC3339),
	}

	// ── Database ─────────────────────────────────────────────────────
	// The driver error is never exposed; every viewer gets the same text.
	if err := s.store.DB().PingContext(ctx); err != nil {
		result["database"] = map[string]any{"connected": false, "error": "database unreachable"}
	} else {
		result["database"] = map[string]any{"connected": true}
	}

	// ── Docker daemon ────────────────────────────────────────────────
	docker := s.dockerHealth(ctx)
	if !isAdmin {
		docker = filterFields(docker, nonAdminDockerFields)
		// Keep the failure signal but drop the client's raw error text.
		if _, failed := docker["error"]; failed {
			docker["error"] = "docker unreachable"
		}
	}
	result["docker"] = docker

	// ── Proxy provider ───────────────────────────────────────────────
	if s.proxyProvider != nil {
		proxyInfo := s.proxyHealth(ctx)
		if !isAdmin {
			proxyInfo = filterFields(proxyInfo, nonAdminProxyFields)
			// Same redaction as for Docker: the provider name is already
			// visible to non-admins, the underlying error detail is not.
			if _, failed := proxyInfo["error"]; failed {
				proxyInfo["error"] = s.proxyProvider.Name() + " unreachable"
			}
		}
		result["proxy"] = proxyInfo
	}

	respondJSON(w, http.StatusOK, result)
}
// filterFields builds a new map holding only those entries of m whose key is
// marked true in allow. Neither argument is modified, values are copied
// shallowly, and the result is never nil (it is empty when nothing matches).
func filterFields(m map[string]any, allow map[string]bool) map[string]any {
	kept := make(map[string]any, len(allow))
	for key, permitted := range allow {
		if !permitted {
			continue
		}
		if val, present := m[key]; present {
			kept[key] = val
		}
	}
	return kept
}
// dockerHealth builds the "docker" section of the health payload.
//
// Without a Docker client, or when the ping fails, the result carries
// connected=false and an "error" string (plus "latency_ms" when a ping was
// attempted). When the ping succeeds the result carries connected=true and
// "latency_ms", enriched with the daemon details returned by Info. An Info
// failure is not treated as an error: connected stays true and the detail
// keys are simply left out.
func (s *Server) dockerHealth(ctx context.Context) map[string]any {
	if s.docker == nil {
		return map[string]any{
			"connected": false,
			"error":     "docker client not initialized",
		}
	}

	// Time the ping once; both outcomes report the same measurement.
	pingStart := time.Now()
	pingErr := s.docker.Ping(ctx)
	latencyMs := time.Since(pingStart).Milliseconds()
	if pingErr != nil {
		return map[string]any{
			"connected":  false,
			"error":      pingErr.Error(),
			"latency_ms": latencyMs,
		}
	}

	report := map[string]any{
		"connected":  true,
		"latency_ms": latencyMs,
	}

	// Everything below is optional enrichment on top of a successful ping.
	daemon, infoErr := s.docker.Info(ctx)
	if infoErr != nil {
		return report
	}

	// Counters are always reported, zero included.
	report["containers"] = daemon.Containers
	report["running"] = daemon.Running
	report["paused"] = daemon.Paused
	report["stopped"] = daemon.Stopped
	report["images"] = daemon.Images

	// Capacity figures are reported only when positive.
	if daemon.NCPU > 0 {
		report["ncpu"] = daemon.NCPU
	}
	if daemon.MemoryTotal > 0 {
		report["memory_total"] = daemon.MemoryTotal
	}

	// Descriptive fields are reported only when non-empty.
	if daemon.Version != "" {
		report["version"] = daemon.Version
	}
	if daemon.APIVersion != "" {
		report["api_version"] = daemon.APIVersion
	}
	if daemon.Name != "" {
		report["name"] = daemon.Name
	}
	if daemon.OS != "" {
		report["os"] = daemon.OS
	}
	if daemon.OperatingSystem != "" {
		report["operating_system"] = daemon.OperatingSystem
	}
	if daemon.Arch != "" {
		report["arch"] = daemon.Arch
	}
	if daemon.Kernel != "" {
		report["kernel"] = daemon.Kernel
	}
	if daemon.StorageDriver != "" {
		report["storage_driver"] = daemon.StorageDriver
	}
	if daemon.RootDir != "" {
		report["root_dir"] = daemon.RootDir
	}

	return report
}
// proxyHealth builds the "proxy" section of the health payload for the
// configured proxy provider. The caller must ensure s.proxyProvider is set.
//
// When the provider's ping fails the result holds "provider",
// connected=false, an "error" string and "latency_ms". When it succeeds the
// result holds "provider", connected=true and "latency_ms", plus these
// best-effort extras, each silently skipped if its lookup fails:
//   - "url": the configured NPM or Traefik API URL from settings;
//   - "proxy_hosts", "access_lists", "certificates": NPM-only counts;
//   - "proxy_hosts_managed": the number of routes Tinyforge manages.
func (s *Server) proxyHealth(ctx context.Context) map[string]any {
	name := s.proxyProvider.Name()

	began := time.Now()
	pingErr := s.proxyProvider.Ping(ctx)
	elapsedMs := time.Since(began).Milliseconds()

	if pingErr != nil {
		return map[string]any{
			"provider":   name,
			"connected":  false,
			"error":      name + " unreachable: " + pingErr.Error(),
			"latency_ms": elapsedMs,
		}
	}

	report := map[string]any{
		"provider":   name,
		"connected":  true,
		"latency_ms": elapsedMs,
	}

	// Configured endpoint for the active provider, when one is set.
	if cfg, cfgErr := s.store.GetSettings(); cfgErr == nil {
		if name == "npm" && cfg.NpmURL != "" {
			report["url"] = cfg.NpmURL
		} else if name == "traefik" && cfg.TraefikAPIURL != "" {
			report["url"] = cfg.TraefikAPIURL
		}
	}

	// NPM-only aggregates. The list calls need an authenticated NPM session,
	// so the provider's auth step runs first (described upstream as cheap: a
	// cached JWT is reused for 1h). Any failure just leaves that count out.
	if name == "npm" && s.npm != nil {
		npmProvider, isNpm := s.proxyProvider.(*proxy.NpmProvider)
		if isNpm && npmProvider.Authenticate(ctx) == nil {
			if hosts, err := s.npm.ListProxyHosts(ctx); err == nil {
				report["proxy_hosts"] = len(hosts)
			}
			if lists, err := s.npm.ListAccessLists(ctx); err == nil {
				report["access_lists"] = len(lists)
			}
			if certs, err := s.npm.ListCertificates(ctx); err == nil {
				report["certificates"] = len(certs)
			}
		}
	}

	// Routes deployed by Tinyforge itself (Docker instances and static
	// sites). This is read from our own store rather than the external proxy
	// API, so it is attempted for every provider.
	if managed, err := s.managedRouteCount(); err == nil {
		report["proxy_hosts_managed"] = managed
	}

	return report
}
// managedRouteCount reports how many proxy routes Tinyforge manages: the
// routes of Docker instances plus the routes of static sites. Both store
// lookups receive an empty domain; per the original note the domain does not
// influence the count and an empty value skips FQDN rendering. The first
// store error aborts the count and is returned with a zero total.
func (s *Server) managedRouteCount() (int, error) {
	instances, err := s.store.ListProxyRoutes("")
	if err != nil {
		return 0, err
	}
	total := len(instances)

	sites, err := s.store.ListStaticSiteProxyRoutes("")
	if err != nil {
		return 0, err
	}
	total += len(sites)

	return total, nil
}