feat(stats): resource metrics dashboard + sites logs/stats

Background collector samples CPU/memory/network/block I/O for every instance and site on a configurable interval (default 15s, range 5-300s), persists samples to SQLite with a configurable retention window (default 2h, range 0-24h), and skips ticks gracefully when the Docker daemon is unreachable. Settings are reloadable without a restart — each tick re-reads them. New API endpoints: - GET /api/system/stats (host snapshot: info + df) - GET /api/system/stats/history - GET /api/system/stats/top?by=cpu|memory - GET /api/projects/{id}/stages/{s}/instances/{iid}/stats/history - GET /api/sites/{id}/stats[/history] - GET /api/sites/{id}/logs (SSE + JSON, reuses instance log streamer) Frontend: - ECharts added with tree-shaken imports (~180KB gzip) for future-proof time-series/gantt/graph visualizations - CollapsibleSection wraps all dashboard sections (system health, daemons, system resources, static sites, projects) with localStorage-persisted open state - SystemResourcesCard shows capacity tiles, workload utilization chart with 30m/2h/6h/24h window picker, disk breakdown with reclaimable callouts, and top 5 consumers - ContainerStats and ContainerLogs take a source discriminated union so sites reuse the same components as instances; sites detail page embeds both for Deno backend debugging - Settings › Maintenance exposes collection interval + retention - Docker-unavailable state returns 503 and renders an amber banner instead of a generic 500 Full i18n coverage (en + ru) for all new strings.
2026-04-24 15:02:43 +03:00
parent 0632f512e6
commit 05440a5f92
27 changed files with 1897 additions and 112 deletions
@@ -0,0 +1,309 @@
+// Package stats implements a background goroutine that periodically samples
+// Docker container and host-level resource usage and persists the samples
+// into SQLite. It reads its interval and retention from settings on every
+// tick so configuration changes take effect without a restart.
+package stats
+
+import (
+	"context"
+	"log/slog"
+	"sync"
+	"time"
+
+	"github.com/alexei/tinyforge/internal/docker"
+	"github.com/alexei/tinyforge/internal/store"
+)
+
+// Collector tunables.
+//
+// DefaultIntervalSeconds and DefaultRetentionHours are the values readConfig
+// returns when settings cannot be loaded. MinIntervalSeconds and
+// MaxIntervalSeconds are the bounds readConfig clamps a positive configured
+// interval into. A negative interval or retention is not replaced by a
+// default: readConfig reports it as (0, 0), which run treats as "collection
+// disabled".
+const (
+	DefaultIntervalSeconds = 15
+	DefaultRetentionHours  = 2
+	MinIntervalSeconds     = 5
+	MaxIntervalSeconds     = 300
+	// Hard cap on parallel container stat requests to avoid overwhelming
+	// the Docker daemon when the user has many containers. Used as the
+	// capacity of the semaphore channel in sampleAll.
+	maxParallelSamples = 8
+)
+
+// OwnerType values for ContainerStatsSample.OwnerType.
+//
+// buildTargets tags containers that come from the store's instance list with
+// OwnerTypeInstance and containers that come from the static-site list with
+// OwnerTypeSite; sampleAll copies the tag onto every sample it produces.
+const (
+	OwnerTypeInstance = "instance"
+	OwnerTypeSite     = "site"
+)
+
+// Collector runs the background sampling loop.
+//
+// Lifecycle: create with New, call Start to launch the loop and Stop to shut
+// it down. Start and Stop may each be called more than once and in either
+// order; only the first Start launches a goroutine, and Stop never blocks on
+// a loop that was never started.
+type Collector struct {
+	store  *store.Store
+	docker *docker.Client
+
+	// startOnce is consumed by whichever of Start or Stop gets there first.
+	// Start uses it to launch run exactly once (a second run goroutine would
+	// close done twice and panic). Stop uses it to close done itself when the
+	// loop was never started, so waiting on done cannot hang.
+	startOnce sync.Once
+
+	// stopOnce guards the close of stop so repeated Stop calls are safe.
+	stopOnce sync.Once
+
+	// stop is closed to ask the loop to exit.
+	stop chan struct{}
+
+	// done is closed when the loop has exited (or by Stop when the loop was
+	// never started).
+	done chan struct{}
+}
+
+// New creates a new stats collector that reads settings and persists samples
+// through s and queries Docker through d. Call Start to begin sampling.
+func New(s *store.Store, d *docker.Client) *Collector {
+	return &Collector{
+		store:  s,
+		docker: d,
+		stop:   make(chan struct{}),
+		done:   make(chan struct{}),
+	}
+}
+
+// Start launches the background loop. Returns immediately. The loop exits
+// when Stop is called. Calling Start again, or calling it after Stop, is a
+// no-op.
+func (c *Collector) Start() {
+	c.startOnce.Do(func() { go c.run() })
+}
+
+// Stop signals the collector to exit and blocks until it has finished the
+// in-flight tick. If Start was never called, Stop returns immediately and
+// any later Start becomes a no-op.
+func (c *Collector) Stop() {
+	c.stopOnce.Do(func() { close(c.stop) })
+	// When the loop was never launched nobody else will close done; claim
+	// the start slot and close it here so the receive below cannot block
+	// forever. If Start already ran this Do is a no-op and run closes done.
+	c.startOnce.Do(func() { close(c.done) })
+	<-c.done
+}
+
+// run drives the collector until stop is closed, then closes done.
+//
+// After a short start-up delay it repeatedly loads the configuration, takes
+// one sample pass when collection is enabled, and sleeps. Because the
+// configuration is loaded at the top of every iteration, a settings change is
+// picked up on the next pass without any explicit reload signal.
+func (c *Collector) run() {
+	defer close(c.done)
+
+	// sleep blocks for d and reports true, or reports false as soon as the
+	// collector is asked to stop.
+	sleep := func(d time.Duration) bool {
+		select {
+		case <-time.After(d):
+			return true
+		case <-c.stop:
+			return false
+		}
+	}
+
+	// Give the application a few seconds to settle before the first sample.
+	if !sleep(3 * time.Second) {
+		return
+	}
+
+	for {
+		intervalSec, retentionHrs := c.readConfig()
+
+		// A zero interval or retention means collection is disabled; in that
+		// case just re-check the settings once a minute.
+		pause := time.Minute
+		if intervalSec != 0 && retentionHrs != 0 {
+			c.tick(retentionHrs)
+			pause = time.Duration(intervalSec) * time.Second
+		}
+
+		if !sleep(pause) {
+			return
+		}
+	}
+}
+
+// readConfig reads the current interval + retention from settings, applying
+// defaults and clamping to the valid range.
+func (c *Collector) readConfig() (intervalSeconds, retentionHours int) {
+	settings, err := c.store.GetSettings()
+	if err != nil {
+		slog.Warn("stats collector: failed to read settings — using defaults", "error", err)
+		return DefaultIntervalSeconds, DefaultRetentionHours
+	}
+	intervalSeconds = settings.StatsIntervalSeconds
+	retentionHours = settings.StatsRetentionHours
+	if intervalSeconds < 0 || retentionHours < 0 {
+		return 0, 0
+	}
+	if intervalSeconds > 0 && intervalSeconds < MinIntervalSeconds {
+		intervalSeconds = MinIntervalSeconds
+	}
+	if intervalSeconds > MaxIntervalSeconds {
+		intervalSeconds = MaxIntervalSeconds
+	}
+	return intervalSeconds, retentionHours
+}
+
+// tick performs one sampling pass under a 30-second overall deadline.
+//
+// It first pings Docker (2-second budget); if the ping fails the pass is
+// abandoned with a single debug log and nothing is written or pruned.
+// Otherwise every target from buildTargets is sampled, each sample is
+// inserted into the store, and the CPU/memory totals and count of the
+// successfully inserted samples are handed to recordSystemSample. Rows older
+// than retentionHours are pruned at the end. With no targets the system
+// sample is still recorded, with zero workload totals, so host history is
+// not left empty.
+func (c *Collector) tick(retentionHours int) {
+	tickCtx, cancelTick := context.WithTimeout(context.Background(), 30*time.Second)
+	defer cancelTick()
+
+	probeCtx, cancelProbe := context.WithTimeout(tickCtx, 2*time.Second)
+	defer cancelProbe()
+	pingErr := c.docker.Ping(probeCtx)
+	if pingErr != nil {
+		slog.Debug("stats collector: docker unreachable, skipping tick", "error", pingErr)
+		return
+	}
+
+	var (
+		cpuSum float64
+		memSum int64
+		stored int
+	)
+	if targets := c.buildTargets(); len(targets) > 0 {
+		for _, sample := range c.sampleAll(tickCtx, targets) {
+			insertErr := c.store.InsertContainerStatsSample(sample)
+			if insertErr != nil {
+				// A sample that could not be persisted is left out of the
+				// workload totals as well.
+				slog.Warn("stats collector: insert container sample",
+					"container", sample.ContainerID, "error", insertErr)
+				continue
+			}
+			cpuSum += sample.CPUPercent
+			memSum += sample.MemoryUsage
+			stored++
+		}
+	}
+
+	c.recordSystemSample(tickCtx, cpuSum, memSum, stored)
+	c.prune(retentionHours)
+}
+
+// target describes a single container to sample.
+//
+// ContainerID is the ID passed to the Docker stats call in sampleAll.
+// OwnerType is OwnerTypeInstance or OwnerTypeSite, and OwnerID is the ID of
+// the instance or site record the container was found on; both are copied
+// verbatim onto the resulting sample.
+type target struct {
+	ContainerID string
+	OwnerType   string
+	OwnerID     string
+}
+
+// buildTargets collects one target for every instance and every static site
+// in the store that has a non-empty container ID. That is the only filter
+// applied here. A failure to list instances or sites is logged as a warning
+// and that source is skipped, so the other source is still returned. The
+// result is nil when nothing qualifies.
+func (c *Collector) buildTargets() []target {
+	var targets []target
+
+	// add appends a target unless the owner has no container attached.
+	add := func(ownerType, ownerID, containerID string) {
+		if containerID != "" {
+			targets = append(targets, target{
+				ContainerID: containerID,
+				OwnerType:   ownerType,
+				OwnerID:     ownerID,
+			})
+		}
+	}
+
+	if instances, err := c.store.ListAllInstances(); err == nil {
+		for _, inst := range instances {
+			add(OwnerTypeInstance, inst.ID, inst.ContainerID)
+		}
+	} else {
+		slog.Warn("stats collector: list instances", "error", err)
+	}
+
+	if sites, err := c.store.GetAllStaticSites(); err == nil {
+		for _, site := range sites {
+			add(OwnerTypeSite, site.ID, site.ContainerID)
+		}
+	} else {
+		slog.Warn("stats collector: list sites", "error", err)
+	}
+
+	return targets
+}
+
+// sampleAll queries Docker stats for every target and returns the samples
+// that succeeded, in target order.
+//
+// One goroutine is started per target, but at most maxParallelSamples of
+// them talk to Docker at the same time. Each request gets its own 10-second
+// timeout derived from ctx. A target whose stats call fails is logged at
+// debug level and left out of the result, so one missing container does not
+// spoil the whole pass.
+func (c *Collector) sampleAll(ctx context.Context, targets []target) []store.ContainerStatsSample {
+	// slot is one worker's private output cell; ok stays false on failure.
+	type slot struct {
+		sample store.ContainerStatsSample
+		ok     bool
+	}
+
+	slots := make([]slot, len(targets))
+	limiter := make(chan struct{}, maxParallelSamples)
+
+	var pending sync.WaitGroup
+	pending.Add(len(targets))
+	for idx := range targets {
+		go func(dst *slot, tgt target) {
+			defer pending.Done()
+
+			// Take a concurrency token; it is handed back on return.
+			limiter <- struct{}{}
+			defer func() { <-limiter }()
+
+			reqCtx, release := context.WithTimeout(ctx, 10*time.Second)
+			defer release()
+
+			st, err := c.docker.GetContainerStats(reqCtx, tgt.ContainerID)
+			if err != nil {
+				slog.Debug("stats collector: get container stats",
+					"container", tgt.ContainerID, "owner_type", tgt.OwnerType, "error", err)
+				return
+			}
+
+			// Fall back to the local clock when Docker's timestamp is not a
+			// positive Unix time.
+			when := st.Timestamp.Unix()
+			if when <= 0 {
+				when = time.Now().UTC().Unix()
+			}
+
+			dst.sample = store.ContainerStatsSample{
+				ContainerID:     tgt.ContainerID,
+				OwnerType:       tgt.OwnerType,
+				OwnerID:         tgt.OwnerID,
+				TS:              when,
+				CPUPercent:      st.CPUPercent,
+				MemoryUsage:     st.MemoryUsage,
+				MemoryLimit:     st.MemoryLimit,
+				NetworkRxBytes:  st.NetworkRxBytes,
+				NetworkTxBytes:  st.NetworkTxBytes,
+				BlockReadBytes:  st.BlockReadBytes,
+				BlockWriteBytes: st.BlockWriteBytes,
+			}
+			dst.ok = true
+		}(&slots[idx], targets[idx])
+	}
+	pending.Wait()
+
+	collected := make([]store.ContainerStatsSample, 0, len(targets))
+	for i := range slots {
+		if slots[i].ok {
+			collected = append(collected, slots[i].sample)
+		}
+	}
+	return collected
+}
+
+// recordSystemSample asks Docker for host-level stats and persists one
+// combined system sample.
+//
+// workloadCPU and workloadMem are the totals over the container samples
+// stored during this tick, and running is how many such samples there were.
+// When running is zero but Docker itself reports running containers, the
+// Docker-reported count is stored instead. Failures to fetch or insert are
+// logged as warnings and do not propagate.
+func (c *Collector) recordSystemSample(ctx context.Context, workloadCPU float64, workloadMem int64, running int) {
+	sysStats, err := c.docker.GetSystemStats(ctx)
+	if err != nil {
+		slog.Warn("stats collector: get system stats", "error", err)
+		return
+	}
+
+	// Same fallback sampleAll applies to container samples: a zero or
+	// otherwise non-positive timestamp would give the row a TS that sits
+	// before any prune cutoff, so use the local clock instead.
+	ts := sysStats.Timestamp.Unix()
+	if ts <= 0 {
+		ts = time.Now().UTC().Unix()
+	}
+
+	sample := store.SystemStatsSample{
+		TS:                 ts,
+		NCPU:               sysStats.NCPU,
+		MemoryTotal:        sysStats.MemoryTotal,
+		WorkloadCPUPercent: workloadCPU,
+		WorkloadMemUsage:   workloadMem,
+		ContainersRunning:  running,
+		DiskTotalBytes:     sysStats.DiskTotalBytes,
+	}
+	// Prefer the Docker-reported running count when we have no running samples
+	// (e.g., very first tick may race with container readiness).
+	if running == 0 && sysStats.Running > 0 {
+		sample.ContainersRunning = sysStats.Running
+	}
+	if err := c.store.InsertSystemStatsSample(sample); err != nil {
+		slog.Warn("stats collector: insert system sample", "error", err)
+	}
+}
+
+// prune asks the store to delete stats samples whose timestamp is before
+// "now minus retentionHours". A non-positive retentionHours makes it a no-op.
+// A store error is logged as a warning and otherwise ignored.
+func (c *Collector) prune(retentionHours int) {
+	if retentionHours > 0 {
+		window := time.Duration(retentionHours) * time.Hour
+		cutoff := time.Now().UTC().Add(-window).Unix()
+
+		_, err := c.store.PruneStatsSamplesBefore(cutoff)
+		if err != nil {
+			slog.Warn("stats collector: prune", "error", err)
+		}
+	}
+}