feat(stats): resource metrics dashboard + sites logs/stats
Build / build (push) Successful in 10m50s
Build / build (push) Successful in 10m50s
Background collector samples CPU/memory/network/block I/O for every
instance and site on a configurable interval (default 15s, range
5-300s), persists samples to SQLite with a configurable retention
window (default 2h, range 0-24h), and skips ticks gracefully when
the Docker daemon is unreachable. Settings are reloadable without
a restart — each tick re-reads them.
New API endpoints:
- GET /api/system/stats (host snapshot: info + df)
- GET /api/system/stats/history
- GET /api/system/stats/top?by=cpu|memory
- GET /api/projects/{id}/stages/{s}/instances/{iid}/stats/history
- GET /api/sites/{id}/stats[/history]
- GET /api/sites/{id}/logs (SSE + JSON, reuses instance log streamer)
Frontend:
- ECharts added with tree-shaken imports (~180KB gzip) for
future-proof time-series/gantt/graph visualizations
- CollapsibleSection wraps all dashboard sections (system health,
daemons, system resources, static sites, projects) with
localStorage-persisted open state
- SystemResourcesCard shows capacity tiles, workload utilization
chart with 30m/2h/6h/24h window picker, disk breakdown with
reclaimable callouts, and top 5 consumers
- ContainerStats and ContainerLogs take a source discriminated union
so sites reuse the same components as instances; sites detail page
embeds both for Deno backend debugging
- Settings › Maintenance exposes collection interval + retention
- Docker-unavailable state returns 503 and renders an amber banner
instead of a generic 500
Full i18n coverage (en + ru) for all new strings.
This commit is contained in:
@@ -0,0 +1,309 @@
|
||||
// Package stats implements a background goroutine that periodically samples
|
||||
// Docker container and host-level resource usage and persists the samples
|
||||
// into SQLite. It reads its interval and retention from settings on every
|
||||
// tick so configuration changes take effect without a restart.
|
||||
package stats
|
||||
|
||||
import (
|
||||
"context"
|
||||
"log/slog"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/alexei/tinyforge/internal/docker"
|
||||
"github.com/alexei/tinyforge/internal/store"
|
||||
)
|
||||
|
||||
// Tuning values for the collector's sampling interval and retention window.
const (
	// MinIntervalSeconds is the lower clamp readConfig applies to a
	// positive sampling interval.
	MinIntervalSeconds = 5
	// MaxIntervalSeconds is the upper clamp readConfig applies to the
	// sampling interval.
	MaxIntervalSeconds = 300

	// DefaultIntervalSeconds is the sampling interval readConfig falls back
	// to when settings cannot be read.
	DefaultIntervalSeconds = 15
	// DefaultRetentionHours is the retention window readConfig falls back
	// to when settings cannot be read.
	DefaultRetentionHours = 2
)

// maxParallelSamples is a hard cap on concurrent container stat requests so
// that a host with many containers does not overwhelm the Docker daemon.
const maxParallelSamples = 8
|
||||
|
||||
// Allowed values for ContainerStatsSample.OwnerType; they record which kind
// of entity owns the sampled container.
const (
	// OwnerTypeInstance tags samples taken from an instance's container.
	OwnerTypeInstance = "instance"
	// OwnerTypeSite tags samples taken from a static site's container.
	OwnerTypeSite = "site"
)
|
||||
|
||||
// Collector runs the background sampling loop.
|
||||
type Collector struct {
|
||||
store *store.Store
|
||||
docker *docker.Client
|
||||
|
||||
stopOnce sync.Once
|
||||
stop chan struct{}
|
||||
done chan struct{}
|
||||
}
|
||||
|
||||
// New creates a new stats collector. Call Start to begin sampling.
|
||||
func New(s *store.Store, d *docker.Client) *Collector {
|
||||
return &Collector{
|
||||
store: s,
|
||||
docker: d,
|
||||
stop: make(chan struct{}),
|
||||
done: make(chan struct{}),
|
||||
}
|
||||
}
|
||||
|
||||
// Start launches the background loop. Returns immediately. The loop exits
|
||||
// when Stop is called.
|
||||
func (c *Collector) Start() {
|
||||
go c.run()
|
||||
}
|
||||
|
||||
// Stop signals the collector to exit and blocks until it has finished the
|
||||
// in-flight tick.
|
||||
func (c *Collector) Stop() {
|
||||
c.stopOnce.Do(func() { close(c.stop) })
|
||||
<-c.done
|
||||
}
|
||||
|
||||
// run is the main loop. It reads the interval from settings on every tick,
|
||||
// which lets configuration changes propagate within one tick without a
|
||||
// dedicated reload mechanism.
|
||||
func (c *Collector) run() {
|
||||
defer close(c.done)
|
||||
|
||||
// Wait a few seconds before the first sample so the app has settled.
|
||||
select {
|
||||
case <-time.After(3 * time.Second):
|
||||
case <-c.stop:
|
||||
return
|
||||
}
|
||||
|
||||
for {
|
||||
interval, retention := c.readConfig()
|
||||
if interval == 0 || retention == 0 {
|
||||
// Collection disabled. Poll settings every minute in case the
|
||||
// user re-enables it.
|
||||
select {
|
||||
case <-time.After(time.Minute):
|
||||
continue
|
||||
case <-c.stop:
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
c.tick(retention)
|
||||
|
||||
select {
|
||||
case <-time.After(time.Duration(interval) * time.Second):
|
||||
case <-c.stop:
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// readConfig reads the current interval + retention from settings, applying
|
||||
// defaults and clamping to the valid range.
|
||||
func (c *Collector) readConfig() (intervalSeconds, retentionHours int) {
|
||||
settings, err := c.store.GetSettings()
|
||||
if err != nil {
|
||||
slog.Warn("stats collector: failed to read settings — using defaults", "error", err)
|
||||
return DefaultIntervalSeconds, DefaultRetentionHours
|
||||
}
|
||||
intervalSeconds = settings.StatsIntervalSeconds
|
||||
retentionHours = settings.StatsRetentionHours
|
||||
if intervalSeconds < 0 || retentionHours < 0 {
|
||||
return 0, 0
|
||||
}
|
||||
if intervalSeconds > 0 && intervalSeconds < MinIntervalSeconds {
|
||||
intervalSeconds = MinIntervalSeconds
|
||||
}
|
||||
if intervalSeconds > MaxIntervalSeconds {
|
||||
intervalSeconds = MaxIntervalSeconds
|
||||
}
|
||||
return intervalSeconds, retentionHours
|
||||
}
|
||||
|
||||
// tick samples all known containers, aggregates workload-level totals,
|
||||
// persists samples, and prunes rows beyond the retention window. When
|
||||
// the Docker daemon is unreachable the whole tick is skipped with a
|
||||
// single debug log instead of one warning per container.
|
||||
func (c *Collector) tick(retentionHours int) {
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
|
||||
defer cancel()
|
||||
|
||||
pingCtx, pingCancel := context.WithTimeout(ctx, 2*time.Second)
|
||||
defer pingCancel()
|
||||
if err := c.docker.Ping(pingCtx); err != nil {
|
||||
slog.Debug("stats collector: docker unreachable, skipping tick", "error", err)
|
||||
return
|
||||
}
|
||||
|
||||
targets := c.buildTargets()
|
||||
if len(targets) == 0 {
|
||||
// No containers to sample, but still record a system sample so the
|
||||
// host history isn't empty.
|
||||
c.recordSystemSample(ctx, 0, 0, 0)
|
||||
c.prune(retentionHours)
|
||||
return
|
||||
}
|
||||
|
||||
samples := c.sampleAll(ctx, targets)
|
||||
|
||||
var (
|
||||
totalCPU float64
|
||||
totalMem int64
|
||||
running int
|
||||
)
|
||||
for _, s := range samples {
|
||||
if err := c.store.InsertContainerStatsSample(s); err != nil {
|
||||
slog.Warn("stats collector: insert container sample",
|
||||
"container", s.ContainerID, "error", err)
|
||||
continue
|
||||
}
|
||||
totalCPU += s.CPUPercent
|
||||
totalMem += s.MemoryUsage
|
||||
running++
|
||||
}
|
||||
|
||||
c.recordSystemSample(ctx, totalCPU, totalMem, running)
|
||||
c.prune(retentionHours)
|
||||
}
|
||||
|
||||
// target identifies one container to sample together with the entity that
// owns it.
type target struct {
	// ContainerID is the Docker container to query for stats.
	ContainerID string
	// OwnerType is OwnerTypeInstance or OwnerTypeSite; OwnerID is the ID of
	// that instance or site.
	OwnerType, OwnerID string
}
|
||||
|
||||
// buildTargets fetches running instances and sites that have a container ID.
|
||||
func (c *Collector) buildTargets() []target {
|
||||
var out []target
|
||||
|
||||
instances, err := c.store.ListAllInstances()
|
||||
if err != nil {
|
||||
slog.Warn("stats collector: list instances", "error", err)
|
||||
} else {
|
||||
for _, inst := range instances {
|
||||
if inst.ContainerID == "" {
|
||||
continue
|
||||
}
|
||||
out = append(out, target{
|
||||
ContainerID: inst.ContainerID,
|
||||
OwnerType: OwnerTypeInstance,
|
||||
OwnerID: inst.ID,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
sites, err := c.store.GetAllStaticSites()
|
||||
if err != nil {
|
||||
slog.Warn("stats collector: list sites", "error", err)
|
||||
} else {
|
||||
for _, site := range sites {
|
||||
if site.ContainerID == "" {
|
||||
continue
|
||||
}
|
||||
out = append(out, target{
|
||||
ContainerID: site.ContainerID,
|
||||
OwnerType: OwnerTypeSite,
|
||||
OwnerID: site.ID,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
return out
|
||||
}
|
||||
|
||||
// sampleAll fetches Docker stats for every target in bounded parallelism.
|
||||
// Failed samples are logged and skipped — a missing container must not kill
|
||||
// the whole tick.
|
||||
func (c *Collector) sampleAll(ctx context.Context, targets []target) []store.ContainerStatsSample {
|
||||
sem := make(chan struct{}, maxParallelSamples)
|
||||
results := make([]store.ContainerStatsSample, len(targets))
|
||||
found := make([]bool, len(targets))
|
||||
|
||||
var wg sync.WaitGroup
|
||||
for i, t := range targets {
|
||||
wg.Add(1)
|
||||
go func(i int, t target) {
|
||||
defer wg.Done()
|
||||
sem <- struct{}{}
|
||||
defer func() { <-sem }()
|
||||
|
||||
sampleCtx, cancel := context.WithTimeout(ctx, 10*time.Second)
|
||||
defer cancel()
|
||||
|
||||
stats, err := c.docker.GetContainerStats(sampleCtx, t.ContainerID)
|
||||
if err != nil {
|
||||
slog.Debug("stats collector: get container stats",
|
||||
"container", t.ContainerID, "owner_type", t.OwnerType, "error", err)
|
||||
return
|
||||
}
|
||||
ts := stats.Timestamp.Unix()
|
||||
if ts <= 0 {
|
||||
ts = time.Now().UTC().Unix()
|
||||
}
|
||||
results[i] = store.ContainerStatsSample{
|
||||
ContainerID: t.ContainerID,
|
||||
OwnerType: t.OwnerType,
|
||||
OwnerID: t.OwnerID,
|
||||
TS: ts,
|
||||
CPUPercent: stats.CPUPercent,
|
||||
MemoryUsage: stats.MemoryUsage,
|
||||
MemoryLimit: stats.MemoryLimit,
|
||||
NetworkRxBytes: stats.NetworkRxBytes,
|
||||
NetworkTxBytes: stats.NetworkTxBytes,
|
||||
BlockReadBytes: stats.BlockReadBytes,
|
||||
BlockWriteBytes: stats.BlockWriteBytes,
|
||||
}
|
||||
found[i] = true
|
||||
}(i, t)
|
||||
}
|
||||
wg.Wait()
|
||||
|
||||
out := results[:0]
|
||||
for i := range results {
|
||||
if found[i] {
|
||||
out = append(out, results[i])
|
||||
}
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
// recordSystemSample fetches host info + disk usage and persists a combined
|
||||
// system-level sample. Failures are warned but do not propagate.
|
||||
func (c *Collector) recordSystemSample(ctx context.Context, workloadCPU float64, workloadMem int64, running int) {
|
||||
sysStats, err := c.docker.GetSystemStats(ctx)
|
||||
if err != nil {
|
||||
slog.Warn("stats collector: get system stats", "error", err)
|
||||
return
|
||||
}
|
||||
sample := store.SystemStatsSample{
|
||||
TS: sysStats.Timestamp.Unix(),
|
||||
NCPU: sysStats.NCPU,
|
||||
MemoryTotal: sysStats.MemoryTotal,
|
||||
WorkloadCPUPercent: workloadCPU,
|
||||
WorkloadMemUsage: workloadMem,
|
||||
ContainersRunning: running,
|
||||
DiskTotalBytes: sysStats.DiskTotalBytes,
|
||||
}
|
||||
// Prefer the Docker-reported running count when we have no running samples
|
||||
// (e.g., very first tick may race with container readiness).
|
||||
if running == 0 && sysStats.Running > 0 {
|
||||
sample.ContainersRunning = sysStats.Running
|
||||
}
|
||||
if err := c.store.InsertSystemStatsSample(sample); err != nil {
|
||||
slog.Warn("stats collector: insert system sample", "error", err)
|
||||
}
|
||||
}
|
||||
|
||||
// prune drops rows older than the retention window.
|
||||
func (c *Collector) prune(retentionHours int) {
|
||||
if retentionHours <= 0 {
|
||||
return
|
||||
}
|
||||
cutoff := time.Now().UTC().Add(-time.Duration(retentionHours) * time.Hour).Unix()
|
||||
if _, err := c.store.PruneStatsSamplesBefore(cutoff); err != nil {
|
||||
slog.Warn("stats collector: prune", "error", err)
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user