feat(stats): resource metrics dashboard + sites logs/stats
Build / build (push) Successful in 10m50s

Background collector samples CPU/memory/network/block I/O for every
instance and site on a configurable interval (default 15s, range
5-300s), persists samples to SQLite with a configurable retention
window (default 2h, range 0-24h), and skips ticks gracefully when
the Docker daemon is unreachable. Settings are reloadable without
a restart — each tick re-reads them.

New API endpoints:
- GET /api/system/stats (host snapshot: info + df)
- GET /api/system/stats/history
- GET /api/system/stats/top?by=cpu|memory
- GET /api/projects/{id}/stages/{s}/instances/{iid}/stats/history
- GET /api/sites/{id}/stats[/history]
- GET /api/sites/{id}/logs (SSE + JSON, reuses instance log streamer)

Frontend:
- ECharts added with tree-shaken imports (~180KB gzip) for
  future-proof time-series/gantt/graph visualizations
- CollapsibleSection wraps all dashboard sections (system health,
  daemons, system resources, static sites, projects) with
  localStorage-persisted open state
- SystemResourcesCard shows capacity tiles, workload utilization
  chart with 30m/2h/6h/24h window picker, disk breakdown with
  reclaimable callouts, and top 5 consumers
- ContainerStats and ContainerLogs take a source discriminated union
  so sites reuse the same components as instances; sites detail page
  embeds both for Deno backend debugging
- Settings › Maintenance exposes collection interval + retention
- Docker-unavailable state returns 503 and renders an amber banner
  instead of a generic 500

Full i18n coverage (en + ru) for all new strings.
This commit is contained in:
2026-04-24 15:02:43 +03:00
parent 0632f512e6
commit 05440a5f92
27 changed files with 1897 additions and 112 deletions
+9 -2
View File
@@ -68,6 +68,13 @@ func (s *Server) streamContainerLogs(w http.ResponseWriter, r *http.Request) {
return
}
s.streamLogsForContainer(w, r, inst.ContainerID)
}
// streamLogsForContainer streams logs for an arbitrary container ID using the
// shared SSE/JSON dual-mode pattern. Owner-specific handlers (instance, site)
// should validate ownership and then delegate here.
func (s *Server) streamLogsForContainer(w http.ResponseWriter, r *http.Request, containerID string) {
if s.docker == nil {
respondError(w, http.StatusServiceUnavailable, "Docker is not available")
return
@@ -83,9 +90,9 @@ func (s *Server) streamContainerLogs(w http.ResponseWriter, r *http.Request) {
accept := r.Header.Get("Accept")
isSSE := strings.Contains(accept, "text/event-stream")
logReader, err := s.docker.ContainerLogs(r.Context(), inst.ContainerID, follow && isSSE, tail)
logReader, err := s.docker.ContainerLogs(r.Context(), containerID, follow && isSSE, tail)
if err != nil {
slog.Error("failed to get container logs", "instance", instanceID, "error", err)
slog.Error("failed to get container logs", "container", containerID, "error", err)
respondError(w, http.StatusInternalServerError, "failed to get container logs")
return
}
+9
View File
@@ -219,6 +219,7 @@ func (s *Server) Router() chi.Router {
r.Get("/stages/{stage}/env", s.listStageEnv)
r.Get("/stages/{stage}/instances", s.listInstances)
r.Get("/stages/{stage}/instances/{iid}/stats", s.getInstanceStats)
r.Get("/stages/{stage}/instances/{iid}/stats/history", s.getInstanceStatsHistory)
r.Get("/stages/{stage}/instances/{iid}/logs", s.streamContainerLogs)
r.Get("/images", s.listProjectImages)
r.Get("/volumes", s.listVolumes)
@@ -288,6 +289,9 @@ func (s *Server) Router() chi.Router {
r.Get("/", s.getStaticSite)
r.Get("/secrets", s.listStaticSiteSecrets)
r.Get("/storage", s.getStaticSiteStorage)
r.Get("/logs", s.streamStaticSiteLogs)
r.Get("/stats", s.getStaticSiteStats)
r.Get("/stats/history", s.getStaticSiteStatsHistory)
// Admin-only mutations.
r.Group(func(r chi.Router) {
@@ -333,6 +337,11 @@ func (s *Server) Router() chi.Router {
// Stale container endpoints (read).
r.Get("/containers/stale", s.listStaleContainers)
// System resources (read-only).
r.Get("/system/stats", s.getSystemStats)
r.Get("/system/stats/history", s.getSystemStatsHistory)
r.Get("/system/stats/top", s.listTopContainersByCPU)
// Admin-only routes: require admin role.
r.Group(func(r chi.Router) {
r.Use(auth.AdminOnly)
+20
View File
@@ -46,6 +46,8 @@ type settingsRequest struct {
BackupEnabled *bool `json:"backup_enabled,omitempty"`
BackupIntervalHours *int `json:"backup_interval_hours,omitempty"`
BackupRetentionCount *int `json:"backup_retention_count,omitempty"`
StatsIntervalSeconds *int `json:"stats_interval_seconds,omitempty"`
StatsRetentionHours *int `json:"stats_retention_hours,omitempty"`
}
// getSettings handles GET /api/settings.
@@ -86,6 +88,8 @@ func (s *Server) getSettings(w http.ResponseWriter, r *http.Request) {
"backup_enabled": settings.BackupEnabled,
"backup_interval_hours": settings.BackupIntervalHours,
"backup_retention_count": settings.BackupRetentionCount,
"stats_interval_seconds": settings.StatsIntervalSeconds,
"stats_retention_hours": settings.StatsRetentionHours,
"updated_at": settings.UpdatedAt,
})
}
@@ -238,6 +242,22 @@ func (s *Server) updateSettings(w http.ResponseWriter, r *http.Request) {
}
updated.BackupRetentionCount = *req.BackupRetentionCount
}
if req.StatsIntervalSeconds != nil {
v := *req.StatsIntervalSeconds
if v != 0 && (v < 5 || v > 300) {
respondError(w, http.StatusBadRequest, "stats_interval_seconds must be 0 (disabled) or between 5 and 300")
return
}
updated.StatsIntervalSeconds = v
}
if req.StatsRetentionHours != nil {
v := *req.StatsRetentionHours
if v < 0 || v > 24 {
respondError(w, http.StatusBadRequest, "stats_retention_hours must be between 0 and 24")
return
}
updated.StatsRetentionHours = v
}
if err := s.store.UpdateSettings(updated); err != nil {
respondError(w, http.StatusInternalServerError, "failed to update settings: "+err.Error())
+245
View File
@@ -0,0 +1,245 @@
package api
import (
"errors"
"log/slog"
"net/http"
"strconv"
"time"
"github.com/go-chi/chi/v5"
"github.com/alexei/tinyforge/internal/stats"
"github.com/alexei/tinyforge/internal/store"
)
const (
// defaultHistoryWindow is used when no ?window= param is provided or the
// value fails to parse. Matches the default retention so the "last 2h"
// view always has data when collection is enabled.
defaultHistoryWindow = 2 * time.Hour
maxHistoryWindow = 24 * time.Hour
)
// parseWindow reads the ?window= query (Go duration string, e.g. "1h", "30m")
// and returns a bounded duration.
func parseWindow(r *http.Request) time.Duration {
raw := r.URL.Query().Get("window")
if raw == "" {
return defaultHistoryWindow
}
d, err := time.ParseDuration(raw)
if err != nil || d <= 0 {
return defaultHistoryWindow
}
if d > maxHistoryWindow {
return maxHistoryWindow
}
return d
}
// sinceTimestamp converts a duration into a Unix-seconds cutoff.
func sinceTimestamp(window time.Duration) int64 {
return time.Now().UTC().Add(-window).Unix()
}
// getSystemStats handles GET /api/system/stats — current host snapshot.
// When the Docker daemon is unreachable (e.g. Docker Desktop stopped) the
// handler returns 503 so the frontend can show a dedicated unavailable
// state instead of treating it as a generic 5xx failure.
func (s *Server) getSystemStats(w http.ResponseWriter, r *http.Request) {
if s.docker == nil {
respondError(w, http.StatusServiceUnavailable, "Docker is not available")
return
}
sys, err := s.docker.GetSystemStats(r.Context())
if err != nil {
slog.Warn("system stats unavailable", "error", err)
respondError(w, http.StatusServiceUnavailable, "Docker is not available")
return
}
respondJSON(w, http.StatusOK, sys)
}
// getSystemStatsHistory handles GET /api/system/stats/history?window=1h.
func (s *Server) getSystemStatsHistory(w http.ResponseWriter, r *http.Request) {
samples, err := s.store.ListSystemStatsSamples(sinceTimestamp(parseWindow(r)))
if err != nil {
slog.Error("failed to list system stats samples", "error", err)
respondError(w, http.StatusInternalServerError, "failed to list samples")
return
}
if samples == nil {
samples = []store.SystemStatsSample{}
}
respondJSON(w, http.StatusOK, samples)
}
// getInstanceStatsHistory handles GET /api/projects/{id}/stages/{stage}/instances/{iid}/stats/history.
func (s *Server) getInstanceStatsHistory(w http.ResponseWriter, r *http.Request) {
instanceID := chi.URLParam(r, "iid")
if _, err := s.store.GetInstanceByID(instanceID); err != nil {
if errors.Is(err, store.ErrNotFound) {
respondNotFound(w, "instance")
return
}
slog.Error("failed to get instance", "instance_id", instanceID, "error", err)
respondError(w, http.StatusInternalServerError, "failed to get instance")
return
}
samples, err := s.store.ListContainerStatsSamples(stats.OwnerTypeInstance, instanceID, sinceTimestamp(parseWindow(r)))
if err != nil {
slog.Error("failed to list instance stats samples", "instance_id", instanceID, "error", err)
respondError(w, http.StatusInternalServerError, "failed to list samples")
return
}
if samples == nil {
samples = []store.ContainerStatsSample{}
}
respondJSON(w, http.StatusOK, samples)
}
// getStaticSiteStats handles GET /api/sites/{id}/stats — current snapshot.
func (s *Server) getStaticSiteStats(w http.ResponseWriter, r *http.Request) {
id := chi.URLParam(r, "id")
site, err := s.store.GetStaticSiteByID(id)
if err != nil {
if errors.Is(err, store.ErrNotFound) {
respondNotFound(w, "site")
return
}
slog.Error("failed to get site", "site_id", id, "error", err)
respondError(w, http.StatusInternalServerError, "failed to get site")
return
}
if site.ContainerID == "" {
respondError(w, http.StatusConflict, "site has no container")
return
}
if s.docker == nil {
respondError(w, http.StatusServiceUnavailable, "Docker is not available")
return
}
cs, err := s.docker.GetContainerStats(r.Context(), site.ContainerID)
if err != nil {
slog.Error("failed to get site stats", "site_id", id, "error", err)
respondError(w, http.StatusInternalServerError, "failed to get site stats")
return
}
respondJSON(w, http.StatusOK, cs)
}
// getStaticSiteStatsHistory handles GET /api/sites/{id}/stats/history.
func (s *Server) getStaticSiteStatsHistory(w http.ResponseWriter, r *http.Request) {
id := chi.URLParam(r, "id")
if _, err := s.store.GetStaticSiteByID(id); err != nil {
if errors.Is(err, store.ErrNotFound) {
respondNotFound(w, "site")
return
}
slog.Error("failed to get site", "site_id", id, "error", err)
respondError(w, http.StatusInternalServerError, "failed to get site")
return
}
samples, err := s.store.ListContainerStatsSamples(stats.OwnerTypeSite, id, sinceTimestamp(parseWindow(r)))
if err != nil {
slog.Error("failed to list site stats samples", "site_id", id, "error", err)
respondError(w, http.StatusInternalServerError, "failed to list samples")
return
}
if samples == nil {
samples = []store.ContainerStatsSample{}
}
respondJSON(w, http.StatusOK, samples)
}
// streamStaticSiteLogs handles GET /api/sites/{id}/logs?tail=200&follow=true.
// Reuses the shared container log streamer so the SSE + multiplex handling
// matches /api/projects/.../instances/.../logs exactly.
func (s *Server) streamStaticSiteLogs(w http.ResponseWriter, r *http.Request) {
id := chi.URLParam(r, "id")
site, err := s.store.GetStaticSiteByID(id)
if err != nil {
if errors.Is(err, store.ErrNotFound) {
respondNotFound(w, "site")
return
}
slog.Error("failed to get site", "site_id", id, "error", err)
respondError(w, http.StatusInternalServerError, "failed to get site")
return
}
if site.ContainerID == "" {
respondError(w, http.StatusConflict, "site has no container")
return
}
s.streamLogsForContainer(w, r, site.ContainerID)
}
// listTopContainersByCPU handles GET /api/system/stats/top?limit=5&by=cpu.
// Returns the top-N most recent samples across containers, sorted by CPU or
// memory. Useful for a system dashboard "top consumers" widget without
// requiring the frontend to aggregate per-container history on its own.
func (s *Server) listTopContainersByCPU(w http.ResponseWriter, r *http.Request) {
limit := 5
if raw := r.URL.Query().Get("limit"); raw != "" {
if n, err := strconv.Atoi(raw); err == nil && n > 0 && n <= 50 {
limit = n
}
}
by := r.URL.Query().Get("by")
if by != "memory" {
by = "cpu"
}
// Samples from the last 2 minutes window so "top" reflects near-current
// load, not long-dead rows.
samples, err := s.store.ListAllRecentContainerStatsSamples(sinceTimestamp(2 * time.Minute))
if err != nil {
slog.Error("failed to list container samples for top", "error", err)
respondError(w, http.StatusInternalServerError, "failed to list samples")
return
}
// Keep only the latest sample per container.
latest := make(map[string]store.ContainerStatsSample, len(samples))
for _, sm := range samples {
if prev, ok := latest[sm.ContainerID]; !ok || sm.TS > prev.TS {
latest[sm.ContainerID] = sm
}
}
top := make([]store.ContainerStatsSample, 0, len(latest))
for _, sm := range latest {
top = append(top, sm)
}
// Partial-sort by the requested metric, descending. For small N a simple
// insertion-like approach is plenty.
sortContainerSamples(top, by)
if len(top) > limit {
top = top[:limit]
}
respondJSON(w, http.StatusOK, top)
}
// sortContainerSamples sorts in place by CPU (or memory) descending.
// Note: ListContainerStatsSamples with empty ownerID returns no rows — the
// caller uses per-owner-type queries and merges; this helper is applied to
// the already-merged slice.
func sortContainerSamples(s []store.ContainerStatsSample, by string) {
// O(n^2) is fine — N is small (bounded by the number of containers).
for i := 1; i < len(s); i++ {
for j := i; j > 0; j-- {
var less bool
if by == "memory" {
less = s[j].MemoryUsage > s[j-1].MemoryUsage
} else {
less = s[j].CPUPercent > s[j-1].CPUPercent
}
if !less {
break
}
s[j-1], s[j] = s[j], s[j-1]
}
}
}
+58 -10
View File
@@ -4,21 +4,30 @@ import (
"context"
"encoding/json"
"fmt"
"strings"
"time"
"github.com/moby/moby/api/types/container"
"github.com/moby/moby/client"
)
// ContainerStats holds computed CPU and memory usage for a container.
// ContainerStats holds computed CPU, memory, network, and block I/O
// usage for a container. Network and block I/O values are cumulative
// byte counters — compute rates by differencing two samples.
type ContainerStats struct {
CPUPercent float64 `json:"cpu_percent"`
MemoryUsage int64 `json:"memory_usage"`
MemoryLimit int64 `json:"memory_limit"`
MemoryPercent float64 `json:"memory_percent"`
Timestamp time.Time `json:"timestamp"`
CPUPercent float64 `json:"cpu_percent"`
MemoryUsage int64 `json:"memory_usage"`
MemoryLimit int64 `json:"memory_limit"`
MemoryPercent float64 `json:"memory_percent"`
NetworkRxBytes int64 `json:"network_rx_bytes"`
NetworkTxBytes int64 `json:"network_tx_bytes"`
BlockReadBytes int64 `json:"block_read_bytes"`
BlockWriteBytes int64 `json:"block_write_bytes"`
}
// GetContainerStats retrieves a one-shot stats snapshot for the given container
// and computes CPU and memory percentages.
// and computes CPU, memory, network, and block I/O metrics.
func (c *Client) GetContainerStats(ctx context.Context, containerID string) (ContainerStats, error) {
result, err := c.api.ContainerStats(ctx, containerID, client.ContainerStatsOptions{
Stream: false,
@@ -42,14 +51,53 @@ func (c *Client) GetContainerStats(ctx context.Context, containerID string) (Con
memPercent = float64(memUsage) / float64(memLimit) * 100.0
}
rxBytes, txBytes := sumNetworkBytes(stats.Networks)
readBytes, writeBytes := sumBlockIOBytes(stats.BlkioStats.IoServiceBytesRecursive)
ts := stats.Read
if ts.IsZero() {
ts = time.Now().UTC()
}
return ContainerStats{
CPUPercent: cpuPercent,
MemoryUsage: memUsage,
MemoryLimit: memLimit,
MemoryPercent: memPercent,
Timestamp: ts,
CPUPercent: cpuPercent,
MemoryUsage: memUsage,
MemoryLimit: memLimit,
MemoryPercent: memPercent,
NetworkRxBytes: rxBytes,
NetworkTxBytes: txBytes,
BlockReadBytes: readBytes,
BlockWriteBytes: writeBytes,
}, nil
}
// sumNetworkBytes aggregates rx/tx byte counters across all network interfaces
// for a single container. Missing Networks map (disabled networking) yields zeros.
func sumNetworkBytes(nets map[string]container.NetworkStats) (rx, tx int64) {
for _, n := range nets {
rx += int64(n.RxBytes)
tx += int64(n.TxBytes)
}
return rx, tx
}
// sumBlockIOBytes totals read/write bytes across all block devices from the
// cgroup io_service_bytes_recursive entries. The "Op" field is "read"/"write"
// on cgroup v2 and "Read"/"Write" on cgroup v1 — match case-insensitively so
// either runtime works.
func sumBlockIOBytes(entries []container.BlkioStatEntry) (read, write int64) {
for _, e := range entries {
switch {
case strings.EqualFold(e.Op, "read"):
read += int64(e.Value)
case strings.EqualFold(e.Op, "write"):
write += int64(e.Value)
}
}
return read, write
}
// calculateCPUPercent computes CPU usage percentage from a stats response
// using the delta between current and previous CPU readings.
func calculateCPUPercent(stats container.StatsResponse) float64 {
+87
View File
@@ -0,0 +1,87 @@
package docker
import (
"context"
"fmt"
"time"
"github.com/moby/moby/client"
)
// SystemStats is a host-level snapshot combining daemon capacity
// (NCPU, memory total) with container counts and disk usage broken down
// by category. Workload CPU/memory utilization is aggregated from
// per-container samples by the stats collector, not here.
type SystemStats struct {
Timestamp time.Time `json:"timestamp"`
// Capacity from Docker daemon.
NCPU int `json:"ncpu"`
MemoryTotal int64 `json:"memory_total"`
// Container/image counts.
Containers int `json:"containers"`
Running int `json:"running"`
Paused int `json:"paused"`
Stopped int `json:"stopped"`
Images int `json:"images"`
// Disk usage by category (bytes).
DiskImagesBytes int64 `json:"disk_images_bytes"`
DiskContainersBytes int64 `json:"disk_containers_bytes"`
DiskVolumesBytes int64 `json:"disk_volumes_bytes"`
DiskBuildCacheBytes int64 `json:"disk_build_cache_bytes"`
// Reclaimable disk space by category (bytes).
DiskImagesReclaimable int64 `json:"disk_images_reclaimable"`
DiskContainersReclaimable int64 `json:"disk_containers_reclaimable"`
DiskVolumesReclaimable int64 `json:"disk_volumes_reclaimable"`
DiskBuildCacheReclaimable int64 `json:"disk_build_cache_reclaimable"`
// DiskTotalBytes is the sum of the category totals.
DiskTotalBytes int64 `json:"disk_total_bytes"`
}
// GetSystemStats returns a one-shot host-level snapshot. The Info() call
// and disk usage call are made in sequence. Disk usage failures do not
// fail the whole call — the result degrades gracefully with zero disk fields.
func (c *Client) GetSystemStats(ctx context.Context) (SystemStats, error) {
info, err := c.Info(ctx)
if err != nil {
return SystemStats{}, fmt.Errorf("system stats: %w", err)
}
stats := SystemStats{
Timestamp: time.Now().UTC(),
NCPU: info.NCPU,
MemoryTotal: info.MemoryTotal,
Containers: info.Containers,
Running: info.Running,
Paused: info.Paused,
Stopped: info.Stopped,
Images: info.Images,
}
du, derr := c.api.DiskUsage(ctx, client.DiskUsageOptions{
Containers: true,
Images: true,
Volumes: true,
BuildCache: true,
})
if derr == nil {
stats.DiskImagesBytes = du.Images.TotalSize
stats.DiskContainersBytes = du.Containers.TotalSize
stats.DiskVolumesBytes = du.Volumes.TotalSize
stats.DiskBuildCacheBytes = du.BuildCache.TotalSize
stats.DiskImagesReclaimable = du.Images.Reclaimable
stats.DiskContainersReclaimable = du.Containers.Reclaimable
stats.DiskVolumesReclaimable = du.Volumes.Reclaimable
stats.DiskBuildCacheReclaimable = du.BuildCache.Reclaimable
stats.DiskTotalBytes = stats.DiskImagesBytes +
stats.DiskContainersBytes +
stats.DiskVolumesBytes +
stats.DiskBuildCacheBytes
}
return stats, nil
}
+309
View File
@@ -0,0 +1,309 @@
// Package stats implements a background goroutine that periodically samples
// Docker container and host-level resource usage and persists the samples
// into SQLite. It reads its interval and retention from settings on every
// tick so configuration changes take effect without a restart.
package stats
import (
"context"
"log/slog"
"sync"
"time"
"github.com/alexei/tinyforge/internal/docker"
"github.com/alexei/tinyforge/internal/store"
)
// Defaults applied when settings values are outside their valid range.
const (
DefaultIntervalSeconds = 15
DefaultRetentionHours = 2
MinIntervalSeconds = 5
MaxIntervalSeconds = 300
// Hard cap on parallel container stat requests to avoid overwhelming
// the Docker daemon when the user has many containers.
maxParallelSamples = 8
)
// OwnerType values for ContainerStatsSample.OwnerType.
const (
OwnerTypeInstance = "instance"
OwnerTypeSite = "site"
)
// Collector runs the background sampling loop.
type Collector struct {
store *store.Store
docker *docker.Client
stopOnce sync.Once
stop chan struct{}
done chan struct{}
}
// New creates a new stats collector. Call Start to begin sampling.
func New(s *store.Store, d *docker.Client) *Collector {
return &Collector{
store: s,
docker: d,
stop: make(chan struct{}),
done: make(chan struct{}),
}
}
// Start launches the background loop. Returns immediately. The loop exits
// when Stop is called.
func (c *Collector) Start() {
go c.run()
}
// Stop signals the collector to exit and blocks until it has finished the
// in-flight tick.
func (c *Collector) Stop() {
c.stopOnce.Do(func() { close(c.stop) })
<-c.done
}
// run is the main loop. It reads the interval from settings on every tick,
// which lets configuration changes propagate within one tick without a
// dedicated reload mechanism.
func (c *Collector) run() {
defer close(c.done)
// Wait a few seconds before the first sample so the app has settled.
select {
case <-time.After(3 * time.Second):
case <-c.stop:
return
}
for {
interval, retention := c.readConfig()
if interval == 0 || retention == 0 {
// Collection disabled. Poll settings every minute in case the
// user re-enables it.
select {
case <-time.After(time.Minute):
continue
case <-c.stop:
return
}
}
c.tick(retention)
select {
case <-time.After(time.Duration(interval) * time.Second):
case <-c.stop:
return
}
}
}
// readConfig reads the current interval + retention from settings, applying
// defaults and clamping to the valid range.
func (c *Collector) readConfig() (intervalSeconds, retentionHours int) {
settings, err := c.store.GetSettings()
if err != nil {
slog.Warn("stats collector: failed to read settings — using defaults", "error", err)
return DefaultIntervalSeconds, DefaultRetentionHours
}
intervalSeconds = settings.StatsIntervalSeconds
retentionHours = settings.StatsRetentionHours
if intervalSeconds < 0 || retentionHours < 0 {
return 0, 0
}
if intervalSeconds > 0 && intervalSeconds < MinIntervalSeconds {
intervalSeconds = MinIntervalSeconds
}
if intervalSeconds > MaxIntervalSeconds {
intervalSeconds = MaxIntervalSeconds
}
return intervalSeconds, retentionHours
}
// tick samples all known containers, aggregates workload-level totals,
// persists samples, and prunes rows beyond the retention window. When
// the Docker daemon is unreachable the whole tick is skipped with a
// single debug log instead of one warning per container.
func (c *Collector) tick(retentionHours int) {
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
defer cancel()
pingCtx, pingCancel := context.WithTimeout(ctx, 2*time.Second)
defer pingCancel()
if err := c.docker.Ping(pingCtx); err != nil {
slog.Debug("stats collector: docker unreachable, skipping tick", "error", err)
return
}
targets := c.buildTargets()
if len(targets) == 0 {
// No containers to sample, but still record a system sample so the
// host history isn't empty.
c.recordSystemSample(ctx, 0, 0, 0)
c.prune(retentionHours)
return
}
samples := c.sampleAll(ctx, targets)
var (
totalCPU float64
totalMem int64
running int
)
for _, s := range samples {
if err := c.store.InsertContainerStatsSample(s); err != nil {
slog.Warn("stats collector: insert container sample",
"container", s.ContainerID, "error", err)
continue
}
totalCPU += s.CPUPercent
totalMem += s.MemoryUsage
running++
}
c.recordSystemSample(ctx, totalCPU, totalMem, running)
c.prune(retentionHours)
}
// target describes a single container to sample.
type target struct {
ContainerID string
OwnerType string
OwnerID string
}
// buildTargets fetches running instances and sites that have a container ID.
func (c *Collector) buildTargets() []target {
var out []target
instances, err := c.store.ListAllInstances()
if err != nil {
slog.Warn("stats collector: list instances", "error", err)
} else {
for _, inst := range instances {
if inst.ContainerID == "" {
continue
}
out = append(out, target{
ContainerID: inst.ContainerID,
OwnerType: OwnerTypeInstance,
OwnerID: inst.ID,
})
}
}
sites, err := c.store.GetAllStaticSites()
if err != nil {
slog.Warn("stats collector: list sites", "error", err)
} else {
for _, site := range sites {
if site.ContainerID == "" {
continue
}
out = append(out, target{
ContainerID: site.ContainerID,
OwnerType: OwnerTypeSite,
OwnerID: site.ID,
})
}
}
return out
}
// sampleAll fetches Docker stats for every target in bounded parallelism.
// Failed samples are logged and skipped — a missing container must not kill
// the whole tick.
func (c *Collector) sampleAll(ctx context.Context, targets []target) []store.ContainerStatsSample {
sem := make(chan struct{}, maxParallelSamples)
results := make([]store.ContainerStatsSample, len(targets))
found := make([]bool, len(targets))
var wg sync.WaitGroup
for i, t := range targets {
wg.Add(1)
go func(i int, t target) {
defer wg.Done()
sem <- struct{}{}
defer func() { <-sem }()
sampleCtx, cancel := context.WithTimeout(ctx, 10*time.Second)
defer cancel()
stats, err := c.docker.GetContainerStats(sampleCtx, t.ContainerID)
if err != nil {
slog.Debug("stats collector: get container stats",
"container", t.ContainerID, "owner_type", t.OwnerType, "error", err)
return
}
ts := stats.Timestamp.Unix()
if ts <= 0 {
ts = time.Now().UTC().Unix()
}
results[i] = store.ContainerStatsSample{
ContainerID: t.ContainerID,
OwnerType: t.OwnerType,
OwnerID: t.OwnerID,
TS: ts,
CPUPercent: stats.CPUPercent,
MemoryUsage: stats.MemoryUsage,
MemoryLimit: stats.MemoryLimit,
NetworkRxBytes: stats.NetworkRxBytes,
NetworkTxBytes: stats.NetworkTxBytes,
BlockReadBytes: stats.BlockReadBytes,
BlockWriteBytes: stats.BlockWriteBytes,
}
found[i] = true
}(i, t)
}
wg.Wait()
out := results[:0]
for i := range results {
if found[i] {
out = append(out, results[i])
}
}
return out
}
// recordSystemSample fetches host info + disk usage and persists a combined
// system-level sample. Failures are warned but do not propagate.
func (c *Collector) recordSystemSample(ctx context.Context, workloadCPU float64, workloadMem int64, running int) {
sysStats, err := c.docker.GetSystemStats(ctx)
if err != nil {
slog.Warn("stats collector: get system stats", "error", err)
return
}
sample := store.SystemStatsSample{
TS: sysStats.Timestamp.Unix(),
NCPU: sysStats.NCPU,
MemoryTotal: sysStats.MemoryTotal,
WorkloadCPUPercent: workloadCPU,
WorkloadMemUsage: workloadMem,
ContainersRunning: running,
DiskTotalBytes: sysStats.DiskTotalBytes,
}
// Prefer the Docker-reported running count when we have no running samples
// (e.g., very first tick may race with container readiness).
if running == 0 && sysStats.Running > 0 {
sample.ContainersRunning = sysStats.Running
}
if err := c.store.InsertSystemStatsSample(sample); err != nil {
slog.Warn("stats collector: insert system sample", "error", err)
}
}
// prune drops rows older than the retention window.
func (c *Collector) prune(retentionHours int) {
if retentionHours <= 0 {
return
}
cutoff := time.Now().UTC().Add(-time.Duration(retentionHours) * time.Hour).Unix()
if _, err := c.store.PruneStatsSamplesBefore(cutoff); err != nil {
slog.Warn("stats collector: prune", "error", err)
}
}
+31
View File
@@ -78,9 +78,40 @@ type Settings struct {
BackupEnabled bool `json:"backup_enabled"`
BackupIntervalHours int `json:"backup_interval_hours"`
BackupRetentionCount int `json:"backup_retention_count"`
StatsIntervalSeconds int `json:"stats_interval_seconds"` // 0 disables collection
StatsRetentionHours int `json:"stats_retention_hours"` // 0 disables collection
UpdatedAt string `json:"updated_at"`
}
// ContainerStatsSample is one persisted sample of container resource usage.
// Cumulative counters (network, block I/O) require differencing two samples
// to get rates; CPU is already a percent-since-previous-sample value.
type ContainerStatsSample struct {
ContainerID string `json:"container_id"`
OwnerType string `json:"owner_type"` // "instance" or "site"
OwnerID string `json:"owner_id"`
TS int64 `json:"ts"` // Unix seconds UTC
CPUPercent float64 `json:"cpu_percent"`
MemoryUsage int64 `json:"memory_usage"`
MemoryLimit int64 `json:"memory_limit"`
NetworkRxBytes int64 `json:"network_rx_bytes"`
NetworkTxBytes int64 `json:"network_tx_bytes"`
BlockReadBytes int64 `json:"block_read_bytes"`
BlockWriteBytes int64 `json:"block_write_bytes"`
}
// SystemStatsSample is one persisted host-level snapshot that aggregates
// workload usage across all containers plus daemon capacity + disk totals.
type SystemStatsSample struct {
TS int64 `json:"ts"` // Unix seconds UTC
NCPU int `json:"ncpu"`
MemoryTotal int64 `json:"memory_total"`
WorkloadCPUPercent float64 `json:"workload_cpu_percent"`
WorkloadMemUsage int64 `json:"workload_mem_usage"`
ContainersRunning int `json:"containers_running"`
DiskTotalBytes int64 `json:"disk_total_bytes"`
}
// Backup represents a backup metadata record.
type Backup struct {
ID string `json:"id"`
+4
View File
@@ -18,6 +18,7 @@ func (s *Store) GetSettings() (Settings, error) {
traefik_entrypoint, traefik_cert_resolver, traefik_network, traefik_api_url,
image_prune_threshold_mb,
backup_enabled, backup_interval_hours, backup_retention_count,
stats_interval_seconds, stats_retention_hours,
updated_at
FROM settings WHERE id = 1`,
).Scan(&st.Domain, &st.ServerIP, &st.PublicIP, &st.Network, &st.SubdomainPattern, &st.NotificationURL,
@@ -29,6 +30,7 @@ func (s *Store) GetSettings() (Settings, error) {
&st.TraefikEntrypoint, &st.TraefikCertResolver, &st.TraefikNetwork, &st.TraefikAPIURL,
&st.ImagePruneThresholdMB,
&backupEnabled, &st.BackupIntervalHours, &st.BackupRetentionCount,
&st.StatsIntervalSeconds, &st.StatsRetentionHours,
&st.UpdatedAt)
if err != nil {
return Settings{}, fmt.Errorf("query settings: %w", err)
@@ -65,6 +67,7 @@ func (s *Store) UpdateSettings(st Settings) error {
traefik_entrypoint=?, traefik_cert_resolver=?, traefik_network=?, traefik_api_url=?,
image_prune_threshold_mb=?,
backup_enabled=?, backup_interval_hours=?, backup_retention_count=?,
stats_interval_seconds=?, stats_retention_hours=?,
updated_at=?
WHERE id = 1`,
st.Domain, st.ServerIP, st.PublicIP, st.Network, st.SubdomainPattern, st.NotificationURL,
@@ -76,6 +79,7 @@ func (s *Store) UpdateSettings(st Settings) error {
st.TraefikEntrypoint, st.TraefikCertResolver, st.TraefikNetwork, st.TraefikAPIURL,
st.ImagePruneThresholdMB,
backupEnabled, st.BackupIntervalHours, st.BackupRetentionCount,
st.StatsIntervalSeconds, st.StatsRetentionHours,
st.UpdatedAt,
)
if err != nil {
+157
View File
@@ -0,0 +1,157 @@
package store
import (
"fmt"
)
// InsertContainerStatsSample appends a single container sample row.
func (s *Store) InsertContainerStatsSample(sample ContainerStatsSample) error {
_, err := s.db.Exec(
`INSERT INTO container_stats_samples (
container_id, owner_type, owner_id, ts,
cpu_percent, memory_usage, memory_limit,
network_rx, network_tx, block_read, block_write
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`,
sample.ContainerID, sample.OwnerType, sample.OwnerID, sample.TS,
sample.CPUPercent, sample.MemoryUsage, sample.MemoryLimit,
sample.NetworkRxBytes, sample.NetworkTxBytes,
sample.BlockReadBytes, sample.BlockWriteBytes,
)
if err != nil {
return fmt.Errorf("insert container stats sample: %w", err)
}
return nil
}
// InsertSystemStatsSample appends a single host-level sample row.
func (s *Store) InsertSystemStatsSample(sample SystemStatsSample) error {
_, err := s.db.Exec(
`INSERT INTO system_stats_samples (
ts, ncpu, memory_total,
workload_cpu_percent, workload_mem_usage,
containers_running, disk_total_bytes
) VALUES (?, ?, ?, ?, ?, ?, ?)`,
sample.TS, sample.NCPU, sample.MemoryTotal,
sample.WorkloadCPUPercent, sample.WorkloadMemUsage,
sample.ContainersRunning, sample.DiskTotalBytes,
)
if err != nil {
return fmt.Errorf("insert system stats sample: %w", err)
}
return nil
}
// ListContainerStatsSamples returns samples for the given owner since the
// given unix timestamp (inclusive), ordered by ts ascending.
func (s *Store) ListContainerStatsSamples(ownerType, ownerID string, sinceTS int64) ([]ContainerStatsSample, error) {
rows, err := s.db.Query(
`SELECT container_id, owner_type, owner_id, ts,
cpu_percent, memory_usage, memory_limit,
network_rx, network_tx, block_read, block_write
FROM container_stats_samples
WHERE owner_type = ? AND owner_id = ? AND ts >= ?
ORDER BY ts ASC`,
ownerType, ownerID, sinceTS,
)
if err != nil {
return nil, fmt.Errorf("list container stats samples: %w", err)
}
defer rows.Close()
var out []ContainerStatsSample
for rows.Next() {
var s ContainerStatsSample
if err := rows.Scan(
&s.ContainerID, &s.OwnerType, &s.OwnerID, &s.TS,
&s.CPUPercent, &s.MemoryUsage, &s.MemoryLimit,
&s.NetworkRxBytes, &s.NetworkTxBytes,
&s.BlockReadBytes, &s.BlockWriteBytes,
); err != nil {
return nil, fmt.Errorf("scan container stats sample: %w", err)
}
out = append(out, s)
}
return out, rows.Err()
}
// ListAllRecentContainerStatsSamples returns samples across every owner since
// the given unix timestamp, ordered by ts ascending. Used by the system
// dashboard "top containers" widget where the UI wants a mixed pool.
func (s *Store) ListAllRecentContainerStatsSamples(sinceTS int64) ([]ContainerStatsSample, error) {
rows, err := s.db.Query(
`SELECT container_id, owner_type, owner_id, ts,
cpu_percent, memory_usage, memory_limit,
network_rx, network_tx, block_read, block_write
FROM container_stats_samples
WHERE ts >= ?
ORDER BY ts ASC`,
sinceTS,
)
if err != nil {
return nil, fmt.Errorf("list all recent container stats samples: %w", err)
}
defer rows.Close()
var out []ContainerStatsSample
for rows.Next() {
var s ContainerStatsSample
if err := rows.Scan(
&s.ContainerID, &s.OwnerType, &s.OwnerID, &s.TS,
&s.CPUPercent, &s.MemoryUsage, &s.MemoryLimit,
&s.NetworkRxBytes, &s.NetworkTxBytes,
&s.BlockReadBytes, &s.BlockWriteBytes,
); err != nil {
return nil, fmt.Errorf("scan container stats sample: %w", err)
}
out = append(out, s)
}
return out, rows.Err()
}
// ListSystemStatsSamples returns host samples since the given unix timestamp.
func (s *Store) ListSystemStatsSamples(sinceTS int64) ([]SystemStatsSample, error) {
rows, err := s.db.Query(
`SELECT ts, ncpu, memory_total,
workload_cpu_percent, workload_mem_usage,
containers_running, disk_total_bytes
FROM system_stats_samples
WHERE ts >= ?
ORDER BY ts ASC`,
sinceTS,
)
if err != nil {
return nil, fmt.Errorf("list system stats samples: %w", err)
}
defer rows.Close()
var out []SystemStatsSample
for rows.Next() {
var s SystemStatsSample
if err := rows.Scan(
&s.TS, &s.NCPU, &s.MemoryTotal,
&s.WorkloadCPUPercent, &s.WorkloadMemUsage,
&s.ContainersRunning, &s.DiskTotalBytes,
); err != nil {
return nil, fmt.Errorf("scan system stats sample: %w", err)
}
out = append(out, s)
}
return out, rows.Err()
}
// PruneStatsSamplesBefore deletes all samples older than the given unix timestamp
// from both the container and system stats tables. Returns rows deleted across
// both tables.
func (s *Store) PruneStatsSamplesBefore(ts int64) (int64, error) {
r1, err := s.db.Exec(`DELETE FROM container_stats_samples WHERE ts < ?`, ts)
if err != nil {
return 0, fmt.Errorf("prune container stats samples: %w", err)
}
r2, err := s.db.Exec(`DELETE FROM system_stats_samples WHERE ts < ?`, ts)
if err != nil {
return 0, fmt.Errorf("prune system stats samples: %w", err)
}
n1, _ := r1.RowsAffected()
n2, _ := r2.RowsAffected()
return n1 + n2, nil
}
+40
View File
@@ -133,10 +133,46 @@ func (s *Store) runMigrations() error {
// avoid a destructive migration on SQLite.
`ALTER TABLE projects ADD COLUMN webhook_secret TEXT NOT NULL DEFAULT ''`,
`ALTER TABLE static_sites ADD COLUMN webhook_secret TEXT NOT NULL DEFAULT ''`,
// Resource metrics collection (2026-04-24). Interval in seconds,
// retention in hours. 0 in either disables collection.
`ALTER TABLE settings ADD COLUMN stats_interval_seconds INTEGER NOT NULL DEFAULT 15`,
`ALTER TABLE settings ADD COLUMN stats_retention_hours INTEGER NOT NULL DEFAULT 2`,
}
// Additive stack tables (2026-04-16). Created here rather than in the
// schema constant so older databases pick them up on restart.
statsTables := []string{
`CREATE TABLE IF NOT EXISTS container_stats_samples (
id INTEGER PRIMARY KEY AUTOINCREMENT,
container_id TEXT NOT NULL,
owner_type TEXT NOT NULL,
owner_id TEXT NOT NULL,
ts INTEGER NOT NULL,
cpu_percent REAL NOT NULL DEFAULT 0,
memory_usage INTEGER NOT NULL DEFAULT 0,
memory_limit INTEGER NOT NULL DEFAULT 0,
network_rx INTEGER NOT NULL DEFAULT 0,
network_tx INTEGER NOT NULL DEFAULT 0,
block_read INTEGER NOT NULL DEFAULT 0,
block_write INTEGER NOT NULL DEFAULT 0
)`,
`CREATE TABLE IF NOT EXISTS system_stats_samples (
id INTEGER PRIMARY KEY AUTOINCREMENT,
ts INTEGER NOT NULL,
ncpu INTEGER NOT NULL DEFAULT 0,
memory_total INTEGER NOT NULL DEFAULT 0,
workload_cpu_percent REAL NOT NULL DEFAULT 0,
workload_mem_usage INTEGER NOT NULL DEFAULT 0,
containers_running INTEGER NOT NULL DEFAULT 0,
disk_total_bytes INTEGER NOT NULL DEFAULT 0
)`,
}
for _, t := range statsTables {
if _, err := s.db.Exec(t); err != nil {
return fmt.Errorf("create stats table: %w", err)
}
}
stackTables := []string{
`CREATE TABLE IF NOT EXISTS stacks (
id TEXT PRIMARY KEY,
@@ -201,6 +237,10 @@ func (s *Store) runMigrations() error {
`CREATE INDEX IF NOT EXISTS idx_stack_deploys_stack_id ON stack_deploys(stack_id)`,
`CREATE UNIQUE INDEX IF NOT EXISTS idx_projects_webhook_secret ON projects(webhook_secret) WHERE webhook_secret != ''`,
`CREATE UNIQUE INDEX IF NOT EXISTS idx_static_sites_webhook_secret ON static_sites(webhook_secret) WHERE webhook_secret != ''`,
`CREATE INDEX IF NOT EXISTS idx_container_stats_owner_ts ON container_stats_samples(owner_type, owner_id, ts)`,
`CREATE INDEX IF NOT EXISTS idx_container_stats_container_ts ON container_stats_samples(container_id, ts)`,
`CREATE INDEX IF NOT EXISTS idx_container_stats_ts ON container_stats_samples(ts)`,
`CREATE INDEX IF NOT EXISTS idx_system_stats_ts ON system_stats_samples(ts)`,
}
for _, idx := range indexes {
if _, err := s.db.Exec(idx); err != nil {