diff --git a/cmd/server/main.go b/cmd/server/main.go index 5216003..eb91f71 100644 --- a/cmd/server/main.go +++ b/cmd/server/main.go @@ -33,6 +33,7 @@ import ( "github.com/alexei/tinyforge/internal/registry" "github.com/alexei/tinyforge/internal/stale" "github.com/alexei/tinyforge/internal/stack" + "github.com/alexei/tinyforge/internal/stats" "github.com/alexei/tinyforge/internal/staticsite" "github.com/alexei/tinyforge/internal/store" "github.com/alexei/tinyforge/internal/webhook" @@ -276,6 +277,12 @@ func main() { } scheduleAutobackup(settings.BackupEnabled, settings.BackupIntervalHours) + // Initialize resource stats collector. Interval + retention are read from + // settings on each tick, so configuration changes take effect within one + // tick without a restart. + statsCollector := stats.New(db, dockerClient) + statsCollector.Start() + // Initialize static site manager and health checker. staticSiteMgr := staticsite.NewManager(db, dockerClient, proxyProvider, eventBus, encKey) webhookHandler.SetSiteSyncTriggerer(staticSiteMgr) @@ -364,6 +371,7 @@ func main() { staticSiteHealth.Stop() staleScanner.Stop() poller.Stop() + statsCollector.Stop() // Drain in-progress deploys and notifications. dep.Drain() diff --git a/internal/api/docker.go b/internal/api/docker.go index ac06cae..5b6db93 100644 --- a/internal/api/docker.go +++ b/internal/api/docker.go @@ -68,6 +68,13 @@ func (s *Server) streamContainerLogs(w http.ResponseWriter, r *http.Request) { return } + s.streamLogsForContainer(w, r, inst.ContainerID) +} + +// streamLogsForContainer streams logs for an arbitrary container ID using the +// shared SSE/JSON dual-mode pattern. Owner-specific handlers (instance, site) +// should validate ownership and then delegate here. +func (s *Server) streamLogsForContainer(w http.ResponseWriter, r *http.Request, containerID string) { if s.docker == nil { respondError(w, http.StatusServiceUnavailable, "Docker is not available") return @@ -83,9 +90,9 @@ func (s *Server) streamContainerLogs(w http.ResponseWriter, r *http.Request) { accept := r.Header.Get("Accept") isSSE := strings.Contains(accept, "text/event-stream") - logReader, err := s.docker.ContainerLogs(r.Context(), inst.ContainerID, follow && isSSE, tail) + logReader, err := s.docker.ContainerLogs(r.Context(), containerID, follow && isSSE, tail) if err != nil { - slog.Error("failed to get container logs", "instance", instanceID, "error", err) + slog.Error("failed to get container logs", "container", containerID, "error", err) respondError(w, http.StatusInternalServerError, "failed to get container logs") return } diff --git a/internal/api/router.go b/internal/api/router.go index 0bd838d..216823e 100644 --- a/internal/api/router.go +++ b/internal/api/router.go @@ -219,6 +219,7 @@ func (s *Server) Router() chi.Router { r.Get("/stages/{stage}/env", s.listStageEnv) r.Get("/stages/{stage}/instances", s.listInstances) r.Get("/stages/{stage}/instances/{iid}/stats", s.getInstanceStats) + r.Get("/stages/{stage}/instances/{iid}/stats/history", s.getInstanceStatsHistory) r.Get("/stages/{stage}/instances/{iid}/logs", s.streamContainerLogs) r.Get("/images", s.listProjectImages) r.Get("/volumes", s.listVolumes) @@ -288,6 +289,9 @@ func (s *Server) Router() chi.Router { r.Get("/", s.getStaticSite) r.Get("/secrets", s.listStaticSiteSecrets) r.Get("/storage", s.getStaticSiteStorage) + r.Get("/logs", s.streamStaticSiteLogs) + r.Get("/stats", s.getStaticSiteStats) + r.Get("/stats/history", s.getStaticSiteStatsHistory) // Admin-only mutations. r.Group(func(r chi.Router) { @@ -333,6 +337,11 @@ func (s *Server) Router() chi.Router { // Stale container endpoints (read). r.Get("/containers/stale", s.listStaleContainers) + // System resources (read-only). + r.Get("/system/stats", s.getSystemStats) + r.Get("/system/stats/history", s.getSystemStatsHistory) + r.Get("/system/stats/top", s.listTopContainersByCPU) + // Admin-only routes: require admin role. r.Group(func(r chi.Router) { r.Use(auth.AdminOnly) diff --git a/internal/api/settings.go b/internal/api/settings.go index e940ca4..9db947c 100644 --- a/internal/api/settings.go +++ b/internal/api/settings.go @@ -46,6 +46,8 @@ type settingsRequest struct { BackupEnabled *bool `json:"backup_enabled,omitempty"` BackupIntervalHours *int `json:"backup_interval_hours,omitempty"` BackupRetentionCount *int `json:"backup_retention_count,omitempty"` + StatsIntervalSeconds *int `json:"stats_interval_seconds,omitempty"` + StatsRetentionHours *int `json:"stats_retention_hours,omitempty"` } // getSettings handles GET /api/settings. @@ -86,6 +88,8 @@ func (s *Server) getSettings(w http.ResponseWriter, r *http.Request) { "backup_enabled": settings.BackupEnabled, "backup_interval_hours": settings.BackupIntervalHours, "backup_retention_count": settings.BackupRetentionCount, + "stats_interval_seconds": settings.StatsIntervalSeconds, + "stats_retention_hours": settings.StatsRetentionHours, "updated_at": settings.UpdatedAt, }) } @@ -238,6 +242,22 @@ func (s *Server) updateSettings(w http.ResponseWriter, r *http.Request) { } updated.BackupRetentionCount = *req.BackupRetentionCount } + if req.StatsIntervalSeconds != nil { + v := *req.StatsIntervalSeconds + if v != 0 && (v < 5 || v > 300) { + respondError(w, http.StatusBadRequest, "stats_interval_seconds must be 0 (disabled) or between 5 and 300") + return + } + updated.StatsIntervalSeconds = v + } + if req.StatsRetentionHours != nil { + v := *req.StatsRetentionHours + if v < 0 || v > 24 { + respondError(w, http.StatusBadRequest, "stats_retention_hours must be between 0 and 24") + return + } + updated.StatsRetentionHours = v + } if err := s.store.UpdateSettings(updated); err != nil { respondError(w, http.StatusInternalServerError, "failed to update settings: "+err.Error()) diff --git a/internal/api/stats_history.go b/internal/api/stats_history.go new file mode 100644 index 0000000..46eb7ef --- /dev/null +++ b/internal/api/stats_history.go @@ -0,0 +1,245 @@ +package api + +import ( + "errors" + "log/slog" + "net/http" + "strconv" + "time" + + "github.com/go-chi/chi/v5" + + "github.com/alexei/tinyforge/internal/stats" + "github.com/alexei/tinyforge/internal/store" +) + +const ( + // defaultHistoryWindow is used when no ?window= param is provided or the + // value fails to parse. Matches the default retention so the "last 2h" + // view always has data when collection is enabled. + defaultHistoryWindow = 2 * time.Hour + maxHistoryWindow = 24 * time.Hour +) + +// parseWindow reads the ?window= query (Go duration string, e.g. "1h", "30m") +// and returns a bounded duration. +func parseWindow(r *http.Request) time.Duration { + raw := r.URL.Query().Get("window") + if raw == "" { + return defaultHistoryWindow + } + d, err := time.ParseDuration(raw) + if err != nil || d <= 0 { + return defaultHistoryWindow + } + if d > maxHistoryWindow { + return maxHistoryWindow + } + return d +} + +// sinceTimestamp converts a duration into a Unix-seconds cutoff. +func sinceTimestamp(window time.Duration) int64 { + return time.Now().UTC().Add(-window).Unix() +} + +// getSystemStats handles GET /api/system/stats — current host snapshot. +// When the Docker daemon is unreachable (e.g. Docker Desktop stopped) the +// handler returns 503 so the frontend can show a dedicated unavailable +// state instead of treating it as a generic 5xx failure. +func (s *Server) getSystemStats(w http.ResponseWriter, r *http.Request) { + if s.docker == nil { + respondError(w, http.StatusServiceUnavailable, "Docker is not available") + return + } + sys, err := s.docker.GetSystemStats(r.Context()) + if err != nil { + slog.Warn("system stats unavailable", "error", err) + respondError(w, http.StatusServiceUnavailable, "Docker is not available") + return + } + respondJSON(w, http.StatusOK, sys) +} + +// getSystemStatsHistory handles GET /api/system/stats/history?window=1h. +func (s *Server) getSystemStatsHistory(w http.ResponseWriter, r *http.Request) { + samples, err := s.store.ListSystemStatsSamples(sinceTimestamp(parseWindow(r))) + if err != nil { + slog.Error("failed to list system stats samples", "error", err) + respondError(w, http.StatusInternalServerError, "failed to list samples") + return + } + if samples == nil { + samples = []store.SystemStatsSample{} + } + respondJSON(w, http.StatusOK, samples) +} + +// getInstanceStatsHistory handles GET /api/projects/{id}/stages/{stage}/instances/{iid}/stats/history. +func (s *Server) getInstanceStatsHistory(w http.ResponseWriter, r *http.Request) { + instanceID := chi.URLParam(r, "iid") + if _, err := s.store.GetInstanceByID(instanceID); err != nil { + if errors.Is(err, store.ErrNotFound) { + respondNotFound(w, "instance") + return + } + slog.Error("failed to get instance", "instance_id", instanceID, "error", err) + respondError(w, http.StatusInternalServerError, "failed to get instance") + return + } + samples, err := s.store.ListContainerStatsSamples(stats.OwnerTypeInstance, instanceID, sinceTimestamp(parseWindow(r))) + if err != nil { + slog.Error("failed to list instance stats samples", "instance_id", instanceID, "error", err) + respondError(w, http.StatusInternalServerError, "failed to list samples") + return + } + if samples == nil { + samples = []store.ContainerStatsSample{} + } + respondJSON(w, http.StatusOK, samples) +} + +// getStaticSiteStats handles GET /api/sites/{id}/stats — current snapshot. +func (s *Server) getStaticSiteStats(w http.ResponseWriter, r *http.Request) { + id := chi.URLParam(r, "id") + site, err := s.store.GetStaticSiteByID(id) + if err != nil { + if errors.Is(err, store.ErrNotFound) { + respondNotFound(w, "site") + return + } + slog.Error("failed to get site", "site_id", id, "error", err) + respondError(w, http.StatusInternalServerError, "failed to get site") + return + } + if site.ContainerID == "" { + respondError(w, http.StatusConflict, "site has no container") + return + } + if s.docker == nil { + respondError(w, http.StatusServiceUnavailable, "Docker is not available") + return + } + cs, err := s.docker.GetContainerStats(r.Context(), site.ContainerID) + if err != nil { + slog.Error("failed to get site stats", "site_id", id, "error", err) + respondError(w, http.StatusInternalServerError, "failed to get site stats") + return + } + respondJSON(w, http.StatusOK, cs) +} + +// getStaticSiteStatsHistory handles GET /api/sites/{id}/stats/history. +func (s *Server) getStaticSiteStatsHistory(w http.ResponseWriter, r *http.Request) { + id := chi.URLParam(r, "id") + if _, err := s.store.GetStaticSiteByID(id); err != nil { + if errors.Is(err, store.ErrNotFound) { + respondNotFound(w, "site") + return + } + slog.Error("failed to get site", "site_id", id, "error", err) + respondError(w, http.StatusInternalServerError, "failed to get site") + return + } + samples, err := s.store.ListContainerStatsSamples(stats.OwnerTypeSite, id, sinceTimestamp(parseWindow(r))) + if err != nil { + slog.Error("failed to list site stats samples", "site_id", id, "error", err) + respondError(w, http.StatusInternalServerError, "failed to list samples") + return + } + if samples == nil { + samples = []store.ContainerStatsSample{} + } + respondJSON(w, http.StatusOK, samples) +} + +// streamStaticSiteLogs handles GET /api/sites/{id}/logs?tail=200&follow=true. +// Reuses the shared container log streamer so the SSE + multiplex handling +// matches /api/projects/.../instances/.../logs exactly. +func (s *Server) streamStaticSiteLogs(w http.ResponseWriter, r *http.Request) { + id := chi.URLParam(r, "id") + site, err := s.store.GetStaticSiteByID(id) + if err != nil { + if errors.Is(err, store.ErrNotFound) { + respondNotFound(w, "site") + return + } + slog.Error("failed to get site", "site_id", id, "error", err) + respondError(w, http.StatusInternalServerError, "failed to get site") + return + } + if site.ContainerID == "" { + respondError(w, http.StatusConflict, "site has no container") + return + } + s.streamLogsForContainer(w, r, site.ContainerID) +} + +// listTopContainersByCPU handles GET /api/system/stats/top?limit=5&by=cpu. +// Returns the top-N most recent samples across containers, sorted by CPU or +// memory. Useful for a system dashboard "top consumers" widget without +// requiring the frontend to aggregate per-container history on its own. +func (s *Server) listTopContainersByCPU(w http.ResponseWriter, r *http.Request) { + limit := 5 + if raw := r.URL.Query().Get("limit"); raw != "" { + if n, err := strconv.Atoi(raw); err == nil && n > 0 && n <= 50 { + limit = n + } + } + by := r.URL.Query().Get("by") + if by != "memory" { + by = "cpu" + } + + // Samples from the last 2 minutes window so "top" reflects near-current + // load, not long-dead rows. + samples, err := s.store.ListAllRecentContainerStatsSamples(sinceTimestamp(2 * time.Minute)) + if err != nil { + slog.Error("failed to list container samples for top", "error", err) + respondError(w, http.StatusInternalServerError, "failed to list samples") + return + } + + // Keep only the latest sample per container. + latest := make(map[string]store.ContainerStatsSample, len(samples)) + for _, sm := range samples { + if prev, ok := latest[sm.ContainerID]; !ok || sm.TS > prev.TS { + latest[sm.ContainerID] = sm + } + } + + top := make([]store.ContainerStatsSample, 0, len(latest)) + for _, sm := range latest { + top = append(top, sm) + } + + // Partial-sort by the requested metric, descending. For small N a simple + // insertion-like approach is plenty. + sortContainerSamples(top, by) + if len(top) > limit { + top = top[:limit] + } + respondJSON(w, http.StatusOK, top) +} + +// sortContainerSamples sorts in place by CPU (or memory) descending. +// Note: ListContainerStatsSamples with empty ownerID returns no rows — the +// caller uses per-owner-type queries and merges; this helper is applied to +// the already-merged slice. +func sortContainerSamples(s []store.ContainerStatsSample, by string) { + // O(n^2) is fine — N is small (bounded by the number of containers). + for i := 1; i < len(s); i++ { + for j := i; j > 0; j-- { + var less bool + if by == "memory" { + less = s[j].MemoryUsage > s[j-1].MemoryUsage + } else { + less = s[j].CPUPercent > s[j-1].CPUPercent + } + if !less { + break + } + s[j-1], s[j] = s[j], s[j-1] + } + } +} diff --git a/internal/docker/stats.go b/internal/docker/stats.go index e1c919f..c66d792 100644 --- a/internal/docker/stats.go +++ b/internal/docker/stats.go @@ -4,21 +4,30 @@ import ( "context" "encoding/json" "fmt" + "strings" + "time" "github.com/moby/moby/api/types/container" "github.com/moby/moby/client" ) -// ContainerStats holds computed CPU and memory usage for a container. +// ContainerStats holds computed CPU, memory, network, and block I/O +// usage for a container. Network and block I/O values are cumulative +// byte counters — compute rates by differencing two samples. type ContainerStats struct { - CPUPercent float64 `json:"cpu_percent"` - MemoryUsage int64 `json:"memory_usage"` - MemoryLimit int64 `json:"memory_limit"` - MemoryPercent float64 `json:"memory_percent"` + Timestamp time.Time `json:"timestamp"` + CPUPercent float64 `json:"cpu_percent"` + MemoryUsage int64 `json:"memory_usage"` + MemoryLimit int64 `json:"memory_limit"` + MemoryPercent float64 `json:"memory_percent"` + NetworkRxBytes int64 `json:"network_rx_bytes"` + NetworkTxBytes int64 `json:"network_tx_bytes"` + BlockReadBytes int64 `json:"block_read_bytes"` + BlockWriteBytes int64 `json:"block_write_bytes"` } // GetContainerStats retrieves a one-shot stats snapshot for the given container -// and computes CPU and memory percentages. +// and computes CPU, memory, network, and block I/O metrics. func (c *Client) GetContainerStats(ctx context.Context, containerID string) (ContainerStats, error) { result, err := c.api.ContainerStats(ctx, containerID, client.ContainerStatsOptions{ Stream: false, @@ -42,14 +51,53 @@ func (c *Client) GetContainerStats(ctx context.Context, containerID string) (Con memPercent = float64(memUsage) / float64(memLimit) * 100.0 } + rxBytes, txBytes := sumNetworkBytes(stats.Networks) + readBytes, writeBytes := sumBlockIOBytes(stats.BlkioStats.IoServiceBytesRecursive) + + ts := stats.Read + if ts.IsZero() { + ts = time.Now().UTC() + } + return ContainerStats{ - CPUPercent: cpuPercent, - MemoryUsage: memUsage, - MemoryLimit: memLimit, - MemoryPercent: memPercent, + Timestamp: ts, + CPUPercent: cpuPercent, + MemoryUsage: memUsage, + MemoryLimit: memLimit, + MemoryPercent: memPercent, + NetworkRxBytes: rxBytes, + NetworkTxBytes: txBytes, + BlockReadBytes: readBytes, + BlockWriteBytes: writeBytes, }, nil } +// sumNetworkBytes aggregates rx/tx byte counters across all network interfaces +// for a single container. Missing Networks map (disabled networking) yields zeros. +func sumNetworkBytes(nets map[string]container.NetworkStats) (rx, tx int64) { + for _, n := range nets { + rx += int64(n.RxBytes) + tx += int64(n.TxBytes) + } + return rx, tx +} + +// sumBlockIOBytes totals read/write bytes across all block devices from the +// cgroup io_service_bytes_recursive entries. The "Op" field is "read"/"write" +// on cgroup v2 and "Read"/"Write" on cgroup v1 — match case-insensitively so +// either runtime works. +func sumBlockIOBytes(entries []container.BlkioStatEntry) (read, write int64) { + for _, e := range entries { + switch { + case strings.EqualFold(e.Op, "read"): + read += int64(e.Value) + case strings.EqualFold(e.Op, "write"): + write += int64(e.Value) + } + } + return read, write +} + // calculateCPUPercent computes CPU usage percentage from a stats response // using the delta between current and previous CPU readings. func calculateCPUPercent(stats container.StatsResponse) float64 { diff --git a/internal/docker/system.go b/internal/docker/system.go new file mode 100644 index 0000000..2fd92d0 --- /dev/null +++ b/internal/docker/system.go @@ -0,0 +1,87 @@ +package docker + +import ( + "context" + "fmt" + "time" + + "github.com/moby/moby/client" +) + +// SystemStats is a host-level snapshot combining daemon capacity +// (NCPU, memory total) with container counts and disk usage broken down +// by category. Workload CPU/memory utilization is aggregated from +// per-container samples by the stats collector, not here. +type SystemStats struct { + Timestamp time.Time `json:"timestamp"` + + // Capacity from Docker daemon. + NCPU int `json:"ncpu"` + MemoryTotal int64 `json:"memory_total"` + + // Container/image counts. + Containers int `json:"containers"` + Running int `json:"running"` + Paused int `json:"paused"` + Stopped int `json:"stopped"` + Images int `json:"images"` + + // Disk usage by category (bytes). + DiskImagesBytes int64 `json:"disk_images_bytes"` + DiskContainersBytes int64 `json:"disk_containers_bytes"` + DiskVolumesBytes int64 `json:"disk_volumes_bytes"` + DiskBuildCacheBytes int64 `json:"disk_build_cache_bytes"` + + // Reclaimable disk space by category (bytes). + DiskImagesReclaimable int64 `json:"disk_images_reclaimable"` + DiskContainersReclaimable int64 `json:"disk_containers_reclaimable"` + DiskVolumesReclaimable int64 `json:"disk_volumes_reclaimable"` + DiskBuildCacheReclaimable int64 `json:"disk_build_cache_reclaimable"` + + // DiskTotalBytes is the sum of the category totals. + DiskTotalBytes int64 `json:"disk_total_bytes"` +} + +// GetSystemStats returns a one-shot host-level snapshot. The Info() call +// and disk usage call are made in sequence. Disk usage failures do not +// fail the whole call — the result degrades gracefully with zero disk fields. +func (c *Client) GetSystemStats(ctx context.Context) (SystemStats, error) { + info, err := c.Info(ctx) + if err != nil { + return SystemStats{}, fmt.Errorf("system stats: %w", err) + } + + stats := SystemStats{ + Timestamp: time.Now().UTC(), + NCPU: info.NCPU, + MemoryTotal: info.MemoryTotal, + Containers: info.Containers, + Running: info.Running, + Paused: info.Paused, + Stopped: info.Stopped, + Images: info.Images, + } + + du, derr := c.api.DiskUsage(ctx, client.DiskUsageOptions{ + Containers: true, + Images: true, + Volumes: true, + BuildCache: true, + }) + if derr == nil { + stats.DiskImagesBytes = du.Images.TotalSize + stats.DiskContainersBytes = du.Containers.TotalSize + stats.DiskVolumesBytes = du.Volumes.TotalSize + stats.DiskBuildCacheBytes = du.BuildCache.TotalSize + stats.DiskImagesReclaimable = du.Images.Reclaimable + stats.DiskContainersReclaimable = du.Containers.Reclaimable + stats.DiskVolumesReclaimable = du.Volumes.Reclaimable + stats.DiskBuildCacheReclaimable = du.BuildCache.Reclaimable + stats.DiskTotalBytes = stats.DiskImagesBytes + + stats.DiskContainersBytes + + stats.DiskVolumesBytes + + stats.DiskBuildCacheBytes + } + + return stats, nil +} diff --git a/internal/stats/collector.go b/internal/stats/collector.go new file mode 100644 index 0000000..ad82fd7 --- /dev/null +++ b/internal/stats/collector.go @@ -0,0 +1,309 @@ +// Package stats implements a background goroutine that periodically samples +// Docker container and host-level resource usage and persists the samples +// into SQLite. It reads its interval and retention from settings on every +// tick so configuration changes take effect without a restart. +package stats + +import ( + "context" + "log/slog" + "sync" + "time" + + "github.com/alexei/tinyforge/internal/docker" + "github.com/alexei/tinyforge/internal/store" +) + +// Defaults applied when settings values are outside their valid range. +const ( + DefaultIntervalSeconds = 15 + DefaultRetentionHours = 2 + MinIntervalSeconds = 5 + MaxIntervalSeconds = 300 + // Hard cap on parallel container stat requests to avoid overwhelming + // the Docker daemon when the user has many containers. + maxParallelSamples = 8 +) + +// OwnerType values for ContainerStatsSample.OwnerType. +const ( + OwnerTypeInstance = "instance" + OwnerTypeSite = "site" +) + +// Collector runs the background sampling loop. +type Collector struct { + store *store.Store + docker *docker.Client + + stopOnce sync.Once + stop chan struct{} + done chan struct{} +} + +// New creates a new stats collector. Call Start to begin sampling. +func New(s *store.Store, d *docker.Client) *Collector { + return &Collector{ + store: s, + docker: d, + stop: make(chan struct{}), + done: make(chan struct{}), + } +} + +// Start launches the background loop. Returns immediately. The loop exits +// when Stop is called. +func (c *Collector) Start() { + go c.run() +} + +// Stop signals the collector to exit and blocks until it has finished the +// in-flight tick. +func (c *Collector) Stop() { + c.stopOnce.Do(func() { close(c.stop) }) + <-c.done +} + +// run is the main loop. It reads the interval from settings on every tick, +// which lets configuration changes propagate within one tick without a +// dedicated reload mechanism. +func (c *Collector) run() { + defer close(c.done) + + // Wait a few seconds before the first sample so the app has settled. + select { + case <-time.After(3 * time.Second): + case <-c.stop: + return + } + + for { + interval, retention := c.readConfig() + if interval == 0 || retention == 0 { + // Collection disabled. Poll settings every minute in case the + // user re-enables it. + select { + case <-time.After(time.Minute): + continue + case <-c.stop: + return + } + } + + c.tick(retention) + + select { + case <-time.After(time.Duration(interval) * time.Second): + case <-c.stop: + return + } + } +} + +// readConfig reads the current interval + retention from settings, applying +// defaults and clamping to the valid range. +func (c *Collector) readConfig() (intervalSeconds, retentionHours int) { + settings, err := c.store.GetSettings() + if err != nil { + slog.Warn("stats collector: failed to read settings — using defaults", "error", err) + return DefaultIntervalSeconds, DefaultRetentionHours + } + intervalSeconds = settings.StatsIntervalSeconds + retentionHours = settings.StatsRetentionHours + if intervalSeconds < 0 || retentionHours < 0 { + return 0, 0 + } + if intervalSeconds > 0 && intervalSeconds < MinIntervalSeconds { + intervalSeconds = MinIntervalSeconds + } + if intervalSeconds > MaxIntervalSeconds { + intervalSeconds = MaxIntervalSeconds + } + return intervalSeconds, retentionHours +} + +// tick samples all known containers, aggregates workload-level totals, +// persists samples, and prunes rows beyond the retention window. When +// the Docker daemon is unreachable the whole tick is skipped with a +// single debug log instead of one warning per container. +func (c *Collector) tick(retentionHours int) { + ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) + defer cancel() + + pingCtx, pingCancel := context.WithTimeout(ctx, 2*time.Second) + defer pingCancel() + if err := c.docker.Ping(pingCtx); err != nil { + slog.Debug("stats collector: docker unreachable, skipping tick", "error", err) + return + } + + targets := c.buildTargets() + if len(targets) == 0 { + // No containers to sample, but still record a system sample so the + // host history isn't empty. + c.recordSystemSample(ctx, 0, 0, 0) + c.prune(retentionHours) + return + } + + samples := c.sampleAll(ctx, targets) + + var ( + totalCPU float64 + totalMem int64 + running int + ) + for _, s := range samples { + if err := c.store.InsertContainerStatsSample(s); err != nil { + slog.Warn("stats collector: insert container sample", + "container", s.ContainerID, "error", err) + continue + } + totalCPU += s.CPUPercent + totalMem += s.MemoryUsage + running++ + } + + c.recordSystemSample(ctx, totalCPU, totalMem, running) + c.prune(retentionHours) +} + +// target describes a single container to sample. +type target struct { + ContainerID string + OwnerType string + OwnerID string +} + +// buildTargets fetches running instances and sites that have a container ID. +func (c *Collector) buildTargets() []target { + var out []target + + instances, err := c.store.ListAllInstances() + if err != nil { + slog.Warn("stats collector: list instances", "error", err) + } else { + for _, inst := range instances { + if inst.ContainerID == "" { + continue + } + out = append(out, target{ + ContainerID: inst.ContainerID, + OwnerType: OwnerTypeInstance, + OwnerID: inst.ID, + }) + } + } + + sites, err := c.store.GetAllStaticSites() + if err != nil { + slog.Warn("stats collector: list sites", "error", err) + } else { + for _, site := range sites { + if site.ContainerID == "" { + continue + } + out = append(out, target{ + ContainerID: site.ContainerID, + OwnerType: OwnerTypeSite, + OwnerID: site.ID, + }) + } + } + + return out +} + +// sampleAll fetches Docker stats for every target in bounded parallelism. +// Failed samples are logged and skipped — a missing container must not kill +// the whole tick. +func (c *Collector) sampleAll(ctx context.Context, targets []target) []store.ContainerStatsSample { + sem := make(chan struct{}, maxParallelSamples) + results := make([]store.ContainerStatsSample, len(targets)) + found := make([]bool, len(targets)) + + var wg sync.WaitGroup + for i, t := range targets { + wg.Add(1) + go func(i int, t target) { + defer wg.Done() + sem <- struct{}{} + defer func() { <-sem }() + + sampleCtx, cancel := context.WithTimeout(ctx, 10*time.Second) + defer cancel() + + stats, err := c.docker.GetContainerStats(sampleCtx, t.ContainerID) + if err != nil { + slog.Debug("stats collector: get container stats", + "container", t.ContainerID, "owner_type", t.OwnerType, "error", err) + return + } + ts := stats.Timestamp.Unix() + if ts <= 0 { + ts = time.Now().UTC().Unix() + } + results[i] = store.ContainerStatsSample{ + ContainerID: t.ContainerID, + OwnerType: t.OwnerType, + OwnerID: t.OwnerID, + TS: ts, + CPUPercent: stats.CPUPercent, + MemoryUsage: stats.MemoryUsage, + MemoryLimit: stats.MemoryLimit, + NetworkRxBytes: stats.NetworkRxBytes, + NetworkTxBytes: stats.NetworkTxBytes, + BlockReadBytes: stats.BlockReadBytes, + BlockWriteBytes: stats.BlockWriteBytes, + } + found[i] = true + }(i, t) + } + wg.Wait() + + out := results[:0] + for i := range results { + if found[i] { + out = append(out, results[i]) + } + } + return out +} + +// recordSystemSample fetches host info + disk usage and persists a combined +// system-level sample. Failures are warned but do not propagate. +func (c *Collector) recordSystemSample(ctx context.Context, workloadCPU float64, workloadMem int64, running int) { + sysStats, err := c.docker.GetSystemStats(ctx) + if err != nil { + slog.Warn("stats collector: get system stats", "error", err) + return + } + sample := store.SystemStatsSample{ + TS: sysStats.Timestamp.Unix(), + NCPU: sysStats.NCPU, + MemoryTotal: sysStats.MemoryTotal, + WorkloadCPUPercent: workloadCPU, + WorkloadMemUsage: workloadMem, + ContainersRunning: running, + DiskTotalBytes: sysStats.DiskTotalBytes, + } + // Prefer the Docker-reported running count when we have no running samples + // (e.g., very first tick may race with container readiness). + if running == 0 && sysStats.Running > 0 { + sample.ContainersRunning = sysStats.Running + } + if err := c.store.InsertSystemStatsSample(sample); err != nil { + slog.Warn("stats collector: insert system sample", "error", err) + } +} + +// prune drops rows older than the retention window. +func (c *Collector) prune(retentionHours int) { + if retentionHours <= 0 { + return + } + cutoff := time.Now().UTC().Add(-time.Duration(retentionHours) * time.Hour).Unix() + if _, err := c.store.PruneStatsSamplesBefore(cutoff); err != nil { + slog.Warn("stats collector: prune", "error", err) + } +} diff --git a/internal/store/models.go b/internal/store/models.go index 5ccf0f7..808fec5 100644 --- a/internal/store/models.go +++ b/internal/store/models.go @@ -78,9 +78,40 @@ type Settings struct { BackupEnabled bool `json:"backup_enabled"` BackupIntervalHours int `json:"backup_interval_hours"` BackupRetentionCount int `json:"backup_retention_count"` + StatsIntervalSeconds int `json:"stats_interval_seconds"` // 0 disables collection + StatsRetentionHours int `json:"stats_retention_hours"` // 0 disables collection UpdatedAt string `json:"updated_at"` } +// ContainerStatsSample is one persisted sample of container resource usage. +// Cumulative counters (network, block I/O) require differencing two samples +// to get rates; CPU is already a percent-since-previous-sample value. +type ContainerStatsSample struct { + ContainerID string `json:"container_id"` + OwnerType string `json:"owner_type"` // "instance" or "site" + OwnerID string `json:"owner_id"` + TS int64 `json:"ts"` // Unix seconds UTC + CPUPercent float64 `json:"cpu_percent"` + MemoryUsage int64 `json:"memory_usage"` + MemoryLimit int64 `json:"memory_limit"` + NetworkRxBytes int64 `json:"network_rx_bytes"` + NetworkTxBytes int64 `json:"network_tx_bytes"` + BlockReadBytes int64 `json:"block_read_bytes"` + BlockWriteBytes int64 `json:"block_write_bytes"` +} + +// SystemStatsSample is one persisted host-level snapshot that aggregates +// workload usage across all containers plus daemon capacity + disk totals. +type SystemStatsSample struct { + TS int64 `json:"ts"` // Unix seconds UTC + NCPU int `json:"ncpu"` + MemoryTotal int64 `json:"memory_total"` + WorkloadCPUPercent float64 `json:"workload_cpu_percent"` + WorkloadMemUsage int64 `json:"workload_mem_usage"` + ContainersRunning int `json:"containers_running"` + DiskTotalBytes int64 `json:"disk_total_bytes"` +} + // Backup represents a backup metadata record. type Backup struct { ID string `json:"id"` diff --git a/internal/store/settings.go b/internal/store/settings.go index 25caa24..7e2e9fa 100644 --- a/internal/store/settings.go +++ b/internal/store/settings.go @@ -18,6 +18,7 @@ func (s *Store) GetSettings() (Settings, error) { traefik_entrypoint, traefik_cert_resolver, traefik_network, traefik_api_url, image_prune_threshold_mb, backup_enabled, backup_interval_hours, backup_retention_count, + stats_interval_seconds, stats_retention_hours, updated_at FROM settings WHERE id = 1`, ).Scan(&st.Domain, &st.ServerIP, &st.PublicIP, &st.Network, &st.SubdomainPattern, &st.NotificationURL, @@ -29,6 +30,7 @@ func (s *Store) GetSettings() (Settings, error) { &st.TraefikEntrypoint, &st.TraefikCertResolver, &st.TraefikNetwork, &st.TraefikAPIURL, &st.ImagePruneThresholdMB, &backupEnabled, &st.BackupIntervalHours, &st.BackupRetentionCount, + &st.StatsIntervalSeconds, &st.StatsRetentionHours, &st.UpdatedAt) if err != nil { return Settings{}, fmt.Errorf("query settings: %w", err) @@ -65,6 +67,7 @@ func (s *Store) UpdateSettings(st Settings) error { traefik_entrypoint=?, traefik_cert_resolver=?, traefik_network=?, traefik_api_url=?, image_prune_threshold_mb=?, backup_enabled=?, backup_interval_hours=?, backup_retention_count=?, + stats_interval_seconds=?, stats_retention_hours=?, updated_at=? WHERE id = 1`, st.Domain, st.ServerIP, st.PublicIP, st.Network, st.SubdomainPattern, st.NotificationURL, @@ -76,6 +79,7 @@ func (s *Store) UpdateSettings(st Settings) error { st.TraefikEntrypoint, st.TraefikCertResolver, st.TraefikNetwork, st.TraefikAPIURL, st.ImagePruneThresholdMB, backupEnabled, st.BackupIntervalHours, st.BackupRetentionCount, + st.StatsIntervalSeconds, st.StatsRetentionHours, st.UpdatedAt, ) if err != nil { diff --git a/internal/store/stats_samples.go b/internal/store/stats_samples.go new file mode 100644 index 0000000..45e1dab --- /dev/null +++ b/internal/store/stats_samples.go @@ -0,0 +1,157 @@ +package store + +import ( + "fmt" +) + +// InsertContainerStatsSample appends a single container sample row. +func (s *Store) InsertContainerStatsSample(sample ContainerStatsSample) error { + _, err := s.db.Exec( + `INSERT INTO container_stats_samples ( + container_id, owner_type, owner_id, ts, + cpu_percent, memory_usage, memory_limit, + network_rx, network_tx, block_read, block_write + ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, + sample.ContainerID, sample.OwnerType, sample.OwnerID, sample.TS, + sample.CPUPercent, sample.MemoryUsage, sample.MemoryLimit, + sample.NetworkRxBytes, sample.NetworkTxBytes, + sample.BlockReadBytes, sample.BlockWriteBytes, + ) + if err != nil { + return fmt.Errorf("insert container stats sample: %w", err) + } + return nil +} + +// InsertSystemStatsSample appends a single host-level sample row. +func (s *Store) InsertSystemStatsSample(sample SystemStatsSample) error { + _, err := s.db.Exec( + `INSERT INTO system_stats_samples ( + ts, ncpu, memory_total, + workload_cpu_percent, workload_mem_usage, + containers_running, disk_total_bytes + ) VALUES (?, ?, ?, ?, ?, ?, ?)`, + sample.TS, sample.NCPU, sample.MemoryTotal, + sample.WorkloadCPUPercent, sample.WorkloadMemUsage, + sample.ContainersRunning, sample.DiskTotalBytes, + ) + if err != nil { + return fmt.Errorf("insert system stats sample: %w", err) + } + return nil +} + +// ListContainerStatsSamples returns samples for the given owner since the +// given unix timestamp (inclusive), ordered by ts ascending. +func (s *Store) ListContainerStatsSamples(ownerType, ownerID string, sinceTS int64) ([]ContainerStatsSample, error) { + rows, err := s.db.Query( + `SELECT container_id, owner_type, owner_id, ts, + cpu_percent, memory_usage, memory_limit, + network_rx, network_tx, block_read, block_write + FROM container_stats_samples + WHERE owner_type = ? AND owner_id = ? AND ts >= ? + ORDER BY ts ASC`, + ownerType, ownerID, sinceTS, + ) + if err != nil { + return nil, fmt.Errorf("list container stats samples: %w", err) + } + defer rows.Close() + + var out []ContainerStatsSample + for rows.Next() { + var s ContainerStatsSample + if err := rows.Scan( + &s.ContainerID, &s.OwnerType, &s.OwnerID, &s.TS, + &s.CPUPercent, &s.MemoryUsage, &s.MemoryLimit, + &s.NetworkRxBytes, &s.NetworkTxBytes, + &s.BlockReadBytes, &s.BlockWriteBytes, + ); err != nil { + return nil, fmt.Errorf("scan container stats sample: %w", err) + } + out = append(out, s) + } + return out, rows.Err() +} + +// ListAllRecentContainerStatsSamples returns samples across every owner since +// the given unix timestamp, ordered by ts ascending. Used by the system +// dashboard "top containers" widget where the UI wants a mixed pool. +func (s *Store) ListAllRecentContainerStatsSamples(sinceTS int64) ([]ContainerStatsSample, error) { + rows, err := s.db.Query( + `SELECT container_id, owner_type, owner_id, ts, + cpu_percent, memory_usage, memory_limit, + network_rx, network_tx, block_read, block_write + FROM container_stats_samples + WHERE ts >= ? + ORDER BY ts ASC`, + sinceTS, + ) + if err != nil { + return nil, fmt.Errorf("list all recent container stats samples: %w", err) + } + defer rows.Close() + + var out []ContainerStatsSample + for rows.Next() { + var s ContainerStatsSample + if err := rows.Scan( + &s.ContainerID, &s.OwnerType, &s.OwnerID, &s.TS, + &s.CPUPercent, &s.MemoryUsage, &s.MemoryLimit, + &s.NetworkRxBytes, &s.NetworkTxBytes, + &s.BlockReadBytes, &s.BlockWriteBytes, + ); err != nil { + return nil, fmt.Errorf("scan container stats sample: %w", err) + } + out = append(out, s) + } + return out, rows.Err() +} + +// ListSystemStatsSamples returns host samples since the given unix timestamp. +func (s *Store) ListSystemStatsSamples(sinceTS int64) ([]SystemStatsSample, error) { + rows, err := s.db.Query( + `SELECT ts, ncpu, memory_total, + workload_cpu_percent, workload_mem_usage, + containers_running, disk_total_bytes + FROM system_stats_samples + WHERE ts >= ? + ORDER BY ts ASC`, + sinceTS, + ) + if err != nil { + return nil, fmt.Errorf("list system stats samples: %w", err) + } + defer rows.Close() + + var out []SystemStatsSample + for rows.Next() { + var s SystemStatsSample + if err := rows.Scan( + &s.TS, &s.NCPU, &s.MemoryTotal, + &s.WorkloadCPUPercent, &s.WorkloadMemUsage, + &s.ContainersRunning, &s.DiskTotalBytes, + ); err != nil { + return nil, fmt.Errorf("scan system stats sample: %w", err) + } + out = append(out, s) + } + return out, rows.Err() +} + +// PruneStatsSamplesBefore deletes all samples older than the given unix timestamp +// from both the container and system stats tables. Returns rows deleted across +// both tables. +func (s *Store) PruneStatsSamplesBefore(ts int64) (int64, error) { + r1, err := s.db.Exec(`DELETE FROM container_stats_samples WHERE ts < ?`, ts) + if err != nil { + return 0, fmt.Errorf("prune container stats samples: %w", err) + } + r2, err := s.db.Exec(`DELETE FROM system_stats_samples WHERE ts < ?`, ts) + if err != nil { + return 0, fmt.Errorf("prune system stats samples: %w", err) + } + n1, _ := r1.RowsAffected() + n2, _ := r2.RowsAffected() + return n1 + n2, nil +} diff --git a/internal/store/store.go b/internal/store/store.go index 96bc456..a58566d 100644 --- a/internal/store/store.go +++ b/internal/store/store.go @@ -133,10 +133,46 @@ func (s *Store) runMigrations() error { // avoid a destructive migration on SQLite. `ALTER TABLE projects ADD COLUMN webhook_secret TEXT NOT NULL DEFAULT ''`, `ALTER TABLE static_sites ADD COLUMN webhook_secret TEXT NOT NULL DEFAULT ''`, + // Resource metrics collection (2026-04-24). Interval in seconds, + // retention in hours. 0 in either disables collection. + `ALTER TABLE settings ADD COLUMN stats_interval_seconds INTEGER NOT NULL DEFAULT 15`, + `ALTER TABLE settings ADD COLUMN stats_retention_hours INTEGER NOT NULL DEFAULT 2`, } // Additive stack tables (2026-04-16). Created here rather than in the // schema constant so older databases pick them up on restart. + statsTables := []string{ + `CREATE TABLE IF NOT EXISTS container_stats_samples ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + container_id TEXT NOT NULL, + owner_type TEXT NOT NULL, + owner_id TEXT NOT NULL, + ts INTEGER NOT NULL, + cpu_percent REAL NOT NULL DEFAULT 0, + memory_usage INTEGER NOT NULL DEFAULT 0, + memory_limit INTEGER NOT NULL DEFAULT 0, + network_rx INTEGER NOT NULL DEFAULT 0, + network_tx INTEGER NOT NULL DEFAULT 0, + block_read INTEGER NOT NULL DEFAULT 0, + block_write INTEGER NOT NULL DEFAULT 0 + )`, + `CREATE TABLE IF NOT EXISTS system_stats_samples ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + ts INTEGER NOT NULL, + ncpu INTEGER NOT NULL DEFAULT 0, + memory_total INTEGER NOT NULL DEFAULT 0, + workload_cpu_percent REAL NOT NULL DEFAULT 0, + workload_mem_usage INTEGER NOT NULL DEFAULT 0, + containers_running INTEGER NOT NULL DEFAULT 0, + disk_total_bytes INTEGER NOT NULL DEFAULT 0 + )`, + } + for _, t := range statsTables { + if _, err := s.db.Exec(t); err != nil { + return fmt.Errorf("create stats table: %w", err) + } + } + stackTables := []string{ `CREATE TABLE IF NOT EXISTS stacks ( id TEXT PRIMARY KEY, @@ -201,6 +237,10 @@ func (s *Store) runMigrations() error { `CREATE INDEX IF NOT EXISTS idx_stack_deploys_stack_id ON stack_deploys(stack_id)`, `CREATE UNIQUE INDEX IF NOT EXISTS idx_projects_webhook_secret ON projects(webhook_secret) WHERE webhook_secret != ''`, `CREATE UNIQUE INDEX IF NOT EXISTS idx_static_sites_webhook_secret ON static_sites(webhook_secret) WHERE webhook_secret != ''`, + `CREATE INDEX IF NOT EXISTS idx_container_stats_owner_ts ON container_stats_samples(owner_type, owner_id, ts)`, + `CREATE INDEX IF NOT EXISTS idx_container_stats_container_ts ON container_stats_samples(container_id, ts)`, + `CREATE INDEX IF NOT EXISTS idx_container_stats_ts ON container_stats_samples(ts)`, + `CREATE INDEX IF NOT EXISTS idx_system_stats_ts ON system_stats_samples(ts)`, } for _, idx := range indexes { if _, err := s.db.Exec(idx); err != nil { diff --git a/web/package-lock.json b/web/package-lock.json index f039c4c..df0b663 100644 --- a/web/package-lock.json +++ b/web/package-lock.json @@ -10,7 +10,8 @@ "dependencies": { "@fontsource/instrument-serif": "^5.2.8", "@fontsource/inter": "^5.2.8", - "@fontsource/jetbrains-mono": "^5.2.8" + "@fontsource/jetbrains-mono": "^5.2.8", + "echarts": "^6.0.0" }, "devDependencies": { "@sveltejs/adapter-static": "^3.0.8", @@ -1335,6 +1336,15 @@ "integrity": "sha512-Gp6rDldRsFh/7XuouDbxMH3Mx8GMCcgzIb1pDTvNyn8pZGQ22u+Wa+lGV9dQCltFQ7uVw0MhRyb8XDskNFOReA==", "dev": true }, + "node_modules/echarts": { + "version": "6.0.0", + "resolved": "https://registry.npmjs.org/echarts/-/echarts-6.0.0.tgz", + "integrity": "sha512-Tte/grDQRiETQP4xz3iZWSvoHrkCQtwqd6hs+mifXcjrCuo2iKWbajFObuLJVBlDIJlOzgQPd1hsaKt/3+OMkQ==", + "dependencies": { + "tslib": "2.3.0", + "zrender": "6.0.0" + } + }, "node_modules/enhanced-resolve": { "version": "5.20.1", "resolved": "https://registry.npmjs.org/enhanced-resolve/-/enhanced-resolve-5.20.1.tgz", @@ -2013,6 +2023,11 @@ "node": ">=6" } }, + "node_modules/tslib": { + "version": "2.3.0", + "resolved": "https://registry.npmjs.org/tslib/-/tslib-2.3.0.tgz", + "integrity": "sha512-N82ooyxVNm6h1riLCoyS9e3fuJ3AMG2zIZs2Gd1ATcSFjSA23Q0fzjjZeh0jbJvWVDZ0cJT8yaNNaaXHzueNjg==" + }, "node_modules/typescript": { "version": "5.9.3", "resolved": "https://registry.npmjs.org/typescript/-/typescript-5.9.3.tgz", @@ -2119,6 +2134,14 @@ "resolved": "https://registry.npmjs.org/zimmerframe/-/zimmerframe-1.1.4.tgz", "integrity": "sha512-B58NGBEoc8Y9MWWCQGl/gq9xBCe4IiKM0a2x7GZdQKOW5Exr8S1W24J6OgM1njK8xCRGvAJIL/MxXHf6SkmQKQ==", "dev": true + }, + "node_modules/zrender": { + "version": "6.0.0", + "resolved": "https://registry.npmjs.org/zrender/-/zrender-6.0.0.tgz", + "integrity": "sha512-41dFXEEXuJpNecuUQq6JlbybmnHaqqpGlbH1yxnA5V9MMP4SbohSVZsJIwz+zdjQXSSlR1Vc34EgH1zxyTDvhg==", + "dependencies": { + "tslib": "2.3.0" + } } }, "dependencies": { @@ -2833,6 +2856,15 @@ "integrity": "sha512-Gp6rDldRsFh/7XuouDbxMH3Mx8GMCcgzIb1pDTvNyn8pZGQ22u+Wa+lGV9dQCltFQ7uVw0MhRyb8XDskNFOReA==", "dev": true }, + "echarts": { + "version": "6.0.0", + "resolved": "https://registry.npmjs.org/echarts/-/echarts-6.0.0.tgz", + "integrity": "sha512-Tte/grDQRiETQP4xz3iZWSvoHrkCQtwqd6hs+mifXcjrCuo2iKWbajFObuLJVBlDIJlOzgQPd1hsaKt/3+OMkQ==", + "requires": { + "tslib": "2.3.0", + "zrender": "6.0.0" + } + }, "enhanced-resolve": { "version": "5.20.1", "resolved": "https://registry.npmjs.org/enhanced-resolve/-/enhanced-resolve-5.20.1.tgz", @@ -3231,6 +3263,11 @@ "integrity": "sha512-sf4i37nQ2LBx4m3wB74y+ubopq6W/dIzXg0FDGjsYnZHVa1Da8FH853wlL2gtUhg+xJXjfk3kUZS3BRoQeoQBQ==", "dev": true }, + "tslib": { + "version": "2.3.0", + "resolved": "https://registry.npmjs.org/tslib/-/tslib-2.3.0.tgz", + "integrity": "sha512-N82ooyxVNm6h1riLCoyS9e3fuJ3AMG2zIZs2Gd1ATcSFjSA23Q0fzjjZeh0jbJvWVDZ0cJT8yaNNaaXHzueNjg==" + }, "typescript": { "version": "5.9.3", "resolved": "https://registry.npmjs.org/typescript/-/typescript-5.9.3.tgz", @@ -3264,6 +3301,14 @@ "resolved": "https://registry.npmjs.org/zimmerframe/-/zimmerframe-1.1.4.tgz", "integrity": "sha512-B58NGBEoc8Y9MWWCQGl/gq9xBCe4IiKM0a2x7GZdQKOW5Exr8S1W24J6OgM1njK8xCRGvAJIL/MxXHf6SkmQKQ==", "dev": true + }, + "zrender": { + "version": "6.0.0", + "resolved": "https://registry.npmjs.org/zrender/-/zrender-6.0.0.tgz", + "integrity": "sha512-41dFXEEXuJpNecuUQq6JlbybmnHaqqpGlbH1yxnA5V9MMP4SbohSVZsJIwz+zdjQXSSlR1Vc34EgH1zxyTDvhg==", + "requires": { + "tslib": "2.3.0" + } } } } diff --git a/web/package.json b/web/package.json index 26a9431..8cc84cf 100644 --- a/web/package.json +++ b/web/package.json @@ -23,6 +23,7 @@ "dependencies": { "@fontsource/instrument-serif": "^5.2.8", "@fontsource/inter": "^5.2.8", - "@fontsource/jetbrains-mono": "^5.2.8" + "@fontsource/jetbrains-mono": "^5.2.8", + "echarts": "^6.0.0" } } diff --git a/web/src/lib/api.ts b/web/src/lib/api.ts index b5e739c..7e1608c 100644 --- a/web/src/lib/api.ts +++ b/web/src/lib/api.ts @@ -1,6 +1,9 @@ import type { ApiEnvelope, ContainerStats, + ContainerStatsSample, + SystemStats, + SystemStatsSample, Deploy, DeployLog, DockerHealth, @@ -677,6 +680,58 @@ export function fetchContainerStats( ); } +export function fetchInstanceStatsHistory( + projectId: string, + stageId: string, + instanceId: string, + window = '2h', + signal?: AbortSignal +): Promise { + return get( + `/api/projects/${projectId}/stages/${stageId}/instances/${instanceId}/stats/history?window=${encodeURIComponent(window)}`, + signal + ); +} + +export function fetchSystemStats(signal?: AbortSignal): Promise { + return get('/api/system/stats', signal); +} + +export function fetchSystemStatsHistory( + window = '2h', + signal?: AbortSignal +): Promise { + return get(`/api/system/stats/history?window=${encodeURIComponent(window)}`, signal); +} + +export function fetchTopContainers( + by: 'cpu' | 'memory' = 'cpu', + limit = 5, + signal?: AbortSignal +): Promise { + return get(`/api/system/stats/top?by=${by}&limit=${limit}`, signal); +} + +export function fetchStaticSiteStats(id: string, signal?: AbortSignal): Promise { + return get(`/api/sites/${id}/stats`, signal); +} + +export function fetchStaticSiteStatsHistory( + id: string, + window = '2h', + signal?: AbortSignal +): Promise { + return get( + `/api/sites/${id}/stats/history?window=${encodeURIComponent(window)}`, + signal + ); +} + +export async function fetchStaticSiteLogs(id: string, tail = 200): Promise { + const result = await get(`/api/sites/${id}/logs?tail=${tail}`); + return result ?? []; +} + // ── Static Sites ────────────────────────────────────────────────────── import type { StaticSite, StaticSiteSecret, FolderEntry, GitProvider, RepoInfo } from './types'; diff --git a/web/src/lib/components/CollapsibleSection.svelte b/web/src/lib/components/CollapsibleSection.svelte new file mode 100644 index 0000000..55db164 --- /dev/null +++ b/web/src/lib/components/CollapsibleSection.svelte @@ -0,0 +1,86 @@ + + + +
+
+ + {#if actions} +
{@render actions()}
+ {/if} +
+ {#if open} +
+ {@render children()} +
+ {/if} +
diff --git a/web/src/lib/components/ContainerLogs.svelte b/web/src/lib/components/ContainerLogs.svelte index dd57cdb..4c8a6d8 100644 --- a/web/src/lib/components/ContainerLogs.svelte +++ b/web/src/lib/components/ContainerLogs.svelte @@ -1,21 +1,26 @@ {#if stats} @@ -98,6 +172,26 @@ {formatBytes(stats.memory_usage)} / {formatBytes(stats.memory_limit)} + + + + {#if expanded} + {#if history.length === 0} +

+ {$t('resources.noSamples', { interval: '15' })} +

+ {:else} +
+ +
+ {/if} + {/if} {:else if error}

{$t('stats.unavailable')}

diff --git a/web/src/lib/components/InstanceCard.svelte b/web/src/lib/components/InstanceCard.svelte index 90ac3d6..1bb8f27 100644 --- a/web/src/lib/components/InstanceCard.svelte +++ b/web/src/lib/components/InstanceCard.svelte @@ -147,15 +147,13 @@ {#if instance.status === 'running'} - + {/if} {#if showLogs}
{ showLogs = false; }} />
diff --git a/web/src/lib/components/ResourceChart.svelte b/web/src/lib/components/ResourceChart.svelte new file mode 100644 index 0000000..722e00f --- /dev/null +++ b/web/src/lib/components/ResourceChart.svelte @@ -0,0 +1,72 @@ + + + +
diff --git a/web/src/lib/components/SystemResourcesCard.svelte b/web/src/lib/components/SystemResourcesCard.svelte new file mode 100644 index 0000000..4fafe1b --- /dev/null +++ b/web/src/lib/components/SystemResourcesCard.svelte @@ -0,0 +1,274 @@ + + + +{#if dockerDown} +

+ {$t('resources.dockerUnavailable')} +

+{:else if otherError} +

{otherError}

+{/if} + +{#if current} + +
+
+
{$t('resources.cpuCores')}
+
{current.ncpu}
+
+
+
{$t('resources.memory')}
+
{formatBytes(current.memory_total)}
+
+
+
{$t('resources.running')}
+
+ {current.running} / {current.containers} +
+
+
+
{$t('resources.dockerDisk')}
+
{formatBytes(current.disk_total_bytes)}
+
+
+ + +
+
+ {$t('resources.workloadUtilization')} + +
+ {#if history.length === 0} +

+ {$t('resources.noSamples', { interval: '15' })} +

+ {:else} + + {/if} +
+ + +
+ {#each [ + { label: $t('resources.diskImages'), size: current.disk_images_bytes, reclaim: current.disk_images_reclaimable }, + { label: $t('resources.diskContainers'), size: current.disk_containers_bytes, reclaim: current.disk_containers_reclaimable }, + { label: $t('resources.diskVolumes'), size: current.disk_volumes_bytes, reclaim: current.disk_volumes_reclaimable }, + { label: $t('resources.diskBuildCache'), size: current.disk_build_cache_bytes, reclaim: current.disk_build_cache_reclaimable } + ] as d} +
+
{d.label}
+
{formatBytes(d.size)}
+ {#if d.reclaim > 0} +
+ {$t('resources.reclaimable', { size: formatBytes(d.reclaim) })} +
+ {/if} +
+ {/each} +
+ + +
+
+ {$t('resources.topConsumers')} +
+ + +
+
+ {#if top.length === 0} +

{$t('resources.noRunning')}

+ {:else} +
+ {#each top as s} +
+ + + {s.owner_type === 'site' ? $t('resources.site') : $t('resources.instance')} + + + {s.container_id.slice(0, 12)} + + + + {#if topBy === 'cpu'} + {s.cpu_percent.toFixed(1)}% + {:else} + {formatBytes(s.memory_usage)} + {/if} + +
+ {/each} +
+ {/if} +
+{:else if !dockerDown && !otherError} +

{$t('resources.loading')}

+{/if} diff --git a/web/src/lib/i18n/en.json b/web/src/lib/i18n/en.json index 389874b..7d5821e 100644 --- a/web/src/lib/i18n/en.json +++ b/web/src/lib/i18n/en.json @@ -41,7 +41,47 @@ "failedSites": "failed", "noSites": "No static sites yet.", "addFirstSite": "Deploy your first site", - "viewAllSites": "View all sites" + "viewAllSites": "View all sites", + "systemHealth": "System health", + "daemons": "Daemons", + "systemResources": "System resources", + "systemResourcesSubtitle": "CPU, memory, disk, and top consumers" + }, + "resources": { + "cpuCores": "CPU Cores", + "memory": "Memory", + "running": "Running", + "dockerDisk": "Docker Disk", + "workloadUtilization": "Workload utilization", + "windowMinutes": "{n} minutes", + "windowHours": "{n} hours", + "noSamples": "No samples yet — the collector samples every {interval}s.", + "diskImages": "Images", + "diskContainers": "Containers", + "diskVolumes": "Volumes", + "diskBuildCache": "Build cache", + "reclaimable": "{size} reclaimable", + "topConsumers": "Top consumers", + "byCpu": "by cpu", + "byMemory": "by memory", + "noRunning": "No running containers.", + "instance": "instance", + "site": "site", + "showHistory": "Show history", + "hideHistory": "Hide history", + "cpuSeries": "CPU %", + "memorySeries": "Memory %", + "loading": "Loading…", + "sectionTitle": "Resources", + "showLogs": "Show logs", + "hideLogs": "Hide logs", + "dockerUnavailable": "Docker is unavailable. Check that the daemon is running." + }, + "statsSettings": { + "intervalLabel": "Stats collection interval (s)", + "intervalHelp": "How often resource samples are collected. 0 disables collection. Range: 5–300s.", + "retentionLabel": "Stats retention (hours)", + "retentionHelp": "How long resource samples are kept. 0 disables collection. Range: 0–24h." }, "projects": { "title": "Projects", diff --git a/web/src/lib/i18n/ru.json b/web/src/lib/i18n/ru.json index 9929147..8aed142 100644 --- a/web/src/lib/i18n/ru.json +++ b/web/src/lib/i18n/ru.json @@ -41,7 +41,47 @@ "failedSites": "с ошибкой", "noSites": "Статических сайтов пока нет.", "addFirstSite": "Разверните первый сайт", - "viewAllSites": "Все сайты" + "viewAllSites": "Все сайты", + "systemHealth": "Состояние системы", + "daemons": "Демоны", + "systemResources": "Системные ресурсы", + "systemResourcesSubtitle": "CPU, память, диск и топ потребителей" + }, + "resources": { + "cpuCores": "Ядра CPU", + "memory": "Память", + "running": "Запущено", + "dockerDisk": "Диск Docker", + "workloadUtilization": "Использование нагрузкой", + "windowMinutes": "{n} минут", + "windowHours": "{n} часов", + "noSamples": "Пока нет данных — сбор идёт каждые {interval}с.", + "diskImages": "Образы", + "diskContainers": "Контейнеры", + "diskVolumes": "Тома", + "diskBuildCache": "Кэш сборки", + "reclaimable": "{size} можно освободить", + "topConsumers": "Топ потребителей", + "byCpu": "по CPU", + "byMemory": "по памяти", + "noRunning": "Нет запущенных контейнеров.", + "instance": "экземпляр", + "site": "сайт", + "showHistory": "Показать историю", + "hideHistory": "Скрыть историю", + "cpuSeries": "CPU %", + "memorySeries": "Память %", + "loading": "Загрузка…", + "sectionTitle": "Ресурсы", + "showLogs": "Показать логи", + "hideLogs": "Скрыть логи", + "dockerUnavailable": "Docker недоступен. Проверьте, что демон запущен." + }, + "statsSettings": { + "intervalLabel": "Интервал сбора статистики (с)", + "intervalHelp": "Как часто собираются замеры ресурсов. 0 отключает сбор. Диапазон: 5–300с.", + "retentionLabel": "Хранение статистики (часы)", + "retentionHelp": "Как долго хранятся замеры ресурсов. 0 отключает сбор. Диапазон: 0–24ч." }, "projects": { "title": "Проекты", diff --git a/web/src/lib/types.ts b/web/src/lib/types.ts index 12db3ee..40e51f2 100644 --- a/web/src/lib/types.ts +++ b/web/src/lib/types.ts @@ -130,6 +130,8 @@ export interface Settings { backup_enabled: boolean; backup_interval_hours: number; backup_retention_count: number; + stats_interval_seconds: number; + stats_retention_hours: number; updated_at: string; } @@ -462,11 +464,63 @@ export interface FolderEntry { is_dir: boolean; } -/** Container CPU and memory stats from the Docker stats API. */ +/** Container CPU, memory, network, and block I/O stats from the Docker stats API. */ export interface ContainerStats { + timestamp?: string; cpu_percent: number; memory_usage: number; memory_limit: number; memory_percent: number; + network_rx_bytes?: number; + network_tx_bytes?: number; + block_read_bytes?: number; + block_write_bytes?: number; +} + +/** One persisted container sample returned by the history endpoints. */ +export interface ContainerStatsSample { + container_id: string; + owner_type: 'instance' | 'site'; + owner_id: string; + ts: number; + cpu_percent: number; + memory_usage: number; + memory_limit: number; + network_rx_bytes: number; + network_tx_bytes: number; + block_read_bytes: number; + block_write_bytes: number; +} + +/** Host-level snapshot returned by /api/system/stats. */ +export interface SystemStats { + timestamp: string; + ncpu: number; + memory_total: number; + containers: number; + running: number; + paused: number; + stopped: number; + images: number; + disk_images_bytes: number; + disk_containers_bytes: number; + disk_volumes_bytes: number; + disk_build_cache_bytes: number; + disk_images_reclaimable: number; + disk_containers_reclaimable: number; + disk_volumes_reclaimable: number; + disk_build_cache_reclaimable: number; + disk_total_bytes: number; +} + +/** One persisted system sample returned by /api/system/stats/history. */ +export interface SystemStatsSample { + ts: number; + ncpu: number; + memory_total: number; + workload_cpu_percent: number; + workload_mem_usage: number; + containers_running: number; + disk_total_bytes: number; } diff --git a/web/src/routes/+page.svelte b/web/src/routes/+page.svelte index be4da1a..508a72a 100644 --- a/web/src/routes/+page.svelte +++ b/web/src/routes/+page.svelte @@ -6,6 +6,8 @@ import EmptyState from '$lib/components/EmptyState.svelte'; import SystemHealthCard from '$lib/components/SystemHealthCard.svelte'; import SystemDaemonsCard from '$lib/components/SystemDaemonsCard.svelte'; + import SystemResourcesCard from '$lib/components/SystemResourcesCard.svelte'; + import CollapsibleSection from '$lib/components/CollapsibleSection.svelte'; import ForgeHero from '$lib/components/ForgeHero.svelte'; import { IconDeploy, IconAlert } from '$lib/components/icons'; import { t } from '$lib/i18n'; @@ -181,35 +183,49 @@ {/if} - + + + - + + + + + + + + {#if !loading} -
-
-

{$t('dashboard.staticSites')}.

- {#if sites.length > 0} - - {$t('dashboard.viewAllSites')} - - {/if} -
- + {#snippet sitesActions()} + {#if sites.length > 0} + + {$t('dashboard.viewAllSites')} → + + {/if} + {/snippet} + 0 ? String(sites.length) : ''} + actions={sitesActions} + > {#if sites.length === 0} -
- -
+ {:else} -
+
{#each sites as site (site.id)} {@const badge = siteStatusBadge(site.status)} @@ -230,21 +246,23 @@ {/each}
{/if} -
+ {/if} -
-

{$t('dashboard.projects')}.

- + 0 ? String(projects.length) : ''} + > {#if loading} -
+
{#each Array(3) as _} {/each}
{:else if error} -
+

{error}

{:else if projects.length === 0} -
- -
+ {:else} -
+
{#each projects as project (project.id)} {/each}
{/if} -
+ diff --git a/web/src/routes/settings/maintenance/+page.svelte b/web/src/routes/settings/maintenance/+page.svelte index 2a6a0c8..24af5cb 100644 --- a/web/src/routes/settings/maintenance/+page.svelte +++ b/web/src/routes/settings/maintenance/+page.svelte @@ -21,6 +21,8 @@ let staleThresholdDays = $state('7'); let imagePruneThresholdMb = $state('1024'); + let statsIntervalSeconds = $state('15'); + let statsRetentionHours = $state('2'); async function load() { loading = true; @@ -28,6 +30,8 @@ const s = await getSettings(); staleThresholdDays = String(s.stale_threshold_days ?? 7); imagePruneThresholdMb = String(s.image_prune_threshold_mb ?? 1024); + statsIntervalSeconds = String(s.stats_interval_seconds ?? 15); + statsRetentionHours = String(s.stats_retention_hours ?? 2); } catch (err) { toasts.error(err instanceof Error ? err.message : $t('settingsGeneral.loadFailed')); } finally { @@ -38,9 +42,20 @@ async function handleSave() { saving = true; try { + const intervalParsed = parseInt(statsIntervalSeconds, 10); + const retentionParsed = parseInt(statsRetentionHours, 10); + // Interval 0 disables collection; otherwise clamp to [5, 300]. + const interval = isNaN(intervalParsed) + ? 15 + : intervalParsed === 0 + ? 0 + : Math.max(5, Math.min(300, intervalParsed)); + const retention = isNaN(retentionParsed) ? 2 : Math.max(0, Math.min(24, retentionParsed)); await updateSettings({ stale_threshold_days: Math.max(1, parseInt(staleThresholdDays, 10) || 7), - image_prune_threshold_mb: Math.max(0, parseInt(imagePruneThresholdMb, 10) || 0) + image_prune_threshold_mb: Math.max(0, parseInt(imagePruneThresholdMb, 10) || 0), + stats_interval_seconds: interval, + stats_retention_hours: retention } as any); toasts.success($t('settingsGeneral.saved')); } catch (err) { @@ -99,6 +114,22 @@ placeholder="1024" helpText={$t('settings.pruneThresholdHelp')} /> + +
diff --git a/web/src/routes/sites/[id]/+page.svelte b/web/src/routes/sites/[id]/+page.svelte index d563b08..05d405e 100644 --- a/web/src/routes/sites/[id]/+page.svelte +++ b/web/src/routes/sites/[id]/+page.svelte @@ -9,6 +9,8 @@ import ConfirmDialog from '$lib/components/ConfirmDialog.svelte'; import ForgeHero from '$lib/components/ForgeHero.svelte'; import WebhookPanel from '$lib/components/WebhookPanel.svelte'; + import ContainerStats from '$lib/components/ContainerStats.svelte'; + import ContainerLogs from '$lib/components/ContainerLogs.svelte'; import { IconRefresh, IconGlobe, IconTrash, IconPlus, IconLoader, IconLock, IconUnlock, IconPlay, IconStop } from '$lib/components/icons'; let site = $state(null); @@ -26,6 +28,7 @@ let secretEncrypted = $state(true); let secretSubmitting = $state(false); let storageUsage = $state(null); + let showLogs = $state(false); const siteId = $derived($page.params.id); @@ -251,6 +254,30 @@ {/if}
+ + {#if site.container_id} +
+
+

{$t('resources.sectionTitle')}

+ +
+ +
+ + {#if showLogs} + { showLogs = false; }} + /> + {/if} + {/if} +