tiny-forge/internal/api/stats_history.go

package api

import (
	"errors"
	"log/slog"
	"net/http"
	"sort"
	"strconv"
	"time"

	"github.com/go-chi/chi/v5"

	"github.com/alexei/tinyforge/internal/auth"
	"github.com/alexei/tinyforge/internal/store"
)

// topConsumerMinWindow is the minimum look-back window for the "top consumers"
// list: only container samples newer than the window are considered.
// listTopContainers widens the window to twice the collector interval (read
// from settings) when that is larger, so the list stays meaningful even when
// sampling is sparse.
const topConsumerMinWindow = 2 * time.Minute

// TopContainerSample augments a stats sample with the human-readable owner
// name so the UI can show "workload/role" without an extra round-trip per row.
// The sample is embedded, so its fields are promoted onto this type.
type TopContainerSample struct {
	store.ContainerStatsSample
	// OwnerName is filled in by enrichWithOwnerNames. It is empty when the
	// container lookup fails and may be just the role when the workload
	// lookup fails.
	OwnerName string `json:"owner_name"`
}

const (
	// defaultHistoryWindow is the fallback look-back used when the ?window=
	// param is absent, unparsable, or not positive. Matches the default
	// retention so the "last 2h" view always has data when collection is
	// enabled.
	defaultHistoryWindow = 2 * time.Hour

	// maxHistoryWindow is the upper bound applied to any requested window.
	maxHistoryWindow = 24 * time.Hour
)

// parseWindow extracts the ?window= query value (a Go duration string such as
// "1h" or "30m") and clamps it to a usable range:
//
//   - missing, malformed, zero, or negative → defaultHistoryWindow
//   - larger than maxHistoryWindow          → maxHistoryWindow
//   - anything else                         → the parsed value
func parseWindow(r *http.Request) time.Duration {
	// An absent parameter yields "", which ParseDuration rejects, so it takes
	// the same fallback path as any other malformed value.
	requested, err := time.ParseDuration(r.URL.Query().Get("window"))
	switch {
	case err != nil, requested <= 0:
		return defaultHistoryWindow
	case requested > maxHistoryWindow:
		return maxHistoryWindow
	default:
		return requested
	}
}

// sinceTimestamp returns the Unix-seconds instant that lies window before the
// current time. The handlers in this file pass it to the store as the "since"
// cutoff for sample queries.
func sinceTimestamp(window time.Duration) int64 {
	// Unix() is independent of the time's location, so no zone conversion is
	// needed before taking it.
	cutoff := time.Now().Add(-window)
	return cutoff.Unix()
}

// getSystemStats handles GET /api/system/stats and responds with the current
// host snapshot. If no Docker client is configured, or the daemon cannot be
// queried (e.g. Docker Desktop stopped), it answers 503 so the frontend can
// render a dedicated "unavailable" state rather than a generic 5xx failure.
func (s *Server) getSystemStats(w http.ResponseWriter, r *http.Request) {
	const unavailableMsg = "Docker is not available"

	// No client at all: nothing to query, and nothing worth logging.
	if s.docker == nil {
		respondError(w, http.StatusServiceUnavailable, unavailableMsg)
		return
	}

	snapshot, err := s.docker.GetSystemStats(r.Context())
	if err != nil {
		// Same response as above, but keep the cause in the server log.
		slog.Warn("system stats unavailable", "error", err)
		respondError(w, http.StatusServiceUnavailable, unavailableMsg)
		return
	}

	respondJSON(w, http.StatusOK, snapshot)
}

// getSystemStatsHistory handles GET /api/system/stats/history?window=1h.
// It returns the stored system samples newer than the requested window
// (bounded by parseWindow) and always responds with a JSON array, never null.
func (s *Server) getSystemStatsHistory(w http.ResponseWriter, r *http.Request) {
	cutoff := sinceTimestamp(parseWindow(r))

	history, err := s.store.ListSystemStatsSamples(cutoff)
	if err != nil {
		slog.Error("failed to list system stats samples", "error", err)
		respondError(w, http.StatusInternalServerError, "failed to list samples")
		return
	}

	// A nil slice would be encoded as null; substitute an empty one so the
	// client always receives [].
	if history == nil {
		history = []store.SystemStatsSample{}
	}
	respondJSON(w, http.StatusOK, history)
}

// workloadStatsPoint is one aggregated time bucket for a workload's metrics
// graph: every container the workload owns is summed at each timestamp so a
// multi-container (compose) workload reads as a single line. MemoryLimit is
// the max across containers — the effective ceiling — though the UI plots
// absolute MiB because the limit is often 0 (unlimited).
//
// Fields, as populated by aggregateWorkloadStats:
//   - TS: the sample timestamp shared by the bucket (presumably Unix seconds,
//     matching the cutoff produced by sinceTimestamp — confirm against store).
//   - CPUPercent: sum of the per-container CPU percentages at TS.
//   - MemoryUsage: sum of the per-container memory usage at TS.
//   - MemoryLimit: largest per-container memory limit seen at TS.
type workloadStatsPoint struct {
	TS          int64   `json:"ts"`
	CPUPercent  float64 `json:"cpu_percent"`
	MemoryUsage int64   `json:"memory_usage"`
	MemoryLimit int64   `json:"memory_limit"`
}

// getWorkloadStatsHistory handles GET /api/workloads/{id}/stats/history?window=1h.
// Read-only and open to any authenticated user (mirrors the per-workload
// events/runtime-state feeds). Always returns a (possibly empty) array — never
// 503 — because samples come from SQLite, which is available even when the
// Docker daemon is down or stats collection is disabled.
//
// Responses:
//   - 400 when the {id} URL parameter is empty.
//   - 404 when the workload id is unknown (store.ErrNotFound).
//   - 500 when the workload lookup or the sample query fails for any other
//     reason; the underlying error is logged, not returned to the client.
//   - 200 with the per-timestamp aggregate from aggregateWorkloadStats; a
//     known workload with no samples yet yields [].
func (s *Server) getWorkloadStatsHistory(w http.ResponseWriter, r *http.Request) {
	id := chi.URLParam(r, "id")
	if id == "" {
		respondError(w, http.StatusBadRequest, "workload id is required")
		return
	}

	// Existence check only: the workload record itself is not needed, but an
	// unknown id must 404 rather than return an empty series.
	if _, err := s.store.GetWorkloadByID(id); err != nil {
		if errors.Is(err, store.ErrNotFound) {
			respondNotFound(w, "workload")
			return
		}
		// Log the cause like every other 500 path in this file; the client
		// only sees the generic message.
		slog.Error("failed to get workload", "workload", id, "error", err)
		respondError(w, http.StatusInternalServerError, "get workload")
		return
	}

	samples, err := s.store.ListContainerStatsSamplesByWorkload(id, sinceTimestamp(parseWindow(r)))
	if err != nil {
		slog.Error("failed to list workload stats samples", "workload", id, "error", err)
		respondError(w, http.StatusInternalServerError, "failed to list samples")
		return
	}

	respondJSON(w, http.StatusOK, aggregateWorkloadStats(samples))
}

// aggregateWorkloadStats collapses per-container samples into a single series
// with one point per distinct timestamp. Within a timestamp, CPU% and memory
// usage are added together and the memory limit is the largest value seen.
//
// Points appear in the order their timestamp is first encountered, so input
// that arrives ts-ascending produces ts-ascending output with no extra sort.
// The result is never nil, so an empty input encodes as [] rather than null.
func aggregateWorkloadStats(samples []store.ContainerStatsSample) []workloadStatsPoint {
	series := make([]workloadStatsPoint, 0)
	slotByTS := make(map[int64]int) // timestamp → position in series

	for _, sample := range samples {
		slot, seen := slotByTS[sample.TS]
		if !seen {
			// First container at this timestamp opens a new bucket.
			slotByTS[sample.TS] = len(series)
			series = append(series, workloadStatsPoint{
				TS:          sample.TS,
				CPUPercent:  sample.CPUPercent,
				MemoryUsage: sample.MemoryUsage,
				MemoryLimit: sample.MemoryLimit,
			})
			continue
		}

		// Further containers at the same timestamp fold into the bucket.
		bucket := &series[slot]
		bucket.CPUPercent += sample.CPUPercent
		bucket.MemoryUsage += sample.MemoryUsage
		if bucket.MemoryLimit < sample.MemoryLimit {
			bucket.MemoryLimit = sample.MemoryLimit
		}
	}
	return series
}

// listTopContainers handles GET /api/system/stats/top?limit=5&by=cpu.
// Returns the latest sample of each recently-seen container, sorted
// descending by CPU or memory and truncated to the top N. Container IDs are
// stripped for non-admins so a low-privilege viewer cannot enumerate
// workloads outside their scope.
//
// Query parameters:
//   - limit: 1–50; any other value (or a parse failure) falls back to 5.
//   - by: "memory" sorts by memory usage; anything else sorts by CPU percent.
//
// Containers with equal metric values are ordered by container ID so that
// repeated requests over the same data return the same list.
func (s *Server) listTopContainers(w http.ResponseWriter, r *http.Request) {
	limit := 5
	if raw := r.URL.Query().Get("limit"); raw != "" {
		if n, err := strconv.Atoi(raw); err == nil && n > 0 && n <= 50 {
			limit = n
		}
	}
	by := r.URL.Query().Get("by")
	if by != "memory" {
		by = "cpu"
	}

	// Samples must be at least as recent as max(2*interval, 2 minutes) so the
	// list reflects near-current load even when collection is sparse. A
	// settings read failure silently keeps the minimum window (best effort).
	window := topConsumerMinWindow
	if settings, err := s.store.GetSettings(); err == nil && settings.StatsIntervalSeconds > 0 {
		if scaled := time.Duration(settings.StatsIntervalSeconds*2) * time.Second; scaled > window {
			window = scaled
		}
	}

	samples, err := s.store.ListAllRecentContainerStatsSamples(sinceTimestamp(window))
	if err != nil {
		slog.Error("failed to list container samples for top", "error", err)
		respondError(w, http.StatusInternalServerError, "failed to list samples")
		return
	}

	// Keep only the latest sample per container.
	latest := make(map[string]store.ContainerStatsSample, len(samples))
	for _, sm := range samples {
		if prev, ok := latest[sm.ContainerID]; !ok || sm.TS > prev.TS {
			latest[sm.ContainerID] = sm
		}
	}

	top := make([]store.ContainerStatsSample, 0, len(latest))
	for _, sm := range latest {
		top = append(top, sm)
	}

	// Map iteration order is random and sort.Slice is not stable, so without
	// a tie-breaker equal-valued containers could swap places — or move in
	// and out of the top-N cut — between otherwise identical requests.
	sort.Slice(top, func(i, j int) bool {
		if by == "memory" {
			if top[i].MemoryUsage != top[j].MemoryUsage {
				return top[i].MemoryUsage > top[j].MemoryUsage
			}
		} else if top[i].CPUPercent != top[j].CPUPercent {
			return top[i].CPUPercent > top[j].CPUPercent
		}
		return top[i].ContainerID < top[j].ContainerID
	})
	if len(top) > limit {
		top = top[:limit]
	}

	enriched := s.enrichWithOwnerNames(top)

	// Scrub container IDs for non-admins. The owner name is the actionable
	// identifier; the container ID is a host-level handle that reveals
	// workload existence to viewers who shouldn't have it.
	// NOTE(review): the second return value of ClaimsFromContext is ignored;
	// this presumably relies on the auth middleware always populating claims
	// for this route — confirm claims cannot be nil/zero here.
	claims, _ := auth.ClaimsFromContext(r.Context())
	if claims.Role != "admin" {
		for i := range enriched {
			enriched[i].ContainerID = ""
		}
	}

	respondJSON(w, http.StatusOK, enriched)
}

// enrichWithOwnerNames wraps every sample in a TopContainerSample and fills in
// its human-readable owner name, preserving the input order. Names are
// resolved via lookupInstanceName (containers index → workloads), which after
// the cutover is the only available lookup path. The result is never nil.
func (s *Server) enrichWithOwnerNames(samples []store.ContainerStatsSample) []TopContainerSample {
	enriched := make([]TopContainerSample, 0, len(samples))
	for _, sample := range samples {
		enriched = append(enriched, TopContainerSample{
			ContainerStatsSample: sample,
			OwnerName:            s.lookupInstanceName(sample.OwnerID),
		})
	}
	return enriched
}

// lookupInstanceName resolves a container row to a display name. Lookup
// errors never propagate, so a transient miss cannot break the response;
// they only degrade the name:
//
//   - container not found / lookup error → ""
//   - workload lookup error              → the container's role (may be "")
//   - workload found, role set           → "workload/role"
//   - workload found, no role            → the workload name
func (s *Server) lookupInstanceName(instanceID string) string {
	container, err := s.store.GetContainerByID(instanceID)
	if err != nil {
		return ""
	}

	workload, err := s.store.GetWorkloadByID(container.WorkloadID)
	switch {
	case err != nil:
		// Without the workload the role is all we have; an empty role
		// naturally yields the empty string.
		return container.Role
	case container.Role == "":
		return workload.Name
	default:
		return workload.Name + "/" + container.Role
	}
}