Files
tiny-forge/internal/api/stats_history.go
T
alexei.dolgolyov d8ab22876f
Build / build (push) Successful in 10m41s
refactor(workload): extract Instance entirely; Container is canonical
End-to-end extraction of the Instance concept. After this commit:

  * internal/store/instances.go — DELETED
  * internal/store/models.go — Instance struct gone, ProxyRoute moved here
  * containers table is the single source of truth for project/stack/site
    container state. instances table is dropped via DROP TABLE migration
    (idempotent; re-runnable on every boot).
  * Legacy tinyforge.project / tinyforge.stage / tinyforge.instance-id
    Docker labels are no longer emitted; only tinyforge.workload.{id,kind},
    tinyforge.role, and tinyforge.managed are stamped on new containers.

Backend rewrites:
  - internal/deployer:        executeDeploy + blueGreenDeploy + rollback +
                              promote use store.Container natively. New
                              removeContainer() replaces removeInstance().
                              enforceMaxInstances reads via
                              ListContainersByStageID.
  - internal/reconciler:      legacy tinyforge.instance-id dispatch removed;
                              upsertByWorkloadLabel now finds existing rows
                              by docker container ID first and falls back to
                              the deterministic workloadID:role key.
  - internal/stale/scanner:   Scan + new FindStaleContainers walk the
                              containers table; emit StaleContainer JSON.
  - internal/stats/collector: ListContainers replaces ListAllInstances.
  - internal/webhook/handler: workload-secret lookup tried first; falls back
                              to project / static_site secret column.
  - internal/api: instances.go, stale.go, stats.go, stats_history.go,
                  projects.go, settings.go, docker.go, dns.go all read /
                  write through Container.

Docker layer:
  - ManagedContainer exposes WorkloadID/Kind/Role from the canonical labels.
  - ListContainers filters by tinyforge.managed=true.
  - Network creation uses LabelManaged instead of LabelProject.

Frontend:
  - Instance type is now a Container alias; .status → .state,
    .last_alive_at → .last_seen_at.
  - InstanceCard takes stageId as a prop (no longer derived from Instance).
  - StaleContainer JSON shape rewritten: { container, workload_name, role,
    days_stale }. StaleContainerCard + /containers/stale page updated.
  - ProjectCard / homepage / SystemHealthCard filter by .state.

The migration loop now tolerates "no such table" alongside "duplicate
column" / "already exists" so obsolete ALTER TABLE entries targeting the
dropped instances table no-op cleanly on first boot.

Tests: store + deployer + reconciler + webhook + staticsite + notify all
still pass. Frontend svelte-check: zero errors.
2026-05-09 14:43:12 +03:00

311 lines
10 KiB
Go

package api
import (
"errors"
"log/slog"
"net/http"
"sort"
"strconv"
"time"
"github.com/go-chi/chi/v5"
"github.com/alexei/tinyforge/internal/auth"
"github.com/alexei/tinyforge/internal/stats"
"github.com/alexei/tinyforge/internal/store"
)
// topConsumerMinWindow is the minimum look-back window for the "top
// consumers" list: a container sample must be at least this recent to count.
// listTopContainers widens the window to twice the collector interval (read
// from settings) whenever that is larger, so the list stays meaningful even
// when sampling is sparse.
const topConsumerMinWindow = 2 * time.Minute
// TopContainerSample augments a stats sample with a human-readable owner
// name so the UI can label each row without an extra round-trip. The name is
// filled in by enrichWithOwnerNames: "workload/role" (or just one of the two
// when the other is unavailable) for workload containers, the site name for
// static sites, and empty when the owner cannot be resolved.
type TopContainerSample struct {
// Embedded so the sample's own JSON fields are emitted at the top level.
store.ContainerStatsSample
// OwnerName is the resolved display name; may be empty.
OwnerName string `json:"owner_name"`
}
const (
// defaultHistoryWindow is used when no ?window= param is provided, or when
// the value fails to parse or is not positive. Intended to match the
// default retention so the "last 2h" view always has data when collection
// is enabled.
defaultHistoryWindow = 2 * time.Hour
// maxHistoryWindow is the upper bound parseWindow clamps ?window= to.
maxHistoryWindow = 24 * time.Hour
)
// parseWindow extracts the ?window= query parameter (a Go duration string
// such as "1h" or "30m") and returns it as a bounded duration. A missing,
// unparseable, or non-positive value yields defaultHistoryWindow; anything
// above maxHistoryWindow is clamped down to maxHistoryWindow.
func parseWindow(r *http.Request) time.Duration {
	value := r.URL.Query().Get("window")
	if value == "" {
		return defaultHistoryWindow
	}

	window, parseErr := time.ParseDuration(value)
	switch {
	case parseErr != nil, window <= 0:
		return defaultHistoryWindow
	case window > maxHistoryWindow:
		return maxHistoryWindow
	default:
		return window
	}
}
// sinceTimestamp returns the Unix-seconds timestamp that lies `window` before
// the current time, for use as a "samples newer than" cutoff.
func sinceTimestamp(window time.Duration) int64 {
	cutoff := time.Now().UTC().Add(-window)
	return cutoff.Unix()
}
// getSystemStats handles GET /api/system/stats and responds with the current
// host snapshot. Both a missing Docker client and a failing stats call are
// answered with 503 rather than a generic 5xx, so the frontend can render a
// dedicated "Docker unavailable" state (e.g. Docker Desktop stopped).
func (s *Server) getSystemStats(w http.ResponseWriter, r *http.Request) {
	const unavailable = "Docker is not available"

	if s.docker == nil {
		respondError(w, http.StatusServiceUnavailable, unavailable)
		return
	}

	snapshot, err := s.docker.GetSystemStats(r.Context())
	if err != nil {
		// Logged at warn level: an unreachable daemon is treated as an
		// expected condition, not a server fault.
		slog.Warn("system stats unavailable", "error", err)
		respondError(w, http.StatusServiceUnavailable, unavailable)
		return
	}

	respondJSON(w, http.StatusOK, snapshot)
}
// getSystemStatsHistory handles GET /api/system/stats/history?window=1h.
// It returns the stored host samples newer than the requested window (see
// parseWindow for defaults and bounds). An empty result is sent as a JSON
// array, never null.
func (s *Server) getSystemStatsHistory(w http.ResponseWriter, r *http.Request) {
	cutoff := sinceTimestamp(parseWindow(r))

	history, err := s.store.ListSystemStatsSamples(cutoff)
	if err != nil {
		slog.Error("failed to list system stats samples", "error", err)
		respondError(w, http.StatusInternalServerError, "failed to list samples")
		return
	}

	// A nil slice would encode as null; substitute an empty one.
	if history == nil {
		history = make([]store.SystemStatsSample, 0)
	}
	respondJSON(w, http.StatusOK, history)
}
// getInstanceStatsHistory handles
// GET /api/projects/{id}/stages/{stage}/instances/{iid}/stats/history.
// {iid} is the container row ID (same UUID as the legacy instance ID). The
// container must exist (404 otherwise); its samples inside the requested
// window are returned as a JSON array, never null.
func (s *Server) getInstanceStatsHistory(w http.ResponseWriter, r *http.Request) {
	containerRowID := chi.URLParam(r, "iid")

	// Verify the row exists first so unknown IDs yield 404 instead of an
	// empty sample list.
	_, lookupErr := s.store.GetContainerByID(containerRowID)
	switch {
	case errors.Is(lookupErr, store.ErrNotFound):
		respondNotFound(w, "container")
		return
	case lookupErr != nil:
		slog.Error("failed to get container", "id", containerRowID, "error", lookupErr)
		respondError(w, http.StatusInternalServerError, "failed to get container")
		return
	}

	cutoff := sinceTimestamp(parseWindow(r))
	history, err := s.store.ListContainerStatsSamples(stats.OwnerTypeInstance, containerRowID, cutoff)
	if err != nil {
		slog.Error("failed to list instance stats samples", "instance_id", containerRowID, "error", err)
		respondError(w, http.StatusInternalServerError, "failed to list samples")
		return
	}

	// A nil slice would encode as null; substitute an empty one.
	if history == nil {
		history = make([]store.ContainerStatsSample, 0)
	}
	respondJSON(w, http.StatusOK, history)
}
// getStaticSiteStats handles GET /api/sites/{id}/stats and responds with the
// current stats snapshot of the site's container. Responses: 404 when the
// site does not exist, 409 when it has no container, 503 when there is no
// Docker client, 500 on store or stats failures.
func (s *Server) getStaticSiteStats(w http.ResponseWriter, r *http.Request) {
	siteID := chi.URLParam(r, "id")

	site, lookupErr := s.store.GetStaticSiteByID(siteID)
	switch {
	case errors.Is(lookupErr, store.ErrNotFound):
		respondNotFound(w, "site")
		return
	case lookupErr != nil:
		slog.Error("failed to get site", "site_id", siteID, "error", lookupErr)
		respondError(w, http.StatusInternalServerError, "failed to get site")
		return
	}

	// Preconditions are checked in this order on purpose: a site without a
	// container reports 409 even when Docker is also unavailable.
	switch {
	case site.ContainerID == "":
		respondError(w, http.StatusConflict, "site has no container")
		return
	case s.docker == nil:
		respondError(w, http.StatusServiceUnavailable, "Docker is not available")
		return
	}

	snapshot, statsErr := s.docker.GetContainerStats(r.Context(), site.ContainerID)
	if statsErr != nil {
		slog.Error("failed to get site stats", "site_id", siteID, "error", statsErr)
		respondError(w, http.StatusInternalServerError, "failed to get site stats")
		return
	}
	respondJSON(w, http.StatusOK, snapshot)
}
// getStaticSiteStatsHistory handles GET /api/sites/{id}/stats/history.
// The site must exist (404 otherwise); its container samples inside the
// requested window (see parseWindow) are returned as a JSON array, never
// null.
func (s *Server) getStaticSiteStatsHistory(w http.ResponseWriter, r *http.Request) {
	siteID := chi.URLParam(r, "id")

	// Existence check only; the site record itself is not needed.
	_, lookupErr := s.store.GetStaticSiteByID(siteID)
	switch {
	case errors.Is(lookupErr, store.ErrNotFound):
		respondNotFound(w, "site")
		return
	case lookupErr != nil:
		slog.Error("failed to get site", "site_id", siteID, "error", lookupErr)
		respondError(w, http.StatusInternalServerError, "failed to get site")
		return
	}

	cutoff := sinceTimestamp(parseWindow(r))
	history, err := s.store.ListContainerStatsSamples(stats.OwnerTypeSite, siteID, cutoff)
	if err != nil {
		slog.Error("failed to list site stats samples", "site_id", siteID, "error", err)
		respondError(w, http.StatusInternalServerError, "failed to list samples")
		return
	}

	// A nil slice would encode as null; substitute an empty one.
	if history == nil {
		history = make([]store.ContainerStatsSample, 0)
	}
	respondJSON(w, http.StatusOK, history)
}
// streamStaticSiteLogs handles GET /api/sites/{id}/logs?tail=200&follow=true.
// After resolving the site to its container it hands the request to
// streamLogsForContainer, the shared log streamer, so streaming behaviour is
// the same as for the instance logs endpoint. Responds 404 for an unknown
// site and 409 when the site has no container.
func (s *Server) streamStaticSiteLogs(w http.ResponseWriter, r *http.Request) {
	siteID := chi.URLParam(r, "id")

	site, lookupErr := s.store.GetStaticSiteByID(siteID)
	switch {
	case errors.Is(lookupErr, store.ErrNotFound):
		respondNotFound(w, "site")
		return
	case lookupErr != nil:
		slog.Error("failed to get site", "site_id", siteID, "error", lookupErr)
		respondError(w, http.StatusInternalServerError, "failed to get site")
		return
	}

	containerID := site.ContainerID
	if containerID == "" {
		respondError(w, http.StatusConflict, "site has no container")
		return
	}

	s.streamLogsForContainer(w, r, containerID)
}
// listTopContainers handles GET /api/system/stats/top?limit=5&by=cpu.
//
// It takes the latest sample of every container seen inside the recency
// window, ranks them by CPU percentage (default) or by memory usage
// (?by=memory), and returns the first `limit` entries enriched with owner
// names. ?limit= must be an integer in 1..50; any other value falls back to
// 5. Ties on the ranking metric are broken by container ID so repeated
// requests over the same data return the same rows in the same order.
//
// Container IDs are blanked for non-admins so a low-privilege viewer does
// not receive host-level container handles.
func (s *Server) listTopContainers(w http.ResponseWriter, r *http.Request) {
	query := r.URL.Query()

	limit := 5
	if raw := query.Get("limit"); raw != "" {
		if n, err := strconv.Atoi(raw); err == nil && n > 0 && n <= 50 {
			limit = n
		}
	}
	// Anything other than "memory" ranks by CPU.
	byMemory := query.Get("by") == "memory"

	// Samples must be at least as recent as max(2*interval, 2 minutes) so the
	// list reflects near-current load even when collection is sparse. A
	// settings read failure silently keeps the minimum window.
	window := topConsumerMinWindow
	if settings, err := s.store.GetSettings(); err == nil && settings.StatsIntervalSeconds > 0 {
		if scaled := time.Duration(settings.StatsIntervalSeconds*2) * time.Second; scaled > window {
			window = scaled
		}
	}

	samples, err := s.store.ListAllRecentContainerStatsSamples(sinceTimestamp(window))
	if err != nil {
		slog.Error("failed to list container samples for top", "error", err)
		respondError(w, http.StatusInternalServerError, "failed to list samples")
		return
	}

	// Keep only the latest sample per container.
	latest := make(map[string]store.ContainerStatsSample, len(samples))
	for _, sm := range samples {
		if prev, ok := latest[sm.ContainerID]; !ok || sm.TS > prev.TS {
			latest[sm.ContainerID] = sm
		}
	}
	top := make([]store.ContainerStatsSample, 0, len(latest))
	for _, sm := range latest {
		top = append(top, sm)
	}

	// Map iteration order is random and sort.Slice is not stable, so without
	// a tie-break containers with equal load (e.g. several idle ones at 0%)
	// would be ordered, and cut off by the limit, differently per request.
	sort.Slice(top, func(i, j int) bool {
		a, b := &top[i], &top[j]
		if byMemory {
			if a.MemoryUsage != b.MemoryUsage {
				return a.MemoryUsage > b.MemoryUsage
			}
		} else if a.CPUPercent != b.CPUPercent {
			return a.CPUPercent > b.CPUPercent
		}
		return a.ContainerID < b.ContainerID
	})
	if len(top) > limit {
		top = top[:limit]
	}

	// Resolve owner names after truncation so at most `limit` lookups run and
	// the UI needs no per-row round trip.
	enriched := s.enrichWithOwnerNames(top)

	// Scrub container IDs for non-admins. The owner name is the actionable
	// identifier; the container ID is a host-level handle that reveals
	// workload existence to viewers who shouldn't have it.
	// NOTE(review): the second return value is ignored; if ClaimsFromContext
	// can return a nil pointer when no claims are attached, the field access
	// below would panic — confirm the auth middleware always sets claims.
	claims, _ := auth.ClaimsFromContext(r.Context())
	if claims.Role != "admin" {
		for i := range enriched {
			enriched[i].ContainerID = ""
		}
	}
	respondJSON(w, http.StatusOK, enriched)
}
// enrichWithOwnerNames wraps each sample in a TopContainerSample carrying a
// human-readable owner name. Names are resolved one sample at a time through
// lookupInstanceName / lookupSiteName, so the number of store lookups grows
// with len(samples); listTopContainers trims the slice to its limit before
// calling. Samples with an unrecognised owner type keep an empty name.
func (s *Server) enrichWithOwnerNames(samples []store.ContainerStatsSample) []TopContainerSample {
	enriched := make([]TopContainerSample, 0, len(samples))
	for _, sample := range samples {
		var owner string
		switch sample.OwnerType {
		case stats.OwnerTypeInstance:
			owner = s.lookupInstanceName(sample.OwnerID)
		case stats.OwnerTypeSite:
			owner = s.lookupSiteName(sample.OwnerID)
		}
		enriched = append(enriched, TopContainerSample{
			ContainerStatsSample: sample,
			OwnerName:            owner,
		})
	}
	return enriched
}
// lookupInstanceName builds a display name for a container row:
//
//   - "workload/role" when both the workload and a role are known,
//   - the workload name alone when the container has no role,
//   - the role alone (possibly empty) when the workload lookup fails,
//   - "" when the container row itself cannot be loaded.
//
// Errors are deliberately swallowed so a transient miss does not break the
// response.
func (s *Server) lookupInstanceName(instanceID string) string {
	container, err := s.store.GetContainerByID(instanceID)
	if err != nil {
		return ""
	}

	workload, err := s.store.GetWorkloadByID(container.WorkloadID)
	switch {
	case err != nil:
		return container.Role
	case container.Role == "":
		return workload.Name
	default:
		return workload.Name + "/" + container.Role
	}
}
// lookupSiteName resolves a static site's display name. Any lookup error
// yields the empty string.
func (s *Server) lookupSiteName(siteID string) string {
	if site, err := s.store.GetStaticSiteByID(siteID); err == nil {
		return site.Name
	}
	return ""
}