d8ab22876f
Build / build (push) Successful in 10m41s
End-to-end extraction of the Instance concept. After this commit:
* internal/store/instances.go — DELETED
* internal/store/models.go — Instance struct gone, ProxyRoute moved here
* containers table is the single source of truth for project/stack/site
container state. instances table is dropped via DROP TABLE migration
(idempotent; re-runnable on every boot).
* Legacy tinyforge.project / tinyforge.stage / tinyforge.instance-id
Docker labels are no longer emitted; only tinyforge.workload.{id,kind},
tinyforge.role, and tinyforge.managed are stamped on new containers.
Backend rewrites:
- internal/deployer: executeDeploy + blueGreenDeploy + rollback +
promote use store.Container natively. New
removeContainer() replaces removeInstance().
enforceMaxInstances reads via
ListContainersByStageID.
- internal/reconciler: legacy tinyforge.instance-id dispatch removed;
upsertByWorkloadLabel now finds existing rows
by docker container ID first and falls back to
the deterministic workloadID:role key.
- internal/stale/scanner: Scan + new FindStaleContainers walk the
containers table; emit StaleContainer JSON.
- internal/stats/collector: ListContainers replaces ListAllInstances.
- internal/webhook/handler: workload-secret lookup tried first; falls back
to project / static_site secret column.
- internal/api: instances.go, stale.go, stats.go, stats_history.go,
projects.go, settings.go, docker.go, dns.go all read /
write through Container.
Docker layer:
- ManagedContainer exposes WorkloadID/Kind/Role from the canonical labels.
- ListContainers filters by tinyforge.managed=true.
- Network creation uses LabelManaged instead of LabelProject.
Frontend:
- Instance type is now a Container alias; .status → .state,
.last_alive_at → .last_seen_at.
- InstanceCard takes stageId as a prop (no longer derived from Instance).
- StaleContainer JSON shape rewritten: { container, workload_name, role,
days_stale }. StaleContainerCard + /containers/stale page updated.
- ProjectCard / homepage / SystemHealthCard filter by .state.
The migration loop now tolerates "no such table" alongside "duplicate
column" / "already exists" so obsolete ALTER TABLE entries targeting the
dropped instances table no-op cleanly on first boot.
Tests: store + deployer + reconciler + webhook + staticsite + notify all
still pass. Frontend svelte-check: zero errors.
311 lines
10 KiB
Go
311 lines
10 KiB
Go
package api
|
|
|
|
import (
|
|
"errors"
|
|
"log/slog"
|
|
"net/http"
|
|
"sort"
|
|
"strconv"
|
|
"time"
|
|
|
|
"github.com/go-chi/chi/v5"
|
|
|
|
"github.com/alexei/tinyforge/internal/auth"
|
|
"github.com/alexei/tinyforge/internal/stats"
|
|
"github.com/alexei/tinyforge/internal/store"
|
|
)
|
|
|
|
// topConsumerWindow is how recent a container sample must be to count toward
|
|
// the "top consumers" list. Scaled with the collector interval (read from
|
|
// settings) so it stays meaningful even when sampling is sparse.
|
|
const topConsumerMinWindow = 2 * time.Minute
|
|
|
|
// TopContainerSample augments a stats sample with the human-readable owner
|
|
// name so the UI can show "project/stage" or the static-site name without an
|
|
// extra round-trip per row.
|
|
type TopContainerSample struct {
|
|
store.ContainerStatsSample
|
|
OwnerName string `json:"owner_name"`
|
|
}
|
|
|
|
const (
|
|
// defaultHistoryWindow is used when no ?window= param is provided or the
|
|
// value fails to parse. Matches the default retention so the "last 2h"
|
|
// view always has data when collection is enabled.
|
|
defaultHistoryWindow = 2 * time.Hour
|
|
maxHistoryWindow = 24 * time.Hour
|
|
)
|
|
|
|
// parseWindow reads the ?window= query (Go duration string, e.g. "1h", "30m")
|
|
// and returns a bounded duration.
|
|
func parseWindow(r *http.Request) time.Duration {
|
|
raw := r.URL.Query().Get("window")
|
|
if raw == "" {
|
|
return defaultHistoryWindow
|
|
}
|
|
d, err := time.ParseDuration(raw)
|
|
if err != nil || d <= 0 {
|
|
return defaultHistoryWindow
|
|
}
|
|
if d > maxHistoryWindow {
|
|
return maxHistoryWindow
|
|
}
|
|
return d
|
|
}
|
|
|
|
// sinceTimestamp converts a duration into a Unix-seconds cutoff.
|
|
func sinceTimestamp(window time.Duration) int64 {
|
|
return time.Now().UTC().Add(-window).Unix()
|
|
}
|
|
|
|
// getSystemStats handles GET /api/system/stats — current host snapshot.
|
|
// When the Docker daemon is unreachable (e.g. Docker Desktop stopped) the
|
|
// handler returns 503 so the frontend can show a dedicated unavailable
|
|
// state instead of treating it as a generic 5xx failure.
|
|
func (s *Server) getSystemStats(w http.ResponseWriter, r *http.Request) {
|
|
if s.docker == nil {
|
|
respondError(w, http.StatusServiceUnavailable, "Docker is not available")
|
|
return
|
|
}
|
|
sys, err := s.docker.GetSystemStats(r.Context())
|
|
if err != nil {
|
|
slog.Warn("system stats unavailable", "error", err)
|
|
respondError(w, http.StatusServiceUnavailable, "Docker is not available")
|
|
return
|
|
}
|
|
respondJSON(w, http.StatusOK, sys)
|
|
}
|
|
|
|
// getSystemStatsHistory handles GET /api/system/stats/history?window=1h.
|
|
func (s *Server) getSystemStatsHistory(w http.ResponseWriter, r *http.Request) {
|
|
samples, err := s.store.ListSystemStatsSamples(sinceTimestamp(parseWindow(r)))
|
|
if err != nil {
|
|
slog.Error("failed to list system stats samples", "error", err)
|
|
respondError(w, http.StatusInternalServerError, "failed to list samples")
|
|
return
|
|
}
|
|
if samples == nil {
|
|
samples = []store.SystemStatsSample{}
|
|
}
|
|
respondJSON(w, http.StatusOK, samples)
|
|
}
|
|
|
|
// getInstanceStatsHistory handles GET /api/projects/{id}/stages/{stage}/instances/{iid}/stats/history.
|
|
// {iid} is the container row ID (same UUID as the legacy instance ID).
|
|
func (s *Server) getInstanceStatsHistory(w http.ResponseWriter, r *http.Request) {
|
|
instanceID := chi.URLParam(r, "iid")
|
|
if _, err := s.store.GetContainerByID(instanceID); err != nil {
|
|
if errors.Is(err, store.ErrNotFound) {
|
|
respondNotFound(w, "container")
|
|
return
|
|
}
|
|
slog.Error("failed to get container", "id", instanceID, "error", err)
|
|
respondError(w, http.StatusInternalServerError, "failed to get container")
|
|
return
|
|
}
|
|
samples, err := s.store.ListContainerStatsSamples(stats.OwnerTypeInstance, instanceID, sinceTimestamp(parseWindow(r)))
|
|
if err != nil {
|
|
slog.Error("failed to list instance stats samples", "instance_id", instanceID, "error", err)
|
|
respondError(w, http.StatusInternalServerError, "failed to list samples")
|
|
return
|
|
}
|
|
if samples == nil {
|
|
samples = []store.ContainerStatsSample{}
|
|
}
|
|
respondJSON(w, http.StatusOK, samples)
|
|
}
|
|
|
|
// getStaticSiteStats handles GET /api/sites/{id}/stats — current snapshot.
|
|
func (s *Server) getStaticSiteStats(w http.ResponseWriter, r *http.Request) {
|
|
id := chi.URLParam(r, "id")
|
|
site, err := s.store.GetStaticSiteByID(id)
|
|
if err != nil {
|
|
if errors.Is(err, store.ErrNotFound) {
|
|
respondNotFound(w, "site")
|
|
return
|
|
}
|
|
slog.Error("failed to get site", "site_id", id, "error", err)
|
|
respondError(w, http.StatusInternalServerError, "failed to get site")
|
|
return
|
|
}
|
|
if site.ContainerID == "" {
|
|
respondError(w, http.StatusConflict, "site has no container")
|
|
return
|
|
}
|
|
if s.docker == nil {
|
|
respondError(w, http.StatusServiceUnavailable, "Docker is not available")
|
|
return
|
|
}
|
|
cs, err := s.docker.GetContainerStats(r.Context(), site.ContainerID)
|
|
if err != nil {
|
|
slog.Error("failed to get site stats", "site_id", id, "error", err)
|
|
respondError(w, http.StatusInternalServerError, "failed to get site stats")
|
|
return
|
|
}
|
|
respondJSON(w, http.StatusOK, cs)
|
|
}
|
|
|
|
// getStaticSiteStatsHistory handles GET /api/sites/{id}/stats/history.
|
|
func (s *Server) getStaticSiteStatsHistory(w http.ResponseWriter, r *http.Request) {
|
|
id := chi.URLParam(r, "id")
|
|
if _, err := s.store.GetStaticSiteByID(id); err != nil {
|
|
if errors.Is(err, store.ErrNotFound) {
|
|
respondNotFound(w, "site")
|
|
return
|
|
}
|
|
slog.Error("failed to get site", "site_id", id, "error", err)
|
|
respondError(w, http.StatusInternalServerError, "failed to get site")
|
|
return
|
|
}
|
|
samples, err := s.store.ListContainerStatsSamples(stats.OwnerTypeSite, id, sinceTimestamp(parseWindow(r)))
|
|
if err != nil {
|
|
slog.Error("failed to list site stats samples", "site_id", id, "error", err)
|
|
respondError(w, http.StatusInternalServerError, "failed to list samples")
|
|
return
|
|
}
|
|
if samples == nil {
|
|
samples = []store.ContainerStatsSample{}
|
|
}
|
|
respondJSON(w, http.StatusOK, samples)
|
|
}
|
|
|
|
// streamStaticSiteLogs handles GET /api/sites/{id}/logs?tail=200&follow=true.
|
|
// Reuses the shared container log streamer so the SSE + multiplex handling
|
|
// matches /api/projects/.../instances/.../logs exactly.
|
|
func (s *Server) streamStaticSiteLogs(w http.ResponseWriter, r *http.Request) {
|
|
id := chi.URLParam(r, "id")
|
|
site, err := s.store.GetStaticSiteByID(id)
|
|
if err != nil {
|
|
if errors.Is(err, store.ErrNotFound) {
|
|
respondNotFound(w, "site")
|
|
return
|
|
}
|
|
slog.Error("failed to get site", "site_id", id, "error", err)
|
|
respondError(w, http.StatusInternalServerError, "failed to get site")
|
|
return
|
|
}
|
|
if site.ContainerID == "" {
|
|
respondError(w, http.StatusConflict, "site has no container")
|
|
return
|
|
}
|
|
s.streamLogsForContainer(w, r, site.ContainerID)
|
|
}
|
|
|
|
// listTopContainers handles GET /api/system/stats/top?limit=5&by=cpu.
|
|
// Returns the top-N most recent samples across containers, sorted by CPU or
|
|
// memory. Container IDs are stripped for non-admins so a low-privilege viewer
|
|
// cannot enumerate workloads outside their scope.
|
|
func (s *Server) listTopContainers(w http.ResponseWriter, r *http.Request) {
|
|
limit := 5
|
|
if raw := r.URL.Query().Get("limit"); raw != "" {
|
|
if n, err := strconv.Atoi(raw); err == nil && n > 0 && n <= 50 {
|
|
limit = n
|
|
}
|
|
}
|
|
by := r.URL.Query().Get("by")
|
|
if by != "memory" {
|
|
by = "cpu"
|
|
}
|
|
|
|
// Samples must be at least as recent as max(2*interval, 2 minutes) so the
|
|
// list reflects near-current load even when collection is sparse.
|
|
window := topConsumerMinWindow
|
|
if settings, err := s.store.GetSettings(); err == nil && settings.StatsIntervalSeconds > 0 {
|
|
if w := time.Duration(settings.StatsIntervalSeconds*2) * time.Second; w > window {
|
|
window = w
|
|
}
|
|
}
|
|
|
|
samples, err := s.store.ListAllRecentContainerStatsSamples(sinceTimestamp(window))
|
|
if err != nil {
|
|
slog.Error("failed to list container samples for top", "error", err)
|
|
respondError(w, http.StatusInternalServerError, "failed to list samples")
|
|
return
|
|
}
|
|
|
|
// Keep only the latest sample per container.
|
|
latest := make(map[string]store.ContainerStatsSample, len(samples))
|
|
for _, sm := range samples {
|
|
if prev, ok := latest[sm.ContainerID]; !ok || sm.TS > prev.TS {
|
|
latest[sm.ContainerID] = sm
|
|
}
|
|
}
|
|
|
|
top := make([]store.ContainerStatsSample, 0, len(latest))
|
|
for _, sm := range latest {
|
|
top = append(top, sm)
|
|
}
|
|
|
|
sort.Slice(top, func(i, j int) bool {
|
|
if by == "memory" {
|
|
return top[i].MemoryUsage > top[j].MemoryUsage
|
|
}
|
|
return top[i].CPUPercent > top[j].CPUPercent
|
|
})
|
|
if len(top) > limit {
|
|
top = top[:limit]
|
|
}
|
|
|
|
// Resolve owner names so the UI can show "project/stage" or the site name
|
|
// without a per-row round trip.
|
|
enriched := s.enrichWithOwnerNames(top)
|
|
|
|
// Scrub container IDs for non-admins. The owner name is the actionable
|
|
// identifier; the container ID is a host-level handle that reveals
|
|
// workload existence to viewers who shouldn't have it.
|
|
claims, _ := auth.ClaimsFromContext(r.Context())
|
|
if claims.Role != "admin" {
|
|
for i := range enriched {
|
|
enriched[i].ContainerID = ""
|
|
}
|
|
}
|
|
|
|
respondJSON(w, http.StatusOK, enriched)
|
|
}
|
|
|
|
// enrichWithOwnerNames attaches a human-readable owner name to each sample.
|
|
// Looks up instances and sites in batch so the cost is independent of the
|
|
// number of samples (which is at most 'limit').
|
|
func (s *Server) enrichWithOwnerNames(samples []store.ContainerStatsSample) []TopContainerSample {
|
|
out := make([]TopContainerSample, len(samples))
|
|
for i, sm := range samples {
|
|
out[i] = TopContainerSample{ContainerStatsSample: sm}
|
|
switch sm.OwnerType {
|
|
case stats.OwnerTypeInstance:
|
|
out[i].OwnerName = s.lookupInstanceName(sm.OwnerID)
|
|
case stats.OwnerTypeSite:
|
|
out[i].OwnerName = s.lookupSiteName(sm.OwnerID)
|
|
}
|
|
}
|
|
return out
|
|
}
|
|
|
|
// lookupInstanceName returns "workload/role" for a container row, or empty
|
|
// on any lookup error so a transient miss does not break the response.
|
|
func (s *Server) lookupInstanceName(instanceID string) string {
|
|
c, err := s.store.GetContainerByID(instanceID)
|
|
if err != nil {
|
|
return ""
|
|
}
|
|
w, err := s.store.GetWorkloadByID(c.WorkloadID)
|
|
if err != nil {
|
|
if c.Role != "" {
|
|
return c.Role
|
|
}
|
|
return ""
|
|
}
|
|
if c.Role != "" {
|
|
return w.Name + "/" + c.Role
|
|
}
|
|
return w.Name
|
|
}
|
|
|
|
// lookupSiteName returns the site's display name or empty on lookup error.
|
|
func (s *Server) lookupSiteName(siteID string) string {
|
|
site, err := s.store.GetStaticSiteByID(siteID)
|
|
if err != nil {
|
|
return ""
|
|
}
|
|
return site.Name
|
|
}
|