package api

import (
	"context"
	"encoding/json"
	"errors"
	"log/slog"
	"net/http"
	"sync"
	"time"

	"github.com/go-chi/chi/v5"

	"github.com/alexei/tinyforge/internal/store"
)

// storageProbeCache memoizes the `du` result per workload for a short
// window so a tight polling loop on /storage cannot turn into one
// `docker exec du` per request. The TTL is intentionally short — the
// panel is a coarse usage indicator, not a real-time meter.
//
// storageProbeMu guards every read and write of storageProbeCache; the
// map is keyed by workload ID.
var (
	storageProbeCacheTTL = 30 * time.Second
	storageProbeMu sync.Mutex
	storageProbeCache = map[string]storageProbeEntry{}
)

// storageProbeEntry is one cached probe outcome for a workload. Failed
// probes are cached as well (probeOk=false) so a broken container does
// not trigger a fresh probe on every request.
type storageProbeEntry struct {
	// at is the time the entry was stored; it is compared against
	// storageProbeCacheTTL to decide whether the entry is still fresh.
	at time.Time
	// usage is the used-bytes figure that is served from the cache.
	usage int64
	// probeOk records whether the probe that produced this entry
	// succeeded; when false the handler reports a probe error.
	probeOk bool
}

// Runtime endpoints surface what the legacy /api/sites/* surface used
// to expose on the static-site detail page: the last commit SHA / last
// sync timestamp / status persisted by the static plugin in
// containers.extra_json, the data-volume disk usage, and stop / start
// controls that don't require a full re-deploy.
//
// The handlers are deliberately decoupled from the plugin interface so
// they work uniformly across source kinds: stop/start operate on the
// Docker container IDs stored in the containers index regardless of
// kind; runtime-state reads what the source persisted (currently only
// "static" writes a structured blob); storage usage is static-only
// today but the endpoint shape allows future sources to opt in.

// runtimeStatePayload is the JSON shape returned by
// GET /api/workloads/{id}/runtime-state.
//
// SourceKind is always present so the UI can decide whether to render
// the static-specific fields (last_commit_sha, last_sync_at, ...). The
// container-row fields (ContainerID, State) come from the canonical
// containers row that the static plugin maintains under the
// deterministic ID `{workload-id}:site`.
type runtimeStatePayload struct { SourceKind string `json:"source_kind"` HasState bool `json:"has_state"` ContainerID string `json:"container_id,omitempty"` State string `json:"state,omitempty"` Status string `json:"status,omitempty"` LastCommitSHA string `json:"last_commit_sha,omitempty"` LastSyncAt string `json:"last_sync_at,omitempty"` LastError string `json:"last_error,omitempty"` } // getWorkloadRuntimeState handles GET /api/workloads/{id}/runtime-state. // Reads the typed state the static plugin writes into containers.extra_json // (see internal/workload/plugin/source/static/state.go). Non-static // source kinds return SourceKind + HasState=false; the panel hides // itself rather than the endpoint 404ing. func (s *Server) getWorkloadRuntimeState(w http.ResponseWriter, r *http.Request) { id := chi.URLParam(r, "id") workload, err := s.store.GetWorkloadByID(id) if err != nil { if errors.Is(err, store.ErrNotFound) { respondNotFound(w, "workload") return } slog.Error("get workload for runtime-state", "workload", id, "error", err) respondError(w, http.StatusInternalServerError, "internal server error") return } payload := runtimeStatePayload{SourceKind: workload.SourceKind} if workload.SourceKind != "static" { respondJSON(w, http.StatusOK, payload) return } // The static plugin owns one container row per workload at the // deterministic ID :site. A missing row means the // workload has never been deployed — return HasState=false so the // UI can prompt the operator to deploy. row, err := s.store.GetContainerByID(id + ":site") if err != nil { if errors.Is(err, store.ErrNotFound) { respondJSON(w, http.StatusOK, payload) return } slog.Error("get container row for runtime-state", "workload", id, "error", err) respondError(w, http.StatusInternalServerError, "internal server error") return } payload.HasState = true payload.ContainerID = row.ContainerID payload.State = row.State // extra_json is the source of truth for the typed runtime fields. 
// A decode failure is non-fatal: we still report container_id / // state so the UI is useful, just without the sync history. // // No mutex here even though the writer (state.go saveState) holds // a per-workload mutex on read-modify-write — SQLite returns the // ExtraJSON column as a fully-materialized string from a single // SELECT, so the reader sees either the pre- or post-write snapshot // atomically. There is no torn read to defend against. if row.ExtraJSON != "" && row.ExtraJSON != "{}" { var st struct { Status string `json:"status"` LastCommitSHA string `json:"last_commit_sha"` LastSyncAt string `json:"last_sync_at"` LastError string `json:"last_error"` } if err := json.Unmarshal([]byte(row.ExtraJSON), &st); err != nil { slog.Debug("decode extra_json for runtime-state", "workload", id, "error", err) } else { payload.Status = st.Status payload.LastCommitSHA = st.LastCommitSHA payload.LastSyncAt = st.LastSyncAt payload.LastError = st.LastError } } respondJSON(w, http.StatusOK, payload) } // storageUsagePayload is the JSON shape returned by // GET /api/workloads/{id}/storage. ProbeError surfaces a non-fatal // failure to compute used_bytes (du timed out, exec returned non-zero, // etc.) so the UI can render "usage unavailable" instead of an // always-zero number. type storageUsagePayload struct { SourceKind string `json:"source_kind"` Enabled bool `json:"enabled"` UsedBytes int64 `json:"used_bytes"` LimitMB int `json:"limit_mb,omitempty"` ProbeError string `json:"probe_error,omitempty"` } // getWorkloadStorage handles GET /api/workloads/{id}/storage. // // For static workloads with storage enabled, execs `du -sb /app/data` // inside the running container to compute the data volume's footprint. // For workloads without storage (or non-static source kinds), returns // Enabled=false and zero usage so the UI can hide the panel. 
func (s *Server) getWorkloadStorage(w http.ResponseWriter, r *http.Request) { id := chi.URLParam(r, "id") workload, err := s.store.GetWorkloadByID(id) if err != nil { if errors.Is(err, store.ErrNotFound) { respondNotFound(w, "workload") return } slog.Error("get workload for storage", "workload", id, "error", err) respondError(w, http.StatusInternalServerError, "internal server error") return } payload := storageUsagePayload{SourceKind: workload.SourceKind} if workload.SourceKind != "static" { respondJSON(w, http.StatusOK, payload) return } // Decode storage knobs from source_config. Missing / malformed // blobs are treated as storage-disabled rather than erroring; the // validator that runs on workload create already rejects invalid // configs at the source. var cfg struct { StorageEnabled bool `json:"storage_enabled"` StorageLimitMB int `json:"storage_limit_mb"` } if workload.SourceConfig != "" { if err := json.Unmarshal([]byte(workload.SourceConfig), &cfg); err != nil { // Validator catches malformed configs at create-time, so // this is unexpected — log so a drifted row is traceable. slog.Debug("decode source_config for storage", "workload", id, "error", err) } } payload.Enabled = cfg.StorageEnabled payload.LimitMB = cfg.StorageLimitMB if !cfg.StorageEnabled || s.docker == nil { respondJSON(w, http.StatusOK, payload) return } // Cache hit short-circuits the docker exec entirely so a polling // frontend cannot turn this into a per-request `du`. storageProbeMu.Lock() if cached, ok := storageProbeCache[id]; ok && time.Since(cached.at) < storageProbeCacheTTL { storageProbeMu.Unlock() payload.UsedBytes = cached.usage if !cached.probeOk { payload.ProbeError = "storage probe unavailable" } respondJSON(w, http.StatusOK, payload) return } storageProbeMu.Unlock() // Find the running container. The static plugin's canonical row is // at :site; we also tolerate workloads whose plugin produced // multiple containers by scanning the index. 
containers, err := s.store.ListContainersByWorkload(id) if err != nil { slog.Error("list containers for storage", "workload", id, "error", err) respondError(w, http.StatusInternalServerError, "internal server error") return } probeOk := false for _, c := range containers { if c.ContainerID == "" { continue } // 15s budget — `du` on a Hugo-style `public/` with tens of // thousands of files and a cold page cache can run several // seconds. The cache above keeps the amortized cost small. ctx, cancel := context.WithTimeout(r.Context(), 15*time.Second) usage, err := s.docker.InspectSiteStorageUsage(ctx, c.ContainerID) cancel() if err != nil { slog.Debug("storage usage probe failed", "workload", id, "container", c.ContainerID, "error", err) continue } payload.UsedBytes = usage.UsedBytes probeOk = true break } if !probeOk { payload.ProbeError = "storage probe unavailable" } storageProbeMu.Lock() storageProbeCache[id] = storageProbeEntry{at: time.Now(), usage: payload.UsedBytes, probeOk: probeOk} storageProbeMu.Unlock() respondJSON(w, http.StatusOK, payload) } // stopStartResult is the JSON shape returned by both stop and start // handlers — counts so the UI can show "1 of 2 containers stopped". type stopStartResult struct { Touched int `json:"touched"` Failed int `json:"failed"` } // stopPluginWorkload handles POST /api/workloads/{id}/stop. // // Stops every container row belonging to the workload via Docker. Does // not remove containers or update runtime state — the reconciler // (internal/workload/plugin/source/static/reconcile.go) flips state to // "stopped"/"failed" on its next pass, and the user can immediately see // the new Docker state via /api/workloads/{id}/containers. // // Returning 200 with a `{touched, failed}` envelope even on partial // failures so the UI can surface "2 of 3 stopped" rather than treating // the whole call as red. 
func (s *Server) stopPluginWorkload(w http.ResponseWriter, r *http.Request) { id := chi.URLParam(r, "id") if _, err := s.store.GetWorkloadByID(id); err != nil { if errors.Is(err, store.ErrNotFound) { respondNotFound(w, "workload") return } slog.Error("get workload for stop", "workload", id, "error", err) respondError(w, http.StatusInternalServerError, "internal server error") return } if s.docker == nil { respondError(w, http.StatusServiceUnavailable, "docker client unavailable") return } containers, err := s.store.ListContainersByWorkload(id) if err != nil { slog.Error("list containers for stop", "workload", id, "error", err) respondError(w, http.StatusInternalServerError, "internal server error") return } result := stopStartResult{} for _, c := range containers { if c.ContainerID == "" { continue } // 30s per-container ctx budget; the third arg to StopContainer // is the in-container SIGTERM grace period before SIGKILL. ctx, cancel := context.WithTimeout(r.Context(), 30*time.Second) if err := s.docker.StopContainer(ctx, c.ContainerID, 10); err != nil { slog.Warn("stop container failed", "workload", id, "container", c.ContainerID, "error", err) result.Failed++ } else { result.Touched++ } cancel() } if result.Touched == 0 && result.Failed == 0 { // No live container row to act on — distinguish from a successful // stop of zero containers so the UI can show "nothing to stop" // rather than a misleading green toast. respondError(w, http.StatusConflict, "no running container to stop") return } if result.Touched == 0 && result.Failed > 0 { respondError(w, http.StatusBadGateway, "all containers failed to stop") return } respondJSON(w, http.StatusOK, result) } // startPluginWorkload handles POST /api/workloads/{id}/start. // // Calls `docker start` on every container row belonging to the // workload. Does not redeploy or recreate; if the container has been // removed externally, start returns an error and the operator should // click Deploy. 
Same partial-failure envelope as stop. func (s *Server) startPluginWorkload(w http.ResponseWriter, r *http.Request) { id := chi.URLParam(r, "id") if _, err := s.store.GetWorkloadByID(id); err != nil { if errors.Is(err, store.ErrNotFound) { respondNotFound(w, "workload") return } slog.Error("get workload for start", "workload", id, "error", err) respondError(w, http.StatusInternalServerError, "internal server error") return } if s.docker == nil { respondError(w, http.StatusServiceUnavailable, "docker client unavailable") return } containers, err := s.store.ListContainersByWorkload(id) if err != nil { slog.Error("list containers for start", "workload", id, "error", err) respondError(w, http.StatusInternalServerError, "internal server error") return } result := stopStartResult{} for _, c := range containers { if c.ContainerID == "" { continue } ctx, cancel := context.WithTimeout(r.Context(), 30*time.Second) if err := s.docker.StartContainer(ctx, c.ContainerID); err != nil { slog.Warn("start container failed", "workload", id, "container", c.ContainerID, "error", err) result.Failed++ } else { result.Touched++ } cancel() } if result.Touched == 0 && result.Failed == 0 { // No persisted container — deploy first to materialize one. respondError(w, http.StatusConflict, "no container to start; deploy first") return } if result.Touched == 0 && result.Failed > 0 { respondError(w, http.StatusBadGateway, "all containers failed to start") return } respondJSON(w, http.StatusOK, result) }