refactor(workload): extract Instance entirely; Container is canonical
Build / build (push) Successful in 10m41s

End-to-end extraction of the Instance concept. After this commit:

  * internal/store/instances.go — DELETED
  * internal/store/models.go — Instance struct gone, ProxyRoute moved here
  * containers table is the single source of truth for project/stack/site
    container state. instances table is dropped via DROP TABLE migration
    (idempotent; re-runnable on every boot).
  * Legacy tinyforge.project / tinyforge.stage / tinyforge.instance-id
    Docker labels are no longer emitted; only tinyforge.workload.{id,kind},
    tinyforge.role, and tinyforge.managed are stamped on new containers.

Backend rewrites:
  - internal/deployer:        executeDeploy + blueGreenDeploy + rollback +
                              promote use store.Container natively. New
                              removeContainer() replaces removeInstance().
                              enforceMaxInstances reads via
                              ListContainersByStageID.
  - internal/reconciler:      legacy tinyforge.instance-id dispatch removed;
                              upsertByWorkloadLabel now finds existing rows
                              by docker container ID first and falls back to
                              the deterministic workloadID:role key.
  - internal/stale/scanner:   Scan + new FindStaleContainers walk the
                              containers table; emit StaleContainer JSON.
  - internal/stats/collector: ListContainers replaces ListAllInstances.
  - internal/webhook/handler: workload-secret lookup tried first; falls back
                              to project / static_site secret column.
  - internal/api: instances.go, stale.go, stats.go, stats_history.go,
                  projects.go, settings.go, docker.go, dns.go all read /
                  write through Container.

Docker layer:
  - ManagedContainer exposes WorkloadID/Kind/Role from the canonical labels.
  - ListContainers filters by tinyforge.managed=true.
  - Network creation uses LabelManaged instead of LabelProject.

Frontend:
  - Instance type is now a Container alias; .status → .state,
    .last_alive_at → .last_seen_at.
  - InstanceCard takes stageId as a prop (no longer derived from Instance).
  - StaleContainer JSON shape rewritten: { container, workload_name, role,
    days_stale }. StaleContainerCard + /containers/stale page updated.
  - ProjectCard / homepage / SystemHealthCard filter by .state.

The migration loop now tolerates "no such table" alongside "duplicate
column" / "already exists" so obsolete ALTER TABLE entries targeting the
dropped instances table no-op cleanly on first boot.

Tests: store + deployer + reconciler + webhook + staticsite + notify all
still pass. Frontend svelte-check: zero errors.
This commit is contained in:
2026-05-09 14:43:12 +03:00
parent d516462750
commit d8ab22876f
32 changed files with 649 additions and 957 deletions
+63 -67
View File
@@ -13,48 +13,53 @@ import (
)
// listInstances handles GET /api/projects/{id}/stages/{stage}/instances.
// Reads the normalized container index — the legacy `instances` table is gone.
// JSON shape stays Container-shaped (id, container_id, image_tag, subdomain,
// state, port, etc.), so the frontend type may show some renamed fields
// (status→state, last_alive_at→last_seen_at).
func (s *Server) listInstances(w http.ResponseWriter, r *http.Request) {
stageID := chi.URLParam(r, "stage")
// Verify stage exists.
if _, err := s.store.GetStageByID(stageID); err != nil {
if errors.Is(err, store.ErrNotFound) {
respondNotFound(w, "stage")
return
}
slog.Error("failed to get stage", "error", err)
respondError(w, http.StatusInternalServerError, "internal server error")
return
}
instances, err := s.store.GetInstancesByStageID(stageID)
if err != nil {
slog.Error("failed to list instances", "error", err)
respondError(w, http.StatusInternalServerError, "internal server error")
return
}
// Reconcile instance statuses with Docker's actual state.
containers, err := s.store.ListContainersByStageID(stageID)
if err != nil {
slog.Error("failed to list containers", "error", err)
respondError(w, http.StatusInternalServerError, "internal server error")
return
}
// Reconcile container state with Docker's actual state — covers the
// case where a container was killed externally between deployer writes
// and the next reconciler tick.
ctx := r.Context()
for i, inst := range instances {
if inst.ContainerID == "" || inst.Status == "removing" {
for i, c := range containers {
if c.ContainerID == "" || c.State == "removing" {
continue
}
running, err := s.docker.IsContainerRunning(ctx, inst.ContainerID)
running, err := s.docker.IsContainerRunning(ctx, c.ContainerID)
if err != nil {
continue // Docker unreachable, keep stored status.
continue
}
actualStatus := "stopped"
actual := "stopped"
if running {
actualStatus = "running"
actual = "running"
}
if inst.Status != actualStatus {
instances[i].Status = actualStatus
_ = s.store.UpdateInstanceStatus(inst.ID, actualStatus)
if c.State != actual {
containers[i].State = actual
_ = s.store.UpdateContainerState(c.ID, actual)
}
}
respondJSON(w, http.StatusOK, instances)
respondJSON(w, http.StatusOK, containers)
}
// deployRequest is the expected JSON body for triggering a deploy.
@@ -62,30 +67,28 @@ type deployRequest struct {
ImageTag string `json:"image_tag"`
}
// deployInstance handles POST /api/projects/{id}/stages/{stage}/instances (trigger deploy).
// deployInstance handles POST /api/projects/{id}/stages/{stage}/instances.
func (s *Server) deployInstance(w http.ResponseWriter, r *http.Request) {
projectID := chi.URLParam(r, "id")
stageID := chi.URLParam(r, "stage")
// Verify project exists.
if _, err := s.store.GetProjectByID(projectID); err != nil {
if errors.Is(err, store.ErrNotFound) {
respondNotFound(w, "project")
return
}
slog.Error("failed to get project", "error", err)
respondError(w, http.StatusInternalServerError, "internal server error")
respondError(w, http.StatusInternalServerError, "internal server error")
return
}
// Verify stage exists.
if _, err := s.store.GetStageByID(stageID); err != nil {
if errors.Is(err, store.ErrNotFound) {
respondNotFound(w, "stage")
return
}
slog.Error("failed to get stage", "error", err)
respondError(w, http.StatusInternalServerError, "internal server error")
respondError(w, http.StatusInternalServerError, "internal server error")
return
}
@@ -115,40 +118,41 @@ func (s *Server) deployInstance(w http.ResponseWriter, r *http.Request) {
}
// removeInstance handles DELETE /api/projects/{id}/stages/{stage}/instances/{iid}.
// {iid} is the container row ID (same UUID as the legacy instance ID).
func (s *Server) removeInstance(w http.ResponseWriter, r *http.Request) {
instanceID := chi.URLParam(r, "iid")
id := chi.URLParam(r, "iid")
inst, err := s.store.GetInstanceByID(instanceID)
c, err := s.store.GetContainerByID(id)
if err != nil {
if errors.Is(err, store.ErrNotFound) {
respondNotFound(w, "instance")
respondNotFound(w, "container")
return
}
slog.Error("failed to get instance", "error", err)
respondError(w, http.StatusInternalServerError, "internal server error")
slog.Error("failed to get container", "error", err)
respondError(w, http.StatusInternalServerError, "internal server error")
return
}
// Remove the Docker container if it has one.
if inst.ContainerID != "" {
if err := s.docker.RemoveContainer(r.Context(), inst.ContainerID, true); err != nil {
slog.Error("remove container", "container_id", inst.ContainerID, "error", err)
if c.ContainerID != "" {
if err := s.docker.RemoveContainer(r.Context(), c.ContainerID, true); err != nil {
slog.Error("remove container", "container_id", c.ContainerID, "error", err)
}
}
// Delete proxy route if it has one.
if inst.ProxyRouteID != "" {
if err := s.proxyProvider.DeleteRoute(r.Context(), inst.ProxyRouteID); err != nil {
slog.Warn("delete proxy route on instance removal", "route_id", inst.ProxyRouteID, "error", err)
if c.ProxyRouteID != "" {
if err := s.proxyProvider.DeleteRoute(r.Context(), c.ProxyRouteID); err != nil {
slog.Warn("delete proxy route on container removal", "route_id", c.ProxyRouteID, "error", err)
}
}
// Delete instance record.
if err := s.store.DeleteInstance(instanceID); err != nil {
respondError(w, http.StatusInternalServerError, "failed to delete instance")
// Delete container row.
if err := s.store.DeleteContainer(id); err != nil {
respondError(w, http.StatusInternalServerError, "failed to delete container")
return
}
respondJSON(w, http.StatusOK, map[string]string{"deleted": instanceID})
respondJSON(w, http.StatusOK, map[string]string{"deleted": id})
}
// stopInstance handles POST /api/projects/{id}/stages/{stage}/instances/{iid}/stop.
@@ -166,67 +170,59 @@ func (s *Server) restartInstance(w http.ResponseWriter, r *http.Request) {
s.controlInstance(w, r, "restart")
}
// controlInstance performs a stop/start/restart action on an instance's container.
// controlInstance performs a stop/start/restart action on a container.
func (s *Server) controlInstance(w http.ResponseWriter, r *http.Request, action string) {
instanceID := chi.URLParam(r, "iid")
id := chi.URLParam(r, "iid")
inst, err := s.store.GetInstanceByID(instanceID)
c, err := s.store.GetContainerByID(id)
if err != nil {
if errors.Is(err, store.ErrNotFound) {
respondNotFound(w, "instance")
respondNotFound(w, "container")
return
}
slog.Error("failed to get instance", "error", err)
respondError(w, http.StatusInternalServerError, "internal server error")
slog.Error("failed to get container", "error", err)
respondError(w, http.StatusInternalServerError, "internal server error")
return
}
if inst.ContainerID == "" {
respondError(w, http.StatusBadRequest, "instance has no container")
if c.ContainerID == "" {
respondError(w, http.StatusBadRequest, "container row has no docker container bound")
return
}
ctx := r.Context()
var controlErr error
var newStatus string
var newState string
switch action {
case "stop":
controlErr = s.docker.StopContainer(ctx, inst.ContainerID, 10)
newStatus = "stopped"
controlErr = s.docker.StopContainer(ctx, c.ContainerID, 10)
newState = "stopped"
case "start":
controlErr = s.docker.StartContainer(ctx, inst.ContainerID)
newStatus = "running"
controlErr = s.docker.StartContainer(ctx, c.ContainerID)
newState = "running"
case "restart":
controlErr = s.docker.RestartContainer(ctx, inst.ContainerID, 10)
newStatus = "running"
controlErr = s.docker.RestartContainer(ctx, c.ContainerID, 10)
newState = "running"
default:
respondError(w, http.StatusBadRequest, fmt.Sprintf("unknown action: %s", action))
return
}
if controlErr != nil {
slog.Error("failed to control instance", "action", action, "instance_id", instanceID, "error", controlErr)
slog.Error("failed to control container", "action", action, "id", id, "error", controlErr)
respondError(w, http.StatusInternalServerError, "internal server error")
return
}
// Update status in store.
if err := s.store.UpdateInstanceStatus(instanceID, newStatus); err != nil {
slog.Error("update instance status", "instance_id", instanceID, "status", newStatus, "error", err)
}
// Track last_alive_at when container becomes running.
if newStatus == "running" {
if err := s.store.UpdateLastAliveAt(instanceID); err != nil {
slog.Error("update last_alive_at", "instance_id", instanceID, "error", err)
}
if err := s.store.UpdateContainerState(id, newState); err != nil {
slog.Error("update container state", "id", id, "state", newState, "error", err)
}
respondJSON(w, http.StatusOK, map[string]string{
"instance_id": instanceID,
"instance_id": id,
"action": action,
"status": newStatus,
"status": newState,
})
}