refactor(workload): extract Instance entirely; Container is canonical
Build / build (push) Successful in 10m41s

End-to-end extraction of the Instance concept. After this commit:

  * internal/store/instances.go — DELETED
  * internal/store/models.go — Instance struct gone, ProxyRoute moved here
  * The containers table is the single source of truth for project/stack/site
    container state. The instances table is dropped via a DROP TABLE
    migration (idempotent; re-runnable on every boot).
  * Legacy tinyforge.project / tinyforge.stage / tinyforge.instance-id
    Docker labels are no longer emitted; only tinyforge.workload.{id,kind},
    tinyforge.role, and tinyforge.managed are stamped on new containers.

Backend rewrites:
  - internal/deployer:        executeDeploy + blueGreenDeploy + rollback +
                              promote use store.Container natively. New
                              removeContainer() replaces removeInstance().
                              enforceMaxInstances reads via
                              ListContainersByStageID.
  - internal/reconciler:      legacy tinyforge.instance-id dispatch removed;
                              upsertByWorkloadLabel now finds existing rows
                              by Docker container ID first and falls back to
                              the deterministic workloadID:role key.
  - internal/stale/scanner:   Scan + new FindStaleContainers walk the
                              containers table; emit StaleContainer JSON.
  - internal/stats/collector: ListContainers replaces ListAllInstances.
  - internal/webhook/handler: workload-secret lookup tried first; falls back
                              to project / static_site secret column.
  - internal/api: instances.go, stale.go, stats.go, stats_history.go,
                  projects.go, settings.go, docker.go, dns.go all read /
                  write through Container.

Docker layer:
  - ManagedContainer exposes WorkloadID/Kind/Role from the canonical labels.
  - ListContainers filters by tinyforge.managed=true.
  - Network creation uses LabelManaged instead of LabelProject.

Frontend:
  - Instance type is now a Container alias; .status → .state,
    .last_alive_at → .last_seen_at.
  - InstanceCard takes stageId as a prop (no longer derived from Instance).
  - StaleContainer JSON shape rewritten: { container, workload_name, role,
    days_stale }. StaleContainerCard + /containers/stale page updated.
  - ProjectCard / homepage / SystemHealthCard filter by .state.

The migration loop now tolerates "no such table" alongside "duplicate
column" / "already exists" so obsolete ALTER TABLE entries targeting the
dropped instances table no-op cleanly on first boot.

Tests: store + deployer + reconciler + webhook + staticsite + notify all
still pass. Frontend svelte-check: zero errors.
This commit is contained in:
2026-05-09 14:43:12 +03:00
parent d516462750
commit d8ab22876f
32 changed files with 649 additions and 957 deletions
+45 -51
View File
@@ -19,59 +19,58 @@ func (s *Server) listStaleContainers(w http.ResponseWriter, r *http.Request) {
return
}
staleInstances, err := s.staleScanner.FindStaleInstances(r.Context())
staleRows, err := s.staleScanner.FindStaleContainers(r.Context())
if err != nil {
slog.Error("failed to find stale containers", "error", err)
respondError(w, http.StatusInternalServerError, "failed to find stale containers")
return
}
if staleInstances == nil {
staleInstances = []stale.StaleInstance{}
if staleRows == nil {
staleRows = []stale.StaleContainer{}
}
respondJSON(w, http.StatusOK, staleInstances)
respondJSON(w, http.StatusOK, staleRows)
}
// cleanupStaleContainer handles POST /api/containers/stale/{id}/cleanup.
// Stops the Docker container, removes the NPM proxy, and deletes the instance from the store.
// Stops the Docker container, removes the proxy route, and deletes the
// container row. {id} is the container row ID.
func (s *Server) cleanupStaleContainer(w http.ResponseWriter, r *http.Request) {
instanceID := chi.URLParam(r, "id")
id := chi.URLParam(r, "id")
inst, err := s.store.GetInstanceByID(instanceID)
c, err := s.store.GetContainerByID(id)
if err != nil {
if errors.Is(err, store.ErrNotFound) {
respondNotFound(w, "instance")
respondNotFound(w, "container")
return
}
slog.Error("failed to get instance", "instance_id", instanceID, "error", err)
respondError(w, http.StatusInternalServerError, "failed to get instance")
slog.Error("failed to get container", "id", id, "error", err)
respondError(w, http.StatusInternalServerError, "failed to get container")
return
}
// Don't remove instances already being cleaned up.
if inst.Status == "removing" {
respondError(w, http.StatusConflict, "instance is already being removed")
if c.State == "removing" {
respondError(w, http.StatusConflict, "container is already being removed")
return
}
if err := s.cleanupInstance(r, inst); err != nil {
slog.Error("failed to cleanup instance", "instance_id", instanceID, "error", err)
respondError(w, http.StatusInternalServerError, "failed to cleanup instance")
if err := s.cleanupContainer(r, c); err != nil {
slog.Error("failed to cleanup container", "id", id, "error", err)
respondError(w, http.StatusInternalServerError, "failed to cleanup container")
return
}
respondJSON(w, http.StatusOK, map[string]string{"cleaned": instanceID})
respondJSON(w, http.StatusOK, map[string]string{"cleaned": id})
}
// bulkCleanupStaleContainers handles POST /api/containers/stale/cleanup.
// Cleans up all currently stale containers.
func (s *Server) bulkCleanupStaleContainers(w http.ResponseWriter, r *http.Request) {
if s.staleScanner == nil {
respondError(w, http.StatusServiceUnavailable, "stale scanner not initialized")
return
}
staleInstances, err := s.staleScanner.FindStaleInstances(r.Context())
staleRows, err := s.staleScanner.FindStaleContainers(r.Context())
if err != nil {
slog.Error("failed to find stale containers for bulk cleanup", "error", err)
respondError(w, http.StatusInternalServerError, "failed to find stale containers")
@@ -81,17 +80,17 @@ func (s *Server) bulkCleanupStaleContainers(w http.ResponseWriter, r *http.Reque
var cleaned []string
var failed []string
for _, si := range staleInstances {
if si.Instance.Status == "removing" {
for _, sc := range staleRows {
if sc.Container.State == "removing" {
continue
}
if err := s.cleanupInstance(r, si.Instance); err != nil {
if err := s.cleanupContainer(r, sc.Container); err != nil {
slog.Error("bulk stale cleanup failed",
"instance_id", si.Instance.ID, "error", err)
failed = append(failed, si.Instance.ID)
"id", sc.Container.ID, "error", err)
failed = append(failed, sc.Container.ID)
continue
}
cleaned = append(cleaned, si.Instance.ID)
cleaned = append(cleaned, sc.Container.ID)
}
respondJSON(w, http.StatusOK, map[string]any{
@@ -100,53 +99,48 @@ func (s *Server) bulkCleanupStaleContainers(w http.ResponseWriter, r *http.Reque
})
}
// cleanupInstance stops a Docker container, removes the NPM proxy, deletes
// the store record, and emits an event.
func (s *Server) cleanupInstance(r *http.Request, inst store.Instance) error {
// cleanupContainer stops a Docker container, removes its proxy route,
// deletes the container row, and emits an event.
func (s *Server) cleanupContainer(r *http.Request, c store.Container) error {
ctx := r.Context()
// Mark as removing.
if err := s.store.UpdateInstanceStatus(inst.ID, "removing"); err != nil {
slog.Warn("stale cleanup: update status to removing", "instance_id", inst.ID, "error", err)
if err := s.store.UpdateContainerState(c.ID, "removing"); err != nil {
slog.Warn("stale cleanup: update state to removing", "id", c.ID, "error", err)
}
// Stop and remove Docker container.
if inst.ContainerID != "" {
if err := s.docker.StopContainer(ctx, inst.ContainerID, 10); err != nil {
slog.Warn("stale cleanup: stop container", "container_id", inst.ContainerID, "error", err)
if c.ContainerID != "" {
if err := s.docker.StopContainer(ctx, c.ContainerID, 10); err != nil {
slog.Warn("stale cleanup: stop container", "container_id", c.ContainerID, "error", err)
}
if err := s.docker.RemoveContainer(ctx, inst.ContainerID, true); err != nil {
slog.Warn("stale cleanup: remove container", "container_id", inst.ContainerID, "error", err)
if err := s.docker.RemoveContainer(ctx, c.ContainerID, true); err != nil {
slog.Warn("stale cleanup: remove container", "container_id", c.ContainerID, "error", err)
}
}
// Delete proxy route if present.
if inst.ProxyRouteID != "" {
if err := s.proxyProvider.DeleteRoute(ctx, inst.ProxyRouteID); err != nil {
slog.Warn("stale cleanup: delete proxy route", "route_id", inst.ProxyRouteID, "error", err)
if c.ProxyRouteID != "" {
if err := s.proxyProvider.DeleteRoute(ctx, c.ProxyRouteID); err != nil {
slog.Warn("stale cleanup: delete proxy route", "route_id", c.ProxyRouteID, "error", err)
}
}
// Delete instance record.
if err := s.store.DeleteInstance(inst.ID); err != nil {
if err := s.store.DeleteContainer(c.ID); err != nil {
return err
}
// Emit cleanup event.
s.emitStaleCleanupEvent(inst)
s.emitStaleCleanupEvent(c)
return nil
}
// emitStaleCleanupEvent publishes an event when a stale container is cleaned up.
func (s *Server) emitStaleCleanupEvent(inst store.Instance) {
msg := "Stale container cleaned up: " + inst.ID + " (tag: " + inst.ImageTag + ")"
func (s *Server) emitStaleCleanupEvent(c store.Container) {
msg := "Stale container cleaned up: " + c.ID + " (tag: " + c.ImageTag + ")"
evt, err := s.store.InsertEvent(store.EventLog{
Source: "stale_cleanup",
Severity: "info",
Message: msg,
Metadata: `{"instance_id":"` + inst.ID + `","project_id":"` + inst.ProjectID + `","stage_id":"` + inst.StageID + `"}`,
Metadata: `{"container_id":"` + c.ID + `","workload_id":"` + c.WorkloadID + `","role":"` + c.Role + `"}`,
})
if err != nil {
slog.Error("stale cleanup: failed to persist event", "error", err)
@@ -158,9 +152,9 @@ func (s *Server) emitStaleCleanupEvent(inst store.Instance) {
Payload: events.EventLogPayload{
ID: evt.ID,
Source: "stale_cleanup",
Severity: "info",
Message: msg,
Metadata: evt.Metadata,
Severity: "info",
Message: msg,
Metadata: evt.Metadata,
CreatedAt: evt.CreatedAt,
},
})