refactor(workload): extract Instance entirely; Container is canonical

End-to-end extraction of the Instance concept. After this commit:

* internal/store/instances.go — DELETED
* internal/store/models.go — Instance struct gone, ProxyRoute moved here
* containers table is the single source of truth for project/stack/site
  container state. The instances table is dropped via a DROP TABLE
  migration (idempotent; re-runnable on every boot).
* Legacy tinyforge.project / tinyforge.stage / tinyforge.instance-id
  Docker labels are no longer emitted; only tinyforge.workload.{id,kind},
  tinyforge.role, and tinyforge.managed are stamped on new containers.

Backend rewrites:

- internal/deployer: executeDeploy, blueGreenDeploy, rollback, and promote
  use store.Container natively. New removeContainer() replaces
  removeInstance(). enforceMaxInstances reads via ListContainersByStageID.
- internal/reconciler: legacy tinyforge.instance-id dispatch removed;
  upsertByWorkloadLabel now finds existing rows by Docker container ID
  first and falls back to the deterministic workloadID:role key.
- internal/stale/scanner: Scan and the new FindStaleContainers walk the
  containers table; emit StaleContainer JSON.
- internal/stats/collector: ListContainers replaces ListAllInstances.
- internal/webhook/handler: the workload-secret lookup is tried first,
  then falls back to the project / static_site secret column.
- internal/api: instances.go, stale.go, stats.go, stats_history.go,
  projects.go, settings.go, docker.go, dns.go all read and write through
  Container.

Docker layer:

- ManagedContainer exposes WorkloadID/Kind/Role from the canonical labels.
- ListContainers filters by tinyforge.managed=true.
- Network creation uses LabelManaged instead of LabelProject.

Frontend:

- Instance type is now a Container alias; .status → .state,
  .last_alive_at → .last_seen_at.
- InstanceCard takes stageId as a prop (no longer derived from Instance).
- StaleContainer JSON shape rewritten:
  { container, workload_name, role, days_stale }.
  StaleContainerCard and the /containers/stale page are updated.
- ProjectCard / homepage / SystemHealthCard filter by .state.

The migration loop now tolerates "no such table" alongside
"duplicate column" / "already exists", so obsolete ALTER TABLE entries
targeting the dropped instances table no-op cleanly on first boot.

Tests: store, deployer, reconciler, webhook, staticsite, and notify all
still pass. Frontend svelte-check: zero errors.
2026-05-09 14:43:12 +03:00
parent d516462750
commit d8ab22876f
32 changed files with 649 additions and 957 deletions
@@ -25,17 +25,17 @@ func (d *Deployer) blueGreenDeploy(
 	deployID string,
 	imageTag string,
 ) (string, string, string, error) {
-	// Find existing running instance for this stage (the "blue" instance).
-	existingInstances, err := d.store.GetInstancesByStageID(stage.ID)
+	// Find existing running container for this stage (the "blue" container).
+	existing, err := d.store.ListContainersByStageID(stage.ID)
 	if err != nil {
-		return "", "", "", fmt.Errorf("get existing instances: %w", err)
+		return "", "", "", fmt.Errorf("get existing containers: %w", err)
 	}

-	var blueInstance *store.Instance
-	for _, inst := range existingInstances {
-		if inst.Status == "running" {
-			instCopy := inst
-			blueInstance = &instCopy
+	var blueContainer *store.Container
+	for _, c := range existing {
+		if c.State == "running" {
+			cCopy := c
+			blueContainer = &cCopy
 			break
 		}
 	}
@@ -84,9 +84,6 @@ func (d *Deployer) blueGreenDeploy(
 		ExposedPorts: []string{portStr},
 		NetworkName:  settings.Network,
 		NetworkID:    networkID,
-		Project:      project.Name,
-		Stage:        stage.Name,
-		InstanceID:   instanceID,
 		WorkloadID:   workloadID,
 		WorkloadKind: string(store.WorkloadKindProject),
 		Role:         stage.Name,
@@ -114,25 +111,27 @@ func (d *Deployer) blueGreenDeploy(
 		return "", "", instanceID, fmt.Errorf("create container: %w", err)
 	}

-	// Create instance record.
-	inst, err := d.store.CreateInstanceWithID(store.Instance{
-		ID:          instanceID,
-		StageID:     stage.ID,
-		ProjectID:   project.ID,
-		ContainerID: containerID,
-		ImageTag:    imageTag,
-		Subdomain:   subdomain,
-		Status:      "stopped",
-		Port:        project.Port,
+	// Create container row.
+	row, err := d.store.CreateContainer(store.Container{
+		ID:           instanceID,
+		WorkloadID:   workloadID,
+		WorkloadKind: string(store.WorkloadKindProject),
+		Role:         stage.Name,
+		ContainerID:  containerID,
+		ImageRef:     project.Image + ":" + imageTag,
+		ImageTag:     imageTag,
+		Host:         "local",
+		State:        "stopped",
+		Port:         project.Port,
+		Subdomain:    subdomain,
 	})
 	if err != nil {
-		return containerID, "", instanceID, fmt.Errorf("create instance record: %w", err)
+		return containerID, "", instanceID, fmt.Errorf("create container row: %w", err)
 	}
-	instanceID = inst.ID
-	d.upsertContainerForInstance(project, stage, inst, workloadID)
+	instanceID = row.ID

 	if err := d.store.SetDeployInstanceID(deployID, instanceID); err != nil {
-		slog.Warn("link deploy to instance", "error", err)
+		slog.Warn("link deploy to container", "error", err)
 	}

 	d.logDeploy(deployID, fmt.Sprintf("Blue-green: starting green container %s", containerName), "info")
@@ -140,11 +139,10 @@ func (d *Deployer) blueGreenDeploy(
 		return containerID, "", instanceID, fmt.Errorf("start container: %w", err)
 	}

-	if err := d.store.UpdateInstanceStatus(instanceID, "running"); err != nil {
-		slog.Warn("update instance status", "error", err)
+	if err := d.store.UpdateContainerState(instanceID, "running"); err != nil {
+		slog.Warn("update container state", "error", err)
 	}
-	inst.Status = "running"
-	d.upsertContainerForInstance(project, stage, inst, workloadID)
+	row.State = "running"
 	d.publishInstanceStatus(instanceID, project.ID, stage.ID, "running")

 	// Step 4: Health check the green container.
@@ -181,30 +179,29 @@ func (d *Deployer) blueGreenDeploy(
 			return containerID, "", instanceID, fmt.Errorf("configure proxy: %w", err)
 		}

-		inst.ProxyRouteID = proxyRouteID
+		row.ProxyRouteID = proxyRouteID
 		d.logDeploy(deployID, "Blue-green: proxy swapped to green container", "info")

-		// Create/update DNS record for the green instance.
+		// Create/update DNS record for the green container.
 		fqdn := subdomain + "." + settings.Domain
 		d.ensureDNS(ctx, fqdn, "instance", instanceID, deployID)
 	} else {
 		d.logDeploy(deployID, "Blue-green: proxy skipped (disabled for this stage)", "info")
 	}

-	inst.Subdomain = subdomain
-	if err := d.store.UpdateInstance(inst); err != nil {
-		slog.Warn("update instance with proxy ID", "error", err)
+	row.Subdomain = subdomain
+	if err := d.store.UpdateContainer(row); err != nil {
+		slog.Warn("update container with proxy ID", "error", err)
 	}
-	d.upsertContainerForInstance(project, stage, inst, workloadID)

 	// Step 6: Stop the blue container.
-	if blueInstance != nil {
-		d.logDeploy(deployID, fmt.Sprintf("Blue-green: stopping blue instance %s (tag: %s)", blueInstance.ID, blueInstance.ImageTag), "info")
-		if err := d.removeInstance(ctx, *blueInstance, settings); err != nil {
+	if blueContainer != nil {
+		d.logDeploy(deployID, fmt.Sprintf("Blue-green: stopping blue container %s (tag: %s)", blueContainer.ID, blueContainer.ImageTag), "info")
+		if err := d.removeContainer(ctx, *blueContainer, settings); err != nil {
 			// Non-fatal: log but continue. Green is already serving traffic.
-			d.logDeploy(deployID, fmt.Sprintf("Blue-green: warning: failed to remove blue instance: %v", err), "warn")
+			d.logDeploy(deployID, fmt.Sprintf("Blue-green: warning: failed to remove blue container: %v", err), "warn")
 		} else {
-			d.logDeploy(deployID, "Blue-green: blue instance removed", "info")
+			d.logDeploy(deployID, "Blue-green: blue container removed", "info")
 		}
 	}

@@ -376,9 +376,6 @@ func (d *Deployer) executeDeploy(
 		ExposedPorts: []string{portStr},
 		NetworkName:  settings.Network,
 		NetworkID:    networkID,
-		Project:      project.Name,
-		Stage:        stage.Name,
-		InstanceID:   instanceID,
 		WorkloadID:   workloadID,
 		WorkloadKind: string(store.WorkloadKindProject),
 		Role:         stage.Name,
@@ -407,26 +404,32 @@ func (d *Deployer) executeDeploy(
 	}
 	d.logDeploy(deployID, fmt.Sprintf("Container created (ID: %s)", truncateID(containerID)), "info")

-	// Create instance record in store with the pre-generated ID.
-	inst, err := d.store.CreateInstanceWithID(store.Instance{
-		ID:          instanceID,
-		StageID:     stage.ID,
-		ProjectID:   project.ID,
-		ContainerID: containerID,
-		ImageTag:    imageTag,
-		Subdomain:   subdomain,
-		Status:      "stopped",
-		Port:        project.Port,
+	// Create container row with the pre-generated ID. The deployer is the
+	// authoritative writer until the next reconciler tick — it's important
+	// the row exists before StartContainer so a fast tick doesn't see an
+	// orphan and mark it missing.
+	row, err := d.store.CreateContainer(store.Container{
+		ID:           instanceID,
+		WorkloadID:   workloadID,
+		WorkloadKind: string(store.WorkloadKindProject),
+		Role:         stage.Name,
+		ContainerID:  containerID,
+		ImageRef:     project.Image + ":" + imageTag,
+		ImageTag:     imageTag,
+		Host:         "local",
+		State:        "stopped",
+		Port:         project.Port,
+		Subdomain:    subdomain,
 	})
 	if err != nil {
-		return containerID, proxyRouteID, instanceID, fmt.Errorf("create instance record: %w", err)
+		return containerID, proxyRouteID, instanceID, fmt.Errorf("create container row: %w", err)
 	}
-	instanceID = inst.ID
-	d.upsertContainerForInstance(project, stage, inst, workloadID)
+	instanceID = row.ID

-	// Link deploy to instance.
+	// Link deploy to container row (the existing Deploy.InstanceID column
+	// stores the row ID — same value as before, just a renamed concept).
 	if err := d.store.SetDeployInstanceID(deployID, instanceID); err != nil {
-		slog.Warn("link deploy to instance", "error", err)
+		slog.Warn("link deploy to container", "error", err)
 	}

 	d.logDeploy(deployID, fmt.Sprintf("Starting container %s", containerName), "info")
@@ -434,15 +437,11 @@ func (d *Deployer) executeDeploy(
 		return containerID, proxyRouteID, instanceID, fmt.Errorf("start container: %w", err)
 	}

-	if err := d.store.UpdateInstanceStatus(instanceID, "running"); err != nil {
-		slog.Warn("update instance status to running", "error", err)
+	if err := d.store.UpdateContainerState(instanceID, "running"); err != nil {
+		slog.Warn("update container state to running", "error", err)
 	}
-	if err := d.store.UpdateLastAliveAt(instanceID); err != nil {
-		slog.Warn("update last_alive_at on deploy", "instance_id", instanceID, "error", err)
-	}
-	inst.Status = "running"
-	inst.LastAliveAt = store.Now()
-	d.upsertContainerForInstance(project, stage, inst, workloadID)
+	row.State = "running"
+	row.LastSeenAt = store.Now()
 	d.publishInstanceStatus(instanceID, project.ID, stage.ID, "running")
 	d.logDeploy(deployID, "Container started", "info")

@@ -463,24 +462,22 @@ func (d *Deployer) executeDeploy(
 			return containerID, proxyRouteID, instanceID, fmt.Errorf("configure proxy: %w", err)
 		}

-		// Update instance with proxy route ID.
-		inst.ProxyRouteID = proxyRouteID
-		inst.Subdomain = subdomain
-		if err := d.store.UpdateInstance(inst); err != nil {
-			slog.Warn("update instance with proxy ID", "error", err)
+		// Update container row with proxy route ID.
+		row.ProxyRouteID = proxyRouteID
+		row.Subdomain = subdomain
+		if err := d.store.UpdateContainer(row); err != nil {
+			slog.Warn("update container with proxy ID", "error", err)
 		}
-		d.upsertContainerForInstance(project, stage, inst, workloadID)

-		// Create DNS record for this instance.
+		// Create DNS record for this container.
 		fqdn := subdomain + "." + settings.Domain
 		d.ensureDNS(ctx, fqdn, "instance", instanceID, deployID)
 	} else {
 		d.logDeploy(deployID, "Proxy creation skipped (disabled for this stage)", "info")
-		inst.Subdomain = subdomain
-		if err := d.store.UpdateInstance(inst); err != nil {
-			slog.Warn("update instance", "error", err)
+		row.Subdomain = subdomain
+		if err := d.store.UpdateContainer(row); err != nil {
+			slog.Warn("update container", "error", err)
 		}
-		d.upsertContainerForInstance(project, stage, inst, workloadID)
 	}

 	// Step 5: Health check.
@@ -554,27 +551,27 @@ func (d *Deployer) configureProxy(
 	return routeID, nil
 }

-// enforceMaxInstances removes the oldest instances when the stage has reached its limit.
-// This makes room for the new deployment.
+// enforceMaxInstances removes the oldest container rows when the stage has
+// reached its instance limit, making room for the new deploy.
 func (d *Deployer) enforceMaxInstances(ctx context.Context, stage store.Stage, deployID string, settings store.Settings) error {
 	if stage.MaxInstances <= 0 {
 		return nil
 	}

-	instances, err := d.store.GetInstancesByStageID(stage.ID)
+	containers, err := d.store.ListContainersByStageID(stage.ID)
 	if err != nil {
-		return fmt.Errorf("get instances for stage: %w", err)
+		return fmt.Errorf("get containers for stage: %w", err)
 	}

-	// Filter to running/stopped instances (not already failed/removing).
-	var active []store.Instance
-	for _, inst := range instances {
-		if inst.Status == "running" || inst.Status == "stopped" {
-			active = append(active, inst)
+	// Filter to running/stopped containers (not already failed/removing).
+	var active []store.Container
+	for _, c := range containers {
+		if c.State == "running" || c.State == "stopped" {
+			active = append(active, c)
 		}
 	}

-	// We need room for one more instance, so remove oldest when at limit.
+	// We need room for one more container, so remove the oldest when at limit.
 	removeCount := len(active) - stage.MaxInstances + 1
 	if removeCount <= 0 {
 		return nil
@@ -586,57 +583,50 @@ func (d *Deployer) enforceMaxInstances(ctx context.Context, stage store.Stage, d
 	})

 	for i := 0; i < removeCount && i < len(active); i++ {
-		inst := active[i]
-		d.logDeploy(deployID, fmt.Sprintf("Removing oldest instance %s (tag: %s) to enforce max_instances=%d", inst.ID, inst.ImageTag, stage.MaxInstances), "info")
+		c := active[i]
+		d.logDeploy(deployID, fmt.Sprintf("Removing oldest container %s (tag: %s) to enforce max_instances=%d", c.ID, c.ImageTag, stage.MaxInstances), "info")

-		if err := d.removeInstance(ctx, inst, settings); err != nil {
-			d.logDeploy(deployID, fmt.Sprintf("Failed to remove instance %s: %v", inst.ID, err), "warn")
+		if err := d.removeContainer(ctx, c, settings); err != nil {
+			d.logDeploy(deployID, fmt.Sprintf("Failed to remove container %s: %v", c.ID, err), "warn")
 			continue
 		}
-		d.logDeploy(deployID, fmt.Sprintf("Removed instance %s", inst.ID), "info")
+		d.logDeploy(deployID, fmt.Sprintf("Removed container %s", c.ID), "info")
 	}

 	return nil
 }

-// removeInstance stops and removes a container, deletes its NPM proxy host,
-// and removes the instance record from the store.
-func (d *Deployer) removeInstance(ctx context.Context, inst store.Instance, settings store.Settings) error {
+// removeContainer stops + removes the Docker container, deletes its proxy
+// route, drops the DNS record, and removes the container row from the store.
+func (d *Deployer) removeContainer(ctx context.Context, c store.Container, settings store.Settings) error {
 	// Mark as removing.
-	if err := d.store.UpdateInstanceStatus(inst.ID, "removing"); err != nil {
-		slog.Warn("update instance status to removing", "instance_id", inst.ID, "error", err)
+	if err := d.store.UpdateContainerState(c.ID, "removing"); err != nil {
+		slog.Warn("update container state to removing", "id", c.ID, "error", err)
 	}

 	// Remove Docker container.
-	if inst.ContainerID != "" {
-		if err := d.docker.RemoveContainer(ctx, inst.ContainerID, true); err != nil {
-			slog.Warn("remove container", "container_id", inst.ContainerID, "error", err)
+	if c.ContainerID != "" {
+		if err := d.docker.RemoveContainer(ctx, c.ContainerID, true); err != nil {
+			slog.Warn("remove docker container", "container_id", c.ContainerID, "error", err)
 		}
 	}

 	// Delete proxy route.
-	if inst.ProxyRouteID != "" {
-		if err := d.proxy.DeleteRoute(ctx, inst.ProxyRouteID); err != nil {
-			slog.Warn("delete proxy route", "route_id", inst.ProxyRouteID, "error", err)
+	if c.ProxyRouteID != "" {
+		if err := d.proxy.DeleteRoute(ctx, c.ProxyRouteID); err != nil {
+			slog.Warn("delete proxy route", "route_id", c.ProxyRouteID, "error", err)
 		}

-		// Remove DNS record for this instance.
-		if inst.Subdomain != "" && settings.Domain != "" {
-			fqdn := inst.Subdomain + "." + settings.Domain
+		// Remove DNS record.
+		if c.Subdomain != "" && settings.Domain != "" {
+			fqdn := c.Subdomain + "." + settings.Domain
 			d.removeDNS(ctx, fqdn, "")
 		}
 	}

-	// Delete instance record.
-	if err := d.store.DeleteInstance(inst.ID); err != nil {
-		return fmt.Errorf("delete instance record: %w", err)
-	}
-
-	// Drop the matching container index row. ID matches instance.ID by
-	// construction; ignore NotFound which is harmless if the row predates
-	// this refactor.
-	if err := d.store.DeleteContainer(inst.ID); err != nil && !errors.Is(err, store.ErrNotFound) {
-		slog.Warn("delete container row", "instance_id", inst.ID, "error", err)
+	// Drop the container row.
+	if err := d.store.DeleteContainer(c.ID); err != nil && !errors.Is(err, store.ErrNotFound) {
+		return fmt.Errorf("delete container row: %w", err)
 	}

 	return nil
@@ -903,33 +893,6 @@ func truncateID(id string) string {
 	return id
 }

-// upsertContainerForInstance keeps the normalized containers index in sync
-// with the project-specific instance row. Same UUID is used for both rows so
-// the reconciler can find them later. Best-effort: a sync failure is logged
-// but does not abort the deploy — the container is still running and the
-// reconciler will pick it up on the next tick (once that lands).
-func (d *Deployer) upsertContainerForInstance(project store.Project, stage store.Stage, inst store.Instance, workloadID string) {
-	c := store.Container{
-		ID:           inst.ID,
-		WorkloadID:   workloadID,
-		WorkloadKind: string(store.WorkloadKindProject),
-		Role:         stage.Name,
-		ContainerID:  inst.ContainerID,
-		ImageRef:     project.Image + ":" + inst.ImageTag,
-		ImageTag:     inst.ImageTag,
-		Host:         "local",
-		State:        inst.Status,
-		Port:         inst.Port,
-		Subdomain:    inst.Subdomain,
-		ProxyRouteID: inst.ProxyRouteID,
-		NpmProxyID:   inst.NpmProxyID,
-		LastSeenAt:   inst.LastAliveAt,
-	}
-	if err := d.store.UpsertContainer(c); err != nil {
-		slog.Warn("upsert container row", "instance_id", inst.ID, "error", err)
-	}
-}
-
 // resolveProjectWorkloadID returns the workload ID paired with a project.
 // Backfill-on-boot guarantees the row exists, so this is essentially a lookup.
 // On miss (defensive), it logs and returns empty so the caller can decide.
@@ -34,13 +34,13 @@ func (d *Deployer) validatePromoteFrom(stage store.Stage, imageTag string) error
 	}

 	// Check if the tag is running in the source stage.
-	instances, err := d.store.GetInstancesByStageID(sourceStage.ID)
+	containers, err := d.store.ListContainersByStageID(sourceStage.ID)
 	if err != nil {
-		return fmt.Errorf("get instances for source stage: %w", err)
+		return fmt.Errorf("get containers for source stage: %w", err)
 	}

-	for _, inst := range instances {
-		if inst.ImageTag == imageTag && (inst.Status == "running" || inst.Status == "stopped") {
+	for _, c := range containers {
+		if c.ImageTag == imageTag && (c.State == "running" || c.State == "stopped") {
 			return nil // Tag found in source stage, promotion is allowed.
 		}
 	}
@@ -32,24 +32,25 @@ func (d *Deployer) rollback(ctx context.Context, deployID string, containerID st
 		}
 	}

-	// Clean up DNS record if the instance had a subdomain.
+	// Clean up DNS record if the container had a subdomain. instanceID is
+	// the container row ID (same UUID either way) — read from containers.
 	if instanceID != "" {
-		inst, err := d.store.GetInstanceByID(instanceID)
-		if err == nil && inst.Subdomain != "" {
+		c, err := d.store.GetContainerByID(instanceID)
+		if err == nil && c.Subdomain != "" {
 			settings, settingsErr := d.store.GetSettings()
 			if settingsErr != nil {
 				slog.Warn("rollback: failed to get settings for DNS cleanup", "error", settingsErr)
 			} else if settings.Domain != "" {
-				fqdn := inst.Subdomain + "." + settings.Domain
+				fqdn := c.Subdomain + "." + settings.Domain
 				d.removeDNS(ctx, fqdn, deployID)
 			}
 		}
 	}

-	// Update instance status to failed if it was created.
+	// Mark the container row as failed if it was created.
 	if instanceID != "" {
-		if err := d.store.UpdateInstanceStatus(instanceID, "failed"); err != nil {
-			slog.Warn("rollback: update instance status", "instance_id", instanceID, "error", err)
+		if err := d.store.UpdateContainerState(instanceID, "failed"); err != nil {
+			slog.Warn("rollback: update container state", "id", instanceID, "error", err)
 		}
 	}