refactor(workload): extract Instance entirely; Container is canonical
Build / build (push) Successful in 10m41s

End-to-end extraction (i.e. removal) of the Instance concept. After this commit:

  * internal/store/instances.go — DELETED
  * internal/store/models.go — Instance struct gone, ProxyRoute moved here
  * The containers table is the single source of truth for project/stack/site
    container state. The instances table is dropped via a DROP TABLE migration
    (idempotent; re-runnable on every boot).
  * Legacy tinyforge.project / tinyforge.stage / tinyforge.instance-id
    Docker labels are no longer emitted; only tinyforge.workload.{id,kind},
    tinyforge.role, and tinyforge.managed are stamped on new containers.

Backend rewrites:
  - internal/deployer:        executeDeploy + blueGreenDeploy + rollback +
                              promote use store.Container natively. New
                              removeContainer() replaces removeInstance().
                              enforceMaxInstances reads via
                              ListContainersByStageID.
  - internal/reconciler:      legacy tinyforge.instance-id dispatch removed;
                              upsertByWorkloadLabel now finds existing rows
                              by docker container ID first and falls back to
                              the deterministic workloadID:role key.
  - internal/stale/scanner:   Scan + new FindStaleContainers walk the
                              containers table; emit StaleContainer JSON.
  - internal/stats/collector: ListContainers replaces ListAllInstances.
  - internal/webhook/handler: workload-secret lookup tried first; falls back
                              to project / static_site secret column.
  - internal/api: instances.go, stale.go, stats.go, stats_history.go,
                  projects.go, settings.go, docker.go, dns.go all read /
                  write through Container.

Docker layer:
  - ManagedContainer exposes WorkloadID/Kind/Role from the canonical labels.
  - ListContainers filters by tinyforge.managed=true.
  - Network creation uses LabelManaged instead of LabelProject.

Frontend:
  - Instance type is now a Container alias; .status → .state,
    .last_alive_at → .last_seen_at.
  - InstanceCard takes stageId as a prop (no longer derived from Instance).
  - StaleContainer JSON shape rewritten: { container, workload_name, role,
    days_stale }. StaleContainerCard + /containers/stale page updated.
  - ProjectCard / homepage / SystemHealthCard filter by .state.

The migration loop now tolerates "no such table" alongside "duplicate
column" / "already exists" so obsolete ALTER TABLE entries targeting the
dropped instances table no-op cleanly on first boot.

Tests: store + deployer + reconciler + webhook + staticsite + notify all
still pass. Frontend svelte-check: zero errors.
This commit is contained in:
2026-05-09 14:43:12 +03:00
parent d516462750
commit d8ab22876f
32 changed files with 649 additions and 957 deletions
+37 -40
View File
@@ -25,17 +25,17 @@ func (d *Deployer) blueGreenDeploy(
deployID string,
imageTag string,
) (string, string, string, error) {
// Find existing running instance for this stage (the "blue" instance).
existingInstances, err := d.store.GetInstancesByStageID(stage.ID)
// Find existing running container for this stage (the "blue" container).
existing, err := d.store.ListContainersByStageID(stage.ID)
if err != nil {
return "", "", "", fmt.Errorf("get existing instances: %w", err)
return "", "", "", fmt.Errorf("get existing containers: %w", err)
}
var blueInstance *store.Instance
for _, inst := range existingInstances {
if inst.Status == "running" {
instCopy := inst
blueInstance = &instCopy
var blueContainer *store.Container
for _, c := range existing {
if c.State == "running" {
cCopy := c
blueContainer = &cCopy
break
}
}
@@ -84,9 +84,6 @@ func (d *Deployer) blueGreenDeploy(
ExposedPorts: []string{portStr},
NetworkName: settings.Network,
NetworkID: networkID,
Project: project.Name,
Stage: stage.Name,
InstanceID: instanceID,
WorkloadID: workloadID,
WorkloadKind: string(store.WorkloadKindProject),
Role: stage.Name,
@@ -114,25 +111,27 @@ func (d *Deployer) blueGreenDeploy(
return "", "", instanceID, fmt.Errorf("create container: %w", err)
}
// Create instance record.
inst, err := d.store.CreateInstanceWithID(store.Instance{
ID: instanceID,
StageID: stage.ID,
ProjectID: project.ID,
ContainerID: containerID,
ImageTag: imageTag,
Subdomain: subdomain,
Status: "stopped",
Port: project.Port,
// Create container row.
row, err := d.store.CreateContainer(store.Container{
ID: instanceID,
WorkloadID: workloadID,
WorkloadKind: string(store.WorkloadKindProject),
Role: stage.Name,
ContainerID: containerID,
ImageRef: project.Image + ":" + imageTag,
ImageTag: imageTag,
Host: "local",
State: "stopped",
Port: project.Port,
Subdomain: subdomain,
})
if err != nil {
return containerID, "", instanceID, fmt.Errorf("create instance record: %w", err)
return containerID, "", instanceID, fmt.Errorf("create container row: %w", err)
}
instanceID = inst.ID
d.upsertContainerForInstance(project, stage, inst, workloadID)
instanceID = row.ID
if err := d.store.SetDeployInstanceID(deployID, instanceID); err != nil {
slog.Warn("link deploy to instance", "error", err)
slog.Warn("link deploy to container", "error", err)
}
d.logDeploy(deployID, fmt.Sprintf("Blue-green: starting green container %s", containerName), "info")
@@ -140,11 +139,10 @@ func (d *Deployer) blueGreenDeploy(
return containerID, "", instanceID, fmt.Errorf("start container: %w", err)
}
if err := d.store.UpdateInstanceStatus(instanceID, "running"); err != nil {
slog.Warn("update instance status", "error", err)
if err := d.store.UpdateContainerState(instanceID, "running"); err != nil {
slog.Warn("update container state", "error", err)
}
inst.Status = "running"
d.upsertContainerForInstance(project, stage, inst, workloadID)
row.State = "running"
d.publishInstanceStatus(instanceID, project.ID, stage.ID, "running")
// Step 4: Health check the green container.
@@ -181,30 +179,29 @@ func (d *Deployer) blueGreenDeploy(
return containerID, "", instanceID, fmt.Errorf("configure proxy: %w", err)
}
inst.ProxyRouteID = proxyRouteID
row.ProxyRouteID = proxyRouteID
d.logDeploy(deployID, "Blue-green: proxy swapped to green container", "info")
// Create/update DNS record for the green instance.
// Create/update DNS record for the green container.
fqdn := subdomain + "." + settings.Domain
d.ensureDNS(ctx, fqdn, "instance", instanceID, deployID)
} else {
d.logDeploy(deployID, "Blue-green: proxy skipped (disabled for this stage)", "info")
}
inst.Subdomain = subdomain
if err := d.store.UpdateInstance(inst); err != nil {
slog.Warn("update instance with proxy ID", "error", err)
row.Subdomain = subdomain
if err := d.store.UpdateContainer(row); err != nil {
slog.Warn("update container with proxy ID", "error", err)
}
d.upsertContainerForInstance(project, stage, inst, workloadID)
// Step 6: Stop the blue container.
if blueInstance != nil {
d.logDeploy(deployID, fmt.Sprintf("Blue-green: stopping blue instance %s (tag: %s)", blueInstance.ID, blueInstance.ImageTag), "info")
if err := d.removeInstance(ctx, *blueInstance, settings); err != nil {
if blueContainer != nil {
d.logDeploy(deployID, fmt.Sprintf("Blue-green: stopping blue container %s (tag: %s)", blueContainer.ID, blueContainer.ImageTag), "info")
if err := d.removeContainer(ctx, *blueContainer, settings); err != nil {
// Non-fatal: log but continue. Green is already serving traffic.
d.logDeploy(deployID, fmt.Sprintf("Blue-green: warning: failed to remove blue instance: %v", err), "warn")
d.logDeploy(deployID, fmt.Sprintf("Blue-green: warning: failed to remove blue container: %v", err), "warn")
} else {
d.logDeploy(deployID, "Blue-green: blue instance removed", "info")
d.logDeploy(deployID, "Blue-green: blue container removed", "info")
}
}
+66 -103
View File
@@ -376,9 +376,6 @@ func (d *Deployer) executeDeploy(
ExposedPorts: []string{portStr},
NetworkName: settings.Network,
NetworkID: networkID,
Project: project.Name,
Stage: stage.Name,
InstanceID: instanceID,
WorkloadID: workloadID,
WorkloadKind: string(store.WorkloadKindProject),
Role: stage.Name,
@@ -407,26 +404,32 @@ func (d *Deployer) executeDeploy(
}
d.logDeploy(deployID, fmt.Sprintf("Container created (ID: %s)", truncateID(containerID)), "info")
// Create instance record in store with the pre-generated ID.
inst, err := d.store.CreateInstanceWithID(store.Instance{
ID: instanceID,
StageID: stage.ID,
ProjectID: project.ID,
ContainerID: containerID,
ImageTag: imageTag,
Subdomain: subdomain,
Status: "stopped",
Port: project.Port,
// Create container row with the pre-generated ID. The deployer is the
// authoritative writer until the next reconciler tick — it's important
// the row exists before StartContainer so a fast tick doesn't see an
// orphan and mark it missing.
row, err := d.store.CreateContainer(store.Container{
ID: instanceID,
WorkloadID: workloadID,
WorkloadKind: string(store.WorkloadKindProject),
Role: stage.Name,
ContainerID: containerID,
ImageRef: project.Image + ":" + imageTag,
ImageTag: imageTag,
Host: "local",
State: "stopped",
Port: project.Port,
Subdomain: subdomain,
})
if err != nil {
return containerID, proxyRouteID, instanceID, fmt.Errorf("create instance record: %w", err)
return containerID, proxyRouteID, instanceID, fmt.Errorf("create container row: %w", err)
}
instanceID = inst.ID
d.upsertContainerForInstance(project, stage, inst, workloadID)
instanceID = row.ID
// Link deploy to instance.
// Link deploy to container row (the existing Deploy.InstanceID column
// stores the row ID — same value as before, just a renamed concept).
if err := d.store.SetDeployInstanceID(deployID, instanceID); err != nil {
slog.Warn("link deploy to instance", "error", err)
slog.Warn("link deploy to container", "error", err)
}
d.logDeploy(deployID, fmt.Sprintf("Starting container %s", containerName), "info")
@@ -434,15 +437,11 @@ func (d *Deployer) executeDeploy(
return containerID, proxyRouteID, instanceID, fmt.Errorf("start container: %w", err)
}
if err := d.store.UpdateInstanceStatus(instanceID, "running"); err != nil {
slog.Warn("update instance status to running", "error", err)
if err := d.store.UpdateContainerState(instanceID, "running"); err != nil {
slog.Warn("update container state to running", "error", err)
}
if err := d.store.UpdateLastAliveAt(instanceID); err != nil {
slog.Warn("update last_alive_at on deploy", "instance_id", instanceID, "error", err)
}
inst.Status = "running"
inst.LastAliveAt = store.Now()
d.upsertContainerForInstance(project, stage, inst, workloadID)
row.State = "running"
row.LastSeenAt = store.Now()
d.publishInstanceStatus(instanceID, project.ID, stage.ID, "running")
d.logDeploy(deployID, "Container started", "info")
@@ -463,24 +462,22 @@ func (d *Deployer) executeDeploy(
return containerID, proxyRouteID, instanceID, fmt.Errorf("configure proxy: %w", err)
}
// Update instance with proxy route ID.
inst.ProxyRouteID = proxyRouteID
inst.Subdomain = subdomain
if err := d.store.UpdateInstance(inst); err != nil {
slog.Warn("update instance with proxy ID", "error", err)
// Update container row with proxy route ID.
row.ProxyRouteID = proxyRouteID
row.Subdomain = subdomain
if err := d.store.UpdateContainer(row); err != nil {
slog.Warn("update container with proxy ID", "error", err)
}
d.upsertContainerForInstance(project, stage, inst, workloadID)
// Create DNS record for this instance.
// Create DNS record for this container.
fqdn := subdomain + "." + settings.Domain
d.ensureDNS(ctx, fqdn, "instance", instanceID, deployID)
} else {
d.logDeploy(deployID, "Proxy creation skipped (disabled for this stage)", "info")
inst.Subdomain = subdomain
if err := d.store.UpdateInstance(inst); err != nil {
slog.Warn("update instance", "error", err)
row.Subdomain = subdomain
if err := d.store.UpdateContainer(row); err != nil {
slog.Warn("update container", "error", err)
}
d.upsertContainerForInstance(project, stage, inst, workloadID)
}
// Step 5: Health check.
@@ -554,27 +551,27 @@ func (d *Deployer) configureProxy(
return routeID, nil
}
// enforceMaxInstances removes the oldest instances when the stage has reached its limit.
// This makes room for the new deployment.
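// enforceMaxInstances removes the oldest running/stopped containers (via
// removeContainer) once the stage has reached its MaxInstances limit, making
// room for the one new deploy. A MaxInstances of zero or less disables the
// limit. Individual removal failures are logged to the deploy log and skipped.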
func (d *Deployer) enforceMaxInstances(ctx context.Context, stage store.Stage, deployID string, settings store.Settings) error {
if stage.MaxInstances <= 0 {
return nil
}
instances, err := d.store.GetInstancesByStageID(stage.ID)
containers, err := d.store.ListContainersByStageID(stage.ID)
if err != nil {
return fmt.Errorf("get instances for stage: %w", err)
return fmt.Errorf("get containers for stage: %w", err)
}
// Filter to running/stopped instances (not already failed/removing).
var active []store.Instance
for _, inst := range instances {
if inst.Status == "running" || inst.Status == "stopped" {
active = append(active, inst)
// Filter to running/stopped containers (not already failed/removing).
var active []store.Container
for _, c := range containers {
if c.State == "running" || c.State == "stopped" {
active = append(active, c)
}
}
// We need room for one more instance, so remove oldest when at limit.
// We need room for one more container, so remove the oldest when at limit.
removeCount := len(active) - stage.MaxInstances + 1
if removeCount <= 0 {
return nil
@@ -586,57 +583,50 @@ func (d *Deployer) enforceMaxInstances(ctx context.Context, stage store.Stage, d
})
for i := 0; i < removeCount && i < len(active); i++ {
inst := active[i]
d.logDeploy(deployID, fmt.Sprintf("Removing oldest instance %s (tag: %s) to enforce max_instances=%d", inst.ID, inst.ImageTag, stage.MaxInstances), "info")
c := active[i]
d.logDeploy(deployID, fmt.Sprintf("Removing oldest container %s (tag: %s) to enforce max_instances=%d", c.ID, c.ImageTag, stage.MaxInstances), "info")
if err := d.removeInstance(ctx, inst, settings); err != nil {
d.logDeploy(deployID, fmt.Sprintf("Failed to remove instance %s: %v", inst.ID, err), "warn")
if err := d.removeContainer(ctx, c, settings); err != nil {
d.logDeploy(deployID, fmt.Sprintf("Failed to remove container %s: %v", c.ID, err), "warn")
continue
}
d.logDeploy(deployID, fmt.Sprintf("Removed instance %s", inst.ID), "info")
d.logDeploy(deployID, fmt.Sprintf("Removed container %s", c.ID), "info")
}
return nil
}
// removeInstance stops and removes a container, deletes its NPM proxy host,
// and removes the instance record from the store.
func (d *Deployer) removeInstance(ctx context.Context, inst store.Instance, settings store.Settings) error {
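// removeContainer stops + removes the Docker container and deletes its proxy
// route (both best-effort: failures are only logged), drops the DNS record
// when a proxy route was set, and removes the container row from the store.
// Only a failed row delete is returned as an error; a missing row is ignored.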
func (d *Deployer) removeContainer(ctx context.Context, c store.Container, settings store.Settings) error {
// Mark as removing.
if err := d.store.UpdateInstanceStatus(inst.ID, "removing"); err != nil {
slog.Warn("update instance status to removing", "instance_id", inst.ID, "error", err)
if err := d.store.UpdateContainerState(c.ID, "removing"); err != nil {
slog.Warn("update container state to removing", "id", c.ID, "error", err)
}
// Remove Docker container.
if inst.ContainerID != "" {
if err := d.docker.RemoveContainer(ctx, inst.ContainerID, true); err != nil {
slog.Warn("remove container", "container_id", inst.ContainerID, "error", err)
if c.ContainerID != "" {
if err := d.docker.RemoveContainer(ctx, c.ContainerID, true); err != nil {
slog.Warn("remove docker container", "container_id", c.ContainerID, "error", err)
}
}
// Delete proxy route.
if inst.ProxyRouteID != "" {
if err := d.proxy.DeleteRoute(ctx, inst.ProxyRouteID); err != nil {
slog.Warn("delete proxy route", "route_id", inst.ProxyRouteID, "error", err)
if c.ProxyRouteID != "" {
if err := d.proxy.DeleteRoute(ctx, c.ProxyRouteID); err != nil {
slog.Warn("delete proxy route", "route_id", c.ProxyRouteID, "error", err)
}
// Remove DNS record for this instance.
if inst.Subdomain != "" && settings.Domain != "" {
fqdn := inst.Subdomain + "." + settings.Domain
// Remove DNS record.
if c.Subdomain != "" && settings.Domain != "" {
fqdn := c.Subdomain + "." + settings.Domain
d.removeDNS(ctx, fqdn, "")
}
}
// Delete instance record.
if err := d.store.DeleteInstance(inst.ID); err != nil {
return fmt.Errorf("delete instance record: %w", err)
}
// Drop the matching container index row. ID matches instance.ID by
// construction; ignore NotFound which is harmless if the row predates
// this refactor.
if err := d.store.DeleteContainer(inst.ID); err != nil && !errors.Is(err, store.ErrNotFound) {
slog.Warn("delete container row", "instance_id", inst.ID, "error", err)
// Drop the container row.
if err := d.store.DeleteContainer(c.ID); err != nil && !errors.Is(err, store.ErrNotFound) {
return fmt.Errorf("delete container row: %w", err)
}
return nil
@@ -903,33 +893,6 @@ func truncateID(id string) string {
return id
}
// upsertContainerForInstance keeps the normalized containers index in sync
// with the project-specific instance row. Same UUID is used for both rows so
// the reconciler can find them later. Best-effort: a sync failure is logged
// but does not abort the deploy — the container is still running and the
// reconciler will pick it up on the next tick (once that lands).
func (d *Deployer) upsertContainerForInstance(project store.Project, stage store.Stage, inst store.Instance, workloadID string) {
c := store.Container{
ID: inst.ID,
WorkloadID: workloadID,
WorkloadKind: string(store.WorkloadKindProject),
Role: stage.Name,
ContainerID: inst.ContainerID,
ImageRef: project.Image + ":" + inst.ImageTag,
ImageTag: inst.ImageTag,
Host: "local",
State: inst.Status,
Port: inst.Port,
Subdomain: inst.Subdomain,
ProxyRouteID: inst.ProxyRouteID,
NpmProxyID: inst.NpmProxyID,
LastSeenAt: inst.LastAliveAt,
}
if err := d.store.UpsertContainer(c); err != nil {
slog.Warn("upsert container row", "instance_id", inst.ID, "error", err)
}
}
// resolveProjectWorkloadID returns the workload ID paired with a project.
// Backfill-on-boot guarantees the row exists, so this is essentially a lookup.
// On miss (defensive), it logs and returns empty so the caller can decide.
+4 -4
View File
@@ -34,13 +34,13 @@ func (d *Deployer) validatePromoteFrom(stage store.Stage, imageTag string) error
}
// Check if the tag is running in the source stage.
instances, err := d.store.GetInstancesByStageID(sourceStage.ID)
containers, err := d.store.ListContainersByStageID(sourceStage.ID)
if err != nil {
return fmt.Errorf("get instances for source stage: %w", err)
return fmt.Errorf("get containers for source stage: %w", err)
}
for _, inst := range instances {
if inst.ImageTag == imageTag && (inst.Status == "running" || inst.Status == "stopped") {
for _, c := range containers {
if c.ImageTag == imageTag && (c.State == "running" || c.State == "stopped") {
return nil // Tag found in source stage, promotion is allowed.
}
}
+8 -7
View File
@@ -32,24 +32,25 @@ func (d *Deployer) rollback(ctx context.Context, deployID string, containerID st
}
}
// Clean up DNS record if the instance had a subdomain.
// Clean up DNS record if the container had a subdomain. instanceID is
// the container row ID (same UUID either way) — read from containers.
if instanceID != "" {
inst, err := d.store.GetInstanceByID(instanceID)
if err == nil && inst.Subdomain != "" {
c, err := d.store.GetContainerByID(instanceID)
if err == nil && c.Subdomain != "" {
settings, settingsErr := d.store.GetSettings()
if settingsErr != nil {
slog.Warn("rollback: failed to get settings for DNS cleanup", "error", settingsErr)
} else if settings.Domain != "" {
fqdn := inst.Subdomain + "." + settings.Domain
fqdn := c.Subdomain + "." + settings.Domain
d.removeDNS(ctx, fqdn, deployID)
}
}
}
// Update instance status to failed if it was created.
// Mark the container row as failed if it was created.
if instanceID != "" {
if err := d.store.UpdateInstanceStatus(instanceID, "failed"); err != nil {
slog.Warn("rollback: update instance status", "instance_id", instanceID, "error", err)
if err := d.store.UpdateContainerState(instanceID, "failed"); err != nil {
slog.Warn("rollback: update container state", "id", instanceID, "error", err)
}
}