refactor(workload): extract Instance entirely; Container is canonical

End-to-end extraction of the Instance concept. After this commit:

* internal/store/instances.go — DELETED
* internal/store/models.go — Instance struct gone, ProxyRoute moved here
* containers table is the single source of truth for project/stack/site
  container state. instances table is dropped via DROP TABLE migration
  (idempotent; re-runnable on every boot).
* Legacy tinyforge.project / tinyforge.stage / tinyforge.instance-id Docker
  labels are no longer emitted; only tinyforge.workload.{id,kind},
  tinyforge.role, and tinyforge.managed are stamped on new containers.

Backend rewrites:

- internal/deployer: executeDeploy + blueGreenDeploy + rollback + promote use
  store.Container natively. New removeContainer() replaces removeInstance().
  enforceMaxInstances reads via ListContainersByStageID.
- internal/reconciler: legacy tinyforge.instance-id dispatch removed;
  upsertByWorkloadLabel now finds existing rows by docker container ID first
  and falls back to the deterministic workloadID:role key.
- internal/stale/scanner: Scan + new FindStaleContainers walk the containers
  table; emit StaleContainer JSON.
- internal/stats/collector: ListContainers replaces ListAllInstances.
- internal/webhook/handler: workload-secret lookup tried first; falls back to
  project / static_site secret column.
- internal/api: instances.go, stale.go, stats.go, stats_history.go,
  projects.go, settings.go, docker.go, dns.go all read / write through
  Container.

Docker layer:

- ManagedContainer exposes WorkloadID/Kind/Role from the canonical labels.
- ListContainers filters by tinyforge.managed=true.
- Network creation uses LabelManaged instead of LabelProject.

Frontend:

- Instance type is now a Container alias; .status → .state,
  .last_alive_at → .last_seen_at.
- InstanceCard takes stageId as a prop (no longer derived from Instance).
- StaleContainer JSON shape rewritten:
  { container, workload_name, role, days_stale }.
  StaleContainerCard + /containers/stale page updated.
- ProjectCard / homepage / SystemHealthCard filter by .state.

The migration loop now tolerates "no such table" alongside "duplicate column" /
"already exists" so obsolete ALTER TABLE entries targeting the dropped
instances table no-op cleanly on first boot.

Tests: store + deployer + reconciler + webhook + staticsite + notify all still
pass. Frontend svelte-check: zero errors.
2026-05-09 14:43:12 +03:00
parent d516462750
commit d8ab22876f
32 changed files with 649 additions and 957 deletions
@@ -14,16 +14,21 @@ import (
 	"github.com/robfig/cron/v3"
 )

-// StaleInstance holds enriched info about a stale container for API responses.
-type StaleInstance struct {
-	Instance    store.Instance `json:"instance"`
-	ProjectName string         `json:"project_name"`
-	StageName   string         `json:"stage_name"`
-	DaysStale   int            `json:"days_stale"`
+// StaleContainer is a stale container row enriched with the human-readable
+// labels needed to render the Stale view (workload + role + days).
+//
+// The JSON payload nests the full container row under "container". The
+// frontend type was historically named "Instance"; after the workload
+// refactor it consumes Container fields directly.
+type StaleContainer struct {
+	Container    store.Container `json:"container"`
+	WorkloadID   string          `json:"workload_id"`
+	WorkloadName string          `json:"workload_name"`
+	Role         string          `json:"role"`
+	DaysStale    int             `json:"days_stale"`
 }

-// Scanner periodically checks for stale containers that have been
-// non-running for longer than the configured threshold.
+// Scanner periodically checks for containers that have been non-running for
+// longer than the configured threshold.
 type Scanner struct {
 	store    *store.Store
 	docker   *docker.Client
@@ -34,8 +39,8 @@ type Scanner struct {
 	entryID cron.EntryID
 	running bool

-	// knownStale tracks instance IDs that have already had a stale event emitted,
-	// to avoid re-emitting warnings for the same instance.
+	// knownStale tracks container row IDs that have already had a stale event
+	// emitted, to avoid re-emitting the same warning on every tick.
 	knownStale map[string]struct{}
 }

@@ -101,7 +106,7 @@ func (s *Scanner) Stop() {
 }

 // Scan performs a single stale-container scan cycle.
-// It updates last_alive_at for running containers and detects newly stale ones.
+// Updates last_seen_at for running containers and detects newly stale ones.
 func (s *Scanner) Scan(ctx context.Context) error {
 	settings, err := s.store.GetSettings()
 	if err != nil {
@@ -113,67 +118,53 @@ func (s *Scanner) Scan(ctx context.Context) error {
 		thresholdDays = 7
 	}

-	// Get all instances from the store.
-	instances, err := s.store.ListAllInstances()
+	containers, err := s.store.ListContainers(store.ContainerFilter{})
 	if err != nil {
-		return fmt.Errorf("list all instances: %w", err)
+		return fmt.Errorf("list containers: %w", err)
 	}
-
-	if len(instances) == 0 {
+	if len(containers) == 0 {
 		return nil
 	}

-	// Get all managed Docker containers to check live state.
-	containers, err := s.docker.ListContainers(ctx, nil)
+	// Live state from Docker, indexed by Docker container ID so each store
+	// row can be matched against it in a single pass.
+	dockerContainers, err := s.docker.ListContainers(ctx, nil)
 	if err != nil {
 		return fmt.Errorf("list docker containers: %w", err)
 	}
-
-	// Build a lookup: instance ID -> container state.
-	containerStateByInstanceID := make(map[string]string, len(containers))
-	for _, c := range containers {
-		if c.InstanceID != "" {
-			containerStateByInstanceID[c.InstanceID] = c.State
-		}
+	stateByContainerID := make(map[string]string, len(dockerContainers))
+	for _, dc := range dockerContainers {
+		stateByContainerID[dc.ID] = dc.State
 	}

 	now := time.Now().UTC()
 	currentStaleIDs := make(map[string]struct{})

-	for _, inst := range instances {
-		// Skip instances already being cleaned up.
-		if inst.Status == "removing" {
+	for _, c := range containers {
+		if c.State == "removing" {
 			continue
 		}

-		dockerState := containerStateByInstanceID[inst.ID]
+		dockerState := stateByContainerID[c.ContainerID]

-		// If the container is running in Docker, update last_alive_at.
 		if dockerState == "running" {
-			if err := s.store.UpdateLastAliveAt(inst.ID); err != nil {
-				slog.Warn("stale scanner: failed to update last_alive_at",
-					"instance_id", inst.ID, "error", err)
-			}
-			// Also sync store status if it was out of date.
-			if inst.Status != "running" {
-				if err := s.store.UpdateInstanceStatus(inst.ID, "running"); err != nil {
-					slog.Warn("stale scanner: failed to sync instance status",
-						"instance_id", inst.ID, "error", err)
-				}
+			if err := s.store.UpdateContainerState(c.ID, "running"); err != nil {
+				slog.Warn("stale scanner: failed to update state",
+					"id", c.ID, "error", err)
 			}
 			continue
 		}

-		// Container is not running. Check if it's stale.
-		if inst.LastAliveAt == "" {
-			// Never been seen running. Use created_at as fallback.
-			inst.LastAliveAt = inst.CreatedAt
+		// Container is not running. Check staleness against last_seen_at,
+		// falling back to created_at if it never came up.
+		ref := c.LastSeenAt
+		if ref == "" {
+			ref = c.CreatedAt
 		}
-
-		lastAlive, parseErr := time.Parse("2006-01-02 15:04:05", inst.LastAliveAt)
+		lastAlive, parseErr := time.Parse("2006-01-02 15:04:05", ref)
 		if parseErr != nil {
-			slog.Warn("stale scanner: failed to parse last_alive_at",
-				"instance_id", inst.ID, "last_alive_at", inst.LastAliveAt, "error", parseErr)
+			slog.Warn("stale scanner: failed to parse last_seen_at",
+				"id", c.ID, "ref", ref, "error", parseErr)
 			continue
 		}

@@ -182,23 +173,19 @@ func (s *Scanner) Scan(ctx context.Context) error {
 			continue
 		}

-		// This instance is stale.
-		currentStaleIDs[inst.ID] = struct{}{}
-
-		// Emit event only if this is newly detected as stale.
-		if _, alreadyKnown := s.knownStale[inst.ID]; !alreadyKnown {
-			s.emitStaleEvent(inst, daysSinceAlive)
+		currentStaleIDs[c.ID] = struct{}{}
+		if _, alreadyKnown := s.knownStale[c.ID]; !alreadyKnown {
+			s.emitStaleEvent(c, daysSinceAlive)
 		}
 	}

-	// Update known stale set: remove IDs that are no longer stale.
 	s.knownStale = currentStaleIDs
-
 	return nil
 }

-// FindStaleInstances returns all currently stale instances with enriched project/stage info.
-func (s *Scanner) FindStaleInstances(ctx context.Context) ([]StaleInstance, error) {
+// FindStaleContainers returns all currently stale containers enriched with
+// workload + role labels for rendering.
+func (s *Scanner) FindStaleContainers(ctx context.Context) ([]StaleContainer, error) {
 	settings, err := s.store.GetSettings()
 	if err != nil {
 		return nil, fmt.Errorf("get settings: %w", err)
@@ -209,58 +196,45 @@ func (s *Scanner) FindStaleInstances(ctx context.Context) ([]StaleInstance, erro
 		thresholdDays = 7
 	}

-	instances, err := s.store.ListAllInstances()
+	containers, err := s.store.ListContainers(store.ContainerFilter{})
 	if err != nil {
-		return nil, fmt.Errorf("list all instances: %w", err)
+		return nil, fmt.Errorf("list containers: %w", err)
 	}

-	containers, err := s.docker.ListContainers(ctx, nil)
+	dockerContainers, err := s.docker.ListContainers(ctx, nil)
 	if err != nil {
-		// Docker unavailable — fall back to store-only detection (no live state).
+		// Docker unavailable — fall back to store-only detection.
 		slog.Warn("stale scanner: docker unavailable, using store status only", "error", err)
-		containers = nil
+		dockerContainers = nil
+	}
+	stateByContainerID := make(map[string]string, len(dockerContainers))
+	for _, dc := range dockerContainers {
+		stateByContainerID[dc.ID] = dc.State
 	}

-	containerStateByInstanceID := make(map[string]string, len(containers))
-	for _, c := range containers {
-		if c.InstanceID != "" {
-			containerStateByInstanceID[c.InstanceID] = c.State
-		}
-	}
-
-	// Pre-load project and stage names to avoid N+1 queries.
-	allProjects, _ := s.store.GetAllProjects()
-	projectNames := make(map[string]string, len(allProjects))
-	for _, p := range allProjects {
-		projectNames[p.ID] = p.Name
-	}
-	stageNames := make(map[string]string)
-	for _, p := range allProjects {
-		stages, _ := s.store.GetStagesByProjectID(p.ID)
-		for _, st := range stages {
-			stageNames[st.ID] = st.Name
-		}
+	// Pre-load workload names so each stale row carries a friendly identifier.
+	workloads, _ := s.store.ListWorkloads("")
+	workloadNameByID := make(map[string]string, len(workloads))
+	for _, w := range workloads {
+		workloadNameByID[w.ID] = w.Name
 	}

 	now := time.Now().UTC()
-	var result []StaleInstance
+	var result []StaleContainer

-	for _, inst := range instances {
-		if inst.Status == "removing" {
+	for _, c := range containers {
+		if c.State == "removing" {
+			continue
+		}
+		if stateByContainerID[c.ContainerID] == "running" {
 			continue
 		}

-		// If Docker says it's running, it's not stale.
-		if containerStateByInstanceID[inst.ID] == "running" {
-			continue
+		ref := c.LastSeenAt
+		if ref == "" {
+			ref = c.CreatedAt
 		}
-
-		lastAlive := inst.LastAliveAt
-		if lastAlive == "" {
-			lastAlive = inst.CreatedAt
-		}
-
-		lastAliveTime, parseErr := time.Parse("2006-01-02 15:04:05", lastAlive)
+		lastAliveTime, parseErr := time.Parse("2006-01-02 15:04:05", ref)
 		if parseErr != nil {
 			continue
 		}
@@ -270,21 +244,17 @@ func (s *Scanner) FindStaleInstances(ctx context.Context) ([]StaleInstance, erro
 			continue
 		}

-		// Look up project and stage names from pre-loaded maps.
-		projectName := projectNames[inst.ProjectID]
-		if projectName == "" {
-			projectName = inst.ProjectID
-		}
-		stageName := stageNames[inst.StageID]
-		if stageName == "" {
-			stageName = inst.StageID
+		name := workloadNameByID[c.WorkloadID]
+		if name == "" {
+			name = c.WorkloadID
 		}

-		result = append(result, StaleInstance{
-			Instance:    inst,
-			ProjectName: projectName,
-			StageName:   stageName,
-			DaysStale:   daysSinceAlive,
+		result = append(result, StaleContainer{
+			Container:    c,
+			WorkloadID:   c.WorkloadID,
+			WorkloadName: name,
+			Role:         c.Role,
+			DaysStale:    daysSinceAlive,
 		})
 	}

@@ -292,20 +262,20 @@ func (s *Scanner) FindStaleInstances(ctx context.Context) ([]StaleInstance, erro
 }

 // emitStaleEvent publishes a warning event for a newly detected stale container.
-func (s *Scanner) emitStaleEvent(inst store.Instance, daysStale int) {
+func (s *Scanner) emitStaleEvent(c store.Container, daysStale int) {
 	metadata, _ := json.Marshal(map[string]any{
-		"instance_id":  inst.ID,
-		"project_id":   inst.ProjectID,
-		"stage_id":     inst.StageID,
-		"image_tag":    inst.ImageTag,
-		"last_alive_at": inst.LastAliveAt,
-		"days_stale":   daysStale,
+		"container_id":  c.ID,
+		"workload_id":   c.WorkloadID,
+		"workload_kind": c.WorkloadKind,
+		"role":          c.Role,
+		"image_tag":     c.ImageTag,
+		"last_seen_at":  c.LastSeenAt,
+		"days_stale":    daysStale,
 	})

 	msg := fmt.Sprintf("Container %s (tag: %s) has been non-running for %d days",
-		inst.ID, inst.ImageTag, daysStale)
+		c.ID, c.ImageTag, daysStale)

-	// Persist directly to event log.
 	evt, err := s.store.InsertEvent(store.EventLog{
 		Source:   "stale_scanner",
 		Severity: "warn",
@@ -317,15 +287,14 @@ func (s *Scanner) emitStaleEvent(inst store.Instance, daysStale int) {
 		return
 	}

-	// Publish for SSE clients.
 	s.eventBus.Publish(events.Event{
 		Type: events.EventLog,
 		Payload: events.EventLogPayload{
 			ID:        evt.ID,
 			Source:    "stale_scanner",
-			Severity: "warn",
-			Message:  msg,
-			Metadata: string(metadata),
+			Severity:  "warn",
+			Message:   msg,
+			Metadata:  string(metadata),
 			CreatedAt: evt.CreatedAt,
 		},
 	})