refactor(workload): extract Instance entirely; Container is canonical
Build / build (push) Successful in 10m41s
Build / build (push) Successful in 10m41s
End-to-end extraction of the Instance concept. After this commit:
* internal/store/instances.go — DELETED
* internal/store/models.go — Instance struct gone, ProxyRoute moved here
* containers table is the single source of truth for project/stack/site
container state. instances table is dropped via DROP TABLE migration
(idempotent; re-runnable on every boot).
* Legacy tinyforge.project / tinyforge.stage / tinyforge.instance-id
Docker labels are no longer emitted; only tinyforge.workload.{id,kind},
tinyforge.role, and tinyforge.managed are stamped on new containers.
Backend rewrites:
- internal/deployer: executeDeploy, blueGreenDeploy, rollback, and promote
use store.Container natively. The new removeContainer() replaces
removeInstance(), and enforceMaxInstances reads via
ListContainersByStageID.
- internal/reconciler: legacy tinyforge.instance-id dispatch removed;
upsertByWorkloadLabel now finds existing rows
by docker container ID first and falls back to
the deterministic workloadID:role key.
- internal/stale/scanner: Scan + new FindStaleContainers walk the
containers table; emit StaleContainer JSON.
- internal/stats/collector: ListContainers replaces ListAllInstances.
- internal/webhook/handler: the workload-secret lookup is tried first, then
falls back to the project / static_site secret column.
- internal/api: instances.go, stale.go, stats.go, stats_history.go,
projects.go, settings.go, docker.go, dns.go all read /
write through Container.
Docker layer:
- ManagedContainer exposes WorkloadID/Kind/Role from the canonical labels.
- ListContainers filters by tinyforge.managed=true.
- Network creation uses LabelManaged instead of LabelProject.
Frontend:
- Instance type is now a Container alias; .status → .state,
.last_alive_at → .last_seen_at.
- InstanceCard takes stageId as a prop (no longer derived from Instance).
- StaleContainer JSON shape rewritten: { container, workload_id, workload_name,
role, days_stale }. StaleContainerCard + /containers/stale page updated.
- ProjectCard / homepage / SystemHealthCard filter by .state.
The migration loop now tolerates "no such table" alongside "duplicate
column" / "already exists" so obsolete ALTER TABLE entries targeting the
dropped instances table no-op cleanly on first boot.
Tests: store + deployer + reconciler + webhook + staticsite + notify all
still pass. Frontend svelte-check: zero errors.
This commit is contained in:
+90
-121
@@ -14,16 +14,21 @@ import (
|
||||
"github.com/robfig/cron/v3"
|
||||
)
|
||||
|
||||
// StaleInstance holds enriched info about a stale container for API responses.
|
||||
type StaleInstance struct {
|
||||
Instance store.Instance `json:"instance"`
|
||||
ProjectName string `json:"project_name"`
|
||||
StageName string `json:"stage_name"`
|
||||
DaysStale int `json:"days_stale"`
|
||||
// StaleContainer is a stale container row enriched with the human-readable
|
||||
// labels needed to render the Stale view (workload + role + days).
|
||||
//
|
||||
// JSON shape uses container_id semantics — the frontend type was historically
|
||||
// "Instance"; after the workload refactor it consumes Container fields directly.
|
||||
type StaleContainer struct {
|
||||
Container store.Container `json:"container"`
|
||||
WorkloadID string `json:"workload_id"`
|
||||
WorkloadName string `json:"workload_name"`
|
||||
Role string `json:"role"`
|
||||
DaysStale int `json:"days_stale"`
|
||||
}
|
||||
|
||||
// Scanner periodically checks for stale containers that have been
|
||||
// non-running for longer than the configured threshold.
|
||||
// Scanner periodically checks for containers that have been non-running for
|
||||
// longer than the configured threshold.
|
||||
type Scanner struct {
|
||||
store *store.Store
|
||||
docker *docker.Client
|
||||
@@ -34,8 +39,8 @@ type Scanner struct {
|
||||
entryID cron.EntryID
|
||||
running bool
|
||||
|
||||
// knownStale tracks instance IDs that have already had a stale event emitted,
|
||||
// to avoid re-emitting warnings for the same instance.
|
||||
// knownStale tracks container row IDs that have already had a stale event
|
||||
// emitted, to avoid re-emitting the same warning on every tick.
|
||||
knownStale map[string]struct{}
|
||||
}
|
||||
|
||||
@@ -101,7 +106,7 @@ func (s *Scanner) Stop() {
|
||||
}
|
||||
|
||||
// Scan performs a single stale-container scan cycle.
|
||||
// It updates last_alive_at for running containers and detects newly stale ones.
|
||||
// Updates last_seen_at for running containers and detects newly stale ones.
|
||||
func (s *Scanner) Scan(ctx context.Context) error {
|
||||
settings, err := s.store.GetSettings()
|
||||
if err != nil {
|
||||
@@ -113,67 +118,53 @@ func (s *Scanner) Scan(ctx context.Context) error {
|
||||
thresholdDays = 7
|
||||
}
|
||||
|
||||
// Get all instances from the store.
|
||||
instances, err := s.store.ListAllInstances()
|
||||
containers, err := s.store.ListContainers(store.ContainerFilter{})
|
||||
if err != nil {
|
||||
return fmt.Errorf("list all instances: %w", err)
|
||||
return fmt.Errorf("list containers: %w", err)
|
||||
}
|
||||
|
||||
if len(instances) == 0 {
|
||||
if len(containers) == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
// Get all managed Docker containers to check live state.
|
||||
containers, err := s.docker.ListContainers(ctx, nil)
|
||||
// Live state from Docker, indexed by container_id label so we can
|
||||
// reconcile on a single pass.
|
||||
dockerContainers, err := s.docker.ListContainers(ctx, nil)
|
||||
if err != nil {
|
||||
return fmt.Errorf("list docker containers: %w", err)
|
||||
}
|
||||
|
||||
// Build a lookup: instance ID -> container state.
|
||||
containerStateByInstanceID := make(map[string]string, len(containers))
|
||||
for _, c := range containers {
|
||||
if c.InstanceID != "" {
|
||||
containerStateByInstanceID[c.InstanceID] = c.State
|
||||
}
|
||||
stateByContainerID := make(map[string]string, len(dockerContainers))
|
||||
for _, dc := range dockerContainers {
|
||||
stateByContainerID[dc.ID] = dc.State
|
||||
}
|
||||
|
||||
now := time.Now().UTC()
|
||||
currentStaleIDs := make(map[string]struct{})
|
||||
|
||||
for _, inst := range instances {
|
||||
// Skip instances already being cleaned up.
|
||||
if inst.Status == "removing" {
|
||||
for _, c := range containers {
|
||||
if c.State == "removing" {
|
||||
continue
|
||||
}
|
||||
|
||||
dockerState := containerStateByInstanceID[inst.ID]
|
||||
dockerState := stateByContainerID[c.ContainerID]
|
||||
|
||||
// If the container is running in Docker, update last_alive_at.
|
||||
if dockerState == "running" {
|
||||
if err := s.store.UpdateLastAliveAt(inst.ID); err != nil {
|
||||
slog.Warn("stale scanner: failed to update last_alive_at",
|
||||
"instance_id", inst.ID, "error", err)
|
||||
}
|
||||
// Also sync store status if it was out of date.
|
||||
if inst.Status != "running" {
|
||||
if err := s.store.UpdateInstanceStatus(inst.ID, "running"); err != nil {
|
||||
slog.Warn("stale scanner: failed to sync instance status",
|
||||
"instance_id", inst.ID, "error", err)
|
||||
}
|
||||
if err := s.store.UpdateContainerState(c.ID, "running"); err != nil {
|
||||
slog.Warn("stale scanner: failed to update state",
|
||||
"id", c.ID, "error", err)
|
||||
}
|
||||
continue
|
||||
}
|
||||
|
||||
// Container is not running. Check if it's stale.
|
||||
if inst.LastAliveAt == "" {
|
||||
// Never been seen running. Use created_at as fallback.
|
||||
inst.LastAliveAt = inst.CreatedAt
|
||||
// Container is not running. Check staleness against last_seen_at,
|
||||
// falling back to created_at if it never came up.
|
||||
ref := c.LastSeenAt
|
||||
if ref == "" {
|
||||
ref = c.CreatedAt
|
||||
}
|
||||
|
||||
lastAlive, parseErr := time.Parse("2006-01-02 15:04:05", inst.LastAliveAt)
|
||||
lastAlive, parseErr := time.Parse("2006-01-02 15:04:05", ref)
|
||||
if parseErr != nil {
|
||||
slog.Warn("stale scanner: failed to parse last_alive_at",
|
||||
"instance_id", inst.ID, "last_alive_at", inst.LastAliveAt, "error", parseErr)
|
||||
slog.Warn("stale scanner: failed to parse last_seen_at",
|
||||
"id", c.ID, "ref", ref, "error", parseErr)
|
||||
continue
|
||||
}
|
||||
|
||||
@@ -182,23 +173,19 @@ func (s *Scanner) Scan(ctx context.Context) error {
|
||||
continue
|
||||
}
|
||||
|
||||
// This instance is stale.
|
||||
currentStaleIDs[inst.ID] = struct{}{}
|
||||
|
||||
// Emit event only if this is newly detected as stale.
|
||||
if _, alreadyKnown := s.knownStale[inst.ID]; !alreadyKnown {
|
||||
s.emitStaleEvent(inst, daysSinceAlive)
|
||||
currentStaleIDs[c.ID] = struct{}{}
|
||||
if _, alreadyKnown := s.knownStale[c.ID]; !alreadyKnown {
|
||||
s.emitStaleEvent(c, daysSinceAlive)
|
||||
}
|
||||
}
|
||||
|
||||
// Update known stale set: remove IDs that are no longer stale.
|
||||
s.knownStale = currentStaleIDs
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// FindStaleInstances returns all currently stale instances with enriched project/stage info.
|
||||
func (s *Scanner) FindStaleInstances(ctx context.Context) ([]StaleInstance, error) {
|
||||
// FindStaleContainers returns all currently stale containers enriched with
|
||||
// workload + role labels for rendering.
|
||||
func (s *Scanner) FindStaleContainers(ctx context.Context) ([]StaleContainer, error) {
|
||||
settings, err := s.store.GetSettings()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("get settings: %w", err)
|
||||
@@ -209,58 +196,45 @@ func (s *Scanner) FindStaleInstances(ctx context.Context) ([]StaleInstance, erro
|
||||
thresholdDays = 7
|
||||
}
|
||||
|
||||
instances, err := s.store.ListAllInstances()
|
||||
containers, err := s.store.ListContainers(store.ContainerFilter{})
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("list all instances: %w", err)
|
||||
return nil, fmt.Errorf("list containers: %w", err)
|
||||
}
|
||||
|
||||
containers, err := s.docker.ListContainers(ctx, nil)
|
||||
dockerContainers, err := s.docker.ListContainers(ctx, nil)
|
||||
if err != nil {
|
||||
// Docker unavailable — fall back to store-only detection (no live state).
|
||||
// Docker unavailable — fall back to store-only detection.
|
||||
slog.Warn("stale scanner: docker unavailable, using store status only", "error", err)
|
||||
containers = nil
|
||||
dockerContainers = nil
|
||||
}
|
||||
stateByContainerID := make(map[string]string, len(dockerContainers))
|
||||
for _, dc := range dockerContainers {
|
||||
stateByContainerID[dc.ID] = dc.State
|
||||
}
|
||||
|
||||
containerStateByInstanceID := make(map[string]string, len(containers))
|
||||
for _, c := range containers {
|
||||
if c.InstanceID != "" {
|
||||
containerStateByInstanceID[c.InstanceID] = c.State
|
||||
}
|
||||
}
|
||||
|
||||
// Pre-load project and stage names to avoid N+1 queries.
|
||||
allProjects, _ := s.store.GetAllProjects()
|
||||
projectNames := make(map[string]string, len(allProjects))
|
||||
for _, p := range allProjects {
|
||||
projectNames[p.ID] = p.Name
|
||||
}
|
||||
stageNames := make(map[string]string)
|
||||
for _, p := range allProjects {
|
||||
stages, _ := s.store.GetStagesByProjectID(p.ID)
|
||||
for _, st := range stages {
|
||||
stageNames[st.ID] = st.Name
|
||||
}
|
||||
// Pre-load workload names so each stale row carries a friendly identifier.
|
||||
workloads, _ := s.store.ListWorkloads("")
|
||||
workloadNameByID := make(map[string]string, len(workloads))
|
||||
for _, w := range workloads {
|
||||
workloadNameByID[w.ID] = w.Name
|
||||
}
|
||||
|
||||
now := time.Now().UTC()
|
||||
var result []StaleInstance
|
||||
var result []StaleContainer
|
||||
|
||||
for _, inst := range instances {
|
||||
if inst.Status == "removing" {
|
||||
for _, c := range containers {
|
||||
if c.State == "removing" {
|
||||
continue
|
||||
}
|
||||
if stateByContainerID[c.ContainerID] == "running" {
|
||||
continue
|
||||
}
|
||||
|
||||
// If Docker says it's running, it's not stale.
|
||||
if containerStateByInstanceID[inst.ID] == "running" {
|
||||
continue
|
||||
ref := c.LastSeenAt
|
||||
if ref == "" {
|
||||
ref = c.CreatedAt
|
||||
}
|
||||
|
||||
lastAlive := inst.LastAliveAt
|
||||
if lastAlive == "" {
|
||||
lastAlive = inst.CreatedAt
|
||||
}
|
||||
|
||||
lastAliveTime, parseErr := time.Parse("2006-01-02 15:04:05", lastAlive)
|
||||
lastAliveTime, parseErr := time.Parse("2006-01-02 15:04:05", ref)
|
||||
if parseErr != nil {
|
||||
continue
|
||||
}
|
||||
@@ -270,21 +244,17 @@ func (s *Scanner) FindStaleInstances(ctx context.Context) ([]StaleInstance, erro
|
||||
continue
|
||||
}
|
||||
|
||||
// Look up project and stage names from pre-loaded maps.
|
||||
projectName := projectNames[inst.ProjectID]
|
||||
if projectName == "" {
|
||||
projectName = inst.ProjectID
|
||||
}
|
||||
stageName := stageNames[inst.StageID]
|
||||
if stageName == "" {
|
||||
stageName = inst.StageID
|
||||
name := workloadNameByID[c.WorkloadID]
|
||||
if name == "" {
|
||||
name = c.WorkloadID
|
||||
}
|
||||
|
||||
result = append(result, StaleInstance{
|
||||
Instance: inst,
|
||||
ProjectName: projectName,
|
||||
StageName: stageName,
|
||||
DaysStale: daysSinceAlive,
|
||||
result = append(result, StaleContainer{
|
||||
Container: c,
|
||||
WorkloadID: c.WorkloadID,
|
||||
WorkloadName: name,
|
||||
Role: c.Role,
|
||||
DaysStale: daysSinceAlive,
|
||||
})
|
||||
}
|
||||
|
||||
@@ -292,20 +262,20 @@ func (s *Scanner) FindStaleInstances(ctx context.Context) ([]StaleInstance, erro
|
||||
}
|
||||
|
||||
// emitStaleEvent publishes a warning event for a newly detected stale container.
|
||||
func (s *Scanner) emitStaleEvent(inst store.Instance, daysStale int) {
|
||||
func (s *Scanner) emitStaleEvent(c store.Container, daysStale int) {
|
||||
metadata, _ := json.Marshal(map[string]any{
|
||||
"instance_id": inst.ID,
|
||||
"project_id": inst.ProjectID,
|
||||
"stage_id": inst.StageID,
|
||||
"image_tag": inst.ImageTag,
|
||||
"last_alive_at": inst.LastAliveAt,
|
||||
"days_stale": daysStale,
|
||||
"container_id": c.ID,
|
||||
"workload_id": c.WorkloadID,
|
||||
"workload_kind": c.WorkloadKind,
|
||||
"role": c.Role,
|
||||
"image_tag": c.ImageTag,
|
||||
"last_seen_at": c.LastSeenAt,
|
||||
"days_stale": daysStale,
|
||||
})
|
||||
|
||||
msg := fmt.Sprintf("Container %s (tag: %s) has been non-running for %d days",
|
||||
inst.ID, inst.ImageTag, daysStale)
|
||||
c.ID, c.ImageTag, daysStale)
|
||||
|
||||
// Persist directly to event log.
|
||||
evt, err := s.store.InsertEvent(store.EventLog{
|
||||
Source: "stale_scanner",
|
||||
Severity: "warn",
|
||||
@@ -317,15 +287,14 @@ func (s *Scanner) emitStaleEvent(inst store.Instance, daysStale int) {
|
||||
return
|
||||
}
|
||||
|
||||
// Publish for SSE clients.
|
||||
s.eventBus.Publish(events.Event{
|
||||
Type: events.EventLog,
|
||||
Payload: events.EventLogPayload{
|
||||
ID: evt.ID,
|
||||
Source: "stale_scanner",
|
||||
Severity: "warn",
|
||||
Message: msg,
|
||||
Metadata: string(metadata),
|
||||
Severity: "warn",
|
||||
Message: msg,
|
||||
Metadata: string(metadata),
|
||||
CreatedAt: evt.CreatedAt,
|
||||
},
|
||||
})
|
||||
|
||||
Reference in New Issue
Block a user