// tiny-forge/cmd/server/restore_lifecycle.go

package main

import (
	"context"
	"fmt"
	"log/slog"
	"time"

	"github.com/alexei/tinyforge/internal/deployer"
	"github.com/alexei/tinyforge/internal/docker"
	"github.com/alexei/tinyforge/internal/store"
	"github.com/alexei/tinyforge/internal/workload/plugin"
)

// restoreStopTimeoutSeconds is the graceful-stop window, in seconds, granted to
// each container during a restore quiesce before Docker kills it. It is the
// timeout argument StopContainers passes to the Docker client for every
// container it stops.
const restoreStopTimeoutSeconds = 10

// restoreLifecycle adapts the deployer + Docker client + store to the
// volsnap.Lifecycle seam the volume-snapshot restore flow needs. It lives in the
// composition root so the volsnap package stays decoupled from deployer/docker.
//
// All three fields are dereferenced without nil checks by the methods below, so
// they are expected to be set before the value is used.
type restoreLifecycle struct {
	// dep supplies the per-workload deploy lock (Lock) and the redeploy path
	// used once that lock is already held (Redeploy).
	dep    *deployer.Deployer
	// docker is used to stop the workload's containers (StopContainers).
	docker *docker.Client
	// store lists the workload's container rows and records their new state
	// after they have been stopped (StopContainers).
	store  *store.Store
}

// Lock acquires the deployer's per-workload deploy lock for workloadID, so the
// restore is serialized against every deploy entrypoint (C1). The returned func
// is whatever LockWorkload hands back — presumably the release callback the
// caller must invoke when the restore is done; confirm against the deployer.
func (l *restoreLifecycle) Lock(workloadID string) func() {
	release := l.dep.LockWorkload(workloadID)
	return release
}

// StopContainers quiesces the workload ahead of the volume swap (C4): every
// store row that is in state "running" and has a Docker container ID is stopped
// with a restoreStopTimeoutSeconds grace period and then marked "stopped" in
// the store.
//
// It returns the image tag of the newest running container so the subsequent
// redeploy brings the SAME version back up. ListContainersByWorkload returns
// rows newest-first, so the first running row carrying a non-empty tag wins.
// The tag is "" when no running row has one.
//
// A failure to list rows or to stop any container aborts immediately with a
// wrapped error and an empty tag; containers stopped before the failure stay
// stopped. A failure to record the "stopped" state is only logged, since the
// container itself is already down.
func (l *restoreLifecycle) StopContainers(ctx context.Context, workloadID string) (string, error) {
	containers, listErr := l.store.ListContainersByWorkload(workloadID)
	if listErr != nil {
		return "", fmt.Errorf("list containers: %w", listErr)
	}

	var newestTag string
	for _, row := range containers {
		// Only rows that are live and actually backed by a Docker container
		// need to be quiesced.
		isLive := row.State == "running" && row.ContainerID != ""
		if !isLive {
			continue
		}

		// Rows arrive newest-first: keep the first non-empty tag seen and
		// never overwrite it afterwards.
		if newestTag == "" {
			newestTag = row.ImageTag
		}

		stopErr := l.docker.StopContainer(ctx, row.ContainerID, restoreStopTimeoutSeconds)
		if stopErr != nil {
			return "", fmt.Errorf("stop container %s: %w", row.ContainerID, stopErr)
		}

		// Best-effort bookkeeping: the container is stopped regardless of
		// whether the store row could be updated.
		updateErr := l.store.UpdateContainerState(row.ID, "stopped")
		if updateErr != nil {
			slog.Warn("restore: mark container stopped", "container", row.ID, "error", updateErr)
		}
	}

	return newestTag, nil
}

// Redeploy brings the workload back up after a restore by handing it to the
// deployer's RedeployLocked path, which is used because the restore already
// holds the per-workload lock. reference pins the image tag to deploy; the
// intent is stamped with the current UTC time and attributed to "restore".
// The deployer's error, if any, is returned unchanged.
func (l *restoreLifecycle) Redeploy(ctx context.Context, w store.Workload, reference string) error {
	var intent plugin.DeploymentIntent
	intent.Reason = "restore"
	intent.Reference = reference
	intent.Metadata = map[string]string{
		"note": "redeploy after volume snapshot restore",
	}
	intent.TriggeredAt = time.Now().UTC()
	intent.TriggeredBy = "restore"

	target := plugin.WorkloadFromStore(w)
	return l.dep.RedeployLocked(ctx, target, intent)
}