Files
tiny-forge/cmd/server/restore_lifecycle.go
T
alexei.dolgolyov 1c47030854 feat(volsnap): volume snapshot restore (backlog #6)
Restore a captured volume snapshot onto an image workload's live host-bind
data volumes, then redeploy — the most destructive workload action, built to
the adversarially-reviewed design (C1–C6) with all data-loss guards.

- Engine.Restore (engine-owned): all-or-nothing pre-flight re-resolution from
  the workload's CURRENT config (never the tamperable manifest), per-filesystem
  disk pre-check, per-workload lock, container quiesce, extract-to-tmp, durable
  pre-restore snapshot, write-ahead journal, atomic rename swap, redeploy, and
  crash-recovery sweep (RecoverInterruptedRestores) wired before serving.
- internal/keyedmutex: shared per-key lock; deployer now serializes every
  deploy entrypoint per workload via DispatchPlugin (+ LockWorkload/RedeployLocked
  for the restore re-dispatch, no deadlock).
- Untrusted-archive extractor: zip-slip containment, type allow-list (reg/dir
  only), decompression-bomb cap, manifest-index bounds.
- POST /api/workloads/{id}/snapshots/{sid}/restore: admin, X-Confirm-Restore
  header (CSRF), per-workload single-flight (409).
- WebUI: Restore button + danger ConfirmDialog + busy state + i18n (en/ru).

Scope: image-source workloads only; supported scopes are absolute/stage/project
(driven off the same supportedScopes constant that capture uses).

Plan-reviewed before coding; per-phase go/security/ts reviews; final review
READY TO MERGE. Security review caught + fixed a CRITICAL manifest-Source path
traversal (re-derive target from current config + base containment).

Plan: plans/volume-snapshot-restore/
2026-06-22 17:23:52 +03:00

71 lines
2.6 KiB
Go

package main
import (
"context"
"fmt"
"log/slog"
"time"
"github.com/alexei/tinyforge/internal/deployer"
"github.com/alexei/tinyforge/internal/docker"
"github.com/alexei/tinyforge/internal/store"
"github.com/alexei/tinyforge/internal/workload/plugin"
)
// restoreStopTimeoutSeconds bounds the graceful-stop window per container during
// a restore quiesce before Docker kills it. StopContainers passes it unchanged
// as the timeout argument of every docker StopContainer call; the unit
// (seconds) is taken from the constant's name.
const restoreStopTimeoutSeconds = 10
// restoreLifecycle adapts the deployer + Docker client + store to the
// volsnap.Lifecycle seam the volume-snapshot restore flow needs. It lives in the
// composition root so the volsnap package stays decoupled from deployer/docker.
//
// Its methods only delegate to the three collaborators below; the type keeps no
// other state of its own.
type restoreLifecycle struct {
// dep provides the per-workload deploy lock (LockWorkload) and the redeploy
// entry point used while that lock is held (RedeployLocked).
dep *deployer.Deployer
// docker stops the workload's containers during the restore quiesce.
docker *docker.Client
// store lists the workload's container rows and records their new state.
store *store.Store
}
// Lock acquires the deployer's per-workload deploy lock for workloadID, which
// makes the restore serialize against every deploy entrypoint (C1).
//
// The function returned by the deployer's LockWorkload is handed back to the
// caller untouched — presumably the matching unlock; confirm against
// deployer.LockWorkload.
func (l *restoreLifecycle) Lock(workloadID string) func() {
	release := l.dep.LockWorkload(workloadID)
	return release
}
// StopContainers quiesces the workload ahead of the volume swap (C4) by stopping
// each of its running containers, and reports the image tag to redeploy so the
// SAME version is brought back up afterwards.
//
// The tag comes from the first running row that carries a non-empty ImageTag.
// ListContainersByWorkload returns rows newest-first, so that row is the newest
// running container. Rows that are not "running", or that have no ContainerID,
// are skipped entirely. The tag is "" when no running row carries one.
//
// A listing failure, or the first container that fails to stop, aborts the call
// with a wrapped error and an empty tag; containers stopped earlier in the same
// call are not restarted here. A failure to record the "stopped" state in the
// store is logged as a warning and otherwise ignored.
func (l *restoreLifecycle) StopContainers(ctx context.Context, workloadID string) (string, error) {
	containers, listErr := l.store.ListContainersByWorkload(workloadID)
	if listErr != nil {
		return "", fmt.Errorf("list containers: %w", listErr)
	}

	var newestTag string
	for _, ctr := range containers {
		// Only rows that are running and bound to a Docker container are quiesced.
		if ctr.State != "running" {
			continue
		}
		if ctr.ContainerID == "" {
			continue
		}

		// Rows arrive newest-first, so the first tagged running row wins.
		if newestTag == "" && ctr.ImageTag != "" {
			newestTag = ctr.ImageTag
		}

		stopErr := l.docker.StopContainer(ctx, ctr.ContainerID, restoreStopTimeoutSeconds)
		if stopErr != nil {
			return "", fmt.Errorf("stop container %s: %w", ctr.ContainerID, stopErr)
		}

		// Best effort: the container is already stopped, so a bookkeeping
		// failure is reported but does not fail the quiesce.
		markErr := l.store.UpdateContainerState(ctr.ID, "stopped")
		if markErr != nil {
			slog.Warn("restore: mark container stopped", "container", ctr.ID, "error", markErr)
		}
	}

	return newestTag, nil
}
// Redeploy brings the workload back up after a restore by re-dispatching it
// through the deployer's unlocked path, RedeployLocked — the restore already
// holds the per-workload lock at this point.
//
// reference pins the image tag to deploy. The intent is stamped with the current
// UTC time and attributed to "restore" as both its reason and its trigger. The
// deployer's error, if any, is returned unchanged.
func (l *restoreLifecycle) Redeploy(ctx context.Context, w store.Workload, reference string) error {
	triggeredAt := time.Now().UTC()

	intent := plugin.DeploymentIntent{
		TriggeredBy: "restore",
		TriggeredAt: triggeredAt,
		Reason:      "restore",
		Reference:   reference,
		Metadata: map[string]string{
			"note": "redeploy after volume snapshot restore",
		},
	}

	target := plugin.WorkloadFromStore(w)
	return l.dep.RedeployLocked(ctx, target, intent)
}