1c47030854
Restore a captured volume snapshot onto an image workload's live host-bind
data volumes, then redeploy — the most destructive workload action, built to
the adversarially-reviewed design (C1–C6) with all data-loss guards.
- Engine.Restore (engine-owned): all-or-nothing pre-flight re-resolution from
the workload's CURRENT config (never the tamperable manifest), per-filesystem
disk pre-check, per-workload lock, container quiesce, extract-to-tmp, durable
pre-restore snapshot, write-ahead journal, atomic rename swap, redeploy, and
crash-recovery sweep (RecoverInterruptedRestores) wired before serving.
- internal/keyedmutex: shared per-key lock; deployer now serializes every
deploy entrypoint per workload via DispatchPlugin (+ LockWorkload/RedeployLocked
for the restore re-dispatch, no deadlock).
- Untrusted-archive extractor: zip-slip containment, type allow-list (reg/dir
only), decompression-bomb cap, manifest-index bounds.
- POST /api/workloads/{id}/snapshots/{sid}/restore: admin, X-Confirm-Restore
header (CSRF), per-workload single-flight (409).
- WebUI: Restore button + danger ConfirmDialog + busy state + i18n (en/ru).
Scope: image-source only; scopes absolute/stage/project (driven off the same
supportedScopes constant capture uses).
Plan-reviewed before coding; per-phase go/security/ts reviews; final review
READY TO MERGE. Security review caught + fixed a CRITICAL manifest-Source path
traversal (re-derive target from current config + base containment).
Plan: plans/volume-snapshot-restore/
71 lines
2.6 KiB
Go
71 lines
2.6 KiB
Go
package main
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"log/slog"
|
|
"time"
|
|
|
|
"github.com/alexei/tinyforge/internal/deployer"
|
|
"github.com/alexei/tinyforge/internal/docker"
|
|
"github.com/alexei/tinyforge/internal/store"
|
|
"github.com/alexei/tinyforge/internal/workload/plugin"
|
|
)
|
|
|
|
// restoreStopTimeoutSeconds is the per-container graceful-stop window, in
// seconds, handed to the Docker client while a restore quiesces a workload.
// Once the window elapses Docker kills the container.
const restoreStopTimeoutSeconds = 10
|
|
|
|
// restoreLifecycle adapts the deployer + Docker client + store to the
|
|
// volsnap.Lifecycle seam the volume-snapshot restore flow needs. It lives in the
|
|
// composition root so the volsnap package stays decoupled from deployer/docker.
|
|
type restoreLifecycle struct {
|
|
dep *deployer.Deployer
|
|
docker *docker.Client
|
|
store *store.Store
|
|
}
|
|
|
|
// Lock takes the deployer's per-workload deploy lock so the restore serializes
|
|
// against every deploy entrypoint (C1).
|
|
func (l *restoreLifecycle) Lock(workloadID string) func() { return l.dep.LockWorkload(workloadID) }
|
|
|
|
// StopContainers stops every running container for the workload (quiesce before
|
|
// the volume swap, C4) and returns the image tag the newest running container
|
|
// was on, so the redeploy brings the SAME version back up. ListContainersByWorkload
|
|
// returns rows newest-first, so the first running row is the newest.
|
|
func (l *restoreLifecycle) StopContainers(ctx context.Context, workloadID string) (string, error) {
|
|
rows, err := l.store.ListContainersByWorkload(workloadID)
|
|
if err != nil {
|
|
return "", fmt.Errorf("list containers: %w", err)
|
|
}
|
|
tag := ""
|
|
for _, c := range rows {
|
|
if c.State != "running" || c.ContainerID == "" {
|
|
continue
|
|
}
|
|
if tag == "" && c.ImageTag != "" {
|
|
tag = c.ImageTag // newest running container's tag
|
|
}
|
|
if err := l.docker.StopContainer(ctx, c.ContainerID, restoreStopTimeoutSeconds); err != nil {
|
|
return "", fmt.Errorf("stop container %s: %w", c.ContainerID, err)
|
|
}
|
|
if err := l.store.UpdateContainerState(c.ID, "stopped"); err != nil {
|
|
slog.Warn("restore: mark container stopped", "container", c.ID, "error", err)
|
|
}
|
|
}
|
|
return tag, nil
|
|
}
|
|
|
|
// Redeploy re-dispatches the workload via the deployer's unlocked path (the
|
|
// restore already holds the per-workload lock). reference pins the image tag.
|
|
func (l *restoreLifecycle) Redeploy(ctx context.Context, w store.Workload, reference string) error {
|
|
intent := plugin.DeploymentIntent{
|
|
Reason: "restore",
|
|
Reference: reference,
|
|
Metadata: map[string]string{"note": "redeploy after volume snapshot restore"},
|
|
TriggeredAt: time.Now().UTC(),
|
|
TriggeredBy: "restore",
|
|
}
|
|
return l.dep.RedeployLocked(ctx, plugin.WorkloadFromStore(w), intent)
|
|
}
|