package volsnap import ( "context" "encoding/json" "fmt" "log/slog" "os" "path/filepath" "strings" "github.com/google/uuid" "github.com/alexei/tinyforge/internal/store" ) // Lifecycle is the deploy-side seam Engine.Restore needs but volsnap must not // import directly (it would couple the snapshot package to the deployer/docker // packages). The composition root supplies an adapter over the Deployer + // Docker client via Engine.SetLifecycle. type Lifecycle interface { // Lock acquires the per-workload deploy lock (C1) and returns the release // func. Held by Restore across stop→swap→redeploy. Lock(workloadID string) func() // StopContainers stops every running container for the workload (C4 quiesce) // and returns the image tag the newest running container was on, so the // redeploy can bring the SAME version back up ("" ⇒ source default tag). StopContainers(ctx context.Context, workloadID string) (runningTag string, err error) // Redeploy re-dispatches the workload's current config WITHOUT re-acquiring // the per-workload lock (the caller holds it). reference pins the image tag. Redeploy(ctx context.Context, w store.Workload, reference string) error } // SetLifecycle wires the deploy-side seam. Pass nil to leave restore disabled. func (e *Engine) SetLifecycle(lc Lifecycle) { e.lifecycle = lc } // restoreJournal is the on-disk write-ahead record of an in-flight restore. // Written before the first destructive rename and deleted on completion; the // startup RecoverInterruptedRestores sweep replays it after a crash. type restoreJournal struct { SnapshotID string `json:"snapshot_id"` WorkloadID string `json:"workload_id"` Volumes []journalVolume `json:"volumes"` } type journalVolume struct { Live string `json:"live"` Old string `json:"old"` Tmp string `json:"tmp"` Swapped bool `json:"swapped"` HadOld bool `json:"had_old"` } // staged pairs a resolved volume with its per-restore staging dirs. type staged struct { rv resolvedVol tmp string old string } // Restore overwrites the workload's live host-bind volumes with a snapshot's // contents and brings the app back up. It is the single, engine-owned entry // point for the data-loss-sensitive restore flow (image-source workloads only). // // Ordering is deliberate and crash-aware: // // pre-flight (re-resolve all volumes C3, size + per-fs disk check C5) — abort // here touches nothing // → Lock (C1) → re-validate workload → StopContainers (C4 quiesce) // → extract ALL volumes to sibling .tmp staging dirs (reads the source archive // fully BEFORE the next step can prune it; shrinks the later destructive // window to pure renames — R3) // → capture a pre-restore snapshot (durable escape hatch, after quiesce, // before any destructive rename — folded suggestion) // → write the restore journal (R3 crash recovery) // → swap each volume atomically (rename live→.old, .tmp→live — C2) // → Redeploy (C4 — image containers are recreated, never reused) // → remove .old + journal, emit audit event // // Engine.Restore holds NO e.mu (R1): per-workload serialization is the // Lifecycle lock; e.Create takes its own e.mu for the pre-restore archive // write, so calling it here cannot self-deadlock. func (e *Engine) Restore(ctx context.Context, snapshotID, workloadID string) error { if e.lifecycle == nil { return fmt.Errorf("restore: lifecycle not configured") } snap, err := e.store.GetVolumeSnapshot(snapshotID) if err != nil { return err } if snap.WorkloadID != workloadID { return fmt.Errorf("snapshot %s does not belong to workload %s", snapshotID, workloadID) } w, err := e.store.GetWorkloadByID(workloadID) if err != nil { return err } if w.SourceKind != "image" { return fmt.Errorf("restore is only supported for image-source workloads") } settings, err := e.store.GetSettings() if err != nil { return fmt.Errorf("load settings: %w", err) } manifest, err := parseManifest(snap) if err != nil { return err } resolved, err := preflightResolve(e.store, w, settings, manifest) // C3 all-or-nothing if err != nil { return fmt.Errorf("pre-flight: %w", err) } archivePath, err := e.FilePath(snap) if err != nil { return err } perIndex, _, err := archiveUncompressedSize(archivePath, maxRestoreUncompressedBytes) if err != nil { return fmt.Errorf("size snapshot: %w", err) } if err := checkDiskSpace(resolved, perIndex); err != nil { // C5 return err } // ── past pre-flight: take the per-workload lock and quiesce ────────────── unlock := e.lifecycle.Lock(workloadID) defer unlock() // A teardown may have won the lock and deleted the workload while we waited. if _, err := e.store.GetWorkloadByID(workloadID); err != nil { return fmt.Errorf("workload disappeared before restore: %w", err) } tag, err := e.lifecycle.StopContainers(ctx, workloadID) // C4 stop if err != nil { return fmt.Errorf("stop containers: %w", err) } // Extract every volume to its staging dir FIRST. This reads the source // archive fully before the pre-restore capture below can prune it, and // leaves only pure renames for the destructive phase (R3). token := uuid.New().String()[:8] stagedVols := make([]staged, 0, len(resolved)) for _, rv := range resolved { tmp, old := stagingDirs(rv.LivePath, token, rv.Index) if _, exErr := safeExtractIndex(archivePath, rv.Index, tmp, maxRestoreUncompressedBytes); exErr != nil { cleanupStaging(stagedVols) _ = os.RemoveAll(tmp) // Nothing swapped yet — bring the app back up on its original data. e.redeployAfterAbort(ctx, w, tag) return fmt.Errorf("extract volume %q: %w", rv.Target, exErr) } stagedVols = append(stagedVols, staged{rv: rv, tmp: tmp, old: old}) } // Durable pre-restore snapshot (escape hatch). Quiesced (after stop), and // the source archive is already fully extracted so a prune here is harmless. // Best-effort, matching the DB-restore precedent: a failure is logged but // does not abort — the .old dirs + journal are the in-operation safety net. if _, err := e.Create(w, settings, "pre-restore"); err != nil { slog.Warn("restore: pre-restore snapshot failed (continuing)", "workload", workloadID, "error", err) } // Journal before the first destructive rename so a crash can be recovered. jr := restoreJournal{SnapshotID: snapshotID, WorkloadID: workloadID} for _, sv := range stagedVols { jr.Volumes = append(jr.Volumes, journalVolume{Live: sv.rv.LivePath, Old: sv.old, Tmp: sv.tmp}) } if err := e.writeJournal(jr); err != nil { cleanupStaging(stagedVols) e.redeployAfterAbort(ctx, w, tag) return fmt.Errorf("write restore journal: %w", err) } // ── destructive phase: pure atomic renames ────────────────────────────── done := make([]swap, 0, len(stagedVols)) for i, sv := range stagedVols { hadOld, swErr := swapVolumeDir(sv.rv.LivePath, sv.tmp, sv.old) if swErr != nil { rollbackSwaps(done) // restore already-swapped volumes cleanupStagingFrom(stagedVols, i) // drop remaining un-swapped tmp/old e.removeJournal(workloadID) e.redeployAfterAbort(ctx, w, tag) return fmt.Errorf("swap volume %q: %w", sv.rv.Target, swErr) } done = append(done, swap{live: sv.rv.LivePath, old: sv.old, tmp: sv.tmp, hadOld: hadOld}) jr.Volumes[i].Swapped = true jr.Volumes[i].HadOld = hadOld _ = e.writeJournal(jr) // progress checkpoint (best-effort) } // Bring the app back up against the restored data (C4 — recreate, redeploy). if err := e.lifecycle.Redeploy(ctx, w, tag); err != nil { // The data IS restored; only the app failed to come back. Do NOT roll // back the volumes — surface the redeploy error so the operator retries // a deploy. Clean up the .old set-asides and the journal. cleanupOld(done) e.removeJournal(workloadID) return fmt.Errorf("redeploy after restore: %w", err) } cleanupOld(done) e.removeJournal(workloadID) e.emitRestoreEvent(workloadID, snapshotID, len(done)) slog.Info("volume snapshot restored", "workload", workloadID, "snapshot", snapshotID, "volumes", len(done)) return nil } // redeployAfterAbort re-dispatches after an aborted restore so a stopped app // does not stay down. Best-effort: the error is logged, not returned (the // restore failure is the primary error the caller surfaces). func (e *Engine) redeployAfterAbort(ctx context.Context, w store.Workload, tag string) { if err := e.lifecycle.Redeploy(ctx, w, tag); err != nil { slog.Warn("restore: redeploy after abort failed", "workload", w.ID, "error", err) } } // RecoverInterruptedRestores replays restore journals left by a crash mid- // restore, mirroring CleanOrphans (run once at startup, before serving). For // each volume: a completed swap keeps the restored live dir and drops the set- // aside original; an incomplete swap that left live missing is reverted from // .old; stray staging dirs are removed. Returns the number of journals handled. func (e *Engine) RecoverInterruptedRestores() (int, error) { e.mu.Lock() defer e.mu.Unlock() entries, err := os.ReadDir(e.snapDir) if err != nil { return 0, fmt.Errorf("read snapshot dir: %w", err) } recovered := 0 for _, ent := range entries { name := ent.Name() if ent.IsDir() || !strings.HasPrefix(name, "restore-") || !strings.HasSuffix(name, ".json") { continue } path := filepath.Join(e.snapDir, name) data, rerr := os.ReadFile(path) if rerr != nil { slog.Warn("restore recovery: read journal", "file", name, "error", rerr) continue } var jr restoreJournal if jerr := json.Unmarshal(data, &jr); jerr != nil { slog.Warn("restore recovery: parse journal", "file", name, "error", jerr) continue } slog.Warn("restore recovery: replaying interrupted restore", "workload", jr.WorkloadID, "snapshot", jr.SnapshotID, "volumes", len(jr.Volumes)) for _, v := range jr.Volumes { recoverVolume(v) } if rmErr := os.Remove(path); rmErr != nil { slog.Warn("restore recovery: remove journal", "file", name, "error", rmErr) } recovered++ } return recovered, nil } // recoverVolume reconciles a single volume's on-disk state from its journal // entry after a crash. Each branch leaves the live dir intact (either restored // or original) and removes staging leftovers. func recoverVolume(v journalVolume) { if v.Swapped { // Swap completed: live already holds restored data. Drop the set-aside. _ = os.RemoveAll(v.Old) _ = os.RemoveAll(v.Tmp) return } if _, err := os.Lstat(v.Live); os.IsNotExist(err) { if _, oerr := os.Lstat(v.Old); oerr == nil { // Crashed mid-rename (live→old done, tmp→live not): revert. _ = os.Rename(v.Old, v.Live) } } else { // live is intact (original). Any .old is a dangling partial copy. _ = os.RemoveAll(v.Old) } _ = os.RemoveAll(v.Tmp) } // ── journal + cleanup helpers ─────────────────────────────────────────────── func (e *Engine) journalPath(workloadID string) string { // workloadID is a server-generated id (loaded from the DB before we get // here). filepath.Base defends against any separator sneaking into the name. return filepath.Join(e.snapDir, "restore-"+filepath.Base(workloadID)+".json") } func (e *Engine) writeJournal(jr restoreJournal) error { data, err := json.Marshal(jr) if err != nil { return fmt.Errorf("encode journal: %w", err) } // Write atomically (tmp + rename): a torn journal would silently disable the // recovery sweep (RecoverInterruptedRestores skips unparseable journals), so // a crash mid-write must never leave a half-written WAL on disk. The .tmp // suffix is ignored by the recovery scan (it matches *.json only). final := e.journalPath(jr.WorkloadID) tmp := final + ".tmp" if err := os.WriteFile(tmp, data, 0o600); err != nil { return fmt.Errorf("write journal: %w", err) } if err := os.Rename(tmp, final); err != nil { _ = os.Remove(tmp) return fmt.Errorf("commit journal: %w", err) } return nil } func (e *Engine) removeJournal(workloadID string) { if err := os.Remove(e.journalPath(workloadID)); err != nil && !os.IsNotExist(err) { slog.Warn("restore: remove journal", "workload", workloadID, "error", err) } } func (e *Engine) emitRestoreEvent(workloadID, snapshotID string, volumes int) { meta, _ := json.Marshal(map[string]any{"snapshot_id": snapshotID, "volumes": volumes}) if _, err := e.store.InsertEvent(store.EventLog{ Source: "volsnap", WorkloadID: workloadID, Severity: "info", Message: "volume snapshot restored", Metadata: string(meta), }); err != nil { slog.Warn("restore: record event", "workload", workloadID, "error", err) } } // cleanupStaging removes the tmp + old staging dirs for every staged volume // (used when aborting before the swap phase). func cleanupStaging(sv []staged) { for _, s := range sv { _ = os.RemoveAll(s.tmp) _ = os.RemoveAll(s.old) } } // cleanupStagingFrom removes staging dirs from index `from` onward (the volumes // not yet swapped when a swap failed). func cleanupStagingFrom(sv []staged, from int) { for i := from; i < len(sv); i++ { _ = os.RemoveAll(sv[i].tmp) _ = os.RemoveAll(sv[i].old) } } // cleanupOld removes the .old set-aside dirs after a successful (or data- // committed) restore to reclaim disk; the pre-restore snapshot is the durable // rollback target. func cleanupOld(done []swap) { for _, s := range done { _ = os.RemoveAll(s.old) } } // checkDiskSpace verifies each target filesystem has room for the volumes that // will be staged on it (C5). Peak usage co-locates the live copy (renamed // aside, no new space) and the extracted copy (new space ≈ uncompressed size), // so the new allocation per filesystem is the sum of its volumes' extracted // sizes plus headroom. The estimate is a lower bound (see archiveUncompressedSize); // a mid-extract ENOSPC is still caught and rolled back. func checkDiskSpace(resolved []resolvedVol, perIndex map[int]int64) error { needByParent := map[string]int64{} for _, rv := range resolved { needByParent[filepath.Dir(rv.LivePath)] += perIndex[rv.Index] } for parent, need := range needByParent { probe := firstExistingAncestor(parent) free, err := freeDiskBytes(probe) if err != nil { return fmt.Errorf("check disk space at %s: %w", probe, err) } if int64(free) < need+diskFreeHeadroomBytes { return fmt.Errorf("insufficient disk space at %s: need ~%d bytes, have %d", parent, need+diskFreeHeadroomBytes, free) } } return nil } // firstExistingAncestor walks up p until it finds a path that exists, so the // free-space probe has a real filesystem to stat even when the volume dir (or // its parent) hasn't been created yet. func firstExistingAncestor(p string) string { for { if _, err := os.Stat(p); err == nil { return p } parent := filepath.Dir(p) if parent == p { return p } p = parent } }