// tiny-forge/internal/volsnap/restore.go

package volsnap

import (
	"archive/tar"
	"compress/gzip"
	"encoding/json"
	"fmt"
	"io"
	"os"
	"path"
	"path/filepath"

	"github.com/alexei/tinyforge/internal/store"
	"github.com/alexei/tinyforge/internal/volume"
)

// Size limits applied while restoring a snapshot archive.
const (
	// maxRestoreUncompressedBytes is the ceiling on the total decompressed
	// size a snapshot archive may declare during restore. It is the
	// decompression-bomb defence: 50 GiB sits well above any realistic app
	// data volume yet still bounds what a hostile archive can claim.
	maxRestoreUncompressedBytes int64 = 50 << 30

	// diskFreeHeadroomBytes is the free space (256 MiB) demanded on top of the
	// extracted size, so a restore never fills the target filesystem
	// completely. Setting the live copy aside is a rename and allocates
	// nothing new, which makes the fresh allocation roughly the extracted
	// size; the headroom absorbs filesystem overhead and metadata.
	diskFreeHeadroomBytes int64 = 256 << 20
)

// resolvedVol pairs one snapshot-manifest volume with the live host path that
// was re-derived for it from the workload's CURRENT config during the
// all-or-nothing pre-flight (C3).
type resolvedVol struct {
	// Index is the manifest entry's index, carried over unchanged.
	Index int
	// Target is the manifest entry's target (the key used to look the volume
	// up in the current config), Scope is the scope taken from the current
	// config entry, and LivePath is the host path the resolver returned for
	// that entry.
	Target, Scope, LivePath string
}

// parseManifest unmarshals the manifest JSON stored on a snapshot row into a
// []SnapshotVolume. It fails when the JSON does not decode or when the decoded
// list holds no entries, so a successful return always carries at least one
// volume.
func parseManifest(snap store.VolumeSnapshot) ([]SnapshotVolume, error) {
	var volumes []SnapshotVolume
	err := json.Unmarshal([]byte(snap.Manifest), &volumes)
	switch {
	case err != nil:
		return nil, fmt.Errorf("parse snapshot manifest: %w", err)
	case len(volumes) == 0:
		return nil, fmt.Errorf("snapshot manifest is empty")
	}
	return volumes, nil
}

// preflightResolve maps every manifest volume onto a live host path derived
// from the workload's CURRENT config. It is ALL-OR-NOTHING (C3): the first
// entry that has a negative index, is no longer declared by the workload, has
// an unsupported scope, fails to resolve, or resolves outside the volume root
// aborts the whole call with an error. The caller MUST treat that error as a
// hard stop BEFORE stopping containers or touching disk — config drift
// mid-restore is silent corruption.
//
// SECURITY: only the manifest's Target (the lookup key) and Index are taken
// from the snapshot. The host directory is always computed from the CURRENT
// (trusted, operator-set) volume entry, never from Source/Scope persisted in
// the manifest. The manifest column is attacker-influenceable (e.g. a
// restored/tampered DB); trusting its Source for stage/project scope would let
// `Source:"../../etc"` steer the destructive rename-swap outside the volume
// tree. As defence in depth, every non-absolute scope must additionally
// resolve under settings.BaseVolumePath.
//
// On success the result has one resolvedVol per manifest entry, in manifest
// order.
func preflightResolve(st *store.Store, w store.Workload, settings store.Settings, manifest []SnapshotVolume) ([]resolvedVol, error) {
	declared, err := volumesByTarget(st, w)
	if err != nil {
		return nil, fmt.Errorf("load current volumes: %w", err)
	}

	resolveParams := volume.ResolveWorkloadParams{
		BasePath:           settings.BaseVolumePath,
		WorkloadID:         w.ID,
		WorkloadName:       w.Name,
		AllowedVolumePaths: settings.AllowedVolumePaths,
	}
	absoluteScope := string(store.VolumeScopeAbsolute)

	resolved := make([]resolvedVol, 0, len(manifest))
	for i := range manifest {
		entry := &manifest[i]

		// A negative index can never name an archive subtree.
		if entry.Index < 0 {
			return nil, fmt.Errorf("volume %q has invalid index %d", entry.Target, entry.Index)
		}

		spec, found := declared[entry.Target]
		switch {
		case !found:
			return nil, fmt.Errorf("volume %q is no longer declared by the workload", entry.Target)
		case !supportedScopes[spec.Scope]:
			return nil, fmt.Errorf("volume %q scope %q is not restorable", entry.Target, spec.Scope)
		}

		hostPath, resolveErr := volume.ResolveWorkloadPath(spec, resolveParams)
		if resolveErr != nil {
			return nil, fmt.Errorf("resolve volume %q (%s): %w", entry.Target, spec.Scope, resolveErr)
		}

		// Containment: the destructive swap target must stay inside the
		// volume root. Base-relative scopes are checked here; the absolute
		// scope is skipped because the resolver already constrains it to
		// AllowedVolumePaths. A failure to evaluate containment is treated
		// exactly like an escape.
		if spec.Scope != absoluteScope {
			if inside, cerr := pathWithinBase(settings.BaseVolumePath, hostPath); cerr != nil || !inside {
				return nil, fmt.Errorf("resolved path for volume %q escapes the volume root", entry.Target)
			}
		}

		resolved = append(resolved, resolvedVol{
			Index:    entry.Index,
			Target:   entry.Target,
			Scope:    spec.Scope,
			LivePath: hostPath,
		})
	}
	return resolved, nil
}

// pathWithinBase reports whether target lies at or beneath base once both
// have been made absolute with filepath.Abs; the comparison itself is
// delegated to withinDir. An empty base never contains anything (refuse rather
// than allow). A non-nil error means one of the paths could not be made
// absolute, in which case the boolean is false.
func pathWithinBase(base, target string) (bool, error) {
	if len(base) == 0 {
		return false, nil
	}

	// Absolutize base first, then target, stopping at the first failure.
	paths := []string{base, target}
	for i, p := range paths {
		abs, err := filepath.Abs(p)
		if err != nil {
			return false, err
		}
		paths[i] = abs
	}
	return withinDir(paths[0], paths[1]), nil
}

// archiveUncompressedSize scans the tar headers of the gzip-compressed archive
// at archivePath and returns the declared uncompressed size per volume index
// together with the grand total. It feeds the per-filesystem free-space
// pre-check (C5).
//
// Only regular-file entries whose cleaned name is not "manifest.json" and for
// which leadingIndex reports an index are counted; every other entry is
// skipped. The running total of counted sizes may never exceed bombCap: the
// first header that would push it past the cap aborts the scan with an error.
// The comparison is done before the addition and is overflow-safe, so a
// header declaring a size close to the int64 maximum cannot wrap the total
// negative and slip under the cap.
//
// The total is a LOWER-BOUND estimate of on-disk consumption: it sums regular-
// file bytes only, ignoring directory entries and per-file inode/block-rounding
// overhead, so a volume of many tiny files consumes more than reported. The
// real safety net is the staged extract + atomic swap (a mid-extract ENOSPC
// discards the staging dir and leaves live untouched), not this pre-check.
//
// "No body copy" is at the API level only — tar.Next still inflates and
// discards each skipped body, so scanning costs gzip work proportional to the
// decompressed stream. bombCap bounds the declared size of counted entries;
// bodies of entries this function skips are not charged against it.
//
// On any error the returned map is nil and the total is 0.
func archiveUncompressedSize(archivePath string, bombCap int64) (perIndex map[int]int64, total int64, err error) {
	f, err := os.Open(archivePath)
	if err != nil {
		return nil, 0, fmt.Errorf("open archive: %w", err)
	}
	defer f.Close()
	gz, err := gzip.NewReader(f)
	if err != nil {
		return nil, 0, fmt.Errorf("gzip reader: %w", err)
	}
	defer gz.Close()

	perIndex = map[int]int64{}
	tr := tar.NewReader(gz)
	for {
		hdr, e := tr.Next()
		if e == io.EOF {
			break
		}
		if e != nil {
			return nil, 0, fmt.Errorf("read tar: %w", e)
		}
		if hdr.Typeflag != tar.TypeReg {
			continue
		}
		name := path.Clean(hdr.Name)
		if name == "manifest.json" {
			continue
		}
		idx, ok := leadingIndex(name)
		if !ok {
			continue
		}
		// Compare against the remaining budget instead of adding first:
		// total+hdr.Size can wrap int64 for a hostile header, which would
		// make the sum negative and defeat the cap. total never exceeds
		// bombCap here, so the subtraction itself cannot overflow.
		if hdr.Size > bombCap-total {
			return nil, 0, fmt.Errorf("archive exceeds decompression cap of %d bytes", bombCap)
		}
		total += hdr.Size
		perIndex[idx] += hdr.Size
	}
	return perIndex, total, nil
}

// swap is the bookkeeping for one volume's atomic directory replacement, kept
// so that the replacement can be undone later.
type swap struct {
	// live is the directory being replaced; old is where the prior live dir
	// was set aside ("" if live didn't exist); tmp is the staging dir holding
	// the freshly-extracted data.
	live, old, tmp string
	// hadOld records whether a prior live dir existed and was moved to old.
	hadOld bool
}

// stagingDirs derives the two scratch paths used while restoring one volume:
// tmp (where fresh data is staged) and old (where the current live dir is set
// aside). Both are placed directly inside the live dir's parent, i.e. next to
// live itself, and are named ".tf-restore-<token>-<index>" with a ".tmp" or
// ".old" suffix. Keeping them beside live is what lets every rename in the
// swap stay on one filesystem and therefore be atomic (R2); a cross-device
// rename (live is itself a mountpoint) fails loudly in swapVolumeDir rather
// than silently degrading to a copy.
func stagingDirs(live, token string, index int) (tmp, old string) {
	stem := fmt.Sprintf(".tf-restore-%s-%d", token, index)
	dir := filepath.Dir(live)
	beside := func(suffix string) string {
		return filepath.Join(dir, stem+suffix)
	}
	tmp, old = beside(".tmp"), beside(".old")
	return tmp, old
}

// swapVolumeDir performs the crash-minimal two-rename swap (C2): the current
// live dir, if any, is renamed to old, then the staged tmp dir is renamed into
// live's place. If anything fails after live has been set aside, the set-aside
// is reverted so that live is not left missing.
//
// hadOld reports whether the prior live dir is sitting at old when the
// function returns — the fact a caller needs for rollback or recovery:
//   - success: true if a prior live dir existed and is preserved at old,
//     false if live did not exist beforehand;
//   - failure before anything moved, or failure followed by a successful
//     revert: false (the original data is at live, untouched or restored);
//   - failure where the revert ALSO failed: true, and the returned error
//     carries the revert failure too, because the original data remains at
//     old and live is missing.
//
// The returned error wraps the underlying stat, mkdir, or rename failure.
func swapVolumeDir(live, tmp, old string) (hadOld bool, err error) {
	// Lstat, not Stat: the existence check looks at the live entry itself and
	// does not follow a symlink.
	if _, statErr := os.Lstat(live); statErr == nil {
		if rerr := os.Rename(live, old); rerr != nil {
			return false, fmt.Errorf("set aside live %s: %w", live, rerr)
		}
		hadOld = true
	} else if !os.IsNotExist(statErr) {
		return false, fmt.Errorf("stat live %s: %w", live, statErr)
	}

	// revert moves the set-aside original back into place after a later step
	// failed, and reports where the original data ended up. The revert error
	// is surfaced rather than dropped: when it fails, live is missing and the
	// caller must know the data is still at old.
	revert := func(cause error) (bool, error) {
		if !hadOld {
			return false, cause
		}
		if rerr := os.Rename(old, live); rerr != nil {
			return true, fmt.Errorf("%w; revert of %s failed, original data remains at %s: %v", cause, live, old, rerr)
		}
		return false, cause
	}

	if mkErr := os.MkdirAll(filepath.Dir(live), 0o700); mkErr != nil {
		return revert(fmt.Errorf("ensure parent of %s: %w", live, mkErr))
	}
	if rerr := os.Rename(tmp, live); rerr != nil {
		return revert(fmt.Errorf("promote restored data into %s: %w", live, rerr))
	}
	return hadOld, nil
}

// rollbackSwaps undoes completed swaps, newest first. For each one it removes
// the restored live dir and, when a prior live dir had been set aside, renames
// that preserved copy from old back to live. It is strictly best-effort:
// removal and rename errors are deliberately discarded and nothing is
// returned, so a failure on one volume never stops the remaining volumes from
// being attempted.
func rollbackSwaps(done []swap) {
	for n := len(done); n > 0; n-- {
		entry := &done[n-1]
		_ = os.RemoveAll(entry.live) // best-effort: keep going regardless
		if !entry.hadOld {
			continue
		}
		_ = os.Rename(entry.old, entry.live) // best-effort: keep going regardless
	}
}