Files
tiny-forge/internal/volsnap/engine.go
T
alexei.dolgolyov 6b45ed62bb
Build / build (push) Successful in 10m59s
feat(snapshots): capture app data-volume snapshots
Add per-workload capture of host-bind data volumes as downloadable tar.gz archives: a new internal/volsnap engine (enumerate host-bind volumes via the computeMounts merge, archive with archive/tar+gzip skipping symlinks/special files, per-workload retention + startup orphan cleanup), a volume_snapshots table + store CRUD, admin-gated API (list/snapshotable/create/download/delete), and a Snapshots panel on /apps/[id] that shows coverage and which volumes are skipped (and why). Scope: image-source apps, host-bind scopes (absolute/stage/project); Docker named volumes, tmpfs, and instance scope are surfaced as not-yet-supported. Restore is a separate later phase. Download/FilePath are containment-checked; create returns a typed no-data error (400) vs generic 500. Covered by archiver unit tests + full API e2e.
2026-06-02 14:56:10 +03:00

208 lines
6.1 KiB
Go

package volsnap
import (
"encoding/json"
"errors"
"fmt"
"log/slog"
"os"
"path/filepath"
"strings"
"sync"
"time"
"github.com/google/uuid"
"github.com/alexei/tinyforge/internal/store"
)
// maxSnapshotsPerWorkload is the retention cap: how many snapshots are kept
// per workload (app). After each successful Create, pruneWorkload deletes the
// oldest snapshots beyond this count (best-effort) so volume snapshots cannot
// grow the data disk without bound.
const maxSnapshotsPerWorkload = 20
// ErrNoSnapshotData is returned by Create when volume enumeration succeeds but
// yields no host-bind volume directory to capture. It marks a
// client-actionable condition (intended to surface as HTTP 400), as opposed to
// an internal failure (HTTP 500). Compare against it with errors.Is.
var ErrNoSnapshotData = errors.New("no snapshottable volume data for this app")
// Engine creates and manages volume snapshots under <dataDir>/snapshots.
// Construct it with New. It embeds a mutex, so it must be used through a
// pointer and never copied.
type Engine struct {
	// mu is held by Create, Delete and CleanOrphans while they touch archive
	// files and their metadata rows, serializing those operations.
	mu sync.Mutex
	// store is the metadata store that holds the snapshot rows.
	store *store.Store
	// snapDir is the directory that holds the snapshot archive files.
	snapDir string
}
// New builds a snapshot engine backed by st whose archives live in the
// "snapshots" subdirectory of dataDir. The directory (and any missing parents)
// is created with mode 0o755 if it does not exist yet; a failure to create it
// is returned wrapped and no engine is produced.
func New(st *store.Store, dataDir string) (*Engine, error) {
	snapDir := filepath.Join(dataDir, "snapshots")

	err := os.MkdirAll(snapDir, 0o755)
	if err != nil {
		return nil, fmt.Errorf("create snapshot directory: %w", err)
	}

	eng := &Engine{
		store:   st,
		snapDir: snapDir,
	}
	return eng, nil
}
// SnapDir reports the directory in which this engine stores snapshot archives.
func (e *Engine) SnapDir() string {
	return e.snapDir
}
// Create captures a snapshot of the workload's host-bind data volumes into a
// new tar.gz archive inside the snapshot directory and records it in the
// store.
//
// label is stored with surrounding whitespace trimmed. The archive name is
// built from the shortened workload ID, the current UTC timestamp and a short
// random suffix.
//
// Errors:
//   - a volume-enumeration failure is returned wrapped;
//   - ErrNoSnapshotData is returned when there is nothing to capture;
//   - an archive-writing failure is returned as produced by writeArchive;
//   - stat, manifest-encoding and store-insert failures are returned wrapped.
//
// On every failure after the destination path has been chosen, the archive
// file is removed (best-effort) so that neither a partial nor an unrecorded
// archive is left on disk. After a successful insert, snapshots beyond
// maxSnapshotsPerWorkload are pruned for this workload.
func (e *Engine) Create(w store.Workload, settings store.Settings, label string) (store.VolumeSnapshot, error) {
	refs, _, err := SnapshotableVolumes(e.store, w, settings)
	if err != nil {
		return store.VolumeSnapshot{}, fmt.Errorf("enumerate volumes: %w", err)
	}
	if len(refs) == 0 {
		return store.VolumeSnapshot{}, ErrNoSnapshotData
	}

	e.mu.Lock()
	defer e.mu.Unlock()

	filename := fmt.Sprintf("%s-%s-%s.tar.gz",
		idShort(w.ID), time.Now().UTC().Format("20060102-150405"), uuid.New().String()[:8])
	dest := filepath.Join(e.snapDir, filename)

	// Single cleanup point for every failure path below, including a failed
	// writeArchive that may have left a partial file behind. Deferred after
	// the Unlock, so it runs while the lock is still held. A file that is
	// already gone is not an error.
	recorded := false
	defer func() {
		if recorded {
			return
		}
		if rmErr := os.Remove(dest); rmErr != nil && !errors.Is(rmErr, os.ErrNotExist) {
			slog.Warn("volume snapshot: remove failed archive", "file", filename, "error", rmErr)
		}
	}()

	manifest, err := writeArchive(dest, refs)
	if err != nil {
		return store.VolumeSnapshot{}, err
	}
	info, err := os.Stat(dest)
	if err != nil {
		return store.VolumeSnapshot{}, fmt.Errorf("stat snapshot: %w", err)
	}
	manifestJSON, err := json.Marshal(manifest)
	if err != nil {
		return store.VolumeSnapshot{}, fmt.Errorf("encode manifest: %w", err)
	}

	row, err := e.store.CreateVolumeSnapshot(store.VolumeSnapshot{
		WorkloadID: w.ID,
		Label:      strings.TrimSpace(label),
		Filename:   filename,
		SizeBytes:  info.Size(),
		Manifest:   string(manifestJSON),
	})
	if err != nil {
		return store.VolumeSnapshot{}, fmt.Errorf("record snapshot: %w", err)
	}
	// From here on the archive is owned by its metadata row; keep the file.
	recorded = true

	slog.Info("volume snapshot created", "id", row.ID, "workload", w.ID,
		"volumes", len(manifest), "size", info.Size())

	// Enforce retention while still holding e.mu (pruneWorkload expects it).
	e.pruneWorkload(w.ID)
	return row, nil
}
// List returns the snapshots recorded for workloadID exactly as the store
// reports them (the store is expected to order them newest first), together
// with any store error.
func (e *Engine) List(workloadID string) ([]store.VolumeSnapshot, error) {
	snaps, err := e.store.ListVolumeSnapshots(workloadID)
	return snaps, err
}
// Get looks up a single snapshot by its id, passing the store's result and
// error through unchanged.
func (e *Engine) Get(id string) (store.VolumeSnapshot, error) {
	snap, err := e.store.GetVolumeSnapshot(id)
	return snap, err
}
// Delete removes a snapshot's archive file and then its metadata row.
//
// The snapshot is looked up first; a lookup error is returned unchanged and
// nothing is removed. File removal is best-effort: a file that is already
// missing is ignored, while any other removal failure, or a stored filename
// that FilePath rejects, is logged. In every case the metadata row is still
// deleted, so a bad file cannot make a snapshot undeletable. The returned
// error is the result of the row deletion.
func (e *Engine) Delete(id string) error {
	snap, err := e.store.GetVolumeSnapshot(id)
	if err != nil {
		return err
	}

	e.mu.Lock()
	defer e.mu.Unlock()

	if p, perr := e.FilePath(snap); perr != nil {
		// Do not touch the filesystem for a filename that fails validation,
		// but leave a trace of why the file was skipped.
		slog.Warn("volume snapshot: resolve file", "id", id, "error", perr)
	} else if rmErr := os.Remove(p); rmErr != nil && !errors.Is(rmErr, os.ErrNotExist) {
		slog.Warn("volume snapshot: remove file", "id", id, "error", rmErr)
	}
	return e.store.DeleteVolumeSnapshot(id)
}
// FilePath resolves a snapshot's archive to an absolute path and verifies that
// it stays within the snapshot directory (defence-in-depth against a tampered
// filename column).
//
// The stored filename must be a bare file name: anything containing a path
// separator, or otherwise altered by filepath.Base, is rejected. The joined
// path must then sit strictly below the absolute snapshot directory. An error
// is returned if the name is invalid, if either path cannot be made absolute,
// or if the result would escape the snapshot directory.
func (e *Engine) FilePath(snap store.VolumeSnapshot) (string, error) {
	base := filepath.Base(snap.Filename)
	if base == "" || base == "." || base != snap.Filename {
		return "", errors.New("invalid snapshot filename")
	}

	abs, err := filepath.Abs(filepath.Join(e.snapDir, base))
	if err != nil {
		return "", err
	}
	// The directory's own resolution must succeed: an empty absDir would
	// reduce the prefix below to a bare separator and void the check.
	absDir, err := filepath.Abs(e.snapDir)
	if err != nil {
		return "", fmt.Errorf("resolve snapshot directory: %w", err)
	}
	// Require the trailing separator so a sibling such as "snapshots-x" or
	// the directory itself never counts as being inside it.
	if !strings.HasPrefix(abs, absDir+string(filepath.Separator)) {
		return "", errors.New("snapshot path escapes snapshot directory")
	}
	return abs, nil
}
// CleanOrphans reconciles the snapshot directory against the store: every
// non-directory entry whose name is not among the filenames the store knows
// is deleted. It returns how many files were removed.
//
// This exists because removing metadata rows elsewhere (the original note
// cites workload deletion cascading to volume_snapshots) does not remove the
// archive files; running this at startup reclaims them. The whole pass runs
// under e.mu. A failure to read the directory or to list known filenames
// aborts with a wrapped error and a count of 0; a failure to remove an
// individual file is logged and skipped.
func (e *Engine) CleanOrphans() (int, error) {
	e.mu.Lock()
	defer e.mu.Unlock()

	dirEntries, err := os.ReadDir(e.snapDir)
	if err != nil {
		return 0, fmt.Errorf("read snapshot dir: %w", err)
	}
	recorded, err := e.store.AllVolumeSnapshotFilenames()
	if err != nil {
		return 0, fmt.Errorf("list snapshot filenames: %w", err)
	}

	// Set of filenames that still have a metadata row.
	tracked := make(map[string]struct{}, len(recorded))
	for _, name := range recorded {
		tracked[name] = struct{}{}
	}

	var reclaimed int
	for _, de := range dirEntries {
		if de.IsDir() {
			continue
		}
		name := de.Name()
		if _, ok := tracked[name]; ok {
			continue
		}
		rmErr := os.Remove(filepath.Join(e.snapDir, name))
		if rmErr == nil {
			reclaimed++
			continue
		}
		slog.Warn("volume snapshot: remove orphan", "file", name, "error", rmErr)
	}
	return reclaimed, nil
}
// pruneWorkload deletes the snapshots of one workload that exceed
// maxSnapshotsPerWorkload, taking the excess from the oldest end as reported
// by the store.
//
// It is best-effort and returns nothing: every failure (counting, querying,
// resolving or removing a file, deleting a row) is logged and does not stop
// the remaining work. A file that is already missing is not treated as a
// failure. The caller must already hold e.mu.
func (e *Engine) pruneWorkload(workloadID string) {
	count, err := e.store.CountVolumeSnapshots(workloadID)
	if err != nil {
		slog.Warn("volume snapshot: prune count", "workload", workloadID, "error", err)
		return
	}
	if count <= maxSnapshotsPerWorkload {
		return
	}

	oldest, err := e.store.GetOldestVolumeSnapshots(workloadID, count-maxSnapshotsPerWorkload)
	if err != nil {
		slog.Warn("volume snapshot: prune query", "workload", workloadID, "error", err)
		return
	}

	for _, snap := range oldest {
		if p, perr := e.FilePath(snap); perr != nil {
			// Never touch the filesystem for a filename that fails validation.
			slog.Warn("volume snapshot: prune resolve file", "id", snap.ID, "error", perr)
		} else if rmErr := os.Remove(p); rmErr != nil && !errors.Is(rmErr, os.ErrNotExist) {
			slog.Warn("volume snapshot: prune remove file", "id", snap.ID, "error", rmErr)
		}
		// The row is removed regardless of the file outcome so retention
		// still converges.
		if derr := e.store.DeleteVolumeSnapshot(snap.ID); derr != nil {
			slog.Warn("volume snapshot: prune delete", "id", snap.ID, "error", derr)
		}
	}
}
// idShort abbreviates id to its first 8 bytes for use in archive file names.
// An id of 8 bytes or fewer is returned unchanged. The cut is byte-based, not
// rune-based.
func idShort(id string) string {
	const shortLen = 8
	if len(id) <= shortLen {
		return id
	}
	return id[:shortLen]
}