Files
tiny-forge/internal/api/volume_snapshots.go
T
alexei.dolgolyov 1c47030854 feat(volsnap): volume snapshot restore (backlog #6)
Restore a captured volume snapshot onto an image workload's live host-bind
data volumes, then redeploy — the most destructive workload action, built to
the adversarially-reviewed design (C1–C6) with all data-loss guards.

- Engine.Restore (engine-owned): all-or-nothing pre-flight re-resolution from
  the workload's CURRENT config (never the tamperable manifest), per-filesystem
  disk pre-check, per-workload lock, container quiesce, extract-to-tmp, durable
  pre-restore snapshot, write-ahead journal, atomic rename swap, redeploy, and
  crash-recovery sweep (RecoverInterruptedRestores) wired before serving.
- internal/keyedmutex: shared per-key lock; deployer now serializes every
  deploy entrypoint per workload via DispatchPlugin (+ LockWorkload/RedeployLocked
  for the restore re-dispatch, no deadlock).
- Untrusted-archive extractor: zip-slip containment, type allow-list (reg/dir
  only), decompression-bomb cap, manifest-index bounds.
- POST /api/workloads/{id}/snapshots/{sid}/restore: admin, X-Confirm-Restore
  header (CSRF), per-workload single-flight (409).
- WebUI: Restore button + danger ConfirmDialog + busy state + i18n (en/ru).

Scope: image-source only; scopes absolute/stage/project (driven off the same
supportedScopes constant capture uses).

Plan-reviewed before coding; per-phase go/security/ts reviews; final review
READY TO MERGE. Security review caught + fixed a CRITICAL manifest-Source path
traversal (re-derive target from current config + base containment).

Plan: plans/volume-snapshot-restore/
2026-06-22 17:23:52 +03:00

244 lines
7.9 KiB
Go

package api
import (
"encoding/json"
"errors"
"io"
"log/slog"
"net/http"
"os"
"path/filepath"
"github.com/go-chi/chi/v5"
"github.com/alexei/tinyforge/internal/store"
"github.com/alexei/tinyforge/internal/volsnap"
)
// listWorkloadSnapshots serves GET /api/workloads/{id}/snapshots.
//
// It replies 503 when no snapshot engine is configured, 200 with whatever the
// engine's List call returns for the workload id taken from the URL, and a
// generic 500 (with the underlying error written to the log only) when the
// listing fails.
func (s *Server) listWorkloadSnapshots(w http.ResponseWriter, r *http.Request) {
	if s.snapshotEngine == nil {
		respondError(w, http.StatusServiceUnavailable, "snapshot engine not initialized")
		return
	}

	workloadID := chi.URLParam(r, "id")
	snapshots, listErr := s.snapshotEngine.List(workloadID)
	if listErr == nil {
		respondJSON(w, http.StatusOK, snapshots)
		return
	}

	// Keep the raw error server-side; the client only sees a generic message.
	slog.Error("snapshots: list", "workload", workloadID, "error", listErr)
	respondError(w, http.StatusInternalServerError, "internal server error")
}
// snapshotableVolume is the sanitized view of a volume in the snapshotable
// response — it omits the resolved host path so internal layout is not leaked.
//
// getWorkloadSnapshotable fills each field from the same-named field of a
// volume ref returned by volsnap.SnapshotableVolumes. The field order and JSON
// tags define the wire format of the "volumes" array.
type snapshotableVolume struct {
// Target is copied from the ref's Target.
// NOTE(review): presumably the mount target inside the container — confirm
// against the volsnap package.
Target string `json:"target"`
// Scope is copied from the ref's Scope.
// NOTE(review): presumably one of the supported volume scopes — confirm
// against the volsnap package.
Scope string `json:"scope"`
// Source is copied from the ref's Source.
// NOTE(review): presumably the configured (unresolved) volume source —
// confirm it never carries a resolved host path.
Source string `json:"source"`
}
// getWorkloadSnapshotable handles GET /api/workloads/{id}/snapshotable. It
// tells the UI which volumes can be snapshotted and which are skipped (and
// why), so users are never misled about coverage.
//
// Responses:
//   - 503 when no snapshot engine is configured.
//   - 404 when the workload lookup fails.
//   - 500 (generic message, detail logged) when settings cannot be loaded or
//     the volumes cannot be enumerated.
//   - 200 with {"volumes": [...], "skipped": [...]}; both keys always carry a
//     JSON array, never null.
func (s *Server) getWorkloadSnapshotable(w http.ResponseWriter, r *http.Request) {
	if s.snapshotEngine == nil {
		respondError(w, http.StatusServiceUnavailable, "snapshot engine not initialized")
		return
	}

	id := chi.URLParam(r, "id")
	workload, err := s.store.GetWorkloadByID(id)
	if err != nil {
		respondError(w, http.StatusNotFound, "workload not found")
		return
	}

	settings, err := s.store.GetSettings()
	if err != nil {
		// Log the cause like every other 500 in this handler; without it a
		// settings failure is undiagnosable from the generic client message.
		slog.Error("snapshots: load settings", "workload", id, "error", err)
		respondError(w, http.StatusInternalServerError, "internal server error")
		return
	}

	refs, skipped, err := volsnap.SnapshotableVolumes(s.store, workload, settings)
	if err != nil {
		slog.Error("snapshots: enumerate", "workload", id, "error", err)
		respondError(w, http.StatusInternalServerError, "internal server error")
		return
	}

	// Project each ref onto the sanitized response type so only the three
	// whitelisted fields are serialized.
	volumes := make([]snapshotableVolume, 0, len(refs))
	for _, ref := range refs {
		volumes = append(volumes, snapshotableVolume{Target: ref.Target, Scope: ref.Scope, Source: ref.Source})
	}

	// A nil slice would encode as JSON null; normalize to an empty array.
	if skipped == nil {
		skipped = []volsnap.SkippedVolume{}
	}

	respondJSON(w, http.StatusOK, map[string]any{
		"volumes": volumes,
		"skipped": skipped,
	})
}
// createWorkloadSnapshot handles POST /api/workloads/{id}/snapshots.
//
// The request body is optional. When present it must be a JSON object; only
// its "label" field is read, and at most 1 MiB of the body is consumed.
//
// Responses:
//   - 503 when no snapshot engine is configured.
//   - 404 when the workload lookup fails.
//   - 400 when the body is not valid JSON, or when the engine reports
//     volsnap.ErrNoSnapshotData (that message is echoed to the client).
//   - 500 (generic message, detail logged) for any other failure.
//   - 201 with the created snapshot on success.
func (s *Server) createWorkloadSnapshot(w http.ResponseWriter, r *http.Request) {
	if s.snapshotEngine == nil {
		respondError(w, http.StatusServiceUnavailable, "snapshot engine not initialized")
		return
	}

	id := chi.URLParam(r, "id")
	workload, err := s.store.GetWorkloadByID(id)
	if err != nil {
		respondError(w, http.StatusNotFound, "workload not found")
		return
	}

	settings, err := s.store.GetSettings()
	if err != nil {
		// Log the cause like the Create failure path below; the client still
		// only receives the generic message.
		slog.Error("snapshots: load settings", "workload", id, "error", err)
		respondError(w, http.StatusInternalServerError, "internal server error")
		return
	}

	var body struct {
		Label string `json:"label"`
	}
	// ContentLength is -1 when unknown, so this also covers bodies without a
	// declared length; an empty stream then surfaces as io.EOF and is
	// tolerated, leaving the label empty.
	if r.ContentLength != 0 {
		if err := json.NewDecoder(io.LimitReader(r.Body, 1<<20)).Decode(&body); err != nil && !errors.Is(err, io.EOF) {
			respondError(w, http.StatusBadRequest, "invalid JSON body")
			return
		}
	}

	snap, err := s.snapshotEngine.Create(workload, settings, body.Label)
	if err != nil {
		// "no snapshottable volume data" is client-actionable (400, safe to
		// echo). Any other error is server-side: log the detail, return a
		// generic 500 so internal paths / DB text never reach the client.
		if errors.Is(err, volsnap.ErrNoSnapshotData) {
			respondError(w, http.StatusBadRequest, err.Error())
			return
		}
		slog.Error("snapshots: create", "workload", id, "error", err)
		respondError(w, http.StatusInternalServerError, "internal server error")
		return
	}

	respondJSON(w, http.StatusCreated, snap)
}
// deleteSnapshot handles DELETE /api/snapshots/{sid}.
//
// Responses:
//   - 503 when no snapshot engine is configured.
//   - 404 when the engine reports store.ErrNotFound.
//   - 500 for any other delete failure; the underlying error is logged and
//     not sent to the client.
//   - 200 with {"status": "deleted"} on success.
func (s *Server) deleteSnapshot(w http.ResponseWriter, r *http.Request) {
	if s.snapshotEngine == nil {
		respondError(w, http.StatusServiceUnavailable, "snapshot engine not initialized")
		return
	}

	sid := chi.URLParam(r, "sid")
	if err := s.snapshotEngine.Delete(sid); err != nil {
		if errors.Is(err, store.ErrNotFound) {
			respondError(w, http.StatusNotFound, "snapshot not found")
			return
		}
		// Previously this failure was dropped on the floor; record it so a
		// 500 here can be diagnosed, consistent with the other handlers.
		slog.Error("snapshots: delete", "snapshot", sid, "error", err)
		respondError(w, http.StatusInternalServerError, "failed to delete snapshot")
		return
	}

	respondJSON(w, http.StatusOK, map[string]string{"status": "deleted"})
}
// restoreWorkloadSnapshot serves POST /api/workloads/{id}/snapshots/{sid}/restore.
//
// Restoring replaces the workload's live volume data with the snapshot's
// contents and recreates its containers, so the request is gated before any
// work starts:
//
//   - the X-Confirm-Restore header must equal the snapshot id from the URL
//     (a plain form/img post cannot set custom headers, which blocks CSRF);
//   - the snapshot must exist and belong to the workload in the URL;
//   - the workload must have SourceKind "image";
//   - only one restore per workload may run at a time — a second request is
//     rejected with 409 instead of waiting.
//
// The route is documented as admin-only; that check is not performed inside
// this handler. The lock/stop/swap/redeploy work itself is delegated to
// Engine.Restore, which re-validates ownership and source kind under its own
// lock.
func (s *Server) restoreWorkloadSnapshot(w http.ResponseWriter, r *http.Request) {
	if s.snapshotEngine == nil {
		respondError(w, http.StatusServiceUnavailable, "snapshot engine not initialized")
		return
	}

	workloadID, snapshotID := chi.URLParam(r, "id"), chi.URLParam(r, "sid")

	// Explicit confirmation: the header has to echo the snapshot id.
	if r.Header.Get("X-Confirm-Restore") != snapshotID {
		respondError(w, http.StatusBadRequest,
			"missing or mismatched X-Confirm-Restore header (must equal snapshot id)")
		return
	}

	// Early checks exist purely to give the client a precise status code.
	snapshot, err := s.snapshotEngine.Get(snapshotID)
	switch {
	case err != nil:
		respondError(w, http.StatusNotFound, "snapshot not found")
		return
	case snapshot.WorkloadID != workloadID:
		respondError(w, http.StatusBadRequest, "snapshot does not belong to this workload")
		return
	}

	// loadWorkload writes its own error response when it reports failure.
	workload, found := s.loadWorkload(w, workloadID)
	if !found {
		return
	}
	if workload.SourceKind != "image" {
		respondError(w, http.StatusBadRequest, "restore is only supported for image-source workloads")
		return
	}

	// Single-flight per workload: fail fast with 409 rather than stacking a
	// second restore behind the first.
	unlock, acquired := s.volRestoreInFlight.TryLock(workloadID)
	if !acquired {
		respondError(w, http.StatusConflict, "a restore is already in progress for this workload")
		return
	}
	defer unlock()

	// NOTE(review): the request context is handed to the engine, so a client
	// disconnect may cancel an in-progress restore — confirm Engine.Restore
	// tolerates cancellation at every step.
	restoreErr := s.snapshotEngine.Restore(r.Context(), snapshotID, workloadID)
	if restoreErr != nil {
		// The raw error may contain resolved host paths: log it, but hand the
		// client only a generic message.
		slog.Error("snapshots: restore failed", "workload", workloadID, "snapshot", snapshotID, "error", restoreErr)
		respondError(w, http.StatusInternalServerError, "restore failed; see server logs")
		return
	}

	respondJSON(w, http.StatusOK, map[string]any{
		"status":      "restored",
		"workload_id": workloadID,
		"snapshot_id": snapshotID,
	})
}
// downloadSnapshot handles GET /api/snapshots/{sid}/download, streaming the
// tar.gz archive. The resolved path is containment-checked against the
// snapshot directory.
//
// Responses:
//   - 503 when no snapshot engine is configured.
//   - 404 when the snapshot lookup fails, or when the archive file does not
//     exist on disk.
//   - 403 when the engine refuses to resolve a file path for the snapshot.
//   - 500 when the archive exists but cannot be opened or stat'ed (for
//     example a permission or I/O error); the detail is logged only.
//   - otherwise the file is served via http.ServeContent as an
//     application/gzip attachment named after the base name of the
//     snapshot's Filename.
func (s *Server) downloadSnapshot(w http.ResponseWriter, r *http.Request) {
	if s.snapshotEngine == nil {
		respondError(w, http.StatusServiceUnavailable, "snapshot engine not initialized")
		return
	}

	sid := chi.URLParam(r, "sid")
	snap, err := s.snapshotEngine.Get(sid)
	if err != nil {
		respondError(w, http.StatusNotFound, "snapshot not found")
		return
	}

	path, err := s.snapshotEngine.FilePath(snap)
	if err != nil {
		// A rejected path is worth a trace server-side; the client only
		// learns that access was denied.
		slog.Warn("snapshots: download path rejected", "snapshot", sid, "error", err)
		respondError(w, http.StatusForbidden, "access denied")
		return
	}

	f, err := os.Open(path)
	if err != nil {
		// Only a genuinely missing file is a 404. Anything else (permissions,
		// I/O) is a server fault and must not be reported as "not found".
		if errors.Is(err, os.ErrNotExist) {
			slog.Warn("snapshots: archive missing on disk", "snapshot", sid, "error", err)
			respondError(w, http.StatusNotFound, "snapshot file not found on disk")
			return
		}
		slog.Error("snapshots: open archive", "snapshot", sid, "error", err)
		respondError(w, http.StatusInternalServerError, "failed to read snapshot file")
		return
	}
	defer f.Close()

	stat, err := f.Stat()
	if err != nil {
		slog.Error("snapshots: stat archive", "snapshot", sid, "error", err)
		respondError(w, http.StatusInternalServerError, "failed to read snapshot file")
		return
	}

	// Use only the base name so no directory component reaches the client.
	name := filepath.Base(snap.Filename)
	w.Header().Set("Content-Type", "application/gzip")
	w.Header().Set("Content-Disposition", "attachment; filename=\""+name+"\"")
	http.ServeContent(w, r, name, stat.ModTime(), f)
}