Files
tiny-forge/internal/deployer/deployer.go
T
alexei.dolgolyov 1c47030854 feat(volsnap): volume snapshot restore (backlog #6)
Restore a captured volume snapshot onto an image workload's live host-bind
data volumes, then redeploy — the most destructive workload action, built to
the adversarially-reviewed design (C1–C6) with all data-loss guards.

- Engine.Restore (engine-owned): all-or-nothing pre-flight re-resolution from
  the workload's CURRENT config (never the tamperable manifest), per-filesystem
  disk pre-check, per-workload lock, container quiesce, extract-to-tmp, durable
  pre-restore snapshot, write-ahead journal, atomic rename swap, redeploy, and
  crash-recovery sweep (RecoverInterruptedRestores) wired before serving.
- internal/keyedmutex: shared per-key lock; deployer now serializes every
  deploy entrypoint per workload via DispatchPlugin (+ LockWorkload/RedeployLocked
  for the restore re-dispatch, no deadlock).
- Untrusted-archive extractor: zip-slip containment, type allow-list (reg/dir
  only), decompression-bomb cap, manifest-index bounds.
- POST /api/workloads/{id}/snapshots/{sid}/restore: admin, X-Confirm-Restore
  header (CSRF), per-workload single-flight (409).
- WebUI: Restore button + danger ConfirmDialog + busy state + i18n (en/ru).

Scope: image-source only; scopes absolute/stage/project (driven off the same
supportedScopes constant capture uses).

Plan-reviewed before coding; per-phase go/security/ts reviews; final review
READY TO MERGE. Security review caught + fixed a CRITICAL manifest-Source path
traversal (re-derive target from current config + base containment).

Plan: plans/volume-snapshot-restore/
2026-06-22 17:23:52 +03:00

197 lines
7.3 KiB
Go

// Package deployer dispatches plugin-native Source deploys. The legacy
// project-pipeline lived here until the hard cutover; what remains is a
// thin holder for the Deployer's shared dependencies that `dispatch.go`
// hands to every Source via PluginDeps().
package deployer
import (
"context"
"fmt"
"log/slog"
"sync"
"sync/atomic"
"github.com/alexei/tinyforge/internal/dns"
"github.com/alexei/tinyforge/internal/docker"
"github.com/alexei/tinyforge/internal/events"
"github.com/alexei/tinyforge/internal/health"
"github.com/alexei/tinyforge/internal/keyedmutex"
"github.com/alexei/tinyforge/internal/notify"
"github.com/alexei/tinyforge/internal/proxy"
"github.com/alexei/tinyforge/internal/store"
"github.com/alexei/tinyforge/internal/workload/plugin"
)
// Deployer owns the dependency bundle each Source plugin needs at deploy
// time. The plugin pipeline reaches in via PluginDeps(); see dispatch.go
// for the dispatch surface itself.
//
// A Deployer embeds mutexes and a WaitGroup by value, so it must always be
// handled through a pointer (New returns *Deployer) and never copied.
type Deployer struct {
// docker is the container-runtime client supplied to New.
docker *docker.Client
// proxy is the active reverse-proxy provider; it can be hot-swapped at
// runtime through SetProxyProvider, which writes it under proxyMu.
proxy proxy.Provider
// store is the persistence layer; maybeBackupBeforeDeploy reads settings
// from it.
store *store.Store
// health is the health checker supplied to New.
health *health.Checker
// notifier is the notification sender supplied to New.
notifier *notify.Notifier
// eventBus receives published events.
eventBus EventPublisher
backuper PreDeployBackuper // optional; nil disables pre-deploy backups
// encKey is a 32-byte key supplied to New.
// NOTE(review): presumably used to encrypt/decrypt stored secrets — confirm
// against the consumers of PluginDeps().
encKey [32]byte
// dnsMu guards dns; SetDNSProvider writes dns while holding it.
dnsMu sync.RWMutex
dns dns.Provider // nil when wildcard DNS is active
// proxyMu protects hot-swap of d.proxy from runtime settings updates
// (SetProxyProvider) racing with PluginDeps() reads on the deploy path.
proxyMu sync.RWMutex
// Graceful shutdown: tracks in-progress deploys.
//
// drainMu serializes the "is-draining check + activeWg.Add(1)" in
// beginDispatch against the "set shuttingDown + Wait()" in Drain. Without
// it, a dispatch could pass the draining check, Drain could then flip the
// flag and start Wait() with a zero counter, and the dispatch could call
// Add(1) concurrently with Wait — a documented sync.WaitGroup misuse
// (panic risk) that also lets a deploy slip past the drain barrier.
drainMu sync.Mutex
// activeWg counts in-flight dispatches; beginDispatch adds to it and Drain
// waits on it.
activeWg sync.WaitGroup
// shuttingDown is set to true by Drain and never cleared in this file.
shuttingDown atomic.Bool
// workloadLocks serializes deploy-class operations per workload id so two
// concurrent mutators of the same workload (a manual deploy, a webhook/
// trigger dispatch, a rollback, a promote, OR a volume-snapshot restore)
// can never interleave their container/volume changes. Every deploy
// entrypoint funnels through DispatchPlugin, so locking there gates them
// all at one choke point. This is the per-workload lock activeWg is NOT
// (activeWg is a global drain barrier for graceful shutdown).
workloadLocks keyedmutex.Mutex
}
// LockWorkload takes the per-workload deploy lock on behalf of an external
// critical section (volume-snapshot restore) and hands back the function that
// releases it. The restore flow keeps the lock held across stop→swap→redeploy
// and performs the redeploy through RedeployLocked, which does NOT take the
// lock again.
func (d *Deployer) LockWorkload(id string) func() {
	release := d.workloadLocks.Lock(id)
	return release
}
// RedeployLocked re-dispatches w WITHOUT acquiring the per-workload lock,
// because the caller (restore) already holds it via LockWorkload. Calling the
// normal DispatchPlugin here would deadlock — Go mutexes are not reentrant.
// Not for general use.
//
// The call is forwarded unchanged to dispatchLocked and its error is returned
// as-is. dispatchLocked is defined outside this file section; see dispatch.go
// (presumably) for what it does with ctx, w and intent.
func (d *Deployer) RedeployLocked(ctx context.Context, w plugin.Workload, intent plugin.DeploymentIntent) error {
return d.dispatchLocked(ctx, w, intent)
}
// EventPublisher is the interface for publishing events to the event bus.
// The Deployer depends on this single-method interface rather than on a
// concrete bus type.
type EventPublisher interface {
// Publish hands evt to the event bus. It returns nothing, so delivery
// failures (if any) are not reported to the caller.
Publish(evt events.Event)
}
// PreDeployBackuper takes a "pre-deploy" Tinyforge DB snapshot before any
// deploy starts when the corresponding setting is enabled. Kept as a small
// interface so the deployer does not import internal/backup.
type PreDeployBackuper interface {
// CreateBackup creates a backup labelled with backupType and returns its
// record. The deployer calls it with the type "pre-deploy".
CreateBackup(backupType string) (store.Backup, error)
}
// New builds a Deployer from its required collaborators. Optional
// collaborators (the pre-deploy backuper and the DNS provider) start out nil
// and are wired in afterwards through SetPreDeployBackuper and SetDNSProvider;
// the mutexes, wait group, shutdown flag and per-workload locks start at their
// zero values.
func New(
	dockerClient *docker.Client,
	proxyProvider proxy.Provider,
	st *store.Store,
	checker *health.Checker,
	notifier *notify.Notifier,
	eventBus EventPublisher,
	encKey [32]byte,
) *Deployer {
	dep := new(Deployer)

	// Runtime and routing collaborators.
	dep.docker = dockerClient
	dep.proxy = proxyProvider

	// Persistence, health checking and outbound signalling.
	dep.store = st
	dep.health = checker
	dep.notifier = notifier
	dep.eventBus = eventBus

	dep.encKey = encKey
	return dep
}
// SetProxyProvider swaps the proxy provider at runtime (e.g., when settings
// change). The write happens under proxyMu so that concurrent deploys reading
// d.proxy via PluginDeps() observe a coherent value (previously a
// torn-pointer race under -race).
func (d *Deployer) SetProxyProvider(provider proxy.Provider) {
	// The critical section is a single assignment that cannot panic, so the
	// lock is released explicitly rather than via defer.
	d.proxyMu.Lock()
	d.proxy = provider
	d.proxyMu.Unlock()
}
// SetPreDeployBackuper wires the backup engine in after construction so the
// deployer can take a Tinyforge DB snapshot when the
// auto_backup_before_deploy setting is enabled. Pass nil to disable.
//
// NOTE(review): unlike SetProxyProvider and SetDNSProvider, this write is not
// guarded by a mutex. That is only safe if it is called during start-up,
// before any deploy can run maybeBackupBeforeDeploy — confirm at the call site.
func (d *Deployer) SetPreDeployBackuper(b PreDeployBackuper) {
d.backuper = b
}
// maybeBackupBeforeDeploy snapshots the Tinyforge DB ("pre-deploy" backup)
// ahead of a deploy when the operator has turned on
// auto_backup_before_deploy. It runs on the unified deploy path
// (DispatchPlugin) so the setting actually takes effect — the previous
// implementation lost its only caller when the legacy executeDeploy pipeline
// was removed in the workload-first cutover, which silently disabled the
// setting.
//
// The function is fail-open and returns nothing: with no backuper configured,
// when settings cannot be loaded, or when the backup itself fails, it logs (in
// the latter two cases) and returns so the deploy proceeds — missing a backup
// is preferable to refusing to ship a fix. workloadID is used only as log
// context.
func (d *Deployer) maybeBackupBeforeDeploy(workloadID string) {
	if d.backuper == nil {
		return
	}

	cfg, loadErr := d.store.GetSettings()
	switch {
	case loadErr != nil:
		slog.Warn("pre-deploy backup: load settings", "workload", workloadID, "error", loadErr)
		return
	case !cfg.AutoBackupBeforeDeploy:
		// Feature is switched off by the operator.
		return
	}

	snapshot, backupErr := d.backuper.CreateBackup("pre-deploy")
	if backupErr != nil {
		slog.Warn("pre-deploy backup failed", "workload", workloadID, "error", backupErr)
		return
	}
	slog.Info(
		"pre-deploy backup created",
		"workload", workloadID,
		"backup_id", snapshot.ID,
		"filename", snapshot.Filename,
	)
}
// SetDNSProvider installs the DNS provider used to manage DNS records during
// deployments. Pass nil to disable DNS management (wildcard DNS mode). The
// write is performed under dnsMu.
func (d *Deployer) SetDNSProvider(provider dns.Provider) {
	// Single non-panicking assignment: unlock explicitly instead of deferring.
	d.dnsMu.Lock()
	d.dns = provider
	d.dnsMu.Unlock()
}
// Drain blocks until every in-progress deploy has finished. Call it during
// graceful shutdown. It marks the deployer as shutting down first, so
// beginDispatch rejects new work from then on. A repeated call logs that a
// drain is already in progress and then waits on the same barrier.
func (d *Deployer) Drain() {
	// Flip the flag under drainMu so it cannot interleave with the
	// "check flag + Add(1)" pair in beginDispatch. The lock is released
	// before waiting.
	wasDraining := func() bool {
		d.drainMu.Lock()
		defer d.drainMu.Unlock()
		return d.shuttingDown.Swap(true)
	}()

	if wasDraining {
		slog.Info("deployer: drain already in progress")
	}

	slog.Info("deployer: draining in-progress deploys")
	d.activeWg.Wait()
	slog.Info("deployer: all deploys drained")
}
// ShuttingDown reports whether Drain() has been called. It is a lock-free
// atomic read of the shutdown flag.
func (d *Deployer) ShuttingDown() bool {
	draining := d.shuttingDown.Load()
	return draining
}
// beginDispatch either registers one in-flight unit on activeWg and returns
// nil, or — when the deployer is draining — returns an error without touching
// the counter. The shuttingDown check and the Add(1) MUST happen together
// under drainMu: Drain sets the flag under that same mutex before calling
// Wait(), so once Wait() observes a zero counter no further Add can race it.
// On a nil return the caller must defer d.activeWg.Done().
func (d *Deployer) beginDispatch() error {
	d.drainMu.Lock()
	// Deferred so the mutex is released even if WaitGroup.Add panics.
	defer d.drainMu.Unlock()

	if !d.shuttingDown.Load() {
		d.activeWg.Add(1)
		return nil
	}
	return fmt.Errorf("deployer is shutting down, rejecting new deploy")
}