1c47030854
Restore a captured volume snapshot onto an image workload's live host-bind
data volumes, then redeploy — the most destructive workload action, built to
the adversarially-reviewed design (C1–C6) with all data-loss guards.
- Engine.Restore (engine-owned): all-or-nothing pre-flight re-resolution from
the workload's CURRENT config (never the tamperable manifest), per-filesystem
disk pre-check, per-workload lock, container quiesce, extract-to-tmp, durable
pre-restore snapshot, write-ahead journal, atomic rename swap, redeploy, and
crash-recovery sweep (RecoverInterruptedRestores) wired before serving.
- internal/keyedmutex: shared per-key lock; deployer now serializes every
deploy entrypoint per workload via DispatchPlugin (+ LockWorkload/RedeployLocked
for the restore re-dispatch, no deadlock).
- Untrusted-archive extractor: zip-slip containment, type allow-list (reg/dir
only), decompression-bomb cap, manifest-index bounds.
- POST /api/workloads/{id}/snapshots/{sid}/restore: admin, X-Confirm-Restore
header (CSRF), per-workload single-flight (409).
- WebUI: Restore button + danger ConfirmDialog + busy state + i18n (en/ru).
Scope: image-source only; scopes absolute/stage/project (driven off the same
supportedScopes constant capture uses).
Plan-reviewed before coding; per-phase go/security/ts reviews; final review
READY TO MERGE. Security review caught + fixed a CRITICAL manifest-Source path
traversal (re-derive target from current config + base containment).
Plan: plans/volume-snapshot-restore/
197 lines
7.3 KiB
Go
197 lines
7.3 KiB
Go
// Package deployer dispatches plugin-native Source deploys. The legacy
// project-pipeline lived here until the hard cutover; what remains is a
// thin holder for the Deployer's shared dependencies that `dispatch.go`
// hands to every Source via PluginDeps().
|
|
package deployer
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"log/slog"
|
|
"sync"
|
|
"sync/atomic"
|
|
|
|
"github.com/alexei/tinyforge/internal/dns"
|
|
"github.com/alexei/tinyforge/internal/docker"
|
|
"github.com/alexei/tinyforge/internal/events"
|
|
"github.com/alexei/tinyforge/internal/health"
|
|
"github.com/alexei/tinyforge/internal/keyedmutex"
|
|
"github.com/alexei/tinyforge/internal/notify"
|
|
"github.com/alexei/tinyforge/internal/proxy"
|
|
"github.com/alexei/tinyforge/internal/store"
|
|
"github.com/alexei/tinyforge/internal/workload/plugin"
|
|
)
|
|
|
|
// Deployer owns the dependency bundle each Source plugin needs at deploy
|
|
// time. The plugin pipeline reaches in via PluginDeps(); see dispatch.go
|
|
// for the dispatch surface itself.
|
|
type Deployer struct {
|
|
docker *docker.Client
|
|
proxy proxy.Provider
|
|
store *store.Store
|
|
health *health.Checker
|
|
notifier *notify.Notifier
|
|
eventBus EventPublisher
|
|
backuper PreDeployBackuper // optional; nil disables pre-deploy backups
|
|
encKey [32]byte
|
|
dnsMu sync.RWMutex
|
|
dns dns.Provider // nil when wildcard DNS is active
|
|
|
|
// proxyMu protects hot-swap of d.proxy from runtime settings updates
|
|
// (SetProxyProvider) racing with PluginDeps() reads on the deploy path.
|
|
proxyMu sync.RWMutex
|
|
|
|
// Graceful shutdown: tracks in-progress deploys.
|
|
//
|
|
// drainMu serializes the "is-draining check + activeWg.Add(1)" in
|
|
// beginDispatch against the "set shuttingDown + Wait()" in Drain. Without
|
|
// it, a dispatch could pass the draining check, Drain could then flip the
|
|
// flag and start Wait() with a zero counter, and the dispatch could call
|
|
// Add(1) concurrently with Wait — a documented sync.WaitGroup misuse
|
|
// (panic risk) that also lets a deploy slip past the drain barrier.
|
|
drainMu sync.Mutex
|
|
activeWg sync.WaitGroup
|
|
shuttingDown atomic.Bool
|
|
|
|
// workloadLocks serializes deploy-class operations per workload id so two
|
|
// concurrent mutators of the same workload (a manual deploy, a webhook/
|
|
// trigger dispatch, a rollback, a promote, OR a volume-snapshot restore)
|
|
// can never interleave their container/volume changes. Every deploy
|
|
// entrypoint funnels through DispatchPlugin, so locking there gates them
|
|
// all at one choke point. This is the per-workload lock activeWg is NOT
|
|
// (activeWg is a global drain barrier for graceful shutdown).
|
|
workloadLocks keyedmutex.Mutex
|
|
}
|
|
|
|
// LockWorkload acquires the per-workload deploy lock for an external critical
|
|
// section (volume-snapshot restore) and returns the release func. The restore
|
|
// flow holds this across stop→swap→redeploy and redeploys via RedeployLocked
|
|
// (which does NOT re-acquire it).
|
|
func (d *Deployer) LockWorkload(id string) func() { return d.workloadLocks.Lock(id) }
|
|
|
|
// RedeployLocked re-dispatches w WITHOUT acquiring the per-workload lock,
|
|
// because the caller (restore) already holds it via LockWorkload. Calling the
|
|
// normal DispatchPlugin here would deadlock — Go mutexes are not reentrant.
|
|
// Not for general use.
|
|
func (d *Deployer) RedeployLocked(ctx context.Context, w plugin.Workload, intent plugin.DeploymentIntent) error {
|
|
return d.dispatchLocked(ctx, w, intent)
|
|
}
|
|
|
|
// EventPublisher is the interface for publishing events to the event bus.
|
|
type EventPublisher interface {
|
|
Publish(evt events.Event)
|
|
}
|
|
|
|
// PreDeployBackuper takes a "pre-deploy" Tinyforge DB snapshot before any
|
|
// deploy starts when the corresponding setting is enabled. Kept as a small
|
|
// interface so the deployer does not import internal/backup.
|
|
type PreDeployBackuper interface {
|
|
CreateBackup(backupType string) (store.Backup, error)
|
|
}
|
|
|
|
// New creates a new Deployer with all required dependencies.
|
|
func New(
|
|
dockerClient *docker.Client,
|
|
proxyProvider proxy.Provider,
|
|
st *store.Store,
|
|
checker *health.Checker,
|
|
notifier *notify.Notifier,
|
|
eventBus EventPublisher,
|
|
encKey [32]byte,
|
|
) *Deployer {
|
|
return &Deployer{
|
|
docker: dockerClient,
|
|
proxy: proxyProvider,
|
|
store: st,
|
|
health: checker,
|
|
notifier: notifier,
|
|
eventBus: eventBus,
|
|
encKey: encKey,
|
|
}
|
|
}
|
|
|
|
// SetProxyProvider updates the proxy provider at runtime (e.g., when settings change).
|
|
// Guarded by proxyMu so concurrent deploys that read d.proxy via PluginDeps()
|
|
// observe a coherent value (previously a torn-pointer race under -race).
|
|
func (d *Deployer) SetProxyProvider(provider proxy.Provider) {
|
|
d.proxyMu.Lock()
|
|
defer d.proxyMu.Unlock()
|
|
d.proxy = provider
|
|
}
|
|
|
|
// SetPreDeployBackuper wires the backup engine in after construction so the
|
|
// deployer can take a Tinyforge DB snapshot when the
|
|
// auto_backup_before_deploy setting is enabled. Pass nil to disable.
|
|
func (d *Deployer) SetPreDeployBackuper(b PreDeployBackuper) {
|
|
d.backuper = b
|
|
}
|
|
|
|
// maybeBackupBeforeDeploy takes a "pre-deploy" Tinyforge DB snapshot before a
|
|
// deploy when the operator enabled auto_backup_before_deploy. It is called on
|
|
// the unified deploy path (DispatchPlugin) so the setting actually fires — its
|
|
// predecessor was orphaned when the legacy executeDeploy pipeline (its only
|
|
// caller) was removed in the workload-first cutover, silently disabling the
|
|
// setting.
|
|
//
|
|
// Fail-open: a nil backuper, a settings-load error, or a backup failure all
|
|
// skip the snapshot without blocking the deploy — missing a backup is
|
|
// preferable to refusing to ship a fix.
|
|
func (d *Deployer) maybeBackupBeforeDeploy(workloadID string) {
|
|
if d.backuper == nil {
|
|
return
|
|
}
|
|
settings, err := d.store.GetSettings()
|
|
if err != nil {
|
|
slog.Warn("pre-deploy backup: load settings", "workload", workloadID, "error", err)
|
|
return
|
|
}
|
|
if !settings.AutoBackupBeforeDeploy {
|
|
return
|
|
}
|
|
backup, err := d.backuper.CreateBackup("pre-deploy")
|
|
if err != nil {
|
|
slog.Warn("pre-deploy backup failed", "workload", workloadID, "error", err)
|
|
return
|
|
}
|
|
slog.Info("pre-deploy backup created", "workload", workloadID, "backup_id", backup.ID, "filename", backup.Filename)
|
|
}
|
|
|
|
// SetDNSProvider sets the DNS provider for managing DNS records during deployments.
|
|
// Pass nil to disable DNS management (wildcard DNS mode).
|
|
func (d *Deployer) SetDNSProvider(provider dns.Provider) {
|
|
d.dnsMu.Lock()
|
|
defer d.dnsMu.Unlock()
|
|
d.dns = provider
|
|
}
|
|
|
|
// Drain waits for all in-progress deploys to complete. Call this during graceful shutdown.
|
|
func (d *Deployer) Drain() {
|
|
d.drainMu.Lock()
|
|
already := d.shuttingDown.Swap(true)
|
|
d.drainMu.Unlock()
|
|
if already {
|
|
slog.Info("deployer: drain already in progress")
|
|
}
|
|
slog.Info("deployer: draining in-progress deploys")
|
|
d.activeWg.Wait()
|
|
slog.Info("deployer: all deploys drained")
|
|
}
|
|
|
|
// ShuttingDown reports whether Drain() has been called.
|
|
func (d *Deployer) ShuttingDown() bool { return d.shuttingDown.Load() }
|
|
|
|
// beginDispatch atomically rejects when draining and otherwise registers the
|
|
// in-flight unit on activeWg. The shuttingDown check and the Add(1) MUST be
|
|
// done together under drainMu (see the field comment): Drain sets the flag
|
|
// under the same mutex before Wait(), so once Wait() observes a zero counter
|
|
// no further Add can race it. Callers must defer d.activeWg.Done() on success.
|
|
func (d *Deployer) beginDispatch() error {
|
|
d.drainMu.Lock()
|
|
defer d.drainMu.Unlock()
|
|
if d.shuttingDown.Load() {
|
|
return fmt.Errorf("deployer is shutting down, rejecting new deploy")
|
|
}
|
|
d.activeWg.Add(1)
|
|
return nil
|
|
}
|