feat(deployer): configurable per-workload deploy strategy (blue-green for built sources)

Add a deploy_strategy field to each source's config blob — "" (default),
"recreate", or "blue-green" — validated in each source's Validate and read on
the deploy path. No new DB column, no migration: the field rides inside the
existing SourceConfig JSON and every existing workload decodes "" to its
historical behavior (image -> blue-green, others -> recreate).

The real gap this closes: the dockerfile and static sources stopped the old
container before creating the new one on every redeploy — a downtime window
the image source never had. Their blue-green branch now:
- names the new "green" container with a unique suffix so it coexists with the
  still-serving blue (plumbed into both the container name AND the proxy
  forwardHost);
- skips the collision teardown that destroyed blue early;
- gates green — an HTTP readiness probe (deps.Health.Check) when a healthcheck
  is configured, else the existing liveness window;
- swaps the route via a pure upsert (no pre-DeleteRoute) so NPM repoints in
  place with no gap;
- persists green into the single runtime-state row BEFORE reaping blue, so a
  crash mid-swap can never orphan green or leave the row pointing at a removed
  container (state.go/teardown.go/reconcile.go stay untouched).

image honors explicit "recreate" (reap existing containers after pull, before
cutover); its default blue-green path is unchanged. compose stays
stack-managed and rejects "blue-green" at Validate so the contract is honest.
static forces recreate for storage-backed deno sites — blue-green would mount
the same RW volume into both containers at once.

Shared helper internal/workload/plugin/strategy.go (ValidateStrategy +
BuildGreenName). Backend-only (phase 1); the field is usable today via the
app's advanced-JSON editor — a friendly toggle + i18n follow in phase 2.
Tests: ValidateStrategy matrix, per-source Validate (incl. the empty-key
backward-compat lock), and effectiveStrategy defaults + the deno gate. Design
+ adversarial review: docs/plans/DEPLOY_STRATEGY_PLAN.md.
This commit is contained in:
2026-06-19 16:51:20 +03:00
parent 0c4c338bfe
commit e3d140c57a
13 changed files with 592 additions and 12 deletions
@@ -48,6 +48,11 @@ func deploy(ctx context.Context, deps plugin.Deps, w plugin.Workload, intent plu
return fmt.Errorf("dockerfile source: decode config: %w", err)
}
// bg selects the zero-downtime path: a unique green name so the new
// container coexists with the still-serving blue, an in-place route
// upsert, and blue reaped only AFTER green is persisted + routed.
bg := effectiveStrategy(cfg) == plugin.StrategyBlueGreen
prev, prevContainer, err := loadState(deps, w)
if err != nil {
return err
@@ -224,6 +229,13 @@ func deploy(ctx context.Context, deps plugin.Deps, w plugin.Workload, intent plu
}
containerName := containerNameFor(w)
if bg {
// Unique green name so the new container coexists with the still-
// serving blue one — the deterministic name would collide on
// Docker's per-daemon unique-name constraint. This name is also the
// proxy forwardHost below, so green receives traffic after cutover.
containerName = plugin.BuildGreenName(containerName, time.Now())
}
// Per-face proxy labels (Traefik consumes these; NPM ignores them).
labels := map[string]string{}
@@ -254,8 +266,16 @@ func deploy(ctx context.Context, deps plugin.Deps, w plugin.Workload, intent plu
containerID, err := deps.Docker.CreateContainer(ctx, cc)
if err != nil {
// Name conflict — best-effort cleanup of any prior container
// (by ID first; by name as a fallback) and one retry.
if bg {
// Green has a unique name, so this is a genuine create failure, not
// a name conflict — must NOT remove the still-serving blue.
updateStatus(deps, w, "failed", latestSHA,
sanitizeError(fmt.Sprintf("create container: %v", err), token))
return fmt.Errorf("create container: %w", err)
}
// recreate: the deterministic name may still be held by the prior
// container — best-effort cleanup (by ID first; by name fallback) and
// one retry. This is the recreate downtime window.
if prevContainerID != "" {
deps.Docker.StopContainer(ctx, prevContainerID, 10)
deps.Docker.RemoveContainer(ctx, prevContainerID, true)
@@ -308,6 +328,22 @@ func deploy(ctx context.Context, deps plugin.Deps, w plugin.Workload, intent plu
return fmt.Errorf("container not running: %s", logMsg)
}
// Blue-green readiness gate: the 3s window above only proves green did not
// crash, not that it is SERVING. Before swapping the route, probe green's
// healthcheck over the network (when configured) so traffic never flips to
// a not-yet-listening container. On failure, remove green and leave blue +
// its route untouched — a non-disruptive rollback. recreate skips this (it
// already removed blue, so there is no live fallback to protect).
if bg && cfg.Healthcheck != "" && deps.Health != nil {
healthURL := fmt.Sprintf("http://%s:%d%s", containerName, cfg.Port, cfg.Healthcheck)
if herr := deps.Health.Check(ctx, healthURL); herr != nil {
deps.Docker.RemoveContainer(ctx, containerID, true)
updateStatus(deps, w, "failed", latestSHA,
sanitizeError(fmt.Sprintf("readiness check %s: %v", cfg.Healthcheck, herr), token))
return fmt.Errorf("readiness check failed: %w", herr)
}
}
// Resolve proxy target: in-network DNS by default, NPM-remote
// override uses (settings.ServerIP, hostPort).
forwardHost := containerName
@@ -329,7 +365,12 @@ func deploy(ctx context.Context, deps plugin.Deps, w plugin.Workload, intent plu
// in-place so traffic shifts atomically over to the new container.
proxyRouteID := prevProxyRouteID
if domain != "" {
if prevProxyRouteID != "" {
// Blue-green relies on ConfigureRoute being an upsert-by-FQDN (NPM
// finds the host by domain and repoints it in place, gap-free), so we
// must NOT delete blue's route first — that would open a window.
// recreate already removed blue, so the pre-delete is harmless there
// but kept to preserve its exact prior behavior.
if !bg && prevProxyRouteID != "" {
deps.Proxy.DeleteRoute(ctx, prevProxyRouteID)
}
routeID, rerr := deps.Proxy.ConfigureRoute(ctx, domain, forwardHost, forwardPort, proxy.RouteOptions{
@@ -347,10 +388,12 @@ func deploy(ctx context.Context, deps plugin.Deps, w plugin.Workload, intent plu
}
}
// Drop the previous container only after the new one is healthy
// + routed. Different-ID-than-previous tells us we created a
// fresh one (vs returning the same ID via UpsertContainer reuse).
if prevContainerID != "" && prevContainerID != containerID {
// recreate: drop the previous container now that the new one is healthy +
// routed. Blue-green DEFERS this until AFTER saveState (below) so the
// persisted single row always points at a running container — a crash
// between cutover and saveState must not orphan green or leave the row
// pointing at a reaped blue (which the reconciler would then flag failed).
if !bg && prevContainerID != "" && prevContainerID != containerID {
deps.Docker.StopContainer(ctx, prevContainerID, 10)
deps.Docker.RemoveContainer(ctx, prevContainerID, true)
}
@@ -384,6 +427,14 @@ func deploy(ctx context.Context, deps plugin.Deps, w plugin.Workload, intent plu
return fmt.Errorf("persist deploy state: %w", err)
}
// Blue-green: green is now persisted in the single row AND serving behind
// the swapped route — only now is it safe to reap blue. (recreate already
// removed blue before saveState.)
if bg && prevContainerID != "" && prevContainerID != containerID {
deps.Docker.StopContainer(ctx, prevContainerID, 10)
deps.Docker.RemoveContainer(ctx, prevContainerID, true)
}
publishEvent(deps, w, "deployed")
dispatchBuildNotification(deps, w, domain, "deployed", "")
@@ -64,6 +64,23 @@ type Config struct {
// git provider as a commit status (pending/success/failure) on the
// built SHA. Best-effort — a reporting failure never fails a deploy.
ReportCommitStatus bool `json:"report_commit_status"`
// DeployStrategy selects how a redeploy cuts over. "" (default) and
// "recreate" stop the old container before starting the new one (a brief
// downtime window). "blue-green" starts the new build alongside the old,
// gates it, swaps the proxy route in place, then reaps the old —
// zero-downtime under NPM. Validated via plugin.ValidateStrategy.
DeployStrategy string `json:"deploy_strategy,omitempty"`
}
// effectiveStrategy returns the deploy strategy the dockerfile source should
// act on for cfg. An explicitly configured value is returned verbatim (no
// validation happens here); an unset value falls back to
// plugin.StrategyRecreate, the source's historical behavior, so workloads
// saved before the field existed keep deploying the way they always have.
func effectiveStrategy(cfg Config) string {
	if configured := cfg.DeployStrategy; configured != "" {
		return configured
	}
	return plugin.StrategyRecreate
}
type source struct{}
@@ -120,6 +137,9 @@ func (*source) Validate(cfg json.RawMessage) error {
return fmt.Errorf("dockerfile source: %q must not contain '..'", p)
}
}
if err := plugin.ValidateStrategy(c.DeployStrategy, true); err != nil {
return fmt.Errorf("dockerfile source: %w", err)
}
return nil
}
@@ -0,0 +1,49 @@
package dockerfile
import (
"encoding/json"
"testing"
"github.com/alexei/tinyforge/internal/workload/plugin"
)
// validCfg builds a minimal JSON config blob (repo owner, repo name and port)
// intended to satisfy the non-strategy validation checks, so that a test
// exercises only the deploy_strategy handling. The "deploy_strategy" key is
// emitted only for a non-empty strategy; passing "" yields a blob without the
// key at all.
func validCfg(strategy string) json.RawMessage {
	fields := map[string]any{
		"repo_owner": "o",
		"repo_name":  "r",
		"port":       8080,
	}
	if strategy != "" {
		fields["deploy_strategy"] = strategy
	}
	// The map holds only strings and an int, which always marshal cleanly,
	// so the error result is deliberately dropped.
	encoded, _ := json.Marshal(fields)
	return encoded
}
// TestValidate_Strategy runs the dockerfile source's Validate against each
// deploy_strategy value in the table, one subtest per value, and checks only
// whether an error is returned.
func TestValidate_Strategy(t *testing.T) {
	type strategyCase struct {
		strategy string
		wantErr  bool
	}
	table := []strategyCase{
		{strategy: "", wantErr: false}, // backward-compat: key absent -> valid
		{strategy: "recreate", wantErr: false},
		{strategy: "blue-green", wantErr: false}, // dockerfile supports blue-green
		{strategy: "rolling", wantErr: true},     // reserved, not yet implemented
		{strategy: "junk", wantErr: true},
	}
	for _, tc := range table {
		t.Run("strategy="+tc.strategy, func(t *testing.T) {
			err := (&source{}).Validate(validCfg(tc.strategy))
			gotErr := err != nil
			if gotErr != tc.wantErr {
				t.Fatalf("Validate(strategy=%q) err=%v, wantErr=%v", tc.strategy, err, tc.wantErr)
			}
		})
	}
}
// TestEffectiveStrategy_Default pins the two resolution rules of
// effectiveStrategy: a zero-value Config resolves to recreate, and an
// explicitly configured blue-green is passed through unchanged.
func TestEffectiveStrategy_Default(t *testing.T) {
	unset := effectiveStrategy(Config{})
	if unset != plugin.StrategyRecreate {
		t.Fatalf("empty strategy = %q, want recreate (historical default)", unset)
	}

	explicit := effectiveStrategy(Config{DeployStrategy: plugin.StrategyBlueGreen})
	if explicit != plugin.StrategyBlueGreen {
		t.Fatalf("explicit blue-green = %q, want blue-green", explicit)
	}
}