feat(deployer): configurable per-workload deploy strategy (blue-green for built sources)

Add a deploy_strategy field to each source's config blob — "" (default), "recreate", or "blue-green" — validated in each source's Validate and read on the deploy path. No new DB column, no migration: the field rides inside the existing SourceConfig JSON and every existing workload decodes "" to its historical behavior (image -> blue-green, others -> recreate).

The real gap this closes: dockerfile and static stopped the old container before creating the new one on every redeploy — a downtime window image never had. Their blue-green branch now:

- names the new "green" container with a unique suffix so it coexists with the still-serving blue (plumbed into both the container name AND the proxy forwardHost);
- skips the collision teardown that destroyed blue early;
- gates green — an HTTP readiness probe (deps.Health.Check) when a healthcheck is configured, else the existing liveness window;
- swaps the route via a pure upsert (no pre-DeleteRoute) so NPM repoints in place with no gap;
- persists green into the single runtime-state row BEFORE reaping blue, so a crash mid-swap can never orphan green or leave the row pointing at a removed container (state.go/teardown.go/reconcile.go stay untouched).

image honors explicit "recreate" (reap existing containers after pull, before cutover); its default blue-green path is unchanged. compose stays stack-managed and rejects "blue-green" at Validate so the contract is honest. static forces recreate for storage-backed deno sites — blue-green would mount the same RW volume into both containers at once.

Shared helper internal/workload/plugin/strategy.go (ValidateStrategy + BuildGreenName). Backend-only (phase 1); the field is usable today via the app's advanced-JSON editor — a friendly toggle + i18n follow in phase 2.

Tests: ValidateStrategy matrix, per-source Validate (incl. the empty-key backward-compat lock), and effectiveStrategy defaults + the deno gate. Design + adversarial review: docs/plans/DEPLOY_STRATEGY_PLAN.md.
2026-06-19 16:51:20 +03:00
parent 0c4c338bfe
commit e3d140c57a
13 changed files with 592 additions and 12 deletions
@@ -48,6 +48,11 @@ func deploy(ctx context.Context, deps plugin.Deps, w plugin.Workload, intent plu
 		return fmt.Errorf("dockerfile source: decode config: %w", err)
 	}

+	// bg selects the zero-downtime path: a unique green name so the new
+	// container coexists with the still-serving blue, an in-place route
+	// upsert, and blue reaped only AFTER green is persisted + routed.
+	bg := effectiveStrategy(cfg) == plugin.StrategyBlueGreen
+
 	prev, prevContainer, err := loadState(deps, w)
 	if err != nil {
 		return err
@@ -224,6 +229,13 @@ func deploy(ctx context.Context, deps plugin.Deps, w plugin.Workload, intent plu
 	}

 	containerName := containerNameFor(w)
+	if bg {
+		// Unique green name so the new container coexists with the still-
+		// serving blue one — the deterministic name would collide on
+		// Docker's per-daemon unique-name constraint. This name is also the
+		// proxy forwardHost below, so green receives traffic after cutover.
+		containerName = plugin.BuildGreenName(containerName, time.Now())
+	}

 	// Per-face proxy labels (Traefik consumes these; NPM ignores them).
 	labels := map[string]string{}
@@ -254,8 +266,16 @@ func deploy(ctx context.Context, deps plugin.Deps, w plugin.Workload, intent plu

 	containerID, err := deps.Docker.CreateContainer(ctx, cc)
 	if err != nil {
-		// Name conflict — best-effort cleanup of any prior container
-		// (by ID first; by name as a fallback) and one retry.
+		if bg {
+			// Green has a unique name, so this is a genuine create failure, not
+			// a name conflict — must NOT remove the still-serving blue.
+			updateStatus(deps, w, "failed", latestSHA,
+				sanitizeError(fmt.Sprintf("create container: %v", err), token))
+			return fmt.Errorf("create container: %w", err)
+		}
+		// recreate: the deterministic name may still be held by the prior
+		// container — best-effort cleanup (by ID first; by name fallback) and
+		// one retry. This is the recreate downtime window.
 		if prevContainerID != "" {
 			deps.Docker.StopContainer(ctx, prevContainerID, 10)
 			deps.Docker.RemoveContainer(ctx, prevContainerID, true)
@@ -308,6 +328,22 @@ func deploy(ctx context.Context, deps plugin.Deps, w plugin.Workload, intent plu
 		return fmt.Errorf("container not running: %s", logMsg)
 	}

+	// Blue-green readiness gate: the 3s window above only proves green did not
+	// crash, not that it is SERVING. Before swapping the route, probe green's
+	// healthcheck over the network (when configured) so traffic never flips to
+	// a not-yet-listening container. On failure, remove green and leave blue +
+	// its route untouched — a non-disruptive rollback. recreate skips this (it
+	// already removed blue, so there is no live fallback to protect).
+	if bg && cfg.Healthcheck != "" && deps.Health != nil {
+		healthURL := fmt.Sprintf("http://%s:%d%s", containerName, cfg.Port, cfg.Healthcheck)
+		if herr := deps.Health.Check(ctx, healthURL); herr != nil {
+			deps.Docker.RemoveContainer(ctx, containerID, true)
+			updateStatus(deps, w, "failed", latestSHA,
+				sanitizeError(fmt.Sprintf("readiness check %s: %v", cfg.Healthcheck, herr), token))
+			return fmt.Errorf("readiness check failed: %w", herr)
+		}
+	}
+
 	// Resolve proxy target: in-network DNS by default, NPM-remote
 	// override uses (settings.ServerIP, hostPort).
 	forwardHost := containerName
@@ -329,7 +365,12 @@ func deploy(ctx context.Context, deps plugin.Deps, w plugin.Workload, intent plu
 	// in-place so traffic shifts atomically over to the new container.
 	proxyRouteID := prevProxyRouteID
 	if domain != "" {
-		if prevProxyRouteID != "" {
+		// Blue-green relies on ConfigureRoute being an upsert-by-FQDN (NPM
+		// finds the host by domain and repoints it in place, gap-free), so we
+		// must NOT delete blue's route first — that would open a window.
+		// recreate already removed blue, so the pre-delete is harmless there
+		// but kept to preserve its exact prior behavior.
+		if !bg && prevProxyRouteID != "" {
 			deps.Proxy.DeleteRoute(ctx, prevProxyRouteID)
 		}
 		routeID, rerr := deps.Proxy.ConfigureRoute(ctx, domain, forwardHost, forwardPort, proxy.RouteOptions{
@@ -347,10 +388,12 @@ func deploy(ctx context.Context, deps plugin.Deps, w plugin.Workload, intent plu
 		}
 	}

-	// Drop the previous container only after the new one is healthy
-	// + routed. Different-ID-than-previous tells us we created a
-	// fresh one (vs returning the same ID via UpsertContainer reuse).
-	if prevContainerID != "" && prevContainerID != containerID {
+	// recreate: drop the previous container now that the new one is healthy +
+	// routed. Blue-green DEFERS this until AFTER saveState (below) so the
+	// persisted single row always points at a running container — a crash
+	// between cutover and saveState must not orphan green or leave the row
+	// pointing at a reaped blue (which the reconciler would then flag failed).
+	if !bg && prevContainerID != "" && prevContainerID != containerID {
 		deps.Docker.StopContainer(ctx, prevContainerID, 10)
 		deps.Docker.RemoveContainer(ctx, prevContainerID, true)
 	}
@@ -384,6 +427,14 @@ func deploy(ctx context.Context, deps plugin.Deps, w plugin.Workload, intent plu
 		return fmt.Errorf("persist deploy state: %w", err)
 	}

+	// Blue-green: green is now persisted in the single row AND serving behind
+	// the swapped route — only now is it safe to reap blue. (recreate already
+	// removed blue before saveState.)
+	if bg && prevContainerID != "" && prevContainerID != containerID {
+		deps.Docker.StopContainer(ctx, prevContainerID, 10)
+		deps.Docker.RemoveContainer(ctx, prevContainerID, true)
+	}
+
 	publishEvent(deps, w, "deployed")
 	dispatchBuildNotification(deps, w, domain, "deployed", "")