feat(docker-watcher): phase 12 - hardening

Blue-green zero-downtime deploys, promote flow validation.
Dual auth: local (bcrypt + JWT) and OAuth2/OIDC (any provider).
Auth middleware, login page, auth settings UI.
Structured logging (slog JSON), config export to YAML.
Graceful shutdown with deploy draining.
Multi-stage Dockerfile and production docker-compose.yml.
Swap phase order: Volumes & Env before UI Polish.
This commit is contained in:
2026-03-27 23:20:56 +03:00
parent 5558396bb7
commit 32de5b26a8
30 changed files with 2134 additions and 143 deletions
+173
View File
@@ -0,0 +1,173 @@
package deployer
import (
"context"
"fmt"
"log/slog"
"github.com/alexei/docker-watcher/internal/docker"
"github.com/alexei/docker-watcher/internal/store"
"github.com/google/uuid"
)
// blueGreenDeploy performs a zero-downtime deployment of imageTag to a stage.
// The currently running instance ("blue") keeps serving until the new
// container ("green") has been started, checked and wired into the proxy:
//
//  1. Pull the image.
//  2. Ensure the Docker network exists.
//  3. Create and start the green container and its instance record.
//  4. Health check green (only when project.Healthcheck is set).
//  5. Point the NPM proxy at green.
//  6. Remove the blue instance, if one was running.
//
// It returns the green container ID, the NPM proxy host ID and the green
// instance ID. On error, whatever was created so far is returned together with
// the error so the caller can clean up; this function itself never touches
// blue before the proxy has been switched to green. A failure to remove blue
// in step 6 is logged as a warning and does not fail the deploy.
func (d *Deployer) blueGreenDeploy(
	ctx context.Context,
	project store.Project,
	stage store.Stage,
	settings store.Settings,
	deployID string,
	imageTag string,
) (string, int, string, error) {
	// setStatus persists the deploy phase and publishes it to listeners.
	// A failed store update is only logged: status tracking must not abort
	// the deploy itself.
	setStatus := func(status string) {
		if err := d.store.UpdateDeployStatus(deployID, status, ""); err != nil {
			slog.Warn("update deploy status", "error", err)
		}
		d.publishDeployStatus(deployID, project.ID, stage.ID, imageTag, status, "")
	}

	// Find existing running instance for this stage (the "blue" instance).
	// This happens before green is created, so green can never be picked.
	existingInstances, err := d.store.GetInstancesByStageID(stage.ID)
	if err != nil {
		return "", 0, "", fmt.Errorf("get existing instances: %w", err)
	}

	var blueInstance *store.Instance
	for _, inst := range existingInstances {
		if inst.Status == "running" {
			// Copy so the pointer does not alias the loop variable.
			instCopy := inst
			blueInstance = &instCopy
			break
		}
	}

	// Step 1: Pull image.
	setStatus("pulling")
	d.logDeploy(deployID, fmt.Sprintf("Blue-green: pulling image %s:%s", project.Image, imageTag), "info")

	authConfig, err := d.buildRegistryAuth(project)
	if err != nil {
		return "", 0, "", fmt.Errorf("build registry auth: %w", err)
	}
	if err := d.docker.PullImage(ctx, project.Image, imageTag, authConfig); err != nil {
		return "", 0, "", fmt.Errorf("pull image: %w", err)
	}
	d.logDeploy(deployID, "Image pulled successfully", "info")

	// Step 2: Ensure network.
	networkID, err := d.docker.EnsureNetwork(ctx, settings.Network)
	if err != nil {
		return "", 0, "", fmt.Errorf("ensure network: %w", err)
	}

	// Step 3: Create and start green container.
	setStatus("starting")

	instanceID := uuid.New().String()
	subdomain := d.buildSubdomain(project, stage, settings, imageTag)
	containerName := docker.ContainerName(project.Name, stage.Name, imageTag)
	portStr := fmt.Sprintf("%d/tcp", project.Port)
	envVars := d.parseEnvVars(project.Env)

	containerCfg := docker.ContainerConfig{
		Name:         containerName,
		Image:        project.Image + ":" + imageTag,
		Env:          envVars,
		ExposedPorts: []string{portStr},
		NetworkName:  settings.Network,
		NetworkID:    networkID,
		Project:      project.Name,
		Stage:        stage.Name,
		InstanceID:   instanceID,
	}

	d.logDeploy(deployID, fmt.Sprintf("Blue-green: creating green container %s", containerName), "info")
	containerID, err := d.docker.CreateContainer(ctx, containerCfg)
	if err != nil {
		return "", 0, instanceID, fmt.Errorf("create container: %w", err)
	}

	// Create instance record. It starts as "stopped" and is flipped to
	// "running" only after the container has actually been started.
	inst, err := d.store.CreateInstanceWithID(store.Instance{
		ID:          instanceID,
		StageID:     stage.ID,
		ProjectID:   project.ID,
		ContainerID: containerID,
		ImageTag:    imageTag,
		Subdomain:   subdomain,
		Status:      "stopped",
		Port:        project.Port,
	})
	if err != nil {
		return containerID, 0, instanceID, fmt.Errorf("create instance record: %w", err)
	}
	instanceID = inst.ID

	if err := d.store.SetDeployInstanceID(deployID, instanceID); err != nil {
		slog.Warn("link deploy to instance", "error", err)
	}

	d.logDeploy(deployID, fmt.Sprintf("Blue-green: starting green container %s", containerName), "info")
	if err := d.docker.StartContainer(ctx, containerID); err != nil {
		return containerID, 0, instanceID, fmt.Errorf("start container: %w", err)
	}

	if err := d.store.UpdateInstanceStatus(instanceID, "running"); err != nil {
		slog.Warn("update instance status", "error", err)
	}
	d.publishInstanceStatus(instanceID, project.ID, stage.ID, "running")

	// Step 4: Health check the green container. Unlike the standard pipeline
	// this runs BEFORE the proxy is touched, so traffic only moves to a
	// container that has already answered its health endpoint.
	if project.Healthcheck != "" {
		setStatus("health_checking")

		healthURL := fmt.Sprintf("http://%s:%d%s", containerName, project.Port, project.Healthcheck)
		d.logDeploy(deployID, fmt.Sprintf("Blue-green: health checking green at %s", healthURL), "info")

		if err := d.health.Check(ctx, healthURL); err != nil {
			return containerID, 0, instanceID, fmt.Errorf("health check green: %w", err)
		}
		d.logDeploy(deployID, "Blue-green: green health check passed", "info")
	}

	// Step 5: Swap NPM proxy to green.
	setStatus("configuring_proxy")

	npmProxyID, err := d.configureProxy(ctx, deployID, settings, containerName, project.Port, subdomain)
	if err != nil {
		return containerID, 0, instanceID, fmt.Errorf("configure proxy: %w", err)
	}

	inst.NpmProxyID = npmProxyID
	inst.Subdomain = subdomain
	if err := d.store.UpdateInstance(inst); err != nil {
		slog.Warn("update instance with proxy ID", "error", err)
	}
	d.logDeploy(deployID, "Blue-green: proxy swapped to green container", "info")

	// Step 6: Stop the blue container.
	if blueInstance != nil {
		d.logDeploy(deployID, fmt.Sprintf("Blue-green: stopping blue instance %s (tag: %s)", blueInstance.ID, blueInstance.ImageTag), "info")

		// removeInstance deletes the proxy host referenced by the instance it
		// is given. If the swap reused blue's proxy host (green ended up with
		// the same proxy ID), deleting it would cut traffic to green, so the
		// shared ID is detached from the local blue copy first.
		// NOTE(review): whether configureProxy reuses an existing host is not
		// visible here; the guard is a no-op when the IDs differ.
		if blueInstance.NpmProxyID == npmProxyID {
			blueInstance.NpmProxyID = 0
		}

		if err := d.removeInstance(ctx, *blueInstance, settings); err != nil {
			// Non-fatal: log but continue. Green is already serving traffic.
			d.logDeploy(deployID, fmt.Sprintf("Blue-green: warning: failed to remove blue instance: %v", err), "warn")
		} else {
			d.logDeploy(deployID, "Blue-green: blue instance removed", "info")
		}
	}

	return containerID, npmProxyID, instanceID, nil
}
+60 -18
View File
@@ -4,8 +4,10 @@ import (
"context"
"encoding/json"
"fmt"
"log"
"log/slog"
"sort"
"sync"
"sync/atomic"
"github.com/alexei/docker-watcher/internal/crypto"
"github.com/alexei/docker-watcher/internal/docker"
@@ -28,6 +30,10 @@ type Deployer struct {
notifier *notify.Notifier
eventBus EventPublisher
encKey [32]byte
// Graceful shutdown: tracks in-progress deploys.
activeWg sync.WaitGroup
shuttingDown atomic.Bool
}
// EventPublisher is the interface for publishing events to the event bus.
@@ -56,10 +62,25 @@ func New(
}
}
// Drain waits for all in-progress deploys to complete. Call this during graceful shutdown.
//
// It first sets shuttingDown so that TriggerDeploy rejects new deploys, then
// blocks on activeWg until every deploy that has already registered itself
// has finished. There is no timeout: Drain returns only once the WaitGroup
// counter reaches zero.
//
// NOTE(review): TriggerDeploy checks shuttingDown and calls activeWg.Add(1) as
// two separate steps, so a deploy that passed the check just before the flag
// was set could still register after Wait has returned — confirm whether that
// window matters for shutdown.
func (d *Deployer) Drain() {
// Set the flag before waiting so no new deploys are accepted while draining.
d.shuttingDown.Store(true)
slog.Info("deployer: draining in-progress deploys")
d.activeWg.Wait()
slog.Info("deployer: all deploys drained")
}
// TriggerDeploy is the main entry point for deployments. It orchestrates the full flow:
// pull image -> create container -> start -> configure proxy -> health check.
// On failure, it rolls back (removes container, deletes proxy host, updates status).
func (d *Deployer) TriggerDeploy(ctx context.Context, projectID, stageID, imageTag string) error {
if d.shuttingDown.Load() {
return fmt.Errorf("deployer is shutting down, rejecting new deploy")
}
d.activeWg.Add(1)
defer d.activeWg.Done()
// Load project and stage from store.
project, err := d.store.GetProjectByID(projectID)
if err != nil {
@@ -71,6 +92,11 @@ func (d *Deployer) TriggerDeploy(ctx context.Context, projectID, stageID, imageT
return fmt.Errorf("get stage: %w", err)
}
// Validate promote_from constraint.
if err := d.validatePromoteFrom(stage, imageTag); err != nil {
return fmt.Errorf("promote validation: %w", err)
}
settings, err := d.store.GetSettings()
if err != nil {
return fmt.Errorf("get settings: %w", err)
@@ -87,6 +113,12 @@ func (d *Deployer) TriggerDeploy(ctx context.Context, projectID, stageID, imageT
return fmt.Errorf("create deploy record: %w", err)
}
slog.Info("starting deploy",
"deploy_id", deploy.ID,
"project", project.Name,
"stage", stage.Name,
"tag", imageTag,
)
d.logDeploy(deploy.ID, fmt.Sprintf("Starting deploy of %s:%s for project %s, stage %s", project.Image, imageTag, project.Name, stage.Name), "info")
// Enforce max_instances before deploying.
@@ -95,8 +127,18 @@ func (d *Deployer) TriggerDeploy(ctx context.Context, projectID, stageID, imageT
// Non-fatal: continue with deploy.
}
// Execute the deploy pipeline. Track state for rollback.
containerID, npmProxyID, instanceID, deployErr := d.executeDeploy(ctx, project, stage, settings, deploy.ID, imageTag)
// Choose deploy strategy: blue-green whenever the stage has max_instances=1; blueGreenDeploy itself copes with there being no existing running instance.
var containerID string
var npmProxyID int
var instanceID string
var deployErr error
if stage.MaxInstances == 1 {
containerID, npmProxyID, instanceID, deployErr = d.blueGreenDeploy(ctx, project, stage, settings, deploy.ID, imageTag)
} else {
// Execute the standard deploy pipeline. Track state for rollback.
containerID, npmProxyID, instanceID, deployErr = d.executeDeploy(ctx, project, stage, settings, deploy.ID, imageTag)
}
if deployErr != nil {
d.logDeploy(deploy.ID, fmt.Sprintf("Deploy failed: %v", deployErr), "error")
@@ -116,7 +158,7 @@ func (d *Deployer) TriggerDeploy(ctx context.Context, projectID, stageID, imageT
// Mark deploy as successful.
if err := d.store.UpdateDeployStatus(deploy.ID, "success", ""); err != nil {
log.Printf("deployer: update deploy status to success: %v", err)
slog.Warn("update deploy status to success", "error", err)
}
d.publishDeployStatus(deploy.ID, projectID, stageID, imageTag, "success", "")
@@ -153,7 +195,7 @@ func (d *Deployer) executeDeploy(
// Step 1: Pull image.
if err := d.store.UpdateDeployStatus(deployID, "pulling", ""); err != nil {
log.Printf("deployer: update deploy status: %v", err)
slog.Warn("update deploy status", "error", err)
}
d.publishDeployStatus(deployID, project.ID, stage.ID, imageTag, "pulling", "")
d.logDeploy(deployID, fmt.Sprintf("Pulling image %s:%s", project.Image, imageTag), "info")
@@ -177,7 +219,7 @@ func (d *Deployer) executeDeploy(
// Step 3: Create and start container.
if err := d.store.UpdateDeployStatus(deployID, "starting", ""); err != nil {
log.Printf("deployer: update deploy status: %v", err)
slog.Warn("update deploy status", "error", err)
}
d.publishDeployStatus(deployID, project.ID, stage.ID, imageTag, "starting", "")
@@ -226,7 +268,7 @@ func (d *Deployer) executeDeploy(
// Link deploy to instance.
if err := d.store.SetDeployInstanceID(deployID, instanceID); err != nil {
log.Printf("deployer: link deploy to instance: %v", err)
slog.Warn("link deploy to instance", "error", err)
}
d.logDeploy(deployID, fmt.Sprintf("Starting container %s", containerName), "info")
@@ -235,14 +277,14 @@ func (d *Deployer) executeDeploy(
}
if err := d.store.UpdateInstanceStatus(instanceID, "running"); err != nil {
log.Printf("deployer: update instance status to running: %v", err)
slog.Warn("update instance status to running", "error", err)
}
d.publishInstanceStatus(instanceID, project.ID, stage.ID, "running")
d.logDeploy(deployID, "Container started", "info")
// Step 4: Configure NPM proxy.
if err := d.store.UpdateDeployStatus(deployID, "configuring_proxy", ""); err != nil {
log.Printf("deployer: update deploy status: %v", err)
slog.Warn("update deploy status", "error", err)
}
d.publishDeployStatus(deployID, project.ID, stage.ID, imageTag, "configuring_proxy", "")
@@ -255,13 +297,13 @@ func (d *Deployer) executeDeploy(
inst.NpmProxyID = npmProxyID
inst.Subdomain = subdomain
if err := d.store.UpdateInstance(inst); err != nil {
log.Printf("deployer: update instance with proxy ID: %v", err)
slog.Warn("update instance with proxy ID", "error", err)
}
// Step 5: Health check.
if project.Healthcheck != "" {
if err := d.store.UpdateDeployStatus(deployID, "health_checking", ""); err != nil {
log.Printf("deployer: update deploy status: %v", err)
slog.Warn("update deploy status", "error", err)
}
d.publishDeployStatus(deployID, project.ID, stage.ID, imageTag, "health_checking", "")
@@ -390,13 +432,13 @@ func (d *Deployer) enforceMaxInstances(ctx context.Context, stage store.Stage, d
func (d *Deployer) removeInstance(ctx context.Context, inst store.Instance, settings store.Settings) error {
// Mark as removing.
if err := d.store.UpdateInstanceStatus(inst.ID, "removing"); err != nil {
log.Printf("deployer: update instance %s status to removing: %v", inst.ID, err)
slog.Warn("update instance status to removing", "instance_id", inst.ID, "error", err)
}
// Remove Docker container.
if inst.ContainerID != "" {
if err := d.docker.RemoveContainer(ctx, inst.ContainerID, true); err != nil {
log.Printf("deployer: remove container %s: %v", inst.ContainerID, err)
slog.Warn("remove container", "container_id", inst.ContainerID, "error", err)
}
}
@@ -404,11 +446,11 @@ func (d *Deployer) removeInstance(ctx context.Context, inst store.Instance, sett
if inst.NpmProxyID > 0 {
npmPassword, err := d.decryptNpmPassword(settings.NpmPassword)
if err != nil {
log.Printf("deployer: decrypt npm password for proxy cleanup: %v", err)
slog.Warn("decrypt npm password for proxy cleanup", "error", err)
} else if authErr := d.npm.Authenticate(ctx, settings.NpmEmail, npmPassword); authErr != nil {
log.Printf("deployer: authenticate npm for proxy cleanup: %v", authErr)
slog.Warn("authenticate npm for proxy cleanup", "error", authErr)
} else if delErr := d.npm.DeleteProxyHost(ctx, inst.NpmProxyID); delErr != nil {
log.Printf("deployer: delete proxy host %d: %v", inst.NpmProxyID, delErr)
slog.Warn("delete proxy host", "proxy_id", inst.NpmProxyID, "error", delErr)
}
}
@@ -471,7 +513,7 @@ func (d *Deployer) parseEnvVars(envJSON string) []string {
var envMap map[string]string
if err := json.Unmarshal([]byte(envJSON), &envMap); err != nil {
log.Printf("deployer: parse env vars: %v", err)
slog.Warn("parse env vars", "error", err)
return nil
}
@@ -486,7 +528,7 @@ func (d *Deployer) parseEnvVars(envJSON string) []string {
// Errors are logged to stderr but not propagated.
func (d *Deployer) logDeploy(deployID, message, level string) {
if err := d.store.AppendDeployLog(deployID, message, level); err != nil {
log.Printf("deployer: append deploy log: %v", err)
slog.Warn("append deploy log", "error", err)
}
if d.eventBus != nil {
d.eventBus.Publish(events.Event{
+49
View File
@@ -0,0 +1,49 @@
package deployer
import (
"fmt"
"github.com/alexei/docker-watcher/internal/store"
)
// validatePromoteFrom enforces the stage's promote_from constraint: imageTag
// may only be deployed to stage if the stage named by stage.PromoteFrom (looked
// up by name among the stages of the same project) already has an instance
// with that tag whose status is "running" or "stopped".
//
// It returns nil when no promote_from is configured or when such an instance
// exists, and an error when the store lookups fail, the source stage does not
// exist, or no eligible instance is found.
func (d *Deployer) validatePromoteFrom(stage store.Stage, imageTag string) error {
	sourceName := stage.PromoteFrom
	if sourceName == "" {
		// No constraint configured: every tag is eligible.
		return nil
	}

	// Resolve the source stage by name within the same project.
	projectStages, err := d.store.GetStagesByProjectID(stage.ProjectID)
	if err != nil {
		return fmt.Errorf("get stages for project: %w", err)
	}

	sourceIdx := -1
	for i := range projectStages {
		if projectStages[i].Name == sourceName {
			sourceIdx = i
			break
		}
	}
	if sourceIdx < 0 {
		return fmt.Errorf("promote_from stage %q not found in project", sourceName)
	}

	// Look for an instance of the requested tag in the source stage.
	candidates, err := d.store.GetInstancesByStageID(projectStages[sourceIdx].ID)
	if err != nil {
		return fmt.Errorf("get instances for source stage: %w", err)
	}

	for i := range candidates {
		if candidates[i].ImageTag != imageTag {
			continue
		}
		switch candidates[i].Status {
		case "running", "stopped":
			// Tag is present in the source stage, promotion is allowed.
			return nil
		}
	}

	return fmt.Errorf("tag %q is not running in stage %q; promotion denied", imageTag, sourceName)
}
+8 -8
View File
@@ -3,7 +3,7 @@ package deployer
import (
"context"
"fmt"
"log"
"log/slog"
)
// rollback cleans up a failed deployment by removing the container,
@@ -15,7 +15,7 @@ func (d *Deployer) rollback(ctx context.Context, deployID string, containerID st
// Remove the container if it was created.
if containerID != "" {
if err := d.docker.RemoveContainer(ctx, containerID, true); err != nil {
log.Printf("rollback: remove container %s: %v", containerID, err)
slog.Warn("rollback: remove container", "container_id", containerID, "error", err)
d.logDeploy(deployID, fmt.Sprintf("Rollback: failed to remove container: %v", err), "error")
} else {
d.logDeploy(deployID, "Rollback: container removed", "info")
@@ -26,16 +26,16 @@ func (d *Deployer) rollback(ctx context.Context, deployID string, containerID st
if npmProxyID > 0 {
settings, err := d.store.GetSettings()
if err != nil {
log.Printf("rollback: get settings for npm auth: %v", err)
slog.Warn("rollback: get settings for npm auth", "error", err)
d.logDeploy(deployID, fmt.Sprintf("Rollback: failed to get settings for proxy cleanup: %v", err), "error")
} else if npmPassword, err := d.decryptNpmPassword(settings.NpmPassword); err != nil {
log.Printf("rollback: decrypt npm password: %v", err)
slog.Warn("rollback: decrypt npm password", "error", err)
d.logDeploy(deployID, "Rollback: failed to decrypt NPM password for proxy cleanup", "error")
} else if err := d.npm.Authenticate(ctx, settings.NpmEmail, npmPassword); err != nil {
log.Printf("rollback: authenticate npm: %v", err)
slog.Warn("rollback: authenticate npm", "error", err)
d.logDeploy(deployID, "Rollback: failed to authenticate NPM for proxy cleanup", "error")
} else if err := d.npm.DeleteProxyHost(ctx, npmProxyID); err != nil {
log.Printf("rollback: delete proxy host %d: %v", npmProxyID, err)
slog.Warn("rollback: delete proxy host", "proxy_id", npmProxyID, "error", err)
d.logDeploy(deployID, fmt.Sprintf("Rollback: failed to delete proxy host: %v", err), "error")
} else {
d.logDeploy(deployID, "Rollback: proxy host deleted", "info")
@@ -45,13 +45,13 @@ func (d *Deployer) rollback(ctx context.Context, deployID string, containerID st
// Update instance status to failed if it was created.
if instanceID != "" {
if err := d.store.UpdateInstanceStatus(instanceID, "failed"); err != nil {
log.Printf("rollback: update instance %s status: %v", instanceID, err)
slog.Warn("rollback: update instance status", "instance_id", instanceID, "error", err)
}
}
// Mark deploy as rolled back.
if err := d.store.UpdateDeployStatus(deployID, "rolled_back", "deployment failed, rolled back"); err != nil {
log.Printf("rollback: update deploy %s status: %v", deployID, err)
slog.Warn("rollback: update deploy status", "deploy_id", deployID, "error", err)
}
d.logDeploy(deployID, "Rollback complete", "info")