feat(docker-watcher): phase 12 - hardening

Blue-green zero-downtime deploys, promote flow validation.
Dual auth: local (bcrypt + JWT) and OAuth2/OIDC (any provider).
Auth middleware, login page, auth settings UI.
Structured logging (slog JSON), config export to YAML.
Graceful shutdown with deploy draining.
Multi-stage Dockerfile and production docker-compose.yml.
Swap phase order: Volumes & Env before UI Polish.
This commit is contained in:
2026-03-27 23:20:56 +03:00
parent 5558396bb7
commit 32de5b26a8
30 changed files with 2134 additions and 143 deletions
+60 -18
View File
@@ -4,8 +4,10 @@ import (
"context"
"encoding/json"
"fmt"
"log"
"log/slog"
"sort"
"sync"
"sync/atomic"
"github.com/alexei/docker-watcher/internal/crypto"
"github.com/alexei/docker-watcher/internal/docker"
@@ -28,6 +30,10 @@ type Deployer struct {
notifier *notify.Notifier
eventBus EventPublisher
encKey [32]byte
// Graceful shutdown: tracks in-progress deploys.
activeWg sync.WaitGroup
shuttingDown atomic.Bool
}
// EventPublisher is the interface for publishing events to the event bus.
@@ -56,10 +62,25 @@ func New(
}
}
// Drain waits for all in-progress deploys to complete. Call this during graceful shutdown.
func (d *Deployer) Drain() {
d.shuttingDown.Store(true)
slog.Info("deployer: draining in-progress deploys")
d.activeWg.Wait()
slog.Info("deployer: all deploys drained")
}
// TriggerDeploy is the main entry point for deployments. It orchestrates the full flow:
// pull image -> create container -> start -> configure proxy -> health check.
// On failure, it rolls back (removes container, deletes proxy host, updates status).
func (d *Deployer) TriggerDeploy(ctx context.Context, projectID, stageID, imageTag string) error {
if d.shuttingDown.Load() {
return fmt.Errorf("deployer is shutting down, rejecting new deploy")
}
d.activeWg.Add(1)
defer d.activeWg.Done()
// Load project and stage from store.
project, err := d.store.GetProjectByID(projectID)
if err != nil {
@@ -71,6 +92,11 @@ func (d *Deployer) TriggerDeploy(ctx context.Context, projectID, stageID, imageT
return fmt.Errorf("get stage: %w", err)
}
// Validate promote_from constraint.
if err := d.validatePromoteFrom(stage, imageTag); err != nil {
return fmt.Errorf("promote validation: %w", err)
}
settings, err := d.store.GetSettings()
if err != nil {
return fmt.Errorf("get settings: %w", err)
@@ -87,6 +113,12 @@ func (d *Deployer) TriggerDeploy(ctx context.Context, projectID, stageID, imageT
return fmt.Errorf("create deploy record: %w", err)
}
slog.Info("starting deploy",
"deploy_id", deploy.ID,
"project", project.Name,
"stage", stage.Name,
"tag", imageTag,
)
d.logDeploy(deploy.ID, fmt.Sprintf("Starting deploy of %s:%s for project %s, stage %s", project.Image, imageTag, project.Name, stage.Name), "info")
// Enforce max_instances before deploying.
@@ -95,8 +127,18 @@ func (d *Deployer) TriggerDeploy(ctx context.Context, projectID, stageID, imageT
// Non-fatal: continue with deploy.
}
// Execute the deploy pipeline. Track state for rollback.
containerID, npmProxyID, instanceID, deployErr := d.executeDeploy(ctx, project, stage, settings, deploy.ID, imageTag)
// Choose deploy strategy: blue-green if stage has max_instances=1 and an existing instance.
var containerID string
var npmProxyID int
var instanceID string
var deployErr error
if stage.MaxInstances == 1 {
containerID, npmProxyID, instanceID, deployErr = d.blueGreenDeploy(ctx, project, stage, settings, deploy.ID, imageTag)
} else {
// Execute the standard deploy pipeline. Track state for rollback.
containerID, npmProxyID, instanceID, deployErr = d.executeDeploy(ctx, project, stage, settings, deploy.ID, imageTag)
}
if deployErr != nil {
d.logDeploy(deploy.ID, fmt.Sprintf("Deploy failed: %v", deployErr), "error")
@@ -116,7 +158,7 @@ func (d *Deployer) TriggerDeploy(ctx context.Context, projectID, stageID, imageT
// Mark deploy as successful.
if err := d.store.UpdateDeployStatus(deploy.ID, "success", ""); err != nil {
log.Printf("deployer: update deploy status to success: %v", err)
slog.Warn("update deploy status to success", "error", err)
}
d.publishDeployStatus(deploy.ID, projectID, stageID, imageTag, "success", "")
@@ -153,7 +195,7 @@ func (d *Deployer) executeDeploy(
// Step 1: Pull image.
if err := d.store.UpdateDeployStatus(deployID, "pulling", ""); err != nil {
log.Printf("deployer: update deploy status: %v", err)
slog.Warn("update deploy status", "error", err)
}
d.publishDeployStatus(deployID, project.ID, stage.ID, imageTag, "pulling", "")
d.logDeploy(deployID, fmt.Sprintf("Pulling image %s:%s", project.Image, imageTag), "info")
@@ -177,7 +219,7 @@ func (d *Deployer) executeDeploy(
// Step 3: Create and start container.
if err := d.store.UpdateDeployStatus(deployID, "starting", ""); err != nil {
log.Printf("deployer: update deploy status: %v", err)
slog.Warn("update deploy status", "error", err)
}
d.publishDeployStatus(deployID, project.ID, stage.ID, imageTag, "starting", "")
@@ -226,7 +268,7 @@ func (d *Deployer) executeDeploy(
// Link deploy to instance.
if err := d.store.SetDeployInstanceID(deployID, instanceID); err != nil {
log.Printf("deployer: link deploy to instance: %v", err)
slog.Warn("link deploy to instance", "error", err)
}
d.logDeploy(deployID, fmt.Sprintf("Starting container %s", containerName), "info")
@@ -235,14 +277,14 @@ func (d *Deployer) executeDeploy(
}
if err := d.store.UpdateInstanceStatus(instanceID, "running"); err != nil {
log.Printf("deployer: update instance status to running: %v", err)
slog.Warn("update instance status to running", "error", err)
}
d.publishInstanceStatus(instanceID, project.ID, stage.ID, "running")
d.logDeploy(deployID, "Container started", "info")
// Step 4: Configure NPM proxy.
if err := d.store.UpdateDeployStatus(deployID, "configuring_proxy", ""); err != nil {
log.Printf("deployer: update deploy status: %v", err)
slog.Warn("update deploy status", "error", err)
}
d.publishDeployStatus(deployID, project.ID, stage.ID, imageTag, "configuring_proxy", "")
@@ -255,13 +297,13 @@ func (d *Deployer) executeDeploy(
inst.NpmProxyID = npmProxyID
inst.Subdomain = subdomain
if err := d.store.UpdateInstance(inst); err != nil {
log.Printf("deployer: update instance with proxy ID: %v", err)
slog.Warn("update instance with proxy ID", "error", err)
}
// Step 5: Health check.
if project.Healthcheck != "" {
if err := d.store.UpdateDeployStatus(deployID, "health_checking", ""); err != nil {
log.Printf("deployer: update deploy status: %v", err)
slog.Warn("update deploy status", "error", err)
}
d.publishDeployStatus(deployID, project.ID, stage.ID, imageTag, "health_checking", "")
@@ -390,13 +432,13 @@ func (d *Deployer) enforceMaxInstances(ctx context.Context, stage store.Stage, d
func (d *Deployer) removeInstance(ctx context.Context, inst store.Instance, settings store.Settings) error {
// Mark as removing.
if err := d.store.UpdateInstanceStatus(inst.ID, "removing"); err != nil {
log.Printf("deployer: update instance %s status to removing: %v", inst.ID, err)
slog.Warn("update instance status to removing", "instance_id", inst.ID, "error", err)
}
// Remove Docker container.
if inst.ContainerID != "" {
if err := d.docker.RemoveContainer(ctx, inst.ContainerID, true); err != nil {
log.Printf("deployer: remove container %s: %v", inst.ContainerID, err)
slog.Warn("remove container", "container_id", inst.ContainerID, "error", err)
}
}
@@ -404,11 +446,11 @@ func (d *Deployer) removeInstance(ctx context.Context, inst store.Instance, sett
if inst.NpmProxyID > 0 {
npmPassword, err := d.decryptNpmPassword(settings.NpmPassword)
if err != nil {
log.Printf("deployer: decrypt npm password for proxy cleanup: %v", err)
slog.Warn("decrypt npm password for proxy cleanup", "error", err)
} else if authErr := d.npm.Authenticate(ctx, settings.NpmEmail, npmPassword); authErr != nil {
log.Printf("deployer: authenticate npm for proxy cleanup: %v", authErr)
slog.Warn("authenticate npm for proxy cleanup", "error", authErr)
} else if delErr := d.npm.DeleteProxyHost(ctx, inst.NpmProxyID); delErr != nil {
log.Printf("deployer: delete proxy host %d: %v", inst.NpmProxyID, delErr)
slog.Warn("delete proxy host", "proxy_id", inst.NpmProxyID, "error", delErr)
}
}
@@ -471,7 +513,7 @@ func (d *Deployer) parseEnvVars(envJSON string) []string {
var envMap map[string]string
if err := json.Unmarshal([]byte(envJSON), &envMap); err != nil {
log.Printf("deployer: parse env vars: %v", err)
slog.Warn("parse env vars", "error", err)
return nil
}
@@ -486,7 +528,7 @@ func (d *Deployer) parseEnvVars(envJSON string) []string {
// Errors are logged to stderr but not propagated.
func (d *Deployer) logDeploy(deployID, message, level string) {
if err := d.store.AppendDeployLog(deployID, message, level); err != nil {
log.Printf("deployer: append deploy log: %v", err)
slog.Warn("append deploy log", "error", err)
}
if d.eventBus != nil {
d.eventBus.Publish(events.Event{