feat(docker-watcher): phase 12 - hardening
Blue-green zero-downtime deploys, promote flow validation. Dual auth: local (bcrypt + JWT) and OAuth2/OIDC (any provider). Auth middleware, login page, auth settings UI. Structured logging (slog JSON), config export to YAML. Graceful shutdown with deploy draining. Multi-stage Dockerfile and production docker-compose.yml. Swap phase order: Volumes & Env before UI Polish.
This commit is contained in:
@@ -4,8 +4,10 @@ import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"log"
|
||||
"log/slog"
|
||||
"sort"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
|
||||
"github.com/alexei/docker-watcher/internal/crypto"
|
||||
"github.com/alexei/docker-watcher/internal/docker"
|
||||
@@ -28,6 +30,10 @@ type Deployer struct {
|
||||
notifier *notify.Notifier
|
||||
eventBus EventPublisher
|
||||
encKey [32]byte
|
||||
|
||||
// Graceful shutdown: tracks in-progress deploys.
|
||||
activeWg sync.WaitGroup
|
||||
shuttingDown atomic.Bool
|
||||
}
|
||||
|
||||
// EventPublisher is the interface for publishing events to the event bus.
|
||||
@@ -56,10 +62,25 @@ func New(
|
||||
}
|
||||
}
|
||||
|
||||
// Drain waits for all in-progress deploys to complete. Call this during graceful shutdown.
|
||||
func (d *Deployer) Drain() {
|
||||
d.shuttingDown.Store(true)
|
||||
slog.Info("deployer: draining in-progress deploys")
|
||||
d.activeWg.Wait()
|
||||
slog.Info("deployer: all deploys drained")
|
||||
}
|
||||
|
||||
// TriggerDeploy is the main entry point for deployments. It orchestrates the full flow:
|
||||
// pull image -> create container -> start -> configure proxy -> health check.
|
||||
// On failure, it rolls back (removes container, deletes proxy host, updates status).
|
||||
func (d *Deployer) TriggerDeploy(ctx context.Context, projectID, stageID, imageTag string) error {
|
||||
if d.shuttingDown.Load() {
|
||||
return fmt.Errorf("deployer is shutting down, rejecting new deploy")
|
||||
}
|
||||
|
||||
d.activeWg.Add(1)
|
||||
defer d.activeWg.Done()
|
||||
|
||||
// Load project and stage from store.
|
||||
project, err := d.store.GetProjectByID(projectID)
|
||||
if err != nil {
|
||||
@@ -71,6 +92,11 @@ func (d *Deployer) TriggerDeploy(ctx context.Context, projectID, stageID, imageT
|
||||
return fmt.Errorf("get stage: %w", err)
|
||||
}
|
||||
|
||||
// Validate promote_from constraint.
|
||||
if err := d.validatePromoteFrom(stage, imageTag); err != nil {
|
||||
return fmt.Errorf("promote validation: %w", err)
|
||||
}
|
||||
|
||||
settings, err := d.store.GetSettings()
|
||||
if err != nil {
|
||||
return fmt.Errorf("get settings: %w", err)
|
||||
@@ -87,6 +113,12 @@ func (d *Deployer) TriggerDeploy(ctx context.Context, projectID, stageID, imageT
|
||||
return fmt.Errorf("create deploy record: %w", err)
|
||||
}
|
||||
|
||||
slog.Info("starting deploy",
|
||||
"deploy_id", deploy.ID,
|
||||
"project", project.Name,
|
||||
"stage", stage.Name,
|
||||
"tag", imageTag,
|
||||
)
|
||||
d.logDeploy(deploy.ID, fmt.Sprintf("Starting deploy of %s:%s for project %s, stage %s", project.Image, imageTag, project.Name, stage.Name), "info")
|
||||
|
||||
// Enforce max_instances before deploying.
|
||||
@@ -95,8 +127,18 @@ func (d *Deployer) TriggerDeploy(ctx context.Context, projectID, stageID, imageT
|
||||
// Non-fatal: continue with deploy.
|
||||
}
|
||||
|
||||
// Execute the deploy pipeline. Track state for rollback.
|
||||
containerID, npmProxyID, instanceID, deployErr := d.executeDeploy(ctx, project, stage, settings, deploy.ID, imageTag)
|
||||
// Choose deploy strategy: blue-green if stage has max_instances=1 and an existing instance.
|
||||
var containerID string
|
||||
var npmProxyID int
|
||||
var instanceID string
|
||||
var deployErr error
|
||||
|
||||
if stage.MaxInstances == 1 {
|
||||
containerID, npmProxyID, instanceID, deployErr = d.blueGreenDeploy(ctx, project, stage, settings, deploy.ID, imageTag)
|
||||
} else {
|
||||
// Execute the standard deploy pipeline. Track state for rollback.
|
||||
containerID, npmProxyID, instanceID, deployErr = d.executeDeploy(ctx, project, stage, settings, deploy.ID, imageTag)
|
||||
}
|
||||
|
||||
if deployErr != nil {
|
||||
d.logDeploy(deploy.ID, fmt.Sprintf("Deploy failed: %v", deployErr), "error")
|
||||
@@ -116,7 +158,7 @@ func (d *Deployer) TriggerDeploy(ctx context.Context, projectID, stageID, imageT
|
||||
|
||||
// Mark deploy as successful.
|
||||
if err := d.store.UpdateDeployStatus(deploy.ID, "success", ""); err != nil {
|
||||
log.Printf("deployer: update deploy status to success: %v", err)
|
||||
slog.Warn("update deploy status to success", "error", err)
|
||||
}
|
||||
d.publishDeployStatus(deploy.ID, projectID, stageID, imageTag, "success", "")
|
||||
|
||||
@@ -153,7 +195,7 @@ func (d *Deployer) executeDeploy(
|
||||
|
||||
// Step 1: Pull image.
|
||||
if err := d.store.UpdateDeployStatus(deployID, "pulling", ""); err != nil {
|
||||
log.Printf("deployer: update deploy status: %v", err)
|
||||
slog.Warn("update deploy status", "error", err)
|
||||
}
|
||||
d.publishDeployStatus(deployID, project.ID, stage.ID, imageTag, "pulling", "")
|
||||
d.logDeploy(deployID, fmt.Sprintf("Pulling image %s:%s", project.Image, imageTag), "info")
|
||||
@@ -177,7 +219,7 @@ func (d *Deployer) executeDeploy(
|
||||
|
||||
// Step 3: Create and start container.
|
||||
if err := d.store.UpdateDeployStatus(deployID, "starting", ""); err != nil {
|
||||
log.Printf("deployer: update deploy status: %v", err)
|
||||
slog.Warn("update deploy status", "error", err)
|
||||
}
|
||||
d.publishDeployStatus(deployID, project.ID, stage.ID, imageTag, "starting", "")
|
||||
|
||||
@@ -226,7 +268,7 @@ func (d *Deployer) executeDeploy(
|
||||
|
||||
// Link deploy to instance.
|
||||
if err := d.store.SetDeployInstanceID(deployID, instanceID); err != nil {
|
||||
log.Printf("deployer: link deploy to instance: %v", err)
|
||||
slog.Warn("link deploy to instance", "error", err)
|
||||
}
|
||||
|
||||
d.logDeploy(deployID, fmt.Sprintf("Starting container %s", containerName), "info")
|
||||
@@ -235,14 +277,14 @@ func (d *Deployer) executeDeploy(
|
||||
}
|
||||
|
||||
if err := d.store.UpdateInstanceStatus(instanceID, "running"); err != nil {
|
||||
log.Printf("deployer: update instance status to running: %v", err)
|
||||
slog.Warn("update instance status to running", "error", err)
|
||||
}
|
||||
d.publishInstanceStatus(instanceID, project.ID, stage.ID, "running")
|
||||
d.logDeploy(deployID, "Container started", "info")
|
||||
|
||||
// Step 4: Configure NPM proxy.
|
||||
if err := d.store.UpdateDeployStatus(deployID, "configuring_proxy", ""); err != nil {
|
||||
log.Printf("deployer: update deploy status: %v", err)
|
||||
slog.Warn("update deploy status", "error", err)
|
||||
}
|
||||
d.publishDeployStatus(deployID, project.ID, stage.ID, imageTag, "configuring_proxy", "")
|
||||
|
||||
@@ -255,13 +297,13 @@ func (d *Deployer) executeDeploy(
|
||||
inst.NpmProxyID = npmProxyID
|
||||
inst.Subdomain = subdomain
|
||||
if err := d.store.UpdateInstance(inst); err != nil {
|
||||
log.Printf("deployer: update instance with proxy ID: %v", err)
|
||||
slog.Warn("update instance with proxy ID", "error", err)
|
||||
}
|
||||
|
||||
// Step 5: Health check.
|
||||
if project.Healthcheck != "" {
|
||||
if err := d.store.UpdateDeployStatus(deployID, "health_checking", ""); err != nil {
|
||||
log.Printf("deployer: update deploy status: %v", err)
|
||||
slog.Warn("update deploy status", "error", err)
|
||||
}
|
||||
d.publishDeployStatus(deployID, project.ID, stage.ID, imageTag, "health_checking", "")
|
||||
|
||||
@@ -390,13 +432,13 @@ func (d *Deployer) enforceMaxInstances(ctx context.Context, stage store.Stage, d
|
||||
func (d *Deployer) removeInstance(ctx context.Context, inst store.Instance, settings store.Settings) error {
|
||||
// Mark as removing.
|
||||
if err := d.store.UpdateInstanceStatus(inst.ID, "removing"); err != nil {
|
||||
log.Printf("deployer: update instance %s status to removing: %v", inst.ID, err)
|
||||
slog.Warn("update instance status to removing", "instance_id", inst.ID, "error", err)
|
||||
}
|
||||
|
||||
// Remove Docker container.
|
||||
if inst.ContainerID != "" {
|
||||
if err := d.docker.RemoveContainer(ctx, inst.ContainerID, true); err != nil {
|
||||
log.Printf("deployer: remove container %s: %v", inst.ContainerID, err)
|
||||
slog.Warn("remove container", "container_id", inst.ContainerID, "error", err)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -404,11 +446,11 @@ func (d *Deployer) removeInstance(ctx context.Context, inst store.Instance, sett
|
||||
if inst.NpmProxyID > 0 {
|
||||
npmPassword, err := d.decryptNpmPassword(settings.NpmPassword)
|
||||
if err != nil {
|
||||
log.Printf("deployer: decrypt npm password for proxy cleanup: %v", err)
|
||||
slog.Warn("decrypt npm password for proxy cleanup", "error", err)
|
||||
} else if authErr := d.npm.Authenticate(ctx, settings.NpmEmail, npmPassword); authErr != nil {
|
||||
log.Printf("deployer: authenticate npm for proxy cleanup: %v", authErr)
|
||||
slog.Warn("authenticate npm for proxy cleanup", "error", authErr)
|
||||
} else if delErr := d.npm.DeleteProxyHost(ctx, inst.NpmProxyID); delErr != nil {
|
||||
log.Printf("deployer: delete proxy host %d: %v", inst.NpmProxyID, delErr)
|
||||
slog.Warn("delete proxy host", "proxy_id", inst.NpmProxyID, "error", delErr)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -471,7 +513,7 @@ func (d *Deployer) parseEnvVars(envJSON string) []string {
|
||||
|
||||
var envMap map[string]string
|
||||
if err := json.Unmarshal([]byte(envJSON), &envMap); err != nil {
|
||||
log.Printf("deployer: parse env vars: %v", err)
|
||||
slog.Warn("parse env vars", "error", err)
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -486,7 +528,7 @@ func (d *Deployer) parseEnvVars(envJSON string) []string {
|
||||
// Errors are logged to stderr but not propagated.
|
||||
func (d *Deployer) logDeploy(deployID, message, level string) {
|
||||
if err := d.store.AppendDeployLog(deployID, message, level); err != nil {
|
||||
log.Printf("deployer: append deploy log: %v", err)
|
||||
slog.Warn("append deploy log", "error", err)
|
||||
}
|
||||
if d.eventBus != nil {
|
||||
d.eventBus.Publish(events.Event{
|
||||
|
||||
Reference in New Issue
Block a user