package deployer

import (
	"context"
	"encoding/json"
	"fmt"
	"log/slog"
	"path/filepath"
	"sort"
	"sync"
	"sync/atomic"

	"github.com/alexei/docker-watcher/internal/crypto"
	"github.com/alexei/docker-watcher/internal/docker"
	"github.com/alexei/docker-watcher/internal/events"
	"github.com/alexei/docker-watcher/internal/health"
	"github.com/alexei/docker-watcher/internal/notify"
	"github.com/alexei/docker-watcher/internal/npm"
	"github.com/alexei/docker-watcher/internal/store"
	"github.com/moby/moby/api/types/mount"
	"github.com/google/uuid"
)

// Deployer drives a deployment from start to finish: image pull, container
// creation and start, proxy configuration, health check, and rollback when a
// step fails.
// It implements both webhook.DeployTriggerer and registry.DeployTriggerer.
type Deployer struct {
	docker   *docker.Client
	npm      *npm.Client
	store    *store.Store
	health   *health.Checker
	notifier *notify.Notifier
	eventBus EventPublisher
	encKey   [32]byte

	// Graceful shutdown bookkeeping: activeWg counts deploys that are still
	// running, shuttingDown is flipped by Drain so new deploys are refused.
	activeWg     sync.WaitGroup
	shuttingDown atomic.Bool
}

// EventPublisher is the minimal event-bus surface the Deployer needs:
// something that accepts an event for publication.
type EventPublisher interface {
	Publish(evt events.Event)
}

// New wires a Deployer from its collaborators. The shutdown-tracking fields
// are left at their zero values, which are ready to use.
func New(
	dockerClient *docker.Client,
	npmClient *npm.Client,
	st *store.Store,
	checker *health.Checker,
	notifier *notify.Notifier,
	eventBus EventPublisher,
	encKey [32]byte,
) *Deployer {
	dep := new(Deployer)
	dep.docker = dockerClient
	dep.npm = npmClient
	dep.store = st
	dep.health = checker
	dep.notifier = notifier
	dep.eventBus = eventBus
	dep.encKey = encKey
	return dep
}

// Drain marks the deployer as shutting down and then blocks until every
// deploy tracked by activeWg has finished. Intended to be called once during
// graceful shutdown.
func (d *Deployer) Drain() {
	// Flip the flag before waiting so no further deploys are accepted while
	// the in-flight ones wind down.
	d.shuttingDown.Store(true)

	slog.Info("deployer: draining in-progress deploys")
	d.activeWg.Wait()
	slog.Info("deployer: all deploys drained")
}

// AsyncTriggerDeploy creates a deploy record and returns the deploy ID immediately,
// then runs the full deploy pipeline in a background goroutine.
Use this from HTTP handlers // to avoid blocking the request. Progress is streamed via SSE. func (d *Deployer) AsyncTriggerDeploy(ctx context.Context, projectID, stageID, imageTag string) (string, error) { if d.shuttingDown.Load() { return "", fmt.Errorf("deployer is shutting down, rejecting new deploy") } // Validate inputs synchronously so the caller gets immediate feedback. project, err := d.store.GetProjectByID(projectID) if err != nil { return "", fmt.Errorf("get project: %w", err) } stage, err := d.store.GetStageByID(stageID) if err != nil { return "", fmt.Errorf("get stage: %w", err) } if err := d.validatePromoteFrom(stage, imageTag); err != nil { return "", fmt.Errorf("promote validation: %w", err) } // Create deploy record synchronously so caller gets the ID. deploy, err := d.store.CreateDeploy(store.Deploy{ ProjectID: projectID, StageID: stageID, ImageTag: imageTag, Status: "pending", }) if err != nil { return "", fmt.Errorf("create deploy record: %w", err) } // Run the actual deploy in the background. d.activeWg.Add(1) go func() { defer d.activeWg.Done() // Use a detached context so client disconnect doesn't abort the deploy. bgCtx := context.Background() if err := d.runDeploy(bgCtx, project, stage, deploy.ID, imageTag); err != nil { slog.Error("async deploy failed", "deploy_id", deploy.ID, "error", err) } }() return deploy.ID, nil } // runDeploy is the internal deploy pipeline used by AsyncTriggerDeploy. // It assumes the deploy record already exists and project/stage are validated. 
func (d *Deployer) runDeploy(ctx context.Context, project store.Project, stage store.Stage, deployID string, imageTag string) error { settings, err := d.store.GetSettings() if err != nil { if updateErr := d.store.UpdateDeployStatus(deployID, "failed", err.Error()); updateErr != nil { slog.Warn("update deploy status", "error", updateErr) } return fmt.Errorf("get settings: %w", err) } slog.Info("starting deploy", "deploy_id", deployID, "project", project.Name, "stage", stage.Name, "tag", imageTag, ) d.logDeploy(deployID, fmt.Sprintf("Starting deploy of %s:%s for project %s, stage %s", project.Image, imageTag, project.Name, stage.Name), "info") // Enforce max_instances before deploying. if err := d.enforceMaxInstances(ctx, stage, deployID, settings); err != nil { d.logDeploy(deployID, fmt.Sprintf("Failed to enforce max instances: %v", err), "error") } var containerID string var npmProxyID int var instanceID string var deployErr error if stage.MaxInstances == 1 { containerID, npmProxyID, instanceID, deployErr = d.blueGreenDeploy(ctx, project, stage, settings, deployID, imageTag) } else { containerID, npmProxyID, instanceID, deployErr = d.executeDeploy(ctx, project, stage, settings, deployID, imageTag) } if deployErr != nil { d.logDeploy(deployID, fmt.Sprintf("Deploy failed: %v", deployErr), "error") d.publishDeployStatus(deployID, project.ID, stage.ID, imageTag, "failed", deployErr.Error()) d.rollback(ctx, deployID, containerID, npmProxyID, instanceID) d.notifier.Send(settings.NotificationURL, notify.Event{ Type: "deploy_failure", Project: project.Name, Stage: stage.Name, ImageTag: imageTag, Error: deployErr.Error(), }) return fmt.Errorf("deploy failed: %w", deployErr) } if err := d.store.UpdateDeployStatus(deployID, "success", ""); err != nil { slog.Warn("update deploy status to success", "error", err) } d.publishDeployStatus(deployID, project.ID, stage.ID, imageTag, "success", "") subdomain := d.buildSubdomain(project, stage, settings, imageTag) fullURL := 
fmt.Sprintf("https://%s.%s", subdomain, settings.Domain) d.logDeploy(deployID, fmt.Sprintf("Deploy successful: %s", fullURL), "info") d.notifier.Send(settings.NotificationURL, notify.Event{ Type: "deploy_success", Project: project.Name, Stage: stage.Name, ImageTag: imageTag, Subdomain: subdomain, URL: fullURL, }) return nil } // TriggerDeploy is the synchronous entry point for deployments (used by poller and webhook). // It validates inputs, creates a deploy record, and delegates to runDeploy. func (d *Deployer) TriggerDeploy(ctx context.Context, projectID, stageID, imageTag string) error { if d.shuttingDown.Load() { return fmt.Errorf("deployer is shutting down, rejecting new deploy") } d.activeWg.Add(1) defer d.activeWg.Done() project, err := d.store.GetProjectByID(projectID) if err != nil { return fmt.Errorf("get project: %w", err) } stage, err := d.store.GetStageByID(stageID) if err != nil { return fmt.Errorf("get stage: %w", err) } if err := d.validatePromoteFrom(stage, imageTag); err != nil { return fmt.Errorf("promote validation: %w", err) } deploy, err := d.store.CreateDeploy(store.Deploy{ ProjectID: projectID, StageID: stageID, ImageTag: imageTag, Status: "pending", }) if err != nil { return fmt.Errorf("create deploy record: %w", err) } if err := d.runDeploy(ctx, project, stage, deploy.ID, imageTag); err != nil { return err } return nil } // executeDeploy runs the deploy pipeline steps and returns rollback-relevant state. // It returns (containerID, npmProxyID, instanceID, error). func (d *Deployer) executeDeploy( ctx context.Context, project store.Project, stage store.Stage, settings store.Settings, deployID string, imageTag string, ) (string, int, string, error) { var containerID string var npmProxyID int var instanceID string // Step 1: Pull image. 
if err := d.store.UpdateDeployStatus(deployID, "pulling", ""); err != nil { slog.Warn("update deploy status", "error", err) } d.publishDeployStatus(deployID, project.ID, stage.ID, imageTag, "pulling", "") d.logDeploy(deployID, fmt.Sprintf("Pulling image %s:%s", project.Image, imageTag), "info") authConfig, err := d.buildRegistryAuth(project) if err != nil { return containerID, npmProxyID, instanceID, fmt.Errorf("build registry auth: %w", err) } if err := d.docker.PullImage(ctx, project.Image, imageTag, authConfig); err != nil { return containerID, npmProxyID, instanceID, fmt.Errorf("pull image: %w", err) } d.logDeploy(deployID, "Image pulled successfully", "info") // Step 2: Ensure network exists. networkID, err := d.docker.EnsureNetwork(ctx, settings.Network) if err != nil { return containerID, npmProxyID, instanceID, fmt.Errorf("ensure network: %w", err) } d.logDeploy(deployID, fmt.Sprintf("Network %s ready (ID: %s)", settings.Network, truncateID(networkID)), "info") // Step 3: Create and start container. if err := d.store.UpdateDeployStatus(deployID, "starting", ""); err != nil { slog.Warn("update deploy status", "error", err) } d.publishDeployStatus(deployID, project.ID, stage.ID, imageTag, "starting", "") // Pre-generate instance ID so it can be set as a container label. 
instanceID = uuid.New().String() subdomain := d.buildSubdomain(project, stage, settings, imageTag) containerName := docker.ContainerName(project.Name, stage.Name, imageTag) portStr := fmt.Sprintf("%d/tcp", project.Port) envVars := d.mergeEnvVars(project, stage.ID) mounts := d.computeVolumeMounts(project.ID, stage.Name, imageTag, settings.BaseVolumePath) containerCfg := docker.ContainerConfig{ Name: containerName, Image: project.Image + ":" + imageTag, Env: envVars, ExposedPorts: []string{portStr}, NetworkName: settings.Network, NetworkID: networkID, Project: project.Name, Stage: stage.Name, InstanceID: instanceID, Mounts: mounts, } d.logDeploy(deployID, fmt.Sprintf("Creating container %s", containerName), "info") containerID, err = d.docker.CreateContainer(ctx, containerCfg) if err != nil { return containerID, npmProxyID, instanceID, fmt.Errorf("create container: %w", err) } d.logDeploy(deployID, fmt.Sprintf("Container created (ID: %s)", truncateID(containerID)), "info") // Create instance record in store with the pre-generated ID. inst, err := d.store.CreateInstanceWithID(store.Instance{ ID: instanceID, StageID: stage.ID, ProjectID: project.ID, ContainerID: containerID, ImageTag: imageTag, Subdomain: subdomain, Status: "stopped", Port: project.Port, }) if err != nil { return containerID, npmProxyID, instanceID, fmt.Errorf("create instance record: %w", err) } instanceID = inst.ID // Link deploy to instance. 
if err := d.store.SetDeployInstanceID(deployID, instanceID); err != nil { slog.Warn("link deploy to instance", "error", err) } d.logDeploy(deployID, fmt.Sprintf("Starting container %s", containerName), "info") if err := d.docker.StartContainer(ctx, containerID); err != nil { return containerID, npmProxyID, instanceID, fmt.Errorf("start container: %w", err) } if err := d.store.UpdateInstanceStatus(instanceID, "running"); err != nil { slog.Warn("update instance status to running", "error", err) } d.publishInstanceStatus(instanceID, project.ID, stage.ID, "running") d.logDeploy(deployID, "Container started", "info") // Step 4: Configure NPM proxy. if err := d.store.UpdateDeployStatus(deployID, "configuring_proxy", ""); err != nil { slog.Warn("update deploy status", "error", err) } d.publishDeployStatus(deployID, project.ID, stage.ID, imageTag, "configuring_proxy", "") npmProxyID, err = d.configureProxy(ctx, deployID, settings, containerName, project.Port, subdomain) if err != nil { return containerID, npmProxyID, instanceID, fmt.Errorf("configure proxy: %w", err) } // Update instance with NPM proxy ID. inst.NpmProxyID = npmProxyID inst.Subdomain = subdomain if err := d.store.UpdateInstance(inst); err != nil { slog.Warn("update instance with proxy ID", "error", err) } // Step 5: Health check. 
if project.Healthcheck != "" { if err := d.store.UpdateDeployStatus(deployID, "health_checking", ""); err != nil { slog.Warn("update deploy status", "error", err) } d.publishDeployStatus(deployID, project.ID, stage.ID, imageTag, "health_checking", "") healthURL := fmt.Sprintf("http://%s:%d%s", containerName, project.Port, project.Healthcheck) d.logDeploy(deployID, fmt.Sprintf("Running health check: %s", healthURL), "info") if err := d.health.Check(ctx, healthURL); err != nil { return containerID, npmProxyID, instanceID, fmt.Errorf("health check: %w", err) } d.logDeploy(deployID, "Health check passed", "info") } else { d.logDeploy(deployID, "No health check configured, skipping", "info") } return containerID, npmProxyID, instanceID, nil } // configureProxy creates or updates an NPM proxy host for the deployed container. // It authenticates to NPM using credentials from settings, then creates the proxy. // Returns the NPM proxy host ID. func (d *Deployer) configureProxy( ctx context.Context, deployID string, settings store.Settings, containerName string, containerPort int, subdomain string, ) (int, error) { // Authenticate to NPM. npmPassword, err := d.decryptNpmPassword(settings.NpmPassword) if err != nil { return 0, fmt.Errorf("decrypt npm password: %w", err) } if err := d.npm.Authenticate(ctx, settings.NpmEmail, npmPassword); err != nil { return 0, fmt.Errorf("authenticate to npm: %w", err) } fqdn := subdomain + "." + settings.Domain d.logDeploy(deployID, fmt.Sprintf("Configuring proxy: %s -> %s:%d", fqdn, containerName, containerPort), "info") // Check if a proxy host already exists for this domain. 
existing, found, err := d.npm.FindProxyHostByDomain(ctx, fqdn) if err != nil { return 0, fmt.Errorf("find existing proxy host: %w", err) } proxyConfig := npm.ProxyHostConfig{ DomainNames: []string{fqdn}, ForwardScheme: "http", ForwardHost: containerName, ForwardPort: containerPort, BlockExploits: true, AllowWebsocket: true, HTTP2Support: true, Meta: npm.Meta{}, Locations: []any{}, } if found { d.logDeploy(deployID, fmt.Sprintf("Updating existing proxy host %d for %s", existing.ID, fqdn), "info") host, err := d.npm.UpdateProxyHost(ctx, existing.ID, proxyConfig) if err != nil { return 0, fmt.Errorf("update proxy host: %w", err) } d.logDeploy(deployID, "Proxy host updated", "info") return host.ID, nil } d.logDeploy(deployID, fmt.Sprintf("Creating new proxy host for %s", fqdn), "info") host, err := d.npm.CreateProxyHost(ctx, proxyConfig) if err != nil { return 0, fmt.Errorf("create proxy host: %w", err) } d.logDeploy(deployID, fmt.Sprintf("Proxy host created (ID: %d)", host.ID), "info") return host.ID, nil } // enforceMaxInstances removes the oldest instances when the stage has reached its limit. // This makes room for the new deployment. func (d *Deployer) enforceMaxInstances(ctx context.Context, stage store.Stage, deployID string, settings store.Settings) error { if stage.MaxInstances <= 0 { return nil } instances, err := d.store.GetInstancesByStageID(stage.ID) if err != nil { return fmt.Errorf("get instances for stage: %w", err) } // Filter to running/stopped instances (not already failed/removing). var active []store.Instance for _, inst := range instances { if inst.Status == "running" || inst.Status == "stopped" { active = append(active, inst) } } // We need room for one more instance, so remove oldest when at limit. removeCount := len(active) - stage.MaxInstances + 1 if removeCount <= 0 { return nil } // Sort by created_at ascending (oldest first). 
sort.Slice(active, func(i, j int) bool { return active[i].CreatedAt < active[j].CreatedAt }) for i := 0; i < removeCount && i < len(active); i++ { inst := active[i] d.logDeploy(deployID, fmt.Sprintf("Removing oldest instance %s (tag: %s) to enforce max_instances=%d", inst.ID, inst.ImageTag, stage.MaxInstances), "info") if err := d.removeInstance(ctx, inst, settings); err != nil { d.logDeploy(deployID, fmt.Sprintf("Failed to remove instance %s: %v", inst.ID, err), "warn") continue } d.logDeploy(deployID, fmt.Sprintf("Removed instance %s", inst.ID), "info") } return nil } // removeInstance stops and removes a container, deletes its NPM proxy host, // and removes the instance record from the store. func (d *Deployer) removeInstance(ctx context.Context, inst store.Instance, settings store.Settings) error { // Mark as removing. if err := d.store.UpdateInstanceStatus(inst.ID, "removing"); err != nil { slog.Warn("update instance status to removing", "instance_id", inst.ID, "error", err) } // Remove Docker container. if inst.ContainerID != "" { if err := d.docker.RemoveContainer(ctx, inst.ContainerID, true); err != nil { slog.Warn("remove container", "container_id", inst.ContainerID, "error", err) } } // Delete NPM proxy host. if inst.NpmProxyID > 0 { npmPassword, err := d.decryptNpmPassword(settings.NpmPassword) if err != nil { slog.Warn("decrypt npm password for proxy cleanup", "error", err) } else if authErr := d.npm.Authenticate(ctx, settings.NpmEmail, npmPassword); authErr != nil { slog.Warn("authenticate npm for proxy cleanup", "error", authErr) } else if delErr := d.npm.DeleteProxyHost(ctx, inst.NpmProxyID); delErr != nil { slog.Warn("delete proxy host", "proxy_id", inst.NpmProxyID, "error", delErr) } } // Delete instance record. if err := d.store.DeleteInstance(inst.ID); err != nil { return fmt.Errorf("delete instance record: %w", err) } return nil } // buildSubdomain generates the subdomain for an instance based on settings and stage config. 
func (d *Deployer) buildSubdomain(project store.Project, stage store.Stage, settings store.Settings, imageTag string) string { return GenerateTaggedSubdomain(settings.SubdomainPattern, project.Name, stage.Name, imageTag, stage.Subdomain) } // buildRegistryAuth constructs the Docker registry auth string for pulling images. // If the project has a registry configured, it looks up the registry token. func (d *Deployer) buildRegistryAuth(project store.Project) (string, error) { if project.Registry == "" { return "", nil } reg, err := d.store.GetRegistryByName(project.Registry) if err != nil { return "", fmt.Errorf("get registry %s: %w", project.Registry, err) } if reg.Token != "" { decrypted, err := crypto.Decrypt(d.encKey, reg.Token) if err != nil { return "", fmt.Errorf("decrypt registry token: %w", err) } return docker.EncodeRegistryAuth(decrypted, decrypted, reg.URL) } return "", nil } // decryptNpmPassword decrypts the NPM password from settings. // Returns empty string if the encrypted password is empty. func (d *Deployer) decryptNpmPassword(encryptedPassword string) (string, error) { if encryptedPassword == "" { return "", nil } return crypto.Decrypt(d.encKey, encryptedPassword) } // mergeEnvVars builds the final environment variable list for a container: // 1. Parse project-level env JSON // 2. Overlay with stage-level env overrides (stage wins on key conflict) // 3. Decrypt any encrypted (secret) values // Returns a []string of KEY=VALUE pairs. func (d *Deployer) mergeEnvVars(project store.Project, stageID string) []string { // Step 1: Parse project-level env. envMap := make(map[string]string) if project.Env != "" && project.Env != "{}" { var projectEnv map[string]string if err := json.Unmarshal([]byte(project.Env), &projectEnv); err != nil { slog.Warn("parse project env vars", "error", err) } else { for k, v := range projectEnv { envMap[k] = v } } } // Step 2: Overlay with stage-level overrides. 
stageEnvs, err := d.store.GetStageEnvByStageID(stageID) if err != nil { slog.Warn("get stage env overrides", "stage_id", stageID, "error", err) } else { for _, se := range stageEnvs { value := se.Value if se.Encrypted { // Step 3: Decrypt secret values. decrypted, err := crypto.Decrypt(d.encKey, se.Value) if err != nil { slog.Warn("decrypt stage env value", "key", se.Key, "error", err) continue } value = decrypted } envMap[se.Key] = value } } vars := make([]string, 0, len(envMap)) for k, v := range envMap { vars = append(vars, k+"="+v) } return vars } // computeVolumeMounts builds Docker mount specifications from the project's volume config. // For shared mode, source is used as-is. // For isolated mode, source gets /{stage}-{tag}/ appended. func (d *Deployer) computeVolumeMounts(projectID, stageName, imageTag, basePath string) []mount.Mount { vols, err := d.store.GetVolumesByProjectID(projectID) if err != nil { slog.Warn("get project volumes", "project_id", projectID, "error", err) return nil } if len(vols) == 0 { return nil } mounts := make([]mount.Mount, 0, len(vols)) for _, vol := range vols { source := vol.Source // Prepend base path if source is relative (doesn't start with /). if basePath != "" && !filepath.IsAbs(source) { source = filepath.Join(basePath, source) } if vol.Mode == "isolated" { source = filepath.Join(source, fmt.Sprintf("%s-%s", stageName, imageTag)) } mounts = append(mounts, mount.Mount{ Type: mount.TypeBind, Source: source, Target: vol.Target, }) } return mounts } // logDeploy appends a log entry for a deploy and publishes it on the event bus. // Errors are logged to stderr but not propagated. 
func (d *Deployer) logDeploy(deployID, message, level string) { if err := d.store.AppendDeployLog(deployID, message, level); err != nil { slog.Warn("append deploy log", "error", err) } if d.eventBus != nil { d.eventBus.Publish(events.Event{ Type: events.EventDeployLog, Payload: events.DeployLogPayload{ DeployID: deployID, Message: message, Level: level, }, }) } } // publishDeployStatus publishes a deploy status change event on the bus. func (d *Deployer) publishDeployStatus(deployID, projectID, stageID, imageTag, status, deployErr string) { if d.eventBus != nil { d.eventBus.Publish(events.Event{ Type: events.EventDeployStatus, Payload: events.DeployStatusPayload{ DeployID: deployID, ProjectID: projectID, StageID: stageID, ImageTag: imageTag, Status: status, Error: deployErr, }, }) } } // publishInstanceStatus publishes an instance status change event on the bus. func (d *Deployer) publishInstanceStatus(instanceID, projectID, stageID, status string) { if d.eventBus != nil { d.eventBus.Publish(events.Event{ Type: events.EventInstanceStatus, Payload: events.InstanceStatusPayload{ InstanceID: instanceID, ProjectID: projectID, StageID: stageID, Status: status, }, }) } } // truncateID safely truncates a Docker ID to 12 characters for display. func truncateID(id string) string { if len(id) > 12 { return id[:12] } return id }