package dockerfile import ( "context" "fmt" "io" "log/slog" "os" "strconv" "strings" "time" "github.com/alexei/tinyforge/internal/crypto" "github.com/alexei/tinyforge/internal/docker" "github.com/alexei/tinyforge/internal/events" "github.com/alexei/tinyforge/internal/notify" "github.com/alexei/tinyforge/internal/proxy" "github.com/alexei/tinyforge/internal/staticsite" "github.com/alexei/tinyforge/internal/store" "github.com/alexei/tinyforge/internal/workload/plugin" ) // healthCheckDelay is the grace window after StartContainer before we // probe IsContainerRunning. Mirrors the static plugin's window — short // enough not to slow happy-path deploys, long enough to catch // crash-on-boot failures (missing env, bad CMD, port conflict). const healthCheckDelay = 3 * time.Second // deploy runs one end-to-end sync of a dockerfile workload: // // 1. fetch the latest commit SHA from the configured git provider // 2. skip if SHA + container + proxy are all still healthy // 3. clone the repo into a temp dir // 4. resolve the build context + Dockerfile location // 5. `docker build -t -f ` // 6. recreate the container with the new image // 7. health-probe the container, surface logs on failure // 8. reconfigure the proxy route // 9. tear down the previous container (different ID) once we're sure // the new one is healthy and proxied // // Each step writes its own status update so the dashboard's runtime- // state panel can show a useful intermediate state when the deploy // stalls on the slow step (almost always the build). func deploy(ctx context.Context, deps plugin.Deps, w plugin.Workload, intent plugin.DeploymentIntent) (retErr error) { cfg, err := plugin.SourceConfigOf[Config](w) if err != nil { return fmt.Errorf("dockerfile source: decode config: %w", err) } prev, prevContainer, err := loadState(deps, w) if err != nil { return err } // Force a full rebuild on manual / promote / first-time deploys // (no Reason at all also implies manual). Schedule / git triggers // honour the unchanged-SHA short-circuit so cron polling does not // rebuild minute-by-minute when nothing changed. force := intent.Reason == "" || intent.Reason == "manual" || intent.Reason == "promote" // Decrypt the access token if present. Token never escapes this // frame: any error message routes through sanitizeError(_, token) // which redacts the literal substring. token := "" if cfg.AccessToken != "" { decrypted, derr := crypto.Decrypt(deps.EncKey, cfg.AccessToken) if derr != nil { slog.Warn("dockerfile source: failed to decrypt access token", "workload", w.Name, "error", derr) } else { token = decrypted } } provider, err := staticsite.NewGitProvider(staticsite.ProviderType(cfg.Provider), cfg.BaseURL, token) if err != nil { updateStatus(deps, w, "failed", prev.LastCommitSHA, sanitizeError(fmt.Sprintf("create provider: %v", err), token)) return fmt.Errorf("create provider: %w", err) } latestSHA, err := provider.GetLatestCommitSHA(ctx, cfg.RepoOwner, cfg.RepoName, cfg.Branch) if err != nil { updateStatus(deps, w, "failed", prev.LastCommitSHA, sanitizeError(fmt.Sprintf("fetch commit SHA: %v", err), token)) return fmt.Errorf("get latest commit: %w", err) } domain := primaryDomain(deps, w) // Commit-status reporter (best-effort; gated on cfg.ReportCommitStatus). // The deferred terminal report fires Success/Failure based on the // deploy's outcome, but ONLY once an actual build/deploy began // (deployStarted). The unchanged-SHA short-circuit below returns via // healUnchanged before that flips, so no status is reported when // nothing was built. retErr is the named return the defer inspects. reporter := staticsite.NewCommitStatusReporter(provider, cfg.RepoOwner, cfg.RepoName, latestSHA, statusTargetURL(domain), cfg.ReportCommitStatus) deployStarted := false defer func() { if !deployStarted { return } if retErr != nil { reporter.Report(ctx, w.Name, w.ID, staticsite.CommitStatusFailure, "Tinyforge: build failed") } else { reporter.Report(ctx, w.Name, w.ID, staticsite.CommitStatusSuccess, "Tinyforge: deployed") } }() prevContainerID := "" prevProxyRouteID := "" if prevContainer != nil { prevContainerID = prevContainer.ContainerID prevProxyRouteID = prevContainer.ProxyRouteID } // Short-circuit: SHA unchanged AND container is still running AND // (if there's a public face) the proxy route still exists. Manual // deploys skip this entirely. // // We deliberately do NOT gate this on prev.Status == "deployed". A // transient failure (e.g. a one-off proxy-check error) leaves the // persisted status as "failed"; if we required "deployed" here, every // subsequent cron/git poll with the same SHA would fall through to a // full clone + docker build despite a perfectly healthy running // container — a rebuild storm that burns CPU/disk until a new commit // lands. Instead we trust the live container/proxy state and heal the // stale status via healUnchanged. if !force && latestSHA == prev.LastCommitSHA && prevContainerID != "" { running, _ := deps.Docker.IsContainerRunning(ctx, prevContainerID) switch { case !running: slog.Info("dockerfile: container not running, forcing redeploy", "workload", w.Name) case domain != "": proxyOK, perr := deps.Proxy.RouteExists(ctx, domain) switch { case perr != nil: slog.Warn("dockerfile: proxy check failed, forcing redeploy", "workload", w.Name, "error", perr) case !proxyOK: slog.Info("dockerfile: proxy route missing, forcing redeploy", "workload", w.Name) default: return healUnchanged(deps, w, prev, latestSHA) } default: return healUnchanged(deps, w, prev, latestSHA) } } // From here on a deploy is genuinely underway, so the deferred terminal // status report should fire. Push a "pending" commit status (best- // effort) and arm the deferred Success/Failure report. updateStatus(deps, w, "syncing", prev.LastCommitSHA, "") publishEvent(deps, w, "syncing") deployStarted = true reporter.Report(ctx, w.Name, w.ID, staticsite.CommitStatusPending, "Tinyforge: deploying") // Clone the repo into a temp dir. We always download the entire // repo tree (folderPath = ""); a ContextPath subset is applied // at build time, not at download time, so a Dockerfile in // `./docker/Dockerfile` with `ContextPath=""` still works. cloneDir, err := os.MkdirTemp("", "tf-build-"+plugin.IDShort(w)+"-*") if err != nil { updateStatus(deps, w, "failed", prev.LastCommitSHA, sanitizeError(fmt.Sprintf("create clone dir: %v", err), token)) return fmt.Errorf("create clone dir: %w", err) } defer os.RemoveAll(cloneDir) if err := provider.DownloadFolder(ctx, cfg.RepoOwner, cfg.RepoName, cfg.Branch, "", cloneDir); err != nil { updateStatus(deps, w, "failed", prev.LastCommitSHA, sanitizeError(fmt.Sprintf("download repo: %v", err), token)) return fmt.Errorf("download repo: %w", err) } // Resolve the build context (with symlink-aware escape check) and // verify the Dockerfile is actually present before sending the // build off to the daemon. contextDir, err := resolveContextDir(cloneDir, cfg.ContextPath) if err != nil { updateStatus(deps, w, "failed", latestSHA, sanitizeError(fmt.Sprintf("resolve context: %v", err), token)) return fmt.Errorf("resolve context: %w", err) } if err := verifyDockerfileExists(contextDir, cfg.DockerfilePath); err != nil { updateStatus(deps, w, "failed", latestSHA, sanitizeError(err.Error(), token)) return err } imageTag := imageTagFor(w) updateStatus(deps, w, "building", latestSHA, "") publishEvent(deps, w, "building") // Bridge per-line build output onto the event bus so /api/events // subscribers (the dashboard's live tail) can show progress while // the daemon chugs. The bus is non-blocking — slow subscribers drop // events rather than backpressure the build — so this is safe to // call from the hot scan loop. logFn := func(line string) { publishBuildLog(deps, w, line) } if err := deps.Docker.BuildImageAt(ctx, contextDir, cfg.DockerfilePath, imageTag, logFn); err != nil { updateStatus(deps, w, "failed", latestSHA, sanitizeError(fmt.Sprintf("docker build: %v", err), token)) return fmt.Errorf("docker build: %w", err) } env := plugin.BuildWorkloadEnv(deps, w.ID, w.GroupID, "dockerfile source") containerPort := strconv.Itoa(cfg.Port) settings, err := deps.Store.GetSettings() if err != nil { updateStatus(deps, w, "failed", latestSHA, sanitizeError(fmt.Sprintf("get settings: %v", err), token)) return fmt.Errorf("get settings: %w", err) } networkName := settings.Network networkID, err := deps.Docker.EnsureNetwork(ctx, networkName) if err != nil { updateStatus(deps, w, "failed", latestSHA, sanitizeError(fmt.Sprintf("ensure network: %v", err), token)) return fmt.Errorf("ensure network: %w", err) } containerName := containerNameFor(w) // Per-face proxy labels (Traefik consumes these; NPM ignores them). labels := map[string]string{} if domain != "" { if l := deps.Proxy.ContainerLabels(domain, cfg.Port); l != nil { for k, v := range l { labels[k] = v } } } cc := docker.ContainerConfig{ Name: containerName, Image: imageTag, Env: env, ExposedPorts: []string{containerPort + "/tcp"}, NetworkName: networkName, NetworkID: networkID, Labels: labels, WorkloadID: w.ID, // Dockerfile workloads are tagged as "build" so the dashboard // and any filtered query can distinguish them from static sites // (which serve files) and image-source containers (which pull // pre-built images from a registry). WorkloadKind: string(store.WorkloadKindBuild), Role: "", } containerID, err := deps.Docker.CreateContainer(ctx, cc) if err != nil { // Name conflict — best-effort cleanup of any prior container // (by ID first; by name as a fallback) and one retry. if prevContainerID != "" { deps.Docker.StopContainer(ctx, prevContainerID, 10) deps.Docker.RemoveContainer(ctx, prevContainerID, true) } removeContainerByName(ctx, deps, containerName) containerID, err = deps.Docker.CreateContainer(ctx, cc) if err != nil { updateStatus(deps, w, "failed", latestSHA, sanitizeError(fmt.Sprintf("create container: %v", err), token)) return fmt.Errorf("create container: %w", err) } } if err := deps.Docker.StartContainer(ctx, containerID); err != nil { deps.Docker.RemoveContainer(ctx, containerID, true) updateStatus(deps, w, "failed", latestSHA, sanitizeError(fmt.Sprintf("start container: %v", err), token)) return fmt.Errorf("start container: %w", err) } // Brief health-check window — catch crash-on-boot. ctx-aware so a // cancelled deploy returns promptly. On failure surface the tail // of the container's logs as the error reason; that's almost // always what the operator needs to debug. select { case <-ctx.Done(): deps.Docker.RemoveContainer(ctx, containerID, true) updateStatus(deps, w, "failed", latestSHA, "deploy cancelled before health check") return ctx.Err() case <-time.After(healthCheckDelay): } running, runErr := deps.Docker.IsContainerRunning(ctx, containerID) if runErr != nil || !running { logMsg := "container exited immediately after start" if logs, logErr := deps.Docker.ContainerLogs(ctx, containerID, false, "40"); logErr == nil { buf, _ := io.ReadAll(logs) logs.Close() if len(buf) > 0 { // Pass `env` so any decrypted KEY=VALUE pair that the // container's startup output happens to echo (think // `RUN echo $DB_PASSWORD` in a debug Dockerfile) is // redacted before it lands in the operator-visible // last_error field. logMsg = sanitizeErrorWithSecrets(string(buf), token, env) } } deps.Docker.RemoveContainer(ctx, containerID, true) updateStatus(deps, w, "failed", latestSHA, logMsg) return fmt.Errorf("container not running: %s", logMsg) } // Resolve proxy target: in-network DNS by default, NPM-remote // override uses (settings.ServerIP, hostPort). forwardHost := containerName forwardPort := cfg.Port if settings.NpmRemote && settings.ProxyProvider == "npm" { if settings.ServerIP != "" { hostPort, hpErr := deps.Docker.InspectContainerPort(ctx, containerID, containerPort+"/tcp") if hpErr != nil { slog.Warn("dockerfile: could not get host port for remote NPM", "workload", w.Name, "error", hpErr) } else { forwardHost = settings.ServerIP forwardPort = int(hostPort) } } } // Configure proxy if a domain is set. Replace any prior route // in-place so traffic shifts atomically over to the new container. proxyRouteID := prevProxyRouteID if domain != "" { if prevProxyRouteID != "" { deps.Proxy.DeleteRoute(ctx, prevProxyRouteID) } routeID, rerr := deps.Proxy.ConfigureRoute(ctx, domain, forwardHost, forwardPort, proxy.RouteOptions{ SSLCertificateID: settings.SSLCertificateID, }) if rerr != nil { slog.Warn("dockerfile: failed to configure proxy", "workload", w.Name, "domain", domain, "target", fmt.Sprintf("%s:%d", forwardHost, forwardPort), "error", rerr) } else { proxyRouteID = routeID slog.Info("dockerfile: proxy configured", "workload", w.Name, "domain", domain, "target", fmt.Sprintf("%s:%d", forwardHost, forwardPort), "routeID", routeID) } } // Drop the previous container only after the new one is healthy // + routed. Different-ID-than-previous tells us we created a // fresh one (vs returning the same ID via UpsertContainer reuse). if prevContainerID != "" && prevContainerID != containerID { deps.Docker.StopContainer(ctx, prevContainerID, 10) deps.Docker.RemoveContainer(ctx, prevContainerID, true) } // Single transactional write of new state + container metadata. // On failure: tear down the just-created container + proxy route // so we don't leave orphans behind for the next deploy to trip // over. if err := saveState(deps, w, func(rs *runtimeState, c *store.Container) { rs.LastCommitSHA = latestSHA rs.LastSyncAt = store.Now() rs.LastError = "" rs.Status = "deployed" c.ContainerID = containerID c.ProxyRouteID = proxyRouteID c.Subdomain = domain c.State = "running" c.Port = cfg.Port c.ImageRef = imageTag }); err != nil { slog.Error("dockerfile: failed to persist deploy state — rolling back", "workload", w.Name, "error", err) if proxyRouteID != "" { deps.Proxy.DeleteRoute(ctx, proxyRouteID) } deps.Docker.StopContainer(ctx, containerID, 10) deps.Docker.RemoveContainer(ctx, containerID, true) updateStatus(deps, w, "failed", latestSHA, sanitizeError(fmt.Sprintf("persist deploy state: %v", err), token)) return fmt.Errorf("persist deploy state: %w", err) } publishEvent(deps, w, "deployed") dispatchBuildNotification(deps, w, domain, "deployed", "") slog.Info("dockerfile deployed", "workload", w.Name, "sha", shortSHA(latestSHA), "image", imageTag) return nil } // statusTargetURL derives the https URL the commit status links back to — // the workload's primary public face, or "" when it has none. func statusTargetURL(domain string) string { if domain == "" { return "" } return "https://" + domain } // updateStatus writes the runtime-state status/error/commit and (on // terminal states) fires the side effects the static plugin's helper // does: failures land in the event log, and a "deployed" or "failed" // transition dispatches an outbound notification. // // The deploy success path calls saveState directly with the full // container metadata; this helper covers failure / intermediate // transitions where only state moves. func updateStatus(deps plugin.Deps, w plugin.Workload, status, commitSHA, errMsg string) { if err := saveState(deps, w, func(rs *runtimeState, c *store.Container) { rs.Status = status rs.LastError = errMsg if commitSHA != "" { rs.LastCommitSHA = commitSHA } switch status { case "deployed": c.State = "running" case "stopped": c.State = "stopped" case "failed": c.State = "failed" case "syncing", "building": // Don't churn the container row's state during in-progress // build/sync — leave whatever the previous deploy left. } }); err != nil { slog.Error("dockerfile: failed to update status", "id", w.ID, "status", status, "error", err) } if status == "failed" { publishEvent(deps, w, "failed: "+errMsg) } if status == "deployed" || status == "failed" { dispatchBuildNotification(deps, w, primaryDomain(deps, w), status, errMsg) } } // dispatchBuildNotification fans the build event out to every // configured notification route for the workload. Multi-destination // fan-out (workload_notifications rows + legacy single URL + global // settings fallback) is centralised in plugin.DispatchNotificationForWorkload // so the routing rules are identical across source kinds. func dispatchBuildNotification(deps plugin.Deps, w plugin.Workload, domain, status, errMsg string) { eventType := "build_success" if status == "failed" { eventType = "build_failure" } siteURL := "" if domain != "" { siteURL = "https://" + domain } plugin.DispatchNotificationForWorkload(deps, w, notify.Event{ Type: eventType, Project: w.Name, URL: siteURL, Error: errMsg, }) } // publishEvent records a workload-scoped deploy event in the audit log. // The InsertEvent + bus publish (and consistent message/metadata shape // across source kinds) is centralised in plugin.EmitDeployEvent so the // dashboard's audit feed and the per-workload timeline read identically // for image / compose / static / dockerfile deploys. func publishEvent(deps plugin.Deps, w plugin.Workload, status string) { plugin.EmitDeployEvent(deps, w, "dockerfile", status) } // publishBuildLog emits one EventBuildLog per non-empty daemon "stream" // line. The trailing newline the daemon emits per line is trimmed so the // UI can render each event as its own row without smuggled blanks. // Strictly best-effort: the bus drops events under backpressure (slow // subscriber, no subscriber at all) and never blocks the build loop. func publishBuildLog(deps plugin.Deps, w plugin.Workload, line string) { trimmed := strings.TrimRight(line, "\r\n") if trimmed == "" { return } deps.Events.Publish(events.Event{ Type: events.EventBuildLog, Payload: events.BuildLogPayload{ WorkloadID: w.ID, Line: trimmed, Stream: "stdout", }, }) } // healUnchanged is the no-rebuild short-circuit result: the SHA matches and // the live container + proxy are healthy, so there is nothing to deploy. If a // prior transient failure left the persisted status as something other than // "deployed", repair it so the dashboard reflects reality and we stop treating // a healthy workload as failed. We heal via saveState directly (NOT // updateStatus) so this reconciliation does not fire a spurious build-success // notification on every poll. func healUnchanged(deps plugin.Deps, w plugin.Workload, prev runtimeState, latestSHA string) error { slog.Info("dockerfile: no changes", "workload", w.Name, "sha", shortSHA(latestSHA)) if prev.Status == "deployed" { return nil } if err := saveState(deps, w, func(rs *runtimeState, c *store.Container) { rs.Status = "deployed" rs.LastError = "" c.State = "running" }); err != nil { slog.Warn("dockerfile: failed to heal stale status to deployed", "workload", w.Name, "error", err) } return nil } // removeContainerByName enumerates Docker's view and best-effort drops // EVERY matching container so a name conflict in CreateContainer is // recoverable. Container names are unique per daemon, but the recovery // path exists precisely because a conflict occurred — a prior partial // deploy can leave more than one matching artifact, so we must not stop // at the first. Mirrors the static plugin's helper of the same name. func removeContainerByName(ctx context.Context, deps plugin.Deps, name string) { containers, err := deps.Docker.ListContainers(ctx, nil) if err != nil { return } for _, c := range containers { if c.Name == name { deps.Docker.StopContainer(ctx, c.ID, 10) deps.Docker.RemoveContainer(ctx, c.ID, true) } } } // primaryDomain mirrors the static plugin's helper of the same name — // derives an FQDN from the workload's first enabled public face, with // the same bare-subdomain + settings.Domain fall-through. func primaryDomain(deps plugin.Deps, w plugin.Workload) string { for _, f := range w.PublicFaces { if f.Subdomain == "" && f.Domain == "" { continue } switch { case f.Subdomain != "" && f.Domain != "": return f.Subdomain + "." + f.Domain case f.Subdomain == "" && f.Domain != "": return f.Domain case f.Subdomain != "" && f.Domain == "": settings, err := deps.Store.GetSettings() if err != nil || settings.Domain == "" { return f.Subdomain } return f.Subdomain + "." + settings.Domain } } return "" } // shortSHA truncates a commit SHA for log lines. Keeps the deploy log // readable without losing the "is this the same commit?" signal. func shortSHA(sha string) string { if len(sha) > 8 { return sha[:8] } return sha }