Files
tiny-forge/internal/workload/plugin/source/dockerfile/deploy.go
T
alexei.dolgolyov 410a131cec feat(apps): stepped creation wizard, branch previews, and app-creation fixes
This session (frontend focus):
- Rebuild /apps/new as a 4-step wizard (Basics → Configure → Trigger → Review):
  WizardRail, SourceKindPicker card grid, AppManifest review, per-step validation,
  ConfirmDialog-based unsaved-changes guard.
- Extract lib/workload/sourceForms.ts (single source of truth for source_config)
  + {Image,Compose,Static,Dockerfile}SourceForm + StaticDiscoveryWizard; fold the
  /apps/[id] edit form onto the same components (removes the duplication). Add
  vitest + sourceForms unit tests.
- Branch preview environments UI: /chain is_preview/preview_branch + a Preview
  environments panel on /apps/[id] (per-branch URLs, ConfirmDialog teardown, armed
  state); RegistryImagePicker on the registry trigger and the image source.
- Fixes: image-inspect 404 -> admin-gated POST /api/discovery/image/inspect;
  conflict-panel blur flicker; friendly localized discovery errors; CPU/Memory
  label hints; dashboard + /apps "Total workloads" count only source_kind workloads
  (drop stale trigger_kind gate); NPM cert/access-list name cache; EntityPicker
  empty-list guard.
- Update CLAUDE.md frontend conventions + add a Build & Test section.

Also captures pre-existing in-progress platform work (not from this session):
workload notifications, Prometheus metrics export, store lockfile, health probes,
backup hardening, and related store/webhook/scheduler changes.
2026-05-29 02:09:54 +03:00

575 lines
20 KiB
Go

package dockerfile
import (
"context"
"encoding/json"
"fmt"
"io"
"log/slog"
"os"
"strconv"
"strings"
"time"
"github.com/alexei/tinyforge/internal/crypto"
"github.com/alexei/tinyforge/internal/docker"
"github.com/alexei/tinyforge/internal/events"
"github.com/alexei/tinyforge/internal/notify"
"github.com/alexei/tinyforge/internal/proxy"
"github.com/alexei/tinyforge/internal/staticsite"
"github.com/alexei/tinyforge/internal/store"
"github.com/alexei/tinyforge/internal/workload/plugin"
)
// healthCheckDelay is the grace window after StartContainer before we
// probe IsContainerRunning. Mirrors the static plugin's window — short
// enough not to slow happy-path deploys, long enough to catch
// crash-on-boot failures (missing env, bad CMD, port conflict).
//
// deploy waits for this duration, or until its context is cancelled
// (whichever happens first), between starting the new container and
// checking that it is still running.
const healthCheckDelay = 3 * time.Second
// deploy runs one end-to-end sync of a dockerfile workload:
//
//  1. fetch the latest commit SHA from the configured git provider
//  2. skip if SHA + container + proxy are all still healthy
//  3. clone the repo into a temp dir
//  4. resolve the build context + Dockerfile location
//  5. `docker build -t <tag> -f <dockerfile> <context>`
//  6. recreate the container with the new image
//  7. health-probe the container, surface logs on failure
//  8. reconfigure the proxy route
//  9. tear down the previous container (different ID) once we're sure
//     the new one is healthy and proxied
//
// Each step writes its own status update so the dashboard's runtime-
// state panel can show a useful intermediate state when the deploy
// stalls on the slow step (almost always the build).
//
// Parameters:
//   - ctx bounds the provider, Docker and proxy calls. Teardown of a
//     container this call created runs on a context detached from
//     ctx's cancellation, so a cancelled deploy does not orphan it.
//   - deps supplies the store, Docker client, proxy, event bus and
//     encryption key.
//   - w is the workload being deployed; its source config is decoded
//     into Config.
//   - intent.Reason selects forced ("", "manual", "promote") versus
//     short-circuitable deploys.
//
// Returns nil on a successful deploy or an unchanged-and-healthy
// short-circuit; otherwise the wrapped error of the step that failed
// (ctx.Err() when cancelled during the health-check window). Every
// failure path also records a "failed" status via updateStatus.
func deploy(ctx context.Context, deps plugin.Deps, w plugin.Workload, intent plugin.DeploymentIntent) error {
	// cleanupTimeout bounds best-effort teardown that runs on a context
	// detached from ctx.
	const cleanupTimeout = 30 * time.Second
	// detached returns a context that keeps ctx's values but ignores
	// its cancellation. Teardown of a container we just created must
	// still be attempted when ctx is already cancelled; issuing it on
	// ctx would most likely abort the call and leave the container
	// behind.
	detached := func() (context.Context, context.CancelFunc) {
		return context.WithTimeout(context.WithoutCancel(ctx), cleanupTimeout)
	}

	cfg, err := plugin.SourceConfigOf[Config](w)
	if err != nil {
		return fmt.Errorf("dockerfile source: decode config: %w", err)
	}
	prev, prevContainer, err := loadState(deps, w)
	if err != nil {
		return err
	}
	// Force a full rebuild on manual / promote / first-time deploys
	// (no Reason at all also implies manual). Schedule / git triggers
	// honour the unchanged-SHA short-circuit so cron polling does not
	// rebuild minute-by-minute when nothing changed.
	force := intent.Reason == "" || intent.Reason == "manual" || intent.Reason == "promote"
	// Decrypt the access token if present. Token never escapes this
	// frame: any error message routes through sanitizeError(_, token)
	// which redacts the literal substring. A decrypt failure is logged
	// and the deploy continues with an empty token.
	token := ""
	if cfg.AccessToken != "" {
		decrypted, derr := crypto.Decrypt(deps.EncKey, cfg.AccessToken)
		if derr != nil {
			slog.Warn("dockerfile source: failed to decrypt access token",
				"workload", w.Name, "error", derr)
		} else {
			token = decrypted
		}
	}
	provider, err := staticsite.NewGitProvider(staticsite.ProviderType(cfg.Provider), cfg.BaseURL, token)
	if err != nil {
		updateStatus(deps, w, "failed", prev.LastCommitSHA,
			sanitizeError(fmt.Sprintf("create provider: %v", err), token))
		return fmt.Errorf("create provider: %w", err)
	}
	latestSHA, err := provider.GetLatestCommitSHA(ctx, cfg.RepoOwner, cfg.RepoName, cfg.Branch)
	if err != nil {
		updateStatus(deps, w, "failed", prev.LastCommitSHA,
			sanitizeError(fmt.Sprintf("fetch commit SHA: %v", err), token))
		return fmt.Errorf("get latest commit: %w", err)
	}
	domain := primaryDomain(deps, w)
	prevContainerID := ""
	prevProxyRouteID := ""
	if prevContainer != nil {
		prevContainerID = prevContainer.ContainerID
		prevProxyRouteID = prevContainer.ProxyRouteID
	}
	// Short-circuit: SHA unchanged AND container is still running AND
	// (if there's a public face) the proxy route still exists. Manual
	// deploys skip this entirely.
	//
	// We deliberately do NOT gate this on prev.Status == "deployed". A
	// transient failure (e.g. a one-off proxy-check error) leaves the
	// persisted status as "failed"; if we required "deployed" here, every
	// subsequent cron/git poll with the same SHA would fall through to a
	// full clone + docker build despite a perfectly healthy running
	// container — a rebuild storm that burns CPU/disk until a new commit
	// lands. Instead we trust the live container/proxy state and heal the
	// stale status via healUnchanged.
	if !force && latestSHA == prev.LastCommitSHA && prevContainerID != "" {
		// An inspect error is deliberately folded into "not running":
		// when in doubt we redeploy rather than skip.
		running, _ := deps.Docker.IsContainerRunning(ctx, prevContainerID)
		switch {
		case !running:
			slog.Info("dockerfile: container not running, forcing redeploy", "workload", w.Name)
		case domain != "":
			proxyOK, perr := deps.Proxy.RouteExists(ctx, domain)
			switch {
			case perr != nil:
				slog.Warn("dockerfile: proxy check failed, forcing redeploy",
					"workload", w.Name, "error", perr)
			case !proxyOK:
				slog.Info("dockerfile: proxy route missing, forcing redeploy", "workload", w.Name)
			default:
				return healUnchanged(deps, w, prev, latestSHA)
			}
		default:
			return healUnchanged(deps, w, prev, latestSHA)
		}
	}
	updateStatus(deps, w, "syncing", prev.LastCommitSHA, "")
	publishEvent(deps, w, "syncing")
	// Clone the repo into a temp dir. We always download the entire
	// repo tree (folderPath = ""); a ContextPath subset is applied
	// at build time, not at download time, so a Dockerfile in
	// `./docker/Dockerfile` with `ContextPath=""` still works.
	cloneDir, err := os.MkdirTemp("", "tf-build-"+idShort(w)+"-*")
	if err != nil {
		updateStatus(deps, w, "failed", prev.LastCommitSHA,
			sanitizeError(fmt.Sprintf("create clone dir: %v", err), token))
		return fmt.Errorf("create clone dir: %w", err)
	}
	defer os.RemoveAll(cloneDir)
	if err := provider.DownloadFolder(ctx, cfg.RepoOwner, cfg.RepoName, cfg.Branch, "", cloneDir); err != nil {
		updateStatus(deps, w, "failed", prev.LastCommitSHA,
			sanitizeError(fmt.Sprintf("download repo: %v", err), token))
		return fmt.Errorf("download repo: %w", err)
	}
	// Resolve the build context (with symlink-aware escape check) and
	// verify the Dockerfile is actually present before sending the
	// build off to the daemon.
	contextDir, err := resolveContextDir(cloneDir, cfg.ContextPath)
	if err != nil {
		updateStatus(deps, w, "failed", latestSHA,
			sanitizeError(fmt.Sprintf("resolve context: %v", err), token))
		return fmt.Errorf("resolve context: %w", err)
	}
	if err := verifyDockerfileExists(contextDir, cfg.DockerfilePath); err != nil {
		updateStatus(deps, w, "failed", latestSHA,
			sanitizeError(err.Error(), token))
		return err
	}
	imageTag := imageTagFor(w)
	updateStatus(deps, w, "building", latestSHA, "")
	publishEvent(deps, w, "building")
	// Bridge per-line build output onto the event bus so /api/events
	// subscribers (the dashboard's live tail) can show progress while
	// the daemon chugs. The bus is non-blocking — slow subscribers drop
	// events rather than backpressure the build — so this is safe to
	// call from the hot scan loop.
	logFn := func(line string) {
		publishBuildLog(deps, w, line)
	}
	if err := deps.Docker.BuildImageAt(ctx, contextDir, cfg.DockerfilePath, imageTag, logFn); err != nil {
		updateStatus(deps, w, "failed", latestSHA,
			sanitizeError(fmt.Sprintf("docker build: %v", err), token))
		return fmt.Errorf("docker build: %w", err)
	}
	env := buildEnv(deps, w.ID)
	containerPort := strconv.Itoa(cfg.Port)
	settings, err := deps.Store.GetSettings()
	if err != nil {
		updateStatus(deps, w, "failed", latestSHA,
			sanitizeError(fmt.Sprintf("get settings: %v", err), token))
		return fmt.Errorf("get settings: %w", err)
	}
	networkName := settings.Network
	networkID, err := deps.Docker.EnsureNetwork(ctx, networkName)
	if err != nil {
		updateStatus(deps, w, "failed", latestSHA,
			sanitizeError(fmt.Sprintf("ensure network: %v", err), token))
		return fmt.Errorf("ensure network: %w", err)
	}
	containerName := containerNameFor(w)
	// Per-face proxy labels (Traefik consumes these; NPM ignores them).
	labels := map[string]string{}
	if domain != "" {
		if l := deps.Proxy.ContainerLabels(domain, cfg.Port); l != nil {
			for k, v := range l {
				labels[k] = v
			}
		}
	}
	cc := docker.ContainerConfig{
		Name:         containerName,
		Image:        imageTag,
		Env:          env,
		ExposedPorts: []string{containerPort + "/tcp"},
		NetworkName:  networkName,
		NetworkID:    networkID,
		Labels:       labels,
		WorkloadID:   w.ID,
		// Dockerfile workloads are tagged as "build" so the dashboard
		// and any filtered query can distinguish them from static sites
		// (which serve files) and image-source containers (which pull
		// pre-built images from a registry).
		WorkloadKind: string(store.WorkloadKindBuild),
		Role:         "",
	}
	containerID, err := deps.Docker.CreateContainer(ctx, cc)
	if err != nil {
		// Name conflict — best-effort cleanup of any prior container
		// (by ID first; by name as a fallback) and one retry.
		if prevContainerID != "" {
			deps.Docker.StopContainer(ctx, prevContainerID, 10)
			deps.Docker.RemoveContainer(ctx, prevContainerID, true)
		}
		removeContainerByName(ctx, deps, containerName)
		containerID, err = deps.Docker.CreateContainer(ctx, cc)
		if err != nil {
			updateStatus(deps, w, "failed", latestSHA,
				sanitizeError(fmt.Sprintf("create container: %v", err), token))
			return fmt.Errorf("create container: %w", err)
		}
	}
	if err := deps.Docker.StartContainer(ctx, containerID); err != nil {
		// The start may have failed because ctx was cancelled, so the
		// removal runs on a detached context.
		cleanupCtx, cancelCleanup := detached()
		deps.Docker.RemoveContainer(cleanupCtx, containerID, true)
		cancelCleanup()
		updateStatus(deps, w, "failed", latestSHA,
			sanitizeError(fmt.Sprintf("start container: %v", err), token))
		return fmt.Errorf("start container: %w", err)
	}
	// Brief health-check window — catch crash-on-boot. ctx-aware so a
	// cancelled deploy returns promptly. On failure surface the tail
	// of the container's logs as the error reason; that's almost
	// always what the operator needs to debug.
	select {
	case <-ctx.Done():
		// ctx is cancelled by definition in this branch; remove the
		// just-started container on a detached context so it is not
		// orphaned.
		cleanupCtx, cancelCleanup := detached()
		deps.Docker.RemoveContainer(cleanupCtx, containerID, true)
		cancelCleanup()
		updateStatus(deps, w, "failed", latestSHA, "deploy cancelled before health check")
		return ctx.Err()
	case <-time.After(healthCheckDelay):
	}
	running, runErr := deps.Docker.IsContainerRunning(ctx, containerID)
	if runErr != nil || !running {
		// runErr may itself be a cancellation of ctx, so both the log
		// fetch and the removal use a detached context.
		cleanupCtx, cancelCleanup := detached()
		defer cancelCleanup()
		logMsg := "container exited immediately after start"
		if logs, logErr := deps.Docker.ContainerLogs(cleanupCtx, containerID, false, "40"); logErr == nil {
			buf, _ := io.ReadAll(logs)
			logs.Close()
			if len(buf) > 0 {
				// Pass `env` so any decrypted KEY=VALUE pair that the
				// container's startup output happens to echo (think
				// `RUN echo $DB_PASSWORD` in a debug Dockerfile) is
				// redacted before it lands in the operator-visible
				// last_error field.
				logMsg = sanitizeErrorWithSecrets(string(buf), token, env)
			}
		}
		deps.Docker.RemoveContainer(cleanupCtx, containerID, true)
		updateStatus(deps, w, "failed", latestSHA, logMsg)
		return fmt.Errorf("container not running: %s", logMsg)
	}
	// Resolve proxy target: in-network DNS by default, NPM-remote
	// override uses (settings.ServerIP, hostPort).
	forwardHost := containerName
	forwardPort := cfg.Port
	if settings.NpmRemote && settings.ProxyProvider == "npm" {
		if settings.ServerIP != "" {
			hostPort, hpErr := deps.Docker.InspectContainerPort(ctx, containerID, containerPort+"/tcp")
			if hpErr != nil {
				slog.Warn("dockerfile: could not get host port for remote NPM",
					"workload", w.Name, "error", hpErr)
			} else {
				forwardHost = settings.ServerIP
				forwardPort = int(hostPort)
			}
		}
	}
	// Configure proxy if a domain is set. Replace any prior route
	// in-place so traffic shifts atomically over to the new container.
	// A ConfigureRoute failure is logged but does not fail the deploy;
	// proxyRouteID then keeps the previous route's ID.
	proxyRouteID := prevProxyRouteID
	if domain != "" {
		if prevProxyRouteID != "" {
			deps.Proxy.DeleteRoute(ctx, prevProxyRouteID)
		}
		routeID, rerr := deps.Proxy.ConfigureRoute(ctx, domain, forwardHost, forwardPort, proxy.RouteOptions{
			SSLCertificateID: settings.SSLCertificateID,
		})
		if rerr != nil {
			slog.Warn("dockerfile: failed to configure proxy",
				"workload", w.Name, "domain", domain,
				"target", fmt.Sprintf("%s:%d", forwardHost, forwardPort), "error", rerr)
		} else {
			proxyRouteID = routeID
			slog.Info("dockerfile: proxy configured",
				"workload", w.Name, "domain", domain,
				"target", fmt.Sprintf("%s:%d", forwardHost, forwardPort), "routeID", routeID)
		}
	}
	// Drop the previous container only after the new one is healthy
	// + routed. Different-ID-than-previous tells us we created a
	// fresh one (vs returning the same ID via UpsertContainer reuse).
	if prevContainerID != "" && prevContainerID != containerID {
		deps.Docker.StopContainer(ctx, prevContainerID, 10)
		deps.Docker.RemoveContainer(ctx, prevContainerID, true)
	}
	// Single transactional write of new state + container metadata.
	// On failure: tear down the just-created container + proxy route
	// so we don't leave orphans behind for the next deploy to trip
	// over.
	if err := saveState(deps, w, func(rs *runtimeState, c *store.Container) {
		rs.LastCommitSHA = latestSHA
		rs.LastSyncAt = store.Now()
		rs.LastError = ""
		rs.Status = "deployed"
		c.ContainerID = containerID
		c.ProxyRouteID = proxyRouteID
		c.Subdomain = domain
		c.State = "running"
		c.Port = cfg.Port
		c.ImageRef = imageTag
	}); err != nil {
		slog.Error("dockerfile: failed to persist deploy state — rolling back",
			"workload", w.Name, "error", err)
		// saveState takes no context, so ctx may have been cancelled
		// while it ran; roll back on a detached context.
		cleanupCtx, cancelCleanup := detached()
		defer cancelCleanup()
		if proxyRouteID != "" {
			deps.Proxy.DeleteRoute(cleanupCtx, proxyRouteID)
		}
		deps.Docker.StopContainer(cleanupCtx, containerID, 10)
		deps.Docker.RemoveContainer(cleanupCtx, containerID, true)
		updateStatus(deps, w, "failed", latestSHA,
			sanitizeError(fmt.Sprintf("persist deploy state: %v", err), token))
		return fmt.Errorf("persist deploy state: %w", err)
	}
	publishEvent(deps, w, "deployed")
	dispatchBuildNotification(deps, w, domain, "deployed", "")
	slog.Info("dockerfile deployed",
		"workload", w.Name,
		"sha", shortSHA(latestSHA),
		"image", imageTag)
	return nil
}
// updateStatus persists a status transition for the workload and then
// fires the side effects tied to terminal states.
//
// The runtime state always receives status and errMsg (an empty errMsg
// clears the previous error); commitSHA overwrites the stored SHA only
// when non-empty. The container row's State follows the status for
// "deployed", "stopped" and "failed" and is left untouched for every
// other status, so in-progress "syncing"/"building" updates do not
// churn whatever the previous deploy recorded.
//
// A persistence error is logged, not returned, and does not suppress
// the side effects: "failed" writes an event-log entry, and both
// "failed" and "deployed" dispatch an outbound notification.
//
// The deploy success path calls saveState directly with the full
// container metadata; this helper covers failure / intermediate
// transitions where only state moves.
func updateStatus(deps plugin.Deps, w plugin.Workload, status, commitSHA, errMsg string) {
	// Container-row state implied by each status that has one.
	containerStates := map[string]string{
		"deployed": "running",
		"stopped":  "stopped",
		"failed":   "failed",
	}
	apply := func(rs *runtimeState, c *store.Container) {
		rs.Status = status
		rs.LastError = errMsg
		if commitSHA != "" {
			rs.LastCommitSHA = commitSHA
		}
		if next, ok := containerStates[status]; ok {
			c.State = next
		}
	}
	if saveErr := saveState(deps, w, apply); saveErr != nil {
		slog.Error("dockerfile: failed to update status",
			"id", w.ID, "status", status, "error", saveErr)
	}

	switch status {
	case "failed":
		publishEvent(deps, w, "failed: "+errMsg)
		dispatchBuildNotification(deps, w, primaryDomain(deps, w), status, errMsg)
	case "deployed":
		dispatchBuildNotification(deps, w, primaryDomain(deps, w), status, errMsg)
	}
}
// dispatchBuildNotification builds the notify.Event for a build outcome
// and hands it to plugin.DispatchNotificationForWorkload, which owns
// the per-workload routing so the rules stay identical across source
// kinds.
//
// The event type is "build_failure" when status is "failed" and
// "build_success" for any other status. URL is "https://" + domain, or
// empty when the workload has no domain. errMsg is passed through as
// the event's Error.
func dispatchBuildNotification(deps plugin.Deps, w plugin.Workload, domain, status, errMsg string) {
	evt := notify.Event{
		Type:    "build_success",
		Project: w.Name,
		Error:   errMsg,
	}
	if status == "failed" {
		evt.Type = "build_failure"
	}
	if domain != "" {
		evt.URL = "https://" + domain
	}
	plugin.DispatchNotificationForWorkload(deps, w, evt)
}
// publishEvent persists an event_log row for a status change and, once
// the row is stored, publishes the same entry on the event bus.
//
// Severity is "error" when status starts with "failed" and "info"
// otherwise. The message has the shape `Build "<name>": <status>`, and
// the metadata is a JSON object carrying workload_id, workload_name
// and status (falling back to "{}" if marshalling fails).
//
// If the insert fails, the error is logged and nothing is published:
// the bus payload needs the stored row's ID and CreatedAt.
func publishEvent(deps plugin.Deps, w plugin.Workload, status string) {
	const source = "dockerfile"

	level := "info"
	if strings.HasPrefix(status, "failed") {
		level = "error"
	}
	text := fmt.Sprintf("Build %q: %s", w.Name, status)

	meta := "{}"
	if raw, marshalErr := json.Marshal(map[string]string{
		"workload_id":   w.ID,
		"workload_name": w.Name,
		"status":        status,
	}); marshalErr != nil {
		slog.Error("dockerfile: marshal event metadata", "error", marshalErr)
	} else {
		meta = string(raw)
	}

	row, insertErr := deps.Store.InsertEvent(store.EventLog{
		Source:   source,
		Severity: level,
		Message:  text,
		Metadata: meta,
	})
	if insertErr != nil {
		slog.Error("dockerfile: failed to persist event log", "error", insertErr)
		return
	}

	payload := events.EventLogPayload{
		ID:        row.ID,
		Source:    source,
		Severity:  level,
		Message:   text,
		Metadata:  meta,
		CreatedAt: row.CreatedAt,
	}
	deps.Events.Publish(events.Event{
		Type:    events.EventLog,
		Payload: payload,
	})
}
// publishBuildLog forwards one line of build output to the event bus
// as an EventBuildLog tagged with the workload ID and the "stdout"
// stream.
//
// Trailing '\r' and '\n' characters are stripped so each event renders
// as a single row; a line that is empty after stripping is dropped.
// Nothing is returned and nothing is persisted — delivery is
// best-effort and relies on the bus not blocking the build loop
// (NOTE(review): Publish is assumed non-blocking; confirm against the
// events package).
func publishBuildLog(deps plugin.Deps, w plugin.Workload, line string) {
	text := strings.TrimRight(line, "\r\n")
	if text != "" {
		payload := events.BuildLogPayload{
			WorkloadID: w.ID,
			Line:       text,
			Stream:     "stdout",
		}
		deps.Events.Publish(events.Event{
			Type:    events.EventBuildLog,
			Payload: payload,
		})
	}
}
// healUnchanged finishes a deploy that found nothing to do: the commit
// SHA matches and the caller has already verified that the live
// container (and proxy route, if any) is healthy.
//
// It logs the no-op and, when the persisted status is anything other
// than "deployed", rewrites it to "deployed", clears the last error and
// marks the container row "running" so the dashboard stops showing a
// healthy workload as failed. The repair goes through saveState rather
// than updateStatus on purpose: updateStatus would dispatch a
// build-success notification on every poll.
//
// Always returns nil; a failed repair is only logged.
func healUnchanged(deps plugin.Deps, w plugin.Workload, prev runtimeState, latestSHA string) error {
	slog.Info("dockerfile: no changes", "workload", w.Name, "sha", shortSHA(latestSHA))

	if prev.Status != "deployed" {
		markDeployed := func(rs *runtimeState, c *store.Container) {
			rs.Status = "deployed"
			rs.LastError = ""
			c.State = "running"
		}
		if healErr := saveState(deps, w, markDeployed); healErr != nil {
			slog.Warn("dockerfile: failed to heal stale status to deployed",
				"workload", w.Name, "error", healErr)
		}
	}
	return nil
}
// removeContainerByName lists the containers Docker knows about and
// stops, then force-removes, every one whose name equals name, so a
// name conflict in CreateContainer becomes recoverable.
//
// The scan deliberately does not stop at the first match: the recovery
// path exists precisely because a conflict occurred, and a prior
// partial deploy can leave more than one matching artifact.
//
// Strictly best-effort: a failed listing returns silently, and
// stop/remove errors are ignored.
func removeContainerByName(ctx context.Context, deps plugin.Deps, name string) {
	listed, listErr := deps.Docker.ListContainers(ctx, nil)
	if listErr != nil {
		return
	}
	for _, candidate := range listed {
		if candidate.Name != name {
			continue
		}
		deps.Docker.StopContainer(ctx, candidate.ID, 10)
		deps.Docker.RemoveContainer(ctx, candidate.ID, true)
	}
}
// primaryDomain derives the workload's FQDN from the first public face
// that has a subdomain or a domain set; faces with neither are skipped.
//
// Resolution for the chosen face:
//   - subdomain and domain both set: "<subdomain>.<domain>"
//   - domain only: the domain itself
//   - subdomain only: "<subdomain>.<settings.Domain>", or the bare
//     subdomain when settings cannot be loaded or define no domain
//
// Returns "" when no face qualifies.
func primaryDomain(deps plugin.Deps, w plugin.Workload) string {
	for _, face := range w.PublicFaces {
		sub, dom := face.Subdomain, face.Domain
		if sub == "" && dom == "" {
			continue
		}
		if dom != "" {
			if sub == "" {
				return dom
			}
			return sub + "." + dom
		}
		// Bare subdomain: settings are only consulted on this path.
		settings, err := deps.Store.GetSettings()
		if err == nil && settings.Domain != "" {
			return sub + "." + settings.Domain
		}
		return sub
	}
	return ""
}
// shortSHA abbreviates a commit SHA to its first 8 bytes for log lines,
// which keeps them readable while still answering "is this the same
// commit?". Inputs of 8 bytes or fewer (including "") are returned
// unchanged.
func shortSHA(sha string) string {
	const keep = 8
	if len(sha) <= keep {
		return sha
	}
	return sha[:keep]
}