tiny-forge/internal/workload/plugin/source/dockerfile/deploy.go

package dockerfile

import (
	"context"
	"encoding/json"
	"fmt"
	"io"
	"log/slog"
	"os"
	"strconv"
	"strings"
	"time"

	"github.com/alexei/tinyforge/internal/crypto"
	"github.com/alexei/tinyforge/internal/docker"
	"github.com/alexei/tinyforge/internal/events"
	"github.com/alexei/tinyforge/internal/notify"
	"github.com/alexei/tinyforge/internal/proxy"
	"github.com/alexei/tinyforge/internal/staticsite"
	"github.com/alexei/tinyforge/internal/store"
	"github.com/alexei/tinyforge/internal/workload/plugin"
)

// healthCheckDelay is the grace window deploy waits after StartContainer
// before it probes IsContainerRunning. Mirrors the static plugin's
// window — short enough not to slow happy-path deploys, long enough to
// catch crash-on-boot failures (missing env, bad CMD, port conflict).
// The wait is abandoned early if the deploy context is cancelled.
const healthCheckDelay = 3 * time.Second

// deploy runs one end-to-end sync of a dockerfile workload:
//
//  1. fetch the latest commit SHA from the configured git provider
//  2. skip if SHA + container + proxy are all still healthy
//  3. clone the repo into a temp dir
//  4. resolve the build context + Dockerfile location
//  5. `docker build -t <tag> -f <dockerfile> <context>`
//  6. recreate the container with the new image
//  7. health-probe the container, surface logs on failure
//  8. reconfigure the proxy route
//  9. tear down the previous container (different ID) after the health
//     probe passed and the proxy step ran
//
// Each step writes its own status update so the dashboard's runtime-
// state panel can show a useful intermediate state when the deploy
// stalls on the slow step (almost always the build).
//
// intent.Reason decides whether the unchanged-SHA short-circuit applies:
// "", "manual" and "promote" always rebuild; any other reason may skip.
//
// Returns nil on a successful deploy or a no-op skip. On failure it
// returns an error naming the failed step; every failure after the
// config and stored state were loaded is also recorded via updateStatus
// with status "failed". If ctx is cancelled while waiting for the health
// probe, the new container is discarded and ctx.Err() is returned.
//
// Teardown of a container created by this call runs on a context that is
// detached from ctx's cancellation (bounded by cleanupTimeout), so a
// cancelled or timed-out deploy does not leave the container behind.
func deploy(ctx context.Context, deps plugin.Deps, w plugin.Workload, intent plugin.DeploymentIntent) error {
	// cleanupTimeout bounds teardown calls made on the detached context.
	const cleanupTimeout = 30 * time.Second

	// cleanupContext returns a context for teardown work. It keeps ctx's
	// values but ignores its cancellation: once ctx is done, any call made
	// with ctx itself would fail before reaching the daemon.
	cleanupContext := func() (context.Context, context.CancelFunc) {
		return context.WithTimeout(context.WithoutCancel(ctx), cleanupTimeout)
	}
	// discardContainer best-effort force-removes a container this deploy
	// created.
	discardContainer := func(id string) {
		cctx, cancel := cleanupContext()
		defer cancel()
		deps.Docker.RemoveContainer(cctx, id, true)
	}

	cfg, err := plugin.SourceConfigOf[Config](w)
	if err != nil {
		return fmt.Errorf("dockerfile source: decode config: %w", err)
	}

	prev, prevContainer, err := loadState(deps, w)
	if err != nil {
		return err
	}

	// Force a full rebuild on manual / promote / first-time deploys
	// (no Reason at all also implies manual). Schedule / git triggers
	// honour the unchanged-SHA short-circuit so cron polling does not
	// rebuild minute-by-minute when nothing changed.
	force := intent.Reason == "" || intent.Reason == "manual" || intent.Reason == "promote"

	// Decrypt the access token if present. A decrypt failure is logged
	// and the deploy continues with an empty token. Every message stored
	// via updateStatus below is first passed through sanitizeError (or
	// sanitizeErrorWithSecrets) together with the token.
	token := ""
	if cfg.AccessToken != "" {
		decrypted, derr := crypto.Decrypt(deps.EncKey, cfg.AccessToken)
		if derr != nil {
			slog.Warn("dockerfile source: failed to decrypt access token",
				"workload", w.Name, "error", derr)
		} else {
			token = decrypted
		}
	}

	provider, err := staticsite.NewGitProvider(staticsite.ProviderType(cfg.Provider), cfg.BaseURL, token)
	if err != nil {
		updateStatus(deps, w, "failed", prev.LastCommitSHA,
			sanitizeError(fmt.Sprintf("create provider: %v", err), token))
		return fmt.Errorf("create provider: %w", err)
	}

	latestSHA, err := provider.GetLatestCommitSHA(ctx, cfg.RepoOwner, cfg.RepoName, cfg.Branch)
	if err != nil {
		updateStatus(deps, w, "failed", prev.LastCommitSHA,
			sanitizeError(fmt.Sprintf("fetch commit SHA: %v", err), token))
		return fmt.Errorf("get latest commit: %w", err)
	}

	domain := primaryDomain(deps, w)

	prevContainerID := ""
	prevProxyRouteID := ""
	if prevContainer != nil {
		prevContainerID = prevContainer.ContainerID
		prevProxyRouteID = prevContainer.ProxyRouteID
	}
	// Short-circuit: SHA unchanged AND container is still running AND
	// (if there's a public face) the proxy route still exists. Manual
	// deploys skip this entirely.
	//
	// We deliberately do NOT gate this on prev.Status == "deployed". A
	// transient failure (e.g. a one-off proxy-check error) leaves the
	// persisted status as "failed"; if we required "deployed" here, every
	// subsequent cron/git poll with the same SHA would fall through to a
	// full clone + docker build despite a perfectly healthy running
	// container — a rebuild storm that burns CPU/disk until a new commit
	// lands. Instead we trust the live container/proxy state and heal the
	// stale status via healUnchanged.
	if !force && latestSHA == prev.LastCommitSHA && prevContainerID != "" {
		running, chkErr := deps.Docker.IsContainerRunning(ctx, prevContainerID)
		switch {
		case chkErr != nil:
			// We cannot confirm the container is alive, so fall through
			// to a redeploy — but log the real cause instead of reporting
			// it as "not running".
			slog.Warn("dockerfile: container check failed, forcing redeploy",
				"workload", w.Name, "error", chkErr)
		case !running:
			slog.Info("dockerfile: container not running, forcing redeploy", "workload", w.Name)
		case domain != "":
			proxyOK, perr := deps.Proxy.RouteExists(ctx, domain)
			switch {
			case perr != nil:
				slog.Warn("dockerfile: proxy check failed, forcing redeploy",
					"workload", w.Name, "error", perr)
			case !proxyOK:
				slog.Info("dockerfile: proxy route missing, forcing redeploy", "workload", w.Name)
			default:
				return healUnchanged(deps, w, prev, latestSHA)
			}
		default:
			return healUnchanged(deps, w, prev, latestSHA)
		}
	}

	updateStatus(deps, w, "syncing", prev.LastCommitSHA, "")
	publishEvent(deps, w, "syncing")

	// Clone the repo into a temp dir. We always download the entire
	// repo tree (folderPath = ""); a ContextPath subset is applied
	// at build time, not at download time, so a Dockerfile in
	// `./docker/Dockerfile` with `ContextPath=""` still works.
	cloneDir, err := os.MkdirTemp("", "tf-build-"+idShort(w)+"-*")
	if err != nil {
		updateStatus(deps, w, "failed", prev.LastCommitSHA,
			sanitizeError(fmt.Sprintf("create clone dir: %v", err), token))
		return fmt.Errorf("create clone dir: %w", err)
	}
	// The clone is only needed for the build; drop it on every exit path.
	defer os.RemoveAll(cloneDir)

	if err := provider.DownloadFolder(ctx, cfg.RepoOwner, cfg.RepoName, cfg.Branch, "", cloneDir); err != nil {
		updateStatus(deps, w, "failed", prev.LastCommitSHA,
			sanitizeError(fmt.Sprintf("download repo: %v", err), token))
		return fmt.Errorf("download repo: %w", err)
	}

	// Resolve the build context and verify the Dockerfile is actually
	// present before sending the build off to the daemon.
	contextDir, err := resolveContextDir(cloneDir, cfg.ContextPath)
	if err != nil {
		updateStatus(deps, w, "failed", latestSHA,
			sanitizeError(fmt.Sprintf("resolve context: %v", err), token))
		return fmt.Errorf("resolve context: %w", err)
	}
	if err := verifyDockerfileExists(contextDir, cfg.DockerfilePath); err != nil {
		updateStatus(deps, w, "failed", latestSHA,
			sanitizeError(err.Error(), token))
		return err
	}

	imageTag := imageTagFor(w)
	updateStatus(deps, w, "building", latestSHA, "")
	publishEvent(deps, w, "building")
	// Bridge per-line build output onto the event bus so /api/events
	// subscribers (the dashboard's live tail) can show progress while
	// the daemon chugs. The bus is non-blocking — slow subscribers drop
	// events rather than backpressure the build — so this is safe to
	// call from the hot scan loop.
	logFn := func(line string) {
		publishBuildLog(deps, w, line)
	}
	if err := deps.Docker.BuildImageAt(ctx, contextDir, cfg.DockerfilePath, imageTag, logFn); err != nil {
		updateStatus(deps, w, "failed", latestSHA,
			sanitizeError(fmt.Sprintf("docker build: %v", err), token))
		return fmt.Errorf("docker build: %w", err)
	}

	env := buildEnv(deps, w.ID)
	containerPort := strconv.Itoa(cfg.Port)

	settings, err := deps.Store.GetSettings()
	if err != nil {
		updateStatus(deps, w, "failed", latestSHA,
			sanitizeError(fmt.Sprintf("get settings: %v", err), token))
		return fmt.Errorf("get settings: %w", err)
	}

	networkName := settings.Network
	networkID, err := deps.Docker.EnsureNetwork(ctx, networkName)
	if err != nil {
		updateStatus(deps, w, "failed", latestSHA,
			sanitizeError(fmt.Sprintf("ensure network: %v", err), token))
		return fmt.Errorf("ensure network: %w", err)
	}

	containerName := containerNameFor(w)

	// Per-face proxy labels (Traefik consumes these; NPM ignores them).
	labels := map[string]string{}
	if domain != "" {
		if l := deps.Proxy.ContainerLabels(domain, cfg.Port); l != nil {
			for k, v := range l {
				labels[k] = v
			}
		}
	}

	cc := docker.ContainerConfig{
		Name:         containerName,
		Image:        imageTag,
		Env:          env,
		ExposedPorts: []string{containerPort + "/tcp"},
		NetworkName:  networkName,
		NetworkID:    networkID,
		Labels:       labels,
		WorkloadID:   w.ID,
		// Dockerfile workloads are tagged as "build" so the dashboard
		// and any filtered query can distinguish them from static sites
		// (which serve files) and image-source containers (which pull
		// pre-built images from a registry).
		WorkloadKind: string(store.WorkloadKindBuild),
		Role:         "",
	}

	containerID, err := deps.Docker.CreateContainer(ctx, cc)
	if err != nil {
		// Any create failure is treated as a likely name conflict:
		// best-effort cleanup of any prior container (by ID first; by
		// name as a fallback) and one retry.
		if prevContainerID != "" {
			deps.Docker.StopContainer(ctx, prevContainerID, 10)
			deps.Docker.RemoveContainer(ctx, prevContainerID, true)
		}
		removeContainerByName(ctx, deps, containerName)

		containerID, err = deps.Docker.CreateContainer(ctx, cc)
		if err != nil {
			updateStatus(deps, w, "failed", latestSHA,
				sanitizeError(fmt.Sprintf("create container: %v", err), token))
			return fmt.Errorf("create container: %w", err)
		}
	}

	if err := deps.Docker.StartContainer(ctx, containerID); err != nil {
		discardContainer(containerID)
		updateStatus(deps, w, "failed", latestSHA,
			sanitizeError(fmt.Sprintf("start container: %v", err), token))
		return fmt.Errorf("start container: %w", err)
	}

	// Brief health-check window — catch crash-on-boot. ctx-aware so a
	// cancelled deploy returns promptly. On failure surface the tail
	// of the container's logs as the error reason; that's almost
	// always what the operator needs to debug.
	select {
	case <-ctx.Done():
		// ctx is already dead here; removal must use the detached
		// context or the started container would be left running.
		discardContainer(containerID)
		updateStatus(deps, w, "failed", latestSHA, "deploy cancelled before health check")
		return ctx.Err()
	case <-time.After(healthCheckDelay):
	}
	running, runErr := deps.Docker.IsContainerRunning(ctx, containerID)
	if runErr != nil || !running {
		logMsg := "container exited immediately after start"
		if logs, logErr := deps.Docker.ContainerLogs(ctx, containerID, false, "40"); logErr == nil {
			// Read error is ignored on purpose: whatever was read is
			// still the best diagnostic available.
			buf, _ := io.ReadAll(logs)
			logs.Close()
			if len(buf) > 0 {
				// Pass `env` so any decrypted KEY=VALUE pair that the
				// container's startup output happens to echo is
				// redacted before it lands in the operator-visible
				// last_error field.
				logMsg = sanitizeErrorWithSecrets(string(buf), token, env)
			}
		}
		discardContainer(containerID)
		updateStatus(deps, w, "failed", latestSHA, logMsg)
		return fmt.Errorf("container not running: %s", logMsg)
	}

	// Resolve proxy target: in-network DNS by default, NPM-remote
	// override uses (settings.ServerIP, hostPort).
	forwardHost := containerName
	forwardPort := cfg.Port
	if settings.NpmRemote && settings.ProxyProvider == "npm" {
		if settings.ServerIP != "" {
			hostPort, hpErr := deps.Docker.InspectContainerPort(ctx, containerID, containerPort+"/tcp")
			if hpErr != nil {
				slog.Warn("dockerfile: could not get host port for remote NPM",
					"workload", w.Name, "error", hpErr)
			} else {
				forwardHost = settings.ServerIP
				forwardPort = int(hostPort)
			}
		}
	}

	// Configure proxy if a domain is set: drop any prior route, then
	// create one that points at the new container. A configure failure
	// is logged but does not fail the deploy.
	proxyRouteID := prevProxyRouteID
	if domain != "" {
		if prevProxyRouteID != "" {
			deps.Proxy.DeleteRoute(ctx, prevProxyRouteID)
		}
		routeID, rerr := deps.Proxy.ConfigureRoute(ctx, domain, forwardHost, forwardPort, proxy.RouteOptions{
			SSLCertificateID: settings.SSLCertificateID,
		})
		if rerr != nil {
			slog.Warn("dockerfile: failed to configure proxy",
				"workload", w.Name, "domain", domain,
				"target", fmt.Sprintf("%s:%d", forwardHost, forwardPort), "error", rerr)
		} else {
			proxyRouteID = routeID
			slog.Info("dockerfile: proxy configured",
				"workload", w.Name, "domain", domain,
				"target", fmt.Sprintf("%s:%d", forwardHost, forwardPort), "routeID", routeID)
		}
	}

	// Drop the previous container only after the new one passed the
	// health probe and the proxy step ran. A different ID than the
	// previous one tells us we created a fresh container.
	if prevContainerID != "" && prevContainerID != containerID {
		deps.Docker.StopContainer(ctx, prevContainerID, 10)
		deps.Docker.RemoveContainer(ctx, prevContainerID, true)
	}

	// Single write of new state + container metadata. On failure: tear
	// down the just-created container + proxy route so we don't leave
	// orphans behind for the next deploy to trip over.
	if err := saveState(deps, w, func(rs *runtimeState, c *store.Container) {
		rs.LastCommitSHA = latestSHA
		rs.LastSyncAt = store.Now()
		rs.LastError = ""
		rs.Status = "deployed"

		c.ContainerID = containerID
		c.ProxyRouteID = proxyRouteID
		c.Subdomain = domain
		c.State = "running"
		c.Port = cfg.Port
		c.ImageRef = imageTag
	}); err != nil {
		slog.Error("dockerfile: failed to persist deploy state — rolling back",
			"workload", w.Name, "error", err)
		// Roll back on the detached context so the teardown still runs
		// if ctx was cancelled in the meantime.
		rollbackCtx, cancelRollback := cleanupContext()
		defer cancelRollback()
		if proxyRouteID != "" {
			deps.Proxy.DeleteRoute(rollbackCtx, proxyRouteID)
		}
		deps.Docker.StopContainer(rollbackCtx, containerID, 10)
		deps.Docker.RemoveContainer(rollbackCtx, containerID, true)
		updateStatus(deps, w, "failed", latestSHA,
			sanitizeError(fmt.Sprintf("persist deploy state: %v", err), token))
		return fmt.Errorf("persist deploy state: %w", err)
	}

	publishEvent(deps, w, "deployed")
	dispatchBuildNotification(deps, w, domain, "deployed", "")

	slog.Info("dockerfile deployed",
		"workload", w.Name,
		"sha", shortSHA(latestSHA),
		"image", imageTag)
	return nil
}

// updateStatus persists a status transition for the workload and then
// fires the side effects tied to terminal states.
//
// It stores status and errMsg on the runtime state, overwrites the
// recorded commit only when commitSHA is non-empty, and maps the
// terminal statuses onto the container row's state. The in-progress
// statuses ("syncing", "building") leave the container row's state
// untouched. A persistence failure is logged, not returned; the side
// effects below still run.
//
// Side effects: "failed" writes an event-log entry ("failed: <errMsg>")
// and dispatches a notification; "deployed" dispatches a notification
// only.
func updateStatus(deps plugin.Deps, w plugin.Workload, status, commitSHA, errMsg string) {
	// Container-row state implied by each status. Statuses missing from
	// this table do not touch the container row's state.
	containerStateFor := map[string]string{
		"deployed": "running",
		"stopped":  "stopped",
		"failed":   "failed",
	}

	apply := func(rs *runtimeState, c *store.Container) {
		rs.Status, rs.LastError = status, errMsg
		if commitSHA != "" {
			rs.LastCommitSHA = commitSHA
		}
		if state, mapped := containerStateFor[status]; mapped {
			c.State = state
		}
	}
	if saveErr := saveState(deps, w, apply); saveErr != nil {
		slog.Error("dockerfile: failed to update status",
			"id", w.ID, "status", status, "error", saveErr)
	}

	switch status {
	case "failed":
		publishEvent(deps, w, "failed: "+errMsg)
		dispatchBuildNotification(deps, w, primaryDomain(deps, w), status, errMsg)
	case "deployed":
		dispatchBuildNotification(deps, w, primaryDomain(deps, w), status, errMsg)
	}
}

// dispatchBuildNotification builds a notify.Event for the workload and
// hands it to plugin.DispatchNotificationForWorkload, which owns the
// routing to the configured destinations.
//
// The event type is "build_failure" when status is "failed" and
// "build_success" for every other status. The URL is "https://<domain>"
// when domain is non-empty, otherwise empty. errMsg is passed through
// unchanged as the event's Error.
func dispatchBuildNotification(deps plugin.Deps, w plugin.Workload, domain, status, errMsg string) {
	kind, link := "build_success", ""
	if status == "failed" {
		kind = "build_failure"
	}
	if domain != "" {
		link = "https://" + domain
	}

	evt := notify.Event{
		Type:    kind,
		Project: w.Name,
		URL:     link,
		Error:   errMsg,
	}
	plugin.DispatchNotificationForWorkload(deps, w, evt)
}

// publishEvent persists an event_log row for the workload and, if that
// succeeds, publishes the same entry on the event bus.
//
// Severity is "error" when status starts with "failed", otherwise
// "info". The message has the shape `Build "<name>": <status>`. The
// metadata is a JSON object carrying workload_id, workload_name and
// status; if marshalling fails it degrades to "{}". When the store
// insert fails the error is logged and nothing is published, because
// the bus payload needs the stored row's ID and timestamp.
func publishEvent(deps plugin.Deps, w plugin.Workload, status string) {
	level := "info"
	if strings.HasPrefix(status, "failed") {
		level = "error"
	}
	text := fmt.Sprintf("Build %q: %s", w.Name, status)

	meta := "{}"
	raw, marshalErr := json.Marshal(map[string]string{
		"workload_id":   w.ID,
		"workload_name": w.Name,
		"status":        status,
	})
	if marshalErr != nil {
		slog.Error("dockerfile: marshal event metadata", "error", marshalErr)
	} else {
		meta = string(raw)
	}

	row, insertErr := deps.Store.InsertEvent(store.EventLog{
		Source:   "dockerfile",
		Severity: level,
		Message:  text,
		Metadata: meta,
	})
	if insertErr != nil {
		slog.Error("dockerfile: failed to persist event log", "error", insertErr)
		return
	}

	payload := events.EventLogPayload{
		ID:        row.ID,
		Source:    "dockerfile",
		Severity:  level,
		Message:   text,
		Metadata:  meta,
		CreatedAt: row.CreatedAt,
	}
	deps.Events.Publish(events.Event{
		Type:    events.EventLog,
		Payload: payload,
	})
}

// publishBuildLog publishes one EventBuildLog on the bus for a single
// line of build output. Trailing carriage returns and newlines are
// stripped first; a line that is empty after stripping is dropped so
// subscribers never receive blank rows. The payload carries the
// workload ID and always labels the stream "stdout". Nothing is
// persisted and nothing is returned — delivery is best-effort.
func publishBuildLog(deps plugin.Deps, w plugin.Workload, line string) {
	text := strings.TrimRight(line, "\r\n")
	if len(text) == 0 {
		return
	}

	payload := events.BuildLogPayload{
		WorkloadID: w.ID,
		Line:       text,
		Stream:     "stdout",
	}
	deps.Events.Publish(events.Event{
		Type:    events.EventBuildLog,
		Payload: payload,
	})
}

// healUnchanged is the outcome of the no-rebuild short-circuit: the
// caller has established that there is nothing to deploy. It logs the
// no-op and, when the stored status is anything other than "deployed",
// rewrites it to "deployed", clears the last error and marks the
// container row "running" so the dashboard reflects reality.
//
// The repair goes through saveState directly rather than updateStatus,
// so it does not trigger a build-success notification on every poll.
// A failed repair is logged only; the function always returns nil.
func healUnchanged(deps plugin.Deps, w plugin.Workload, prev runtimeState, latestSHA string) error {
	slog.Info("dockerfile: no changes", "workload", w.Name, "sha", shortSHA(latestSHA))

	if prev.Status != "deployed" {
		repair := func(rs *runtimeState, c *store.Container) {
			rs.Status = "deployed"
			rs.LastError = ""
			c.State = "running"
		}
		if healErr := saveState(deps, w, repair); healErr != nil {
			slog.Warn("dockerfile: failed to heal stale status to deployed",
				"workload", w.Name, "error", healErr)
		}
	}
	return nil
}

// removeContainerByName lists the containers Docker reports and
// best-effort stops (10 second grace) then force-removes every one
// whose name equals name. It deliberately keeps scanning after a match
// instead of stopping at the first, so that more than one leftover
// with the same name is cleaned up. A listing failure makes this a
// silent no-op, and stop/remove results are ignored — this is recovery
// code for a failed CreateContainer, not a guaranteed cleanup.
func removeContainerByName(ctx context.Context, deps plugin.Deps, name string) {
	listed, listErr := deps.Docker.ListContainers(ctx, nil)
	if listErr != nil {
		return
	}
	for _, candidate := range listed {
		if candidate.Name != name {
			continue
		}
		deps.Docker.StopContainer(ctx, candidate.ID, 10)
		deps.Docker.RemoveContainer(ctx, candidate.ID, true)
	}
}

// primaryDomain derives the workload's public hostname from the first
// entry in w.PublicFaces that has a Subdomain or a Domain set:
//
//   - both set:        "<subdomain>.<domain>"
//   - domain only:     "<domain>"
//   - subdomain only:  "<subdomain>.<settings.Domain>", or the bare
//     subdomain when settings cannot be loaded or carry no domain
//
// Faces with neither field set are skipped. Returns "" when no face
// qualifies.
func primaryDomain(deps plugin.Deps, w plugin.Workload) string {
	for _, face := range w.PublicFaces {
		sub, dom := face.Subdomain, face.Domain
		if sub == "" && dom == "" {
			continue
		}

		if dom == "" {
			// Bare subdomain: fall back to the globally configured domain.
			settings, err := deps.Store.GetSettings()
			if err != nil || settings.Domain == "" {
				return sub
			}
			return sub + "." + settings.Domain
		}
		if sub == "" {
			return dom
		}
		return sub + "." + dom
	}
	return ""
}

// shortSHA abbreviates a commit SHA to its first 8 bytes for log lines.
// Values of 8 bytes or fewer (including the empty string) are returned
// unchanged.
func shortSHA(sha string) string {
	const abbrevLen = 8
	if len(sha) <= abbrevLen {
		return sha
	}
	return sha[:abbrevLen]
}