Files
tiny-forge/internal/workload/plugin/source/dockerfile/deploy.go
T
alexei.dolgolyov fa6d5bd3ba feat(secrets): scoped shared secrets — backend + API (Phase 1)
Secrets defined once and applied to many workloads by scope (global or
per-app), encrypted at rest and resolved into container env as a
low-precedence default layer: global-shared < app-shared < image cfg.Env
< workload_env. A workload with no applicable shared secrets is
byte-identical to the prior workload_env-only behavior.

- store: shared_secrets table + CRUD + ListApplicableSharedSecrets
  (enabled global + app, global-first), UNIQUE(scope,app_id,name).
- plugin.ResolveSharedSecrets + integration into BuildWorkloadEnv
  (static/dockerfile) and image buildEnv; best-effort — a shared-secret
  store/decrypt error never fails a deploy, and values are never logged.
- REST CRUD at /api/shared-secrets (reads authed, mutations AdminOnly);
  values encrypted at the boundary via crypto.Encrypt and never returned
  (only a has_value flag), mirroring workload_env. UNIQUE collisions 409.

Compose is out of scope (YAML-defined env). Frontend rule UI is Phase 2.
Reviewed: go + security APPROVE (0 CRITICAL/HIGH); two MEDIUMs fixed
(translateSQLError -> 409, no driver-message leak). Deferred defense-in-
depth: json:"-" on the model value + a description length cap.
2026-05-29 15:26:09 +03:00

571 lines
21 KiB
Go

package dockerfile
import (
"context"
"fmt"
"io"
"log/slog"
"os"
"strconv"
"strings"
"time"
"github.com/alexei/tinyforge/internal/crypto"
"github.com/alexei/tinyforge/internal/docker"
"github.com/alexei/tinyforge/internal/events"
"github.com/alexei/tinyforge/internal/notify"
"github.com/alexei/tinyforge/internal/proxy"
"github.com/alexei/tinyforge/internal/staticsite"
"github.com/alexei/tinyforge/internal/store"
"github.com/alexei/tinyforge/internal/workload/plugin"
)
// healthCheckDelay is the grace window deploy waits after StartContainer
// before it probes IsContainerRunning. The original note says it mirrors
// the static plugin's window (that plugin is not visible from this file —
// confirm before changing one without the other). The value is a
// trade-off: short enough not to slow happy-path deploys, long enough to
// catch crash-on-boot failures (missing env, bad CMD, port conflict).
const healthCheckDelay = 3 * time.Second
// deploy runs one end-to-end sync of a dockerfile workload:
//
//  1. fetch the latest commit SHA from the configured git provider
//  2. skip if SHA + container + proxy are all still healthy
//  3. clone the repo into a temp dir
//  4. resolve the build context + Dockerfile location
//  5. `docker build -t <tag> -f <dockerfile> <context>`
//  6. recreate the container with the new image
//  7. health-probe the container, surface logs on failure
//  8. reconfigure the proxy route
//  9. tear down the previous container (different ID) once we're sure
//     the new one is healthy and proxied
//
// Each step writes its own status update so the dashboard's runtime-
// state panel can show a useful intermediate state when the deploy
// stalls on the slow step (almost always the build).
//
// Rollback of the container (and proxy route) created by this call runs
// on a context detached from ctx's cancellation and bounded by a short
// timeout. A cancelled ctx is exactly when rollback is needed most, and
// issuing the cleanup calls on the cancelled ctx itself would leave the
// just-started container behind.
//
// Parameters: ctx bounds the whole deploy; deps carries the store, docker,
// proxy and event dependencies; w is the workload being deployed; intent
// carries the trigger reason that decides whether the unchanged-SHA
// short-circuit may apply.
//
// Returns nil on a successful deploy or when nothing needed to change, and
// a wrapped error describing the failed step otherwise. retErr is a named
// result so the deferred commit-status report can inspect the outcome.
func deploy(ctx context.Context, deps plugin.Deps, w plugin.Workload, intent plugin.DeploymentIntent) (retErr error) {
	// cleanupTimeout bounds one rollback sequence (stop + remove, plus an
	// optional proxy route delete) that runs on a detached context.
	const cleanupTimeout = 30 * time.Second

	// detach derives a context that keeps ctx's values but ignores its
	// cancellation, so rollback still reaches the daemon/proxy after the
	// deploy itself was cancelled. Callers must invoke the returned cancel.
	detach := func() (context.Context, context.CancelFunc) {
		return context.WithTimeout(context.WithoutCancel(ctx), cleanupTimeout)
	}

	cfg, err := plugin.SourceConfigOf[Config](w)
	if err != nil {
		return fmt.Errorf("dockerfile source: decode config: %w", err)
	}

	prev, prevContainer, err := loadState(deps, w)
	if err != nil {
		return err
	}

	// Force a full rebuild on manual / promote / first-time deploys
	// (no Reason at all also implies manual). Schedule / git triggers
	// honour the unchanged-SHA short-circuit so cron polling does not
	// rebuild minute-by-minute when nothing changed.
	force := intent.Reason == "" || intent.Reason == "manual" || intent.Reason == "promote"

	// Decrypt the access token if present. Token never escapes this
	// frame: any error message routes through sanitizeError(_, token)
	// which redacts the literal substring. A decrypt failure is logged
	// and the deploy continues with an empty token.
	token := ""
	if cfg.AccessToken != "" {
		decrypted, derr := crypto.Decrypt(deps.EncKey, cfg.AccessToken)
		if derr != nil {
			slog.Warn("dockerfile source: failed to decrypt access token",
				"workload", w.Name, "error", derr)
		} else {
			token = decrypted
		}
	}

	provider, err := staticsite.NewGitProvider(staticsite.ProviderType(cfg.Provider), cfg.BaseURL, token)
	if err != nil {
		updateStatus(deps, w, "failed", prev.LastCommitSHA,
			sanitizeError(fmt.Sprintf("create provider: %v", err), token))
		return fmt.Errorf("create provider: %w", err)
	}

	latestSHA, err := provider.GetLatestCommitSHA(ctx, cfg.RepoOwner, cfg.RepoName, cfg.Branch)
	if err != nil {
		updateStatus(deps, w, "failed", prev.LastCommitSHA,
			sanitizeError(fmt.Sprintf("fetch commit SHA: %v", err), token))
		return fmt.Errorf("get latest commit: %w", err)
	}

	domain := primaryDomain(deps, w)

	// Commit-status reporter (best-effort; gated on cfg.ReportCommitStatus).
	// The deferred terminal report fires Success/Failure based on the
	// deploy's outcome, but ONLY once an actual build/deploy began
	// (deployStarted). The unchanged-SHA short-circuit below returns via
	// healUnchanged before that flips, so no status is reported when
	// nothing was built. retErr is the named return the defer inspects.
	reporter := staticsite.NewCommitStatusReporter(provider, cfg.RepoOwner, cfg.RepoName, latestSHA, statusTargetURL(domain), cfg.ReportCommitStatus)
	deployStarted := false
	defer func() {
		if !deployStarted {
			return
		}
		if retErr != nil {
			reporter.Report(ctx, w.Name, w.ID, staticsite.CommitStatusFailure, "Tinyforge: build failed")
		} else {
			reporter.Report(ctx, w.Name, w.ID, staticsite.CommitStatusSuccess, "Tinyforge: deployed")
		}
	}()

	prevContainerID := ""
	prevProxyRouteID := ""
	if prevContainer != nil {
		prevContainerID = prevContainer.ContainerID
		prevProxyRouteID = prevContainer.ProxyRouteID
	}

	// Short-circuit: SHA unchanged AND container is still running AND
	// (if there's a public face) the proxy route still exists. Manual
	// deploys skip this entirely.
	//
	// We deliberately do NOT gate this on prev.Status == "deployed". A
	// transient failure (e.g. a one-off proxy-check error) leaves the
	// persisted status as "failed"; if we required "deployed" here, every
	// subsequent cron/git poll with the same SHA would fall through to a
	// full clone + docker build despite a perfectly healthy running
	// container — a rebuild storm that burns CPU/disk until a new commit
	// lands. Instead we trust the live container/proxy state and heal the
	// stale status via healUnchanged.
	if !force && latestSHA == prev.LastCommitSHA && prevContainerID != "" {
		// A probe error is treated the same as "not running": the zero
		// value of running forces a redeploy.
		running, _ := deps.Docker.IsContainerRunning(ctx, prevContainerID)
		switch {
		case !running:
			slog.Info("dockerfile: container not running, forcing redeploy", "workload", w.Name)
		case domain != "":
			proxyOK, perr := deps.Proxy.RouteExists(ctx, domain)
			switch {
			case perr != nil:
				slog.Warn("dockerfile: proxy check failed, forcing redeploy",
					"workload", w.Name, "error", perr)
			case !proxyOK:
				slog.Info("dockerfile: proxy route missing, forcing redeploy", "workload", w.Name)
			default:
				return healUnchanged(deps, w, prev, latestSHA)
			}
		default:
			return healUnchanged(deps, w, prev, latestSHA)
		}
	}

	// From here on a deploy is genuinely underway, so the deferred terminal
	// status report should fire. Push a "pending" commit status (best-
	// effort) and arm the deferred Success/Failure report.
	updateStatus(deps, w, "syncing", prev.LastCommitSHA, "")
	publishEvent(deps, w, "syncing")
	deployStarted = true
	reporter.Report(ctx, w.Name, w.ID, staticsite.CommitStatusPending, "Tinyforge: deploying")

	// Clone the repo into a temp dir. We always download the entire
	// repo tree (folderPath = ""); a ContextPath subset is applied
	// at build time, not at download time, so a Dockerfile in
	// `./docker/Dockerfile` with `ContextPath=""` still works.
	cloneDir, err := os.MkdirTemp("", "tf-build-"+plugin.IDShort(w)+"-*")
	if err != nil {
		updateStatus(deps, w, "failed", prev.LastCommitSHA,
			sanitizeError(fmt.Sprintf("create clone dir: %v", err), token))
		return fmt.Errorf("create clone dir: %w", err)
	}
	defer os.RemoveAll(cloneDir)

	if err := provider.DownloadFolder(ctx, cfg.RepoOwner, cfg.RepoName, cfg.Branch, "", cloneDir); err != nil {
		updateStatus(deps, w, "failed", prev.LastCommitSHA,
			sanitizeError(fmt.Sprintf("download repo: %v", err), token))
		return fmt.Errorf("download repo: %w", err)
	}

	// Resolve the build context (with symlink-aware escape check) and
	// verify the Dockerfile is actually present before sending the
	// build off to the daemon.
	contextDir, err := resolveContextDir(cloneDir, cfg.ContextPath)
	if err != nil {
		updateStatus(deps, w, "failed", latestSHA,
			sanitizeError(fmt.Sprintf("resolve context: %v", err), token))
		return fmt.Errorf("resolve context: %w", err)
	}
	if err := verifyDockerfileExists(contextDir, cfg.DockerfilePath); err != nil {
		updateStatus(deps, w, "failed", latestSHA,
			sanitizeError(err.Error(), token))
		return err
	}

	imageTag := imageTagFor(w)
	updateStatus(deps, w, "building", latestSHA, "")
	publishEvent(deps, w, "building")

	// Bridge per-line build output onto the event bus so /api/events
	// subscribers (the dashboard's live tail) can show progress while
	// the daemon chugs. The bus is non-blocking — slow subscribers drop
	// events rather than backpressure the build — so this is safe to
	// call from the hot scan loop.
	logFn := func(line string) {
		publishBuildLog(deps, w, line)
	}
	if err := deps.Docker.BuildImageAt(ctx, contextDir, cfg.DockerfilePath, imageTag, logFn); err != nil {
		updateStatus(deps, w, "failed", latestSHA,
			sanitizeError(fmt.Sprintf("docker build: %v", err), token))
		return fmt.Errorf("docker build: %w", err)
	}

	env := plugin.BuildWorkloadEnv(deps, w.ID, w.GroupID, "dockerfile source")
	containerPort := strconv.Itoa(cfg.Port)

	settings, err := deps.Store.GetSettings()
	if err != nil {
		updateStatus(deps, w, "failed", latestSHA,
			sanitizeError(fmt.Sprintf("get settings: %v", err), token))
		return fmt.Errorf("get settings: %w", err)
	}
	networkName := settings.Network
	networkID, err := deps.Docker.EnsureNetwork(ctx, networkName)
	if err != nil {
		updateStatus(deps, w, "failed", latestSHA,
			sanitizeError(fmt.Sprintf("ensure network: %v", err), token))
		return fmt.Errorf("ensure network: %w", err)
	}

	containerName := containerNameFor(w)

	// Per-face proxy labels (Traefik consumes these; NPM ignores them).
	labels := map[string]string{}
	if domain != "" {
		if l := deps.Proxy.ContainerLabels(domain, cfg.Port); l != nil {
			for k, v := range l {
				labels[k] = v
			}
		}
	}

	cc := docker.ContainerConfig{
		Name:         containerName,
		Image:        imageTag,
		Env:          env,
		ExposedPorts: []string{containerPort + "/tcp"},
		NetworkName:  networkName,
		NetworkID:    networkID,
		Labels:       labels,
		WorkloadID:   w.ID,
		// Dockerfile workloads are tagged as "build" so the dashboard
		// and any filtered query can distinguish them from static sites
		// (which serve files) and image-source containers (which pull
		// pre-built images from a registry).
		WorkloadKind: string(store.WorkloadKindBuild),
		Role:         "",
	}

	containerID, err := deps.Docker.CreateContainer(ctx, cc)
	if err != nil {
		// Name conflict — best-effort cleanup of any prior container
		// (by ID first; by name as a fallback) and one retry.
		if prevContainerID != "" {
			deps.Docker.StopContainer(ctx, prevContainerID, 10)
			deps.Docker.RemoveContainer(ctx, prevContainerID, true)
		}
		removeContainerByName(ctx, deps, containerName)
		containerID, err = deps.Docker.CreateContainer(ctx, cc)
		if err != nil {
			updateStatus(deps, w, "failed", latestSHA,
				sanitizeError(fmt.Sprintf("create container: %v", err), token))
			return fmt.Errorf("create container: %w", err)
		}
	}

	if err := deps.Docker.StartContainer(ctx, containerID); err != nil {
		// The start may have failed because ctx was cancelled; remove the
		// created-but-unstarted container on a detached context so it is
		// not left behind.
		cleanupCtx, cancel := detach()
		deps.Docker.RemoveContainer(cleanupCtx, containerID, true)
		cancel()
		updateStatus(deps, w, "failed", latestSHA,
			sanitizeError(fmt.Sprintf("start container: %v", err), token))
		return fmt.Errorf("start container: %w", err)
	}

	// Brief health-check window — catch crash-on-boot. ctx-aware so a
	// cancelled deploy returns promptly. On failure surface the tail
	// of the container's logs as the error reason; that's almost
	// always what the operator needs to debug.
	select {
	case <-ctx.Done():
		// ctx is already cancelled here, so the removal must not be
		// issued on it — use a detached, time-bounded context instead.
		cleanupCtx, cancel := detach()
		deps.Docker.RemoveContainer(cleanupCtx, containerID, true)
		cancel()
		updateStatus(deps, w, "failed", latestSHA, "deploy cancelled before health check")
		return ctx.Err()
	case <-time.After(healthCheckDelay):
	}

	running, runErr := deps.Docker.IsContainerRunning(ctx, containerID)
	if runErr != nil || !running {
		logMsg := "container exited immediately after start"
		if logs, logErr := deps.Docker.ContainerLogs(ctx, containerID, false, "40"); logErr == nil {
			buf, _ := io.ReadAll(logs)
			logs.Close()
			if len(buf) > 0 {
				// Pass `env` so any decrypted KEY=VALUE pair that the
				// container's startup output happens to echo (think
				// `RUN echo $DB_PASSWORD` in a debug Dockerfile) is
				// redacted before it lands in the operator-visible
				// last_error field.
				logMsg = sanitizeErrorWithSecrets(string(buf), token, env)
			}
		}
		// runErr may itself be a cancellation error, so remove the failed
		// container on a detached context.
		cleanupCtx, cancel := detach()
		deps.Docker.RemoveContainer(cleanupCtx, containerID, true)
		cancel()
		updateStatus(deps, w, "failed", latestSHA, logMsg)
		return fmt.Errorf("container not running: %s", logMsg)
	}

	// Resolve proxy target: in-network DNS by default, NPM-remote
	// override uses (settings.ServerIP, hostPort).
	forwardHost := containerName
	forwardPort := cfg.Port
	if settings.NpmRemote && settings.ProxyProvider == "npm" {
		if settings.ServerIP != "" {
			hostPort, hpErr := deps.Docker.InspectContainerPort(ctx, containerID, containerPort+"/tcp")
			if hpErr != nil {
				slog.Warn("dockerfile: could not get host port for remote NPM",
					"workload", w.Name, "error", hpErr)
			} else {
				forwardHost = settings.ServerIP
				forwardPort = int(hostPort)
			}
		}
	}

	// Configure proxy if a domain is set. Any prior route is deleted
	// first and a fresh one is configured for the new container. A
	// configure failure is logged but does not fail the deploy; in that
	// case proxyRouteID keeps the previous route's ID.
	proxyRouteID := prevProxyRouteID
	if domain != "" {
		if prevProxyRouteID != "" {
			deps.Proxy.DeleteRoute(ctx, prevProxyRouteID)
		}
		routeID, rerr := deps.Proxy.ConfigureRoute(ctx, domain, forwardHost, forwardPort, proxy.RouteOptions{
			SSLCertificateID: settings.SSLCertificateID,
		})
		if rerr != nil {
			slog.Warn("dockerfile: failed to configure proxy",
				"workload", w.Name, "domain", domain,
				"target", fmt.Sprintf("%s:%d", forwardHost, forwardPort), "error", rerr)
		} else {
			proxyRouteID = routeID
			slog.Info("dockerfile: proxy configured",
				"workload", w.Name, "domain", domain,
				"target", fmt.Sprintf("%s:%d", forwardHost, forwardPort), "routeID", routeID)
		}
	}

	// Drop the previous container only after the new one is healthy
	// + routed. Different-ID-than-previous tells us we created a
	// fresh one (vs returning the same ID via UpsertContainer reuse).
	if prevContainerID != "" && prevContainerID != containerID {
		deps.Docker.StopContainer(ctx, prevContainerID, 10)
		deps.Docker.RemoveContainer(ctx, prevContainerID, true)
	}

	// Single write of new state + container metadata through saveState.
	// On failure: tear down the just-created container + proxy route
	// so we don't leave orphans behind for the next deploy to trip
	// over.
	if err := saveState(deps, w, func(rs *runtimeState, c *store.Container) {
		rs.LastCommitSHA = latestSHA
		rs.LastSyncAt = store.Now()
		rs.LastError = ""
		rs.Status = "deployed"
		c.ContainerID = containerID
		c.ProxyRouteID = proxyRouteID
		c.Subdomain = domain
		c.State = "running"
		c.Port = cfg.Port
		c.ImageRef = imageTag
	}); err != nil {
		slog.Error("dockerfile: failed to persist deploy state — rolling back",
			"workload", w.Name, "error", err)
		// Roll back on a detached context so a cancellation that raced
		// with the persist step cannot strand the container or route.
		cleanupCtx, cancel := detach()
		if proxyRouteID != "" {
			deps.Proxy.DeleteRoute(cleanupCtx, proxyRouteID)
		}
		deps.Docker.StopContainer(cleanupCtx, containerID, 10)
		deps.Docker.RemoveContainer(cleanupCtx, containerID, true)
		cancel()
		updateStatus(deps, w, "failed", latestSHA,
			sanitizeError(fmt.Sprintf("persist deploy state: %v", err), token))
		return fmt.Errorf("persist deploy state: %w", err)
	}

	publishEvent(deps, w, "deployed")
	dispatchBuildNotification(deps, w, domain, "deployed", "")
	slog.Info("dockerfile deployed",
		"workload", w.Name,
		"sha", shortSHA(latestSHA),
		"image", imageTag)
	return nil
}
// statusTargetURL builds the https link a commit status points back to.
// It prefixes a non-empty domain with "https://" and passes an empty
// domain (a workload without a public face) through as "".
func statusTargetURL(domain string) string {
	if domain != "" {
		return "https://" + domain
	}
	return ""
}
// updateStatus persists a status transition for the workload and then
// fires the side effects tied to terminal states.
//
// It stores status and errMsg on the runtime state, overwrites the
// recorded commit only when commitSHA is non-empty, and mirrors the
// "deployed" / "stopped" / "failed" statuses onto the container row
// ("running" / "stopped" / "failed"). A failed write is logged and the
// side effects still run: "failed" publishes a deploy event carrying the
// error text, and both "failed" and "deployed" dispatch a notification.
//
// The deploy success path calls saveState directly with the full
// container metadata; this helper covers the failure / intermediate
// transitions where only state moves.
func updateStatus(deps plugin.Deps, w plugin.Workload, status, commitSHA, errMsg string) {
	apply := func(rs *runtimeState, c *store.Container) {
		rs.Status, rs.LastError = status, errMsg
		if commitSHA != "" {
			rs.LastCommitSHA = commitSHA
		}
		// In-progress statuses ("syncing", "building") and anything
		// unrecognised fall through every branch, leaving the container
		// row's state as the previous deploy left it.
		if status == "deployed" {
			c.State = "running"
		} else if status == "stopped" {
			c.State = "stopped"
		} else if status == "failed" {
			c.State = "failed"
		}
	}
	if saveErr := saveState(deps, w, apply); saveErr != nil {
		slog.Error("dockerfile: failed to update status",
			"id", w.ID, "status", status, "error", saveErr)
	}

	failed := status == "failed"
	if failed {
		publishEvent(deps, w, "failed: "+errMsg)
	}
	if failed || status == "deployed" {
		dispatchBuildNotification(deps, w, primaryDomain(deps, w), status, errMsg)
	}
}
// dispatchBuildNotification turns a build outcome into a notify.Event and
// hands it to plugin.DispatchNotificationForWorkload, which owns the
// routing to the workload's notification destinations.
//
// The event type is "build_failure" when status is "failed" and
// "build_success" for every other status. URL is the https form of
// domain, or empty when the workload has no domain. errMsg is passed
// through unchanged as the event's Error.
func dispatchBuildNotification(deps plugin.Deps, w plugin.Workload, domain, status, errMsg string) {
	ev := notify.Event{
		Type:    "build_success",
		Project: w.Name,
		Error:   errMsg,
	}
	if status == "failed" {
		ev.Type = "build_failure"
	}
	if domain != "" {
		ev.URL = "https://" + domain
	}
	plugin.DispatchNotificationForWorkload(deps, w, ev)
}
// publishEvent emits a workload-scoped deploy event for this source kind.
// It is a thin wrapper that tags the event as "dockerfile" and delegates
// to plugin.EmitDeployEvent, so the event shape is decided in one place
// rather than per call site.
func publishEvent(deps plugin.Deps, w plugin.Workload, status string) {
	// sourceKind labels every event emitted by this plugin.
	const sourceKind = "dockerfile"
	plugin.EmitDeployEvent(deps, w, sourceKind, status)
}
// publishBuildLog forwards one line of build output to the event bus as
// an EventBuildLog tagged with the workload's ID and the "stdout" stream.
// Trailing carriage returns / newlines are stripped first, and a line
// that is empty after stripping is dropped so the UI never receives blank
// rows.
func publishBuildLog(deps plugin.Deps, w plugin.Workload, line string) {
	text := strings.TrimRight(line, "\r\n")
	if text != "" {
		payload := events.BuildLogPayload{
			WorkloadID: w.ID,
			Line:       text,
			Stream:     "stdout",
		}
		deps.Events.Publish(events.Event{
			Type:    events.EventBuildLog,
			Payload: payload,
		})
	}
}
// healUnchanged is the result of the no-rebuild short-circuit: the commit
// SHA is unchanged and the caller has already confirmed the live
// container (and proxy route, when there is one) is healthy.
//
// It logs the no-op and, when the persisted status is anything other than
// "deployed", rewrites it to "deployed" with a cleared error and a
// "running" container state. The repair goes through saveState rather
// than updateStatus so that reconciling a stale status does not dispatch
// a build notification on every poll. A failed repair is only logged; the
// function always returns nil.
func healUnchanged(deps plugin.Deps, w plugin.Workload, prev runtimeState, latestSHA string) error {
	slog.Info("dockerfile: no changes", "workload", w.Name, "sha", shortSHA(latestSHA))

	if prev.Status != "deployed" {
		repair := func(rs *runtimeState, c *store.Container) {
			rs.Status, rs.LastError = "deployed", ""
			c.State = "running"
		}
		if healErr := saveState(deps, w, repair); healErr != nil {
			slog.Warn("dockerfile: failed to heal stale status to deployed",
				"workload", w.Name, "error", healErr)
		}
	}
	return nil
}
// removeContainerByName lists the daemon's containers and stops + force-
// removes every one whose name equals name, so a name conflict in
// CreateContainer can be recovered from. The scan deliberately does not
// stop at the first match. Everything here is best-effort: a failed
// listing makes the function return without doing anything, and the
// results of the stop/remove calls are not inspected.
func removeContainerByName(ctx context.Context, deps plugin.Deps, name string) {
	all, listErr := deps.Docker.ListContainers(ctx, nil)
	if listErr != nil {
		return
	}
	for _, candidate := range all {
		if candidate.Name != name {
			continue
		}
		deps.Docker.StopContainer(ctx, candidate.ID, 10)
		deps.Docker.RemoveContainer(ctx, candidate.ID, true)
	}
}
// primaryDomain derives the workload's FQDN from the first public face
// that carries a subdomain or a domain; faces with neither are skipped.
//
//   - subdomain + domain -> "subdomain.domain"
//   - domain only        -> "domain"
//   - subdomain only     -> "subdomain.<settings.Domain>", falling back to
//     the bare subdomain when settings cannot be loaded or have no domain
//
// It returns "" when no face qualifies.
func primaryDomain(deps plugin.Deps, w plugin.Workload) string {
	for _, face := range w.PublicFaces {
		sub, dom := face.Subdomain, face.Domain

		if dom != "" {
			if sub == "" {
				return dom
			}
			return sub + "." + dom
		}
		if sub == "" {
			continue
		}

		// Bare subdomain: complete it with the globally configured domain.
		settings, err := deps.Store.GetSettings()
		if err == nil && settings.Domain != "" {
			return sub + "." + settings.Domain
		}
		return sub
	}
	return ""
}
// shortSHA abbreviates a commit SHA to its first 8 bytes for log lines;
// anything of length 8 or less is returned unchanged. The prefix keeps
// logs readable while still answering "is this the same commit?".
func shortSHA(sha string) string {
	const keep = 8
	if len(sha) <= keep {
		return sha
	}
	return sha[:keep]
}