feat(apps): stepped creation wizard, branch previews, and app-creation fixes

This session (frontend focus):

- Rebuild /apps/new as a 4-step wizard (Basics → Configure → Trigger → Review): WizardRail, SourceKindPicker card grid, AppManifest review, per-step validation, ConfirmDialog-based unsaved-changes guard.
- Extract lib/workload/sourceForms.ts (single source of truth for source_config) + {Image,Compose,Static,Dockerfile}SourceForm + StaticDiscoveryWizard; fold the /apps/[id] edit form onto the same components (removes the duplication). Add vitest + sourceForms unit tests.
- Branch preview environments UI: /chain is_preview/preview_branch + a Preview environments panel on /apps/[id] (per-branch URLs, ConfirmDialog teardown, armed state); RegistryImagePicker on the registry trigger and the image source.
- Fixes: image-inspect 404 -> admin-gated POST /api/discovery/image/inspect; conflict-panel blur flicker; friendly localized discovery errors; CPU/Memory label hints; dashboard + /apps "Total workloads" count only source_kind workloads (drop stale trigger_kind gate); NPM cert/access-list name cache; EntityPicker empty-list guard.
- Update CLAUDE.md frontend conventions + add a Build & Test section.

Also captures pre-existing in-progress platform work (not from this session): workload notifications, Prometheus metrics export, store lockfile, health probes, backup hardening, and related store/webhook/scheduler changes.
2026-05-29 02:09:54 +03:00
parent 956943edbb
commit 410a131cec
112 changed files with 13285 additions and 2765 deletions
@@ -32,6 +32,23 @@ type Config struct {

 type source struct{}

+// composeRunner is the slice of stack.Compose this plugin actually
+// drives. Defined locally per the "interfaces where they are used"
+// idiom so the plugin can be unit-tested without a real docker compose
+// binary. `*stack.Compose` satisfies it implicitly.
+type composeRunner interface {
+	// Up is invoked by Deploy; its string result is quoted (truncated)
+	// in the error Deploy returns when the call fails.
+	Up(ctx context.Context, projectName, yamlPath string) (string, error)
+	// Down is invoked by Teardown, which passes removeVolumes=true and
+	// only logs a failure instead of returning it.
+	Down(ctx context.Context, projectName string, removeVolumes bool) (string, error)
+	// Ps is invoked by Reconcile and syncContainers to list the
+	// project's services.
+	Ps(ctx context.Context, projectName, yamlPath string) ([]stack.Service, error)
+}
+
+// newComposeRunner returns the runner the plugin should call. Tests
+// swap this var with a fake; production code never reassigns it. The
+// indirection costs one function-pointer dereference per Deploy /
+// Teardown / Reconcile call — negligible against the docker compose
+// exec it gates. Because this is unsynchronized package-level state,
+// tests that swap it must not run in parallel with each other.
+var newComposeRunner = func() composeRunner { return stack.NewCompose("") }
+
 func init() { plugin.RegisterSource(&source{}) }

 func (*source) Kind() string { return "compose" }
@@ -82,7 +99,7 @@ func (*source) Deploy(ctx context.Context, deps plugin.Deps, w plugin.Workload,
 		return fmt.Errorf("compose source: write yaml: %w", err)
 	}

-	compose := stack.NewCompose("")
+	compose := newComposeRunner()
 	out, err := compose.Up(ctx, projectName, yamlPath)
 	if err != nil {
 		return fmt.Errorf("compose source: docker compose up: %w (output: %s)", err, truncate(out, 1024))
@@ -105,7 +122,7 @@ func (*source) Teardown(ctx context.Context, deps plugin.Deps, w plugin.Workload
 	cfg, _ := plugin.SourceConfigOf[Config](w)
 	projectName := composeProjectName(cfg.ComposeProjectName, w)

-	compose := stack.NewCompose("")
+	compose := newComposeRunner()
 	if _, err := compose.Down(ctx, projectName, true); err != nil {
 		// Log but proceed — the DB rows must not be orphaned.
 		slog.Warn("compose source: docker compose down", "workload", w.ID, "error", err)
@@ -139,7 +156,7 @@ func (*source) Reconcile(ctx context.Context, deps plugin.Deps, w plugin.Workloa
 	projectName := composeProjectName(cfg.ComposeProjectName, w)
 	yamlPath, _ := writeYAMLIfChanged(w.ID, cfg.ComposeYAML)

-	compose := stack.NewCompose("")
+	compose := newComposeRunner()
 	services, err := compose.Ps(ctx, projectName, yamlPath)
 	if err != nil {
 		// Likely no compose project running for this workload. Mark
@@ -162,7 +179,7 @@ func (*source) Reconcile(ctx context.Context, deps plugin.Deps, w plugin.Workloa

 // syncContainers shares its body with Reconcile minus the missing-row
 // fallback — Deploy expects compose ps to succeed since `up` just ran.
-func syncContainers(ctx context.Context, deps plugin.Deps, compose *stack.Compose, w plugin.Workload, projectName, yamlPath string) error {
+func syncContainers(ctx context.Context, deps plugin.Deps, compose composeRunner, w plugin.Workload, projectName, yamlPath string) error {
 	services, err := compose.Ps(ctx, projectName, yamlPath)
 	if err != nil {
 		return fmt.Errorf("compose ps: %w", err)
@@ -204,7 +221,17 @@ var projectNameSanitizer = regexp.MustCompile(`[^a-z0-9_-]`)

 func composeProjectName(explicit string, w plugin.Workload) string {
 	if explicit != "" {
-		return explicit
+		// Apply the same sanitizer to operator-supplied names so a value
+		// like "--foo" cannot reach the docker CLI and be re-parsed as a
+		// flag. Reuses the canonical lower+[^a-z0-9_-]→"-" + trim path.
+		san := strings.ToLower(explicit)
+		san = projectNameSanitizer.ReplaceAllString(san, "-")
+		san = strings.Trim(san, "-")
+		if san != "" {
+			return san
+		}
+		// Fall through to the derived name if sanitization stripped
+		// everything (operator passed e.g. "---" — degenerate input).
 	}
 	name := strings.ToLower(w.Name)
 	name = projectNameSanitizer.ReplaceAllString(name, "-")
@@ -0,0 +1,512 @@
+package compose
+
+import (
+	"context"
+	"encoding/json"
+	"errors"
+	"strings"
+	"sync"
+	"testing"
+
+	"github.com/alexei/tinyforge/internal/stack"
+	"github.com/alexei/tinyforge/internal/store"
+	"github.com/alexei/tinyforge/internal/workload/plugin"
+)
+
+// fakeRunner stands in for *stack.Compose. Every method records its
+// inputs and returns whatever the test set on the corresponding field.
+// Defaults are happy-path: empty services from Ps, no error from Up /
+// Down. Fields are slice-typed so a single fakeRunner can serve a
+// sequence of calls (Deploy issues Up + Ps in order): the Nth call to
+// a method returns the Nth element of its canned slices, or zero
+// values once a slice is exhausted. upCallIdx / psCallIdx / downCallI
+// are the per-method call counters (downCallI is the Down counterpart
+// of the two *Idx fields). Every method holds mu while it touches the
+// fields; the tests read the recorded calls directly once the plugin
+// call has returned.
+type fakeRunner struct {
+	mu sync.Mutex
+
+	upCalls    []runnerCall
+	upOuts     []string
+	upErrs     []error
+	downCalls  []runnerCall
+	downOuts   []string
+	downErrs   []error
+	psCalls    []runnerCall
+	psResults  [][]stack.Service
+	psErrs     []error
+	upCallIdx  int
+	psCallIdx  int
+	downCallI  int
+}
+
+// runnerCall captures the arguments of one fakeRunner method call.
+// Up and Ps fill ProjectName and YAMLPath; Down fills ProjectName and
+// RemoveVolumes. Fields a method does not take stay at their zero
+// value.
+type runnerCall struct {
+	ProjectName   string
+	YAMLPath      string
+	RemoveVolumes bool
+}
+
+// Up records the call in upCalls and returns the canned output/error
+// for this call index (zero values once upOuts / upErrs run out).
+func (f *fakeRunner) Up(_ context.Context, projectName, yamlPath string) (string, error) {
+	f.mu.Lock()
+	defer f.mu.Unlock()
+	f.upCalls = append(f.upCalls, runnerCall{ProjectName: projectName, YAMLPath: yamlPath})
+	out, err := pop(f.upOuts, f.upErrs, f.upCallIdx)
+	f.upCallIdx++
+	return out, err
+}
+
+// Down records the call (including removeVolumes) in downCalls and
+// returns the canned output/error for this call index (zero values
+// once downOuts / downErrs run out).
+func (f *fakeRunner) Down(_ context.Context, projectName string, removeVolumes bool) (string, error) {
+	f.mu.Lock()
+	defer f.mu.Unlock()
+	f.downCalls = append(f.downCalls, runnerCall{ProjectName: projectName, RemoveVolumes: removeVolumes})
+	out, err := pop(f.downOuts, f.downErrs, f.downCallI)
+	f.downCallI++
+	return out, err
+}
+
+// Ps records the call in psCalls and returns the canned services and
+// error for this call index. psResults and psErrs are indexed
+// independently, so a call past the end of both yields (nil, nil).
+func (f *fakeRunner) Ps(_ context.Context, projectName, yamlPath string) ([]stack.Service, error) {
+	f.mu.Lock()
+	defer f.mu.Unlock()
+	f.psCalls = append(f.psCalls, runnerCall{ProjectName: projectName, YAMLPath: yamlPath})
+
+	idx := f.psCallIdx
+	f.psCallIdx++
+	var svcs []stack.Service
+	if idx < len(f.psResults) {
+		svcs = f.psResults[idx]
+	}
+	var err error
+	if idx < len(f.psErrs) {
+		err = f.psErrs[idx]
+	}
+	return svcs, err
+}
+
+// pop returns the nth element of outs/errs or zero values when n is
+// past the end. Lets a test set a single expected response without
+// padding slices for every other call. Despite the name nothing is
+// removed from either slice — the caller advances n itself — and the
+// two slices are looked up independently, so they may differ in length.
+func pop(outs []string, errs []error, n int) (string, error) {
+	var out string
+	if n < len(outs) {
+		out = outs[n]
+	}
+	var err error
+	if n < len(errs) {
+		err = errs[n]
+	}
+	return out, err
+}
+
+// withFakeRunner swaps newComposeRunner for the duration of one test
+// and restores the original on cleanup. It returns nothing: tests that
+// need to inspect the fake post-hoc keep their own pointer to f.
+func withFakeRunner(t *testing.T, f *fakeRunner) {
+	t.Helper()
+	orig := newComposeRunner
+	newComposeRunner = func() composeRunner { return f }
+	t.Cleanup(func() { newComposeRunner = orig })
+}
+
+// testStore opens a store via store.New(":memory:"), fails the test
+// immediately if that errors, and closes the store on test cleanup.
+// NOTE(review): ":memory:" presumably selects an in-memory database so
+// each test gets an isolated store — confirm against store.New.
+func testStore(t *testing.T) *store.Store {
+	t.Helper()
+	st, err := store.New(":memory:")
+	if err != nil {
+		t.Fatalf("open store: %v", err)
+	}
+	t.Cleanup(func() { _ = st.Close() })
+	return st
+}
+
+// seedWorkload creates the parent workload row that container rows FK
+// onto. Returns the workload's ID so callers can reuse it. The row is
+// stored with Kind "plugin", SourceKind "compose" and a SourceConfig
+// that is the JSON encoding of a Config carrying only yamlText. Any
+// marshal or store error fails the test.
+func seedWorkload(t *testing.T, st *store.Store, name, yamlText string) string {
+	t.Helper()
+	cfg := Config{ComposeYAML: yamlText}
+	body, err := json.Marshal(cfg)
+	if err != nil {
+		t.Fatalf("marshal config: %v", err)
+	}
+	w, err := st.CreateWorkload(store.Workload{
+		Kind:         "plugin",
+		Name:         name,
+		SourceKind:   "compose",
+		SourceConfig: string(body),
+	})
+	if err != nil {
+		t.Fatalf("create workload: %v", err)
+	}
+	return w.ID
+}
+
+// TestDeploy_HappyPath checks that a successful Deploy calls Up once
+// (project name prefixed "tf-myapp-", YAML path ending "compose.yml"),
+// calls Ps once, and persists a container row "<workloadID>:web" with
+// role "web" and state "running".
+func TestDeploy_HappyPath(t *testing.T) {
+	withTempDir(t) // isolates the YAML scratch dir under t.TempDir()
+
+	deps := plugin.Deps{Store: testStore(t)}
+	yamlText := "services:\n  web:\n    image: nginx:alpine\n"
+	wid := seedWorkload(t, deps.Store, "myapp", yamlText)
+	w := plugin.Workload{
+		ID:           wid,
+		Name:         "myapp",
+		SourceKind:   "compose",
+		SourceConfig: mustMarshalConfig(t, Config{ComposeYAML: yamlText}),
+	}
+
+	fake := &fakeRunner{
+		psResults: [][]stack.Service{{
+			{Service: "web", State: "running", Status: "Up 5 seconds"},
+		}},
+	}
+	withFakeRunner(t, fake)
+
+	src := &source{}
+	if err := src.Deploy(context.Background(), deps, w, plugin.DeploymentIntent{}); err != nil {
+		t.Fatalf("Deploy: %v", err)
+	}
+
+	// Up called exactly once with the workload-derived project name.
+	if len(fake.upCalls) != 1 {
+		t.Fatalf("Up called %d times, want 1", len(fake.upCalls))
+	}
+	if !strings.HasPrefix(fake.upCalls[0].ProjectName, "tf-myapp-") {
+		t.Errorf("Up projectName = %q, want prefix tf-myapp-", fake.upCalls[0].ProjectName)
+	}
+	if !strings.HasSuffix(fake.upCalls[0].YAMLPath, "compose.yml") {
+		t.Errorf("Up yamlPath = %q, want suffix compose.yml", fake.upCalls[0].YAMLPath)
+	}
+
+	// Ps follows Up to enumerate the resulting containers.
+	if len(fake.psCalls) != 1 {
+		t.Fatalf("Ps called %d times, want 1", len(fake.psCalls))
+	}
+
+	// Service row written.
+	row, err := deps.Store.GetContainerByID(wid + ":web")
+	if err != nil {
+		t.Fatalf("get container row: %v", err)
+	}
+	if row.WorkloadID != wid {
+		t.Errorf("row.WorkloadID = %q, want %q", row.WorkloadID, wid)
+	}
+	if row.Role != "web" {
+		t.Errorf("row.Role = %q, want %q", row.Role, "web")
+	}
+	if row.State != "running" {
+		t.Errorf("row.State = %q, want %q", row.State, "running")
+	}
+}
+
+// TestDeploy_EmptyYAMLConfig_RejectsBeforeExec checks that Deploy
+// returns an error mentioning "empty compose_yaml" and never calls Up
+// when the workload's config has an empty ComposeYAML.
+func TestDeploy_EmptyYAMLConfig_RejectsBeforeExec(t *testing.T) {
+	deps := plugin.Deps{Store: testStore(t)}
+	// The seeded store row carries valid YAML; only the in-memory
+	// workload handed to Deploy below has the empty config under test.
+	wid := seedWorkload(t, deps.Store, "empty", "services:\n  web:\n    image: x\n")
+	w := plugin.Workload{
+		ID:           wid,
+		Name:         "empty",
+		SourceKind:   "compose",
+		SourceConfig: mustMarshalConfig(t, Config{ComposeYAML: ""}),
+	}
+
+	fake := &fakeRunner{}
+	withFakeRunner(t, fake)
+
+	src := &source{}
+	err := src.Deploy(context.Background(), deps, w, plugin.DeploymentIntent{})
+	if err == nil {
+		t.Fatal("Deploy accepted empty compose_yaml")
+	}
+	if !strings.Contains(err.Error(), "empty compose_yaml") {
+		t.Errorf("error = %v, want substring \"empty compose_yaml\"", err)
+	}
+	if len(fake.upCalls) != 0 {
+		t.Errorf("Up should not have been called; got %d calls", len(fake.upCalls))
+	}
+}
+
+// TestDeploy_UpFailure_PropagatesAndIncludesTruncatedOutput checks
+// that when Up fails, Deploy's error names "docker compose up", wraps
+// the Up error text, carries a "(truncated)" marker for the oversized
+// output, and that Ps is never called.
+func TestDeploy_UpFailure_PropagatesAndIncludesTruncatedOutput(t *testing.T) {
+	withTempDir(t)
+
+	deps := plugin.Deps{Store: testStore(t)}
+	yamlText := "services:\n  web:\n    image: bad-image\n"
+	wid := seedWorkload(t, deps.Store, "fail", yamlText)
+	w := plugin.Workload{
+		ID:           wid,
+		Name:         "fail",
+		SourceKind:   "compose",
+		SourceConfig: mustMarshalConfig(t, Config{ComposeYAML: yamlText}),
+	}
+
+	// 25 bytes x 200 = 5000 bytes, comfortably above the 1024-byte
+	// limit Deploy passes to truncate.
+	bigOut := strings.Repeat("docker compose log noise ", 200) // > 1024 bytes
+	fake := &fakeRunner{
+		upOuts: []string{bigOut},
+		upErrs: []error{errors.New("exit status 1")},
+	}
+	withFakeRunner(t, fake)
+
+	src := &source{}
+	err := src.Deploy(context.Background(), deps, w, plugin.DeploymentIntent{})
+	if err == nil {
+		t.Fatal("Deploy accepted Up failure")
+	}
+	if !strings.Contains(err.Error(), "docker compose up") {
+		t.Errorf("error = %v, want substring \"docker compose up\"", err)
+	}
+	if !strings.Contains(err.Error(), "exit status 1") {
+		t.Errorf("error = %v, want wrapped Up err", err)
+	}
+	// NOTE(review): presumably the truncate helper appends this marker
+	// when it cuts the output — confirm against its definition.
+	if !strings.Contains(err.Error(), "(truncated)") {
+		t.Errorf("error = %v, want truncated-output marker", err)
+	}
+	// Ps must not be called when Up failed.
+	if len(fake.psCalls) != 0 {
+		t.Errorf("Ps called %d times after Up failure; want 0", len(fake.psCalls))
+	}
+}
+
+// TestDeploy_UpSucceedsButPsFails_SurfacesError checks that a Ps
+// failure after a successful Up makes Deploy return an error that
+// mentions "sync container rows".
+func TestDeploy_UpSucceedsButPsFails_SurfacesError(t *testing.T) {
+	// `up` succeeded but enumerate failed — Deploy must surface so the UI
+	// doesn't show an empty containers index for a running stack.
+	withTempDir(t)
+
+	deps := plugin.Deps{Store: testStore(t)}
+	yamlText := "services:\n  web:\n    image: nginx\n"
+	wid := seedWorkload(t, deps.Store, "psfail", yamlText)
+	w := plugin.Workload{
+		ID:           wid,
+		Name:         "psfail",
+		SourceKind:   "compose",
+		SourceConfig: mustMarshalConfig(t, Config{ComposeYAML: yamlText}),
+	}
+
+	// Up keeps its zero-value (success) default; only the first Ps fails.
+	fake := &fakeRunner{
+		psErrs: []error{errors.New("compose ps boom")},
+	}
+	withFakeRunner(t, fake)
+
+	src := &source{}
+	err := src.Deploy(context.Background(), deps, w, plugin.DeploymentIntent{})
+	if err == nil {
+		t.Fatal("Deploy ignored Ps failure")
+	}
+	if !strings.Contains(err.Error(), "sync container rows") {
+		t.Errorf("error = %v, want substring \"sync container rows\"", err)
+	}
+}
+
+// TestTeardown_DropsContainerRows_EvenWhenDownFails checks that
+// Teardown returns nil, calls Down once with removeVolumes=true, and
+// deletes the workload's container rows even though Down errors.
+func TestTeardown_DropsContainerRows_EvenWhenDownFails(t *testing.T) {
+	// docker compose down failing must not orphan rows in the DB.
+	withTempDir(t)
+	deps := plugin.Deps{Store: testStore(t)}
+	wid := seedWorkload(t, deps.Store, "tdown", "services:\n  web:\n    image: nginx\n")
+
+	// Seed two service rows the way Deploy would.
+	for _, role := range []string{"web", "db"} {
+		if err := deps.Store.UpsertContainer(store.Container{
+			ID:           wid + ":" + role,
+			WorkloadID:   wid,
+			WorkloadKind: "compose",
+			Role:         role,
+			Host:         "local",
+			State:        "running",
+		}); err != nil {
+			t.Fatalf("seed container: %v", err)
+		}
+	}
+
+	fake := &fakeRunner{downErrs: []error{errors.New("compose project unknown")}}
+	withFakeRunner(t, fake)
+
+	src := &source{}
+	w := plugin.Workload{
+		ID:           wid,
+		Name:         "tdown",
+		SourceKind:   "compose",
+		SourceConfig: mustMarshalConfig(t, Config{ComposeYAML: "services:\n  web:\n    image: nginx\n"}),
+	}
+	if err := src.Teardown(context.Background(), deps, w); err != nil {
+		t.Fatalf("Teardown: %v", err)
+	}
+
+	// Down requested removeVolumes=true (matches the docstring claim).
+	if len(fake.downCalls) != 1 {
+		t.Fatalf("Down calls = %d, want 1", len(fake.downCalls))
+	}
+	if !fake.downCalls[0].RemoveVolumes {
+		t.Errorf("Down removeVolumes = false, want true (workload teardown is destructive)")
+	}
+
+	// Rows gone despite the Down error.
+	for _, role := range []string{"web", "db"} {
+		if _, err := deps.Store.GetContainerByID(wid + ":" + role); !errors.Is(err, store.ErrNotFound) {
+			t.Errorf("container row %q survived teardown: err=%v", role, err)
+		}
+	}
+}
+
+// TestTeardown_HappyPath checks that Teardown with a succeeding Down
+// returns nil, calls Down exactly once, and removes the seeded
+// "<workloadID>:web" container row.
+func TestTeardown_HappyPath(t *testing.T) {
+	withTempDir(t)
+	deps := plugin.Deps{Store: testStore(t)}
+	wid := seedWorkload(t, deps.Store, "tdown2", "services:\n  web:\n    image: nginx\n")
+
+	if err := deps.Store.UpsertContainer(store.Container{
+		ID:           wid + ":web",
+		WorkloadID:   wid,
+		WorkloadKind: "compose",
+		Role:         "web",
+		Host:         "local",
+		State:        "running",
+	}); err != nil {
+		t.Fatalf("seed: %v", err)
+	}
+
+	// Zero-value fake: Down returns ("", nil).
+	fake := &fakeRunner{}
+	withFakeRunner(t, fake)
+
+	src := &source{}
+	w := plugin.Workload{
+		ID:           wid,
+		Name:         "tdown2",
+		SourceKind:   "compose",
+		SourceConfig: mustMarshalConfig(t, Config{ComposeYAML: "services:\n  web:\n    image: nginx\n"}),
+	}
+	if err := src.Teardown(context.Background(), deps, w); err != nil {
+		t.Fatalf("Teardown: %v", err)
+	}
+	if len(fake.downCalls) != 1 {
+		t.Errorf("Down calls = %d, want 1", len(fake.downCalls))
+	}
+	if _, err := deps.Store.GetContainerByID(wid + ":web"); !errors.Is(err, store.ErrNotFound) {
+		t.Errorf("container row survived teardown: err=%v", err)
+	}
+}
+
+// TestReconcile_PsSuccess_UpsertsRows checks that Reconcile creates a
+// container row in state "running" for each service Ps reports, with
+// no rows seeded beforehand.
+func TestReconcile_PsSuccess_UpsertsRows(t *testing.T) {
+	withTempDir(t)
+	deps := plugin.Deps{Store: testStore(t)}
+	yamlText := "services:\n  web:\n    image: nginx\n  db:\n    image: postgres\n"
+	wid := seedWorkload(t, deps.Store, "rec", yamlText)
+
+	fake := &fakeRunner{
+		psResults: [][]stack.Service{{
+			{Service: "web", State: "running"},
+			{Service: "db", State: "running"},
+		}},
+	}
+	withFakeRunner(t, fake)
+
+	src := &source{}
+	w := plugin.Workload{
+		ID:           wid,
+		Name:         "rec",
+		SourceKind:   "compose",
+		SourceConfig: mustMarshalConfig(t, Config{ComposeYAML: yamlText}),
+	}
+	if err := src.Reconcile(context.Background(), deps, w); err != nil {
+		t.Fatalf("Reconcile: %v", err)
+	}
+
+	for _, role := range []string{"web", "db"} {
+		row, err := deps.Store.GetContainerByID(wid + ":" + role)
+		if err != nil {
+			t.Errorf("row %q missing after reconcile: %v", role, err)
+			continue
+		}
+		if row.State != "running" {
+			t.Errorf("row %q state = %q, want \"running\"", role, row.State)
+		}
+	}
+}
+
+// TestReconcile_PsFailure_MarksExistingRowsMissing checks that when Ps
+// errors, Reconcile still returns nil and the pre-existing container
+// row is kept with its state changed to "missing".
+func TestReconcile_PsFailure_MarksExistingRowsMissing(t *testing.T) {
+	// When compose ps fails (project unknown to Docker), the reconciler
+	// flips existing rows to "missing" rather than deleting them — the UI
+	// surfaces the desync to the operator.
+	withTempDir(t)
+	deps := plugin.Deps{Store: testStore(t)}
+	yamlText := "services:\n  web:\n    image: nginx\n"
+	wid := seedWorkload(t, deps.Store, "missing", yamlText)
+
+	if err := deps.Store.UpsertContainer(store.Container{
+		ID:           wid + ":web",
+		WorkloadID:   wid,
+		WorkloadKind: "compose",
+		Role:         "web",
+		Host:         "local",
+		State:        "running",
+	}); err != nil {
+		t.Fatalf("seed: %v", err)
+	}
+
+	fake := &fakeRunner{psErrs: []error{errors.New("no such project")}}
+	withFakeRunner(t, fake)
+
+	src := &source{}
+	w := plugin.Workload{
+		ID:           wid,
+		Name:         "missing",
+		SourceKind:   "compose",
+		SourceConfig: mustMarshalConfig(t, Config{ComposeYAML: yamlText}),
+	}
+	if err := src.Reconcile(context.Background(), deps, w); err != nil {
+		t.Fatalf("Reconcile returned %v; should be nil even on Ps failure", err)
+	}
+
+	row, err := deps.Store.GetContainerByID(wid + ":web")
+	if err != nil {
+		t.Fatalf("row missing entirely (should be marked, not deleted): %v", err)
+	}
+	if row.State != "missing" {
+		t.Errorf("row.State = %q, want \"missing\"", row.State)
+	}
+}
+
+// TestReconcile_FallsBackToStatusWhenStateEmpty checks that a service
+// reported with an empty State and Status "Exit 0" is persisted with
+// row.State == "Exit 0".
+func TestReconcile_FallsBackToStatusWhenStateEmpty(t *testing.T) {
+	// Some compose versions populate Status (human string) but not State
+	// (enum) for non-running services. upsertServiceRow falls back to
+	// Status; verify that here.
+	withTempDir(t)
+	deps := plugin.Deps{Store: testStore(t)}
+	yamlText := "services:\n  worker:\n    image: alpine\n"
+	wid := seedWorkload(t, deps.Store, "fallback", yamlText)
+
+	fake := &fakeRunner{
+		psResults: [][]stack.Service{{
+			{Service: "worker", State: "", Status: "Exit 0"},
+		}},
+	}
+	withFakeRunner(t, fake)
+
+	src := &source{}
+	w := plugin.Workload{
+		ID:           wid,
+		Name:         "fallback",
+		SourceKind:   "compose",
+		SourceConfig: mustMarshalConfig(t, Config{ComposeYAML: yamlText}),
+	}
+	if err := src.Reconcile(context.Background(), deps, w); err != nil {
+		t.Fatalf("Reconcile: %v", err)
+	}
+
+	row, err := deps.Store.GetContainerByID(wid + ":worker")
+	if err != nil {
+		t.Fatalf("get row: %v", err)
+	}
+	if row.State != "Exit 0" {
+		t.Errorf("row.State = %q, want \"Exit 0\" (Status fallback)", row.State)
+	}
+}
+
+// mustMarshalConfig is a small helper that converts a Config to the
+// raw-JSON shape SourceConfig expects. Tests use it instead of
+// hand-rolling the string so a Config field rename can't drift the test
+// fixture from the production decoder. A marshal error fails the test
+// immediately via t.Fatalf.
+func mustMarshalConfig(t *testing.T, cfg Config) json.RawMessage {
+	t.Helper()
+	b, err := json.Marshal(cfg)
+	if err != nil {
+		t.Fatalf("marshal config: %v", err)
+	}
+	return json.RawMessage(b)
+}
+
+// Compile-time guards: *stack.Compose must continue to satisfy
+// composeRunner so the production path keeps building, and the fake
+// must continue to satisfy it too so a drift in the interface shape
+// is reported here, by name, rather than at whichever call site
+// happens to use the value first.
+var (
+	_ composeRunner = (*stack.Compose)(nil)
+	_ composeRunner = (*fakeRunner)(nil)
+)
@@ -0,0 +1,574 @@
+package dockerfile
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"io"
+	"log/slog"
+	"os"
+	"strconv"
+	"strings"
+	"time"
+
+	"github.com/alexei/tinyforge/internal/crypto"
+	"github.com/alexei/tinyforge/internal/docker"
+	"github.com/alexei/tinyforge/internal/events"
+	"github.com/alexei/tinyforge/internal/notify"
+	"github.com/alexei/tinyforge/internal/proxy"
+	"github.com/alexei/tinyforge/internal/staticsite"
+	"github.com/alexei/tinyforge/internal/store"
+	"github.com/alexei/tinyforge/internal/workload/plugin"
+)
+
+// healthCheckDelay is the grace window after StartContainer before we
+// probe IsContainerRunning. Mirrors the static plugin's window — short
+// enough not to slow happy-path deploys, long enough to catch
+// crash-on-boot failures (missing env, bad CMD, port conflict).
+// deploy waits this long inside a select that also watches ctx.Done(),
+// so a cancelled deploy does not sit out the full delay.
+const healthCheckDelay = 3 * time.Second
+
+// deploy runs one end-to-end sync of a dockerfile workload:
+//
+//  1. fetch the latest commit SHA from the configured git provider
+//  2. skip if SHA + container + proxy are all still healthy
+//  3. clone the repo into a temp dir
+//  4. resolve the build context + Dockerfile location
+//  5. `docker build -t <tag> -f <dockerfile> <context>`
+//  6. recreate the container with the new image
+//  7. health-probe the container, surface logs on failure
+//  8. reconfigure the proxy route
+//  9. tear down the previous container (different ID) once we're sure
+//     the new one is healthy and proxied
+//
+// Each step writes its own status update so the dashboard's runtime-
+// state panel can show a useful intermediate state when the deploy
+// stalls on the slow step (almost always the build).
+func deploy(ctx context.Context, deps plugin.Deps, w plugin.Workload, intent plugin.DeploymentIntent) error {
+	cfg, err := plugin.SourceConfigOf[Config](w)
+	if err != nil {
+		return fmt.Errorf("dockerfile source: decode config: %w", err)
+	}
+
+	prev, prevContainer, err := loadState(deps, w)
+	if err != nil {
+		return err
+	}
+
+	// Force a full rebuild on manual / promote / first-time deploys
+	// (no Reason at all also implies manual). Schedule / git triggers
+	// honour the unchanged-SHA short-circuit so cron polling does not
+	// rebuild minute-by-minute when nothing changed.
+	force := intent.Reason == "" || intent.Reason == "manual" || intent.Reason == "promote"
+
+	// Decrypt the access token if present. Token never escapes this
+	// frame: any error message routes through sanitizeError(_, token)
+	// which redacts the literal substring.
+	token := ""
+	if cfg.AccessToken != "" {
+		decrypted, derr := crypto.Decrypt(deps.EncKey, cfg.AccessToken)
+		if derr != nil {
+			slog.Warn("dockerfile source: failed to decrypt access token",
+				"workload", w.Name, "error", derr)
+		} else {
+			token = decrypted
+		}
+	}
+
+	provider, err := staticsite.NewGitProvider(staticsite.ProviderType(cfg.Provider), cfg.BaseURL, token)
+	if err != nil {
+		updateStatus(deps, w, "failed", prev.LastCommitSHA,
+			sanitizeError(fmt.Sprintf("create provider: %v", err), token))
+		return fmt.Errorf("create provider: %w", err)
+	}
+
+	latestSHA, err := provider.GetLatestCommitSHA(ctx, cfg.RepoOwner, cfg.RepoName, cfg.Branch)
+	if err != nil {
+		updateStatus(deps, w, "failed", prev.LastCommitSHA,
+			sanitizeError(fmt.Sprintf("fetch commit SHA: %v", err), token))
+		return fmt.Errorf("get latest commit: %w", err)
+	}
+
+	domain := primaryDomain(deps, w)
+
+	prevContainerID := ""
+	prevProxyRouteID := ""
+	if prevContainer != nil {
+		prevContainerID = prevContainer.ContainerID
+		prevProxyRouteID = prevContainer.ProxyRouteID
+	}
+	// Short-circuit: SHA unchanged AND container is still running AND
+	// (if there's a public face) the proxy route still exists. Manual
+	// deploys skip this entirely.
+	//
+	// We deliberately do NOT gate this on prev.Status == "deployed". A
+	// transient failure (e.g. a one-off proxy-check error) leaves the
+	// persisted status as "failed"; if we required "deployed" here, every
+	// subsequent cron/git poll with the same SHA would fall through to a
+	// full clone + docker build despite a perfectly healthy running
+	// container — a rebuild storm that burns CPU/disk until a new commit
+	// lands. Instead we trust the live container/proxy state and heal the
+	// stale status via healUnchanged.
+	if !force && latestSHA == prev.LastCommitSHA && prevContainerID != "" {
+		running, _ := deps.Docker.IsContainerRunning(ctx, prevContainerID)
+		switch {
+		case !running:
+			slog.Info("dockerfile: container not running, forcing redeploy", "workload", w.Name)
+		case domain != "":
+			proxyOK, perr := deps.Proxy.RouteExists(ctx, domain)
+			switch {
+			case perr != nil:
+				slog.Warn("dockerfile: proxy check failed, forcing redeploy",
+					"workload", w.Name, "error", perr)
+			case !proxyOK:
+				slog.Info("dockerfile: proxy route missing, forcing redeploy", "workload", w.Name)
+			default:
+				return healUnchanged(deps, w, prev, latestSHA)
+			}
+		default:
+			return healUnchanged(deps, w, prev, latestSHA)
+		}
+	}
+
+	updateStatus(deps, w, "syncing", prev.LastCommitSHA, "")
+	publishEvent(deps, w, "syncing")
+
+	// Clone the repo into a temp dir. We always download the entire
+	// repo tree (folderPath = ""); a ContextPath subset is applied
+	// at build time, not at download time, so a Dockerfile in
+	// `./docker/Dockerfile` with `ContextPath=""` still works.
+	cloneDir, err := os.MkdirTemp("", "tf-build-"+idShort(w)+"-*")
+	if err != nil {
+		updateStatus(deps, w, "failed", prev.LastCommitSHA,
+			sanitizeError(fmt.Sprintf("create clone dir: %v", err), token))
+		return fmt.Errorf("create clone dir: %w", err)
+	}
+	defer os.RemoveAll(cloneDir)
+
+	if err := provider.DownloadFolder(ctx, cfg.RepoOwner, cfg.RepoName, cfg.Branch, "", cloneDir); err != nil {
+		updateStatus(deps, w, "failed", prev.LastCommitSHA,
+			sanitizeError(fmt.Sprintf("download repo: %v", err), token))
+		return fmt.Errorf("download repo: %w", err)
+	}
+
+	// Resolve the build context (with symlink-aware escape check) and
+	// verify the Dockerfile is actually present before sending the
+	// build off to the daemon.
+	contextDir, err := resolveContextDir(cloneDir, cfg.ContextPath)
+	if err != nil {
+		updateStatus(deps, w, "failed", latestSHA,
+			sanitizeError(fmt.Sprintf("resolve context: %v", err), token))
+		return fmt.Errorf("resolve context: %w", err)
+	}
+	if err := verifyDockerfileExists(contextDir, cfg.DockerfilePath); err != nil {
+		updateStatus(deps, w, "failed", latestSHA,
+			sanitizeError(err.Error(), token))
+		return err
+	}
+
+	imageTag := imageTagFor(w)
+	updateStatus(deps, w, "building", latestSHA, "")
+	publishEvent(deps, w, "building")
+	// Bridge per-line build output onto the event bus so /api/events
+	// subscribers (the dashboard's live tail) can show progress while
+	// the daemon chugs. The bus is non-blocking — slow subscribers drop
+	// events rather than backpressure the build — so this is safe to
+	// call from the hot scan loop.
+	logFn := func(line string) {
+		publishBuildLog(deps, w, line)
+	}
+	if err := deps.Docker.BuildImageAt(ctx, contextDir, cfg.DockerfilePath, imageTag, logFn); err != nil {
+		updateStatus(deps, w, "failed", latestSHA,
+			sanitizeError(fmt.Sprintf("docker build: %v", err), token))
+		return fmt.Errorf("docker build: %w", err)
+	}
+
+	env := buildEnv(deps, w.ID)
+	containerPort := strconv.Itoa(cfg.Port)
+
+	settings, err := deps.Store.GetSettings()
+	if err != nil {
+		updateStatus(deps, w, "failed", latestSHA,
+			sanitizeError(fmt.Sprintf("get settings: %v", err), token))
+		return fmt.Errorf("get settings: %w", err)
+	}
+
+	networkName := settings.Network
+	networkID, err := deps.Docker.EnsureNetwork(ctx, networkName)
+	if err != nil {
+		updateStatus(deps, w, "failed", latestSHA,
+			sanitizeError(fmt.Sprintf("ensure network: %v", err), token))
+		return fmt.Errorf("ensure network: %w", err)
+	}
+
+	containerName := containerNameFor(w)
+
+	// Per-face proxy labels (Traefik consumes these; NPM ignores them).
+	labels := map[string]string{}
+	if domain != "" {
+		if l := deps.Proxy.ContainerLabels(domain, cfg.Port); l != nil {
+			for k, v := range l {
+				labels[k] = v
+			}
+		}
+	}
+
+	cc := docker.ContainerConfig{
+		Name:         containerName,
+		Image:        imageTag,
+		Env:          env,
+		ExposedPorts: []string{containerPort + "/tcp"},
+		NetworkName:  networkName,
+		NetworkID:    networkID,
+		Labels:       labels,
+		WorkloadID:   w.ID,
+		// Dockerfile workloads are tagged as "build" so the dashboard
+		// and any filtered query can distinguish them from static sites
+		// (which serve files) and image-source containers (which pull
+		// pre-built images from a registry).
+		WorkloadKind: string(store.WorkloadKindBuild),
+		Role:         "",
+	}
+
+	containerID, err := deps.Docker.CreateContainer(ctx, cc)
+	if err != nil {
+		// Name conflict — best-effort cleanup of any prior container
+		// (by ID first; by name as a fallback) and one retry.
+		if prevContainerID != "" {
+			deps.Docker.StopContainer(ctx, prevContainerID, 10)
+			deps.Docker.RemoveContainer(ctx, prevContainerID, true)
+		}
+		removeContainerByName(ctx, deps, containerName)
+
+		containerID, err = deps.Docker.CreateContainer(ctx, cc)
+		if err != nil {
+			updateStatus(deps, w, "failed", latestSHA,
+				sanitizeError(fmt.Sprintf("create container: %v", err), token))
+			return fmt.Errorf("create container: %w", err)
+		}
+	}
+
+	if err := deps.Docker.StartContainer(ctx, containerID); err != nil {
+		deps.Docker.RemoveContainer(ctx, containerID, true)
+		updateStatus(deps, w, "failed", latestSHA,
+			sanitizeError(fmt.Sprintf("start container: %v", err), token))
+		return fmt.Errorf("start container: %w", err)
+	}
+
+	// Brief health-check window — catch crash-on-boot. ctx-aware so a
+	// cancelled deploy returns promptly. On failure surface the tail
+	// of the container's logs as the error reason; that's almost
+	// always what the operator needs to debug.
+	select {
+	case <-ctx.Done():
+		deps.Docker.RemoveContainer(ctx, containerID, true)
+		updateStatus(deps, w, "failed", latestSHA, "deploy cancelled before health check")
+		return ctx.Err()
+	case <-time.After(healthCheckDelay):
+	}
+	running, runErr := deps.Docker.IsContainerRunning(ctx, containerID)
+	if runErr != nil || !running {
+		logMsg := "container exited immediately after start"
+		if logs, logErr := deps.Docker.ContainerLogs(ctx, containerID, false, "40"); logErr == nil {
+			buf, _ := io.ReadAll(logs)
+			logs.Close()
+			if len(buf) > 0 {
+				// Pass `env` so any decrypted KEY=VALUE pair that the
+				// container's startup output happens to echo (think
+				// `CMD echo $DB_PASSWORD` in a debug Dockerfile) is
+				// redacted before it lands in the operator-visible
+				// last_error field.
+				logMsg = sanitizeErrorWithSecrets(string(buf), token, env)
+			}
+		}
+		deps.Docker.RemoveContainer(ctx, containerID, true)
+		updateStatus(deps, w, "failed", latestSHA, logMsg)
+		return fmt.Errorf("container not running: %s", logMsg)
+	}
+
+	// Resolve proxy target: in-network DNS by default, NPM-remote
+	// override uses (settings.ServerIP, hostPort).
+	forwardHost := containerName
+	forwardPort := cfg.Port
+	if settings.NpmRemote && settings.ProxyProvider == "npm" {
+		if settings.ServerIP != "" {
+			hostPort, hpErr := deps.Docker.InspectContainerPort(ctx, containerID, containerPort+"/tcp")
+			if hpErr != nil {
+				slog.Warn("dockerfile: could not get host port for remote NPM",
+					"workload", w.Name, "error", hpErr)
+			} else {
+				forwardHost = settings.ServerIP
+				forwardPort = int(hostPort)
+			}
+		}
+	}
+
+	// Configure proxy if a domain is set. Replace any prior route
+	// in-place so traffic shifts atomically over to the new container.
+	proxyRouteID := prevProxyRouteID
+	if domain != "" {
+		if prevProxyRouteID != "" {
+			deps.Proxy.DeleteRoute(ctx, prevProxyRouteID)
+		}
+		routeID, rerr := deps.Proxy.ConfigureRoute(ctx, domain, forwardHost, forwardPort, proxy.RouteOptions{
+			SSLCertificateID: settings.SSLCertificateID,
+		})
+		if rerr != nil {
+			slog.Warn("dockerfile: failed to configure proxy",
+				"workload", w.Name, "domain", domain,
+				"target", fmt.Sprintf("%s:%d", forwardHost, forwardPort), "error", rerr)
+		} else {
+			proxyRouteID = routeID
+			slog.Info("dockerfile: proxy configured",
+				"workload", w.Name, "domain", domain,
+				"target", fmt.Sprintf("%s:%d", forwardHost, forwardPort), "routeID", routeID)
+		}
+	}
+
+	// Drop the previous container only after the new one is healthy
+	// + routed. Different-ID-than-previous tells us we created a
+	// fresh one (vs returning the same ID via UpsertContainer reuse).
+	if prevContainerID != "" && prevContainerID != containerID {
+		deps.Docker.StopContainer(ctx, prevContainerID, 10)
+		deps.Docker.RemoveContainer(ctx, prevContainerID, true)
+	}
+
+	// Single transactional write of new state + container metadata.
+	// On failure: tear down the just-created container + proxy route
+	// so we don't leave orphans behind for the next deploy to trip
+	// over.
+	if err := saveState(deps, w, func(rs *runtimeState, c *store.Container) {
+		rs.LastCommitSHA = latestSHA
+		rs.LastSyncAt = store.Now()
+		rs.LastError = ""
+		rs.Status = "deployed"
+
+		c.ContainerID = containerID
+		c.ProxyRouteID = proxyRouteID
+		c.Subdomain = domain
+		c.State = "running"
+		c.Port = cfg.Port
+		c.ImageRef = imageTag
+	}); err != nil {
+		slog.Error("dockerfile: failed to persist deploy state — rolling back",
+			"workload", w.Name, "error", err)
+		if proxyRouteID != "" {
+			deps.Proxy.DeleteRoute(ctx, proxyRouteID)
+		}
+		deps.Docker.StopContainer(ctx, containerID, 10)
+		deps.Docker.RemoveContainer(ctx, containerID, true)
+		updateStatus(deps, w, "failed", latestSHA,
+			sanitizeError(fmt.Sprintf("persist deploy state: %v", err), token))
+		return fmt.Errorf("persist deploy state: %w", err)
+	}
+
+	publishEvent(deps, w, "deployed")
+	dispatchBuildNotification(deps, w, domain, "deployed", "")
+
+	slog.Info("dockerfile deployed",
+		"workload", w.Name,
+		"sha", shortSHA(latestSHA),
+		"image", imageTag)
+	return nil
+}
+
+// updateStatus persists a status transition through saveState and then
+// runs the side effects tied to terminal states.
+//
+// The runtime state always receives status and errMsg; commitSHA is
+// written only when non-empty. The container row's state follows
+// "deployed" (-> "running"), "stopped" and "failed". A saveState error
+// is logged rather than returned, and the side effects below still run:
+// "failed" is recorded in the event log, and both "deployed" and
+// "failed" dispatch an outbound notification. The deploy success path
+// writes its state with saveState directly instead of using this helper.
+func updateStatus(deps plugin.Deps, w plugin.Workload, status, commitSHA, errMsg string) {
+	apply := func(rs *runtimeState, c *store.Container) {
+		rs.Status = status
+		rs.LastError = errMsg
+		if commitSHA != "" {
+			rs.LastCommitSHA = commitSHA
+		}
+		// "syncing" / "building" (and any other status) deliberately
+		// leave the container row's state as the previous deploy left it.
+		switch status {
+		case "deployed":
+			c.State = "running"
+		case "stopped", "failed":
+			c.State = status
+		}
+	}
+	if err := saveState(deps, w, apply); err != nil {
+		slog.Error("dockerfile: failed to update status",
+			"id", w.ID, "status", status, "error", err)
+	}
+
+	switch status {
+	case "failed":
+		publishEvent(deps, w, "failed: "+errMsg)
+		dispatchBuildNotification(deps, w, primaryDomain(deps, w), status, errMsg)
+	case "deployed":
+		dispatchBuildNotification(deps, w, primaryDomain(deps, w), status, errMsg)
+	}
+}
+
+// dispatchBuildNotification hands a build event for the workload to
+// plugin.DispatchNotificationForWorkload, which (per the original
+// author) centralises routing for all source kinds. Only status
+// "failed" maps to "build_failure"; every other status is reported as
+// "build_success". The event URL is "https://"+domain, or empty when
+// no domain is known.
+func dispatchBuildNotification(deps plugin.Deps, w plugin.Workload, domain, status, errMsg string) {
+	evt := notify.Event{
+		Type:    "build_success",
+		Project: w.Name,
+		Error:   errMsg,
+	}
+	if status == "failed" {
+		evt.Type = "build_failure"
+	}
+	if domain != "" {
+		// Notifications always link to the HTTPS form of the domain.
+		evt.URL = "https://" + domain
+	}
+	plugin.DispatchNotificationForWorkload(deps, w, evt)
+}
+
+// publishEvent records a status change twice: as a persisted event_log
+// row and as an EventLog message on the bus. Statuses starting with
+// "failed" get severity "error", all others "info". If the insert
+// fails the error is logged and nothing is published on the bus.
+func publishEvent(deps plugin.Deps, w plugin.Workload, status string) {
+	const src = "dockerfile"
+	level := "info"
+	if strings.HasPrefix(status, "failed") {
+		level = "error"
+	}
+	text := fmt.Sprintf("Build %q: %s", w.Name, status)
+
+	// Metadata degrades to an empty JSON object if marshalling fails.
+	meta := "{}"
+	if raw, err := json.Marshal(map[string]string{
+		"workload_id":   w.ID,
+		"workload_name": w.Name,
+		"status":        status,
+	}); err != nil {
+		slog.Error("dockerfile: marshal event metadata", "error", err)
+	} else {
+		meta = string(raw)
+	}
+
+	// Persist first: the bus payload reuses the stored row's ID and timestamp.
+	row, err := deps.Store.InsertEvent(store.EventLog{
+		Source: src, Severity: level, Message: text, Metadata: meta,
+	})
+	if err != nil {
+		slog.Error("dockerfile: failed to persist event log", "error", err)
+		return
+	}
+	deps.Events.Publish(events.Event{
+		Type: events.EventLog,
+		Payload: events.EventLogPayload{
+			ID:        row.ID,
+			Source:    src,
+			Severity:  level,
+			Message:   text,
+			Metadata:  meta,
+			CreatedAt: row.CreatedAt,
+		},
+	})
+}
+
+// publishBuildLog publishes one build-output line as an EventBuildLog
+// event tagged with the workload ID and the "stdout" stream.
+//
+// Trailing '\r' and '\n' characters are trimmed first so each event
+// renders as a single row; a line that is empty after trimming is
+// dropped without publishing. Delivery is best-effort — the original
+// author notes the bus drops events under backpressure instead of
+// blocking the build loop (bus implementation not visible here).
+func publishBuildLog(deps plugin.Deps, w plugin.Workload, line string) {
+	payload := events.BuildLogPayload{
+		WorkloadID: w.ID,
+		Line:       strings.TrimRight(line, "\r\n"),
+		Stream:     "stdout",
+	}
+	if payload.Line == "" {
+		return
+	}
+	deps.Events.Publish(events.Event{Type: events.EventBuildLog, Payload: payload})
+}
+
+// healUnchanged handles the "nothing to deploy" outcome: it logs that the
+// commit is unchanged and, when the persisted status is anything other
+// than "deployed", rewrites it to deployed/running and clears the last
+// error. The repair goes through saveState directly rather than
+// updateStatus so that it does not dispatch a build-success
+// notification. A failed repair is only logged; the function always
+// returns nil.
+func healUnchanged(deps plugin.Deps, w plugin.Workload, prev runtimeState, latestSHA string) error {
+	slog.Info("dockerfile: no changes", "workload", w.Name, "sha", shortSHA(latestSHA))
+	if prev.Status != "deployed" {
+		heal := func(rs *runtimeState, c *store.Container) {
+			rs.Status = "deployed"
+			rs.LastError = ""
+			c.State = "running"
+		}
+		if err := saveState(deps, w, heal); err != nil {
+			slog.Warn("dockerfile: failed to heal stale status to deployed",
+				"workload", w.Name, "error", err)
+		}
+	}
+	return nil
+}
+
+// removeContainerByName is the best-effort recovery step for a name
+// conflict in CreateContainer: it lists containers and stops + removes
+// every one whose Name equals name, without stopping at the first
+// match. Errors from list/stop/remove are deliberately ignored; the
+// caller's retry of CreateContainer surfaces any remaining conflict.
+func removeContainerByName(ctx context.Context, deps plugin.Deps, name string) {
+	all, err := deps.Docker.ListContainers(ctx, nil)
+	if err != nil {
+		return
+	}
+	for _, ctr := range all {
+		if ctr.Name != name {
+			continue
+		}
+		deps.Docker.StopContainer(ctx, ctr.ID, 10)
+		deps.Docker.RemoveContainer(ctx, ctr.ID, true)
+	}
+}
+
+// primaryDomain derives an FQDN from the first public face that has a
+// subdomain or domain set; a bare subdomain is qualified with
+// settings.Domain when available. Returns "" if no face qualifies.
+func primaryDomain(deps plugin.Deps, w plugin.Workload) string {
+	for _, face := range w.PublicFaces {
+		sub, dom := face.Subdomain, face.Domain
+		if sub == "" && dom == "" {
+			continue
+		}
+		if sub == "" {
+			return dom
+		}
+		if dom != "" {
+			return sub + "." + dom
+		}
+		settings, err := deps.Store.GetSettings()
+		if err != nil || settings.Domain == "" {
+			return sub
+		}
+		return sub + "." + settings.Domain
+	}
+	return ""
+}
+
+// shortSHA returns at most the first 8 bytes of a commit SHA so log
+// lines stay readable while still identifying the commit.
+func shortSHA(sha string) string {
+	if len(sha) <= 8 {
+		return sha
+	}
+	return sha[:8]
+}
@@ -0,0 +1,131 @@
+// Package dockerfile implements the "dockerfile" source: a git-repo-backed
+// deployable that builds a Docker image from a user-supplied Dockerfile
+// and runs one container. This is the "self-hosted Vercel" Source —
+// users point at a Git repo containing a Dockerfile and Tinyforge
+// handles clone → build → run → proxy in one shot, with no external CI
+// pipeline.
+//
+// Architecturally the plugin sits between `static` (clones a Git repo,
+// builds an image, runs one container) and `image` (richer runtime
+// shape: ports, healthcheck, env, volumes). The deploy pipeline mirrors
+// static — same git-fetch, same image-tag/container-name shape, same
+// container-row state persistence — but the build step uses the
+// operator's Dockerfile instead of generating one.
+//
+// The full pipeline is implemented inline in this package
+// (deploy.go / teardown.go / reconcile.go) so a new dockerfile source
+// kind is usable immediately on init() — no separate registration step
+// in the deployer.
+package dockerfile
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"strings"
+
+	"github.com/alexei/tinyforge/internal/workload/plugin"
+)
+
+// Config is the per-workload source config blob. Mirrors the shape of
+// the static plugin's Config so the UI wizard can largely reuse the
+// existing Git-discovery + branch-picker + repo-picker components.
+//
+// Build-side fields:
+//
+//   - DockerfilePath: path to the Dockerfile *within the context*
+//     directory. Defaults to "Dockerfile". Use e.g. "docker/Dockerfile"
+//     when the operator's repo keeps Dockerfiles in a subfolder.
+//   - ContextPath: subfolder of the cloned repo to use as the build
+//     context. Defaults to "" (repo root). Use e.g. "./api" when the
+//     repo's Dockerfile lives next to a backend service in a monorepo.
+//
+// Runtime-side fields:
+//
+//   - Port: container port the workload listens on. Required.
+//   - Healthcheck: optional curl-style probe; empty disables.
+//
+// Env vars and volume mounts are handled out-of-band via the
+// workload_env and workload_volumes tables, mirroring the image source.
+type Config struct {
+	Provider       string `json:"provider"`         // "gitea" | "github" | "gitlab"; "" = autodetect
+	BaseURL        string `json:"base_url"`         // e.g. https://git.example.com
+	RepoOwner      string `json:"repo_owner"`       // required; Validate rejects blank/whitespace-only
+	RepoName       string `json:"repo_name"`        // required; Validate rejects blank/whitespace-only
+	Branch         string `json:"branch"`           // presumably the branch to build — TODO confirm in deploy()
+	ContextPath    string `json:"context_path"`     // path within repo (root by default)
+	DockerfilePath string `json:"dockerfile_path"`  // relative to context_path; "Dockerfile" by default
+	AccessToken    string `json:"access_token"`     // encrypted; optional for public repos
+
+	Port        int    `json:"port"` // required; Validate enforces 1..65535
+	Healthcheck string `json:"healthcheck,omitempty"`
+}
+
+type source struct{} // stateless: every method receives its inputs as arguments
+
+// Eager registration — the deploy pipeline lives entirely inside this
+// package, so the kind is usable as soon as init() fires.
+func init() { plugin.RegisterSource(&source{}) }
+
+func (*source) Kind() string { return "dockerfile" } // the kind string this source is looked up by
+
+// SchemaSample returns an example Config: a Gitea-hosted repo on
+// branch "main", the default Dockerfile name, ContextPath left empty
+// (repo root) and port 8080. The sample is accepted by Validate once
+// marshalled to JSON.
+func (*source) SchemaSample() any {
+	var sample Config
+	sample.Provider, sample.BaseURL = "gitea", "https://git.example.com"
+	sample.RepoOwner, sample.RepoName = "owner", "myservice"
+	sample.Branch, sample.DockerfilePath = "main", "Dockerfile"
+	sample.Port = 8080
+	return sample
+}
+
+// Validate is the save-time gate for a dockerfile source config: it
+// requires a JSON body with non-blank repo_owner/repo_name, a port in
+// 1..65535, and Dockerfile/context paths that stay inside the repo.
+func (*source) Validate(cfg json.RawMessage) error {
+	if len(cfg) == 0 {
+		return fmt.Errorf("dockerfile source: config is required")
+	}
+	var parsed Config
+	if err := json.Unmarshal(cfg, &parsed); err != nil {
+		return fmt.Errorf("dockerfile source: invalid json: %w", err)
+	}
+	owner, repo := strings.TrimSpace(parsed.RepoOwner), strings.TrimSpace(parsed.RepoName)
+	if owner == "" || repo == "" {
+		return fmt.Errorf("dockerfile source: repo_owner and repo_name are required")
+	}
+	if parsed.Port < 1 || parsed.Port > 65535 {
+		return fmt.Errorf("dockerfile source: port must be between 1 and 65535 (got %d)", parsed.Port)
+	}
+	// Defense in depth: an absolute path or anything containing ".."
+	// could point outside the build context. The deploy-time helpers
+	// re-check this; rejecting here gives the operator a clear error
+	// when saving instead of a confusing failure mid-build. The ".."
+	// test is a plain substring match, so it is deliberately strict.
+	for _, rel := range []string{parsed.DockerfilePath, parsed.ContextPath} {
+		switch {
+		case rel == "":
+			// unset: nothing to check
+		case strings.HasPrefix(rel, "/"):
+			return fmt.Errorf("dockerfile source: %q must be relative", rel)
+		case strings.Contains(rel, ".."):
+			return fmt.Errorf("dockerfile source: %q must not contain '..'", rel)
+		}
+	}
+	return nil
+}
+
+func (*source) Deploy(ctx context.Context, deps plugin.Deps, w plugin.Workload, intent plugin.DeploymentIntent) error {
+	return deploy(ctx, deps, w, intent) // thin adapter onto the package-level deploy pipeline
+}
+
+func (*source) Teardown(ctx context.Context, deps plugin.Deps, w plugin.Workload) error {
+	return teardown(ctx, deps, w) // thin adapter onto the package-level teardown
+}
+
+func (*source) Reconcile(ctx context.Context, deps plugin.Deps, w plugin.Workload) error {
+	return reconcile(ctx, deps, w) // thin adapter onto the package-level reconcile
+}
@@ -0,0 +1,288 @@
+package dockerfile
+
+import (
+	"encoding/json"
+	"os"
+	"path/filepath"
+	"strings"
+	"testing"
+
+	"github.com/alexei/tinyforge/internal/workload/plugin"
+)
+
+// ── Source interface plumbing ───────────────────────────────────────
+
+func TestSource_Kind(t *testing.T) {
+	if got := (&source{}).Kind(); got != "dockerfile" {
+		t.Fatalf("Kind = %q, want \"dockerfile\"", got)
+	}
+}
+
+func TestSource_Registered_AtInit(t *testing.T) {
+	// The package init() registers the source as an import side effect,
+	// so a lookup by kind must succeed and hand back the matching
+	// source. A failure points at plugin.RegisterSource or our init.
+	src, err := plugin.GetSource("dockerfile")
+	if err != nil {
+		t.Fatalf("GetSource(dockerfile): %v", err)
+	}
+	if kind := src.Kind(); kind != "dockerfile" {
+		t.Fatalf("registered source has wrong kind: %q", kind)
+	}
+}
+
+func TestSource_SchemaSample_RoundTrips(t *testing.T) {
+	var src source
+	raw, err := json.Marshal(src.SchemaSample())
+	if err != nil {
+		t.Fatalf("marshal sample: %v", err)
+	}
+	if verr := src.Validate(raw); verr != nil {
+		t.Fatalf("Validate(sample) = %v, want nil", verr)
+	}
+}
+
+// ── Validate ────────────────────────────────────────────────────────
+
+func TestValidate_RejectsEmpty(t *testing.T) {
+	if (&source{}).Validate(nil) == nil {
+		t.Fatal("expected error on empty config, got nil")
+	}
+}
+
+func TestValidate_RejectsMissingRepo(t *testing.T) {
+	for i, c := range []Config{
+		{RepoName: "x", Port: 80},                  // owner missing
+		{RepoOwner: "y", Port: 80},                 // name missing
+		{RepoOwner: "  ", RepoName: "x", Port: 80}, // owner whitespace-only
+	} {
+		raw, _ := json.Marshal(c)
+		if (&source{}).Validate(raw) != nil {
+			continue
+		}
+		t.Errorf("case %d: expected error, got nil", i)
+	}
+}
+
+func TestValidate_RejectsBadPort(t *testing.T) {
+	for _, bad := range []int{0, -1, 70000} {
+		raw, _ := json.Marshal(Config{RepoOwner: "a", RepoName: "b", Port: bad})
+		if (&source{}).Validate(raw) == nil {
+			t.Errorf("port %d: expected error, got nil", bad)
+		}
+	}
+}
+
+func TestValidate_RejectsPathEscape(t *testing.T) {
+	escape := func(dockerfile, ctxPath string) Config {
+		return Config{RepoOwner: "a", RepoName: "b", Port: 80, DockerfilePath: dockerfile, ContextPath: ctxPath}
+	}
+	for i, c := range []Config{
+		escape("/etc/passwd", ""), escape("../../etc/passwd", ""),
+		escape("", "../../"), escape("", "/etc"),
+	} {
+		raw, _ := json.Marshal(c)
+		if (&source{}).Validate(raw) == nil {
+			t.Errorf("case %d: expected path-escape rejection, got nil", i)
+		}
+	}
+}
+
+func TestValidate_AcceptsValid(t *testing.T) {
+	// Nested (non-default) Dockerfile and context paths must be accepted.
+	var valid Config
+	valid.RepoOwner, valid.RepoName, valid.Port = "owner", "repo", 8080
+	valid.DockerfilePath = "docker/Dockerfile"
+	valid.ContextPath = "services/api"
+	raw, _ := json.Marshal(valid)
+	err := (&source{}).Validate(raw)
+	if err != nil {
+		t.Fatalf("Validate(valid) = %v", err)
+	}
+}
+
+// ── Naming helpers ──────────────────────────────────────────────────
+
+func TestNaming_SameNameDifferentIDs_NoCollision(t *testing.T) {
+	first := plugin.Workload{ID: "aaaaaaaa-rest", Name: "svc"}
+	second := plugin.Workload{ID: "bbbbbbbb-rest", Name: "svc"}
+	if name := containerNameFor(first); name == containerNameFor(second) {
+		t.Errorf("container names collide: %q", name)
+	}
+	if tag := imageTagFor(first); tag == imageTagFor(second) {
+		t.Errorf("image tags collide: %q", tag)
+	}
+}
+
+func TestNaming_ShortIDsPassThrough(t *testing.T) {
+	name := containerNameFor(plugin.Workload{ID: "abc", Name: "tiny"})
+	if !strings.HasSuffix(name, "-abc") {
+		t.Errorf("container name lost short id: %q", name)
+	}
+}
+
+// ── Context + Dockerfile resolution ─────────────────────────────────
+
+func TestResolveContextDir_Empty_ReturnsRoot(t *testing.T) {
+	root := t.TempDir()
+	resolved, _ := filepath.EvalSymlinks(root)
+	switch got, err := resolveContextDir(root, ""); {
+	case err != nil:
+		t.Fatalf("resolveContextDir: %v", err)
+	case got != resolved && got != root:
+		t.Errorf("got %q, want %q (or symlink-resolved equivalent)", got, root)
+	}
+}
+
+func TestResolveContextDir_Subfolder_OK(t *testing.T) {
+	root := t.TempDir()
+	if err := os.MkdirAll(filepath.Join(root, "api"), 0o755); err != nil {
+		t.Fatalf("mkdir: %v", err)
+	}
+	// A plain relative subfolder that exists inside the clone is accepted.
+	got, err := resolveContextDir(root, "api")
+	if err != nil {
+		t.Fatalf("resolveContextDir: %v", err)
+	}
+	if !strings.HasSuffix(got, "api") {
+		t.Errorf("got %q, expected suffix 'api'", got)
+	}
+}
+
+func TestResolveContextDir_NonexistentSubfolder(t *testing.T) {
+	_, err := resolveContextDir(t.TempDir(), "missing")
+	if err == nil {
+		t.Fatal("expected error for missing subfolder")
+	}
+}
+
+func TestResolveContextDir_RejectsEscape(t *testing.T) {
+	// Validate is the first wall; resolveContextDir is the second and
+	// must hold even when Validate was bypassed (e.g. a direct DB
+	// edit). The escape is a symlink planted inside the clone that
+	// points at a directory outside it.
+	root, outside := t.TempDir(), t.TempDir()
+	planted := filepath.Join(root, "escape")
+	if err := os.Symlink(outside, planted); err != nil {
+		t.Skipf("symlink unsupported in this environment: %v", err)
+	}
+	_, err := resolveContextDir(root, "escape")
+	if err == nil {
+		t.Fatal("expected escape-path rejection")
+	}
+}
+
+func TestVerifyDockerfileExists_Present(t *testing.T) {
+	ctxDir := t.TempDir()
+	target := filepath.Join(ctxDir, "Dockerfile")
+	if err := os.WriteFile(target, []byte("FROM scratch\n"), 0o644); err != nil {
+		t.Fatalf("write: %v", err)
+	} else if err = verifyDockerfileExists(ctxDir, ""); err != nil {
+		t.Fatalf("verifyDockerfileExists(default) = %v, want nil", err)
+	}
+}
+
+func TestVerifyDockerfileExists_Missing(t *testing.T) {
+	err := verifyDockerfileExists(t.TempDir(), "")
+	if err == nil {
+		t.Fatal("expected error for missing Dockerfile")
+	}
+}
+
+func TestVerifyDockerfileExists_CustomPath(t *testing.T) {
+	ctxDir := t.TempDir()
+	sub := filepath.Join(ctxDir, "docker")
+	if err := os.MkdirAll(sub, 0o755); err != nil {
+		t.Fatalf("mkdir: %v", err)
+	}
+	if err := os.WriteFile(filepath.Join(sub, "Dockerfile.prod"), []byte("FROM scratch\n"), 0o644); err != nil {
+		t.Fatalf("write: %v", err)
+	} else if err = verifyDockerfileExists(ctxDir, "docker/Dockerfile.prod"); err != nil {
+		t.Fatalf("verifyDockerfileExists(custom) = %v, want nil", err)
+	}
+}
+
+func TestVerifyDockerfileExists_RejectsAbsolutePath(t *testing.T) {
+	err := verifyDockerfileExists(t.TempDir(), "/etc/passwd")
+	if err == nil {
+		t.Fatal("expected error for absolute dockerfile path")
+	}
+}
+
+// ── Sanitiser ───────────────────────────────────────────────────────
+
+func TestSanitizeError_RedactsToken(t *testing.T) {
+	const secret = "ghp_supersecret"
+	out := sanitizeError("401 from gitea token="+secret+" ok", secret)
+	if strings.Contains(out, secret) {
+		t.Errorf("token leaked: %q", out)
+	}
+	if !strings.Contains(out, "[REDACTED]") {
+		t.Errorf("missing [REDACTED] marker: %q", out)
+	}
+}
+
+func TestSanitizeError_CollapsesWhitespace(t *testing.T) {
+	out := sanitizeError("a\nb\rc\td", "")
+	if strings.IndexAny(out, "\n\r\t") >= 0 {
+		t.Errorf("did not collapse: %q", out)
+	}
+}
+
+func TestSanitizeError_TruncatesUTF8Safe(t *testing.T) {
+	// One ASCII byte followed by 2-byte runes puts every rune start on
+	// an odd offset, so the 240-byte cap lands on a continuation byte
+	// and the walk-back path is genuinely exercised. (An input made
+	// only of 2-byte runes is cut on a rune boundary and proves nothing.)
+	long := "x" + strings.Repeat("é", 1000)
+	got := sanitizeError(long, "")
+	if !strings.HasSuffix(got, "…") {
+		t.Errorf("missing ellipsis: %q", got)
+	}
+	// A torn rune would leave a lead byte directly before the ellipsis,
+	// which the validity walk reports as a broken sequence.
+	if !isValidUTF8Slice([]byte(got)) {
+		t.Errorf("truncation produced broken UTF-8: %q", got)
+	}
+}
+
+// isValidUTF8Slice checks lead/continuation byte structure (not overlong or surrogate forms).
+func isValidUTF8Slice(b []byte) bool {
+	for i := 0; i < len(b); {
+		var need int // continuation bytes that must follow the lead byte
+		switch lead := b[i]; {
+		case lead < 0x80:
+		case lead < 0xC0:
+			return false // continuation byte at sequence start
+		case lead < 0xE0:
+			need = 1
+		case lead < 0xF0:
+			need = 2
+		case lead < 0xF8:
+			need = 3
+		default:
+			return false // 0xF8..0xFF never start a UTF-8 sequence
+		}
+		for k := 1; k <= need; k++ {
+			if i+k >= len(b) || b[i+k]&0xC0 != 0x80 {
+				return false // truncated sequence or non-continuation byte
+			}
+		}
+		i += need + 1
+	}
+	return true
+}
+
+// ── State row ID ────────────────────────────────────────────────────
+
+func TestContainerRowID_Deterministic(t *testing.T) {
+	// Two calls for the same workload must agree and carry the kind suffix.
+	w := plugin.Workload{ID: "abcd1234-rest"}
+	first, second := containerRowID(w), containerRowID(w)
+	if first != second {
+		t.Errorf("containerRowID not deterministic: %q vs %q", first, second)
+	}
+	if !strings.HasSuffix(first, ":dockerfile") {
+		t.Errorf("containerRowID missing suffix: %q", first)
+	}
+}
@@ -0,0 +1,37 @@
+package dockerfile
+
+import (
+	"log/slog"
+
+	"github.com/alexei/tinyforge/internal/crypto"
+	"github.com/alexei/tinyforge/internal/workload/plugin"
+)
+
+// buildEnv turns the workload's workload_env rows into "KEY=VALUE"
+// strings. Encrypted values are decrypted with deps.EncKey; a row that
+// fails to decrypt is logged and skipped so one bad entry cannot fail
+// the whole deploy (the original author notes the static plugin does
+// the same). If the rows cannot be listed at all, the error is logged
+// and nil is returned.
+func buildEnv(deps plugin.Deps, workloadID string) []string {
+	rows, err := deps.Store.ListWorkloadEnv(workloadID)
+	if err != nil {
+		slog.Warn("dockerfile source: list workload env", "workload", workloadID, "error", err)
+		return nil
+	}
+	env := make([]string, 0, len(rows))
+	for _, row := range rows {
+		if !row.Encrypted {
+			env = append(env, row.Key+"="+row.Value)
+			continue
+		}
+		plain, derr := crypto.Decrypt(deps.EncKey, row.Value)
+		if derr != nil {
+			slog.Warn("dockerfile source: decrypt env value",
+				"workload", workloadID, "key", row.Key, "error", derr)
+			continue
+		}
+		env = append(env, row.Key+"="+plain)
+	}
+	return env
+}
@@ -0,0 +1,141 @@
+package dockerfile
+
+import (
+	"errors"
+	"fmt"
+	"os"
+	"path/filepath"
+	"strings"
+)
+
+// resolveContextDir returns the absolute, symlink-resolved directory
+// to use as the Docker build context, or an error if it would fall
+// outside cloneRoot, does not exist, or is not a directory.
+//
+// ctx is the configured context path: "", "." or "./" select the clone
+// root itself; anything else is joined onto it (slash-separated input
+// is accepted, e.g. "services/api").
+func resolveContextDir(cloneRoot, ctx string) (string, error) {
+	root, err := filepath.Abs(cloneRoot)
+	if err != nil {
+		return "", fmt.Errorf("abs cloneRoot: %w", err)
+	}
+	if resolved, rerr := filepath.EvalSymlinks(root); rerr == nil {
+		root = resolved
+	}
+	switch ctx {
+	case "", ".", "./":
+		return root, nil
+	}
+	target, err := filepath.Abs(filepath.Join(root, filepath.FromSlash(ctx)))
+	if err != nil {
+		return "", fmt.Errorf("abs candidate: %w", err)
+	}
+	// Resolve symlinks before the containment check so a symlink
+	// planted inside the clone cannot point the context elsewhere.
+	if resolved, rerr := filepath.EvalSymlinks(target); rerr == nil {
+		target = resolved
+	}
+	inside := target == root || strings.HasPrefix(target, root+string(filepath.Separator))
+	if !inside {
+		return "", fmt.Errorf("context path %q escapes clone root", ctx)
+	}
+	info, err := os.Stat(target)
+	switch {
+	case err != nil:
+		return "", fmt.Errorf("stat context_path %q: %w", ctx, err)
+	case !info.IsDir():
+		return "", fmt.Errorf("context_path %q is not a directory", ctx)
+	}
+	return target, nil
+}
+
+// verifyDockerfileExists checks, before the build starts, that
+// dockerfilePath (default "Dockerfile", relative to contextDir) names
+// an existing non-directory entry, so the operator gets a focused
+// error up front.
+//
+// Absolute paths and any path containing ".." are rejected outright.
+// NOTE(review): os.Stat follows symlinks; a symlinked Dockerfile is not containment-checked here.
+func verifyDockerfileExists(contextDir, dockerfilePath string) error {
+	rel := dockerfilePath
+	if rel == "" {
+		rel = "Dockerfile"
+	}
+	if strings.HasPrefix(rel, "/") || strings.Contains(rel, "..") {
+		return fmt.Errorf("dockerfile_path %q must be relative and contain no '..'", rel)
+	}
+	info, err := os.Stat(filepath.Join(contextDir, filepath.FromSlash(rel)))
+	switch {
+	case errors.Is(err, os.ErrNotExist):
+		return fmt.Errorf("Dockerfile not found at %s/%s", filepath.Base(contextDir), rel)
+	case err != nil:
+		return fmt.Errorf("stat Dockerfile %q: %w", rel, err)
+	case info.IsDir():
+		return fmt.Errorf("dockerfile_path %q points at a directory, not a file", rel)
+	}
+	return nil
+}
+
+// sanitizeError is the common-case sanitiser for error text that is
+// persisted as last_error or sent in a notification: it redacts
+// accessToken, flattens newlines/tabs and caps the length by
+// delegating to sanitizeErrorWithSecrets with no env secrets. The
+// original author notes it mirrors the static plugin's helper.
+func sanitizeError(msg, accessToken string) string {
+	return sanitizeErrorWithSecrets(msg, accessToken, nil)
+}
+
+// sanitizeErrorWithSecrets redacts accessToken and every env VALUE of
+// four or more bytes (envKV entries are "KEY=VALUE", split on the first
+// '='; shorter values would false-match ordinary words), flattens
+// \n, \r and \t to spaces, and caps the result at 240 bytes plus "…".
+// It exists because captured container output can echo decrypted env
+// secrets into operator-visible state such as last_error.
+func sanitizeErrorWithSecrets(msg, accessToken string, envKV []string) string {
+	if msg == "" {
+		return ""
+	}
+	secrets := make([]string, 0, len(envKV)+1)
+	if accessToken != "" {
+		secrets = append(secrets, accessToken)
+	}
+	for _, kv := range envKV {
+		if eq := strings.IndexByte(kv, '='); eq >= 0 && len(kv)-eq-1 >= 4 {
+			secrets = append(secrets, kv[eq+1:])
+		}
+	}
+	// Redact longest-first (insertion sort keeps this import-free): if
+	// one secret contains another, shortest-first would leak its tail.
+	for i := 1; i < len(secrets); i++ {
+		for j := i; j > 0 && len(secrets[j]) > len(secrets[j-1]); j-- {
+			secrets[j], secrets[j-1] = secrets[j-1], secrets[j]
+		}
+	}
+	for _, s := range secrets {
+		msg = strings.ReplaceAll(msg, s, "[REDACTED]")
+	}
+	msg = strings.Map(func(r rune) rune {
+		switch r {
+		case '\n', '\r', '\t':
+			return ' '
+		}
+		return r
+	}, msg)
+	const maxLen = 240
+	if len(msg) > maxLen {
+		// Walk back to a rune start so a multi-byte char is never torn.
+		cut := maxLen
+		for cut > 0 && !isRuneStart(msg[cut]) {
+			cut--
+		}
+		msg = msg[:cut] + "…"
+	}
+	return msg
+}
+
+// isRuneStart reports whether b can begin a UTF-8 sequence, i.e. it is
+// not a continuation byte of the form 10xxxxxx.
+func isRuneStart(b byte) bool {
+	return b < 0x80 || b >= 0xC0
+}
@@ -0,0 +1,32 @@
+package dockerfile
+
+import (
+	"fmt"
+
+	"github.com/alexei/tinyforge/internal/workload/plugin"
+)
+
+// idShort returns the first 8 bytes of the workload ID, or the whole
+// ID when it is shorter than that. Per the original author, workload
+// names are not unique, so this suffix is what keeps two same-named
+// workloads from sharing container/image names.
+func idShort(w plugin.Workload) string {
+	if len(w.ID) >= 8 {
+		return w.ID[:8]
+	}
+	return w.ID
+}
+
+// containerNameFor builds the deterministic container name
+// "tf-build-<name>-<short id>"; the tf-build- prefix marks a
+// dockerfile-built container in `docker ps`.
+func containerNameFor(w plugin.Workload) string {
+	return "tf-build-" + w.Name + "-" + idShort(w)
+}
+
+// imageTagFor builds the deterministic image tag
+// "tf-build-<name>-<short id>:latest", i.e. the container name's shape
+// plus a fixed ":latest" tag.
+func imageTagFor(w plugin.Workload) string {
+	return "tf-build-" + w.Name + "-" + idShort(w) + ":latest"
+}
@@ -0,0 +1,72 @@
+package dockerfile
+
+import (
+	"context"
+	"log/slog"
+
+	"github.com/alexei/tinyforge/internal/store"
+	"github.com/alexei/tinyforge/internal/workload/plugin"
+)
+
+// reconcile brings the container row's state column in line with what
+// Docker reports for this workload's single container. A workload whose
+// runtime status is "deployed" but whose container cannot be queried or
+// is no longer running is flipped to "failed" and an event is published.
+// Store and save failures are only logged; the sole error returned is a
+// loadState failure. A workload with no recorded container is a no-op.
+//
+// Nothing is rebuilt automatically: the operator redeploys explicitly.
+// Any future auto-redeploy should sit behind a per-workload toggle, or a
+// crash loop would burn CPU rebuilding the same broken commit forever.
+func reconcile(ctx context.Context, deps plugin.Deps, w plugin.Workload) error {
+	state, row, err := loadState(deps, w)
+	switch {
+	case err != nil:
+		return err
+	case row == nil || row.ContainerID == "":
+		return nil
+	}
+	wasDeployed := state.Status == "deployed"
+
+	running, err := deps.Docker.IsContainerRunning(ctx, row.ContainerID)
+	if err != nil {
+		// Most likely "no such container": flag the row as missing and, for
+		// a deployed workload, record the regression as a failed status.
+		if serr := deps.Store.UpdateContainerState(row.ID, "missing"); serr != nil {
+			slog.Warn("dockerfile: mark missing", "workload", w.Name, "error", serr)
+		}
+		if wasDeployed {
+			if serr := saveState(deps, w, func(rs *runtimeState, c *store.Container) {
+				rs.Status = "failed"
+				rs.LastError = "container not found"
+				c.State = "missing"
+			}); serr != nil {
+				slog.Warn("dockerfile: persist missing-state", "workload", w.Name, "error", serr)
+			}
+			publishEvent(deps, w, "failed: container not found")
+		}
+		return nil
+	}
+
+	desired := "stopped"
+	if running {
+		desired = "running"
+	}
+	if row.State != desired {
+		if serr := deps.Store.UpdateContainerState(row.ID, desired); serr != nil {
+			slog.Warn("dockerfile: state sync", "workload", w.Name, "error", serr)
+		}
+	}
+	if running || !wasDeployed {
+		return nil
+	}
+	if serr := saveState(deps, w, func(rs *runtimeState, c *store.Container) {
+		rs.Status = "failed"
+		rs.LastError = "container stopped unexpectedly"
+		c.State = "stopped"
+	}); serr != nil {
+		slog.Warn("dockerfile: persist crashed-state", "workload", w.Name, "error", serr)
+	}
+	publishEvent(deps, w, "failed: container stopped unexpectedly")
+	return nil
+}
@@ -0,0 +1,179 @@
+package dockerfile
+
+import (
+	"encoding/json"
+	"errors"
+	"fmt"
+	"log/slog"
+	"sync"
+
+	"github.com/alexei/tinyforge/internal/store"
+	"github.com/alexei/tinyforge/internal/workload/plugin"
+)
+
+// runtimeState is the per-workload state persisted as JSON inside the
+// container row's extra_json blob (loadState decodes it, saveState
+// re-encodes it). It mirrors the static plugin's runtimeState shape so
+// the two kinds read identically in the DB. Every field is omitempty,
+// so an empty value is simply absent from the stored blob.
+//
+// LastImageDigest is the build's image ID — distinct from a registry
+// digest (we never push) — kept for future "did the artifact change?" diffs.
+type runtimeState struct {
+	LastCommitSHA   string `json:"last_commit_sha,omitempty"`
+	LastImageDigest string `json:"last_image_digest,omitempty"`
+	LastSyncAt      string `json:"last_sync_at,omitempty"`
+	LastError       string `json:"last_error,omitempty"`
+	Status          string `json:"status,omitempty"`
+}
+
+// runtimeStateKeys lists every JSON field name owned by runtimeState and
+// must stay in sync with its struct tags. saveState deletes these keys
+// from the decoded extra_json map before merging the freshly marshalled
+// state, so an omitempty field that became empty drops its stale value.
+var runtimeStateKeys = []string{
+	"last_commit_sha", "last_image_digest", "last_sync_at", "last_error", "status",
+}
+
+// containerRowID is the deterministic, redeploy-stable row ID saveState upserts.
+func containerRowID(w plugin.Workload) string {
+	const kindSuffix = ":dockerfile"
+	return w.ID + kindSuffix
+}
+
+// loadState fetches the workload's container row and decodes its extra_json
+// into runtimeState; a missing row yields zero state, nil row, nil error.
+func loadState(deps plugin.Deps, w plugin.Workload) (runtimeState, *store.Container, error) {
+	var state runtimeState
+	row, err := deps.Store.GetContainerByID(containerRowID(w))
+	switch {
+	case errors.Is(err, store.ErrNotFound):
+		return state, nil, nil
+	case err != nil:
+		return state, nil, fmt.Errorf("dockerfile source: load state: %w", err)
+	}
+	if raw := row.ExtraJSON; raw != "" && raw != "{}" {
+		if err := json.Unmarshal([]byte(raw), &state); err != nil {
+			slog.Debug("dockerfile source: decode extra_json", "workload", w.ID, "error", err)
+		}
+	}
+	return state, &row, nil
+}
+
+// saveLocks serializes per-workload read-modify-write of the container
+// row. Same pattern as the static plugin — SQLite's MaxOpenConns=1
+// serializes statements but not the caller's read-then-write intent, so
+// two concurrent deploys for the same workload could stomp each other's
+// container_id / proxy_route_id without this mutex. mu guards the locks
+// map and every entry's refs count.
+//
+// Entries are reference-counted and removed only when the last holder
+// releases. That bounds memory without deleting an entry another caller
+// still holds (or is about to lock), which would let a fresh saveState
+// mint a SECOND mutex for the same workload and lose the serialization.
+var saveLocks struct {
+	mu    sync.Mutex
+	locks map[string]*saveLock
+}
+
+// saveLock is one workload's mutex plus its holder/waiter count (refs).
+type saveLock struct {
+	mu   sync.Mutex
+	refs int
+}
+
+// acquireSaveLock looks up (or lazily creates) the lock entry for workloadID,
+// counts the caller as a holder while still under saveLocks.mu, and then
+// blocks on the entry's own mutex. Every call must be paired with
+// releaseSaveLock; the outer mutex is never held while waiting.
+func acquireSaveLock(workloadID string) *saveLock {
+	saveLocks.mu.Lock()
+	if saveLocks.locks == nil {
+		saveLocks.locks = make(map[string]*saveLock)
+	}
+	entry := saveLocks.locks[workloadID]
+	if entry == nil {
+		entry = new(saveLock)
+		saveLocks.locks[workloadID] = entry
+	}
+	entry.refs++
+	saveLocks.mu.Unlock()
+	entry.mu.Lock()
+	return entry
+}
+
+// releaseSaveLock unlocks l and drops the caller's reference, deleting the
+// map entry once refs reaches zero. refs only changes under saveLocks.mu and
+// a waiter bumps it before blocking, so an entry someone is still waiting
+// on is never removed.
+func releaseSaveLock(workloadID string, l *saveLock) {
+	l.mu.Unlock()
+
+	saveLocks.mu.Lock()
+	defer saveLocks.mu.Unlock()
+	if l.refs--; l.refs == 0 {
+		delete(saveLocks.locks, workloadID)
+	}
+}
+
+// saveState upserts the container row under the per-workload save lock,
+// calling mutate so callers can adjust both the typed runtime state and the
+// row's first-class fields before the single write. Keys in extra_json that
+// runtimeState does not own survive the round-trip, so other writers can
+// extend the blob. Returns the first load, marshal, or upsert error.
+func saveState(deps plugin.Deps, w plugin.Workload, mutate func(*runtimeState, *store.Container)) error {
+	lk := acquireSaveLock(w.ID)
+	defer releaseSaveLock(w.ID, lk)
+
+	prev, prevRow, err := loadState(deps, w)
+	if err != nil {
+		return err
+	}
+
+	row := store.Container{
+		ID:           containerRowID(w),
+		WorkloadID:   w.ID,
+		WorkloadKind: string(store.WorkloadKindBuild),
+		Host:         "local",
+	}
+	if prevRow != nil {
+		row = *prevRow
+	}
+
+	generic := map[string]json.RawMessage{}
+	if row.ExtraJSON != "" && row.ExtraJSON != "{}" {
+		if err := json.Unmarshal([]byte(row.ExtraJSON), &generic); err != nil {
+			slog.Debug("dockerfile source: decode extra_json (generic)", "workload", w.ID, "error", err)
+		}
+	}
+	for _, k := range runtimeStateKeys {
+		delete(generic, k)
+	}
+
+	state := prev
+	mutate(&state, &row)
+
+	typedBytes, err := json.Marshal(state)
+	if err != nil {
+		return fmt.Errorf("dockerfile source: marshal state: %w", err)
+	}
+	// Decode the typed fields straight into generic: Unmarshal keeps the
+	// surviving unknown keys, overwrites owned ones, and allocates the map
+	// when a stored JSON `null` left generic nil — assigning into that nil
+	// map by hand would panic.
+	if err := json.Unmarshal(typedBytes, &generic); err != nil {
+		return fmt.Errorf("dockerfile source: re-decode typed state: %w", err)
+	}
+
+	merged, err := json.Marshal(generic)
+	if err != nil {
+		return fmt.Errorf("dockerfile source: marshal merged state: %w", err)
+	}
+	row.ExtraJSON = string(merged)
+	row.LastSeenAt = store.Now()
+
+	if err := deps.Store.UpsertContainer(row); err != nil {
+		return fmt.Errorf("dockerfile source: upsert container row: %w", err)
+	}
+	return nil
+}
@@ -0,0 +1,51 @@
+package dockerfile
+
+import (
+	"context"
+	"errors"
+	"log/slog"
+
+	"github.com/alexei/tinyforge/internal/store"
+	"github.com/alexei/tinyforge/internal/workload/plugin"
+)
+
+// teardown removes every artifact deploy created, in order: the proxy
+// route, the running container, then the container index row. It is
+// idempotent — a workload with no container row is a no-op. Cleanup
+// failures are logged, not returned; only a loadState error is returned.
+//
+// The built image tag is left in place: removing it would invalidate
+// the docker build cache and force the next deploy to rebuild from
+// scratch. Operators can prune images via Settings → Prune Images.
+func teardown(ctx context.Context, deps plugin.Deps, w plugin.Workload) error {
+	_, row, err := loadState(deps, w)
+	switch {
+	case err != nil:
+		return err
+	case row == nil:
+		return nil
+	}
+
+	// Drop the proxy route first so traffic stops landing on a container
+	// that is about to disappear.
+	if routeID := row.ProxyRouteID; routeID != "" {
+		if derr := deps.Proxy.DeleteRoute(ctx, routeID); derr != nil {
+			slog.Warn("dockerfile: failed to remove proxy route", "workload", w.Name, "error", derr)
+		}
+	}
+
+	if containerID := row.ContainerID; containerID != "" {
+		if rerr := deps.Docker.RemoveContainer(ctx, containerID, true); rerr != nil {
+			slog.Warn("dockerfile: failed to remove container", "workload", w.Name, "error", rerr)
+		}
+	}
+
+	err = deps.Store.DeleteContainer(row.ID)
+	if err != nil && !errors.Is(err, store.ErrNotFound) {
+		slog.Warn("dockerfile: failed to delete container row", "workload", w.Name, "error", err)
+	}
+	// The per-workload save lock is reference-counted and frees itself when
+	// its last holder releases, so teardown must not delete it: doing so
+	// could race a concurrent saveState and break the RMW serialization.
+	return nil
+}
@@ -444,22 +444,12 @@ func updateStatus(deps plugin.Deps, w plugin.Workload, status, commitSHA, errMsg
 }

 // dispatchSiteNotification fires a site_sync_success or
-// site_sync_failure event to the configured outbound webhook.
-// Resolution: per-workload URL+secret first, then fall through to
-// settings.notification_url/secret. Always best-effort.
+// site_sync_failure event for the workload via the shared multi-route
+// dispatcher in plugin.DispatchNotificationForWorkload. Resolution
+// order (workload_notifications → legacy single URL → settings global)
+// is identical to the dockerfile plugin's path so receivers see
+// consistent fan-out behaviour across source kinds.
 func dispatchSiteNotification(deps plugin.Deps, w plugin.Workload, domain, status, errMsg string) {
-	if deps.Notifier == nil {
-		return
-	}
-	settings, err := deps.Store.GetSettings()
-	if err != nil {
-		slog.Warn("static site: notify settings lookup failed", "site", w.ID, "error", err)
-		return
-	}
-	url, secret, tier := resolveSiteTarget(w, settings)
-	if url == "" {
-		return
-	}
 	eventType := "site_sync_success"
 	if status == "failed" {
 		eventType = "site_sync_failure"
@@ -468,7 +458,7 @@ func dispatchSiteNotification(deps plugin.Deps, w plugin.Workload, domain, statu
 	if domain != "" {
 		siteURL = "https://" + domain
 	}
-	deps.Notifier.SendSigned(url, secret, tier, notify.Event{
+	plugin.DispatchNotificationForWorkload(deps, w, notify.Event{
 		Type:    eventType,
 		Project: w.Name,
 		URL:     siteURL,
@@ -476,16 +466,6 @@ func dispatchSiteNotification(deps plugin.Deps, w plugin.Workload, domain, statu
 	})
 }

-// resolveSiteTarget mirrors the legacy resolveSiteTarget helper but
-// reads notification config off the workload row (where it now lives
-// post-refactor) rather than the static_sites row.
-func resolveSiteTarget(w plugin.Workload, settings store.Settings) (string, string, notify.Tier) {
-	if w.NotificationURL != "" {
-		return w.NotificationURL, w.NotificationSecret, notify.TierSite
-	}
-	return settings.NotificationURL, settings.NotificationSecret, notify.TierSettings
-}
-
 // publishEvent emits a static_site_status event on the bus AND
 // persists an event_log row so the dashboard's audit trail picks it
 // up. Message format ("Static site \"%s\": %s") is preserved verbatim
@@ -165,30 +165,42 @@ func TestContainerRowID_Deterministic(t *testing.T) {
 	}
 }

-func TestLockFor_ReturnsSameLockForSameWorkload(t *testing.T) {
-	// Suffix by t.Name() so the package-global saveLocks map cannot
-	// bleed key state between tests (or between -count=N runs).
+func TestSaveLock_FreedWhenIdle(t *testing.T) {
+	// After the last holder releases, the reference-counted entry must be
+	// removed from the map so the lock table cannot grow without bound.
+	// Suffix by t.Name() so the package-global saveLocks map cannot bleed
+	// key state between tests (or between -count=N runs).
 	key := t.Name() + "-wid"
-	a := lockFor(key)
-	b := lockFor(key)
-	if a != b {
-		t.Fatalf("lockFor returned distinct locks for same workload: %p vs %p", a, b)
+	lk := acquireSaveLock(key)
+	saveLocks.mu.Lock()
+	_, present := saveLocks.locks[key]
+	saveLocks.mu.Unlock()
+	if !present {
+		t.Fatal("acquireSaveLock did not register the entry while held")
+	}
+	releaseSaveLock(key, lk)
+	saveLocks.mu.Lock()
+	_, stillPresent := saveLocks.locks[key]
+	saveLocks.mu.Unlock()
+	if stillPresent {
+		t.Fatal("releaseSaveLock left the entry behind after the last holder released")
 	}
 }

-func TestLockFor_ReturnsDistinctLocksForDifferentWorkloads(t *testing.T) {
-	a := lockFor(t.Name() + "-a")
-	b := lockFor(t.Name() + "-b")
-	if a == b {
-		t.Fatalf("lockFor returned same lock for different workloads: %p", a)
-	}
+func TestSaveLock_DistinctWorkloadsDoNotSerialize(t *testing.T) {
+	// Two different workloads must be lockable at the same time. If they
+	// shared a mutex the second acquire would block forever (deadlock).
+	a := acquireSaveLock(t.Name() + "-a")
+	b := acquireSaveLock(t.Name() + "-b")
+	releaseSaveLock(t.Name()+"-b", b)
+	releaseSaveLock(t.Name()+"-a", a)
 }

-func TestLockFor_SerializesConcurrentAcquisitions(t *testing.T) {
-	// Two goroutines holding the same lock must run sequentially. The
-	// counter would race past 2 if locking were broken; with the lock,
-	// the increment is observed monotonically.
-	lk := lockFor(t.Name() + "-wid")
+func TestSaveLock_SerializesConcurrentAcquisitions(t *testing.T) {
+	// Goroutines acquiring the same workload's lock must run sequentially.
+	// The counter would race past 1 if locking were broken; with the lock,
+	// peak in-flight stays at 1.
+	key := t.Name() + "-wid"
 	var (
 		wg      sync.WaitGroup
 		mu      sync.Mutex
@@ -199,8 +211,8 @@ func TestLockFor_SerializesConcurrentAcquisitions(t *testing.T) {
 		wg.Add(1)
 		go func() {
 			defer wg.Done()
-			lk.Lock()
-			defer lk.Unlock()
+			lk := acquireSaveLock(key)
+			defer releaseSaveLock(key, lk)

 			mu.Lock()
 			counter++
@@ -216,15 +228,15 @@ func TestLockFor_SerializesConcurrentAcquisitions(t *testing.T) {
 	}
 	wg.Wait()
 	if peak != 1 {
-		t.Fatalf("lockFor failed to serialize: peak in-flight = %d, want 1", peak)
+		t.Fatalf("acquireSaveLock failed to serialize: peak in-flight = %d, want 1", peak)
 	}
 }

-func TestLockFor_ConcurrentMapAccessIsSafe(t *testing.T) {
-	// Distinct workloads acquired in parallel must not panic on map
-	// access — exercises the outer-mutex protection inside lockFor.
-	// Each iteration uses a unique key so the test stresses the
-	// insertion path (the common case for "first deploy" callers).
+func TestSaveLock_ConcurrentMapAccessIsSafe(t *testing.T) {
+	// Distinct workloads acquired+released in parallel must not panic on map
+	// access — exercises the outer-mutex protection inside acquire/release.
+	// Each iteration uses a unique key so the test stresses the insertion +
+	// refcount-cleanup paths (the common case for "first deploy" callers).
 	prefix := t.Name() + "-"
 	var wg sync.WaitGroup
 	for i := 0; i < 50; i++ {
@@ -232,9 +244,9 @@ func TestLockFor_ConcurrentMapAccessIsSafe(t *testing.T) {
 		wg.Add(1)
 		go func() {
 			defer wg.Done()
-			lk := lockFor(prefix + strconv.Itoa(i))
-			lk.Lock()
-			lk.Unlock()
+			key := prefix + strconv.Itoa(i)
+			lk := acquireSaveLock(key)
+			releaseSaveLock(key, lk)
 		}()
 	}
 	wg.Wait()
@@ -80,26 +80,55 @@ func loadState(deps plugin.Deps, w plugin.Workload) (runtimeState, *store.Contai
 // container_id / proxy_route_id and orphaning Docker resources. The
 // mutex caps the concurrency at 1 per workload; cross-workload
 // parallelism is unaffected.
+//
+// Entries are reference-counted and removed only when the last holder
+// releases. This bounds memory (no per-workload-ID leak) WITHOUT the
+// use-after-delete hazard of deleting an entry on teardown: deleting a
+// live entry while a concurrent saveState still holds (or is about to
+// lock) it would let a fresh saveState mint a SECOND mutex for the same
+// workload, losing the RMW serialization the lock exists to provide.
 var saveLocks struct {
 	mu    sync.Mutex
-	locks map[string]*sync.Mutex
+	locks map[string]*saveLock
 }

-// lockFor returns the per-workload mutex, creating it on first use.
-// The outer mutex is held only briefly during map lookup; the returned
-// per-workload lock is what callers actually contend on.
-func lockFor(workloadID string) *sync.Mutex {
+type saveLock struct {
+	mu   sync.Mutex
+	refs int
+}
+
+// acquireSaveLock returns the per-workload lock (creating it on first use),
+// registers this caller as a holder, and takes the lock. Pair with
+// releaseSaveLock. The outer mutex is held only for the bookkeeping; callers
+// contend on the returned per-workload lock.
+func acquireSaveLock(workloadID string) *saveLock {
 	saveLocks.mu.Lock()
-	defer saveLocks.mu.Unlock()
 	if saveLocks.locks == nil {
-		saveLocks.locks = map[string]*sync.Mutex{}
+		saveLocks.locks = map[string]*saveLock{}
 	}
-	m, ok := saveLocks.locks[workloadID]
+	l, ok := saveLocks.locks[workloadID]
 	if !ok {
-		m = &sync.Mutex{}
-		saveLocks.locks[workloadID] = m
+		l = &saveLock{}
+		saveLocks.locks[workloadID] = l
 	}
-	return m
+	l.refs++
+	saveLocks.mu.Unlock()
+	l.mu.Lock()
+	return l
+}
+
+// releaseSaveLock unlocks and drops the caller's reference, removing the map
+// entry once no holders remain. Because refs is incremented under saveLocks.mu
+// before the entry can be observed for deletion, an entry with a pending
+// acquirer is never deleted.
+func releaseSaveLock(workloadID string, l *saveLock) {
+	l.mu.Unlock()
+	saveLocks.mu.Lock()
+	l.refs--
+	if l.refs == 0 {
+		delete(saveLocks.locks, workloadID)
+	}
+	saveLocks.mu.Unlock()
 }

 // saveState upserts the container row, calling mutate so callers can
@@ -115,9 +144,8 @@ func lockFor(workloadID string) *sync.Mutex {
 // Per-workload mutex serializes concurrent callers so two parallel
 // Deploys can't read the same prior state and race their writes.
 func saveState(deps plugin.Deps, w plugin.Workload, mutate func(*runtimeState, *store.Container)) error {
-	lk := lockFor(w.ID)
-	lk.Lock()
-	defer lk.Unlock()
+	lk := acquireSaveLock(w.ID)
+	defer releaseSaveLock(w.ID, lk)

 	prev, prevRow, err := loadState(deps, w)
 	if err != nil {
@@ -185,14 +185,23 @@ func TestSaveState_RecoversFromInvalidExtraJSON(t *testing.T) {
 	deps, _ := testDeps(t)
 	w := plugin.Workload{ID: t.Name() + "-wid", Name: "site"}

+	// UpsertContainer now validates extra_json at the boundary, so this
+	// test seeds a valid row first and corrupts it via raw SQL to
+	// simulate a pre-existing bad row from an upgrade / external edit.
 	if err := deps.Store.UpsertContainer(store.Container{
 		ID:           containerRowID(w),
 		WorkloadID:   w.ID,
 		WorkloadKind: string(store.WorkloadKindSite),
 		Host:         "local",
-		ExtraJSON:    `{not json`,
+		ExtraJSON:    `{}`,
 	}); err != nil {
-		t.Fatalf("seed bad row: %v", err)
+		t.Fatalf("seed row: %v", err)
+	}
+	if _, err := deps.Store.DB().Exec(
+		`UPDATE containers SET extra_json = ? WHERE id = ?`,
+		`{not json`, containerRowID(w),
+	); err != nil {
+		t.Fatalf("corrupt extra_json: %v", err)
 	}

 	err := saveState(deps, w, func(state *runtimeState, _ *store.Container) {
@@ -66,5 +66,8 @@ func teardown(ctx context.Context, deps plugin.Deps, w plugin.Workload) error {
 	if err := deps.Store.DeleteContainer(prevContainer.ID); err != nil && !errors.Is(err, store.ErrNotFound) {
 		slog.Warn("static site: failed to delete container row", "site", w.Name, "error", err)
 	}
+	// The per-workload save-mutex is reference-counted (see state.go) and
+	// frees itself when the last holder releases, so teardown no longer
+	// deletes it explicitly — doing so could race a concurrent saveState.
 	return nil
 }