package api

import (
	"context"
	"log/slog"
	"sync"
	"sync/atomic"

	"github.com/go-chi/chi/v5"

	"github.com/alexei/tinyforge/internal/auth"
	"github.com/alexei/tinyforge/internal/backup"
	"github.com/alexei/tinyforge/internal/crypto"
	"github.com/alexei/tinyforge/internal/dns"
	"github.com/alexei/tinyforge/internal/docker"
	"github.com/alexei/tinyforge/internal/events"
	"github.com/alexei/tinyforge/internal/notify"
	"github.com/alexei/tinyforge/internal/npm"
	"github.com/alexei/tinyforge/internal/proxy"
	"github.com/alexei/tinyforge/internal/stale"
	"github.com/alexei/tinyforge/internal/store"
	"github.com/alexei/tinyforge/internal/volsnap"
	"github.com/alexei/tinyforge/internal/webhook"
	"github.com/alexei/tinyforge/internal/workload/plugin"
)

type (
	// DNSProviderChangedFunc is the callback signature invoked when DNS
	// settings change, so the caller can swap the provider on the deployer.
	DNSProviderChangedFunc func(provider dns.Provider)

	// PluginDispatcher is the slice of the deployer that the API layer
	// depends on for the plugin-native dispatch surface (the generic-hooks
	// endpoint, workload teardown, and future surfaces). It is declared in
	// this package so the API does not import the deployer package directly.
	PluginDispatcher interface {
		webhook.PluginDispatcher

		// DispatchTeardown tears down the given workload.
		DispatchTeardown(ctx context.Context, w plugin.Workload) error
	}
)

// Server holds all dependencies for the API layer.
type Server struct { store *store.Store docker *docker.Client npm *npm.Client // optional: only for NPM-specific endpoints (certificates) proxyProvider proxy.Provider deployer PluginDispatcher notifier *notify.Notifier webhook *webhook.Handler eventBus *events.Bus encKey [32]byte localAuth *auth.LocalAuth oidcProvider *auth.OIDCProvider staleScanner *stale.Scanner dnsProviderMu sync.RWMutex dnsProvider dns.Provider onDNSProviderChanged DNSProviderChangedFunc backupEngine *backup.Engine snapshotEngine *volsnap.Engine sseGate *sseGate logScanReloader LogScanReloader dbPath string shutdownFunc func() // called after restore to trigger graceful shutdown onBackupSettingsChanged func(enabled bool, intervalHours int) // called when backup settings change onProxyProviderChanged func(provider proxy.Provider) // called when proxy provider changes // restoreInFlight is a process-wide guard against double-firing // the restore endpoint. A rapid double-click would otherwise // schedule two goroutines racing s.store.Close() and the // candidate-over-live rename. CAS to true at the entry point; // reject the second caller with 409 Conflict. restoreInFlight atomic.Bool } // NewServer creates a new API Server with all required dependencies. func NewServer( st *store.Store, dockerClient *docker.Client, npmClient *npm.Client, proxyProvider proxy.Provider, deployer PluginDispatcher, notifier *notify.Notifier, webhookHandler *webhook.Handler, eventBus *events.Bus, encKey [32]byte, ) *Server { localAuth := auth.NewLocalAuth(encKey) s := &Server{ store: st, docker: dockerClient, npm: npmClient, proxyProvider: proxyProvider, deployer: deployer, notifier: notifier, webhook: webhookHandler, eventBus: eventBus, encKey: encKey, localAuth: localAuth, sseGate: newSSEGate(maxConcurrentSSEStreams), } // Try to initialize OIDC provider from stored settings. 
authSettings, err := st.GetAuthSettings() if err == nil && authSettings.AuthMode == "oidc" && authSettings.OIDCIssuerURL != "" { s.initOIDCProvider(context.Background(), authSettings) } return s } // SetStaleScanner sets the stale scanner on the server. // Called after both the API server and scanner are initialized. func (s *Server) SetStaleScanner(scanner *stale.Scanner) { s.staleScanner = scanner } // SetBackupEngine sets the backup engine on the server. func (s *Server) SetBackupEngine(engine *backup.Engine) { s.backupEngine = engine } // SetSnapshotEngine sets the volume-snapshot engine on the server. func (s *Server) SetSnapshotEngine(engine *volsnap.Engine) { s.snapshotEngine = engine } // SetDBPath sets the database file path (needed for restore). func (s *Server) SetDBPath(path string) { s.dbPath = path } // SetShutdownFunc sets the function called after a restore to trigger graceful shutdown. func (s *Server) SetShutdownFunc(fn func()) { s.shutdownFunc = fn } // SetBackupSettingsChangedCallback sets the callback for when backup settings change. func (s *Server) SetBackupSettingsChangedCallback(fn func(enabled bool, intervalHours int)) { s.onBackupSettingsChanged = fn } // SetProxyProviderChangedCallback sets the callback for when the proxy provider changes. func (s *Server) SetProxyProviderChangedCallback(fn func(provider proxy.Provider)) { s.onProxyProviderChanged = fn } // SetProxyProvider updates the proxy provider at runtime. func (s *Server) SetProxyProvider(provider proxy.Provider) { s.proxyProvider = provider } // SetDNSProvider sets the current DNS provider on the server. func (s *Server) SetDNSProvider(provider dns.Provider) { s.dnsProviderMu.Lock() defer s.dnsProviderMu.Unlock() s.dnsProvider = provider } // getDNSProviderLocked returns the current DNS provider under read lock. 
func (s *Server) getDNSProviderLocked() dns.Provider { s.dnsProviderMu.RLock() defer s.dnsProviderMu.RUnlock() return s.dnsProvider } // SetDNSProviderChangedCallback sets the callback for when DNS settings change. func (s *Server) SetDNSProviderChangedCallback(fn DNSProviderChangedFunc) { s.onDNSProviderChanged = fn } // initOIDCProvider creates an OIDC provider from settings. Errors are logged, not fatal. func (s *Server) initOIDCProvider(ctx context.Context, as store.AuthSettings) { // Decrypt the OIDC client secret. The prior code did a try-decrypt // and silently treated failures as plaintext — under a rotated key // that sent ciphertext upstream to the OP. Now: // - If the value carries the tf1: envelope → fail loud on // decrypt failure (rotated key / corrupted ciphertext). // - If the value is unprefixed (legacy ciphertext from v0 or true // plaintext from an old migration) → try decrypt; on failure // accept as plaintext (the only safe legacy interpretation). clientSecret := as.OIDCClientSecret if clientSecret != "" { switch { case crypto.HasEnvelope(clientSecret): decrypted, err := crypto.Decrypt(s.encKey, clientSecret) if err != nil { slog.Error("OIDC client secret could not be decrypted — refusing to initialize provider", "error", err, "hint", "rotate ENCRYPTION_KEY back, OR re-save OIDC settings to re-encrypt with the current key") return } clientSecret = decrypted default: // Legacy v0 value: try decrypt; on failure assume plaintext. if decrypted, err := crypto.Decrypt(s.encKey, clientSecret); err == nil { clientSecret = decrypted } } } provider, err := auth.NewOIDCProvider(ctx, auth.OIDCConfig{ IssuerURL: as.OIDCIssuerURL, ClientID: as.OIDCClientID, ClientSecret: clientSecret, RedirectURL: as.OIDCRedirectURL, }) if err != nil { slog.Warn("failed to initialize OIDC provider", "error", err) return } s.oidcProvider = provider slog.Info("OIDC provider initialized", "issuer", as.OIDCIssuerURL) } // Router returns a chi router with all API routes mounted. 
func (s *Server) Router() chi.Router { r := chi.NewRouter() // Global middleware. requestID runs first so every downstream log // line (and the access log emitted by `logging`) carries the same // correlation id, plus the response carries it back on the // X-Request-ID header for the operator to grep across services. r.Use(requestID) r.Use(recovery) r.Use(securityHeaders) r.Use(logging) r.Use(cors) // Unauthenticated health probes — mounted at the root so container // orchestrators / load balancers can hit them without knowing about // the /api prefix. /livez intentionally does no work and stays // unbounded; /readyz pings the DB and is rate-limited to keep an // unauthenticated flood from serialising behind SQLite's single // writer connection (busy-timeout = 5s) and log-amplifying every // request via the structured access log. The 10-per-minute budget // is the existing rateLimiter default — generous for k8s readiness // probes (typically every 5-10s), restrictive for an attacker. r.Get("/livez", s.livez) readyLimiter := newRateLimiter() r.With(rateLimitMiddleware(readyLimiter)).Get("/readyz", s.readyz) loginLimiter := newRateLimiter() webhookLimiter := newRateLimiter() r.Route("/api", func(r chi.Router) { // JSON content type and body size limit for API routes. r.Use(jsonContentType) r.Use(limitBody) // Public auth endpoints (no auth required). r.Get("/auth/mode", s.authMode) r.Post("/auth/login", s.rateLimitedLogin(loginLimiter)) r.Get("/auth/oidc/login", s.oidcLogin) r.Get("/auth/oidc/callback", s.oidcCallback) r.Post("/auth/oidc/token", s.oidcExchangeToken) // Webhook handler (uses its own secret-based auth). // Per-IP rate limit prevents an attacker who has guessed (or leaked) // a secret from triggering a deploy storm, and rejects unauthenticated // brute-force probes over the secret URL space. r.With(rateLimitMiddleware(webhookLimiter)).Mount("/webhook", s.webhook.Route()) // Protected routes: require valid JWT. 
r.Group(func(r chi.Router) { r.Use(auth.Middleware(s.localAuth)) // Plugin registry inspection + unified ingress. r.Get("/hooks/kinds", s.listHookKinds) r.Get("/hooks/kinds/{kind}/schema", s.getHookKindSchema) r.With(auth.AdminOnly).Post("/hooks/generic", s.dispatchGeneric) // Workload-creation discovery helpers: provider probe, // connection test, repo / branch / tree browsers, and // image-source conflict detection. Admin-gated because // they accept an access token + can enumerate other // workloads' images. r.Group(func(r chi.Router) { r.Use(auth.AdminOnly) r.Post("/discovery/git/detect-provider", s.detectGitProvider) r.Post("/discovery/git/test-connection", s.testGitConnection) r.Post("/discovery/git/repos", s.listGitRepos) r.Post("/discovery/git/branches", s.listGitBranches) r.Post("/discovery/git/tree", s.listGitTree) r.Get("/discovery/image/conflicts", s.listImageConflicts) r.Post("/discovery/image/inspect", s.inspectImageMetadata) }) // Read-only endpoints (any authenticated user). r.Get("/health", s.getHealth) r.Get("/auth/me", s.currentUser) r.Post("/auth/logout", s.logout) r.Get("/proxies", s.listProxyRoutes) r.Get("/docker/unused-images", s.unusedImageStats) r.Get("/events", s.streamEvents) r.Get("/events/log", s.listEventLog) r.Get("/events/log/stats", s.getEventLogStats) r.Get("/registries", s.listRegistries) r.Route("/registries/{id}", func(r chi.Router) { // All registry probes are admin-gated. The /tags and // /images endpoints used to be open to any authenticated // user, but they make outbound requests using the // admin-encrypted registry token — a viewer could // effectively drive arbitrary requests against a private // registry under admin credentials. 
r.Use(auth.AdminOnly) r.Get("/tags/*", s.listRegistryTags) r.Get("/images", s.listRegistryImages) r.Put("/", s.updateRegistry) r.Delete("/", s.deleteRegistry) r.Post("/test", s.testRegistry) }) r.Get("/settings", s.getSettings) r.Get("/settings/npm-certificates", s.listNpmCertificates) r.Get("/settings/npm-access-lists", s.listNpmAccessLists) // Volume scope metadata (read-only). r.Get("/volumes/scopes", s.listVolumeScopes) // Stale container endpoints (read). r.Get("/containers/stale", s.listStaleContainers) // Workload-shaped endpoints — the canonical surface after the // hard cutover. Reads open to any authenticated user; mutations // admin-gated. r.Get("/workloads", s.listWorkloads) r.With(auth.AdminOnly).Post("/workloads", s.createPluginWorkload) r.Route("/workloads/{id}", func(r chi.Router) { r.Get("/", s.getWorkload) r.Get("/containers", s.listWorkloadContainers) r.Get("/containers/{cid}/logs", s.streamWorkloadContainerLogs) r.With(auth.AdminOnly).Patch("/app", s.updateWorkloadAppID) r.With(auth.AdminOnly).Put("/plugin", s.updatePluginWorkload) r.With(auth.AdminOnly).Post("/deploy", s.deployPluginWorkload) r.With(auth.AdminOnly).Post("/stop", s.stopPluginWorkload) r.With(auth.AdminOnly).Post("/start", s.startPluginWorkload) r.With(auth.AdminOnly).Delete("/", s.deletePluginWorkload) // Deploy ledger + rollback. The history feed is read-only // (any authenticated user); rollback is a redeploy, so it is // admin-gated like /deploy. r.Get("/deploys", s.listWorkloadDeploys) r.With(auth.AdminOnly).Post("/rollback", s.rollbackWorkload) // Volume snapshots (admin-only). Capture/list a workload's // host-bind data volumes; {sid}-scoped download/delete live // in the global admin group alongside backups. r.With(auth.AdminOnly).Get("/snapshots", s.listWorkloadSnapshots) r.With(auth.AdminOnly).Get("/snapshotable", s.getWorkloadSnapshotable) r.With(auth.AdminOnly).Post("/snapshots", s.createWorkloadSnapshot) // Runtime view: per-source persisted state + storage usage. 
// Read-only; safe for any authenticated user. r.Get("/runtime-state", s.getWorkloadRuntimeState) r.Get("/storage", s.getWorkloadStorage) // Per-workload metrics history (CPU/memory time-series), // aggregated across the workload's containers. Read-only. r.Get("/stats/history", s.getWorkloadStatsHistory) // Per-workload activity / deploy timeline (read-only). Scoped // to this workload's event-log rows; the global feed lives at // /events/log. r.Get("/events", s.listWorkloadEvents) // Per-workload env vars. Listing open to authenticated readers; // mutations admin-gated. Encrypted values are write-only after store. r.Get("/env", s.listWorkloadEnv) r.With(auth.AdminOnly).Put("/env", s.setWorkloadEnv) r.With(auth.AdminOnly).Delete("/env/{envID}", s.deleteWorkloadEnv) // Per-workload inbound webhook URL handlers were dropped in // the hard legacy cutover; inbound webhooks are now first- // class Triggers reachable via /api/triggers/{id}/webhook. // Per-workload volume mounts. r.Get("/volumes", s.listWorkloadVolumes) r.With(auth.AdminOnly).Put("/volumes", s.setWorkloadVolume) r.With(auth.AdminOnly).Delete("/volumes/{volID}", s.deleteWorkloadVolume) // Stages chain: parent + self + direct children, plus a // promote-from action that copies the source workload's // running image tag onto this workload's default_tag. r.Get("/chain", s.getWorkloadChain) r.With(auth.AdminOnly).Post("/promote-from/{sourceID}", s.promoteFromWorkload) // Trigger bindings on this workload — the symmetric view // of /triggers/{id}/bindings keyed on the workload side. r.Get("/triggers", s.listBindingsForWorkload) r.With(auth.AdminOnly).Post("/triggers", s.bindTriggerToWorkload) // Per-workload notification routes — multi-destination // fan-out (Slack channel + Discord webhook + ...). When // zero rows are configured the dispatcher falls back to // the legacy single-URL columns on the workload row. 
r.Get("/notifications", s.listWorkloadNotifications) r.With(auth.AdminOnly).Post("/notifications", s.createWorkloadNotification) r.With(auth.AdminOnly).Put("/notifications/{nid}", s.updateWorkloadNotification) r.With(auth.AdminOnly).Delete("/notifications/{nid}", s.deleteWorkloadNotification) }) // Global container index, joined to workload + app names. r.Get("/containers", s.listAllContainers) r.Get("/containers/{id}", s.getContainer) // App grouping (optional UI; admin-gated mutations). r.Get("/apps", s.listApps) r.Get("/apps/{id}", s.getApp) r.Group(func(r chi.Router) { r.Use(auth.AdminOnly) r.Post("/apps", s.createApp) r.Put("/apps/{id}", s.updateApp) r.Delete("/apps/{id}", s.deleteApp) }) // First-class Triggers (redeploy signal sources). One trigger // fans out to many workloads via workload_trigger_bindings. r.Get("/triggers", s.listTriggers) r.Get("/triggers/{id}", s.getTrigger) r.Get("/triggers/{id}/bindings", s.listBindingsForTrigger) r.Group(func(r chi.Router) { r.Use(auth.AdminOnly) r.Post("/triggers", s.createTrigger) r.Put("/triggers/{id}", s.updateTrigger) r.Delete("/triggers/{id}", s.deleteTrigger) r.Get("/triggers/{id}/webhook", s.getTriggerWebhook) r.Post("/triggers/{id}/webhook/regenerate", s.regenerateTriggerWebhook) r.Post("/triggers/{id}/fire", s.fireTriggerNow) r.Post("/triggers/{id}/bindings", s.bindWorkloadToTrigger) r.Put("/bindings/{bid}", s.updateBinding) r.Delete("/bindings/{bid}", s.deleteBinding) }) // Event triggers: filter+action rules over the event_log stream. r.Get("/event-triggers", s.listEventTriggers) r.Get("/event-triggers/{id}", s.getEventTrigger) r.Group(func(r chi.Router) { r.Use(auth.AdminOnly) r.Post("/event-triggers", s.createEventTrigger) r.Patch("/event-triggers/{id}", s.updateEventTrigger) r.Delete("/event-triggers/{id}", s.deleteEventTrigger) r.Post("/event-triggers/{id}/test", s.testEventTrigger) }) // Log-scan rules. 
r.Get("/log-scan-rules", s.listLogScanRules) r.Get("/log-scan-rules/stats", s.getLogScanStats) r.Get("/log-scan-rules/{id}", s.getLogScanRule) r.Get("/workloads/{id}/effective-rules", s.getEffectiveLogScanRules) r.Group(func(r chi.Router) { r.Use(auth.AdminOnly) r.Post("/log-scan-rules", s.createLogScanRule) r.Patch("/log-scan-rules/{id}", s.updateLogScanRule) r.Delete("/log-scan-rules/{id}", s.deleteLogScanRule) r.Post("/log-scan-rules/{id}/test", s.testLogScanRule) }) // Metric-alert rules. r.Get("/metric-alert-rules", s.listMetricAlertRules) r.Get("/metric-alert-rules/{id}", s.getMetricAlertRule) r.Group(func(r chi.Router) { r.Use(auth.AdminOnly) r.Post("/metric-alert-rules", s.createMetricAlertRule) r.Patch("/metric-alert-rules/{id}", s.updateMetricAlertRule) r.Delete("/metric-alert-rules/{id}", s.deleteMetricAlertRule) }) // Shared secrets (env vars shared across workloads by scope). r.Get("/shared-secrets", s.listSharedSecrets) r.Get("/shared-secrets/{id}", s.getSharedSecret) r.Group(func(r chi.Router) { r.Use(auth.AdminOnly) r.Post("/shared-secrets", s.createSharedSecret) r.Patch("/shared-secrets/{id}", s.updateSharedSecret) r.Delete("/shared-secrets/{id}", s.deleteSharedSecret) }) // System resources (read-only). r.Get("/system/stats", s.getSystemStats) r.Get("/system/stats/history", s.getSystemStatsHistory) r.Get("/system/stats/top", s.listTopContainers) // Admin-only routes: require admin role. r.Group(func(r chi.Router) { r.Use(auth.AdminOnly) // Prometheus-format metrics export. Admin-only so the // counter cardinality cannot be enumerated by a low-trust // viewer to map internal endpoints / sources / outcomes. // Scrape with bearer auth from your Prometheus job. r.Get("/metrics", s.metricsExport) // Config export (reveals registry/global details). r.Get("/config/export", s.exportConfig) // Event log management. r.Delete("/events/log/{id}", s.deleteEvent) r.Delete("/events/log", s.clearEvents) // Auth management. 
r.Get("/auth/settings", s.getAuthSettings) r.Put("/auth/settings", s.updateAuthSettings) r.Get("/auth/users", s.listUsers) r.Post("/auth/users", s.createUser) r.Put("/auth/users/{uid}", s.updateUser) r.Put("/auth/users/{uid}/password", s.changePassword) r.Delete("/auth/users/{uid}", s.deleteUser) // Registry creation. r.Post("/registries", s.createRegistry) // Stale container cleanup endpoints. // Bulk route must be registered before parameterized route. r.Post("/containers/stale/cleanup", s.bulkCleanupStaleContainers) r.Post("/containers/stale/{id}/cleanup", s.cleanupStaleContainer) // Settings endpoints. r.Put("/settings", s.updateSettings) // Global outgoing-webhook signing & test. r.Get("/settings/notification-secret", s.getSettingsNotificationSecret) r.Post("/settings/notification-secret/regenerate", s.regenerateSettingsNotificationSecret) r.Post("/settings/notification-secret/disable", s.disableSettingsNotificationSigning) r.Post("/settings/notification-test", s.settingsNotificationTest) // Docker management. r.Post("/docker/prune-images", s.pruneImages) r.Post("/docker/prune-build-cache", s.pruneBuildCache) // NPM connection test. r.Post("/settings/npm/test", s.testNpmConnection) // DNS management endpoints. r.Post("/settings/dns/test", s.testDNSConnection) r.Post("/settings/dns/zones", s.listDNSZones) r.Get("/dns/records", s.listDNSRecords) r.Post("/dns/sync", s.syncDNSRecords) r.Delete("/dns/records/{fqdn}", s.deleteDNSRecord) // Backup endpoints. r.Get("/backups", s.listBackups) r.Post("/backups", s.triggerBackup) r.Get("/backups/{id}/download", s.downloadBackup) r.Delete("/backups/{id}", s.deleteBackup) r.Post("/backups/{id}/restore", s.restoreBackup) // Volume-snapshot download/delete (workload-scoped capture + // list live under /workloads/{id}/snapshots). r.Get("/snapshots/{sid}/download", s.downloadSnapshot) r.Delete("/snapshots/{sid}", s.deleteSnapshot) }) }) }) return r }