diff --git a/cmd/server/main.go b/cmd/server/main.go index b91e2e6..1842d2c 100644 --- a/cmd/server/main.go +++ b/cmd/server/main.go @@ -12,6 +12,8 @@ import ( "syscall" "time" + "github.com/robfig/cron/v3" + dockerwatcher "github.com/alexei/docker-watcher" "github.com/alexei/docker-watcher/internal/api" "github.com/alexei/docker-watcher/internal/auth" @@ -24,7 +26,9 @@ import ( "github.com/alexei/docker-watcher/internal/logging" "github.com/alexei/docker-watcher/internal/notify" "github.com/alexei/docker-watcher/internal/npm" + "github.com/alexei/docker-watcher/internal/proxy" "github.com/alexei/docker-watcher/internal/registry" + "github.com/alexei/docker-watcher/internal/stale" "github.com/alexei/docker-watcher/internal/store" "github.com/alexei/docker-watcher/internal/webhook" ) @@ -93,6 +97,21 @@ func main() { notifier := notify.New() eventBus := events.New() + // Auto-persist warn/error events from the event bus to the database. + stopLogger := eventBus.RegisterPersistentLogger(func(source, severity, message, metadata string) (int64, string, error) { + evt, err := db.InsertEvent(store.EventLog{ + Source: source, + Severity: severity, + Message: message, + Metadata: metadata, + }) + if err != nil { + return 0, "", err + } + return evt.ID, evt.CreatedAt, nil + }) + defer stopLogger() + dep := deployer.New(dockerClient, npmClient, db, healthChecker, notifier, eventBus, encKey) // Initialize webhook handler. @@ -115,8 +134,68 @@ func main() { } } + // Initialize stale container scanner. + staleScanner := stale.New(db, dockerClient, eventBus) + if err := staleScanner.Start("1h"); err != nil { + slog.Warn("failed to start stale scanner", "error", err) + } + + // Initialize proxy manager and health monitor. 
+ proxyManager := proxy.NewManager(db, npmClient) + proxyHealth := proxy.NewHealthMonitor(db, eventBus) + if err := proxyHealth.Start("5m"); err != nil { + slog.Warn("failed to start proxy health monitor", "error", err) + } + + // Start daily event log pruning cron job. + cronScheduler := cron.New() + if _, err := cronScheduler.AddFunc("@daily", func() { + pruned, err := db.PruneEvents(30) + if err != nil { + slog.Error("event log prune failed", "error", err) + return + } + if pruned > 0 { + slog.Info("pruned old event log entries", "count", pruned) + } + }); err != nil { + slog.Warn("failed to schedule event prune cron", "error", err) + } + cronScheduler.Start() + + // Subscribe to error events and forward notifications. + notifySub := eventBus.Subscribe(func(evt events.Event) bool { + if evt.Type != events.EventLog { + return false + } + p, ok := evt.Payload.(events.EventLogPayload) + if !ok { + return false + } + return p.Severity == "error" + }) + go func() { + for evt := range notifySub { + p, ok := evt.Payload.(events.EventLogPayload) + if !ok { + continue + } + currentSettings, err := db.GetSettings() + if err != nil || currentSettings.NotificationURL == "" { + continue + } + notifier.Send(currentSettings.NotificationURL, notify.Event{ + Type: p.Source + "_error", + Project: p.Source, + Error: p.Message, + }) + } + }() + // Build API server. apiServer := api.NewServer(db, dockerClient, npmClient, dep, webhookHandler, eventBus, encKey) + apiServer.SetStaleScanner(staleScanner) + apiServer.SetProxyManager(proxyManager) router := apiServer.Router() // Serve embedded static files for the SPA frontend. @@ -158,6 +237,10 @@ func main() { slog.Info("shutting down...") // Stop accepting new work. + cronScheduler.Stop() + eventBus.Unsubscribe(notifySub) + proxyHealth.Stop() + staleScanner.Stop() poller.Stop() // Drain in-progress deploys and notifications. 
diff --git a/internal/api/eventlog.go b/internal/api/eventlog.go
new file mode 100644
index 0000000..a4fd025
--- /dev/null
+++ b/internal/api/eventlog.go
@@ -0,0 +1,74 @@
+package api
+
+import (
+	"log/slog"
+	"net/http"
+	"strconv"
+
+	"github.com/alexei/docker-watcher/internal/store"
+)
+
+// parseEventLogPageParam parses an optional paging value (limit or offset)
+// taken from the query string. An empty value means the parameter was not
+// supplied and yields 0. The returned bool is false when raw is not a
+// base-10 integer or is negative.
+func parseEventLogPageParam(raw string) (int, bool) {
+	if raw == "" {
+		return 0, true
+	}
+	n, err := strconv.Atoi(raw)
+	if err != nil || n < 0 {
+		return 0, false
+	}
+	return n, true
+}
+
+// listEventLog handles GET /api/events/log.
+// Supports query parameters: severity, source, since, until, limit, offset.
+// limit and offset are optional; a value that is not a non-negative integer
+// is rejected with 400 instead of being silently treated as 0.
+func (s *Server) listEventLog(w http.ResponseWriter, r *http.Request) {
+	q := r.URL.Query()
+
+	limit, ok := parseEventLogPageParam(q.Get("limit"))
+	if !ok {
+		respondError(w, http.StatusBadRequest, "limit must be a non-negative integer")
+		return
+	}
+	offset, ok := parseEventLogPageParam(q.Get("offset"))
+	if !ok {
+		respondError(w, http.StatusBadRequest, "offset must be a non-negative integer")
+		return
+	}
+
+	filter := store.EventLogFilter{
+		Severity: q.Get("severity"),
+		Source:   q.Get("source"),
+		Since:    q.Get("since"),
+		Until:    q.Get("until"),
+		Limit:    limit,
+		Offset:   offset,
+	}
+
+	events, err := s.store.ListEvents(filter)
+	if err != nil {
+		slog.Error("failed to list events", "error", err)
+		respondError(w, http.StatusInternalServerError, "failed to list events")
+		return
+	}
+
+	respondJSON(w, http.StatusOK, events)
+}
+
+// getEventLogStats handles GET /api/events/log/stats.
+// It passes the result of s.store.GetEventStats to respondJSON, or answers 500 if that call fails.
+func (s *Server) getEventLogStats(w http.ResponseWriter, r *http.Request) {
+	stats, err := s.store.GetEventStats()
+	if err != nil {
+		slog.Error("failed to get event stats", "error", err)
+		respondError(w, http.StatusInternalServerError, "failed to get event stats")
+		return
+	}
+
+	respondJSON(w, http.StatusOK, stats)
+}
diff --git a/internal/api/instances.go b/internal/api/instances.go index 4b936bc..26eca47 100644 --- a/internal/api/instances.go +++ b/internal/api/instances.go @@ -196,6 +196,13 @@ func (s *Server) controlInstance(w http.ResponseWriter, r *http.Request, action slog.Error("update instance status", "instance_id", instanceID, "status", newStatus, "error", err) } + // Track last_alive_at when container becomes running.
+ if newStatus == "running" { + if err := s.store.UpdateLastAliveAt(instanceID); err != nil { + slog.Error("update last_alive_at", "instance_id", instanceID, "error", err) + } + } + respondJSON(w, http.StatusOK, map[string]string{ "instance_id": instanceID, "action": action,
diff --git a/internal/api/proxy.go b/internal/api/proxy.go
new file mode 100644
index 0000000..e5dd8a0
--- /dev/null
+++ b/internal/api/proxy.go
@@ -0,0 +1,199 @@
+package api
+
+import (
+	"context"
+	"log/slog"
+	"net/http"
+	"time"
+
+	"github.com/go-chi/chi/v5"
+
+	"github.com/alexei/docker-watcher/internal/proxy"
+)
+
+// validateProxy runs proxy.ValidateDestination for the requested host and
+// port and returns its result; no proxy is created.
+// POST /api/proxies/validate
+func (s *Server) validateProxy(w http.ResponseWriter, r *http.Request) {
+	var payload struct {
+		Host string `json:"host"`
+		Port int    `json:"port"`
+	}
+	if ok := decodeJSON(w, r, &payload); !ok {
+		return
+	}
+
+	switch {
+	case payload.Host == "":
+		respondError(w, http.StatusBadRequest, "host is required")
+		return
+	case payload.Port < 1 || payload.Port > 65535:
+		respondError(w, http.StatusBadRequest, "port must be between 1 and 65535")
+		return
+	}
+
+	// Derive a 30-second deadline from the request context for the validation call.
+	ctx, cancel := context.WithTimeout(r.Context(), 30*time.Second)
+	defer cancel()
+	respondJSON(w, http.StatusOK, proxy.ValidateDestination(ctx, payload.Host, payload.Port))
+}
+
+// createProxy creates a new standalone proxy.
+// POST /api/proxies
+func (s *Server) createProxy(w http.ResponseWriter, r *http.Request) {
+	if s.proxyManager == nil {
+		respondError(w, http.StatusServiceUnavailable, "proxy manager not configured")
+		return
+	}
+
+	var body proxy.CreateProxyRequest
+	if ok := decodeJSON(w, r, &body); !ok {
+		return
+	}
+
+	// Reject incomplete or out-of-range requests before calling the manager.
+	switch {
+	case body.Domain == "":
+		respondError(w, http.StatusBadRequest, "domain is required")
+		return
+	case body.DestinationURL == "":
+		respondError(w, http.StatusBadRequest, "destination_url is required")
+		return
+	case body.DestinationPort < 1 || body.DestinationPort > 65535:
+		respondError(w, http.StatusBadRequest, "destination_port must be between 1 and 65535")
+		return
+	}
+
+	created, err := s.proxyManager.CreateProxy(r.Context(), body)
+	if err != nil {
+		slog.Error("failed to create proxy", "domain", body.Domain, "error", err)
+		respondError(w, http.StatusInternalServerError, "failed to create proxy")
+		return
+	}
+
+	respondJSON(w, http.StatusCreated, created)
+}
+
+// listProxies responds with the result of proxyManager.ListProxies.
+// GET /api/proxies
+func (s *Server) listProxies(w http.ResponseWriter, r *http.Request) {
+	if s.proxyManager == nil {
+		respondError(w, http.StatusServiceUnavailable, "proxy manager not configured")
+		return
+	}
+
+	all, err := s.proxyManager.ListProxies()
+	if err != nil {
+		slog.Error("proxy operation failed", "error", err)
+		respondError(w, http.StatusInternalServerError, "proxy operation failed")
+		return
+	}
+
+	respondJSON(w, http.StatusOK, all)
+}
+
+// getProxy returns a single standalone proxy.
+// GET /api/proxies/{id}
+func (s *Server) getProxy(w http.ResponseWriter, r *http.Request) {
+	if s.proxyManager == nil {
+		respondError(w, http.StatusServiceUnavailable, "proxy manager not configured")
+		return
+	}
+
+	id := chi.URLParam(r, "id")
+	found, err := s.proxyManager.GetProxy(id)
+
+	// Translate the lookup outcome into the HTTP response.
+	switch {
+	case err == nil:
+		respondJSON(w, http.StatusOK, found)
+	case proxy.IsNotFound(err):
+		respondNotFound(w, "proxy")
+	default:
+		slog.Error("proxy operation failed", "error", err)
+		respondError(w, http.StatusInternalServerError, "proxy operation failed")
+	}
+}
+
+// updateProxy validates the request body and applies it to the proxy named by {id}.
+// PUT /api/proxies/{id}
+func (s *Server) updateProxy(w http.ResponseWriter, r *http.Request) {
+	if s.proxyManager == nil {
+		respondError(w, http.StatusServiceUnavailable, "proxy manager not configured")
+		return
+	}
+
+	id := chi.URLParam(r, "id")
+
+	var body proxy.UpdateProxyRequest
+	if ok := decodeJSON(w, r, &body); !ok {
+		return
+	}
+
+	// Reject incomplete or out-of-range requests before calling the manager.
+	switch {
+	case body.Domain == "":
+		respondError(w, http.StatusBadRequest, "domain is required")
+		return
+	case body.DestinationURL == "":
+		respondError(w, http.StatusBadRequest, "destination_url is required")
+		return
+	case body.DestinationPort < 1 || body.DestinationPort > 65535:
+		respondError(w, http.StatusBadRequest, "destination_port must be between 1 and 65535")
+		return
+	}
+
+	updated, err := s.proxyManager.UpdateProxy(r.Context(), id, body)
+
+	// Translate the update outcome into the HTTP response.
+	switch {
+	case err == nil:
+		respondJSON(w, http.StatusOK, updated)
+	case proxy.IsNotFound(err):
+		respondNotFound(w, "proxy")
+	default:
+		slog.Error("proxy operation failed", "error", err)
+		respondError(w, http.StatusInternalServerError, "proxy operation failed")
+	}
+}
+
+// deleteProxy removes a standalone proxy.
+// DELETE /api/proxies/{id}
+func (s *Server) deleteProxy(w http.ResponseWriter, r *http.Request) {
+	if s.proxyManager == nil {
+		respondError(w, http.StatusServiceUnavailable, "proxy manager not configured")
+		return
+	}
+
+	id := chi.URLParam(r, "id")
+
+	// Echo the id back on success; a missing proxy is reported via respondNotFound.
+	err := s.proxyManager.DeleteProxy(r.Context(), id)
+	switch {
+	case err == nil:
+		respondJSON(w, http.StatusOK, map[string]string{"deleted": id})
+	case proxy.IsNotFound(err):
+		respondNotFound(w, "proxy")
+	default:
+		slog.Error("proxy operation failed", "error", err)
+		respondError(w, http.StatusInternalServerError, "proxy operation failed")
+	}
+}
+
+// listAllProxies responds with the result of proxyManager.ListAllProxies.
+// GET /api/proxies/all
+func (s *Server) listAllProxies(w http.ResponseWriter, r *http.Request) {
+	if s.proxyManager == nil {
+		respondError(w, http.StatusServiceUnavailable, "proxy manager not configured")
+		return
+	}
+
+	merged, err := s.proxyManager.ListAllProxies()
+	if err != nil {
+		slog.Error("proxy operation failed", "error", err)
+		respondError(w, http.StatusInternalServerError, "proxy operation failed")
+		return
+	}
+
+	respondJSON(w, http.StatusOK, merged)
+}
diff --git a/internal/api/router.go b/internal/api/router.go index dfb0221..c1f01e4 100644 --- a/internal/api/router.go +++ b/internal/api/router.go @@ -11,6 +11,8 @@ import ( "github.com/alexei/docker-watcher/internal/docker" "github.com/alexei/docker-watcher/internal/events" "github.com/alexei/docker-watcher/internal/npm" + "github.com/alexei/docker-watcher/internal/proxy" + "github.com/alexei/docker-watcher/internal/stale" "github.com/alexei/docker-watcher/internal/store" "github.com/alexei/docker-watcher/internal/webhook" ) @@ -26,6 +28,8 @@ type Server struct { encKey [32]byte localAuth *auth.LocalAuth oidcProvider *auth.OIDCProvider + staleScanner *stale.Scanner + proxyManager *proxy.Manager } // NewServer creates a new API Server with all required dependencies.
@@ -60,6 +64,18 @@ func NewServer( return s } +// SetStaleScanner sets the stale scanner on the server. +// Called after both the API server and scanner are initialized. +func (s *Server) SetStaleScanner(scanner *stale.Scanner) { + s.staleScanner = scanner +} + +// SetProxyManager sets the proxy manager on the server. +// Called after both the API server and proxy manager are initialized. +func (s *Server) SetProxyManager(pm *proxy.Manager) { + s.proxyManager = pm +} + // initOIDCProvider creates an OIDC provider from settings. Errors are logged, not fatal. func (s *Server) initOIDCProvider(ctx context.Context, as store.AuthSettings) { // Decrypt the OIDC client secret if it's encrypted. @@ -120,36 +136,12 @@ func (s *Server) Router() chi.Router { r.Get("/", s.getProject) r.Get("/stages/{stage}/env", s.listStageEnv) r.Get("/stages/{stage}/instances", s.listInstances) + r.Get("/stages/{stage}/instances/{iid}/stats", s.getInstanceStats) r.Get("/volumes", s.listVolumes) - }) - r.Get("/deploys", s.listDeploys) - r.Get("/deploys/{id}/logs", s.streamDeployLogs) - r.Get("/events", s.streamEvents) - r.Get("/registries", s.listRegistries) - r.Route("/registries/{id}", func(r chi.Router) { - r.Get("/tags/*", s.listRegistryTags) - r.Get("/images", s.listRegistryImages) - }) - r.Get("/settings", s.getSettings) - r.Get("/settings/npm-certificates", s.listNpmCertificates) - // Admin-only routes: require admin role. - r.Group(func(r chi.Router) { - r.Use(auth.AdminOnly) - - // Config export (reveals project/infra details). - r.Get("/config/export", s.exportConfig) - - // Auth management. - r.Get("/auth/settings", s.getAuthSettings) - r.Put("/auth/settings", s.updateAuthSettings) - r.Get("/auth/users", s.listUsers) - r.Post("/auth/users", s.createUser) - r.Delete("/auth/users/{uid}", s.deleteUser) - - // Project mutation endpoints. - r.Post("/projects", s.createProject) - r.Route("/projects/{id}", func(r chi.Router) { + // Admin-only project mutations. 
+ r.Group(func(r chi.Router) { + r.Use(auth.AdminOnly) r.Put("/", s.updateProject) r.Delete("/", s.deleteProject) @@ -177,18 +169,76 @@ func (s *Server) Router() chi.Router { r.Put("/volumes/{volId}", s.updateVolume) r.Delete("/volumes/{volId}", s.deleteVolume) }) + }) + r.Get("/deploys", s.listDeploys) + r.Get("/deploys/{id}/logs", s.streamDeployLogs) + r.Get("/events", s.streamEvents) + r.Get("/events/log", s.listEventLog) + r.Get("/events/log/stats", s.getEventLogStats) + r.Get("/registries", s.listRegistries) + r.Route("/registries/{id}", func(r chi.Router) { + r.Get("/tags/*", s.listRegistryTags) + r.Get("/images", s.listRegistryImages) + + // Admin-only registry mutations. + r.Group(func(r chi.Router) { + r.Use(auth.AdminOnly) + r.Put("/", s.updateRegistry) + r.Delete("/", s.deleteRegistry) + r.Post("/test", s.testRegistry) + }) + }) + r.Get("/settings", s.getSettings) + r.Get("/settings/npm-certificates", s.listNpmCertificates) + + // Stale container endpoints (read). + r.Get("/containers/stale", s.listStaleContainers) + + // Proxy endpoints (read-only for any authenticated user). + r.Get("/proxies", s.listProxies) + r.Get("/proxies/all", s.listAllProxies) + r.Route("/proxies/{id}", func(r chi.Router) { + r.Get("/", s.getProxy) + // Admin-only proxy mutations. + r.Group(func(r chi.Router) { + r.Use(auth.AdminOnly) + r.Put("/", s.updateProxy) + r.Delete("/", s.deleteProxy) + }) + }) + + // Admin-only routes: require admin role. + r.Group(func(r chi.Router) { + r.Use(auth.AdminOnly) + + // Config export (reveals project/infra details). + r.Get("/config/export", s.exportConfig) + + // Auth management. + r.Get("/auth/settings", s.getAuthSettings) + r.Put("/auth/settings", s.updateAuthSettings) + r.Get("/auth/users", s.listUsers) + r.Post("/auth/users", s.createUser) + r.Delete("/auth/users/{uid}", s.deleteUser) + + // Project creation. + r.Post("/projects", s.createProject) // Quick deploy endpoints. 
r.Post("/deploy/inspect", s.inspectImage) r.Post("/deploy/quick", s.quickDeploy) - // Registry mutation endpoints. + // Registry creation. r.Post("/registries", s.createRegistry) - r.Route("/registries/{id}", func(r chi.Router) { - r.Put("/", s.updateRegistry) - r.Delete("/", s.deleteRegistry) - r.Post("/test", s.testRegistry) - }) + + // Proxy mutation endpoints. + r.Post("/proxies/validate", s.validateProxy) + r.Post("/proxies", s.createProxy) + + // Stale container cleanup endpoints. + // Bulk route must be registered before parameterized route. + r.Post("/containers/stale/cleanup", s.bulkCleanupStaleContainers) + r.Post("/containers/stale/{id}/cleanup", s.cleanupStaleContainer) // Settings endpoints. r.Put("/settings", s.updateSettings) diff --git a/internal/api/settings.go b/internal/api/settings.go index 22c6dbb..276dd45 100644 --- a/internal/api/settings.go +++ b/internal/api/settings.go @@ -24,7 +24,8 @@ type settingsRequest struct { NpmEmail string `json:"npm_email"` NpmPassword string `json:"npm_password"` PollingInterval string `json:"polling_interval"` - SSLCertificateID *int `json:"ssl_certificate_id,omitempty"` + SSLCertificateID *int `json:"ssl_certificate_id,omitempty"` + StaleThresholdDays *int `json:"stale_threshold_days,omitempty"` } // getSettings handles GET /api/settings. @@ -37,17 +38,18 @@ func (s *Server) getSettings(w http.ResponseWriter, r *http.Request) { // Return settings without sensitive fields. 
respondJSON(w, http.StatusOK, map[string]any{ - "domain": settings.Domain, - "server_ip": settings.ServerIP, - "network": settings.Network, - "subdomain_pattern": settings.SubdomainPattern, - "notification_url": settings.NotificationURL, - "npm_url": settings.NpmURL, - "npm_email": settings.NpmEmail, - "has_npm_password": settings.NpmPassword != "", - "polling_interval": settings.PollingInterval, - "ssl_certificate_id": settings.SSLCertificateID, - "updated_at": settings.UpdatedAt, + "domain": settings.Domain, + "server_ip": settings.ServerIP, + "network": settings.Network, + "subdomain_pattern": settings.SubdomainPattern, + "notification_url": settings.NotificationURL, + "npm_url": settings.NpmURL, + "npm_email": settings.NpmEmail, + "has_npm_password": settings.NpmPassword != "", + "polling_interval": settings.PollingInterval, + "ssl_certificate_id": settings.SSLCertificateID, + "stale_threshold_days": settings.StaleThresholdDays, + "updated_at": settings.UpdatedAt, }) } @@ -101,6 +103,13 @@ func (s *Server) updateSettings(w http.ResponseWriter, r *http.Request) { updated.SSLCertificateID = *req.SSLCertificateID sslChanged = true } + if req.StaleThresholdDays != nil { + if *req.StaleThresholdDays < 1 { + respondError(w, http.StatusBadRequest, "stale_threshold_days must be at least 1") + return + } + updated.StaleThresholdDays = *req.StaleThresholdDays + } if err := s.store.UpdateSettings(updated); err != nil { respondError(w, http.StatusInternalServerError, "failed to update settings: "+err.Error()) diff --git a/internal/api/sse.go b/internal/api/sse.go index 4882223..32b1538 100644 --- a/internal/api/sse.go +++ b/internal/api/sse.go @@ -150,9 +150,9 @@ func (s *Server) streamEvents(w http.ResponseWriter, r *http.Request) { w.WriteHeader(http.StatusOK) flusher.Flush() - // Subscribe to instance status and deploy status events. + // Subscribe to instance status, deploy status, and persistent event log events. 
 	sub := s.eventBus.Subscribe(func(evt events.Event) bool {
-		return evt.Type == events.EventInstanceStatus || evt.Type == events.EventDeployStatus
+		return evt.Type == events.EventInstanceStatus || evt.Type == events.EventDeployStatus || evt.Type == events.EventLog
 	})
 	defer s.eventBus.Unsubscribe(sub)
 
diff --git a/internal/api/stale.go b/internal/api/stale.go
new file mode 100644
index 0000000..340041e
--- /dev/null
+++ b/internal/api/stale.go
@@ -0,0 +1,176 @@
+package api
+
+import (
+	"errors"
+	"log/slog"
+	"net/http"
+
+	"github.com/go-chi/chi/v5"
+
+	"github.com/alexei/docker-watcher/internal/crypto"
+	"github.com/alexei/docker-watcher/internal/events"
+	"github.com/alexei/docker-watcher/internal/stale"
+	"github.com/alexei/docker-watcher/internal/store"
+)
+
+// listStaleContainers handles GET /api/containers/stale.
+//
+// Answers 503 while no stale scanner is attached to the server. Otherwise the
+// result of the scanner's FindStaleInstances is returned as a JSON array.
+func (s *Server) listStaleContainers(w http.ResponseWriter, r *http.Request) {
+	if s.staleScanner == nil {
+		respondError(w, http.StatusServiceUnavailable, "stale scanner not initialized")
+		return
+	}
+
+	staleInstances, err := s.staleScanner.FindStaleInstances(r.Context())
+	if err != nil {
+		slog.Error("failed to find stale containers", "error", err)
+		respondError(w, http.StatusInternalServerError, "failed to find stale containers")
+		return
+	}
+
+	// Swap a nil slice for an empty one so the body is "[]" rather than "null".
+	if staleInstances == nil {
+		staleInstances = []stale.StaleInstance{}
+	}
+	respondJSON(w, http.StatusOK, staleInstances)
+}
+
+// cleanupStaleContainer handles POST /api/containers/stale/{id}/cleanup.
+// Stops the Docker container, removes the NPM proxy, and deletes the instance from the store.
+func (s *Server) cleanupStaleContainer(w http.ResponseWriter, r *http.Request) {
+	id := chi.URLParam(r, "id")
+
+	target, lookupErr := s.store.GetInstanceByID(id)
+	switch {
+	case lookupErr == nil:
+		// Instance found; carry on with the cleanup below.
+	case errors.Is(lookupErr, store.ErrNotFound):
+		respondNotFound(w, "instance")
+		return
+	default:
+		slog.Error("failed to get instance", "instance_id", id, "error", lookupErr)
+		respondError(w, http.StatusInternalServerError, "failed to get instance")
+		return
+	}
+
+	// An instance already marked "removing" is being handled elsewhere;
+	// report a conflict instead of starting a second cleanup.
+	if target.Status == "removing" {
+		respondError(w, http.StatusConflict, "instance is already being removed")
+		return
+	}
+
+	cleanupErr := s.cleanupInstance(r, target)
+	if cleanupErr != nil {
+		slog.Error("failed to cleanup instance", "instance_id", id, "error", cleanupErr)
+		respondError(w, http.StatusInternalServerError, "failed to cleanup instance")
+		return
+	}
+
+	respondJSON(w, http.StatusOK, map[string]string{"cleaned": id})
+}
+
+// bulkCleanupStaleContainers handles POST /api/containers/stale/cleanup.
+// Cleans up all currently stale containers.
+func (s *Server) bulkCleanupStaleContainers(w http.ResponseWriter, r *http.Request) {
+	if s.staleScanner == nil {
+		respondError(w, http.StatusServiceUnavailable, "stale scanner not initialized")
+		return
+	}
+
+	staleInstances, err := s.staleScanner.FindStaleInstances(r.Context())
+	if err != nil {
+		slog.Error("failed to find stale containers for bulk cleanup", "error", err)
+		respondError(w, http.StatusInternalServerError, "failed to find stale containers")
+		return
+	}
+
+	// Start from empty, non-nil slices so the response always carries JSON
+	// arrays ("[]") rather than null when nothing was cleaned or nothing
+	// failed, matching what listStaleContainers does for its own result.
+	cleaned := []string{}
+	failed := []string{}
+
+	for _, si := range staleInstances {
+		// Skip instances that are already marked as being removed.
+		if si.Instance.Status == "removing" {
+			continue
+		}
+		if err := s.cleanupInstance(r, si.Instance); err != nil {
+			slog.Error("bulk stale cleanup failed",
+				"instance_id", si.Instance.ID, "error", err)
+			failed = append(failed, si.Instance.ID)
+			continue
+		}
+		cleaned = append(cleaned, si.Instance.ID)
+	}
+
+	respondJSON(w, http.StatusOK, map[string]any{
+		"cleaned": cleaned,
+		"failed":  failed,
+	})
+}
+
+// cleanupInstance stops a Docker container, removes the NPM proxy, deletes
+// the store record, and emits an event.
+//
+// Only a failure to delete the store record is returned. The status update,
+// the container stop/remove and the NPM proxy removal are best-effort: each
+// failure is logged with slog.Warn and the cleanup continues.
+//
+// NOTE(review): every step runs on the request context, so a client that
+// disconnects mid-cleanup may cancel the Docker/NPM calls while the store
+// record is still deleted — confirm whether that is acceptable.
+func (s *Server) cleanupInstance(r *http.Request, inst store.Instance) error {
+	ctx := r.Context()
+
+	// Mark as removing.
+	if err := s.store.UpdateInstanceStatus(inst.ID, "removing"); err != nil {
+		slog.Warn("stale cleanup: update status to removing", "instance_id", inst.ID, "error", err)
+	}
+
+	// Stop and remove Docker container.
+	if inst.ContainerID != "" {
+		if err := s.docker.StopContainer(ctx, inst.ContainerID, 10); err != nil {
+			slog.Warn("stale cleanup: stop container", "container_id", inst.ContainerID, "error", err)
+		}
+		if err := s.docker.RemoveContainer(ctx, inst.ContainerID, true); err != nil {
+			slog.Warn("stale cleanup: remove container", "container_id", inst.ContainerID, "error", err)
+		}
+	}
+
+	// Delete NPM proxy host if present. Each step that can fail is logged, so
+	// a proxy host left behind in NPM can be traced to its cause.
+	if inst.NpmProxyID > 0 {
+		settings, err := s.store.GetSettings()
+		if err != nil {
+			slog.Warn("stale cleanup: load settings for proxy removal", "proxy_id", inst.NpmProxyID, "error", err)
+		} else if npmPassword, err := crypto.Decrypt(s.encKey, settings.NpmPassword); err != nil {
+			slog.Warn("stale cleanup: decrypt NPM password", "proxy_id", inst.NpmProxyID, "error", err)
+		} else if err := s.npm.Authenticate(ctx, settings.NpmEmail, npmPassword); err != nil {
+			slog.Warn("stale cleanup: authenticate with NPM", "proxy_id", inst.NpmProxyID, "error", err)
+		} else if err := s.npm.DeleteProxyHost(ctx, inst.NpmProxyID); err != nil {
+			slog.Warn("stale cleanup: delete proxy host", "proxy_id", inst.NpmProxyID, "error", err)
+		}
+	}
+
+	// Delete instance record.
+	if err := s.store.DeleteInstance(inst.ID); err != nil {
+		return err
+	}
+
+	// Emit cleanup event.
+	s.emitStaleCleanupEvent(inst)
+
+	return nil
+}
+
+// emitStaleCleanupEvent publishes an event when a stale container is cleaned up.
+//
+// The event is first stored through InsertEvent and then published on the
+// event bus as an EventLog; if storing fails the error is logged and nothing
+// is published.
+func (s *Server) emitStaleCleanupEvent(inst store.Instance) {
+	msg := "Stale container cleaned up: " + inst.ID + " (tag: " + inst.ImageTag + ")"
+
+	// NOTE(review): the metadata JSON is assembled by plain concatenation, so
+	// an ID containing a quote or backslash would yield invalid JSON —
+	// presumably the IDs are generated identifiers; confirm.
+	evt, err := s.store.InsertEvent(store.EventLog{
+		Source:   "stale_cleanup",
+		Severity: "info",
+		Message:  msg,
+		Metadata: `{"instance_id":"` + inst.ID + `","project_id":"` + inst.ProjectID + `","stage_id":"` + inst.StageID + `"}`,
+	})
+	if err != nil {
+		slog.Error("stale cleanup: failed to persist event", "error", err)
+		return
+	}
+
+	s.eventBus.Publish(events.Event{
+		Type: events.EventLog,
+		Payload: events.EventLogPayload{
+			ID:        evt.ID,
+			Source:    "stale_cleanup",
+			Severity:  "info",
+			Message:   msg,
+			Metadata:  evt.Metadata,
+			CreatedAt: evt.CreatedAt,
+		},
+	})
+}
diff --git a/internal/api/stats.go b/internal/api/stats.go
new file mode 100644
index 0000000..f1e5ea2
--- /dev/null
+++ b/internal/api/stats.go
@@ -0,0 +1,42 @@
+package api
+
+import (
+	"errors"
+	"log/slog"
+	"net/http"
+
+	"github.com/go-chi/chi/v5"
+
+	"github.com/alexei/docker-watcher/internal/store"
+)
+
+// getInstanceStats handles GET /api/projects/{id}/stages/{stage}/instances/{iid}/stats.
+// Returns CPU and memory stats for the container backing the given instance.
+func (s *Server) getInstanceStats(w http.ResponseWriter, r *http.Request) {
+	// Only the {iid} path parameter is read here.
+	// NOTE(review): the {id} and {stage} path segments are not compared with
+	// the instance that is loaded — confirm that is intended.
+	instanceID := chi.URLParam(r, "iid")
+
+	inst, err := s.store.GetInstanceByID(instanceID)
+	if err != nil {
+		if errors.Is(err, store.ErrNotFound) {
+			respondNotFound(w, "instance")
+			return
+		}
+		slog.Error("failed to get instance", "instance_id", instanceID, "error", err)
+		respondError(w, http.StatusInternalServerError, "failed to get instance")
+		return
+	}
+
+	// Without a container ID there is nothing to query stats for.
+	if inst.ContainerID == "" {
+		respondError(w, http.StatusBadRequest, "instance has no container")
+		return
+	}
+
+	stats, err := s.docker.GetContainerStats(r.Context(), inst.ContainerID)
+	if err != nil {
+		slog.Error("failed to get container stats", "container_id", inst.ContainerID, "error", err)
+		respondError(w, http.StatusInternalServerError, "failed to get container stats")
+		return
+	}
+
+	respondJSON(w, http.StatusOK, stats)
+}
diff --git a/internal/deployer/deployer.go b/internal/deployer/deployer.go
index f8442f9..4b0cda3 100644
--- a/internal/deployer/deployer.go
+++ b/internal/deployer/deployer.go
@@ -333,6 +333,9 @@ func (d *Deployer) executeDeploy(
 	if err := d.store.UpdateInstanceStatus(instanceID, "running"); err != nil {
 		slog.Warn("update instance status to running", "error", err)
 	}
+	if err := d.store.UpdateLastAliveAt(instanceID); err != nil {
+		slog.Warn("update last_alive_at on deploy", "instance_id", instanceID, "error", err)
+	}
 	d.publishInstanceStatus(instanceID, project.ID, stage.ID, "running")
 	d.logDeploy(deployID, "Container started", "info")
 
diff --git a/internal/docker/stats.go b/internal/docker/stats.go
new file mode 100644
index 0000000..e1c919f
--- /dev/null
+++ b/internal/docker/stats.go
@@ -0,0 +1,69 @@
+package docker
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+
+	"github.com/moby/moby/api/types/container"
+	"github.com/moby/moby/client"
+)
+
+// ContainerStats holds computed CPU and memory usage for a container.
+type ContainerStats struct {
+	// CPUPercent is the value produced by calculateCPUPercent.
+	CPUPercent float64 `json:"cpu_percent"`
+	// MemoryUsage and MemoryLimit are copied from the memory section of the
+	// stats response.
+	MemoryUsage int64 `json:"memory_usage"`
+	MemoryLimit int64 `json:"memory_limit"`
+	// MemoryPercent is usage/limit*100, or 0 when the limit is not positive.
+	MemoryPercent float64 `json:"memory_percent"`
+}
+
+// GetContainerStats retrieves a one-shot stats snapshot for the given container
+// and computes CPU and memory percentages.
+//
+// The request is non-streaming and asks for the previous sample as well, which
+// calculateCPUPercent needs for its delta. Request and decode failures are
+// returned wrapped with the container ID.
+func (c *Client) GetContainerStats(ctx context.Context, containerID string) (ContainerStats, error) {
+	opts := client.ContainerStatsOptions{
+		Stream:                false,
+		IncludePreviousSample: true,
+	}
+	resp, err := c.api.ContainerStats(ctx, containerID, opts)
+	if err != nil {
+		return ContainerStats{}, fmt.Errorf("get container stats %s: %w", containerID, err)
+	}
+	defer resp.Body.Close()
+
+	var raw container.StatsResponse
+	if decodeErr := json.NewDecoder(resp.Body).Decode(&raw); decodeErr != nil {
+		return ContainerStats{}, fmt.Errorf("decode container stats %s: %w", containerID, decodeErr)
+	}
+
+	snapshot := ContainerStats{
+		CPUPercent:  calculateCPUPercent(raw),
+		MemoryUsage: int64(raw.MemoryStats.Usage),
+		MemoryLimit: int64(raw.MemoryStats.Limit),
+	}
+	// Leave the percentage at zero when no positive limit is reported.
+	if snapshot.MemoryLimit > 0 {
+		snapshot.MemoryPercent = float64(snapshot.MemoryUsage) / float64(snapshot.MemoryLimit) * 100.0
+	}
+
+	return snapshot, nil
+}
+
+// calculateCPUPercent computes CPU usage percentage from a stats response
+// using the delta between current and previous CPU readings.
+func calculateCPUPercent(stats container.StatsResponse) float64 {
+	// The deltas are taken in floating point so a counter that went backwards
+	// yields a negative number instead of wrapping around as unsigned.
+	cpuDelta := float64(stats.CPUStats.CPUUsage.TotalUsage) - float64(stats.PreCPUStats.CPUUsage.TotalUsage)
+	systemDelta := float64(stats.CPUStats.SystemUsage) - float64(stats.PreCPUStats.SystemUsage)
+
+	if systemDelta <= 0 || cpuDelta < 0 {
+		return 0.0
+	}
+
+	// Prefer the reported online CPU count. When it is absent, fall back to
+	// the number of per-CPU samples (as the Docker stats formula does), and
+	// only then to a single CPU.
+	onlineCPUs := float64(stats.CPUStats.OnlineCPUs)
+	if onlineCPUs == 0 {
+		onlineCPUs = float64(len(stats.CPUStats.CPUUsage.PercpuUsage))
+	}
+	if onlineCPUs == 0 {
+		onlineCPUs = 1
+	}
+
+	return (cpuDelta / systemDelta) * onlineCPUs * 100.0
+}
diff --git a/internal/events/bus.go b/internal/events/bus.go
index a4097a2..5cc1a4c 100644
--- a/internal/events/bus.go
+++ b/internal/events/bus.go
@@ -2,6 +2,7 @@ package events
 
 import (
 	"encoding/json"
+	"log/slog"
 	"sync"
 )
 
@@ -17,6 +18,9 @@ const (
 
 	// EventDeployStatus is emitted when a deploy status changes.
 	EventDeployStatus EventType = "deploy_status"
+
+	// EventLog is emitted when a persistent event is logged.
+	EventLog EventType = "event_log"
 )
 
 // Event is a single event published on the bus.
@@ -50,6 +54,72 @@ type DeployStatusPayload struct {
 	Error string `json:"error,omitempty"`
 }
 
+// EventLogPayload is the payload for EventLog events (persistent event log).
+type EventLogPayload struct {
+	ID        int64  `json:"id"`
+	Source    string `json:"source"`
+	Severity  string `json:"severity"`
+	Message   string `json:"message"`
+	Metadata  string `json:"metadata"`
+	CreatedAt string `json:"created_at"`
+}
+
+// PersistFunc is a callback that persists an event log entry.
+// It receives source, severity, message, and metadata (JSON string).
+// It returns the persisted entry's ID and created_at timestamp.
+type PersistFunc func(source, severity, message, metadata string) (int64, string, error)
+
+// RegisterPersistentLogger subscribes to the bus and auto-persists warn/error
+// events by calling the provided persist function. It also re-publishes the
+// persisted event as an EventLog so SSE clients receive it in real-time.
+// Call the returned function to unsubscribe.
+func (b *Bus) RegisterPersistentLogger(persist PersistFunc) func() {
+	sub := b.Subscribe(func(evt Event) bool {
+		// Only persist deploy log events with warn/error level.
+		if evt.Type != EventDeployLog {
+			return false
+		}
+		p, ok := evt.Payload.(DeployLogPayload)
+		if !ok {
+			return false
+		}
+		return p.Level == "warn" || p.Level == "error"
+	})
+
+	// Drain the subscription in the background. The loop runs until the
+	// channel is closed.
+	// NOTE(review): presumably Unsubscribe closes sub, which is what ends
+	// this goroutine — confirm against Bus.Unsubscribe.
+	go func() {
+		for evt := range sub {
+			p, ok := evt.Payload.(DeployLogPayload)
+			if !ok {
+				continue
+			}
+			// The Marshal error is dropped: the input is a map[string]string.
+			metaBytes, _ := json.Marshal(map[string]string{"deploy_id": p.DeployID})
+			metadata := string(metaBytes)
+			id, createdAt, err := persist("deploy", p.Level, p.Message, metadata)
+			if err != nil {
+				// Nothing is re-published for an entry that was not stored.
+				slog.Error("failed to persist event log", "source", "deploy", "level", p.Level, "error", err)
+				continue
+			}
+
+			// Re-publish as EventLog for SSE clients.
+			b.Publish(Event{
+				Type: EventLog,
+				Payload: EventLogPayload{
+					ID:        id,
+					Source:    "deploy",
+					Severity:  p.Level,
+					Message:   p.Message,
+					Metadata:  metadata,
+					CreatedAt: createdAt,
+				},
+			})
+		}
+	}()
+
+	return func() {
+		b.Unsubscribe(sub)
+	}
+}
+
 // Subscriber is a channel that receives events.
 type Subscriber chan Event
 
diff --git a/internal/notify/types.go b/internal/notify/types.go
new file mode 100644
index 0000000..d3e3bb2
--- /dev/null
+++ b/internal/notify/types.go
@@ -0,0 +1,9 @@
+package notify
+
+// Event types for notifications.
+const (
+	EventTypeDeploySuccess  = "deploy_success"
+	EventTypeDeployFailure  = "deploy_failure"
+	EventTypeStaleDetected  = "stale_detected"
+	EventTypeProxyUnhealthy = "proxy_unhealthy"
+)
diff --git a/internal/proxy/health.go b/internal/proxy/health.go
new file mode 100644
index 0000000..323d08f
--- /dev/null
+++ b/internal/proxy/health.go
@@ -0,0 +1,184 @@
+package proxy
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"log/slog"
+	"net/http"
+	"sync"
+	"time"
+
+	"github.com/alexei/docker-watcher/internal/events"
+	"github.com/alexei/docker-watcher/internal/store"
+	"github.com/robfig/cron/v3"
+)
+
+// HealthMonitor periodically checks the health of all standalone proxies.
+type HealthMonitor struct {
+	// store supplies the proxy list and receives health updates and events.
+	store *store.Store
+	// eventBus receives an EventLog event for each recorded status change.
+	eventBus *events.Bus
+
+	// cron runs the periodic check; mu guards entryID and running, which
+	// Start and Stop read and write.
+	cron    *cron.Cron
+	mu      sync.Mutex
+	entryID cron.EntryID
+	running bool
+}
+
+// NewHealthMonitor creates a new proxy health monitor.
+// The monitor owns a fresh cron scheduler; nothing runs until Start is called.
+func NewHealthMonitor(st *store.Store, eventBus *events.Bus) *HealthMonitor {
+	return &HealthMonitor{
+		store:    st,
+		eventBus: eventBus,
+		cron:     cron.New(),
+	}
+}
+
+// Start begins periodic health checks with the given interval (e.g., "5m", "1m").
+// If already running, it stops and restarts with the new interval.
+func (h *HealthMonitor) Start(interval string) error {
+	h.mu.Lock()
+	defer h.mu.Unlock()
+
+	duration, err := time.ParseDuration(interval)
+	if err != nil {
+		return fmt.Errorf("parse health check interval %q: %w", interval, err)
+	}
+	// A zero or negative interval would still be accepted by the "@every"
+	// schedule, so reject it here instead of scheduling a near-continuous job.
+	if duration <= 0 {
+		return fmt.Errorf("health check interval %q must be positive", interval)
+	}
+
+	// Drop any job that is still registered, whether the monitor is running
+	// or was stopped earlier, so only one health-check job ever exists. The
+	// zero EntryID is never assigned to a real entry.
+	if h.entryID != 0 {
+		h.cron.Remove(h.entryID)
+		h.entryID = 0
+	}
+
+	spec := fmt.Sprintf("@every %s", duration.String())
+	entryID, err := h.cron.AddFunc(spec, func() {
+		// Each cycle gets its own deadline so a hung check cannot pile up.
+		ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
+		defer cancel()
+		if checkErr := h.CheckAll(ctx); checkErr != nil {
+			slog.Warn("proxy health monitor: check error", "error", checkErr)
+		}
+	})
+	if err != nil {
+		return fmt.Errorf("schedule proxy health monitor: %w", err)
+	}
+
+	h.entryID = entryID
+	if !h.running {
+		h.cron.Start()
+	}
+	h.running = true
+
+	slog.Info("proxy health monitor started", "interval", duration.String())
+	return nil
+}
+
+// Stop gracefully shuts down the health monitor.
+//
+// It waits for a job that is currently executing to finish and then removes
+// the scheduled job, so a later Start does not run the old schedule alongside
+// the new one. Calling Stop on a monitor that is not running does nothing.
+func (h *HealthMonitor) Stop() {
+	h.mu.Lock()
+	defer h.mu.Unlock()
+
+	if h.running {
+		ctx := h.cron.Stop()
+		<-ctx.Done()
+		h.cron.Remove(h.entryID)
+		h.entryID = 0
+		h.running = false
+		slog.Info("proxy health monitor stopped")
+	}
+}
+
+// CheckAll performs a single health check cycle for all standalone proxies.
+//
+// Each proxy is probed in turn, its new status is stored, and an event is
+// emitted when the status changed from anything other than "unknown". The
+// cycle stops with an error as soon as ctx is done; a probe that was cut short
+// by ctx is not recorded, because its failure says nothing about the proxy.
+func (h *HealthMonitor) CheckAll(ctx context.Context) error {
+	proxies, err := h.store.ListStandaloneProxies()
+	if err != nil {
+		return fmt.Errorf("list standalone proxies: %w", err)
+	}
+
+	for _, proxy := range proxies {
+		newStatus := checkProxyHealth(ctx, proxy.DestinationURL, proxy.DestinationPort)
+		if ctxErr := ctx.Err(); ctxErr != nil {
+			return fmt.Errorf("proxy health check cycle interrupted: %w", ctxErr)
+		}
+		oldStatus := proxy.HealthStatus
+
+		if err := h.store.UpdateProxyHealth(proxy.ID, newStatus); err != nil {
+			slog.Warn("proxy health monitor: failed to update health",
+				"proxy_id", proxy.ID, "error", err)
+			continue
+		}
+
+		// Emit event on status change.
+		if oldStatus != newStatus && oldStatus != "unknown" {
+			h.emitHealthEvent(proxy, oldStatus, newStatus)
+		}
+	}
+
+	return nil
+}
+
+// checkProxyHealth performs an HTTP GET to the destination and returns the health status.
+func checkProxyHealth(ctx context.Context, host string, port int) string {
+	// NOTE(review): host is interpolated as-is, so an IPv6 literal would need
+	// brackets to form a valid URL — confirm whether such hosts can occur.
+	target := fmt.Sprintf("http://%s:%d/", host, port)
+
+	reqCtx, cancel := context.WithTimeout(ctx, 10*time.Second)
+	defer cancel()
+
+	req, err := http.NewRequestWithContext(reqCtx, http.MethodGet, target, nil)
+	if err != nil {
+		return "unhealthy"
+	}
+
+	client := &http.Client{
+		Timeout: 10 * time.Second,
+		// Do not follow redirects: the redirect response itself is the answer.
+		CheckRedirect: func(*http.Request, []*http.Request) error {
+			return http.ErrUseLastResponse
+		},
+	}
+
+	resp, err := client.Do(req)
+	if err != nil {
+		return "unhealthy"
+	}
+	resp.Body.Close()
+
+	// Any answer below 500 counts as healthy, including 3xx and 4xx.
+	if resp.StatusCode >= 500 {
+		return "unhealthy"
+	}
+
+	return "healthy"
+}
+
+// emitHealthEvent persists and publishes a health status change event.
+//
+// The severity is "warn" when the new status is "unhealthy" and "info"
+// otherwise. If the event cannot be stored, the failure is logged and nothing
+// is published.
+func (h *HealthMonitor) emitHealthEvent(proxy store.StandaloneProxy, oldStatus, newStatus string) {
+	severity := "info"
+	if newStatus == "unhealthy" {
+		severity = "warn"
+	}
+
+	msg := fmt.Sprintf("Proxy %s (%s) health changed: %s -> %s",
+		proxy.Domain, proxy.ID, oldStatus, newStatus)
+
+	metadata, _ := json.Marshal(map[string]any{
+		"proxy_id":   proxy.ID,
+		"domain":     proxy.Domain,
+		"old_status": oldStatus,
+		"new_status": newStatus,
+	})
+
+	evt, err := h.store.InsertEvent(store.EventLog{
+		Source:   "proxy_health",
+		Severity: severity,
+		Message:  msg,
+		Metadata: string(metadata),
+	})
+	if err != nil {
+		slog.Error("proxy health monitor: failed to persist event", "error", err)
+		return
+	}
+
+	h.eventBus.Publish(events.Event{
+		Type: events.EventLog,
+		Payload: events.EventLogPayload{
+			ID:        evt.ID,
+			Source:    "proxy_health",
+			Severity:  severity,
+			Message:   msg,
+			Metadata:  string(metadata),
+			CreatedAt: evt.CreatedAt,
+		},
+	})
+}
diff --git a/internal/proxy/hints.go b/internal/proxy/hints.go
new file mode 100644
index 0000000..8c1fbdc
--- /dev/null
+++ b/internal/proxy/hints.go
@@ -0,0 +1,74 @@
+package proxy
+
+import (
+	"errors"
+	"fmt"
+	"net"
+	"strings"
+)
+
+// diagnosticHint returns a user-friendly suggestion for a validation failure.
+// It returns "" when err is nil. The hint depends on the step: a fixed message
+// for DNS, tcpHintFromError for TCP, httpHint for HTTP, and a generic message
+// carrying the error text for any other step.
+func diagnosticHint(step string, err error) string {
+	if err == nil {
+		return ""
+	}
+
+	switch step {
+	case StepDNS:
+		return "Domain cannot be resolved. Check DNS settings or use an IP address."
+
+	case StepTCP:
+		return tcpHintFromError(err)
+
+	case StepHTTP:
+		return httpHint(err.Error())
+
+	default:
+		return "Validation failed: " + err.Error()
+	}
+}
+
+// tcpHintFromError returns a specific hint based on the TCP error type.
+//
+// It returns "" for a nil error. Timeouts are recognised structurally, through
+// the Timeout method of a net.Error anywhere in the chain, as well as by
+// message text. Refused connections and unreachable hosts are recognised by
+// message text whether or not the error is a *net.OpError. Anything else
+// yields a generic hint that carries the error text.
+func tcpHintFromError(err error) string {
+	if err == nil {
+		return ""
+	}
+
+	const timeoutHint = "Connection timed out. Possible firewall blocking. Check network/firewall rules."
+
+	// Errors such as context.DeadlineExceeded report Timeout() == true but do
+	// not contain the word "timeout", so ask the error itself first.
+	var netErr net.Error
+	if errors.As(err, &netErr) && netErr.Timeout() {
+		return timeoutHint
+	}
+
+	// The message of a wrapping error includes the wrapped error's text, so
+	// one pass over the full message covers wrapped and plain errors alike.
+	msg := err.Error()
+	lower := strings.ToLower(msg)
+	switch {
+	case strings.Contains(lower, "connection refused"):
+		return "Port is not accepting connections. Check if the service is running and the port is correct."
+	case strings.Contains(lower, "timeout"):
+		return timeoutHint
+	case strings.Contains(lower, "no route to host") || strings.Contains(lower, "host is unreachable"):
+		return "Host is not reachable. Verify the IP address and network connectivity."
+	default:
+		return fmt.Sprintf("TCP connection failed: %s", msg)
+	}
+}
+
+// httpHint returns a specific hint based on the HTTP probe result.
+// A message mentioning "status" is passed through unchanged, a message
+// mentioning "timeout" gets a timeout hint, and anything else is prefixed
+// with a generic failure text.
+func httpHint(msg string) string {
+	lower := strings.ToLower(msg)
+
+	switch {
+	case strings.Contains(lower, "status"):
+		return msg // Already formatted by the caller with the status code.
+	case strings.Contains(lower, "timeout"):
+		return "HTTP health probe timed out. The service may be slow or unresponsive."
+	default:
+		return "HTTP health probe failed: " + msg
+	}
+}
diff --git a/internal/proxy/manager.go b/internal/proxy/manager.go
new file mode 100644
index 0000000..05189ef
--- /dev/null
+++ b/internal/proxy/manager.go
@@ -0,0 +1,314 @@
+package proxy
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"log/slog"
+
+	"github.com/alexei/docker-watcher/internal/npm"
+	"github.com/alexei/docker-watcher/internal/store"
+)
+
+// Manager handles the lifecycle of standalone proxy hosts.
+type Manager struct {
+	store *store.Store
+	npm   *npm.Client
+}
+
+// NewManager creates a new proxy manager.
+func NewManager(st *store.Store, npmClient *npm.Client) *Manager {
+	return &Manager{
+		store: st,
+		npm:   npmClient,
+	}
+}
+
+// CreateProxyRequest is the input for creating a standalone proxy.
+type CreateProxyRequest struct {
+	Domain          string `json:"domain"`
+	DestinationURL  string `json:"destination_url"`
+	DestinationPort int    `json:"destination_port"`
+}
+
+// UpdateProxyRequest is the input for updating a standalone proxy.
+type UpdateProxyRequest struct {
+	Domain          string `json:"domain"`
+	DestinationURL  string `json:"destination_url"`
+	DestinationPort int    `json:"destination_port"`
+}
+
+// ProxyView is a unified view of both standalone and deploy-managed proxies.
+type ProxyView struct {
+	ID           string `json:"id"`
+	Domain       string `json:"domain"`
+	Destination  string `json:"destination"`
+	Type         string `json:"type"` // "standalone" or "managed"
+	ProjectName  string `json:"project_name,omitempty"`
+	StageName    string `json:"stage_name,omitempty"`
+	HealthStatus string `json:"health_status"`
+	SSLEnabled   bool   `json:"ssl_enabled"`
+	NpmProxyID   int    `json:"npm_proxy_id"`
+	CreatedAt    string `json:"created_at"`
+}
+
+// CreateProxy validates the destination, creates an NPM proxy host, and saves to the store.
+//
+// The NPM host is created before the store record. If saving the record fails,
+// the freshly created NPM host is deleted again on a best-effort basis and the
+// store error is returned.
+func (m *Manager) CreateProxy(ctx context.Context, req CreateProxyRequest) (store.StandaloneProxy, error) {
+	// Validate destination.
+	result := ValidateDestination(ctx, req.DestinationURL, req.DestinationPort)
+	if !result.Valid {
+		return store.StandaloneProxy{}, fmt.Errorf("destination validation failed: %s", lastFailedStep(result))
+	}
+
+	// Load settings for SSL certificate and domain.
+	settings, err := m.store.GetSettings()
+	if err != nil {
+		return store.StandaloneProxy{}, fmt.Errorf("get settings: %w", err)
+	}
+
+	// Build NPM proxy host config. SSL and HSTS are switched on only when a
+	// certificate ID is configured in the settings.
+	config := npm.ProxyHostConfig{
+		DomainNames:    []string{req.Domain},
+		ForwardScheme:  "http",
+		ForwardHost:    req.DestinationURL,
+		ForwardPort:    req.DestinationPort,
+		CertificateID:  settings.SSLCertificateID,
+		SSLForced:      settings.SSLCertificateID > 0,
+		BlockExploits:  true,
+		AllowWebsocket: true,
+		HTTP2Support:   true,
+		HSTSEnabled:    settings.SSLCertificateID > 0,
+		Locations:      []any{},
+	}
+
+	// Create NPM proxy host.
+	npmHost, err := m.npm.CreateProxyHost(ctx, config)
+	if err != nil {
+		return store.StandaloneProxy{}, fmt.Errorf("create NPM proxy host: %w", err)
+	}
+
+	slog.Info("created NPM proxy host for standalone proxy",
+		"domain", req.Domain, "npm_proxy_id", npmHost.ID)
+
+	// Save to store.
+	proxy, err := m.store.CreateStandaloneProxy(store.StandaloneProxy{
+		Domain:           req.Domain,
+		DestinationURL:   req.DestinationURL,
+		DestinationPort:  req.DestinationPort,
+		SSLCertificateID: settings.SSLCertificateID,
+		NpmProxyID:       npmHost.ID,
+		HealthStatus:     "unknown",
+	})
+	if err != nil {
+		// Best effort: clean up the NPM host if store insert fails.
+		if delErr := m.npm.DeleteProxyHost(ctx, npmHost.ID); delErr != nil {
+			slog.Error("failed to clean up NPM proxy host after store error",
+				"npm_proxy_id", npmHost.ID, "error", delErr)
+		}
+		return store.StandaloneProxy{}, fmt.Errorf("save standalone proxy: %w", err)
+	}
+
+	return proxy, nil
+}
+
+// UpdateProxy re-validates the destination, updates the NPM proxy host, and updates the store.
+func (m *Manager) UpdateProxy(ctx context.Context, id string, req UpdateProxyRequest) (store.StandaloneProxy, error) {
+	// none is the empty value handed back together with every error.
+	var none store.StandaloneProxy
+
+	current, err := m.store.GetStandaloneProxy(id)
+	if err != nil {
+		return none, fmt.Errorf("get proxy: %w", err)
+	}
+
+	// The new destination has to pass validation before anything is changed.
+	if check := ValidateDestination(ctx, req.DestinationURL, req.DestinationPort); !check.Valid {
+		return none, fmt.Errorf("destination validation failed: %s", lastFailedStep(check))
+	}
+
+	// The SSL certificate comes from the global settings.
+	settings, err := m.store.GetSettings()
+	if err != nil {
+		return none, fmt.Errorf("get settings: %w", err)
+	}
+	certID := settings.SSLCertificateID
+	sslOn := certID > 0
+
+	// Push the new configuration to NPM first, then mirror it in the store.
+	_, err = m.npm.UpdateProxyHost(ctx, current.NpmProxyID, npm.ProxyHostConfig{
+		DomainNames:    []string{req.Domain},
+		ForwardScheme:  "http",
+		ForwardHost:    req.DestinationURL,
+		ForwardPort:    req.DestinationPort,
+		CertificateID:  certID,
+		SSLForced:      sslOn,
+		BlockExploits:  true,
+		AllowWebsocket: true,
+		HTTP2Support:   true,
+		HSTSEnabled:    sslOn,
+		Locations:      []any{},
+	})
+	if err != nil {
+		return none, fmt.Errorf("update NPM proxy host: %w", err)
+	}
+
+	next := current
+	next.Domain = req.Domain
+	next.DestinationURL = req.DestinationURL
+	next.DestinationPort = req.DestinationPort
+	next.SSLCertificateID = certID
+	if err = m.store.UpdateStandaloneProxy(next); err != nil {
+		return none, fmt.Errorf("update standalone proxy: %w", err)
+	}
+
+	// Read the record back so the caller sees the stored state, including
+	// refreshed timestamps.
+	return m.store.GetStandaloneProxy(id)
+}
+
+// DeleteProxy removes the NPM proxy host and deletes from the store.
+func (m *Manager) DeleteProxy(ctx context.Context, id string) error { + proxy, err := m.store.GetStandaloneProxy(id) + if err != nil { + return fmt.Errorf("get proxy: %w", err) + } + + // Delete NPM proxy host. + if proxy.NpmProxyID > 0 { + if err := m.npm.DeleteProxyHost(ctx, proxy.NpmProxyID); err != nil { + slog.Warn("failed to delete NPM proxy host (continuing with store deletion)", + "npm_proxy_id", proxy.NpmProxyID, "error", err) + } + } + + if err := m.store.DeleteStandaloneProxy(id); err != nil { + return fmt.Errorf("delete standalone proxy: %w", err) + } + + return nil +} + +// GetProxy returns a single standalone proxy by ID. +func (m *Manager) GetProxy(id string) (store.StandaloneProxy, error) { + proxy, err := m.store.GetStandaloneProxy(id) + if err != nil { + return store.StandaloneProxy{}, fmt.Errorf("get proxy: %w", err) + } + return proxy, nil +} + +// ListProxies returns all standalone proxies. +func (m *Manager) ListProxies() ([]store.StandaloneProxy, error) { + proxies, err := m.store.ListStandaloneProxies() + if err != nil { + return nil, fmt.Errorf("list proxies: %w", err) + } + return proxies, nil +} + +// ListAllProxies returns a merged view of standalone and deploy-managed proxies. +func (m *Manager) ListAllProxies() ([]ProxyView, error) { + views := []ProxyView{} + + // Standalone proxies. + standalones, err := m.store.ListStandaloneProxies() + if err != nil { + return nil, fmt.Errorf("list standalone proxies: %w", err) + } + + for _, p := range standalones { + views = append(views, ProxyView{ + ID: p.ID, + Domain: p.Domain, + Destination: fmt.Sprintf("%s:%d", p.DestinationURL, p.DestinationPort), + Type: "standalone", + HealthStatus: p.HealthStatus, + SSLEnabled: p.SSLCertificateID > 0, + NpmProxyID: p.NpmProxyID, + CreatedAt: p.CreatedAt, + }) + } + + // Deploy-managed proxies: instances with npm_proxy_id > 0. 
+ instances, err := m.store.ListAllInstances() + if err != nil { + return nil, fmt.Errorf("list instances: %w", err) + } + + // Pre-load project and stage names to avoid N+1 queries. + allProjects, _ := m.store.GetAllProjects() + projectNames := make(map[string]string, len(allProjects)) + for _, p := range allProjects { + projectNames[p.ID] = p.Name + } + stageNames := make(map[string]string) + for _, p := range allProjects { + stages, _ := m.store.GetStagesByProjectID(p.ID) + for _, s := range stages { + stageNames[s.ID] = s.Name + } + } + + for _, inst := range instances { + if inst.NpmProxyID <= 0 { + continue + } + + projectName := projectNames[inst.ProjectID] + if projectName == "" { + projectName = inst.ProjectID + } + stageName := stageNames[inst.StageID] + if stageName == "" { + stageName = inst.StageID + } + + cid := inst.ContainerID + if len(cid) > 12 { + cid = cid[:12] + } + destination := fmt.Sprintf("%s:%d", cid, inst.Port) + if inst.Subdomain != "" { + destination = fmt.Sprintf("%s:%d", inst.Subdomain, inst.Port) + } + + healthStatus := "unknown" + if inst.Status == "running" { + healthStatus = "healthy" + } else if inst.Status == "stopped" || inst.Status == "failed" { + healthStatus = "unhealthy" + } + + views = append(views, ProxyView{ + ID: inst.ID, + Domain: inst.Subdomain, + Destination: destination, + Type: "managed", + ProjectName: projectName, + StageName: stageName, + HealthStatus: healthStatus, + SSLEnabled: true, // managed proxies always get SSL from settings + NpmProxyID: inst.NpmProxyID, + CreatedAt: inst.CreatedAt, + }) + } + + return views, nil +} + +// lastFailedStep returns the message of the last failed validation step. +func lastFailedStep(result ValidationResult) string { + for _, step := range result.Steps { + if !step.Passed { + msg := step.Message + if step.Hint != "" { + msg += " — " + step.Hint + } + return msg + } + } + return "unknown validation failure" +} + +// IsNotFound checks if an error wraps store.ErrNotFound. 
+func IsNotFound(err error) bool { + return errors.Is(err, store.ErrNotFound) +} diff --git a/internal/proxy/validator.go b/internal/proxy/validator.go new file mode 100644 index 0000000..a1a5ce5 --- /dev/null +++ b/internal/proxy/validator.go @@ -0,0 +1,224 @@ +package proxy + +import ( + "context" + "fmt" + "net" + "net/http" + "net/url" + "strconv" + "time" +) + +// Validation step names. +const ( + StepSyntax = "syntax" + StepDNS = "dns" + StepTCP = "tcp" + StepHTTP = "http" +) + +// ValidationStep holds the result of a single validation check. +type ValidationStep struct { + Name string `json:"name"` + Passed bool `json:"passed"` + Message string `json:"message,omitempty"` + Hint string `json:"hint,omitempty"` +} + +// ValidationResult holds the aggregate result of the validation pipeline. +type ValidationResult struct { + Valid bool `json:"valid"` + Steps []ValidationStep `json:"steps"` +} + +// ValidateDestination runs the multi-step validation pipeline against the given +// destination host and port. It checks syntax, DNS, TCP reachability, and HTTP health. +// The pipeline short-circuits on failure: later steps are skipped if an earlier one fails. +func ValidateDestination(ctx context.Context, host string, port int) ValidationResult { + result := ValidationResult{Valid: true} + + // Step 1: Syntax validation. + if step, ok := validateSyntax(host, port); !ok { + result.Valid = false + result.Steps = append(result.Steps, step) + return result + } else { + result.Steps = append(result.Steps, step) + } + + // Step 2: DNS resolution (skip for IP addresses). 
+ ip := net.ParseIP(host) + if ip == nil { + if step, ok := validateDNS(ctx, host); !ok { + result.Valid = false + result.Steps = append(result.Steps, step) + return result + } else { + result.Steps = append(result.Steps, step) + } + } else { + result.Steps = append(result.Steps, ValidationStep{ + Name: StepDNS, + Passed: true, + Message: "Skipped (IP address provided)", + }) + } + + // Step 3: TCP port reachability. + if step, ok := validateTCP(ctx, host, port); !ok { + result.Valid = false + result.Steps = append(result.Steps, step) + return result + } else { + result.Steps = append(result.Steps, step) + } + + // Step 4: HTTP health probe. + step := validateHTTP(ctx, host, port) + result.Steps = append(result.Steps, step) + if !step.Passed { + result.Valid = false + } + + return result +} + +// validateSyntax checks that the host and port values are syntactically valid. +func validateSyntax(host string, port int) (ValidationStep, bool) { + if host == "" { + return ValidationStep{ + Name: StepSyntax, + Passed: false, + Message: "Host is empty", + Hint: "Provide a valid hostname or IP address.", + }, false + } + + if port < 1 || port > 65535 { + return ValidationStep{ + Name: StepSyntax, + Passed: false, + Message: fmt.Sprintf("Port %d is out of range (1-65535)", port), + Hint: "Provide a valid port number between 1 and 65535.", + }, false + } + + // Reject obviously invalid hostnames (but allow IPs). + if net.ParseIP(host) == nil { + // Basic hostname validation: must not contain spaces or schemes. 
+ if _, err := url.Parse("http://" + host); err != nil { + return ValidationStep{ + Name: StepSyntax, + Passed: false, + Message: "Invalid hostname: " + err.Error(), + Hint: "Provide a valid hostname without scheme (e.g., 'example.com' not 'http://example.com').", + }, false + } + } + + return ValidationStep{ + Name: StepSyntax, + Passed: true, + Message: fmt.Sprintf("Host %q port %d syntax OK", host, port), + }, true +} + +// validateDNS performs a DNS lookup on the given host. +func validateDNS(ctx context.Context, host string) (ValidationStep, bool) { + resolver := net.DefaultResolver + addrs, err := resolver.LookupHost(ctx, host) + if err != nil { + return ValidationStep{ + Name: StepDNS, + Passed: false, + Message: fmt.Sprintf("DNS resolution failed for %q: %s", host, err.Error()), + Hint: diagnosticHint(StepDNS, err), + }, false + } + + return ValidationStep{ + Name: StepDNS, + Passed: true, + Message: fmt.Sprintf("Resolved to %v", addrs), + }, true +} + +// validateTCP attempts a TCP connection to host:port with a 5-second timeout. +func validateTCP(ctx context.Context, host string, port int) (ValidationStep, bool) { + addr := net.JoinHostPort(host, strconv.Itoa(port)) + + dialCtx, cancel := context.WithTimeout(ctx, 5*time.Second) + defer cancel() + + var d net.Dialer + conn, err := d.DialContext(dialCtx, "tcp", addr) + if err != nil { + return ValidationStep{ + Name: StepTCP, + Passed: false, + Message: fmt.Sprintf("TCP connect to %s failed: %s", addr, err.Error()), + Hint: diagnosticHint(StepTCP, err), + }, false + } + conn.Close() + + return ValidationStep{ + Name: StepTCP, + Passed: true, + Message: fmt.Sprintf("TCP connect to %s succeeded", addr), + }, true +} + +// validateHTTP performs a GET request to the destination and checks for a response. +// Non-5xx responses are considered passing (the service is responding). 
+func validateHTTP(ctx context.Context, host string, port int) ValidationStep { + target := fmt.Sprintf("http://%s:%d/", host, port) + + httpCtx, cancel := context.WithTimeout(ctx, 10*time.Second) + defer cancel() + + req, err := http.NewRequestWithContext(httpCtx, http.MethodGet, target, nil) + if err != nil { + return ValidationStep{ + Name: StepHTTP, + Passed: false, + Message: fmt.Sprintf("Failed to build HTTP request: %s", err.Error()), + Hint: diagnosticHint(StepHTTP, err), + } + } + + client := &http.Client{ + Timeout: 10 * time.Second, + // Do not follow redirects — we just want to see if the port responds to HTTP. + CheckRedirect: func(*http.Request, []*http.Request) error { + return http.ErrUseLastResponse + }, + } + + resp, err := client.Do(req) + if err != nil { + return ValidationStep{ + Name: StepHTTP, + Passed: false, + Message: fmt.Sprintf("HTTP probe to %s failed: %s", target, err.Error()), + Hint: diagnosticHint(StepHTTP, err), + } + } + resp.Body.Close() + + if resp.StatusCode >= 500 { + return ValidationStep{ + Name: StepHTTP, + Passed: false, + Message: fmt.Sprintf("Service responded with HTTP %d. The service may not be healthy.", resp.StatusCode), + Hint: fmt.Sprintf("Service responded with HTTP %d. The service may not be healthy.", resp.StatusCode), + } + } + + return ValidationStep{ + Name: StepHTTP, + Passed: true, + Message: fmt.Sprintf("HTTP probe returned %d", resp.StatusCode), + } +} diff --git a/internal/stale/scanner.go b/internal/stale/scanner.go new file mode 100644 index 0000000..da85e97 --- /dev/null +++ b/internal/stale/scanner.go @@ -0,0 +1,330 @@ +package stale + +import ( + "context" + "encoding/json" + "fmt" + "log/slog" + "sync" + "time" + + "github.com/alexei/docker-watcher/internal/docker" + "github.com/alexei/docker-watcher/internal/events" + "github.com/alexei/docker-watcher/internal/store" + "github.com/robfig/cron/v3" +) + +// StaleInstance holds enriched info about a stale container for API responses. 
+type StaleInstance struct { + Instance store.Instance `json:"instance"` + ProjectName string `json:"project_name"` + StageName string `json:"stage_name"` + DaysStale int `json:"days_stale"` +} + +// Scanner periodically checks for stale containers that have been +// non-running for longer than the configured threshold. +type Scanner struct { + store *store.Store + docker *docker.Client + eventBus *events.Bus + + cron *cron.Cron + mu sync.Mutex + entryID cron.EntryID + running bool + + // knownStale tracks instance IDs that have already had a stale event emitted, + // to avoid re-emitting warnings for the same instance. + knownStale map[string]struct{} +} + +// New creates a new stale container scanner. +func New(st *store.Store, dockerClient *docker.Client, eventBus *events.Bus) *Scanner { + return &Scanner{ + store: st, + docker: dockerClient, + eventBus: eventBus, + cron: cron.New(), + knownStale: make(map[string]struct{}), + } +} + +// Start begins the periodic stale container scan with the given interval (e.g., "1h", "30m"). +// If the scanner is already running, it stops and restarts with the new interval. 
+func (s *Scanner) Start(interval string) error { + s.mu.Lock() + defer s.mu.Unlock() + + duration, err := time.ParseDuration(interval) + if err != nil { + return fmt.Errorf("parse stale scan interval %q: %w", interval, err) + } + + if s.running { + s.cron.Remove(s.entryID) + } + + spec := fmt.Sprintf("@every %s", duration.String()) + entryID, err := s.cron.AddFunc(spec, func() { + ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute) + defer cancel() + if scanErr := s.Scan(ctx); scanErr != nil { + slog.Warn("stale scanner: scan error", "error", scanErr) + } + }) + if err != nil { + return fmt.Errorf("schedule stale scanner: %w", err) + } + + s.entryID = entryID + if !s.running { + s.cron.Start() + } + s.running = true + + slog.Info("stale scanner started", "interval", duration.String()) + return nil +} + +// Stop gracefully shuts down the scanner. +func (s *Scanner) Stop() { + s.mu.Lock() + defer s.mu.Unlock() + + if s.running { + ctx := s.cron.Stop() + <-ctx.Done() + s.running = false + slog.Info("stale scanner stopped") + } +} + +// Scan performs a single stale-container scan cycle. +// It updates last_alive_at for running containers and detects newly stale ones. +func (s *Scanner) Scan(ctx context.Context) error { + settings, err := s.store.GetSettings() + if err != nil { + return fmt.Errorf("get settings: %w", err) + } + + thresholdDays := settings.StaleThresholdDays + if thresholdDays <= 0 { + thresholdDays = 7 + } + + // Get all instances from the store. + instances, err := s.store.ListAllInstances() + if err != nil { + return fmt.Errorf("list all instances: %w", err) + } + + if len(instances) == 0 { + return nil + } + + // Get all managed Docker containers to check live state. + containers, err := s.docker.ListContainers(ctx, nil) + if err != nil { + return fmt.Errorf("list docker containers: %w", err) + } + + // Build a lookup: instance ID -> container state. 
+ containerStateByInstanceID := make(map[string]string, len(containers)) + for _, c := range containers { + if c.InstanceID != "" { + containerStateByInstanceID[c.InstanceID] = c.State + } + } + + now := time.Now().UTC() + currentStaleIDs := make(map[string]struct{}) + + for _, inst := range instances { + // Skip instances already being cleaned up. + if inst.Status == "removing" { + continue + } + + dockerState := containerStateByInstanceID[inst.ID] + + // If the container is running in Docker, update last_alive_at. + if dockerState == "running" { + if err := s.store.UpdateLastAliveAt(inst.ID); err != nil { + slog.Warn("stale scanner: failed to update last_alive_at", + "instance_id", inst.ID, "error", err) + } + // Also sync store status if it was out of date. + if inst.Status != "running" { + if err := s.store.UpdateInstanceStatus(inst.ID, "running"); err != nil { + slog.Warn("stale scanner: failed to sync instance status", + "instance_id", inst.ID, "error", err) + } + } + continue + } + + // Container is not running. Check if it's stale. + if inst.LastAliveAt == "" { + // Never been seen running. Use created_at as fallback. + inst.LastAliveAt = inst.CreatedAt + } + + lastAlive, parseErr := time.Parse("2006-01-02 15:04:05", inst.LastAliveAt) + if parseErr != nil { + slog.Warn("stale scanner: failed to parse last_alive_at", + "instance_id", inst.ID, "last_alive_at", inst.LastAliveAt, "error", parseErr) + continue + } + + daysSinceAlive := int(now.Sub(lastAlive).Hours() / 24) + if daysSinceAlive < thresholdDays { + continue + } + + // This instance is stale. + currentStaleIDs[inst.ID] = struct{}{} + + // Emit event only if this is newly detected as stale. + if _, alreadyKnown := s.knownStale[inst.ID]; !alreadyKnown { + s.emitStaleEvent(inst, daysSinceAlive) + } + } + + // Update known stale set: remove IDs that are no longer stale. 
+ s.knownStale = currentStaleIDs + + return nil +} + +// FindStaleInstances returns all currently stale instances with enriched project/stage info. +func (s *Scanner) FindStaleInstances(ctx context.Context) ([]StaleInstance, error) { + settings, err := s.store.GetSettings() + if err != nil { + return nil, fmt.Errorf("get settings: %w", err) + } + + thresholdDays := settings.StaleThresholdDays + if thresholdDays <= 0 { + thresholdDays = 7 + } + + instances, err := s.store.ListAllInstances() + if err != nil { + return nil, fmt.Errorf("list all instances: %w", err) + } + + containers, err := s.docker.ListContainers(ctx, nil) + if err != nil { + return nil, fmt.Errorf("list docker containers: %w", err) + } + + containerStateByInstanceID := make(map[string]string, len(containers)) + for _, c := range containers { + if c.InstanceID != "" { + containerStateByInstanceID[c.InstanceID] = c.State + } + } + + // Pre-load project and stage names to avoid N+1 queries. + allProjects, _ := s.store.GetAllProjects() + projectNames := make(map[string]string, len(allProjects)) + for _, p := range allProjects { + projectNames[p.ID] = p.Name + } + stageNames := make(map[string]string) + for _, p := range allProjects { + stages, _ := s.store.GetStagesByProjectID(p.ID) + for _, st := range stages { + stageNames[st.ID] = st.Name + } + } + + now := time.Now().UTC() + var result []StaleInstance + + for _, inst := range instances { + if inst.Status == "removing" { + continue + } + + // If Docker says it's running, it's not stale. + if containerStateByInstanceID[inst.ID] == "running" { + continue + } + + lastAlive := inst.LastAliveAt + if lastAlive == "" { + lastAlive = inst.CreatedAt + } + + lastAliveTime, parseErr := time.Parse("2006-01-02 15:04:05", lastAlive) + if parseErr != nil { + continue + } + + daysSinceAlive := int(now.Sub(lastAliveTime).Hours() / 24) + if daysSinceAlive < thresholdDays { + continue + } + + // Look up project and stage names from pre-loaded maps. 
+ projectName := projectNames[inst.ProjectID] + if projectName == "" { + projectName = inst.ProjectID + } + stageName := stageNames[inst.StageID] + if stageName == "" { + stageName = inst.StageID + } + + result = append(result, StaleInstance{ + Instance: inst, + ProjectName: projectName, + StageName: stageName, + DaysStale: daysSinceAlive, + }) + } + + return result, nil +} + +// emitStaleEvent publishes a warning event for a newly detected stale container. +func (s *Scanner) emitStaleEvent(inst store.Instance, daysStale int) { + metadata, _ := json.Marshal(map[string]any{ + "instance_id": inst.ID, + "project_id": inst.ProjectID, + "stage_id": inst.StageID, + "image_tag": inst.ImageTag, + "last_alive_at": inst.LastAliveAt, + "days_stale": daysStale, + }) + + msg := fmt.Sprintf("Container %s (tag: %s) has been non-running for %d days", + inst.ID, inst.ImageTag, daysStale) + + // Persist directly to event log. + evt, err := s.store.InsertEvent(store.EventLog{ + Source: "stale_scanner", + Severity: "warn", + Message: msg, + Metadata: string(metadata), + }) + if err != nil { + slog.Error("stale scanner: failed to persist event", "error", err) + return + } + + // Publish for SSE clients. + s.eventBus.Publish(events.Event{ + Type: events.EventLog, + Payload: events.EventLogPayload{ + ID: evt.ID, + Source: "stale_scanner", + Severity: "warn", + Message: msg, + Metadata: string(metadata), + CreatedAt: evt.CreatedAt, + }, + }) +} diff --git a/internal/store/eventlog.go b/internal/store/eventlog.go new file mode 100644 index 0000000..1414348 --- /dev/null +++ b/internal/store/eventlog.go @@ -0,0 +1,168 @@ +package store + +import ( + "fmt" + "strings" +) + +// EventLogFilter holds optional filters for listing event log entries. +type EventLogFilter struct { + Severity string // Filter by severity (info, warn, error). + Source string // Filter by source. + Since string // Only events created at or after this timestamp. 
+ Until string // Only events created at or before this timestamp. + Limit int // Maximum number of results (default 50). + Offset int // Offset for pagination. +} + +// EventLogStats holds counts of event log entries by severity. +type EventLogStats struct { + Info int `json:"info"` + Warn int `json:"warn"` + Error int `json:"error"` + Total int `json:"total"` +} + +// InsertEvent inserts a new event log entry. +func (s *Store) InsertEvent(evt EventLog) (EventLog, error) { + evt.CreatedAt = Now() + if evt.Metadata == "" { + evt.Metadata = "{}" + } + + result, err := s.db.Exec( + `INSERT INTO event_log (source, severity, message, metadata, created_at) + VALUES (?, ?, ?, ?, ?)`, + evt.Source, evt.Severity, evt.Message, evt.Metadata, evt.CreatedAt, + ) + if err != nil { + return EventLog{}, fmt.Errorf("insert event: %w", err) + } + + id, err := result.LastInsertId() + if err != nil { + return EventLog{}, fmt.Errorf("get event id: %w", err) + } + evt.ID = id + + return evt, nil +} + +// ListEvents returns event log entries matching the given filter. +func (s *Store) ListEvents(filter EventLogFilter) ([]EventLog, error) { + var conditions []string + var args []any + + if filter.Severity != "" { + parts := strings.Split(filter.Severity, ",") + if len(parts) == 1 { + conditions = append(conditions, "severity = ?") + args = append(args, filter.Severity) + } else { + placeholders := make([]string, len(parts)) + for i, p := range parts { + placeholders[i] = "?" + args = append(args, strings.TrimSpace(p)) + } + conditions = append(conditions, "severity IN ("+strings.Join(placeholders, ",")+")") + } + } + if filter.Source != "" { + parts := strings.Split(filter.Source, ",") + if len(parts) == 1 { + conditions = append(conditions, "source = ?") + args = append(args, filter.Source) + } else { + placeholders := make([]string, len(parts)) + for i, p := range parts { + placeholders[i] = "?" 
+ args = append(args, strings.TrimSpace(p)) + } + conditions = append(conditions, "source IN ("+strings.Join(placeholders, ",")+")") + } + } + if filter.Since != "" { + conditions = append(conditions, "created_at >= ?") + args = append(args, filter.Since) + } + if filter.Until != "" { + conditions = append(conditions, "created_at <= ?") + args = append(args, filter.Until) + } + + query := "SELECT id, source, severity, message, metadata, created_at FROM event_log" + if len(conditions) > 0 { + query += " WHERE " + strings.Join(conditions, " AND ") + } + query += " ORDER BY created_at DESC" + + limit := filter.Limit + if limit <= 0 { + limit = 50 + } + if limit > 500 { + limit = 500 + } + query += fmt.Sprintf(" LIMIT %d OFFSET %d", limit, filter.Offset) + + rows, err := s.db.Query(query, args...) + if err != nil { + return nil, fmt.Errorf("query events: %w", err) + } + defer rows.Close() + + events := []EventLog{} + for rows.Next() { + var evt EventLog + if err := rows.Scan(&evt.ID, &evt.Source, &evt.Severity, &evt.Message, &evt.Metadata, &evt.CreatedAt); err != nil { + return nil, fmt.Errorf("scan event: %w", err) + } + events = append(events, evt) + } + return events, rows.Err() +} + +// GetEventStats returns counts of event log entries grouped by severity. 
+func (s *Store) GetEventStats() (EventLogStats, error) { + rows, err := s.db.Query( + `SELECT severity, COUNT(*) FROM event_log GROUP BY severity`, + ) + if err != nil { + return EventLogStats{}, fmt.Errorf("query event stats: %w", err) + } + defer rows.Close() + + var stats EventLogStats + for rows.Next() { + var severity string + var count int + if err := rows.Scan(&severity, &count); err != nil { + return EventLogStats{}, fmt.Errorf("scan event stats: %w", err) + } + switch severity { + case "info": + stats.Info = count + case "warn": + stats.Warn = count + case "error": + stats.Error = count + } + stats.Total += count + } + return stats, rows.Err() +} + +// PruneEvents deletes event log entries older than the given number of days. +func (s *Store) PruneEvents(olderThanDays int) (int64, error) { + if olderThanDays < 1 { + return 0, fmt.Errorf("prune events: olderThanDays must be >= 1, got %d", olderThanDays) + } + result, err := s.db.Exec( + `DELETE FROM event_log WHERE created_at < datetime('now', ?)`, + fmt.Sprintf("-%d days", olderThanDays), + ) + if err != nil { + return 0, fmt.Errorf("prune events: %w", err) + } + return result.RowsAffected() +} diff --git a/internal/store/instances.go b/internal/store/instances.go index 2cb4cfb..d0fb730 100644 --- a/internal/store/instances.go +++ b/internal/store/instances.go @@ -8,6 +8,20 @@ import ( "github.com/google/uuid" ) +// instanceColumns is the canonical column list for instance queries. +const instanceColumns = `id, stage_id, project_id, container_id, image_tag, subdomain, npm_proxy_id, status, port, last_alive_at, created_at, updated_at` + +// scanInstance scans a row into an Instance struct using the canonical column order. 
+func scanInstance(scanner interface{ Scan(...any) error }) (Instance, error) { + var inst Instance + err := scanner.Scan( + &inst.ID, &inst.StageID, &inst.ProjectID, &inst.ContainerID, &inst.ImageTag, + &inst.Subdomain, &inst.NpmProxyID, &inst.Status, &inst.Port, + &inst.LastAliveAt, &inst.CreatedAt, &inst.UpdatedAt, + ) + return inst, err +} + // CreateInstance inserts a new instance record. func (s *Store) CreateInstance(inst Instance) (Instance, error) { inst.ID = uuid.New().String() @@ -15,10 +29,11 @@ func (s *Store) CreateInstance(inst Instance) (Instance, error) { inst.UpdatedAt = inst.CreatedAt _, err := s.db.Exec( - `INSERT INTO instances (id, stage_id, project_id, container_id, image_tag, subdomain, npm_proxy_id, status, port, created_at, updated_at) - VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, + `INSERT INTO instances (`+instanceColumns+`) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, inst.ID, inst.StageID, inst.ProjectID, inst.ContainerID, inst.ImageTag, - inst.Subdomain, inst.NpmProxyID, inst.Status, inst.Port, inst.CreatedAt, inst.UpdatedAt, + inst.Subdomain, inst.NpmProxyID, inst.Status, inst.Port, + inst.LastAliveAt, inst.CreatedAt, inst.UpdatedAt, ) if err != nil { return Instance{}, fmt.Errorf("insert instance: %w", err) @@ -36,10 +51,11 @@ func (s *Store) CreateInstanceWithID(inst Instance) (Instance, error) { inst.UpdatedAt = inst.CreatedAt _, err := s.db.Exec( - `INSERT INTO instances (id, stage_id, project_id, container_id, image_tag, subdomain, npm_proxy_id, status, port, created_at, updated_at) - VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, + `INSERT INTO instances (`+instanceColumns+`) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, inst.ID, inst.StageID, inst.ProjectID, inst.ContainerID, inst.ImageTag, - inst.Subdomain, inst.NpmProxyID, inst.Status, inst.Port, inst.CreatedAt, inst.UpdatedAt, + inst.Subdomain, inst.NpmProxyID, inst.Status, inst.Port, + inst.LastAliveAt, inst.CreatedAt, inst.UpdatedAt, ) if err != nil { return Instance{}, 
fmt.Errorf("insert instance: %w", err) @@ -49,12 +65,9 @@ func (s *Store) CreateInstanceWithID(inst Instance) (Instance, error) { // GetInstanceByID returns a single instance by its ID. func (s *Store) GetInstanceByID(id string) (Instance, error) { - var inst Instance - err := s.db.QueryRow( - `SELECT id, stage_id, project_id, container_id, image_tag, subdomain, npm_proxy_id, status, port, created_at, updated_at - FROM instances WHERE id = ?`, id, - ).Scan(&inst.ID, &inst.StageID, &inst.ProjectID, &inst.ContainerID, &inst.ImageTag, - &inst.Subdomain, &inst.NpmProxyID, &inst.Status, &inst.Port, &inst.CreatedAt, &inst.UpdatedAt) + inst, err := scanInstance(s.db.QueryRow( + `SELECT `+instanceColumns+` FROM instances WHERE id = ?`, id, + )) if errors.Is(err, sql.ErrNoRows) { return Instance{}, fmt.Errorf("instance %s: %w", id, ErrNotFound) } @@ -67,8 +80,7 @@ func (s *Store) GetInstanceByID(id string) (Instance, error) { // GetInstancesByStageID returns all instances for a given stage. func (s *Store) GetInstancesByStageID(stageID string) ([]Instance, error) { rows, err := s.db.Query( - `SELECT id, stage_id, project_id, container_id, image_tag, subdomain, npm_proxy_id, status, port, created_at, updated_at - FROM instances WHERE stage_id = ? ORDER BY created_at DESC`, stageID, + `SELECT `+instanceColumns+` FROM instances WHERE stage_id = ? 
ORDER BY created_at DESC`, stageID, ) if err != nil { return nil, fmt.Errorf("query instances: %w", err) @@ -77,9 +89,29 @@ func (s *Store) GetInstancesByStageID(stageID string) ([]Instance, error) { instances := []Instance{} for rows.Next() { - var inst Instance - if err := rows.Scan(&inst.ID, &inst.StageID, &inst.ProjectID, &inst.ContainerID, &inst.ImageTag, - &inst.Subdomain, &inst.NpmProxyID, &inst.Status, &inst.Port, &inst.CreatedAt, &inst.UpdatedAt); err != nil { + inst, err := scanInstance(rows) + if err != nil { + return nil, fmt.Errorf("scan instance: %w", err) + } + instances = append(instances, inst) + } + return instances, rows.Err() +} + +// ListAllInstances returns all instances across all stages. +func (s *Store) ListAllInstances() ([]Instance, error) { + rows, err := s.db.Query( + `SELECT ` + instanceColumns + ` FROM instances ORDER BY created_at DESC`, + ) + if err != nil { + return nil, fmt.Errorf("query all instances: %w", err) + } + defer rows.Close() + + instances := []Instance{} + for rows.Next() { + inst, err := scanInstance(rows) + if err != nil { return nil, fmt.Errorf("scan instance: %w", err) } instances = append(instances, inst) @@ -91,10 +123,11 @@ func (s *Store) GetInstancesByStageID(stageID string) ([]Instance, error) { func (s *Store) UpdateInstance(inst Instance) error { inst.UpdatedAt = Now() result, err := s.db.Exec( - `UPDATE instances SET stage_id=?, project_id=?, container_id=?, image_tag=?, subdomain=?, npm_proxy_id=?, status=?, port=?, updated_at=? + `UPDATE instances SET stage_id=?, project_id=?, container_id=?, image_tag=?, subdomain=?, npm_proxy_id=?, status=?, port=?, last_alive_at=?, updated_at=? 
WHERE id=?`, inst.StageID, inst.ProjectID, inst.ContainerID, inst.ImageTag, - inst.Subdomain, inst.NpmProxyID, inst.Status, inst.Port, inst.UpdatedAt, inst.ID, + inst.Subdomain, inst.NpmProxyID, inst.Status, inst.Port, + inst.LastAliveAt, inst.UpdatedAt, inst.ID, ) if err != nil { return fmt.Errorf("update instance: %w", err) @@ -123,6 +156,24 @@ func (s *Store) UpdateInstanceStatus(id string, status string) error { return nil } +// UpdateLastAliveAt sets the last_alive_at timestamp for an instance. +// Called when an instance is seen running. +func (s *Store) UpdateLastAliveAt(id string) error { + ts := Now() + result, err := s.db.Exec( + `UPDATE instances SET last_alive_at=?, updated_at=? WHERE id=?`, + ts, ts, id, + ) + if err != nil { + return fmt.Errorf("update last_alive_at: %w", err) + } + n, _ := result.RowsAffected() + if n == 0 { + return fmt.Errorf("instance %s: %w", id, ErrNotFound) + } + return nil +} + // DeleteInstance removes an instance by ID. func (s *Store) DeleteInstance(id string) error { result, err := s.db.Exec(`DELETE FROM instances WHERE id = ?`, id) diff --git a/internal/store/models.go b/internal/store/models.go index 72e823f..cd4eedd 100644 --- a/internal/store/models.go +++ b/internal/store/models.go @@ -55,8 +55,9 @@ type Settings struct { WebhookSecret string `json:"webhook_secret"` PollingInterval string `json:"polling_interval"` BaseVolumePath string `json:"base_volume_path"` - SSLCertificateID int `json:"ssl_certificate_id"` - UpdatedAt string `json:"updated_at"` + SSLCertificateID int `json:"ssl_certificate_id"` + StaleThresholdDays int `json:"stale_threshold_days"` + UpdatedAt string `json:"updated_at"` } // Instance represents a running (or stopped) container for a project stage. 
@@ -70,6 +71,7 @@ type Instance struct { NpmProxyID int `json:"npm_proxy_id"` Status string `json:"status"` // running, stopped, failed, removing Port int `json:"port"` + LastAliveAt string `json:"last_alive_at"` CreatedAt string `json:"created_at"` UpdatedAt string `json:"updated_at"` } @@ -117,3 +119,27 @@ type Volume struct { CreatedAt string `json:"created_at"` UpdatedAt string `json:"updated_at"` } + +// EventLog represents a persistent event log entry. +type EventLog struct { + ID int64 `json:"id"` + Source string `json:"source"` + Severity string `json:"severity"` // info, warn, error + Message string `json:"message"` + Metadata string `json:"metadata"` // JSON-encoded structured data + CreatedAt string `json:"created_at"` +} + +// StandaloneProxy represents a standalone reverse proxy not tied to a project. +type StandaloneProxy struct { + ID string `json:"id"` + Domain string `json:"domain"` + DestinationURL string `json:"destination_url"` + DestinationPort int `json:"destination_port"` + SSLCertificateID int `json:"ssl_certificate_id"` + NpmProxyID int `json:"npm_proxy_id"` + HealthStatus string `json:"health_status"` // unknown, healthy, unhealthy + HealthCheckedAt string `json:"health_checked_at"` + CreatedAt string `json:"created_at"` + UpdatedAt string `json:"updated_at"` +} diff --git a/internal/store/settings.go b/internal/store/settings.go index 1580cd8..d9ea761 100644 --- a/internal/store/settings.go +++ b/internal/store/settings.go @@ -9,10 +9,10 @@ func (s *Store) GetSettings() (Settings, error) { var st Settings err := s.db.QueryRow( `SELECT domain, server_ip, network, subdomain_pattern, notification_url, - npm_url, npm_email, npm_password, webhook_secret, polling_interval, base_volume_path, ssl_certificate_id, updated_at + npm_url, npm_email, npm_password, webhook_secret, polling_interval, base_volume_path, ssl_certificate_id, stale_threshold_days, updated_at FROM settings WHERE id = 1`, ).Scan(&st.Domain, &st.ServerIP, &st.Network, 
&st.SubdomainPattern, &st.NotificationURL, - &st.NpmURL, &st.NpmEmail, &st.NpmPassword, &st.WebhookSecret, &st.PollingInterval, &st.BaseVolumePath, &st.SSLCertificateID, &st.UpdatedAt) + &st.NpmURL, &st.NpmEmail, &st.NpmPassword, &st.WebhookSecret, &st.PollingInterval, &st.BaseVolumePath, &st.SSLCertificateID, &st.StaleThresholdDays, &st.UpdatedAt) if err != nil { return Settings{}, fmt.Errorf("query settings: %w", err) } @@ -25,10 +25,10 @@ func (s *Store) UpdateSettings(st Settings) error { _, err := s.db.Exec( `UPDATE settings SET domain=?, server_ip=?, network=?, subdomain_pattern=?, notification_url=?, - npm_url=?, npm_email=?, npm_password=?, webhook_secret=?, polling_interval=?, base_volume_path=?, ssl_certificate_id=?, updated_at=? + npm_url=?, npm_email=?, npm_password=?, webhook_secret=?, polling_interval=?, base_volume_path=?, ssl_certificate_id=?, stale_threshold_days=?, updated_at=? WHERE id = 1`, st.Domain, st.ServerIP, st.Network, st.SubdomainPattern, st.NotificationURL, - st.NpmURL, st.NpmEmail, st.NpmPassword, st.WebhookSecret, st.PollingInterval, st.BaseVolumePath, st.SSLCertificateID, st.UpdatedAt, + st.NpmURL, st.NpmEmail, st.NpmPassword, st.WebhookSecret, st.PollingInterval, st.BaseVolumePath, st.SSLCertificateID, st.StaleThresholdDays, st.UpdatedAt, ) if err != nil { return fmt.Errorf("update settings: %w", err) diff --git a/internal/store/standalone_proxy.go b/internal/store/standalone_proxy.go new file mode 100644 index 0000000..a1ce46e --- /dev/null +++ b/internal/store/standalone_proxy.go @@ -0,0 +1,120 @@ +package store + +import ( + "database/sql" + "errors" + "fmt" + + "github.com/google/uuid" +) + +// CreateStandaloneProxy inserts a new standalone proxy record. 
+func (s *Store) CreateStandaloneProxy(p StandaloneProxy) (StandaloneProxy, error) { + p.ID = uuid.New().String() + p.CreatedAt = Now() + p.UpdatedAt = p.CreatedAt + + if p.HealthStatus == "" { + p.HealthStatus = "unknown" + } + + _, err := s.db.Exec( + `INSERT INTO standalone_proxies (id, domain, destination_url, destination_port, ssl_certificate_id, npm_proxy_id, health_status, health_checked_at, created_at, updated_at) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, + p.ID, p.Domain, p.DestinationURL, p.DestinationPort, p.SSLCertificateID, + p.NpmProxyID, p.HealthStatus, p.HealthCheckedAt, p.CreatedAt, p.UpdatedAt, + ) + if err != nil { + return StandaloneProxy{}, fmt.Errorf("insert standalone proxy: %w", err) + } + return p, nil +} + +// GetStandaloneProxy returns a standalone proxy by ID. +func (s *Store) GetStandaloneProxy(id string) (StandaloneProxy, error) { + var p StandaloneProxy + err := s.db.QueryRow( + `SELECT id, domain, destination_url, destination_port, ssl_certificate_id, npm_proxy_id, health_status, health_checked_at, created_at, updated_at + FROM standalone_proxies WHERE id = ?`, id, + ).Scan(&p.ID, &p.Domain, &p.DestinationURL, &p.DestinationPort, &p.SSLCertificateID, + &p.NpmProxyID, &p.HealthStatus, &p.HealthCheckedAt, &p.CreatedAt, &p.UpdatedAt) + if errors.Is(err, sql.ErrNoRows) { + return StandaloneProxy{}, fmt.Errorf("standalone proxy %s: %w", id, ErrNotFound) + } + if err != nil { + return StandaloneProxy{}, fmt.Errorf("query standalone proxy: %w", err) + } + return p, nil +} + +// ListStandaloneProxies returns all standalone proxy records ordered by creation time. 
+func (s *Store) ListStandaloneProxies() ([]StandaloneProxy, error) { + rows, err := s.db.Query( + `SELECT id, domain, destination_url, destination_port, ssl_certificate_id, npm_proxy_id, health_status, health_checked_at, created_at, updated_at + FROM standalone_proxies ORDER BY created_at DESC`, + ) + if err != nil { + return nil, fmt.Errorf("query standalone proxies: %w", err) + } + defer rows.Close() + + proxies := []StandaloneProxy{} + for rows.Next() { + var p StandaloneProxy + if err := rows.Scan(&p.ID, &p.Domain, &p.DestinationURL, &p.DestinationPort, &p.SSLCertificateID, + &p.NpmProxyID, &p.HealthStatus, &p.HealthCheckedAt, &p.CreatedAt, &p.UpdatedAt); err != nil { + return nil, fmt.Errorf("scan standalone proxy: %w", err) + } + proxies = append(proxies, p) + } + return proxies, rows.Err() +} + +// UpdateStandaloneProxy updates an existing standalone proxy's mutable fields. +func (s *Store) UpdateStandaloneProxy(p StandaloneProxy) error { + p.UpdatedAt = Now() + result, err := s.db.Exec( + `UPDATE standalone_proxies SET domain=?, destination_url=?, destination_port=?, ssl_certificate_id=?, npm_proxy_id=?, health_status=?, health_checked_at=?, updated_at=? + WHERE id=?`, + p.Domain, p.DestinationURL, p.DestinationPort, p.SSLCertificateID, + p.NpmProxyID, p.HealthStatus, p.HealthCheckedAt, p.UpdatedAt, p.ID, + ) + if err != nil { + return fmt.Errorf("update standalone proxy: %w", err) + } + n, _ := result.RowsAffected() + if n == 0 { + return fmt.Errorf("standalone proxy %s: %w", p.ID, ErrNotFound) + } + return nil +} + +// DeleteStandaloneProxy removes a standalone proxy by ID. 
+func (s *Store) DeleteStandaloneProxy(id string) error { + result, err := s.db.Exec(`DELETE FROM standalone_proxies WHERE id = ?`, id) + if err != nil { + return fmt.Errorf("delete standalone proxy: %w", err) + } + n, _ := result.RowsAffected() + if n == 0 { + return fmt.Errorf("standalone proxy %s: %w", id, ErrNotFound) + } + return nil +} + +// UpdateProxyHealth updates the health status and check timestamp for a standalone proxy. +func (s *Store) UpdateProxyHealth(id string, status string) error { + ts := Now() + result, err := s.db.Exec( + `UPDATE standalone_proxies SET health_status=?, health_checked_at=?, updated_at=? WHERE id=?`, + status, ts, ts, id, + ) + if err != nil { + return fmt.Errorf("update proxy health: %w", err) + } + n, _ := result.RowsAffected() + if n == 0 { + return fmt.Errorf("standalone proxy %s: %w", id, ErrNotFound) + } + return nil +} diff --git a/internal/store/store.go b/internal/store/store.go index 9dbda01..0ea811f 100644 --- a/internal/store/store.go +++ b/internal/store/store.go @@ -81,6 +81,10 @@ func (s *Store) runMigrations() error { `ALTER TABLE stages ADD COLUMN enable_proxy INTEGER NOT NULL DEFAULT 1`, // Add ssl_certificate_id to settings (2026-03-29). `ALTER TABLE settings ADD COLUMN ssl_certificate_id INTEGER NOT NULL DEFAULT 0`, + // Add stale_threshold_days to settings (2026-03-30). + `ALTER TABLE settings ADD COLUMN stale_threshold_days INTEGER NOT NULL DEFAULT 7`, + // Add last_alive_at to instances for stale container detection (2026-03-30). 
+ `ALTER TABLE instances ADD COLUMN last_alive_at TEXT NOT NULL DEFAULT ''`, } for _, m := range migrations { @@ -98,6 +102,9 @@ func (s *Store) runMigrations() error { `CREATE INDEX IF NOT EXISTS idx_stages_project_id ON stages(project_id)`, `CREATE INDEX IF NOT EXISTS idx_stage_env_stage_id ON stage_env(stage_id)`, `CREATE INDEX IF NOT EXISTS idx_volumes_project_id ON volumes(project_id)`, + `CREATE INDEX IF NOT EXISTS idx_event_log_severity ON event_log(severity)`, + `CREATE INDEX IF NOT EXISTS idx_event_log_source ON event_log(source)`, + `CREATE INDEX IF NOT EXISTS idx_event_log_created_at ON event_log(created_at)`, } for _, idx := range indexes { if _, err := s.db.Exec(idx); err != nil { @@ -250,6 +257,28 @@ CREATE TABLE IF NOT EXISTS volumes ( created_at TEXT NOT NULL DEFAULT (datetime('now')), updated_at TEXT NOT NULL DEFAULT (datetime('now')) ); + +CREATE TABLE IF NOT EXISTS event_log ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + source TEXT NOT NULL DEFAULT '', + severity TEXT NOT NULL DEFAULT 'info', + message TEXT NOT NULL DEFAULT '', + metadata TEXT NOT NULL DEFAULT '{}', + created_at TEXT NOT NULL DEFAULT (datetime('now')) +); + +CREATE TABLE IF NOT EXISTS standalone_proxies ( + id TEXT PRIMARY KEY, + domain TEXT NOT NULL UNIQUE, + destination_url TEXT NOT NULL DEFAULT '', + destination_port INTEGER NOT NULL DEFAULT 0, + ssl_certificate_id INTEGER NOT NULL DEFAULT 0, + npm_proxy_id INTEGER NOT NULL DEFAULT 0, + health_status TEXT NOT NULL DEFAULT 'unknown', + health_checked_at TEXT NOT NULL DEFAULT '', + created_at TEXT NOT NULL DEFAULT (datetime('now')), + updated_at TEXT NOT NULL DEFAULT (datetime('now')) +); ` // Now returns the current time formatted for SQLite storage. 
diff --git a/plans/observability-proxy-mgmt/CONTEXT.md b/plans/observability-proxy-mgmt/CONTEXT.md new file mode 100644 index 0000000..025cb0e --- /dev/null +++ b/plans/observability-proxy-mgmt/CONTEXT.md @@ -0,0 +1,52 @@ +# Feature Context: Observability & Proxy Management + +## Configuration +- **Development mode:** Automated +- **Execution mode:** Orchestrator +- **Strategy:** Incremental +- **Build (full):** `make build` +- **Build (frontend):** `cd web && npm install && npm run build` +- **Build (backend):** `go build -o docker-watcher ./cmd/server` +- **Test:** `go test ./...` +- **Lint (backend):** `go vet ./...` +- **Lint (frontend):** `cd web && npm run check` +- **Dev server:** `make dev` (port: 8080) + +## Current State +Feature branch just created. No implementation yet. Codebase is fully working on main. + +## Temporary Workarounds +(none yet) + +## Cross-Phase Dependencies +- Phases 2 & 3 depend on Phase 1 (schema, event_log table, store methods) +- Phases 4, 5, 6, 7 depend on their respective backend phases (1-3) for API endpoints +- Phase 8 depends on Phases 1-3 for backend infrastructure and event system + +## Deferred Work +(none yet) + +## Failed Approaches +(none yet) + +## Review Findings Log +(none yet) + +## Phase Execution Log +| Phase | Agent Used | Test Writer | Parallel | Notes | +|-------|-----------|-------------|----------|-------| +| (none yet) | | | | | + +## Environment & Runtime Notes +- Build is currently blocked on Go 1.25 transitive dep from Docker SDK — may need to use Go 1.24 toolchain +- SQLite has MaxOpenConns=1, so all DB operations are serialized +- Frontend is embedded into Go binary via embed.FS + +## Implementation Notes +- Event bus (`internal/events/bus.go`) uses buffered channels (64 cap), non-blocking publish +- NPM client (`internal/npm/client.go`) handles JWT auth with auto-refresh +- Store uses additive migrations — new `ALTER TABLE` statements are appended to runMigrations(), errors ignored for idempotency +- 
New tables use `CREATE TABLE IF NOT EXISTS` in the schema constant +- All API responses use envelope pattern: `{success: bool, data?: T, error?: string}` +- Frontend types in `web/src/lib/types.ts` mirror Go models +- API functions centralized in `web/src/lib/api.ts` diff --git a/plans/observability-proxy-mgmt/PLAN.md b/plans/observability-proxy-mgmt/PLAN.md new file mode 100644 index 0000000..8f4ec19 --- /dev/null +++ b/plans/observability-proxy-mgmt/PLAN.md @@ -0,0 +1,71 @@ +# Feature: Observability & Proxy Management + +**Branch:** `feature/observability-proxy-mgmt` +**Base branch:** `main` +**Created:** 2026-03-30 +**Status:** 🟡 In Progress +**Strategy:** Incremental +**Mode:** Automated +**Execution:** Orchestrator + +## Summary + +Extend Docker Watcher with four interconnected features: stale container detection, +standalone proxy management with health monitoring, a unified proxy viewer, and a +persistent event log — plus container stats and notification triggers. + +## Build & Test Commands +- **Build (frontend):** `cd web && npm install && npm run build` +- **Build (backend):** `go build -o docker-watcher ./cmd/server` +- **Build (full):** `make build` +- **Test (backend):** `go test ./...` +- **Lint (backend):** `go vet ./...` +- **Lint (frontend):** `cd web && npm run check` + +## Tech Stack Summary +- **Backend:** Go 1.24, chi v5 router, SQLite (modernc.org/sqlite), Docker SDK (moby/moby/client) +- **Frontend:** SvelteKit 2.15, Svelte 5, TypeScript 5.7, Tailwind CSS 4, Vite 6 +- **Real-time:** Server-Sent Events with auto-reconnect +- **Auth:** JWT + optional OIDC +- **Encryption:** AES-256-GCM for credentials + +## Project Conventions +- **Go:** gofmt, small interfaces, error wrapping with `fmt.Errorf("context: %w", err)`, constructor injection +- **DB:** Single-row settings, additive migrations via `ALTER TABLE` (errors ignored for idempotency), `CREATE TABLE IF NOT EXISTS` for new tables +- **API:** Envelope pattern `{success, data?, error?}`, chi 
route groups, admin middleware for writes +- **Frontend:** Svelte 5 runes ($state, $derived, $effect), TypeScript interfaces mirroring Go models, centralized api.ts, custom components (no UI library) +- **Files:** Feature-organized, small focused files +- **State:** Immutable patterns, no mutation + +## Phases + +- [ ] Phase 1: Schema, Models & Event Log Backend [domain: backend] → [subplan](./phase-1-schema-eventlog.md) +- [ ] Phase 2: Stale Container Detection [domain: backend] → [subplan](./phase-2-stale-detection.md) +- [ ] Phase 3: Direct Proxy Creation with Validation [domain: backend] → [subplan](./phase-3-proxy-creation.md) +- [ ] Phase 4: Unified Proxy Viewer UI [domain: frontend] → [subplan](./phase-4-proxy-viewer.md) +- [ ] Phase 5: Stale Containers UI [domain: frontend] → [subplan](./phase-5-stale-ui.md) +- [ ] Phase 6: Direct Proxy Creation UI [domain: frontend] → [subplan](./phase-6-proxy-creation-ui.md) +- [ ] Phase 7: Event Log UI [domain: frontend] → [subplan](./phase-7-eventlog-ui.md) +- [ ] Phase 8: Container Stats & Notifications [domain: fullstack] → [subplan](./phase-8-stats-notifications.md) + +**Parallelizable phases:** +- Phases 4, 5, 6, 7 are all frontend phases that touch different routes/components and can potentially run in parallel after all backend phases (1-3) complete. 
+ +## Phase Progress Log + +| Phase | Domain | Status | Review | Build | Committed | +|-------|--------|--------|--------|-------|-----------| +| Phase 1: Schema & Event Log | backend | ⬜ Not Started | ⬜ | ⬜ | ⬜ | +| Phase 2: Stale Detection | backend | ⬜ Not Started | ⬜ | ⬜ | ⬜ | +| Phase 3: Proxy Creation | backend | ⬜ Not Started | ⬜ | ⬜ | ⬜ | +| Phase 4: Proxy Viewer UI | frontend | ⬜ Not Started | ⬜ | ⬜ | ⬜ | +| Phase 5: Stale Containers UI | frontend | ⬜ Not Started | ⬜ | ⬜ | ⬜ | +| Phase 6: Proxy Creation UI | frontend | ⬜ Not Started | ⬜ | ⬜ | ⬜ | +| Phase 7: Event Log UI | frontend | ⬜ Not Started | ⬜ | ⬜ | ⬜ | +| Phase 8: Stats & Notifications | fullstack | ⬜ Not Started | ⬜ | ⬜ | ⬜ | + +## Final Review +- [ ] Comprehensive code review +- [ ] Full build passes +- [ ] Full test suite passes +- [ ] Merged to `main` diff --git a/plans/observability-proxy-mgmt/phase-1-schema-eventlog.md b/plans/observability-proxy-mgmt/phase-1-schema-eventlog.md new file mode 100644 index 0000000..247d673 --- /dev/null +++ b/plans/observability-proxy-mgmt/phase-1-schema-eventlog.md @@ -0,0 +1,60 @@ +# Phase 1: Schema, Models & Event Log Backend + +**Status:** ⬜ Not Started +**Parent plan:** [PLAN.md](./PLAN.md) +**Domain:** backend + +## Objective +Lay the database foundation for all new features and implement the persistent event log system. 
+ +## Tasks + +- [ ] Task 1: Add `event_log` table to schema (id INTEGER PK AUTOINCREMENT, source TEXT, severity TEXT, message TEXT, metadata TEXT JSON, created_at TEXT) +- [ ] Task 2: Add `standalone_proxies` table to schema (id TEXT PK, domain TEXT UNIQUE, destination_url TEXT, destination_port INTEGER, ssl_certificate_id INTEGER, npm_proxy_id INTEGER, health_status TEXT, health_checked_at TEXT, created_at TEXT, updated_at TEXT) +- [ ] Task 3: Add `stale_threshold_days` column to settings table (migration, default 7) +- [ ] Task 4: Create `internal/store/eventlog.go` — store methods: InsertEvent, ListEvents (paginated, filterable by severity/source/date range), GetEventStats (counts by severity), PruneEvents (delete old entries) +- [ ] Task 5: Create `internal/store/standalone_proxy.go` — store methods: CreateStandaloneProxy, GetStandaloneProxy, ListStandaloneProxies, UpdateStandaloneProxy, DeleteStandaloneProxy, UpdateProxyHealth +- [ ] Task 6: Create Go models in `internal/store/models.go` — EventLog struct, StandaloneProxy struct +- [ ] Task 7: Update settings model to include stale_threshold_days field; update GetSettings/UpdateSettings +- [ ] Task 8: Enhance event bus to auto-persist warn/error events — add a subscriber in events.Bus that writes to store +- [ ] Task 9: Add API endpoints: `GET /api/events/log` (paginated, filterable), `GET /api/events/log/stats` +- [ ] Task 10: Add new SSE event type `event_log` — broadcast persistent events in real-time +- [ ] Task 11: Add frontend types: EventLogEntry, StandaloneProxy interfaces in types.ts +- [ ] Task 12: Add API functions in api.ts: fetchEventLog, fetchEventLogStats + +## Files to Modify/Create +- `internal/store/store.go` — Add schema for event_log, standalone_proxies tables; migration for stale_threshold_days +- `internal/store/models.go` — Add EventLog, StandaloneProxy structs; update Settings struct +- `internal/store/eventlog.go` — NEW: Event log store methods +- `internal/store/standalone_proxy.go` — 
NEW: Standalone proxy store methods +- `internal/store/settings.go` — Update GetSettings/UpdateSettings for new field +- `internal/events/bus.go` — Add persistent event subscriber +- `internal/api/router.go` — Mount new event log routes +- `internal/api/eventlog.go` — NEW: Event log HTTP handlers +- `web/src/lib/types.ts` — Add EventLogEntry, StandaloneProxy types +- `web/src/lib/api.ts` — Add fetchEventLog, fetchEventLogStats functions + +## Acceptance Criteria +- event_log and standalone_proxies tables created on startup (migration is idempotent) +- stale_threshold_days setting accessible via settings API +- Events with warn/error severity auto-persisted from event bus +- GET /api/events/log returns paginated, filterable results +- GET /api/events/log/stats returns severity counts +- Frontend types and API functions ready for downstream UI phases +- Existing functionality unchanged — all current tests/builds pass + +## Notes +- Follow existing migration pattern: ALTER TABLE errors ignored for idempotency +- event_log metadata is a JSON TEXT column for flexible structured data +- Pagination follows offset/limit pattern (no cursor — SQLite is simple enough) +- Event log pruning can be called from a cron job later (Phase 8) + +## Review Checklist +- [ ] All tasks completed +- [ ] Code follows project conventions +- [ ] No unintended side effects +- [ ] Build passes +- [ ] Tests pass (new + existing) + +## Handoff to Next Phase + diff --git a/plans/observability-proxy-mgmt/phase-2-stale-detection.md b/plans/observability-proxy-mgmt/phase-2-stale-detection.md new file mode 100644 index 0000000..aa10c15 --- /dev/null +++ b/plans/observability-proxy-mgmt/phase-2-stale-detection.md @@ -0,0 +1,55 @@ +# Phase 2: Stale Container Detection + +**Status:** ⬜ Not Started +**Parent plan:** [PLAN.md](./PLAN.md) +**Domain:** backend + +## Objective +Implement a periodic scanner that detects containers managed by docker-watcher which have been non-running for more than N configurable 
days, and exposes them via API. + +## Tasks + +- [ ] Task 1: Create `internal/stale/scanner.go` — Scanner struct with dependencies (store, docker client, event bus) +- [ ] Task 2: Implement scan logic: query all instances from store, check Docker container state via Docker SDK, compare against stale_threshold_days from settings +- [ ] Task 3: Add `last_alive_at` column to instances table (migration) — updated when instance is seen running +- [ ] Task 4: Update deployer/instance lifecycle to set last_alive_at when container starts/is seen running +- [ ] Task 5: Implement stale detection: instance is stale if status != 'running' AND (now - last_alive_at) > threshold days +- [ ] Task 6: Emit event_log warnings when containers become newly stale (avoid re-emitting for already-known stale containers) +- [ ] Task 7: Register scanner as cron job (reuse existing robfig/cron infrastructure from registry poller) +- [ ] Task 8: Add API endpoints: `GET /api/containers/stale` (list stale with project/stage info), `POST /api/containers/stale/{id}/cleanup` (remove single), `POST /api/containers/stale/cleanup` (bulk remove) +- [ ] Task 9: Cleanup handler: stop container via Docker SDK, remove instance from store, emit event +- [ ] Task 10: Wire scanner into main.go startup (after store, docker client, event bus init) + +## Files to Modify/Create +- `internal/stale/scanner.go` — NEW: Stale container scanner +- `internal/store/store.go` — Migration for last_alive_at column +- `internal/store/models.go` — Update Instance struct with LastAliveAt field +- `internal/store/instances.go` — Update queries to include last_alive_at; add UpdateLastAliveAt method +- `internal/api/router.go` — Mount stale container routes +- `internal/api/stale.go` — NEW: Stale container HTTP handlers +- `cmd/server/main.go` — Wire scanner with cron + +## Acceptance Criteria +- Scanner runs on configurable interval (e.g., every hour) +- Stale containers correctly identified based on threshold +- GET 
/api/containers/stale returns list with project name, stage name, image tag, last alive timestamp, days stale +- Cleanup endpoints properly stop Docker containers and remove from store +- Events emitted when containers become stale +- Existing deploy flow unaffected — last_alive_at updated on successful deploy +- Build passes, existing tests pass + +## Notes +- Scanner should handle gracefully: containers that no longer exist in Docker (already removed externally) +- Bulk cleanup should be admin-only +- Consider: scan interval could be derived from stale_threshold_days (e.g., scan every threshold/7 days, min 1h) +- Don't remove containers that are in 'removing' status (already being cleaned up) + +## Review Checklist +- [ ] All tasks completed +- [ ] Code follows project conventions +- [ ] No unintended side effects +- [ ] Build passes +- [ ] Tests pass (new + existing) + +## Handoff to Next Phase + diff --git a/plans/observability-proxy-mgmt/phase-3-proxy-creation.md b/plans/observability-proxy-mgmt/phase-3-proxy-creation.md new file mode 100644 index 0000000..c713044 --- /dev/null +++ b/plans/observability-proxy-mgmt/phase-3-proxy-creation.md @@ -0,0 +1,81 @@ +# Phase 3: Direct Proxy Creation with Validation + +**Status:** ⬜ Not Started +**Parent plan:** [PLAN.md](./PLAN.md) +**Domain:** backend + +## Objective +Implement standalone proxy creation with a multi-step validation pipeline that checks destination reachability, and periodic health monitoring for all standalone proxies. + +## Tasks + +- [ ] Task 1: Create `internal/proxy/validator.go` — validation pipeline: + - URL/port syntax validation + - DNS resolution check + - TCP port reachability (net.DialTimeout, 5s) + - HTTP health probe (GET to destination, 10s timeout) + - Returns structured ValidationResult with per-step pass/fail and diagnostic hints +- [ ] Task 2: Create `internal/proxy/hints.go` — diagnostic hint generator: + - DNS failure → "Domain cannot be resolved. 
Check DNS settings or use an IP address." + - TCP refused → "Port {port} is not accepting connections. Check if the service is running and the port is correct." + - TCP timeout → "Connection timed out. Possible firewall blocking. Check network/firewall rules." + - Host unreachable → "Host is not reachable. Verify the IP address and network connectivity." + - HTTP error → "Service responded with HTTP {status}. The service may not be healthy." +- [ ] Task 3: Create `internal/proxy/manager.go` — proxy lifecycle: + - CreateProxy: validate destination, create NPM proxy host (using npm.Client), assign SSL cert from settings, save to standalone_proxies table + - UpdateProxy: re-validate, update NPM proxy host, update store + - DeleteProxy: remove NPM proxy host, remove from store + - GetProxy/ListProxies: read from store with health status +- [ ] Task 4: Create `internal/proxy/health.go` — periodic health monitor: + - Cron job that checks all standalone proxies + - HTTP GET to destination URL/port + - Updates health_status (healthy/unhealthy/unknown) and health_checked_at in store + - Emits event_log on status change (healthy→unhealthy or vice versa) +- [ ] Task 5: Add API endpoints: + - `POST /api/proxies/validate` — run validation without creating + - `POST /api/proxies` — create standalone proxy + - `GET /api/proxies` — list standalone proxies + - `GET /api/proxies/{id}` — get single proxy + - `PUT /api/proxies/{id}` — update proxy + - `DELETE /api/proxies/{id}` — delete proxy + - `GET /api/proxies/all` — merged view: standalone + deploy-managed proxies (for Phase 4 UI) +- [ ] Task 6: Wire health monitor cron job in main.go +- [ ] Task 7: Add frontend API functions in api.ts: validateProxy, createProxy, listProxies, getProxy, updateProxy, deleteProxy, listAllProxies +- [ ] Task 8: Add frontend types: ValidationResult, ValidationStep, ProxyHealthStatus + +## Files to Modify/Create +- `internal/proxy/validator.go` — NEW: Validation pipeline +- `internal/proxy/hints.go` — 
NEW: Diagnostic hints +- `internal/proxy/manager.go` — NEW: Proxy lifecycle management +- `internal/proxy/health.go` — NEW: Health monitoring +- `internal/api/router.go` — Mount proxy routes +- `internal/api/proxy.go` — NEW: Proxy HTTP handlers +- `cmd/server/main.go` — Wire proxy manager and health monitor +- `web/src/lib/types.ts` — Add ValidationResult, ProxyHealthStatus types +- `web/src/lib/api.ts` — Add proxy API functions + +## Acceptance Criteria +- Validation pipeline returns structured results with specific failure hints +- POST /api/proxies/validate runs full check without side effects +- Proxy creation creates NPM proxy host with SSL cert from global settings +- Health monitor runs periodically and updates proxy status +- Events emitted on health status changes +- GET /api/proxies/all merges standalone and deploy-managed proxy data +- Build passes, existing tests pass + +## Notes +- Validation should be fast (short timeouts) — user waits for results +- Health monitor interval: every 5 minutes (configurable later) +- For /api/proxies/all: query NPM for all proxy hosts, join with instances table for managed proxies, join with standalone_proxies for standalone ones +- SSL cert auto-assigned from settings.ssl_certificate_id +- Consider: proxy domain must be unique across both standalone and managed proxies + +## Review Checklist +- [ ] All tasks completed +- [ ] Code follows project conventions +- [ ] No unintended side effects +- [ ] Build passes +- [ ] Tests pass (new + existing) + +## Handoff to Next Phase + diff --git a/plans/observability-proxy-mgmt/phase-4-proxy-viewer.md b/plans/observability-proxy-mgmt/phase-4-proxy-viewer.md new file mode 100644 index 0000000..e77218c --- /dev/null +++ b/plans/observability-proxy-mgmt/phase-4-proxy-viewer.md @@ -0,0 +1,56 @@ +# Phase 4: Unified Proxy Viewer UI + +**Status:** ⬜ Not Started +**Parent plan:** [PLAN.md](./PLAN.md) +**Domain:** frontend + +## Objective +Build a unified proxy viewer page showing ALL 
proxies (deploy-managed and standalone) with grouping, filtering, and real-time health indicators. + +## Tasks + +- [ ] Task 1: Create route `/proxies` with `+page.svelte` and `+page.ts` data loader +- [ ] Task 2: Create ProxyCard component — displays: domain, destination, SSL badge, health indicator (green/yellow/red dot), proxy type badge (managed/standalone), last health check timestamp +- [ ] Task 3: Create ProxyGroup component — collapsible section with project name header, stage sub-groups, proxy count badge +- [ ] Task 4: Create StandaloneProxyGroup component — separate collapsible section for user-created proxies +- [ ] Task 5: Implement filtering: by project, stage, health status (healthy/unhealthy/unknown), proxy type (managed/standalone), free-text search by domain/destination +- [ ] Task 6: Filter bar component with dropdown selects and search input +- [ ] Task 7: SSE integration — subscribe to proxy health events, update health indicators in real-time +- [ ] Task 8: Empty state — friendly message when no proxies exist, with link to create one +- [ ] Task 9: Add navigation link in sidebar layout (+layout.svelte) +- [ ] Task 10: Add i18n keys for proxy viewer page + +## Files to Modify/Create +- `web/src/routes/proxies/+page.svelte` — NEW: Proxy viewer page +- `web/src/routes/proxies/+page.ts` — NEW: Data loader +- `web/src/lib/components/ProxyCard.svelte` — NEW: Individual proxy display +- `web/src/lib/components/ProxyGroup.svelte` — NEW: Collapsible project/stage group +- `web/src/lib/components/ProxyFilter.svelte` — NEW: Filter bar +- `web/src/routes/+layout.svelte` — Add proxies nav link +- `web/src/lib/i18n/en.ts` (or equivalent) — Add proxy viewer strings + +## Acceptance Criteria +- All proxies visible: both deploy-managed and standalone +- Proxies grouped by project/stage in collapsible sections +- Health indicators show real-time status (green=healthy, red=unhealthy, yellow=unknown) +- Filtering works: project, stage, health, type, text search 
+- SSE updates health indicators without page refresh +- Navigation accessible from sidebar +- Responsive layout (mobile-friendly) + +## Notes +- Use existing component patterns (ConfirmDialog, FormField styles, etc.) +- Follow existing Svelte 5 patterns ($state, $derived, $effect) +- The /api/proxies/all endpoint from Phase 3 provides the data source +- Health indicator should pulse/animate briefly on status change +- Consider: show proxy count in sidebar nav badge + +## Review Checklist +- [ ] All tasks completed +- [ ] Code follows project conventions +- [ ] No unintended side effects +- [ ] Build passes +- [ ] Tests pass (new + existing) + +## Handoff to Next Phase + diff --git a/plans/observability-proxy-mgmt/phase-5-stale-ui.md b/plans/observability-proxy-mgmt/phase-5-stale-ui.md new file mode 100644 index 0000000..28adfb6 --- /dev/null +++ b/plans/observability-proxy-mgmt/phase-5-stale-ui.md @@ -0,0 +1,55 @@ +# Phase 5: Stale Containers UI + +**Status:** ⬜ Not Started +**Parent plan:** [PLAN.md](./PLAN.md) +**Domain:** frontend + +## Objective +Build the stale containers dashboard widget and dedicated view, with cleanup actions and settings configuration. 
+ +## Tasks + +- [ ] Task 1: Add API functions in api.ts: fetchStaleContainers, cleanupStaleContainer, bulkCleanupStaleContainers +- [ ] Task 2: Create StaleContainerCard component — shows: container name, project, stage, image tag, last alive timestamp, "X days stale" badge (color-coded by severity) +- [ ] Task 3: Create stale containers section on dashboard (+page.svelte) — count badge, mini-list of top 5 offenders, "View all" link +- [ ] Task 4: Create dedicated route `/containers/stale` with full stale container list +- [ ] Task 5: Individual cleanup action — ConfirmDialog with warning, calls cleanup API +- [ ] Task 6: Bulk cleanup action — "Clean up all" button with confirmation, progress indicator +- [ ] Task 7: Settings integration — add stale_threshold_days field to settings page with validation (min 1 day) +- [ ] Task 8: Add navigation link or sub-nav for stale containers +- [ ] Task 9: Add i18n keys for stale containers + +## Files to Modify/Create +- `web/src/lib/api.ts` — Add stale container API functions +- `web/src/lib/types.ts` — Add StaleContainer interface +- `web/src/lib/components/StaleContainerCard.svelte` — NEW: Stale container display +- `web/src/routes/+page.svelte` — Add stale containers dashboard widget +- `web/src/routes/containers/stale/+page.svelte` — NEW: Dedicated stale view +- `web/src/routes/containers/stale/+page.ts` — NEW: Data loader +- `web/src/routes/settings/+page.svelte` — Add stale threshold setting field +- `web/src/routes/+layout.svelte` — Add nav link if needed + +## Acceptance Criteria +- Dashboard shows stale container count and top offenders +- Dedicated page lists all stale containers with details +- Individual cleanup removes container with confirmation +- Bulk cleanup works with progress feedback +- Settings page allows configuring stale threshold +- Severity coloring: 7-13 days = yellow, 14+ days = red +- Responsive layout + +## Notes +- Reuse existing ConfirmDialog for destructive actions +- Dashboard widget should 
not slow down initial page load (lazy load or small payload) +- Stale container data comes from GET /api/containers/stale (Phase 2) +- Settings update uses existing PUT /api/settings endpoint + +## Review Checklist +- [ ] All tasks completed +- [ ] Code follows project conventions +- [ ] No unintended side effects +- [ ] Build passes +- [ ] Tests pass (new + existing) + +## Handoff to Next Phase + diff --git a/plans/observability-proxy-mgmt/phase-6-proxy-creation-ui.md b/plans/observability-proxy-mgmt/phase-6-proxy-creation-ui.md new file mode 100644 index 0000000..7ccf7df --- /dev/null +++ b/plans/observability-proxy-mgmt/phase-6-proxy-creation-ui.md @@ -0,0 +1,54 @@ +# Phase 6: Direct Proxy Creation UI + +**Status:** ⬜ Not Started +**Parent plan:** [PLAN.md](./PLAN.md) +**Domain:** frontend + +## Objective +Build the proxy creation form with live validation feedback, diagnostic hints, and management actions (edit/delete). + +## Tasks + +- [ ] Task 1: Create "Create Proxy" form component — fields: destination URL/IP, port, domain (auto-suggested from subdomain pattern), optional custom subdomain override +- [ ] Task 2: Live validation — debounced calls to POST /api/proxies/validate as user types (300ms debounce) +- [ ] Task 3: Validation result display — step-by-step checklist with icons: + - ✅ DNS resolution OK / ❌ DNS resolution failed + - ✅ TCP port reachable / ❌ TCP port not reachable + - ✅ HTTP responding / ❌ HTTP not responding + - Each failure shows the diagnostic hint from the backend +- [ ] Task 4: Create proxy submission — calls POST /api/proxies, shows success toast with health indicator +- [ ] Task 5: Edit proxy — modal or inline form, pre-populated with current values, re-validates on save +- [ ] Task 6: Delete proxy — ConfirmDialog with domain name confirmation +- [ ] Task 7: Integration with proxy viewer page — "Create Proxy" button in the proxy viewer header +- [ ] Task 8: Domain auto-suggestion — when user enters destination, suggest domain based 
on subdomain_pattern from settings +- [ ] Task 9: Add i18n keys for proxy creation + +## Files to Modify/Create +- `web/src/lib/components/ProxyForm.svelte` — NEW: Create/edit proxy form +- `web/src/lib/components/ValidationChecklist.svelte` — NEW: Step-by-step validation display +- `web/src/routes/proxies/+page.svelte` — Add "Create Proxy" button and modal/panel +- `web/src/lib/api.ts` — Ensure validateProxy, createProxy, updateProxy, deleteProxy are present (from Phase 3) + +## Acceptance Criteria +- Form validates destination in real-time with debouncing +- Each validation step shows pass/fail with diagnostic hints +- Proxy creation works end-to-end (form → API → NPM → success) +- Edit and delete work for existing standalone proxies +- Domain auto-suggestion works from settings pattern +- Error states handled gracefully (network errors, API failures) + +## Notes +- Validation should show a loading spinner while in progress +- Don't validate on every keystroke — use 300ms debounce +- Even if validation steps fail, still allow creation (user might know better — just warn) +- SSL certificate is applied automatically from global settings (no cert picker in form) + +## Review Checklist +- [ ] All tasks completed +- [ ] Code follows project conventions +- [ ] No unintended side effects +- [ ] Build passes +- [ ] Tests pass (new + existing) + +## Handoff to Next Phase + diff --git a/plans/observability-proxy-mgmt/phase-7-eventlog-ui.md b/plans/observability-proxy-mgmt/phase-7-eventlog-ui.md new file mode 100644 index 0000000..d17e39e --- /dev/null +++ b/plans/observability-proxy-mgmt/phase-7-eventlog-ui.md @@ -0,0 +1,54 @@ +# Phase 7: Event Log UI + +**Status:** ⬜ Not Started +**Parent plan:** [PLAN.md](./PLAN.md) +**Domain:** frontend + +## Objective +Build a persistent, searchable event log viewer with real-time streaming, filters, and resource linking. 
+ +## Tasks + +- [ ] Task 1: Create route `/events` with `+page.svelte` and `+page.ts` data loader +- [ ] Task 2: Create EventLogEntry component — timestamp, severity badge (info=blue, warn=yellow, error=red), source icon (container/proxy/deploy/system), message text, expandable metadata section +- [ ] Task 3: Create EventLogFilter component — filters: severity multi-select, source multi-select, date range picker (start/end), free-text search +- [ ] Task 4: Implement pagination — "Load more" button at bottom (offset/limit pattern matching API) +- [ ] Task 5: SSE integration — subscribe to event_log events, prepend new entries at top with subtle highlight animation +- [ ] Task 6: Quick actions — clickable links to related resources (e.g., click container name → go to project/stage, click proxy domain → go to proxy viewer) +- [ ] Task 7: Stats header — show counts by severity (from GET /api/events/log/stats), with colored badges +- [ ] Task 8: Add navigation link in sidebar +- [ ] Task 9: Add i18n keys for event log page + +## Files to Modify/Create +- `web/src/routes/events/+page.svelte` — NEW: Event log page +- `web/src/routes/events/+page.ts` — NEW: Data loader +- `web/src/lib/components/EventLogEntry.svelte` — NEW: Event entry display +- `web/src/lib/components/EventLogFilter.svelte` — NEW: Filter controls +- `web/src/routes/+layout.svelte` — Add events nav link +- `web/src/lib/sse.ts` — Add event_log SSE subscription helper (if needed) + +## Acceptance Criteria +- Event log shows all persistent events with severity and source +- Filters work: severity, source, date range, text search +- New events stream in real-time via SSE without page refresh +- Pagination loads older events on demand +- Quick actions link to related resources +- Stats header shows severity distribution +- Responsive layout + +## Notes +- Follow existing SSE patterns from deploy logs viewer +- Date range filter: consider "last hour", "last 24h", "last 7 days" presets + custom range +- 
Metadata section is JSON — render as formatted key-value pairs, not raw JSON +- Resource linking: parse source and metadata to construct navigation URLs +- Consider: auto-scroll to top when new event arrives (if user is at top), otherwise show "N new events" badge + +## Review Checklist +- [ ] All tasks completed +- [ ] Code follows project conventions +- [ ] No unintended side effects +- [ ] Build passes +- [ ] Tests pass (new + existing) + +## Handoff to Next Phase + diff --git a/plans/observability-proxy-mgmt/phase-8-stats-notifications.md b/plans/observability-proxy-mgmt/phase-8-stats-notifications.md new file mode 100644 index 0000000..857236b --- /dev/null +++ b/plans/observability-proxy-mgmt/phase-8-stats-notifications.md @@ -0,0 +1,67 @@ +# Phase 8: Container Stats & Notifications + +**Status:** ⬜ Not Started +**Parent plan:** [PLAN.md](./PLAN.md) +**Domain:** fullstack + +## Objective +Add container resource monitoring (CPU/memory), notification triggers for operational events, and a system health dashboard summary. 
+ +## Tasks + +- [ ] Task 1: Create `internal/docker/stats.go` — wrapper around Docker Stats API to get CPU %, memory usage/limit for a container +- [ ] Task 2: Add API endpoint: `GET /api/projects/{id}/stages/{stage}/instances/{iid}/stats` — returns current CPU/memory for an instance +- [ ] Task 3: Create SSE event type `container_stats` — periodically broadcast stats for running containers (every 30s) +- [ ] Task 4: Extend notification stub (`internal/notify/`) — implement webhook sender for events: + - Stale container detected + - Proxy health failure + - Deploy failure/rollback + - Format: JSON payload with event type, details, timestamp +- [ ] Task 5: Add notification settings UI — enable/disable per event type in settings page +- [ ] Task 6: Update instance cards in frontend — show CPU % bar and memory usage badge +- [ ] Task 7: Create ContainerStats component — mini CPU/memory visualization (progress bars) +- [ ] Task 8: Dashboard system health summary card — total containers (running/stopped), healthy/unhealthy proxies, recent error count (last 24h) +- [ ] Task 9: Wire notification sender to event bus — subscribe to relevant event types, fire notifications +- [ ] Task 10: Add event log pruning cron job — delete events older than 30 days (configurable) +- [ ] Task 11: Add i18n keys for stats and notifications + +## Files to Modify/Create +- `internal/docker/stats.go` — NEW: Docker Stats API wrapper +- `internal/api/stats.go` — NEW: Stats HTTP handler +- `internal/api/router.go` — Mount stats endpoint +- `internal/notify/sender.go` — Implement webhook notification sender +- `internal/notify/types.go` — NEW: Notification event types and payloads +- `cmd/server/main.go` — Wire notification subscriber and event pruning cron +- `web/src/lib/types.ts` — Add ContainerStats, NotificationSettings types +- `web/src/lib/api.ts` — Add fetchContainerStats function +- `web/src/lib/components/ContainerStats.svelte` — NEW: CPU/memory display +- 
`web/src/lib/components/SystemHealthCard.svelte` — NEW: Dashboard summary +- `web/src/routes/+page.svelte` — Add system health card to dashboard +- `web/src/routes/settings/+page.svelte` — Add notification settings section +- `web/src/lib/sse.ts` — Add container_stats SSE handler + +## Acceptance Criteria +- Container stats (CPU/memory) visible on instance cards +- Stats update in real-time via SSE +- Webhook notifications fire for configured event types +- Dashboard shows system health summary +- Event log auto-prunes old entries +- Settings page allows configuring notification preferences +- Build passes, existing tests pass + +## Notes +- Docker Stats API returns a stream — read one snapshot and close, don't hold the connection +- CPU calculation: (container CPU delta / system CPU delta) * 100 — needs two reads (a single non-streaming stats response already carries both: `cpu_stats` and the previous sample in `precpu_stats`) +- Memory: usage_bytes / limit_bytes * 100 for percentage +- Notification webhook format should be compatible with common receivers (Slack webhook, Discord webhook, generic HTTP) +- System health card: consider caching aggregated stats to avoid N+1 queries on dashboard load + +## Review Checklist +- [ ] All tasks completed +- [ ] Code follows project conventions +- [ ] No unintended side effects +- [ ] Build passes +- [ ] Tests pass (new + existing) + +## Handoff to Next Phase + diff --git a/web/src/lib/api.ts b/web/src/lib/api.ts index 9bbc5b1..7031d38 100644 --- a/web/src/lib/api.ts +++ b/web/src/lib/api.ts @@ -1,17 +1,24 @@ import type { ApiEnvelope, + ContainerStats, Deploy, DeployLog, + EventLogEntry, + EventLogStats, InspectResult, Instance, NpmCertificate, Project, ProjectDetail, + ProxyView, Registry, RegistryImage, Settings, + StaleContainer, Stage, StageEnv, + StandaloneProxy, + ValidationResult, Volume } from './types'; @@ -338,4 +345,92 @@ export function deleteVolume( return del<{ deleted: string }>(`/api/projects/${projectId}/volumes/${volId}`); } +// ── Event Log ─────────────────────────────────────────────────────── + +export function 
fetchEventLog(params?: { + severity?: string; + source?: string; + since?: string; + until?: string; + limit?: number; + offset?: number; +}): Promise { + const query = new URLSearchParams(); + if (params?.severity) query.set('severity', params.severity); + if (params?.source) query.set('source', params.source); + if (params?.since) query.set('since', params.since); + if (params?.until) query.set('until', params.until); + if (params?.limit) query.set('limit', String(params.limit)); + if (params?.offset) query.set('offset', String(params.offset)); + const qs = query.toString(); + return get(`/api/events/log${qs ? `?${qs}` : ''}`); +} + +export function fetchEventLogStats(): Promise { + return get('/api/events/log/stats'); +} + +// ── Proxies ───────────────────────────────────────────────────────── + +export function validateProxy(host: string, port: number): Promise { + return post('/api/proxies/validate', { host, port }); +} + +export function createProxy(data: { + domain: string; + destination_url: string; + destination_port: number; +}): Promise { + return post('/api/proxies', data); +} + +export function listProxies(): Promise { + return get('/api/proxies'); +} + +export function getProxy(id: string): Promise { + return get(`/api/proxies/${id}`); +} + +export function updateProxy( + id: string, + data: { domain: string; destination_url: string; destination_port: number } +): Promise { + return put(`/api/proxies/${id}`, data); +} + +export function deleteProxy(id: string): Promise<{ deleted: string }> { + return del<{ deleted: string }>(`/api/proxies/${id}`); +} + +export function listAllProxies(): Promise { + return get('/api/proxies/all'); +} + +// ── Stale Containers ──────────────────────────────────────────────── + +export function fetchStaleContainers(): Promise { + return get('/api/containers/stale'); +} + +export function cleanupStaleContainer(id: string): Promise<{ deleted: string }> { + return post<{ deleted: string 
}>(`/api/containers/stale/${id}/cleanup`); +} + +export function bulkCleanupStaleContainers(): Promise<{ deleted: number }> { + return post<{ deleted: number }>('/api/containers/stale/cleanup'); +} + +// ── Container Stats ──────────────────────────────────────────────── + +export function fetchContainerStats( + projectId: string, + stageId: string, + instanceId: string +): Promise { + return get( + `/api/projects/${projectId}/stages/${stageId}/instances/${instanceId}/stats` + ); +} + export { ApiError }; diff --git a/web/src/lib/components/ContainerStats.svelte b/web/src/lib/components/ContainerStats.svelte new file mode 100644 index 0000000..5339c54 --- /dev/null +++ b/web/src/lib/components/ContainerStats.svelte @@ -0,0 +1,104 @@ + + + +{#if stats} +
+ +
+ {$t('stats.cpu')} +
+
+
+ + {stats.cpu_percent.toFixed(1)}% + +
+ +
+ {$t('stats.mem')} +
+
+
+ + {formatBytes(stats.memory_usage)} / {formatBytes(stats.memory_limit)} + +
+
+{:else if error} +

{$t('stats.unavailable')}

+{/if} diff --git a/web/src/lib/components/EventLogEntry.svelte b/web/src/lib/components/EventLogEntry.svelte new file mode 100644 index 0000000..b3fb727 --- /dev/null +++ b/web/src/lib/components/EventLogEntry.svelte @@ -0,0 +1,161 @@ + + + +
+
+ +
+ {#if entry.source === 'deploy'} + + + + + + + {:else if entry.source === 'container'} + + + + + + {:else if entry.source === 'proxy'} + + + + + {:else} + + + + + + {/if} +
+ + +
+
+ + + {$t(severityLabelKeys[entry.severity] ?? 'events.severity.info')} + + + + + {$t(`events.source.${entry.source}`)} + + + + + {timeAgo(entry.created_at)} + +
+ + +

+ {entry.message} +

+ + + {#if hasMetadata} + + + {#if expanded} +
+ + + {#each Object.entries(parsedMetadata ?? {}) as [key, value]} + + + + + {/each} + +
{key}{typeof value === 'object' ? JSON.stringify(value) : String(value)}
+
+ {/if} + {/if} +
+
+
diff --git a/web/src/lib/components/EventLogFilter.svelte b/web/src/lib/components/EventLogFilter.svelte new file mode 100644 index 0000000..46652a6 --- /dev/null +++ b/web/src/lib/components/EventLogFilter.svelte @@ -0,0 +1,167 @@ + + + +
+
+ +
+ +
+ {#each allSeverities as sev} + + {/each} +
+
+ + +
+ +
+ {#each allSources as src} + + {/each} +
+
+ + +
+ +
+ {#each dateRangeOptions as opt} + + {/each} +
+
+ + +
+ +
+ + + + onsearchchange((e.target as HTMLInputElement).value)} + class="w-full rounded-md border border-[var(--border-primary)] bg-[var(--surface-page)] py-1.5 pl-8 pr-3 text-xs text-[var(--text-primary)] placeholder:text-[var(--text-tertiary)] focus:border-[var(--color-brand-500)] focus:outline-none focus:ring-1 focus:ring-[var(--color-brand-500)]" + /> +
+
+ + +
+ +
+
+
diff --git a/web/src/lib/components/InstanceCard.svelte b/web/src/lib/components/InstanceCard.svelte index d26f956..feb7a9d 100644 --- a/web/src/lib/components/InstanceCard.svelte +++ b/web/src/lib/components/InstanceCard.svelte @@ -4,6 +4,7 @@ + +
+ +
+
+
+ + + {#if isHealthy} + + {/if} + + + + + + {proxy.domain} + + +
+ + +

+ {proxy.destination} +

+
+ + + + {proxy.type} + +
+ + +
+ + {#if proxy.ssl_enabled} + + + SSL + + {/if} + + + + {healthLabel} + + + + {#if proxy.type === 'managed' && proxy.project_name} + + {proxy.project_name} + + {#if proxy.stage_name} + + {proxy.stage_name} + + {/if} + {/if} +
+ + +
+ {#if proxy.type === 'standalone'} + + + {$t('common.edit')} + + {:else} + + {/if} + + {#if proxy.created_at} +

+ {$t('proxies.lastChecked')}: {formatTimestamp(proxy.created_at)} +

+ {/if} +
+
diff --git a/web/src/lib/components/ProxyFilter.svelte b/web/src/lib/components/ProxyFilter.svelte new file mode 100644 index 0000000..80be70e --- /dev/null +++ b/web/src/lib/components/ProxyFilter.svelte @@ -0,0 +1,85 @@ + + + +
+ +
+ + onsearchchange(e.currentTarget.value)} + placeholder={$t('proxies.filter.search')} + class="w-full rounded-lg border border-[var(--border-primary)] bg-[var(--surface-card)] py-2 pl-9 pr-3 text-sm text-[var(--text-primary)] placeholder:text-[var(--text-tertiary)] transition-colors focus:border-[var(--color-brand-500)] focus:outline-none focus:ring-1 focus:ring-[var(--color-brand-500)]" + /> +
+ + + + + + + + + {#if hasFilters} + + {/if} +
diff --git a/web/src/lib/components/ProxyForm.svelte b/web/src/lib/components/ProxyForm.svelte new file mode 100644 index 0000000..d5a10fa --- /dev/null +++ b/web/src/lib/components/ProxyForm.svelte @@ -0,0 +1,292 @@ + + + +
+ +

{title}

+ + +
{ e.preventDefault(); handleSubmit(); }} class="space-y-4"> + + + + + + + +
+ + + +
+ + + {#if validationResult && !validationResult.valid} +

+ Validation reported issues but you can still create the proxy. +

+ {/if} + + + {#if submitError} +

{submitError}

+ {/if} + + +
+
+ {#if mode === 'edit'} + + {/if} +
+ +
+ + + +
+
+ +
+ + +{#if mode === 'edit'} + { deleteConfirmOpen = false; }} + /> +{/if} diff --git a/web/src/lib/components/ProxyGroup.svelte b/web/src/lib/components/ProxyGroup.svelte new file mode 100644 index 0000000..17cd76d --- /dev/null +++ b/web/src/lib/components/ProxyGroup.svelte @@ -0,0 +1,46 @@ + + + +
+ + + + + {#if expanded} +
+
+ {@render children()} +
+
+ {/if} +
diff --git a/web/src/lib/components/StaleContainerCard.svelte b/web/src/lib/components/StaleContainerCard.svelte new file mode 100644 index 0000000..9e5f12c --- /dev/null +++ b/web/src/lib/components/StaleContainerCard.svelte @@ -0,0 +1,85 @@ + + + +
+ +
+
+

+ {displayName} +

+
+ + {container.project_name} + + + {container.stage_name} + +
+
+ + + + + {container.days_stale} {$t('stale.daysStale')} + +
+ + +
+ + + {container.instance.image_tag} + + + + {$t('stale.lastAlive')}: {formatDate(container.instance.last_alive_at)} + + + {container.instance.status} + +
+ + +
+ +
+
diff --git a/web/src/lib/components/SystemHealthCard.svelte b/web/src/lib/components/SystemHealthCard.svelte new file mode 100644 index 0000000..d147a3a --- /dev/null +++ b/web/src/lib/components/SystemHealthCard.svelte @@ -0,0 +1,113 @@ + + + +{#if !loading} + +{/if} diff --git a/web/src/lib/components/ValidationChecklist.svelte b/web/src/lib/components/ValidationChecklist.svelte new file mode 100644 index 0000000..54f9a4d --- /dev/null +++ b/web/src/lib/components/ValidationChecklist.svelte @@ -0,0 +1,73 @@ + + + +{#if loading || result} +
+

+ {$t('proxies.validation.title')} +

+ + {#if loading && !result} +
+ + {$t('proxies.validation.checking')} +
+ {:else if result} +
    + {#each result.steps as step} +
  • +
    + {#if step.passed} + + + + {getStepLabel(step.name)} + {#if step.message} + — {step.message} + {/if} + {:else} + + + + {getStepLabel(step.name)} + {#if step.message} + — {step.message} + {/if} + {/if} +
    + {#if !step.passed && step.hint} +

    {step.hint}

    + {/if} +
  • + {/each} +
+ {/if} +
+{/if} diff --git a/web/src/lib/components/icons/IconEvents.svelte b/web/src/lib/components/icons/IconEvents.svelte new file mode 100644 index 0000000..207772a --- /dev/null +++ b/web/src/lib/components/icons/IconEvents.svelte @@ -0,0 +1,7 @@ + + diff --git a/web/src/lib/components/icons/IconLogout.svelte b/web/src/lib/components/icons/IconLogout.svelte new file mode 100644 index 0000000..b486ade --- /dev/null +++ b/web/src/lib/components/icons/IconLogout.svelte @@ -0,0 +1,9 @@ + + + + + + diff --git a/web/src/lib/components/icons/IconProxies.svelte b/web/src/lib/components/icons/IconProxies.svelte new file mode 100644 index 0000000..b6fdede --- /dev/null +++ b/web/src/lib/components/icons/IconProxies.svelte @@ -0,0 +1,7 @@ + + diff --git a/web/src/lib/components/icons/index.ts b/web/src/lib/components/icons/index.ts index a38f759..607706c 100644 --- a/web/src/lib/components/icons/index.ts +++ b/web/src/lib/components/icons/index.ts @@ -45,3 +45,6 @@ export { default as IconContainer } from './IconContainer.svelte'; export { default as IconHardDrive } from './IconHardDrive.svelte'; export { default as IconWifi } from './IconWifi.svelte'; export { default as IconRefresh } from './IconRefresh.svelte'; +export { default as IconProxies } from './IconProxies.svelte'; +export { default as IconEvents } from './IconEvents.svelte'; +export { default as IconLogout } from './IconLogout.svelte'; diff --git a/web/src/lib/i18n/en.json b/web/src/lib/i18n/en.json index 424d07d..aa5f03e 100644 --- a/web/src/lib/i18n/en.json +++ b/web/src/lib/i18n/en.json @@ -7,7 +7,10 @@ "dashboard": "Dashboard", "projects": "Projects", "deploy": "Deploy", - "settings": "Settings" + "proxies": "Proxies", + "events": "Events", + "settings": "Settings", + "logout": "Log out" }, "dashboard": { "title": "Dashboard", @@ -19,7 +22,8 @@ "retry": "Retry", "noProjects": "No projects yet.", "addFirst": "Add your first project", - "loadFailed": "Failed to load dashboard" + "loadFailed": "Failed to load 
dashboard", + "staleContainers": "Stale Containers" }, "projects": { "title": "Projects", @@ -176,7 +180,9 @@ "registries": "Registries", "credentials": "Credentials", "authentication": "Authentication", - "appearance": "Appearance" + "appearance": "Appearance", + "staleThreshold": "Stale threshold (days)", + "staleThresholdHelp": "Containers inactive for longer than this will be flagged as stale." }, "settingsGeneral": { "title": "General Settings", @@ -306,7 +312,8 @@ "createFailed": "Failed to create user", "deleteFailed": "Failed to delete user", "deleteConfirm": "Are you sure you want to delete this user?", - "usernameRequired": "Username and password are required" + "usernameRequired": "Username and password are required", + "password": "Password" }, "login": { "title": "Docker Watcher", @@ -320,6 +327,27 @@ "loginFailed": "Login failed", "networkError": "Network error" }, + "proxies": { + "title": "Proxy Manager", + "create": "Create Proxy", + "standalone": "Standalone Proxies", + "managed": "Managed Proxies", + "noProxies": "No proxies found", + "noProxiesDesc": "Create a standalone proxy or deploy a project with proxy enabled.", + "filter": { + "search": "Search by domain or destination...", + "health": "Health", + "type": "Type", + "all": "All", + "clear": "Clear filters" + }, + "health": { + "healthy": "Healthy", + "unhealthy": "Unhealthy", + "unknown": "Unknown" + }, + "lastChecked": "Last checked" + }, "common": { "cancel": "Cancel", "confirm": "Confirm", @@ -387,6 +415,108 @@ "search": "Search...", "noResults": "No results found" }, + "stale": { + "title": "Stale Containers", + "noStale": "No stale containers", + "noStaleDesc": "All containers are healthy and running.", + "cleanup": "Clean up", + "cleanupAll": "Clean up all", + "confirmCleanup": "This will stop and remove the container. Continue?", + "confirmBulkCleanup": "This will stop and remove all stale containers. 
Continue?", + "daysStale": "days stale", + "lastAlive": "Last alive", + "count": "Stale", + "cleanedUp": "Container cleaned up", + "bulkCleanedUp": "{count} containers cleaned up", + "cleanupFailed": "Cleanup failed", + "loadFailed": "Failed to load stale containers" + }, + "proxies": { + "title": "Proxies", + "create": "Create Proxy", + "noProxies": "No proxies configured yet.", + "noProxiesDesc": "Create a standalone proxy or deploy a project to see proxies here.", + "standalone": "Standalone Proxies", + "managed": "Managed", + "lastChecked": "Last checked", + "health": { + "healthy": "Healthy", + "unhealthy": "Unhealthy", + "unknown": "Unknown" + }, + "filter": { + "search": "Search proxies...", + "health": "Health", + "type": "Type", + "all": "All", + "clear": "Clear filters" + }, + "form": { + "title": "Create Proxy", + "editTitle": "Edit Proxy", + "destination": "Destination URL / IP", + "port": "Port", + "domain": "Domain", + "domainHelp": "The public domain for this proxy.", + "validate": "Validate", + "validating": "Validating...", + "create": "Create Proxy", + "save": "Save Changes", + "cancel": "Cancel", + "delete": "Delete", + "deleteConfirm": "Delete this proxy? This cannot be undone." 
+ }, + "validation": { + "title": "Destination Validation", + "syntax": "URL syntax", + "dns": "DNS resolution", + "tcp": "TCP connection", + "http": "HTTP response", + "checking": "Checking...", + "skipped": "Skipped" + } + }, + "events": { + "title": "Event Log", + "noEvents": "No events found", + "noEventsDesc": "Events will appear here as they occur.", + "loadMore": "Load more", + "newEvents": "new events", + "filter": { + "severity": "Severity", + "source": "Source", + "dateRange": "Date range", + "search": "Search events...", + "lastHour": "Last hour", + "last24h": "Last 24 hours", + "last7d": "Last 7 days", + "allTime": "All time", + "clear": "Clear filters" + }, + "severity": { + "info": "Info", + "warn": "Warning", + "error": "Error" + }, + "source": { + "deploy": "Deploy", + "container": "Container", + "proxy": "Proxy", + "system": "System" + }, + "metadata": "Details" + }, + "stats": { + "cpu": "CPU", + "mem": "MEM", + "unavailable": "Stats unavailable" + }, + "systemHealth": { + "title": "System Health", + "containers": "Containers", + "proxies": "Proxies", + "recentErrors": "Recent Errors" + }, "language": { "en": "English", "ru": "Russian" diff --git a/web/src/lib/i18n/ru.json b/web/src/lib/i18n/ru.json index 082dbf7..6431b55 100644 --- a/web/src/lib/i18n/ru.json +++ b/web/src/lib/i18n/ru.json @@ -7,7 +7,10 @@ "dashboard": "Панель", "projects": "Проекты", "deploy": "Деплой", - "settings": "Настройки" + "proxies": "Прокси", + "events": "События", + "settings": "Настройки", + "logout": "Выйти" }, "dashboard": { "title": "Панель управления", @@ -19,7 +22,8 @@ "retry": "Повторить", "noProjects": "Проектов пока нет.", "addFirst": "Добавьте первый проект", - "loadFailed": "Не удалось загрузить панель" + "loadFailed": "Не удалось загрузить панель", + "staleContainers": "Устаревшие контейнеры" }, "projects": { "title": "Проекты", @@ -176,7 +180,9 @@ "registries": "Реестры", "credentials": "Учётные данные", "authentication": "Аутентификация", - "appearance": 
"Внешний вид" + "appearance": "Внешний вид", + "staleThreshold": "Порог устаревания (дни)", + "staleThresholdHelp": "Контейнеры, неактивные дольше этого срока, будут помечены как устаревшие." }, "settingsGeneral": { "title": "Общие настройки", @@ -306,7 +312,8 @@ "createFailed": "Не удалось создать пользователя", "deleteFailed": "Не удалось удалить пользователя", "deleteConfirm": "Вы уверены, что хотите удалить этого пользователя?", - "usernameRequired": "Имя пользователя и пароль обязательны" + "usernameRequired": "Имя пользователя и пароль обязательны", + "password": "Пароль" }, "login": { "title": "Docker Watcher", @@ -320,6 +327,27 @@ "loginFailed": "Ошибка входа", "networkError": "Ошибка сети" }, + "proxies": { + "title": "Менеджер прокси", + "create": "Создать прокси", + "standalone": "Автономные прокси", + "managed": "Управляемые прокси", + "noProxies": "Прокси не найдены", + "noProxiesDesc": "Создайте автономный прокси или разверните проект с включённым прокси.", + "filter": { + "search": "Поиск по домену или назначению...", + "health": "Здоровье", + "type": "Тип", + "all": "Все", + "clear": "Сбросить фильтры" + }, + "health": { + "healthy": "Здоров", + "unhealthy": "Нездоров", + "unknown": "Неизвестно" + }, + "lastChecked": "Последняя проверка" + }, "common": { "cancel": "Отмена", "confirm": "Подтвердить", @@ -387,6 +415,108 @@ "search": "Поиск...", "noResults": "Ничего не найдено" }, + "stale": { + "title": "Устаревшие контейнеры", + "noStale": "Нет устаревших контейнеров", + "noStaleDesc": "Все контейнеры исправны и работают.", + "cleanup": "Очистить", + "cleanupAll": "Очистить все", + "confirmCleanup": "Это остановит и удалит контейнер. Продолжить?", + "confirmBulkCleanup": "Это остановит и удалит все устаревшие контейнеры. 
Продолжить?", + "daysStale": "дней устарел", + "lastAlive": "Последний раз жив", + "count": "Устаревшие", + "cleanedUp": "Контейнер очищен", + "bulkCleanedUp": "{count} контейнеров очищено", + "cleanupFailed": "Не удалось очистить", + "loadFailed": "Не удалось загрузить устаревшие контейнеры" + }, + "proxies": { + "title": "Прокси", + "create": "Создать прокси", + "noProxies": "Прокси ещё не настроены.", + "noProxiesDesc": "Создайте автономный прокси или разверните проект, чтобы увидеть прокси здесь.", + "standalone": "Автономные прокси", + "managed": "Управляемые", + "lastChecked": "Последняя проверка", + "health": { + "healthy": "Работает", + "unhealthy": "Недоступен", + "unknown": "Неизвестно" + }, + "filter": { + "search": "Поиск прокси...", + "health": "Здоровье", + "type": "Тип", + "all": "Все", + "clear": "Сбросить фильтры" + }, + "form": { + "title": "Создать прокси", + "editTitle": "Редактировать прокси", + "destination": "URL / IP назначения", + "port": "Порт", + "domain": "Домен", + "domainHelp": "Публичный домен для этого прокси.", + "validate": "Проверить", + "validating": "Проверка...", + "create": "Создать прокси", + "save": "Сохранить изменения", + "cancel": "Отмена", + "delete": "Удалить", + "deleteConfirm": "Удалить этот прокси? Это действие необратимо." 
+ }, + "validation": { + "title": "Проверка назначения", + "syntax": "Синтаксис URL", + "dns": "DNS разрешение", + "tcp": "TCP подключение", + "http": "HTTP ответ", + "checking": "Проверка...", + "skipped": "Пропущено" + } + }, + "events": { + "title": "Журнал событий", + "noEvents": "Событий не найдено", + "noEventsDesc": "События будут отображаться здесь по мере их возникновения.", + "loadMore": "Загрузить ещё", + "newEvents": "новых событий", + "filter": { + "severity": "Уровень", + "source": "Источник", + "dateRange": "Период", + "search": "Поиск событий...", + "lastHour": "Последний час", + "last24h": "Последние 24 часа", + "last7d": "Последние 7 дней", + "allTime": "За всё время", + "clear": "Сбросить фильтры" + }, + "severity": { + "info": "Инфо", + "warn": "Предупреждение", + "error": "Ошибка" + }, + "source": { + "deploy": "Развёртывание", + "container": "Контейнер", + "proxy": "Прокси", + "system": "Система" + }, + "metadata": "Подробности" + }, + "stats": { + "cpu": "ЦП", + "mem": "ОЗУ", + "unavailable": "Статистика недоступна" + }, + "systemHealth": { + "title": "Состояние системы", + "containers": "Контейнеры", + "proxies": "Прокси", + "recentErrors": "Недавние ошибки" + }, "language": { "en": "Английский", "ru": "Русский" diff --git a/web/src/lib/sse.ts b/web/src/lib/sse.ts index 43926a1..7f04651 100644 --- a/web/src/lib/sse.ts +++ b/web/src/lib/sse.ts @@ -7,7 +7,7 @@ // ── Types ────────────────────────────────────────────────────────── -export type SSEEventType = 'deploy_log' | 'instance_status' | 'deploy_status'; +export type SSEEventType = 'deploy_log' | 'instance_status' | 'deploy_status' | 'event_log'; export interface SSEEvent { type: SSEEventType; @@ -36,7 +36,16 @@ export interface DeployStatusPayload { error?: string; } -type SSEPayload = DeployLogPayload | InstanceStatusPayload | DeployStatusPayload; +export interface EventLogSSEPayload { + id: number; + source: string; + severity: string; + message: string; + metadata: string; + 
created_at: string; +} + +type SSEPayload = DeployLogPayload | InstanceStatusPayload | DeployStatusPayload | EventLogSSEPayload; export interface SSEOptions { /** Called for each SSE event received. */ @@ -179,6 +188,7 @@ export function connectDeployLogs( export function connectGlobalEvents(callbacks: { onInstanceStatus?: (payload: InstanceStatusPayload) => void; onDeployStatus?: (payload: DeployStatusPayload) => void; + onEventLog?: (payload: EventLogSSEPayload) => void; onOpen?: () => void; onError?: (attempt: number) => void; }): SSEConnection { @@ -188,6 +198,8 @@ export function connectGlobalEvents(callbacks: { callbacks.onInstanceStatus?.(event.payload as InstanceStatusPayload); } else if (event.type === 'deploy_status') { callbacks.onDeployStatus?.(event.payload as DeployStatusPayload); + } else if (event.type === 'event_log') { + callbacks.onEventLog?.(event.payload as EventLogSSEPayload); } }, onOpen: callbacks.onOpen, diff --git a/web/src/lib/types.ts b/web/src/lib/types.ts index c7dedc8..d80dd0a 100644 --- a/web/src/lib/types.ts +++ b/web/src/lib/types.ts @@ -106,6 +106,7 @@ export interface Settings { polling_interval: string; base_volume_path: string; ssl_certificate_id: number; + stale_threshold_days: number; updated_at: string; } @@ -170,3 +171,95 @@ export interface Volume { created_at: string; updated_at: string; } + +/** A persistent event log entry. */ +export interface EventLogEntry { + id: number; + source: string; + severity: 'info' | 'warn' | 'error'; + message: string; + metadata: string; + created_at: string; +} + +/** Severity counts for the event log. */ +export interface EventLogStats { + info: number; + warn: number; + error: number; + total: number; +} + +/** A standalone reverse proxy not tied to a project. 
*/ +export interface StandaloneProxy { + id: string; + domain: string; + destination_url: string; + destination_port: number; + ssl_certificate_id: number; + npm_proxy_id: number; + health_status: ProxyHealthStatus; + health_checked_at: string; + created_at: string; + updated_at: string; +} + +/** Health status for a proxy. */ +export type ProxyHealthStatus = 'unknown' | 'healthy' | 'unhealthy'; + +/** A container detected as stale by the backend poller. */ +export interface StaleContainer { + instance: { + id: string; + stage_id: string; + project_id: string; + container_id: string; + image_tag: string; + subdomain: string; + npm_proxy_id: number; + status: string; + port: number; + last_alive_at: string; + created_at: string; + updated_at: string; + }; + project_name: string; + stage_name: string; + days_stale: number; +} + +/** A single step in the validation pipeline. */ +export interface ValidationStep { + name: string; + passed: boolean; + message?: string; + hint?: string; +} + +/** Result of the proxy destination validation pipeline. */ +export interface ValidationResult { + valid: boolean; + steps: ValidationStep[]; +} + +/** Container CPU and memory stats from the Docker stats API. */ +export interface ContainerStats { + cpu_percent: number; + memory_usage: number; + memory_limit: number; + memory_percent: number; +} + +/** Unified view of standalone + deploy-managed proxies (from /api/proxies/all). 
*/ +export interface ProxyView { + id: string; + domain: string; + destination: string; + type: 'standalone' | 'managed'; + project_name?: string; + stage_name?: string; + health_status: ProxyHealthStatus; + ssl_enabled: boolean; + npm_proxy_id: number; + created_at: string; +} diff --git a/web/src/routes/+layout.svelte b/web/src/routes/+layout.svelte index f03609a..4aa25d8 100644 --- a/web/src/routes/+layout.svelte +++ b/web/src/routes/+layout.svelte @@ -6,7 +6,7 @@ import Toast from '$lib/components/Toast.svelte'; import ThemeToggle from '$lib/components/ThemeToggle.svelte'; import LocaleSwitcher from '$lib/components/LocaleSwitcher.svelte'; - import { IconDashboard, IconProjects, IconDeploy, IconSettings, IconMenu, IconX } from '$lib/components/icons'; + import { IconDashboard, IconProjects, IconDeploy, IconProxies, IconEvents, IconSettings, IconMenu, IconX, IconLogout } from '$lib/components/icons'; import { connectGlobalEvents, type SSEConnection } from '$lib/sse'; import { instanceStatusStore } from '$lib/stores/instance-status'; import { resolvedTheme, applyTheme } from '$lib/stores/theme'; @@ -22,6 +22,8 @@ { href: '/', labelKey: 'nav.dashboard', icon: 'dashboard' }, { href: '/projects', labelKey: 'nav.projects', icon: 'projects' }, { href: '/deploy', labelKey: 'nav.deploy', icon: 'deploy' }, + { href: '/proxies', labelKey: 'nav.proxies', icon: 'proxies' }, + { href: '/events', labelKey: 'nav.events', icon: 'events' }, { href: '/settings', labelKey: 'nav.settings', icon: 'settings' } ] as const; @@ -56,6 +58,15 @@ sidebarOpen = false; }); + function logout() { + if (typeof localStorage !== 'undefined') { + localStorage.removeItem('auth_token'); + } + sseConnection?.close(); + sseConnection = null; + window.location.href = '/login'; + } + onMount(() => { sseConnection = connectGlobalEvents({ onInstanceStatus(payload) { @@ -128,6 +139,10 @@ {:else if item.icon === 'deploy'} + {:else if item.icon === 'proxies'} + + {:else if item.icon === 'events'} + {:else if 
item.icon === 'settings'} {/if} @@ -145,7 +160,17 @@ -

{$t('app.name')} {$t('app.version')}

+
+

{$t('app.name')} {$t('app.version')}

+ +
diff --git a/web/src/routes/+page.svelte b/web/src/routes/+page.svelte index 68fa09b..5902ac2 100644 --- a/web/src/routes/+page.svelte +++ b/web/src/routes/+page.svelte @@ -1,14 +1,16 @@ @@ -79,7 +86,7 @@ -
+ + + +

{$t('dashboard.projects')}

diff --git a/web/src/routes/containers/stale/+page.svelte b/web/src/routes/containers/stale/+page.svelte new file mode 100644 index 0000000..9d54c2f --- /dev/null +++ b/web/src/routes/containers/stale/+page.svelte @@ -0,0 +1,152 @@ + + + + {$t('stale.title')} - {$t('app.name')} + + +
+ +
+

{$t('stale.title')}

+ {#if containers.length > 0} + + {/if} +
+ + + {#if loading} +
+ {#each Array(3) as _} + + {/each} +
+ {:else if error} +
+

{error}

+ +
+ {:else if containers.length === 0} + + {:else} +
+ {#each containers as container (container.id)} + + {/each} +
+ {/if} +
+ + + { confirmSingleId = ''; }} +/> + + + { confirmBulk = false; }} +/> diff --git a/web/src/routes/containers/stale/+page.ts b/web/src/routes/containers/stale/+page.ts new file mode 100644 index 0000000..161a35d --- /dev/null +++ b/web/src/routes/containers/stale/+page.ts @@ -0,0 +1,2 @@ +// Client-side only — data is fetched in the component. +export const ssr = false; diff --git a/web/src/routes/events/+page.svelte b/web/src/routes/events/+page.svelte new file mode 100644 index 0000000..ba1db48 --- /dev/null +++ b/web/src/routes/events/+page.svelte @@ -0,0 +1,314 @@ + + + +
+ +
+

{$t('events.title')}

+
+ + +
+
+
+ {$t('events.severity.info')} + {stats.info} +
+
+
+ {$t('events.severity.warn')} + {stats.warn} +
+
+
+ {$t('events.severity.error')} + {stats.error} +
+
+ Total + {stats.total} +
+
+ + + + + + {#if pendingNewEvents.length > 0} + + {/if} + + + {#if loading} +
+ + + + +
+ {:else if filteredEvents.length === 0} + + {:else} +
+ {#each filteredEvents as entry (entry.id)} + + {/each} + + + {#if hasMore && searchText.trim() === ''} +
+ +
+ {/if} +
+ {/if} +
diff --git a/web/src/routes/events/+page.ts b/web/src/routes/events/+page.ts new file mode 100644 index 0000000..7d860cf --- /dev/null +++ b/web/src/routes/events/+page.ts @@ -0,0 +1,2 @@ +// Event log page — all data loaded client-side. +export const ssr = false; diff --git a/web/src/routes/proxies/+page.svelte b/web/src/routes/proxies/+page.svelte new file mode 100644 index 0000000..0319b34 --- /dev/null +++ b/web/src/routes/proxies/+page.svelte @@ -0,0 +1,239 @@ + + + + + {$t('proxies.title')} - {$t('app.name')} + + + +
+
+
+ +
+
+

{$t('proxies.title')}

+ {#if !loading && proxies.length > 0} +

+ {proxies.length} {proxies.length === 1 ? 'proxy' : 'proxies'} +

+ {/if} +
+
+ + + + + + {$t('proxies.create')} + +
+ + +{#if loading} +
+ + {$t('common.loading')} +
+{:else if error} + +
+

{error}

+ +
+{:else if proxies.length === 0} + + +{:else} + +
+ { search = v; }} + onhealthchange={(v) => { healthFilter = v; }} + ontypechange={(v) => { typeFilter = v; }} + onclear={clearFilters} + /> +
+ + + {#if filtered().length === 0} +
+

{$t('proxies.noProxies')}

+ +
+ {:else} +
+ + {#if standaloneProxies.length > 0} + + {#each standaloneProxies as proxy (proxy.id)} + + {/each} + + {/if} + + + {#if managedGroups().length > 0} + {#each managedGroups() as group (group.projectName)} + + {#each group.stages as stage (stage.stageName)} + {#if group.stages.length > 1} +
+

+ {stage.stageName} +

+
+ {/if} + {#each stage.proxies as proxy (proxy.id)} + + {/each} + {/each} +
+ {/each} + {/if} +
+ {/if} +{/if} diff --git a/web/src/routes/proxies/+page.ts b/web/src/routes/proxies/+page.ts new file mode 100644 index 0000000..0aef742 --- /dev/null +++ b/web/src/routes/proxies/+page.ts @@ -0,0 +1 @@ +// Client-side loading — data is fetched in the component via $effect. diff --git a/web/src/routes/proxies/[id]/edit/+page.svelte b/web/src/routes/proxies/[id]/edit/+page.svelte new file mode 100644 index 0000000..ede08ee --- /dev/null +++ b/web/src/routes/proxies/[id]/edit/+page.svelte @@ -0,0 +1,94 @@ + + + + + {$t('proxies.form.editTitle')} - {$t('app.name')} + + + + + + +
+
+ +
+

{$t('proxies.form.editTitle')}

+
+ +{#if loading} +
+ + {$t('common.loading')} +
+{:else if error} + +{:else if proxy} + +
+ +
+{/if} diff --git a/web/src/routes/proxies/[id]/edit/+page.ts b/web/src/routes/proxies/[id]/edit/+page.ts new file mode 100644 index 0000000..12c4939 --- /dev/null +++ b/web/src/routes/proxies/[id]/edit/+page.ts @@ -0,0 +1 @@ +// Client-side loading — proxy data is fetched in the component. diff --git a/web/src/routes/proxies/create/+page.svelte b/web/src/routes/proxies/create/+page.svelte new file mode 100644 index 0000000..2e32f76 --- /dev/null +++ b/web/src/routes/proxies/create/+page.svelte @@ -0,0 +1,52 @@ + + + + + {$t('proxies.form.title')} - {$t('app.name')} + + + + + + +
+
+ +
+

{$t('proxies.form.title')}

+
+ + +
+ +
diff --git a/web/src/routes/proxies/create/+page.ts b/web/src/routes/proxies/create/+page.ts new file mode 100644 index 0000000..c480e82 --- /dev/null +++ b/web/src/routes/proxies/create/+page.ts @@ -0,0 +1 @@ +// Client-side loading — ProxyForm handles data fetching. diff --git a/web/src/routes/settings/+page.svelte b/web/src/routes/settings/+page.svelte index 9cff48f..3ad60a3 100644 --- a/web/src/routes/settings/+page.svelte +++ b/web/src/routes/settings/+page.svelte @@ -20,6 +20,7 @@ let pollingInterval = $state(''); let baseVolumePath = $state(''); let notificationUrl = $state(''); + let staleThresholdDays = $state('7'); let sslCertificateId = $state(0); let sslCertName = $state(''); @@ -79,6 +80,7 @@ baseVolumePath = settings.base_volume_path ?? ''; sslCertificateId = settings.ssl_certificate_id ?? 0; notificationUrl = settings.notification_url ?? ''; + staleThresholdDays = String(settings.stale_threshold_days ?? 7); } catch (err) { toasts.error(err instanceof Error ? err.message : $t('settingsGeneral.loadFailed')); } finally { @@ -101,7 +103,8 @@ domain: domain.trim(), server_ip: serverIp.trim(), network: network.trim(), subdomain_pattern: subdomainPattern.trim(), polling_interval: pollingInterval.trim(), base_volume_path: baseVolumePath.trim(), notification_url: notificationUrl.trim(), - ssl_certificate_id: sslCertificateId + ssl_certificate_id: sslCertificateId, + stale_threshold_days: Math.max(1, parseInt(staleThresholdDays, 10) || 7) }); toasts.success($t('settingsGeneral.saved')); } catch (err) { @@ -242,6 +245,21 @@
+ +
+

{$t('stale.title')}

+
+ +
+
+