diff --git a/cmd/server/main.go b/cmd/server/main.go index b91e2e6..c369f7d 100644 --- a/cmd/server/main.go +++ b/cmd/server/main.go @@ -93,6 +93,21 @@ func main() { notifier := notify.New() eventBus := events.New() + // Auto-persist warn/error events from the event bus to the database. + stopLogger := eventBus.RegisterPersistentLogger(func(source, severity, message, metadata string) (int64, string, error) { + evt, err := db.InsertEvent(store.EventLog{ + Source: source, + Severity: severity, + Message: message, + Metadata: metadata, + }) + if err != nil { + return 0, "", err + } + return evt.ID, evt.CreatedAt, nil + }) + defer stopLogger() + dep := deployer.New(dockerClient, npmClient, db, healthChecker, notifier, eventBus, encKey) // Initialize webhook handler. diff --git a/internal/api/eventlog.go b/internal/api/eventlog.go new file mode 100644 index 0000000..a4fd025 --- /dev/null +++ b/internal/api/eventlog.go @@ -0,0 +1,48 @@ +package api + +import ( + "log/slog" + "net/http" + "strconv" + + "github.com/alexei/docker-watcher/internal/store" +) + +// listEventLog handles GET /api/events/log. +// Supports query parameters: severity, source, since, until, limit, offset. +func (s *Server) listEventLog(w http.ResponseWriter, r *http.Request) { + q := r.URL.Query() + + limit, _ := strconv.Atoi(q.Get("limit")) + offset, _ := strconv.Atoi(q.Get("offset")) + + filter := store.EventLogFilter{ + Severity: q.Get("severity"), + Source: q.Get("source"), + Since: q.Get("since"), + Until: q.Get("until"), + Limit: limit, + Offset: offset, + } + + events, err := s.store.ListEvents(filter) + if err != nil { + slog.Error("failed to list events", "error", err) + respondError(w, http.StatusInternalServerError, "failed to list events") + return + } + + respondJSON(w, http.StatusOK, events) +} + +// getEventLogStats handles GET /api/events/log/stats. 
+func (s *Server) getEventLogStats(w http.ResponseWriter, r *http.Request) { + stats, err := s.store.GetEventStats() + if err != nil { + slog.Error("failed to get event stats", "error", err) + respondError(w, http.StatusInternalServerError, "failed to get event stats") + return + } + + respondJSON(w, http.StatusOK, stats) +} diff --git a/internal/api/router.go b/internal/api/router.go index dfb0221..8f4e656 100644 --- a/internal/api/router.go +++ b/internal/api/router.go @@ -125,6 +125,8 @@ func (s *Server) Router() chi.Router { r.Get("/deploys", s.listDeploys) r.Get("/deploys/{id}/logs", s.streamDeployLogs) r.Get("/events", s.streamEvents) + r.Get("/events/log", s.listEventLog) + r.Get("/events/log/stats", s.getEventLogStats) r.Get("/registries", s.listRegistries) r.Route("/registries/{id}", func(r chi.Router) { r.Get("/tags/*", s.listRegistryTags) diff --git a/internal/api/settings.go b/internal/api/settings.go index 22c6dbb..276dd45 100644 --- a/internal/api/settings.go +++ b/internal/api/settings.go @@ -24,7 +24,8 @@ type settingsRequest struct { NpmEmail string `json:"npm_email"` NpmPassword string `json:"npm_password"` PollingInterval string `json:"polling_interval"` - SSLCertificateID *int `json:"ssl_certificate_id,omitempty"` + SSLCertificateID *int `json:"ssl_certificate_id,omitempty"` + StaleThresholdDays *int `json:"stale_threshold_days,omitempty"` } // getSettings handles GET /api/settings. @@ -37,17 +38,18 @@ func (s *Server) getSettings(w http.ResponseWriter, r *http.Request) { // Return settings without sensitive fields. 
respondJSON(w, http.StatusOK, map[string]any{ - "domain": settings.Domain, - "server_ip": settings.ServerIP, - "network": settings.Network, - "subdomain_pattern": settings.SubdomainPattern, - "notification_url": settings.NotificationURL, - "npm_url": settings.NpmURL, - "npm_email": settings.NpmEmail, - "has_npm_password": settings.NpmPassword != "", - "polling_interval": settings.PollingInterval, - "ssl_certificate_id": settings.SSLCertificateID, - "updated_at": settings.UpdatedAt, + "domain": settings.Domain, + "server_ip": settings.ServerIP, + "network": settings.Network, + "subdomain_pattern": settings.SubdomainPattern, + "notification_url": settings.NotificationURL, + "npm_url": settings.NpmURL, + "npm_email": settings.NpmEmail, + "has_npm_password": settings.NpmPassword != "", + "polling_interval": settings.PollingInterval, + "ssl_certificate_id": settings.SSLCertificateID, + "stale_threshold_days": settings.StaleThresholdDays, + "updated_at": settings.UpdatedAt, }) } @@ -101,6 +103,13 @@ func (s *Server) updateSettings(w http.ResponseWriter, r *http.Request) { updated.SSLCertificateID = *req.SSLCertificateID sslChanged = true } + if req.StaleThresholdDays != nil { + if *req.StaleThresholdDays < 1 { + respondError(w, http.StatusBadRequest, "stale_threshold_days must be at least 1") + return + } + updated.StaleThresholdDays = *req.StaleThresholdDays + } if err := s.store.UpdateSettings(updated); err != nil { respondError(w, http.StatusInternalServerError, "failed to update settings: "+err.Error()) diff --git a/internal/api/sse.go b/internal/api/sse.go index 4882223..32b1538 100644 --- a/internal/api/sse.go +++ b/internal/api/sse.go @@ -150,9 +150,9 @@ func (s *Server) streamEvents(w http.ResponseWriter, r *http.Request) { w.WriteHeader(http.StatusOK) flusher.Flush() - // Subscribe to instance status and deploy status events. + // Subscribe to instance status, deploy status, and persistent event log events. 
sub := s.eventBus.Subscribe(func(evt events.Event) bool { - return evt.Type == events.EventInstanceStatus || evt.Type == events.EventDeployStatus + return evt.Type == events.EventInstanceStatus || evt.Type == events.EventDeployStatus || evt.Type == events.EventLog }) defer s.eventBus.Unsubscribe(sub) diff --git a/internal/events/bus.go b/internal/events/bus.go index a4097a2..5cc1a4c 100644 --- a/internal/events/bus.go +++ b/internal/events/bus.go @@ -2,6 +2,7 @@ package events import ( "encoding/json" + "log/slog" "sync" ) @@ -17,6 +18,9 @@ const ( // EventDeployStatus is emitted when a deploy status changes. EventDeployStatus EventType = "deploy_status" + + // EventLog is emitted when a persistent event is logged. + EventLog EventType = "event_log" ) // Event is a single event published on the bus. @@ -50,6 +54,72 @@ type DeployStatusPayload struct { Error string `json:"error,omitempty"` } +// EventLogPayload is the payload for EventLog events (persistent event log). +type EventLogPayload struct { + ID int64 `json:"id"` + Source string `json:"source"` + Severity string `json:"severity"` + Message string `json:"message"` + Metadata string `json:"metadata"` + CreatedAt string `json:"created_at"` +} + +// PersistFunc is a callback that persists an event log entry. +// It receives source, severity, message, and metadata (JSON string). +// It returns the persisted entry's ID and created_at timestamp. +type PersistFunc func(source, severity, message, metadata string) (int64, string, error) + +// RegisterPersistentLogger subscribes to the bus and auto-persists warn/error +// events by calling the provided persist function. It also re-publishes the +// persisted event as an EventLog so SSE clients receive it in real-time. +// Call the returned function to unsubscribe. +func (b *Bus) RegisterPersistentLogger(persist PersistFunc) func() { + sub := b.Subscribe(func(evt Event) bool { + // Only persist deploy log events with warn/error level. 
+ if evt.Type != EventDeployLog { + return false + } + p, ok := evt.Payload.(DeployLogPayload) + if !ok { + return false + } + return p.Level == "warn" || p.Level == "error" + }) + + go func() { + for evt := range sub { + p, ok := evt.Payload.(DeployLogPayload) + if !ok { + continue + } + metaBytes, _ := json.Marshal(map[string]string{"deploy_id": p.DeployID}) + metadata := string(metaBytes) + id, createdAt, err := persist("deploy", p.Level, p.Message, metadata) + if err != nil { + slog.Error("failed to persist event log", "source", "deploy", "level", p.Level, "error", err) + continue + } + + // Re-publish as EventLog for SSE clients. + b.Publish(Event{ + Type: EventLog, + Payload: EventLogPayload{ + ID: id, + Source: "deploy", + Severity: p.Level, + Message: p.Message, + Metadata: metadata, + CreatedAt: createdAt, + }, + }) + } + }() + + return func() { + b.Unsubscribe(sub) + } +} + // Subscriber is a channel that receives events. type Subscriber chan Event diff --git a/internal/store/eventlog.go b/internal/store/eventlog.go new file mode 100644 index 0000000..6c8a458 --- /dev/null +++ b/internal/store/eventlog.go @@ -0,0 +1,148 @@ +package store + +import ( + "fmt" + "strings" +) + +// EventLogFilter holds optional filters for listing event log entries. +type EventLogFilter struct { + Severity string // Filter by severity (info, warn, error). + Source string // Filter by source. + Since string // Only events created at or after this timestamp. + Until string // Only events created at or before this timestamp. + Limit int // Maximum number of results (default 50). + Offset int // Offset for pagination. +} + +// EventLogStats holds counts of event log entries by severity. +type EventLogStats struct { + Info int `json:"info"` + Warn int `json:"warn"` + Error int `json:"error"` + Total int `json:"total"` +} + +// InsertEvent inserts a new event log entry. 
+func (s *Store) InsertEvent(evt EventLog) (EventLog, error) {
+	// The store, not the caller, stamps the creation time.
+	evt.CreatedAt = Now()
+	// Keep the metadata column valid JSON even when the caller passes nothing.
+	if evt.Metadata == "" {
+		evt.Metadata = "{}"
+	}
+
+	result, err := s.db.Exec(
+		`INSERT INTO event_log (source, severity, message, metadata, created_at)
+		 VALUES (?, ?, ?, ?, ?)`,
+		evt.Source, evt.Severity, evt.Message, evt.Metadata, evt.CreatedAt,
+	)
+	if err != nil {
+		return EventLog{}, fmt.Errorf("insert event: %w", err)
+	}
+
+	id, err := result.LastInsertId()
+	if err != nil {
+		return EventLog{}, fmt.Errorf("get event id: %w", err)
+	}
+	evt.ID = id
+
+	return evt, nil
+}
+
+// ListEvents returns event log entries matching the given filter, newest first.
+//
+// Empty filter fields are ignored. Limit falls back to 50 when it is zero or
+// negative and is capped at 500. Rows are ordered by created_at and then by id
+// so that entries sharing a timestamp keep one stable order across pages.
+// On error the returned slice is nil.
+func (s *Store) ListEvents(filter EventLogFilter) ([]EventLog, error) {
+	var conditions []string
+	var args []any
+
+	if filter.Severity != "" {
+		conditions = append(conditions, "severity = ?")
+		args = append(args, filter.Severity)
+	}
+	if filter.Source != "" {
+		conditions = append(conditions, "source = ?")
+		args = append(args, filter.Source)
+	}
+	if filter.Since != "" {
+		conditions = append(conditions, "created_at >= ?")
+		args = append(args, filter.Since)
+	}
+	if filter.Until != "" {
+		conditions = append(conditions, "created_at <= ?")
+		args = append(args, filter.Until)
+	}
+
+	query := "SELECT id, source, severity, message, metadata, created_at FROM event_log"
+	if len(conditions) > 0 {
+		query += " WHERE " + strings.Join(conditions, " AND ")
+	}
+	// created_at is not unique, so id is the tie-breaker. Without it, rows with
+	// equal timestamps have no defined order and LIMIT/OFFSET paging can repeat
+	// or skip entries between requests.
+	query += " ORDER BY created_at DESC, id DESC"
+
+	limit := filter.Limit
+	if limit <= 0 {
+		limit = 50
+	}
+	if limit > 500 {
+		limit = 500
+	}
+	// Both values are ints, so formatting them into the statement cannot inject SQL.
+	query += fmt.Sprintf(" LIMIT %d OFFSET %d", limit, filter.Offset)
+
+	rows, err := s.db.Query(query, args...)
+	if err != nil {
+		return nil, fmt.Errorf("query events: %w", err)
+	}
+	defer rows.Close()
+
+	// Non-nil on purpose: a nil slice would marshal to JSON null instead of [].
+	events := []EventLog{}
+	for rows.Next() {
+		var evt EventLog
+		if err := rows.Scan(&evt.ID, &evt.Source, &evt.Severity, &evt.Message, &evt.Metadata, &evt.CreatedAt); err != nil {
+			return nil, fmt.Errorf("scan event: %w", err)
+		}
+		events = append(events, evt)
+	}
+	// Do not hand back a partial page together with an iteration error.
+	if err := rows.Err(); err != nil {
+		return nil, fmt.Errorf("iterate events: %w", err)
+	}
+	return events, nil
+}
+
+// GetEventStats returns counts of event log entries grouped by severity.
+//
+// Total includes every row, also rows whose severity is not one of
+// info/warn/error. On error the returned stats are the zero value.
+func (s *Store) GetEventStats() (EventLogStats, error) {
+	rows, err := s.db.Query(
+		`SELECT severity, COUNT(*) FROM event_log GROUP BY severity`,
+	)
+	if err != nil {
+		return EventLogStats{}, fmt.Errorf("query event stats: %w", err)
+	}
+	defer rows.Close()
+
+	var stats EventLogStats
+	for rows.Next() {
+		var severity string
+		var count int
+		if err := rows.Scan(&severity, &count); err != nil {
+			return EventLogStats{}, fmt.Errorf("scan event stats: %w", err)
+		}
+		switch severity {
+		case "info":
+			stats.Info = count
+		case "warn":
+			stats.Warn = count
+		case "error":
+			stats.Error = count
+		}
+		stats.Total += count
+	}
+	// Do not hand back partial counts together with an iteration error.
+	if err := rows.Err(); err != nil {
+		return EventLogStats{}, fmt.Errorf("iterate event stats: %w", err)
+	}
+	return stats, nil
+}
+
+// PruneEvents deletes event log entries older than the given number of days.
+func (s *Store) PruneEvents(olderThanDays int) (int64, error) { + if olderThanDays < 1 { + return 0, fmt.Errorf("prune events: olderThanDays must be >= 1, got %d", olderThanDays) + } + result, err := s.db.Exec( + `DELETE FROM event_log WHERE created_at < datetime('now', ?)`, + fmt.Sprintf("-%d days", olderThanDays), + ) + if err != nil { + return 0, fmt.Errorf("prune events: %w", err) + } + return result.RowsAffected() +} diff --git a/internal/store/models.go b/internal/store/models.go index 72e823f..5b0ed1b 100644 --- a/internal/store/models.go +++ b/internal/store/models.go @@ -55,8 +55,9 @@ type Settings struct { WebhookSecret string `json:"webhook_secret"` PollingInterval string `json:"polling_interval"` BaseVolumePath string `json:"base_volume_path"` - SSLCertificateID int `json:"ssl_certificate_id"` - UpdatedAt string `json:"updated_at"` + SSLCertificateID int `json:"ssl_certificate_id"` + StaleThresholdDays int `json:"stale_threshold_days"` + UpdatedAt string `json:"updated_at"` } // Instance represents a running (or stopped) container for a project stage. @@ -117,3 +118,27 @@ type Volume struct { CreatedAt string `json:"created_at"` UpdatedAt string `json:"updated_at"` } + +// EventLog represents a persistent event log entry. +type EventLog struct { + ID int64 `json:"id"` + Source string `json:"source"` + Severity string `json:"severity"` // info, warn, error + Message string `json:"message"` + Metadata string `json:"metadata"` // JSON-encoded structured data + CreatedAt string `json:"created_at"` +} + +// StandaloneProxy represents a standalone reverse proxy not tied to a project. 
+type StandaloneProxy struct { + ID string `json:"id"` + Domain string `json:"domain"` + DestinationURL string `json:"destination_url"` + DestinationPort int `json:"destination_port"` + SSLCertificateID int `json:"ssl_certificate_id"` + NpmProxyID int `json:"npm_proxy_id"` + HealthStatus string `json:"health_status"` // unknown, healthy, unhealthy + HealthCheckedAt string `json:"health_checked_at"` + CreatedAt string `json:"created_at"` + UpdatedAt string `json:"updated_at"` +} diff --git a/internal/store/settings.go b/internal/store/settings.go index 1580cd8..d9ea761 100644 --- a/internal/store/settings.go +++ b/internal/store/settings.go @@ -9,10 +9,10 @@ func (s *Store) GetSettings() (Settings, error) { var st Settings err := s.db.QueryRow( `SELECT domain, server_ip, network, subdomain_pattern, notification_url, - npm_url, npm_email, npm_password, webhook_secret, polling_interval, base_volume_path, ssl_certificate_id, updated_at + npm_url, npm_email, npm_password, webhook_secret, polling_interval, base_volume_path, ssl_certificate_id, stale_threshold_days, updated_at FROM settings WHERE id = 1`, ).Scan(&st.Domain, &st.ServerIP, &st.Network, &st.SubdomainPattern, &st.NotificationURL, - &st.NpmURL, &st.NpmEmail, &st.NpmPassword, &st.WebhookSecret, &st.PollingInterval, &st.BaseVolumePath, &st.SSLCertificateID, &st.UpdatedAt) + &st.NpmURL, &st.NpmEmail, &st.NpmPassword, &st.WebhookSecret, &st.PollingInterval, &st.BaseVolumePath, &st.SSLCertificateID, &st.StaleThresholdDays, &st.UpdatedAt) if err != nil { return Settings{}, fmt.Errorf("query settings: %w", err) } @@ -25,10 +25,10 @@ func (s *Store) UpdateSettings(st Settings) error { _, err := s.db.Exec( `UPDATE settings SET domain=?, server_ip=?, network=?, subdomain_pattern=?, notification_url=?, - npm_url=?, npm_email=?, npm_password=?, webhook_secret=?, polling_interval=?, base_volume_path=?, ssl_certificate_id=?, updated_at=? 
+ npm_url=?, npm_email=?, npm_password=?, webhook_secret=?, polling_interval=?, base_volume_path=?, ssl_certificate_id=?, stale_threshold_days=?, updated_at=? WHERE id = 1`, st.Domain, st.ServerIP, st.Network, st.SubdomainPattern, st.NotificationURL, - st.NpmURL, st.NpmEmail, st.NpmPassword, st.WebhookSecret, st.PollingInterval, st.BaseVolumePath, st.SSLCertificateID, st.UpdatedAt, + st.NpmURL, st.NpmEmail, st.NpmPassword, st.WebhookSecret, st.PollingInterval, st.BaseVolumePath, st.SSLCertificateID, st.StaleThresholdDays, st.UpdatedAt, ) if err != nil { return fmt.Errorf("update settings: %w", err) diff --git a/internal/store/standalone_proxy.go b/internal/store/standalone_proxy.go new file mode 100644 index 0000000..a1ce46e --- /dev/null +++ b/internal/store/standalone_proxy.go @@ -0,0 +1,120 @@ +package store + +import ( + "database/sql" + "errors" + "fmt" + + "github.com/google/uuid" +) + +// CreateStandaloneProxy inserts a new standalone proxy record. +func (s *Store) CreateStandaloneProxy(p StandaloneProxy) (StandaloneProxy, error) { + p.ID = uuid.New().String() + p.CreatedAt = Now() + p.UpdatedAt = p.CreatedAt + + if p.HealthStatus == "" { + p.HealthStatus = "unknown" + } + + _, err := s.db.Exec( + `INSERT INTO standalone_proxies (id, domain, destination_url, destination_port, ssl_certificate_id, npm_proxy_id, health_status, health_checked_at, created_at, updated_at) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, + p.ID, p.Domain, p.DestinationURL, p.DestinationPort, p.SSLCertificateID, + p.NpmProxyID, p.HealthStatus, p.HealthCheckedAt, p.CreatedAt, p.UpdatedAt, + ) + if err != nil { + return StandaloneProxy{}, fmt.Errorf("insert standalone proxy: %w", err) + } + return p, nil +} + +// GetStandaloneProxy returns a standalone proxy by ID. 
+func (s *Store) GetStandaloneProxy(id string) (StandaloneProxy, error) {
+	var p StandaloneProxy
+	err := s.db.QueryRow(
+		`SELECT id, domain, destination_url, destination_port, ssl_certificate_id, npm_proxy_id, health_status, health_checked_at, created_at, updated_at
+		 FROM standalone_proxies WHERE id = ?`, id,
+	).Scan(&p.ID, &p.Domain, &p.DestinationURL, &p.DestinationPort, &p.SSLCertificateID,
+		&p.NpmProxyID, &p.HealthStatus, &p.HealthCheckedAt, &p.CreatedAt, &p.UpdatedAt)
+	// A missing row is reported as ErrNotFound so callers can test it with errors.Is.
+	if errors.Is(err, sql.ErrNoRows) {
+		return StandaloneProxy{}, fmt.Errorf("standalone proxy %s: %w", id, ErrNotFound)
+	}
+	if err != nil {
+		return StandaloneProxy{}, fmt.Errorf("query standalone proxy: %w", err)
+	}
+	return p, nil
+}
+
+// ListStandaloneProxies returns all standalone proxy records, most recently
+// created first.
+func (s *Store) ListStandaloneProxies() ([]StandaloneProxy, error) {
+	rows, err := s.db.Query(
+		`SELECT id, domain, destination_url, destination_port, ssl_certificate_id, npm_proxy_id, health_status, health_checked_at, created_at, updated_at
+		 FROM standalone_proxies ORDER BY created_at DESC`,
+	)
+	if err != nil {
+		return nil, fmt.Errorf("query standalone proxies: %w", err)
+	}
+	defer rows.Close()
+
+	// Non-nil on purpose: a nil slice would marshal to JSON null instead of [].
+	proxies := []StandaloneProxy{}
+	for rows.Next() {
+		var p StandaloneProxy
+		if err := rows.Scan(&p.ID, &p.Domain, &p.DestinationURL, &p.DestinationPort, &p.SSLCertificateID,
+			&p.NpmProxyID, &p.HealthStatus, &p.HealthCheckedAt, &p.CreatedAt, &p.UpdatedAt); err != nil {
+			return nil, fmt.Errorf("scan standalone proxy: %w", err)
+		}
+		proxies = append(proxies, p)
+	}
+	return proxies, rows.Err()
+}
+
+// UpdateStandaloneProxy updates an existing standalone proxy's mutable fields
+// and refreshes updated_at. It returns an error wrapping ErrNotFound when no
+// row has the given ID.
+func (s *Store) UpdateStandaloneProxy(p StandaloneProxy) error {
+	p.UpdatedAt = Now()
+	result, err := s.db.Exec(
+		`UPDATE standalone_proxies SET domain=?, destination_url=?, destination_port=?, ssl_certificate_id=?, npm_proxy_id=?, health_status=?, health_checked_at=?, updated_at=?
+		 WHERE id=?`,
+		p.Domain, p.DestinationURL, p.DestinationPort, p.SSLCertificateID,
+		p.NpmProxyID, p.HealthStatus, p.HealthCheckedAt, p.UpdatedAt, p.ID,
+	)
+	if err != nil {
+		return fmt.Errorf("update standalone proxy: %w", err)
+	}
+	// Surface a driver failure instead of misreporting it as "not found".
+	n, err := result.RowsAffected()
+	if err != nil {
+		return fmt.Errorf("update standalone proxy: rows affected: %w", err)
+	}
+	if n == 0 {
+		return fmt.Errorf("standalone proxy %s: %w", p.ID, ErrNotFound)
+	}
+	return nil
+}
+
+// DeleteStandaloneProxy removes a standalone proxy by ID. It returns an error
+// wrapping ErrNotFound when no row has the given ID.
+func (s *Store) DeleteStandaloneProxy(id string) error {
+	result, err := s.db.Exec(`DELETE FROM standalone_proxies WHERE id = ?`, id)
+	if err != nil {
+		return fmt.Errorf("delete standalone proxy: %w", err)
+	}
+	// Surface a driver failure instead of misreporting it as "not found".
+	n, err := result.RowsAffected()
+	if err != nil {
+		return fmt.Errorf("delete standalone proxy: rows affected: %w", err)
+	}
+	if n == 0 {
+		return fmt.Errorf("standalone proxy %s: %w", id, ErrNotFound)
+	}
+	return nil
+}
+
+// UpdateProxyHealth updates the health status and check timestamp for a
+// standalone proxy. The same timestamp is written to health_checked_at and
+// updated_at. It returns an error wrapping ErrNotFound when no row has the
+// given ID.
+func (s *Store) UpdateProxyHealth(id string, status string) error {
+	ts := Now()
+	result, err := s.db.Exec(
+		`UPDATE standalone_proxies SET health_status=?, health_checked_at=?, updated_at=? WHERE id=?`,
+		status, ts, ts, id,
+	)
+	if err != nil {
+		return fmt.Errorf("update proxy health: %w", err)
+	}
+	// Surface a driver failure instead of misreporting it as "not found".
+	n, err := result.RowsAffected()
+	if err != nil {
+		return fmt.Errorf("update proxy health: rows affected: %w", err)
+	}
+	if n == 0 {
+		return fmt.Errorf("standalone proxy %s: %w", id, ErrNotFound)
+	}
+	return nil
+}
diff --git a/internal/store/store.go b/internal/store/store.go
index 9dbda01..e4cf857 100644
--- a/internal/store/store.go
+++ b/internal/store/store.go
@@ -81,6 +81,8 @@ func (s *Store) runMigrations() error {
 		`ALTER TABLE stages ADD COLUMN enable_proxy INTEGER NOT NULL DEFAULT 1`,
 		// Add ssl_certificate_id to settings (2026-03-29).
 		`ALTER TABLE settings ADD COLUMN ssl_certificate_id INTEGER NOT NULL DEFAULT 0`,
+		// Add stale_threshold_days to settings (2026-03-30).
+ `ALTER TABLE settings ADD COLUMN stale_threshold_days INTEGER NOT NULL DEFAULT 7`, } for _, m := range migrations { @@ -98,6 +100,9 @@ func (s *Store) runMigrations() error { `CREATE INDEX IF NOT EXISTS idx_stages_project_id ON stages(project_id)`, `CREATE INDEX IF NOT EXISTS idx_stage_env_stage_id ON stage_env(stage_id)`, `CREATE INDEX IF NOT EXISTS idx_volumes_project_id ON volumes(project_id)`, + `CREATE INDEX IF NOT EXISTS idx_event_log_severity ON event_log(severity)`, + `CREATE INDEX IF NOT EXISTS idx_event_log_source ON event_log(source)`, + `CREATE INDEX IF NOT EXISTS idx_event_log_created_at ON event_log(created_at)`, } for _, idx := range indexes { if _, err := s.db.Exec(idx); err != nil { @@ -250,6 +255,28 @@ CREATE TABLE IF NOT EXISTS volumes ( created_at TEXT NOT NULL DEFAULT (datetime('now')), updated_at TEXT NOT NULL DEFAULT (datetime('now')) ); + +CREATE TABLE IF NOT EXISTS event_log ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + source TEXT NOT NULL DEFAULT '', + severity TEXT NOT NULL DEFAULT 'info', + message TEXT NOT NULL DEFAULT '', + metadata TEXT NOT NULL DEFAULT '{}', + created_at TEXT NOT NULL DEFAULT (datetime('now')) +); + +CREATE TABLE IF NOT EXISTS standalone_proxies ( + id TEXT PRIMARY KEY, + domain TEXT NOT NULL UNIQUE, + destination_url TEXT NOT NULL DEFAULT '', + destination_port INTEGER NOT NULL DEFAULT 0, + ssl_certificate_id INTEGER NOT NULL DEFAULT 0, + npm_proxy_id INTEGER NOT NULL DEFAULT 0, + health_status TEXT NOT NULL DEFAULT 'unknown', + health_checked_at TEXT NOT NULL DEFAULT '', + created_at TEXT NOT NULL DEFAULT (datetime('now')), + updated_at TEXT NOT NULL DEFAULT (datetime('now')) +); ` // Now returns the current time formatted for SQLite storage. 
diff --git a/plans/observability-proxy-mgmt/CONTEXT.md b/plans/observability-proxy-mgmt/CONTEXT.md new file mode 100644 index 0000000..025cb0e --- /dev/null +++ b/plans/observability-proxy-mgmt/CONTEXT.md @@ -0,0 +1,52 @@ +# Feature Context: Observability & Proxy Management + +## Configuration +- **Development mode:** Automated +- **Execution mode:** Orchestrator +- **Strategy:** Incremental +- **Build (full):** `make build` +- **Build (frontend):** `cd web && npm install && npm run build` +- **Build (backend):** `go build -o docker-watcher ./cmd/server` +- **Test:** `go test ./...` +- **Lint (backend):** `go vet ./...` +- **Lint (frontend):** `cd web && npm run check` +- **Dev server:** `make dev` (port: 8080) + +## Current State +Feature branch just created. No implementation yet. Codebase is fully working on main. + +## Temporary Workarounds +(none yet) + +## Cross-Phase Dependencies +- Phases 2 & 3 depend on Phase 1 (schema, event_log table, store methods) +- Phases 4, 5, 6, 7 depend on their respective backend phases (1-3) for API endpoints +- Phase 8 depends on Phases 1-3 for backend infrastructure and event system + +## Deferred Work +(none yet) + +## Failed Approaches +(none yet) + +## Review Findings Log +(none yet) + +## Phase Execution Log +| Phase | Agent Used | Test Writer | Parallel | Notes | +|-------|-----------|-------------|----------|-------| +| (none yet) | | | | | + +## Environment & Runtime Notes +- Build is currently blocked on Go 1.25 transitive dep from Docker SDK — may need to use Go 1.24 toolchain +- SQLite has MaxOpenConns=1, so all DB operations are serialized +- Frontend is embedded into Go binary via embed.FS + +## Implementation Notes +- Event bus (`internal/events/bus.go`) uses buffered channels (64 cap), non-blocking publish +- NPM client (`internal/npm/client.go`) handles JWT auth with auto-refresh +- Store uses additive migrations — new `ALTER TABLE` statements are appended to runMigrations(), errors ignored for idempotency 
+- New tables use `CREATE TABLE IF NOT EXISTS` in the schema constant +- All API responses use envelope pattern: `{success: bool, data?: T, error?: string}` +- Frontend types in `web/src/lib/types.ts` mirror Go models +- API functions centralized in `web/src/lib/api.ts` diff --git a/plans/observability-proxy-mgmt/PLAN.md b/plans/observability-proxy-mgmt/PLAN.md new file mode 100644 index 0000000..8f4ec19 --- /dev/null +++ b/plans/observability-proxy-mgmt/PLAN.md @@ -0,0 +1,71 @@ +# Feature: Observability & Proxy Management + +**Branch:** `feature/observability-proxy-mgmt` +**Base branch:** `main` +**Created:** 2026-03-30 +**Status:** 🟡 In Progress +**Strategy:** Incremental +**Mode:** Automated +**Execution:** Orchestrator + +## Summary + +Extend Docker Watcher with four interconnected features: stale container detection, +standalone proxy management with health monitoring, a unified proxy viewer, and a +persistent event log — plus container stats and notification triggers. + +## Build & Test Commands +- **Build (frontend):** `cd web && npm install && npm run build` +- **Build (backend):** `go build -o docker-watcher ./cmd/server` +- **Build (full):** `make build` +- **Test (backend):** `go test ./...` +- **Lint (backend):** `go vet ./...` +- **Lint (frontend):** `cd web && npm run check` + +## Tech Stack Summary +- **Backend:** Go 1.24, chi v5 router, SQLite (modernc.org/sqlite), Docker SDK (moby/moby/client) +- **Frontend:** SvelteKit 2.15, Svelte 5, TypeScript 5.7, Tailwind CSS 4, Vite 6 +- **Real-time:** Server-Sent Events with auto-reconnect +- **Auth:** JWT + optional OIDC +- **Encryption:** AES-256-GCM for credentials + +## Project Conventions +- **Go:** gofmt, small interfaces, error wrapping with `fmt.Errorf("context: %w", err)`, constructor injection +- **DB:** Single-row settings, additive migrations via `ALTER TABLE` (errors ignored for idempotency), `CREATE TABLE IF NOT EXISTS` for new tables +- **API:** Envelope pattern `{success, data?, error?}`, 
chi route groups, admin middleware for writes +- **Frontend:** Svelte 5 runes ($state, $derived, $effect), TypeScript interfaces mirroring Go models, centralized api.ts, custom components (no UI library) +- **Files:** Feature-organized, small focused files +- **State:** Immutable patterns, no mutation + +## Phases + +- [ ] Phase 1: Schema, Models & Event Log Backend [domain: backend] → [subplan](./phase-1-schema-eventlog.md) +- [ ] Phase 2: Stale Container Detection [domain: backend] → [subplan](./phase-2-stale-detection.md) +- [ ] Phase 3: Direct Proxy Creation with Validation [domain: backend] → [subplan](./phase-3-proxy-creation.md) +- [ ] Phase 4: Unified Proxy Viewer UI [domain: frontend] → [subplan](./phase-4-proxy-viewer.md) +- [ ] Phase 5: Stale Containers UI [domain: frontend] → [subplan](./phase-5-stale-ui.md) +- [ ] Phase 6: Direct Proxy Creation UI [domain: frontend] → [subplan](./phase-6-proxy-creation-ui.md) +- [ ] Phase 7: Event Log UI [domain: frontend] → [subplan](./phase-7-eventlog-ui.md) +- [ ] Phase 8: Container Stats & Notifications [domain: fullstack] → [subplan](./phase-8-stats-notifications.md) + +**Parallelizable phases:** +- Phases 4, 5, 6, 7 are all frontend phases that touch different routes/components and can potentially run in parallel after all backend phases (1-3) complete. 
+ +## Phase Progress Log + +| Phase | Domain | Status | Review | Build | Committed | +|-------|--------|--------|--------|-------|-----------| +| Phase 1: Schema & Event Log | backend | ⬜ Not Started | ⬜ | ⬜ | ⬜ | +| Phase 2: Stale Detection | backend | ⬜ Not Started | ⬜ | ⬜ | ⬜ | +| Phase 3: Proxy Creation | backend | ⬜ Not Started | ⬜ | ⬜ | ⬜ | +| Phase 4: Proxy Viewer UI | frontend | ⬜ Not Started | ⬜ | ⬜ | ⬜ | +| Phase 5: Stale Containers UI | frontend | ⬜ Not Started | ⬜ | ⬜ | ⬜ | +| Phase 6: Proxy Creation UI | frontend | ⬜ Not Started | ⬜ | ⬜ | ⬜ | +| Phase 7: Event Log UI | frontend | ⬜ Not Started | ⬜ | ⬜ | ⬜ | +| Phase 8: Stats & Notifications | fullstack | ⬜ Not Started | ⬜ | ⬜ | ⬜ | + +## Final Review +- [ ] Comprehensive code review +- [ ] Full build passes +- [ ] Full test suite passes +- [ ] Merged to `main` diff --git a/plans/observability-proxy-mgmt/phase-1-schema-eventlog.md b/plans/observability-proxy-mgmt/phase-1-schema-eventlog.md new file mode 100644 index 0000000..247d673 --- /dev/null +++ b/plans/observability-proxy-mgmt/phase-1-schema-eventlog.md @@ -0,0 +1,60 @@ +# Phase 1: Schema, Models & Event Log Backend + +**Status:** ⬜ Not Started +**Parent plan:** [PLAN.md](./PLAN.md) +**Domain:** backend + +## Objective +Lay the database foundation for all new features and implement the persistent event log system. 
+ +## Tasks + +- [ ] Task 1: Add `event_log` table to schema (id INTEGER PK AUTOINCREMENT, source TEXT, severity TEXT, message TEXT, metadata TEXT JSON, created_at TEXT) +- [ ] Task 2: Add `standalone_proxies` table to schema (id TEXT PK, domain TEXT UNIQUE, destination_url TEXT, destination_port INTEGER, ssl_certificate_id INTEGER, npm_proxy_id INTEGER, health_status TEXT, health_checked_at TEXT, created_at TEXT, updated_at TEXT) +- [ ] Task 3: Add `stale_threshold_days` column to settings table (migration, default 7) +- [ ] Task 4: Create `internal/store/eventlog.go` — store methods: InsertEvent, ListEvents (paginated, filterable by severity/source/date range), GetEventStats (counts by severity), PruneEvents (delete old entries) +- [ ] Task 5: Create `internal/store/standalone_proxy.go` — store methods: CreateStandaloneProxy, GetStandaloneProxy, ListStandaloneProxies, UpdateStandaloneProxy, DeleteStandaloneProxy, UpdateProxyHealth +- [ ] Task 6: Create Go models in `internal/store/models.go` — EventLog struct, StandaloneProxy struct +- [ ] Task 7: Update settings model to include stale_threshold_days field; update GetSettings/SaveSettings +- [ ] Task 8: Enhance event bus to auto-persist warn/error events — add a subscriber in events.Bus that writes to store +- [ ] Task 9: Add API endpoints: `GET /api/events/log` (paginated, filterable), `GET /api/events/log/stats` +- [ ] Task 10: Add new SSE event type `event_log` — broadcast persistent events in real-time +- [ ] Task 11: Add frontend types: EventLogEntry, StandaloneProxy interfaces in types.ts +- [ ] Task 12: Add API functions in api.ts: fetchEventLog, fetchEventLogStats + +## Files to Modify/Create +- `internal/store/store.go` — Add schema for event_log, standalone_proxies tables; migration for stale_threshold_days +- `internal/store/models.go` — Add EventLog, StandaloneProxy structs; update Settings struct +- `internal/store/eventlog.go` — NEW: Event log store methods +- 
`internal/store/standalone_proxy.go` — NEW: Standalone proxy store methods +- `internal/store/settings.go` — Update GetSettings/SaveSettings for new field +- `internal/events/bus.go` — Add persistent event subscriber +- `internal/api/router.go` — Mount new event log routes +- `internal/api/eventlog.go` — NEW: Event log HTTP handlers +- `web/src/lib/types.ts` — Add EventLogEntry, StandaloneProxy types +- `web/src/lib/api.ts` — Add fetchEventLog, fetchEventLogStats functions + +## Acceptance Criteria +- event_log and standalone_proxies tables created on startup (migration is idempotent) +- stale_threshold_days setting accessible via settings API +- Events with warn/error severity auto-persisted from event bus +- GET /api/events/log returns paginated, filterable results +- GET /api/events/log/stats returns severity counts +- Frontend types and API functions ready for downstream UI phases +- Existing functionality unchanged — all current tests/builds pass + +## Notes +- Follow existing migration pattern: ALTER TABLE errors ignored for idempotency +- event_log metadata is a JSON TEXT column for flexible structured data +- Pagination follows offset/limit pattern (no cursor — SQLite is simple enough) +- Event log pruning can be called from a cron job later (Phase 8) + +## Review Checklist +- [ ] All tasks completed +- [ ] Code follows project conventions +- [ ] No unintended side effects +- [ ] Build passes +- [ ] Tests pass (new + existing) + +## Handoff to Next Phase + diff --git a/plans/observability-proxy-mgmt/phase-2-stale-detection.md b/plans/observability-proxy-mgmt/phase-2-stale-detection.md new file mode 100644 index 0000000..aa10c15 --- /dev/null +++ b/plans/observability-proxy-mgmt/phase-2-stale-detection.md @@ -0,0 +1,55 @@ +# Phase 2: Stale Container Detection + +**Status:** ⬜ Not Started +**Parent plan:** [PLAN.md](./PLAN.md) +**Domain:** backend + +## Objective +Implement a periodic scanner that detects containers managed by docker-watcher 
which have been non-running for more than N configurable days, and exposes them via API. + +## Tasks + +- [ ] Task 1: Create `internal/stale/scanner.go` — Scanner struct with dependencies (store, docker client, event bus) +- [ ] Task 2: Implement scan logic: query all instances from store, check Docker container state via Docker SDK, compare against stale_threshold_days from settings +- [ ] Task 3: Add `last_alive_at` column to instances table (migration) — updated when instance is seen running +- [ ] Task 4: Update deployer/instance lifecycle to set last_alive_at when container starts/is seen running +- [ ] Task 5: Implement stale detection: instance is stale if status != 'running' AND (now - last_alive_at) > threshold days +- [ ] Task 6: Emit event_log warnings when containers become newly stale (avoid re-emitting for already-known stale containers) +- [ ] Task 7: Register scanner as cron job (reuse existing robfig/cron infrastructure from registry poller) +- [ ] Task 8: Add API endpoints: `GET /api/containers/stale` (list stale with project/stage info), `POST /api/containers/stale/{id}/cleanup` (remove single), `POST /api/containers/stale/cleanup` (bulk remove) +- [ ] Task 9: Cleanup handler: stop container via Docker SDK, remove instance from store, emit event +- [ ] Task 10: Wire scanner into main.go startup (after store, docker client, event bus init) + +## Files to Modify/Create +- `internal/stale/scanner.go` — NEW: Stale container scanner +- `internal/store/store.go` — Migration for last_alive_at column +- `internal/store/models.go` — Update Instance struct with LastAliveAt field +- `internal/store/instances.go` — Update queries to include last_alive_at; add UpdateLastAliveAt method +- `internal/api/router.go` — Mount stale container routes +- `internal/api/stale.go` — NEW: Stale container HTTP handlers +- `cmd/server/main.go` — Wire scanner with cron + +## Acceptance Criteria +- Scanner runs on configurable interval (e.g., every hour) +-
Stale containers correctly identified based on threshold +- GET /api/containers/stale returns list with project name, stage name, image tag, last alive timestamp, days stale +- Cleanup endpoints properly stop Docker containers and remove from store +- Events emitted when containers become stale +- Existing deploy flow unaffected — last_alive_at updated on successful deploy +- Build passes, existing tests pass + +## Notes +- Scanner should handle gracefully: containers that no longer exist in Docker (already removed externally) +- Bulk cleanup should be admin-only +- Consider: scan interval could be derived from stale_threshold_days (e.g., scan every threshold/7 days, min 1h) +- Don't remove containers that are in 'removing' status (already being cleaned up) + +## Review Checklist +- [ ] All tasks completed +- [ ] Code follows project conventions +- [ ] No unintended side effects +- [ ] Build passes +- [ ] Tests pass (new + existing) + +## Handoff to Next Phase + diff --git a/plans/observability-proxy-mgmt/phase-3-proxy-creation.md b/plans/observability-proxy-mgmt/phase-3-proxy-creation.md new file mode 100644 index 0000000..c713044 --- /dev/null +++ b/plans/observability-proxy-mgmt/phase-3-proxy-creation.md @@ -0,0 +1,81 @@ +# Phase 3: Direct Proxy Creation with Validation + +**Status:** ⬜ Not Started +**Parent plan:** [PLAN.md](./PLAN.md) +**Domain:** backend + +## Objective +Implement standalone proxy creation with a multi-step validation pipeline that checks destination reachability, and periodic health monitoring for all standalone proxies.
+ +## Tasks + +- [ ] Task 1: Create `internal/proxy/validator.go` — validation pipeline: + - URL/port syntax validation + - DNS resolution check + - TCP port reachability (net.DialTimeout, 5s) + - HTTP health probe (GET to destination, 10s timeout) + - Returns structured ValidationResult with per-step pass/fail and diagnostic hints +- [ ] Task 2: Create `internal/proxy/hints.go` — diagnostic hint generator: + - DNS failure → "Domain cannot be resolved. Check DNS settings or use an IP address." + - TCP refused → "Port {port} is not accepting connections. Check if the service is running and the port is correct." + - TCP timeout → "Connection timed out. Possible firewall blocking. Check network/firewall rules." + - Host unreachable → "Host is not reachable. Verify the IP address and network connectivity." + - HTTP error → "Service responded with HTTP {status}. The service may not be healthy." +- [ ] Task 3: Create `internal/proxy/manager.go` — proxy lifecycle: + - CreateProxy: validate destination, create NPM proxy host (using npm.Client), assign SSL cert from settings, save to standalone_proxies table + - UpdateProxy: re-validate, update NPM proxy host, update store + - DeleteProxy: remove NPM proxy host, remove from store + - GetProxy/ListProxies: read from store with health status +- [ ] Task 4: Create `internal/proxy/health.go` — periodic health monitor: + - Cron job that checks all standalone proxies + - HTTP GET to destination URL/port + - Updates health_status (healthy/unhealthy/unknown) and health_checked_at in store + - Emits event_log on status change (healthy→unhealthy or vice versa) +- [ ] Task 5: Add API endpoints: + - `POST /api/proxies/validate` — run validation without creating + - `POST /api/proxies` — create standalone proxy + - `GET /api/proxies` — list standalone proxies + - `GET /api/proxies/{id}` — get single proxy + - `PUT /api/proxies/{id}` — update proxy + - `DELETE /api/proxies/{id}` — delete proxy + - `GET
/api/proxies/all` — merged view: standalone + deploy-managed proxies (for Phase 4 UI) +- [ ] Task 6: Wire health monitor cron job in main.go +- [ ] Task 7: Add frontend API functions in api.ts: validateProxy, createProxy, listProxies, getProxy, updateProxy, deleteProxy, listAllProxies +- [ ] Task 8: Add frontend types: ValidationResult, ValidationStep, ProxyHealthStatus + +## Files to Modify/Create +- `internal/proxy/validator.go` — NEW: Validation pipeline +- `internal/proxy/hints.go` — NEW: Diagnostic hints +- `internal/proxy/manager.go` — NEW: Proxy lifecycle management +- `internal/proxy/health.go` — NEW: Health monitoring +- `internal/api/router.go` — Mount proxy routes +- `internal/api/proxy.go` — NEW: Proxy HTTP handlers +- `cmd/server/main.go` — Wire proxy manager and health monitor +- `web/src/lib/types.ts` — Add ValidationResult, ProxyHealthStatus types +- `web/src/lib/api.ts` — Add proxy API functions + +## Acceptance Criteria +- Validation pipeline returns structured results with specific failure hints +- POST /api/proxies/validate runs full check without side effects +- Proxy creation creates NPM proxy host with SSL cert from global settings +- Health monitor runs periodically and updates proxy status +- Events emitted on health status changes +- GET /api/proxies/all merges standalone and deploy-managed proxy data +- Build passes, existing tests pass + +## Notes +- Validation should be fast (short timeouts) — user waits for results +- Health monitor interval: every 5 minutes (configurable later) +- For /api/proxies/all: query NPM for all proxy hosts, join with instances table for managed proxies, join with standalone_proxies for standalone ones +- SSL cert auto-assigned from settings.ssl_certificate_id +- Consider: proxy domain must be unique across both standalone and managed proxies + +## Review Checklist +- [ ] All tasks completed +- [ ] Code follows project conventions +- [ ] No unintended side effects +- [ ] Build passes +- [
] Tests pass (new + existing) + +## Handoff to Next Phase + diff --git a/plans/observability-proxy-mgmt/phase-4-proxy-viewer.md b/plans/observability-proxy-mgmt/phase-4-proxy-viewer.md new file mode 100644 index 0000000..e77218c --- /dev/null +++ b/plans/observability-proxy-mgmt/phase-4-proxy-viewer.md @@ -0,0 +1,56 @@ +# Phase 4: Unified Proxy Viewer UI + +**Status:** ⬜ Not Started +**Parent plan:** [PLAN.md](./PLAN.md) +**Domain:** frontend + +## Objective +Build a unified proxy viewer page showing ALL proxies (deploy-managed and standalone) with grouping, filtering, and real-time health indicators. + +## Tasks + +- [ ] Task 1: Create route `/proxies` with `+page.svelte` and `+page.ts` data loader +- [ ] Task 2: Create ProxyCard component — displays: domain, destination, SSL badge, health indicator (green/yellow/red dot), proxy type badge (managed/standalone), last health check timestamp +- [ ] Task 3: Create ProxyGroup component — collapsible section with project name header, stage sub-groups, proxy count badge +- [ ] Task 4: Create StandaloneProxyGroup component — separate collapsible section for user-created proxies +- [ ] Task 5: Implement filtering: by project, stage, health status (healthy/unhealthy/unknown), proxy type (managed/standalone), free-text search by domain/destination +- [ ] Task 6: Filter bar component with dropdown selects and search input +- [ ] Task 7: SSE integration — subscribe to proxy health events, update health indicators in real-time +- [ ] Task 8: Empty state — friendly message when no proxies exist, with link to create one +- [ ] Task 9: Add navigation link in sidebar layout (+layout.svelte) +- [ ] Task 10: Add i18n keys for proxy viewer page + +## Files to Modify/Create +- `web/src/routes/proxies/+page.svelte` — NEW: Proxy viewer page +- `web/src/routes/proxies/+page.ts` — NEW: Data loader +- `web/src/lib/components/ProxyCard.svelte` — NEW: Individual proxy display +- `web/src/lib/components/ProxyGroup.svelte` —
NEW: Collapsible project/stage group +- `web/src/lib/components/ProxyFilter.svelte` — NEW: Filter bar +- `web/src/routes/+layout.svelte` — Add proxies nav link +- `web/src/lib/i18n/en.ts` (or equivalent) — Add proxy viewer strings + +## Acceptance Criteria +- All proxies visible: both deploy-managed and standalone +- Proxies grouped by project/stage in collapsible sections +- Health indicators show real-time status (green=healthy, red=unhealthy, yellow=unknown) +- Filtering works: project, stage, health, type, text search +- SSE updates health indicators without page refresh +- Navigation accessible from sidebar +- Responsive layout (mobile-friendly) + +## Notes +- Use existing component patterns (ConfirmDialog, FormField styles, etc.) +- Follow existing Svelte 5 patterns ($state, $derived, $effect) +- The /api/proxies/all endpoint from Phase 3 provides the data source +- Health indicator should pulse/animate briefly on status change +- Consider: show proxy count in sidebar nav badge + +## Review Checklist +- [ ] All tasks completed +- [ ] Code follows project conventions +- [ ] No unintended side effects +- [ ] Build passes +- [ ] Tests pass (new + existing) + +## Handoff to Next Phase + diff --git a/plans/observability-proxy-mgmt/phase-5-stale-ui.md b/plans/observability-proxy-mgmt/phase-5-stale-ui.md new file mode 100644 index 0000000..28adfb6 --- /dev/null +++ b/plans/observability-proxy-mgmt/phase-5-stale-ui.md @@ -0,0 +1,55 @@ +# Phase 5: Stale Containers UI + +**Status:** ⬜ Not Started +**Parent plan:** [PLAN.md](./PLAN.md) +**Domain:** frontend + +## Objective +Build the stale containers dashboard widget and dedicated view, with cleanup actions and settings configuration.
+ +## Tasks + +- [ ] Task 1: Add API functions in api.ts: fetchStaleContainers, cleanupStaleContainer, bulkCleanupStaleContainers +- [ ] Task 2: Create StaleContainerCard component — shows: container name, project, stage, image tag, last alive timestamp, "X days stale" badge (color-coded by severity) +- [ ] Task 3: Create stale containers section on dashboard (+page.svelte) — count badge, mini-list of top 5 offenders, "View all" link +- [ ] Task 4: Create dedicated route `/containers/stale` with full stale container list +- [ ] Task 5: Individual cleanup action — ConfirmDialog with warning, calls cleanup API +- [ ] Task 6: Bulk cleanup action — "Clean up all" button with confirmation, progress indicator +- [ ] Task 7: Settings integration — add stale_threshold_days field to settings page with validation (min 1 day) +- [ ] Task 8: Add navigation link or sub-nav for stale containers +- [ ] Task 9: Add i18n keys for stale containers + +## Files to Modify/Create +- `web/src/lib/api.ts` — Add stale container API functions +- `web/src/lib/types.ts` — Add StaleContainer interface +- `web/src/lib/components/StaleContainerCard.svelte` — NEW: Stale container display +- `web/src/routes/+page.svelte` — Add stale containers dashboard widget +- `web/src/routes/containers/stale/+page.svelte` — NEW: Dedicated stale view +- `web/src/routes/containers/stale/+page.ts` — NEW: Data loader +- `web/src/routes/settings/+page.svelte` — Add stale threshold setting field +- `web/src/routes/+layout.svelte` — Add nav link if needed + +## Acceptance Criteria +- Dashboard shows stale container count and top offenders +- Dedicated page lists all stale containers with details +- Individual cleanup removes container with confirmation +- Bulk cleanup works with progress feedback +- Settings page allows configuring stale threshold +- Severity coloring: 7-14 days = yellow, 14+ days = red +- Responsive layout + +## Notes +- Reuse existing ConfirmDialog for destructive actions
+- Dashboard widget should not slow down initial page load (lazy load or small payload) +- Stale container data comes from GET /api/containers/stale (Phase 2) +- Settings update uses existing PUT /api/settings endpoint + +## Review Checklist +- [ ] All tasks completed +- [ ] Code follows project conventions +- [ ] No unintended side effects +- [ ] Build passes +- [ ] Tests pass (new + existing) + +## Handoff to Next Phase + diff --git a/plans/observability-proxy-mgmt/phase-6-proxy-creation-ui.md b/plans/observability-proxy-mgmt/phase-6-proxy-creation-ui.md new file mode 100644 index 0000000..7ccf7df --- /dev/null +++ b/plans/observability-proxy-mgmt/phase-6-proxy-creation-ui.md @@ -0,0 +1,54 @@ +# Phase 6: Direct Proxy Creation UI + +**Status:** ⬜ Not Started +**Parent plan:** [PLAN.md](./PLAN.md) +**Domain:** frontend + +## Objective +Build the proxy creation form with live validation feedback, diagnostic hints, and management actions (edit/delete). + +## Tasks + +- [ ] Task 1: Create "Create Proxy" form component — fields: destination URL/IP, port, domain (auto-suggested from subdomain pattern), optional custom subdomain override +- [ ] Task 2: Live validation — debounced calls to POST /api/proxies/validate as user types (300ms debounce) +- [ ] Task 3: Validation result display — step-by-step checklist with icons: + - ✅ DNS resolution OK / ❌ DNS resolution failed + - ✅ TCP port reachable / ❌ TCP port not reachable + - ✅ HTTP responding / ❌ HTTP not responding + - Each failure shows the diagnostic hint from the backend +- [ ] Task 4: Create proxy submission — calls POST /api/proxies, shows success toast with health indicator +- [ ] Task 5: Edit proxy — modal or inline form, pre-populated with current values, re-validates on save +- [ ] Task 6: Delete proxy — ConfirmDialog with domain name confirmation +- [ ] Task 7: Integration with proxy viewer page — "Create Proxy" button in the proxy viewer header +- [ ] Task 8: Domain auto-suggestion —
when user enters destination, suggest domain based on subdomain_pattern from settings +- [ ] Task 9: Add i18n keys for proxy creation + +## Files to Modify/Create +- `web/src/lib/components/ProxyForm.svelte` — NEW: Create/edit proxy form +- `web/src/lib/components/ValidationChecklist.svelte` — NEW: Step-by-step validation display +- `web/src/routes/proxies/+page.svelte` — Add "Create Proxy" button and modal/panel +- `web/src/lib/api.ts` — Ensure validateProxy, createProxy, updateProxy, deleteProxy are present (from Phase 3) + +## Acceptance Criteria +- Form validates destination in real-time with debouncing +- Each validation step shows pass/fail with diagnostic hints +- Proxy creation works end-to-end (form → API → NPM → success) +- Edit and delete work for existing standalone proxies +- Domain auto-suggestion works from settings pattern +- Error states handled gracefully (network errors, API failures) + +## Notes +- Validation should show a loading spinner while in progress +- Don't validate on every keystroke — use 300ms debounce +- If all validation steps fail, still allow creation (user might know better — just warn) +- SSL certificate is applied automatically from global settings (no cert picker in form) + +## Review Checklist +- [ ] All tasks completed +- [ ] Code follows project conventions +- [ ] No unintended side effects +- [ ] Build passes +- [ ] Tests pass (new + existing) + +## Handoff to Next Phase + diff --git a/plans/observability-proxy-mgmt/phase-7-eventlog-ui.md b/plans/observability-proxy-mgmt/phase-7-eventlog-ui.md new file mode 100644 index 0000000..d17e39e --- /dev/null +++ b/plans/observability-proxy-mgmt/phase-7-eventlog-ui.md @@ -0,0 +1,54 @@ +# Phase 7: Event Log UI + +**Status:** ⬜ Not Started +**Parent plan:** [PLAN.md](./PLAN.md) +**Domain:** frontend + +## Objective +Build a persistent, searchable event log viewer with real-time streaming, filters, and resource linking.
+ +## Tasks + +- [ ] Task 1: Create route `/events` with `+page.svelte` and `+page.ts` data loader +- [ ] Task 2: Create EventLogEntry component — timestamp, severity badge (info=blue, warn=yellow, error=red), source icon (container/proxy/deploy/system), message text, expandable metadata section +- [ ] Task 3: Create EventLogFilter component — filters: severity multi-select, source multi-select, date range picker (start/end), free-text search +- [ ] Task 4: Implement pagination — "Load more" button at bottom (offset/limit pattern matching API) +- [ ] Task 5: SSE integration — subscribe to event_log events, prepend new entries at top with subtle highlight animation +- [ ] Task 6: Quick actions — clickable links to related resources (e.g., click container name → go to project/stage, click proxy domain → go to proxy viewer) +- [ ] Task 7: Stats header — show counts by severity (from GET /api/events/log/stats), with colored badges +- [ ] Task 8: Add navigation link in sidebar +- [ ] Task 9: Add i18n keys for event log page + +## Files to Modify/Create +- `web/src/routes/events/+page.svelte` — NEW: Event log page +- `web/src/routes/events/+page.ts` — NEW: Data loader +- `web/src/lib/components/EventLogEntry.svelte` — NEW: Event entry display +- `web/src/lib/components/EventLogFilter.svelte` — NEW: Filter controls +- `web/src/routes/+layout.svelte` — Add events nav link +- `web/src/lib/sse.ts` — Add event_log SSE subscription helper (if needed) + +## Acceptance Criteria +- Event log shows all persistent events with severity and source +- Filters work: severity, source, date range, text search +- New events stream in real-time via SSE without page refresh +- Pagination loads older events on demand +- Quick actions link to related resources +- Stats header shows severity distribution +- Responsive layout + +## Notes +- Follow existing SSE patterns from deploy logs viewer +- Date range filter: consider "last hour", "last 24h", "last 7 days"
presets + custom range +- Metadata section is JSON — render as formatted key-value pairs, not raw JSON +- Resource linking: parse source and metadata to construct navigation URLs +- Consider: auto-scroll to top when new event arrives (if user is at top), otherwise show "N new events" badge + +## Review Checklist +- [ ] All tasks completed +- [ ] Code follows project conventions +- [ ] No unintended side effects +- [ ] Build passes +- [ ] Tests pass (new + existing) + +## Handoff to Next Phase + diff --git a/plans/observability-proxy-mgmt/phase-8-stats-notifications.md b/plans/observability-proxy-mgmt/phase-8-stats-notifications.md new file mode 100644 index 0000000..857236b --- /dev/null +++ b/plans/observability-proxy-mgmt/phase-8-stats-notifications.md @@ -0,0 +1,67 @@ +# Phase 8: Container Stats & Notifications + +**Status:** ⬜ Not Started +**Parent plan:** [PLAN.md](./PLAN.md) +**Domain:** fullstack + +## Objective +Add container resource monitoring (CPU/memory), notification triggers for operational events, and a system health dashboard summary.
+ +## Tasks + +- [ ] Task 1: Create `internal/docker/stats.go` — wrapper around Docker Stats API to get CPU %, memory usage/limit for a container +- [ ] Task 2: Add API endpoint: `GET /api/projects/{id}/stages/{stage}/instances/{iid}/stats` — returns current CPU/memory for an instance +- [ ] Task 3: Create SSE event type `container_stats` — periodically broadcast stats for running containers (every 30s) +- [ ] Task 4: Extend notification stub (`internal/notify/`) — implement webhook sender for events: + - Stale container detected + - Proxy health failure + - Deploy failure/rollback + - Format: JSON payload with event type, details, timestamp +- [ ] Task 5: Add notification settings UI — enable/disable per event type in settings page +- [ ] Task 6: Update instance cards in frontend — show CPU % bar and memory usage badge +- [ ] Task 7: Create ContainerStats component — mini CPU/memory visualization (progress bars) +- [ ] Task 8: Dashboard system health summary card — total containers (running/stopped), healthy/unhealthy proxies, recent error count (last 24h) +- [ ] Task 9: Wire notification sender to event bus — subscribe to relevant event types, fire notifications +- [ ] Task 10: Add event log pruning cron job — delete events older than 30 days (configurable) +- [ ] Task 11: Add i18n keys for stats and notifications + +## Files to Modify/Create +- `internal/docker/stats.go` — NEW: Docker Stats API wrapper +- `internal/api/stats.go` — NEW: Stats HTTP handler +- `internal/api/router.go` — Mount stats endpoint +- `internal/notify/sender.go` — Implement webhook notification sender +- `internal/notify/types.go` — NEW: Notification event types and payloads +- `cmd/server/main.go` — Wire notification subscriber and event pruning cron +- `web/src/lib/types.ts` — Add ContainerStats, NotificationSettings types +- `web/src/lib/api.ts` — Add fetchContainerStats function +- `web/src/lib/components/ContainerStats.svelte` — NEW: CPU/memory
display +- `web/src/lib/components/SystemHealthCard.svelte` — NEW: Dashboard summary +- `web/src/routes/+page.svelte` — Add system health card to dashboard +- `web/src/routes/settings/+page.svelte` — Add notification settings section +- `web/src/lib/sse.ts` — Add container_stats SSE handler + +## Acceptance Criteria +- Container stats (CPU/memory) visible on instance cards +- Stats update in real-time via SSE +- Webhook notifications fire for configured event types +- Dashboard shows system health summary +- Event log auto-prunes old entries +- Settings page allows configuring notification preferences +- Build passes, existing tests pass + +## Notes +- Docker Stats API returns a stream — read one snapshot and close, don't hold the connection +- CPU calculation: (container CPU delta / system CPU delta) * 100 — needs two reads +- Memory: usage_bytes / limit_bytes * 100 for percentage +- Notification webhook format should be compatible with common receivers (Slack webhook, Discord webhook, generic HTTP) +- System health card: consider caching aggregated stats to avoid N+1 queries on dashboard load + +## Review Checklist +- [ ] All tasks completed +- [ ] Code follows project conventions +- [ ] No unintended side effects +- [ ] Build passes +- [ ] Tests pass (new + existing) + +## Handoff to Next Phase + diff --git a/web/src/lib/api.ts b/web/src/lib/api.ts index 9bbc5b1..30d9d68 100644 --- a/web/src/lib/api.ts +++ b/web/src/lib/api.ts @@ -2,6 +2,8 @@ import type { ApiEnvelope, Deploy, DeployLog, + EventLogEntry, + EventLogStats, InspectResult, Instance, NpmCertificate, @@ -338,4 +340,29 @@ export function deleteVolume( return del<{ deleted: string }>(`/api/projects/${projectId}/volumes/${volId}`); } +// ── Event Log ─────────────────────────────────────────────────────── + +export function fetchEventLog(params?: { + severity?: string; + source?: string; + since?: string; + until?: string; + limit?: number; + offset?: number; +}): Promise { + const query =
new URLSearchParams(); + if (params?.severity) query.set('severity', params.severity); + if (params?.source) query.set('source', params.source); + if (params?.since) query.set('since', params.since); + if (params?.until) query.set('until', params.until); + if (params?.limit) query.set('limit', String(params.limit)); + if (params?.offset) query.set('offset', String(params.offset)); + const qs = query.toString(); + return get(`/api/events/log${qs ? `?${qs}` : ''}`); +} + +export function fetchEventLogStats(): Promise { + return get('/api/events/log/stats'); +} + export { ApiError }; diff --git a/web/src/lib/types.ts b/web/src/lib/types.ts index c7dedc8..7c78603 100644 --- a/web/src/lib/types.ts +++ b/web/src/lib/types.ts @@ -106,6 +106,7 @@ export interface Settings { polling_interval: string; base_volume_path: string; ssl_certificate_id: number; + stale_threshold_days: number; updated_at: string; } @@ -170,3 +171,35 @@ export interface Volume { created_at: string; updated_at: string; } + +/** A persistent event log entry. */ +export interface EventLogEntry { + id: number; + source: string; + severity: 'info' | 'warn' | 'error'; + message: string; + metadata: string; + created_at: string; +} + +/** Severity counts for the event log. */ +export interface EventLogStats { + info: number; + warn: number; + error: number; + total: number; +} + +/** A standalone reverse proxy not tied to a project. */ +export interface StandaloneProxy { + id: string; + domain: string; + destination_url: string; + destination_port: number; + ssl_certificate_id: number; + npm_proxy_id: number; + health_status: 'unknown' | 'healthy' | 'unhealthy'; + health_checked_at: string; + created_at: string; + updated_at: string; +}