feat(observability): phase 1 - schema, models & event log backend
Add database foundation for observability features: - event_log table with severity/source filtering and pagination - standalone_proxies table for user-created reverse proxies - stale_threshold_days setting (default 7 days) - Auto-persist warn/error events from event bus to database - SSE broadcast of persistent events for real-time UI updates - Frontend types and API functions for downstream UI phases
This commit is contained in:
@@ -93,6 +93,21 @@ func main() {
|
||||
notifier := notify.New()
|
||||
eventBus := events.New()
|
||||
|
||||
// Auto-persist warn/error events from the event bus to the database.
|
||||
stopLogger := eventBus.RegisterPersistentLogger(func(source, severity, message, metadata string) (int64, string, error) {
|
||||
evt, err := db.InsertEvent(store.EventLog{
|
||||
Source: source,
|
||||
Severity: severity,
|
||||
Message: message,
|
||||
Metadata: metadata,
|
||||
})
|
||||
if err != nil {
|
||||
return 0, "", err
|
||||
}
|
||||
return evt.ID, evt.CreatedAt, nil
|
||||
})
|
||||
defer stopLogger()
|
||||
|
||||
dep := deployer.New(dockerClient, npmClient, db, healthChecker, notifier, eventBus, encKey)
|
||||
|
||||
// Initialize webhook handler.
|
||||
|
||||
@@ -0,0 +1,48 @@
|
||||
package api
|
||||
|
||||
import (
|
||||
"log/slog"
|
||||
"net/http"
|
||||
"strconv"
|
||||
|
||||
"github.com/alexei/docker-watcher/internal/store"
|
||||
)
|
||||
|
||||
// listEventLog handles GET /api/events/log.
|
||||
// Supports query parameters: severity, source, since, until, limit, offset.
|
||||
func (s *Server) listEventLog(w http.ResponseWriter, r *http.Request) {
|
||||
q := r.URL.Query()
|
||||
|
||||
limit, _ := strconv.Atoi(q.Get("limit"))
|
||||
offset, _ := strconv.Atoi(q.Get("offset"))
|
||||
|
||||
filter := store.EventLogFilter{
|
||||
Severity: q.Get("severity"),
|
||||
Source: q.Get("source"),
|
||||
Since: q.Get("since"),
|
||||
Until: q.Get("until"),
|
||||
Limit: limit,
|
||||
Offset: offset,
|
||||
}
|
||||
|
||||
events, err := s.store.ListEvents(filter)
|
||||
if err != nil {
|
||||
slog.Error("failed to list events", "error", err)
|
||||
respondError(w, http.StatusInternalServerError, "failed to list events")
|
||||
return
|
||||
}
|
||||
|
||||
respondJSON(w, http.StatusOK, events)
|
||||
}
|
||||
|
||||
// getEventLogStats handles GET /api/events/log/stats.
|
||||
func (s *Server) getEventLogStats(w http.ResponseWriter, r *http.Request) {
|
||||
stats, err := s.store.GetEventStats()
|
||||
if err != nil {
|
||||
slog.Error("failed to get event stats", "error", err)
|
||||
respondError(w, http.StatusInternalServerError, "failed to get event stats")
|
||||
return
|
||||
}
|
||||
|
||||
respondJSON(w, http.StatusOK, stats)
|
||||
}
|
||||
@@ -125,6 +125,8 @@ func (s *Server) Router() chi.Router {
|
||||
r.Get("/deploys", s.listDeploys)
|
||||
r.Get("/deploys/{id}/logs", s.streamDeployLogs)
|
||||
r.Get("/events", s.streamEvents)
|
||||
r.Get("/events/log", s.listEventLog)
|
||||
r.Get("/events/log/stats", s.getEventLogStats)
|
||||
r.Get("/registries", s.listRegistries)
|
||||
r.Route("/registries/{id}", func(r chi.Router) {
|
||||
r.Get("/tags/*", s.listRegistryTags)
|
||||
|
||||
+21
-12
@@ -24,7 +24,8 @@ type settingsRequest struct {
|
||||
NpmEmail string `json:"npm_email"`
|
||||
NpmPassword string `json:"npm_password"`
|
||||
PollingInterval string `json:"polling_interval"`
|
||||
SSLCertificateID *int `json:"ssl_certificate_id,omitempty"`
|
||||
SSLCertificateID *int `json:"ssl_certificate_id,omitempty"`
|
||||
StaleThresholdDays *int `json:"stale_threshold_days,omitempty"`
|
||||
}
|
||||
|
||||
// getSettings handles GET /api/settings.
|
||||
@@ -37,17 +38,18 @@ func (s *Server) getSettings(w http.ResponseWriter, r *http.Request) {
|
||||
|
||||
// Return settings without sensitive fields.
|
||||
respondJSON(w, http.StatusOK, map[string]any{
|
||||
"domain": settings.Domain,
|
||||
"server_ip": settings.ServerIP,
|
||||
"network": settings.Network,
|
||||
"subdomain_pattern": settings.SubdomainPattern,
|
||||
"notification_url": settings.NotificationURL,
|
||||
"npm_url": settings.NpmURL,
|
||||
"npm_email": settings.NpmEmail,
|
||||
"has_npm_password": settings.NpmPassword != "",
|
||||
"polling_interval": settings.PollingInterval,
|
||||
"ssl_certificate_id": settings.SSLCertificateID,
|
||||
"updated_at": settings.UpdatedAt,
|
||||
"domain": settings.Domain,
|
||||
"server_ip": settings.ServerIP,
|
||||
"network": settings.Network,
|
||||
"subdomain_pattern": settings.SubdomainPattern,
|
||||
"notification_url": settings.NotificationURL,
|
||||
"npm_url": settings.NpmURL,
|
||||
"npm_email": settings.NpmEmail,
|
||||
"has_npm_password": settings.NpmPassword != "",
|
||||
"polling_interval": settings.PollingInterval,
|
||||
"ssl_certificate_id": settings.SSLCertificateID,
|
||||
"stale_threshold_days": settings.StaleThresholdDays,
|
||||
"updated_at": settings.UpdatedAt,
|
||||
})
|
||||
}
|
||||
|
||||
@@ -101,6 +103,13 @@ func (s *Server) updateSettings(w http.ResponseWriter, r *http.Request) {
|
||||
updated.SSLCertificateID = *req.SSLCertificateID
|
||||
sslChanged = true
|
||||
}
|
||||
if req.StaleThresholdDays != nil {
|
||||
if *req.StaleThresholdDays < 1 {
|
||||
respondError(w, http.StatusBadRequest, "stale_threshold_days must be at least 1")
|
||||
return
|
||||
}
|
||||
updated.StaleThresholdDays = *req.StaleThresholdDays
|
||||
}
|
||||
|
||||
if err := s.store.UpdateSettings(updated); err != nil {
|
||||
respondError(w, http.StatusInternalServerError, "failed to update settings: "+err.Error())
|
||||
|
||||
+2
-2
@@ -150,9 +150,9 @@ func (s *Server) streamEvents(w http.ResponseWriter, r *http.Request) {
|
||||
w.WriteHeader(http.StatusOK)
|
||||
flusher.Flush()
|
||||
|
||||
// Subscribe to instance status and deploy status events.
|
||||
// Subscribe to instance status, deploy status, and persistent event log events.
|
||||
sub := s.eventBus.Subscribe(func(evt events.Event) bool {
|
||||
return evt.Type == events.EventInstanceStatus || evt.Type == events.EventDeployStatus
|
||||
return evt.Type == events.EventInstanceStatus || evt.Type == events.EventDeployStatus || evt.Type == events.EventLog
|
||||
})
|
||||
defer s.eventBus.Unsubscribe(sub)
|
||||
|
||||
|
||||
@@ -2,6 +2,7 @@ package events
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"log/slog"
|
||||
"sync"
|
||||
)
|
||||
|
||||
@@ -17,6 +18,9 @@ const (
|
||||
|
||||
// EventDeployStatus is emitted when a deploy status changes.
|
||||
EventDeployStatus EventType = "deploy_status"
|
||||
|
||||
// EventLog is emitted when a persistent event is logged.
|
||||
EventLog EventType = "event_log"
|
||||
)
|
||||
|
||||
// Event is a single event published on the bus.
|
||||
@@ -50,6 +54,72 @@ type DeployStatusPayload struct {
|
||||
Error string `json:"error,omitempty"`
|
||||
}
|
||||
|
||||
// EventLogPayload is the payload carried by EventLog events, i.e. one entry
// of the persistent event log.
type EventLogPayload struct {
	// ID is the identifier of the persisted entry.
	ID int64 `json:"id"`
	// Source names the origin of the entry.
	Source string `json:"source"`
	// Severity is the level of the entry.
	Severity string `json:"severity"`
	// Message is the log text.
	Message string `json:"message"`
	// Metadata holds extra data as a JSON string.
	Metadata string `json:"metadata"`
	// CreatedAt is the timestamp of the persisted entry.
	CreatedAt string `json:"created_at"`
}
|
||||
|
||||
// PersistFunc is the callback used to store one event log entry.
//
// The arguments are the entry's source, severity, message and metadata (a
// JSON string). On success it reports the ID and created_at timestamp of the
// stored entry.
type PersistFunc func(source, severity, message, metadata string) (id int64, createdAt string, err error)
|
||||
|
||||
// RegisterPersistentLogger subscribes to the bus and auto-persists warn/error
|
||||
// events by calling the provided persist function. It also re-publishes the
|
||||
// persisted event as an EventLog so SSE clients receive it in real-time.
|
||||
// Call the returned function to unsubscribe.
|
||||
func (b *Bus) RegisterPersistentLogger(persist PersistFunc) func() {
|
||||
sub := b.Subscribe(func(evt Event) bool {
|
||||
// Only persist deploy log events with warn/error level.
|
||||
if evt.Type != EventDeployLog {
|
||||
return false
|
||||
}
|
||||
p, ok := evt.Payload.(DeployLogPayload)
|
||||
if !ok {
|
||||
return false
|
||||
}
|
||||
return p.Level == "warn" || p.Level == "error"
|
||||
})
|
||||
|
||||
go func() {
|
||||
for evt := range sub {
|
||||
p, ok := evt.Payload.(DeployLogPayload)
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
metaBytes, _ := json.Marshal(map[string]string{"deploy_id": p.DeployID})
|
||||
metadata := string(metaBytes)
|
||||
id, createdAt, err := persist("deploy", p.Level, p.Message, metadata)
|
||||
if err != nil {
|
||||
slog.Error("failed to persist event log", "source", "deploy", "level", p.Level, "error", err)
|
||||
continue
|
||||
}
|
||||
|
||||
// Re-publish as EventLog for SSE clients.
|
||||
b.Publish(Event{
|
||||
Type: EventLog,
|
||||
Payload: EventLogPayload{
|
||||
ID: id,
|
||||
Source: "deploy",
|
||||
Severity: p.Level,
|
||||
Message: p.Message,
|
||||
Metadata: metadata,
|
||||
CreatedAt: createdAt,
|
||||
},
|
||||
})
|
||||
}
|
||||
}()
|
||||
|
||||
return func() {
|
||||
b.Unsubscribe(sub)
|
||||
}
|
||||
}
|
||||
|
||||
// Subscriber is a channel that receives events.
|
||||
type Subscriber chan Event
|
||||
|
||||
|
||||
@@ -0,0 +1,148 @@
|
||||
package store
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// EventLogFilter describes the optional criteria used when listing event log
// entries. Empty string fields are not applied as filters.
type EventLogFilter struct {
	// Severity restricts results to one severity (info, warn, error).
	Severity string
	// Source restricts results to one source.
	Source string
	// Since keeps only events created at or after this timestamp.
	Since string
	// Until keeps only events created at or before this timestamp.
	Until string
	// Limit is the maximum number of results (default 50).
	Limit int
	// Offset is the pagination offset.
	Offset int
}
|
||||
|
||||
// EventLogStats reports how many event log entries exist per severity,
// together with the overall total.
type EventLogStats struct {
	// Info is the number of "info" entries.
	Info int `json:"info"`
	// Warn is the number of "warn" entries.
	Warn int `json:"warn"`
	// Error is the number of "error" entries.
	Error int `json:"error"`
	// Total is the number of entries across all severities.
	Total int `json:"total"`
}
|
||||
|
||||
// InsertEvent inserts a new event log entry.
|
||||
func (s *Store) InsertEvent(evt EventLog) (EventLog, error) {
|
||||
evt.CreatedAt = Now()
|
||||
if evt.Metadata == "" {
|
||||
evt.Metadata = "{}"
|
||||
}
|
||||
|
||||
result, err := s.db.Exec(
|
||||
`INSERT INTO event_log (source, severity, message, metadata, created_at)
|
||||
VALUES (?, ?, ?, ?, ?)`,
|
||||
evt.Source, evt.Severity, evt.Message, evt.Metadata, evt.CreatedAt,
|
||||
)
|
||||
if err != nil {
|
||||
return EventLog{}, fmt.Errorf("insert event: %w", err)
|
||||
}
|
||||
|
||||
id, err := result.LastInsertId()
|
||||
if err != nil {
|
||||
return EventLog{}, fmt.Errorf("get event id: %w", err)
|
||||
}
|
||||
evt.ID = id
|
||||
|
||||
return evt, nil
|
||||
}
|
||||
|
||||
// ListEvents returns event log entries matching the given filter.
|
||||
func (s *Store) ListEvents(filter EventLogFilter) ([]EventLog, error) {
|
||||
var conditions []string
|
||||
var args []any
|
||||
|
||||
if filter.Severity != "" {
|
||||
conditions = append(conditions, "severity = ?")
|
||||
args = append(args, filter.Severity)
|
||||
}
|
||||
if filter.Source != "" {
|
||||
conditions = append(conditions, "source = ?")
|
||||
args = append(args, filter.Source)
|
||||
}
|
||||
if filter.Since != "" {
|
||||
conditions = append(conditions, "created_at >= ?")
|
||||
args = append(args, filter.Since)
|
||||
}
|
||||
if filter.Until != "" {
|
||||
conditions = append(conditions, "created_at <= ?")
|
||||
args = append(args, filter.Until)
|
||||
}
|
||||
|
||||
query := "SELECT id, source, severity, message, metadata, created_at FROM event_log"
|
||||
if len(conditions) > 0 {
|
||||
query += " WHERE " + strings.Join(conditions, " AND ")
|
||||
}
|
||||
query += " ORDER BY created_at DESC"
|
||||
|
||||
limit := filter.Limit
|
||||
if limit <= 0 {
|
||||
limit = 50
|
||||
}
|
||||
if limit > 500 {
|
||||
limit = 500
|
||||
}
|
||||
query += fmt.Sprintf(" LIMIT %d OFFSET %d", limit, filter.Offset)
|
||||
|
||||
rows, err := s.db.Query(query, args...)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("query events: %w", err)
|
||||
}
|
||||
defer rows.Close()
|
||||
|
||||
events := []EventLog{}
|
||||
for rows.Next() {
|
||||
var evt EventLog
|
||||
if err := rows.Scan(&evt.ID, &evt.Source, &evt.Severity, &evt.Message, &evt.Metadata, &evt.CreatedAt); err != nil {
|
||||
return nil, fmt.Errorf("scan event: %w", err)
|
||||
}
|
||||
events = append(events, evt)
|
||||
}
|
||||
return events, rows.Err()
|
||||
}
|
||||
|
||||
// GetEventStats returns counts of event log entries grouped by severity.
|
||||
func (s *Store) GetEventStats() (EventLogStats, error) {
|
||||
rows, err := s.db.Query(
|
||||
`SELECT severity, COUNT(*) FROM event_log GROUP BY severity`,
|
||||
)
|
||||
if err != nil {
|
||||
return EventLogStats{}, fmt.Errorf("query event stats: %w", err)
|
||||
}
|
||||
defer rows.Close()
|
||||
|
||||
var stats EventLogStats
|
||||
for rows.Next() {
|
||||
var severity string
|
||||
var count int
|
||||
if err := rows.Scan(&severity, &count); err != nil {
|
||||
return EventLogStats{}, fmt.Errorf("scan event stats: %w", err)
|
||||
}
|
||||
switch severity {
|
||||
case "info":
|
||||
stats.Info = count
|
||||
case "warn":
|
||||
stats.Warn = count
|
||||
case "error":
|
||||
stats.Error = count
|
||||
}
|
||||
stats.Total += count
|
||||
}
|
||||
return stats, rows.Err()
|
||||
}
|
||||
|
||||
// PruneEvents deletes event log entries older than the given number of days.
|
||||
func (s *Store) PruneEvents(olderThanDays int) (int64, error) {
|
||||
if olderThanDays < 1 {
|
||||
return 0, fmt.Errorf("prune events: olderThanDays must be >= 1, got %d", olderThanDays)
|
||||
}
|
||||
result, err := s.db.Exec(
|
||||
`DELETE FROM event_log WHERE created_at < datetime('now', ?)`,
|
||||
fmt.Sprintf("-%d days", olderThanDays),
|
||||
)
|
||||
if err != nil {
|
||||
return 0, fmt.Errorf("prune events: %w", err)
|
||||
}
|
||||
return result.RowsAffected()
|
||||
}
|
||||
@@ -55,8 +55,9 @@ type Settings struct {
|
||||
WebhookSecret string `json:"webhook_secret"`
|
||||
PollingInterval string `json:"polling_interval"`
|
||||
BaseVolumePath string `json:"base_volume_path"`
|
||||
SSLCertificateID int `json:"ssl_certificate_id"`
|
||||
UpdatedAt string `json:"updated_at"`
|
||||
SSLCertificateID int `json:"ssl_certificate_id"`
|
||||
StaleThresholdDays int `json:"stale_threshold_days"`
|
||||
UpdatedAt string `json:"updated_at"`
|
||||
}
|
||||
|
||||
// Instance represents a running (or stopped) container for a project stage.
|
||||
@@ -117,3 +118,27 @@ type Volume struct {
|
||||
CreatedAt string `json:"created_at"`
|
||||
UpdatedAt string `json:"updated_at"`
|
||||
}
|
||||
|
||||
// EventLog is one entry of the persistent event log.
type EventLog struct {
	// ID is the entry's identifier.
	ID int64 `json:"id"`
	// Source names the origin of the entry.
	Source string `json:"source"`
	// Severity is one of info, warn, error.
	Severity string `json:"severity"`
	// Message is the log text.
	Message string `json:"message"`
	// Metadata is JSON-encoded structured data.
	Metadata string `json:"metadata"`
	// CreatedAt is the entry's creation timestamp.
	CreatedAt string `json:"created_at"`
}
|
||||
|
||||
// StandaloneProxy is a reverse proxy entry that exists on its own and is not
// tied to a project.
type StandaloneProxy struct {
	// ID is the proxy's identifier.
	ID string `json:"id"`
	// Domain is the domain of the proxy.
	Domain string `json:"domain"`
	// DestinationURL is the destination URL.
	DestinationURL string `json:"destination_url"`
	// DestinationPort is the destination port.
	DestinationPort int `json:"destination_port"`
	// SSLCertificateID is the ID of the SSL certificate.
	SSLCertificateID int `json:"ssl_certificate_id"`
	// NpmProxyID is the NPM proxy ID.
	NpmProxyID int `json:"npm_proxy_id"`
	// HealthStatus is one of unknown, healthy, unhealthy.
	HealthStatus string `json:"health_status"`
	// HealthCheckedAt is the timestamp of the health check.
	HealthCheckedAt string `json:"health_checked_at"`
	// CreatedAt is the creation timestamp.
	CreatedAt string `json:"created_at"`
	// UpdatedAt is the last-update timestamp.
	UpdatedAt string `json:"updated_at"`
}
|
||||
|
||||
@@ -9,10 +9,10 @@ func (s *Store) GetSettings() (Settings, error) {
|
||||
var st Settings
|
||||
err := s.db.QueryRow(
|
||||
`SELECT domain, server_ip, network, subdomain_pattern, notification_url,
|
||||
npm_url, npm_email, npm_password, webhook_secret, polling_interval, base_volume_path, ssl_certificate_id, updated_at
|
||||
npm_url, npm_email, npm_password, webhook_secret, polling_interval, base_volume_path, ssl_certificate_id, stale_threshold_days, updated_at
|
||||
FROM settings WHERE id = 1`,
|
||||
).Scan(&st.Domain, &st.ServerIP, &st.Network, &st.SubdomainPattern, &st.NotificationURL,
|
||||
&st.NpmURL, &st.NpmEmail, &st.NpmPassword, &st.WebhookSecret, &st.PollingInterval, &st.BaseVolumePath, &st.SSLCertificateID, &st.UpdatedAt)
|
||||
&st.NpmURL, &st.NpmEmail, &st.NpmPassword, &st.WebhookSecret, &st.PollingInterval, &st.BaseVolumePath, &st.SSLCertificateID, &st.StaleThresholdDays, &st.UpdatedAt)
|
||||
if err != nil {
|
||||
return Settings{}, fmt.Errorf("query settings: %w", err)
|
||||
}
|
||||
@@ -25,10 +25,10 @@ func (s *Store) UpdateSettings(st Settings) error {
|
||||
_, err := s.db.Exec(
|
||||
`UPDATE settings SET
|
||||
domain=?, server_ip=?, network=?, subdomain_pattern=?, notification_url=?,
|
||||
npm_url=?, npm_email=?, npm_password=?, webhook_secret=?, polling_interval=?, base_volume_path=?, ssl_certificate_id=?, updated_at=?
|
||||
npm_url=?, npm_email=?, npm_password=?, webhook_secret=?, polling_interval=?, base_volume_path=?, ssl_certificate_id=?, stale_threshold_days=?, updated_at=?
|
||||
WHERE id = 1`,
|
||||
st.Domain, st.ServerIP, st.Network, st.SubdomainPattern, st.NotificationURL,
|
||||
st.NpmURL, st.NpmEmail, st.NpmPassword, st.WebhookSecret, st.PollingInterval, st.BaseVolumePath, st.SSLCertificateID, st.UpdatedAt,
|
||||
st.NpmURL, st.NpmEmail, st.NpmPassword, st.WebhookSecret, st.PollingInterval, st.BaseVolumePath, st.SSLCertificateID, st.StaleThresholdDays, st.UpdatedAt,
|
||||
)
|
||||
if err != nil {
|
||||
return fmt.Errorf("update settings: %w", err)
|
||||
|
||||
@@ -0,0 +1,120 @@
|
||||
package store
|
||||
|
||||
import (
|
||||
"database/sql"
|
||||
"errors"
|
||||
"fmt"
|
||||
|
||||
"github.com/google/uuid"
|
||||
)
|
||||
|
||||
// CreateStandaloneProxy inserts a new standalone proxy record.
|
||||
func (s *Store) CreateStandaloneProxy(p StandaloneProxy) (StandaloneProxy, error) {
|
||||
p.ID = uuid.New().String()
|
||||
p.CreatedAt = Now()
|
||||
p.UpdatedAt = p.CreatedAt
|
||||
|
||||
if p.HealthStatus == "" {
|
||||
p.HealthStatus = "unknown"
|
||||
}
|
||||
|
||||
_, err := s.db.Exec(
|
||||
`INSERT INTO standalone_proxies (id, domain, destination_url, destination_port, ssl_certificate_id, npm_proxy_id, health_status, health_checked_at, created_at, updated_at)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`,
|
||||
p.ID, p.Domain, p.DestinationURL, p.DestinationPort, p.SSLCertificateID,
|
||||
p.NpmProxyID, p.HealthStatus, p.HealthCheckedAt, p.CreatedAt, p.UpdatedAt,
|
||||
)
|
||||
if err != nil {
|
||||
return StandaloneProxy{}, fmt.Errorf("insert standalone proxy: %w", err)
|
||||
}
|
||||
return p, nil
|
||||
}
|
||||
|
||||
// GetStandaloneProxy returns a standalone proxy by ID.
|
||||
func (s *Store) GetStandaloneProxy(id string) (StandaloneProxy, error) {
|
||||
var p StandaloneProxy
|
||||
err := s.db.QueryRow(
|
||||
`SELECT id, domain, destination_url, destination_port, ssl_certificate_id, npm_proxy_id, health_status, health_checked_at, created_at, updated_at
|
||||
FROM standalone_proxies WHERE id = ?`, id,
|
||||
).Scan(&p.ID, &p.Domain, &p.DestinationURL, &p.DestinationPort, &p.SSLCertificateID,
|
||||
&p.NpmProxyID, &p.HealthStatus, &p.HealthCheckedAt, &p.CreatedAt, &p.UpdatedAt)
|
||||
if errors.Is(err, sql.ErrNoRows) {
|
||||
return StandaloneProxy{}, fmt.Errorf("standalone proxy %s: %w", id, ErrNotFound)
|
||||
}
|
||||
if err != nil {
|
||||
return StandaloneProxy{}, fmt.Errorf("query standalone proxy: %w", err)
|
||||
}
|
||||
return p, nil
|
||||
}
|
||||
|
||||
// ListStandaloneProxies returns all standalone proxy records ordered by creation time.
|
||||
func (s *Store) ListStandaloneProxies() ([]StandaloneProxy, error) {
|
||||
rows, err := s.db.Query(
|
||||
`SELECT id, domain, destination_url, destination_port, ssl_certificate_id, npm_proxy_id, health_status, health_checked_at, created_at, updated_at
|
||||
FROM standalone_proxies ORDER BY created_at DESC`,
|
||||
)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("query standalone proxies: %w", err)
|
||||
}
|
||||
defer rows.Close()
|
||||
|
||||
proxies := []StandaloneProxy{}
|
||||
for rows.Next() {
|
||||
var p StandaloneProxy
|
||||
if err := rows.Scan(&p.ID, &p.Domain, &p.DestinationURL, &p.DestinationPort, &p.SSLCertificateID,
|
||||
&p.NpmProxyID, &p.HealthStatus, &p.HealthCheckedAt, &p.CreatedAt, &p.UpdatedAt); err != nil {
|
||||
return nil, fmt.Errorf("scan standalone proxy: %w", err)
|
||||
}
|
||||
proxies = append(proxies, p)
|
||||
}
|
||||
return proxies, rows.Err()
|
||||
}
|
||||
|
||||
// UpdateStandaloneProxy updates an existing standalone proxy's mutable fields.
|
||||
func (s *Store) UpdateStandaloneProxy(p StandaloneProxy) error {
|
||||
p.UpdatedAt = Now()
|
||||
result, err := s.db.Exec(
|
||||
`UPDATE standalone_proxies SET domain=?, destination_url=?, destination_port=?, ssl_certificate_id=?, npm_proxy_id=?, health_status=?, health_checked_at=?, updated_at=?
|
||||
WHERE id=?`,
|
||||
p.Domain, p.DestinationURL, p.DestinationPort, p.SSLCertificateID,
|
||||
p.NpmProxyID, p.HealthStatus, p.HealthCheckedAt, p.UpdatedAt, p.ID,
|
||||
)
|
||||
if err != nil {
|
||||
return fmt.Errorf("update standalone proxy: %w", err)
|
||||
}
|
||||
n, _ := result.RowsAffected()
|
||||
if n == 0 {
|
||||
return fmt.Errorf("standalone proxy %s: %w", p.ID, ErrNotFound)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// DeleteStandaloneProxy removes a standalone proxy by ID.
|
||||
func (s *Store) DeleteStandaloneProxy(id string) error {
|
||||
result, err := s.db.Exec(`DELETE FROM standalone_proxies WHERE id = ?`, id)
|
||||
if err != nil {
|
||||
return fmt.Errorf("delete standalone proxy: %w", err)
|
||||
}
|
||||
n, _ := result.RowsAffected()
|
||||
if n == 0 {
|
||||
return fmt.Errorf("standalone proxy %s: %w", id, ErrNotFound)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// UpdateProxyHealth updates the health status and check timestamp for a standalone proxy.
|
||||
func (s *Store) UpdateProxyHealth(id string, status string) error {
|
||||
ts := Now()
|
||||
result, err := s.db.Exec(
|
||||
`UPDATE standalone_proxies SET health_status=?, health_checked_at=?, updated_at=? WHERE id=?`,
|
||||
status, ts, ts, id,
|
||||
)
|
||||
if err != nil {
|
||||
return fmt.Errorf("update proxy health: %w", err)
|
||||
}
|
||||
n, _ := result.RowsAffected()
|
||||
if n == 0 {
|
||||
return fmt.Errorf("standalone proxy %s: %w", id, ErrNotFound)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
@@ -81,6 +81,8 @@ func (s *Store) runMigrations() error {
|
||||
`ALTER TABLE stages ADD COLUMN enable_proxy INTEGER NOT NULL DEFAULT 1`,
|
||||
// Add ssl_certificate_id to settings (2026-03-29).
|
||||
`ALTER TABLE settings ADD COLUMN ssl_certificate_id INTEGER NOT NULL DEFAULT 0`,
|
||||
// Add stale_threshold_days to settings (2026-03-30).
|
||||
`ALTER TABLE settings ADD COLUMN stale_threshold_days INTEGER NOT NULL DEFAULT 7`,
|
||||
}
|
||||
|
||||
for _, m := range migrations {
|
||||
@@ -98,6 +100,9 @@ func (s *Store) runMigrations() error {
|
||||
`CREATE INDEX IF NOT EXISTS idx_stages_project_id ON stages(project_id)`,
|
||||
`CREATE INDEX IF NOT EXISTS idx_stage_env_stage_id ON stage_env(stage_id)`,
|
||||
`CREATE INDEX IF NOT EXISTS idx_volumes_project_id ON volumes(project_id)`,
|
||||
`CREATE INDEX IF NOT EXISTS idx_event_log_severity ON event_log(severity)`,
|
||||
`CREATE INDEX IF NOT EXISTS idx_event_log_source ON event_log(source)`,
|
||||
`CREATE INDEX IF NOT EXISTS idx_event_log_created_at ON event_log(created_at)`,
|
||||
}
|
||||
for _, idx := range indexes {
|
||||
if _, err := s.db.Exec(idx); err != nil {
|
||||
@@ -250,6 +255,28 @@ CREATE TABLE IF NOT EXISTS volumes (
|
||||
created_at TEXT NOT NULL DEFAULT (datetime('now')),
|
||||
updated_at TEXT NOT NULL DEFAULT (datetime('now'))
|
||||
);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS event_log (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
source TEXT NOT NULL DEFAULT '',
|
||||
severity TEXT NOT NULL DEFAULT 'info',
|
||||
message TEXT NOT NULL DEFAULT '',
|
||||
metadata TEXT NOT NULL DEFAULT '{}',
|
||||
created_at TEXT NOT NULL DEFAULT (datetime('now'))
|
||||
);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS standalone_proxies (
|
||||
id TEXT PRIMARY KEY,
|
||||
domain TEXT NOT NULL UNIQUE,
|
||||
destination_url TEXT NOT NULL DEFAULT '',
|
||||
destination_port INTEGER NOT NULL DEFAULT 0,
|
||||
ssl_certificate_id INTEGER NOT NULL DEFAULT 0,
|
||||
npm_proxy_id INTEGER NOT NULL DEFAULT 0,
|
||||
health_status TEXT NOT NULL DEFAULT 'unknown',
|
||||
health_checked_at TEXT NOT NULL DEFAULT '',
|
||||
created_at TEXT NOT NULL DEFAULT (datetime('now')),
|
||||
updated_at TEXT NOT NULL DEFAULT (datetime('now'))
|
||||
);
|
||||
`
|
||||
|
||||
// Now returns the current time formatted for SQLite storage.
|
||||
|
||||
@@ -0,0 +1,52 @@
|
||||
# Feature Context: Observability & Proxy Management
|
||||
|
||||
## Configuration
|
||||
- **Development mode:** Automated
|
||||
- **Execution mode:** Orchestrator
|
||||
- **Strategy:** Incremental
|
||||
- **Build (full):** `make build`
|
||||
- **Build (frontend):** `cd web && npm install && npm run build`
|
||||
- **Build (backend):** `go build -o docker-watcher ./cmd/server`
|
||||
- **Test:** `go test ./...`
|
||||
- **Lint (backend):** `go vet ./...`
|
||||
- **Lint (frontend):** `cd web && npm run check`
|
||||
- **Dev server:** `make dev` (port: 8080)
|
||||
|
||||
## Current State
|
||||
Feature branch just created. No implementation yet. Codebase is fully working on main.
|
||||
|
||||
## Temporary Workarounds
|
||||
(none yet)
|
||||
|
||||
## Cross-Phase Dependencies
|
||||
- Phases 2 & 3 depend on Phase 1 (schema, event_log table, store methods)
|
||||
- Phases 4, 5, 6, 7 depend on their respective backend phases (1-3) for API endpoints
|
||||
- Phase 8 depends on Phases 1-3 for backend infrastructure and event system
|
||||
|
||||
## Deferred Work
|
||||
(none yet)
|
||||
|
||||
## Failed Approaches
|
||||
(none yet)
|
||||
|
||||
## Review Findings Log
|
||||
(none yet)
|
||||
|
||||
## Phase Execution Log
|
||||
| Phase | Agent Used | Test Writer | Parallel | Notes |
|
||||
|-------|-----------|-------------|----------|-------|
|
||||
| (none yet) | | | | |
|
||||
|
||||
## Environment & Runtime Notes
|
||||
- Build is currently blocked on Go 1.25 transitive dep from Docker SDK — may need to use Go 1.24 toolchain
|
||||
- SQLite has MaxOpenConns=1, so all DB operations are serialized
|
||||
- Frontend is embedded into Go binary via embed.FS
|
||||
|
||||
## Implementation Notes
|
||||
- Event bus (`internal/events/bus.go`) uses buffered channels (64 cap), non-blocking publish
|
||||
- NPM client (`internal/npm/client.go`) handles JWT auth with auto-refresh
|
||||
- Store uses additive migrations — new `ALTER TABLE` statements are appended to runMigrations(), errors ignored for idempotency
|
||||
- New tables use `CREATE TABLE IF NOT EXISTS` in the schema constant
|
||||
- All API responses use envelope pattern: `{success: bool, data?: T, error?: string}`
|
||||
- Frontend types in `web/src/lib/types.ts` mirror Go models
|
||||
- API functions centralized in `web/src/lib/api.ts`
|
||||
@@ -0,0 +1,71 @@
|
||||
# Feature: Observability & Proxy Management
|
||||
|
||||
**Branch:** `feature/observability-proxy-mgmt`
|
||||
**Base branch:** `main`
|
||||
**Created:** 2026-03-30
|
||||
**Status:** 🟡 In Progress
|
||||
**Strategy:** Incremental
|
||||
**Mode:** Automated
|
||||
**Execution:** Orchestrator
|
||||
|
||||
## Summary
|
||||
|
||||
Extend Docker Watcher with four interconnected features: stale container detection,
|
||||
standalone proxy management with health monitoring, a unified proxy viewer, and a
|
||||
persistent event log — plus container stats and notification triggers.
|
||||
|
||||
## Build & Test Commands
|
||||
- **Build (frontend):** `cd web && npm install && npm run build`
|
||||
- **Build (backend):** `go build -o docker-watcher ./cmd/server`
|
||||
- **Build (full):** `make build`
|
||||
- **Test (backend):** `go test ./...`
|
||||
- **Lint (backend):** `go vet ./...`
|
||||
- **Lint (frontend):** `cd web && npm run check`
|
||||
|
||||
## Tech Stack Summary
|
||||
- **Backend:** Go 1.24, chi v5 router, SQLite (modernc.org/sqlite), Docker SDK (moby/moby/client)
|
||||
- **Frontend:** SvelteKit 2.15, Svelte 5, TypeScript 5.7, Tailwind CSS 4, Vite 6
|
||||
- **Real-time:** Server-Sent Events with auto-reconnect
|
||||
- **Auth:** JWT + optional OIDC
|
||||
- **Encryption:** AES-256-GCM for credentials
|
||||
|
||||
## Project Conventions
|
||||
- **Go:** gofmt, small interfaces, error wrapping with `fmt.Errorf("context: %w", err)`, constructor injection
|
||||
- **DB:** Single-row settings, additive migrations via `ALTER TABLE` (errors ignored for idempotency), `CREATE TABLE IF NOT EXISTS` for new tables
|
||||
- **API:** Envelope pattern `{success, data?, error?}`, chi route groups, admin middleware for writes
|
||||
- **Frontend:** Svelte 5 runes ($state, $derived, $effect), TypeScript interfaces mirroring Go models, centralized api.ts, custom components (no UI library)
|
||||
- **Files:** Feature-organized, small focused files
|
||||
- **State:** Immutable patterns, no mutation
|
||||
|
||||
## Phases
|
||||
|
||||
- [ ] Phase 1: Schema, Models & Event Log Backend [domain: backend] → [subplan](./phase-1-schema-eventlog.md)
|
||||
- [ ] Phase 2: Stale Container Detection [domain: backend] → [subplan](./phase-2-stale-detection.md)
|
||||
- [ ] Phase 3: Direct Proxy Creation with Validation [domain: backend] → [subplan](./phase-3-proxy-creation.md)
|
||||
- [ ] Phase 4: Unified Proxy Viewer UI [domain: frontend] → [subplan](./phase-4-proxy-viewer.md)
|
||||
- [ ] Phase 5: Stale Containers UI [domain: frontend] → [subplan](./phase-5-stale-ui.md)
|
||||
- [ ] Phase 6: Direct Proxy Creation UI [domain: frontend] → [subplan](./phase-6-proxy-creation-ui.md)
|
||||
- [ ] Phase 7: Event Log UI [domain: frontend] → [subplan](./phase-7-eventlog-ui.md)
|
||||
- [ ] Phase 8: Container Stats & Notifications [domain: fullstack] → [subplan](./phase-8-stats-notifications.md)
|
||||
|
||||
**Parallelizable phases:**
|
||||
- Phases 4, 5, 6, 7 are all frontend phases that touch different routes/components and can potentially run in parallel after all backend phases (1-3) complete.
|
||||
|
||||
## Phase Progress Log
|
||||
|
||||
| Phase | Domain | Status | Review | Build | Committed |
|
||||
|-------|--------|--------|--------|-------|-----------|
|
||||
| Phase 1: Schema & Event Log | backend | ⬜ Not Started | ⬜ | ⬜ | ⬜ |
|
||||
| Phase 2: Stale Detection | backend | ⬜ Not Started | ⬜ | ⬜ | ⬜ |
|
||||
| Phase 3: Proxy Creation | backend | ⬜ Not Started | ⬜ | ⬜ | ⬜ |
|
||||
| Phase 4: Proxy Viewer UI | frontend | ⬜ Not Started | ⬜ | ⬜ | ⬜ |
|
||||
| Phase 5: Stale Containers UI | frontend | ⬜ Not Started | ⬜ | ⬜ | ⬜ |
|
||||
| Phase 6: Proxy Creation UI | frontend | ⬜ Not Started | ⬜ | ⬜ | ⬜ |
|
||||
| Phase 7: Event Log UI | frontend | ⬜ Not Started | ⬜ | ⬜ | ⬜ |
|
||||
| Phase 8: Stats & Notifications | fullstack | ⬜ Not Started | ⬜ | ⬜ | ⬜ |
|
||||
|
||||
## Final Review
|
||||
- [ ] Comprehensive code review
|
||||
- [ ] Full build passes
|
||||
- [ ] Full test suite passes
|
||||
- [ ] Merged to `main`
|
||||
@@ -0,0 +1,60 @@
|
||||
# Phase 1: Schema, Models & Event Log Backend
|
||||
|
||||
**Status:** ⬜ Not Started
|
||||
**Parent plan:** [PLAN.md](./PLAN.md)
|
||||
**Domain:** backend
|
||||
|
||||
## Objective
|
||||
Lay the database foundation for all new features and implement the persistent event log system.
|
||||
|
||||
## Tasks
|
||||
|
||||
- [ ] Task 1: Add `event_log` table to schema (id INTEGER PK AUTOINCREMENT, source TEXT, severity TEXT, message TEXT, metadata TEXT JSON, created_at TEXT)
|
||||
- [ ] Task 2: Add `standalone_proxies` table to schema (id TEXT PK, domain TEXT UNIQUE, destination_url TEXT, destination_port INTEGER, ssl_certificate_id INTEGER, npm_proxy_id INTEGER, health_status TEXT, health_checked_at TEXT, created_at TEXT, updated_at TEXT)
|
||||
- [ ] Task 3: Add `stale_threshold_days` column to settings table (migration, default 7)
|
||||
- [ ] Task 4: Create `internal/store/eventlog.go` — store methods: InsertEvent, ListEvents (paginated, filterable by severity/source/date range), GetEventStats (counts by severity), PruneEvents (delete old entries)
|
||||
- [ ] Task 5: Create `internal/store/standalone_proxy.go` — store methods: CreateStandaloneProxy, GetStandaloneProxy, ListStandaloneProxies, UpdateStandaloneProxy, DeleteStandaloneProxy, UpdateProxyHealth
|
||||
- [ ] Task 6: Create Go models in `internal/store/models.go` — EventLog struct, StandaloneProxy struct
|
||||
- [ ] Task 7: Update settings model to include stale_threshold_days field; update GetSettings/SaveSettings
|
||||
- [ ] Task 8: Enhance event bus to auto-persist warn/error events — add a subscriber in events.Bus that writes to store
|
||||
- [ ] Task 9: Add API endpoints: `GET /api/events/log` (paginated, filterable), `GET /api/events/log/stats`
|
||||
- [ ] Task 10: Add new SSE event type `event_log` — broadcast persistent events in real-time
|
||||
- [ ] Task 11: Add frontend types: EventLogEntry, StandaloneProxy interfaces in types.ts
|
||||
- [ ] Task 12: Add API functions in api.ts: fetchEventLog, fetchEventLogStats
|
||||
|
||||
## Files to Modify/Create
|
||||
- `internal/store/store.go` — Add schema for event_log, standalone_proxies tables; migration for stale_threshold_days
|
||||
- `internal/store/models.go` — Add EventLog, StandaloneProxy structs; update Settings struct
|
||||
- `internal/store/eventlog.go` — NEW: Event log store methods
|
||||
- `internal/store/standalone_proxy.go` — NEW: Standalone proxy store methods
|
||||
- `internal/store/settings.go` — Update GetSettings/SaveSettings for new field
|
||||
- `internal/events/bus.go` — Add persistent event subscriber
|
||||
- `internal/api/router.go` — Mount new event log routes
|
||||
- `internal/api/eventlog.go` — NEW: Event log HTTP handlers
|
||||
- `web/src/lib/types.ts` — Add EventLogEntry, StandaloneProxy types
|
||||
- `web/src/lib/api.ts` — Add fetchEventLog, fetchEventLogStats functions
|
||||
|
||||
## Acceptance Criteria
|
||||
- event_log and standalone_proxies tables created on startup (migration is idempotent)
|
||||
- stale_threshold_days setting accessible via settings API
|
||||
- Events with warn/error severity auto-persisted from event bus
|
||||
- GET /api/events/log returns paginated, filterable results
|
||||
- GET /api/events/log/stats returns severity counts
|
||||
- Frontend types and API functions ready for downstream UI phases
|
||||
- Existing functionality unchanged — all current tests/builds pass
|
||||
|
||||
## Notes
|
||||
- Follow existing migration pattern: ALTER TABLE errors ignored for idempotency
|
||||
- event_log metadata is a JSON TEXT column for flexible structured data
|
||||
- Pagination follows offset/limit pattern (no cursor — SQLite is simple enough)
|
||||
- Event log pruning can be called from a cron job later (Phase 8)
|
||||
|
||||
## Review Checklist
|
||||
- [ ] All tasks completed
|
||||
- [ ] Code follows project conventions
|
||||
- [ ] No unintended side effects
|
||||
- [ ] Build passes
|
||||
- [ ] Tests pass (new + existing)
|
||||
|
||||
## Handoff to Next Phase
|
||||
<!-- Filled in by the implementation agent after completing this phase. -->
|
||||
@@ -0,0 +1,55 @@
|
||||
# Phase 2: Stale Container Detection
|
||||
|
||||
**Status:** ⬜ Not Started
|
||||
**Parent plan:** [PLAN.md](./PLAN.md)
|
||||
**Domain:** backend
|
||||
|
||||
## Objective
|
||||
Implement a periodic scanner that detects containers managed by docker-watcher that have not been running for more than a configurable number of days (`stale_threshold_days`), and expose them via the API.
|
||||
|
||||
## Tasks
|
||||
|
||||
- [ ] Task 1: Create `internal/stale/scanner.go` — Scanner struct with dependencies (store, docker client, event bus)
|
||||
- [ ] Task 2: Implement scan logic: query all instances from store, check Docker container state via Docker SDK, compare against stale_threshold_days from settings
|
||||
- [ ] Task 3: Add `last_alive_at` column to instances table (migration) — updated when instance is seen running
|
||||
- [ ] Task 4: Update deployer/instance lifecycle to set last_alive_at when container starts/is seen running
|
||||
- [ ] Task 5: Implement stale detection: instance is stale if status != 'running' AND (now - last_alive_at) > threshold days
|
||||
- [ ] Task 6: Emit event_log warnings when containers become newly stale (avoid re-emitting for already-known stale containers)
|
||||
- [ ] Task 7: Register scanner as cron job (reuse existing robfig/cron infrastructure from registry poller)
|
||||
- [ ] Task 8: Add API endpoints: `GET /api/containers/stale` (list stale with project/stage info), `POST /api/containers/stale/{id}/cleanup` (remove single), `POST /api/containers/stale/cleanup` (bulk remove)
|
||||
- [ ] Task 9: Cleanup handler: stop container via Docker SDK, remove instance from store, emit event
|
||||
- [ ] Task 10: Wire scanner into main.go startup (after store, docker client, event bus init)
|
||||
|
||||
## Files to Modify/Create
|
||||
- `internal/stale/scanner.go` — NEW: Stale container scanner
|
||||
- `internal/store/store.go` — Migration for last_alive_at column
|
||||
- `internal/store/models.go` — Update Instance struct with LastAliveAt field
|
||||
- `internal/store/instances.go` — Update queries to include last_alive_at; add UpdateLastAliveAt method
|
||||
- `internal/api/router.go` — Mount stale container routes
|
||||
- `internal/api/stale.go` — NEW: Stale container HTTP handlers
|
||||
- `cmd/server/main.go` — Wire scanner with cron
|
||||
|
||||
## Acceptance Criteria
|
||||
- Scanner runs on a configurable interval (e.g., every hour)
|
||||
- Stale containers correctly identified based on threshold
|
||||
- GET /api/containers/stale returns list with project name, stage name, image tag, last alive timestamp, days stale
|
||||
- Cleanup endpoints properly stop Docker containers and remove from store
|
||||
- Events emitted when containers become stale
|
||||
- Existing deploy flow unaffected — last_alive_at updated on successful deploy
|
||||
- Build passes, existing tests pass
|
||||
|
||||
## Notes
|
||||
- Scanner should gracefully handle containers that no longer exist in Docker (e.g., already removed externally)
|
||||
- Bulk cleanup should be admin-only
|
||||
- Consider: scan interval could be derived from stale_threshold_days (e.g., scan every threshold/7 days, min 1h)
|
||||
- Don't remove containers that are in 'removing' status (already being cleaned up)
|
||||
|
||||
## Review Checklist
|
||||
- [ ] All tasks completed
|
||||
- [ ] Code follows project conventions
|
||||
- [ ] No unintended side effects
|
||||
- [ ] Build passes
|
||||
- [ ] Tests pass (new + existing)
|
||||
|
||||
## Handoff to Next Phase
|
||||
<!-- Filled in by the implementation agent after completing this phase. -->
|
||||
@@ -0,0 +1,81 @@
|
||||
# Phase 3: Direct Proxy Creation with Validation
|
||||
|
||||
**Status:** ⬜ Not Started
|
||||
**Parent plan:** [PLAN.md](./PLAN.md)
|
||||
**Domain:** backend
|
||||
|
||||
## Objective
|
||||
Implement standalone proxy creation with a multi-step validation pipeline that checks destination reachability, and periodic health monitoring for all standalone proxies.
|
||||
|
||||
## Tasks
|
||||
|
||||
- [ ] Task 1: Create `internal/proxy/validator.go` — validation pipeline:
|
||||
  - URL/port syntax validation
|
||||
  - DNS resolution check
|
||||
  - TCP port reachability (net.DialTimeout, 5s)
|
||||
  - HTTP health probe (GET to destination, 10s timeout)
|
||||
  - Returns structured ValidationResult with per-step pass/fail and diagnostic hints
|
||||
- [ ] Task 2: Create `internal/proxy/hints.go` — diagnostic hint generator:
|
||||
  - DNS failure → "Domain cannot be resolved. Check DNS settings or use an IP address."
|
||||
  - TCP refused → "Port {port} is not accepting connections. Check if the service is running and the port is correct."
|
||||
  - TCP timeout → "Connection timed out. Possible firewall blocking. Check network/firewall rules."
|
||||
  - Host unreachable → "Host is not reachable. Verify the IP address and network connectivity."
|
||||
  - HTTP error → "Service responded with HTTP {status}. The service may not be healthy."
|
||||
- [ ] Task 3: Create `internal/proxy/manager.go` — proxy lifecycle:
|
||||
  - CreateProxy: validate destination, create NPM proxy host (using npm.Client), assign SSL cert from settings, save to standalone_proxies table
|
||||
  - UpdateProxy: re-validate, update NPM proxy host, update store
|
||||
  - DeleteProxy: remove NPM proxy host, remove from store
|
||||
  - GetProxy/ListProxies: read from store with health status
|
||||
- [ ] Task 4: Create `internal/proxy/health.go` — periodic health monitor:
|
||||
  - Cron job that checks all standalone proxies
|
||||
  - HTTP GET to destination URL/port
|
||||
  - Updates health_status (healthy/unhealthy/unknown) and health_checked_at in store
|
||||
  - Emits event_log on status change (healthy→unhealthy or vice versa)
|
||||
- [ ] Task 5: Add API endpoints:
|
||||
  - `POST /api/proxies/validate` — run validation without creating
|
||||
  - `POST /api/proxies` — create standalone proxy
|
||||
  - `GET /api/proxies` — list standalone proxies
|
||||
  - `GET /api/proxies/{id}` — get single proxy
|
||||
  - `PUT /api/proxies/{id}` — update proxy
|
||||
  - `DELETE /api/proxies/{id}` — delete proxy
|
||||
  - `GET /api/proxies/all` — merged view: standalone + deploy-managed proxies (for Phase 4 UI)
|
||||
- [ ] Task 6: Wire health monitor cron job in main.go
|
||||
- [ ] Task 7: Add frontend API functions in api.ts: validateProxy, createProxy, listProxies, getProxy, updateProxy, deleteProxy, listAllProxies
|
||||
- [ ] Task 8: Add frontend types: ValidationResult, ValidationStep, ProxyHealthStatus
|
||||
|
||||
## Files to Modify/Create
|
||||
- `internal/proxy/validator.go` — NEW: Validation pipeline
|
||||
- `internal/proxy/hints.go` — NEW: Diagnostic hints
|
||||
- `internal/proxy/manager.go` — NEW: Proxy lifecycle management
|
||||
- `internal/proxy/health.go` — NEW: Health monitoring
|
||||
- `internal/api/router.go` — Mount proxy routes
|
||||
- `internal/api/proxy.go` — NEW: Proxy HTTP handlers
|
||||
- `cmd/server/main.go` — Wire proxy manager and health monitor
|
||||
- `web/src/lib/types.ts` — Add ValidationResult, ProxyHealthStatus types
|
||||
- `web/src/lib/api.ts` — Add proxy API functions
|
||||
|
||||
## Acceptance Criteria
|
||||
- Validation pipeline returns structured results with specific failure hints
|
||||
- POST /api/proxies/validate runs full check without side effects
|
||||
- Proxy creation creates NPM proxy host with SSL cert from global settings
|
||||
- Health monitor runs periodically and updates proxy status
|
||||
- Events emitted on health status changes
|
||||
- GET /api/proxies/all merges standalone and deploy-managed proxy data
|
||||
- Build passes, existing tests pass
|
||||
|
||||
## Notes
|
||||
- Validation should be fast (short timeouts) — user waits for results
|
||||
- Health monitor interval: every 5 minutes (configurable later)
|
||||
- For /api/proxies/all: query NPM for all proxy hosts, join with instances table for managed proxies, join with standalone_proxies for standalone ones
|
||||
- SSL cert auto-assigned from settings.ssl_certificate_id
|
||||
- Consider: proxy domain must be unique across both standalone and managed proxies
|
||||
|
||||
## Review Checklist
|
||||
- [ ] All tasks completed
|
||||
- [ ] Code follows project conventions
|
||||
- [ ] No unintended side effects
|
||||
- [ ] Build passes
|
||||
- [ ] Tests pass (new + existing)
|
||||
|
||||
## Handoff to Next Phase
|
||||
<!-- Filled in by the implementation agent after completing this phase. -->
|
||||
@@ -0,0 +1,56 @@
|
||||
# Phase 4: Unified Proxy Viewer UI
|
||||
|
||||
**Status:** ⬜ Not Started
|
||||
**Parent plan:** [PLAN.md](./PLAN.md)
|
||||
**Domain:** frontend
|
||||
|
||||
## Objective
|
||||
Build a unified proxy viewer page showing ALL proxies (deploy-managed and standalone) with grouping, filtering, and real-time health indicators.
|
||||
|
||||
## Tasks
|
||||
|
||||
- [ ] Task 1: Create route `/proxies` with `+page.svelte` and `+page.ts` data loader
|
||||
- [ ] Task 2: Create ProxyCard component — displays: domain, destination, SSL badge, health indicator (green/yellow/red dot), proxy type badge (managed/standalone), last health check timestamp
|
||||
- [ ] Task 3: Create ProxyGroup component — collapsible section with project name header, stage sub-groups, proxy count badge
|
||||
- [ ] Task 4: Create StandaloneProxyGroup component — separate collapsible section for user-created proxies
|
||||
- [ ] Task 5: Implement filtering: by project, stage, health status (healthy/unhealthy/unknown), proxy type (managed/standalone), free-text search by domain/destination
|
||||
- [ ] Task 6: Filter bar component with dropdown selects and search input
|
||||
- [ ] Task 7: SSE integration — subscribe to proxy health events, update health indicators in real-time
|
||||
- [ ] Task 8: Empty state — friendly message when no proxies exist, with link to create one
|
||||
- [ ] Task 9: Add navigation link in sidebar layout (+layout.svelte)
|
||||
- [ ] Task 10: Add i18n keys for proxy viewer page
|
||||
|
||||
## Files to Modify/Create
|
||||
- `web/src/routes/proxies/+page.svelte` — NEW: Proxy viewer page
|
||||
- `web/src/routes/proxies/+page.ts` — NEW: Data loader
|
||||
- `web/src/lib/components/ProxyCard.svelte` — NEW: Individual proxy display
|
||||
- `web/src/lib/components/ProxyGroup.svelte` — NEW: Collapsible project/stage group
|
||||
- `web/src/lib/components/ProxyFilter.svelte` — NEW: Filter bar
|
||||
- `web/src/routes/+layout.svelte` — Add proxies nav link
|
||||
- `web/src/lib/i18n/en.ts` (or equivalent) — Add proxy viewer strings
|
||||
|
||||
## Acceptance Criteria
|
||||
- All proxies visible: both deploy-managed and standalone
|
||||
- Proxies grouped by project/stage in collapsible sections
|
||||
- Health indicators show real-time status (green=healthy, red=unhealthy, yellow=unknown)
|
||||
- Filtering works: project, stage, health, type, text search
|
||||
- SSE updates health indicators without page refresh
|
||||
- Navigation accessible from sidebar
|
||||
- Responsive layout (mobile-friendly)
|
||||
|
||||
## Notes
|
||||
- Use existing component patterns (ConfirmDialog, FormField styles, etc.)
|
||||
- Follow existing Svelte 5 patterns ($state, $derived, $effect)
|
||||
- The /api/proxies/all endpoint from Phase 3 provides the data source
|
||||
- Health indicator should pulse/animate briefly on status change
|
||||
- Consider: show proxy count in sidebar nav badge
|
||||
|
||||
## Review Checklist
|
||||
- [ ] All tasks completed
|
||||
- [ ] Code follows project conventions
|
||||
- [ ] No unintended side effects
|
||||
- [ ] Build passes
|
||||
- [ ] Tests pass (new + existing)
|
||||
|
||||
## Handoff to Next Phase
|
||||
<!-- Filled in by the implementation agent after completing this phase. -->
|
||||
@@ -0,0 +1,55 @@
|
||||
# Phase 5: Stale Containers UI
|
||||
|
||||
**Status:** ⬜ Not Started
|
||||
**Parent plan:** [PLAN.md](./PLAN.md)
|
||||
**Domain:** frontend
|
||||
|
||||
## Objective
|
||||
Build the stale containers dashboard widget and dedicated view, with cleanup actions and settings configuration.
|
||||
|
||||
## Tasks
|
||||
|
||||
- [ ] Task 1: Add API functions in api.ts: fetchStaleContainers, cleanupStaleContainer, bulkCleanupStaleContainers
|
||||
- [ ] Task 2: Create StaleContainerCard component — shows: container name, project, stage, image tag, last alive timestamp, "X days stale" badge (color-coded by severity)
|
||||
- [ ] Task 3: Create stale containers section on dashboard (+page.svelte) — count badge, mini-list of top 5 offenders, "View all" link
|
||||
- [ ] Task 4: Create dedicated route `/containers/stale` with full stale container list
|
||||
- [ ] Task 5: Individual cleanup action — ConfirmDialog with warning, calls cleanup API
|
||||
- [ ] Task 6: Bulk cleanup action — "Clean up all" button with confirmation, progress indicator
|
||||
- [ ] Task 7: Settings integration — add stale_threshold_days field to settings page with validation (min 1 day)
|
||||
- [ ] Task 8: Add navigation link or sub-nav for stale containers
|
||||
- [ ] Task 9: Add i18n keys for stale containers
|
||||
|
||||
## Files to Modify/Create
|
||||
- `web/src/lib/api.ts` — Add stale container API functions
|
||||
- `web/src/lib/types.ts` — Add StaleContainer interface
|
||||
- `web/src/lib/components/StaleContainerCard.svelte` — NEW: Stale container display
|
||||
- `web/src/routes/+page.svelte` — Add stale containers dashboard widget
|
||||
- `web/src/routes/containers/stale/+page.svelte` — NEW: Dedicated stale view
|
||||
- `web/src/routes/containers/stale/+page.ts` — NEW: Data loader
|
||||
- `web/src/routes/settings/+page.svelte` — Add stale threshold setting field
|
||||
- `web/src/routes/+layout.svelte` — Add nav link if needed
|
||||
|
||||
## Acceptance Criteria
|
||||
- Dashboard shows stale container count and top offenders
|
||||
- Dedicated page lists all stale containers with details
|
||||
- Individual cleanup removes container with confirmation
|
||||
- Bulk cleanup works with progress feedback
|
||||
- Settings page allows configuring stale threshold
|
||||
- Severity coloring: stale for fewer than 14 days = yellow, 14 or more days = red (with the default 7-day threshold, yellow covers days 7–13)
|
||||
- Responsive layout
|
||||
|
||||
## Notes
|
||||
- Reuse existing ConfirmDialog for destructive actions
|
||||
- Dashboard widget should not slow down initial page load (lazy load or small payload)
|
||||
- Stale container data comes from GET /api/containers/stale (Phase 2)
|
||||
- Settings update uses existing PUT /api/settings endpoint
|
||||
|
||||
## Review Checklist
|
||||
- [ ] All tasks completed
|
||||
- [ ] Code follows project conventions
|
||||
- [ ] No unintended side effects
|
||||
- [ ] Build passes
|
||||
- [ ] Tests pass (new + existing)
|
||||
|
||||
## Handoff to Next Phase
|
||||
<!-- Filled in by the implementation agent after completing this phase. -->
|
||||
@@ -0,0 +1,54 @@
|
||||
# Phase 6: Direct Proxy Creation UI
|
||||
|
||||
**Status:** ⬜ Not Started
|
||||
**Parent plan:** [PLAN.md](./PLAN.md)
|
||||
**Domain:** frontend
|
||||
|
||||
## Objective
|
||||
Build the proxy creation form with live validation feedback, diagnostic hints, and management actions (edit/delete).
|
||||
|
||||
## Tasks
|
||||
|
||||
- [ ] Task 1: Create "Create Proxy" form component — fields: destination URL/IP, port, domain (auto-suggested from subdomain pattern), optional custom subdomain override
|
||||
- [ ] Task 2: Live validation — debounced calls to POST /api/proxies/validate as user types (300ms debounce)
|
||||
- [ ] Task 3: Validation result display — step-by-step checklist with icons:
|
||||
  - ✅ DNS resolution OK / ❌ DNS resolution failed
|
||||
  - ✅ TCP port reachable / ❌ TCP port not reachable
|
||||
  - ✅ HTTP responding / ❌ HTTP not responding
|
||||
  - Each failure shows the diagnostic hint from the backend
|
||||
- [ ] Task 4: Create proxy submission — calls POST /api/proxies, shows success toast with health indicator
|
||||
- [ ] Task 5: Edit proxy — modal or inline form, pre-populated with current values, re-validates on save
|
||||
- [ ] Task 6: Delete proxy — ConfirmDialog with domain name confirmation
|
||||
- [ ] Task 7: Integration with proxy viewer page — "Create Proxy" button in the proxy viewer header
|
||||
- [ ] Task 8: Domain auto-suggestion — when user enters destination, suggest domain based on subdomain_pattern from settings
|
||||
- [ ] Task 9: Add i18n keys for proxy creation
|
||||
|
||||
## Files to Modify/Create
|
||||
- `web/src/lib/components/ProxyForm.svelte` — NEW: Create/edit proxy form
|
||||
- `web/src/lib/components/ValidationChecklist.svelte` — NEW: Step-by-step validation display
|
||||
- `web/src/routes/proxies/+page.svelte` — Add "Create Proxy" button and modal/panel
|
||||
- `web/src/lib/api.ts` — Ensure validateProxy, createProxy, updateProxy, deleteProxy are present (from Phase 3)
|
||||
|
||||
## Acceptance Criteria
|
||||
- Form validates destination in real-time with debouncing
|
||||
- Each validation step shows pass/fail with diagnostic hints
|
||||
- Proxy creation works end-to-end (form → API → NPM → success)
|
||||
- Edit and delete work for existing standalone proxies
|
||||
- Domain auto-suggestion works from settings pattern
|
||||
- Error states handled gracefully (network errors, API failures)
|
||||
|
||||
## Notes
|
||||
- Validation should show a loading spinner while in progress
|
||||
- Don't validate on every keystroke — use 300ms debounce
|
||||
- Even if validation steps fail (including all of them), still allow creation (user might know better — just warn)
|
||||
- SSL certificate is applied automatically from global settings (no cert picker in form)
|
||||
|
||||
## Review Checklist
|
||||
- [ ] All tasks completed
|
||||
- [ ] Code follows project conventions
|
||||
- [ ] No unintended side effects
|
||||
- [ ] Build passes
|
||||
- [ ] Tests pass (new + existing)
|
||||
|
||||
## Handoff to Next Phase
|
||||
<!-- Filled in by the implementation agent after completing this phase. -->
|
||||
@@ -0,0 +1,54 @@
|
||||
# Phase 7: Event Log UI
|
||||
|
||||
**Status:** ⬜ Not Started
|
||||
**Parent plan:** [PLAN.md](./PLAN.md)
|
||||
**Domain:** frontend
|
||||
|
||||
## Objective
|
||||
Build a persistent, searchable event log viewer with real-time streaming, filters, and resource linking.
|
||||
|
||||
## Tasks
|
||||
|
||||
- [ ] Task 1: Create route `/events` with `+page.svelte` and `+page.ts` data loader
|
||||
- [ ] Task 2: Create EventLogEntry component — timestamp, severity badge (info=blue, warn=yellow, error=red), source icon (container/proxy/deploy/system), message text, expandable metadata section
|
||||
- [ ] Task 3: Create EventLogFilter component — filters: severity multi-select, source multi-select, date range picker (start/end), free-text search
|
||||
- [ ] Task 4: Implement pagination — "Load more" button at bottom (offset/limit pattern matching API)
|
||||
- [ ] Task 5: SSE integration — subscribe to event_log events, prepend new entries at top with subtle highlight animation
|
||||
- [ ] Task 6: Quick actions — clickable links to related resources (e.g., click container name → go to project/stage, click proxy domain → go to proxy viewer)
|
||||
- [ ] Task 7: Stats header — show counts by severity (from GET /api/events/log/stats), with colored badges
|
||||
- [ ] Task 8: Add navigation link in sidebar
|
||||
- [ ] Task 9: Add i18n keys for event log page
|
||||
|
||||
## Files to Modify/Create
|
||||
- `web/src/routes/events/+page.svelte` — NEW: Event log page
|
||||
- `web/src/routes/events/+page.ts` — NEW: Data loader
|
||||
- `web/src/lib/components/EventLogEntry.svelte` — NEW: Event entry display
|
||||
- `web/src/lib/components/EventLogFilter.svelte` — NEW: Filter controls
|
||||
- `web/src/routes/+layout.svelte` — Add events nav link
|
||||
- `web/src/lib/sse.ts` — Add event_log SSE subscription helper (if needed)
|
||||
|
||||
## Acceptance Criteria
|
||||
- Event log shows all persistent events with severity and source
|
||||
- Filters work: severity, source, date range, text search
|
||||
- New events stream in real-time via SSE without page refresh
|
||||
- Pagination loads older events on demand
|
||||
- Quick actions link to related resources
|
||||
- Stats header shows severity distribution
|
||||
- Responsive layout
|
||||
|
||||
## Notes
|
||||
- Follow existing SSE patterns from deploy logs viewer
|
||||
- Date range filter: consider "last hour", "last 24h", "last 7 days" presets + custom range
|
||||
- Metadata section is JSON — render as formatted key-value pairs, not raw JSON
|
||||
- Resource linking: parse source and metadata to construct navigation URLs
|
||||
- Consider: auto-scroll to top when new event arrives (if user is at top), otherwise show "N new events" badge
|
||||
|
||||
## Review Checklist
|
||||
- [ ] All tasks completed
|
||||
- [ ] Code follows project conventions
|
||||
- [ ] No unintended side effects
|
||||
- [ ] Build passes
|
||||
- [ ] Tests pass (new + existing)
|
||||
|
||||
## Handoff to Next Phase
|
||||
<!-- Filled in by the implementation agent after completing this phase. -->
|
||||
@@ -0,0 +1,67 @@
|
||||
# Phase 8: Container Stats & Notifications
|
||||
|
||||
**Status:** ⬜ Not Started
|
||||
**Parent plan:** [PLAN.md](./PLAN.md)
|
||||
**Domain:** fullstack
|
||||
|
||||
## Objective
|
||||
Add container resource monitoring (CPU/memory), notification triggers for operational events, and a system health dashboard summary.
|
||||
|
||||
## Tasks
|
||||
|
||||
- [ ] Task 1: Create `internal/docker/stats.go` — wrapper around Docker Stats API to get CPU %, memory usage/limit for a container
|
||||
- [ ] Task 2: Add API endpoint: `GET /api/projects/{id}/stages/{stage}/instances/{iid}/stats` — returns current CPU/memory for an instance
|
||||
- [ ] Task 3: Create SSE event type `container_stats` — periodically broadcast stats for running containers (every 30s)
|
||||
- [ ] Task 4: Extend notification stub (`internal/notify/`) — implement webhook sender for events:
|
||||
  - Stale container detected
|
||||
  - Proxy health failure
|
||||
  - Deploy failure/rollback
|
||||
  - Format: JSON payload with event type, details, timestamp
|
||||
- [ ] Task 5: Add notification settings UI — enable/disable per event type in settings page
|
||||
- [ ] Task 6: Update instance cards in frontend — show CPU % bar and memory usage badge
|
||||
- [ ] Task 7: Create ContainerStats component — mini CPU/memory visualization (progress bars)
|
||||
- [ ] Task 8: Dashboard system health summary card — total containers (running/stopped), healthy/unhealthy proxies, recent error count (last 24h)
|
||||
- [ ] Task 9: Wire notification sender to event bus — subscribe to relevant event types, fire notifications
|
||||
- [ ] Task 10: Add event log pruning cron job — delete events older than 30 days (configurable)
|
||||
- [ ] Task 11: Add i18n keys for stats and notifications
|
||||
|
||||
## Files to Modify/Create
|
||||
- `internal/docker/stats.go` — NEW: Docker Stats API wrapper
|
||||
- `internal/api/stats.go` — NEW: Stats HTTP handler
|
||||
- `internal/api/router.go` — Mount stats endpoint
|
||||
- `internal/notify/sender.go` — Implement webhook notification sender
|
||||
- `internal/notify/types.go` — NEW: Notification event types and payloads
|
||||
- `cmd/server/main.go` — Wire notification subscriber and event pruning cron
|
||||
- `web/src/lib/types.ts` — Add ContainerStats, NotificationSettings types
|
||||
- `web/src/lib/api.ts` — Add fetchContainerStats function
|
||||
- `web/src/lib/components/ContainerStats.svelte` — NEW: CPU/memory display
|
||||
- `web/src/lib/components/SystemHealthCard.svelte` — NEW: Dashboard summary
|
||||
- `web/src/routes/+page.svelte` — Add system health card to dashboard
|
||||
- `web/src/routes/settings/+page.svelte` — Add notification settings section
|
||||
- `web/src/lib/sse.ts` — Add container_stats SSE handler
|
||||
|
||||
## Acceptance Criteria
|
||||
- Container stats (CPU/memory) visible on instance cards
|
||||
- Stats update in real-time via SSE
|
||||
- Webhook notifications fire for configured event types
|
||||
- Dashboard shows system health summary
|
||||
- Event log auto-prunes old entries
|
||||
- Settings page allows configuring notification preferences
|
||||
- Build passes, existing tests pass
|
||||
|
||||
## Notes
|
||||
- Docker Stats API returns a stream — read one snapshot and close, don't hold the connection
|
||||
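- CPU calculation: (container CPU delta / system CPU delta) * number of online CPUs * 100 — the deltas need two samples, but a single non-streaming response (`stream=false`, without `one-shot`) already carries the previous sample in `precpu_stats`, so one read is enough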
|
||||
- Memory: usage_bytes / limit_bytes * 100 for percentage
|
||||
- Notification webhook format should be compatible with common receivers (Slack webhook, Discord webhook, generic HTTP)
|
||||
- System health card: consider caching aggregated stats to avoid N+1 queries on dashboard load
|
||||
|
||||
## Review Checklist
|
||||
- [ ] All tasks completed
|
||||
- [ ] Code follows project conventions
|
||||
- [ ] No unintended side effects
|
||||
- [ ] Build passes
|
||||
- [ ] Tests pass (new + existing)
|
||||
|
||||
## Handoff to Next Phase
|
||||
<!-- Filled in by the implementation agent after completing this phase. -->
|
||||
@@ -2,6 +2,8 @@ import type {
|
||||
ApiEnvelope,
|
||||
Deploy,
|
||||
DeployLog,
|
||||
EventLogEntry,
|
||||
EventLogStats,
|
||||
InspectResult,
|
||||
Instance,
|
||||
NpmCertificate,
|
||||
@@ -338,4 +340,29 @@ export function deleteVolume(
|
||||
return del<{ deleted: string }>(`/api/projects/${projectId}/volumes/${volId}`);
|
||||
}
|
||||
|
||||
// ── Event Log ───────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Fetches persisted event log entries from `GET /api/events/log`.
 *
 * Every filter is optional. A filter whose value is falsy (`undefined`, `''`,
 * or `0`) is left out of the query string entirely, so the server decides what
 * to do when it is absent.
 *
 * @param params - Optional filters (`severity`, `source`, `since`, `until`)
 *   and pagination controls (`limit`, `offset`).
 * @returns A promise resolving to the matching event log entries.
 */
export function fetchEventLog(params?: {
	severity?: string;
	source?: string;
	since?: string;
	until?: string;
	limit?: number;
	offset?: number;
}): Promise<EventLogEntry[]> {
	const search = new URLSearchParams();

	// Fixed key order keeps the generated query string stable.
	const keys = ['severity', 'source', 'since', 'until', 'limit', 'offset'] as const;
	for (const key of keys) {
		const value = params?.[key];
		// Truthiness check on purpose: empty strings and zeroes are not sent.
		if (value) search.set(key, String(value));
	}

	const encoded = search.toString();
	const suffix = encoded ? `?${encoded}` : '';
	return get<EventLogEntry[]>(`/api/events/log${suffix}`);
}
|
||||
|
||||
/**
 * Fetches the event log severity counters from `GET /api/events/log/stats`.
 *
 * @returns A promise resolving to the per-severity counts.
 */
export function fetchEventLogStats(): Promise<EventLogStats> {
	const statsPath = '/api/events/log/stats';
	return get<EventLogStats>(statsPath);
}
|
||||
|
||||
export { ApiError };
|
||||
|
||||
@@ -106,6 +106,7 @@ export interface Settings {
|
||||
polling_interval: string;
|
||||
base_volume_path: string;
|
||||
ssl_certificate_id: number;
|
||||
stale_threshold_days: number;
|
||||
updated_at: string;
|
||||
}
|
||||
|
||||
@@ -170,3 +171,35 @@ export interface Volume {
|
||||
created_at: string;
|
||||
updated_at: string;
|
||||
}
|
||||
|
||||
/**
 * A persistent event log entry, as returned by `GET /api/events/log` and
 * planned for broadcast over the `event_log` SSE event type.
 */
|
||||
export interface EventLogEntry {
|
||||
/** Row identifier (planned as an auto-incrementing integer primary key). */
id: number;
|
||||
/**
 * Origin of the event.
 * NOTE(review): the UI plan mentions container/proxy/deploy/system — confirm
 * the actual set of values against the backend emitters.
 */
source: string;
|
||||
/** Severity level; only warn/error events are auto-persisted from the event bus. */
severity: 'info' | 'warn' | 'error';
|
||||
/** Event message text. */
message: string;
|
||||
/**
 * Structured event data carried as a string; the schema plan describes the
 * column as JSON text, so callers presumably need to parse it before use.
 * NOTE(review): confirm whether it can be empty.
 */
metadata: string;
|
||||
/** Creation timestamp as a string. NOTE(review): exact format not shown here — confirm. */
created_at: string;
|
||||
}
|
||||
|
||||
/**
 * Severity counts for the event log, as returned by
 * `GET /api/events/log/stats`.
 */
|
||||
export interface EventLogStats {
|
||||
/** Number of entries with severity `info`. */
info: number;
|
||||
/** Number of entries with severity `warn`. */
warn: number;
|
||||
/** Number of entries with severity `error`. */
error: number;
|
||||
/** Overall entry count. NOTE(review): presumably info + warn + error — confirm against the backend. */
total: number;
|
||||
}
|
||||
|
||||
/**
 * A standalone reverse proxy not tied to a project (a row of the
 * `standalone_proxies` table).
 */
|
||||
export interface StandaloneProxy {
|
||||
/** Proxy identifier (text primary key in the schema plan). */
id: string;
|
||||
/** Domain served by the proxy; planned as unique across standalone proxies. */
domain: string;
|
||||
/** Destination the proxy forwards to. NOTE(review): the creation form plan says URL or IP — confirm the stored form. */
destination_url: string;
|
||||
/** Destination port. */
destination_port: number;
|
||||
/** SSL certificate identifier. NOTE(review): planned to be auto-assigned from the global settings — confirm. */
ssl_certificate_id: number;
|
||||
/** Identifier of the corresponding NPM proxy host. */
npm_proxy_id: number;
|
||||
/** Result of the most recent health check. */
health_status: 'unknown' | 'healthy' | 'unhealthy';
|
||||
/** Timestamp string of the last health check. NOTE(review): format and empty-value convention not shown here. */
health_checked_at: string;
|
||||
/** Creation timestamp string. */
created_at: string;
|
||||
/** Last-update timestamp string. */
updated_at: string;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user