From 7c57c740b43dca0ab2a92369546cc1eb7d636874 Mon Sep 17 00:00:00 2001 From: "alexei.dolgolyov" Date: Mon, 30 Mar 2026 11:37:25 +0300 Subject: [PATCH] feat(observability): phase 8 - container stats, notifications & dashboard Add container monitoring and notification system: - Docker Stats API: real-time CPU/memory for running containers - Webhook notifications for errors (deploy failures, stale, proxy unhealthy) - Event log auto-pruning (daily, 30-day retention) - ContainerStats component with auto-polling progress bars - SystemHealthCard dashboard widget with running/proxy/error counts - Full EN/RU i18n for stats and system health --- cmd/server/main.go | 49 ++++++++ internal/api/router.go | 1 + internal/api/stats.go | 39 ++++++ internal/docker/stats.go | 69 +++++++++++ internal/notify/types.go | 9 ++ web/src/lib/api.ts | 13 ++ web/src/lib/components/ContainerStats.svelte | 104 ++++++++++++++++ web/src/lib/components/InstanceCard.svelte | 5 + .../lib/components/SystemHealthCard.svelte | 113 ++++++++++++++++++ web/src/lib/i18n/en.json | 11 ++ web/src/lib/i18n/ru.json | 11 ++ web/src/lib/types.ts | 8 ++ web/src/routes/+page.svelte | 4 + 13 files changed, 436 insertions(+) create mode 100644 internal/api/stats.go create mode 100644 internal/docker/stats.go create mode 100644 internal/notify/types.go create mode 100644 web/src/lib/components/ContainerStats.svelte create mode 100644 web/src/lib/components/SystemHealthCard.svelte diff --git a/cmd/server/main.go b/cmd/server/main.go index 06d04a4..1842d2c 100644 --- a/cmd/server/main.go +++ b/cmd/server/main.go @@ -12,6 +12,8 @@ import ( "syscall" "time" + "github.com/robfig/cron/v3" + dockerwatcher "github.com/alexei/docker-watcher" "github.com/alexei/docker-watcher/internal/api" "github.com/alexei/docker-watcher/internal/auth" @@ -145,6 +147,51 @@ func main() { slog.Warn("failed to start proxy health monitor", "error", err) } + // Start daily event log pruning cron job. 
+ cronScheduler := cron.New() + if _, err := cronScheduler.AddFunc("@daily", func() { + pruned, err := db.PruneEvents(30) + if err != nil { + slog.Error("event log prune failed", "error", err) + return + } + if pruned > 0 { + slog.Info("pruned old event log entries", "count", pruned) + } + }); err != nil { + slog.Warn("failed to schedule event prune cron", "error", err) + } + cronScheduler.Start() + + // Subscribe to error events and forward notifications. + notifySub := eventBus.Subscribe(func(evt events.Event) bool { + if evt.Type != events.EventLog { + return false + } + p, ok := evt.Payload.(events.EventLogPayload) + if !ok { + return false + } + return p.Severity == "error" + }) + go func() { + for evt := range notifySub { + p, ok := evt.Payload.(events.EventLogPayload) + if !ok { + continue + } + currentSettings, err := db.GetSettings() + if err != nil || currentSettings.NotificationURL == "" { + continue + } + notifier.Send(currentSettings.NotificationURL, notify.Event{ + Type: p.Source + "_error", + Project: p.Source, + Error: p.Message, + }) + } + }() + // Build API server. apiServer := api.NewServer(db, dockerClient, npmClient, dep, webhookHandler, eventBus, encKey) apiServer.SetStaleScanner(staleScanner) @@ -190,6 +237,8 @@ func main() { slog.Info("shutting down...") // Stop accepting new work. 
+ cronScheduler.Stop() + eventBus.Unsubscribe(notifySub) proxyHealth.Stop() staleScanner.Stop() poller.Stop() diff --git a/internal/api/router.go b/internal/api/router.go index eecfda2..91295d9 100644 --- a/internal/api/router.go +++ b/internal/api/router.go @@ -136,6 +136,7 @@ func (s *Server) Router() chi.Router { r.Get("/", s.getProject) r.Get("/stages/{stage}/env", s.listStageEnv) r.Get("/stages/{stage}/instances", s.listInstances) + r.Get("/stages/{stage}/instances/{iid}/stats", s.getInstanceStats) r.Get("/volumes", s.listVolumes) }) r.Get("/deploys", s.listDeploys) diff --git a/internal/api/stats.go b/internal/api/stats.go new file mode 100644 index 0000000..9f6e088 --- /dev/null +++ b/internal/api/stats.go @@ -0,0 +1,39 @@ +package api + +import ( + "errors" + "net/http" + + "github.com/go-chi/chi/v5" + + "github.com/alexei/docker-watcher/internal/store" +) + +// getInstanceStats handles GET /api/projects/{id}/stages/{stage}/instances/{iid}/stats. +// Returns CPU and memory stats for the container backing the given instance. 
+func (s *Server) getInstanceStats(w http.ResponseWriter, r *http.Request) {
+	instanceID := chi.URLParam(r, "iid") // NOTE(review): {id} and {stage} are never read, so the instance is not checked against that project/stage — confirm this is intended.
+
+	inst, err := s.store.GetInstanceByID(instanceID)
+	if err != nil {
+		if errors.Is(err, store.ErrNotFound) {
+			respondNotFound(w, "instance")
+			return
+		}
+		respondError(w, http.StatusInternalServerError, "failed to get instance: "+err.Error())
+		return
+	}
+
+	if inst.ContainerID == "" { // no container recorded for this instance, so there is nothing to sample
+		respondError(w, http.StatusBadRequest, "instance has no container")
+		return
+	}
+
+	stats, err := s.docker.GetContainerStats(r.Context(), inst.ContainerID) // the Docker call is bound to the request's context
+	if err != nil {
+		respondError(w, http.StatusInternalServerError, "failed to get container stats: "+err.Error())
+		return
+	}
+
+	respondJSON(w, http.StatusOK, stats)
+}
diff --git a/internal/docker/stats.go b/internal/docker/stats.go
new file mode 100644
index 0000000..e1c919f
--- /dev/null
+++ b/internal/docker/stats.go
@@ -0,0 +1,100 @@
+package docker
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+
+	"github.com/moby/moby/api/types/container"
+	"github.com/moby/moby/client"
+)
+
+// ContainerStats holds computed CPU and memory usage for a container.
+//
+// MemoryUsage and MemoryLimit are byte counts; CPUPercent is scaled by the
+// number of CPUs, so it can exceed 100 when a container uses several cores.
+type ContainerStats struct {
+	CPUPercent    float64 `json:"cpu_percent"`
+	MemoryUsage   int64   `json:"memory_usage"`
+	MemoryLimit   int64   `json:"memory_limit"`
+	MemoryPercent float64 `json:"memory_percent"`
+}
+
+// GetContainerStats retrieves a one-shot stats snapshot for the given container
+// and computes CPU and memory percentages.
+//
+// MemoryUsage excludes reclaimable file cache (see memoryUsageNoCache), and
+// MemoryPercent stays 0 when the reported memory limit is not positive. Errors
+// from the Docker API call or from decoding its response are wrapped with the
+// container ID.
+func (c *Client) GetContainerStats(ctx context.Context, containerID string) (ContainerStats, error) {
+	result, err := c.api.ContainerStats(ctx, containerID, client.ContainerStatsOptions{
+		Stream: false,
+		// The previous sample fills PreCPUStats, which calculateCPUPercent
+		// needs to compute a usage delta.
+		IncludePreviousSample: true,
+	})
+	if err != nil {
+		return ContainerStats{}, fmt.Errorf("get container stats %s: %w", containerID, err)
+	}
+	defer result.Body.Close()
+
+	var stats container.StatsResponse
+	if err := json.NewDecoder(result.Body).Decode(&stats); err != nil {
+		return ContainerStats{}, fmt.Errorf("decode container stats %s: %w", containerID, err)
+	}
+
+	memUsage := int64(memoryUsageNoCache(stats.MemoryStats.Usage, stats.MemoryStats.Stats))
+	memLimit := int64(stats.MemoryStats.Limit)
+	var memPercent float64
+	if memLimit > 0 {
+		memPercent = float64(memUsage) / float64(memLimit) * 100.0
+	}
+
+	return ContainerStats{
+		CPUPercent:    calculateCPUPercent(stats),
+		MemoryUsage:   memUsage,
+		MemoryLimit:   memLimit,
+		MemoryPercent: memPercent,
+	}, nil
+}
+
+// memoryUsageNoCache returns usage with the inactive file cache subtracted,
+// mirroring the calculation the Docker CLI uses for `docker stats`. counters is
+// the raw counter map from the stats response: total_inactive_file is used when
+// present (cgroup v1), otherwise inactive_file (cgroup v2). usage is returned
+// unchanged when the counter is missing or not smaller than usage.
+func memoryUsageNoCache(usage uint64, counters map[string]uint64) uint64 {
+	if v, ok := counters["total_inactive_file"]; ok && v < usage {
+		return usage - v
+	}
+	if v := counters["inactive_file"]; v < usage {
+		return usage - v
+	}
+	return usage
+}
+
+// calculateCPUPercent computes CPU usage percentage from a stats response
+// using the delta between current and previous CPU readings.
+//
+// It returns 0 when the system delta is not positive or the container delta is
+// negative. The CPU count comes from OnlineCPUs, falling back to the number
+// of per-CPU usage entries and finally to 1.
+func calculateCPUPercent(stats container.StatsResponse) float64 {
+	cpuDelta := float64(stats.CPUStats.CPUUsage.TotalUsage) - float64(stats.PreCPUStats.CPUUsage.TotalUsage)
+	systemDelta := float64(stats.CPUStats.SystemUsage) - float64(stats.PreCPUStats.SystemUsage)
+	if systemDelta <= 0 || cpuDelta < 0 {
+		return 0.0
+	}
+
+	onlineCPUs := float64(stats.CPUStats.OnlineCPUs)
+	if onlineCPUs == 0 {
+		// online_cpus can be absent; fall back to the per-CPU usage list.
+		onlineCPUs = float64(len(stats.CPUStats.CPUUsage.PercpuUsage))
+	}
+	if onlineCPUs == 0 {
+		onlineCPUs = 1
+	}
+
+	return (cpuDelta / systemDelta) * onlineCPUs * 100.0
+}
diff --git a/internal/notify/types.go b/internal/notify/types.go
new file mode 100644
index 0000000..d3e3bb2
--- /dev/null
+++ b/internal/notify/types.go
@@ -0,0 +1,9 @@
+package notify
+
+// Event types for notifications.
+const (
+	EventTypeDeploySuccess  = "deploy_success"
+	EventTypeDeployFailure  = "deploy_failure"
+	EventTypeStaleDetected  = "stale_detected"
+	EventTypeProxyUnhealthy = "proxy_unhealthy"
+)
diff --git a/web/src/lib/api.ts b/web/src/lib/api.ts
index 9206d17..7031d38 100644
--- a/web/src/lib/api.ts
+++ b/web/src/lib/api.ts
@@ -1,5 +1,6 @@
 import type {
 	ApiEnvelope,
+	ContainerStats,
 	Deploy,
 	DeployLog,
 	EventLogEntry,
@@ -420,4 +421,20 @@ export function bulkCleanupStaleContainers(): Promise<{ deleted: number }> {
 	return post<{ deleted: number }>('/api/containers/stale/cleanup');
 }
 
+// ── Container Stats ────────────────────────────────────────────────
+
+/**
+ * Fetches CPU and memory stats for the container behind an instance via
+ * GET /api/projects/{projectId}/stages/{stageId}/instances/{instanceId}/stats.
+ */
+export function fetchContainerStats(
+	projectId: string,
+	stageId: string,
+	instanceId: string
+): Promise<ContainerStats> {
+	return get<ContainerStats>(
+		`/api/projects/${projectId}/stages/${stageId}/instances/${instanceId}/stats`
+	);
+}
+
 export { ApiError };
diff --git a/web/src/lib/components/ContainerStats.svelte b/web/src/lib/components/ContainerStats.svelte
new file mode 100644
index 0000000..5339c54
--- /dev/null
+++ b/web/src/lib/components/ContainerStats.svelte
@@ -0,0 +1,104 @@
+
+
+
+{#if stats}
+ +
+ {$t('stats.cpu')} +
+
+
+ + {stats.cpu_percent.toFixed(1)}% + +
+ +
+ {$t('stats.mem')} +
+
+
+ + {formatBytes(stats.memory_usage)} / {formatBytes(stats.memory_limit)} + +
+
+{:else if error} +

{$t('stats.unavailable')}

+{/if} diff --git a/web/src/lib/components/InstanceCard.svelte b/web/src/lib/components/InstanceCard.svelte index d26f956..feb7a9d 100644 --- a/web/src/lib/components/InstanceCard.svelte +++ b/web/src/lib/components/InstanceCard.svelte @@ -4,6 +4,7 @@ + +{#if !loading} + +{/if} diff --git a/web/src/lib/i18n/en.json b/web/src/lib/i18n/en.json index 2560eda..5cfab28 100644 --- a/web/src/lib/i18n/en.json +++ b/web/src/lib/i18n/en.json @@ -504,6 +504,17 @@ }, "metadata": "Details" }, + "stats": { + "cpu": "CPU", + "mem": "MEM", + "unavailable": "Stats unavailable" + }, + "systemHealth": { + "title": "System Health", + "containers": "Containers", + "proxies": "Proxies", + "recentErrors": "Recent Errors" + }, "language": { "en": "English", "ru": "Russian" diff --git a/web/src/lib/i18n/ru.json b/web/src/lib/i18n/ru.json index a1054a4..0747e70 100644 --- a/web/src/lib/i18n/ru.json +++ b/web/src/lib/i18n/ru.json @@ -504,6 +504,17 @@ }, "metadata": "Подробности" }, + "stats": { + "cpu": "ЦП", + "mem": "ОЗУ", + "unavailable": "Статистика недоступна" + }, + "systemHealth": { + "title": "Состояние системы", + "containers": "Контейнеры", + "proxies": "Прокси", + "recentErrors": "Недавние ошибки" + }, "language": { "en": "Английский", "ru": "Русский" diff --git a/web/src/lib/types.ts b/web/src/lib/types.ts index 8e7f5ca..3f9df42 100644 --- a/web/src/lib/types.ts +++ b/web/src/lib/types.ts @@ -234,6 +234,14 @@ export interface ValidationResult { steps: ValidationStep[]; } +/** Container CPU and memory stats from the Docker stats API. */ +export interface ContainerStats { + cpu_percent: number; + memory_usage: number; + memory_limit: number; + memory_percent: number; +} + /** Unified view of standalone + deploy-managed proxies (from /api/proxies/all). 
*/ export interface ProxyView { id: string; diff --git a/web/src/routes/+page.svelte b/web/src/routes/+page.svelte index f7d89ec..5902ac2 100644 --- a/web/src/routes/+page.svelte +++ b/web/src/routes/+page.svelte @@ -4,6 +4,7 @@ import ProjectCard from '$lib/components/ProjectCard.svelte'; import SkeletonCard from '$lib/components/SkeletonCard.svelte'; import EmptyState from '$lib/components/EmptyState.svelte'; + import SystemHealthCard from '$lib/components/SystemHealthCard.svelte'; import { IconDeploy, IconBox, IconServer, IconAlert, IconClock } from '$lib/components/icons'; import { t } from '$lib/i18n'; @@ -124,6 +125,9 @@ + + +

{$t('dashboard.projects')}