feat(observability): phase 8 - container stats, notifications & dashboard

Add container monitoring and notification system:
- Docker Stats API: real-time CPU/memory for running containers
- Webhook notifications for errors (deploy failures, stale, proxy unhealthy)
- Event log auto-pruning (daily, 30-day retention)
- ContainerStats component with auto-polling progress bars
- SystemHealthCard dashboard widget with running/proxy/error counts
- Full EN/RU i18n for stats and system health
This commit is contained in:
2026-03-30 11:37:25 +03:00
parent 79a40f3d9c
commit 7c57c740b4
13 changed files with 436 additions and 0 deletions
+49
View File
@@ -12,6 +12,8 @@ import (
"syscall"
"time"
"github.com/robfig/cron/v3"
dockerwatcher "github.com/alexei/docker-watcher"
"github.com/alexei/docker-watcher/internal/api"
"github.com/alexei/docker-watcher/internal/auth"
@@ -145,6 +147,51 @@ func main() {
slog.Warn("failed to start proxy health monitor", "error", err)
}
// Start daily event log pruning cron job.
cronScheduler := cron.New()
if _, err := cronScheduler.AddFunc("@daily", func() {
pruned, err := db.PruneEvents(30)
if err != nil {
slog.Error("event log prune failed", "error", err)
return
}
if pruned > 0 {
slog.Info("pruned old event log entries", "count", pruned)
}
}); err != nil {
slog.Warn("failed to schedule event prune cron", "error", err)
}
cronScheduler.Start()
// Subscribe to error events and forward notifications.
// The filter admits only EventLog events whose payload is an
// EventLogPayload with severity "error".
notifySub := eventBus.Subscribe(func(evt events.Event) bool {
	if evt.Type != events.EventLog {
		return false
	}
	p, ok := evt.Payload.(events.EventLogPayload)
	return ok && p.Severity == "error"
})
go func() {
	// The loop runs until notifySub is closed (presumably by
	// eventBus.Unsubscribe during shutdown — confirm).
	for evt := range notifySub {
		p, ok := evt.Payload.(events.EventLogPayload)
		if !ok {
			continue
		}
		// Settings are re-read for every event, so the URL used is whatever
		// is stored at the time the event arrives.
		currentSettings, err := db.GetSettings()
		if err != nil {
			// Previously this failure was dropped silently, which made
			// missing notifications impossible to diagnose.
			slog.Warn("failed to load settings for notification", "error", err)
			continue
		}
		if currentSettings.NotificationURL == "" {
			// No notification URL configured: nothing to send.
			continue
		}
		notifier.Send(currentSettings.NotificationURL, notify.Event{
			Type:    p.Source + "_error",
			Project: p.Source,
			Error:   p.Message,
		})
	}
}()
// Build API server.
apiServer := api.NewServer(db, dockerClient, npmClient, dep, webhookHandler, eventBus, encKey)
apiServer.SetStaleScanner(staleScanner)
@@ -190,6 +237,8 @@ func main() {
slog.Info("shutting down...")
// Stop accepting new work.
cronScheduler.Stop()
eventBus.Unsubscribe(notifySub)
proxyHealth.Stop()
staleScanner.Stop()
poller.Stop()
+1
View File
@@ -136,6 +136,7 @@ func (s *Server) Router() chi.Router {
r.Get("/", s.getProject)
r.Get("/stages/{stage}/env", s.listStageEnv)
r.Get("/stages/{stage}/instances", s.listInstances)
r.Get("/stages/{stage}/instances/{iid}/stats", s.getInstanceStats)
r.Get("/volumes", s.listVolumes)
})
r.Get("/deploys", s.listDeploys)
+39
View File
@@ -0,0 +1,39 @@
package api
import (
"errors"
"net/http"
"github.com/go-chi/chi/v5"
"github.com/alexei/docker-watcher/internal/store"
)
// getInstanceStats handles GET /api/projects/{id}/stages/{stage}/instances/{iid}/stats.
//
// It looks the instance up by the {iid} route parameter, asks Docker for a
// stats snapshot of the instance's container and writes it as JSON.
// Responses: 404 when the instance does not exist, 400 when the instance has
// no container ID, 500 for store or Docker failures, 200 with the stats
// otherwise.
//
// NOTE(review): the {id} and {stage} route parameters are not read here; the
// instance is resolved from {iid} alone.
func (s *Server) getInstanceStats(w http.ResponseWriter, r *http.Request) {
	inst, lookupErr := s.store.GetInstanceByID(chi.URLParam(r, "iid"))
	switch {
	case errors.Is(lookupErr, store.ErrNotFound):
		respondNotFound(w, "instance")
		return
	case lookupErr != nil:
		respondError(w, http.StatusInternalServerError, "failed to get instance: "+lookupErr.Error())
		return
	case inst.ContainerID == "":
		// Only reached when the lookup succeeded.
		respondError(w, http.StatusBadRequest, "instance has no container")
		return
	}

	snapshot, statsErr := s.docker.GetContainerStats(r.Context(), inst.ContainerID)
	if statsErr != nil {
		respondError(w, http.StatusInternalServerError, "failed to get container stats: "+statsErr.Error())
		return
	}
	respondJSON(w, http.StatusOK, snapshot)
}
+69
View File
@@ -0,0 +1,69 @@
package docker
import (
"context"
"encoding/json"
"fmt"
"github.com/moby/moby/api/types/container"
"github.com/moby/moby/client"
)
// ContainerStats holds computed CPU and memory usage for a container.
// It is the JSON body returned by the instance stats endpoint.
type ContainerStats struct {
	// CPUPercent is the CPU usage in percent as computed by
	// calculateCPUPercent. The value is scaled by the CPU count, so it can
	// exceed 100.
	CPUPercent float64 `json:"cpu_percent"`
	// MemoryUsage is the container's memory usage, taken from the Docker
	// memory stats (the UI formats it as a byte count).
	MemoryUsage int64 `json:"memory_usage"`
	// MemoryLimit is the container's memory limit, taken from the Docker
	// memory stats (the UI formats it as a byte count).
	MemoryLimit int64 `json:"memory_limit"`
	// MemoryPercent is MemoryUsage / MemoryLimit * 100, or 0 when the limit
	// is not positive.
	MemoryPercent float64 `json:"memory_percent"`
}
// GetContainerStats retrieves a one-shot stats snapshot for the given container
// and computes CPU and memory percentages.
//
// The request is non-streaming and asks for the previous sample so that the
// CPU delta in calculateCPUPercent has a baseline. Memory usage is reported
// without reclaimable page cache (see memoryUsedExcludingCache), matching what
// the Docker CLI shows; MemoryPercent is derived from that value.
//
// It returns a wrapped error when the Docker call or the JSON decoding fails;
// the returned ContainerStats is the zero value in that case.
func (c *Client) GetContainerStats(ctx context.Context, containerID string) (ContainerStats, error) {
	result, err := c.api.ContainerStats(ctx, containerID, client.ContainerStatsOptions{
		Stream:                false,
		IncludePreviousSample: true,
	})
	if err != nil {
		return ContainerStats{}, fmt.Errorf("get container stats %s: %w", containerID, err)
	}
	defer result.Body.Close()

	var stats container.StatsResponse
	if err := json.NewDecoder(result.Body).Decode(&stats); err != nil {
		return ContainerStats{}, fmt.Errorf("decode container stats %s: %w", containerID, err)
	}

	memUsage := int64(memoryUsedExcludingCache(stats.MemoryStats))
	memLimit := int64(stats.MemoryStats.Limit)

	// Guard against a missing limit to avoid dividing by zero.
	var memPercent float64
	if memLimit > 0 {
		memPercent = float64(memUsage) / float64(memLimit) * 100.0
	}

	return ContainerStats{
		CPUPercent:    calculateCPUPercent(stats),
		MemoryUsage:   memUsage,
		MemoryLimit:   memLimit,
		MemoryPercent: memPercent,
	}, nil
}

// memoryUsedExcludingCache returns the memory usage minus inactive file cache.
// The raw usage counter includes page cache the kernel can reclaim, which
// makes containers doing file I/O look far heavier than they are. cgroup v1
// reports the cache as "total_inactive_file", cgroup v2 as "inactive_file".
// If neither key is present (or the value is not smaller than the usage) the
// raw usage is returned unchanged.
func memoryUsedExcludingCache(mem container.MemoryStats) uint64 {
	for _, key := range []string{"total_inactive_file", "inactive_file"} {
		if cache, ok := mem.Stats[key]; ok && cache < mem.Usage {
			return mem.Usage - cache
		}
	}
	return mem.Usage
}
// calculateCPUPercent computes CPU usage percentage from a stats response
// using the delta between current and previous CPU readings:
//
//	(container CPU delta / system CPU delta) * number of CPUs * 100
//
// The result can exceed 100 when more than one CPU is in use. It returns 0
// when the system delta is not positive or the container delta is negative
// (for example when no previous sample is available).
//
// The CPU count is taken from OnlineCPUs; when that is not reported it falls
// back to the number of per-CPU usage entries, and finally to 1.
func calculateCPUPercent(stats container.StatsResponse) float64 {
	cpuDelta := float64(stats.CPUStats.CPUUsage.TotalUsage) - float64(stats.PreCPUStats.CPUUsage.TotalUsage)
	systemDelta := float64(stats.CPUStats.SystemUsage) - float64(stats.PreCPUStats.SystemUsage)

	if systemDelta <= 0 || cpuDelta < 0 {
		return 0.0
	}

	onlineCPUs := float64(stats.CPUStats.OnlineCPUs)
	if onlineCPUs == 0 {
		// Daemons that do not report online_cpus still list per-CPU usage.
		onlineCPUs = float64(len(stats.CPUStats.CPUUsage.PercpuUsage))
	}
	if onlineCPUs == 0 {
		onlineCPUs = 1
	}

	return (cpuDelta / systemDelta) * onlineCPUs * 100.0
}
+9
View File
@@ -0,0 +1,9 @@
package notify
// Event types for notifications.
//
// NOTE(review): the error forwarder in main builds its notification type as
// Source + "_error" rather than using these constants — confirm which set of
// type strings webhook consumers are meant to receive.
const (
	// EventTypeDeploySuccess is the type string for a deploy that succeeded.
	EventTypeDeploySuccess = "deploy_success"
	// EventTypeDeployFailure is the type string for a deploy that failed.
	EventTypeDeployFailure = "deploy_failure"
	// EventTypeStaleDetected is the type string for a detected stale
	// container (presumably raised by the stale scanner — confirm).
	EventTypeStaleDetected = "stale_detected"
	// EventTypeProxyUnhealthy is the type string for a proxy reported as
	// unhealthy (presumably raised by the proxy health monitor — confirm).
	EventTypeProxyUnhealthy = "proxy_unhealthy"
)
+13
View File
@@ -1,5 +1,6 @@
import type {
ApiEnvelope,
ContainerStats,
Deploy,
DeployLog,
EventLogEntry,
@@ -420,4 +421,16 @@ export function bulkCleanupStaleContainers(): Promise<{ deleted: number }> {
return post<{ deleted: number }>('/api/containers/stale/cleanup');
}
// ── Container Stats ────────────────────────────────────────────────
/**
 * Fetches the CPU/memory stats snapshot for the container backing an instance.
 *
 * Issues GET /api/projects/{projectId}/stages/{stageId}/instances/{instanceId}/stats.
 * The ids are interpolated into the path as-is.
 */
export function fetchContainerStats(
	projectId: string,
	stageId: string,
	instanceId: string
): Promise<ContainerStats> {
	const statsPath = `/api/projects/${projectId}/stages/${stageId}/instances/${instanceId}/stats`;
	return get<ContainerStats>(statsPath);
}
export { ApiError };
@@ -0,0 +1,104 @@
<!--
  Compact CPU/memory stats bars for embedding in instance cards.

  Loads the stats once on mount and then polls every 10 seconds. When a poll
  fails after an earlier success, the last snapshot stays on screen; the
  "unavailable" message is only shown if no snapshot was ever loaded.
-->
<script lang="ts">
	import type { ContainerStats } from '$lib/types';
	import * as api from '$lib/api';
	import { t } from '$lib/i18n';

	interface Props {
		projectId: string;
		stageId: string;
		instanceId: string;
	}

	const { projectId, stageId, instanceId }: Props = $props();

	let stats = $state<ContainerStats | null>(null);
	let error = $state(false);

	$effect(() => {
		// Set by the cleanup function so a late response cannot write state
		// after the component is destroyed or the effect has re-run.
		let cancelled = false;
		// Prevents a slow request from stacking up with the next interval tick.
		let inFlight = false;

		async function load() {
			if (inFlight) return;
			inFlight = true;
			try {
				const result = await api.fetchContainerStats(projectId, stageId, instanceId);
				if (!cancelled) {
					stats = result;
					error = false;
				}
			} catch {
				if (!cancelled) {
					error = true;
				}
			} finally {
				inFlight = false;
			}
		}

		load();

		// Poll every 10 seconds.
		const interval = setInterval(load, 10_000);

		return () => {
			cancelled = true;
			clearInterval(interval);
		};
	});

	/** Formats a byte count using 1024-based units (B, KB, MB, GB). */
	function formatBytes(bytes: number): string {
		if (bytes < 1024) return `${bytes} B`;
		const kb = bytes / 1024;
		if (kb < 1024) return `${kb.toFixed(0)} KB`;
		const mb = kb / 1024;
		if (mb < 1024) return `${mb.toFixed(1)} MB`;
		const gb = mb / 1024;
		return `${gb.toFixed(2)} GB`;
	}

	// Bar colors: red above 80%, amber above 50%, otherwise the resting color.
	// $derived.by yields the computed value itself rather than a function.
	const cpuColor = $derived.by(() => {
		if (!stats) return 'bg-gray-300';
		if (stats.cpu_percent > 80) return 'bg-red-500';
		if (stats.cpu_percent > 50) return 'bg-amber-500';
		return 'bg-emerald-500';
	});

	const memColor = $derived.by(() => {
		if (!stats) return 'bg-gray-300';
		if (stats.memory_percent > 80) return 'bg-red-500';
		if (stats.memory_percent > 50) return 'bg-amber-500';
		return 'bg-blue-500';
	});
</script>

{#if stats}
	<div class="mt-2 space-y-1">
		<!-- CPU bar -->
		<div class="flex items-center gap-2">
			<span class="w-8 text-[10px] font-medium text-[var(--text-tertiary)]">{$t('stats.cpu')}</span>
			<div class="relative h-1.5 flex-1 overflow-hidden rounded-full bg-[var(--surface-card-hover)]">
				<!-- Width is clamped: cpu_percent can exceed 100 on multi-CPU hosts. -->
				<div
					class="absolute inset-y-0 left-0 rounded-full transition-all duration-500 {cpuColor}"
					style="width: {Math.min(stats.cpu_percent, 100)}%"
				></div>
			</div>
			<span class="w-10 text-right text-[10px] tabular-nums text-[var(--text-tertiary)]">
				{stats.cpu_percent.toFixed(1)}%
			</span>
		</div>
		<!-- Memory bar -->
		<div class="flex items-center gap-2">
			<span class="w-8 text-[10px] font-medium text-[var(--text-tertiary)]">{$t('stats.mem')}</span>
			<div class="relative h-1.5 flex-1 overflow-hidden rounded-full bg-[var(--surface-card-hover)]">
				<div
					class="absolute inset-y-0 left-0 rounded-full transition-all duration-500 {memColor}"
					style="width: {Math.min(stats.memory_percent, 100)}%"
				></div>
			</div>
			<span class="w-24 text-right text-[10px] tabular-nums text-[var(--text-tertiary)]">
				{formatBytes(stats.memory_usage)} / {formatBytes(stats.memory_limit)}
			</span>
		</div>
	</div>
{:else if error}
	<p class="mt-2 text-[10px] text-[var(--text-tertiary)]">{$t('stats.unavailable')}</p>
{/if}
@@ -4,6 +4,7 @@
<script lang="ts">
import type { Instance } from '$lib/types';
import StatusBadge from './StatusBadge.svelte';
import ContainerStats from './ContainerStats.svelte';
import ConfirmDialog from './ConfirmDialog.svelte';
import { IconPlay, IconStop, IconRestart, IconTrash, IconExternalLink } from '$lib/components/icons';
import { t } from '$lib/i18n';
@@ -141,6 +142,10 @@
</div>
</div>
{#if instance.status === 'running'}
<ContainerStats projectId={projectId} stageId={instance.stage_id} instanceId={instance.id} />
{/if}
{#if error}
<p class="mt-2 text-xs text-[var(--color-danger)]">{error}</p>
{/if}
@@ -0,0 +1,113 @@
<!--
  Dashboard summary card: container counts, proxy health, recent errors.

  Data is loaded once on mount. Proxy and event-stat failures degrade to
  empty/zero values; a project or stage that fails to load is skipped. If the
  project list itself cannot be loaded the card is not rendered, so that
  all-zero counts are never presented as real data.
-->
<script lang="ts">
	import type { Instance, ProxyView, EventLogStats } from '$lib/types';
	import * as api from '$lib/api';
	import { IconServer, IconProxies, IconAlert } from '$lib/components/icons';
	import { t } from '$lib/i18n';

	let runningCount = $state(0);
	let stoppedCount = $state(0);
	let healthyProxies = $state(0);
	let unhealthyProxies = $state(0);
	let recentErrors = $state(0);
	let loading = $state(true);
	let failed = $state(false);

	/**
	 * Loads all instances of one project. Stages are fetched in parallel;
	 * a stage (or the whole project) that fails to load contributes nothing.
	 */
	async function loadProjectInstances(projectId: string): Promise<Instance[]> {
		try {
			const detail = await api.getProject(projectId);
			const perStage = await Promise.all(
				(detail.stages ?? []).map((stage) =>
					api.listInstances(projectId, stage.id).catch(() => [] as Instance[])
				)
			);
			return perStage.flat();
		} catch {
			// Skip projects that fail to load.
			return [];
		}
	}

	$effect(() => {
		// Set by the cleanup function so a late response cannot write state
		// after the component is destroyed.
		let cancelled = false;

		async function load() {
			try {
				const [projects, proxies, eventStats] = await Promise.all([
					api.listProjects(),
					api.listAllProxies().catch(() => [] as ProxyView[]),
					api.fetchEventLogStats().catch(() => ({ info: 0, warn: 0, error: 0, total: 0 }) as EventLogStats)
				]);

				// Gather all instances across projects/stages, in parallel
				// rather than one request at a time.
				const perProject = await Promise.all(
					projects.map((project) => loadProjectInstances(project.id))
				);
				const allInstances: Instance[] = perProject.flat();

				if (!cancelled) {
					runningCount = allInstances.filter((i) => i.status === 'running').length;
					// Everything that is not running counts as stopped.
					stoppedCount = allInstances.filter((i) => i.status !== 'running').length;
					healthyProxies = proxies.filter((p) => p.health_status === 'healthy').length;
					unhealthyProxies = proxies.filter((p) => p.health_status === 'unhealthy').length;
					recentErrors = eventStats.error;
					loading = false;
				}
			} catch {
				if (!cancelled) {
					failed = true;
					loading = false;
				}
			}
		}

		load();

		return () => {
			cancelled = true;
		};
	});
</script>

{#if !loading && !failed}
	<div class="rounded-xl border border-[var(--border-primary)] bg-[var(--surface-card)] p-5 shadow-[var(--shadow-sm)]">
		<h3 class="mb-4 text-sm font-semibold text-[var(--text-primary)]">{$t('systemHealth.title')}</h3>
		<div class="grid grid-cols-1 gap-3 sm:grid-cols-3">
			<!-- Containers -->
			<a href="/projects" class="flex items-center gap-3 rounded-lg p-3 transition-colors hover:bg-[var(--surface-card-hover)]">
				<div class="flex h-9 w-9 items-center justify-center rounded-lg bg-emerald-50 text-emerald-600">
					<IconServer size={18} />
				</div>
				<div>
					<p class="text-xs text-[var(--text-secondary)]">{$t('systemHealth.containers')}</p>
					<p class="text-sm font-semibold text-[var(--text-primary)]">
						<span class="text-emerald-600">{runningCount}</span>
						<span class="text-[var(--text-tertiary)]"> / </span>
						<span class="text-[var(--text-tertiary)]">{stoppedCount}</span>
					</p>
				</div>
			</a>
			<!-- Proxies -->
			<a href="/proxies" class="flex items-center gap-3 rounded-lg p-3 transition-colors hover:bg-[var(--surface-card-hover)]">
				<div class="flex h-9 w-9 items-center justify-center rounded-lg {unhealthyProxies > 0 ? 'bg-red-50 text-red-600' : 'bg-blue-50 text-blue-600'}">
					<IconProxies size={18} />
				</div>
				<div>
					<p class="text-xs text-[var(--text-secondary)]">{$t('systemHealth.proxies')}</p>
					<p class="text-sm font-semibold text-[var(--text-primary)]">
						<span class="text-emerald-600">{healthyProxies}</span>
						{#if unhealthyProxies > 0}
							<span class="text-[var(--text-tertiary)]"> / </span>
							<span class="text-red-600">{unhealthyProxies}</span>
						{/if}
					</p>
				</div>
			</a>
			<!-- Recent errors -->
			<a href="/events" class="flex items-center gap-3 rounded-lg p-3 transition-colors hover:bg-[var(--surface-card-hover)]">
				<div class="flex h-9 w-9 items-center justify-center rounded-lg {recentErrors > 0 ? 'bg-red-50 text-red-600' : 'bg-gray-50 text-gray-400'}">
					<IconAlert size={18} />
				</div>
				<div>
					<p class="text-xs text-[var(--text-secondary)]">{$t('systemHealth.recentErrors')}</p>
					<p class="text-sm font-semibold {recentErrors > 0 ? 'text-red-600' : 'text-[var(--text-primary)]'}">{recentErrors}</p>
				</div>
			</a>
		</div>
	</div>
{/if}
+11
View File
@@ -504,6 +504,17 @@
},
"metadata": "Details"
},
"stats": {
"cpu": "CPU",
"mem": "MEM",
"unavailable": "Stats unavailable"
},
"systemHealth": {
"title": "System Health",
"containers": "Containers",
"proxies": "Proxies",
"recentErrors": "Recent Errors"
},
"language": {
"en": "English",
"ru": "Russian"
+11
View File
@@ -504,6 +504,17 @@
},
"metadata": "Подробности"
},
"stats": {
"cpu": "ЦП",
"mem": "ОЗУ",
"unavailable": "Статистика недоступна"
},
"systemHealth": {
"title": "Состояние системы",
"containers": "Контейнеры",
"proxies": "Прокси",
"recentErrors": "Недавние ошибки"
},
"language": {
"en": "Английский",
"ru": "Русский"
+8
View File
@@ -234,6 +234,14 @@ export interface ValidationResult {
steps: ValidationStep[];
}
/**
 * Container CPU and memory stats from the Docker stats API.
 * Mirrors the JSON returned by the instance stats endpoint.
 */
export interface ContainerStats {
	/** CPU usage in percent; scaled by CPU count on the backend, so it can exceed 100. */
	cpu_percent: number;
	/** Memory used by the container (rendered as a byte count in the UI). */
	memory_usage: number;
	/** Memory limit of the container (rendered as a byte count in the UI). */
	memory_limit: number;
	/** memory_usage as a percentage of memory_limit; 0 when the limit is not positive. */
	memory_percent: number;
}
/** Unified view of standalone + deploy-managed proxies (from /api/proxies/all). */
export interface ProxyView {
id: string;
+4
View File
@@ -4,6 +4,7 @@
import ProjectCard from '$lib/components/ProjectCard.svelte';
import SkeletonCard from '$lib/components/SkeletonCard.svelte';
import EmptyState from '$lib/components/EmptyState.svelte';
import SystemHealthCard from '$lib/components/SystemHealthCard.svelte';
import { IconDeploy, IconBox, IconServer, IconAlert, IconClock } from '$lib/components/icons';
import { t } from '$lib/i18n';
@@ -124,6 +125,9 @@
</a>
</div>
<!-- System health summary -->
<SystemHealthCard />
<!-- Project cards -->
<div>
<h2 class="text-lg font-semibold text-[var(--text-primary)]">{$t('dashboard.projects')}</h2>