feat(observability): phase 8 - container stats, notifications & dashboard
Add container monitoring and notification system:
- Docker Stats API: real-time CPU/memory usage for running containers
- Webhook notifications for errors (deploy failures, stale containers, unhealthy proxies)
- Event log auto-pruning (runs daily, 30-day retention)
- ContainerStats component with auto-polling progress bars
- SystemHealthCard dashboard widget with running/proxy/error counts
- Full EN/RU i18n for stats and system health
This commit is contained in:
@@ -12,6 +12,8 @@ import (
|
||||
"syscall"
|
||||
"time"
|
||||
|
||||
"github.com/robfig/cron/v3"
|
||||
|
||||
dockerwatcher "github.com/alexei/docker-watcher"
|
||||
"github.com/alexei/docker-watcher/internal/api"
|
||||
"github.com/alexei/docker-watcher/internal/auth"
|
||||
@@ -145,6 +147,51 @@ func main() {
|
||||
slog.Warn("failed to start proxy health monitor", "error", err)
|
||||
}
|
||||
|
||||
// Start daily event log pruning cron job.
|
||||
cronScheduler := cron.New()
|
||||
if _, err := cronScheduler.AddFunc("@daily", func() {
|
||||
pruned, err := db.PruneEvents(30)
|
||||
if err != nil {
|
||||
slog.Error("event log prune failed", "error", err)
|
||||
return
|
||||
}
|
||||
if pruned > 0 {
|
||||
slog.Info("pruned old event log entries", "count", pruned)
|
||||
}
|
||||
}); err != nil {
|
||||
slog.Warn("failed to schedule event prune cron", "error", err)
|
||||
}
|
||||
cronScheduler.Start()
|
||||
|
||||
// Subscribe to error events and forward notifications.
|
||||
notifySub := eventBus.Subscribe(func(evt events.Event) bool {
|
||||
if evt.Type != events.EventLog {
|
||||
return false
|
||||
}
|
||||
p, ok := evt.Payload.(events.EventLogPayload)
|
||||
if !ok {
|
||||
return false
|
||||
}
|
||||
return p.Severity == "error"
|
||||
})
|
||||
go func() {
|
||||
for evt := range notifySub {
|
||||
p, ok := evt.Payload.(events.EventLogPayload)
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
currentSettings, err := db.GetSettings()
|
||||
if err != nil || currentSettings.NotificationURL == "" {
|
||||
continue
|
||||
}
|
||||
notifier.Send(currentSettings.NotificationURL, notify.Event{
|
||||
Type: p.Source + "_error",
|
||||
Project: p.Source,
|
||||
Error: p.Message,
|
||||
})
|
||||
}
|
||||
}()
|
||||
|
||||
// Build API server.
|
||||
apiServer := api.NewServer(db, dockerClient, npmClient, dep, webhookHandler, eventBus, encKey)
|
||||
apiServer.SetStaleScanner(staleScanner)
|
||||
@@ -190,6 +237,8 @@ func main() {
|
||||
slog.Info("shutting down...")
|
||||
|
||||
// Stop accepting new work.
|
||||
cronScheduler.Stop()
|
||||
eventBus.Unsubscribe(notifySub)
|
||||
proxyHealth.Stop()
|
||||
staleScanner.Stop()
|
||||
poller.Stop()
|
||||
|
||||
@@ -136,6 +136,7 @@ func (s *Server) Router() chi.Router {
|
||||
r.Get("/", s.getProject)
|
||||
r.Get("/stages/{stage}/env", s.listStageEnv)
|
||||
r.Get("/stages/{stage}/instances", s.listInstances)
|
||||
r.Get("/stages/{stage}/instances/{iid}/stats", s.getInstanceStats)
|
||||
r.Get("/volumes", s.listVolumes)
|
||||
})
|
||||
r.Get("/deploys", s.listDeploys)
|
||||
|
||||
@@ -0,0 +1,39 @@
|
||||
package api
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"net/http"
|
||||
|
||||
"github.com/go-chi/chi/v5"
|
||||
|
||||
"github.com/alexei/docker-watcher/internal/store"
|
||||
)
|
||||
|
||||
// getInstanceStats handles GET /api/projects/{id}/stages/{stage}/instances/{iid}/stats.
|
||||
// Returns CPU and memory stats for the container backing the given instance.
|
||||
func (s *Server) getInstanceStats(w http.ResponseWriter, r *http.Request) {
|
||||
instanceID := chi.URLParam(r, "iid")
|
||||
|
||||
inst, err := s.store.GetInstanceByID(instanceID)
|
||||
if err != nil {
|
||||
if errors.Is(err, store.ErrNotFound) {
|
||||
respondNotFound(w, "instance")
|
||||
return
|
||||
}
|
||||
respondError(w, http.StatusInternalServerError, "failed to get instance: "+err.Error())
|
||||
return
|
||||
}
|
||||
|
||||
if inst.ContainerID == "" {
|
||||
respondError(w, http.StatusBadRequest, "instance has no container")
|
||||
return
|
||||
}
|
||||
|
||||
stats, err := s.docker.GetContainerStats(r.Context(), inst.ContainerID)
|
||||
if err != nil {
|
||||
respondError(w, http.StatusInternalServerError, "failed to get container stats: "+err.Error())
|
||||
return
|
||||
}
|
||||
|
||||
respondJSON(w, http.StatusOK, stats)
|
||||
}
|
||||
@@ -0,0 +1,69 @@
|
||||
package docker
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
|
||||
"github.com/moby/moby/api/types/container"
|
||||
"github.com/moby/moby/client"
|
||||
)
|
||||
|
||||
// ContainerStats is the computed CPU and memory usage snapshot of a single
// container, serialized to JSON with snake_case keys.
type ContainerStats struct {
	// CPUPercent is the CPU usage as a percentage.
	CPUPercent float64 `json:"cpu_percent"`
	// MemoryUsage is the memory currently in use, as reported by the stats API.
	MemoryUsage int64 `json:"memory_usage"`
	// MemoryLimit is the memory limit reported by the stats API.
	MemoryLimit int64 `json:"memory_limit"`
	// MemoryPercent is MemoryUsage relative to MemoryLimit, as a percentage.
	MemoryPercent float64 `json:"memory_percent"`
}
|
||||
|
||||
// GetContainerStats retrieves a one-shot stats snapshot for the given container
|
||||
// and computes CPU and memory percentages.
|
||||
func (c *Client) GetContainerStats(ctx context.Context, containerID string) (ContainerStats, error) {
|
||||
result, err := c.api.ContainerStats(ctx, containerID, client.ContainerStatsOptions{
|
||||
Stream: false,
|
||||
IncludePreviousSample: true,
|
||||
})
|
||||
if err != nil {
|
||||
return ContainerStats{}, fmt.Errorf("get container stats %s: %w", containerID, err)
|
||||
}
|
||||
defer result.Body.Close()
|
||||
|
||||
var stats container.StatsResponse
|
||||
if err := json.NewDecoder(result.Body).Decode(&stats); err != nil {
|
||||
return ContainerStats{}, fmt.Errorf("decode container stats %s: %w", containerID, err)
|
||||
}
|
||||
|
||||
cpuPercent := calculateCPUPercent(stats)
|
||||
memUsage := int64(stats.MemoryStats.Usage)
|
||||
memLimit := int64(stats.MemoryStats.Limit)
|
||||
var memPercent float64
|
||||
if memLimit > 0 {
|
||||
memPercent = float64(memUsage) / float64(memLimit) * 100.0
|
||||
}
|
||||
|
||||
return ContainerStats{
|
||||
CPUPercent: cpuPercent,
|
||||
MemoryUsage: memUsage,
|
||||
MemoryLimit: memLimit,
|
||||
MemoryPercent: memPercent,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// calculateCPUPercent computes CPU usage percentage from a stats response
|
||||
// using the delta between current and previous CPU readings.
|
||||
func calculateCPUPercent(stats container.StatsResponse) float64 {
|
||||
cpuDelta := float64(stats.CPUStats.CPUUsage.TotalUsage) - float64(stats.PreCPUStats.CPUUsage.TotalUsage)
|
||||
systemDelta := float64(stats.CPUStats.SystemUsage) - float64(stats.PreCPUStats.SystemUsage)
|
||||
|
||||
if systemDelta <= 0 || cpuDelta < 0 {
|
||||
return 0.0
|
||||
}
|
||||
|
||||
onlineCPUs := float64(stats.CPUStats.OnlineCPUs)
|
||||
if onlineCPUs == 0 {
|
||||
onlineCPUs = 1
|
||||
}
|
||||
|
||||
return (cpuDelta / systemDelta) * onlineCPUs * 100.0
|
||||
}
|
||||
@@ -0,0 +1,9 @@
|
||||
package notify
|
||||
|
||||
// Notification event type identifiers.
const (
	// EventTypeDeploySuccess is the type string "deploy_success".
	EventTypeDeploySuccess = "deploy_success"
	// EventTypeDeployFailure is the type string "deploy_failure".
	EventTypeDeployFailure = "deploy_failure"
	// EventTypeStaleDetected is the type string "stale_detected".
	EventTypeStaleDetected = "stale_detected"
	// EventTypeProxyUnhealthy is the type string "proxy_unhealthy".
	EventTypeProxyUnhealthy = "proxy_unhealthy"
)
|
||||
@@ -1,5 +1,6 @@
|
||||
import type {
|
||||
ApiEnvelope,
|
||||
ContainerStats,
|
||||
Deploy,
|
||||
DeployLog,
|
||||
EventLogEntry,
|
||||
@@ -420,4 +421,16 @@ export function bulkCleanupStaleContainers(): Promise<{ deleted: number }> {
|
||||
return post<{ deleted: number }>('/api/containers/stale/cleanup');
|
||||
}
|
||||
|
||||
// ── Container Stats ────────────────────────────────────────────────
|
||||
|
||||
/**
 * Fetches the current CPU/memory stats for the container backing an instance.
 *
 * Each identifier is percent-encoded before being placed in the URL so that
 * characters such as `/`, `?` or `#` cannot alter the request path.
 *
 * @param projectId  Project identifier used in the route.
 * @param stageId    Stage identifier used in the route.
 * @param instanceId Instance identifier used in the route.
 * @returns A promise resolving to the container stats payload.
 */
export function fetchContainerStats(
	projectId: string,
	stageId: string,
	instanceId: string
): Promise<ContainerStats> {
	const project = encodeURIComponent(projectId);
	const stage = encodeURIComponent(stageId);
	const instance = encodeURIComponent(instanceId);
	return get<ContainerStats>(
		`/api/projects/${project}/stages/${stage}/instances/${instance}/stats`
	);
}
|
||||
|
||||
export { ApiError };
|
||||
|
||||
@@ -0,0 +1,104 @@
|
||||
<!--
  Compact CPU/memory stats bars for embedding in instance cards.

  Loads the stats once when mounted (and again whenever a prop changes), then
  polls every 10 seconds. Renders nothing until the first response arrives;
  shows an "unavailable" note if no stats have ever been loaded and the
  request failed.
-->
<script lang="ts">
	import type { ContainerStats } from '$lib/types';
	import * as api from '$lib/api';
	import { t } from '$lib/i18n';

	interface Props {
		projectId: string;
		stageId: string;
		instanceId: string;
	}

	const { projectId, stageId, instanceId }: Props = $props();

	// Latest successfully loaded snapshot; null until the first response.
	let stats = $state<ContainerStats | null>(null);
	// True when the most recent request failed.
	let error = $state(false);

	$effect(() => {
		// Set by the cleanup function so that a response arriving after the
		// effect was torn down does not write to state.
		let cancelled = false;

		async function load() {
			try {
				const result = await api.fetchContainerStats(projectId, stageId, instanceId);
				if (!cancelled) {
					stats = result;
					error = false;
				}
			} catch {
				if (!cancelled) {
					error = true;
				}
			}
		}

		load();

		// Poll every 10 seconds.
		const interval = setInterval(load, 10_000);

		return () => {
			cancelled = true;
			clearInterval(interval);
		};
	});

	/** Formats a byte count as B / KB / MB / GB using 1024-based units. */
	function formatBytes(bytes: number): string {
		if (bytes < 1024) return `${bytes} B`;
		const kb = bytes / 1024;
		if (kb < 1024) return `${kb.toFixed(0)} KB`;
		const mb = kb / 1024;
		if (mb < 1024) return `${mb.toFixed(1)} MB`;
		const gb = mb / 1024;
		return `${gb.toFixed(2)} GB`;
	}

	// `$derived.by` yields the computed class name itself (memoized on `stats`),
	// rather than a function that must be re-invoked by the template.
	const cpuColor = $derived.by(() => {
		if (!stats) return 'bg-gray-300';
		if (stats.cpu_percent > 80) return 'bg-red-500';
		if (stats.cpu_percent > 50) return 'bg-amber-500';
		return 'bg-emerald-500';
	});

	const memColor = $derived.by(() => {
		if (!stats) return 'bg-gray-300';
		if (stats.memory_percent > 80) return 'bg-red-500';
		if (stats.memory_percent > 50) return 'bg-amber-500';
		return 'bg-blue-500';
	});
</script>

{#if stats}
	<div class="mt-2 space-y-1">
		<!-- CPU bar -->
		<div class="flex items-center gap-2">
			<span class="w-8 text-[10px] font-medium text-[var(--text-tertiary)]">{$t('stats.cpu')}</span>
			<div class="relative h-1.5 flex-1 overflow-hidden rounded-full bg-[var(--surface-card-hover)]">
				<!-- Width is capped at 100% so values above 100 do not overflow the bar. -->
				<div
					class="absolute inset-y-0 left-0 rounded-full transition-all duration-500 {cpuColor}"
					style="width: {Math.min(stats.cpu_percent, 100)}%"
				></div>
			</div>
			<span class="w-10 text-right text-[10px] tabular-nums text-[var(--text-tertiary)]">
				{stats.cpu_percent.toFixed(1)}%
			</span>
		</div>
		<!-- Memory bar -->
		<div class="flex items-center gap-2">
			<span class="w-8 text-[10px] font-medium text-[var(--text-tertiary)]">{$t('stats.mem')}</span>
			<div class="relative h-1.5 flex-1 overflow-hidden rounded-full bg-[var(--surface-card-hover)]">
				<div
					class="absolute inset-y-0 left-0 rounded-full transition-all duration-500 {memColor}"
					style="width: {Math.min(stats.memory_percent, 100)}%"
				></div>
			</div>
			<span class="w-24 text-right text-[10px] tabular-nums text-[var(--text-tertiary)]">
				{formatBytes(stats.memory_usage)} / {formatBytes(stats.memory_limit)}
			</span>
		</div>
	</div>
{:else if error}
	<p class="mt-2 text-[10px] text-[var(--text-tertiary)]">{$t('stats.unavailable')}</p>
{/if}
|
||||
@@ -4,6 +4,7 @@
|
||||
<script lang="ts">
|
||||
import type { Instance } from '$lib/types';
|
||||
import StatusBadge from './StatusBadge.svelte';
|
||||
import ContainerStats from './ContainerStats.svelte';
|
||||
import ConfirmDialog from './ConfirmDialog.svelte';
|
||||
import { IconPlay, IconStop, IconRestart, IconTrash, IconExternalLink } from '$lib/components/icons';
|
||||
import { t } from '$lib/i18n';
|
||||
@@ -141,6 +142,10 @@
|
||||
</div>
|
||||
</div>
|
||||
|
||||
{#if instance.status === 'running'}
|
||||
<ContainerStats projectId={projectId} stageId={instance.stage_id} instanceId={instance.id} />
|
||||
{/if}
|
||||
|
||||
{#if error}
|
||||
<p class="mt-2 text-xs text-[var(--color-danger)]">{error}</p>
|
||||
{/if}
|
||||
|
||||
@@ -0,0 +1,113 @@
|
||||
<!--
  Dashboard summary card: container counts, proxy health, recent errors.

  Loads all data once on mount. The card is hidden while loading and is shown
  afterwards even if loading failed (counts then remain at zero).
-->
<script lang="ts">
	import type { Instance, ProxyView, EventLogStats } from '$lib/types';
	import * as api from '$lib/api';
	import { IconServer, IconProxies, IconAlert } from '$lib/components/icons';
	import { t } from '$lib/i18n';

	let runningCount = $state(0);
	let stoppedCount = $state(0);
	let healthyProxies = $state(0);
	let unhealthyProxies = $state(0);
	let recentErrors = $state(0);
	let loading = $state(true);

	$effect(() => {
		// Set by the cleanup function so late responses do not write to state.
		let cancelled = false;

		async function load() {
			try {
				// Proxy and event-stat failures degrade to empty defaults; a
				// failure to list projects aborts the whole load.
				const [projects, proxies, eventStats] = await Promise.all([
					api.listProjects(),
					api.listAllProxies().catch(() => [] as ProxyView[]),
					api.fetchEventLogStats().catch(() => ({ info: 0, warn: 0, error: 0, total: 0 }) as EventLogStats)
				]);

				// Gather instances for all projects concurrently instead of one
				// project after another. Within a project the stages are still
				// walked in order, and a failure keeps whatever was collected
				// for that project before the error.
				const perProject = await Promise.all(
					projects.map(async (project) => {
						const collected: Instance[] = [];
						try {
							const detail = await api.getProject(project.id);
							for (const stage of detail.stages ?? []) {
								const instances = await api.listInstances(project.id, stage.id);
								collected.push(...instances);
							}
						} catch {
							// Skip projects that fail to load.
						}
						return collected;
					})
				);
				const allInstances = perProject.flat();

				if (!cancelled) {
					// Anything that is not 'running' is counted as stopped.
					let running = 0;
					for (const instance of allInstances) {
						if (instance.status === 'running') running += 1;
					}
					runningCount = running;
					stoppedCount = allInstances.length - running;

					// Proxies with any other health status are counted in neither bucket.
					let healthy = 0;
					let unhealthy = 0;
					for (const proxy of proxies) {
						if (proxy.health_status === 'healthy') healthy += 1;
						else if (proxy.health_status === 'unhealthy') unhealthy += 1;
					}
					healthyProxies = healthy;
					unhealthyProxies = unhealthy;

					recentErrors = eventStats.error;
					loading = false;
				}
			} catch {
				if (!cancelled) {
					loading = false;
				}
			}
		}

		load();

		return () => {
			cancelled = true;
		};
	});
</script>

{#if !loading}
	<div class="rounded-xl border border-[var(--border-primary)] bg-[var(--surface-card)] p-5 shadow-[var(--shadow-sm)]">
		<h3 class="mb-4 text-sm font-semibold text-[var(--text-primary)]">{$t('systemHealth.title')}</h3>
		<div class="grid grid-cols-1 gap-3 sm:grid-cols-3">
			<!-- Containers: running / not running -->
			<a href="/projects" class="flex items-center gap-3 rounded-lg p-3 transition-colors hover:bg-[var(--surface-card-hover)]">
				<div class="flex h-9 w-9 items-center justify-center rounded-lg bg-emerald-50 text-emerald-600">
					<IconServer size={18} />
				</div>
				<div>
					<p class="text-xs text-[var(--text-secondary)]">{$t('systemHealth.containers')}</p>
					<p class="text-sm font-semibold text-[var(--text-primary)]">
						<span class="text-emerald-600">{runningCount}</span>
						<span class="text-[var(--text-tertiary)]"> / </span>
						<span class="text-[var(--text-tertiary)]">{stoppedCount}</span>
					</p>
				</div>
			</a>

			<!-- Proxies: healthy count, plus unhealthy count when non-zero -->
			<a href="/proxies" class="flex items-center gap-3 rounded-lg p-3 transition-colors hover:bg-[var(--surface-card-hover)]">
				<div class="flex h-9 w-9 items-center justify-center rounded-lg {unhealthyProxies > 0 ? 'bg-red-50 text-red-600' : 'bg-blue-50 text-blue-600'}">
					<IconProxies size={18} />
				</div>
				<div>
					<p class="text-xs text-[var(--text-secondary)]">{$t('systemHealth.proxies')}</p>
					<p class="text-sm font-semibold text-[var(--text-primary)]">
						<span class="text-emerald-600">{healthyProxies}</span>
						{#if unhealthyProxies > 0}
							<span class="text-[var(--text-tertiary)]"> / </span>
							<span class="text-red-600">{unhealthyProxies}</span>
						{/if}
					</p>
				</div>
			</a>

			<!-- Recent errors -->
			<a href="/events" class="flex items-center gap-3 rounded-lg p-3 transition-colors hover:bg-[var(--surface-card-hover)]">
				<div class="flex h-9 w-9 items-center justify-center rounded-lg {recentErrors > 0 ? 'bg-red-50 text-red-600' : 'bg-gray-50 text-gray-400'}">
					<IconAlert size={18} />
				</div>
				<div>
					<p class="text-xs text-[var(--text-secondary)]">{$t('systemHealth.recentErrors')}</p>
					<p class="text-sm font-semibold {recentErrors > 0 ? 'text-red-600' : 'text-[var(--text-primary)]'}">{recentErrors}</p>
				</div>
			</a>
		</div>
	</div>
{/if}
|
||||
@@ -504,6 +504,17 @@
|
||||
},
|
||||
"metadata": "Details"
|
||||
},
|
||||
"stats": {
|
||||
"cpu": "CPU",
|
||||
"mem": "MEM",
|
||||
"unavailable": "Stats unavailable"
|
||||
},
|
||||
"systemHealth": {
|
||||
"title": "System Health",
|
||||
"containers": "Containers",
|
||||
"proxies": "Proxies",
|
||||
"recentErrors": "Recent Errors"
|
||||
},
|
||||
"language": {
|
||||
"en": "English",
|
||||
"ru": "Russian"
|
||||
|
||||
@@ -504,6 +504,17 @@
|
||||
},
|
||||
"metadata": "Подробности"
|
||||
},
|
||||
"stats": {
|
||||
"cpu": "ЦП",
|
||||
"mem": "ОЗУ",
|
||||
"unavailable": "Статистика недоступна"
|
||||
},
|
||||
"systemHealth": {
|
||||
"title": "Состояние системы",
|
||||
"containers": "Контейнеры",
|
||||
"proxies": "Прокси",
|
||||
"recentErrors": "Недавние ошибки"
|
||||
},
|
||||
"language": {
|
||||
"en": "Английский",
|
||||
"ru": "Русский"
|
||||
|
||||
@@ -234,6 +234,14 @@ export interface ValidationResult {
|
||||
steps: ValidationStep[];
|
||||
}
|
||||
|
||||
/**
 * CPU and memory snapshot of a container, as returned by the instance stats
 * endpoint (backed by the Docker stats API).
 */
export interface ContainerStats {
	/** CPU usage as a percentage. */
	cpu_percent: number;
	/** Memory currently in use, in bytes. */
	memory_usage: number;
	/** Memory limit, in bytes. */
	memory_limit: number;
	/** Memory usage relative to the limit, as a percentage. */
	memory_percent: number;
}
|
||||
|
||||
/** Unified view of standalone + deploy-managed proxies (from /api/proxies/all). */
|
||||
export interface ProxyView {
|
||||
id: string;
|
||||
|
||||
@@ -4,6 +4,7 @@
|
||||
import ProjectCard from '$lib/components/ProjectCard.svelte';
|
||||
import SkeletonCard from '$lib/components/SkeletonCard.svelte';
|
||||
import EmptyState from '$lib/components/EmptyState.svelte';
|
||||
import SystemHealthCard from '$lib/components/SystemHealthCard.svelte';
|
||||
import { IconDeploy, IconBox, IconServer, IconAlert, IconClock } from '$lib/components/icons';
|
||||
import { t } from '$lib/i18n';
|
||||
|
||||
@@ -124,6 +125,9 @@
|
||||
</a>
|
||||
</div>
|
||||
|
||||
<!-- System health summary -->
|
||||
<SystemHealthCard />
|
||||
|
||||
<!-- Project cards -->
|
||||
<div>
|
||||
<h2 class="text-lg font-semibold text-[var(--text-primary)]">{$t('dashboard.projects')}</h2>
|
||||
|
||||
Reference in New Issue
Block a user