feat(observability): phase 2 - stale container detection

Add periodic scanner for stale containers:
- Cron-based scanner (hourly) detects containers that have been non-running for longer than the staleness threshold
- last_alive_at tracking on instances, updated on deploy/start/restart
- API: GET /api/containers/stale to list stale containers, plus POST cleanup endpoints (single container and bulk)
- Event log warnings emitted for newly stale containers
- Graceful handling of externally removed containers
This commit is contained in:
2026-03-30 11:12:25 +03:00
parent c38b7d4c78
commit aefecdffdf
9 changed files with 596 additions and 19 deletions
+9
View File
@@ -25,6 +25,7 @@ import (
"github.com/alexei/docker-watcher/internal/notify"
"github.com/alexei/docker-watcher/internal/npm"
"github.com/alexei/docker-watcher/internal/registry"
"github.com/alexei/docker-watcher/internal/stale"
"github.com/alexei/docker-watcher/internal/store"
"github.com/alexei/docker-watcher/internal/webhook"
)
@@ -130,8 +131,15 @@ func main() {
}
}
// Initialize stale container scanner.
staleScanner := stale.New(db, dockerClient, eventBus)
if err := staleScanner.Start("1h"); err != nil {
slog.Warn("failed to start stale scanner", "error", err)
}
// Build API server.
apiServer := api.NewServer(db, dockerClient, npmClient, dep, webhookHandler, eventBus, encKey)
apiServer.SetStaleScanner(staleScanner)
router := apiServer.Router()
// Serve embedded static files for the SPA frontend.
@@ -173,6 +181,7 @@ func main() {
slog.Info("shutting down...")
// Stop accepting new work.
staleScanner.Stop()
poller.Stop()
// Drain in-progress deploys and notifications.