feat(observability): phase 2 - stale container detection

Add periodic scanner for stale containers:
- Cron-based scanner (hourly) detects non-running containers exceeding threshold
- last_alive_at tracking on instances, updated on deploy/start/restart
- API: GET /api/containers/stale, POST cleanup (single + bulk)
- Event log warnings emitted for newly stale containers
- Graceful handling of externally removed containers
This commit is contained in:
2026-03-30 11:12:25 +03:00
parent c38b7d4c78
commit aefecdffdf
9 changed files with 596 additions and 19 deletions
+16
View File
@@ -11,6 +11,7 @@ import (
"github.com/alexei/docker-watcher/internal/docker"
"github.com/alexei/docker-watcher/internal/events"
"github.com/alexei/docker-watcher/internal/npm"
"github.com/alexei/docker-watcher/internal/stale"
"github.com/alexei/docker-watcher/internal/store"
"github.com/alexei/docker-watcher/internal/webhook"
)
@@ -26,6 +27,7 @@ type Server struct {
encKey [32]byte
localAuth *auth.LocalAuth
oidcProvider *auth.OIDCProvider
staleScanner *stale.Scanner
}
// NewServer creates a new API Server with all required dependencies.
@@ -60,6 +62,12 @@ func NewServer(
return s
}
// SetStaleScanner sets the stale scanner on the server.
// Called after both the API server and scanner are initialized.
func (s *Server) SetStaleScanner(scanner *stale.Scanner) {
s.staleScanner = scanner
}
// initOIDCProvider creates an OIDC provider from settings. Errors are logged, not fatal.
func (s *Server) initOIDCProvider(ctx context.Context, as store.AuthSettings) {
// Decrypt the OIDC client secret if it's encrypted.
@@ -135,6 +143,9 @@ func (s *Server) Router() chi.Router {
r.Get("/settings", s.getSettings)
r.Get("/settings/npm-certificates", s.listNpmCertificates)
// Stale container endpoints.
r.Get("/containers/stale", s.listStaleContainers)
// Admin-only routes: require admin role.
r.Group(func(r chi.Router) {
r.Use(auth.AdminOnly)
@@ -192,6 +203,11 @@ func (s *Server) Router() chi.Router {
r.Post("/test", s.testRegistry)
})
// Stale container cleanup endpoints (admin-only).
// Bulk route must be registered before parameterized route.
r.Post("/containers/stale/cleanup", s.bulkCleanupStaleContainers)
r.Post("/containers/stale/{id}/cleanup", s.cleanupStaleContainer)
// Settings endpoints.
r.Put("/settings", s.updateSettings)
r.Get("/settings/webhook-url", s.getWebhookURL)