feat(observability): phase 2 - stale container detection

Add periodic scanner for stale containers:
- Cron-based scanner (hourly) detects non-running containers exceeding threshold
- last_alive_at tracking on instances, updated on deploy/start/restart
- API: GET /api/containers/stale, POST cleanup (single + bulk)
- Event log warnings emitted for newly stale containers
- Graceful handling of externally removed containers
This commit is contained in:
2026-03-30 11:12:25 +03:00
parent c38b7d4c78
commit aefecdffdf
9 changed files with 596 additions and 19 deletions
+7
View File
@@ -196,6 +196,13 @@ func (s *Server) controlInstance(w http.ResponseWriter, r *http.Request, action
slog.Error("update instance status", "instance_id", instanceID, "status", newStatus, "error", err)
}
// Track last_alive_at when container becomes running.
if newStatus == "running" {
if err := s.store.UpdateLastAliveAt(instanceID); err != nil {
slog.Error("update last_alive_at", "instance_id", instanceID, "error", err)
}
}
respondJSON(w, http.StatusOK, map[string]string{
"instance_id": instanceID,
"action": action,
+16
View File
@@ -11,6 +11,7 @@ import (
"github.com/alexei/docker-watcher/internal/docker"
"github.com/alexei/docker-watcher/internal/events"
"github.com/alexei/docker-watcher/internal/npm"
"github.com/alexei/docker-watcher/internal/stale"
"github.com/alexei/docker-watcher/internal/store"
"github.com/alexei/docker-watcher/internal/webhook"
)
@@ -26,6 +27,7 @@ type Server struct {
encKey [32]byte
localAuth *auth.LocalAuth
oidcProvider *auth.OIDCProvider
staleScanner *stale.Scanner
}
// NewServer creates a new API Server with all required dependencies.
@@ -60,6 +62,12 @@ func NewServer(
return s
}
// SetStaleScanner sets the stale scanner on the server.
// Called after both the API server and scanner are initialized.
func (s *Server) SetStaleScanner(scanner *stale.Scanner) {
s.staleScanner = scanner
}
// initOIDCProvider creates an OIDC provider from settings. Errors are logged, not fatal.
func (s *Server) initOIDCProvider(ctx context.Context, as store.AuthSettings) {
// Decrypt the OIDC client secret if it's encrypted.
@@ -135,6 +143,9 @@ func (s *Server) Router() chi.Router {
r.Get("/settings", s.getSettings)
r.Get("/settings/npm-certificates", s.listNpmCertificates)
// Stale container endpoints.
r.Get("/containers/stale", s.listStaleContainers)
// Admin-only routes: require admin role.
r.Group(func(r chi.Router) {
r.Use(auth.AdminOnly)
@@ -192,6 +203,11 @@ func (s *Server) Router() chi.Router {
r.Post("/test", s.testRegistry)
})
// Stale container cleanup endpoints (admin-only).
// Bulk route must be registered before parameterized route.
r.Post("/containers/stale/cleanup", s.bulkCleanupStaleContainers)
r.Post("/containers/stale/{id}/cleanup", s.cleanupStaleContainer)
// Settings endpoints.
r.Put("/settings", s.updateSettings)
r.Get("/settings/webhook-url", s.getWebhookURL)
+172
View File
@@ -0,0 +1,172 @@
package api
import (
"errors"
"log/slog"
"net/http"
"github.com/go-chi/chi/v5"
"github.com/alexei/docker-watcher/internal/crypto"
"github.com/alexei/docker-watcher/internal/events"
"github.com/alexei/docker-watcher/internal/stale"
"github.com/alexei/docker-watcher/internal/store"
)
// listStaleContainers handles GET /api/containers/stale.
func (s *Server) listStaleContainers(w http.ResponseWriter, r *http.Request) {
if s.staleScanner == nil {
respondError(w, http.StatusServiceUnavailable, "stale scanner not initialized")
return
}
staleInstances, err := s.staleScanner.FindStaleInstances(r.Context())
if err != nil {
respondError(w, http.StatusInternalServerError, "failed to find stale containers: "+err.Error())
return
}
if staleInstances == nil {
staleInstances = []stale.StaleInstance{}
}
respondJSON(w, http.StatusOK, staleInstances)
}
// cleanupStaleContainer handles POST /api/containers/stale/{id}/cleanup.
// Stops the Docker container, removes the NPM proxy, and deletes the instance from the store.
func (s *Server) cleanupStaleContainer(w http.ResponseWriter, r *http.Request) {
instanceID := chi.URLParam(r, "id")
inst, err := s.store.GetInstanceByID(instanceID)
if err != nil {
if errors.Is(err, store.ErrNotFound) {
respondNotFound(w, "instance")
return
}
respondError(w, http.StatusInternalServerError, "failed to get instance: "+err.Error())
return
}
// Don't remove instances already being cleaned up.
if inst.Status == "removing" {
respondError(w, http.StatusConflict, "instance is already being removed")
return
}
if err := s.cleanupInstance(r, inst); err != nil {
respondError(w, http.StatusInternalServerError, "failed to cleanup instance: "+err.Error())
return
}
respondJSON(w, http.StatusOK, map[string]string{"cleaned": instanceID})
}
// bulkCleanupStaleContainers handles POST /api/containers/stale/cleanup.
// Cleans up all currently stale containers.
func (s *Server) bulkCleanupStaleContainers(w http.ResponseWriter, r *http.Request) {
if s.staleScanner == nil {
respondError(w, http.StatusServiceUnavailable, "stale scanner not initialized")
return
}
staleInstances, err := s.staleScanner.FindStaleInstances(r.Context())
if err != nil {
respondError(w, http.StatusInternalServerError, "failed to find stale containers: "+err.Error())
return
}
var cleaned []string
var failed []string
for _, si := range staleInstances {
if si.Instance.Status == "removing" {
continue
}
if err := s.cleanupInstance(r, si.Instance); err != nil {
slog.Error("bulk stale cleanup failed",
"instance_id", si.Instance.ID, "error", err)
failed = append(failed, si.Instance.ID)
continue
}
cleaned = append(cleaned, si.Instance.ID)
}
respondJSON(w, http.StatusOK, map[string]any{
"cleaned": cleaned,
"failed": failed,
})
}
// cleanupInstance stops a Docker container, removes the NPM proxy, deletes
// the store record, and emits an event.
func (s *Server) cleanupInstance(r *http.Request, inst store.Instance) error {
ctx := r.Context()
// Mark as removing.
if err := s.store.UpdateInstanceStatus(inst.ID, "removing"); err != nil {
slog.Warn("stale cleanup: update status to removing", "instance_id", inst.ID, "error", err)
}
// Stop and remove Docker container.
if inst.ContainerID != "" {
if err := s.docker.StopContainer(ctx, inst.ContainerID, 10); err != nil {
slog.Warn("stale cleanup: stop container", "container_id", inst.ContainerID, "error", err)
}
if err := s.docker.RemoveContainer(ctx, inst.ContainerID, true); err != nil {
slog.Warn("stale cleanup: remove container", "container_id", inst.ContainerID, "error", err)
}
}
// Delete NPM proxy host if present.
if inst.NpmProxyID > 0 {
settings, err := s.store.GetSettings()
if err == nil {
npmPassword, err := crypto.Decrypt(s.encKey, settings.NpmPassword)
if err == nil {
if authErr := s.npm.Authenticate(ctx, settings.NpmEmail, npmPassword); authErr == nil {
if delErr := s.npm.DeleteProxyHost(ctx, inst.NpmProxyID); delErr != nil {
slog.Warn("stale cleanup: delete proxy host", "proxy_id", inst.NpmProxyID, "error", delErr)
}
}
}
}
}
// Delete instance record.
if err := s.store.DeleteInstance(inst.ID); err != nil {
return err
}
// Emit cleanup event.
s.emitStaleCleanupEvent(inst)
return nil
}
// emitStaleCleanupEvent publishes an event when a stale container is cleaned up.
func (s *Server) emitStaleCleanupEvent(inst store.Instance) {
msg := "Stale container cleaned up: " + inst.ID + " (tag: " + inst.ImageTag + ")"
evt, err := s.store.InsertEvent(store.EventLog{
Source: "stale_cleanup",
Severity: "info",
Message: msg,
Metadata: `{"instance_id":"` + inst.ID + `","project_id":"` + inst.ProjectID + `","stage_id":"` + inst.StageID + `"}`,
})
if err != nil {
slog.Error("stale cleanup: failed to persist event", "error", err)
return
}
s.eventBus.Publish(events.Event{
Type: events.EventLog,
Payload: events.EventLogPayload{
ID: evt.ID,
Source: "stale_cleanup",
Severity: "info",
Message: msg,
Metadata: evt.Metadata,
CreatedAt: evt.CreatedAt,
},
})
}