feat(observability): phase 2 - stale container detection
Add periodic scanner for stale containers:
- Cron-based scanner (hourly) detects non-running containers exceeding the threshold
- last_alive_at tracking on instances, updated on deploy/start/restart
- API: GET /api/containers/stale, POST cleanup (single + bulk)
- Event log warnings emitted for newly stale containers
- Graceful handling of externally removed containers
This commit is contained in:
@@ -196,6 +196,13 @@ func (s *Server) controlInstance(w http.ResponseWriter, r *http.Request, action
|
||||
slog.Error("update instance status", "instance_id", instanceID, "status", newStatus, "error", err)
|
||||
}
|
||||
|
||||
// Track last_alive_at when container becomes running.
|
||||
if newStatus == "running" {
|
||||
if err := s.store.UpdateLastAliveAt(instanceID); err != nil {
|
||||
slog.Error("update last_alive_at", "instance_id", instanceID, "error", err)
|
||||
}
|
||||
}
|
||||
|
||||
respondJSON(w, http.StatusOK, map[string]string{
|
||||
"instance_id": instanceID,
|
||||
"action": action,
|
||||
|
||||
@@ -11,6 +11,7 @@ import (
|
||||
"github.com/alexei/docker-watcher/internal/docker"
|
||||
"github.com/alexei/docker-watcher/internal/events"
|
||||
"github.com/alexei/docker-watcher/internal/npm"
|
||||
"github.com/alexei/docker-watcher/internal/stale"
|
||||
"github.com/alexei/docker-watcher/internal/store"
|
||||
"github.com/alexei/docker-watcher/internal/webhook"
|
||||
)
|
||||
@@ -26,6 +27,7 @@ type Server struct {
|
||||
encKey [32]byte
|
||||
localAuth *auth.LocalAuth
|
||||
oidcProvider *auth.OIDCProvider
|
||||
staleScanner *stale.Scanner
|
||||
}
|
||||
|
||||
// NewServer creates a new API Server with all required dependencies.
|
||||
@@ -60,6 +62,12 @@ func NewServer(
|
||||
return s
|
||||
}
|
||||
|
||||
// SetStaleScanner sets the stale scanner on the server.
|
||||
// Called after both the API server and scanner are initialized.
|
||||
func (s *Server) SetStaleScanner(scanner *stale.Scanner) {
|
||||
s.staleScanner = scanner
|
||||
}
|
||||
|
||||
// initOIDCProvider creates an OIDC provider from settings. Errors are logged, not fatal.
|
||||
func (s *Server) initOIDCProvider(ctx context.Context, as store.AuthSettings) {
|
||||
// Decrypt the OIDC client secret if it's encrypted.
|
||||
@@ -135,6 +143,9 @@ func (s *Server) Router() chi.Router {
|
||||
r.Get("/settings", s.getSettings)
|
||||
r.Get("/settings/npm-certificates", s.listNpmCertificates)
|
||||
|
||||
// Stale container endpoints.
|
||||
r.Get("/containers/stale", s.listStaleContainers)
|
||||
|
||||
// Admin-only routes: require admin role.
|
||||
r.Group(func(r chi.Router) {
|
||||
r.Use(auth.AdminOnly)
|
||||
@@ -192,6 +203,11 @@ func (s *Server) Router() chi.Router {
|
||||
r.Post("/test", s.testRegistry)
|
||||
})
|
||||
|
||||
// Stale container cleanup endpoints (admin-only).
|
||||
// Bulk route must be registered before parameterized route.
|
||||
r.Post("/containers/stale/cleanup", s.bulkCleanupStaleContainers)
|
||||
r.Post("/containers/stale/{id}/cleanup", s.cleanupStaleContainer)
|
||||
|
||||
// Settings endpoints.
|
||||
r.Put("/settings", s.updateSettings)
|
||||
r.Get("/settings/webhook-url", s.getWebhookURL)
|
||||
|
||||
@@ -0,0 +1,172 @@
|
||||
package api
|
||||
|
||||
import (
	"encoding/json"
	"errors"
	"log/slog"
	"net/http"

	"github.com/go-chi/chi/v5"

	"github.com/alexei/docker-watcher/internal/crypto"
	"github.com/alexei/docker-watcher/internal/events"
	"github.com/alexei/docker-watcher/internal/stale"
	"github.com/alexei/docker-watcher/internal/store"
)
|
||||
|
||||
// listStaleContainers handles GET /api/containers/stale.
|
||||
func (s *Server) listStaleContainers(w http.ResponseWriter, r *http.Request) {
|
||||
if s.staleScanner == nil {
|
||||
respondError(w, http.StatusServiceUnavailable, "stale scanner not initialized")
|
||||
return
|
||||
}
|
||||
|
||||
staleInstances, err := s.staleScanner.FindStaleInstances(r.Context())
|
||||
if err != nil {
|
||||
respondError(w, http.StatusInternalServerError, "failed to find stale containers: "+err.Error())
|
||||
return
|
||||
}
|
||||
|
||||
if staleInstances == nil {
|
||||
staleInstances = []stale.StaleInstance{}
|
||||
}
|
||||
respondJSON(w, http.StatusOK, staleInstances)
|
||||
}
|
||||
|
||||
// cleanupStaleContainer handles POST /api/containers/stale/{id}/cleanup.
|
||||
// Stops the Docker container, removes the NPM proxy, and deletes the instance from the store.
|
||||
func (s *Server) cleanupStaleContainer(w http.ResponseWriter, r *http.Request) {
|
||||
instanceID := chi.URLParam(r, "id")
|
||||
|
||||
inst, err := s.store.GetInstanceByID(instanceID)
|
||||
if err != nil {
|
||||
if errors.Is(err, store.ErrNotFound) {
|
||||
respondNotFound(w, "instance")
|
||||
return
|
||||
}
|
||||
respondError(w, http.StatusInternalServerError, "failed to get instance: "+err.Error())
|
||||
return
|
||||
}
|
||||
|
||||
// Don't remove instances already being cleaned up.
|
||||
if inst.Status == "removing" {
|
||||
respondError(w, http.StatusConflict, "instance is already being removed")
|
||||
return
|
||||
}
|
||||
|
||||
if err := s.cleanupInstance(r, inst); err != nil {
|
||||
respondError(w, http.StatusInternalServerError, "failed to cleanup instance: "+err.Error())
|
||||
return
|
||||
}
|
||||
|
||||
respondJSON(w, http.StatusOK, map[string]string{"cleaned": instanceID})
|
||||
}
|
||||
|
||||
// bulkCleanupStaleContainers handles POST /api/containers/stale/cleanup.
|
||||
// Cleans up all currently stale containers.
|
||||
func (s *Server) bulkCleanupStaleContainers(w http.ResponseWriter, r *http.Request) {
|
||||
if s.staleScanner == nil {
|
||||
respondError(w, http.StatusServiceUnavailable, "stale scanner not initialized")
|
||||
return
|
||||
}
|
||||
|
||||
staleInstances, err := s.staleScanner.FindStaleInstances(r.Context())
|
||||
if err != nil {
|
||||
respondError(w, http.StatusInternalServerError, "failed to find stale containers: "+err.Error())
|
||||
return
|
||||
}
|
||||
|
||||
var cleaned []string
|
||||
var failed []string
|
||||
|
||||
for _, si := range staleInstances {
|
||||
if si.Instance.Status == "removing" {
|
||||
continue
|
||||
}
|
||||
if err := s.cleanupInstance(r, si.Instance); err != nil {
|
||||
slog.Error("bulk stale cleanup failed",
|
||||
"instance_id", si.Instance.ID, "error", err)
|
||||
failed = append(failed, si.Instance.ID)
|
||||
continue
|
||||
}
|
||||
cleaned = append(cleaned, si.Instance.ID)
|
||||
}
|
||||
|
||||
respondJSON(w, http.StatusOK, map[string]any{
|
||||
"cleaned": cleaned,
|
||||
"failed": failed,
|
||||
})
|
||||
}
|
||||
|
||||
// cleanupInstance stops a Docker container, removes the NPM proxy, deletes
|
||||
// the store record, and emits an event.
|
||||
func (s *Server) cleanupInstance(r *http.Request, inst store.Instance) error {
|
||||
ctx := r.Context()
|
||||
|
||||
// Mark as removing.
|
||||
if err := s.store.UpdateInstanceStatus(inst.ID, "removing"); err != nil {
|
||||
slog.Warn("stale cleanup: update status to removing", "instance_id", inst.ID, "error", err)
|
||||
}
|
||||
|
||||
// Stop and remove Docker container.
|
||||
if inst.ContainerID != "" {
|
||||
if err := s.docker.StopContainer(ctx, inst.ContainerID, 10); err != nil {
|
||||
slog.Warn("stale cleanup: stop container", "container_id", inst.ContainerID, "error", err)
|
||||
}
|
||||
if err := s.docker.RemoveContainer(ctx, inst.ContainerID, true); err != nil {
|
||||
slog.Warn("stale cleanup: remove container", "container_id", inst.ContainerID, "error", err)
|
||||
}
|
||||
}
|
||||
|
||||
// Delete NPM proxy host if present.
|
||||
if inst.NpmProxyID > 0 {
|
||||
settings, err := s.store.GetSettings()
|
||||
if err == nil {
|
||||
npmPassword, err := crypto.Decrypt(s.encKey, settings.NpmPassword)
|
||||
if err == nil {
|
||||
if authErr := s.npm.Authenticate(ctx, settings.NpmEmail, npmPassword); authErr == nil {
|
||||
if delErr := s.npm.DeleteProxyHost(ctx, inst.NpmProxyID); delErr != nil {
|
||||
slog.Warn("stale cleanup: delete proxy host", "proxy_id", inst.NpmProxyID, "error", delErr)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Delete instance record.
|
||||
if err := s.store.DeleteInstance(inst.ID); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// Emit cleanup event.
|
||||
s.emitStaleCleanupEvent(inst)
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// emitStaleCleanupEvent publishes an event when a stale container is cleaned up.
|
||||
func (s *Server) emitStaleCleanupEvent(inst store.Instance) {
|
||||
msg := "Stale container cleaned up: " + inst.ID + " (tag: " + inst.ImageTag + ")"
|
||||
|
||||
evt, err := s.store.InsertEvent(store.EventLog{
|
||||
Source: "stale_cleanup",
|
||||
Severity: "info",
|
||||
Message: msg,
|
||||
Metadata: `{"instance_id":"` + inst.ID + `","project_id":"` + inst.ProjectID + `","stage_id":"` + inst.StageID + `"}`,
|
||||
})
|
||||
if err != nil {
|
||||
slog.Error("stale cleanup: failed to persist event", "error", err)
|
||||
return
|
||||
}
|
||||
|
||||
s.eventBus.Publish(events.Event{
|
||||
Type: events.EventLog,
|
||||
Payload: events.EventLogPayload{
|
||||
ID: evt.ID,
|
||||
Source: "stale_cleanup",
|
||||
Severity: "info",
|
||||
Message: msg,
|
||||
Metadata: evt.Metadata,
|
||||
CreatedAt: evt.CreatedAt,
|
||||
},
|
||||
})
|
||||
}
|
||||
Reference in New Issue
Block a user