feat(observability): phase 2 - stale container detection

Add periodic scanner for stale containers:
- Cron-based scanner (hourly) detects non-running containers that exceed the staleness threshold
- last_alive_at tracking on instances, updated on deploy/start/restart
- API: GET /api/containers/stale, plus POST cleanup endpoints (single and bulk)
- Event log warnings emitted for newly stale containers
- Graceful handling of externally removed containers
This commit is contained in:
2026-03-30 11:12:25 +03:00
parent c38b7d4c78
commit aefecdffdf
9 changed files with 596 additions and 19 deletions
+7
View File
@@ -196,6 +196,13 @@ func (s *Server) controlInstance(w http.ResponseWriter, r *http.Request, action
slog.Error("update instance status", "instance_id", instanceID, "status", newStatus, "error", err)
}
// Track last_alive_at when container becomes running.
if newStatus == "running" {
if err := s.store.UpdateLastAliveAt(instanceID); err != nil {
slog.Error("update last_alive_at", "instance_id", instanceID, "error", err)
}
}
respondJSON(w, http.StatusOK, map[string]string{
"instance_id": instanceID,
"action": action,