Files
tiny-forge/internal/api/stale.go
T
alexei.dolgolyov aefecdffdf feat(observability): phase 2 - stale container detection
Add periodic scanner for stale containers:
- Cron-based scanner (hourly) detects non-running containers exceeding threshold
- last_alive_at tracking on instances, updated on deploy/start/restart
- API: GET /api/containers/stale, POST cleanup (single + bulk)
- Event log warnings emitted for newly stale containers
- Graceful handling of externally removed containers
2026-03-30 11:12:25 +03:00

173 lines
5.1 KiB
Go

package api
import (
	"encoding/json"
	"errors"
	"log/slog"
	"net/http"

	"github.com/alexei/docker-watcher/internal/crypto"
	"github.com/alexei/docker-watcher/internal/events"
	"github.com/alexei/docker-watcher/internal/stale"
	"github.com/alexei/docker-watcher/internal/store"
	"github.com/go-chi/chi/v5"
)
// listStaleContainers serves GET /api/containers/stale.
//
// It answers 503 when the server has no stale scanner, 500 when the scan
// itself fails, and 200 with the scan result otherwise. A nil result is
// swapped for an empty slice before it is handed to respondJSON.
func (s *Server) listStaleContainers(w http.ResponseWriter, r *http.Request) {
	if s.staleScanner == nil {
		respondError(w, http.StatusServiceUnavailable, "stale scanner not initialized")
		return
	}

	found, err := s.staleScanner.FindStaleInstances(r.Context())
	switch {
	case err != nil:
		respondError(w, http.StatusInternalServerError, "failed to find stale containers: "+err.Error())
		return
	case found == nil:
		// Keep the payload a list even when nothing is stale.
		found = []stale.StaleInstance{}
	}

	respondJSON(w, http.StatusOK, found)
}
// cleanupStaleContainer serves POST /api/containers/stale/{id}/cleanup.
//
// The instance named by the {id} URL parameter is looked up in the store and
// handed to cleanupInstance. Responses:
//   - 404 when the store reports store.ErrNotFound,
//   - 500 on any other lookup error or when cleanupInstance fails,
//   - 409 when the instance status is already "removing",
//   - 200 with {"cleaned": <id>} on success.
func (s *Server) cleanupStaleContainer(w http.ResponseWriter, r *http.Request) {
	id := chi.URLParam(r, "id")

	inst, err := s.store.GetInstanceByID(id)
	switch {
	case err == nil:
		// Instance found; carry on below.
	case errors.Is(err, store.ErrNotFound):
		respondNotFound(w, "instance")
		return
	default:
		respondError(w, http.StatusInternalServerError, "failed to get instance: "+err.Error())
		return
	}

	// A cleanup that is already in flight must not be started a second time.
	if inst.Status == "removing" {
		respondError(w, http.StatusConflict, "instance is already being removed")
		return
	}

	err = s.cleanupInstance(r, inst)
	if err != nil {
		respondError(w, http.StatusInternalServerError, "failed to cleanup instance: "+err.Error())
		return
	}

	respondJSON(w, http.StatusOK, map[string]string{"cleaned": id})
}
// bulkCleanupStaleContainers handles POST /api/containers/stale/cleanup.
//
// It asks the stale scanner for the current set of stale instances and runs
// cleanupInstance on each one whose status is not already "removing".
// Individual failures are logged and collected rather than aborting the
// batch, so the response is 200 even when some (or all) cleanups fail; callers
// must inspect the "failed" list.
//
// Responses:
//   - 503 when the server has no stale scanner,
//   - 500 when the scan fails,
//   - 200 with {"cleaned": [...ids], "failed": [...ids]} otherwise.
func (s *Server) bulkCleanupStaleContainers(w http.ResponseWriter, r *http.Request) {
	if s.staleScanner == nil {
		respondError(w, http.StatusServiceUnavailable, "stale scanner not initialized")
		return
	}

	staleInstances, err := s.staleScanner.FindStaleInstances(r.Context())
	if err != nil {
		respondError(w, http.StatusInternalServerError, "failed to find stale containers: "+err.Error())
		return
	}

	// Both lists start out non-nil so that an empty result is serialized as
	// [] instead of null, matching what listStaleContainers does for its
	// own payload.
	cleaned := make([]string, 0, len(staleInstances))
	failed := []string{}

	for _, si := range staleInstances {
		// Skip instances another request is already tearing down.
		if si.Instance.Status == "removing" {
			continue
		}

		if err := s.cleanupInstance(r, si.Instance); err != nil {
			slog.Error("bulk stale cleanup failed",
				"instance_id", si.Instance.ID, "error", err)
			failed = append(failed, si.Instance.ID)
			continue
		}
		cleaned = append(cleaned, si.Instance.ID)
	}

	respondJSON(w, http.StatusOK, map[string]any{
		"cleaned": cleaned,
		"failed":  failed,
	})
}
// cleanupInstance tears down everything attached to inst and then deletes its
// store record.
//
// Steps, in order:
//  1. set the instance status to "removing",
//  2. stop and force-remove the Docker container (when ContainerID is set),
//  3. delete the NPM proxy host (when NpmProxyID > 0),
//  4. delete the instance record,
//  5. emit a stale-cleanup event.
//
// Steps 1-3 are best-effort: failures are logged as warnings and do not stop
// the cleanup. Only a failure to delete the store record (step 4) is returned
// to the caller; in that case no event is emitted.
//
// All Docker and NPM calls use the request's context.
func (s *Server) cleanupInstance(r *http.Request, inst store.Instance) error {
	ctx := r.Context()

	// Mark as removing so concurrent cleanup requests skip this instance.
	// NOTE(review): if DeleteInstance below fails, the record stays in
	// "removing" and both cleanup handlers will skip/reject it afterwards —
	// confirm whether the status should be restored on that path.
	if err := s.store.UpdateInstanceStatus(inst.ID, "removing"); err != nil {
		slog.Warn("stale cleanup: update status to removing", "instance_id", inst.ID, "error", err)
	}

	// Stop and remove the Docker container. Errors are only logged: the
	// container may already be stopped or may have been removed externally.
	if inst.ContainerID != "" {
		// 10 is presumably the stop timeout in seconds — TODO confirm
		// against the docker client wrapper.
		if err := s.docker.StopContainer(ctx, inst.ContainerID, 10); err != nil {
			slog.Warn("stale cleanup: stop container", "container_id", inst.ContainerID, "error", err)
		}
		if err := s.docker.RemoveContainer(ctx, inst.ContainerID, true); err != nil {
			slog.Warn("stale cleanup: remove container", "container_id", inst.ContainerID, "error", err)
		}
	}

	// Delete the NPM proxy host if the instance has one.
	if inst.NpmProxyID > 0 {
		s.deleteStaleProxyHost(r, inst)
	}

	// Delete the instance record; this is the only step whose failure is
	// reported to the caller.
	if err := s.store.DeleteInstance(inst.ID); err != nil {
		return err
	}

	s.emitStaleCleanupEvent(inst)
	return nil
}

// deleteStaleProxyHost removes the NPM proxy host referenced by inst on a
// best-effort basis: it loads the settings, decrypts the NPM password,
// authenticates against NPM and deletes the proxy host. The first step that
// fails is logged as a warning and the remaining steps are skipped, so that a
// proxy host left behind is visible in the logs instead of being silently
// ignored.
func (s *Server) deleteStaleProxyHost(r *http.Request, inst store.Instance) {
	ctx := r.Context()

	settings, err := s.store.GetSettings()
	if err != nil {
		slog.Warn("stale cleanup: load settings for proxy removal", "proxy_id", inst.NpmProxyID, "error", err)
		return
	}

	npmPassword, err := crypto.Decrypt(s.encKey, settings.NpmPassword)
	if err != nil {
		slog.Warn("stale cleanup: decrypt npm password", "proxy_id", inst.NpmProxyID, "error", err)
		return
	}

	if err := s.npm.Authenticate(ctx, settings.NpmEmail, npmPassword); err != nil {
		slog.Warn("stale cleanup: npm authenticate", "proxy_id", inst.NpmProxyID, "error", err)
		return
	}

	if err := s.npm.DeleteProxyHost(ctx, inst.NpmProxyID); err != nil {
		slog.Warn("stale cleanup: delete proxy host", "proxy_id", inst.NpmProxyID, "error", err)
	}
}
// emitStaleCleanupEvent records that inst was cleaned up as a stale container.
//
// The event is first persisted through the store and then published on the
// event bus using the ID, metadata and timestamp of the persisted row. If
// persisting fails the error is logged and nothing is published.
//
// The metadata is a JSON object with the keys instance_id, project_id and
// stage_id. It is produced with encoding/json so that values containing
// quotes, backslashes or control characters are escaped and the stored
// metadata is always valid JSON.
func (s *Server) emitStaleCleanupEvent(inst store.Instance) {
	msg := "Stale container cleaned up: " + inst.ID + " (tag: " + inst.ImageTag + ")"

	// json.Marshal sorts map keys, which yields the same key order the
	// metadata has always had: instance_id, project_id, stage_id.
	metadata := "{}"
	raw, err := json.Marshal(map[string]string{
		"instance_id": inst.ID,
		"project_id":  inst.ProjectID,
		"stage_id":    inst.StageID,
	})
	if err != nil {
		// Not expected for a map of strings; keep the event and fall back
		// to an empty object rather than dropping it.
		slog.Warn("stale cleanup: encode event metadata", "instance_id", inst.ID, "error", err)
	} else {
		metadata = string(raw)
	}

	evt, err := s.store.InsertEvent(store.EventLog{
		Source:   "stale_cleanup",
		Severity: "info",
		Message:  msg,
		Metadata: metadata,
	})
	if err != nil {
		slog.Error("stale cleanup: failed to persist event", "error", err)
		return
	}

	// Publish the values as stored so subscribers see the persisted record.
	s.eventBus.Publish(events.Event{
		Type: events.EventLog,
		Payload: events.EventLogPayload{
			ID:        evt.ID,
			Source:    "stale_cleanup",
			Severity:  "info",
			Message:   msg,
			Metadata:  evt.Metadata,
			CreatedAt: evt.CreatedAt,
		},
	})
}