package staticsite import ( "context" "fmt" "log/slog" "sync" "time" "github.com/alexei/tinyforge/internal/docker" "github.com/alexei/tinyforge/internal/store" "github.com/robfig/cron/v3" ) // HealthChecker periodically checks that deployed static site containers // are still running. If a container has crashed, it updates the site status // to "failed" and optionally triggers a redeploy. type HealthChecker struct { store *store.Store docker *docker.Client manager *Manager cron *cron.Cron mu sync.Mutex entryID cron.EntryID running bool } // NewHealthChecker creates a new static site health checker. func NewHealthChecker(st *store.Store, dockerClient *docker.Client, mgr *Manager) *HealthChecker { return &HealthChecker{ store: st, docker: dockerClient, manager: mgr, cron: cron.New(), } } // Start begins the periodic health check with the given interval (e.g., "5m", "1m"). func (h *HealthChecker) Start(interval string) error { h.mu.Lock() defer h.mu.Unlock() duration, err := time.ParseDuration(interval) if err != nil { return fmt.Errorf("parse interval %q: %w", interval, err) } if h.running { h.cron.Remove(h.entryID) } spec := fmt.Sprintf("@every %s", duration) id, err := h.cron.AddFunc(spec, h.check) if err != nil { return fmt.Errorf("schedule health check: %w", err) } h.entryID = id h.running = true h.cron.Start() slog.Info("static site health checker started", "interval", interval) return nil } // Stop stops the periodic health checker. func (h *HealthChecker) Stop() { h.mu.Lock() defer h.mu.Unlock() if h.running { h.cron.Stop() h.running = false slog.Info("static site health checker stopped") } } // check runs a single health check pass over all deployed static sites. func (h *HealthChecker) check() { sites, err := h.store.GetAllStaticSites() if err != nil { slog.Error("static site health check: failed to list sites", "error", err) return } ctx := context.Background() for _, site := range sites { if site.Status != "deployed" || site.ContainerID == "" { continue } running, err := h.docker.IsContainerRunning(ctx, site.ContainerID) if err != nil { // Container might have been removed externally. slog.Warn("static site health check: container inspect failed", "site", site.Name, "container", site.ContainerID[:12], "error", err) h.manager.updateStatus(site.ID, "failed", site.LastCommitSHA, "container not found") h.manager.publishEvent(site.ID, site.Name, "failed: container not found") continue } if !running { slog.Warn("static site health check: container not running", "site", site.Name, "container", site.ContainerID[:12]) h.manager.updateStatus(site.ID, "failed", site.LastCommitSHA, "container stopped unexpectedly") h.manager.publishEvent(site.ID, site.Name, "failed: container stopped unexpectedly") } } }