// tiny-forge/internal/api/health.go

package api

import (
	"context"
	"net/http"
	"time"

	"github.com/alexei/tinyforge/internal/auth"
	"github.com/alexei/tinyforge/internal/proxy"
)

// healthProbeTimeout is the deadline applied to one GET /api/health request.
// getHealth derives a single context with this timeout and hands it to every
// probe it runs (database ping, Docker ping + Info, proxy ping and the NPM
// list calls), so a stuck dependency cannot hold the endpoint open for
// longer. The UI polls every 30 s, which leaves plenty of headroom at 8 s.
const healthProbeTimeout time.Duration = 8 * time.Second

// nonAdminDockerFields is the allowlist of Docker health keys that any
// authenticated user may see. It is applied with filterFields to the map
// produced by dockerHealth.
//
// Allowed: connectivity (connected, latency_ms, error), daemon version
// (version, api_version), object counts (containers, running, paused,
// stopped, images) and coarse capacity (ncpu, memory_total).
//
// Everything else dockerHealth may emit — os, arch, kernel,
// operating_system, storage_driver, root_dir, name — is host detail and is
// therefore admin-only, to avoid handing out reconnaissance information.
var nonAdminDockerFields = map[string]bool{
	"connected":    true,
	"latency_ms":   true,
	"error":        true,
	"version":      true,
	"api_version":  true,
	"containers":   true,
	"running":      true,
	"paused":       true,
	"stopped":      true,
	"images":       true,
	"ncpu":         true,
	"memory_total": true,
}

// nonAdminProxyFields is the allowlist of proxy health keys exposed to
// non-admin users: which provider is active, whether it answered, how long
// the ping took, any error, and the number of routes Tinyforge manages.
// Keys outside this set — the configured provider URL and the counts of
// proxy hosts, access lists and certificates — are dropped for non-admins.
var nonAdminProxyFields = map[string]bool{
	"connected":           true,
	"error":               true,
	"latency_ms":          true,
	"provider":            true,
	"proxy_hosts_managed": true,
}

// getHealth handles GET /api/health.
//
// It responds 200 with a JSON object containing:
//   - "checked_at": the current UTC time in RFC 3339 format,
//   - "database":   connectivity of the store's database,
//   - "docker":     connectivity and (when reachable) diagnostics of the
//     Docker daemon,
//   - "proxy":      the same for the active proxy provider; omitted when no
//     provider is configured.
//
// All probes share one context bounded by healthProbeTimeout.
//
// For callers whose role is not "admin", the docker and proxy sections are
// reduced to the nonAdminDockerFields / nonAdminProxyFields allowlists, and
// any probe error text is replaced by a generic message. The raw text comes
// from err.Error() of the underlying client and can embed exactly the
// details (socket paths, internal proxy URLs) the allowlists are meant to
// hide; the database section already reports a fixed message for everyone.
func (s *Server) getHealth(w http.ResponseWriter, r *http.Request) {
	ctx, cancel := context.WithTimeout(r.Context(), healthProbeTimeout)
	defer cancel()

	// NOTE(review): the second return value is discarded. A zero-value Role
	// compares unequal to "admin", so a missing claim falls back to the
	// restricted view — assuming claims is not a nil pointer in that case;
	// confirm against auth.ClaimsFromContext.
	claims, _ := auth.ClaimsFromContext(r.Context())
	isAdmin := claims.Role == "admin"

	result := map[string]any{
		"checked_at": time.Now().UTC().Format(time.RFC3339),
	}

	// ── Database ─────────────────────────────────────────────────────
	// The driver error is never echoed back, regardless of role.
	if err := s.store.DB().PingContext(ctx); err != nil {
		result["database"] = map[string]any{"connected": false, "error": "database unreachable"}
	} else {
		result["database"] = map[string]any{"connected": true}
	}

	// ── Docker daemon ────────────────────────────────────────────────
	docker := s.dockerHealth(ctx)
	if !isAdmin {
		// filterFields returns a fresh map, so rewriting "error" here does
		// not touch the map produced by dockerHealth.
		docker = filterFields(docker, nonAdminDockerFields)
		if _, failed := docker["error"]; failed {
			docker["error"] = "docker daemon unavailable"
		}
	}
	result["docker"] = docker

	// ── Proxy provider ───────────────────────────────────────────────
	if s.proxyProvider != nil {
		proxyInfo := s.proxyHealth(ctx)
		if !isAdmin {
			proxyInfo = filterFields(proxyInfo, nonAdminProxyFields)
			if _, failed := proxyInfo["error"]; failed {
				proxyInfo["error"] = "proxy provider unreachable"
			}
		}
		result["proxy"] = proxyInfo
	}

	respondJSON(w, http.StatusOK, result)
}

// filterFields builds a new map holding only those entries of m whose key is
// marked true in allow. m itself is never modified; a nil or empty m yields
// an empty, non-nil map.
func filterFields(m map[string]any, allow map[string]bool) map[string]any {
	filtered := make(map[string]any, len(allow))
	// Walk the allowlist and pick out the matching entries of m.
	for key, permitted := range allow {
		if !permitted {
			continue
		}
		if val, present := m[key]; present {
			filtered[key] = val
		}
	}
	return filtered
}

// dockerHealth reports the state of the Docker daemon as a JSON-ready map.
//
// The result always carries "connected". When the client is missing or the
// ping fails it also carries "error" (and, for a failed ping, "latency_ms").
// After a successful ping the map holds "connected": true and "latency_ms",
// and is then enriched from Info(): string fields and ncpu/memory_total are
// added only when non-empty / positive, while the container and image
// counts are always added. A failing Info() call is not an error for the
// caller — the map is returned with just the ping result.
func (s *Server) dockerHealth(ctx context.Context) map[string]any {
	if s.docker == nil {
		return map[string]any{
			"connected": false,
			"error":     "docker client not initialized",
		}
	}

	began := time.Now()
	pingErr := s.docker.Ping(ctx)
	if pingErr != nil {
		return map[string]any{
			"connected":  false,
			"error":      pingErr.Error(),
			"latency_ms": time.Since(began).Milliseconds(),
		}
	}

	report := map[string]any{
		"connected":  true,
		"latency_ms": time.Since(began).Milliseconds(),
	}

	// Info only adds detail; if it fails we still report a healthy ping.
	info, infoErr := s.docker.Info(ctx)
	if infoErr != nil {
		return report
	}

	// Object counts are reported unconditionally, zero included.
	report["containers"] = info.Containers
	report["running"] = info.Running
	report["paused"] = info.Paused
	report["stopped"] = info.Stopped
	report["images"] = info.Images

	// Version details.
	if info.Version != "" {
		report["version"] = info.Version
	}
	if info.APIVersion != "" {
		report["api_version"] = info.APIVersion
	}

	// Host details, each omitted when the daemon did not report it.
	if info.Name != "" {
		report["name"] = info.Name
	}
	if info.OS != "" {
		report["os"] = info.OS
	}
	if info.OperatingSystem != "" {
		report["operating_system"] = info.OperatingSystem
	}
	if info.Arch != "" {
		report["arch"] = info.Arch
	}
	if info.Kernel != "" {
		report["kernel"] = info.Kernel
	}
	if info.StorageDriver != "" {
		report["storage_driver"] = info.StorageDriver
	}
	if info.RootDir != "" {
		report["root_dir"] = info.RootDir
	}

	// Capacity figures are only meaningful when positive.
	if info.NCPU > 0 {
		report["ncpu"] = info.NCPU
	}
	if info.MemoryTotal > 0 {
		report["memory_total"] = info.MemoryTotal
	}

	return report
}

// proxyHealth reports the state of the active proxy provider as a
// JSON-ready map. The caller must ensure s.proxyProvider is non-nil.
//
// "provider", "connected" and "latency_ms" are always present. On a failed
// ping, "error" is added and nothing else is collected. On success the map
// may additionally carry:
//   - "url": the configured endpoint from settings (npm or traefik),
//   - "proxy_hosts", "access_lists", "certificates": NPM-only aggregate
//     counts, each added only if its list call succeeds,
//   - "proxy_hosts_managed": the number of routes managed by Tinyforge.
//
// Every enrichment step is best-effort: a failure simply leaves its key out.
func (s *Server) proxyHealth(ctx context.Context) map[string]any {
	name := s.proxyProvider.Name()

	began := time.Now()
	pingErr := s.proxyProvider.Ping(ctx)
	elapsed := time.Since(began).Milliseconds()

	report := map[string]any{
		"provider":   name,
		"latency_ms": elapsed,
	}
	if pingErr != nil {
		report["connected"] = false
		report["error"] = name + " unreachable: " + pingErr.Error()
		return report
	}
	report["connected"] = true

	// Configured endpoint, when settings can be read and the URL is set.
	if settings, settingsErr := s.store.GetSettings(); settingsErr == nil {
		if name == "npm" {
			if settings.NpmURL != "" {
				report["url"] = settings.NpmURL
			}
		} else if name == "traefik" {
			if settings.TraefikAPIURL != "" {
				report["url"] = settings.TraefikAPIURL
			}
		}
	}

	// NPM aggregates. The list endpoints need an authenticated session, so
	// the provider's Authenticate step runs first; the lists are skipped
	// entirely if it fails or the provider is not the NPM implementation.
	if name == "npm" && s.npm != nil {
		npmProvider, isNpm := s.proxyProvider.(*proxy.NpmProvider)
		if isNpm && npmProvider.Authenticate(ctx) == nil {
			if hosts, listErr := s.npm.ListProxyHosts(ctx); listErr == nil {
				report["proxy_hosts"] = len(hosts)
			}
			if lists, listErr := s.npm.ListAccessLists(ctx); listErr == nil {
				report["access_lists"] = len(lists)
			}
			if certs, listErr := s.npm.ListCertificates(ctx); listErr == nil {
				report["certificates"] = len(certs)
			}
		}
	}

	// Managed-route count comes from our own store rather than the external
	// proxy API, so it is attempted for every provider.
	if managed, countErr := s.managedRouteCount(); countErr == nil {
		report["proxy_hosts_managed"] = managed
	}

	return report
}

// managedRouteCount returns how many proxy routes the store lists via
// ListProxyRoutes, or 0 and the store's error if the listing fails.
//
// The domain argument is passed as an empty string.
// NOTE(review): this relies on the domain only influencing how routes are
// rendered, not how many are returned — confirm against ListProxyRoutes.
func (s *Server) managedRouteCount() (int, error) {
	managed, listErr := s.store.ListProxyRoutes("")
	if listErr != nil {
		return 0, listErr
	}
	return len(managed), nil
}