d8ab22876f
Build / build (push) Successful in 10m41s
End-to-end extraction of the Instance concept. After this commit:
* internal/store/instances.go — DELETED
* internal/store/models.go — Instance struct gone, ProxyRoute moved here
* containers table is the single source of truth for project/stack/site
container state. instances table is dropped via DROP TABLE migration
(idempotent; re-runnable on every boot).
* Legacy tinyforge.project / tinyforge.stage / tinyforge.instance-id
Docker labels are no longer emitted; only tinyforge.workload.{id,kind},
tinyforge.role, and tinyforge.managed are stamped on new containers.
Backend rewrites:
- internal/deployer: executeDeploy + blueGreenDeploy + rollback +
promote use store.Container natively. New
removeContainer() replaces removeInstance().
enforceMaxInstances reads via
ListContainersByStageID.
- internal/reconciler: legacy tinyforge.instance-id dispatch removed;
upsertByWorkloadLabel now finds existing rows
by docker container ID first and falls back to
the deterministic workloadID:role key.
- internal/stale/scanner: Scan + new FindStaleContainers walk the
containers table; emit StaleContainer JSON.
- internal/stats/collector: ListContainers replaces ListAllInstances.
- internal/webhook/handler: workload-secret lookup tried first; falls back
to project / static_site secret column.
- internal/api: instances.go, stale.go, stats.go, stats_history.go,
projects.go, settings.go, docker.go, dns.go all read /
write through Container.
Docker layer:
- ManagedContainer exposes WorkloadID/Kind/Role from the canonical labels.
- ListContainers filters by tinyforge.managed=true.
- Network creation uses LabelManaged instead of LabelProject.
Frontend:
- Instance type is now a Container alias; .status → .state,
.last_alive_at → .last_seen_at.
- InstanceCard takes stageId as a prop (no longer derived from Instance).
- StaleContainer JSON shape rewritten: { container, workload_name, role,
days_stale }. StaleContainerCard + /containers/stale page updated.
- ProjectCard / homepage / SystemHealthCard filter by .state.
The migration loop now tolerates "no such table" alongside "duplicate
column" / "already exists" so obsolete ALTER TABLE entries targeting the
dropped instances table no-op cleanly on first boot.
Tests: store + deployer + reconciler + webhook + staticsite + notify all
still pass. Frontend svelte-check: zero errors.
335 lines
9.2 KiB
Go
335 lines
9.2 KiB
Go
// Package stats implements a background goroutine that periodically samples
|
|
// Docker container and host-level resource usage and persists the samples
|
|
// into SQLite. It reads its interval and retention from settings on every
|
|
// tick so configuration changes take effect without a restart.
|
|
package stats
|
|
|
|
import (
|
|
"context"
|
|
"log/slog"
|
|
"sync"
|
|
"time"
|
|
|
|
"github.com/alexei/tinyforge/internal/docker"
|
|
"github.com/alexei/tinyforge/internal/store"
|
|
)
|
|
|
|
// Sampling defaults and bounds. readConfig returns the Default* values when
// settings cannot be read, and clamps a positive interval into the range
// [MinIntervalSeconds, MaxIntervalSeconds].
const (
	DefaultIntervalSeconds = 15
	DefaultRetentionHours  = 2
	MinIntervalSeconds     = 5
	MaxIntervalSeconds     = 300
)

// maxParallelSamples is the hard cap on concurrent container stat requests,
// so a host with many containers does not overwhelm the Docker daemon.
const maxParallelSamples = 8
|
|
|
|
// OwnerTypeInstance is the ContainerStatsSample.OwnerType value used for
// every sampled container that does not belong to a site workload.
const OwnerTypeInstance = "instance"

// OwnerTypeSite is the ContainerStatsSample.OwnerType value used for
// containers whose workload kind is a site.
const OwnerTypeSite = "site"
|
|
|
|
// Collector runs the background sampling loop.
|
|
type Collector struct {
|
|
store *store.Store
|
|
docker *docker.Client
|
|
|
|
startOnce sync.Once
|
|
stopOnce sync.Once
|
|
started bool
|
|
stop chan struct{}
|
|
done chan struct{}
|
|
}
|
|
|
|
// New creates a new stats collector. Call Start to begin sampling.
|
|
func New(s *store.Store, d *docker.Client) *Collector {
|
|
return &Collector{
|
|
store: s,
|
|
docker: d,
|
|
stop: make(chan struct{}),
|
|
done: make(chan struct{}),
|
|
}
|
|
}
|
|
|
|
// Start launches the background loop. Returns immediately. The loop exits
|
|
// when Stop is called. Safe to call multiple times — only the first call has
|
|
// an effect.
|
|
func (c *Collector) Start() {
|
|
c.startOnce.Do(func() {
|
|
c.started = true
|
|
go c.run()
|
|
})
|
|
}
|
|
|
|
// Stop signals the collector to exit and blocks until it has finished the
|
|
// in-flight tick. If Start was never called, Stop returns immediately.
|
|
func (c *Collector) Stop() {
|
|
c.stopOnce.Do(func() {
|
|
close(c.stop)
|
|
if !c.started {
|
|
close(c.done)
|
|
}
|
|
})
|
|
<-c.done
|
|
}
|
|
|
|
// run is the main loop. It reads the interval from settings on every tick,
|
|
// which lets configuration changes propagate within one tick without a
|
|
// dedicated reload mechanism.
|
|
func (c *Collector) run() {
|
|
defer close(c.done)
|
|
|
|
// Derive a base context that's cancelled when Stop is called so in-flight
|
|
// Docker requests abort instead of waiting out their timeout.
|
|
baseCtx, cancel := context.WithCancel(context.Background())
|
|
defer cancel()
|
|
go func() {
|
|
<-c.stop
|
|
cancel()
|
|
}()
|
|
|
|
// Wait a few seconds before the first sample so the app has settled.
|
|
select {
|
|
case <-time.After(3 * time.Second):
|
|
case <-c.stop:
|
|
return
|
|
}
|
|
|
|
for {
|
|
interval, retention := c.readConfig()
|
|
if interval == 0 || retention == 0 {
|
|
// Collection disabled. Poll settings every minute in case the
|
|
// user re-enables it.
|
|
select {
|
|
case <-time.After(time.Minute):
|
|
continue
|
|
case <-c.stop:
|
|
return
|
|
}
|
|
}
|
|
|
|
c.tick(baseCtx, retention)
|
|
|
|
select {
|
|
case <-time.After(time.Duration(interval) * time.Second):
|
|
case <-c.stop:
|
|
return
|
|
}
|
|
}
|
|
}
|
|
|
|
// readConfig reads the current interval + retention from settings, applying
|
|
// defaults and clamping to the valid range.
|
|
func (c *Collector) readConfig() (intervalSeconds, retentionHours int) {
|
|
settings, err := c.store.GetSettings()
|
|
if err != nil {
|
|
slog.Warn("stats collector: failed to read settings — using defaults", "error", err)
|
|
return DefaultIntervalSeconds, DefaultRetentionHours
|
|
}
|
|
intervalSeconds = settings.StatsIntervalSeconds
|
|
retentionHours = settings.StatsRetentionHours
|
|
if intervalSeconds < 0 || retentionHours < 0 {
|
|
return 0, 0
|
|
}
|
|
if intervalSeconds > 0 && intervalSeconds < MinIntervalSeconds {
|
|
intervalSeconds = MinIntervalSeconds
|
|
}
|
|
if intervalSeconds > MaxIntervalSeconds {
|
|
intervalSeconds = MaxIntervalSeconds
|
|
}
|
|
return intervalSeconds, retentionHours
|
|
}
|
|
|
|
// tick samples all known containers, aggregates workload-level totals,
|
|
// persists samples, and prunes rows beyond the retention window. When
|
|
// the Docker daemon is unreachable the whole tick is skipped with a
|
|
// single debug log instead of one warning per container.
|
|
func (c *Collector) tick(parent context.Context, retentionHours int) {
|
|
ctx, cancel := context.WithTimeout(parent, 30*time.Second)
|
|
defer cancel()
|
|
|
|
pingCtx, pingCancel := context.WithTimeout(ctx, 2*time.Second)
|
|
defer pingCancel()
|
|
if err := c.docker.Ping(pingCtx); err != nil {
|
|
slog.Debug("stats collector: docker unreachable, skipping tick", "error", err)
|
|
return
|
|
}
|
|
|
|
targets := c.buildTargets()
|
|
if len(targets) == 0 {
|
|
// No containers to sample, but still record a system sample so the
|
|
// host history isn't empty.
|
|
c.recordSystemSample(ctx, 0, 0, 0)
|
|
c.prune(retentionHours)
|
|
return
|
|
}
|
|
|
|
samples := c.sampleAll(ctx, targets)
|
|
|
|
var (
|
|
totalCPU float64
|
|
totalMem int64
|
|
running int
|
|
)
|
|
for _, s := range samples {
|
|
if err := c.store.InsertContainerStatsSample(s); err != nil {
|
|
slog.Warn("stats collector: insert container sample",
|
|
"container", s.ContainerID, "error", err)
|
|
continue
|
|
}
|
|
totalCPU += s.CPUPercent
|
|
totalMem += s.MemoryUsage
|
|
running++
|
|
}
|
|
|
|
c.recordSystemSample(ctx, totalCPU, totalMem, running)
|
|
c.prune(retentionHours)
|
|
}
|
|
|
|
// target identifies one container to sample together with the owner
// information that is copied onto the resulting stats sample.
type target struct {
	// ContainerID is the Docker container ID passed to the stats request.
	ContainerID string
	// OwnerType is OwnerTypeInstance or OwnerTypeSite.
	OwnerType string
	// OwnerID is the ID of the container row the target was built from.
	OwnerID string
}
|
|
|
|
// buildTargets fetches container rows that have a docker container_id bound.
|
|
// Project containers and stack containers are surfaced as OwnerTypeInstance
|
|
// (the stats sample owner_type is kept for back-compat with the persisted
|
|
// schema and the dashboard's group-by semantics).
|
|
func (c *Collector) buildTargets() []target {
|
|
var out []target
|
|
|
|
containers, err := c.store.ListContainers(store.ContainerFilter{})
|
|
if err != nil {
|
|
slog.Warn("stats collector: list containers", "error", err)
|
|
} else {
|
|
for _, row := range containers {
|
|
if row.ContainerID == "" {
|
|
continue
|
|
}
|
|
ownerType := OwnerTypeInstance
|
|
if row.WorkloadKind == string(store.WorkloadKindSite) {
|
|
ownerType = OwnerTypeSite
|
|
}
|
|
out = append(out, target{
|
|
ContainerID: row.ContainerID,
|
|
OwnerType: ownerType,
|
|
OwnerID: row.ID,
|
|
})
|
|
}
|
|
}
|
|
|
|
return out
|
|
}
|
|
|
|
// sampleAll fetches Docker stats for every target in bounded parallelism.
|
|
// Failed samples are logged and skipped — a missing container must not kill
|
|
// the whole tick.
|
|
func (c *Collector) sampleAll(ctx context.Context, targets []target) []store.ContainerStatsSample {
|
|
sem := make(chan struct{}, maxParallelSamples)
|
|
results := make([]store.ContainerStatsSample, len(targets))
|
|
found := make([]bool, len(targets))
|
|
|
|
var wg sync.WaitGroup
|
|
for i, t := range targets {
|
|
// Acquire the semaphore in the parent loop so ctx cancellation
|
|
// short-circuits the queue rather than spawning goroutines that
|
|
// block on an unreachable slot.
|
|
select {
|
|
case sem <- struct{}{}:
|
|
case <-ctx.Done():
|
|
break
|
|
}
|
|
if ctx.Err() != nil {
|
|
break
|
|
}
|
|
wg.Add(1)
|
|
go func(i int, t target) {
|
|
defer wg.Done()
|
|
defer func() { <-sem }()
|
|
|
|
sampleCtx, cancel := context.WithTimeout(ctx, 10*time.Second)
|
|
defer cancel()
|
|
|
|
stats, err := c.docker.GetContainerStats(sampleCtx, t.ContainerID)
|
|
if err != nil {
|
|
slog.Debug("stats collector: get container stats",
|
|
"container", t.ContainerID, "owner_type", t.OwnerType, "error", err)
|
|
return
|
|
}
|
|
ts := stats.Timestamp.Unix()
|
|
if ts <= 0 {
|
|
ts = time.Now().UTC().Unix()
|
|
}
|
|
results[i] = store.ContainerStatsSample{
|
|
ContainerID: t.ContainerID,
|
|
OwnerType: t.OwnerType,
|
|
OwnerID: t.OwnerID,
|
|
TS: ts,
|
|
CPUPercent: stats.CPUPercent,
|
|
MemoryUsage: stats.MemoryUsage,
|
|
MemoryLimit: stats.MemoryLimit,
|
|
NetworkRxBytes: stats.NetworkRxBytes,
|
|
NetworkTxBytes: stats.NetworkTxBytes,
|
|
BlockReadBytes: stats.BlockReadBytes,
|
|
BlockWriteBytes: stats.BlockWriteBytes,
|
|
}
|
|
found[i] = true
|
|
}(i, t)
|
|
}
|
|
wg.Wait()
|
|
|
|
out := results[:0]
|
|
for i := range results {
|
|
if found[i] {
|
|
out = append(out, results[i])
|
|
}
|
|
}
|
|
return out
|
|
}
|
|
|
|
// recordSystemSample fetches host info + disk usage and persists a combined
|
|
// system-level sample. Failures are warned but do not propagate.
|
|
func (c *Collector) recordSystemSample(ctx context.Context, workloadCPU float64, workloadMem int64, running int) {
|
|
sysStats, err := c.docker.GetSystemStats(ctx)
|
|
if err != nil {
|
|
slog.Warn("stats collector: get system stats", "error", err)
|
|
return
|
|
}
|
|
ts := sysStats.Timestamp.Unix()
|
|
if ts <= 0 {
|
|
ts = time.Now().UTC().Unix()
|
|
}
|
|
sample := store.SystemStatsSample{
|
|
TS: ts,
|
|
NCPU: sysStats.NCPU,
|
|
MemoryTotal: sysStats.MemoryTotal,
|
|
WorkloadCPUPercent: workloadCPU,
|
|
WorkloadMemUsage: workloadMem,
|
|
ContainersRunning: running,
|
|
DiskTotalBytes: sysStats.DiskTotalBytes,
|
|
}
|
|
// Prefer the Docker-reported running count when we have no running samples
|
|
// (e.g., very first tick may race with container readiness).
|
|
if running == 0 && sysStats.Running > 0 {
|
|
sample.ContainersRunning = sysStats.Running
|
|
}
|
|
if err := c.store.InsertSystemStatsSample(sample); err != nil {
|
|
slog.Warn("stats collector: insert system sample", "error", err)
|
|
}
|
|
}
|
|
|
|
// prune drops rows older than the retention window.
|
|
func (c *Collector) prune(retentionHours int) {
|
|
if retentionHours <= 0 {
|
|
return
|
|
}
|
|
cutoff := time.Now().UTC().Add(-time.Duration(retentionHours) * time.Hour).Unix()
|
|
if _, err := c.store.PruneStatsSamplesBefore(cutoff); err != nil {
|
|
slog.Warn("stats collector: prune", "error", err)
|
|
}
|
|
}
|