feat(observability): phase 2 - stale container detection
Add a periodic scanner for stale containers:
- Cron-based scanner (hourly) detects non-running containers that exceed the staleness threshold
- last_alive_at tracking on instances, updated on deploy/start/restart
- API: GET /api/containers/stale, POST cleanup (single + bulk)
- Event log warnings emitted for newly stale containers
- Graceful handling of externally removed containers
This commit is contained in:
+70
-19
@@ -8,6 +8,20 @@ import (
|
||||
"github.com/google/uuid"
|
||||
)
|
||||
|
||||
// instanceColumns is the canonical column list for instance queries.
|
||||
const instanceColumns = `id, stage_id, project_id, container_id, image_tag, subdomain, npm_proxy_id, status, port, last_alive_at, created_at, updated_at`
|
||||
|
||||
// scanInstance scans a row into an Instance struct using the canonical column order.
|
||||
func scanInstance(scanner interface{ Scan(...any) error }) (Instance, error) {
|
||||
var inst Instance
|
||||
err := scanner.Scan(
|
||||
&inst.ID, &inst.StageID, &inst.ProjectID, &inst.ContainerID, &inst.ImageTag,
|
||||
&inst.Subdomain, &inst.NpmProxyID, &inst.Status, &inst.Port,
|
||||
&inst.LastAliveAt, &inst.CreatedAt, &inst.UpdatedAt,
|
||||
)
|
||||
return inst, err
|
||||
}
|
||||
|
||||
// CreateInstance inserts a new instance record.
|
||||
func (s *Store) CreateInstance(inst Instance) (Instance, error) {
|
||||
inst.ID = uuid.New().String()
|
||||
@@ -15,10 +29,11 @@ func (s *Store) CreateInstance(inst Instance) (Instance, error) {
|
||||
inst.UpdatedAt = inst.CreatedAt
|
||||
|
||||
_, err := s.db.Exec(
|
||||
`INSERT INTO instances (id, stage_id, project_id, container_id, image_tag, subdomain, npm_proxy_id, status, port, created_at, updated_at)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`,
|
||||
`INSERT INTO instances (`+instanceColumns+`)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`,
|
||||
inst.ID, inst.StageID, inst.ProjectID, inst.ContainerID, inst.ImageTag,
|
||||
inst.Subdomain, inst.NpmProxyID, inst.Status, inst.Port, inst.CreatedAt, inst.UpdatedAt,
|
||||
inst.Subdomain, inst.NpmProxyID, inst.Status, inst.Port,
|
||||
inst.LastAliveAt, inst.CreatedAt, inst.UpdatedAt,
|
||||
)
|
||||
if err != nil {
|
||||
return Instance{}, fmt.Errorf("insert instance: %w", err)
|
||||
@@ -36,10 +51,11 @@ func (s *Store) CreateInstanceWithID(inst Instance) (Instance, error) {
|
||||
inst.UpdatedAt = inst.CreatedAt
|
||||
|
||||
_, err := s.db.Exec(
|
||||
`INSERT INTO instances (id, stage_id, project_id, container_id, image_tag, subdomain, npm_proxy_id, status, port, created_at, updated_at)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`,
|
||||
`INSERT INTO instances (`+instanceColumns+`)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`,
|
||||
inst.ID, inst.StageID, inst.ProjectID, inst.ContainerID, inst.ImageTag,
|
||||
inst.Subdomain, inst.NpmProxyID, inst.Status, inst.Port, inst.CreatedAt, inst.UpdatedAt,
|
||||
inst.Subdomain, inst.NpmProxyID, inst.Status, inst.Port,
|
||||
inst.LastAliveAt, inst.CreatedAt, inst.UpdatedAt,
|
||||
)
|
||||
if err != nil {
|
||||
return Instance{}, fmt.Errorf("insert instance: %w", err)
|
||||
@@ -49,12 +65,9 @@ func (s *Store) CreateInstanceWithID(inst Instance) (Instance, error) {
|
||||
|
||||
// GetInstanceByID returns a single instance by its ID.
|
||||
func (s *Store) GetInstanceByID(id string) (Instance, error) {
|
||||
var inst Instance
|
||||
err := s.db.QueryRow(
|
||||
`SELECT id, stage_id, project_id, container_id, image_tag, subdomain, npm_proxy_id, status, port, created_at, updated_at
|
||||
FROM instances WHERE id = ?`, id,
|
||||
).Scan(&inst.ID, &inst.StageID, &inst.ProjectID, &inst.ContainerID, &inst.ImageTag,
|
||||
&inst.Subdomain, &inst.NpmProxyID, &inst.Status, &inst.Port, &inst.CreatedAt, &inst.UpdatedAt)
|
||||
inst, err := scanInstance(s.db.QueryRow(
|
||||
`SELECT `+instanceColumns+` FROM instances WHERE id = ?`, id,
|
||||
))
|
||||
if errors.Is(err, sql.ErrNoRows) {
|
||||
return Instance{}, fmt.Errorf("instance %s: %w", id, ErrNotFound)
|
||||
}
|
||||
@@ -67,8 +80,7 @@ func (s *Store) GetInstanceByID(id string) (Instance, error) {
|
||||
// GetInstancesByStageID returns all instances for a given stage.
|
||||
func (s *Store) GetInstancesByStageID(stageID string) ([]Instance, error) {
|
||||
rows, err := s.db.Query(
|
||||
`SELECT id, stage_id, project_id, container_id, image_tag, subdomain, npm_proxy_id, status, port, created_at, updated_at
|
||||
FROM instances WHERE stage_id = ? ORDER BY created_at DESC`, stageID,
|
||||
`SELECT `+instanceColumns+` FROM instances WHERE stage_id = ? ORDER BY created_at DESC`, stageID,
|
||||
)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("query instances: %w", err)
|
||||
@@ -77,9 +89,29 @@ func (s *Store) GetInstancesByStageID(stageID string) ([]Instance, error) {
|
||||
|
||||
instances := []Instance{}
|
||||
for rows.Next() {
|
||||
var inst Instance
|
||||
if err := rows.Scan(&inst.ID, &inst.StageID, &inst.ProjectID, &inst.ContainerID, &inst.ImageTag,
|
||||
&inst.Subdomain, &inst.NpmProxyID, &inst.Status, &inst.Port, &inst.CreatedAt, &inst.UpdatedAt); err != nil {
|
||||
inst, err := scanInstance(rows)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("scan instance: %w", err)
|
||||
}
|
||||
instances = append(instances, inst)
|
||||
}
|
||||
return instances, rows.Err()
|
||||
}
|
||||
|
||||
// ListAllInstances returns all instances across all stages.
|
||||
func (s *Store) ListAllInstances() ([]Instance, error) {
|
||||
rows, err := s.db.Query(
|
||||
`SELECT ` + instanceColumns + ` FROM instances ORDER BY created_at DESC`,
|
||||
)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("query all instances: %w", err)
|
||||
}
|
||||
defer rows.Close()
|
||||
|
||||
instances := []Instance{}
|
||||
for rows.Next() {
|
||||
inst, err := scanInstance(rows)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("scan instance: %w", err)
|
||||
}
|
||||
instances = append(instances, inst)
|
||||
@@ -91,10 +123,11 @@ func (s *Store) GetInstancesByStageID(stageID string) ([]Instance, error) {
|
||||
func (s *Store) UpdateInstance(inst Instance) error {
|
||||
inst.UpdatedAt = Now()
|
||||
result, err := s.db.Exec(
|
||||
`UPDATE instances SET stage_id=?, project_id=?, container_id=?, image_tag=?, subdomain=?, npm_proxy_id=?, status=?, port=?, updated_at=?
|
||||
`UPDATE instances SET stage_id=?, project_id=?, container_id=?, image_tag=?, subdomain=?, npm_proxy_id=?, status=?, port=?, last_alive_at=?, updated_at=?
|
||||
WHERE id=?`,
|
||||
inst.StageID, inst.ProjectID, inst.ContainerID, inst.ImageTag,
|
||||
inst.Subdomain, inst.NpmProxyID, inst.Status, inst.Port, inst.UpdatedAt, inst.ID,
|
||||
inst.Subdomain, inst.NpmProxyID, inst.Status, inst.Port,
|
||||
inst.LastAliveAt, inst.UpdatedAt, inst.ID,
|
||||
)
|
||||
if err != nil {
|
||||
return fmt.Errorf("update instance: %w", err)
|
||||
@@ -123,6 +156,24 @@ func (s *Store) UpdateInstanceStatus(id string, status string) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
// UpdateLastAliveAt sets the last_alive_at timestamp for an instance.
|
||||
// Called when an instance is seen running.
|
||||
func (s *Store) UpdateLastAliveAt(id string) error {
|
||||
ts := Now()
|
||||
result, err := s.db.Exec(
|
||||
`UPDATE instances SET last_alive_at=?, updated_at=? WHERE id=?`,
|
||||
ts, ts, id,
|
||||
)
|
||||
if err != nil {
|
||||
return fmt.Errorf("update last_alive_at: %w", err)
|
||||
}
|
||||
n, _ := result.RowsAffected()
|
||||
if n == 0 {
|
||||
return fmt.Errorf("instance %s: %w", id, ErrNotFound)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// DeleteInstance removes an instance by ID.
|
||||
func (s *Store) DeleteInstance(id string) error {
|
||||
result, err := s.db.Exec(`DELETE FROM instances WHERE id = ?`, id)
|
||||
|
||||
@@ -71,6 +71,7 @@ type Instance struct {
|
||||
NpmProxyID int `json:"npm_proxy_id"`
|
||||
Status string `json:"status"` // running, stopped, failed, removing
|
||||
Port int `json:"port"`
|
||||
LastAliveAt string `json:"last_alive_at"`
|
||||
CreatedAt string `json:"created_at"`
|
||||
UpdatedAt string `json:"updated_at"`
|
||||
}
|
||||
|
||||
@@ -83,6 +83,8 @@ func (s *Store) runMigrations() error {
|
||||
`ALTER TABLE settings ADD COLUMN ssl_certificate_id INTEGER NOT NULL DEFAULT 0`,
|
||||
// Add stale_threshold_days to settings (2026-03-30).
|
||||
`ALTER TABLE settings ADD COLUMN stale_threshold_days INTEGER NOT NULL DEFAULT 7`,
|
||||
// Add last_alive_at to instances for stale container detection (2026-03-30).
|
||||
`ALTER TABLE instances ADD COLUMN last_alive_at TEXT NOT NULL DEFAULT ''`,
|
||||
}
|
||||
|
||||
for _, m := range migrations {
|
||||
|
||||
Reference in New Issue
Block a user