410a131cec
This session (frontend focus):
- Rebuild /apps/new as a 4-step wizard (Basics → Configure → Trigger → Review):
WizardRail, SourceKindPicker card grid, AppManifest review, per-step validation,
ConfirmDialog-based unsaved-changes guard.
- Extract lib/workload/sourceForms.ts (single source of truth for source_config)
+ {Image,Compose,Static,Dockerfile}SourceForm + StaticDiscoveryWizard; fold the
/apps/[id] edit form onto the same components (removes the duplication). Add
vitest + sourceForms unit tests.
- Branch preview environments UI: /chain is_preview/preview_branch + a Preview
environments panel on /apps/[id] (per-branch URLs, ConfirmDialog teardown, armed
state); RegistryImagePicker on the registry trigger and the image source.
- Fixes: image-inspect 404 -> admin-gated POST /api/discovery/image/inspect;
conflict-panel blur flicker; friendly localized discovery errors; CPU/Memory
label hints; dashboard + /apps "Total workloads" count only source_kind workloads
(drop stale trigger_kind gate); NPM cert/access-list name cache; EntityPicker
empty-list guard.
- Update CLAUDE.md frontend conventions + add a Build & Test section.
Also captures pre-existing in-progress platform work (not from this session):
workload notifications, Prometheus metrics export, store lockfile, health probes,
backup hardening, and related store/webhook/scheduler changes.
172 lines
6.7 KiB
Go
172 lines
6.7 KiB
Go
package store
|
|
|
|
import (
|
|
"errors"
|
|
"fmt"
|
|
"os"
|
|
"path/filepath"
|
|
"strconv"
|
|
"strings"
|
|
)
|
|
|
|
// ErrLockHeld reports that a second Tinyforge process seems to be using the
// same data directory. With SQLite opened under SetMaxOpenConns(1) such a
// collision would otherwise pass silently and lead to schedulers firing
// twice, registries being polled twice, and read-modify-write corruption of
// `extra_json`.
var ErrLockHeld = errors.New("data directory is locked by another tinyforge process")
|
|
|
|
// Lockfile describes the portable PID-file lock that AcquireLockfile takes;
// the release function AcquireLockfile returns deletes the file again.
//
// Contract:
//
//   - The file is created with O_CREATE|O_EXCL, which is atomic on POSIX
//     and, through the equivalent primitive, on NTFS / ReFS.
//   - When the file already exists, the PID stored in it is inspected. A
//     dead PID marks the lock as stale (the owner crashed without cleaning
//     up), so it is reclaimed and startup proceeds. A live PID yields
//     ErrLockHeld.
//   - flock is deliberately avoided: for a single-instance guard,
//     behaving the same on every platform matters more than advisory-lock
//     semantics.
type Lockfile struct {
	// NOTE(review): presumably the on-disk location of the PID file; the
	// code visible in this file never reads it — confirm against callers.
	path string
}
|
|
|
|
// AcquireLockfile creates a PID-file lock under dataDir. Returns a
|
|
// Release function the caller must defer. If another live process holds
|
|
// the lock, returns ErrLockHeld with a hint pointing at the lockfile.
|
|
//
|
|
// Reclaim atomicity: when the existing lockfile names a dead PID, the
|
|
// replacement is serialized through an auxiliary reclaim lock (see
|
|
// reclaimStaleLock) so that, of N processes booting concurrently against
|
|
// the same stale lockfile, EXACTLY ONE reclaims it and the rest get
|
|
// ErrLockHeld. A bare `os.Remove`+`O_EXCL` retry — or a rename, which is
|
|
// "last-writer-wins" — cannot guarantee this: multiple reclaimers can each
|
|
// end up believing they own the lock, defeating the single-instance guard.
|
|
func AcquireLockfile(dataDir string) (release func(), err error) {
|
|
path := filepath.Join(dataDir, "tinyforge.lock")
|
|
|
|
// First try: clean acquire.
|
|
if rel, ok, err := tryCreateExclusive(path); ok {
|
|
return rel, nil
|
|
} else if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
// Existing lockfile — read PID and decide whether to reclaim.
|
|
pid, readErr := readLockPID(path)
|
|
if readErr == nil && processAlive(pid) {
|
|
return nil, fmt.Errorf("%w (held by pid %d, lockfile=%s)", ErrLockHeld, pid, path)
|
|
}
|
|
// Stale lock (dead pid) or malformed file — reclaim under serialization.
|
|
reason := "malformed existing lockfile"
|
|
if readErr == nil {
|
|
reason = fmt.Sprintf("stale lockfile (dead pid %d)", pid)
|
|
}
|
|
return reclaimStaleLock(path, reason)
|
|
}
|
|
|
|
// tryCreateExclusive makes one atomic O_CREATE|O_EXCL attempt to create the
// file at path and record the current process ID in it.
//
// Results:
//   - (release, true, nil): the file was created and holds our PID followed
//     by a newline; release removes the file.
//   - (nil, false, nil): a file already exists at path.
//   - (nil, false, err): any other failure. If the file had already been
//     created when the failure occurred, it is removed again.
func tryCreateExclusive(path string) (func(), bool, error) {
	const flags = os.O_CREATE | os.O_EXCL | os.O_WRONLY

	file, err := os.OpenFile(path, flags, 0o600)
	switch {
	case err == nil:
		// Created; fall through to recording the PID.
	case os.IsExist(err):
		return nil, false, nil
	default:
		return nil, false, fmt.Errorf("open lockfile: %w", err)
	}

	// Always close the handle; a write failure takes precedence over a
	// close failure when deciding which error to report.
	_, writeErr := fmt.Fprintf(file, "%d\n", os.Getpid())
	closeErr := file.Close()

	if writeErr != nil {
		_ = os.Remove(path) // best effort: do not leave a half-written lock
		return nil, false, fmt.Errorf("write lockfile: %w", writeErr)
	}
	if closeErr != nil {
		_ = os.Remove(path) // best effort: contents may not have been flushed
		return nil, false, fmt.Errorf("close lockfile: %w", closeErr)
	}

	release := func() { _ = os.Remove(path) }
	return release, true, nil
}
|
|
|
|
// reclaimStaleLock replaces a stale/malformed lockfile with one holding our
|
|
// PID, serialized by an auxiliary reclaim lock. Holding the reclaim lock
|
|
// (O_EXCL) guarantees that only one process performs the remove-and-recreate
|
|
// of the main lockfile at a time, so concurrent reclaimers cannot each end
|
|
// up "owning" the lock the way a rename or unguarded remove+create would
|
|
// allow. The reclaim lock is itself liveness-checked so a reclaimer that
|
|
// crashed mid-reclaim cannot wedge startup forever.
|
|
func reclaimStaleLock(lockPath, reason string) (func(), error) {
|
|
reclaimPath := lockPath + ".reclaim"
|
|
if err := acquireReclaimLock(reclaimPath); err != nil {
|
|
return nil, fmt.Errorf("%w (%v; %s)", ErrLockHeld, err, reason)
|
|
}
|
|
defer func() { _ = os.Remove(reclaimPath) }()
|
|
|
|
// Serialized now. Re-check the main lock: another process may have fully
|
|
// reclaimed it between our liveness probe and our taking the reclaim lock.
|
|
if pid, perr := readLockPID(lockPath); perr == nil && processAlive(pid) {
|
|
return nil, fmt.Errorf("%w (reclaimed by pid %d while we waited; %s)",
|
|
ErrLockHeld, pid, reason)
|
|
}
|
|
|
|
// Safe to replace: remove the stale file, then create a fresh exclusive
|
|
// one. Both run while we hold the reclaim lock, so no other reclaimer can
|
|
// observe the gap.
|
|
if err := os.Remove(lockPath); err != nil && !os.IsNotExist(err) {
|
|
return nil, fmt.Errorf("%w (could not remove stale lockfile %s: %v; %s)",
|
|
ErrLockHeld, lockPath, err, reason)
|
|
}
|
|
rel, ok, err := tryCreateExclusive(lockPath)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
if !ok {
|
|
// Should be impossible while we hold the reclaim lock; fail safe.
|
|
return nil, fmt.Errorf("%w (lockfile reappeared during reclaim of %s; %s)",
|
|
ErrLockHeld, lockPath, reason)
|
|
}
|
|
return rel, nil
|
|
}
|
|
|
|
// acquireReclaimLock takes the auxiliary reclaim lock with O_EXCL. An
|
|
// existing reclaim lock is honoured only while its recorded PID is alive (a
|
|
// genuine concurrent reclaim); a stale one (dead/foreign PID) is removed once
|
|
// and re-attempted so a crashed reclaimer cannot block boot indefinitely. Of
|
|
// concurrent callers, O_EXCL ensures at most one acquires it; the rest fail
|
|
// and back off to ErrLockHeld.
|
|
func acquireReclaimLock(reclaimPath string) error {
|
|
for attempt := 0; attempt < 2; attempt++ {
|
|
f, err := os.OpenFile(reclaimPath, os.O_CREATE|os.O_EXCL|os.O_WRONLY, 0o600)
|
|
if err == nil {
|
|
if _, werr := fmt.Fprintf(f, "%d\n", os.Getpid()); werr != nil {
|
|
_ = f.Close()
|
|
_ = os.Remove(reclaimPath)
|
|
return fmt.Errorf("write reclaim lock %s: %v", reclaimPath, werr)
|
|
}
|
|
return f.Close()
|
|
}
|
|
if !os.IsExist(err) {
|
|
return fmt.Errorf("create reclaim lock %s: %v", reclaimPath, err)
|
|
}
|
|
// Reclaim lock present. A live owner means a real concurrent reclaim.
|
|
if pid, perr := readLockPID(reclaimPath); perr == nil && processAlive(pid) {
|
|
return fmt.Errorf("concurrent reclaim in progress (pid %d)", pid)
|
|
}
|
|
// Stale reclaim lock — clear it and retry the exclusive create once.
|
|
if rerr := os.Remove(reclaimPath); rerr != nil && !os.IsNotExist(rerr) {
|
|
return fmt.Errorf("remove stale reclaim lock %s: %v", reclaimPath, rerr)
|
|
}
|
|
}
|
|
return fmt.Errorf("could not acquire reclaim lock %s after retry", reclaimPath)
|
|
}
|
|
|
|
// readLockPID returns the process ID recorded in the lockfile at path.
//
// The file is expected to hold a single decimal integer, optionally
// surrounded by whitespace. The error from os.ReadFile is returned unchanged
// so callers can still classify it (for example with os.IsNotExist). An
// empty file, non-numeric content, or a non-positive value is reported as
// an error and the returned PID is 0.
//
// Non-positive values are rejected because they never identify a single
// process; on POSIX, kill(2) gives pid 0 and negative pids process-group or
// broadcast meaning, so depending on how the liveness probe is implemented
// such a value could look "alive" and keep the lock held forever.
func readLockPID(path string) (int, error) {
	raw, err := os.ReadFile(path)
	if err != nil {
		return 0, err
	}

	text := strings.TrimSpace(string(raw))
	if text == "" {
		return 0, errors.New("empty lockfile")
	}

	pid, err := strconv.Atoi(text)
	if err != nil {
		// Atoi may return a clamped value on range errors; never leak it.
		return 0, err
	}
	if pid <= 0 {
		return 0, errors.New("invalid lockfile pid " + strconv.Itoa(pid))
	}
	return pid, nil
}
|