Files
tiny-forge/internal/store/lockfile.go
T
alexei.dolgolyov 410a131cec feat(apps): stepped creation wizard, branch previews, and app-creation fixes
This session (frontend focus):
- Rebuild /apps/new as a 4-step wizard (Basics → Configure → Trigger → Review):
  WizardRail, SourceKindPicker card grid, AppManifest review, per-step validation,
  ConfirmDialog-based unsaved-changes guard.
- Extract lib/workload/sourceForms.ts (single source of truth for source_config)
  + {Image,Compose,Static,Dockerfile}SourceForm + StaticDiscoveryWizard; fold the
  /apps/[id] edit form onto the same components (removes the duplication). Add
  vitest + sourceForms unit tests.
- Branch preview environments UI: /chain is_preview/preview_branch + a Preview
  environments panel on /apps/[id] (per-branch URLs, ConfirmDialog teardown, armed
  state); RegistryImagePicker on the registry trigger and the image source.
- Fixes: image-inspect 404 -> admin-gated POST /api/discovery/image/inspect;
  conflict-panel blur flicker; friendly localized discovery errors; CPU/Memory
  label hints; dashboard + /apps "Total workloads" count only source_kind workloads
  (drop stale trigger_kind gate); NPM cert/access-list name cache; EntityPicker
  empty-list guard.
- Update CLAUDE.md frontend conventions + add a Build & Test section.

Also captures pre-existing in-progress platform work (not from this session):
workload notifications, Prometheus metrics export, store lockfile, health probes,
backup hardening, and related store/webhook/scheduler changes.
2026-05-29 02:09:54 +03:00

172 lines
6.7 KiB
Go

package store
import (
"errors"
"fmt"
"os"
"path/filepath"
"strconv"
"strings"
)
// ErrLockHeld is the sentinel returned when another Tinyforge process
// appears to be running against the same data directory. AcquireLockfile
// wraps it with %w and appends details to the message, so callers should
// match it with errors.Is rather than comparing error strings.
//
// Why the guard exists: SQLite + SetMaxOpenConns(1) makes this
// otherwise-silent collision a recipe for double-fired schedulers,
// double-polled registries, and `extra_json` RMW (read-modify-write)
// corruption.
var ErrLockHeld = errors.New("data directory is locked by another tinyforge process")
// Lockfile is a portable PID file. AcquireLockfile takes it; the release
// function that AcquireLockfile returns removes it. The contract:
//
//   - The lockfile is created with O_CREATE|O_EXCL — atomic on POSIX, atomic
//     on NTFS / ReFS via the equivalent.
//   - On collision, the existing file's PID is read; if the PID is dead,
//     we treat the lock as stale (process crashed without cleanup),
//     reclaim it, and proceed. Live PID → ErrLockHeld.
//   - flock is intentionally not used: cross-platform consistency wins
//     over advisory-lock semantics for the single-instance use case.
//
// NOTE(review): nothing in the visible part of this file constructs a
// Lockfile value — AcquireLockfile hands back a bare func() — so this type
// may be unused; confirm against the rest of the package.
type Lockfile struct {
// path is presumably the lockfile's location on disk — TODO confirm, no
// visible code reads or writes this field.
path string
}
// AcquireLockfile takes the single-instance PID-file lock "tinyforge.lock"
// inside dataDir.
//
// On success it returns a release function that deletes the lockfile; the
// caller is expected to defer it. When the lockfile already exists and names
// a process that is still alive, the result is an error wrapping ErrLockHeld
// whose message carries the owning PID and the lockfile path.
//
// A lockfile that names a dead PID, or whose contents cannot be read as a
// PID, is treated as stale and handed to reclaimStaleLock. That helper
// serializes the replacement through an auxiliary reclaim lock, because a
// bare remove-then-O_EXCL retry (or a last-writer-wins rename) would let
// several concurrently booting processes each conclude they own the lock,
// defeating the single-instance guard.
func AcquireLockfile(dataDir string) (release func(), err error) {
	lockPath := filepath.Join(dataDir, "tinyforge.lock")

	// Fast path: nobody holds the lock, so the exclusive create succeeds.
	rel, created, createErr := tryCreateExclusive(lockPath)
	switch {
	case created:
		return rel, nil
	case createErr != nil:
		return nil, createErr
	}

	// The file already exists. Whoever wrote it decides what happens next.
	owner, readErr := readLockPID(lockPath)
	if readErr != nil {
		// Unreadable or unparsable contents: nothing to liveness-check.
		return reclaimStaleLock(lockPath, "malformed existing lockfile")
	}
	if processAlive(owner) {
		return nil, fmt.Errorf("%w (held by pid %d, lockfile=%s)", ErrLockHeld, owner, lockPath)
	}

	// The recorded owner is gone without cleaning up; reclaim under
	// serialization.
	return reclaimStaleLock(lockPath, fmt.Sprintf("stale lockfile (dead pid %d)", owner))
}
// tryCreateExclusive makes one attempt to create path with
// O_CREATE|O_EXCL and store the current process ID in it.
//
// Results:
//   - (release, true, nil): the file was created and the PID written;
//     release deletes the file.
//   - (nil, false, nil): the file already exists.
//   - (nil, false, err): any other failure. If the failure happened after
//     the file was created (write or close), the file is removed again
//     on a best-effort basis before returning.
func tryCreateExclusive(path string) (func(), bool, error) {
	const flags = os.O_CREATE | os.O_EXCL | os.O_WRONLY

	file, err := os.OpenFile(path, flags, 0o600)
	switch {
	case err == nil:
		// Created; fall through to record our PID.
	case os.IsExist(err):
		return nil, false, nil
	default:
		return nil, false, fmt.Errorf("open lockfile: %w", err)
	}

	// discard doubles as the failure cleanup and as the release function
	// handed to the caller; the removal error is deliberately ignored.
	discard := func() { _ = os.Remove(path) }

	if _, writeErr := fmt.Fprintf(file, "%d\n", os.Getpid()); writeErr != nil {
		_ = file.Close()
		discard()
		return nil, false, fmt.Errorf("write lockfile: %w", writeErr)
	}
	if closeErr := file.Close(); closeErr != nil {
		discard()
		return nil, false, fmt.Errorf("close lockfile: %w", closeErr)
	}
	return discard, true, nil
}
// reclaimStaleLock swaps a stale or malformed lockfile at lockPath for a
// fresh one carrying our PID and returns the release function for it.
//
// The swap runs under an auxiliary lock file (lockPath + ".reclaim") taken
// via acquireReclaimLock, so that only one reclaimer at a time performs the
// remove-and-recreate of the main lockfile. The auxiliary file is removed
// when this function returns, whatever the outcome. reason is a
// human-readable description of why the reclaim was attempted; it is only
// used in error messages.
//
// Every failure except an unexpected create error is reported as an error
// wrapping ErrLockHeld.
func reclaimStaleLock(lockPath, reason string) (func(), error) {
	auxPath := lockPath + ".reclaim"
	if lockErr := acquireReclaimLock(auxPath); lockErr != nil {
		return nil, fmt.Errorf("%w (%v; %s)", ErrLockHeld, lockErr, reason)
	}
	defer func() { _ = os.Remove(auxPath) }()

	// We are the active reclaimer now. Look at the main lock again: someone
	// else may have completed a reclaim between the caller's liveness probe
	// and our getting the reclaim lock.
	owner, readErr := readLockPID(lockPath)
	if readErr == nil && processAlive(owner) {
		return nil, fmt.Errorf("%w (reclaimed by pid %d while we waited; %s)",
			ErrLockHeld, owner, reason)
	}

	// Drop the stale file. A file that has already vanished is fine.
	switch rmErr := os.Remove(lockPath); {
	case rmErr == nil, os.IsNotExist(rmErr):
		// Nothing in the way any more.
	default:
		return nil, fmt.Errorf("%w (could not remove stale lockfile %s: %v; %s)",
			ErrLockHeld, lockPath, rmErr, reason)
	}

	// Recreate it exclusively with our PID.
	release, created, createErr := tryCreateExclusive(lockPath)
	switch {
	case createErr != nil:
		return nil, createErr
	case !created:
		// Not expected from another reclaimer while we hold the reclaim
		// lock. NOTE(review): AcquireLockfile's clean-acquire attempt does
		// not take the reclaim lock, so a freshly booting process could
		// create the file in this gap; failing with ErrLockHeld is the safe
		// outcome in either case.
		return nil, fmt.Errorf("%w (lockfile reappeared during reclaim of %s; %s)",
			ErrLockHeld, lockPath, reason)
	}
	return release, nil
}
// acquireReclaimLock takes the auxiliary reclaim lock by creating
// reclaimPath with O_CREATE|O_EXCL and recording our PID in it. It returns
// nil once the file exists, is written, and is closed; the caller is then
// responsible for removing it.
//
// An existing reclaim lock is honoured only while the PID recorded in it is
// alive (a genuine concurrent reclaim). A stale one — dead PID or unreadable
// contents — is removed once and the exclusive create is retried, so a
// reclaimer that crashed mid-reclaim cannot block boot indefinitely. Of
// concurrent callers, O_EXCL lets at most one create the file; the rest get
// an error, which the caller reports as ErrLockHeld.
//
// On any failure after the file was created (write or close), the file is
// removed before returning. The caller only schedules its own cleanup after
// a nil return, so a file left behind here would carry our live PID and make
// every other reclaimer back off for as long as this process runs.
func acquireReclaimLock(reclaimPath string) error {
	for attempt := 0; attempt < 2; attempt++ {
		f, err := os.OpenFile(reclaimPath, os.O_CREATE|os.O_EXCL|os.O_WRONLY, 0o600)
		if err == nil {
			if _, werr := fmt.Fprintf(f, "%d\n", os.Getpid()); werr != nil {
				_ = f.Close()
				_ = os.Remove(reclaimPath) // best-effort cleanup
				return fmt.Errorf("write reclaim lock %s: %w", reclaimPath, werr)
			}
			if cerr := f.Close(); cerr != nil {
				_ = os.Remove(reclaimPath) // best-effort cleanup
				return fmt.Errorf("close reclaim lock %s: %w", reclaimPath, cerr)
			}
			return nil
		}
		if !os.IsExist(err) {
			return fmt.Errorf("create reclaim lock %s: %w", reclaimPath, err)
		}

		// Reclaim lock present. A live owner means a real concurrent reclaim.
		if pid, perr := readLockPID(reclaimPath); perr == nil && processAlive(pid) {
			return fmt.Errorf("concurrent reclaim in progress (pid %d)", pid)
		}

		// Stale reclaim lock — clear it and retry the exclusive create once.
		// A file that disappeared in the meantime is not an error.
		if rerr := os.Remove(reclaimPath); rerr != nil && !os.IsNotExist(rerr) {
			return fmt.Errorf("remove stale reclaim lock %s: %w", reclaimPath, rerr)
		}
	}
	return fmt.Errorf("could not acquire reclaim lock %s after retry", reclaimPath)
}
// readLockPID reads the file at path and parses its whitespace-trimmed
// contents as a decimal process ID.
//
// It returns an error when the file cannot be read, is empty or blank, does
// not parse as an integer, or parses to a value <= 0. Lock owners record
// os.Getpid(), so a non-positive value can only come from a corrupt or
// foreign file; rejecting it here keeps such values away from the liveness
// probe and lets callers treat the file as malformed. The returned PID is 0
// whenever the error is non-nil.
func readLockPID(path string) (int, error) {
	data, err := os.ReadFile(path)
	if err != nil {
		return 0, err
	}
	pidStr := strings.TrimSpace(string(data))
	if pidStr == "" {
		return 0, errors.New("empty lockfile")
	}
	pid, err := strconv.Atoi(pidStr)
	if err != nil {
		return 0, err
	}
	if pid <= 0 {
		return 0, fmt.Errorf("invalid pid %d in lockfile", pid)
	}
	return pid, nil
}