package store import ( "errors" "fmt" "os" "path/filepath" "strconv" "strings" ) // ErrLockHeld is returned when another Tinyforge process appears to be // running against the same data directory. SQLite + SetMaxOpenConns(1) // makes this otherwise-silent collision a recipe for double-fired // schedulers, double-polled registries, and `extra_json` RMW corruption. var ErrLockHeld = errors.New("data directory is locked by another tinyforge process") // Lockfile is a portable PID file. AcquireLockfile takes it; the returned // Release function removes it. The contract: // // - Lockfile is created with O_CREATE|O_EXCL — atomic on POSIX, atomic // on NTFS / ReFS via the equivalent. // - On collision, the existing file's PID is read; if the PID is dead, // we treat the lock as stale (process crashed without cleanup), // reclaim it, and proceed. Live PID → ErrLockHeld. // - flock is intentionally not used: cross-platform consistency wins // over advisory-lock semantics for the single-instance use case. type Lockfile struct { path string } // AcquireLockfile creates a PID-file lock under dataDir. Returns a // Release function the caller must defer. If another live process holds // the lock, returns ErrLockHeld with a hint pointing at the lockfile. // // Reclaim atomicity: when the existing lockfile names a dead PID, the // replacement is serialized through an auxiliary reclaim lock (see // reclaimStaleLock) so that, of N processes booting concurrently against // the same stale lockfile, EXACTLY ONE reclaims it and the rest get // ErrLockHeld. A bare `os.Remove`+`O_EXCL` retry — or a rename, which is // "last-writer-wins" — cannot guarantee this: multiple reclaimers can each // end up believing they own the lock, defeating the single-instance guard. func AcquireLockfile(dataDir string) (release func(), err error) { path := filepath.Join(dataDir, "tinyforge.lock") // First try: clean acquire. if rel, ok, err := tryCreateExclusive(path); ok { return rel, nil } else if err != nil { return nil, err } // Existing lockfile — read PID and decide whether to reclaim. pid, readErr := readLockPID(path) if readErr == nil && processAlive(pid) { return nil, fmt.Errorf("%w (held by pid %d, lockfile=%s)", ErrLockHeld, pid, path) } // Stale lock (dead pid) or malformed file — reclaim under serialization. reason := "malformed existing lockfile" if readErr == nil { reason = fmt.Sprintf("stale lockfile (dead pid %d)", pid) } return reclaimStaleLock(path, reason) } // tryCreateExclusive attempts an atomic O_CREATE|O_EXCL create at path. // Returns (release, true, nil) on success; (nil, false, nil) when the // file already exists; (nil, false, err) on any other error. func tryCreateExclusive(path string) (func(), bool, error) { f, openErr := os.OpenFile(path, os.O_CREATE|os.O_EXCL|os.O_WRONLY, 0o600) if openErr != nil { if os.IsExist(openErr) { return nil, false, nil } return nil, false, fmt.Errorf("open lockfile: %w", openErr) } if _, err := fmt.Fprintf(f, "%d\n", os.Getpid()); err != nil { _ = f.Close() _ = os.Remove(path) return nil, false, fmt.Errorf("write lockfile: %w", err) } if err := f.Close(); err != nil { _ = os.Remove(path) return nil, false, fmt.Errorf("close lockfile: %w", err) } return func() { _ = os.Remove(path) }, true, nil } // reclaimStaleLock replaces a stale/malformed lockfile with one holding our // PID, serialized by an auxiliary reclaim lock. Holding the reclaim lock // (O_EXCL) guarantees that only one process performs the remove-and-recreate // of the main lockfile at a time, so concurrent reclaimers cannot each end // up "owning" the lock the way a rename or unguarded remove+create would // allow. The reclaim lock is itself liveness-checked so a reclaimer that // crashed mid-reclaim cannot wedge startup forever. func reclaimStaleLock(lockPath, reason string) (func(), error) { reclaimPath := lockPath + ".reclaim" if err := acquireReclaimLock(reclaimPath); err != nil { return nil, fmt.Errorf("%w (%v; %s)", ErrLockHeld, err, reason) } defer func() { _ = os.Remove(reclaimPath) }() // Serialized now. Re-check the main lock: another process may have fully // reclaimed it between our liveness probe and our taking the reclaim lock. if pid, perr := readLockPID(lockPath); perr == nil && processAlive(pid) { return nil, fmt.Errorf("%w (reclaimed by pid %d while we waited; %s)", ErrLockHeld, pid, reason) } // Safe to replace: remove the stale file, then create a fresh exclusive // one. Both run while we hold the reclaim lock, so no other reclaimer can // observe the gap. if err := os.Remove(lockPath); err != nil && !os.IsNotExist(err) { return nil, fmt.Errorf("%w (could not remove stale lockfile %s: %v; %s)", ErrLockHeld, lockPath, err, reason) } rel, ok, err := tryCreateExclusive(lockPath) if err != nil { return nil, err } if !ok { // Should be impossible while we hold the reclaim lock; fail safe. return nil, fmt.Errorf("%w (lockfile reappeared during reclaim of %s; %s)", ErrLockHeld, lockPath, reason) } return rel, nil } // acquireReclaimLock takes the auxiliary reclaim lock with O_EXCL. An // existing reclaim lock is honoured only while its recorded PID is alive (a // genuine concurrent reclaim); a stale one (dead/foreign PID) is removed once // and re-attempted so a crashed reclaimer cannot block boot indefinitely. Of // concurrent callers, O_EXCL ensures at most one acquires it; the rest fail // and back off to ErrLockHeld. func acquireReclaimLock(reclaimPath string) error { for attempt := 0; attempt < 2; attempt++ { f, err := os.OpenFile(reclaimPath, os.O_CREATE|os.O_EXCL|os.O_WRONLY, 0o600) if err == nil { if _, werr := fmt.Fprintf(f, "%d\n", os.Getpid()); werr != nil { _ = f.Close() _ = os.Remove(reclaimPath) return fmt.Errorf("write reclaim lock %s: %v", reclaimPath, werr) } return f.Close() } if !os.IsExist(err) { return fmt.Errorf("create reclaim lock %s: %v", reclaimPath, err) } // Reclaim lock present. A live owner means a real concurrent reclaim. if pid, perr := readLockPID(reclaimPath); perr == nil && processAlive(pid) { return fmt.Errorf("concurrent reclaim in progress (pid %d)", pid) } // Stale reclaim lock — clear it and retry the exclusive create once. if rerr := os.Remove(reclaimPath); rerr != nil && !os.IsNotExist(rerr) { return fmt.Errorf("remove stale reclaim lock %s: %v", reclaimPath, rerr) } } return fmt.Errorf("could not acquire reclaim lock %s after retry", reclaimPath) } func readLockPID(path string) (int, error) { data, err := os.ReadFile(path) if err != nil { return 0, err } pidStr := strings.TrimSpace(string(data)) if pidStr == "" { return 0, errors.New("empty lockfile") } return strconv.Atoi(pidStr) }