From 2d885a240272ecb907b08a4ef77d8fdd9ca5d8af Mon Sep 17 00:00:00 2001 From: Jim Smith Date: Wed, 29 Apr 2026 09:38:00 -0400 Subject: [PATCH] fix(plugins): disambiguate runtime-deps lock owners by process start-time MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `shouldRemoveRuntimeDepsLock` previously trusted `isAlive(owner.pid)` alone when deciding whether a lock could be reclaimed. That works fine on a normal host: when the writer dies the PID is gone and `isAlive` returns false. Inside Docker it does not — every Node gateway process runs as PID 1 (or PID 7 with `init: true`) in its container PID namespace, so a stale lock left behind by a previous incarnation looks "alive" to the new one. The 5-minute lock-wait timeout then fires and the supervisor restarts, and the cycle repeats indefinitely. Operators have to manually remove `.openclaw-runtime-deps.lock` to recover. This change records `pidStartTimeMs` alongside `pid` and `createdAtMs` when the lock is acquired, and consults it in the staleness check. When both sides have start-time evidence and they disagree, the lock is treated as stale; otherwise the existing PID-alive-means-fresh behavior is preserved exactly. The capture point uses `Date.now() - process.uptime() * 1000` once at module load, and the read side uses `/proc//stat` field 22 on Linux (returning null elsewhere so legacy semantics still apply on macOS/Windows hosts). This is strictly additive on the wire format and the predicate: existing lock files without `pidStartTimeMs` continue to take the same code path they did before, and platforms that cannot resolve a live PID's start-time fall back to the same legacy behavior. Refs #74346. --- src/plugins/bundled-runtime-deps.test.ts | 45 ++++++++++ src/plugins/bundled-runtime-deps.ts | 102 ++++++++++++++++++++++- 2 files changed, 143 insertions(+), 4 deletions(-) diff --git a/src/plugins/bundled-runtime-deps.test.ts b/src/plugins/bundled-runtime-deps.test.ts index c269d3fd79d..e7fc0309019 100644 --- a/src/plugins/bundled-runtime-deps.test.ts +++ b/src/plugins/bundled-runtime-deps.test.ts @@ -2126,6 +2126,51 @@ describe("ensureBundledPluginRuntimeDeps", () => { ).toBe(true); }); + it("treats a PID-alive lock with matching pidStartTimeMs as held by the same incarnation", () => { + expect( + bundledRuntimeDepsTesting.shouldRemoveRuntimeDepsLock( + { pid: 7, pidStartTimeMs: 1_000, createdAtMs: 2_000 }, + 2_500, + () => true, + // Live PID's start-time matches owner.pidStartTimeMs → same process + () => 1_000, + ), + ).toBe(false); + }); + + it("expires a PID-alive lock when the live PID's start-time differs (Docker PID reuse)", () => { + // Models the failure mode that motivated this change: inside a container + // the gateway is always PID 1 (or PID 7 with `init: true`), so a stale + // lock from a previous incarnation looks "alive" if we only consult + // isProcessAlive. Capturing the writer's start-time and comparing it to + // the live PID's start-time disambiguates incarnations. + expect( + bundledRuntimeDepsTesting.shouldRemoveRuntimeDepsLock( + { pid: 7, pidStartTimeMs: 1_000, createdAtMs: 2_000 }, + 2_500, + () => true, + // Same PID, but a different incarnation started later → stale. + () => 9_000, + ), + ).toBe(true); + }); + + it("treats a PID-alive lock as fresh when start-time evidence cannot be read", () => { + // Defensive: when readProcessStartTimeMs returns null (legacy lock with + // no pidStartTimeMs, or a platform that does not expose it) we keep the + // pre-existing behavior of trusting isAlive(pid). The only verified + // disambiguation path is start-time evidence on both sides; without it + // we err toward "still held" rather than risk stomping a real install. + expect( + bundledRuntimeDepsTesting.shouldRemoveRuntimeDepsLock( + { pid: 7, pidStartTimeMs: 1_000, createdAtMs: 0 }, + Number.MAX_SAFE_INTEGER, + () => true, + () => null, + ), + ).toBe(false); + }); + it("does not expire fresh ownerless runtime-deps install locks", () => { expect( bundledRuntimeDepsTesting.shouldRemoveRuntimeDepsLock( diff --git a/src/plugins/bundled-runtime-deps.ts b/src/plugins/bundled-runtime-deps.ts index 6a6c68b986e..e6e1cb76566 100644 --- a/src/plugins/bundled-runtime-deps.ts +++ b/src/plugins/bundled-runtime-deps.ts @@ -456,8 +456,65 @@ function isProcessAlive(pid: number): boolean { } } +// Approximate epoch-ms when the current process started, captured once per +// Node lifetime so the lock owner record carries a value that is stable +// across `Date.now()` clock skew within the running process. +const CURRENT_PROCESS_START_TIME_MS = Math.round(Date.now() - process.uptime() * 1000); + +// Approximate epoch-ms when `pid` started running, or null when the platform +// or runtime cannot determine it. Used to distinguish two different process +// incarnations that happen to share the same numeric PID — most notably +// inside Docker containers, where the gateway is always PID 1 (or PID 7 +// with `init: true`) in its container PID namespace, so a stale lock left +// by a previous incarnation looks live to the new one if we only consult +// `isAlive(pid)`. +function readProcessStartTimeMs(pid: number): number | null { + if (!Number.isInteger(pid) || pid <= 0) { + return null; + } + if (process.platform !== "linux") { + return null; + } + let stat: string; + try { + stat = fs.readFileSync(`/proc/${pid}/stat`, "utf8"); + } catch { + return null; + } + // /proc//stat: PID (comm) state ppid pgrp ... starttime ... where + // `comm` may contain spaces and parens, so we anchor on the LAST `)` and + // count whitespace-separated fields after it. Field index 22 (1-based) is + // starttime, expressed in clock ticks since boot. After the `)` the next + // field is `state` (1), then ppid (2), pgrp (3), session (4), tty_nr (5), + // tpgid (6), flags (7), minflt (8), cminflt (9), majflt (10), cmajflt + // (11), utime (12), stime (13), cutime (14), cstime (15), priority (16), + // nice (17), num_threads (18), itrealvalue (19), starttime (20). + const lastParen = stat.lastIndexOf(")"); + if (lastParen === -1) { + return null; + } + const tail = stat + .slice(lastParen + 1) + .trim() + .split(/\s+/); + // tail[0] is `state`; starttime is therefore tail[19]. + const starttimeJiffies = Number(tail[19]); + if (!Number.isFinite(starttimeJiffies)) { + return null; + } + // sysconf(_SC_CLK_TCK) is universally 100 on Linux x86/arm targets. We + // hard-code that here rather than spawning getconf — the exact tick rate + // does not matter for our purposes as long as it is stable: the starttime + // we capture for our own process and the starttime we read for the + // lock-owner PID are converted with the same constant, so equality holds. + const clockTicksPerSecond = 100; + const bootTimeMs = Date.now() - os.uptime() * 1000; + return Math.round(bootTimeMs + (starttimeJiffies * 1000) / clockTicksPerSecond); +} + type RuntimeDepsLockOwner = { pid?: number; + pidStartTimeMs?: number; createdAtMs?: number; ownerFileState: "ok" | "missing" | "invalid"; ownerFilePath: string; @@ -501,6 +558,7 @@ function readRuntimeDepsLockOwner(lockDir: string): RuntimeDepsLockOwner { } return { pid: typeof owner?.pid === "number" ? owner.pid : undefined, + pidStartTimeMs: typeof owner?.pidStartTimeMs === "number" ? owner.pidStartTimeMs : undefined, createdAtMs: typeof owner?.createdAtMs === "number" ? owner.createdAtMs : undefined, ownerFileState, ownerFilePath, @@ -524,12 +582,32 @@ function latestFiniteMs(values: readonly (number | undefined)[]): number | undef } function shouldRemoveRuntimeDepsLock( - owner: Pick, + owner: Pick< + RuntimeDepsLockOwner, + "pid" | "pidStartTimeMs" | "createdAtMs" | "lockDirMtimeMs" | "ownerFileMtimeMs" + >, nowMs: number, isAlive: (pid: number) => boolean = isProcessAlive, + readStartTimeMs: (pid: number) => number | null = readProcessStartTimeMs, ): boolean { if (typeof owner.pid === "number") { - return !isAlive(owner.pid); + if (!isAlive(owner.pid)) { + return true; + } + // PID is alive — but inside Docker the new process can share the same + // PID as the dead writer. If we recorded the writer's start-time and we + // can read the live PID's start-time, mismatch means a different + // incarnation owns this PID now and the lock is stale. When start-time + // evidence is unavailable on either side, fall through to the existing + // PID-alive-means-fresh behavior so legacy locks keep working as + // before. + if (typeof owner.pidStartTimeMs === "number") { + const liveStartTimeMs = readStartTimeMs(owner.pid); + if (liveStartTimeMs !== null && liveStartTimeMs !== owner.pidStartTimeMs) { + return true; + } + } + return false; } if (typeof owner.createdAtMs === "number") { @@ -614,7 +692,15 @@ export function withBundledRuntimeDepsFilesystemLock( try { fs.writeFileSync( path.join(lockDir, BUNDLED_RUNTIME_DEPS_LOCK_OWNER_FILE), - `${JSON.stringify({ pid: process.pid, createdAtMs: Date.now() }, null, 2)}\n`, + `${JSON.stringify( + { + pid: process.pid, + pidStartTimeMs: CURRENT_PROCESS_START_TIME_MS, + createdAtMs: Date.now(), + }, + null, + 2, + )}\n`, "utf8", ); } catch (ownerWriteError) { @@ -2387,7 +2473,15 @@ async function withBundledRuntimeDepsInstallRootLockAsync( try { fs.writeFileSync( path.join(lockDir, BUNDLED_RUNTIME_DEPS_LOCK_OWNER_FILE), - `${JSON.stringify({ pid: process.pid, createdAtMs: Date.now() }, null, 2)}\n`, + `${JSON.stringify( + { + pid: process.pid, + pidStartTimeMs: CURRENT_PROCESS_START_TIME_MS, + createdAtMs: Date.now(), + }, + null, + 2, + )}\n`, "utf8", ); } catch (ownerWriteError) {