diff --git a/src/plugins/bundled-runtime-deps.test.ts b/src/plugins/bundled-runtime-deps.test.ts index c269d3fd79d..e7fc0309019 100644 --- a/src/plugins/bundled-runtime-deps.test.ts +++ b/src/plugins/bundled-runtime-deps.test.ts @@ -2126,6 +2126,51 @@ describe("ensureBundledPluginRuntimeDeps", () => { ).toBe(true); }); + it("treats a PID-alive lock with matching pidStartTimeMs as held by the same incarnation", () => { + expect( + bundledRuntimeDepsTesting.shouldRemoveRuntimeDepsLock( + { pid: 7, pidStartTimeMs: 1_000, createdAtMs: 2_000 }, + 2_500, + () => true, + // Live PID's start-time matches owner.pidStartTimeMs → same process + () => 1_000, + ), + ).toBe(false); + }); + + it("expires a PID-alive lock when the live PID's start-time differs (Docker PID reuse)", () => { + // Models the failure mode that motivated this change: inside a container + // the gateway is always PID 1 (or PID 7 with `init: true`), so a stale + // lock from a previous incarnation looks "alive" if we only consult + // isProcessAlive. Capturing the writer's start-time and comparing it to + // the live PID's start-time disambiguates incarnations. + expect( + bundledRuntimeDepsTesting.shouldRemoveRuntimeDepsLock( + { pid: 7, pidStartTimeMs: 1_000, createdAtMs: 2_000 }, + 2_500, + () => true, + // Same PID, but a different incarnation started later → stale. + () => 9_000, + ), + ).toBe(true); + }); + + it("treats a PID-alive lock as fresh when start-time evidence cannot be read", () => { + // Defensive: when readProcessStartTimeMs returns null (legacy lock with + // no pidStartTimeMs, or a platform that does not expose it) we keep the + // pre-existing behavior of trusting isAlive(pid). The only verified + // disambiguation path is start-time evidence on both sides; without it + // we err toward "still held" rather than risk stomping a real install. + expect( + bundledRuntimeDepsTesting.shouldRemoveRuntimeDepsLock( + { pid: 7, pidStartTimeMs: 1_000, createdAtMs: 0 }, + Number.MAX_SAFE_INTEGER, + () => true, + () => null, + ), + ).toBe(false); + }); + it("does not expire fresh ownerless runtime-deps install locks", () => { expect( bundledRuntimeDepsTesting.shouldRemoveRuntimeDepsLock( diff --git a/src/plugins/bundled-runtime-deps.ts b/src/plugins/bundled-runtime-deps.ts index 6a6c68b986e..e6e1cb76566 100644 --- a/src/plugins/bundled-runtime-deps.ts +++ b/src/plugins/bundled-runtime-deps.ts @@ -456,8 +456,65 @@ function isProcessAlive(pid: number): boolean { } } +// Approximate epoch-ms when the current process started, captured once per +// Node lifetime so the lock owner record carries a value that is stable +// across `Date.now()` clock skew within the running process. +const CURRENT_PROCESS_START_TIME_MS = Math.round(Date.now() - process.uptime() * 1000); + +// Approximate epoch-ms when `pid` started running, or null when the platform +// or runtime cannot determine it. Used to distinguish two different process +// incarnations that happen to share the same numeric PID — most notably +// inside Docker containers, where the gateway is always PID 1 (or PID 7 +// with `init: true`) in its container PID namespace, so a stale lock left +// by a previous incarnation looks live to the new one if we only consult +// `isAlive(pid)`. +function readProcessStartTimeMs(pid: number): number | null { + if (!Number.isInteger(pid) || pid <= 0) { + return null; + } + if (process.platform !== "linux") { + return null; + } + let stat: string; + try { + stat = fs.readFileSync(`/proc/${pid}/stat`, "utf8"); + } catch { + return null; + } + // /proc//stat: PID (comm) state ppid pgrp ... starttime ... where + // `comm` may contain spaces and parens, so we anchor on the LAST `)` and + // count whitespace-separated fields after it. Field index 22 (1-based) is + // starttime, expressed in clock ticks since boot. After the `)` the next + // field is `state` (1), then ppid (2), pgrp (3), session (4), tty_nr (5), + // tpgid (6), flags (7), minflt (8), cminflt (9), majflt (10), cmajflt + // (11), utime (12), stime (13), cutime (14), cstime (15), priority (16), + // nice (17), num_threads (18), itrealvalue (19), starttime (20). + const lastParen = stat.lastIndexOf(")"); + if (lastParen === -1) { + return null; + } + const tail = stat + .slice(lastParen + 1) + .trim() + .split(/\s+/); + // tail[0] is `state`; starttime is therefore tail[19]. + const starttimeJiffies = Number(tail[19]); + if (!Number.isFinite(starttimeJiffies)) { + return null; + } + // sysconf(_SC_CLK_TCK) is universally 100 on Linux x86/arm targets. We + // hard-code that here rather than spawning getconf — the exact tick rate + // does not matter for our purposes as long as it is stable: the starttime + // we capture for our own process and the starttime we read for the + // lock-owner PID are converted with the same constant, so equality holds. + const clockTicksPerSecond = 100; + const bootTimeMs = Date.now() - os.uptime() * 1000; + return Math.round(bootTimeMs + (starttimeJiffies * 1000) / clockTicksPerSecond); +} + type RuntimeDepsLockOwner = { pid?: number; + pidStartTimeMs?: number; createdAtMs?: number; ownerFileState: "ok" | "missing" | "invalid"; ownerFilePath: string; @@ -501,6 +558,7 @@ function readRuntimeDepsLockOwner(lockDir: string): RuntimeDepsLockOwner { } return { pid: typeof owner?.pid === "number" ? owner.pid : undefined, + pidStartTimeMs: typeof owner?.pidStartTimeMs === "number" ? owner.pidStartTimeMs : undefined, createdAtMs: typeof owner?.createdAtMs === "number" ? owner.createdAtMs : undefined, ownerFileState, ownerFilePath, @@ -524,12 +582,32 @@ function latestFiniteMs(values: readonly (number | undefined)[]): number | undef } function shouldRemoveRuntimeDepsLock( - owner: Pick, + owner: Pick< + RuntimeDepsLockOwner, + "pid" | "pidStartTimeMs" | "createdAtMs" | "lockDirMtimeMs" | "ownerFileMtimeMs" + >, nowMs: number, isAlive: (pid: number) => boolean = isProcessAlive, + readStartTimeMs: (pid: number) => number | null = readProcessStartTimeMs, ): boolean { if (typeof owner.pid === "number") { - return !isAlive(owner.pid); + if (!isAlive(owner.pid)) { + return true; + } + // PID is alive — but inside Docker the new process can share the same + // PID as the dead writer. If we recorded the writer's start-time and we + // can read the live PID's start-time, mismatch means a different + // incarnation owns this PID now and the lock is stale. When start-time + // evidence is unavailable on either side, fall through to the existing + // PID-alive-means-fresh behavior so legacy locks keep working as + // before. + if (typeof owner.pidStartTimeMs === "number") { + const liveStartTimeMs = readStartTimeMs(owner.pid); + if (liveStartTimeMs !== null && liveStartTimeMs !== owner.pidStartTimeMs) { + return true; + } + } + return false; } if (typeof owner.createdAtMs === "number") { @@ -614,7 +692,15 @@ export function withBundledRuntimeDepsFilesystemLock( try { fs.writeFileSync( path.join(lockDir, BUNDLED_RUNTIME_DEPS_LOCK_OWNER_FILE), - `${JSON.stringify({ pid: process.pid, createdAtMs: Date.now() }, null, 2)}\n`, + `${JSON.stringify( + { + pid: process.pid, + pidStartTimeMs: CURRENT_PROCESS_START_TIME_MS, + createdAtMs: Date.now(), + }, + null, + 2, + )}\n`, "utf8", ); } catch (ownerWriteError) { @@ -2387,7 +2473,15 @@ async function withBundledRuntimeDepsInstallRootLockAsync( try { fs.writeFileSync( path.join(lockDir, BUNDLED_RUNTIME_DEPS_LOCK_OWNER_FILE), - `${JSON.stringify({ pid: process.pid, createdAtMs: Date.now() }, null, 2)}\n`, + `${JSON.stringify( + { + pid: process.pid, + pidStartTimeMs: CURRENT_PROCESS_START_TIME_MS, + createdAtMs: Date.now(), + }, + null, + 2, + )}\n`, "utf8", ); } catch (ownerWriteError) {