From 5a2200b280034424141dde84f044d828546eb39b Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Sun, 1 Mar 2026 21:42:22 -0800 Subject: [PATCH] fix(sessions): harden recycled PID lock recovery follow-up (#31320) * fix: detect PID recycling in session write lock staleness check The session lock uses isPidAlive() to determine if a lock holder is still running. In containers, PID recycling can cause a different process to inherit the same PID, making the lock appear valid when the original holder is dead. Record the process start time (field 22 of /proc/pid/stat) in the lock file and compare it during staleness checks. If the PID is alive but its start time differs from the recorded value, the lock is treated as stale and reclaimed immediately. Backward compatible: lock files without starttime are handled with the existing PID-alive + age-based logic. Non-Linux platforms skip the starttime check entirely (getProcessStartTime returns null). * shared: harden pid starttime parsing * sessions: validate lock pid/starttime payloads * changelog: note recycled PID lock recovery fix * changelog: credit hiroki and vincent on lock recovery fix --------- Co-authored-by: HirokiKobayashi-R --- CHANGELOG.md | 1 + src/agents/session-write-lock.test.ts | 88 ++++++++++++++++++++ src/agents/session-write-lock.ts | 36 +++++++- src/shared/pid-alive.test.ts | 114 +++++++++++++++++++++++++- src/shared/pid-alive.ts | 39 ++++++++- 5 files changed, 272 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e80065f3297..a0df2f30e74 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -75,6 +75,7 @@ Docs: https://docs.openclaw.ai - Android/Gateway canvas capability refresh: send `node.canvas.capability.refresh` with object `params` (`{}`) from Android node runtime so gateway object-schema validation accepts refresh retries and A2UI host recovery works after scoped capability expiry. (#28413) Thanks @obviyus. - Gateway/Control UI origins: honor `gateway.controlUi.allowedOrigins: ["*"]` wildcard entries (including trimmed values) and lock behavior with regression tests. Landed from contributor PR #31058 by @byungsker. Thanks @byungsker. - Agents/Sessions list transcript paths: handle missing/non-string/relative `sessions.list.path` values and per-agent `{agentId}` templates when deriving `transcriptPath`, so cross-agent session listings resolve to concrete agent session files instead of workspace-relative paths. (#24775) Thanks @martinfrancois. +- Sessions/Lock recovery: detect recycled Linux PIDs by comparing lock-file `starttime` with `/proc//stat` starttime, so stale `.jsonl.lock` files are reclaimed immediately in containerized PID-reuse scenarios while preserving compatibility for older lock files. (#26443) Fixes #27252. Thanks @HirokiKobayashi-R and @vincentkoc. - Gateway/Control UI CSP: allow required Google Fonts origins in Control UI CSP. (#29279) Thanks @Glucksberg and @vincentkoc. - CLI/Install: add an npm-link fallback to fix CLI startup `Permission denied` failures (`exit 127`) on affected installs. (#17151) Thanks @sskyu and @vincentkoc. - Onboarding/Custom providers: improve verification reliability for slower local endpoints (for example Ollama) during setup. (#27380) Thanks @Sid-Qin. diff --git a/src/agents/session-write-lock.test.ts b/src/agents/session-write-lock.test.ts index 4bef8a5194a..3c1b52e8be9 100644 --- a/src/agents/session-write-lock.test.ts +++ b/src/agents/session-write-lock.test.ts @@ -2,6 +2,18 @@ import fs from "node:fs/promises"; import os from "node:os"; import path from "node:path"; import { describe, expect, it, vi } from "vitest"; + +// Mock getProcessStartTime so PID-recycling detection works on non-Linux +// (macOS, CI runners). isPidAlive is left unmocked. +const FAKE_STARTTIME = 12345; +vi.mock("../shared/pid-alive.js", async (importOriginal) => { + const original = await importOriginal(); + return { + ...original, + getProcessStartTime: (pid: number) => (pid === process.pid ? FAKE_STARTTIME : null), + }; +}); + import { __testing, acquireSessionWriteLock, @@ -255,6 +267,82 @@ describe("acquireSessionWriteLock", () => { } }); + it("reclaims lock files with recycled PIDs", async () => { + const root = await fs.mkdtemp(path.join(os.tmpdir(), "openclaw-lock-")); + try { + const sessionFile = path.join(root, "sessions.json"); + const lockPath = `${sessionFile}.lock`; + // Write a lock with a live PID (current process) but a wrong starttime, + // simulating PID recycling: the PID is alive but belongs to a different + // process than the one that created the lock. + await fs.writeFile( + lockPath, + JSON.stringify({ + pid: process.pid, + createdAt: new Date().toISOString(), + starttime: 999_999_999, + }), + "utf8", + ); + + const lock = await acquireSessionWriteLock({ sessionFile, timeoutMs: 500 }); + const raw = await fs.readFile(lockPath, "utf8"); + const payload = JSON.parse(raw) as { pid: number }; + + expect(payload.pid).toBe(process.pid); + await lock.release(); + } finally { + await fs.rm(root, { recursive: true, force: true }); + } + }); + + it("does not reclaim lock files without starttime (backward compat)", async () => { + const root = await fs.mkdtemp(path.join(os.tmpdir(), "openclaw-lock-")); + try { + const sessionFile = path.join(root, "sessions.json"); + const lockPath = `${sessionFile}.lock`; + // Old-format lock without starttime — should NOT be reclaimed just because + // starttime is missing. The PID is alive, so the lock is valid. + await fs.writeFile( + lockPath, + JSON.stringify({ + pid: process.pid, + createdAt: new Date().toISOString(), + }), + "utf8", + ); + + await expect(acquireSessionWriteLock({ sessionFile, timeoutMs: 50 })).rejects.toThrow( + /session file locked/, + ); + } finally { + await fs.rm(root, { recursive: true, force: true }); + } + }); + + it("does not treat malformed starttime as recycled", async () => { + const root = await fs.mkdtemp(path.join(os.tmpdir(), "openclaw-lock-")); + try { + const sessionFile = path.join(root, "sessions.json"); + const lockPath = `${sessionFile}.lock`; + await fs.writeFile( + lockPath, + JSON.stringify({ + pid: process.pid, + createdAt: new Date().toISOString(), + starttime: 123.5, + }), + "utf8", + ); + + await expect(acquireSessionWriteLock({ sessionFile, timeoutMs: 50 })).rejects.toThrow( + /session file locked/, + ); + } finally { + await fs.rm(root, { recursive: true, force: true }); + } + }); + it("registers cleanup for SIGQUIT and SIGABRT", () => { expect(__testing.cleanupSignals).toContain("SIGQUIT"); expect(__testing.cleanupSignals).toContain("SIGABRT"); diff --git a/src/agents/session-write-lock.ts b/src/agents/session-write-lock.ts index 5b030430ec9..837a7ada36b 100644 --- a/src/agents/session-write-lock.ts +++ b/src/agents/session-write-lock.ts @@ -1,14 +1,20 @@ import fsSync from "node:fs"; import fs from "node:fs/promises"; import path from "node:path"; -import { isPidAlive } from "../shared/pid-alive.js"; +import { getProcessStartTime, isPidAlive } from "../shared/pid-alive.js"; import { resolveProcessScopedMap } from "../shared/process-scoped-map.js"; type LockFilePayload = { pid?: number; createdAt?: string; + /** Process start time in clock ticks (from /proc/pid/stat field 22). */ + starttime?: number; }; +function isValidLockNumber(value: unknown): value is number { + return typeof value === "number" && Number.isInteger(value) && value >= 0; +} + type HeldLock = { count: number; handle: fs.FileHandle; @@ -270,12 +276,15 @@ async function readLockPayload(lockPath: string): Promise; const payload: LockFilePayload = {}; - if (typeof parsed.pid === "number") { + if (isValidLockNumber(parsed.pid) && parsed.pid > 0) { payload.pid = parsed.pid; } if (typeof parsed.createdAt === "string") { payload.createdAt = parsed.createdAt; } + if (isValidLockNumber(parsed.starttime)) { + payload.starttime = parsed.starttime; + } return payload; } catch { return null; @@ -287,17 +296,31 @@ function inspectLockPayload( staleMs: number, nowMs: number, ): LockInspectionDetails { - const pid = typeof payload?.pid === "number" ? payload.pid : null; + const pid = isValidLockNumber(payload?.pid) && payload.pid > 0 ? payload.pid : null; const pidAlive = pid !== null ? isPidAlive(pid) : false; const createdAt = typeof payload?.createdAt === "string" ? payload.createdAt : null; const createdAtMs = createdAt ? Date.parse(createdAt) : Number.NaN; const ageMs = Number.isFinite(createdAtMs) ? Math.max(0, nowMs - createdAtMs) : null; + // Detect PID recycling: if the PID is alive but its start time differs from + // what was recorded in the lock file, the original process died and the OS + // reassigned the same PID to a different process. + const storedStarttime = isValidLockNumber(payload?.starttime) ? payload.starttime : null; + const pidRecycled = + pidAlive && pid !== null && storedStarttime !== null + ? (() => { + const currentStarttime = getProcessStartTime(pid); + return currentStarttime !== null && currentStarttime !== storedStarttime; + })() + : false; + const staleReasons: string[] = []; if (pid === null) { staleReasons.push("missing-pid"); } else if (!pidAlive) { staleReasons.push("dead-pid"); + } else if (pidRecycled) { + staleReasons.push("recycled-pid"); } if (ageMs === null) { staleReasons.push("invalid-createdAt"); @@ -447,7 +470,12 @@ export async function acquireSessionWriteLock(params: { try { handle = await fs.open(lockPath, "wx"); const createdAt = new Date().toISOString(); - await handle.writeFile(JSON.stringify({ pid: process.pid, createdAt }, null, 2), "utf8"); + const starttime = getProcessStartTime(process.pid); + const lockPayload: LockFilePayload = { pid: process.pid, createdAt }; + if (starttime !== null) { + lockPayload.starttime = starttime; + } + await handle.writeFile(JSON.stringify(lockPayload, null, 2), "utf8"); const createdHeld: HeldLock = { count: 1, handle, diff --git a/src/shared/pid-alive.test.ts b/src/shared/pid-alive.test.ts index 862101bb7be..1edafa77cab 100644 --- a/src/shared/pid-alive.test.ts +++ b/src/shared/pid-alive.test.ts @@ -1,6 +1,6 @@ import fsSync from "node:fs"; import { describe, expect, it, vi } from "vitest"; -import { isPidAlive } from "./pid-alive.js"; +import { getProcessStartTime, isPidAlive } from "./pid-alive.js"; describe("isPidAlive", () => { it("returns true for the current running process", () => { @@ -14,6 +14,7 @@ describe("isPidAlive", () => { it("returns false for invalid PIDs", () => { expect(isPidAlive(0)).toBe(false); expect(isPidAlive(-1)).toBe(false); + expect(isPidAlive(1.5)).toBe(false); expect(isPidAlive(Number.NaN)).toBe(false); expect(isPidAlive(Number.POSITIVE_INFINITY)).toBe(false); }); @@ -51,3 +52,114 @@ describe("isPidAlive", () => { } }); }); + +describe("getProcessStartTime", () => { + it("returns a number on Linux for the current process", async () => { + const originalPlatformDescriptor = Object.getOwnPropertyDescriptor(process, "platform"); + if (!originalPlatformDescriptor) { + throw new Error("missing process.platform descriptor"); + } + + const originalReadFileSync = fsSync.readFileSync; + // Simulate a realistic /proc//stat line + const fakeStat = `${process.pid} (node) S 1 ${process.pid} ${process.pid} 0 -1 4194304 12345 0 0 0 100 50 0 0 20 0 8 0 98765 123456789 5000 18446744073709551615 0 0 0 0 0 0 0 0 0 0 0 0 17 0 0 0 0 0 0`; + vi.spyOn(fsSync, "readFileSync").mockImplementation((filePath, encoding) => { + if (filePath === `/proc/${process.pid}/stat`) { + return fakeStat; + } + return originalReadFileSync(filePath as never, encoding as never) as never; + }); + + Object.defineProperty(process, "platform", { + ...originalPlatformDescriptor, + value: "linux", + }); + + try { + vi.resetModules(); + const { getProcessStartTime: fresh } = await import("./pid-alive.js"); + const starttime = fresh(process.pid); + expect(starttime).toBe(98765); + } finally { + Object.defineProperty(process, "platform", originalPlatformDescriptor); + vi.restoreAllMocks(); + } + }); + + it("returns null on non-Linux platforms", () => { + if (process.platform === "linux") { + // On actual Linux, this test is trivially satisfied by the other tests. + expect(true).toBe(true); + return; + } + expect(getProcessStartTime(process.pid)).toBeNull(); + }); + + it("returns null for invalid PIDs", () => { + expect(getProcessStartTime(0)).toBeNull(); + expect(getProcessStartTime(-1)).toBeNull(); + expect(getProcessStartTime(1.5)).toBeNull(); + expect(getProcessStartTime(Number.NaN)).toBeNull(); + expect(getProcessStartTime(Number.POSITIVE_INFINITY)).toBeNull(); + }); + + it("returns null for malformed /proc stat content", async () => { + const originalPlatformDescriptor = Object.getOwnPropertyDescriptor(process, "platform"); + if (!originalPlatformDescriptor) { + throw new Error("missing process.platform descriptor"); + } + + const originalReadFileSync = fsSync.readFileSync; + vi.spyOn(fsSync, "readFileSync").mockImplementation((filePath, encoding) => { + if (filePath === "/proc/42/stat") { + return "42 node S malformed"; + } + return originalReadFileSync(filePath as never, encoding as never) as never; + }); + + Object.defineProperty(process, "platform", { + ...originalPlatformDescriptor, + value: "linux", + }); + + try { + vi.resetModules(); + const { getProcessStartTime: fresh } = await import("./pid-alive.js"); + expect(fresh(42)).toBeNull(); + } finally { + Object.defineProperty(process, "platform", originalPlatformDescriptor); + vi.restoreAllMocks(); + } + }); + + it("handles comm fields containing spaces and parentheses", async () => { + const originalPlatformDescriptor = Object.getOwnPropertyDescriptor(process, "platform"); + if (!originalPlatformDescriptor) { + throw new Error("missing process.platform descriptor"); + } + + const originalReadFileSync = fsSync.readFileSync; + // comm field with spaces and nested parens: "(My App (v2))" + const fakeStat = `42 (My App (v2)) S 1 42 42 0 -1 4194304 0 0 0 0 0 0 0 0 20 0 1 0 55555 0 0 0 0 0 0 0 0 0 0 0 0 0 17 0 0 0 0 0 0`; + vi.spyOn(fsSync, "readFileSync").mockImplementation((filePath, encoding) => { + if (filePath === "/proc/42/stat") { + return fakeStat; + } + return originalReadFileSync(filePath as never, encoding as never) as never; + }); + + Object.defineProperty(process, "platform", { + ...originalPlatformDescriptor, + value: "linux", + }); + + try { + vi.resetModules(); + const { getProcessStartTime: fresh } = await import("./pid-alive.js"); + expect(fresh(42)).toBe(55555); + } finally { + Object.defineProperty(process, "platform", originalPlatformDescriptor); + vi.restoreAllMocks(); + } + }); +}); diff --git a/src/shared/pid-alive.ts b/src/shared/pid-alive.ts index d3aeaaf6f43..522566fb3fd 100644 --- a/src/shared/pid-alive.ts +++ b/src/shared/pid-alive.ts @@ -1,5 +1,9 @@ import fsSync from "node:fs"; +function isValidPid(pid: number): boolean { + return Number.isInteger(pid) && pid > 0; +} + /** * Check if a process is a zombie on Linux by reading /proc//status. * Returns false on non-Linux platforms or if the proc file can't be read. @@ -18,7 +22,7 @@ function isZombieProcess(pid: number): boolean { } export function isPidAlive(pid: number): boolean { - if (!Number.isFinite(pid) || pid <= 0) { + if (!isValidPid(pid)) { return false; } try { @@ -31,3 +35,36 @@ export function isPidAlive(pid: number): boolean { } return true; } + +/** + * Read the process start time (field 22 "starttime") from /proc//stat. + * Returns the value in clock ticks since system boot, or null on non-Linux + * platforms or if the proc file can't be read. + * + * This is used to detect PID recycling: if two readings for the same PID + * return different starttimes, the PID has been reused by a different process. + */ +export function getProcessStartTime(pid: number): number | null { + if (process.platform !== "linux") { + return null; + } + if (!isValidPid(pid)) { + return null; + } + try { + const stat = fsSync.readFileSync(`/proc/${pid}/stat`, "utf8"); + const commEndIndex = stat.lastIndexOf(")"); + if (commEndIndex < 0) { + return null; + } + // The comm field (field 2) is wrapped in parens and can contain spaces, + // so split after the last ")" to get fields 3..N reliably. + const afterComm = stat.slice(commEndIndex + 1).trimStart(); + const fields = afterComm.split(/\s+/); + // field 22 (starttime) = index 19 after the comm-split (field 3 is index 0). + const starttime = Number(fields[19]); + return Number.isInteger(starttime) && starttime >= 0 ? starttime : null; + } catch { + return null; + } +}