diff --git a/CHANGELOG.md b/CHANGELOG.md index e80065f3297..a0df2f30e74 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -75,6 +75,7 @@ Docs: https://docs.openclaw.ai - Android/Gateway canvas capability refresh: send `node.canvas.capability.refresh` with object `params` (`{}`) from Android node runtime so gateway object-schema validation accepts refresh retries and A2UI host recovery works after scoped capability expiry. (#28413) Thanks @obviyus. - Gateway/Control UI origins: honor `gateway.controlUi.allowedOrigins: ["*"]` wildcard entries (including trimmed values) and lock behavior with regression tests. Landed from contributor PR #31058 by @byungsker. Thanks @byungsker. - Agents/Sessions list transcript paths: handle missing/non-string/relative `sessions.list.path` values and per-agent `{agentId}` templates when deriving `transcriptPath`, so cross-agent session listings resolve to concrete agent session files instead of workspace-relative paths. (#24775) Thanks @martinfrancois. +- Sessions/Lock recovery: detect recycled Linux PIDs by comparing lock-file `starttime` with `/proc//stat` starttime, so stale `.jsonl.lock` files are reclaimed immediately in containerized PID-reuse scenarios while preserving compatibility for older lock files. (#26443) Fixes #27252. Thanks @HirokiKobayashi-R and @vincentkoc. - Gateway/Control UI CSP: allow required Google Fonts origins in Control UI CSP. (#29279) Thanks @Glucksberg and @vincentkoc. - CLI/Install: add an npm-link fallback to fix CLI startup `Permission denied` failures (`exit 127`) on affected installs. (#17151) Thanks @sskyu and @vincentkoc. - Onboarding/Custom providers: improve verification reliability for slower local endpoints (for example Ollama) during setup. (#27380) Thanks @Sid-Qin. diff --git a/src/agents/session-write-lock.test.ts b/src/agents/session-write-lock.test.ts index 4bef8a5194a..3c1b52e8be9 100644 --- a/src/agents/session-write-lock.test.ts +++ b/src/agents/session-write-lock.test.ts @@ -2,6 +2,18 @@ import fs from "node:fs/promises"; import os from "node:os"; import path from "node:path"; import { describe, expect, it, vi } from "vitest"; + +// Mock getProcessStartTime so PID-recycling detection works on non-Linux +// (macOS, CI runners). isPidAlive is left unmocked. +const FAKE_STARTTIME = 12345; +vi.mock("../shared/pid-alive.js", async (importOriginal) => { + const original = await importOriginal(); + return { + ...original, + getProcessStartTime: (pid: number) => (pid === process.pid ? FAKE_STARTTIME : null), + }; +}); + import { __testing, acquireSessionWriteLock, @@ -255,6 +267,82 @@ describe("acquireSessionWriteLock", () => { } }); + it("reclaims lock files with recycled PIDs", async () => { + const root = await fs.mkdtemp(path.join(os.tmpdir(), "openclaw-lock-")); + try { + const sessionFile = path.join(root, "sessions.json"); + const lockPath = `${sessionFile}.lock`; + // Write a lock with a live PID (current process) but a wrong starttime, + // simulating PID recycling: the PID is alive but belongs to a different + // process than the one that created the lock. + await fs.writeFile( + lockPath, + JSON.stringify({ + pid: process.pid, + createdAt: new Date().toISOString(), + starttime: 999_999_999, + }), + "utf8", + ); + + const lock = await acquireSessionWriteLock({ sessionFile, timeoutMs: 500 }); + const raw = await fs.readFile(lockPath, "utf8"); + const payload = JSON.parse(raw) as { pid: number }; + + expect(payload.pid).toBe(process.pid); + await lock.release(); + } finally { + await fs.rm(root, { recursive: true, force: true }); + } + }); + + it("does not reclaim lock files without starttime (backward compat)", async () => { + const root = await fs.mkdtemp(path.join(os.tmpdir(), "openclaw-lock-")); + try { + const sessionFile = path.join(root, "sessions.json"); + const lockPath = `${sessionFile}.lock`; + // Old-format lock without starttime — should NOT be reclaimed just because + // starttime is missing. The PID is alive, so the lock is valid. + await fs.writeFile( + lockPath, + JSON.stringify({ + pid: process.pid, + createdAt: new Date().toISOString(), + }), + "utf8", + ); + + await expect(acquireSessionWriteLock({ sessionFile, timeoutMs: 50 })).rejects.toThrow( + /session file locked/, + ); + } finally { + await fs.rm(root, { recursive: true, force: true }); + } + }); + + it("does not treat malformed starttime as recycled", async () => { + const root = await fs.mkdtemp(path.join(os.tmpdir(), "openclaw-lock-")); + try { + const sessionFile = path.join(root, "sessions.json"); + const lockPath = `${sessionFile}.lock`; + await fs.writeFile( + lockPath, + JSON.stringify({ + pid: process.pid, + createdAt: new Date().toISOString(), + starttime: 123.5, + }), + "utf8", + ); + + await expect(acquireSessionWriteLock({ sessionFile, timeoutMs: 50 })).rejects.toThrow( + /session file locked/, + ); + } finally { + await fs.rm(root, { recursive: true, force: true }); + } + }); + it("registers cleanup for SIGQUIT and SIGABRT", () => { expect(__testing.cleanupSignals).toContain("SIGQUIT"); expect(__testing.cleanupSignals).toContain("SIGABRT"); diff --git a/src/agents/session-write-lock.ts b/src/agents/session-write-lock.ts index 5b030430ec9..837a7ada36b 100644 --- a/src/agents/session-write-lock.ts +++ b/src/agents/session-write-lock.ts @@ -1,14 +1,20 @@ import fsSync from "node:fs"; import fs from "node:fs/promises"; import path from "node:path"; -import { isPidAlive } from "../shared/pid-alive.js"; +import { getProcessStartTime, isPidAlive } from "../shared/pid-alive.js"; import { resolveProcessScopedMap } from "../shared/process-scoped-map.js"; type LockFilePayload = { pid?: number; createdAt?: string; + /** Process start time in clock ticks (from /proc/pid/stat field 22). */ + starttime?: number; }; +function isValidLockNumber(value: unknown): value is number { + return typeof value === "number" && Number.isInteger(value) && value >= 0; +} + type HeldLock = { count: number; handle: fs.FileHandle; @@ -270,12 +276,15 @@ async function readLockPayload(lockPath: string): Promise; const payload: LockFilePayload = {}; - if (typeof parsed.pid === "number") { + if (isValidLockNumber(parsed.pid) && parsed.pid > 0) { payload.pid = parsed.pid; } if (typeof parsed.createdAt === "string") { payload.createdAt = parsed.createdAt; } + if (isValidLockNumber(parsed.starttime)) { + payload.starttime = parsed.starttime; + } return payload; } catch { return null; @@ -287,17 +296,31 @@ function inspectLockPayload( staleMs: number, nowMs: number, ): LockInspectionDetails { - const pid = typeof payload?.pid === "number" ? payload.pid : null; + const pid = isValidLockNumber(payload?.pid) && payload.pid > 0 ? payload.pid : null; const pidAlive = pid !== null ? isPidAlive(pid) : false; const createdAt = typeof payload?.createdAt === "string" ? payload.createdAt : null; const createdAtMs = createdAt ? Date.parse(createdAt) : Number.NaN; const ageMs = Number.isFinite(createdAtMs) ? Math.max(0, nowMs - createdAtMs) : null; + // Detect PID recycling: if the PID is alive but its start time differs from + // what was recorded in the lock file, the original process died and the OS + // reassigned the same PID to a different process. + const storedStarttime = isValidLockNumber(payload?.starttime) ? payload.starttime : null; + const pidRecycled = + pidAlive && pid !== null && storedStarttime !== null + ? (() => { + const currentStarttime = getProcessStartTime(pid); + return currentStarttime !== null && currentStarttime !== storedStarttime; + })() + : false; + const staleReasons: string[] = []; if (pid === null) { staleReasons.push("missing-pid"); } else if (!pidAlive) { staleReasons.push("dead-pid"); + } else if (pidRecycled) { + staleReasons.push("recycled-pid"); } if (ageMs === null) { staleReasons.push("invalid-createdAt"); @@ -447,7 +470,12 @@ export async function acquireSessionWriteLock(params: { try { handle = await fs.open(lockPath, "wx"); const createdAt = new Date().toISOString(); - await handle.writeFile(JSON.stringify({ pid: process.pid, createdAt }, null, 2), "utf8"); + const starttime = getProcessStartTime(process.pid); + const lockPayload: LockFilePayload = { pid: process.pid, createdAt }; + if (starttime !== null) { + lockPayload.starttime = starttime; + } + await handle.writeFile(JSON.stringify(lockPayload, null, 2), "utf8"); const createdHeld: HeldLock = { count: 1, handle, diff --git a/src/shared/pid-alive.test.ts b/src/shared/pid-alive.test.ts index 862101bb7be..1edafa77cab 100644 --- a/src/shared/pid-alive.test.ts +++ b/src/shared/pid-alive.test.ts @@ -1,6 +1,6 @@ import fsSync from "node:fs"; import { describe, expect, it, vi } from "vitest"; -import { isPidAlive } from "./pid-alive.js"; +import { getProcessStartTime, isPidAlive } from "./pid-alive.js"; describe("isPidAlive", () => { it("returns true for the current running process", () => { @@ -14,6 +14,7 @@ describe("isPidAlive", () => { it("returns false for invalid PIDs", () => { expect(isPidAlive(0)).toBe(false); expect(isPidAlive(-1)).toBe(false); + expect(isPidAlive(1.5)).toBe(false); expect(isPidAlive(Number.NaN)).toBe(false); expect(isPidAlive(Number.POSITIVE_INFINITY)).toBe(false); }); @@ -51,3 +52,114 @@ describe("isPidAlive", () => { } }); }); + +describe("getProcessStartTime", () => { + it("returns a number on Linux for the current process", async () => { + const originalPlatformDescriptor = Object.getOwnPropertyDescriptor(process, "platform"); + if (!originalPlatformDescriptor) { + throw new Error("missing process.platform descriptor"); + } + + const originalReadFileSync = fsSync.readFileSync; + // Simulate a realistic /proc//stat line + const fakeStat = `${process.pid} (node) S 1 ${process.pid} ${process.pid} 0 -1 4194304 12345 0 0 0 100 50 0 0 20 0 8 0 98765 123456789 5000 18446744073709551615 0 0 0 0 0 0 0 0 0 0 0 0 17 0 0 0 0 0 0`; + vi.spyOn(fsSync, "readFileSync").mockImplementation((filePath, encoding) => { + if (filePath === `/proc/${process.pid}/stat`) { + return fakeStat; + } + return originalReadFileSync(filePath as never, encoding as never) as never; + }); + + Object.defineProperty(process, "platform", { + ...originalPlatformDescriptor, + value: "linux", + }); + + try { + vi.resetModules(); + const { getProcessStartTime: fresh } = await import("./pid-alive.js"); + const starttime = fresh(process.pid); + expect(starttime).toBe(98765); + } finally { + Object.defineProperty(process, "platform", originalPlatformDescriptor); + vi.restoreAllMocks(); + } + }); + + it("returns null on non-Linux platforms", () => { + if (process.platform === "linux") { + // On actual Linux, this test is trivially satisfied by the other tests. + expect(true).toBe(true); + return; + } + expect(getProcessStartTime(process.pid)).toBeNull(); + }); + + it("returns null for invalid PIDs", () => { + expect(getProcessStartTime(0)).toBeNull(); + expect(getProcessStartTime(-1)).toBeNull(); + expect(getProcessStartTime(1.5)).toBeNull(); + expect(getProcessStartTime(Number.NaN)).toBeNull(); + expect(getProcessStartTime(Number.POSITIVE_INFINITY)).toBeNull(); + }); + + it("returns null for malformed /proc stat content", async () => { + const originalPlatformDescriptor = Object.getOwnPropertyDescriptor(process, "platform"); + if (!originalPlatformDescriptor) { + throw new Error("missing process.platform descriptor"); + } + + const originalReadFileSync = fsSync.readFileSync; + vi.spyOn(fsSync, "readFileSync").mockImplementation((filePath, encoding) => { + if (filePath === "/proc/42/stat") { + return "42 node S malformed"; + } + return originalReadFileSync(filePath as never, encoding as never) as never; + }); + + Object.defineProperty(process, "platform", { + ...originalPlatformDescriptor, + value: "linux", + }); + + try { + vi.resetModules(); + const { getProcessStartTime: fresh } = await import("./pid-alive.js"); + expect(fresh(42)).toBeNull(); + } finally { + Object.defineProperty(process, "platform", originalPlatformDescriptor); + vi.restoreAllMocks(); + } + }); + + it("handles comm fields containing spaces and parentheses", async () => { + const originalPlatformDescriptor = Object.getOwnPropertyDescriptor(process, "platform"); + if (!originalPlatformDescriptor) { + throw new Error("missing process.platform descriptor"); + } + + const originalReadFileSync = fsSync.readFileSync; + // comm field with spaces and nested parens: "(My App (v2))" + const fakeStat = `42 (My App (v2)) S 1 42 42 0 -1 4194304 0 0 0 0 0 0 0 0 20 0 1 0 55555 0 0 0 0 0 0 0 0 0 0 0 0 0 17 0 0 0 0 0 0`; + vi.spyOn(fsSync, "readFileSync").mockImplementation((filePath, encoding) => { + if (filePath === "/proc/42/stat") { + return fakeStat; + } + return originalReadFileSync(filePath as never, encoding as never) as never; + }); + + Object.defineProperty(process, "platform", { + ...originalPlatformDescriptor, + value: "linux", + }); + + try { + vi.resetModules(); + const { getProcessStartTime: fresh } = await import("./pid-alive.js"); + expect(fresh(42)).toBe(55555); + } finally { + Object.defineProperty(process, "platform", originalPlatformDescriptor); + vi.restoreAllMocks(); + } + }); +}); diff --git a/src/shared/pid-alive.ts b/src/shared/pid-alive.ts index d3aeaaf6f43..522566fb3fd 100644 --- a/src/shared/pid-alive.ts +++ b/src/shared/pid-alive.ts @@ -1,5 +1,9 @@ import fsSync from "node:fs"; +function isValidPid(pid: number): boolean { + return Number.isInteger(pid) && pid > 0; +} + /** * Check if a process is a zombie on Linux by reading /proc//status. * Returns false on non-Linux platforms or if the proc file can't be read. @@ -18,7 +22,7 @@ function isZombieProcess(pid: number): boolean { } export function isPidAlive(pid: number): boolean { - if (!Number.isFinite(pid) || pid <= 0) { + if (!isValidPid(pid)) { return false; } try { @@ -31,3 +35,36 @@ export function isPidAlive(pid: number): boolean { } return true; } + +/** + * Read the process start time (field 22 "starttime") from /proc//stat. + * Returns the value in clock ticks since system boot, or null on non-Linux + * platforms or if the proc file can't be read. + * + * This is used to detect PID recycling: if two readings for the same PID + * return different starttimes, the PID has been reused by a different process. + */ +export function getProcessStartTime(pid: number): number | null { + if (process.platform !== "linux") { + return null; + } + if (!isValidPid(pid)) { + return null; + } + try { + const stat = fsSync.readFileSync(`/proc/${pid}/stat`, "utf8"); + const commEndIndex = stat.lastIndexOf(")"); + if (commEndIndex < 0) { + return null; + } + // The comm field (field 2) is wrapped in parens and can contain spaces, + // so split after the last ")" to get fields 3..N reliably. + const afterComm = stat.slice(commEndIndex + 1).trimStart(); + const fields = afterComm.split(/\s+/); + // field 22 (starttime) = index 19 after the comm-split (field 3 is index 0). + const starttime = Number(fields[19]); + return Number.isInteger(starttime) && starttime >= 0 ? starttime : null; + } catch { + return null; + } +}