mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-03 13:20:22 +00:00
fix(sessions): harden recycled PID lock recovery follow-up (#31320)
* fix: detect PID recycling in session write lock staleness check The session lock uses isPidAlive() to determine if a lock holder is still running. In containers, PID recycling can cause a different process to inherit the same PID, making the lock appear valid when the original holder is dead. Record the process start time (field 22 of /proc/pid/stat) in the lock file and compare it during staleness checks. If the PID is alive but its start time differs from the recorded value, the lock is treated as stale and reclaimed immediately. Backward compatible: lock files without starttime are handled with the existing PID-alive + age-based logic. Non-Linux platforms skip the starttime check entirely (getProcessStartTime returns null). * shared: harden pid starttime parsing * sessions: validate lock pid/starttime payloads * changelog: note recycled PID lock recovery fix * changelog: credit hiroki and vincent on lock recovery fix --------- Co-authored-by: HirokiKobayashi-R <hiroki@rhems-japan.co.jp>
This commit is contained in:
@@ -2,6 +2,18 @@ import fs from "node:fs/promises";
|
||||
import os from "node:os";
|
||||
import path from "node:path";
|
||||
import { describe, expect, it, vi } from "vitest";
|
||||
|
||||
// Mock getProcessStartTime so PID-recycling detection works on non-Linux
|
||||
// (macOS, CI runners). isPidAlive is left unmocked.
|
||||
const FAKE_STARTTIME = 12345;
|
||||
vi.mock("../shared/pid-alive.js", async (importOriginal) => {
|
||||
const original = await importOriginal<typeof import("../shared/pid-alive.js")>();
|
||||
return {
|
||||
...original,
|
||||
getProcessStartTime: (pid: number) => (pid === process.pid ? FAKE_STARTTIME : null),
|
||||
};
|
||||
});
|
||||
|
||||
import {
|
||||
__testing,
|
||||
acquireSessionWriteLock,
|
||||
@@ -255,6 +267,82 @@ describe("acquireSessionWriteLock", () => {
|
||||
}
|
||||
});
|
||||
|
||||
it("reclaims lock files with recycled PIDs", async () => {
|
||||
const root = await fs.mkdtemp(path.join(os.tmpdir(), "openclaw-lock-"));
|
||||
try {
|
||||
const sessionFile = path.join(root, "sessions.json");
|
||||
const lockPath = `${sessionFile}.lock`;
|
||||
// Write a lock with a live PID (current process) but a wrong starttime,
|
||||
// simulating PID recycling: the PID is alive but belongs to a different
|
||||
// process than the one that created the lock.
|
||||
await fs.writeFile(
|
||||
lockPath,
|
||||
JSON.stringify({
|
||||
pid: process.pid,
|
||||
createdAt: new Date().toISOString(),
|
||||
starttime: 999_999_999,
|
||||
}),
|
||||
"utf8",
|
||||
);
|
||||
|
||||
const lock = await acquireSessionWriteLock({ sessionFile, timeoutMs: 500 });
|
||||
const raw = await fs.readFile(lockPath, "utf8");
|
||||
const payload = JSON.parse(raw) as { pid: number };
|
||||
|
||||
expect(payload.pid).toBe(process.pid);
|
||||
await lock.release();
|
||||
} finally {
|
||||
await fs.rm(root, { recursive: true, force: true });
|
||||
}
|
||||
});
|
||||
|
||||
it("does not reclaim lock files without starttime (backward compat)", async () => {
|
||||
const root = await fs.mkdtemp(path.join(os.tmpdir(), "openclaw-lock-"));
|
||||
try {
|
||||
const sessionFile = path.join(root, "sessions.json");
|
||||
const lockPath = `${sessionFile}.lock`;
|
||||
// Old-format lock without starttime — should NOT be reclaimed just because
|
||||
// starttime is missing. The PID is alive, so the lock is valid.
|
||||
await fs.writeFile(
|
||||
lockPath,
|
||||
JSON.stringify({
|
||||
pid: process.pid,
|
||||
createdAt: new Date().toISOString(),
|
||||
}),
|
||||
"utf8",
|
||||
);
|
||||
|
||||
await expect(acquireSessionWriteLock({ sessionFile, timeoutMs: 50 })).rejects.toThrow(
|
||||
/session file locked/,
|
||||
);
|
||||
} finally {
|
||||
await fs.rm(root, { recursive: true, force: true });
|
||||
}
|
||||
});
|
||||
|
||||
it("does not treat malformed starttime as recycled", async () => {
|
||||
const root = await fs.mkdtemp(path.join(os.tmpdir(), "openclaw-lock-"));
|
||||
try {
|
||||
const sessionFile = path.join(root, "sessions.json");
|
||||
const lockPath = `${sessionFile}.lock`;
|
||||
await fs.writeFile(
|
||||
lockPath,
|
||||
JSON.stringify({
|
||||
pid: process.pid,
|
||||
createdAt: new Date().toISOString(),
|
||||
starttime: 123.5,
|
||||
}),
|
||||
"utf8",
|
||||
);
|
||||
|
||||
await expect(acquireSessionWriteLock({ sessionFile, timeoutMs: 50 })).rejects.toThrow(
|
||||
/session file locked/,
|
||||
);
|
||||
} finally {
|
||||
await fs.rm(root, { recursive: true, force: true });
|
||||
}
|
||||
});
|
||||
|
||||
it("registers cleanup for SIGQUIT and SIGABRT", () => {
|
||||
expect(__testing.cleanupSignals).toContain("SIGQUIT");
|
||||
expect(__testing.cleanupSignals).toContain("SIGABRT");
|
||||
|
||||
@@ -1,14 +1,20 @@
|
||||
import fsSync from "node:fs";
|
||||
import fs from "node:fs/promises";
|
||||
import path from "node:path";
|
||||
import { isPidAlive } from "../shared/pid-alive.js";
|
||||
import { getProcessStartTime, isPidAlive } from "../shared/pid-alive.js";
|
||||
import { resolveProcessScopedMap } from "../shared/process-scoped-map.js";
|
||||
|
||||
type LockFilePayload = {
|
||||
pid?: number;
|
||||
createdAt?: string;
|
||||
/** Process start time in clock ticks (from /proc/pid/stat field 22). */
|
||||
starttime?: number;
|
||||
};
|
||||
|
||||
function isValidLockNumber(value: unknown): value is number {
|
||||
return typeof value === "number" && Number.isInteger(value) && value >= 0;
|
||||
}
|
||||
|
||||
type HeldLock = {
|
||||
count: number;
|
||||
handle: fs.FileHandle;
|
||||
@@ -270,12 +276,15 @@ async function readLockPayload(lockPath: string): Promise<LockFilePayload | null
|
||||
const raw = await fs.readFile(lockPath, "utf8");
|
||||
const parsed = JSON.parse(raw) as Record<string, unknown>;
|
||||
const payload: LockFilePayload = {};
|
||||
if (typeof parsed.pid === "number") {
|
||||
if (isValidLockNumber(parsed.pid) && parsed.pid > 0) {
|
||||
payload.pid = parsed.pid;
|
||||
}
|
||||
if (typeof parsed.createdAt === "string") {
|
||||
payload.createdAt = parsed.createdAt;
|
||||
}
|
||||
if (isValidLockNumber(parsed.starttime)) {
|
||||
payload.starttime = parsed.starttime;
|
||||
}
|
||||
return payload;
|
||||
} catch {
|
||||
return null;
|
||||
@@ -287,17 +296,31 @@ function inspectLockPayload(
|
||||
staleMs: number,
|
||||
nowMs: number,
|
||||
): LockInspectionDetails {
|
||||
const pid = typeof payload?.pid === "number" ? payload.pid : null;
|
||||
const pid = isValidLockNumber(payload?.pid) && payload.pid > 0 ? payload.pid : null;
|
||||
const pidAlive = pid !== null ? isPidAlive(pid) : false;
|
||||
const createdAt = typeof payload?.createdAt === "string" ? payload.createdAt : null;
|
||||
const createdAtMs = createdAt ? Date.parse(createdAt) : Number.NaN;
|
||||
const ageMs = Number.isFinite(createdAtMs) ? Math.max(0, nowMs - createdAtMs) : null;
|
||||
|
||||
// Detect PID recycling: if the PID is alive but its start time differs from
|
||||
// what was recorded in the lock file, the original process died and the OS
|
||||
// reassigned the same PID to a different process.
|
||||
const storedStarttime = isValidLockNumber(payload?.starttime) ? payload.starttime : null;
|
||||
const pidRecycled =
|
||||
pidAlive && pid !== null && storedStarttime !== null
|
||||
? (() => {
|
||||
const currentStarttime = getProcessStartTime(pid);
|
||||
return currentStarttime !== null && currentStarttime !== storedStarttime;
|
||||
})()
|
||||
: false;
|
||||
|
||||
const staleReasons: string[] = [];
|
||||
if (pid === null) {
|
||||
staleReasons.push("missing-pid");
|
||||
} else if (!pidAlive) {
|
||||
staleReasons.push("dead-pid");
|
||||
} else if (pidRecycled) {
|
||||
staleReasons.push("recycled-pid");
|
||||
}
|
||||
if (ageMs === null) {
|
||||
staleReasons.push("invalid-createdAt");
|
||||
@@ -447,7 +470,12 @@ export async function acquireSessionWriteLock(params: {
|
||||
try {
|
||||
handle = await fs.open(lockPath, "wx");
|
||||
const createdAt = new Date().toISOString();
|
||||
await handle.writeFile(JSON.stringify({ pid: process.pid, createdAt }, null, 2), "utf8");
|
||||
const starttime = getProcessStartTime(process.pid);
|
||||
const lockPayload: LockFilePayload = { pid: process.pid, createdAt };
|
||||
if (starttime !== null) {
|
||||
lockPayload.starttime = starttime;
|
||||
}
|
||||
await handle.writeFile(JSON.stringify(lockPayload, null, 2), "utf8");
|
||||
const createdHeld: HeldLock = {
|
||||
count: 1,
|
||||
handle,
|
||||
|
||||
Reference in New Issue
Block a user