mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-06 05:40:44 +00:00
fix(plugins): disambiguate runtime-deps lock owners by process start-time
`shouldRemoveRuntimeDepsLock` previously trusted `isAlive(owner.pid)` alone when deciding whether a lock could be reclaimed. That works fine on a normal host: when the writer dies the PID is gone and `isAlive` returns false. Inside Docker it does not — every Node gateway process runs as PID 1 (or PID 7 with `init: true`) in its container PID namespace, so a stale lock left behind by a previous incarnation looks "alive" to the new one. The 5-minute lock-wait timeout then fires and the supervisor restarts, and the cycle repeats indefinitely. Operators have to manually remove `.openclaw-runtime-deps.lock` to recover. This change records `pidStartTimeMs` alongside `pid` and `createdAtMs` when the lock is acquired, and consults it in the staleness check. When both sides have start-time evidence and they disagree, the lock is treated as stale; otherwise the existing PID-alive-means-fresh behavior is preserved exactly. The capture point uses `Date.now() - process.uptime() * 1000` once at module load, and the read side uses `/proc/<pid>/stat` field 22 on Linux (returning null elsewhere so legacy semantics still apply on macOS/Windows hosts). This is strictly additive on the wire format and the predicate: existing lock files without `pidStartTimeMs` continue to take the same code path they did before, and platforms that cannot resolve a live PID's start-time fall back to the same legacy behavior. Refs #74346.
This commit is contained in:
committed by
Peter Steinberger
parent
3f0039e2ea
commit
2d885a2402
@@ -2126,6 +2126,51 @@ describe("ensureBundledPluginRuntimeDeps", () => {
|
||||
).toBe(true);
|
||||
});
|
||||
|
||||
it("treats a PID-alive lock with matching pidStartTimeMs as held by the same incarnation", () => {
|
||||
expect(
|
||||
bundledRuntimeDepsTesting.shouldRemoveRuntimeDepsLock(
|
||||
{ pid: 7, pidStartTimeMs: 1_000, createdAtMs: 2_000 },
|
||||
2_500,
|
||||
() => true,
|
||||
// Live PID's start-time matches owner.pidStartTimeMs → same process
|
||||
() => 1_000,
|
||||
),
|
||||
).toBe(false);
|
||||
});
|
||||
|
||||
it("expires a PID-alive lock when the live PID's start-time differs (Docker PID reuse)", () => {
|
||||
// Models the failure mode that motivated this change: inside a container
|
||||
// the gateway is always PID 1 (or PID 7 with `init: true`), so a stale
|
||||
// lock from a previous incarnation looks "alive" if we only consult
|
||||
// isProcessAlive. Capturing the writer's start-time and comparing it to
|
||||
// the live PID's start-time disambiguates incarnations.
|
||||
expect(
|
||||
bundledRuntimeDepsTesting.shouldRemoveRuntimeDepsLock(
|
||||
{ pid: 7, pidStartTimeMs: 1_000, createdAtMs: 2_000 },
|
||||
2_500,
|
||||
() => true,
|
||||
// Same PID, but a different incarnation started later → stale.
|
||||
() => 9_000,
|
||||
),
|
||||
).toBe(true);
|
||||
});
|
||||
|
||||
it("treats a PID-alive lock as fresh when start-time evidence cannot be read", () => {
|
||||
// Defensive: when readProcessStartTimeMs returns null (legacy lock with
|
||||
// no pidStartTimeMs, or a platform that does not expose it) we keep the
|
||||
// pre-existing behavior of trusting isAlive(pid). The only verified
|
||||
// disambiguation path is start-time evidence on both sides; without it
|
||||
// we err toward "still held" rather than risk stomping a real install.
|
||||
expect(
|
||||
bundledRuntimeDepsTesting.shouldRemoveRuntimeDepsLock(
|
||||
{ pid: 7, pidStartTimeMs: 1_000, createdAtMs: 0 },
|
||||
Number.MAX_SAFE_INTEGER,
|
||||
() => true,
|
||||
() => null,
|
||||
),
|
||||
).toBe(false);
|
||||
});
|
||||
|
||||
it("does not expire fresh ownerless runtime-deps install locks", () => {
|
||||
expect(
|
||||
bundledRuntimeDepsTesting.shouldRemoveRuntimeDepsLock(
|
||||
|
||||
@@ -456,8 +456,65 @@ function isProcessAlive(pid: number): boolean {
|
||||
}
|
||||
}
|
||||
|
||||
// Approximate epoch-ms when the current process started, captured once per
|
||||
// Node lifetime so the lock owner record carries a value that is stable
|
||||
// across `Date.now()` clock skew within the running process.
|
||||
const CURRENT_PROCESS_START_TIME_MS = Math.round(Date.now() - process.uptime() * 1000);
|
||||
|
||||
// Approximate epoch-ms when `pid` started running, or null when the platform
|
||||
// or runtime cannot determine it. Used to distinguish two different process
|
||||
// incarnations that happen to share the same numeric PID — most notably
|
||||
// inside Docker containers, where the gateway is always PID 1 (or PID 7
|
||||
// with `init: true`) in its container PID namespace, so a stale lock left
|
||||
// by a previous incarnation looks live to the new one if we only consult
|
||||
// `isAlive(pid)`.
|
||||
function readProcessStartTimeMs(pid: number): number | null {
|
||||
if (!Number.isInteger(pid) || pid <= 0) {
|
||||
return null;
|
||||
}
|
||||
if (process.platform !== "linux") {
|
||||
return null;
|
||||
}
|
||||
let stat: string;
|
||||
try {
|
||||
stat = fs.readFileSync(`/proc/${pid}/stat`, "utf8");
|
||||
} catch {
|
||||
return null;
|
||||
}
|
||||
// /proc/<pid>/stat: PID (comm) state ppid pgrp ... starttime ... where
|
||||
// `comm` may contain spaces and parens, so we anchor on the LAST `)` and
|
||||
// count whitespace-separated fields after it. Field index 22 (1-based) is
|
||||
// starttime, expressed in clock ticks since boot. After the `)` the next
|
||||
// field is `state` (1), then ppid (2), pgrp (3), session (4), tty_nr (5),
|
||||
// tpgid (6), flags (7), minflt (8), cminflt (9), majflt (10), cmajflt
|
||||
// (11), utime (12), stime (13), cutime (14), cstime (15), priority (16),
|
||||
// nice (17), num_threads (18), itrealvalue (19), starttime (20).
|
||||
const lastParen = stat.lastIndexOf(")");
|
||||
if (lastParen === -1) {
|
||||
return null;
|
||||
}
|
||||
const tail = stat
|
||||
.slice(lastParen + 1)
|
||||
.trim()
|
||||
.split(/\s+/);
|
||||
// tail[0] is `state`; starttime is therefore tail[19].
|
||||
const starttimeJiffies = Number(tail[19]);
|
||||
if (!Number.isFinite(starttimeJiffies)) {
|
||||
return null;
|
||||
}
|
||||
// sysconf(_SC_CLK_TCK) is universally 100 on Linux x86/arm targets. We
|
||||
// hard-code that here rather than spawning getconf — the exact tick rate
|
||||
// does not matter for our purposes as long as it is stable: the starttime
|
||||
// we capture for our own process and the starttime we read for the
|
||||
// lock-owner PID are converted with the same constant, so equality holds.
|
||||
const clockTicksPerSecond = 100;
|
||||
const bootTimeMs = Date.now() - os.uptime() * 1000;
|
||||
return Math.round(bootTimeMs + (starttimeJiffies * 1000) / clockTicksPerSecond);
|
||||
}
|
||||
|
||||
type RuntimeDepsLockOwner = {
|
||||
pid?: number;
|
||||
pidStartTimeMs?: number;
|
||||
createdAtMs?: number;
|
||||
ownerFileState: "ok" | "missing" | "invalid";
|
||||
ownerFilePath: string;
|
||||
@@ -501,6 +558,7 @@ function readRuntimeDepsLockOwner(lockDir: string): RuntimeDepsLockOwner {
|
||||
}
|
||||
return {
|
||||
pid: typeof owner?.pid === "number" ? owner.pid : undefined,
|
||||
pidStartTimeMs: typeof owner?.pidStartTimeMs === "number" ? owner.pidStartTimeMs : undefined,
|
||||
createdAtMs: typeof owner?.createdAtMs === "number" ? owner.createdAtMs : undefined,
|
||||
ownerFileState,
|
||||
ownerFilePath,
|
||||
@@ -524,12 +582,32 @@ function latestFiniteMs(values: readonly (number | undefined)[]): number | undef
|
||||
}
|
||||
|
||||
function shouldRemoveRuntimeDepsLock(
|
||||
owner: Pick<RuntimeDepsLockOwner, "pid" | "createdAtMs" | "lockDirMtimeMs" | "ownerFileMtimeMs">,
|
||||
owner: Pick<
|
||||
RuntimeDepsLockOwner,
|
||||
"pid" | "pidStartTimeMs" | "createdAtMs" | "lockDirMtimeMs" | "ownerFileMtimeMs"
|
||||
>,
|
||||
nowMs: number,
|
||||
isAlive: (pid: number) => boolean = isProcessAlive,
|
||||
readStartTimeMs: (pid: number) => number | null = readProcessStartTimeMs,
|
||||
): boolean {
|
||||
if (typeof owner.pid === "number") {
|
||||
return !isAlive(owner.pid);
|
||||
if (!isAlive(owner.pid)) {
|
||||
return true;
|
||||
}
|
||||
// PID is alive — but inside Docker the new process can share the same
|
||||
// PID as the dead writer. If we recorded the writer's start-time and we
|
||||
// can read the live PID's start-time, mismatch means a different
|
||||
// incarnation owns this PID now and the lock is stale. When start-time
|
||||
// evidence is unavailable on either side, fall through to the existing
|
||||
// PID-alive-means-fresh behavior so legacy locks keep working as
|
||||
// before.
|
||||
if (typeof owner.pidStartTimeMs === "number") {
|
||||
const liveStartTimeMs = readStartTimeMs(owner.pid);
|
||||
if (liveStartTimeMs !== null && liveStartTimeMs !== owner.pidStartTimeMs) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
if (typeof owner.createdAtMs === "number") {
|
||||
@@ -614,7 +692,15 @@ export function withBundledRuntimeDepsFilesystemLock<T>(
|
||||
try {
|
||||
fs.writeFileSync(
|
||||
path.join(lockDir, BUNDLED_RUNTIME_DEPS_LOCK_OWNER_FILE),
|
||||
`${JSON.stringify({ pid: process.pid, createdAtMs: Date.now() }, null, 2)}\n`,
|
||||
`${JSON.stringify(
|
||||
{
|
||||
pid: process.pid,
|
||||
pidStartTimeMs: CURRENT_PROCESS_START_TIME_MS,
|
||||
createdAtMs: Date.now(),
|
||||
},
|
||||
null,
|
||||
2,
|
||||
)}\n`,
|
||||
"utf8",
|
||||
);
|
||||
} catch (ownerWriteError) {
|
||||
@@ -2387,7 +2473,15 @@ async function withBundledRuntimeDepsInstallRootLockAsync<T>(
|
||||
try {
|
||||
fs.writeFileSync(
|
||||
path.join(lockDir, BUNDLED_RUNTIME_DEPS_LOCK_OWNER_FILE),
|
||||
`${JSON.stringify({ pid: process.pid, createdAtMs: Date.now() }, null, 2)}\n`,
|
||||
`${JSON.stringify(
|
||||
{
|
||||
pid: process.pid,
|
||||
pidStartTimeMs: CURRENT_PROCESS_START_TIME_MS,
|
||||
createdAtMs: Date.now(),
|
||||
},
|
||||
null,
|
||||
2,
|
||||
)}\n`,
|
||||
"utf8",
|
||||
);
|
||||
} catch (ownerWriteError) {
|
||||
|
||||
Reference in New Issue
Block a user