mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-06 18:20:44 +00:00
fix: harden runtime deps lock owner identity
This commit is contained in:
@@ -32,6 +32,7 @@ Docs: https://docs.openclaw.ai
|
|||||||
- Agents/subagents: cache persisted subagent run registry reads by file signature while preserving fresh-parse isolation, so busy gateways stop reparsing unchanged `subagents/runs.json` on controller/list/status hot paths. Refs #72338. Thanks @argus-as.
|
- Agents/subagents: cache persisted subagent run registry reads by file signature while preserving fresh-parse isolation, so busy gateways stop reparsing unchanged `subagents/runs.json` on controller/list/status hot paths. Refs #72338. Thanks @argus-as.
|
||||||
- Gateway/clients: wait for the event loop to become responsive before opening Gateway WebSocket RPC/probe/client connections while charging that readiness wait to caller timeouts, so Windows deferred module-evaluation stalls no longer turn healthy loopback gateways into false handshake timeouts across status, TUI, ACP, MCP, node-host, and plugin client paths. Refs #74279 and #48270. Thanks @wongcode and @joost-heijden.
|
- Gateway/clients: wait for the event loop to become responsive before opening Gateway WebSocket RPC/probe/client connections while charging that readiness wait to caller timeouts, so Windows deferred module-evaluation stalls no longer turn healthy loopback gateways into false handshake timeouts across status, TUI, ACP, MCP, node-host, and plugin client paths. Refs #74279 and #48270. Thanks @wongcode and @joost-heijden.
|
||||||
- Gateway/Windows: read listener command lines via PowerShell before falling back to `wmic`, so restart health can recognize OpenClaw listeners on modern Windows installs and avoid long anonymous-port waits. Refs #74280. Thanks @zym951223.
|
- Gateway/Windows: read listener command lines via PowerShell before falling back to `wmic`, so restart health can recognize OpenClaw listeners on modern Windows installs and avoid long anonymous-port waits. Refs #74280. Thanks @zym951223.
|
||||||
|
- Plugins/runtime-deps: record process start-time in bundled dependency install locks and expire recycled-PID locks, so Docker gateway restarts recover from stale `.openclaw-runtime-deps.lock` directories without waiting through repeated five-minute timeouts. Fixes #74346. (#74361) Thanks @jhsmith409.
|
||||||
- Plugins/runtime-deps: memoize packaged bundled runtime dist-mirror preparation after the first successful pass while keeping source-checkout mirrors refreshable, so constrained Docker/VPS installs avoid repeated root scans before chat turns. Refs #73428, #73421, #73532, and #73477. Thanks @Dimaoggg, @oromeis, @oadiazp, @jmfraga, @bstanbury, @antoniusfelix, and @jkobject.
|
- Plugins/runtime-deps: memoize packaged bundled runtime dist-mirror preparation after the first successful pass while keeping source-checkout mirrors refreshable, so constrained Docker/VPS installs avoid repeated root scans before chat turns. Refs #73428, #73421, #73532, and #73477. Thanks @Dimaoggg, @oromeis, @oadiazp, @jmfraga, @bstanbury, @antoniusfelix, and @jkobject.
|
||||||
- Channels/Discord: treat bare numeric outbound targets that match the effective Discord DM allowlist as user DMs while preserving account-specific legacy `dm.allowFrom` precedence over inherited root `allowFrom`. (#74303) Thanks @Squirbie.
|
- Channels/Discord: treat bare numeric outbound targets that match the effective Discord DM allowlist as user DMs while preserving account-specific legacy `dm.allowFrom` precedence over inherited root `allowFrom`. (#74303) Thanks @Squirbie.
|
||||||
- Channels/Discord/Slack: share one DM policy/allowlist resolver across runtime, setup, allowlist editing, and doctor repair, so legacy `dm.policy` / `dm.allowFrom` compatibility migrates to canonical `dmPolicy` / `allowFrom` without divergent access checks. Thanks @Squirbie.
|
- Channels/Discord/Slack: share one DM policy/allowlist resolver across runtime, setup, allowlist editing, and doctor repair, so legacy `dm.policy` / `dm.allowFrom` compatibility migrates to canonical `dmPolicy` / `allowFrom` without divergent access checks. Thanks @Squirbie.
|
||||||
|
|||||||
@@ -2126,13 +2126,13 @@ describe("ensureBundledPluginRuntimeDeps", () => {
|
|||||||
).toBe(true);
|
).toBe(true);
|
||||||
});
|
});
|
||||||
|
|
||||||
it("treats a PID-alive lock with matching pidStartTimeMs as held by the same incarnation", () => {
|
it("treats a PID-alive lock with matching starttime as held by the same incarnation", () => {
|
||||||
expect(
|
expect(
|
||||||
bundledRuntimeDepsTesting.shouldRemoveRuntimeDepsLock(
|
bundledRuntimeDepsTesting.shouldRemoveRuntimeDepsLock(
|
||||||
{ pid: 7, pidStartTimeMs: 1_000, createdAtMs: 2_000 },
|
{ pid: 7, starttime: 1_000, createdAtMs: 2_000 },
|
||||||
2_500,
|
2_500,
|
||||||
() => true,
|
() => true,
|
||||||
// Live PID's start-time matches owner.pidStartTimeMs → same process
|
// Live PID's starttime matches the lock owner, so this is the same process.
|
||||||
() => 1_000,
|
() => 1_000,
|
||||||
),
|
),
|
||||||
).toBe(false);
|
).toBe(false);
|
||||||
@@ -2146,24 +2146,24 @@ describe("ensureBundledPluginRuntimeDeps", () => {
|
|||||||
// the live PID's start-time disambiguates incarnations.
|
// the live PID's start-time disambiguates incarnations.
|
||||||
expect(
|
expect(
|
||||||
bundledRuntimeDepsTesting.shouldRemoveRuntimeDepsLock(
|
bundledRuntimeDepsTesting.shouldRemoveRuntimeDepsLock(
|
||||||
{ pid: 7, pidStartTimeMs: 1_000, createdAtMs: 2_000 },
|
{ pid: 7, starttime: 1_000, createdAtMs: 2_000 },
|
||||||
2_500,
|
2_500,
|
||||||
() => true,
|
() => true,
|
||||||
// Same PID, but a different incarnation started later → stale.
|
// Same PID, but a different incarnation started later.
|
||||||
() => 9_000,
|
() => 9_000,
|
||||||
),
|
),
|
||||||
).toBe(true);
|
).toBe(true);
|
||||||
});
|
});
|
||||||
|
|
||||||
it("treats a PID-alive lock as fresh when start-time evidence cannot be read", () => {
|
it("treats a PID-alive lock as fresh when start-time evidence cannot be read", () => {
|
||||||
// Defensive: when readProcessStartTimeMs returns null (legacy lock with
|
// Defensive: when getProcessStartTime returns null (legacy lock with no
|
||||||
// no pidStartTimeMs, or a platform that does not expose it) we keep the
|
// starttime, or a platform that does not expose it) we keep the
|
||||||
// pre-existing behavior of trusting isAlive(pid). The only verified
|
// pre-existing behavior of trusting isAlive(pid). The only verified
|
||||||
// disambiguation path is start-time evidence on both sides; without it
|
// disambiguation path is start-time evidence on both sides; without it
|
||||||
// we err toward "still held" rather than risk stomping a real install.
|
// we err toward "still held" rather than risk stomping a real install.
|
||||||
expect(
|
expect(
|
||||||
bundledRuntimeDepsTesting.shouldRemoveRuntimeDepsLock(
|
bundledRuntimeDepsTesting.shouldRemoveRuntimeDepsLock(
|
||||||
{ pid: 7, pidStartTimeMs: 1_000, createdAtMs: 0 },
|
{ pid: 7, starttime: 1_000, createdAtMs: 0 },
|
||||||
Number.MAX_SAFE_INTEGER,
|
Number.MAX_SAFE_INTEGER,
|
||||||
() => true,
|
() => true,
|
||||||
() => null,
|
() => null,
|
||||||
|
|||||||
@@ -9,6 +9,7 @@ import type { OpenClawConfig } from "../config/types.openclaw.js";
|
|||||||
import { createLowDiskSpaceWarning } from "../infra/disk-space.js";
|
import { createLowDiskSpaceWarning } from "../infra/disk-space.js";
|
||||||
import { resolveHomeRelativePath } from "../infra/home-dir.js";
|
import { resolveHomeRelativePath } from "../infra/home-dir.js";
|
||||||
import { createNpmProjectInstallEnv } from "../infra/npm-install-env.js";
|
import { createNpmProjectInstallEnv } from "../infra/npm-install-env.js";
|
||||||
|
import { getProcessStartTime } from "../shared/pid-alive.js";
|
||||||
import { normalizeOptionalLowercaseString } from "../shared/string-coerce.js";
|
import { normalizeOptionalLowercaseString } from "../shared/string-coerce.js";
|
||||||
import { sanitizeTerminalText } from "../terminal/safe-text.js";
|
import { sanitizeTerminalText } from "../terminal/safe-text.js";
|
||||||
import { beginBundledRuntimeDepsInstall } from "./bundled-runtime-deps-activity.js";
|
import { beginBundledRuntimeDepsInstall } from "./bundled-runtime-deps-activity.js";
|
||||||
@@ -456,65 +457,11 @@ function isProcessAlive(pid: number): boolean {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Approximate epoch-ms when the current process started, captured once per
|
const CURRENT_PROCESS_STARTTIME = getProcessStartTime(process.pid);
|
||||||
// Node lifetime so the lock owner record carries a value that is stable
|
|
||||||
// across `Date.now()` clock skew within the running process.
|
|
||||||
const CURRENT_PROCESS_START_TIME_MS = Math.round(Date.now() - process.uptime() * 1000);
|
|
||||||
|
|
||||||
// Approximate epoch-ms when `pid` started running, or null when the platform
|
|
||||||
// or runtime cannot determine it. Used to distinguish two different process
|
|
||||||
// incarnations that happen to share the same numeric PID — most notably
|
|
||||||
// inside Docker containers, where the gateway is always PID 1 (or PID 7
|
|
||||||
// with `init: true`) in its container PID namespace, so a stale lock left
|
|
||||||
// by a previous incarnation looks live to the new one if we only consult
|
|
||||||
// `isAlive(pid)`.
|
|
||||||
function readProcessStartTimeMs(pid: number): number | null {
|
|
||||||
if (!Number.isInteger(pid) || pid <= 0) {
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
if (process.platform !== "linux") {
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
let stat: string;
|
|
||||||
try {
|
|
||||||
stat = fs.readFileSync(`/proc/${pid}/stat`, "utf8");
|
|
||||||
} catch {
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
// /proc/<pid>/stat: PID (comm) state ppid pgrp ... starttime ... where
|
|
||||||
// `comm` may contain spaces and parens, so we anchor on the LAST `)` and
|
|
||||||
// count whitespace-separated fields after it. Field index 22 (1-based) is
|
|
||||||
// starttime, expressed in clock ticks since boot. After the `)` the next
|
|
||||||
// field is `state` (1), then ppid (2), pgrp (3), session (4), tty_nr (5),
|
|
||||||
// tpgid (6), flags (7), minflt (8), cminflt (9), majflt (10), cmajflt
|
|
||||||
// (11), utime (12), stime (13), cutime (14), cstime (15), priority (16),
|
|
||||||
// nice (17), num_threads (18), itrealvalue (19), starttime (20).
|
|
||||||
const lastParen = stat.lastIndexOf(")");
|
|
||||||
if (lastParen === -1) {
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
const tail = stat
|
|
||||||
.slice(lastParen + 1)
|
|
||||||
.trim()
|
|
||||||
.split(/\s+/);
|
|
||||||
// tail[0] is `state`; starttime is therefore tail[19].
|
|
||||||
const starttimeJiffies = Number(tail[19]);
|
|
||||||
if (!Number.isFinite(starttimeJiffies)) {
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
// sysconf(_SC_CLK_TCK) is universally 100 on Linux x86/arm targets. We
|
|
||||||
// hard-code that here rather than spawning getconf — the exact tick rate
|
|
||||||
// does not matter for our purposes as long as it is stable: the starttime
|
|
||||||
// we capture for our own process and the starttime we read for the
|
|
||||||
// lock-owner PID are converted with the same constant, so equality holds.
|
|
||||||
const clockTicksPerSecond = 100;
|
|
||||||
const bootTimeMs = Date.now() - os.uptime() * 1000;
|
|
||||||
return Math.round(bootTimeMs + (starttimeJiffies * 1000) / clockTicksPerSecond);
|
|
||||||
}
|
|
||||||
|
|
||||||
type RuntimeDepsLockOwner = {
|
type RuntimeDepsLockOwner = {
|
||||||
pid?: number;
|
pid?: number;
|
||||||
pidStartTimeMs?: number;
|
starttime?: number;
|
||||||
createdAtMs?: number;
|
createdAtMs?: number;
|
||||||
ownerFileState: "ok" | "missing" | "invalid";
|
ownerFileState: "ok" | "missing" | "invalid";
|
||||||
ownerFilePath: string;
|
ownerFilePath: string;
|
||||||
@@ -558,7 +505,7 @@ function readRuntimeDepsLockOwner(lockDir: string): RuntimeDepsLockOwner {
|
|||||||
}
|
}
|
||||||
return {
|
return {
|
||||||
pid: typeof owner?.pid === "number" ? owner.pid : undefined,
|
pid: typeof owner?.pid === "number" ? owner.pid : undefined,
|
||||||
pidStartTimeMs: typeof owner?.pidStartTimeMs === "number" ? owner.pidStartTimeMs : undefined,
|
starttime: typeof owner?.starttime === "number" ? owner.starttime : undefined,
|
||||||
createdAtMs: typeof owner?.createdAtMs === "number" ? owner.createdAtMs : undefined,
|
createdAtMs: typeof owner?.createdAtMs === "number" ? owner.createdAtMs : undefined,
|
||||||
ownerFileState,
|
ownerFileState,
|
||||||
ownerFilePath,
|
ownerFilePath,
|
||||||
@@ -584,26 +531,26 @@ function latestFiniteMs(values: readonly (number | undefined)[]): number | undef
|
|||||||
function shouldRemoveRuntimeDepsLock(
|
function shouldRemoveRuntimeDepsLock(
|
||||||
owner: Pick<
|
owner: Pick<
|
||||||
RuntimeDepsLockOwner,
|
RuntimeDepsLockOwner,
|
||||||
"pid" | "pidStartTimeMs" | "createdAtMs" | "lockDirMtimeMs" | "ownerFileMtimeMs"
|
"pid" | "starttime" | "createdAtMs" | "lockDirMtimeMs" | "ownerFileMtimeMs"
|
||||||
>,
|
>,
|
||||||
nowMs: number,
|
nowMs: number,
|
||||||
isAlive: (pid: number) => boolean = isProcessAlive,
|
isAlive: (pid: number) => boolean = isProcessAlive,
|
||||||
readStartTimeMs: (pid: number) => number | null = readProcessStartTimeMs,
|
readStarttime: (pid: number) => number | null = getProcessStartTime,
|
||||||
): boolean {
|
): boolean {
|
||||||
if (typeof owner.pid === "number") {
|
if (typeof owner.pid === "number") {
|
||||||
if (!isAlive(owner.pid)) {
|
if (!isAlive(owner.pid)) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
// PID is alive — but inside Docker the new process can share the same
|
// PID is alive, but inside Docker the new process can share the same
|
||||||
// PID as the dead writer. If we recorded the writer's start-time and we
|
// PID as the dead writer. If we recorded the writer's start-time and we
|
||||||
// can read the live PID's start-time, mismatch means a different
|
// can read the live PID's start-time, mismatch means a different
|
||||||
// incarnation owns this PID now and the lock is stale. When start-time
|
// incarnation owns this PID now and the lock is stale. When start-time
|
||||||
// evidence is unavailable on either side, fall through to the existing
|
// evidence is unavailable on either side, fall through to the existing
|
||||||
// PID-alive-means-fresh behavior so legacy locks keep working as
|
// PID-alive-means-fresh behavior so legacy locks keep working as
|
||||||
// before.
|
// before.
|
||||||
if (typeof owner.pidStartTimeMs === "number") {
|
if (typeof owner.starttime === "number") {
|
||||||
const liveStartTimeMs = readStartTimeMs(owner.pid);
|
const liveStarttime = readStarttime(owner.pid);
|
||||||
if (liveStartTimeMs !== null && liveStartTimeMs !== owner.pidStartTimeMs) {
|
if (liveStarttime !== null && liveStarttime !== owner.starttime) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -695,7 +642,9 @@ export function withBundledRuntimeDepsFilesystemLock<T>(
|
|||||||
`${JSON.stringify(
|
`${JSON.stringify(
|
||||||
{
|
{
|
||||||
pid: process.pid,
|
pid: process.pid,
|
||||||
pidStartTimeMs: CURRENT_PROCESS_START_TIME_MS,
|
...(typeof CURRENT_PROCESS_STARTTIME === "number"
|
||||||
|
? { starttime: CURRENT_PROCESS_STARTTIME }
|
||||||
|
: {}),
|
||||||
createdAtMs: Date.now(),
|
createdAtMs: Date.now(),
|
||||||
},
|
},
|
||||||
null,
|
null,
|
||||||
@@ -2476,7 +2425,9 @@ async function withBundledRuntimeDepsInstallRootLockAsync<T>(
|
|||||||
`${JSON.stringify(
|
`${JSON.stringify(
|
||||||
{
|
{
|
||||||
pid: process.pid,
|
pid: process.pid,
|
||||||
pidStartTimeMs: CURRENT_PROCESS_START_TIME_MS,
|
...(typeof CURRENT_PROCESS_STARTTIME === "number"
|
||||||
|
? { starttime: CURRENT_PROCESS_STARTTIME }
|
||||||
|
: {}),
|
||||||
createdAtMs: Date.now(),
|
createdAtMs: Date.now(),
|
||||||
},
|
},
|
||||||
null,
|
null,
|
||||||
|
|||||||
Reference in New Issue
Block a user