mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-06 09:10:45 +00:00
fix(infra/restart): exclude ancestor pids from stale-gateway cleanup
The stale-gateway cleanup filter already refused to kill process.pid — acknowledging the invariant that terminating a process whose death cascades into the caller is never safe. That invariant was applied only to the caller itself, not to its ancestors, which is why the openclaw-weixin sidecar triggered an unbounded restart loop: the sidecar's cleanup SIGTERM'd its parent gateway, the supervisor restarted the gateway, the gateway re-spawned the sidecar, the cleanup ran again. Complete the invariant by excluding the full self+ancestor PID set in both the lsof (Unix) and PowerShell/netstat (Windows) cleanup paths. Walk uses process.ppid unconditionally (Node built-in, no spawn) and /proc/<pid>/status on Linux for transitive ancestors, with graceful degradation where /proc is unavailable.
This commit is contained in:
committed by
Peter Steinberger
parent
aad9a833c0
commit
8aadca4c3e
@@ -26,6 +26,29 @@ const mockReadWindowsProcessArgsResult = vi.hoisted(() =>
|
||||
(_pid: number, _timeoutMs?: number) => ({ ok: true, args: null }),
|
||||
),
|
||||
);
|
||||
// Drives the Linux `/proc/<pid>/status` ancestor walk inside
|
||||
// `getSelfAndAncestorPidsSync`. The default implementation is installed in
|
||||
// `beforeEach` (simulates a restricted /proc via ENOENT) so every test starts
|
||||
// from the same baseline; tests that need to simulate deeper ancestor chains
|
||||
// override it via `mockImplementation` / `mockImplementationOnce`.
|
||||
const mockReadFileSync = vi.hoisted(() => vi.fn());
|
||||
|
||||
vi.mock("node:fs", async () => {
|
||||
const { mockNodeBuiltinModule } = await import("../../test/helpers/node-builtin-mocks.js");
|
||||
return mockNodeBuiltinModule(
|
||||
() => vi.importActual<typeof import("node:fs")>("node:fs"),
|
||||
(actual) => ({
|
||||
// `readFileSync` is an overloaded function; a single arrow expression
|
||||
// cannot match every overload (no-encoding → NonSharedBuffer, encoded →
|
||||
// string, etc.), which tsgo flags as TS2322. Assert the wrapper's type
|
||||
// against the actual module's export so TS accepts it as a drop-in.
|
||||
// The test only exercises the string-returning overload (encoded /proc
|
||||
// reads); the cast is a precise retype, not `any`.
|
||||
readFileSync: ((path: unknown, encoding?: unknown) =>
|
||||
mockReadFileSync(path, encoding)) as typeof actual.readFileSync,
|
||||
}),
|
||||
);
|
||||
});
|
||||
|
||||
vi.mock("node:child_process", async () => {
|
||||
const { mockNodeBuiltinModule } = await import("../../test/helpers/node-builtin-mocks.js");
|
||||
@@ -143,6 +166,14 @@ describe.skipIf(isWindows)("restart-stale-pids", () => {
|
||||
mockReadWindowsListeningPidsResult.mockReset();
|
||||
mockReadWindowsProcessArgs.mockReset();
|
||||
mockReadWindowsProcessArgsResult.mockReset();
|
||||
mockReadFileSync.mockReset();
|
||||
mockReadFileSync.mockImplementation(() => {
|
||||
// Default: simulate /proc unavailable. Walks that reach this mock
|
||||
// degrade silently and return whatever set they collected so far.
|
||||
const error: NodeJS.ErrnoException = new Error("ENOENT: test default");
|
||||
error.code = "ENOENT";
|
||||
throw error;
|
||||
});
|
||||
mockResolveGatewayPort.mockReturnValue(18789);
|
||||
mockReadWindowsListeningPids.mockReturnValue([]);
|
||||
mockReadWindowsListeningPidsResult.mockReturnValue({ ok: true, pids: [] });
|
||||
@@ -157,6 +188,24 @@ describe.skipIf(isWindows)("restart-stale-pids", () => {
|
||||
vi.restoreAllMocks();
|
||||
});
|
||||
|
||||
// Temporarily rewrites `process.ppid` for a block of test code. Used by the
|
||||
// ancestor-exclusion tests to drive the real `getSelfAndAncestorPidsSync`
|
||||
// walk without installing a runtime-reachable override on the module. Node
|
||||
// always exposes `process.ppid` as an own property so the captured
|
||||
// descriptor is non-null in practice; the `if (orig)` guard is defensive
|
||||
// against a broken environment, not a reachable branch.
|
||||
function withStubbedPpid<T>(ppid: number, fn: () => T): T {
|
||||
const orig = Object.getOwnPropertyDescriptor(process, "ppid");
|
||||
Object.defineProperty(process, "ppid", { value: ppid, configurable: true });
|
||||
try {
|
||||
return fn();
|
||||
} finally {
|
||||
if (orig) {
|
||||
Object.defineProperty(process, "ppid", orig);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// -------------------------------------------------------------------------
|
||||
// findGatewayPidsOnPortSync
|
||||
// -------------------------------------------------------------------------
|
||||
@@ -203,6 +252,142 @@ describe.skipIf(isWindows)("restart-stale-pids", () => {
|
||||
expect(pids).not.toContain(process.pid);
|
||||
});
|
||||
|
||||
it("excludes ancestor pids so a sidecar cannot kill its parent gateway — regression for #68451", () => {
|
||||
// Regression: openclaw-weixin sidecar (child of the gateway) invoked
|
||||
// cleanStaleGatewayProcessesSync during init. lsof reported the parent
|
||||
// gateway on port 18789, its PID was not process.pid, so the cleanup
|
||||
// SIGTERM'd it — the supervisor restarted the gateway, re-spawned the
|
||||
// sidecar, the cleanup ran again: infinite restart loop.
|
||||
//
|
||||
// Fix: parsePidsFromLsofOutput now excludes process.pid AND its
|
||||
// ancestor chain (see getSelfAndAncestorPidsSync). This test stubs
|
||||
// process.ppid to the synthetic parent gateway pid so the real walk
|
||||
// adds it to the exclusion set; the default /proc mock throws ENOENT
|
||||
// so the walk stops after the direct parent.
|
||||
const parentGatewayPid = process.pid + 2001;
|
||||
const unrelatedStalePid = process.pid + 2002;
|
||||
mockSpawnSync.mockReturnValue({
|
||||
error: null,
|
||||
status: 0,
|
||||
stdout: lsofOutput([
|
||||
{ pid: parentGatewayPid, cmd: "openclaw-gateway" },
|
||||
{ pid: unrelatedStalePid, cmd: "openclaw-gateway" },
|
||||
]),
|
||||
stderr: "",
|
||||
});
|
||||
const pids = withStubbedPpid(parentGatewayPid, () => findGatewayPidsOnPortSync(18789));
|
||||
// Parent gateway must be excluded; an unrelated stale PID must still be
|
||||
// reported so the supervisor-path cleanup continues to work.
|
||||
expect(pids).not.toContain(parentGatewayPid);
|
||||
expect(pids).toContain(unrelatedStalePid);
|
||||
});
|
||||
|
||||
it.skipIf(process.platform !== "linux")(
|
||||
"excludes the full ancestor chain, not just the direct parent — deeper nesting",
|
||||
() => {
|
||||
// The ancestor-exclusion invariant is transitive: killing any
|
||||
// ancestor cascades to the caller the same way killing the direct
|
||||
// parent does. Drive the real Linux /proc walk by stubbing
|
||||
// process.ppid to the direct parent and mocking readFileSync to
|
||||
// return synthetic PPid lines for each ancestor hop; the mock ends
|
||||
// the chain with "PPid: 0" so the walk terminates without touching
|
||||
// the real /proc.
|
||||
const directParentPid = process.pid + 2003;
|
||||
const grandparentPid = process.pid + 2004;
|
||||
const benignStalePid = process.pid + 2005;
|
||||
mockReadFileSync.mockImplementation((path: unknown): string => {
|
||||
if (path === `/proc/${directParentPid}/status`) {
|
||||
return `Name:\topenclaw-gateway\nPid:\t${directParentPid}\nPPid:\t${grandparentPid}\n`;
|
||||
}
|
||||
if (path === `/proc/${grandparentPid}/status`) {
|
||||
return `Name:\tsystemd\nPid:\t${grandparentPid}\nPPid:\t0\n`;
|
||||
}
|
||||
const error: NodeJS.ErrnoException = new Error("ENOENT");
|
||||
error.code = "ENOENT";
|
||||
throw error;
|
||||
});
|
||||
mockSpawnSync.mockReturnValue({
|
||||
error: null,
|
||||
status: 0,
|
||||
stdout: lsofOutput([
|
||||
{ pid: directParentPid, cmd: "openclaw-gateway" },
|
||||
{ pid: grandparentPid, cmd: "openclaw-gateway" },
|
||||
{ pid: benignStalePid, cmd: "openclaw-gateway" },
|
||||
]),
|
||||
stderr: "",
|
||||
});
|
||||
const pids = withStubbedPpid(directParentPid, () => findGatewayPidsOnPortSync(18789));
|
||||
expect(pids).not.toContain(directParentPid);
|
||||
expect(pids).not.toContain(grandparentPid);
|
||||
expect(pids).toContain(benignStalePid);
|
||||
},
|
||||
);
|
||||
|
||||
it("excludes PID 1 when the direct parent gateway is the container entrypoint — container topology", () => {
|
||||
// Codex P1: in container deployments the gateway is the container
|
||||
// entrypoint and therefore runs as PID 1 of its namespace. A sidecar
|
||||
// spawned by that gateway has process.ppid === 1. An earlier revision
|
||||
// guarded the exclusion with `immediateParent > 1`, which dropped PID 1
|
||||
// and reopened the #68451 restart loop on every containerised install.
|
||||
// The current `> 0` check admits PID 1 into the exclusion set; this
|
||||
// test exercises the real walk by stubbing process.ppid to 1.
|
||||
const benignStalePid = process.pid + 2050;
|
||||
mockSpawnSync.mockReturnValue({
|
||||
error: null,
|
||||
status: 0,
|
||||
stdout: lsofOutput([
|
||||
{ pid: 1, cmd: "openclaw-gateway" },
|
||||
{ pid: benignStalePid, cmd: "openclaw-gateway" },
|
||||
]),
|
||||
stderr: "",
|
||||
});
|
||||
const pids = withStubbedPpid(1, () => findGatewayPidsOnPortSync(18789));
|
||||
expect(pids).not.toContain(1);
|
||||
expect(pids).toContain(benignStalePid);
|
||||
});
|
||||
|
||||
it.skipIf(process.platform !== "linux")(
|
||||
"leaves the gateway grandparent in the kill list when /proc truncates the walk — documented degradation on hidepid/gVisor hosts",
|
||||
() => {
|
||||
// Pins the known-partial coverage the PR description and the
|
||||
// `readParentPidFromProc` comment call out: in hardened Linux
|
||||
// containers (hidepid=2, gVisor, AppArmor-locked namespaces) the
|
||||
// ancestor walk cannot traverse /proc/<other_pid>/status beyond
|
||||
// the caller, so it stops at `process.ppid`. For the direct-child
|
||||
// topology #68451 reports (gateway→sidecar), this is fine — ppid
|
||||
// is captured unconditionally via Node's syscall. For a 3-level
|
||||
// chain (gateway→plugin-host→sidecar), the gateway grandparent
|
||||
// falls outside the exclusion set and is still killable.
|
||||
//
|
||||
// This test locks that degraded outcome in place so a future
|
||||
// refactor cannot silently regress further (for example, by
|
||||
// skipping `process.ppid` as well) without at least failing this
|
||||
// assertion first. A fuller fix (macOS/Windows ancestor walk,
|
||||
// pidfd-based Linux walk, or privileged cmdline probe) belongs
|
||||
// in a separate change.
|
||||
const pluginHostPid = process.pid + 3001;
|
||||
const gatewayGrandparentPid = process.pid + 3002;
|
||||
// Default mockReadFileSync throws ENOENT for every /proc path —
|
||||
// the same view a non-privileged process has under hidepid=2.
|
||||
mockSpawnSync.mockReturnValue({
|
||||
error: null,
|
||||
status: 0,
|
||||
stdout: lsofOutput([
|
||||
{ pid: pluginHostPid, cmd: "openclaw-gateway" },
|
||||
{ pid: gatewayGrandparentPid, cmd: "openclaw-gateway" },
|
||||
]),
|
||||
stderr: "",
|
||||
});
|
||||
const pids = withStubbedPpid(pluginHostPid, () => findGatewayPidsOnPortSync(18789));
|
||||
// Direct parent (plugin-host) must still be excluded — process.ppid
|
||||
// is captured with no /proc dependency, so hidepid cannot mask it.
|
||||
expect(pids).not.toContain(pluginHostPid);
|
||||
// Grandparent IS returned — documented partial coverage, tracked
|
||||
// separately from #68451.
|
||||
expect(pids).toContain(gatewayGrandparentPid);
|
||||
},
|
||||
);
|
||||
|
||||
it("excludes pids whose command does not include 'openclaw'", () => {
|
||||
const otherPid = process.pid + 2;
|
||||
mockSpawnSync.mockReturnValue({
|
||||
@@ -268,6 +453,33 @@ describe.skipIf(isWindows)("restart-stale-pids", () => {
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
it("excludes ancestor pids on Windows too — #68451 regression mirror for the win32 path", () => {
|
||||
// The #68451 invariant must hold on every code path the cleanup can take.
|
||||
// The Windows filter (filterVerifiedWindowsGatewayPids) shares the same
|
||||
// exclusion source, so the direct-parent gateway PID must be dropped
|
||||
// before the argv-verification step runs. Drive the real walk on the
|
||||
// win32 branch (which stops at process.ppid — no /proc lookup) by
|
||||
// stubbing process.ppid to the synthetic parent pid.
|
||||
const origDescriptor = Object.getOwnPropertyDescriptor(process, "platform");
|
||||
const parentGatewayPid = process.pid + 2101;
|
||||
const unrelatedStalePid = process.pid + 2102;
|
||||
Object.defineProperty(process, "platform", { value: "win32", configurable: true });
|
||||
try {
|
||||
mockReadWindowsListeningPids.mockReturnValue([parentGatewayPid, unrelatedStalePid]);
|
||||
mockReadWindowsProcessArgs.mockReturnValue(["openclaw", "gateway"]);
|
||||
const pids = withStubbedPpid(parentGatewayPid, () => findGatewayPidsOnPortSync(18789));
|
||||
expect(pids).not.toContain(parentGatewayPid);
|
||||
expect(pids).toContain(unrelatedStalePid);
|
||||
// argv verification must never have been asked about the parent, because
|
||||
// exclusion happens before the per-PID inspection step.
|
||||
expect(mockReadWindowsProcessArgs).not.toHaveBeenCalledWith(parentGatewayPid, undefined);
|
||||
} finally {
|
||||
if (origDescriptor) {
|
||||
Object.defineProperty(process, "platform", origDescriptor);
|
||||
}
|
||||
}
|
||||
});
|
||||
});
|
||||
|
||||
// -------------------------------------------------------------------------
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
import { spawnSync } from "node:child_process";
|
||||
import { readFileSync } from "node:fs";
|
||||
import path from "node:path";
|
||||
import { resolveGatewayPort } from "../config/paths.js";
|
||||
import { createSubsystemLogger } from "../logging/subsystem.js";
|
||||
@@ -33,6 +34,14 @@ const PORT_FREE_POLL_INTERVAL_MS = 50;
|
||||
const PORT_FREE_TIMEOUT_MS = 2000;
|
||||
const POLL_SPAWN_TIMEOUT_MS = 400;
|
||||
|
||||
/**
|
||||
* Upper bound on the ancestor-PID walk. A real-world chain is shallow
|
||||
* (pid1 → systemd → gateway → plugin-host → sidecar ≈ 5); 32 generously covers
|
||||
* nested-supervisor setups (k8s pod → containerd-shim → runc → …) while still
|
||||
* providing a hard stop against corrupted process tables or ppid cycles.
|
||||
*/
|
||||
const MAX_ANCESTOR_WALK_DEPTH = 32;
|
||||
|
||||
const restartLog = createSubsystemLogger("restart");
|
||||
let sleepSyncOverride: ((ms: number) => void) | null = null;
|
||||
let dateNowOverride: (() => number) | null = null;
|
||||
@@ -62,8 +71,102 @@ function sleepSync(ms: number): void {
|
||||
}
|
||||
|
||||
/**
|
||||
* Parse openclaw gateway PIDs from lsof -Fpc stdout.
|
||||
* Pure function — no I/O. Excludes the current process.
|
||||
* Read a single ancestor PID from `/proc/<pid>/status` on Linux.
|
||||
* Returns null on any failure (non-Linux platform, restricted /proc, race
|
||||
* where the target pid exited between the walk step and the read); callers
|
||||
* treat a null return as "stop walking" and proceed with the ancestor set
|
||||
* collected so far.
|
||||
*/
|
||||
function readParentPidFromProc(pid: number): number | null {
|
||||
try {
|
||||
const status = readFileSync(`/proc/${pid}/status`, "utf8");
|
||||
const match = status.match(/^PPid:\s*(\d+)/m);
|
||||
if (!match) {
|
||||
return null;
|
||||
}
|
||||
const parsed = Number.parseInt(match[1] ?? "", 10);
|
||||
return Number.isFinite(parsed) && parsed > 0 ? parsed : null;
|
||||
} catch {
|
||||
// Null truncates the walk at this hop. In hardened Linux (hidepid=2,
|
||||
// gVisor, AppArmor-locked namespaces) /proc is unreadable beyond the
|
||||
// caller, so the walk can stop at `process.ppid`. #68451's direct
|
||||
// gateway→sidecar topology is covered (ppid is captured without a
|
||||
// /proc read); 3-level chains (gateway→plugin-host→sidecar) are not
|
||||
// — pinned by the "grandparent stays killable when /proc truncates
|
||||
// the walk" regression test.
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Collect the set of PIDs whose termination would cascade-kill the caller:
|
||||
* the current process, its direct parent, and — where the platform permits
|
||||
* — the full ancestor chain up to the top of the pid namespace.
|
||||
*
|
||||
* Rationale: `cleanStaleGatewayProcessesSync` already refuses to kill
|
||||
* `process.pid` (see `parsePidsFromLsofOutput`), acknowledging the invariant
|
||||
* "a cleanup step must never destroy its own caller." That invariant was
|
||||
* applied only to the caller itself, not to its ancestors — which is how
|
||||
* issue #68451 arises: a plugin sidecar calls the cleanup, `lsof` reports
|
||||
* the parent gateway listening on 18789, the parent's PID passes the
|
||||
* `pid !== process.pid` filter, it is SIGTERM'd, the sidecar is then reaped
|
||||
* by the supervisor, the supervisor restarts the gateway, which re-spawns
|
||||
* the sidecar, which runs the cleanup again — infinite restart loop.
|
||||
*
|
||||
* Completing the invariant here removes the loop at its source: killing any
|
||||
* ancestor is exactly as fatal to the caller as killing itself, so ancestors
|
||||
* must receive the same exclusion treatment. The check admits any positive
|
||||
* ancestor PID (including 1), because inside a container — a first-class
|
||||
* deployment target for this project — the gateway is frequently the
|
||||
* entrypoint and therefore runs as PID 1 of its own namespace; excluding 1
|
||||
* unconditionally would recreate the #68451 loop on every containerised
|
||||
* install where the gateway spawns a direct-child sidecar.
|
||||
*
|
||||
* The walk is best-effort. `process.ppid` is provided by Node via a direct
|
||||
* syscall and is always available; transitive ancestors are only read on
|
||||
* Linux via `/proc`. macOS/Windows stop at ppid, which is sufficient for
|
||||
* the direct-child sidecar topology this bug describes; extending those
|
||||
* platforms can be done without touching the call sites.
|
||||
*
|
||||
* The function takes no parameters and exposes no hooks. Tests exercise
|
||||
* the real walk by stubbing `process.ppid` (and, on Linux, by mocking
|
||||
* `node:fs` to inject `/proc/<pid>/status` payloads) — there is no
|
||||
* reachable override for runtime callers to mutate.
|
||||
*/
|
||||
function getSelfAndAncestorPidsSync(): Set<number> {
|
||||
const pids = new Set<number>([process.pid]);
|
||||
const immediateParent = process.ppid;
|
||||
if (!Number.isFinite(immediateParent) || immediateParent <= 0) {
|
||||
return pids;
|
||||
}
|
||||
pids.add(immediateParent);
|
||||
if (process.platform !== "linux") {
|
||||
return pids;
|
||||
}
|
||||
// Transitive ancestor walk. Each hop's validity (positive pid, not already
|
||||
// seen) is enforced by the per-iteration `parent` check below; the entry
|
||||
// invariant `current > 0` is established above and preserved by `current =
|
||||
// parent` after the same check, so no separate top-of-loop guard is needed.
|
||||
let current = immediateParent;
|
||||
for (let depth = 0; depth < MAX_ANCESTOR_WALK_DEPTH; depth++) {
|
||||
const parent = readParentPidFromProc(current);
|
||||
if (parent == null || parent <= 0 || pids.has(parent)) {
|
||||
break;
|
||||
}
|
||||
pids.add(parent);
|
||||
current = parent;
|
||||
}
|
||||
return pids;
|
||||
}
|
||||
|
||||
/**
|
||||
* Parse openclaw gateway PIDs from lsof -Fpc stdout, excluding the current
|
||||
* process and its ancestors (see `getSelfAndAncestorPidsSync` for the full
|
||||
* rationale). On Linux the ancestor lookup reads up to
|
||||
* `MAX_ANCESTOR_WALK_DEPTH` entries from `/proc/<pid>/status`; each read is
|
||||
* a virtual-filesystem access (no disk I/O, no external process), wrapped
|
||||
* in try/catch and degrades silently. On macOS/Windows the lookup is
|
||||
* in-memory via `process.ppid` only.
|
||||
*/
|
||||
function parsePidsFromLsofOutput(stdout: string): number[] {
|
||||
const pids: number[] = [];
|
||||
@@ -94,16 +197,22 @@ function parsePidsFromLsofOutput(stdout: string): number[] {
|
||||
}
|
||||
// Deduplicate: dual-stack listeners (IPv4 + IPv6) cause lsof to emit the
|
||||
// same PID twice. Return each PID at most once to avoid double-killing.
|
||||
return [...new Set(pids)].filter((pid) => pid !== process.pid);
|
||||
// Exclude self and ancestors — terminating any ancestor cascade-kills the
|
||||
// caller via the supervisor, recreating the #68451 restart loop.
|
||||
const excluded = getSelfAndAncestorPidsSync();
|
||||
return [...new Set(pids)].filter((pid) => !excluded.has(pid));
|
||||
}
|
||||
|
||||
/**
|
||||
* Windows: find listening PIDs on the port, then verify each is an openclaw
|
||||
* gateway process via command-line inspection. Excludes the current process.
|
||||
* gateway process via command-line inspection. Excludes the current process
|
||||
* and its ancestors (same invariant as the lsof path — see
|
||||
* `getSelfAndAncestorPidsSync`).
|
||||
*/
|
||||
function filterVerifiedWindowsGatewayPids(rawPids: number[]): number[] {
|
||||
const excluded = getSelfAndAncestorPidsSync();
|
||||
return Array.from(new Set(rawPids))
|
||||
.filter((pid) => Number.isFinite(pid) && pid > 0 && pid !== process.pid)
|
||||
.filter((pid) => Number.isFinite(pid) && pid > 0 && !excluded.has(pid))
|
||||
.filter((pid) => {
|
||||
const args = readWindowsProcessArgsSync(pid);
|
||||
return args != null && isGatewayArgv(args, { allowGatewayBinary: true });
|
||||
@@ -114,9 +223,10 @@ function filterVerifiedWindowsGatewayPidsResult(
|
||||
rawPids: number[],
|
||||
processArgsResult: (pid: number) => WindowsProcessArgsResult,
|
||||
): WindowsListeningPidsResult {
|
||||
const excluded = getSelfAndAncestorPidsSync();
|
||||
const verified: number[] = [];
|
||||
for (const pid of Array.from(new Set(rawPids))) {
|
||||
if (!Number.isFinite(pid) || pid <= 0 || pid === process.pid) {
|
||||
if (!Number.isFinite(pid) || pid <= 0 || excluded.has(pid)) {
|
||||
continue;
|
||||
}
|
||||
const argsResult = processArgsResult(pid);
|
||||
|
||||
Reference in New Issue
Block a user