fix(infra/restart): exclude ancestor pids from stale-gateway cleanup

The stale-gateway cleanup filter already refused to kill process.pid —
acknowledging the invariant that terminating a process whose death
cascades into the caller is never safe. That invariant was applied only
to the caller itself, not to its ancestors, which is why the
openclaw-weixin sidecar triggered an unbounded restart loop: the
sidecar's cleanup SIGTERM'd its parent gateway, the supervisor
restarted the gateway, the gateway re-spawned the sidecar, the cleanup
ran again.

Complete the invariant by excluding the full self+ancestor PID set in
both the lsof (Unix) and PowerShell/netstat (Windows) cleanup paths.
Walk uses process.ppid unconditionally (Node built-in, no spawn) and
/proc/<pid>/status on Linux for transitive ancestors, with graceful
degradation where /proc is unavailable.
This commit is contained in:
openperf
2026-04-18 18:45:24 +08:00
committed by Peter Steinberger
parent aad9a833c0
commit 8aadca4c3e
2 changed files with 328 additions and 6 deletions

View File

@@ -26,6 +26,29 @@ const mockReadWindowsProcessArgsResult = vi.hoisted(() =>
(_pid: number, _timeoutMs?: number) => ({ ok: true, args: null }),
),
);
// Drives the Linux `/proc/<pid>/status` ancestor walk inside
// `getSelfAndAncestorPidsSync`. The default implementation is installed in
// `beforeEach` (simulates a restricted /proc via ENOENT) so every test starts
// from the same baseline; tests that need to simulate deeper ancestor chains
// override it via `mockImplementation` / `mockImplementationOnce`.
const mockReadFileSync = vi.hoisted(() => vi.fn());
vi.mock("node:fs", async () => {
const { mockNodeBuiltinModule } = await import("../../test/helpers/node-builtin-mocks.js");
return mockNodeBuiltinModule(
() => vi.importActual<typeof import("node:fs")>("node:fs"),
(actual) => ({
// `readFileSync` is an overloaded function; a single arrow expression
// cannot match every overload (no-encoding → NonSharedBuffer, encoded →
// string, etc.), which tsgo flags as TS2322. Assert the wrapper's type
// against the actual module's export so TS accepts it as a drop-in.
// The test only exercises the string-returning overload (encoded /proc
// reads); the cast is a precise retype, not `any`.
readFileSync: ((path: unknown, encoding?: unknown) =>
mockReadFileSync(path, encoding)) as typeof actual.readFileSync,
}),
);
});
vi.mock("node:child_process", async () => {
const { mockNodeBuiltinModule } = await import("../../test/helpers/node-builtin-mocks.js");
@@ -143,6 +166,14 @@ describe.skipIf(isWindows)("restart-stale-pids", () => {
mockReadWindowsListeningPidsResult.mockReset();
mockReadWindowsProcessArgs.mockReset();
mockReadWindowsProcessArgsResult.mockReset();
mockReadFileSync.mockReset();
mockReadFileSync.mockImplementation(() => {
// Default: simulate /proc unavailable. Walks that reach this mock
// degrade silently and return whatever set they collected so far.
const error: NodeJS.ErrnoException = new Error("ENOENT: test default");
error.code = "ENOENT";
throw error;
});
mockResolveGatewayPort.mockReturnValue(18789);
mockReadWindowsListeningPids.mockReturnValue([]);
mockReadWindowsListeningPidsResult.mockReturnValue({ ok: true, pids: [] });
@@ -157,6 +188,24 @@ describe.skipIf(isWindows)("restart-stale-pids", () => {
vi.restoreAllMocks();
});
// Temporarily rewrites `process.ppid` for a block of test code. Used by the
// ancestor-exclusion tests to drive the real `getSelfAndAncestorPidsSync`
// walk without installing a runtime-reachable override on the module. Node
// always exposes `process.ppid` as an own property so the captured
// descriptor is non-null in practice; the `if (orig)` guard is defensive
// against a broken environment, not a reachable branch.
function withStubbedPpid<T>(ppid: number, fn: () => T): T {
const orig = Object.getOwnPropertyDescriptor(process, "ppid");
Object.defineProperty(process, "ppid", { value: ppid, configurable: true });
try {
return fn();
} finally {
if (orig) {
Object.defineProperty(process, "ppid", orig);
}
}
}
// -------------------------------------------------------------------------
// findGatewayPidsOnPortSync
// -------------------------------------------------------------------------
@@ -203,6 +252,142 @@ describe.skipIf(isWindows)("restart-stale-pids", () => {
expect(pids).not.toContain(process.pid);
});
it("excludes ancestor pids so a sidecar cannot kill its parent gateway — regression for #68451", () => {
// Regression: openclaw-weixin sidecar (child of the gateway) invoked
// cleanStaleGatewayProcessesSync during init. lsof reported the parent
// gateway on port 18789, its PID was not process.pid, so the cleanup
// SIGTERM'd it — the supervisor restarted the gateway, re-spawned the
// sidecar, the cleanup ran again: infinite restart loop.
//
// Fix: parsePidsFromLsofOutput now excludes process.pid AND its
// ancestor chain (see getSelfAndAncestorPidsSync). This test stubs
// process.ppid to the synthetic parent gateway pid so the real walk
// adds it to the exclusion set; the default /proc mock throws ENOENT
// so the walk stops after the direct parent.
const parentGatewayPid = process.pid + 2001;
const unrelatedStalePid = process.pid + 2002;
mockSpawnSync.mockReturnValue({
error: null,
status: 0,
stdout: lsofOutput([
{ pid: parentGatewayPid, cmd: "openclaw-gateway" },
{ pid: unrelatedStalePid, cmd: "openclaw-gateway" },
]),
stderr: "",
});
const pids = withStubbedPpid(parentGatewayPid, () => findGatewayPidsOnPortSync(18789));
// Parent gateway must be excluded; an unrelated stale PID must still be
// reported so the supervisor-path cleanup continues to work.
expect(pids).not.toContain(parentGatewayPid);
expect(pids).toContain(unrelatedStalePid);
});
it.skipIf(process.platform !== "linux")(
"excludes the full ancestor chain, not just the direct parent — deeper nesting",
() => {
// The ancestor-exclusion invariant is transitive: killing any
// ancestor cascades to the caller the same way killing the direct
// parent does. Drive the real Linux /proc walk by stubbing
// process.ppid to the direct parent and mocking readFileSync to
// return synthetic PPid lines for each ancestor hop; the mock ends
// the chain with "PPid: 0" so the walk terminates without touching
// the real /proc.
const directParentPid = process.pid + 2003;
const grandparentPid = process.pid + 2004;
const benignStalePid = process.pid + 2005;
mockReadFileSync.mockImplementation((path: unknown): string => {
if (path === `/proc/${directParentPid}/status`) {
return `Name:\topenclaw-gateway\nPid:\t${directParentPid}\nPPid:\t${grandparentPid}\n`;
}
if (path === `/proc/${grandparentPid}/status`) {
return `Name:\tsystemd\nPid:\t${grandparentPid}\nPPid:\t0\n`;
}
const error: NodeJS.ErrnoException = new Error("ENOENT");
error.code = "ENOENT";
throw error;
});
mockSpawnSync.mockReturnValue({
error: null,
status: 0,
stdout: lsofOutput([
{ pid: directParentPid, cmd: "openclaw-gateway" },
{ pid: grandparentPid, cmd: "openclaw-gateway" },
{ pid: benignStalePid, cmd: "openclaw-gateway" },
]),
stderr: "",
});
const pids = withStubbedPpid(directParentPid, () => findGatewayPidsOnPortSync(18789));
expect(pids).not.toContain(directParentPid);
expect(pids).not.toContain(grandparentPid);
expect(pids).toContain(benignStalePid);
},
);
it("excludes PID 1 when the direct parent gateway is the container entrypoint — container topology", () => {
// Codex P1: in container deployments the gateway is the container
// entrypoint and therefore runs as PID 1 of its namespace. A sidecar
// spawned by that gateway has process.ppid === 1. An earlier revision
// guarded the exclusion with `immediateParent > 1`, which dropped PID 1
// and reopened the #68451 restart loop on every containerised install.
// The current `> 0` check admits PID 1 into the exclusion set; this
// test exercises the real walk by stubbing process.ppid to 1.
const benignStalePid = process.pid + 2050;
mockSpawnSync.mockReturnValue({
error: null,
status: 0,
stdout: lsofOutput([
{ pid: 1, cmd: "openclaw-gateway" },
{ pid: benignStalePid, cmd: "openclaw-gateway" },
]),
stderr: "",
});
const pids = withStubbedPpid(1, () => findGatewayPidsOnPortSync(18789));
expect(pids).not.toContain(1);
expect(pids).toContain(benignStalePid);
});
it.skipIf(process.platform !== "linux")(
"leaves the gateway grandparent in the kill list when /proc truncates the walk — documented degradation on hidepid/gVisor hosts",
() => {
// Pins the known-partial coverage the PR description and the
// `readParentPidFromProc` comment call out: in hardened Linux
// containers (hidepid=2, gVisor, AppArmor-locked namespaces) the
// ancestor walk cannot traverse /proc/<other_pid>/status beyond
// the caller, so it stops at `process.ppid`. For the direct-child
// topology #68451 reports (gateway→sidecar), this is fine — ppid
// is captured unconditionally via Node's syscall. For a 3-level
// chain (gateway→plugin-host→sidecar), the gateway grandparent
// falls outside the exclusion set and is still killable.
//
// This test locks that degraded outcome in place so a future
// refactor cannot silently regress further (for example, by
// skipping `process.ppid` as well) without at least failing this
// assertion first. A fuller fix (macOS/Windows ancestor walk,
// pidfd-based Linux walk, or privileged cmdline probe) belongs
// in a separate change.
const pluginHostPid = process.pid + 3001;
const gatewayGrandparentPid = process.pid + 3002;
// Default mockReadFileSync throws ENOENT for every /proc path —
// the same view a non-privileged process has under hidepid=2.
mockSpawnSync.mockReturnValue({
error: null,
status: 0,
stdout: lsofOutput([
{ pid: pluginHostPid, cmd: "openclaw-gateway" },
{ pid: gatewayGrandparentPid, cmd: "openclaw-gateway" },
]),
stderr: "",
});
const pids = withStubbedPpid(pluginHostPid, () => findGatewayPidsOnPortSync(18789));
// Direct parent (plugin-host) must still be excluded — process.ppid
// is captured with no /proc dependency, so hidepid cannot mask it.
expect(pids).not.toContain(pluginHostPid);
// Grandparent IS returned — documented partial coverage, tracked
// separately from #68451.
expect(pids).toContain(gatewayGrandparentPid);
},
);
it("excludes pids whose command does not include 'openclaw'", () => {
const otherPid = process.pid + 2;
mockSpawnSync.mockReturnValue({
@@ -268,6 +453,33 @@ describe.skipIf(isWindows)("restart-stale-pids", () => {
}
}
});
it("excludes ancestor pids on Windows too — #68451 regression mirror for the win32 path", () => {
// The #68451 invariant must hold on every code path the cleanup can take.
// The Windows filter (filterVerifiedWindowsGatewayPids) shares the same
// exclusion source, so the direct-parent gateway PID must be dropped
// before the argv-verification step runs. Drive the real walk on the
// win32 branch (which stops at process.ppid — no /proc lookup) by
// stubbing process.ppid to the synthetic parent pid.
const origDescriptor = Object.getOwnPropertyDescriptor(process, "platform");
const parentGatewayPid = process.pid + 2101;
const unrelatedStalePid = process.pid + 2102;
Object.defineProperty(process, "platform", { value: "win32", configurable: true });
try {
mockReadWindowsListeningPids.mockReturnValue([parentGatewayPid, unrelatedStalePid]);
mockReadWindowsProcessArgs.mockReturnValue(["openclaw", "gateway"]);
const pids = withStubbedPpid(parentGatewayPid, () => findGatewayPidsOnPortSync(18789));
expect(pids).not.toContain(parentGatewayPid);
expect(pids).toContain(unrelatedStalePid);
// argv verification must never have been asked about the parent, because
// exclusion happens before the per-PID inspection step.
expect(mockReadWindowsProcessArgs).not.toHaveBeenCalledWith(parentGatewayPid, undefined);
} finally {
if (origDescriptor) {
Object.defineProperty(process, "platform", origDescriptor);
}
}
});
});
// -------------------------------------------------------------------------

View File

@@ -1,4 +1,5 @@
import { spawnSync } from "node:child_process";
import { readFileSync } from "node:fs";
import path from "node:path";
import { resolveGatewayPort } from "../config/paths.js";
import { createSubsystemLogger } from "../logging/subsystem.js";
@@ -33,6 +34,14 @@ const PORT_FREE_POLL_INTERVAL_MS = 50;
const PORT_FREE_TIMEOUT_MS = 2000;
const POLL_SPAWN_TIMEOUT_MS = 400;
/**
* Upper bound on the ancestor-PID walk. A real-world chain is shallow
* (pid1 → systemd → gateway → plugin-host → sidecar ≈ 5); 32 generously covers
* nested-supervisor setups (k8s pod → containerd-shim → runc → …) while still
* providing a hard stop against corrupted process tables or ppid cycles.
*/
const MAX_ANCESTOR_WALK_DEPTH = 32;
const restartLog = createSubsystemLogger("restart");
let sleepSyncOverride: ((ms: number) => void) | null = null;
let dateNowOverride: (() => number) | null = null;
@@ -62,8 +71,102 @@ function sleepSync(ms: number): void {
}
/**
* Parse openclaw gateway PIDs from lsof -Fpc stdout.
* Pure function — no I/O. Excludes the current process.
* Read a single ancestor PID from `/proc/<pid>/status` on Linux.
* Returns null on any failure (non-Linux platform, restricted /proc, race
* where the target pid exited between the walk step and the read); callers
* treat a null return as "stop walking" and proceed with the ancestor set
* collected so far.
*/
function readParentPidFromProc(pid: number): number | null {
try {
const status = readFileSync(`/proc/${pid}/status`, "utf8");
const match = status.match(/^PPid:\s*(\d+)/m);
if (!match) {
return null;
}
const parsed = Number.parseInt(match[1] ?? "", 10);
return Number.isFinite(parsed) && parsed > 0 ? parsed : null;
} catch {
// Null truncates the walk at this hop. In hardened Linux (hidepid=2,
// gVisor, AppArmor-locked namespaces) /proc is unreadable beyond the
// caller, so the walk can stop at `process.ppid`. #68451's direct
// gateway→sidecar topology is covered (ppid is captured without a
// /proc read); 3-level chains (gateway→plugin-host→sidecar) are not
// — pinned by the "grandparent stays killable when /proc truncates
// the walk" regression test.
return null;
}
}
/**
* Collect the set of PIDs whose termination would cascade-kill the caller:
* the current process, its direct parent, and — where the platform permits
* — the full ancestor chain up to the top of the pid namespace.
*
* Rationale: `cleanStaleGatewayProcessesSync` already refuses to kill
* `process.pid` (see `parsePidsFromLsofOutput`), acknowledging the invariant
* "a cleanup step must never destroy its own caller." That invariant was
* applied only to the caller itself, not to its ancestors — which is how
* issue #68451 arises: a plugin sidecar calls the cleanup, `lsof` reports
* the parent gateway listening on 18789, the parent's PID passes the
* `pid !== process.pid` filter, it is SIGTERM'd, the sidecar is then reaped
* by the supervisor, the supervisor restarts the gateway, which re-spawns
* the sidecar, which runs the cleanup again — infinite restart loop.
*
* Completing the invariant here removes the loop at its source: killing any
* ancestor is exactly as fatal to the caller as killing itself, so ancestors
* must receive the same exclusion treatment. The check admits any positive
* ancestor PID (including 1), because inside a container — a first-class
* deployment target for this project — the gateway is frequently the
* entrypoint and therefore runs as PID 1 of its own namespace; excluding 1
* unconditionally would recreate the #68451 loop on every containerised
* install where the gateway spawns a direct-child sidecar.
*
* The walk is best-effort. `process.ppid` is provided by Node via a direct
* syscall and is always available; transitive ancestors are only read on
* Linux via `/proc`. macOS/Windows stop at ppid, which is sufficient for
* the direct-child sidecar topology this bug describes; extending those
* platforms can be done without touching the call sites.
*
* The function takes no parameters and exposes no hooks. Tests exercise
* the real walk by stubbing `process.ppid` (and, on Linux, by mocking
* `node:fs` to inject `/proc/<pid>/status` payloads) — there is no
* reachable override for runtime callers to mutate.
*/
function getSelfAndAncestorPidsSync(): Set<number> {
const pids = new Set<number>([process.pid]);
const immediateParent = process.ppid;
if (!Number.isFinite(immediateParent) || immediateParent <= 0) {
return pids;
}
pids.add(immediateParent);
if (process.platform !== "linux") {
return pids;
}
// Transitive ancestor walk. Each hop's validity (positive pid, not already
// seen) is enforced by the per-iteration `parent` check below; the entry
// invariant `current > 0` is established above and preserved by `current =
// parent` after the same check, so no separate top-of-loop guard is needed.
let current = immediateParent;
for (let depth = 0; depth < MAX_ANCESTOR_WALK_DEPTH; depth++) {
const parent = readParentPidFromProc(current);
if (parent == null || parent <= 0 || pids.has(parent)) {
break;
}
pids.add(parent);
current = parent;
}
return pids;
}
/**
* Parse openclaw gateway PIDs from lsof -Fpc stdout, excluding the current
* process and its ancestors (see `getSelfAndAncestorPidsSync` for the full
* rationale). On Linux the ancestor lookup reads up to
* `MAX_ANCESTOR_WALK_DEPTH` entries from `/proc/<pid>/status`; each read is
* a virtual-filesystem access (no disk I/O, no external process), wrapped
* in try/catch and degrades silently. On macOS/Windows the lookup is
* in-memory via `process.ppid` only.
*/
function parsePidsFromLsofOutput(stdout: string): number[] {
const pids: number[] = [];
@@ -94,16 +197,22 @@ function parsePidsFromLsofOutput(stdout: string): number[] {
}
// Deduplicate: dual-stack listeners (IPv4 + IPv6) cause lsof to emit the
// same PID twice. Return each PID at most once to avoid double-killing.
return [...new Set(pids)].filter((pid) => pid !== process.pid);
// Exclude self and ancestors — terminating any ancestor cascade-kills the
// caller via the supervisor, recreating the #68451 restart loop.
const excluded = getSelfAndAncestorPidsSync();
return [...new Set(pids)].filter((pid) => !excluded.has(pid));
}
/**
* Windows: find listening PIDs on the port, then verify each is an openclaw
* gateway process via command-line inspection. Excludes the current process.
* gateway process via command-line inspection. Excludes the current process
* and its ancestors (same invariant as the lsof path — see
* `getSelfAndAncestorPidsSync`).
*/
function filterVerifiedWindowsGatewayPids(rawPids: number[]): number[] {
const excluded = getSelfAndAncestorPidsSync();
return Array.from(new Set(rawPids))
.filter((pid) => Number.isFinite(pid) && pid > 0 && pid !== process.pid)
.filter((pid) => Number.isFinite(pid) && pid > 0 && !excluded.has(pid))
.filter((pid) => {
const args = readWindowsProcessArgsSync(pid);
return args != null && isGatewayArgv(args, { allowGatewayBinary: true });
@@ -114,9 +223,10 @@ function filterVerifiedWindowsGatewayPidsResult(
rawPids: number[],
processArgsResult: (pid: number) => WindowsProcessArgsResult,
): WindowsListeningPidsResult {
const excluded = getSelfAndAncestorPidsSync();
const verified: number[] = [];
for (const pid of Array.from(new Set(rawPids))) {
if (!Number.isFinite(pid) || pid <= 0 || pid === process.pid) {
if (!Number.isFinite(pid) || pid <= 0 || excluded.has(pid)) {
continue;
}
const argsResult = processArgsResult(pid);