From 8aadca4c3e477b1242a1d6da8f039af29cae2078 Mon Sep 17 00:00:00 2001 From: openperf <16864032@qq.com> Date: Sat, 18 Apr 2026 18:45:24 +0800 Subject: [PATCH] fix(infra/restart): exclude ancestor pids from stale-gateway cleanup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The stale-gateway cleanup filter already refused to kill process.pid — acknowledging the invariant that terminating a process whose death cascades into the caller is never safe. That invariant was applied only to the caller itself, not to its ancestors, which is why the openclaw-weixin sidecar triggered an unbounded restart loop: the sidecar's cleanup SIGTERM'd its parent gateway, the supervisor restarted the gateway, the gateway re-spawned the sidecar, the cleanup ran again. Complete the invariant by excluding the full self+ancestor PID set in both the lsof (Unix) and PowerShell/netstat (Windows) cleanup paths. Walk uses process.ppid unconditionally (Node built-in, no spawn) and /proc//status on Linux for transitive ancestors, with graceful degradation where /proc is unavailable. --- src/infra/restart-stale-pids.test.ts | 212 +++++++++++++++++++++++++++ src/infra/restart-stale-pids.ts | 122 ++++++++++++++- 2 files changed, 328 insertions(+), 6 deletions(-) diff --git a/src/infra/restart-stale-pids.test.ts b/src/infra/restart-stale-pids.test.ts index 968b335dea0..9d8d1b429d7 100644 --- a/src/infra/restart-stale-pids.test.ts +++ b/src/infra/restart-stale-pids.test.ts @@ -26,6 +26,29 @@ const mockReadWindowsProcessArgsResult = vi.hoisted(() => (_pid: number, _timeoutMs?: number) => ({ ok: true, args: null }), ), ); +// Drives the Linux `/proc//status` ancestor walk inside +// `getSelfAndAncestorPidsSync`. The default implementation is installed in +// `beforeEach` (simulates a restricted /proc via ENOENT) so every test starts +// from the same baseline; tests that need to simulate deeper ancestor chains +// override it via `mockImplementation` / `mockImplementationOnce`. +const mockReadFileSync = vi.hoisted(() => vi.fn()); + +vi.mock("node:fs", async () => { + const { mockNodeBuiltinModule } = await import("../../test/helpers/node-builtin-mocks.js"); + return mockNodeBuiltinModule( + () => vi.importActual("node:fs"), + (actual) => ({ + // `readFileSync` is an overloaded function; a single arrow expression + // cannot match every overload (no-encoding → NonSharedBuffer, encoded → + // string, etc.), which tsgo flags as TS2322. Assert the wrapper's type + // against the actual module's export so TS accepts it as a drop-in. + // The test only exercises the string-returning overload (encoded /proc + // reads); the cast is a precise retype, not `any`. + readFileSync: ((path: unknown, encoding?: unknown) => + mockReadFileSync(path, encoding)) as typeof actual.readFileSync, + }), + ); +}); vi.mock("node:child_process", async () => { const { mockNodeBuiltinModule } = await import("../../test/helpers/node-builtin-mocks.js"); @@ -143,6 +166,14 @@ describe.skipIf(isWindows)("restart-stale-pids", () => { mockReadWindowsListeningPidsResult.mockReset(); mockReadWindowsProcessArgs.mockReset(); mockReadWindowsProcessArgsResult.mockReset(); + mockReadFileSync.mockReset(); + mockReadFileSync.mockImplementation(() => { + // Default: simulate /proc unavailable. Walks that reach this mock + // degrade silently and return whatever set they collected so far. + const error: NodeJS.ErrnoException = new Error("ENOENT: test default"); + error.code = "ENOENT"; + throw error; + }); mockResolveGatewayPort.mockReturnValue(18789); mockReadWindowsListeningPids.mockReturnValue([]); mockReadWindowsListeningPidsResult.mockReturnValue({ ok: true, pids: [] }); @@ -157,6 +188,24 @@ describe.skipIf(isWindows)("restart-stale-pids", () => { vi.restoreAllMocks(); }); + // Temporarily rewrites `process.ppid` for a block of test code. Used by the + // ancestor-exclusion tests to drive the real `getSelfAndAncestorPidsSync` + // walk without installing a runtime-reachable override on the module. Node + // always exposes `process.ppid` as an own property so the captured + // descriptor is non-null in practice; the `if (orig)` guard is defensive + // against a broken environment, not a reachable branch. + function withStubbedPpid(ppid: number, fn: () => T): T { + const orig = Object.getOwnPropertyDescriptor(process, "ppid"); + Object.defineProperty(process, "ppid", { value: ppid, configurable: true }); + try { + return fn(); + } finally { + if (orig) { + Object.defineProperty(process, "ppid", orig); + } + } + } + // ------------------------------------------------------------------------- // findGatewayPidsOnPortSync // ------------------------------------------------------------------------- @@ -203,6 +252,142 @@ describe.skipIf(isWindows)("restart-stale-pids", () => { expect(pids).not.toContain(process.pid); }); + it("excludes ancestor pids so a sidecar cannot kill its parent gateway — regression for #68451", () => { + // Regression: openclaw-weixin sidecar (child of the gateway) invoked + // cleanStaleGatewayProcessesSync during init. lsof reported the parent + // gateway on port 18789, its PID was not process.pid, so the cleanup + // SIGTERM'd it — the supervisor restarted the gateway, re-spawned the + // sidecar, the cleanup ran again: infinite restart loop. + // + // Fix: parsePidsFromLsofOutput now excludes process.pid AND its + // ancestor chain (see getSelfAndAncestorPidsSync). This test stubs + // process.ppid to the synthetic parent gateway pid so the real walk + // adds it to the exclusion set; the default /proc mock throws ENOENT + // so the walk stops after the direct parent. + const parentGatewayPid = process.pid + 2001; + const unrelatedStalePid = process.pid + 2002; + mockSpawnSync.mockReturnValue({ + error: null, + status: 0, + stdout: lsofOutput([ + { pid: parentGatewayPid, cmd: "openclaw-gateway" }, + { pid: unrelatedStalePid, cmd: "openclaw-gateway" }, + ]), + stderr: "", + }); + const pids = withStubbedPpid(parentGatewayPid, () => findGatewayPidsOnPortSync(18789)); + // Parent gateway must be excluded; an unrelated stale PID must still be + // reported so the supervisor-path cleanup continues to work. + expect(pids).not.toContain(parentGatewayPid); + expect(pids).toContain(unrelatedStalePid); + }); + + it.skipIf(process.platform !== "linux")( + "excludes the full ancestor chain, not just the direct parent — deeper nesting", + () => { + // The ancestor-exclusion invariant is transitive: killing any + // ancestor cascades to the caller the same way killing the direct + // parent does. Drive the real Linux /proc walk by stubbing + // process.ppid to the direct parent and mocking readFileSync to + // return synthetic PPid lines for each ancestor hop; the mock ends + // the chain with "PPid: 0" so the walk terminates without touching + // the real /proc. + const directParentPid = process.pid + 2003; + const grandparentPid = process.pid + 2004; + const benignStalePid = process.pid + 2005; + mockReadFileSync.mockImplementation((path: unknown): string => { + if (path === `/proc/${directParentPid}/status`) { + return `Name:\topenclaw-gateway\nPid:\t${directParentPid}\nPPid:\t${grandparentPid}\n`; + } + if (path === `/proc/${grandparentPid}/status`) { + return `Name:\tsystemd\nPid:\t${grandparentPid}\nPPid:\t0\n`; + } + const error: NodeJS.ErrnoException = new Error("ENOENT"); + error.code = "ENOENT"; + throw error; + }); + mockSpawnSync.mockReturnValue({ + error: null, + status: 0, + stdout: lsofOutput([ + { pid: directParentPid, cmd: "openclaw-gateway" }, + { pid: grandparentPid, cmd: "openclaw-gateway" }, + { pid: benignStalePid, cmd: "openclaw-gateway" }, + ]), + stderr: "", + }); + const pids = withStubbedPpid(directParentPid, () => findGatewayPidsOnPortSync(18789)); + expect(pids).not.toContain(directParentPid); + expect(pids).not.toContain(grandparentPid); + expect(pids).toContain(benignStalePid); + }, + ); + + it("excludes PID 1 when the direct parent gateway is the container entrypoint — container topology", () => { + // Codex P1: in container deployments the gateway is the container + // entrypoint and therefore runs as PID 1 of its namespace. A sidecar + // spawned by that gateway has process.ppid === 1. An earlier revision + // guarded the exclusion with `immediateParent > 1`, which dropped PID 1 + // and reopened the #68451 restart loop on every containerised install. + // The current `> 0` check admits PID 1 into the exclusion set; this + // test exercises the real walk by stubbing process.ppid to 1. + const benignStalePid = process.pid + 2050; + mockSpawnSync.mockReturnValue({ + error: null, + status: 0, + stdout: lsofOutput([ + { pid: 1, cmd: "openclaw-gateway" }, + { pid: benignStalePid, cmd: "openclaw-gateway" }, + ]), + stderr: "", + }); + const pids = withStubbedPpid(1, () => findGatewayPidsOnPortSync(18789)); + expect(pids).not.toContain(1); + expect(pids).toContain(benignStalePid); + }); + + it.skipIf(process.platform !== "linux")( + "leaves the gateway grandparent in the kill list when /proc truncates the walk — documented degradation on hidepid/gVisor hosts", + () => { + // Pins the known-partial coverage the PR description and the + // `readParentPidFromProc` comment call out: in hardened Linux + // containers (hidepid=2, gVisor, AppArmor-locked namespaces) the + // ancestor walk cannot traverse /proc//status beyond + // the caller, so it stops at `process.ppid`. For the direct-child + // topology #68451 reports (gateway→sidecar), this is fine — ppid + // is captured unconditionally via Node's syscall. For a 3-level + // chain (gateway→plugin-host→sidecar), the gateway grandparent + // falls outside the exclusion set and is still killable. + // + // This test locks that degraded outcome in place so a future + // refactor cannot silently regress further (for example, by + // skipping `process.ppid` as well) without at least failing this + // assertion first. A fuller fix (macOS/Windows ancestor walk, + // pidfd-based Linux walk, or privileged cmdline probe) belongs + // in a separate change. + const pluginHostPid = process.pid + 3001; + const gatewayGrandparentPid = process.pid + 3002; + // Default mockReadFileSync throws ENOENT for every /proc path — + // the same view a non-privileged process has under hidepid=2. + mockSpawnSync.mockReturnValue({ + error: null, + status: 0, + stdout: lsofOutput([ + { pid: pluginHostPid, cmd: "openclaw-gateway" }, + { pid: gatewayGrandparentPid, cmd: "openclaw-gateway" }, + ]), + stderr: "", + }); + const pids = withStubbedPpid(pluginHostPid, () => findGatewayPidsOnPortSync(18789)); + // Direct parent (plugin-host) must still be excluded — process.ppid + // is captured with no /proc dependency, so hidepid cannot mask it. + expect(pids).not.toContain(pluginHostPid); + // Grandparent IS returned — documented partial coverage, tracked + // separately from #68451. + expect(pids).toContain(gatewayGrandparentPid); + }, + ); + it("excludes pids whose command does not include 'openclaw'", () => { const otherPid = process.pid + 2; mockSpawnSync.mockReturnValue({ @@ -268,6 +453,33 @@ describe.skipIf(isWindows)("restart-stale-pids", () => { } } }); + + it("excludes ancestor pids on Windows too — #68451 regression mirror for the win32 path", () => { + // The #68451 invariant must hold on every code path the cleanup can take. + // The Windows filter (filterVerifiedWindowsGatewayPids) shares the same + // exclusion source, so the direct-parent gateway PID must be dropped + // before the argv-verification step runs. Drive the real walk on the + // win32 branch (which stops at process.ppid — no /proc lookup) by + // stubbing process.ppid to the synthetic parent pid. + const origDescriptor = Object.getOwnPropertyDescriptor(process, "platform"); + const parentGatewayPid = process.pid + 2101; + const unrelatedStalePid = process.pid + 2102; + Object.defineProperty(process, "platform", { value: "win32", configurable: true }); + try { + mockReadWindowsListeningPids.mockReturnValue([parentGatewayPid, unrelatedStalePid]); + mockReadWindowsProcessArgs.mockReturnValue(["openclaw", "gateway"]); + const pids = withStubbedPpid(parentGatewayPid, () => findGatewayPidsOnPortSync(18789)); + expect(pids).not.toContain(parentGatewayPid); + expect(pids).toContain(unrelatedStalePid); + // argv verification must never have been asked about the parent, because + // exclusion happens before the per-PID inspection step. + expect(mockReadWindowsProcessArgs).not.toHaveBeenCalledWith(parentGatewayPid, undefined); + } finally { + if (origDescriptor) { + Object.defineProperty(process, "platform", origDescriptor); + } + } + }); }); // ------------------------------------------------------------------------- diff --git a/src/infra/restart-stale-pids.ts b/src/infra/restart-stale-pids.ts index b8ce4ad9b48..5693c23742c 100644 --- a/src/infra/restart-stale-pids.ts +++ b/src/infra/restart-stale-pids.ts @@ -1,4 +1,5 @@ import { spawnSync } from "node:child_process"; +import { readFileSync } from "node:fs"; import path from "node:path"; import { resolveGatewayPort } from "../config/paths.js"; import { createSubsystemLogger } from "../logging/subsystem.js"; @@ -33,6 +34,14 @@ const PORT_FREE_POLL_INTERVAL_MS = 50; const PORT_FREE_TIMEOUT_MS = 2000; const POLL_SPAWN_TIMEOUT_MS = 400; +/** + * Upper bound on the ancestor-PID walk. A real-world chain is shallow + * (pid1 → systemd → gateway → plugin-host → sidecar ≈ 5); 32 generously covers + * nested-supervisor setups (k8s pod → containerd-shim → runc → …) while still + * providing a hard stop against corrupted process tables or ppid cycles. + */ +const MAX_ANCESTOR_WALK_DEPTH = 32; + const restartLog = createSubsystemLogger("restart"); let sleepSyncOverride: ((ms: number) => void) | null = null; let dateNowOverride: (() => number) | null = null; @@ -62,8 +71,102 @@ function sleepSync(ms: number): void { } /** - * Parse openclaw gateway PIDs from lsof -Fpc stdout. - * Pure function — no I/O. Excludes the current process. + * Read a single ancestor PID from `/proc//status` on Linux. + * Returns null on any failure (non-Linux platform, restricted /proc, race + * where the target pid exited between the walk step and the read); callers + * treat a null return as "stop walking" and proceed with the ancestor set + * collected so far. + */ +function readParentPidFromProc(pid: number): number | null { + try { + const status = readFileSync(`/proc/${pid}/status`, "utf8"); + const match = status.match(/^PPid:\s*(\d+)/m); + if (!match) { + return null; + } + const parsed = Number.parseInt(match[1] ?? "", 10); + return Number.isFinite(parsed) && parsed > 0 ? parsed : null; + } catch { + // Null truncates the walk at this hop. In hardened Linux (hidepid=2, + // gVisor, AppArmor-locked namespaces) /proc is unreadable beyond the + // caller, so the walk can stop at `process.ppid`. #68451's direct + // gateway→sidecar topology is covered (ppid is captured without a + // /proc read); 3-level chains (gateway→plugin-host→sidecar) are not + // — pinned by the "grandparent stays killable when /proc truncates + // the walk" regression test. + return null; + } +} + +/** + * Collect the set of PIDs whose termination would cascade-kill the caller: + * the current process, its direct parent, and — where the platform permits + * — the full ancestor chain up to the top of the pid namespace. + * + * Rationale: `cleanStaleGatewayProcessesSync` already refuses to kill + * `process.pid` (see `parsePidsFromLsofOutput`), acknowledging the invariant + * "a cleanup step must never destroy its own caller." That invariant was + * applied only to the caller itself, not to its ancestors — which is how + * issue #68451 arises: a plugin sidecar calls the cleanup, `lsof` reports + * the parent gateway listening on 18789, the parent's PID passes the + * `pid !== process.pid` filter, it is SIGTERM'd, the sidecar is then reaped + * by the supervisor, the supervisor restarts the gateway, which re-spawns + * the sidecar, which runs the cleanup again — infinite restart loop. + * + * Completing the invariant here removes the loop at its source: killing any + * ancestor is exactly as fatal to the caller as killing itself, so ancestors + * must receive the same exclusion treatment. The check admits any positive + * ancestor PID (including 1), because inside a container — a first-class + * deployment target for this project — the gateway is frequently the + * entrypoint and therefore runs as PID 1 of its own namespace; excluding 1 + * unconditionally would recreate the #68451 loop on every containerised + * install where the gateway spawns a direct-child sidecar. + * + * The walk is best-effort. `process.ppid` is provided by Node via a direct + * syscall and is always available; transitive ancestors are only read on + * Linux via `/proc`. macOS/Windows stop at ppid, which is sufficient for + * the direct-child sidecar topology this bug describes; extending those + * platforms can be done without touching the call sites. + * + * The function takes no parameters and exposes no hooks. Tests exercise + * the real walk by stubbing `process.ppid` (and, on Linux, by mocking + * `node:fs` to inject `/proc//status` payloads) — there is no + * reachable override for runtime callers to mutate. + */ +function getSelfAndAncestorPidsSync(): Set { + const pids = new Set([process.pid]); + const immediateParent = process.ppid; + if (!Number.isFinite(immediateParent) || immediateParent <= 0) { + return pids; + } + pids.add(immediateParent); + if (process.platform !== "linux") { + return pids; + } + // Transitive ancestor walk. Each hop's validity (positive pid, not already + // seen) is enforced by the per-iteration `parent` check below; the entry + // invariant `current > 0` is established above and preserved by `current = + // parent` after the same check, so no separate top-of-loop guard is needed. + let current = immediateParent; + for (let depth = 0; depth < MAX_ANCESTOR_WALK_DEPTH; depth++) { + const parent = readParentPidFromProc(current); + if (parent == null || parent <= 0 || pids.has(parent)) { + break; + } + pids.add(parent); + current = parent; + } + return pids; +} + +/** + * Parse openclaw gateway PIDs from lsof -Fpc stdout, excluding the current + * process and its ancestors (see `getSelfAndAncestorPidsSync` for the full + * rationale). On Linux the ancestor lookup reads up to + * `MAX_ANCESTOR_WALK_DEPTH` entries from `/proc//status`; each read is + * a virtual-filesystem access (no disk I/O, no external process), wrapped + * in try/catch and degrades silently. On macOS/Windows the lookup is + * in-memory via `process.ppid` only. */ function parsePidsFromLsofOutput(stdout: string): number[] { const pids: number[] = []; @@ -94,16 +197,22 @@ function parsePidsFromLsofOutput(stdout: string): number[] { } // Deduplicate: dual-stack listeners (IPv4 + IPv6) cause lsof to emit the // same PID twice. Return each PID at most once to avoid double-killing. - return [...new Set(pids)].filter((pid) => pid !== process.pid); + // Exclude self and ancestors — terminating any ancestor cascade-kills the + // caller via the supervisor, recreating the #68451 restart loop. + const excluded = getSelfAndAncestorPidsSync(); + return [...new Set(pids)].filter((pid) => !excluded.has(pid)); } /** * Windows: find listening PIDs on the port, then verify each is an openclaw - * gateway process via command-line inspection. Excludes the current process. + * gateway process via command-line inspection. Excludes the current process + * and its ancestors (same invariant as the lsof path — see + * `getSelfAndAncestorPidsSync`). */ function filterVerifiedWindowsGatewayPids(rawPids: number[]): number[] { + const excluded = getSelfAndAncestorPidsSync(); return Array.from(new Set(rawPids)) - .filter((pid) => Number.isFinite(pid) && pid > 0 && pid !== process.pid) + .filter((pid) => Number.isFinite(pid) && pid > 0 && !excluded.has(pid)) .filter((pid) => { const args = readWindowsProcessArgsSync(pid); return args != null && isGatewayArgv(args, { allowGatewayBinary: true }); @@ -114,9 +223,10 @@ function filterVerifiedWindowsGatewayPidsResult( rawPids: number[], processArgsResult: (pid: number) => WindowsProcessArgsResult, ): WindowsListeningPidsResult { + const excluded = getSelfAndAncestorPidsSync(); const verified: number[] = []; for (const pid of Array.from(new Set(rawPids))) { - if (!Number.isFinite(pid) || pid <= 0 || pid === process.pid) { + if (!Number.isFinite(pid) || pid <= 0 || excluded.has(pid)) { continue; } const argsResult = processArgsResult(pid);