From 71d24f98a803decece68aa71d14ca1de44b2973f Mon Sep 17 00:00:00 2001 From: Chunyue Wang <80630709+openperf@users.noreply.github.com> Date: Wed, 27 May 2026 09:04:29 +0800 Subject: [PATCH] fix(agents): force SIGKILL for stuck MCP stdio children (#86739) Guarantee MCP stdio child cleanup during Gateway shutdown by sending a synchronous SIGKILL when the child survives the existing stdin and SIGTERM waits. This prevents SIGTERM-ignoring local MCP processes from outliving the Gateway when killProcessTree's unref'd SIGKILL timer would otherwise lose the shutdown race. Fixes #86412. Verification: - GitHub CI green on relevant agent/runtime, lint/type, CodeQL/security, OpenGrep, and Real behavior proof checks. - Real behavior proof: https://github.com/openclaw/openclaw/actions/runs/26430512156/job/77802651894 - Maintainer manual review: no blocking findings. Thanks @openperf. Co-authored-by: openperf <16864032@qq.com> --- src/agents/mcp-stdio-transport.test.ts | 27 ++++++++++++++++++++++++++ src/agents/mcp-stdio-transport.ts | 8 +++++++- 2 files changed, 34 insertions(+), 1 deletion(-) diff --git a/src/agents/mcp-stdio-transport.test.ts b/src/agents/mcp-stdio-transport.test.ts index 806445ae165..94853d76d70 100644 --- a/src/agents/mcp-stdio-transport.test.ts +++ b/src/agents/mcp-stdio-transport.test.ts @@ -6,6 +6,7 @@ import { OpenClawStdioClientTransport } from "./mcp-stdio-transport.js"; const spawnMock = vi.hoisted(() => vi.fn()); const killProcessTreeMock = vi.hoisted(() => vi.fn()); +const signalProcessTreeMock = vi.hoisted(() => vi.fn()); vi.mock("node:child_process", async () => ({ ...(await vi.importActual("node:child_process")), @@ -14,6 +15,7 @@ vi.mock("node:child_process", async () => ({ vi.mock("../process/kill-tree.js", () => ({ killProcessTree: killProcessTreeMock, + signalProcessTree: signalProcessTreeMock, })); class MockChildProcess extends EventEmitter { @@ -29,6 +31,7 @@ describe("OpenClawStdioClientTransport", () => { vi.useRealTimers(); spawnMock.mockReset(); killProcessTreeMock.mockReset(); + signalProcessTreeMock.mockReset(); }); it("starts stdio MCP servers in a disposable process group on POSIX", async () => { @@ -88,6 +91,30 @@ describe("OpenClawStdioClientTransport", () => { await closing; }); + it("force-SIGKILLs synchronously when killProcessTree's grace expires (#86412)", async () => { + vi.useFakeTimers(); + const child = new MockChildProcess(); + spawnMock.mockReturnValue(child); + + const transport = new OpenClawStdioClientTransport({ command: "npx" }); + const started = transport.start(); + child.emit("spawn"); + await started; + + const closing = transport.close(); + await vi.advanceTimersByTimeAsync(2000); + expect(killProcessTreeMock).toHaveBeenCalledWith(4321); + expect(signalProcessTreeMock).not.toHaveBeenCalled(); + + // killProcessTree's SIGKILL is .unref()'d (#86412); close() force-SIGKILLs synchronously instead. + await vi.advanceTimersByTimeAsync(2000); + expect(signalProcessTreeMock).toHaveBeenCalledWith(4321, "SIGKILL"); + + child.exitCode = 0; + child.emit("close", 0); + await closing; + }); + it("does not kill the process tree when graceful stdio close exits", async () => { vi.useFakeTimers(); const child = new MockChildProcess(); diff --git a/src/agents/mcp-stdio-transport.ts b/src/agents/mcp-stdio-transport.ts index d99a7ac5aed..4308ce2013a 100644 --- a/src/agents/mcp-stdio-transport.ts +++ b/src/agents/mcp-stdio-transport.ts @@ -5,7 +5,7 @@ import { getDefaultEnvironment } from "@modelcontextprotocol/sdk/client/stdio.js import { ReadBuffer, serializeMessage } from "@modelcontextprotocol/sdk/shared/stdio.js"; import type { Transport } from "@modelcontextprotocol/sdk/shared/transport.js"; import type { JSONRPCMessage } from "@modelcontextprotocol/sdk/types.js"; -import { killProcessTree } from "../process/kill-tree.js"; +import { killProcessTree, signalProcessTree } from "../process/kill-tree.js"; import { prepareOomScoreAdjustedSpawn } from "../process/linux-oom-score.js"; export type OpenClawStdioServerParameters = { @@ -17,6 +17,7 @@ export type OpenClawStdioServerParameters = { }; const CLOSE_TIMEOUT_MS = 2000; +const SIGKILL_REAP_TIMEOUT_MS = 500; function delay(ms: number) { return new Promise((resolve) => { @@ -125,6 +126,11 @@ export class OpenClawStdioClientTransport implements Transport { if (processToClose.exitCode === null && processToClose.pid) { killProcessTree(processToClose.pid); await Promise.race([closePromise, delay(CLOSE_TIMEOUT_MS)]); + if (processToClose.exitCode === null && processToClose.pid) { + // SIGKILL synchronously: killProcessTree's setTimeout is .unref()'d and races shutdown (#86412). + signalProcessTree(processToClose.pid, "SIGKILL"); + await Promise.race([closePromise, delay(SIGKILL_REAP_TIMEOUT_MS)]); + } } } this.readBuffer.clear();