fix(agents): force SIGKILL for stuck MCP stdio children (#86739)

Guarantee MCP stdio child cleanup during Gateway shutdown by sending a synchronous SIGKILL when the child survives the existing stdin and SIGTERM waits. This prevents SIGTERM-ignoring local MCP processes from outliving the Gateway when killProcessTree's unref'd SIGKILL timer would otherwise lose the shutdown race.

Fixes #86412.

Verification:
- GitHub CI green on relevant agent/runtime, lint/type, CodeQL/security, OpenGrep, and Real behavior proof checks.
- Real behavior proof: https://github.com/openclaw/openclaw/actions/runs/26430512156/job/77802651894
- Maintainer manual review: no blocking findings.

Thanks @openperf.

Co-authored-by: openperf <16864032@qq.com>
This commit is contained in:
Chunyue Wang
2026-05-27 09:04:29 +08:00
committed by GitHub
parent 1dbd9a3154
commit 71d24f98a8
2 changed files with 34 additions and 1 deletions

View File

@@ -6,6 +6,7 @@ import { OpenClawStdioClientTransport } from "./mcp-stdio-transport.js";
const spawnMock = vi.hoisted(() => vi.fn());
const killProcessTreeMock = vi.hoisted(() => vi.fn());
const signalProcessTreeMock = vi.hoisted(() => vi.fn());
vi.mock("node:child_process", async () => ({
...(await vi.importActual<typeof import("node:child_process")>("node:child_process")),
@@ -14,6 +15,7 @@ vi.mock("node:child_process", async () => ({
vi.mock("../process/kill-tree.js", () => ({
killProcessTree: killProcessTreeMock,
signalProcessTree: signalProcessTreeMock,
}));
class MockChildProcess extends EventEmitter {
@@ -29,6 +31,7 @@ describe("OpenClawStdioClientTransport", () => {
vi.useRealTimers();
spawnMock.mockReset();
killProcessTreeMock.mockReset();
signalProcessTreeMock.mockReset();
});
it("starts stdio MCP servers in a disposable process group on POSIX", async () => {
@@ -88,6 +91,30 @@ describe("OpenClawStdioClientTransport", () => {
await closing;
});
it("force-SIGKILLs synchronously when killProcessTree's grace expires (#86412)", async () => {
vi.useFakeTimers();
const child = new MockChildProcess();
spawnMock.mockReturnValue(child);
const transport = new OpenClawStdioClientTransport({ command: "npx" });
const started = transport.start();
child.emit("spawn");
await started;
const closing = transport.close();
await vi.advanceTimersByTimeAsync(2000);
expect(killProcessTreeMock).toHaveBeenCalledWith(4321);
expect(signalProcessTreeMock).not.toHaveBeenCalled();
// killProcessTree's SIGKILL is .unref()'d (#86412); close() force-SIGKILLs synchronously instead.
await vi.advanceTimersByTimeAsync(2000);
expect(signalProcessTreeMock).toHaveBeenCalledWith(4321, "SIGKILL");
child.exitCode = 0;
child.emit("close", 0);
await closing;
});
it("does not kill the process tree when graceful stdio close exits", async () => {
vi.useFakeTimers();
const child = new MockChildProcess();

View File

@@ -5,7 +5,7 @@ import { getDefaultEnvironment } from "@modelcontextprotocol/sdk/client/stdio.js
import { ReadBuffer, serializeMessage } from "@modelcontextprotocol/sdk/shared/stdio.js";
import type { Transport } from "@modelcontextprotocol/sdk/shared/transport.js";
import type { JSONRPCMessage } from "@modelcontextprotocol/sdk/types.js";
import { killProcessTree } from "../process/kill-tree.js";
import { killProcessTree, signalProcessTree } from "../process/kill-tree.js";
import { prepareOomScoreAdjustedSpawn } from "../process/linux-oom-score.js";
export type OpenClawStdioServerParameters = {
@@ -17,6 +17,7 @@ export type OpenClawStdioServerParameters = {
};
const CLOSE_TIMEOUT_MS = 2000;
const SIGKILL_REAP_TIMEOUT_MS = 500;
function delay(ms: number) {
return new Promise<void>((resolve) => {
@@ -125,6 +126,11 @@ export class OpenClawStdioClientTransport implements Transport {
if (processToClose.exitCode === null && processToClose.pid) {
killProcessTree(processToClose.pid);
await Promise.race([closePromise, delay(CLOSE_TIMEOUT_MS)]);
if (processToClose.exitCode === null && processToClose.pid) {
// SIGKILL synchronously: killProcessTree's setTimeout is .unref()'d and races shutdown (#86412).
signalProcessTree(processToClose.pid, "SIGKILL");
await Promise.race([closePromise, delay(SIGKILL_REAP_TIMEOUT_MS)]);
}
}
}
this.readBuffer.clear();