mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-15 08:50:45 +00:00
* fix(process): skip kill-tree group kill when child wasn't detached (#71662) When the supervisor spawns a child with detached:false (service-managed runtime under launchd/systemd), the child shares the gateway's process group. On session abort or SIGKILL, killProcessTree was unconditionally issuing process.kill(-pid, 'SIGTERM') — which targets the entire process GROUP (negative pid is POSIX group-kill semantics) and therefore SIGTERMs the gateway parent along with the child. Reporter saw this on macOS (LaunchAgent + KeepAlive=true): aborting a claude-cli/claude-opus-4-7 session caused the gateway to receive SIGTERM, then auto-restart, dropping all in-flight sessions. Switching the primary model to a non-cli provider eliminated it because the non-cli paths don't go through this kill-tree call. Did not occur on Linux VPS where the gateway runs detached, because there useDetached === true and the child got its own process group. Fix: - killProcessTree now accepts opts.detached?: boolean. When detached:false, killProcessTreeUnix skips the `-pid` group-kill and goes straight to direct-pid SIGTERM/SIGKILL. Group-kill default (detached:true) is preserved so all existing callers behave exactly as before. - supervisor/adapters/child.ts:286 now threads the spawn-time `useDetached` flag into killProcessTree, so the kill-tree path matches the spawn-time detachment decision (line 45 of the same file already computes useDetached = process.platform !== 'win32' && !isServiceManagedRuntime()). Tests: - new: detached:false skips group kill and uses direct pid SIGTERM only. - new: default behaviour (detached:true) still uses group kill (regression guard so the existing test case isn't accidentally weakened). Existing tests still pass (6/6 in kill-tree.test.ts). Lint clean. Out of scope: other killProcessTree callers (mcp-stdio-transport, bash-tools.process, etc.) keep the default group-kill behaviour because those processes are typically detached from the gateway. Only the supervisor/adapters/child.ts path threads `detached` through, since it's the path that knows whether the child was actually spawned detached. * fixup(process): also gate kill-tree group-kill on the no-detach spawn fallback (#71662) Greptile review on the original PR caught a P1 gap: when spawnWithFallback's initial detached spawn fails and it retries with the no-detach fallback (label: "no-detach", options.detached: false), the child runs detached:false but my variable useDetached was still true. The kill closure then passed `detached: useDetached` = true to killProcessTree, which still group-killed the gateway — same bug, just on the fallback path. Compute the actual detachment as `useDetached && !spawned.usedFallback` after spawn returns, and pass that through. This closes the gap: the kill path now correctly skips group-kill in BOTH: 1. Service-managed runtime (useDetached=false from the start, original case) 2. Detached-spawn fallback to no-detach (useDetached=true at intent time but spawned.usedFallback=true) Tests: - existing 'uses process-tree kill for default SIGKILL' updated to assert the new {detached} option is forwarded. - new: passes detached:false to killProcessTree when spawn fell back. - new: passes detached:false in service-managed mode (regression guard for the original fix). 11/11 tests pass in child.test.ts. 6/6 in kill-tree.test.ts.
183 lines
5.8 KiB
TypeScript
183 lines
5.8 KiB
TypeScript
import { afterEach, beforeAll, beforeEach, describe, expect, it, vi } from "vitest";
|
|
|
|
const { spawnMock } = vi.hoisted(() => ({
|
|
spawnMock: vi.fn(),
|
|
}));
|
|
|
|
vi.mock("node:child_process", async () => {
|
|
const { mockNodeBuiltinModule } = await import("../../test/helpers/node-builtin-mocks.js");
|
|
return mockNodeBuiltinModule(
|
|
() => vi.importActual<typeof import("node:child_process")>("node:child_process"),
|
|
{
|
|
spawn: (...args: unknown[]) => spawnMock(...args),
|
|
},
|
|
);
|
|
});
|
|
|
|
let killProcessTree: typeof import("./kill-tree.js").killProcessTree;
|
|
|
|
async function withPlatform<T>(platform: NodeJS.Platform, run: () => Promise<T> | T): Promise<T> {
|
|
const originalPlatform = Object.getOwnPropertyDescriptor(process, "platform");
|
|
Object.defineProperty(process, "platform", { value: platform, configurable: true });
|
|
try {
|
|
return await run();
|
|
} finally {
|
|
if (originalPlatform) {
|
|
Object.defineProperty(process, "platform", originalPlatform);
|
|
}
|
|
}
|
|
}
|
|
|
|
describe("killProcessTree", () => {
|
|
let killSpy: ReturnType<typeof vi.spyOn>;
|
|
|
|
beforeAll(async () => {
|
|
({ killProcessTree } = await import("./kill-tree.js"));
|
|
});
|
|
|
|
beforeEach(() => {
|
|
spawnMock.mockClear();
|
|
killSpy = vi.spyOn(process, "kill");
|
|
vi.useFakeTimers();
|
|
});
|
|
|
|
afterEach(() => {
|
|
killSpy.mockRestore();
|
|
vi.useRealTimers();
|
|
vi.clearAllMocks();
|
|
});
|
|
|
|
it("on Windows skips delayed force-kill when PID is already gone", async () => {
|
|
killSpy.mockImplementation(((pid: number, signal?: NodeJS.Signals | number) => {
|
|
if (pid === 4242 && signal === 0) {
|
|
throw new Error("ESRCH");
|
|
}
|
|
return true;
|
|
}) as typeof process.kill);
|
|
|
|
await withPlatform("win32", async () => {
|
|
killProcessTree(4242, { graceMs: 25 });
|
|
|
|
expect(spawnMock).toHaveBeenCalledTimes(1);
|
|
expect(spawnMock).toHaveBeenNthCalledWith(
|
|
1,
|
|
"taskkill",
|
|
["/T", "/PID", "4242"],
|
|
expect.objectContaining({ detached: true, stdio: "ignore" }),
|
|
);
|
|
|
|
await vi.advanceTimersByTimeAsync(25);
|
|
expect(spawnMock).toHaveBeenCalledTimes(1);
|
|
});
|
|
});
|
|
|
|
it("on Windows force-kills after grace period only when PID still exists", async () => {
|
|
killSpy.mockImplementation(((pid: number, signal?: NodeJS.Signals | number) => {
|
|
if (pid === 5252 && signal === 0) {
|
|
return true;
|
|
}
|
|
return true;
|
|
}) as typeof process.kill);
|
|
|
|
await withPlatform("win32", async () => {
|
|
killProcessTree(5252, { graceMs: 10 });
|
|
|
|
await vi.advanceTimersByTimeAsync(10);
|
|
|
|
expect(spawnMock).toHaveBeenCalledTimes(2);
|
|
expect(spawnMock).toHaveBeenNthCalledWith(
|
|
1,
|
|
"taskkill",
|
|
["/T", "/PID", "5252"],
|
|
expect.objectContaining({ detached: true, stdio: "ignore" }),
|
|
);
|
|
expect(spawnMock).toHaveBeenNthCalledWith(
|
|
2,
|
|
"taskkill",
|
|
["/F", "/T", "/PID", "5252"],
|
|
expect.objectContaining({ detached: true, stdio: "ignore" }),
|
|
);
|
|
});
|
|
});
|
|
|
|
it("on Unix sends SIGTERM first and skips SIGKILL when process exits", async () => {
|
|
killSpy.mockImplementation(((pid: number, signal?: NodeJS.Signals | number) => {
|
|
if (pid === -3333 && signal === 0) {
|
|
throw new Error("ESRCH");
|
|
}
|
|
if (pid === 3333 && signal === 0) {
|
|
throw new Error("ESRCH");
|
|
}
|
|
return true;
|
|
}) as typeof process.kill);
|
|
|
|
await withPlatform("linux", async () => {
|
|
killProcessTree(3333, { graceMs: 10 });
|
|
|
|
await vi.advanceTimersByTimeAsync(10);
|
|
|
|
expect(killSpy).toHaveBeenCalledWith(-3333, "SIGTERM");
|
|
expect(killSpy).not.toHaveBeenCalledWith(-3333, "SIGKILL");
|
|
expect(killSpy).not.toHaveBeenCalledWith(3333, "SIGKILL");
|
|
});
|
|
});
|
|
|
|
it("on Unix sends SIGKILL after grace period when process is still alive", async () => {
|
|
killSpy.mockImplementation(((pid: number, signal?: NodeJS.Signals | number) => {
|
|
if (pid === -4444 && signal === 0) {
|
|
return true;
|
|
}
|
|
return true;
|
|
}) as typeof process.kill);
|
|
|
|
await withPlatform("linux", async () => {
|
|
killProcessTree(4444, { graceMs: 5 });
|
|
|
|
await vi.advanceTimersByTimeAsync(5);
|
|
|
|
expect(killSpy).toHaveBeenCalledWith(-4444, "SIGTERM");
|
|
expect(killSpy).toHaveBeenCalledWith(-4444, "SIGKILL");
|
|
});
|
|
});
|
|
|
|
it("on Unix skips group kill when detached:false to avoid SIGTERMing the parent's own process group (#71662)", async () => {
|
|
killSpy.mockImplementation(((pid: number, signal?: NodeJS.Signals | number) => {
|
|
if (pid === 5555 && signal === 0) {
|
|
throw new Error("ESRCH");
|
|
}
|
|
return true;
|
|
}) as typeof process.kill);
|
|
|
|
await withPlatform("linux", async () => {
|
|
killProcessTree(5555, { graceMs: 10, detached: false });
|
|
await vi.advanceTimersByTimeAsync(10);
|
|
|
|
// Direct pid kill is fine. Group kill (`-pid`) is FORBIDDEN here because
|
|
// when the child wasn't spawned detached, its process group is the
|
|
// gateway's group — `-pid` would SIGTERM the gateway itself.
|
|
expect(killSpy).toHaveBeenCalledWith(5555, "SIGTERM");
|
|
expect(killSpy).not.toHaveBeenCalledWith(-5555, "SIGTERM");
|
|
expect(killSpy).not.toHaveBeenCalledWith(-5555, "SIGKILL");
|
|
});
|
|
});
|
|
|
|
it("on Unix uses group kill by default (detached:true preserved as the existing behavior)", async () => {
|
|
killSpy.mockImplementation(((pid: number, signal?: NodeJS.Signals | number) => {
|
|
if (pid === -6666 && signal === 0) {
|
|
throw new Error("ESRCH");
|
|
}
|
|
if (pid === 6666 && signal === 0) {
|
|
throw new Error("ESRCH");
|
|
}
|
|
return true;
|
|
}) as typeof process.kill);
|
|
|
|
await withPlatform("linux", async () => {
|
|
killProcessTree(6666, { graceMs: 10 });
|
|
await vi.advanceTimersByTimeAsync(10);
|
|
|
|
expect(killSpy).toHaveBeenCalledWith(-6666, "SIGTERM");
|
|
});
|
|
});
|
|
});
|