diff --git a/src/process/kill-tree.test.ts b/src/process/kill-tree.test.ts index aba5355cd9d..ce0cf0d8425 100644 --- a/src/process/kill-tree.test.ts +++ b/src/process/kill-tree.test.ts @@ -139,4 +139,44 @@ describe("killProcessTree", () => { expect(killSpy).toHaveBeenCalledWith(-4444, "SIGKILL"); }); }); + + it("on Unix skips group kill when detached:false to avoid SIGTERMing the parent's own process group (#71662)", async () => { + killSpy.mockImplementation(((pid: number, signal?: NodeJS.Signals | number) => { + if (pid === 5555 && signal === 0) { + throw new Error("ESRCH"); + } + return true; + }) as typeof process.kill); + + await withPlatform("linux", async () => { + killProcessTree(5555, { graceMs: 10, detached: false }); + await vi.advanceTimersByTimeAsync(10); + + // Direct pid kill is fine. Group kill (`-pid`) is FORBIDDEN here because + // when the child wasn't spawned detached, its process group is the + // gateway's group — `-pid` would SIGTERM the gateway itself. + expect(killSpy).toHaveBeenCalledWith(5555, "SIGTERM"); + expect(killSpy).not.toHaveBeenCalledWith(-5555, "SIGTERM"); + expect(killSpy).not.toHaveBeenCalledWith(-5555, "SIGKILL"); + }); + }); + + it("on Unix uses group kill by default (detached:true preserved as the existing behavior)", async () => { + killSpy.mockImplementation(((pid: number, signal?: NodeJS.Signals | number) => { + if (pid === -6666 && signal === 0) { + throw new Error("ESRCH"); + } + if (pid === 6666 && signal === 0) { + throw new Error("ESRCH"); + } + return true; + }) as typeof process.kill); + + await withPlatform("linux", async () => { + killProcessTree(6666, { graceMs: 10 }); + await vi.advanceTimersByTimeAsync(10); + + expect(killSpy).toHaveBeenCalledWith(-6666, "SIGTERM"); + }); + }); }); diff --git a/src/process/kill-tree.ts b/src/process/kill-tree.ts index 6f0d752e4c5..0260f7d55ae 100644 --- a/src/process/kill-tree.ts +++ b/src/process/kill-tree.ts @@ -11,8 +11,16 @@ const MAX_GRACE_MS = 60_000; * * This gives child processes a chance to clean up (close connections, remove * temp files, terminate their own children) before being hard-killed. + * + * When the child was spawned with `detached: false` (e.g. service-managed + * runtime under launchd/systemd), pass `detached: false` to skip the Unix + * `process.kill(-pid, ...)` group-kill — it would otherwise target the + * gateway's own process group and SIGTERM the gateway itself. (#71662) */ -export function killProcessTree(pid: number, opts?: { graceMs?: number }): void { +export function killProcessTree( + pid: number, + opts?: { graceMs?: number; detached?: boolean }, +): void { if (!Number.isFinite(pid) || pid <= 0) { return; } @@ -24,7 +32,7 @@ export function killProcessTree(pid: number, opts?: { graceMs?: number }): void return; } - killProcessTreeUnix(pid, graceMs); + killProcessTreeUnix(pid, graceMs, opts?.detached !== false); } function normalizeGraceMs(value?: number): number { @@ -43,12 +51,23 @@ function isProcessAlive(pid: number): boolean { } } -function killProcessTreeUnix(pid: number, graceMs: number): void { - // Step 1: Try graceful SIGTERM to process group - try { - process.kill(-pid, "SIGTERM"); - } catch { - // Process group doesn't exist or we lack permission - try direct +function killProcessTreeUnix(pid: number, graceMs: number, useGroupKill: boolean): void { + // Step 1: Try graceful SIGTERM. Prefer process-group kill (`-pid`) when the + // child was spawned detached so it has its own group; otherwise stick to the + // direct pid to avoid SIGTERMing our own process group (the gateway). (#71662) + if (useGroupKill) { + try { + process.kill(-pid, "SIGTERM"); + } catch { + // Process group doesn't exist or we lack permission - try direct + try { + process.kill(pid, "SIGTERM"); + } catch { + // Already gone + return; + } + } + } else { try { process.kill(pid, "SIGTERM"); } catch { @@ -59,7 +78,7 @@ function killProcessTreeUnix(pid: number, graceMs: number): void { // Step 2: Wait grace period, then SIGKILL if still alive setTimeout(() => { - if (isProcessAlive(-pid)) { + if (useGroupKill && isProcessAlive(-pid)) { try { process.kill(-pid, "SIGKILL"); return; diff --git a/src/process/supervisor/adapters/child.test.ts b/src/process/supervisor/adapters/child.test.ts index a3007b4da32..cccaccf3239 100644 --- a/src/process/supervisor/adapters/child.test.ts +++ b/src/process/supervisor/adapters/child.test.ts @@ -122,10 +122,46 @@ describe("createChildAdapter", () => { adapter.kill(); - expect(killProcessTreeMock).toHaveBeenCalledWith(4321); + // Detachment flag is now passed to killProcessTree so it knows whether + // it can safely group-kill via -pid. (#71662) + const expectedDetached = process.platform !== "win32" && !process.env.OPENCLAW_SERVICE_MARKER; + expect(killProcessTreeMock).toHaveBeenCalledWith(4321, { detached: expectedDetached }); expect(killMock).toHaveBeenCalledWith("SIGKILL"); }); + it("passes detached:false to killProcessTree when spawn fell back to no-detach (#71662 follow-up)", async () => { + // Simulate the fallback scenario: spawnWithFallback retried with + // detached:false because the initial detached spawn failed. The kill + // closure must NOT group-kill since the child shares the gateway's group. + const { child, killMock } = createStubChild(8888); + spawnWithFallbackMock.mockResolvedValue({ + child, + usedFallback: true, + fallbackLabel: "no-detach", + }); + const adapter = await createChildAdapter({ + argv: ["node", "-e", "setTimeout(() => {}, 1000)"], + stdinMode: "pipe-open", + }); + + adapter.kill(); + + expect(killProcessTreeMock).toHaveBeenCalledWith(8888, { detached: false }); + expect(killMock).toHaveBeenCalledWith("SIGKILL"); + }); + + it("passes detached:false in service-managed mode where useDetached is false from the start (#71662)", async () => { + process.env.OPENCLAW_SERVICE_MARKER = "1"; + try { + const { adapter, killMock } = await createAdapterHarness({ pid: 9999 }); + adapter.kill(); + expect(killProcessTreeMock).toHaveBeenCalledWith(9999, { detached: false }); + expect(killMock).toHaveBeenCalledWith("SIGKILL"); + } finally { + delete process.env.OPENCLAW_SERVICE_MARKER; + } + }); + it("uses direct child.kill for non-SIGKILL signals", async () => { const { adapter, killMock } = await createAdapterHarness({ pid: 7654 }); diff --git a/src/process/supervisor/adapters/child.ts b/src/process/supervisor/adapters/child.ts index 0d71f30e8de..efeed87d245 100644 --- a/src/process/supervisor/adapters/child.ts +++ b/src/process/supervisor/adapters/child.ts @@ -283,11 +283,20 @@ export async function createChildAdapter(params: { return waitPromise; }; + // The actual detachment of the spawned child can differ from `useDetached`: + // when the detached spawn fails, `spawnWithFallback` retries with the + // `no-detach` fallback (detached:false). In that case the child shares the + // gateway's process group regardless of intent, so the kill must avoid + // group-kill. (#71662 follow-up — caught by Greptile review) + const childIsDetached = useDetached && !spawned.usedFallback; const kill = (signal?: NodeJS.Signals) => { const pid = child.pid ?? undefined; if (signal === undefined || signal === "SIGKILL") { if (pid) { - killProcessTree(pid); + // Pass through whether the child is actually detached. Without this, + // `killProcessTree` group-kills via `-pid` and takes out the gateway's + // own process group along with the child. (#71662) + killProcessTree(pid, { detached: childIsDetached }); } try { child.kill("SIGKILL");