fix: codex and similar processes keep dying on pty, solved by refactoring process spawning (#14257)

* exec: clean up PTY resources on timeout and exit

* cli: harden resume cleanup and watchdog stalled runs

* cli: productionize PTY and resume reliability paths

* docs: add PTY process supervision architecture plan

* docs: rewrite PTY supervision plan as pre-rewrite baseline

* docs: switch PTY supervision plan to one-go execution

* docs: add one-line root cause to PTY supervision plan

* docs: add OS contracts and test matrix to PTY supervision plan

* docs: define process-supervisor package placement and scope

* docs: tie supervisor plan to existing CI lanes

* docs: place PTY supervisor plan under src/process

* refactor(process): route exec and cli runs through supervisor

* docs(process): refresh PTY supervision plan

* wip

* fix(process): harden supervisor timeout and PTY termination

* fix(process): harden supervisor adapters env and wait handling

* ci: avoid failing formal conformance on comment permissions

* test(ui): fix cron request mock argument typing

* fix(ui): remove leftover conflict marker

* fix: supervise PTY processes (#14257) (openclaw#14257) (thanks @onutc)
This commit is contained in:
Onur
2026-02-16 09:32:05 +08:00
committed by GitHub
parent a73e7786e7
commit cd44a0d01e
32 changed files with 2759 additions and 855 deletions

View File

@@ -1,17 +1,17 @@
import type { AgentToolResult } from "@mariozechner/pi-agent-core";
import type { ChildProcessWithoutNullStreams } from "node:child_process";
import { Type } from "@sinclair/typebox";
import path from "node:path";
import type { ExecAsk, ExecHost, ExecSecurity } from "../infra/exec-approvals.js";
import type { ProcessSession, SessionStdin } from "./bash-process-registry.js";
import type { ProcessSession } from "./bash-process-registry.js";
import type { ExecToolDetails } from "./bash-tools.exec.js";
import type { BashSandboxConfig } from "./bash-tools.shared.js";
import { requestHeartbeatNow } from "../infra/heartbeat-wake.js";
import { mergePathPrepend } from "../infra/path-prepend.js";
import { enqueueSystemEvent } from "../infra/system-events.js";
export { applyPathPrepend, normalizePathPrepend } from "../infra/path-prepend.js";
import type { ManagedRun } from "../process/supervisor/index.js";
import { logWarn } from "../logger.js";
import { formatSpawnError, spawnWithFallback } from "../process/spawn-utils.js";
import { getProcessSupervisor } from "../process/supervisor/index.js";
import {
addSession,
appendOutput,
@@ -23,7 +23,6 @@ import {
buildDockerExecArgs,
chunkString,
clampWithDefault,
killSession,
readEnvInt,
} from "./bash-tools.shared.js";
import { buildCursorPositionResponse, stripDsrRequests } from "./pty-dsr.js";
@@ -147,26 +146,6 @@ export const execSchema = Type.Object({
),
});
type PtyExitEvent = { exitCode: number; signal?: number };
type PtyListener<T> = (event: T) => void;
type PtyHandle = {
pid: number;
write: (data: string | Buffer) => void;
onData: (listener: PtyListener<string>) => void;
onExit: (listener: PtyListener<PtyExitEvent>) => void;
};
type PtySpawn = (
file: string,
args: string[] | string,
options: {
name?: string;
cols?: number;
rows?: number;
cwd?: string;
env?: Record<string, string>;
},
) => PtyHandle;
export type ExecProcessOutcome = {
status: "completed" | "failed";
exitCode: number | null;
@@ -319,138 +298,10 @@ export async function runExecProcess(opts: {
}): Promise<ExecProcessHandle> {
const startedAt = Date.now();
const sessionId = createSessionSlug();
let child: ChildProcessWithoutNullStreams | null = null;
let pty: PtyHandle | null = null;
let stdin: SessionStdin | undefined;
const execCommand = opts.execCommand ?? opts.command;
const supervisor = getProcessSupervisor();
const spawnFallbacks = [
{
label: "no-detach",
options: { detached: false },
},
];
const handleSpawnFallback = (err: unknown, fallback: { label: string }) => {
const errText = formatSpawnError(err);
const warning = `Warning: spawn failed (${errText}); retrying with ${fallback.label}.`;
logWarn(`exec: spawn failed (${errText}); retrying with ${fallback.label}.`);
opts.warnings.push(warning);
};
const spawnShellChild = async (
shell: string,
shellArgs: string[],
): Promise<ChildProcessWithoutNullStreams> => {
const { child: spawned } = await spawnWithFallback({
argv: [shell, ...shellArgs, execCommand],
options: {
cwd: opts.workdir,
env: opts.env,
detached: process.platform !== "win32",
stdio: ["pipe", "pipe", "pipe"],
windowsHide: true,
},
fallbacks: spawnFallbacks,
onFallback: handleSpawnFallback,
});
return spawned as ChildProcessWithoutNullStreams;
};
// `exec` does not currently accept tool-provided stdin content. For non-PTY runs,
// keeping stdin open can cause commands like `wc -l` (or safeBins-hardened segments)
// to block forever waiting for input, leading to accidental backgrounding.
// For interactive flows, callers should use `pty: true` (stdin kept open).
const maybeCloseNonPtyStdin = () => {
if (opts.usePty) {
return;
}
try {
// Signal EOF immediately so stdin-only commands can terminate.
child?.stdin?.end();
} catch {
// ignore stdin close errors
}
};
if (opts.sandbox) {
const { child: spawned } = await spawnWithFallback({
argv: [
"docker",
...buildDockerExecArgs({
containerName: opts.sandbox.containerName,
command: execCommand,
workdir: opts.containerWorkdir ?? opts.sandbox.containerWorkdir,
env: opts.env,
tty: opts.usePty,
}),
],
options: {
cwd: opts.workdir,
env: process.env,
detached: process.platform !== "win32",
stdio: ["pipe", "pipe", "pipe"],
windowsHide: true,
},
fallbacks: spawnFallbacks,
onFallback: handleSpawnFallback,
});
child = spawned as ChildProcessWithoutNullStreams;
stdin = child.stdin;
maybeCloseNonPtyStdin();
} else if (opts.usePty) {
const { shell, args: shellArgs } = getShellConfig();
try {
const ptyModule = (await import("@lydell/node-pty")) as unknown as {
spawn?: PtySpawn;
default?: { spawn?: PtySpawn };
};
const spawnPty = ptyModule.spawn ?? ptyModule.default?.spawn;
if (!spawnPty) {
throw new Error("PTY support is unavailable (node-pty spawn not found).");
}
pty = spawnPty(shell, [...shellArgs, execCommand], {
cwd: opts.workdir,
env: opts.env,
name: process.env.TERM ?? "xterm-256color",
cols: 120,
rows: 30,
});
stdin = {
destroyed: false,
write: (data, cb) => {
try {
pty?.write(data);
cb?.(null);
} catch (err) {
cb?.(err as Error);
}
},
end: () => {
try {
const eof = process.platform === "win32" ? "\x1a" : "\x04";
pty?.write(eof);
} catch {
// ignore EOF errors
}
},
};
} catch (err) {
const errText = String(err);
const warning = `Warning: PTY spawn failed (${errText}); retrying without PTY for \`${opts.command}\`.`;
logWarn(`exec: PTY spawn failed (${errText}); retrying without PTY for "${opts.command}".`);
opts.warnings.push(warning);
child = await spawnShellChild(shell, shellArgs);
stdin = child.stdin;
}
} else {
const { shell, args: shellArgs } = getShellConfig();
child = await spawnShellChild(shell, shellArgs);
stdin = child.stdin;
maybeCloseNonPtyStdin();
}
const session = {
const session: ProcessSession = {
id: sessionId,
command: opts.command,
scopeKey: opts.scopeKey,
@@ -458,9 +309,9 @@ export async function runExecProcess(opts: {
notifyOnExit: opts.notifyOnExit,
notifyOnExitEmptySuccess: opts.notifyOnExitEmptySuccess === true,
exitNotified: false,
child: child ?? undefined,
stdin,
pid: child?.pid ?? pty?.pid,
child: undefined,
stdin: undefined,
pid: undefined,
startedAt,
cwd: opts.workdir,
maxOutputChars: opts.maxOutput,
@@ -477,59 +328,9 @@ export async function runExecProcess(opts: {
exitSignal: undefined as NodeJS.Signals | number | null | undefined,
truncated: false,
backgrounded: false,
} satisfies ProcessSession;
};
addSession(session);
let settled = false;
let timeoutTimer: NodeJS.Timeout | null = null;
let timeoutFinalizeTimer: NodeJS.Timeout | null = null;
let timedOut = false;
const timeoutFinalizeMs = 1000;
let resolveFn: ((outcome: ExecProcessOutcome) => void) | null = null;
const settle = (outcome: ExecProcessOutcome) => {
if (settled) {
return;
}
settled = true;
resolveFn?.(outcome);
};
const finalizeTimeout = () => {
if (session.exited) {
return;
}
markExited(session, null, "SIGKILL", "failed");
maybeNotifyOnExit(session, "failed");
const aggregated = session.aggregated.trim();
const reason = `Command timed out after ${opts.timeoutSec} seconds`;
settle({
status: "failed",
exitCode: null,
exitSignal: "SIGKILL",
durationMs: Date.now() - startedAt,
aggregated,
timedOut: true,
reason: aggregated ? `${aggregated}\n\n${reason}` : reason,
});
};
const onTimeout = () => {
timedOut = true;
killSession(session);
if (!timeoutFinalizeTimer) {
timeoutFinalizeTimer = setTimeout(() => {
finalizeTimeout();
}, timeoutFinalizeMs);
}
};
if (opts.timeoutSec > 0) {
timeoutTimer = setTimeout(() => {
onTimeout();
}, opts.timeoutSec * 1000);
}
const emitUpdate = () => {
if (!opts.onUpdate) {
return;
@@ -565,116 +366,208 @@ export async function runExecProcess(opts: {
}
};
if (pty) {
const cursorResponse = buildCursorPositionResponse();
pty.onData((data) => {
const raw = data.toString();
const { cleaned, requests } = stripDsrRequests(raw);
if (requests > 0) {
const timeoutMs =
typeof opts.timeoutSec === "number" && opts.timeoutSec > 0
? Math.floor(opts.timeoutSec * 1000)
: undefined;
const spawnSpec:
| {
mode: "child";
argv: string[];
env: NodeJS.ProcessEnv;
stdinMode: "pipe-open" | "pipe-closed";
}
| {
mode: "pty";
ptyCommand: string;
childFallbackArgv: string[];
env: NodeJS.ProcessEnv;
stdinMode: "pipe-open";
} = (() => {
if (opts.sandbox) {
return {
mode: "child" as const,
argv: [
"docker",
...buildDockerExecArgs({
containerName: opts.sandbox.containerName,
command: execCommand,
workdir: opts.containerWorkdir ?? opts.sandbox.containerWorkdir,
env: opts.env,
tty: opts.usePty,
}),
],
env: process.env,
stdinMode: opts.usePty ? ("pipe-open" as const) : ("pipe-closed" as const),
};
}
const { shell, args: shellArgs } = getShellConfig();
const childArgv = [shell, ...shellArgs, execCommand];
if (opts.usePty) {
return {
mode: "pty" as const,
ptyCommand: execCommand,
childFallbackArgv: childArgv,
env: opts.env,
stdinMode: "pipe-open" as const,
};
}
return {
mode: "child" as const,
argv: childArgv,
env: opts.env,
stdinMode: "pipe-closed" as const,
};
})();
let managedRun: ManagedRun | null = null;
let usingPty = spawnSpec.mode === "pty";
const cursorResponse = buildCursorPositionResponse();
const onSupervisorStdout = (chunk: string) => {
if (usingPty) {
const { cleaned, requests } = stripDsrRequests(chunk);
if (requests > 0 && managedRun?.stdin) {
for (let i = 0; i < requests; i += 1) {
pty.write(cursorResponse);
managedRun.stdin.write(cursorResponse);
}
}
handleStdout(cleaned);
});
} else if (child) {
child.stdout.on("data", handleStdout);
child.stderr.on("data", handleStderr);
}
return;
}
handleStdout(chunk);
};
const promise = new Promise<ExecProcessOutcome>((resolve) => {
resolveFn = resolve;
const handleExit = (code: number | null, exitSignal: NodeJS.Signals | number | null) => {
if (timeoutTimer) {
clearTimeout(timeoutTimer);
}
if (timeoutFinalizeTimer) {
clearTimeout(timeoutFinalizeTimer);
try {
const spawnBase = {
runId: sessionId,
sessionId: opts.sessionKey?.trim() || sessionId,
backendId: opts.sandbox ? "exec-sandbox" : "exec-host",
scopeKey: opts.scopeKey,
cwd: opts.workdir,
env: spawnSpec.env,
timeoutMs,
captureOutput: false,
onStdout: onSupervisorStdout,
onStderr: handleStderr,
};
managedRun =
spawnSpec.mode === "pty"
? await supervisor.spawn({
...spawnBase,
mode: "pty",
ptyCommand: spawnSpec.ptyCommand,
})
: await supervisor.spawn({
...spawnBase,
mode: "child",
argv: spawnSpec.argv,
stdinMode: spawnSpec.stdinMode,
});
} catch (err) {
if (spawnSpec.mode === "pty") {
const warning = `Warning: PTY spawn failed (${String(err)}); retrying without PTY for \`${opts.command}\`.`;
logWarn(
`exec: PTY spawn failed (${String(err)}); retrying without PTY for "${opts.command}".`,
);
opts.warnings.push(warning);
usingPty = false;
try {
managedRun = await supervisor.spawn({
runId: sessionId,
sessionId: opts.sessionKey?.trim() || sessionId,
backendId: "exec-host",
scopeKey: opts.scopeKey,
mode: "child",
argv: spawnSpec.childFallbackArgv,
cwd: opts.workdir,
env: spawnSpec.env,
stdinMode: "pipe-open",
timeoutMs,
captureOutput: false,
onStdout: handleStdout,
onStderr: handleStderr,
});
} catch (retryErr) {
markExited(session, null, null, "failed");
maybeNotifyOnExit(session, "failed");
throw retryErr;
}
} else {
markExited(session, null, null, "failed");
maybeNotifyOnExit(session, "failed");
throw err;
}
}
session.stdin = managedRun.stdin;
session.pid = managedRun.pid;
const promise = managedRun
.wait()
.then((exit): ExecProcessOutcome => {
const durationMs = Date.now() - startedAt;
const wasSignal = exitSignal != null;
const isSuccess = code === 0 && !wasSignal && !timedOut;
const status: "completed" | "failed" = isSuccess ? "completed" : "failed";
markExited(session, code, exitSignal, status);
const status: "completed" | "failed" =
exit.exitCode === 0 && exit.reason === "exit" ? "completed" : "failed";
markExited(session, exit.exitCode, exit.exitSignal, status);
maybeNotifyOnExit(session, status);
if (!session.child && session.stdin) {
session.stdin.destroyed = true;
}
if (settled) {
return;
}
const aggregated = session.aggregated.trim();
if (!isSuccess) {
const reason = timedOut
? `Command timed out after ${opts.timeoutSec} seconds`
: wasSignal && exitSignal
? `Command aborted by signal ${exitSignal}`
: code === null
? "Command aborted before exit code was captured"
: `Command exited with code ${code}`;
const message = aggregated ? `${aggregated}\n\n${reason}` : reason;
settle({
status: "failed",
exitCode: code ?? null,
exitSignal: exitSignal ?? null,
if (status === "completed") {
return {
status: "completed",
exitCode: exit.exitCode ?? 0,
exitSignal: exit.exitSignal,
durationMs,
aggregated,
timedOut,
reason: message,
});
return;
timedOut: false,
};
}
settle({
status: "completed",
exitCode: code ?? 0,
exitSignal: exitSignal ?? null,
const reason =
exit.reason === "overall-timeout"
? `Command timed out after ${opts.timeoutSec} seconds`
: exit.reason === "no-output-timeout"
? "Command timed out waiting for output"
: exit.exitSignal != null
? `Command aborted by signal ${exit.exitSignal}`
: exit.exitCode == null
? "Command aborted before exit code was captured"
: `Command exited with code ${exit.exitCode}`;
return {
status: "failed",
exitCode: exit.exitCode,
exitSignal: exit.exitSignal,
durationMs,
aggregated,
timedOut: exit.timedOut,
reason: aggregated ? `${aggregated}\n\n${reason}` : reason,
};
})
.catch((err): ExecProcessOutcome => {
markExited(session, null, null, "failed");
maybeNotifyOnExit(session, "failed");
const aggregated = session.aggregated.trim();
const message = aggregated ? `${aggregated}\n\n${String(err)}` : String(err);
return {
status: "failed",
exitCode: null,
exitSignal: null,
durationMs: Date.now() - startedAt,
aggregated,
timedOut: false,
});
};
if (pty) {
pty.onExit((event) => {
const rawSignal = event.signal ?? null;
const normalizedSignal = rawSignal === 0 ? null : rawSignal;
handleExit(event.exitCode ?? null, normalizedSignal);
});
} else if (child) {
child.once("close", (code, exitSignal) => {
handleExit(code, exitSignal);
});
child.once("error", (err) => {
if (timeoutTimer) {
clearTimeout(timeoutTimer);
}
if (timeoutFinalizeTimer) {
clearTimeout(timeoutFinalizeTimer);
}
markExited(session, null, null, "failed");
maybeNotifyOnExit(session, "failed");
const aggregated = session.aggregated.trim();
const message = aggregated ? `${aggregated}\n\n${String(err)}` : String(err);
settle({
status: "failed",
exitCode: null,
exitSignal: null,
durationMs: Date.now() - startedAt,
aggregated,
timedOut,
reason: message,
});
});
}
});
reason: message,
};
});
return {
session,
startedAt,
pid: session.pid ?? undefined,
promise,
kill: () => killSession(session),
kill: () => {
managedRun?.cancel("manual-cancel");
},
};
}

View File

@@ -0,0 +1,73 @@
import { afterEach, expect, test, vi } from "vitest";
import { resetProcessRegistryForTests } from "./bash-process-registry";
// Per-test teardown. Calls resetProcessRegistryForTests() (presumably empties the
// session registry — confirm in bash-process-registry), drops the module cache so the
// next dynamic `import("./bash-tools.exec")` is evaluated against that test's own
// `vi.doMock` factory, and clears recorded mock calls.
afterEach(() => {
resetProcessRegistryForTests();
vi.resetModules();
vi.clearAllMocks();
});
test("exec disposes PTY listeners after normal exit", async () => {
  const dataDisposer = vi.fn();
  const exitDisposer = vi.fn();
  type FakeExitEvent = { exitCode: number; signal?: number };

  // Fake PTY handle: delivers a single "ok" chunk and then a clean exit (code 0),
  // each on its own zero-delay timer, and hands back the disposer spies.
  const spawnFakePty = () => ({
    pid: 0,
    write: vi.fn(),
    onData: (listener: (value: string) => void) => {
      setTimeout(() => listener("ok"), 0);
      return { dispose: dataDisposer };
    },
    onExit: (listener: (event: FakeExitEvent) => void) => {
      setTimeout(() => listener({ exitCode: 0 }), 0);
      return { dispose: exitDisposer };
    },
    kill: vi.fn(),
  });
  vi.doMock("@lydell/node-pty", () => ({ spawn: spawnFakePty }));

  // Import after doMock so the exec tool resolves the fake PTY module.
  const execModule = await import("./bash-tools.exec");
  const execTool = execModule.createExecTool({ allowBackground: false });
  const outcome = await execTool.execute("toolcall", {
    command: "echo ok",
    pty: true,
  });

  expect(outcome.details.status).toBe("completed");
  // Both subscriptions must be released exactly once after the run ends.
  expect(dataDisposer).toHaveBeenCalledTimes(1);
  expect(exitDisposer).toHaveBeenCalledTimes(1);
});
test("exec tears down PTY resources on timeout", async () => {
  const dataDisposer = vi.fn();
  const exitDisposer = vi.fn();
  const killSpy = vi.fn();

  // Fake PTY that never emits data and never exits, so only the timeout can end the run.
  const spawnSilentPty = () => ({
    pid: 0,
    write: vi.fn(),
    onData: () => ({ dispose: dataDisposer }),
    onExit: () => ({ dispose: exitDisposer }),
    kill: killSpy,
  });
  vi.doMock("@lydell/node-pty", () => ({ spawn: spawnSilentPty }));

  // Import after doMock so the exec tool resolves the fake PTY module.
  const execModule = await import("./bash-tools.exec");
  const execTool = execModule.createExecTool({ allowBackground: false });
  const pendingRun = execTool.execute("toolcall", {
    command: "sleep 5",
    pty: true,
    timeout: 0.01,
  });

  await expect(pendingRun).rejects.toThrow("Command timed out");
  // Timeout handling must kill the PTY once and release both listeners once.
  expect(killSpy).toHaveBeenCalledTimes(1);
  expect(dataDisposer).toHaveBeenCalledTimes(1);
  expect(exitDisposer).toHaveBeenCalledTimes(1);
});

View File

@@ -0,0 +1,40 @@
import { afterEach, expect, test, vi } from "vitest";
import { listRunningSessions, resetProcessRegistryForTests } from "./bash-process-registry";
// Spy for supervisor.spawn. Created through vi.hoisted so it already exists when the
// hoisted vi.mock factory below is evaluated.
const { supervisorSpawnMock } = vi.hoisted(() => ({
supervisorSpawnMock: vi.fn(),
}));
// Replace the process supervisor module: spawn() forwards to the shared spy so each
// test can script its outcomes; the remaining methods are fresh no-op spies created
// on every getProcessSupervisor() call.
vi.mock("../process/supervisor/index.js", () => ({
getProcessSupervisor: () => ({
spawn: (...args: unknown[]) => supervisorSpawnMock(...args),
cancel: vi.fn(),
cancelScope: vi.fn(),
reconcileOrphans: vi.fn(),
getRecord: vi.fn(),
}),
}));
// Per-test teardown. Calls resetProcessRegistryForTests() (presumably empties the
// session registry — confirm in bash-process-registry), drops the module cache so the
// next dynamic import re-evaluates "./bash-tools.exec", and clears recorded mock calls.
afterEach(() => {
resetProcessRegistryForTests();
vi.resetModules();
vi.clearAllMocks();
});
test("exec cleans session state when PTY fallback spawn also fails", async () => {
  // Two scripted rejections: the first supervisor.spawn call (the PTY attempt) and
  // the second one (the non-PTY retry) both fail.
  supervisorSpawnMock.mockRejectedValueOnce(new Error("pty spawn failed"));
  supervisorSpawnMock.mockRejectedValueOnce(new Error("child fallback failed"));

  const execModule = await import("./bash-tools.exec");
  const execTool = execModule.createExecTool({ allowBackground: false });
  const pendingRun = execTool.execute("toolcall", {
    command: "echo ok",
    pty: true,
  });

  // The error from the retry is the one surfaced to the caller.
  await expect(pendingRun).rejects.toThrow("child fallback failed");
  // No session may be left registered as running after the double failure.
  expect(listRunningSessions()).toHaveLength(0);
});

View File

@@ -0,0 +1,152 @@
import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
import type { ProcessSession } from "./bash-process-registry.js";
import {
addSession,
getFinishedSession,
getSession,
resetProcessRegistryForTests,
} from "./bash-process-registry.js";
import { createProcessTool } from "./bash-tools.process.js";
// Supervisor double shared by every test in this file. Built inside vi.hoisted so it
// already exists when the hoisted vi.mock factories below are evaluated.
const { supervisorMock } = vi.hoisted(() => ({
supervisorMock: {
spawn: vi.fn(),
cancel: vi.fn(),
cancelScope: vi.fn(),
reconcileOrphans: vi.fn(),
getRecord: vi.fn(),
},
}));
// Spy standing in for killProcessTree; hoisted for the same reason.
const { killProcessTreeMock } = vi.hoisted(() => ({
killProcessTreeMock: vi.fn(),
}));
// Every getProcessSupervisor() call returns the same supervisorMock instance.
vi.mock("../process/supervisor/index.js", () => ({
getProcessSupervisor: () => supervisorMock,
}));
// Forward killProcessTree calls to the spy so tests can assert on its arguments.
vi.mock("../process/kill-tree.js", () => ({
killProcessTree: (...args: unknown[]) => killProcessTreeMock(...args),
}));
/**
 * Builds a registry session fixture for a still-running, backgrounded `sleep 999`.
 *
 * @param id  Session id to assign.
 * @param pid Optional process id; left `undefined` when omitted.
 * @returns A session with empty output buffers, `exited: false` and `backgrounded: true`.
 */
function createBackgroundSession(id: string, pid?: number): ProcessSession {
  // What was launched, when, and where.
  const launch = {
    id,
    command: "sleep 999",
    startedAt: Date.now(),
    cwd: "/tmp",
  };
  // Output limits and (empty) buffers.
  const output = {
    maxOutputChars: 10_000,
    pendingMaxOutputChars: 30_000,
    totalOutputChars: 0,
    pendingStdout: [],
    pendingStderr: [],
    pendingStdoutChars: 0,
    pendingStderrChars: 0,
    aggregated: "",
    tail: "",
  };
  // Process identity and lifecycle flags.
  const lifecycle = {
    pid,
    exited: false,
    exitCode: undefined,
    exitSignal: undefined,
    truncated: false,
    backgrounded: true,
  };
  return { ...launch, ...output, ...lifecycle };
}
describe("process tool supervisor cancellation", () => {
  // Supervisor record for a run that has not exited yet.
  const runningRecord = (runId: string) => ({ runId, state: "running" });

  // Expected shape of the first content item in a tool result.
  const textItem = (text: string) => ({ type: "text", text });

  // Registers the session, then drives a freshly created process tool with `action`.
  const runAction = async (action: "kill" | "remove", session: ProcessSession) => {
    addSession(session);
    const processTool = createProcessTool();
    return processTool.execute("toolcall", { action, sessionId: session.id });
  };

  beforeEach(() => {
    // Wipe call history and scripted return values on every spy.
    for (const spy of [...Object.values(supervisorMock), killProcessTreeMock]) {
      spy.mockReset();
    }
  });

  afterEach(() => {
    resetProcessRegistryForTests();
  });

  it("routes kill through supervisor when run is managed", async () => {
    supervisorMock.getRecord.mockReturnValue(runningRecord("sess"));

    const result = await runAction("kill", createBackgroundSession("sess"));

    expect(supervisorMock.cancel).toHaveBeenCalledWith("sess", "manual-cancel");
    // Only a cancellation request was issued: the session stays registered and un-exited.
    const tracked = getSession("sess");
    expect(tracked).toBeDefined();
    expect(tracked?.exited).toBe(false);
    expect(result.content[0]).toMatchObject(textItem("Termination requested for session sess."));
  });

  it("remove drops running session immediately when cancellation is requested", async () => {
    supervisorMock.getRecord.mockReturnValue(runningRecord("sess"));

    const result = await runAction("remove", createBackgroundSession("sess"));

    expect(supervisorMock.cancel).toHaveBeenCalledWith("sess", "manual-cancel");
    // The session is gone from both the running and the finished views.
    expect(getSession("sess")).toBeUndefined();
    expect(getFinishedSession("sess")).toBeUndefined();
    expect(result.content[0]).toMatchObject(
      textItem("Removed session sess (termination requested)."),
    );
  });

  it("falls back to process-tree kill when supervisor record is missing", async () => {
    supervisorMock.getRecord.mockReturnValue(undefined);

    const result = await runAction("kill", createBackgroundSession("sess-fallback", 4242));

    expect(killProcessTreeMock).toHaveBeenCalledWith(4242);
    // The fallback path marks the session exited, moving it to the finished view.
    expect(getSession("sess-fallback")).toBeUndefined();
    expect(getFinishedSession("sess-fallback")).toBeDefined();
    expect(result.content[0]).toMatchObject(textItem("Killed session sess-fallback."));
  });

  it("fails remove when no supervisor record and no pid is available", async () => {
    supervisorMock.getRecord.mockReturnValue(undefined);

    const result = await runAction("remove", createBackgroundSession("sess-no-pid"));

    expect(killProcessTreeMock).not.toHaveBeenCalled();
    // Nothing could be terminated, so the session must remain registered.
    expect(getSession("sess-no-pid")).toBeDefined();
    expect(result.details).toMatchObject({ status: "failed" });
    expect(result.content[0]).toMatchObject(
      textItem("Unable to remove session sess-no-pid: no active supervisor run or process id."),
    );
  });
});

View File

@@ -1,7 +1,10 @@
import type { AgentTool, AgentToolResult } from "@mariozechner/pi-agent-core";
import { Type } from "@sinclair/typebox";
import { formatDurationCompact } from "../infra/format-time/format-duration.ts";
import { killProcessTree } from "../process/kill-tree.js";
import { getProcessSupervisor } from "../process/supervisor/index.js";
import {
type ProcessSession,
deleteSession,
drainSession,
getFinishedSession,
@@ -11,13 +14,7 @@ import {
markExited,
setJobTtlMs,
} from "./bash-process-registry.js";
import {
deriveSessionName,
killSession,
pad,
sliceLogLines,
truncateMiddle,
} from "./bash-tools.shared.js";
import { deriveSessionName, pad, sliceLogLines, truncateMiddle } from "./bash-tools.shared.js";
import { encodeKeySequence, encodePaste } from "./pty-keys.js";
export type ProcessToolDefaults = {
@@ -107,9 +104,28 @@ export function createProcessTool(
setJobTtlMs(defaults.cleanupMs);
}
const scopeKey = defaults?.scopeKey;
const supervisor = getProcessSupervisor();
const isInScope = (session?: { scopeKey?: string } | null) =>
!scopeKey || session?.scopeKey === scopeKey;
// Requests a "manual-cancel" from the supervisor for the run with this id.
// Returns true when a record exists and is not yet in the "exited" state (the
// cancellation is only requested here), false when there is nothing to cancel.
const cancelManagedSession = (sessionId: string) => {
  const managed = supervisor.getRecord(sessionId);
  if (managed && managed.state !== "exited") {
    supervisor.cancel(sessionId, "manual-cancel");
    return true;
  }
  return false;
};
// Fallback for sessions without a live supervisor record: kills the process tree
// by pid. Uses the session's own pid, else its child's pid. Returns false without
// killing anything when no finite, positive numeric pid is available.
const terminateSessionFallback = (session: ProcessSession) => {
  const targetPid = session.pid ?? session.child?.pid;
  if (typeof targetPid === "number" && Number.isFinite(targetPid) && targetPid > 0) {
    killProcessTree(targetPid);
    return true;
  }
  return false;
};
return {
name: "process",
label: "process",
@@ -523,10 +539,25 @@ export function createProcessTool(
if (!scopedSession.backgrounded) {
return failText(`Session ${params.sessionId} is not backgrounded.`);
}
killSession(scopedSession);
markExited(scopedSession, null, "SIGKILL", "failed");
const canceled = cancelManagedSession(scopedSession.id);
if (!canceled) {
const terminated = terminateSessionFallback(scopedSession);
if (!terminated) {
return failText(
`Unable to terminate session ${params.sessionId}: no active supervisor run or process id.`,
);
}
markExited(scopedSession, null, "SIGKILL", "failed");
}
return {
content: [{ type: "text", text: `Killed session ${params.sessionId}.` }],
content: [
{
type: "text",
text: canceled
? `Termination requested for session ${params.sessionId}.`
: `Killed session ${params.sessionId}.`,
},
],
details: {
status: "failed",
name: scopedSession ? deriveSessionName(scopedSession.command) : undefined,
@@ -555,10 +586,30 @@ export function createProcessTool(
case "remove": {
if (scopedSession) {
killSession(scopedSession);
markExited(scopedSession, null, "SIGKILL", "failed");
const canceled = cancelManagedSession(scopedSession.id);
if (canceled) {
// Keep remove semantics deterministic: drop from process registry now.
scopedSession.backgrounded = false;
deleteSession(params.sessionId);
} else {
const terminated = terminateSessionFallback(scopedSession);
if (!terminated) {
return failText(
`Unable to remove session ${params.sessionId}: no active supervisor run or process id.`,
);
}
markExited(scopedSession, null, "SIGKILL", "failed");
deleteSession(params.sessionId);
}
return {
content: [{ type: "text", text: `Removed session ${params.sessionId}.` }],
content: [
{
type: "text",
text: canceled
? `Removed session ${params.sessionId} (termination requested).`
: `Removed session ${params.sessionId}.`,
},
],
details: {
status: "failed",
name: scopedSession ? deriveSessionName(scopedSession.command) : undefined,

View File

@@ -1,11 +1,9 @@
import type { ChildProcessWithoutNullStreams } from "node:child_process";
import { existsSync, statSync } from "node:fs";
import fs from "node:fs/promises";
import { homedir } from "node:os";
import path from "node:path";
import { sliceUtf16Safe } from "../utils.js";
import { assertSandboxPath } from "./sandbox-paths.js";
import { killProcessTree } from "./shell-utils.js";
const CHUNK_LIMIT = 8 * 1024;
@@ -115,13 +113,6 @@ export async function resolveSandboxWorkdir(params: {
}
}
export function killSession(session: { pid?: number; child?: ChildProcessWithoutNullStreams }) {
const pid = session.pid ?? session.child?.pid;
if (pid) {
killProcessTree(pid);
}
}
export function resolveWorkdir(workdir: string, warnings: string[]) {
const current = safeCwd();
const fallback = current ?? homedir();

View File

@@ -0,0 +1,36 @@
import { describe, expect, it } from "vitest";
import type { OpenClawConfig } from "../config/config.js";
import { resolveCliBackendConfig } from "./cli-backends.js";
describe("resolveCliBackendConfig reliability merge", () => {
  it("deep-merges reliability watchdog overrides for codex", () => {
    // User config overrides exactly one watchdog field of the codex backend.
    const cfg = {
      agents: {
        defaults: {
          cliBackends: {
            "codex-cli": {
              command: "codex",
              reliability: {
                watchdog: {
                  resume: {
                    noOutputTimeoutMs: 42_000,
                  },
                },
              },
            },
          },
        },
      },
    } satisfies OpenClawConfig;

    const resolved = resolveCliBackendConfig("codex-cli", cfg);
    expect(resolved).not.toBeNull();

    const watchdog = resolved?.config.reliability?.watchdog;
    const resume = watchdog?.resume;
    // The explicit override wins...
    expect(resume?.noOutputTimeoutMs).toBe(42_000);
    // ...while sibling fields and the untouched `fresh` profile keep their defaults.
    expect(resume?.noOutputTimeoutRatio).toBe(0.3);
    expect(resume?.minMs).toBe(60_000);
    expect(resume?.maxMs).toBe(180_000);
    expect(watchdog?.fresh?.noOutputTimeoutRatio).toBe(0.8);
  });
});

View File

@@ -1,5 +1,9 @@
import type { OpenClawConfig } from "../config/config.js";
import type { CliBackendConfig } from "../config/types.js";
import {
CLI_FRESH_WATCHDOG_DEFAULTS,
CLI_RESUME_WATCHDOG_DEFAULTS,
} from "./cli-watchdog-defaults.js";
import { normalizeProviderId } from "./model-selection.js";
export type ResolvedCliBackend = {
@@ -49,6 +53,12 @@ const DEFAULT_CLAUDE_BACKEND: CliBackendConfig = {
systemPromptMode: "append",
systemPromptWhen: "first",
clearEnv: ["ANTHROPIC_API_KEY", "ANTHROPIC_API_KEY_OLD"],
reliability: {
watchdog: {
fresh: { ...CLI_FRESH_WATCHDOG_DEFAULTS },
resume: { ...CLI_RESUME_WATCHDOG_DEFAULTS },
},
},
serialize: true,
};
@@ -73,6 +83,12 @@ const DEFAULT_CODEX_BACKEND: CliBackendConfig = {
sessionMode: "existing",
imageArg: "--image",
imageMode: "repeat",
reliability: {
watchdog: {
fresh: { ...CLI_FRESH_WATCHDOG_DEFAULTS },
resume: { ...CLI_RESUME_WATCHDOG_DEFAULTS },
},
},
serialize: true,
};
@@ -96,6 +112,10 @@ function mergeBackendConfig(base: CliBackendConfig, override?: CliBackendConfig)
if (!override) {
return { ...base };
}
const baseFresh = base.reliability?.watchdog?.fresh ?? {};
const baseResume = base.reliability?.watchdog?.resume ?? {};
const overrideFresh = override.reliability?.watchdog?.fresh ?? {};
const overrideResume = override.reliability?.watchdog?.resume ?? {};
return {
...base,
...override,
@@ -106,6 +126,22 @@ function mergeBackendConfig(base: CliBackendConfig, override?: CliBackendConfig)
sessionIdFields: override.sessionIdFields ?? base.sessionIdFields,
sessionArgs: override.sessionArgs ?? base.sessionArgs,
resumeArgs: override.resumeArgs ?? base.resumeArgs,
reliability: {
...base.reliability,
...override.reliability,
watchdog: {
...base.reliability?.watchdog,
...override.reliability?.watchdog,
fresh: {
...baseFresh,
...overrideFresh,
},
resume: {
...baseResume,
...overrideResume,
},
},
},
};
}

View File

@@ -3,50 +3,69 @@ import os from "node:os";
import path from "node:path";
import { beforeEach, describe, expect, it, vi } from "vitest";
import type { OpenClawConfig } from "../config/config.js";
import type { CliBackendConfig } from "../config/types.js";
import { runCliAgent } from "./cli-runner.js";
import { cleanupResumeProcesses, cleanupSuspendedCliProcesses } from "./cli-runner/helpers.js";
import { resolveCliNoOutputTimeoutMs } from "./cli-runner/helpers.js";
const runCommandWithTimeoutMock = vi.fn();
const runExecMock = vi.fn();
const supervisorSpawnMock = vi.fn();
vi.mock("../process/exec.js", () => ({
runCommandWithTimeout: (...args: unknown[]) => runCommandWithTimeoutMock(...args),
runExec: (...args: unknown[]) => runExecMock(...args),
vi.mock("../process/supervisor/index.js", () => ({
getProcessSupervisor: () => ({
spawn: (...args: unknown[]) => supervisorSpawnMock(...args),
cancel: vi.fn(),
cancelScope: vi.fn(),
reconcileOrphans: vi.fn(),
getRecord: vi.fn(),
}),
}));
describe("runCliAgent resume cleanup", () => {
// Exit payload that the fake managed run's `wait()` resolves with (see createManagedRun).
// NOTE(review): presumably mirrors the supervisor's real run-exit shape — confirm
// against src/process/supervisor.
type MockRunExit = {
// Why the run ended; a normally finished process is reported as "exit".
reason:
| "manual-cancel"
| "overall-timeout"
| "no-output-timeout"
| "spawn-error"
| "signal"
| "exit";
// Process exit code, or null when none was captured.
exitCode: number | null;
// Terminating signal (name or number), or null when there was none.
exitSignal: NodeJS.Signals | number | null;
durationMs: number;
// Output text attached to the exit payload.
stdout: string;
stderr: string;
// Timeout flags — presumably paired with the "overall-timeout" / "no-output-timeout"
// reasons; TODO confirm.
timedOut: boolean;
noOutputTimedOut: boolean;
};
/**
 * Builds a fake managed run for the mocked supervisor's `spawn()` to resolve with.
 *
 * @param exit Payload that the run's `wait()` spy resolves with.
 * @param pid  Process id to report; defaults to 1234.
 * @returns A run-like object with a fixed run id, no stdin, and spy `wait`/`cancel`.
 */
function createManagedRun(exit: MockRunExit, pid = 1234) {
  const wait = vi.fn().mockResolvedValue(exit);
  const cancel = vi.fn();
  return {
    runId: "run-supervisor",
    pid,
    startedAtMs: Date.now(),
    stdin: undefined,
    wait,
    cancel,
  };
}
describe("runCliAgent with process supervisor", () => {
beforeEach(() => {
runCommandWithTimeoutMock.mockReset();
runExecMock.mockReset();
supervisorSpawnMock.mockReset();
});
it("kills stale resume processes for codex sessions", async () => {
const selfPid = process.pid;
runExecMock
.mockResolvedValueOnce({
stdout: " 1 999 S /bin/launchd\n",
it("runs CLI through supervisor and returns payload", async () => {
supervisorSpawnMock.mockResolvedValueOnce(
createManagedRun({
reason: "exit",
exitCode: 0,
exitSignal: null,
durationMs: 50,
stdout: "ok",
stderr: "",
}) // cleanupSuspendedCliProcesses (ps) — ppid 999 != selfPid, no match
.mockResolvedValueOnce({
stdout: [
` ${selfPid + 1} ${selfPid} codex exec resume thread-123 --color never --sandbox read-only --skip-git-repo-check`,
` ${selfPid + 2} 999 codex exec resume thread-123 --color never --sandbox read-only --skip-git-repo-check`,
].join("\n"),
stderr: "",
}) // cleanupResumeProcesses (ps)
.mockResolvedValueOnce({ stdout: "", stderr: "" }) // cleanupResumeProcesses (kill -TERM)
.mockResolvedValueOnce({ stdout: "", stderr: "" }); // cleanupResumeProcesses (kill -9)
runCommandWithTimeoutMock.mockResolvedValueOnce({
stdout: "ok",
stderr: "",
code: 0,
signal: null,
killed: false,
});
timedOut: false,
noOutputTimedOut: false,
}),
);
await runCliAgent({
const result = await runCliAgent({
sessionId: "s1",
sessionFile: "/tmp/session.jsonl",
workspaceDir: "/tmp",
@@ -58,28 +77,80 @@ describe("runCliAgent resume cleanup", () => {
cliSessionId: "thread-123",
});
if (process.platform === "win32") {
expect(runExecMock).not.toHaveBeenCalled();
return;
}
expect(result.payloads?.[0]?.text).toBe("ok");
expect(supervisorSpawnMock).toHaveBeenCalledTimes(1);
const input = supervisorSpawnMock.mock.calls[0]?.[0] as {
argv?: string[];
mode?: string;
timeoutMs?: number;
noOutputTimeoutMs?: number;
replaceExistingScope?: boolean;
scopeKey?: string;
};
expect(input.mode).toBe("child");
expect(input.argv?.[0]).toBe("codex");
expect(input.timeoutMs).toBe(1_000);
expect(input.noOutputTimeoutMs).toBeGreaterThanOrEqual(1_000);
expect(input.replaceExistingScope).toBe(true);
expect(input.scopeKey).toContain("thread-123");
});
expect(runExecMock).toHaveBeenCalledTimes(4);
it("fails with timeout when no-output watchdog trips", async () => {
supervisorSpawnMock.mockResolvedValueOnce(
createManagedRun({
reason: "no-output-timeout",
exitCode: null,
exitSignal: "SIGKILL",
durationMs: 200,
stdout: "",
stderr: "",
timedOut: true,
noOutputTimedOut: true,
}),
);
// Second call: cleanupResumeProcesses ps
const psCall = runExecMock.mock.calls[1] ?? [];
expect(psCall[0]).toBe("ps");
await expect(
runCliAgent({
sessionId: "s1",
sessionFile: "/tmp/session.jsonl",
workspaceDir: "/tmp",
prompt: "hi",
provider: "codex-cli",
model: "gpt-5.2-codex",
timeoutMs: 1_000,
runId: "run-2",
cliSessionId: "thread-123",
}),
).rejects.toThrow("produced no output");
});
// Third call: TERM, only the child PID
const termCall = runExecMock.mock.calls[2] ?? [];
expect(termCall[0]).toBe("kill");
const termArgs = termCall[1] as string[];
expect(termArgs).toEqual(["-TERM", String(selfPid + 1)]);
it("fails with timeout when overall timeout trips", async () => {
supervisorSpawnMock.mockResolvedValueOnce(
createManagedRun({
reason: "overall-timeout",
exitCode: null,
exitSignal: "SIGKILL",
durationMs: 200,
stdout: "",
stderr: "",
timedOut: true,
noOutputTimedOut: false,
}),
);
// Fourth call: KILL, only the child PID
const killCall = runExecMock.mock.calls[3] ?? [];
expect(killCall[0]).toBe("kill");
const killArgs = killCall[1] as string[];
expect(killArgs).toEqual(["-9", String(selfPid + 1)]);
await expect(
runCliAgent({
sessionId: "s1",
sessionFile: "/tmp/session.jsonl",
workspaceDir: "/tmp",
prompt: "hi",
provider: "codex-cli",
model: "gpt-5.2-codex",
timeoutMs: 1_000,
runId: "run-3",
cliSessionId: "thread-123",
}),
).rejects.toThrow("exceeded timeout");
});
it("falls back to per-agent workspace when workspaceDir is missing", async () => {
@@ -94,14 +165,18 @@ describe("runCliAgent resume cleanup", () => {
},
} satisfies OpenClawConfig;
runExecMock.mockResolvedValue({ stdout: "", stderr: "" });
runCommandWithTimeoutMock.mockResolvedValueOnce({
stdout: "ok",
stderr: "",
code: 0,
signal: null,
killed: false,
});
supervisorSpawnMock.mockResolvedValueOnce(
createManagedRun({
reason: "exit",
exitCode: 0,
exitSignal: null,
durationMs: 25,
stdout: "ok",
stderr: "",
timedOut: false,
noOutputTimedOut: false,
}),
);
try {
await runCliAgent({
@@ -114,264 +189,33 @@ describe("runCliAgent resume cleanup", () => {
provider: "codex-cli",
model: "gpt-5.2-codex",
timeoutMs: 1_000,
runId: "run-1",
runId: "run-4",
});
} finally {
await fs.rm(tempDir, { recursive: true, force: true });
}
const options = runCommandWithTimeoutMock.mock.calls[0]?.[1] as { cwd?: string };
expect(options.cwd).toBe(path.resolve(fallbackWorkspace));
const input = supervisorSpawnMock.mock.calls[0]?.[0] as { cwd?: string };
expect(input.cwd).toBe(path.resolve(fallbackWorkspace));
});
});
it("throws when sessionKey is malformed", async () => {
const tempDir = await fs.mkdtemp(path.join(os.tmpdir(), "openclaw-cli-runner-"));
const mainWorkspace = path.join(tempDir, "workspace-main");
const researchWorkspace = path.join(tempDir, "workspace-research");
await fs.mkdir(mainWorkspace, { recursive: true });
await fs.mkdir(researchWorkspace, { recursive: true });
const cfg = {
agents: {
defaults: {
workspace: mainWorkspace,
describe("resolveCliNoOutputTimeoutMs", () => {
it("uses backend-configured resume watchdog override", () => {
const timeoutMs = resolveCliNoOutputTimeoutMs({
backend: {
command: "codex",
reliability: {
watchdog: {
resume: {
noOutputTimeoutMs: 42_000,
},
},
},
list: [{ id: "research", workspace: researchWorkspace }],
},
} satisfies OpenClawConfig;
try {
await expect(
runCliAgent({
sessionId: "s1",
sessionKey: "agent::broken",
agentId: "research",
sessionFile: "/tmp/session.jsonl",
workspaceDir: undefined as unknown as string,
config: cfg,
prompt: "hi",
provider: "codex-cli",
model: "gpt-5.2-codex",
timeoutMs: 1_000,
runId: "run-2",
}),
).rejects.toThrow("Malformed agent session key");
} finally {
await fs.rm(tempDir, { recursive: true, force: true });
}
expect(runCommandWithTimeoutMock).not.toHaveBeenCalled();
});
});
describe("cleanupSuspendedCliProcesses", () => {
beforeEach(() => {
runExecMock.mockReset();
});
it("skips when no session tokens are configured", async () => {
await cleanupSuspendedCliProcesses(
{
command: "tool",
} as CliBackendConfig,
0,
);
if (process.platform === "win32") {
expect(runExecMock).not.toHaveBeenCalled();
return;
}
expect(runExecMock).not.toHaveBeenCalled();
});
it("matches sessionArg-based commands", async () => {
const selfPid = process.pid;
runExecMock
.mockResolvedValueOnce({
stdout: [
` 40 ${selfPid} T+ claude --session-id thread-1 -p`,
` 41 ${selfPid} S claude --session-id thread-2 -p`,
].join("\n"),
stderr: "",
})
.mockResolvedValueOnce({ stdout: "", stderr: "" });
await cleanupSuspendedCliProcesses(
{
command: "claude",
sessionArg: "--session-id",
} as CliBackendConfig,
0,
);
if (process.platform === "win32") {
expect(runExecMock).not.toHaveBeenCalled();
return;
}
expect(runExecMock).toHaveBeenCalledTimes(2);
const killCall = runExecMock.mock.calls[1] ?? [];
expect(killCall[0]).toBe("kill");
expect(killCall[1]).toEqual(["-9", "40"]);
});
it("matches resumeArgs with positional session id", async () => {
const selfPid = process.pid;
runExecMock
.mockResolvedValueOnce({
stdout: [
` 50 ${selfPid} T codex exec resume thread-99 --color never --sandbox read-only`,
` 51 ${selfPid} T codex exec resume other --color never --sandbox read-only`,
].join("\n"),
stderr: "",
})
.mockResolvedValueOnce({ stdout: "", stderr: "" });
await cleanupSuspendedCliProcesses(
{
command: "codex",
resumeArgs: ["exec", "resume", "{sessionId}", "--color", "never", "--sandbox", "read-only"],
} as CliBackendConfig,
1,
);
if (process.platform === "win32") {
expect(runExecMock).not.toHaveBeenCalled();
return;
}
expect(runExecMock).toHaveBeenCalledTimes(2);
const killCall = runExecMock.mock.calls[1] ?? [];
expect(killCall[0]).toBe("kill");
expect(killCall[1]).toEqual(["-9", "50", "51"]);
});
it("only kills child processes of current process (ppid validation)", async () => {
const selfPid = process.pid;
const childPid = selfPid + 1;
const unrelatedPid = 9999;
runExecMock
.mockResolvedValueOnce({
stdout: [
` ${childPid} ${selfPid} T claude --session-id thread-1 -p`,
` ${unrelatedPid} 100 T claude --session-id thread-2 -p`,
].join("\n"),
stderr: "",
})
.mockResolvedValueOnce({ stdout: "", stderr: "" });
await cleanupSuspendedCliProcesses(
{
command: "claude",
sessionArg: "--session-id",
} as CliBackendConfig,
0,
);
if (process.platform === "win32") {
expect(runExecMock).not.toHaveBeenCalled();
return;
}
expect(runExecMock).toHaveBeenCalledTimes(2);
const killCall = runExecMock.mock.calls[1] ?? [];
expect(killCall[0]).toBe("kill");
// Only childPid killed; unrelatedPid (ppid=100) excluded
expect(killCall[1]).toEqual(["-9", String(childPid)]);
});
it("skips all processes when none are children of current process", async () => {
runExecMock.mockResolvedValueOnce({
stdout: [
" 200 100 T claude --session-id thread-1 -p",
" 201 100 T claude --session-id thread-2 -p",
].join("\n"),
stderr: "",
timeoutMs: 120_000,
useResume: true,
});
await cleanupSuspendedCliProcesses(
{
command: "claude",
sessionArg: "--session-id",
} as CliBackendConfig,
0,
);
if (process.platform === "win32") {
expect(runExecMock).not.toHaveBeenCalled();
return;
}
// Only ps called — no kill because no matching ppid
expect(runExecMock).toHaveBeenCalledTimes(1);
});
});
describe("cleanupResumeProcesses", () => {
beforeEach(() => {
runExecMock.mockReset();
});
it("only kills resume processes owned by current process", async () => {
const selfPid = process.pid;
runExecMock
.mockResolvedValueOnce({
stdout: [
` ${selfPid + 1} ${selfPid} codex exec resume abc-123`,
` ${selfPid + 2} 999 codex exec resume abc-123`,
].join("\n"),
stderr: "",
})
.mockResolvedValueOnce({ stdout: "", stderr: "" })
.mockResolvedValueOnce({ stdout: "", stderr: "" });
await cleanupResumeProcesses(
{
command: "codex",
resumeArgs: ["exec", "resume", "{sessionId}"],
} as CliBackendConfig,
"abc-123",
);
if (process.platform === "win32") {
expect(runExecMock).not.toHaveBeenCalled();
return;
}
expect(runExecMock).toHaveBeenCalledTimes(3);
const termCall = runExecMock.mock.calls[1] ?? [];
expect(termCall[0]).toBe("kill");
expect(termCall[1]).toEqual(["-TERM", String(selfPid + 1)]);
const killCall = runExecMock.mock.calls[2] ?? [];
expect(killCall[0]).toBe("kill");
expect(killCall[1]).toEqual(["-9", String(selfPid + 1)]);
});
it("skips kill when no resume processes match ppid", async () => {
runExecMock.mockResolvedValueOnce({
stdout: [" 300 100 codex exec resume abc-123", " 301 200 codex exec resume abc-123"].join(
"\n",
),
stderr: "",
});
await cleanupResumeProcesses(
{
command: "codex",
resumeArgs: ["exec", "resume", "{sessionId}"],
} as CliBackendConfig,
"abc-123",
);
if (process.platform === "win32") {
expect(runExecMock).not.toHaveBeenCalled();
return;
}
// Only ps called — no kill because no matching ppid
expect(runExecMock).toHaveBeenCalledTimes(1);
expect(timeoutMs).toBe(42_000);
});
});

View File

@@ -6,20 +6,20 @@ import { resolveHeartbeatPrompt } from "../auto-reply/heartbeat.js";
import { shouldLogVerbose } from "../globals.js";
import { isTruthyEnvValue } from "../infra/env.js";
import { createSubsystemLogger } from "../logging/subsystem.js";
import { runCommandWithTimeout } from "../process/exec.js";
import { getProcessSupervisor } from "../process/supervisor/index.js";
import { resolveSessionAgentIds } from "./agent-scope.js";
import { makeBootstrapWarn, resolveBootstrapContextForRun } from "./bootstrap-files.js";
import { resolveCliBackendConfig } from "./cli-backends.js";
import {
appendImagePathsToPrompt,
buildCliSupervisorScopeKey,
buildCliArgs,
buildSystemPrompt,
cleanupResumeProcesses,
cleanupSuspendedCliProcesses,
enqueueCliRun,
normalizeCliModel,
parseCliJson,
parseCliJsonl,
resolveCliNoOutputTimeoutMs,
resolvePromptInput,
resolveSessionIdToSend,
resolveSystemPromptUsage,
@@ -226,19 +226,32 @@ export async function runCliAgent(params: {
}
return next;
})();
// Cleanup suspended processes that have accumulated (regardless of sessionId)
await cleanupSuspendedCliProcesses(backend);
if (useResume && cliSessionIdToSend) {
await cleanupResumeProcesses(backend, cliSessionIdToSend);
}
const result = await runCommandWithTimeout([backend.command, ...args], {
const noOutputTimeoutMs = resolveCliNoOutputTimeoutMs({
backend,
timeoutMs: params.timeoutMs,
useResume,
});
const supervisor = getProcessSupervisor();
const scopeKey = buildCliSupervisorScopeKey({
backend,
backendId: backendResolved.id,
cliSessionId: useResume ? cliSessionIdToSend : undefined,
});
const managedRun = await supervisor.spawn({
sessionId: params.sessionId,
backendId: backendResolved.id,
scopeKey,
replaceExistingScope: Boolean(useResume && scopeKey),
mode: "child",
argv: [backend.command, ...args],
timeoutMs: params.timeoutMs,
noOutputTimeoutMs,
cwd: workspaceDir,
env,
input: stdinPayload,
});
const result = await managedRun.wait();
const stdout = result.stdout.trim();
const stderr = result.stderr.trim();
@@ -259,7 +272,28 @@ export async function runCliAgent(params: {
}
}
if (result.code !== 0) {
if (result.exitCode !== 0 || result.reason !== "exit") {
if (result.reason === "no-output-timeout" || result.noOutputTimedOut) {
const timeoutReason = `CLI produced no output for ${Math.round(noOutputTimeoutMs / 1000)}s and was terminated.`;
log.warn(
`cli watchdog timeout: provider=${params.provider} model=${modelId} session=${cliSessionIdToSend ?? params.sessionId} noOutputTimeoutMs=${noOutputTimeoutMs} pid=${managedRun.pid ?? "unknown"}`,
);
throw new FailoverError(timeoutReason, {
reason: "timeout",
provider: params.provider,
model: modelId,
status: resolveFailoverStatus("timeout"),
});
}
if (result.reason === "overall-timeout") {
const timeoutReason = `CLI exceeded timeout (${Math.round(params.timeoutMs / 1000)}s) and was terminated.`;
throw new FailoverError(timeoutReason, {
reason: "timeout",
provider: params.provider,
model: modelId,
status: resolveFailoverStatus("timeout"),
});
}
const err = stderr || stdout || "CLI failed.";
const reason = classifyFailoverReason(err) ?? "unknown";
const status = resolveFailoverStatus(reason);

View File

@@ -8,232 +8,27 @@ import type { ThinkLevel } from "../../auto-reply/thinking.js";
import type { OpenClawConfig } from "../../config/config.js";
import type { CliBackendConfig } from "../../config/types.js";
import type { EmbeddedContextFile } from "../pi-embedded-helpers.js";
import { runExec } from "../../process/exec.js";
import { buildTtsSystemPromptHint } from "../../tts/tts.js";
import { escapeRegExp, isRecord } from "../../utils.js";
import { isRecord } from "../../utils.js";
import { buildModelAliasLines } from "../model-alias-lines.js";
import { resolveDefaultModelForAgent } from "../model-selection.js";
import { detectRuntimeShell } from "../shell-utils.js";
import { buildSystemPromptParams } from "../system-prompt-params.js";
import { buildAgentSystemPrompt } from "../system-prompt.js";
export { buildCliSupervisorScopeKey, resolveCliNoOutputTimeoutMs } from "./reliability.js";
const CLI_RUN_QUEUE = new Map<string, Promise<unknown>>();
/**
 * Builds a regex for scanning `ps` command lines for a command followed by its arguments.
 *
 * Every token must sit on whitespace (or string) boundaries so that substrings such as
 * `codexx` or `/path/to/codexx` do not match. The first token may also appear as the
 * last segment of a path. Tokens must appear in the given sequence, with anything
 * allowed in between.
 *
 * @param tokens - Command token followed by argument tokens; blank entries are dropped.
 * @returns The matcher, or a regex that never matches when no usable token remains.
 */
function buildLooseArgOrderRegex(tokens: string[]): RegExp {
  const usable: string[] = [];
  for (const raw of tokens) {
    const text = String(raw ?? "").trim();
    if (text) {
      usable.push(text);
    }
  }
  if (usable.length === 0) {
    return /$^/;
  }
  const fragments = usable.map((token, index) => {
    const escaped = escapeRegExp(token);
    // Only the command itself may be prefixed by a directory path.
    const body = index === 0 ? `(?:${escaped}|\\S+\\/${escaped})` : escaped;
    return `(?:^|\\s)${body}(?=\\s|$)`;
  });
  return new RegExp(fragments.join(".*"));
}
/**
 * Runs `ps` with the preferred arguments and retries once with an alternate
 * argument list if the first invocation fails for any reason.
 *
 * @param argsA - Preferred `ps` arguments.
 * @param argsB - Fallback `ps` arguments used when the first call fails.
 * @returns Stdout of whichever invocation succeeded; a failure of the fallback propagates.
 */
async function psWithFallback(argsA: string[], argsB: string[]): Promise<string> {
  const runPs = async (psArgs: string[]): Promise<string> => {
    const outcome = await runExec("ps", psArgs);
    return outcome.stdout;
  };
  try {
    return await runPs(argsA);
  } catch {
    // Preferred flags failed; try the alternate form.
    return runPs(argsB);
  }
}
/**
 * Best-effort termination of leftover "resume" CLI processes for one session.
 *
 * Lists processes via `ps`, keeps only direct children of the current process whose
 * command line matches the backend command plus its resume arguments (with
 * `{sessionId}` substituted), then sends SIGTERM followed by SIGKILL after a short grace
 * period. Does nothing on Windows, when the backend has no resume arguments containing
 * a `{sessionId}` placeholder, or when the backend command is empty.
 *
 * @param backend - CLI backend configuration (`command` and `resumeArgs` are read).
 * @param sessionId - Session id substituted for `{sessionId}` in the resume arguments.
 * @returns Resolves once the cleanup attempt finishes; errors are swallowed on purpose.
 */
export async function cleanupResumeProcesses(
  backend: CliBackendConfig,
  sessionId: string,
): Promise<void> {
  if (process.platform === "win32") {
    return;
  }
  const resumeArgs = backend.resumeArgs ?? [];
  // An empty list also fails this check, so no separate length test is needed.
  if (!resumeArgs.some((arg) => arg.includes("{sessionId}"))) {
    return;
  }
  const commandToken = path.basename(backend.command ?? "").trim();
  if (!commandToken) {
    return;
  }

  const resumeTokens = resumeArgs.map((arg) => arg.replaceAll("{sessionId}", sessionId));

  try {
    const stdout = await psWithFallback(
      ["-axww", "-o", "pid=,ppid=,command="],
      ["-ax", "-o", "pid=,ppid=,command="],
    );
    const patternRegex = buildLooseArgOrderRegex([commandToken, ...resumeTokens]);
    // Each row is "<pid> <ppid> <command line>"; built once instead of per line.
    const rowRegex = /^(\d+)\s+(\d+)\s+(.*)$/;
    const toKill: number[] = [];

    for (const line of stdout.split("\n")) {
      const trimmed = line.trim();
      if (!trimmed) {
        continue;
      }
      const match = rowRegex.exec(trimmed);
      if (!match) {
        continue;
      }
      const pid = Number(match[1]);
      const ppid = Number(match[2]);
      const cmd = match[3] ?? "";
      if (!Number.isFinite(pid)) {
        continue;
      }
      // Never touch processes that this process did not spawn.
      if (ppid !== process.pid) {
        continue;
      }
      if (!patternRegex.test(cmd)) {
        continue;
      }
      toKill.push(pid);
    }

    if (toKill.length > 0) {
      const pidArgs = toKill.map((pid) => String(pid));
      try {
        await runExec("kill", ["-TERM", ...pidArgs]);
      } catch {
        // ignore
      }
      // Short grace period before escalating to SIGKILL.
      await new Promise((resolve) => setTimeout(resolve, 250));
      try {
        await runExec("kill", ["-9", ...pidArgs]);
      } catch {
        // ignore
      }
    }
  } catch {
    // ignore errors - best effort cleanup
  }
}
/**
 * Builds regexes that recognise command lines of this backend carrying a session id.
 *
 * One matcher is produced for the session arguments (`sessionArgs` when they contain a
 * `{sessionId}` placeholder, otherwise `sessionArg` followed by a session id) and one
 * for `resumeArgs` when they contain the placeholder.
 *
 * @param backend - CLI backend configuration.
 * @returns Matchers to test against a `ps` command column; empty when the backend has
 *   no command or no session-bearing arguments.
 */
function buildSessionMatchers(backend: CliBackendConfig): RegExp[] {
  const commandToken = path.basename(backend.command ?? "").trim();
  if (!commandToken) {
    return [];
  }

  const hasPlaceholder = (args: string[]): boolean =>
    args.some((arg) => arg.includes("{sessionId}"));
  const toMatcher = (args: string[]): RegExp => {
    // The command must start at a whitespace boundary; each argument follows after whitespace.
    const head = `(?:^|\\s)${tokenToRegex(commandToken)}`;
    const tail = args.map((arg) => `\\s+${tokenToRegex(arg)}`).join("");
    return new RegExp(head + tail);
  };

  const sessionArg = backend.sessionArg?.trim();
  const sessionArgs = backend.sessionArgs ?? [];
  const resumeArgs = backend.resumeArgs ?? [];

  const argLists: string[][] = [];
  if (hasPlaceholder(sessionArgs)) {
    argLists.push(sessionArgs);
  } else if (sessionArg) {
    argLists.push([sessionArg, "{sessionId}"]);
  }
  if (hasPlaceholder(resumeArgs)) {
    argLists.push(resumeArgs);
  }

  return argLists.map(toMatcher);
}
/**
 * Converts one argument token into a regex source fragment.
 *
 * Literal text is escaped; each `{sessionId}` placeholder becomes `\S+` so that any
 * non-whitespace session id matches.
 *
 * @param token - Argument token, possibly containing `{sessionId}`.
 * @returns Regex source for the token.
 */
function tokenToRegex(token: string): string {
  const placeholder = "{sessionId}";
  return token.includes(placeholder)
    ? token
        .split(placeholder)
        .map((piece) => escapeRegExp(piece))
        .join("\\S+")
    : escapeRegExp(token);
}
/**
 * Kills stopped CLI child processes once too many of them have piled up.
 *
 * Scans `ps` output for direct children of the current process that are in a stopped
 * state (`T` in the stat column) and whose command line matches one of the backend's
 * session matchers. The kill is issued only when the number of such processes is
 * strictly greater than `threshold`. No-op on Windows or when the backend yields no
 * matchers; all errors are swallowed because this is best-effort cleanup.
 *
 * @param backend - CLI backend configuration used to build the matchers.
 * @param threshold - Number of stopped processes tolerated before killing (default 10).
 */
export async function cleanupSuspendedCliProcesses(
  backend: CliBackendConfig,
  threshold = 10,
): Promise<void> {
  if (process.platform === "win32") {
    return;
  }
  const matchers = buildSessionMatchers(backend);
  if (matchers.length === 0) {
    return;
  }

  try {
    const psOutput = await psWithFallback(
      ["-axww", "-o", "pid=,ppid=,stat=,command="],
      ["-ax", "-o", "pid=,ppid=,stat=,command="],
    );
    // Each row is "<pid> <ppid> <stat> <command line>".
    const rowPattern = /^(\d+)\s+(\d+)\s+(\S+)\s+(.*)$/;
    const stoppedPids: string[] = [];

    for (const row of psOutput.split("\n")) {
      const fields = rowPattern.exec(row.trim());
      if (!fields) {
        continue;
      }
      const [, pidText, ppidText, stat = "", command = ""] = fields;
      const pid = Number(pidText);
      const isOwnChild = Number(ppidText) === process.pid;
      if (
        Number.isFinite(pid) &&
        isOwnChild &&
        stat.includes("T") &&
        matchers.some((matcher) => matcher.test(command))
      ) {
        stoppedPids.push(String(pid));
      }
    }

    if (stoppedPids.length > threshold) {
      // Verified locally: stopped (T) processes ignore SIGTERM, so use SIGKILL.
      await runExec("kill", ["-9", ...stoppedPids]);
    }
  } catch {
    // ignore errors - best effort cleanup
  }
}
export function enqueueCliRun<T>(key: string, task: () => Promise<T>): Promise<T> {
const prior = CLI_RUN_QUEUE.get(key) ?? Promise.resolve();
const chained = prior.catch(() => undefined).then(task);
const tracked = chained.finally(() => {
if (CLI_RUN_QUEUE.get(key) === tracked) {
CLI_RUN_QUEUE.delete(key);
}
});
// Keep queue continuity even when a run rejects, without emitting unhandled rejections.
const tracked = chained
.catch(() => undefined)
.finally(() => {
if (CLI_RUN_QUEUE.get(key) === tracked) {
CLI_RUN_QUEUE.delete(key);
}
});
CLI_RUN_QUEUE.set(key, tracked);
return chained;
}

View File

@@ -0,0 +1,88 @@
import path from "node:path";
import type { CliBackendConfig } from "../../config/types.js";
import {
CLI_FRESH_WATCHDOG_DEFAULTS,
CLI_RESUME_WATCHDOG_DEFAULTS,
CLI_WATCHDOG_MIN_TIMEOUT_MS,
} from "../cli-watchdog-defaults.js";
/**
 * Resolves the effective no-output watchdog settings for a fresh or resumed CLI run.
 *
 * Values configured under `backend.reliability.watchdog.{fresh,resume}` override the
 * built-in defaults when they are finite numbers. The ratio is clamped to
 * [0.05, 0.95]; millisecond values are floored and raised to at least
 * `CLI_WATCHDOG_MIN_TIMEOUT_MS`. If the resulting min exceeds the max, the two are
 * swapped so that `minMs <= maxMs` always holds.
 *
 * @param backend - CLI backend configuration.
 * @param useResume - Selects the resume profile when true, the fresh profile otherwise.
 * @returns The resolved profile; `noOutputTimeoutMs` is undefined unless explicitly configured.
 */
function pickWatchdogProfile(
  backend: CliBackendConfig,
  useResume: boolean,
): {
  noOutputTimeoutMs?: number;
  noOutputTimeoutRatio: number;
  minMs: number;
  maxMs: number;
} {
  const isFiniteNumber = (value: unknown): value is number =>
    typeof value === "number" && Number.isFinite(value);
  const toBoundedMs = (value: number): number =>
    Math.max(CLI_WATCHDOG_MIN_TIMEOUT_MS, Math.floor(value));

  const watchdog = backend.reliability?.watchdog;
  const configured = useResume ? watchdog?.resume : watchdog?.fresh;
  const defaults = useResume ? CLI_RESUME_WATCHDOG_DEFAULTS : CLI_FRESH_WATCHDOG_DEFAULTS;

  const rawRatio = configured?.noOutputTimeoutRatio;
  const rawMin = configured?.minMs;
  const rawMax = configured?.maxMs;
  const rawAbsolute = configured?.noOutputTimeoutMs;

  const ratio = isFiniteNumber(rawRatio)
    ? Math.min(0.95, Math.max(0.05, rawRatio))
    : defaults.noOutputTimeoutRatio;
  const lower = isFiniteNumber(rawMin) ? toBoundedMs(rawMin) : defaults.minMs;
  const upper = isFiniteNumber(rawMax) ? toBoundedMs(rawMax) : defaults.maxMs;

  return {
    noOutputTimeoutMs: isFiniteNumber(rawAbsolute) ? toBoundedMs(rawAbsolute) : undefined,
    noOutputTimeoutRatio: ratio,
    // Tolerate an inverted min/max pair by ordering the two values.
    minMs: Math.min(lower, upper),
    maxMs: Math.max(lower, upper),
  };
}
/**
 * Computes how long a CLI run may stay silent before the watchdog terminates it.
 *
 * An explicitly configured `noOutputTimeoutMs` wins. Otherwise the value is
 * `timeoutMs * ratio`, floored and clamped to the profile's `[minMs, maxMs]`. Either way
 * the result is capped at `timeoutMs - 1000`, and that cap itself is never lower than
 * `CLI_WATCHDOG_MIN_TIMEOUT_MS`.
 *
 * @param params.backend - CLI backend configuration.
 * @param params.timeoutMs - Overall run timeout in milliseconds.
 * @param params.useResume - Whether the run resumes an existing CLI session.
 * @returns The no-output timeout in milliseconds.
 */
export function resolveCliNoOutputTimeoutMs(params: {
  backend: CliBackendConfig;
  timeoutMs: number;
  useResume: boolean;
}): number {
  const { backend, timeoutMs, useResume } = params;
  const { noOutputTimeoutMs, noOutputTimeoutRatio, minMs, maxMs } = pickWatchdogProfile(
    backend,
    useResume,
  );
  // Keep watchdog below global timeout in normal cases.
  const ceiling = Math.max(CLI_WATCHDOG_MIN_TIMEOUT_MS, timeoutMs - 1_000);

  let candidate: number;
  if (noOutputTimeoutMs === undefined) {
    const scaled = Math.floor(timeoutMs * noOutputTimeoutRatio);
    candidate = Math.min(maxMs, Math.max(minMs, scaled));
  } else {
    candidate = noOutputTimeoutMs;
  }
  return Math.min(candidate, ceiling);
}
/**
 * Builds the supervisor scope key that identifies runs of one CLI session.
 *
 * The key has the form `cli:<backendId>:<command basename>:<cliSessionId>`. The
 * backend id and command basename are trimmed and lower-cased; the session id is
 * trimmed only.
 *
 * @param params.backend - CLI backend configuration; only `command` is read.
 * @param params.backendId - Identifier of the resolved backend.
 * @param params.cliSessionId - CLI session id, if any.
 * @returns The scope key, or `undefined` when the session id is missing or blank.
 */
export function buildCliSupervisorScopeKey(params: {
  backend: CliBackendConfig;
  backendId: string;
  cliSessionId?: string;
}): string | undefined {
  const normalize = (text: string): string => text.trim().toLowerCase();

  const commandToken = normalize(path.basename(params.backend.command ?? ""));
  const backendToken = normalize(params.backendId);
  const sessionToken = params.cliSessionId?.trim();

  return sessionToken
    ? ["cli", backendToken, commandToken, sessionToken].join(":")
    : undefined;
}

View File

@@ -0,0 +1,13 @@
/**
 * Lower bound in milliseconds for watchdog timeouts: configured `minMs`, `maxMs` and
 * `noOutputTimeoutMs` values are raised to at least this, as is the cap derived from
 * the overall run timeout.
 */
export const CLI_WATCHDOG_MIN_TIMEOUT_MS = 1_000;
/**
 * Default no-output watchdog profile for runs that do not resume a CLI session.
 * The ratio is multiplied by the overall timeout; the result is clamped to
 * `[minMs, maxMs]` (both in milliseconds).
 */
export const CLI_FRESH_WATCHDOG_DEFAULTS = {
noOutputTimeoutRatio: 0.8,
minMs: 180_000,
maxMs: 600_000,
} as const;
/**
 * Default no-output watchdog profile for runs that resume an existing CLI session.
 * The ratio is multiplied by the overall timeout; the result is clamped to
 * `[minMs, maxMs]` (both in milliseconds).
 */
export const CLI_RESUME_WATCHDOG_DEFAULTS = {
noOutputTimeoutRatio: 0.3,
minMs: 60_000,
maxMs: 180_000,
} as const;