mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-05 06:40:24 +00:00
fix: codex and similar processes keep dying on pty, solved by refactoring process spawning (#14257)
* exec: clean up PTY resources on timeout and exit
* cli: harden resume cleanup and watchdog stalled runs
* cli: productionize PTY and resume reliability paths
* docs: add PTY process supervision architecture plan
* docs: rewrite PTY supervision plan as pre-rewrite baseline
* docs: switch PTY supervision plan to one-go execution
* docs: add one-line root cause to PTY supervision plan
* docs: add OS contracts and test matrix to PTY supervision plan
* docs: define process-supervisor package placement and scope
* docs: tie supervisor plan to existing CI lanes
* docs: place PTY supervisor plan under src/process
* refactor(process): route exec and cli runs through supervisor
* docs(process): refresh PTY supervision plan
* wip
* fix(process): harden supervisor timeout and PTY termination
* fix(process): harden supervisor adapters env and wait handling
* ci: avoid failing formal conformance on comment permissions
* test(ui): fix cron request mock argument typing
* fix(ui): remove leftover conflict marker
* fix: supervise PTY processes (#14257) (openclaw#14257) (thanks @onutc)
This commit is contained in:
@@ -1,17 +1,17 @@
|
||||
import type { AgentToolResult } from "@mariozechner/pi-agent-core";
|
||||
import type { ChildProcessWithoutNullStreams } from "node:child_process";
|
||||
import { Type } from "@sinclair/typebox";
|
||||
import path from "node:path";
|
||||
import type { ExecAsk, ExecHost, ExecSecurity } from "../infra/exec-approvals.js";
|
||||
import type { ProcessSession, SessionStdin } from "./bash-process-registry.js";
|
||||
import type { ProcessSession } from "./bash-process-registry.js";
|
||||
import type { ExecToolDetails } from "./bash-tools.exec.js";
|
||||
import type { BashSandboxConfig } from "./bash-tools.shared.js";
|
||||
import { requestHeartbeatNow } from "../infra/heartbeat-wake.js";
|
||||
import { mergePathPrepend } from "../infra/path-prepend.js";
|
||||
import { enqueueSystemEvent } from "../infra/system-events.js";
|
||||
export { applyPathPrepend, normalizePathPrepend } from "../infra/path-prepend.js";
|
||||
import type { ManagedRun } from "../process/supervisor/index.js";
|
||||
import { logWarn } from "../logger.js";
|
||||
import { formatSpawnError, spawnWithFallback } from "../process/spawn-utils.js";
|
||||
import { getProcessSupervisor } from "../process/supervisor/index.js";
|
||||
import {
|
||||
addSession,
|
||||
appendOutput,
|
||||
@@ -23,7 +23,6 @@ import {
|
||||
buildDockerExecArgs,
|
||||
chunkString,
|
||||
clampWithDefault,
|
||||
killSession,
|
||||
readEnvInt,
|
||||
} from "./bash-tools.shared.js";
|
||||
import { buildCursorPositionResponse, stripDsrRequests } from "./pty-dsr.js";
|
||||
@@ -147,26 +146,6 @@ export const execSchema = Type.Object({
|
||||
),
|
||||
});
|
||||
|
||||
type PtyExitEvent = { exitCode: number; signal?: number };
|
||||
type PtyListener<T> = (event: T) => void;
|
||||
type PtyHandle = {
|
||||
pid: number;
|
||||
write: (data: string | Buffer) => void;
|
||||
onData: (listener: PtyListener<string>) => void;
|
||||
onExit: (listener: PtyListener<PtyExitEvent>) => void;
|
||||
};
|
||||
type PtySpawn = (
|
||||
file: string,
|
||||
args: string[] | string,
|
||||
options: {
|
||||
name?: string;
|
||||
cols?: number;
|
||||
rows?: number;
|
||||
cwd?: string;
|
||||
env?: Record<string, string>;
|
||||
},
|
||||
) => PtyHandle;
|
||||
|
||||
export type ExecProcessOutcome = {
|
||||
status: "completed" | "failed";
|
||||
exitCode: number | null;
|
||||
@@ -319,138 +298,10 @@ export async function runExecProcess(opts: {
|
||||
}): Promise<ExecProcessHandle> {
|
||||
const startedAt = Date.now();
|
||||
const sessionId = createSessionSlug();
|
||||
let child: ChildProcessWithoutNullStreams | null = null;
|
||||
let pty: PtyHandle | null = null;
|
||||
let stdin: SessionStdin | undefined;
|
||||
const execCommand = opts.execCommand ?? opts.command;
|
||||
const supervisor = getProcessSupervisor();
|
||||
|
||||
const spawnFallbacks = [
|
||||
{
|
||||
label: "no-detach",
|
||||
options: { detached: false },
|
||||
},
|
||||
];
|
||||
|
||||
const handleSpawnFallback = (err: unknown, fallback: { label: string }) => {
|
||||
const errText = formatSpawnError(err);
|
||||
const warning = `Warning: spawn failed (${errText}); retrying with ${fallback.label}.`;
|
||||
logWarn(`exec: spawn failed (${errText}); retrying with ${fallback.label}.`);
|
||||
opts.warnings.push(warning);
|
||||
};
|
||||
|
||||
const spawnShellChild = async (
|
||||
shell: string,
|
||||
shellArgs: string[],
|
||||
): Promise<ChildProcessWithoutNullStreams> => {
|
||||
const { child: spawned } = await spawnWithFallback({
|
||||
argv: [shell, ...shellArgs, execCommand],
|
||||
options: {
|
||||
cwd: opts.workdir,
|
||||
env: opts.env,
|
||||
detached: process.platform !== "win32",
|
||||
stdio: ["pipe", "pipe", "pipe"],
|
||||
windowsHide: true,
|
||||
},
|
||||
fallbacks: spawnFallbacks,
|
||||
onFallback: handleSpawnFallback,
|
||||
});
|
||||
return spawned as ChildProcessWithoutNullStreams;
|
||||
};
|
||||
|
||||
// `exec` does not currently accept tool-provided stdin content. For non-PTY runs,
|
||||
// keeping stdin open can cause commands like `wc -l` (or safeBins-hardened segments)
|
||||
// to block forever waiting for input, leading to accidental backgrounding.
|
||||
// For interactive flows, callers should use `pty: true` (stdin kept open).
|
||||
const maybeCloseNonPtyStdin = () => {
|
||||
if (opts.usePty) {
|
||||
return;
|
||||
}
|
||||
try {
|
||||
// Signal EOF immediately so stdin-only commands can terminate.
|
||||
child?.stdin?.end();
|
||||
} catch {
|
||||
// ignore stdin close errors
|
||||
}
|
||||
};
|
||||
|
||||
if (opts.sandbox) {
|
||||
const { child: spawned } = await spawnWithFallback({
|
||||
argv: [
|
||||
"docker",
|
||||
...buildDockerExecArgs({
|
||||
containerName: opts.sandbox.containerName,
|
||||
command: execCommand,
|
||||
workdir: opts.containerWorkdir ?? opts.sandbox.containerWorkdir,
|
||||
env: opts.env,
|
||||
tty: opts.usePty,
|
||||
}),
|
||||
],
|
||||
options: {
|
||||
cwd: opts.workdir,
|
||||
env: process.env,
|
||||
detached: process.platform !== "win32",
|
||||
stdio: ["pipe", "pipe", "pipe"],
|
||||
windowsHide: true,
|
||||
},
|
||||
fallbacks: spawnFallbacks,
|
||||
onFallback: handleSpawnFallback,
|
||||
});
|
||||
child = spawned as ChildProcessWithoutNullStreams;
|
||||
stdin = child.stdin;
|
||||
maybeCloseNonPtyStdin();
|
||||
} else if (opts.usePty) {
|
||||
const { shell, args: shellArgs } = getShellConfig();
|
||||
try {
|
||||
const ptyModule = (await import("@lydell/node-pty")) as unknown as {
|
||||
spawn?: PtySpawn;
|
||||
default?: { spawn?: PtySpawn };
|
||||
};
|
||||
const spawnPty = ptyModule.spawn ?? ptyModule.default?.spawn;
|
||||
if (!spawnPty) {
|
||||
throw new Error("PTY support is unavailable (node-pty spawn not found).");
|
||||
}
|
||||
pty = spawnPty(shell, [...shellArgs, execCommand], {
|
||||
cwd: opts.workdir,
|
||||
env: opts.env,
|
||||
name: process.env.TERM ?? "xterm-256color",
|
||||
cols: 120,
|
||||
rows: 30,
|
||||
});
|
||||
stdin = {
|
||||
destroyed: false,
|
||||
write: (data, cb) => {
|
||||
try {
|
||||
pty?.write(data);
|
||||
cb?.(null);
|
||||
} catch (err) {
|
||||
cb?.(err as Error);
|
||||
}
|
||||
},
|
||||
end: () => {
|
||||
try {
|
||||
const eof = process.platform === "win32" ? "\x1a" : "\x04";
|
||||
pty?.write(eof);
|
||||
} catch {
|
||||
// ignore EOF errors
|
||||
}
|
||||
},
|
||||
};
|
||||
} catch (err) {
|
||||
const errText = String(err);
|
||||
const warning = `Warning: PTY spawn failed (${errText}); retrying without PTY for \`${opts.command}\`.`;
|
||||
logWarn(`exec: PTY spawn failed (${errText}); retrying without PTY for "${opts.command}".`);
|
||||
opts.warnings.push(warning);
|
||||
child = await spawnShellChild(shell, shellArgs);
|
||||
stdin = child.stdin;
|
||||
}
|
||||
} else {
|
||||
const { shell, args: shellArgs } = getShellConfig();
|
||||
child = await spawnShellChild(shell, shellArgs);
|
||||
stdin = child.stdin;
|
||||
maybeCloseNonPtyStdin();
|
||||
}
|
||||
|
||||
const session = {
|
||||
const session: ProcessSession = {
|
||||
id: sessionId,
|
||||
command: opts.command,
|
||||
scopeKey: opts.scopeKey,
|
||||
@@ -458,9 +309,9 @@ export async function runExecProcess(opts: {
|
||||
notifyOnExit: opts.notifyOnExit,
|
||||
notifyOnExitEmptySuccess: opts.notifyOnExitEmptySuccess === true,
|
||||
exitNotified: false,
|
||||
child: child ?? undefined,
|
||||
stdin,
|
||||
pid: child?.pid ?? pty?.pid,
|
||||
child: undefined,
|
||||
stdin: undefined,
|
||||
pid: undefined,
|
||||
startedAt,
|
||||
cwd: opts.workdir,
|
||||
maxOutputChars: opts.maxOutput,
|
||||
@@ -477,59 +328,9 @@ export async function runExecProcess(opts: {
|
||||
exitSignal: undefined as NodeJS.Signals | number | null | undefined,
|
||||
truncated: false,
|
||||
backgrounded: false,
|
||||
} satisfies ProcessSession;
|
||||
};
|
||||
addSession(session);
|
||||
|
||||
let settled = false;
|
||||
let timeoutTimer: NodeJS.Timeout | null = null;
|
||||
let timeoutFinalizeTimer: NodeJS.Timeout | null = null;
|
||||
let timedOut = false;
|
||||
const timeoutFinalizeMs = 1000;
|
||||
let resolveFn: ((outcome: ExecProcessOutcome) => void) | null = null;
|
||||
|
||||
const settle = (outcome: ExecProcessOutcome) => {
|
||||
if (settled) {
|
||||
return;
|
||||
}
|
||||
settled = true;
|
||||
resolveFn?.(outcome);
|
||||
};
|
||||
|
||||
const finalizeTimeout = () => {
|
||||
if (session.exited) {
|
||||
return;
|
||||
}
|
||||
markExited(session, null, "SIGKILL", "failed");
|
||||
maybeNotifyOnExit(session, "failed");
|
||||
const aggregated = session.aggregated.trim();
|
||||
const reason = `Command timed out after ${opts.timeoutSec} seconds`;
|
||||
settle({
|
||||
status: "failed",
|
||||
exitCode: null,
|
||||
exitSignal: "SIGKILL",
|
||||
durationMs: Date.now() - startedAt,
|
||||
aggregated,
|
||||
timedOut: true,
|
||||
reason: aggregated ? `${aggregated}\n\n${reason}` : reason,
|
||||
});
|
||||
};
|
||||
|
||||
const onTimeout = () => {
|
||||
timedOut = true;
|
||||
killSession(session);
|
||||
if (!timeoutFinalizeTimer) {
|
||||
timeoutFinalizeTimer = setTimeout(() => {
|
||||
finalizeTimeout();
|
||||
}, timeoutFinalizeMs);
|
||||
}
|
||||
};
|
||||
|
||||
if (opts.timeoutSec > 0) {
|
||||
timeoutTimer = setTimeout(() => {
|
||||
onTimeout();
|
||||
}, opts.timeoutSec * 1000);
|
||||
}
|
||||
|
||||
const emitUpdate = () => {
|
||||
if (!opts.onUpdate) {
|
||||
return;
|
||||
@@ -565,116 +366,208 @@ export async function runExecProcess(opts: {
|
||||
}
|
||||
};
|
||||
|
||||
if (pty) {
|
||||
const cursorResponse = buildCursorPositionResponse();
|
||||
pty.onData((data) => {
|
||||
const raw = data.toString();
|
||||
const { cleaned, requests } = stripDsrRequests(raw);
|
||||
if (requests > 0) {
|
||||
const timeoutMs =
|
||||
typeof opts.timeoutSec === "number" && opts.timeoutSec > 0
|
||||
? Math.floor(opts.timeoutSec * 1000)
|
||||
: undefined;
|
||||
|
||||
const spawnSpec:
|
||||
| {
|
||||
mode: "child";
|
||||
argv: string[];
|
||||
env: NodeJS.ProcessEnv;
|
||||
stdinMode: "pipe-open" | "pipe-closed";
|
||||
}
|
||||
| {
|
||||
mode: "pty";
|
||||
ptyCommand: string;
|
||||
childFallbackArgv: string[];
|
||||
env: NodeJS.ProcessEnv;
|
||||
stdinMode: "pipe-open";
|
||||
} = (() => {
|
||||
if (opts.sandbox) {
|
||||
return {
|
||||
mode: "child" as const,
|
||||
argv: [
|
||||
"docker",
|
||||
...buildDockerExecArgs({
|
||||
containerName: opts.sandbox.containerName,
|
||||
command: execCommand,
|
||||
workdir: opts.containerWorkdir ?? opts.sandbox.containerWorkdir,
|
||||
env: opts.env,
|
||||
tty: opts.usePty,
|
||||
}),
|
||||
],
|
||||
env: process.env,
|
||||
stdinMode: opts.usePty ? ("pipe-open" as const) : ("pipe-closed" as const),
|
||||
};
|
||||
}
|
||||
const { shell, args: shellArgs } = getShellConfig();
|
||||
const childArgv = [shell, ...shellArgs, execCommand];
|
||||
if (opts.usePty) {
|
||||
return {
|
||||
mode: "pty" as const,
|
||||
ptyCommand: execCommand,
|
||||
childFallbackArgv: childArgv,
|
||||
env: opts.env,
|
||||
stdinMode: "pipe-open" as const,
|
||||
};
|
||||
}
|
||||
return {
|
||||
mode: "child" as const,
|
||||
argv: childArgv,
|
||||
env: opts.env,
|
||||
stdinMode: "pipe-closed" as const,
|
||||
};
|
||||
})();
|
||||
|
||||
let managedRun: ManagedRun | null = null;
|
||||
let usingPty = spawnSpec.mode === "pty";
|
||||
const cursorResponse = buildCursorPositionResponse();
|
||||
|
||||
const onSupervisorStdout = (chunk: string) => {
|
||||
if (usingPty) {
|
||||
const { cleaned, requests } = stripDsrRequests(chunk);
|
||||
if (requests > 0 && managedRun?.stdin) {
|
||||
for (let i = 0; i < requests; i += 1) {
|
||||
pty.write(cursorResponse);
|
||||
managedRun.stdin.write(cursorResponse);
|
||||
}
|
||||
}
|
||||
handleStdout(cleaned);
|
||||
});
|
||||
} else if (child) {
|
||||
child.stdout.on("data", handleStdout);
|
||||
child.stderr.on("data", handleStderr);
|
||||
}
|
||||
return;
|
||||
}
|
||||
handleStdout(chunk);
|
||||
};
|
||||
|
||||
const promise = new Promise<ExecProcessOutcome>((resolve) => {
|
||||
resolveFn = resolve;
|
||||
const handleExit = (code: number | null, exitSignal: NodeJS.Signals | number | null) => {
|
||||
if (timeoutTimer) {
|
||||
clearTimeout(timeoutTimer);
|
||||
}
|
||||
if (timeoutFinalizeTimer) {
|
||||
clearTimeout(timeoutFinalizeTimer);
|
||||
try {
|
||||
const spawnBase = {
|
||||
runId: sessionId,
|
||||
sessionId: opts.sessionKey?.trim() || sessionId,
|
||||
backendId: opts.sandbox ? "exec-sandbox" : "exec-host",
|
||||
scopeKey: opts.scopeKey,
|
||||
cwd: opts.workdir,
|
||||
env: spawnSpec.env,
|
||||
timeoutMs,
|
||||
captureOutput: false,
|
||||
onStdout: onSupervisorStdout,
|
||||
onStderr: handleStderr,
|
||||
};
|
||||
managedRun =
|
||||
spawnSpec.mode === "pty"
|
||||
? await supervisor.spawn({
|
||||
...spawnBase,
|
||||
mode: "pty",
|
||||
ptyCommand: spawnSpec.ptyCommand,
|
||||
})
|
||||
: await supervisor.spawn({
|
||||
...spawnBase,
|
||||
mode: "child",
|
||||
argv: spawnSpec.argv,
|
||||
stdinMode: spawnSpec.stdinMode,
|
||||
});
|
||||
} catch (err) {
|
||||
if (spawnSpec.mode === "pty") {
|
||||
const warning = `Warning: PTY spawn failed (${String(err)}); retrying without PTY for \`${opts.command}\`.`;
|
||||
logWarn(
|
||||
`exec: PTY spawn failed (${String(err)}); retrying without PTY for "${opts.command}".`,
|
||||
);
|
||||
opts.warnings.push(warning);
|
||||
usingPty = false;
|
||||
try {
|
||||
managedRun = await supervisor.spawn({
|
||||
runId: sessionId,
|
||||
sessionId: opts.sessionKey?.trim() || sessionId,
|
||||
backendId: "exec-host",
|
||||
scopeKey: opts.scopeKey,
|
||||
mode: "child",
|
||||
argv: spawnSpec.childFallbackArgv,
|
||||
cwd: opts.workdir,
|
||||
env: spawnSpec.env,
|
||||
stdinMode: "pipe-open",
|
||||
timeoutMs,
|
||||
captureOutput: false,
|
||||
onStdout: handleStdout,
|
||||
onStderr: handleStderr,
|
||||
});
|
||||
} catch (retryErr) {
|
||||
markExited(session, null, null, "failed");
|
||||
maybeNotifyOnExit(session, "failed");
|
||||
throw retryErr;
|
||||
}
|
||||
} else {
|
||||
markExited(session, null, null, "failed");
|
||||
maybeNotifyOnExit(session, "failed");
|
||||
throw err;
|
||||
}
|
||||
}
|
||||
session.stdin = managedRun.stdin;
|
||||
session.pid = managedRun.pid;
|
||||
|
||||
const promise = managedRun
|
||||
.wait()
|
||||
.then((exit): ExecProcessOutcome => {
|
||||
const durationMs = Date.now() - startedAt;
|
||||
const wasSignal = exitSignal != null;
|
||||
const isSuccess = code === 0 && !wasSignal && !timedOut;
|
||||
const status: "completed" | "failed" = isSuccess ? "completed" : "failed";
|
||||
markExited(session, code, exitSignal, status);
|
||||
const status: "completed" | "failed" =
|
||||
exit.exitCode === 0 && exit.reason === "exit" ? "completed" : "failed";
|
||||
markExited(session, exit.exitCode, exit.exitSignal, status);
|
||||
maybeNotifyOnExit(session, status);
|
||||
if (!session.child && session.stdin) {
|
||||
session.stdin.destroyed = true;
|
||||
}
|
||||
|
||||
if (settled) {
|
||||
return;
|
||||
}
|
||||
const aggregated = session.aggregated.trim();
|
||||
if (!isSuccess) {
|
||||
const reason = timedOut
|
||||
? `Command timed out after ${opts.timeoutSec} seconds`
|
||||
: wasSignal && exitSignal
|
||||
? `Command aborted by signal ${exitSignal}`
|
||||
: code === null
|
||||
? "Command aborted before exit code was captured"
|
||||
: `Command exited with code ${code}`;
|
||||
const message = aggregated ? `${aggregated}\n\n${reason}` : reason;
|
||||
settle({
|
||||
status: "failed",
|
||||
exitCode: code ?? null,
|
||||
exitSignal: exitSignal ?? null,
|
||||
if (status === "completed") {
|
||||
return {
|
||||
status: "completed",
|
||||
exitCode: exit.exitCode ?? 0,
|
||||
exitSignal: exit.exitSignal,
|
||||
durationMs,
|
||||
aggregated,
|
||||
timedOut,
|
||||
reason: message,
|
||||
});
|
||||
return;
|
||||
timedOut: false,
|
||||
};
|
||||
}
|
||||
settle({
|
||||
status: "completed",
|
||||
exitCode: code ?? 0,
|
||||
exitSignal: exitSignal ?? null,
|
||||
const reason =
|
||||
exit.reason === "overall-timeout"
|
||||
? `Command timed out after ${opts.timeoutSec} seconds`
|
||||
: exit.reason === "no-output-timeout"
|
||||
? "Command timed out waiting for output"
|
||||
: exit.exitSignal != null
|
||||
? `Command aborted by signal ${exit.exitSignal}`
|
||||
: exit.exitCode == null
|
||||
? "Command aborted before exit code was captured"
|
||||
: `Command exited with code ${exit.exitCode}`;
|
||||
return {
|
||||
status: "failed",
|
||||
exitCode: exit.exitCode,
|
||||
exitSignal: exit.exitSignal,
|
||||
durationMs,
|
||||
aggregated,
|
||||
timedOut: exit.timedOut,
|
||||
reason: aggregated ? `${aggregated}\n\n${reason}` : reason,
|
||||
};
|
||||
})
|
||||
.catch((err): ExecProcessOutcome => {
|
||||
markExited(session, null, null, "failed");
|
||||
maybeNotifyOnExit(session, "failed");
|
||||
const aggregated = session.aggregated.trim();
|
||||
const message = aggregated ? `${aggregated}\n\n${String(err)}` : String(err);
|
||||
return {
|
||||
status: "failed",
|
||||
exitCode: null,
|
||||
exitSignal: null,
|
||||
durationMs: Date.now() - startedAt,
|
||||
aggregated,
|
||||
timedOut: false,
|
||||
});
|
||||
};
|
||||
|
||||
if (pty) {
|
||||
pty.onExit((event) => {
|
||||
const rawSignal = event.signal ?? null;
|
||||
const normalizedSignal = rawSignal === 0 ? null : rawSignal;
|
||||
handleExit(event.exitCode ?? null, normalizedSignal);
|
||||
});
|
||||
} else if (child) {
|
||||
child.once("close", (code, exitSignal) => {
|
||||
handleExit(code, exitSignal);
|
||||
});
|
||||
|
||||
child.once("error", (err) => {
|
||||
if (timeoutTimer) {
|
||||
clearTimeout(timeoutTimer);
|
||||
}
|
||||
if (timeoutFinalizeTimer) {
|
||||
clearTimeout(timeoutFinalizeTimer);
|
||||
}
|
||||
markExited(session, null, null, "failed");
|
||||
maybeNotifyOnExit(session, "failed");
|
||||
const aggregated = session.aggregated.trim();
|
||||
const message = aggregated ? `${aggregated}\n\n${String(err)}` : String(err);
|
||||
settle({
|
||||
status: "failed",
|
||||
exitCode: null,
|
||||
exitSignal: null,
|
||||
durationMs: Date.now() - startedAt,
|
||||
aggregated,
|
||||
timedOut,
|
||||
reason: message,
|
||||
});
|
||||
});
|
||||
}
|
||||
});
|
||||
reason: message,
|
||||
};
|
||||
});
|
||||
|
||||
return {
|
||||
session,
|
||||
startedAt,
|
||||
pid: session.pid ?? undefined,
|
||||
promise,
|
||||
kill: () => killSession(session),
|
||||
kill: () => {
|
||||
managedRun?.cancel("manual-cancel");
|
||||
},
|
||||
};
|
||||
}
|
||||
|
||||
73
src/agents/bash-tools.exec.pty-cleanup.test.ts
Normal file
73
src/agents/bash-tools.exec.pty-cleanup.test.ts
Normal file
@@ -0,0 +1,73 @@
|
||||
import { afterEach, expect, test, vi } from "vitest";
|
||||
import { resetProcessRegistryForTests } from "./bash-process-registry";
|
||||
|
||||
afterEach(() => {
|
||||
resetProcessRegistryForTests();
|
||||
vi.resetModules();
|
||||
vi.clearAllMocks();
|
||||
});
|
||||
|
||||
test("exec disposes PTY listeners after normal exit", async () => {
|
||||
const disposeData = vi.fn();
|
||||
const disposeExit = vi.fn();
|
||||
|
||||
vi.doMock("@lydell/node-pty", () => ({
|
||||
spawn: () => {
|
||||
return {
|
||||
pid: 0,
|
||||
write: vi.fn(),
|
||||
onData: (listener: (value: string) => void) => {
|
||||
setTimeout(() => listener("ok"), 0);
|
||||
return { dispose: disposeData };
|
||||
},
|
||||
onExit: (listener: (event: { exitCode: number; signal?: number }) => void) => {
|
||||
setTimeout(() => listener({ exitCode: 0 }), 0);
|
||||
return { dispose: disposeExit };
|
||||
},
|
||||
kill: vi.fn(),
|
||||
};
|
||||
},
|
||||
}));
|
||||
|
||||
const { createExecTool } = await import("./bash-tools.exec");
|
||||
const tool = createExecTool({ allowBackground: false });
|
||||
const result = await tool.execute("toolcall", {
|
||||
command: "echo ok",
|
||||
pty: true,
|
||||
});
|
||||
|
||||
expect(result.details.status).toBe("completed");
|
||||
expect(disposeData).toHaveBeenCalledTimes(1);
|
||||
expect(disposeExit).toHaveBeenCalledTimes(1);
|
||||
});
|
||||
|
||||
test("exec tears down PTY resources on timeout", async () => {
|
||||
const disposeData = vi.fn();
|
||||
const disposeExit = vi.fn();
|
||||
const kill = vi.fn();
|
||||
|
||||
vi.doMock("@lydell/node-pty", () => ({
|
||||
spawn: () => {
|
||||
return {
|
||||
pid: 0,
|
||||
write: vi.fn(),
|
||||
onData: () => ({ dispose: disposeData }),
|
||||
onExit: () => ({ dispose: disposeExit }),
|
||||
kill,
|
||||
};
|
||||
},
|
||||
}));
|
||||
|
||||
const { createExecTool } = await import("./bash-tools.exec");
|
||||
const tool = createExecTool({ allowBackground: false });
|
||||
await expect(
|
||||
tool.execute("toolcall", {
|
||||
command: "sleep 5",
|
||||
pty: true,
|
||||
timeout: 0.01,
|
||||
}),
|
||||
).rejects.toThrow("Command timed out");
|
||||
expect(kill).toHaveBeenCalledTimes(1);
|
||||
expect(disposeData).toHaveBeenCalledTimes(1);
|
||||
expect(disposeExit).toHaveBeenCalledTimes(1);
|
||||
});
|
||||
40
src/agents/bash-tools.exec.pty-fallback-failure.test.ts
Normal file
40
src/agents/bash-tools.exec.pty-fallback-failure.test.ts
Normal file
@@ -0,0 +1,40 @@
|
||||
import { afterEach, expect, test, vi } from "vitest";
|
||||
import { listRunningSessions, resetProcessRegistryForTests } from "./bash-process-registry";
|
||||
|
||||
const { supervisorSpawnMock } = vi.hoisted(() => ({
|
||||
supervisorSpawnMock: vi.fn(),
|
||||
}));
|
||||
|
||||
vi.mock("../process/supervisor/index.js", () => ({
|
||||
getProcessSupervisor: () => ({
|
||||
spawn: (...args: unknown[]) => supervisorSpawnMock(...args),
|
||||
cancel: vi.fn(),
|
||||
cancelScope: vi.fn(),
|
||||
reconcileOrphans: vi.fn(),
|
||||
getRecord: vi.fn(),
|
||||
}),
|
||||
}));
|
||||
|
||||
afterEach(() => {
|
||||
resetProcessRegistryForTests();
|
||||
vi.resetModules();
|
||||
vi.clearAllMocks();
|
||||
});
|
||||
|
||||
test("exec cleans session state when PTY fallback spawn also fails", async () => {
|
||||
supervisorSpawnMock
|
||||
.mockRejectedValueOnce(new Error("pty spawn failed"))
|
||||
.mockRejectedValueOnce(new Error("child fallback failed"));
|
||||
|
||||
const { createExecTool } = await import("./bash-tools.exec");
|
||||
const tool = createExecTool({ allowBackground: false });
|
||||
|
||||
await expect(
|
||||
tool.execute("toolcall", {
|
||||
command: "echo ok",
|
||||
pty: true,
|
||||
}),
|
||||
).rejects.toThrow("child fallback failed");
|
||||
|
||||
expect(listRunningSessions()).toHaveLength(0);
|
||||
});
|
||||
152
src/agents/bash-tools.process.supervisor.test.ts
Normal file
152
src/agents/bash-tools.process.supervisor.test.ts
Normal file
@@ -0,0 +1,152 @@
|
||||
import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
|
||||
import type { ProcessSession } from "./bash-process-registry.js";
|
||||
import {
|
||||
addSession,
|
||||
getFinishedSession,
|
||||
getSession,
|
||||
resetProcessRegistryForTests,
|
||||
} from "./bash-process-registry.js";
|
||||
import { createProcessTool } from "./bash-tools.process.js";
|
||||
|
||||
const { supervisorMock } = vi.hoisted(() => ({
|
||||
supervisorMock: {
|
||||
spawn: vi.fn(),
|
||||
cancel: vi.fn(),
|
||||
cancelScope: vi.fn(),
|
||||
reconcileOrphans: vi.fn(),
|
||||
getRecord: vi.fn(),
|
||||
},
|
||||
}));
|
||||
|
||||
const { killProcessTreeMock } = vi.hoisted(() => ({
|
||||
killProcessTreeMock: vi.fn(),
|
||||
}));
|
||||
|
||||
vi.mock("../process/supervisor/index.js", () => ({
|
||||
getProcessSupervisor: () => supervisorMock,
|
||||
}));
|
||||
|
||||
vi.mock("../process/kill-tree.js", () => ({
|
||||
killProcessTree: (...args: unknown[]) => killProcessTreeMock(...args),
|
||||
}));
|
||||
|
||||
function createBackgroundSession(id: string, pid?: number): ProcessSession {
|
||||
return {
|
||||
id,
|
||||
command: "sleep 999",
|
||||
startedAt: Date.now(),
|
||||
cwd: "/tmp",
|
||||
maxOutputChars: 10_000,
|
||||
pendingMaxOutputChars: 30_000,
|
||||
totalOutputChars: 0,
|
||||
pendingStdout: [],
|
||||
pendingStderr: [],
|
||||
pendingStdoutChars: 0,
|
||||
pendingStderrChars: 0,
|
||||
aggregated: "",
|
||||
tail: "",
|
||||
pid,
|
||||
exited: false,
|
||||
exitCode: undefined,
|
||||
exitSignal: undefined,
|
||||
truncated: false,
|
||||
backgrounded: true,
|
||||
};
|
||||
}
|
||||
|
||||
describe("process tool supervisor cancellation", () => {
|
||||
beforeEach(() => {
|
||||
supervisorMock.spawn.mockReset();
|
||||
supervisorMock.cancel.mockReset();
|
||||
supervisorMock.cancelScope.mockReset();
|
||||
supervisorMock.reconcileOrphans.mockReset();
|
||||
supervisorMock.getRecord.mockReset();
|
||||
killProcessTreeMock.mockReset();
|
||||
});
|
||||
|
||||
afterEach(() => {
|
||||
resetProcessRegistryForTests();
|
||||
});
|
||||
|
||||
it("routes kill through supervisor when run is managed", async () => {
|
||||
supervisorMock.getRecord.mockReturnValue({
|
||||
runId: "sess",
|
||||
state: "running",
|
||||
});
|
||||
addSession(createBackgroundSession("sess"));
|
||||
const processTool = createProcessTool();
|
||||
|
||||
const result = await processTool.execute("toolcall", {
|
||||
action: "kill",
|
||||
sessionId: "sess",
|
||||
});
|
||||
|
||||
expect(supervisorMock.cancel).toHaveBeenCalledWith("sess", "manual-cancel");
|
||||
expect(getSession("sess")).toBeDefined();
|
||||
expect(getSession("sess")?.exited).toBe(false);
|
||||
expect(result.content[0]).toMatchObject({
|
||||
type: "text",
|
||||
text: "Termination requested for session sess.",
|
||||
});
|
||||
});
|
||||
|
||||
it("remove drops running session immediately when cancellation is requested", async () => {
|
||||
supervisorMock.getRecord.mockReturnValue({
|
||||
runId: "sess",
|
||||
state: "running",
|
||||
});
|
||||
addSession(createBackgroundSession("sess"));
|
||||
const processTool = createProcessTool();
|
||||
|
||||
const result = await processTool.execute("toolcall", {
|
||||
action: "remove",
|
||||
sessionId: "sess",
|
||||
});
|
||||
|
||||
expect(supervisorMock.cancel).toHaveBeenCalledWith("sess", "manual-cancel");
|
||||
expect(getSession("sess")).toBeUndefined();
|
||||
expect(getFinishedSession("sess")).toBeUndefined();
|
||||
expect(result.content[0]).toMatchObject({
|
||||
type: "text",
|
||||
text: "Removed session sess (termination requested).",
|
||||
});
|
||||
});
|
||||
|
||||
it("falls back to process-tree kill when supervisor record is missing", async () => {
|
||||
supervisorMock.getRecord.mockReturnValue(undefined);
|
||||
addSession(createBackgroundSession("sess-fallback", 4242));
|
||||
const processTool = createProcessTool();
|
||||
|
||||
const result = await processTool.execute("toolcall", {
|
||||
action: "kill",
|
||||
sessionId: "sess-fallback",
|
||||
});
|
||||
|
||||
expect(killProcessTreeMock).toHaveBeenCalledWith(4242);
|
||||
expect(getSession("sess-fallback")).toBeUndefined();
|
||||
expect(getFinishedSession("sess-fallback")).toBeDefined();
|
||||
expect(result.content[0]).toMatchObject({
|
||||
type: "text",
|
||||
text: "Killed session sess-fallback.",
|
||||
});
|
||||
});
|
||||
|
||||
it("fails remove when no supervisor record and no pid is available", async () => {
|
||||
supervisorMock.getRecord.mockReturnValue(undefined);
|
||||
addSession(createBackgroundSession("sess-no-pid"));
|
||||
const processTool = createProcessTool();
|
||||
|
||||
const result = await processTool.execute("toolcall", {
|
||||
action: "remove",
|
||||
sessionId: "sess-no-pid",
|
||||
});
|
||||
|
||||
expect(killProcessTreeMock).not.toHaveBeenCalled();
|
||||
expect(getSession("sess-no-pid")).toBeDefined();
|
||||
expect(result.details).toMatchObject({ status: "failed" });
|
||||
expect(result.content[0]).toMatchObject({
|
||||
type: "text",
|
||||
text: "Unable to remove session sess-no-pid: no active supervisor run or process id.",
|
||||
});
|
||||
});
|
||||
});
|
||||
@@ -1,7 +1,10 @@
|
||||
import type { AgentTool, AgentToolResult } from "@mariozechner/pi-agent-core";
|
||||
import { Type } from "@sinclair/typebox";
|
||||
import { formatDurationCompact } from "../infra/format-time/format-duration.ts";
|
||||
import { killProcessTree } from "../process/kill-tree.js";
|
||||
import { getProcessSupervisor } from "../process/supervisor/index.js";
|
||||
import {
|
||||
type ProcessSession,
|
||||
deleteSession,
|
||||
drainSession,
|
||||
getFinishedSession,
|
||||
@@ -11,13 +14,7 @@ import {
|
||||
markExited,
|
||||
setJobTtlMs,
|
||||
} from "./bash-process-registry.js";
|
||||
import {
|
||||
deriveSessionName,
|
||||
killSession,
|
||||
pad,
|
||||
sliceLogLines,
|
||||
truncateMiddle,
|
||||
} from "./bash-tools.shared.js";
|
||||
import { deriveSessionName, pad, sliceLogLines, truncateMiddle } from "./bash-tools.shared.js";
|
||||
import { encodeKeySequence, encodePaste } from "./pty-keys.js";
|
||||
|
||||
export type ProcessToolDefaults = {
|
||||
@@ -107,9 +104,28 @@ export function createProcessTool(
|
||||
setJobTtlMs(defaults.cleanupMs);
|
||||
}
|
||||
const scopeKey = defaults?.scopeKey;
|
||||
const supervisor = getProcessSupervisor();
|
||||
const isInScope = (session?: { scopeKey?: string } | null) =>
|
||||
!scopeKey || session?.scopeKey === scopeKey;
|
||||
|
||||
const cancelManagedSession = (sessionId: string) => {
|
||||
const record = supervisor.getRecord(sessionId);
|
||||
if (!record || record.state === "exited") {
|
||||
return false;
|
||||
}
|
||||
supervisor.cancel(sessionId, "manual-cancel");
|
||||
return true;
|
||||
};
|
||||
|
||||
const terminateSessionFallback = (session: ProcessSession) => {
|
||||
const pid = session.pid ?? session.child?.pid;
|
||||
if (typeof pid !== "number" || !Number.isFinite(pid) || pid <= 0) {
|
||||
return false;
|
||||
}
|
||||
killProcessTree(pid);
|
||||
return true;
|
||||
};
|
||||
|
||||
return {
|
||||
name: "process",
|
||||
label: "process",
|
||||
@@ -523,10 +539,25 @@ export function createProcessTool(
|
||||
if (!scopedSession.backgrounded) {
|
||||
return failText(`Session ${params.sessionId} is not backgrounded.`);
|
||||
}
|
||||
killSession(scopedSession);
|
||||
markExited(scopedSession, null, "SIGKILL", "failed");
|
||||
const canceled = cancelManagedSession(scopedSession.id);
|
||||
if (!canceled) {
|
||||
const terminated = terminateSessionFallback(scopedSession);
|
||||
if (!terminated) {
|
||||
return failText(
|
||||
`Unable to terminate session ${params.sessionId}: no active supervisor run or process id.`,
|
||||
);
|
||||
}
|
||||
markExited(scopedSession, null, "SIGKILL", "failed");
|
||||
}
|
||||
return {
|
||||
content: [{ type: "text", text: `Killed session ${params.sessionId}.` }],
|
||||
content: [
|
||||
{
|
||||
type: "text",
|
||||
text: canceled
|
||||
? `Termination requested for session ${params.sessionId}.`
|
||||
: `Killed session ${params.sessionId}.`,
|
||||
},
|
||||
],
|
||||
details: {
|
||||
status: "failed",
|
||||
name: scopedSession ? deriveSessionName(scopedSession.command) : undefined,
|
||||
@@ -555,10 +586,30 @@ export function createProcessTool(
|
||||
|
||||
case "remove": {
|
||||
if (scopedSession) {
|
||||
killSession(scopedSession);
|
||||
markExited(scopedSession, null, "SIGKILL", "failed");
|
||||
const canceled = cancelManagedSession(scopedSession.id);
|
||||
if (canceled) {
|
||||
// Keep remove semantics deterministic: drop from process registry now.
|
||||
scopedSession.backgrounded = false;
|
||||
deleteSession(params.sessionId);
|
||||
} else {
|
||||
const terminated = terminateSessionFallback(scopedSession);
|
||||
if (!terminated) {
|
||||
return failText(
|
||||
`Unable to remove session ${params.sessionId}: no active supervisor run or process id.`,
|
||||
);
|
||||
}
|
||||
markExited(scopedSession, null, "SIGKILL", "failed");
|
||||
deleteSession(params.sessionId);
|
||||
}
|
||||
return {
|
||||
content: [{ type: "text", text: `Removed session ${params.sessionId}.` }],
|
||||
content: [
|
||||
{
|
||||
type: "text",
|
||||
text: canceled
|
||||
? `Removed session ${params.sessionId} (termination requested).`
|
||||
: `Removed session ${params.sessionId}.`,
|
||||
},
|
||||
],
|
||||
details: {
|
||||
status: "failed",
|
||||
name: scopedSession ? deriveSessionName(scopedSession.command) : undefined,
|
||||
|
||||
@@ -1,11 +1,9 @@
|
||||
import type { ChildProcessWithoutNullStreams } from "node:child_process";
|
||||
import { existsSync, statSync } from "node:fs";
|
||||
import fs from "node:fs/promises";
|
||||
import { homedir } from "node:os";
|
||||
import path from "node:path";
|
||||
import { sliceUtf16Safe } from "../utils.js";
|
||||
import { assertSandboxPath } from "./sandbox-paths.js";
|
||||
import { killProcessTree } from "./shell-utils.js";
|
||||
|
||||
const CHUNK_LIMIT = 8 * 1024;
|
||||
|
||||
@@ -115,13 +113,6 @@ export async function resolveSandboxWorkdir(params: {
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Kills the process tree belonging to a session.
 *
 * The pid is taken from `session.pid`, falling back to `session.child?.pid`.
 * When neither yields a truthy pid the call is a no-op.
 */
export function killSession(session: { pid?: number; child?: ChildProcessWithoutNullStreams }) {
  const targetPid = session.pid ?? session.child?.pid;
  if (!targetPid) {
    return;
  }
  killProcessTree(targetPid);
}
|
||||
|
||||
export function resolveWorkdir(workdir: string, warnings: string[]) {
|
||||
const current = safeCwd();
|
||||
const fallback = current ?? homedir();
|
||||
|
||||
36
src/agents/cli-backends.test.ts
Normal file
36
src/agents/cli-backends.test.ts
Normal file
@@ -0,0 +1,36 @@
|
||||
import { describe, expect, it } from "vitest";
|
||||
import type { OpenClawConfig } from "../config/config.js";
|
||||
import { resolveCliBackendConfig } from "./cli-backends.js";
|
||||
|
||||
describe("resolveCliBackendConfig reliability merge", () => {
  it("deep-merges reliability watchdog overrides for codex", () => {
    // Override a single resume-watchdog field; everything else must come from defaults.
    const cfg = {
      agents: {
        defaults: {
          cliBackends: {
            "codex-cli": {
              command: "codex",
              reliability: {
                watchdog: {
                  resume: {
                    noOutputTimeoutMs: 42_000,
                  },
                },
              },
            },
          },
        },
      },
    } satisfies OpenClawConfig;

    const resolved = resolveCliBackendConfig("codex-cli", cfg);
    expect(resolved).not.toBeNull();

    const watchdog = resolved?.config.reliability?.watchdog;

    // The explicit override wins.
    expect(watchdog?.resume?.noOutputTimeoutMs).toBe(42_000);

    // Ensure defaults are retained when only one field is overridden.
    expect(watchdog?.resume?.noOutputTimeoutRatio).toBe(0.3);
    expect(watchdog?.resume?.minMs).toBe(60_000);
    expect(watchdog?.resume?.maxMs).toBe(180_000);
    expect(watchdog?.fresh?.noOutputTimeoutRatio).toBe(0.8);
  });
});
|
||||
@@ -1,5 +1,9 @@
|
||||
import type { OpenClawConfig } from "../config/config.js";
|
||||
import type { CliBackendConfig } from "../config/types.js";
|
||||
import {
|
||||
CLI_FRESH_WATCHDOG_DEFAULTS,
|
||||
CLI_RESUME_WATCHDOG_DEFAULTS,
|
||||
} from "./cli-watchdog-defaults.js";
|
||||
import { normalizeProviderId } from "./model-selection.js";
|
||||
|
||||
export type ResolvedCliBackend = {
|
||||
@@ -49,6 +53,12 @@ const DEFAULT_CLAUDE_BACKEND: CliBackendConfig = {
|
||||
systemPromptMode: "append",
|
||||
systemPromptWhen: "first",
|
||||
clearEnv: ["ANTHROPIC_API_KEY", "ANTHROPIC_API_KEY_OLD"],
|
||||
reliability: {
|
||||
watchdog: {
|
||||
fresh: { ...CLI_FRESH_WATCHDOG_DEFAULTS },
|
||||
resume: { ...CLI_RESUME_WATCHDOG_DEFAULTS },
|
||||
},
|
||||
},
|
||||
serialize: true,
|
||||
};
|
||||
|
||||
@@ -73,6 +83,12 @@ const DEFAULT_CODEX_BACKEND: CliBackendConfig = {
|
||||
sessionMode: "existing",
|
||||
imageArg: "--image",
|
||||
imageMode: "repeat",
|
||||
reliability: {
|
||||
watchdog: {
|
||||
fresh: { ...CLI_FRESH_WATCHDOG_DEFAULTS },
|
||||
resume: { ...CLI_RESUME_WATCHDOG_DEFAULTS },
|
||||
},
|
||||
},
|
||||
serialize: true,
|
||||
};
|
||||
|
||||
@@ -96,6 +112,10 @@ function mergeBackendConfig(base: CliBackendConfig, override?: CliBackendConfig)
|
||||
if (!override) {
|
||||
return { ...base };
|
||||
}
|
||||
const baseFresh = base.reliability?.watchdog?.fresh ?? {};
|
||||
const baseResume = base.reliability?.watchdog?.resume ?? {};
|
||||
const overrideFresh = override.reliability?.watchdog?.fresh ?? {};
|
||||
const overrideResume = override.reliability?.watchdog?.resume ?? {};
|
||||
return {
|
||||
...base,
|
||||
...override,
|
||||
@@ -106,6 +126,22 @@ function mergeBackendConfig(base: CliBackendConfig, override?: CliBackendConfig)
|
||||
sessionIdFields: override.sessionIdFields ?? base.sessionIdFields,
|
||||
sessionArgs: override.sessionArgs ?? base.sessionArgs,
|
||||
resumeArgs: override.resumeArgs ?? base.resumeArgs,
|
||||
reliability: {
|
||||
...base.reliability,
|
||||
...override.reliability,
|
||||
watchdog: {
|
||||
...base.reliability?.watchdog,
|
||||
...override.reliability?.watchdog,
|
||||
fresh: {
|
||||
...baseFresh,
|
||||
...overrideFresh,
|
||||
},
|
||||
resume: {
|
||||
...baseResume,
|
||||
...overrideResume,
|
||||
},
|
||||
},
|
||||
},
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
@@ -3,50 +3,69 @@ import os from "node:os";
|
||||
import path from "node:path";
|
||||
import { beforeEach, describe, expect, it, vi } from "vitest";
|
||||
import type { OpenClawConfig } from "../config/config.js";
|
||||
import type { CliBackendConfig } from "../config/types.js";
|
||||
import { runCliAgent } from "./cli-runner.js";
|
||||
import { cleanupResumeProcesses, cleanupSuspendedCliProcesses } from "./cli-runner/helpers.js";
|
||||
import { resolveCliNoOutputTimeoutMs } from "./cli-runner/helpers.js";
|
||||
|
||||
const runCommandWithTimeoutMock = vi.fn();
|
||||
const runExecMock = vi.fn();
|
||||
const supervisorSpawnMock = vi.fn();
|
||||
|
||||
vi.mock("../process/exec.js", () => ({
|
||||
runCommandWithTimeout: (...args: unknown[]) => runCommandWithTimeoutMock(...args),
|
||||
runExec: (...args: unknown[]) => runExecMock(...args),
|
||||
vi.mock("../process/supervisor/index.js", () => ({
|
||||
getProcessSupervisor: () => ({
|
||||
spawn: (...args: unknown[]) => supervisorSpawnMock(...args),
|
||||
cancel: vi.fn(),
|
||||
cancelScope: vi.fn(),
|
||||
reconcileOrphans: vi.fn(),
|
||||
getRecord: vi.fn(),
|
||||
}),
|
||||
}));
|
||||
|
||||
describe("runCliAgent resume cleanup", () => {
|
||||
// Exit payload that a mocked managed run resolves from `wait()` in these tests.
type MockRunExit = {
  // Why the run ended.
  reason:
    | "manual-cancel"
    | "overall-timeout"
    | "no-output-timeout"
    | "spawn-error"
    | "signal"
    | "exit";

  // Process exit status.
  exitCode: number | null;
  exitSignal: NodeJS.Signals | number | null;
  durationMs: number;

  // Captured output.
  stdout: string;
  stderr: string;

  // Timeout flags.
  timedOut: boolean;
  noOutputTimedOut: boolean;
};
|
||||
|
||||
// Builds a fake managed-run handle whose `wait()` resolves to the given exit payload.
// `cancel` is a bare mock so tests can assert on it; `stdin` is left undefined.
function createManagedRun(exit: MockRunExit, pid = 1234) {
  const startedAtMs = Date.now();
  const wait = vi.fn().mockResolvedValue(exit);
  const cancel = vi.fn();
  return {
    runId: "run-supervisor",
    pid,
    startedAtMs,
    stdin: undefined,
    wait,
    cancel,
  };
}
|
||||
|
||||
describe("runCliAgent with process supervisor", () => {
|
||||
beforeEach(() => {
|
||||
runCommandWithTimeoutMock.mockReset();
|
||||
runExecMock.mockReset();
|
||||
supervisorSpawnMock.mockReset();
|
||||
});
|
||||
|
||||
it("kills stale resume processes for codex sessions", async () => {
|
||||
const selfPid = process.pid;
|
||||
|
||||
runExecMock
|
||||
.mockResolvedValueOnce({
|
||||
stdout: " 1 999 S /bin/launchd\n",
|
||||
it("runs CLI through supervisor and returns payload", async () => {
|
||||
supervisorSpawnMock.mockResolvedValueOnce(
|
||||
createManagedRun({
|
||||
reason: "exit",
|
||||
exitCode: 0,
|
||||
exitSignal: null,
|
||||
durationMs: 50,
|
||||
stdout: "ok",
|
||||
stderr: "",
|
||||
}) // cleanupSuspendedCliProcesses (ps) — ppid 999 != selfPid, no match
|
||||
.mockResolvedValueOnce({
|
||||
stdout: [
|
||||
` ${selfPid + 1} ${selfPid} codex exec resume thread-123 --color never --sandbox read-only --skip-git-repo-check`,
|
||||
` ${selfPid + 2} 999 codex exec resume thread-123 --color never --sandbox read-only --skip-git-repo-check`,
|
||||
].join("\n"),
|
||||
stderr: "",
|
||||
}) // cleanupResumeProcesses (ps)
|
||||
.mockResolvedValueOnce({ stdout: "", stderr: "" }) // cleanupResumeProcesses (kill -TERM)
|
||||
.mockResolvedValueOnce({ stdout: "", stderr: "" }); // cleanupResumeProcesses (kill -9)
|
||||
runCommandWithTimeoutMock.mockResolvedValueOnce({
|
||||
stdout: "ok",
|
||||
stderr: "",
|
||||
code: 0,
|
||||
signal: null,
|
||||
killed: false,
|
||||
});
|
||||
timedOut: false,
|
||||
noOutputTimedOut: false,
|
||||
}),
|
||||
);
|
||||
|
||||
await runCliAgent({
|
||||
const result = await runCliAgent({
|
||||
sessionId: "s1",
|
||||
sessionFile: "/tmp/session.jsonl",
|
||||
workspaceDir: "/tmp",
|
||||
@@ -58,28 +77,80 @@ describe("runCliAgent resume cleanup", () => {
|
||||
cliSessionId: "thread-123",
|
||||
});
|
||||
|
||||
if (process.platform === "win32") {
|
||||
expect(runExecMock).not.toHaveBeenCalled();
|
||||
return;
|
||||
}
|
||||
expect(result.payloads?.[0]?.text).toBe("ok");
|
||||
expect(supervisorSpawnMock).toHaveBeenCalledTimes(1);
|
||||
const input = supervisorSpawnMock.mock.calls[0]?.[0] as {
|
||||
argv?: string[];
|
||||
mode?: string;
|
||||
timeoutMs?: number;
|
||||
noOutputTimeoutMs?: number;
|
||||
replaceExistingScope?: boolean;
|
||||
scopeKey?: string;
|
||||
};
|
||||
expect(input.mode).toBe("child");
|
||||
expect(input.argv?.[0]).toBe("codex");
|
||||
expect(input.timeoutMs).toBe(1_000);
|
||||
expect(input.noOutputTimeoutMs).toBeGreaterThanOrEqual(1_000);
|
||||
expect(input.replaceExistingScope).toBe(true);
|
||||
expect(input.scopeKey).toContain("thread-123");
|
||||
});
|
||||
|
||||
expect(runExecMock).toHaveBeenCalledTimes(4);
|
||||
it("fails with timeout when no-output watchdog trips", async () => {
|
||||
supervisorSpawnMock.mockResolvedValueOnce(
|
||||
createManagedRun({
|
||||
reason: "no-output-timeout",
|
||||
exitCode: null,
|
||||
exitSignal: "SIGKILL",
|
||||
durationMs: 200,
|
||||
stdout: "",
|
||||
stderr: "",
|
||||
timedOut: true,
|
||||
noOutputTimedOut: true,
|
||||
}),
|
||||
);
|
||||
|
||||
// Second call: cleanupResumeProcesses ps
|
||||
const psCall = runExecMock.mock.calls[1] ?? [];
|
||||
expect(psCall[0]).toBe("ps");
|
||||
await expect(
|
||||
runCliAgent({
|
||||
sessionId: "s1",
|
||||
sessionFile: "/tmp/session.jsonl",
|
||||
workspaceDir: "/tmp",
|
||||
prompt: "hi",
|
||||
provider: "codex-cli",
|
||||
model: "gpt-5.2-codex",
|
||||
timeoutMs: 1_000,
|
||||
runId: "run-2",
|
||||
cliSessionId: "thread-123",
|
||||
}),
|
||||
).rejects.toThrow("produced no output");
|
||||
});
|
||||
|
||||
// Third call: TERM, only the child PID
|
||||
const termCall = runExecMock.mock.calls[2] ?? [];
|
||||
expect(termCall[0]).toBe("kill");
|
||||
const termArgs = termCall[1] as string[];
|
||||
expect(termArgs).toEqual(["-TERM", String(selfPid + 1)]);
|
||||
it("fails with timeout when overall timeout trips", async () => {
|
||||
supervisorSpawnMock.mockResolvedValueOnce(
|
||||
createManagedRun({
|
||||
reason: "overall-timeout",
|
||||
exitCode: null,
|
||||
exitSignal: "SIGKILL",
|
||||
durationMs: 200,
|
||||
stdout: "",
|
||||
stderr: "",
|
||||
timedOut: true,
|
||||
noOutputTimedOut: false,
|
||||
}),
|
||||
);
|
||||
|
||||
// Fourth call: KILL, only the child PID
|
||||
const killCall = runExecMock.mock.calls[3] ?? [];
|
||||
expect(killCall[0]).toBe("kill");
|
||||
const killArgs = killCall[1] as string[];
|
||||
expect(killArgs).toEqual(["-9", String(selfPid + 1)]);
|
||||
await expect(
|
||||
runCliAgent({
|
||||
sessionId: "s1",
|
||||
sessionFile: "/tmp/session.jsonl",
|
||||
workspaceDir: "/tmp",
|
||||
prompt: "hi",
|
||||
provider: "codex-cli",
|
||||
model: "gpt-5.2-codex",
|
||||
timeoutMs: 1_000,
|
||||
runId: "run-3",
|
||||
cliSessionId: "thread-123",
|
||||
}),
|
||||
).rejects.toThrow("exceeded timeout");
|
||||
});
|
||||
|
||||
it("falls back to per-agent workspace when workspaceDir is missing", async () => {
|
||||
@@ -94,14 +165,18 @@ describe("runCliAgent resume cleanup", () => {
|
||||
},
|
||||
} satisfies OpenClawConfig;
|
||||
|
||||
runExecMock.mockResolvedValue({ stdout: "", stderr: "" });
|
||||
runCommandWithTimeoutMock.mockResolvedValueOnce({
|
||||
stdout: "ok",
|
||||
stderr: "",
|
||||
code: 0,
|
||||
signal: null,
|
||||
killed: false,
|
||||
});
|
||||
supervisorSpawnMock.mockResolvedValueOnce(
|
||||
createManagedRun({
|
||||
reason: "exit",
|
||||
exitCode: 0,
|
||||
exitSignal: null,
|
||||
durationMs: 25,
|
||||
stdout: "ok",
|
||||
stderr: "",
|
||||
timedOut: false,
|
||||
noOutputTimedOut: false,
|
||||
}),
|
||||
);
|
||||
|
||||
try {
|
||||
await runCliAgent({
|
||||
@@ -114,264 +189,33 @@ describe("runCliAgent resume cleanup", () => {
|
||||
provider: "codex-cli",
|
||||
model: "gpt-5.2-codex",
|
||||
timeoutMs: 1_000,
|
||||
runId: "run-1",
|
||||
runId: "run-4",
|
||||
});
|
||||
} finally {
|
||||
await fs.rm(tempDir, { recursive: true, force: true });
|
||||
}
|
||||
|
||||
const options = runCommandWithTimeoutMock.mock.calls[0]?.[1] as { cwd?: string };
|
||||
expect(options.cwd).toBe(path.resolve(fallbackWorkspace));
|
||||
const input = supervisorSpawnMock.mock.calls[0]?.[0] as { cwd?: string };
|
||||
expect(input.cwd).toBe(path.resolve(fallbackWorkspace));
|
||||
});
|
||||
});
|
||||
|
||||
it("throws when sessionKey is malformed", async () => {
|
||||
const tempDir = await fs.mkdtemp(path.join(os.tmpdir(), "openclaw-cli-runner-"));
|
||||
const mainWorkspace = path.join(tempDir, "workspace-main");
|
||||
const researchWorkspace = path.join(tempDir, "workspace-research");
|
||||
await fs.mkdir(mainWorkspace, { recursive: true });
|
||||
await fs.mkdir(researchWorkspace, { recursive: true });
|
||||
const cfg = {
|
||||
agents: {
|
||||
defaults: {
|
||||
workspace: mainWorkspace,
|
||||
describe("resolveCliNoOutputTimeoutMs", () => {
|
||||
it("uses backend-configured resume watchdog override", () => {
|
||||
const timeoutMs = resolveCliNoOutputTimeoutMs({
|
||||
backend: {
|
||||
command: "codex",
|
||||
reliability: {
|
||||
watchdog: {
|
||||
resume: {
|
||||
noOutputTimeoutMs: 42_000,
|
||||
},
|
||||
},
|
||||
},
|
||||
list: [{ id: "research", workspace: researchWorkspace }],
|
||||
},
|
||||
} satisfies OpenClawConfig;
|
||||
|
||||
try {
|
||||
await expect(
|
||||
runCliAgent({
|
||||
sessionId: "s1",
|
||||
sessionKey: "agent::broken",
|
||||
agentId: "research",
|
||||
sessionFile: "/tmp/session.jsonl",
|
||||
workspaceDir: undefined as unknown as string,
|
||||
config: cfg,
|
||||
prompt: "hi",
|
||||
provider: "codex-cli",
|
||||
model: "gpt-5.2-codex",
|
||||
timeoutMs: 1_000,
|
||||
runId: "run-2",
|
||||
}),
|
||||
).rejects.toThrow("Malformed agent session key");
|
||||
} finally {
|
||||
await fs.rm(tempDir, { recursive: true, force: true });
|
||||
}
|
||||
expect(runCommandWithTimeoutMock).not.toHaveBeenCalled();
|
||||
});
|
||||
});
|
||||
|
||||
describe("cleanupSuspendedCliProcesses", () => {
|
||||
beforeEach(() => {
|
||||
runExecMock.mockReset();
|
||||
});
|
||||
|
||||
it("skips when no session tokens are configured", async () => {
|
||||
await cleanupSuspendedCliProcesses(
|
||||
{
|
||||
command: "tool",
|
||||
} as CliBackendConfig,
|
||||
0,
|
||||
);
|
||||
|
||||
if (process.platform === "win32") {
|
||||
expect(runExecMock).not.toHaveBeenCalled();
|
||||
return;
|
||||
}
|
||||
|
||||
expect(runExecMock).not.toHaveBeenCalled();
|
||||
});
|
||||
|
||||
it("matches sessionArg-based commands", async () => {
|
||||
const selfPid = process.pid;
|
||||
runExecMock
|
||||
.mockResolvedValueOnce({
|
||||
stdout: [
|
||||
` 40 ${selfPid} T+ claude --session-id thread-1 -p`,
|
||||
` 41 ${selfPid} S claude --session-id thread-2 -p`,
|
||||
].join("\n"),
|
||||
stderr: "",
|
||||
})
|
||||
.mockResolvedValueOnce({ stdout: "", stderr: "" });
|
||||
|
||||
await cleanupSuspendedCliProcesses(
|
||||
{
|
||||
command: "claude",
|
||||
sessionArg: "--session-id",
|
||||
} as CliBackendConfig,
|
||||
0,
|
||||
);
|
||||
|
||||
if (process.platform === "win32") {
|
||||
expect(runExecMock).not.toHaveBeenCalled();
|
||||
return;
|
||||
}
|
||||
|
||||
expect(runExecMock).toHaveBeenCalledTimes(2);
|
||||
const killCall = runExecMock.mock.calls[1] ?? [];
|
||||
expect(killCall[0]).toBe("kill");
|
||||
expect(killCall[1]).toEqual(["-9", "40"]);
|
||||
});
|
||||
|
||||
it("matches resumeArgs with positional session id", async () => {
|
||||
const selfPid = process.pid;
|
||||
runExecMock
|
||||
.mockResolvedValueOnce({
|
||||
stdout: [
|
||||
` 50 ${selfPid} T codex exec resume thread-99 --color never --sandbox read-only`,
|
||||
` 51 ${selfPid} T codex exec resume other --color never --sandbox read-only`,
|
||||
].join("\n"),
|
||||
stderr: "",
|
||||
})
|
||||
.mockResolvedValueOnce({ stdout: "", stderr: "" });
|
||||
|
||||
await cleanupSuspendedCliProcesses(
|
||||
{
|
||||
command: "codex",
|
||||
resumeArgs: ["exec", "resume", "{sessionId}", "--color", "never", "--sandbox", "read-only"],
|
||||
} as CliBackendConfig,
|
||||
1,
|
||||
);
|
||||
|
||||
if (process.platform === "win32") {
|
||||
expect(runExecMock).not.toHaveBeenCalled();
|
||||
return;
|
||||
}
|
||||
|
||||
expect(runExecMock).toHaveBeenCalledTimes(2);
|
||||
const killCall = runExecMock.mock.calls[1] ?? [];
|
||||
expect(killCall[0]).toBe("kill");
|
||||
expect(killCall[1]).toEqual(["-9", "50", "51"]);
|
||||
});
|
||||
|
||||
it("only kills child processes of current process (ppid validation)", async () => {
|
||||
const selfPid = process.pid;
|
||||
const childPid = selfPid + 1;
|
||||
const unrelatedPid = 9999;
|
||||
|
||||
runExecMock
|
||||
.mockResolvedValueOnce({
|
||||
stdout: [
|
||||
` ${childPid} ${selfPid} T claude --session-id thread-1 -p`,
|
||||
` ${unrelatedPid} 100 T claude --session-id thread-2 -p`,
|
||||
].join("\n"),
|
||||
stderr: "",
|
||||
})
|
||||
.mockResolvedValueOnce({ stdout: "", stderr: "" });
|
||||
|
||||
await cleanupSuspendedCliProcesses(
|
||||
{
|
||||
command: "claude",
|
||||
sessionArg: "--session-id",
|
||||
} as CliBackendConfig,
|
||||
0,
|
||||
);
|
||||
|
||||
if (process.platform === "win32") {
|
||||
expect(runExecMock).not.toHaveBeenCalled();
|
||||
return;
|
||||
}
|
||||
|
||||
expect(runExecMock).toHaveBeenCalledTimes(2);
|
||||
const killCall = runExecMock.mock.calls[1] ?? [];
|
||||
expect(killCall[0]).toBe("kill");
|
||||
// Only childPid killed; unrelatedPid (ppid=100) excluded
|
||||
expect(killCall[1]).toEqual(["-9", String(childPid)]);
|
||||
});
|
||||
|
||||
it("skips all processes when none are children of current process", async () => {
|
||||
runExecMock.mockResolvedValueOnce({
|
||||
stdout: [
|
||||
" 200 100 T claude --session-id thread-1 -p",
|
||||
" 201 100 T claude --session-id thread-2 -p",
|
||||
].join("\n"),
|
||||
stderr: "",
|
||||
timeoutMs: 120_000,
|
||||
useResume: true,
|
||||
});
|
||||
|
||||
await cleanupSuspendedCliProcesses(
|
||||
{
|
||||
command: "claude",
|
||||
sessionArg: "--session-id",
|
||||
} as CliBackendConfig,
|
||||
0,
|
||||
);
|
||||
|
||||
if (process.platform === "win32") {
|
||||
expect(runExecMock).not.toHaveBeenCalled();
|
||||
return;
|
||||
}
|
||||
|
||||
// Only ps called — no kill because no matching ppid
|
||||
expect(runExecMock).toHaveBeenCalledTimes(1);
|
||||
});
|
||||
});
|
||||
|
||||
describe("cleanupResumeProcesses", () => {
|
||||
beforeEach(() => {
|
||||
runExecMock.mockReset();
|
||||
});
|
||||
|
||||
it("only kills resume processes owned by current process", async () => {
|
||||
const selfPid = process.pid;
|
||||
|
||||
runExecMock
|
||||
.mockResolvedValueOnce({
|
||||
stdout: [
|
||||
` ${selfPid + 1} ${selfPid} codex exec resume abc-123`,
|
||||
` ${selfPid + 2} 999 codex exec resume abc-123`,
|
||||
].join("\n"),
|
||||
stderr: "",
|
||||
})
|
||||
.mockResolvedValueOnce({ stdout: "", stderr: "" })
|
||||
.mockResolvedValueOnce({ stdout: "", stderr: "" });
|
||||
|
||||
await cleanupResumeProcesses(
|
||||
{
|
||||
command: "codex",
|
||||
resumeArgs: ["exec", "resume", "{sessionId}"],
|
||||
} as CliBackendConfig,
|
||||
"abc-123",
|
||||
);
|
||||
|
||||
if (process.platform === "win32") {
|
||||
expect(runExecMock).not.toHaveBeenCalled();
|
||||
return;
|
||||
}
|
||||
|
||||
expect(runExecMock).toHaveBeenCalledTimes(3);
|
||||
|
||||
const termCall = runExecMock.mock.calls[1] ?? [];
|
||||
expect(termCall[0]).toBe("kill");
|
||||
expect(termCall[1]).toEqual(["-TERM", String(selfPid + 1)]);
|
||||
|
||||
const killCall = runExecMock.mock.calls[2] ?? [];
|
||||
expect(killCall[0]).toBe("kill");
|
||||
expect(killCall[1]).toEqual(["-9", String(selfPid + 1)]);
|
||||
});
|
||||
|
||||
it("skips kill when no resume processes match ppid", async () => {
|
||||
runExecMock.mockResolvedValueOnce({
|
||||
stdout: [" 300 100 codex exec resume abc-123", " 301 200 codex exec resume abc-123"].join(
|
||||
"\n",
|
||||
),
|
||||
stderr: "",
|
||||
});
|
||||
|
||||
await cleanupResumeProcesses(
|
||||
{
|
||||
command: "codex",
|
||||
resumeArgs: ["exec", "resume", "{sessionId}"],
|
||||
} as CliBackendConfig,
|
||||
"abc-123",
|
||||
);
|
||||
|
||||
if (process.platform === "win32") {
|
||||
expect(runExecMock).not.toHaveBeenCalled();
|
||||
return;
|
||||
}
|
||||
|
||||
// Only ps called — no kill because no matching ppid
|
||||
expect(runExecMock).toHaveBeenCalledTimes(1);
|
||||
expect(timeoutMs).toBe(42_000);
|
||||
});
|
||||
});
|
||||
|
||||
@@ -6,20 +6,20 @@ import { resolveHeartbeatPrompt } from "../auto-reply/heartbeat.js";
|
||||
import { shouldLogVerbose } from "../globals.js";
|
||||
import { isTruthyEnvValue } from "../infra/env.js";
|
||||
import { createSubsystemLogger } from "../logging/subsystem.js";
|
||||
import { runCommandWithTimeout } from "../process/exec.js";
|
||||
import { getProcessSupervisor } from "../process/supervisor/index.js";
|
||||
import { resolveSessionAgentIds } from "./agent-scope.js";
|
||||
import { makeBootstrapWarn, resolveBootstrapContextForRun } from "./bootstrap-files.js";
|
||||
import { resolveCliBackendConfig } from "./cli-backends.js";
|
||||
import {
|
||||
appendImagePathsToPrompt,
|
||||
buildCliSupervisorScopeKey,
|
||||
buildCliArgs,
|
||||
buildSystemPrompt,
|
||||
cleanupResumeProcesses,
|
||||
cleanupSuspendedCliProcesses,
|
||||
enqueueCliRun,
|
||||
normalizeCliModel,
|
||||
parseCliJson,
|
||||
parseCliJsonl,
|
||||
resolveCliNoOutputTimeoutMs,
|
||||
resolvePromptInput,
|
||||
resolveSessionIdToSend,
|
||||
resolveSystemPromptUsage,
|
||||
@@ -226,19 +226,32 @@ export async function runCliAgent(params: {
|
||||
}
|
||||
return next;
|
||||
})();
|
||||
|
||||
// Cleanup suspended processes that have accumulated (regardless of sessionId)
|
||||
await cleanupSuspendedCliProcesses(backend);
|
||||
if (useResume && cliSessionIdToSend) {
|
||||
await cleanupResumeProcesses(backend, cliSessionIdToSend);
|
||||
}
|
||||
|
||||
const result = await runCommandWithTimeout([backend.command, ...args], {
|
||||
const noOutputTimeoutMs = resolveCliNoOutputTimeoutMs({
|
||||
backend,
|
||||
timeoutMs: params.timeoutMs,
|
||||
useResume,
|
||||
});
|
||||
const supervisor = getProcessSupervisor();
|
||||
const scopeKey = buildCliSupervisorScopeKey({
|
||||
backend,
|
||||
backendId: backendResolved.id,
|
||||
cliSessionId: useResume ? cliSessionIdToSend : undefined,
|
||||
});
|
||||
|
||||
const managedRun = await supervisor.spawn({
|
||||
sessionId: params.sessionId,
|
||||
backendId: backendResolved.id,
|
||||
scopeKey,
|
||||
replaceExistingScope: Boolean(useResume && scopeKey),
|
||||
mode: "child",
|
||||
argv: [backend.command, ...args],
|
||||
timeoutMs: params.timeoutMs,
|
||||
noOutputTimeoutMs,
|
||||
cwd: workspaceDir,
|
||||
env,
|
||||
input: stdinPayload,
|
||||
});
|
||||
const result = await managedRun.wait();
|
||||
|
||||
const stdout = result.stdout.trim();
|
||||
const stderr = result.stderr.trim();
|
||||
@@ -259,7 +272,28 @@ export async function runCliAgent(params: {
|
||||
}
|
||||
}
|
||||
|
||||
if (result.code !== 0) {
|
||||
if (result.exitCode !== 0 || result.reason !== "exit") {
|
||||
if (result.reason === "no-output-timeout" || result.noOutputTimedOut) {
|
||||
const timeoutReason = `CLI produced no output for ${Math.round(noOutputTimeoutMs / 1000)}s and was terminated.`;
|
||||
log.warn(
|
||||
`cli watchdog timeout: provider=${params.provider} model=${modelId} session=${cliSessionIdToSend ?? params.sessionId} noOutputTimeoutMs=${noOutputTimeoutMs} pid=${managedRun.pid ?? "unknown"}`,
|
||||
);
|
||||
throw new FailoverError(timeoutReason, {
|
||||
reason: "timeout",
|
||||
provider: params.provider,
|
||||
model: modelId,
|
||||
status: resolveFailoverStatus("timeout"),
|
||||
});
|
||||
}
|
||||
if (result.reason === "overall-timeout") {
|
||||
const timeoutReason = `CLI exceeded timeout (${Math.round(params.timeoutMs / 1000)}s) and was terminated.`;
|
||||
throw new FailoverError(timeoutReason, {
|
||||
reason: "timeout",
|
||||
provider: params.provider,
|
||||
model: modelId,
|
||||
status: resolveFailoverStatus("timeout"),
|
||||
});
|
||||
}
|
||||
const err = stderr || stdout || "CLI failed.";
|
||||
const reason = classifyFailoverReason(err) ?? "unknown";
|
||||
const status = resolveFailoverStatus(reason);
|
||||
|
||||
@@ -8,232 +8,27 @@ import type { ThinkLevel } from "../../auto-reply/thinking.js";
|
||||
import type { OpenClawConfig } from "../../config/config.js";
|
||||
import type { CliBackendConfig } from "../../config/types.js";
|
||||
import type { EmbeddedContextFile } from "../pi-embedded-helpers.js";
|
||||
import { runExec } from "../../process/exec.js";
|
||||
import { buildTtsSystemPromptHint } from "../../tts/tts.js";
|
||||
import { escapeRegExp, isRecord } from "../../utils.js";
|
||||
import { isRecord } from "../../utils.js";
|
||||
import { buildModelAliasLines } from "../model-alias-lines.js";
|
||||
import { resolveDefaultModelForAgent } from "../model-selection.js";
|
||||
import { detectRuntimeShell } from "../shell-utils.js";
|
||||
import { buildSystemPromptParams } from "../system-prompt-params.js";
|
||||
import { buildAgentSystemPrompt } from "../system-prompt.js";
|
||||
export { buildCliSupervisorScopeKey, resolveCliNoOutputTimeoutMs } from "./reliability.js";
|
||||
|
||||
const CLI_RUN_QUEUE = new Map<string, Promise<unknown>>();
|
||||
|
||||
/**
 * Builds a regex for scanning a `ps` command column.
 *
 * Tokens are stringified, trimmed and empties dropped. Each remaining token must
 * appear as a whole whitespace-delimited argument (so `codex` does not match
 * `codexx`), in the given order, with arbitrary text allowed between them. The
 * first token may also appear as the basename of a path (`/path/to/<token>`).
 * With no usable tokens the result is a regex that never matches.
 */
function buildLooseArgOrderRegex(tokens: string[]): RegExp {
  const cleaned: string[] = [];
  for (const raw of tokens) {
    const value = String(raw ?? "").trim();
    if (value) {
      cleaned.push(value);
    }
  }

  const [command, ...others] = cleaned;
  if (!command) {
    return /$^/;
  }

  // Wrap a fragment so it only matches at whitespace (or string) boundaries.
  const asWholeArg = (body: string) => `(?:^|\\s)${body}(?=\\s|$)`;

  const commandEscaped = escapeRegExp(command);
  const fragments = [asWholeArg(`(?:${commandEscaped}|\\S+\\/${commandEscaped})`)];
  for (const other of others) {
    fragments.push(asWholeArg(escapeRegExp(other)));
  }
  return new RegExp(fragments.join(".*"));
}
|
||||
|
||||
/**
 * Runs `ps` with `argsA` and returns its stdout; if that attempt throws, retries
 * once with `argsB`. A failure of the second attempt propagates to the caller.
 */
async function psWithFallback(argsA: string[], argsB: string[]): Promise<string> {
  try {
    return (await runExec("ps", argsA)).stdout;
  } catch {
    // Preferred flags were rejected or the call failed; try the fallback flags.
    return (await runExec("ps", argsB)).stdout;
  }
}
|
||||
|
||||
/**
 * Best-effort cleanup of leftover resume invocations of a CLI backend.
 *
 * Lists processes via `ps`, keeps those whose parent pid is the current process
 * and whose command line matches the backend command followed by
 * `backend.resumeArgs` (with `{sessionId}` replaced by `sessionId`), then sends
 * them SIGTERM and, after 250ms, SIGKILL.
 *
 * Does nothing on Windows, when `resumeArgs` has no `{sessionId}` placeholder
 * (including when it is empty), or when the command basename is empty. All
 * errors are swallowed on purpose: this is opportunistic cleanup and must not
 * fail the caller.
 *
 * @param backend CLI backend configuration supplying `command` and `resumeArgs`.
 * @param sessionId Session id substituted for `{sessionId}` in `resumeArgs`.
 */
export async function cleanupResumeProcesses(
  backend: CliBackendConfig,
  sessionId: string,
): Promise<void> {
  if (process.platform === "win32") {
    return;
  }
  const resumeArgs = backend.resumeArgs ?? [];
  // `.some` is false for an empty array, so this also covers "no resume args".
  if (!resumeArgs.some((arg) => arg.includes("{sessionId}"))) {
    return;
  }
  const commandToken = path.basename(backend.command ?? "").trim();
  if (!commandToken) {
    return;
  }

  const resumeTokens = resumeArgs.map((arg) => arg.replaceAll("{sessionId}", sessionId));

  try {
    const stdout = await psWithFallback(
      ["-axww", "-o", "pid=,ppid=,command="],
      ["-ax", "-o", "pid=,ppid=,command="],
    );
    const patternRegex = buildLooseArgOrderRegex([commandToken, ...resumeTokens]);
    const toKill: number[] = [];

    for (const line of stdout.split("\n")) {
      const trimmed = line.trim();
      if (!trimmed) {
        continue;
      }
      // Columns: pid, ppid, then the full command line.
      const match = /^(\d+)\s+(\d+)\s+(.*)$/.exec(trimmed);
      if (!match) {
        continue;
      }
      const pid = Number(match[1]);
      const ppid = Number(match[2]);
      const cmd = match[3] ?? "";
      if (!Number.isFinite(pid)) {
        continue;
      }
      // Only touch direct children of this process.
      if (ppid !== process.pid) {
        continue;
      }
      if (!patternRegex.test(cmd)) {
        continue;
      }
      toKill.push(pid);
    }

    if (toKill.length > 0) {
      const pidArgs = toKill.map((pid) => String(pid));
      try {
        await runExec("kill", ["-TERM", ...pidArgs]);
      } catch {
        // ignore
      }
      // Give the processes a short grace period before forcing termination.
      await new Promise((resolve) => setTimeout(resolve, 250));
      try {
        await runExec("kill", ["-9", ...pidArgs]);
      } catch {
        // ignore
      }
    }
  } catch {
    // ignore errors - best effort cleanup
  }
}
|
||||
|
||||
/**
 * Builds regexes that recognise session-bearing invocations of the backend
 * command in a `ps` command column.
 *
 * One matcher is produced from `sessionArgs` when it contains a `{sessionId}`
 * placeholder, otherwise from `sessionArg` followed by a session id when
 * `sessionArg` is set. A further matcher is produced from `resumeArgs` when it
 * contains the placeholder. Returns an empty list when the command basename is empty.
 */
function buildSessionMatchers(backend: CliBackendConfig): RegExp[] {
  const binary = path.basename(backend.command ?? "").trim();
  if (!binary) {
    return [];
  }

  const hasPlaceholder = (arg: string) => arg.includes("{sessionId}");

  // Command at a whitespace boundary, then each argument separated by whitespace.
  const compile = (args: string[]): RegExp => {
    let source = `(?:^|\\s)${tokenToRegex(binary)}`;
    for (const arg of args) {
      source += `\\s+${tokenToRegex(arg)}`;
    }
    return new RegExp(source);
  };

  const result: RegExp[] = [];

  const sessionArgs = backend.sessionArgs ?? [];
  const singleSessionArg = backend.sessionArg?.trim();
  if (sessionArgs.some(hasPlaceholder)) {
    result.push(compile(sessionArgs));
  } else if (singleSessionArg) {
    result.push(compile([singleSessionArg, "{sessionId}"]));
  }

  const resumeArgs = backend.resumeArgs ?? [];
  if (resumeArgs.some(hasPlaceholder)) {
    result.push(compile(resumeArgs));
  }

  return result;
}
|
||||
|
||||
/**
 * Converts one argument token into regex source text.
 *
 * Literal text is regex-escaped and every `{sessionId}` placeholder becomes
 * `\S+` (one or more non-whitespace characters). A token without a placeholder
 * splits into a single part, so it comes back simply escaped.
 */
function tokenToRegex(token: string): string {
  const literalParts = token.split("{sessionId}");
  return literalParts.map((part) => escapeRegExp(part)).join("\\S+");
}
|
||||
|
||||
/**
 * Cleanup suspended OpenClaw CLI processes that have accumulated.
 * Only cleans up if there are more than the threshold (default: 10).
 *
 * A process is a candidate when all of the following hold:
 *  - its parent pid is this process (`process.pid`);
 *  - its `ps` state column contains `T`;
 *  - its command line matches one of the backend's session matchers.
 *
 * Does nothing on Windows or when the backend yields no session matchers.
 * Any failure while listing or killing is swallowed: this is best-effort cleanup.
 *
 * @param backend CLI backend whose session/resume invocations should be matched.
 * @param threshold Candidates are killed only when their count is strictly
 *   greater than this value.
 */
export async function cleanupSuspendedCliProcesses(
  backend: CliBackendConfig,
  threshold = 10,
): Promise<void> {
  if (process.platform === "win32") {
    return;
  }
  const matchers = buildSessionMatchers(backend);
  if (matchers.length === 0) {
    return;
  }

  try {
    // NOTE(review): psWithFallback presumably retries with the second argument
    // list when the first (`-ww`, unlimited width) is rejected; confirm against its definition.
    const stdout = await psWithFallback(
      ["-axww", "-o", "pid=,ppid=,stat=,command="],
      ["-ax", "-o", "pid=,ppid=,stat=,command="],
    );

    // Columns: pid, ppid, stat, then the rest of the line as the command.
    const psLine = /^(\d+)\s+(\d+)\s+(\S+)\s+(.*)$/;
    const suspended: number[] = [];
    for (const line of stdout.split("\n")) {
      const trimmed = line.trim();
      if (!trimmed) {
        continue;
      }
      const match = psLine.exec(trimmed);
      if (!match) {
        continue;
      }
      const pid = Number(match[1]);
      const ppid = Number(match[2]);
      const stat = match[3] ?? "";
      const command = match[4] ?? "";
      if (!Number.isFinite(pid)) {
        continue;
      }
      // Only touch our own direct children.
      if (ppid !== process.pid) {
        continue;
      }
      if (!stat.includes("T")) {
        continue;
      }
      if (!matchers.some((matcher) => matcher.test(command))) {
        continue;
      }
      suspended.push(pid);
    }

    if (suspended.length > threshold) {
      // Verified locally: stopped (T) processes ignore SIGTERM, so use SIGKILL.
      await runExec("kill", ["-9", ...suspended.map((pid) => String(pid))]);
    }
  } catch {
    // ignore errors - best effort cleanup
  }
}
|
||||
/**
 * Serialize CLI runs that share a queue key.
 *
 * `task` starts only after the run previously enqueued under `key` has
 * settled. A failure of that earlier run does not prevent `task` from running.
 *
 * @param key Queue identifier; runs with the same key execute one at a time.
 * @param task Factory for the run; invoked once its predecessor has settled.
 * @returns A promise that settles with `task`'s own result or rejection.
 */
export function enqueueCliRun<T>(key: string, task: () => Promise<T>): Promise<T> {
  const prior = CLI_RUN_QUEUE.get(key) ?? Promise.resolve();
  const chained = prior.catch(() => undefined).then(task);
  // Keep queue continuity even when a run rejects, without emitting unhandled rejections.
  const tracked = chained
    .catch(() => undefined)
    .finally(() => {
      // Only drop the entry if no later run has replaced it in the meantime.
      if (CLI_RUN_QUEUE.get(key) === tracked) {
        CLI_RUN_QUEUE.delete(key);
      }
    });
  CLI_RUN_QUEUE.set(key, tracked);
  // Callers get the un-swallowed promise so they still observe failures.
  return chained;
}
|
||||
|
||||
// --- src/agents/cli-runner/reliability.ts (new file, 88 lines; hunk @@ -0,0 +1,88 @@) ---
|
||||
import path from "node:path";
|
||||
import type { CliBackendConfig } from "../../config/types.js";
|
||||
import {
|
||||
CLI_FRESH_WATCHDOG_DEFAULTS,
|
||||
CLI_RESUME_WATCHDOG_DEFAULTS,
|
||||
CLI_WATCHDOG_MIN_TIMEOUT_MS,
|
||||
} from "../cli-watchdog-defaults.js";
|
||||
|
||||
/**
 * Resolve the no-output watchdog settings for a run by merging the backend's
 * configured values over the built-in defaults.
 *
 * Resume runs read `backend.reliability.watchdog.resume` and fall back to
 * `CLI_RESUME_WATCHDOG_DEFAULTS`. Fresh runs read `...watchdog.fresh` and fall
 * back to `CLI_FRESH_WATCHDOG_DEFAULTS`. Configured values that are not finite
 * numbers are ignored.
 *
 * @param backend CLI backend configuration.
 * @param useResume Whether the run resumes an existing session.
 * @returns The profile:
 *  - `noOutputTimeoutMs`: explicit timeout, floored and raised to at least
 *    `CLI_WATCHDOG_MIN_TIMEOUT_MS`, or `undefined` when not configured;
 *  - `noOutputTimeoutRatio`: configured ratio clamped to [0.05, 0.95], or the default;
 *  - `minMs` / `maxMs`: the bounds, swapped if needed so that `minMs <= maxMs`.
 */
function pickWatchdogProfile(
  backend: CliBackendConfig,
  useResume: boolean,
): {
  noOutputTimeoutMs?: number;
  noOutputTimeoutRatio: number;
  minMs: number;
  maxMs: number;
} {
  const defaults = useResume ? CLI_RESUME_WATCHDOG_DEFAULTS : CLI_FRESH_WATCHDOG_DEFAULTS;
  const configured = useResume
    ? backend.reliability?.watchdog?.resume
    : backend.reliability?.watchdog?.fresh;

  // Shared normalisation for every millisecond setting: accept only finite
  // numbers, drop the fraction, and enforce the global minimum.
  const toTimeoutMs = (value: unknown): number | undefined =>
    typeof value === "number" && Number.isFinite(value)
      ? Math.max(CLI_WATCHDOG_MIN_TIMEOUT_MS, Math.floor(value))
      : undefined;

  const configuredRatio = configured?.noOutputTimeoutRatio;
  const ratio =
    typeof configuredRatio === "number" && Number.isFinite(configuredRatio)
      ? Math.max(0.05, Math.min(0.95, configuredRatio))
      : defaults.noOutputTimeoutRatio;
  const minMs = toTimeoutMs(configured?.minMs) ?? defaults.minMs;
  const maxMs = toTimeoutMs(configured?.maxMs) ?? defaults.maxMs;

  return {
    noOutputTimeoutMs: toTimeoutMs(configured?.noOutputTimeoutMs),
    noOutputTimeoutRatio: ratio,
    // Tolerate a config where min and max were given in the wrong order.
    minMs: Math.min(minMs, maxMs),
    maxMs: Math.max(minMs, maxMs),
  };
}
|
||||
|
||||
/**
 * Compute how long a CLI run may stay silent before the no-output watchdog fires.
 *
 * With an explicitly configured `noOutputTimeoutMs`, that value is used.
 * Otherwise the timeout is `timeoutMs * noOutputTimeoutRatio`, floored and
 * clamped to the profile's `[minMs, maxMs]`. In both cases the result is capped
 * at `timeoutMs - 1000`, and the cap itself is never lower than
 * `CLI_WATCHDOG_MIN_TIMEOUT_MS`.
 *
 * @param params.backend CLI backend configuration supplying the watchdog profile.
 * @param params.timeoutMs Overall timeout of the run, in milliseconds.
 * @param params.useResume Whether the run resumes an existing session.
 * @returns The no-output timeout in milliseconds.
 */
export function resolveCliNoOutputTimeoutMs(params: {
  backend: CliBackendConfig;
  timeoutMs: number;
  useResume: boolean;
}): number {
  const profile = pickWatchdogProfile(params.backend, params.useResume);
  // Keep watchdog below global timeout in normal cases.
  const cap = Math.max(CLI_WATCHDOG_MIN_TIMEOUT_MS, params.timeoutMs - 1_000);
  if (profile.noOutputTimeoutMs !== undefined) {
    return Math.min(profile.noOutputTimeoutMs, cap);
  }
  const computed = Math.floor(params.timeoutMs * profile.noOutputTimeoutRatio);
  const bounded = Math.min(profile.maxMs, Math.max(profile.minMs, computed));
  return Math.min(bounded, cap);
}
|
||||
|
||||
/**
 * Build the key that identifies one CLI session of one backend, in the form
 * `cli:<backendId>:<command basename>:<sessionId>`.
 *
 * The backend id and the command basename are trimmed and lower-cased. The
 * session id is trimmed but keeps its case.
 *
 * @param params.backend CLI backend configuration; only `command` is read.
 * @param params.backendId Identifier of the backend.
 * @param params.cliSessionId Session id of the run, if any.
 * @returns The scope key, or `undefined` when there is no non-blank session id.
 */
export function buildCliSupervisorScopeKey(params: {
  backend: CliBackendConfig;
  backendId: string;
  cliSessionId?: string;
}): string | undefined {
  const sessionToken = params.cliSessionId?.trim();
  // Without a session there is nothing to scope, so skip the remaining work.
  if (!sessionToken) {
    return undefined;
  }
  const commandToken = path
    .basename(params.backend.command ?? "")
    .trim()
    .toLowerCase();
  const backendToken = params.backendId.trim().toLowerCase();
  return `cli:${backendToken}:${commandToken}:${sessionToken}`;
}
|
||||
// --- src/agents/cli-watchdog-defaults.ts (new file, 13 lines; hunk @@ -0,0 +1,13 @@) ---
|
||||
/** Lower bound, in milliseconds, applied to every watchdog timeout value. */
export const CLI_WATCHDOG_MIN_TIMEOUT_MS = 1_000;

/**
 * Default no-output watchdog settings for fresh (non-resume) CLI runs.
 * `noOutputTimeoutRatio` is the fraction of the run's overall timeout used as
 * the no-output timeout; `minMs` / `maxMs` bound the result in milliseconds.
 */
export const CLI_FRESH_WATCHDOG_DEFAULTS = {
  noOutputTimeoutRatio: 0.8,
  minMs: 180_000,
  maxMs: 600_000,
} as const;

/**
 * Default no-output watchdog settings for CLI runs that resume a session.
 * Same fields as the fresh defaults, with a smaller ratio and tighter bounds.
 */
export const CLI_RESUME_WATCHDOG_DEFAULTS = {
  noOutputTimeoutRatio: 0.3,
  minMs: 60_000,
  maxMs: 180_000,
} as const;
|
||||
Reference in New Issue
Block a user