fix: codex and similar processes keep dying on PTY; solved by refactoring process spawning (#14257)

* exec: clean up PTY resources on timeout and exit

* cli: harden resume cleanup and watchdog stalled runs

* cli: productionize PTY and resume reliability paths

* docs: add PTY process supervision architecture plan

* docs: rewrite PTY supervision plan as pre-rewrite baseline

* docs: switch PTY supervision plan to one-go execution

* docs: add one-line root cause to PTY supervision plan

* docs: add OS contracts and test matrix to PTY supervision plan

* docs: define process-supervisor package placement and scope

* docs: tie supervisor plan to existing CI lanes

* docs: place PTY supervisor plan under src/process

* refactor(process): route exec and cli runs through supervisor

* docs(process): refresh PTY supervision plan

* wip

* fix(process): harden supervisor timeout and PTY termination

* fix(process): harden supervisor adapters env and wait handling

* ci: avoid failing formal conformance on comment permissions

* test(ui): fix cron request mock argument typing

* fix(ui): remove leftover conflict marker

* fix: supervise PTY processes (#14257) (openclaw#14257) (thanks @onutc)
This commit is contained in:
Onur
2026-02-16 09:32:05 +08:00
committed by GitHub
parent a73e7786e7
commit cd44a0d01e
32 changed files with 2759 additions and 855 deletions

View File

@@ -8,232 +8,27 @@ import type { ThinkLevel } from "../../auto-reply/thinking.js";
import type { OpenClawConfig } from "../../config/config.js";
import type { CliBackendConfig } from "../../config/types.js";
import type { EmbeddedContextFile } from "../pi-embedded-helpers.js";
import { runExec } from "../../process/exec.js";
import { buildTtsSystemPromptHint } from "../../tts/tts.js";
import { escapeRegExp, isRecord } from "../../utils.js";
import { isRecord } from "../../utils.js";
import { buildModelAliasLines } from "../model-alias-lines.js";
import { resolveDefaultModelForAgent } from "../model-selection.js";
import { detectRuntimeShell } from "../shell-utils.js";
import { buildSystemPromptParams } from "../system-prompt-params.js";
import { buildAgentSystemPrompt } from "../system-prompt.js";
export { buildCliSupervisorScopeKey, resolveCliNoOutputTimeoutMs } from "./reliability.js";
// Latest tracked promise per queue key. enqueueCliRun chains new work onto the stored
// promise and deletes the entry once that promise settles while still being the latest.
const CLI_RUN_QUEUE = new Map<string, Promise<unknown>>();
/**
 * Builds a regex for scanning `ps` command lines for a command and its arguments.
 *
 * Tokens are stringified and trimmed; blank ones are dropped. The first remaining token
 * matches either bare or as the last segment of a path. Every token must be delimited by
 * whitespace (or the start/end of the line), so substrings such as `codexx` or
 * `/path/to/codexx` do not match. Tokens must occur in the given order; arbitrary text may
 * sit between them.
 *
 * @param tokens Command token followed by argument tokens.
 * @returns The compiled regex. When no usable token remains, returns `/$^/`, which can
 *   only match an empty string.
 */
function buildLooseArgOrderRegex(tokens: string[]): RegExp {
  const cleaned: string[] = [];
  for (const raw of tokens) {
    const token = String(raw ?? "").trim();
    if (token) {
      cleaned.push(token);
    }
  }
  if (cleaned.length === 0) {
    return /$^/;
  }
  const fragments = cleaned.map((token, index) => {
    const escaped = escapeRegExp(token);
    // Only the command itself may be prefixed by a directory path.
    const body = index === 0 ? `(?:${escaped}|\\S+\\/${escaped})` : escaped;
    return `(?:^|\\s)${body}(?=\\s|$)`;
  });
  return new RegExp(fragments.join(".*"));
}
/**
 * Runs `ps` with the first argument list and, if that fails, once more with the second.
 *
 * @param argsA Preferred `ps` arguments.
 * @param argsB Arguments used when the first invocation throws.
 * @returns The stdout of whichever invocation succeeded.
 * @throws Whatever the second invocation throws when both fail.
 */
async function psWithFallback(argsA: string[], argsB: string[]): Promise<string> {
  const listProcesses = async (args: string[]): Promise<string> => {
    const result = await runExec("ps", args);
    return result.stdout;
  };
  try {
    return await listProcesses(argsA);
  } catch {
    // The first failure is discarded on purpose; only the retry's error reaches the caller.
    return listProcesses(argsB);
  }
}
/**
 * Best-effort termination of leftover resume processes belonging to one CLI session.
 *
 * Lists processes with `ps`, keeps the direct children of the current process whose command
 * line matches the backend command followed by the resume arguments (with `{sessionId}`
 * substituted), sends them SIGTERM and, 250 ms later, SIGKILL.
 *
 * Does nothing on Windows, when the backend has no resume argument containing
 * `{sessionId}`, when the backend command is empty, or when `sessionId` is blank.
 * Errors from `ps` and `kill` are swallowed.
 *
 * @param backend CLI backend configuration supplying `command` and `resumeArgs`.
 * @param sessionId Session identifier substituted for `{sessionId}` in the resume arguments.
 */
export async function cleanupResumeProcesses(
  backend: CliBackendConfig,
  sessionId: string,
): Promise<void> {
  if (process.platform === "win32") {
    return;
  }
  const resumeArgs = backend.resumeArgs ?? [];
  // Also covers the empty-array case: `some` on an empty array is false.
  if (!resumeArgs.some((arg) => arg.includes("{sessionId}"))) {
    return;
  }
  const commandToken = path.basename(backend.command ?? "").trim();
  if (!commandToken) {
    return;
  }
  // A blank session ID would turn a bare `{sessionId}` token into an empty token, which the
  // matcher drops; the remaining pattern would then match resume processes of any session.
  if (sessionId.trim() === "") {
    return;
  }
  const resumeTokens = resumeArgs.map((arg) => arg.replaceAll("{sessionId}", sessionId));
  try {
    const stdout = await psWithFallback(
      ["-axww", "-o", "pid=,ppid=,command="],
      ["-ax", "-o", "pid=,ppid=,command="],
    );
    const patternRegex = buildLooseArgOrderRegex([commandToken, ...resumeTokens]);
    const toKill: number[] = [];
    for (const line of stdout.split("\n")) {
      const trimmed = line.trim();
      if (!trimmed) {
        continue;
      }
      // Expected row shape: "<pid> <ppid> <command line>".
      const match = /^(\d+)\s+(\d+)\s+(.*)$/.exec(trimmed);
      if (!match) {
        continue;
      }
      const pid = Number(match[1]);
      const ppid = Number(match[2]);
      const cmd = match[3] ?? "";
      if (!Number.isFinite(pid)) {
        continue;
      }
      // Only touch processes spawned directly by this process.
      if (ppid !== process.pid) {
        continue;
      }
      if (!patternRegex.test(cmd)) {
        continue;
      }
      toKill.push(pid);
    }
    if (toKill.length === 0) {
      return;
    }
    const pidArgs = toKill.map((pid) => String(pid));
    try {
      await runExec("kill", ["-TERM", ...pidArgs]);
    } catch {
      // ignore
    }
    // Short grace period before escalating to SIGKILL.
    await new Promise((resolve) => setTimeout(resolve, 250));
    try {
      await runExec("kill", ["-9", ...pidArgs]);
    } catch {
      // ignore
    }
  } catch {
    // ignore errors - best effort cleanup
  }
}
/**
 * Builds regexes that recognise command lines of session-bound invocations of a backend.
 *
 * Each regex is the backend command's basename followed by an argument list, with tokens
 * separated by whitespace and `{sessionId}` placeholders matching any non-space run.
 * A matcher is produced for `sessionArgs` when it contains `{sessionId}`, otherwise for
 * `sessionArg` followed by a session ID when `sessionArg` is set; a further matcher is
 * produced for `resumeArgs` when it contains `{sessionId}`.
 *
 * @param backend CLI backend configuration.
 * @returns The matchers in the order described above; empty when the command is empty.
 */
function buildSessionMatchers(backend: CliBackendConfig): RegExp[] {
  const commandToken = path.basename(backend.command ?? "").trim();
  if (!commandToken) {
    return [];
  }

  const hasPlaceholder = (args: string[]): boolean =>
    args.some((arg) => arg.includes("{sessionId}"));
  const toMatcher = (args: string[]): RegExp => {
    const tail = args.map((arg) => `\\s+${tokenToRegex(arg)}`).join("");
    return new RegExp(`(?:^|\\s)${tokenToRegex(commandToken)}${tail}`);
  };

  const argLists: string[][] = [];
  const sessionArgs = backend.sessionArgs ?? [];
  const sessionArg = backend.sessionArg?.trim();
  if (hasPlaceholder(sessionArgs)) {
    argLists.push(sessionArgs);
  } else if (sessionArg) {
    argLists.push([sessionArg, "{sessionId}"]);
  }
  const resumeArgs = backend.resumeArgs ?? [];
  if (hasPlaceholder(resumeArgs)) {
    argLists.push(resumeArgs);
  }
  return argLists.map(toMatcher);
}
/**
 * Converts one argument token into regex source text.
 *
 * Literal text is regex-escaped; each `{sessionId}` placeholder becomes `\S+`.
 *
 * @param token Argument token, possibly containing `{sessionId}`.
 * @returns Regex source for the token.
 */
function tokenToRegex(token: string): string {
  // Without a placeholder, split() yields the whole token as a single literal piece.
  return token
    .split("{sessionId}")
    .map((literal) => escapeRegExp(literal))
    .join("\\S+");
}
/**
 * Best-effort cleanup of stopped CLI child processes that have piled up.
 *
 * Lists processes with `ps` and collects direct children of this process whose state
 * contains `T` and whose command line matches one of the backend's session matchers.
 * They are killed with SIGKILL only when their count exceeds `threshold`.
 * No-op on Windows or when no matcher can be built; all errors are swallowed.
 *
 * @param backend CLI backend configuration used to build the matchers.
 * @param threshold Number of stopped processes that must be exceeded (default: 10).
 */
export async function cleanupSuspendedCliProcesses(
  backend: CliBackendConfig,
  threshold = 10,
): Promise<void> {
  if (process.platform === "win32") {
    return;
  }
  const matchers = buildSessionMatchers(backend);
  if (matchers.length === 0) {
    return;
  }
  try {
    const listing = await psWithFallback(
      ["-axww", "-o", "pid=,ppid=,stat=,command="],
      ["-ax", "-o", "pid=,ppid=,stat=,command="],
    );
    // Row shape: "<pid> <ppid> <stat> <command line>"; blank rows simply fail to match.
    const rowPattern = /^(\d+)\s+(\d+)\s+(\S+)\s+(.*)$/;
    const stoppedPids: string[] = [];
    for (const row of listing.split("\n")) {
      const fields = rowPattern.exec(row.trim());
      if (!fields) {
        continue;
      }
      const [, pidText, ppidText, stat = "", command = ""] = fields;
      const pid = Number(pidText);
      const isOwnChild = Number(ppidText) === process.pid;
      if (
        Number.isFinite(pid) &&
        isOwnChild &&
        stat.includes("T") &&
        matchers.some((matcher) => matcher.test(command))
      ) {
        stoppedPids.push(String(pid));
      }
    }
    if (stoppedPids.length > threshold) {
      // Verified locally: stopped (T) processes ignore SIGTERM, so use SIGKILL.
      await runExec("kill", ["-9", ...stoppedPids]);
    }
  } catch {
    // ignore errors - best effort cleanup
  }
}
/**
 * Runs `task` after every previously enqueued task for the same key has settled.
 *
 * A failure of an earlier task does not prevent or fail later ones. The queue entry for
 * `key` is removed once the newest task settles.
 *
 * @param key Queue key; tasks sharing a key run one after another.
 * @param task Factory for the async work; invoked only after the prior task settled.
 * @returns A promise that settles with the outcome of `task`, including its rejection.
 */
export function enqueueCliRun<T>(key: string, task: () => Promise<T>): Promise<T> {
  const prior = CLI_RUN_QUEUE.get(key) ?? Promise.resolve();
  // Swallow the predecessor's failure so it cannot leak into this run.
  const chained = prior.catch(() => undefined).then(task);
  // Keep queue continuity even when a run rejects, without emitting unhandled rejections.
  // Only this bookkeeping promise swallows the error; callers still observe it via `chained`.
  const tracked: Promise<unknown> = chained
    .catch(() => undefined)
    .finally(() => {
      // Drop the entry only if no newer run has replaced it in the meantime.
      if (CLI_RUN_QUEUE.get(key) === tracked) {
        CLI_RUN_QUEUE.delete(key);
      }
    });
  CLI_RUN_QUEUE.set(key, tracked);
  return chained;
}

View File

@@ -0,0 +1,88 @@
import path from "node:path";
import type { CliBackendConfig } from "../../config/types.js";
import {
CLI_FRESH_WATCHDOG_DEFAULTS,
CLI_RESUME_WATCHDOG_DEFAULTS,
CLI_WATCHDOG_MIN_TIMEOUT_MS,
} from "../cli-watchdog-defaults.js";
/**
 * Resolves the no-output watchdog settings for a fresh or resumed CLI run.
 *
 * Values configured under `backend.reliability.watchdog.{fresh,resume}` override the
 * matching defaults when they are finite numbers. The ratio is clamped to [0.05, 0.95];
 * millisecond values are floored and raised to at least `CLI_WATCHDOG_MIN_TIMEOUT_MS`.
 * If the resolved minimum exceeds the maximum, the two are swapped.
 *
 * @param backend CLI backend configuration.
 * @param useResume Selects the resume profile when true, the fresh profile otherwise.
 * @returns The resolved profile; `noOutputTimeoutMs` is undefined unless configured.
 */
function pickWatchdogProfile(
  backend: CliBackendConfig,
  useResume: boolean,
): {
  noOutputTimeoutMs?: number;
  noOutputTimeoutRatio: number;
  minMs: number;
  maxMs: number;
} {
  const isFiniteNumber = (value: unknown): value is number =>
    typeof value === "number" && Number.isFinite(value);
  const floorToAllowedMs = (value: number): number =>
    Math.max(CLI_WATCHDOG_MIN_TIMEOUT_MS, Math.floor(value));

  const watchdog = backend.reliability?.watchdog;
  const overrides = useResume ? watchdog?.resume : watchdog?.fresh;
  const fallback = useResume ? CLI_RESUME_WATCHDOG_DEFAULTS : CLI_FRESH_WATCHDOG_DEFAULTS;

  const ratioOverride = overrides?.noOutputTimeoutRatio;
  const minOverride = overrides?.minMs;
  const maxOverride = overrides?.maxMs;
  const fixedOverride = overrides?.noOutputTimeoutMs;

  const ratio = isFiniteNumber(ratioOverride)
    ? Math.max(0.05, Math.min(0.95, ratioOverride))
    : fallback.noOutputTimeoutRatio;
  const lowMs = isFiniteNumber(minOverride) ? floorToAllowedMs(minOverride) : fallback.minMs;
  const highMs = isFiniteNumber(maxOverride) ? floorToAllowedMs(maxOverride) : fallback.maxMs;

  return {
    noOutputTimeoutMs: isFiniteNumber(fixedOverride) ? floorToAllowedMs(fixedOverride) : undefined,
    noOutputTimeoutRatio: ratio,
    // Tolerate an inverted min/max pair by ordering the two bounds.
    minMs: Math.min(lowMs, highMs),
    maxMs: Math.max(lowMs, highMs),
  };
}
/**
 * Computes how long a CLI run may stay silent before the no-output watchdog fires.
 *
 * An explicitly configured `noOutputTimeoutMs` is used as is; otherwise the value is
 * `timeoutMs` scaled by the profile ratio and bounded to the profile's [minMs, maxMs].
 * Either way the result is capped at `timeoutMs - 1000`, with the cap itself never lower
 * than `CLI_WATCHDOG_MIN_TIMEOUT_MS`.
 *
 * @param params.backend CLI backend configuration.
 * @param params.timeoutMs Overall timeout of the run in milliseconds.
 * @param params.useResume Whether the run resumes an existing session.
 * @returns The watchdog timeout in milliseconds.
 */
export function resolveCliNoOutputTimeoutMs(params: {
  backend: CliBackendConfig;
  timeoutMs: number;
  useResume: boolean;
}): number {
  const { backend, timeoutMs, useResume } = params;
  const { noOutputTimeoutMs, noOutputTimeoutRatio, minMs, maxMs } = pickWatchdogProfile(
    backend,
    useResume,
  );
  // Keep watchdog below global timeout in normal cases.
  const ceiling = Math.max(CLI_WATCHDOG_MIN_TIMEOUT_MS, timeoutMs - 1_000);
  let candidate: number;
  if (noOutputTimeoutMs === undefined) {
    const scaled = Math.floor(timeoutMs * noOutputTimeoutRatio);
    candidate = Math.min(maxMs, Math.max(minMs, scaled));
  } else {
    candidate = noOutputTimeoutMs;
  }
  return Math.min(candidate, ceiling);
}
/**
 * Builds the supervisor scope key for a CLI session.
 *
 * The key has the form `cli:<backendId>:<command basename>:<session id>`. The backend ID
 * and command basename are trimmed and lower-cased; the session ID is trimmed but keeps
 * its case.
 *
 * @param params.backend CLI backend configuration supplying `command`.
 * @param params.backendId Identifier of the backend.
 * @param params.cliSessionId CLI session identifier, if any.
 * @returns The scope key, or undefined when the session ID is missing or blank.
 */
export function buildCliSupervisorScopeKey(params: {
  backend: CliBackendConfig;
  backendId: string;
  cliSessionId?: string;
}): string | undefined {
  const { backend, backendId, cliSessionId } = params;
  const commandToken = path.basename(backend.command ?? "").trim().toLowerCase();
  const backendToken = backendId.trim().toLowerCase();
  const sessionToken = cliSessionId?.trim();
  return sessionToken
    ? ["cli", backendToken, commandToken, sessionToken].join(":")
    : undefined;
}