fix: codex and similar processes keep dying on PTY; solved by refactoring process spawning (#14257)

* exec: clean up PTY resources on timeout and exit

* cli: harden resume cleanup and watchdog stalled runs

* cli: productionize PTY and resume reliability paths

* docs: add PTY process supervision architecture plan

* docs: rewrite PTY supervision plan as pre-rewrite baseline

* docs: switch PTY supervision plan to one-go execution

* docs: add one-line root cause to PTY supervision plan

* docs: add OS contracts and test matrix to PTY supervision plan

* docs: define process-supervisor package placement and scope

* docs: tie supervisor plan to existing CI lanes

* docs: place PTY supervisor plan under src/process

* refactor(process): route exec and cli runs through supervisor

* docs(process): refresh PTY supervision plan

* wip

* fix(process): harden supervisor timeout and PTY termination

* fix(process): harden supervisor adapters env and wait handling

* ci: avoid failing formal conformance on comment permissions

* test(ui): fix cron request mock argument typing

* fix(ui): remove leftover conflict marker

* fix: supervise PTY processes (#14257) (openclaw#14257) (thanks @onutc)
This commit is contained in:
Onur
2026-02-16 09:32:05 +08:00
committed by GitHub
parent a73e7786e7
commit cd44a0d01e
32 changed files with 2759 additions and 855 deletions

View File

@@ -6,20 +6,20 @@ import { resolveHeartbeatPrompt } from "../auto-reply/heartbeat.js";
import { shouldLogVerbose } from "../globals.js";
import { isTruthyEnvValue } from "../infra/env.js";
import { createSubsystemLogger } from "../logging/subsystem.js";
import { runCommandWithTimeout } from "../process/exec.js";
import { getProcessSupervisor } from "../process/supervisor/index.js";
import { resolveSessionAgentIds } from "./agent-scope.js";
import { makeBootstrapWarn, resolveBootstrapContextForRun } from "./bootstrap-files.js";
import { resolveCliBackendConfig } from "./cli-backends.js";
import {
appendImagePathsToPrompt,
buildCliSupervisorScopeKey,
buildCliArgs,
buildSystemPrompt,
cleanupResumeProcesses,
cleanupSuspendedCliProcesses,
enqueueCliRun,
normalizeCliModel,
parseCliJson,
parseCliJsonl,
resolveCliNoOutputTimeoutMs,
resolvePromptInput,
resolveSessionIdToSend,
resolveSystemPromptUsage,
@@ -226,19 +226,32 @@ export async function runCliAgent(params: {
}
return next;
})();
// Cleanup suspended processes that have accumulated (regardless of sessionId)
await cleanupSuspendedCliProcesses(backend);
if (useResume && cliSessionIdToSend) {
await cleanupResumeProcesses(backend, cliSessionIdToSend);
}
const result = await runCommandWithTimeout([backend.command, ...args], {
const noOutputTimeoutMs = resolveCliNoOutputTimeoutMs({
backend,
timeoutMs: params.timeoutMs,
useResume,
});
const supervisor = getProcessSupervisor();
const scopeKey = buildCliSupervisorScopeKey({
backend,
backendId: backendResolved.id,
cliSessionId: useResume ? cliSessionIdToSend : undefined,
});
const managedRun = await supervisor.spawn({
sessionId: params.sessionId,
backendId: backendResolved.id,
scopeKey,
replaceExistingScope: Boolean(useResume && scopeKey),
mode: "child",
argv: [backend.command, ...args],
timeoutMs: params.timeoutMs,
noOutputTimeoutMs,
cwd: workspaceDir,
env,
input: stdinPayload,
});
const result = await managedRun.wait();
const stdout = result.stdout.trim();
const stderr = result.stderr.trim();
@@ -259,7 +272,28 @@ export async function runCliAgent(params: {
}
}
if (result.code !== 0) {
if (result.exitCode !== 0 || result.reason !== "exit") {
if (result.reason === "no-output-timeout" || result.noOutputTimedOut) {
const timeoutReason = `CLI produced no output for ${Math.round(noOutputTimeoutMs / 1000)}s and was terminated.`;
log.warn(
`cli watchdog timeout: provider=${params.provider} model=${modelId} session=${cliSessionIdToSend ?? params.sessionId} noOutputTimeoutMs=${noOutputTimeoutMs} pid=${managedRun.pid ?? "unknown"}`,
);
throw new FailoverError(timeoutReason, {
reason: "timeout",
provider: params.provider,
model: modelId,
status: resolveFailoverStatus("timeout"),
});
}
if (result.reason === "overall-timeout") {
const timeoutReason = `CLI exceeded timeout (${Math.round(params.timeoutMs / 1000)}s) and was terminated.`;
throw new FailoverError(timeoutReason, {
reason: "timeout",
provider: params.provider,
model: modelId,
status: resolveFailoverStatus("timeout"),
});
}
const err = stderr || stdout || "CLI failed.";
const reason = classifyFailoverReason(err) ?? "unknown";
const status = resolveFailoverStatus(reason);