mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-04 14:50:24 +00:00
fix: codex and similar processes keep dying on pty, solved by refactoring process spawning (#14257)
* exec: clean up PTY resources on timeout and exit * cli: harden resume cleanup and watchdog stalled runs * cli: productionize PTY and resume reliability paths * docs: add PTY process supervision architecture plan * docs: rewrite PTY supervision plan as pre-rewrite baseline * docs: switch PTY supervision plan to one-go execution * docs: add one-line root cause to PTY supervision plan * docs: add OS contracts and test matrix to PTY supervision plan * docs: define process-supervisor package placement and scope * docs: tie supervisor plan to existing CI lanes * docs: place PTY supervisor plan under src/process * refactor(process): route exec and cli runs through supervisor * docs(process): refresh PTY supervision plan * wip * fix(process): harden supervisor timeout and PTY termination * fix(process): harden supervisor adapters env and wait handling * ci: avoid failing formal conformance on comment permissions * test(ui): fix cron request mock argument typing * fix(ui): remove leftover conflict marker * fix: supervise PTY processes (#14257) (openclaw#14257) (thanks @onutc)
This commit is contained in:
@@ -6,20 +6,20 @@ import { resolveHeartbeatPrompt } from "../auto-reply/heartbeat.js";
|
||||
import { shouldLogVerbose } from "../globals.js";
|
||||
import { isTruthyEnvValue } from "../infra/env.js";
|
||||
import { createSubsystemLogger } from "../logging/subsystem.js";
|
||||
import { runCommandWithTimeout } from "../process/exec.js";
|
||||
import { getProcessSupervisor } from "../process/supervisor/index.js";
|
||||
import { resolveSessionAgentIds } from "./agent-scope.js";
|
||||
import { makeBootstrapWarn, resolveBootstrapContextForRun } from "./bootstrap-files.js";
|
||||
import { resolveCliBackendConfig } from "./cli-backends.js";
|
||||
import {
|
||||
appendImagePathsToPrompt,
|
||||
buildCliSupervisorScopeKey,
|
||||
buildCliArgs,
|
||||
buildSystemPrompt,
|
||||
cleanupResumeProcesses,
|
||||
cleanupSuspendedCliProcesses,
|
||||
enqueueCliRun,
|
||||
normalizeCliModel,
|
||||
parseCliJson,
|
||||
parseCliJsonl,
|
||||
resolveCliNoOutputTimeoutMs,
|
||||
resolvePromptInput,
|
||||
resolveSessionIdToSend,
|
||||
resolveSystemPromptUsage,
|
||||
@@ -226,19 +226,32 @@ export async function runCliAgent(params: {
|
||||
}
|
||||
return next;
|
||||
})();
|
||||
|
||||
// Cleanup suspended processes that have accumulated (regardless of sessionId)
|
||||
await cleanupSuspendedCliProcesses(backend);
|
||||
if (useResume && cliSessionIdToSend) {
|
||||
await cleanupResumeProcesses(backend, cliSessionIdToSend);
|
||||
}
|
||||
|
||||
const result = await runCommandWithTimeout([backend.command, ...args], {
|
||||
const noOutputTimeoutMs = resolveCliNoOutputTimeoutMs({
|
||||
backend,
|
||||
timeoutMs: params.timeoutMs,
|
||||
useResume,
|
||||
});
|
||||
const supervisor = getProcessSupervisor();
|
||||
const scopeKey = buildCliSupervisorScopeKey({
|
||||
backend,
|
||||
backendId: backendResolved.id,
|
||||
cliSessionId: useResume ? cliSessionIdToSend : undefined,
|
||||
});
|
||||
|
||||
const managedRun = await supervisor.spawn({
|
||||
sessionId: params.sessionId,
|
||||
backendId: backendResolved.id,
|
||||
scopeKey,
|
||||
replaceExistingScope: Boolean(useResume && scopeKey),
|
||||
mode: "child",
|
||||
argv: [backend.command, ...args],
|
||||
timeoutMs: params.timeoutMs,
|
||||
noOutputTimeoutMs,
|
||||
cwd: workspaceDir,
|
||||
env,
|
||||
input: stdinPayload,
|
||||
});
|
||||
const result = await managedRun.wait();
|
||||
|
||||
const stdout = result.stdout.trim();
|
||||
const stderr = result.stderr.trim();
|
||||
@@ -259,7 +272,28 @@ export async function runCliAgent(params: {
|
||||
}
|
||||
}
|
||||
|
||||
if (result.code !== 0) {
|
||||
if (result.exitCode !== 0 || result.reason !== "exit") {
|
||||
if (result.reason === "no-output-timeout" || result.noOutputTimedOut) {
|
||||
const timeoutReason = `CLI produced no output for ${Math.round(noOutputTimeoutMs / 1000)}s and was terminated.`;
|
||||
log.warn(
|
||||
`cli watchdog timeout: provider=${params.provider} model=${modelId} session=${cliSessionIdToSend ?? params.sessionId} noOutputTimeoutMs=${noOutputTimeoutMs} pid=${managedRun.pid ?? "unknown"}`,
|
||||
);
|
||||
throw new FailoverError(timeoutReason, {
|
||||
reason: "timeout",
|
||||
provider: params.provider,
|
||||
model: modelId,
|
||||
status: resolveFailoverStatus("timeout"),
|
||||
});
|
||||
}
|
||||
if (result.reason === "overall-timeout") {
|
||||
const timeoutReason = `CLI exceeded timeout (${Math.round(params.timeoutMs / 1000)}s) and was terminated.`;
|
||||
throw new FailoverError(timeoutReason, {
|
||||
reason: "timeout",
|
||||
provider: params.provider,
|
||||
model: modelId,
|
||||
status: resolveFailoverStatus("timeout"),
|
||||
});
|
||||
}
|
||||
const err = stderr || stdout || "CLI failed.";
|
||||
const reason = classifyFailoverReason(err) ?? "unknown";
|
||||
const status = resolveFailoverStatus(reason);
|
||||
|
||||
Reference in New Issue
Block a user