From a01ba8099b1701aeb5a2c3901466df8ccbe0697d Mon Sep 17 00:00:00 2001
From: Neerav Makwana <261249544+neeravmakwana@users.noreply.github.com>
Date: Sun, 3 May 2026 20:58:23 -0400
Subject: [PATCH] fix(agent-reply): surface CLI subprocess timeouts in user-facing replies

Treat shared CLI runner timeout literals as actionable copy when verbose
failures stay off (#77007).

- Use subprocess-neutral wording; optional routing context from fallback
  `provider/model:` prefixes (`codex-cli/...`, `anthropic/...`) without
  implying a Claude-only backend.
- Take the routing context only from the prefix directly in front of the
  reported timeout, so an aggregated fallback error never attributes the
  timeout to a different candidate.

Co-authored-by: Cursor
---
 CHANGELOG.md | 1 +
 .../reply/agent-runner-execution.test.ts | 49 +++++++++++++++++++++++++++++++++++++++++++++++++
 .../reply/agent-runner-execution.ts | 44 ++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 94 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 70ad5355df4..b6cb604a9d3 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -377,6 +377,7 @@ Docs: https://docs.openclaw.ai
 - Web fetch: late-bind `web_fetch` config and provider fallback metadata from the active runtime snapshot, matching `web_search` so long-lived tools do not use stale fetch provider settings. Thanks @vincentkoc.
 - Plugins/discovery: demote the source-only TypeScript runtime check on already-installed `origin: "global"` plugin packages from a config-blocking error to a warning and let the runtime fall through to the TypeScript source via jiti, so a single broken installed package no longer blocks `plugins install` for unrelated plugins; install-time rejection of newly-installed source-only packages is unchanged. Thanks @romneyda.
 - Providers/OpenAI Codex: stop the OAuth progress spinner before showing the manual redirect paste prompt, so callback timeouts do not spam `Browser callback did not finish` across terminals.
+- Agents/messaging: surface CLI subprocess watchdog/turn timeout messages to chat users when verbose failures are off, instead of collapsing them into generic external-run failure copy. Fixes #77007.
 - Channels/WhatsApp: allow `@whiskeysockets/libsignal-node` in `onlyBuiltDependencies` so pnpm v9+ `blockExoticSubdeps` no longer rejects the baileys git-tarball subdep and silences all inbound agent replies. Fixes #76539. Thanks @ottodeng and @vincentkoc.
 - Gateway/systemd: preserve operator-added secrets in the Gateway env file across re-stage while clearing OpenClaw-managed keys (such as `OPENCLAW_GATEWAY_TOKEN`) so a fresh staging value is never shadowed by a stale env-file copy; operator secrets are also retained when the state-dir `.env` is empty. Fixes #76860. Thanks @hclsys.
 - Plugin updates: do not short-circuit trusted official npm updates as unchanged when the default/latest spec still resolves to an already-installed prerelease that the installer should replace with a stable fallback. Thanks @vincentkoc.
diff --git a/src/auto-reply/reply/agent-runner-execution.test.ts b/src/auto-reply/reply/agent-runner-execution.test.ts
index 82b878750f6..7ecb22af4d4 100644
--- a/src/auto-reply/reply/agent-runner-execution.test.ts
+++ b/src/auto-reply/reply/agent-runner-execution.test.ts
@@ -2364,6 +2364,55 @@ describe("runAgentTurnWithFallback", () => {
     }
   });
 
+  it.each([
+    {
+      rejection: new Error("CLI exceeded timeout (300s) and was terminated."),
+      modeLabel: "overall CLI turn budget" as const,
+      routingSubstring: undefined as string | undefined,
+    },
+    {
+      rejection: new Error("CLI produced no output for 120s and was terminated."),
+      modeLabel: "no-output stall" as const,
+      routingSubstring: undefined,
+    },
+    {
+      rejection: new Error(
+        "All models failed (2): anthropic/claude-opus-4-7: CLI exceeded timeout (300s) and was terminated. | anthropic/foo: bar",
+      ),
+      modeLabel: "overall CLI turn budget" as const,
+      routingSubstring: "(routing anthropic/claude-opus-4-7)",
+    },
+    {
+      rejection: new Error("codex-cli/gpt-5.5: CLI exceeded timeout (60s) and was terminated."),
+      modeLabel: "overall CLI turn budget" as const,
+      routingSubstring: "(routing codex-cli/gpt-5.5)",
+    },
+  ])(
+    "surfaces CLI subprocess timeout copy instead of generic failure when verbose is off ($modeLabel)",
+    async ({ rejection, modeLabel, routingSubstring }) => {
+      state.runWithModelFallbackMock.mockRejectedValueOnce(rejection);
+
+      const runAgentTurnWithFallback = await getRunAgentTurnWithFallback();
+      const result = await runAgentTurnWithFallback({
+        ...createMinimalRunAgentTurnParams(),
+      });
+
+      expect(result.kind).toBe("final");
+      if (result.kind !== "final") {
+        throw new Error("expected final reply");
+      }
+      expect(result.payload.text).not.toBe(GENERIC_RUN_FAILURE_TEXT);
+      expect(result.payload.text).toContain("CLI subprocess");
+      expect(result.payload.text).not.toContain("Claude CLI");
+      expect(result.payload.text).toContain(modeLabel);
+      expect(result.payload.text).toContain("gateway may still be healthy");
+      expect(result.payload.text).toContain("cliBackends.");
+      if (routingSubstring) {
+        expect(result.payload.text).toContain(routingSubstring);
+      }
+    },
+  );
+
   it("forwards sanitized generic errors on external chat channels when verbose is on", async () => {
     state.runEmbeddedPiAgentMock.mockRejectedValueOnce(
       new Error("INVALID_ARGUMENT: some other failure"),
diff --git a/src/auto-reply/reply/agent-runner-execution.ts b/src/auto-reply/reply/agent-runner-execution.ts
index 41364a70de4..a962203c19d 100644
--- a/src/auto-reply/reply/agent-runner-execution.ts
+++ b/src/auto-reply/reply/agent-runner-execution.ts
@@ -433,6 +433,46 @@ function resolveExternalRunFailureTextForConversation(params: {
   return SILENT_REPLY_TOKEN;
 }
 
+// Timeout literals from the shared CLI runner; the capture group is the timeout in seconds.
+const CLI_BACKEND_NO_OUTPUT_STALL_RE =
+  /\bCLI produced no output for\s+(\d+)\s*s\s+and was terminated\b/iu;
+const CLI_BACKEND_OVERALL_TIMEOUT_RE =
+  /\bCLI exceeded timeout\s*\(\s*(\d+)\s*s\s*\)\s+and was terminated\b/iu;
+// End-anchored: apply to the message sliced through the timeout literal's leading `CLI` so
+// only the `provider/model:` prefix directly in front of that timeout can match.
+const CLI_BACKEND_ROUTING_REF_BEFORE_ERROR_RE = /\b([\w.-]+\/[A-Za-z][\w.-]*)\s*:\s*CLI$/iu;
+
+/**
+ * Builds actionable chat copy for a CLI subprocess timeout.
+ *
+ * A no-output stall takes precedence over an overall-timeout match. Routing context is taken
+ * only from the `provider/model:` prefix directly in front of the reported timeout, so an
+ * aggregated fallback message never attributes it to a different candidate.
+ *
+ * @param message - Failure message to inspect.
+ * @returns The user-facing text, or `null` when `message` contains neither timeout literal.
+ */
+function buildCliBackendTimeoutFailureText(message: string): string | null {
+  const normalizedMessage = collapseRepeatedFailureDetail(message);
+  const stall = normalizedMessage.match(CLI_BACKEND_NO_OUTPUT_STALL_RE);
+  const overall = normalizedMessage.match(CLI_BACKEND_OVERALL_TIMEOUT_RE);
+  const timeout = stall ?? overall;
+  const seconds = timeout?.[1];
+  if (!timeout || !seconds) {
+    return null;
+  }
+  // Both timeout literals start with `CLI`, so the routing prefix for this specific failure,
+  // if any, sits at the very end of the slice.
+  const throughCliToken = normalizedMessage.slice(0, (timeout.index ?? 0) + "CLI".length);
+  const routedModelRef = throughCliToken.match(CLI_BACKEND_ROUTING_REF_BEFORE_ERROR_RE)?.[1];
+  const routingSuffix = routedModelRef ? ` (routing ${routedModelRef})` : "";
+  const modeLabel = stall ? "no-output stall" : "overall CLI turn budget";
+  return (
+    `⚠️ CLI subprocess${routingSuffix}: timed out after ${seconds}s (${modeLabel}). The gateway may still be healthy. Try \`/new\`, a lighter model, or raise ` +
+    "`agents.defaults.timeoutSeconds` and the watchdog `noOutputTimeoutMs` entries under `cliBackends.`."
+  );
+}
+
 function buildMissingApiKeyFailureText(message: string): string | null {
   const normalizedMessage = collapseRepeatedFailureDetail(message);
   const providerMatch = normalizedMessage.match(/No API key found for provider "([^"]+)"/u);
@@ -494,6 +534,10 @@ function buildExternalRunFailureReply(
       isGenericRunnerFailure: false,
     };
   }
+  const cliBackendTimeoutFailure = buildCliBackendTimeoutFailureText(normalizedMessage);
+  if (cliBackendTimeoutFailure) {
+    return { text: cliBackendTimeoutFailure, isGenericRunnerFailure: false };
+  }
   return {
     text: options?.includeDetails
       ? formatForwardedExternalRunFailureText(normalizedMessage)