From 7236d6487e722291088523f3f7d7a17ccccb84f9 Mon Sep 17 00:00:00 2001 From: hekunwang <38637531+hekunwang@users.noreply.github.com> Date: Sat, 9 May 2026 18:18:04 +0800 Subject: [PATCH] fix(agents): classify stream_read_error as transient (#79692) * fix(agents): classify stream_read_error as transient * fix: classify stream read errors as transient (#79692) --------- Co-authored-by: Peter Steinberger --- CHANGELOG.md | 2 ++ .../model-fallback.run-embedded.e2e.test.ts | 4 ++++ src/agents/model-fallback.test.ts | 8 ++++++++ ...mbedded-helpers.isbillingerrormessage.test.ts | 2 ++ .../pi-embedded-helpers/failover-matches.ts | 1 + .../pi-embedded-runner/run/assistant-failover.ts | 16 +++++++++------- 6 files changed, 26 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index be011e612b8..0ca0374bcc7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -153,6 +153,8 @@ Docs: https://docs.openclaw.ai - Memory/QMD: warn with a manual stale collection removal hint when QMD reports a path/pattern conflict but `collection list` lacks verifiable metadata, avoiding unsafe stderr-only rebinds. Refs #71783. (#72297) Thanks @MonkeyLeeT. - Models/auth: make `openclaw models status --check` and dashboard auth health honor effective auth profile order while keeping stale profiles visible. (#79685) Thanks @nimbleenigma. +- Agents/failover: classify bare `stream_read_error` streaming failures as transient timeouts so configured model fallback runs instead of surfacing the raw transport error. Fixes #79689. (#79692) Thanks @hekunwang. +- Agents/failover: persist overloaded auth-profile cooldown marks before exhausted fallback summaries surface, so immediate fallback retries honor the recorded cooldown state. - Docs/Subagents: correct the listed sub-agent bootstrap context files to include `SOUL.md`, `IDENTITY.md`, and `USER.md`. (#79470) Thanks @lastguru-net. - Backup: keep live backup archives from copying current agent session transcripts, cron run logs, and delivery queues while preserving workspace lock/temp files and keeping `--json` output parseable when volatile files are skipped. Fixes #72249. (#72251) Thanks @abnershang. - OpenAI/Codex: install the Codex runtime plugin from npm during OpenAI onboarding and load it automatically for implicit OpenAI model routes, while preserving manual PI runtime overrides. Fixes #79358. diff --git a/src/agents/model-fallback.run-embedded.e2e.test.ts b/src/agents/model-fallback.run-embedded.e2e.test.ts index 66da9ed1764..83434f9fb95 100644 --- a/src/agents/model-fallback.run-embedded.e2e.test.ts +++ b/src/agents/model-fallback.run-embedded.e2e.test.ts @@ -483,6 +483,10 @@ describe("runWithModelFallback + runEmbeddedPiAgent failover behavior", () => { name: "undici-terminated", message: "terminated", }, + { + name: "stream-read-error", + message: "stream_read_error", + }, { name: "codex-empty-transport-response", message: "Request failed", diff --git a/src/agents/model-fallback.test.ts b/src/agents/model-fallback.test.ts index 9d93d486951..fe881f924d7 100644 --- a/src/agents/model-fallback.test.ts +++ b/src/agents/model-fallback.test.ts @@ -1113,6 +1113,14 @@ describe("runWithModelFallback", () => { error: new Error("Model not found: openai/gpt-6"), expectedFallback: ["anthropic", "claude-haiku-3-5"], }, + { + name: "bare stream read transport error", + provider: "openai", + model: "gpt-4.1-mini", + error: new Error("stream_read_error"), + expectedFallback: ["anthropic", "claude-haiku-3-5"], + expectedReason: "timeout", + }, ]; for (const testCase of cases) { diff --git a/src/agents/pi-embedded-helpers.isbillingerrormessage.test.ts b/src/agents/pi-embedded-helpers.isbillingerrormessage.test.ts index ab8b7473fe9..03100d852a8 100644 --- a/src/agents/pi-embedded-helpers.isbillingerrormessage.test.ts +++ b/src/agents/pi-embedded-helpers.isbillingerrormessage.test.ts @@ -923,6 +923,8 @@ describe("isFailoverErrorMessage", () => { "terminated", "Terminated", " terminated ", + "stream_read_error", + " stream_read_error ", "UND_ERR_SOCKET", "Error: UND_ERR_SOCKET other side closed", "UND_ERR_CONNECT_TIMEOUT", diff --git a/src/agents/pi-embedded-helpers/failover-matches.ts b/src/agents/pi-embedded-helpers/failover-matches.ts index e36b9f74684..dd9e239a041 100644 --- a/src/agents/pi-embedded-helpers/failover-matches.ts +++ b/src/agents/pi-embedded-helpers/failover-matches.ts @@ -168,6 +168,7 @@ const ERROR_PATTERNS = { // aborted). These arrive as bare strings on the outer error and, without // an explicit match, the fallback chain is never attempted (#69368). /^terminated$/i, + /^stream_read_error$/i, /\bund_err_(?:socket|connect|headers?|body|req_content_length_mismatch|aborted|closed)\b/i, // pi-ai's openai-codex provider surfaces `Request failed` when the HTTP // response has no body and no status text (typical of Cloudflare 502s diff --git a/src/agents/pi-embedded-runner/run/assistant-failover.ts b/src/agents/pi-embedded-runner/run/assistant-failover.ts index 752e24ec1c3..d9b40cf9e66 100644 --- a/src/agents/pi-embedded-runner/run/assistant-failover.ts +++ b/src/agents/pi-embedded-runner/run/assistant-failover.ts @@ -99,17 +99,19 @@ export async function handleAssistantFailover(params: { if (decision.action === "rotate_profile") { const failedProfileId = params.lastProfileId; const failureReason = params.timedOut ? "timeout" : params.assistantProfileFailureReason; - const markFailedProfile = () => { + const markFailedProfile = async () => { if (!failedProfileId || !failureReason || failureReason === "timeout") { return; } - params - .maybeMarkAuthProfileFailure({ + try { + await params.maybeMarkAuthProfileFailure({ profileId: failedProfileId, reason: failureReason, modelId: params.modelId, - }) - .catch((err) => params.warn(`deferred profile failure mark failed: ${String(err)}`)); + }); + } catch (err) { + params.warn(`profile failure mark failed: ${String(err)}`); + } }; if (params.failoverReason === "overloaded") { @@ -122,7 +124,7 @@ export async function handleAssistantFailover(params: { params.warn( `overload profile rotation cap reached for ${sanitizeForLog(params.provider)}/${sanitizeForLog(params.modelId)} after ${overloadProfileRotations} rotations; escalating to model fallback`, ); - markFailedProfile(); + await markFailedProfile(); params.logAssistantFailoverDecision("fallback_model", { status }); return { action: "throw", @@ -151,7 +153,7 @@ export async function handleAssistantFailover(params: { } const rotated = await params.advanceAuthProfile(); - markFailedProfile(); + await markFailedProfile(); if (params.timedOut && !params.isProbeSession && failedProfileId) { params.warn(`Profile ${failedProfileId} timed out. Trying next account...`); }