From 7236d6487e722291088523f3f7d7a17ccccb84f9 Mon Sep 17 00:00:00 2001
From: hekunwang <38637531+hekunwang@users.noreply.github.com>
Date: Sat, 9 May 2026 18:18:04 +0800
Subject: [PATCH] fix(agents): classify stream_read_error as transient (#79692)

* fix(agents): classify stream_read_error as transient

* fix: classify stream read errors as transient (#79692)

---------

Co-authored-by: Peter Steinberger <steipete@gmail.com>
---
 CHANGELOG.md                                     |  2 ++
 .../model-fallback.run-embedded.e2e.test.ts      |  4 ++++
 src/agents/model-fallback.test.ts                |  8 ++++++++
 ...mbedded-helpers.isbillingerrormessage.test.ts |  2 ++
 .../pi-embedded-helpers/failover-matches.ts      |  1 +
 .../pi-embedded-runner/run/assistant-failover.ts | 16 +++++++++-------
 6 files changed, 26 insertions(+), 7 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index be011e612b8..0ca0374bcc7 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -153,6 +153,8 @@ Docs: https://docs.openclaw.ai
 
 - Memory/QMD: warn with a manual stale collection removal hint when QMD reports a path/pattern conflict but `collection list` lacks verifiable metadata, avoiding unsafe stderr-only rebinds. Refs #71783. (#72297) Thanks @MonkeyLeeT.
 - Models/auth: make `openclaw models status --check` and dashboard auth health honor effective auth profile order while keeping stale profiles visible. (#79685) Thanks @nimbleenigma.
+- Agents/failover: classify bare `stream_read_error` streaming failures as transient timeouts so configured model fallback runs instead of surfacing the raw transport error. Fixes #79689. (#79692) Thanks @hekunwang.
+- Agents/failover: persist overloaded auth-profile cooldown marks before exhausted fallback summaries surface, so immediate fallback retries honor the recorded cooldown state.
 - Docs/Subagents: correct the listed sub-agent bootstrap context files to include `SOUL.md`, `IDENTITY.md`, and `USER.md`. (#79470) Thanks @lastguru-net.
 - Backup: keep live backup archives from copying current agent session transcripts, cron run logs, and delivery queues while preserving workspace lock/temp files and keeping `--json` output parseable when volatile files are skipped. Fixes #72249. (#72251) Thanks @abnershang.
 - OpenAI/Codex: install the Codex runtime plugin from npm during OpenAI onboarding and load it automatically for implicit OpenAI model routes, while preserving manual PI runtime overrides. Fixes #79358.
diff --git a/src/agents/model-fallback.run-embedded.e2e.test.ts b/src/agents/model-fallback.run-embedded.e2e.test.ts
index 66da9ed1764..83434f9fb95 100644
--- a/src/agents/model-fallback.run-embedded.e2e.test.ts
+++ b/src/agents/model-fallback.run-embedded.e2e.test.ts
@@ -483,6 +483,10 @@ describe("runWithModelFallback + runEmbeddedPiAgent failover behavior", () => {
         name: "undici-terminated",
         message: "terminated",
       },
+      {
+        name: "stream-read-error",
+        message: "stream_read_error",
+      },
       {
         name: "codex-empty-transport-response",
         message: "Request failed",
diff --git a/src/agents/model-fallback.test.ts b/src/agents/model-fallback.test.ts
index 9d93d486951..fe881f924d7 100644
--- a/src/agents/model-fallback.test.ts
+++ b/src/agents/model-fallback.test.ts
@@ -1113,6 +1113,14 @@ describe("runWithModelFallback", () => {
         error: new Error("Model not found: openai/gpt-6"),
         expectedFallback: ["anthropic", "claude-haiku-3-5"],
       },
+      {
+        name: "bare stream read transport error",
+        provider: "openai",
+        model: "gpt-4.1-mini",
+        error: new Error("stream_read_error"),
+        expectedFallback: ["anthropic", "claude-haiku-3-5"],
+        expectedReason: "timeout",
+      },
     ];
 
     for (const testCase of cases) {
diff --git a/src/agents/pi-embedded-helpers.isbillingerrormessage.test.ts b/src/agents/pi-embedded-helpers.isbillingerrormessage.test.ts
index ab8b7473fe9..03100d852a8 100644
--- a/src/agents/pi-embedded-helpers.isbillingerrormessage.test.ts
+++ b/src/agents/pi-embedded-helpers.isbillingerrormessage.test.ts
@@ -923,6 +923,8 @@ describe("isFailoverErrorMessage", () => {
       "terminated",
       "Terminated",
       "  terminated  ",
+      "stream_read_error",
+      "  stream_read_error  ",
       "UND_ERR_SOCKET",
       "Error: UND_ERR_SOCKET other side closed",
       "UND_ERR_CONNECT_TIMEOUT",
diff --git a/src/agents/pi-embedded-helpers/failover-matches.ts b/src/agents/pi-embedded-helpers/failover-matches.ts
index e36b9f74684..dd9e239a041 100644
--- a/src/agents/pi-embedded-helpers/failover-matches.ts
+++ b/src/agents/pi-embedded-helpers/failover-matches.ts
@@ -168,6 +168,7 @@ const ERROR_PATTERNS = {
     // aborted). These arrive as bare strings on the outer error and, without
     // an explicit match, the fallback chain is never attempted (#69368).
     /^terminated$/i,
+    /^stream_read_error$/i,
     /\bund_err_(?:socket|connect|headers?|body|req_content_length_mismatch|aborted|closed)\b/i,
     // pi-ai's openai-codex provider surfaces `Request failed` when the HTTP
     // response has no body and no status text (typical of Cloudflare 502s
diff --git a/src/agents/pi-embedded-runner/run/assistant-failover.ts b/src/agents/pi-embedded-runner/run/assistant-failover.ts
index 752e24ec1c3..d9b40cf9e66 100644
--- a/src/agents/pi-embedded-runner/run/assistant-failover.ts
+++ b/src/agents/pi-embedded-runner/run/assistant-failover.ts
@@ -99,17 +99,19 @@ export async function handleAssistantFailover(params: {
   if (decision.action === "rotate_profile") {
     const failedProfileId = params.lastProfileId;
     const failureReason = params.timedOut ? "timeout" : params.assistantProfileFailureReason;
-    const markFailedProfile = () => {
+    const markFailedProfile = async () => {
       if (!failedProfileId || !failureReason || failureReason === "timeout") {
         return;
       }
-      params
-        .maybeMarkAuthProfileFailure({
+      try {
+        await params.maybeMarkAuthProfileFailure({
           profileId: failedProfileId,
           reason: failureReason,
           modelId: params.modelId,
-        })
-        .catch((err) => params.warn(`deferred profile failure mark failed: ${String(err)}`));
+        });
+      } catch (err) {
+        params.warn(`profile failure mark failed: ${String(err)}`);
+      }
     };
 
     if (params.failoverReason === "overloaded") {
@@ -122,7 +124,7 @@ export async function handleAssistantFailover(params: {
         params.warn(
           `overload profile rotation cap reached for ${sanitizeForLog(params.provider)}/${sanitizeForLog(params.modelId)} after ${overloadProfileRotations} rotations; escalating to model fallback`,
         );
-        markFailedProfile();
+        await markFailedProfile();
         params.logAssistantFailoverDecision("fallback_model", { status });
         return {
           action: "throw",
@@ -151,7 +153,7 @@ export async function handleAssistantFailover(params: {
     }
 
     const rotated = await params.advanceAuthProfile();
-    markFailedProfile();
+    await markFailedProfile();
     if (params.timedOut && !params.isProbeSession && failedProfileId) {
       params.warn(`Profile ${failedProfileId} timed out. Trying next account...`);
     }