fix(agents): classify stream_read_error as transient (#79692)

* fix(agents): classify stream_read_error as transient
* fix: classify stream read errors as transient (#79692)

---------

Co-authored-by: Peter Steinberger <steipete@gmail.com>
2026-05-13 09:10:42 +00:00 · 2026-05-09 18:18:04 +08:00
parent a855414bc6
commit 7236d6487e
6 changed files with 26 additions and 7 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -153,6 +153,8 @@ Docs: https://docs.openclaw.ai

 - Memory/QMD: warn with a manual stale collection removal hint when QMD reports a path/pattern conflict but `collection list` lacks verifiable metadata, avoiding unsafe stderr-only rebinds. Refs #71783. (#72297) Thanks @MonkeyLeeT.
 - Models/auth: make `openclaw models status --check` and dashboard auth health honor effective auth profile order while keeping stale profiles visible. (#79685) Thanks @nimbleenigma.
+- Agents/failover: classify bare `stream_read_error` streaming failures as transient timeouts so configured model fallback runs instead of surfacing the raw transport error. Fixes #79689. (#79692) Thanks @hekunwang.
+- Agents/failover: persist overloaded auth-profile cooldown marks before exhausted fallback summaries surface, so immediate fallback retries honor the recorded cooldown state.
 - Docs/Subagents: correct the listed sub-agent bootstrap context files to include `SOUL.md`, `IDENTITY.md`, and `USER.md`. (#79470) Thanks @lastguru-net.
 - Backup: keep live backup archives from copying current agent session transcripts, cron run logs, and delivery queues while preserving workspace lock/temp files and keeping `--json` output parseable when volatile files are skipped. Fixes #72249. (#72251) Thanks @abnershang.
 - OpenAI/Codex: install the Codex runtime plugin from npm during OpenAI onboarding and load it automatically for implicit OpenAI model routes, while preserving manual PI runtime overrides. Fixes #79358.
--- a/src/agents/model-fallback.run-embedded.e2e.test.ts
+++ b/src/agents/model-fallback.run-embedded.e2e.test.ts
@@ -483,6 +483,10 @@ describe("runWithModelFallback + runEmbeddedPiAgent failover behavior", () => {
        name: "undici-terminated",
        message: "terminated",
      },
+      {
+        name: "stream-read-error",
+        message: "stream_read_error",
+      },
      {
        name: "codex-empty-transport-response",
        message: "Request failed",
--- a/src/agents/model-fallback.test.ts
+++ b/src/agents/model-fallback.test.ts
@@ -1113,6 +1113,14 @@ describe("runWithModelFallback", () => {
        error: new Error("Model not found: openai/gpt-6"),
        expectedFallback: ["anthropic", "claude-haiku-3-5"],
      },
+      {
+        name: "bare stream read transport error",
+        provider: "openai",
+        model: "gpt-4.1-mini",
+        error: new Error("stream_read_error"),
+        expectedFallback: ["anthropic", "claude-haiku-3-5"],
+        expectedReason: "timeout",
+      },
    ];

    for (const testCase of cases) {
--- a/src/agents/pi-embedded-helpers.isbillingerrormessage.test.ts
+++ b/src/agents/pi-embedded-helpers.isbillingerrormessage.test.ts
@@ -923,6 +923,8 @@ describe("isFailoverErrorMessage", () => {
      "terminated",
      "Terminated",
      "  terminated  ",
+      "stream_read_error",
+      "  stream_read_error  ",
      "UND_ERR_SOCKET",
      "Error: UND_ERR_SOCKET other side closed",
      "UND_ERR_CONNECT_TIMEOUT",
--- a/src/agents/pi-embedded-helpers/failover-matches.ts
+++ b/src/agents/pi-embedded-helpers/failover-matches.ts
@@ -168,6 +168,7 @@ const ERROR_PATTERNS = {
    // aborted). These arrive as bare strings on the outer error and, without
    // an explicit match, the fallback chain is never attempted (#69368).
    /^terminated$/i,
+    /^stream_read_error$/i,
    /\bund_err_(?:socket|connect|headers?|body|req_content_length_mismatch|aborted|closed)\b/i,
    // pi-ai's openai-codex provider surfaces `Request failed` when the HTTP
    // response has no body and no status text (typical of Cloudflare 502s
--- a/src/agents/pi-embedded-runner/run/assistant-failover.ts
+++ b/src/agents/pi-embedded-runner/run/assistant-failover.ts
@@ -99,17 +99,19 @@ export async function handleAssistantFailover(params: {
  if (decision.action === "rotate_profile") {
    const failedProfileId = params.lastProfileId;
    const failureReason = params.timedOut ? "timeout" : params.assistantProfileFailureReason;
-    const markFailedProfile = () => {
+    const markFailedProfile = async () => {
      if (!failedProfileId || !failureReason || failureReason === "timeout") {
        return;
      }
-      params
-        .maybeMarkAuthProfileFailure({
+      try {
+        await params.maybeMarkAuthProfileFailure({
          profileId: failedProfileId,
          reason: failureReason,
          modelId: params.modelId,
-        })
-        .catch((err) => params.warn(`deferred profile failure mark failed: ${String(err)}`));
+        });
+      } catch (err) {
+        params.warn(`profile failure mark failed: ${String(err)}`);
+      }
    };

    if (params.failoverReason === "overloaded") {
@@ -122,7 +124,7 @@ export async function handleAssistantFailover(params: {
        params.warn(
          `overload profile rotation cap reached for ${sanitizeForLog(params.provider)}/${sanitizeForLog(params.modelId)} after ${overloadProfileRotations} rotations; escalating to model fallback`,
        );
-        markFailedProfile();
+        await markFailedProfile();
        params.logAssistantFailoverDecision("fallback_model", { status });
        return {
          action: "throw",
@@ -151,7 +153,7 @@ export async function handleAssistantFailover(params: {
    }

    const rotated = await params.advanceAuthProfile();
-    markFailedProfile();
+    await markFailedProfile();
    if (params.timedOut && !params.isProbeSession && failedProfileId) {
      params.warn(`Profile ${failedProfileId} timed out. Trying next account...`);
    }