fix(agents): classify stream_read_error as transient (#79692)

* fix(agents): classify stream_read_error as transient

* fix: classify stream read errors as transient (#79692)

---------

Co-authored-by: Peter Steinberger <steipete@gmail.com>
This commit is contained in:
hekunwang
2026-05-09 18:18:04 +08:00
committed by GitHub
parent a855414bc6
commit 7236d6487e
6 changed files with 26 additions and 7 deletions

View File

@@ -153,6 +153,8 @@ Docs: https://docs.openclaw.ai
- Memory/QMD: warn with a manual stale collection removal hint when QMD reports a path/pattern conflict but `collection list` lacks verifiable metadata, avoiding unsafe stderr-only rebinds. Refs #71783. (#72297) Thanks @MonkeyLeeT.
- Models/auth: make `openclaw models status --check` and dashboard auth health honor effective auth profile order while keeping stale profiles visible. (#79685) Thanks @nimbleenigma.
- Agents/failover: classify bare `stream_read_error` streaming failures as transient timeouts so configured model fallback runs instead of surfacing the raw transport error. Fixes #79689. (#79692) Thanks @hekunwang.
- Agents/failover: persist auth-profile cooldown marks for overload failures before exhausted-fallback summaries surface, so immediate fallback retries honor the recorded cooldown state.
- Docs/Subagents: correct the listed sub-agent bootstrap context files to include `SOUL.md`, `IDENTITY.md`, and `USER.md`. (#79470) Thanks @lastguru-net.
- Backup: keep live backup archives from copying current agent session transcripts, cron run logs, and delivery queues while preserving workspace lock/temp files and keeping `--json` output parseable when volatile files are skipped. Fixes #72249. (#72251) Thanks @abnershang.
- OpenAI/Codex: install the Codex runtime plugin from npm during OpenAI onboarding and load it automatically for implicit OpenAI model routes, while preserving manual PI runtime overrides. Fixes #79358.

View File

@@ -483,6 +483,10 @@ describe("runWithModelFallback + runEmbeddedPiAgent failover behavior", () => {
name: "undici-terminated",
message: "terminated",
},
{
name: "stream-read-error",
message: "stream_read_error",
},
{
name: "codex-empty-transport-response",
message: "Request failed",

View File

@@ -1113,6 +1113,14 @@ describe("runWithModelFallback", () => {
error: new Error("Model not found: openai/gpt-6"),
expectedFallback: ["anthropic", "claude-haiku-3-5"],
},
{
name: "bare stream read transport error",
provider: "openai",
model: "gpt-4.1-mini",
error: new Error("stream_read_error"),
expectedFallback: ["anthropic", "claude-haiku-3-5"],
expectedReason: "timeout",
},
];
for (const testCase of cases) {

View File

@@ -923,6 +923,8 @@ describe("isFailoverErrorMessage", () => {
"terminated",
"Terminated",
" terminated ",
"stream_read_error",
" stream_read_error ",
"UND_ERR_SOCKET",
"Error: UND_ERR_SOCKET other side closed",
"UND_ERR_CONNECT_TIMEOUT",

View File

@@ -168,6 +168,7 @@ const ERROR_PATTERNS = {
// aborted). These arrive as bare strings on the outer error and, without
// an explicit match, the fallback chain is never attempted (#69368).
/^terminated$/i,
/^stream_read_error$/i,
/\bund_err_(?:socket|connect|headers?|body|req_content_length_mismatch|aborted|closed)\b/i,
// pi-ai's openai-codex provider surfaces `Request failed` when the HTTP
// response has no body and no status text (typical of Cloudflare 502s

View File

@@ -99,17 +99,19 @@ export async function handleAssistantFailover(params: {
if (decision.action === "rotate_profile") {
const failedProfileId = params.lastProfileId;
const failureReason = params.timedOut ? "timeout" : params.assistantProfileFailureReason;
const markFailedProfile = () => {
const markFailedProfile = async () => {
if (!failedProfileId || !failureReason || failureReason === "timeout") {
return;
}
params
.maybeMarkAuthProfileFailure({
try {
await params.maybeMarkAuthProfileFailure({
profileId: failedProfileId,
reason: failureReason,
modelId: params.modelId,
})
.catch((err) => params.warn(`deferred profile failure mark failed: ${String(err)}`));
});
} catch (err) {
params.warn(`profile failure mark failed: ${String(err)}`);
}
};
if (params.failoverReason === "overloaded") {
@@ -122,7 +124,7 @@ export async function handleAssistantFailover(params: {
params.warn(
`overload profile rotation cap reached for ${sanitizeForLog(params.provider)}/${sanitizeForLog(params.modelId)} after ${overloadProfileRotations} rotations; escalating to model fallback`,
);
markFailedProfile();
await markFailedProfile();
params.logAssistantFailoverDecision("fallback_model", { status });
return {
action: "throw",
@@ -151,7 +153,7 @@ export async function handleAssistantFailover(params: {
}
const rotated = await params.advanceAuthProfile();
markFailedProfile();
await markFailedProfile();
if (params.timedOut && !params.isProbeSession && failedProfileId) {
params.warn(`Profile ${failedProfileId} timed out. Trying next account...`);
}