From f4891b083ddec4a5406a69896c20f4fe1944b9a4 Mon Sep 17 00:00:00 2001 From: Sk7n4k3d Date: Mon, 20 Apr 2026 22:01:06 +0200 Subject: [PATCH] fix(agents/failover): classify undici terminated and codex Request failed as timeout (#69368) --- CHANGELOG.md | 2 ++ ...dded-helpers.isbillingerrormessage.test.ts | 28 +++++++++++++++++++ .../pi-embedded-helpers/failover-matches.ts | 12 ++++++++ 3 files changed, 42 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index c91fdc60675..c59e953c887 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -58,6 +58,8 @@ Docs: https://docs.openclaw.ai - Thinking defaults/status: raise the implicit default thinking level for reasoning-capable models from legacy `off`/`low` fallback behavior to a safe provider-supported `medium` equivalent when no explicit config default is set, preserve configured-model reasoning metadata when runtime catalog loading is empty, and make `/status` report the same resolved default as runtime. - Gateway/model pricing: fetch OpenRouter and LiteLLM pricing asynchronously at startup and extend catalog fetch timeouts to 30 seconds, reducing noisy timeout warnings during slow upstream responses. +- Agents/failover: classify bare undici transport failures (`terminated`, `UND_ERR_SOCKET`, `UND_ERR_CONNECT_TIMEOUT`, body/header timeouts, aborted streams) and pi-ai's openai-codex `Request failed` sentinel as `timeout`, so Cloudflare 502s with empty bodies and mid-response socket resets actually enter the configured fallback chain instead of surfacing as unclassified errors. Fixes #69368. (#69677) Thanks @sk7n4k3d. +- Providers/Anthropic Vertex: restore ADC-backed model discovery after the lightweight provider-discovery path by resolving emitted discovery entries, exposing synthetic auth on bootstrap discovery, and honoring copied env snapshots when probing the default GCP ADC path. Fixes #65715. (#65716) Thanks @feiskyer. - Plugins/install: add newly installed plugin ids to an existing `plugins.allow` list before enabling them, so allowlisted configs load installed plugins after restart. - Status: show `Fast` in `/status` when fast mode is enabled, including config/default-derived fast mode, and omit it when disabled. - OpenAI/image generation: detect Azure OpenAI-style image endpoints, use Azure `api-key` auth plus deployment-scoped image URLs, and honor `AZURE_OPENAI_API_VERSION` so image generation and edits work against Azure-hosted OpenAI resources. (#70570) Thanks @zhanggpcsu. diff --git a/src/agents/pi-embedded-helpers.isbillingerrormessage.test.ts b/src/agents/pi-embedded-helpers.isbillingerrormessage.test.ts index 94e5d67adab..e8cd0dbc88d 100644 --- a/src/agents/pi-embedded-helpers.isbillingerrormessage.test.ts +++ b/src/agents/pi-embedded-helpers.isbillingerrormessage.test.ts @@ -844,6 +844,34 @@ describe("isFailoverErrorMessage", () => { expect(classifyFailoverReason(INTERNAL_SERVER_ERROR_STATUS_WITH_500_SAMPLE)).toBe("timeout"); expect(isFailoverErrorMessage(INTERNAL_SERVER_ERROR_STATUS_WITH_500_SAMPLE)).toBe(true); }); + + it("matches bare undici transport failures as timeout (#69368)", () => { + expectTimeoutFailoverSamples([ + "terminated", + "Terminated", + " terminated ", + "UND_ERR_SOCKET", + "Error: UND_ERR_SOCKET other side closed", + "UND_ERR_CONNECT_TIMEOUT", + "UND_ERR_HEADERS_TIMEOUT", + "UND_ERR_BODY_TIMEOUT", + "UND_ERR_ABORTED", + "UND_ERR_REQ_CONTENT_LENGTH_MISMATCH", + ]); + }); + + it("matches pi-ai openai-codex bare transport failures as timeout (#69368)", () => { + expectTimeoutFailoverSamples([ + "Request failed", + "request failed", + " Request failed ", + "Request failed after repeated internal retries.", + ]); + }); + + it("does not classify unrelated 'terminated' prose as timeout", () => { + expectNotFailoverSample("The user terminated the session manually."); + }); }); describe("parseImageSizeError", () => { diff --git a/src/agents/pi-embedded-helpers/failover-matches.ts b/src/agents/pi-embedded-helpers/failover-matches.ts index efa5ff62166..c8b97158ae4 100644 --- a/src/agents/pi-embedded-helpers/failover-matches.ts +++ b/src/agents/pi-embedded-helpers/failover-matches.ts @@ -126,6 +126,18 @@ const ERROR_PATTERNS = { // falls through to reason=unknown (#58315). /\boperation was aborted\b/i, /\bstream (?:was )?(?:closed|aborted)\b/i, + // Undici transport-level failures during CDN/provider outages (Cloudflare + // 502 served with an empty body, socket reset mid-response, body-stream + // aborted). These arrive as bare strings on the outer error and, without + // an explicit match, the fallback chain is never attempted (#69368). + /^terminated$/i, + /\bund_err_(?:socket|connect|headers?|body|req_content_length_mismatch|aborted|closed)\b/i, + // pi-ai's openai-codex provider surfaces `Request failed` when the HTTP + // response has no body and no status text (typical of Cloudflare 502s + // from the upstream Codex service). Treat it as a transport failure so + // the configured fallback chain runs instead of surfacing the error. + /^request failed$/i, + /\brequest failed after repeated internal retries\b/i, ], billing: [ /["']?(?:status|code)["']?\s*[:=]\s*402\b|\bhttp\s*402\b|\berror(?:\s+code)?\s*[:=]?\s*402\b|\b(?:got|returned|received)\s+(?:a\s+)?402\b|^\s*402\s+payment/i,