From 983909f826f0ff817b4eaa0be2be25ec97331ed0 Mon Sep 17 00:00:00 2001 From: Aaron Zhu <139607425+aaron-he-zhu@users.noreply.github.com> Date: Sat, 4 Apr 2026 23:11:46 +0800 Subject: [PATCH] fix(agents): classify generic provider errors for failover (#59325) * fix(agents): classify generic provider errors for failover Anthropic returns bare 'An unknown error occurred' during API instability and OpenRouter wraps upstream failures as 'Provider returned error'. Neither message was recognized by the failover classifier, so the error surfaced directly to users instead of triggering the configured fallback chain. Add both patterns to the serverError classifier so they are classified as transient server errors (timeout) and trigger model failover. Note: the follow-up commit below narrows this; the final diff only classifies 'An unknown error occurred' as timeout when the error's provider is Anthropic, and leaves 'Provider returned error' unclassified. Closes #49706 Closes #45834 * fix(agents): scope unknown-error failover by provider * docs(changelog): note provider-scoped unknown-error failover --------- Co-authored-by: Aaron Zhu Co-authored-by: Altay --- CHANGELOG.md | 1 + src/agents/cli-runner.ts | 4 +-- src/agents/failover-error.test.ts | 32 +++++++++++++++++++ src/agents/failover-error.ts | 17 ++++++++++ ...dded-helpers.isbillingerrormessage.test.ts | 15 +++++++++ src/agents/pi-embedded-helpers/errors.ts | 32 +++++++++++++++---- src/agents/pi-embedded-runner/run.ts | 9 ++++-- ...i-embedded-subscribe.handlers.lifecycle.ts | 4 ++- 8 files changed, 103 insertions(+), 11 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 74ca3309a25..669ca418efa 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -110,6 +110,7 @@ Docs: https://docs.openclaw.ai - Gateway/device auth: reuse cached device-token scopes only for cached-token reconnects, while keeping explicit `deviceToken` scope requests and empty-cache fallbacks intact so reconnects preserve `operator.read` without breaking explicit auth flows. (#46032) Thanks @caicongyang. 
- Google Gemini CLI auth: improve OAuth credential discovery across Windows nvm and Homebrew libexec installs, and align Code Assist metadata so Gemini login stops failing on packaged CLI layouts. (#40729) Thanks @hughcube. - Mattermost/config schema: accept `groups.*.requireMention` again so existing Mattermost configs no longer fail strict validation after upgrade. (#58271) Thanks @MoerAI. +- Agents/failover: scope Anthropic `An unknown error occurred` failover matching by provider so generic internal unknown-error text no longer triggers retryable timeout fallback. (#59325) Thanks @aaron-he-zhu. - Providers/OpenRouter failover: classify `403 "Key limit exceeded"` spending-limit responses as billing so model fallback continues instead of stopping on generic auth. (#59892) Thanks @rockcent. - Device pairing/security: keep non-operator device scope checks bound to the requested role prefix so bootstrap verification cannot redeem `operator.*` scopes through `node` auth. (#57258) Thanks @jlapenna. - Gateway/device pairing: require non-admin paired-device sessions to manage only their own device for token rotate/revoke and paired-device removal, blocking cross-device token theft inside pairing-scoped sessions. (#50627) Thanks @coygeek. 
diff --git a/src/agents/cli-runner.ts b/src/agents/cli-runner.ts index 25fed7fc20a..9ae4d3909d9 100644 --- a/src/agents/cli-runner.ts +++ b/src/agents/cli-runner.ts @@ -73,8 +73,8 @@ export async function runCliAgent(params: RunCliAgentParams): Promise { ).toBe("overloaded"); }); + it("classifies Anthropic bare 'unknown error' as timeout for failover (#49706)", () => { + expect( + resolveFailoverReasonFromError({ + provider: "anthropic", + message: "An unknown error occurred", + }), + ).toBe("timeout"); + }); + + it("does not classify generic internal unknown-error text as failover timeout", () => { + expect( + resolveFailoverReasonFromError({ + message: "LLM request failed with an unknown error.", + }), + ).toBeNull(); + expect( + resolveFailoverReasonFromError({ + message: "An unknown error occurred", + }), + ).toBeNull(); + expect( + resolveFailoverReasonFromError({ + provider: "openrouter", + message: "An unknown error occurred", + }), + ).toBeNull(); + expect( + resolveFailoverReasonFromError({ + message: "Provider returned error", + }), + ).toBeNull(); + }); it("treats 400 insufficient_quota payloads as billing instead of format", () => { expect( resolveFailoverReasonFromError({ diff --git a/src/agents/failover-error.ts b/src/agents/failover-error.ts index 12814e2d9f3..ffd3208e27d 100644 --- a/src/agents/failover-error.ts +++ b/src/agents/failover-error.ts @@ -132,6 +132,22 @@ function getErrorCode(err: unknown): string | undefined { return findErrorProperty(err, readDirectErrorCode); } +function readDirectProvider(err: unknown): string | undefined { + if (!err || typeof err !== "object") { + return undefined; + } + const provider = (err as { provider?: unknown }).provider; + if (typeof provider !== "string") { + return undefined; + } + const trimmed = provider.trim(); + return trimmed || undefined; +} + +function getProvider(err: unknown): string | undefined { + return findErrorProperty(err, readDirectProvider); +} + function readDirectErrorMessage(err: 
unknown): string | undefined { if (err instanceof Error) { return err.message || undefined; @@ -207,6 +223,7 @@ function normalizeErrorSignal(err: unknown): FailoverSignal { status: getStatusCode(err), code: getErrorCode(err), message: message || undefined, + provider: getProvider(err), }; } diff --git a/src/agents/pi-embedded-helpers.isbillingerrormessage.test.ts b/src/agents/pi-embedded-helpers.isbillingerrormessage.test.ts index 6fae38b259d..47678b95a39 100644 --- a/src/agents/pi-embedded-helpers.isbillingerrormessage.test.ts +++ b/src/agents/pi-embedded-helpers.isbillingerrormessage.test.ts @@ -638,6 +638,21 @@ describe("classifyFailoverReason", () => { ), ).toBeNull(); }); + it("classifies Anthropic bare 'unknown error' as timeout for failover", () => { + expect(classifyFailoverReason("An unknown error occurred", { provider: "anthropic" })).toBe( + "timeout", + ); + }); + + it("does not classify generic internal unknown-error text as timeout", () => { + expect(classifyFailoverReason("An unknown error occurred")).toBeNull(); + expect( + classifyFailoverReason("An unknown error occurred", { provider: "openrouter" }), + ).toBeNull(); + expect(classifyFailoverReason("Provider returned error")).toBeNull(); + expect(classifyFailoverReason("Unknown error")).toBeNull(); + expect(classifyFailoverReason("LLM request failed with an unknown error.")).toBeNull(); + }); }); describe("isFailoverErrorMessage", () => { diff --git a/src/agents/pi-embedded-helpers/errors.ts b/src/agents/pi-embedded-helpers/errors.ts index 1350d2499ca..82738d7c61b 100644 --- a/src/agents/pi-embedded-helpers/errors.ts +++ b/src/agents/pi-embedded-helpers/errors.ts @@ -371,6 +371,7 @@ export type FailoverSignal = { status?: number; code?: string; message?: string; + provider?: string; }; export type FailoverClassification = @@ -629,7 +630,19 @@ function classifyFailoverReasonFromCode(raw: string | undefined): FailoverReason } } -function classifyFailoverClassificationFromMessage(raw: string): 
FailoverClassification | null { +function isAnthropicProvider(provider?: string): boolean { + const normalized = provider?.trim().toLowerCase(); + return Boolean(normalized && normalized.includes("anthropic")); +} + +function isAnthropicGenericUnknownError(raw: string, provider?: string): boolean { + return isAnthropicProvider(provider) && raw.toLowerCase().includes("an unknown error occurred"); +} + +function classifyFailoverClassificationFromMessage( + raw: string, + provider?: string, +): FailoverClassification | null { if (isImageDimensionErrorMessage(raw)) { return null; } @@ -677,6 +690,9 @@ function classifyFailoverClassificationFromMessage(raw: string): FailoverClassif if (isAuthErrorMessage(raw)) { return toReasonClassification("auth"); } + if (isAnthropicGenericUnknownError(raw, provider)) { + return toReasonClassification("timeout"); + } if (isServerErrorMessage(raw)) { return toReasonClassification("timeout"); } @@ -703,7 +719,7 @@ export function classifyFailoverSignal(signal: FailoverSignal): FailoverClassifi ? signal.status : extractLeadingHttpStatus(signal.message?.trim() ?? "")?.code; const messageClassification = signal.message - ? classifyFailoverClassificationFromMessage(signal.message) + ? 
classifyFailoverClassificationFromMessage(signal.message, signal.provider) : null; const statusClassification = classifyFailoverClassificationFromHttpStatus( inferredStatus, @@ -1207,24 +1223,28 @@ function isCliSessionExpiredErrorMessage(raw: string): boolean { ); } -export function classifyFailoverReason(raw: string): FailoverReason | null { +export function classifyFailoverReason( + raw: string, + opts?: { provider?: string }, +): FailoverReason | null { const trimmed = raw.trim(); const leadingStatus = extractLeadingHttpStatus(trimmed); return failoverReasonFromClassification( classifyFailoverSignal({ status: leadingStatus?.code, message: raw, + provider: opts?.provider, }), ); } -export function isFailoverErrorMessage(raw: string): boolean { - return classifyFailoverReason(raw) !== null; +export function isFailoverErrorMessage(raw: string, opts?: { provider?: string }): boolean { + return classifyFailoverReason(raw, opts) !== null; } export function isFailoverAssistantError(msg: AssistantMessage | undefined): boolean { if (!msg || msg.stopReason !== "error") { return false; } - return isFailoverErrorMessage(msg.errorMessage ?? ""); + return isFailoverErrorMessage(msg.errorMessage ?? "", { provider: msg.provider }); } diff --git a/src/agents/pi-embedded-runner/run.ts b/src/agents/pi-embedded-runner/run.ts index 698f7e862c8..47a6ed319fd 100644 --- a/src/agents/pi-embedded-runner/run.ts +++ b/src/agents/pi-embedded-runner/run.ts @@ -1048,7 +1048,7 @@ export async function runEmbeddedPiAgent( }; } const promptFailoverReason = - promptErrorDetails.reason ?? classifyFailoverReason(errorText); + promptErrorDetails.reason ?? 
classifyFailoverReason(errorText, { provider }); const promptProfileFailureReason = resolveAuthProfileFailureReason(promptFailoverReason); await maybeMarkAuthProfileFailure({ @@ -1161,7 +1161,12 @@ export async function runEmbeddedPiAgent( const rateLimitFailure = isRateLimitAssistantError(lastAssistant); const billingFailure = isBillingAssistantError(lastAssistant); const failoverFailure = isFailoverAssistantError(lastAssistant); - const assistantFailoverReason = classifyFailoverReason(lastAssistant?.errorMessage ?? ""); + const assistantFailoverReason = classifyFailoverReason( + lastAssistant?.errorMessage ?? "", + { + provider: lastAssistant?.provider, + }, + ); const assistantProfileFailureReason = resolveAuthProfileFailureReason(assistantFailoverReason); const cloudCodeAssistFormatError = attempt.cloudCodeAssistFormatError; diff --git a/src/agents/pi-embedded-subscribe.handlers.lifecycle.ts b/src/agents/pi-embedded-subscribe.handlers.lifecycle.ts index 658b2215165..3b573c24fc6 100644 --- a/src/agents/pi-embedded-subscribe.handlers.lifecycle.ts +++ b/src/agents/pi-embedded-subscribe.handlers.lifecycle.ts @@ -47,7 +47,9 @@ export function handleAgentEnd(ctx: EmbeddedPiSubscribeContext) { model: lastAssistant.model, }); const rawError = lastAssistant.errorMessage?.trim(); - const failoverReason = classifyFailoverReason(rawError ?? ""); + const failoverReason = classifyFailoverReason(rawError ?? "", { + provider: lastAssistant.provider, + }); const errorText = (friendlyError || lastAssistant.errorMessage || "LLM request failed.").trim(); const observedError = buildApiErrorObservationFields(rawError); const safeErrorText =