From ae460eff84ab89030405a99e4d2f8f121890c5b2 Mon Sep 17 00:00:00 2001 From: Altay Date: Sat, 4 Apr 2026 18:24:03 +0300 Subject: [PATCH] fix(failover): scope openrouter-specific matchers (#60909) --- src/agents/cli-runner/execute.ts | 2 +- src/agents/failover-error.test.ts | 42 +++++++++++++++---- ...dded-helpers.isbillingerrormessage.test.ts | 27 +++++++++--- src/agents/pi-embedded-helpers/errors.ts | 31 ++++++++++++-- .../pi-embedded-helpers/failover-matches.ts | 1 - src/agents/pi-embedded-runner/run.ts | 2 +- .../pi-embedded-runner/run/auth-controller.ts | 8 ++-- 7 files changed, 88 insertions(+), 25 deletions(-) diff --git a/src/agents/cli-runner/execute.ts b/src/agents/cli-runner/execute.ts index eed71f000c9..b23df5f711a 100644 --- a/src/agents/cli-runner/execute.ts +++ b/src/agents/cli-runner/execute.ts @@ -279,7 +279,7 @@ export async function executePreparedCliRun( }); } const err = stderr || stdout || "CLI failed."; - const reason = classifyFailoverReason(err) ?? "unknown"; + const reason = classifyFailoverReason(err, { provider: params.provider }) ?? "unknown"; const status = resolveFailoverStatus(reason); throw new FailoverError(err, { reason, diff --git a/src/agents/failover-error.test.ts b/src/agents/failover-error.test.ts index ba900960ded..f6d7fd241b8 100644 --- a/src/agents/failover-error.test.ts +++ b/src/agents/failover-error.test.ts @@ -196,21 +196,22 @@ describe("failover-error", () => { ).toBe("overloaded"); }); - it("classifies Anthropic bare 'unknown error' as timeout for failover (#49706)", () => { + it("classifies provider-scoped generic upstream errors for failover", () => { expect( resolveFailoverReasonFromError({ provider: "anthropic", message: "An unknown error occurred", }), ).toBe("timeout"); - }); - - it("does not classify generic internal unknown-error text as failover timeout", () => { expect( resolveFailoverReasonFromError({ - message: "LLM request failed with an unknown error.", + provider: "openrouter", + message: "Provider returned error", }), - ).toBeNull(); + ).toBe("timeout"); + }); + + it("does not classify provider-scoped upstream errors without the matching provider", () => { expect( resolveFailoverReasonFromError({ message: "An unknown error occurred", @@ -227,7 +228,14 @@ describe("failover-error", () => { message: "Provider returned error", }), ).toBeNull(); + expect( + resolveFailoverReasonFromError({ + provider: "anthropic", + message: "Provider returned error", + }), + ).toBeNull(); }); + it("treats 400 insufficient_quota payloads as billing instead of format", () => { expect( resolveFailoverReasonFromError({ @@ -548,11 +556,16 @@ describe("failover-error", () => { // GitHub: openclaw/openclaw#53849 — OpenRouter returns 403 with "Key limit exceeded" // when the monthly key spending limit is reached. This must trigger billing failover // (model fallback), not generic auth. - expect(resolveFailoverReasonFromError({ status: 403, message: "Key limit exceeded" })).toBe( - "billing", - ); expect( resolveFailoverReasonFromError({ + provider: "openrouter", + status: 403, + message: "Key limit exceeded", + }), + ).toBe("billing"); + expect( + resolveFailoverReasonFromError({ + provider: "openrouter", status: 403, message: "403 Key limit exceeded (monthly limit)", }), @@ -562,12 +575,23 @@ describe("failover-error", () => { it("401 billing-style message returns billing instead of generic auth", () => { expect( resolveFailoverReasonFromError({ + provider: "openrouter", status: 401, message: "401 Key limit exceeded (monthly limit)", }), ).toBe("billing"); }); + it("does not treat OpenRouter key-limit text as billing without provider context", () => { + expect(resolveFailoverReasonFromError({ message: "Key limit exceeded" })).toBeNull(); + expect( + resolveFailoverReasonFromError({ + status: 403, + message: "403 Key limit exceeded (monthly limit)", + }), + ).toBe("auth"); + }); + it("resolveFailoverStatus maps auth_permanent to 403", () => { expect(resolveFailoverStatus("auth_permanent")).toBe(403); }); diff --git a/src/agents/pi-embedded-helpers.isbillingerrormessage.test.ts b/src/agents/pi-embedded-helpers.isbillingerrormessage.test.ts index 47678b95a39..becaca64dae 100644 --- a/src/agents/pi-embedded-helpers.isbillingerrormessage.test.ts +++ b/src/agents/pi-embedded-helpers.isbillingerrormessage.test.ts @@ -570,10 +570,18 @@ describe("classifyFailoverReasonFromHttpStatus", () => { ).toBeNull(); }); - it("lets billing-classified HTTP 401 responses bypass generic auth", () => { + it("lets OpenRouter billing-classified HTTP 401 responses bypass generic auth", () => { + expect( + classifyFailoverReasonFromHttpStatus(401, "401 Key limit exceeded (monthly limit)", { + provider: "openrouter", + }), + ).toBe("billing"); + }); + + it("keeps generic HTTP 401 key-limit text on the auth path without provider context", () => { expect( classifyFailoverReasonFromHttpStatus(401, "401 Key limit exceeded (monthly limit)"), - ).toBe("billing"); + ).toBe("auth"); }); it("treats HTTP 499 as transient for structured errors", () => { @@ -638,20 +646,27 @@ describe("classifyFailoverReason", () => { ), ).toBeNull(); }); - it("classifies Anthropic bare 'unknown error' as timeout for failover", () => { + + it("classifies provider-scoped generic upstream messages", () => { expect(classifyFailoverReason("An unknown error occurred", { provider: "anthropic" })).toBe( "timeout", ); + expect(classifyFailoverReason("Provider returned error", { provider: "openrouter" })).toBe( + "timeout", + ); + expect(classifyFailoverReason("Key limit exceeded", { provider: "openrouter" })).toBe( + "billing", + ); }); - it("does not classify generic internal unknown-error text as timeout", () => { + it("does not classify provider-scoped generic upstream messages without provider context", () => { expect(classifyFailoverReason("An unknown error occurred")).toBeNull(); expect( classifyFailoverReason("An unknown error occurred", { provider: "openrouter" }), ).toBeNull(); expect(classifyFailoverReason("Provider returned error")).toBeNull(); - expect(classifyFailoverReason("Unknown error")).toBeNull(); - expect(classifyFailoverReason("LLM request failed with an unknown error.")).toBeNull(); + expect(classifyFailoverReason("Provider returned error", { provider: "anthropic" })).toBeNull(); + expect(classifyFailoverReason("Key limit exceeded")).toBeNull(); }); }); diff --git a/src/agents/pi-embedded-helpers/errors.ts b/src/agents/pi-embedded-helpers/errors.ts index 82738d7c61b..dddcc89349a 100644 --- a/src/agents/pi-embedded-helpers/errors.ts +++ b/src/agents/pi-embedded-helpers/errors.ts @@ -524,8 +524,11 @@ export function isTransientHttpError(raw: string): boolean { export function classifyFailoverReasonFromHttpStatus( status: number | undefined, message?: string, + opts?: { provider?: string }, ): FailoverReason | null { - const messageClassification = message ? classifyFailoverClassificationFromMessage(message) : null; + const messageClassification = message + ? classifyFailoverClassificationFromMessage(message, opts?.provider) + : null; return failoverReasonFromClassification( classifyFailoverClassificationFromHttpStatus(status, message, messageClassification), ); @@ -630,13 +633,27 @@ function classifyFailoverReasonFromCode(raw: string | undefined): FailoverReason } } -function isAnthropicProvider(provider?: string): boolean { +function isProvider(provider: string | undefined, match: string): boolean { const normalized = provider?.trim().toLowerCase(); - return Boolean(normalized && normalized.includes("anthropic")); + return Boolean(normalized && normalized.includes(match)); } function isAnthropicGenericUnknownError(raw: string, provider?: string): boolean { - return isAnthropicProvider(provider) && raw.toLowerCase().includes("an unknown error occurred"); + return ( + isProvider(provider, "anthropic") && raw.toLowerCase().includes("an unknown error occurred") + ); +} + +function isOpenRouterProviderReturnedError(raw: string, provider?: string): boolean { + return ( + isProvider(provider, "openrouter") && raw.toLowerCase().includes("provider returned error") + ); +} + +function isOpenRouterKeyLimitExceededError(raw: string, provider?: string): boolean { + return ( + isProvider(provider, "openrouter") && /\bkey\s+limit\s*(?:exceeded|reached|hit)\b/i.test(raw) + ); } function classifyFailoverClassificationFromMessage( @@ -662,6 +679,9 @@ function classifyFailoverClassificationFromMessage( if (reasonFrom402Text) { return toReasonClassification(reasonFrom402Text); } + if (isOpenRouterKeyLimitExceededError(raw, provider)) { + return toReasonClassification("billing"); + } if (isPeriodicUsageLimitErrorMessage(raw)) { return toReasonClassification(isBillingErrorMessage(raw) ? "billing" : "rate_limit"); } @@ -693,6 +713,9 @@ function classifyFailoverClassificationFromMessage( if (isAnthropicGenericUnknownError(raw, provider)) { return toReasonClassification("timeout"); } + if (isOpenRouterProviderReturnedError(raw, provider)) { + return toReasonClassification("timeout"); + } if (isServerErrorMessage(raw)) { return toReasonClassification("timeout"); } diff --git a/src/agents/pi-embedded-helpers/failover-matches.ts b/src/agents/pi-embedded-helpers/failover-matches.ts index d3382245c5f..fdd9a8179dd 100644 --- a/src/agents/pi-embedded-helpers/failover-matches.ts +++ b/src/agents/pi-embedded-helpers/failover-matches.ts @@ -119,7 +119,6 @@ const ERROR_PATTERNS = { "insufficient balance", "insufficient usd or diem balance", /requires?\s+more\s+credits/i, - /\bkey\s+limit\s*(?:exceeded|reached|hit)\b/i, ], authPermanent: HIGH_CONFIDENCE_AUTH_PERMANENT_PATTERNS, auth: [...AMBIGUOUS_AUTH_ERROR_PATTERNS, ...COMMON_AUTH_ERROR_PATTERNS], diff --git a/src/agents/pi-embedded-runner/run.ts b/src/agents/pi-embedded-runner/run.ts index 47a6ed319fd..9f86c6ddd82 100644 --- a/src/agents/pi-embedded-runner/run.ts +++ b/src/agents/pi-embedded-runner/run.ts @@ -1057,7 +1057,7 @@ export async function runEmbeddedPiAgent( modelId, }); const promptFailoverFailure = - promptFailoverReason !== null || isFailoverErrorMessage(errorText); + promptFailoverReason !== null || isFailoverErrorMessage(errorText, { provider }); // Capture the failing profile before auth-profile rotation mutates `lastProfileId`. const failedPromptProfileId = lastProfileId; const logPromptFailoverDecision = createFailoverDecisionLogger({ diff --git a/src/agents/pi-embedded-runner/run/auth-controller.ts b/src/agents/pi-embedded-runner/run/auth-controller.ts index a63f90659a6..154269d5ca2 100644 --- a/src/agents/pi-embedded-runner/run/auth-controller.ts +++ b/src/agents/pi-embedded-runner/run/auth-controller.ts @@ -273,7 +273,9 @@ export function createEmbeddedRunAuthController(params: { }) ?? "unknown" ); } - const classified = classifyFailoverReason(failoverParams.message); + const classified = classifyFailoverReason(failoverParams.message, { + provider: params.getProvider(), + }); return classified ?? "auth"; }; @@ -475,10 +477,10 @@ export function createEmbeddedRunAuthController(params: { if (!params.getRuntimeAuthState() || retried) { return false; } - if (!isFailoverErrorMessage(errorText)) { + if (!isFailoverErrorMessage(errorText, { provider: params.getProvider() })) { return false; } - if (classifyFailoverReason(errorText) !== "auth") { + if (classifyFailoverReason(errorText, { provider: params.getProvider() }) !== "auth") { return false; } try {