diff --git a/CHANGELOG.md b/CHANGELOG.md index 76838ea551e..920e8e451d3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -34,6 +34,7 @@ Docs: https://docs.openclaw.ai - Feishu/card actions: resolve card-action chat type from the Feishu chat API when stored context is missing, preferring `chat_mode` over `chat_type`, so DM-originated card actions no longer bypass `dmPolicy` by falling through to the group handling path. (#68201) - Cron/isolated-agent: preserve `trusted: false` on isolated cron awareness events mirrored into the main session, and forward the optional `trusted` flag through the gateway cron wrapper so explicit trust downgrades survive session-key scoping. (#68210) - Agents/fallback: recognize bare leading ZenMux `402 ...` quota-refresh errors without misclassifying plain numeric `402 ...` text, and keep the embedded fallback regression coverage stable. (#47579) Thanks @bwjoke. +- Failover/google: only treat `INTERNAL` status payloads as retryable timeouts when they also carry a `500` code, so malformed non-500 payloads do not enter the retry path. (#68238) Thanks @altaywtf and @Openbling. 
## 2026.4.15 diff --git a/src/agents/pi-embedded-helpers.isbillingerrormessage.test.ts b/src/agents/pi-embedded-helpers.isbillingerrormessage.test.ts index 813cb8b7cb4..1404eb5590a 100644 --- a/src/agents/pi-embedded-helpers.isbillingerrormessage.test.ts +++ b/src/agents/pi-embedded-helpers.isbillingerrormessage.test.ts @@ -45,6 +45,12 @@ const GROQ_TOO_MANY_REQUESTS_MESSAGE = "429 Too Many Requests: Too many requests were sent in a given timeframe."; const GROQ_SERVICE_UNAVAILABLE_MESSAGE = "503 Service Unavailable: The server is temporarily unable to handle the request due to overloading or maintenance."; // pragma: allowlist secret +const PLAIN_INTERNAL_SERVER_ERROR_STATUS_SAMPLE = + "Proxy notice: Status: Internal Server Error"; +const MIXED_INTERNAL_SERVER_ERROR_STATUS_SAMPLE = + `${PLAIN_INTERNAL_SERVER_ERROR_STATUS_SAMPLE}; upstream connect error`; +const INTERNAL_SERVER_ERROR_STATUS_WITH_500_SAMPLE = + `${PLAIN_INTERNAL_SERVER_ERROR_STATUS_SAMPLE}; code:500`; function expectMessageMatches( matcher: (message: string) => boolean, @@ -64,6 +70,12 @@ function expectTimeoutFailoverSamples(samples: readonly string[]) { } } +function expectNotFailoverSample(sample: string) { + expect(isTimeoutErrorMessage(sample)).toBe(false); + expect(classifyFailoverReason(sample)).toBeNull(); + expect(isFailoverErrorMessage(sample)).toBe(false); +} + describe("isAuthPermanentErrorMessage", () => { it.each([ { @@ -811,6 +823,30 @@ describe("isFailoverErrorMessage", () => { expect(classifyFailoverReason(sample)).toBe(null); expect(isFailoverErrorMessage(sample)).toBe(false); }); + + it("matches google INTERNAL status errors as timeout", () => { + const sample = + "provider=google model=gemini-3.1-flash-lite-preview got status: INTERNAL upstream failure code:500"; + expect(isTimeoutErrorMessage(sample)).toBe(true); + expect(classifyFailoverReason(sample)).toBe("timeout"); + expect(isFailoverErrorMessage(sample)).toBe(true); + }); + + it("does not treat plain status text with 
internal-server-error wording as timeout", () => { + expectNotFailoverSample(PLAIN_INTERNAL_SERVER_ERROR_STATUS_SAMPLE); + }); + + it("keeps mixed upstream server errors retryable when they also mention status prose", () => { + expect(isTimeoutErrorMessage(MIXED_INTERNAL_SERVER_ERROR_STATUS_SAMPLE)).toBe(false); + expect(classifyFailoverReason(MIXED_INTERNAL_SERVER_ERROR_STATUS_SAMPLE)).toBe("timeout"); + expect(isFailoverErrorMessage(MIXED_INTERNAL_SERVER_ERROR_STATUS_SAMPLE)).toBe(true); + }); + + it("keeps status prose retryable when it is explicitly paired with code 500", () => { + expect(isTimeoutErrorMessage(INTERNAL_SERVER_ERROR_STATUS_WITH_500_SAMPLE)).toBe(false); + expect(classifyFailoverReason(INTERNAL_SERVER_ERROR_STATUS_WITH_500_SAMPLE)).toBe("timeout"); + expect(isFailoverErrorMessage(INTERNAL_SERVER_ERROR_STATUS_WITH_500_SAMPLE)).toBe(true); + }); }); describe("parseImageSizeError", () => { @@ -1230,4 +1266,33 @@ describe("classifyProviderRuntimeFailureKind", () => { ), ).not.toBe("proxy"); }); + + it("classifies google-style INTERNAL status payloads as timeout", () => { + expect( + classifyFailoverReason( + 'ERROR provider=google model=gemini-3.1-flash-lite-preview: got status: INTERNAL, details: {"code":500,"status":"INTERNAL"}', + ), + ).toBe("timeout"); + expect( + classifyFailoverReason( + 'got status: INTERNAL. {"error":{"code":500,"message":"Internal error encountered.","status":"INTERNAL"}}', + ), + ).toBe("timeout"); + }); + + it("does not classify google-style INTERNAL payloads without a 500 code as timeout", () => { + const sample = + 'got status: INTERNAL. 
{"error":{"code":400,"message":"Request malformed","status":"INTERNAL"}}'; + expect(isTimeoutErrorMessage(sample)).toBe(false); + expect(classifyFailoverReason(sample)).toBeNull(); + expect(isFailoverErrorMessage(sample)).toBe(false); + }); + + it("does not classify plain status text with internal server error wording as timeout", () => { + expectNotFailoverSample(PLAIN_INTERNAL_SERVER_ERROR_STATUS_SAMPLE); + }); + + it("classifies internal server error status prose with code 500 as timeout", () => { + expect(classifyFailoverReason(INTERNAL_SERVER_ERROR_STATUS_WITH_500_SAMPLE)).toBe("timeout"); + }); }); diff --git a/src/agents/pi-embedded-helpers/failover-matches.ts b/src/agents/pi-embedded-helpers/failover-matches.ts index 2130ff7fa0f..14d7ede0b1c 100644 --- a/src/agents/pi-embedded-helpers/failover-matches.ts +++ b/src/agents/pi-embedded-helpers/failover-matches.ts @@ -42,6 +42,9 @@ const COMMON_AUTH_ERROR_PATTERNS = [ const ZAI_BILLING_CODE_1311_RE = /"code"\s*:\s*1311\b/; const ZAI_AUTH_CODE_1113_RE = /"code"\s*:\s*1113\b/; +const STATUS_INTERNAL_SERVER_ERROR_RE = /\bstatus:\s*internal server error\b/i; +const STATUS_INTERNAL_SERVER_ERROR_WITH_500_RE = + /^(?=[\s\S]*\bstatus:\s*internal server error\b)(?=[\s\S]*\bcode["']?\s*[:=]\s*500\b)/i; const ZAI_AUTH_ERROR_PATTERNS = [ // Z.ai: error 1113 = wrong endpoint or invalid credentials (#48988) @@ -95,6 +98,8 @@ const ERROR_PATTERNS = { "service unavailable", "deadline exceeded", "context deadline exceeded", + /^(?=[\s\S]*\bgot status:\s*internal\b)(?=[\s\S]*\bcode["']?\s*[:=]\s*500\b)/i, + /^(?=[\s\S]*["']status["']\s*:\s*["']internal["'])(?=[\s\S]*["']code["']\s*:\s*500\b)/i, "connection error", "network error", "network request failed",
@@ -233,5 +238,23 @@ export function isOverloadedErrorMessage(raw: string): boolean {
 }
 
+/**
+ * Reports whether `raw` matches the server-error patterns.
+ *
+ * A "status: internal server error" phrase is accepted only together with a
+ * `code: 500` marker; otherwise every occurrence of the phrase is removed and
+ * the remaining text is checked against `ERROR_PATTERNS.serverError`.
+ */
 export function isServerErrorMessage(raw: string): boolean {
-  return matchesErrorPatterns(raw, ERROR_PATTERNS.serverError);
+  const value = normalizeLowercaseStringOrEmpty(raw);
+  if (!value) {
+    return false;
+  }
+  if (STATUS_INTERNAL_SERVER_ERROR_WITH_500_RE.test(value)) {
+    return true;
+  }
+  // Drop every "status: internal server error" phrase before pattern matching.
+  // split() cuts at each regex match even without the `g` flag, whereas
+  // replace() with this non-global regex removed only the first occurrence.
+  const scrubbed = value.split(STATUS_INTERNAL_SERVER_ERROR_RE).join("").trim();
+  return scrubbed.length > 0 && matchesErrorPatterns(scrubbed, ERROR_PATTERNS.serverError);
 }