fix(failover): classify INTERNAL 500 responses as retryable timeouts (#68238)

* Agents: treat Google INTERNAL 500 as timeout failover

(cherry picked from commit c2538523a22d39b65c6b4056ab4857ee84f06887)

* test(failover): narrow INTERNAL timeout patterns

* fix: document INTERNAL timeout retry guard

* fix: ignore plain status prose in server error classification

* fix(failover): preserve mixed server-error retry signals

* test(failover): dedupe internal status samples

* fix(failover): retry status prose with code 500

* fix: classify INTERNAL 500 responses as retryable timeouts

* fix: classify INTERNAL 500 responses as retryable timeouts

---------

Co-authored-by: Kosbling <github@kosbling.com>
Co-authored-by: Openbling <github@openbling.ai>
This commit is contained in:
Altay
2026-04-17 23:24:26 +03:00
committed by GitHub
parent a001b5343f
commit d0cf6731aa
3 changed files with 80 additions and 1 deletion

View File

@@ -34,6 +34,7 @@ Docs: https://docs.openclaw.ai
- Feishu/card actions: resolve card-action chat type from the Feishu chat API when stored context is missing, preferring `chat_mode` over `chat_type`, so DM-originated card actions no longer bypass `dmPolicy` by falling through to the group handling path. (#68201)
- Cron/isolated-agent: preserve `trusted: false` on isolated cron awareness events mirrored into the main session, and forward the optional `trusted` flag through the gateway cron wrapper so explicit trust downgrades survive session-key scoping. (#68210)
- Agents/fallback: recognize bare leading ZenMux `402 ...` quota-refresh errors without misclassifying plain numeric `402 ...` text, and keep the embedded fallback regression coverage stable. (#47579) Thanks @bwjoke.
- Failover/google: only treat `INTERNAL` status payloads as retryable timeouts when they also carry a `500` code, so malformed non-500 payloads do not enter the retry path. (#68238) Thanks @altaywtf and @Openbling.
## 2026.4.15

View File

@@ -45,6 +45,12 @@ const GROQ_TOO_MANY_REQUESTS_MESSAGE =
"429 Too Many Requests: Too many requests were sent in a given timeframe.";
const GROQ_SERVICE_UNAVAILABLE_MESSAGE =
"503 Service Unavailable: The server is temporarily unable to handle the request due to overloading or maintenance."; // pragma: allowlist secret
// Status prose on its own, with no other server-error signal attached.
const PLAIN_INTERNAL_SERVER_ERROR_STATUS_SAMPLE = "Proxy notice: Status: Internal Server Error";
// The same prose followed by an additional upstream failure phrase.
const MIXED_INTERNAL_SERVER_ERROR_STATUS_SAMPLE =
  PLAIN_INTERNAL_SERVER_ERROR_STATUS_SAMPLE + "; upstream connect error";
// The same prose followed by an explicit `code:500` marker.
const INTERNAL_SERVER_ERROR_STATUS_WITH_500_SAMPLE =
  PLAIN_INTERNAL_SERVER_ERROR_STATUS_SAMPLE + "; code:500";
function expectMessageMatches(
matcher: (message: string) => boolean,
@@ -64,6 +70,12 @@ function expectTimeoutFailoverSamples(samples: readonly string[]) {
}
}
/**
 * Asserts that `sample` is ignored by all three failover checks: it is not a
 * timeout message, it yields no failover reason, and it is not a failover
 * error message.
 */
function expectNotFailoverSample(sample: string) {
  const looksLikeTimeout = isTimeoutErrorMessage(sample);
  expect(looksLikeTimeout).toBe(false);

  const reason = classifyFailoverReason(sample);
  expect(reason).toBeNull();

  const looksLikeFailover = isFailoverErrorMessage(sample);
  expect(looksLikeFailover).toBe(false);
}
describe("isAuthPermanentErrorMessage", () => {
it.each([
{
@@ -811,6 +823,30 @@ describe("isFailoverErrorMessage", () => {
expect(classifyFailoverReason(sample)).toBe(null);
expect(isFailoverErrorMessage(sample)).toBe(false);
});
it("matches google INTERNAL status errors as timeout", () => {
  // `got status: INTERNAL` combined with `code:500` is the retryable shape under test.
  const googleInternal500 =
    "provider=google model=gemini-3.1-flash-lite-preview got status: INTERNAL upstream failure code:500";

  const timedOut = isTimeoutErrorMessage(googleInternal500);
  expect(timedOut).toBe(true);

  const reason = classifyFailoverReason(googleInternal500);
  expect(reason).toBe("timeout");

  const failover = isFailoverErrorMessage(googleInternal500);
  expect(failover).toBe(true);
});
it("does not treat plain status text with internal-server-error wording as timeout", () => {
  // Status prose alone must stay out of every failover path.
  const plain = PLAIN_INTERNAL_SERVER_ERROR_STATUS_SAMPLE;
  expectNotFailoverSample(plain);
});
it("keeps mixed upstream server errors retryable when they also mention status prose", () => {
  const mixed = MIXED_INTERNAL_SERVER_ERROR_STATUS_SAMPLE;
  // The timeout matcher itself says no, yet the failover classifier still reports "timeout".
  expect(isTimeoutErrorMessage(mixed)).toBe(false);
  expect(classifyFailoverReason(mixed)).toBe("timeout");
  expect(isFailoverErrorMessage(mixed)).toBe(true);
});
it("keeps status prose retryable when it is explicitly paired with code 500", () => {
  const proseWith500 = INTERNAL_SERVER_ERROR_STATUS_WITH_500_SAMPLE;
  // Same expectations as the mixed case: not a timeout message, but classified as "timeout".
  expect(isTimeoutErrorMessage(proseWith500)).toBe(false);
  expect(classifyFailoverReason(proseWith500)).toBe("timeout");
  expect(isFailoverErrorMessage(proseWith500)).toBe(true);
});
});
describe("parseImageSizeError", () => {
@@ -1230,4 +1266,33 @@ describe("classifyProviderRuntimeFailureKind", () => {
),
).not.toBe("proxy");
});
it("classifies google-style INTERNAL status payloads as timeout", () => {
  // Both payloads pair an INTERNAL status with a 500 code, in slightly different layouts.
  const retryablePayloads = [
    'ERROR provider=google model=gemini-3.1-flash-lite-preview: got status: INTERNAL, details: {"code":500,"status":"INTERNAL"}',
    'got status: INTERNAL. {"error":{"code":500,"message":"Internal error encountered.","status":"INTERNAL"}}',
  ];
  for (const payload of retryablePayloads) {
    expect(classifyFailoverReason(payload)).toBe("timeout");
  }
});
it("does not classify google-style INTERNAL payloads without a 500 code as timeout", () => {
  // INTERNAL status paired with code 400: must not be retried.
  const non500Payload =
    'got status: INTERNAL. {"error":{"code":400,"message":"Request malformed","status":"INTERNAL"}}';

  const timedOut = isTimeoutErrorMessage(non500Payload);
  expect(timedOut).toBe(false);

  const reason = classifyFailoverReason(non500Payload);
  expect(reason).toBeNull();

  const failover = isFailoverErrorMessage(non500Payload);
  expect(failover).toBe(false);
});
it("does not classify plain status text with internal server error wording as timeout", () => {
  // Bare status prose carries no retry signal.
  const plain = PLAIN_INTERNAL_SERVER_ERROR_STATUS_SAMPLE;
  expectNotFailoverSample(plain);
});
it("classifies internal server error status prose with code 500 as timeout", () => {
  // The explicit `code:500` marker is what makes the prose retryable.
  const reason = classifyFailoverReason(INTERNAL_SERVER_ERROR_STATUS_WITH_500_SAMPLE);
  expect(reason).toBe("timeout");
});
});

View File

@@ -42,6 +42,9 @@ const COMMON_AUTH_ERROR_PATTERNS = [
const ZAI_BILLING_CODE_1311_RE = /"code"\s*:\s*1311\b/;
const ZAI_AUTH_CODE_1113_RE = /"code"\s*:\s*1113\b/;
// Matches the prose fragment `status: internal server error` (case-insensitive,
// any amount of whitespace after the colon). No `g` flag, so a `replace` with
// this regex only touches the first occurrence.
const STATUS_INTERNAL_SERVER_ERROR_RE = /\bstatus:\s*internal server error\b/i;
// Matches only when the message contains BOTH the status prose above AND a
// `code` field equal to 500 (`code:500`, `code=500`, `"code": 500`, ...).
// Each condition is a lookahead anchored at the start of the string, so the
// two parts may appear in either order and anywhere in the message.
const STATUS_INTERNAL_SERVER_ERROR_WITH_500_RE =
/^(?=[\s\S]*\bstatus:\s*internal server error\b)(?=[\s\S]*\bcode["']?\s*[:=]\s*500\b)/i;
const ZAI_AUTH_ERROR_PATTERNS = [
// Z.ai: error 1113 = wrong endpoint or invalid credentials (#48988)
@@ -95,6 +98,8 @@ const ERROR_PATTERNS = {
"service unavailable",
"deadline exceeded",
"context deadline exceeded",
// Retry guard for INTERNAL status payloads: an INTERNAL status only counts
// when the same message also carries a 500 code, so payloads with another
// code (e.g. 400) do not match. Both lookaheads are anchored at the start of
// the string, so the two parts may appear in either order.
// Log-style form: `got status: INTERNAL ... code:500` / `code=500` / `"code":500`.
/^(?=[\s\S]*\bgot status:\s*internal\b)(?=[\s\S]*\bcode["']?\s*[:=]\s*500\b)/i,
// JSON-style form: a quoted `"status":"INTERNAL"` field plus a quoted `"code":500` field.
/^(?=[\s\S]*["']status["']\s*:\s*["']internal["'])(?=[\s\S]*["']code["']\s*:\s*500\b)/i,
"connection error",
"network error",
"network request failed",
@@ -233,5 +238,13 @@ export function isOverloadedErrorMessage(raw: string): boolean {
}
/**
 * Reports whether `raw` should be treated as a server-error failover message.
 *
 * Bare `status: internal server error` prose is not sufficient by itself. It
 * only counts when the same message also carries a `code: 500` marker, or when
 * the text left after removing that prose still matches a server-error pattern.
 *
 * @param raw - Error message text to classify.
 * @returns `true` when the message carries a server-error signal beyond plain
 *   status prose; `false` otherwise, including when the normalized message is empty.
 */
export function isServerErrorMessage(raw: string): boolean {
  // Normalized form of the message; an empty result can never be a server error.
  const value = normalizeLowercaseStringOrEmpty(raw);
  if (!value) {
    return false;
  }
  // Status prose explicitly paired with code 500 is accepted outright.
  if (STATUS_INTERNAL_SERVER_ERROR_WITH_500_RE.test(value)) {
    return true;
  }
  // Strip the status prose so it cannot satisfy a server-error pattern by
  // itself; whatever remains must still match on its own merits.
  // NOTE(review): the regex has no `g` flag, so only the first occurrence of
  // the prose is removed — confirm that repeated prose should still match.
  const scrubbed = value.replace(STATUS_INTERNAL_SERVER_ERROR_RE, "").trim();
  return scrubbed.length > 0 && matchesErrorPatterns(scrubbed, ERROR_PATTERNS.serverError);
}