mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-06 06:00:43 +00:00
fix(failover): classify INTERNAL 500 responses as retryable timeouts (#68238)
* Agents: treat Google INTERNAL 500 as timeout failover (cherry picked from commit c2538523a22d39b65c6b4056ab4857ee84f06887)
* test(failover): narrow INTERNAL timeout patterns
* fix: document INTERNAL timeout retry guard
* fix: ignore plain status prose in server error classification
* fix(failover): preserve mixed server-error retry signals
* test(failover): dedupe internal status samples
* fix(failover): retry status prose with code 500
* fix: classify INTERNAL 500 responses as retryable timeouts
* fix: classify INTERNAL 500 responses as retryable timeouts

---------

Co-authored-by: Kosbling <github@kosbling.com>
Co-authored-by: Openbling <github@openbling.ai>
This commit is contained in:
@@ -34,6 +34,7 @@ Docs: https://docs.openclaw.ai
|
||||
- Feishu/card actions: resolve card-action chat type from the Feishu chat API when stored context is missing, preferring `chat_mode` over `chat_type`, so DM-originated card actions no longer bypass `dmPolicy` by falling through to the group handling path. (#68201)
|
||||
- Cron/isolated-agent: preserve `trusted: false` on isolated cron awareness events mirrored into the main session, and forward the optional `trusted` flag through the gateway cron wrapper so explicit trust downgrades survive session-key scoping. (#68210)
|
||||
- Agents/fallback: recognize bare leading ZenMux `402 ...` quota-refresh errors without misclassifying plain numeric `402 ...` text, and keep the embedded fallback regression coverage stable. (#47579) Thanks @bwjoke.
|
||||
- Failover/google: only treat `INTERNAL` status payloads as retryable timeouts when they also carry a `500` code, so malformed non-500 payloads do not enter the retry path. (#68238) Thanks @altaywtf and @Openbling.
|
||||
|
||||
## 2026.4.15
|
||||
|
||||
|
||||
@@ -45,6 +45,12 @@ const GROQ_TOO_MANY_REQUESTS_MESSAGE =
|
||||
"429 Too Many Requests: Too many requests were sent in a given timeframe.";
|
||||
const GROQ_SERVICE_UNAVAILABLE_MESSAGE =
|
||||
"503 Service Unavailable: The server is temporarily unable to handle the request due to overloading or maintenance."; // pragma: allowlist secret
|
||||
// Status prose that mentions "Internal Server Error" with no code and no
// other error text.
const PLAIN_INTERNAL_SERVER_ERROR_STATUS_SAMPLE = "Proxy notice: Status: Internal Server Error";

// The same status prose followed by a separate upstream connection failure.
const MIXED_INTERNAL_SERVER_ERROR_STATUS_SAMPLE =
  PLAIN_INTERNAL_SERVER_ERROR_STATUS_SAMPLE + "; upstream connect error";

// The same status prose explicitly paired with a `code:500` marker.
const INTERNAL_SERVER_ERROR_STATUS_WITH_500_SAMPLE =
  PLAIN_INTERNAL_SERVER_ERROR_STATUS_SAMPLE + "; code:500";
|
||||
|
||||
function expectMessageMatches(
|
||||
matcher: (message: string) => boolean,
|
||||
@@ -64,6 +70,12 @@ function expectTimeoutFailoverSamples(samples: readonly string[]) {
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Asserts that `sample` is ignored by all three failover checks: it is not a
 * timeout message, it has no failover reason, and it is not a failover message.
 */
function expectNotFailoverSample(sample: string) {
  const treatedAsTimeout = isTimeoutErrorMessage(sample);
  expect(treatedAsTimeout).toBe(false);

  const failoverReason = classifyFailoverReason(sample);
  expect(failoverReason).toBeNull();

  const treatedAsFailover = isFailoverErrorMessage(sample);
  expect(treatedAsFailover).toBe(false);
}
|
||||
|
||||
describe("isAuthPermanentErrorMessage", () => {
|
||||
it.each([
|
||||
{
|
||||
@@ -811,6 +823,30 @@ describe("isFailoverErrorMessage", () => {
|
||||
expect(classifyFailoverReason(sample)).toBe(null);
|
||||
expect(isFailoverErrorMessage(sample)).toBe(false);
|
||||
});
|
||||
|
||||
it("matches google INTERNAL status errors as timeout", () => {
|
||||
const sample =
|
||||
"provider=google model=gemini-3.1-flash-lite-preview got status: INTERNAL upstream failure code:500";
|
||||
expect(isTimeoutErrorMessage(sample)).toBe(true);
|
||||
expect(classifyFailoverReason(sample)).toBe("timeout");
|
||||
expect(isFailoverErrorMessage(sample)).toBe(true);
|
||||
});
|
||||
|
||||
it("does not treat plain status text with internal-server-error wording as timeout", () => {
|
||||
expectNotFailoverSample(PLAIN_INTERNAL_SERVER_ERROR_STATUS_SAMPLE);
|
||||
});
|
||||
|
||||
it("keeps mixed upstream server errors retryable when they also mention status prose", () => {
|
||||
expect(isTimeoutErrorMessage(MIXED_INTERNAL_SERVER_ERROR_STATUS_SAMPLE)).toBe(false);
|
||||
expect(classifyFailoverReason(MIXED_INTERNAL_SERVER_ERROR_STATUS_SAMPLE)).toBe("timeout");
|
||||
expect(isFailoverErrorMessage(MIXED_INTERNAL_SERVER_ERROR_STATUS_SAMPLE)).toBe(true);
|
||||
});
|
||||
|
||||
it("keeps status prose retryable when it is explicitly paired with code 500", () => {
|
||||
expect(isTimeoutErrorMessage(INTERNAL_SERVER_ERROR_STATUS_WITH_500_SAMPLE)).toBe(false);
|
||||
expect(classifyFailoverReason(INTERNAL_SERVER_ERROR_STATUS_WITH_500_SAMPLE)).toBe("timeout");
|
||||
expect(isFailoverErrorMessage(INTERNAL_SERVER_ERROR_STATUS_WITH_500_SAMPLE)).toBe(true);
|
||||
});
|
||||
});
|
||||
|
||||
describe("parseImageSizeError", () => {
|
||||
@@ -1230,4 +1266,33 @@ describe("classifyProviderRuntimeFailureKind", () => {
|
||||
),
|
||||
).not.toBe("proxy");
|
||||
});
|
||||
|
||||
it("classifies google-style INTERNAL status payloads as timeout", () => {
|
||||
expect(
|
||||
classifyFailoverReason(
|
||||
'ERROR provider=google model=gemini-3.1-flash-lite-preview: got status: INTERNAL, details: {"code":500,"status":"INTERNAL"}',
|
||||
),
|
||||
).toBe("timeout");
|
||||
expect(
|
||||
classifyFailoverReason(
|
||||
'got status: INTERNAL. {"error":{"code":500,"message":"Internal error encountered.","status":"INTERNAL"}}',
|
||||
),
|
||||
).toBe("timeout");
|
||||
});
|
||||
|
||||
it("does not classify google-style INTERNAL payloads without a 500 code as timeout", () => {
|
||||
const sample =
|
||||
'got status: INTERNAL. {"error":{"code":400,"message":"Request malformed","status":"INTERNAL"}}';
|
||||
expect(isTimeoutErrorMessage(sample)).toBe(false);
|
||||
expect(classifyFailoverReason(sample)).toBeNull();
|
||||
expect(isFailoverErrorMessage(sample)).toBe(false);
|
||||
});
|
||||
|
||||
it("does not classify plain status text with internal server error wording as timeout", () => {
|
||||
expectNotFailoverSample(PLAIN_INTERNAL_SERVER_ERROR_STATUS_SAMPLE);
|
||||
});
|
||||
|
||||
it("classifies internal server error status prose with code 500 as timeout", () => {
|
||||
expect(classifyFailoverReason(INTERNAL_SERVER_ERROR_STATUS_WITH_500_SAMPLE)).toBe("timeout");
|
||||
});
|
||||
});
|
||||
|
||||
@@ -42,6 +42,9 @@ const COMMON_AUTH_ERROR_PATTERNS = [
|
||||
|
||||
const ZAI_BILLING_CODE_1311_RE = /"code"\s*:\s*1311\b/;
|
||||
const ZAI_AUTH_CODE_1113_RE = /"code"\s*:\s*1113\b/;
|
||||
// Matches the prose phrase "status: internal server error" (case-insensitive,
// optional whitespace after the colon). No `g` flag, so `String.replace` with
// this pattern removes only the first occurrence.
const STATUS_INTERNAL_SERVER_ERROR_RE = /\bstatus:\s*internal server error\b/i;
|
||||
// Matches when the message contains BOTH the status phrase above and a
// `code` key (optionally followed by a quote) set to 500 via `:` or `=`.
// The two start-anchored lookaheads let the parts appear in either order.
const STATUS_INTERNAL_SERVER_ERROR_WITH_500_RE =
|
||||
/^(?=[\s\S]*\bstatus:\s*internal server error\b)(?=[\s\S]*\bcode["']?\s*[:=]\s*500\b)/i;
|
||||
|
||||
const ZAI_AUTH_ERROR_PATTERNS = [
|
||||
// Z.ai: error 1113 = wrong endpoint or invalid credentials (#48988)
|
||||
@@ -95,6 +98,8 @@ const ERROR_PATTERNS = {
|
||||
"service unavailable",
|
||||
"deadline exceeded",
|
||||
"context deadline exceeded",
|
||||
/^(?=[\s\S]*\bgot status:\s*internal\b)(?=[\s\S]*\bcode["']?\s*[:=]\s*500\b)/i,
|
||||
/^(?=[\s\S]*["']status["']\s*:\s*["']internal["'])(?=[\s\S]*["']code["']\s*:\s*500\b)/i,
|
||||
"connection error",
|
||||
"network error",
|
||||
"network request failed",
|
||||
@@ -233,5 +238,13 @@ export function isOverloadedErrorMessage(raw: string): boolean {
|
||||
}
|
||||
|
||||
/**
 * Reports whether `raw` should be classified as a provider-side server error.
 *
 * The phrase "status: internal server error" is not sufficient by itself. The
 * message qualifies only when it also carries an explicit `code: 500` marker,
 * or when the text remaining after the phrase is removed still matches one of
 * `ERROR_PATTERNS.serverError`.
 *
 * @param raw - Error message text to classify.
 * @returns `true` when the message is treated as a server error.
 */
export function isServerErrorMessage(raw: string): boolean {
  const value = normalizeLowercaseStringOrEmpty(raw);
  if (!value) {
    return false;
  }

  // Status prose explicitly paired with a 500 code always qualifies.
  if (STATUS_INTERNAL_SERVER_ERROR_WITH_500_RE.test(value)) {
    return true;
  }

  // Strip the status phrase before generic matching so the decision rests on
  // the rest of the message. Only the first occurrence is removed because the
  // pattern has no `g` flag.
  const scrubbed = value.replace(STATUS_INTERNAL_SERVER_ERROR_RE, "").trim();
  return scrubbed.length > 0 && matchesErrorPatterns(scrubbed, ERROR_PATTERNS.serverError);
}
|
||||
|
||||
Reference in New Issue
Block a user