Files
openclaw/src/agents/failover-error.test.ts
Altay 6e962d8b9e fix(agents): handle overloaded failover separately (#38301)
* fix(agents): skip auth-profile failure on overload

* fix(agents): note overload auth-profile fallback fix

* fix(agents): classify overloaded failures separately

* fix(agents): back off before overload failover

* fix(agents): tighten overload probe and backoff state

* fix(agents): persist overloaded cooldown across runs

* fix(agents): tighten overloaded status handling

* test(agents): add overload regression coverage

* fix(agents): restore runner imports after rebase

* test(agents): add overload fallback integration coverage

* fix(agents): harden overloaded failover abort handling

* test(agents): tighten overload classifier coverage

* test(agents): cover all-overloaded fallback exhaustion

* fix(cron): retry overloaded fallback summaries

* fix(cron): treat HTTP 529 as overloaded retry
2026-03-07 01:42:11 +03:00

322 lines
12 KiB
TypeScript

import { describe, expect, it } from "vitest";
import {
coerceToFailoverError,
describeFailoverError,
isTimeoutError,
resolveFailoverReasonFromError,
resolveFailoverStatus,
} from "./failover-error.js";
// Provider error fixtures. Each one mirrors an error shape documented by the
// provider (or reported in a linked issue); the source is cited above each value.

// OpenAI HTTP 429 rate-limit text:
// https://help.openai.com/en/articles/5955604-how-can-i-solve-429-too-many-requests-errors
const OPENAI_RATE_LIMIT_MESSAGE =
  "Rate limit reached for gpt-4.1-mini in organization org_test on requests per min. Limit: 3.000000 / min. Current: 3.000000 / min.";

// Anthropic overloaded_error body: https://docs.anthropic.com/en/api/errors
// Serialized from an object literal; the key order below is the serialized order.
const ANTHROPIC_OVERLOADED_PAYLOAD = JSON.stringify({
  type: "error",
  error: { type: "overloaded_error", message: "Overloaded" },
  request_id: "req_test",
});

// Gemini RESOURCE_EXHAUSTED text from the troubleshooting guide:
// https://ai.google.dev/gemini-api/docs/troubleshooting
const GEMINI_RESOURCE_EXHAUSTED_MESSAGE =
  "RESOURCE_EXHAUSTED: Resource has been exhausted (e.g. check quota).";

// OpenRouter HTTP 402 billing text: https://openrouter.ai/docs/api-reference/errors
const OPENROUTER_CREDITS_MESSAGE = "Payment Required: insufficient credits";

// Anthropic/OpenAI-compatible insufficient_quota body that arrives under HTTP 400,
// taken from https://github.com/openclaw/openclaw/issues/23440
const INSUFFICIENT_QUOTA_PAYLOAD = JSON.stringify({
  type: "error",
  error: {
    type: "insufficient_quota",
    message: "Your account has insufficient quota balance to run this request.",
  },
});

// ZhipuAI/GLM quota-exhausted log line reported in #33785:
// https://github.com/openclaw/openclaw/issues/33785
const ZHIPUAI_WEEKLY_MONTHLY_LIMIT_EXHAUSTED_MESSAGE =
  "LLM error 1310: Weekly/Monthly Limit Exhausted. Your limit will reset at 2026-03-06 22:19:54 (request_id: 20260303141547610b7f574d1b44cb)";

// AWS Bedrock 429 ThrottlingException and 503 ServiceUnavailable texts:
// https://docs.aws.amazon.com/bedrock/latest/userguide/troubleshooting-api-error-codes.html
const BEDROCK_THROTTLING_EXCEPTION_MESSAGE =
  "ThrottlingException: Your request was denied due to exceeding the account quotas for Amazon Bedrock.";
const BEDROCK_SERVICE_UNAVAILABLE_MESSAGE =
  "ServiceUnavailable: The service is temporarily unable to handle the request.";

// Groq 429 and 503 texts from the error-code reference: https://console.groq.com/docs/errors
const GROQ_TOO_MANY_REQUESTS_MESSAGE =
  "429 Too Many Requests: Too many requests were sent in a given timeframe.";
const GROQ_SERVICE_UNAVAILABLE_MESSAGE =
  "503 Service Unavailable: The server is temporarily unable to handle the request due to overloading or maintenance.";
// Suite for the helpers imported from ./failover-error.js: reason classification
// from HTTP status, message text and error codes, timeout detection, coercion
// into FailoverError, reason -> status mapping, and describing non-Error values.
describe("failover-error", () => {
  it("infers failover reason from HTTP status", () => {
    // A 402 with no message at all is expected to be billing.
    expect(resolveFailoverReasonFromError({ status: 402 })).toBe("billing");

    const paymentRequiredCases = [
      {
        // Anthropic Claude Max plan surfaces rate limits as HTTP 402 (#30484)
        message: "HTTP 402: request reached organization usage limit, try again later",
        reason: "rate_limit",
      },
      {
        // Explicit billing messages on 402 stay classified as billing
        message: "insufficient credits — please top up your account",
        reason: "billing",
      },
      {
        // Ambiguous "quota exceeded" + billing signal → billing wins
        message: "HTTP 402: You have exceeded your current quota. Please add more credits.",
        reason: "billing",
      },
    ];
    for (const { message, reason } of paymentRequiredCases) {
      expect(resolveFailoverReasonFromError({ status: 402, message })).toBe(reason);
    }

    // The status may also arrive as a string under `statusCode`.
    expect(resolveFailoverReasonFromError({ statusCode: "429" })).toBe("rate_limit");

    const bareStatusCases = [
      { status: 403, reason: "auth" },
      { status: 408, reason: "timeout" },
      { status: 400, reason: "format" },
    ];
    for (const { status, reason } of bareStatusCases) {
      expect(resolveFailoverReasonFromError({ status })).toBe(reason);
    }

    // Keep the status-only path behavior-preserving and conservative.
    expect(resolveFailoverReasonFromError({ status: 500 })).toBeNull();
    for (const status of [502, 503, 504]) {
      expect(resolveFailoverReasonFromError({ status })).toBe("timeout");
    }
    for (const status of [521, 522, 523, 524]) {
      expect(resolveFailoverReasonFromError({ status })).toBeNull();
    }
    expect(resolveFailoverReasonFromError({ status: 529 })).toBe("overloaded");
  });

  it("classifies documented provider error shapes at the error boundary", () => {
    // One row per documented provider fixture: status + message -> expected reason.
    const documentedCases = [
      { status: 429, message: OPENAI_RATE_LIMIT_MESSAGE, reason: "rate_limit" },
      { status: 529, message: ANTHROPIC_OVERLOADED_PAYLOAD, reason: "overloaded" },
      { status: 429, message: GEMINI_RESOURCE_EXHAUSTED_MESSAGE, reason: "rate_limit" },
      { status: 402, message: OPENROUTER_CREDITS_MESSAGE, reason: "billing" },
      { status: 429, message: BEDROCK_THROTTLING_EXCEPTION_MESSAGE, reason: "rate_limit" },
      { status: 503, message: BEDROCK_SERVICE_UNAVAILABLE_MESSAGE, reason: "timeout" },
      { status: 429, message: GROQ_TOO_MANY_REQUESTS_MESSAGE, reason: "rate_limit" },
      { status: 503, message: GROQ_SERVICE_UNAVAILABLE_MESSAGE, reason: "overloaded" },
    ];
    for (const { status, message, reason } of documentedCases) {
      expect(resolveFailoverReasonFromError({ status, message })).toBe(reason);
    }
  });

  it("keeps status-only 503s conservative unless the payload is clearly overloaded", () => {
    const genericOutage = { status: 503, message: "Internal database error" };
    const overloadedOutage = {
      status: 503,
      message: '{"error":{"message":"The model is overloaded. Please try later"}}',
    };

    expect(resolveFailoverReasonFromError(genericOutage)).toBe("timeout");
    expect(resolveFailoverReasonFromError(overloadedOutage)).toBe("overloaded");
  });

  it("treats 400 insufficient_quota payloads as billing instead of format", () => {
    const quotaError = { status: 400, message: INSUFFICIENT_QUOTA_PAYLOAD };
    expect(resolveFailoverReasonFromError(quotaError)).toBe("billing");
  });

  it("treats zhipuai weekly/monthly limit exhausted as rate_limit", () => {
    const limitMessages = [
      ZHIPUAI_WEEKLY_MONTHLY_LIMIT_EXHAUSTED_MESSAGE,
      "LLM error: monthly limit reached",
    ];
    for (const message of limitMessages) {
      expect(resolveFailoverReasonFromError({ message })).toBe("rate_limit");
    }
  });

  it("treats overloaded provider payloads as overloaded", () => {
    // Message only, with no status to lean on.
    const overloadedError = { message: ANTHROPIC_OVERLOADED_PAYLOAD };
    expect(resolveFailoverReasonFromError(overloadedError)).toBe("overloaded");
  });

  it("keeps raw-text 402 weekly/monthly limit errors in billing", () => {
    const rawTextError = { message: "402 Payment Required: Weekly/Monthly Limit Exhausted" };
    expect(resolveFailoverReasonFromError(rawTextError)).toBe("billing");
  });

  it("infers format errors from error messages", () => {
    const formatError = {
      message: "invalid request format: messages.1.content.1.tool_use.id",
    };
    expect(resolveFailoverReasonFromError(formatError)).toBe("format");
  });

  it("infers timeout from common node error codes", () => {
    for (const code of ["ETIMEDOUT", "ECONNRESET"]) {
      expect(resolveFailoverReasonFromError({ code })).toBe("timeout");
    }
  });

  it("infers timeout from abort/error stop-reason messages", () => {
    const stopReasonMessages = [
      "Unhandled stop reason: abort",
      "Unhandled stop reason: error",
      "stop reason: abort",
      "stop reason: error",
      "reason: abort",
      "reason: error",
    ];
    for (const message of stopReasonMessages) {
      expect(resolveFailoverReasonFromError({ message })).toBe("timeout");
    }
  });

  it("infers timeout from connection/network error messages", () => {
    const networkMessages = [
      "Connection error.",
      "fetch failed",
      "Network error: ECONNREFUSED",
      "dial tcp: lookup api.example.com: no such host (ENOTFOUND)",
      "temporary dns failure EAI_AGAIN",
    ];
    for (const message of networkMessages) {
      expect(resolveFailoverReasonFromError({ message })).toBe("timeout");
    }
  });

  it("treats AbortError reason=abort as timeout", () => {
    // Decorate a plain Error so it carries the AbortError name and a reason string.
    const abortError = new Error("aborted");
    Object.assign(abortError, { name: "AbortError", reason: "reason: abort" });

    expect(isTimeoutError(abortError)).toBe(true);
  });

  it("coerces failover-worthy errors into FailoverError with metadata", () => {
    const context = { provider: "anthropic", model: "claude-opus-4-5" };
    const failover = coerceToFailoverError("credit balance too low", context);

    expect(failover?.name).toBe("FailoverError");
    expect(failover?.reason).toBe("billing");
    expect(failover?.status).toBe(402);
    expect(failover?.provider).toBe("anthropic");
    expect(failover?.model).toBe("claude-opus-4-5");
  });

  it("maps overloaded to a 503 fallback status", () => {
    const status = resolveFailoverStatus("overloaded");
    expect(status).toBe(503);
  });

  it("coerces format errors with a 400 status", () => {
    const context = { provider: "google", model: "cloud-code-assist" };
    const failover = coerceToFailoverError("invalid request format", context);

    expect(failover?.reason).toBe("format");
    expect(failover?.status).toBe(400);
  });

  it("401/403 with generic message still returns auth (backward compat)", () => {
    const genericAuthFailures = [
      { status: 401, message: "Unauthorized" },
      { status: 403, message: "Forbidden" },
    ];
    for (const failure of genericAuthFailures) {
      expect(resolveFailoverReasonFromError(failure)).toBe("auth");
    }
  });

  it("401 with permanent auth message returns auth_permanent", () => {
    const invalidKey = { status: 401, message: "invalid_api_key" };
    expect(resolveFailoverReasonFromError(invalidKey)).toBe("auth_permanent");
  });

  it("403 with revoked key message returns auth_permanent", () => {
    const revokedKey = { status: 403, message: "api key revoked" };
    expect(resolveFailoverReasonFromError(revokedKey)).toBe("auth_permanent");
  });

  it("resolveFailoverStatus maps auth_permanent to 403", () => {
    const status = resolveFailoverStatus("auth_permanent");
    expect(status).toBe(403);
  });

  it("coerces permanent auth error with correct reason", () => {
    const invalidKey = { status: 401, message: "invalid_api_key" };
    const context = { provider: "anthropic", model: "claude-opus-4-6" };
    const failover = coerceToFailoverError(invalidKey, context);

    expect(failover?.reason).toBe("auth_permanent");
    expect(failover?.provider).toBe("anthropic");
  });

  it("403 permission_error returns auth_permanent", () => {
    const permissionDenied = {
      status: 403,
      message:
        "permission_error: OAuth authentication is currently not allowed for this organization.",
    };
    expect(resolveFailoverReasonFromError(permissionDenied)).toBe("auth_permanent");
  });

  it("permission_error in error message string classifies as auth_permanent", () => {
    // The input is a bare string here rather than an object with a status field.
    const rawMessage =
      "HTTP 403 permission_error: OAuth authentication is currently not allowed for this organization.";
    const failover = coerceToFailoverError(rawMessage, {
      provider: "anthropic",
      model: "claude-opus-4-6",
    });

    expect(failover?.reason).toBe("auth_permanent");
  });

  it("'not allowed for this organization' classifies as auth_permanent", () => {
    const rawMessage = "OAuth authentication is currently not allowed for this organization";
    const failover = coerceToFailoverError(rawMessage, {
      provider: "anthropic",
      model: "claude-opus-4-6",
    });

    expect(failover?.reason).toBe("auth_permanent");
  });

  it("describes non-Error values consistently", () => {
    const { message, reason } = describeFailoverError(123);

    expect(message).toBe("123");
    expect(reason).toBeUndefined();
  });
});