fix(agents): classify generic provider errors for failover (#59325)

* fix(agents): classify generic provider errors for failover

Anthropic returns bare 'An unknown error occurred' during API instability
and OpenRouter wraps upstream failures as 'Provider returned error'. Neither
message was recognized by the failover classifier, so the error surfaced
directly to users instead of triggering the configured fallback chain.

Add both patterns to the serverError classifier so they are classified as
transient server errors (timeout) and trigger model failover.

Closes #49706
Closes #45834

* fix(agents): scope unknown-error failover by provider

* docs(changelog): note provider-scoped unknown-error failover

---------

Co-authored-by: Aaron Zhu <aaron@Aarons-MacBook-Air.local>
Co-authored-by: Altay <altay@uinaf.dev>
This commit is contained in:
Aaron Zhu
2026-04-04 23:11:46 +08:00
committed by GitHub
parent 8a6da9d488
commit 983909f826
8 changed files with 103 additions and 11 deletions

View File

@@ -110,6 +110,7 @@ Docs: https://docs.openclaw.ai
- Gateway/device auth: reuse cached device-token scopes only for cached-token reconnects, while keeping explicit `deviceToken` scope requests and empty-cache fallbacks intact so reconnects preserve `operator.read` without breaking explicit auth flows. (#46032) Thanks @caicongyang.
- Google Gemini CLI auth: improve OAuth credential discovery across Windows nvm and Homebrew libexec installs, and align Code Assist metadata so Gemini login stops failing on packaged CLI layouts. (#40729) Thanks @hughcube.
- Mattermost/config schema: accept `groups.*.requireMention` again so existing Mattermost configs no longer fail strict validation after upgrade. (#58271) Thanks @MoerAI.
- Agents/failover: classify Anthropic's bare `An unknown error occurred` response as a retryable timeout only when the provider is Anthropic, so generic internal unknown-error text from other sources no longer triggers timeout fallback. (#59325) Thanks @aaron-he-zhu.
- Providers/OpenRouter failover: classify `403 "Key limit exceeded"` spending-limit responses as billing so model fallback continues instead of stopping on generic auth. (#59892) Thanks @rockcent.
- Device pairing/security: keep non-operator device scope checks bound to the requested role prefix so bootstrap verification cannot redeem `operator.*` scopes through `node` auth. (#57258) Thanks @jlapenna.
- Gateway/device pairing: require non-admin paired-device sessions to manage only their own device for token rotate/revoke and paired-device removal, blocking cross-device token theft inside pairing-scoped sessions. (#50627) Thanks @coygeek.

View File

@@ -73,8 +73,8 @@ export async function runCliAgent(params: RunCliAgentParams): Promise<EmbeddedPi
throw err;
}
const message = err instanceof Error ? err.message : String(err);
if (isFailoverErrorMessage(message)) {
const reason = classifyFailoverReason(message) ?? "unknown";
if (isFailoverErrorMessage(message, { provider: params.provider })) {
const reason = classifyFailoverReason(message, { provider: params.provider }) ?? "unknown";
const status = resolveFailoverStatus(reason);
throw new FailoverError(message, {
reason,

View File

@@ -196,6 +196,38 @@ describe("failover-error", () => {
).toBe("overloaded");
});
it("classifies Anthropic bare 'unknown error' as timeout for failover (#49706)", () => {
  // Error signal carrying both the Anthropic provider id and the bare message.
  const anthropicSignal = {
    provider: "anthropic",
    message: "An unknown error occurred",
  };
  const reason = resolveFailoverReasonFromError(anthropicSignal);
  expect(reason).toBe("timeout");
});
it("does not classify generic internal unknown-error text as failover timeout", () => {
  // Signals that must NOT resolve to a failover reason: different wording,
  // no provider at all, or a provider other than Anthropic.
  const nonFailoverSignals = [
    { message: "LLM request failed with an unknown error." },
    { message: "An unknown error occurred" },
    { provider: "openrouter", message: "An unknown error occurred" },
    { message: "Provider returned error" },
  ];
  for (const signal of nonFailoverSignals) {
    expect(resolveFailoverReasonFromError(signal)).toBeNull();
  }
});
it("treats 400 insufficient_quota payloads as billing instead of format", () => {
expect(
resolveFailoverReasonFromError({

View File

@@ -132,6 +132,22 @@ function getErrorCode(err: unknown): string | undefined {
return findErrorProperty(err, readDirectErrorCode);
}
/**
 * Reads the `provider` field directly off an error-like value.
 *
 * @param err - Any thrown or collected value.
 * @returns The trimmed provider string, or `undefined` when `err` is not a
 *   non-null object, when `provider` is not a string, or when it is blank
 *   after trimming.
 */
function readDirectProvider(err: unknown): string | undefined {
  if (typeof err !== "object" || err === null) {
    return undefined;
  }
  const candidate = (err as { provider?: unknown }).provider;
  const normalized = typeof candidate === "string" ? candidate.trim() : "";
  // A whitespace-only provider is treated the same as a missing one.
  return normalized.length > 0 ? normalized : undefined;
}
/**
 * Resolves the provider associated with an error by handing
 * `readDirectProvider` to `findErrorProperty`.
 *
 * How the error is traversed is decided by `findErrorProperty`, which is
 * defined outside this view.
 *
 * @param err - Any thrown or collected value.
 * @returns Whatever `findErrorProperty` yields for the provider reader.
 */
function getProvider(err: unknown): string | undefined {
  const provider = findErrorProperty(err, readDirectProvider);
  return provider;
}
function readDirectErrorMessage(err: unknown): string | undefined {
if (err instanceof Error) {
return err.message || undefined;
@@ -207,6 +223,7 @@ function normalizeErrorSignal(err: unknown): FailoverSignal {
status: getStatusCode(err),
code: getErrorCode(err),
message: message || undefined,
provider: getProvider(err),
};
}

View File

@@ -638,6 +638,21 @@ describe("classifyFailoverReason", () => {
),
).toBeNull();
});
it("classifies Anthropic bare 'unknown error' as timeout for failover", () => {
  // Same bare message as the negative cases, but with the Anthropic provider hint.
  const reason = classifyFailoverReason("An unknown error occurred", { provider: "anthropic" });
  expect(reason).toBe("timeout");
});
it("does not classify generic internal unknown-error text as timeout", () => {
  const bareMessage = "An unknown error occurred";

  // Without a provider, or with a non-Anthropic one, the bare message is ignored.
  expect(classifyFailoverReason(bareMessage)).toBeNull();
  expect(classifyFailoverReason(bareMessage, { provider: "openrouter" })).toBeNull();

  // Other generic wordings are never classified when no provider is given.
  const otherGenericMessages = [
    "Provider returned error",
    "Unknown error",
    "LLM request failed with an unknown error.",
  ];
  for (const message of otherGenericMessages) {
    expect(classifyFailoverReason(message)).toBeNull();
  }
});
});
describe("isFailoverErrorMessage", () => {

View File

@@ -371,6 +371,7 @@ export type FailoverSignal = {
status?: number;
code?: string;
message?: string;
provider?: string;
};
export type FailoverClassification =
@@ -629,7 +630,19 @@ function classifyFailoverReasonFromCode(raw: string | undefined): FailoverReason
}
}
function classifyFailoverClassificationFromMessage(raw: string): FailoverClassification | null {
/**
 * Tells whether a provider id refers to Anthropic.
 *
 * The id is trimmed and lower-cased, then checked for the substring
 * "anthropic", so any id that merely embeds that word also matches.
 *
 * @param provider - Provider id, if known.
 * @returns `true` when the normalized id contains "anthropic"; `false` for a
 *   missing or blank id.
 */
function isAnthropicProvider(provider?: string): boolean {
  if (provider == null) {
    return false;
  }
  const providerId = provider.trim().toLowerCase();
  return providerId.includes("anthropic");
}
/**
 * Detects the bare "An unknown error occurred" message, but only when the
 * provider is Anthropic.
 *
 * @param raw - Raw error message text.
 * @param provider - Provider id, if known.
 * @returns `true` when the provider passes `isAnthropicProvider` and the
 *   message contains "an unknown error occurred" (case-insensitive).
 */
function isAnthropicGenericUnknownError(raw: string, provider?: string): boolean {
  // Provider check first: without it the message text is never inspected.
  if (!isAnthropicProvider(provider)) {
    return false;
  }
  const loweredMessage = raw.toLowerCase();
  return loweredMessage.includes("an unknown error occurred");
}
function classifyFailoverClassificationFromMessage(
raw: string,
provider?: string,
): FailoverClassification | null {
if (isImageDimensionErrorMessage(raw)) {
return null;
}
@@ -677,6 +690,9 @@ function classifyFailoverClassificationFromMessage(raw: string): FailoverClassif
if (isAuthErrorMessage(raw)) {
return toReasonClassification("auth");
}
if (isAnthropicGenericUnknownError(raw, provider)) {
return toReasonClassification("timeout");
}
if (isServerErrorMessage(raw)) {
return toReasonClassification("timeout");
}
@@ -703,7 +719,7 @@ export function classifyFailoverSignal(signal: FailoverSignal): FailoverClassifi
? signal.status
: extractLeadingHttpStatus(signal.message?.trim() ?? "")?.code;
const messageClassification = signal.message
? classifyFailoverClassificationFromMessage(signal.message)
? classifyFailoverClassificationFromMessage(signal.message, signal.provider)
: null;
const statusClassification = classifyFailoverClassificationFromHttpStatus(
inferredStatus,
@@ -1207,24 +1223,28 @@ function isCliSessionExpiredErrorMessage(raw: string): boolean {
);
}
export function classifyFailoverReason(raw: string): FailoverReason | null {
/**
 * Classifies a raw error message into a failover reason.
 *
 * Builds a failover signal from the message, any HTTP status found at the
 * start of the trimmed message, and the optional provider, then reduces the
 * resulting classification to a reason.
 *
 * @param raw - Raw error message text; passed on to the signal untrimmed.
 * @param opts - Optional hints; `provider` is forwarded on the signal.
 * @returns The failover reason, or `null` when the classification yields none.
 */
export function classifyFailoverReason(
  raw: string,
  opts?: { provider?: string },
): FailoverReason | null {
  // Only the status lookup uses the trimmed text.
  const leadingStatus = extractLeadingHttpStatus(raw.trim());
  const signal = {
    status: leadingStatus?.code,
    message: raw,
    provider: opts?.provider,
  };
  const classification = classifyFailoverSignal(signal);
  return failoverReasonFromClassification(classification);
}
export function isFailoverErrorMessage(raw: string): boolean {
return classifyFailoverReason(raw) !== null;
/**
 * Reports whether a raw error message maps to any failover reason.
 *
 * @param raw - Raw error message text.
 * @param opts - Optional hints, passed through to `classifyFailoverReason`.
 * @returns `true` when `classifyFailoverReason` returns a non-null reason.
 */
export function isFailoverErrorMessage(raw: string, opts?: { provider?: string }): boolean {
  const reason = classifyFailoverReason(raw, opts);
  return reason !== null;
}
export function isFailoverAssistantError(msg: AssistantMessage | undefined): boolean {
if (!msg || msg.stopReason !== "error") {
return false;
}
return isFailoverErrorMessage(msg.errorMessage ?? "");
return isFailoverErrorMessage(msg.errorMessage ?? "", { provider: msg.provider });
}

View File

@@ -1048,7 +1048,7 @@ export async function runEmbeddedPiAgent(
};
}
const promptFailoverReason =
promptErrorDetails.reason ?? classifyFailoverReason(errorText);
promptErrorDetails.reason ?? classifyFailoverReason(errorText, { provider });
const promptProfileFailureReason =
resolveAuthProfileFailureReason(promptFailoverReason);
await maybeMarkAuthProfileFailure({
@@ -1161,7 +1161,12 @@ export async function runEmbeddedPiAgent(
const rateLimitFailure = isRateLimitAssistantError(lastAssistant);
const billingFailure = isBillingAssistantError(lastAssistant);
const failoverFailure = isFailoverAssistantError(lastAssistant);
const assistantFailoverReason = classifyFailoverReason(lastAssistant?.errorMessage ?? "");
const assistantFailoverReason = classifyFailoverReason(
lastAssistant?.errorMessage ?? "",
{
provider: lastAssistant?.provider,
},
);
const assistantProfileFailureReason =
resolveAuthProfileFailureReason(assistantFailoverReason);
const cloudCodeAssistFormatError = attempt.cloudCodeAssistFormatError;

View File

@@ -47,7 +47,9 @@ export function handleAgentEnd(ctx: EmbeddedPiSubscribeContext) {
model: lastAssistant.model,
});
const rawError = lastAssistant.errorMessage?.trim();
const failoverReason = classifyFailoverReason(rawError ?? "");
const failoverReason = classifyFailoverReason(rawError ?? "", {
provider: lastAssistant.provider,
});
const errorText = (friendlyError || lastAssistant.errorMessage || "LLM request failed.").trim();
const observedError = buildApiErrorObservationFields(rawError);
const safeErrorText =