mirror of
https://github.com/openclaw/openclaw.git
synced 2026-04-05 22:32:12 +00:00
fix(agents): classify generic provider errors for failover (#59325)
* fix(agents): classify generic provider errors for failover

  Anthropic returns a bare 'An unknown error occurred' during API instability, and OpenRouter wraps upstream failures as 'Provider returned error'. Neither message was recognized by the failover classifier, so the error surfaced directly to users instead of triggering the configured fallback chain.

  Add both patterns to the serverError classifier so they are classified as transient server errors (timeout) and trigger model failover.

  Closes #49706
  Closes #45834

* fix(agents): scope unknown-error failover by provider

* docs(changelog): note provider-scoped unknown-error failover

---------

Co-authored-by: Aaron Zhu <aaron@Aarons-MacBook-Air.local>
Co-authored-by: Altay <altay@uinaf.dev>
This commit is contained in:
@@ -110,6 +110,7 @@ Docs: https://docs.openclaw.ai
|
||||
- Gateway/device auth: reuse cached device-token scopes only for cached-token reconnects, while keeping explicit `deviceToken` scope requests and empty-cache fallbacks intact so reconnects preserve `operator.read` without breaking explicit auth flows. (#46032) Thanks @caicongyang.
|
||||
- Google Gemini CLI auth: improve OAuth credential discovery across Windows nvm and Homebrew libexec installs, and align Code Assist metadata so Gemini login stops failing on packaged CLI layouts. (#40729) Thanks @hughcube.
|
||||
- Mattermost/config schema: accept `groups.*.requireMention` again so existing Mattermost configs no longer fail strict validation after upgrade. (#58271) Thanks @MoerAI.
|
||||
- Agents/failover: scope Anthropic `An unknown error occurred` failover matching by provider so generic internal unknown-error text no longer triggers retryable timeout fallback. (#59325) Thanks @aaron-he-zhu.
|
||||
- Providers/OpenRouter failover: classify `403 "Key limit exceeded"` spending-limit responses as billing so model fallback continues instead of stopping on generic auth. (#59892) Thanks @rockcent.
|
||||
- Device pairing/security: keep non-operator device scope checks bound to the requested role prefix so bootstrap verification cannot redeem `operator.*` scopes through `node` auth. (#57258) Thanks @jlapenna.
|
||||
- Gateway/device pairing: require non-admin paired-device sessions to manage only their own device for token rotate/revoke and paired-device removal, blocking cross-device token theft inside pairing-scoped sessions. (#50627) Thanks @coygeek.
|
||||
|
||||
@@ -73,8 +73,8 @@ export async function runCliAgent(params: RunCliAgentParams): Promise<EmbeddedPi
|
||||
throw err;
|
||||
}
|
||||
const message = err instanceof Error ? err.message : String(err);
|
||||
if (isFailoverErrorMessage(message)) {
|
||||
const reason = classifyFailoverReason(message) ?? "unknown";
|
||||
if (isFailoverErrorMessage(message, { provider: params.provider })) {
|
||||
const reason = classifyFailoverReason(message, { provider: params.provider }) ?? "unknown";
|
||||
const status = resolveFailoverStatus(reason);
|
||||
throw new FailoverError(message, {
|
||||
reason,
|
||||
|
||||
@@ -196,6 +196,38 @@ describe("failover-error", () => {
|
||||
).toBe("overloaded");
|
||||
});
|
||||
|
||||
// Regression test for #49706: with the provider identified as Anthropic, the
// bare unknown-error text must resolve to the retryable "timeout" reason.
it("classifies Anthropic bare 'unknown error' as timeout for failover (#49706)", () => {
  expect(
    resolveFailoverReasonFromError({
      provider: "anthropic",
      message: "An unknown error occurred",
    }),
  ).toBe("timeout");
});
|
||||
|
||||
// Guards the provider scoping: generic unknown-error wording must not resolve
// to a failover reason unless the error is attributed to Anthropic.
it("does not classify generic internal unknown-error text as failover timeout", () => {
  // Internal wording, no provider attached.
  expect(
    resolveFailoverReasonFromError({
      message: "LLM request failed with an unknown error.",
    }),
  ).toBeNull();
  // Anthropic's exact wording, but no provider attached.
  expect(
    resolveFailoverReasonFromError({
      message: "An unknown error occurred",
    }),
  ).toBeNull();
  // Anthropic's exact wording attributed to a different provider.
  expect(
    resolveFailoverReasonFromError({
      provider: "openrouter",
      message: "An unknown error occurred",
    }),
  ).toBeNull();
  // OpenRouter's generic wrapper text, no provider attached.
  expect(
    resolveFailoverReasonFromError({
      message: "Provider returned error",
    }),
  ).toBeNull();
});
|
||||
it("treats 400 insufficient_quota payloads as billing instead of format", () => {
|
||||
expect(
|
||||
resolveFailoverReasonFromError({
|
||||
|
||||
@@ -132,6 +132,22 @@ function getErrorCode(err: unknown): string | undefined {
|
||||
return findErrorProperty(err, readDirectErrorCode);
|
||||
}
|
||||
|
||||
/**
 * Reads the `provider` field directly off an error-like value.
 *
 * @param err - Arbitrary thrown value. Nullish values and non-objects yield `undefined`.
 * @returns The whitespace-trimmed `provider` string, or `undefined` when the
 *   field is missing, is not a string, or is blank after trimming.
 */
function readDirectProvider(err: unknown): string | undefined {
  if (!err || typeof err !== "object") {
    return undefined;
  }
  const provider = (err as { provider?: unknown }).provider;
  if (typeof provider !== "string") {
    return undefined;
  }
  const trimmed = provider.trim();
  // An empty string is falsy, so a blank provider collapses to `undefined`.
  return trimmed || undefined;
}
|
||||
|
||||
/**
 * Resolves the provider identifier associated with an error.
 *
 * Delegates to `findErrorProperty`, using `readDirectProvider` as the reader
 * applied to each candidate value.
 * NOTE(review): which values `findErrorProperty` visits (e.g. nested causes)
 * is defined elsewhere — confirm there.
 *
 * @param err - Arbitrary thrown value.
 * @returns The provider string if one is found, otherwise `undefined`.
 */
function getProvider(err: unknown): string | undefined {
  return findErrorProperty(err, readDirectProvider);
}
|
||||
|
||||
function readDirectErrorMessage(err: unknown): string | undefined {
|
||||
if (err instanceof Error) {
|
||||
return err.message || undefined;
|
||||
@@ -207,6 +223,7 @@ function normalizeErrorSignal(err: unknown): FailoverSignal {
|
||||
status: getStatusCode(err),
|
||||
code: getErrorCode(err),
|
||||
message: message || undefined,
|
||||
provider: getProvider(err),
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
@@ -638,6 +638,21 @@ describe("classifyFailoverReason", () => {
|
||||
),
|
||||
).toBeNull();
|
||||
});
|
||||
// With the provider option set to Anthropic, the bare unknown-error text is a
// retryable "timeout".
it("classifies Anthropic bare 'unknown error' as timeout for failover", () => {
  expect(classifyFailoverReason("An unknown error occurred", { provider: "anthropic" })).toBe(
    "timeout",
  );
});
|
||||
|
||||
// Without an Anthropic provider, none of these generic messages may classify
// as a failover reason.
it("does not classify generic internal unknown-error text as timeout", () => {
  expect(classifyFailoverReason("An unknown error occurred")).toBeNull();
  expect(
    classifyFailoverReason("An unknown error occurred", { provider: "openrouter" }),
  ).toBeNull();
  expect(classifyFailoverReason("Provider returned error")).toBeNull();
  expect(classifyFailoverReason("Unknown error")).toBeNull();
  expect(classifyFailoverReason("LLM request failed with an unknown error.")).toBeNull();
});
|
||||
});
|
||||
|
||||
describe("isFailoverErrorMessage", () => {
|
||||
|
||||
@@ -371,6 +371,7 @@ export type FailoverSignal = {
|
||||
status?: number;
|
||||
code?: string;
|
||||
message?: string;
|
||||
provider?: string;
|
||||
};
|
||||
|
||||
export type FailoverClassification =
|
||||
@@ -629,7 +630,19 @@ function classifyFailoverReasonFromCode(raw: string | undefined): FailoverReason
|
||||
}
|
||||
}
|
||||
|
||||
function classifyFailoverClassificationFromMessage(raw: string): FailoverClassification | null {
|
||||
/**
 * Reports whether a provider identifier refers to Anthropic.
 *
 * The identifier is trimmed and lower-cased, then tested for the substring
 * `"anthropic"`, so any identifier that merely contains that text also matches.
 *
 * @param provider - Provider identifier; may be omitted.
 * @returns `true` when the normalized identifier contains `"anthropic"`;
 *   `false` for a missing, empty, or non-matching identifier.
 */
function isAnthropicProvider(provider?: string): boolean {
  // The optional chain short-circuits to `undefined` when no provider is given.
  return provider?.trim().toLowerCase().includes("anthropic") ?? false;
}
|
||||
|
||||
/**
 * Detects the generic "An unknown error occurred" text when it is attributed
 * to Anthropic.
 *
 * The text is matched case-insensitively as a substring, and only when
 * `isAnthropicProvider(provider)` holds; the same wording with any other
 * provider, or with none, does not match.
 *
 * @param raw - Raw error message text.
 * @param provider - Provider identifier the error is attributed to, if known.
 * @returns `true` only for Anthropic-attributed messages containing the phrase.
 */
function isAnthropicGenericUnknownError(raw: string, provider?: string): boolean {
  return isAnthropicProvider(provider) && raw.toLowerCase().includes("an unknown error occurred");
}
|
||||
|
||||
function classifyFailoverClassificationFromMessage(
|
||||
raw: string,
|
||||
provider?: string,
|
||||
): FailoverClassification | null {
|
||||
if (isImageDimensionErrorMessage(raw)) {
|
||||
return null;
|
||||
}
|
||||
@@ -677,6 +690,9 @@ function classifyFailoverClassificationFromMessage(raw: string): FailoverClassif
|
||||
if (isAuthErrorMessage(raw)) {
|
||||
return toReasonClassification("auth");
|
||||
}
|
||||
if (isAnthropicGenericUnknownError(raw, provider)) {
|
||||
return toReasonClassification("timeout");
|
||||
}
|
||||
if (isServerErrorMessage(raw)) {
|
||||
return toReasonClassification("timeout");
|
||||
}
|
||||
@@ -703,7 +719,7 @@ export function classifyFailoverSignal(signal: FailoverSignal): FailoverClassifi
|
||||
? signal.status
|
||||
: extractLeadingHttpStatus(signal.message?.trim() ?? "")?.code;
|
||||
const messageClassification = signal.message
|
||||
? classifyFailoverClassificationFromMessage(signal.message)
|
||||
? classifyFailoverClassificationFromMessage(signal.message, signal.provider)
|
||||
: null;
|
||||
const statusClassification = classifyFailoverClassificationFromHttpStatus(
|
||||
inferredStatus,
|
||||
@@ -1207,24 +1223,28 @@ function isCliSessionExpiredErrorMessage(raw: string): boolean {
|
||||
);
|
||||
}
|
||||
|
||||
/**
 * Classifies a raw error message into a failover reason.
 *
 * A leading HTTP status is extracted from the trimmed message and passed to
 * `classifyFailoverSignal` together with the untrimmed message and the
 * optional provider; the resulting classification is converted to a reason.
 *
 * @param raw - Raw error message text.
 * @param opts - Optional context. `opts.provider` is forwarded as the
 *   signal's provider so provider-scoped message rules can apply.
 * @returns The failover reason, or `null` when the message is not classified.
 */
export function classifyFailoverReason(
  raw: string,
  opts?: { provider?: string },
): FailoverReason | null {
  const trimmed = raw.trim();
  const leadingStatus = extractLeadingHttpStatus(trimmed);
  return failoverReasonFromClassification(
    classifyFailoverSignal({
      status: leadingStatus?.code,
      // The untrimmed text is forwarded; only status extraction uses `trimmed`.
      message: raw,
      provider: opts?.provider,
    }),
  );
}
|
||||
|
||||
/**
 * Reports whether a raw error message classifies as any failover reason.
 *
 * @param raw - Raw error message text.
 * @param opts - Optional context, passed through unchanged to
 *   `classifyFailoverReason` (including `opts.provider`).
 * @returns `true` when `classifyFailoverReason` returns a non-null reason.
 */
export function isFailoverErrorMessage(raw: string, opts?: { provider?: string }): boolean {
  return classifyFailoverReason(raw, opts) !== null;
}
|
||||
|
||||
/**
 * Reports whether an assistant message ended in an error that should trigger
 * failover.
 *
 * @param msg - Assistant message to inspect; may be `undefined`.
 * @returns `false` when there is no message or its `stopReason` is not
 *   `"error"`; otherwise whether its `errorMessage` (empty string when absent)
 *   classifies as a failover error for the message's own provider.
 */
export function isFailoverAssistantError(msg: AssistantMessage | undefined): boolean {
  if (!msg || msg.stopReason !== "error") {
    return false;
  }
  // Pass the provider so provider-scoped message rules are evaluated; a
  // provider-less call here would leave them unreachable.
  return isFailoverErrorMessage(msg.errorMessage ?? "", { provider: msg.provider });
}
|
||||
|
||||
@@ -1048,7 +1048,7 @@ export async function runEmbeddedPiAgent(
|
||||
};
|
||||
}
|
||||
const promptFailoverReason =
|
||||
promptErrorDetails.reason ?? classifyFailoverReason(errorText);
|
||||
promptErrorDetails.reason ?? classifyFailoverReason(errorText, { provider });
|
||||
const promptProfileFailureReason =
|
||||
resolveAuthProfileFailureReason(promptFailoverReason);
|
||||
await maybeMarkAuthProfileFailure({
|
||||
@@ -1161,7 +1161,12 @@ export async function runEmbeddedPiAgent(
|
||||
const rateLimitFailure = isRateLimitAssistantError(lastAssistant);
|
||||
const billingFailure = isBillingAssistantError(lastAssistant);
|
||||
const failoverFailure = isFailoverAssistantError(lastAssistant);
|
||||
const assistantFailoverReason = classifyFailoverReason(lastAssistant?.errorMessage ?? "");
|
||||
const assistantFailoverReason = classifyFailoverReason(
|
||||
lastAssistant?.errorMessage ?? "",
|
||||
{
|
||||
provider: lastAssistant?.provider,
|
||||
},
|
||||
);
|
||||
const assistantProfileFailureReason =
|
||||
resolveAuthProfileFailureReason(assistantFailoverReason);
|
||||
const cloudCodeAssistFormatError = attempt.cloudCodeAssistFormatError;
|
||||
|
||||
@@ -47,7 +47,9 @@ export function handleAgentEnd(ctx: EmbeddedPiSubscribeContext) {
|
||||
model: lastAssistant.model,
|
||||
});
|
||||
const rawError = lastAssistant.errorMessage?.trim();
|
||||
const failoverReason = classifyFailoverReason(rawError ?? "");
|
||||
const failoverReason = classifyFailoverReason(rawError ?? "", {
|
||||
provider: lastAssistant.provider,
|
||||
});
|
||||
const errorText = (friendlyError || lastAssistant.errorMessage || "LLM request failed.").trim();
|
||||
const observedError = buildApiErrorObservationFields(rawError);
|
||||
const safeErrorText =
|
||||
|
||||
Reference in New Issue
Block a user