From 0e586bb48a3175c8e6d00666a75b0d1782a0b091 Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Mon, 27 Apr 2026 12:49:30 +0100 Subject: [PATCH] fix(agents): improve fallback failure observability --- CHANGELOG.md | 1 + docs/concepts/model-failover.md | 4 +- src/agents/auth-profiles/types.ts | 3 + src/agents/auth-profiles/usage.ts | 9 ++- src/agents/failover-policy.test.ts | 18 +++++ src/agents/failover-policy.ts | 6 ++ src/agents/model-fallback-observation.ts | 77 +++++++++++++++++++ src/agents/model-fallback.probe.test.ts | 12 +++ ...dded-helpers.isbillingerrormessage.test.ts | 14 +++- src/agents/pi-embedded-helpers/errors.ts | 12 ++- src/agents/pi-embedded-helpers/types.ts | 3 + src/agents/runtime-plan/types.ts | 3 + src/auto-reply/fallback-state.ts | 9 ++- src/gateway/protocol/schema/cron.ts | 3 + 14 files changed, 166 insertions(+), 8 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4cb683019de..87bd19324ac 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -26,6 +26,7 @@ Docs: https://docs.openclaw.ai - Control UI: show loading, reload, and retry states when a lazy dashboard panel cannot load after an upgrade, so the Logs tab no longer appears blank on stale browser bundles. Fixes #72450. Thanks @sobergou. - Agents/reasoning: recover fully wrapped unclosed `<think>` replies that would otherwise sanitize to empty text while keeping strict stripping for closed reasoning blocks and unclosed tails after visible text. Fixes #37696; supersedes #51915. Thanks @druide67 and @okuyam2y. - Control UI/Gateway: bind WebChat handshakes to their active socket and reject post-close server registrations, so aborted connects no longer leave zombie clients or misleading duplicate WebSocket connection logs. Fixes #72753. Thanks @LumenFromTheFuture. 
+- Agents/fallback: split ambiguous provider failures into `empty_response`, `no_error_details`, and `unclassified`, and add flat fallback-step fields to structured fallback logs so primary-model failures stay visible when later fallbacks also fail. Fixes #71922; refs #71744. Thanks @andyk-ms and @nikolaykazakovvs-ux. - Plugins/Windows: normalize Windows absolute paths before handing bundled plugin modules to Jiti, so Feishu/Lark message sending no longer fails with unsupported `c:` ESM loader URLs. Fixes #72783. Thanks @jackychen-png. - CLI/doctor: run bundled plugin runtime-dependency repairs through the async npm installer with spinner/line progress and heartbeat updates, so long `openclaw doctor --fix` installs no longer look hung in TTY or piped output. Fixes #72775. Thanks @dfpalhano. - Feishu/Windows: normalize bundled channel sidecar loads before Jiti evaluates them, so Feishu outbound sends no longer fail with raw `C:` ESM loader errors on Windows. Fixes #72783. Thanks @jackychen-png. diff --git a/docs/concepts/model-failover.md b/docs/concepts/model-failover.md index 8c9984ca235..65074b4ec7a 100644 --- a/docs/concepts/model-failover.md +++ b/docs/concepts/model-failover.md @@ -203,7 +203,7 @@ Defaults: ## Model fallback -If all profiles for a provider fail, OpenClaw moves to the next model in `agents.defaults.model.fallbacks`. This applies to auth failures, rate limits, and timeouts that exhausted profile rotation (other errors do not advance fallback). +If all profiles for a provider fail, OpenClaw moves to the next model in `agents.defaults.model.fallbacks`. This applies to auth failures, rate limits, and timeouts that exhausted profile rotation (other errors do not advance fallback). 
Provider errors that do not expose enough detail are still labeled precisely in fallback state: `empty_response` means the provider returned no usable message or status, `no_error_details` means the provider explicitly returned `Unknown error (no error details in response)`, and `unclassified` means OpenClaw preserved the raw preview but no classifier matched it yet. Overloaded and rate-limit errors are handled more aggressively than billing cooldowns. By default, OpenClaw allows one same-provider auth-profile retry, then switches to the next configured model fallback without waiting. Provider-busy signals such as `ModelNotReadyException` land in that overloaded bucket. Tune this with `auth.cooldowns.overloadedProfileRotations`, `auth.cooldowns.overloadedBackoffMs`, and `auth.cooldowns.rateLimitedProfileRotations`. @@ -302,6 +302,8 @@ The persisted fallback override closes that window, and the narrow rollback keep - optional status/code - human-readable error summary +Structured `model_fallback_decision` logs also include flat `fallbackStep*` fields when a candidate fails, is skipped, or a later fallback succeeds. These fields make the attempted transition explicit (`fallbackStepFromModel`, `fallbackStepToModel`, `fallbackStepFromFailureReason`, `fallbackStepFromFailureDetail`, `fallbackStepFinalOutcome`) so log and diagnostic exporters can reconstruct the primary failure even when the terminal fallback also fails. + When every candidate fails, OpenClaw throws `FallbackSummaryError`. The outer reply runner can use that to build a more specific message such as "all models are temporarily rate-limited" and include the soonest cooldown expiry when one is known. 
That cooldown summary is model-aware: diff --git a/src/agents/auth-profiles/types.ts b/src/agents/auth-profiles/types.ts index 0459b640a16..7ca7b7c6378 100644 --- a/src/agents/auth-profiles/types.ts +++ b/src/agents/auth-profiles/types.ts @@ -61,6 +61,9 @@ export type AuthProfileFailureReason = | "timeout" | "model_not_found" | "session_expired" + | "empty_response" + | "no_error_details" + | "unclassified" | "unknown"; /** Per-profile usage statistics for round-robin and cooldown tracking */ diff --git a/src/agents/auth-profiles/usage.ts b/src/agents/auth-profiles/usage.ts index 6d292f0e1c8..5b1708d35a1 100644 --- a/src/agents/auth-profiles/usage.ts +++ b/src/agents/auth-profiles/usage.ts @@ -45,6 +45,9 @@ const FAILURE_REASON_PRIORITY: AuthProfileFailureReason[] = [ "overloaded", "timeout", "rate_limit", + "empty_response", + "no_error_details", + "unclassified", "unknown", ]; const FAILURE_REASON_SET = new Set(FAILURE_REASON_PRIORITY); @@ -89,7 +92,11 @@ function shouldProbeWhamForFailure( ): boolean { return ( normalizeProviderId(provider ?? 
"") === "openai-codex" && - (reason === "rate_limit" || reason === "unknown") + (reason === "rate_limit" || + reason === "empty_response" || + reason === "no_error_details" || + reason === "unclassified" || + reason === "unknown") ); } diff --git a/src/agents/failover-policy.test.ts b/src/agents/failover-policy.test.ts index b9931fb4e88..dbe9058c6d4 100644 --- a/src/agents/failover-policy.test.ts +++ b/src/agents/failover-policy.test.ts @@ -38,6 +38,24 @@ const CASES: ReasonCase[] = [ useTransientProbeSlot: true, preserveTransientProbeSlot: false, }, + { + reason: "empty_response", + allowCooldownProbe: true, + useTransientProbeSlot: true, + preserveTransientProbeSlot: false, + }, + { + reason: "no_error_details", + allowCooldownProbe: true, + useTransientProbeSlot: true, + preserveTransientProbeSlot: false, + }, + { + reason: "unclassified", + allowCooldownProbe: true, + useTransientProbeSlot: true, + preserveTransientProbeSlot: false, + }, { reason: "model_not_found", allowCooldownProbe: false, diff --git a/src/agents/failover-policy.ts b/src/agents/failover-policy.ts index babdb91cd00..62b1d51108d 100644 --- a/src/agents/failover-policy.ts +++ b/src/agents/failover-policy.ts @@ -8,6 +8,9 @@ export function shouldAllowCooldownProbeForReason( reason === "overloaded" || reason === "billing" || reason === "unknown" || + reason === "empty_response" || + reason === "no_error_details" || + reason === "unclassified" || reason === "timeout" ); } @@ -19,6 +22,9 @@ export function shouldUseTransientCooldownProbeSlot( reason === "rate_limit" || reason === "overloaded" || reason === "unknown" || + reason === "empty_response" || + reason === "no_error_details" || + reason === "unclassified" || reason === "timeout" ); } diff --git a/src/agents/model-fallback-observation.ts b/src/agents/model-fallback-observation.ts index 0a2c2eeef39..6f5c203fe1a 100644 --- a/src/agents/model-fallback-observation.ts +++ b/src/agents/model-fallback-observation.ts @@ -27,6 +27,68 @@ function 
buildErrorObservationFields(error?: string): { }; } +type FallbackStepOutcome = "next_fallback" | "succeeded" | "chain_exhausted"; + +function formatModelRef(candidate: ModelCandidate): string { + return `${candidate.provider}/${candidate.model}`; +} + +function buildFallbackStepFields(params: { + decision: "skip_candidate" | "candidate_failed" | "candidate_succeeded"; + candidate: ModelCandidate; + reason?: FailoverReason | null; + error?: string; + nextCandidate?: ModelCandidate; + attempt?: number; + previousAttempts?: FallbackAttempt[]; +}): + | { + fallbackStepType: "fallback_step"; + fallbackStepFromModel: string; + fallbackStepToModel?: string; + fallbackStepFromFailureReason?: FailoverReason; + fallbackStepFromFailureDetail?: string; + fallbackStepChainPosition?: number; + fallbackStepFinalOutcome: FallbackStepOutcome; + } + | undefined { + const lastPreviousAttempt = params.previousAttempts?.at(-1); + if (params.decision === "candidate_succeeded") { + if (!lastPreviousAttempt) { + return undefined; + } + return { + fallbackStepType: "fallback_step", + fallbackStepFromModel: `${lastPreviousAttempt.provider}/${lastPreviousAttempt.model}`, + fallbackStepToModel: formatModelRef(params.candidate), + ...(lastPreviousAttempt.reason + ? { fallbackStepFromFailureReason: lastPreviousAttempt.reason } + : {}), + ...(lastPreviousAttempt.error + ? { fallbackStepFromFailureDetail: lastPreviousAttempt.error } + : {}), + ...(typeof params.attempt === "number" ? { fallbackStepChainPosition: params.attempt } : {}), + fallbackStepFinalOutcome: "succeeded", + }; + } + + const observed = buildErrorObservationFields(params.error); + return { + fallbackStepType: "fallback_step", + fallbackStepFromModel: formatModelRef(params.candidate), + ...(params.nextCandidate ? { fallbackStepToModel: formatModelRef(params.nextCandidate) } : {}), + ...(params.reason ? { fallbackStepFromFailureReason: params.reason } : {}), + ...((observed.providerErrorMessagePreview ?? 
observed.errorPreview) + ? { + fallbackStepFromFailureDetail: + observed.providerErrorMessagePreview ?? observed.errorPreview, + } + : {}), + ...(typeof params.attempt === "number" ? { fallbackStepChainPosition: params.attempt } : {}), + fallbackStepFinalOutcome: params.nextCandidate ? "next_fallback" : "chain_exhausted", + }; +} + export function logModelFallbackDecision(params: { decision: | "skip_candidate" @@ -57,6 +119,20 @@ export function logModelFallbackDecision(params: { const reasonText = params.reason ?? "unknown"; const observedError = buildErrorObservationFields(params.error); const detailText = observedError.providerErrorMessagePreview ?? observedError.errorPreview; + const fallbackStepFields = + params.decision === "skip_candidate" || + params.decision === "candidate_failed" || + params.decision === "candidate_succeeded" + ? buildFallbackStepFields({ + decision: params.decision, + candidate: params.candidate, + reason: params.reason, + error: params.error, + nextCandidate: params.nextCandidate, + attempt: params.attempt, + previousAttempts: params.previousAttempts, + }) + : undefined; const providerErrorTypeSuffix = observedError.providerErrorType ? 
` providerErrorType=${sanitizeForLog(observedError.providerErrorType)}` : ""; @@ -76,6 +152,7 @@ export function logModelFallbackDecision(params: { status: params.status, code: params.code, ...observedError, + ...fallbackStepFields, nextCandidateProvider: params.nextCandidate?.provider, nextCandidateModel: params.nextCandidate?.model, isPrimary: params.isPrimary, diff --git a/src/agents/model-fallback.probe.test.ts b/src/agents/model-fallback.probe.test.ts index ff9a0e045fd..90b2a6fa05f 100644 --- a/src/agents/model-fallback.probe.test.ts +++ b/src/agents/model-fallback.probe.test.ts @@ -346,6 +346,12 @@ describe("runWithModelFallback – probe logic", () => { requestedModelMatched: true, nextCandidateProvider: "anthropic", nextCandidateModel: "claude-haiku-3-5", + fallbackStepType: "fallback_step", + fallbackStepFromModel: "openai/gpt-4.1-mini", + fallbackStepToModel: "anthropic/claude-haiku-3-5", + fallbackStepFromFailureReason: "rate_limit", + fallbackStepChainPosition: 1, + fallbackStepFinalOutcome: "next_fallback", }), expect.objectContaining({ event: "model_fallback_decision", @@ -354,6 +360,12 @@ describe("runWithModelFallback – probe logic", () => { candidateModel: "claude-haiku-3-5", isPrimary: false, requestedModelMatched: false, + fallbackStepType: "fallback_step", + fallbackStepFromModel: "openai/gpt-4.1-mini", + fallbackStepToModel: "anthropic/claude-haiku-3-5", + fallbackStepFromFailureReason: "rate_limit", + fallbackStepChainPosition: 2, + fallbackStepFinalOutcome: "succeeded", }), ]), ); diff --git a/src/agents/pi-embedded-helpers.isbillingerrormessage.test.ts b/src/agents/pi-embedded-helpers.isbillingerrormessage.test.ts index 44d76dc5bfe..f8d9171ccc2 100644 --- a/src/agents/pi-embedded-helpers.isbillingerrormessage.test.ts +++ b/src/agents/pi-embedded-helpers.isbillingerrormessage.test.ts @@ -733,9 +733,9 @@ describe("classifyFailoverReason", () => { ).toBeNull(); }); - it("classifies OpenAI Responses unknown-no-details message as unknown", () => { 
+ it("classifies OpenAI Responses unknown-no-details message distinctly", () => { const message = "Unknown error (no error details in response)"; - expect(classifyFailoverReason(message)).toBe("unknown"); + expect(classifyFailoverReason(message)).toBe("no_error_details"); expect(isFailoverErrorMessage(message)).toBe(true); }); @@ -1376,6 +1376,16 @@ describe("classifyProviderRuntimeFailureKind", () => { ).toBe("replay_invalid"); }); + it("splits ambiguous provider runtime failures instead of collapsing to unknown", () => { + expect(classifyProviderRuntimeFailureKind({})).toBe("empty_response"); + expect(classifyProviderRuntimeFailureKind("Unknown error (no error details in response)")).toBe( + "no_error_details", + ); + expect(classifyProviderRuntimeFailureKind("provider sent a strange opaque failure")).toBe( + "unclassified", + ); + }); + it("does not classify generic config errors that mention proxy settings as proxy failures", () => { expect( classifyProviderRuntimeFailureKind( diff --git a/src/agents/pi-embedded-helpers/errors.ts b/src/agents/pi-embedded-helpers/errors.ts index df440252230..12c719818dd 100644 --- a/src/agents/pi-embedded-helpers/errors.ts +++ b/src/agents/pi-embedded-helpers/errors.ts @@ -270,6 +270,9 @@ export type ProviderRuntimeFailureKind = | "schema" | "sandbox_blocked" | "replay_invalid" + | "empty_response" + | "no_error_details" + | "unclassified" | "unknown"; const BILLING_402_HINTS = [ @@ -851,7 +854,7 @@ function classifyFailoverClassificationFromMessage( return toReasonClassification("format"); } if (isExactUnknownNoDetailsError(raw)) { - return toReasonClassification("unknown"); + return toReasonClassification("no_error_details"); } if (isTimeoutErrorMessage(raw)) { return toReasonClassification("timeout"); @@ -900,7 +903,7 @@ export function classifyProviderRuntimeFailureKind( const status = inferSignalStatus(normalizedSignal); if (!message && typeof status !== "number") { - return "unknown"; + return "empty_response"; } if 
(normalizedSignal.code === "refresh_contention") { return "refresh_contention"; @@ -958,7 +961,10 @@ export function classifyProviderRuntimeFailureKind( if (message && isTimeoutTransportErrorMessage(message, status)) { return "timeout"; } - return "unknown"; + if (message && isExactUnknownNoDetailsError(message)) { + return "no_error_details"; + } + return "unclassified"; } export function formatAssistantErrorText( diff --git a/src/agents/pi-embedded-helpers/types.ts b/src/agents/pi-embedded-helpers/types.ts index 5ae47d672d3..93074c940c2 100644 --- a/src/agents/pi-embedded-helpers/types.ts +++ b/src/agents/pi-embedded-helpers/types.ts @@ -10,4 +10,7 @@ export type FailoverReason = | "timeout" | "model_not_found" | "session_expired" + | "empty_response" + | "no_error_details" + | "unclassified" | "unknown"; diff --git a/src/agents/runtime-plan/types.ts b/src/agents/runtime-plan/types.ts index cd413c07a0c..a7aab908619 100644 --- a/src/agents/runtime-plan/types.ts +++ b/src/agents/runtime-plan/types.ts @@ -25,6 +25,9 @@ export type AgentRuntimeFailoverReason = | "timeout" | "model_not_found" | "session_expired" + | "empty_response" + | "no_error_details" + | "unclassified" | "unknown"; export type AgentRuntimeConfig = unknown; diff --git a/src/auto-reply/fallback-state.ts b/src/auto-reply/fallback-state.ts index 7ee4e65e553..8145c6abea3 100644 --- a/src/auto-reply/fallback-state.ts +++ b/src/auto-reply/fallback-state.ts @@ -9,7 +9,14 @@ export { } from "../status/fallback-notice-state.js"; const FALLBACK_REASON_PART_MAX = 80; -const TRANSIENT_FALLBACK_REASONS = new Set(["rate_limit", "overloaded", "timeout"]); +const TRANSIENT_FALLBACK_REASONS = new Set([ + "rate_limit", + "overloaded", + "timeout", + "empty_response", + "no_error_details", + "unclassified", +]); const TRANSIENT_ERROR_DETAIL_HINT_RE = /\b(?:429|5\d\d|too many requests|usage limit|quota|try again in|retry[- ]after|seconds?|minutes?|hours?|temporarily unavailable|overloaded|service 
unavailable|throttl)\b/i; diff --git a/src/gateway/protocol/schema/cron.ts b/src/gateway/protocol/schema/cron.ts index 5c09154e1b0..8b7cb3dc5f8 100644 --- a/src/gateway/protocol/schema/cron.ts +++ b/src/gateway/protocol/schema/cron.ts @@ -65,6 +65,9 @@ const CronFailoverReasonSchema = Type.Union([ Type.Literal("billing"), Type.Literal("timeout"), Type.Literal("model_not_found"), + Type.Literal("empty_response"), + Type.Literal("no_error_details"), + Type.Literal("unclassified"), Type.Literal("unknown"), ]); const CronCommonOptionalFields = {