From 398dd6e0b0915f6abd779cdfdba1ac11fc2dc47f Mon Sep 17 00:00:00 2001 From: hcl Date: Sat, 9 May 2026 18:48:29 +0800 Subject: [PATCH] fix(failover): stop retrying assistant-prefill format failures Summary: - classify assistant-prefill provider rejections as format errors - surface terminal format failover reasons instead of rotating profiles or falling back - refresh shared Swift protocol output from current main Verification: - pnpm test src/agents/pi-embedded-runner/run/failover-policy.test.ts src/agents/pi-embedded-helpers.isbillingerrormessage.test.ts - pnpm exec oxfmt --check --threads=1 CHANGELOG.md src/agents/pi-embedded-runner/run.ts src/agents/pi-embedded-runner/run/assistant-failover.ts src/agents/pi-embedded-runner/run/failover-policy.ts src/agents/pi-embedded-runner/run/failover-policy.test.ts src/agents/pi-embedded-helpers/failover-matches.ts src/agents/pi-embedded-helpers.isbillingerrormessage.test.ts - fnm exec --using=24.13.0 pnpm lint --threads=8 - pnpm protocol:check - GitHub CI on 678e92bcb2bb41539cb0d1d8f9ec230d82572727 --- CHANGELOG.md | 1 + .../OpenClawProtocol/GatewayModels.swift | 6 +- ...dded-helpers.isbillingerrormessage.test.ts | 9 +++ .../pi-embedded-helpers/failover-matches.ts | 6 ++ src/agents/pi-embedded-runner/run.ts | 1 + .../run/assistant-failover.ts | 1 + .../run/failover-policy.test.ts | 76 +++++++++++++++++++ .../pi-embedded-runner/run/failover-policy.ts | 26 ++++++- 8 files changed, 123 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 79664267d92..f0ab408c19b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -156,6 +156,7 @@ Docs: https://docs.openclaw.ai - QQBot: route gateway WebSocket connections through the ambient proxy agent so deployments with `https_proxy`, `HTTPS_PROXY`, or `HTTP_PROXY` can reach the QQ gateway. (#72961) Thanks @xialonglee. - Agents/subagents: treat `sessions_spawn` `model: "default"` as the default-model fallback and ignore ACP-only stream targets for native sub-agent spawns. Fixes #72078. (#72101) Thanks @xialonglee. +- Agents/failover: stop retrying assistant-prefill format rejections across auth profiles or model fallbacks, surfacing the deterministic provider error instead of requeueing the lane. Fixes #79688. (#79728) Thanks @hclsys. - Memory/QMD: warn with a manual stale collection removal hint when QMD reports a path/pattern conflict but `collection list` lacks verifiable metadata, avoiding unsafe stderr-only rebinds. Refs #71783. (#72297) Thanks @MonkeyLeeT. - Models/auth: make `openclaw models status --check` and dashboard auth health honor effective auth profile order while keeping stale profiles visible. (#79685) Thanks @nimbleenigma. - Agents/failover: classify bare `stream_read_error` streaming failures as transient timeouts so configured model fallback runs instead of surfacing the raw transport error. Fixes #79689. (#79692) Thanks @hekunwang. diff --git a/apps/shared/OpenClawKit/Sources/OpenClawProtocol/GatewayModels.swift b/apps/shared/OpenClawKit/Sources/OpenClawProtocol/GatewayModels.swift index 4cb68654591..ce01644eb65 100644 --- a/apps/shared/OpenClawKit/Sources/OpenClawProtocol/GatewayModels.swift +++ b/apps/shared/OpenClawKit/Sources/OpenClawProtocol/GatewayModels.swift @@ -3406,21 +3406,25 @@ public struct TalkSessionSubmitToolResultParams: Codable, Sendable { public let sessionid: String public let callid: String public let result: AnyCodable + public let options: [String: AnyCodable]? public init( sessionid: String, callid: String, - result: AnyCodable) + result: AnyCodable, + options: [String: AnyCodable]?) { self.sessionid = sessionid self.callid = callid self.result = result + self.options = options } private enum CodingKeys: String, CodingKey { case sessionid = "sessionId" case callid = "callId" case result + case options } } diff --git a/src/agents/pi-embedded-helpers.isbillingerrormessage.test.ts b/src/agents/pi-embedded-helpers.isbillingerrormessage.test.ts index 03100d852a8..d2fce872c4d 100644 --- a/src/agents/pi-embedded-helpers.isbillingerrormessage.test.ts +++ b/src/agents/pi-embedded-helpers.isbillingerrormessage.test.ts @@ -1153,6 +1153,15 @@ describe("classifyFailoverReason provider messages", () => { ), ).toBe("timeout"); expect(classifyFailoverReason("string should match pattern")).toBe("format"); + expect( + classifyFailoverReason( + "This model does not support assistant message prefill. The conversation must end with a user message.", + ), + ).toBe("format"); + expect( + classifyFailoverReason("LLM request rejected: does not support assistant message prefill"), + ).toBe("format"); + expect(classifyFailoverReason("conversation must end with a user message")).toBe("format"); expect(classifyFailoverReason("bad request")).toBeNull(); expect( classifyFailoverReason( diff --git a/src/agents/pi-embedded-helpers/failover-matches.ts b/src/agents/pi-embedded-helpers/failover-matches.ts index dd9e239a041..430cbebc494 100644 --- a/src/agents/pi-embedded-helpers/failover-matches.ts +++ b/src/agents/pi-embedded-helpers/failover-matches.ts @@ -218,6 +218,12 @@ const ERROR_PATTERNS = { "messages.1.content.1.tool_use.id", "invalid request format", /tool call id was.*must be/i, + // Prefill-strict models (e.g. claude-opus-4-7) reject requests that end + // with an assistant turn. The lane must not re-queue these — the same + // payload will fail identically on every retry, causing an infinite loop + // (#79688). + "does not support assistant message prefill", + "conversation must end with a user message", ], } as const; diff --git a/src/agents/pi-embedded-runner/run.ts b/src/agents/pi-embedded-runner/run.ts index 40301f0c9d9..da91920652e 100644 --- a/src/agents/pi-embedded-runner/run.ts +++ b/src/agents/pi-embedded-runner/run.ts @@ -2216,6 +2216,7 @@ export async function runEmbeddedPiAgent( const assistantFailoverDecision = resolveRunFailoverDecision({ stage: "assistant", + allowFormatRetry: cloudCodeAssistFormatError, aborted, externalAbort, fallbackConfigured, diff --git a/src/agents/pi-embedded-runner/run/assistant-failover.ts b/src/agents/pi-embedded-runner/run/assistant-failover.ts index d9b40cf9e66..db931f48201 100644 --- a/src/agents/pi-embedded-runner/run/assistant-failover.ts +++ b/src/agents/pi-embedded-runner/run/assistant-failover.ts @@ -181,6 +181,7 @@ export async function handleAssistantFailover(params: { decision = resolveRunFailoverDecision({ stage: "assistant", + allowFormatRetry: params.cloudCodeAssistFormatError, aborted: params.aborted, externalAbort: params.externalAbort, fallbackConfigured: params.fallbackConfigured, diff --git a/src/agents/pi-embedded-runner/run/failover-policy.test.ts b/src/agents/pi-embedded-runner/run/failover-policy.test.ts index bfa3c0a04c8..88048e7c6d0 100644 --- a/src/agents/pi-embedded-runner/run/failover-policy.test.ts +++ b/src/agents/pi-embedded-runner/run/failover-policy.test.ts @@ -61,6 +61,41 @@ describe("resolveRunFailoverDecision", () => { }); }); + it("surfaces deterministic prompt format failures instead of rotating or falling back", () => { + expect( + resolveRunFailoverDecision({ + stage: "prompt", + aborted: false, + externalAbort: false, + fallbackConfigured: true, + failoverFailure: true, + failoverReason: "format", + profileRotated: false, + }), + ).toEqual({ + action: "surface_error", + reason: "format", + }); + }); + + it("can still rotate explicitly retryable prompt format failures", () => { + expect( + resolveRunFailoverDecision({ + stage: "prompt", + allowFormatRetry: true, + aborted: false, + externalAbort: false, + fallbackConfigured: true, + failoverFailure: true, + failoverReason: "format", + profileRotated: false, + }), + ).toEqual({ + action: "rotate_profile", + reason: "format", + }); + }); + it("treats classified assistant-side 429s as rotation candidates even without error stopReason", () => { expect( resolveRunFailoverDecision({ @@ -81,6 +116,47 @@ describe("resolveRunFailoverDecision", () => { }); }); + it("surfaces deterministic assistant format failures instead of rotating or falling back", () => { + expect( + resolveRunFailoverDecision({ + stage: "assistant", + aborted: false, + externalAbort: false, + fallbackConfigured: true, + failoverFailure: true, + failoverReason: "format", + timedOut: false, + timedOutDuringCompaction: false, + timedOutDuringToolExecution: false, + profileRotated: false, + }), + ).toEqual({ + action: "surface_error", + reason: "format", + }); + }); + + it("can still rotate explicitly retryable assistant format failures", () => { + expect( + resolveRunFailoverDecision({ + stage: "assistant", + allowFormatRetry: true, + aborted: false, + externalAbort: false, + fallbackConfigured: true, + failoverFailure: true, + failoverReason: "format", + timedOut: false, + timedOutDuringCompaction: false, + timedOutDuringToolExecution: false, + profileRotated: false, + }), + ).toEqual({ + action: "rotate_profile", + reason: "format", + }); + }); + it("falls back after assistant rotation is exhausted", () => { expect( resolveRunFailoverDecision({ diff --git a/src/agents/pi-embedded-runner/run/failover-policy.ts b/src/agents/pi-embedded-runner/run/failover-policy.ts index 10c026d417c..82619369f0c 100644 --- a/src/agents/pi-embedded-runner/run/failover-policy.ts +++ b/src/agents/pi-embedded-runner/run/failover-policy.ts @@ -39,6 +39,7 @@ type RetryLimitDecisionParams = { type PromptDecisionParams = { stage: "prompt"; + allowFormatRetry?: boolean; aborted: boolean; externalAbort: boolean; fallbackConfigured: boolean; @@ -49,6 +50,7 @@ type PromptDecisionParams = { type AssistantDecisionParams = { stage: "assistant"; + allowFormatRetry?: boolean; aborted: boolean; externalAbort: boolean; fallbackConfigured: boolean; @@ -75,11 +77,25 @@ function shouldEscalateRetryLimit(reason: FailoverReason | null): boolean { ); } +function isTerminalFormatFailure(params: { + allowFormatRetry?: boolean; + failoverReason: FailoverReason | null; +}): boolean { + return params.failoverReason === "format" && params.allowFormatRetry !== true; +} + function shouldRotatePrompt(params: PromptDecisionParams): boolean { - return params.failoverFailure && params.failoverReason !== "timeout"; + return ( + params.failoverFailure && + params.failoverReason !== "timeout" && + !isTerminalFormatFailure(params) + ); } function shouldRotateAssistant(params: AssistantDecisionParams): boolean { + if (isTerminalFormatFailure(params)) { + return false; + } return ( (!params.aborted && (params.failoverFailure || params.failoverReason !== null)) || (params.timedOut && !params.timedOutDuringCompaction && !params.timedOutDuringToolExecution) @@ -128,7 +144,7 @@ export function resolveRunFailoverDecision(params: RunFailoverDecisionParams): R reason: params.failoverReason, }; } - if (params.fallbackConfigured && params.failoverFailure) { + if (params.fallbackConfigured && params.failoverFailure && !isTerminalFormatFailure(params)) { return { action: "fallback_model", reason: params.failoverReason ?? "unknown", @@ -146,6 +162,12 @@ export function resolveRunFailoverDecision(params: RunFailoverDecisionParams): R reason: params.failoverReason, }; } + if (isTerminalFormatFailure(params)) { + return { + action: "surface_error", + reason: params.failoverReason, + }; + } const assistantShouldRotate = shouldRotateAssistant(params); if (!params.profileRotated && assistantShouldRotate) { return {