fix(failover): stop retrying assistant-prefill format failures

Summary:
- classify assistant-prefill provider rejections as format errors
- surface terminal format failover reasons instead of rotating profiles or falling back
- refresh shared Swift protocol output from current main

Verification:
- pnpm test src/agents/pi-embedded-runner/run/failover-policy.test.ts src/agents/pi-embedded-helpers.isbillingerrormessage.test.ts
- pnpm exec oxfmt --check --threads=1 CHANGELOG.md src/agents/pi-embedded-runner/run.ts src/agents/pi-embedded-runner/run/assistant-failover.ts src/agents/pi-embedded-runner/run/failover-policy.ts src/agents/pi-embedded-runner/run/failover-policy.test.ts src/agents/pi-embedded-helpers/failover-matches.ts src/agents/pi-embedded-helpers.isbillingerrormessage.test.ts
- fnm exec --using=24.13.0 pnpm lint --threads=8
- pnpm protocol:check
- GitHub CI on 678e92bcb2
This commit is contained in:
hcl
2026-05-09 18:48:29 +08:00
committed by GitHub
parent 01cf27bc2e
commit 398dd6e0b0
8 changed files with 123 additions and 3 deletions

View File

@@ -156,6 +156,7 @@ Docs: https://docs.openclaw.ai
- QQBot: route gateway WebSocket connections through the ambient proxy agent so deployments with `https_proxy`, `HTTPS_PROXY`, or `HTTP_PROXY` can reach the QQ gateway. (#72961) Thanks @xialonglee.
- Agents/subagents: treat `sessions_spawn` `model: "default"` as the default-model fallback and ignore ACP-only stream targets for native sub-agent spawns. Fixes #72078. (#72101) Thanks @xialonglee.
- Agents/failover: stop retrying assistant-prefill format rejections across auth profiles or model fallbacks, surfacing the deterministic provider error instead of requeueing the lane. Fixes #79688. (#79728) Thanks @hclsys.
- Memory/QMD: warn with a manual stale collection removal hint when QMD reports a path/pattern conflict but `collection list` lacks verifiable metadata, avoiding unsafe stderr-only rebinds. Refs #71783. (#72297) Thanks @MonkeyLeeT.
- Models/auth: make `openclaw models status --check` and dashboard auth health honor effective auth profile order while keeping stale profiles visible. (#79685) Thanks @nimbleenigma.
- Agents/failover: classify bare `stream_read_error` streaming failures as transient timeouts so configured model fallback runs instead of surfacing the raw transport error. Fixes #79689. (#79692) Thanks @hekunwang.

View File

@@ -3406,21 +3406,25 @@ public struct TalkSessionSubmitToolResultParams: Codable, Sendable {
public let sessionid: String
public let callid: String
public let result: AnyCodable
public let options: [String: AnyCodable]?
/// Creates the parameter payload by storing each argument in the
/// same-named property.
///
/// - Parameters:
///   - sessionid: Session identifier; coded under the `sessionId` key.
///   - callid: Call identifier; coded under the `callId` key.
///   - result: Result value, stored unchanged.
///   - options: Optional dictionary of extra options, stored unchanged;
///     may be `nil`.
public init(
    sessionid: String,
    callid: String,
    result: AnyCodable,
    options: [String: AnyCodable]?)
{
    self.sessionid = sessionid
    self.callid = callid
    self.result = result
    self.options = options
}
private enum CodingKeys: String, CodingKey {
case sessionid = "sessionId"
case callid = "callId"
case result
case options
}
}

View File

@@ -1153,6 +1153,15 @@ describe("classifyFailoverReason provider messages", () => {
),
).toBe("timeout");
expect(classifyFailoverReason("string should match pattern")).toBe("format");
expect(
classifyFailoverReason(
"This model does not support assistant message prefill. The conversation must end with a user message.",
),
).toBe("format");
expect(
classifyFailoverReason("LLM request rejected: does not support assistant message prefill"),
).toBe("format");
expect(classifyFailoverReason("conversation must end with a user message")).toBe("format");
expect(classifyFailoverReason("bad request")).toBeNull();
expect(
classifyFailoverReason(

View File

@@ -218,6 +218,12 @@ const ERROR_PATTERNS = {
"messages.1.content.1.tool_use.id",
"invalid request format",
/tool call id was.*must be/i,
// Prefill-strict models (e.g. claude-opus-4-7) reject requests that end
// with an assistant turn. The lane must not re-queue these — the same
// payload will fail identically on every retry, causing an infinite loop
// (#79688).
"does not support assistant message prefill",
"conversation must end with a user message",
],
} as const;

View File

@@ -2216,6 +2216,7 @@ export async function runEmbeddedPiAgent(
const assistantFailoverDecision = resolveRunFailoverDecision({
stage: "assistant",
allowFormatRetry: cloudCodeAssistFormatError,
aborted,
externalAbort,
fallbackConfigured,

View File

@@ -181,6 +181,7 @@ export async function handleAssistantFailover(params: {
decision = resolveRunFailoverDecision({
stage: "assistant",
allowFormatRetry: params.cloudCodeAssistFormatError,
aborted: params.aborted,
externalAbort: params.externalAbort,
fallbackConfigured: params.fallbackConfigured,

View File

@@ -61,6 +61,41 @@ describe("resolveRunFailoverDecision", () => {
});
});
it("surfaces deterministic prompt format failures instead of rotating or falling back", () => {
  // allowFormatRetry is intentionally omitted, even though a fallback is
  // configured and no profile has been rotated yet.
  const decision = resolveRunFailoverDecision({
    stage: "prompt",
    aborted: false,
    externalAbort: false,
    fallbackConfigured: true,
    failoverFailure: true,
    failoverReason: "format",
    profileRotated: false,
  });

  expect(decision).toEqual({ action: "surface_error", reason: "format" });
});
it("can still rotate explicitly retryable prompt format failures", () => {
  // Same inputs as a plain prompt format failure, but with the explicit
  // allowFormatRetry opt-in set.
  const decision = resolveRunFailoverDecision({
    stage: "prompt",
    allowFormatRetry: true,
    aborted: false,
    externalAbort: false,
    fallbackConfigured: true,
    failoverFailure: true,
    failoverReason: "format",
    profileRotated: false,
  });

  expect(decision).toEqual({ action: "rotate_profile", reason: "format" });
});
it("treats classified assistant-side 429s as rotation candidates even without error stopReason", () => {
expect(
resolveRunFailoverDecision({
@@ -81,6 +116,47 @@ describe("resolveRunFailoverDecision", () => {
});
});
it("surfaces deterministic assistant format failures instead of rotating or falling back", () => {
  // allowFormatRetry is intentionally omitted; no timeout flags are set.
  const decision = resolveRunFailoverDecision({
    stage: "assistant",
    aborted: false,
    externalAbort: false,
    fallbackConfigured: true,
    failoverFailure: true,
    failoverReason: "format",
    timedOut: false,
    timedOutDuringCompaction: false,
    timedOutDuringToolExecution: false,
    profileRotated: false,
  });

  expect(decision).toEqual({ action: "surface_error", reason: "format" });
});
it("can still rotate explicitly retryable assistant format failures", () => {
  // Same inputs as a plain assistant format failure, but with the explicit
  // allowFormatRetry opt-in set.
  const decision = resolveRunFailoverDecision({
    stage: "assistant",
    allowFormatRetry: true,
    aborted: false,
    externalAbort: false,
    fallbackConfigured: true,
    failoverFailure: true,
    failoverReason: "format",
    timedOut: false,
    timedOutDuringCompaction: false,
    timedOutDuringToolExecution: false,
    profileRotated: false,
  });

  expect(decision).toEqual({ action: "rotate_profile", reason: "format" });
});
it("falls back after assistant rotation is exhausted", () => {
expect(
resolveRunFailoverDecision({

View File

@@ -39,6 +39,7 @@ type RetryLimitDecisionParams = {
type PromptDecisionParams = {
stage: "prompt";
allowFormatRetry?: boolean;
aborted: boolean;
externalAbort: boolean;
fallbackConfigured: boolean;
@@ -49,6 +50,7 @@ type PromptDecisionParams = {
type AssistantDecisionParams = {
stage: "assistant";
allowFormatRetry?: boolean;
aborted: boolean;
externalAbort: boolean;
fallbackConfigured: boolean;
@@ -75,11 +77,25 @@ function shouldEscalateRetryLimit(reason: FailoverReason | null): boolean {
);
}
/**
 * Reports whether a failure is a format failure that has not been
 * explicitly opted into retrying.
 *
 * @param params.allowFormatRetry Only the literal `true` counts as an
 *   opt-in; `false` and `undefined` are treated the same.
 * @param params.failoverReason The classified failure reason, or `null`.
 * @returns `true` when the reason is `"format"` and retry was not opted into.
 */
function isTerminalFormatFailure(params: {
  allowFormatRetry?: boolean;
  failoverReason: FailoverReason | null;
}): boolean {
  if (params.failoverReason !== "format") {
    return false;
  }
  const retryOptedIn = params.allowFormatRetry === true;
  return !retryOptedIn;
}
/**
 * Decides whether a prompt-stage failure is a candidate for profile rotation.
 *
 * Rotation is considered only when all of the following hold:
 * - `params.failoverFailure` is truthy,
 * - the reason is not `"timeout"`,
 * - the failure is not a terminal format failure (a `"format"` reason
 *   without `allowFormatRetry: true`).
 *
 * @param params Prompt-stage decision inputs.
 * @returns `true` when the failure qualifies for rotation.
 */
function shouldRotatePrompt(params: PromptDecisionParams): boolean {
  return (
    params.failoverFailure &&
    params.failoverReason !== "timeout" &&
    !isTerminalFormatFailure(params)
  );
}
function shouldRotateAssistant(params: AssistantDecisionParams): boolean {
if (isTerminalFormatFailure(params)) {
return false;
}
return (
(!params.aborted && (params.failoverFailure || params.failoverReason !== null)) ||
(params.timedOut && !params.timedOutDuringCompaction && !params.timedOutDuringToolExecution)
@@ -128,7 +144,7 @@ export function resolveRunFailoverDecision(params: RunFailoverDecisionParams): R
reason: params.failoverReason,
};
}
if (params.fallbackConfigured && params.failoverFailure) {
if (params.fallbackConfigured && params.failoverFailure && !isTerminalFormatFailure(params)) {
return {
action: "fallback_model",
reason: params.failoverReason ?? "unknown",
@@ -146,6 +162,12 @@ export function resolveRunFailoverDecision(params: RunFailoverDecisionParams): R
reason: params.failoverReason,
};
}
if (isTerminalFormatFailure(params)) {
return {
action: "surface_error",
reason: params.failoverReason,
};
}
const assistantShouldRotate = shouldRotateAssistant(params);
if (!params.profileRotated && assistantShouldRotate) {
return {