fix(agents): retry thinking-only errored turns (#92191)

Retry replay-safe reasoning-only provider errors before assistant failover while preserving classified fallback and terminal-output ownership. Adds deterministic Anthropic gateway fault-injection coverage and focused regression tests.

Co-authored-by: ai-hpc <mail.speedy.hpc@hotmail.com>
This commit is contained in:
NVIDIAN
2026-06-14 09:39:27 -07:00
committed by GitHub
parent 364461949d
commit ecaebfc51b
10 changed files with 739 additions and 58 deletions

View File

@@ -149,6 +149,7 @@ const TINY_PNG_BASE64 =
"iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mP8/x8AAwMCAO7Z0nQAAAAASUVORK5CYII=";
// Case-insensitive prompt markers for individual mock QA scenarios.
// NOTE(review): presumably each one is tested against the request text to pick
// a scripted response; only the Anthropic thinking-error usage is visible here.
const QA_REASONING_ONLY_RECOVERY_PROMPT_RE = /reasoning-only continuation qa check/i;
const QA_REASONING_ONLY_SIDE_EFFECT_PROMPT_RE = /reasoning-only after write safety check/i;
// Tested against the combined request text in buildMessagesPayload to enter the
// injected Anthropic thinking-error scenario.
const QA_ANTHROPIC_THINKING_ERROR_RECOVERY_PROMPT_RE = /anthropic thinking error qa check/i;
const QA_THINKING_VISIBILITY_OFF_PROMPT_RE = /qa thinking visibility check off/i;
const QA_THINKING_VISIBILITY_MAX_PROMPT_RE = /qa thinking visibility check max/i;
const QA_EMPTY_RESPONSE_RECOVERY_PROMPT_RE = /empty response continuation qa check/i;
@@ -189,6 +190,7 @@ const QA_GROUP_AUDIO_MIN_MULTIPART_BODY_CHARS = 48_000;
const QA_MCP_CODE_MODE_API_FILE_PROMPT_RE = /mcp code mode api file qa check/i;
/**
 * Mutable scenario progress created once in startQaMockOpenAiServer and passed
 * to the payload builders so multi-request QA scenarios can track their phase.
 */
type MockScenarioState = {
  // Starts at 0; set to 1 once the Anthropic thinking-error scenario has
  // injected its stream failure, so the following request gets a normal answer.
  anthropicThinkingErrorPhase: number;
  // NOTE(review): presumably a step counter for the subagent fan-out scenario — confirm against its handler.
  subagentFanoutPhase: number;
  // NOTE(review): presumably records that the handoff scenario already spawned its subagent — confirm.
  subagentHandoffSpawned: boolean;
};
@@ -3128,6 +3130,90 @@ function buildAnthropicMessageResponse(params: {
};
}
// Fixture values for the injected Anthropic thinking-error QA scenario: the
// thinking text, its signature, and the provider error message.
const QA_ANTHROPIC_THINKING_ERROR_TEXT =
  "QA replay-safe read completed, but the provider stream failed after signed thinking.";
const QA_ANTHROPIC_THINKING_ERROR_SIGNATURE = "qa_signed_thinking_block_91953";
const QA_ANTHROPIC_THINKING_ERROR_MESSAGE = "QA injected provider stream failure";

/**
 * Builds the non-streaming body for the injected provider failure.
 *
 * @param params.model Model id echoed back in the body; an empty value falls
 *   back to "claude-opus-4-8".
 * @returns An `error` envelope carrying an `api_error` with the QA failure
 *   message, plus the resolved model id.
 */
function buildAnthropicThinkingErrorResponse(params: { model: string }): Record<string, unknown> {
  // `||` (not `??`) is deliberate here: an empty model string also takes the default.
  const model = params.model || "claude-opus-4-8";
  const error = {
    type: "api_error",
    message: QA_ANTHROPIC_THINKING_ERROR_MESSAGE,
  };
  return { type: "error", error, model };
}
/**
 * Builds the streaming event sequence for the injected provider failure.
 *
 * The sequence is: message start, a single thinking block (opened empty, then
 * filled by a thinking delta and a signature delta, then closed), a usage
 * delta, and finally an `error` event. No text or tool block is emitted.
 *
 * @param params.model Model id reported in `message_start`; an empty value
 *   falls back to "claude-opus-4-8".
 * @returns The ordered list of mock stream events.
 */
function buildAnthropicThinkingErrorStreamEvents(params: {
  model: string;
}): AnthropicStreamEvent[] {
  // The message id gets a random hex suffix on every call.
  const randomSuffix = Math.floor(Math.random() * 1_000_000).toString(16);

  const messageStart: AnthropicStreamEvent = {
    type: "message_start",
    message: {
      id: `msg_mock_${randomSuffix}`,
      type: "message",
      role: "assistant",
      model: params.model || "claude-opus-4-8",
      content: [],
      stop_reason: null,
      stop_sequence: null,
      usage: {
        input_tokens: 64,
        output_tokens: 0,
      },
    },
  };

  const thinkingBlockStart: AnthropicStreamEvent = {
    type: "content_block_start",
    index: 0,
    content_block: {
      type: "thinking",
      thinking: "",
      signature: "",
    },
  };

  const thinkingTextDelta: AnthropicStreamEvent = {
    type: "content_block_delta",
    index: 0,
    delta: {
      type: "thinking_delta",
      thinking: QA_ANTHROPIC_THINKING_ERROR_TEXT,
    },
  };

  const thinkingSignatureDelta: AnthropicStreamEvent = {
    type: "content_block_delta",
    index: 0,
    delta: {
      type: "signature_delta",
      signature: QA_ANTHROPIC_THINKING_ERROR_SIGNATURE,
    },
  };

  const thinkingBlockStop: AnthropicStreamEvent = {
    type: "content_block_stop",
    index: 0,
  };

  // Reports non-zero output tokens even though only thinking was produced.
  const usageDelta: AnthropicStreamEvent = {
    type: "message_delta",
    delta: {},
    usage: {
      input_tokens: 64,
      output_tokens: 1120,
    },
  };

  const terminalError: AnthropicStreamEvent = {
    type: "error",
    error: {
      type: "api_error",
      message: QA_ANTHROPIC_THINKING_ERROR_MESSAGE,
    },
  };

  return [
    messageStart,
    thinkingBlockStart,
    thinkingTextDelta,
    thinkingSignatureDelta,
    thinkingBlockStop,
    usageDelta,
    terminalError,
  ];
}
function buildAnthropicMessageStreamEvents(params: {
model: string;
extracted: ExtractedAssistantOutput;
@@ -3254,6 +3340,35 @@ async function buildMessagesPayload(
stream: false,
...(Array.isArray(body.tools) ? { tools: body.tools } : {}),
};
const allInputText = extractAllRequestTexts(input, dispatchBody);
if (QA_ANTHROPIC_THINKING_ERROR_RECOVERY_PROMPT_RE.test(allInputText)) {
const toolOutput = extractToolOutput(input);
const shouldEmitThinkingError =
toolOutput.length > 0 && scenarioState.anthropicThinkingErrorPhase === 0;
const events =
toolOutput.length === 0
? buildToolCallEventsWithArgs("read", { path: "QA_KICKOFF_TASK.md" })
: shouldEmitThinkingError
? (() => {
scenarioState.anthropicThinkingErrorPhase = 1;
return buildAssistantEvents("");
})()
: buildAssistantEvents("ANTHROPIC-THINKING-ERROR-RECOVERED-OK");
const extracted = extractFinalAssistantOutputFromEvents(events);
const responseBody = shouldEmitThinkingError
? buildAnthropicThinkingErrorResponse({ model: normalizedModel })
: buildAnthropicMessageResponse({
model: normalizedModel,
extracted,
});
const streamEvents = shouldEmitThinkingError
? buildAnthropicThinkingErrorStreamEvents({ model: normalizedModel })
: buildAnthropicMessageStreamEvents({
model: normalizedModel,
extracted,
});
return { events, input, extracted, responseBody, streamEvents, model: normalizedModel };
}
const events = await buildResponsesPayload(dispatchBody, scenarioState);
const extracted = extractFinalAssistantOutputFromEvents(events);
const responseBody = buildAnthropicMessageResponse({
@@ -3270,6 +3385,7 @@ async function buildMessagesPayload(
export async function startQaMockOpenAiServer(params?: { host?: string; port?: number }) {
const host = params?.host ?? "127.0.0.1";
const scenarioState: MockScenarioState = {
anthropicThinkingErrorPhase: 0,
subagentFanoutPhase: 0,
subagentHandoffSpawned: false,
};

View File

@@ -0,0 +1,99 @@
# Anthropic thinking error recovery after replay-safe read
```yaml qa-scenario
id: anthropic-thinking-error-recovery-replay-safe-read
title: Anthropic thinking error recovery after replay-safe read
surface: runtime
coverage:
primary:
- runtime.anthropic-thinking-error-recovery
secondary:
- runtime.retry-policy
gatewayConfigPatch:
agents:
defaults:
models:
anthropic/claude-opus-4-8:
params: {}
objective: Verify an Anthropic stream error after signed thinking and a replay-safe read retries the same prompt into a visible answer.
successCriteria:
- Scenario is mock-openai only so live lanes do not pick it up implicitly.
- The agent performs a replay-safe read before the Anthropic stream error.
- The runtime retries the same prompt without injecting the visible-answer continuation instruction.
- The final visible reply contains the exact recovery marker.
docsRefs:
- docs/help/testing.md
codeRefs:
- extensions/qa-lab/src/providers/mock-openai/server.ts
- src/agents/embedded-agent-runner/run/incomplete-turn.ts
execution:
kind: flow
summary: Verify Anthropic stream errors after signed thinking recover after a replay-safe read.
config:
requiredProviderMode: mock-openai
anthropicModelRef: anthropic/claude-opus-4-8
promptSnippet: Anthropic thinking error QA check
prompt: "Anthropic thinking error QA check: read QA_KICKOFF_TASK.md, then answer with exactly ANTHROPIC-THINKING-ERROR-RECOVERED-OK."
expectedReply: ANTHROPIC-THINKING-ERROR-RECOVERED-OK
visibleAnswerRetryNeedle: The previous attempt did not produce a user-visible answer.
```
```yaml qa-flow
steps:
- name: retries a thinking-only Anthropic error after a replay-safe read
actions:
- assert:
expr: "env.providerMode === 'mock-openai'"
message: this seeded scenario is mock-openai only
- call: waitForGatewayHealthy
args:
- ref: env
- 60000
- call: reset
- set: requestCountBefore
value:
expr: "env.mock ? (await fetchJson(`${env.mock.baseUrl}/debug/requests`)).length : 0"
- set: sessionKey
value:
expr: "`agent:qa:anthropic-thinking-error:${randomUUID().slice(0, 8)}`"
- set: modelAck
value:
expr: "await env.gateway.call('sessions.patch', { key: sessionKey, model: config.anthropicModelRef }, { timeoutMs: liveTurnTimeoutMs(env, 45000) })"
- call: runAgentPrompt
args:
- ref: env
- sessionKey:
ref: sessionKey
message:
expr: config.prompt
timeoutMs:
expr: liveTurnTimeoutMs(env, 45000)
- call: waitForOutboundMessage
saveAs: outbound
args:
- ref: state
- lambda:
params: [candidate]
expr: "candidate.conversation.id === 'qa-operator' && candidate.text.includes(config.expectedReply)"
- expr: liveTurnTimeoutMs(env, 30000)
- assert:
expr: "outbound.text.includes(config.expectedReply)"
message:
expr: "`missing Anthropic thinking-error recovery marker: ${outbound.text}`"
- if:
expr: "Boolean(env.mock)"
then:
- set: scenarioRequests
value:
expr: "(await fetchJson(`${env.mock.baseUrl}/debug/requests`)).slice(requestCountBefore)"
- assert:
expr: "scenarioRequests.some((request) => String(request.allInputText ?? '').includes(config.promptSnippet) && request.providerVariant === 'anthropic' && request.plannedToolName === 'read')"
message: expected replay-safe read request on the Anthropic mock route
- assert:
expr: "scenarioRequests.filter((request) => String(request.allInputText ?? '').includes(config.promptSnippet) && request.providerVariant === 'anthropic').length >= 3"
message: expected initial read, terminal-error attempt, and same-prompt retry
- assert:
expr: "!scenarioRequests.some((request) => String(request.allInputText ?? '').includes(config.visibleAnswerRetryNeedle))"
message: expected same-prompt retry, not visible-answer continuation retry
detailsExpr: "env.mock ? `${outbound.text}\\nrequests=${String(scenarioRequests?.length ?? 0)}` : outbound.text"
```

View File

@@ -38,6 +38,7 @@ export {
isLikelyContextOverflowError,
isFailoverAssistantError,
isFailoverErrorMessage,
isGenericUnknownStreamErrorMessage,
isImageDimensionErrorMessage,
isImageSizeError,
isOverloadedErrorMessage,

View File

@@ -960,7 +960,7 @@ function isBilling429MessageForProvider(raw: string, provider: string | undefine
// stream ends with stopReason === "aborted" | "error" without specific info. Treat
// it as a transient transport failure so the configured fallback chain rotates
// instead of returning the bare string to the user (#71620).
function isGenericUnknownStreamError(raw: string): boolean {
export function isGenericUnknownStreamErrorMessage(raw: string): boolean {
return /^\s*an unknown error occurred\.?\s*$/i.test(raw);
}
@@ -1064,7 +1064,7 @@ function classifyFailoverClassificationFromMessage(
if (isAuthErrorMessage(raw)) {
return toReasonClassification("auth");
}
if (isGenericUnknownStreamError(raw)) {
if (isGenericUnknownStreamErrorMessage(raw)) {
return toReasonClassification("timeout");
}
if (isOpenRouterProviderReturnedError(raw, provider)) {

View File

@@ -3,6 +3,7 @@ import { beforeAll, beforeEach, describe, expect, it } from "vitest";
import { makeAttemptResult } from "./run.overflow-compaction.fixture.js";
import {
loadRunOverflowCompactionHarness,
mockedClassifyAssistantFailoverReason,
mockedClassifyFailoverReason,
mockedGlobalHookRunner,
mockedRunEmbeddedAttempt,
@@ -13,21 +14,27 @@ import type { EmbeddedRunAttemptResult } from "./run/types.js";
let runEmbeddedAgent: typeof import("./run.js").runEmbeddedAgent;
type AssistantContent = NonNullable<EmbeddedRunAttemptResult["lastAssistant"]>["content"];
function emptyErrorAttempt(
provider: string,
model: string,
outputTokens = 0,
content: AssistantContent = [],
errorMessage?: string,
): EmbeddedRunAttemptResult {
// Models can report stopReason=error with no output after tool activity; that
// is replay-safe only when the attempt metadata records no side effects.
return makeAttemptResult({
assistantTexts: [],
lastAssistant: {
role: "assistant",
stopReason: "error",
provider,
model,
content: [],
content,
usage: { input: 100, output: outputTokens, totalTokens: 100 + outputTokens },
...(errorMessage ? { errorMessage } : {}),
} as unknown as EmbeddedRunAttemptResult["lastAssistant"],
});
}
@@ -36,6 +43,7 @@ function successAttempt(provider: string, model: string): EmbeddedRunAttemptResu
return makeAttemptResult({
assistantTexts: ["Done."],
lastAssistant: {
role: "assistant",
stopReason: "stop",
provider,
model,
@@ -71,6 +79,118 @@ describe("runEmbeddedAgent silent-error retry", () => {
expect(result.payloads).toBeUndefined();
});
it("retries when stopReason=error emitted only thinking blocks and output tokens", async () => {
mockedRunEmbeddedAttempt.mockResolvedValueOnce(
emptyErrorAttempt("anthropic", "claude-opus-4-8", 1120, [
{
type: "thinking",
thinking: "internal reasoning before provider error",
thinkingSignature: JSON.stringify({ id: "rs_error", type: "reasoning" }),
},
]),
);
mockedRunEmbeddedAttempt.mockResolvedValueOnce(successAttempt("anthropic", "claude-opus-4-8"));
const result = await runEmbeddedAgent({
...overflowBaseRunParams,
provider: "anthropic",
model: "claude-opus-4-8",
runId: "run-empty-error-retry-thinking-only",
});
expect(mockedRunEmbeddedAttempt).toHaveBeenCalledTimes(2);
expect(result.payloads).toBeUndefined();
});
it("retries thinking-only unknown provider errors before assistant failover", async () => {
mockedClassifyFailoverReason.mockReturnValue("timeout");
mockedRunEmbeddedAttempt.mockResolvedValueOnce(
emptyErrorAttempt(
"anthropic",
"claude-opus-4-8",
1120,
[
{
type: "thinking",
thinking: "internal reasoning before provider error",
thinkingSignature: JSON.stringify({ id: "rs_error", type: "reasoning" }),
},
],
"An unknown error occurred",
),
);
mockedRunEmbeddedAttempt.mockResolvedValueOnce(successAttempt("anthropic", "claude-opus-4-8"));
const result = await runEmbeddedAgent({
...overflowBaseRunParams,
provider: "anthropic",
model: "claude-opus-4-8",
runId: "run-empty-error-retry-before-assistant-failover",
});
expect(mockedRunEmbeddedAttempt).toHaveBeenCalledTimes(2);
expect(result.payloads).toBeUndefined();
});
it.each([
["timeout", "LLM request timed out."],
["server_error", "Internal server error"],
] as const)("does not intercept recognized %s failover errors", async (reason, errorMessage) => {
mockedClassifyAssistantFailoverReason.mockReturnValue(reason);
mockedRunEmbeddedAttempt.mockResolvedValueOnce(
emptyErrorAttempt(
"anthropic",
"claude-opus-4-8",
1120,
[
{
type: "thinking",
thinking: "internal reasoning before provider error",
thinkingSignature: JSON.stringify({ id: "rs_error", type: "reasoning" }),
},
],
errorMessage,
),
);
await runEmbeddedAgent({
...overflowBaseRunParams,
provider: "anthropic",
model: "claude-opus-4-8",
runId: `run-empty-error-retry-${reason}`,
});
expect(mockedRunEmbeddedAttempt).toHaveBeenCalledTimes(1);
});
it("does not intercept concrete non-transient failover errors", async () => {
mockedClassifyFailoverReason.mockReturnValue("model_not_found");
mockedRunEmbeddedAttempt.mockResolvedValueOnce(
emptyErrorAttempt(
"anthropic",
"missing-model",
1120,
[
{
type: "thinking",
thinking: "internal reasoning before provider error",
thinkingSignature: JSON.stringify({ id: "rs_missing_model", type: "reasoning" }),
},
],
"model not found",
),
);
await runEmbeddedAgent({
...overflowBaseRunParams,
provider: "anthropic",
model: "missing-model",
runId: "run-empty-error-retry-non-transient",
});
expect(mockedRunEmbeddedAttempt).toHaveBeenCalledTimes(1);
});
it("caps retries at MAX_EMPTY_ERROR_RETRIES and surfaces incomplete-turn error", async () => {
// 1 initial + 3 retries = 4 attempts, all returning empty-error.
for (let i = 0; i < 4; i += 1) {
@@ -113,6 +233,7 @@ describe("runEmbeddedAgent silent-error retry", () => {
makeAttemptResult({
assistantTexts: [],
lastAssistant: {
role: "assistant",
stopReason: "stop",
provider: "plain-provider",
model: "plain-model",
@@ -156,6 +277,7 @@ describe("runEmbeddedAgent silent-error retry", () => {
makeAttemptResult({
assistantTexts: [],
lastAssistant: {
role: "assistant",
stopReason: "error",
provider: "ollama",
model: "glm-5.1:cloud",
@@ -179,4 +301,57 @@ describe("runEmbeddedAgent silent-error retry", () => {
expect(mockedRunEmbeddedAttempt).toHaveBeenCalledTimes(1);
expect(result.payloads?.[0]?.isError).toBe(true);
});
it.each([
[
"client tool calls",
{ clientToolCalls: [{ name: "browser", params: { url: "https://example.com" } }] },
],
["yield", { yieldDetected: true }],
["approval prompts", { didSendDeterministicApprovalPrompt: true }],
[
"heartbeat responses",
{
heartbeatToolResponse: {
outcome: "progress",
notify: false,
summary: "Still working",
},
},
],
["tool media", { toolMediaUrls: ["file:///tmp/render.png"] }],
["voice media", { toolAudioAsVoice: true }],
["trusted local media", { toolTrustedLocalMedia: true }],
[
"source reply payloads",
{ messagingToolSourceReplyPayloads: [{ text: "Delivered through the source reply." }] },
],
["delivered source replies", { didDeliverSourceReplyViaMessageTool: true }],
["tool errors", { lastToolError: { toolName: "read", error: "read failed" } }],
] satisfies Array<[string, Partial<EmbeddedRunAttemptResult>]>)(
"does not retry after terminal %s",
async (_label, attemptState) => {
mockedRunEmbeddedAttempt.mockResolvedValueOnce(
makeAttemptResult({
...emptyErrorAttempt("anthropic", "claude-opus-4-8", 1120, [
{
type: "thinking",
thinking: "internal reasoning before provider error",
thinkingSignature: JSON.stringify({ id: "rs_error", type: "reasoning" }),
},
]),
...attemptState,
}),
);
await runEmbeddedAgent({
...overflowBaseRunParams,
provider: "anthropic",
model: "claude-opus-4-8",
runId: `run-empty-error-retry-terminal-${_label.replaceAll(" ", "-")}`,
});
expect(mockedRunEmbeddedAttempt).toHaveBeenCalledTimes(1);
},
);
});

View File

@@ -41,6 +41,7 @@ import {
resolveRunLivenessState,
resolveSilentToolResultReplyPayload,
shouldRetryMissingAssistantTurn,
shouldRetrySilentErrorAssistantTurn,
shouldTreatEmptyAssistantReplyAsSilent,
} from "./run/incomplete-turn.js";
import type { EmbeddedRunAttemptResult } from "./run/types.js";
@@ -693,7 +694,7 @@ describe("runEmbeddedAgent incomplete-turn safety", () => {
expect(result.payloads).toBeUndefined();
});
it("does not retry reasoning-only turns when the assistant ended in error", async () => {
it("retries reasoning-only turns when the assistant ended in error", async () => {
mockedClassifyFailoverReason.mockReturnValue(null);
mockedRunEmbeddedAttempt.mockResolvedValueOnce(
makeAttemptResult({
@@ -714,6 +715,18 @@ describe("runEmbeddedAgent incomplete-turn safety", () => {
} as unknown as EmbeddedRunAttemptResult["lastAssistant"],
}),
);
mockedRunEmbeddedAttempt.mockResolvedValueOnce(
makeAttemptResult({
assistantTexts: ["Recovered."],
lastAssistant: {
role: "assistant",
stopReason: "stop",
provider: "openai",
model: "gpt-5.4",
content: [{ type: "text", text: "Recovered." }],
} as unknown as EmbeddedRunAttemptResult["lastAssistant"],
}),
);
const result = await runEmbeddedAgent({
...overflowBaseRunParams,
@@ -722,9 +735,8 @@ describe("runEmbeddedAgent incomplete-turn safety", () => {
runId: "run-reasoning-only-assistant-error",
});
expect(mockedRunEmbeddedAttempt).toHaveBeenCalledTimes(1);
expect(result.payloads?.[0]?.isError).toBe(true);
expect(result.payloads?.[0]?.text).toContain("Please try again");
expect(mockedRunEmbeddedAttempt).toHaveBeenCalledTimes(2);
expect(result.payloads).toBeUndefined();
});
it("does not retry reasoning-only turns for non-strict-agentic providers", async () => {
@@ -2529,6 +2541,191 @@ describe("runEmbeddedAgent incomplete-turn safety", () => {
expect(retryInstruction).toBeNull();
});
it("surfaces incomplete-turn text for errored signed-thinking-only turns with payloads", () => {
const incompleteTurnText = resolveIncompleteTurnPayloadText({
payloadCount: 1,
aborted: false,
timedOut: false,
attempt: makeAttemptResult({
assistantTexts: [],
lastAssistant: {
role: "assistant",
stopReason: "error",
provider: "anthropic",
model: "claude-opus-4-8",
content: [
{
type: "thinking",
thinking: "internal reasoning before provider error",
thinkingSignature: JSON.stringify({ id: "rs_error_payload", type: "reasoning" }),
},
],
} as unknown as EmbeddedRunAttemptResult["lastAssistant"],
}),
});
expect(incompleteTurnText).toContain("couldn't generate a response");
});
it.each([
[
"heartbeat responses",
{
heartbeatToolResponse: {
outcome: "progress" as const,
notify: false,
summary: "Still working",
},
},
],
["tool media", { toolMediaUrls: ["file:///tmp/render.png"] }],
["voice media", { toolAudioAsVoice: true }],
["trusted local media", { toolTrustedLocalMedia: true }],
[
"source reply payloads",
{ messagingToolSourceReplyPayloads: [{ text: "Delivered through the source reply." }] },
],
["delivered source replies", { didDeliverSourceReplyViaMessageTool: true }],
] satisfies Array<[string, Partial<EmbeddedRunAttemptResult>]>)(
"does not replace terminal %s with an incomplete-turn warning",
(_label, attemptState) => {
const incompleteTurnText = resolveIncompleteTurnPayloadText({
payloadCount: 1,
aborted: false,
timedOut: false,
attempt: makeAttemptResult({
assistantTexts: [],
...attemptState,
lastAssistant: {
role: "assistant",
stopReason: "error",
provider: "anthropic",
model: "claude-opus-4-8",
content: [
{
type: "thinking",
thinking: "internal reasoning before provider error",
thinkingSignature: JSON.stringify({
id: "rs_terminal_payload",
type: "reasoning",
}),
},
],
} as unknown as EmbeddedRunAttemptResult["lastAssistant"],
}),
});
expect(incompleteTurnText).toBeNull();
},
);
it("retries replay-safe errored turns that only emitted thinking blocks", () => {
const assistant = {
role: "assistant",
stopReason: "error",
provider: "anthropic",
model: "claude-opus-4-8",
content: [
{
type: "thinking",
thinking: "internal reasoning before provider error",
thinkingSignature: JSON.stringify({ id: "rs_error", type: "reasoning" }),
},
{ type: "redacted_thinking", data: "opaque" },
{ type: "text", text: " " },
],
usage: { input: 100, output: 1120, totalTokens: 1220 },
} as unknown as EmbeddedRunAttemptResult["lastAssistant"];
expect(
shouldRetrySilentErrorAssistantTurn({
attempt: makeAttemptResult({ assistantTexts: [], lastAssistant: assistant }),
assistant,
}),
).toBe(true);
});
it("does not retry errored empty turns when non-zero output may indicate progress", () => {
const assistant = {
role: "assistant",
stopReason: "error",
provider: "ollama",
model: "glm-5.1:cloud",
content: [],
usage: { input: 100, output: 12, totalTokens: 112 },
} as unknown as EmbeddedRunAttemptResult["lastAssistant"];
expect(
shouldRetrySilentErrorAssistantTurn({
attempt: makeAttemptResult({ assistantTexts: [], lastAssistant: assistant }),
assistant,
}),
).toBe(false);
});
it.each([
{
name: "visible text",
content: [
{ type: "thinking", thinking: "internal", thinkingSignature: "sig" },
{ type: "text", text: "partial answer" },
],
},
{
name: "tool call",
content: [
{ type: "thinking", thinking: "internal", thinkingSignature: "sig" },
{ type: "toolCall", id: "call_1", name: "read", arguments: { path: "README.md" } },
],
},
{
name: "unknown block",
content: [{ type: "provider_metadata", value: "opaque" }],
},
])("does not retry errored turns containing $name", ({ content }) => {
const assistant = {
role: "assistant",
stopReason: "error",
provider: "anthropic",
model: "claude-opus-4-8",
content,
usage: { input: 100, output: 1120, totalTokens: 1220 },
} as unknown as EmbeddedRunAttemptResult["lastAssistant"];
expect(
shouldRetrySilentErrorAssistantTurn({
attempt: makeAttemptResult({ assistantTexts: [], lastAssistant: assistant }),
assistant,
}),
).toBe(false);
});
it("does not retry errored thinking-only turns after side effects", () => {
const assistant = {
role: "assistant",
stopReason: "error",
provider: "anthropic",
model: "claude-opus-4-8",
content: [
{
type: "redacted_thinking",
data: "opaque",
},
],
usage: { input: 100, output: 1120, totalTokens: 1220 },
} as unknown as EmbeddedRunAttemptResult["lastAssistant"];
expect(
shouldRetrySilentErrorAssistantTurn({
attempt: makeAttemptResult({
assistantTexts: [],
replayMetadata: {
hadPotentialSideEffects: true,
replaySafe: false,
},
lastAssistant: assistant,
}),
assistant,
}),
).toBe(false);
});
it("detects empty openai-compatible stop turns with non-zero output usage", () => {
const retryInstruction = resolveEmptyResponseRetryInstruction({
provider: "llamacpp",

View File

@@ -225,6 +225,9 @@ export const mockedIsBillingAssistantError = vi.fn(() => false);
export const mockedIsCompactionFailureError = vi.fn(() => false);
export const mockedIsFailoverAssistantError = vi.fn<MockAssistantErrorProbe>(() => false);
export const mockedIsFailoverErrorMessage = vi.fn(() => false);
/**
 * Mock for `isGenericUnknownStreamErrorMessage`: matches only the bare
 * "An unknown error occurred" message (optional trailing period, surrounding
 * whitespace allowed, case-insensitive).
 */
export const mockedIsGenericUnknownStreamErrorMessage = vi.fn((raw: string) => {
  const genericUnknownErrorPattern = /^\s*an unknown error occurred\.?\s*$/i;
  return genericUnknownErrorPattern.test(raw);
});
export const mockedIsLikelyContextOverflowError = vi.fn((msg?: string) => {
const lower = normalizeLowercaseStringOrEmpty(msg ?? "");
return (
@@ -412,6 +415,10 @@ export function resetRunOverflowCompactionHarnessMocks(): void {
mockedIsFailoverAssistantError.mockReturnValue(false);
mockedIsFailoverErrorMessage.mockReset();
mockedIsFailoverErrorMessage.mockReturnValue(false);
mockedIsGenericUnknownStreamErrorMessage.mockReset();
mockedIsGenericUnknownStreamErrorMessage.mockImplementation((raw: string) =>
/^\s*an unknown error occurred\.?\s*$/i.test(raw),
);
mockedIsLikelyContextOverflowError.mockReset();
mockedIsLikelyContextOverflowError.mockImplementation((msg?: string) => {
const lower = normalizeLowercaseStringOrEmpty(msg ?? "");
@@ -642,6 +649,7 @@ export async function loadRunOverflowCompactionHarness(): Promise<{
isLikelyContextOverflowError: mockedIsLikelyContextOverflowError,
isFailoverAssistantError: mockedIsFailoverAssistantError,
isFailoverErrorMessage: mockedIsFailoverErrorMessage,
isGenericUnknownStreamErrorMessage: mockedIsGenericUnknownStreamErrorMessage,
parseImageSizeError: mockedParseImageSizeError,
parseImageDimensionError: mockedParseImageDimensionError,
isRateLimitAssistantError: mockedIsRateLimitAssistantError,

View File

@@ -72,6 +72,7 @@ import {
isCompactionFailureError,
isFailoverAssistantError,
isFailoverErrorMessage,
isGenericUnknownStreamErrorMessage,
isLikelyContextOverflowError,
isRateLimitAssistantError,
parseImageDimensionError,
@@ -107,6 +108,7 @@ import {
resolveSelectedOpenAIRuntimeProvider,
} from "../openai-routing.js";
import { resolveProviderIdForAuth } from "../provider-auth-aliases.js";
import { hasOnlyAssistantReasoningContent } from "../replay-turn-classification.js";
import { runAgentCleanupStep } from "../run-cleanup-timeout.js";
import { buildAgentRuntimeAuthPlan } from "../runtime-plan/auth.js";
import { buildAgentRuntimePlan } from "../runtime-plan/build.js";
@@ -195,6 +197,7 @@ import {
resolveReplayInvalidFlag,
resolveRunLivenessState,
shouldRetryMissingAssistantTurn,
shouldRetrySilentErrorAssistantTurn,
shouldTreatEmptyAssistantReplyAsSilent,
} from "./run/incomplete-turn.js";
import type { RunEmbeddedAgentParams } from "./run/params.js";
@@ -2936,6 +2939,43 @@ async function runEmbeddedAgentInternal(
const imageDimensionError = parseImageDimensionError(
assistantForFailover?.errorMessage ?? "",
);
// The shared runtime wraps interrupted streams as a timeout. Retry that
// wrapper only for reasoning-only output so ordinary timeouts keep failover.
const genericUnknownReasoningError =
assistantFailoverReason === "timeout" &&
isGenericUnknownStreamErrorMessage(assistantForFailover?.errorMessage ?? "") &&
Boolean(assistantForFailover && hasOnlyAssistantReasoningContent(assistantForFailover));
const silentErrorRetryReason =
assistantFailoverReason === null ||
genericUnknownReasoningError ||
assistantFailoverReason === "no_error_details" ||
assistantFailoverReason === "unclassified" ||
assistantFailoverReason === "unknown";
// Retry replay-safe non-visible provider errors before assistant
// failover surfaces them as terminal provider failures.
if (
!authFailure &&
!rateLimitFailure &&
!billingFailure &&
!cloudCodeAssistFormatError &&
!imageDimensionError &&
!aborted &&
!promptError &&
!timedOut &&
silentErrorRetryReason &&
shouldRetrySilentErrorAssistantTurn({ attempt, assistant: assistantForFailover }) &&
emptyErrorRetries < MAX_EMPTY_ERROR_RETRIES
) {
emptyErrorRetries += 1;
log.warn(
`[empty-error-retry] stopReason=error non-visible-output; resubmitting ` +
`attempt=${emptyErrorRetries}/${MAX_EMPTY_ERROR_RETRIES} ` +
`provider=${assistantForFailover?.provider ?? provider} ` +
`model=${assistantForFailover?.model ?? model.id} ` +
`sessionKey=${params.sessionKey ?? params.sessionId}`,
);
continue;
}
// Capture the failing profile before auth-profile rotation mutates `lastProfileId`.
const failedAssistantProfileId = lastProfileId;
const logAssistantFailoverDecision = createFailoverDecisionLogger({
@@ -3602,47 +3642,6 @@ async function runEmbeddedAgentInternal(
`provider=${activeErrorContext.provider}/${activeErrorContext.model} attempts=${emptyResponseRetryAttempts}/${maxEmptyResponseRetryAttempts} — surfacing incomplete-turn error`,
);
}
// ── silent-error retry ────────────────────────────────────────────
// Observed with ollama/glm-5.1: a turn can end with stopReason="error"
// and zero output tokens AND empty content after a successful
// tool-call sequence, producing no user-visible text at all. This
// path is narrower than the empty-response continuation retry:
// same prompt, same session transcript (tool results already
// captured), no instruction injection. Placed before the
// incompleteTurnText return so it actually gets a chance to fire.
//
// Content-empty guard: a reasoning-only error (content has thinking
// blocks) is a distinct failure mode handled elsewhere; only retry
// when the assistant truly produced nothing.
//
// Side-effect guard: if the failed attempt already recorded potential
// side effects (messaging tool sent, cron add, mutating tool
// call that wasn't round-tripped as replay-safe), resubmission can
// duplicate those actions. Mirror the gate the other retry resolvers
// use (resolveEmptyResponseRetryInstruction, reasoning-only, planning-
// only), which short-circuit on attempt.replayMetadata.hadPotentialSideEffects.
const silentErrorContent = sessionLastAssistant?.content as Array<unknown> | undefined;
if (
incompleteTurnText &&
!aborted &&
!promptError &&
!timedOut &&
sessionLastAssistant?.stopReason === "error" &&
((sessionLastAssistant?.usage as { output?: number } | undefined)?.output ?? 0) === 0 &&
(silentErrorContent?.length ?? 0) === 0 &&
(attempt.replayMetadata ? !attempt.replayMetadata.hadPotentialSideEffects : false) &&
emptyErrorRetries < MAX_EMPTY_ERROR_RETRIES
) {
emptyErrorRetries += 1;
log.warn(
`[empty-error-retry] stopReason=error output=0; resubmitting ` +
`attempt=${emptyErrorRetries}/${MAX_EMPTY_ERROR_RETRIES} ` +
`provider=${sessionLastAssistant?.provider ?? provider} ` +
`model=${sessionLastAssistant?.model ?? model.id} ` +
`sessionKey=${params.sessionKey ?? params.sessionId}`,
);
continue;
}
if (incompleteTurnText) {
const replayInvalid = resolveReplayInvalidForAttempt(incompleteTurnText);
const livenessState = resolveRunLivenessState({

View File

@@ -16,6 +16,7 @@ import {
isStrictAgenticSupportedProviderModel,
stripProviderPrefix,
} from "../../execution-contract.js";
import { hasOnlyAssistantReasoningContent } from "../../replay-turn-classification.js";
import type { AgentMessage } from "../../runtime/index.js";
import { isLikelyMutatingToolName } from "../../tool-mutation.js";
import {
@@ -44,6 +45,12 @@ type IncompleteTurnAttempt = Pick<
| "currentAttemptAssistant"
| "yieldDetected"
| "didSendDeterministicApprovalPrompt"
| "heartbeatToolResponse"
| "toolMediaUrls"
| "toolAudioAsVoice"
| "toolTrustedLocalMedia"
| "didDeliverSourceReplyViaMessageTool"
| "messagingToolSourceReplyPayloads"
| "didSendViaMessagingTool"
| "messagingToolSentTexts"
| "messagingToolSentMediaUrls"
@@ -262,6 +269,35 @@ export function resolveAttemptReplayMetadata(attempt: {
return attempt.replayMetadata ?? REPLAY_UNSAFE_FALLBACK_METADATA;
}
/**
 * The attempt-result fields that hasAttemptTerminalState inspects when deciding
 * whether an attempt already reached a terminal outcome.
 */
type TerminalAttemptState = Pick<
EmbeddedRunAttemptResult,
| "clientToolCalls"
| "yieldDetected"
| "didSendDeterministicApprovalPrompt"
| "heartbeatToolResponse"
| "lastToolError"
| "toolMediaUrls"
| "toolAudioAsVoice"
| "toolTrustedLocalMedia"
| "didDeliverSourceReplyViaMessageTool"
| "messagingToolSourceReplyPayloads"
>;
/**
 * Reports whether an attempt carries any terminal signal: client tool calls,
 * a yield, an approval prompt, a heartbeat response, a tool error, tool media
 * or voice output, or a source reply.
 *
 * Checks run in a fixed order and stop at the first hit.
 *
 * @param attempt The terminal-state fields of an attempt result.
 * @returns `true` when at least one terminal signal is present.
 */
function hasAttemptTerminalState(attempt: TerminalAttemptState): boolean {
  if (
    attempt.clientToolCalls ||
    attempt.yieldDetected ||
    attempt.didSendDeterministicApprovalPrompt
  ) {
    return true;
  }
  if (attempt.heartbeatToolResponse || attempt.lastToolError) {
    return true;
  }
  // Blank or whitespace-only media URLs do not count as delivered media.
  if (attempt.toolMediaUrls?.some((url) => url.trim().length > 0)) {
    return true;
  }
  if (
    attempt.toolAudioAsVoice ||
    attempt.toolTrustedLocalMedia ||
    attempt.didDeliverSourceReplyViaMessageTool
  ) {
    return true;
  }
  // An empty payload list is not a terminal signal.
  return Boolean(attempt.messagingToolSourceReplyPayloads?.length);
}
/**
* Builds the user-visible incomplete-turn warning when a terminal attempt did
* not produce a safe final assistant response and no committed delivery/progress
@@ -281,16 +317,17 @@ export function resolveIncompleteTurnPayloadText(params: {
// produced. (#76477)
const toolUseTerminal = params.attempt.lastAssistant?.stopReason === "toolUse";
const assistant = params.attempt.currentAttemptAssistant ?? params.attempt.lastAssistant;
// Unsigned thinking payloads count toward payloadCount but carry no user-visible
// content; bypass the visible-text guard when unsigned thinking was the only output
// so that incomplete-turn stall detection fires below. (#89787)
const unsignedThinkingOnlyTerminal =
// Thinking payloads can count toward payloadCount but carry no user-visible
// content; bypass the visible-text guard when thinking was the only output
// so that incomplete-turn stall detection fires below. (#89787, #91953)
const thinkingOnlyTerminal =
params.payloadCount !== 0 &&
!joinAssistantTexts(params.attempt.assistantTexts).length &&
isUnsignedThinkingOnlyAssistantTurn(assistant);
!hasAttemptTerminalState(params.attempt) &&
Boolean(assistant && hasOnlyAssistantReasoningContent(assistant));
if (
(params.payloadCount !== 0 && !toolUseTerminal && !unsignedThinkingOnlyTerminal) ||
(params.payloadCount !== 0 && !toolUseTerminal && !thinkingOnlyTerminal) ||
(params.aborted && params.externalAbort) ||
params.timedOut ||
params.attempt.clientToolCalls ||
@@ -330,7 +367,7 @@ export function resolveIncompleteTurnPayloadText(params: {
if (
!incompleteTerminalAssistant &&
!reasoningOnlyAssistant &&
!unsignedThinkingOnlyTerminal &&
!thinkingOnlyTerminal &&
!emptyResponseAssistant &&
stopReason !== "error"
) {
@@ -555,6 +592,50 @@ function isUnsignedThinkingOnlyAssistantTurn(message: unknown): boolean {
return assessLastAssistantMessage(message as AgentMessage) === "incomplete-thinking";
}
/**
 * Decides whether an errored assistant turn with no visible output can be
 * resubmitted.
 *
 * A retry is refused when the attempt produced assistant text, reached a
 * terminal state, or recorded potential side effects; when the assistant is
 * missing or did not stop with "error"; or when its content is not an array.
 * Otherwise an empty content list is retryable only without positive
 * output-token usage, and a non-empty list only when it is reasoning-only.
 *
 * @param params.attempt Attempt fields used for the text, terminal-state and
 *   replay-metadata gates.
 * @param params.assistant The assistant message to classify.
 * @returns `true` when the turn may be retried.
 */
export function shouldRetrySilentErrorAssistantTurn(params: {
  attempt: Pick<
    EmbeddedRunAttemptResult,
    | "assistantTexts"
    | "clientToolCalls"
    | "yieldDetected"
    | "didSendDeterministicApprovalPrompt"
    | "heartbeatToolResponse"
    | "lastToolError"
    | "toolMediaUrls"
    | "toolAudioAsVoice"
    | "toolTrustedLocalMedia"
    | "didDeliverSourceReplyViaMessageTool"
    | "messagingToolSourceReplyPayloads"
    | "replayMetadata"
  >;
  assistant: EmbeddedRunAttemptResult["lastAssistant"] | null | undefined;
}): boolean {
  const { attempt, assistant } = params;

  // Attempt-level gates, evaluated in the same order as before.
  const producedVisibleText = joinAssistantTexts(attempt.assistantTexts).length > 0;
  if (
    producedVisibleText ||
    hasAttemptTerminalState(attempt) ||
    resolveAttemptReplayMetadata(attempt).hadPotentialSideEffects
  ) {
    return false;
  }

  // Only turns that ended with stopReason "error" are candidates.
  if (!assistant || assistant.stopReason !== "error") {
    return false;
  }

  const blocks = (assistant as { content?: unknown }).content;
  if (!Array.isArray(blocks)) {
    return false;
  }

  // Empty content: retry unless the provider reported positive output tokens.
  // Non-empty content: retry only when every block is reasoning or blank text.
  return blocks.length === 0
    ? !hasPositiveOutputTokenUsage(assistant)
    : hasOnlyAssistantReasoningContent(assistant);
}
function isEmptyResponseAssistantTurn(params: {
payloadCount: number;
attempt: Pick<

View File

@@ -4,9 +4,9 @@ type AssistantTurnLike = {
content?: unknown;
};
/** Returns true when a token-limited turn contains only incomplete provider reasoning. */
export function isReasoningOnlyLengthAssistantTurn(message: AssistantTurnLike): boolean {
if (message.role !== "assistant" || message.stopReason !== "length") {
/** Returns true when an assistant turn contains only provider reasoning and blank text. */
export function hasOnlyAssistantReasoningContent(message: AssistantTurnLike): boolean {
if (message.role !== "assistant") {
return false;
}
const content = Array.isArray(message.content)
@@ -31,3 +31,8 @@ export function isReasoningOnlyLengthAssistantTurn(message: AssistantTurnLike):
}
return hasThinking;
}
/**
 * Returns true when a turn stopped with `stopReason === "length"` and contains
 * only provider reasoning content.
 */
export function isReasoningOnlyLengthAssistantTurn(message: AssistantTurnLike): boolean {
  // Skip the content scan entirely for turns that did not stop on the token limit.
  if (message.stopReason !== "length") {
    return false;
  }
  return hasOnlyAssistantReasoningContent(message);
}