From ecaebfc51b4aecfa7eae7fa47aaf612ee843448e Mon Sep 17 00:00:00 2001 From: NVIDIAN Date: Sun, 14 Jun 2026 09:39:27 -0700 Subject: [PATCH] fix(agents): retry thinking-only errored turns (#92191) Retry replay-safe reasoning-only provider errors before assistant failover while preserving classified fallback and terminal-output ownership. Adds deterministic Anthropic gateway fault-injection coverage and focused regression tests.\n\nCo-authored-by: ai-hpc --- .../src/providers/mock-openai/server.ts | 116 ++++++++++ ...hinking-error-recovery-replay-safe-read.md | 99 +++++++++ src/agents/embedded-agent-helpers.ts | 1 + src/agents/embedded-agent-helpers/errors.ts | 4 +- .../run.empty-error-retry.test.ts | 177 ++++++++++++++- .../run.incomplete-turn.test.ts | 205 +++++++++++++++++- .../run.overflow-compaction.harness.ts | 8 + src/agents/embedded-agent-runner/run.ts | 81 ++++--- .../run/incomplete-turn.ts | 95 +++++++- src/agents/replay-turn-classification.ts | 11 +- 10 files changed, 739 insertions(+), 58 deletions(-) create mode 100644 qa/scenarios/runtime/anthropic-thinking-error-recovery-replay-safe-read.md diff --git a/extensions/qa-lab/src/providers/mock-openai/server.ts b/extensions/qa-lab/src/providers/mock-openai/server.ts index 6b343b49985..36b071be8bb 100644 --- a/extensions/qa-lab/src/providers/mock-openai/server.ts +++ b/extensions/qa-lab/src/providers/mock-openai/server.ts @@ -149,6 +149,7 @@ const TINY_PNG_BASE64 = "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mP8/x8AAwMCAO7Z0nQAAAAASUVORK5CYII="; const QA_REASONING_ONLY_RECOVERY_PROMPT_RE = /reasoning-only continuation qa check/i; const QA_REASONING_ONLY_SIDE_EFFECT_PROMPT_RE = /reasoning-only after write safety check/i; +const QA_ANTHROPIC_THINKING_ERROR_RECOVERY_PROMPT_RE = /anthropic thinking error qa check/i; const QA_THINKING_VISIBILITY_OFF_PROMPT_RE = /qa thinking visibility check off/i; const QA_THINKING_VISIBILITY_MAX_PROMPT_RE = /qa thinking visibility check max/i; const 
QA_EMPTY_RESPONSE_RECOVERY_PROMPT_RE = /empty response continuation qa check/i; @@ -189,6 +190,7 @@ const QA_GROUP_AUDIO_MIN_MULTIPART_BODY_CHARS = 48_000; const QA_MCP_CODE_MODE_API_FILE_PROMPT_RE = /mcp code mode api file qa check/i; type MockScenarioState = { + anthropicThinkingErrorPhase: number; subagentFanoutPhase: number; subagentHandoffSpawned: boolean; }; @@ -3128,6 +3130,90 @@ function buildAnthropicMessageResponse(params: { }; } +const QA_ANTHROPIC_THINKING_ERROR_TEXT = + "QA replay-safe read completed, but the provider stream failed after signed thinking."; +const QA_ANTHROPIC_THINKING_ERROR_SIGNATURE = "qa_signed_thinking_block_91953"; +const QA_ANTHROPIC_THINKING_ERROR_MESSAGE = "QA injected provider stream failure"; + +function buildAnthropicThinkingErrorResponse(params: { model: string }): Record { + return { + type: "error", + error: { + type: "api_error", + message: QA_ANTHROPIC_THINKING_ERROR_MESSAGE, + }, + model: params.model || "claude-opus-4-8", + }; +} + +function buildAnthropicThinkingErrorStreamEvents(params: { + model: string; +}): AnthropicStreamEvent[] { + const messageId = `msg_mock_${Math.floor(Math.random() * 1_000_000).toString(16)}`; + return [ + { + type: "message_start", + message: { + id: messageId, + type: "message", + role: "assistant", + model: params.model || "claude-opus-4-8", + content: [], + stop_reason: null, + stop_sequence: null, + usage: { + input_tokens: 64, + output_tokens: 0, + }, + }, + }, + { + type: "content_block_start", + index: 0, + content_block: { + type: "thinking", + thinking: "", + signature: "", + }, + }, + { + type: "content_block_delta", + index: 0, + delta: { + type: "thinking_delta", + thinking: QA_ANTHROPIC_THINKING_ERROR_TEXT, + }, + }, + { + type: "content_block_delta", + index: 0, + delta: { + type: "signature_delta", + signature: QA_ANTHROPIC_THINKING_ERROR_SIGNATURE, + }, + }, + { + type: "content_block_stop", + index: 0, + }, + { + type: "message_delta", + delta: {}, + usage: { + 
input_tokens: 64, + output_tokens: 1120, + }, + }, + { + type: "error", + error: { + type: "api_error", + message: QA_ANTHROPIC_THINKING_ERROR_MESSAGE, + }, + }, + ]; +} + function buildAnthropicMessageStreamEvents(params: { model: string; extracted: ExtractedAssistantOutput; @@ -3254,6 +3340,35 @@ async function buildMessagesPayload( stream: false, ...(Array.isArray(body.tools) ? { tools: body.tools } : {}), }; + const allInputText = extractAllRequestTexts(input, dispatchBody); + if (QA_ANTHROPIC_THINKING_ERROR_RECOVERY_PROMPT_RE.test(allInputText)) { + const toolOutput = extractToolOutput(input); + const shouldEmitThinkingError = + toolOutput.length > 0 && scenarioState.anthropicThinkingErrorPhase === 0; + const events = + toolOutput.length === 0 + ? buildToolCallEventsWithArgs("read", { path: "QA_KICKOFF_TASK.md" }) + : shouldEmitThinkingError + ? (() => { + scenarioState.anthropicThinkingErrorPhase = 1; + return buildAssistantEvents(""); + })() + : buildAssistantEvents("ANTHROPIC-THINKING-ERROR-RECOVERED-OK"); + const extracted = extractFinalAssistantOutputFromEvents(events); + const responseBody = shouldEmitThinkingError + ? buildAnthropicThinkingErrorResponse({ model: normalizedModel }) + : buildAnthropicMessageResponse({ + model: normalizedModel, + extracted, + }); + const streamEvents = shouldEmitThinkingError + ? buildAnthropicThinkingErrorStreamEvents({ model: normalizedModel }) + : buildAnthropicMessageStreamEvents({ + model: normalizedModel, + extracted, + }); + return { events, input, extracted, responseBody, streamEvents, model: normalizedModel }; + } const events = await buildResponsesPayload(dispatchBody, scenarioState); const extracted = extractFinalAssistantOutputFromEvents(events); const responseBody = buildAnthropicMessageResponse({ @@ -3270,6 +3385,7 @@ async function buildMessagesPayload( export async function startQaMockOpenAiServer(params?: { host?: string; port?: number }) { const host = params?.host ?? 
"127.0.0.1"; const scenarioState: MockScenarioState = { + anthropicThinkingErrorPhase: 0, subagentFanoutPhase: 0, subagentHandoffSpawned: false, }; diff --git a/qa/scenarios/runtime/anthropic-thinking-error-recovery-replay-safe-read.md b/qa/scenarios/runtime/anthropic-thinking-error-recovery-replay-safe-read.md new file mode 100644 index 00000000000..a92b51c8934 --- /dev/null +++ b/qa/scenarios/runtime/anthropic-thinking-error-recovery-replay-safe-read.md @@ -0,0 +1,99 @@ +# Anthropic thinking error recovery after replay-safe read + +```yaml qa-scenario +id: anthropic-thinking-error-recovery-replay-safe-read +title: Anthropic thinking error recovery after replay-safe read +surface: runtime +coverage: + primary: + - runtime.anthropic-thinking-error-recovery + secondary: + - runtime.retry-policy +gatewayConfigPatch: + agents: + defaults: + models: + anthropic/claude-opus-4-8: + params: {} +objective: Verify an Anthropic stream error after signed thinking and a replay-safe read retries the same prompt into a visible answer. +successCriteria: + - Scenario is mock-openai only so live lanes do not pick it up implicitly. + - The agent performs a replay-safe read before the Anthropic stream error. + - The runtime retries the same prompt without injecting the visible-answer continuation instruction. + - The final visible reply contains the exact recovery marker. +docsRefs: + - docs/help/testing.md +codeRefs: + - extensions/qa-lab/src/providers/mock-openai/server.ts + - src/agents/embedded-agent-runner/run/incomplete-turn.ts +execution: + kind: flow + summary: Verify Anthropic stream errors after signed thinking recover after a replay-safe read. + config: + requiredProviderMode: mock-openai + anthropicModelRef: anthropic/claude-opus-4-8 + promptSnippet: Anthropic thinking error QA check + prompt: "Anthropic thinking error QA check: read QA_KICKOFF_TASK.md, then answer with exactly ANTHROPIC-THINKING-ERROR-RECOVERED-OK." 
+ expectedReply: ANTHROPIC-THINKING-ERROR-RECOVERED-OK + visibleAnswerRetryNeedle: The previous attempt did not produce a user-visible answer. +``` + +```yaml qa-flow +steps: + - name: retries a thinking-only Anthropic error after a replay-safe read + actions: + - assert: + expr: "env.providerMode === 'mock-openai'" + message: this seeded scenario is mock-openai only + - call: waitForGatewayHealthy + args: + - ref: env + - 60000 + - call: reset + - set: requestCountBefore + value: + expr: "env.mock ? (await fetchJson(`${env.mock.baseUrl}/debug/requests`)).length : 0" + - set: sessionKey + value: + expr: "`agent:qa:anthropic-thinking-error:${randomUUID().slice(0, 8)}`" + - set: modelAck + value: + expr: "await env.gateway.call('sessions.patch', { key: sessionKey, model: config.anthropicModelRef }, { timeoutMs: liveTurnTimeoutMs(env, 45000) })" + - call: runAgentPrompt + args: + - ref: env + - sessionKey: + ref: sessionKey + message: + expr: config.prompt + timeoutMs: + expr: liveTurnTimeoutMs(env, 45000) + - call: waitForOutboundMessage + saveAs: outbound + args: + - ref: state + - lambda: + params: [candidate] + expr: "candidate.conversation.id === 'qa-operator' && candidate.text.includes(config.expectedReply)" + - expr: liveTurnTimeoutMs(env, 30000) + - assert: + expr: "outbound.text.includes(config.expectedReply)" + message: + expr: "`missing Anthropic thinking-error recovery marker: ${outbound.text}`" + - if: + expr: "Boolean(env.mock)" + then: + - set: scenarioRequests + value: + expr: "(await fetchJson(`${env.mock.baseUrl}/debug/requests`)).slice(requestCountBefore)" + - assert: + expr: "scenarioRequests.some((request) => String(request.allInputText ?? '').includes(config.promptSnippet) && request.providerVariant === 'anthropic' && request.plannedToolName === 'read')" + message: expected replay-safe read request on the Anthropic mock route + - assert: + expr: "scenarioRequests.filter((request) => String(request.allInputText ?? 
'').includes(config.promptSnippet) && request.providerVariant === 'anthropic').length >= 3" + message: expected initial read, terminal-error attempt, and same-prompt retry + - assert: + expr: "!scenarioRequests.some((request) => String(request.allInputText ?? '').includes(config.visibleAnswerRetryNeedle))" + message: expected same-prompt retry, not visible-answer continuation retry + detailsExpr: "env.mock ? `${outbound.text}\\nrequests=${String(scenarioRequests?.length ?? 0)}` : outbound.text" +``` diff --git a/src/agents/embedded-agent-helpers.ts b/src/agents/embedded-agent-helpers.ts index 9df5540ee28..08bb930dfd7 100644 --- a/src/agents/embedded-agent-helpers.ts +++ b/src/agents/embedded-agent-helpers.ts @@ -38,6 +38,7 @@ export { isLikelyContextOverflowError, isFailoverAssistantError, isFailoverErrorMessage, + isGenericUnknownStreamErrorMessage, isImageDimensionErrorMessage, isImageSizeError, isOverloadedErrorMessage, diff --git a/src/agents/embedded-agent-helpers/errors.ts b/src/agents/embedded-agent-helpers/errors.ts index d003e87991b..ecb085a67a2 100644 --- a/src/agents/embedded-agent-helpers/errors.ts +++ b/src/agents/embedded-agent-helpers/errors.ts @@ -960,7 +960,7 @@ function isBilling429MessageForProvider(raw: string, provider: string | undefine // stream ends with stopReason === "aborted" | "error" without specific info. Treat // it as a transient transport failure so the configured fallback chain rotates // instead of returning the bare string to the user (#71620). 
-function isGenericUnknownStreamError(raw: string): boolean { +export function isGenericUnknownStreamErrorMessage(raw: string): boolean { return /^\s*an unknown error occurred\.?\s*$/i.test(raw); } @@ -1064,7 +1064,7 @@ function classifyFailoverClassificationFromMessage( if (isAuthErrorMessage(raw)) { return toReasonClassification("auth"); } - if (isGenericUnknownStreamError(raw)) { + if (isGenericUnknownStreamErrorMessage(raw)) { return toReasonClassification("timeout"); } if (isOpenRouterProviderReturnedError(raw, provider)) { diff --git a/src/agents/embedded-agent-runner/run.empty-error-retry.test.ts b/src/agents/embedded-agent-runner/run.empty-error-retry.test.ts index 7f55b2cfc20..1cb77db2467 100644 --- a/src/agents/embedded-agent-runner/run.empty-error-retry.test.ts +++ b/src/agents/embedded-agent-runner/run.empty-error-retry.test.ts @@ -3,6 +3,7 @@ import { beforeAll, beforeEach, describe, expect, it } from "vitest"; import { makeAttemptResult } from "./run.overflow-compaction.fixture.js"; import { loadRunOverflowCompactionHarness, + mockedClassifyAssistantFailoverReason, mockedClassifyFailoverReason, mockedGlobalHookRunner, mockedRunEmbeddedAttempt, @@ -13,21 +14,27 @@ import type { EmbeddedRunAttemptResult } from "./run/types.js"; let runEmbeddedAgent: typeof import("./run.js").runEmbeddedAgent; +type AssistantContent = NonNullable["content"]; + function emptyErrorAttempt( provider: string, model: string, outputTokens = 0, + content: AssistantContent = [], + errorMessage?: string, ): EmbeddedRunAttemptResult { // Models can report stopReason=error with no output after tool activity; that // is replay-safe only when the attempt metadata records no side effects. return makeAttemptResult({ assistantTexts: [], lastAssistant: { + role: "assistant", stopReason: "error", provider, model, - content: [], + content, usage: { input: 100, output: outputTokens, totalTokens: 100 + outputTokens }, + ...(errorMessage ? 
{ errorMessage } : {}), } as unknown as EmbeddedRunAttemptResult["lastAssistant"], }); } @@ -36,6 +43,7 @@ function successAttempt(provider: string, model: string): EmbeddedRunAttemptResu return makeAttemptResult({ assistantTexts: ["Done."], lastAssistant: { + role: "assistant", stopReason: "stop", provider, model, @@ -71,6 +79,118 @@ describe("runEmbeddedAgent silent-error retry", () => { expect(result.payloads).toBeUndefined(); }); + it("retries when stopReason=error emitted only thinking blocks and output tokens", async () => { + mockedRunEmbeddedAttempt.mockResolvedValueOnce( + emptyErrorAttempt("anthropic", "claude-opus-4-8", 1120, [ + { + type: "thinking", + thinking: "internal reasoning before provider error", + thinkingSignature: JSON.stringify({ id: "rs_error", type: "reasoning" }), + }, + ]), + ); + mockedRunEmbeddedAttempt.mockResolvedValueOnce(successAttempt("anthropic", "claude-opus-4-8")); + + const result = await runEmbeddedAgent({ + ...overflowBaseRunParams, + provider: "anthropic", + model: "claude-opus-4-8", + runId: "run-empty-error-retry-thinking-only", + }); + + expect(mockedRunEmbeddedAttempt).toHaveBeenCalledTimes(2); + expect(result.payloads).toBeUndefined(); + }); + + it("retries thinking-only unknown provider errors before assistant failover", async () => { + mockedClassifyFailoverReason.mockReturnValue("timeout"); + mockedRunEmbeddedAttempt.mockResolvedValueOnce( + emptyErrorAttempt( + "anthropic", + "claude-opus-4-8", + 1120, + [ + { + type: "thinking", + thinking: "internal reasoning before provider error", + thinkingSignature: JSON.stringify({ id: "rs_error", type: "reasoning" }), + }, + ], + "An unknown error occurred", + ), + ); + mockedRunEmbeddedAttempt.mockResolvedValueOnce(successAttempt("anthropic", "claude-opus-4-8")); + + const result = await runEmbeddedAgent({ + ...overflowBaseRunParams, + provider: "anthropic", + model: "claude-opus-4-8", + runId: "run-empty-error-retry-before-assistant-failover", + }); + + 
expect(mockedRunEmbeddedAttempt).toHaveBeenCalledTimes(2); + expect(result.payloads).toBeUndefined(); + }); + + it.each([ + ["timeout", "LLM request timed out."], + ["server_error", "Internal server error"], + ] as const)("does not intercept recognized %s failover errors", async (reason, errorMessage) => { + mockedClassifyAssistantFailoverReason.mockReturnValue(reason); + mockedRunEmbeddedAttempt.mockResolvedValueOnce( + emptyErrorAttempt( + "anthropic", + "claude-opus-4-8", + 1120, + [ + { + type: "thinking", + thinking: "internal reasoning before provider error", + thinkingSignature: JSON.stringify({ id: "rs_error", type: "reasoning" }), + }, + ], + errorMessage, + ), + ); + + await runEmbeddedAgent({ + ...overflowBaseRunParams, + provider: "anthropic", + model: "claude-opus-4-8", + runId: `run-empty-error-retry-${reason}`, + }); + + expect(mockedRunEmbeddedAttempt).toHaveBeenCalledTimes(1); + }); + + it("does not intercept concrete non-transient failover errors", async () => { + mockedClassifyFailoverReason.mockReturnValue("model_not_found"); + mockedRunEmbeddedAttempt.mockResolvedValueOnce( + emptyErrorAttempt( + "anthropic", + "missing-model", + 1120, + [ + { + type: "thinking", + thinking: "internal reasoning before provider error", + thinkingSignature: JSON.stringify({ id: "rs_missing_model", type: "reasoning" }), + }, + ], + "model not found", + ), + ); + + await runEmbeddedAgent({ + ...overflowBaseRunParams, + provider: "anthropic", + model: "missing-model", + runId: "run-empty-error-retry-non-transient", + }); + + expect(mockedRunEmbeddedAttempt).toHaveBeenCalledTimes(1); + }); + it("caps retries at MAX_EMPTY_ERROR_RETRIES and surfaces incomplete-turn error", async () => { // 1 initial + 3 retries = 4 attempts, all returning empty-error. 
for (let i = 0; i < 4; i += 1) { @@ -113,6 +233,7 @@ describe("runEmbeddedAgent silent-error retry", () => { makeAttemptResult({ assistantTexts: [], lastAssistant: { + role: "assistant", stopReason: "stop", provider: "plain-provider", model: "plain-model", @@ -156,6 +277,7 @@ describe("runEmbeddedAgent silent-error retry", () => { makeAttemptResult({ assistantTexts: [], lastAssistant: { + role: "assistant", stopReason: "error", provider: "ollama", model: "glm-5.1:cloud", @@ -179,4 +301,57 @@ describe("runEmbeddedAgent silent-error retry", () => { expect(mockedRunEmbeddedAttempt).toHaveBeenCalledTimes(1); expect(result.payloads?.[0]?.isError).toBe(true); }); + + it.each([ + [ + "client tool calls", + { clientToolCalls: [{ name: "browser", params: { url: "https://example.com" } }] }, + ], + ["yield", { yieldDetected: true }], + ["approval prompts", { didSendDeterministicApprovalPrompt: true }], + [ + "heartbeat responses", + { + heartbeatToolResponse: { + outcome: "progress", + notify: false, + summary: "Still working", + }, + }, + ], + ["tool media", { toolMediaUrls: ["file:///tmp/render.png"] }], + ["voice media", { toolAudioAsVoice: true }], + ["trusted local media", { toolTrustedLocalMedia: true }], + [ + "source reply payloads", + { messagingToolSourceReplyPayloads: [{ text: "Delivered through the source reply." 
}] }, + ], + ["delivered source replies", { didDeliverSourceReplyViaMessageTool: true }], + ["tool errors", { lastToolError: { toolName: "read", error: "read failed" } }], + ] satisfies Array<[string, Partial]>)( + "does not retry after terminal %s", + async (_label, attemptState) => { + mockedRunEmbeddedAttempt.mockResolvedValueOnce( + makeAttemptResult({ + ...emptyErrorAttempt("anthropic", "claude-opus-4-8", 1120, [ + { + type: "thinking", + thinking: "internal reasoning before provider error", + thinkingSignature: JSON.stringify({ id: "rs_error", type: "reasoning" }), + }, + ]), + ...attemptState, + }), + ); + + await runEmbeddedAgent({ + ...overflowBaseRunParams, + provider: "anthropic", + model: "claude-opus-4-8", + runId: `run-empty-error-retry-terminal-${_label.replaceAll(" ", "-")}`, + }); + + expect(mockedRunEmbeddedAttempt).toHaveBeenCalledTimes(1); + }, + ); }); diff --git a/src/agents/embedded-agent-runner/run.incomplete-turn.test.ts b/src/agents/embedded-agent-runner/run.incomplete-turn.test.ts index d057f88e592..0dcc22bf747 100644 --- a/src/agents/embedded-agent-runner/run.incomplete-turn.test.ts +++ b/src/agents/embedded-agent-runner/run.incomplete-turn.test.ts @@ -41,6 +41,7 @@ import { resolveRunLivenessState, resolveSilentToolResultReplyPayload, shouldRetryMissingAssistantTurn, + shouldRetrySilentErrorAssistantTurn, shouldTreatEmptyAssistantReplyAsSilent, } from "./run/incomplete-turn.js"; import type { EmbeddedRunAttemptResult } from "./run/types.js"; @@ -693,7 +694,7 @@ describe("runEmbeddedAgent incomplete-turn safety", () => { expect(result.payloads).toBeUndefined(); }); - it("does not retry reasoning-only turns when the assistant ended in error", async () => { + it("retries reasoning-only turns when the assistant ended in error", async () => { mockedClassifyFailoverReason.mockReturnValue(null); mockedRunEmbeddedAttempt.mockResolvedValueOnce( makeAttemptResult({ @@ -714,6 +715,18 @@ describe("runEmbeddedAgent incomplete-turn safety", () => { } 
as unknown as EmbeddedRunAttemptResult["lastAssistant"], }), ); + mockedRunEmbeddedAttempt.mockResolvedValueOnce( + makeAttemptResult({ + assistantTexts: ["Recovered."], + lastAssistant: { + role: "assistant", + stopReason: "stop", + provider: "openai", + model: "gpt-5.4", + content: [{ type: "text", text: "Recovered." }], + } as unknown as EmbeddedRunAttemptResult["lastAssistant"], + }), + ); const result = await runEmbeddedAgent({ ...overflowBaseRunParams, @@ -722,9 +735,8 @@ describe("runEmbeddedAgent incomplete-turn safety", () => { runId: "run-reasoning-only-assistant-error", }); - expect(mockedRunEmbeddedAttempt).toHaveBeenCalledTimes(1); - expect(result.payloads?.[0]?.isError).toBe(true); - expect(result.payloads?.[0]?.text).toContain("Please try again"); + expect(mockedRunEmbeddedAttempt).toHaveBeenCalledTimes(2); + expect(result.payloads).toBeUndefined(); }); it("does not retry reasoning-only turns for non-strict-agentic providers", async () => { @@ -2529,6 +2541,191 @@ describe("runEmbeddedAgent incomplete-turn safety", () => { expect(retryInstruction).toBeNull(); }); + it("surfaces incomplete-turn text for errored signed-thinking-only turns with payloads", () => { + const incompleteTurnText = resolveIncompleteTurnPayloadText({ + payloadCount: 1, + aborted: false, + timedOut: false, + attempt: makeAttemptResult({ + assistantTexts: [], + lastAssistant: { + role: "assistant", + stopReason: "error", + provider: "anthropic", + model: "claude-opus-4-8", + content: [ + { + type: "thinking", + thinking: "internal reasoning before provider error", + thinkingSignature: JSON.stringify({ id: "rs_error_payload", type: "reasoning" }), + }, + ], + } as unknown as EmbeddedRunAttemptResult["lastAssistant"], + }), + }); + + expect(incompleteTurnText).toContain("couldn't generate a response"); + }); + + it.each([ + [ + "heartbeat responses", + { + heartbeatToolResponse: { + outcome: "progress" as const, + notify: false, + summary: "Still working", + }, + }, + ], + ["tool 
media", { toolMediaUrls: ["file:///tmp/render.png"] }], + ["voice media", { toolAudioAsVoice: true }], + ["trusted local media", { toolTrustedLocalMedia: true }], + [ + "source reply payloads", + { messagingToolSourceReplyPayloads: [{ text: "Delivered through the source reply." }] }, + ], + ["delivered source replies", { didDeliverSourceReplyViaMessageTool: true }], + ] satisfies Array<[string, Partial]>)( + "does not replace terminal %s with an incomplete-turn warning", + (_label, attemptState) => { + const incompleteTurnText = resolveIncompleteTurnPayloadText({ + payloadCount: 1, + aborted: false, + timedOut: false, + attempt: makeAttemptResult({ + assistantTexts: [], + ...attemptState, + lastAssistant: { + role: "assistant", + stopReason: "error", + provider: "anthropic", + model: "claude-opus-4-8", + content: [ + { + type: "thinking", + thinking: "internal reasoning before provider error", + thinkingSignature: JSON.stringify({ + id: "rs_terminal_payload", + type: "reasoning", + }), + }, + ], + } as unknown as EmbeddedRunAttemptResult["lastAssistant"], + }), + }); + + expect(incompleteTurnText).toBeNull(); + }, + ); + + it("retries replay-safe errored turns that only emitted thinking blocks", () => { + const assistant = { + role: "assistant", + stopReason: "error", + provider: "anthropic", + model: "claude-opus-4-8", + content: [ + { + type: "thinking", + thinking: "internal reasoning before provider error", + thinkingSignature: JSON.stringify({ id: "rs_error", type: "reasoning" }), + }, + { type: "redacted_thinking", data: "opaque" }, + { type: "text", text: " " }, + ], + usage: { input: 100, output: 1120, totalTokens: 1220 }, + } as unknown as EmbeddedRunAttemptResult["lastAssistant"]; + expect( + shouldRetrySilentErrorAssistantTurn({ + attempt: makeAttemptResult({ assistantTexts: [], lastAssistant: assistant }), + assistant, + }), + ).toBe(true); + }); + + it("does not retry errored empty turns when non-zero output may indicate progress", () => { + const 
assistant = { + role: "assistant", + stopReason: "error", + provider: "ollama", + model: "glm-5.1:cloud", + content: [], + usage: { input: 100, output: 12, totalTokens: 112 }, + } as unknown as EmbeddedRunAttemptResult["lastAssistant"]; + expect( + shouldRetrySilentErrorAssistantTurn({ + attempt: makeAttemptResult({ assistantTexts: [], lastAssistant: assistant }), + assistant, + }), + ).toBe(false); + }); + + it.each([ + { + name: "visible text", + content: [ + { type: "thinking", thinking: "internal", thinkingSignature: "sig" }, + { type: "text", text: "partial answer" }, + ], + }, + { + name: "tool call", + content: [ + { type: "thinking", thinking: "internal", thinkingSignature: "sig" }, + { type: "toolCall", id: "call_1", name: "read", arguments: { path: "README.md" } }, + ], + }, + { + name: "unknown block", + content: [{ type: "provider_metadata", value: "opaque" }], + }, + ])("does not retry errored turns containing $name", ({ content }) => { + const assistant = { + role: "assistant", + stopReason: "error", + provider: "anthropic", + model: "claude-opus-4-8", + content, + usage: { input: 100, output: 1120, totalTokens: 1220 }, + } as unknown as EmbeddedRunAttemptResult["lastAssistant"]; + expect( + shouldRetrySilentErrorAssistantTurn({ + attempt: makeAttemptResult({ assistantTexts: [], lastAssistant: assistant }), + assistant, + }), + ).toBe(false); + }); + + it("does not retry errored thinking-only turns after side effects", () => { + const assistant = { + role: "assistant", + stopReason: "error", + provider: "anthropic", + model: "claude-opus-4-8", + content: [ + { + type: "redacted_thinking", + data: "opaque", + }, + ], + usage: { input: 100, output: 1120, totalTokens: 1220 }, + } as unknown as EmbeddedRunAttemptResult["lastAssistant"]; + expect( + shouldRetrySilentErrorAssistantTurn({ + attempt: makeAttemptResult({ + assistantTexts: [], + replayMetadata: { + hadPotentialSideEffects: true, + replaySafe: false, + }, + lastAssistant: assistant, + }), + 
assistant, + }), + ).toBe(false); + }); + it("detects empty openai-compatible stop turns with non-zero output usage", () => { const retryInstruction = resolveEmptyResponseRetryInstruction({ provider: "llamacpp", diff --git a/src/agents/embedded-agent-runner/run.overflow-compaction.harness.ts b/src/agents/embedded-agent-runner/run.overflow-compaction.harness.ts index cb6e8b8a6e1..5a74185f3c8 100644 --- a/src/agents/embedded-agent-runner/run.overflow-compaction.harness.ts +++ b/src/agents/embedded-agent-runner/run.overflow-compaction.harness.ts @@ -225,6 +225,9 @@ export const mockedIsBillingAssistantError = vi.fn(() => false); export const mockedIsCompactionFailureError = vi.fn(() => false); export const mockedIsFailoverAssistantError = vi.fn(() => false); export const mockedIsFailoverErrorMessage = vi.fn(() => false); +export const mockedIsGenericUnknownStreamErrorMessage = vi.fn((raw: string) => + /^\s*an unknown error occurred\.?\s*$/i.test(raw), +); export const mockedIsLikelyContextOverflowError = vi.fn((msg?: string) => { const lower = normalizeLowercaseStringOrEmpty(msg ?? ""); return ( @@ -412,6 +415,10 @@ export function resetRunOverflowCompactionHarnessMocks(): void { mockedIsFailoverAssistantError.mockReturnValue(false); mockedIsFailoverErrorMessage.mockReset(); mockedIsFailoverErrorMessage.mockReturnValue(false); + mockedIsGenericUnknownStreamErrorMessage.mockReset(); + mockedIsGenericUnknownStreamErrorMessage.mockImplementation((raw: string) => + /^\s*an unknown error occurred\.?\s*$/i.test(raw), + ); mockedIsLikelyContextOverflowError.mockReset(); mockedIsLikelyContextOverflowError.mockImplementation((msg?: string) => { const lower = normalizeLowercaseStringOrEmpty(msg ?? 
""); @@ -642,6 +649,7 @@ export async function loadRunOverflowCompactionHarness(): Promise<{ isLikelyContextOverflowError: mockedIsLikelyContextOverflowError, isFailoverAssistantError: mockedIsFailoverAssistantError, isFailoverErrorMessage: mockedIsFailoverErrorMessage, + isGenericUnknownStreamErrorMessage: mockedIsGenericUnknownStreamErrorMessage, parseImageSizeError: mockedParseImageSizeError, parseImageDimensionError: mockedParseImageDimensionError, isRateLimitAssistantError: mockedIsRateLimitAssistantError, diff --git a/src/agents/embedded-agent-runner/run.ts b/src/agents/embedded-agent-runner/run.ts index 25de572f0e0..337d82e7bc8 100644 --- a/src/agents/embedded-agent-runner/run.ts +++ b/src/agents/embedded-agent-runner/run.ts @@ -72,6 +72,7 @@ import { isCompactionFailureError, isFailoverAssistantError, isFailoverErrorMessage, + isGenericUnknownStreamErrorMessage, isLikelyContextOverflowError, isRateLimitAssistantError, parseImageDimensionError, @@ -107,6 +108,7 @@ import { resolveSelectedOpenAIRuntimeProvider, } from "../openai-routing.js"; import { resolveProviderIdForAuth } from "../provider-auth-aliases.js"; +import { hasOnlyAssistantReasoningContent } from "../replay-turn-classification.js"; import { runAgentCleanupStep } from "../run-cleanup-timeout.js"; import { buildAgentRuntimeAuthPlan } from "../runtime-plan/auth.js"; import { buildAgentRuntimePlan } from "../runtime-plan/build.js"; @@ -195,6 +197,7 @@ import { resolveReplayInvalidFlag, resolveRunLivenessState, shouldRetryMissingAssistantTurn, + shouldRetrySilentErrorAssistantTurn, shouldTreatEmptyAssistantReplyAsSilent, } from "./run/incomplete-turn.js"; import type { RunEmbeddedAgentParams } from "./run/params.js"; @@ -2936,6 +2939,43 @@ async function runEmbeddedAgentInternal( const imageDimensionError = parseImageDimensionError( assistantForFailover?.errorMessage ?? "", ); + // The shared runtime wraps interrupted streams as a timeout. 
Retry that + // wrapper only for reasoning-only output so ordinary timeouts keep failover. + const genericUnknownReasoningError = + assistantFailoverReason === "timeout" && + isGenericUnknownStreamErrorMessage(assistantForFailover?.errorMessage ?? "") && + Boolean(assistantForFailover && hasOnlyAssistantReasoningContent(assistantForFailover)); + const silentErrorRetryReason = + assistantFailoverReason === null || + genericUnknownReasoningError || + assistantFailoverReason === "no_error_details" || + assistantFailoverReason === "unclassified" || + assistantFailoverReason === "unknown"; + // Retry replay-safe non-visible provider errors before assistant + // failover surfaces them as terminal provider failures. + if ( + !authFailure && + !rateLimitFailure && + !billingFailure && + !cloudCodeAssistFormatError && + !imageDimensionError && + !aborted && + !promptError && + !timedOut && + silentErrorRetryReason && + shouldRetrySilentErrorAssistantTurn({ attempt, assistant: assistantForFailover }) && + emptyErrorRetries < MAX_EMPTY_ERROR_RETRIES + ) { + emptyErrorRetries += 1; + log.warn( + `[empty-error-retry] stopReason=error non-visible-output; resubmitting ` + + `attempt=${emptyErrorRetries}/${MAX_EMPTY_ERROR_RETRIES} ` + + `provider=${assistantForFailover?.provider ?? provider} ` + + `model=${assistantForFailover?.model ?? model.id} ` + + `sessionKey=${params.sessionKey ?? params.sessionId}`, + ); + continue; + } // Capture the failing profile before auth-profile rotation mutates `lastProfileId`. 
const failedAssistantProfileId = lastProfileId; const logAssistantFailoverDecision = createFailoverDecisionLogger({ @@ -3602,47 +3642,6 @@ async function runEmbeddedAgentInternal( `provider=${activeErrorContext.provider}/${activeErrorContext.model} attempts=${emptyResponseRetryAttempts}/${maxEmptyResponseRetryAttempts} — surfacing incomplete-turn error`, ); } - // ── silent-error retry ──────────────────────────────────────────── - // Observed with ollama/glm-5.1: a turn can end with stopReason="error" - // and zero output tokens AND empty content after a successful - // tool-call sequence, producing no user-visible text at all. This - // path is narrower than the empty-response continuation retry: - // same prompt, same session transcript (tool results already - // captured), no instruction injection. Placed before the - // incompleteTurnText return so it actually gets a chance to fire. - // - // Content-empty guard: a reasoning-only error (content has thinking - // blocks) is a distinct failure mode handled elsewhere; only retry - // when the assistant truly produced nothing. - // - // Side-effect guard: if the failed attempt already recorded potential - // side effects (messaging tool sent, cron add, mutating tool - // call that wasn't round-tripped as replay-safe), resubmission can - // duplicate those actions. Mirror the gate the other retry resolvers - // use (resolveEmptyResponseRetryInstruction, reasoning-only, planning- - // only), which short-circuit on attempt.replayMetadata.hadPotentialSideEffects. - const silentErrorContent = sessionLastAssistant?.content as Array | undefined; - if ( - incompleteTurnText && - !aborted && - !promptError && - !timedOut && - sessionLastAssistant?.stopReason === "error" && - ((sessionLastAssistant?.usage as { output?: number } | undefined)?.output ?? 0) === 0 && - (silentErrorContent?.length ?? 0) === 0 && - (attempt.replayMetadata ? 
!attempt.replayMetadata.hadPotentialSideEffects : false) && - emptyErrorRetries < MAX_EMPTY_ERROR_RETRIES - ) { - emptyErrorRetries += 1; - log.warn( - `[empty-error-retry] stopReason=error output=0; resubmitting ` + - `attempt=${emptyErrorRetries}/${MAX_EMPTY_ERROR_RETRIES} ` + - `provider=${sessionLastAssistant?.provider ?? provider} ` + - `model=${sessionLastAssistant?.model ?? model.id} ` + - `sessionKey=${params.sessionKey ?? params.sessionId}`, - ); - continue; - } if (incompleteTurnText) { const replayInvalid = resolveReplayInvalidForAttempt(incompleteTurnText); const livenessState = resolveRunLivenessState({ diff --git a/src/agents/embedded-agent-runner/run/incomplete-turn.ts b/src/agents/embedded-agent-runner/run/incomplete-turn.ts index fc606f871c1..eac7a0e0bd2 100644 --- a/src/agents/embedded-agent-runner/run/incomplete-turn.ts +++ b/src/agents/embedded-agent-runner/run/incomplete-turn.ts @@ -16,6 +16,7 @@ import { isStrictAgenticSupportedProviderModel, stripProviderPrefix, } from "../../execution-contract.js"; +import { hasOnlyAssistantReasoningContent } from "../../replay-turn-classification.js"; import type { AgentMessage } from "../../runtime/index.js"; import { isLikelyMutatingToolName } from "../../tool-mutation.js"; import { @@ -44,6 +45,12 @@ type IncompleteTurnAttempt = Pick< | "currentAttemptAssistant" | "yieldDetected" | "didSendDeterministicApprovalPrompt" + | "heartbeatToolResponse" + | "toolMediaUrls" + | "toolAudioAsVoice" + | "toolTrustedLocalMedia" + | "didDeliverSourceReplyViaMessageTool" + | "messagingToolSourceReplyPayloads" | "didSendViaMessagingTool" | "messagingToolSentTexts" | "messagingToolSentMediaUrls" @@ -262,6 +269,35 @@ export function resolveAttemptReplayMetadata(attempt: { return attempt.replayMetadata ?? 
REPLAY_UNSAFE_FALLBACK_METADATA; } +/** Subset of EmbeddedRunAttemptResult fields that hasAttemptTerminalState inspects. */ type TerminalAttemptState = Pick< + EmbeddedRunAttemptResult, + | "clientToolCalls" + | "yieldDetected" + | "didSendDeterministicApprovalPrompt" + | "heartbeatToolResponse" + | "lastToolError" + | "toolMediaUrls" + | "toolAudioAsVoice" + | "toolTrustedLocalMedia" + | "didDeliverSourceReplyViaMessageTool" + | "messagingToolSourceReplyPayloads" +>; + +/** Returns true when any inspected attempt field is set: client tool calls, a detected yield, a deterministic approval prompt, a heartbeat tool response, a last tool error, at least one tool media URL that is non-blank after trimming, the audio-as-voice or trusted-local-media flags, a source reply delivered via the message tool, or a non-empty list of messaging-tool source reply payloads. Returns false when none of them is set. */ function hasAttemptTerminalState(attempt: TerminalAttemptState): boolean { + return Boolean( + attempt.clientToolCalls || + attempt.yieldDetected || + attempt.didSendDeterministicApprovalPrompt || + attempt.heartbeatToolResponse || + attempt.lastToolError || + attempt.toolMediaUrls?.some((url) => url.trim().length > 0) || + attempt.toolAudioAsVoice || + attempt.toolTrustedLocalMedia || + attempt.didDeliverSourceReplyViaMessageTool || + attempt.messagingToolSourceReplyPayloads?.length, + ); +} + /** * Builds the user-visible incomplete-turn warning when a terminal attempt did * not produce a safe final assistant response and no committed delivery/progress @@ -281,16 +317,17 @@ export function resolveIncompleteTurnPayloadText(params: { // produced. (#76477) const toolUseTerminal = params.attempt.lastAssistant?.stopReason === "toolUse"; const assistant = params.attempt.currentAttemptAssistant ?? params.attempt.lastAssistant; - // Unsigned thinking payloads count toward payloadCount but carry no user-visible - // content; bypass the visible-text guard when unsigned thinking was the only output - // so that incomplete-turn stall detection fires below. (#89787) - const unsignedThinkingOnlyTerminal = + // Thinking payloads can count toward payloadCount but carry no user-visible + // content; bypass the visible-text guard when thinking was the only output + // so that incomplete-turn stall detection fires below. 
(#89787, #91953) + const thinkingOnlyTerminal = params.payloadCount !== 0 && !joinAssistantTexts(params.attempt.assistantTexts).length && - isUnsignedThinkingOnlyAssistantTurn(assistant); + !hasAttemptTerminalState(params.attempt) && + Boolean(assistant && hasOnlyAssistantReasoningContent(assistant)); if ( - (params.payloadCount !== 0 && !toolUseTerminal && !unsignedThinkingOnlyTerminal) || + (params.payloadCount !== 0 && !toolUseTerminal && !thinkingOnlyTerminal) || (params.aborted && params.externalAbort) || params.timedOut || params.attempt.clientToolCalls || @@ -330,7 +367,7 @@ export function resolveIncompleteTurnPayloadText(params: { if ( !incompleteTerminalAssistant && !reasoningOnlyAssistant && - !unsignedThinkingOnlyTerminal && + !thinkingOnlyTerminal && !emptyResponseAssistant && stopReason !== "error" ) { @@ -555,6 +592,50 @@ function isUnsignedThinkingOnlyAssistantTurn(message: unknown): boolean { return assessLastAssistantMessage(message as AgentMessage) === "incomplete-thinking"; } +/** Gate used by the runner's silent-error retry: reports whether an errored assistant turn with no visible output may be resubmitted. Returns false when the attempt has any joined assistant text, when hasAttemptTerminalState reports true for the attempt, when the resolved replay metadata reports potential side effects, when the assistant message is missing or its stopReason is not "error", or when its content is not an array. For an empty content array the result is true only if hasPositiveOutputTokenUsage(assistant) is false; for a non-empty array the result is hasOnlyAssistantReasoningContent(assistant). */ export function shouldRetrySilentErrorAssistantTurn(params: { + attempt: Pick< + EmbeddedRunAttemptResult, + | "assistantTexts" + | "clientToolCalls" + | "yieldDetected" + | "didSendDeterministicApprovalPrompt" + | "heartbeatToolResponse" + | "lastToolError" + | "toolMediaUrls" + | "toolAudioAsVoice" + | "toolTrustedLocalMedia" + | "didDeliverSourceReplyViaMessageTool" + | "messagingToolSourceReplyPayloads" + | "replayMetadata" + >; + assistant: EmbeddedRunAttemptResult["lastAssistant"] | null | undefined; +}): boolean { + if (joinAssistantTexts(params.attempt.assistantTexts).length > 0) { + return false; + } + if (hasAttemptTerminalState(params.attempt)) { + return false; + } + if (resolveAttemptReplayMetadata(params.attempt).hadPotentialSideEffects) { + return false; + } + + const assistant = params.assistant; + if (!assistant || assistant.stopReason !== "error") { + return false; + } + + const content = (assistant as { content?: unknown }).content; + if 
(!Array.isArray(content)) { + return false; + } + if (content.length === 0) { + return !hasPositiveOutputTokenUsage(assistant); + } + + return hasOnlyAssistantReasoningContent(assistant); +} + function isEmptyResponseAssistantTurn(params: { payloadCount: number; attempt: Pick< diff --git a/src/agents/replay-turn-classification.ts b/src/agents/replay-turn-classification.ts index d4d26af2df7..66c5d76c863 100644 --- a/src/agents/replay-turn-classification.ts +++ b/src/agents/replay-turn-classification.ts @@ -4,9 +4,9 @@ type AssistantTurnLike = { content?: unknown; }; -/** Returns true when a token-limited turn contains only incomplete provider reasoning. */ -export function isReasoningOnlyLengthAssistantTurn(message: AssistantTurnLike): boolean { - if (message.role !== "assistant" || message.stopReason !== "length") { +/** Returns true when an assistant turn contains only provider reasoning and blank text. */ +export function hasOnlyAssistantReasoningContent(message: AssistantTurnLike): boolean { + if (message.role !== "assistant") { return false; } const content = Array.isArray(message.content) @@ -31,3 +31,8 @@ export function isReasoningOnlyLengthAssistantTurn(message: AssistantTurnLike): } return hasThinking; } + +/** Returns true when a token-limited turn contains only incomplete provider reasoning. */ +export function isReasoningOnlyLengthAssistantTurn(message: AssistantTurnLike): boolean { + return message.stopReason === "length" && hasOnlyAssistantReasoningContent(message); +}