diff --git a/src/agents/pi-embedded-runner.sanitize-session-history.test.ts b/src/agents/pi-embedded-runner.sanitize-session-history.test.ts index 7f6a9be8e05..d1d5c5105c4 100644 --- a/src/agents/pi-embedded-runner.sanitize-session-history.test.ts +++ b/src/agents/pi-embedded-runner.sanitize-session-history.test.ts @@ -17,6 +17,7 @@ import { } from "./pi-embedded-runner.sanitize-session-history.test-harness.js"; import { validateReplayTurns } from "./pi-embedded-runner/replay-history.js"; import { castAgentMessage, castAgentMessages } from "./test-helpers/agent-message-fixtures.js"; +import { extractToolCallsFromAssistant } from "./tool-call-id.js"; import type { TranscriptPolicy } from "./transcript-policy.js"; import { makeZeroUsageSnapshot } from "./usage.js"; @@ -1182,6 +1183,93 @@ describe("sanitizeSessionHistory", () => { ]); }); + it("drops later preserved signed turns that reuse an earlier raw tool id across the transcript", async () => { + setNonGoogleModelApi(); + + const sessionManager = makeMockSessionManager(); + const messages = castAgentMessages([ + makeUserMessage("first"), + makeAssistantMessage( + [ + { type: "thinking", thinking: "internal", thinkingSignature: "sig_1" }, + { type: "toolCall", id: "call1", name: "read", arguments: {} }, + ] as unknown as AssistantMessage["content"], + { stopReason: "toolUse" }, + ), + castAgentMessage({ + role: "toolResult", + toolCallId: "call1", + toolName: "read", + content: [{ type: "text", text: "first result" }], + isError: false, + }), + makeUserMessage("second"), + makeAssistantMessage( + [ + { type: "thinking", thinking: "internal", thinkingSignature: "sig_2" }, + { type: "toolCall", id: "call1", name: "read", arguments: {} }, + ] as unknown as AssistantMessage["content"], + { stopReason: "toolUse" }, + ), + castAgentMessage({ + role: "toolResult", + toolCallId: "call1", + toolName: "read", + content: [{ type: "text", text: "second result" }], + isError: false, + }), + makeUserMessage("retry"), + ]); + + const sanitized = await sanitizeSessionHistory({ + messages, + modelApi: "anthropic-messages", + provider: "anthropic", + modelId: "claude-sonnet-4-6", + sessionManager, + sessionId: TEST_SESSION_ID, + }); + const validated = await validateReplayTurns({ + messages: sanitized, + modelApi: "anthropic-messages", + provider: "anthropic", + modelId: "claude-sonnet-4-6", + sessionId: TEST_SESSION_ID, + }); + + expect( + sanitized.filter( + (message) => + message && + typeof message === "object" && + message.role === "assistant" && + extractToolCallsFromAssistant(message as Extract) + .length > 0, + ), + ).toHaveLength(1); + expect( + sanitized.filter( + (message) => message && typeof message === "object" && message.role === "toolResult", + ), + ).toHaveLength(1); + expect( + validated.filter( + (message) => + message && + typeof message === "object" && + message.role === "assistant" && + extractToolCallsFromAssistant(message as Extract) + .length > 0, + ), + ).toHaveLength(1); + expect( + validated.filter( + (message) => message && typeof message === "object" && message.role === "toolResult", + ), + ).toHaveLength(1); + expect(JSON.stringify(validated)).not.toContain("[tool calls omitted]"); + }); + it("keeps the earlier anthropic replay prefix stable after a later subagent turn", async () => { setNonGoogleModelApi(); diff --git a/src/agents/pi-embedded-runner/run/attempt.tool-call-normalization.ts b/src/agents/pi-embedded-runner/run/attempt.tool-call-normalization.ts index 75052a7c766..94549026813 100644 --- a/src/agents/pi-embedded-runner/run/attempt.tool-call-normalization.ts +++ b/src/agents/pi-embedded-runner/run/attempt.tool-call-normalization.ts @@ -6,6 +6,7 @@ import { isRedactedSessionsSpawnAttachment, sanitizeToolUseResultPairing, } from "../../session-transcript-repair.js"; +import { extractToolCallsFromAssistant } from "../../tool-call-id.js"; import { normalizeToolName } from "../../tool-policy.js"; import { shouldAllowProviderOwnedThinkingReplay } from "../../transcript-policy.js"; import type { TranscriptPolicy } from "../../transcript-policy.js"; @@ -284,7 +285,7 @@ function isReplaySafeThinkingTurn(content: unknown[], allowedToolNames?: Set(); for (const message of messages) { if (!message || typeof message !== "object" || message.role !== "assistant") { @@ -352,7 +354,16 @@ function sanitizeReplayToolCallInputs( message.content.some((block) => isThinkingLikeReplayBlock(block)) && message.content.some((block) => isReplayToolCallBlock(block)) ) { - if (isReplaySafeThinkingTurn(message.content, allowedToolNames)) { + const replaySafeToolCalls = extractToolCallsFromAssistant( + message as Extract, + ); + if ( + isReplaySafeThinkingTurn(message.content, allowedToolNames) && + replaySafeToolCalls.every((toolCall) => !claimedReplaySafeToolCallIds.has(toolCall.id)) + ) { + for (const toolCall of replaySafeToolCalls) { + claimedReplaySafeToolCallIds.add(toolCall.id); + } out.push(message); } else { changed = true; diff --git a/src/agents/session-transcript-repair.test.ts b/src/agents/session-transcript-repair.test.ts index eb5e64d8176..64cb3f81012 100644 --- a/src/agents/session-transcript-repair.test.ts +++ b/src/agents/session-transcript-repair.test.ts @@ -435,6 +435,40 @@ describe("sanitizeToolCallInputs", () => { expect(out).toEqual([]); }); + it("drops later signed-thinking assistant turns that reuse an earlier signed tool id", () => { + const input = castAgentMessages([ + { + role: "assistant", + content: [ + { + type: "thinking", + thinking: "First signed replay turn.", + thinkingSignature: "sig_first", + }, + { type: "toolCall", id: "call_shared", name: "read", arguments: { path: "a" } }, + ], + }, + { + role: "assistant", + content: [ + { + type: "thinking", + thinking: "Second signed replay turn.", + thinkingSignature: "sig_second", + }, + { type: "toolUse", id: "call_shared", name: "read", input: { path: "b" } }, + ], + }, + ]); + + const out = sanitizeToolCallInputs(input, { + allowedToolNames: ["read"], + allowProviderOwnedThinkingReplay: true, + }); + + expect(out).toEqual([input[0]]); + }); + it("drops signed-thinking assistant turns that would require attachment redaction", () => { const secret = "SIGNED_THINKING_ATTACHMENT_SECRET"; // pragma: allowlist secret const input = castAgentMessages([ diff --git a/src/agents/session-transcript-repair.ts b/src/agents/session-transcript-repair.ts index 84fffbf2d01..c69a4da4531 100644 --- a/src/agents/session-transcript-repair.ts +++ b/src/agents/session-transcript-repair.ts @@ -316,6 +316,7 @@ export function repairToolCallInputs( const out: AgentMessage[] = []; const allowedToolNames = normalizeAllowedToolNames(options?.allowedToolNames); const allowProviderOwnedThinkingReplay = options?.allowProviderOwnedThinkingReplay === true; + const claimedReplaySafeToolCallIds = new Set(); for (const msg of messages) { if (!msg || typeof msg !== "object") { @@ -337,7 +338,14 @@ export function repairToolCallInputs( // replay. Preserve the turn only if every sibling tool call is already // valid and requires no redaction or normalization. Otherwise drop the // whole assistant turn rather than mutating provider-owned content. - if (isReplaySafeThinkingAssistantTurn(msg.content, allowedToolNames)) { + const replaySafeToolCalls = extractToolCallsFromAssistant(msg); + if ( + isReplaySafeThinkingAssistantTurn(msg.content, allowedToolNames) && + replaySafeToolCalls.every((toolCall) => !claimedReplaySafeToolCallIds.has(toolCall.id)) + ) { + for (const toolCall of replaySafeToolCalls) { + claimedReplaySafeToolCallIds.add(toolCall.id); + } out.push(msg); } else { droppedToolCalls += countRawToolCallBlocks(msg.content);