diff --git a/src/agents/pi-embedded-subscribe.handlers.tools.media.test.ts b/src/agents/pi-embedded-subscribe.handlers.tools.media.test.ts index 588c6ba746f..e72e6ed105c 100644 --- a/src/agents/pi-embedded-subscribe.handlers.tools.media.test.ts +++ b/src/agents/pi-embedded-subscribe.handlers.tools.media.test.ts @@ -230,7 +230,7 @@ describe("handleToolExecutionEnd media emission", () => { expect(ctx.state.pendingToolMediaUrls).toEqual([]); }); - it("still queues structured media when verbose is full", async () => { + it("queues TTS structured media without leaking spoken text when verbose is full", async () => { const ctx = createMockContext({ shouldEmitToolOutput: true, onToolResult: vi.fn(), @@ -253,12 +253,12 @@ describe("handleToolExecutionEnd media emission", () => { }, }); - expect(ctx.emitToolOutput).toHaveBeenCalled(); + expect(ctx.emitToolOutput).not.toHaveBeenCalled(); expect(ctx.state.pendingToolMediaUrls).toEqual(["/tmp/reply.opus"]); expect(ctx.state.pendingToolAudioAsVoice).toBe(true); }); - it("does not queue a duplicate voice copy when emitted tool output already sent the same audio", async () => { + it("queues one voice copy when TTS output also contains a legacy media directive", async () => { const ctx = createMockContext({ shouldEmitToolOutput: true, onToolResult: vi.fn(), @@ -281,9 +281,9 @@ describe("handleToolExecutionEnd media emission", () => { }, }); - expect(ctx.emitToolOutput).toHaveBeenCalled(); - expect(ctx.state.pendingToolMediaUrls).toEqual([]); - expect(ctx.state.pendingToolAudioAsVoice).toBe(false); + expect(ctx.emitToolOutput).not.toHaveBeenCalled(); + expect(ctx.state.pendingToolMediaUrls).toEqual(["/tmp/reply.opus"]); + expect(ctx.state.pendingToolAudioAsVoice).toBe(true); }); async function handleVerboseGeneratedImage(toolResultFormat: "plain" | "markdown") { diff --git a/src/agents/pi-embedded-subscribe.handlers.tools.ts b/src/agents/pi-embedded-subscribe.handlers.tools.ts index 90ec01ec475..0736e3e4de6 100644 --- a/src/agents/pi-embedded-subscribe.handlers.tools.ts +++ b/src/agents/pi-embedded-subscribe.handlers.tools.ts @@ -189,6 +189,14 @@ function readApplyPatchSummary(result: unknown): ApplyPatchSummary | null { return { added, modified, deleted }; } +function shouldSuppressStructuredMediaToolOutput(params: { + toolName: string; + isToolError: boolean; + hasStructuredMedia: boolean; +}): boolean { + return params.toolName === "tts" && !params.isToolError && params.hasStructuredMedia; +} + function buildPatchSummaryText(summary: ApplyPatchSummary): string { const parts: string[] = []; if (summary.added.length > 0) { @@ -443,7 +451,7 @@ async function emitToolResultOutput(params: { sanitizedResult: unknown; }) { const { ctx, toolName, rawToolName, meta, isToolError, result, sanitizedResult } = params; - const hasStructuredMedia = + const hasStructuredMedia = Boolean( result && typeof result === "object" && (result as { details?: unknown }).details && @@ -451,7 +459,8 @@ async function emitToolResultOutput(params: { !Array.isArray((result as { details?: unknown }).details) && typeof ((result as { details?: { media?: unknown } }).details?.media ?? undefined) === "object" && - !Array.isArray((result as { details?: { media?: unknown } }).details?.media); + !Array.isArray((result as { details?: { media?: unknown } }).details?.media), + ); const approvalPending = readExecApprovalPendingDetails(result); let emittedToolOutputMediaUrls: string[] = []; if (!isToolError && approvalPending) { @@ -512,7 +521,8 @@ async function emitToolResultOutput(params: { const outputText = extractToolResultText(sanitizedResult); const shouldEmitOutput = - ctx.shouldEmitToolOutput() || shouldEmitCompactToolOutput({ toolName, result, outputText }); + !shouldSuppressStructuredMediaToolOutput({ toolName, isToolError, hasStructuredMedia }) && + (ctx.shouldEmitToolOutput() || shouldEmitCompactToolOutput({ toolName, result, outputText })); if (shouldEmitOutput) { if (outputText) { ctx.emitToolOutput(rawToolName, meta, outputText, result);