From 628f0e80553fe6d0d1d755eb2fef4679b0e2732b Mon Sep 17 00:00:00 2001 From: Neerav Makwana <261249544+neeravmakwana@users.noreply.github.com> Date: Fri, 24 Apr 2026 21:57:11 -0400 Subject: [PATCH] fix: gate tts output suppression on deliverable media --- ...ded-subscribe.handlers.tools.media.test.ts | 29 +++++++++++++++++++ .../pi-embedded-subscribe.handlers.tools.ts | 21 +++++++------- 2 files changed, 40 insertions(+), 10 deletions(-) diff --git a/src/agents/pi-embedded-subscribe.handlers.tools.media.test.ts b/src/agents/pi-embedded-subscribe.handlers.tools.media.test.ts index e72e6ed105c..6a607aed011 100644 --- a/src/agents/pi-embedded-subscribe.handlers.tools.media.test.ts +++ b/src/agents/pi-embedded-subscribe.handlers.tools.media.test.ts @@ -286,6 +286,35 @@ describe("handleToolExecutionEnd media emission", () => { expect(ctx.state.pendingToolAudioAsVoice).toBe(true); }); + it("keeps verbose TTS text when structured local media is not trusted", async () => { + const ctx = createMockContext({ + shouldEmitToolOutput: true, + onToolResult: vi.fn(), + toolResultFormat: "plain", + builtinToolNames: new Set(["tts"]), + }); + + await handleToolExecutionEnd(ctx, { + type: "tool_execution_end", + toolName: "TTS", + toolCallId: "tc-1", + isError: false, + result: { + content: [{ type: "text", text: "(spoken) hello" }], + details: { + media: { + mediaUrl: "/tmp/reply.opus", + audioAsVoice: true, + }, + }, + }, + }); + + expect(ctx.emitToolOutput).toHaveBeenCalled(); + expect(ctx.state.pendingToolMediaUrls).toEqual([]); + expect(ctx.state.pendingToolAudioAsVoice).toBe(false); + }); + async function handleVerboseGeneratedImage(toolResultFormat: "plain" | "markdown") { const ctx = createMockContext({ shouldEmitToolOutput: true, diff --git a/src/agents/pi-embedded-subscribe.handlers.tools.ts b/src/agents/pi-embedded-subscribe.handlers.tools.ts index 0736e3e4de6..2aeb852b59a 100644 --- a/src/agents/pi-embedded-subscribe.handlers.tools.ts +++ b/src/agents/pi-embedded-subscribe.handlers.tools.ts @@ -192,9 +192,9 @@ function readApplyPatchSummary(result: unknown): ApplyPatchSummary | null { function shouldSuppressStructuredMediaToolOutput(params: { toolName: string; isToolError: boolean; - hasStructuredMedia: boolean; + hasDeliverableStructuredMedia: boolean; }): boolean { - return params.toolName === "tts" && !params.isToolError && params.hasStructuredMedia; + return params.toolName === "tts" && !params.isToolError && params.hasDeliverableStructuredMedia; } function buildPatchSummaryText(summary: ApplyPatchSummary): string { @@ -520,8 +520,16 @@ async function emitToolResultOutput(params: { } const outputText = extractToolResultText(sanitizedResult); + const mediaReply = isToolError ? undefined : extractToolResultMediaArtifact(result); + const mediaUrls = mediaReply + ? filterToolResultMediaUrls(rawToolName, mediaReply.mediaUrls, result, ctx.builtinToolNames) + : []; const shouldEmitOutput = - !shouldSuppressStructuredMediaToolOutput({ toolName, isToolError, hasStructuredMedia }) && + !shouldSuppressStructuredMediaToolOutput({ + toolName, + isToolError, + hasDeliverableStructuredMedia: hasStructuredMedia && mediaUrls.length > 0, + }) && (ctx.shouldEmitToolOutput() || shouldEmitCompactToolOutput({ toolName, result, outputText })); if (shouldEmitOutput) { if (outputText) { @@ -543,16 +551,9 @@ async function emitToolResultOutput(params: { return; } - const mediaReply = extractToolResultMediaArtifact(result); if (!mediaReply) { return; } - const mediaUrls = filterToolResultMediaUrls( - rawToolName, - mediaReply.mediaUrls, - result, - ctx.builtinToolNames, - ); const pendingMediaUrls = emittedToolOutputMediaUrls.length === 0 ? mediaUrls