diff --git a/src/agents/pi-embedded-subscribe.handlers.tools.media.test.ts b/src/agents/pi-embedded-subscribe.handlers.tools.media.test.ts index e1c2274564a..87879676854 100644 --- a/src/agents/pi-embedded-subscribe.handlers.tools.media.test.ts +++ b/src/agents/pi-embedded-subscribe.handlers.tools.media.test.ts @@ -231,7 +231,11 @@ describe("handleToolExecutionEnd media emission", () => { }); it("still queues structured media when verbose is full", async () => { - const ctx = createMockContext({ shouldEmitToolOutput: true, onToolResult: vi.fn() }); + const ctx = createMockContext({ + shouldEmitToolOutput: true, + onToolResult: vi.fn(), + toolResultFormat: "plain", + }); await handleToolExecutionEnd(ctx, { type: "tool_execution_end", @@ -254,6 +258,34 @@ describe("handleToolExecutionEnd media emission", () => { expect(ctx.state.pendingToolAudioAsVoice).toBe(true); }); + it("does not queue a duplicate voice copy when emitted tool output already sent the same audio", async () => { + const ctx = createMockContext({ + shouldEmitToolOutput: true, + onToolResult: vi.fn(), + toolResultFormat: "plain", + }); + + await handleToolExecutionEnd(ctx, { + type: "tool_execution_end", + toolName: "tts", + toolCallId: "tc-1", + isError: false, + result: { + content: [{ type: "text", text: "Generated audio reply.\nMEDIA:/tmp/reply.opus" }], + details: { + media: { + mediaUrl: "/tmp/reply.opus", + audioAsVoice: true, + }, + }, + }, + }); + + expect(ctx.emitToolOutput).toHaveBeenCalled(); + expect(ctx.state.pendingToolMediaUrls).toEqual([]); + expect(ctx.state.pendingToolAudioAsVoice).toBe(false); + }); + async function handleVerboseGeneratedImage(toolResultFormat: "plain" | "markdown") { const ctx = createMockContext({ shouldEmitToolOutput: true, diff --git a/src/agents/pi-embedded-subscribe.handlers.tools.ts b/src/agents/pi-embedded-subscribe.handlers.tools.ts index 8c67904288a..90ec01ec475 100644 --- a/src/agents/pi-embedded-subscribe.handlers.tools.ts +++ b/src/agents/pi-embedded-subscribe.handlers.tools.ts @@ -544,7 +544,7 @@ async function emitToolResultOutput(params: { ctx.builtinToolNames, ); const pendingMediaUrls = - mediaReply.audioAsVoice || emittedToolOutputMediaUrls.length === 0 + emittedToolOutputMediaUrls.length === 0 ? mediaUrls : mediaUrls.filter((url) => !emittedToolOutputMediaUrls.includes(url)); if (pendingMediaUrls.length === 0) {