diff --git a/CHANGELOG.md b/CHANGELOG.md index 8fbc6d58f7f..b98a5622eec 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -70,6 +70,7 @@ Docs: https://docs.openclaw.ai - Browser/sandbox: clean up idle tracked tabs opened by primary-agent browser sessions, while preserving active tab reuse and lifecycle cleanup for subagents, cron, and ACP sessions. Fixes #71165. Thanks @dwbutler. - Plugins/Voice Call: pin voice response sessions to `responseModel` before embedded agent runs, avoiding live-session model switch failures when the global default model differs. Fixes #60118. Thanks @xinbenlv. - Media tools: honor the configured web-fetch SSRF policy for media understanding, image/music/video generation references, and PDF inputs, so explicit RFC2544 opt-ins cover WebChat OSS uploads without weakening defaults. Fixes #71300. (#71321) Thanks @neeravmakwana. +- Agents/TTS: suppress successful spoken transcripts from verbose chat tool output when structured voice media is already queued, while preserving text output for non-builtin tool-name collisions. Fixes #71282. Thanks @neeravmakwana. - Gateway/sessions: recover main-agent turns interrupted by a gateway restart from stale transcript-lock evidence, avoiding stuck `status: "running"` sessions without broad post-boot transcript scans. Fixes #70555. Thanks @bitloi. - Codex approvals: keep command approval responses within Codex app-server `availableDecisions`, including deny/cancel fallbacks for prompts that do not offer `decline`. (#71338) Thanks @Lucenx9. - Plugins/Google Meet: include live Chrome-node readiness in `googlemeet setup` and document the Parallels recovery checks, so stale node tokens or disconnected VM browsers are visible before an agent opens a meeting. Thanks @steipete. 
diff --git a/src/agents/pi-embedded-subscribe.handlers.tools.media.test.ts b/src/agents/pi-embedded-subscribe.handlers.tools.media.test.ts index 6a607aed011..17b9e177d90 100644 --- a/src/agents/pi-embedded-subscribe.handlers.tools.media.test.ts +++ b/src/agents/pi-embedded-subscribe.handlers.tools.media.test.ts @@ -235,6 +235,7 @@ describe("handleToolExecutionEnd media emission", () => { shouldEmitToolOutput: true, onToolResult: vi.fn(), toolResultFormat: "plain", + builtinToolNames: new Set(["tts"]), }); await handleToolExecutionEnd(ctx, { @@ -263,6 +264,7 @@ describe("handleToolExecutionEnd media emission", () => { shouldEmitToolOutput: true, onToolResult: vi.fn(), toolResultFormat: "plain", + builtinToolNames: new Set(["tts"]), }); await handleToolExecutionEnd(ctx, { @@ -315,6 +317,40 @@ describe("handleToolExecutionEnd media emission", () => { expect(ctx.state.pendingToolAudioAsVoice).toBe(false); }); + it("keeps verbose TTS text for non-builtin remote media collisions", async () => { + const ctx = createMockContext({ + shouldEmitToolOutput: true, + onToolResult: vi.fn(), + toolResultFormat: "plain", + builtinToolNames: new Set(["web_search"]), + }); + + await handleToolExecutionEnd(ctx, { + type: "tool_execution_end", + toolName: "tts", + toolCallId: "tc-1", + isError: false, + result: { + content: [{ type: "text", text: "remote tool output" }], + details: { + media: { + mediaUrl: "https://example.com/reply.opus", + audioAsVoice: true, + }, + }, + }, + }); + + expect(ctx.emitToolOutput).toHaveBeenCalledWith( + "tts", + undefined, + "remote tool output", + expect.any(Object), + ); + expect(ctx.state.pendingToolMediaUrls).toEqual(["https://example.com/reply.opus"]); + expect(ctx.state.pendingToolAudioAsVoice).toBe(true); + }); + async function handleVerboseGeneratedImage(toolResultFormat: "plain" | "markdown") { const ctx = createMockContext({ shouldEmitToolOutput: true, diff --git a/src/agents/pi-embedded-subscribe.handlers.tools.ts 
b/src/agents/pi-embedded-subscribe.handlers.tools.ts index 2aeb852b59a..b073bfe0010 100644 --- a/src/agents/pi-embedded-subscribe.handlers.tools.ts +++ b/src/agents/pi-embedded-subscribe.handlers.tools.ts @@ -191,10 +191,18 @@ function readApplyPatchSummary(result: unknown): ApplyPatchSummary | null { function shouldSuppressStructuredMediaToolOutput(params: { toolName: string; + rawToolName: string; isToolError: boolean; hasDeliverableStructuredMedia: boolean; + builtinToolNames?: ReadonlySet<string>; }): boolean { - return params.toolName === "tts" && !params.isToolError && params.hasDeliverableStructuredMedia; + return ( + params.toolName === "tts" && + params.rawToolName.trim() === "tts" && + params.builtinToolNames?.has("tts") === true && + !params.isToolError && + params.hasDeliverableStructuredMedia + ); } function buildPatchSummaryText(summary: ApplyPatchSummary): string { @@ -527,8 +535,10 @@ async function emitToolResultOutput(params: { const shouldEmitOutput = !shouldSuppressStructuredMediaToolOutput({ toolName, + rawToolName, isToolError, hasDeliverableStructuredMedia: hasStructuredMedia && mediaUrls.length > 0, + builtinToolNames: ctx.builtinToolNames, }) && (ctx.shouldEmitToolOutput() || shouldEmitCompactToolOutput({ toolName, result, outputText })); if (shouldEmitOutput) {