From 60f93583488a6a636a7eb073a3afdfb688dbb975 Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Sat, 25 Apr 2026 17:56:28 +0100 Subject: [PATCH] fix(tts): preserve legacy tool voice hints --- CHANGELOG.md | 3 ++ docs/reference/rich-output-protocol.md | 1 + ...ded-subscribe.handlers.tools.media.test.ts | 28 +++++++++++++++++++ .../pi-embedded-subscribe.tools.media.test.ts | 27 ++++++++++++++++++ src/agents/pi-embedded-subscribe.tools.ts | 9 +++++- 5 files changed, 67 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5f494e8288f..afbaaf9f9e0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -25,6 +25,9 @@ Docs: https://docs.openclaw.ai ### Fixes +- Agents/TTS: preserve legacy `[[audio_as_voice]]` hints on trusted tool-result + `MEDIA:` payloads so generated audio still delivers as a voice note. (#46535) + Thanks @azade-c. - Telegram/STT: frame inbound voice-note transcripts as machine-generated, untrusted text in agent context while preserving raw transcript mention detection. Closes #33360. Thanks @smartchainark. diff --git a/docs/reference/rich-output-protocol.md b/docs/reference/rich-output-protocol.md index bd5ec4c8c0a..81cadaa6eb8 100644 --- a/docs/reference/rich-output-protocol.md +++ b/docs/reference/rich-output-protocol.md @@ -14,6 +14,7 @@ Assistant output can carry a small set of delivery/render directives: - `[embed ...]` for Control UI rich rendering These directives are separate. `MEDIA:` and reply/voice tags remain delivery metadata; `[embed ...]` is the web-only rich render path. +Trusted tool-result media uses the same `MEDIA:` / `[[audio_as_voice]]` parser before delivery, so legacy tool outputs can still mark an audio attachment as a voice note. When block streaming is enabled, `MEDIA:` remains single-delivery metadata for a turn. If the same media URL is sent in a streamed block and repeated in the final diff --git a/src/agents/pi-embedded-subscribe.handlers.tools.media.test.ts b/src/agents/pi-embedded-subscribe.handlers.tools.media.test.ts index 94993168038..432cfdb6d09 100644 --- a/src/agents/pi-embedded-subscribe.handlers.tools.media.test.ts +++ b/src/agents/pi-embedded-subscribe.handlers.tools.media.test.ts @@ -165,6 +165,34 @@ describe("handleToolExecutionEnd media emission", () => { expect(ctx.state.pendingToolMediaUrls).toEqual(["/tmp/screenshot.png"]); }); + it("preserves legacy audio_as_voice when queuing trusted MEDIA tool output", async () => { + const onToolResult = vi.fn(); + const ctx = createMockContext({ + shouldEmitToolOutput: false, + onToolResult, + builtinToolNames: new Set(["tts"]), + }); + + await handleToolExecutionEnd(ctx, { + type: "tool_execution_end", + toolName: "tts", + toolCallId: "tc-1", + isError: false, + result: { + content: [ + { + type: "text", + text: "Generated audio reply.\n[[audio_as_voice]]\nMEDIA:/tmp/reply.opus", + }, + ], + }, + }); + + expect(onToolResult).not.toHaveBeenCalled(); + expect(ctx.state.pendingToolMediaUrls).toEqual(["/tmp/reply.opus"]); + expect(ctx.state.pendingToolAudioAsVoice).toBe(true); + }); + it("does NOT emit local media for untrusted tools", async () => { const onToolResult = vi.fn(); const ctx = createMockContext({ shouldEmitToolOutput: false, onToolResult }); diff --git a/src/agents/pi-embedded-subscribe.tools.media.test.ts b/src/agents/pi-embedded-subscribe.tools.media.test.ts index 8ffab15b574..d4ecdc1a8a4 100644 --- a/src/agents/pi-embedded-subscribe.tools.media.test.ts +++ b/src/agents/pi-embedded-subscribe.tools.media.test.ts @@ -51,6 +51,33 @@ describe("extractToolResultMediaPaths", () => { }); }); + it("extracts audioAsVoice from legacy MEDIA text", () => { + expect( + extractToolResultMediaArtifact({ + content: [ + { type: "text", text: "Generated audio\n[[audio_as_voice]]\nMEDIA:/tmp/reply.opus" }, + ], + }), + ).toEqual({ + mediaUrls: ["/tmp/reply.opus"], + audioAsVoice: true, + }); + }); + + it("keeps legacy audioAsVoice when the tag and MEDIA path are in separate text blocks", () => { + expect( + extractToolResultMediaArtifact({ + content: [ + { type: "text", text: "[[audio_as_voice]]" }, + { type: "text", text: "MEDIA:/tmp/reply.opus" }, + ], + }), + ).toEqual({ + mediaUrls: ["/tmp/reply.opus"], + audioAsVoice: true, + }); + }); + it("extracts structured media trust markers", () => { expect( extractToolResultMediaArtifact({ diff --git a/src/agents/pi-embedded-subscribe.tools.ts b/src/agents/pi-embedded-subscribe.tools.ts index 14c2be9119f..093c3568127 100644 --- a/src/agents/pi-embedded-subscribe.tools.ts +++ b/src/agents/pi-embedded-subscribe.tools.ts @@ -307,6 +307,7 @@ export function extractToolResultMediaArtifact( // parser so directive matching and validation stay in sync with outbound // reply parsing. const paths: string[] = []; + let audioAsVoice = false; let hasImageContent = false; for (const item of content) { if (!item || typeof item !== "object") { @@ -319,6 +320,9 @@ export function extractToolResultMediaArtifact( } if (entry.type === "text" && typeof entry.text === "string") { const parsed = splitMediaFromOutput(entry.text); + if (parsed.audioAsVoice) { + audioAsVoice = true; + } if (parsed.mediaUrls?.length) { paths.push(...parsed.mediaUrls); } @@ -326,7 +330,10 @@ export function extractToolResultMediaArtifact( } if (paths.length > 0) { - return { mediaUrls: paths }; + return { + mediaUrls: paths, + ...(audioAsVoice ? { audioAsVoice: true } : {}), + }; } // Fall back to legacy details.path when image content exists but no