From 60f93583488a6a636a7eb073a3afdfb688dbb975 Mon Sep 17 00:00:00 2001
From: Peter Steinberger <steipete@gmail.com>
Date: Sat, 25 Apr 2026 17:56:28 +0100
Subject: [PATCH] fix(tts): preserve legacy tool voice hints

---
 CHANGELOG.md                                  |  3 ++
 docs/reference/rich-output-protocol.md        |  1 +
 ...ded-subscribe.handlers.tools.media.test.ts | 28 +++++++++++++++++++
 .../pi-embedded-subscribe.tools.media.test.ts | 27 ++++++++++++++++++
 src/agents/pi-embedded-subscribe.tools.ts     |  9 +++++-
 5 files changed, 67 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5f494e8288f..afbaaf9f9e0 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -25,6 +25,9 @@ Docs: https://docs.openclaw.ai
 
 ### Fixes
 
+- Agents/TTS: preserve legacy `[[audio_as_voice]]` hints on trusted tool-result
+  `MEDIA:` payloads so generated audio is still delivered as a voice note. (#46535)
+  Thanks @azade-c.
 - Telegram/STT: frame inbound voice-note transcripts as machine-generated,
   untrusted text in agent context while preserving raw transcript mention
   detection. Closes #33360. Thanks @smartchainark.
diff --git a/docs/reference/rich-output-protocol.md b/docs/reference/rich-output-protocol.md
index bd5ec4c8c0a..81cadaa6eb8 100644
--- a/docs/reference/rich-output-protocol.md
+++ b/docs/reference/rich-output-protocol.md
@@ -14,6 +14,7 @@ Assistant output can carry a small set of delivery/render directives:
 - `[embed ...]` for Control UI rich rendering
 
 These directives are separate. `MEDIA:` and reply/voice tags remain delivery metadata; `[embed ...]` is the web-only rich render path.
+Trusted tool-result media uses the same `MEDIA:` / `[[audio_as_voice]]` parser before delivery, so legacy tool outputs can still mark an audio attachment as a voice note, even when the tag and the `MEDIA:` line arrive in separate text blocks of the same tool result.
 
 When block streaming is enabled, `MEDIA:` remains single-delivery metadata for a
 turn. If the same media URL is sent in a streamed block and repeated in the final
diff --git a/src/agents/pi-embedded-subscribe.handlers.tools.media.test.ts b/src/agents/pi-embedded-subscribe.handlers.tools.media.test.ts
index 94993168038..432cfdb6d09 100644
--- a/src/agents/pi-embedded-subscribe.handlers.tools.media.test.ts
+++ b/src/agents/pi-embedded-subscribe.handlers.tools.media.test.ts
@@ -165,6 +165,34 @@ describe("handleToolExecutionEnd media emission", () => {
     expect(ctx.state.pendingToolMediaUrls).toEqual(["/tmp/screenshot.png"]);
   });
 
+  it("preserves legacy audio_as_voice when queuing trusted MEDIA tool output", async () => {
+    const onToolResult = vi.fn();
+    const ctx = createMockContext({
+      shouldEmitToolOutput: false,
+      onToolResult,
+      builtinToolNames: new Set(["tts"]),
+    });
+
+    await handleToolExecutionEnd(ctx, {
+      type: "tool_execution_end",
+      toolName: "tts",
+      toolCallId: "tc-1",
+      isError: false,
+      result: {
+        content: [
+          {
+            type: "text",
+            text: "Generated audio reply.\n[[audio_as_voice]]\nMEDIA:/tmp/reply.opus",
+          },
+        ],
+      },
+    });
+
+    expect(onToolResult).not.toHaveBeenCalled();
+    expect(ctx.state.pendingToolMediaUrls).toEqual(["/tmp/reply.opus"]);
+    expect(ctx.state.pendingToolAudioAsVoice).toBe(true);
+  });
+
   it("does NOT emit local media for untrusted tools", async () => {
     const onToolResult = vi.fn();
     const ctx = createMockContext({ shouldEmitToolOutput: false, onToolResult });
diff --git a/src/agents/pi-embedded-subscribe.tools.media.test.ts b/src/agents/pi-embedded-subscribe.tools.media.test.ts
index 8ffab15b574..d4ecdc1a8a4 100644
--- a/src/agents/pi-embedded-subscribe.tools.media.test.ts
+++ b/src/agents/pi-embedded-subscribe.tools.media.test.ts
@@ -51,6 +51,33 @@ describe("extractToolResultMediaPaths", () => {
     });
   });
 
+  it("extracts audioAsVoice from legacy MEDIA text", () => {
+    expect(
+      extractToolResultMediaArtifact({
+        content: [
+          { type: "text", text: "Generated audio\n[[audio_as_voice]]\nMEDIA:/tmp/reply.opus" },
+        ],
+      }),
+    ).toEqual({
+      mediaUrls: ["/tmp/reply.opus"],
+      audioAsVoice: true,
+    });
+  });
+
+  it("keeps legacy audioAsVoice when the tag and MEDIA path are in separate text blocks", () => {
+    expect(
+      extractToolResultMediaArtifact({
+        content: [
+          { type: "text", text: "[[audio_as_voice]]" },
+          { type: "text", text: "MEDIA:/tmp/reply.opus" },
+        ],
+      }),
+    ).toEqual({
+      mediaUrls: ["/tmp/reply.opus"],
+      audioAsVoice: true,
+    });
+  });
+
   it("extracts structured media trust markers", () => {
     expect(
       extractToolResultMediaArtifact({
diff --git a/src/agents/pi-embedded-subscribe.tools.ts b/src/agents/pi-embedded-subscribe.tools.ts
index 14c2be9119f..093c3568127 100644
--- a/src/agents/pi-embedded-subscribe.tools.ts
+++ b/src/agents/pi-embedded-subscribe.tools.ts
@@ -307,6 +307,7 @@ export function extractToolResultMediaArtifact(
   // parser so directive matching and validation stay in sync with outbound
   // reply parsing.
   const paths: string[] = [];
+  let audioAsVoice = false;
   let hasImageContent = false;
   for (const item of content) {
     if (!item || typeof item !== "object") {
@@ -319,6 +320,9 @@ export function extractToolResultMediaArtifact(
     }
     if (entry.type === "text" && typeof entry.text === "string") {
       const parsed = splitMediaFromOutput(entry.text);
+      if (parsed.audioAsVoice) {
+        audioAsVoice = true;
+      }
       if (parsed.mediaUrls?.length) {
         paths.push(...parsed.mediaUrls);
       }
@@ -326,7 +330,10 @@ export function extractToolResultMediaArtifact(
   }
 
   if (paths.length > 0) {
-    return { mediaUrls: paths };
+    return {
+      mediaUrls: paths,
+      ...(audioAsVoice ? { audioAsVoice: true } : {}),
+    };
   }
 
   // Fall back to legacy details.path when image content exists but no