fix(tts): preserve legacy tool voice hints

Peter Steinberger
2026-04-25 17:56:28 +01:00
parent dc7c703425
commit 60f9358348
5 changed files with 67 additions and 1 deletion


@@ -25,6 +25,9 @@ Docs: https://docs.openclaw.ai
### Fixes
- Agents/TTS: preserve legacy `[[audio_as_voice]]` hints on trusted tool-result
  `MEDIA:` payloads so generated audio still delivers as a voice note. (#46535)
  Thanks @azade-c.
- Telegram/STT: frame inbound voice-note transcripts as machine-generated,
  untrusted text in agent context while preserving raw transcript mention
  detection. Closes #33360. Thanks @smartchainark.
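
For context, the legacy shape the first entry refers to is a trusted tool-result text payload in which the voice tag precedes the media line, exactly as exercised by the new tests later in this commit:

```text
Generated audio reply.
[[audio_as_voice]]
MEDIA:/tmp/reply.opus
```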


@@ -14,6 +14,7 @@ Assistant output can carry a small set of delivery/render directives:
- `[embed ...]` for Control UI rich rendering
These directives are separate. `MEDIA:` and reply/voice tags remain delivery metadata; `[embed ...]` is the web-only rich render path.
Trusted tool-result media uses the same `MEDIA:` / `[[audio_as_voice]]` parser before delivery, so legacy tool outputs can still mark an audio attachment as a voice note.
When block streaming is enabled, `MEDIA:` remains single-delivery metadata for a
turn. If the same media URL is sent in a streamed block and repeated in the final
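
A minimal usage sketch of the documented behavior, matching the extractor this commit changes (the import path is assumed for illustration):

```ts
// Import path is an assumption; the extractor itself appears later in this diff.
import { extractToolResultMediaArtifact } from "./tool-result-media";

const artifact = extractToolResultMediaArtifact({
  content: [
    {
      type: "text",
      text: "Generated audio reply.\n[[audio_as_voice]]\nMEDIA:/tmp/reply.opus",
    },
  ],
});
// With this fix: { mediaUrls: ["/tmp/reply.opus"], audioAsVoice: true },
// so the delivery layer can still send the audio as a voice note.
```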


@@ -165,6 +165,34 @@ describe("handleToolExecutionEnd media emission", () => {
    expect(ctx.state.pendingToolMediaUrls).toEqual(["/tmp/screenshot.png"]);
  });

  it("preserves legacy audio_as_voice when queuing trusted MEDIA tool output", async () => {
    const onToolResult = vi.fn();
    const ctx = createMockContext({
      shouldEmitToolOutput: false,
      onToolResult,
      builtinToolNames: new Set(["tts"]),
    });
    await handleToolExecutionEnd(ctx, {
      type: "tool_execution_end",
      toolName: "tts",
      toolCallId: "tc-1",
      isError: false,
      result: {
        content: [
          {
            type: "text",
            text: "Generated audio reply.\n[[audio_as_voice]]\nMEDIA:/tmp/reply.opus",
          },
        ],
      },
    });
    expect(onToolResult).not.toHaveBeenCalled();
    expect(ctx.state.pendingToolMediaUrls).toEqual(["/tmp/reply.opus"]);
    expect(ctx.state.pendingToolAudioAsVoice).toBe(true);
  });

  it("does NOT emit local media for untrusted tools", async () => {
    const onToolResult = vi.fn();
    const ctx = createMockContext({ shouldEmitToolOutput: false, onToolResult });


@@ -51,6 +51,33 @@ describe("extractToolResultMediaPaths", () => {
    });
  });

  it("extracts audioAsVoice from legacy MEDIA text", () => {
    expect(
      extractToolResultMediaArtifact({
        content: [
          { type: "text", text: "Generated audio\n[[audio_as_voice]]\nMEDIA:/tmp/reply.opus" },
        ],
      }),
    ).toEqual({
      mediaUrls: ["/tmp/reply.opus"],
      audioAsVoice: true,
    });
  });

  it("keeps legacy audioAsVoice when the tag and MEDIA path are in separate text blocks", () => {
    expect(
      extractToolResultMediaArtifact({
        content: [
          { type: "text", text: "[[audio_as_voice]]" },
          { type: "text", text: "MEDIA:/tmp/reply.opus" },
        ],
      }),
    ).toEqual({
      mediaUrls: ["/tmp/reply.opus"],
      audioAsVoice: true,
    });
  });

  it("extracts structured media trust markers", () => {
    expect(
      extractToolResultMediaArtifact({


@@ -307,6 +307,7 @@ export function extractToolResultMediaArtifact(
  // parser so directive matching and validation stay in sync with outbound
  // reply parsing.
  const paths: string[] = [];
  let audioAsVoice = false;
  let hasImageContent = false;
  for (const item of content) {
    if (!item || typeof item !== "object") {
@@ -319,6 +320,9 @@ export function extractToolResultMediaArtifact(
    }
    if (entry.type === "text" && typeof entry.text === "string") {
      const parsed = splitMediaFromOutput(entry.text);
      if (parsed.audioAsVoice) {
        audioAsVoice = true;
      }
      if (parsed.mediaUrls?.length) {
        paths.push(...parsed.mediaUrls);
      }
@@ -326,7 +330,10 @@ export function extractToolResultMediaArtifact(
  }
  if (paths.length > 0) {
-    return { mediaUrls: paths };
+    return {
+      mediaUrls: paths,
+      ...(audioAsVoice ? { audioAsVoice: true } : {}),
+    };
  }
  // Fall back to legacy details.path when image content exists but no
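
`splitMediaFromOutput` itself is outside this diff; the extractor above only relies on it returning optional `mediaUrls` plus an `audioAsVoice` flag. A rough sketch of that assumed contract (names and details are illustrative, not the real implementation):

```ts
// Illustrative sketch only. The real splitMediaFromOutput is the shared
// outbound reply parser referenced in the comment above and also handles
// reply/voice tags and validation; this just makes the return shape concrete.
function splitMediaFromOutputSketch(text: string): {
  mediaUrls?: string[];
  audioAsVoice?: boolean;
} {
  const mediaUrls: string[] = [];
  let audioAsVoice = false;
  for (const rawLine of text.split("\n")) {
    const line = rawLine.trim();
    if (line === "[[audio_as_voice]]") {
      // Legacy hint: deliver the accompanying audio as a voice note.
      audioAsVoice = true;
    } else if (line.startsWith("MEDIA:")) {
      mediaUrls.push(line.slice("MEDIA:".length).trim());
    }
  }
  return {
    ...(mediaUrls.length > 0 ? { mediaUrls } : {}),
    ...(audioAsVoice ? { audioAsVoice: true } : {}),
  };
}
```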