refactor(tts): clarify text media directives

This commit is contained in:
Peter Steinberger
2026-04-25 18:18:27 +01:00
parent 67506ac2a9
commit 8e7d382c37
5 changed files with 52 additions and 38 deletions

View File

@@ -25,9 +25,9 @@ Docs: https://docs.openclaw.ai
### Fixes
- Agents/TTS: preserve legacy `[[audio_as_voice]]` hints on trusted tool-result
`MEDIA:` payloads so generated audio still delivers as a voice note. (#46535)
Thanks @azade-c.
- Agents/TTS: preserve `[[audio_as_voice]]` directives on trusted text
tool-result `MEDIA:` payloads so generated audio still delivers as a voice
note. (#46535) Thanks @azade-c.
- Agents/TTS: keep queued tool media when an assistant ends with `NO_REPLY` on
non-block delivery paths, so media-only generated audio replies still send.
(#60025) Thanks @bradlind1.

View File

@@ -14,7 +14,7 @@ Assistant output can carry a small set of delivery/render directives:
- `[embed ...]` for Control UI rich rendering
These directives are separate. `MEDIA:` and reply/voice tags remain delivery metadata; `[embed ...]` is the web-only rich render path.
Trusted tool-result media uses the same `MEDIA:` / `[[audio_as_voice]]` parser before delivery, so legacy tool outputs can still mark an audio attachment as a voice note.
Trusted tool-result media uses the same `MEDIA:` / `[[audio_as_voice]]` parser before delivery, so text tool outputs can still mark an audio attachment as a voice note.
When block streaming is enabled, `MEDIA:` remains single-delivery metadata for a
turn. If the same media URL is sent in a streamed block and repeated in the final

View File

@@ -165,7 +165,7 @@ describe("handleToolExecutionEnd media emission", () => {
expect(ctx.state.pendingToolMediaUrls).toEqual(["/tmp/screenshot.png"]);
});
it("preserves legacy audio_as_voice when queuing trusted MEDIA tool output", async () => {
it("preserves audio_as_voice when queuing trusted text MEDIA tool output", async () => {
const onToolResult = vi.fn();
const ctx = createMockContext({
shouldEmitToolOutput: false,
@@ -245,7 +245,7 @@ describe("handleToolExecutionEnd media emission", () => {
expect(ctx.state.pendingToolMediaUrls).toEqual(["https://example.com/file.png"]);
});
it("does NOT queue legacy MEDIA paths when verbose is full", async () => {
it("does NOT queue text MEDIA paths when verbose is full", async () => {
const onToolResult = vi.fn();
const ctx = createMockContext({ shouldEmitToolOutput: true, onToolResult });

View File

@@ -51,7 +51,7 @@ describe("extractToolResultMediaPaths", () => {
});
});
it("extracts audioAsVoice from legacy MEDIA text", () => {
it("extracts audioAsVoice from text MEDIA directives", () => {
expect(
extractToolResultMediaArtifact({
content: [
@@ -64,7 +64,7 @@ describe("extractToolResultMediaPaths", () => {
});
});
it("keeps legacy audioAsVoice when the tag and MEDIA path are in separate text blocks", () => {
it("keeps audioAsVoice when the tag and MEDIA path are in separate text blocks", () => {
expect(
extractToolResultMediaArtifact({
content: [

View File

@@ -239,7 +239,7 @@ export function filterToolResultMediaUrls(
*
* Strategy (first match wins):
* 1. Read structured `details.media` attachments from tool details.
* 2. Parse legacy `MEDIA:` tokens from text content blocks.
* 2. Parse `MEDIA:` directive tokens from text content blocks.
* 3. Fall back to `details.path` when image content exists (legacy imageResult).
*
* Returns an empty array when no media is found (e.g. Pi SDK `read` tool
@@ -279,6 +279,44 @@ function collectStructuredMediaUrls(media: Record<string, unknown>): string[] {
return Array.from(new Set(urls));
}
/**
 * Scans tool-result content blocks for `MEDIA:` directives embedded in text.
 *
 * Text blocks are parsed with `splitMediaFromOutput` so directive matching
 * stays in sync with outbound reply parsing. Image blocks are recorded so
 * callers can fall back to `details.path` when no directive media is found.
 * `audioAsVoice` is only present (and `true`) when at least one text block
 * carried the voice-note hint.
 */
function extractTextContentMediaArtifact(content: unknown[]): {
  mediaUrls: string[];
  audioAsVoice?: boolean;
  hasImageContent: boolean;
} {
  const urls: string[] = [];
  let voiceHint = false;
  let sawImage = false;
  for (const block of content) {
    // Skip null/undefined and any non-object entries.
    if (typeof block !== "object" || block === null) {
      continue;
    }
    const record = block as Record<string, unknown>;
    if (record.type === "image") {
      sawImage = true;
      continue;
    }
    if (record.type !== "text" || typeof record.text !== "string") {
      continue;
    }
    const parsed = splitMediaFromOutput(record.text);
    if (parsed.audioAsVoice) {
      voiceHint = true;
    }
    if (parsed.mediaUrls?.length) {
      urls.push(...parsed.mediaUrls);
    }
  }
  return {
    mediaUrls: urls,
    // Omit the key entirely when false so consumers can use `in` checks.
    ...(voiceHint ? { audioAsVoice: true } : {}),
    hasImageContent: sawImage,
  };
}
export function extractToolResultMediaArtifact(
result: unknown,
): ToolResultMediaArtifact | undefined {
@@ -303,42 +341,18 @@ export function extractToolResultMediaArtifact(
return undefined;
}
// Extract legacy MEDIA: paths from text content blocks using the shared
// parser so directive matching and validation stay in sync with outbound
// reply parsing.
const paths: string[] = [];
let audioAsVoice = false;
let hasImageContent = false;
for (const item of content) {
if (!item || typeof item !== "object") {
continue;
}
const entry = item as Record<string, unknown>;
if (entry.type === "image") {
hasImageContent = true;
continue;
}
if (entry.type === "text" && typeof entry.text === "string") {
const parsed = splitMediaFromOutput(entry.text);
if (parsed.audioAsVoice) {
audioAsVoice = true;
}
if (parsed.mediaUrls?.length) {
paths.push(...parsed.mediaUrls);
}
}
}
const textMedia = extractTextContentMediaArtifact(content);
if (paths.length > 0) {
if (textMedia.mediaUrls.length > 0) {
return {
mediaUrls: paths,
...(audioAsVoice ? { audioAsVoice: true } : {}),
mediaUrls: textMedia.mediaUrls,
...(textMedia.audioAsVoice ? { audioAsVoice: true } : {}),
};
}
// Fall back to legacy details.path when image content exists but no
// structured media details or MEDIA: text.
if (hasImageContent) {
if (textMedia.hasImageContent) {
const details = record.details as Record<string, unknown> | undefined;
const p = normalizeOptionalString(details?.path) ?? "";
if (p) {