diff --git a/CHANGELOG.md b/CHANGELOG.md index ded3ec79261..a9f6ac49740 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -25,9 +25,9 @@ Docs: https://docs.openclaw.ai ### Fixes -- Agents/TTS: preserve legacy `[[audio_as_voice]]` hints on trusted tool-result - `MEDIA:` payloads so generated audio still delivers as a voice note. (#46535) - Thanks @azade-c. +- Agents/TTS: preserve `[[audio_as_voice]]` directives on trusted text + tool-result `MEDIA:` payloads so generated audio still delivers as a voice + note. (#46535) Thanks @azade-c. - Agents/TTS: keep queued tool media when an assistant ends with `NO_REPLY` on non-block delivery paths, so media-only generated audio replies still send. (#60025) Thanks @bradlind1. diff --git a/docs/reference/rich-output-protocol.md b/docs/reference/rich-output-protocol.md index 81cadaa6eb8..9cbe443e2b1 100644 --- a/docs/reference/rich-output-protocol.md +++ b/docs/reference/rich-output-protocol.md @@ -14,7 +14,7 @@ Assistant output can carry a small set of delivery/render directives: - `[embed ...]` for Control UI rich rendering These directives are separate. `MEDIA:` and reply/voice tags remain delivery metadata; `[embed ...]` is the web-only rich render path. -Trusted tool-result media uses the same `MEDIA:` / `[[audio_as_voice]]` parser before delivery, so legacy tool outputs can still mark an audio attachment as a voice note. +Trusted tool-result media uses the same `MEDIA:` / `[[audio_as_voice]]` parser before delivery, so text tool outputs can still mark an audio attachment as a voice note. When block streaming is enabled, `MEDIA:` remains single-delivery metadata for a turn. 
If the same media URL is sent in a streamed block and repeated in the final diff --git a/src/agents/pi-embedded-subscribe.handlers.tools.media.test.ts b/src/agents/pi-embedded-subscribe.handlers.tools.media.test.ts index 432cfdb6d09..36750fd97fd 100644 --- a/src/agents/pi-embedded-subscribe.handlers.tools.media.test.ts +++ b/src/agents/pi-embedded-subscribe.handlers.tools.media.test.ts @@ -165,7 +165,7 @@ describe("handleToolExecutionEnd media emission", () => { expect(ctx.state.pendingToolMediaUrls).toEqual(["/tmp/screenshot.png"]); }); - it("preserves legacy audio_as_voice when queuing trusted MEDIA tool output", async () => { + it("preserves audio_as_voice when queuing trusted text MEDIA tool output", async () => { const onToolResult = vi.fn(); const ctx = createMockContext({ shouldEmitToolOutput: false, @@ -245,7 +245,7 @@ describe("handleToolExecutionEnd media emission", () => { expect(ctx.state.pendingToolMediaUrls).toEqual(["https://example.com/file.png"]); }); - it("does NOT queue legacy MEDIA paths when verbose is full", async () => { + it("does NOT queue text MEDIA paths when verbose is full", async () => { const onToolResult = vi.fn(); const ctx = createMockContext({ shouldEmitToolOutput: true, onToolResult }); diff --git a/src/agents/pi-embedded-subscribe.tools.media.test.ts b/src/agents/pi-embedded-subscribe.tools.media.test.ts index d4ecdc1a8a4..b688e2829d5 100644 --- a/src/agents/pi-embedded-subscribe.tools.media.test.ts +++ b/src/agents/pi-embedded-subscribe.tools.media.test.ts @@ -51,7 +51,7 @@ describe("extractToolResultMediaPaths", () => { }); }); - it("extracts audioAsVoice from legacy MEDIA text", () => { + it("extracts audioAsVoice from text MEDIA directives", () => { expect( extractToolResultMediaArtifact({ content: [ @@ -64,7 +64,7 @@ describe("extractToolResultMediaPaths", () => { }); }); - it("keeps legacy audioAsVoice when the tag and MEDIA path are in separate text blocks", () => { + it("keeps audioAsVoice when the tag and MEDIA path are 
in separate text blocks", () => { expect( extractToolResultMediaArtifact({ content: [ diff --git a/src/agents/pi-embedded-subscribe.tools.ts b/src/agents/pi-embedded-subscribe.tools.ts index 093c3568127..e796b24321a 100644 --- a/src/agents/pi-embedded-subscribe.tools.ts +++ b/src/agents/pi-embedded-subscribe.tools.ts @@ -239,7 +239,7 @@ export function filterToolResultMediaUrls( * * Strategy (first match wins): * 1. Read structured `details.media` attachments from tool details. - * 2. Parse legacy `MEDIA:` tokens from text content blocks. + * 2. Parse `MEDIA:` directive tokens from text content blocks. * 3. Fall back to `details.path` when image content exists (legacy imageResult). * * Returns an empty array when no media is found (e.g. Pi SDK `read` tool @@ -279,6 +279,44 @@ function collectStructuredMediaUrls(media: Record<string, unknown>): string[] { return Array.from(new Set(urls)); } +function extractTextContentMediaArtifact(content: unknown[]): { + mediaUrls: string[]; + audioAsVoice?: boolean; + hasImageContent: boolean; +} { + const mediaUrls: string[] = []; + let audioAsVoice = false; + let hasImageContent = false; + + for (const item of content) { + if (!item || typeof item !== "object") { + continue; + } + const entry = item as Record<string, unknown>; + if (entry.type === "image") { + hasImageContent = true; + continue; + } + if (entry.type !== "text" || typeof entry.text !== "string") { + continue; + } + + const parsed = splitMediaFromOutput(entry.text); + if (parsed.audioAsVoice) { + audioAsVoice = true; + } + if (parsed.mediaUrls?.length) { + mediaUrls.push(...parsed.mediaUrls); + } + } + + return { + mediaUrls, + ...(audioAsVoice ? 
{ audioAsVoice: true } : {}), + hasImageContent, + }; +} + export function extractToolResultMediaArtifact( result: unknown, ): ToolResultMediaArtifact | undefined { @@ -303,42 +341,18 @@ return undefined; } - // Extract legacy MEDIA: paths from text content blocks using the shared - // parser so directive matching and validation stay in sync with outbound - // reply parsing. - const paths: string[] = []; - let audioAsVoice = false; - let hasImageContent = false; - for (const item of content) { - if (!item || typeof item !== "object") { - continue; - } - const entry = item as Record<string, unknown>; - if (entry.type === "image") { - hasImageContent = true; - continue; - } - if (entry.type === "text" && typeof entry.text === "string") { - const parsed = splitMediaFromOutput(entry.text); - if (parsed.audioAsVoice) { - audioAsVoice = true; - } - if (parsed.mediaUrls?.length) { - paths.push(...parsed.mediaUrls); - } - } - } + const textMedia = extractTextContentMediaArtifact(content); - if (paths.length > 0) { + if (textMedia.mediaUrls.length > 0) { return { - mediaUrls: paths, - ...(audioAsVoice ? { audioAsVoice: true } : {}), + mediaUrls: textMedia.mediaUrls, + ...(textMedia.audioAsVoice ? { audioAsVoice: true } : {}), }; } // Fall back to legacy details.path when image content exists but no // structured media details or MEDIA: text. - if (hasImageContent) { + if (textMedia.hasImageContent) { const details = record.details as Record<string, unknown> | undefined; const p = normalizeOptionalString(details?.path) ?? ""; if (p) {