diff --git a/CHANGELOG.md b/CHANGELOG.md index ded3ec79261..a9f6ac49740 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -25,9 +25,9 @@ Docs: https://docs.openclaw.ai ### Fixes -- Agents/TTS: preserve legacy `[[audio_as_voice]]` hints on trusted tool-result - `MEDIA:` payloads so generated audio still delivers as a voice note. (#46535) - Thanks @azade-c. +- Agents/TTS: preserve `[[audio_as_voice]]` directives on trusted text + tool-result `MEDIA:` payloads so generated audio still delivers as a voice + note. (#46535) Thanks @azade-c. - Agents/TTS: keep queued tool media when an assistant ends with `NO_REPLY` on non-block delivery paths, so media-only generated audio replies still send. (#60025) Thanks @bradlind1. diff --git a/docs/reference/rich-output-protocol.md b/docs/reference/rich-output-protocol.md index 81cadaa6eb8..9cbe443e2b1 100644 --- a/docs/reference/rich-output-protocol.md +++ b/docs/reference/rich-output-protocol.md @@ -14,7 +14,7 @@ Assistant output can carry a small set of delivery/render directives: - `[embed ...]` for Control UI rich rendering These directives are separate. `MEDIA:` and reply/voice tags remain delivery metadata; `[embed ...]` is the web-only rich render path. -Trusted tool-result media uses the same `MEDIA:` / `[[audio_as_voice]]` parser before delivery, so legacy tool outputs can still mark an audio attachment as a voice note. +Trusted tool-result media uses the same `MEDIA:` / `[[audio_as_voice]]` parser before delivery, so text tool outputs can still mark an audio attachment as a voice note. When block streaming is enabled, `MEDIA:` remains single-delivery metadata for a turn. 
If the same media URL is sent in a streamed block and repeated in the final diff --git a/src/agents/pi-embedded-subscribe.handlers.tools.media.test.ts b/src/agents/pi-embedded-subscribe.handlers.tools.media.test.ts index 432cfdb6d09..36750fd97fd 100644 --- a/src/agents/pi-embedded-subscribe.handlers.tools.media.test.ts +++ b/src/agents/pi-embedded-subscribe.handlers.tools.media.test.ts @@ -165,7 +165,7 @@ describe("handleToolExecutionEnd media emission", () => { expect(ctx.state.pendingToolMediaUrls).toEqual(["/tmp/screenshot.png"]); }); - it("preserves legacy audio_as_voice when queuing trusted MEDIA tool output", async () => { + it("preserves audio_as_voice when queuing trusted text MEDIA tool output", async () => { const onToolResult = vi.fn(); const ctx = createMockContext({ shouldEmitToolOutput: false, @@ -245,7 +245,7 @@ describe("handleToolExecutionEnd media emission", () => { expect(ctx.state.pendingToolMediaUrls).toEqual(["https://example.com/file.png"]); }); - it("does NOT queue legacy MEDIA paths when verbose is full", async () => { + it("does NOT queue text MEDIA paths when verbose is full", async () => { const onToolResult = vi.fn(); const ctx = createMockContext({ shouldEmitToolOutput: true, onToolResult }); diff --git a/src/agents/pi-embedded-subscribe.tools.media.test.ts b/src/agents/pi-embedded-subscribe.tools.media.test.ts index d4ecdc1a8a4..b688e2829d5 100644 --- a/src/agents/pi-embedded-subscribe.tools.media.test.ts +++ b/src/agents/pi-embedded-subscribe.tools.media.test.ts @@ -51,7 +51,7 @@ describe("extractToolResultMediaPaths", () => { }); }); - it("extracts audioAsVoice from legacy MEDIA text", () => { + it("extracts audioAsVoice from text MEDIA directives", () => { expect( extractToolResultMediaArtifact({ content: [ @@ -64,7 +64,7 @@ describe("extractToolResultMediaPaths", () => { }); }); - it("keeps legacy audioAsVoice when the tag and MEDIA path are in separate text blocks", () => { + it("keeps audioAsVoice when the tag and MEDIA path are 
in separate text blocks", () => { expect( extractToolResultMediaArtifact({ content: [ diff --git a/src/agents/pi-embedded-subscribe.tools.ts b/src/agents/pi-embedded-subscribe.tools.ts index 093c3568127..e796b24321a 100644 --- a/src/agents/pi-embedded-subscribe.tools.ts +++ b/src/agents/pi-embedded-subscribe.tools.ts @@ -239,7 +239,7 @@ export function filterToolResultMediaUrls( * * Strategy (first match wins): * 1. Read structured `details.media` attachments from tool details. - * 2. Parse legacy `MEDIA:` tokens from text content blocks. + * 2. Parse `MEDIA:` directive tokens from text content blocks. * 3. Fall back to `details.path` when image content exists (legacy imageResult). * * Returns an empty array when no media is found (e.g. Pi SDK `read` tool @@ -279,6 +279,44 @@ function collectStructuredMediaUrls(media: Record<string, unknown>): string[] { return Array.from(new Set(urls)); } +function extractTextContentMediaArtifact(content: unknown[]): { + mediaUrls: string[]; + audioAsVoice?: boolean; + hasImageContent: boolean; +} { + const mediaUrls: string[] = []; + let audioAsVoice = false; + let hasImageContent = false; + + for (const item of content) { + if (!item || typeof item !== "object") { + continue; + } + const entry = item as Record<string, unknown>; + if (entry.type === "image") { + hasImageContent = true; + continue; + } + if (entry.type !== "text" || typeof entry.text !== "string") { + continue; + } + + const parsed = splitMediaFromOutput(entry.text); + if (parsed.audioAsVoice) { + audioAsVoice = true; + } + if (parsed.mediaUrls?.length) { + mediaUrls.push(...parsed.mediaUrls); + } + } + + return { + mediaUrls, + ...(audioAsVoice ? 
{ audioAsVoice: true } : {}), + hasImageContent, + }; +} + export function extractToolResultMediaArtifact( result: unknown, ): ToolResultMediaArtifact | undefined { @@ -303,42 +341,18 @@ return undefined; } - // Extract legacy MEDIA: paths from text content blocks using the shared - // parser so directive matching and validation stay in sync with outbound - // reply parsing. - const paths: string[] = []; - let audioAsVoice = false; - let hasImageContent = false; - for (const item of content) { - if (!item || typeof item !== "object") { - continue; - } - const entry = item as Record<string, unknown>; - if (entry.type === "image") { - hasImageContent = true; - continue; - } - if (entry.type === "text" && typeof entry.text === "string") { - const parsed = splitMediaFromOutput(entry.text); - if (parsed.audioAsVoice) { - audioAsVoice = true; - } - if (parsed.mediaUrls?.length) { - paths.push(...parsed.mediaUrls); - } - } - } + const textMedia = extractTextContentMediaArtifact(content); - if (paths.length > 0) { + if (textMedia.mediaUrls.length > 0) { return { - mediaUrls: paths, - ...(audioAsVoice ? { audioAsVoice: true } : {}), + mediaUrls: textMedia.mediaUrls, + ...(textMedia.audioAsVoice ? { audioAsVoice: true } : {}), }; } // Fall back to legacy details.path when image content exists but no // structured media details or MEDIA: text. - if (hasImageContent) { + if (textMedia.hasImageContent) { const details = record.details as Record<string, unknown> | undefined; const p = normalizeOptionalString(details?.path) ?? ""; if (p) {