refactor(tts): clarify text media directives

This commit is contained in:
Peter Steinberger
2026-04-25 18:18:27 +01:00
parent 67506ac2a9
commit 8e7d382c37
5 changed files with 52 additions and 38 deletions

View File

@@ -25,9 +25,9 @@ Docs: https://docs.openclaw.ai
### Fixes
- Agents/TTS: preserve legacy `[[audio_as_voice]]` hints on trusted tool-result
`MEDIA:` payloads so generated audio still delivers as a voice note. (#46535)
Thanks @azade-c.
- Agents/TTS: preserve `[[audio_as_voice]]` directives on trusted text
tool-result `MEDIA:` payloads so generated audio still delivers as a voice
note. (#46535) Thanks @azade-c.
- Agents/TTS: keep queued tool media when an assistant ends with `NO_REPLY` on
non-block delivery paths, so media-only generated audio replies still send.
(#60025) Thanks @bradlind1.

View File

@@ -14,7 +14,7 @@ Assistant output can carry a small set of delivery/render directives:
- `[embed ...]` for Control UI rich rendering
These directives are separate. `MEDIA:` and reply/voice tags remain delivery metadata; `[embed ...]` is the web-only rich render path.
Trusted tool-result media uses the same `MEDIA:` / `[[audio_as_voice]]` parser before delivery, so legacy tool outputs can still mark an audio attachment as a voice note.
Trusted tool-result media uses the same `MEDIA:` / `[[audio_as_voice]]` parser before delivery, so text tool outputs can still mark an audio attachment as a voice note.
When block streaming is enabled, `MEDIA:` remains single-delivery metadata for a
turn. If the same media URL is sent in a streamed block and repeated in the final

View File

@@ -165,7 +165,7 @@ describe("handleToolExecutionEnd media emission", () => {
expect(ctx.state.pendingToolMediaUrls).toEqual(["/tmp/screenshot.png"]);
});
it("preserves legacy audio_as_voice when queuing trusted MEDIA tool output", async () => {
it("preserves audio_as_voice when queuing trusted text MEDIA tool output", async () => {
const onToolResult = vi.fn();
const ctx = createMockContext({
shouldEmitToolOutput: false,
@@ -245,7 +245,7 @@ describe("handleToolExecutionEnd media emission", () => {
expect(ctx.state.pendingToolMediaUrls).toEqual(["https://example.com/file.png"]);
});
it("does NOT queue legacy MEDIA paths when verbose is full", async () => {
it("does NOT queue text MEDIA paths when verbose is full", async () => {
const onToolResult = vi.fn();
const ctx = createMockContext({ shouldEmitToolOutput: true, onToolResult });

View File

@@ -51,7 +51,7 @@ describe("extractToolResultMediaPaths", () => {
});
});
it("extracts audioAsVoice from legacy MEDIA text", () => {
it("extracts audioAsVoice from text MEDIA directives", () => {
expect(
extractToolResultMediaArtifact({
content: [
@@ -64,7 +64,7 @@ describe("extractToolResultMediaPaths", () => {
});
});
it("keeps legacy audioAsVoice when the tag and MEDIA path are in separate text blocks", () => {
it("keeps audioAsVoice when the tag and MEDIA path are in separate text blocks", () => {
expect(
extractToolResultMediaArtifact({
content: [

View File

@@ -239,7 +239,7 @@ export function filterToolResultMediaUrls(
*
* Strategy (first match wins):
* 1. Read structured `details.media` attachments from tool details.
* 2. Parse legacy `MEDIA:` tokens from text content blocks.
* 2. Parse `MEDIA:` directive tokens from text content blocks.
* 3. Fall back to `details.path` when image content exists (legacy imageResult).
*
* Returns an empty array when no media is found (e.g. Pi SDK `read` tool
@@ -279,6 +279,44 @@ function collectStructuredMediaUrls(media: Record<string, unknown>): string[] {
return Array.from(new Set(urls));
}
/**
 * Scans tool-result content blocks for `MEDIA:` directives embedded in text.
 *
 * Text blocks are parsed with `splitMediaFromOutput` so directive matching
 * stays in sync with outbound reply parsing. Image blocks are recorded so
 * callers can fall back to `details.path` when no directive media is found.
 * `audioAsVoice` is only present (and `true`) when at least one text block
 * carried the voice-note hint.
 */
function extractTextContentMediaArtifact(content: unknown[]): {
  mediaUrls: string[];
  audioAsVoice?: boolean;
  hasImageContent: boolean;
} {
  const urls: string[] = [];
  let voiceHint = false;
  let sawImage = false;
  for (const block of content) {
    // Skip null/undefined and any non-object entries.
    if (typeof block !== "object" || block === null) {
      continue;
    }
    const record = block as Record<string, unknown>;
    if (record.type === "image") {
      sawImage = true;
      continue;
    }
    if (record.type !== "text" || typeof record.text !== "string") {
      continue;
    }
    const parsed = splitMediaFromOutput(record.text);
    if (parsed.audioAsVoice) {
      voiceHint = true;
    }
    if (parsed.mediaUrls?.length) {
      urls.push(...parsed.mediaUrls);
    }
  }
  return {
    mediaUrls: urls,
    // Omit the key entirely when false so consumers can use `in` checks.
    ...(voiceHint ? { audioAsVoice: true } : {}),
    hasImageContent: sawImage,
  };
}
export function extractToolResultMediaArtifact(
result: unknown,
): ToolResultMediaArtifact | undefined {
@@ -303,42 +341,18 @@ export function extractToolResultMediaArtifact(
return undefined;
}
// Extract legacy MEDIA: paths from text content blocks using the shared
// parser so directive matching and validation stay in sync with outbound
// reply parsing.
const paths: string[] = [];
let audioAsVoice = false;
let hasImageContent = false;
for (const item of content) {
if (!item || typeof item !== "object") {
continue;
}
const entry = item as Record<string, unknown>;
if (entry.type === "image") {
hasImageContent = true;
continue;
}
if (entry.type === "text" && typeof entry.text === "string") {
const parsed = splitMediaFromOutput(entry.text);
if (parsed.audioAsVoice) {
audioAsVoice = true;
}
if (parsed.mediaUrls?.length) {
paths.push(...parsed.mediaUrls);
}
}
}
const textMedia = extractTextContentMediaArtifact(content);
if (paths.length > 0) {
if (textMedia.mediaUrls.length > 0) {
return {
mediaUrls: paths,
...(audioAsVoice ? { audioAsVoice: true } : {}),
mediaUrls: textMedia.mediaUrls,
...(textMedia.audioAsVoice ? { audioAsVoice: true } : {}),
};
}
// Fall back to legacy details.path when image content exists but no
// structured media details or MEDIA: text.
if (hasImageContent) {
if (textMedia.hasImageContent) {
const details = record.details as Record<string, unknown> | undefined;
const p = normalizeOptionalString(details?.path) ?? "";
if (p) {