fix(tts): preserve legacy tool voice hints

Peter Steinberger
2026-04-25 17:56:28 +01:00
parent dc7c703425
commit 60f9358348
5 changed files with 67 additions and 1 deletion


@@ -25,6 +25,9 @@ Docs: https://docs.openclaw.ai
### Fixes
- Agents/TTS: preserve legacy `[[audio_as_voice]]` hints on trusted tool-result
  `MEDIA:` payloads so generated audio still delivers as a voice note. (#46535)
  Thanks @azade-c.
- Telegram/STT: frame inbound voice-note transcripts as machine-generated,
  untrusted text in agent context while preserving raw transcript mention
  detection. Closes #33360. Thanks @smartchainark.
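
For context, the legacy shape the first entry refers to is a trusted tool-result text payload in which the voice tag precedes the media line, exactly as exercised by the new tests later in this commit:

```text
Generated audio reply.
[[audio_as_voice]]
MEDIA:/tmp/reply.opus
```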


@@ -14,6 +14,7 @@ Assistant output can carry a small set of delivery/render directives:
- `[embed ...]` for Control UI rich rendering
These directives are separate. `MEDIA:` and reply/voice tags remain delivery metadata; `[embed ...]` is the web-only rich render path.
Trusted tool-result media uses the same `MEDIA:` / `[[audio_as_voice]]` parser before delivery, so legacy tool outputs can still mark an audio attachment as a voice note.
When block streaming is enabled, `MEDIA:` remains single-delivery metadata for a
turn. If the same media URL is sent in a streamed block and repeated in the final
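
A minimal usage sketch of the documented behavior, matching the extractor this commit changes (the import path is assumed for illustration):

```ts
// Import path is an assumption; the extractor itself appears later in this diff.
import { extractToolResultMediaArtifact } from "./tool-result-media";

const artifact = extractToolResultMediaArtifact({
  content: [
    {
      type: "text",
      text: "Generated audio reply.\n[[audio_as_voice]]\nMEDIA:/tmp/reply.opus",
    },
  ],
});
// With this fix: { mediaUrls: ["/tmp/reply.opus"], audioAsVoice: true },
// so the delivery layer can still send the audio as a voice note.
```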


@@ -165,6 +165,34 @@ describe("handleToolExecutionEnd media emission", () => {
    expect(ctx.state.pendingToolMediaUrls).toEqual(["/tmp/screenshot.png"]);
  });

  it("preserves legacy audio_as_voice when queuing trusted MEDIA tool output", async () => {
    const onToolResult = vi.fn();
    const ctx = createMockContext({
      shouldEmitToolOutput: false,
      onToolResult,
      builtinToolNames: new Set(["tts"]),
    });
    await handleToolExecutionEnd(ctx, {
      type: "tool_execution_end",
      toolName: "tts",
      toolCallId: "tc-1",
      isError: false,
      result: {
        content: [
          {
            type: "text",
            text: "Generated audio reply.\n[[audio_as_voice]]\nMEDIA:/tmp/reply.opus",
          },
        ],
      },
    });
    expect(onToolResult).not.toHaveBeenCalled();
    expect(ctx.state.pendingToolMediaUrls).toEqual(["/tmp/reply.opus"]);
    expect(ctx.state.pendingToolAudioAsVoice).toBe(true);
  });

  it("does NOT emit local media for untrusted tools", async () => {
    const onToolResult = vi.fn();
    const ctx = createMockContext({ shouldEmitToolOutput: false, onToolResult });


@@ -51,6 +51,33 @@ describe("extractToolResultMediaPaths", () => {
    });
  });

  it("extracts audioAsVoice from legacy MEDIA text", () => {
    expect(
      extractToolResultMediaArtifact({
        content: [
          { type: "text", text: "Generated audio\n[[audio_as_voice]]\nMEDIA:/tmp/reply.opus" },
        ],
      }),
    ).toEqual({
      mediaUrls: ["/tmp/reply.opus"],
      audioAsVoice: true,
    });
  });

  it("keeps legacy audioAsVoice when the tag and MEDIA path are in separate text blocks", () => {
    expect(
      extractToolResultMediaArtifact({
        content: [
          { type: "text", text: "[[audio_as_voice]]" },
          { type: "text", text: "MEDIA:/tmp/reply.opus" },
        ],
      }),
    ).toEqual({
      mediaUrls: ["/tmp/reply.opus"],
      audioAsVoice: true,
    });
  });

  it("extracts structured media trust markers", () => {
    expect(
      extractToolResultMediaArtifact({


@@ -307,6 +307,7 @@ export function extractToolResultMediaArtifact(
  // parser so directive matching and validation stay in sync with outbound
  // reply parsing.
  const paths: string[] = [];
  let audioAsVoice = false;
  let hasImageContent = false;
  for (const item of content) {
    if (!item || typeof item !== "object") {
@@ -319,6 +320,9 @@ export function extractToolResultMediaArtifact(
    }
    if (entry.type === "text" && typeof entry.text === "string") {
      const parsed = splitMediaFromOutput(entry.text);
      if (parsed.audioAsVoice) {
        audioAsVoice = true;
      }
      if (parsed.mediaUrls?.length) {
        paths.push(...parsed.mediaUrls);
      }
@@ -326,7 +330,10 @@ export function extractToolResultMediaArtifact(
  }
  if (paths.length > 0) {
-    return { mediaUrls: paths };
+    return {
+      mediaUrls: paths,
+      ...(audioAsVoice ? { audioAsVoice: true } : {}),
+    };
  }
  // Fall back to legacy details.path when image content exists but no
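
`splitMediaFromOutput` itself is outside this diff; the extractor above only relies on it returning optional `mediaUrls` plus an `audioAsVoice` flag. A rough sketch of that assumed contract (names and details are illustrative, not the real implementation):

```ts
// Illustrative sketch only. The real splitMediaFromOutput is the shared
// outbound reply parser referenced in the comment above and also handles
// reply/voice tags and validation; this just makes the return shape concrete.
function splitMediaFromOutputSketch(text: string): {
  mediaUrls?: string[];
  audioAsVoice?: boolean;
} {
  const mediaUrls: string[] = [];
  let audioAsVoice = false;
  for (const rawLine of text.split("\n")) {
    const line = rawLine.trim();
    if (line === "[[audio_as_voice]]") {
      // Legacy hint: deliver the accompanying audio as a voice note.
      audioAsVoice = true;
    } else if (line.startsWith("MEDIA:")) {
      mediaUrls.push(line.slice("MEDIA:".length).trim());
    }
  }
  return {
    ...(mediaUrls.length > 0 ? { mediaUrls } : {}),
    ...(audioAsVoice ? { audioAsVoice: true } : {}),
  };
}
```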