mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-06 08:10:44 +00:00
fix(tts): preserve legacy tool voice hints
This commit is contained in:
@@ -25,6 +25,9 @@ Docs: https://docs.openclaw.ai
|
||||
|
||||
### Fixes
|
||||
|
||||
- Agents/TTS: preserve legacy `[[audio_as_voice]]` hints on trusted tool-result
|
||||
`MEDIA:` payloads so generated audio still delivers as a voice note. (#46535)
|
||||
Thanks @azade-c.
|
||||
- Telegram/STT: frame inbound voice-note transcripts as machine-generated,
|
||||
untrusted text in agent context while preserving raw transcript mention
|
||||
detection. Closes #33360. Thanks @smartchainark.
|
||||
|
||||
@@ -14,6 +14,7 @@ Assistant output can carry a small set of delivery/render directives:
|
||||
- `[embed ...]` for Control UI rich rendering
|
||||
|
||||
These directives are separate. `MEDIA:` and reply/voice tags remain delivery metadata; `[embed ...]` is the web-only rich render path.
|
||||
Trusted tool-result media uses the same `MEDIA:` / `[[audio_as_voice]]` parser before delivery, so legacy tool outputs can still mark an audio attachment as a voice note.
|
||||
|
||||
When block streaming is enabled, `MEDIA:` remains single-delivery metadata for a
|
||||
turn. If the same media URL is sent in a streamed block and repeated in the final
|
||||
|
||||
@@ -165,6 +165,34 @@ describe("handleToolExecutionEnd media emission", () => {
|
||||
expect(ctx.state.pendingToolMediaUrls).toEqual(["/tmp/screenshot.png"]);
|
||||
});
|
||||
|
||||
it("preserves legacy audio_as_voice when queuing trusted MEDIA tool output", async () => {
|
||||
const onToolResult = vi.fn();
|
||||
const ctx = createMockContext({
|
||||
shouldEmitToolOutput: false,
|
||||
onToolResult,
|
||||
builtinToolNames: new Set(["tts"]),
|
||||
});
|
||||
|
||||
await handleToolExecutionEnd(ctx, {
|
||||
type: "tool_execution_end",
|
||||
toolName: "tts",
|
||||
toolCallId: "tc-1",
|
||||
isError: false,
|
||||
result: {
|
||||
content: [
|
||||
{
|
||||
type: "text",
|
||||
text: "Generated audio reply.\n[[audio_as_voice]]\nMEDIA:/tmp/reply.opus",
|
||||
},
|
||||
],
|
||||
},
|
||||
});
|
||||
|
||||
expect(onToolResult).not.toHaveBeenCalled();
|
||||
expect(ctx.state.pendingToolMediaUrls).toEqual(["/tmp/reply.opus"]);
|
||||
expect(ctx.state.pendingToolAudioAsVoice).toBe(true);
|
||||
});
|
||||
|
||||
it("does NOT emit local media for untrusted tools", async () => {
|
||||
const onToolResult = vi.fn();
|
||||
const ctx = createMockContext({ shouldEmitToolOutput: false, onToolResult });
|
||||
|
||||
@@ -51,6 +51,33 @@ describe("extractToolResultMediaPaths", () => {
|
||||
});
|
||||
});
|
||||
|
||||
it("extracts audioAsVoice from legacy MEDIA text", () => {
|
||||
expect(
|
||||
extractToolResultMediaArtifact({
|
||||
content: [
|
||||
{ type: "text", text: "Generated audio\n[[audio_as_voice]]\nMEDIA:/tmp/reply.opus" },
|
||||
],
|
||||
}),
|
||||
).toEqual({
|
||||
mediaUrls: ["/tmp/reply.opus"],
|
||||
audioAsVoice: true,
|
||||
});
|
||||
});
|
||||
|
||||
it("keeps legacy audioAsVoice when the tag and MEDIA path are in separate text blocks", () => {
|
||||
expect(
|
||||
extractToolResultMediaArtifact({
|
||||
content: [
|
||||
{ type: "text", text: "[[audio_as_voice]]" },
|
||||
{ type: "text", text: "MEDIA:/tmp/reply.opus" },
|
||||
],
|
||||
}),
|
||||
).toEqual({
|
||||
mediaUrls: ["/tmp/reply.opus"],
|
||||
audioAsVoice: true,
|
||||
});
|
||||
});
|
||||
|
||||
it("extracts structured media trust markers", () => {
|
||||
expect(
|
||||
extractToolResultMediaArtifact({
|
||||
|
||||
@@ -307,6 +307,7 @@ export function extractToolResultMediaArtifact(
|
||||
// parser so directive matching and validation stay in sync with outbound
|
||||
// reply parsing.
|
||||
const paths: string[] = [];
|
||||
let audioAsVoice = false;
|
||||
let hasImageContent = false;
|
||||
for (const item of content) {
|
||||
if (!item || typeof item !== "object") {
|
||||
@@ -319,6 +320,9 @@ export function extractToolResultMediaArtifact(
|
||||
}
|
||||
if (entry.type === "text" && typeof entry.text === "string") {
|
||||
const parsed = splitMediaFromOutput(entry.text);
|
||||
if (parsed.audioAsVoice) {
|
||||
audioAsVoice = true;
|
||||
}
|
||||
if (parsed.mediaUrls?.length) {
|
||||
paths.push(...parsed.mediaUrls);
|
||||
}
|
||||
@@ -326,7 +330,10 @@ export function extractToolResultMediaArtifact(
|
||||
}
|
||||
|
||||
if (paths.length > 0) {
|
||||
return { mediaUrls: paths };
|
||||
return {
|
||||
mediaUrls: paths,
|
||||
...(audioAsVoice ? { audioAsVoice: true } : {}),
|
||||
};
|
||||
}
|
||||
|
||||
// Fall back to legacy details.path when image content exists but no
|
||||
|
||||
Reference in New Issue
Block a user