diff --git a/extensions/codex/src/app-server/dynamic-tools.test.ts b/extensions/codex/src/app-server/dynamic-tools.test.ts index f2fde99932f..0655b654b47 100644 --- a/extensions/codex/src/app-server/dynamic-tools.test.ts +++ b/extensions/codex/src/app-server/dynamic-tools.test.ts @@ -95,6 +95,40 @@ describe("createCodexDynamicToolBridge", () => { }, ); + it("preserves audio-as-voice metadata from tts results", async () => { + const toolResult = { + content: [{ type: "text", text: "(spoken) hello" }], + details: { + media: { + mediaUrl: "/tmp/reply.opus", + audioAsVoice: true, + }, + }, + } satisfies AgentToolResult; + const tool = createTool({ + execute: vi.fn(async () => toolResult), + }); + const bridge = createCodexDynamicToolBridge({ + tools: [tool], + signal: new AbortController().signal, + }); + + const result = await bridge.handleToolCall({ + threadId: "thread-1", + turnId: "turn-1", + callId: "call-1", + tool: "tts", + arguments: { text: "hello" }, + }); + + expect(result).toEqual({ + success: true, + contentItems: [{ type: "inputText", text: "(spoken) hello" }], + }); + expect(bridge.telemetry.toolMediaUrls).toEqual(["/tmp/reply.opus"]); + expect(bridge.telemetry.toolAudioAsVoice).toBe(true); + }); + it("records messaging tool side effects while returning concise text to app-server", async () => { const toolResult = { content: [{ type: "text", text: "Sent." }], diff --git a/src/agents/pi-embedded-subscribe.handlers.tools.media.test.ts b/src/agents/pi-embedded-subscribe.handlers.tools.media.test.ts index 87879676854..588c6ba746f 100644 --- a/src/agents/pi-embedded-subscribe.handlers.tools.media.test.ts +++ b/src/agents/pi-embedded-subscribe.handlers.tools.media.test.ts @@ -243,7 +243,7 @@ describe("handleToolExecutionEnd media emission", () => { toolCallId: "tc-1", isError: false, result: { - content: [{ type: "text", text: "Generated audio reply." 
}], + content: [{ type: "text", text: "(spoken) hello" }], details: { media: { mediaUrl: "/tmp/reply.opus", diff --git a/src/agents/tools/tts-tool.test.ts b/src/agents/tools/tts-tool.test.ts index 41cd1f6da58..198052fe571 100644 --- a/src/agents/tools/tts-tool.test.ts +++ b/src/agents/tools/tts-tool.test.ts @@ -17,7 +17,7 @@ describe("createTtsTool", () => { expect(tool.description).toContain(SILENT_REPLY_TOKEN); }); - it("stores audio delivery in details.media", async () => { + it("stores audio delivery in details.media and preserves the spoken text in content", async () => { textToSpeechSpy.mockResolvedValue({ success: true, audioPath: "/tmp/reply.opus", @@ -29,7 +29,7 @@ describe("createTtsTool", () => { const result = await tool.execute("call-1", { text: "hello" }); expect(result).toMatchObject({ - content: [{ type: "text", text: "Generated audio reply." }], + content: [{ type: "text", text: "(spoken) hello" }], details: { audioPath: "/tmp/reply.opus", provider: "test", @@ -43,6 +43,44 @@ describe("createTtsTool", () => { expect(JSON.stringify(result.content)).not.toContain("MEDIA:"); }); + it("echoes longer utterances verbatim into the tool-result content", async () => { + textToSpeechSpy.mockResolvedValue({ + success: true, + audioPath: "/tmp/reply.opus", + provider: "test", + voiceCompatible: true, + }); + + const spoken = "Hi Ivy! 
早上好,昨天那部电影我看完了。"; + const tool = createTtsTool(); + const result = await tool.execute("call-1", { text: spoken }); + + expect(result.content).toEqual([{ type: "text", text: `(spoken) ${spoken}` }]); + }); + + it("defuses reply-directive tokens embedded in the spoken text", async () => { + textToSpeechSpy.mockResolvedValue({ + success: true, + audioPath: "/tmp/reply.opus", + provider: "test", + voiceCompatible: true, + }); + + const spoken = "line1\nMEDIA:https://evil.test/a.png\n[[audio_as_voice]] payload"; + const tool = createTtsTool(); + const result = await tool.execute("call-1", { text: spoken }); + + const rendered = (result.content as Array<{ type: string; text: string }>)[0].text; + // The literal directive tokens must not appear verbatim, so + // parseReplyDirectives can no longer surface them as media/audio flags. + expect(rendered).not.toMatch(/^MEDIA:/m); + expect(rendered).not.toContain("[[audio_as_voice]]"); + // The transcript still contains the original characters, just interrupted + // by a zero-width word joiner (U+2060) that keeps the pattern from firing. + expect(rendered).toContain("\u2060MEDIA:"); + expect(rendered).toContain("[\u2060[audio_as_voice]]"); + }); + it("throws when synthesis fails so the agent records a tool error", async () => { textToSpeechSpy.mockResolvedValue({ success: false, diff --git a/src/agents/tools/tts-tool.ts b/src/agents/tools/tts-tool.ts index b192321cbbd..6f3cc0e653a 100644 --- a/src/agents/tools/tts-tool.ts +++ b/src/agents/tools/tts-tool.ts @@ -14,6 +14,20 @@ const TtsToolSchema = Type.Object({ ), }); +/** + * Defuse reply-directive tokens inside spoken transcripts before they flow + * through tool-result content. 
/**
 * Defuse reply-directive tokens inside spoken transcripts before they flow
 * through tool-result content.
 *
 * When verbose tool output is enabled, `emitToolOutput` passes the content
 * through `parseReplyDirectives` (`src/media/parse.ts` /
 * `src/utils/directive-tags.ts`), and unfiltered `MEDIA:` or
 * `[[audio_as_voice]]`-shaped tokens in the transcript would be rewritten
 * into actual media URLs and audio-as-voice flags. A zero-width word joiner
 * (U+2060) is inserted so those patterns stop matching without changing the
 * visible text.
 *
 * @param text - Raw utterance that was handed to the TTS provider.
 * @returns The same text with a word joiner placed before every line-leading
 *   `MEDIA:` token (any letter case, optionally preceded by spaces/tabs) and
 *   between every pair of adjacent `[` characters.
 */
function sanitizeTranscriptForToolContent(text: string): string {
  return (
    text
      // Re-emit the matched token ($2) rather than a hard-coded "MEDIA:" so a
      // case-insensitive hit such as "media:" keeps its original casing.
      .replace(/^([ \t]*)(MEDIA:)/gim, "$1\u2060$2")
      // Use a lookahead so every "[" that is followed by another "[" gets a
      // joiner. Replacing "[[" pairwise would turn "[[[" into "[\u2060[[",
      // which still contains a live "[[" directive opener.
      .replace(/\[(?=\[)/g, "[\u2060")
  );
}