fix(agents): preserve spoken text in tts tool result

The tts tool previously returned a fixed "Generated audio reply."
string in its content, so session transcripts lost what was actually
spoken. Across every channel, a voice-only reply left no text record
for future turns, forcing users to recover transcripts from the
provider's API. Echo the synthesized text back in the tool result
content (audio still delivered via details.media).

Sanitize the transcript before embedding so crafted utterances cannot
inject reply directives when tool output is rendered in verbose mode.
A `MEDIA:` prefix at the start of a line and `[[…]]` markers are each
interrupted with a zero-width word joiner (U+2060), which defuses
parseReplyDirectives without altering the visible text.
This commit is contained in:
Chris Zhang
2026-04-19 15:44:43 +08:00
committed by Peter Steinberger
parent f0cc29af9a
commit 7b51b7b26f
4 changed files with 95 additions and 4 deletions

View File

@@ -95,6 +95,40 @@ describe("createCodexDynamicToolBridge", () => {
},
);
// Checks that handling a tts tool call returns the tool's text as an
// inputText content item and records the media URL and audioAsVoice flag
// from details.media in the bridge telemetry.
it("preserves audio-as-voice metadata from tts results", async () => {
  const spokenText = "(spoken) hello";
  const audioPath = "/tmp/reply.opus";

  // Canned result handed back by the stubbed tool's execute().
  const ttsResult = {
    content: [{ type: "text", text: spokenText }],
    details: {
      media: { mediaUrl: audioPath, audioAsVoice: true },
    },
  } satisfies AgentToolResult<unknown>;
  const execute = vi.fn(async () => ttsResult);
  const { signal } = new AbortController();

  const dynamicBridge = createCodexDynamicToolBridge({
    tools: [createTool({ execute })],
    signal,
  });

  const response = await dynamicBridge.handleToolCall({
    threadId: "thread-1",
    turnId: "turn-1",
    callId: "call-1",
    tool: "tts",
    arguments: { text: "hello" },
  });

  // The text content comes back as an inputText item.
  expect(response).toEqual({
    success: true,
    contentItems: [{ type: "inputText", text: spokenText }],
  });
  // Media metadata is exposed through telemetry.
  expect(dynamicBridge.telemetry.toolMediaUrls).toEqual([audioPath]);
  expect(dynamicBridge.telemetry.toolAudioAsVoice).toBe(true);
});
it("records messaging tool side effects while returning concise text to app-server", async () => {
const toolResult = {
content: [{ type: "text", text: "Sent." }],