fix(agents): preserve spoken text in tts tool result

The tts tool previously returned a fixed "Generated audio reply." string in its content, so session transcripts lost what was actually spoken. Across every channel, a voice-only reply left no text record for future turns, forcing users to recover transcripts from the provider's API. Echo the synthesized text back in the tool result content (audio still delivered via details.media). Sanitize the transcript before embedding so crafted utterances cannot inject reply directives when tool output is rendered in verbose mode: MEDIA: at line start and [[…]] markers are interrupted with a zero-width word joiner (U+2060) that defuses parseReplyDirectives without altering the visible text.
2026-05-06 16:50:43 +00:00 · 2026-04-19 15:44:43 +08:00
parent f0cc29af9a
commit 7b51b7b26f
4 changed files with 95 additions and 4 deletions
--- a/extensions/codex/src/app-server/dynamic-tools.test.ts
+++ b/extensions/codex/src/app-server/dynamic-tools.test.ts
@@ -95,6 +95,40 @@ describe("createCodexDynamicToolBridge", () => {
    },
  );

+  it("preserves audio-as-voice metadata from tts results", async () => {
+    const toolResult = {
+      content: [{ type: "text", text: "(spoken) hello" }],
+      details: {
+        media: {
+          mediaUrl: "/tmp/reply.opus",
+          audioAsVoice: true,
+        },
+      },
+    } satisfies AgentToolResult<unknown>;
+    const tool = createTool({
+      execute: vi.fn(async () => toolResult),
+    });
+    const bridge = createCodexDynamicToolBridge({
+      tools: [tool],
+      signal: new AbortController().signal,
+    });
+
+    const result = await bridge.handleToolCall({
+      threadId: "thread-1",
+      turnId: "turn-1",
+      callId: "call-1",
+      tool: "tts",
+      arguments: { text: "hello" },
+    });
+
+    expect(result).toEqual({
+      success: true,
+      contentItems: [{ type: "inputText", text: "(spoken) hello" }],
+    });
+    expect(bridge.telemetry.toolMediaUrls).toEqual(["/tmp/reply.opus"]);
+    expect(bridge.telemetry.toolAudioAsVoice).toBe(true);
+  });
+
  it("records messaging tool side effects while returning concise text to app-server", async () => {
    const toolResult = {
      content: [{ type: "text", text: "Sent." }],
--- a/src/agents/pi-embedded-subscribe.handlers.tools.media.test.ts
+++ b/src/agents/pi-embedded-subscribe.handlers.tools.media.test.ts
@@ -243,7 +243,7 @@ describe("handleToolExecutionEnd media emission", () => {
      toolCallId: "tc-1",
      isError: false,
      result: {
-        content: [{ type: "text", text: "Generated audio reply." }],
+        content: [{ type: "text", text: "(spoken) hello" }],
        details: {
          media: {
            mediaUrl: "/tmp/reply.opus",
--- a/src/agents/tools/tts-tool.test.ts
+++ b/src/agents/tools/tts-tool.test.ts
@@ -17,7 +17,7 @@ describe("createTtsTool", () => {
    expect(tool.description).toContain(SILENT_REPLY_TOKEN);
  });

-  it("stores audio delivery in details.media", async () => {
+  it("stores audio delivery in details.media and preserves the spoken text in content", async () => {
    textToSpeechSpy.mockResolvedValue({
      success: true,
      audioPath: "/tmp/reply.opus",
@@ -29,7 +29,7 @@ describe("createTtsTool", () => {
    const result = await tool.execute("call-1", { text: "hello" });

    expect(result).toMatchObject({
-      content: [{ type: "text", text: "Generated audio reply." }],
+      content: [{ type: "text", text: "(spoken) hello" }],
      details: {
        audioPath: "/tmp/reply.opus",
        provider: "test",
@@ -43,6 +43,44 @@ describe("createTtsTool", () => {
    expect(JSON.stringify(result.content)).not.toContain("MEDIA:");
  });

+  it("echoes longer utterances verbatim into the tool-result content", async () => {
+    textToSpeechSpy.mockResolvedValue({
+      success: true,
+      audioPath: "/tmp/reply.opus",
+      provider: "test",
+      voiceCompatible: true,
+    });
+
+    const spoken = "Hi Ivy! 早上好,昨天那部电影我看完了。";
+    const tool = createTtsTool();
+    const result = await tool.execute("call-1", { text: spoken });
+
+    expect(result.content).toEqual([{ type: "text", text: `(spoken) ${spoken}` }]);
+  });
+
+  it("defuses reply-directive tokens embedded in the spoken text", async () => {
+    textToSpeechSpy.mockResolvedValue({
+      success: true,
+      audioPath: "/tmp/reply.opus",
+      provider: "test",
+      voiceCompatible: true,
+    });
+
+    const spoken = "line1\nMEDIA:https://evil.test/a.png\n[[audio_as_voice]] payload";
+    const tool = createTtsTool();
+    const result = await tool.execute("call-1", { text: spoken });
+
+    const rendered = (result.content as Array<{ type: string; text: string }>)[0].text;
+    // The literal directive tokens must not appear verbatim, so
+    // parseReplyDirectives can no longer surface them as media/audio flags.
+    expect(rendered).not.toMatch(/^MEDIA:/m);
+    expect(rendered).not.toContain("[[audio_as_voice]]");
+    // The transcript still contains the original characters, just interrupted
+    // by a zero-width word joiner (U+2060) that keeps the pattern from firing.
+    expect(rendered).toContain("\u2060MEDIA:");
+    expect(rendered).toContain("[\u2060[audio_as_voice]]");
+  });
+
  it("throws when synthesis fails so the agent records a tool error", async () => {
    textToSpeechSpy.mockResolvedValue({
      success: false,
--- a/src/agents/tools/tts-tool.ts
+++ b/src/agents/tools/tts-tool.ts
@@ -14,6 +14,20 @@ const TtsToolSchema = Type.Object({
  ),
 });

+/**
+ * Defuse reply-directive tokens inside spoken transcripts before they flow
+ * through tool-result content. When verbose tool output is enabled,
+ * `emitToolOutput` passes the content through `parseReplyDirectives`
+ * (`src/media/parse.ts` / `src/utils/directive-tags.ts`), and unfiltered
+ * `MEDIA:` or `[[audio_as_voice]]`-shaped tokens in the transcript would be
+ * rewritten into actual media URLs and audio-as-voice flags. Insert a
+ * zero-width word joiner so the regex patterns stop matching without
+ * changing the visible text.
+ */
+function sanitizeTranscriptForToolContent(text: string): string {
+  return text.replace(/^([ \t]*)MEDIA:/gim, "$1\u2060MEDIA:").replace(/\[\[/g, "[\u2060[");
+}
+
 export function createTtsTool(opts?: {
  config?: OpenClawConfig;
  agentChannel?: GatewayMessageChannel;
@@ -36,8 +50,13 @@ export function createTtsTool(opts?: {
      });

      if (result.success && result.audioPath) {
+        // Preserve the spoken text in the tool result content so the session
+        // transcript retains what was said across turns. The audio itself is
+        // still delivered via details.media. Sanitize first so a crafted
+        // utterance cannot inject reply directives when the tool output is
+        // rendered in verbose mode.
        return {
-          content: [{ type: "text", text: "Generated audio reply." }],
+          content: [{ type: "text", text: `(spoken) ${sanitizeTranscriptForToolContent(text)}` }],
          details: {
            audioPath: result.audioPath,
            provider: result.provider,