fix(tts): preserve legacy tool voice hints

2026-05-06 14:20:44 +00:00 · 2026-04-25 17:56:28 +01:00
parent dc7c703425
commit 60f9358348
5 changed files with 67 additions and 1 deletion
--- a/src/agents/pi-embedded-subscribe.handlers.tools.media.test.ts
+++ b/src/agents/pi-embedded-subscribe.handlers.tools.media.test.ts
@@ -165,6 +165,34 @@ describe("handleToolExecutionEnd media emission", () => {
    expect(ctx.state.pendingToolMediaUrls).toEqual(["/tmp/screenshot.png"]);
  });

+  it("preserves legacy audio_as_voice when queuing trusted MEDIA tool output", async () => {
+    const onToolResult = vi.fn();
+    const ctx = createMockContext({
+      shouldEmitToolOutput: false,
+      onToolResult,
+      builtinToolNames: new Set(["tts"]),
+    });
+
+    await handleToolExecutionEnd(ctx, {
+      type: "tool_execution_end",
+      toolName: "tts",
+      toolCallId: "tc-1",
+      isError: false,
+      result: {
+        content: [
+          {
+            type: "text",
+            text: "Generated audio reply.\n[[audio_as_voice]]\nMEDIA:/tmp/reply.opus",
+          },
+        ],
+      },
+    });
+
+    expect(onToolResult).not.toHaveBeenCalled();
+    expect(ctx.state.pendingToolMediaUrls).toEqual(["/tmp/reply.opus"]);
+    expect(ctx.state.pendingToolAudioAsVoice).toBe(true);
+  });
+
  it("does NOT emit local media for untrusted tools", async () => {
    const onToolResult = vi.fn();
    const ctx = createMockContext({ shouldEmitToolOutput: false, onToolResult });
--- a/src/agents/pi-embedded-subscribe.tools.media.test.ts
+++ b/src/agents/pi-embedded-subscribe.tools.media.test.ts
@@ -51,6 +51,33 @@ describe("extractToolResultMediaPaths", () => {
    });
  });

+  it("extracts audioAsVoice from legacy MEDIA text", () => {
+    expect(
+      extractToolResultMediaArtifact({
+        content: [
+          { type: "text", text: "Generated audio\n[[audio_as_voice]]\nMEDIA:/tmp/reply.opus" },
+        ],
+      }),
+    ).toEqual({
+      mediaUrls: ["/tmp/reply.opus"],
+      audioAsVoice: true,
+    });
+  });
+
+  it("keeps legacy audioAsVoice when the tag and MEDIA path are in separate text blocks", () => {
+    expect(
+      extractToolResultMediaArtifact({
+        content: [
+          { type: "text", text: "[[audio_as_voice]]" },
+          { type: "text", text: "MEDIA:/tmp/reply.opus" },
+        ],
+      }),
+    ).toEqual({
+      mediaUrls: ["/tmp/reply.opus"],
+      audioAsVoice: true,
+    });
+  });
+
  it("extracts structured media trust markers", () => {
    expect(
      extractToolResultMediaArtifact({
--- a/src/agents/pi-embedded-subscribe.tools.ts
+++ b/src/agents/pi-embedded-subscribe.tools.ts
@@ -307,6 +307,7 @@ export function extractToolResultMediaArtifact(
  // parser so directive matching and validation stay in sync with outbound
  // reply parsing.
  const paths: string[] = [];
+  let audioAsVoice = false;
  let hasImageContent = false;
  for (const item of content) {
    if (!item || typeof item !== "object") {
@@ -319,6 +320,9 @@ export function extractToolResultMediaArtifact(
    }
    if (entry.type === "text" && typeof entry.text === "string") {
      const parsed = splitMediaFromOutput(entry.text);
+      if (parsed.audioAsVoice) {
+        audioAsVoice = true;
+      }
      if (parsed.mediaUrls?.length) {
        paths.push(...parsed.mediaUrls);
      }
@@ -326,7 +330,10 @@ export function extractToolResultMediaArtifact(
  }

  if (paths.length > 0) {
-    return { mediaUrls: paths };
+    return {
+      mediaUrls: paths,
+      ...(audioAsVoice ? { audioAsVoice: true } : {}),
+    };
  }

  // Fall back to legacy details.path when image content exists but no