refactor(tts): clarify text media directives

2026-05-06 20:40:43 +00:00 · 2026-04-25 18:18:27 +01:00
parent 67506ac2a9
commit 8e7d382c37
5 changed files with 52 additions and 38 deletions
--- a/src/agents/pi-embedded-subscribe.handlers.tools.media.test.ts
+++ b/src/agents/pi-embedded-subscribe.handlers.tools.media.test.ts
@@ -165,7 +165,7 @@ describe("handleToolExecutionEnd media emission", () => {
    expect(ctx.state.pendingToolMediaUrls).toEqual(["/tmp/screenshot.png"]);
  });

-  it("preserves legacy audio_as_voice when queuing trusted MEDIA tool output", async () => {
+  it("preserves audio_as_voice when queuing trusted text MEDIA tool output", async () => {
    const onToolResult = vi.fn();
    const ctx = createMockContext({
      shouldEmitToolOutput: false,
@@ -245,7 +245,7 @@ describe("handleToolExecutionEnd media emission", () => {
    expect(ctx.state.pendingToolMediaUrls).toEqual(["https://example.com/file.png"]);
  });

-  it("does NOT queue legacy MEDIA paths when verbose is full", async () => {
+  it("does NOT queue text MEDIA paths when verbose is full", async () => {
    const onToolResult = vi.fn();
    const ctx = createMockContext({ shouldEmitToolOutput: true, onToolResult });

--- a/src/agents/pi-embedded-subscribe.tools.media.test.ts
+++ b/src/agents/pi-embedded-subscribe.tools.media.test.ts
@@ -51,7 +51,7 @@ describe("extractToolResultMediaPaths", () => {
    });
  });

-  it("extracts audioAsVoice from legacy MEDIA text", () => {
+  it("extracts audioAsVoice from text MEDIA directives", () => {
    expect(
      extractToolResultMediaArtifact({
        content: [
@@ -64,7 +64,7 @@ describe("extractToolResultMediaPaths", () => {
    });
  });

-  it("keeps legacy audioAsVoice when the tag and MEDIA path are in separate text blocks", () => {
+  it("keeps audioAsVoice when the tag and MEDIA path are in separate text blocks", () => {
    expect(
      extractToolResultMediaArtifact({
        content: [
--- a/src/agents/pi-embedded-subscribe.tools.ts
+++ b/src/agents/pi-embedded-subscribe.tools.ts
@@ -239,7 +239,7 @@ export function filterToolResultMediaUrls(
 *
 * Strategy (first match wins):
 * 1. Read structured `details.media` attachments from tool details.
- * 2. Parse legacy `MEDIA:` tokens from text content blocks.
+ * 2. Parse `MEDIA:` directive tokens from text content blocks.
 * 3. Fall back to `details.path` when image content exists (legacy imageResult).
 *
 * Returns an empty array when no media is found (e.g. Pi SDK `read` tool
@@ -279,6 +279,44 @@ function collectStructuredMediaUrls(media: Record<string, unknown>): string[] {
  return Array.from(new Set(urls));
 }

+function extractTextContentMediaArtifact(content: unknown[]): { // Scans tool-result content blocks for MEDIA: text directives and image blocks.
+  mediaUrls: string[]; // Every media URL/path parsed out of text blocks, in encounter order (not de-duplicated here).
+  audioAsVoice?: boolean; // Only present (as true) when at least one text block parsed with audioAsVoice set.
+  hasImageContent: boolean; // True when at least one block has type "image".
+} {
+  const mediaUrls: string[] = [];
+  let audioAsVoice = false;
+  let hasImageContent = false;
+
+  for (const item of content) {
+    if (!item || typeof item !== "object") { // Ignore null/undefined/primitive entries; only object-shaped blocks are inspected.
+      continue;
+    }
+    const entry = item as Record<string, unknown>;
+    if (entry.type === "image") { // Image blocks carry no directive text; just remember that one was seen.
+      hasImageContent = true;
+      continue;
+    }
+    if (entry.type !== "text" || typeof entry.text !== "string") { // Anything other than a text block with a string payload is skipped.
+      continue;
+    }
+
+    const parsed = splitMediaFromOutput(entry.text); // Shared directive parser (defined outside this hunk); presumably the same one used for outbound reply parsing — confirm.
+    if (parsed.audioAsVoice) { // The flag is sticky across blocks, so a tag in one text block applies even if the MEDIA path is in another.
+      audioAsVoice = true;
+    }
+    if (parsed.mediaUrls?.length) { // Append only when the parser actually returned URLs (the list may be absent or empty).
+      mediaUrls.push(...parsed.mediaUrls);
+    }
+  }
+
+  return {
+    mediaUrls,
+    ...(audioAsVoice ? { audioAsVoice: true } : {}), // Omit the key entirely rather than emitting audioAsVoice: false.
+    hasImageContent,
+  };
+}
+
 export function extractToolResultMediaArtifact(
  result: unknown,
 ): ToolResultMediaArtifact | undefined {
@@ -303,42 +341,18 @@ export function extractToolResultMediaArtifact(
    return undefined;
  }

-  // Extract legacy MEDIA: paths from text content blocks using the shared
-  // parser so directive matching and validation stay in sync with outbound
-  // reply parsing.
-  const paths: string[] = [];
-  let audioAsVoice = false;
-  let hasImageContent = false;
-  for (const item of content) {
-    if (!item || typeof item !== "object") {
-      continue;
-    }
-    const entry = item as Record<string, unknown>;
-    if (entry.type === "image") {
-      hasImageContent = true;
-      continue;
-    }
-    if (entry.type === "text" && typeof entry.text === "string") {
-      const parsed = splitMediaFromOutput(entry.text);
-      if (parsed.audioAsVoice) {
-        audioAsVoice = true;
-      }
-      if (parsed.mediaUrls?.length) {
-        paths.push(...parsed.mediaUrls);
-      }
-    }
-  }
+  const textMedia = extractTextContentMediaArtifact(content);

-  if (paths.length > 0) {
+  if (textMedia.mediaUrls.length > 0) {
    return {
-      mediaUrls: paths,
-      ...(audioAsVoice ? { audioAsVoice: true } : {}),
+      mediaUrls: textMedia.mediaUrls,
+      ...(textMedia.audioAsVoice ? { audioAsVoice: true } : {}),
    };
  }

  // Fall back to legacy details.path when image content exists but no
  // structured media details or MEDIA: text.
-  if (hasImageContent) {
+  if (textMedia.hasImageContent) {
    const details = record.details as Record<string, unknown> | undefined;
    const p = normalizeOptionalString(details?.path) ?? "";
    if (p) {