refactor(tts): clarify text media directives

This commit is contained in:
Peter Steinberger
2026-04-25 18:18:27 +01:00
parent 67506ac2a9
commit 8e7d382c37
5 changed files with 52 additions and 38 deletions

View File

@@ -165,7 +165,7 @@ describe("handleToolExecutionEnd media emission", () => {
expect(ctx.state.pendingToolMediaUrls).toEqual(["/tmp/screenshot.png"]);
});
it("preserves legacy audio_as_voice when queuing trusted MEDIA tool output", async () => {
it("preserves audio_as_voice when queuing trusted text MEDIA tool output", async () => {
const onToolResult = vi.fn();
const ctx = createMockContext({
shouldEmitToolOutput: false,
@@ -245,7 +245,7 @@ describe("handleToolExecutionEnd media emission", () => {
expect(ctx.state.pendingToolMediaUrls).toEqual(["https://example.com/file.png"]);
});
it("does NOT queue legacy MEDIA paths when verbose is full", async () => {
it("does NOT queue text MEDIA paths when verbose is full", async () => {
const onToolResult = vi.fn();
const ctx = createMockContext({ shouldEmitToolOutput: true, onToolResult });

View File

@@ -51,7 +51,7 @@ describe("extractToolResultMediaPaths", () => {
});
});
it("extracts audioAsVoice from legacy MEDIA text", () => {
it("extracts audioAsVoice from text MEDIA directives", () => {
expect(
extractToolResultMediaArtifact({
content: [
@@ -64,7 +64,7 @@ describe("extractToolResultMediaPaths", () => {
});
});
it("keeps legacy audioAsVoice when the tag and MEDIA path are in separate text blocks", () => {
it("keeps audioAsVoice when the tag and MEDIA path are in separate text blocks", () => {
expect(
extractToolResultMediaArtifact({
content: [

View File

@@ -239,7 +239,7 @@ export function filterToolResultMediaUrls(
*
* Strategy (first match wins):
* 1. Read structured `details.media` attachments from tool details.
* 2. Parse legacy `MEDIA:` tokens from text content blocks.
* 2. Parse `MEDIA:` directive tokens from text content blocks.
* 3. Fall back to `details.path` when image content exists (legacy imageResult).
*
* Returns an empty array when no media is found (e.g. Pi SDK `read` tool
@@ -279,6 +279,44 @@ function collectStructuredMediaUrls(media: Record<string, unknown>): string[] {
return Array.from(new Set(urls));
}
/**
 * Walks tool-result content blocks and gathers what the text `MEDIA:`
 * directive path needs in a single pass.
 *
 * - `image` blocks only set `hasImageContent`; they contribute no URLs.
 * - `text` blocks with a string `text` are run through `splitMediaFromOutput`;
 *   every media URL it reports is appended in encounter order (no dedupe).
 * - Anything else (non-objects, null, other block types) is ignored.
 *
 * The voice flag is accumulated across all text blocks, so an audio-as-voice
 * tag in one block still applies when the media path sits in another block.
 *
 * @param content Raw content entries from a tool result.
 * @returns The collected URLs, `audioAsVoice: true` only when some text block
 *   reported it (the key is omitted otherwise), and whether an image block
 *   was seen.
 */
function extractTextContentMediaArtifact(content: unknown[]): {
  mediaUrls: string[];
  audioAsVoice?: boolean;
  hasImageContent: boolean;
} {
  const collectedUrls: string[] = [];
  let voiceRequested = false;
  let sawImageBlock = false;

  for (const block of content) {
    if (block === null || typeof block !== "object") {
      continue;
    }
    const fields = block as Record<string, unknown>;
    const blockType = fields.type;

    if (blockType === "image") {
      sawImageBlock = true;
    } else if (blockType === "text") {
      const rawText = fields.text;
      if (typeof rawText === "string") {
        const directive = splitMediaFromOutput(rawText);
        if (directive.audioAsVoice) {
          voiceRequested = true;
        }
        for (const url of directive.mediaUrls ?? []) {
          collectedUrls.push(url);
        }
      }
    }
  }

  // Key order (mediaUrls, audioAsVoice, hasImageContent) is kept stable, and
  // audioAsVoice is left off entirely rather than being set to false.
  return voiceRequested
    ? { mediaUrls: collectedUrls, audioAsVoice: true, hasImageContent: sawImageBlock }
    : { mediaUrls: collectedUrls, hasImageContent: sawImageBlock };
}
export function extractToolResultMediaArtifact(
result: unknown,
): ToolResultMediaArtifact | undefined {
@@ -303,42 +341,18 @@ export function extractToolResultMediaArtifact(
return undefined;
}
// Extract legacy MEDIA: paths from text content blocks using the shared
// parser so directive matching and validation stay in sync with outbound
// reply parsing.
const paths: string[] = [];
let audioAsVoice = false;
let hasImageContent = false;
for (const item of content) {
if (!item || typeof item !== "object") {
continue;
}
const entry = item as Record<string, unknown>;
if (entry.type === "image") {
hasImageContent = true;
continue;
}
if (entry.type === "text" && typeof entry.text === "string") {
const parsed = splitMediaFromOutput(entry.text);
if (parsed.audioAsVoice) {
audioAsVoice = true;
}
if (parsed.mediaUrls?.length) {
paths.push(...parsed.mediaUrls);
}
}
}
const textMedia = extractTextContentMediaArtifact(content);
if (paths.length > 0) {
if (textMedia.mediaUrls.length > 0) {
return {
mediaUrls: paths,
...(audioAsVoice ? { audioAsVoice: true } : {}),
mediaUrls: textMedia.mediaUrls,
...(textMedia.audioAsVoice ? { audioAsVoice: true } : {}),
};
}
// Fall back to legacy details.path when image content exists but no
// structured media details or MEDIA: text.
if (hasImageContent) {
if (textMedia.hasImageContent) {
const details = record.details as Record<string, unknown> | undefined;
const p = normalizeOptionalString(details?.path) ?? "";
if (p) {