fix(tts): preserve legacy tool voice hints

This commit is contained in:
Peter Steinberger
2026-04-25 17:56:28 +01:00
parent dc7c703425
commit 60f9358348
5 changed files with 67 additions and 1 deletions

View File

@@ -165,6 +165,34 @@ describe("handleToolExecutionEnd media emission", () => {
expect(ctx.state.pendingToolMediaUrls).toEqual(["/tmp/screenshot.png"]);
});
// Regression test: a tool result whose text carries both the legacy
// [[audio_as_voice]] tag and a MEDIA: path should queue the media path and
// record the voice hint in pending state without calling onToolResult.
it("preserves legacy audio_as_voice when queuing trusted MEDIA tool output", async () => {
const onToolResult = vi.fn();
// "tts" is listed in builtinToolNames; per the test title this is what makes
// the tool's MEDIA output count as trusted (NOTE(review): confirm against
// handleToolExecutionEnd).
const ctx = createMockContext({
shouldEmitToolOutput: false,
onToolResult,
builtinToolNames: new Set(["tts"]),
});
await handleToolExecutionEnd(ctx, {
type: "tool_execution_end",
toolName: "tts",
toolCallId: "tc-1",
isError: false,
result: {
content: [
{
type: "text",
// Legacy text form: prose line, voice tag line, then the MEDIA: directive.
text: "Generated audio reply.\n[[audio_as_voice]]\nMEDIA:/tmp/reply.opus",
},
],
},
});
// Nothing is emitted through the callback; the media is queued on state instead.
expect(onToolResult).not.toHaveBeenCalled();
expect(ctx.state.pendingToolMediaUrls).toEqual(["/tmp/reply.opus"]);
// The voice hint from the legacy tag must be kept alongside the queued URL.
expect(ctx.state.pendingToolAudioAsVoice).toBe(true);
});
it("does NOT emit local media for untrusted tools", async () => {
const onToolResult = vi.fn();
const ctx = createMockContext({ shouldEmitToolOutput: false, onToolResult });

View File

@@ -51,6 +51,33 @@ describe("extractToolResultMediaPaths", () => {
});
});
// A single text block containing the [[audio_as_voice]] tag followed by a
// MEDIA: line should yield the media path plus audioAsVoice: true.
it("extracts audioAsVoice from legacy MEDIA text", () => {
expect(
extractToolResultMediaArtifact({
content: [
{ type: "text", text: "Generated audio\n[[audio_as_voice]]\nMEDIA:/tmp/reply.opus" },
],
}),
).toEqual({
mediaUrls: ["/tmp/reply.opus"],
audioAsVoice: true,
});
});
// The voice tag and the MEDIA: path arrive in different text entries here;
// the expected artifact still carries audioAsVoice: true, i.e. the hint
// applies to the result as a whole rather than only to the entry holding the tag.
it("keeps legacy audioAsVoice when the tag and MEDIA path are in separate text blocks", () => {
expect(
extractToolResultMediaArtifact({
content: [
{ type: "text", text: "[[audio_as_voice]]" },
{ type: "text", text: "MEDIA:/tmp/reply.opus" },
],
}),
).toEqual({
mediaUrls: ["/tmp/reply.opus"],
audioAsVoice: true,
});
});
it("extracts structured media trust markers", () => {
expect(
extractToolResultMediaArtifact({

View File

@@ -307,6 +307,7 @@ export function extractToolResultMediaArtifact(
// parser so directive matching and validation stay in sync with outbound
// reply parsing.
const paths: string[] = [];
let audioAsVoice = false;
let hasImageContent = false;
for (const item of content) {
if (!item || typeof item !== "object") {
@@ -319,6 +320,9 @@ export function extractToolResultMediaArtifact(
}
if (entry.type === "text" && typeof entry.text === "string") {
const parsed = splitMediaFromOutput(entry.text);
if (parsed.audioAsVoice) {
audioAsVoice = true;
}
if (parsed.mediaUrls?.length) {
paths.push(...parsed.mediaUrls);
}
@@ -326,7 +330,10 @@ export function extractToolResultMediaArtifact(
}
if (paths.length > 0) {
return { mediaUrls: paths };
return {
mediaUrls: paths,
...(audioAsVoice ? { audioAsVoice: true } : {}),
};
}
// Fall back to legacy details.path when image content exists but no