diff --git a/extensions/openai/speech-provider.test.ts b/extensions/openai/speech-provider.test.ts index 945bb8722c2..f05f6512833 100644 --- a/extensions/openai/speech-provider.test.ts +++ b/extensions/openai/speech-provider.test.ts @@ -1,7 +1,14 @@ -import { describe, expect, it } from "vitest"; +import { afterEach, describe, expect, it, vi } from "vitest"; import { buildOpenAISpeechProvider } from "./speech-provider.js"; describe("buildOpenAISpeechProvider", () => { + const originalFetch = globalThis.fetch; + + afterEach(() => { + globalThis.fetch = originalFetch; + vi.restoreAllMocks(); + }); + it("normalizes provider-owned speech config from raw provider config", () => { const provider = buildOpenAISpeechProvider(); const resolved = provider.resolveConfig?.({ @@ -16,6 +23,7 @@ describe("buildOpenAISpeechProvider", () => { voice: "alloy", speed: 1.25, instructions: " Speak warmly ", + responseFormat: " WAV ", }, }, }, @@ -28,6 +36,7 @@ describe("buildOpenAISpeechProvider", () => { voice: "alloy", speed: 1.25, instructions: "Speak warmly", + responseFormat: "wav", }); }); @@ -67,4 +76,61 @@ describe("buildOpenAISpeechProvider", () => { handled: false, }); }); + + it("uses wav for Groq-compatible OpenAI TTS endpoints", async () => { + const provider = buildOpenAISpeechProvider(); + const fetchMock = vi.fn(async (_url: string, init?: RequestInit) => { + expect(init?.body).toBeTruthy(); + const body = JSON.parse(String(init?.body)) as { response_format?: string }; + expect(body.response_format).toBe("wav"); + return new Response(new Uint8Array([1, 2, 3]), { status: 200 }); + }); + globalThis.fetch = fetchMock as unknown as typeof fetch; + + const result = await provider.synthesize({ + text: "hello", + cfg: {} as never, + providerConfig: { + apiKey: "sk-test", + baseUrl: "https://api.groq.com/openai/v1", + model: "canopylabs/orpheus-v1-english", + voice: "daniel", + }, + target: "audio-file", + timeoutMs: 1_000, + }); + + expect(result.outputFormat).toBe("wav"); + expect(result.fileExtension).toBe(".wav"); + expect(result.voiceCompatible).toBe(false); + }); + + it("honors explicit responseFormat overrides and clears voice-note compatibility when not opus", async () => { + const provider = buildOpenAISpeechProvider(); + const fetchMock = vi.fn(async (_url: string, init?: RequestInit) => { + expect(init?.body).toBeTruthy(); + const body = JSON.parse(String(init?.body)) as { response_format?: string }; + expect(body.response_format).toBe("wav"); + return new Response(new Uint8Array([1, 2, 3]), { status: 200 }); + }); + globalThis.fetch = fetchMock as unknown as typeof fetch; + + const result = await provider.synthesize({ + text: "hello", + cfg: {} as never, + providerConfig: { + apiKey: "sk-test", + baseUrl: "https://proxy.example.com/openai/v1", + model: "canopylabs/orpheus-v1-english", + voice: "daniel", + responseFormat: "wav", + }, + target: "voice-note", + timeoutMs: 1_000, + }); + + expect(result.outputFormat).toBe("wav"); + expect(result.fileExtension).toBe(".wav"); + expect(result.voiceCompatible).toBe(false); + }); }); diff --git a/extensions/openai/speech-provider.ts b/extensions/openai/speech-provider.ts index 776844a1180..580430191d8 100644 --- a/extensions/openai/speech-provider.ts +++ b/extensions/openai/speech-provider.ts @@ -21,6 +21,10 @@ import { openaiTTS, } from "./tts.js"; +const OPENAI_SPEECH_RESPONSE_FORMATS = ["mp3", "opus", "wav"] as const; + +type OpenAiSpeechResponseFormat = (typeof OPENAI_SPEECH_RESPONSE_FORMATS)[number]; + type OpenAITtsProviderConfig = { apiKey?: string; baseUrl: string; @@ -28,6 +32,7 @@ type OpenAITtsProviderConfig = { voice: string; speed?: number; instructions?: string; + responseFormat?: OpenAiSpeechResponseFormat; }; type OpenAITtsProviderOverrides = { @@ -36,6 +41,57 @@ type OpenAITtsProviderOverrides = { speed?: number; }; +function normalizeOpenAISpeechResponseFormat( + value: unknown, +): OpenAiSpeechResponseFormat | undefined { + const next = trimToUndefined(typeof value === "string" ? value : undefined)?.toLowerCase(); + if (!next) { + return undefined; + } + if ( + OPENAI_SPEECH_RESPONSE_FORMATS.includes(next as (typeof OPENAI_SPEECH_RESPONSE_FORMATS)[number]) + ) { + return next as OpenAiSpeechResponseFormat; + } + throw new Error(`Invalid OpenAI speech responseFormat: ${next}`); +} + +function isGroqSpeechBaseUrl(baseUrl: string): boolean { + try { + const hostname = new URL(baseUrl).hostname.toLowerCase(); + return hostname === "groq.com" || hostname.endsWith(".groq.com"); + } catch { + return false; + } +} + +function resolveSpeechResponseFormat( + baseUrl: string, + target: "audio-file" | "voice-note", + configuredFormat?: OpenAiSpeechResponseFormat, +): OpenAiSpeechResponseFormat { + if (configuredFormat) { + return configuredFormat; + } + if (isGroqSpeechBaseUrl(baseUrl)) { + return "wav"; + } + return target === "voice-note" ? "opus" : "mp3"; +} + +function responseFormatToFileExtension( + format: OpenAiSpeechResponseFormat, +): ".mp3" | ".opus" | ".wav" { + switch (format) { + case "opus": + return ".opus"; + case "wav": + return ".wav"; + default: + return ".mp3"; + } +} + function normalizeOpenAIProviderConfig( rawConfig: Record, ): OpenAITtsProviderConfig { @@ -54,6 +110,7 @@ function normalizeOpenAIProviderConfig( voice: trimToUndefined(raw?.voice) ?? "coral", speed: asFiniteNumber(raw?.speed), instructions: trimToUndefined(raw?.instructions), + responseFormat: normalizeOpenAISpeechResponseFormat(raw?.responseFormat), }; } @@ -66,6 +123,8 @@ function readOpenAIProviderConfig(config: SpeechProviderConfig): OpenAITtsProvid voice: trimToUndefined(config.voice) ?? normalized.voice, speed: asFiniteNumber(config.speed) ?? normalized.speed, instructions: trimToUndefined(config.instructions) ?? normalized.instructions, + responseFormat: + normalizeOpenAISpeechResponseFormat(config.responseFormat) ?? normalized.responseFormat, }; } @@ -171,7 +230,11 @@ export function buildOpenAISpeechProvider(): SpeechProviderPlugin { if (!apiKey) { throw new Error("OpenAI API key missing"); } - const responseFormat = req.target === "voice-note" ? "opus" : "mp3"; + const responseFormat = resolveSpeechResponseFormat( + config.baseUrl, + req.target, + config.responseFormat, + ); const audioBuffer = await openaiTTS({ text: req.text, apiKey, @@ -186,8 +249,8 @@ export function buildOpenAISpeechProvider(): SpeechProviderPlugin { return { audioBuffer, outputFormat: responseFormat, - fileExtension: responseFormat === "opus" ? ".opus" : ".mp3", - voiceCompatible: req.target === "voice-note", + fileExtension: responseFormatToFileExtension(responseFormat), + voiceCompatible: req.target === "voice-note" && responseFormat === "opus", }; }, synthesizeTelephony: async (req) => { diff --git a/extensions/openai/tts.ts b/extensions/openai/tts.ts index 405be74062e..5cb4e48a1c5 100644 --- a/extensions/openai/tts.ts +++ b/extensions/openai/tts.ts @@ -112,7 +112,7 @@ export async function openaiTTS(params: { voice: string; speed?: number; instructions?: string; - responseFormat: "mp3" | "opus" | "pcm"; + responseFormat: "mp3" | "opus" | "pcm" | "wav"; timeoutMs: number; }): Promise { const { text, apiKey, baseUrl, model, voice, speed, instructions, responseFormat, timeoutMs } =