OpenAI TTS: use wav for Groq speech

Made-with: Cursor
This commit is contained in:
Neerav Makwana
2026-04-06 21:49:21 -04:00
committed by Ayaan Zaidi
parent 494c25b0c4
commit eb4bc200d7
3 changed files with 134 additions and 5 deletions

View File

@@ -1,7 +1,14 @@
import { describe, expect, it } from "vitest";
import { afterEach, describe, expect, it, vi } from "vitest";
import { buildOpenAISpeechProvider } from "./speech-provider.js";
describe("buildOpenAISpeechProvider", () => {
const originalFetch = globalThis.fetch;
afterEach(() => {
globalThis.fetch = originalFetch;
vi.restoreAllMocks();
});
it("normalizes provider-owned speech config from raw provider config", () => {
const provider = buildOpenAISpeechProvider();
const resolved = provider.resolveConfig?.({
@@ -16,6 +23,7 @@ describe("buildOpenAISpeechProvider", () => {
voice: "alloy",
speed: 1.25,
instructions: " Speak warmly ",
responseFormat: " WAV ",
},
},
},
@@ -28,6 +36,7 @@ describe("buildOpenAISpeechProvider", () => {
voice: "alloy",
speed: 1.25,
instructions: "Speak warmly",
responseFormat: "wav",
});
});
@@ -67,4 +76,61 @@ describe("buildOpenAISpeechProvider", () => {
handled: false,
});
});
it("uses wav for Groq-compatible OpenAI TTS endpoints", async () => {
const provider = buildOpenAISpeechProvider();
// Stub global fetch so we can intercept the outgoing TTS request and
// assert on its JSON body instead of hitting the network.
const fetchMock = vi.fn(async (_url: string, init?: RequestInit) => {
expect(init?.body).toBeTruthy();
const body = JSON.parse(String(init?.body)) as { response_format?: string };
// A groq.com base URL should default the request to "wav" even though
// no responseFormat was configured (see resolveSpeechResponseFormat).
expect(body.response_format).toBe("wav");
// Minimal fake audio payload; content is irrelevant to this test.
return new Response(new Uint8Array([1, 2, 3]), { status: 200 });
});
globalThis.fetch = fetchMock as unknown as typeof fetch;
const result = await provider.synthesize({
text: "hello",
cfg: {} as never,
providerConfig: {
apiKey: "sk-test",
baseUrl: "https://api.groq.com/openai/v1",
model: "canopylabs/orpheus-v1-english",
voice: "daniel",
},
target: "audio-file",
timeoutMs: 1_000,
});
// wav output implies a .wav file extension, and wav is not
// voice-note compatible (only opus is).
expect(result.outputFormat).toBe("wav");
expect(result.fileExtension).toBe(".wav");
expect(result.voiceCompatible).toBe(false);
});
it("honors explicit responseFormat overrides and clears voice-note compatibility when not opus", async () => {
const provider = buildOpenAISpeechProvider();
// Stub global fetch to capture the request body for assertions.
const fetchMock = vi.fn(async (_url: string, init?: RequestInit) => {
expect(init?.body).toBeTruthy();
const body = JSON.parse(String(init?.body)) as { response_format?: string };
// The explicitly configured "wav" must win over the voice-note default
// ("opus") and over any base-URL heuristics.
expect(body.response_format).toBe("wav");
return new Response(new Uint8Array([1, 2, 3]), { status: 200 });
});
globalThis.fetch = fetchMock as unknown as typeof fetch;
const result = await provider.synthesize({
text: "hello",
cfg: {} as never,
providerConfig: {
apiKey: "sk-test",
// Deliberately a non-Groq host: the override itself, not the host
// heuristic, must drive the format here.
baseUrl: "https://proxy.example.com/openai/v1",
model: "canopylabs/orpheus-v1-english",
voice: "daniel",
responseFormat: "wav",
},
target: "voice-note",
timeoutMs: 1_000,
});
// Even for a voice-note target, a non-opus format disables voice compatibility.
expect(result.outputFormat).toBe("wav");
expect(result.fileExtension).toBe(".wav");
expect(result.voiceCompatible).toBe(false);
});
});

View File

@@ -21,6 +21,10 @@ import {
openaiTTS,
} from "./tts.js";
// Accepted values for the speech `responseFormat` config option;
// normalizeOpenAISpeechResponseFormat rejects anything outside this list.
const OPENAI_SPEECH_RESPONSE_FORMATS = ["mp3", "opus", "wav"] as const;
// Literal union derived from the tuple above: "mp3" | "opus" | "wav".
type OpenAiSpeechResponseFormat = (typeof OPENAI_SPEECH_RESPONSE_FORMATS)[number];
type OpenAITtsProviderConfig = {
apiKey?: string;
baseUrl: string;
@@ -28,6 +32,7 @@ type OpenAITtsProviderConfig = {
voice: string;
speed?: number;
instructions?: string;
responseFormat?: OpenAiSpeechResponseFormat;
};
type OpenAITtsProviderOverrides = {
@@ -36,6 +41,57 @@ type OpenAITtsProviderOverrides = {
speed?: number;
};
/**
 * Normalizes a raw config value into a supported speech response format.
 *
 * Non-string and blank values are treated as "not configured" and yield
 * `undefined`; any other string is trimmed and lowercased, then validated
 * against OPENAI_SPEECH_RESPONSE_FORMATS.
 *
 * @throws Error when a non-blank string is not a supported format.
 */
function normalizeOpenAISpeechResponseFormat(
  value: unknown,
): OpenAiSpeechResponseFormat | undefined {
  if (typeof value !== "string") {
    return undefined;
  }
  const normalized = trimToUndefined(value)?.toLowerCase();
  if (normalized === undefined) {
    return undefined;
  }
  // `find` narrows the result to the literal union, avoiding a cast.
  const match = OPENAI_SPEECH_RESPONSE_FORMATS.find((format) => format === normalized);
  if (match === undefined) {
    throw new Error(`Invalid OpenAI speech responseFormat: ${normalized}`);
  }
  return match;
}
/**
 * Returns true when the configured base URL points at groq.com or any of
 * its subdomains (e.g. api.groq.com). Malformed URLs are treated as
 * non-Groq so callers fall back to the standard defaults.
 */
function isGroqSpeechBaseUrl(baseUrl: string): boolean {
  let host: string;
  try {
    host = new URL(baseUrl).hostname.toLowerCase();
  } catch {
    return false;
  }
  return host === "groq.com" || host.endsWith(".groq.com");
}
/**
 * Picks the response format for a speech request.
 *
 * Precedence: an explicitly configured format always wins; otherwise Groq
 * endpoints get "wav", and everything else defaults to "opus" for
 * voice notes and "mp3" for plain audio files.
 */
function resolveSpeechResponseFormat(
  baseUrl: string,
  target: "audio-file" | "voice-note",
  configuredFormat?: OpenAiSpeechResponseFormat,
): OpenAiSpeechResponseFormat {
  if (configuredFormat) {
    return configuredFormat;
  }
  if (isGroqSpeechBaseUrl(baseUrl)) {
    return "wav";
  }
  if (target === "voice-note") {
    return "opus";
  }
  return "mp3";
}
/**
 * Maps a response format to the file extension used when writing the
 * synthesized audio to disk. Unrecognized values fall back to ".mp3",
 * matching the non-opus/non-wav branch of the original switch.
 */
function responseFormatToFileExtension(
  format: OpenAiSpeechResponseFormat,
): ".mp3" | ".opus" | ".wav" {
  if (format === "opus") {
    return ".opus";
  }
  if (format === "wav") {
    return ".wav";
  }
  return ".mp3";
}
function normalizeOpenAIProviderConfig(
rawConfig: Record<string, unknown>,
): OpenAITtsProviderConfig {
@@ -54,6 +110,7 @@ function normalizeOpenAIProviderConfig(
voice: trimToUndefined(raw?.voice) ?? "coral",
speed: asFiniteNumber(raw?.speed),
instructions: trimToUndefined(raw?.instructions),
responseFormat: normalizeOpenAISpeechResponseFormat(raw?.responseFormat),
};
}
@@ -66,6 +123,8 @@ function readOpenAIProviderConfig(config: SpeechProviderConfig): OpenAITtsProvid
voice: trimToUndefined(config.voice) ?? normalized.voice,
speed: asFiniteNumber(config.speed) ?? normalized.speed,
instructions: trimToUndefined(config.instructions) ?? normalized.instructions,
responseFormat:
normalizeOpenAISpeechResponseFormat(config.responseFormat) ?? normalized.responseFormat,
};
}
@@ -171,7 +230,11 @@ export function buildOpenAISpeechProvider(): SpeechProviderPlugin {
if (!apiKey) {
throw new Error("OpenAI API key missing");
}
const responseFormat = req.target === "voice-note" ? "opus" : "mp3";
const responseFormat = resolveSpeechResponseFormat(
config.baseUrl,
req.target,
config.responseFormat,
);
const audioBuffer = await openaiTTS({
text: req.text,
apiKey,
@@ -186,8 +249,8 @@ export function buildOpenAISpeechProvider(): SpeechProviderPlugin {
return {
audioBuffer,
outputFormat: responseFormat,
fileExtension: responseFormat === "opus" ? ".opus" : ".mp3",
voiceCompatible: req.target === "voice-note",
fileExtension: responseFormatToFileExtension(responseFormat),
voiceCompatible: req.target === "voice-note" && responseFormat === "opus",
};
},
synthesizeTelephony: async (req) => {

View File

@@ -112,7 +112,7 @@ export async function openaiTTS(params: {
voice: string;
speed?: number;
instructions?: string;
responseFormat: "mp3" | "opus" | "pcm";
responseFormat: "mp3" | "opus" | "pcm" | "wav";
timeoutMs: number;
}): Promise<Buffer> {
const { text, apiKey, baseUrl, model, voice, speed, instructions, responseFormat, timeoutMs } =