mirror of
https://github.com/openclaw/openclaw.git
synced 2026-04-12 09:41:11 +00:00
OpenAI TTS: use wav for Groq speech
Made-with: Cursor
This commit is contained in:
committed by
Ayaan Zaidi
parent
494c25b0c4
commit
eb4bc200d7
@@ -1,7 +1,14 @@
|
||||
import { describe, expect, it } from "vitest";
|
||||
import { afterEach, describe, expect, it, vi } from "vitest";
|
||||
import { buildOpenAISpeechProvider } from "./speech-provider.js";
|
||||
|
||||
describe("buildOpenAISpeechProvider", () => {
|
||||
const originalFetch = globalThis.fetch;
|
||||
|
||||
afterEach(() => {
|
||||
globalThis.fetch = originalFetch;
|
||||
vi.restoreAllMocks();
|
||||
});
|
||||
|
||||
it("normalizes provider-owned speech config from raw provider config", () => {
|
||||
const provider = buildOpenAISpeechProvider();
|
||||
const resolved = provider.resolveConfig?.({
|
||||
@@ -16,6 +23,7 @@ describe("buildOpenAISpeechProvider", () => {
|
||||
voice: "alloy",
|
||||
speed: 1.25,
|
||||
instructions: " Speak warmly ",
|
||||
responseFormat: " WAV ",
|
||||
},
|
||||
},
|
||||
},
|
||||
@@ -28,6 +36,7 @@ describe("buildOpenAISpeechProvider", () => {
|
||||
voice: "alloy",
|
||||
speed: 1.25,
|
||||
instructions: "Speak warmly",
|
||||
responseFormat: "wav",
|
||||
});
|
||||
});
|
||||
|
||||
@@ -67,4 +76,61 @@ describe("buildOpenAISpeechProvider", () => {
|
||||
handled: false,
|
||||
});
|
||||
});
|
||||
|
||||
it("uses wav for Groq-compatible OpenAI TTS endpoints", async () => {
  const speechProvider = buildOpenAISpeechProvider();

  // Stand-in for fetch: checks that the outgoing JSON body asks for wav,
  // then answers with a three-byte payload.
  const stubbedFetch = vi.fn(async (_url: string, init?: RequestInit) => {
    const rawBody = init?.body;
    expect(rawBody).toBeTruthy();
    const payload = JSON.parse(String(rawBody)) as { response_format?: string };
    expect(payload.response_format).toBe("wav");
    return new Response(new Uint8Array([1, 2, 3]), { status: 200 });
  });
  globalThis.fetch = stubbedFetch as unknown as typeof fetch;

  // No responseFormat is configured here; only the Groq base URL is supplied.
  const { outputFormat, fileExtension, voiceCompatible } = await speechProvider.synthesize({
    text: "hello",
    cfg: {} as never,
    providerConfig: {
      apiKey: "sk-test",
      baseUrl: "https://api.groq.com/openai/v1",
      model: "canopylabs/orpheus-v1-english",
      voice: "daniel",
    },
    target: "audio-file",
    timeoutMs: 1_000,
  });

  expect(outputFormat).toBe("wav");
  expect(fileExtension).toBe(".wav");
  expect(voiceCompatible).toBe(false);
});
|
||||
|
||||
it("honors explicit responseFormat overrides and clears voice-note compatibility when not opus", async () => {
  const speechProvider = buildOpenAISpeechProvider();

  // Stand-in for fetch: checks that the outgoing JSON body asks for wav,
  // then answers with a three-byte payload.
  const stubbedFetch = vi.fn(async (_url: string, init?: RequestInit) => {
    const rawBody = init?.body;
    expect(rawBody).toBeTruthy();
    const payload = JSON.parse(String(rawBody)) as { response_format?: string };
    expect(payload.response_format).toBe("wav");
    return new Response(new Uint8Array([1, 2, 3]), { status: 200 });
  });
  globalThis.fetch = stubbedFetch as unknown as typeof fetch;

  // A non-Groq host with an explicit "wav" responseFormat, requested as a voice note.
  const { outputFormat, fileExtension, voiceCompatible } = await speechProvider.synthesize({
    text: "hello",
    cfg: {} as never,
    providerConfig: {
      apiKey: "sk-test",
      baseUrl: "https://proxy.example.com/openai/v1",
      model: "canopylabs/orpheus-v1-english",
      voice: "daniel",
      responseFormat: "wav",
    },
    target: "voice-note",
    timeoutMs: 1_000,
  });

  expect(outputFormat).toBe("wav");
  expect(fileExtension).toBe(".wav");
  expect(voiceCompatible).toBe(false);
});
|
||||
});
|
||||
|
||||
@@ -21,6 +21,10 @@ import {
|
||||
openaiTTS,
|
||||
} from "./tts.js";
|
||||
|
||||
/** Response formats accepted for the provider's `responseFormat` speech option. */
const OPENAI_SPEECH_RESPONSE_FORMATS = ["mp3", "opus", "wav"] as const;
|
||||
|
||||
/** Union of the string literals listed in `OPENAI_SPEECH_RESPONSE_FORMATS`. */
type OpenAiSpeechResponseFormat = (typeof OPENAI_SPEECH_RESPONSE_FORMATS)[number];
|
||||
|
||||
type OpenAITtsProviderConfig = {
|
||||
apiKey?: string;
|
||||
baseUrl: string;
|
||||
@@ -28,6 +32,7 @@ type OpenAITtsProviderConfig = {
|
||||
voice: string;
|
||||
speed?: number;
|
||||
instructions?: string;
|
||||
responseFormat?: OpenAiSpeechResponseFormat;
|
||||
};
|
||||
|
||||
type OpenAITtsProviderOverrides = {
|
||||
@@ -36,6 +41,57 @@ type OpenAITtsProviderOverrides = {
|
||||
speed?: number;
|
||||
};
|
||||
|
||||
/**
 * Coerces a raw `responseFormat` config value into a supported speech format.
 *
 * Non-string input is handed to `trimToUndefined` as `undefined`. The result is
 * lowercased, and a falsy result means "not configured". Any other value must
 * be one of `OPENAI_SPEECH_RESPONSE_FORMATS`.
 *
 * @param value - Raw config value of any type.
 * @returns The lowercase format, or `undefined` when no usable value was given.
 * @throws Error when a non-empty value is not a supported format.
 */
function normalizeOpenAISpeechResponseFormat(
  value: unknown,
): OpenAiSpeechResponseFormat | undefined {
  const text = typeof value === "string" ? value : undefined;
  const candidate = trimToUndefined(text)?.toLowerCase();
  if (!candidate) {
    return undefined;
  }

  // Widen the tuple to plain strings so membership can be checked before narrowing.
  const supported: readonly string[] = OPENAI_SPEECH_RESPONSE_FORMATS;
  if (!supported.includes(candidate)) {
    throw new Error(`Invalid OpenAI speech responseFormat: ${candidate}`);
  }
  return candidate as OpenAiSpeechResponseFormat;
}
|
||||
|
||||
/**
 * Reports whether a base URL points at Groq (`groq.com` or any subdomain of it).
 *
 * The hostname is compared case-insensitively. A single trailing root dot
 * (the fully-qualified form, e.g. `api.groq.com.`) is ignored, because the URL
 * parser keeps it in `hostname` even though it names the same host.
 *
 * @param baseUrl - Configured API base URL.
 * @returns `true` for Groq hosts; `false` for any other host or when `baseUrl`
 *   cannot be parsed as a URL.
 */
function isGroqSpeechBaseUrl(baseUrl: string): boolean {
  let hostname: string;
  try {
    hostname = new URL(baseUrl).hostname.toLowerCase();
  } catch {
    // An unparseable base URL is treated as "not Groq" rather than raised here.
    return false;
  }
  // Drop the optional root dot so "api.groq.com." matches like "api.groq.com".
  const host = hostname.replace(/\.$/, "");
  return host === "groq.com" || host.endsWith(".groq.com");
}
|
||||
|
||||
/**
 * Chooses the response format for a synthesis request.
 *
 * Precedence: an explicitly configured format, then `"wav"` when the base URL
 * is a Groq host, then the target default (`"opus"` for voice notes, `"mp3"`
 * otherwise).
 *
 * @param baseUrl - Configured API base URL.
 * @param target - Kind of output being produced.
 * @param configuredFormat - Explicit format from config, if any.
 * @returns The format to use.
 */
function resolveSpeechResponseFormat(
  baseUrl: string,
  target: "audio-file" | "voice-note",
  configuredFormat?: OpenAiSpeechResponseFormat,
): OpenAiSpeechResponseFormat {
  // An explicit setting always wins over host- or target-based defaults.
  if (configuredFormat) {
    return configuredFormat;
  }
  const targetDefault = target === "voice-note" ? "opus" : "mp3";
  return isGroqSpeechBaseUrl(baseUrl) ? "wav" : targetDefault;
}
|
||||
|
||||
/**
 * Maps a speech response format to the file extension reported for the audio.
 *
 * @param format - Response format that was requested.
 * @returns `".opus"` for `"opus"`, `".wav"` for `"wav"`, and `".mp3"` for
 *   everything else.
 */
function responseFormatToFileExtension(
  format: OpenAiSpeechResponseFormat,
): ".mp3" | ".opus" | ".wav" {
  if (format === "opus") {
    return ".opus";
  }
  if (format === "wav") {
    return ".wav";
  }
  // "mp3", and any value outside the two cases above, uses the mp3 extension.
  return ".mp3";
}
|
||||
|
||||
function normalizeOpenAIProviderConfig(
|
||||
rawConfig: Record<string, unknown>,
|
||||
): OpenAITtsProviderConfig {
|
||||
@@ -54,6 +110,7 @@ function normalizeOpenAIProviderConfig(
|
||||
voice: trimToUndefined(raw?.voice) ?? "coral",
|
||||
speed: asFiniteNumber(raw?.speed),
|
||||
instructions: trimToUndefined(raw?.instructions),
|
||||
responseFormat: normalizeOpenAISpeechResponseFormat(raw?.responseFormat),
|
||||
};
|
||||
}
|
||||
|
||||
@@ -66,6 +123,8 @@ function readOpenAIProviderConfig(config: SpeechProviderConfig): OpenAITtsProvid
|
||||
voice: trimToUndefined(config.voice) ?? normalized.voice,
|
||||
speed: asFiniteNumber(config.speed) ?? normalized.speed,
|
||||
instructions: trimToUndefined(config.instructions) ?? normalized.instructions,
|
||||
responseFormat:
|
||||
normalizeOpenAISpeechResponseFormat(config.responseFormat) ?? normalized.responseFormat,
|
||||
};
|
||||
}
|
||||
|
||||
@@ -171,7 +230,11 @@ export function buildOpenAISpeechProvider(): SpeechProviderPlugin {
|
||||
if (!apiKey) {
|
||||
throw new Error("OpenAI API key missing");
|
||||
}
|
||||
const responseFormat = req.target === "voice-note" ? "opus" : "mp3";
|
||||
const responseFormat = resolveSpeechResponseFormat(
|
||||
config.baseUrl,
|
||||
req.target,
|
||||
config.responseFormat,
|
||||
);
|
||||
const audioBuffer = await openaiTTS({
|
||||
text: req.text,
|
||||
apiKey,
|
||||
@@ -186,8 +249,8 @@ export function buildOpenAISpeechProvider(): SpeechProviderPlugin {
|
||||
return {
|
||||
audioBuffer,
|
||||
outputFormat: responseFormat,
|
||||
fileExtension: responseFormat === "opus" ? ".opus" : ".mp3",
|
||||
voiceCompatible: req.target === "voice-note",
|
||||
fileExtension: responseFormatToFileExtension(responseFormat),
|
||||
voiceCompatible: req.target === "voice-note" && responseFormat === "opus",
|
||||
};
|
||||
},
|
||||
synthesizeTelephony: async (req) => {
|
||||
|
||||
@@ -112,7 +112,7 @@ export async function openaiTTS(params: {
|
||||
voice: string;
|
||||
speed?: number;
|
||||
instructions?: string;
|
||||
responseFormat: "mp3" | "opus" | "pcm";
|
||||
responseFormat: "mp3" | "opus" | "pcm" | "wav";
|
||||
timeoutMs: number;
|
||||
}): Promise<Buffer> {
|
||||
const { text, apiKey, baseUrl, model, voice, speed, instructions, responseFormat, timeoutMs } =
|
||||
|
||||
Reference in New Issue
Block a user