Files
openclaw/extensions/google/speech-provider.test.ts
2026-04-25 06:11:23 +01:00

319 lines
8.8 KiB
TypeScript

import { afterEach, describe, expect, it, vi } from "vitest";
import { buildGoogleSpeechProvider, __testing } from "./speech-provider.js";
/**
 * Replaces the global `fetch` with a vitest mock that always resolves to a
 * successful, Gemini-style `generateContent` response.
 *
 * The response exposes `ok: true` and a `json()` method yielding a single
 * candidate whose only part is inline audio data: the given bytes,
 * base64-encoded, labelled with the `audio/L16;codec=pcm;rate=24000` MIME type.
 *
 * @param pcm - Bytes to embed as the inline audio payload.
 * @returns The mock installed as `fetch`, so tests can inspect its calls.
 */
function installGoogleTtsFetchMock(pcm = Buffer.from([1, 0, 2, 0])) {
  // The body is assembled on every `json()` call rather than once up front,
  // so each reader receives a freshly built object.
  const readJsonBody = async () => {
    const inlineData = {
      mimeType: "audio/L16;codec=pcm;rate=24000",
      data: pcm.toString("base64"),
    };
    const candidate = { content: { parts: [{ inlineData }] } };
    return { candidates: [candidate] };
  };
  const fetchMock = vi.fn().mockResolvedValue({ ok: true, json: readJsonBody });
  vi.stubGlobal("fetch", fetchMock);
  return fetchMock;
}
describe("Google speech provider", () => {
// Runs after every test: restores mocks and removes any stubbed globals and
// stubbed environment variables, so state installed by one case (e.g. the
// stubbed `fetch` or `GEMINI_API_KEY`) does not leak into the next.
afterEach(() => {
vi.restoreAllMocks();
vi.unstubAllGlobals();
vi.unstubAllEnvs();
});
// End-to-end check of synthesize() for an "audio-file" target:
//  - the request goes to the `gemini-3.1-flash-tts-preview` generateContent
//    endpoint even though the model is configured as `google/gemini-3.1-flash-tts`;
//  - the bracketed audio tag in the text is sent through unchanged;
//  - the API key travels in the `x-goog-api-key` header;
//  - the returned buffer is expected to be a RIFF/WAVE container with the
//    sample rate at byte offset 24 and the mocked PCM bytes from offset 44 on.
it("synthesizes Gemini PCM as WAV and preserves audio tags in the request text", async () => {
  const taggedText = "[whispers] The door is open.";
  const expectedPcm = Buffer.from([1, 0, 2, 0]);
  const expectedUrl =
    "https://generativelanguage.googleapis.com/v1beta/models/gemini-3.1-flash-tts-preview:generateContent";
  // Compared as a string, so key order here must match the request exactly.
  const expectedBody = JSON.stringify({
    contents: [{ role: "user", parts: [{ text: taggedText }] }],
    generationConfig: {
      responseModalities: ["AUDIO"],
      speechConfig: {
        voiceConfig: { prebuiltVoiceConfig: { voiceName: "Puck" } },
      },
    },
  });

  const fetchMock = installGoogleTtsFetchMock();
  const provider = buildGoogleSpeechProvider();

  const result = await provider.synthesize({
    text: taggedText,
    cfg: {},
    providerConfig: {
      apiKey: "google-test-key",
      model: "google/gemini-3.1-flash-tts",
      voiceName: "Puck",
    },
    target: "audio-file",
    timeoutMs: 12_345,
  });

  // Request shape.
  expect(fetchMock).toHaveBeenCalledWith(
    expectedUrl,
    expect.objectContaining({ method: "POST", body: expectedBody }),
  );
  const [, requestInit] = fetchMock.mock.calls[0];
  const sentHeaders = new Headers(requestInit.headers);
  expect(sentHeaders.get("x-goog-api-key")).toBe("google-test-key");

  // Result metadata.
  expect(result.outputFormat).toBe("wav");
  expect(result.fileExtension).toBe(".wav");
  expect(result.voiceCompatible).toBe(false);

  // WAV container layout.
  const wav = result.audioBuffer;
  expect(wav.subarray(0, 4).toString("ascii")).toBe("RIFF");
  expect(wav.subarray(8, 12).toString("ascii")).toBe("WAVE");
  expect(wav.readUInt32LE(24)).toBe(__testing.GOOGLE_TTS_SAMPLE_RATE);
  expect(wav.subarray(44)).toEqual(expectedPcm);
});
// With an empty providerConfig, the provider is expected to report itself as
// configured and to authenticate with the GEMINI_API_KEY environment variable.
// The configured model-provider base URL ends in `/v1beta/openai`; the request
// must still target the `/v1beta/models/...:generateContent` endpoint.
it("falls back to GEMINI_API_KEY and configured Google API base URL", async () => {
  const envKey = "env-google-key";
  const expectedEndpoint =
    "https://generativelanguage.googleapis.com/v1beta/models/gemini-3.1-flash-tts-preview:generateContent";
  const cfgWithOpenAiBaseUrl = {
    models: {
      providers: {
        google: {
          baseUrl: "https://generativelanguage.googleapis.com/v1beta/openai",
          models: [],
        },
      },
    },
  };

  vi.stubEnv("GEMINI_API_KEY", envKey);
  const fetchMock = installGoogleTtsFetchMock();
  const provider = buildGoogleSpeechProvider();

  const configured = provider.isConfigured({ providerConfig: {}, timeoutMs: 1 });
  expect(configured).toBe(true);

  await provider.synthesize({
    text: "Read this plainly.",
    cfg: cfgWithOpenAiBaseUrl,
    providerConfig: {},
    target: "voice-note",
    timeoutMs: 10_000,
  });

  expect(fetchMock).toHaveBeenCalledWith(expectedEndpoint, expect.any(Object));
  const [, requestInit] = fetchMock.mock.calls[0];
  const sentHeaders = new Headers(requestInit.headers);
  expect(sentHeaders.get("x-goog-api-key")).toBe(envKey);
});
// The speech providerConfig is empty here; the only API key available is the
// one on the Google model-provider entry in `cfg`. The provider is expected to
// count as configured and to send that key in the `x-goog-api-key` header.
it("can reuse a configured Google model-provider API key without auth profiles", async () => {
  const modelProviderKey = "model-provider-google-key";
  const googleModelProvider = {
    apiKey: modelProviderKey,
    baseUrl: "https://generativelanguage.googleapis.com",
    models: [],
  };
  const cfg = { models: { providers: { google: googleModelProvider } } };

  const fetchMock = installGoogleTtsFetchMock();
  const provider = buildGoogleSpeechProvider();

  const configured = provider.isConfigured({ cfg, providerConfig: {}, timeoutMs: 1 });
  expect(configured).toBe(true);

  await provider.synthesize({
    text: "Use the configured model provider key.",
    cfg,
    providerConfig: {},
    target: "audio-file",
    timeoutMs: 10_000,
  });

  const [, requestInit] = fetchMock.mock.calls[0];
  const sentHeaders = new Headers(requestInit.headers);
  expect(sentHeaders.get("x-goog-api-key")).toBe(modelProviderKey);
});
// The telephony path is expected to hand back the mocked PCM bytes as-is,
// reported as "pcm" at a 24 kHz sample rate, with no other fields on the result.
it("returns Gemini PCM directly for telephony synthesis", async () => {
  const telephonyPcm = Buffer.from([3, 0, 4, 0]);
  installGoogleTtsFetchMock(telephonyPcm);
  const provider = buildGoogleSpeechProvider();

  const telephonyResult = await provider.synthesizeTelephony?.({
    text: "Phone call audio.",
    cfg: {},
    providerConfig: {
      apiKey: "google-test-key",
      voice: "Kore",
    },
    timeoutMs: 5_000,
  });

  const expectedResult = {
    audioBuffer: telephonyPcm,
    outputFormat: "pcm",
    sampleRate: 24_000,
  };
  expect(telephonyResult).toEqual(expectedResult);
});
// When `audioProfile` and `speakerName` are configured, the text sent to the
// API is expected to be: the profile, then a "Speaker name: <name>" line, then
// the original text, each separated by a blank line.
it("prepends configured Gemini TTS profile text", async () => {
  const audioProfile = "Speak professionally with a calm executive tone.";
  const spokenText = "Status update starts now.";
  const expectedPrompt = [audioProfile, "Speaker name: Alex", spokenText].join("\n\n");

  const fetchMock = installGoogleTtsFetchMock();
  const provider = buildGoogleSpeechProvider();

  await provider.synthesize({
    text: spokenText,
    cfg: {},
    providerConfig: {
      apiKey: "google-test-key",
      audioProfile,
      speakerName: "Alex",
    },
    target: "audio-file",
    timeoutMs: 10_000,
  });

  const [, requestInit] = fetchMock.mock.calls[0];
  const sentBody = JSON.parse(String(requestInit.body));
  expect(sentBody).toMatchObject({
    contents: [{ parts: [{ text: expectedPrompt }] }],
  });
});
// Covers three pure (non-network) provider hooks:
//  1. resolveConfig: the raw `providers.google` entry is expected to come back
//     with the `google/` model prefix removed and `voice` exposed as `voiceName`;
//  2. parseDirectiveToken("google_voice"): yields a `voiceName` override;
//  3. parseDirectiveToken("google_model"): yields a `model` override.
it("resolves provider config and directive overrides", () => {
  const provider = buildGoogleSpeechProvider();

  // Every directive capability switched on; built fresh for each call.
  const allowEverything = () => ({
    enabled: true,
    allowText: true,
    allowProvider: true,
    allowVoice: true,
    allowModelId: true,
    allowVoiceSettings: true,
    allowNormalization: true,
    allowSeed: true,
  });

  const resolved = provider.resolveConfig?.({
    cfg: {},
    rawConfig: {
      providers: {
        google: {
          apiKey: "configured-key",
          model: "google/gemini-3.1-flash-tts-preview",
          voice: "Leda",
          audioProfile: "Speak warmly.",
          speakerName: "Narrator",
        },
      },
    },
    timeoutMs: 1,
  });
  expect(resolved).toEqual({
    apiKey: "configured-key",
    audioProfile: "Speak warmly.",
    baseUrl: undefined,
    model: "gemini-3.1-flash-tts-preview",
    speakerName: "Narrator",
    voiceName: "Leda",
  });

  const voiceDirective = provider.parseDirectiveToken?.({
    key: "google_voice",
    value: "Aoede",
    policy: allowEverything(),
  });
  expect(voiceDirective).toEqual({
    handled: true,
    overrides: { voiceName: "Aoede" },
  });

  const modelDirective = provider.parseDirectiveToken?.({
    key: "google_model",
    value: "gemini-3.1-flash-tts-preview",
    policy: allowEverything(),
  });
  expect(modelDirective).toEqual({
    handled: true,
    overrides: { model: "gemini-3.1-flash-tts-preview" },
  });
});
// listVoices is expected to resolve to a list that includes at least the
// "Kore" and "Puck" voices, each with matching `id` and `name`.
it("lists Gemini prebuilt TTS voices", async () => {
  const provider = buildGoogleSpeechProvider();
  const requiredVoices = ["Kore", "Puck"].map((voice) => ({ id: voice, name: voice }));

  const pendingVoices = provider.listVoices?.({ providerConfig: {} });

  await expect(pendingVoices).resolves.toEqual(expect.arrayContaining(requiredVoices));
});
// Stubs `fetch` with a 429 response carrying a Google-style error body and an
// `x-request-id` header, then expects synthesize() to reject with a message
// that includes the HTTP status, the error message, the error status code,
// and the request id.
it("formats Google TTS HTTP errors with provider details", async () => {
  const errorBody = JSON.stringify({
    error: {
      message: "Quota exceeded",
      status: "RESOURCE_EXHAUSTED",
    },
  });
  const quotaResponse = new Response(errorBody, {
    status: 429,
    headers: { "x-request-id": "google_req_123" },
  });
  vi.stubGlobal("fetch", vi.fn().mockResolvedValue(quotaResponse));

  const provider = buildGoogleSpeechProvider();
  const pendingSynthesis = provider.synthesize({
    text: "Read this plainly.",
    cfg: {},
    providerConfig: { apiKey: "google-test-key" },
    target: "audio-file",
    timeoutMs: 10_000,
  });

  await expect(pendingSynthesis).rejects.toThrow(
    "Google TTS failed (429): Quota exceeded [code=RESOURCE_EXHAUSTED] [request_id=google_req_123]",
  );
});
});