Files
openclaw/extensions/google/speech-provider.test.ts
2026-04-25 21:33:33 +01:00

416 lines
12 KiB
TypeScript

import * as providerHttp from "openclaw/plugin-sdk/provider-http";
import { afterEach, describe, expect, it, vi } from "vitest";
// Created via vi.hoisted so the mock function exists before vitest hoists the
// vi.mock factory below; individual tests program its resolved value.
const transcodeAudioBufferToOpusMock = vi.hoisted(() => vi.fn());
// Stub the media runtime so no real transcoding binary runs during tests.
vi.mock("openclaw/plugin-sdk/media-runtime", () => ({
  transcodeAudioBufferToOpus: transcodeAudioBufferToOpusMock,
}));
import { buildGoogleSpeechProvider, __testing } from "./speech-provider.js";
/**
 * Stubs the global `fetch` with a mock that answers every call with a
 * Gemini generateContent-shaped success payload carrying the given PCM
 * bytes as base64 inline audio data.
 *
 * @param pcm - raw little-endian PCM bytes to embed in the response.
 * @returns the installed fetch mock, for call-argument assertions.
 */
function installGoogleTtsFetchMock(pcm = Buffer.from([1, 0, 2, 0])) {
  // Built per json() call so each invocation yields a fresh object, mirroring
  // how a real Response would parse its body anew.
  const buildPayload = () => ({
    candidates: [
      {
        content: {
          parts: [
            {
              inlineData: {
                mimeType: "audio/L16;codec=pcm;rate=24000",
                data: pcm.toString("base64"),
              },
            },
          ],
        },
      },
    ],
  });
  const fetchMock = vi.fn().mockResolvedValue({
    ok: true,
    json: async () => buildPayload(),
  });
  vi.stubGlobal("fetch", fetchMock);
  return fetchMock;
}
describe("Google speech provider", () => {
  // Undo spies, global stubs (fetch), env stubs, and the transcode mock after
  // each test so cases stay fully independent.
  afterEach(() => {
    vi.restoreAllMocks();
    vi.unstubAllGlobals();
    vi.unstubAllEnvs();
    transcodeAudioBufferToOpusMock.mockReset();
  });
it("synthesizes Gemini PCM as WAV and preserves audio tags in the request text", async () => {
const fetchMock = installGoogleTtsFetchMock();
const provider = buildGoogleSpeechProvider();
const result = await provider.synthesize({
text: "[whispers] The door is open.",
cfg: {},
providerConfig: {
apiKey: "google-test-key",
model: "google/gemini-3.1-flash-tts",
voiceName: "Puck",
},
target: "audio-file",
timeoutMs: 12_345,
});
expect(fetchMock).toHaveBeenCalledWith(
"https://generativelanguage.googleapis.com/v1beta/models/gemini-3.1-flash-tts-preview:generateContent",
expect.objectContaining({
method: "POST",
body: JSON.stringify({
contents: [
{
role: "user",
parts: [{ text: "[whispers] The door is open." }],
},
],
generationConfig: {
responseModalities: ["AUDIO"],
speechConfig: {
voiceConfig: {
prebuiltVoiceConfig: {
voiceName: "Puck",
},
},
},
},
}),
}),
);
const [, init] = fetchMock.mock.calls[0];
expect(new Headers(init.headers).get("x-goog-api-key")).toBe("google-test-key");
expect(result.outputFormat).toBe("wav");
expect(result.fileExtension).toBe(".wav");
expect(result.voiceCompatible).toBe(false);
expect(result.audioBuffer.subarray(0, 4).toString("ascii")).toBe("RIFF");
expect(result.audioBuffer.subarray(8, 12).toString("ascii")).toBe("WAVE");
expect(result.audioBuffer.readUInt32LE(24)).toBe(__testing.GOOGLE_TTS_SAMPLE_RATE);
expect(result.audioBuffer.subarray(44)).toEqual(Buffer.from([1, 0, 2, 0]));
expect(transcodeAudioBufferToOpusMock).not.toHaveBeenCalled();
});
it("transcodes Gemini PCM to Opus for voice-note targets", async () => {
installGoogleTtsFetchMock(Buffer.from([5, 0, 6, 0]));
transcodeAudioBufferToOpusMock.mockResolvedValueOnce(Buffer.from("google-opus"));
const provider = buildGoogleSpeechProvider();
const result = await provider.synthesize({
text: "Send this as a voice note.",
cfg: {},
providerConfig: {
apiKey: "google-test-key",
},
target: "voice-note",
timeoutMs: 12_000,
});
expect(result).toEqual({
audioBuffer: Buffer.from("google-opus"),
outputFormat: "opus",
fileExtension: ".opus",
voiceCompatible: true,
});
expect(transcodeAudioBufferToOpusMock).toHaveBeenCalledWith({
audioBuffer: expect.any(Buffer),
inputExtension: "wav",
tempPrefix: "tts-google-",
timeoutMs: 12_000,
});
const [{ audioBuffer }] = transcodeAudioBufferToOpusMock.mock.calls[0];
expect(audioBuffer.subarray(0, 4).toString("ascii")).toBe("RIFF");
expect(audioBuffer.subarray(8, 12).toString("ascii")).toBe("WAVE");
});
it("falls back to GEMINI_API_KEY and configured Google API base URL", async () => {
vi.stubEnv("GEMINI_API_KEY", "env-google-key");
const fetchMock = installGoogleTtsFetchMock();
const provider = buildGoogleSpeechProvider();
expect(provider.isConfigured({ providerConfig: {}, timeoutMs: 1 })).toBe(true);
await provider.synthesize({
text: "Read this plainly.",
cfg: {
models: {
providers: {
google: {
baseUrl: "https://generativelanguage.googleapis.com/v1beta/openai",
models: [],
},
},
},
},
providerConfig: {},
target: "voice-note",
timeoutMs: 10_000,
});
expect(fetchMock).toHaveBeenCalledWith(
"https://generativelanguage.googleapis.com/v1beta/models/gemini-3.1-flash-tts-preview:generateContent",
expect.any(Object),
);
const [, init] = fetchMock.mock.calls[0];
expect(new Headers(init.headers).get("x-goog-api-key")).toBe("env-google-key");
});
it("can reuse a configured Google model-provider API key without auth profiles", async () => {
const fetchMock = installGoogleTtsFetchMock();
const provider = buildGoogleSpeechProvider();
const cfg = {
models: {
providers: {
google: {
apiKey: "model-provider-google-key",
baseUrl: "https://generativelanguage.googleapis.com",
models: [],
},
},
},
};
expect(provider.isConfigured({ cfg, providerConfig: {}, timeoutMs: 1 })).toBe(true);
await provider.synthesize({
text: "Use the configured model provider key.",
cfg,
providerConfig: {},
target: "audio-file",
timeoutMs: 10_000,
});
const [, init] = fetchMock.mock.calls[0];
expect(new Headers(init.headers).get("x-goog-api-key")).toBe("model-provider-google-key");
});
it("returns Gemini PCM directly for telephony synthesis", async () => {
const pcm = Buffer.from([3, 0, 4, 0]);
installGoogleTtsFetchMock(pcm);
const provider = buildGoogleSpeechProvider();
const result = await provider.synthesizeTelephony?.({
text: "Phone call audio.",
cfg: {},
providerConfig: {
apiKey: "google-test-key",
voice: "Kore",
},
timeoutMs: 5_000,
});
expect(result).toEqual({
audioBuffer: pcm,
outputFormat: "pcm",
sampleRate: 24_000,
});
});
it("prepends configured Gemini TTS profile text", async () => {
const fetchMock = installGoogleTtsFetchMock();
const provider = buildGoogleSpeechProvider();
await provider.synthesize({
text: "Status update starts now.",
cfg: {},
providerConfig: {
apiKey: "google-test-key",
audioProfile: "Speak professionally with a calm executive tone.",
speakerName: "Alex",
},
target: "audio-file",
timeoutMs: 10_000,
});
const [, init] = fetchMock.mock.calls[0];
expect(JSON.parse(String(init.body))).toMatchObject({
contents: [
{
parts: [
{
text:
"Speak professionally with a calm executive tone.\n\n" +
"Speaker name: Alex\n\n" +
"Status update starts now.",
},
],
},
],
});
});
it("resolves provider config and directive overrides", () => {
const provider = buildGoogleSpeechProvider();
expect(
provider.resolveConfig?.({
cfg: {},
rawConfig: {
providers: {
google: {
apiKey: "configured-key",
model: "google/gemini-3.1-flash-tts-preview",
voice: "Leda",
audioProfile: "Speak warmly.",
speakerName: "Narrator",
},
},
},
timeoutMs: 1,
}),
).toEqual({
apiKey: "configured-key",
audioProfile: "Speak warmly.",
baseUrl: undefined,
model: "gemini-3.1-flash-tts-preview",
speakerName: "Narrator",
voiceName: "Leda",
});
expect(
provider.parseDirectiveToken?.({
key: "google_voice",
value: "Aoede",
policy: {
enabled: true,
allowText: true,
allowProvider: true,
allowVoice: true,
allowModelId: true,
allowVoiceSettings: true,
allowNormalization: true,
allowSeed: true,
},
}),
).toEqual({
handled: true,
overrides: {
voiceName: "Aoede",
},
});
expect(
provider.parseDirectiveToken?.({
key: "google_model",
value: "gemini-3.1-flash-tts-preview",
policy: {
enabled: true,
allowText: true,
allowProvider: true,
allowVoice: true,
allowModelId: true,
allowVoiceSettings: true,
allowNormalization: true,
allowSeed: true,
},
}),
).toEqual({
handled: true,
overrides: {
model: "gemini-3.1-flash-tts-preview",
},
});
});
it("lists Gemini prebuilt TTS voices", async () => {
const provider = buildGoogleSpeechProvider();
await expect(provider.listVoices?.({ providerConfig: {} })).resolves.toEqual(
expect.arrayContaining([
{ id: "Kore", name: "Kore" },
{ id: "Puck", name: "Puck" },
]),
);
});
it("formats Google TTS HTTP errors with provider details", async () => {
vi.stubGlobal(
"fetch",
vi.fn().mockResolvedValue(
new Response(
JSON.stringify({
error: {
message: "Quota exceeded",
status: "RESOURCE_EXHAUSTED",
},
}),
{
status: 429,
headers: { "x-request-id": "google_req_123" },
},
),
),
);
const provider = buildGoogleSpeechProvider();
await expect(
provider.synthesize({
text: "Read this plainly.",
cfg: {},
providerConfig: { apiKey: "google-test-key" },
target: "audio-file",
timeoutMs: 10_000,
}),
).rejects.toThrow(
"Google TTS failed (429): Quota exceeded [code=RESOURCE_EXHAUSTED] [request_id=google_req_123]",
);
});
it("honors configured private-network opt-in for Google TTS", async () => {
installGoogleTtsFetchMock();
const postJsonRequestSpy = vi.spyOn(providerHttp, "postJsonRequest");
const provider = buildGoogleSpeechProvider();
await provider.synthesize({
text: "hello",
cfg: {
models: {
providers: {
google: {
baseUrl: "https://generativelanguage.googleapis.com/v1beta",
request: { allowPrivateNetwork: true },
models: [],
},
},
},
},
providerConfig: { apiKey: "google-test-key" },
target: "audio-file",
timeoutMs: 12_345,
});
expect(postJsonRequestSpy).toHaveBeenCalledWith(
expect.objectContaining({ allowPrivateNetwork: true }),
);
});
it("honors configured private-network opt-in for Google telephony TTS", async () => {
installGoogleTtsFetchMock();
const postJsonRequestSpy = vi.spyOn(providerHttp, "postJsonRequest");
const provider = buildGoogleSpeechProvider();
await provider.synthesizeTelephony?.({
text: "hello",
cfg: {
models: {
providers: {
google: {
baseUrl: "https://generativelanguage.googleapis.com/v1beta",
request: { allowPrivateNetwork: true },
models: [],
},
},
},
},
providerConfig: { apiKey: "google-test-key" },
timeoutMs: 12_345,
});
expect(postJsonRequestSpy).toHaveBeenCalledWith(
expect.objectContaining({ allowPrivateNetwork: true }),
);
});
});