Files
openclaw/extensions/google/speech-provider.test.ts
2026-05-03 22:52:18 -07:00

656 lines
19 KiB
TypeScript

import {
getProviderHttpMocks,
installProviderHttpMockCleanup,
} from "openclaw/plugin-sdk/provider-http-test-mocks";
import { afterEach, beforeAll, describe, expect, it, vi } from "vitest";
// Created through vi.hoisted so the mock function already exists when the
// (hoisted) vi.mock factory below is evaluated.
const transcodeAudioBufferToOpusMock = vi.hoisted(() => vi.fn());
// Replace the media runtime module so `transcodeAudioBufferToOpus` resolves to
// the mock above; tests assert on its calls instead of running a real transcode.
vi.mock("openclaw/plugin-sdk/media-runtime", () => ({
transcodeAudioBufferToOpus: transcodeAudioBufferToOpusMock,
}));
// Shared HTTP-layer mocks supplied by the plugin SDK test helper.
// NOTE(review): presumably the helper also installs the module mocks that route
// the provider's HTTP calls to these functions — confirm in provider-http-test-mocks.
const {
assertOkOrThrowProviderErrorMock,
postJsonRequestMock,
resolveProviderHttpRequestConfigMock,
} = getProviderHttpMocks();
// Bindings for the module under test; they are typed from the module but only
// assigned once the dynamic import in beforeAll has resolved.
let buildGoogleSpeechProvider: typeof import("./speech-provider.js").buildGoogleSpeechProvider;
let __testing: typeof import("./speech-provider.js").__testing;
// Load the module under test lazily rather than via a static import.
// NOTE(review): presumably so the mocks above are registered before it loads — confirm.
beforeAll(async () => {
({ buildGoogleSpeechProvider, __testing } = await import("./speech-provider.js"));
});
// NOTE(review): presumably registers hooks that reset the shared HTTP mocks
// between tests — verify against the SDK helper.
installProviderHttpMockCleanup();
/**
 * Builds a minimal successful response stub shaped like a Gemini
 * `generateContent` reply that carries one inline audio part.
 *
 * @param pcm Raw audio bytes to embed; they are base64-encoded into
 *   `inlineData.data` each time `json()` is called.
 * @returns An object with `ok: true` and an async `json()` that yields a fresh
 *   payload on every call.
 */
function googleTtsResponse(pcm = Buffer.from([1, 0, 2, 0])) {
  // Built lazily inside json() so every call encodes the buffer anew and
  // hands back a new object graph.
  const readBody = async () => {
    const audioPart = {
      inlineData: {
        mimeType: "audio/L16;codec=pcm;rate=24000",
        data: pcm.toString("base64"),
      },
    };
    return {
      candidates: [{ content: { parts: [audioPart] } }],
    };
  };
  return {
    ok: true,
    json: readBody,
  };
}
/**
 * Makes the shared `postJsonRequestMock` resolve with a successful Google TTS
 * response carrying `pcm`, paired with a no-op async `release` mock.
 *
 * @param pcm Audio bytes the stubbed response should return.
 * @returns The same `postJsonRequestMock`, so callers can assert on its calls.
 */
function installGoogleTtsRequestMock(pcm = Buffer.from([1, 0, 2, 0])) {
  const response = googleTtsResponse(pcm);
  const release = vi.fn(async () => {});
  postJsonRequestMock.mockResolvedValue({ response, release });
  return postJsonRequestMock;
}
describe("Google speech provider", () => {
// Per-test teardown: drop any transcode stubs/calls and undo global and
// environment stubs so one test cannot leak state into the next.
afterEach(() => {
  transcodeAudioBufferToOpusMock.mockReset();
  vi.unstubAllEnvs();
  vi.unstubAllGlobals();
});
it("synthesizes Gemini PCM as WAV and preserves audio tags in the request text", async () => {
  // The default stubbed response carries the PCM bytes [1, 0, 2, 0].
  const postJson = installGoogleTtsRequestMock();
  const speech = buildGoogleSpeechProvider();
  const taggedText = "[whispers] The door is open.";

  const synthesized = await speech.synthesize({
    text: taggedText,
    cfg: {},
    providerConfig: {
      apiKey: "google-test-key",
      model: "google/gemini-3.1-flash-tts",
      voiceName: "Puck",
    },
    target: "audio-file",
    timeoutMs: 12_345,
  });

  // The bracketed audio tag must reach the API verbatim, and the configured
  // "google/gemini-3.1-flash-tts" model must hit the "-preview" endpoint.
  const expectedBody = {
    contents: [{ role: "user", parts: [{ text: taggedText }] }],
    generationConfig: {
      responseModalities: ["AUDIO"],
      speechConfig: {
        voiceConfig: { prebuiltVoiceConfig: { voiceName: "Puck" } },
      },
    },
  };
  expect(postJson).toHaveBeenCalledWith(
    expect.objectContaining({
      url: "https://generativelanguage.googleapis.com/v1beta/models/gemini-3.1-flash-tts-preview:generateContent",
      body: expectedBody,
      fetchFn: fetch,
      pinDns: false,
      timeoutMs: 12_345,
    }),
  );

  // The API key travels in the x-goog-api-key header.
  const sent = postJson.mock.calls[0]?.[0] as { headers?: HeadersInit };
  expect(new Headers(sent.headers).get("x-goog-api-key")).toBe("google-test-key");

  // Audio-file targets get a WAV result that is not flagged voice-compatible.
  expect(synthesized.outputFormat).toBe("wav");
  expect(synthesized.fileExtension).toBe(".wav");
  expect(synthesized.voiceCompatible).toBe(false);

  // Header markers at offsets 0 and 8, sample rate at offset 24, and the
  // untouched PCM payload from offset 44 onwards.
  const wav = synthesized.audioBuffer;
  expect(wav.subarray(0, 4).toString("ascii")).toBe("RIFF");
  expect(wav.subarray(8, 12).toString("ascii")).toBe("WAVE");
  expect(wav.readUInt32LE(24)).toBe(__testing.GOOGLE_TTS_SAMPLE_RATE);
  expect(wav.subarray(44)).toEqual(Buffer.from([1, 0, 2, 0]));

  // No Opus transcode for plain audio files.
  expect(transcodeAudioBufferToOpusMock).not.toHaveBeenCalled();
});
it("transcodes Gemini PCM to Opus for voice-note targets", async () => {
  installGoogleTtsRequestMock(Buffer.from([5, 0, 6, 0]));
  // The transcode step is mocked; it yields these bytes exactly once.
  transcodeAudioBufferToOpusMock.mockResolvedValueOnce(Buffer.from("google-opus"));
  const speech = buildGoogleSpeechProvider();

  const voiceNote = await speech.synthesize({
    text: "Send this as a voice note.",
    cfg: {},
    providerConfig: {
      apiKey: "google-test-key",
    },
    target: "voice-note",
    timeoutMs: 12_000,
  });

  // The result is the transcoder output, labelled as voice-compatible Opus.
  expect(voiceNote).toEqual({
    audioBuffer: Buffer.from("google-opus"),
    outputFormat: "opus",
    fileExtension: ".opus",
    voiceCompatible: true,
  });

  // The transcoder receives a WAV buffer and the caller's timeout.
  expect(transcodeAudioBufferToOpusMock).toHaveBeenCalledWith({
    audioBuffer: expect.any(Buffer),
    inputExtension: "wav",
    tempPrefix: "tts-google-",
    timeoutMs: 12_000,
  });
  const transcodeInput = transcodeAudioBufferToOpusMock.mock.calls[0][0].audioBuffer;
  expect(transcodeInput.subarray(0, 4).toString("ascii")).toBe("RIFF");
  expect(transcodeInput.subarray(8, 12).toString("ascii")).toBe("WAVE");
});
it("advertises all documented Gemini TTS-capable models", () => {
  // The provider's model list must equal the module's exported test constant.
  const { models } = buildGoogleSpeechProvider();
  expect(models).toEqual(__testing.GOOGLE_TTS_MODELS);
});
it("renders deterministic audio-profile-v1 prompts without generating tags", async () => {
  const speech = buildGoogleSpeechProvider();
  const transcript = "[whispers] The door is open.";

  const prepared = await speech.prepareSynthesis?.({
    text: transcript,
    cfg: {},
    providerConfig: {
      promptTemplate: "audio-profile-v1",
      personaPrompt: "Keep a close-mic feel.",
    },
    persona: {
      id: "alfred",
      label: "Alfred",
      prompt: {
        profile: "A brilliant British butler.",
        scene: "A quiet late-night study.",
        sampleContext: "The speaker is answering a trusted operator.",
        style: "Refined and lightly amused.",
        accent: "British English.",
        pacing: "Measured.",
        constraints: ["Do not read configuration values aloud."],
      },
    },
    target: "audio-file",
    timeoutMs: 1_000,
  });

  // Expected layout: preamble, persona profile, scene, director's notes
  // (including the provider-level personaPrompt), sample context, and finally
  // the transcript exactly as supplied.
  const expectedLines = [
    "Synthesize speech from the TRANSCRIPT section only. Use the other sections only",
    "as performance direction. Do not read section titles, notes, labels, or",
    "configuration aloud.",
    "",
    "# AUDIO PROFILE: Alfred",
    "A brilliant British butler.",
    "",
    "## THE SCENE",
    "A quiet late-night study.",
    "",
    "### DIRECTOR'S NOTES",
    "Style: Refined and lightly amused.",
    "Accent: British English.",
    "Pacing: Measured.",
    "Constraints:",
    "- Do not read configuration values aloud.",
    "Provider notes:",
    "Keep a close-mic feel.",
    "",
    "### SAMPLE CONTEXT",
    "The speaker is answering a trusted operator.",
    "",
    "### TRANSCRIPT",
    transcript,
  ];
  expect(prepared?.text).toBe(expectedLines.join("\n"));
});
it("does not wrap an OpenClaw audio-profile-v1 prompt twice", async () => {
  const provider = buildGoogleSpeechProvider();
  // Guard against a vacuous pass: the optional call below evaluates to
  // undefined when the hook is missing, which would satisfy the final
  // assertion without exercising any provider logic.
  expect(provider.prepareSynthesis).toBeDefined();
  // Input that already has the audio-profile-v1 preamble, profile header and
  // transcript section.
  const text = [
    "Synthesize speech from the TRANSCRIPT section only. Use the other sections only",
    "as performance direction. Do not read section titles, notes, labels, or",
    "configuration aloud.",
    "",
    "# AUDIO PROFILE: Alfred",
    "A brilliant British butler.",
    "",
    "### TRANSCRIPT",
    "Hello.",
  ].join("\n");
  const prepared = await provider.prepareSynthesis?.({
    text,
    cfg: {},
    providerConfig: {
      promptTemplate: "audio-profile-v1",
    },
    persona: {
      id: "alfred",
      label: "Alfred",
      prompt: {
        profile: "A brilliant British butler.",
      },
    },
    target: "audio-file",
    timeoutMs: 1_000,
  });
  // An undefined result means the hook did not produce a rewritten prompt.
  expect(prepared).toBeUndefined();
});
it("retries once when Gemini returns no audio payload", async () => {
  const pcm = Buffer.from([5, 0, 6, 0]);
  // First attempt: an OK response whose only part is text, with no inline audio.
  const textOnlyAttempt = {
    response: {
      ok: true,
      json: async () => ({ candidates: [{ content: { parts: [{ text: "not audio" }] } }] }),
    },
    release: vi.fn(async () => {}),
  };
  // Second attempt: a normal audio response.
  const audioAttempt = {
    response: googleTtsResponse(pcm),
    release: vi.fn(async () => {}),
  };
  const attempts = vi
    .fn()
    .mockResolvedValueOnce(textOnlyAttempt)
    .mockResolvedValueOnce(audioAttempt);
  postJsonRequestMock.mockImplementation(attempts);

  const speech = buildGoogleSpeechProvider();
  const synthesized = await speech.synthesize({
    text: "Retry this.",
    cfg: {},
    providerConfig: {
      apiKey: "google-test-key",
    },
    target: "audio-file",
    timeoutMs: 5_000,
  });

  // Exactly two requests, and the audio after offset 44 comes from the second.
  expect(attempts).toHaveBeenCalledTimes(2);
  expect(synthesized.audioBuffer.subarray(44)).toEqual(pcm);
});
it("retries once when Gemini TTS fetch aborts", async () => {
  const pcm = Buffer.from([7, 0, 8, 0]);
  // An error named "AbortError" stands in for an aborted fetch.
  const aborted = new Error("This operation was aborted");
  aborted.name = "AbortError";

  const attempts = vi.fn();
  attempts.mockRejectedValueOnce(aborted);
  attempts.mockResolvedValueOnce({
    response: googleTtsResponse(pcm),
    release: vi.fn(async () => {}),
  });
  postJsonRequestMock.mockImplementation(attempts);

  const speech = buildGoogleSpeechProvider();
  const synthesized = await speech.synthesize({
    text: "Retry aborted fetch.",
    cfg: {},
    providerConfig: {
      apiKey: "google-test-key",
    },
    target: "audio-file",
    timeoutMs: 5_000,
  });

  // The rejected first attempt is followed by exactly one more request.
  expect(attempts).toHaveBeenCalledTimes(2);
  expect(synthesized.audioBuffer.subarray(44)).toEqual(pcm);
});
it("does not retry non-transient Gemini TTS request failures", async () => {
  // A plain Error (not named AbortError) rejects the only stubbed attempt.
  const failingRequest = vi.fn().mockRejectedValueOnce(new Error("invalid request"));
  postJsonRequestMock.mockImplementation(failingRequest);
  const speech = buildGoogleSpeechProvider();

  const attempt = speech.synthesize({
    text: "Do not retry this.",
    cfg: {},
    providerConfig: {
      apiKey: "google-test-key",
    },
    target: "audio-file",
    timeoutMs: 5_000,
  });

  // The error surfaces to the caller after a single request.
  await expect(attempt).rejects.toThrow("invalid request");
  expect(failingRequest).toHaveBeenCalledTimes(1);
});
it("falls back to GEMINI_API_KEY and configured Google API base URL", async () => {
  vi.stubEnv("GEMINI_API_KEY", "env-google-key");
  const requestMock = installGoogleTtsRequestMock();
  // This test synthesizes a voice note, and the transcode mock is reset after
  // every test; stub a result so that path receives a real buffer instead of
  // the reset mock's undefined.
  transcodeAudioBufferToOpusMock.mockResolvedValueOnce(Buffer.from("google-opus"));
  const provider = buildGoogleSpeechProvider();
  // With no key in providerConfig, the env var alone makes the provider configured.
  expect(provider.isConfigured({ providerConfig: {}, timeoutMs: 1 })).toBe(true);
  await provider.synthesize({
    text: "Read this plainly.",
    cfg: {
      models: {
        providers: {
          google: {
            // Configured base URL ends in "/openai"; the expected request URL
            // below is the native v1beta generateContent endpoint.
            baseUrl: "https://generativelanguage.googleapis.com/v1beta/openai",
            models: [],
          },
        },
      },
    },
    providerConfig: {},
    target: "voice-note",
    timeoutMs: 10_000,
  });
  expect(requestMock).toHaveBeenCalledWith(
    expect.objectContaining({
      url: "https://generativelanguage.googleapis.com/v1beta/models/gemini-3.1-flash-tts-preview:generateContent",
    }),
  );
  // The env-provided key is sent in the x-goog-api-key header.
  const request = requestMock.mock.calls[0]?.[0] as { headers?: HeadersInit };
  expect(new Headers(request.headers).get("x-goog-api-key")).toBe("env-google-key");
});
it("can reuse a configured Google model-provider API key without auth profiles", async () => {
  const postJson = installGoogleTtsRequestMock();
  const speech = buildGoogleSpeechProvider();
  // The only API key in play lives under models.providers.google.
  const cfg = {
    models: {
      providers: {
        google: {
          apiKey: "model-provider-google-key",
          baseUrl: "https://generativelanguage.googleapis.com",
          models: [],
        },
      },
    },
  };

  const configured = speech.isConfigured({ cfg, providerConfig: {}, timeoutMs: 1 });
  expect(configured).toBe(true);

  await speech.synthesize({
    text: "Use the configured model provider key.",
    cfg,
    providerConfig: {},
    target: "audio-file",
    timeoutMs: 10_000,
  });

  // That model-provider key is the one sent with the request.
  const sent = postJson.mock.calls[0]?.[0] as { headers?: HeadersInit };
  const sentKey = new Headers(sent.headers).get("x-goog-api-key");
  expect(sentKey).toBe("model-provider-google-key");
});
it("returns Gemini PCM directly for telephony synthesis", async () => {
  const pcm = Buffer.from([3, 0, 4, 0]);
  installGoogleTtsRequestMock(pcm);
  const speech = buildGoogleSpeechProvider();

  const telephony = await speech.synthesizeTelephony?.({
    text: "Phone call audio.",
    cfg: {},
    providerConfig: {
      apiKey: "google-test-key",
      model: "google/gemini-3.1-flash-tts",
      voice: "Kore",
      audioProfile: "Speak calmly.",
      speakerName: "Default speaker",
    },
    providerOverrides: {
      model: "google/gemini-3.1-pro-tts",
      voiceName: "Puck",
      audioProfile: "Speak brightly.",
      speakerName: "Override speaker",
    },
    timeoutMs: 5_000,
  });

  // Every overridden field (model, voice, profile text, speaker name) must
  // take precedence over the providerConfig value in the outgoing request.
  const expectedContents = [
    {
      role: "user",
      parts: [
        { text: "Speak brightly.\n\nSpeaker name: Override speaker\n\nPhone call audio." },
      ],
    },
  ];
  const expectedSpeechConfig = {
    voiceConfig: { prebuiltVoiceConfig: { voiceName: "Puck" } },
  };
  expect(postJsonRequestMock).toHaveBeenCalledWith(
    expect.objectContaining({
      url: "https://generativelanguage.googleapis.com/v1beta/models/gemini-3.1-pro-tts:generateContent",
      body: expect.objectContaining({
        contents: expectedContents,
        generationConfig: expect.objectContaining({
          speechConfig: expectedSpeechConfig,
        }),
      }),
    }),
  );

  // Telephony output is the raw PCM payload with its sample rate; there are
  // no container fields such as fileExtension.
  expect(telephony).toEqual({
    audioBuffer: pcm,
    outputFormat: "pcm",
    sampleRate: 24_000,
  });
});
it("prepends configured Gemini TTS profile text", async () => {
  const postJson = installGoogleTtsRequestMock();
  const speech = buildGoogleSpeechProvider();

  await speech.synthesize({
    text: "Status update starts now.",
    cfg: {},
    providerConfig: {
      apiKey: "google-test-key",
      audioProfile: "Speak professionally with a calm executive tone.",
      speakerName: "Alex",
    },
    target: "audio-file",
    timeoutMs: 10_000,
  });

  // Profile text first, then the speaker-name line, then the spoken text,
  // each separated by a blank line.
  const expectedPrompt =
    "Speak professionally with a calm executive tone.\n\n" +
    "Speaker name: Alex\n\n" +
    "Status update starts now.";
  const sentBody = postJson.mock.calls[0]?.[0].body;
  expect(sentBody).toMatchObject({
    contents: [{ parts: [{ text: expectedPrompt }] }],
  });
});
it("resolves provider config and directive overrides", () => {
  const speech = buildGoogleSpeechProvider();
  // Fresh, fully-permissive directive policy for each parse call.
  const permissivePolicy = () => ({
    enabled: true,
    allowText: true,
    allowProvider: true,
    allowVoice: true,
    allowModelId: true,
    allowVoiceSettings: true,
    allowNormalization: true,
    allowSeed: true,
  });

  // Raw config: the "google/" model prefix is dropped and `voice` is exposed
  // as `voiceName` in the resolved shape.
  const resolved = speech.resolveConfig?.({
    cfg: {},
    rawConfig: {
      providers: {
        google: {
          apiKey: "configured-key",
          model: "google/gemini-3.1-flash-tts-preview",
          voice: "Leda",
          audioProfile: "Speak warmly.",
          speakerName: "Narrator",
        },
      },
    },
    timeoutMs: 1,
  });
  expect(resolved).toEqual({
    apiKey: "configured-key",
    audioProfile: "Speak warmly.",
    baseUrl: undefined,
    model: "gemini-3.1-flash-tts-preview",
    speakerName: "Narrator",
    voiceName: "Leda",
  });

  // The google_voice directive becomes a voiceName override.
  const voiceDirective = speech.parseDirectiveToken?.({
    key: "google_voice",
    value: "Aoede",
    policy: permissivePolicy(),
  });
  expect(voiceDirective).toEqual({
    handled: true,
    overrides: {
      voiceName: "Aoede",
    },
  });

  // The google_model directive becomes a model override.
  const modelDirective = speech.parseDirectiveToken?.({
    key: "google_model",
    value: "gemini-3.1-flash-tts-preview",
    policy: permissivePolicy(),
  });
  expect(modelDirective).toEqual({
    handled: true,
    overrides: {
      model: "gemini-3.1-flash-tts-preview",
    },
  });
});
it("lists Gemini prebuilt TTS voices", async () => {
  const speech = buildGoogleSpeechProvider();
  const voices = speech.listVoices?.({ providerConfig: {} });
  // Only spot-checks two entries; the full list may contain more.
  const sampleVoices = [
    { id: "Kore", name: "Kore" },
    { id: "Puck", name: "Puck" },
  ];
  await expect(voices).resolves.toEqual(expect.arrayContaining(sampleVoices));
});
it("formats Google TTS HTTP errors with provider details", async () => {
  // The mocked response check rejects with this message, and synthesize() is
  // expected to surface it unchanged.
  const providerFailure =
    "Google TTS failed (429): Quota exceeded [code=RESOURCE_EXHAUSTED] [request_id=google_req_123]";
  assertOkOrThrowProviderErrorMock.mockRejectedValue(new Error(providerFailure));

  // A 429 response with a Google-style error body and a request-id header.
  const errorBody = JSON.stringify({
    error: {
      message: "Quota exceeded",
      status: "RESOURCE_EXHAUSTED",
    },
  });
  postJsonRequestMock.mockResolvedValue({
    response: new Response(errorBody, {
      status: 429,
      headers: { "x-request-id": "google_req_123" },
    }),
    release: vi.fn(async () => {}),
  });

  const speech = buildGoogleSpeechProvider();
  const attempt = speech.synthesize({
    text: "Read this plainly.",
    cfg: {},
    providerConfig: { apiKey: "google-test-key" },
    target: "audio-file",
    timeoutMs: 10_000,
  });
  await expect(attempt).rejects.toThrow(providerFailure);
});
it("honors configured private-network opt-in for Google TTS", async () => {
  installGoogleTtsRequestMock();
  const speech = buildGoogleSpeechProvider();
  // The opt-in is configured on the Google model provider's request settings.
  const googleProvider = {
    baseUrl: "https://generativelanguage.googleapis.com/v1beta",
    request: { allowPrivateNetwork: true },
    models: [],
  };

  await speech.synthesize({
    text: "hello",
    cfg: { models: { providers: { google: googleProvider } } },
    providerConfig: { apiKey: "google-test-key" },
    target: "audio-file",
    timeoutMs: 12_345,
  });

  // The flag must be passed both at the top level and inside `request` when
  // the HTTP request config is resolved.
  const optedIn = expect.objectContaining({
    allowPrivateNetwork: true,
    request: expect.objectContaining({ allowPrivateNetwork: true }),
  });
  expect(resolveProviderHttpRequestConfigMock).toHaveBeenCalledWith(optedIn);
});
it("honors configured private-network opt-in for Google telephony TTS", async () => {
  installGoogleTtsRequestMock();
  const speech = buildGoogleSpeechProvider();
  // Same provider-level opt-in as the non-telephony case, exercised through
  // synthesizeTelephony (which takes no `target`).
  const googleProvider = {
    baseUrl: "https://generativelanguage.googleapis.com/v1beta",
    request: { allowPrivateNetwork: true },
    models: [],
  };

  await speech.synthesizeTelephony?.({
    text: "hello",
    cfg: { models: { providers: { google: googleProvider } } },
    providerConfig: { apiKey: "google-test-key" },
    timeoutMs: 12_345,
  });

  // The flag must be passed both at the top level and inside `request` when
  // the HTTP request config is resolved.
  const optedIn = expect.objectContaining({
    allowPrivateNetwork: true,
    request: expect.objectContaining({ allowPrivateNetwork: true }),
  });
  expect(resolveProviderHttpRequestConfigMock).toHaveBeenCalledWith(optedIn);
});
});