// Mirror of https://github.com/openclaw/openclaw.git
// Synced 2026-05-06 05:30:42 +00:00 · 656 lines · 19 KiB · TypeScript
import {
|
|
getProviderHttpMocks,
|
|
installProviderHttpMockCleanup,
|
|
} from "openclaw/plugin-sdk/provider-http-test-mocks";
|
|
import { afterEach, beforeAll, describe, expect, it, vi } from "vitest";
|
|
|
|
// Module-level test wiring. Statement order matters here: the hoisted mock must
// exist before the vi.mock factory that references it.

// vi.hoisted creates the mock function ahead of the (hoisted) vi.mock call below.
const transcodeAudioBufferToOpusMock = vi.hoisted(() => vi.fn());

// Swap the media-runtime transcoder for the mock so no real transcoding runs.
vi.mock("openclaw/plugin-sdk/media-runtime", () => ({
  transcodeAudioBufferToOpus: transcodeAudioBufferToOpusMock,
}));

// Shared HTTP mocks supplied by the plugin SDK test helpers.
const {
  assertOkOrThrowProviderErrorMock,
  postJsonRequestMock,
  resolveProviderHttpRequestConfigMock,
} = getProviderHttpMocks();

// Bindings for the module under test; assigned in beforeAll via dynamic import.
let buildGoogleSpeechProvider: typeof import("./speech-provider.js").buildGoogleSpeechProvider;
let __testing: typeof import("./speech-provider.js").__testing;

// NOTE(review): the dynamic import presumably exists so the module loads only
// after the mocks above are registered — confirm before converting to a static import.
beforeAll(async () => {
  ({ buildGoogleSpeechProvider, __testing } = await import("./speech-provider.js"));
});

// Registers the SDK helper's cleanup for the shared HTTP mocks.
installProviderHttpMockCleanup();
|
|
|
|
/**
 * Builds a minimal fake response object shaped like a successful Gemini
 * `generateContent` reply carrying a single inline audio part.
 *
 * @param pcm Raw audio bytes to embed; defaults to the four bytes `01 00 02 00`.
 * @returns An object with `ok: true` and an async `json()` that resolves to the
 *   candidate payload, with `pcm` base64-encoded in `inlineData.data`.
 */
function googleTtsResponse(pcm = Buffer.from([1, 0, 2, 0])) {
  const inlineData = {
    mimeType: "audio/L16;codec=pcm;rate=24000",
    data: pcm.toString("base64"),
  };

  return {
    ok: true,
    json: async () => ({
      candidates: [
        {
          content: {
            parts: [{ inlineData }],
          },
        },
      ],
    }),
  };
}
|
|
|
|
/**
 * Makes every `postJsonRequestMock` call resolve with a successful Gemini TTS
 * response carrying `pcm`, plus a no-op async `release` callback.
 *
 * @param pcm Raw audio bytes for the fake response; defaults to `01 00 02 00`.
 * @returns The shared `postJsonRequestMock`, so callers can assert on its calls.
 */
function installGoogleTtsRequestMock(pcm = Buffer.from([1, 0, 2, 0])) {
  postJsonRequestMock.mockResolvedValue({
    response: googleTtsResponse(pcm),
    release: vi.fn(async () => {}),
  });
  return postJsonRequestMock;
}
|
|
|
|
describe("Google speech provider", () => {
|
|
// Per-test teardown: undo global/env stubs and reset the transcoder mock so
// call history and queued results do not carry over into the next test.
afterEach(() => {
  vi.unstubAllGlobals();
  vi.unstubAllEnvs();
  transcodeAudioBufferToOpusMock.mockReset();
});
|
|
|
|
// Audio-file target: checks the outgoing request and that the PCM payload comes
// back wrapped in a WAV container without touching the Opus transcoder.
it("synthesizes Gemini PCM as WAV and preserves audio tags in the request text", async () => {
  const requestMock = installGoogleTtsRequestMock();
  const provider = buildGoogleSpeechProvider();

  const result = await provider.synthesize({
    text: "[whispers] The door is open.",
    cfg: {},
    providerConfig: {
      apiKey: "google-test-key",
      model: "google/gemini-3.1-flash-tts",
      voiceName: "Puck",
    },
    target: "audio-file",
    timeoutMs: 12_345,
  });

  // The configured "google/gemini-3.1-flash-tts" is expected to be requested as
  // "gemini-3.1-flash-tts-preview", and the "[whispers]" tag must reach the API verbatim.
  expect(requestMock).toHaveBeenCalledWith(
    expect.objectContaining({
      url: "https://generativelanguage.googleapis.com/v1beta/models/gemini-3.1-flash-tts-preview:generateContent",
      body: {
        contents: [
          {
            role: "user",
            parts: [{ text: "[whispers] The door is open." }],
          },
        ],
        generationConfig: {
          responseModalities: ["AUDIO"],
          speechConfig: {
            voiceConfig: {
              prebuiltVoiceConfig: {
                voiceName: "Puck",
              },
            },
          },
        },
      },
      fetchFn: fetch,
      pinDns: false,
      timeoutMs: 12_345,
    }),
  );

  // The API key is expected in the x-goog-api-key request header.
  const request = requestMock.mock.calls[0]?.[0] as { headers?: HeadersInit };
  expect(new Headers(request.headers).get("x-goog-api-key")).toBe("google-test-key");

  expect(result.outputFormat).toBe("wav");
  expect(result.fileExtension).toBe(".wav");
  expect(result.voiceCompatible).toBe(false);

  // Header checks: "RIFF" at bytes 0-3, "WAVE" at bytes 8-11, the sample rate at
  // offset 24, and the untouched PCM bytes starting at offset 44.
  expect(result.audioBuffer.subarray(0, 4).toString("ascii")).toBe("RIFF");
  expect(result.audioBuffer.subarray(8, 12).toString("ascii")).toBe("WAVE");
  expect(result.audioBuffer.readUInt32LE(24)).toBe(__testing.GOOGLE_TTS_SAMPLE_RATE);
  expect(result.audioBuffer.subarray(44)).toEqual(Buffer.from([1, 0, 2, 0]));
  expect(transcodeAudioBufferToOpusMock).not.toHaveBeenCalled();
});
|
|
|
|
// Voice-note target: the result must be whatever the (mocked) Opus transcoder
// returns, and the transcoder must have been handed a WAV-wrapped buffer.
it("transcodes Gemini PCM to Opus for voice-note targets", async () => {
  installGoogleTtsRequestMock(Buffer.from([5, 0, 6, 0]));
  transcodeAudioBufferToOpusMock.mockResolvedValueOnce(Buffer.from("google-opus"));
  const provider = buildGoogleSpeechProvider();

  const result = await provider.synthesize({
    text: "Send this as a voice note.",
    cfg: {},
    providerConfig: {
      apiKey: "google-test-key",
    },
    target: "voice-note",
    timeoutMs: 12_000,
  });

  expect(result).toEqual({
    audioBuffer: Buffer.from("google-opus"),
    outputFormat: "opus",
    fileExtension: ".opus",
    voiceCompatible: true,
  });

  // The synthesis timeout is expected to be forwarded to the transcoder.
  expect(transcodeAudioBufferToOpusMock).toHaveBeenCalledWith({
    audioBuffer: expect.any(Buffer),
    inputExtension: "wav",
    tempPrefix: "tts-google-",
    timeoutMs: 12_000,
  });

  // Inspect the buffer actually passed in: it must start with a RIFF/WAVE header.
  const [{ audioBuffer }] = transcodeAudioBufferToOpusMock.mock.calls[0];
  expect(audioBuffer.subarray(0, 4).toString("ascii")).toBe("RIFF");
  expect(audioBuffer.subarray(8, 12).toString("ascii")).toBe("WAVE");
});
|
|
|
|
// The provider's advertised model list must equal the module's exported
// GOOGLE_TTS_MODELS test constant.
it("advertises all documented Gemini TTS-capable models", () => {
  const provider = buildGoogleSpeechProvider();

  expect(provider.models).toEqual(__testing.GOOGLE_TTS_MODELS);
});
|
|
|
|
// With promptTemplate "audio-profile-v1", prepareSynthesis must render the
// persona fields into a fixed section layout and leave the transcript
// (including its "[whispers]" tag) unchanged as the final section.
it("renders deterministic audio-profile-v1 prompts without generating tags", async () => {
  const provider = buildGoogleSpeechProvider();

  const prepared = await provider.prepareSynthesis?.({
    text: "[whispers] The door is open.",
    cfg: {},
    providerConfig: {
      promptTemplate: "audio-profile-v1",
      personaPrompt: "Keep a close-mic feel.",
    },
    persona: {
      id: "alfred",
      label: "Alfred",
      prompt: {
        profile: "A brilliant British butler.",
        scene: "A quiet late-night study.",
        sampleContext: "The speaker is answering a trusted operator.",
        style: "Refined and lightly amused.",
        accent: "British English.",
        pacing: "Measured.",
        constraints: ["Do not read configuration values aloud."],
      },
    },
    target: "audio-file",
    timeoutMs: 1_000,
  });

  // Exact expected prompt, one array entry per output line.
  // providerConfig.personaPrompt is expected under "Provider notes:".
  expect(prepared?.text).toBe(
    [
      "Synthesize speech from the TRANSCRIPT section only. Use the other sections only",
      "as performance direction. Do not read section titles, notes, labels, or",
      "configuration aloud.",
      "",
      "# AUDIO PROFILE: Alfred",
      "A brilliant British butler.",
      "",
      "## THE SCENE",
      "A quiet late-night study.",
      "",
      "### DIRECTOR'S NOTES",
      "Style: Refined and lightly amused.",
      "Accent: British English.",
      "Pacing: Measured.",
      "Constraints:",
      "- Do not read configuration values aloud.",
      "Provider notes:",
      "Keep a close-mic feel.",
      "",
      "### SAMPLE CONTEXT",
      "The speaker is answering a trusted operator.",
      "",
      "### TRANSCRIPT",
      "[whispers] The door is open.",
    ].join("\n"),
  );
});
|
|
|
|
// Text that already has the audio-profile-v1 layout must be left alone:
// prepareSynthesis is expected to resolve to undefined for it.
it("does not wrap an OpenClaw audio-profile-v1 prompt twice", async () => {
  const provider = buildGoogleSpeechProvider();

  // Guard against a vacuous pass: the optional call below also yields undefined
  // when the provider does not implement prepareSynthesis at all.
  expect(provider.prepareSynthesis).toBeDefined();

  // Input that is already a rendered audio-profile-v1 prompt.
  const text = [
    "Synthesize speech from the TRANSCRIPT section only. Use the other sections only",
    "as performance direction. Do not read section titles, notes, labels, or",
    "configuration aloud.",
    "",
    "# AUDIO PROFILE: Alfred",
    "A brilliant British butler.",
    "",
    "### TRANSCRIPT",
    "Hello.",
  ].join("\n");

  const prepared = await provider.prepareSynthesis?.({
    text,
    cfg: {},
    providerConfig: {
      promptTemplate: "audio-profile-v1",
    },
    persona: {
      id: "alfred",
      label: "Alfred",
      prompt: {
        profile: "A brilliant British butler.",
      },
    },
    target: "audio-file",
    timeoutMs: 1_000,
  });

  expect(prepared).toBeUndefined();
});
|
|
|
|
// First reply is a successful response with only a text part (no inline audio);
// the second carries audio. Exactly two requests are expected.
it("retries once when Gemini returns no audio payload", async () => {
  const pcm = Buffer.from([5, 0, 6, 0]);
  const requestSequence = vi
    .fn()
    .mockResolvedValueOnce({
      response: {
        ok: true,
        json: async () => ({ candidates: [{ content: { parts: [{ text: "not audio" }] } }] }),
      },
      release: vi.fn(async () => {}),
    })
    .mockResolvedValueOnce({
      response: googleTtsResponse(pcm),
      release: vi.fn(async () => {}),
    });
  postJsonRequestMock.mockImplementation(requestSequence);
  const provider = buildGoogleSpeechProvider();

  const result = await provider.synthesize({
    text: "Retry this.",
    cfg: {},
    providerConfig: {
      apiKey: "google-test-key",
    },
    target: "audio-file",
    timeoutMs: 5_000,
  });

  expect(requestSequence).toHaveBeenCalledTimes(2);
  // The bytes after the 44-byte header must be the PCM from the second response.
  expect(result.audioBuffer.subarray(44)).toEqual(pcm);
});
|
|
|
|
// First request rejects with an error named "AbortError"; the second succeeds.
// Exactly two requests are expected.
it("retries once when Gemini TTS fetch aborts", async () => {
  const pcm = Buffer.from([7, 0, 8, 0]);
  const abortError = new Error("This operation was aborted");
  abortError.name = "AbortError";
  const requestSequence = vi
    .fn()
    .mockRejectedValueOnce(abortError)
    .mockResolvedValueOnce({
      response: googleTtsResponse(pcm),
      release: vi.fn(async () => {}),
    });
  postJsonRequestMock.mockImplementation(requestSequence);
  const provider = buildGoogleSpeechProvider();

  const result = await provider.synthesize({
    text: "Retry aborted fetch.",
    cfg: {},
    providerConfig: {
      apiKey: "google-test-key",
    },
    target: "audio-file",
    timeoutMs: 5_000,
  });

  expect(requestSequence).toHaveBeenCalledTimes(2);
  // The bytes after the 44-byte header must be the PCM from the retried request.
  expect(result.audioBuffer.subarray(44)).toEqual(pcm);
});
|
|
|
|
// A plain Error (not named "AbortError") must surface to the caller after a
// single request attempt.
it("does not retry non-transient Gemini TTS request failures", async () => {
  const requestSequence = vi.fn().mockRejectedValueOnce(new Error("invalid request"));
  postJsonRequestMock.mockImplementation(requestSequence);
  const provider = buildGoogleSpeechProvider();

  await expect(
    provider.synthesize({
      text: "Do not retry this.",
      cfg: {},
      providerConfig: {
        apiKey: "google-test-key",
      },
      target: "audio-file",
      timeoutMs: 5_000,
    }),
  ).rejects.toThrow("invalid request");

  expect(requestSequence).toHaveBeenCalledTimes(1);
});
|
|
|
|
// No key in providerConfig: the GEMINI_API_KEY env var must make the provider
// count as configured and must be sent in the request header.
it("falls back to GEMINI_API_KEY and configured Google API base URL", async () => {
  vi.stubEnv("GEMINI_API_KEY", "env-google-key");
  const requestMock = installGoogleTtsRequestMock();
  const provider = buildGoogleSpeechProvider();

  expect(provider.isConfigured({ providerConfig: {}, timeoutMs: 1 })).toBe(true);

  await provider.synthesize({
    text: "Read this plainly.",
    cfg: {
      models: {
        providers: {
          google: {
            // Configured with a trailing "/openai" segment; the expected request URL below has none.
            baseUrl: "https://generativelanguage.googleapis.com/v1beta/openai",
            models: [],
          },
        },
      },
    },
    providerConfig: {},
    target: "voice-note",
    timeoutMs: 10_000,
  });

  expect(requestMock).toHaveBeenCalledWith(
    expect.objectContaining({
      url: "https://generativelanguage.googleapis.com/v1beta/models/gemini-3.1-flash-tts-preview:generateContent",
    }),
  );
  const request = requestMock.mock.calls[0]?.[0] as { headers?: HeadersInit };
  expect(new Headers(request.headers).get("x-goog-api-key")).toBe("env-google-key");
});
|
|
|
|
// No key in providerConfig: the apiKey under cfg.models.providers.google must
// make the provider count as configured and must be sent in the request header.
it("can reuse a configured Google model-provider API key without auth profiles", async () => {
  const requestMock = installGoogleTtsRequestMock();
  const provider = buildGoogleSpeechProvider();
  const cfg = {
    models: {
      providers: {
        google: {
          apiKey: "model-provider-google-key",
          baseUrl: "https://generativelanguage.googleapis.com",
          models: [],
        },
      },
    },
  };

  expect(provider.isConfigured({ cfg, providerConfig: {}, timeoutMs: 1 })).toBe(true);

  await provider.synthesize({
    text: "Use the configured model provider key.",
    cfg,
    providerConfig: {},
    target: "audio-file",
    timeoutMs: 10_000,
  });

  const request = requestMock.mock.calls[0]?.[0] as { headers?: HeadersInit };
  expect(new Headers(request.headers).get("x-goog-api-key")).toBe("model-provider-google-key");
});
|
|
|
|
// Telephony path: the raw PCM must come back unwrapped, and every value in
// providerOverrides must take precedence over its providerConfig counterpart.
it("returns Gemini PCM directly for telephony synthesis", async () => {
  const pcm = Buffer.from([3, 0, 4, 0]);
  installGoogleTtsRequestMock(pcm);
  const provider = buildGoogleSpeechProvider();

  const result = await provider.synthesizeTelephony?.({
    text: "Phone call audio.",
    cfg: {},
    providerConfig: {
      apiKey: "google-test-key",
      model: "google/gemini-3.1-flash-tts",
      voice: "Kore",
      audioProfile: "Speak calmly.",
      speakerName: "Default speaker",
    },
    providerOverrides: {
      model: "google/gemini-3.1-pro-tts",
      voiceName: "Puck",
      audioProfile: "Speak brightly.",
      speakerName: "Override speaker",
    },
    timeoutMs: 5_000,
  });

  // Expected request: the override model in the URL, the override profile and
  // speaker name prepended to the text, and the override voice.
  expect(postJsonRequestMock).toHaveBeenCalledWith(
    expect.objectContaining({
      url: "https://generativelanguage.googleapis.com/v1beta/models/gemini-3.1-pro-tts:generateContent",
      body: expect.objectContaining({
        contents: [
          {
            role: "user",
            parts: [
              { text: "Speak brightly.\n\nSpeaker name: Override speaker\n\nPhone call audio." },
            ],
          },
        ],
        generationConfig: expect.objectContaining({
          speechConfig: {
            voiceConfig: {
              prebuiltVoiceConfig: {
                voiceName: "Puck",
              },
            },
          },
        }),
      }),
    }),
  );

  // No WAV header here: the buffer must equal the PCM bytes from the response.
  expect(result).toEqual({
    audioBuffer: pcm,
    outputFormat: "pcm",
    sampleRate: 24_000,
  });
});
|
|
|
|
// audioProfile and speakerName from providerConfig must be prepended to the
// spoken text, separated by blank lines, in the request body.
it("prepends configured Gemini TTS profile text", async () => {
  const requestMock = installGoogleTtsRequestMock();
  const provider = buildGoogleSpeechProvider();

  await provider.synthesize({
    text: "Status update starts now.",
    cfg: {},
    providerConfig: {
      apiKey: "google-test-key",
      audioProfile: "Speak professionally with a calm executive tone.",
      speakerName: "Alex",
    },
    target: "audio-file",
    timeoutMs: 10_000,
  });

  // Partial match: only the text part of the first request's body is checked.
  expect(requestMock.mock.calls[0]?.[0].body).toMatchObject({
    contents: [
      {
        parts: [
          {
            text:
              "Speak professionally with a calm executive tone.\n\n" +
              "Speaker name: Alex\n\n" +
              "Status update starts now.",
          },
        ],
      },
    ],
  });
});
|
|
|
|
// Covers resolveConfig (raw provider settings -> normalized config) and
// parseDirectiveToken for the "google_voice" and "google_model" keys.
it("resolves provider config and directive overrides", () => {
  const provider = buildGoogleSpeechProvider();

  // Directive policy with every permission enabled, shared by both token checks.
  const permissivePolicy = {
    enabled: true,
    allowText: true,
    allowProvider: true,
    allowVoice: true,
    allowModelId: true,
    allowVoiceSettings: true,
    allowNormalization: true,
    allowSeed: true,
  };

  // Expected normalization: the "google/" model prefix is dropped and the raw
  // "voice" setting is reported as "voiceName".
  expect(
    provider.resolveConfig?.({
      cfg: {},
      rawConfig: {
        providers: {
          google: {
            apiKey: "configured-key",
            model: "google/gemini-3.1-flash-tts-preview",
            voice: "Leda",
            audioProfile: "Speak warmly.",
            speakerName: "Narrator",
          },
        },
      },
      timeoutMs: 1,
    }),
  ).toEqual({
    apiKey: "configured-key",
    audioProfile: "Speak warmly.",
    baseUrl: undefined,
    model: "gemini-3.1-flash-tts-preview",
    speakerName: "Narrator",
    voiceName: "Leda",
  });

  expect(
    provider.parseDirectiveToken?.({
      key: "google_voice",
      value: "Aoede",
      policy: permissivePolicy,
    }),
  ).toEqual({
    handled: true,
    overrides: {
      voiceName: "Aoede",
    },
  });

  expect(
    provider.parseDirectiveToken?.({
      key: "google_model",
      value: "gemini-3.1-flash-tts-preview",
      policy: permissivePolicy,
    }),
  ).toEqual({
    handled: true,
    overrides: {
      model: "gemini-3.1-flash-tts-preview",
    },
  });
});
|
|
|
|
// listVoices must resolve to a list that includes at least "Kore" and "Puck";
// any additional voices are allowed by arrayContaining.
it("lists Gemini prebuilt TTS voices", async () => {
  const provider = buildGoogleSpeechProvider();

  await expect(provider.listVoices?.({ providerConfig: {} })).resolves.toEqual(
    expect.arrayContaining([
      { id: "Kore", name: "Kore" },
      { id: "Puck", name: "Puck" },
    ]),
  );
});
|
|
|
|
// When the shared assert-ok helper rejects, synthesize must reject with that
// same message. The helper is mocked, so this checks propagation only.
it("formats Google TTS HTTP errors with provider details", async () => {
  // Single source for the message used by both the mock and the assertion.
  const expectedMessage =
    "Google TTS failed (429): Quota exceeded [code=RESOURCE_EXHAUSTED] [request_id=google_req_123]";

  assertOkOrThrowProviderErrorMock.mockRejectedValue(new Error(expectedMessage));
  postJsonRequestMock.mockResolvedValue({
    response: new Response(
      JSON.stringify({
        error: {
          message: "Quota exceeded",
          status: "RESOURCE_EXHAUSTED",
        },
      }),
      {
        status: 429,
        headers: { "x-request-id": "google_req_123" },
      },
    ),
    release: vi.fn(async () => {}),
  });
  const provider = buildGoogleSpeechProvider();

  await expect(
    provider.synthesize({
      text: "Read this plainly.",
      cfg: {},
      providerConfig: { apiKey: "google-test-key" },
      target: "audio-file",
      timeoutMs: 10_000,
    }),
  ).rejects.toThrow(expectedMessage);
});
|
|
|
|
// request.allowPrivateNetwork on the Google model provider config must be
// forwarded to the HTTP request-config resolver for regular synthesis.
it("honors configured private-network opt-in for Google TTS", async () => {
  installGoogleTtsRequestMock();

  const provider = buildGoogleSpeechProvider();
  await provider.synthesize({
    text: "hello",
    cfg: {
      models: {
        providers: {
          google: {
            baseUrl: "https://generativelanguage.googleapis.com/v1beta",
            request: { allowPrivateNetwork: true },
            models: [],
          },
        },
      },
    },
    providerConfig: { apiKey: "google-test-key" },
    target: "audio-file",
    timeoutMs: 12_345,
  });

  // The flag is expected both at the top level and inside the nested request options.
  expect(resolveProviderHttpRequestConfigMock).toHaveBeenCalledWith(
    expect.objectContaining({
      allowPrivateNetwork: true,
      request: expect.objectContaining({ allowPrivateNetwork: true }),
    }),
  );
});
|
|
|
|
// Same private-network forwarding check as the regular synthesis test, but
// through synthesizeTelephony (which takes no target).
it("honors configured private-network opt-in for Google telephony TTS", async () => {
  installGoogleTtsRequestMock();

  const provider = buildGoogleSpeechProvider();
  await provider.synthesizeTelephony?.({
    text: "hello",
    cfg: {
      models: {
        providers: {
          google: {
            baseUrl: "https://generativelanguage.googleapis.com/v1beta",
            request: { allowPrivateNetwork: true },
            models: [],
          },
        },
      },
    },
    providerConfig: { apiKey: "google-test-key" },
    timeoutMs: 12_345,
  });

  // The flag is expected both at the top level and inside the nested request options.
  expect(resolveProviderHttpRequestConfigMock).toHaveBeenCalledWith(
    expect.objectContaining({
      allowPrivateNetwork: true,
      request: expect.objectContaining({ allowPrivateNetwork: true }),
    }),
  );
});
|
|
});
|