From 361737d1f1cb6a316cefdfb6944bab8ca957ad65 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Sun, 3 May 2026 22:49:46 -0700 Subject: [PATCH] fix(tts): honor telephony voice overrides --- CHANGELOG.md | 1 + .../azure-speech/speech-provider.test.ts | 36 +++++++++++++++++++ extensions/azure-speech/speech-provider.ts | 5 +-- extensions/google/speech-provider.test.ts | 33 +++++++++++++++++ extensions/google/speech-provider.ts | 9 ++--- extensions/gradium/speech-provider.test.ts | 8 +++-- extensions/gradium/speech-provider.ts | 3 +- extensions/inworld/speech-provider.test.ts | 7 ++-- extensions/inworld/speech-provider.ts | 7 ++-- extensions/xai/speech-provider.test.ts | 35 ++++++++++++++++++ extensions/xai/speech-provider.ts | 7 ++-- 11 files changed, 133 insertions(+), 18 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 94d1717a994..db301784881 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -18,6 +18,7 @@ Docs: https://docs.openclaw.ai - Agents/verbose: use compact explain-mode tool summaries for `/verbose` and progress drafts by default, with `agents.defaults.toolProgressDetail: "raw"` and per-agent overrides for debugging raw command/detail output. - Agents/commands: add `/steer ` for queue-independent steering of the active current-session run without starting a new turn when the session is idle. (#76934) - Agents/subagents: preserve every grouped child result when direct completion fallback has to bypass the requester-agent announce turn. Thanks @vincentkoc. +- TTS/telephony: honor provider voice/model overrides in telephony synthesis providers so Google Meet agent speech logs match the backend that actually produced the audio. Thanks @vincentkoc. - Tools/BTW: add `/side` as a text and native slash-command alias for `/btw` side questions. - Doctor/config: `doctor --fix` now commits safe legacy migrations even when unrelated validation issues (e.g. a missing plugin) prevent full validation from passing, so `agents.defaults.llm` and other known-legacy keys are always cleaned up by `doctor --fix` regardless of other config problems. Fixes #76798. (#76800) Thanks @hclsys. - Docs: clarify that IRC uses raw TCP/TLS sockets outside operator-managed forward proxy routing, so direct IRC egress should be explicitly approved before enabling IRC. Thanks @jesse-merhi. diff --git a/extensions/azure-speech/speech-provider.test.ts b/extensions/azure-speech/speech-provider.test.ts index 40d32ec32e5..c34fd652257 100644 --- a/extensions/azure-speech/speech-provider.test.ts +++ b/extensions/azure-speech/speech-provider.test.ts @@ -176,6 +176,42 @@ describe("buildAzureSpeechProvider", () => { }); }); + it("honors voice and language overrides for telephony output", async () => { + const provider = buildAzureSpeechProvider(); + const result = await provider.synthesizeTelephony?.({ + text: "hello", + cfg: {} as never, + providerConfig: { + apiKey: "key", + region: "eastus", + voice: "en-US-JennyNeural", + lang: "en-US", + }, + providerOverrides: { + voice: "en-US-AriaNeural", + lang: "es-US", + }, + timeoutMs: 30_000, + }); + + expect(azureSpeechTTSMock).toHaveBeenCalledWith({ + text: "hello", + apiKey: "key", + baseUrl: "https://eastus.tts.speech.microsoft.com", + endpoint: undefined, + region: "eastus", + voice: "en-US-AriaNeural", + lang: "es-US", + outputFormat: "raw-8khz-8bit-mono-mulaw", + timeoutMs: 30_000, + }); + expect(result).toEqual({ + audioBuffer: Buffer.from("audio-bytes"), + outputFormat: "raw-8khz-8bit-mono-mulaw", + sampleRate: 8_000, + }); + }); + it("lists voices through config or explicit request auth", async () => { const provider = buildAzureSpeechProvider(); const voices = await provider.listVoices?.({ diff --git a/extensions/azure-speech/speech-provider.ts b/extensions/azure-speech/speech-provider.ts index 22fcc637ea5..f88dbc8ddd4 100644 --- a/extensions/azure-speech/speech-provider.ts +++ b/extensions/azure-speech/speech-provider.ts @@ -279,6 +279,7 @@ export function buildAzureSpeechProvider(): SpeechProviderPlugin { }, synthesizeTelephony: async (req) => { const config = readAzureSpeechProviderConfig(req.providerConfig); + const overrides = readAzureSpeechOverrides(req.providerOverrides); const apiKey = resolveApiKey(config); if (!apiKey) { throw new Error("Azure Speech API key missing"); @@ -290,8 +291,8 @@ export function buildAzureSpeechProvider(): SpeechProviderPlugin { baseUrl: config.baseUrl, endpoint: config.endpoint, region: config.region, - voice: config.voice, - lang: config.lang, + voice: overrides.voice ?? config.voice, + lang: overrides.lang ?? config.lang, outputFormat: DEFAULT_AZURE_SPEECH_TELEPHONY_FORMAT, timeoutMs: resolveTimeoutMs(config, req.timeoutMs), }); diff --git a/extensions/google/speech-provider.test.ts b/extensions/google/speech-provider.test.ts index b8834a58f0a..f1da219f99d 100644 --- a/extensions/google/speech-provider.test.ts +++ b/extensions/google/speech-provider.test.ts @@ -397,11 +397,44 @@ describe("Google speech provider", () => { cfg: {}, providerConfig: { apiKey: "google-test-key", + model: "google/gemini-3.1-flash-tts", voice: "Kore", + audioProfile: "Speak calmly.", + speakerName: "Default speaker", + }, + providerOverrides: { + model: "google/gemini-3.1-pro-tts", + voiceName: "Puck", + audioProfile: "Speak brightly.", + speakerName: "Override speaker", }, timeoutMs: 5_000, }); + expect(postJsonRequestMock).toHaveBeenCalledWith( + expect.objectContaining({ + url: "https://generativelanguage.googleapis.com/v1beta/models/gemini-3.1-pro-tts:generateContent", + body: expect.objectContaining({ + contents: [ + { + role: "user", + parts: [ + { text: "Speak brightly.\n\nSpeaker name: Override speaker\n\nPhone call audio." }, + ], + }, + ], + generationConfig: expect.objectContaining({ + speechConfig: { + voiceConfig: { + prebuiltVoiceConfig: { + voiceName: "Puck", + }, + }, + }, + }), + }), + }), + ); expect(result).toEqual({ audioBuffer: pcm, outputFormat: "pcm", diff --git a/extensions/google/speech-provider.ts b/extensions/google/speech-provider.ts index 951a4001cfa..47358150cf2 100644 --- a/extensions/google/speech-provider.ts +++ b/extensions/google/speech-provider.ts @@ -640,6 +640,7 @@ export function buildGoogleSpeechProvider(): SpeechProviderPlugin { }, synthesizeTelephony: async (req) => { const config = readGoogleTtsProviderConfig(req.providerConfig); + const overrides = readGoogleTtsOverrides(req.providerOverrides); const apiKey = resolveGoogleTtsApiKey({ cfg: req.cfg, providerConfig: req.providerConfig, @@ -654,10 +655,10 @@ export function buildGoogleSpeechProvider(): SpeechProviderPlugin { request: sanitizeConfiguredModelProviderRequest( req.cfg?.models?.providers?.google?.request, ), - model: config.model, - voiceName: config.voiceName, - audioProfile: config.audioProfile, - speakerName: config.speakerName, + model: normalizeGoogleTtsModel(overrides.model ?? config.model), + voiceName: normalizeGoogleTtsVoiceName(overrides.voiceName ?? config.voiceName), + audioProfile: overrides.audioProfile ?? config.audioProfile, + speakerName: overrides.speakerName ?? config.speakerName, timeoutMs: req.timeoutMs, }); return { diff --git a/extensions/gradium/speech-provider.test.ts b/extensions/gradium/speech-provider.test.ts index bd439770b65..e98c4beb922 100644 --- a/extensions/gradium/speech-provider.test.ts +++ b/extensions/gradium/speech-provider.test.ts @@ -98,12 +98,16 @@ describe("gradium speech provider", () => { const result = await provider.synthesizeTelephony!({ text: "Telephony test", cfg: {} as never, - providerConfig: { apiKey: "gsk_test123" }, + providerConfig: { apiKey: "gsk_test123", voiceId: "default-voice" }, + providerOverrides: { voiceId: "override-voice" }, timeoutMs: 30_000, }); const [, init] = fetchMock.mock.calls[0] as [string, RequestInit]; - expect(JSON.parse(init.body as string).output_format).toBe("ulaw_8000"); + expect(JSON.parse(init.body as string)).toMatchObject({ + voice_id: "override-voice", + output_format: "ulaw_8000", + }); expect(result.outputFormat).toBe("ulaw_8000"); expect(result.sampleRate).toBe(8_000); expect(result.audioBuffer).toEqual(audioData); diff --git a/extensions/gradium/speech-provider.ts b/extensions/gradium/speech-provider.ts index 877b5dbdaef..aa9472b572b 100644 --- a/extensions/gradium/speech-provider.ts +++ b/extensions/gradium/speech-provider.ts @@ -96,6 +96,7 @@ export function buildGradiumSpeechProvider(): SpeechProviderPlugin { }, synthesizeTelephony: async (req) => { const config = readGradiumProviderConfig(req.providerConfig); + const overrides = req.providerOverrides ?? {}; const apiKey = config.apiKey || process.env.GRADIUM_API_KEY; if (!apiKey) { throw new Error("Gradium API key missing"); @@ -106,7 +107,7 @@ export function buildGradiumSpeechProvider(): SpeechProviderPlugin { text: req.text, apiKey, baseUrl: config.baseUrl, - voiceId: config.voiceId, + voiceId: trimToUndefined(overrides.voiceId) ?? config.voiceId, outputFormat, timeoutMs: req.timeoutMs, }); diff --git a/extensions/inworld/speech-provider.test.ts b/extensions/inworld/speech-provider.test.ts index 2bbd401b5a6..5676a905d88 100644 --- a/extensions/inworld/speech-provider.test.ts +++ b/extensions/inworld/speech-provider.test.ts @@ -190,6 +190,7 @@ describe("buildInworldSpeechProvider", () => { text: "Hello", cfg: {} as never, providerConfig: { apiKey: "key", voiceId: "Sarah", modelId: "inworld-tts-1.5-max" }, + providerOverrides: { voice: "Ashley", model: "inworld-tts-1.5-mini", temperature: 0.6 }, timeoutMs: 30_000, }); @@ -197,11 +198,11 @@ describe("buildInworldSpeechProvider", () => { text: "Hello", apiKey: "key", baseUrl: "https://api.inworld.ai", - voiceId: "Sarah", - modelId: "inworld-tts-1.5-max", + voiceId: "Ashley", + modelId: "inworld-tts-1.5-mini", audioEncoding: "PCM", sampleRateHertz: 22_050, - temperature: undefined, + temperature: 0.6, timeoutMs: 30_000, }); expect(result).toEqual({ diff --git a/extensions/inworld/speech-provider.ts b/extensions/inworld/speech-provider.ts index f9c28a91e46..805145d7dda 100644 --- a/extensions/inworld/speech-provider.ts +++ b/extensions/inworld/speech-provider.ts @@ -197,6 +197,7 @@ export function buildInworldSpeechProvider(): SpeechProviderPlugin { }, synthesizeTelephony: async (req) => { const config = readInworldProviderConfig(req.providerConfig); + const overrides = readInworldOverrides(req.providerOverrides); const apiKey = config.apiKey || process.env.INWORLD_API_KEY; if (!apiKey) { throw new Error("Inworld API key missing"); @@ -207,11 +208,11 @@ export function buildInworldSpeechProvider(): SpeechProviderPlugin { text: req.text, apiKey, baseUrl: config.baseUrl, - voiceId: config.voiceId, - modelId: config.modelId, + voiceId: overrides.voiceId ?? config.voiceId, + modelId: overrides.modelId ?? config.modelId, audioEncoding: "PCM", sampleRateHertz: sampleRate, - temperature: config.temperature, + temperature: overrides.temperature ?? config.temperature, timeoutMs: req.timeoutMs, }); diff --git a/extensions/xai/speech-provider.test.ts b/extensions/xai/speech-provider.test.ts index 2d49969ffe3..7a4db54ad6a 100644 --- a/extensions/xai/speech-provider.test.ts +++ b/extensions/xai/speech-provider.test.ts @@ -68,4 +68,39 @@ describe("xai speech provider", () => { }), ); }); + + it("honors voice, language, and speed overrides for telephony output", async () => { + const provider = buildXaiSpeechProvider(); + const result = await provider.synthesizeTelephony?.({ + text: "hello", + cfg: {}, + providerConfig: { + apiKey: "xai-key", + baseUrl: "https://api.x.ai/v1", + voiceId: "eve", + language: "en", + speed: 1, + }, + providerOverrides: { + voice: "aura", + language: "es", + speed: 1.2, + }, + timeoutMs: 5_000, + }); + + expect(result).toEqual({ + audioBuffer: Buffer.from("audio-bytes"), + outputFormat: "pcm", + sampleRate: 24_000, + }); + expect(xaiTTSMock).toHaveBeenLastCalledWith( + expect.objectContaining({ + voiceId: "aura", + language: "es", + speed: 1.2, + responseFormat: "pcm", + }), + ); + }); }); diff --git a/extensions/xai/speech-provider.ts b/extensions/xai/speech-provider.ts index 9e5903007b2..142e8c3caea 100644 --- a/extensions/xai/speech-provider.ts +++ b/extensions/xai/speech-provider.ts @@ -230,6 +230,7 @@ export function buildXaiSpeechProvider(): SpeechProviderPlugin { }, synthesizeTelephony: async (req) => { const config = readXaiProviderConfig(req.providerConfig); + const overrides = readXaiOverrides(req.providerOverrides); const apiKey = config.apiKey || process.env.XAI_API_KEY; if (!apiKey) { throw new Error("xAI API key missing"); @@ -240,9 +241,9 @@ export function buildXaiSpeechProvider(): SpeechProviderPlugin { text: req.text, apiKey, baseUrl: config.baseUrl, - voiceId: config.voiceId, - language: config.language, - speed: config.speed, + voiceId: overrides.voiceId ?? config.voiceId, + language: overrides.language ?? config.language, + speed: overrides.speed ?? config.speed, responseFormat: outputFormat, timeoutMs: req.timeoutMs, });