From 361737d1f1cb6a316cefdfb6944bab8ca957ad65 Mon Sep 17 00:00:00 2001
From: Vincent Koc <vincentkoc@ieee.org>
Date: Sun, 3 May 2026 22:49:46 -0700
Subject: [PATCH] fix(tts): honor telephony voice overrides

---
 CHANGELOG.md                                  |  1 +
 .../azure-speech/speech-provider.test.ts      | 36 +++++++++++++++++++
 extensions/azure-speech/speech-provider.ts    |  5 +--
 extensions/google/speech-provider.test.ts     | 33 +++++++++++++++++
 extensions/google/speech-provider.ts          |  9 ++---
 extensions/gradium/speech-provider.test.ts    |  8 +++--
 extensions/gradium/speech-provider.ts         |  3 +-
 extensions/inworld/speech-provider.test.ts    |  7 ++--
 extensions/inworld/speech-provider.ts         |  7 ++--
 extensions/xai/speech-provider.test.ts        | 35 ++++++++++++++++++
 extensions/xai/speech-provider.ts             |  7 ++--
 11 files changed, 133 insertions(+), 18 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 94d1717a994..db301784881 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -18,6 +18,7 @@ Docs: https://docs.openclaw.ai
 - Agents/verbose: use compact explain-mode tool summaries for `/verbose` and progress drafts by default, with `agents.defaults.toolProgressDetail: "raw"` and per-agent overrides for debugging raw command/detail output.
 - Agents/commands: add `/steer <message>` for queue-independent steering of the active current-session run without starting a new turn when the session is idle. (#76934)
 - Agents/subagents: preserve every grouped child result when direct completion fallback has to bypass the requester-agent announce turn. Thanks @vincentkoc.
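+- TTS/telephony: honor provider voice/model overrides, plus language, speed, temperature, and audio-profile/speaker-name overrides where the provider supports them, in the Azure Speech, Google, Gradium, Inworld, and xAI telephony synthesis providers so Google Meet agent speech logs match the backend that actually produced the audio. Thanks @vincentkoc.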
 - Tools/BTW: add `/side` as a text and native slash-command alias for `/btw` side questions.
 - Doctor/config: `doctor --fix` now commits safe legacy migrations even when unrelated validation issues (e.g. a missing plugin) prevent full validation from passing, so `agents.defaults.llm` and other known-legacy keys are always cleaned up by `doctor --fix` regardless of other config problems. Fixes #76798. (#76800) Thanks @hclsys.
 - Docs: clarify that IRC uses raw TCP/TLS sockets outside operator-managed forward proxy routing, so direct IRC egress should be explicitly approved before enabling IRC. Thanks @jesse-merhi.
diff --git a/extensions/azure-speech/speech-provider.test.ts b/extensions/azure-speech/speech-provider.test.ts
index 40d32ec32e5..c34fd652257 100644
--- a/extensions/azure-speech/speech-provider.test.ts
+++ b/extensions/azure-speech/speech-provider.test.ts
@@ -176,6 +176,42 @@ describe("buildAzureSpeechProvider", () => {
     });
   });
 
+  it("honors voice and language overrides for telephony output", async () => {
+    const provider = buildAzureSpeechProvider();
+    const result = await provider.synthesizeTelephony?.({
+      text: "hello",
+      cfg: {} as never,
+      providerConfig: {
+        apiKey: "key",
+        region: "eastus",
+        voice: "en-US-JennyNeural",
+        lang: "en-US",
+      },
+      providerOverrides: {
+        voice: "en-US-AriaNeural",
+        lang: "es-US",
+      },
+      timeoutMs: 30_000,
+    });
+
+    expect(azureSpeechTTSMock).toHaveBeenCalledWith({
+      text: "hello",
+      apiKey: "key",
+      baseUrl: "https://eastus.tts.speech.microsoft.com",
+      endpoint: undefined,
+      region: "eastus",
+      voice: "en-US-AriaNeural",
+      lang: "es-US",
+      outputFormat: "raw-8khz-8bit-mono-mulaw",
+      timeoutMs: 30_000,
+    });
+    expect(result).toEqual({
+      audioBuffer: Buffer.from("audio-bytes"),
+      outputFormat: "raw-8khz-8bit-mono-mulaw",
+      sampleRate: 8_000,
+    });
+  });
+
   it("lists voices through config or explicit request auth", async () => {
     const provider = buildAzureSpeechProvider();
     const voices = await provider.listVoices?.({
diff --git a/extensions/azure-speech/speech-provider.ts b/extensions/azure-speech/speech-provider.ts
index 22fcc637ea5..f88dbc8ddd4 100644
--- a/extensions/azure-speech/speech-provider.ts
+++ b/extensions/azure-speech/speech-provider.ts
@@ -279,6 +279,7 @@ export function buildAzureSpeechProvider(): SpeechProviderPlugin {
     },
     synthesizeTelephony: async (req) => {
       const config = readAzureSpeechProviderConfig(req.providerConfig);
+      const overrides = readAzureSpeechOverrides(req.providerOverrides);
       const apiKey = resolveApiKey(config);
       if (!apiKey) {
         throw new Error("Azure Speech API key missing");
@@ -290,8 +291,8 @@ export function buildAzureSpeechProvider(): SpeechProviderPlugin {
         baseUrl: config.baseUrl,
         endpoint: config.endpoint,
         region: config.region,
-        voice: config.voice,
-        lang: config.lang,
+        voice: overrides.voice ?? config.voice,
+        lang: overrides.lang ?? config.lang,
         outputFormat: DEFAULT_AZURE_SPEECH_TELEPHONY_FORMAT,
         timeoutMs: resolveTimeoutMs(config, req.timeoutMs),
       });
diff --git a/extensions/google/speech-provider.test.ts b/extensions/google/speech-provider.test.ts
index b8834a58f0a..f1da219f99d 100644
--- a/extensions/google/speech-provider.test.ts
+++ b/extensions/google/speech-provider.test.ts
@@ -397,11 +397,44 @@ describe("Google speech provider", () => {
       cfg: {},
       providerConfig: {
         apiKey: "google-test-key",
+        model: "google/gemini-3.1-flash-tts",
         voice: "Kore",
+        audioProfile: "Speak calmly.",
+        speakerName: "Default speaker",
+      },
+      providerOverrides: {
+        model: "google/gemini-3.1-pro-tts",
+        voiceName: "Puck",
+        audioProfile: "Speak brightly.",
+        speakerName: "Override speaker",
       },
       timeoutMs: 5_000,
     });
 
+    expect(postJsonRequestMock).toHaveBeenCalledWith(
+      expect.objectContaining({
+        url: "https://generativelanguage.googleapis.com/v1beta/models/gemini-3.1-pro-tts:generateContent",
+        body: expect.objectContaining({
+          contents: [
+            {
+              role: "user",
+              parts: [
+                { text: "Speak brightly.\n\nSpeaker name: Override speaker\n\nPhone call audio." },
+              ],
+            },
+          ],
+          generationConfig: expect.objectContaining({
+            speechConfig: {
+              voiceConfig: {
+                prebuiltVoiceConfig: {
+                  voiceName: "Puck",
+                },
+              },
+            },
+          }),
+        }),
+      }),
+    );
     expect(result).toEqual({
       audioBuffer: pcm,
       outputFormat: "pcm",
diff --git a/extensions/google/speech-provider.ts b/extensions/google/speech-provider.ts
index 951a4001cfa..47358150cf2 100644
--- a/extensions/google/speech-provider.ts
+++ b/extensions/google/speech-provider.ts
@@ -640,6 +640,7 @@ export function buildGoogleSpeechProvider(): SpeechProviderPlugin {
     },
     synthesizeTelephony: async (req) => {
       const config = readGoogleTtsProviderConfig(req.providerConfig);
+      const overrides = readGoogleTtsOverrides(req.providerOverrides);
       const apiKey = resolveGoogleTtsApiKey({
         cfg: req.cfg,
         providerConfig: req.providerConfig,
@@ -654,10 +655,10 @@ export function buildGoogleSpeechProvider(): SpeechProviderPlugin {
         request: sanitizeConfiguredModelProviderRequest(
           req.cfg?.models?.providers?.google?.request,
         ),
-        model: config.model,
-        voiceName: config.voiceName,
-        audioProfile: config.audioProfile,
-        speakerName: config.speakerName,
+        model: normalizeGoogleTtsModel(overrides.model ?? config.model),
+        voiceName: normalizeGoogleTtsVoiceName(overrides.voiceName ?? config.voiceName),
+        audioProfile: overrides.audioProfile ?? config.audioProfile,
+        speakerName: overrides.speakerName ?? config.speakerName,
         timeoutMs: req.timeoutMs,
       });
       return {
diff --git a/extensions/gradium/speech-provider.test.ts b/extensions/gradium/speech-provider.test.ts
index bd439770b65..e98c4beb922 100644
--- a/extensions/gradium/speech-provider.test.ts
+++ b/extensions/gradium/speech-provider.test.ts
@@ -98,12 +98,16 @@ describe("gradium speech provider", () => {
     const result = await provider.synthesizeTelephony!({
       text: "Telephony test",
       cfg: {} as never,
-      providerConfig: { apiKey: "gsk_test123" },
+      providerConfig: { apiKey: "gsk_test123", voiceId: "default-voice" },
+      providerOverrides: { voiceId: "override-voice" },
       timeoutMs: 30_000,
     });
 
     const [, init] = fetchMock.mock.calls[0] as [string, RequestInit];
-    expect(JSON.parse(init.body as string).output_format).toBe("ulaw_8000");
+    expect(JSON.parse(init.body as string)).toMatchObject({
+      voice_id: "override-voice",
+      output_format: "ulaw_8000",
+    });
     expect(result.outputFormat).toBe("ulaw_8000");
     expect(result.sampleRate).toBe(8_000);
     expect(result.audioBuffer).toEqual(audioData);
diff --git a/extensions/gradium/speech-provider.ts b/extensions/gradium/speech-provider.ts
index 877b5dbdaef..aa9472b572b 100644
--- a/extensions/gradium/speech-provider.ts
+++ b/extensions/gradium/speech-provider.ts
@@ -96,6 +96,7 @@ export function buildGradiumSpeechProvider(): SpeechProviderPlugin {
     },
     synthesizeTelephony: async (req) => {
       const config = readGradiumProviderConfig(req.providerConfig);
+      const overrides = req.providerOverrides ?? {};
       const apiKey = config.apiKey || process.env.GRADIUM_API_KEY;
       if (!apiKey) {
         throw new Error("Gradium API key missing");
@@ -106,7 +107,7 @@ export function buildGradiumSpeechProvider(): SpeechProviderPlugin {
         text: req.text,
         apiKey,
         baseUrl: config.baseUrl,
-        voiceId: config.voiceId,
+        voiceId: trimToUndefined(overrides.voiceId) ?? config.voiceId,
         outputFormat,
         timeoutMs: req.timeoutMs,
       });
diff --git a/extensions/inworld/speech-provider.test.ts b/extensions/inworld/speech-provider.test.ts
index 2bbd401b5a6..5676a905d88 100644
--- a/extensions/inworld/speech-provider.test.ts
+++ b/extensions/inworld/speech-provider.test.ts
@@ -190,6 +190,7 @@ describe("buildInworldSpeechProvider", () => {
       text: "Hello",
       cfg: {} as never,
       providerConfig: { apiKey: "key", voiceId: "Sarah", modelId: "inworld-tts-1.5-max" },
+      providerOverrides: { voice: "Ashley", model: "inworld-tts-1.5-mini", temperature: 0.6 },
       timeoutMs: 30_000,
     });
 
@@ -197,11 +198,11 @@ describe("buildInworldSpeechProvider", () => {
       text: "Hello",
       apiKey: "key",
       baseUrl: "https://api.inworld.ai",
-      voiceId: "Sarah",
-      modelId: "inworld-tts-1.5-max",
+      voiceId: "Ashley",
+      modelId: "inworld-tts-1.5-mini",
       audioEncoding: "PCM",
       sampleRateHertz: 22_050,
-      temperature: undefined,
+      temperature: 0.6,
       timeoutMs: 30_000,
     });
     expect(result).toEqual({
diff --git a/extensions/inworld/speech-provider.ts b/extensions/inworld/speech-provider.ts
index f9c28a91e46..805145d7dda 100644
--- a/extensions/inworld/speech-provider.ts
+++ b/extensions/inworld/speech-provider.ts
@@ -197,6 +197,7 @@ export function buildInworldSpeechProvider(): SpeechProviderPlugin {
     },
     synthesizeTelephony: async (req) => {
       const config = readInworldProviderConfig(req.providerConfig);
+      const overrides = readInworldOverrides(req.providerOverrides);
       const apiKey = config.apiKey || process.env.INWORLD_API_KEY;
       if (!apiKey) {
         throw new Error("Inworld API key missing");
@@ -207,11 +208,11 @@ export function buildInworldSpeechProvider(): SpeechProviderPlugin {
         text: req.text,
         apiKey,
         baseUrl: config.baseUrl,
-        voiceId: config.voiceId,
-        modelId: config.modelId,
+        voiceId: overrides.voiceId ?? config.voiceId,
+        modelId: overrides.modelId ?? config.modelId,
         audioEncoding: "PCM",
         sampleRateHertz: sampleRate,
-        temperature: config.temperature,
+        temperature: overrides.temperature ?? config.temperature,
         timeoutMs: req.timeoutMs,
       });
 
diff --git a/extensions/xai/speech-provider.test.ts b/extensions/xai/speech-provider.test.ts
index 2d49969ffe3..7a4db54ad6a 100644
--- a/extensions/xai/speech-provider.test.ts
+++ b/extensions/xai/speech-provider.test.ts
@@ -68,4 +68,39 @@ describe("xai speech provider", () => {
       }),
     );
   });
+
+  it("honors voice, language, and speed overrides for telephony output", async () => {
+    const provider = buildXaiSpeechProvider();
+    const result = await provider.synthesizeTelephony?.({
+      text: "hello",
+      cfg: {},
+      providerConfig: {
+        apiKey: "xai-key",
+        baseUrl: "https://api.x.ai/v1",
+        voiceId: "eve",
+        language: "en",
+        speed: 1,
+      },
+      providerOverrides: {
+        voice: "aura",
+        language: "es",
+        speed: 1.2,
+      },
+      timeoutMs: 5_000,
+    });
+
+    expect(result).toEqual({
+      audioBuffer: Buffer.from("audio-bytes"),
+      outputFormat: "pcm",
+      sampleRate: 24_000,
+    });
+    expect(xaiTTSMock).toHaveBeenLastCalledWith(
+      expect.objectContaining({
+        voiceId: "aura",
+        language: "es",
+        speed: 1.2,
+        responseFormat: "pcm",
+      }),
+    );
+  });
 });
diff --git a/extensions/xai/speech-provider.ts b/extensions/xai/speech-provider.ts
index 9e5903007b2..142e8c3caea 100644
--- a/extensions/xai/speech-provider.ts
+++ b/extensions/xai/speech-provider.ts
@@ -230,6 +230,7 @@ export function buildXaiSpeechProvider(): SpeechProviderPlugin {
     },
     synthesizeTelephony: async (req) => {
       const config = readXaiProviderConfig(req.providerConfig);
+      const overrides = readXaiOverrides(req.providerOverrides);
       const apiKey = config.apiKey || process.env.XAI_API_KEY;
       if (!apiKey) {
         throw new Error("xAI API key missing");
@@ -240,9 +241,9 @@ export function buildXaiSpeechProvider(): SpeechProviderPlugin {
         text: req.text,
         apiKey,
         baseUrl: config.baseUrl,
-        voiceId: config.voiceId,
-        language: config.language,
-        speed: config.speed,
+        voiceId: overrides.voiceId ?? config.voiceId,
+        language: overrides.language ?? config.language,
+        speed: overrides.speed ?? config.speed,
         responseFormat: outputFormat,
         timeoutMs: req.timeoutMs,
       });