fix(tts): honor telephony voice overrides

2026-05-06 13:10:43 +00:00 · 2026-05-03 22:49:46 -07:00
parent a224810a7f
commit 361737d1f1
11 changed files with 133 additions and 18 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -18,6 +18,7 @@ Docs: https://docs.openclaw.ai
 - Agents/verbose: use compact explain-mode tool summaries for `/verbose` and progress drafts by default, with `agents.defaults.toolProgressDetail: "raw"` and per-agent overrides for debugging raw command/detail output.
 - Agents/commands: add `/steer <message>` for queue-independent steering of the active current-session run without starting a new turn when the session is idle. (#76934)
 - Agents/subagents: preserve every grouped child result when direct completion fallback has to bypass the requester-agent announce turn. Thanks @vincentkoc.
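+- TTS/telephony: honor per-request provider overrides (voice and model, plus language, speed, temperature, and audio profile/speaker name where the provider supports them) in the Azure Speech, Google, Gradium, Inworld, and xAI telephony synthesis paths so Google Meet agent speech logs match the backend that actually produced the audio. Thanks @vincentkoc.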
 - Tools/BTW: add `/side` as a text and native slash-command alias for `/btw` side questions.
 - Doctor/config: `doctor --fix` now commits safe legacy migrations even when unrelated validation issues (e.g. a missing plugin) prevent full validation from passing, so `agents.defaults.llm` and other known-legacy keys are always cleaned up by `doctor --fix` regardless of other config problems. Fixes #76798. (#76800) Thanks @hclsys.
 - Docs: clarify that IRC uses raw TCP/TLS sockets outside operator-managed forward proxy routing, so direct IRC egress should be explicitly approved before enabling IRC. Thanks @jesse-merhi.
--- a/extensions/azure-speech/speech-provider.test.ts
+++ b/extensions/azure-speech/speech-provider.test.ts
@@ -176,6 +176,42 @@ describe("buildAzureSpeechProvider", () => {
    });
  });

+  it("honors voice and language overrides for telephony output", async () => {
+    const provider = buildAzureSpeechProvider();
+    const result = await provider.synthesizeTelephony?.({
+      text: "hello",
+      cfg: {} as never,
+      providerConfig: {
+        apiKey: "key",
+        region: "eastus",
+        voice: "en-US-JennyNeural",
+        lang: "en-US",
+      },
+      providerOverrides: {
+        voice: "en-US-AriaNeural",
+        lang: "es-US",
+      },
+      timeoutMs: 30_000,
+    });
+
+    expect(azureSpeechTTSMock).toHaveBeenCalledWith({
+      text: "hello",
+      apiKey: "key",
+      baseUrl: "https://eastus.tts.speech.microsoft.com",
+      endpoint: undefined,
+      region: "eastus",
+      voice: "en-US-AriaNeural",
+      lang: "es-US",
+      outputFormat: "raw-8khz-8bit-mono-mulaw",
+      timeoutMs: 30_000,
+    });
+    expect(result).toEqual({
+      audioBuffer: Buffer.from("audio-bytes"),
+      outputFormat: "raw-8khz-8bit-mono-mulaw",
+      sampleRate: 8_000,
+    });
+  });
+
  it("lists voices through config or explicit request auth", async () => {
    const provider = buildAzureSpeechProvider();
    const voices = await provider.listVoices?.({
--- a/extensions/azure-speech/speech-provider.ts
+++ b/extensions/azure-speech/speech-provider.ts
@@ -279,6 +279,7 @@ export function buildAzureSpeechProvider(): SpeechProviderPlugin {
    },
    synthesizeTelephony: async (req) => {
      const config = readAzureSpeechProviderConfig(req.providerConfig);
+      const overrides = readAzureSpeechOverrides(req.providerOverrides);
      const apiKey = resolveApiKey(config);
      if (!apiKey) {
        throw new Error("Azure Speech API key missing");
@@ -290,8 +291,8 @@ export function buildAzureSpeechProvider(): SpeechProviderPlugin {
        baseUrl: config.baseUrl,
        endpoint: config.endpoint,
        region: config.region,
-        voice: config.voice,
-        lang: config.lang,
+        voice: overrides.voice ?? config.voice,
+        lang: overrides.lang ?? config.lang,
        outputFormat: DEFAULT_AZURE_SPEECH_TELEPHONY_FORMAT,
        timeoutMs: resolveTimeoutMs(config, req.timeoutMs),
      });
--- a/extensions/google/speech-provider.test.ts
+++ b/extensions/google/speech-provider.test.ts
@@ -397,11 +397,44 @@ describe("Google speech provider", () => {
      cfg: {},
      providerConfig: {
        apiKey: "google-test-key",
+        model: "google/gemini-3.1-flash-tts",
        voice: "Kore",
+        audioProfile: "Speak calmly.",
+        speakerName: "Default speaker",
+      },
+      providerOverrides: {
+        model: "google/gemini-3.1-pro-tts",
+        voiceName: "Puck",
+        audioProfile: "Speak brightly.",
+        speakerName: "Override speaker",
      },
      timeoutMs: 5_000,
    });

+    expect(postJsonRequestMock).toHaveBeenCalledWith(
+      expect.objectContaining({
+        url: "https://generativelanguage.googleapis.com/v1beta/models/gemini-3.1-pro-tts:generateContent",
+        body: expect.objectContaining({
+          contents: [
+            {
+              role: "user",
+              parts: [
+                { text: "Speak brightly.\n\nSpeaker name: Override speaker\n\nPhone call audio." },
+              ],
+            },
+          ],
+          generationConfig: expect.objectContaining({
+            speechConfig: {
+              voiceConfig: {
+                prebuiltVoiceConfig: {
+                  voiceName: "Puck",
+                },
+              },
+            },
+          }),
+        }),
+      }),
+    );
    expect(result).toEqual({
      audioBuffer: pcm,
      outputFormat: "pcm",
--- a/extensions/google/speech-provider.ts
+++ b/extensions/google/speech-provider.ts
@@ -640,6 +640,7 @@ export function buildGoogleSpeechProvider(): SpeechProviderPlugin {
    },
    synthesizeTelephony: async (req) => {
      const config = readGoogleTtsProviderConfig(req.providerConfig);
+      const overrides = readGoogleTtsOverrides(req.providerOverrides);
      const apiKey = resolveGoogleTtsApiKey({
        cfg: req.cfg,
        providerConfig: req.providerConfig,
@@ -654,10 +655,10 @@ export function buildGoogleSpeechProvider(): SpeechProviderPlugin {
        request: sanitizeConfiguredModelProviderRequest(
          req.cfg?.models?.providers?.google?.request,
        ),
-        model: config.model,
-        voiceName: config.voiceName,
-        audioProfile: config.audioProfile,
-        speakerName: config.speakerName,
+        model: normalizeGoogleTtsModel(overrides.model ?? config.model),
+        voiceName: normalizeGoogleTtsVoiceName(overrides.voiceName ?? config.voiceName),
+        audioProfile: overrides.audioProfile ?? config.audioProfile,
+        speakerName: overrides.speakerName ?? config.speakerName,
        timeoutMs: req.timeoutMs,
      });
      return {
--- a/extensions/gradium/speech-provider.test.ts
+++ b/extensions/gradium/speech-provider.test.ts
@@ -98,12 +98,16 @@ describe("gradium speech provider", () => {
    const result = await provider.synthesizeTelephony!({
      text: "Telephony test",
      cfg: {} as never,
-      providerConfig: { apiKey: "gsk_test123" },
+      providerConfig: { apiKey: "gsk_test123", voiceId: "default-voice" },
+      providerOverrides: { voiceId: "override-voice" },
      timeoutMs: 30_000,
    });

    const [, init] = fetchMock.mock.calls[0] as [string, RequestInit];
-    expect(JSON.parse(init.body as string).output_format).toBe("ulaw_8000");
+    expect(JSON.parse(init.body as string)).toMatchObject({
+      voice_id: "override-voice",
+      output_format: "ulaw_8000",
+    });
    expect(result.outputFormat).toBe("ulaw_8000");
    expect(result.sampleRate).toBe(8_000);
    expect(result.audioBuffer).toEqual(audioData);
--- a/extensions/gradium/speech-provider.ts
+++ b/extensions/gradium/speech-provider.ts
@@ -96,6 +96,7 @@ export function buildGradiumSpeechProvider(): SpeechProviderPlugin {
    },
    synthesizeTelephony: async (req) => {
      const config = readGradiumProviderConfig(req.providerConfig);
+      const overrides = req.providerOverrides ?? {};
      const apiKey = config.apiKey || process.env.GRADIUM_API_KEY;
      if (!apiKey) {
        throw new Error("Gradium API key missing");
@@ -106,7 +107,7 @@ export function buildGradiumSpeechProvider(): SpeechProviderPlugin {
        text: req.text,
        apiKey,
        baseUrl: config.baseUrl,
-        voiceId: config.voiceId,
+        voiceId: trimToUndefined(overrides.voiceId) ?? config.voiceId,
        outputFormat,
        timeoutMs: req.timeoutMs,
      });
--- a/extensions/inworld/speech-provider.test.ts
+++ b/extensions/inworld/speech-provider.test.ts
@@ -190,6 +190,7 @@ describe("buildInworldSpeechProvider", () => {
      text: "Hello",
      cfg: {} as never,
      providerConfig: { apiKey: "key", voiceId: "Sarah", modelId: "inworld-tts-1.5-max" },
+      providerOverrides: { voice: "Ashley", model: "inworld-tts-1.5-mini", temperature: 0.6 },
      timeoutMs: 30_000,
    });

@@ -197,11 +198,11 @@ describe("buildInworldSpeechProvider", () => {
      text: "Hello",
      apiKey: "key",
      baseUrl: "https://api.inworld.ai",
-      voiceId: "Sarah",
-      modelId: "inworld-tts-1.5-max",
+      voiceId: "Ashley",
+      modelId: "inworld-tts-1.5-mini",
      audioEncoding: "PCM",
      sampleRateHertz: 22_050,
-      temperature: undefined,
+      temperature: 0.6,
      timeoutMs: 30_000,
    });
    expect(result).toEqual({
--- a/extensions/inworld/speech-provider.ts
+++ b/extensions/inworld/speech-provider.ts
@@ -197,6 +197,7 @@ export function buildInworldSpeechProvider(): SpeechProviderPlugin {
    },
    synthesizeTelephony: async (req) => {
      const config = readInworldProviderConfig(req.providerConfig);
+      const overrides = readInworldOverrides(req.providerOverrides);
      const apiKey = config.apiKey || process.env.INWORLD_API_KEY;
      if (!apiKey) {
        throw new Error("Inworld API key missing");
@@ -207,11 +208,11 @@ export function buildInworldSpeechProvider(): SpeechProviderPlugin {
        text: req.text,
        apiKey,
        baseUrl: config.baseUrl,
-        voiceId: config.voiceId,
-        modelId: config.modelId,
+        voiceId: overrides.voiceId ?? config.voiceId,
+        modelId: overrides.modelId ?? config.modelId,
        audioEncoding: "PCM",
        sampleRateHertz: sampleRate,
-        temperature: config.temperature,
+        temperature: overrides.temperature ?? config.temperature,
        timeoutMs: req.timeoutMs,
      });

--- a/extensions/xai/speech-provider.test.ts
+++ b/extensions/xai/speech-provider.test.ts
@@ -68,4 +68,39 @@ describe("xai speech provider", () => {
      }),
    );
  });
+
+  it("honors voice, language, and speed overrides for telephony output", async () => {
+    const provider = buildXaiSpeechProvider();
+    const result = await provider.synthesizeTelephony?.({
+      text: "hello",
+      cfg: {},
+      providerConfig: {
+        apiKey: "xai-key",
+        baseUrl: "https://api.x.ai/v1",
+        voiceId: "eve",
+        language: "en",
+        speed: 1,
+      },
+      providerOverrides: {
+        voice: "aura",
+        language: "es",
+        speed: 1.2,
+      },
+      timeoutMs: 5_000,
+    });
+
+    expect(result).toEqual({
+      audioBuffer: Buffer.from("audio-bytes"),
+      outputFormat: "pcm",
+      sampleRate: 24_000,
+    });
+    expect(xaiTTSMock).toHaveBeenLastCalledWith(
+      expect.objectContaining({
+        voiceId: "aura",
+        language: "es",
+        speed: 1.2,
+        responseFormat: "pcm",
+      }),
+    );
+  });
 });
--- a/extensions/xai/speech-provider.ts
+++ b/extensions/xai/speech-provider.ts
@@ -230,6 +230,7 @@ export function buildXaiSpeechProvider(): SpeechProviderPlugin {
    },
    synthesizeTelephony: async (req) => {
      const config = readXaiProviderConfig(req.providerConfig);
+      const overrides = readXaiOverrides(req.providerOverrides);
      const apiKey = config.apiKey || process.env.XAI_API_KEY;
      if (!apiKey) {
        throw new Error("xAI API key missing");
@@ -240,9 +241,9 @@ export function buildXaiSpeechProvider(): SpeechProviderPlugin {
        text: req.text,
        apiKey,
        baseUrl: config.baseUrl,
-        voiceId: config.voiceId,
-        language: config.language,
-        speed: config.speed,
+        voiceId: overrides.voiceId ?? config.voiceId,
+        language: overrides.language ?? config.language,
+        speed: overrides.speed ?? config.speed,
        responseFormat: outputFormat,
        timeoutMs: req.timeoutMs,
      });