feat(google): support Gemini TTS style profile

This commit is contained in:
Peter Steinberger
2026-04-25 06:11:15 +01:00
parent 3f63ba8fd8
commit 8acc92c881
5 changed files with 88 additions and 4 deletions

View File

@@ -62,6 +62,7 @@ Docs: https://docs.openclaw.ai
- Plugins/Google Meet: add `googlemeet doctor` and a `recover_current_tab`/`recover-tab` flow so agents can inspect an already-open Meet tab and report the blocker without opening another window. Thanks @steipete.
- Plugins/Bonjour: move LAN Gateway discovery advertising into a default-enabled bundled plugin with its own `@homebridge/ciao` dependency, so users can disable Bonjour without cutting wide-area discovery. Thanks @vincentkoc.
- Providers/Google: add a Gemini Live realtime voice provider for backend Voice Call and Google Meet audio bridges, with bidirectional audio and function-call support. Thanks @steipete.
- Providers/Google: let Gemini TTS prepend configured `audioProfile` and `speakerName` prompt text for reusable speech style control. Thanks @tdack.
- Plugins/Google Meet: let realtime Meet sessions consult the full OpenClaw agent for deeper answers while staying in the live voice loop. Thanks @steipete.
- Gateway/VoiceClaw: add a realtime brain WebSocket endpoint backed by Gemini Live, with owner-auth gating and async OpenClaw tool handoff. (#70938) Thanks @yagudaev.
- Providers/DeepSeek: add DeepSeek V4 Flash and V4 Pro to the bundled catalog and make V4 Flash the onboarding default. Thanks @lsdsjy.

View File

@@ -267,6 +267,7 @@ To use Google as the default TTS provider:
google: {
model: "gemini-3.1-flash-tts-preview",
voiceName: "Kore",
audioProfile: "Speak professionally with a calm tone.",
},
},
},
@@ -274,9 +275,14 @@ To use Google as the default TTS provider:
}
```
Gemini API TTS accepts expressive square-bracket audio tags in the text, such as
`[whispers]` or `[laughs]`. To keep tags out of the visible chat reply while
sending them to TTS, put them inside a `[[tts:text]]...[[/tts:text]]` block:
Gemini API TTS uses natural-language prompting for style control. Set
`audioProfile` to prepend a reusable style prompt before the spoken text. Set
`speakerName` when your prompt text refers to a named speaker.
Gemini API TTS also accepts expressive square-bracket audio tags in the text,
such as `[whispers]` or `[laughs]`. To keep tags out of the visible chat reply
while sending them to TTS, put them inside a `[[tts:text]]...[[/tts:text]]`
block:
```text
Here is the clean reply text.

View File

@@ -379,6 +379,8 @@ Then run:
- `providers.minimax.pitch`: integer pitch shift `-12..12` (default 0). Fractional values are truncated before calling MiniMax T2A because the API rejects non-integer pitch values.
- `providers.google.model`: Gemini TTS model (default `gemini-3.1-flash-tts-preview`).
- `providers.google.voiceName`: Gemini prebuilt voice name (default `Kore`; `voice` is also accepted).
- `providers.google.audioProfile`: natural-language style prompt prepended to the spoken text.
- `providers.google.speakerName`: optional speaker label prepended to the spoken text when your TTS prompt refers to a named speaker.
- `providers.google.baseUrl`: override the Gemini API base URL. Only `https://generativelanguage.googleapis.com` is accepted.
- If `messages.tts.providers.google.apiKey` is omitted, TTS can reuse `models.providers.google.apiKey` before env fallback.
- `providers.gradium.baseUrl`: override Gradium API base URL (default `https://api.gradium.ai`).

View File

@@ -166,6 +166,39 @@ describe("Google speech provider", () => {
});
});
// Verifies that audioProfile and speakerName from providerConfig are joined
// with the spoken text (blank-line separated, in that order) inside the
// single text part of the Gemini generateContent request body.
it("prepends configured Gemini TTS profile text", async () => {
const fetchMock = installGoogleTtsFetchMock();
const provider = buildGoogleSpeechProvider();
await provider.synthesize({
text: "Status update starts now.",
cfg: {},
providerConfig: {
apiKey: "google-test-key",
audioProfile: "Speak professionally with a calm executive tone.",
speakerName: "Alex",
},
target: "audio-file",
timeoutMs: 10_000,
});
// Inspect the first fetch call's request init to check the JSON body shape.
const [, init] = fetchMock.mock.calls[0];
expect(JSON.parse(String(init.body))).toMatchObject({
contents: [
{
parts: [
{
text:
"Speak professionally with a calm executive tone.\n\n" +
"Speaker name: Alex\n\n" +
"Status update starts now.",
},
],
},
],
});
});
it("resolves provider config and directive overrides", () => {
const provider = buildGoogleSpeechProvider();
@@ -178,6 +211,8 @@ describe("Google speech provider", () => {
apiKey: "configured-key",
model: "google/gemini-3.1-flash-tts-preview",
voice: "Leda",
audioProfile: "Speak warmly.",
speakerName: "Narrator",
},
},
},
@@ -185,8 +220,10 @@ describe("Google speech provider", () => {
}),
).toEqual({
apiKey: "configured-key",
audioProfile: "Speak warmly.",
baseUrl: undefined,
model: "gemini-3.1-flash-tts-preview",
speakerName: "Narrator",
voiceName: "Leda",
});

View File

@@ -55,11 +55,15 @@ type GoogleTtsProviderConfig = {
baseUrl?: string;
model: string;
voiceName: string;
audioProfile?: string;
speakerName?: string;
};
type GoogleTtsProviderOverrides = {
model?: string;
voiceName?: string;
audioProfile?: string;
speakerName?: string;
};
type Maybe<T> = T | undefined;
@@ -148,6 +152,8 @@ function normalizeGoogleTtsProviderConfig(
baseUrl: trimToUndefined(raw?.baseUrl),
model: normalizeGoogleTtsModel(raw?.model),
voiceName: normalizeGoogleTtsVoiceName(raw?.voiceName ?? raw?.voice),
audioProfile: trimToUndefined(raw?.audioProfile),
speakerName: trimToUndefined(raw?.speakerName),
};
}
@@ -160,6 +166,8 @@ function readGoogleTtsProviderConfig(config: SpeechProviderConfig): GoogleTtsPro
voiceName: normalizeGoogleTtsVoiceName(
config.voiceName ?? config.voice ?? normalized.voiceName,
),
audioProfile: trimToUndefined(config.audioProfile) ?? normalized.audioProfile,
speakerName: trimToUndefined(config.speakerName) ?? normalized.speakerName,
};
}
@@ -172,9 +180,25 @@ function readGoogleTtsOverrides(
return {
model: normalizeOptionalString(overrides.model),
voiceName: normalizeOptionalString(overrides.voiceName ?? overrides.voice),
audioProfile: normalizeOptionalString(overrides.audioProfile),
speakerName: normalizeOptionalString(overrides.speakerName),
};
}
/**
 * Builds the final text sent to Gemini TTS: an optional style prompt
 * (`audioProfile`), an optional `Speaker name: …` label, then the spoken
 * text, joined with blank lines so each prompt section stays distinct.
 */
function composeGoogleTtsText(params: {
  text: string;
  audioProfile?: string;
  speakerName?: string;
}): string {
  // Trim once and reuse: the original interpolated the untrimmed
  // params.speakerName, leaking surrounding whitespace into the prompt.
  const speakerName = trimToUndefined(params.speakerName);
  return [
    trimToUndefined(params.audioProfile),
    speakerName ? `Speaker name: ${speakerName}` : undefined,
    params.text,
  ]
    .filter((part): part is string => part !== undefined)
    .join("\n\n");
}
function parseDirectiveToken(ctx: SpeechDirectiveTokenParseContext): {
handled: boolean;
overrides?: SpeechProviderOverrides;
@@ -242,6 +266,8 @@ async function synthesizeGoogleTtsPcm(params: {
baseUrl?: string;
model: string;
voiceName: string;
audioProfile?: string;
speakerName?: string;
timeoutMs: number;
}): Promise<Buffer> {
const { baseUrl, allowPrivateNetwork, headers, dispatcherPolicy } =
@@ -259,7 +285,15 @@ async function synthesizeGoogleTtsPcm(params: {
contents: [
{
role: "user",
parts: [{ text: params.text }],
parts: [
{
text: composeGoogleTtsText({
text: params.text,
audioProfile: params.audioProfile,
speakerName: params.speakerName,
}),
},
],
},
],
generationConfig: {
@@ -347,6 +381,8 @@ export function buildGoogleSpeechProvider(): SpeechProviderPlugin {
baseUrl: resolveGoogleTtsBaseUrl({ cfg: req.cfg, providerConfig: config }),
model: normalizeGoogleTtsModel(overrides.model ?? config.model),
voiceName: normalizeGoogleTtsVoiceName(overrides.voiceName ?? config.voiceName),
audioProfile: overrides.audioProfile ?? config.audioProfile,
speakerName: overrides.speakerName ?? config.speakerName,
timeoutMs: req.timeoutMs,
});
return {
@@ -371,6 +407,8 @@ export function buildGoogleSpeechProvider(): SpeechProviderPlugin {
baseUrl: resolveGoogleTtsBaseUrl({ cfg: req.cfg, providerConfig: config }),
model: config.model,
voiceName: config.voiceName,
audioProfile: config.audioProfile,
speakerName: config.speakerName,
timeoutMs: req.timeoutMs,
});
return {