diff --git a/CHANGELOG.md b/CHANGELOG.md index 3a89bfea1c8..097034b461f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -62,6 +62,7 @@ Docs: https://docs.openclaw.ai - Plugins/Google Meet: add `googlemeet doctor` and a `recover_current_tab`/`recover-tab` flow so agents can inspect an already-open Meet tab and report the blocker without opening another window. Thanks @steipete. - Plugins/Bonjour: move LAN Gateway discovery advertising into a default-enabled bundled plugin with its own `@homebridge/ciao` dependency, so users can disable Bonjour without cutting wide-area discovery. Thanks @vincentkoc. - Providers/Google: add a Gemini Live realtime voice provider for backend Voice Call and Google Meet audio bridges, with bidirectional audio and function-call support. Thanks @steipete. +- Providers/Google: let Gemini TTS prepend configured `audioProfile` and `speakerName` prompt text for reusable speech style control. Thanks @tdack. - Plugins/Google Meet: let realtime Meet sessions consult the full OpenClaw agent for deeper answers while staying in the live voice loop. Thanks @steipete. - Gateway/VoiceClaw: add a realtime brain WebSocket endpoint backed by Gemini Live, with owner-auth gating and async OpenClaw tool handoff. (#70938) Thanks @yagudaev. - Providers/DeepSeek: add DeepSeek V4 Flash and V4 Pro to the bundled catalog and make V4 Flash the onboarding default. Thanks @lsdsjy. diff --git a/docs/providers/google.md b/docs/providers/google.md index ae8e2625a59..661cc7b7f2c 100644 --- a/docs/providers/google.md +++ b/docs/providers/google.md @@ -267,6 +267,7 @@ To use Google as the default TTS provider: google: { model: "gemini-3.1-flash-tts-preview", voiceName: "Kore", + audioProfile: "Speak professionally with a calm tone.", }, }, }, @@ -274,9 +275,14 @@ To use Google as the default TTS provider: } ``` -Gemini API TTS accepts expressive square-bracket audio tags in the text, such as -`[whispers]` or `[laughs]`. 
To keep tags out of the visible chat reply while -sending them to TTS, put them inside a `[[tts:text]]...[[/tts:text]]` block: +Gemini API TTS uses natural-language prompting for style control. Set +`audioProfile` to prepend a reusable style prompt before the spoken text. Set +`speakerName` when your prompt text refers to a named speaker. + +Gemini API TTS also accepts expressive square-bracket audio tags in the text, +such as `[whispers]` or `[laughs]`. To keep tags out of the visible chat reply +while sending them to TTS, put them inside a `[[tts:text]]...[[/tts:text]]` +block: ```text Here is the clean reply text. diff --git a/docs/tools/tts.md b/docs/tools/tts.md index 30a7104b3c3..16780a098f0 100644 --- a/docs/tools/tts.md +++ b/docs/tools/tts.md @@ -379,6 +379,8 @@ Then run: - `providers.minimax.pitch`: integer pitch shift `-12..12` (default 0). Fractional values are truncated before calling MiniMax T2A because the API rejects non-integer pitch values. - `providers.google.model`: Gemini TTS model (default `gemini-3.1-flash-tts-preview`). - `providers.google.voiceName`: Gemini prebuilt voice name (default `Kore`; `voice` is also accepted). +- `providers.google.audioProfile`: natural-language style prompt prepended to the spoken text. +- `providers.google.speakerName`: optional speaker label, sent as a `Speaker name: <name>` line before the spoken text; set it when your style prompt refers to a named speaker. - `providers.google.baseUrl`: override the Gemini API base URL. Only `https://generativelanguage.googleapis.com` is accepted. - If `messages.tts.providers.google.apiKey` is omitted, TTS can reuse `models.providers.google.apiKey` before env fallback. - `providers.gradium.baseUrl`: override Gradium API base URL (default `https://api.gradium.ai`). 
diff --git a/extensions/google/speech-provider.test.ts b/extensions/google/speech-provider.test.ts index c55deb20610..f64e90364f4 100644 --- a/extensions/google/speech-provider.test.ts +++ b/extensions/google/speech-provider.test.ts @@ -166,6 +166,39 @@ describe("Google speech provider", () => { }); }); + it("prepends configured Gemini TTS profile text", async () => { + const fetchMock = installGoogleTtsFetchMock(); + const provider = buildGoogleSpeechProvider(); + + await provider.synthesize({ + text: "Status update starts now.", + cfg: {}, + providerConfig: { + apiKey: "google-test-key", + audioProfile: "Speak professionally with a calm executive tone.", + speakerName: "Alex", + }, + target: "audio-file", + timeoutMs: 10_000, + }); + + const [, init] = fetchMock.mock.calls[0]; + expect(JSON.parse(String(init.body))).toMatchObject({ + contents: [ + { + parts: [ + { + text: + "Speak professionally with a calm executive tone.\n\n" + + "Speaker name: Alex\n\n" + + "Status update starts now.", + }, + ], + }, + ], + }); + }); + it("resolves provider config and directive overrides", () => { const provider = buildGoogleSpeechProvider(); @@ -178,6 +211,8 @@ describe("Google speech provider", () => { apiKey: "configured-key", model: "google/gemini-3.1-flash-tts-preview", voice: "Leda", + audioProfile: "Speak warmly.", + speakerName: "Narrator", }, }, }, @@ -185,8 +220,10 @@ describe("Google speech provider", () => { }), ).toEqual({ apiKey: "configured-key", + audioProfile: "Speak warmly.", baseUrl: undefined, model: "gemini-3.1-flash-tts-preview", + speakerName: "Narrator", voiceName: "Leda", }); diff --git a/extensions/google/speech-provider.ts b/extensions/google/speech-provider.ts index ff91c57664a..a34a8907916 100644 --- a/extensions/google/speech-provider.ts +++ b/extensions/google/speech-provider.ts @@ -55,11 +55,15 @@ type GoogleTtsProviderConfig = { baseUrl?: string; model: string; voiceName: string; + audioProfile?: string; + speakerName?: string; }; type 
GoogleTtsProviderOverrides = { model?: string; voiceName?: string; + audioProfile?: string; + speakerName?: string; }; type Maybe = T | undefined; @@ -148,6 +152,8 @@ function normalizeGoogleTtsProviderConfig( baseUrl: trimToUndefined(raw?.baseUrl), model: normalizeGoogleTtsModel(raw?.model), voiceName: normalizeGoogleTtsVoiceName(raw?.voiceName ?? raw?.voice), + audioProfile: trimToUndefined(raw?.audioProfile), + speakerName: trimToUndefined(raw?.speakerName), }; } @@ -160,6 +166,8 @@ function readGoogleTtsProviderConfig(config: SpeechProviderConfig): GoogleTtsPro voiceName: normalizeGoogleTtsVoiceName( config.voiceName ?? config.voice ?? normalized.voiceName, ), + audioProfile: trimToUndefined(config.audioProfile) ?? normalized.audioProfile, + speakerName: trimToUndefined(config.speakerName) ?? normalized.speakerName, }; } @@ -172,9 +180,25 @@ function readGoogleTtsOverrides( return { model: normalizeOptionalString(overrides.model), voiceName: normalizeOptionalString(overrides.voiceName ?? overrides.voice), + audioProfile: normalizeOptionalString(overrides.audioProfile), + speakerName: normalizeOptionalString(overrides.speakerName), }; } +function composeGoogleTtsText(params: { + text: string; + audioProfile?: string; + speakerName?: string; +}): string { + return [ + trimToUndefined(params.audioProfile), + trimToUndefined(params.speakerName) ? 
`Speaker name: ${params.speakerName}` : undefined, + params.text, + ] + .filter((part): part is string => part !== undefined) + .join("\n\n"); +} + function parseDirectiveToken(ctx: SpeechDirectiveTokenParseContext): { handled: boolean; overrides?: SpeechProviderOverrides; @@ -242,6 +266,8 @@ async function synthesizeGoogleTtsPcm(params: { baseUrl?: string; model: string; voiceName: string; + audioProfile?: string; + speakerName?: string; timeoutMs: number; }): Promise { const { baseUrl, allowPrivateNetwork, headers, dispatcherPolicy } = @@ -259,7 +285,15 @@ async function synthesizeGoogleTtsPcm(params: { contents: [ { role: "user", - parts: [{ text: params.text }], + parts: [ + { + text: composeGoogleTtsText({ + text: params.text, + audioProfile: params.audioProfile, + speakerName: params.speakerName, + }), + }, + ], }, ], generationConfig: { @@ -347,6 +381,8 @@ export function buildGoogleSpeechProvider(): SpeechProviderPlugin { baseUrl: resolveGoogleTtsBaseUrl({ cfg: req.cfg, providerConfig: config }), model: normalizeGoogleTtsModel(overrides.model ?? config.model), voiceName: normalizeGoogleTtsVoiceName(overrides.voiceName ?? config.voiceName), + audioProfile: overrides.audioProfile ?? config.audioProfile, + speakerName: overrides.speakerName ?? config.speakerName, timeoutMs: req.timeoutMs, }); return { @@ -371,6 +407,8 @@ export function buildGoogleSpeechProvider(): SpeechProviderPlugin { baseUrl: resolveGoogleTtsBaseUrl({ cfg: req.cfg, providerConfig: config }), model: config.model, voiceName: config.voiceName, + audioProfile: config.audioProfile, + speakerName: config.speakerName, timeoutMs: req.timeoutMs, }); return {