feat(google): support Gemini TTS style profile

This commit is contained in:
Peter Steinberger
2026-04-25 06:11:15 +01:00
parent 3f63ba8fd8
commit 8acc92c881
5 changed files with 88 additions and 4 deletions

View File

@@ -62,6 +62,7 @@ Docs: https://docs.openclaw.ai
- Plugins/Google Meet: add `googlemeet doctor` and a `recover_current_tab`/`recover-tab` flow so agents can inspect an already-open Meet tab and report the blocker without opening another window. Thanks @steipete.
- Plugins/Bonjour: move LAN Gateway discovery advertising into a default-enabled bundled plugin with its own `@homebridge/ciao` dependency, so users can disable Bonjour without cutting wide-area discovery. Thanks @vincentkoc.
- Providers/Google: add a Gemini Live realtime voice provider for backend Voice Call and Google Meet audio bridges, with bidirectional audio and function-call support. Thanks @steipete.
- Providers/Google: let Gemini TTS prepend configured `audioProfile` and `speakerName` prompt text for reusable speech style control. Thanks @tdack.
- Plugins/Google Meet: let realtime Meet sessions consult the full OpenClaw agent for deeper answers while staying in the live voice loop. Thanks @steipete.
- Gateway/VoiceClaw: add a realtime brain WebSocket endpoint backed by Gemini Live, with owner-auth gating and async OpenClaw tool handoff. (#70938) Thanks @yagudaev.
- Providers/DeepSeek: add DeepSeek V4 Flash and V4 Pro to the bundled catalog and make V4 Flash the onboarding default. Thanks @lsdsjy.

View File

@@ -267,6 +267,7 @@ To use Google as the default TTS provider:
google: {
model: "gemini-3.1-flash-tts-preview",
voiceName: "Kore",
audioProfile: "Speak professionally with a calm tone.",
},
},
},
@@ -274,9 +275,14 @@ To use Google as the default TTS provider:
}
```
Gemini API TTS accepts expressive square-bracket audio tags in the text, such as
`[whispers]` or `[laughs]`. To keep tags out of the visible chat reply while
sending them to TTS, put them inside a `[[tts:text]]...[[/tts:text]]` block:
Gemini API TTS uses natural-language prompting for style control. Set
`audioProfile` to prepend a reusable style prompt before the spoken text. Set
`speakerName` when your prompt text refers to a named speaker.
Gemini API TTS also accepts expressive square-bracket audio tags in the text,
such as `[whispers]` or `[laughs]`. To keep tags out of the visible chat reply
while sending them to TTS, put them inside a `[[tts:text]]...[[/tts:text]]`
block:
```text
Here is the clean reply text.

View File

@@ -379,6 +379,8 @@ Then run:
- `providers.minimax.pitch`: integer pitch shift `-12..12` (default 0). Fractional values are truncated before calling MiniMax T2A because the API rejects non-integer pitch values.
- `providers.google.model`: Gemini TTS model (default `gemini-3.1-flash-tts-preview`).
- `providers.google.voiceName`: Gemini prebuilt voice name (default `Kore`; `voice` is also accepted).
- `providers.google.audioProfile`: natural-language style prompt prepended to the spoken text.
- `providers.google.speakerName`: optional speaker label prepended to the spoken text when your TTS prompt refers to a named speaker.
- `providers.google.baseUrl`: override the Gemini API base URL. Only `https://generativelanguage.googleapis.com` is accepted.
- If `messages.tts.providers.google.apiKey` is omitted, TTS can reuse `models.providers.google.apiKey` before env fallback.
- `providers.gradium.baseUrl`: override Gradium API base URL (default `https://api.gradium.ai`).

View File

@@ -166,6 +166,39 @@ describe("Google speech provider", () => {
});
});
// Verifies that audioProfile and speakerName from providerConfig are joined
// with the spoken text (blank-line separated, in that order) inside the
// single text part of the Gemini generateContent request body.
it("prepends configured Gemini TTS profile text", async () => {
const fetchMock = installGoogleTtsFetchMock();
const provider = buildGoogleSpeechProvider();
await provider.synthesize({
text: "Status update starts now.",
cfg: {},
providerConfig: {
apiKey: "google-test-key",
audioProfile: "Speak professionally with a calm executive tone.",
speakerName: "Alex",
},
target: "audio-file",
timeoutMs: 10_000,
});
// Inspect the first fetch call's request init to check the JSON body shape.
const [, init] = fetchMock.mock.calls[0];
expect(JSON.parse(String(init.body))).toMatchObject({
contents: [
{
parts: [
{
text:
"Speak professionally with a calm executive tone.\n\n" +
"Speaker name: Alex\n\n" +
"Status update starts now.",
},
],
},
],
});
});
it("resolves provider config and directive overrides", () => {
const provider = buildGoogleSpeechProvider();
@@ -178,6 +211,8 @@ describe("Google speech provider", () => {
apiKey: "configured-key",
model: "google/gemini-3.1-flash-tts-preview",
voice: "Leda",
audioProfile: "Speak warmly.",
speakerName: "Narrator",
},
},
},
@@ -185,8 +220,10 @@ describe("Google speech provider", () => {
}),
).toEqual({
apiKey: "configured-key",
audioProfile: "Speak warmly.",
baseUrl: undefined,
model: "gemini-3.1-flash-tts-preview",
speakerName: "Narrator",
voiceName: "Leda",
});

View File

@@ -55,11 +55,15 @@ type GoogleTtsProviderConfig = {
baseUrl?: string;
model: string;
voiceName: string;
audioProfile?: string;
speakerName?: string;
};
type GoogleTtsProviderOverrides = {
model?: string;
voiceName?: string;
audioProfile?: string;
speakerName?: string;
};
type Maybe<T> = T | undefined;
@@ -148,6 +152,8 @@ function normalizeGoogleTtsProviderConfig(
baseUrl: trimToUndefined(raw?.baseUrl),
model: normalizeGoogleTtsModel(raw?.model),
voiceName: normalizeGoogleTtsVoiceName(raw?.voiceName ?? raw?.voice),
audioProfile: trimToUndefined(raw?.audioProfile),
speakerName: trimToUndefined(raw?.speakerName),
};
}
@@ -160,6 +166,8 @@ function readGoogleTtsProviderConfig(config: SpeechProviderConfig): GoogleTtsPro
voiceName: normalizeGoogleTtsVoiceName(
config.voiceName ?? config.voice ?? normalized.voiceName,
),
audioProfile: trimToUndefined(config.audioProfile) ?? normalized.audioProfile,
speakerName: trimToUndefined(config.speakerName) ?? normalized.speakerName,
};
}
@@ -172,9 +180,25 @@ function readGoogleTtsOverrides(
return {
model: normalizeOptionalString(overrides.model),
voiceName: normalizeOptionalString(overrides.voiceName ?? overrides.voice),
audioProfile: normalizeOptionalString(overrides.audioProfile),
speakerName: normalizeOptionalString(overrides.speakerName),
};
}
/**
 * Builds the final text sent to Gemini TTS: an optional style prompt
 * (`audioProfile`), an optional `Speaker name: …` label, then the spoken
 * text, joined with blank lines so each prompt section stays distinct.
 */
function composeGoogleTtsText(params: {
  text: string;
  audioProfile?: string;
  speakerName?: string;
}): string {
  // Trim once and reuse: the original interpolated the untrimmed
  // params.speakerName, leaking surrounding whitespace into the prompt.
  const speakerName = trimToUndefined(params.speakerName);
  return [
    trimToUndefined(params.audioProfile),
    speakerName ? `Speaker name: ${speakerName}` : undefined,
    params.text,
  ]
    .filter((part): part is string => part !== undefined)
    .join("\n\n");
}
function parseDirectiveToken(ctx: SpeechDirectiveTokenParseContext): {
handled: boolean;
overrides?: SpeechProviderOverrides;
@@ -242,6 +266,8 @@ async function synthesizeGoogleTtsPcm(params: {
baseUrl?: string;
model: string;
voiceName: string;
audioProfile?: string;
speakerName?: string;
timeoutMs: number;
}): Promise<Buffer> {
const { baseUrl, allowPrivateNetwork, headers, dispatcherPolicy } =
@@ -259,7 +285,15 @@ async function synthesizeGoogleTtsPcm(params: {
contents: [
{
role: "user",
parts: [{ text: params.text }],
parts: [
{
text: composeGoogleTtsText({
text: params.text,
audioProfile: params.audioProfile,
speakerName: params.speakerName,
}),
},
],
},
],
generationConfig: {
@@ -347,6 +381,8 @@ export function buildGoogleSpeechProvider(): SpeechProviderPlugin {
baseUrl: resolveGoogleTtsBaseUrl({ cfg: req.cfg, providerConfig: config }),
model: normalizeGoogleTtsModel(overrides.model ?? config.model),
voiceName: normalizeGoogleTtsVoiceName(overrides.voiceName ?? config.voiceName),
audioProfile: overrides.audioProfile ?? config.audioProfile,
speakerName: overrides.speakerName ?? config.speakerName,
timeoutMs: req.timeoutMs,
});
return {
@@ -371,6 +407,8 @@ export function buildGoogleSpeechProvider(): SpeechProviderPlugin {
baseUrl: resolveGoogleTtsBaseUrl({ cfg: req.cfg, providerConfig: config }),
model: config.model,
voiceName: config.voiceName,
audioProfile: config.audioProfile,
speakerName: config.speakerName,
timeoutMs: req.timeoutMs,
});
return {