mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-06 13:10:43 +00:00
feat(google): support Gemini TTS style profile
This commit is contained in:
@@ -62,6 +62,7 @@ Docs: https://docs.openclaw.ai
|
||||
- Plugins/Google Meet: add `googlemeet doctor` and a `recover_current_tab`/`recover-tab` flow so agents can inspect an already-open Meet tab and report the blocker without opening another window. Thanks @steipete.
|
||||
- Plugins/Bonjour: move LAN Gateway discovery advertising into a default-enabled bundled plugin with its own `@homebridge/ciao` dependency, so users can disable Bonjour without cutting wide-area discovery. Thanks @vincentkoc.
|
||||
- Providers/Google: add a Gemini Live realtime voice provider for backend Voice Call and Google Meet audio bridges, with bidirectional audio and function-call support. Thanks @steipete.
|
||||
- Providers/Google: let Gemini TTS prepend configured `audioProfile` and `speakerName` prompt text for reusable speech style control. Thanks @tdack.
|
||||
- Plugins/Google Meet: let realtime Meet sessions consult the full OpenClaw agent for deeper answers while staying in the live voice loop. Thanks @steipete.
|
||||
- Gateway/VoiceClaw: add a realtime brain WebSocket endpoint backed by Gemini Live, with owner-auth gating and async OpenClaw tool handoff. (#70938) Thanks @yagudaev.
|
||||
- Providers/DeepSeek: add DeepSeek V4 Flash and V4 Pro to the bundled catalog and make V4 Flash the onboarding default. Thanks @lsdsjy.
|
||||
|
||||
@@ -267,6 +267,7 @@ To use Google as the default TTS provider:
|
||||
google: {
|
||||
model: "gemini-3.1-flash-tts-preview",
|
||||
voiceName: "Kore",
|
||||
audioProfile: "Speak professionally with a calm tone.",
|
||||
},
|
||||
},
|
||||
},
|
||||
@@ -274,9 +275,14 @@ To use Google as the default TTS provider:
|
||||
}
|
||||
```
|
||||
|
||||
Gemini API TTS accepts expressive square-bracket audio tags in the text, such as
|
||||
`[whispers]` or `[laughs]`. To keep tags out of the visible chat reply while
|
||||
sending them to TTS, put them inside a `[[tts:text]]...[[/tts:text]]` block:
|
||||
Gemini API TTS uses natural-language prompting for style control. Set
|
||||
`audioProfile` to prepend a reusable style prompt before the spoken text. Set
|
||||
`speakerName` when your prompt text refers to a named speaker.
|
||||
|
||||
Gemini API TTS also accepts expressive square-bracket audio tags in the text,
|
||||
such as `[whispers]` or `[laughs]`. To keep tags out of the visible chat reply
|
||||
while sending them to TTS, put them inside a `[[tts:text]]...[[/tts:text]]`
|
||||
block:
|
||||
|
||||
```text
|
||||
Here is the clean reply text.
|
||||
|
||||
@@ -379,6 +379,8 @@ Then run:
|
||||
- `providers.minimax.pitch`: integer pitch shift `-12..12` (default 0). Fractional values are truncated before calling MiniMax T2A because the API rejects non-integer pitch values.
|
||||
- `providers.google.model`: Gemini TTS model (default `gemini-3.1-flash-tts-preview`).
|
||||
- `providers.google.voiceName`: Gemini prebuilt voice name (default `Kore`; `voice` is also accepted).
|
||||
- `providers.google.audioProfile`: natural-language style prompt prepended before the spoken text.
|
||||
- `providers.google.speakerName`: optional speaker label, sent as a `Speaker name: <name>` line before the spoken text, for TTS prompts that refer to a named speaker.
|
||||
- `providers.google.baseUrl`: override the Gemini API base URL. Only `https://generativelanguage.googleapis.com` is accepted.
|
||||
- If `messages.tts.providers.google.apiKey` is omitted, TTS can reuse `models.providers.google.apiKey` before env fallback.
|
||||
- `providers.gradium.baseUrl`: override Gradium API base URL (default `https://api.gradium.ai`).
|
||||
|
||||
@@ -166,6 +166,39 @@ describe("Google speech provider", () => {
|
||||
});
|
||||
});
|
||||
|
||||
// Verifies that the configured style preamble reaches the outgoing request:
// with `audioProfile` and `speakerName` set in providerConfig, the text part of
// the request body must be the profile, then a "Speaker name: ..." line, then
// the spoken text, each separated by a blank line.
it("prepends configured Gemini TTS profile text", async () => {
|
||||
const fetchMock = installGoogleTtsFetchMock();
|
||||
const provider = buildGoogleSpeechProvider();
|
||||
|
||||
await provider.synthesize({
|
||||
text: "Status update starts now.",
|
||||
cfg: {},
|
||||
providerConfig: {
|
||||
apiKey: "google-test-key",
|
||||
audioProfile: "Speak professionally with a calm executive tone.",
|
||||
speakerName: "Alex",
|
||||
},
|
||||
target: "audio-file",
|
||||
timeoutMs: 10_000,
|
||||
});
|
||||
|
||||
// Inspect the init argument of the first recorded fetch call.
const [, init] = fetchMock.mock.calls[0];
|
||||
// toMatchObject is a partial match, so only the composed prompt text is pinned here.
expect(JSON.parse(String(init.body))).toMatchObject({
|
||||
contents: [
|
||||
{
|
||||
parts: [
|
||||
{
|
||||
text:
|
||||
"Speak professionally with a calm executive tone.\n\n" +
|
||||
"Speaker name: Alex\n\n" +
|
||||
"Status update starts now.",
|
||||
},
|
||||
],
|
||||
},
|
||||
],
|
||||
});
|
||||
});
|
||||
|
||||
it("resolves provider config and directive overrides", () => {
|
||||
const provider = buildGoogleSpeechProvider();
|
||||
|
||||
@@ -178,6 +211,8 @@ describe("Google speech provider", () => {
|
||||
apiKey: "configured-key",
|
||||
model: "google/gemini-3.1-flash-tts-preview",
|
||||
voice: "Leda",
|
||||
audioProfile: "Speak warmly.",
|
||||
speakerName: "Narrator",
|
||||
},
|
||||
},
|
||||
},
|
||||
@@ -185,8 +220,10 @@ describe("Google speech provider", () => {
|
||||
}),
|
||||
).toEqual({
|
||||
apiKey: "configured-key",
|
||||
audioProfile: "Speak warmly.",
|
||||
baseUrl: undefined,
|
||||
model: "gemini-3.1-flash-tts-preview",
|
||||
speakerName: "Narrator",
|
||||
voiceName: "Leda",
|
||||
});
|
||||
|
||||
|
||||
@@ -55,11 +55,15 @@ type GoogleTtsProviderConfig = {
|
||||
baseUrl?: string;
|
||||
model: string;
|
||||
voiceName: string;
|
||||
audioProfile?: string;
|
||||
speakerName?: string;
|
||||
};
|
||||
|
||||
/**
 * Optional overrides for Google TTS synthesis. Every field may be omitted;
 * where the synthesize path reads them, an override takes precedence over the
 * matching provider config value (`overrides.x ?? config.x`).
 */
type GoogleTtsProviderOverrides = {
|
||||
model?: string;
|
||||
voiceName?: string;
|
||||
audioProfile?: string;
|
||||
speakerName?: string;
|
||||
};
|
||||
|
||||
type Maybe<T> = T | undefined;
|
||||
@@ -148,6 +152,8 @@ function normalizeGoogleTtsProviderConfig(
|
||||
baseUrl: trimToUndefined(raw?.baseUrl),
|
||||
model: normalizeGoogleTtsModel(raw?.model),
|
||||
voiceName: normalizeGoogleTtsVoiceName(raw?.voiceName ?? raw?.voice),
|
||||
audioProfile: trimToUndefined(raw?.audioProfile),
|
||||
speakerName: trimToUndefined(raw?.speakerName),
|
||||
};
|
||||
}
|
||||
|
||||
@@ -160,6 +166,8 @@ function readGoogleTtsProviderConfig(config: SpeechProviderConfig): GoogleTtsPro
|
||||
voiceName: normalizeGoogleTtsVoiceName(
|
||||
config.voiceName ?? config.voice ?? normalized.voiceName,
|
||||
),
|
||||
audioProfile: trimToUndefined(config.audioProfile) ?? normalized.audioProfile,
|
||||
speakerName: trimToUndefined(config.speakerName) ?? normalized.speakerName,
|
||||
};
|
||||
}
|
||||
|
||||
@@ -172,9 +180,25 @@ function readGoogleTtsOverrides(
|
||||
return {
|
||||
model: normalizeOptionalString(overrides.model),
|
||||
voiceName: normalizeOptionalString(overrides.voiceName ?? overrides.voice),
|
||||
audioProfile: normalizeOptionalString(overrides.audioProfile),
|
||||
speakerName: normalizeOptionalString(overrides.speakerName),
|
||||
};
|
||||
}
|
||||
|
||||
/**
 * Builds the prompt text sent to Gemini TTS by stacking the optional style
 * preamble ahead of the text to be spoken.
 *
 * Parts are emitted in this order, joined by a blank line (`"\n\n"`):
 * 1. `audioProfile`, normalized through `trimToUndefined`; omitted when that
 *    yields `undefined`.
 * 2. `Speaker name: <speakerName>`, using the normalized speaker name; omitted
 *    when the normalized name is empty/undefined.
 * 3. `text`, always included and passed through unmodified.
 *
 * @param params.text - Text to be spoken.
 * @param params.audioProfile - Optional style prompt placed first.
 * @param params.speakerName - Optional speaker label placed after the profile.
 * @returns The composed prompt string.
 */
function composeGoogleTtsText(params: {
  text: string;
  audioProfile?: string;
  speakerName?: string;
}): string {
  const audioProfile = trimToUndefined(params.audioProfile);
  // Interpolate the normalized name rather than the raw parameter so that
  // surrounding whitespace never leaks into the "Speaker name:" line; this
  // keeps the speaker label consistent with how audioProfile is emitted.
  const speakerName = trimToUndefined(params.speakerName);

  return [
    audioProfile,
    speakerName ? `Speaker name: ${speakerName}` : undefined,
    params.text,
  ]
    .filter((part): part is string => part !== undefined)
    .join("\n\n");
}
|
||||
|
||||
function parseDirectiveToken(ctx: SpeechDirectiveTokenParseContext): {
|
||||
handled: boolean;
|
||||
overrides?: SpeechProviderOverrides;
|
||||
@@ -242,6 +266,8 @@ async function synthesizeGoogleTtsPcm(params: {
|
||||
baseUrl?: string;
|
||||
model: string;
|
||||
voiceName: string;
|
||||
audioProfile?: string;
|
||||
speakerName?: string;
|
||||
timeoutMs: number;
|
||||
}): Promise<Buffer> {
|
||||
const { baseUrl, allowPrivateNetwork, headers, dispatcherPolicy } =
|
||||
@@ -259,7 +285,15 @@ async function synthesizeGoogleTtsPcm(params: {
|
||||
contents: [
|
||||
{
|
||||
role: "user",
|
||||
parts: [{ text: params.text }],
|
||||
parts: [
|
||||
{
|
||||
text: composeGoogleTtsText({
|
||||
text: params.text,
|
||||
audioProfile: params.audioProfile,
|
||||
speakerName: params.speakerName,
|
||||
}),
|
||||
},
|
||||
],
|
||||
},
|
||||
],
|
||||
generationConfig: {
|
||||
@@ -347,6 +381,8 @@ export function buildGoogleSpeechProvider(): SpeechProviderPlugin {
|
||||
baseUrl: resolveGoogleTtsBaseUrl({ cfg: req.cfg, providerConfig: config }),
|
||||
model: normalizeGoogleTtsModel(overrides.model ?? config.model),
|
||||
voiceName: normalizeGoogleTtsVoiceName(overrides.voiceName ?? config.voiceName),
|
||||
audioProfile: overrides.audioProfile ?? config.audioProfile,
|
||||
speakerName: overrides.speakerName ?? config.speakerName,
|
||||
timeoutMs: req.timeoutMs,
|
||||
});
|
||||
return {
|
||||
@@ -371,6 +407,8 @@ export function buildGoogleSpeechProvider(): SpeechProviderPlugin {
|
||||
baseUrl: resolveGoogleTtsBaseUrl({ cfg: req.cfg, providerConfig: config }),
|
||||
model: config.model,
|
||||
voiceName: config.voiceName,
|
||||
audioProfile: config.audioProfile,
|
||||
speakerName: config.speakerName,
|
||||
timeoutMs: req.timeoutMs,
|
||||
});
|
||||
return {
|
||||
|
||||
Reference in New Issue
Block a user