fix(tts): honor telephony voice overrides

This commit is contained in:
Vincent Koc
2026-05-03 22:49:46 -07:00
parent a224810a7f
commit 361737d1f1
11 changed files with 133 additions and 18 deletions

View File

@@ -18,6 +18,7 @@ Docs: https://docs.openclaw.ai
- Agents/verbose: use compact explain-mode tool summaries for `/verbose` and progress drafts by default, with `agents.defaults.toolProgressDetail: "raw"` and per-agent overrides for debugging raw command/detail output.
- Agents/commands: add `/steer <message>` for queue-independent steering of the active current-session run without starting a new turn when the session is idle. (#76934)
- Agents/subagents: preserve every grouped child result when direct completion fallback has to bypass the requester-agent announce turn. Thanks @vincentkoc.
- TTS/telephony: honor provider voice/model overrides in telephony synthesis providers so Google Meet agent speech logs match the backend that actually produced the audio. Thanks @vincentkoc.
- Tools/BTW: add `/side` as a text and native slash-command alias for `/btw` side questions.
- Doctor/config: `doctor --fix` now commits safe legacy migrations even when unrelated validation issues (e.g. a missing plugin) prevent full validation from passing, so `agents.defaults.llm` and other known-legacy keys are always cleaned up regardless of other config problems. Fixes #76798. (#76800) Thanks @hclsys.
- Docs: clarify that IRC uses raw TCP/TLS sockets outside operator-managed forward proxy routing, so direct IRC egress should be explicitly approved before enabling IRC. Thanks @jesse-merhi.

View File

@@ -176,6 +176,42 @@ describe("buildAzureSpeechProvider", () => {
});
});
// Verifies that telephony synthesis prefers `providerOverrides` over
// `providerConfig` for voice and language when both are supplied.
it("honors voice and language overrides for telephony output", async () => {
const provider = buildAzureSpeechProvider();
const result = await provider.synthesizeTelephony?.({
text: "hello",
cfg: {} as never,
// Baseline provider configuration: Jenny voice, en-US language.
providerConfig: {
apiKey: "key",
region: "eastus",
voice: "en-US-JennyNeural",
lang: "en-US",
},
// Per-request overrides that differ from the configured voice/lang.
providerOverrides: {
voice: "en-US-AriaNeural",
lang: "es-US",
},
timeoutMs: 30_000,
});
// The mocked TTS call must receive the override voice/lang, not the configured ones.
// NOTE(review): the expected baseUrl embeds the configured region ("eastus");
// presumably it is derived from `region` by the provider — confirm in the provider code.
expect(azureSpeechTTSMock).toHaveBeenCalledWith({
text: "hello",
apiKey: "key",
baseUrl: "https://eastus.tts.speech.microsoft.com",
endpoint: undefined,
region: "eastus",
voice: "en-US-AriaNeural",
lang: "es-US",
outputFormat: "raw-8khz-8bit-mono-mulaw",
timeoutMs: 30_000,
});
// The telephony result reports the same output format string and an 8 kHz sample rate.
expect(result).toEqual({
audioBuffer: Buffer.from("audio-bytes"),
outputFormat: "raw-8khz-8bit-mono-mulaw",
sampleRate: 8_000,
});
});
it("lists voices through config or explicit request auth", async () => {
const provider = buildAzureSpeechProvider();
const voices = await provider.listVoices?.({

View File

@@ -279,6 +279,7 @@ export function buildAzureSpeechProvider(): SpeechProviderPlugin {
},
synthesizeTelephony: async (req) => {
const config = readAzureSpeechProviderConfig(req.providerConfig);
const overrides = readAzureSpeechOverrides(req.providerOverrides);
const apiKey = resolveApiKey(config);
if (!apiKey) {
throw new Error("Azure Speech API key missing");
@@ -290,8 +291,8 @@ export function buildAzureSpeechProvider(): SpeechProviderPlugin {
baseUrl: config.baseUrl,
endpoint: config.endpoint,
region: config.region,
voice: config.voice,
lang: config.lang,
voice: overrides.voice ?? config.voice,
lang: overrides.lang ?? config.lang,
outputFormat: DEFAULT_AZURE_SPEECH_TELEPHONY_FORMAT,
timeoutMs: resolveTimeoutMs(config, req.timeoutMs),
});

View File

@@ -397,11 +397,44 @@ describe("Google speech provider", () => {
cfg: {},
providerConfig: {
apiKey: "google-test-key",
model: "google/gemini-3.1-flash-tts",
voice: "Kore",
audioProfile: "Speak calmly.",
speakerName: "Default speaker",
},
providerOverrides: {
model: "google/gemini-3.1-pro-tts",
voiceName: "Puck",
audioProfile: "Speak brightly.",
speakerName: "Override speaker",
},
timeoutMs: 5_000,
});
expect(postJsonRequestMock).toHaveBeenCalledWith(
expect.objectContaining({
url: "https://generativelanguage.googleapis.com/v1beta/models/gemini-3.1-pro-tts:generateContent",
body: expect.objectContaining({
contents: [
{
role: "user",
parts: [
{ text: "Speak brightly.\n\nSpeaker name: Override speaker\n\nPhone call audio." },
],
},
],
generationConfig: expect.objectContaining({
speechConfig: {
voiceConfig: {
prebuiltVoiceConfig: {
voiceName: "Puck",
},
},
},
}),
}),
}),
);
expect(result).toEqual({
audioBuffer: pcm,
outputFormat: "pcm",

View File

@@ -640,6 +640,7 @@ export function buildGoogleSpeechProvider(): SpeechProviderPlugin {
},
synthesizeTelephony: async (req) => {
const config = readGoogleTtsProviderConfig(req.providerConfig);
const overrides = readGoogleTtsOverrides(req.providerOverrides);
const apiKey = resolveGoogleTtsApiKey({
cfg: req.cfg,
providerConfig: req.providerConfig,
@@ -654,10 +655,10 @@ export function buildGoogleSpeechProvider(): SpeechProviderPlugin {
request: sanitizeConfiguredModelProviderRequest(
req.cfg?.models?.providers?.google?.request,
),
model: config.model,
voiceName: config.voiceName,
audioProfile: config.audioProfile,
speakerName: config.speakerName,
model: normalizeGoogleTtsModel(overrides.model ?? config.model),
voiceName: normalizeGoogleTtsVoiceName(overrides.voiceName ?? config.voiceName),
audioProfile: overrides.audioProfile ?? config.audioProfile,
speakerName: overrides.speakerName ?? config.speakerName,
timeoutMs: req.timeoutMs,
});
return {

View File

@@ -98,12 +98,16 @@ describe("gradium speech provider", () => {
const result = await provider.synthesizeTelephony!({
text: "Telephony test",
cfg: {} as never,
providerConfig: { apiKey: "gsk_test123" },
providerConfig: { apiKey: "gsk_test123", voiceId: "default-voice" },
providerOverrides: { voiceId: "override-voice" },
timeoutMs: 30_000,
});
const [, init] = fetchMock.mock.calls[0] as [string, RequestInit];
expect(JSON.parse(init.body as string).output_format).toBe("ulaw_8000");
expect(JSON.parse(init.body as string)).toMatchObject({
voice_id: "override-voice",
output_format: "ulaw_8000",
});
expect(result.outputFormat).toBe("ulaw_8000");
expect(result.sampleRate).toBe(8_000);
expect(result.audioBuffer).toEqual(audioData);

View File

@@ -96,6 +96,7 @@ export function buildGradiumSpeechProvider(): SpeechProviderPlugin {
},
synthesizeTelephony: async (req) => {
const config = readGradiumProviderConfig(req.providerConfig);
const overrides = req.providerOverrides ?? {};
const apiKey = config.apiKey || process.env.GRADIUM_API_KEY;
if (!apiKey) {
throw new Error("Gradium API key missing");
@@ -106,7 +107,7 @@ export function buildGradiumSpeechProvider(): SpeechProviderPlugin {
text: req.text,
apiKey,
baseUrl: config.baseUrl,
voiceId: config.voiceId,
voiceId: trimToUndefined(overrides.voiceId) ?? config.voiceId,
outputFormat,
timeoutMs: req.timeoutMs,
});

View File

@@ -190,6 +190,7 @@ describe("buildInworldSpeechProvider", () => {
text: "Hello",
cfg: {} as never,
providerConfig: { apiKey: "key", voiceId: "Sarah", modelId: "inworld-tts-1.5-max" },
providerOverrides: { voice: "Ashley", model: "inworld-tts-1.5-mini", temperature: 0.6 },
timeoutMs: 30_000,
});
@@ -197,11 +198,11 @@ describe("buildInworldSpeechProvider", () => {
text: "Hello",
apiKey: "key",
baseUrl: "https://api.inworld.ai",
voiceId: "Sarah",
modelId: "inworld-tts-1.5-max",
voiceId: "Ashley",
modelId: "inworld-tts-1.5-mini",
audioEncoding: "PCM",
sampleRateHertz: 22_050,
temperature: undefined,
temperature: 0.6,
timeoutMs: 30_000,
});
expect(result).toEqual({

View File

@@ -197,6 +197,7 @@ export function buildInworldSpeechProvider(): SpeechProviderPlugin {
},
synthesizeTelephony: async (req) => {
const config = readInworldProviderConfig(req.providerConfig);
const overrides = readInworldOverrides(req.providerOverrides);
const apiKey = config.apiKey || process.env.INWORLD_API_KEY;
if (!apiKey) {
throw new Error("Inworld API key missing");
@@ -207,11 +208,11 @@ export function buildInworldSpeechProvider(): SpeechProviderPlugin {
text: req.text,
apiKey,
baseUrl: config.baseUrl,
voiceId: config.voiceId,
modelId: config.modelId,
voiceId: overrides.voiceId ?? config.voiceId,
modelId: overrides.modelId ?? config.modelId,
audioEncoding: "PCM",
sampleRateHertz: sampleRate,
temperature: config.temperature,
temperature: overrides.temperature ?? config.temperature,
timeoutMs: req.timeoutMs,
});

View File

@@ -68,4 +68,39 @@ describe("xai speech provider", () => {
}),
);
});
// Verifies that telephony synthesis prefers `providerOverrides` over
// `providerConfig` for voice, language, and speed.
it("honors voice, language, and speed overrides for telephony output", async () => {
const provider = buildXaiSpeechProvider();
const result = await provider.synthesizeTelephony?.({
text: "hello",
cfg: {},
// Baseline provider configuration the overrides are expected to win over.
providerConfig: {
apiKey: "xai-key",
baseUrl: "https://api.x.ai/v1",
voiceId: "eve",
language: "en",
speed: 1,
},
// Note the override key is `voice`, while the config key is `voiceId`;
// the assertion below expects the value to arrive as `voiceId`.
providerOverrides: {
voice: "aura",
language: "es",
speed: 1.2,
},
timeoutMs: 5_000,
});
// Telephony output is expected as PCM with a 24 kHz sample rate.
expect(result).toEqual({
audioBuffer: Buffer.from("audio-bytes"),
outputFormat: "pcm",
sampleRate: 24_000,
});
// Only the override-sensitive fields and the response format are pinned;
// other call arguments are left unchecked via `objectContaining`.
expect(xaiTTSMock).toHaveBeenLastCalledWith(
expect.objectContaining({
voiceId: "aura",
language: "es",
speed: 1.2,
responseFormat: "pcm",
}),
);
});
});

View File

@@ -230,6 +230,7 @@ export function buildXaiSpeechProvider(): SpeechProviderPlugin {
},
synthesizeTelephony: async (req) => {
const config = readXaiProviderConfig(req.providerConfig);
const overrides = readXaiOverrides(req.providerOverrides);
const apiKey = config.apiKey || process.env.XAI_API_KEY;
if (!apiKey) {
throw new Error("xAI API key missing");
@@ -240,9 +241,9 @@ export function buildXaiSpeechProvider(): SpeechProviderPlugin {
text: req.text,
apiKey,
baseUrl: config.baseUrl,
voiceId: config.voiceId,
language: config.language,
speed: config.speed,
voiceId: overrides.voiceId ?? config.voiceId,
language: overrides.language ?? config.language,
speed: overrides.speed ?? config.speed,
responseFormat: outputFormat,
timeoutMs: req.timeoutMs,
});