mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-06 13:10:43 +00:00
fix(tts): honor telephony voice overrides
This commit is contained in:
@@ -18,6 +18,7 @@ Docs: https://docs.openclaw.ai
|
||||
- Agents/verbose: use compact explain-mode tool summaries for `/verbose` and progress drafts by default, with `agents.defaults.toolProgressDetail: "raw"` and per-agent overrides for debugging raw command/detail output.
|
||||
- Agents/commands: add `/steer <message>` for queue-independent steering of the active current-session run without starting a new turn when the session is idle. (#76934)
|
||||
- Agents/subagents: preserve every grouped child result when direct completion fallback has to bypass the requester-agent announce turn. Thanks @vincentkoc.
|
||||
- TTS/telephony: honor provider voice/model overrides in telephony synthesis providers so Google Meet agent speech logs match the backend that actually produced the audio. Thanks @vincentkoc.
|
||||
- Tools/BTW: add `/side` as a text and native slash-command alias for `/btw` side questions.
|
||||
- Doctor/config: `doctor --fix` now commits safe legacy migrations even when unrelated validation issues (e.g. a missing plugin) prevent full validation from passing, so `agents.defaults.llm` and other known-legacy keys are always cleaned up regardless of other config problems. Fixes #76798. (#76800) Thanks @hclsys.
|
||||
- Docs: clarify that IRC uses raw TCP/TLS sockets outside operator-managed forward proxy routing, so direct IRC egress should be explicitly approved before enabling IRC. Thanks @jesse-merhi.
|
||||
|
||||
@@ -176,6 +176,42 @@ describe("buildAzureSpeechProvider", () => {
|
||||
});
|
||||
});
|
||||
|
||||
// Verifies that Azure telephony synthesis uses providerOverrides.voice/lang
// in preference to the voice/lang set in providerConfig.
it("honors voice and language overrides for telephony output", async () => {
|
||||
const provider = buildAzureSpeechProvider();
|
||||
const result = await provider.synthesizeTelephony?.({
|
||||
text: "hello",
|
||||
cfg: {} as never,
|
||||
// Configured defaults; the assertion below expects these voice/lang values NOT to be used.
providerConfig: {
|
||||
apiKey: "key",
|
||||
region: "eastus",
|
||||
voice: "en-US-JennyNeural",
|
||||
lang: "en-US",
|
||||
},
|
||||
// Per-request overrides that are expected to win over providerConfig.
providerOverrides: {
|
||||
voice: "en-US-AriaNeural",
|
||||
lang: "es-US",
|
||||
},
|
||||
timeoutMs: 30_000,
|
||||
});
|
||||
|
||||
// The mocked TTS call must receive the override voice/lang, while apiKey and
// region still come from providerConfig and the output format is the mulaw one.
expect(azureSpeechTTSMock).toHaveBeenCalledWith({
|
||||
text: "hello",
|
||||
apiKey: "key",
|
||||
baseUrl: "https://eastus.tts.speech.microsoft.com",
|
||||
endpoint: undefined,
|
||||
region: "eastus",
|
||||
voice: "en-US-AriaNeural",
|
||||
lang: "es-US",
|
||||
outputFormat: "raw-8khz-8bit-mono-mulaw",
|
||||
timeoutMs: 30_000,
|
||||
});
|
||||
// The returned telephony payload is expected to report the same format at 8 kHz.
expect(result).toEqual({
|
||||
audioBuffer: Buffer.from("audio-bytes"),
|
||||
outputFormat: "raw-8khz-8bit-mono-mulaw",
|
||||
sampleRate: 8_000,
|
||||
});
|
||||
});
|
||||
|
||||
it("lists voices through config or explicit request auth", async () => {
|
||||
const provider = buildAzureSpeechProvider();
|
||||
const voices = await provider.listVoices?.({
|
||||
|
||||
@@ -279,6 +279,7 @@ export function buildAzureSpeechProvider(): SpeechProviderPlugin {
|
||||
},
|
||||
synthesizeTelephony: async (req) => {
|
||||
const config = readAzureSpeechProviderConfig(req.providerConfig);
|
||||
const overrides = readAzureSpeechOverrides(req.providerOverrides);
|
||||
const apiKey = resolveApiKey(config);
|
||||
if (!apiKey) {
|
||||
throw new Error("Azure Speech API key missing");
|
||||
@@ -290,8 +291,8 @@ export function buildAzureSpeechProvider(): SpeechProviderPlugin {
|
||||
baseUrl: config.baseUrl,
|
||||
endpoint: config.endpoint,
|
||||
region: config.region,
|
||||
voice: config.voice,
|
||||
lang: config.lang,
|
||||
voice: overrides.voice ?? config.voice,
|
||||
lang: overrides.lang ?? config.lang,
|
||||
outputFormat: DEFAULT_AZURE_SPEECH_TELEPHONY_FORMAT,
|
||||
timeoutMs: resolveTimeoutMs(config, req.timeoutMs),
|
||||
});
|
||||
|
||||
@@ -397,11 +397,44 @@ describe("Google speech provider", () => {
|
||||
cfg: {},
|
||||
providerConfig: {
|
||||
apiKey: "google-test-key",
|
||||
model: "google/gemini-3.1-flash-tts",
|
||||
voice: "Kore",
|
||||
audioProfile: "Speak calmly.",
|
||||
speakerName: "Default speaker",
|
||||
},
|
||||
providerOverrides: {
|
||||
model: "google/gemini-3.1-pro-tts",
|
||||
voiceName: "Puck",
|
||||
audioProfile: "Speak brightly.",
|
||||
speakerName: "Override speaker",
|
||||
},
|
||||
timeoutMs: 5_000,
|
||||
});
|
||||
|
||||
expect(postJsonRequestMock).toHaveBeenCalledWith(
|
||||
expect.objectContaining({
|
||||
url: "https://generativelanguage.googleapis.com/v1beta/models/gemini-3.1-pro-tts:generateContent",
|
||||
body: expect.objectContaining({
|
||||
contents: [
|
||||
{
|
||||
role: "user",
|
||||
parts: [
|
||||
{ text: "Speak brightly.\n\nSpeaker name: Override speaker\n\nPhone call audio." },
|
||||
],
|
||||
},
|
||||
],
|
||||
generationConfig: expect.objectContaining({
|
||||
speechConfig: {
|
||||
voiceConfig: {
|
||||
prebuiltVoiceConfig: {
|
||||
voiceName: "Puck",
|
||||
},
|
||||
},
|
||||
},
|
||||
}),
|
||||
}),
|
||||
}),
|
||||
);
|
||||
expect(result).toEqual({
|
||||
audioBuffer: pcm,
|
||||
outputFormat: "pcm",
|
||||
|
||||
@@ -640,6 +640,7 @@ export function buildGoogleSpeechProvider(): SpeechProviderPlugin {
|
||||
},
|
||||
synthesizeTelephony: async (req) => {
|
||||
const config = readGoogleTtsProviderConfig(req.providerConfig);
|
||||
const overrides = readGoogleTtsOverrides(req.providerOverrides);
|
||||
const apiKey = resolveGoogleTtsApiKey({
|
||||
cfg: req.cfg,
|
||||
providerConfig: req.providerConfig,
|
||||
@@ -654,10 +655,10 @@ export function buildGoogleSpeechProvider(): SpeechProviderPlugin {
|
||||
request: sanitizeConfiguredModelProviderRequest(
|
||||
req.cfg?.models?.providers?.google?.request,
|
||||
),
|
||||
model: config.model,
|
||||
voiceName: config.voiceName,
|
||||
audioProfile: config.audioProfile,
|
||||
speakerName: config.speakerName,
|
||||
model: normalizeGoogleTtsModel(overrides.model ?? config.model),
|
||||
voiceName: normalizeGoogleTtsVoiceName(overrides.voiceName ?? config.voiceName),
|
||||
audioProfile: overrides.audioProfile ?? config.audioProfile,
|
||||
speakerName: overrides.speakerName ?? config.speakerName,
|
||||
timeoutMs: req.timeoutMs,
|
||||
});
|
||||
return {
|
||||
|
||||
@@ -98,12 +98,16 @@ describe("gradium speech provider", () => {
|
||||
const result = await provider.synthesizeTelephony!({
|
||||
text: "Telephony test",
|
||||
cfg: {} as never,
|
||||
providerConfig: { apiKey: "gsk_test123" },
|
||||
providerConfig: { apiKey: "gsk_test123", voiceId: "default-voice" },
|
||||
providerOverrides: { voiceId: "override-voice" },
|
||||
timeoutMs: 30_000,
|
||||
});
|
||||
|
||||
const [, init] = fetchMock.mock.calls[0] as [string, RequestInit];
|
||||
expect(JSON.parse(init.body as string).output_format).toBe("ulaw_8000");
|
||||
expect(JSON.parse(init.body as string)).toMatchObject({
|
||||
voice_id: "override-voice",
|
||||
output_format: "ulaw_8000",
|
||||
});
|
||||
expect(result.outputFormat).toBe("ulaw_8000");
|
||||
expect(result.sampleRate).toBe(8_000);
|
||||
expect(result.audioBuffer).toEqual(audioData);
|
||||
|
||||
@@ -96,6 +96,7 @@ export function buildGradiumSpeechProvider(): SpeechProviderPlugin {
|
||||
},
|
||||
synthesizeTelephony: async (req) => {
|
||||
const config = readGradiumProviderConfig(req.providerConfig);
|
||||
const overrides = req.providerOverrides ?? {};
|
||||
const apiKey = config.apiKey || process.env.GRADIUM_API_KEY;
|
||||
if (!apiKey) {
|
||||
throw new Error("Gradium API key missing");
|
||||
@@ -106,7 +107,7 @@ export function buildGradiumSpeechProvider(): SpeechProviderPlugin {
|
||||
text: req.text,
|
||||
apiKey,
|
||||
baseUrl: config.baseUrl,
|
||||
voiceId: config.voiceId,
|
||||
voiceId: trimToUndefined(overrides.voiceId) ?? config.voiceId,
|
||||
outputFormat,
|
||||
timeoutMs: req.timeoutMs,
|
||||
});
|
||||
|
||||
@@ -190,6 +190,7 @@ describe("buildInworldSpeechProvider", () => {
|
||||
text: "Hello",
|
||||
cfg: {} as never,
|
||||
providerConfig: { apiKey: "key", voiceId: "Sarah", modelId: "inworld-tts-1.5-max" },
|
||||
providerOverrides: { voice: "Ashley", model: "inworld-tts-1.5-mini", temperature: 0.6 },
|
||||
timeoutMs: 30_000,
|
||||
});
|
||||
|
||||
@@ -197,11 +198,11 @@ describe("buildInworldSpeechProvider", () => {
|
||||
text: "Hello",
|
||||
apiKey: "key",
|
||||
baseUrl: "https://api.inworld.ai",
|
||||
voiceId: "Sarah",
|
||||
modelId: "inworld-tts-1.5-max",
|
||||
voiceId: "Ashley",
|
||||
modelId: "inworld-tts-1.5-mini",
|
||||
audioEncoding: "PCM",
|
||||
sampleRateHertz: 22_050,
|
||||
temperature: undefined,
|
||||
temperature: 0.6,
|
||||
timeoutMs: 30_000,
|
||||
});
|
||||
expect(result).toEqual({
|
||||
|
||||
@@ -197,6 +197,7 @@ export function buildInworldSpeechProvider(): SpeechProviderPlugin {
|
||||
},
|
||||
synthesizeTelephony: async (req) => {
|
||||
const config = readInworldProviderConfig(req.providerConfig);
|
||||
const overrides = readInworldOverrides(req.providerOverrides);
|
||||
const apiKey = config.apiKey || process.env.INWORLD_API_KEY;
|
||||
if (!apiKey) {
|
||||
throw new Error("Inworld API key missing");
|
||||
@@ -207,11 +208,11 @@ export function buildInworldSpeechProvider(): SpeechProviderPlugin {
|
||||
text: req.text,
|
||||
apiKey,
|
||||
baseUrl: config.baseUrl,
|
||||
voiceId: config.voiceId,
|
||||
modelId: config.modelId,
|
||||
voiceId: overrides.voiceId ?? config.voiceId,
|
||||
modelId: overrides.modelId ?? config.modelId,
|
||||
audioEncoding: "PCM",
|
||||
sampleRateHertz: sampleRate,
|
||||
temperature: config.temperature,
|
||||
temperature: overrides.temperature ?? config.temperature,
|
||||
timeoutMs: req.timeoutMs,
|
||||
});
|
||||
|
||||
|
||||
@@ -68,4 +68,39 @@ describe("xai speech provider", () => {
|
||||
}),
|
||||
);
|
||||
});
|
||||
|
||||
// Verifies that xAI telephony synthesis applies providerOverrides
// (voice, language, speed) in preference to the providerConfig values.
it("honors voice, language, and speed overrides for telephony output", async () => {
|
||||
const provider = buildXaiSpeechProvider();
|
||||
const result = await provider.synthesizeTelephony?.({
|
||||
text: "hello",
|
||||
cfg: {},
|
||||
// Configured defaults; the final assertion expects the overrides below to replace them.
providerConfig: {
|
||||
apiKey: "xai-key",
|
||||
baseUrl: "https://api.x.ai/v1",
|
||||
voiceId: "eve",
|
||||
language: "en",
|
||||
speed: 1,
|
||||
},
|
||||
// Note the override key is "voice", while the TTS call is asserted to receive it as voiceId.
providerOverrides: {
|
||||
voice: "aura",
|
||||
language: "es",
|
||||
speed: 1.2,
|
||||
},
|
||||
timeoutMs: 5_000,
|
||||
});
|
||||
|
||||
// The telephony result is expected to be PCM reported at 24 kHz.
expect(result).toEqual({
|
||||
audioBuffer: Buffer.from("audio-bytes"),
|
||||
outputFormat: "pcm",
|
||||
sampleRate: 24_000,
|
||||
});
|
||||
// Partial match: only the overridden fields and the response format are pinned here.
expect(xaiTTSMock).toHaveBeenLastCalledWith(
|
||||
expect.objectContaining({
|
||||
voiceId: "aura",
|
||||
language: "es",
|
||||
speed: 1.2,
|
||||
responseFormat: "pcm",
|
||||
}),
|
||||
);
|
||||
});
|
||||
});
|
||||
|
||||
@@ -230,6 +230,7 @@ export function buildXaiSpeechProvider(): SpeechProviderPlugin {
|
||||
},
|
||||
synthesizeTelephony: async (req) => {
|
||||
const config = readXaiProviderConfig(req.providerConfig);
|
||||
const overrides = readXaiOverrides(req.providerOverrides);
|
||||
const apiKey = config.apiKey || process.env.XAI_API_KEY;
|
||||
if (!apiKey) {
|
||||
throw new Error("xAI API key missing");
|
||||
@@ -240,9 +241,9 @@ export function buildXaiSpeechProvider(): SpeechProviderPlugin {
|
||||
text: req.text,
|
||||
apiKey,
|
||||
baseUrl: config.baseUrl,
|
||||
voiceId: config.voiceId,
|
||||
language: config.language,
|
||||
speed: config.speed,
|
||||
voiceId: overrides.voiceId ?? config.voiceId,
|
||||
language: overrides.language ?? config.language,
|
||||
speed: overrides.speed ?? config.speed,
|
||||
responseFormat: outputFormat,
|
||||
timeoutMs: req.timeoutMs,
|
||||
});
|
||||
|
||||
Reference in New Issue
Block a user