diff --git a/apps/android/app/src/main/java/ai/openclaw/app/voice/TalkModeManager.kt b/apps/android/app/src/main/java/ai/openclaw/app/voice/TalkModeManager.kt index be62498e24e..d4433d72a9c 100644 --- a/apps/android/app/src/main/java/ai/openclaw/app/voice/TalkModeManager.kt +++ b/apps/android/app/src/main/java/ai/openclaw/app/voice/TalkModeManager.kt @@ -723,6 +723,9 @@ class TalkModeManager( TalkModeRuntime.validatedLanguage(directive?.language)?.let { put("language", JsonPrimitive(it)) } + directive?.outputFormat?.trim()?.takeIf { it.isNotEmpty() }?.let { + put("outputFormat", JsonPrimitive(it)) + } } val res = session.request("talk.speak", params.toString()) val root = json.parseToJsonElement(res).asObjectOrNull() ?: error("talk.speak returned invalid JSON") diff --git a/src/gateway/protocol/schema/channels.ts b/src/gateway/protocol/schema/channels.ts index 923432c7ac8..52f5ad597bc 100644 --- a/src/gateway/protocol/schema/channels.ts +++ b/src/gateway/protocol/schema/channels.ts @@ -21,6 +21,7 @@ export const TalkSpeakParamsSchema = Type.Object( text: NonEmptyString, voiceId: Type.Optional(Type.String()), modelId: Type.Optional(Type.String()), + outputFormat: Type.Optional(Type.String()), speed: Type.Optional(Type.Number()), stability: Type.Optional(Type.Number()), similarity: Type.Optional(Type.Number()), diff --git a/src/gateway/server-methods/talk.ts b/src/gateway/server-methods/talk.ts index 85f78e91b6a..acbede0b33d 100644 --- a/src/gateway/server-methods/talk.ts +++ b/src/gateway/server-methods/talk.ts @@ -69,7 +69,13 @@ function resolveTalkVoiceId( if (!aliases) { return requested; } - return aliases[normalizeAliasKey(requested)] ?? requested; + const normalizedRequested = normalizeAliasKey(requested); + for (const [alias, voiceId] of Object.entries(aliases)) { + if (normalizeAliasKey(alias) === normalizedRequested) { + return voiceId; + } + } + return requested; } function readTalkVoiceSettings( @@ -189,6 +195,7 @@ function buildTalkSpeakOverrides( ): TtsDirectiveOverrides { const voiceId = resolveTalkVoiceId(providerConfig, trimString(params.voiceId)); const modelId = trimString(params.modelId); + const outputFormat = trimString(params.outputFormat); const speed = finiteNumber(params.speed); const seed = finiteNumber(params.seed); const normalize = normalizeTextNormalization(params.normalize); @@ -212,6 +219,7 @@ function buildTalkSpeakOverrides( overrides.elevenlabs = { ...(voiceId == null ? {} : { voiceId }), ...(modelId == null ? {} : { modelId }), + ...(outputFormat == null ? {} : { outputFormat }), ...(seed == null ? {} : { seed }), ...(normalize == null ? {} : { applyTextNormalization: normalize }), ...(language == null ? {} : { languageCode: language }), @@ -230,7 +238,10 @@ function buildTalkSpeakOverrides( } if (provider === "microsoft") { - overrides.microsoft = voiceId == null ? undefined : { voice: voiceId }; + overrides.microsoft = { + ...(voiceId == null ? {} : { voice: voiceId }), + ...(outputFormat == null ? {} : { outputFormat }), + }; } return overrides; diff --git a/src/gateway/server.talk-config.test.ts b/src/gateway/server.talk-config.test.ts index eb2925db158..6433445795f 100644 --- a/src/gateway/server.talk-config.test.ts +++ b/src/gateway/server.talk-config.test.ts @@ -301,4 +301,51 @@ describe("gateway talk.config", () => { globalThis.fetch = originalFetch; } }); + + it("resolves talk voice aliases case-insensitively and forwards output format", async () => { + const { writeConfigFile } = await import("../config/config.js"); + await writeConfigFile({ + talk: { + provider: "elevenlabs", + providers: { + elevenlabs: { + apiKey: "elevenlabs-talk-key", // pragma: allowlist secret + voiceId: "voice-default", + voiceAliases: { + Clawd: "EXAVITQu4vr4xnSDxMaL", + }, + }, + }, + }, + }); + + const originalFetch = globalThis.fetch; + let fetchUrl: string | undefined; + const fetchMock = vi.fn(async (input: RequestInfo | URL) => { + fetchUrl = typeof input === "string" ? input : input instanceof URL ? input.href : input.url; + return new Response(new Uint8Array([4, 5, 6]), { status: 200 }); + }); + globalThis.fetch = fetchMock as typeof fetch; + + try { + await withServer(async (ws) => { + await connectOperator(ws, ["operator.read", "operator.write"]); + const res = await fetchTalkSpeak(ws, { + text: "Hello from talk mode.", + voiceId: "clawd", + outputFormat: "pcm_44100", + }); + expect(res.ok).toBe(true); + expect(res.payload?.provider).toBe("elevenlabs"); + expect(res.payload?.outputFormat).toBe("pcm_44100"); + expect(res.payload?.audioBase64).toBe(Buffer.from([4, 5, 6]).toString("base64")); + }); + + expect(fetchMock).toHaveBeenCalled(); + expect(fetchUrl).toContain("/v1/text-to-speech/EXAVITQu4vr4xnSDxMaL"); + expect(fetchUrl).toContain("output_format=pcm_44100"); + } finally { + globalThis.fetch = originalFetch; + } + }); }); diff --git a/src/tts/providers/elevenlabs.ts b/src/tts/providers/elevenlabs.ts index c22425926bf..99097fc42f3 100644 --- a/src/tts/providers/elevenlabs.ts +++ b/src/tts/providers/elevenlabs.ts @@ -72,7 +72,9 @@ export function buildElevenLabsSpeechProvider(): SpeechProviderPlugin { if (!apiKey) { throw new Error("ElevenLabs API key missing"); } - const outputFormat = req.target === "voice-note" ? "opus_48000_64" : "mp3_44100_128"; + const outputFormat = + req.overrides?.elevenlabs?.outputFormat ?? + (req.target === "voice-note" ? "opus_48000_64" : "mp3_44100_128"); const audioBuffer = await elevenLabsTTS({ text: req.text, apiKey, diff --git a/src/tts/providers/microsoft.ts b/src/tts/providers/microsoft.ts index ba2511e4de6..f6c5aa8c379 100644 --- a/src/tts/providers/microsoft.ts +++ b/src/tts/providers/microsoft.ts @@ -83,7 +83,7 @@ export function buildMicrosoftSpeechProvider(): SpeechProviderPlugin { const tempRoot = resolvePreferredOpenClawTmpDir(); mkdirSync(tempRoot, { recursive: true, mode: 0o700 }); const tempDir = mkdtempSync(path.join(tempRoot, "tts-microsoft-")); - let outputFormat = req.config.edge.outputFormat; + let outputFormat = req.overrides?.microsoft?.outputFormat ?? req.config.edge.outputFormat; const fallbackOutputFormat = outputFormat !== DEFAULT_EDGE_OUTPUT_FORMAT ? DEFAULT_EDGE_OUTPUT_FORMAT : undefined; diff --git a/src/tts/tts.ts b/src/tts/tts.ts index c64dda83909..17a7c2fc981 100644 --- a/src/tts/tts.ts +++ b/src/tts/tts.ts @@ -167,6 +167,7 @@ export type TtsDirectiveOverrides = { elevenlabs?: { voiceId?: string; modelId?: string; + outputFormat?: string; seed?: number; applyTextNormalization?: "auto" | "on" | "off"; languageCode?: string; @@ -174,6 +175,7 @@ export type TtsDirectiveOverrides = { }; microsoft?: { voice?: string; + outputFormat?: string; }; };