diff --git a/CHANGELOG.md b/CHANGELOG.md index aa7636f0a07..595366d2e35 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -122,6 +122,7 @@ Docs: https://docs.openclaw.ai - CLI/agents: keep `openclaw agents list --json` on the config-only path by default, avoiding bundled plugin loading unless callers request `--bindings`. Fixes #71739. Thanks @kaloster. - Plugins/install: force plugin dependency installs to stay project-local even when inherited npm config requests global installs, so successful installs still materialize the plugin's staged `node_modules`. - Providers/Google: transcode Gemini TTS PCM to Opus for voice-note targets so WhatsApp and other native voice-note replies can play as voice messages. +- TTS/WhatsApp: mark non-Opus provider output as voice-note intent so channel delivery transcodes MP3/WebM replies to Ogg/Opus PTT audio. - Plugins/runtime deps: reuse existing external bundled-plugin stage roots when mirrored plugin roots are inspected again, avoiding second-generation `openclaw-unknown-*` stages and repeated first-turn restaging. Fixes #71599. - iOS/macOS Talk Mode: allow `talk.speechLocale` to set the speech recognition locale for non-English voice conversations. Fixes #44688. - Plugins/providers: honor explicit plugin candidate lists instead of reading a persisted registry snapshot from local state, keeping candidate-scoped provider discovery hermetic. diff --git a/docs/tools/tts.md b/docs/tools/tts.md index 7e2e9adc8ff..3d6ea28e21d 100644 --- a/docs/tools/tts.md +++ b/docs/tools/tts.md @@ -754,10 +754,11 @@ These override the effective config from `messages.tts` plus the active - **Feishu / Matrix / Telegram / WhatsApp**: voice-note replies prefer Opus (`opus_48000_64` from ElevenLabs, `opus` from OpenAI). - 48kHz / 64kbps is a good voice message tradeoff. 
-- **Feishu**: when a voice-note reply is produced as MP3/WAV/M4A or another - likely audio file, the Feishu plugin transcodes it to 48kHz Ogg/Opus with - `ffmpeg` before sending the native `audio` bubble. If conversion fails, Feishu - receives the original file as an attachment. +- **Feishu / WhatsApp**: when a voice-note reply is produced as MP3/WAV/M4A or + another likely audio file, the channel plugin transcodes it to 48kHz Ogg/Opus + with `ffmpeg` before sending the native voice message. If conversion fails, + Feishu receives the original file as an attachment; the WhatsApp send fails rather + than posting an incompatible PTT payload. - **Other channels**: MP3 (`mp3_44100_128` from ElevenLabs, `mp3` from OpenAI). - 44.1kHz / 128kbps is the default balance for speech clarity. - **MiniMax**: MP3 (`speech-2.8-hd` model, 32kHz sample rate) for normal audio attachments. For voice-note targets such as Feishu and Telegram, OpenClaw transcodes the MiniMax MP3 to 48kHz Opus with `ffmpeg` before delivery. @@ -844,8 +845,8 @@ Notes: The `tts` tool converts text to speech and returns an audio attachment for reply delivery. When the channel is Feishu, Matrix, Telegram, or WhatsApp, the audio is delivered as a voice message rather than a file attachment. -Feishu can transcode non-Opus TTS output on this path when `ffmpeg` is -available. +Feishu and WhatsApp can transcode non-Opus TTS output on this path when +`ffmpeg` is available. WhatsApp sends visible text separately from PTT voice-note audio because clients do not consistently render captions on voice notes. 
It accepts optional `channel` and `timeoutMs` fields; `timeoutMs` is a diff --git a/extensions/speech-core/src/tts.test.ts b/extensions/speech-core/src/tts.test.ts index 149ecb84a31..0249d178467 100644 --- a/extensions/speech-core/src/tts.test.ts +++ b/extensions/speech-core/src/tts.test.ts @@ -71,7 +71,12 @@ async function expectTtsPayloadResult(params: { text: string; target: "voice-note" | "audio-file"; audioAsVoice: true | undefined; + providerResult?: MockSpeechSynthesisResult; + mediaExtension?: string; }) { + if (params.providerResult) { + synthesizeMock.mockResolvedValueOnce(params.providerResult); + } const cfg = createTtsConfig(params.prefsName); let mediaDir: string | undefined; try { @@ -84,7 +89,7 @@ async function expectTtsPayloadResult(params: { expect(synthesizeMock).toHaveBeenCalledWith(expect.objectContaining({ target: params.target })); expect(result.audioAsVoice).toBe(params.audioAsVoice); - expect(result.mediaUrl).toMatch(/voice-\d+\.ogg$/); + expect(result.mediaUrl).toMatch(new RegExp(`voice-\\d+\\.${params.mediaExtension ?? "ogg"}$`)); mediaDir = result.mediaUrl ? path.dirname(result.mediaUrl) : undefined; } finally { @@ -118,35 +123,26 @@ describe("speech-core native voice-note routing", () => { }); }); - it("marks Feishu voice-note TTS for channel-side transcoding when provider returns mp3", async () => { - synthesizeMock.mockResolvedValueOnce({ - audioBuffer: Buffer.from("mp3"), - outputFormat: "mp3", - fileExtension: ".mp3", - voiceCompatible: false, - }); - const cfg = createTtsConfig("openclaw-speech-core-tts-feishu-mp3-test"); - let mediaDir: string | undefined; - try { - const result = await maybeApplyTtsToPayload({ - payload: { text: "This Feishu reply should be transcoded by the channel." 
}, - cfg, - channel: "feishu", - kind: "final", + it.each(["feishu", "whatsapp"] as const)( + "marks %s voice-note TTS for channel-side transcoding when provider returns mp3", + async (channel) => { + expect(_test.supportsTranscodedVoiceNoteTts(channel)).toBe(true); + await expectTtsPayloadResult({ + channel, + prefsName: `openclaw-speech-core-tts-${channel}-mp3-test`, + text: `This ${channel} reply should be transcoded by the channel.`, + target: "voice-note", + audioAsVoice: true, + mediaExtension: "mp3", + providerResult: { + audioBuffer: Buffer.from("mp3"), + outputFormat: "mp3", + fileExtension: ".mp3", + voiceCompatible: false, + }, }); - - expect(synthesizeMock).toHaveBeenCalledWith( - expect.objectContaining({ target: "voice-note" }), - ); - expect(result.audioAsVoice).toBe(true); - expect(result.mediaUrl).toMatch(/voice-\d+\.mp3$/); - mediaDir = result.mediaUrl ? path.dirname(result.mediaUrl) : undefined; - } finally { - if (mediaDir) { - rmSync(mediaDir, { recursive: true, force: true }); - } - } - }); + }, + ); it("keeps non-native voice-note channels as regular audio files", async () => { await expectTtsPayloadResult({ diff --git a/extensions/speech-core/src/tts.ts b/extensions/speech-core/src/tts.ts index 0a29d0b1b8a..9133be9d1b1 100644 --- a/extensions/speech-core/src/tts.ts +++ b/extensions/speech-core/src/tts.ts @@ -640,7 +640,7 @@ export function setLastTtsAttempt(entry: TtsStatusEntry | undefined): void { } const OPUS_CHANNELS = new Set(["telegram", "feishu", "whatsapp", "matrix", "discord"]); -const TRANSCODED_VOICE_NOTE_CHANNELS = new Set(["feishu"]); +const TRANSCODED_VOICE_NOTE_CHANNELS = new Set(["feishu", "whatsapp"]); function resolveChannelId(channel: string | undefined): ChannelId | null { return channel ? normalizeChannelId(channel) : null;