mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-06 06:20:43 +00:00
fix(tts): route WhatsApp MP3 TTS as voice notes
This commit is contained in:
@@ -122,6 +122,7 @@ Docs: https://docs.openclaw.ai
|
||||
- CLI/agents: keep `openclaw agents list --json` on the config-only path by default, avoiding bundled plugin loading unless callers request `--bindings`. Fixes #71739. Thanks @kaloster.
|
||||
- Plugins/install: force plugin dependency installs to stay project-local even when inherited npm config requests global installs, so successful installs still materialize the plugin's staged `node_modules`.
|
||||
- Providers/Google: transcode Gemini TTS PCM to Opus for voice-note targets so WhatsApp and other native voice-note replies can play as voice messages.
|
||||
- TTS/WhatsApp: mark non-Opus provider output as voice-note intent so channel delivery transcodes MP3/WebM replies to Ogg/Opus PTT audio.
|
||||
- Plugins/runtime deps: reuse existing external bundled-plugin stage roots when mirrored plugin roots are inspected again, avoiding second-generation `openclaw-unknown-*` stages and repeated first-turn restaging. Fixes #71599.
|
||||
- iOS/macOS Talk Mode: allow `talk.speechLocale` to set the speech recognition locale for non-English voice conversations. Fixes #44688.
|
||||
- Plugins/providers: honor explicit plugin candidate lists instead of reading a persisted registry snapshot from local state, keeping candidate-scoped provider discovery hermetic.
|
||||
|
||||
@@ -754,10 +754,11 @@ These override the effective config from `messages.tts` plus the active
|
||||
|
||||
- **Feishu / Matrix / Telegram / WhatsApp**: voice-note replies prefer Opus (`opus_48000_64` from ElevenLabs, `opus` from OpenAI).
|
||||
- 48kHz / 64kbps is a good quality/size tradeoff for voice messages.
|
||||
- **Feishu**: when a voice-note reply is produced as MP3/WAV/M4A or another
|
||||
likely audio file, the Feishu plugin transcodes it to 48kHz Ogg/Opus with
|
||||
`ffmpeg` before sending the native `audio` bubble. If conversion fails, Feishu
|
||||
receives the original file as an attachment.
|
||||
- **Feishu / WhatsApp**: when a voice-note reply is produced as MP3/WAV/M4A or
|
||||
another likely audio file, the channel plugin transcodes it to 48kHz Ogg/Opus
|
||||
with `ffmpeg` before sending the native voice message. If conversion fails,
|
||||
Feishu receives the original file as an attachment; the WhatsApp send fails rather
|
||||
than posting an incompatible PTT payload.
|
||||
- **Other channels**: MP3 (`mp3_44100_128` from ElevenLabs, `mp3` from OpenAI).
|
||||
- 44.1kHz / 128kbps is the default balance for speech clarity.
|
||||
- **MiniMax**: MP3 (`speech-2.8-hd` model, 32kHz sample rate) for normal audio attachments. For voice-note targets such as Feishu and Telegram, OpenClaw transcodes the MiniMax MP3 to 48kHz Opus with `ffmpeg` before delivery.
|
||||
@@ -844,8 +845,8 @@ Notes:
|
||||
The `tts` tool converts text to speech and returns an audio attachment for
|
||||
reply delivery. When the channel is Feishu, Matrix, Telegram, or WhatsApp,
|
||||
the audio is delivered as a voice message rather than a file attachment.
|
||||
Feishu can transcode non-Opus TTS output on this path when `ffmpeg` is
|
||||
available.
|
||||
Feishu and WhatsApp can transcode non-Opus TTS output on this path when
|
||||
`ffmpeg` is available.
|
||||
WhatsApp sends visible text separately from PTT voice-note audio because WhatsApp clients
|
||||
do not consistently render captions on voice notes.
|
||||
It accepts optional `channel` and `timeoutMs` fields; `timeoutMs` is a
|
||||
|
||||
@@ -71,7 +71,12 @@ async function expectTtsPayloadResult(params: {
|
||||
text: string;
|
||||
target: "voice-note" | "audio-file";
|
||||
audioAsVoice: true | undefined;
|
||||
providerResult?: MockSpeechSynthesisResult;
|
||||
mediaExtension?: string;
|
||||
}) {
|
||||
if (params.providerResult) {
|
||||
synthesizeMock.mockResolvedValueOnce(params.providerResult);
|
||||
}
|
||||
const cfg = createTtsConfig(params.prefsName);
|
||||
let mediaDir: string | undefined;
|
||||
try {
|
||||
@@ -84,7 +89,7 @@ async function expectTtsPayloadResult(params: {
|
||||
|
||||
expect(synthesizeMock).toHaveBeenCalledWith(expect.objectContaining({ target: params.target }));
|
||||
expect(result.audioAsVoice).toBe(params.audioAsVoice);
|
||||
expect(result.mediaUrl).toMatch(/voice-\d+\.ogg$/);
|
||||
expect(result.mediaUrl).toMatch(new RegExp(`voice-\\d+\\.${params.mediaExtension ?? "ogg"}$`));
|
||||
|
||||
mediaDir = result.mediaUrl ? path.dirname(result.mediaUrl) : undefined;
|
||||
} finally {
|
||||
@@ -118,35 +123,26 @@ describe("speech-core native voice-note routing", () => {
|
||||
});
|
||||
});
|
||||
|
||||
it("marks Feishu voice-note TTS for channel-side transcoding when provider returns mp3", async () => {
|
||||
synthesizeMock.mockResolvedValueOnce({
|
||||
audioBuffer: Buffer.from("mp3"),
|
||||
outputFormat: "mp3",
|
||||
fileExtension: ".mp3",
|
||||
voiceCompatible: false,
|
||||
});
|
||||
const cfg = createTtsConfig("openclaw-speech-core-tts-feishu-mp3-test");
|
||||
let mediaDir: string | undefined;
|
||||
try {
|
||||
const result = await maybeApplyTtsToPayload({
|
||||
payload: { text: "This Feishu reply should be transcoded by the channel." },
|
||||
cfg,
|
||||
channel: "feishu",
|
||||
kind: "final",
|
||||
it.each(["feishu", "whatsapp"] as const)(
|
||||
"marks %s voice-note TTS for channel-side transcoding when provider returns mp3",
|
||||
async (channel) => {
|
||||
expect(_test.supportsTranscodedVoiceNoteTts(channel)).toBe(true);
|
||||
await expectTtsPayloadResult({
|
||||
channel,
|
||||
prefsName: `openclaw-speech-core-tts-${channel}-mp3-test`,
|
||||
text: `This ${channel} reply should be transcoded by the channel.`,
|
||||
target: "voice-note",
|
||||
audioAsVoice: true,
|
||||
mediaExtension: "mp3",
|
||||
providerResult: {
|
||||
audioBuffer: Buffer.from("mp3"),
|
||||
outputFormat: "mp3",
|
||||
fileExtension: ".mp3",
|
||||
voiceCompatible: false,
|
||||
},
|
||||
});
|
||||
|
||||
expect(synthesizeMock).toHaveBeenCalledWith(
|
||||
expect.objectContaining({ target: "voice-note" }),
|
||||
);
|
||||
expect(result.audioAsVoice).toBe(true);
|
||||
expect(result.mediaUrl).toMatch(/voice-\d+\.mp3$/);
|
||||
mediaDir = result.mediaUrl ? path.dirname(result.mediaUrl) : undefined;
|
||||
} finally {
|
||||
if (mediaDir) {
|
||||
rmSync(mediaDir, { recursive: true, force: true });
|
||||
}
|
||||
}
|
||||
});
|
||||
},
|
||||
);
|
||||
|
||||
it("keeps non-native voice-note channels as regular audio files", async () => {
|
||||
await expectTtsPayloadResult({
|
||||
|
||||
@@ -640,7 +640,7 @@ export function setLastTtsAttempt(entry: TtsStatusEntry | undefined): void {
|
||||
}
|
||||
|
||||
const OPUS_CHANNELS = new Set(["telegram", "feishu", "whatsapp", "matrix", "discord"]);
|
||||
const TRANSCODED_VOICE_NOTE_CHANNELS = new Set(["feishu"]);
|
||||
const TRANSCODED_VOICE_NOTE_CHANNELS = new Set(["feishu", "whatsapp"]);
|
||||
|
||||
function resolveChannelId(channel: string | undefined): ChannelId | null {
|
||||
return channel ? normalizeChannelId(channel) : null;
|
||||
|
||||
Reference in New Issue
Block a user