diff --git a/CHANGELOG.md b/CHANGELOG.md index 4b4c4940022..04f0af64a26 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -41,6 +41,8 @@ Docs: https://docs.openclaw.ai - Browser/CDP: honor configured remote and `attachOnly` CDP HTTP/WebSocket timeouts when opening tabs through raw CDP or `/json/new` fallback. (#54238) Thanks @FuncWei. +- WhatsApp/TTS: send visible text separately from PTT voice-note audio instead + of relying on hidden voice-note captions. Fixes #51081. - Agents/TTS: preserve `[[audio_as_voice]]` directives on trusted text tool-result `MEDIA:` payloads so generated audio still delivers as a voice note. (#46535) Thanks @azade-c. diff --git a/docs/channels/whatsapp.md b/docs/channels/whatsapp.md index 74dfd52677e..239904d1ea5 100644 --- a/docs/channels/whatsapp.md +++ b/docs/channels/whatsapp.md @@ -365,7 +365,7 @@ When the linked self number is also present in `allowFrom`, WhatsApp self-chat s - non-Ogg audio, including Microsoft Edge TTS MP3/WebM output, is transcoded to Ogg/Opus before PTT delivery - native Ogg/Opus audio is sent with `audio/ogg; codecs=opus` for voice-note compatibility - animated GIF playback is supported via `gifPlayback: true` on video sends - - captions are applied to the first media item when sending multi-media reply payloads + - captions are applied to the first media item when sending multi-media reply payloads, except PTT voice notes send the audio first and visible text separately because WhatsApp clients do not render voice-note captions consistently - media source can be HTTP(S), `file://`, or local paths diff --git a/docs/tools/tts.md b/docs/tools/tts.md index 6012df221b3..0af905c65f8 100644 --- a/docs/tools/tts.md +++ b/docs/tools/tts.md @@ -664,6 +664,8 @@ reply delivery. When the channel is Feishu, Matrix, Telegram, or WhatsApp, the audio is delivered as a voice message rather than a file attachment. Feishu can transcode non-Opus TTS output on this path when `ffmpeg` is available. +WhatsApp sends visible text separately from PTT voice-note audio because clients +do not consistently render captions on voice notes. It accepts optional `channel` and `timeoutMs` fields; `timeoutMs` is a per-call provider request timeout in milliseconds. diff --git a/extensions/whatsapp/src/auto-reply/deliver-reply.test.ts b/extensions/whatsapp/src/auto-reply/deliver-reply.test.ts index 9124ba1869e..5bb4c62f2bf 100644 --- a/extensions/whatsapp/src/auto-reply/deliver-reply.test.ts +++ b/extensions/whatsapp/src/auto-reply/deliver-reply.test.ts @@ -91,6 +91,12 @@ function mockFirstReplyFailureWithWrappedError(msg: WebInboundMsg, message: stri }); } +function expectFirstSendMediaPayload(msg: WebInboundMsg) { + const payload = vi.mocked(msg.sendMedia).mock.calls[0]?.[0]; + expect(payload).toBeDefined(); + return payload; +} + function mockSecondReplySuccess(msg: WebInboundMsg) { (msg.reply as unknown as { mockResolvedValueOnce: (v: unknown) => void }).mockResolvedValueOnce( undefined, @@ -524,14 +530,14 @@ describe("deliverWebReply", () => { audio: expect.any(Buffer), ptt: true, mimetype: "audio/ogg; codecs=opus", - caption: "caption", }), undefined, ); - expect(msg.reply).not.toHaveBeenCalled(); + expect(expectFirstSendMediaPayload(msg)).not.toHaveProperty("caption"); + expect(msg.reply).toHaveBeenCalledWith("caption", undefined); }); - it("sends audio media as ptt voice note", async () => { + it("sends audio media as ptt voice note with visible text separately", async () => { const msg = makeMsg(); ( loadWebMedia as unknown as { mockResolvedValueOnce: (v: unknown) => void } @@ -555,10 +561,11 @@ describe("deliverWebReply", () => { audio: expect.any(Buffer), ptt: true, mimetype: "audio/ogg; codecs=opus", - caption: "cap", }), undefined, ); + expect(expectFirstSendMediaPayload(msg)).not.toHaveProperty("caption"); + expect(msg.reply).toHaveBeenCalledWith("cap", undefined); }); it("transcodes mp3 audio media before sending a ptt voice note", async () => { @@ -594,10 +601,11 @@ describe("deliverWebReply", () => { audio: Buffer.from("opus-output"), ptt: true, mimetype: "audio/ogg; codecs=opus", - caption: "cap", }), undefined, ); + expect(expectFirstSendMediaPayload(msg)).not.toHaveProperty("caption"); + expect(msg.reply).toHaveBeenCalledWith("cap", undefined); }); it("sends video media", async () => { diff --git a/extensions/whatsapp/src/auto-reply/deliver-reply.ts b/extensions/whatsapp/src/auto-reply/deliver-reply.ts index cfd16737df4..03768490536 100644 --- a/extensions/whatsapp/src/auto-reply/deliver-reply.ts +++ b/extensions/whatsapp/src/auto-reply/deliver-reply.ts @@ -156,12 +156,14 @@ export async function deliverWebReply(params: { audio: media.buffer, ptt: true, mimetype: media.mimetype, - caption, }, quote, ), "media:audio", ); + if (caption) { + await sendWithRetry(() => msg.reply(caption, quote), "media:audio-text"); + } } else if (media.kind === "video") { const quote = getQuote(); await sendWithRetry( diff --git a/extensions/whatsapp/src/inbound/send-api.test.ts b/extensions/whatsapp/src/inbound/send-api.test.ts index 7dec794a14c..b76779bc78b 100644 --- a/extensions/whatsapp/src/inbound/send-api.test.ts +++ b/extensions/whatsapp/src/inbound/send-api.test.ts @@ -100,6 +100,23 @@ describe("createWebSendApi", () => { }); }); + it("sends visible text separately from push-to-talk voice notes", async () => { + const payload = Buffer.from("aud"); + await api.sendMessage("+1555", "voice text", payload, "audio/ogg"); + expect(sendMessage).toHaveBeenNthCalledWith( + 1, + "1555@s.whatsapp.net", + expect.objectContaining({ + audio: payload, + ptt: true, + mimetype: "audio/ogg", + }), + ); + expect(sendMessage).toHaveBeenNthCalledWith(2, "1555@s.whatsapp.net", { + text: "voice text", + }); + }); + it("supports video media and gifPlayback option", async () => { const payload = Buffer.from("vid"); await api.sendMessage("+1555", "cap", payload, "video/mp4", { gifPlayback: true }); diff --git a/extensions/whatsapp/src/inbound/send-api.ts b/extensions/whatsapp/src/inbound/send-api.ts index 92ccef288ba..69aefcf9f8e 100644 --- a/extensions/whatsapp/src/inbound/send-api.ts +++ b/extensions/whatsapp/src/inbound/send-api.ts @@ -85,6 +85,14 @@ export function createWebSendApi(params: { const result = quotedOpts ? await params.sock.sendMessage(jid, payload, quotedOpts) : await params.sock.sendMessage(jid, payload); + if (mediaBuffer && mediaType?.startsWith("audio/") && text.trim()) { + const textPayload: AnyMessageContent = { text }; + if (quotedOpts) { + await params.sock.sendMessage(jid, textPayload, quotedOpts); + } else { + await params.sock.sendMessage(jid, textPayload); + } + } const accountId = sendOptions?.accountId ?? params.defaultAccountId; recordWhatsAppOutbound(accountId); const messageId = resolveOutboundMessageId(result); diff --git a/extensions/whatsapp/src/send.test.ts b/extensions/whatsapp/src/send.test.ts index 9d96bd9860d..0ccd2df1f50 100644 --- a/extensions/whatsapp/src/send.test.ts +++ b/extensions/whatsapp/src/send.test.ts @@ -245,12 +245,8 @@ describe("web outbound", () => { cfg: WHATSAPP_TEST_CFG, mediaUrl: "/tmp/voice.ogg", }); - expect(sendMessage).toHaveBeenLastCalledWith( - "+1555", - "voice note", - buf, - "audio/ogg; codecs=opus", - ); + expect(sendMessage).toHaveBeenNthCalledWith(1, "+1555", "", buf, "audio/ogg; codecs=opus"); + expect(sendMessage).toHaveBeenNthCalledWith(2, "+1555", "voice note", undefined, undefined); }); it.each([ @@ -274,12 +270,14 @@ describe("web outbound", () => { expect(hoisted.runFfmpeg).toHaveBeenCalledWith( expect.arrayContaining(["-c:a", "libopus", "-ar", "48000", "-b:a", "64k"]), ); - expect(sendMessage).toHaveBeenLastCalledWith( + expect(sendMessage).toHaveBeenNthCalledWith( + 1, "+1555", - "voice note", + "", Buffer.from("opus-output"), "audio/ogg; codecs=opus", ); + expect(sendMessage).toHaveBeenNthCalledWith(2, "+1555", "voice note", undefined, undefined); }); it("maps video with caption", async () => { diff --git a/extensions/whatsapp/src/send.ts b/extensions/whatsapp/src/send.ts index 4a54c82a6b1..da698c09b26 100644 --- a/extensions/whatsapp/src/send.ts +++ b/extensions/whatsapp/src/send.ts @@ -115,6 +115,7 @@ export async function sendMessageWhatsApp( let mediaBuffer: Buffer | undefined; let mediaType: string | undefined; let documentFileName: string | undefined; + let visibleTextAfterVoice: string | undefined; if (primaryMediaUrl) { const media = await prepareWhatsAppOutboundMedia( await loadOutboundMediaFromUrl(primaryMediaUrl, { @@ -128,7 +129,10 @@ export async function sendMessageWhatsApp( const caption = text || undefined; mediaBuffer = media.buffer; mediaType = media.mimetype; - if (media.kind === "document") { + if (media.kind === "audio" && caption) { + visibleTextAfterVoice = caption; + text = ""; + } else if (media.kind === "document") { text = caption ?? ""; documentFileName = media.fileName; } else { @@ -152,6 +156,13 @@ export async function sendMessageWhatsApp( const result = sendOptions ? await active.sendMessage(to, text, mediaBuffer, mediaType, sendOptions) : await active.sendMessage(to, text, mediaBuffer, mediaType); + if (visibleTextAfterVoice) { + if (sendOptions) { + await active.sendMessage(to, visibleTextAfterVoice, undefined, undefined, sendOptions); + } else { + await active.sendMessage(to, visibleTextAfterVoice, undefined, undefined); + } + } const messageId = (result as { messageId?: string })?.messageId ?? "unknown"; const durationMs = Date.now() - startedAt; outboundLog.info(