fix(whatsapp): send voice note text separately

This commit is contained in:
Peter Steinberger
2026-04-25 18:54:33 +01:00
parent 617e1dd6bf
commit 9ffe764416
9 changed files with 64 additions and 16 deletions

View File

@@ -41,6 +41,8 @@ Docs: https://docs.openclaw.ai
- Browser/CDP: honor configured remote and `attachOnly` CDP HTTP/WebSocket
timeouts when opening tabs through raw CDP or `/json/new` fallback. (#54238)
Thanks @FuncWei.
- WhatsApp/TTS: send visible text separately from PTT voice-note audio instead
of relying on hidden voice-note captions. Fixes #51081.
- Agents/TTS: preserve `[[audio_as_voice]]` directives on trusted text
tool-result `MEDIA:` payloads so generated audio still delivers as a voice
note. (#46535) Thanks @azade-c.

View File

@@ -365,7 +365,7 @@ When the linked self number is also present in `allowFrom`, WhatsApp self-chat s
- non-Ogg audio, including Microsoft Edge TTS MP3/WebM output, is transcoded to Ogg/Opus before PTT delivery
- native Ogg/Opus audio is sent with `audio/ogg; codecs=opus` for voice-note compatibility
- animated GIF playback is supported via `gifPlayback: true` on video sends
- captions are applied to the first media item when sending multi-media reply payloads
- captions are applied to the first media item when sending multi-media reply payloads, except for PTT voice notes, which send the audio first and the visible text as a separate message because WhatsApp clients do not render voice-note captions consistently
- media source can be HTTP(S), `file://`, or local paths
</Accordion>

View File

@@ -664,6 +664,8 @@ reply delivery. When the channel is Feishu, Matrix, Telegram, or WhatsApp,
the audio is delivered as a voice message rather than a file attachment.
Feishu can transcode non-Opus TTS output on this path when `ffmpeg` is
available.
WhatsApp sends visible text separately from PTT voice-note audio because clients
do not consistently render captions on voice notes.
It accepts optional `channel` and `timeoutMs` fields; `timeoutMs` is a
per-call provider request timeout in milliseconds.

View File

@@ -91,6 +91,12 @@ function mockFirstReplyFailureWithWrappedError(msg: WebInboundMsg, message: stri
});
}
/**
 * Asserts that `msg.sendMedia` was called at least once with a defined first
 * argument, and returns that argument so the caller can inspect it further.
 */
function expectFirstSendMediaPayload(msg: WebInboundMsg) {
  const sendMediaMock = vi.mocked(msg.sendMedia);
  const firstCall = sendMediaMock.mock.calls[0];
  // Optional chaining keeps this safe when sendMedia was never invoked;
  // the expectation below then reports the failure.
  const firstPayload = firstCall?.[0];
  expect(firstPayload).toBeDefined();
  return firstPayload;
}
function mockSecondReplySuccess(msg: WebInboundMsg) {
(msg.reply as unknown as { mockResolvedValueOnce: (v: unknown) => void }).mockResolvedValueOnce(
undefined,
@@ -524,14 +530,14 @@ describe("deliverWebReply", () => {
audio: expect.any(Buffer),
ptt: true,
mimetype: "audio/ogg; codecs=opus",
caption: "caption",
}),
undefined,
);
expect(msg.reply).not.toHaveBeenCalled();
expect(expectFirstSendMediaPayload(msg)).not.toHaveProperty("caption");
expect(msg.reply).toHaveBeenCalledWith("caption", undefined);
});
it("sends audio media as ptt voice note", async () => {
it("sends audio media as ptt voice note with visible text separately", async () => {
const msg = makeMsg();
(
loadWebMedia as unknown as { mockResolvedValueOnce: (v: unknown) => void }
@@ -555,10 +561,11 @@ describe("deliverWebReply", () => {
audio: expect.any(Buffer),
ptt: true,
mimetype: "audio/ogg; codecs=opus",
caption: "cap",
}),
undefined,
);
expect(expectFirstSendMediaPayload(msg)).not.toHaveProperty("caption");
expect(msg.reply).toHaveBeenCalledWith("cap", undefined);
});
it("transcodes mp3 audio media before sending a ptt voice note", async () => {
@@ -594,10 +601,11 @@ describe("deliverWebReply", () => {
audio: Buffer.from("opus-output"),
ptt: true,
mimetype: "audio/ogg; codecs=opus",
caption: "cap",
}),
undefined,
);
expect(expectFirstSendMediaPayload(msg)).not.toHaveProperty("caption");
expect(msg.reply).toHaveBeenCalledWith("cap", undefined);
});
it("sends video media", async () => {

View File

@@ -156,12 +156,14 @@ export async function deliverWebReply(params: {
audio: media.buffer,
ptt: true,
mimetype: media.mimetype,
caption,
},
quote,
),
"media:audio",
);
if (caption) {
await sendWithRetry(() => msg.reply(caption, quote), "media:audio-text");
}
} else if (media.kind === "video") {
const quote = getQuote();
await sendWithRetry(

View File

@@ -100,6 +100,23 @@ describe("createWebSendApi", () => {
});
});
it("sends visible text separately from push-to-talk voice notes", async () => {
  // Audio sent together with text should produce two socket sends:
  // the PTT voice note first, then the text as its own message.
  const voiceAudio = Buffer.from("aud");
  const recipientJid = "1555@s.whatsapp.net";
  const visibleText = "voice text";

  await api.sendMessage("+1555", visibleText, voiceAudio, "audio/ogg");

  const expectedVoiceNote = expect.objectContaining({
    audio: voiceAudio,
    ptt: true,
    mimetype: "audio/ogg",
  });
  expect(sendMessage).toHaveBeenNthCalledWith(1, recipientJid, expectedVoiceNote);
  expect(sendMessage).toHaveBeenNthCalledWith(2, recipientJid, { text: visibleText });
});
it("supports video media and gifPlayback option", async () => {
const payload = Buffer.from("vid");
await api.sendMessage("+1555", "cap", payload, "video/mp4", { gifPlayback: true });

View File

@@ -85,6 +85,14 @@ export function createWebSendApi(params: {
const result = quotedOpts
? await params.sock.sendMessage(jid, payload, quotedOpts)
: await params.sock.sendMessage(jid, payload);
if (mediaBuffer && mediaType?.startsWith("audio/") && text.trim()) {
const textPayload: AnyMessageContent = { text };
if (quotedOpts) {
await params.sock.sendMessage(jid, textPayload, quotedOpts);
} else {
await params.sock.sendMessage(jid, textPayload);
}
}
const accountId = sendOptions?.accountId ?? params.defaultAccountId;
recordWhatsAppOutbound(accountId);
const messageId = resolveOutboundMessageId(result);

View File

@@ -245,12 +245,8 @@ describe("web outbound", () => {
cfg: WHATSAPP_TEST_CFG,
mediaUrl: "/tmp/voice.ogg",
});
expect(sendMessage).toHaveBeenLastCalledWith(
"+1555",
"voice note",
buf,
"audio/ogg; codecs=opus",
);
expect(sendMessage).toHaveBeenNthCalledWith(1, "+1555", "", buf, "audio/ogg; codecs=opus");
expect(sendMessage).toHaveBeenNthCalledWith(2, "+1555", "voice note", undefined, undefined);
});
it.each([
@@ -274,12 +270,14 @@ describe("web outbound", () => {
expect(hoisted.runFfmpeg).toHaveBeenCalledWith(
expect.arrayContaining(["-c:a", "libopus", "-ar", "48000", "-b:a", "64k"]),
);
expect(sendMessage).toHaveBeenLastCalledWith(
expect(sendMessage).toHaveBeenNthCalledWith(
1,
"+1555",
"voice note",
"",
Buffer.from("opus-output"),
"audio/ogg; codecs=opus",
);
expect(sendMessage).toHaveBeenNthCalledWith(2, "+1555", "voice note", undefined, undefined);
});
it("maps video with caption", async () => {

View File

@@ -115,6 +115,7 @@ export async function sendMessageWhatsApp(
let mediaBuffer: Buffer | undefined;
let mediaType: string | undefined;
let documentFileName: string | undefined;
let visibleTextAfterVoice: string | undefined;
if (primaryMediaUrl) {
const media = await prepareWhatsAppOutboundMedia(
await loadOutboundMediaFromUrl(primaryMediaUrl, {
@@ -128,7 +129,10 @@ export async function sendMessageWhatsApp(
const caption = text || undefined;
mediaBuffer = media.buffer;
mediaType = media.mimetype;
if (media.kind === "document") {
if (media.kind === "audio" && caption) {
visibleTextAfterVoice = caption;
text = "";
} else if (media.kind === "document") {
text = caption ?? "";
documentFileName = media.fileName;
} else {
@@ -152,6 +156,13 @@ export async function sendMessageWhatsApp(
const result = sendOptions
? await active.sendMessage(to, text, mediaBuffer, mediaType, sendOptions)
: await active.sendMessage(to, text, mediaBuffer, mediaType);
if (visibleTextAfterVoice) {
if (sendOptions) {
await active.sendMessage(to, visibleTextAfterVoice, undefined, undefined, sendOptions);
} else {
await active.sendMessage(to, visibleTextAfterVoice, undefined, undefined);
}
}
const messageId = (result as { messageId?: string })?.messageId ?? "unknown";
const durationMs = Date.now() - startedAt;
outboundLog.info(