mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-06 06:20:43 +00:00
fix(whatsapp): send voice note text separately
This commit is contained in:
@@ -41,6 +41,8 @@ Docs: https://docs.openclaw.ai
|
||||
- Browser/CDP: honor configured remote and `attachOnly` CDP HTTP/WebSocket
|
||||
timeouts when opening tabs through raw CDP or `/json/new` fallback. (#54238)
|
||||
Thanks @FuncWei.
|
||||
- WhatsApp/TTS: send visible text separately from PTT voice-note audio instead
|
||||
of relying on hidden voice-note captions. Fixes #51081.
|
||||
- Agents/TTS: preserve `[[audio_as_voice]]` directives on trusted text
|
||||
tool-result `MEDIA:` payloads so generated audio still delivers as a voice
|
||||
note. (#46535) Thanks @azade-c.
|
||||
|
||||
@@ -365,7 +365,7 @@ When the linked self number is also present in `allowFrom`, WhatsApp self-chat s
|
||||
- non-Ogg audio, including Microsoft Edge TTS MP3/WebM output, is transcoded to Ogg/Opus before PTT delivery
|
||||
- native Ogg/Opus audio is sent with `audio/ogg; codecs=opus` for voice-note compatibility
|
||||
- animated GIF playback is supported via `gifPlayback: true` on video sends
|
||||
- captions are applied to the first media item when sending multi-media reply payloads
|
||||
- captions are applied to the first media item when sending multi-media reply payloads, except for PTT voice notes: the audio is sent first and the visible text is sent separately, because WhatsApp clients do not render voice-note captions consistently
|
||||
- media source can be HTTP(S), `file://`, or local paths
|
||||
</Accordion>
|
||||
|
||||
|
||||
@@ -664,6 +664,8 @@ reply delivery. When the channel is Feishu, Matrix, Telegram, or WhatsApp,
|
||||
the audio is delivered as a voice message rather than a file attachment.
|
||||
Feishu can transcode non-Opus TTS output on this path when `ffmpeg` is
|
||||
available.
|
||||
WhatsApp sends visible text separately from PTT voice-note audio because clients
|
||||
do not consistently render captions on voice notes.
|
||||
It accepts optional `channel` and `timeoutMs` fields; `timeoutMs` is a
|
||||
per-call provider request timeout in milliseconds.
|
||||
|
||||
|
||||
@@ -91,6 +91,12 @@ function mockFirstReplyFailureWithWrappedError(msg: WebInboundMsg, message: stri
|
||||
});
|
||||
}
|
||||
|
||||
/**
 * Test helper: asserts that `msg.sendMedia` recorded a first call with a
 * defined first argument, and returns that argument so callers can make
 * further assertions on it.
 */
function expectFirstSendMediaPayload(msg: WebInboundMsg) {
|
||||
// Optional chaining yields `undefined` when no call was recorded, so the
// `toBeDefined` expectation below is what reports the failure.
const payload = vi.mocked(msg.sendMedia).mock.calls[0]?.[0];
|
||||
expect(payload).toBeDefined();
|
||||
return payload;
|
||||
}
|
||||
|
||||
function mockSecondReplySuccess(msg: WebInboundMsg) {
|
||||
(msg.reply as unknown as { mockResolvedValueOnce: (v: unknown) => void }).mockResolvedValueOnce(
|
||||
undefined,
|
||||
@@ -524,14 +530,14 @@ describe("deliverWebReply", () => {
|
||||
audio: expect.any(Buffer),
|
||||
ptt: true,
|
||||
mimetype: "audio/ogg; codecs=opus",
|
||||
caption: "caption",
|
||||
}),
|
||||
undefined,
|
||||
);
|
||||
expect(msg.reply).not.toHaveBeenCalled();
|
||||
expect(expectFirstSendMediaPayload(msg)).not.toHaveProperty("caption");
|
||||
expect(msg.reply).toHaveBeenCalledWith("caption", undefined);
|
||||
});
|
||||
|
||||
it("sends audio media as ptt voice note", async () => {
|
||||
it("sends audio media as ptt voice note with visible text separately", async () => {
|
||||
const msg = makeMsg();
|
||||
(
|
||||
loadWebMedia as unknown as { mockResolvedValueOnce: (v: unknown) => void }
|
||||
@@ -555,10 +561,11 @@ describe("deliverWebReply", () => {
|
||||
audio: expect.any(Buffer),
|
||||
ptt: true,
|
||||
mimetype: "audio/ogg; codecs=opus",
|
||||
caption: "cap",
|
||||
}),
|
||||
undefined,
|
||||
);
|
||||
expect(expectFirstSendMediaPayload(msg)).not.toHaveProperty("caption");
|
||||
expect(msg.reply).toHaveBeenCalledWith("cap", undefined);
|
||||
});
|
||||
|
||||
it("transcodes mp3 audio media before sending a ptt voice note", async () => {
|
||||
@@ -594,10 +601,11 @@ describe("deliverWebReply", () => {
|
||||
audio: Buffer.from("opus-output"),
|
||||
ptt: true,
|
||||
mimetype: "audio/ogg; codecs=opus",
|
||||
caption: "cap",
|
||||
}),
|
||||
undefined,
|
||||
);
|
||||
expect(expectFirstSendMediaPayload(msg)).not.toHaveProperty("caption");
|
||||
expect(msg.reply).toHaveBeenCalledWith("cap", undefined);
|
||||
});
|
||||
|
||||
it("sends video media", async () => {
|
||||
|
||||
@@ -156,12 +156,14 @@ export async function deliverWebReply(params: {
|
||||
audio: media.buffer,
|
||||
ptt: true,
|
||||
mimetype: media.mimetype,
|
||||
caption,
|
||||
},
|
||||
quote,
|
||||
),
|
||||
"media:audio",
|
||||
);
|
||||
if (caption) {
|
||||
await sendWithRetry(() => msg.reply(caption, quote), "media:audio-text");
|
||||
}
|
||||
} else if (media.kind === "video") {
|
||||
const quote = getQuote();
|
||||
await sendWithRetry(
|
||||
|
||||
@@ -100,6 +100,23 @@ describe("createWebSendApi", () => {
|
||||
});
|
||||
});
|
||||
|
||||
it("sends visible text separately from push-to-talk voice notes", async () => {
|
||||
const payload = Buffer.from("aud");
|
||||
await api.sendMessage("+1555", "voice text", payload, "audio/ogg");
|
||||
expect(sendMessage).toHaveBeenNthCalledWith(
|
||||
1,
|
||||
"1555@s.whatsapp.net",
|
||||
expect.objectContaining({
|
||||
audio: payload,
|
||||
ptt: true,
|
||||
mimetype: "audio/ogg",
|
||||
}),
|
||||
);
|
||||
expect(sendMessage).toHaveBeenNthCalledWith(2, "1555@s.whatsapp.net", {
|
||||
text: "voice text",
|
||||
});
|
||||
});
|
||||
|
||||
it("supports video media and gifPlayback option", async () => {
|
||||
const payload = Buffer.from("vid");
|
||||
await api.sendMessage("+1555", "cap", payload, "video/mp4", { gifPlayback: true });
|
||||
|
||||
@@ -85,6 +85,14 @@ export function createWebSendApi(params: {
|
||||
const result = quotedOpts
|
||||
? await params.sock.sendMessage(jid, payload, quotedOpts)
|
||||
: await params.sock.sendMessage(jid, payload);
|
||||
if (mediaBuffer && mediaType?.startsWith("audio/") && text.trim()) {
|
||||
const textPayload: AnyMessageContent = { text };
|
||||
if (quotedOpts) {
|
||||
await params.sock.sendMessage(jid, textPayload, quotedOpts);
|
||||
} else {
|
||||
await params.sock.sendMessage(jid, textPayload);
|
||||
}
|
||||
}
|
||||
const accountId = sendOptions?.accountId ?? params.defaultAccountId;
|
||||
recordWhatsAppOutbound(accountId);
|
||||
const messageId = resolveOutboundMessageId(result);
|
||||
|
||||
@@ -245,12 +245,8 @@ describe("web outbound", () => {
|
||||
cfg: WHATSAPP_TEST_CFG,
|
||||
mediaUrl: "/tmp/voice.ogg",
|
||||
});
|
||||
expect(sendMessage).toHaveBeenLastCalledWith(
|
||||
"+1555",
|
||||
"voice note",
|
||||
buf,
|
||||
"audio/ogg; codecs=opus",
|
||||
);
|
||||
expect(sendMessage).toHaveBeenNthCalledWith(1, "+1555", "", buf, "audio/ogg; codecs=opus");
|
||||
expect(sendMessage).toHaveBeenNthCalledWith(2, "+1555", "voice note", undefined, undefined);
|
||||
});
|
||||
|
||||
it.each([
|
||||
@@ -274,12 +270,14 @@ describe("web outbound", () => {
|
||||
expect(hoisted.runFfmpeg).toHaveBeenCalledWith(
|
||||
expect.arrayContaining(["-c:a", "libopus", "-ar", "48000", "-b:a", "64k"]),
|
||||
);
|
||||
expect(sendMessage).toHaveBeenLastCalledWith(
|
||||
expect(sendMessage).toHaveBeenNthCalledWith(
|
||||
1,
|
||||
"+1555",
|
||||
"voice note",
|
||||
"",
|
||||
Buffer.from("opus-output"),
|
||||
"audio/ogg; codecs=opus",
|
||||
);
|
||||
expect(sendMessage).toHaveBeenNthCalledWith(2, "+1555", "voice note", undefined, undefined);
|
||||
});
|
||||
|
||||
it("maps video with caption", async () => {
|
||||
|
||||
@@ -115,6 +115,7 @@ export async function sendMessageWhatsApp(
|
||||
let mediaBuffer: Buffer | undefined;
|
||||
let mediaType: string | undefined;
|
||||
let documentFileName: string | undefined;
|
||||
let visibleTextAfterVoice: string | undefined;
|
||||
if (primaryMediaUrl) {
|
||||
const media = await prepareWhatsAppOutboundMedia(
|
||||
await loadOutboundMediaFromUrl(primaryMediaUrl, {
|
||||
@@ -128,7 +129,10 @@ export async function sendMessageWhatsApp(
|
||||
const caption = text || undefined;
|
||||
mediaBuffer = media.buffer;
|
||||
mediaType = media.mimetype;
|
||||
if (media.kind === "document") {
|
||||
if (media.kind === "audio" && caption) {
|
||||
visibleTextAfterVoice = caption;
|
||||
text = "";
|
||||
} else if (media.kind === "document") {
|
||||
text = caption ?? "";
|
||||
documentFileName = media.fileName;
|
||||
} else {
|
||||
@@ -152,6 +156,13 @@ export async function sendMessageWhatsApp(
|
||||
const result = sendOptions
|
||||
? await active.sendMessage(to, text, mediaBuffer, mediaType, sendOptions)
|
||||
: await active.sendMessage(to, text, mediaBuffer, mediaType);
|
||||
if (visibleTextAfterVoice) {
|
||||
if (sendOptions) {
|
||||
await active.sendMessage(to, visibleTextAfterVoice, undefined, undefined, sendOptions);
|
||||
} else {
|
||||
await active.sendMessage(to, visibleTextAfterVoice, undefined, undefined);
|
||||
}
|
||||
}
|
||||
const messageId = (result as { messageId?: string })?.messageId ?? "unknown";
|
||||
const durationMs = Date.now() - startedAt;
|
||||
outboundLog.info(
|
||||
|
||||
Reference in New Issue
Block a user