diff --git a/CHANGELOG.md b/CHANGELOG.md index c7aeccbbdf0..981e07ec7ca 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -74,6 +74,7 @@ Docs: https://docs.openclaw.ai - Discord/cron: deliver text-only isolated cron and heartbeat announce output from the canonical final assistant text once, avoiding duplicate Discord posts when streamed block payloads and the final answer contain the same content. Fixes #71406. Thanks @alexgross21. - macOS Gateway: wait for launchd to reload the exited Gateway LaunchAgent before bootstrapping repair fallback, preventing config-triggered restarts from leaving the service not loaded. Fixes #45178. Thanks @vincentkoc. - TTS/hooks: preserve audio-only TTS transcripts for `message_sending` and `message_sent` hooks without rendering the transcript as a media caption. Thanks @zqchris. +- WhatsApp/TTS: preserve `audioAsVoice` through shared media payload sends and the WhatsApp outbound adapter, so `[[audio_as_voice]]` reply payloads keep their voice-note intent when routed through `sendPayload`. Fixes #66053. Thanks @masatohoshino. - Control UI/WebChat: hide heartbeat prompts, `HEARTBEAT_OK` acknowledgments, and internal-only runtime context turns from visible chat history while leaving the underlying transcript intact. Fixes #71381. Thanks @gerald1950ggg-ai. - Control UI/chat: keep optimistic user and assistant tail messages visible when a final history refresh briefly returns an older snapshot, preventing message cards from flash-disappearing until the next refresh. Fixes #71371. Thanks @WolvenRA. - Talk/TTS: resolve configured extension speech providers from the active runtime registry before provider-list discovery, so Talk mode no longer rejects valid plugin speech providers as unsupported. diff --git a/docs/channels/whatsapp.md b/docs/channels/whatsapp.md index 90391d73668..6649d68f816 100644 --- a/docs/channels/whatsapp.md +++ b/docs/channels/whatsapp.md @@ -361,6 +361,7 @@ When the linked self number is also present in `allowFrom`, WhatsApp self-chat s - supports image, video, audio (PTT voice-note), and document payloads + - reply payloads preserve `audioAsVoice`; WhatsApp sends audio media as Baileys PTT voice notes - `audio/ogg` is rewritten to `audio/ogg; codecs=opus` for voice-note compatibility - animated GIF playback is supported via `gifPlayback: true` on video sends - captions are applied to the first media item when sending multi-media reply payloads diff --git a/extensions/whatsapp/src/outbound-adapter.sendpayload.test.ts b/extensions/whatsapp/src/outbound-adapter.sendpayload.test.ts index f2ebf3ae7c3..1f568237f74 100644 --- a/extensions/whatsapp/src/outbound-adapter.sendpayload.test.ts +++ b/extensions/whatsapp/src/outbound-adapter.sendpayload.test.ts @@ -75,6 +75,28 @@ describe("whatsappOutbound sendPayload", () => { }); }); + it("preserves audioAsVoice from payload media sends", async () => { + const sendWhatsApp = vi.fn(async () => ({ messageId: "wa-1", toJid: "jid" })); + + await whatsappOutbound.sendPayload!({ + cfg: {}, + to: "5511999999999@c.us", + text: "", + payload: { text: "voice", mediaUrl: "/tmp/voice.ogg", audioAsVoice: true }, + deps: { sendWhatsApp }, + }); + + expect(sendWhatsApp).toHaveBeenCalledWith("5511999999999@c.us", "voice", { + verbose: false, + cfg: {}, + mediaUrl: "/tmp/voice.ogg", + mediaLocalRoots: undefined, + audioAsVoice: true, + accountId: undefined, + gifPlayback: undefined, + }); + }); + it("drops blank mediaUrls before sending payload media", async () => { const sendWhatsApp = vi.fn(async () => ({ messageId: "wa-1", toJid: "jid" })); diff --git a/extensions/whatsapp/src/outbound-base.test.ts b/extensions/whatsapp/src/outbound-base.test.ts index 9e735ca394b..b38bace239b 100644 --- a/extensions/whatsapp/src/outbound-base.test.ts +++ b/extensions/whatsapp/src/outbound-base.test.ts @@ -55,6 +55,40 @@ describe("createWhatsAppOutboundBase", () => { expect(result).toMatchObject({ channel: "whatsapp", messageId: "msg-1" }); }); + it("forwards audioAsVoice to sendMessageWhatsApp", async () => { + const sendMessageWhatsApp = vi.fn(async () => ({ + messageId: "msg-voice", + toJid: "15551234567@s.whatsapp.net", + })); + const outbound = createWhatsAppOutboundBase({ + chunker: (text) => [text], + sendMessageWhatsApp, + sendPollWhatsApp: vi.fn(), + shouldLogVerbose: () => false, + resolveTarget: ({ to }) => ({ ok: true as const, to: to ?? "" }), + }); + + await outbound.sendMedia!({ + cfg: {} as never, + to: "whatsapp:+15551234567", + text: "voice", + mediaUrl: "/tmp/workspace/voice.ogg", + audioAsVoice: true, + accountId: "default", + deps: { sendWhatsApp: sendMessageWhatsApp }, + }); + + expect(sendMessageWhatsApp).toHaveBeenCalledWith( + "whatsapp:+15551234567", + "voice", + expect.objectContaining({ + mediaUrl: "/tmp/workspace/voice.ogg", + audioAsVoice: true, + accountId: "default", + }), + ); + }); + it("uses the configured default account for quote metadata lookup when accountId is omitted", async () => { cacheInboundMessageMeta("work", "15551234567@s.whatsapp.net", "reply-1", { participant: "111@s.whatsapp.net", diff --git a/extensions/whatsapp/src/outbound-base.ts b/extensions/whatsapp/src/outbound-base.ts index 5bcb0f3986c..5741afd7df9 100644 --- a/extensions/whatsapp/src/outbound-base.ts +++ b/extensions/whatsapp/src/outbound-base.ts @@ -31,6 +31,7 @@ type WhatsAppSendTextOptions = { mediaLocalRoots?: readonly string[]; mediaReadFile?: (filePath: string) => Promise; gifPlayback?: boolean; + audioAsVoice?: boolean; accountId?: string; quotedMessageKey?: { id: string; @@ -178,6 +179,7 @@ export function createWhatsAppOutboundBase({ mediaAccess, mediaLocalRoots, mediaReadFile, + audioAsVoice, accountId, deps, gifPlayback, @@ -200,6 +202,7 @@ export function createWhatsAppOutboundBase({ mediaAccess, mediaLocalRoots, mediaReadFile, + ...(audioAsVoice === undefined ? {} : { audioAsVoice }), accountId: accountId ?? undefined, gifPlayback, quotedMessageKey, diff --git a/extensions/whatsapp/src/send.ts b/extensions/whatsapp/src/send.ts index 1cb5100621a..6e756950f67 100644 --- a/extensions/whatsapp/src/send.ts +++ b/extensions/whatsapp/src/send.ts @@ -67,6 +67,7 @@ export async function sendMessageWhatsApp( mediaLocalRoots?: readonly string[]; mediaReadFile?: (filePath: string) => Promise; gifPlayback?: boolean; + audioAsVoice?: boolean; accountId?: string; quotedMessageKey?: { id: string; diff --git a/src/plugin-sdk/reply-payload.test.ts b/src/plugin-sdk/reply-payload.test.ts index ed8a469a982..f16fab9f171 100644 --- a/src/plugin-sdk/reply-payload.test.ts +++ b/src/plugin-sdk/reply-payload.test.ts @@ -139,6 +139,27 @@ describe("sendTextMediaPayload", () => { expect(sendMedia.mock.calls.map((call) => call[0].replyToId)).toEqual(["reply-1", undefined]); }); + it("preserves audioAsVoice on media fallback sends", async () => { + const sendMedia = vi.fn(async ({ mediaUrl }) => ({ channel: "test", messageId: mediaUrl })); + + await sendTextMediaPayload({ + channel: "test", + ctx: { + cfg: {}, + to: "target", + text: "", + payload: { + text: "caption", + mediaUrls: ["https://example.com/voice.ogg", "https://example.com/next.ogg"], + audioAsVoice: true, + }, + }, + adapter: { sendMedia }, + }); + + expect(sendMedia.mock.calls.map((call) => call[0].audioAsVoice)).toEqual([true, true]); + }); + it("keeps explicit reply tags independent from single-use implicit reply modes", async () => { const sendText = vi.fn(async ({ text }) => ({ channel: "test", messageId: text })); diff --git a/src/plugin-sdk/reply-payload.ts b/src/plugin-sdk/reply-payload.ts index b0b663585a6..3832ee2f6e1 100644 --- a/src/plugin-sdk/reply-payload.ts +++ b/src/plugin-sdk/reply-payload.ts @@ -292,6 +292,7 @@ export async function sendTextMediaPayload(params: { } const nextReplyToId = createReplyToFanout(params.ctx); if (urls.length > 0) { + const audioAsVoice = params.ctx.payload.audioAsVoice ?? params.ctx.audioAsVoice; const lastResult = await sendPayloadMediaSequence({ text, mediaUrls: urls, @@ -300,6 +301,7 @@ export async function sendTextMediaPayload(params: { ...params.ctx, text, mediaUrl, + ...(audioAsVoice === undefined ? {} : { audioAsVoice }), replyToId: nextReplyToId(), }), });