fix(whatsapp): preserve audio-as-voice payload intent

2026-05-06 06:20:43 +00:00 · 2026-04-25 06:35:51 +01:00
parent 80b6da72f5
commit c2a2a481b2
8 changed files with 85 additions and 0 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -74,6 +74,7 @@ Docs: https://docs.openclaw.ai
 - Discord/cron: deliver text-only isolated cron and heartbeat announce output from the canonical final assistant text once, avoiding duplicate Discord posts when streamed block payloads and the final answer contain the same content. Fixes #71406. Thanks @alexgross21.
 - macOS Gateway: wait for launchd to reload the exited Gateway LaunchAgent before bootstrapping repair fallback, preventing config-triggered restarts from leaving the service not loaded. Fixes #45178. Thanks @vincentkoc.
 - TTS/hooks: preserve audio-only TTS transcripts for `message_sending` and `message_sent` hooks without rendering the transcript as a media caption. Thanks @zqchris.
+- WhatsApp/TTS: preserve `audioAsVoice` through shared media payload sends and the WhatsApp outbound adapter, so `[[audio_as_voice]]` reply payloads keep their voice-note intent when routed through `sendPayload`. Fixes #66053. Thanks @masatohoshino.
 - Control UI/WebChat: hide heartbeat prompts, `HEARTBEAT_OK` acknowledgments, and internal-only runtime context turns from visible chat history while leaving the underlying transcript intact. Fixes #71381. Thanks @gerald1950ggg-ai.
 - Control UI/chat: keep optimistic user and assistant tail messages visible when a final history refresh briefly returns an older snapshot, preventing message cards from flash-disappearing until the next refresh. Fixes #71371. Thanks @WolvenRA.
 - Talk/TTS: resolve configured extension speech providers from the active runtime registry before provider-list discovery, so Talk mode no longer rejects valid plugin speech providers as unsupported.
--- a/docs/channels/whatsapp.md
+++ b/docs/channels/whatsapp.md
@@ -361,6 +361,7 @@ When the linked self number is also present in `allowFrom`, WhatsApp self-chat s

  <Accordion title="Outbound media behavior">
    - supports image, video, audio (PTT voice-note), and document payloads
+    - reply payloads preserve `audioAsVoice` through to the WhatsApp send options; audio media is sent as a Baileys PTT voice note
    - `audio/ogg` is rewritten to `audio/ogg; codecs=opus` for voice-note compatibility
    - animated GIF playback is supported via `gifPlayback: true` on video sends
    - captions are applied to the first media item when sending multi-media reply payloads
--- a/extensions/whatsapp/src/outbound-adapter.sendpayload.test.ts
+++ b/extensions/whatsapp/src/outbound-adapter.sendpayload.test.ts
@@ -75,6 +75,28 @@ describe("whatsappOutbound sendPayload", () => {
    });
  });

+  it("preserves audioAsVoice from payload media sends", async () => {
+    const sendWhatsApp = vi.fn(async () => ({ messageId: "wa-1", toJid: "jid" }));
+
+    await whatsappOutbound.sendPayload!({
+      cfg: {},
+      to: "5511999999999@c.us",
+      text: "",
+      payload: { text: "voice", mediaUrl: "/tmp/voice.ogg", audioAsVoice: true },
+      deps: { sendWhatsApp },
+    });
+
+    expect(sendWhatsApp).toHaveBeenCalledWith("5511999999999@c.us", "voice", {
+      verbose: false,
+      cfg: {},
+      mediaUrl: "/tmp/voice.ogg",
+      mediaLocalRoots: undefined,
+      audioAsVoice: true,
+      accountId: undefined,
+      gifPlayback: undefined,
+    });
+  });
+
  it("drops blank mediaUrls before sending payload media", async () => {
    const sendWhatsApp = vi.fn(async () => ({ messageId: "wa-1", toJid: "jid" }));

--- a/extensions/whatsapp/src/outbound-base.test.ts
+++ b/extensions/whatsapp/src/outbound-base.test.ts
@@ -55,6 +55,40 @@ describe("createWhatsAppOutboundBase", () => {
    expect(result).toMatchObject({ channel: "whatsapp", messageId: "msg-1" });
  });

+  it("forwards audioAsVoice to sendMessageWhatsApp", async () => {
+    const sendMessageWhatsApp = vi.fn(async () => ({
+      messageId: "msg-voice",
+      toJid: "15551234567@s.whatsapp.net",
+    }));
+    const outbound = createWhatsAppOutboundBase({
+      chunker: (text) => [text],
+      sendMessageWhatsApp,
+      sendPollWhatsApp: vi.fn(),
+      shouldLogVerbose: () => false,
+      resolveTarget: ({ to }) => ({ ok: true as const, to: to ?? "" }),
+    });
+
+    await outbound.sendMedia!({
+      cfg: {} as never,
+      to: "whatsapp:+15551234567",
+      text: "voice",
+      mediaUrl: "/tmp/workspace/voice.ogg",
+      audioAsVoice: true,
+      accountId: "default",
+      deps: { sendWhatsApp: sendMessageWhatsApp },
+    });
+
+    expect(sendMessageWhatsApp).toHaveBeenCalledWith(
+      "whatsapp:+15551234567",
+      "voice",
+      expect.objectContaining({
+        mediaUrl: "/tmp/workspace/voice.ogg",
+        audioAsVoice: true,
+        accountId: "default",
+      }),
+    );
+  });
+
  it("uses the configured default account for quote metadata lookup when accountId is omitted", async () => {
    cacheInboundMessageMeta("work", "15551234567@s.whatsapp.net", "reply-1", {
      participant: "111@s.whatsapp.net",
--- a/extensions/whatsapp/src/outbound-base.ts
+++ b/extensions/whatsapp/src/outbound-base.ts
@@ -31,6 +31,7 @@ type WhatsAppSendTextOptions = {
  mediaLocalRoots?: readonly string[];
  mediaReadFile?: (filePath: string) => Promise<Buffer>;
  gifPlayback?: boolean;
+  audioAsVoice?: boolean;
  accountId?: string;
  quotedMessageKey?: {
    id: string;
@@ -178,6 +179,7 @@ export function createWhatsAppOutboundBase({
        mediaAccess,
        mediaLocalRoots,
        mediaReadFile,
+        audioAsVoice,
        accountId,
        deps,
        gifPlayback,
@@ -200,6 +202,7 @@ export function createWhatsAppOutboundBase({
          mediaAccess,
          mediaLocalRoots,
          mediaReadFile,
+          ...(audioAsVoice === undefined ? {} : { audioAsVoice }),
          accountId: accountId ?? undefined,
          gifPlayback,
          quotedMessageKey,
--- a/extensions/whatsapp/src/send.ts
+++ b/extensions/whatsapp/src/send.ts
@@ -67,6 +67,7 @@ export async function sendMessageWhatsApp(
    mediaLocalRoots?: readonly string[];
    mediaReadFile?: (filePath: string) => Promise<Buffer>;
    gifPlayback?: boolean;
+    audioAsVoice?: boolean;
    accountId?: string;
    quotedMessageKey?: {
      id: string;
--- a/src/plugin-sdk/reply-payload.test.ts
+++ b/src/plugin-sdk/reply-payload.test.ts
@@ -139,6 +139,27 @@ describe("sendTextMediaPayload", () => {
    expect(sendMedia.mock.calls.map((call) => call[0].replyToId)).toEqual(["reply-1", undefined]);
  });

+  it("preserves audioAsVoice on media fallback sends", async () => {
+    const sendMedia = vi.fn(async ({ mediaUrl }) => ({ channel: "test", messageId: mediaUrl }));
+
+    await sendTextMediaPayload({
+      channel: "test",
+      ctx: {
+        cfg: {},
+        to: "target",
+        text: "",
+        payload: {
+          text: "caption",
+          mediaUrls: ["https://example.com/voice.ogg", "https://example.com/next.ogg"],
+          audioAsVoice: true,
+        },
+      },
+      adapter: { sendMedia },
+    });
+
+    expect(sendMedia.mock.calls.map((call) => call[0].audioAsVoice)).toEqual([true, true]);
+  });
+
  it("keeps explicit reply tags independent from single-use implicit reply modes", async () => {
    const sendText = vi.fn(async ({ text }) => ({ channel: "test", messageId: text }));

--- a/src/plugin-sdk/reply-payload.ts
+++ b/src/plugin-sdk/reply-payload.ts
@@ -292,6 +292,7 @@ export async function sendTextMediaPayload(params: {
  }
  const nextReplyToId = createReplyToFanout(params.ctx);
  if (urls.length > 0) {
+    const audioAsVoice = params.ctx.payload.audioAsVoice ?? params.ctx.audioAsVoice;
    const lastResult = await sendPayloadMediaSequence({
      text,
      mediaUrls: urls,
@@ -300,6 +301,7 @@ export async function sendTextMediaPayload(params: {
          ...params.ctx,
          text,
          mediaUrl,
+          ...(audioAsVoice === undefined ? {} : { audioAsVoice }),
          replyToId: nextReplyToId(),
        }),
    });