From 7480b339da6ffa520a1852de1956989c72ca7e89 Mon Sep 17 00:00:00 2001
From: Marcus Castro <mcaxtr@openclaw.ai>
Date: Fri, 24 Apr 2026 21:48:16 -0300
Subject: [PATCH] fix(whatsapp): isolate voice transcripts from commands

---
 CHANGELOG.md                                  |  1 +
 .../monitor/inbound-dispatch.test.ts          | 29 ++++++++++++++
 .../auto-reply/monitor/inbound-dispatch.ts    | 11 ++++--
 .../process-message.audio-preflight.test.ts   | 38 +++++++++++++++----
 .../src/auto-reply/monitor/process-message.ts | 18 +++++----
 5 files changed, 78 insertions(+), 19 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 7fb5d9526f7..ae547a06fda 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -155,6 +155,7 @@ Docs: https://docs.openclaw.ai
 - Gateway/startup: await startup sidecars before channel monitors report ready, reducing Discord and plugin startup races while still keeping gateway boot observability intact. Thanks @steipete.
 - Plugins/Google Meet: report required manual actions for Chrome joins, use browser automation for Meet entry, and persist the private-WS node opt-in so paired-node realtime sessions keep their intended network policy. Thanks @steipete.
 - Slack: route native stream fallback replies through the normal chunked sender so long buffered Slack Connect responses are not dropped or duplicated. (#71124) Thanks @martingarramon.
+- WhatsApp: transcribe accepted voice notes before agent dispatch, while command detection and authorization keep evaluating the original message body, so spoken text such as "/new" is not treated as a command. (#64120) Thanks @rogerdigital.
 
 ## 2026.4.23
 
diff --git a/extensions/whatsapp/src/auto-reply/monitor/inbound-dispatch.test.ts b/extensions/whatsapp/src/auto-reply/monitor/inbound-dispatch.test.ts
index 12bad29cf1a..c8ea78646fa 100644
--- a/extensions/whatsapp/src/auto-reply/monitor/inbound-dispatch.test.ts
+++ b/extensions/whatsapp/src/auto-reply/monitor/inbound-dispatch.test.ts
@@ -175,6 +175,35 @@ describe("whatsapp inbound dispatch", () => {
     });
   });
 
+  it("keeps agent and command bodies independently overridable", () => {
+    const ctx = buildWhatsAppInboundContext({
+      bodyForAgent: "spoken transcript", // agent-facing override: the transcribed speech
+      combinedBody: "spoken transcript",
+      commandBody: "<media:audio>", // command-facing override: the original media placeholder
+      conversationId: "+1000",
+      msg: makeMsg({
+        body: "<media:audio>",
+        mediaPath: "/tmp/voice.ogg",
+        mediaType: "audio/ogg; codecs=opus",
+      }),
+      rawBody: "<media:audio>",
+      route: makeRoute(),
+      sender: {
+        e164: "+1000",
+      },
+      transcript: "spoken transcript",
+    });
+
+    expect(ctx).toMatchObject({
+      Body: "spoken transcript",
+      BodyForAgent: "spoken transcript",
+      BodyForCommands: "<media:audio>", // presumably derived from CommandBody by finalizeInboundContext - confirm
+      CommandBody: "<media:audio>",
+      RawBody: "<media:audio>",
+      Transcript: "spoken transcript",
+    });
+  });
+
   it("falls back SenderId to SenderE164 when sender id is missing", () => {
     const ctx = buildWhatsAppInboundContext({
       combinedBody: "hi",
diff --git a/extensions/whatsapp/src/auto-reply/monitor/inbound-dispatch.ts b/extensions/whatsapp/src/auto-reply/monitor/inbound-dispatch.ts
index be9f1d668df..379f6f96294 100644
--- a/extensions/whatsapp/src/auto-reply/monitor/inbound-dispatch.ts
+++ b/extensions/whatsapp/src/auto-reply/monitor/inbound-dispatch.ts
@@ -86,15 +86,19 @@ export function resolveWhatsAppResponsePrefix(params: {
 }
 
 export function buildWhatsAppInboundContext(params: {
+  bodyForAgent?: string;
   combinedBody: string;
+  commandBody?: string;
   commandAuthorized?: boolean;
   conversationId: string;
   groupHistory?: GroupHistoryEntry[];
   groupMemberRoster?: Map<string, string>;
   groupSystemPrompt?: string;
   msg: WebInboundMsg;
+  rawBody?: string;
   route: ReturnType<typeof resolveAgentRoute>;
   sender: SenderContext;
+  transcript?: string;
   replyThreading?: ReplyThreadingContext;
   visibleReplyTo?: VisibleReplyTarget;
 }) {
@@ -109,10 +113,11 @@ export function buildWhatsAppInboundContext(params: {
 
   const result = finalizeInboundContext({
     Body: params.combinedBody,
-    BodyForAgent: params.msg.body,
+    BodyForAgent: params.bodyForAgent ?? params.msg.body,
     InboundHistory: inboundHistory,
-    RawBody: params.msg.body,
-    CommandBody: params.msg.body,
+    RawBody: params.rawBody ?? params.msg.body,
+    CommandBody: params.commandBody ?? params.msg.body,
+    Transcript: params.transcript,
     From: params.msg.from,
     To: params.msg.to,
     SessionKey: params.route.sessionKey,
diff --git a/extensions/whatsapp/src/auto-reply/monitor/process-message.audio-preflight.test.ts b/extensions/whatsapp/src/auto-reply/monitor/process-message.audio-preflight.test.ts
index baff7745db4..ce11166d4a2 100644
--- a/extensions/whatsapp/src/auto-reply/monitor/process-message.audio-preflight.test.ts
+++ b/extensions/whatsapp/src/auto-reply/monitor/process-message.audio-preflight.test.ts
@@ -10,6 +10,7 @@ vi.mock("./audio-preflight.runtime.js", () => ({
 
 // Controllable shouldComputeCommandAuthorized for command-sync tests
 let shouldComputeCommandResult = false;
+let shouldComputeCommandBodies: string[] = [];
 
 // Minimal mocks for process-message dependencies
 vi.mock("../../accounts.js", () => ({
@@ -77,20 +78,32 @@ vi.mock("./runtime-api.js", () => ({
   }),
   resolvePinnedMainDmOwnerFromAllowlist: () => null,
   resolveDmGroupAccessWithCommandGate: () => ({ commandAuthorized: true }),
-  shouldComputeCommandAuthorized: (body: string) =>
-    shouldComputeCommandResult || body.startsWith("/"),
+  shouldComputeCommandAuthorized: (body: string) => {
+    shouldComputeCommandBodies.push(body); // record each body so tests can assert what command detection saw
+    return shouldComputeCommandResult || body.startsWith("/");
+  },
   shouldLogVerbose: () => false,
   type: undefined,
 }));
 
 vi.mock("./inbound-dispatch.js", () => ({
   buildWhatsAppInboundContext: (params: {
+    bodyForAgent?: string;
+    combinedBody: string;
+    commandAuthorized?: boolean;
+    commandBody?: string;
     msg: { body: string; mediaPath?: string; mediaType?: string };
+    rawBody?: string;
+    transcript?: string;
   }) => ({
-    Body: params.msg.body,
-    BodyForAgent: params.msg.body,
+    Body: params.combinedBody, // mirrors the real builder, which maps combinedBody to Body
+    BodyForAgent: params.bodyForAgent ?? params.msg.body, // explicit override wins, else the message body
+    CommandAuthorized: params.commandAuthorized,
+    CommandBody: params.commandBody ?? params.msg.body, // same fallback as the real builder
     MediaPath: params.msg.mediaPath,
     MediaType: params.msg.mediaType,
+    RawBody: params.rawBody ?? params.msg.body, // same fallback as the real builder
+    Transcript: params.transcript,
   }),
   dispatchWhatsAppBufferedReply: vi.fn(async () => true),
   resolveWhatsAppDmRouteTarget: () => "+15550000002",
@@ -165,6 +178,7 @@ describe("processMessage audio preflight transcription", () => {
     maybeSendAckReactionMock.mockReset();
     maybeSendAckReactionMock.mockResolvedValue(undefined);
     shouldComputeCommandResult = false;
+    shouldComputeCommandBodies = [];
     vi.mocked(dispatchWhatsAppBufferedReply).mockClear();
   });
 
@@ -187,6 +201,9 @@ describe("processMessage audio preflight transcription", () => {
     expect(dispatchCall?.context).toMatchObject({
       Body: "okay let's test this voice message",
       BodyForAgent: "okay let's test this voice message",
+      CommandBody: "<media:audio>",
+      RawBody: "<media:audio>",
+      Transcript: "okay let's test this voice message",
     });
     // mediaPath and mediaType must be preserved so inboundAudio detection (used by
     // features like messages.tts.auto: "inbound") still recognises this as audio.
@@ -258,18 +275,20 @@ describe("processMessage audio preflight transcription", () => {
     });
   });
 
-  it("uses transcript body for command detection so voice commands are not missed", async () => {
-    // Transcript starts with a slash command — shouldComputeCommandAuthorized must
-    // see the transcript, not the original <media:audio> placeholder.
+  it("does not use transcript body for command detection", async () => {
     transcribeFirstAudioMock.mockResolvedValueOnce("/new start a new session");
 
     await processMessage(makeParams());
 
-    // Command detection ran against the transcript, so CommandBody is the transcript.
+    expect(shouldComputeCommandBodies).toEqual(["<media:audio>"]); // detection saw only the placeholder, never the transcript
+
     const dispatchCall = vi.mocked(dispatchWhatsAppBufferedReply).mock.calls[0]?.[0];
     expect(dispatchCall?.context).toMatchObject({
       Body: "/new start a new session",
       BodyForAgent: "/new start a new session",
+      CommandBody: "<media:audio>", // the spoken "/new" does not reach the command body
+      RawBody: "<media:audio>",
+      Transcript: "/new start a new session", // the transcript is still exposed separately
     });
   });
 
@@ -287,6 +306,9 @@ describe("processMessage audio preflight transcription", () => {
     expect(dispatchCall?.context).toMatchObject({
       Body: "pre-computed transcript from fan-out caller",
       BodyForAgent: "pre-computed transcript from fan-out caller",
+      CommandBody: "<media:audio>",
+      RawBody: "<media:audio>",
+      Transcript: "pre-computed transcript from fan-out caller",
     });
   });
 
diff --git a/extensions/whatsapp/src/auto-reply/monitor/process-message.ts b/extensions/whatsapp/src/auto-reply/monitor/process-message.ts
index 0f2fbdb23f2..c98dcb71837 100644
--- a/extensions/whatsapp/src/auto-reply/monitor/process-message.ts
+++ b/extensions/whatsapp/src/auto-reply/monitor/process-message.ts
@@ -248,19 +248,19 @@ export async function processMessage(params: {
     }
   }
 
-  // If we have a transcript, replace the body so the agent sees the spoken text.
+  // If we have a transcript, replace the agent-facing body so the agent sees the spoken text.
   // mediaPath and mediaType are intentionally preserved so that inboundAudio detection
   // (used by features such as messages.tts.auto: "inbound") still sees this as an
   // audio message. The transcript is also stored in Transcript so downstream pipelines
   // can detect it. Preventing a second STT pass in the media-understanding pipeline
   // requires SDK-level support (alreadyTranscribed on a shared attachment instance);
   // that is a shared concern across all channels and is tracked separately.
-  const msgForInbound =
+  const msgForAgent =
     audioTranscript !== undefined ? { ...params.msg, body: audioTranscript } : params.msg;
 
   let combinedBody = buildInboundLine({
     cfg: params.cfg,
-    msg: msgForInbound,
+    msg: msgForAgent,
     agentId: params.route.agentId,
     previousTimestamp,
     envelope: envelopeOptions,
@@ -368,12 +368,10 @@ export async function processMessage(params: {
     senderE164: sender.e164 ?? undefined,
     normalizeE164,
   });
-  // Use msgForInbound so that if a voice note transcribes to a command (e.g. /new),
-  // command detection and auth are evaluated against the transcript, not <media:audio>.
-  const commandAuthorized = shouldComputeCommandAuthorized(msgForInbound.body, params.cfg)
+  const commandAuthorized = shouldComputeCommandAuthorized(params.msg.body, params.cfg)
     ? await resolveWhatsAppCommandAuthorized({
         cfg: params.cfg,
-        msg: msgForInbound,
+        msg: params.msg, // original message on purpose: a voice transcript must not drive command detection or auth
         policy: inboundPolicy,
       })
     : undefined;
@@ -407,19 +405,23 @@ export async function processMessage(params: {
         });
 
   const ctxPayload = buildWhatsAppInboundContext({
+    bodyForAgent: msgForAgent.body, // the transcript when one exists, otherwise the original body
     combinedBody,
+    commandBody: params.msg.body, // untransformed inbound body, so transcripts stay out of command handling
     commandAuthorized,
     conversationId,
     groupHistory: visibleGroupHistory,
     groupMemberRoster: params.groupMemberNames.get(params.groupHistoryKey),
     groupSystemPrompt: conversationSystemPrompt,
-    msg: msgForInbound,
+    msg: params.msg,
+    rawBody: params.msg.body,
     route: params.route,
     sender: {
       id: getPrimaryIdentityId(sender) ?? undefined,
       name: sender.name ?? undefined,
       e164: sender.e164 ?? undefined,
     },
+    ...(audioTranscript !== undefined ? { transcript: audioTranscript } : {}), // omit the key entirely when there is no transcript
     replyThreading,
     visibleReplyTo: visibleReplyTo ?? undefined,
   });