fix(whatsapp): isolate voice transcripts from commands

2026-05-06 05:30:42 +00:00 · 2026-04-24 21:48:16 -03:00
parent 21f8a0ee9e
commit 7480b339da
5 changed files with 78 additions and 19 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -155,6 +155,7 @@ Docs: https://docs.openclaw.ai
 - Gateway/startup: await startup sidecars before channel monitors report ready, reducing Discord and plugin startup races while still keeping gateway boot observability intact. Thanks @steipete.
 - Plugins/Google Meet: report required manual actions for Chrome joins, use browser automation for Meet entry, and persist the private-WS node opt-in so paired-node realtime sessions keep their intended network policy. Thanks @steipete.
 - Slack: route native stream fallback replies through the normal chunked sender so long buffered Slack Connect responses are not dropped or duplicated. (#71124) Thanks @martingarramon.
+- WhatsApp: transcribe accepted voice notes before agent dispatch while keeping spoken transcripts out of command authorization. (#64120) Thanks @rogerdigital.

 ## 2026.4.23

--- a/extensions/whatsapp/src/auto-reply/monitor/inbound-dispatch.test.ts
+++ b/extensions/whatsapp/src/auto-reply/monitor/inbound-dispatch.test.ts
@@ -175,6 +175,35 @@ describe("whatsapp inbound dispatch", () => {
    });
  });

+  it("keeps agent and command bodies independently overridable", () => {
+    const ctx = buildWhatsAppInboundContext({
+      bodyForAgent: "spoken transcript",
+      combinedBody: "spoken transcript",
+      commandBody: "<media:audio>",
+      conversationId: "+1000",
+      msg: makeMsg({
+        body: "<media:audio>",
+        mediaPath: "/tmp/voice.ogg",
+        mediaType: "audio/ogg; codecs=opus",
+      }),
+      rawBody: "<media:audio>",
+      route: makeRoute(),
+      sender: {
+        e164: "+1000",
+      },
+      transcript: "spoken transcript",
+    });
+
+    expect(ctx).toMatchObject({
+      Body: "spoken transcript",
+      BodyForAgent: "spoken transcript",
+      BodyForCommands: "<media:audio>",
+      CommandBody: "<media:audio>",
+      RawBody: "<media:audio>",
+      Transcript: "spoken transcript",
+    });
+  });
+
  it("falls back SenderId to SenderE164 when sender id is missing", () => {
    const ctx = buildWhatsAppInboundContext({
      combinedBody: "hi",
--- a/extensions/whatsapp/src/auto-reply/monitor/inbound-dispatch.ts
+++ b/extensions/whatsapp/src/auto-reply/monitor/inbound-dispatch.ts
@@ -86,15 +86,19 @@ export function resolveWhatsAppResponsePrefix(params: {
 }

 export function buildWhatsAppInboundContext(params: {
+  bodyForAgent?: string;
  combinedBody: string;
+  commandBody?: string;
  commandAuthorized?: boolean;
  conversationId: string;
  groupHistory?: GroupHistoryEntry[];
  groupMemberRoster?: Map<string, string>;
  groupSystemPrompt?: string;
  msg: WebInboundMsg;
+  rawBody?: string;
  route: ReturnType<typeof resolveAgentRoute>;
  sender: SenderContext;
+  transcript?: string;
  replyThreading?: ReplyThreadingContext;
  visibleReplyTo?: VisibleReplyTarget;
 }) {
@@ -109,10 +113,11 @@ export function buildWhatsAppInboundContext(params: {

  const result = finalizeInboundContext({
    Body: params.combinedBody,
-    BodyForAgent: params.msg.body,
+    BodyForAgent: params.bodyForAgent ?? params.msg.body,
    InboundHistory: inboundHistory,
-    RawBody: params.msg.body,
-    CommandBody: params.msg.body,
+    RawBody: params.rawBody ?? params.msg.body,
+    CommandBody: params.commandBody ?? params.msg.body,
+    Transcript: params.transcript,
    From: params.msg.from,
    To: params.msg.to,
    SessionKey: params.route.sessionKey,
--- a/extensions/whatsapp/src/auto-reply/monitor/process-message.audio-preflight.test.ts
+++ b/extensions/whatsapp/src/auto-reply/monitor/process-message.audio-preflight.test.ts
@@ -10,6 +10,7 @@ vi.mock("./audio-preflight.runtime.js", () => ({

 // Controllable shouldComputeCommandAuthorized for command-sync tests
 let shouldComputeCommandResult = false;
+let shouldComputeCommandBodies: string[] = [];

 // Minimal mocks for process-message dependencies
 vi.mock("../../accounts.js", () => ({
@@ -77,20 +78,32 @@ vi.mock("./runtime-api.js", () => ({
  }),
  resolvePinnedMainDmOwnerFromAllowlist: () => null,
  resolveDmGroupAccessWithCommandGate: () => ({ commandAuthorized: true }),
-  shouldComputeCommandAuthorized: (body: string) =>
-    shouldComputeCommandResult || body.startsWith("/"),
+  shouldComputeCommandAuthorized: (body: string) => {
+    shouldComputeCommandBodies.push(body);
+    return shouldComputeCommandResult || body.startsWith("/");
+  },
  shouldLogVerbose: () => false,
  type: undefined,
 }));

 vi.mock("./inbound-dispatch.js", () => ({
  buildWhatsAppInboundContext: (params: {
+    bodyForAgent?: string;
+    combinedBody: string;
+    commandAuthorized?: boolean;
+    commandBody?: string;
    msg: { body: string; mediaPath?: string; mediaType?: string };
+    rawBody?: string;
+    transcript?: string;
  }) => ({
-    Body: params.msg.body,
-    BodyForAgent: params.msg.body,
+    Body: params.combinedBody,
+    BodyForAgent: params.bodyForAgent ?? params.msg.body,
+    CommandAuthorized: params.commandAuthorized,
+    CommandBody: params.commandBody ?? params.msg.body,
    MediaPath: params.msg.mediaPath,
    MediaType: params.msg.mediaType,
+    RawBody: params.rawBody ?? params.msg.body,
+    Transcript: params.transcript,
  }),
  dispatchWhatsAppBufferedReply: vi.fn(async () => true),
  resolveWhatsAppDmRouteTarget: () => "+15550000002",
@@ -165,6 +178,7 @@ describe("processMessage audio preflight transcription", () => {
    maybeSendAckReactionMock.mockReset();
    maybeSendAckReactionMock.mockResolvedValue(undefined);
    shouldComputeCommandResult = false;
+    shouldComputeCommandBodies = [];
    vi.mocked(dispatchWhatsAppBufferedReply).mockClear();
  });

@@ -187,6 +201,9 @@ describe("processMessage audio preflight transcription", () => {
    expect(dispatchCall?.context).toMatchObject({
      Body: "okay let's test this voice message",
      BodyForAgent: "okay let's test this voice message",
+      CommandBody: "<media:audio>",
+      RawBody: "<media:audio>",
+      Transcript: "okay let's test this voice message",
    });
    // mediaPath and mediaType must be preserved so inboundAudio detection (used by
    // features like messages.tts.auto: "inbound") still recognises this as audio.
@@ -258,18 +275,20 @@ describe("processMessage audio preflight transcription", () => {
    });
  });

-  it("uses transcript body for command detection so voice commands are not missed", async () => {
-    // Transcript starts with a slash command — shouldComputeCommandAuthorized must
-    // see the transcript, not the original <media:audio> placeholder.
+  it("does not use transcript body for command detection", async () => {
    transcribeFirstAudioMock.mockResolvedValueOnce("/new start a new session");

    await processMessage(makeParams());

-    // Command detection ran against the transcript, so CommandBody is the transcript.
+    expect(shouldComputeCommandBodies).toEqual(["<media:audio>"]);
+
    const dispatchCall = vi.mocked(dispatchWhatsAppBufferedReply).mock.calls[0]?.[0];
    expect(dispatchCall?.context).toMatchObject({
      Body: "/new start a new session",
      BodyForAgent: "/new start a new session",
+      CommandBody: "<media:audio>",
+      RawBody: "<media:audio>",
+      Transcript: "/new start a new session",
    });
  });

@@ -287,6 +306,9 @@ describe("processMessage audio preflight transcription", () => {
    expect(dispatchCall?.context).toMatchObject({
      Body: "pre-computed transcript from fan-out caller",
      BodyForAgent: "pre-computed transcript from fan-out caller",
+      CommandBody: "<media:audio>",
+      RawBody: "<media:audio>",
+      Transcript: "pre-computed transcript from fan-out caller",
    });
  });

--- a/extensions/whatsapp/src/auto-reply/monitor/process-message.ts
+++ b/extensions/whatsapp/src/auto-reply/monitor/process-message.ts
@@ -248,19 +248,19 @@ export async function processMessage(params: {
    }
  }

-  // If we have a transcript, replace the body so the agent sees the spoken text.
+  // If we have a transcript, swap it into the agent-facing body only; commands keep the raw body.
  // mediaPath and mediaType are intentionally preserved so that inboundAudio detection
  // (used by features such as messages.tts.auto: "inbound") still sees this as an
  // audio message. The transcript is also stored in Transcript so downstream pipelines
  // can detect it. Preventing a second STT pass in the media-understanding pipeline
  // requires SDK-level support (alreadyTranscribed on a shared attachment instance);
  // that is a shared concern across all channels and is tracked separately.
-  const msgForInbound =
+  const msgForAgent =
    audioTranscript !== undefined ? { ...params.msg, body: audioTranscript } : params.msg;

  let combinedBody = buildInboundLine({
    cfg: params.cfg,
-    msg: msgForInbound,
+    msg: msgForAgent,
    agentId: params.route.agentId,
    previousTimestamp,
    envelope: envelopeOptions,
@@ -368,12 +368,10 @@ export async function processMessage(params: {
    senderE164: sender.e164 ?? undefined,
    normalizeE164,
  });
-  // Use msgForInbound so that if a voice note transcribes to a command (e.g. /new),
-  // command detection and auth are evaluated against the transcript, not <media:audio>.
-  const commandAuthorized = shouldComputeCommandAuthorized(msgForInbound.body, params.cfg)
+  const commandAuthorized = shouldComputeCommandAuthorized(params.msg.body, params.cfg)
    ? await resolveWhatsAppCommandAuthorized({
        cfg: params.cfg,
-        msg: msgForInbound,
+        msg: params.msg,
        policy: inboundPolicy,
      })
    : undefined;
@@ -407,19 +405,23 @@ export async function processMessage(params: {
        });

  const ctxPayload = buildWhatsAppInboundContext({
+    bodyForAgent: msgForAgent.body,
    combinedBody,
+    commandBody: params.msg.body,
    commandAuthorized,
    conversationId,
    groupHistory: visibleGroupHistory,
    groupMemberRoster: params.groupMemberNames.get(params.groupHistoryKey),
    groupSystemPrompt: conversationSystemPrompt,
-    msg: msgForInbound,
+    msg: params.msg,
+    rawBody: params.msg.body,
    route: params.route,
    sender: {
      id: getPrimaryIdentityId(sender) ?? undefined,
      name: sender.name ?? undefined,
      e164: sender.e164 ?? undefined,
    },
+    ...(audioTranscript !== undefined ? { transcript: audioTranscript } : {}),
    replyThreading,
    visibleReplyTo: visibleReplyTo ?? undefined,
  });