From 7480b339da6ffa520a1852de1956989c72ca7e89 Mon Sep 17 00:00:00 2001 From: Marcus Castro Date: Fri, 24 Apr 2026 21:48:16 -0300 Subject: [PATCH] fix(whatsapp): isolate voice transcripts from commands --- CHANGELOG.md | 1 + .../monitor/inbound-dispatch.test.ts | 29 ++++++++++++++ .../auto-reply/monitor/inbound-dispatch.ts | 11 ++++-- .../process-message.audio-preflight.test.ts | 38 +++++++++++++++---- .../src/auto-reply/monitor/process-message.ts | 18 +++++---- 5 files changed, 78 insertions(+), 19 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7fb5d9526f7..ae547a06fda 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -155,6 +155,7 @@ Docs: https://docs.openclaw.ai - Gateway/startup: await startup sidecars before channel monitors report ready, reducing Discord and plugin startup races while still keeping gateway boot observability intact. Thanks @steipete. - Plugins/Google Meet: report required manual actions for Chrome joins, use browser automation for Meet entry, and persist the private-WS node opt-in so paired-node realtime sessions keep their intended network policy. Thanks @steipete. - Slack: route native stream fallback replies through the normal chunked sender so long buffered Slack Connect responses are not dropped or duplicated. (#71124) Thanks @martingarramon. +- WhatsApp: transcribe accepted voice notes before agent dispatch while keeping spoken transcripts out of command authorization. (#64120) Thanks @rogerdigital. 
## 2026.4.23 diff --git a/extensions/whatsapp/src/auto-reply/monitor/inbound-dispatch.test.ts b/extensions/whatsapp/src/auto-reply/monitor/inbound-dispatch.test.ts index 12bad29cf1a..c8ea78646fa 100644 --- a/extensions/whatsapp/src/auto-reply/monitor/inbound-dispatch.test.ts +++ b/extensions/whatsapp/src/auto-reply/monitor/inbound-dispatch.test.ts @@ -175,6 +175,35 @@ describe("whatsapp inbound dispatch", () => { }); }); + it("keeps agent and command bodies independently overridable", () => { + const ctx = buildWhatsAppInboundContext({ + bodyForAgent: "spoken transcript", + combinedBody: "spoken transcript", + commandBody: "", + conversationId: "+1000", + msg: makeMsg({ + body: "", + mediaPath: "/tmp/voice.ogg", + mediaType: "audio/ogg; codecs=opus", + }), + rawBody: "", + route: makeRoute(), + sender: { + e164: "+1000", + }, + transcript: "spoken transcript", + }); + + expect(ctx).toMatchObject({ + Body: "spoken transcript", + BodyForAgent: "spoken transcript", + BodyForCommands: "", + CommandBody: "", + RawBody: "", + Transcript: "spoken transcript", + }); + }); + it("falls back SenderId to SenderE164 when sender id is missing", () => { const ctx = buildWhatsAppInboundContext({ combinedBody: "hi", diff --git a/extensions/whatsapp/src/auto-reply/monitor/inbound-dispatch.ts b/extensions/whatsapp/src/auto-reply/monitor/inbound-dispatch.ts index be9f1d668df..379f6f96294 100644 --- a/extensions/whatsapp/src/auto-reply/monitor/inbound-dispatch.ts +++ b/extensions/whatsapp/src/auto-reply/monitor/inbound-dispatch.ts @@ -86,15 +86,19 @@ export function resolveWhatsAppResponsePrefix(params: { } export function buildWhatsAppInboundContext(params: { + bodyForAgent?: string; combinedBody: string; + commandBody?: string; commandAuthorized?: boolean; conversationId: string; groupHistory?: GroupHistoryEntry[]; groupMemberRoster?: Map; groupSystemPrompt?: string; msg: WebInboundMsg; + rawBody?: string; route: ReturnType; sender: SenderContext; + transcript?: string; 
replyThreading?: ReplyThreadingContext; visibleReplyTo?: VisibleReplyTarget; }) { @@ -109,10 +113,11 @@ export function buildWhatsAppInboundContext(params: { const result = finalizeInboundContext({ Body: params.combinedBody, - BodyForAgent: params.msg.body, + BodyForAgent: params.bodyForAgent ?? params.msg.body, InboundHistory: inboundHistory, - RawBody: params.msg.body, - CommandBody: params.msg.body, + RawBody: params.rawBody ?? params.msg.body, + CommandBody: params.commandBody ?? params.msg.body, + Transcript: params.transcript, From: params.msg.from, To: params.msg.to, SessionKey: params.route.sessionKey, diff --git a/extensions/whatsapp/src/auto-reply/monitor/process-message.audio-preflight.test.ts b/extensions/whatsapp/src/auto-reply/monitor/process-message.audio-preflight.test.ts index baff7745db4..ce11166d4a2 100644 --- a/extensions/whatsapp/src/auto-reply/monitor/process-message.audio-preflight.test.ts +++ b/extensions/whatsapp/src/auto-reply/monitor/process-message.audio-preflight.test.ts @@ -10,6 +10,7 @@ vi.mock("./audio-preflight.runtime.js", () => ({ // Controllable shouldComputeCommandAuthorized for command-sync tests let shouldComputeCommandResult = false; +let shouldComputeCommandBodies: string[] = []; // Minimal mocks for process-message dependencies vi.mock("../../accounts.js", () => ({ @@ -77,20 +78,32 @@ vi.mock("./runtime-api.js", () => ({ }), resolvePinnedMainDmOwnerFromAllowlist: () => null, resolveDmGroupAccessWithCommandGate: () => ({ commandAuthorized: true }), - shouldComputeCommandAuthorized: (body: string) => - shouldComputeCommandResult || body.startsWith("/"), + shouldComputeCommandAuthorized: (body: string) => { + shouldComputeCommandBodies.push(body); + return shouldComputeCommandResult || body.startsWith("/"); + }, shouldLogVerbose: () => false, type: undefined, })); vi.mock("./inbound-dispatch.js", () => ({ buildWhatsAppInboundContext: (params: { + bodyForAgent?: string; + combinedBody: string; + commandAuthorized?: boolean; + 
commandBody?: string; msg: { body: string; mediaPath?: string; mediaType?: string }; + rawBody?: string; + transcript?: string; }) => ({ - Body: params.msg.body, - BodyForAgent: params.msg.body, + Body: params.combinedBody, + BodyForAgent: params.bodyForAgent ?? params.msg.body, + CommandAuthorized: params.commandAuthorized, + CommandBody: params.commandBody ?? params.msg.body, MediaPath: params.msg.mediaPath, MediaType: params.msg.mediaType, + RawBody: params.rawBody ?? params.msg.body, + Transcript: params.transcript, }), dispatchWhatsAppBufferedReply: vi.fn(async () => true), resolveWhatsAppDmRouteTarget: () => "+15550000002", @@ -165,6 +178,7 @@ describe("processMessage audio preflight transcription", () => { maybeSendAckReactionMock.mockReset(); maybeSendAckReactionMock.mockResolvedValue(undefined); shouldComputeCommandResult = false; + shouldComputeCommandBodies = []; vi.mocked(dispatchWhatsAppBufferedReply).mockClear(); }); @@ -187,6 +201,9 @@ describe("processMessage audio preflight transcription", () => { expect(dispatchCall?.context).toMatchObject({ Body: "okay let's test this voice message", BodyForAgent: "okay let's test this voice message", + CommandBody: "", + RawBody: "", + Transcript: "okay let's test this voice message", }); // mediaPath and mediaType must be preserved so inboundAudio detection (used by // features like messages.tts.auto: "inbound") still recognises this as audio. @@ -258,18 +275,20 @@ describe("processMessage audio preflight transcription", () => { }); }); - it("uses transcript body for command detection so voice commands are not missed", async () => { - // Transcript starts with a slash command — shouldComputeCommandAuthorized must - // see the transcript, not the original placeholder. 
+ it("does not use transcript body for command detection", async () => { transcribeFirstAudioMock.mockResolvedValueOnce("/new start a new session"); await processMessage(makeParams()); - // Command detection ran against the transcript, so CommandBody is the transcript. + expect(shouldComputeCommandBodies).toEqual([""]); + const dispatchCall = vi.mocked(dispatchWhatsAppBufferedReply).mock.calls[0]?.[0]; expect(dispatchCall?.context).toMatchObject({ Body: "/new start a new session", BodyForAgent: "/new start a new session", + CommandBody: "", + RawBody: "", + Transcript: "/new start a new session", }); }); @@ -287,6 +306,9 @@ describe("processMessage audio preflight transcription", () => { expect(dispatchCall?.context).toMatchObject({ Body: "pre-computed transcript from fan-out caller", BodyForAgent: "pre-computed transcript from fan-out caller", + CommandBody: "", + RawBody: "", + Transcript: "pre-computed transcript from fan-out caller", }); }); diff --git a/extensions/whatsapp/src/auto-reply/monitor/process-message.ts b/extensions/whatsapp/src/auto-reply/monitor/process-message.ts index 0f2fbdb23f2..c98dcb71837 100644 --- a/extensions/whatsapp/src/auto-reply/monitor/process-message.ts +++ b/extensions/whatsapp/src/auto-reply/monitor/process-message.ts @@ -248,19 +248,19 @@ export async function processMessage(params: { } } - // If we have a transcript, replace the body so the agent sees the spoken text. + // If we have a transcript, replace the agent-facing body so the agent sees the spoken text. // mediaPath and mediaType are intentionally preserved so that inboundAudio detection // (used by features such as messages.tts.auto: "inbound") still sees this as an // audio message. The transcript is also stored in Transcript so downstream pipelines // can detect it. 
Preventing a second STT pass in the media-understanding pipeline // requires SDK-level support (alreadyTranscribed on a shared attachment instance); // that is a shared concern across all channels and is tracked separately. - const msgForInbound = + const msgForAgent = audioTranscript !== undefined ? { ...params.msg, body: audioTranscript } : params.msg; let combinedBody = buildInboundLine({ cfg: params.cfg, - msg: msgForInbound, + msg: msgForAgent, agentId: params.route.agentId, previousTimestamp, envelope: envelopeOptions, @@ -368,12 +368,10 @@ export async function processMessage(params: { senderE164: sender.e164 ?? undefined, normalizeE164, }); - // Use msgForInbound so that if a voice note transcribes to a command (e.g. /new), - // command detection and auth are evaluated against the transcript, not the original placeholder. - const commandAuthorized = shouldComputeCommandAuthorized(msgForInbound.body, params.cfg) + const commandAuthorized = shouldComputeCommandAuthorized(params.msg.body, params.cfg) ? await resolveWhatsAppCommandAuthorized({ cfg: params.cfg, - msg: msgForInbound, + msg: params.msg, policy: inboundPolicy, }) : undefined; @@ -407,19 +405,23 @@ export async function processMessage(params: { }); const ctxPayload = buildWhatsAppInboundContext({ + bodyForAgent: msgForAgent.body, combinedBody, + commandBody: params.msg.body, commandAuthorized, conversationId, groupHistory: visibleGroupHistory, groupMemberRoster: params.groupMemberNames.get(params.groupHistoryKey), groupSystemPrompt: conversationSystemPrompt, - msg: msgForInbound, + msg: params.msg, + rawBody: params.msg.body, route: params.route, sender: { id: getPrimaryIdentityId(sender) ?? undefined, name: sender.name ?? undefined, e164: sender.e164 ?? undefined, }, + ...(audioTranscript !== undefined ? { transcript: audioTranscript } : {}), replyThreading, visibleReplyTo: visibleReplyTo ?? undefined, });