fix(whatsapp): isolate voice transcripts from commands

This commit is contained in:
Marcus Castro
2026-04-24 21:48:16 -03:00
parent 21f8a0ee9e
commit 7480b339da
5 changed files with 78 additions and 19 deletions

View File

@@ -155,6 +155,7 @@ Docs: https://docs.openclaw.ai
- Gateway/startup: await startup sidecars before channel monitors report ready, reducing Discord and plugin startup races while still keeping gateway boot observability intact. Thanks @steipete.
- Plugins/Google Meet: report required manual actions for Chrome joins, use browser automation for Meet entry, and persist the private-WS node opt-in so paired-node realtime sessions keep their intended network policy. Thanks @steipete.
- Slack: route native stream fallback replies through the normal chunked sender so long buffered Slack Connect responses are not dropped or duplicated. (#71124) Thanks @martingarramon.
- WhatsApp: transcribe accepted voice notes before agent dispatch while keeping spoken transcripts out of command authorization. (#64120) Thanks @rogerdigital.
## 2026.4.23

View File

@@ -175,6 +175,35 @@ describe("whatsapp inbound dispatch", () => {
});
});
// Voice-note scenario: the caller overrides the agent-facing and command-facing
// bodies separately. The expectations below pin that the transcript shows up in
// the agent-facing fields while the command-facing fields keep the raw placeholder.
it("keeps agent and command bodies independently overridable", () => {
  // The two literals that must never bleed into each other's fields.
  const audioPlaceholder = "<media:audio>";
  const spokenText = "spoken transcript";

  // Inbound message as received: an audio attachment with the placeholder body.
  const voiceNote = makeMsg({
    body: audioPlaceholder,
    mediaPath: "/tmp/voice.ogg",
    mediaType: "audio/ogg; codecs=opus",
  });

  const ctx = buildWhatsAppInboundContext({
    bodyForAgent: spokenText,
    combinedBody: spokenText,
    commandBody: audioPlaceholder,
    conversationId: "+1000",
    msg: voiceNote,
    rawBody: audioPlaceholder,
    route: makeRoute(),
    sender: { e164: "+1000" },
    transcript: spokenText,
  });

  const expectedFields = {
    Body: spokenText,
    BodyForAgent: spokenText,
    BodyForCommands: audioPlaceholder,
    CommandBody: audioPlaceholder,
    RawBody: audioPlaceholder,
    Transcript: spokenText,
  };
  expect(ctx).toMatchObject(expectedFields);
});
it("falls back SenderId to SenderE164 when sender id is missing", () => {
const ctx = buildWhatsAppInboundContext({
combinedBody: "hi",

View File

@@ -86,15 +86,19 @@ export function resolveWhatsAppResponsePrefix(params: {
}
export function buildWhatsAppInboundContext(params: {
bodyForAgent?: string;
combinedBody: string;
commandBody?: string;
commandAuthorized?: boolean;
conversationId: string;
groupHistory?: GroupHistoryEntry[];
groupMemberRoster?: Map<string, string>;
groupSystemPrompt?: string;
msg: WebInboundMsg;
rawBody?: string;
route: ReturnType<typeof resolveAgentRoute>;
sender: SenderContext;
transcript?: string;
replyThreading?: ReplyThreadingContext;
visibleReplyTo?: VisibleReplyTarget;
}) {
@@ -109,10 +113,11 @@ export function buildWhatsAppInboundContext(params: {
const result = finalizeInboundContext({
Body: params.combinedBody,
BodyForAgent: params.msg.body,
BodyForAgent: params.bodyForAgent ?? params.msg.body,
InboundHistory: inboundHistory,
RawBody: params.msg.body,
CommandBody: params.msg.body,
RawBody: params.rawBody ?? params.msg.body,
CommandBody: params.commandBody ?? params.msg.body,
Transcript: params.transcript,
From: params.msg.from,
To: params.msg.to,
SessionKey: params.route.sessionKey,

View File

@@ -10,6 +10,7 @@ vi.mock("./audio-preflight.runtime.js", () => ({
// Controllable shouldComputeCommandAuthorized for command-sync tests
let shouldComputeCommandResult = false;
let shouldComputeCommandBodies: string[] = [];
// Minimal mocks for process-message dependencies
vi.mock("../../accounts.js", () => ({
@@ -77,20 +78,32 @@ vi.mock("./runtime-api.js", () => ({
}),
resolvePinnedMainDmOwnerFromAllowlist: () => null,
resolveDmGroupAccessWithCommandGate: () => ({ commandAuthorized: true }),
shouldComputeCommandAuthorized: (body: string) =>
shouldComputeCommandResult || body.startsWith("/"),
shouldComputeCommandAuthorized: (body: string) => {
shouldComputeCommandBodies.push(body);
return shouldComputeCommandResult || body.startsWith("/");
},
shouldLogVerbose: () => false,
type: undefined,
}));
vi.mock("./inbound-dispatch.js", () => ({
buildWhatsAppInboundContext: (params: {
bodyForAgent?: string;
combinedBody: string;
commandAuthorized?: boolean;
commandBody?: string;
msg: { body: string; mediaPath?: string; mediaType?: string };
rawBody?: string;
transcript?: string;
}) => ({
Body: params.msg.body,
BodyForAgent: params.msg.body,
Body: params.combinedBody,
BodyForAgent: params.bodyForAgent ?? params.msg.body,
CommandAuthorized: params.commandAuthorized,
CommandBody: params.commandBody ?? params.msg.body,
MediaPath: params.msg.mediaPath,
MediaType: params.msg.mediaType,
RawBody: params.rawBody ?? params.msg.body,
Transcript: params.transcript,
}),
dispatchWhatsAppBufferedReply: vi.fn(async () => true),
resolveWhatsAppDmRouteTarget: () => "+15550000002",
@@ -165,6 +178,7 @@ describe("processMessage audio preflight transcription", () => {
maybeSendAckReactionMock.mockReset();
maybeSendAckReactionMock.mockResolvedValue(undefined);
shouldComputeCommandResult = false;
shouldComputeCommandBodies = [];
vi.mocked(dispatchWhatsAppBufferedReply).mockClear();
});
@@ -187,6 +201,9 @@ describe("processMessage audio preflight transcription", () => {
expect(dispatchCall?.context).toMatchObject({
Body: "okay let's test this voice message",
BodyForAgent: "okay let's test this voice message",
CommandBody: "<media:audio>",
RawBody: "<media:audio>",
Transcript: "okay let's test this voice message",
});
// mediaPath and mediaType must be preserved so inboundAudio detection (used by
// features like messages.tts.auto: "inbound") still recognises this as audio.
@@ -258,18 +275,20 @@ describe("processMessage audio preflight transcription", () => {
});
});
it("uses transcript body for command detection so voice commands are not missed", async () => {
// Transcript starts with a slash command — shouldComputeCommandAuthorized must
// see the transcript, not the original <media:audio> placeholder.
it("does not use transcript body for command detection", async () => {
transcribeFirstAudioMock.mockResolvedValueOnce("/new start a new session");
await processMessage(makeParams());
// Command detection ran against the transcript, so CommandBody is the transcript.
expect(shouldComputeCommandBodies).toEqual(["<media:audio>"]);
const dispatchCall = vi.mocked(dispatchWhatsAppBufferedReply).mock.calls[0]?.[0];
expect(dispatchCall?.context).toMatchObject({
Body: "/new start a new session",
BodyForAgent: "/new start a new session",
CommandBody: "<media:audio>",
RawBody: "<media:audio>",
Transcript: "/new start a new session",
});
});
@@ -287,6 +306,9 @@ describe("processMessage audio preflight transcription", () => {
expect(dispatchCall?.context).toMatchObject({
Body: "pre-computed transcript from fan-out caller",
BodyForAgent: "pre-computed transcript from fan-out caller",
CommandBody: "<media:audio>",
RawBody: "<media:audio>",
Transcript: "pre-computed transcript from fan-out caller",
});
});

View File

@@ -248,19 +248,19 @@ export async function processMessage(params: {
}
}
// If we have a transcript, replace the body so the agent sees the spoken text.
// If we have a transcript, replace the agent-facing body so the agent sees the spoken text.
// mediaPath and mediaType are intentionally preserved so that inboundAudio detection
// (used by features such as messages.tts.auto: "inbound") still sees this as an
// audio message. The transcript is also stored in Transcript so downstream pipelines
// can detect it. Preventing a second STT pass in the media-understanding pipeline
// requires SDK-level support (alreadyTranscribed on a shared attachment instance);
// that is a shared concern across all channels and is tracked separately.
const msgForInbound =
const msgForAgent =
audioTranscript !== undefined ? { ...params.msg, body: audioTranscript } : params.msg;
let combinedBody = buildInboundLine({
cfg: params.cfg,
msg: msgForInbound,
msg: msgForAgent,
agentId: params.route.agentId,
previousTimestamp,
envelope: envelopeOptions,
@@ -368,12 +368,10 @@ export async function processMessage(params: {
senderE164: sender.e164 ?? undefined,
normalizeE164,
});
// Use msgForInbound so that if a voice note transcribes to a command (e.g. /new),
// command detection and auth are evaluated against the transcript, not <media:audio>.
const commandAuthorized = shouldComputeCommandAuthorized(msgForInbound.body, params.cfg)
const commandAuthorized = shouldComputeCommandAuthorized(params.msg.body, params.cfg)
? await resolveWhatsAppCommandAuthorized({
cfg: params.cfg,
msg: msgForInbound,
msg: params.msg,
policy: inboundPolicy,
})
: undefined;
@@ -407,19 +405,23 @@ export async function processMessage(params: {
});
const ctxPayload = buildWhatsAppInboundContext({
bodyForAgent: msgForAgent.body,
combinedBody,
commandBody: params.msg.body,
commandAuthorized,
conversationId,
groupHistory: visibleGroupHistory,
groupMemberRoster: params.groupMemberNames.get(params.groupHistoryKey),
groupSystemPrompt: conversationSystemPrompt,
msg: msgForInbound,
msg: params.msg,
rawBody: params.msg.body,
route: params.route,
sender: {
id: getPrimaryIdentityId(sender) ?? undefined,
name: sender.name ?? undefined,
e164: sender.e164 ?? undefined,
},
...(audioTranscript !== undefined ? { transcript: audioTranscript } : {}),
replyThreading,
visibleReplyTo: visibleReplyTo ?? undefined,
});