mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-06 05:30:42 +00:00
fix(whatsapp): isolate voice transcripts from commands
This commit is contained in:
@@ -155,6 +155,7 @@ Docs: https://docs.openclaw.ai
|
||||
- Gateway/startup: await startup sidecars before channel monitors report ready, reducing Discord and plugin startup races while still keeping gateway boot observability intact. Thanks @steipete.
|
||||
- Plugins/Google Meet: report required manual actions for Chrome joins, use browser automation for Meet entry, and persist the private-WS node opt-in so paired-node realtime sessions keep their intended network policy. Thanks @steipete.
|
||||
- Slack: route native stream fallback replies through the normal chunked sender so long buffered Slack Connect responses are not dropped or duplicated. (#71124) Thanks @martingarramon.
|
||||
- WhatsApp: transcribe accepted voice notes before agent dispatch while keeping spoken transcripts out of command authorization. (#64120) Thanks @rogerdigital.
|
||||
|
||||
## 2026.4.23
|
||||
|
||||
|
||||
@@ -175,6 +175,35 @@ describe("whatsapp inbound dispatch", () => {
|
||||
});
|
||||
});
|
||||
|
||||
// Voice-note scenario: the inbound message body, commandBody and rawBody are all the
// "<media:audio>" placeholder, while bodyForAgent, combinedBody and transcript carry the
// spoken text. The test pins that the builder keeps those two groups separate.
it("keeps agent and command bodies independently overridable", () => {
|
||||
const ctx = buildWhatsAppInboundContext({
|
||||
bodyForAgent: "spoken transcript",
|
||||
combinedBody: "spoken transcript",
|
||||
commandBody: "<media:audio>",
|
||||
conversationId: "+1000",
|
||||
msg: makeMsg({
|
||||
body: "<media:audio>",
|
||||
mediaPath: "/tmp/voice.ogg",
|
||||
mediaType: "audio/ogg; codecs=opus",
|
||||
}),
|
||||
rawBody: "<media:audio>",
|
||||
route: makeRoute(),
|
||||
sender: {
|
||||
e164: "+1000",
|
||||
},
|
||||
transcript: "spoken transcript",
|
||||
});
|
||||
|
||||
// Agent-facing fields (Body, BodyForAgent, Transcript) must hold the transcript;
// command-facing fields (CommandBody, RawBody) must keep the placeholder.
// NOTE(review): BodyForCommands is not passed in above; presumably it is derived
// from CommandBody during context finalization — confirm against finalizeInboundContext.
expect(ctx).toMatchObject({
|
||||
Body: "spoken transcript",
|
||||
BodyForAgent: "spoken transcript",
|
||||
BodyForCommands: "<media:audio>",
|
||||
CommandBody: "<media:audio>",
|
||||
RawBody: "<media:audio>",
|
||||
Transcript: "spoken transcript",
|
||||
});
|
||||
});
|
||||
|
||||
it("falls back SenderId to SenderE164 when sender id is missing", () => {
|
||||
const ctx = buildWhatsAppInboundContext({
|
||||
combinedBody: "hi",
|
||||
|
||||
@@ -86,15 +86,19 @@ export function resolveWhatsAppResponsePrefix(params: {
|
||||
}
|
||||
|
||||
export function buildWhatsAppInboundContext(params: {
|
||||
bodyForAgent?: string;
|
||||
combinedBody: string;
|
||||
commandBody?: string;
|
||||
commandAuthorized?: boolean;
|
||||
conversationId: string;
|
||||
groupHistory?: GroupHistoryEntry[];
|
||||
groupMemberRoster?: Map<string, string>;
|
||||
groupSystemPrompt?: string;
|
||||
msg: WebInboundMsg;
|
||||
rawBody?: string;
|
||||
route: ReturnType<typeof resolveAgentRoute>;
|
||||
sender: SenderContext;
|
||||
transcript?: string;
|
||||
replyThreading?: ReplyThreadingContext;
|
||||
visibleReplyTo?: VisibleReplyTarget;
|
||||
}) {
|
||||
@@ -109,10 +113,11 @@ export function buildWhatsAppInboundContext(params: {
|
||||
|
||||
const result = finalizeInboundContext({
|
||||
Body: params.combinedBody,
|
||||
BodyForAgent: params.msg.body,
|
||||
BodyForAgent: params.bodyForAgent ?? params.msg.body,
|
||||
InboundHistory: inboundHistory,
|
||||
RawBody: params.msg.body,
|
||||
CommandBody: params.msg.body,
|
||||
RawBody: params.rawBody ?? params.msg.body,
|
||||
CommandBody: params.commandBody ?? params.msg.body,
|
||||
Transcript: params.transcript,
|
||||
From: params.msg.from,
|
||||
To: params.msg.to,
|
||||
SessionKey: params.route.sessionKey,
|
||||
|
||||
@@ -10,6 +10,7 @@ vi.mock("./audio-preflight.runtime.js", () => ({
|
||||
|
||||
// Controllable shouldComputeCommandAuthorized for command-sync tests
|
||||
let shouldComputeCommandResult = false;
|
||||
let shouldComputeCommandBodies: string[] = [];
|
||||
|
||||
// Minimal mocks for process-message dependencies
|
||||
vi.mock("../../accounts.js", () => ({
|
||||
@@ -77,20 +78,32 @@ vi.mock("./runtime-api.js", () => ({
|
||||
}),
|
||||
resolvePinnedMainDmOwnerFromAllowlist: () => null,
|
||||
resolveDmGroupAccessWithCommandGate: () => ({ commandAuthorized: true }),
|
||||
shouldComputeCommandAuthorized: (body: string) =>
|
||||
shouldComputeCommandResult || body.startsWith("/"),
|
||||
shouldComputeCommandAuthorized: (body: string) => {
|
||||
shouldComputeCommandBodies.push(body);
|
||||
return shouldComputeCommandResult || body.startsWith("/");
|
||||
},
|
||||
shouldLogVerbose: () => false,
|
||||
type: undefined,
|
||||
}));
|
||||
|
||||
vi.mock("./inbound-dispatch.js", () => ({
|
||||
buildWhatsAppInboundContext: (params: {
|
||||
bodyForAgent?: string;
|
||||
combinedBody: string;
|
||||
commandAuthorized?: boolean;
|
||||
commandBody?: string;
|
||||
msg: { body: string; mediaPath?: string; mediaType?: string };
|
||||
rawBody?: string;
|
||||
transcript?: string;
|
||||
}) => ({
|
||||
Body: params.msg.body,
|
||||
BodyForAgent: params.msg.body,
|
||||
Body: params.combinedBody,
|
||||
BodyForAgent: params.bodyForAgent ?? params.msg.body,
|
||||
CommandAuthorized: params.commandAuthorized,
|
||||
CommandBody: params.commandBody ?? params.msg.body,
|
||||
MediaPath: params.msg.mediaPath,
|
||||
MediaType: params.msg.mediaType,
|
||||
RawBody: params.rawBody ?? params.msg.body,
|
||||
Transcript: params.transcript,
|
||||
}),
|
||||
dispatchWhatsAppBufferedReply: vi.fn(async () => true),
|
||||
resolveWhatsAppDmRouteTarget: () => "+15550000002",
|
||||
@@ -165,6 +178,7 @@ describe("processMessage audio preflight transcription", () => {
|
||||
maybeSendAckReactionMock.mockReset();
|
||||
maybeSendAckReactionMock.mockResolvedValue(undefined);
|
||||
shouldComputeCommandResult = false;
|
||||
shouldComputeCommandBodies = [];
|
||||
vi.mocked(dispatchWhatsAppBufferedReply).mockClear();
|
||||
});
|
||||
|
||||
@@ -187,6 +201,9 @@ describe("processMessage audio preflight transcription", () => {
|
||||
expect(dispatchCall?.context).toMatchObject({
|
||||
Body: "okay let's test this voice message",
|
||||
BodyForAgent: "okay let's test this voice message",
|
||||
CommandBody: "<media:audio>",
|
||||
RawBody: "<media:audio>",
|
||||
Transcript: "okay let's test this voice message",
|
||||
});
|
||||
// mediaPath and mediaType must be preserved so inboundAudio detection (used by
|
||||
// features like messages.tts.auto: "inbound") still recognises this as audio.
|
||||
@@ -258,18 +275,20 @@ describe("processMessage audio preflight transcription", () => {
|
||||
});
|
||||
});
|
||||
|
||||
it("uses transcript body for command detection so voice commands are not missed", async () => {
|
||||
// Transcript starts with a slash command — shouldComputeCommandAuthorized must
|
||||
// see the transcript, not the original <media:audio> placeholder.
|
||||
it("does not use transcript body for command detection", async () => {
|
||||
transcribeFirstAudioMock.mockResolvedValueOnce("/new start a new session");
|
||||
|
||||
await processMessage(makeParams());
|
||||
|
||||
// Command detection ran against the transcript, so CommandBody is the transcript.
|
||||
expect(shouldComputeCommandBodies).toEqual(["<media:audio>"]);
|
||||
|
||||
const dispatchCall = vi.mocked(dispatchWhatsAppBufferedReply).mock.calls[0]?.[0];
|
||||
expect(dispatchCall?.context).toMatchObject({
|
||||
Body: "/new start a new session",
|
||||
BodyForAgent: "/new start a new session",
|
||||
CommandBody: "<media:audio>",
|
||||
RawBody: "<media:audio>",
|
||||
Transcript: "/new start a new session",
|
||||
});
|
||||
});
|
||||
|
||||
@@ -287,6 +306,9 @@ describe("processMessage audio preflight transcription", () => {
|
||||
expect(dispatchCall?.context).toMatchObject({
|
||||
Body: "pre-computed transcript from fan-out caller",
|
||||
BodyForAgent: "pre-computed transcript from fan-out caller",
|
||||
CommandBody: "<media:audio>",
|
||||
RawBody: "<media:audio>",
|
||||
Transcript: "pre-computed transcript from fan-out caller",
|
||||
});
|
||||
});
|
||||
|
||||
|
||||
@@ -248,19 +248,19 @@ export async function processMessage(params: {
|
||||
}
|
||||
}
|
||||
|
||||
// If we have a transcript, replace the body so the agent sees the spoken text.
|
||||
// If we have a transcript, replace the agent-facing body so the agent sees the spoken text.
|
||||
// mediaPath and mediaType are intentionally preserved so that inboundAudio detection
|
||||
// (used by features such as messages.tts.auto: "inbound") still sees this as an
|
||||
// audio message. The transcript is also stored in Transcript so downstream pipelines
|
||||
// can detect it. Preventing a second STT pass in the media-understanding pipeline
|
||||
// requires SDK-level support (alreadyTranscribed on a shared attachment instance);
|
||||
// that is a shared concern across all channels and is tracked separately.
|
||||
const msgForInbound =
|
||||
const msgForAgent =
|
||||
audioTranscript !== undefined ? { ...params.msg, body: audioTranscript } : params.msg;
|
||||
|
||||
let combinedBody = buildInboundLine({
|
||||
cfg: params.cfg,
|
||||
msg: msgForInbound,
|
||||
msg: msgForAgent,
|
||||
agentId: params.route.agentId,
|
||||
previousTimestamp,
|
||||
envelope: envelopeOptions,
|
||||
@@ -368,12 +368,10 @@ export async function processMessage(params: {
|
||||
senderE164: sender.e164 ?? undefined,
|
||||
normalizeE164,
|
||||
});
|
||||
// Use msgForInbound so that if a voice note transcribes to a command (e.g. /new),
|
||||
// command detection and auth are evaluated against the transcript, not <media:audio>.
|
||||
const commandAuthorized = shouldComputeCommandAuthorized(msgForInbound.body, params.cfg)
|
||||
const commandAuthorized = shouldComputeCommandAuthorized(params.msg.body, params.cfg)
|
||||
? await resolveWhatsAppCommandAuthorized({
|
||||
cfg: params.cfg,
|
||||
msg: msgForInbound,
|
||||
msg: params.msg,
|
||||
policy: inboundPolicy,
|
||||
})
|
||||
: undefined;
|
||||
@@ -407,19 +405,23 @@ export async function processMessage(params: {
|
||||
});
|
||||
|
||||
const ctxPayload = buildWhatsAppInboundContext({
|
||||
bodyForAgent: msgForAgent.body,
|
||||
combinedBody,
|
||||
commandBody: params.msg.body,
|
||||
commandAuthorized,
|
||||
conversationId,
|
||||
groupHistory: visibleGroupHistory,
|
||||
groupMemberRoster: params.groupMemberNames.get(params.groupHistoryKey),
|
||||
groupSystemPrompt: conversationSystemPrompt,
|
||||
msg: msgForInbound,
|
||||
msg: params.msg,
|
||||
rawBody: params.msg.body,
|
||||
route: params.route,
|
||||
sender: {
|
||||
id: getPrimaryIdentityId(sender) ?? undefined,
|
||||
name: sender.name ?? undefined,
|
||||
e164: sender.e164 ?? undefined,
|
||||
},
|
||||
...(audioTranscript !== undefined ? { transcript: audioTranscript } : {}),
|
||||
replyThreading,
|
||||
visibleReplyTo: visibleReplyTo ?? undefined,
|
||||
});
|
||||
|
||||
Reference in New Issue
Block a user