fix(voice): reuse preflight transcripts across channels

This commit is contained in:
Peter Steinberger
2026-04-26 05:29:24 +01:00
parent 46b9044c3f
commit 6a67f65568
30 changed files with 586 additions and 64 deletions

View File

@@ -96,6 +96,14 @@ Docs: https://docs.openclaw.ai
before agent dispatch and keep raw Feishu `file_key` payloads out of message
text. Fixes #67120 and #61876.
- Tasks: terminalize async Gateway agent task records from the Gateway run result while preserving aborted, failed, and cancelled outcomes instead of leaving completed runs stuck as active or lost. (#71905) Thanks @likewen-tech.
- WhatsApp: let authorized group voice-note transcripts satisfy mention gating
before reply dispatch, while keeping unmentioned transcripts in pending group
history. Fixes #44908.
- Media understanding: carry channel voice-note preflight state into attachment
selection so WhatsApp, Feishu, Telegram, and Discord do not transcribe the
same inbound audio twice. Fixes #70580.
- TTS/BlueBubbles: deliver compatible auto-TTS audio as iMessage voice memo
bubbles instead of plain MP3/CAF file attachments. Fixes #16848.
- ACP: send subagent and async-task completion wakes to external ACP harnesses as
plain prompts instead of OpenClaw internal runtime-context envelopes, while
keeping those envelopes out of ACP transcripts.

View File

@@ -21,6 +21,8 @@ need a separate `openclaw plugins install` step.
- OpenClaw talks to it through its REST API (`GET /api/v1/ping`, `POST /message/text`, `POST /chat/:id/*`).
- Incoming messages arrive via webhooks; outgoing replies, typing indicators, read receipts, and tapbacks are REST calls.
- Attachments and stickers are ingested as inbound media (and surfaced to the agent when possible).
- Auto-TTS replies that synthesize MP3 or CAF audio are delivered as iMessage
voice memo bubbles instead of plain file attachments.
- Pairing/allowlist works the same way as other channels (`/channels/pairing` etc) with `channels.bluebubbles.allowFrom` + pairing codes.
- Reactions are surfaced as system events just like Slack/Telegram so agents can "mention" them before replying.
- Advanced features: edit, unsend, reply threading, message effects, group management.

View File

@@ -244,6 +244,7 @@ content and identifiers.
- explicit WhatsApp mentions of the bot identity
- configured mention regex patterns (`agents.list[].groupChat.mentionPatterns`, fallback `messages.groupChat.mentionPatterns`)
- inbound voice-note transcripts for authorized group messages
- implicit reply-to-bot detection (reply sender matches bot identity)
Security note:
@@ -296,6 +297,11 @@ When the linked self number is also present in `allowFrom`, WhatsApp self-chat s
- `<media:document>`
- `<media:sticker>`
Authorized group voice notes are transcribed before mention gating when the
body is only `<media:audio>`, so speaking the bot's mention inside the voice
note can trigger a reply. If the transcript still does not mention the bot, the
transcript is kept in pending group history instead of the raw placeholder.
Location bodies use terse coordinate text. Location labels/comments and contact/vCard details are rendered as fenced untrusted metadata, not inline prompt text.
</Accordion>

View File

@@ -58,6 +58,10 @@ Video and music generation run as background tasks because provider processing t
Deepgram, ElevenLabs, Mistral, OpenAI, SenseAudio, and xAI can all transcribe
inbound audio through the batch `tools.media.audio` path when configured.
Channel plugins that preflight a voice note for mention gating or command
parsing mark the transcribed attachment on the inbound context, so the shared
media-understanding pass reuses that transcript instead of making a second STT
call for the same audio.
Deepgram, ElevenLabs, Mistral, OpenAI, and xAI also register Voice Call
streaming STT providers, so live phone audio can be forwarded to the selected
vendor without waiting for a completed recording.

View File

@@ -330,7 +330,7 @@ export const bluebubblesPlugin: ChannelPlugin<ResolvedBlueBubblesAccount, BlueBu
},
sendMedia: async (ctx) => {
const runtime = await loadBlueBubblesChannelRuntime();
const { cfg, to, text, mediaUrl, accountId, replyToId } = ctx;
const { cfg, to, text, mediaUrl, accountId, replyToId, audioAsVoice } = ctx;
const { mediaPath, mediaBuffer, contentType, filename, caption } = ctx as {
mediaPath?: string;
mediaBuffer?: Uint8Array;
@@ -349,6 +349,7 @@ export const bluebubblesPlugin: ChannelPlugin<ResolvedBlueBubblesAccount, BlueBu
caption: caption ?? text ?? undefined,
replyToId: replyToId ?? null,
accountId: accountId ?? undefined,
asVoice: audioAsVoice === true,
});
},
},

View File

@@ -323,4 +323,27 @@ describe("sendBlueBubblesMedia local-path hardening", () => {
);
expect(sendBlueBubblesAttachmentMock).toHaveBeenCalledTimes(1);
});
// Verifies the asVoice flag is forwarded end-to-end: when sendBlueBubblesMedia
// is called with asVoice, the attachment-delivery mock must receive it alongside
// the resolved content type and filename from the fetched remote media.
it("passes asVoice through to attachment delivery", async () => {
  // Stub the remote fetch so the media pipeline sees an MP3 payload.
  runtimeMocks.fetchRemoteMedia.mockResolvedValueOnce({
    buffer: new Uint8Array([1, 2, 3]),
    contentType: "audio/mpeg",
    fileName: "voice.mp3",
  });
  await sendBlueBubblesMedia({
    cfg: createConfig(),
    to: "chat:123",
    mediaUrl: "https://example.com/voice.mp3",
    asVoice: true,
  });
  expect(sendBlueBubblesAttachmentMock).toHaveBeenCalledWith(
    expect.objectContaining({
      asVoice: true,
      contentType: "audio/mpeg",
      filename: "voice.mp3",
    }),
  );
});
});

View File

@@ -1686,6 +1686,7 @@ async function processMessageAfterDedupe(
caption: caption ?? undefined,
replyToId: replyToMessageGuid || null,
accountId: account.accountId,
asVoice: payload.audioAsVoice === true,
});
} catch (err) {
forgetPendingOutboundMessageId(pendingId);

View File

@@ -405,6 +405,7 @@ describe("preflightDiscordMessage", () => {
);
expect(result).not.toBeNull();
expect(result?.isDirectMessage).toBe(true);
expect(result?.preflightAudioTranscript).toBe("hello openclaw from dm audio");
});
it("falls back to the default discord account for omitted-account dm authorization", async () => {
@@ -1096,6 +1097,7 @@ describe("preflightDiscordMessage", () => {
);
expect(result).not.toBeNull();
expect(result?.wasMentioned).toBe(true);
expect(result?.preflightAudioTranscript).toBe("hey openclaw");
});
it("does not transcribe guild audio from unauthorized members", async () => {

View File

@@ -1112,6 +1112,7 @@ export async function preflightDiscordMessage(
commandAuthorized,
baseText,
messageText,
...(preflightTranscript !== undefined ? { preflightAudioTranscript: preflightTranscript } : {}),
wasMentioned,
route: effectiveRoute,
threadBinding,

View File

@@ -56,6 +56,7 @@ export type DiscordMessagePreflightContext = DiscordMessagePreflightSharedFields
commandAuthorized: boolean;
baseText: string;
messageText: string;
preflightAudioTranscript?: string;
wasMentioned: boolean;
route: ReturnType<typeof resolveAgentRoute>;

View File

@@ -285,11 +285,27 @@ function getLastRouteUpdate():
}
function getLastDispatchCtx():
| { SessionKey?: string; MessageThreadId?: string | number }
| {
BodyForAgent?: string;
CommandBody?: string;
MediaTranscribedIndexes?: number[];
MessageThreadId?: string | number;
SessionKey?: string;
Transcript?: string;
}
| undefined {
const callArgs = dispatchInboundMessage.mock.calls.at(-1) as unknown[] | undefined;
const params = callArgs?.[0] as
| { ctx?: { SessionKey?: string; MessageThreadId?: string | number } }
| {
ctx?: {
BodyForAgent?: string;
CommandBody?: string;
MediaTranscribedIndexes?: number[];
MessageThreadId?: string | number;
SessionKey?: string;
Transcript?: string;
};
}
| undefined;
return params?.ctx;
}
@@ -656,6 +672,45 @@ describe("processDiscordMessage ack reactions", () => {
});
describe("processDiscordMessage session routing", () => {
// Verifies the preflight transcript path: when the context already carries
// preflightAudioTranscript, dispatch must use it for BodyForAgent/CommandBody/
// Transcript and mark the audio attachment (index 0) as already transcribed so
// downstream media understanding does not run STT a second time.
it("carries preflight audio transcript into dispatch context and marks media transcribed", async () => {
  // Fake REST fetch returning a small ogg payload for the attachment download.
  const fetchImpl = vi.fn(
    async () =>
      new Response(new Uint8Array([1, 2, 3, 4]), {
        headers: { "content-type": "audio/ogg" },
      }),
  );
  const ctx = await createBaseContext({
    message: {
      id: "m-audio-preflight",
      channelId: "c1",
      content: "",
      timestamp: new Date().toISOString(),
      attachments: [
        {
          id: "att-audio-preflight",
          url: "https://cdn.discordapp.com/attachments/voice.ogg",
          content_type: "audio/ogg",
          filename: "voice.ogg",
        },
      ],
    },
    baseText: "<media:audio>",
    messageText: "<media:audio>",
    preflightAudioTranscript: "hello from discord voice",
    discordRestFetch: fetchImpl,
    mediaMaxBytes: 1024 * 1024,
  });
  await processDiscordMessage(ctx as any);
  expect(getLastDispatchCtx()).toMatchObject({
    BodyForAgent: "hello from discord voice",
    CommandBody: "hello from discord voice",
    Transcript: "hello from discord voice",
    MediaTranscribedIndexes: [0],
  });
});
it("stores DM lastRoute with user target for direct-session continuity", async () => {
const ctx = await createBaseContext({
...createDirectMessageContextOverrides(),

View File

@@ -148,6 +148,7 @@ export async function processDiscordMessage(
isGroupDm,
baseText,
messageText,
preflightAudioTranscript,
shouldRequireMention,
canDetectMention,
effectiveWasMentioned,
@@ -428,6 +429,10 @@ export async function processDiscordMessage(
}
}
const mediaPayload = buildDiscordMediaPayload(mediaList);
const preflightAudioIndex =
preflightAudioTranscript === undefined
? -1
: mediaList.findIndex((media) => media.contentType?.startsWith("audio/"));
const threadKeys = resolveThreadSessionKeys({
baseSessionKey,
threadId: threadChannel ? messageChannelId : undefined,
@@ -487,10 +492,11 @@ export async function processDiscordMessage(
const ctxPayload = finalizeInboundContext({
Body: combinedBody,
BodyForAgent: baseText ?? text,
BodyForAgent: preflightAudioTranscript ?? baseText ?? text,
InboundHistory: inboundHistory,
RawBody: baseText,
CommandBody: baseText,
RawBody: preflightAudioTranscript ?? baseText,
CommandBody: preflightAudioTranscript ?? baseText,
...(preflightAudioTranscript !== undefined ? { Transcript: preflightAudioTranscript } : {}),
From: effectiveFrom,
To: effectiveTo,
SessionKey: boundSessionKey ?? autoThreadContext?.SessionKey ?? threadKeys.sessionKey,
@@ -521,6 +527,7 @@ export async function processDiscordMessage(
ThreadLabel: threadLabel,
Timestamp: resolveTimestampMs(message.timestamp),
...mediaPayload,
...(preflightAudioIndex >= 0 ? { MediaTranscribedIndexes: [preflightAudioIndex] } : {}),
CommandAuthorized: commandAuthorized,
CommandSource: "text" as const,
// Originating channel for reply routing.

View File

@@ -1512,6 +1512,7 @@ describe("handleFeishuMessage command authorization", () => {
Transcript: "voice transcript",
MediaPaths: ["/tmp/inbound-voice.ogg"],
MediaTypes: ["audio/ogg"],
MediaTranscribedIndexes: [0],
}),
);
const finalized = mockFinalizeInboundContext.mock.calls[0]?.[0];

View File

@@ -759,6 +759,10 @@ export async function handleFeishuMessage(params: {
chatType: isGroup ? "group" : "direct",
log,
});
const preflightAudioIndex =
audioTranscript === undefined
? -1
: mediaList.findIndex((media) => media.contentType?.startsWith("audio/"));
const agentFacingContent = audioTranscript ?? ctx.content;
const agentFacingCtx =
audioTranscript === undefined
@@ -1078,6 +1082,7 @@ export async function handleFeishuMessage(params: {
OriginatingTo: feishuTo,
GroupSystemPrompt: isGroup ? normalizeOptionalString(groupConfig?.systemPrompt) : undefined,
...mediaPayload,
...(preflightAudioIndex >= 0 ? { MediaTranscribedIndexes: [preflightAudioIndex] } : {}),
});
};

View File

@@ -68,7 +68,14 @@ const {
textToSpeechTelephony,
} = await import("./tts.js");
const nativeVoiceNoteChannels = ["discord", "feishu", "matrix", "telegram", "whatsapp"] as const;
const nativeVoiceNoteChannels = [
"bluebubbles",
"discord",
"feishu",
"matrix",
"telegram",
"whatsapp",
] as const;
function createMockSpeechProvider(
id = "mock",
@@ -164,6 +171,33 @@ describe("speech-core native voice-note routing", () => {
});
});
it("keeps BlueBubbles synthesis on mp3 audio-file output but delivers it as a voice memo", async () => {
await expectTtsPayloadResult({
channel: "bluebubbles",
prefsName: "openclaw-speech-core-tts-bluebubbles-mp3-test",
text: "This BlueBubbles reply should be delivered as an iMessage voice memo.",
target: "audio-file",
audioAsVoice: true,
mediaExtension: "mp3",
providerResult: {
audioBuffer: Buffer.from("mp3"),
outputFormat: "mp3",
fileExtension: ".mp3",
voiceCompatible: false,
},
});
});
it("does not mark unsupported BlueBubbles audio-file output as a voice memo", async () => {
await expectTtsPayloadResult({
channel: "bluebubbles",
prefsName: "openclaw-speech-core-tts-bluebubbles-ogg-test",
text: "This BlueBubbles reply should stay a regular audio attachment.",
target: "audio-file",
audioAsVoice: undefined,
});
});
it.each(["feishu", "whatsapp"] as const)(
"marks %s voice-note TTS for channel-side transcoding when provider returns mp3",
async (channel) => {

View File

@@ -738,8 +738,17 @@ export function setLastTtsAttempt(entry: TtsStatusEntry | undefined): void {
lastTtsAttempt = entry;
}
const VOICE_DELIVERY_CHANNELS = new Set([
"bluebubbles",
"telegram",
"feishu",
"whatsapp",
"matrix",
"discord",
]);
const OPUS_CHANNELS = new Set(["telegram", "feishu", "whatsapp", "matrix", "discord"]);
const TRANSCODED_VOICE_NOTE_CHANNELS = new Set(["feishu", "whatsapp"]);
const AUDIO_FILE_VOICE_MEMO_CHANNELS = new Set(["bluebubbles"]);
function resolveChannelId(channel: string | undefined): ChannelId | null {
return channel ? normalizeChannelId(channel) : null;
@@ -747,7 +756,7 @@ function resolveChannelId(channel: string | undefined): ChannelId | null {
function supportsNativeVoiceNoteTts(channel: string | undefined): boolean {
const channelId = resolveChannelId(channel);
return channelId !== null && OPUS_CHANNELS.has(channelId);
return channelId !== null && VOICE_DELIVERY_CHANNELS.has(channelId);
}
function supportsTranscodedVoiceNoteTts(channel: string | undefined): boolean {
@@ -755,12 +764,43 @@ function supportsTranscodedVoiceNoteTts(channel: string | undefined): boolean {
return channelId !== null && TRANSCODED_VOICE_NOTE_CHANNELS.has(channelId);
}
/**
 * Picks the synthesis target for a channel: channels in OPUS_CHANNELS get
 * native voice-note synthesis, everything else falls back to a plain audio file.
 */
function resolveTtsSynthesisTarget(channel: string | undefined): "audio-file" | "voice-note" {
  const channelId = resolveChannelId(channel);
  if (channelId === null) {
    return "audio-file";
  }
  return OPUS_CHANNELS.has(channelId) ? "voice-note" : "audio-file";
}
/**
 * Returns true when synthesized audio-file output is eligible for voice-memo
 * delivery, judged by the file extension (.mp3/.caf) or, failing that, by the
 * provider-reported output format (mp3/caf or the matching MIME types).
 * Matching is case-insensitive and tolerant of surrounding whitespace.
 */
function supportsAudioFileVoiceMemoOutput(params: {
  fileExtension?: string;
  outputFormat?: string;
}): boolean {
  const ext = params.fileExtension?.trim().toLowerCase();
  if (ext !== undefined && [".mp3", ".caf"].includes(ext)) {
    return true;
  }
  const format = params.outputFormat?.trim().toLowerCase();
  return (
    format !== undefined &&
    ["mp3", "caf", "audio/mpeg", "audio/x-caf"].includes(format)
  );
}
function shouldDeliverTtsAsVoice(params: {
channel: string | undefined;
target: "audio-file" | "voice-note" | undefined;
voiceCompatible: boolean | undefined;
fileExtension?: string;
outputFormat?: string;
}): boolean {
if (!supportsNativeVoiceNoteTts(params.channel) || params.target !== "voice-note") {
const channelId = resolveChannelId(params.channel);
if (channelId === null || !supportsNativeVoiceNoteTts(channelId)) {
return false;
}
if (AUDIO_FILE_VOICE_MEMO_CHANNELS.has(channelId)) {
return params.target === "audio-file" && supportsAudioFileVoiceMemoOutput(params);
}
if (params.target !== "voice-note") {
return false;
}
return params.voiceCompatible === true || supportsTranscodedVoiceNoteTts(params.channel);
@@ -1032,6 +1072,8 @@ export async function textToSpeech(params: {
channel: params.channel,
target: synthesis.target,
voiceCompatible: synthesis.voiceCompatible,
fileExtension: synthesis.fileExtension,
outputFormat: synthesis.outputFormat,
}),
target: synthesis.target,
};
@@ -1061,7 +1103,7 @@ export async function synthesizeSpeech(params: {
const { config, persona, providers } = setup;
const timeoutMs = params.timeoutMs ?? config.timeoutMs;
const target = supportsNativeVoiceNoteTts(params.channel) ? "voice-note" : "audio-file";
const target = resolveTtsSynthesisTarget(params.channel);
const errors: string[] = [];
const attemptedProviders: string[] = [];
@@ -1499,6 +1541,7 @@ export const _test = {
resolveModelOverridePolicy,
supportsNativeVoiceNoteTts,
supportsTranscodedVoiceNoteTts,
resolveTtsSynthesisTarget,
shouldDeliverTtsAsVoice,
summarizeText,
getResolvedSpeechProviderConfig,

View File

@@ -69,6 +69,7 @@ function expectTranscriptRendered(
expect(ctx?.ctxPayload?.BodyForAgent).toBe(framed);
expect(ctx?.ctxPayload?.Body).toContain(framed);
expect(ctx?.ctxPayload?.Body).not.toContain("<media:audio>");
expect(ctx?.ctxPayload?.MediaTranscribedIndexes).toEqual([0]);
}
function expectAudioPlaceholderRendered(ctx: Awaited<ReturnType<typeof buildGroupVoiceContext>>) {

View File

@@ -73,6 +73,7 @@ export type TelegramInboundBodyResult = {
effectiveWasMentioned: boolean;
canDetectMention: boolean;
shouldBypassMention: boolean;
audioTranscribedMediaIndex?: number;
stickerCacheHit: boolean;
locationData?: NormalizedLocation;
};
@@ -230,6 +231,10 @@ export async function resolveTelegramInboundBody(params: {
logVerbose(`telegram: audio preflight transcription failed: ${String(err)}`);
}
}
const audioTranscribedMediaIndex =
preflightTranscript === undefined
? undefined
: allMedia.findIndex((media) => media.contentType?.startsWith("audio/"));
if (hasAudio && bodyText === "<media:audio>" && preflightTranscript) {
bodyText = formatAudioTranscriptForAgent(preflightTranscript);
@@ -363,6 +368,9 @@ export async function resolveTelegramInboundBody(params: {
effectiveWasMentioned,
canDetectMention,
shouldBypassMention: mentionDecision.shouldBypassMention,
...(audioTranscribedMediaIndex !== undefined && audioTranscribedMediaIndex >= 0
? { audioTranscribedMediaIndex }
: {}),
stickerCacheHit,
locationData: locationData ?? undefined,
};

View File

@@ -111,6 +111,7 @@ export async function buildTelegramInboundContextPayload(params: {
topicConfig?: TelegramTopicConfig;
stickerCacheHit: boolean;
effectiveWasMentioned: boolean;
audioTranscribedMediaIndex?: number;
commandAuthorized: boolean;
locationData?: NormalizedLocation;
options?: TelegramMessageContextOptions;
@@ -146,6 +147,7 @@ export async function buildTelegramInboundContextPayload(params: {
topicConfig,
stickerCacheHit,
effectiveWasMentioned,
audioTranscribedMediaIndex,
commandAuthorized,
locationData,
options,
@@ -372,6 +374,9 @@ export async function buildTelegramInboundContextPayload(params: {
contextMedia.length > 0
? (contextMedia.map((m) => m.contentType).filter(Boolean) as string[])
: undefined,
...(audioTranscribedMediaIndex !== undefined
? { MediaTranscribedIndexes: [audioTranscribedMediaIndex] }
: {}),
Sticker: allMedia[0]?.stickerMetadata,
StickerMediaIncluded: allMedia[0]?.stickerMetadata ? !stickerCacheHit : undefined,
...(locationData ? toLocationContext(locationData) : undefined),

View File

@@ -578,6 +578,9 @@ export const buildTelegramMessageContext = async ({
topicConfig,
stickerCacheHit: bodyResult.stickerCacheHit,
effectiveWasMentioned: bodyResult.effectiveWasMentioned,
...(bodyResult.audioTranscribedMediaIndex !== undefined
? { audioTranscribedMediaIndex: bodyResult.audioTranscribedMediaIndex }
: {}),
locationData: bodyResult.locationData,
options,
dmAllowFrom,

View File

@@ -0,0 +1,103 @@
import { beforeEach, describe, expect, it, vi } from "vitest";
vi.mock("./group-activation.js", () => ({
resolveGroupActivationFor: vi.fn(async () => "mention"),
}));
import type { MentionConfig } from "../mentions.js";
import type { WebInboundMsg } from "../types.js";
import { applyGroupGating, type GroupHistoryEntry } from "./group-gating.js";
/**
 * Builds the canonical inbound fixture for these tests: a WhatsApp group
 * voice note whose body is only the "<media:audio>" placeholder.
 */
function makeGroupAudioMsg(): WebInboundMsg {
  const groupJid = "1203630@g.us";
  const fixture = {
    id: "msg-1",
    from: groupJid,
    to: "+15550000001",
    body: "<media:audio>",
    chatId: groupJid,
    chatType: "group",
    conversationId: groupJid,
    mediaType: "audio/ogg; codecs=opus",
    mediaPath: "/tmp/voice.ogg",
    timestamp: 1700000000,
    accountId: "default",
    sender: { e164: "+15550000002", name: "Alice" },
  };
  return fixture as WebInboundMsg;
}
/**
 * Builds baseline applyGroupGating params: an open-policy WhatsApp group with
 * an "openclaw" mention pattern, fresh mock loggers, and the caller-provided
 * message and group-history map. Tests spread this and override per case.
 */
function makeParams(msg: WebInboundMsg, groupHistories: Map<string, GroupHistoryEntry[]>) {
  // Minimal config shape; cast via `never` because the real loadConfig type
  // is far wider than what gating reads here.
  const cfg = {
    channels: {
      whatsapp: {
        groupPolicy: "open",
      },
    },
    messages: {
      groupChat: {
        mentionPatterns: ["\\bopenclaw\\b"],
      },
    },
  } as never;
  const baseMentionConfig = { mentionRegexes: [/\bopenclaw\b/i] } satisfies MentionConfig;
  return {
    cfg,
    msg,
    conversationId: "1203630@g.us",
    groupHistoryKey: "whatsapp:group:1203630",
    agentId: "main",
    sessionKey: "agent:main:whatsapp:group:1203630",
    baseMentionConfig,
    groupHistories,
    groupHistoryLimit: 20,
    groupMemberNames: new Map<string, Map<string, string>>(),
    logVerbose: vi.fn(),
    replyLogger: { debug: vi.fn() },
  };
}
// Covers group mention gating around voice-note preflight: gating can be asked
// to defer a missing mention until a transcript exists, and a supplied
// mentionText (the transcript) is what gets matched and, on a miss, stored in
// pending history instead of the "<media:audio>" placeholder.
describe("applyGroupGating audio preflight mention text", () => {
  let groupHistories: Map<string, GroupHistoryEntry[]>;
  beforeEach(() => {
    // Fresh history map per test so stored-entry assertions stay isolated.
    groupHistories = new Map();
  });
  it("defers a missing mention without storing placeholder history", async () => {
    const msg = makeGroupAudioMsg();
    const result = await applyGroupGating({
      ...makeParams(msg, groupHistories),
      deferMissingMention: true,
    });
    // needsMentionText signals the caller to run STT and retry gating;
    // nothing is written to history yet.
    expect(result).toEqual({ shouldProcess: false, needsMentionText: true });
    expect(groupHistories.get("whatsapp:group:1203630")).toBeUndefined();
  });
  it("accepts voice transcript text that satisfies mention gating", async () => {
    const msg = makeGroupAudioMsg();
    const result = await applyGroupGating({
      ...makeParams(msg, groupHistories),
      mentionText: "openclaw please summarize the thread",
    });
    // Transcript contains the mention pattern, so the message processes and
    // the mention flag is set on the inbound message.
    expect(result).toEqual({ shouldProcess: true });
    expect(msg.wasMentioned).toBe(true);
    expect(groupHistories.get("whatsapp:group:1203630")).toBeUndefined();
  });
  it("stores transcript text instead of the audio placeholder when mention is still missing", async () => {
    const msg = makeGroupAudioMsg();
    const result = await applyGroupGating({
      ...makeParams(msg, groupHistories),
      mentionText: "please summarize the thread",
    });
    expect(result).toEqual({ shouldProcess: false });
    // Pending history keeps the human-readable transcript, not "<media:audio>".
    expect(groupHistories.get("whatsapp:group:1203630")).toEqual([
      expect.objectContaining({
        body: "please summarize the thread",
      }),
    ]);
  });
});

View File

@@ -33,6 +33,8 @@ export type GroupHistoryEntry = {
type ApplyGroupGatingParams = {
cfg: ReturnType<typeof loadConfig>;
msg: WebInboundMsg;
mentionText?: string;
deferMissingMention?: boolean;
conversationId: string;
groupHistoryKey: string;
agentId: string;
@@ -58,6 +60,7 @@ function isOwnerSender(baseMentionConfig: MentionConfig, msg: WebInboundMsg) {
function recordPendingGroupHistoryEntry(params: {
msg: WebInboundMsg;
body?: string;
groupHistories: Map<string, GroupHistoryEntry[]>;
groupHistoryKey: string;
groupHistoryLimit: number;
@@ -76,7 +79,7 @@ function recordPendingGroupHistoryEntry(params: {
limit: params.groupHistoryLimit,
entry: {
sender,
body: params.msg.body,
body: params.body ?? params.msg.body,
timestamp: params.msg.timestamp,
id: params.msg.id,
senderJid: senderIdentity.jid ?? params.msg.senderJid,
@@ -84,10 +87,15 @@ function recordPendingGroupHistoryEntry(params: {
});
}
function skipGroupMessageAndStoreHistory(params: ApplyGroupGatingParams, verboseMessage: string) {
function skipGroupMessageAndStoreHistory(
params: ApplyGroupGatingParams,
verboseMessage: string,
body?: string,
) {
params.logVerbose(verboseMessage);
recordPendingGroupHistoryEntry({
msg: params.msg,
body,
groupHistories: params.groupHistories,
groupHistoryKey: params.groupHistoryKey,
groupHistoryLimit: params.groupHistoryLimit,
@@ -126,8 +134,10 @@ export async function applyGroupGating(params: ApplyGroupGatingParams) {
...buildMentionConfig(params.cfg, params.agentId),
allowFrom: inboundPolicy.configuredAllowFrom,
};
const mentionMsg =
params.mentionText !== undefined ? { ...params.msg, body: params.mentionText } : params.msg;
const commandBody = stripMentionsForCommand(
params.msg.body,
mentionMsg.body,
mentionConfig.mentionRegexes,
self.e164,
);
@@ -142,7 +152,7 @@ export async function applyGroupGating(params: ApplyGroupGatingParams) {
);
}
const mentionDebug = debugMention(params.msg, mentionConfig, params.authDir);
const mentionDebug = debugMention(mentionMsg, mentionConfig, params.authDir);
params.replyLogger.debug(
{
conversationId: params.conversationId,
@@ -190,9 +200,16 @@ export async function applyGroupGating(params: ApplyGroupGatingParams) {
const effectiveWasMentioned = mentionDecision.effectiveWasMentioned || shouldBypassMention;
params.msg.wasMentioned = effectiveWasMentioned;
if (!shouldBypassMention && requireMention && mentionDecision.shouldSkip) {
if (params.deferMissingMention === true) {
params.logVerbose(
`Deferring group mention skip until audio preflight completes in ${params.conversationId}`,
);
return { shouldProcess: false, needsMentionText: true } as const;
}
return skipGroupMessageAndStoreHistory(
params,
`Group message stored for context (no mention detected) in ${params.conversationId}: ${params.msg.body}`,
`Group message stored for context (no mention detected) in ${params.conversationId}: ${mentionMsg.body}`,
params.mentionText,
);
}

View File

@@ -102,6 +102,7 @@ export function buildWhatsAppInboundContext(params: {
route: ReturnType<typeof resolveAgentRoute>;
sender: SenderContext;
transcript?: string;
mediaTranscribedIndexes?: number[];
replyThreading?: ReplyThreadingContext;
visibleReplyTo?: VisibleReplyTarget;
}) {
@@ -132,6 +133,7 @@ export function buildWhatsAppInboundContext(params: {
MediaPath: params.msg.mediaPath,
MediaUrl: params.msg.mediaUrl,
MediaType: params.msg.mediaType,
MediaTranscribedIndexes: params.mediaTranscribedIndexes,
ChatType: params.msg.chatType,
Timestamp: params.msg.timestamp,
ConversationLabel: params.msg.chatType === "group" ? params.conversationId : params.msg.from,

View File

@@ -10,6 +10,7 @@ const ackReactionHandle = {
ackReactionValue: "👀",
remove: vi.fn(async () => undefined),
};
const applyGroupGatingMock = vi.fn();
vi.mock("./audio-preflight.runtime.js", () => ({
transcribeFirstAudio: (...args: unknown[]) => transcribeFirstAudioMock(...args),
@@ -28,7 +29,7 @@ vi.mock("./broadcast.js", () => ({
}));
vi.mock("./group-gating.js", () => ({
applyGroupGating: vi.fn(async () => ({ shouldProcess: true })),
applyGroupGating: (...args: unknown[]) => applyGroupGatingMock(...args),
}));
vi.mock("./last-route.js", () => ({
@@ -127,6 +128,8 @@ describe("createWebOnMessageHandler audio preflight", () => {
});
processMessageMock.mockReset();
processMessageMock.mockResolvedValue(true);
applyGroupGatingMock.mockReset();
applyGroupGatingMock.mockResolvedValue({ shouldProcess: true });
});
it("sends ack reaction before audio preflight for voice notes", async () => {
@@ -258,4 +261,94 @@ describe("createWebOnMessageHandler audio preflight", () => {
expect(events).toEqual(["ack", "stt"]);
expect(processMessageMock).not.toHaveBeenCalled();
});
it("uses group voice transcript for mention gating before dispatch", async () => {
applyGroupGatingMock
.mockResolvedValueOnce({ shouldProcess: false, needsMentionText: true })
.mockResolvedValueOnce({ shouldProcess: true });
const handler = createWebOnMessageHandler({
cfg: {
channels: {
whatsapp: {
ackReaction: { enabled: true },
},
},
} as never,
verbose: false,
connectionId: "conn-1",
maxMediaBytes: 1024 * 1024,
groupHistoryLimit: 20,
groupHistories: new Map(),
groupMemberNames: new Map(),
echoTracker: makeEchoTracker() as never,
backgroundTasks: new Set(),
replyResolver: vi.fn() as never,
replyLogger: {
info: () => {},
warn: () => {},
debug: () => {},
error: () => {},
} as never,
baseMentionConfig: {} as never,
account: { authDir: "/tmp/auth", accountId: "default" },
});
await handler(makeGroupAudioMsg());
expect(applyGroupGatingMock).toHaveBeenNthCalledWith(
1,
expect.objectContaining({
deferMissingMention: true,
}),
);
expect(events).toEqual(["ack", "stt"]);
expect(applyGroupGatingMock).toHaveBeenNthCalledWith(
2,
expect.objectContaining({
mentionText: "transcribed voice note",
}),
);
expect(processMessageMock).toHaveBeenCalledWith(
expect.objectContaining({
preflightAudioTranscript: "transcribed voice note",
ackAlreadySent: true,
}),
);
});
it("does not transcribe group voice when policy gating rejects before mention", async () => {
applyGroupGatingMock.mockResolvedValueOnce({ shouldProcess: false });
const handler = createWebOnMessageHandler({
cfg: {
channels: {
whatsapp: {
ackReaction: { enabled: true },
},
},
} as never,
verbose: false,
connectionId: "conn-1",
maxMediaBytes: 1024 * 1024,
groupHistoryLimit: 20,
groupHistories: new Map(),
groupMemberNames: new Map(),
echoTracker: makeEchoTracker() as never,
backgroundTasks: new Set(),
replyResolver: vi.fn() as never,
replyLogger: {
info: () => {},
warn: () => {},
debug: () => {},
error: () => {},
} as never,
baseMentionConfig: {} as never,
account: { authDir: "/tmp/auth", accountId: "default" },
});
await handler(makeGroupAudioMsg());
expect(transcribeFirstAudioMock).not.toHaveBeenCalled();
expect(maybeSendAckReactionMock).not.toHaveBeenCalled();
expect(processMessageMock).not.toHaveBeenCalled();
});
});

View File

@@ -119,6 +119,58 @@ export function createWebOnMessageHandler(params: {
return;
}
// Preflight audio transcription: run once before broadcast fan-out so all
// agents share the same transcript instead of each making a separate STT call.
// For DMs, only do this on the real inbound path after access-control/pairing
// checks have already passed in inbound/monitor.ts. For groups, the first
// gating pass must approve the group/sender before STT is attempted.
// null = preflight was attempted but produced no transcript (failed / disabled / no audio);
// undefined = preflight was not attempted (non-audio message).
let preflightAudioTranscript: string | null | undefined;
const hasAudioBody =
msg.mediaType?.startsWith("audio/") === true && msg.body === "<media:audio>";
const canRunEarlyAudioPreflight = msg.chatType === "group" || msg.accessControlPassed === true;
let ackAlreadySent = false;
let ackReaction: AckReactionHandle | null = null;
const runAudioPreflightOnce = async () => {
if (
preflightAudioTranscript !== undefined ||
!canRunEarlyAudioPreflight ||
!hasAudioBody ||
!msg.mediaPath
) {
return;
}
ackReaction = await maybeSendAckReaction({
cfg: params.cfg,
msg,
agentId: route.agentId,
sessionKey: route.sessionKey,
conversationId,
verbose: params.verbose,
accountId: route.accountId,
info: params.replyLogger.info.bind(params.replyLogger),
warn: params.replyLogger.warn.bind(params.replyLogger),
});
ackAlreadySent = ackReaction !== null;
try {
const { transcribeFirstAudio } = await import("./audio-preflight.runtime.js");
// transcribeFirstAudio returns undefined on failure/disabled; store null so
// processMessage knows the attempt was already made and does not retry.
preflightAudioTranscript =
(await transcribeFirstAudio({
ctx: {
MediaPaths: [msg.mediaPath],
MediaTypes: msg.mediaType ? [msg.mediaType] : undefined,
},
cfg: params.cfg,
})) ?? null;
} catch {
// Non-fatal: store null so per-agent retries are suppressed.
preflightAudioTranscript = null;
}
};
if (msg.chatType === "group") {
const sender = getSenderIdentity(msg);
const metaCtx = {
@@ -149,9 +201,10 @@ export function createWebOnMessageHandler(params: {
warn: params.replyLogger.warn.bind(params.replyLogger),
});
const gating = await applyGroupGating({
let gating = await applyGroupGating({
cfg: params.cfg,
msg,
deferMissingMention: hasAudioBody && Boolean(msg.mediaPath),
conversationId,
groupHistoryKey,
agentId: route.agentId,
@@ -165,6 +218,32 @@ export function createWebOnMessageHandler(params: {
logVerbose,
replyLogger: params.replyLogger,
});
if (
!gating.shouldProcess &&
"needsMentionText" in gating &&
gating.needsMentionText === true
) {
await runAudioPreflightOnce();
gating = await applyGroupGating({
cfg: params.cfg,
msg,
...(typeof preflightAudioTranscript === "string"
? { mentionText: preflightAudioTranscript }
: {}),
conversationId,
groupHistoryKey,
agentId: route.agentId,
sessionKey: route.sessionKey,
baseMentionConfig: params.baseMentionConfig,
authDir: params.account.authDir,
selfChatMode: params.account.selfChatMode,
groupHistories: params.groupHistories,
groupHistoryLimit: params.groupHistoryLimit,
groupMemberNames: params.groupMemberNames,
logVerbose,
replyLogger: params.replyLogger,
});
}
if (!gating.shouldProcess) {
return;
}
@@ -179,49 +258,7 @@ export function createWebOnMessageHandler(params: {
}
}
// Preflight audio transcription: run once here, before broadcast fan-out, so
// all agents share the same transcript instead of each making a separate STT call.
// For DMs, only do this on the real inbound path after access-control/pairing
// checks have already passed in inbound/monitor.ts. That keeps external STT and
// early ack feedback behind the same auth-first gate as the rest of DM handling.
// null = preflight was attempted but produced no transcript (failed / disabled / no audio);
// undefined = preflight was not attempted (non-audio message).
let preflightAudioTranscript: string | null | undefined;
const hasAudioBody =
msg.mediaType?.startsWith("audio/") === true && msg.body === "<media:audio>";
const canRunEarlyDmPreflight = msg.chatType === "group" || msg.accessControlPassed === true;
let ackAlreadySent = false;
let ackReaction: AckReactionHandle | null = null;
if (canRunEarlyDmPreflight && hasAudioBody && msg.mediaPath) {
ackReaction = await maybeSendAckReaction({
cfg: params.cfg,
msg,
agentId: route.agentId,
sessionKey: route.sessionKey,
conversationId,
verbose: params.verbose,
accountId: route.accountId,
info: params.replyLogger.info.bind(params.replyLogger),
warn: params.replyLogger.warn.bind(params.replyLogger),
});
ackAlreadySent = ackReaction !== null;
try {
const { transcribeFirstAudio } = await import("./audio-preflight.runtime.js");
// transcribeFirstAudio returns undefined on failure/disabled; store null so
// processMessage knows the attempt was already made and does not retry.
preflightAudioTranscript =
(await transcribeFirstAudio({
ctx: {
MediaPaths: [msg.mediaPath],
MediaTypes: msg.mediaType ? [msg.mediaType] : undefined,
},
cfg: params.cfg,
})) ?? null;
} catch {
// Non-fatal: store null so per-agent retries are suppressed.
preflightAudioTranscript = null;
}
}
await runAudioPreflightOnce();
// Broadcast groups: when we'd reply anyway, run multiple agents.
// Does not bypass group mention/activation gating above.

View File

@@ -93,6 +93,7 @@ vi.mock("./inbound-dispatch.js", () => ({
commandAuthorized?: boolean;
commandBody?: string;
msg: { body: string; mediaPath?: string; mediaType?: string };
mediaTranscribedIndexes?: number[];
rawBody?: string;
transcript?: string;
}) => ({
@@ -102,6 +103,7 @@ vi.mock("./inbound-dispatch.js", () => ({
CommandBody: params.commandBody ?? params.msg.body,
MediaPath: params.msg.mediaPath,
MediaType: params.msg.mediaType,
MediaTranscribedIndexes: params.mediaTranscribedIndexes,
RawBody: params.rawBody ?? params.msg.body,
Transcript: params.transcript,
}),
@@ -230,6 +232,7 @@ describe("processMessage audio preflight transcription", () => {
CommandBody: "<media:audio>",
RawBody: "<media:audio>",
Transcript: "okay let's test this voice message",
MediaTranscribedIndexes: [0],
});
// mediaPath and mediaType must be preserved so inboundAudio detection (used by
// features like messages.tts.auto: "inbound") still recognises this as audio.
@@ -315,6 +318,7 @@ describe("processMessage audio preflight transcription", () => {
CommandBody: "<media:audio>",
RawBody: "<media:audio>",
Transcript: "/new start a new session",
MediaTranscribedIndexes: [0],
});
});
@@ -335,6 +339,7 @@ describe("processMessage audio preflight transcription", () => {
CommandBody: "<media:audio>",
RawBody: "<media:audio>",
Transcript: "pre-computed transcript from fan-out caller",
MediaTranscribedIndexes: [0],
});
});

View File

@@ -257,10 +257,8 @@ export async function processMessage(params: {
// If we have a transcript, replace the agent-facing body so the agent sees the spoken text.
// mediaPath and mediaType are intentionally preserved so that inboundAudio detection
// (used by features such as messages.tts.auto: "inbound") still sees this as an
// audio message. The transcript is also stored in Transcript so downstream pipelines
// can detect it. Preventing a second STT pass in the media-understanding pipeline
// requires SDK-level support (alreadyTranscribed on a shared attachment instance);
// that is a shared concern across all channels and is tracked separately.
// audio message. The transcript and transcribed media index are also stored on
// context so downstream media understanding does not transcribe it again.
const msgForAgent =
audioTranscript !== undefined ? { ...params.msg, body: audioTranscript } : params.msg;
@@ -429,6 +427,7 @@ export async function processMessage(params: {
e164: sender.e164 ?? undefined,
},
...(audioTranscript !== undefined ? { transcript: audioTranscript } : {}),
...(audioTranscript !== undefined ? { mediaTranscribedIndexes: [0] } : {}),
replyThreading,
visibleReplyTo: visibleReplyTo ?? undefined,
});

View File

@@ -117,6 +117,8 @@ export type MsgContext = {
MediaPaths?: string[];
MediaUrls?: string[];
MediaTypes?: string[];
/** Attachment indexes whose audio was already transcribed before media understanding runs. */
MediaTranscribedIndexes?: number[];
/** Telegram sticker metadata (emoji, set name, file IDs, cached description). */
Sticker?: StickerContextMetadata;
/** True when current-turn sticker media is present in MediaPaths (false for cached-description path). */

View File

@@ -984,6 +984,51 @@ describe("applyMediaUnderstanding", () => {
expect(ctx.Transcript).toBe("fallback transcript");
});
it("skips audio STT for attachments marked transcribed by channel preflight", async () => {
  // Attachment index 0 already carries a channel-supplied transcript
  // (MediaTranscribedIndexes: [0]), so media understanding must not invoke
  // the STT provider a second time for the same inbound audio.
  const dir = await createTempMediaDir();
  const audioPath = path.join(dir, "voice.ogg");
  await fs.writeFile(audioPath, createSafeAudioFixtureBuffer(2048));
  const transcribeAudio = vi.fn(async () => ({ text: "duplicate transcript" }));
  const cfg: OpenClawConfig = {
    tools: {
      media: {
        audio: { enabled: true, models: [{ provider: "groq" }] },
      },
    },
  };
  const ctx: MsgContext = {
    Body: "preflight transcript",
    Transcript: "preflight transcript",
    MediaPath: audioPath,
    MediaType: "audio/ogg",
    MediaTranscribedIndexes: [0],
  };
  const result = await applyMediaUnderstanding({
    ctx,
    cfg,
    providers: { groq: { id: "groq", transcribeAudio } },
  });
  // No second STT pass, no applied-audio result, and the preflight transcript survives.
  expect(transcribeAudio).not.toHaveBeenCalled();
  expect(result.appliedAudio).toBe(false);
  expect(ctx.Transcript).toBe("preflight transcript");
  expect(result.decisions).toContainEqual(
    expect.objectContaining({ capability: "audio", outcome: "no-attachment" }),
  );
});
it("handles multiple audio attachments when attachment mode is all", async () => {
const dir = await createTempMediaDir();
const audioBytes = createSafeAudioFixtureBuffer(2048);

View File

@@ -28,6 +28,11 @@ export function normalizeAttachments(ctx: MsgContext): MediaAttachment[] {
const pathsFromArray = Array.isArray(ctx.MediaPaths) ? ctx.MediaPaths : undefined;
const urlsFromArray = Array.isArray(ctx.MediaUrls) ? ctx.MediaUrls : undefined;
const typesFromArray = Array.isArray(ctx.MediaTypes) ? ctx.MediaTypes : undefined;
const transcribedIndexes = new Set(
Array.isArray(ctx.MediaTranscribedIndexes)
? ctx.MediaTranscribedIndexes.filter((index) => Number.isInteger(index) && index >= 0)
: [],
);
const resolveMime = (count: number, index: number) => {
const typeHint = normalizeOptionalString(typesFromArray?.[index]);
if (typeHint) {
@@ -45,6 +50,7 @@ export function normalizeAttachments(ctx: MsgContext): MediaAttachment[] {
url: urls?.[index] ?? ctx.MediaUrl,
mime: resolveMime(count, index),
index,
alreadyTranscribed: transcribedIndexes.has(index),
}))
.filter((entry) => Boolean(entry.path ?? normalizeOptionalString(entry.url)));
}
@@ -57,6 +63,7 @@ export function normalizeAttachments(ctx: MsgContext): MediaAttachment[] {
url: normalizeOptionalString(value),
mime: resolveMime(count, index),
index,
alreadyTranscribed: transcribedIndexes.has(index),
}))
.filter((entry) => Boolean(entry.url));
}
@@ -72,6 +79,7 @@ export function normalizeAttachments(ctx: MsgContext): MediaAttachment[] {
url: url || undefined,
mime: ctx.MediaType,
index: 0,
alreadyTranscribed: transcribedIndexes.has(0),
},
];
}