From 6a67f6556885d376aca2aa1283b540bf485416c5 Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Sun, 26 Apr 2026 05:29:24 +0100 Subject: [PATCH] fix(voice): reuse preflight transcripts across channels --- CHANGELOG.md | 8 ++ docs/channels/bluebubbles.md | 2 + docs/channels/whatsapp.md | 6 + docs/tools/media-overview.md | 4 + extensions/bluebubbles/src/channel.ts | 3 +- extensions/bluebubbles/src/media-send.test.ts | 23 ++++ .../bluebubbles/src/monitor-processing.ts | 1 + .../monitor/message-handler.preflight.test.ts | 2 + .../src/monitor/message-handler.preflight.ts | 1 + .../message-handler.preflight.types.ts | 1 + .../monitor/message-handler.process.test.ts | 59 ++++++++- .../src/monitor/message-handler.process.ts | 13 +- extensions/feishu/src/bot.test.ts | 1 + extensions/feishu/src/bot.ts | 5 + extensions/speech-core/src/tts.test.ts | 36 ++++- extensions/speech-core/src/tts.ts | 49 ++++++- ...e-context.audio-transcript.test-support.ts | 1 + .../telegram/src/bot-message-context.body.ts | 8 ++ .../src/bot-message-context.session.ts | 5 + .../telegram/src/bot-message-context.ts | 3 + .../group-gating.audio-preflight.test.ts | 103 +++++++++++++++ .../src/auto-reply/monitor/group-gating.ts | 27 +++- .../auto-reply/monitor/inbound-dispatch.ts | 2 + .../on-message.audio-preflight.test.ts | 95 ++++++++++++- .../src/auto-reply/monitor/on-message.ts | 125 ++++++++++++------ .../process-message.audio-preflight.test.ts | 5 + .../src/auto-reply/monitor/process-message.ts | 7 +- src/auto-reply/templating.ts | 2 + src/media-understanding/apply.test.ts | 45 +++++++ .../attachments.normalize.ts | 8 ++ 30 files changed, 586 insertions(+), 64 deletions(-) create mode 100644 extensions/whatsapp/src/auto-reply/monitor/group-gating.audio-preflight.test.ts diff --git a/CHANGELOG.md b/CHANGELOG.md index cc927407b68..1de7e95411a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -96,6 +96,14 @@ Docs: https://docs.openclaw.ai before agent dispatch and keep raw Feishu `file_key` payloads out of message text. Fixes #67120 and #61876. - Tasks: terminalize async Gateway agent task records from the Gateway run result while preserving aborted, failed, and cancelled outcomes instead of leaving completed runs stuck as active or lost. (#71905) Thanks @likewen-tech. +- WhatsApp: let authorized group voice-note transcripts satisfy mention gating + before reply dispatch, while keeping unmentioned transcripts in pending group + history. Fixes #44908. +- Media understanding: carry channel voice-note preflight state into attachment + selection so WhatsApp, Feishu, Telegram, and Discord do not transcribe the + same inbound audio twice. Fixes #70580. +- TTS/BlueBubbles: deliver compatible auto-TTS audio as iMessage voice memo + bubbles instead of plain MP3/CAF file attachments. Fixes #16848. - ACP: send subagent and async-task completion wakes to external ACP harnesses as plain prompts instead of OpenClaw internal runtime-context envelopes, while keeping those envelopes out of ACP transcripts. diff --git a/docs/channels/bluebubbles.md b/docs/channels/bluebubbles.md index e69b0200a1f..e0947639363 100644 --- a/docs/channels/bluebubbles.md +++ b/docs/channels/bluebubbles.md @@ -21,6 +21,8 @@ need a separate `openclaw plugins install` step. - OpenClaw talks to it through its REST API (`GET /api/v1/ping`, `POST /message/text`, `POST /chat/:id/*`). - Incoming messages arrive via webhooks; outgoing replies, typing indicators, read receipts, and tapbacks are REST calls. - Attachments and stickers are ingested as inbound media (and surfaced to the agent when possible). +- Auto-TTS replies that synthesize MP3 or CAF audio are delivered as iMessage + voice memo bubbles instead of plain file attachments. - Pairing/allowlist works the same way as other channels (`/channels/pairing` etc) with `channels.bluebubbles.allowFrom` + pairing codes. - Reactions are surfaced as system events just like Slack/Telegram so agents can "mention" them before replying. - Advanced features: edit, unsend, reply threading, message effects, group management. diff --git a/docs/channels/whatsapp.md b/docs/channels/whatsapp.md index 022ef1062a3..13092af45c8 100644 --- a/docs/channels/whatsapp.md +++ b/docs/channels/whatsapp.md @@ -244,6 +244,7 @@ content and identifiers. - explicit WhatsApp mentions of the bot identity - configured mention regex patterns (`agents.list[].groupChat.mentionPatterns`, fallback `messages.groupChat.mentionPatterns`) + - inbound voice-note transcripts for authorized group messages - implicit reply-to-bot detection (reply sender matches bot identity) Security note: @@ -296,6 +297,11 @@ When the linked self number is also present in `allowFrom`, WhatsApp self-chat s - `` - `` + Authorized group voice notes are transcribed before mention gating when the + body is only ``, so saying the bot mention in the voice note can + trigger the reply. If the transcript still does not mention the bot, the + transcript is kept in pending group history instead of the raw placeholder. + Location bodies use terse coordinate text. Location labels/comments and contact/vCard details are rendered as fenced untrusted metadata, not inline prompt text. diff --git a/docs/tools/media-overview.md b/docs/tools/media-overview.md index d79cc887cc7..1e29a92e5fa 100644 --- a/docs/tools/media-overview.md +++ b/docs/tools/media-overview.md @@ -58,6 +58,10 @@ Video and music generation run as background tasks because provider processing t Deepgram, ElevenLabs, Mistral, OpenAI, SenseAudio, and xAI can all transcribe inbound audio through the batch `tools.media.audio` path when configured. +Channel plugins that preflight a voice note for mention gating or command +parsing mark the transcribed attachment on the inbound context, so the shared +media-understanding pass reuses that transcript instead of making a second STT +call for the same audio. Deepgram, ElevenLabs, Mistral, OpenAI, and xAI also register Voice Call streaming STT providers, so live phone audio can be forwarded to the selected vendor without waiting for a completed recording. diff --git a/extensions/bluebubbles/src/channel.ts b/extensions/bluebubbles/src/channel.ts index 3166313fc3e..3f01fa8d39b 100644 --- a/extensions/bluebubbles/src/channel.ts +++ b/extensions/bluebubbles/src/channel.ts @@ -330,7 +330,7 @@ export const bluebubblesPlugin: ChannelPlugin { const runtime = await loadBlueBubblesChannelRuntime(); - const { cfg, to, text, mediaUrl, accountId, replyToId } = ctx; + const { cfg, to, text, mediaUrl, accountId, replyToId, audioAsVoice } = ctx; const { mediaPath, mediaBuffer, contentType, filename, caption } = ctx as { mediaPath?: string; mediaBuffer?: Uint8Array; @@ -349,6 +349,7 @@ export const bluebubblesPlugin: ChannelPlugin { ); expect(sendBlueBubblesAttachmentMock).toHaveBeenCalledTimes(1); }); + + it("passes asVoice through to attachment delivery", async () => { + runtimeMocks.fetchRemoteMedia.mockResolvedValueOnce({ + buffer: new Uint8Array([1, 2, 3]), + contentType: "audio/mpeg", + fileName: "voice.mp3", + }); + + await sendBlueBubblesMedia({ + cfg: createConfig(), + to: "chat:123", + mediaUrl: "https://example.com/voice.mp3", + asVoice: true, + }); + + expect(sendBlueBubblesAttachmentMock).toHaveBeenCalledWith( + expect.objectContaining({ + asVoice: true, + contentType: "audio/mpeg", + filename: "voice.mp3", + }), + ); + }); }); diff --git a/extensions/bluebubbles/src/monitor-processing.ts b/extensions/bluebubbles/src/monitor-processing.ts index 810096fddf3..0d2c69d6d6e 100644 --- a/extensions/bluebubbles/src/monitor-processing.ts +++ b/extensions/bluebubbles/src/monitor-processing.ts @@ -1686,6 +1686,7 @@ async function processMessageAfterDedupe( caption: caption ?? undefined, replyToId: replyToMessageGuid || null, accountId: account.accountId, + asVoice: payload.audioAsVoice === true, }); } catch (err) { forgetPendingOutboundMessageId(pendingId); diff --git a/extensions/discord/src/monitor/message-handler.preflight.test.ts b/extensions/discord/src/monitor/message-handler.preflight.test.ts index d200b913a77..fd5c937604d 100644 --- a/extensions/discord/src/monitor/message-handler.preflight.test.ts +++ b/extensions/discord/src/monitor/message-handler.preflight.test.ts @@ -405,6 +405,7 @@ describe("preflightDiscordMessage", () => { ); expect(result).not.toBeNull(); expect(result?.isDirectMessage).toBe(true); + expect(result?.preflightAudioTranscript).toBe("hello openclaw from dm audio"); }); it("falls back to the default discord account for omitted-account dm authorization", async () => { @@ -1096,6 +1097,7 @@ describe("preflightDiscordMessage", () => { ); expect(result).not.toBeNull(); expect(result?.wasMentioned).toBe(true); + expect(result?.preflightAudioTranscript).toBe("hey openclaw"); }); it("does not transcribe guild audio from unauthorized members", async () => { diff --git a/extensions/discord/src/monitor/message-handler.preflight.ts b/extensions/discord/src/monitor/message-handler.preflight.ts index f8be60a3103..7324dfc89fc 100644 --- a/extensions/discord/src/monitor/message-handler.preflight.ts +++ b/extensions/discord/src/monitor/message-handler.preflight.ts @@ -1112,6 +1112,7 @@ export async function preflightDiscordMessage( commandAuthorized, baseText, messageText, + ...(preflightTranscript !== undefined ? { preflightAudioTranscript: preflightTranscript } : {}), wasMentioned, route: effectiveRoute, threadBinding, diff --git a/extensions/discord/src/monitor/message-handler.preflight.types.ts b/extensions/discord/src/monitor/message-handler.preflight.types.ts index 10550bbb079..1d980ba8e26 100644 --- a/extensions/discord/src/monitor/message-handler.preflight.types.ts +++ b/extensions/discord/src/monitor/message-handler.preflight.types.ts @@ -56,6 +56,7 @@ export type DiscordMessagePreflightContext = DiscordMessagePreflightSharedFields commandAuthorized: boolean; baseText: string; messageText: string; + preflightAudioTranscript?: string; wasMentioned: boolean; route: ReturnType; diff --git a/extensions/discord/src/monitor/message-handler.process.test.ts b/extensions/discord/src/monitor/message-handler.process.test.ts index 79eb5915c2d..38f62be1c52 100644 --- a/extensions/discord/src/monitor/message-handler.process.test.ts +++ b/extensions/discord/src/monitor/message-handler.process.test.ts @@ -285,11 +285,27 @@ function getLastRouteUpdate(): } function getLastDispatchCtx(): - | { SessionKey?: string; MessageThreadId?: string | number } + | { + BodyForAgent?: string; + CommandBody?: string; + MediaTranscribedIndexes?: number[]; + MessageThreadId?: string | number; + SessionKey?: string; + Transcript?: string; + } | undefined { const callArgs = dispatchInboundMessage.mock.calls.at(-1) as unknown[] | undefined; const params = callArgs?.[0] as - | { ctx?: { SessionKey?: string; MessageThreadId?: string | number } } + | { + ctx?: { + BodyForAgent?: string; + CommandBody?: string; + MediaTranscribedIndexes?: number[]; + MessageThreadId?: string | number; + SessionKey?: string; + Transcript?: string; + }; + } | undefined; return params?.ctx; } @@ -656,6 +672,45 @@ describe("processDiscordMessage ack reactions", () => { }); describe("processDiscordMessage session routing", () => { + it("carries preflight audio transcript into dispatch context and marks media transcribed", async () => { + const fetchImpl = vi.fn( + async () => + new Response(new Uint8Array([1, 2, 3, 4]), { + headers: { "content-type": "audio/ogg" }, + }), + ); + const ctx = await createBaseContext({ + message: { + id: "m-audio-preflight", + channelId: "c1", + content: "", + timestamp: new Date().toISOString(), + attachments: [ + { + id: "att-audio-preflight", + url: "https://cdn.discordapp.com/attachments/voice.ogg", + content_type: "audio/ogg", + filename: "voice.ogg", + }, + ], + }, + baseText: "", + messageText: "", + preflightAudioTranscript: "hello from discord voice", + discordRestFetch: fetchImpl, + mediaMaxBytes: 1024 * 1024, + }); + + await processDiscordMessage(ctx as any); + + expect(getLastDispatchCtx()).toMatchObject({ + BodyForAgent: "hello from discord voice", + CommandBody: "hello from discord voice", + Transcript: "hello from discord voice", + MediaTranscribedIndexes: [0], + }); + }); + it("stores DM lastRoute with user target for direct-session continuity", async () => { const ctx = await createBaseContext({ ...createDirectMessageContextOverrides(), diff --git a/extensions/discord/src/monitor/message-handler.process.ts b/extensions/discord/src/monitor/message-handler.process.ts index 94706759195..868b10352bd 100644 --- a/extensions/discord/src/monitor/message-handler.process.ts +++ b/extensions/discord/src/monitor/message-handler.process.ts @@ -148,6 +148,7 @@ export async function processDiscordMessage( isGroupDm, baseText, messageText, + preflightAudioTranscript, shouldRequireMention, canDetectMention, effectiveWasMentioned, @@ -428,6 +429,10 @@ export async function processDiscordMessage( } } const mediaPayload = buildDiscordMediaPayload(mediaList); + const preflightAudioIndex = + preflightAudioTranscript === undefined + ? -1 + : mediaList.findIndex((media) => media.contentType?.startsWith("audio/")); const threadKeys = resolveThreadSessionKeys({ baseSessionKey, threadId: threadChannel ? messageChannelId : undefined, @@ -487,10 +492,11 @@ export async function processDiscordMessage( const ctxPayload = finalizeInboundContext({ Body: combinedBody, - BodyForAgent: baseText ?? text, + BodyForAgent: preflightAudioTranscript ?? baseText ?? text, InboundHistory: inboundHistory, - RawBody: baseText, - CommandBody: baseText, + RawBody: preflightAudioTranscript ?? baseText, + CommandBody: preflightAudioTranscript ?? baseText, + ...(preflightAudioTranscript !== undefined ? { Transcript: preflightAudioTranscript } : {}), From: effectiveFrom, To: effectiveTo, SessionKey: boundSessionKey ?? autoThreadContext?.SessionKey ?? threadKeys.sessionKey, @@ -521,6 +527,7 @@ export async function processDiscordMessage( ThreadLabel: threadLabel, Timestamp: resolveTimestampMs(message.timestamp), ...mediaPayload, + ...(preflightAudioIndex >= 0 ? { MediaTranscribedIndexes: [preflightAudioIndex] } : {}), CommandAuthorized: commandAuthorized, CommandSource: "text" as const, // Originating channel for reply routing. diff --git a/extensions/feishu/src/bot.test.ts b/extensions/feishu/src/bot.test.ts index 9e4b966ad0c..87889d5e50c 100644 --- a/extensions/feishu/src/bot.test.ts +++ b/extensions/feishu/src/bot.test.ts @@ -1512,6 +1512,7 @@ describe("handleFeishuMessage command authorization", () => { Transcript: "voice transcript", MediaPaths: ["/tmp/inbound-voice.ogg"], MediaTypes: ["audio/ogg"], + MediaTranscribedIndexes: [0], }), ); const finalized = mockFinalizeInboundContext.mock.calls[0]?.[0]; diff --git a/extensions/feishu/src/bot.ts b/extensions/feishu/src/bot.ts index 8b871b72d3d..26691dfd46f 100644 --- a/extensions/feishu/src/bot.ts +++ b/extensions/feishu/src/bot.ts @@ -759,6 +759,10 @@ export async function handleFeishuMessage(params: { chatType: isGroup ? "group" : "direct", log, }); + const preflightAudioIndex = + audioTranscript === undefined + ? -1 + : mediaList.findIndex((media) => media.contentType?.startsWith("audio/")); const agentFacingContent = audioTranscript ?? ctx.content; const agentFacingCtx = audioTranscript === undefined @@ -1078,6 +1082,7 @@ export async function handleFeishuMessage(params: { OriginatingTo: feishuTo, GroupSystemPrompt: isGroup ? normalizeOptionalString(groupConfig?.systemPrompt) : undefined, ...mediaPayload, + ...(preflightAudioIndex >= 0 ? { MediaTranscribedIndexes: [preflightAudioIndex] } : {}), }); }; diff --git a/extensions/speech-core/src/tts.test.ts b/extensions/speech-core/src/tts.test.ts index 9755f554775..c91ece82bda 100644 --- a/extensions/speech-core/src/tts.test.ts +++ b/extensions/speech-core/src/tts.test.ts @@ -68,7 +68,14 @@ const { textToSpeechTelephony, } = await import("./tts.js"); -const nativeVoiceNoteChannels = ["discord", "feishu", "matrix", "telegram", "whatsapp"] as const; +const nativeVoiceNoteChannels = [ + "bluebubbles", + "discord", + "feishu", + "matrix", + "telegram", + "whatsapp", +] as const; function createMockSpeechProvider( id = "mock", @@ -164,6 +171,33 @@ describe("speech-core native voice-note routing", () => { }); }); + it("keeps BlueBubbles synthesis on mp3 audio-file output but delivers it as a voice memo", async () => { + await expectTtsPayloadResult({ + channel: "bluebubbles", + prefsName: "openclaw-speech-core-tts-bluebubbles-mp3-test", + text: "This BlueBubbles reply should be delivered as an iMessage voice memo.", + target: "audio-file", + audioAsVoice: true, + mediaExtension: "mp3", + providerResult: { + audioBuffer: Buffer.from("mp3"), + outputFormat: "mp3", + fileExtension: ".mp3", + voiceCompatible: false, + }, + }); + }); + + it("does not mark unsupported BlueBubbles audio-file output as a voice memo", async () => { + await expectTtsPayloadResult({ + channel: "bluebubbles", + prefsName: "openclaw-speech-core-tts-bluebubbles-ogg-test", + text: "This BlueBubbles reply should stay a regular audio attachment.", + target: "audio-file", + audioAsVoice: undefined, + }); + }); + it.each(["feishu", "whatsapp"] as const)( "marks %s voice-note TTS for channel-side transcoding when provider returns mp3", async (channel) => { diff --git a/extensions/speech-core/src/tts.ts b/extensions/speech-core/src/tts.ts index 44f76e5cc54..23fb38ccc7a 100644 --- a/extensions/speech-core/src/tts.ts +++ b/extensions/speech-core/src/tts.ts @@ -738,8 +738,17 @@ export function setLastTtsAttempt(entry: TtsStatusEntry | undefined): void { lastTtsAttempt = entry; } +const VOICE_DELIVERY_CHANNELS = new Set([ + "bluebubbles", + "telegram", + "feishu", + "whatsapp", + "matrix", + "discord", +]); const OPUS_CHANNELS = new Set(["telegram", "feishu", "whatsapp", "matrix", "discord"]); const TRANSCODED_VOICE_NOTE_CHANNELS = new Set(["feishu", "whatsapp"]); +const AUDIO_FILE_VOICE_MEMO_CHANNELS = new Set(["bluebubbles"]); function resolveChannelId(channel: string | undefined): ChannelId | null { return channel ? normalizeChannelId(channel) : null; @@ -747,7 +756,7 @@ function resolveChannelId(channel: string | undefined): ChannelId | null { function supportsNativeVoiceNoteTts(channel: string | undefined): boolean { const channelId = resolveChannelId(channel); - return channelId !== null && OPUS_CHANNELS.has(channelId); + return channelId !== null && VOICE_DELIVERY_CHANNELS.has(channelId); } function supportsTranscodedVoiceNoteTts(channel: string | undefined): boolean { @@ -755,12 +764,43 @@ function supportsTranscodedVoiceNoteTts(channel: string | undefined): boolean { return channelId !== null && TRANSCODED_VOICE_NOTE_CHANNELS.has(channelId); } +function resolveTtsSynthesisTarget(channel: string | undefined): "audio-file" | "voice-note" { + const channelId = resolveChannelId(channel); + return channelId !== null && OPUS_CHANNELS.has(channelId) ? "voice-note" : "audio-file"; +} + +function supportsAudioFileVoiceMemoOutput(params: { + fileExtension?: string; + outputFormat?: string; +}): boolean { + const extension = params.fileExtension?.trim().toLowerCase(); + if (extension === ".mp3" || extension === ".caf") { + return true; + } + const outputFormat = params.outputFormat?.trim().toLowerCase(); + return ( + outputFormat === "mp3" || + outputFormat === "caf" || + outputFormat === "audio/mpeg" || + outputFormat === "audio/x-caf" + ); +} + function shouldDeliverTtsAsVoice(params: { channel: string | undefined; target: "audio-file" | "voice-note" | undefined; voiceCompatible: boolean | undefined; + fileExtension?: string; + outputFormat?: string; }): boolean { - if (!supportsNativeVoiceNoteTts(params.channel) || params.target !== "voice-note") { + const channelId = resolveChannelId(params.channel); + if (channelId === null || !supportsNativeVoiceNoteTts(channelId)) { + return false; + } + if (AUDIO_FILE_VOICE_MEMO_CHANNELS.has(channelId)) { + return params.target === "audio-file" && supportsAudioFileVoiceMemoOutput(params); + } + if (params.target !== "voice-note") { return false; } return params.voiceCompatible === true || supportsTranscodedVoiceNoteTts(params.channel); @@ -1032,6 +1072,8 @@ export async function textToSpeech(params: { channel: params.channel, target: synthesis.target, voiceCompatible: synthesis.voiceCompatible, + fileExtension: synthesis.fileExtension, + outputFormat: synthesis.outputFormat, }), target: synthesis.target, }; @@ -1061,7 +1103,7 @@ export async function synthesizeSpeech(params: { const { config, persona, providers } = setup; const timeoutMs = params.timeoutMs ?? config.timeoutMs; - const target = supportsNativeVoiceNoteTts(params.channel) ? "voice-note" : "audio-file"; + const target = resolveTtsSynthesisTarget(params.channel); const errors: string[] = []; const attemptedProviders: string[] = []; @@ -1499,6 +1541,7 @@ export const _test = { resolveModelOverridePolicy, supportsNativeVoiceNoteTts, supportsTranscodedVoiceNoteTts, + resolveTtsSynthesisTarget, shouldDeliverTtsAsVoice, summarizeText, getResolvedSpeechProviderConfig, diff --git a/extensions/telegram/src/bot-message-context.audio-transcript.test-support.ts b/extensions/telegram/src/bot-message-context.audio-transcript.test-support.ts index e1ca84ed3df..2f047d1f90b 100644 --- a/extensions/telegram/src/bot-message-context.audio-transcript.test-support.ts +++ b/extensions/telegram/src/bot-message-context.audio-transcript.test-support.ts @@ -69,6 +69,7 @@ function expectTranscriptRendered( expect(ctx?.ctxPayload?.BodyForAgent).toBe(framed); expect(ctx?.ctxPayload?.Body).toContain(framed); expect(ctx?.ctxPayload?.Body).not.toContain(""); + expect(ctx?.ctxPayload?.MediaTranscribedIndexes).toEqual([0]); } function expectAudioPlaceholderRendered(ctx: Awaited>) { diff --git a/extensions/telegram/src/bot-message-context.body.ts b/extensions/telegram/src/bot-message-context.body.ts index deb1cbf66a9..cf6b9610673 100644 --- a/extensions/telegram/src/bot-message-context.body.ts +++ b/extensions/telegram/src/bot-message-context.body.ts @@ -73,6 +73,7 @@ export type TelegramInboundBodyResult = { effectiveWasMentioned: boolean; canDetectMention: boolean; shouldBypassMention: boolean; + audioTranscribedMediaIndex?: number; stickerCacheHit: boolean; locationData?: NormalizedLocation; }; @@ -230,6 +231,10 @@ export async function resolveTelegramInboundBody(params: { logVerbose(`telegram: audio preflight transcription failed: ${String(err)}`); } } + const audioTranscribedMediaIndex = + preflightTranscript === undefined + ? undefined + : allMedia.findIndex((media) => media.contentType?.startsWith("audio/")); if (hasAudio && bodyText === "" && preflightTranscript) { bodyText = formatAudioTranscriptForAgent(preflightTranscript); @@ -363,6 +368,9 @@ export async function resolveTelegramInboundBody(params: { effectiveWasMentioned, canDetectMention, shouldBypassMention: mentionDecision.shouldBypassMention, + ...(audioTranscribedMediaIndex !== undefined && audioTranscribedMediaIndex >= 0 + ? { audioTranscribedMediaIndex } + : {}), stickerCacheHit, locationData: locationData ?? undefined, }; diff --git a/extensions/telegram/src/bot-message-context.session.ts b/extensions/telegram/src/bot-message-context.session.ts index 58c1791ed9b..ca4905946ed 100644 --- a/extensions/telegram/src/bot-message-context.session.ts +++ b/extensions/telegram/src/bot-message-context.session.ts @@ -111,6 +111,7 @@ export async function buildTelegramInboundContextPayload(params: { topicConfig?: TelegramTopicConfig; stickerCacheHit: boolean; effectiveWasMentioned: boolean; + audioTranscribedMediaIndex?: number; commandAuthorized: boolean; locationData?: NormalizedLocation; options?: TelegramMessageContextOptions; @@ -146,6 +147,7 @@ export async function buildTelegramInboundContextPayload(params: { topicConfig, stickerCacheHit, effectiveWasMentioned, + audioTranscribedMediaIndex, commandAuthorized, locationData, options, @@ -372,6 +374,9 @@ export async function buildTelegramInboundContextPayload(params: { contextMedia.length > 0 ? (contextMedia.map((m) => m.contentType).filter(Boolean) as string[]) : undefined, + ...(audioTranscribedMediaIndex !== undefined + ? { MediaTranscribedIndexes: [audioTranscribedMediaIndex] } + : {}), Sticker: allMedia[0]?.stickerMetadata, StickerMediaIncluded: allMedia[0]?.stickerMetadata ? !stickerCacheHit : undefined, ...(locationData ? toLocationContext(locationData) : undefined), diff --git a/extensions/telegram/src/bot-message-context.ts b/extensions/telegram/src/bot-message-context.ts index 2c81c5a07b2..99dc9b860d4 100644 --- a/extensions/telegram/src/bot-message-context.ts +++ b/extensions/telegram/src/bot-message-context.ts @@ -578,6 +578,9 @@ export const buildTelegramMessageContext = async ({ topicConfig, stickerCacheHit: bodyResult.stickerCacheHit, effectiveWasMentioned: bodyResult.effectiveWasMentioned, + ...(bodyResult.audioTranscribedMediaIndex !== undefined + ? { audioTranscribedMediaIndex: bodyResult.audioTranscribedMediaIndex } + : {}), locationData: bodyResult.locationData, options, dmAllowFrom, diff --git a/extensions/whatsapp/src/auto-reply/monitor/group-gating.audio-preflight.test.ts b/extensions/whatsapp/src/auto-reply/monitor/group-gating.audio-preflight.test.ts new file mode 100644 index 00000000000..381bba52bf1 --- /dev/null +++ b/extensions/whatsapp/src/auto-reply/monitor/group-gating.audio-preflight.test.ts @@ -0,0 +1,103 @@ +import { beforeEach, describe, expect, it, vi } from "vitest"; + +vi.mock("./group-activation.js", () => ({ + resolveGroupActivationFor: vi.fn(async () => "mention"), +})); + +import type { MentionConfig } from "../mentions.js"; +import type { WebInboundMsg } from "../types.js"; +import { applyGroupGating, type GroupHistoryEntry } from "./group-gating.js"; + +function makeGroupAudioMsg(): WebInboundMsg { + return { + id: "msg-1", + from: "1203630@g.us", + to: "+15550000001", + body: "", + chatId: "1203630@g.us", + chatType: "group", + conversationId: "1203630@g.us", + mediaType: "audio/ogg; codecs=opus", + mediaPath: "/tmp/voice.ogg", + timestamp: 1700000000, + accountId: "default", + sender: { e164: "+15550000002", name: "Alice" }, + } as WebInboundMsg; +} + +function makeParams(msg: WebInboundMsg, groupHistories: Map) { + return { + cfg: { + channels: { + whatsapp: { + groupPolicy: "open", + }, + }, + messages: { + groupChat: { + mentionPatterns: ["\\bopenclaw\\b"], + }, + }, + } as never, + msg, + conversationId: "1203630@g.us", + groupHistoryKey: "whatsapp:group:1203630", + agentId: "main", + sessionKey: "agent:main:whatsapp:group:1203630", + baseMentionConfig: { mentionRegexes: [/\bopenclaw\b/i] } satisfies MentionConfig, + groupHistories, + groupHistoryLimit: 20, + groupMemberNames: new Map>(), + logVerbose: vi.fn(), + replyLogger: { debug: vi.fn() }, + }; +} + +describe("applyGroupGating audio preflight mention text", () => { + let groupHistories: Map; + + beforeEach(() => { + groupHistories = new Map(); + }); + + it("defers a missing mention without storing placeholder history", async () => { + const msg = makeGroupAudioMsg(); + + const result = await applyGroupGating({ + ...makeParams(msg, groupHistories), + deferMissingMention: true, + }); + + expect(result).toEqual({ shouldProcess: false, needsMentionText: true }); + expect(groupHistories.get("whatsapp:group:1203630")).toBeUndefined(); + }); + + it("accepts voice transcript text that satisfies mention gating", async () => { + const msg = makeGroupAudioMsg(); + + const result = await applyGroupGating({ + ...makeParams(msg, groupHistories), + mentionText: "openclaw please summarize the thread", + }); + + expect(result).toEqual({ shouldProcess: true }); + expect(msg.wasMentioned).toBe(true); + expect(groupHistories.get("whatsapp:group:1203630")).toBeUndefined(); + }); + + it("stores transcript text instead of the audio placeholder when mention is still missing", async () => { + const msg = makeGroupAudioMsg(); + + const result = await applyGroupGating({ + ...makeParams(msg, groupHistories), + mentionText: "please summarize the thread", + }); + + expect(result).toEqual({ shouldProcess: false }); + expect(groupHistories.get("whatsapp:group:1203630")).toEqual([ + expect.objectContaining({ + body: "please summarize the thread", + }), + ]); + }); +}); diff --git a/extensions/whatsapp/src/auto-reply/monitor/group-gating.ts b/extensions/whatsapp/src/auto-reply/monitor/group-gating.ts index 867531f6a5f..a7b0ac17cc6 100644 --- a/extensions/whatsapp/src/auto-reply/monitor/group-gating.ts +++ b/extensions/whatsapp/src/auto-reply/monitor/group-gating.ts @@ -33,6 +33,8 @@ export type GroupHistoryEntry = { type ApplyGroupGatingParams = { cfg: ReturnType; msg: WebInboundMsg; + mentionText?: string; + deferMissingMention?: boolean; conversationId: string; groupHistoryKey: string; agentId: string; @@ -58,6 +60,7 @@ function isOwnerSender(baseMentionConfig: MentionConfig, msg: WebInboundMsg) { function recordPendingGroupHistoryEntry(params: { msg: WebInboundMsg; + body?: string; groupHistories: Map; groupHistoryKey: string; groupHistoryLimit: number; @@ -76,7 +79,7 @@ function recordPendingGroupHistoryEntry(params: { limit: params.groupHistoryLimit, entry: { sender, - body: params.msg.body, + body: params.body ?? params.msg.body, timestamp: params.msg.timestamp, id: params.msg.id, senderJid: senderIdentity.jid ?? params.msg.senderJid, @@ -84,10 +87,15 @@ function recordPendingGroupHistoryEntry(params: { }); } -function skipGroupMessageAndStoreHistory(params: ApplyGroupGatingParams, verboseMessage: string) { +function skipGroupMessageAndStoreHistory( + params: ApplyGroupGatingParams, + verboseMessage: string, + body?: string, +) { params.logVerbose(verboseMessage); recordPendingGroupHistoryEntry({ msg: params.msg, + body, groupHistories: params.groupHistories, groupHistoryKey: params.groupHistoryKey, groupHistoryLimit: params.groupHistoryLimit, @@ -126,8 +134,10 @@ export async function applyGroupGating(params: ApplyGroupGatingParams) { ...buildMentionConfig(params.cfg, params.agentId), allowFrom: inboundPolicy.configuredAllowFrom, }; + const mentionMsg = + params.mentionText !== undefined ? { ...params.msg, body: params.mentionText } : params.msg; const commandBody = stripMentionsForCommand( - params.msg.body, + mentionMsg.body, mentionConfig.mentionRegexes, self.e164, ); @@ -142,7 +152,7 @@ export async function applyGroupGating(params: ApplyGroupGatingParams) { ); } - const mentionDebug = debugMention(params.msg, mentionConfig, params.authDir); + const mentionDebug = debugMention(mentionMsg, mentionConfig, params.authDir); params.replyLogger.debug( { conversationId: params.conversationId, @@ -190,9 +200,16 @@ export async function applyGroupGating(params: ApplyGroupGatingParams) { const effectiveWasMentioned = mentionDecision.effectiveWasMentioned || shouldBypassMention; params.msg.wasMentioned = effectiveWasMentioned; if (!shouldBypassMention && requireMention && mentionDecision.shouldSkip) { + if (params.deferMissingMention === true) { + params.logVerbose( + `Deferring group mention skip until audio preflight completes in ${params.conversationId}`, + ); + return { shouldProcess: false, needsMentionText: true } as const; + } return skipGroupMessageAndStoreHistory( params, - `Group message stored for context (no mention detected) in ${params.conversationId}: ${params.msg.body}`, + `Group message stored for context (no mention detected) in ${params.conversationId}: ${mentionMsg.body}`, + params.mentionText, ); } diff --git a/extensions/whatsapp/src/auto-reply/monitor/inbound-dispatch.ts b/extensions/whatsapp/src/auto-reply/monitor/inbound-dispatch.ts index 0705ac0fbd8..971d8cedd75 100644 --- a/extensions/whatsapp/src/auto-reply/monitor/inbound-dispatch.ts +++ b/extensions/whatsapp/src/auto-reply/monitor/inbound-dispatch.ts @@ -102,6 +102,7 @@ export function buildWhatsAppInboundContext(params: { route: ReturnType; sender: SenderContext; transcript?: string; + mediaTranscribedIndexes?: number[]; replyThreading?: ReplyThreadingContext; visibleReplyTo?: VisibleReplyTarget; }) { @@ -132,6 +133,7 @@ export function buildWhatsAppInboundContext(params: { MediaPath: params.msg.mediaPath, MediaUrl: params.msg.mediaUrl, MediaType: params.msg.mediaType, + MediaTranscribedIndexes: params.mediaTranscribedIndexes, ChatType: params.msg.chatType, Timestamp: params.msg.timestamp, ConversationLabel: params.msg.chatType === "group" ? params.conversationId : params.msg.from, diff --git a/extensions/whatsapp/src/auto-reply/monitor/on-message.audio-preflight.test.ts b/extensions/whatsapp/src/auto-reply/monitor/on-message.audio-preflight.test.ts index 41d7ffa873b..17895e1bf79 100644 --- a/extensions/whatsapp/src/auto-reply/monitor/on-message.audio-preflight.test.ts +++ b/extensions/whatsapp/src/auto-reply/monitor/on-message.audio-preflight.test.ts @@ -10,6 +10,7 @@ const ackReactionHandle = { ackReactionValue: "👀", remove: vi.fn(async () => undefined), }; +const applyGroupGatingMock = vi.fn(); vi.mock("./audio-preflight.runtime.js", () => ({ transcribeFirstAudio: (...args: unknown[]) => transcribeFirstAudioMock(...args), @@ -28,7 +29,7 @@ vi.mock("./broadcast.js", () => ({ })); vi.mock("./group-gating.js", () => ({ - applyGroupGating: vi.fn(async () => ({ shouldProcess: true })), + applyGroupGating: (...args: unknown[]) => applyGroupGatingMock(...args), })); vi.mock("./last-route.js", () => ({ @@ -127,6 +128,8 @@ describe("createWebOnMessageHandler audio preflight", () => { }); processMessageMock.mockReset(); processMessageMock.mockResolvedValue(true); + applyGroupGatingMock.mockReset(); + applyGroupGatingMock.mockResolvedValue({ shouldProcess: true }); }); it("sends ack reaction before audio preflight for voice notes", async () => { @@ -258,4 +261,94 @@ describe("createWebOnMessageHandler audio preflight", () => { expect(events).toEqual(["ack", "stt"]); expect(processMessageMock).not.toHaveBeenCalled(); }); + + it("uses group voice transcript for mention gating before dispatch", async () => { + applyGroupGatingMock + .mockResolvedValueOnce({ shouldProcess: false, needsMentionText: true }) + .mockResolvedValueOnce({ shouldProcess: true }); + const handler = createWebOnMessageHandler({ + cfg: { + channels: { + whatsapp: { + ackReaction: { enabled: true }, + }, + }, + } as never, + verbose: false, + connectionId: "conn-1", + maxMediaBytes: 1024 * 1024, + groupHistoryLimit: 20, + groupHistories: new Map(), + groupMemberNames: new Map(), + echoTracker: makeEchoTracker() as never, + backgroundTasks: new Set(), + replyResolver: vi.fn() as never, + replyLogger: { + info: () => {}, + warn: () => {}, + debug: () => {}, + error: () => {}, + } as never, + baseMentionConfig: {} as never, + account: { authDir: "/tmp/auth", accountId: "default" }, + }); + + await handler(makeGroupAudioMsg()); + + expect(applyGroupGatingMock).toHaveBeenNthCalledWith( + 1, + expect.objectContaining({ + deferMissingMention: true, + }), + ); + expect(events).toEqual(["ack", "stt"]); + expect(applyGroupGatingMock).toHaveBeenNthCalledWith( + 2, + expect.objectContaining({ + mentionText: "transcribed voice note", + }), + ); + expect(processMessageMock).toHaveBeenCalledWith( + expect.objectContaining({ + preflightAudioTranscript: "transcribed voice note", + ackAlreadySent: true, + }), + ); + }); + + it("does not transcribe group voice when policy gating rejects before mention", async () => { + applyGroupGatingMock.mockResolvedValueOnce({ shouldProcess: false }); + const handler = createWebOnMessageHandler({ + cfg: { + channels: { + whatsapp: { + ackReaction: { enabled: true }, + }, + }, + } as never, + verbose: false, + connectionId: "conn-1", + maxMediaBytes: 1024 * 1024, + groupHistoryLimit: 20, + groupHistories: new Map(), + groupMemberNames: new Map(), + echoTracker: makeEchoTracker() as never, + backgroundTasks: new Set(), + replyResolver: vi.fn() as never, + replyLogger: { + info: () => {}, + warn: () => {}, + debug: () => {}, + error: () => {}, + } as never, + baseMentionConfig: {} as never, + account: { authDir: "/tmp/auth", accountId: "default" }, + }); + + await handler(makeGroupAudioMsg()); + + expect(transcribeFirstAudioMock).not.toHaveBeenCalled(); + expect(maybeSendAckReactionMock).not.toHaveBeenCalled(); + expect(processMessageMock).not.toHaveBeenCalled(); + }); }); diff --git a/extensions/whatsapp/src/auto-reply/monitor/on-message.ts b/extensions/whatsapp/src/auto-reply/monitor/on-message.ts index 10509eb3f80..e03b9471f09 100644 --- a/extensions/whatsapp/src/auto-reply/monitor/on-message.ts +++ b/extensions/whatsapp/src/auto-reply/monitor/on-message.ts @@ -119,6 +119,58 @@ export function createWebOnMessageHandler(params: { return; } + // Preflight audio transcription: run once before broadcast fan-out so all + // agents share the same transcript instead of each making a separate STT call. + // For DMs, only do this on the real inbound path after access-control/pairing + // checks have already passed in inbound/monitor.ts. For groups, the first + // gating pass must approve the group/sender before STT is attempted. + // null = preflight was attempted but produced no transcript (failed / disabled / no audio); + // undefined = preflight was not attempted (non-audio message). + let preflightAudioTranscript: string | null | undefined; + const hasAudioBody = + msg.mediaType?.startsWith("audio/") === true && msg.body === ""; + const canRunEarlyAudioPreflight = msg.chatType === "group" || msg.accessControlPassed === true; + let ackAlreadySent = false; + let ackReaction: AckReactionHandle | null = null; + const runAudioPreflightOnce = async () => { + if ( + preflightAudioTranscript !== undefined || + !canRunEarlyAudioPreflight || + !hasAudioBody || + !msg.mediaPath + ) { + return; + } + ackReaction = await maybeSendAckReaction({ + cfg: params.cfg, + msg, + agentId: route.agentId, + sessionKey: route.sessionKey, + conversationId, + verbose: params.verbose, + accountId: route.accountId, + info: params.replyLogger.info.bind(params.replyLogger), + warn: params.replyLogger.warn.bind(params.replyLogger), + }); + ackAlreadySent = ackReaction !== null; + try { + const { transcribeFirstAudio } = await import("./audio-preflight.runtime.js"); + // transcribeFirstAudio returns undefined on failure/disabled; store null so + // processMessage knows the attempt was already made and does not retry. + preflightAudioTranscript = + (await transcribeFirstAudio({ + ctx: { + MediaPaths: [msg.mediaPath], + MediaTypes: msg.mediaType ? [msg.mediaType] : undefined, + }, + cfg: params.cfg, + })) ?? null; + } catch { + // Non-fatal: store null so per-agent retries are suppressed. + preflightAudioTranscript = null; + } + }; + if (msg.chatType === "group") { const sender = getSenderIdentity(msg); const metaCtx = { @@ -149,9 +201,10 @@ export function createWebOnMessageHandler(params: { warn: params.replyLogger.warn.bind(params.replyLogger), }); - const gating = await applyGroupGating({ + let gating = await applyGroupGating({ cfg: params.cfg, msg, + deferMissingMention: hasAudioBody && Boolean(msg.mediaPath), conversationId, groupHistoryKey, agentId: route.agentId, @@ -165,6 +218,32 @@ export function createWebOnMessageHandler(params: { logVerbose, replyLogger: params.replyLogger, }); + if ( + !gating.shouldProcess && + "needsMentionText" in gating && + gating.needsMentionText === true + ) { + await runAudioPreflightOnce(); + gating = await applyGroupGating({ + cfg: params.cfg, + msg, + ...(typeof preflightAudioTranscript === "string" + ? { mentionText: preflightAudioTranscript } + : {}), + conversationId, + groupHistoryKey, + agentId: route.agentId, + sessionKey: route.sessionKey, + baseMentionConfig: params.baseMentionConfig, + authDir: params.account.authDir, + selfChatMode: params.account.selfChatMode, + groupHistories: params.groupHistories, + groupHistoryLimit: params.groupHistoryLimit, + groupMemberNames: params.groupMemberNames, + logVerbose, + replyLogger: params.replyLogger, + }); + } if (!gating.shouldProcess) { return; } @@ -179,49 +258,7 @@ export function createWebOnMessageHandler(params: { } } - // Preflight audio transcription: run once here, before broadcast fan-out, so - // all agents share the same transcript instead of each making a separate STT call. - // For DMs, only do this on the real inbound path after access-control/pairing - // checks have already passed in inbound/monitor.ts. That keeps external STT and - // early ack feedback behind the same auth-first gate as the rest of DM handling. - // null = preflight was attempted but produced no transcript (failed / disabled / no audio); - // undefined = preflight was not attempted (non-audio message). - let preflightAudioTranscript: string | null | undefined; - const hasAudioBody = - msg.mediaType?.startsWith("audio/") === true && msg.body === ""; - const canRunEarlyDmPreflight = msg.chatType === "group" || msg.accessControlPassed === true; - let ackAlreadySent = false; - let ackReaction: AckReactionHandle | null = null; - if (canRunEarlyDmPreflight && hasAudioBody && msg.mediaPath) { - ackReaction = await maybeSendAckReaction({ - cfg: params.cfg, - msg, - agentId: route.agentId, - sessionKey: route.sessionKey, - conversationId, - verbose: params.verbose, - accountId: route.accountId, - info: params.replyLogger.info.bind(params.replyLogger), - warn: params.replyLogger.warn.bind(params.replyLogger), - }); - ackAlreadySent = ackReaction !== null; - try { - const { transcribeFirstAudio } = await import("./audio-preflight.runtime.js"); - // transcribeFirstAudio returns undefined on failure/disabled; store null so - // processMessage knows the attempt was already made and does not retry. - preflightAudioTranscript = - (await transcribeFirstAudio({ - ctx: { - MediaPaths: [msg.mediaPath], - MediaTypes: msg.mediaType ? [msg.mediaType] : undefined, - }, - cfg: params.cfg, - })) ?? null; - } catch { - // Non-fatal: store null so per-agent retries are suppressed. - preflightAudioTranscript = null; - } - } + await runAudioPreflightOnce(); // Broadcast groups: when we'd reply anyway, run multiple agents. // Does not bypass group mention/activation gating above. diff --git a/extensions/whatsapp/src/auto-reply/monitor/process-message.audio-preflight.test.ts b/extensions/whatsapp/src/auto-reply/monitor/process-message.audio-preflight.test.ts index af3be034a44..a77994b38c5 100644 --- a/extensions/whatsapp/src/auto-reply/monitor/process-message.audio-preflight.test.ts +++ b/extensions/whatsapp/src/auto-reply/monitor/process-message.audio-preflight.test.ts @@ -93,6 +93,7 @@ vi.mock("./inbound-dispatch.js", () => ({ commandAuthorized?: boolean; commandBody?: string; msg: { body: string; mediaPath?: string; mediaType?: string }; + mediaTranscribedIndexes?: number[]; rawBody?: string; transcript?: string; }) => ({ @@ -102,6 +103,7 @@ vi.mock("./inbound-dispatch.js", () => ({ CommandBody: params.commandBody ?? params.msg.body, MediaPath: params.msg.mediaPath, MediaType: params.msg.mediaType, + MediaTranscribedIndexes: params.mediaTranscribedIndexes, RawBody: params.rawBody ?? params.msg.body, Transcript: params.transcript, }), @@ -230,6 +232,7 @@ describe("processMessage audio preflight transcription", () => { CommandBody: "", RawBody: "", Transcript: "okay let's test this voice message", + MediaTranscribedIndexes: [0], }); // mediaPath and mediaType must be preserved so inboundAudio detection (used by // features like messages.tts.auto: "inbound") still recognises this as audio. @@ -315,6 +318,7 @@ describe("processMessage audio preflight transcription", () => { CommandBody: "", RawBody: "", Transcript: "/new start a new session", + MediaTranscribedIndexes: [0], }); }); @@ -335,6 +339,7 @@ describe("processMessage audio preflight transcription", () => { CommandBody: "", RawBody: "", Transcript: "pre-computed transcript from fan-out caller", + MediaTranscribedIndexes: [0], }); }); diff --git a/extensions/whatsapp/src/auto-reply/monitor/process-message.ts b/extensions/whatsapp/src/auto-reply/monitor/process-message.ts index 18d6bcb3d72..995a8380d3f 100644 --- a/extensions/whatsapp/src/auto-reply/monitor/process-message.ts +++ b/extensions/whatsapp/src/auto-reply/monitor/process-message.ts @@ -257,10 +257,8 @@ export async function processMessage(params: { // If we have a transcript, replace the agent-facing body so the agent sees the spoken text. // mediaPath and mediaType are intentionally preserved so that inboundAudio detection // (used by features such as messages.tts.auto: "inbound") still sees this as an - // audio message. The transcript is also stored in Transcript so downstream pipelines - // can detect it. Preventing a second STT pass in the media-understanding pipeline - // requires SDK-level support (alreadyTranscribed on a shared attachment instance); - // that is a shared concern across all channels and is tracked separately. + // audio message. The transcript and transcribed media index are also stored on + // context so downstream media understanding does not transcribe it again. const msgForAgent = audioTranscript !== undefined ? { ...params.msg, body: audioTranscript } : params.msg; @@ -429,6 +427,7 @@ export async function processMessage(params: { e164: sender.e164 ?? undefined, }, ...(audioTranscript !== undefined ? { transcript: audioTranscript } : {}), + ...(audioTranscript !== undefined ? { mediaTranscribedIndexes: [0] } : {}), replyThreading, visibleReplyTo: visibleReplyTo ?? undefined, }); diff --git a/src/auto-reply/templating.ts b/src/auto-reply/templating.ts index 6e2ad37d28d..92aec055046 100644 --- a/src/auto-reply/templating.ts +++ b/src/auto-reply/templating.ts @@ -117,6 +117,8 @@ export type MsgContext = { MediaPaths?: string[]; MediaUrls?: string[]; MediaTypes?: string[]; + /** Attachment indexes whose audio was already transcribed before media understanding runs. */ + MediaTranscribedIndexes?: number[]; /** Telegram sticker metadata (emoji, set name, file IDs, cached description). */ Sticker?: StickerContextMetadata; /** True when current-turn sticker media is present in MediaPaths (false for cached-description path). */ diff --git a/src/media-understanding/apply.test.ts b/src/media-understanding/apply.test.ts index 9789a748078..4be1088b146 100644 --- a/src/media-understanding/apply.test.ts +++ b/src/media-understanding/apply.test.ts @@ -984,6 +984,51 @@ describe("applyMediaUnderstanding", () => { expect(ctx.Transcript).toBe("fallback transcript"); }); + it("skips audio STT for attachments marked transcribed by channel preflight", async () => { + const dir = await createTempMediaDir(); + const audioPath = path.join(dir, "voice.ogg"); + await fs.writeFile(audioPath, createSafeAudioFixtureBuffer(2048)); + const transcribeAudio = vi.fn(async () => ({ text: "duplicate transcript" })); + const ctx: MsgContext = { + Body: "preflight transcript", + Transcript: "preflight transcript", + MediaPath: audioPath, + MediaType: "audio/ogg", + MediaTranscribedIndexes: [0], + }; + const cfg: OpenClawConfig = { + tools: { + media: { + audio: { + enabled: true, + models: [{ provider: "groq" }], + }, + }, + }, + }; + + const result = await applyMediaUnderstanding({ + ctx, + cfg, + providers: { + groq: { + id: "groq", + transcribeAudio, + }, + }, + }); + + expect(transcribeAudio).not.toHaveBeenCalled(); + expect(result.appliedAudio).toBe(false); + expect(ctx.Transcript).toBe("preflight transcript"); + expect(result.decisions).toContainEqual( + expect.objectContaining({ + capability: "audio", + outcome: "no-attachment", + }), + ); + }); + it("handles multiple audio attachments when attachment mode is all", async () => { const dir = await createTempMediaDir(); const audioBytes = createSafeAudioFixtureBuffer(2048); diff --git a/src/media-understanding/attachments.normalize.ts b/src/media-understanding/attachments.normalize.ts index 4881e95e152..110ff3fab3e 100644 --- a/src/media-understanding/attachments.normalize.ts +++ b/src/media-understanding/attachments.normalize.ts @@ -28,6 +28,11 @@ export function normalizeAttachments(ctx: MsgContext): MediaAttachment[] { const pathsFromArray = Array.isArray(ctx.MediaPaths) ? ctx.MediaPaths : undefined; const urlsFromArray = Array.isArray(ctx.MediaUrls) ? ctx.MediaUrls : undefined; const typesFromArray = Array.isArray(ctx.MediaTypes) ? ctx.MediaTypes : undefined; + const transcribedIndexes = new Set( + Array.isArray(ctx.MediaTranscribedIndexes) + ? ctx.MediaTranscribedIndexes.filter((index) => Number.isInteger(index) && index >= 0) + : [], + ); const resolveMime = (count: number, index: number) => { const typeHint = normalizeOptionalString(typesFromArray?.[index]); if (typeHint) { @@ -45,6 +50,7 @@ export function normalizeAttachments(ctx: MsgContext): MediaAttachment[] { url: urls?.[index] ?? ctx.MediaUrl, mime: resolveMime(count, index), index, + alreadyTranscribed: transcribedIndexes.has(index), })) .filter((entry) => Boolean(entry.path ?? normalizeOptionalString(entry.url))); } @@ -57,6 +63,7 @@ export function normalizeAttachments(ctx: MsgContext): MediaAttachment[] { url: normalizeOptionalString(value), mime: resolveMime(count, index), index, + alreadyTranscribed: transcribedIndexes.has(index), })) .filter((entry) => Boolean(entry.url)); } @@ -72,6 +79,7 @@ export function normalizeAttachments(ctx: MsgContext): MediaAttachment[] { url: url || undefined, mime: ctx.MediaType, index: 0, + alreadyTranscribed: transcribedIndexes.has(0), }, ]; }