mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-06 06:00:43 +00:00
fix(voice): reuse preflight transcripts across channels
This commit is contained in:
@@ -96,6 +96,14 @@ Docs: https://docs.openclaw.ai
|
||||
before agent dispatch and keep raw Feishu `file_key` payloads out of message
|
||||
text. Fixes #67120 and #61876.
|
||||
- Tasks: terminalize async Gateway agent task records from the Gateway run result while preserving aborted, failed, and cancelled outcomes instead of leaving completed runs stuck as active or lost. (#71905) Thanks @likewen-tech.
|
||||
- WhatsApp: let authorized group voice-note transcripts satisfy mention gating
|
||||
before reply dispatch, while keeping unmentioned transcripts in pending group
|
||||
history. Fixes #44908.
|
||||
- Media understanding: carry channel voice-note preflight state into attachment
|
||||
selection so WhatsApp, Feishu, Telegram, and Discord do not transcribe the
|
||||
same inbound audio twice. Fixes #70580.
|
||||
- TTS/BlueBubbles: deliver compatible auto-TTS audio as iMessage voice memo
|
||||
bubbles instead of plain MP3/CAF file attachments. Fixes #16848.
|
||||
- ACP: send subagent and async-task completion wakes to external ACP harnesses as
|
||||
plain prompts instead of OpenClaw internal runtime-context envelopes, while
|
||||
keeping those envelopes out of ACP transcripts.
|
||||
|
||||
@@ -21,6 +21,8 @@ need a separate `openclaw plugins install` step.
|
||||
- OpenClaw talks to it through its REST API (`GET /api/v1/ping`, `POST /message/text`, `POST /chat/:id/*`).
|
||||
- Incoming messages arrive via webhooks; outgoing replies, typing indicators, read receipts, and tapbacks are REST calls.
|
||||
- Attachments and stickers are ingested as inbound media (and surfaced to the agent when possible).
|
||||
- Auto-TTS replies that synthesize MP3 or CAF audio are delivered as iMessage
|
||||
voice memo bubbles instead of plain file attachments.
|
||||
- Pairing/allowlist works the same way as other channels (`/channels/pairing` etc) with `channels.bluebubbles.allowFrom` + pairing codes.
|
||||
- Reactions are surfaced as system events just like Slack/Telegram so agents can "mention" them before replying.
|
||||
- Advanced features: edit, unsend, reply threading, message effects, group management.
|
||||
|
||||
@@ -244,6 +244,7 @@ content and identifiers.
|
||||
|
||||
- explicit WhatsApp mentions of the bot identity
|
||||
- configured mention regex patterns (`agents.list[].groupChat.mentionPatterns`, fallback `messages.groupChat.mentionPatterns`)
|
||||
- inbound voice-note transcripts for authorized group messages
|
||||
- implicit reply-to-bot detection (reply sender matches bot identity)
|
||||
|
||||
Security note:
|
||||
@@ -296,6 +297,11 @@ When the linked self number is also present in `allowFrom`, WhatsApp self-chat s
|
||||
- `<media:document>`
|
||||
- `<media:sticker>`
|
||||
|
||||
Authorized group voice notes are transcribed before mention gating when the
|
||||
body is only `<media:audio>`, so saying the bot mention in the voice note can
|
||||
trigger the reply. If the transcript still does not mention the bot, the
|
||||
transcript is kept in pending group history instead of the raw placeholder.
|
||||
|
||||
Location bodies use terse coordinate text. Location labels/comments and contact/vCard details are rendered as fenced untrusted metadata, not inline prompt text.
|
||||
|
||||
</Accordion>
|
||||
|
||||
@@ -58,6 +58,10 @@ Video and music generation run as background tasks because provider processing t
|
||||
|
||||
Deepgram, ElevenLabs, Mistral, OpenAI, SenseAudio, and xAI can all transcribe
|
||||
inbound audio through the batch `tools.media.audio` path when configured.
|
||||
Channel plugins that preflight a voice note for mention gating or command
|
||||
parsing mark the transcribed attachment on the inbound context, so the shared
|
||||
media-understanding pass reuses that transcript instead of making a second STT
|
||||
call for the same audio.
|
||||
Deepgram, ElevenLabs, Mistral, OpenAI, and xAI also register Voice Call
|
||||
streaming STT providers, so live phone audio can be forwarded to the selected
|
||||
vendor without waiting for a completed recording.
|
||||
|
||||
@@ -330,7 +330,7 @@ export const bluebubblesPlugin: ChannelPlugin<ResolvedBlueBubblesAccount, BlueBu
|
||||
},
|
||||
sendMedia: async (ctx) => {
|
||||
const runtime = await loadBlueBubblesChannelRuntime();
|
||||
const { cfg, to, text, mediaUrl, accountId, replyToId } = ctx;
|
||||
const { cfg, to, text, mediaUrl, accountId, replyToId, audioAsVoice } = ctx;
|
||||
const { mediaPath, mediaBuffer, contentType, filename, caption } = ctx as {
|
||||
mediaPath?: string;
|
||||
mediaBuffer?: Uint8Array;
|
||||
@@ -349,6 +349,7 @@ export const bluebubblesPlugin: ChannelPlugin<ResolvedBlueBubblesAccount, BlueBu
|
||||
caption: caption ?? text ?? undefined,
|
||||
replyToId: replyToId ?? null,
|
||||
accountId: accountId ?? undefined,
|
||||
asVoice: audioAsVoice === true,
|
||||
});
|
||||
},
|
||||
},
|
||||
|
||||
@@ -323,4 +323,27 @@ describe("sendBlueBubblesMedia local-path hardening", () => {
|
||||
);
|
||||
expect(sendBlueBubblesAttachmentMock).toHaveBeenCalledTimes(1);
|
||||
});
|
||||
|
||||
it("passes asVoice through to attachment delivery", async () => {
|
||||
runtimeMocks.fetchRemoteMedia.mockResolvedValueOnce({
|
||||
buffer: new Uint8Array([1, 2, 3]),
|
||||
contentType: "audio/mpeg",
|
||||
fileName: "voice.mp3",
|
||||
});
|
||||
|
||||
await sendBlueBubblesMedia({
|
||||
cfg: createConfig(),
|
||||
to: "chat:123",
|
||||
mediaUrl: "https://example.com/voice.mp3",
|
||||
asVoice: true,
|
||||
});
|
||||
|
||||
expect(sendBlueBubblesAttachmentMock).toHaveBeenCalledWith(
|
||||
expect.objectContaining({
|
||||
asVoice: true,
|
||||
contentType: "audio/mpeg",
|
||||
filename: "voice.mp3",
|
||||
}),
|
||||
);
|
||||
});
|
||||
});
|
||||
|
||||
@@ -1686,6 +1686,7 @@ async function processMessageAfterDedupe(
|
||||
caption: caption ?? undefined,
|
||||
replyToId: replyToMessageGuid || null,
|
||||
accountId: account.accountId,
|
||||
asVoice: payload.audioAsVoice === true,
|
||||
});
|
||||
} catch (err) {
|
||||
forgetPendingOutboundMessageId(pendingId);
|
||||
|
||||
@@ -405,6 +405,7 @@ describe("preflightDiscordMessage", () => {
|
||||
);
|
||||
expect(result).not.toBeNull();
|
||||
expect(result?.isDirectMessage).toBe(true);
|
||||
expect(result?.preflightAudioTranscript).toBe("hello openclaw from dm audio");
|
||||
});
|
||||
|
||||
it("falls back to the default discord account for omitted-account dm authorization", async () => {
|
||||
@@ -1096,6 +1097,7 @@ describe("preflightDiscordMessage", () => {
|
||||
);
|
||||
expect(result).not.toBeNull();
|
||||
expect(result?.wasMentioned).toBe(true);
|
||||
expect(result?.preflightAudioTranscript).toBe("hey openclaw");
|
||||
});
|
||||
|
||||
it("does not transcribe guild audio from unauthorized members", async () => {
|
||||
|
||||
@@ -1112,6 +1112,7 @@ export async function preflightDiscordMessage(
|
||||
commandAuthorized,
|
||||
baseText,
|
||||
messageText,
|
||||
...(preflightTranscript !== undefined ? { preflightAudioTranscript: preflightTranscript } : {}),
|
||||
wasMentioned,
|
||||
route: effectiveRoute,
|
||||
threadBinding,
|
||||
|
||||
@@ -56,6 +56,7 @@ export type DiscordMessagePreflightContext = DiscordMessagePreflightSharedFields
|
||||
commandAuthorized: boolean;
|
||||
baseText: string;
|
||||
messageText: string;
|
||||
preflightAudioTranscript?: string;
|
||||
wasMentioned: boolean;
|
||||
|
||||
route: ReturnType<typeof resolveAgentRoute>;
|
||||
|
||||
@@ -285,11 +285,27 @@ function getLastRouteUpdate():
|
||||
}
|
||||
|
||||
function getLastDispatchCtx():
|
||||
| { SessionKey?: string; MessageThreadId?: string | number }
|
||||
| {
|
||||
BodyForAgent?: string;
|
||||
CommandBody?: string;
|
||||
MediaTranscribedIndexes?: number[];
|
||||
MessageThreadId?: string | number;
|
||||
SessionKey?: string;
|
||||
Transcript?: string;
|
||||
}
|
||||
| undefined {
|
||||
const callArgs = dispatchInboundMessage.mock.calls.at(-1) as unknown[] | undefined;
|
||||
const params = callArgs?.[0] as
|
||||
| { ctx?: { SessionKey?: string; MessageThreadId?: string | number } }
|
||||
| {
|
||||
ctx?: {
|
||||
BodyForAgent?: string;
|
||||
CommandBody?: string;
|
||||
MediaTranscribedIndexes?: number[];
|
||||
MessageThreadId?: string | number;
|
||||
SessionKey?: string;
|
||||
Transcript?: string;
|
||||
};
|
||||
}
|
||||
| undefined;
|
||||
return params?.ctx;
|
||||
}
|
||||
@@ -656,6 +672,45 @@ describe("processDiscordMessage ack reactions", () => {
|
||||
});
|
||||
|
||||
describe("processDiscordMessage session routing", () => {
|
||||
it("carries preflight audio transcript into dispatch context and marks media transcribed", async () => {
|
||||
const fetchImpl = vi.fn(
|
||||
async () =>
|
||||
new Response(new Uint8Array([1, 2, 3, 4]), {
|
||||
headers: { "content-type": "audio/ogg" },
|
||||
}),
|
||||
);
|
||||
const ctx = await createBaseContext({
|
||||
message: {
|
||||
id: "m-audio-preflight",
|
||||
channelId: "c1",
|
||||
content: "",
|
||||
timestamp: new Date().toISOString(),
|
||||
attachments: [
|
||||
{
|
||||
id: "att-audio-preflight",
|
||||
url: "https://cdn.discordapp.com/attachments/voice.ogg",
|
||||
content_type: "audio/ogg",
|
||||
filename: "voice.ogg",
|
||||
},
|
||||
],
|
||||
},
|
||||
baseText: "<media:audio>",
|
||||
messageText: "<media:audio>",
|
||||
preflightAudioTranscript: "hello from discord voice",
|
||||
discordRestFetch: fetchImpl,
|
||||
mediaMaxBytes: 1024 * 1024,
|
||||
});
|
||||
|
||||
await processDiscordMessage(ctx as any);
|
||||
|
||||
expect(getLastDispatchCtx()).toMatchObject({
|
||||
BodyForAgent: "hello from discord voice",
|
||||
CommandBody: "hello from discord voice",
|
||||
Transcript: "hello from discord voice",
|
||||
MediaTranscribedIndexes: [0],
|
||||
});
|
||||
});
|
||||
|
||||
it("stores DM lastRoute with user target for direct-session continuity", async () => {
|
||||
const ctx = await createBaseContext({
|
||||
...createDirectMessageContextOverrides(),
|
||||
|
||||
@@ -148,6 +148,7 @@ export async function processDiscordMessage(
|
||||
isGroupDm,
|
||||
baseText,
|
||||
messageText,
|
||||
preflightAudioTranscript,
|
||||
shouldRequireMention,
|
||||
canDetectMention,
|
||||
effectiveWasMentioned,
|
||||
@@ -428,6 +429,10 @@ export async function processDiscordMessage(
|
||||
}
|
||||
}
|
||||
const mediaPayload = buildDiscordMediaPayload(mediaList);
|
||||
const preflightAudioIndex =
|
||||
preflightAudioTranscript === undefined
|
||||
? -1
|
||||
: mediaList.findIndex((media) => media.contentType?.startsWith("audio/"));
|
||||
const threadKeys = resolveThreadSessionKeys({
|
||||
baseSessionKey,
|
||||
threadId: threadChannel ? messageChannelId : undefined,
|
||||
@@ -487,10 +492,11 @@ export async function processDiscordMessage(
|
||||
|
||||
const ctxPayload = finalizeInboundContext({
|
||||
Body: combinedBody,
|
||||
BodyForAgent: baseText ?? text,
|
||||
BodyForAgent: preflightAudioTranscript ?? baseText ?? text,
|
||||
InboundHistory: inboundHistory,
|
||||
RawBody: baseText,
|
||||
CommandBody: baseText,
|
||||
RawBody: preflightAudioTranscript ?? baseText,
|
||||
CommandBody: preflightAudioTranscript ?? baseText,
|
||||
...(preflightAudioTranscript !== undefined ? { Transcript: preflightAudioTranscript } : {}),
|
||||
From: effectiveFrom,
|
||||
To: effectiveTo,
|
||||
SessionKey: boundSessionKey ?? autoThreadContext?.SessionKey ?? threadKeys.sessionKey,
|
||||
@@ -521,6 +527,7 @@ export async function processDiscordMessage(
|
||||
ThreadLabel: threadLabel,
|
||||
Timestamp: resolveTimestampMs(message.timestamp),
|
||||
...mediaPayload,
|
||||
...(preflightAudioIndex >= 0 ? { MediaTranscribedIndexes: [preflightAudioIndex] } : {}),
|
||||
CommandAuthorized: commandAuthorized,
|
||||
CommandSource: "text" as const,
|
||||
// Originating channel for reply routing.
|
||||
|
||||
@@ -1512,6 +1512,7 @@ describe("handleFeishuMessage command authorization", () => {
|
||||
Transcript: "voice transcript",
|
||||
MediaPaths: ["/tmp/inbound-voice.ogg"],
|
||||
MediaTypes: ["audio/ogg"],
|
||||
MediaTranscribedIndexes: [0],
|
||||
}),
|
||||
);
|
||||
const finalized = mockFinalizeInboundContext.mock.calls[0]?.[0];
|
||||
|
||||
@@ -759,6 +759,10 @@ export async function handleFeishuMessage(params: {
|
||||
chatType: isGroup ? "group" : "direct",
|
||||
log,
|
||||
});
|
||||
const preflightAudioIndex =
|
||||
audioTranscript === undefined
|
||||
? -1
|
||||
: mediaList.findIndex((media) => media.contentType?.startsWith("audio/"));
|
||||
const agentFacingContent = audioTranscript ?? ctx.content;
|
||||
const agentFacingCtx =
|
||||
audioTranscript === undefined
|
||||
@@ -1078,6 +1082,7 @@ export async function handleFeishuMessage(params: {
|
||||
OriginatingTo: feishuTo,
|
||||
GroupSystemPrompt: isGroup ? normalizeOptionalString(groupConfig?.systemPrompt) : undefined,
|
||||
...mediaPayload,
|
||||
...(preflightAudioIndex >= 0 ? { MediaTranscribedIndexes: [preflightAudioIndex] } : {}),
|
||||
});
|
||||
};
|
||||
|
||||
|
||||
@@ -68,7 +68,14 @@ const {
|
||||
textToSpeechTelephony,
|
||||
} = await import("./tts.js");
|
||||
|
||||
const nativeVoiceNoteChannels = ["discord", "feishu", "matrix", "telegram", "whatsapp"] as const;
|
||||
const nativeVoiceNoteChannels = [
|
||||
"bluebubbles",
|
||||
"discord",
|
||||
"feishu",
|
||||
"matrix",
|
||||
"telegram",
|
||||
"whatsapp",
|
||||
] as const;
|
||||
|
||||
function createMockSpeechProvider(
|
||||
id = "mock",
|
||||
@@ -164,6 +171,33 @@ describe("speech-core native voice-note routing", () => {
|
||||
});
|
||||
});
|
||||
|
||||
it("keeps BlueBubbles synthesis on mp3 audio-file output but delivers it as a voice memo", async () => {
|
||||
await expectTtsPayloadResult({
|
||||
channel: "bluebubbles",
|
||||
prefsName: "openclaw-speech-core-tts-bluebubbles-mp3-test",
|
||||
text: "This BlueBubbles reply should be delivered as an iMessage voice memo.",
|
||||
target: "audio-file",
|
||||
audioAsVoice: true,
|
||||
mediaExtension: "mp3",
|
||||
providerResult: {
|
||||
audioBuffer: Buffer.from("mp3"),
|
||||
outputFormat: "mp3",
|
||||
fileExtension: ".mp3",
|
||||
voiceCompatible: false,
|
||||
},
|
||||
});
|
||||
});
|
||||
|
||||
it("does not mark unsupported BlueBubbles audio-file output as a voice memo", async () => {
|
||||
await expectTtsPayloadResult({
|
||||
channel: "bluebubbles",
|
||||
prefsName: "openclaw-speech-core-tts-bluebubbles-ogg-test",
|
||||
text: "This BlueBubbles reply should stay a regular audio attachment.",
|
||||
target: "audio-file",
|
||||
audioAsVoice: undefined,
|
||||
});
|
||||
});
|
||||
|
||||
it.each(["feishu", "whatsapp"] as const)(
|
||||
"marks %s voice-note TTS for channel-side transcoding when provider returns mp3",
|
||||
async (channel) => {
|
||||
|
||||
@@ -738,8 +738,17 @@ export function setLastTtsAttempt(entry: TtsStatusEntry | undefined): void {
|
||||
lastTtsAttempt = entry;
|
||||
}
|
||||
|
||||
const VOICE_DELIVERY_CHANNELS = new Set([
|
||||
"bluebubbles",
|
||||
"telegram",
|
||||
"feishu",
|
||||
"whatsapp",
|
||||
"matrix",
|
||||
"discord",
|
||||
]);
|
||||
const OPUS_CHANNELS = new Set(["telegram", "feishu", "whatsapp", "matrix", "discord"]);
|
||||
const TRANSCODED_VOICE_NOTE_CHANNELS = new Set(["feishu", "whatsapp"]);
|
||||
const AUDIO_FILE_VOICE_MEMO_CHANNELS = new Set(["bluebubbles"]);
|
||||
|
||||
function resolveChannelId(channel: string | undefined): ChannelId | null {
|
||||
return channel ? normalizeChannelId(channel) : null;
|
||||
@@ -747,7 +756,7 @@ function resolveChannelId(channel: string | undefined): ChannelId | null {
|
||||
|
||||
function supportsNativeVoiceNoteTts(channel: string | undefined): boolean {
|
||||
const channelId = resolveChannelId(channel);
|
||||
return channelId !== null && OPUS_CHANNELS.has(channelId);
|
||||
return channelId !== null && VOICE_DELIVERY_CHANNELS.has(channelId);
|
||||
}
|
||||
|
||||
function supportsTranscodedVoiceNoteTts(channel: string | undefined): boolean {
|
||||
@@ -755,12 +764,43 @@ function supportsTranscodedVoiceNoteTts(channel: string | undefined): boolean {
|
||||
return channelId !== null && TRANSCODED_VOICE_NOTE_CHANNELS.has(channelId);
|
||||
}
|
||||
|
||||
function resolveTtsSynthesisTarget(channel: string | undefined): "audio-file" | "voice-note" {
|
||||
const channelId = resolveChannelId(channel);
|
||||
return channelId !== null && OPUS_CHANNELS.has(channelId) ? "voice-note" : "audio-file";
|
||||
}
|
||||
|
||||
function supportsAudioFileVoiceMemoOutput(params: {
|
||||
fileExtension?: string;
|
||||
outputFormat?: string;
|
||||
}): boolean {
|
||||
const extension = params.fileExtension?.trim().toLowerCase();
|
||||
if (extension === ".mp3" || extension === ".caf") {
|
||||
return true;
|
||||
}
|
||||
const outputFormat = params.outputFormat?.trim().toLowerCase();
|
||||
return (
|
||||
outputFormat === "mp3" ||
|
||||
outputFormat === "caf" ||
|
||||
outputFormat === "audio/mpeg" ||
|
||||
outputFormat === "audio/x-caf"
|
||||
);
|
||||
}
|
||||
|
||||
function shouldDeliverTtsAsVoice(params: {
|
||||
channel: string | undefined;
|
||||
target: "audio-file" | "voice-note" | undefined;
|
||||
voiceCompatible: boolean | undefined;
|
||||
fileExtension?: string;
|
||||
outputFormat?: string;
|
||||
}): boolean {
|
||||
if (!supportsNativeVoiceNoteTts(params.channel) || params.target !== "voice-note") {
|
||||
const channelId = resolveChannelId(params.channel);
|
||||
if (channelId === null || !supportsNativeVoiceNoteTts(channelId)) {
|
||||
return false;
|
||||
}
|
||||
if (AUDIO_FILE_VOICE_MEMO_CHANNELS.has(channelId)) {
|
||||
return params.target === "audio-file" && supportsAudioFileVoiceMemoOutput(params);
|
||||
}
|
||||
if (params.target !== "voice-note") {
|
||||
return false;
|
||||
}
|
||||
return params.voiceCompatible === true || supportsTranscodedVoiceNoteTts(params.channel);
|
||||
@@ -1032,6 +1072,8 @@ export async function textToSpeech(params: {
|
||||
channel: params.channel,
|
||||
target: synthesis.target,
|
||||
voiceCompatible: synthesis.voiceCompatible,
|
||||
fileExtension: synthesis.fileExtension,
|
||||
outputFormat: synthesis.outputFormat,
|
||||
}),
|
||||
target: synthesis.target,
|
||||
};
|
||||
@@ -1061,7 +1103,7 @@ export async function synthesizeSpeech(params: {
|
||||
|
||||
const { config, persona, providers } = setup;
|
||||
const timeoutMs = params.timeoutMs ?? config.timeoutMs;
|
||||
const target = supportsNativeVoiceNoteTts(params.channel) ? "voice-note" : "audio-file";
|
||||
const target = resolveTtsSynthesisTarget(params.channel);
|
||||
|
||||
const errors: string[] = [];
|
||||
const attemptedProviders: string[] = [];
|
||||
@@ -1499,6 +1541,7 @@ export const _test = {
|
||||
resolveModelOverridePolicy,
|
||||
supportsNativeVoiceNoteTts,
|
||||
supportsTranscodedVoiceNoteTts,
|
||||
resolveTtsSynthesisTarget,
|
||||
shouldDeliverTtsAsVoice,
|
||||
summarizeText,
|
||||
getResolvedSpeechProviderConfig,
|
||||
|
||||
@@ -69,6 +69,7 @@ function expectTranscriptRendered(
|
||||
expect(ctx?.ctxPayload?.BodyForAgent).toBe(framed);
|
||||
expect(ctx?.ctxPayload?.Body).toContain(framed);
|
||||
expect(ctx?.ctxPayload?.Body).not.toContain("<media:audio>");
|
||||
expect(ctx?.ctxPayload?.MediaTranscribedIndexes).toEqual([0]);
|
||||
}
|
||||
|
||||
function expectAudioPlaceholderRendered(ctx: Awaited<ReturnType<typeof buildGroupVoiceContext>>) {
|
||||
|
||||
@@ -73,6 +73,7 @@ export type TelegramInboundBodyResult = {
|
||||
effectiveWasMentioned: boolean;
|
||||
canDetectMention: boolean;
|
||||
shouldBypassMention: boolean;
|
||||
audioTranscribedMediaIndex?: number;
|
||||
stickerCacheHit: boolean;
|
||||
locationData?: NormalizedLocation;
|
||||
};
|
||||
@@ -230,6 +231,10 @@ export async function resolveTelegramInboundBody(params: {
|
||||
logVerbose(`telegram: audio preflight transcription failed: ${String(err)}`);
|
||||
}
|
||||
}
|
||||
const audioTranscribedMediaIndex =
|
||||
preflightTranscript === undefined
|
||||
? undefined
|
||||
: allMedia.findIndex((media) => media.contentType?.startsWith("audio/"));
|
||||
|
||||
if (hasAudio && bodyText === "<media:audio>" && preflightTranscript) {
|
||||
bodyText = formatAudioTranscriptForAgent(preflightTranscript);
|
||||
@@ -363,6 +368,9 @@ export async function resolveTelegramInboundBody(params: {
|
||||
effectiveWasMentioned,
|
||||
canDetectMention,
|
||||
shouldBypassMention: mentionDecision.shouldBypassMention,
|
||||
...(audioTranscribedMediaIndex !== undefined && audioTranscribedMediaIndex >= 0
|
||||
? { audioTranscribedMediaIndex }
|
||||
: {}),
|
||||
stickerCacheHit,
|
||||
locationData: locationData ?? undefined,
|
||||
};
|
||||
|
||||
@@ -111,6 +111,7 @@ export async function buildTelegramInboundContextPayload(params: {
|
||||
topicConfig?: TelegramTopicConfig;
|
||||
stickerCacheHit: boolean;
|
||||
effectiveWasMentioned: boolean;
|
||||
audioTranscribedMediaIndex?: number;
|
||||
commandAuthorized: boolean;
|
||||
locationData?: NormalizedLocation;
|
||||
options?: TelegramMessageContextOptions;
|
||||
@@ -146,6 +147,7 @@ export async function buildTelegramInboundContextPayload(params: {
|
||||
topicConfig,
|
||||
stickerCacheHit,
|
||||
effectiveWasMentioned,
|
||||
audioTranscribedMediaIndex,
|
||||
commandAuthorized,
|
||||
locationData,
|
||||
options,
|
||||
@@ -372,6 +374,9 @@ export async function buildTelegramInboundContextPayload(params: {
|
||||
contextMedia.length > 0
|
||||
? (contextMedia.map((m) => m.contentType).filter(Boolean) as string[])
|
||||
: undefined,
|
||||
...(audioTranscribedMediaIndex !== undefined
|
||||
? { MediaTranscribedIndexes: [audioTranscribedMediaIndex] }
|
||||
: {}),
|
||||
Sticker: allMedia[0]?.stickerMetadata,
|
||||
StickerMediaIncluded: allMedia[0]?.stickerMetadata ? !stickerCacheHit : undefined,
|
||||
...(locationData ? toLocationContext(locationData) : undefined),
|
||||
|
||||
@@ -578,6 +578,9 @@ export const buildTelegramMessageContext = async ({
|
||||
topicConfig,
|
||||
stickerCacheHit: bodyResult.stickerCacheHit,
|
||||
effectiveWasMentioned: bodyResult.effectiveWasMentioned,
|
||||
...(bodyResult.audioTranscribedMediaIndex !== undefined
|
||||
? { audioTranscribedMediaIndex: bodyResult.audioTranscribedMediaIndex }
|
||||
: {}),
|
||||
locationData: bodyResult.locationData,
|
||||
options,
|
||||
dmAllowFrom,
|
||||
|
||||
@@ -0,0 +1,103 @@
|
||||
import { beforeEach, describe, expect, it, vi } from "vitest";
|
||||
|
||||
vi.mock("./group-activation.js", () => ({
|
||||
resolveGroupActivationFor: vi.fn(async () => "mention"),
|
||||
}));
|
||||
|
||||
import type { MentionConfig } from "../mentions.js";
|
||||
import type { WebInboundMsg } from "../types.js";
|
||||
import { applyGroupGating, type GroupHistoryEntry } from "./group-gating.js";
|
||||
|
||||
function makeGroupAudioMsg(): WebInboundMsg {
|
||||
return {
|
||||
id: "msg-1",
|
||||
from: "1203630@g.us",
|
||||
to: "+15550000001",
|
||||
body: "<media:audio>",
|
||||
chatId: "1203630@g.us",
|
||||
chatType: "group",
|
||||
conversationId: "1203630@g.us",
|
||||
mediaType: "audio/ogg; codecs=opus",
|
||||
mediaPath: "/tmp/voice.ogg",
|
||||
timestamp: 1700000000,
|
||||
accountId: "default",
|
||||
sender: { e164: "+15550000002", name: "Alice" },
|
||||
} as WebInboundMsg;
|
||||
}
|
||||
|
||||
function makeParams(msg: WebInboundMsg, groupHistories: Map<string, GroupHistoryEntry[]>) {
|
||||
return {
|
||||
cfg: {
|
||||
channels: {
|
||||
whatsapp: {
|
||||
groupPolicy: "open",
|
||||
},
|
||||
},
|
||||
messages: {
|
||||
groupChat: {
|
||||
mentionPatterns: ["\\bopenclaw\\b"],
|
||||
},
|
||||
},
|
||||
} as never,
|
||||
msg,
|
||||
conversationId: "1203630@g.us",
|
||||
groupHistoryKey: "whatsapp:group:1203630",
|
||||
agentId: "main",
|
||||
sessionKey: "agent:main:whatsapp:group:1203630",
|
||||
baseMentionConfig: { mentionRegexes: [/\bopenclaw\b/i] } satisfies MentionConfig,
|
||||
groupHistories,
|
||||
groupHistoryLimit: 20,
|
||||
groupMemberNames: new Map<string, Map<string, string>>(),
|
||||
logVerbose: vi.fn(),
|
||||
replyLogger: { debug: vi.fn() },
|
||||
};
|
||||
}
|
||||
|
||||
describe("applyGroupGating audio preflight mention text", () => {
|
||||
let groupHistories: Map<string, GroupHistoryEntry[]>;
|
||||
|
||||
beforeEach(() => {
|
||||
groupHistories = new Map();
|
||||
});
|
||||
|
||||
it("defers a missing mention without storing placeholder history", async () => {
|
||||
const msg = makeGroupAudioMsg();
|
||||
|
||||
const result = await applyGroupGating({
|
||||
...makeParams(msg, groupHistories),
|
||||
deferMissingMention: true,
|
||||
});
|
||||
|
||||
expect(result).toEqual({ shouldProcess: false, needsMentionText: true });
|
||||
expect(groupHistories.get("whatsapp:group:1203630")).toBeUndefined();
|
||||
});
|
||||
|
||||
it("accepts voice transcript text that satisfies mention gating", async () => {
|
||||
const msg = makeGroupAudioMsg();
|
||||
|
||||
const result = await applyGroupGating({
|
||||
...makeParams(msg, groupHistories),
|
||||
mentionText: "openclaw please summarize the thread",
|
||||
});
|
||||
|
||||
expect(result).toEqual({ shouldProcess: true });
|
||||
expect(msg.wasMentioned).toBe(true);
|
||||
expect(groupHistories.get("whatsapp:group:1203630")).toBeUndefined();
|
||||
});
|
||||
|
||||
it("stores transcript text instead of the audio placeholder when mention is still missing", async () => {
|
||||
const msg = makeGroupAudioMsg();
|
||||
|
||||
const result = await applyGroupGating({
|
||||
...makeParams(msg, groupHistories),
|
||||
mentionText: "please summarize the thread",
|
||||
});
|
||||
|
||||
expect(result).toEqual({ shouldProcess: false });
|
||||
expect(groupHistories.get("whatsapp:group:1203630")).toEqual([
|
||||
expect.objectContaining({
|
||||
body: "please summarize the thread",
|
||||
}),
|
||||
]);
|
||||
});
|
||||
});
|
||||
@@ -33,6 +33,8 @@ export type GroupHistoryEntry = {
|
||||
type ApplyGroupGatingParams = {
|
||||
cfg: ReturnType<typeof loadConfig>;
|
||||
msg: WebInboundMsg;
|
||||
mentionText?: string;
|
||||
deferMissingMention?: boolean;
|
||||
conversationId: string;
|
||||
groupHistoryKey: string;
|
||||
agentId: string;
|
||||
@@ -58,6 +60,7 @@ function isOwnerSender(baseMentionConfig: MentionConfig, msg: WebInboundMsg) {
|
||||
|
||||
function recordPendingGroupHistoryEntry(params: {
|
||||
msg: WebInboundMsg;
|
||||
body?: string;
|
||||
groupHistories: Map<string, GroupHistoryEntry[]>;
|
||||
groupHistoryKey: string;
|
||||
groupHistoryLimit: number;
|
||||
@@ -76,7 +79,7 @@ function recordPendingGroupHistoryEntry(params: {
|
||||
limit: params.groupHistoryLimit,
|
||||
entry: {
|
||||
sender,
|
||||
body: params.msg.body,
|
||||
body: params.body ?? params.msg.body,
|
||||
timestamp: params.msg.timestamp,
|
||||
id: params.msg.id,
|
||||
senderJid: senderIdentity.jid ?? params.msg.senderJid,
|
||||
@@ -84,10 +87,15 @@ function recordPendingGroupHistoryEntry(params: {
|
||||
});
|
||||
}
|
||||
|
||||
function skipGroupMessageAndStoreHistory(params: ApplyGroupGatingParams, verboseMessage: string) {
|
||||
function skipGroupMessageAndStoreHistory(
|
||||
params: ApplyGroupGatingParams,
|
||||
verboseMessage: string,
|
||||
body?: string,
|
||||
) {
|
||||
params.logVerbose(verboseMessage);
|
||||
recordPendingGroupHistoryEntry({
|
||||
msg: params.msg,
|
||||
body,
|
||||
groupHistories: params.groupHistories,
|
||||
groupHistoryKey: params.groupHistoryKey,
|
||||
groupHistoryLimit: params.groupHistoryLimit,
|
||||
@@ -126,8 +134,10 @@ export async function applyGroupGating(params: ApplyGroupGatingParams) {
|
||||
...buildMentionConfig(params.cfg, params.agentId),
|
||||
allowFrom: inboundPolicy.configuredAllowFrom,
|
||||
};
|
||||
const mentionMsg =
|
||||
params.mentionText !== undefined ? { ...params.msg, body: params.mentionText } : params.msg;
|
||||
const commandBody = stripMentionsForCommand(
|
||||
params.msg.body,
|
||||
mentionMsg.body,
|
||||
mentionConfig.mentionRegexes,
|
||||
self.e164,
|
||||
);
|
||||
@@ -142,7 +152,7 @@ export async function applyGroupGating(params: ApplyGroupGatingParams) {
|
||||
);
|
||||
}
|
||||
|
||||
const mentionDebug = debugMention(params.msg, mentionConfig, params.authDir);
|
||||
const mentionDebug = debugMention(mentionMsg, mentionConfig, params.authDir);
|
||||
params.replyLogger.debug(
|
||||
{
|
||||
conversationId: params.conversationId,
|
||||
@@ -190,9 +200,16 @@ export async function applyGroupGating(params: ApplyGroupGatingParams) {
|
||||
const effectiveWasMentioned = mentionDecision.effectiveWasMentioned || shouldBypassMention;
|
||||
params.msg.wasMentioned = effectiveWasMentioned;
|
||||
if (!shouldBypassMention && requireMention && mentionDecision.shouldSkip) {
|
||||
if (params.deferMissingMention === true) {
|
||||
params.logVerbose(
|
||||
`Deferring group mention skip until audio preflight completes in ${params.conversationId}`,
|
||||
);
|
||||
return { shouldProcess: false, needsMentionText: true } as const;
|
||||
}
|
||||
return skipGroupMessageAndStoreHistory(
|
||||
params,
|
||||
`Group message stored for context (no mention detected) in ${params.conversationId}: ${params.msg.body}`,
|
||||
`Group message stored for context (no mention detected) in ${params.conversationId}: ${mentionMsg.body}`,
|
||||
params.mentionText,
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
@@ -102,6 +102,7 @@ export function buildWhatsAppInboundContext(params: {
|
||||
route: ReturnType<typeof resolveAgentRoute>;
|
||||
sender: SenderContext;
|
||||
transcript?: string;
|
||||
mediaTranscribedIndexes?: number[];
|
||||
replyThreading?: ReplyThreadingContext;
|
||||
visibleReplyTo?: VisibleReplyTarget;
|
||||
}) {
|
||||
@@ -132,6 +133,7 @@ export function buildWhatsAppInboundContext(params: {
|
||||
MediaPath: params.msg.mediaPath,
|
||||
MediaUrl: params.msg.mediaUrl,
|
||||
MediaType: params.msg.mediaType,
|
||||
MediaTranscribedIndexes: params.mediaTranscribedIndexes,
|
||||
ChatType: params.msg.chatType,
|
||||
Timestamp: params.msg.timestamp,
|
||||
ConversationLabel: params.msg.chatType === "group" ? params.conversationId : params.msg.from,
|
||||
|
||||
@@ -10,6 +10,7 @@ const ackReactionHandle = {
|
||||
ackReactionValue: "👀",
|
||||
remove: vi.fn(async () => undefined),
|
||||
};
|
||||
const applyGroupGatingMock = vi.fn();
|
||||
|
||||
vi.mock("./audio-preflight.runtime.js", () => ({
|
||||
transcribeFirstAudio: (...args: unknown[]) => transcribeFirstAudioMock(...args),
|
||||
@@ -28,7 +29,7 @@ vi.mock("./broadcast.js", () => ({
|
||||
}));
|
||||
|
||||
vi.mock("./group-gating.js", () => ({
|
||||
applyGroupGating: vi.fn(async () => ({ shouldProcess: true })),
|
||||
applyGroupGating: (...args: unknown[]) => applyGroupGatingMock(...args),
|
||||
}));
|
||||
|
||||
vi.mock("./last-route.js", () => ({
|
||||
@@ -127,6 +128,8 @@ describe("createWebOnMessageHandler audio preflight", () => {
|
||||
});
|
||||
processMessageMock.mockReset();
|
||||
processMessageMock.mockResolvedValue(true);
|
||||
applyGroupGatingMock.mockReset();
|
||||
applyGroupGatingMock.mockResolvedValue({ shouldProcess: true });
|
||||
});
|
||||
|
||||
it("sends ack reaction before audio preflight for voice notes", async () => {
|
||||
@@ -258,4 +261,94 @@ describe("createWebOnMessageHandler audio preflight", () => {
|
||||
expect(events).toEqual(["ack", "stt"]);
|
||||
expect(processMessageMock).not.toHaveBeenCalled();
|
||||
});
|
||||
|
||||
it("uses group voice transcript for mention gating before dispatch", async () => {
|
||||
applyGroupGatingMock
|
||||
.mockResolvedValueOnce({ shouldProcess: false, needsMentionText: true })
|
||||
.mockResolvedValueOnce({ shouldProcess: true });
|
||||
const handler = createWebOnMessageHandler({
|
||||
cfg: {
|
||||
channels: {
|
||||
whatsapp: {
|
||||
ackReaction: { enabled: true },
|
||||
},
|
||||
},
|
||||
} as never,
|
||||
verbose: false,
|
||||
connectionId: "conn-1",
|
||||
maxMediaBytes: 1024 * 1024,
|
||||
groupHistoryLimit: 20,
|
||||
groupHistories: new Map(),
|
||||
groupMemberNames: new Map(),
|
||||
echoTracker: makeEchoTracker() as never,
|
||||
backgroundTasks: new Set(),
|
||||
replyResolver: vi.fn() as never,
|
||||
replyLogger: {
|
||||
info: () => {},
|
||||
warn: () => {},
|
||||
debug: () => {},
|
||||
error: () => {},
|
||||
} as never,
|
||||
baseMentionConfig: {} as never,
|
||||
account: { authDir: "/tmp/auth", accountId: "default" },
|
||||
});
|
||||
|
||||
await handler(makeGroupAudioMsg());
|
||||
|
||||
expect(applyGroupGatingMock).toHaveBeenNthCalledWith(
|
||||
1,
|
||||
expect.objectContaining({
|
||||
deferMissingMention: true,
|
||||
}),
|
||||
);
|
||||
expect(events).toEqual(["ack", "stt"]);
|
||||
expect(applyGroupGatingMock).toHaveBeenNthCalledWith(
|
||||
2,
|
||||
expect.objectContaining({
|
||||
mentionText: "transcribed voice note",
|
||||
}),
|
||||
);
|
||||
expect(processMessageMock).toHaveBeenCalledWith(
|
||||
expect.objectContaining({
|
||||
preflightAudioTranscript: "transcribed voice note",
|
||||
ackAlreadySent: true,
|
||||
}),
|
||||
);
|
||||
});
|
||||
|
||||
it("does not transcribe group voice when policy gating rejects before mention", async () => {
|
||||
applyGroupGatingMock.mockResolvedValueOnce({ shouldProcess: false });
|
||||
const handler = createWebOnMessageHandler({
|
||||
cfg: {
|
||||
channels: {
|
||||
whatsapp: {
|
||||
ackReaction: { enabled: true },
|
||||
},
|
||||
},
|
||||
} as never,
|
||||
verbose: false,
|
||||
connectionId: "conn-1",
|
||||
maxMediaBytes: 1024 * 1024,
|
||||
groupHistoryLimit: 20,
|
||||
groupHistories: new Map(),
|
||||
groupMemberNames: new Map(),
|
||||
echoTracker: makeEchoTracker() as never,
|
||||
backgroundTasks: new Set(),
|
||||
replyResolver: vi.fn() as never,
|
||||
replyLogger: {
|
||||
info: () => {},
|
||||
warn: () => {},
|
||||
debug: () => {},
|
||||
error: () => {},
|
||||
} as never,
|
||||
baseMentionConfig: {} as never,
|
||||
account: { authDir: "/tmp/auth", accountId: "default" },
|
||||
});
|
||||
|
||||
await handler(makeGroupAudioMsg());
|
||||
|
||||
expect(transcribeFirstAudioMock).not.toHaveBeenCalled();
|
||||
expect(maybeSendAckReactionMock).not.toHaveBeenCalled();
|
||||
expect(processMessageMock).not.toHaveBeenCalled();
|
||||
});
|
||||
});
|
||||
|
||||
@@ -119,6 +119,58 @@ export function createWebOnMessageHandler(params: {
|
||||
return;
|
||||
}
|
||||
|
||||
// Preflight audio transcription: run once before broadcast fan-out so all
|
||||
// agents share the same transcript instead of each making a separate STT call.
|
||||
// For DMs, only do this on the real inbound path after access-control/pairing
|
||||
// checks have already passed in inbound/monitor.ts. For groups, the first
|
||||
// gating pass must approve the group/sender before STT is attempted.
|
||||
// null = preflight was attempted but produced no transcript (failed / disabled / no audio);
|
||||
// undefined = preflight was not attempted (non-audio message).
|
||||
let preflightAudioTranscript: string | null | undefined;
|
||||
const hasAudioBody =
|
||||
msg.mediaType?.startsWith("audio/") === true && msg.body === "<media:audio>";
|
||||
const canRunEarlyAudioPreflight = msg.chatType === "group" || msg.accessControlPassed === true;
|
||||
let ackAlreadySent = false;
|
||||
let ackReaction: AckReactionHandle | null = null;
|
||||
const runAudioPreflightOnce = async () => {
|
||||
if (
|
||||
preflightAudioTranscript !== undefined ||
|
||||
!canRunEarlyAudioPreflight ||
|
||||
!hasAudioBody ||
|
||||
!msg.mediaPath
|
||||
) {
|
||||
return;
|
||||
}
|
||||
ackReaction = await maybeSendAckReaction({
|
||||
cfg: params.cfg,
|
||||
msg,
|
||||
agentId: route.agentId,
|
||||
sessionKey: route.sessionKey,
|
||||
conversationId,
|
||||
verbose: params.verbose,
|
||||
accountId: route.accountId,
|
||||
info: params.replyLogger.info.bind(params.replyLogger),
|
||||
warn: params.replyLogger.warn.bind(params.replyLogger),
|
||||
});
|
||||
ackAlreadySent = ackReaction !== null;
|
||||
try {
|
||||
const { transcribeFirstAudio } = await import("./audio-preflight.runtime.js");
|
||||
// transcribeFirstAudio returns undefined on failure/disabled; store null so
|
||||
// processMessage knows the attempt was already made and does not retry.
|
||||
preflightAudioTranscript =
|
||||
(await transcribeFirstAudio({
|
||||
ctx: {
|
||||
MediaPaths: [msg.mediaPath],
|
||||
MediaTypes: msg.mediaType ? [msg.mediaType] : undefined,
|
||||
},
|
||||
cfg: params.cfg,
|
||||
})) ?? null;
|
||||
} catch {
|
||||
// Non-fatal: store null so per-agent retries are suppressed.
|
||||
preflightAudioTranscript = null;
|
||||
}
|
||||
};
|
||||
|
||||
if (msg.chatType === "group") {
|
||||
const sender = getSenderIdentity(msg);
|
||||
const metaCtx = {
|
||||
@@ -149,9 +201,10 @@ export function createWebOnMessageHandler(params: {
|
||||
warn: params.replyLogger.warn.bind(params.replyLogger),
|
||||
});
|
||||
|
||||
const gating = await applyGroupGating({
|
||||
let gating = await applyGroupGating({
|
||||
cfg: params.cfg,
|
||||
msg,
|
||||
deferMissingMention: hasAudioBody && Boolean(msg.mediaPath),
|
||||
conversationId,
|
||||
groupHistoryKey,
|
||||
agentId: route.agentId,
|
||||
@@ -165,6 +218,32 @@ export function createWebOnMessageHandler(params: {
|
||||
logVerbose,
|
||||
replyLogger: params.replyLogger,
|
||||
});
|
||||
if (
|
||||
!gating.shouldProcess &&
|
||||
"needsMentionText" in gating &&
|
||||
gating.needsMentionText === true
|
||||
) {
|
||||
await runAudioPreflightOnce();
|
||||
gating = await applyGroupGating({
|
||||
cfg: params.cfg,
|
||||
msg,
|
||||
...(typeof preflightAudioTranscript === "string"
|
||||
? { mentionText: preflightAudioTranscript }
|
||||
: {}),
|
||||
conversationId,
|
||||
groupHistoryKey,
|
||||
agentId: route.agentId,
|
||||
sessionKey: route.sessionKey,
|
||||
baseMentionConfig: params.baseMentionConfig,
|
||||
authDir: params.account.authDir,
|
||||
selfChatMode: params.account.selfChatMode,
|
||||
groupHistories: params.groupHistories,
|
||||
groupHistoryLimit: params.groupHistoryLimit,
|
||||
groupMemberNames: params.groupMemberNames,
|
||||
logVerbose,
|
||||
replyLogger: params.replyLogger,
|
||||
});
|
||||
}
|
||||
if (!gating.shouldProcess) {
|
||||
return;
|
||||
}
|
||||
@@ -179,49 +258,7 @@ export function createWebOnMessageHandler(params: {
|
||||
}
|
||||
}
|
||||
|
||||
// Preflight audio transcription: run once here, before broadcast fan-out, so
|
||||
// all agents share the same transcript instead of each making a separate STT call.
|
||||
// For DMs, only do this on the real inbound path after access-control/pairing
|
||||
// checks have already passed in inbound/monitor.ts. That keeps external STT and
|
||||
// early ack feedback behind the same auth-first gate as the rest of DM handling.
|
||||
// null = preflight was attempted but produced no transcript (failed / disabled / no audio);
|
||||
// undefined = preflight was not attempted (non-audio message).
|
||||
let preflightAudioTranscript: string | null | undefined;
|
||||
const hasAudioBody =
|
||||
msg.mediaType?.startsWith("audio/") === true && msg.body === "<media:audio>";
|
||||
const canRunEarlyDmPreflight = msg.chatType === "group" || msg.accessControlPassed === true;
|
||||
let ackAlreadySent = false;
|
||||
let ackReaction: AckReactionHandle | null = null;
|
||||
if (canRunEarlyDmPreflight && hasAudioBody && msg.mediaPath) {
|
||||
ackReaction = await maybeSendAckReaction({
|
||||
cfg: params.cfg,
|
||||
msg,
|
||||
agentId: route.agentId,
|
||||
sessionKey: route.sessionKey,
|
||||
conversationId,
|
||||
verbose: params.verbose,
|
||||
accountId: route.accountId,
|
||||
info: params.replyLogger.info.bind(params.replyLogger),
|
||||
warn: params.replyLogger.warn.bind(params.replyLogger),
|
||||
});
|
||||
ackAlreadySent = ackReaction !== null;
|
||||
try {
|
||||
const { transcribeFirstAudio } = await import("./audio-preflight.runtime.js");
|
||||
// transcribeFirstAudio returns undefined on failure/disabled; store null so
|
||||
// processMessage knows the attempt was already made and does not retry.
|
||||
preflightAudioTranscript =
|
||||
(await transcribeFirstAudio({
|
||||
ctx: {
|
||||
MediaPaths: [msg.mediaPath],
|
||||
MediaTypes: msg.mediaType ? [msg.mediaType] : undefined,
|
||||
},
|
||||
cfg: params.cfg,
|
||||
})) ?? null;
|
||||
} catch {
|
||||
// Non-fatal: store null so per-agent retries are suppressed.
|
||||
preflightAudioTranscript = null;
|
||||
}
|
||||
}
|
||||
await runAudioPreflightOnce();
|
||||
|
||||
// Broadcast groups: when we'd reply anyway, run multiple agents.
|
||||
// Does not bypass group mention/activation gating above.
|
||||
|
||||
@@ -93,6 +93,7 @@ vi.mock("./inbound-dispatch.js", () => ({
|
||||
commandAuthorized?: boolean;
|
||||
commandBody?: string;
|
||||
msg: { body: string; mediaPath?: string; mediaType?: string };
|
||||
mediaTranscribedIndexes?: number[];
|
||||
rawBody?: string;
|
||||
transcript?: string;
|
||||
}) => ({
|
||||
@@ -102,6 +103,7 @@ vi.mock("./inbound-dispatch.js", () => ({
|
||||
CommandBody: params.commandBody ?? params.msg.body,
|
||||
MediaPath: params.msg.mediaPath,
|
||||
MediaType: params.msg.mediaType,
|
||||
MediaTranscribedIndexes: params.mediaTranscribedIndexes,
|
||||
RawBody: params.rawBody ?? params.msg.body,
|
||||
Transcript: params.transcript,
|
||||
}),
|
||||
@@ -230,6 +232,7 @@ describe("processMessage audio preflight transcription", () => {
|
||||
CommandBody: "<media:audio>",
|
||||
RawBody: "<media:audio>",
|
||||
Transcript: "okay let's test this voice message",
|
||||
MediaTranscribedIndexes: [0],
|
||||
});
|
||||
// mediaPath and mediaType must be preserved so inboundAudio detection (used by
|
||||
// features like messages.tts.auto: "inbound") still recognises this as audio.
|
||||
@@ -315,6 +318,7 @@ describe("processMessage audio preflight transcription", () => {
|
||||
CommandBody: "<media:audio>",
|
||||
RawBody: "<media:audio>",
|
||||
Transcript: "/new start a new session",
|
||||
MediaTranscribedIndexes: [0],
|
||||
});
|
||||
});
|
||||
|
||||
@@ -335,6 +339,7 @@ describe("processMessage audio preflight transcription", () => {
|
||||
CommandBody: "<media:audio>",
|
||||
RawBody: "<media:audio>",
|
||||
Transcript: "pre-computed transcript from fan-out caller",
|
||||
MediaTranscribedIndexes: [0],
|
||||
});
|
||||
});
|
||||
|
||||
|
||||
@@ -257,10 +257,8 @@ export async function processMessage(params: {
|
||||
// If we have a transcript, replace the agent-facing body so the agent sees the spoken text.
|
||||
// mediaPath and mediaType are intentionally preserved so that inboundAudio detection
|
||||
// (used by features such as messages.tts.auto: "inbound") still sees this as an
|
||||
// audio message. The transcript is also stored in Transcript so downstream pipelines
|
||||
// can detect it. Preventing a second STT pass in the media-understanding pipeline
|
||||
// requires SDK-level support (alreadyTranscribed on a shared attachment instance);
|
||||
// that is a shared concern across all channels and is tracked separately.
|
||||
// audio message. The transcript and transcribed media index are also stored on
|
||||
// context so downstream media understanding does not transcribe it again.
|
||||
const msgForAgent =
|
||||
audioTranscript !== undefined ? { ...params.msg, body: audioTranscript } : params.msg;
|
||||
|
||||
@@ -429,6 +427,7 @@ export async function processMessage(params: {
|
||||
e164: sender.e164 ?? undefined,
|
||||
},
|
||||
...(audioTranscript !== undefined ? { transcript: audioTranscript } : {}),
|
||||
...(audioTranscript !== undefined ? { mediaTranscribedIndexes: [0] } : {}),
|
||||
replyThreading,
|
||||
visibleReplyTo: visibleReplyTo ?? undefined,
|
||||
});
|
||||
|
||||
@@ -117,6 +117,8 @@ export type MsgContext = {
|
||||
MediaPaths?: string[];
|
||||
MediaUrls?: string[];
|
||||
MediaTypes?: string[];
|
||||
/** Attachment indexes whose audio was already transcribed before media understanding runs. */
|
||||
MediaTranscribedIndexes?: number[];
|
||||
/** Telegram sticker metadata (emoji, set name, file IDs, cached description). */
|
||||
Sticker?: StickerContextMetadata;
|
||||
/** True when current-turn sticker media is present in MediaPaths (false for cached-description path). */
|
||||
|
||||
@@ -984,6 +984,51 @@ describe("applyMediaUnderstanding", () => {
|
||||
expect(ctx.Transcript).toBe("fallback transcript");
|
||||
});
|
||||
|
||||
it("skips audio STT for attachments marked transcribed by channel preflight", async () => {
|
||||
const dir = await createTempMediaDir();
|
||||
const audioPath = path.join(dir, "voice.ogg");
|
||||
await fs.writeFile(audioPath, createSafeAudioFixtureBuffer(2048));
|
||||
const transcribeAudio = vi.fn(async () => ({ text: "duplicate transcript" }));
|
||||
const ctx: MsgContext = {
|
||||
Body: "preflight transcript",
|
||||
Transcript: "preflight transcript",
|
||||
MediaPath: audioPath,
|
||||
MediaType: "audio/ogg",
|
||||
MediaTranscribedIndexes: [0],
|
||||
};
|
||||
const cfg: OpenClawConfig = {
|
||||
tools: {
|
||||
media: {
|
||||
audio: {
|
||||
enabled: true,
|
||||
models: [{ provider: "groq" }],
|
||||
},
|
||||
},
|
||||
},
|
||||
};
|
||||
|
||||
const result = await applyMediaUnderstanding({
|
||||
ctx,
|
||||
cfg,
|
||||
providers: {
|
||||
groq: {
|
||||
id: "groq",
|
||||
transcribeAudio,
|
||||
},
|
||||
},
|
||||
});
|
||||
|
||||
expect(transcribeAudio).not.toHaveBeenCalled();
|
||||
expect(result.appliedAudio).toBe(false);
|
||||
expect(ctx.Transcript).toBe("preflight transcript");
|
||||
expect(result.decisions).toContainEqual(
|
||||
expect.objectContaining({
|
||||
capability: "audio",
|
||||
outcome: "no-attachment",
|
||||
}),
|
||||
);
|
||||
});
|
||||
|
||||
it("handles multiple audio attachments when attachment mode is all", async () => {
|
||||
const dir = await createTempMediaDir();
|
||||
const audioBytes = createSafeAudioFixtureBuffer(2048);
|
||||
|
||||
@@ -28,6 +28,11 @@ export function normalizeAttachments(ctx: MsgContext): MediaAttachment[] {
|
||||
const pathsFromArray = Array.isArray(ctx.MediaPaths) ? ctx.MediaPaths : undefined;
|
||||
const urlsFromArray = Array.isArray(ctx.MediaUrls) ? ctx.MediaUrls : undefined;
|
||||
const typesFromArray = Array.isArray(ctx.MediaTypes) ? ctx.MediaTypes : undefined;
|
||||
const transcribedIndexes = new Set(
|
||||
Array.isArray(ctx.MediaTranscribedIndexes)
|
||||
? ctx.MediaTranscribedIndexes.filter((index) => Number.isInteger(index) && index >= 0)
|
||||
: [],
|
||||
);
|
||||
const resolveMime = (count: number, index: number) => {
|
||||
const typeHint = normalizeOptionalString(typesFromArray?.[index]);
|
||||
if (typeHint) {
|
||||
@@ -45,6 +50,7 @@ export function normalizeAttachments(ctx: MsgContext): MediaAttachment[] {
|
||||
url: urls?.[index] ?? ctx.MediaUrl,
|
||||
mime: resolveMime(count, index),
|
||||
index,
|
||||
alreadyTranscribed: transcribedIndexes.has(index),
|
||||
}))
|
||||
.filter((entry) => Boolean(entry.path ?? normalizeOptionalString(entry.url)));
|
||||
}
|
||||
@@ -57,6 +63,7 @@ export function normalizeAttachments(ctx: MsgContext): MediaAttachment[] {
|
||||
url: normalizeOptionalString(value),
|
||||
mime: resolveMime(count, index),
|
||||
index,
|
||||
alreadyTranscribed: transcribedIndexes.has(index),
|
||||
}))
|
||||
.filter((entry) => Boolean(entry.url));
|
||||
}
|
||||
@@ -72,6 +79,7 @@ export function normalizeAttachments(ctx: MsgContext): MediaAttachment[] {
|
||||
url: url || undefined,
|
||||
mime: ctx.MediaType,
|
||||
index: 0,
|
||||
alreadyTranscribed: transcribedIndexes.has(0),
|
||||
},
|
||||
];
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user