diff --git a/CHANGELOG.md b/CHANGELOG.md index 6f64cddb1fc..855bbbf1a80 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -80,6 +80,9 @@ Docs: https://docs.openclaw.ai - TTS/Feishu: normalize final-mode streamed TTS-only audio before delivery so generated voice-note files use the same safe media path and native voice routing as normal final replies. Fixes #71920. +- Feishu: transcribe inbound voice-note audio with the shared media audio path + before agent dispatch and keep raw Feishu `file_key` payloads out of message + text. Fixes #67120 and #61876. - ACP: send subagent and async-task completion wakes to external ACP harnesses as plain prompts instead of OpenClaw internal runtime-context envelopes, while keeping those envelopes out of ACP transcripts. diff --git a/docs/channels/feishu.md b/docs/channels/feishu.md index a4f81263b1b..c3ba93b3424 100644 --- a/docs/channels/feishu.md +++ b/docs/channels/feishu.md @@ -414,6 +414,15 @@ Full configuration: [Gateway configuration](/gateway/configuration) - ✅ Video/media - ✅ Stickers +Inbound Feishu/Lark audio messages are normalized as media placeholders instead +of raw `file_key` JSON. When `tools.media.audio` is configured, OpenClaw +downloads the voice-note resource and runs shared audio transcription before the +agent turn, so the agent receives the spoken transcript. If Feishu includes +transcript text directly in the audio payload, that text is used without another +ASR call. Without an audio transcription provider, the agent still receives a +`` placeholder plus the saved attachment, not the raw Feishu +resource payload. 
+ ### Send - ✅ Text diff --git a/extensions/feishu/src/audio-preflight.runtime.ts b/extensions/feishu/src/audio-preflight.runtime.ts new file mode 100644 index 00000000000..7e7f111d104 --- /dev/null +++ b/extensions/feishu/src/audio-preflight.runtime.ts @@ -0,0 +1,9 @@ +import { transcribeFirstAudio as transcribeFirstAudioImpl } from "openclaw/plugin-sdk/media-runtime"; + +type TranscribeFirstAudio = typeof import("openclaw/plugin-sdk/media-runtime").transcribeFirstAudio; + +export async function transcribeFirstAudio( + ...args: Parameters<TranscribeFirstAudio> +): ReturnType<TranscribeFirstAudio> { + return await transcribeFirstAudioImpl(...args); +} diff --git a/extensions/feishu/src/bot-content.ts b/extensions/feishu/src/bot-content.ts index 90e470e2726..5eba58d191d 100644 --- a/extensions/feishu/src/bot-content.ts +++ b/extensions/feishu/src/bot-content.ts @@ -139,6 +139,18 @@ export function parseMessageContent(content: string, messageType: string): strin if (messageType === "text") { return parsed.text || ""; } + if (["image", "file", "audio", "video", "media", "sticker"].includes(messageType)) { + if (messageType === "audio") { + const speechToText = + typeof parsed.speech_to_text === "string" ? parsed.speech_to_text.trim() : ""; + if (speechToText) { + return speechToText; + } + } + const placeholder = inferPlaceholder(messageType); + const fileName = typeof parsed.file_name === "string" ? parsed.file_name.trim() : ""; + return fileName ? 
`${placeholder} (${fileName})` : placeholder; + } if (messageType === "share_chat") { if (parsed && typeof parsed === "object") { const share = parsed as { body?: unknown; summary?: unknown; share_chat_id?: unknown }; diff --git a/extensions/feishu/src/bot.helpers.test.ts b/extensions/feishu/src/bot.helpers.test.ts index fda65c69570..597dbf24484 100644 --- a/extensions/feishu/src/bot.helpers.test.ts +++ b/extensions/feishu/src/bot.helpers.test.ts @@ -1,5 +1,6 @@ import { describe, expect, it } from "vitest"; import type { ClawdbotConfig } from "../runtime-api.js"; +import { parseMessageContent } from "./bot-content.js"; import { buildBroadcastSessionKey, buildFeishuAgentBody, @@ -47,6 +48,29 @@ describe("toMessageResourceType", () => { }); }); +describe("parseMessageContent media placeholders", () => { + it("uses an audio placeholder instead of leaking raw file_key JSON", () => { + expect( + parseMessageContent(JSON.stringify({ file_key: "file_audio", duration: 1200 }), "audio"), + ).toBe(""); + }); + + it("prefers Feishu-provided audio transcript text when present", () => { + expect( + parseMessageContent( + JSON.stringify({ file_key: "file_audio", speech_to_text: " spoken words " }), + "audio", + ), + ).toBe("spoken words"); + }); + + it("keeps media filenames as placeholder context without raw payload fields", () => { + expect( + parseMessageContent(JSON.stringify({ file_key: "file_doc", file_name: "q1.pdf" }), "file"), + ).toBe(" (q1.pdf)"); + }); +}); + describe("resolveBroadcastAgents", () => { it("returns agent list when broadcast config has the peerId", () => { const cfg: ClawdbotConfig = { broadcast: { oc_group123: ["susan", "main"] } }; diff --git a/extensions/feishu/src/bot.test.ts b/extensions/feishu/src/bot.test.ts index 07f7c334971..9e4b966ad0c 100644 --- a/extensions/feishu/src/bot.test.ts +++ b/extensions/feishu/src/bot.test.ts @@ -231,6 +231,7 @@ const { mockResolveBoundConversation, mockTouchBinding, mockResolveFeishuReasoningPreviewEnabled, + 
mockTranscribeFirstAudio, } = vi.hoisted(() => ({ mockCreateFeishuReplyDispatcher: vi.fn(() => ({ dispatcher: createReplyDispatcher(), @@ -265,6 +266,7 @@ const { mockResolveBoundConversation: vi.fn((_ref?: unknown) => null as BoundConversation), mockTouchBinding: vi.fn(), mockResolveFeishuReasoningPreviewEnabled: vi.fn(() => false), + mockTranscribeFirstAudio: vi.fn(), })); vi.mock("./reply-dispatcher.js", () => ({ @@ -285,6 +287,10 @@ vi.mock("./media.js", () => ({ downloadMessageResourceFeishu: mockDownloadMessageResourceFeishu, })); +vi.mock("./audio-preflight.runtime.js", () => ({ + transcribeFirstAudio: mockTranscribeFirstAudio, +})); + vi.mock("./client.js", () => ({ createFeishuClient: mockCreateFeishuClient, })); @@ -357,6 +363,7 @@ describe("handleFeishuMessage ACP routing", () => { mockResolveBoundConversation.mockReset().mockReturnValue(null); mockTouchBinding.mockReset(); mockResolveFeishuReasoningPreviewEnabled.mockReset().mockReturnValue(false); + mockTranscribeFirstAudio.mockReset().mockResolvedValue(undefined); mockResolveAgentRoute.mockReset().mockReturnValue({ ...buildDefaultResolveRoute(), sessionKey: "agent:main:feishu:direct:ou_sender_1", @@ -555,6 +562,7 @@ describe("handleFeishuMessage command authorization", () => { mockEnsureConfiguredBindingRouteReady.mockReset().mockResolvedValue({ ok: true }); mockResolveBoundConversation.mockReset().mockReturnValue(null); mockTouchBinding.mockReset(); + mockTranscribeFirstAudio.mockReset().mockResolvedValue(undefined); mockResolveAgentRoute.mockReturnValue(buildDefaultResolveRoute()); mockCreateFeishuClient.mockReturnValue({ contact: { @@ -1438,6 +1446,78 @@ describe("handleFeishuMessage command authorization", () => { expect(mockDispatchReplyFromConfig).not.toHaveBeenCalled(); }); + it("transcribes inbound audio before building the agent turn", async () => { + mockShouldComputeCommandAuthorized.mockReturnValue(false); + mockDownloadMessageResourceFeishu.mockResolvedValueOnce({ + buffer: 
Buffer.from("voice"), + contentType: "audio/ogg", + fileName: "voice.ogg", + }); + mockSaveMediaBuffer.mockResolvedValueOnce({ + id: "inbound-voice.ogg", + path: "/tmp/inbound-voice.ogg", + size: Buffer.byteLength("voice"), + contentType: "audio/ogg", + }); + mockTranscribeFirstAudio.mockResolvedValueOnce("voice transcript"); + + const cfg: ClawdbotConfig = { + channels: { + feishu: { + dmPolicy: "open", + }, + }, + } as ClawdbotConfig; + + const event: FeishuMessageEvent = { + sender: { + sender_id: { + open_id: "ou-voice", + }, + }, + message: { + message_id: "msg-audio-inbound", + chat_id: "oc-dm", + chat_type: "p2p", + message_type: "audio", + content: JSON.stringify({ + file_key: "file_audio_payload", + duration: 1200, + }), + }, + }; + + await dispatchMessage({ cfg, event }); + + expect(mockDownloadMessageResourceFeishu).toHaveBeenCalledWith( + expect.objectContaining({ + messageId: "msg-audio-inbound", + fileKey: "file_audio_payload", + type: "file", + }), + ); + expect(mockTranscribeFirstAudio).toHaveBeenCalledWith({ + ctx: { + MediaPaths: ["/tmp/inbound-voice.ogg"], + MediaTypes: ["audio/ogg"], + ChatType: "direct", + }, + cfg, + }); + expect(mockFinalizeInboundContext).toHaveBeenCalledWith( + expect.objectContaining({ + BodyForAgent: "[message_id: msg-audio-inbound]\nou-voice: voice transcript", + RawBody: "voice transcript", + CommandBody: "voice transcript", + Transcript: "voice transcript", + MediaPaths: ["/tmp/inbound-voice.ogg"], + MediaTypes: ["audio/ogg"], + }), + ); + const finalized = mockFinalizeInboundContext.mock.calls[0]?.[0]; + expect(finalized.BodyForAgent).not.toContain("file_audio_payload"); + }); + it("uses video file_key (not thumbnail image_key) for inbound video download", async () => { mockShouldComputeCommandAuthorized.mockReturnValue(false); diff --git a/extensions/feishu/src/bot.ts b/extensions/feishu/src/bot.ts index 4d1fad5ee21..8b871b72d3d 100644 --- a/extensions/feishu/src/bot.ts +++ b/extensions/feishu/src/bot.ts @@ -57,6 
+57,7 @@ import type { FeishuMessageEvent } from "./event-types.js"; import { isFeishuGroupChatType, type FeishuMessageContext, + type FeishuMediaInfo, type FeishuMessageInfo, } from "./types.js"; import type { DynamicAgentCreationConfig } from "./types.js"; @@ -68,6 +69,37 @@ export { toMessageResourceType } from "./bot-content.js"; const permissionErrorNotifiedAt = new Map(); const PERMISSION_ERROR_COOLDOWN_MS = 5 * 60 * 1000; // 5 minutes +async function resolveFeishuAudioPreflightTranscript(params: { + cfg: ClawdbotConfig; + mediaList: FeishuMediaInfo[]; + content: string; + chatType: "direct" | "group"; + log: (msg: string) => void; +}): Promise<string | undefined> { + if (params.content.trim() !== "") { + return undefined; + } + const audioMedia = params.mediaList.filter((media) => media.contentType?.startsWith("audio/")); + if (audioMedia.length === 0) { + return undefined; + } + + try { + const { transcribeFirstAudio } = await import("./audio-preflight.runtime.js"); + return await transcribeFirstAudio({ + ctx: { + MediaPaths: audioMedia.map((media) => media.path), + MediaTypes: audioMedia.map((media) => media.contentType).filter(Boolean) as string[], + ChatType: params.chatType, + }, + cfg: params.cfg, + }); + } catch (err) { + params.log(`feishu: audio preflight transcription failed: ${String(err)}`); + return undefined; + } +} + // --- Broadcast support --- // Resolve broadcast agent list for a given peer (group) ID. // Returns null if no broadcast config exists or the peer is not in the broadcast list. @@ -567,14 +599,6 @@ export async function handleFeishuMessage(params: { senderIds: [senderUserId], senderName: ctx.senderName, }).allowed; - const commandAuthorized = shouldComputeCommandAuthorized - ? 
core.channel.commands.resolveCommandAuthorizedFromAuthorizers({ - useAccessGroups, - authorizers: [ - { configured: commandAllowFrom.length > 0, allowed: senderAllowedForCommands }, - ], - }) - : undefined; // In group chats, the session is scoped to the group, but the *speaker* is the sender. // Using a group-scoped From causes the agent to treat different users as the same person. @@ -728,6 +752,39 @@ export async function handleFeishuMessage(params: { accountId: account.accountId, }); const mediaPayload = buildAgentMediaPayload(mediaList); + const audioTranscript = await resolveFeishuAudioPreflightTranscript({ + cfg: effectiveCfg, + mediaList, + content: ctx.content, + chatType: isGroup ? "group" : "direct", + log, + }); + const agentFacingContent = audioTranscript ?? ctx.content; + const agentFacingCtx = + audioTranscript === undefined + ? ctx + : { + ...ctx, + content: audioTranscript, + }; + const effectiveCommandProbeBody = + audioTranscript === undefined + ? commandProbeBody + : isGroup + ? normalizeFeishuCommandProbeBody(audioTranscript) + : audioTranscript; + const shouldComputeEffectiveCommandAuthorized = + audioTranscript === undefined + ? shouldComputeCommandAuthorized + : core.channel.commands.shouldComputeCommandAuthorized(effectiveCommandProbeBody, cfg); + const commandAuthorized = shouldComputeEffectiveCommandAuthorized + ? 
core.channel.commands.resolveCommandAuthorizedFromAuthorizers({ + useAccessGroups, + authorizers: [ + { configured: commandAllowFrom.length > 0, allowed: senderAllowedForCommands }, + ], + }) + : undefined; // Fetch quoted/replied message content if parentId exists let quotedMessageInfo: Awaited> = null; @@ -771,7 +828,7 @@ export async function handleFeishuMessage(params: { const envelopeOptions = core.channel.reply.resolveEnvelopeFormatOptions(cfg); const messageBody = buildFeishuAgentBody({ - ctx, + ctx: agentFacingCtx, quotedContent, permissionErrorForAgent, botOpenId, @@ -993,8 +1050,9 @@ export async function handleFeishuMessage(params: { InboundHistory: inboundHistory, ReplyToId: ctx.parentId, RootMessageId: ctx.rootId, - RawBody: ctx.content, - CommandBody: ctx.content, + RawBody: agentFacingContent, + CommandBody: agentFacingContent, + Transcript: audioTranscript, From: feishuFrom, To: feishuTo, SessionKey: agentSessionKey,