fix(feishu): transcribe inbound voice notes

This commit is contained in:
Peter Steinberger
2026-04-26 04:47:33 +01:00
parent 38e61e0046
commit 29741f696a
7 changed files with 206 additions and 11 deletions

View File

@@ -80,6 +80,9 @@ Docs: https://docs.openclaw.ai
- TTS/Feishu: normalize final-mode streamed TTS-only audio before delivery so
generated voice-note files use the same safe media path and native voice
routing as normal final replies. Fixes #71920.
- Feishu: transcribe inbound voice-note audio with the shared media audio path
before agent dispatch and keep raw Feishu `file_key` payloads out of message
text. Fixes #67120 and #61876.
- ACP: send subagent and async-task completion wakes to external ACP harnesses as
plain prompts instead of OpenClaw internal runtime-context envelopes, while
keeping those envelopes out of ACP transcripts.

View File

@@ -414,6 +414,15 @@ Full configuration: [Gateway configuration](/gateway/configuration)
- ✅ Video/media
- ✅ Stickers
Inbound Feishu/Lark audio messages are normalized into media placeholders instead
of raw `file_key` JSON. When `tools.media.audio` is configured, OpenClaw
downloads the voice-note resource and runs shared audio transcription before the
agent turn, so the agent receives the spoken transcript. If Feishu includes
transcript text directly in the audio payload, that text is used without another
ASR call. Without an audio transcription provider, the agent still receives a
`<media:audio>` placeholder plus the saved attachment, not the raw Feishu
resource payload.
### Send
- ✅ Text

View File

@@ -0,0 +1,9 @@
import { transcribeFirstAudio as transcribeFirstAudioImpl } from "openclaw/plugin-sdk/media-runtime";
type TranscribeFirstAudio = typeof import("openclaw/plugin-sdk/media-runtime").transcribeFirstAudio;
/**
 * Runtime indirection over the plugin SDK's `transcribeFirstAudio`.
 *
 * This module exists purely as a seam: tests replace it wholesale
 * (e.g. `vi.mock("./audio-preflight.runtime.js")`) without having to
 * stub the plugin-sdk import itself. Parameters and return type are
 * derived from the SDK function, so the forwarder tracks it automatically.
 */
export async function transcribeFirstAudio(
  ...args: Parameters<TranscribeFirstAudio>
): ReturnType<TranscribeFirstAudio> {
  // Await before returning so rejections surface with this frame attached.
  const transcript = await transcribeFirstAudioImpl(...args);
  return transcript;
}

View File

@@ -139,6 +139,18 @@ export function parseMessageContent(content: string, messageType: string): strin
if (messageType === "text") {
return parsed.text || "";
}
if (["image", "file", "audio", "video", "media", "sticker"].includes(messageType)) {
if (messageType === "audio") {
const speechToText =
typeof parsed.speech_to_text === "string" ? parsed.speech_to_text.trim() : "";
if (speechToText) {
return speechToText;
}
}
const placeholder = inferPlaceholder(messageType);
const fileName = typeof parsed.file_name === "string" ? parsed.file_name.trim() : "";
return fileName ? `${placeholder} (${fileName})` : placeholder;
}
if (messageType === "share_chat") {
if (parsed && typeof parsed === "object") {
const share = parsed as { body?: unknown; summary?: unknown; share_chat_id?: unknown };

View File

@@ -1,5 +1,6 @@
import { describe, expect, it } from "vitest";
import type { ClawdbotConfig } from "../runtime-api.js";
import { parseMessageContent } from "./bot-content.js";
import {
buildBroadcastSessionKey,
buildFeishuAgentBody,
@@ -47,6 +48,29 @@ describe("toMessageResourceType", () => {
});
});
describe("parseMessageContent media placeholders", () => {
  // Inbound audio content arrives as raw JSON like {"file_key": "..."}; the
  // agent-facing text must be the generic placeholder, never that payload.
  it("uses an audio placeholder instead of leaking raw file_key JSON", () => {
    expect(
      parseMessageContent(JSON.stringify({ file_key: "file_audio", duration: 1200 }), "audio"),
    ).toBe("<media:audio>");
  });

  // When Feishu has already run ASR and attached speech_to_text, that
  // transcript is used directly (trimmed) instead of a placeholder.
  it("prefers Feishu-provided audio transcript text when present", () => {
    expect(
      parseMessageContent(
        JSON.stringify({ file_key: "file_audio", speech_to_text: " spoken words " }),
        "audio",
      ),
    ).toBe("spoken words");
  });

  // Filenames are safe, useful context and ride along with the placeholder,
  // while raw payload fields such as file_key stay hidden.
  it("keeps media filenames as placeholder context without raw payload fields", () => {
    expect(
      parseMessageContent(JSON.stringify({ file_key: "file_doc", file_name: "q1.pdf" }), "file"),
    ).toBe("<media:document> (q1.pdf)");
  });
});
describe("resolveBroadcastAgents", () => {
it("returns agent list when broadcast config has the peerId", () => {
const cfg: ClawdbotConfig = { broadcast: { oc_group123: ["susan", "main"] } };

View File

@@ -231,6 +231,7 @@ const {
mockResolveBoundConversation,
mockTouchBinding,
mockResolveFeishuReasoningPreviewEnabled,
mockTranscribeFirstAudio,
} = vi.hoisted(() => ({
mockCreateFeishuReplyDispatcher: vi.fn(() => ({
dispatcher: createReplyDispatcher(),
@@ -265,6 +266,7 @@ const {
mockResolveBoundConversation: vi.fn((_ref?: unknown) => null as BoundConversation),
mockTouchBinding: vi.fn(),
mockResolveFeishuReasoningPreviewEnabled: vi.fn(() => false),
mockTranscribeFirstAudio: vi.fn(),
}));
vi.mock("./reply-dispatcher.js", () => ({
@@ -285,6 +287,10 @@ vi.mock("./media.js", () => ({
downloadMessageResourceFeishu: mockDownloadMessageResourceFeishu,
}));
vi.mock("./audio-preflight.runtime.js", () => ({
transcribeFirstAudio: mockTranscribeFirstAudio,
}));
vi.mock("./client.js", () => ({
createFeishuClient: mockCreateFeishuClient,
}));
@@ -357,6 +363,7 @@ describe("handleFeishuMessage ACP routing", () => {
mockResolveBoundConversation.mockReset().mockReturnValue(null);
mockTouchBinding.mockReset();
mockResolveFeishuReasoningPreviewEnabled.mockReset().mockReturnValue(false);
mockTranscribeFirstAudio.mockReset().mockResolvedValue(undefined);
mockResolveAgentRoute.mockReset().mockReturnValue({
...buildDefaultResolveRoute(),
sessionKey: "agent:main:feishu:direct:ou_sender_1",
@@ -555,6 +562,7 @@ describe("handleFeishuMessage command authorization", () => {
mockEnsureConfiguredBindingRouteReady.mockReset().mockResolvedValue({ ok: true });
mockResolveBoundConversation.mockReset().mockReturnValue(null);
mockTouchBinding.mockReset();
mockTranscribeFirstAudio.mockReset().mockResolvedValue(undefined);
mockResolveAgentRoute.mockReturnValue(buildDefaultResolveRoute());
mockCreateFeishuClient.mockReturnValue({
contact: {
@@ -1438,6 +1446,78 @@ describe("handleFeishuMessage command authorization", () => {
expect(mockDispatchReplyFromConfig).not.toHaveBeenCalled();
});
it("transcribes inbound audio before building the agent turn", async () => {
mockShouldComputeCommandAuthorized.mockReturnValue(false);
mockDownloadMessageResourceFeishu.mockResolvedValueOnce({
buffer: Buffer.from("voice"),
contentType: "audio/ogg",
fileName: "voice.ogg",
});
mockSaveMediaBuffer.mockResolvedValueOnce({
id: "inbound-voice.ogg",
path: "/tmp/inbound-voice.ogg",
size: Buffer.byteLength("voice"),
contentType: "audio/ogg",
});
mockTranscribeFirstAudio.mockResolvedValueOnce("voice transcript");
const cfg: ClawdbotConfig = {
channels: {
feishu: {
dmPolicy: "open",
},
},
} as ClawdbotConfig;
const event: FeishuMessageEvent = {
sender: {
sender_id: {
open_id: "ou-voice",
},
},
message: {
message_id: "msg-audio-inbound",
chat_id: "oc-dm",
chat_type: "p2p",
message_type: "audio",
content: JSON.stringify({
file_key: "file_audio_payload",
duration: 1200,
}),
},
};
await dispatchMessage({ cfg, event });
expect(mockDownloadMessageResourceFeishu).toHaveBeenCalledWith(
expect.objectContaining({
messageId: "msg-audio-inbound",
fileKey: "file_audio_payload",
type: "file",
}),
);
expect(mockTranscribeFirstAudio).toHaveBeenCalledWith({
ctx: {
MediaPaths: ["/tmp/inbound-voice.ogg"],
MediaTypes: ["audio/ogg"],
ChatType: "direct",
},
cfg,
});
expect(mockFinalizeInboundContext).toHaveBeenCalledWith(
expect.objectContaining({
BodyForAgent: "[message_id: msg-audio-inbound]\nou-voice: voice transcript",
RawBody: "voice transcript",
CommandBody: "voice transcript",
Transcript: "voice transcript",
MediaPaths: ["/tmp/inbound-voice.ogg"],
MediaTypes: ["audio/ogg"],
}),
);
const finalized = mockFinalizeInboundContext.mock.calls[0]?.[0];
expect(finalized.BodyForAgent).not.toContain("file_audio_payload");
});
it("uses video file_key (not thumbnail image_key) for inbound video download", async () => {
mockShouldComputeCommandAuthorized.mockReturnValue(false);

View File

@@ -57,6 +57,7 @@ import type { FeishuMessageEvent } from "./event-types.js";
import {
isFeishuGroupChatType,
type FeishuMessageContext,
type FeishuMediaInfo,
type FeishuMessageInfo,
} from "./types.js";
import type { DynamicAgentCreationConfig } from "./types.js";
@@ -68,6 +69,37 @@ export { toMessageResourceType } from "./bot-content.js";
const permissionErrorNotifiedAt = new Map<string, number>();
const PERMISSION_ERROR_COOLDOWN_MS = 5 * 60 * 1000; // 5 minutes
/**
 * Resolve a transcript for an inbound Feishu voice note, if possible.
 *
 * Runs only when the parsed message content is exactly the bare
 * `<media:audio>` placeholder (i.e. Feishu supplied no transcript of its own)
 * and at least one downloaded attachment has an `audio/*` content type.
 * Transcription is delegated to the shared media audio path through a lazy
 * import so tests can mock `./audio-preflight.runtime.js`.
 *
 * @returns The transcript text, or `undefined` when preflight does not apply
 *   or transcription fails (failures are logged, never thrown).
 */
async function resolveFeishuAudioPreflightTranscript(params: {
  cfg: ClawdbotConfig;
  mediaList: FeishuMediaInfo[];
  content: string;
  chatType: "direct" | "group";
  log: (msg: string) => void;
}): Promise<string | undefined> {
  if (params.content.trim() !== "<media:audio>") {
    return undefined;
  }
  // Collect paths and content types in a single pass so the two arrays stay
  // index-aligned and no `as string[]` assertion is needed: every entry kept
  // here has a defined `audio/*` content type by construction.
  const audioPaths: string[] = [];
  const audioTypes: string[] = [];
  for (const media of params.mediaList) {
    const contentType = media.contentType;
    if (typeof contentType === "string" && contentType.startsWith("audio/")) {
      audioPaths.push(media.path);
      audioTypes.push(contentType);
    }
  }
  if (audioPaths.length === 0) {
    return undefined;
  }
  try {
    const { transcribeFirstAudio } = await import("./audio-preflight.runtime.js");
    return await transcribeFirstAudio({
      ctx: {
        MediaPaths: audioPaths,
        MediaTypes: audioTypes,
        ChatType: params.chatType,
      },
      cfg: params.cfg,
    });
  } catch (err) {
    // Preflight is best-effort: on any failure (including module load errors)
    // fall back to the placeholder flow instead of breaking the agent turn.
    params.log(`feishu: audio preflight transcription failed: ${String(err)}`);
    return undefined;
  }
}
// --- Broadcast support ---
// Resolve broadcast agent list for a given peer (group) ID.
// Returns null if no broadcast config exists or the peer is not in the broadcast list.
@@ -567,14 +599,6 @@ export async function handleFeishuMessage(params: {
senderIds: [senderUserId],
senderName: ctx.senderName,
}).allowed;
const commandAuthorized = shouldComputeCommandAuthorized
? core.channel.commands.resolveCommandAuthorizedFromAuthorizers({
useAccessGroups,
authorizers: [
{ configured: commandAllowFrom.length > 0, allowed: senderAllowedForCommands },
],
})
: undefined;
// In group chats, the session is scoped to the group, but the *speaker* is the sender.
// Using a group-scoped From causes the agent to treat different users as the same person.
@@ -728,6 +752,39 @@ export async function handleFeishuMessage(params: {
accountId: account.accountId,
});
const mediaPayload = buildAgentMediaPayload(mediaList);
const audioTranscript = await resolveFeishuAudioPreflightTranscript({
cfg: effectiveCfg,
mediaList,
content: ctx.content,
chatType: isGroup ? "group" : "direct",
log,
});
const agentFacingContent = audioTranscript ?? ctx.content;
const agentFacingCtx =
audioTranscript === undefined
? ctx
: {
...ctx,
content: audioTranscript,
};
const effectiveCommandProbeBody =
audioTranscript === undefined
? commandProbeBody
: isGroup
? normalizeFeishuCommandProbeBody(audioTranscript)
: audioTranscript;
const shouldComputeEffectiveCommandAuthorized =
audioTranscript === undefined
? shouldComputeCommandAuthorized
: core.channel.commands.shouldComputeCommandAuthorized(effectiveCommandProbeBody, cfg);
const commandAuthorized = shouldComputeEffectiveCommandAuthorized
? core.channel.commands.resolveCommandAuthorizedFromAuthorizers({
useAccessGroups,
authorizers: [
{ configured: commandAllowFrom.length > 0, allowed: senderAllowedForCommands },
],
})
: undefined;
// Fetch quoted/replied message content if parentId exists
let quotedMessageInfo: Awaited<ReturnType<typeof getMessageFeishu>> = null;
@@ -771,7 +828,7 @@ export async function handleFeishuMessage(params: {
const envelopeOptions = core.channel.reply.resolveEnvelopeFormatOptions(cfg);
const messageBody = buildFeishuAgentBody({
ctx,
ctx: agentFacingCtx,
quotedContent,
permissionErrorForAgent,
botOpenId,
@@ -993,8 +1050,9 @@ export async function handleFeishuMessage(params: {
InboundHistory: inboundHistory,
ReplyToId: ctx.parentId,
RootMessageId: ctx.rootId,
RawBody: ctx.content,
CommandBody: ctx.content,
RawBody: agentFacingContent,
CommandBody: agentFacingContent,
Transcript: audioTranscript,
From: feishuFrom,
To: feishuTo,
SessionKey: agentSessionKey,