fix(feishu): transcribe inbound voice notes

This commit is contained in:
Peter Steinberger
2026-04-26 04:47:33 +01:00
parent 38e61e0046
commit 29741f696a
7 changed files with 206 additions and 11 deletions

View File

@@ -80,6 +80,9 @@ Docs: https://docs.openclaw.ai
- TTS/Feishu: normalize final-mode streamed TTS-only audio before delivery so
generated voice-note files use the same safe media path and native voice
routing as normal final replies. Fixes #71920.
- Feishu: transcribe inbound voice-note audio with the shared media audio path
before agent dispatch and keep raw Feishu `file_key` payloads out of message
text. Fixes #67120 and #61876.
- ACP: send subagent and async-task completion wakes to external ACP harnesses as
plain prompts instead of OpenClaw internal runtime-context envelopes, while
keeping those envelopes out of ACP transcripts.

View File

@@ -414,6 +414,15 @@ Full configuration: [Gateway configuration](/gateway/configuration)
- ✅ Video/media
- ✅ Stickers
Inbound Feishu/Lark audio messages are normalized into media placeholders instead
of raw `file_key` JSON. When `tools.media.audio` is configured, OpenClaw
downloads the voice-note resource and runs shared audio transcription before the
agent turn, so the agent receives the spoken transcript. If Feishu includes
transcript text directly in the audio payload, that text is used without another
ASR call. Without an audio transcription provider, the agent still receives a
`<media:audio>` placeholder plus the saved attachment, not the raw Feishu
resource payload.
### Send
- ✅ Text

View File

@@ -0,0 +1,9 @@
import { transcribeFirstAudio as transcribeFirstAudioImpl } from "openclaw/plugin-sdk/media-runtime";
type TranscribeFirstAudio = typeof import("openclaw/plugin-sdk/media-runtime").transcribeFirstAudio;
/**
 * Runtime indirection over the plugin SDK's `transcribeFirstAudio`.
 *
 * This module exists purely as a seam: tests replace it wholesale
 * (e.g. `vi.mock("./audio-preflight.runtime.js")`) without having to
 * stub the plugin-sdk import itself. Parameters and return type are
 * derived from the SDK function, so the forwarder tracks it automatically.
 */
export async function transcribeFirstAudio(
  ...args: Parameters<TranscribeFirstAudio>
): ReturnType<TranscribeFirstAudio> {
  // Await before returning so rejections surface with this frame attached.
  const transcript = await transcribeFirstAudioImpl(...args);
  return transcript;
}

View File

@@ -139,6 +139,18 @@ export function parseMessageContent(content: string, messageType: string): strin
if (messageType === "text") {
return parsed.text || "";
}
if (["image", "file", "audio", "video", "media", "sticker"].includes(messageType)) {
if (messageType === "audio") {
const speechToText =
typeof parsed.speech_to_text === "string" ? parsed.speech_to_text.trim() : "";
if (speechToText) {
return speechToText;
}
}
const placeholder = inferPlaceholder(messageType);
const fileName = typeof parsed.file_name === "string" ? parsed.file_name.trim() : "";
return fileName ? `${placeholder} (${fileName})` : placeholder;
}
if (messageType === "share_chat") {
if (parsed && typeof parsed === "object") {
const share = parsed as { body?: unknown; summary?: unknown; share_chat_id?: unknown };

View File

@@ -1,5 +1,6 @@
import { describe, expect, it } from "vitest";
import type { ClawdbotConfig } from "../runtime-api.js";
import { parseMessageContent } from "./bot-content.js";
import {
buildBroadcastSessionKey,
buildFeishuAgentBody,
@@ -47,6 +48,29 @@ describe("toMessageResourceType", () => {
});
});
describe("parseMessageContent media placeholders", () => {
  // Inbound audio content arrives as raw JSON like {"file_key": "..."}; the
  // agent-facing text must be the generic placeholder, never that payload.
  it("uses an audio placeholder instead of leaking raw file_key JSON", () => {
    expect(
      parseMessageContent(JSON.stringify({ file_key: "file_audio", duration: 1200 }), "audio"),
    ).toBe("<media:audio>");
  });

  // When Feishu has already run ASR and attached speech_to_text, that
  // transcript is used directly (trimmed) instead of a placeholder.
  it("prefers Feishu-provided audio transcript text when present", () => {
    expect(
      parseMessageContent(
        JSON.stringify({ file_key: "file_audio", speech_to_text: " spoken words " }),
        "audio",
      ),
    ).toBe("spoken words");
  });

  // Filenames are safe, useful context and ride along with the placeholder,
  // while raw payload fields such as file_key stay hidden.
  it("keeps media filenames as placeholder context without raw payload fields", () => {
    expect(
      parseMessageContent(JSON.stringify({ file_key: "file_doc", file_name: "q1.pdf" }), "file"),
    ).toBe("<media:document> (q1.pdf)");
  });
});
describe("resolveBroadcastAgents", () => {
it("returns agent list when broadcast config has the peerId", () => {
const cfg: ClawdbotConfig = { broadcast: { oc_group123: ["susan", "main"] } };

View File

@@ -231,6 +231,7 @@ const {
mockResolveBoundConversation,
mockTouchBinding,
mockResolveFeishuReasoningPreviewEnabled,
mockTranscribeFirstAudio,
} = vi.hoisted(() => ({
mockCreateFeishuReplyDispatcher: vi.fn(() => ({
dispatcher: createReplyDispatcher(),
@@ -265,6 +266,7 @@ const {
mockResolveBoundConversation: vi.fn((_ref?: unknown) => null as BoundConversation),
mockTouchBinding: vi.fn(),
mockResolveFeishuReasoningPreviewEnabled: vi.fn(() => false),
mockTranscribeFirstAudio: vi.fn(),
}));
vi.mock("./reply-dispatcher.js", () => ({
@@ -285,6 +287,10 @@ vi.mock("./media.js", () => ({
downloadMessageResourceFeishu: mockDownloadMessageResourceFeishu,
}));
vi.mock("./audio-preflight.runtime.js", () => ({
transcribeFirstAudio: mockTranscribeFirstAudio,
}));
vi.mock("./client.js", () => ({
createFeishuClient: mockCreateFeishuClient,
}));
@@ -357,6 +363,7 @@ describe("handleFeishuMessage ACP routing", () => {
mockResolveBoundConversation.mockReset().mockReturnValue(null);
mockTouchBinding.mockReset();
mockResolveFeishuReasoningPreviewEnabled.mockReset().mockReturnValue(false);
mockTranscribeFirstAudio.mockReset().mockResolvedValue(undefined);
mockResolveAgentRoute.mockReset().mockReturnValue({
...buildDefaultResolveRoute(),
sessionKey: "agent:main:feishu:direct:ou_sender_1",
@@ -555,6 +562,7 @@ describe("handleFeishuMessage command authorization", () => {
mockEnsureConfiguredBindingRouteReady.mockReset().mockResolvedValue({ ok: true });
mockResolveBoundConversation.mockReset().mockReturnValue(null);
mockTouchBinding.mockReset();
mockTranscribeFirstAudio.mockReset().mockResolvedValue(undefined);
mockResolveAgentRoute.mockReturnValue(buildDefaultResolveRoute());
mockCreateFeishuClient.mockReturnValue({
contact: {
@@ -1438,6 +1446,78 @@ describe("handleFeishuMessage command authorization", () => {
expect(mockDispatchReplyFromConfig).not.toHaveBeenCalled();
});
it("transcribes inbound audio before building the agent turn", async () => {
mockShouldComputeCommandAuthorized.mockReturnValue(false);
mockDownloadMessageResourceFeishu.mockResolvedValueOnce({
buffer: Buffer.from("voice"),
contentType: "audio/ogg",
fileName: "voice.ogg",
});
mockSaveMediaBuffer.mockResolvedValueOnce({
id: "inbound-voice.ogg",
path: "/tmp/inbound-voice.ogg",
size: Buffer.byteLength("voice"),
contentType: "audio/ogg",
});
mockTranscribeFirstAudio.mockResolvedValueOnce("voice transcript");
const cfg: ClawdbotConfig = {
channels: {
feishu: {
dmPolicy: "open",
},
},
} as ClawdbotConfig;
const event: FeishuMessageEvent = {
sender: {
sender_id: {
open_id: "ou-voice",
},
},
message: {
message_id: "msg-audio-inbound",
chat_id: "oc-dm",
chat_type: "p2p",
message_type: "audio",
content: JSON.stringify({
file_key: "file_audio_payload",
duration: 1200,
}),
},
};
await dispatchMessage({ cfg, event });
expect(mockDownloadMessageResourceFeishu).toHaveBeenCalledWith(
expect.objectContaining({
messageId: "msg-audio-inbound",
fileKey: "file_audio_payload",
type: "file",
}),
);
expect(mockTranscribeFirstAudio).toHaveBeenCalledWith({
ctx: {
MediaPaths: ["/tmp/inbound-voice.ogg"],
MediaTypes: ["audio/ogg"],
ChatType: "direct",
},
cfg,
});
expect(mockFinalizeInboundContext).toHaveBeenCalledWith(
expect.objectContaining({
BodyForAgent: "[message_id: msg-audio-inbound]\nou-voice: voice transcript",
RawBody: "voice transcript",
CommandBody: "voice transcript",
Transcript: "voice transcript",
MediaPaths: ["/tmp/inbound-voice.ogg"],
MediaTypes: ["audio/ogg"],
}),
);
const finalized = mockFinalizeInboundContext.mock.calls[0]?.[0];
expect(finalized.BodyForAgent).not.toContain("file_audio_payload");
});
it("uses video file_key (not thumbnail image_key) for inbound video download", async () => {
mockShouldComputeCommandAuthorized.mockReturnValue(false);

View File

@@ -57,6 +57,7 @@ import type { FeishuMessageEvent } from "./event-types.js";
import {
isFeishuGroupChatType,
type FeishuMessageContext,
type FeishuMediaInfo,
type FeishuMessageInfo,
} from "./types.js";
import type { DynamicAgentCreationConfig } from "./types.js";
@@ -68,6 +69,37 @@ export { toMessageResourceType } from "./bot-content.js";
const permissionErrorNotifiedAt = new Map<string, number>();
const PERMISSION_ERROR_COOLDOWN_MS = 5 * 60 * 1000; // 5 minutes
/**
 * Resolve a transcript for an inbound Feishu voice note, if possible.
 *
 * Runs only when the parsed message content is exactly the bare
 * `<media:audio>` placeholder (i.e. Feishu supplied no transcript of its own)
 * and at least one downloaded attachment has an `audio/*` content type.
 * Transcription is delegated to the shared media audio path through a lazy
 * import so tests can mock `./audio-preflight.runtime.js`.
 *
 * @returns The transcript text, or `undefined` when preflight does not apply
 *   or transcription fails (failures are logged, never thrown).
 */
async function resolveFeishuAudioPreflightTranscript(params: {
  cfg: ClawdbotConfig;
  mediaList: FeishuMediaInfo[];
  content: string;
  chatType: "direct" | "group";
  log: (msg: string) => void;
}): Promise<string | undefined> {
  if (params.content.trim() !== "<media:audio>") {
    return undefined;
  }
  // Collect paths and content types in a single pass so the two arrays stay
  // index-aligned and no `as string[]` assertion is needed: every entry kept
  // here has a defined `audio/*` content type by construction.
  const audioPaths: string[] = [];
  const audioTypes: string[] = [];
  for (const media of params.mediaList) {
    const contentType = media.contentType;
    if (typeof contentType === "string" && contentType.startsWith("audio/")) {
      audioPaths.push(media.path);
      audioTypes.push(contentType);
    }
  }
  if (audioPaths.length === 0) {
    return undefined;
  }
  try {
    const { transcribeFirstAudio } = await import("./audio-preflight.runtime.js");
    return await transcribeFirstAudio({
      ctx: {
        MediaPaths: audioPaths,
        MediaTypes: audioTypes,
        ChatType: params.chatType,
      },
      cfg: params.cfg,
    });
  } catch (err) {
    // Preflight is best-effort: on any failure (including module load errors)
    // fall back to the placeholder flow instead of breaking the agent turn.
    params.log(`feishu: audio preflight transcription failed: ${String(err)}`);
    return undefined;
  }
}
// --- Broadcast support ---
// Resolve broadcast agent list for a given peer (group) ID.
// Returns null if no broadcast config exists or the peer is not in the broadcast list.
@@ -567,14 +599,6 @@ export async function handleFeishuMessage(params: {
senderIds: [senderUserId],
senderName: ctx.senderName,
}).allowed;
const commandAuthorized = shouldComputeCommandAuthorized
? core.channel.commands.resolveCommandAuthorizedFromAuthorizers({
useAccessGroups,
authorizers: [
{ configured: commandAllowFrom.length > 0, allowed: senderAllowedForCommands },
],
})
: undefined;
// In group chats, the session is scoped to the group, but the *speaker* is the sender.
// Using a group-scoped From causes the agent to treat different users as the same person.
@@ -728,6 +752,39 @@ export async function handleFeishuMessage(params: {
accountId: account.accountId,
});
const mediaPayload = buildAgentMediaPayload(mediaList);
const audioTranscript = await resolveFeishuAudioPreflightTranscript({
cfg: effectiveCfg,
mediaList,
content: ctx.content,
chatType: isGroup ? "group" : "direct",
log,
});
const agentFacingContent = audioTranscript ?? ctx.content;
const agentFacingCtx =
audioTranscript === undefined
? ctx
: {
...ctx,
content: audioTranscript,
};
const effectiveCommandProbeBody =
audioTranscript === undefined
? commandProbeBody
: isGroup
? normalizeFeishuCommandProbeBody(audioTranscript)
: audioTranscript;
const shouldComputeEffectiveCommandAuthorized =
audioTranscript === undefined
? shouldComputeCommandAuthorized
: core.channel.commands.shouldComputeCommandAuthorized(effectiveCommandProbeBody, cfg);
const commandAuthorized = shouldComputeEffectiveCommandAuthorized
? core.channel.commands.resolveCommandAuthorizedFromAuthorizers({
useAccessGroups,
authorizers: [
{ configured: commandAllowFrom.length > 0, allowed: senderAllowedForCommands },
],
})
: undefined;
// Fetch quoted/replied message content if parentId exists
let quotedMessageInfo: Awaited<ReturnType<typeof getMessageFeishu>> = null;
@@ -771,7 +828,7 @@ export async function handleFeishuMessage(params: {
const envelopeOptions = core.channel.reply.resolveEnvelopeFormatOptions(cfg);
const messageBody = buildFeishuAgentBody({
ctx,
ctx: agentFacingCtx,
quotedContent,
permissionErrorForAgent,
botOpenId,
@@ -993,8 +1050,9 @@ export async function handleFeishuMessage(params: {
InboundHistory: inboundHistory,
ReplyToId: ctx.parentId,
RootMessageId: ctx.rootId,
RawBody: ctx.content,
CommandBody: ctx.content,
RawBody: agentFacingContent,
CommandBody: agentFacingContent,
Transcript: audioTranscript,
From: feishuFrom,
To: feishuTo,
SessionKey: agentSessionKey,