fix(feishu): transcribe inbound voice notes

This commit is contained in:
Peter Steinberger
2026-04-26 04:47:33 +01:00
parent 38e61e0046
commit 29741f696a
7 changed files with 206 additions and 11 deletions

View File

@@ -0,0 +1,9 @@
import { transcribeFirstAudio as transcribeFirstAudioImpl } from "openclaw/plugin-sdk/media-runtime";
type TranscribeFirstAudio = typeof import("openclaw/plugin-sdk/media-runtime").transcribeFirstAudio;
/**
 * Forwards every argument unchanged to the `transcribeFirstAudio` implementation
 * imported from `openclaw/plugin-sdk/media-runtime` and resolves with its result.
 */
export async function transcribeFirstAudio(
  ...args: Parameters<TranscribeFirstAudio>
): ReturnType<TranscribeFirstAudio> {
  const transcript = await transcribeFirstAudioImpl(...args);
  return transcript;
}

View File

@@ -139,6 +139,18 @@ export function parseMessageContent(content: string, messageType: string): strin
if (messageType === "text") {
return parsed.text || "";
}
if (["image", "file", "audio", "video", "media", "sticker"].includes(messageType)) {
if (messageType === "audio") {
const speechToText =
typeof parsed.speech_to_text === "string" ? parsed.speech_to_text.trim() : "";
if (speechToText) {
return speechToText;
}
}
const placeholder = inferPlaceholder(messageType);
const fileName = typeof parsed.file_name === "string" ? parsed.file_name.trim() : "";
return fileName ? `${placeholder} (${fileName})` : placeholder;
}
if (messageType === "share_chat") {
if (parsed && typeof parsed === "object") {
const share = parsed as { body?: unknown; summary?: unknown; share_chat_id?: unknown };

View File

@@ -1,5 +1,6 @@
import { describe, expect, it } from "vitest";
import type { ClawdbotConfig } from "../runtime-api.js";
import { parseMessageContent } from "./bot-content.js";
import {
buildBroadcastSessionKey,
buildFeishuAgentBody,
@@ -47,6 +48,29 @@ describe("toMessageResourceType", () => {
});
});
// Media payloads must render as a placeholder (or the provided transcript text),
// never as the raw JSON content that carries the file_key.
describe("parseMessageContent media placeholders", () => {
  // Serializes a payload object the same way the inbound event content is passed in these tests.
  const render = (payload: Record<string, unknown>, messageType: string) =>
    parseMessageContent(JSON.stringify(payload), messageType);

  it("uses an audio placeholder instead of leaking raw file_key JSON", () => {
    const rendered = render({ file_key: "file_audio", duration: 1200 }, "audio");
    expect(rendered).toBe("<media:audio>");
  });

  it("prefers Feishu-provided audio transcript text when present", () => {
    // Surrounding whitespace in speech_to_text is expected to be trimmed away.
    const rendered = render({ file_key: "file_audio", speech_to_text: " spoken words " }, "audio");
    expect(rendered).toBe("spoken words");
  });

  it("keeps media filenames as placeholder context without raw payload fields", () => {
    const rendered = render({ file_key: "file_doc", file_name: "q1.pdf" }, "file");
    expect(rendered).toBe("<media:document> (q1.pdf)");
  });
});
describe("resolveBroadcastAgents", () => {
it("returns agent list when broadcast config has the peerId", () => {
const cfg: ClawdbotConfig = { broadcast: { oc_group123: ["susan", "main"] } };

View File

@@ -231,6 +231,7 @@ const {
mockResolveBoundConversation,
mockTouchBinding,
mockResolveFeishuReasoningPreviewEnabled,
mockTranscribeFirstAudio,
} = vi.hoisted(() => ({
mockCreateFeishuReplyDispatcher: vi.fn(() => ({
dispatcher: createReplyDispatcher(),
@@ -265,6 +266,7 @@ const {
mockResolveBoundConversation: vi.fn((_ref?: unknown) => null as BoundConversation),
mockTouchBinding: vi.fn(),
mockResolveFeishuReasoningPreviewEnabled: vi.fn(() => false),
mockTranscribeFirstAudio: vi.fn(),
}));
vi.mock("./reply-dispatcher.js", () => ({
@@ -285,6 +287,10 @@ vi.mock("./media.js", () => ({
downloadMessageResourceFeishu: mockDownloadMessageResourceFeishu,
}));
// Replace the audio preflight runtime module with the hoisted mock so each test
// controls what `transcribeFirstAudio` resolves to.
vi.mock("./audio-preflight.runtime.js", () => ({
transcribeFirstAudio: mockTranscribeFirstAudio,
}));
vi.mock("./client.js", () => ({
createFeishuClient: mockCreateFeishuClient,
}));
@@ -357,6 +363,7 @@ describe("handleFeishuMessage ACP routing", () => {
mockResolveBoundConversation.mockReset().mockReturnValue(null);
mockTouchBinding.mockReset();
mockResolveFeishuReasoningPreviewEnabled.mockReset().mockReturnValue(false);
mockTranscribeFirstAudio.mockReset().mockResolvedValue(undefined);
mockResolveAgentRoute.mockReset().mockReturnValue({
...buildDefaultResolveRoute(),
sessionKey: "agent:main:feishu:direct:ou_sender_1",
@@ -555,6 +562,7 @@ describe("handleFeishuMessage command authorization", () => {
mockEnsureConfiguredBindingRouteReady.mockReset().mockResolvedValue({ ok: true });
mockResolveBoundConversation.mockReset().mockReturnValue(null);
mockTouchBinding.mockReset();
mockTranscribeFirstAudio.mockReset().mockResolvedValue(undefined);
mockResolveAgentRoute.mockReturnValue(buildDefaultResolveRoute());
mockCreateFeishuClient.mockReturnValue({
contact: {
@@ -1438,6 +1446,78 @@ describe("handleFeishuMessage command authorization", () => {
expect(mockDispatchReplyFromConfig).not.toHaveBeenCalled();
});
// Verifies that an inbound audio message is downloaded, saved, transcribed, and that the
// transcript (not the raw file_key payload) is what reaches the finalized inbound context.
it("transcribes inbound audio before building the agent turn", async () => {
mockShouldComputeCommandAuthorized.mockReturnValue(false);
// The download mock yields a small ogg payload for the audio resource.
mockDownloadMessageResourceFeishu.mockResolvedValueOnce({
buffer: Buffer.from("voice"),
contentType: "audio/ogg",
fileName: "voice.ogg",
});
// The save mock reports the path and content type asserted on below.
mockSaveMediaBuffer.mockResolvedValueOnce({
id: "inbound-voice.ogg",
path: "/tmp/inbound-voice.ogg",
size: Buffer.byteLength("voice"),
contentType: "audio/ogg",
});
// Transcript the mocked preflight returns for this one call.
mockTranscribeFirstAudio.mockResolvedValueOnce("voice transcript");
const cfg: ClawdbotConfig = {
channels: {
feishu: {
dmPolicy: "open",
},
},
} as ClawdbotConfig;
// Direct (p2p) audio event whose content carries only file_key and duration.
const event: FeishuMessageEvent = {
sender: {
sender_id: {
open_id: "ou-voice",
},
},
message: {
message_id: "msg-audio-inbound",
chat_id: "oc-dm",
chat_type: "p2p",
message_type: "audio",
content: JSON.stringify({
file_key: "file_audio_payload",
duration: 1200,
}),
},
};
await dispatchMessage({ cfg, event });
// The audio resource is expected to be requested with resource type "file".
expect(mockDownloadMessageResourceFeishu).toHaveBeenCalledWith(
expect.objectContaining({
messageId: "msg-audio-inbound",
fileKey: "file_audio_payload",
type: "file",
}),
);
// Transcription receives exactly the saved path, its content type, and the chat type.
expect(mockTranscribeFirstAudio).toHaveBeenCalledWith({
ctx: {
MediaPaths: ["/tmp/inbound-voice.ogg"],
MediaTypes: ["audio/ogg"],
ChatType: "direct",
},
cfg,
});
// The transcript replaces the placeholder in every body field and is also exposed as Transcript.
expect(mockFinalizeInboundContext).toHaveBeenCalledWith(
expect.objectContaining({
BodyForAgent: "[message_id: msg-audio-inbound]\nou-voice: voice transcript",
RawBody: "voice transcript",
CommandBody: "voice transcript",
Transcript: "voice transcript",
MediaPaths: ["/tmp/inbound-voice.ogg"],
MediaTypes: ["audio/ogg"],
}),
);
// Guard against the raw file_key leaking into the agent-facing body.
const finalized = mockFinalizeInboundContext.mock.calls[0]?.[0];
expect(finalized.BodyForAgent).not.toContain("file_audio_payload");
});
it("uses video file_key (not thumbnail image_key) for inbound video download", async () => {
mockShouldComputeCommandAuthorized.mockReturnValue(false);

View File

@@ -57,6 +57,7 @@ import type { FeishuMessageEvent } from "./event-types.js";
import {
isFeishuGroupChatType,
type FeishuMessageContext,
type FeishuMediaInfo,
type FeishuMessageInfo,
} from "./types.js";
import type { DynamicAgentCreationConfig } from "./types.js";
@@ -68,6 +69,37 @@ export { toMessageResourceType } from "./bot-content.js";
const permissionErrorNotifiedAt = new Map<string, number>();
const PERMISSION_ERROR_COOLDOWN_MS = 5 * 60 * 1000; // 5 minutes
/**
 * Attempts to transcribe an inbound voice note before the agent turn is built.
 *
 * Transcription is attempted only when the message content is exactly the bare
 * `<media:audio>` placeholder (after trimming) and at least one entry in
 * `mediaList` has a content type starting with `audio/`. Any error raised while
 * loading or running the transcriber is reported through `params.log` and
 * swallowed, so this function never rejects.
 *
 * @param params.cfg - Configuration forwarded unchanged to `transcribeFirstAudio`.
 * @param params.mediaList - Media attached to the message; non-audio entries are ignored.
 * @param params.content - Message content to check against the audio placeholder.
 * @param params.chatType - Forwarded to the transcriber as `ChatType`.
 * @param params.log - Receives one diagnostic line when transcription throws.
 * @returns The transcript, or `undefined` when transcription was skipped, failed,
 *   or produced no non-whitespace text.
 */
async function resolveFeishuAudioPreflightTranscript(params: {
  cfg: ClawdbotConfig;
  mediaList: FeishuMediaInfo[];
  content: string;
  chatType: "direct" | "group";
  log: (msg: string) => void;
}): Promise<string | undefined> {
  if (params.content.trim() !== "<media:audio>") {
    return undefined;
  }
  // Select path and content type together: this narrows `contentType` to `string`
  // without a type assertion and keeps the two arrays below index-aligned.
  const audioMedia = params.mediaList.flatMap((media) =>
    media.contentType?.startsWith("audio/")
      ? [{ path: media.path, contentType: media.contentType }]
      : [],
  );
  if (audioMedia.length === 0) {
    return undefined;
  }
  try {
    // Loaded lazily so the transcription runtime is only imported when an audio message arrives.
    const { transcribeFirstAudio } = await import("./audio-preflight.runtime.js");
    const transcript = await transcribeFirstAudio({
      ctx: {
        MediaPaths: audioMedia.map((media) => media.path),
        MediaTypes: audioMedia.map((media) => media.contentType),
        ChatType: params.chatType,
      },
      cfg: params.cfg,
    });
    // A blank transcript is not nullish, so a caller falling back with `??` would
    // swap the placeholder for an empty body; report "no transcript" instead.
    return typeof transcript === "string" && transcript.trim() !== "" ? transcript : undefined;
  } catch (err) {
    params.log(`feishu: audio preflight transcription failed: ${String(err)}`);
    return undefined;
  }
}
// --- Broadcast support ---
// Resolve broadcast agent list for a given peer (group) ID.
// Returns null if no broadcast config exists or the peer is not in the broadcast list.
@@ -567,14 +599,6 @@ export async function handleFeishuMessage(params: {
senderIds: [senderUserId],
senderName: ctx.senderName,
}).allowed;
const commandAuthorized = shouldComputeCommandAuthorized
? core.channel.commands.resolveCommandAuthorizedFromAuthorizers({
useAccessGroups,
authorizers: [
{ configured: commandAllowFrom.length > 0, allowed: senderAllowedForCommands },
],
})
: undefined;
// In group chats, the session is scoped to the group, but the *speaker* is the sender.
// Using a group-scoped From causes the agent to treat different users as the same person.
@@ -728,6 +752,39 @@ export async function handleFeishuMessage(params: {
accountId: account.accountId,
});
const mediaPayload = buildAgentMediaPayload(mediaList);
const audioTranscript = await resolveFeishuAudioPreflightTranscript({
cfg: effectiveCfg,
mediaList,
content: ctx.content,
chatType: isGroup ? "group" : "direct",
log,
});
const agentFacingContent = audioTranscript ?? ctx.content;
const agentFacingCtx =
audioTranscript === undefined
? ctx
: {
...ctx,
content: audioTranscript,
};
const effectiveCommandProbeBody =
audioTranscript === undefined
? commandProbeBody
: isGroup
? normalizeFeishuCommandProbeBody(audioTranscript)
: audioTranscript;
const shouldComputeEffectiveCommandAuthorized =
audioTranscript === undefined
? shouldComputeCommandAuthorized
: core.channel.commands.shouldComputeCommandAuthorized(effectiveCommandProbeBody, cfg);
const commandAuthorized = shouldComputeEffectiveCommandAuthorized
? core.channel.commands.resolveCommandAuthorizedFromAuthorizers({
useAccessGroups,
authorizers: [
{ configured: commandAllowFrom.length > 0, allowed: senderAllowedForCommands },
],
})
: undefined;
// Fetch quoted/replied message content if parentId exists
let quotedMessageInfo: Awaited<ReturnType<typeof getMessageFeishu>> = null;
@@ -771,7 +828,7 @@ export async function handleFeishuMessage(params: {
const envelopeOptions = core.channel.reply.resolveEnvelopeFormatOptions(cfg);
const messageBody = buildFeishuAgentBody({
ctx,
ctx: agentFacingCtx,
quotedContent,
permissionErrorForAgent,
botOpenId,
@@ -993,8 +1050,9 @@ export async function handleFeishuMessage(params: {
InboundHistory: inboundHistory,
ReplyToId: ctx.parentId,
RootMessageId: ctx.rootId,
RawBody: ctx.content,
CommandBody: ctx.content,
RawBody: agentFacingContent,
CommandBody: agentFacingContent,
Transcript: audioTranscript,
From: feishuFrom,
To: feishuTo,
SessionKey: agentSessionKey,