mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-06 06:40:44 +00:00
fix(feishu): transcribe inbound voice notes
This commit is contained in:
@@ -80,6 +80,9 @@ Docs: https://docs.openclaw.ai
|
||||
- TTS/Feishu: normalize final-mode streamed TTS-only audio before delivery so
|
||||
generated voice-note files use the same safe media path and native voice
|
||||
routing as normal final replies. Fixes #71920.
|
||||
- Feishu: transcribe inbound voice-note audio with the shared media audio path
  before agent dispatch and keep raw Feishu `file_key` payloads out of message
  text. Fixes #67120 and #61876.
|
||||
- ACP: send subagent and async-task completion wakes to external ACP harnesses as
|
||||
plain prompts instead of OpenClaw internal runtime-context envelopes, while
|
||||
keeping those envelopes out of ACP transcripts.
|
||||
|
||||
@@ -414,6 +414,15 @@ Full configuration: [Gateway configuration](/gateway/configuration)
|
||||
- ✅ Video/media
|
||||
- ✅ Stickers
|
||||
|
||||
Inbound Feishu/Lark audio messages are normalized as media placeholders instead
of raw `file_key` JSON. When `tools.media.audio` is configured, OpenClaw
downloads the voice-note resource and runs shared audio transcription before the
agent turn, so the agent receives the spoken transcript. If Feishu includes
transcript text directly in the audio payload, that text is used without another
ASR call. Without an audio transcription provider, the agent still receives a
`<media:audio>` placeholder plus the saved attachment, not the raw Feishu
resource payload.
|
||||
|
||||
### Send
|
||||
|
||||
- ✅ Text
|
||||
|
||||
9
extensions/feishu/src/audio-preflight.runtime.ts
Normal file
9
extensions/feishu/src/audio-preflight.runtime.ts
Normal file
@@ -0,0 +1,9 @@
|
||||
import { transcribeFirstAudio as transcribeFirstAudioImpl } from "openclaw/plugin-sdk/media-runtime";
|
||||
|
||||
type TranscribeFirstAudio = typeof import("openclaw/plugin-sdk/media-runtime").transcribeFirstAudio;
|
||||
|
||||
/**
 * Forwards a call to the shared `transcribeFirstAudio` implementation exported
 * by `openclaw/plugin-sdk/media-runtime`.
 *
 * Both the argument list and the return type are derived from the SDK function
 * through the `TranscribeFirstAudio` alias, so this wrapper follows the SDK
 * signature without restating it.
 *
 * @param args - Arguments passed through unchanged to the SDK implementation.
 * @returns Whatever the SDK implementation resolves with.
 */
export async function transcribeFirstAudio(
  ...args: Parameters<TranscribeFirstAudio>
): ReturnType<TranscribeFirstAudio> {
  const transcript = await transcribeFirstAudioImpl(...args);
  return transcript;
}
|
||||
@@ -139,6 +139,18 @@ export function parseMessageContent(content: string, messageType: string): strin
|
||||
if (messageType === "text") {
|
||||
return parsed.text || "";
|
||||
}
|
||||
if (["image", "file", "audio", "video", "media", "sticker"].includes(messageType)) {
|
||||
if (messageType === "audio") {
|
||||
const speechToText =
|
||||
typeof parsed.speech_to_text === "string" ? parsed.speech_to_text.trim() : "";
|
||||
if (speechToText) {
|
||||
return speechToText;
|
||||
}
|
||||
}
|
||||
const placeholder = inferPlaceholder(messageType);
|
||||
const fileName = typeof parsed.file_name === "string" ? parsed.file_name.trim() : "";
|
||||
return fileName ? `${placeholder} (${fileName})` : placeholder;
|
||||
}
|
||||
if (messageType === "share_chat") {
|
||||
if (parsed && typeof parsed === "object") {
|
||||
const share = parsed as { body?: unknown; summary?: unknown; share_chat_id?: unknown };
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
import { describe, expect, it } from "vitest";
|
||||
import type { ClawdbotConfig } from "../runtime-api.js";
|
||||
import { parseMessageContent } from "./bot-content.js";
|
||||
import {
|
||||
buildBroadcastSessionKey,
|
||||
buildFeishuAgentBody,
|
||||
@@ -47,6 +48,29 @@ describe("toMessageResourceType", () => {
|
||||
});
|
||||
});
|
||||
|
||||
// Media message types must be rendered as placeholders (or a provider-supplied
// transcript) and never as the raw JSON resource payload.
describe("parseMessageContent media placeholders", () => {
  it("uses an audio placeholder instead of leaking raw file_key JSON", () => {
    const rawContent = JSON.stringify({ file_key: "file_audio", duration: 1200 });

    const rendered = parseMessageContent(rawContent, "audio");

    expect(rendered).toBe("<media:audio>");
  });

  it("prefers Feishu-provided audio transcript text when present", () => {
    // The surrounding whitespace checks that the transcript is trimmed.
    const rawContent = JSON.stringify({
      file_key: "file_audio",
      speech_to_text: " spoken words ",
    });

    const rendered = parseMessageContent(rawContent, "audio");

    expect(rendered).toBe("spoken words");
  });

  it("keeps media filenames as placeholder context without raw payload fields", () => {
    const rawContent = JSON.stringify({ file_key: "file_doc", file_name: "q1.pdf" });

    const rendered = parseMessageContent(rawContent, "file");

    expect(rendered).toBe("<media:document> (q1.pdf)");
  });
});
|
||||
|
||||
describe("resolveBroadcastAgents", () => {
|
||||
it("returns agent list when broadcast config has the peerId", () => {
|
||||
const cfg: ClawdbotConfig = { broadcast: { oc_group123: ["susan", "main"] } };
|
||||
|
||||
@@ -231,6 +231,7 @@ const {
|
||||
mockResolveBoundConversation,
|
||||
mockTouchBinding,
|
||||
mockResolveFeishuReasoningPreviewEnabled,
|
||||
mockTranscribeFirstAudio,
|
||||
} = vi.hoisted(() => ({
|
||||
mockCreateFeishuReplyDispatcher: vi.fn(() => ({
|
||||
dispatcher: createReplyDispatcher(),
|
||||
@@ -265,6 +266,7 @@ const {
|
||||
mockResolveBoundConversation: vi.fn((_ref?: unknown) => null as BoundConversation),
|
||||
mockTouchBinding: vi.fn(),
|
||||
mockResolveFeishuReasoningPreviewEnabled: vi.fn(() => false),
|
||||
mockTranscribeFirstAudio: vi.fn(),
|
||||
}));
|
||||
|
||||
vi.mock("./reply-dispatcher.js", () => ({
|
||||
@@ -285,6 +287,10 @@ vi.mock("./media.js", () => ({
|
||||
downloadMessageResourceFeishu: mockDownloadMessageResourceFeishu,
|
||||
}));
|
||||
|
||||
// Swap the audio preflight runtime module for the hoisted mock so each test
// decides what transcript (if any) is produced.
vi.mock("./audio-preflight.runtime.js", () => {
  return { transcribeFirstAudio: mockTranscribeFirstAudio };
});
|
||||
|
||||
vi.mock("./client.js", () => ({
|
||||
createFeishuClient: mockCreateFeishuClient,
|
||||
}));
|
||||
@@ -357,6 +363,7 @@ describe("handleFeishuMessage ACP routing", () => {
|
||||
mockResolveBoundConversation.mockReset().mockReturnValue(null);
|
||||
mockTouchBinding.mockReset();
|
||||
mockResolveFeishuReasoningPreviewEnabled.mockReset().mockReturnValue(false);
|
||||
mockTranscribeFirstAudio.mockReset().mockResolvedValue(undefined);
|
||||
mockResolveAgentRoute.mockReset().mockReturnValue({
|
||||
...buildDefaultResolveRoute(),
|
||||
sessionKey: "agent:main:feishu:direct:ou_sender_1",
|
||||
@@ -555,6 +562,7 @@ describe("handleFeishuMessage command authorization", () => {
|
||||
mockEnsureConfiguredBindingRouteReady.mockReset().mockResolvedValue({ ok: true });
|
||||
mockResolveBoundConversation.mockReset().mockReturnValue(null);
|
||||
mockTouchBinding.mockReset();
|
||||
mockTranscribeFirstAudio.mockReset().mockResolvedValue(undefined);
|
||||
mockResolveAgentRoute.mockReturnValue(buildDefaultResolveRoute());
|
||||
mockCreateFeishuClient.mockReturnValue({
|
||||
contact: {
|
||||
@@ -1438,6 +1446,78 @@ describe("handleFeishuMessage command authorization", () => {
|
||||
expect(mockDispatchReplyFromConfig).not.toHaveBeenCalled();
|
||||
});
|
||||
|
||||
it("transcribes inbound audio before building the agent turn", async () => {
  // Literals shared between the stubs and the expectations below.
  const savedVoicePath = "/tmp/inbound-voice.ogg";
  const voiceContentType = "audio/ogg";
  const transcriptText = "voice transcript";
  const audioFileKey = "file_audio_payload";
  const inboundMessageId = "msg-audio-inbound";

  // Arrange: download -> save -> transcribe all succeed for one voice note.
  mockShouldComputeCommandAuthorized.mockReturnValue(false);
  mockDownloadMessageResourceFeishu.mockResolvedValueOnce({
    buffer: Buffer.from("voice"),
    contentType: voiceContentType,
    fileName: "voice.ogg",
  });
  mockSaveMediaBuffer.mockResolvedValueOnce({
    id: "inbound-voice.ogg",
    path: savedVoicePath,
    size: Buffer.byteLength("voice"),
    contentType: voiceContentType,
  });
  mockTranscribeFirstAudio.mockResolvedValueOnce(transcriptText);

  const cfg: ClawdbotConfig = {
    channels: {
      feishu: {
        dmPolicy: "open",
      },
    },
  } as ClawdbotConfig;

  const event: FeishuMessageEvent = {
    sender: {
      sender_id: {
        open_id: "ou-voice",
      },
    },
    message: {
      message_id: inboundMessageId,
      chat_id: "oc-dm",
      chat_type: "p2p",
      message_type: "audio",
      content: JSON.stringify({
        file_key: audioFileKey,
        duration: 1200,
      }),
    },
  };

  // Act
  await dispatchMessage({ cfg, event });

  // Assert: the audio resource is fetched by its file_key.
  expect(mockDownloadMessageResourceFeishu).toHaveBeenCalledWith(
    expect.objectContaining({
      messageId: inboundMessageId,
      fileKey: audioFileKey,
      type: "file",
    }),
  );

  // Assert: the saved file is handed to the transcription preflight.
  expect(mockTranscribeFirstAudio).toHaveBeenCalledWith({
    ctx: {
      MediaPaths: [savedVoicePath],
      MediaTypes: [voiceContentType],
      ChatType: "direct",
    },
    cfg,
  });

  // Assert: the transcript replaces the placeholder in every agent-facing body field.
  expect(mockFinalizeInboundContext).toHaveBeenCalledWith(
    expect.objectContaining({
      BodyForAgent: "[message_id: msg-audio-inbound]\nou-voice: voice transcript",
      RawBody: transcriptText,
      CommandBody: transcriptText,
      Transcript: transcriptText,
      MediaPaths: [savedVoicePath],
      MediaTypes: [voiceContentType],
    }),
  );

  // Assert: the raw Feishu resource key never reaches the agent body.
  const finalized = mockFinalizeInboundContext.mock.calls[0]?.[0];
  expect(finalized.BodyForAgent).not.toContain(audioFileKey);
});
|
||||
|
||||
it("uses video file_key (not thumbnail image_key) for inbound video download", async () => {
|
||||
mockShouldComputeCommandAuthorized.mockReturnValue(false);
|
||||
|
||||
|
||||
@@ -57,6 +57,7 @@ import type { FeishuMessageEvent } from "./event-types.js";
|
||||
import {
|
||||
isFeishuGroupChatType,
|
||||
type FeishuMessageContext,
|
||||
type FeishuMediaInfo,
|
||||
type FeishuMessageInfo,
|
||||
} from "./types.js";
|
||||
import type { DynamicAgentCreationConfig } from "./types.js";
|
||||
@@ -68,6 +69,37 @@ export { toMessageResourceType } from "./bot-content.js";
|
||||
const permissionErrorNotifiedAt = new Map<string, number>();
|
||||
const PERMISSION_ERROR_COOLDOWN_MS = 5 * 60 * 1000; // 5 minutes
|
||||
|
||||
/**
 * Tries to obtain a transcript for an inbound Feishu voice note before the
 * agent turn is built.
 *
 * Transcription is attempted only when the parsed message content is exactly
 * the `<media:audio>` placeholder (ignoring surrounding whitespace) and at
 * least one entry in `mediaList` has an `audio/*` content type. Any other
 * content, such as text or a transcript already present in the message, is
 * left alone.
 *
 * This is best-effort: a failure while loading or running the transcription
 * runtime is reported through `params.log` and yields `undefined`, so the
 * caller can keep the placeholder content.
 *
 * @param params.cfg - Configuration forwarded unchanged to `transcribeFirstAudio`.
 * @param params.mediaList - Media attached to the message; only `audio/*` entries are forwarded.
 * @param params.content - Parsed message content used to decide whether to transcribe.
 * @param params.chatType - Forwarded to the transcription context as `ChatType`.
 * @param params.log - Receives a one-line message when transcription throws.
 * @returns The transcript, or `undefined` when transcription was skipped,
 *   failed, or produced no text.
 */
async function resolveFeishuAudioPreflightTranscript(params: {
  cfg: ClawdbotConfig;
  mediaList: FeishuMediaInfo[];
  content: string;
  chatType: "direct" | "group";
  log: (msg: string) => void;
}): Promise<string | undefined> {
  if (params.content.trim() !== "<media:audio>") {
    return undefined;
  }

  // The type guard narrows `contentType` to `string`, so the path and type
  // arrays built below are index-aligned without a type assertion.
  const audioMedia = params.mediaList.filter(
    (media): media is FeishuMediaInfo & { contentType: string } =>
      typeof media.contentType === "string" && media.contentType.startsWith("audio/"),
  );
  if (audioMedia.length === 0) {
    return undefined;
  }

  try {
    // Loaded on demand so the transcription runtime is only imported when a voice note arrives.
    const { transcribeFirstAudio } = await import("./audio-preflight.runtime.js");
    const transcript = await transcribeFirstAudio({
      ctx: {
        MediaPaths: audioMedia.map((media) => media.path),
        MediaTypes: audioMedia.map((media) => media.contentType),
        ChatType: params.chatType,
      },
      cfg: params.cfg,
    });
    // A blank transcript must not override the placeholder: callers fall back
    // with `??`, which treats "" as a real value and would send an empty body.
    return transcript?.trim() ? transcript : undefined;
  } catch (err) {
    params.log(`feishu: audio preflight transcription failed: ${String(err)}`);
    return undefined;
  }
}
|
||||
|
||||
// --- Broadcast support ---
|
||||
// Resolve broadcast agent list for a given peer (group) ID.
|
||||
// Returns null if no broadcast config exists or the peer is not in the broadcast list.
|
||||
@@ -567,14 +599,6 @@ export async function handleFeishuMessage(params: {
|
||||
senderIds: [senderUserId],
|
||||
senderName: ctx.senderName,
|
||||
}).allowed;
|
||||
const commandAuthorized = shouldComputeCommandAuthorized
|
||||
? core.channel.commands.resolveCommandAuthorizedFromAuthorizers({
|
||||
useAccessGroups,
|
||||
authorizers: [
|
||||
{ configured: commandAllowFrom.length > 0, allowed: senderAllowedForCommands },
|
||||
],
|
||||
})
|
||||
: undefined;
|
||||
|
||||
// In group chats, the session is scoped to the group, but the *speaker* is the sender.
|
||||
// Using a group-scoped From causes the agent to treat different users as the same person.
|
||||
@@ -728,6 +752,39 @@ export async function handleFeishuMessage(params: {
|
||||
accountId: account.accountId,
|
||||
});
|
||||
const mediaPayload = buildAgentMediaPayload(mediaList);
|
||||
const audioTranscript = await resolveFeishuAudioPreflightTranscript({
|
||||
cfg: effectiveCfg,
|
||||
mediaList,
|
||||
content: ctx.content,
|
||||
chatType: isGroup ? "group" : "direct",
|
||||
log,
|
||||
});
|
||||
const agentFacingContent = audioTranscript ?? ctx.content;
|
||||
const agentFacingCtx =
|
||||
audioTranscript === undefined
|
||||
? ctx
|
||||
: {
|
||||
...ctx,
|
||||
content: audioTranscript,
|
||||
};
|
||||
const effectiveCommandProbeBody =
|
||||
audioTranscript === undefined
|
||||
? commandProbeBody
|
||||
: isGroup
|
||||
? normalizeFeishuCommandProbeBody(audioTranscript)
|
||||
: audioTranscript;
|
||||
const shouldComputeEffectiveCommandAuthorized =
|
||||
audioTranscript === undefined
|
||||
? shouldComputeCommandAuthorized
|
||||
: core.channel.commands.shouldComputeCommandAuthorized(effectiveCommandProbeBody, cfg);
|
||||
const commandAuthorized = shouldComputeEffectiveCommandAuthorized
|
||||
? core.channel.commands.resolveCommandAuthorizedFromAuthorizers({
|
||||
useAccessGroups,
|
||||
authorizers: [
|
||||
{ configured: commandAllowFrom.length > 0, allowed: senderAllowedForCommands },
|
||||
],
|
||||
})
|
||||
: undefined;
|
||||
|
||||
// Fetch quoted/replied message content if parentId exists
|
||||
let quotedMessageInfo: Awaited<ReturnType<typeof getMessageFeishu>> = null;
|
||||
@@ -771,7 +828,7 @@ export async function handleFeishuMessage(params: {
|
||||
|
||||
const envelopeOptions = core.channel.reply.resolveEnvelopeFormatOptions(cfg);
|
||||
const messageBody = buildFeishuAgentBody({
|
||||
ctx,
|
||||
ctx: agentFacingCtx,
|
||||
quotedContent,
|
||||
permissionErrorForAgent,
|
||||
botOpenId,
|
||||
@@ -993,8 +1050,9 @@ export async function handleFeishuMessage(params: {
|
||||
InboundHistory: inboundHistory,
|
||||
ReplyToId: ctx.parentId,
|
||||
RootMessageId: ctx.rootId,
|
||||
RawBody: ctx.content,
|
||||
CommandBody: ctx.content,
|
||||
RawBody: agentFacingContent,
|
||||
CommandBody: agentFacingContent,
|
||||
Transcript: audioTranscript,
|
||||
From: feishuFrom,
|
||||
To: feishuTo,
|
||||
SessionKey: agentSessionKey,
|
||||
|
||||
Reference in New Issue
Block a user