mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-06 19:31:00 +00:00
fix(feishu): transcribe inbound voice notes
This commit is contained in:
@@ -80,6 +80,9 @@ Docs: https://docs.openclaw.ai
|
|||||||
- TTS/Feishu: normalize final-mode streamed TTS-only audio before delivery so
|
- TTS/Feishu: normalize final-mode streamed TTS-only audio before delivery so
|
||||||
generated voice-note files use the same safe media path and native voice
|
generated voice-note files use the same safe media path and native voice
|
||||||
routing as normal final replies. Fixes #71920.
|
routing as normal final replies. Fixes #71920.
|
||||||
|
- Feishu: transcribe inbound voice-note audio with the shared media audio path
|
||||||
|
before agent dispatch and keep raw Feishu `file_key` payloads out of message
|
||||||
|
text. Fixes #67120 and #61876.
|
||||||
- ACP: send subagent and async-task completion wakes to external ACP harnesses as
|
- ACP: send subagent and async-task completion wakes to external ACP harnesses as
|
||||||
plain prompts instead of OpenClaw internal runtime-context envelopes, while
|
plain prompts instead of OpenClaw internal runtime-context envelopes, while
|
||||||
keeping those envelopes out of ACP transcripts.
|
keeping those envelopes out of ACP transcripts.
|
||||||
|
|||||||
@@ -414,6 +414,15 @@ Full configuration: [Gateway configuration](/gateway/configuration)
|
|||||||
- ✅ Video/media
|
- ✅ Video/media
|
||||||
- ✅ Stickers
|
- ✅ Stickers
|
||||||
|
|
||||||
|
Inbound Feishu/Lark audio messages are normalized as media placeholders instead
|
||||||
|
of raw `file_key` JSON. When `tools.media.audio` is configured, OpenClaw
|
||||||
|
downloads the voice-note resource and runs shared audio transcription before the
|
||||||
|
agent turn, so the agent receives the spoken transcript. If Feishu includes
|
||||||
|
transcript text directly in the audio payload, that text is used without another
|
||||||
|
ASR call. Without an audio transcription provider, the agent still receives a
|
||||||
|
`<media:audio>` placeholder plus the saved attachment, not the raw Feishu
|
||||||
|
resource payload.
|
||||||
|
|
||||||
### Send
|
### Send
|
||||||
|
|
||||||
- ✅ Text
|
- ✅ Text
|
||||||
|
|||||||
9
extensions/feishu/src/audio-preflight.runtime.ts
Normal file
9
extensions/feishu/src/audio-preflight.runtime.ts
Normal file
@@ -0,0 +1,9 @@
|
|||||||
|
import { transcribeFirstAudio as transcribeFirstAudioImpl } from "openclaw/plugin-sdk/media-runtime";
|
||||||
|
|
||||||
|
type TranscribeFirstAudio = typeof import("openclaw/plugin-sdk/media-runtime").transcribeFirstAudio;
|
||||||
|
|
||||||
|
/**
 * Local re-export of the plugin SDK's `transcribeFirstAudio`.
 *
 * Forwards every argument unchanged to the SDK implementation and resolves
 * with whatever that implementation resolves with. Callers in this extension
 * load this module via dynamic `import()`, and tests mock this module path.
 */
export async function transcribeFirstAudio(
  ...args: Parameters<TranscribeFirstAudio>
): ReturnType<TranscribeFirstAudio> {
  const transcript = await transcribeFirstAudioImpl(...args);
  return transcript;
}
|
||||||
@@ -139,6 +139,18 @@ export function parseMessageContent(content: string, messageType: string): strin
|
|||||||
if (messageType === "text") {
|
if (messageType === "text") {
|
||||||
return parsed.text || "";
|
return parsed.text || "";
|
||||||
}
|
}
|
||||||
|
if (["image", "file", "audio", "video", "media", "sticker"].includes(messageType)) {
|
||||||
|
if (messageType === "audio") {
|
||||||
|
const speechToText =
|
||||||
|
typeof parsed.speech_to_text === "string" ? parsed.speech_to_text.trim() : "";
|
||||||
|
if (speechToText) {
|
||||||
|
return speechToText;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
const placeholder = inferPlaceholder(messageType);
|
||||||
|
const fileName = typeof parsed.file_name === "string" ? parsed.file_name.trim() : "";
|
||||||
|
return fileName ? `${placeholder} (${fileName})` : placeholder;
|
||||||
|
}
|
||||||
if (messageType === "share_chat") {
|
if (messageType === "share_chat") {
|
||||||
if (parsed && typeof parsed === "object") {
|
if (parsed && typeof parsed === "object") {
|
||||||
const share = parsed as { body?: unknown; summary?: unknown; share_chat_id?: unknown };
|
const share = parsed as { body?: unknown; summary?: unknown; share_chat_id?: unknown };
|
||||||
|
|||||||
@@ -1,5 +1,6 @@
|
|||||||
import { describe, expect, it } from "vitest";
|
import { describe, expect, it } from "vitest";
|
||||||
import type { ClawdbotConfig } from "../runtime-api.js";
|
import type { ClawdbotConfig } from "../runtime-api.js";
|
||||||
|
import { parseMessageContent } from "./bot-content.js";
|
||||||
import {
|
import {
|
||||||
buildBroadcastSessionKey,
|
buildBroadcastSessionKey,
|
||||||
buildFeishuAgentBody,
|
buildFeishuAgentBody,
|
||||||
@@ -47,6 +48,29 @@ describe("toMessageResourceType", () => {
|
|||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// Covers how parseMessageContent renders media-type payloads: placeholders
// (optionally with a file name) or a provided transcript, never the raw JSON.
describe("parseMessageContent media placeholders", () => {
  it("uses an audio placeholder instead of leaking raw file_key JSON", () => {
    const rawAudio = JSON.stringify({ file_key: "file_audio", duration: 1200 });

    const rendered = parseMessageContent(rawAudio, "audio");

    expect(rendered).toBe("<media:audio>");
  });

  it("prefers Feishu-provided audio transcript text when present", () => {
    // The transcript is padded on purpose: the parser is expected to trim it.
    const rawAudio = JSON.stringify({
      file_key: "file_audio",
      speech_to_text: " spoken words ",
    });

    const rendered = parseMessageContent(rawAudio, "audio");

    expect(rendered).toBe("spoken words");
  });

  it("keeps media filenames as placeholder context without raw payload fields", () => {
    const rawFile = JSON.stringify({ file_key: "file_doc", file_name: "q1.pdf" });

    const rendered = parseMessageContent(rawFile, "file");

    expect(rendered).toBe("<media:document> (q1.pdf)");
  });
});
|
||||||
|
|
||||||
describe("resolveBroadcastAgents", () => {
|
describe("resolveBroadcastAgents", () => {
|
||||||
it("returns agent list when broadcast config has the peerId", () => {
|
it("returns agent list when broadcast config has the peerId", () => {
|
||||||
const cfg: ClawdbotConfig = { broadcast: { oc_group123: ["susan", "main"] } };
|
const cfg: ClawdbotConfig = { broadcast: { oc_group123: ["susan", "main"] } };
|
||||||
|
|||||||
@@ -231,6 +231,7 @@ const {
|
|||||||
mockResolveBoundConversation,
|
mockResolveBoundConversation,
|
||||||
mockTouchBinding,
|
mockTouchBinding,
|
||||||
mockResolveFeishuReasoningPreviewEnabled,
|
mockResolveFeishuReasoningPreviewEnabled,
|
||||||
|
mockTranscribeFirstAudio,
|
||||||
} = vi.hoisted(() => ({
|
} = vi.hoisted(() => ({
|
||||||
mockCreateFeishuReplyDispatcher: vi.fn(() => ({
|
mockCreateFeishuReplyDispatcher: vi.fn(() => ({
|
||||||
dispatcher: createReplyDispatcher(),
|
dispatcher: createReplyDispatcher(),
|
||||||
@@ -265,6 +266,7 @@ const {
|
|||||||
mockResolveBoundConversation: vi.fn((_ref?: unknown) => null as BoundConversation),
|
mockResolveBoundConversation: vi.fn((_ref?: unknown) => null as BoundConversation),
|
||||||
mockTouchBinding: vi.fn(),
|
mockTouchBinding: vi.fn(),
|
||||||
mockResolveFeishuReasoningPreviewEnabled: vi.fn(() => false),
|
mockResolveFeishuReasoningPreviewEnabled: vi.fn(() => false),
|
||||||
|
mockTranscribeFirstAudio: vi.fn(),
|
||||||
}));
|
}));
|
||||||
|
|
||||||
vi.mock("./reply-dispatcher.js", () => ({
|
vi.mock("./reply-dispatcher.js", () => ({
|
||||||
@@ -285,6 +287,10 @@ vi.mock("./media.js", () => ({
|
|||||||
downloadMessageResourceFeishu: mockDownloadMessageResourceFeishu,
|
downloadMessageResourceFeishu: mockDownloadMessageResourceFeishu,
|
||||||
}));
|
}));
|
||||||
|
|
||||||
|
vi.mock("./audio-preflight.runtime.js", () => ({
|
||||||
|
transcribeFirstAudio: mockTranscribeFirstAudio,
|
||||||
|
}));
|
||||||
|
|
||||||
vi.mock("./client.js", () => ({
|
vi.mock("./client.js", () => ({
|
||||||
createFeishuClient: mockCreateFeishuClient,
|
createFeishuClient: mockCreateFeishuClient,
|
||||||
}));
|
}));
|
||||||
@@ -357,6 +363,7 @@ describe("handleFeishuMessage ACP routing", () => {
|
|||||||
mockResolveBoundConversation.mockReset().mockReturnValue(null);
|
mockResolveBoundConversation.mockReset().mockReturnValue(null);
|
||||||
mockTouchBinding.mockReset();
|
mockTouchBinding.mockReset();
|
||||||
mockResolveFeishuReasoningPreviewEnabled.mockReset().mockReturnValue(false);
|
mockResolveFeishuReasoningPreviewEnabled.mockReset().mockReturnValue(false);
|
||||||
|
mockTranscribeFirstAudio.mockReset().mockResolvedValue(undefined);
|
||||||
mockResolveAgentRoute.mockReset().mockReturnValue({
|
mockResolveAgentRoute.mockReset().mockReturnValue({
|
||||||
...buildDefaultResolveRoute(),
|
...buildDefaultResolveRoute(),
|
||||||
sessionKey: "agent:main:feishu:direct:ou_sender_1",
|
sessionKey: "agent:main:feishu:direct:ou_sender_1",
|
||||||
@@ -555,6 +562,7 @@ describe("handleFeishuMessage command authorization", () => {
|
|||||||
mockEnsureConfiguredBindingRouteReady.mockReset().mockResolvedValue({ ok: true });
|
mockEnsureConfiguredBindingRouteReady.mockReset().mockResolvedValue({ ok: true });
|
||||||
mockResolveBoundConversation.mockReset().mockReturnValue(null);
|
mockResolveBoundConversation.mockReset().mockReturnValue(null);
|
||||||
mockTouchBinding.mockReset();
|
mockTouchBinding.mockReset();
|
||||||
|
mockTranscribeFirstAudio.mockReset().mockResolvedValue(undefined);
|
||||||
mockResolveAgentRoute.mockReturnValue(buildDefaultResolveRoute());
|
mockResolveAgentRoute.mockReturnValue(buildDefaultResolveRoute());
|
||||||
mockCreateFeishuClient.mockReturnValue({
|
mockCreateFeishuClient.mockReturnValue({
|
||||||
contact: {
|
contact: {
|
||||||
@@ -1438,6 +1446,78 @@ describe("handleFeishuMessage command authorization", () => {
|
|||||||
expect(mockDispatchReplyFromConfig).not.toHaveBeenCalled();
|
expect(mockDispatchReplyFromConfig).not.toHaveBeenCalled();
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// End-to-end check for inbound voice notes: the audio resource is downloaded,
// handed to the (mocked) transcription runtime, and the transcript replaces
// the raw Feishu payload in the finalized inbound context.
it("transcribes inbound audio before building the agent turn", async () => {
  const savedPath = "/tmp/inbound-voice.ogg";
  const mimeType = "audio/ogg";
  const voiceBytes = "voice";

  // Arrange: download -> save -> transcribe pipeline, all mocked.
  mockShouldComputeCommandAuthorized.mockReturnValue(false);
  mockDownloadMessageResourceFeishu.mockResolvedValueOnce({
    buffer: Buffer.from(voiceBytes),
    contentType: mimeType,
    fileName: "voice.ogg",
  });
  mockSaveMediaBuffer.mockResolvedValueOnce({
    id: "inbound-voice.ogg",
    path: savedPath,
    size: Buffer.byteLength(voiceBytes),
    contentType: mimeType,
  });
  mockTranscribeFirstAudio.mockResolvedValueOnce("voice transcript");

  const cfg: ClawdbotConfig = {
    channels: { feishu: { dmPolicy: "open" } },
  } as ClawdbotConfig;

  const audioPayload = JSON.stringify({
    file_key: "file_audio_payload",
    duration: 1200,
  });
  const event: FeishuMessageEvent = {
    sender: { sender_id: { open_id: "ou-voice" } },
    message: {
      message_id: "msg-audio-inbound",
      chat_id: "oc-dm",
      chat_type: "p2p",
      message_type: "audio",
      content: audioPayload,
    },
  };

  // Act
  await dispatchMessage({ cfg, event });

  // Assert: the audio resource was fetched using the payload's file_key.
  const expectedDownload = expect.objectContaining({
    messageId: "msg-audio-inbound",
    fileKey: "file_audio_payload",
    type: "file",
  });
  expect(mockDownloadMessageResourceFeishu).toHaveBeenCalledWith(expectedDownload);

  // Assert: the transcription runtime received exactly the saved media and chat type.
  expect(mockTranscribeFirstAudio).toHaveBeenCalledWith({
    ctx: {
      MediaPaths: [savedPath],
      MediaTypes: [mimeType],
      ChatType: "direct",
    },
    cfg,
  });

  // Assert: every agent-facing body field carries the transcript.
  const expectedInbound = expect.objectContaining({
    BodyForAgent: "[message_id: msg-audio-inbound]\nou-voice: voice transcript",
    RawBody: "voice transcript",
    CommandBody: "voice transcript",
    Transcript: "voice transcript",
    MediaPaths: [savedPath],
    MediaTypes: [mimeType],
  });
  expect(mockFinalizeInboundContext).toHaveBeenCalledWith(expectedInbound);

  // Assert: the raw Feishu file key never reaches the agent body.
  const finalized = mockFinalizeInboundContext.mock.calls[0]?.[0];
  expect(finalized.BodyForAgent).not.toContain("file_audio_payload");
});
|
||||||
|
|
||||||
it("uses video file_key (not thumbnail image_key) for inbound video download", async () => {
|
it("uses video file_key (not thumbnail image_key) for inbound video download", async () => {
|
||||||
mockShouldComputeCommandAuthorized.mockReturnValue(false);
|
mockShouldComputeCommandAuthorized.mockReturnValue(false);
|
||||||
|
|
||||||
|
|||||||
@@ -57,6 +57,7 @@ import type { FeishuMessageEvent } from "./event-types.js";
|
|||||||
import {
|
import {
|
||||||
isFeishuGroupChatType,
|
isFeishuGroupChatType,
|
||||||
type FeishuMessageContext,
|
type FeishuMessageContext,
|
||||||
|
type FeishuMediaInfo,
|
||||||
type FeishuMessageInfo,
|
type FeishuMessageInfo,
|
||||||
} from "./types.js";
|
} from "./types.js";
|
||||||
import type { DynamicAgentCreationConfig } from "./types.js";
|
import type { DynamicAgentCreationConfig } from "./types.js";
|
||||||
@@ -68,6 +69,37 @@ export { toMessageResourceType } from "./bot-content.js";
|
|||||||
const permissionErrorNotifiedAt = new Map<string, number>();
|
const permissionErrorNotifiedAt = new Map<string, number>();
|
||||||
const PERMISSION_ERROR_COOLDOWN_MS = 5 * 60 * 1000; // 5 minutes
|
const PERMISSION_ERROR_COOLDOWN_MS = 5 * 60 * 1000; // 5 minutes
|
||||||
|
|
||||||
|
/**
 * Runs audio transcription for an inbound Feishu message before the agent
 * turn is assembled.
 *
 * Transcription is attempted only when both conditions hold:
 *  - `content`, ignoring surrounding whitespace, is exactly the
 *    `<media:audio>` placeholder, and
 *  - `mediaList` has at least one entry whose `contentType` starts with
 *    `audio/`.
 *
 * The transcription runtime is loaded lazily through a dynamic import. Any
 * error thrown by the import or by the transcription call is reported through
 * `log` and swallowed, so a transcription failure never rejects this promise.
 *
 * @param params.cfg Configuration forwarded to the transcription runtime.
 * @param params.mediaList Media entries attached to the inbound message.
 * @param params.content Parsed message text for the inbound message.
 * @param params.chatType Forwarded to the runtime as `ChatType`.
 * @param params.log Sink for the failure message.
 * @returns The value resolved by `transcribeFirstAudio`, or `undefined` when
 *   transcription is skipped or fails.
 */
async function resolveFeishuAudioPreflightTranscript(params: {
  cfg: ClawdbotConfig;
  mediaList: FeishuMediaInfo[];
  content: string;
  chatType: "direct" | "group";
  log: (msg: string) => void;
}): Promise<string | undefined> {
  const isBareAudioPlaceholder = params.content.trim() === "<media:audio>";
  if (!isBareAudioPlaceholder) {
    return undefined;
  }

  // Narrow to entries with an audio content type so paths and types are
  // derived from the same list and stay index-aligned.
  const voiceNotes = params.mediaList.filter(
    (entry): entry is FeishuMediaInfo & { contentType: string } =>
      entry.contentType?.startsWith("audio/") === true,
  );
  if (voiceNotes.length === 0) {
    return undefined;
  }

  try {
    const runtime = await import("./audio-preflight.runtime.js");
    const transcript = await runtime.transcribeFirstAudio({
      ctx: {
        MediaPaths: voiceNotes.map((entry) => entry.path),
        MediaTypes: voiceNotes.map((entry) => entry.contentType),
        ChatType: params.chatType,
      },
      cfg: params.cfg,
    });
    return transcript;
  } catch (err) {
    // Best effort: without a transcript the caller keeps the placeholder content.
    params.log(`feishu: audio preflight transcription failed: ${String(err)}`);
    return undefined;
  }
}
|
||||||
|
|
||||||
// --- Broadcast support ---
|
// --- Broadcast support ---
|
||||||
// Resolve broadcast agent list for a given peer (group) ID.
|
// Resolve broadcast agent list for a given peer (group) ID.
|
||||||
// Returns null if no broadcast config exists or the peer is not in the broadcast list.
|
// Returns null if no broadcast config exists or the peer is not in the broadcast list.
|
||||||
@@ -567,14 +599,6 @@ export async function handleFeishuMessage(params: {
|
|||||||
senderIds: [senderUserId],
|
senderIds: [senderUserId],
|
||||||
senderName: ctx.senderName,
|
senderName: ctx.senderName,
|
||||||
}).allowed;
|
}).allowed;
|
||||||
const commandAuthorized = shouldComputeCommandAuthorized
|
|
||||||
? core.channel.commands.resolveCommandAuthorizedFromAuthorizers({
|
|
||||||
useAccessGroups,
|
|
||||||
authorizers: [
|
|
||||||
{ configured: commandAllowFrom.length > 0, allowed: senderAllowedForCommands },
|
|
||||||
],
|
|
||||||
})
|
|
||||||
: undefined;
|
|
||||||
|
|
||||||
// In group chats, the session is scoped to the group, but the *speaker* is the sender.
|
// In group chats, the session is scoped to the group, but the *speaker* is the sender.
|
||||||
// Using a group-scoped From causes the agent to treat different users as the same person.
|
// Using a group-scoped From causes the agent to treat different users as the same person.
|
||||||
@@ -728,6 +752,39 @@ export async function handleFeishuMessage(params: {
|
|||||||
accountId: account.accountId,
|
accountId: account.accountId,
|
||||||
});
|
});
|
||||||
const mediaPayload = buildAgentMediaPayload(mediaList);
|
const mediaPayload = buildAgentMediaPayload(mediaList);
|
||||||
|
const audioTranscript = await resolveFeishuAudioPreflightTranscript({
|
||||||
|
cfg: effectiveCfg,
|
||||||
|
mediaList,
|
||||||
|
content: ctx.content,
|
||||||
|
chatType: isGroup ? "group" : "direct",
|
||||||
|
log,
|
||||||
|
});
|
||||||
|
const agentFacingContent = audioTranscript ?? ctx.content;
|
||||||
|
const agentFacingCtx =
|
||||||
|
audioTranscript === undefined
|
||||||
|
? ctx
|
||||||
|
: {
|
||||||
|
...ctx,
|
||||||
|
content: audioTranscript,
|
||||||
|
};
|
||||||
|
const effectiveCommandProbeBody =
|
||||||
|
audioTranscript === undefined
|
||||||
|
? commandProbeBody
|
||||||
|
: isGroup
|
||||||
|
? normalizeFeishuCommandProbeBody(audioTranscript)
|
||||||
|
: audioTranscript;
|
||||||
|
const shouldComputeEffectiveCommandAuthorized =
|
||||||
|
audioTranscript === undefined
|
||||||
|
? shouldComputeCommandAuthorized
|
||||||
|
: core.channel.commands.shouldComputeCommandAuthorized(effectiveCommandProbeBody, cfg);
|
||||||
|
const commandAuthorized = shouldComputeEffectiveCommandAuthorized
|
||||||
|
? core.channel.commands.resolveCommandAuthorizedFromAuthorizers({
|
||||||
|
useAccessGroups,
|
||||||
|
authorizers: [
|
||||||
|
{ configured: commandAllowFrom.length > 0, allowed: senderAllowedForCommands },
|
||||||
|
],
|
||||||
|
})
|
||||||
|
: undefined;
|
||||||
|
|
||||||
// Fetch quoted/replied message content if parentId exists
|
// Fetch quoted/replied message content if parentId exists
|
||||||
let quotedMessageInfo: Awaited<ReturnType<typeof getMessageFeishu>> = null;
|
let quotedMessageInfo: Awaited<ReturnType<typeof getMessageFeishu>> = null;
|
||||||
@@ -771,7 +828,7 @@ export async function handleFeishuMessage(params: {
|
|||||||
|
|
||||||
const envelopeOptions = core.channel.reply.resolveEnvelopeFormatOptions(cfg);
|
const envelopeOptions = core.channel.reply.resolveEnvelopeFormatOptions(cfg);
|
||||||
const messageBody = buildFeishuAgentBody({
|
const messageBody = buildFeishuAgentBody({
|
||||||
ctx,
|
ctx: agentFacingCtx,
|
||||||
quotedContent,
|
quotedContent,
|
||||||
permissionErrorForAgent,
|
permissionErrorForAgent,
|
||||||
botOpenId,
|
botOpenId,
|
||||||
@@ -993,8 +1050,9 @@ export async function handleFeishuMessage(params: {
|
|||||||
InboundHistory: inboundHistory,
|
InboundHistory: inboundHistory,
|
||||||
ReplyToId: ctx.parentId,
|
ReplyToId: ctx.parentId,
|
||||||
RootMessageId: ctx.rootId,
|
RootMessageId: ctx.rootId,
|
||||||
RawBody: ctx.content,
|
RawBody: agentFacingContent,
|
||||||
CommandBody: ctx.content,
|
CommandBody: agentFacingContent,
|
||||||
|
Transcript: audioTranscript,
|
||||||
From: feishuFrom,
|
From: feishuFrom,
|
||||||
To: feishuTo,
|
To: feishuTo,
|
||||||
SessionKey: agentSessionKey,
|
SessionKey: agentSessionKey,
|
||||||
|
|||||||
Reference in New Issue
Block a user