From b0c55eb65948b633f61fe04e04c91d7da02cf4f8 Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Sat, 25 Apr 2026 09:26:08 +0100 Subject: [PATCH] fix(feishu): transcode voice TTS audio --- CHANGELOG.md | 1 + docs/channels/feishu.md | 8 ++ docs/tools/tts.md | 8 +- extensions/feishu/src/channel.test.ts | 28 ++++ extensions/feishu/src/channel.ts | 12 ++ extensions/feishu/src/media.test.ts | 111 ++++++++++++++++ extensions/feishu/src/media.ts | 120 +++++++++++++++++- extensions/feishu/src/outbound.test.ts | 18 +++ extensions/feishu/src/outbound.ts | 2 + .../feishu/src/reply-dispatcher.test.ts | 15 +++ extensions/feishu/src/reply-dispatcher.ts | 1 + extensions/speech-core/src/tts.test.ts | 30 +++++ extensions/speech-core/src/tts.ts | 33 ++++- src/agents/tools/tts-tool.test.ts | 22 ++++ src/agents/tools/tts-tool.ts | 2 +- src/plugin-sdk/tts-runtime.types.ts | 11 ++ 16 files changed, 416 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 12f087a5f0d..a39003f644e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -21,6 +21,7 @@ Docs: https://docs.openclaw.ai - Control UI: make `/usage` use the fresh context snapshot for context percentage, and include cache-write tokens in the Usage overview cache-hit denominator. Fixes #47885. Thanks @imwyvern and @Ante042. - GitHub Copilot: preserve encrypted Responses reasoning item IDs during replay so Copilot can validate encrypted reasoning payloads across requests. (#71448) Thanks @a410979729-sys. - Agents/replies: recover final-answer text when streamed assistant chunks contain only whitespace, preventing completed turns from surfacing as empty-payload errors. Fixes #71454. (#71467) Thanks @Sanjays2402. +- Feishu/TTS: transcode voice-intent MP3 and other audio replies to Ogg/Opus before sending native Feishu audio bubbles, while keeping ordinary MP3 attachments as files. Fixes #61249 and #37868. - Telegram/webhook: acknowledge validated webhook updates before running bot middleware, keeping slow agent turns from tripping Telegram delivery retries while preserving per-chat processing lanes. Fixes #71392. Thanks @joelforsberg46-source. - MCP: retire one-shot embedded bundled MCP runtimes at run end, skip bundle-MCP startup when a runtime tool allowlist cannot reach bundle-MCP tools, and add `mcp.sessionIdleTtlMs` idle eviction for leaked session runtimes. Fixes #71106, #71110, #70389, and #70808. - MCP/config reload: hot-apply `mcp.*` changes by disposing cached session MCP runtimes, and dispose bundled MCP runtimes during gateway shutdown so removed `mcp.servers` entries reap child processes promptly. Fixes #60656. diff --git a/docs/channels/feishu.md b/docs/channels/feishu.md index 50571935ae4..c839818a492 100644 --- a/docs/channels/feishu.md +++ b/docs/channels/feishu.md @@ -424,6 +424,14 @@ Full configuration: [Gateway configuration](/gateway/configuration) - ✅ Interactive cards (including streaming updates) - ⚠️ Rich text (post-style formatting; doesn't support full Feishu/Lark authoring capabilities) +Native Feishu/Lark audio bubbles use the Feishu `audio` message type and require +Ogg/Opus upload media (`file_type: "opus"`). Existing `.opus` and `.ogg` media +is sent directly as native audio. MP3/WAV/M4A and other likely audio formats are +transcoded to 48kHz Ogg/Opus with `ffmpeg` only when the reply requests voice +delivery (`audioAsVoice` / message tool `asVoice`, including TTS voice-note +replies). Ordinary MP3 attachments stay regular files. If `ffmpeg` is missing or +conversion fails, OpenClaw falls back to a file attachment and logs the reason. + ### Threads and replies - ✅ Inline replies diff --git a/docs/tools/tts.md b/docs/tools/tts.md index 16780a098f0..2d7c6cb9c99 100644 --- a/docs/tools/tts.md +++ b/docs/tools/tts.md @@ -489,8 +489,12 @@ These override `messages.tts.*` for that host. ## Output formats (fixed) -- **Feishu / Matrix / Telegram / WhatsApp**: Opus voice message (`opus_48000_64` from ElevenLabs, `opus` from OpenAI). +- **Feishu / Matrix / Telegram / WhatsApp**: voice-note replies prefer Opus (`opus_48000_64` from ElevenLabs, `opus` from OpenAI). - 48kHz / 64kbps is a good voice message tradeoff. +- **Feishu**: when a voice-note reply is produced as MP3/WAV/M4A or another + likely audio file, the Feishu plugin transcodes it to 48kHz Ogg/Opus with + `ffmpeg` before sending the native `audio` bubble. If conversion fails, Feishu + receives the original file as an attachment. - **Other channels**: MP3 (`mp3_44100_128` from ElevenLabs, `mp3` from OpenAI). - 44.1kHz / 128kbps is the default balance for speech clarity. - **MiniMax**: MP3 (`speech-2.8-hd` model, 32kHz sample rate) for normal audio attachments. For voice-note targets such as Feishu and Telegram, OpenClaw transcodes the MiniMax MP3 to 48kHz Opus with `ffmpeg` before delivery. @@ -572,6 +576,8 @@ Notes: The `tts` tool converts text to speech and returns an audio attachment for reply delivery. When the channel is Feishu, Matrix, Telegram, or WhatsApp, the audio is delivered as a voice message rather than a file attachment. +Feishu can transcode non-Opus TTS output on this path when `ffmpeg` is +available. It accepts optional `channel` and `timeoutMs` fields; `timeoutMs` is a per-call provider request timeout in milliseconds. diff --git a/extensions/feishu/src/channel.test.ts b/extensions/feishu/src/channel.test.ts index 2811b7fcfa4..a6e7af830ce 100644 --- a/extensions/feishu/src/channel.test.ts +++ b/extensions/feishu/src/channel.test.ts @@ -461,6 +461,34 @@ describe("feishuPlugin actions", () => { expect(result?.details).toMatchObject({ messageId: "om_media" }); }); + it("passes asVoice through media sends", async () => { + feishuOutboundSendMediaMock.mockResolvedValueOnce({ + channel: "feishu", + messageId: "om_voice", + details: { messageId: "om_voice", chatId: "oc_group_1" }, + }); + + await feishuPlugin.actions?.handleAction?.({ + action: "send", + params: { + to: "chat:oc_group_1", + media: "https://example.com/reply.mp3", + asVoice: true, + }, + cfg, + accountId: undefined, + toolContext: {}, + mediaLocalRoots: [], + } as never); + + expect(feishuOutboundSendMediaMock).toHaveBeenCalledWith( + expect.objectContaining({ + mediaUrl: "https://example.com/reply.mp3", + audioAsVoice: true, + }), + ); + }); + it("reads messages", async () => { getMessageFeishuMock.mockResolvedValueOnce({ messageId: "om_1", diff --git a/extensions/feishu/src/channel.ts b/extensions/feishu/src/channel.ts index 55ef53d8e42..07844d57a0c 100644 --- a/extensions/feishu/src/channel.ts +++ b/extensions/feishu/src/channel.ts @@ -81,6 +81,16 @@ function readFeishuMediaParam(params: Record): string | undefin return media.trim() ? media : undefined; } +function readBooleanParam(params: Record, keys: string[]): boolean | undefined { + for (const key of keys) { + const value = params[key]; + if (typeof value === "boolean") { + return value; + } + } + return undefined; +} + function hasLegacyFeishuCardCommandValue(actionValue: unknown): boolean { return ( isRecord(actionValue) && @@ -695,6 +705,7 @@ export const feishuPlugin: ChannelPlugin vi.fn()); const normalizeFeishuTargetMock = vi.hoisted(() => vi.fn()); const resolveReceiveIdTypeMock = vi.hoisted(() => vi.fn()); const loadWebMediaMock = vi.hoisted(() => vi.fn()); +const runFfmpegMock = vi.hoisted(() => vi.fn()); const fileCreateMock = vi.hoisted(() => vi.fn()); const imageCreateMock = vi.hoisted(() => vi.fn()); @@ -42,6 +43,14 @@ vi.mock("./runtime.js", () => ({ }), })); +vi.mock("openclaw/plugin-sdk/media-runtime", async (importOriginal) => { + const actual = await importOriginal(); + return { + ...actual, + runFfmpeg: runFfmpegMock, + }; +}); + vi.mock("../../../src/channels/plugins/bundled.js", () => ({ bundledChannelPlugins: [], bundledChannelSetupPlugins: [], @@ -145,6 +154,10 @@ describe("sendMediaFeishu msg_type routing", () => { imageGetMock.mockResolvedValue(Buffer.from("image-bytes")); messageResourceGetMock.mockResolvedValue(Buffer.from("resource-bytes")); + runFfmpegMock.mockImplementation(async (args: string[]) => { + await fs.writeFile(args.at(-1) ?? "", Buffer.from("opus-output")); + return ""; + }); }); it("uses msg_type=media for mp4 video", async () => { @@ -260,6 +273,104 @@ describe("sendMediaFeishu msg_type routing", () => { data: expect.objectContaining({ msg_type: "file" }), }), ); + expect(runFfmpegMock).not.toHaveBeenCalled(); + }); + + it("transcodes voice-intent mp3 to msg_type=audio", async () => { + loadWebMediaMock.mockResolvedValueOnce({ + buffer: Buffer.from("remote-mp3"), + fileName: "reply.mp3", + kind: "audio", + contentType: "audio/mpeg", + }); + + await sendMediaFeishu({ + cfg: emptyConfig, + to: "user:ou_target", + mediaUrl: "https://example.com/reply.mp3", + audioAsVoice: true, + }); + + expect(runFfmpegMock).toHaveBeenCalledWith( + expect.arrayContaining(["-c:a", "libopus", "-ar", "48000", "-b:a", "64k"]), + ); + expect(fileCreateMock).toHaveBeenCalledWith( + expect.objectContaining({ + data: expect.objectContaining({ + file_type: "opus", + file_name: "voice.ogg", + file: Buffer.from("opus-output"), + }), + }), + ); + expect(messageCreateMock).toHaveBeenCalledWith( + expect.objectContaining({ + data: expect.objectContaining({ msg_type: "audio" }), + }), + ); + }); + + it("leaves native voice audio unchanged when audioAsVoice is true", async () => { + await sendMediaFeishu({ + cfg: emptyConfig, + to: "user:ou_target", + mediaBuffer: Buffer.from("opus"), + fileName: "reply.ogg", + audioAsVoice: true, + }); + + expect(runFfmpegMock).not.toHaveBeenCalled(); + expect(fileCreateMock).toHaveBeenCalledWith( + expect.objectContaining({ + data: expect.objectContaining({ + file_type: "opus", + file_name: "reply.ogg", + }), + }), + ); + expect(messageCreateMock).toHaveBeenCalledWith( + expect.objectContaining({ + data: expect.objectContaining({ msg_type: "audio" }), + }), + ); + }); + + it("falls back to file when voice-intent audio cannot be transcoded", async () => { + const warnSpy = vi.spyOn(console, "warn").mockImplementation(() => undefined); + runFfmpegMock.mockRejectedValueOnce(new Error("ffmpeg missing")); + loadWebMediaMock.mockResolvedValueOnce({ + buffer: Buffer.from("remote-mp3"), + fileName: "reply.mp3", + kind: "audio", + contentType: "audio/mpeg", + }); + + await sendMediaFeishu({ + cfg: emptyConfig, + to: "user:ou_target", + mediaUrl: "https://example.com/reply.mp3", + audioAsVoice: true, + }); + + expect(fileCreateMock).toHaveBeenCalledWith( + expect.objectContaining({ + data: expect.objectContaining({ + file_type: "stream", + file_name: "reply.mp3", + file: Buffer.from("remote-mp3"), + }), + }), + ); + expect(messageCreateMock).toHaveBeenCalledWith( + expect.objectContaining({ + data: expect.objectContaining({ msg_type: "file" }), + }), + ); + expect(warnSpy).toHaveBeenCalledWith( + expect.stringContaining("audioAsVoice transcode failed"), + expect.any(Error), + ); + warnSpy.mockRestore(); }); it("configures the media client timeout for image uploads", async () => { diff --git a/extensions/feishu/src/media.ts b/extensions/feishu/src/media.ts index 6497db99955..529dba5b616 100644 --- a/extensions/feishu/src/media.ts +++ b/extensions/feishu/src/media.ts @@ -3,7 +3,11 @@ import path from "node:path"; import { Readable } from "node:stream"; import type * as Lark from "@larksuiteoapi/node-sdk"; import { mediaKindFromMime } from "openclaw/plugin-sdk/media-mime"; -import { withTempDownloadPath } from "openclaw/plugin-sdk/temp-path"; +import { MEDIA_FFMPEG_MAX_AUDIO_DURATION_SECS, runFfmpeg } from "openclaw/plugin-sdk/media-runtime"; +import { + resolvePreferredOpenClawTmpDir, + withTempDownloadPath, +} from "openclaw/plugin-sdk/temp-path"; import { normalizeLowercaseStringOrEmpty } from "openclaw/plugin-sdk/text-runtime"; import type { ClawdbotConfig } from "../runtime-api.js"; import { resolveFeishuRuntimeAccount } from "./accounts.js"; @@ -14,6 +18,24 @@ import { assertFeishuMessageApiSuccess, toFeishuSendResult } from "./send-result import { resolveFeishuSendTarget } from "./send-target.js"; const FEISHU_MEDIA_HTTP_TIMEOUT_MS = 120_000; +const FEISHU_VOICE_FILE_NAME = "voice.ogg"; +const FEISHU_VOICE_SAMPLE_RATE_HZ = 48_000; +const FEISHU_VOICE_BITRATE = "64k"; + +const FEISHU_TRANSCODABLE_AUDIO_EXTS = new Set([ + ".aac", + ".aiff", + ".alac", + ".amr", + ".caf", + ".flac", + ".m4a", + ".mp3", + ".oga", + ".wav", + ".webm", + ".wma", +]); export type DownloadImageResult = { buffer: Buffer; @@ -568,6 +590,89 @@ function resolveFeishuOutboundMediaKind(params: { fileName: string; contentType? }; } +function isFeishuNativeVoiceAudio(params: { fileName: string; contentType?: string }): boolean { + const ext = normalizeLowercaseStringOrEmpty(path.extname(params.fileName)); + const contentType = normalizeLowercaseStringOrEmpty(params.contentType); + return ( + ext === ".opus" || ext === ".ogg" || contentType === "audio/ogg" || contentType === "audio/opus" + ); +} + +function isLikelyTranscodableAudio(params: { fileName: string; contentType?: string }): boolean { + const ext = normalizeLowercaseStringOrEmpty(path.extname(params.fileName)); + const contentType = normalizeLowercaseStringOrEmpty(params.contentType); + return FEISHU_TRANSCODABLE_AUDIO_EXTS.has(ext) || mediaKindFromMime(contentType) === "audio"; +} + +async function transcodeToFeishuVoiceOpus(params: { + buffer: Buffer; + fileName: string; + contentType?: string; +}): Promise<{ buffer: Buffer; fileName: string; contentType: string }> { + const tempRoot = resolvePreferredOpenClawTmpDir(); + await fs.promises.mkdir(tempRoot, { recursive: true, mode: 0o700 }); + const tempDir = await fs.promises.mkdtemp(path.join(tempRoot, "feishu-voice-")); + try { + const ext = normalizeLowercaseStringOrEmpty(path.extname(params.fileName)); + const inputExt = ext && ext.length <= 12 ? ext : ".audio"; + const inputPath = path.join(tempDir, `input${inputExt}`); + const outputPath = path.join(tempDir, FEISHU_VOICE_FILE_NAME); + await fs.promises.writeFile(inputPath, params.buffer, { mode: 0o600 }); + await runFfmpeg([ + "-hide_banner", + "-loglevel", + "error", + "-y", + "-i", + inputPath, + "-vn", + "-sn", + "-dn", + "-t", + String(MEDIA_FFMPEG_MAX_AUDIO_DURATION_SECS), + "-ar", + String(FEISHU_VOICE_SAMPLE_RATE_HZ), + "-ac", + "1", + "-c:a", + "libopus", + "-b:a", + FEISHU_VOICE_BITRATE, + outputPath, + ]); + return { + buffer: await fs.promises.readFile(outputPath), + fileName: FEISHU_VOICE_FILE_NAME, + contentType: "audio/ogg", + }; + } finally { + await fs.promises.rm(tempDir, { recursive: true, force: true }); + } +} + +async function prepareFeishuVoiceMedia(params: { + buffer: Buffer; + fileName: string; + contentType?: string; + audioAsVoice?: boolean; +}): Promise<{ buffer: Buffer; fileName: string; contentType?: string }> { + if (isFeishuNativeVoiceAudio(params)) { + return params; + } + if (params.audioAsVoice !== true || !isLikelyTranscodableAudio(params)) { + return params; + } + try { + return await transcodeToFeishuVoiceOpus(params); + } catch (err) { + console.warn( + `[feishu] audioAsVoice transcode failed; sending ${params.fileName} as a file attachment:`, + err, + ); + return params; + } +} + /** * Upload and send media (image or file) from URL, local path, or buffer. * When mediaUrl is a local path, mediaLocalRoots (from core outbound context) @@ -584,6 +689,8 @@ export async function sendMediaFeishu(params: { accountId?: string; /** Allowed roots for local path reads; required for local filePath to work. */ mediaLocalRoots?: readonly string[]; + /** When true, transcode compatible audio to Feishu native Ogg/Opus voice bubbles. */ + audioAsVoice?: boolean; }): Promise { const { cfg, @@ -595,6 +702,7 @@ export async function sendMediaFeishu(params: { replyInThread, accountId, mediaLocalRoots, + audioAsVoice, } = params; const account = resolveFeishuRuntimeAccount({ cfg, accountId }); if (!account.configured) { @@ -622,6 +730,16 @@ export async function sendMediaFeishu(params: { throw new Error("Either mediaUrl or mediaBuffer must be provided"); } + const prepared = await prepareFeishuVoiceMedia({ + buffer, + fileName: name, + contentType, + audioAsVoice, + }); + buffer = prepared.buffer; + name = prepared.fileName; + contentType = prepared.contentType; + const routing = resolveFeishuOutboundMediaKind({ fileName: name, contentType }); if (routing.msgType === "image") { diff --git a/extensions/feishu/src/outbound.test.ts b/extensions/feishu/src/outbound.test.ts index 0c070dd6d38..db939731277 100644 --- a/extensions/feishu/src/outbound.test.ts +++ b/extensions/feishu/src/outbound.test.ts @@ -457,6 +457,24 @@ describe("feishuOutbound.sendMedia replyToId forwarding", () => { ); }); + it("forwards audioAsVoice to sendMediaFeishu", async () => { + await feishuOutbound.sendMedia?.({ + cfg: emptyConfig, + to: "chat_1", + text: "", + mediaUrl: "https://example.com/reply.mp3", + audioAsVoice: true, + accountId: "main", + }); + + expect(sendMediaFeishuMock).toHaveBeenCalledWith( + expect.objectContaining({ + mediaUrl: "https://example.com/reply.mp3", + audioAsVoice: true, + }), + ); + }); + it("forwards replyToId to text caption send", async () => { await feishuOutbound.sendMedia?.({ cfg: emptyConfig, diff --git a/extensions/feishu/src/outbound.ts b/extensions/feishu/src/outbound.ts index d451786730b..43493490ccd 100644 --- a/extensions/feishu/src/outbound.ts +++ b/extensions/feishu/src/outbound.ts @@ -232,6 +232,7 @@ export const feishuOutbound: ChannelOutboundAdapter = { to, text, mediaUrl, + audioAsVoice, accountId, mediaLocalRoots, replyToId, @@ -271,6 +272,7 @@ export const feishuOutbound: ChannelOutboundAdapter = { accountId: accountId ?? undefined, mediaLocalRoots, replyToMessageId, + ...(audioAsVoice === true ? { audioAsVoice: true } : {}), }); } catch (err) { // Log the error for debugging diff --git a/extensions/feishu/src/reply-dispatcher.test.ts b/extensions/feishu/src/reply-dispatcher.test.ts index 16411a9ed51..9dbc60b3f6a 100644 --- a/extensions/feishu/src/reply-dispatcher.test.ts +++ b/extensions/feishu/src/reply-dispatcher.test.ts @@ -469,6 +469,21 @@ describe("createFeishuReplyDispatcher streaming behavior", () => { expect(sendMarkdownCardFeishuMock).not.toHaveBeenCalled(); }); + it("passes audioAsVoice to media attachments", async () => { + const { options } = createDispatcherHarness(); + await options.deliver( + { mediaUrl: "https://example.com/reply.mp3", audioAsVoice: true }, + { kind: "final" }, + ); + + expect(sendMediaFeishuMock).toHaveBeenCalledWith( + expect.objectContaining({ + mediaUrl: "https://example.com/reply.mp3", + audioAsVoice: true, + }), + ); + }); + it("falls back to legacy mediaUrl when mediaUrls is an empty array", async () => { const { options } = createDispatcherHarness(); await options.deliver( diff --git a/extensions/feishu/src/reply-dispatcher.ts b/extensions/feishu/src/reply-dispatcher.ts index 460c0413137..dd4857a9cc2 100644 --- a/extensions/feishu/src/reply-dispatcher.ts +++ b/extensions/feishu/src/reply-dispatcher.ts @@ -396,6 +396,7 @@ export function createFeishuReplyDispatcher(params: CreateFeishuReplyDispatcherP replyToMessageId: sendReplyToMessageId, replyInThread: effectiveReplyInThread, accountId, + ...(payload.audioAsVoice === true ? { audioAsVoice: true } : {}), }); }, }); diff --git a/extensions/speech-core/src/tts.test.ts b/extensions/speech-core/src/tts.test.ts index d6fb3b164a8..b13aff709c2 100644 --- a/extensions/speech-core/src/tts.test.ts +++ b/extensions/speech-core/src/tts.test.ts @@ -118,6 +118,36 @@ describe("speech-core native voice-note routing", () => { }); }); + it("marks Feishu voice-note TTS for channel-side transcoding when provider returns mp3", async () => { + synthesizeMock.mockResolvedValueOnce({ + audioBuffer: Buffer.from("mp3"), + outputFormat: "mp3", + fileExtension: ".mp3", + voiceCompatible: false, + }); + const cfg = createTtsConfig("openclaw-speech-core-tts-feishu-mp3-test"); + let mediaDir: string | undefined; + try { + const result = await maybeApplyTtsToPayload({ + payload: { text: "This Feishu reply should be transcoded by the channel." }, + cfg, + channel: "feishu", + kind: "final", + }); + + expect(synthesizeMock).toHaveBeenCalledWith( + expect.objectContaining({ target: "voice-note" }), + ); + expect(result.audioAsVoice).toBe(true); + expect(result.mediaUrl).toMatch(/voice-\d+\.mp3$/); + mediaDir = result.mediaUrl ? path.dirname(result.mediaUrl) : undefined; + } finally { + if (mediaDir) { + rmSync(mediaDir, { recursive: true, force: true }); + } + } + }); + it("keeps non-native voice-note channels as regular audio files", async () => { await expectTtsPayloadResult({ channel: "slack", diff --git a/extensions/speech-core/src/tts.ts b/extensions/speech-core/src/tts.ts index 4eccca12e19..47f78df91dc 100644 --- a/extensions/speech-core/src/tts.ts +++ b/extensions/speech-core/src/tts.ts @@ -100,6 +100,8 @@ export type TtsResult = { attempts?: TtsProviderAttempt[]; outputFormat?: string; voiceCompatible?: boolean; + audioAsVoice?: boolean; + target?: "audio-file" | "voice-note"; }; export type TtsSynthesisResult = { @@ -114,6 +116,7 @@ export type TtsSynthesisResult = { outputFormat?: string; voiceCompatible?: boolean; fileExtension?: string; + target?: "audio-file" | "voice-note"; }; export type TtsTelephonyResult = { @@ -586,6 +589,7 @@ export function setLastTtsAttempt(entry: TtsStatusEntry | undefined): void { } const OPUS_CHANNELS = new Set(["telegram", "feishu", "whatsapp", "matrix", "discord"]); +const TRANSCODED_VOICE_NOTE_CHANNELS = new Set(["feishu"]); function resolveChannelId(channel: string | undefined): ChannelId | null { return channel ? normalizeChannelId(channel) : null; @@ -596,6 +600,22 @@ function supportsNativeVoiceNoteTts(channel: string | undefined): boolean { return channelId !== null && OPUS_CHANNELS.has(channelId); } +function supportsTranscodedVoiceNoteTts(channel: string | undefined): boolean { + const channelId = resolveChannelId(channel); + return channelId !== null && TRANSCODED_VOICE_NOTE_CHANNELS.has(channelId); +} + +function shouldDeliverTtsAsVoice(params: { + channel: string | undefined; + target: "audio-file" | "voice-note" | undefined; + voiceCompatible: boolean | undefined; +}): boolean { + if (!supportsNativeVoiceNoteTts(params.channel) || params.target !== "voice-note") { + return false; + } + return params.voiceCompatible === true || supportsTranscodedVoiceNoteTts(params.channel); +} + export function resolveTtsProviderOrder(primary: TtsProvider, cfg?: OpenClawConfig): TtsProvider[] { const normalizedPrimary = canonicalizeSpeechProviderId(primary, cfg) ?? primary; const ordered = new Set([normalizedPrimary]); @@ -782,6 +802,12 @@ export async function textToSpeech(params: { attempts: synthesis.attempts, outputFormat: synthesis.outputFormat, voiceCompatible: synthesis.voiceCompatible, + audioAsVoice: shouldDeliverTtsAsVoice({ + channel: params.channel, + target: synthesis.target, + voiceCompatible: synthesis.voiceCompatible, + }), + target: synthesis.target, }; } @@ -863,6 +889,7 @@ export async function synthesizeSpeech(params: { outputFormat: synthesis.outputFormat, voiceCompatible: synthesis.voiceCompatible, fileExtension: synthesis.fileExtension, + target, }; } catch (err) { const errorMsg = formatTtsProviderError(provider, err); @@ -1171,12 +1198,10 @@ export async function maybeApplyTtsToPayload(params: { latencyMs: result.latencyMs, }; - const shouldVoice = - supportsNativeVoiceNoteTts(params.channel) && result.voiceCompatible === true; return { ...nextPayload, mediaUrl: result.audioPath, - audioAsVoice: shouldVoice || params.payload.audioAsVoice, + audioAsVoice: result.audioAsVoice || params.payload.audioAsVoice, }; } @@ -1199,6 +1224,8 @@ export const _test = { parseTtsDirectives, resolveModelOverridePolicy, supportsNativeVoiceNoteTts, + supportsTranscodedVoiceNoteTts, + shouldDeliverTtsAsVoice, summarizeText, getResolvedSpeechProviderConfig, formatTtsProviderError, diff --git a/src/agents/tools/tts-tool.test.ts b/src/agents/tools/tts-tool.test.ts index e833d5eb6df..412e42b8667 100644 --- a/src/agents/tools/tts-tool.test.ts +++ b/src/agents/tools/tts-tool.test.ts @@ -43,6 +43,28 @@ describe("createTtsTool", () => { expect(JSON.stringify(result.content)).not.toContain("MEDIA:"); }); + it("uses audioAsVoice from the TTS runtime even when the provider output is not native", async () => { + textToSpeechSpy.mockResolvedValue({ + success: true, + audioPath: "/tmp/reply.mp3", + provider: "test", + voiceCompatible: false, + audioAsVoice: true, + }); + + const tool = createTtsTool(); + const result = await tool.execute("call-1", { text: "hello", channel: "feishu" }); + + expect(result).toMatchObject({ + details: { + media: { + mediaUrl: "/tmp/reply.mp3", + audioAsVoice: true, + }, + }, + }); + }); + it("passes an optional timeout to speech generation", async () => { textToSpeechSpy.mockResolvedValue({ success: true, diff --git a/src/agents/tools/tts-tool.ts b/src/agents/tools/tts-tool.ts index a3db27121cb..8861cf3ad0b 100644 --- a/src/agents/tools/tts-tool.ts +++ b/src/agents/tools/tts-tool.ts @@ -92,7 +92,7 @@ export function createTtsTool(opts?: { media: { mediaUrl: result.audioPath, trustedLocalMedia: true, - ...(result.voiceCompatible ? { audioAsVoice: true } : {}), + ...(result.audioAsVoice || result.voiceCompatible ? { audioAsVoice: true } : {}), }, }, }; diff --git a/src/plugin-sdk/tts-runtime.types.ts b/src/plugin-sdk/tts-runtime.types.ts index 0e43dba237a..09a8cac2ea1 100644 --- a/src/plugin-sdk/tts-runtime.types.ts +++ b/src/plugin-sdk/tts-runtime.types.ts @@ -41,6 +41,8 @@ export type TtsStatusEntry = { error?: string; }; +export type TtsSpeechTarget = "audio-file" | "voice-note"; + export type SummarizeResult = { summary: string; latencyMs: number; @@ -99,6 +101,12 @@ export type TtsTestFacade = { parseTtsDirectives: (...args: unknown[]) => TtsDirectiveParseResult; resolveModelOverridePolicy: (...args: unknown[]) => ResolvedTtsModelOverrides; supportsNativeVoiceNoteTts: (channel: string | undefined) => boolean; + supportsTranscodedVoiceNoteTts: (channel: string | undefined) => boolean; + shouldDeliverTtsAsVoice: (params: { + channel: string | undefined; + target: TtsSpeechTarget | undefined; + voiceCompatible: boolean | undefined; + }) => boolean; summarizeText: (...args: unknown[]) => Promise; getResolvedSpeechProviderConfig: ( config: ResolvedTtsConfig, @@ -120,6 +128,8 @@ export type TtsResult = { attempts?: TtsProviderAttempt[]; outputFormat?: string; voiceCompatible?: boolean; + audioAsVoice?: boolean; + target?: TtsSpeechTarget; }; export type TtsSynthesisResult = { @@ -134,6 +144,7 @@ export type TtsSynthesisResult = { outputFormat?: string; voiceCompatible?: boolean; fileExtension?: string; + target?: TtsSpeechTarget; }; export type TtsTelephonyResult = {