fix(feishu): transcode voice-intent TTS audio to Ogg/Opus

Peter Steinberger
2026-04-25 09:26:08 +01:00
parent bd32b1a906
commit b0c55eb659
16 changed files with 416 additions and 6 deletions

View File

@@ -21,6 +21,7 @@ Docs: https://docs.openclaw.ai
- Control UI: make `/usage` use the fresh context snapshot for context percentage, and include cache-write tokens in the Usage overview cache-hit denominator. Fixes #47885. Thanks @imwyvern and @Ante042.
- GitHub Copilot: preserve encrypted Responses reasoning item IDs during replay so Copilot can validate encrypted reasoning payloads across requests. (#71448) Thanks @a410979729-sys.
- Agents/replies: recover final-answer text when streamed assistant chunks contain only whitespace, preventing completed turns from surfacing as empty-payload errors. Fixes #71454. (#71467) Thanks @Sanjays2402.
- Feishu/TTS: transcode voice-intent MP3 and other audio replies to Ogg/Opus before sending native Feishu audio bubbles, while keeping ordinary MP3 attachments as files. Fixes #61249 and #37868.
- Telegram/webhook: acknowledge validated webhook updates before running bot middleware, keeping slow agent turns from tripping Telegram delivery retries while preserving per-chat processing lanes. Fixes #71392. Thanks @joelforsberg46-source.
- MCP: retire one-shot embedded bundled MCP runtimes at run end, skip bundle-MCP startup when a runtime tool allowlist cannot reach bundle-MCP tools, and add `mcp.sessionIdleTtlMs` idle eviction for leaked session runtimes. Fixes #71106, #71110, #70389, and #70808.
- MCP/config reload: hot-apply `mcp.*` changes by disposing cached session MCP runtimes, and dispose bundled MCP runtimes during gateway shutdown so removed `mcp.servers` entries reap child processes promptly. Fixes #60656.

View File

@@ -424,6 +424,14 @@ Full configuration: [Gateway configuration](/gateway/configuration)
- ✅ Interactive cards (including streaming updates)
- ⚠️ Rich text (post-style formatting; doesn't support full Feishu/Lark authoring capabilities)
Native Feishu/Lark audio bubbles use the Feishu `audio` message type and require
Ogg/Opus upload media (`file_type: "opus"`). Existing `.opus` and `.ogg` media
is sent directly as native audio. MP3/WAV/M4A and other likely audio formats are
transcoded to 48kHz Ogg/Opus with `ffmpeg` only when the reply requests voice
delivery (`audioAsVoice` / message tool `asVoice`, including TTS voice-note
replies). Ordinary MP3 attachments stay regular files. If `ffmpeg` is missing or
conversion fails, OpenClaw falls back to a file attachment and logs the reason.
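For illustration, a minimal sketch of a message-tool send that opts into voice
delivery, adapted from this commit's plugin tests (the `cfg`, `toolContext`, and
`mediaLocalRoots` fields come from the test harness):

```ts
// `asVoice: true` asks the Feishu plugin for a native audio bubble,
// triggering the Ogg/Opus transcode for the MP3 reply.
await feishuPlugin.actions?.handleAction?.({
  action: "send",
  params: {
    to: "chat:oc_group_1",
    media: "https://example.com/reply.mp3",
    asVoice: true,
  },
  cfg,
  accountId: undefined,
  toolContext: {},
  mediaLocalRoots: [],
} as never);
```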
### Threads and replies
- ✅ Inline replies

View File

@@ -489,8 +489,12 @@ These override `messages.tts.*` for that host.
## Output formats (fixed)
- **Feishu / Matrix / Telegram / WhatsApp**: Opus voice message (`opus_48000_64` from ElevenLabs, `opus` from OpenAI).
- **Feishu / Matrix / Telegram / WhatsApp**: voice-note replies prefer Opus (`opus_48000_64` from ElevenLabs, `opus` from OpenAI).
- 48kHz / 64kbps is a good tradeoff for voice messages (64 kbps ≈ 8 KB of audio per second).
- **Feishu**: when a voice-note reply is produced as MP3/WAV/M4A or another
likely audio file, the Feishu plugin transcodes it to 48kHz Ogg/Opus with
`ffmpeg` before sending the native `audio` bubble. If conversion fails, Feishu
receives the original file as an attachment.
- **Other channels**: MP3 (`mp3_44100_128` from ElevenLabs, `mp3` from OpenAI).
- 44.1kHz / 128kbps is the default balance for speech clarity.
- **MiniMax**: MP3 (`speech-2.8-hd` model, 32kHz sample rate) for normal audio attachments. For voice-note targets such as Feishu and Telegram, OpenClaw transcodes the MiniMax MP3 to 48kHz Opus with `ffmpeg` before delivery.
@@ -572,6 +576,8 @@ Notes:
The `tts` tool converts text to speech and returns an audio attachment for
reply delivery. When the channel is Feishu, Matrix, Telegram, or WhatsApp,
the audio is delivered as a voice message rather than a file attachment.
Feishu can transcode non-Opus TTS output on this path when `ffmpeg` is
available.
The tool accepts optional `channel` and `timeoutMs` fields; `timeoutMs` is a
per-call provider request timeout in milliseconds.
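As a rough usage sketch mirroring this commit's tool tests (the `timeoutMs`
value here is illustrative):

```ts
const tool = createTtsTool();
const result = await tool.execute("call-1", {
  text: "hello",
  channel: "feishu",
  timeoutMs: 30_000, // illustrative per-call provider timeout
});
// On Feishu, result.details.media.audioAsVoice === true marks the reply for
// native voice-bubble delivery, with channel-side transcoding when needed.
```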

View File

@@ -461,6 +461,34 @@ describe("feishuPlugin actions", () => {
expect(result?.details).toMatchObject({ messageId: "om_media" });
});
it("passes asVoice through media sends", async () => {
feishuOutboundSendMediaMock.mockResolvedValueOnce({
channel: "feishu",
messageId: "om_voice",
details: { messageId: "om_voice", chatId: "oc_group_1" },
});
await feishuPlugin.actions?.handleAction?.({
action: "send",
params: {
to: "chat:oc_group_1",
media: "https://example.com/reply.mp3",
asVoice: true,
},
cfg,
accountId: undefined,
toolContext: {},
mediaLocalRoots: [],
} as never);
expect(feishuOutboundSendMediaMock).toHaveBeenCalledWith(
expect.objectContaining({
mediaUrl: "https://example.com/reply.mp3",
audioAsVoice: true,
}),
);
});
it("reads messages", async () => {
getMessageFeishuMock.mockResolvedValueOnce({
messageId: "om_1",

View File

@@ -81,6 +81,16 @@ function readFeishuMediaParam(params: Record<string, unknown>): string | undefin
return media.trim() ? media : undefined;
}
function readBooleanParam(params: Record<string, unknown>, keys: string[]): boolean | undefined {
for (const key of keys) {
const value = params[key];
if (typeof value === "boolean") {
return value;
}
}
return undefined;
}
function hasLegacyFeishuCardCommandValue(actionValue: unknown): boolean {
return (
isRecord(actionValue) &&
@@ -695,6 +705,7 @@ export const feishuPlugin: ChannelPlugin<ResolvedFeishuAccount, FeishuProbeResul
const presentation = normalizeMessagePresentation(ctx.params.presentation);
const text = readFirstString(ctx.params, ["text", "message"]);
const mediaUrl = readFeishuMediaParam(ctx.params);
const audioAsVoice = readBooleanParam(ctx.params, ["asVoice", "audioAsVoice"]);
const card = presentation
? buildFeishuPresentationCard({ presentation, fallbackText: text })
: undefined;
@@ -734,6 +745,7 @@ export const feishuPlugin: ChannelPlugin<ResolvedFeishuAccount, FeishuProbeResul
accountId: ctx.accountId ?? undefined,
mediaLocalRoots: ctx.mediaLocalRoots,
replyToId: replyToMessageId,
...(audioAsVoice === true ? { audioAsVoice: true } : {}),
});
} else {
result = await runtime.sendMessageFeishu({

View File

@@ -9,6 +9,7 @@ const resolveFeishuAccountMock = vi.hoisted(() => vi.fn());
const normalizeFeishuTargetMock = vi.hoisted(() => vi.fn());
const resolveReceiveIdTypeMock = vi.hoisted(() => vi.fn());
const loadWebMediaMock = vi.hoisted(() => vi.fn());
const runFfmpegMock = vi.hoisted(() => vi.fn());
const fileCreateMock = vi.hoisted(() => vi.fn());
const imageCreateMock = vi.hoisted(() => vi.fn());
@@ -42,6 +43,14 @@ vi.mock("./runtime.js", () => ({
}),
}));
vi.mock("openclaw/plugin-sdk/media-runtime", async (importOriginal) => {
const actual = await importOriginal<typeof import("openclaw/plugin-sdk/media-runtime")>();
return {
...actual,
runFfmpeg: runFfmpegMock,
};
});
vi.mock("../../../src/channels/plugins/bundled.js", () => ({
bundledChannelPlugins: [],
bundledChannelSetupPlugins: [],
@@ -145,6 +154,10 @@ describe("sendMediaFeishu msg_type routing", () => {
imageGetMock.mockResolvedValue(Buffer.from("image-bytes"));
messageResourceGetMock.mockResolvedValue(Buffer.from("resource-bytes"));
runFfmpegMock.mockImplementation(async (args: string[]) => {
await fs.writeFile(args.at(-1) ?? "", Buffer.from("opus-output"));
return "";
});
});
it("uses msg_type=media for mp4 video", async () => {
@@ -260,6 +273,104 @@ describe("sendMediaFeishu msg_type routing", () => {
data: expect.objectContaining({ msg_type: "file" }),
}),
);
expect(runFfmpegMock).not.toHaveBeenCalled();
});
it("transcodes voice-intent mp3 to msg_type=audio", async () => {
loadWebMediaMock.mockResolvedValueOnce({
buffer: Buffer.from("remote-mp3"),
fileName: "reply.mp3",
kind: "audio",
contentType: "audio/mpeg",
});
await sendMediaFeishu({
cfg: emptyConfig,
to: "user:ou_target",
mediaUrl: "https://example.com/reply.mp3",
audioAsVoice: true,
});
expect(runFfmpegMock).toHaveBeenCalledWith(
expect.arrayContaining(["-c:a", "libopus", "-ar", "48000", "-b:a", "64k"]),
);
expect(fileCreateMock).toHaveBeenCalledWith(
expect.objectContaining({
data: expect.objectContaining({
file_type: "opus",
file_name: "voice.ogg",
file: Buffer.from("opus-output"),
}),
}),
);
expect(messageCreateMock).toHaveBeenCalledWith(
expect.objectContaining({
data: expect.objectContaining({ msg_type: "audio" }),
}),
);
});
it("leaves native voice audio unchanged when audioAsVoice is true", async () => {
await sendMediaFeishu({
cfg: emptyConfig,
to: "user:ou_target",
mediaBuffer: Buffer.from("opus"),
fileName: "reply.ogg",
audioAsVoice: true,
});
expect(runFfmpegMock).not.toHaveBeenCalled();
expect(fileCreateMock).toHaveBeenCalledWith(
expect.objectContaining({
data: expect.objectContaining({
file_type: "opus",
file_name: "reply.ogg",
}),
}),
);
expect(messageCreateMock).toHaveBeenCalledWith(
expect.objectContaining({
data: expect.objectContaining({ msg_type: "audio" }),
}),
);
});
it("falls back to file when voice-intent audio cannot be transcoded", async () => {
const warnSpy = vi.spyOn(console, "warn").mockImplementation(() => undefined);
runFfmpegMock.mockRejectedValueOnce(new Error("ffmpeg missing"));
loadWebMediaMock.mockResolvedValueOnce({
buffer: Buffer.from("remote-mp3"),
fileName: "reply.mp3",
kind: "audio",
contentType: "audio/mpeg",
});
await sendMediaFeishu({
cfg: emptyConfig,
to: "user:ou_target",
mediaUrl: "https://example.com/reply.mp3",
audioAsVoice: true,
});
expect(fileCreateMock).toHaveBeenCalledWith(
expect.objectContaining({
data: expect.objectContaining({
file_type: "stream",
file_name: "reply.mp3",
file: Buffer.from("remote-mp3"),
}),
}),
);
expect(messageCreateMock).toHaveBeenCalledWith(
expect.objectContaining({
data: expect.objectContaining({ msg_type: "file" }),
}),
);
expect(warnSpy).toHaveBeenCalledWith(
expect.stringContaining("audioAsVoice transcode failed"),
expect.any(Error),
);
warnSpy.mockRestore();
});
it("configures the media client timeout for image uploads", async () => {

View File

@@ -3,7 +3,11 @@ import path from "node:path";
import { Readable } from "node:stream";
import type * as Lark from "@larksuiteoapi/node-sdk";
import { mediaKindFromMime } from "openclaw/plugin-sdk/media-mime";
import { withTempDownloadPath } from "openclaw/plugin-sdk/temp-path";
import { MEDIA_FFMPEG_MAX_AUDIO_DURATION_SECS, runFfmpeg } from "openclaw/plugin-sdk/media-runtime";
import {
resolvePreferredOpenClawTmpDir,
withTempDownloadPath,
} from "openclaw/plugin-sdk/temp-path";
import { normalizeLowercaseStringOrEmpty } from "openclaw/plugin-sdk/text-runtime";
import type { ClawdbotConfig } from "../runtime-api.js";
import { resolveFeishuRuntimeAccount } from "./accounts.js";
@@ -14,6 +18,24 @@ import { assertFeishuMessageApiSuccess, toFeishuSendResult } from "./send-result
import { resolveFeishuSendTarget } from "./send-target.js";
const FEISHU_MEDIA_HTTP_TIMEOUT_MS = 120_000;
const FEISHU_VOICE_FILE_NAME = "voice.ogg";
const FEISHU_VOICE_SAMPLE_RATE_HZ = 48_000;
const FEISHU_VOICE_BITRATE = "64k";
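// Audio extensions the Feishu sender will attempt to transcode to Ogg/Opus
// when voice delivery is requested.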
const FEISHU_TRANSCODABLE_AUDIO_EXTS = new Set([
".aac",
".aiff",
".alac",
".amr",
".caf",
".flac",
".m4a",
".mp3",
".oga",
".wav",
".webm",
".wma",
]);
export type DownloadImageResult = {
buffer: Buffer;
@@ -568,6 +590,89 @@ function resolveFeishuOutboundMediaKind(params: { fileName: string; contentType?
};
}
function isFeishuNativeVoiceAudio(params: { fileName: string; contentType?: string }): boolean {
const ext = normalizeLowercaseStringOrEmpty(path.extname(params.fileName));
const contentType = normalizeLowercaseStringOrEmpty(params.contentType);
return (
ext === ".opus" || ext === ".ogg" || contentType === "audio/ogg" || contentType === "audio/opus"
);
}
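// Anything with a known audio extension or an audio/* MIME type counts as a
// transcode candidate.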
function isLikelyTranscodableAudio(params: { fileName: string; contentType?: string }): boolean {
const ext = normalizeLowercaseStringOrEmpty(path.extname(params.fileName));
const contentType = normalizeLowercaseStringOrEmpty(params.contentType);
return FEISHU_TRANSCODABLE_AUDIO_EXTS.has(ext) || mediaKindFromMime(contentType) === "audio";
}
async function transcodeToFeishuVoiceOpus(params: {
buffer: Buffer;
fileName: string;
contentType?: string;
}): Promise<{ buffer: Buffer; fileName: string; contentType: string }> {
const tempRoot = resolvePreferredOpenClawTmpDir();
await fs.promises.mkdir(tempRoot, { recursive: true, mode: 0o700 });
const tempDir = await fs.promises.mkdtemp(path.join(tempRoot, "feishu-voice-"));
try {
const ext = normalizeLowercaseStringOrEmpty(path.extname(params.fileName));
const inputExt = ext && ext.length <= 12 ? ext : ".audio";
const inputPath = path.join(tempDir, `input${inputExt}`);
const outputPath = path.join(tempDir, FEISHU_VOICE_FILE_NAME);
await fs.promises.writeFile(inputPath, params.buffer, { mode: 0o600 });
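// Equivalent CLI: ffmpeg -hide_banner -loglevel error -y -i input.<ext> \
//   -vn -sn -dn -t <max-audio-secs> -ar 48000 -ac 1 -c:a libopus -b:a 64k voice.ogg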
await runFfmpeg([
"-hide_banner",
"-loglevel",
"error",
"-y",
"-i",
inputPath,
"-vn",
"-sn",
"-dn",
"-t",
String(MEDIA_FFMPEG_MAX_AUDIO_DURATION_SECS),
"-ar",
String(FEISHU_VOICE_SAMPLE_RATE_HZ),
"-ac",
"1",
"-c:a",
"libopus",
"-b:a",
FEISHU_VOICE_BITRATE,
outputPath,
]);
return {
buffer: await fs.promises.readFile(outputPath),
fileName: FEISHU_VOICE_FILE_NAME,
contentType: "audio/ogg",
};
} finally {
await fs.promises.rm(tempDir, { recursive: true, force: true });
}
}
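// Pass native Ogg/Opus through untouched. Transcode other likely audio only
// when voice delivery was requested; if ffmpeg is unavailable or fails, return
// the original bytes so the send falls back to a regular file attachment.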
async function prepareFeishuVoiceMedia(params: {
buffer: Buffer;
fileName: string;
contentType?: string;
audioAsVoice?: boolean;
}): Promise<{ buffer: Buffer; fileName: string; contentType?: string }> {
if (isFeishuNativeVoiceAudio(params)) {
return params;
}
if (params.audioAsVoice !== true || !isLikelyTranscodableAudio(params)) {
return params;
}
try {
return await transcodeToFeishuVoiceOpus(params);
} catch (err) {
console.warn(
`[feishu] audioAsVoice transcode failed; sending ${params.fileName} as a file attachment:`,
err,
);
return params;
}
}
/**
* Upload and send media (image or file) from URL, local path, or buffer.
* When mediaUrl is a local path, mediaLocalRoots (from core outbound context)
@@ -584,6 +689,8 @@ export async function sendMediaFeishu(params: {
accountId?: string;
/** Allowed roots for local path reads; required for local filePath to work. */
mediaLocalRoots?: readonly string[];
/** When true, transcode compatible audio to Feishu native Ogg/Opus voice bubbles. */
audioAsVoice?: boolean;
}): Promise<SendMediaResult> {
const {
cfg,
@@ -595,6 +702,7 @@ export async function sendMediaFeishu(params: {
replyInThread,
accountId,
mediaLocalRoots,
audioAsVoice,
} = params;
const account = resolveFeishuRuntimeAccount({ cfg, accountId });
if (!account.configured) {
@@ -622,6 +730,16 @@ export async function sendMediaFeishu(params: {
throw new Error("Either mediaUrl or mediaBuffer must be provided");
}
const prepared = await prepareFeishuVoiceMedia({
buffer,
fileName: name,
contentType,
audioAsVoice,
});
buffer = prepared.buffer;
name = prepared.fileName;
contentType = prepared.contentType;
const routing = resolveFeishuOutboundMediaKind({ fileName: name, contentType });
if (routing.msgType === "image") {
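Downstream callers opt in with the new flag; a minimal sketch, with shapes
taken from this diff's tests:

```ts
await sendMediaFeishu({
  cfg: emptyConfig,
  to: "user:ou_target",
  mediaUrl: "https://example.com/reply.mp3",
  audioAsVoice: true, // transcode to Ogg/Opus and send msg_type=audio
});
```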

View File

@@ -457,6 +457,24 @@ describe("feishuOutbound.sendMedia replyToId forwarding", () => {
);
});
it("forwards audioAsVoice to sendMediaFeishu", async () => {
await feishuOutbound.sendMedia?.({
cfg: emptyConfig,
to: "chat_1",
text: "",
mediaUrl: "https://example.com/reply.mp3",
audioAsVoice: true,
accountId: "main",
});
expect(sendMediaFeishuMock).toHaveBeenCalledWith(
expect.objectContaining({
mediaUrl: "https://example.com/reply.mp3",
audioAsVoice: true,
}),
);
});
it("forwards replyToId to text caption send", async () => {
await feishuOutbound.sendMedia?.({
cfg: emptyConfig,

View File

@@ -232,6 +232,7 @@ export const feishuOutbound: ChannelOutboundAdapter = {
to,
text,
mediaUrl,
audioAsVoice,
accountId,
mediaLocalRoots,
replyToId,
@@ -271,6 +272,7 @@ export const feishuOutbound: ChannelOutboundAdapter = {
accountId: accountId ?? undefined,
mediaLocalRoots,
replyToMessageId,
...(audioAsVoice === true ? { audioAsVoice: true } : {}),
});
} catch (err) {
// Log the error for debugging

View File

@@ -469,6 +469,21 @@ describe("createFeishuReplyDispatcher streaming behavior", () => {
expect(sendMarkdownCardFeishuMock).not.toHaveBeenCalled();
});
it("passes audioAsVoice to media attachments", async () => {
const { options } = createDispatcherHarness();
await options.deliver(
{ mediaUrl: "https://example.com/reply.mp3", audioAsVoice: true },
{ kind: "final" },
);
expect(sendMediaFeishuMock).toHaveBeenCalledWith(
expect.objectContaining({
mediaUrl: "https://example.com/reply.mp3",
audioAsVoice: true,
}),
);
});
it("falls back to legacy mediaUrl when mediaUrls is an empty array", async () => {
const { options } = createDispatcherHarness();
await options.deliver(

View File

@@ -396,6 +396,7 @@ export function createFeishuReplyDispatcher(params: CreateFeishuReplyDispatcherP
replyToMessageId: sendReplyToMessageId,
replyInThread: effectiveReplyInThread,
accountId,
...(payload.audioAsVoice === true ? { audioAsVoice: true } : {}),
});
},
});

View File

@@ -118,6 +118,36 @@ describe("speech-core native voice-note routing", () => {
});
});
it("marks Feishu voice-note TTS for channel-side transcoding when provider returns mp3", async () => {
synthesizeMock.mockResolvedValueOnce({
audioBuffer: Buffer.from("mp3"),
outputFormat: "mp3",
fileExtension: ".mp3",
voiceCompatible: false,
});
const cfg = createTtsConfig("openclaw-speech-core-tts-feishu-mp3-test");
let mediaDir: string | undefined;
try {
const result = await maybeApplyTtsToPayload({
payload: { text: "This Feishu reply should be transcoded by the channel." },
cfg,
channel: "feishu",
kind: "final",
});
expect(synthesizeMock).toHaveBeenCalledWith(
expect.objectContaining({ target: "voice-note" }),
);
expect(result.audioAsVoice).toBe(true);
expect(result.mediaUrl).toMatch(/voice-\d+\.mp3$/);
mediaDir = result.mediaUrl ? path.dirname(result.mediaUrl) : undefined;
} finally {
if (mediaDir) {
rmSync(mediaDir, { recursive: true, force: true });
}
}
});
it("keeps non-native voice-note channels as regular audio files", async () => {
await expectTtsPayloadResult({
channel: "slack",

View File

@@ -100,6 +100,8 @@ export type TtsResult = {
attempts?: TtsProviderAttempt[];
outputFormat?: string;
voiceCompatible?: boolean;
audioAsVoice?: boolean;
target?: "audio-file" | "voice-note";
};
export type TtsSynthesisResult = {
@@ -114,6 +116,7 @@ export type TtsSynthesisResult = {
outputFormat?: string;
voiceCompatible?: boolean;
fileExtension?: string;
target?: "audio-file" | "voice-note";
};
export type TtsTelephonyResult = {
@@ -586,6 +589,7 @@ export function setLastTtsAttempt(entry: TtsStatusEntry | undefined): void {
}
const OPUS_CHANNELS = new Set(["telegram", "feishu", "whatsapp", "matrix", "discord"]);
const TRANSCODED_VOICE_NOTE_CHANNELS = new Set(["feishu"]);
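// OPUS_CHANNELS accept native voice-note TTS; TRANSCODED_VOICE_NOTE_CHANNELS
// can additionally transcode non-Opus TTS output channel-side (currently only Feishu).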
function resolveChannelId(channel: string | undefined): ChannelId | null {
return channel ? normalizeChannelId(channel) : null;
@@ -596,6 +600,22 @@ function supportsNativeVoiceNoteTts(channel: string | undefined): boolean {
return channelId !== null && OPUS_CHANNELS.has(channelId);
}
function supportsTranscodedVoiceNoteTts(channel: string | undefined): boolean {
const channelId = resolveChannelId(channel);
return channelId !== null && TRANSCODED_VOICE_NOTE_CHANNELS.has(channelId);
}
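// Voice delivery requires a voice-note-capable channel and a voice-note
// target; the payload then qualifies if the provider output is already
// voice-compatible or the channel can transcode it itself.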
function shouldDeliverTtsAsVoice(params: {
channel: string | undefined;
target: "audio-file" | "voice-note" | undefined;
voiceCompatible: boolean | undefined;
}): boolean {
if (!supportsNativeVoiceNoteTts(params.channel) || params.target !== "voice-note") {
return false;
}
return params.voiceCompatible === true || supportsTranscodedVoiceNoteTts(params.channel);
}
export function resolveTtsProviderOrder(primary: TtsProvider, cfg?: OpenClawConfig): TtsProvider[] {
const normalizedPrimary = canonicalizeSpeechProviderId(primary, cfg) ?? primary;
const ordered = new Set<TtsProvider>([normalizedPrimary]);
@@ -782,6 +802,12 @@ export async function textToSpeech(params: {
attempts: synthesis.attempts,
outputFormat: synthesis.outputFormat,
voiceCompatible: synthesis.voiceCompatible,
audioAsVoice: shouldDeliverTtsAsVoice({
channel: params.channel,
target: synthesis.target,
voiceCompatible: synthesis.voiceCompatible,
}),
target: synthesis.target,
};
}
@@ -863,6 +889,7 @@ export async function synthesizeSpeech(params: {
outputFormat: synthesis.outputFormat,
voiceCompatible: synthesis.voiceCompatible,
fileExtension: synthesis.fileExtension,
target,
};
} catch (err) {
const errorMsg = formatTtsProviderError(provider, err);
@@ -1171,12 +1198,10 @@ export async function maybeApplyTtsToPayload(params: {
latencyMs: result.latencyMs,
};
const shouldVoice =
supportsNativeVoiceNoteTts(params.channel) && result.voiceCompatible === true;
return {
...nextPayload,
mediaUrl: result.audioPath,
audioAsVoice: shouldVoice || params.payload.audioAsVoice,
audioAsVoice: result.audioAsVoice || params.payload.audioAsVoice,
};
}
@@ -1199,6 +1224,8 @@ export const _test = {
parseTtsDirectives,
resolveModelOverridePolicy,
supportsNativeVoiceNoteTts,
supportsTranscodedVoiceNoteTts,
shouldDeliverTtsAsVoice,
summarizeText,
getResolvedSpeechProviderConfig,
formatTtsProviderError,

View File

@@ -43,6 +43,28 @@ describe("createTtsTool", () => {
expect(JSON.stringify(result.content)).not.toContain("MEDIA:");
});
it("uses audioAsVoice from the TTS runtime even when the provider output is not native", async () => {
textToSpeechSpy.mockResolvedValue({
success: true,
audioPath: "/tmp/reply.mp3",
provider: "test",
voiceCompatible: false,
audioAsVoice: true,
});
const tool = createTtsTool();
const result = await tool.execute("call-1", { text: "hello", channel: "feishu" });
expect(result).toMatchObject({
details: {
media: {
mediaUrl: "/tmp/reply.mp3",
audioAsVoice: true,
},
},
});
});
it("passes an optional timeout to speech generation", async () => {
textToSpeechSpy.mockResolvedValue({
success: true,

View File

@@ -92,7 +92,7 @@ export function createTtsTool(opts?: {
media: {
mediaUrl: result.audioPath,
trustedLocalMedia: true,
...(result.voiceCompatible ? { audioAsVoice: true } : {}),
...(result.audioAsVoice || result.voiceCompatible ? { audioAsVoice: true } : {}),
},
},
};

View File

@@ -41,6 +41,8 @@ export type TtsStatusEntry = {
error?: string;
};
export type TtsSpeechTarget = "audio-file" | "voice-note";
export type SummarizeResult = {
summary: string;
latencyMs: number;
@@ -99,6 +101,12 @@ export type TtsTestFacade = {
parseTtsDirectives: (...args: unknown[]) => TtsDirectiveParseResult;
resolveModelOverridePolicy: (...args: unknown[]) => ResolvedTtsModelOverrides;
supportsNativeVoiceNoteTts: (channel: string | undefined) => boolean;
supportsTranscodedVoiceNoteTts: (channel: string | undefined) => boolean;
shouldDeliverTtsAsVoice: (params: {
channel: string | undefined;
target: TtsSpeechTarget | undefined;
voiceCompatible: boolean | undefined;
}) => boolean;
summarizeText: (...args: unknown[]) => Promise<SummarizeResult>;
getResolvedSpeechProviderConfig: (
config: ResolvedTtsConfig,
@@ -120,6 +128,8 @@ export type TtsResult = {
attempts?: TtsProviderAttempt[];
outputFormat?: string;
voiceCompatible?: boolean;
audioAsVoice?: boolean;
target?: TtsSpeechTarget;
};
export type TtsSynthesisResult = {
@@ -134,6 +144,7 @@ export type TtsSynthesisResult = {
outputFormat?: string;
voiceCompatible?: boolean;
fileExtension?: string;
target?: TtsSpeechTarget;
};
export type TtsTelephonyResult = {