mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-06 11:20:43 +00:00
fix(feishu): transcode voice TTS audio
This commit is contained in:
@@ -21,6 +21,7 @@ Docs: https://docs.openclaw.ai
|
||||
- Control UI: make `/usage` use the fresh context snapshot for context percentage, and include cache-write tokens in the Usage overview cache-hit denominator. Fixes #47885. Thanks @imwyvern and @Ante042.
|
||||
- GitHub Copilot: preserve encrypted Responses reasoning item IDs during replay so Copilot can validate encrypted reasoning payloads across requests. (#71448) Thanks @a410979729-sys.
|
||||
- Agents/replies: recover final-answer text when streamed assistant chunks contain only whitespace, preventing completed turns from surfacing as empty-payload errors. Fixes #71454. (#71467) Thanks @Sanjays2402.
|
||||
- Feishu/TTS: transcode voice-intent MP3 and other audio replies to Ogg/Opus before sending native Feishu audio bubbles, while keeping ordinary MP3 attachments as files. Fixes #61249 and #37868.
|
||||
- Telegram/webhook: acknowledge validated webhook updates before running bot middleware, keeping slow agent turns from tripping Telegram delivery retries while preserving per-chat processing lanes. Fixes #71392. Thanks @joelforsberg46-source.
|
||||
- MCP: retire one-shot embedded bundled MCP runtimes at run end, skip bundle-MCP startup when a runtime tool allowlist cannot reach bundle-MCP tools, and add `mcp.sessionIdleTtlMs` idle eviction for leaked session runtimes. Fixes #71106, #71110, #70389, and #70808.
|
||||
- MCP/config reload: hot-apply `mcp.*` changes by disposing cached session MCP runtimes, and dispose bundled MCP runtimes during gateway shutdown so removed `mcp.servers` entries reap child processes promptly. Fixes #60656.
|
||||
|
||||
@@ -424,6 +424,14 @@ Full configuration: [Gateway configuration](/gateway/configuration)
|
||||
- ✅ Interactive cards (including streaming updates)
|
||||
- ⚠️ Rich text (post-style formatting; doesn't support full Feishu/Lark authoring capabilities)
|
||||
|
||||
Native Feishu/Lark audio bubbles use the Feishu `audio` message type and require
|
||||
Ogg/Opus upload media (`file_type: "opus"`). Existing `.opus` and `.ogg` media
|
||||
is sent directly as native audio. MP3/WAV/M4A and other likely audio formats are
|
||||
transcoded to 48kHz Ogg/Opus with `ffmpeg` only when the reply requests voice
|
||||
delivery (`audioAsVoice` / message tool `asVoice`, including TTS voice-note
|
||||
replies). Ordinary MP3 attachments stay regular files. If `ffmpeg` is missing or
|
||||
conversion fails, OpenClaw falls back to a file attachment and logs the reason.
|
||||
|
||||
### Threads and replies
|
||||
|
||||
- ✅ Inline replies
|
||||
|
||||
@@ -489,8 +489,12 @@ These override `messages.tts.*` for that host.
|
||||
|
||||
## Output formats (fixed)
|
||||
|
||||
- **Feishu / Matrix / Telegram / WhatsApp**: Opus voice message (`opus_48000_64` from ElevenLabs, `opus` from OpenAI).
|
||||
- **Feishu / Matrix / Telegram / WhatsApp**: voice-note replies prefer Opus (`opus_48000_64` from ElevenLabs, `opus` from OpenAI).
|
||||
- 48kHz / 64kbps is a good voice message tradeoff.
|
||||
- **Feishu**: when a voice-note reply is produced as MP3/WAV/M4A or another
|
||||
likely audio file, the Feishu plugin transcodes it to 48kHz Ogg/Opus with
|
||||
`ffmpeg` before sending the native `audio` bubble. If conversion fails, Feishu
|
||||
receives the original file as an attachment.
|
||||
- **Other channels**: MP3 (`mp3_44100_128` from ElevenLabs, `mp3` from OpenAI).
|
||||
- 44.1kHz / 128kbps is the default balance for speech clarity.
|
||||
- **MiniMax**: MP3 (`speech-2.8-hd` model, 32kHz sample rate) for normal audio attachments. For voice-note targets such as Feishu and Telegram, OpenClaw transcodes the MiniMax MP3 to 48kHz Opus with `ffmpeg` before delivery.
|
||||
@@ -572,6 +576,8 @@ Notes:
|
||||
The `tts` tool converts text to speech and returns an audio attachment for
|
||||
reply delivery. When the channel is Feishu, Matrix, Telegram, or WhatsApp,
|
||||
the audio is delivered as a voice message rather than a file attachment.
|
||||
Feishu can transcode non-Opus TTS output on this path when `ffmpeg` is
|
||||
available.
|
||||
It accepts optional `channel` and `timeoutMs` fields; `timeoutMs` is a
|
||||
per-call provider request timeout in milliseconds.
|
||||
|
||||
|
||||
@@ -461,6 +461,34 @@ describe("feishuPlugin actions", () => {
|
||||
expect(result?.details).toMatchObject({ messageId: "om_media" });
|
||||
});
|
||||
|
||||
it("passes asVoice through media sends", async () => {
|
||||
feishuOutboundSendMediaMock.mockResolvedValueOnce({
|
||||
channel: "feishu",
|
||||
messageId: "om_voice",
|
||||
details: { messageId: "om_voice", chatId: "oc_group_1" },
|
||||
});
|
||||
|
||||
await feishuPlugin.actions?.handleAction?.({
|
||||
action: "send",
|
||||
params: {
|
||||
to: "chat:oc_group_1",
|
||||
media: "https://example.com/reply.mp3",
|
||||
asVoice: true,
|
||||
},
|
||||
cfg,
|
||||
accountId: undefined,
|
||||
toolContext: {},
|
||||
mediaLocalRoots: [],
|
||||
} as never);
|
||||
|
||||
expect(feishuOutboundSendMediaMock).toHaveBeenCalledWith(
|
||||
expect.objectContaining({
|
||||
mediaUrl: "https://example.com/reply.mp3",
|
||||
audioAsVoice: true,
|
||||
}),
|
||||
);
|
||||
});
|
||||
|
||||
it("reads messages", async () => {
|
||||
getMessageFeishuMock.mockResolvedValueOnce({
|
||||
messageId: "om_1",
|
||||
|
||||
@@ -81,6 +81,16 @@ function readFeishuMediaParam(params: Record<string, unknown>): string | undefin
|
||||
return media.trim() ? media : undefined;
|
||||
}
|
||||
|
||||
function readBooleanParam(params: Record<string, unknown>, keys: string[]): boolean | undefined {
|
||||
for (const key of keys) {
|
||||
const value = params[key];
|
||||
if (typeof value === "boolean") {
|
||||
return value;
|
||||
}
|
||||
}
|
||||
return undefined;
|
||||
}
|
||||
|
||||
function hasLegacyFeishuCardCommandValue(actionValue: unknown): boolean {
|
||||
return (
|
||||
isRecord(actionValue) &&
|
||||
@@ -695,6 +705,7 @@ export const feishuPlugin: ChannelPlugin<ResolvedFeishuAccount, FeishuProbeResul
|
||||
const presentation = normalizeMessagePresentation(ctx.params.presentation);
|
||||
const text = readFirstString(ctx.params, ["text", "message"]);
|
||||
const mediaUrl = readFeishuMediaParam(ctx.params);
|
||||
const audioAsVoice = readBooleanParam(ctx.params, ["asVoice", "audioAsVoice"]);
|
||||
const card = presentation
|
||||
? buildFeishuPresentationCard({ presentation, fallbackText: text })
|
||||
: undefined;
|
||||
@@ -734,6 +745,7 @@ export const feishuPlugin: ChannelPlugin<ResolvedFeishuAccount, FeishuProbeResul
|
||||
accountId: ctx.accountId ?? undefined,
|
||||
mediaLocalRoots: ctx.mediaLocalRoots,
|
||||
replyToId: replyToMessageId,
|
||||
...(audioAsVoice === true ? { audioAsVoice: true } : {}),
|
||||
});
|
||||
} else {
|
||||
result = await runtime.sendMessageFeishu({
|
||||
|
||||
@@ -9,6 +9,7 @@ const resolveFeishuAccountMock = vi.hoisted(() => vi.fn());
|
||||
// vi.hoisted guarantees these mock functions exist before the vi.mock
// factories below are evaluated (vi.mock calls are hoisted by vitest).
const normalizeFeishuTargetMock = vi.hoisted(() => vi.fn());
const resolveReceiveIdTypeMock = vi.hoisted(() => vi.fn());
const loadWebMediaMock = vi.hoisted(() => vi.fn());
const runFfmpegMock = vi.hoisted(() => vi.fn());

// Lark SDK client surface mocks.
const fileCreateMock = vi.hoisted(() => vi.fn());
const imageCreateMock = vi.hoisted(() => vi.fn());
|
||||
@@ -42,6 +43,14 @@ vi.mock("./runtime.js", () => ({
|
||||
}),
|
||||
}));
|
||||
|
||||
vi.mock("openclaw/plugin-sdk/media-runtime", async (importOriginal) => {
|
||||
const actual = await importOriginal<typeof import("openclaw/plugin-sdk/media-runtime")>();
|
||||
return {
|
||||
...actual,
|
||||
runFfmpeg: runFfmpegMock,
|
||||
};
|
||||
});
|
||||
|
||||
vi.mock("../../../src/channels/plugins/bundled.js", () => ({
|
||||
bundledChannelPlugins: [],
|
||||
bundledChannelSetupPlugins: [],
|
||||
@@ -145,6 +154,10 @@ describe("sendMediaFeishu msg_type routing", () => {
|
||||
|
||||
imageGetMock.mockResolvedValue(Buffer.from("image-bytes"));
|
||||
messageResourceGetMock.mockResolvedValue(Buffer.from("resource-bytes"));
|
||||
runFfmpegMock.mockImplementation(async (args: string[]) => {
|
||||
await fs.writeFile(args.at(-1) ?? "", Buffer.from("opus-output"));
|
||||
return "";
|
||||
});
|
||||
});
|
||||
|
||||
it("uses msg_type=media for mp4 video", async () => {
|
||||
@@ -260,6 +273,104 @@ describe("sendMediaFeishu msg_type routing", () => {
|
||||
data: expect.objectContaining({ msg_type: "file" }),
|
||||
}),
|
||||
);
|
||||
expect(runFfmpegMock).not.toHaveBeenCalled();
|
||||
});
|
||||
|
||||
it("transcodes voice-intent mp3 to msg_type=audio", async () => {
|
||||
loadWebMediaMock.mockResolvedValueOnce({
|
||||
buffer: Buffer.from("remote-mp3"),
|
||||
fileName: "reply.mp3",
|
||||
kind: "audio",
|
||||
contentType: "audio/mpeg",
|
||||
});
|
||||
|
||||
await sendMediaFeishu({
|
||||
cfg: emptyConfig,
|
||||
to: "user:ou_target",
|
||||
mediaUrl: "https://example.com/reply.mp3",
|
||||
audioAsVoice: true,
|
||||
});
|
||||
|
||||
expect(runFfmpegMock).toHaveBeenCalledWith(
|
||||
expect.arrayContaining(["-c:a", "libopus", "-ar", "48000", "-b:a", "64k"]),
|
||||
);
|
||||
expect(fileCreateMock).toHaveBeenCalledWith(
|
||||
expect.objectContaining({
|
||||
data: expect.objectContaining({
|
||||
file_type: "opus",
|
||||
file_name: "voice.ogg",
|
||||
file: Buffer.from("opus-output"),
|
||||
}),
|
||||
}),
|
||||
);
|
||||
expect(messageCreateMock).toHaveBeenCalledWith(
|
||||
expect.objectContaining({
|
||||
data: expect.objectContaining({ msg_type: "audio" }),
|
||||
}),
|
||||
);
|
||||
});
|
||||
|
||||
it("leaves native voice audio unchanged when audioAsVoice is true", async () => {
|
||||
await sendMediaFeishu({
|
||||
cfg: emptyConfig,
|
||||
to: "user:ou_target",
|
||||
mediaBuffer: Buffer.from("opus"),
|
||||
fileName: "reply.ogg",
|
||||
audioAsVoice: true,
|
||||
});
|
||||
|
||||
expect(runFfmpegMock).not.toHaveBeenCalled();
|
||||
expect(fileCreateMock).toHaveBeenCalledWith(
|
||||
expect.objectContaining({
|
||||
data: expect.objectContaining({
|
||||
file_type: "opus",
|
||||
file_name: "reply.ogg",
|
||||
}),
|
||||
}),
|
||||
);
|
||||
expect(messageCreateMock).toHaveBeenCalledWith(
|
||||
expect.objectContaining({
|
||||
data: expect.objectContaining({ msg_type: "audio" }),
|
||||
}),
|
||||
);
|
||||
});
|
||||
|
||||
it("falls back to file when voice-intent audio cannot be transcoded", async () => {
|
||||
const warnSpy = vi.spyOn(console, "warn").mockImplementation(() => undefined);
|
||||
runFfmpegMock.mockRejectedValueOnce(new Error("ffmpeg missing"));
|
||||
loadWebMediaMock.mockResolvedValueOnce({
|
||||
buffer: Buffer.from("remote-mp3"),
|
||||
fileName: "reply.mp3",
|
||||
kind: "audio",
|
||||
contentType: "audio/mpeg",
|
||||
});
|
||||
|
||||
await sendMediaFeishu({
|
||||
cfg: emptyConfig,
|
||||
to: "user:ou_target",
|
||||
mediaUrl: "https://example.com/reply.mp3",
|
||||
audioAsVoice: true,
|
||||
});
|
||||
|
||||
expect(fileCreateMock).toHaveBeenCalledWith(
|
||||
expect.objectContaining({
|
||||
data: expect.objectContaining({
|
||||
file_type: "stream",
|
||||
file_name: "reply.mp3",
|
||||
file: Buffer.from("remote-mp3"),
|
||||
}),
|
||||
}),
|
||||
);
|
||||
expect(messageCreateMock).toHaveBeenCalledWith(
|
||||
expect.objectContaining({
|
||||
data: expect.objectContaining({ msg_type: "file" }),
|
||||
}),
|
||||
);
|
||||
expect(warnSpy).toHaveBeenCalledWith(
|
||||
expect.stringContaining("audioAsVoice transcode failed"),
|
||||
expect.any(Error),
|
||||
);
|
||||
warnSpy.mockRestore();
|
||||
});
|
||||
|
||||
it("configures the media client timeout for image uploads", async () => {
|
||||
|
||||
@@ -3,7 +3,11 @@ import path from "node:path";
|
||||
import { Readable } from "node:stream";
|
||||
import type * as Lark from "@larksuiteoapi/node-sdk";
|
||||
import { mediaKindFromMime } from "openclaw/plugin-sdk/media-mime";
|
||||
import { withTempDownloadPath } from "openclaw/plugin-sdk/temp-path";
|
||||
import { MEDIA_FFMPEG_MAX_AUDIO_DURATION_SECS, runFfmpeg } from "openclaw/plugin-sdk/media-runtime";
|
||||
import {
|
||||
resolvePreferredOpenClawTmpDir,
|
||||
withTempDownloadPath,
|
||||
} from "openclaw/plugin-sdk/temp-path";
|
||||
import { normalizeLowercaseStringOrEmpty } from "openclaw/plugin-sdk/text-runtime";
|
||||
import type { ClawdbotConfig } from "../runtime-api.js";
|
||||
import { resolveFeishuRuntimeAccount } from "./accounts.js";
|
||||
@@ -14,6 +18,24 @@ import { assertFeishuMessageApiSuccess, toFeishuSendResult } from "./send-result
|
||||
import { resolveFeishuSendTarget } from "./send-target.js";
|
||||
|
||||
// HTTP timeout for Feishu media upload/download calls (2 minutes).
const FEISHU_MEDIA_HTTP_TIMEOUT_MS = 120_000;
// Canonical output file name for transcoded voice bubbles; Feishu native
// audio requires Ogg/Opus uploads (file_type: "opus").
const FEISHU_VOICE_FILE_NAME = "voice.ogg";
// 48 kHz / 64 kbps is the voice-message tradeoff documented for this plugin.
const FEISHU_VOICE_SAMPLE_RATE_HZ = 48_000;
const FEISHU_VOICE_BITRATE = "64k";

// File extensions treated as "likely audio" worth attempting an ffmpeg
// transcode to Opus. Note .opus/.ogg are deliberately absent: those are
// already native and are sent without conversion.
const FEISHU_TRANSCODABLE_AUDIO_EXTS = new Set([
  ".aac",
  ".aiff",
  ".alac",
  ".amr",
  ".caf",
  ".flac",
  ".m4a",
  ".mp3",
  ".oga",
  ".wav",
  ".webm",
  ".wma",
]);
|
||||
|
||||
export type DownloadImageResult = {
|
||||
buffer: Buffer;
|
||||
@@ -568,6 +590,89 @@ function resolveFeishuOutboundMediaKind(params: { fileName: string; contentType?
|
||||
};
|
||||
}
|
||||
|
||||
function isFeishuNativeVoiceAudio(params: { fileName: string; contentType?: string }): boolean {
|
||||
const ext = normalizeLowercaseStringOrEmpty(path.extname(params.fileName));
|
||||
const contentType = normalizeLowercaseStringOrEmpty(params.contentType);
|
||||
return (
|
||||
ext === ".opus" || ext === ".ogg" || contentType === "audio/ogg" || contentType === "audio/opus"
|
||||
);
|
||||
}
|
||||
|
||||
function isLikelyTranscodableAudio(params: { fileName: string; contentType?: string }): boolean {
|
||||
const ext = normalizeLowercaseStringOrEmpty(path.extname(params.fileName));
|
||||
const contentType = normalizeLowercaseStringOrEmpty(params.contentType);
|
||||
return FEISHU_TRANSCODABLE_AUDIO_EXTS.has(ext) || mediaKindFromMime(contentType) === "audio";
|
||||
}
|
||||
|
||||
/**
 * Transcode an arbitrary audio buffer to 48 kHz mono Ogg/Opus suitable for a
 * native Feishu voice bubble.
 *
 * Writes the input into a freshly created private temp directory, shells out
 * to ffmpeg, reads the converted file back, and always removes the temp
 * directory afterwards. Throws when ffmpeg is unavailable or conversion
 * fails; the caller degrades to a plain file attachment in that case.
 */
async function transcodeToFeishuVoiceOpus(params: {
  buffer: Buffer;
  fileName: string;
  contentType?: string;
}): Promise<{ buffer: Buffer; fileName: string; contentType: string }> {
  const tempRoot = resolvePreferredOpenClawTmpDir();
  // 0o700 keeps the scratch root private to the current user.
  await fs.promises.mkdir(tempRoot, { recursive: true, mode: 0o700 });
  const tempDir = await fs.promises.mkdtemp(path.join(tempRoot, "feishu-voice-"));
  try {
    const ext = normalizeLowercaseStringOrEmpty(path.extname(params.fileName));
    // Keep the original extension as a container hint for ffmpeg, but guard
    // against oddly long extensions with a neutral fallback.
    const inputExt = ext && ext.length <= 12 ? ext : ".audio";
    const inputPath = path.join(tempDir, `input${inputExt}`);
    const outputPath = path.join(tempDir, FEISHU_VOICE_FILE_NAME);
    await fs.promises.writeFile(inputPath, params.buffer, { mode: 0o600 });
    await runFfmpeg([
      "-hide_banner",
      "-loglevel",
      "error",
      "-y",
      "-i",
      inputPath,
      // Drop video/subtitle/data streams: voice bubbles are audio-only.
      "-vn",
      "-sn",
      "-dn",
      // Bound the output length to the configured maximum audio duration.
      "-t",
      String(MEDIA_FFMPEG_MAX_AUDIO_DURATION_SECS),
      "-ar",
      String(FEISHU_VOICE_SAMPLE_RATE_HZ),
      // Downmix to mono before encoding with libopus at the voice bitrate.
      "-ac",
      "1",
      "-c:a",
      "libopus",
      "-b:a",
      FEISHU_VOICE_BITRATE,
      outputPath,
    ]);
    return {
      buffer: await fs.promises.readFile(outputPath),
      fileName: FEISHU_VOICE_FILE_NAME,
      contentType: "audio/ogg",
    };
  } finally {
    // Best-effort cleanup of the scratch directory on success or failure.
    await fs.promises.rm(tempDir, { recursive: true, force: true });
  }
}
|
||||
|
||||
async function prepareFeishuVoiceMedia(params: {
|
||||
buffer: Buffer;
|
||||
fileName: string;
|
||||
contentType?: string;
|
||||
audioAsVoice?: boolean;
|
||||
}): Promise<{ buffer: Buffer; fileName: string; contentType?: string }> {
|
||||
if (isFeishuNativeVoiceAudio(params)) {
|
||||
return params;
|
||||
}
|
||||
if (params.audioAsVoice !== true || !isLikelyTranscodableAudio(params)) {
|
||||
return params;
|
||||
}
|
||||
try {
|
||||
return await transcodeToFeishuVoiceOpus(params);
|
||||
} catch (err) {
|
||||
console.warn(
|
||||
`[feishu] audioAsVoice transcode failed; sending ${params.fileName} as a file attachment:`,
|
||||
err,
|
||||
);
|
||||
return params;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Upload and send media (image or file) from URL, local path, or buffer.
|
||||
* When mediaUrl is a local path, mediaLocalRoots (from core outbound context)
|
||||
@@ -584,6 +689,8 @@ export async function sendMediaFeishu(params: {
|
||||
accountId?: string;
|
||||
/** Allowed roots for local path reads; required for local filePath to work. */
|
||||
mediaLocalRoots?: readonly string[];
|
||||
/** When true, transcode compatible audio to Feishu native Ogg/Opus voice bubbles. */
|
||||
audioAsVoice?: boolean;
|
||||
}): Promise<SendMediaResult> {
|
||||
const {
|
||||
cfg,
|
||||
@@ -595,6 +702,7 @@ export async function sendMediaFeishu(params: {
|
||||
replyInThread,
|
||||
accountId,
|
||||
mediaLocalRoots,
|
||||
audioAsVoice,
|
||||
} = params;
|
||||
const account = resolveFeishuRuntimeAccount({ cfg, accountId });
|
||||
if (!account.configured) {
|
||||
@@ -622,6 +730,16 @@ export async function sendMediaFeishu(params: {
|
||||
throw new Error("Either mediaUrl or mediaBuffer must be provided");
|
||||
}
|
||||
|
||||
const prepared = await prepareFeishuVoiceMedia({
|
||||
buffer,
|
||||
fileName: name,
|
||||
contentType,
|
||||
audioAsVoice,
|
||||
});
|
||||
buffer = prepared.buffer;
|
||||
name = prepared.fileName;
|
||||
contentType = prepared.contentType;
|
||||
|
||||
const routing = resolveFeishuOutboundMediaKind({ fileName: name, contentType });
|
||||
|
||||
if (routing.msgType === "image") {
|
||||
|
||||
@@ -457,6 +457,24 @@ describe("feishuOutbound.sendMedia replyToId forwarding", () => {
|
||||
);
|
||||
});
|
||||
|
||||
it("forwards audioAsVoice to sendMediaFeishu", async () => {
|
||||
await feishuOutbound.sendMedia?.({
|
||||
cfg: emptyConfig,
|
||||
to: "chat_1",
|
||||
text: "",
|
||||
mediaUrl: "https://example.com/reply.mp3",
|
||||
audioAsVoice: true,
|
||||
accountId: "main",
|
||||
});
|
||||
|
||||
expect(sendMediaFeishuMock).toHaveBeenCalledWith(
|
||||
expect.objectContaining({
|
||||
mediaUrl: "https://example.com/reply.mp3",
|
||||
audioAsVoice: true,
|
||||
}),
|
||||
);
|
||||
});
|
||||
|
||||
it("forwards replyToId to text caption send", async () => {
|
||||
await feishuOutbound.sendMedia?.({
|
||||
cfg: emptyConfig,
|
||||
|
||||
@@ -232,6 +232,7 @@ export const feishuOutbound: ChannelOutboundAdapter = {
|
||||
to,
|
||||
text,
|
||||
mediaUrl,
|
||||
audioAsVoice,
|
||||
accountId,
|
||||
mediaLocalRoots,
|
||||
replyToId,
|
||||
@@ -271,6 +272,7 @@ export const feishuOutbound: ChannelOutboundAdapter = {
|
||||
accountId: accountId ?? undefined,
|
||||
mediaLocalRoots,
|
||||
replyToMessageId,
|
||||
...(audioAsVoice === true ? { audioAsVoice: true } : {}),
|
||||
});
|
||||
} catch (err) {
|
||||
// Log the error for debugging
|
||||
|
||||
@@ -469,6 +469,21 @@ describe("createFeishuReplyDispatcher streaming behavior", () => {
|
||||
expect(sendMarkdownCardFeishuMock).not.toHaveBeenCalled();
|
||||
});
|
||||
|
||||
it("passes audioAsVoice to media attachments", async () => {
|
||||
const { options } = createDispatcherHarness();
|
||||
await options.deliver(
|
||||
{ mediaUrl: "https://example.com/reply.mp3", audioAsVoice: true },
|
||||
{ kind: "final" },
|
||||
);
|
||||
|
||||
expect(sendMediaFeishuMock).toHaveBeenCalledWith(
|
||||
expect.objectContaining({
|
||||
mediaUrl: "https://example.com/reply.mp3",
|
||||
audioAsVoice: true,
|
||||
}),
|
||||
);
|
||||
});
|
||||
|
||||
it("falls back to legacy mediaUrl when mediaUrls is an empty array", async () => {
|
||||
const { options } = createDispatcherHarness();
|
||||
await options.deliver(
|
||||
|
||||
@@ -396,6 +396,7 @@ export function createFeishuReplyDispatcher(params: CreateFeishuReplyDispatcherP
|
||||
replyToMessageId: sendReplyToMessageId,
|
||||
replyInThread: effectiveReplyInThread,
|
||||
accountId,
|
||||
...(payload.audioAsVoice === true ? { audioAsVoice: true } : {}),
|
||||
});
|
||||
},
|
||||
});
|
||||
|
||||
@@ -118,6 +118,36 @@ describe("speech-core native voice-note routing", () => {
|
||||
});
|
||||
});
|
||||
|
||||
it("marks Feishu voice-note TTS for channel-side transcoding when provider returns mp3", async () => {
|
||||
synthesizeMock.mockResolvedValueOnce({
|
||||
audioBuffer: Buffer.from("mp3"),
|
||||
outputFormat: "mp3",
|
||||
fileExtension: ".mp3",
|
||||
voiceCompatible: false,
|
||||
});
|
||||
const cfg = createTtsConfig("openclaw-speech-core-tts-feishu-mp3-test");
|
||||
let mediaDir: string | undefined;
|
||||
try {
|
||||
const result = await maybeApplyTtsToPayload({
|
||||
payload: { text: "This Feishu reply should be transcoded by the channel." },
|
||||
cfg,
|
||||
channel: "feishu",
|
||||
kind: "final",
|
||||
});
|
||||
|
||||
expect(synthesizeMock).toHaveBeenCalledWith(
|
||||
expect.objectContaining({ target: "voice-note" }),
|
||||
);
|
||||
expect(result.audioAsVoice).toBe(true);
|
||||
expect(result.mediaUrl).toMatch(/voice-\d+\.mp3$/);
|
||||
mediaDir = result.mediaUrl ? path.dirname(result.mediaUrl) : undefined;
|
||||
} finally {
|
||||
if (mediaDir) {
|
||||
rmSync(mediaDir, { recursive: true, force: true });
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
it("keeps non-native voice-note channels as regular audio files", async () => {
|
||||
await expectTtsPayloadResult({
|
||||
channel: "slack",
|
||||
|
||||
@@ -100,6 +100,8 @@ export type TtsResult = {
|
||||
attempts?: TtsProviderAttempt[];
|
||||
outputFormat?: string;
|
||||
voiceCompatible?: boolean;
|
||||
audioAsVoice?: boolean;
|
||||
target?: "audio-file" | "voice-note";
|
||||
};
|
||||
|
||||
export type TtsSynthesisResult = {
|
||||
@@ -114,6 +116,7 @@ export type TtsSynthesisResult = {
|
||||
outputFormat?: string;
|
||||
voiceCompatible?: boolean;
|
||||
fileExtension?: string;
|
||||
target?: "audio-file" | "voice-note";
|
||||
};
|
||||
|
||||
export type TtsTelephonyResult = {
|
||||
@@ -586,6 +589,7 @@ export function setLastTtsAttempt(entry: TtsStatusEntry | undefined): void {
|
||||
}
|
||||
|
||||
const OPUS_CHANNELS = new Set(["telegram", "feishu", "whatsapp", "matrix", "discord"]);
|
||||
const TRANSCODED_VOICE_NOTE_CHANNELS = new Set(["feishu"]);
|
||||
|
||||
function resolveChannelId(channel: string | undefined): ChannelId | null {
|
||||
return channel ? normalizeChannelId(channel) : null;
|
||||
@@ -596,6 +600,22 @@ function supportsNativeVoiceNoteTts(channel: string | undefined): boolean {
|
||||
return channelId !== null && OPUS_CHANNELS.has(channelId);
|
||||
}
|
||||
|
||||
function supportsTranscodedVoiceNoteTts(channel: string | undefined): boolean {
|
||||
const channelId = resolveChannelId(channel);
|
||||
return channelId !== null && TRANSCODED_VOICE_NOTE_CHANNELS.has(channelId);
|
||||
}
|
||||
|
||||
function shouldDeliverTtsAsVoice(params: {
|
||||
channel: string | undefined;
|
||||
target: "audio-file" | "voice-note" | undefined;
|
||||
voiceCompatible: boolean | undefined;
|
||||
}): boolean {
|
||||
if (!supportsNativeVoiceNoteTts(params.channel) || params.target !== "voice-note") {
|
||||
return false;
|
||||
}
|
||||
return params.voiceCompatible === true || supportsTranscodedVoiceNoteTts(params.channel);
|
||||
}
|
||||
|
||||
export function resolveTtsProviderOrder(primary: TtsProvider, cfg?: OpenClawConfig): TtsProvider[] {
|
||||
const normalizedPrimary = canonicalizeSpeechProviderId(primary, cfg) ?? primary;
|
||||
const ordered = new Set<TtsProvider>([normalizedPrimary]);
|
||||
@@ -782,6 +802,12 @@ export async function textToSpeech(params: {
|
||||
attempts: synthesis.attempts,
|
||||
outputFormat: synthesis.outputFormat,
|
||||
voiceCompatible: synthesis.voiceCompatible,
|
||||
audioAsVoice: shouldDeliverTtsAsVoice({
|
||||
channel: params.channel,
|
||||
target: synthesis.target,
|
||||
voiceCompatible: synthesis.voiceCompatible,
|
||||
}),
|
||||
target: synthesis.target,
|
||||
};
|
||||
}
|
||||
|
||||
@@ -863,6 +889,7 @@ export async function synthesizeSpeech(params: {
|
||||
outputFormat: synthesis.outputFormat,
|
||||
voiceCompatible: synthesis.voiceCompatible,
|
||||
fileExtension: synthesis.fileExtension,
|
||||
target,
|
||||
};
|
||||
} catch (err) {
|
||||
const errorMsg = formatTtsProviderError(provider, err);
|
||||
@@ -1171,12 +1198,10 @@ export async function maybeApplyTtsToPayload(params: {
|
||||
latencyMs: result.latencyMs,
|
||||
};
|
||||
|
||||
const shouldVoice =
|
||||
supportsNativeVoiceNoteTts(params.channel) && result.voiceCompatible === true;
|
||||
return {
|
||||
...nextPayload,
|
||||
mediaUrl: result.audioPath,
|
||||
audioAsVoice: shouldVoice || params.payload.audioAsVoice,
|
||||
audioAsVoice: result.audioAsVoice || params.payload.audioAsVoice,
|
||||
};
|
||||
}
|
||||
|
||||
@@ -1199,6 +1224,8 @@ export const _test = {
|
||||
parseTtsDirectives,
|
||||
resolveModelOverridePolicy,
|
||||
supportsNativeVoiceNoteTts,
|
||||
supportsTranscodedVoiceNoteTts,
|
||||
shouldDeliverTtsAsVoice,
|
||||
summarizeText,
|
||||
getResolvedSpeechProviderConfig,
|
||||
formatTtsProviderError,
|
||||
|
||||
@@ -43,6 +43,28 @@ describe("createTtsTool", () => {
|
||||
expect(JSON.stringify(result.content)).not.toContain("MEDIA:");
|
||||
});
|
||||
|
||||
it("uses audioAsVoice from the TTS runtime even when the provider output is not native", async () => {
|
||||
textToSpeechSpy.mockResolvedValue({
|
||||
success: true,
|
||||
audioPath: "/tmp/reply.mp3",
|
||||
provider: "test",
|
||||
voiceCompatible: false,
|
||||
audioAsVoice: true,
|
||||
});
|
||||
|
||||
const tool = createTtsTool();
|
||||
const result = await tool.execute("call-1", { text: "hello", channel: "feishu" });
|
||||
|
||||
expect(result).toMatchObject({
|
||||
details: {
|
||||
media: {
|
||||
mediaUrl: "/tmp/reply.mp3",
|
||||
audioAsVoice: true,
|
||||
},
|
||||
},
|
||||
});
|
||||
});
|
||||
|
||||
it("passes an optional timeout to speech generation", async () => {
|
||||
textToSpeechSpy.mockResolvedValue({
|
||||
success: true,
|
||||
|
||||
@@ -92,7 +92,7 @@ export function createTtsTool(opts?: {
|
||||
media: {
|
||||
mediaUrl: result.audioPath,
|
||||
trustedLocalMedia: true,
|
||||
...(result.voiceCompatible ? { audioAsVoice: true } : {}),
|
||||
...(result.audioAsVoice || result.voiceCompatible ? { audioAsVoice: true } : {}),
|
||||
},
|
||||
},
|
||||
};
|
||||
|
||||
@@ -41,6 +41,8 @@ export type TtsStatusEntry = {
|
||||
error?: string;
|
||||
};
|
||||
|
||||
export type TtsSpeechTarget = "audio-file" | "voice-note";
|
||||
|
||||
export type SummarizeResult = {
|
||||
summary: string;
|
||||
latencyMs: number;
|
||||
@@ -99,6 +101,12 @@ export type TtsTestFacade = {
|
||||
parseTtsDirectives: (...args: unknown[]) => TtsDirectiveParseResult;
|
||||
resolveModelOverridePolicy: (...args: unknown[]) => ResolvedTtsModelOverrides;
|
||||
supportsNativeVoiceNoteTts: (channel: string | undefined) => boolean;
|
||||
supportsTranscodedVoiceNoteTts: (channel: string | undefined) => boolean;
|
||||
shouldDeliverTtsAsVoice: (params: {
|
||||
channel: string | undefined;
|
||||
target: TtsSpeechTarget | undefined;
|
||||
voiceCompatible: boolean | undefined;
|
||||
}) => boolean;
|
||||
summarizeText: (...args: unknown[]) => Promise<SummarizeResult>;
|
||||
getResolvedSpeechProviderConfig: (
|
||||
config: ResolvedTtsConfig,
|
||||
@@ -120,6 +128,8 @@ export type TtsResult = {
|
||||
attempts?: TtsProviderAttempt[];
|
||||
outputFormat?: string;
|
||||
voiceCompatible?: boolean;
|
||||
audioAsVoice?: boolean;
|
||||
target?: TtsSpeechTarget;
|
||||
};
|
||||
|
||||
export type TtsSynthesisResult = {
|
||||
@@ -134,6 +144,7 @@ export type TtsSynthesisResult = {
|
||||
outputFormat?: string;
|
||||
voiceCompatible?: boolean;
|
||||
fileExtension?: string;
|
||||
target?: TtsSpeechTarget;
|
||||
};
|
||||
|
||||
export type TtsTelephonyResult = {
|
||||
|
||||
Reference in New Issue
Block a user