mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-06 06:50:43 +00:00
fix(minimax): transcode voice-note tts to opus
This commit is contained in:
@@ -91,7 +91,7 @@ Docs: https://docs.openclaw.ai
|
||||
- Plugins/OpenCode: strip unsupported disabled Responses reasoning payloads for OpenCode image understanding. Fixes #70252.
|
||||
- Plugins/OpenCode/OpenCode Go: register image understanding metadata so the image tool is available for OpenCode catalog models with vision support. Fixes #70482 and #61789.
|
||||
- Providers/ElevenLabs: omit the MP3-only `Accept` header for PCM telephony synthesis, so Voice Call requests for `pcm_22050` no longer receive MP3 audio. Fixes #67340. Thanks @marcchabot.
|
||||
- Providers/MiniMax TTS: mark MP3 output voice-compatible for Telegram voice-note delivery. Fixes #63540.
|
||||
- Providers/MiniMax TTS: transcode voice-note targets to Opus so Feishu/Telegram receive native voice messages instead of MP3 file attachments. Fixes #63540, #64134, and #70445.
|
||||
- Providers/Microsoft TTS: keep allowlisted bundled speech providers discoverable even when another speech plugin has already registered, so Edge/Microsoft TTS is available alongside OpenAI. Fixes #62117 and #66850.
|
||||
- Providers/Microsoft TTS: honor legacy `messages.tts.providers.edge` voice settings after normalizing Edge TTS to the Microsoft provider. Fixes #64153.
|
||||
- Providers/OpenRouter: add an OpenRouter TTS provider using the OpenAI-compatible `/audio/speech` endpoint and `OPENROUTER_API_KEY`. Fixes #71268.
|
||||
|
||||
@@ -244,6 +244,18 @@ exposed separately through the plugin-owned `MiniMax-VL-01` media provider.
|
||||
See [Image Generation](/tools/image-generation) for shared tool parameters, provider selection, and failover behavior.
|
||||
</Note>
|
||||
|
||||
### Text-to-speech
|
||||
|
||||
The bundled `minimax` plugin registers MiniMax T2A v2 as a speech provider for
|
||||
`messages.tts`.
|
||||
|
||||
- Default TTS model: `speech-2.8-hd`
|
||||
- Default voice: `English_expressive_narrator`
|
||||
- Normal audio attachments stay MP3.
|
||||
- Audio for voice-note targets such as Feishu and Telegram is transcoded from MiniMax
|
||||
MP3 to 48kHz Opus with `ffmpeg`, because the Feishu/Lark file API only
|
||||
accepts `file_type: "opus"` for native audio messages.
|
||||
|
||||
### Music generation
|
||||
|
||||
The bundled `minimax` plugin also registers music generation through the shared
|
||||
|
||||
@@ -488,7 +488,7 @@ These override `messages.tts.*` for that host.
|
||||
- 48kHz / 64kbps is a good voice message tradeoff.
|
||||
- **Other channels**: MP3 (`mp3_44100_128` from ElevenLabs, `mp3` from OpenAI).
|
||||
- 44.1kHz / 128kbps is the default balance for speech clarity.
|
||||
- **MiniMax**: MP3 (`speech-2.8-hd` model, 32kHz sample rate). Voice-note format not natively supported; use OpenAI or ElevenLabs for guaranteed Opus voice messages.
|
||||
- **MiniMax**: MP3 (`speech-2.8-hd` model, 32kHz sample rate) for normal audio attachments. For voice-note targets such as Feishu and Telegram, OpenClaw transcodes the MiniMax MP3 to 48kHz Opus with `ffmpeg` before delivery.
|
||||
- **Google Gemini**: Gemini API TTS returns raw 24kHz PCM. OpenClaw wraps it as WAV for audio attachments and returns PCM directly for Talk/telephony. Native Opus voice-note format is not supported by this path.
|
||||
- **Gradium**: WAV for audio attachments, Opus for voice-note targets, and `ulaw_8000` at 8 kHz for telephony.
|
||||
- **xAI**: MP3 by default; `responseFormat` may be `mp3`, `wav`, `pcm`, `mulaw`, or `alaw`. OpenClaw uses xAI's batch REST TTS endpoint and returns a complete audio attachment; xAI's streaming TTS WebSocket is not used by this provider path. Native Opus voice-note format is not supported by this path.
|
||||
|
||||
@@ -1,4 +1,11 @@
|
||||
import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
|
||||
|
||||
const runFfmpegMock = vi.hoisted(() => vi.fn());
|
||||
|
||||
vi.mock("openclaw/plugin-sdk/media-runtime", () => ({
|
||||
runFfmpeg: runFfmpegMock,
|
||||
}));
|
||||
|
||||
import { buildMinimaxSpeechProvider } from "./speech-provider.js";
|
||||
|
||||
describe("buildMinimaxSpeechProvider", () => {
|
||||
@@ -213,6 +220,7 @@ describe("buildMinimaxSpeechProvider", () => {
|
||||
|
||||
beforeEach(() => {
|
||||
vi.stubGlobal("fetch", vi.fn());
|
||||
runFfmpegMock.mockReset();
|
||||
});
|
||||
|
||||
afterEach(() => {
|
||||
@@ -240,7 +248,7 @@ describe("buildMinimaxSpeechProvider", () => {
|
||||
|
||||
expect(result.outputFormat).toBe("mp3");
|
||||
expect(result.fileExtension).toBe(".mp3");
|
||||
expect(result.voiceCompatible).toBe(true);
|
||||
expect(result.voiceCompatible).toBe(false);
|
||||
expect(result.audioBuffer.toString()).toBe("fake-audio-data");
|
||||
|
||||
expect(mockFetch).toHaveBeenCalledOnce();
|
||||
@@ -250,6 +258,44 @@ describe("buildMinimaxSpeechProvider", () => {
|
||||
expect(body.model).toBe("speech-2.8-hd");
|
||||
expect(body.text).toBe("Hello world");
|
||||
expect(body.voice_setting.voice_id).toBe("English_expressive_narrator");
|
||||
expect(runFfmpegMock).not.toHaveBeenCalled();
|
||||
});
|
||||
|
||||
it("transcodes MiniMax MP3 to Opus for voice-note targets", async () => {
|
||||
const hexAudio = Buffer.from("fake-mp3-data").toString("hex");
|
||||
const mockFetch = vi.mocked(globalThis.fetch);
|
||||
mockFetch.mockResolvedValueOnce(
|
||||
new Response(JSON.stringify({ data: { audio: hexAudio } }), {
|
||||
status: 200,
|
||||
headers: { "Content-Type": "application/json" },
|
||||
}),
|
||||
);
|
||||
runFfmpegMock.mockImplementationOnce(async (args: string[]) => {
|
||||
const outputPath = args.at(-1);
|
||||
if (typeof outputPath !== "string") {
|
||||
throw new Error("missing ffmpeg output path");
|
||||
}
|
||||
await import("node:fs/promises").then((fs) =>
|
||||
fs.writeFile(outputPath, Buffer.from("fake-opus-data")),
|
||||
);
|
||||
});
|
||||
|
||||
const result = await provider.synthesize({
|
||||
text: "Hello world",
|
||||
cfg: {} as never,
|
||||
providerConfig: { apiKey: "sk-test", baseUrl: "https://api.minimaxi.com" },
|
||||
target: "voice-note",
|
||||
timeoutMs: 30000,
|
||||
});
|
||||
|
||||
expect(result.outputFormat).toBe("opus");
|
||||
expect(result.fileExtension).toBe(".opus");
|
||||
expect(result.voiceCompatible).toBe(true);
|
||||
expect(result.audioBuffer.toString()).toBe("fake-opus-data");
|
||||
expect(runFfmpegMock).toHaveBeenCalledWith(
|
||||
expect.arrayContaining(["-c:a", "libopus", "-ar", "48000"]),
|
||||
{ timeoutMs: 30000 },
|
||||
);
|
||||
});
|
||||
|
||||
it("applies overrides", async () => {
|
||||
|
||||
@@ -1,3 +1,6 @@
|
||||
import { mkdir, mkdtemp, readFile, rm, writeFile } from "node:fs/promises";
|
||||
import path from "node:path";
|
||||
import { runFfmpeg } from "openclaw/plugin-sdk/media-runtime";
|
||||
import { normalizeResolvedSecretInputString } from "openclaw/plugin-sdk/secret-input";
|
||||
import type {
|
||||
SpeechDirectiveTokenParseContext,
|
||||
@@ -6,6 +9,7 @@ import type {
|
||||
SpeechProviderPlugin,
|
||||
} from "openclaw/plugin-sdk/speech-core";
|
||||
import { asFiniteNumber, asObject, trimToUndefined } from "openclaw/plugin-sdk/speech-core";
|
||||
import { resolvePreferredOpenClawTmpDir } from "openclaw/plugin-sdk/temp-path";
|
||||
import {
|
||||
DEFAULT_MINIMAX_TTS_BASE_URL,
|
||||
MINIMAX_TTS_MODELS,
|
||||
@@ -150,6 +154,41 @@ function parseDirectiveToken(ctx: SpeechDirectiveTokenParseContext): {
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Converts an MP3 buffer to Opus by running `ffmpeg` on files in a scratch directory.
 *
 * The MP3 bytes are written to `input.mp3` inside a freshly created
 * `tts-minimax-*` directory under the preferred OpenClaw temp root. ffmpeg is
 * asked to encode them with `libopus` (64k bitrate, 48000 Hz, one channel)
 * into `voice.opus`, and the bytes of that file are returned. The scratch
 * directory is removed whether the conversion succeeds or fails; errors from
 * any step propagate to the caller.
 *
 * @param audioBuffer - MP3 audio to convert.
 * @param timeoutMs - Passed through to `runFfmpeg` as its `timeoutMs` option; may be `undefined`.
 * @returns The contents of the Opus file written by ffmpeg.
 */
async function transcodeMp3ToOpus(audioBuffer: Buffer, timeoutMs: number | undefined) {
  const scratchParent = resolvePreferredOpenClawTmpDir();
  await mkdir(scratchParent, { recursive: true, mode: 0o700 });
  const scratchDir = await mkdtemp(path.join(scratchParent, "tts-minimax-"));

  try {
    const mp3File = path.join(scratchDir, "input.mp3");
    const opusFile = path.join(scratchDir, "voice.opus");

    // The output path must stay the final argument: ffmpeg treats it as the destination.
    const ffmpegArgs = [
      "-hide_banner",
      "-loglevel",
      "error",
      "-y",
      "-i",
      mp3File,
      "-vn",
      "-c:a",
      "libopus",
      "-b:a",
      "64k",
      "-ar",
      "48000",
      "-ac",
      "1",
      opusFile,
    ];

    await writeFile(mp3File, audioBuffer, { mode: 0o600 });
    await runFfmpeg(ffmpegArgs, { timeoutMs });

    // Await the read inside the try so the cleanup below cannot run before it finishes.
    const opusBytes = await readFile(opusFile);
    return opusBytes;
  } finally {
    await rm(scratchDir, { recursive: true, force: true });
  }
}
|
||||
|
||||
export function buildMinimaxSpeechProvider(): SpeechProviderPlugin {
|
||||
return {
|
||||
id: "minimax",
|
||||
@@ -223,11 +262,20 @@ export function buildMinimaxSpeechProvider(): SpeechProviderPlugin {
|
||||
pitch: overrides.pitch ?? config.pitch,
|
||||
timeoutMs: req.timeoutMs,
|
||||
});
|
||||
if (req.target === "voice-note") {
|
||||
const opusBuffer = await transcodeMp3ToOpus(audioBuffer, req.timeoutMs);
|
||||
return {
|
||||
audioBuffer: opusBuffer,
|
||||
outputFormat: "opus",
|
||||
fileExtension: ".opus",
|
||||
voiceCompatible: true,
|
||||
};
|
||||
}
|
||||
return {
|
||||
audioBuffer,
|
||||
outputFormat: "mp3",
|
||||
fileExtension: ".mp3",
|
||||
voiceCompatible: true,
|
||||
voiceCompatible: false,
|
||||
};
|
||||
},
|
||||
};
|
||||
|
||||
Reference in New Issue
Block a user