diff --git a/CHANGELOG.md b/CHANGELOG.md index 2f1b6261bb7..dd60b47c65a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -91,7 +91,7 @@ Docs: https://docs.openclaw.ai - Plugins/OpenCode: strip unsupported disabled Responses reasoning payloads for OpenCode image understanding. Fixes #70252. - Plugins/OpenCode/OpenCode Go: register image understanding metadata so the image tool is available for OpenCode catalog models with vision support. Fixes #70482 and #61789. - Providers/ElevenLabs: omit the MP3-only `Accept` header for PCM telephony synthesis, so Voice Call requests for `pcm_22050` no longer receive MP3 audio. Fixes #67340. Thanks @marcchabot. -- Providers/MiniMax TTS: mark MP3 output voice-compatible for Telegram voice-note delivery. Fixes #63540. +- Providers/MiniMax TTS: transcode voice-note targets to Opus so Feishu/Telegram receive native voice messages instead of MP3 file attachments. Fixes #63540, #64134, and #70445. - Providers/Microsoft TTS: keep allowlisted bundled speech providers discoverable even when another speech plugin has already registered, so Edge/Microsoft TTS is available alongside OpenAI. Fixes #62117 and #66850. - Providers/Microsoft TTS: honor legacy `messages.tts.providers.edge` voice settings after normalizing Edge TTS to the Microsoft provider. Fixes #64153. - Providers/OpenRouter: add an OpenRouter TTS provider using the OpenAI-compatible `/audio/speech` endpoint and `OPENROUTER_API_KEY`. Fixes #71268. diff --git a/docs/providers/minimax.md b/docs/providers/minimax.md index 0fa206022d7..f9393e50e53 100644 --- a/docs/providers/minimax.md +++ b/docs/providers/minimax.md @@ -244,6 +244,18 @@ exposed separately through the plugin-owned `MiniMax-VL-01` media provider. See [Image Generation](/tools/image-generation) for shared tool parameters, provider selection, and failover behavior. +### Text-to-speech + +The bundled `minimax` plugin registers MiniMax T2A v2 as a speech provider for +`messages.tts`. 
+ +- Default TTS model: `speech-2.8-hd` +- Default voice: `English_expressive_narrator` +- Normal audio attachments stay MP3. +- Voice-note targets such as Feishu and Telegram are transcoded from MiniMax + MP3 to 48kHz Opus with `ffmpeg`, because the Feishu/Lark file API only + accepts `file_type: "opus"` for native audio messages. + ### Music generation The bundled `minimax` plugin also registers music generation through the shared diff --git a/docs/tools/tts.md b/docs/tools/tts.md index 7871538c2aa..418a9000b77 100644 --- a/docs/tools/tts.md +++ b/docs/tools/tts.md @@ -488,7 +488,7 @@ These override `messages.tts.*` for that host. - 48kHz / 64kbps is a good voice message tradeoff. - **Other channels**: MP3 (`mp3_44100_128` from ElevenLabs, `mp3` from OpenAI). - 44.1kHz / 128kbps is the default balance for speech clarity. -- **MiniMax**: MP3 (`speech-2.8-hd` model, 32kHz sample rate). Voice-note format not natively supported; use OpenAI or ElevenLabs for guaranteed Opus voice messages. +- **MiniMax**: MP3 (`speech-2.8-hd` model, 32kHz sample rate) for normal audio attachments. For voice-note targets such as Feishu and Telegram, OpenClaw transcodes the MiniMax MP3 to 48kHz Opus with `ffmpeg` before delivery. - **Google Gemini**: Gemini API TTS returns raw 24kHz PCM. OpenClaw wraps it as WAV for audio attachments and returns PCM directly for Talk/telephony. Native Opus voice-note format is not supported by this path. - **Gradium**: WAV for audio attachments, Opus for voice-note targets, and `ulaw_8000` at 8 kHz for telephony. - **xAI**: MP3 by default; `responseFormat` may be `mp3`, `wav`, `pcm`, `mulaw`, or `alaw`. OpenClaw uses xAI's batch REST TTS endpoint and returns a complete audio attachment; xAI's streaming TTS WebSocket is not used by this provider path. Native Opus voice-note format is not supported by this path. 
diff --git a/extensions/minimax/speech-provider.test.ts b/extensions/minimax/speech-provider.test.ts index bb2fc6cf319..9daf2d1e4b1 100644 --- a/extensions/minimax/speech-provider.test.ts +++ b/extensions/minimax/speech-provider.test.ts @@ -1,4 +1,11 @@ import { afterEach, beforeEach, describe, expect, it, vi } from "vitest"; + +const runFfmpegMock = vi.hoisted(() => vi.fn()); + +vi.mock("openclaw/plugin-sdk/media-runtime", () => ({ + runFfmpeg: runFfmpegMock, +})); + import { buildMinimaxSpeechProvider } from "./speech-provider.js"; describe("buildMinimaxSpeechProvider", () => { @@ -213,6 +220,7 @@ describe("buildMinimaxSpeechProvider", () => { beforeEach(() => { vi.stubGlobal("fetch", vi.fn()); + runFfmpegMock.mockReset(); }); afterEach(() => { @@ -240,7 +248,7 @@ describe("buildMinimaxSpeechProvider", () => { expect(result.outputFormat).toBe("mp3"); expect(result.fileExtension).toBe(".mp3"); - expect(result.voiceCompatible).toBe(true); + expect(result.voiceCompatible).toBe(false); expect(result.audioBuffer.toString()).toBe("fake-audio-data"); expect(mockFetch).toHaveBeenCalledOnce(); @@ -250,6 +258,44 @@ describe("buildMinimaxSpeechProvider", () => { expect(body.model).toBe("speech-2.8-hd"); expect(body.text).toBe("Hello world"); expect(body.voice_setting.voice_id).toBe("English_expressive_narrator"); + expect(runFfmpegMock).not.toHaveBeenCalled(); + }); + + it("transcodes MiniMax MP3 to Opus for voice-note targets", async () => { + const hexAudio = Buffer.from("fake-mp3-data").toString("hex"); + const mockFetch = vi.mocked(globalThis.fetch); + mockFetch.mockResolvedValueOnce( + new Response(JSON.stringify({ data: { audio: hexAudio } }), { + status: 200, + headers: { "Content-Type": "application/json" }, + }), + ); + runFfmpegMock.mockImplementationOnce(async (args: string[]) => { + const outputPath = args.at(-1); + if (typeof outputPath !== "string") { + throw new Error("missing ffmpeg output path"); + } + await import("node:fs/promises").then((fs) => + 
/**
 * Re-encodes an MP3 buffer as Opus by shelling out to ffmpeg.
 *
 * The MP3 bytes are written to a freshly created scratch directory (under the
 * directory returned by `resolvePreferredOpenClawTmpDir()`), ffmpeg converts
 * them to a mono, 48 kHz, 64 kbit/s libopus stream, and the resulting file is
 * read back into memory. The scratch directory is removed afterwards whether
 * or not the conversion succeeded.
 *
 * @param audioBuffer - MP3 audio bytes to convert.
 * @param timeoutMs - Forwarded unchanged to `runFfmpeg` as its `timeoutMs` option.
 * @returns The contents of the Opus file produced by ffmpeg.
 */
async function transcodeMp3ToOpus(audioBuffer: Buffer, timeoutMs: number | undefined) {
  const baseDir = resolvePreferredOpenClawTmpDir();
  // Make sure the shared temp root exists before asking for a unique subdirectory.
  await mkdir(baseDir, { recursive: true, mode: 0o700 });
  const workDir = await mkdtemp(path.join(baseDir, "tts-minimax-"));

  const mp3Path = path.join(workDir, "input.mp3");
  const opusPath = path.join(workDir, "voice.opus");
  const ffmpegArgs = [
    // Quiet output: only errors are reported.
    "-hide_banner",
    "-loglevel",
    "error",
    // Overwrite the output file without prompting.
    "-y",
    "-i",
    mp3Path,
    // Audio only; drop any video stream.
    "-vn",
    "-c:a",
    "libopus",
    "-b:a",
    "64k",
    "-ar",
    "48000",
    "-ac",
    "1",
    // The output path must stay the final argument.
    opusPath,
  ];

  try {
    await writeFile(mp3Path, audioBuffer, { mode: 0o600 });
    await runFfmpeg(ffmpegArgs, { timeoutMs });
    // Await the read inside `try` so cleanup below cannot run before it finishes.
    const opusBuffer = await readFile(opusPath);
    return opusBuffer;
  } finally {
    await rm(workDir, { recursive: true, force: true });
  }
}