diff --git a/docs/tools/tts.md b/docs/tools/tts.md index f058c150117..85355c4f0b9 100644 --- a/docs/tools/tts.md +++ b/docs/tools/tts.md @@ -9,13 +9,14 @@ title: "Text-to-Speech" # Text-to-speech (TTS) -OpenClaw can convert outbound replies into audio using ElevenLabs, Microsoft, or OpenAI. +OpenClaw can convert outbound replies into audio using ElevenLabs, Microsoft, MiniMax, or OpenAI. It works anywhere OpenClaw can send audio. ## Supported services - **ElevenLabs** (primary or fallback provider) - **Microsoft** (primary or fallback provider; current bundled implementation uses `node-edge-tts`) +- **MiniMax** (primary or fallback provider; uses the T2A v2 API) - **OpenAI** (primary or fallback provider; also used for summaries) ### Microsoft speech notes @@ -33,9 +34,10 @@ or ElevenLabs. ## Optional keys -If you want OpenAI or ElevenLabs: +If you want OpenAI, ElevenLabs, or MiniMax: - `ELEVENLABS_API_KEY` (or `XI_API_KEY`) +- `MINIMAX_API_KEY` - `OPENAI_API_KEY` Microsoft speech does **not** require an API key. @@ -50,6 +52,7 @@ so that provider must also be authenticated if you enable summaries. - [OpenAI Audio API reference](https://platform.openai.com/docs/api-reference/audio) - [ElevenLabs Text to Speech](https://elevenlabs.io/docs/api-reference/text-to-speech) - [ElevenLabs Authentication](https://elevenlabs.io/docs/api-reference/authentication) +- [MiniMax T2A v2 API](https://platform.minimaxi.com/document/T2A%20V2) - [node-edge-tts](https://github.com/SchneeHertz/node-edge-tts) - [Microsoft Speech output formats](https://learn.microsoft.com/azure/ai-services/speech-service/rest-text-to-speech#audio-outputs) @@ -143,6 +146,30 @@ Full schema is in [Gateway configuration](/gateway/configuration). 
} ``` +### MiniMax primary + +```json5 +{ + messages: { + tts: { + auto: "always", + provider: "minimax", + providers: { + minimax: { + apiKey: "minimax_api_key", + baseUrl: "https://api.minimaxi.com", + model: "speech-2.8-hd", + voiceId: "English_expressive_narrator", + speed: 1.0, + vol: 1.0, + pitch: 0, + }, + }, + }, + }, +} +``` + ### Disable Microsoft speech ```json5 @@ -211,7 +238,7 @@ Then run: - `tagged` only sends audio when the reply includes `[[tts]]` tags. - `enabled`: legacy toggle (doctor migrates this to `auto`). - `mode`: `"final"` (default) or `"all"` (includes tool/block replies). -- `provider`: speech provider id such as `"elevenlabs"`, `"microsoft"`, or `"openai"` (fallback is automatic). +- `provider`: speech provider id such as `"elevenlabs"`, `"microsoft"`, `"minimax"`, or `"openai"` (fallback is automatic). - If `provider` is **unset**, OpenClaw uses the first configured speech provider in registry auto-select order. - Legacy `provider: "edge"` still works and is normalized to `microsoft`. - `summaryModel`: optional cheap model for auto-summary; defaults to `agents.defaults.model.primary`. @@ -223,7 +250,7 @@ Then run: - `maxTextLength`: hard cap for TTS input (chars). `/tts audio` fails if exceeded. - `timeoutMs`: request timeout (ms). - `prefsPath`: override the local prefs JSON path (provider/limit/summary). -- `apiKey` values fall back to env vars (`ELEVENLABS_API_KEY`/`XI_API_KEY`, `OPENAI_API_KEY`). +- `apiKey` values fall back to env vars (`ELEVENLABS_API_KEY`/`XI_API_KEY`, `MINIMAX_API_KEY`, `OPENAI_API_KEY`). - `providers.elevenlabs.baseUrl`: override ElevenLabs API base URL. - `providers.openai.baseUrl`: override the OpenAI TTS endpoint. - Resolution order: `messages.tts.providers.openai.baseUrl` -> `OPENAI_TTS_BASE_URL` -> `https://api.openai.com/v1` @@ -235,6 +262,12 @@ Then run: - `providers.elevenlabs.applyTextNormalization`: `auto|on|off` - `providers.elevenlabs.languageCode`: 2-letter ISO 639-1 (e.g. 
`en`, `de`) - `providers.elevenlabs.seed`: integer `0..4294967295` (best-effort determinism) +- `providers.minimax.baseUrl`: override MiniMax API base URL (default `https://api.minimaxi.com`, env: `MINIMAX_API_HOST`). +- `providers.minimax.model`: TTS model (default `speech-2.8-hd`, env: `MINIMAX_TTS_MODEL`). +- `providers.minimax.voiceId`: voice identifier (default `English_expressive_narrator`, env: `MINIMAX_TTS_VOICE_ID`). +- `providers.minimax.speed`: playback speed `0.5..2.0` (default 1.0). +- `providers.minimax.vol`: volume `(0, 10]` (default 1.0; must be greater than 0). +- `providers.minimax.pitch`: pitch shift `-12..12` (default 0). - `providers.microsoft.enabled`: allow Microsoft speech usage (default `true`; no API key). - `providers.microsoft.voice`: Microsoft neural voice name (e.g. `en-US-MichelleNeural`). - `providers.microsoft.lang`: language code (e.g. `en-US`). @@ -269,10 +302,12 @@ Here you go. Available directive keys (when enabled): -- `provider` (registered speech provider id, for example `openai`, `elevenlabs`, or `microsoft`; requires `allowProvider: true`) -- `voice` (OpenAI voice) or `voiceId` (ElevenLabs) -- `model` (OpenAI TTS model or ElevenLabs model id) +- `provider` (registered speech provider id, for example `openai`, `elevenlabs`, `minimax`, or `microsoft`; requires `allowProvider: true`) +- `voice` (OpenAI voice) or `voiceId` (ElevenLabs / MiniMax) +- `model` (OpenAI TTS model, ElevenLabs model id, or MiniMax model) - `stability`, `similarityBoost`, `style`, `speed`, `useSpeakerBoost` +- `vol` / `volume` (MiniMax volume, 0-10) +- `pitch` (MiniMax pitch, -12 to 12) - `applyTextNormalization` (`auto|on|off`) - `languageCode` (ISO 639-1) - `seed` @@ -328,6 +363,7 @@ These override `messages.tts.*` for that host. - 48kHz / 64kbps is a good voice message tradeoff. - **Other channels**: MP3 (`mp3_44100_128` from ElevenLabs, `mp3` from OpenAI). - 44.1kHz / 128kbps is the default balance for speech clarity. 
+- **MiniMax**: MP3 (`speech-2.8-hd` model, 32kHz sample rate). Voice-note format not natively supported; use OpenAI or ElevenLabs for guaranteed Opus voice messages. - **Microsoft**: uses `microsoft.outputFormat` (default `audio-24khz-48kbitrate-mono-mp3`). - The bundled transport accepts an `outputFormat`, but not all formats are available from the service. - Output format values follow Microsoft Speech output formats (including Ogg/WebM Opus). diff --git a/extensions/minimax/index.ts b/extensions/minimax/index.ts index 8d8f869dd7f..a1cae49376e 100644 --- a/extensions/minimax/index.ts +++ b/extensions/minimax/index.ts @@ -26,6 +26,7 @@ import { import type { MiniMaxRegion } from "./oauth.js"; import { applyMinimaxApiConfig, applyMinimaxApiConfigCn } from "./onboard.js"; import { buildMinimaxPortalProvider, buildMinimaxProvider } from "./provider-catalog.js"; +import { buildMinimaxSpeechProvider } from "./speech-provider.js"; const API_PROVIDER_ID = "minimax"; const PORTAL_PROVIDER_ID = "minimax-portal"; @@ -303,5 +304,6 @@ export default definePluginEntry({ }); api.registerImageGenerationProvider(buildMinimaxImageGenerationProvider()); api.registerImageGenerationProvider(buildMinimaxPortalImageGenerationProvider()); + api.registerSpeechProvider(buildMinimaxSpeechProvider()); }, }); diff --git a/extensions/minimax/openclaw.plugin.json b/extensions/minimax/openclaw.plugin.json index 831b6143b9d..3d6e517b6ce 100644 --- a/extensions/minimax/openclaw.plugin.json +++ b/extensions/minimax/openclaw.plugin.json @@ -61,6 +61,7 @@ } ], "contracts": { + "speechProviders": ["minimax"], "mediaUnderstandingProviders": ["minimax", "minimax-portal"], "imageGenerationProviders": ["minimax", "minimax-portal"] }, diff --git a/extensions/minimax/speech-provider.test.ts b/extensions/minimax/speech-provider.test.ts new file mode 100644 index 00000000000..4612f22edee --- /dev/null +++ b/extensions/minimax/speech-provider.test.ts @@ -0,0 +1,318 @@ +import { afterEach, beforeEach, 
describe, expect, it, vi } from "vitest"; +import { buildMinimaxSpeechProvider } from "./speech-provider.js"; + +describe("buildMinimaxSpeechProvider", () => { + const provider = buildMinimaxSpeechProvider(); + + describe("metadata", () => { + it("has correct id and label", () => { + expect(provider.id).toBe("minimax"); + expect(provider.label).toBe("MiniMax"); + }); + + it("has autoSelectOrder 40", () => { + expect(provider.autoSelectOrder).toBe(40); + }); + + it("exposes models and voices", () => { + expect(provider.models).toContain("speech-2.8-hd"); + expect(provider.voices).toContain("English_expressive_narrator"); + }); + }); + + describe("isConfigured", () => { + const savedEnv = { ...process.env }; + + afterEach(() => { + process.env = { ...savedEnv }; + }); + + it("returns true when apiKey is in provider config", () => { + expect( + provider.isConfigured({ providerConfig: { apiKey: "sk-test" }, timeoutMs: 30000 }), + ).toBe(true); + }); + + it("returns false when no apiKey anywhere", () => { + delete process.env.MINIMAX_API_KEY; + expect(provider.isConfigured({ providerConfig: {}, timeoutMs: 30000 })).toBe(false); + }); + + it("returns true when MINIMAX_API_KEY env var is set", () => { + process.env.MINIMAX_API_KEY = "sk-env"; + expect(provider.isConfigured({ providerConfig: {}, timeoutMs: 30000 })).toBe(true); + }); + }); + + describe("resolveConfig", () => { + const savedEnv = { ...process.env }; + + afterEach(() => { + process.env = { ...savedEnv }; + }); + + it("returns defaults when rawConfig is empty", () => { + delete process.env.MINIMAX_API_HOST; + delete process.env.MINIMAX_TTS_MODEL; + delete process.env.MINIMAX_TTS_VOICE_ID; + const config = provider.resolveConfig!({ rawConfig: {}, cfg: {} as never, timeoutMs: 30000 }); + expect(config.baseUrl).toBe("https://api.minimaxi.com"); + expect(config.model).toBe("speech-2.8-hd"); + expect(config.voiceId).toBe("English_expressive_narrator"); + }); + + it("reads from providers.minimax in rawConfig", () 
=> { + const config = provider.resolveConfig!({ + rawConfig: { + providers: { + minimax: { + baseUrl: "https://custom.api.com", + model: "speech-01-240228", + voiceId: "Chinese (Mandarin)_Warm_Girl", + speed: 1.5, + vol: 2.0, + pitch: 3, + }, + }, + }, + cfg: {} as never, + timeoutMs: 30000, + }); + expect(config.baseUrl).toBe("https://custom.api.com"); + expect(config.model).toBe("speech-01-240228"); + expect(config.voiceId).toBe("Chinese (Mandarin)_Warm_Girl"); + expect(config.speed).toBe(1.5); + expect(config.vol).toBe(2.0); + expect(config.pitch).toBe(3); + }); + + it("reads from env vars as fallback", () => { + process.env.MINIMAX_API_HOST = "https://env.api.com"; + process.env.MINIMAX_TTS_MODEL = "speech-01-240228"; + process.env.MINIMAX_TTS_VOICE_ID = "Chinese (Mandarin)_Gentle_Boy"; + const config = provider.resolveConfig!({ rawConfig: {}, cfg: {} as never, timeoutMs: 30000 }); + expect(config.baseUrl).toBe("https://env.api.com"); + expect(config.model).toBe("speech-01-240228"); + expect(config.voiceId).toBe("Chinese (Mandarin)_Gentle_Boy"); + }); + }); + + describe("parseDirectiveToken", () => { + const policy = { + enabled: true, + allowText: true, + allowProvider: true, + allowVoice: true, + allowModelId: true, + allowVoiceSettings: true, + allowNormalization: true, + allowSeed: true, + }; + + it("handles voice key", () => { + const result = provider.parseDirectiveToken!({ + key: "voice", + value: "Chinese (Mandarin)_Warm_Girl", + policy, + }); + expect(result.handled).toBe(true); + expect(result.overrides?.voiceId).toBe("Chinese (Mandarin)_Warm_Girl"); + }); + + it("handles voiceid key", () => { + const result = provider.parseDirectiveToken!({ key: "voiceid", value: "test_voice", policy }); + expect(result.handled).toBe(true); + expect(result.overrides?.voiceId).toBe("test_voice"); + }); + + it("handles model key", () => { + const result = provider.parseDirectiveToken!({ + key: "model", + value: "speech-01-240228", + policy, + }); + 
expect(result.handled).toBe(true); + expect(result.overrides?.model).toBe("speech-01-240228"); + }); + + it("handles speed key with valid value", () => { + const result = provider.parseDirectiveToken!({ key: "speed", value: "1.5", policy }); + expect(result.handled).toBe(true); + expect(result.overrides?.speed).toBe(1.5); + }); + + it("warns on invalid speed", () => { + const result = provider.parseDirectiveToken!({ key: "speed", value: "5.0", policy }); + expect(result.handled).toBe(true); + expect(result.warnings).toHaveLength(1); + expect(result.overrides).toBeUndefined(); + }); + + it("handles vol key", () => { + const result = provider.parseDirectiveToken!({ key: "vol", value: "3", policy }); + expect(result.handled).toBe(true); + expect(result.overrides?.vol).toBe(3); + }); + + it("warns on vol=0 (exclusive minimum)", () => { + const result = provider.parseDirectiveToken!({ key: "vol", value: "0", policy }); + expect(result.handled).toBe(true); + expect(result.warnings).toHaveLength(1); + }); + + it("handles volume alias", () => { + const result = provider.parseDirectiveToken!({ key: "volume", value: "5", policy }); + expect(result.handled).toBe(true); + expect(result.overrides?.vol).toBe(5); + }); + + it("handles pitch key", () => { + const result = provider.parseDirectiveToken!({ key: "pitch", value: "-3", policy }); + expect(result.handled).toBe(true); + expect(result.overrides?.pitch).toBe(-3); + }); + + it("warns on out-of-range pitch", () => { + const result = provider.parseDirectiveToken!({ key: "pitch", value: "20", policy }); + expect(result.handled).toBe(true); + expect(result.warnings).toHaveLength(1); + }); + + it("returns handled=false for unknown keys", () => { + const result = provider.parseDirectiveToken!({ + key: "unknown_key", + value: "whatever", + policy, + }); + expect(result.handled).toBe(false); + }); + + it("suppresses voice when policy disallows it", () => { + const result = provider.parseDirectiveToken!({ + key: "voice", + value: 
"test", + policy: { ...policy, allowVoice: false }, + }); + expect(result.handled).toBe(true); + expect(result.overrides).toBeUndefined(); + }); + + it("suppresses model when policy disallows it", () => { + const result = provider.parseDirectiveToken!({ + key: "model", + value: "test", + policy: { ...policy, allowModelId: false }, + }); + expect(result.handled).toBe(true); + expect(result.overrides).toBeUndefined(); + }); + }); + + describe("synthesize", () => { + const savedFetch = globalThis.fetch; + + beforeEach(() => { + vi.stubGlobal("fetch", vi.fn()); + }); + + afterEach(() => { + globalThis.fetch = savedFetch; + vi.restoreAllMocks(); + }); + + it("makes correct API call and decodes hex response", async () => { + const hexAudio = Buffer.from("fake-audio-data").toString("hex"); + const mockFetch = vi.mocked(globalThis.fetch); + mockFetch.mockResolvedValueOnce( + new Response(JSON.stringify({ data: { audio: hexAudio } }), { + status: 200, + headers: { "Content-Type": "application/json" }, + }), + ); + + const result = await provider.synthesize({ + text: "Hello world", + cfg: {} as never, + providerConfig: { apiKey: "sk-test", baseUrl: "https://api.minimaxi.com" }, + target: "audio-file", + timeoutMs: 30000, + }); + + expect(result.outputFormat).toBe("mp3"); + expect(result.fileExtension).toBe(".mp3"); + expect(result.voiceCompatible).toBe(false); + expect(result.audioBuffer.toString()).toBe("fake-audio-data"); + + expect(mockFetch).toHaveBeenCalledOnce(); + const [url, init] = mockFetch.mock.calls[0]!; + expect(url).toBe("https://api.minimaxi.com/v1/t2a_v2"); + const body = JSON.parse(init!.body as string); + expect(body.model).toBe("speech-2.8-hd"); + expect(body.text).toBe("Hello world"); + expect(body.voice_setting.voice_id).toBe("English_expressive_narrator"); + }); + + it("applies overrides", async () => { + const hexAudio = Buffer.from("audio").toString("hex"); + const mockFetch = vi.mocked(globalThis.fetch); + mockFetch.mockResolvedValueOnce( + new 
Response(JSON.stringify({ data: { audio: hexAudio } }), { status: 200 }), + ); + + await provider.synthesize({ + text: "Test", + cfg: {} as never, + providerConfig: { apiKey: "sk-test" }, + providerOverrides: { model: "speech-01-240228", voiceId: "custom_voice", speed: 1.5 }, + target: "audio-file", + timeoutMs: 30000, + }); + + const body = JSON.parse(vi.mocked(globalThis.fetch).mock.calls[0]![1]!.body as string); + expect(body.model).toBe("speech-01-240228"); + expect(body.voice_setting.voice_id).toBe("custom_voice"); + expect(body.voice_setting.speed).toBe(1.5); + }); + + it("throws when API key is missing", async () => { + const savedKey = process.env.MINIMAX_API_KEY; + delete process.env.MINIMAX_API_KEY; + try { + await expect( + provider.synthesize({ + text: "Test", + cfg: {} as never, + providerConfig: {}, + target: "audio-file", + timeoutMs: 30000, + }), + ).rejects.toThrow("MiniMax API key missing"); + } finally { + if (savedKey) process.env.MINIMAX_API_KEY = savedKey; + } + }); + + it("throws on API error with response body", async () => { + vi.mocked(globalThis.fetch).mockResolvedValueOnce( + new Response("Unauthorized", { status: 401 }), + ); + await expect( + provider.synthesize({ + text: "Test", + cfg: {} as never, + providerConfig: { apiKey: "sk-test" }, + target: "audio-file", + timeoutMs: 30000, + }), + ).rejects.toThrow("MiniMax TTS API error (401): Unauthorized"); + }); + }); + + describe("listVoices", () => { + it("returns known voices", async () => { + const voices = await provider.listVoices!({} as never); + expect(voices.length).toBeGreaterThan(0); + expect(voices[0]!.id).toBe("English_expressive_narrator"); + }); + }); +}); diff --git a/extensions/minimax/speech-provider.ts b/extensions/minimax/speech-provider.ts new file mode 100644 index 00000000000..58723744bb4 --- /dev/null +++ b/extensions/minimax/speech-provider.ts @@ -0,0 +1,245 @@ +import { normalizeResolvedSecretInputString } from "openclaw/plugin-sdk/secret-input"; +import type { + 
SpeechDirectiveTokenParseContext,
+  SpeechProviderConfig,
+  SpeechProviderOverrides,
+  SpeechProviderPlugin,
+} from "openclaw/plugin-sdk/speech-core";
+import {
+  DEFAULT_MINIMAX_TTS_BASE_URL,
+  MINIMAX_TTS_MODELS,
+  MINIMAX_TTS_VOICES,
+  minimaxTTS,
+  normalizeMinimaxTtsBaseUrl,
+} from "./tts.js";
+
+type MinimaxTtsProviderConfig = {
+  apiKey?: string;
+  baseUrl: string;
+  model: string;
+  voiceId: string;
+  speed?: number;
+  vol?: number;
+  pitch?: number;
+};
+
+type MinimaxTtsProviderOverrides = {
+  model?: string;
+  voiceId?: string;
+  speed?: number;
+  vol?: number;
+  pitch?: number;
+};
+
+function trimToUndefined(value: unknown): string | undefined {
+  return typeof value === "string" && value.trim() ? value.trim() : undefined;
+}
+
+function asNumber(value: unknown): number | undefined {
+  return typeof value === "number" && Number.isFinite(value) ? value : undefined;
+}
+
+function asObject(value: unknown): Record<string, unknown> | undefined {
+  return typeof value === "object" && value !== null && !Array.isArray(value)
+    ? (value as Record<string, unknown>)
+    : undefined;
+}
+
+function normalizeMinimaxProviderConfig(
+  rawConfig: Record<string, unknown>,
+): MinimaxTtsProviderConfig {
+  const providers = asObject(rawConfig.providers);
+  const raw = asObject(providers?.minimax) ?? asObject(rawConfig.minimax);
+  return {
+    apiKey: normalizeResolvedSecretInputString({
+      value: raw?.apiKey,
+      path: "messages.tts.providers.minimax.apiKey",
+    }),
+    baseUrl: normalizeMinimaxTtsBaseUrl(
+      trimToUndefined(raw?.baseUrl) ??
+        trimToUndefined(process.env.MINIMAX_API_HOST) ??
+        DEFAULT_MINIMAX_TTS_BASE_URL,
+    ),
+    model:
+      trimToUndefined(raw?.model) ??
+      trimToUndefined(process.env.MINIMAX_TTS_MODEL) ??
+      "speech-2.8-hd",
+    voiceId:
+      trimToUndefined(raw?.voiceId) ??
+      trimToUndefined(process.env.MINIMAX_TTS_VOICE_ID) ??
+ "English_expressive_narrator", + speed: asNumber(raw?.speed), + vol: asNumber(raw?.vol), + pitch: asNumber(raw?.pitch), + }; +} + +function readMinimaxProviderConfig(config: SpeechProviderConfig): MinimaxTtsProviderConfig { + const normalized = normalizeMinimaxProviderConfig({}); + return { + apiKey: trimToUndefined(config.apiKey) ?? normalized.apiKey, + baseUrl: trimToUndefined(config.baseUrl) ?? normalized.baseUrl, + model: trimToUndefined(config.model) ?? normalized.model, + voiceId: trimToUndefined(config.voiceId) ?? normalized.voiceId, + speed: asNumber(config.speed) ?? normalized.speed, + vol: asNumber(config.vol) ?? normalized.vol, + pitch: asNumber(config.pitch) ?? normalized.pitch, + }; +} + +function readMinimaxOverrides( + overrides: SpeechProviderOverrides | undefined, +): MinimaxTtsProviderOverrides { + if (!overrides) { + return {}; + } + return { + model: trimToUndefined(overrides.model), + voiceId: trimToUndefined(overrides.voiceId), + speed: asNumber(overrides.speed), + vol: asNumber(overrides.vol), + pitch: asNumber(overrides.pitch), + }; +} + +function parseDirectiveToken(ctx: SpeechDirectiveTokenParseContext): { + handled: boolean; + overrides?: SpeechProviderOverrides; + warnings?: string[]; +} { + switch (ctx.key) { + case "voice": + case "voiceid": + case "voice_id": + case "minimax_voice": + case "minimaxvoice": + if (!ctx.policy.allowVoice) { + return { handled: true }; + } + return { handled: true, overrides: { voiceId: ctx.value } }; + case "model": + case "minimax_model": + case "minimaxmodel": + if (!ctx.policy.allowModelId) { + return { handled: true }; + } + return { handled: true, overrides: { model: ctx.value } }; + case "speed": { + if (!ctx.policy.allowVoiceSettings) { + return { handled: true }; + } + const speed = Number(ctx.value); + if (!Number.isFinite(speed) || speed < 0.5 || speed > 2.0) { + return { handled: true, warnings: [`invalid MiniMax speed "${ctx.value}" (0.5-2.0)`] }; + } + return { handled: true, overrides: { 
speed } }; + } + case "vol": + case "volume": { + if (!ctx.policy.allowVoiceSettings) { + return { handled: true }; + } + const vol = Number(ctx.value); + if (!Number.isFinite(vol) || vol <= 0 || vol > 10) { + return { + handled: true, + warnings: [`invalid MiniMax volume "${ctx.value}" (0-10, exclusive)`], + }; + } + return { handled: true, overrides: { vol } }; + } + case "pitch": { + if (!ctx.policy.allowVoiceSettings) { + return { handled: true }; + } + const pitch = Number(ctx.value); + if (!Number.isFinite(pitch) || pitch < -12 || pitch > 12) { + return { handled: true, warnings: [`invalid MiniMax pitch "${ctx.value}" (-12 to 12)`] }; + } + return { handled: true, overrides: { pitch } }; + } + default: + return { handled: false }; + } +} + +export function buildMinimaxSpeechProvider(): SpeechProviderPlugin { + return { + id: "minimax", + label: "MiniMax", + autoSelectOrder: 40, + models: MINIMAX_TTS_MODELS, + voices: MINIMAX_TTS_VOICES, + resolveConfig: ({ rawConfig }) => normalizeMinimaxProviderConfig(rawConfig), + parseDirectiveToken, + resolveTalkConfig: ({ baseTtsConfig, talkProviderConfig }) => { + const base = normalizeMinimaxProviderConfig(baseTtsConfig); + return { + ...base, + ...(talkProviderConfig.apiKey === undefined + ? {} + : { + apiKey: normalizeResolvedSecretInputString({ + value: talkProviderConfig.apiKey, + path: "talk.providers.minimax.apiKey", + }), + }), + ...(trimToUndefined(talkProviderConfig.baseUrl) == null + ? {} + : { baseUrl: normalizeMinimaxTtsBaseUrl(trimToUndefined(talkProviderConfig.baseUrl)) }), + ...(trimToUndefined(talkProviderConfig.modelId) == null + ? {} + : { model: trimToUndefined(talkProviderConfig.modelId) }), + ...(trimToUndefined(talkProviderConfig.voiceId) == null + ? {} + : { voiceId: trimToUndefined(talkProviderConfig.voiceId) }), + ...(asNumber(talkProviderConfig.speed) == null + ? {} + : { speed: asNumber(talkProviderConfig.speed) }), + ...(asNumber(talkProviderConfig.vol) == null + ? 
{} + : { vol: asNumber(talkProviderConfig.vol) }), + ...(asNumber(talkProviderConfig.pitch) == null + ? {} + : { pitch: asNumber(talkProviderConfig.pitch) }), + }; + }, + resolveTalkOverrides: ({ params }) => ({ + ...(trimToUndefined(params.voiceId) == null + ? {} + : { voiceId: trimToUndefined(params.voiceId) }), + ...(trimToUndefined(params.modelId) == null + ? {} + : { model: trimToUndefined(params.modelId) }), + ...(asNumber(params.speed) == null ? {} : { speed: asNumber(params.speed) }), + }), + listVoices: async () => MINIMAX_TTS_VOICES.map((voice) => ({ id: voice, name: voice })), + isConfigured: ({ providerConfig }) => + Boolean(readMinimaxProviderConfig(providerConfig).apiKey || process.env.MINIMAX_API_KEY), + synthesize: async (req) => { + const config = readMinimaxProviderConfig(req.providerConfig); + const overrides = readMinimaxOverrides(req.providerOverrides); + const apiKey = config.apiKey || process.env.MINIMAX_API_KEY; + if (!apiKey) { + throw new Error("MiniMax API key missing"); + } + const audioBuffer = await minimaxTTS({ + text: req.text, + apiKey, + baseUrl: config.baseUrl, + model: overrides.model ?? config.model, + voiceId: overrides.voiceId ?? config.voiceId, + speed: overrides.speed ?? config.speed, + vol: overrides.vol ?? config.vol, + pitch: overrides.pitch ?? 
config.pitch,
+        timeoutMs: req.timeoutMs,
+      });
+      return {
+        audioBuffer,
+        outputFormat: "mp3",
+        fileExtension: ".mp3",
+        voiceCompatible: false,
+      };
+    },
+  };
+}
diff --git a/extensions/minimax/tts.ts b/extensions/minimax/tts.ts
new file mode 100644
index 00000000000..bbf609438cc
--- /dev/null
+++ b/extensions/minimax/tts.ts
@@ -0,0 +1,90 @@
+export const DEFAULT_MINIMAX_TTS_BASE_URL = "https://api.minimaxi.com";
+
+export const MINIMAX_TTS_MODELS = ["speech-2.8-hd", "speech-01-240228"] as const;
+
+export const MINIMAX_TTS_VOICES = [
+  "English_expressive_narrator",
+  "Chinese (Mandarin)_Warm_Girl",
+  "Chinese (Mandarin)_Lively_Girl",
+  "Chinese (Mandarin)_Gentle_Boy",
+  "Chinese (Mandarin)_Steady_Boy",
+] as const;
+
+export function normalizeMinimaxTtsBaseUrl(baseUrl?: string): string {
+  const trimmed = baseUrl?.trim();
+  if (!trimmed) {
+    return DEFAULT_MINIMAX_TTS_BASE_URL;
+  }
+  return trimmed.replace(/\/+$/, "");
+}
+
+export async function minimaxTTS(params: {
+  text: string;
+  apiKey: string;
+  baseUrl: string;
+  model: string;
+  voiceId: string;
+  speed?: number;
+  vol?: number;
+  pitch?: number;
+  format?: string;
+  sampleRate?: number;
+  timeoutMs: number;
+}): Promise<Buffer> {
+  const {
+    text,
+    apiKey,
+    baseUrl,
+    model,
+    voiceId,
+    speed = 1.0,
+    vol = 1.0,
+    pitch = 0,
+    format = "mp3",
+    sampleRate = 32000,
+    timeoutMs,
+  } = params;
+
+  const controller = new AbortController();
+  const timeout = setTimeout(() => controller.abort(), timeoutMs);
+
+  try {
+    const response = await fetch(`${baseUrl}/v1/t2a_v2`, {
+      method: "POST",
+      headers: {
+        Authorization: `Bearer ${apiKey}`,
+        "Content-Type": "application/json",
+      },
+      body: JSON.stringify({
+        model,
+        text,
+        voice_setting: {
+          voice_id: voiceId,
+          speed,
+          vol,
+          pitch,
+        },
+        audio_setting: {
+          format,
+          sample_rate: sampleRate,
+        },
+      }),
+      signal: controller.signal,
+    });
+
+    if (!response.ok) {
+      const errBody = await response.text().catch(() => "");
+      throw new Error(`MiniMax TTS API error (${response.status})${errBody ? `: ${errBody}` : ""}`);
+    }
+
+    const body = (await response.json()) as { data?: { audio?: string } };
+    const hexAudio = body?.data?.audio;
+    if (!hexAudio) {
+      throw new Error("MiniMax TTS API returned no audio data");
+    }
+
+    return Buffer.from(hexAudio, "hex");
+  } finally {
+    clearTimeout(timeout);
+  }
+}