diff --git a/CHANGELOG.md b/CHANGELOG.md index 1bbb77b297a..638ae7d25d3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -91,6 +91,7 @@ Docs: https://docs.openclaw.ai - Plugins/OpenCode: strip unsupported disabled Responses reasoning payloads for OpenCode image understanding. Fixes #70252. - Plugins/OpenCode/OpenCode Go: register image understanding metadata so the image tool is available for OpenCode catalog models with vision support. Fixes #70482 and #61789. - Providers/ElevenLabs: omit the MP3-only `Accept` header for PCM telephony synthesis, so Voice Call requests for `pcm_22050` no longer receive MP3 audio. Fixes #67340. Thanks @marcchabot. +- Providers/MiniMax TTS: truncate fractional pitch overrides before sending T2A requests, matching MiniMax's integer pitch contract while preserving fractional speed and volume. Fixes #62144. - Providers/MiniMax TTS: transcode voice-note targets to Opus so Feishu/Telegram receive native voice messages instead of MP3 file attachments. Fixes #63540, #64134, and #70445. - Providers/Microsoft TTS: keep allowlisted bundled speech providers discoverable even when another speech plugin has already registered, so Edge/Microsoft TTS is available alongside OpenAI. Fixes #62117 and #66850. - Providers/Microsoft TTS: honor legacy `messages.tts.providers.edge` voice settings after normalizing Edge TTS to the Microsoft provider. Fixes #64153. diff --git a/docs/providers/minimax.md b/docs/providers/minimax.md index fd3309c2d9a..1c928780ac7 100644 --- a/docs/providers/minimax.md +++ b/docs/providers/minimax.md @@ -255,12 +255,17 @@ The bundled `minimax` plugin registers MiniMax T2A v2 as a speech provider for - Voice-note targets such as Feishu and Telegram are transcoded from MiniMax MP3 to 48kHz Opus with `ffmpeg`, because the Feishu/Lark file API only accepts `file_type: "opus"` for native audio messages. 
+- MiniMax T2A accepts fractional `speed` and `vol`, but `pitch` must be an + integer; OpenClaw truncates fractional `pitch` values toward zero (for example, `1.5` becomes `1` and `-1.5` becomes `-1`) before the API request. | Setting | Env var | Default | Description | | ---------------------------------------- | ---------------------- | ----------------------------- | -------------------------------- | | `messages.tts.providers.minimax.baseUrl` | `MINIMAX_API_HOST` | `https://api.minimax.io` | MiniMax T2A API host. | | `messages.tts.providers.minimax.model` | `MINIMAX_TTS_MODEL` | `speech-2.8-hd` | TTS model id. | | `messages.tts.providers.minimax.voiceId` | `MINIMAX_TTS_VOICE_ID` | `English_expressive_narrator` | Voice id used for speech output. | +| `messages.tts.providers.minimax.speed` | | `1.0` | Playback speed, `0.5..2.0`. | +| `messages.tts.providers.minimax.vol` | | `1.0` | Volume, `(0, 10]`. | +| `messages.tts.providers.minimax.pitch` | | `0` | Integer pitch shift, `-12..12`. | ### Music generation diff --git a/docs/tools/tts.md b/docs/tools/tts.md index 418a9000b77..6010b8093a7 100644 --- a/docs/tools/tts.md +++ b/docs/tools/tts.md @@ -374,7 +374,7 @@ Then run: - `providers.minimax.voiceId`: voice identifier (default `English_expressive_narrator`, env: `MINIMAX_TTS_VOICE_ID`). - `providers.minimax.speed`: playback speed `0.5..2.0` (default 1.0). - `providers.minimax.vol`: volume `(0, 10]` (default 1.0; must be greater than 0). -- `providers.minimax.pitch`: pitch shift `-12..12` (default 0). +- `providers.minimax.pitch`: integer pitch shift `-12..12` (default 0). Fractional values are truncated before calling MiniMax T2A because the API rejects non-integer pitch values. - `providers.google.model`: Gemini TTS model (default `gemini-3.1-flash-tts-preview`). - `providers.google.voiceName`: Gemini prebuilt voice name (default `Kore`; `voice` is also accepted). - `providers.google.baseUrl`: override the Gemini API base URL. Only `https://generativelanguage.googleapis.com` is accepted. 
@@ -432,7 +432,7 @@ Available directive keys (when enabled): - `model` (OpenAI TTS model, ElevenLabs model id, or MiniMax model) or `google_model` (Google TTS model) - `stability`, `similarityBoost`, `style`, `speed`, `useSpeakerBoost` - `vol` / `volume` (MiniMax volume, 0-10) -- `pitch` (MiniMax pitch, -12 to 12) +- `pitch` (MiniMax integer pitch, -12 to 12; fractional values are truncated before the MiniMax request) - `applyTextNormalization` (`auto|on|off`) - `languageCode` (ISO 639-1) - `seed` diff --git a/extensions/minimax/speech-provider.test.ts b/extensions/minimax/speech-provider.test.ts index 9daf2d1e4b1..6d62ce3faa6 100644 --- a/extensions/minimax/speech-provider.test.ts +++ b/extensions/minimax/speech-provider.test.ts @@ -309,7 +309,13 @@ describe("buildMinimaxSpeechProvider", () => { text: "Test", cfg: {} as never, providerConfig: { apiKey: "sk-test" }, - providerOverrides: { model: "speech-01-240228", voiceId: "custom_voice", speed: 1.5 }, + providerOverrides: { + model: "speech-01-240228", + voiceId: "custom_voice", + speed: 1.5, + vol: 1.5, + pitch: 0.5, + }, target: "audio-file", timeoutMs: 30000, }); @@ -318,6 +324,8 @@ describe("buildMinimaxSpeechProvider", () => { expect(body.model).toBe("speech-01-240228"); expect(body.voice_setting.voice_id).toBe("custom_voice"); expect(body.voice_setting.speed).toBe(1.5); + expect(body.voice_setting.vol).toBe(1.5); + expect(body.voice_setting.pitch).toBe(0); }); it("throws when API key is missing", async () => { diff --git a/extensions/minimax/tts.ts b/extensions/minimax/tts.ts index d5b3e07560d..364e786bebb 100644 --- a/extensions/minimax/tts.ts +++ b/extensions/minimax/tts.ts @@ -24,6 +24,10 @@ export function normalizeMinimaxTtsBaseUrl(baseUrl?: string): string { return trimmed.replace(/\/+$/, ""); } +function normalizeMinimaxTtsPitch(pitch: number): number { + return Math.trunc(pitch); +} + export async function minimaxTTS(params: { text: string; apiKey: string; @@ -70,7 +74,7 @@ export async function 
minimaxTTS(params: { voice_id: voiceId, speed, vol, - pitch, + pitch: normalizeMinimaxTtsPitch(pitch), }, audio_setting: { format,