fix(minimax): normalize tts pitch for api

2026-05-06 07:40:44 +00:00 · 2026-04-25 04:57:57 +01:00
parent 5d724863bb
commit 978a50a3c5
5 changed files with 22 additions and 4 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -91,6 +91,7 @@ Docs: https://docs.openclaw.ai
 - Plugins/OpenCode: strip unsupported disabled Responses reasoning payloads for OpenCode image understanding. Fixes #70252.
 - Plugins/OpenCode/OpenCode Go: register image understanding metadata so the image tool is available for OpenCode catalog models with vision support. Fixes #70482 and #61789.
 - Providers/ElevenLabs: omit the MP3-only `Accept` header for PCM telephony synthesis, so Voice Call requests for `pcm_22050` no longer receive MP3 audio. Fixes #67340. Thanks @marcchabot.
+- Providers/MiniMax TTS: truncate fractional pitch overrides before sending T2A requests, matching MiniMax's integer pitch contract while preserving fractional speed and volume. Fixes #62144.
 - Providers/MiniMax TTS: transcode voice-note targets to Opus so Feishu/Telegram receive native voice messages instead of MP3 file attachments. Fixes #63540, #64134, and #70445.
 - Providers/Microsoft TTS: keep allowlisted bundled speech providers discoverable even when another speech plugin has already registered, so Edge/Microsoft TTS is available alongside OpenAI. Fixes #62117 and #66850.
 - Providers/Microsoft TTS: honor legacy `messages.tts.providers.edge` voice settings after normalizing Edge TTS to the Microsoft provider. Fixes #64153.
--- a/docs/providers/minimax.md
+++ b/docs/providers/minimax.md
@@ -255,12 +255,17 @@ The bundled `minimax` plugin registers MiniMax T2A v2 as a speech provider for
 - Voice-note targets such as Feishu and Telegram are transcoded from MiniMax
  MP3 to 48kHz Opus with `ffmpeg`, because the Feishu/Lark file API only
  accepts `file_type: "opus"` for native audio messages.
+- MiniMax T2A accepts fractional `speed` and `vol`, but requires an integer
+  `pitch`; OpenClaw truncates fractional `pitch` values before the API request.

 | Setting                                  | Env var                | Default                       | Description                      |
 | ---------------------------------------- | ---------------------- | ----------------------------- | -------------------------------- |
 | `messages.tts.providers.minimax.baseUrl` | `MINIMAX_API_HOST`     | `https://api.minimax.io`      | MiniMax T2A API host.            |
 | `messages.tts.providers.minimax.model`   | `MINIMAX_TTS_MODEL`    | `speech-2.8-hd`               | TTS model id.                    |
 | `messages.tts.providers.minimax.voiceId` | `MINIMAX_TTS_VOICE_ID` | `English_expressive_narrator` | Voice id used for speech output. |
+| `messages.tts.providers.minimax.speed`   |                        | `1.0`                         | Playback speed, `0.5..2.0`.      |
+| `messages.tts.providers.minimax.vol`     |                        | `1.0`                         | Volume, `(0, 10]`.               |
+| `messages.tts.providers.minimax.pitch`   |                        | `0`                           | Integer pitch shift, `-12..12`.  |

 ### Music generation

--- a/docs/tools/tts.md
+++ b/docs/tools/tts.md
@@ -374,7 +374,7 @@ Then run:
 - `providers.minimax.voiceId`: voice identifier (default `English_expressive_narrator`, env: `MINIMAX_TTS_VOICE_ID`).
 - `providers.minimax.speed`: playback speed `0.5..2.0` (default 1.0).
 - `providers.minimax.vol`: volume `(0, 10]` (default 1.0; must be greater than 0).
- `providers.minimax.pitch`: pitch shift `-12..12` (default 0).
+- `providers.minimax.pitch`: integer pitch shift `-12..12` (default 0). Fractional values are truncated before calling MiniMax T2A because the API rejects non-integer pitch values.
 - `providers.google.model`: Gemini TTS model (default `gemini-3.1-flash-tts-preview`).
 - `providers.google.voiceName`: Gemini prebuilt voice name (default `Kore`; `voice` is also accepted).
 - `providers.google.baseUrl`: override the Gemini API base URL. Only `https://generativelanguage.googleapis.com` is accepted.
@@ -432,7 +432,7 @@ Available directive keys (when enabled):
 - `model` (OpenAI TTS model, ElevenLabs model id, or MiniMax model) or `google_model` (Google TTS model)
 - `stability`, `similarityBoost`, `style`, `speed`, `useSpeakerBoost`
 - `vol` / `volume` (MiniMax volume, 0-10)
- `pitch` (MiniMax pitch, -12 to 12)
+- `pitch` (MiniMax integer pitch, -12 to 12; fractional values are truncated before the MiniMax request)
 - `applyTextNormalization` (`auto|on|off`)
 - `languageCode` (ISO 639-1)
 - `seed`
--- a/extensions/minimax/speech-provider.test.ts
+++ b/extensions/minimax/speech-provider.test.ts
@@ -309,7 +309,13 @@ describe("buildMinimaxSpeechProvider", () => {
        text: "Test",
        cfg: {} as never,
        providerConfig: { apiKey: "sk-test" },
-        providerOverrides: { model: "speech-01-240228", voiceId: "custom_voice", speed: 1.5 },
+        providerOverrides: {
+          model: "speech-01-240228",
+          voiceId: "custom_voice",
+          speed: 1.5,
+          vol: 1.5,
+          pitch: 0.5,
+        },
        target: "audio-file",
        timeoutMs: 30000,
      });
@@ -318,6 +324,8 @@ describe("buildMinimaxSpeechProvider", () => {
      expect(body.model).toBe("speech-01-240228");
      expect(body.voice_setting.voice_id).toBe("custom_voice");
      expect(body.voice_setting.speed).toBe(1.5);
+      expect(body.voice_setting.vol).toBe(1.5);
+      expect(body.voice_setting.pitch).toBe(0);
    });

    it("throws when API key is missing", async () => {
--- a/extensions/minimax/tts.ts
+++ b/extensions/minimax/tts.ts
@@ -24,6 +24,10 @@ export function normalizeMinimaxTtsBaseUrl(baseUrl?: string): string {
  return trimmed.replace(/\/+$/, "");
 }

+function normalizeMinimaxTtsPitch(pitch: number): number {
+  return Math.trunc(pitch); // MiniMax T2A takes an integer pitch: drop the fractional part (truncates toward zero, e.g. 0.5 -> 0).
+}
+
 export async function minimaxTTS(params: {
  text: string;
  apiKey: string;
@@ -70,7 +74,7 @@ export async function minimaxTTS(params: {
            voice_id: voiceId,
            speed,
            vol,
-            pitch,
+            pitch: normalizeMinimaxTtsPitch(pitch),
          },
          audio_setting: {
            format,