From ec8dbc4595583816f71af2568df61dbe7c090e1f Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Sat, 25 Apr 2026 09:47:52 +0100 Subject: [PATCH] feat(tts): add xiaomi mimo speech provider --- CHANGELOG.md | 1 + docs/providers/xiaomi.md | 40 +++ docs/tools/tts.md | 52 +++- extensions/google/google.live.test.ts | 4 +- extensions/minimax/minimax.live.test.ts | 54 +++- extensions/xiaomi/index.ts | 4 + extensions/xiaomi/openclaw.plugin.json | 3 + extensions/xiaomi/speech-provider.test.ts | 250 ++++++++++++++++ extensions/xiaomi/speech-provider.ts | 336 ++++++++++++++++++++++ extensions/xiaomi/xiaomi.live.test.ts | 55 ++++ 10 files changed, 789 insertions(+), 10 deletions(-) create mode 100644 extensions/xiaomi/speech-provider.test.ts create mode 100644 extensions/xiaomi/speech-provider.ts create mode 100644 extensions/xiaomi/xiaomi.live.test.ts diff --git a/CHANGELOG.md b/CHANGELOG.md index 99dd51120c8..9e869cf4ce2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,7 @@ Docs: https://docs.openclaw.ai - Diagnostics/OTEL: support `OPENCLAW_OTEL_PRELOADED=1` so the plugin can reuse an already-registered OpenTelemetry SDK while keeping OpenClaw diagnostic listeners wired. (#71450) Thanks @vincentkoc and @jlapenna. - Control UI: refine the agent Tool Access panel with compact live-tool chips, collapsible tool groups, direct per-tool toggles, and clearer runtime/source provenance. (#71405) Thanks @BunsDev. - Memory-core/hybrid search: expose raw `vectorScore` and `textScore` alongside the combined `score` on hybrid memory search results, so callers can inspect vector-versus-text retrieval contribution before temporal decay or MMR reordering. Fixes #68166. (#68286) Thanks @ajfonthemove. +- Providers/Xiaomi: add MiMo TTS as a bundled speech provider with MP3/WAV output and voice-note Opus transcoding. Fixes #52376. (#55614) Thanks @zoujiejun. 
### Fixes diff --git a/docs/providers/xiaomi.md b/docs/providers/xiaomi.md index ce5f328d61c..93cc0aa8e8e 100644 --- a/docs/providers/xiaomi.md +++ b/docs/providers/xiaomi.md @@ -53,6 +53,46 @@ OpenAI-compatible endpoint with API-key authentication. The default model ref is `xiaomi/mimo-v2-flash`. The provider is injected automatically when `XIAOMI_API_KEY` is set or an auth profile exists. +## Text-to-speech + +The bundled `xiaomi` plugin also registers Xiaomi MiMo as a speech provider for +`messages.tts`. It calls Xiaomi's chat-completions TTS contract with the text as +an `assistant` message and optional style guidance as a `user` message. + +| Property | Value | +| -------- | ---------------------------------------- | +| TTS id | `xiaomi` (`mimo` alias) | +| Auth | `XIAOMI_API_KEY` | +| API | `POST /v1/chat/completions` with `audio` | +| Default | `mimo-v2.5-tts`, voice `mimo_default` | +| Output | MP3 by default; WAV when configured | + +```json5 +{ + messages: { + tts: { + auto: "always", + provider: "xiaomi", + providers: { + xiaomi: { + apiKey: "xiaomi_api_key", + model: "mimo-v2.5-tts", + voice: "mimo_default", + format: "mp3", + style: "Bright, natural, conversational tone.", + }, + }, + }, + }, +} +``` + +Supported built-in voices include `mimo_default`, `default_zh`, `default_en`, +`Mia`, `Chloe`, `Milo`, and `Dean`. `mimo-v2-tts` is supported for older MiMo +TTS accounts; the default uses the current MiMo-V2.5 TTS model. For voice-note +targets such as Feishu and Telegram, OpenClaw transcodes Xiaomi output to 48kHz +Opus with `ffmpeg` before delivery. + ## Config example ```json5 diff --git a/docs/tools/tts.md b/docs/tools/tts.md index 2d7c6cb9c99..6f8529eb180 100644 --- a/docs/tools/tts.md +++ b/docs/tools/tts.md @@ -7,7 +7,7 @@ read_when: title: "Text-to-speech" --- -OpenClaw can convert outbound replies into audio using ElevenLabs, Google Gemini, Gradium, Microsoft, MiniMax, OpenAI, Vydra, or xAI. 
+OpenClaw can convert outbound replies into audio using ElevenLabs, Google Gemini, Gradium, Microsoft, MiniMax, OpenAI, Vydra, xAI, or Xiaomi MiMo. It works anywhere OpenClaw can send audio. ## Supported services @@ -20,6 +20,7 @@ It works anywhere OpenClaw can send audio. - **OpenAI** (primary or fallback provider; also used for summaries) - **Vydra** (primary or fallback provider; shared image, video, and speech provider) - **xAI** (primary or fallback provider; uses the xAI TTS API) +- **Xiaomi MiMo** (primary or fallback provider; uses MiMo TTS through Xiaomi chat completions) ### Microsoft speech notes @@ -36,7 +37,7 @@ or ElevenLabs. ## Optional keys -If you want OpenAI, ElevenLabs, Google Gemini, Gradium, MiniMax, Vydra, or xAI: +If you want OpenAI, ElevenLabs, Google Gemini, Gradium, MiniMax, Vydra, xAI, or Xiaomi MiMo: - `ELEVENLABS_API_KEY` (or `XI_API_KEY`) - `GEMINI_API_KEY` (or `GOOGLE_API_KEY`) @@ -45,6 +46,7 @@ If you want OpenAI, ElevenLabs, Google Gemini, Gradium, MiniMax, Vydra, or xAI: - `OPENAI_API_KEY` - `VYDRA_API_KEY` - `XAI_API_KEY` +- `XIAOMI_API_KEY` Microsoft speech does **not** require an API key. @@ -60,6 +62,7 @@ so that provider must also be authenticated if you enable summaries. - [ElevenLabs Authentication](https://elevenlabs.io/docs/api-reference/authentication) - [Gradium](/providers/gradium) - [MiniMax T2A v2 API](https://platform.minimaxi.com/document/T2A%20V2) +- [Xiaomi MiMo speech synthesis](/providers/xiaomi#text-to-speech) - [node-edge-tts](https://github.com/SchneeHertz/node-edge-tts) - [Microsoft Speech output formats](https://learn.microsoft.com/azure/ai-services/speech-service/rest-text-to-speech#audio-outputs) - [xAI Text to Speech](https://docs.x.ai/developers/rest-api-reference/inference/voice#text-to-speech-rest) @@ -231,6 +234,34 @@ Resolution order is `messages.tts.providers.xai.apiKey` -> `XAI_API_KEY`. Current live voices are `ara`, `eve`, `leo`, `rex`, `sal`, and `una`; `eve` is the default. 
`language` accepts a BCP-47 tag or `auto`. +### Xiaomi MiMo primary + +```json5 +{ + messages: { + tts: { + auto: "always", + provider: "xiaomi", + providers: { + xiaomi: { + apiKey: "xiaomi_api_key", + baseUrl: "https://api.xiaomimimo.com/v1", + model: "mimo-v2.5-tts", + voice: "mimo_default", + format: "mp3", + style: "Bright, natural, conversational tone.", + }, + }, + }, + }, +} +``` + +Xiaomi MiMo TTS uses the same `XIAOMI_API_KEY` path as the bundled Xiaomi model +provider. The speech provider id is `xiaomi`; `mimo` is accepted as an alias. +The target text is sent as the assistant message, matching Xiaomi's TTS +contract. Optional `style` is sent as a user instruction and is not spoken. + ### OpenRouter primary ```json5 @@ -345,7 +376,7 @@ Then run: - `tagged` only sends audio when the reply includes `[[tts:key=value]]` directives or a `[[tts:text]]...[[/tts:text]]` block. - `enabled`: legacy toggle (doctor migrates this to `auto`). - `mode`: `"final"` (default) or `"all"` (includes tool/block replies). -- `provider`: speech provider id such as `"elevenlabs"`, `"google"`, `"gradium"`, `"microsoft"`, `"minimax"`, `"openai"`, `"vydra"`, or `"xai"` (fallback is automatic). +- `provider`: speech provider id such as `"elevenlabs"`, `"google"`, `"gradium"`, `"microsoft"`, `"minimax"`, `"openai"`, `"vydra"`, `"xai"`, or `"xiaomi"` (fallback is automatic). - If `provider` is **unset**, OpenClaw uses the first configured speech provider in registry auto-select order. - Legacy `provider: "edge"` config is repaired by `openclaw doctor --fix` and rewritten to `provider: "microsoft"`. @@ -359,7 +390,7 @@ Then run: - `maxTextLength`: hard cap for TTS input (chars). `/tts audio` fails if exceeded. - `timeoutMs`: request timeout (ms). - `prefsPath`: override the local prefs JSON path (provider/limit/summary). 
-- `apiKey` values fall back to env vars (`ELEVENLABS_API_KEY`/`XI_API_KEY`, `GEMINI_API_KEY`/`GOOGLE_API_KEY`, `GRADIUM_API_KEY`, `MINIMAX_API_KEY`, `OPENAI_API_KEY`, `VYDRA_API_KEY`, `XAI_API_KEY`). +- `apiKey` values fall back to env vars (`ELEVENLABS_API_KEY`/`XI_API_KEY`, `GEMINI_API_KEY`/`GOOGLE_API_KEY`, `GRADIUM_API_KEY`, `MINIMAX_API_KEY`, `OPENAI_API_KEY`, `VYDRA_API_KEY`, `XAI_API_KEY`, `XIAOMI_API_KEY`). - `providers.elevenlabs.baseUrl`: override ElevenLabs API base URL. - `providers.openai.baseUrl`: override the OpenAI TTS endpoint. - Resolution order: `messages.tts.providers.openai.baseUrl` -> `OPENAI_TTS_BASE_URL` -> `https://api.openai.com/v1` @@ -391,6 +422,12 @@ Then run: - `providers.xai.language`: BCP-47 language code or `auto` (default `en`). - `providers.xai.responseFormat`: `mp3`, `wav`, `pcm`, `mulaw`, or `alaw` (default `mp3`). - `providers.xai.speed`: provider-native speed override. +- `providers.xiaomi.apiKey`: Xiaomi MiMo API key (env: `XIAOMI_API_KEY`). +- `providers.xiaomi.baseUrl`: override the Xiaomi MiMo API base URL (default `https://api.xiaomimimo.com/v1`, env: `XIAOMI_BASE_URL`). +- `providers.xiaomi.model`: TTS model (default `mimo-v2.5-tts`, env: `XIAOMI_TTS_MODEL`; `mimo-v2-tts` is also supported). +- `providers.xiaomi.voice`: MiMo voice id (default `mimo_default`, env: `XIAOMI_TTS_VOICE`). +- `providers.xiaomi.format`: `mp3` or `wav` (default `mp3`, env: `XIAOMI_TTS_FORMAT`). +- `providers.xiaomi.style`: optional natural-language style instruction sent as the user message; it is not spoken. - `providers.openrouter.apiKey`: OpenRouter API key (env: `OPENROUTER_API_KEY`; can reuse `models.providers.openrouter.apiKey`). - `providers.openrouter.baseUrl`: override the OpenRouter TTS base URL (default `https://openrouter.ai/api/v1`; legacy `https://openrouter.ai/v1` is normalized). - `providers.openrouter.model`: OpenRouter TTS model id (default `hexgrad/kokoro-82m`; `modelId` is also accepted). @@ -432,9 +469,9 @@ Here you go. 
Available directive keys (when enabled): -- `provider` (registered speech provider id, for example `openai`, `elevenlabs`, `google`, `gradium`, `minimax`, `microsoft`, `vydra`, or `xai`; requires `allowProvider: true`) -- `voice` (OpenAI or Gradium voice), `voiceName` / `voice_name` / `google_voice` (Google voice), or `voiceId` (ElevenLabs / Gradium / MiniMax / xAI) -- `model` (OpenAI TTS model, ElevenLabs model id, or MiniMax model) or `google_model` (Google TTS model) +- `provider` (registered speech provider id, for example `openai`, `elevenlabs`, `google`, `gradium`, `minimax`, `microsoft`, `vydra`, `xai`, or `xiaomi`; requires `allowProvider: true`) +- `voice` (OpenAI, Gradium, or Xiaomi voice), `voiceName` / `voice_name` / `google_voice` (Google voice), or `voiceId` (ElevenLabs / Gradium / MiniMax / xAI) +- `model` (OpenAI TTS model, ElevenLabs model id, MiniMax model, or Xiaomi MiMo TTS model) or `google_model` (Google TTS model) - `stability`, `similarityBoost`, `style`, `speed`, `useSpeakerBoost` - `vol` / `volume` (MiniMax volume, 0-10) - `pitch` (MiniMax integer pitch, -12 to 12; fractional values are truncated before the MiniMax request) @@ -498,6 +535,7 @@ These override `messages.tts.*` for that host. - **Other channels**: MP3 (`mp3_44100_128` from ElevenLabs, `mp3` from OpenAI). - 44.1kHz / 128kbps is the default balance for speech clarity. - **MiniMax**: MP3 (`speech-2.8-hd` model, 32kHz sample rate) for normal audio attachments. For voice-note targets such as Feishu and Telegram, OpenClaw transcodes the MiniMax MP3 to 48kHz Opus with `ffmpeg` before delivery. +- **Xiaomi MiMo**: MP3 by default, or WAV when configured. For voice-note targets such as Feishu and Telegram, OpenClaw transcodes Xiaomi output to 48kHz Opus with `ffmpeg` before delivery. - **Google Gemini**: Gemini API TTS returns raw 24kHz PCM. OpenClaw wraps it as WAV for audio attachments and returns PCM directly for Talk/telephony. 
Native Opus voice-note format is not supported by this path. - **Gradium**: WAV for audio attachments, Opus for voice-note targets, and `ulaw_8000` at 8 kHz for telephony. - **xAI**: MP3 by default; `responseFormat` may be `mp3`, `wav`, `pcm`, `mulaw`, or `alaw`. OpenClaw uses xAI's batch REST TTS endpoint and returns a complete audio attachment; xAI's streaming TTS WebSocket is not used by this provider path. Native Opus voice-note format is not supported by this path. diff --git a/extensions/google/google.live.test.ts b/extensions/google/google.live.test.ts index 62eede1edd4..39751b5d635 100644 --- a/extensions/google/google.live.test.ts +++ b/extensions/google/google.live.test.ts @@ -43,7 +43,7 @@ describeLive("google plugin live", () => { const speechProvider = requireRegisteredProvider(speechProviders, "google"); const mediaProvider = requireRegisteredProvider(mediaProviders, "google"); - const phrase = "Testing Google audio transcription with OpenClaw."; + const phrase = "Testing Google audio transcription with pineapple."; const audioFile = await speechProvider.synthesize({ text: phrase, cfg: { plugins: { enabled: true } } as never, @@ -62,7 +62,7 @@ describeLive("google plugin live", () => { const normalized = normalizeTranscriptForMatch(transcript?.text ?? 
""); expect(normalized).toContain("google"); - expect(normalized).toContain("openclaw"); + expect(normalized).toContain("pineapple"); }, 180_000); it("runs Gemini web search through the registered provider tool", async () => { diff --git a/extensions/minimax/minimax.live.test.ts b/extensions/minimax/minimax.live.test.ts index 9e3aa7c4eb3..7b1f26f5d79 100644 --- a/extensions/minimax/minimax.live.test.ts +++ b/extensions/minimax/minimax.live.test.ts @@ -1,14 +1,30 @@ import { describe, expect, it } from "vitest"; import { isLiveTestEnabled } from "../../src/agents/live-test-helpers.js"; +import { + registerProviderPlugin, + requireRegisteredProvider, +} from "../../test/helpers/plugins/provider-registration.js"; +import plugin from "./index.js"; +import { buildMinimaxSpeechProvider } from "./speech-provider.js"; import { createMiniMaxWebSearchProvider } from "./src/minimax-web-search-provider.js"; +const MINIMAX_API_KEY = process.env.MINIMAX_API_KEY?.trim() ?? ""; const MINIMAX_SEARCH_KEY = process.env.MINIMAX_CODE_PLAN_KEY?.trim() || process.env.MINIMAX_CODING_API_KEY?.trim() || - process.env.MINIMAX_API_KEY?.trim() || + MINIMAX_API_KEY || ""; const describeLive = isLiveTestEnabled() && MINIMAX_SEARCH_KEY.length > 0 ? describe : describe.skip; +const describeTtsLive = + isLiveTestEnabled() && MINIMAX_API_KEY.length > 0 ? 
describe : describe.skip; + +const registerMinimaxPlugin = () => + registerProviderPlugin({ + plugin, + id: "minimax", + name: "MiniMax Provider", + }); describeLive("minimax plugin live", () => { it("runs MiniMax web search through the provider tool", async () => { @@ -25,3 +41,39 @@ describeLive("minimax plugin live", () => { expect(Array.isArray(result?.results)).toBe(true); }, 120_000); }); + +describeTtsLive("minimax tts live", () => { + it("synthesizes TTS through the registered speech provider", async () => { + const { speechProviders } = await registerMinimaxPlugin(); + const provider = requireRegisteredProvider(speechProviders, "minimax"); + + const audioFile = await provider.synthesize({ + text: "OpenClaw MiniMax text to speech integration test OK.", + cfg: { plugins: { enabled: true } } as never, + providerConfig: { apiKey: MINIMAX_API_KEY }, + target: "audio-file", + timeoutMs: 90_000, + }); + + expect(audioFile.outputFormat).toBe("mp3"); + expect(audioFile.fileExtension).toBe(".mp3"); + expect(audioFile.audioBuffer.byteLength).toBeGreaterThan(512); + }, 120_000); + + it("synthesizes MiniMax TTS as an Opus voice note", async () => { + const provider = buildMinimaxSpeechProvider(); + + const voiceNote = await provider.synthesize({ + text: "OpenClaw MiniMax voice note test OK.", + cfg: { plugins: { enabled: true } } as never, + providerConfig: { apiKey: MINIMAX_API_KEY }, + target: "voice-note", + timeoutMs: 90_000, + }); + + expect(voiceNote.outputFormat).toBe("opus"); + expect(voiceNote.fileExtension).toBe(".opus"); + expect(voiceNote.voiceCompatible).toBe(true); + expect(voiceNote.audioBuffer.byteLength).toBeGreaterThan(512); + }, 120_000); +}); diff --git a/extensions/xiaomi/index.ts b/extensions/xiaomi/index.ts index 350e812d577..f2478e70e2e 100644 --- a/extensions/xiaomi/index.ts +++ b/extensions/xiaomi/index.ts @@ -2,6 +2,7 @@ import { defineSingleProviderPluginEntry } from "openclaw/plugin-sdk/provider-en import { PROVIDER_LABELS } from 
"openclaw/plugin-sdk/provider-usage"; import { applyXiaomiConfig, XIAOMI_DEFAULT_MODEL_REF } from "./onboard.js"; import { buildXiaomiProvider } from "./provider-catalog.js"; +import { buildXiaomiSpeechProvider } from "./speech-provider.js"; const PROVIDER_ID = "xiaomi"; @@ -40,4 +41,7 @@ export default defineSingleProviderPluginEntry({ windows: [], }), }, + register(api) { + api.registerSpeechProvider(buildXiaomiSpeechProvider()); + }, }); diff --git a/extensions/xiaomi/openclaw.plugin.json b/extensions/xiaomi/openclaw.plugin.json index 981e4bcb9ed..9e8ea09e9ea 100644 --- a/extensions/xiaomi/openclaw.plugin.json +++ b/extensions/xiaomi/openclaw.plugin.json @@ -2,6 +2,9 @@ "id": "xiaomi", "enabledByDefault": true, "providers": ["xiaomi"], + "contracts": { + "speechProviders": ["xiaomi"] + }, "providerAuthEnvVars": { "xiaomi": ["XIAOMI_API_KEY"] }, diff --git a/extensions/xiaomi/speech-provider.test.ts b/extensions/xiaomi/speech-provider.test.ts new file mode 100644 index 00000000000..f20caccc12d --- /dev/null +++ b/extensions/xiaomi/speech-provider.test.ts @@ -0,0 +1,250 @@ +import { afterEach, beforeEach, describe, expect, it, vi } from "vitest"; + +const runFfmpegMock = vi.hoisted(() => vi.fn()); + +vi.mock("openclaw/plugin-sdk/media-runtime", () => ({ + runFfmpeg: runFfmpegMock, +})); + +import { buildXiaomiSpeechProvider } from "./speech-provider.js"; + +describe("buildXiaomiSpeechProvider", () => { + const provider = buildXiaomiSpeechProvider(); + + describe("metadata", () => { + it("registers Xiaomi MiMo as a speech provider", () => { + expect(provider.id).toBe("xiaomi"); + expect(provider.aliases).toContain("mimo"); + expect(provider.models).toContain("mimo-v2.5-tts"); + expect(provider.models).toContain("mimo-v2-tts"); + expect(provider.voices).toContain("mimo_default"); + }); + }); + + describe("isConfigured", () => { + const savedEnv = { ...process.env }; + + afterEach(() => { + process.env = { ...savedEnv }; + }); + + it("returns true when apiKey is in 
provider config", () => { + expect( + provider.isConfigured({ providerConfig: { apiKey: "sk-test" }, timeoutMs: 30000 }), + ).toBe(true); + }); + + it("returns false when no apiKey is available", () => { + delete process.env.XIAOMI_API_KEY; + expect(provider.isConfigured({ providerConfig: {}, timeoutMs: 30000 })).toBe(false); + }); + + it("returns true when XIAOMI_API_KEY env var is set", () => { + process.env.XIAOMI_API_KEY = "sk-env"; + expect(provider.isConfigured({ providerConfig: {}, timeoutMs: 30000 })).toBe(true); + }); + }); + + describe("resolveConfig", () => { + it("reads providers.xiaomi settings", () => { + const config = provider.resolveConfig!({ + rawConfig: { + providers: { + xiaomi: { + baseUrl: "https://example.com/v1/", + model: "mimo-v2-tts", + voice: "default_en", + format: "wav", + style: "Bright and fast.", + }, + }, + }, + cfg: {} as never, + timeoutMs: 30000, + }); + expect(config).toMatchObject({ + baseUrl: "https://example.com/v1", + model: "mimo-v2-tts", + voice: "default_en", + format: "wav", + style: "Bright and fast.", + }); + }); + + it("accepts the mimo provider config alias", () => { + const config = provider.resolveConfig!({ + rawConfig: { providers: { mimo: { voiceId: "default_zh" } } }, + cfg: {} as never, + timeoutMs: 30000, + }); + expect(config.voice).toBe("default_zh"); + }); + }); + + describe("parseDirectiveToken", () => { + const policy = { + enabled: true, + allowText: true, + allowProvider: true, + allowVoice: true, + allowModelId: true, + allowVoiceSettings: true, + allowNormalization: true, + allowSeed: true, + }; + + it("handles voice, model, style, and format tokens", () => { + expect(provider.parseDirectiveToken!({ key: "voice", value: "default_en", policy })).toEqual({ + handled: true, + overrides: { voice: "default_en" }, + }); + expect(provider.parseDirectiveToken!({ key: "model", value: "mimo-v2-tts", policy })).toEqual( + { handled: true, overrides: { model: "mimo-v2-tts" } }, + ); + 
expect(provider.parseDirectiveToken!({ key: "style", value: "whispered", policy })).toEqual({ + handled: true, + overrides: { style: "whispered" }, + }); + expect(provider.parseDirectiveToken!({ key: "format", value: "wav", policy })).toEqual({ + handled: true, + overrides: { format: "wav" }, + }); + }); + + it("warns on invalid format", () => { + const result = provider.parseDirectiveToken!({ key: "format", value: "ogg", policy }); + expect(result.handled).toBe(true); + expect(result.warnings).toHaveLength(1); + }); + }); + + describe("synthesize", () => { + const savedFetch = globalThis.fetch; + + beforeEach(() => { + vi.stubGlobal("fetch", vi.fn()); + runFfmpegMock.mockReset(); + }); + + afterEach(() => { + globalThis.fetch = savedFetch; + vi.restoreAllMocks(); + }); + + it("makes the Xiaomi chat completions TTS call and decodes audio", async () => { + const audio = Buffer.from("fake-mp3-audio").toString("base64"); + const mockFetch = vi.mocked(globalThis.fetch); + mockFetch.mockResolvedValueOnce( + new Response(JSON.stringify({ choices: [{ message: { audio: { data: audio } } }] }), { + status: 200, + headers: { "Content-Type": "application/json" }, + }), + ); + + const result = await provider.synthesize({ + text: "Hello from OpenClaw.", + cfg: {} as never, + providerConfig: { + apiKey: "sk-test", + model: "mimo-v2-tts", + voice: "default_en", + style: "Bright.", + }, + target: "audio-file", + timeoutMs: 30000, + }); + + expect(result.outputFormat).toBe("mp3"); + expect(result.fileExtension).toBe(".mp3"); + expect(result.voiceCompatible).toBe(false); + expect(result.audioBuffer.toString()).toBe("fake-mp3-audio"); + + expect(mockFetch).toHaveBeenCalledOnce(); + const [url, init] = mockFetch.mock.calls[0]; + expect(url).toBe("https://api.xiaomimimo.com/v1/chat/completions"); + expect(init?.headers).toMatchObject({ "api-key": "sk-test" }); + const body = JSON.parse(init!.body as string); + expect(body.model).toBe("mimo-v2-tts"); + expect(body.messages).toEqual([ + 
{ role: "user", content: "Bright." }, + { role: "assistant", content: "Hello from OpenClaw." }, + ]); + expect(body.audio).toEqual({ format: "mp3", voice: "default_en" }); + expect(runFfmpegMock).not.toHaveBeenCalled(); + }); + + it("transcodes Xiaomi output to Opus for voice-note targets", async () => { + const audio = Buffer.from("fake-mp3-audio").toString("base64"); + vi.mocked(globalThis.fetch).mockResolvedValueOnce( + new Response(JSON.stringify({ choices: [{ message: { audio: { data: audio } } }] }), { + status: 200, + headers: { "Content-Type": "application/json" }, + }), + ); + runFfmpegMock.mockImplementationOnce(async (args: string[]) => { + const outputPath = args.at(-1); + if (typeof outputPath !== "string") { + throw new Error("missing ffmpeg output path"); + } + await import("node:fs/promises").then((fs) => + fs.writeFile(outputPath, Buffer.from("fake-opus-audio")), + ); + }); + + const result = await provider.synthesize({ + text: "Hello from OpenClaw.", + cfg: {} as never, + providerConfig: { apiKey: "sk-test" }, + target: "voice-note", + timeoutMs: 30000, + }); + + expect(result.outputFormat).toBe("opus"); + expect(result.fileExtension).toBe(".opus"); + expect(result.voiceCompatible).toBe(true); + expect(result.audioBuffer.toString()).toBe("fake-opus-audio"); + expect(runFfmpegMock).toHaveBeenCalledWith( + expect.arrayContaining(["-c:a", "libopus", "-ar", "48000"]), + { timeoutMs: 30000 }, + ); + }); + + it("throws when API key is missing", async () => { + const savedKey = process.env.XIAOMI_API_KEY; + delete process.env.XIAOMI_API_KEY; + try { + await expect( + provider.synthesize({ + text: "Test", + cfg: {} as never, + providerConfig: {}, + target: "audio-file", + timeoutMs: 30000, + }), + ).rejects.toThrow("Xiaomi API key missing"); + } finally { + if (savedKey) { + process.env.XIAOMI_API_KEY = savedKey; + } + } + }); + + it("throws when the API response has no audio data", async () => { + vi.mocked(globalThis.fetch).mockResolvedValueOnce( + new 
Response(JSON.stringify({ choices: [{ message: {} }] }), { + status: 200, + headers: { "Content-Type": "application/json" }, + }), + ); + await expect( + provider.synthesize({ + text: "Test", + cfg: {} as never, + providerConfig: { apiKey: "sk-test" }, + target: "audio-file", + timeoutMs: 30000, + }), + ).rejects.toThrow("Xiaomi TTS API returned no audio data"); + }); + }); +}); diff --git a/extensions/xiaomi/speech-provider.ts b/extensions/xiaomi/speech-provider.ts new file mode 100644 index 00000000000..748bdbe97df --- /dev/null +++ b/extensions/xiaomi/speech-provider.ts @@ -0,0 +1,336 @@ +import { mkdir, mkdtemp, readFile, rm, writeFile } from "node:fs/promises"; +import path from "node:path"; +import { runFfmpeg } from "openclaw/plugin-sdk/media-runtime"; +import { assertOkOrThrowProviderError } from "openclaw/plugin-sdk/provider-http"; +import { normalizeResolvedSecretInputString } from "openclaw/plugin-sdk/secret-input"; +import type { + SpeechDirectiveTokenParseContext, + SpeechProviderConfig, + SpeechProviderOverrides, + SpeechProviderPlugin, +} from "openclaw/plugin-sdk/speech-core"; +import { asObject, trimToUndefined } from "openclaw/plugin-sdk/speech-core"; +import { + fetchWithSsrFGuard, + ssrfPolicyFromHttpBaseUrlAllowedHostname, +} from "openclaw/plugin-sdk/ssrf-runtime"; +import { resolvePreferredOpenClawTmpDir } from "openclaw/plugin-sdk/temp-path"; + +export const DEFAULT_XIAOMI_TTS_BASE_URL = "https://api.xiaomimimo.com/v1"; +export const DEFAULT_XIAOMI_TTS_MODEL = "mimo-v2.5-tts"; +export const DEFAULT_XIAOMI_TTS_VOICE = "mimo_default"; +export const DEFAULT_XIAOMI_TTS_FORMAT = "mp3"; + +export const XIAOMI_TTS_MODELS = ["mimo-v2.5-tts", "mimo-v2-tts"] as const; + +export const XIAOMI_TTS_VOICES = [ + "mimo_default", + "default_zh", + "default_en", + "Mia", + "Chloe", + "Milo", + "Dean", +] as const; + +const XIAOMI_TTS_FORMATS = ["mp3", "wav"] as const; + +type XiaomiTtsFormat = (typeof XIAOMI_TTS_FORMATS)[number]; + +type 
XiaomiTtsProviderConfig = { + apiKey?: string; + baseUrl: string; + model: string; + voice: string; + format: XiaomiTtsFormat; + style?: string; +}; + +type XiaomiTtsOverrides = { + model?: string; + voice?: string; + format?: XiaomiTtsFormat; + style?: string; +}; + +function normalizeXiaomiTtsBaseUrl(baseUrl?: string): string { + return (baseUrl?.trim() || DEFAULT_XIAOMI_TTS_BASE_URL).replace(/\/+$/, ""); +} + +function normalizeXiaomiTtsFormat(value: unknown): XiaomiTtsFormat | undefined { + const normalized = trimToUndefined(value)?.toLowerCase(); + return XIAOMI_TTS_FORMATS.includes(normalized as XiaomiTtsFormat) + ? (normalized as XiaomiTtsFormat) + : undefined; +} + +function resolveXiaomiTtsConfigRecord( + rawConfig: Record<string, unknown>, +): Record<string, unknown> | undefined { + const providers = asObject(rawConfig.providers); + return asObject(providers?.xiaomi) ?? asObject(providers?.mimo) ?? asObject(rawConfig.xiaomi); +} + +function normalizeXiaomiTtsProviderConfig( + rawConfig: Record<string, unknown>, +): XiaomiTtsProviderConfig { + const raw = resolveXiaomiTtsConfigRecord(rawConfig); + return { + apiKey: normalizeResolvedSecretInputString({ + value: raw?.apiKey, + path: "messages.tts.providers.xiaomi.apiKey", + }), + baseUrl: normalizeXiaomiTtsBaseUrl( + trimToUndefined(raw?.baseUrl) ?? trimToUndefined(process.env.XIAOMI_BASE_URL), + ), + model: + trimToUndefined(raw?.model) ?? + trimToUndefined(process.env.XIAOMI_TTS_MODEL) ?? + DEFAULT_XIAOMI_TTS_MODEL, + voice: + trimToUndefined(raw?.voice) ?? + trimToUndefined(raw?.voiceId) ?? + trimToUndefined(process.env.XIAOMI_TTS_VOICE) ?? + DEFAULT_XIAOMI_TTS_VOICE, + format: + normalizeXiaomiTtsFormat(raw?.format) ?? + normalizeXiaomiTtsFormat(process.env.XIAOMI_TTS_FORMAT) ??
+ DEFAULT_XIAOMI_TTS_FORMAT, + style: trimToUndefined(raw?.style), + }; +} + +function readXiaomiTtsProviderConfig(config: SpeechProviderConfig): XiaomiTtsProviderConfig { + const normalized = normalizeXiaomiTtsProviderConfig({}); + return { + apiKey: + normalizeResolvedSecretInputString({ + value: config.apiKey, + path: "messages.tts.providers.xiaomi.apiKey", + }) ?? normalized.apiKey, + baseUrl: normalizeXiaomiTtsBaseUrl(trimToUndefined(config.baseUrl) ?? normalized.baseUrl), + model: trimToUndefined(config.model) ?? normalized.model, + voice: trimToUndefined(config.voice) ?? trimToUndefined(config.voiceId) ?? normalized.voice, + format: normalizeXiaomiTtsFormat(config.format) ?? normalized.format, + style: trimToUndefined(config.style) ?? normalized.style, + }; +} + +function readXiaomiTtsOverrides( + overrides: SpeechProviderOverrides | undefined, +): XiaomiTtsOverrides { + if (!overrides) { + return {}; + } + return { + model: trimToUndefined(overrides.model), + voice: trimToUndefined(overrides.voice) ?? 
trimToUndefined(overrides.voiceId), + format: normalizeXiaomiTtsFormat(overrides.format), + style: trimToUndefined(overrides.style), + }; +} + +function parseDirectiveToken(ctx: SpeechDirectiveTokenParseContext): { + handled: boolean; + overrides?: SpeechProviderOverrides; + warnings?: string[]; +} { + switch (ctx.key) { + case "voice": + case "voiceid": + case "voice_id": + case "mimo_voice": + case "xiaomi_voice": + if (!ctx.policy.allowVoice) { + return { handled: true }; + } + return { handled: true, overrides: { voice: ctx.value } }; + case "model": + case "mimo_model": + case "xiaomi_model": + if (!ctx.policy.allowModelId) { + return { handled: true }; + } + return { handled: true, overrides: { model: ctx.value } }; + case "style": + case "mimo_style": + case "xiaomi_style": + if (!ctx.policy.allowVoiceSettings) { + return { handled: true }; + } + return { handled: true, overrides: { style: ctx.value } }; + case "format": + case "responseformat": + case "response_format": { + if (!ctx.policy.allowVoiceSettings) { + return { handled: true }; + } + const format = normalizeXiaomiTtsFormat(ctx.value); + if (!format) { + return { handled: true, warnings: [`invalid Xiaomi TTS format "${ctx.value}"`] }; + } + return { handled: true, overrides: { format } }; + } + default: + return { handled: false }; + } +} + +function buildXiaomiTtsMessages(params: { text: string; style?: string }) { + const style = trimToUndefined(params.style); + return [ + ...(style ? [{ role: "user" as const, content: style }] : []), + { role: "assistant" as const, content: params.text }, + ]; +} + +function decodeXiaomiAudioData(body: unknown): Buffer { + const root = asObject(body); + const choices = Array.isArray(root?.choices) ? 
root.choices : []; + const firstChoice = asObject(choices[0]); + const message = asObject(firstChoice?.message); + const audio = asObject(message?.audio); + const audioData = trimToUndefined(audio?.data); + if (!audioData) { + throw new Error("Xiaomi TTS API returned no audio data"); + } + return Buffer.from(audioData, "base64"); +} + +export async function xiaomiTTS(params: { + text: string; + apiKey: string; + baseUrl: string; + model: string; + voice: string; + format: XiaomiTtsFormat; + style?: string; + timeoutMs: number; +}): Promise<Buffer> { + const { text, apiKey, baseUrl, model, voice, format, style, timeoutMs } = params; + const controller = new AbortController(); + const timeout = setTimeout(() => controller.abort(), timeoutMs); + + try { + const { response, release } = await fetchWithSsrFGuard({ + url: `${baseUrl}/chat/completions`, + init: { + method: "POST", + headers: { + "api-key": apiKey, + "Content-Type": "application/json", + }, + body: JSON.stringify({ + model, + messages: buildXiaomiTtsMessages({ text, style }), + audio: { format, voice }, + }), + signal: controller.signal, + }, + timeoutMs, + policy: ssrfPolicyFromHttpBaseUrlAllowedHostname(baseUrl), + auditContext: "xiaomi.tts", + }); + try { + await assertOkOrThrowProviderError(response, "Xiaomi TTS API error"); + return decodeXiaomiAudioData(await response.json()); + } finally { + await release(); + } + } finally { + clearTimeout(timeout); + } +} + +async function transcodeAudioToOpus(params: { + audioBuffer: Buffer; + inputExtension: string; + timeoutMs: number | undefined; +}) { + const tempRoot = resolvePreferredOpenClawTmpDir(); + await mkdir(tempRoot, { recursive: true, mode: 0o700 }); + const tempDir = await mkdtemp(path.join(tempRoot, "tts-xiaomi-")); + try { + const inputPath = path.join(tempDir, `input.${params.inputExtension}`); + const outputPath = path.join(tempDir, "voice.opus"); + await writeFile(inputPath, params.audioBuffer, { mode: 0o600 }); + await runFfmpeg( + [ + 
"-hide_banner", + "-loglevel", + "error", + "-y", + "-i", + inputPath, + "-vn", + "-c:a", + "libopus", + "-b:a", + "64k", + "-ar", + "48000", + "-ac", + "1", + outputPath, + ], + { timeoutMs: params.timeoutMs }, + ); + return await readFile(outputPath); + } finally { + await rm(tempDir, { recursive: true, force: true }); + } +} + +export function buildXiaomiSpeechProvider(): SpeechProviderPlugin { + return { + id: "xiaomi", + label: "Xiaomi MiMo", + aliases: ["mimo"], + autoSelectOrder: 45, + models: XIAOMI_TTS_MODELS, + voices: XIAOMI_TTS_VOICES, + resolveConfig: ({ rawConfig }) => normalizeXiaomiTtsProviderConfig(rawConfig), + parseDirectiveToken, + listVoices: async () => XIAOMI_TTS_VOICES.map((voice) => ({ id: voice, name: voice })), + isConfigured: ({ providerConfig }) => + Boolean(readXiaomiTtsProviderConfig(providerConfig).apiKey || process.env.XIAOMI_API_KEY), + synthesize: async (req) => { + const config = readXiaomiTtsProviderConfig(req.providerConfig); + const overrides = readXiaomiTtsOverrides(req.providerOverrides); + const apiKey = config.apiKey || process.env.XIAOMI_API_KEY; + if (!apiKey) { + throw new Error("Xiaomi API key missing"); + } + const outputFormat = overrides.format ?? config.format; + const audioBuffer = await xiaomiTTS({ + text: req.text, + apiKey, + baseUrl: config.baseUrl, + model: overrides.model ?? config.model, + voice: overrides.voice ?? config.voice, + format: outputFormat, + style: overrides.style ?? 
config.style, + timeoutMs: req.timeoutMs, + }); + if (req.target === "voice-note") { + const opusBuffer = await transcodeAudioToOpus({ + audioBuffer, + inputExtension: outputFormat, + timeoutMs: req.timeoutMs, + }); + return { + audioBuffer: opusBuffer, + outputFormat: "opus", + fileExtension: ".opus", + voiceCompatible: true, + }; + } + return { + audioBuffer, + outputFormat, + fileExtension: `.${outputFormat}`, + voiceCompatible: false, + }; + }, + }; +} diff --git a/extensions/xiaomi/xiaomi.live.test.ts b/extensions/xiaomi/xiaomi.live.test.ts new file mode 100644 index 00000000000..bd617589306 --- /dev/null +++ b/extensions/xiaomi/xiaomi.live.test.ts @@ -0,0 +1,55 @@ +import { describe, expect, it } from "vitest"; +import { isLiveTestEnabled } from "../../src/agents/live-test-helpers.js"; +import { + registerProviderPlugin, + requireRegisteredProvider, +} from "../../test/helpers/plugins/provider-registration.js"; +import plugin from "./index.js"; + +const XIAOMI_API_KEY = process.env.XIAOMI_API_KEY?.trim() ?? ""; +const LIVE = isLiveTestEnabled() && XIAOMI_API_KEY.length > 0; +const describeLive = LIVE ? 
describe : describe.skip; + +const registerXiaomiPlugin = () => + registerProviderPlugin({ + plugin, + id: "xiaomi", + name: "Xiaomi Provider", + }); + +describeLive("xiaomi plugin live", () => { + it("synthesizes MiMo TTS through the registered speech provider", async () => { + const { speechProviders } = await registerXiaomiPlugin(); + const provider = requireRegisteredProvider(speechProviders, "xiaomi"); + + const audioFile = await provider.synthesize({ + text: "OpenClaw Xiaomi MiMo text to speech integration test OK.", + cfg: { plugins: { enabled: true } } as never, + providerConfig: { apiKey: XIAOMI_API_KEY, format: "mp3", voice: "mimo_default" }, + target: "audio-file", + timeoutMs: 90_000, + }); + + expect(audioFile.outputFormat).toBe("mp3"); + expect(audioFile.fileExtension).toBe(".mp3"); + expect(audioFile.audioBuffer.byteLength).toBeGreaterThan(512); + }, 120_000); + + it("synthesizes MiMo TTS as an Opus voice note", async () => { + const { speechProviders } = await registerXiaomiPlugin(); + const provider = requireRegisteredProvider(speechProviders, "xiaomi"); + + const voiceNote = await provider.synthesize({ + text: "OpenClaw Xiaomi MiMo voice note test OK.", + cfg: { plugins: { enabled: true } } as never, + providerConfig: { apiKey: XIAOMI_API_KEY, format: "mp3", voice: "mimo_default" }, + target: "voice-note", + timeoutMs: 90_000, + }); + + expect(voiceNote.outputFormat).toBe("opus"); + expect(voiceNote.fileExtension).toBe(".opus"); + expect(voiceNote.voiceCompatible).toBe(true); + expect(voiceNote.audioBuffer.byteLength).toBeGreaterThan(512); + }, 120_000); +});