mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-06 13:50:49 +00:00
feat(tts): add xiaomi mimo speech provider
This commit is contained in:
@@ -43,7 +43,7 @@ describeLive("google plugin live", () => {
|
||||
const speechProvider = requireRegisteredProvider(speechProviders, "google");
|
||||
const mediaProvider = requireRegisteredProvider(mediaProviders, "google");
|
||||
|
||||
const phrase = "Testing Google audio transcription with OpenClaw.";
|
||||
const phrase = "Testing Google audio transcription with pineapple.";
|
||||
const audioFile = await speechProvider.synthesize({
|
||||
text: phrase,
|
||||
cfg: { plugins: { enabled: true } } as never,
|
||||
@@ -62,7 +62,7 @@ describeLive("google plugin live", () => {
|
||||
|
||||
const normalized = normalizeTranscriptForMatch(transcript?.text ?? "");
|
||||
expect(normalized).toContain("google");
|
||||
expect(normalized).toContain("openclaw");
|
||||
expect(normalized).toContain("pineapple");
|
||||
}, 180_000);
|
||||
|
||||
it("runs Gemini web search through the registered provider tool", async () => {
|
||||
|
||||
@@ -1,14 +1,30 @@
|
||||
import { describe, expect, it } from "vitest";
|
||||
import { isLiveTestEnabled } from "../../src/agents/live-test-helpers.js";
|
||||
import {
|
||||
registerProviderPlugin,
|
||||
requireRegisteredProvider,
|
||||
} from "../../test/helpers/plugins/provider-registration.js";
|
||||
import plugin from "./index.js";
|
||||
import { buildMinimaxSpeechProvider } from "./speech-provider.js";
|
||||
import { createMiniMaxWebSearchProvider } from "./src/minimax-web-search-provider.js";
|
||||
|
||||
const MINIMAX_API_KEY = process.env.MINIMAX_API_KEY?.trim() ?? "";
|
||||
const MINIMAX_SEARCH_KEY =
|
||||
process.env.MINIMAX_CODE_PLAN_KEY?.trim() ||
|
||||
process.env.MINIMAX_CODING_API_KEY?.trim() ||
|
||||
process.env.MINIMAX_API_KEY?.trim() ||
|
||||
MINIMAX_API_KEY ||
|
||||
"";
|
||||
const describeLive =
|
||||
isLiveTestEnabled() && MINIMAX_SEARCH_KEY.length > 0 ? describe : describe.skip;
|
||||
const describeTtsLive =
|
||||
isLiveTestEnabled() && MINIMAX_API_KEY.length > 0 ? describe : describe.skip;
|
||||
|
||||
const registerMinimaxPlugin = () =>
|
||||
registerProviderPlugin({
|
||||
plugin,
|
||||
id: "minimax",
|
||||
name: "MiniMax Provider",
|
||||
});
|
||||
|
||||
describeLive("minimax plugin live", () => {
|
||||
it("runs MiniMax web search through the provider tool", async () => {
|
||||
@@ -25,3 +41,39 @@ describeLive("minimax plugin live", () => {
|
||||
expect(Array.isArray(result?.results)).toBe(true);
|
||||
}, 120_000);
|
||||
});
|
||||
|
||||
// Live TTS suite: hits the real MiniMax API; skipped unless MINIMAX_API_KEY
// is set and live testing is enabled (see describeTtsLive above).
describeTtsLive("minimax tts live", () => {
  it("synthesizes TTS through the registered speech provider", async () => {
    const { speechProviders } = await registerMinimaxPlugin();
    const provider = requireRegisteredProvider(speechProviders, "minimax");

    const audioFile = await provider.synthesize({
      text: "OpenClaw MiniMax text to speech integration test OK.",
      cfg: { plugins: { enabled: true } } as never,
      providerConfig: { apiKey: MINIMAX_API_KEY },
      target: "audio-file",
      timeoutMs: 90_000,
    });

    // Only container format and a minimal size are checked — real audio
    // content is non-deterministic.
    expect(audioFile.outputFormat).toBe("mp3");
    expect(audioFile.fileExtension).toBe(".mp3");
    expect(audioFile.audioBuffer.byteLength).toBeGreaterThan(512);
  }, 120_000);

  it("synthesizes MiniMax TTS as an Opus voice note", async () => {
    // NOTE(review): this test builds the provider directly instead of going
    // through registerMinimaxPlugin() like the test above — confirm whether
    // that asymmetry is intentional.
    const provider = buildMinimaxSpeechProvider();

    const voiceNote = await provider.synthesize({
      text: "OpenClaw MiniMax voice note test OK.",
      cfg: { plugins: { enabled: true } } as never,
      providerConfig: { apiKey: MINIMAX_API_KEY },
      target: "voice-note",
      timeoutMs: 90_000,
    });

    expect(voiceNote.outputFormat).toBe("opus");
    expect(voiceNote.fileExtension).toBe(".opus");
    expect(voiceNote.voiceCompatible).toBe(true);
    expect(voiceNote.audioBuffer.byteLength).toBeGreaterThan(512);
  }, 120_000);
});
|
||||
|
||||
@@ -2,6 +2,7 @@ import { defineSingleProviderPluginEntry } from "openclaw/plugin-sdk/provider-en
|
||||
import { PROVIDER_LABELS } from "openclaw/plugin-sdk/provider-usage";
|
||||
import { applyXiaomiConfig, XIAOMI_DEFAULT_MODEL_REF } from "./onboard.js";
|
||||
import { buildXiaomiProvider } from "./provider-catalog.js";
|
||||
import { buildXiaomiSpeechProvider } from "./speech-provider.js";
|
||||
|
||||
const PROVIDER_ID = "xiaomi";
|
||||
|
||||
@@ -40,4 +41,7 @@ export default defineSingleProviderPluginEntry({
|
||||
windows: [],
|
||||
}),
|
||||
},
|
||||
register(api) {
|
||||
api.registerSpeechProvider(buildXiaomiSpeechProvider());
|
||||
},
|
||||
});
|
||||
|
||||
@@ -2,6 +2,9 @@
|
||||
"id": "xiaomi",
|
||||
"enabledByDefault": true,
|
||||
"providers": ["xiaomi"],
|
||||
"contracts": {
|
||||
"speechProviders": ["xiaomi"]
|
||||
},
|
||||
"providerAuthEnvVars": {
|
||||
"xiaomi": ["XIAOMI_API_KEY"]
|
||||
},
|
||||
|
||||
250
extensions/xiaomi/speech-provider.test.ts
Normal file
250
extensions/xiaomi/speech-provider.test.ts
Normal file
@@ -0,0 +1,250 @@
|
||||
import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
|
||||
|
||||
// Hoisted so the mock function exists before the vi.mock factory below runs.
const runFfmpegMock = vi.hoisted(() => vi.fn());

// Replace the media runtime so tests never shell out to a real ffmpeg binary.
vi.mock("openclaw/plugin-sdk/media-runtime", () => ({
  runFfmpeg: runFfmpegMock,
}));
|
||||
|
||||
import { buildXiaomiSpeechProvider } from "./speech-provider.js";
|
||||
|
||||
// Unit suite for the Xiaomi MiMo speech provider. Network and ffmpeg are
// mocked (fetch via vi.stubGlobal, ffmpeg via runFfmpegMock above).
describe("buildXiaomiSpeechProvider", () => {
  const provider = buildXiaomiSpeechProvider();

  describe("metadata", () => {
    it("registers Xiaomi MiMo as a speech provider", () => {
      expect(provider.id).toBe("xiaomi");
      expect(provider.aliases).toContain("mimo");
      expect(provider.models).toContain("mimo-v2.5-tts");
      expect(provider.models).toContain("mimo-v2-tts");
      expect(provider.voices).toContain("mimo_default");
    });
  });

  describe("isConfigured", () => {
    // Snapshot the env so per-test mutations (delete/set XIAOMI_API_KEY)
    // cannot leak into other suites.
    const savedEnv = { ...process.env };

    afterEach(() => {
      process.env = { ...savedEnv };
    });

    it("returns true when apiKey is in provider config", () => {
      expect(
        provider.isConfigured({ providerConfig: { apiKey: "sk-test" }, timeoutMs: 30000 }),
      ).toBe(true);
    });

    it("returns false when no apiKey is available", () => {
      delete process.env.XIAOMI_API_KEY;
      expect(provider.isConfigured({ providerConfig: {}, timeoutMs: 30000 })).toBe(false);
    });

    it("returns true when XIAOMI_API_KEY env var is set", () => {
      process.env.XIAOMI_API_KEY = "sk-env";
      expect(provider.isConfigured({ providerConfig: {}, timeoutMs: 30000 })).toBe(true);
    });
  });

  describe("resolveConfig", () => {
    it("reads providers.xiaomi settings", () => {
      const config = provider.resolveConfig!({
        rawConfig: {
          providers: {
            xiaomi: {
              baseUrl: "https://example.com/v1/",
              model: "mimo-v2-tts",
              voice: "default_en",
              format: "wav",
              style: "Bright and fast.",
            },
          },
        },
        cfg: {} as never,
        timeoutMs: 30000,
      });
      // Trailing slash on baseUrl must be stripped during normalization.
      expect(config).toMatchObject({
        baseUrl: "https://example.com/v1",
        model: "mimo-v2-tts",
        voice: "default_en",
        format: "wav",
        style: "Bright and fast.",
      });
    });

    it("accepts the mimo provider config alias", () => {
      // "mimo" is an accepted alias key, and "voiceId" an alias for "voice".
      const config = provider.resolveConfig!({
        rawConfig: { providers: { mimo: { voiceId: "default_zh" } } },
        cfg: {} as never,
        timeoutMs: 30000,
      });
      expect(config.voice).toBe("default_zh");
    });
  });

  describe("parseDirectiveToken", () => {
    // Fully-permissive policy so each directive kind is actually exercised.
    const policy = {
      enabled: true,
      allowText: true,
      allowProvider: true,
      allowVoice: true,
      allowModelId: true,
      allowVoiceSettings: true,
      allowNormalization: true,
      allowSeed: true,
    };

    it("handles voice, model, style, and format tokens", () => {
      expect(provider.parseDirectiveToken!({ key: "voice", value: "default_en", policy })).toEqual({
        handled: true,
        overrides: { voice: "default_en" },
      });
      expect(provider.parseDirectiveToken!({ key: "model", value: "mimo-v2-tts", policy })).toEqual(
        { handled: true, overrides: { model: "mimo-v2-tts" } },
      );
      expect(provider.parseDirectiveToken!({ key: "style", value: "whispered", policy })).toEqual({
        handled: true,
        overrides: { style: "whispered" },
      });
      expect(provider.parseDirectiveToken!({ key: "format", value: "wav", policy })).toEqual({
        handled: true,
        overrides: { format: "wav" },
      });
    });

    it("warns on invalid format", () => {
      // "ogg" is not a supported request format; the token is still consumed
      // (handled: true) but produces a warning instead of an override.
      const result = provider.parseDirectiveToken!({ key: "format", value: "ogg", policy });
      expect(result.handled).toBe(true);
      expect(result.warnings).toHaveLength(1);
    });
  });

  describe("synthesize", () => {
    const savedFetch = globalThis.fetch;

    beforeEach(() => {
      vi.stubGlobal("fetch", vi.fn());
      runFfmpegMock.mockReset();
    });

    afterEach(() => {
      globalThis.fetch = savedFetch;
      vi.restoreAllMocks();
    });

    it("makes the Xiaomi chat completions TTS call and decodes audio", async () => {
      // The API returns base64-encoded audio inside a chat-completions shape.
      const audio = Buffer.from("fake-mp3-audio").toString("base64");
      const mockFetch = vi.mocked(globalThis.fetch);
      mockFetch.mockResolvedValueOnce(
        new Response(JSON.stringify({ choices: [{ message: { audio: { data: audio } } }] }), {
          status: 200,
          headers: { "Content-Type": "application/json" },
        }),
      );

      const result = await provider.synthesize({
        text: "Hello from OpenClaw.",
        cfg: {} as never,
        providerConfig: {
          apiKey: "sk-test",
          model: "mimo-v2-tts",
          voice: "default_en",
          style: "Bright.",
        },
        target: "audio-file",
        timeoutMs: 30000,
      });

      expect(result.outputFormat).toBe("mp3");
      expect(result.fileExtension).toBe(".mp3");
      expect(result.voiceCompatible).toBe(false);
      expect(result.audioBuffer.toString()).toBe("fake-mp3-audio");

      // Verify the outgoing request wire format: api-key header, style as a
      // user message, the text as an assistant message, and the audio block.
      expect(mockFetch).toHaveBeenCalledOnce();
      const [url, init] = mockFetch.mock.calls[0];
      expect(url).toBe("https://api.xiaomimimo.com/v1/chat/completions");
      expect(init?.headers).toMatchObject({ "api-key": "sk-test" });
      const body = JSON.parse(init!.body as string);
      expect(body.model).toBe("mimo-v2-tts");
      expect(body.messages).toEqual([
        { role: "user", content: "Bright." },
        { role: "assistant", content: "Hello from OpenClaw." },
      ]);
      expect(body.audio).toEqual({ format: "mp3", voice: "default_en" });
      // audio-file target must not trigger transcoding.
      expect(runFfmpegMock).not.toHaveBeenCalled();
    });

    it("transcodes Xiaomi output to Opus for voice-note targets", async () => {
      const audio = Buffer.from("fake-mp3-audio").toString("base64");
      vi.mocked(globalThis.fetch).mockResolvedValueOnce(
        new Response(JSON.stringify({ choices: [{ message: { audio: { data: audio } } }] }), {
          status: 200,
          headers: { "Content-Type": "application/json" },
        }),
      );
      // Fake ffmpeg: write the "transcoded" bytes to the output path, which
      // the provider passes as the last CLI argument.
      runFfmpegMock.mockImplementationOnce(async (args: string[]) => {
        const outputPath = args.at(-1);
        if (typeof outputPath !== "string") {
          throw new Error("missing ffmpeg output path");
        }
        await import("node:fs/promises").then((fs) =>
          fs.writeFile(outputPath, Buffer.from("fake-opus-audio")),
        );
      });

      const result = await provider.synthesize({
        text: "Hello from OpenClaw.",
        cfg: {} as never,
        providerConfig: { apiKey: "sk-test" },
        target: "voice-note",
        timeoutMs: 30000,
      });

      expect(result.outputFormat).toBe("opus");
      expect(result.fileExtension).toBe(".opus");
      expect(result.voiceCompatible).toBe(true);
      expect(result.audioBuffer.toString()).toBe("fake-opus-audio");
      expect(runFfmpegMock).toHaveBeenCalledWith(
        expect.arrayContaining(["-c:a", "libopus", "-ar", "48000"]),
        { timeoutMs: 30000 },
      );
    });

    it("throws when API key is missing", async () => {
      // Clear the env fallback so the missing-key path is actually taken;
      // restore it in finally to avoid polluting other tests.
      const savedKey = process.env.XIAOMI_API_KEY;
      delete process.env.XIAOMI_API_KEY;
      try {
        await expect(
          provider.synthesize({
            text: "Test",
            cfg: {} as never,
            providerConfig: {},
            target: "audio-file",
            timeoutMs: 30000,
          }),
        ).rejects.toThrow("Xiaomi API key missing");
      } finally {
        if (savedKey) {
          process.env.XIAOMI_API_KEY = savedKey;
        }
      }
    });

    it("throws when the API response has no audio data", async () => {
      vi.mocked(globalThis.fetch).mockResolvedValueOnce(
        new Response(JSON.stringify({ choices: [{ message: {} }] }), {
          status: 200,
          headers: { "Content-Type": "application/json" },
        }),
      );
      await expect(
        provider.synthesize({
          text: "Test",
          cfg: {} as never,
          providerConfig: { apiKey: "sk-test" },
          target: "audio-file",
          timeoutMs: 30000,
        }),
      ).rejects.toThrow("Xiaomi TTS API returned no audio data");
    });
  });
});
|
||||
336
extensions/xiaomi/speech-provider.ts
Normal file
336
extensions/xiaomi/speech-provider.ts
Normal file
@@ -0,0 +1,336 @@
|
||||
import { mkdir, mkdtemp, readFile, rm, writeFile } from "node:fs/promises";
|
||||
import path from "node:path";
|
||||
import { runFfmpeg } from "openclaw/plugin-sdk/media-runtime";
|
||||
import { assertOkOrThrowProviderError } from "openclaw/plugin-sdk/provider-http";
|
||||
import { normalizeResolvedSecretInputString } from "openclaw/plugin-sdk/secret-input";
|
||||
import type {
|
||||
SpeechDirectiveTokenParseContext,
|
||||
SpeechProviderConfig,
|
||||
SpeechProviderOverrides,
|
||||
SpeechProviderPlugin,
|
||||
} from "openclaw/plugin-sdk/speech-core";
|
||||
import { asObject, trimToUndefined } from "openclaw/plugin-sdk/speech-core";
|
||||
import {
|
||||
fetchWithSsrFGuard,
|
||||
ssrfPolicyFromHttpBaseUrlAllowedHostname,
|
||||
} from "openclaw/plugin-sdk/ssrf-runtime";
|
||||
import { resolvePreferredOpenClawTmpDir } from "openclaw/plugin-sdk/temp-path";
|
||||
|
||||
export const DEFAULT_XIAOMI_TTS_BASE_URL = "https://api.xiaomimimo.com/v1";
|
||||
export const DEFAULT_XIAOMI_TTS_MODEL = "mimo-v2.5-tts";
|
||||
export const DEFAULT_XIAOMI_TTS_VOICE = "mimo_default";
|
||||
export const DEFAULT_XIAOMI_TTS_FORMAT = "mp3";
|
||||
|
||||
export const XIAOMI_TTS_MODELS = ["mimo-v2.5-tts", "mimo-v2-tts"] as const;
|
||||
|
||||
export const XIAOMI_TTS_VOICES = [
|
||||
"mimo_default",
|
||||
"default_zh",
|
||||
"default_en",
|
||||
"Mia",
|
||||
"Chloe",
|
||||
"Milo",
|
||||
"Dean",
|
||||
] as const;
|
||||
|
||||
const XIAOMI_TTS_FORMATS = ["mp3", "wav"] as const;
|
||||
|
||||
type XiaomiTtsFormat = (typeof XIAOMI_TTS_FORMATS)[number];
|
||||
|
||||
type XiaomiTtsProviderConfig = {
|
||||
apiKey?: string;
|
||||
baseUrl: string;
|
||||
model: string;
|
||||
voice: string;
|
||||
format: XiaomiTtsFormat;
|
||||
style?: string;
|
||||
};
|
||||
|
||||
type XiaomiTtsOverrides = {
|
||||
model?: string;
|
||||
voice?: string;
|
||||
format?: XiaomiTtsFormat;
|
||||
style?: string;
|
||||
};
|
||||
|
||||
function normalizeXiaomiTtsBaseUrl(baseUrl?: string): string {
|
||||
return (baseUrl?.trim() || DEFAULT_XIAOMI_TTS_BASE_URL).replace(/\/+$/, "");
|
||||
}
|
||||
|
||||
function normalizeXiaomiTtsFormat(value: unknown): XiaomiTtsFormat | undefined {
|
||||
const normalized = trimToUndefined(value)?.toLowerCase();
|
||||
return XIAOMI_TTS_FORMATS.includes(normalized as XiaomiTtsFormat)
|
||||
? (normalized as XiaomiTtsFormat)
|
||||
: undefined;
|
||||
}
|
||||
|
||||
function resolveXiaomiTtsConfigRecord(
|
||||
rawConfig: Record<string, unknown>,
|
||||
): Record<string, unknown> | undefined {
|
||||
const providers = asObject(rawConfig.providers);
|
||||
return asObject(providers?.xiaomi) ?? asObject(providers?.mimo) ?? asObject(rawConfig.xiaomi);
|
||||
}
|
||||
|
||||
function normalizeXiaomiTtsProviderConfig(
|
||||
rawConfig: Record<string, unknown>,
|
||||
): XiaomiTtsProviderConfig {
|
||||
const raw = resolveXiaomiTtsConfigRecord(rawConfig);
|
||||
return {
|
||||
apiKey: normalizeResolvedSecretInputString({
|
||||
value: raw?.apiKey,
|
||||
path: "messages.tts.providers.xiaomi.apiKey",
|
||||
}),
|
||||
baseUrl: normalizeXiaomiTtsBaseUrl(
|
||||
trimToUndefined(raw?.baseUrl) ?? trimToUndefined(process.env.XIAOMI_BASE_URL),
|
||||
),
|
||||
model:
|
||||
trimToUndefined(raw?.model) ??
|
||||
trimToUndefined(process.env.XIAOMI_TTS_MODEL) ??
|
||||
DEFAULT_XIAOMI_TTS_MODEL,
|
||||
voice:
|
||||
trimToUndefined(raw?.voice) ??
|
||||
trimToUndefined(raw?.voiceId) ??
|
||||
trimToUndefined(process.env.XIAOMI_TTS_VOICE) ??
|
||||
DEFAULT_XIAOMI_TTS_VOICE,
|
||||
format:
|
||||
normalizeXiaomiTtsFormat(raw?.format) ??
|
||||
normalizeXiaomiTtsFormat(process.env.XIAOMI_TTS_FORMAT) ??
|
||||
DEFAULT_XIAOMI_TTS_FORMAT,
|
||||
style: trimToUndefined(raw?.style),
|
||||
};
|
||||
}
|
||||
|
||||
function readXiaomiTtsProviderConfig(config: SpeechProviderConfig): XiaomiTtsProviderConfig {
|
||||
const normalized = normalizeXiaomiTtsProviderConfig({});
|
||||
return {
|
||||
apiKey:
|
||||
normalizeResolvedSecretInputString({
|
||||
value: config.apiKey,
|
||||
path: "messages.tts.providers.xiaomi.apiKey",
|
||||
}) ?? normalized.apiKey,
|
||||
baseUrl: normalizeXiaomiTtsBaseUrl(trimToUndefined(config.baseUrl) ?? normalized.baseUrl),
|
||||
model: trimToUndefined(config.model) ?? normalized.model,
|
||||
voice: trimToUndefined(config.voice) ?? trimToUndefined(config.voiceId) ?? normalized.voice,
|
||||
format: normalizeXiaomiTtsFormat(config.format) ?? normalized.format,
|
||||
style: trimToUndefined(config.style) ?? normalized.style,
|
||||
};
|
||||
}
|
||||
|
||||
function readXiaomiTtsOverrides(
|
||||
overrides: SpeechProviderOverrides | undefined,
|
||||
): XiaomiTtsOverrides {
|
||||
if (!overrides) {
|
||||
return {};
|
||||
}
|
||||
return {
|
||||
model: trimToUndefined(overrides.model),
|
||||
voice: trimToUndefined(overrides.voice) ?? trimToUndefined(overrides.voiceId),
|
||||
format: normalizeXiaomiTtsFormat(overrides.format),
|
||||
style: trimToUndefined(overrides.style),
|
||||
};
|
||||
}
|
||||
|
||||
function parseDirectiveToken(ctx: SpeechDirectiveTokenParseContext): {
|
||||
handled: boolean;
|
||||
overrides?: SpeechProviderOverrides;
|
||||
warnings?: string[];
|
||||
} {
|
||||
switch (ctx.key) {
|
||||
case "voice":
|
||||
case "voiceid":
|
||||
case "voice_id":
|
||||
case "mimo_voice":
|
||||
case "xiaomi_voice":
|
||||
if (!ctx.policy.allowVoice) {
|
||||
return { handled: true };
|
||||
}
|
||||
return { handled: true, overrides: { voice: ctx.value } };
|
||||
case "model":
|
||||
case "mimo_model":
|
||||
case "xiaomi_model":
|
||||
if (!ctx.policy.allowModelId) {
|
||||
return { handled: true };
|
||||
}
|
||||
return { handled: true, overrides: { model: ctx.value } };
|
||||
case "style":
|
||||
case "mimo_style":
|
||||
case "xiaomi_style":
|
||||
if (!ctx.policy.allowVoiceSettings) {
|
||||
return { handled: true };
|
||||
}
|
||||
return { handled: true, overrides: { style: ctx.value } };
|
||||
case "format":
|
||||
case "responseformat":
|
||||
case "response_format": {
|
||||
if (!ctx.policy.allowVoiceSettings) {
|
||||
return { handled: true };
|
||||
}
|
||||
const format = normalizeXiaomiTtsFormat(ctx.value);
|
||||
if (!format) {
|
||||
return { handled: true, warnings: [`invalid Xiaomi TTS format "${ctx.value}"`] };
|
||||
}
|
||||
return { handled: true, overrides: { format } };
|
||||
}
|
||||
default:
|
||||
return { handled: false };
|
||||
}
|
||||
}
|
||||
|
||||
function buildXiaomiTtsMessages(params: { text: string; style?: string }) {
|
||||
const style = trimToUndefined(params.style);
|
||||
return [
|
||||
...(style ? [{ role: "user" as const, content: style }] : []),
|
||||
{ role: "assistant" as const, content: params.text },
|
||||
];
|
||||
}
|
||||
|
||||
function decodeXiaomiAudioData(body: unknown): Buffer {
|
||||
const root = asObject(body);
|
||||
const choices = Array.isArray(root?.choices) ? root.choices : [];
|
||||
const firstChoice = asObject(choices[0]);
|
||||
const message = asObject(firstChoice?.message);
|
||||
const audio = asObject(message?.audio);
|
||||
const audioData = trimToUndefined(audio?.data);
|
||||
if (!audioData) {
|
||||
throw new Error("Xiaomi TTS API returned no audio data");
|
||||
}
|
||||
return Buffer.from(audioData, "base64");
|
||||
}
|
||||
|
||||
/**
 * Calls the Xiaomi MiMo chat-completions endpoint in TTS mode and returns
 * the decoded audio bytes.
 *
 * The request carries the API key in an `api-key` header, the optional
 * style prompt as a user message, the text to speak as an assistant
 * message (see buildXiaomiTtsMessages), and the desired voice/format in
 * the `audio` field. The fetch goes through the SSRF guard restricted to
 * the base URL's hostname.
 *
 * @returns the synthesized audio as a Buffer (base64-decoded).
 * @throws on non-OK HTTP status or when the response has no audio data.
 */
export async function xiaomiTTS(params: {
  text: string;
  apiKey: string;
  baseUrl: string;
  model: string;
  voice: string;
  format: XiaomiTtsFormat;
  style?: string;
  timeoutMs: number;
}): Promise<Buffer> {
  const { text, apiKey, baseUrl, model, voice, format, style, timeoutMs } = params;
  // Local abort on timeout, in addition to passing timeoutMs to the guard.
  // NOTE(review): if fetchWithSsrFGuard already enforces its own deadline
  // this controller is redundant — confirm against the SDK.
  const controller = new AbortController();
  const timeout = setTimeout(() => controller.abort(), timeoutMs);

  try {
    const { response, release } = await fetchWithSsrFGuard({
      url: `${baseUrl}/chat/completions`,
      init: {
        method: "POST",
        headers: {
          "api-key": apiKey,
          "Content-Type": "application/json",
        },
        body: JSON.stringify({
          model,
          messages: buildXiaomiTtsMessages({ text, style }),
          audio: { format, voice },
        }),
        signal: controller.signal,
      },
      timeoutMs,
      policy: ssrfPolicyFromHttpBaseUrlAllowedHostname(baseUrl),
      auditContext: "xiaomi.tts",
    });
    try {
      await assertOkOrThrowProviderError(response, "Xiaomi TTS API error");
      return decodeXiaomiAudioData(await response.json());
    } finally {
      // Release guard resources even when the status check or decode throws.
      await release();
    }
  } finally {
    clearTimeout(timeout);
  }
}
|
||||
|
||||
async function transcodeAudioToOpus(params: {
|
||||
audioBuffer: Buffer;
|
||||
inputExtension: string;
|
||||
timeoutMs: number | undefined;
|
||||
}) {
|
||||
const tempRoot = resolvePreferredOpenClawTmpDir();
|
||||
await mkdir(tempRoot, { recursive: true, mode: 0o700 });
|
||||
const tempDir = await mkdtemp(path.join(tempRoot, "tts-xiaomi-"));
|
||||
try {
|
||||
const inputPath = path.join(tempDir, `input.${params.inputExtension}`);
|
||||
const outputPath = path.join(tempDir, "voice.opus");
|
||||
await writeFile(inputPath, params.audioBuffer, { mode: 0o600 });
|
||||
await runFfmpeg(
|
||||
[
|
||||
"-hide_banner",
|
||||
"-loglevel",
|
||||
"error",
|
||||
"-y",
|
||||
"-i",
|
||||
inputPath,
|
||||
"-vn",
|
||||
"-c:a",
|
||||
"libopus",
|
||||
"-b:a",
|
||||
"64k",
|
||||
"-ar",
|
||||
"48000",
|
||||
"-ac",
|
||||
"1",
|
||||
outputPath,
|
||||
],
|
||||
{ timeoutMs: params.timeoutMs },
|
||||
);
|
||||
return await readFile(outputPath);
|
||||
} finally {
|
||||
await rm(tempDir, { recursive: true, force: true });
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Builds the Xiaomi MiMo speech (TTS) provider plugin.
 *
 * Configuration precedence inside synthesize: directive overrides, then the
 * per-request provider config (with env-variable fallbacks), then module
 * defaults; the XIAOMI_API_KEY env var is the final API-key fallback.
 * "voice-note" targets get the API output transcoded to Opus; "audio-file"
 * targets return the raw mp3/wav payload.
 */
export function buildXiaomiSpeechProvider(): SpeechProviderPlugin {
  return {
    id: "xiaomi",
    label: "Xiaomi MiMo",
    aliases: ["mimo"],
    autoSelectOrder: 45,
    models: XIAOMI_TTS_MODELS,
    voices: XIAOMI_TTS_VOICES,
    resolveConfig: ({ rawConfig }) => normalizeXiaomiTtsProviderConfig(rawConfig),
    parseDirectiveToken,
    // Static voice catalog derived from XIAOMI_TTS_VOICES.
    listVoices: async () => XIAOMI_TTS_VOICES.map((voice) => ({ id: voice, name: voice })),
    isConfigured: ({ providerConfig }) =>
      Boolean(readXiaomiTtsProviderConfig(providerConfig).apiKey || process.env.XIAOMI_API_KEY),
    synthesize: async (req) => {
      const config = readXiaomiTtsProviderConfig(req.providerConfig);
      const overrides = readXiaomiTtsOverrides(req.providerOverrides);
      const apiKey = config.apiKey || process.env.XIAOMI_API_KEY;
      if (!apiKey) {
        throw new Error("Xiaomi API key missing");
      }
      // Directive overrides beat resolved config field-by-field.
      const outputFormat = overrides.format ?? config.format;
      const audioBuffer = await xiaomiTTS({
        text: req.text,
        apiKey,
        baseUrl: config.baseUrl,
        model: overrides.model ?? config.model,
        voice: overrides.voice ?? config.voice,
        format: outputFormat,
        style: overrides.style ?? config.style,
        timeoutMs: req.timeoutMs,
      });
      if (req.target === "voice-note") {
        // Voice notes are delivered as Opus; transcode the mp3/wav payload.
        const opusBuffer = await transcodeAudioToOpus({
          audioBuffer,
          inputExtension: outputFormat,
          timeoutMs: req.timeoutMs,
        });
        return {
          audioBuffer: opusBuffer,
          outputFormat: "opus",
          fileExtension: ".opus",
          voiceCompatible: true,
        };
      }
      return {
        audioBuffer,
        outputFormat,
        fileExtension: `.${outputFormat}`,
        voiceCompatible: false,
      };
    },
  };
}
|
||||
55
extensions/xiaomi/xiaomi.live.test.ts
Normal file
55
extensions/xiaomi/xiaomi.live.test.ts
Normal file
@@ -0,0 +1,55 @@
|
||||
import { describe, expect, it } from "vitest";
|
||||
import { isLiveTestEnabled } from "../../src/agents/live-test-helpers.js";
|
||||
import {
|
||||
registerProviderPlugin,
|
||||
requireRegisteredProvider,
|
||||
} from "../../test/helpers/plugins/provider-registration.js";
|
||||
import plugin from "./index.js";
|
||||
|
||||
const XIAOMI_API_KEY = process.env.XIAOMI_API_KEY?.trim() ?? "";
|
||||
const LIVE = isLiveTestEnabled() && XIAOMI_API_KEY.length > 0;
|
||||
const describeLive = LIVE ? describe : describe.skip;
|
||||
|
||||
const registerXiaomiPlugin = () =>
|
||||
registerProviderPlugin({
|
||||
plugin,
|
||||
id: "xiaomi",
|
||||
name: "Xiaomi Provider",
|
||||
});
|
||||
|
||||
// Live suite: hits the real Xiaomi API; skipped unless XIAOMI_API_KEY is set
// and live testing is enabled (see describeLive above).
describeLive("xiaomi plugin live", () => {
  it("synthesizes MiMo TTS through the registered speech provider", async () => {
    const { speechProviders } = await registerXiaomiPlugin();
    const provider = requireRegisteredProvider(speechProviders, "xiaomi");

    const audioFile = await provider.synthesize({
      text: "OpenClaw Xiaomi MiMo text to speech integration test OK.",
      cfg: { plugins: { enabled: true } } as never,
      providerConfig: { apiKey: XIAOMI_API_KEY, format: "mp3", voice: "mimo_default" },
      target: "audio-file",
      timeoutMs: 90_000,
    });

    // Only container format and a minimal size are checked — real audio
    // content is non-deterministic.
    expect(audioFile.outputFormat).toBe("mp3");
    expect(audioFile.fileExtension).toBe(".mp3");
    expect(audioFile.audioBuffer.byteLength).toBeGreaterThan(512);
  }, 120_000);

  it("synthesizes MiMo TTS as an Opus voice note", async () => {
    const { speechProviders } = await registerXiaomiPlugin();
    const provider = requireRegisteredProvider(speechProviders, "xiaomi");

    const voiceNote = await provider.synthesize({
      text: "OpenClaw Xiaomi MiMo voice note test OK.",
      cfg: { plugins: { enabled: true } } as never,
      providerConfig: { apiKey: XIAOMI_API_KEY, format: "mp3", voice: "mimo_default" },
      target: "voice-note",
      timeoutMs: 90_000,
    });

    // voice-note target must yield an Opus file flagged voice-compatible.
    expect(voiceNote.outputFormat).toBe("opus");
    expect(voiceNote.fileExtension).toBe(".opus");
    expect(voiceNote.voiceCompatible).toBe(true);
    expect(voiceNote.audioBuffer.byteLength).toBeGreaterThan(512);
  }, 120_000);
});
|
||||
Reference in New Issue
Block a user