mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-06 13:50:49 +00:00
feat(tts): add xiaomi mimo speech provider
This commit is contained in:
@@ -43,7 +43,7 @@ describeLive("google plugin live", () => {
|
||||
const speechProvider = requireRegisteredProvider(speechProviders, "google");
|
||||
const mediaProvider = requireRegisteredProvider(mediaProviders, "google");
|
||||
|
||||
const phrase = "Testing Google audio transcription with OpenClaw.";
|
||||
const phrase = "Testing Google audio transcription with pineapple.";
|
||||
const audioFile = await speechProvider.synthesize({
|
||||
text: phrase,
|
||||
cfg: { plugins: { enabled: true } } as never,
|
||||
@@ -62,7 +62,7 @@ describeLive("google plugin live", () => {
|
||||
|
||||
const normalized = normalizeTranscriptForMatch(transcript?.text ?? "");
|
||||
expect(normalized).toContain("google");
|
||||
expect(normalized).toContain("openclaw");
|
||||
expect(normalized).toContain("pineapple");
|
||||
}, 180_000);
|
||||
|
||||
it("runs Gemini web search through the registered provider tool", async () => {
|
||||
|
||||
@@ -1,14 +1,30 @@
|
||||
import { describe, expect, it } from "vitest";
|
||||
import { isLiveTestEnabled } from "../../src/agents/live-test-helpers.js";
|
||||
import {
|
||||
registerProviderPlugin,
|
||||
requireRegisteredProvider,
|
||||
} from "../../test/helpers/plugins/provider-registration.js";
|
||||
import plugin from "./index.js";
|
||||
import { buildMinimaxSpeechProvider } from "./speech-provider.js";
|
||||
import { createMiniMaxWebSearchProvider } from "./src/minimax-web-search-provider.js";
|
||||
|
||||
const MINIMAX_API_KEY = process.env.MINIMAX_API_KEY?.trim() ?? "";
|
||||
const MINIMAX_SEARCH_KEY =
|
||||
process.env.MINIMAX_CODE_PLAN_KEY?.trim() ||
|
||||
process.env.MINIMAX_CODING_API_KEY?.trim() ||
|
||||
process.env.MINIMAX_API_KEY?.trim() ||
|
||||
MINIMAX_API_KEY ||
|
||||
"";
|
||||
const describeLive =
|
||||
isLiveTestEnabled() && MINIMAX_SEARCH_KEY.length > 0 ? describe : describe.skip;
|
||||
const describeTtsLive =
|
||||
isLiveTestEnabled() && MINIMAX_API_KEY.length > 0 ? describe : describe.skip;
|
||||
|
||||
const registerMinimaxPlugin = () =>
|
||||
registerProviderPlugin({
|
||||
plugin,
|
||||
id: "minimax",
|
||||
name: "MiniMax Provider",
|
||||
});
|
||||
|
||||
describeLive("minimax plugin live", () => {
|
||||
it("runs MiniMax web search through the provider tool", async () => {
|
||||
@@ -25,3 +41,39 @@ describeLive("minimax plugin live", () => {
|
||||
expect(Array.isArray(result?.results)).toBe(true);
|
||||
}, 120_000);
|
||||
});
|
||||
|
||||
// Live TTS suite: hits the real MiniMax API; skipped unless MINIMAX_API_KEY
// is set and live testing is enabled (see describeTtsLive above).
describeTtsLive("minimax tts live", () => {
  it("synthesizes TTS through the registered speech provider", async () => {
    const { speechProviders } = await registerMinimaxPlugin();
    const provider = requireRegisteredProvider(speechProviders, "minimax");

    const audioFile = await provider.synthesize({
      text: "OpenClaw MiniMax text to speech integration test OK.",
      cfg: { plugins: { enabled: true } } as never,
      providerConfig: { apiKey: MINIMAX_API_KEY },
      target: "audio-file",
      timeoutMs: 90_000,
    });

    // Only container format and a minimal size are checked — real audio
    // content is non-deterministic.
    expect(audioFile.outputFormat).toBe("mp3");
    expect(audioFile.fileExtension).toBe(".mp3");
    expect(audioFile.audioBuffer.byteLength).toBeGreaterThan(512);
  }, 120_000);

  it("synthesizes MiniMax TTS as an Opus voice note", async () => {
    // NOTE(review): this test builds the provider directly instead of going
    // through registerMinimaxPlugin() like the test above — confirm whether
    // that asymmetry is intentional.
    const provider = buildMinimaxSpeechProvider();

    const voiceNote = await provider.synthesize({
      text: "OpenClaw MiniMax voice note test OK.",
      cfg: { plugins: { enabled: true } } as never,
      providerConfig: { apiKey: MINIMAX_API_KEY },
      target: "voice-note",
      timeoutMs: 90_000,
    });

    expect(voiceNote.outputFormat).toBe("opus");
    expect(voiceNote.fileExtension).toBe(".opus");
    expect(voiceNote.voiceCompatible).toBe(true);
    expect(voiceNote.audioBuffer.byteLength).toBeGreaterThan(512);
  }, 120_000);
});
|
||||
|
||||
@@ -2,6 +2,7 @@ import { defineSingleProviderPluginEntry } from "openclaw/plugin-sdk/provider-en
|
||||
import { PROVIDER_LABELS } from "openclaw/plugin-sdk/provider-usage";
|
||||
import { applyXiaomiConfig, XIAOMI_DEFAULT_MODEL_REF } from "./onboard.js";
|
||||
import { buildXiaomiProvider } from "./provider-catalog.js";
|
||||
import { buildXiaomiSpeechProvider } from "./speech-provider.js";
|
||||
|
||||
const PROVIDER_ID = "xiaomi";
|
||||
|
||||
@@ -40,4 +41,7 @@ export default defineSingleProviderPluginEntry({
|
||||
windows: [],
|
||||
}),
|
||||
},
|
||||
register(api) {
|
||||
api.registerSpeechProvider(buildXiaomiSpeechProvider());
|
||||
},
|
||||
});
|
||||
|
||||
@@ -2,6 +2,9 @@
|
||||
"id": "xiaomi",
|
||||
"enabledByDefault": true,
|
||||
"providers": ["xiaomi"],
|
||||
"contracts": {
|
||||
"speechProviders": ["xiaomi"]
|
||||
},
|
||||
"providerAuthEnvVars": {
|
||||
"xiaomi": ["XIAOMI_API_KEY"]
|
||||
},
|
||||
|
||||
250
extensions/xiaomi/speech-provider.test.ts
Normal file
250
extensions/xiaomi/speech-provider.test.ts
Normal file
@@ -0,0 +1,250 @@
|
||||
import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
|
||||
|
||||
// Hoisted so the mock function exists before the vi.mock factory below runs.
const runFfmpegMock = vi.hoisted(() => vi.fn());

// Replace the media runtime so tests never shell out to a real ffmpeg binary.
vi.mock("openclaw/plugin-sdk/media-runtime", () => ({
  runFfmpeg: runFfmpegMock,
}));
|
||||
|
||||
import { buildXiaomiSpeechProvider } from "./speech-provider.js";
|
||||
|
||||
// Unit suite for the Xiaomi MiMo speech provider. Network and ffmpeg are
// mocked (fetch via vi.stubGlobal, ffmpeg via runFfmpegMock above).
describe("buildXiaomiSpeechProvider", () => {
  const provider = buildXiaomiSpeechProvider();

  describe("metadata", () => {
    it("registers Xiaomi MiMo as a speech provider", () => {
      expect(provider.id).toBe("xiaomi");
      expect(provider.aliases).toContain("mimo");
      expect(provider.models).toContain("mimo-v2.5-tts");
      expect(provider.models).toContain("mimo-v2-tts");
      expect(provider.voices).toContain("mimo_default");
    });
  });

  describe("isConfigured", () => {
    // Snapshot the env so per-test mutations (delete/set XIAOMI_API_KEY)
    // cannot leak into other suites.
    const savedEnv = { ...process.env };

    afterEach(() => {
      process.env = { ...savedEnv };
    });

    it("returns true when apiKey is in provider config", () => {
      expect(
        provider.isConfigured({ providerConfig: { apiKey: "sk-test" }, timeoutMs: 30000 }),
      ).toBe(true);
    });

    it("returns false when no apiKey is available", () => {
      delete process.env.XIAOMI_API_KEY;
      expect(provider.isConfigured({ providerConfig: {}, timeoutMs: 30000 })).toBe(false);
    });

    it("returns true when XIAOMI_API_KEY env var is set", () => {
      process.env.XIAOMI_API_KEY = "sk-env";
      expect(provider.isConfigured({ providerConfig: {}, timeoutMs: 30000 })).toBe(true);
    });
  });

  describe("resolveConfig", () => {
    it("reads providers.xiaomi settings", () => {
      const config = provider.resolveConfig!({
        rawConfig: {
          providers: {
            xiaomi: {
              baseUrl: "https://example.com/v1/",
              model: "mimo-v2-tts",
              voice: "default_en",
              format: "wav",
              style: "Bright and fast.",
            },
          },
        },
        cfg: {} as never,
        timeoutMs: 30000,
      });
      // Trailing slash on baseUrl must be stripped during normalization.
      expect(config).toMatchObject({
        baseUrl: "https://example.com/v1",
        model: "mimo-v2-tts",
        voice: "default_en",
        format: "wav",
        style: "Bright and fast.",
      });
    });

    it("accepts the mimo provider config alias", () => {
      // "mimo" is an accepted alias key, and "voiceId" an alias for "voice".
      const config = provider.resolveConfig!({
        rawConfig: { providers: { mimo: { voiceId: "default_zh" } } },
        cfg: {} as never,
        timeoutMs: 30000,
      });
      expect(config.voice).toBe("default_zh");
    });
  });

  describe("parseDirectiveToken", () => {
    // Fully-permissive policy so each directive kind is actually exercised.
    const policy = {
      enabled: true,
      allowText: true,
      allowProvider: true,
      allowVoice: true,
      allowModelId: true,
      allowVoiceSettings: true,
      allowNormalization: true,
      allowSeed: true,
    };

    it("handles voice, model, style, and format tokens", () => {
      expect(provider.parseDirectiveToken!({ key: "voice", value: "default_en", policy })).toEqual({
        handled: true,
        overrides: { voice: "default_en" },
      });
      expect(provider.parseDirectiveToken!({ key: "model", value: "mimo-v2-tts", policy })).toEqual(
        { handled: true, overrides: { model: "mimo-v2-tts" } },
      );
      expect(provider.parseDirectiveToken!({ key: "style", value: "whispered", policy })).toEqual({
        handled: true,
        overrides: { style: "whispered" },
      });
      expect(provider.parseDirectiveToken!({ key: "format", value: "wav", policy })).toEqual({
        handled: true,
        overrides: { format: "wav" },
      });
    });

    it("warns on invalid format", () => {
      // "ogg" is not a supported request format; the token is still consumed
      // (handled: true) but produces a warning instead of an override.
      const result = provider.parseDirectiveToken!({ key: "format", value: "ogg", policy });
      expect(result.handled).toBe(true);
      expect(result.warnings).toHaveLength(1);
    });
  });

  describe("synthesize", () => {
    const savedFetch = globalThis.fetch;

    beforeEach(() => {
      vi.stubGlobal("fetch", vi.fn());
      runFfmpegMock.mockReset();
    });

    afterEach(() => {
      globalThis.fetch = savedFetch;
      vi.restoreAllMocks();
    });

    it("makes the Xiaomi chat completions TTS call and decodes audio", async () => {
      // The API returns base64-encoded audio inside a chat-completions shape.
      const audio = Buffer.from("fake-mp3-audio").toString("base64");
      const mockFetch = vi.mocked(globalThis.fetch);
      mockFetch.mockResolvedValueOnce(
        new Response(JSON.stringify({ choices: [{ message: { audio: { data: audio } } }] }), {
          status: 200,
          headers: { "Content-Type": "application/json" },
        }),
      );

      const result = await provider.synthesize({
        text: "Hello from OpenClaw.",
        cfg: {} as never,
        providerConfig: {
          apiKey: "sk-test",
          model: "mimo-v2-tts",
          voice: "default_en",
          style: "Bright.",
        },
        target: "audio-file",
        timeoutMs: 30000,
      });

      expect(result.outputFormat).toBe("mp3");
      expect(result.fileExtension).toBe(".mp3");
      expect(result.voiceCompatible).toBe(false);
      expect(result.audioBuffer.toString()).toBe("fake-mp3-audio");

      // Verify the outgoing request wire format: api-key header, style as a
      // user message, the text as an assistant message, and the audio block.
      expect(mockFetch).toHaveBeenCalledOnce();
      const [url, init] = mockFetch.mock.calls[0];
      expect(url).toBe("https://api.xiaomimimo.com/v1/chat/completions");
      expect(init?.headers).toMatchObject({ "api-key": "sk-test" });
      const body = JSON.parse(init!.body as string);
      expect(body.model).toBe("mimo-v2-tts");
      expect(body.messages).toEqual([
        { role: "user", content: "Bright." },
        { role: "assistant", content: "Hello from OpenClaw." },
      ]);
      expect(body.audio).toEqual({ format: "mp3", voice: "default_en" });
      // audio-file target must not trigger transcoding.
      expect(runFfmpegMock).not.toHaveBeenCalled();
    });

    it("transcodes Xiaomi output to Opus for voice-note targets", async () => {
      const audio = Buffer.from("fake-mp3-audio").toString("base64");
      vi.mocked(globalThis.fetch).mockResolvedValueOnce(
        new Response(JSON.stringify({ choices: [{ message: { audio: { data: audio } } }] }), {
          status: 200,
          headers: { "Content-Type": "application/json" },
        }),
      );
      // Fake ffmpeg: write the "transcoded" bytes to the output path, which
      // the provider passes as the last CLI argument.
      runFfmpegMock.mockImplementationOnce(async (args: string[]) => {
        const outputPath = args.at(-1);
        if (typeof outputPath !== "string") {
          throw new Error("missing ffmpeg output path");
        }
        await import("node:fs/promises").then((fs) =>
          fs.writeFile(outputPath, Buffer.from("fake-opus-audio")),
        );
      });

      const result = await provider.synthesize({
        text: "Hello from OpenClaw.",
        cfg: {} as never,
        providerConfig: { apiKey: "sk-test" },
        target: "voice-note",
        timeoutMs: 30000,
      });

      expect(result.outputFormat).toBe("opus");
      expect(result.fileExtension).toBe(".opus");
      expect(result.voiceCompatible).toBe(true);
      expect(result.audioBuffer.toString()).toBe("fake-opus-audio");
      expect(runFfmpegMock).toHaveBeenCalledWith(
        expect.arrayContaining(["-c:a", "libopus", "-ar", "48000"]),
        { timeoutMs: 30000 },
      );
    });

    it("throws when API key is missing", async () => {
      // Clear the env fallback so the missing-key path is actually taken;
      // restore it in finally to avoid polluting other tests.
      const savedKey = process.env.XIAOMI_API_KEY;
      delete process.env.XIAOMI_API_KEY;
      try {
        await expect(
          provider.synthesize({
            text: "Test",
            cfg: {} as never,
            providerConfig: {},
            target: "audio-file",
            timeoutMs: 30000,
          }),
        ).rejects.toThrow("Xiaomi API key missing");
      } finally {
        if (savedKey) {
          process.env.XIAOMI_API_KEY = savedKey;
        }
      }
    });

    it("throws when the API response has no audio data", async () => {
      vi.mocked(globalThis.fetch).mockResolvedValueOnce(
        new Response(JSON.stringify({ choices: [{ message: {} }] }), {
          status: 200,
          headers: { "Content-Type": "application/json" },
        }),
      );
      await expect(
        provider.synthesize({
          text: "Test",
          cfg: {} as never,
          providerConfig: { apiKey: "sk-test" },
          target: "audio-file",
          timeoutMs: 30000,
        }),
      ).rejects.toThrow("Xiaomi TTS API returned no audio data");
    });
  });
});
|
||||
336
extensions/xiaomi/speech-provider.ts
Normal file
336
extensions/xiaomi/speech-provider.ts
Normal file
@@ -0,0 +1,336 @@
|
||||
import { mkdir, mkdtemp, readFile, rm, writeFile } from "node:fs/promises";
|
||||
import path from "node:path";
|
||||
import { runFfmpeg } from "openclaw/plugin-sdk/media-runtime";
|
||||
import { assertOkOrThrowProviderError } from "openclaw/plugin-sdk/provider-http";
|
||||
import { normalizeResolvedSecretInputString } from "openclaw/plugin-sdk/secret-input";
|
||||
import type {
|
||||
SpeechDirectiveTokenParseContext,
|
||||
SpeechProviderConfig,
|
||||
SpeechProviderOverrides,
|
||||
SpeechProviderPlugin,
|
||||
} from "openclaw/plugin-sdk/speech-core";
|
||||
import { asObject, trimToUndefined } from "openclaw/plugin-sdk/speech-core";
|
||||
import {
|
||||
fetchWithSsrFGuard,
|
||||
ssrfPolicyFromHttpBaseUrlAllowedHostname,
|
||||
} from "openclaw/plugin-sdk/ssrf-runtime";
|
||||
import { resolvePreferredOpenClawTmpDir } from "openclaw/plugin-sdk/temp-path";
|
||||
|
||||
export const DEFAULT_XIAOMI_TTS_BASE_URL = "https://api.xiaomimimo.com/v1";
|
||||
export const DEFAULT_XIAOMI_TTS_MODEL = "mimo-v2.5-tts";
|
||||
export const DEFAULT_XIAOMI_TTS_VOICE = "mimo_default";
|
||||
export const DEFAULT_XIAOMI_TTS_FORMAT = "mp3";
|
||||
|
||||
export const XIAOMI_TTS_MODELS = ["mimo-v2.5-tts", "mimo-v2-tts"] as const;
|
||||
|
||||
export const XIAOMI_TTS_VOICES = [
|
||||
"mimo_default",
|
||||
"default_zh",
|
||||
"default_en",
|
||||
"Mia",
|
||||
"Chloe",
|
||||
"Milo",
|
||||
"Dean",
|
||||
] as const;
|
||||
|
||||
const XIAOMI_TTS_FORMATS = ["mp3", "wav"] as const;
|
||||
|
||||
type XiaomiTtsFormat = (typeof XIAOMI_TTS_FORMATS)[number];
|
||||
|
||||
type XiaomiTtsProviderConfig = {
|
||||
apiKey?: string;
|
||||
baseUrl: string;
|
||||
model: string;
|
||||
voice: string;
|
||||
format: XiaomiTtsFormat;
|
||||
style?: string;
|
||||
};
|
||||
|
||||
type XiaomiTtsOverrides = {
|
||||
model?: string;
|
||||
voice?: string;
|
||||
format?: XiaomiTtsFormat;
|
||||
style?: string;
|
||||
};
|
||||
|
||||
function normalizeXiaomiTtsBaseUrl(baseUrl?: string): string {
|
||||
return (baseUrl?.trim() || DEFAULT_XIAOMI_TTS_BASE_URL).replace(/\/+$/, "");
|
||||
}
|
||||
|
||||
function normalizeXiaomiTtsFormat(value: unknown): XiaomiTtsFormat | undefined {
|
||||
const normalized = trimToUndefined(value)?.toLowerCase();
|
||||
return XIAOMI_TTS_FORMATS.includes(normalized as XiaomiTtsFormat)
|
||||
? (normalized as XiaomiTtsFormat)
|
||||
: undefined;
|
||||
}
|
||||
|
||||
function resolveXiaomiTtsConfigRecord(
|
||||
rawConfig: Record<string, unknown>,
|
||||
): Record<string, unknown> | undefined {
|
||||
const providers = asObject(rawConfig.providers);
|
||||
return asObject(providers?.xiaomi) ?? asObject(providers?.mimo) ?? asObject(rawConfig.xiaomi);
|
||||
}
|
||||
|
||||
function normalizeXiaomiTtsProviderConfig(
|
||||
rawConfig: Record<string, unknown>,
|
||||
): XiaomiTtsProviderConfig {
|
||||
const raw = resolveXiaomiTtsConfigRecord(rawConfig);
|
||||
return {
|
||||
apiKey: normalizeResolvedSecretInputString({
|
||||
value: raw?.apiKey,
|
||||
path: "messages.tts.providers.xiaomi.apiKey",
|
||||
}),
|
||||
baseUrl: normalizeXiaomiTtsBaseUrl(
|
||||
trimToUndefined(raw?.baseUrl) ?? trimToUndefined(process.env.XIAOMI_BASE_URL),
|
||||
),
|
||||
model:
|
||||
trimToUndefined(raw?.model) ??
|
||||
trimToUndefined(process.env.XIAOMI_TTS_MODEL) ??
|
||||
DEFAULT_XIAOMI_TTS_MODEL,
|
||||
voice:
|
||||
trimToUndefined(raw?.voice) ??
|
||||
trimToUndefined(raw?.voiceId) ??
|
||||
trimToUndefined(process.env.XIAOMI_TTS_VOICE) ??
|
||||
DEFAULT_XIAOMI_TTS_VOICE,
|
||||
format:
|
||||
normalizeXiaomiTtsFormat(raw?.format) ??
|
||||
normalizeXiaomiTtsFormat(process.env.XIAOMI_TTS_FORMAT) ??
|
||||
DEFAULT_XIAOMI_TTS_FORMAT,
|
||||
style: trimToUndefined(raw?.style),
|
||||
};
|
||||
}
|
||||
|
||||
function readXiaomiTtsProviderConfig(config: SpeechProviderConfig): XiaomiTtsProviderConfig {
|
||||
const normalized = normalizeXiaomiTtsProviderConfig({});
|
||||
return {
|
||||
apiKey:
|
||||
normalizeResolvedSecretInputString({
|
||||
value: config.apiKey,
|
||||
path: "messages.tts.providers.xiaomi.apiKey",
|
||||
}) ?? normalized.apiKey,
|
||||
baseUrl: normalizeXiaomiTtsBaseUrl(trimToUndefined(config.baseUrl) ?? normalized.baseUrl),
|
||||
model: trimToUndefined(config.model) ?? normalized.model,
|
||||
voice: trimToUndefined(config.voice) ?? trimToUndefined(config.voiceId) ?? normalized.voice,
|
||||
format: normalizeXiaomiTtsFormat(config.format) ?? normalized.format,
|
||||
style: trimToUndefined(config.style) ?? normalized.style,
|
||||
};
|
||||
}
|
||||
|
||||
function readXiaomiTtsOverrides(
|
||||
overrides: SpeechProviderOverrides | undefined,
|
||||
): XiaomiTtsOverrides {
|
||||
if (!overrides) {
|
||||
return {};
|
||||
}
|
||||
return {
|
||||
model: trimToUndefined(overrides.model),
|
||||
voice: trimToUndefined(overrides.voice) ?? trimToUndefined(overrides.voiceId),
|
||||
format: normalizeXiaomiTtsFormat(overrides.format),
|
||||
style: trimToUndefined(overrides.style),
|
||||
};
|
||||
}
|
||||
|
||||
function parseDirectiveToken(ctx: SpeechDirectiveTokenParseContext): {
|
||||
handled: boolean;
|
||||
overrides?: SpeechProviderOverrides;
|
||||
warnings?: string[];
|
||||
} {
|
||||
switch (ctx.key) {
|
||||
case "voice":
|
||||
case "voiceid":
|
||||
case "voice_id":
|
||||
case "mimo_voice":
|
||||
case "xiaomi_voice":
|
||||
if (!ctx.policy.allowVoice) {
|
||||
return { handled: true };
|
||||
}
|
||||
return { handled: true, overrides: { voice: ctx.value } };
|
||||
case "model":
|
||||
case "mimo_model":
|
||||
case "xiaomi_model":
|
||||
if (!ctx.policy.allowModelId) {
|
||||
return { handled: true };
|
||||
}
|
||||
return { handled: true, overrides: { model: ctx.value } };
|
||||
case "style":
|
||||
case "mimo_style":
|
||||
case "xiaomi_style":
|
||||
if (!ctx.policy.allowVoiceSettings) {
|
||||
return { handled: true };
|
||||
}
|
||||
return { handled: true, overrides: { style: ctx.value } };
|
||||
case "format":
|
||||
case "responseformat":
|
||||
case "response_format": {
|
||||
if (!ctx.policy.allowVoiceSettings) {
|
||||
return { handled: true };
|
||||
}
|
||||
const format = normalizeXiaomiTtsFormat(ctx.value);
|
||||
if (!format) {
|
||||
return { handled: true, warnings: [`invalid Xiaomi TTS format "${ctx.value}"`] };
|
||||
}
|
||||
return { handled: true, overrides: { format } };
|
||||
}
|
||||
default:
|
||||
return { handled: false };
|
||||
}
|
||||
}
|
||||
|
||||
function buildXiaomiTtsMessages(params: { text: string; style?: string }) {
|
||||
const style = trimToUndefined(params.style);
|
||||
return [
|
||||
...(style ? [{ role: "user" as const, content: style }] : []),
|
||||
{ role: "assistant" as const, content: params.text },
|
||||
];
|
||||
}
|
||||
|
||||
function decodeXiaomiAudioData(body: unknown): Buffer {
|
||||
const root = asObject(body);
|
||||
const choices = Array.isArray(root?.choices) ? root.choices : [];
|
||||
const firstChoice = asObject(choices[0]);
|
||||
const message = asObject(firstChoice?.message);
|
||||
const audio = asObject(message?.audio);
|
||||
const audioData = trimToUndefined(audio?.data);
|
||||
if (!audioData) {
|
||||
throw new Error("Xiaomi TTS API returned no audio data");
|
||||
}
|
||||
return Buffer.from(audioData, "base64");
|
||||
}
|
||||
|
||||
/**
 * Calls the Xiaomi MiMo chat-completions endpoint in TTS mode and returns
 * the decoded audio bytes.
 *
 * The request carries the API key in an `api-key` header, the optional
 * style prompt as a user message, the text to speak as an assistant
 * message (see buildXiaomiTtsMessages), and the desired voice/format in
 * the `audio` field. The fetch goes through the SSRF guard restricted to
 * the base URL's hostname.
 *
 * @returns the synthesized audio as a Buffer (base64-decoded).
 * @throws on non-OK HTTP status or when the response has no audio data.
 */
export async function xiaomiTTS(params: {
  text: string;
  apiKey: string;
  baseUrl: string;
  model: string;
  voice: string;
  format: XiaomiTtsFormat;
  style?: string;
  timeoutMs: number;
}): Promise<Buffer> {
  const { text, apiKey, baseUrl, model, voice, format, style, timeoutMs } = params;
  // Local abort on timeout, in addition to passing timeoutMs to the guard.
  // NOTE(review): if fetchWithSsrFGuard already enforces its own deadline
  // this controller is redundant — confirm against the SDK.
  const controller = new AbortController();
  const timeout = setTimeout(() => controller.abort(), timeoutMs);

  try {
    const { response, release } = await fetchWithSsrFGuard({
      url: `${baseUrl}/chat/completions`,
      init: {
        method: "POST",
        headers: {
          "api-key": apiKey,
          "Content-Type": "application/json",
        },
        body: JSON.stringify({
          model,
          messages: buildXiaomiTtsMessages({ text, style }),
          audio: { format, voice },
        }),
        signal: controller.signal,
      },
      timeoutMs,
      policy: ssrfPolicyFromHttpBaseUrlAllowedHostname(baseUrl),
      auditContext: "xiaomi.tts",
    });
    try {
      await assertOkOrThrowProviderError(response, "Xiaomi TTS API error");
      return decodeXiaomiAudioData(await response.json());
    } finally {
      // Release guard resources even when the status check or decode throws.
      await release();
    }
  } finally {
    clearTimeout(timeout);
  }
}
|
||||
|
||||
async function transcodeAudioToOpus(params: {
|
||||
audioBuffer: Buffer;
|
||||
inputExtension: string;
|
||||
timeoutMs: number | undefined;
|
||||
}) {
|
||||
const tempRoot = resolvePreferredOpenClawTmpDir();
|
||||
await mkdir(tempRoot, { recursive: true, mode: 0o700 });
|
||||
const tempDir = await mkdtemp(path.join(tempRoot, "tts-xiaomi-"));
|
||||
try {
|
||||
const inputPath = path.join(tempDir, `input.${params.inputExtension}`);
|
||||
const outputPath = path.join(tempDir, "voice.opus");
|
||||
await writeFile(inputPath, params.audioBuffer, { mode: 0o600 });
|
||||
await runFfmpeg(
|
||||
[
|
||||
"-hide_banner",
|
||||
"-loglevel",
|
||||
"error",
|
||||
"-y",
|
||||
"-i",
|
||||
inputPath,
|
||||
"-vn",
|
||||
"-c:a",
|
||||
"libopus",
|
||||
"-b:a",
|
||||
"64k",
|
||||
"-ar",
|
||||
"48000",
|
||||
"-ac",
|
||||
"1",
|
||||
outputPath,
|
||||
],
|
||||
{ timeoutMs: params.timeoutMs },
|
||||
);
|
||||
return await readFile(outputPath);
|
||||
} finally {
|
||||
await rm(tempDir, { recursive: true, force: true });
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Builds the Xiaomi MiMo speech (TTS) provider plugin.
 *
 * Configuration precedence inside synthesize: directive overrides, then the
 * per-request provider config (with env-variable fallbacks), then module
 * defaults; the XIAOMI_API_KEY env var is the final API-key fallback.
 * "voice-note" targets get the API output transcoded to Opus; "audio-file"
 * targets return the raw mp3/wav payload.
 */
export function buildXiaomiSpeechProvider(): SpeechProviderPlugin {
  return {
    id: "xiaomi",
    label: "Xiaomi MiMo",
    aliases: ["mimo"],
    autoSelectOrder: 45,
    models: XIAOMI_TTS_MODELS,
    voices: XIAOMI_TTS_VOICES,
    resolveConfig: ({ rawConfig }) => normalizeXiaomiTtsProviderConfig(rawConfig),
    parseDirectiveToken,
    // Static voice catalog derived from XIAOMI_TTS_VOICES.
    listVoices: async () => XIAOMI_TTS_VOICES.map((voice) => ({ id: voice, name: voice })),
    isConfigured: ({ providerConfig }) =>
      Boolean(readXiaomiTtsProviderConfig(providerConfig).apiKey || process.env.XIAOMI_API_KEY),
    synthesize: async (req) => {
      const config = readXiaomiTtsProviderConfig(req.providerConfig);
      const overrides = readXiaomiTtsOverrides(req.providerOverrides);
      const apiKey = config.apiKey || process.env.XIAOMI_API_KEY;
      if (!apiKey) {
        throw new Error("Xiaomi API key missing");
      }
      // Directive overrides beat resolved config field-by-field.
      const outputFormat = overrides.format ?? config.format;
      const audioBuffer = await xiaomiTTS({
        text: req.text,
        apiKey,
        baseUrl: config.baseUrl,
        model: overrides.model ?? config.model,
        voice: overrides.voice ?? config.voice,
        format: outputFormat,
        style: overrides.style ?? config.style,
        timeoutMs: req.timeoutMs,
      });
      if (req.target === "voice-note") {
        // Voice notes are delivered as Opus; transcode the mp3/wav payload.
        const opusBuffer = await transcodeAudioToOpus({
          audioBuffer,
          inputExtension: outputFormat,
          timeoutMs: req.timeoutMs,
        });
        return {
          audioBuffer: opusBuffer,
          outputFormat: "opus",
          fileExtension: ".opus",
          voiceCompatible: true,
        };
      }
      return {
        audioBuffer,
        outputFormat,
        fileExtension: `.${outputFormat}`,
        voiceCompatible: false,
      };
    },
  };
}
|
||||
55
extensions/xiaomi/xiaomi.live.test.ts
Normal file
55
extensions/xiaomi/xiaomi.live.test.ts
Normal file
@@ -0,0 +1,55 @@
|
||||
import { describe, expect, it } from "vitest";
|
||||
import { isLiveTestEnabled } from "../../src/agents/live-test-helpers.js";
|
||||
import {
|
||||
registerProviderPlugin,
|
||||
requireRegisteredProvider,
|
||||
} from "../../test/helpers/plugins/provider-registration.js";
|
||||
import plugin from "./index.js";
|
||||
|
||||
const XIAOMI_API_KEY = process.env.XIAOMI_API_KEY?.trim() ?? "";
|
||||
const LIVE = isLiveTestEnabled() && XIAOMI_API_KEY.length > 0;
|
||||
const describeLive = LIVE ? describe : describe.skip;
|
||||
|
||||
const registerXiaomiPlugin = () =>
|
||||
registerProviderPlugin({
|
||||
plugin,
|
||||
id: "xiaomi",
|
||||
name: "Xiaomi Provider",
|
||||
});
|
||||
|
||||
// Live suite: hits the real Xiaomi API; skipped unless XIAOMI_API_KEY is set
// and live testing is enabled (see describeLive above).
describeLive("xiaomi plugin live", () => {
  it("synthesizes MiMo TTS through the registered speech provider", async () => {
    const { speechProviders } = await registerXiaomiPlugin();
    const provider = requireRegisteredProvider(speechProviders, "xiaomi");

    const audioFile = await provider.synthesize({
      text: "OpenClaw Xiaomi MiMo text to speech integration test OK.",
      cfg: { plugins: { enabled: true } } as never,
      providerConfig: { apiKey: XIAOMI_API_KEY, format: "mp3", voice: "mimo_default" },
      target: "audio-file",
      timeoutMs: 90_000,
    });

    // Only container format and a minimal size are checked — real audio
    // content is non-deterministic.
    expect(audioFile.outputFormat).toBe("mp3");
    expect(audioFile.fileExtension).toBe(".mp3");
    expect(audioFile.audioBuffer.byteLength).toBeGreaterThan(512);
  }, 120_000);

  it("synthesizes MiMo TTS as an Opus voice note", async () => {
    const { speechProviders } = await registerXiaomiPlugin();
    const provider = requireRegisteredProvider(speechProviders, "xiaomi");

    const voiceNote = await provider.synthesize({
      text: "OpenClaw Xiaomi MiMo voice note test OK.",
      cfg: { plugins: { enabled: true } } as never,
      providerConfig: { apiKey: XIAOMI_API_KEY, format: "mp3", voice: "mimo_default" },
      target: "voice-note",
      timeoutMs: 90_000,
    });

    // voice-note target must yield an Opus file flagged voice-compatible.
    expect(voiceNote.outputFormat).toBe("opus");
    expect(voiceNote.fileExtension).toBe(".opus");
    expect(voiceNote.voiceCompatible).toBe(true);
    expect(voiceNote.audioBuffer.byteLength).toBeGreaterThan(512);
  }, 120_000);
});
|
||||
Reference in New Issue
Block a user