fix(minimax): transcode voice-note tts to opus

This commit is contained in:
Peter Steinberger
2026-04-25 04:52:19 +01:00
parent f3cc74ec5d
commit 225ff9a866
5 changed files with 110 additions and 4 deletions

View File

@@ -1,4 +1,11 @@
import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
// Create the ffmpeg mock through vi.hoisted so it already exists when the
// vi.mock factory below runs (Vitest hoists vi.mock calls above the imports).
const runFfmpegMock = vi.hoisted(() => vi.fn());
// Swap the media-runtime module's runFfmpeg export for the mock, so the speech
// provider imported below picks up the mock instead of the real implementation.
vi.mock("openclaw/plugin-sdk/media-runtime", () => ({
runFfmpeg: runFfmpegMock,
}));
import { buildMinimaxSpeechProvider } from "./speech-provider.js";
describe("buildMinimaxSpeechProvider", () => {
@@ -213,6 +220,7 @@ describe("buildMinimaxSpeechProvider", () => {
beforeEach(() => {
  // Start every test with a clean ffmpeg mock (no recorded calls, no leftover
  // one-shot implementations) ...
  runFfmpegMock.mockReset();
  // ... and a fresh stub in place of the global fetch.
  vi.stubGlobal("fetch", vi.fn());
});
afterEach(() => {
@@ -240,7 +248,7 @@ describe("buildMinimaxSpeechProvider", () => {
expect(result.outputFormat).toBe("mp3");
expect(result.fileExtension).toBe(".mp3");
expect(result.voiceCompatible).toBe(true);
expect(result.voiceCompatible).toBe(false);
expect(result.audioBuffer.toString()).toBe("fake-audio-data");
expect(mockFetch).toHaveBeenCalledOnce();
@@ -250,6 +258,44 @@ describe("buildMinimaxSpeechProvider", () => {
expect(body.model).toBe("speech-2.8-hd");
expect(body.text).toBe("Hello world");
expect(body.voice_setting.voice_id).toBe("English_expressive_narrator");
expect(runFfmpegMock).not.toHaveBeenCalled();
});
it("transcodes MiniMax MP3 to Opus for voice-note targets", async () => {
  // Mocked MiniMax reply: the audio travels as a hex string under data.audio.
  const mp3Hex = Buffer.from("fake-mp3-data").toString("hex");
  const responseBody = JSON.stringify({ data: { audio: mp3Hex } });
  const fetchMock = vi.mocked(globalThis.fetch);
  fetchMock.mockResolvedValueOnce(
    new Response(responseBody, {
      status: 200,
      headers: { "Content-Type": "application/json" },
    }),
  );

  // Fake ffmpeg: the output path is the last CLI argument, so write the
  // pretend Opus bytes there for the provider to read back.
  runFfmpegMock.mockImplementationOnce(async (args: string[]) => {
    const outputPath = args.at(-1);
    if (typeof outputPath !== "string") {
      throw new Error("missing ffmpeg output path");
    }
    const { writeFile } = await import("node:fs/promises");
    await writeFile(outputPath, Buffer.from("fake-opus-data"));
  });

  const result = await provider.synthesize({
    text: "Hello world",
    cfg: {} as never,
    providerConfig: { apiKey: "sk-test", baseUrl: "https://api.minimaxi.com" },
    target: "voice-note",
    timeoutMs: 30000,
  });

  // The result must describe the transcoded Opus audio, not the original MP3.
  expect(result.outputFormat).toBe("opus");
  expect(result.fileExtension).toBe(".opus");
  expect(result.voiceCompatible).toBe(true);
  expect(result.audioBuffer.toString()).toBe("fake-opus-data");

  // ffmpeg must have been asked for libopus at 48 kHz, with the request
  // timeout forwarded unchanged.
  const expectedArgs = expect.arrayContaining(["-c:a", "libopus", "-ar", "48000"]);
  expect(runFfmpegMock).toHaveBeenCalledWith(expectedArgs, { timeoutMs: 30000 });
});
it("applies overrides", async () => {

View File

@@ -1,3 +1,6 @@
import { mkdir, mkdtemp, readFile, rm, writeFile } from "node:fs/promises";
import path from "node:path";
import { runFfmpeg } from "openclaw/plugin-sdk/media-runtime";
import { normalizeResolvedSecretInputString } from "openclaw/plugin-sdk/secret-input";
import type {
SpeechDirectiveTokenParseContext,
@@ -6,6 +9,7 @@ import type {
SpeechProviderPlugin,
} from "openclaw/plugin-sdk/speech-core";
import { asFiniteNumber, asObject, trimToUndefined } from "openclaw/plugin-sdk/speech-core";
import { resolvePreferredOpenClawTmpDir } from "openclaw/plugin-sdk/temp-path";
import {
DEFAULT_MINIMAX_TTS_BASE_URL,
MINIMAX_TTS_MODELS,
@@ -150,6 +154,41 @@ function parseDirectiveToken(ctx: SpeechDirectiveTokenParseContext): {
}
}
/**
 * Re-encode an MP3 buffer as Opus by running ffmpeg on temporary files.
 *
 * The MP3 bytes are written into a freshly created scratch directory beneath
 * the directory returned by `resolvePreferredOpenClawTmpDir()`, ffmpeg encodes
 * them with libopus (64k bitrate, 48000 Hz sample rate, one channel), and the
 * resulting file is read back into memory. The scratch directory is removed
 * afterwards whether or not the conversion succeeded.
 *
 * @param audioBuffer - MP3 bytes to transcode.
 * @param timeoutMs - Passed through unchanged as the `timeoutMs` option of `runFfmpeg`.
 * @returns The contents of the Opus file written by ffmpeg.
 */
async function transcodeMp3ToOpus(audioBuffer: Buffer, timeoutMs: number | undefined) {
  const scratchParent = resolvePreferredOpenClawTmpDir();
  // Create the parent if needed; 0o700 requests owner-only access for new directories.
  await mkdir(scratchParent, { recursive: true, mode: 0o700 });
  const scratchDir = await mkdtemp(path.join(scratchParent, "tts-minimax-"));
  try {
    const mp3Path = path.join(scratchDir, "input.mp3");
    const opusPath = path.join(scratchDir, "voice.opus");
    const ffmpegArgs = [
      "-hide_banner",
      "-loglevel",
      "error",
      // Overwrite the output without prompting.
      "-y",
      "-i",
      mp3Path,
      // Audio only: drop any video stream found in the input.
      "-vn",
      "-c:a",
      "libopus",
      "-b:a",
      "64k",
      "-ar",
      "48000",
      "-ac",
      "1",
      opusPath,
    ];

    await writeFile(mp3Path, audioBuffer, { mode: 0o600 });
    await runFfmpeg(ffmpegArgs, { timeoutMs });
    // Read the result before the finally block deletes the scratch directory.
    const opusBytes = await readFile(opusPath);
    return opusBytes;
  } finally {
    await rm(scratchDir, { recursive: true, force: true });
  }
}
export function buildMinimaxSpeechProvider(): SpeechProviderPlugin {
return {
id: "minimax",
@@ -223,11 +262,20 @@ export function buildMinimaxSpeechProvider(): SpeechProviderPlugin {
pitch: overrides.pitch ?? config.pitch,
timeoutMs: req.timeoutMs,
});
if (req.target === "voice-note") {
const opusBuffer = await transcodeMp3ToOpus(audioBuffer, req.timeoutMs);
return {
audioBuffer: opusBuffer,
outputFormat: "opus",
fileExtension: ".opus",
voiceCompatible: true,
};
}
return {
audioBuffer,
outputFormat: "mp3",
fileExtension: ".mp3",
voiceCompatible: true,
voiceCompatible: false,
};
},
};