fix(google): emit opus voice-note tts

This commit is contained in:
Peter Steinberger
2026-04-25 21:33:15 +01:00
parent d5b6667823
commit e2fd3dcee9
14 changed files with 300 additions and 123 deletions

View File

@@ -38,6 +38,24 @@ describeLive("google plugin live", () => {
expect(audioFile.audioBuffer.byteLength).toBeGreaterThan(512);
}, 120_000);
// Live integration check: a "voice-note" target must come back as voice-compatible Opus audio.
it("transcodes speech to Opus for voice-note targets", async () => {
  const registered = await registerGooglePlugin();
  const googleSpeech = requireRegisteredProvider(registered.speechProviders, "google");

  const { outputFormat, fileExtension, voiceCompatible, audioBuffer } =
    await googleSpeech.synthesize({
      text: "OpenClaw Google voice note integration test OK.",
      cfg: { plugins: { enabled: true } } as never,
      providerConfig: { apiKey: GOOGLE_API_KEY },
      target: "voice-note",
      timeoutMs: 90_000,
    });

  // Format metadata first, then a sanity check that some audio bytes were produced.
  expect(outputFormat).toBe("opus");
  expect(fileExtension).toBe(".opus");
  expect(voiceCompatible).toBe(true);
  expect(audioBuffer.byteLength).toBeGreaterThan(128);
}, 120_000);
it("transcribes synthesized speech through the media provider", async () => {
const { mediaProviders, speechProviders } = await registerGooglePlugin();
const speechProvider = requireRegisteredProvider(speechProviders, "google");

View File

@@ -1,5 +1,12 @@
import * as providerHttp from "openclaw/plugin-sdk/provider-http";
import { afterEach, describe, expect, it, vi } from "vitest";
// Created through vi.hoisted so the mock function already exists when the
// vi.mock factory below runs (vitest hoists vi.mock calls above the imports).
const transcodeAudioBufferToOpusMock = vi.hoisted(() => vi.fn());
// Replace the media-runtime module with a stub that exposes only
// transcodeAudioBufferToOpus, backed by the mock above, so tests can control
// and inspect transcoding calls made by the provider under test.
vi.mock("openclaw/plugin-sdk/media-runtime", () => ({
transcodeAudioBufferToOpus: transcodeAudioBufferToOpusMock,
}));
import { buildGoogleSpeechProvider, __testing } from "./speech-provider.js";
function installGoogleTtsFetchMock(pcm = Buffer.from([1, 0, 2, 0])) {
@@ -31,6 +38,7 @@ describe("Google speech provider", () => {
vi.restoreAllMocks();
vi.unstubAllGlobals();
vi.unstubAllEnvs();
transcodeAudioBufferToOpusMock.mockReset();
});
it("synthesizes Gemini PCM as WAV and preserves audio tags in the request text", async () => {
@@ -82,6 +90,39 @@ describe("Google speech provider", () => {
expect(result.audioBuffer.subarray(8, 12).toString("ascii")).toBe("WAVE");
expect(result.audioBuffer.readUInt32LE(24)).toBe(__testing.GOOGLE_TTS_SAMPLE_RATE);
expect(result.audioBuffer.subarray(44)).toEqual(Buffer.from([1, 0, 2, 0]));
expect(transcodeAudioBufferToOpusMock).not.toHaveBeenCalled();
});
// Unit check: for "voice-note" targets the provider hands a WAV-wrapped buffer to the
// (mocked) Opus transcoder and returns the transcoder's output as voice-compatible Opus.
it("transcodes Gemini PCM to Opus for voice-note targets", async () => {
  const requestTimeoutMs = 12_000;
  installGoogleTtsFetchMock(Buffer.from([5, 0, 6, 0]));
  transcodeAudioBufferToOpusMock.mockResolvedValueOnce(Buffer.from("google-opus"));

  const speech = buildGoogleSpeechProvider();
  const result = await speech.synthesize({
    text: "Send this as a voice note.",
    cfg: {},
    providerConfig: {
      apiKey: "google-test-key",
    },
    target: "voice-note",
    timeoutMs: requestTimeoutMs,
  });

  // The provider result is exactly the transcoder output plus Opus metadata.
  expect(result).toEqual({
    audioBuffer: Buffer.from("google-opus"),
    outputFormat: "opus",
    fileExtension: ".opus",
    voiceCompatible: true,
  });

  // The transcoder receives a WAV input, the google temp prefix, and the request timeout.
  expect(transcodeAudioBufferToOpusMock).toHaveBeenCalledWith({
    audioBuffer: expect.any(Buffer),
    inputExtension: "wav",
    tempPrefix: "tts-google-",
    timeoutMs: requestTimeoutMs,
  });

  // Inspect the first call's input buffer for the RIFF/WAVE header markers.
  const firstCall = transcodeAudioBufferToOpusMock.mock.calls[0];
  const wavInput = firstCall[0].audioBuffer;
  expect(wavInput.subarray(0, 4).toString("ascii")).toBe("RIFF");
  expect(wavInput.subarray(8, 12).toString("ascii")).toBe("WAVE");
});
it("falls back to GEMINI_API_KEY and configured Google API base URL", async () => {

View File

@@ -1,3 +1,4 @@
import { transcodeAudioBufferToOpus } from "openclaw/plugin-sdk/media-runtime";
import {
assertOkOrThrowProviderError,
postJsonRequest,
@@ -394,6 +395,19 @@ export function buildGoogleSpeechProvider(): SpeechProviderPlugin {
speakerName: overrides.speakerName ?? config.speakerName,
timeoutMs: req.timeoutMs,
});
if (req.target === "voice-note") {
return {
audioBuffer: await transcodeAudioBufferToOpus({
audioBuffer: wrapPcm16MonoToWav(pcm),
inputExtension: "wav",
tempPrefix: "tts-google-",
timeoutMs: req.timeoutMs,
}),
outputFormat: "opus",
fileExtension: ".opus",
voiceCompatible: true,
};
}
return {
audioBuffer: wrapPcm16MonoToWav(pcm),
outputFormat: "wav",

View File

@@ -3,10 +3,10 @@ import { tmpdir } from "node:os";
import path from "node:path";
import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
const runFfmpegMock = vi.hoisted(() => vi.fn());
const transcodeAudioBufferToOpusMock = vi.hoisted(() => vi.fn());
vi.mock("openclaw/plugin-sdk/media-runtime", () => ({
runFfmpeg: runFfmpegMock,
transcodeAudioBufferToOpus: transcodeAudioBufferToOpusMock,
}));
import { buildMinimaxSpeechProvider } from "./speech-provider.js";
@@ -293,7 +293,7 @@ describe("buildMinimaxSpeechProvider", () => {
};
clearMinimaxAuthEnv();
vi.stubGlobal("fetch", vi.fn());
runFfmpegMock.mockReset();
transcodeAudioBufferToOpusMock.mockReset();
});
afterEach(async () => {
@@ -333,7 +333,7 @@ describe("buildMinimaxSpeechProvider", () => {
expect(body.model).toBe("speech-2.8-hd");
expect(body.text).toBe("Hello world");
expect(body.voice_setting.voice_id).toBe("English_expressive_narrator");
expect(runFfmpegMock).not.toHaveBeenCalled();
expect(transcodeAudioBufferToOpusMock).not.toHaveBeenCalled();
});
it("transcodes MiniMax MP3 to Opus for voice-note targets", async () => {
@@ -345,15 +345,7 @@ describe("buildMinimaxSpeechProvider", () => {
headers: { "Content-Type": "application/json" },
}),
);
runFfmpegMock.mockImplementationOnce(async (args: string[]) => {
const outputPath = args.at(-1);
if (typeof outputPath !== "string") {
throw new Error("missing ffmpeg output path");
}
await import("node:fs/promises").then((fs) =>
fs.writeFile(outputPath, Buffer.from("fake-opus-data")),
);
});
transcodeAudioBufferToOpusMock.mockResolvedValueOnce(Buffer.from("fake-opus-data"));
const result = await provider.synthesize({
text: "Hello world",
@@ -367,10 +359,12 @@ describe("buildMinimaxSpeechProvider", () => {
expect(result.fileExtension).toBe(".opus");
expect(result.voiceCompatible).toBe(true);
expect(result.audioBuffer.toString()).toBe("fake-opus-data");
expect(runFfmpegMock).toHaveBeenCalledWith(
expect.arrayContaining(["-c:a", "libopus", "-ar", "48000"]),
{ timeoutMs: 30000 },
);
expect(transcodeAudioBufferToOpusMock).toHaveBeenCalledWith({
audioBuffer: Buffer.from("fake-mp3-data"),
inputExtension: "mp3",
tempPrefix: "tts-minimax-",
timeoutMs: 30000,
});
});
it("applies overrides", async () => {

View File

@@ -1,6 +1,4 @@
import { mkdir, mkdtemp, readFile, rm, writeFile } from "node:fs/promises";
import path from "node:path";
import { runFfmpeg } from "openclaw/plugin-sdk/media-runtime";
import { transcodeAudioBufferToOpus } from "openclaw/plugin-sdk/media-runtime";
import {
isProviderAuthProfileConfigured,
type OpenClawConfig,
@@ -14,7 +12,6 @@ import type {
SpeechProviderPlugin,
} from "openclaw/plugin-sdk/speech-core";
import { asFiniteNumber, asObject, trimToUndefined } from "openclaw/plugin-sdk/speech-core";
import { resolvePreferredOpenClawTmpDir } from "openclaw/plugin-sdk/temp-path";
import {
DEFAULT_MINIMAX_TTS_BASE_URL,
MINIMAX_TTS_MODELS,
@@ -209,41 +206,6 @@ function parseDirectiveToken(ctx: SpeechDirectiveTokenParseContext): {
}
}
/**
 * Transcodes an MP3 buffer to Opus by running ffmpeg on files in a scratch directory.
 *
 * The input is written to `input.mp3` inside a fresh `tts-minimax-*` directory under the
 * preferred OpenClaw temp root, ffmpeg encodes it with libopus (`-b:a 64k`, `-ar 48000`,
 * `-ac 1`), and the resulting `voice.opus` is read back. The scratch directory is removed
 * whether or not the transcode succeeds.
 *
 * @param audioBuffer - MP3 bytes to transcode.
 * @param timeoutMs - Timeout forwarded to `runFfmpeg`; may be `undefined`.
 * @returns The contents of the Opus file produced by ffmpeg.
 */
async function transcodeMp3ToOpus(audioBuffer: Buffer, timeoutMs: number | undefined) {
  const baseTmp = resolvePreferredOpenClawTmpDir();
  await mkdir(baseTmp, { recursive: true, mode: 0o700 });
  const workDir = await mkdtemp(path.join(baseTmp, "tts-minimax-"));
  try {
    const mp3Path = path.join(workDir, "input.mp3");
    const opusPath = path.join(workDir, "voice.opus");
    // Input file is created owner-read/write only.
    await writeFile(mp3Path, audioBuffer, { mode: 0o600 });

    const ffmpegArgs = [
      // Quiet output, overwrite without prompting.
      "-hide_banner",
      "-loglevel",
      "error",
      "-y",
      // Source file; -vn ignores any video stream.
      "-i",
      mp3Path,
      "-vn",
      // Encoder settings passed to libopus.
      "-c:a",
      "libopus",
      "-b:a",
      "64k",
      "-ar",
      "48000",
      "-ac",
      "1",
      opusPath,
    ];
    await runFfmpeg(ffmpegArgs, { timeoutMs });

    return await readFile(opusPath);
  } finally {
    // Always clean up the scratch directory, even when ffmpeg or file I/O fails.
    await rm(workDir, { recursive: true, force: true });
  }
}
export function buildMinimaxSpeechProvider(): SpeechProviderPlugin {
return {
id: "minimax",
@@ -326,7 +288,12 @@ export function buildMinimaxSpeechProvider(): SpeechProviderPlugin {
timeoutMs: req.timeoutMs,
});
if (req.target === "voice-note") {
const opusBuffer = await transcodeMp3ToOpus(audioBuffer, req.timeoutMs);
const opusBuffer = await transcodeAudioBufferToOpus({
audioBuffer,
inputExtension: "mp3",
tempPrefix: "tts-minimax-",
timeoutMs: req.timeoutMs,
});
return {
audioBuffer: opusBuffer,
outputFormat: "opus",

View File

@@ -1,9 +1,9 @@
import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
const runFfmpegMock = vi.hoisted(() => vi.fn());
const transcodeAudioBufferToOpusMock = vi.hoisted(() => vi.fn());
vi.mock("openclaw/plugin-sdk/media-runtime", () => ({
runFfmpeg: runFfmpegMock,
transcodeAudioBufferToOpus: transcodeAudioBufferToOpusMock,
}));
import { buildXiaomiSpeechProvider } from "./speech-provider.js";
@@ -123,7 +123,7 @@ describe("buildXiaomiSpeechProvider", () => {
beforeEach(() => {
vi.stubGlobal("fetch", vi.fn());
runFfmpegMock.mockReset();
transcodeAudioBufferToOpusMock.mockReset();
});
afterEach(() => {
@@ -170,7 +170,7 @@ describe("buildXiaomiSpeechProvider", () => {
{ role: "assistant", content: "Hello from OpenClaw." },
]);
expect(body.audio).toEqual({ format: "mp3", voice: "default_en" });
expect(runFfmpegMock).not.toHaveBeenCalled();
expect(transcodeAudioBufferToOpusMock).not.toHaveBeenCalled();
});
it("transcodes Xiaomi output to Opus for voice-note targets", async () => {
@@ -181,15 +181,7 @@ describe("buildXiaomiSpeechProvider", () => {
headers: { "Content-Type": "application/json" },
}),
);
runFfmpegMock.mockImplementationOnce(async (args: string[]) => {
const outputPath = args.at(-1);
if (typeof outputPath !== "string") {
throw new Error("missing ffmpeg output path");
}
await import("node:fs/promises").then((fs) =>
fs.writeFile(outputPath, Buffer.from("fake-opus-audio")),
);
});
transcodeAudioBufferToOpusMock.mockResolvedValueOnce(Buffer.from("fake-opus-audio"));
const result = await provider.synthesize({
text: "Hello from OpenClaw.",
@@ -203,10 +195,12 @@ describe("buildXiaomiSpeechProvider", () => {
expect(result.fileExtension).toBe(".opus");
expect(result.voiceCompatible).toBe(true);
expect(result.audioBuffer.toString()).toBe("fake-opus-audio");
expect(runFfmpegMock).toHaveBeenCalledWith(
expect.arrayContaining(["-c:a", "libopus", "-ar", "48000"]),
{ timeoutMs: 30000 },
);
expect(transcodeAudioBufferToOpusMock).toHaveBeenCalledWith({
audioBuffer: Buffer.from("fake-mp3-audio"),
inputExtension: "mp3",
tempPrefix: "tts-xiaomi-",
timeoutMs: 30000,
});
});
it("throws when API key is missing", async () => {

View File

@@ -1,6 +1,4 @@
import { mkdir, mkdtemp, readFile, rm, writeFile } from "node:fs/promises";
import path from "node:path";
import { runFfmpeg } from "openclaw/plugin-sdk/media-runtime";
import { transcodeAudioBufferToOpus } from "openclaw/plugin-sdk/media-runtime";
import { assertOkOrThrowProviderError } from "openclaw/plugin-sdk/provider-http";
import { normalizeResolvedSecretInputString } from "openclaw/plugin-sdk/secret-input";
import type {
@@ -14,7 +12,6 @@ import {
fetchWithSsrFGuard,
ssrfPolicyFromHttpBaseUrlAllowedHostname,
} from "openclaw/plugin-sdk/ssrf-runtime";
import { resolvePreferredOpenClawTmpDir } from "openclaw/plugin-sdk/temp-path";
export const DEFAULT_XIAOMI_TTS_BASE_URL = "https://api.xiaomimimo.com/v1";
export const DEFAULT_XIAOMI_TTS_MODEL = "mimo-v2.5-tts";
@@ -242,45 +239,6 @@ export async function xiaomiTTS(params: {
}
}
/**
 * Transcodes an audio buffer to Opus by running ffmpeg on files in a scratch directory.
 *
 * The input is written to `input.<inputExtension>` inside a fresh `tts-xiaomi-*` directory
 * under the preferred OpenClaw temp root, ffmpeg encodes it with libopus (`-b:a 64k`,
 * `-ar 48000`, `-ac 1`), and the resulting `voice.opus` is read back. The scratch
 * directory is removed whether or not the transcode succeeds.
 *
 * @param params.audioBuffer - Source audio bytes.
 * @param params.inputExtension - File extension (without the dot) used for the input file name.
 * @param params.timeoutMs - Timeout forwarded to `runFfmpeg`; may be `undefined`.
 * @returns The contents of the Opus file produced by ffmpeg.
 */
async function transcodeAudioToOpus(params: {
  audioBuffer: Buffer;
  inputExtension: string;
  timeoutMs: number | undefined;
}) {
  const baseTmp = resolvePreferredOpenClawTmpDir();
  await mkdir(baseTmp, { recursive: true, mode: 0o700 });
  const workDir = await mkdtemp(path.join(baseTmp, "tts-xiaomi-"));
  try {
    const sourcePath = path.join(workDir, `input.${params.inputExtension}`);
    const opusPath = path.join(workDir, "voice.opus");
    // Input file is created owner-read/write only.
    await writeFile(sourcePath, params.audioBuffer, { mode: 0o600 });

    const ffmpegArgs = [
      // Quiet output, overwrite without prompting.
      "-hide_banner",
      "-loglevel",
      "error",
      "-y",
      // Source file; -vn ignores any video stream.
      "-i",
      sourcePath,
      "-vn",
      // Encoder settings passed to libopus.
      "-c:a",
      "libopus",
      "-b:a",
      "64k",
      "-ar",
      "48000",
      "-ac",
      "1",
      opusPath,
    ];
    await runFfmpeg(ffmpegArgs, { timeoutMs: params.timeoutMs });

    return await readFile(opusPath);
  } finally {
    // Always clean up the scratch directory, even when ffmpeg or file I/O fails.
    await rm(workDir, { recursive: true, force: true });
  }
}
export function buildXiaomiSpeechProvider(): SpeechProviderPlugin {
return {
id: "xiaomi",
@@ -313,9 +271,10 @@ export function buildXiaomiSpeechProvider(): SpeechProviderPlugin {
timeoutMs: req.timeoutMs,
});
if (req.target === "voice-note") {
const opusBuffer = await transcodeAudioToOpus({
const opusBuffer = await transcodeAudioBufferToOpus({
audioBuffer,
inputExtension: outputFormat,
tempPrefix: "tts-xiaomi-",
timeoutMs: req.timeoutMs,
});
return {