feat: support openai tts extra body

Peter Steinberger
2026-05-01 22:57:27 +01:00
parent 11a268819e
commit 5e3265b09b
8 changed files with 146 additions and 2 deletions

View File

@@ -6,6 +6,7 @@ Docs: https://docs.openclaw.ai
### Changes
- Providers/OpenAI: add `extraBody`/`extra_body` passthrough for OpenAI-compatible TTS endpoints, so custom speech servers can receive fields such as `lang` in `/audio/speech` requests. Fixes #39900. Thanks @R3NK0R.
- Dependencies: refresh workspace dependency pins, including TypeBox 1.1.37, AWS SDK 3.1041.0, Microsoft Teams 2.0.9, and Marked 18.0.3. Thanks @mariozechner, @aws, and @microsoft.
### Fixes

View File

@@ -479,9 +479,12 @@ Legacy `plugins.entries.openai.config.personality` is still read as a compatibil
| Format | `messages.tts.providers.openai.responseFormat` | `opus` for voice notes, `mp3` for files |
| API key | `messages.tts.providers.openai.apiKey` | Falls back to `OPENAI_API_KEY` |
| Base URL | `messages.tts.providers.openai.baseUrl` | `https://api.openai.com/v1` |
| Extra body | `messages.tts.providers.openai.extraBody` / `extra_body` | (unset) |
Available models: `gpt-4o-mini-tts`, `tts-1`, `tts-1-hd`. Available voices: `alloy`, `ash`, `ballad`, `cedar`, `coral`, `echo`, `fable`, `juniper`, `marin`, `onyx`, `nova`, `sage`, `shimmer`, `verse`.
`extraBody` is merged into the `/audio/speech` request JSON after OpenClaw's generated fields, so its keys override the generated ones; use it for OpenAI-compatible endpoints that require additional keys such as `lang`. Prototype-polluting keys (`__proto__`, `constructor`, `prototype`) are stripped before the merge.
```json5
{
  messages: {
    tts: { providers: { openai: { extraBody: { lang: "en-US" } } } },
  },
}
```

View File

@@ -892,6 +892,7 @@ OpenAI and ElevenLabs output formats are fixed per channel as listed above.
<ParamField path="model" type="string">OpenAI TTS model id (e.g. `gpt-4o-mini-tts`).</ParamField>
<ParamField path="voice" type="string">Voice name (e.g. `alloy`, `cedar`).</ParamField>
<ParamField path="instructions" type="string">Explicit OpenAI `instructions` field. When set, persona prompt fields are **not** auto-mapped.</ParamField>
<ParamField path="extraBody / extra_body" type="Record<string, unknown>">Extra JSON fields merged into `/audio/speech` request bodies after generated OpenAI TTS fields. Use this for OpenAI-compatible endpoints such as Kokoro that require provider-specific keys like `lang`; unsafe prototype keys are ignored.</ParamField>
<ParamField path="baseUrl" type="string">
Override the OpenAI TTS endpoint. Resolution order: config → `OPENAI_TTS_BASE_URL` → `https://api.openai.com/v1`. Non-default values are treated as OpenAI-compatible TTS endpoints, so custom model and voice names are accepted.
</ParamField>
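
A minimal config sketch for an OpenAI-compatible server (the `baseUrl`, `model`, and `voice` values mirror the Kokoro-style schema test in this commit; adjust them for your endpoint):

```json5
{
  messages: {
    tts: {
      providers: {
        openai: {
          baseUrl: "http://localhost:8880/v1", // local Kokoro-compatible endpoint
          model: "kokoro",
          voice: "em_alex",
          extraBody: { lang: "e", speed: 1.2 }, // merged last; overrides generated fields
        },
      },
    },
  },
}
```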

View File

@@ -16,6 +16,7 @@ vi.mock("openclaw/plugin-sdk/ssrf-runtime", () => ({
}));
function isSpeechRequestBody(value: unknown): value is {
[key: string]: unknown;
model?: string;
voice?: string;
speed?: number;
@@ -25,6 +26,7 @@ function isSpeechRequestBody(value: unknown): value is {
}
function parseRequestBody(init: RequestInit | undefined): {
[key: string]: unknown;
model?: string;
voice?: string;
speed?: number;
@@ -73,6 +75,9 @@ describe("buildOpenAISpeechProvider", () => {
speed: 1.25,
instructions: " Speak warmly ",
responseFormat: " WAV ",
extraBody: {
lang: "en-US",
},
},
},
},
@@ -86,6 +91,9 @@ describe("buildOpenAISpeechProvider", () => {
speed: 1.25,
instructions: "Speak warmly",
responseFormat: "wav",
extraBody: {
lang: "en-US",
},
});
});
@@ -285,4 +293,39 @@ describe("buildOpenAISpeechProvider", () => {
expect(result.fileExtension).toBe(".wav");
expect(result.voiceCompatible).toBe(false);
});
it("passes extra_body config through to OpenAI-compatible speech requests", async () => {
const provider = buildOpenAISpeechProvider();
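// Assert inside the mock so the merged request body is verified at call time.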
const fetchMock = vi.fn(async (_url: string, init?: RequestInit) => {
const body = parseRequestBody(init);
expect(body).toMatchObject({
model: "custom-tts",
voice: "custom-voice",
lang: "en-US",
response_format: "mp3",
});
return new Response(new Uint8Array([1, 2, 3]), { status: 200 });
});
globalThis.fetch = fetchMock as unknown as typeof fetch;
const result = await provider.synthesize({
text: "hello",
cfg: {} as never,
providerConfig: {
apiKey: "sk-test",
baseUrl: "https://proxy.example.com/openai/v1",
model: "custom-tts",
voice: "custom-voice",
responseFormat: "mp3",
extra_body: {
lang: "en-US",
},
},
target: "audio-file",
timeoutMs: 1_000,
});
expect(result.outputFormat).toBe("mp3");
expect(fetchMock).toHaveBeenCalledTimes(1);
});
});

View File

@@ -37,6 +37,7 @@ type OpenAITtsProviderConfig = {
speed?: number;
instructions?: string;
responseFormat?: OpenAiSpeechResponseFormat;
extraBody?: Record<string, unknown>;
};
type OpenAITtsProviderOverrides = {
@@ -96,10 +97,19 @@ function responseFormatToFileExtension(
}
}
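// Treat missing, non-object, or empty values as unset so empty extraBody configs are ignored.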
function readExtraBody(value: unknown): Record<string, unknown> | undefined {
const body = asObjectRecord(value);
if (!body || Object.keys(body).length === 0) {
return undefined;
}
return body;
}
function normalizeOpenAIProviderConfig(
rawConfig: Record<string, unknown>,
): OpenAITtsProviderConfig {
const raw = resolveOpenAIProviderConfigRecord(rawConfig);
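// Accept both camelCase `extraBody` and snake_case `extra_body`; camelCase wins when both are set.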
const extraBody = readExtraBody(raw?.extraBody) ?? readExtraBody(raw?.extra_body);
return {
apiKey: normalizeResolvedSecretInputString({
value: raw?.apiKey,
@@ -115,6 +125,7 @@ function normalizeOpenAIProviderConfig(
speed: asFiniteNumber(raw?.speed),
instructions: trimToUndefined(raw?.instructions),
responseFormat: normalizeOpenAISpeechResponseFormat(raw?.responseFormat),
extraBody,
};
}
@@ -129,6 +140,7 @@ function readOpenAIProviderConfig(config: SpeechProviderConfig): OpenAITtsProviderConfig {
instructions: trimToUndefined(config.instructions) ?? normalized.instructions,
responseFormat:
normalizeOpenAISpeechResponseFormat(config.responseFormat) ?? normalized.responseFormat,
extraBody: readExtraBody(config.extraBody) ?? readExtraBody(config.extra_body),
};
}
@@ -298,6 +310,7 @@ export function buildOpenAISpeechProvider(): SpeechProviderPlugin {
speed: overrides.speed ?? config.speed,
instructions: config.instructions,
responseFormat,
extraBody: config.extraBody,
timeoutMs: req.timeoutMs,
});
return {
@@ -325,6 +338,7 @@ export function buildOpenAISpeechProvider(): SpeechProviderPlugin {
speed: overrides.speed ?? config.speed,
instructions: config.instructions,
responseFormat: outputFormat,
extraBody: config.extraBody,
timeoutMs: req.timeoutMs,
});
return { audioBuffer, outputFormat, sampleRate };

View File

@@ -169,6 +169,47 @@ describe("openai tts", () => {
expect(body.voice).toBe("custom-voice");
});
it("merges sanitized extraBody fields into TTS requests", async () => {
const fetchMock = vi.fn(
async (_url: string | URL, _init?: RequestInit) =>
new Response(Buffer.from("audio-bytes"), { status: 200 }),
);
globalThis.fetch = fetchMock as unknown as typeof fetch;
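// JSON.parse creates an own "__proto__" property; an object literal would set the prototype instead.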
const extraBody = JSON.parse(
'{"lang":"e","speed":1.2,"__proto__":{"polluted":true},"constructor":"bad","prototype":"bad"}',
) as Record<string, unknown>;
await openaiTTS({
text: "hello",
apiKey: "test-key",
baseUrl: "https://tts.example.com/v1",
model: "tts-1",
voice: "custom-voice",
speed: 1,
responseFormat: "mp3",
extraBody,
timeoutMs: 5_000,
});
const [, init] = fetchMock.mock.calls[0] ?? [];
if (typeof init?.body !== "string") {
throw new Error("expected JSON request body");
}
const body = JSON.parse(init.body) as Record<string, unknown>;
expect(body).toMatchObject({
model: "tts-1",
input: "hello",
voice: "custom-voice",
response_format: "mp3",
lang: "e",
speed: 1.2,
});
expect(Object.hasOwn(body, "__proto__")).toBe(false);
expect(Object.hasOwn(body, "constructor")).toBe(false);
expect(Object.hasOwn(body, "prototype")).toBe(false);
expect((Object.prototype as Record<string, unknown>).polluted).toBeUndefined();
});
it("omits instructions for unsupported models on the official OpenAI endpoint", async () => {
const fetchMock = vi.fn(
async (_url: string | URL, _init?: RequestInit) =>

View File

@@ -78,6 +78,17 @@ export function resolveOpenAITtsInstructions(
return model.includes("gpt-4o-mini-tts") ? next : undefined;
}
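// Drop keys that could pollute Object.prototype before user-supplied extraBody is merged into the request.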
function sanitizeExtraBodyRecord(value: Record<string, unknown>): Record<string, unknown> {
const sanitized: Record<string, unknown> = {};
for (const [key, entry] of Object.entries(value)) {
if (key === "__proto__" || key === "constructor" || key === "prototype") {
continue;
}
sanitized[key] = entry;
}
return sanitized;
}
export async function openaiTTS(params: {
text: string;
apiKey: string;
@@ -87,10 +98,21 @@ export async function openaiTTS(params: {
speed?: number;
instructions?: string;
responseFormat: "mp3" | "opus" | "pcm" | "wav";
extraBody?: Record<string, unknown>;
timeoutMs: number;
}): Promise<Buffer> {
const {
text,
apiKey,
baseUrl,
model,
voice,
speed,
instructions,
responseFormat,
extraBody,
timeoutMs,
} = params;
const effectiveInstructions = resolveOpenAITtsInstructions(model, instructions, baseUrl);
if (!isValidOpenAIModel(model, baseUrl)) {
@@ -120,6 +142,7 @@ export async function openaiTTS(params: {
response_format: responseFormat,
...(speed != null && { speed }),
...(effectiveInstructions != null && { instructions: effectiveInstructions }),
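// Spread last so sanitized extraBody keys override the generated fields above (e.g. a custom `speed`).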
...(extraBody == null ? {} : sanitizeExtraBodyRecord(extraBody)),
});
const requestUrl = `${baseUrl}/audio/speech`;
const debugProxyFetchPatchInstalled = isDebugProxyGlobalFetchPatchInstalled();

View File

@@ -16,6 +16,24 @@ describe("TtsConfigSchema openai speed and instructions", () => {
).not.toThrow();
});
it("accepts openai extraBody objects for compatible TTS endpoints", () => {
expect(() =>
TtsConfigSchema.parse({
providers: {
openai: {
baseUrl: "http://localhost:8880/v1",
model: "kokoro",
voice: "em_alex",
extraBody: {
lang: "e",
speed: 1.2,
},
},
},
}),
).not.toThrow();
});
it("rejects out-of-range openai speed", () => {
expect(() =>
TtsConfigSchema.parse({