diff --git a/CHANGELOG.md b/CHANGELOG.md
index 3ecd4617a6c..6cc4cf2473c 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -6,6 +6,7 @@ Docs: https://docs.openclaw.ai
### Changes
+- Providers/OpenAI: add `extraBody`/`extra_body` passthrough for OpenAI-compatible TTS endpoints, so custom speech servers can receive fields such as `lang` in `/audio/speech` requests. Fixes #39900. Thanks @R3NK0R.
- Dependencies: refresh workspace dependency pins, including TypeBox 1.1.37, AWS SDK 3.1041.0, Microsoft Teams 2.0.9, and Marked 18.0.3. Thanks @mariozechner, @aws, and @microsoft.
### Fixes
diff --git a/docs/providers/openai.md b/docs/providers/openai.md
index d787acce130..f0b5b188f22 100644
--- a/docs/providers/openai.md
+++ b/docs/providers/openai.md
@@ -479,9 +479,12 @@ Legacy `plugins.entries.openai.config.personality` is still read as a compatibil
| Format | `messages.tts.providers.openai.responseFormat` | `opus` for voice notes, `mp3` for files |
| API key | `messages.tts.providers.openai.apiKey` | Falls back to `OPENAI_API_KEY` |
| Base URL | `messages.tts.providers.openai.baseUrl` | `https://api.openai.com/v1` |
+ | Extra body | `messages.tts.providers.openai.extraBody` / `extra_body` | (unset) |
Available models: `gpt-4o-mini-tts`, `tts-1`, `tts-1-hd`. Available voices: `alloy`, `ash`, `ballad`, `cedar`, `coral`, `echo`, `fable`, `juniper`, `marin`, `onyx`, `nova`, `sage`, `shimmer`, `verse`.
+ `extraBody` is merged into `/audio/speech` request JSON after OpenClaw's generated fields, so use it for OpenAI-compatible endpoints that require additional keys such as `lang`. Unsafe prototype keys (`__proto__`, `constructor`, `prototype`) are ignored.
+
```json5
{
messages: {
diff --git a/docs/tools/tts.md b/docs/tools/tts.md
index ab600b4753d..8231921767e 100644
--- a/docs/tools/tts.md
+++ b/docs/tools/tts.md
@@ -892,6 +892,7 @@ OpenAI and ElevenLabs output formats are fixed per channel as listed above.
OpenAI TTS model id (e.g. `gpt-4o-mini-tts`).
Voice name (e.g. `alloy`, `cedar`).
Explicit OpenAI `instructions` field. When set, persona prompt fields are **not** auto-mapped.
+ Extra JSON fields merged into `/audio/speech` request bodies after generated OpenAI TTS fields. Use this for OpenAI-compatible endpoints such as Kokoro that require provider-specific keys like `lang`; unsafe prototype keys are ignored.
Override the OpenAI TTS endpoint. Resolution order: config → `OPENAI_TTS_BASE_URL` → `https://api.openai.com/v1`. Non-default values are treated as OpenAI-compatible TTS endpoints, so custom model and voice names are accepted.
diff --git a/extensions/openai/speech-provider.test.ts b/extensions/openai/speech-provider.test.ts
index b3b7492eaac..7d9ee46eea6 100644
--- a/extensions/openai/speech-provider.test.ts
+++ b/extensions/openai/speech-provider.test.ts
@@ -16,6 +16,7 @@ vi.mock("openclaw/plugin-sdk/ssrf-runtime", () => ({
}));
function isSpeechRequestBody(value: unknown): value is {
+ [key: string]: unknown;
model?: string;
voice?: string;
speed?: number;
@@ -25,6 +26,7 @@ function isSpeechRequestBody(value: unknown): value is {
}
function parseRequestBody(init: RequestInit | undefined): {
+ [key: string]: unknown;
model?: string;
voice?: string;
speed?: number;
@@ -73,6 +75,9 @@ describe("buildOpenAISpeechProvider", () => {
speed: 1.25,
instructions: " Speak warmly ",
responseFormat: " WAV ",
+ extraBody: {
+ lang: "en-US",
+ },
},
},
},
@@ -86,6 +91,9 @@ describe("buildOpenAISpeechProvider", () => {
speed: 1.25,
instructions: "Speak warmly",
responseFormat: "wav",
+ extraBody: {
+ lang: "en-US",
+ },
});
});
@@ -285,4 +293,39 @@ describe("buildOpenAISpeechProvider", () => {
expect(result.fileExtension).toBe(".wav");
expect(result.voiceCompatible).toBe(false);
});
+
+ it("passes extra_body config through to OpenAI-compatible speech requests", async () => {
+ const provider = buildOpenAISpeechProvider();
+ const fetchMock = vi.fn(async (_url: string, init?: RequestInit) => {
+ const body = parseRequestBody(init);
+ expect(body).toMatchObject({
+ model: "custom-tts",
+ voice: "custom-voice",
+ lang: "en-US",
+ response_format: "mp3",
+ });
+ return new Response(new Uint8Array([1, 2, 3]), { status: 200 });
+ });
+ globalThis.fetch = fetchMock as unknown as typeof fetch;
+
+ const result = await provider.synthesize({
+ text: "hello",
+ cfg: {} as never,
+ providerConfig: {
+ apiKey: "sk-test",
+ baseUrl: "https://proxy.example.com/openai/v1",
+ model: "custom-tts",
+ voice: "custom-voice",
+ responseFormat: "mp3",
+ extra_body: {
+ lang: "en-US",
+ },
+ },
+ target: "audio-file",
+ timeoutMs: 1_000,
+ });
+
+ expect(result.outputFormat).toBe("mp3");
+ expect(fetchMock).toHaveBeenCalledTimes(1);
+ });
});
diff --git a/extensions/openai/speech-provider.ts b/extensions/openai/speech-provider.ts
index 043fd494828..c39e687dfe1 100644
--- a/extensions/openai/speech-provider.ts
+++ b/extensions/openai/speech-provider.ts
@@ -37,6 +37,7 @@ type OpenAITtsProviderConfig = {
speed?: number;
instructions?: string;
responseFormat?: OpenAiSpeechResponseFormat;
+ extraBody?: Record<string, unknown>;
};
type OpenAITtsProviderOverrides = {
@@ -96,10 +97,19 @@ function responseFormatToFileExtension(
}
}
+function readExtraBody(value: unknown): Record<string, unknown> | undefined {
+ const body = asObjectRecord(value);
+ if (!body || Object.keys(body).length === 0) {
+ return undefined;
+ }
+ return body;
+}
+
function normalizeOpenAIProviderConfig(
rawConfig: Record<string, unknown>,
): OpenAITtsProviderConfig {
const raw = resolveOpenAIProviderConfigRecord(rawConfig);
+ const extraBody = readExtraBody(raw?.extraBody) ?? readExtraBody(raw?.extra_body);
return {
apiKey: normalizeResolvedSecretInputString({
value: raw?.apiKey,
@@ -115,6 +125,7 @@ function normalizeOpenAIProviderConfig(
speed: asFiniteNumber(raw?.speed),
instructions: trimToUndefined(raw?.instructions),
responseFormat: normalizeOpenAISpeechResponseFormat(raw?.responseFormat),
+ extraBody,
};
}
@@ -129,6 +140,7 @@ function readOpenAIProviderConfig(config: SpeechProviderConfig): OpenAITtsProvid
instructions: trimToUndefined(config.instructions) ?? normalized.instructions,
responseFormat:
normalizeOpenAISpeechResponseFormat(config.responseFormat) ?? normalized.responseFormat,
+ extraBody: readExtraBody(config.extraBody) ?? readExtraBody(config.extra_body),
};
}
@@ -298,6 +310,7 @@ export function buildOpenAISpeechProvider(): SpeechProviderPlugin {
speed: overrides.speed ?? config.speed,
instructions: config.instructions,
responseFormat,
+ extraBody: config.extraBody,
timeoutMs: req.timeoutMs,
});
return {
@@ -325,6 +338,7 @@ export function buildOpenAISpeechProvider(): SpeechProviderPlugin {
speed: overrides.speed ?? config.speed,
instructions: config.instructions,
responseFormat: outputFormat,
+ extraBody: config.extraBody,
timeoutMs: req.timeoutMs,
});
return { audioBuffer, outputFormat, sampleRate };
diff --git a/extensions/openai/tts.test.ts b/extensions/openai/tts.test.ts
index e11d56dbeea..343503879a5 100644
--- a/extensions/openai/tts.test.ts
+++ b/extensions/openai/tts.test.ts
@@ -169,6 +169,47 @@ describe("openai tts", () => {
expect(body.voice).toBe("custom-voice");
});
+ it("merges sanitized extraBody fields into TTS requests", async () => {
+ const fetchMock = vi.fn(
+ async (_url: string | URL, _init?: RequestInit) =>
+ new Response(Buffer.from("audio-bytes"), { status: 200 }),
+ );
+ globalThis.fetch = fetchMock as unknown as typeof fetch;
+ const extraBody = JSON.parse(
+ '{"lang":"e","speed":1.2,"__proto__":{"polluted":true},"constructor":"bad","prototype":"bad"}',
+ ) as Record<string, unknown>;
+
+ await openaiTTS({
+ text: "hello",
+ apiKey: "test-key",
+ baseUrl: "https://tts.example.com/v1",
+ model: "tts-1",
+ voice: "custom-voice",
+ speed: 1,
+ responseFormat: "mp3",
+ extraBody,
+ timeoutMs: 5_000,
+ });
+
+ const [, init] = fetchMock.mock.calls[0] ?? [];
+ if (typeof init?.body !== "string") {
+ throw new Error("expected JSON request body");
+ }
+ const body = JSON.parse(init.body) as Record<string, unknown>;
+ expect(body).toMatchObject({
+ model: "tts-1",
+ input: "hello",
+ voice: "custom-voice",
+ response_format: "mp3",
+ lang: "e",
+ speed: 1.2,
+ });
+ expect(Object.hasOwn(body, "__proto__")).toBe(false);
+ expect(Object.hasOwn(body, "constructor")).toBe(false);
+ expect(Object.hasOwn(body, "prototype")).toBe(false);
+ expect((Object.prototype as Record<string, unknown>).polluted).toBeUndefined();
+ });
+
it("omits instructions for unsupported models on the official OpenAI endpoint", async () => {
const fetchMock = vi.fn(
async (_url: string | URL, _init?: RequestInit) =>
diff --git a/extensions/openai/tts.ts b/extensions/openai/tts.ts
index 59d992e3ccc..a4b64a2a488 100644
--- a/extensions/openai/tts.ts
+++ b/extensions/openai/tts.ts
@@ -78,6 +78,17 @@ export function resolveOpenAITtsInstructions(
return model.includes("gpt-4o-mini-tts") ? next : undefined;
}
+function sanitizeExtraBodyRecord(value: Record<string, unknown>): Record<string, unknown> {
+  const sanitized: Record<string, unknown> = {};
+ for (const [key, entry] of Object.entries(value)) {
+ if (key === "__proto__" || key === "constructor" || key === "prototype") {
+ continue;
+ }
+ sanitized[key] = entry;
+ }
+ return sanitized;
+}
+
export async function openaiTTS(params: {
text: string;
apiKey: string;
@@ -87,10 +98,21 @@ export async function openaiTTS(params: {
speed?: number;
instructions?: string;
responseFormat: "mp3" | "opus" | "pcm" | "wav";
+ extraBody?: Record<string, unknown>;
timeoutMs: number;
}): Promise<Buffer> {
- const { text, apiKey, baseUrl, model, voice, speed, instructions, responseFormat, timeoutMs } =
- params;
+ const {
+ text,
+ apiKey,
+ baseUrl,
+ model,
+ voice,
+ speed,
+ instructions,
+ responseFormat,
+ extraBody,
+ timeoutMs,
+ } = params;
const effectiveInstructions = resolveOpenAITtsInstructions(model, instructions, baseUrl);
if (!isValidOpenAIModel(model, baseUrl)) {
@@ -120,6 +142,7 @@ export async function openaiTTS(params: {
response_format: responseFormat,
...(speed != null && { speed }),
...(effectiveInstructions != null && { instructions: effectiveInstructions }),
+ ...(extraBody == null ? {} : sanitizeExtraBodyRecord(extraBody)),
});
const requestUrl = `${baseUrl}/audio/speech`;
const debugProxyFetchPatchInstalled = isDebugProxyGlobalFetchPatchInstalled();
diff --git a/src/config/zod-schema.tts.test.ts b/src/config/zod-schema.tts.test.ts
index 3186462c419..7d2a2a335dc 100644
--- a/src/config/zod-schema.tts.test.ts
+++ b/src/config/zod-schema.tts.test.ts
@@ -16,6 +16,24 @@ describe("TtsConfigSchema openai speed and instructions", () => {
).not.toThrow();
});
+ it("accepts openai extraBody objects for compatible TTS endpoints", () => {
+ expect(() =>
+ TtsConfigSchema.parse({
+ providers: {
+ openai: {
+ baseUrl: "http://localhost:8880/v1",
+ model: "kokoro",
+ voice: "em_alex",
+ extraBody: {
+ lang: "e",
+ speed: 1.2,
+ },
+ },
+ },
+ }),
+ ).not.toThrow();
+ });
+
it("rejects out-of-range openai speed", () => {
expect(() =>
TtsConfigSchema.parse({