diff --git a/CHANGELOG.md b/CHANGELOG.md index 3ecd4617a6c..6cc4cf2473c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,7 @@ Docs: https://docs.openclaw.ai ### Changes +- Providers/OpenAI: add `extraBody`/`extra_body` passthrough for OpenAI-compatible TTS endpoints, so custom speech servers can receive fields such as `lang` in `/audio/speech` requests. Fixes #39900. Thanks @R3NK0R. - Dependencies: refresh workspace dependency pins, including TypeBox 1.1.37, AWS SDK 3.1041.0, Microsoft Teams 2.0.9, and Marked 18.0.3. Thanks @mariozechner, @aws, and @microsoft. ### Fixes diff --git a/docs/providers/openai.md b/docs/providers/openai.md index d787acce130..f0b5b188f22 100644 --- a/docs/providers/openai.md +++ b/docs/providers/openai.md @@ -479,9 +479,12 @@ Legacy `plugins.entries.openai.config.personality` is still read as a compatibil | Format | `messages.tts.providers.openai.responseFormat` | `opus` for voice notes, `mp3` for files | | API key | `messages.tts.providers.openai.apiKey` | Falls back to `OPENAI_API_KEY` | | Base URL | `messages.tts.providers.openai.baseUrl` | `https://api.openai.com/v1` | + | Extra body | `messages.tts.providers.openai.extraBody` / `extra_body` | (unset) | Available models: `gpt-4o-mini-tts`, `tts-1`, `tts-1-hd`. Available voices: `alloy`, `ash`, `ballad`, `cedar`, `coral`, `echo`, `fable`, `juniper`, `marin`, `onyx`, `nova`, `sage`, `shimmer`, `verse`. + `extraBody` is merged into `/audio/speech` request JSON after OpenClaw's generated fields, so use it for OpenAI-compatible endpoints that require additional keys such as `lang`. Prototype keys are ignored. + ```json5 { messages: { diff --git a/docs/tools/tts.md b/docs/tools/tts.md index ab600b4753d..8231921767e 100644 --- a/docs/tools/tts.md +++ b/docs/tools/tts.md @@ -892,6 +892,7 @@ OpenAI and ElevenLabs output formats are fixed per channel as listed above. OpenAI TTS model id (e.g. `gpt-4o-mini-tts`). Voice name (e.g. `alloy`, `cedar`). Explicit OpenAI `instructions` field. 
When set, persona prompt fields are **not** auto-mapped. + Extra JSON fields merged into `/audio/speech` request bodies after generated OpenAI TTS fields. Use this for OpenAI-compatible endpoints such as Kokoro that require provider-specific keys like `lang`; unsafe prototype keys are ignored. Override the OpenAI TTS endpoint. Resolution order: config → `OPENAI_TTS_BASE_URL` → `https://api.openai.com/v1`. Non-default values are treated as OpenAI-compatible TTS endpoints, so custom model and voice names are accepted. diff --git a/extensions/openai/speech-provider.test.ts b/extensions/openai/speech-provider.test.ts index b3b7492eaac..7d9ee46eea6 100644 --- a/extensions/openai/speech-provider.test.ts +++ b/extensions/openai/speech-provider.test.ts @@ -16,6 +16,7 @@ vi.mock("openclaw/plugin-sdk/ssrf-runtime", () => ({ })); function isSpeechRequestBody(value: unknown): value is { + [key: string]: unknown; model?: string; voice?: string; speed?: number; @@ -25,6 +26,7 @@ function isSpeechRequestBody(value: unknown): value is { } function parseRequestBody(init: RequestInit | undefined): { + [key: string]: unknown; model?: string; voice?: string; speed?: number; @@ -73,6 +75,9 @@ describe("buildOpenAISpeechProvider", () => { speed: 1.25, instructions: " Speak warmly ", responseFormat: " WAV ", + extraBody: { + lang: "en-US", + }, }, }, }, @@ -86,6 +91,9 @@ describe("buildOpenAISpeechProvider", () => { speed: 1.25, instructions: "Speak warmly", responseFormat: "wav", + extraBody: { + lang: "en-US", + }, }); }); @@ -285,4 +293,39 @@ describe("buildOpenAISpeechProvider", () => { expect(result.fileExtension).toBe(".wav"); expect(result.voiceCompatible).toBe(false); }); + + it("passes extra_body config through to OpenAI-compatible speech requests", async () => { + const provider = buildOpenAISpeechProvider(); + const fetchMock = vi.fn(async (_url: string, init?: RequestInit) => { + const body = parseRequestBody(init); + expect(body).toMatchObject({ + model: "custom-tts", + 
voice: "custom-voice", + lang: "en-US", + response_format: "mp3", + }); + return new Response(new Uint8Array([1, 2, 3]), { status: 200 }); + }); + globalThis.fetch = fetchMock as unknown as typeof fetch; + + const result = await provider.synthesize({ + text: "hello", + cfg: {} as never, + providerConfig: { + apiKey: "sk-test", + baseUrl: "https://proxy.example.com/openai/v1", + model: "custom-tts", + voice: "custom-voice", + responseFormat: "mp3", + extra_body: { + lang: "en-US", + }, + }, + target: "audio-file", + timeoutMs: 1_000, + }); + + expect(result.outputFormat).toBe("mp3"); + expect(fetchMock).toHaveBeenCalledTimes(1); + }); }); diff --git a/extensions/openai/speech-provider.ts b/extensions/openai/speech-provider.ts index 043fd494828..c39e687dfe1 100644 --- a/extensions/openai/speech-provider.ts +++ b/extensions/openai/speech-provider.ts @@ -37,6 +37,7 @@ type OpenAITtsProviderConfig = { speed?: number; instructions?: string; responseFormat?: OpenAiSpeechResponseFormat; + extraBody?: Record<string, unknown>; }; type OpenAITtsProviderOverrides = { @@ -96,10 +97,19 @@ function responseFormatToFileExtension( } } +function readExtraBody(value: unknown): Record<string, unknown> | undefined { + const body = asObjectRecord(value); + if (!body || Object.keys(body).length === 0) { + return undefined; + } + return body; +} + function normalizeOpenAIProviderConfig( rawConfig: Record<string, unknown>, ): OpenAITtsProviderConfig { const raw = resolveOpenAIProviderConfigRecord(rawConfig); + const extraBody = readExtraBody(raw?.extraBody) ?? 
readExtraBody(raw?.extra_body); return { apiKey: normalizeResolvedSecretInputString({ value: raw?.apiKey, @@ -115,6 +125,7 @@ function normalizeOpenAIProviderConfig( speed: asFiniteNumber(raw?.speed), instructions: trimToUndefined(raw?.instructions), responseFormat: normalizeOpenAISpeechResponseFormat(raw?.responseFormat), + extraBody, }; } @@ -129,6 +140,7 @@ function readOpenAIProviderConfig(config: SpeechProviderConfig): OpenAITtsProvid instructions: trimToUndefined(config.instructions) ?? normalized.instructions, responseFormat: normalizeOpenAISpeechResponseFormat(config.responseFormat) ?? normalized.responseFormat, + extraBody: readExtraBody(config.extraBody) ?? readExtraBody(config.extra_body), }; } @@ -298,6 +310,7 @@ export function buildOpenAISpeechProvider(): SpeechProviderPlugin { speed: overrides.speed ?? config.speed, instructions: config.instructions, responseFormat, + extraBody: config.extraBody, timeoutMs: req.timeoutMs, }); return { @@ -325,6 +338,7 @@ export function buildOpenAISpeechProvider(): SpeechProviderPlugin { speed: overrides.speed ?? 
config.speed, instructions: config.instructions, responseFormat: outputFormat, + extraBody: config.extraBody, timeoutMs: req.timeoutMs, }); return { audioBuffer, outputFormat, sampleRate }; diff --git a/extensions/openai/tts.test.ts b/extensions/openai/tts.test.ts index e11d56dbeea..343503879a5 100644 --- a/extensions/openai/tts.test.ts +++ b/extensions/openai/tts.test.ts @@ -169,6 +169,47 @@ describe("openai tts", () => { expect(body.voice).toBe("custom-voice"); }); + it("merges sanitized extraBody fields into TTS requests", async () => { + const fetchMock = vi.fn( + async (_url: string | URL, _init?: RequestInit) => + new Response(Buffer.from("audio-bytes"), { status: 200 }), + ); + globalThis.fetch = fetchMock as unknown as typeof fetch; + const extraBody = JSON.parse( + '{"lang":"e","speed":1.2,"__proto__":{"polluted":true},"constructor":"bad","prototype":"bad"}', + ) as Record<string, unknown>; + + await openaiTTS({ + text: "hello", + apiKey: "test-key", + baseUrl: "https://tts.example.com/v1", + model: "tts-1", + voice: "custom-voice", + speed: 1, + responseFormat: "mp3", + extraBody, + timeoutMs: 5_000, + }); + + const [, init] = fetchMock.mock.calls[0] ?? 
[]; + if (typeof init?.body !== "string") { + throw new Error("expected JSON request body"); + } + const body = JSON.parse(init.body) as Record<string, unknown>; + expect(body).toMatchObject({ + model: "tts-1", + input: "hello", + voice: "custom-voice", + response_format: "mp3", + lang: "e", + speed: 1.2, + }); + expect(Object.hasOwn(body, "__proto__")).toBe(false); + expect(Object.hasOwn(body, "constructor")).toBe(false); + expect(Object.hasOwn(body, "prototype")).toBe(false); + expect((Object.prototype as Record<string, unknown>).polluted).toBeUndefined(); + }); + it("omits instructions for unsupported models on the official OpenAI endpoint", async () => { const fetchMock = vi.fn( async (_url: string | URL, _init?: RequestInit) => diff --git a/extensions/openai/tts.ts b/extensions/openai/tts.ts index 59d992e3ccc..a4b64a2a488 100644 --- a/extensions/openai/tts.ts +++ b/extensions/openai/tts.ts @@ -78,6 +78,17 @@ export function resolveOpenAITtsInstructions( return model.includes("gpt-4o-mini-tts") ? next : undefined; } +function sanitizeExtraBodyRecord(value: Record<string, unknown>): Record<string, unknown> { + const sanitized: Record<string, unknown> = {}; + for (const [key, entry] of Object.entries(value)) { + if (key === "__proto__" || key === "constructor" || key === "prototype") { + continue; + } + sanitized[key] = entry; + } + return sanitized; +} + export async function openaiTTS(params: { text: string; apiKey: string; @@ -87,10 +98,21 @@ export async function openaiTTS(params: { speed?: number; instructions?: string; responseFormat: "mp3" | "opus" | "pcm" | "wav"; + extraBody?: Record<string, unknown>; timeoutMs: number; }): Promise { - const { text, apiKey, baseUrl, model, voice, speed, instructions, responseFormat, timeoutMs } = - params; + const { + text, + apiKey, + baseUrl, + model, + voice, + speed, + instructions, + responseFormat, + extraBody, + timeoutMs, + } = params; const effectiveInstructions = resolveOpenAITtsInstructions(model, instructions, baseUrl); if (!isValidOpenAIModel(model, baseUrl)) { @@ -120,6 +142,7 @@ export async function 
openaiTTS(params: { response_format: responseFormat, ...(speed != null && { speed }), ...(effectiveInstructions != null && { instructions: effectiveInstructions }), + ...(extraBody == null ? {} : sanitizeExtraBodyRecord(extraBody)), }); const requestUrl = `${baseUrl}/audio/speech`; const debugProxyFetchPatchInstalled = isDebugProxyGlobalFetchPatchInstalled(); diff --git a/src/config/zod-schema.tts.test.ts b/src/config/zod-schema.tts.test.ts index 3186462c419..7d2a2a335dc 100644 --- a/src/config/zod-schema.tts.test.ts +++ b/src/config/zod-schema.tts.test.ts @@ -16,6 +16,24 @@ describe("TtsConfigSchema openai speed and instructions", () => { ).not.toThrow(); }); + it("accepts openai extraBody objects for compatible TTS endpoints", () => { + expect(() => + TtsConfigSchema.parse({ + providers: { + openai: { + baseUrl: "http://localhost:8880/v1", + model: "kokoro", + voice: "em_alex", + extraBody: { + lang: "e", + speed: 1.2, + }, + }, + }, + }), + ).not.toThrow(); + }); + it("rejects out-of-range openai speed", () => { expect(() => TtsConfigSchema.parse({