feat: support openai tts extra body

Peter Steinberger
2026-05-01 22:57:27 +01:00
parent 11a268819e
commit 5e3265b09b
8 changed files with 146 additions and 2 deletions

View File

@@ -6,6 +6,7 @@ Docs: https://docs.openclaw.ai
### Changes
- Providers/OpenAI: add `extraBody`/`extra_body` passthrough for OpenAI-compatible TTS endpoints, so custom speech servers can receive fields such as `lang` in `/audio/speech` requests. Fixes #39900. Thanks @R3NK0R.
- Dependencies: refresh workspace dependency pins, including TypeBox 1.1.37, AWS SDK 3.1041.0, Microsoft Teams 2.0.9, and Marked 18.0.3. Thanks @mariozechner, @aws, and @microsoft.
### Fixes

View File

@@ -479,9 +479,12 @@ Legacy `plugins.entries.openai.config.personality` is still read as a compatibil
| Format | `messages.tts.providers.openai.responseFormat` | `opus` for voice notes, `mp3` for files |
| API key | `messages.tts.providers.openai.apiKey` | Falls back to `OPENAI_API_KEY` |
| Base URL | `messages.tts.providers.openai.baseUrl` | `https://api.openai.com/v1` |
| Extra body | `messages.tts.providers.openai.extraBody` / `extra_body` | (unset) |
Available models: `gpt-4o-mini-tts`, `tts-1`, `tts-1-hd`. Available voices: `alloy`, `ash`, `ballad`, `cedar`, `coral`, `echo`, `fable`, `juniper`, `marin`, `onyx`, `nova`, `sage`, `shimmer`, `verse`.
`extraBody` is merged into the `/audio/speech` request JSON after OpenClaw's generated fields, so its keys override the generated ones; use it for OpenAI-compatible endpoints that require additional keys such as `lang`. Prototype-polluting keys (`__proto__`, `constructor`, `prototype`) are stripped before the merge.
```json5
{
  messages: {
    tts: { providers: { openai: { extraBody: { lang: "en-US" } } } },
  },
}
```

View File

@@ -892,6 +892,7 @@ OpenAI and ElevenLabs output formats are fixed per channel as listed above.
<ParamField path="model" type="string">OpenAI TTS model id (e.g. `gpt-4o-mini-tts`).</ParamField>
<ParamField path="voice" type="string">Voice name (e.g. `alloy`, `cedar`).</ParamField>
<ParamField path="instructions" type="string">Explicit OpenAI `instructions` field. When set, persona prompt fields are **not** auto-mapped.</ParamField>
<ParamField path="extraBody / extra_body" type="Record<string, unknown>">Extra JSON fields merged into `/audio/speech` request bodies after generated OpenAI TTS fields. Use this for OpenAI-compatible endpoints such as Kokoro that require provider-specific keys like `lang`; unsafe prototype keys are ignored.</ParamField>
<ParamField path="baseUrl" type="string">
Override the OpenAI TTS endpoint. Resolution order: config → `OPENAI_TTS_BASE_URL` → `https://api.openai.com/v1`. Non-default values are treated as OpenAI-compatible TTS endpoints, so custom model and voice names are accepted.
</ParamField>
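
A minimal config sketch for an OpenAI-compatible server (the `baseUrl`, `model`, and `voice` values mirror the Kokoro-style schema test in this commit; adjust them for your endpoint):

```json5
{
  messages: {
    tts: {
      providers: {
        openai: {
          baseUrl: "http://localhost:8880/v1", // local Kokoro-compatible endpoint
          model: "kokoro",
          voice: "em_alex",
          extraBody: { lang: "e", speed: 1.2 }, // merged last; overrides generated fields
        },
      },
    },
  },
}
```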

View File

@@ -16,6 +16,7 @@ vi.mock("openclaw/plugin-sdk/ssrf-runtime", () => ({
}));
function isSpeechRequestBody(value: unknown): value is {
[key: string]: unknown;
model?: string;
voice?: string;
speed?: number;
@@ -25,6 +26,7 @@ function isSpeechRequestBody(value: unknown): value is {
}
function parseRequestBody(init: RequestInit | undefined): {
[key: string]: unknown;
model?: string;
voice?: string;
speed?: number;
@@ -73,6 +75,9 @@ describe("buildOpenAISpeechProvider", () => {
speed: 1.25,
instructions: " Speak warmly ",
responseFormat: " WAV ",
extraBody: {
lang: "en-US",
},
},
},
},
@@ -86,6 +91,9 @@ describe("buildOpenAISpeechProvider", () => {
speed: 1.25,
instructions: "Speak warmly",
responseFormat: "wav",
extraBody: {
lang: "en-US",
},
});
});
@@ -285,4 +293,39 @@ describe("buildOpenAISpeechProvider", () => {
expect(result.fileExtension).toBe(".wav");
expect(result.voiceCompatible).toBe(false);
});
it("passes extra_body config through to OpenAI-compatible speech requests", async () => {
const provider = buildOpenAISpeechProvider();
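// Assert inside the mock so the merged request body is verified at call time.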
const fetchMock = vi.fn(async (_url: string, init?: RequestInit) => {
const body = parseRequestBody(init);
expect(body).toMatchObject({
model: "custom-tts",
voice: "custom-voice",
lang: "en-US",
response_format: "mp3",
});
return new Response(new Uint8Array([1, 2, 3]), { status: 200 });
});
globalThis.fetch = fetchMock as unknown as typeof fetch;
const result = await provider.synthesize({
text: "hello",
cfg: {} as never,
providerConfig: {
apiKey: "sk-test",
baseUrl: "https://proxy.example.com/openai/v1",
model: "custom-tts",
voice: "custom-voice",
responseFormat: "mp3",
extra_body: {
lang: "en-US",
},
},
target: "audio-file",
timeoutMs: 1_000,
});
expect(result.outputFormat).toBe("mp3");
expect(fetchMock).toHaveBeenCalledTimes(1);
});
});

View File

@@ -37,6 +37,7 @@ type OpenAITtsProviderConfig = {
speed?: number;
instructions?: string;
responseFormat?: OpenAiSpeechResponseFormat;
extraBody?: Record<string, unknown>;
};
type OpenAITtsProviderOverrides = {
@@ -96,10 +97,19 @@ function responseFormatToFileExtension(
}
}
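// Treat missing, non-object, or empty values as unset so empty extraBody configs are ignored.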
function readExtraBody(value: unknown): Record<string, unknown> | undefined {
const body = asObjectRecord(value);
if (!body || Object.keys(body).length === 0) {
return undefined;
}
return body;
}
function normalizeOpenAIProviderConfig(
rawConfig: Record<string, unknown>,
): OpenAITtsProviderConfig {
const raw = resolveOpenAIProviderConfigRecord(rawConfig);
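// Accept both camelCase `extraBody` and snake_case `extra_body`; camelCase wins when both are set.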
const extraBody = readExtraBody(raw?.extraBody) ?? readExtraBody(raw?.extra_body);
return {
apiKey: normalizeResolvedSecretInputString({
value: raw?.apiKey,
@@ -115,6 +125,7 @@ function normalizeOpenAIProviderConfig(
speed: asFiniteNumber(raw?.speed),
instructions: trimToUndefined(raw?.instructions),
responseFormat: normalizeOpenAISpeechResponseFormat(raw?.responseFormat),
extraBody,
};
}
@@ -129,6 +140,7 @@ function readOpenAIProviderConfig(config: SpeechProviderConfig): OpenAITtsProviderConfig {
instructions: trimToUndefined(config.instructions) ?? normalized.instructions,
responseFormat:
normalizeOpenAISpeechResponseFormat(config.responseFormat) ?? normalized.responseFormat,
extraBody: readExtraBody(config.extraBody) ?? readExtraBody(config.extra_body),
};
}
@@ -298,6 +310,7 @@ export function buildOpenAISpeechProvider(): SpeechProviderPlugin {
speed: overrides.speed ?? config.speed,
instructions: config.instructions,
responseFormat,
extraBody: config.extraBody,
timeoutMs: req.timeoutMs,
});
return {
@@ -325,6 +338,7 @@ export function buildOpenAISpeechProvider(): SpeechProviderPlugin {
speed: overrides.speed ?? config.speed,
instructions: config.instructions,
responseFormat: outputFormat,
extraBody: config.extraBody,
timeoutMs: req.timeoutMs,
});
return { audioBuffer, outputFormat, sampleRate };

View File

@@ -169,6 +169,47 @@ describe("openai tts", () => {
expect(body.voice).toBe("custom-voice");
});
it("merges sanitized extraBody fields into TTS requests", async () => {
const fetchMock = vi.fn(
async (_url: string | URL, _init?: RequestInit) =>
new Response(Buffer.from("audio-bytes"), { status: 200 }),
);
globalThis.fetch = fetchMock as unknown as typeof fetch;
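// JSON.parse creates an own "__proto__" property; an object literal would set the prototype instead.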
const extraBody = JSON.parse(
'{"lang":"e","speed":1.2,"__proto__":{"polluted":true},"constructor":"bad","prototype":"bad"}',
) as Record<string, unknown>;
await openaiTTS({
text: "hello",
apiKey: "test-key",
baseUrl: "https://tts.example.com/v1",
model: "tts-1",
voice: "custom-voice",
speed: 1,
responseFormat: "mp3",
extraBody,
timeoutMs: 5_000,
});
const [, init] = fetchMock.mock.calls[0] ?? [];
if (typeof init?.body !== "string") {
throw new Error("expected JSON request body");
}
const body = JSON.parse(init.body) as Record<string, unknown>;
expect(body).toMatchObject({
model: "tts-1",
input: "hello",
voice: "custom-voice",
response_format: "mp3",
lang: "e",
speed: 1.2,
});
expect(Object.hasOwn(body, "__proto__")).toBe(false);
expect(Object.hasOwn(body, "constructor")).toBe(false);
expect(Object.hasOwn(body, "prototype")).toBe(false);
expect((Object.prototype as Record<string, unknown>).polluted).toBeUndefined();
});
it("omits instructions for unsupported models on the official OpenAI endpoint", async () => {
const fetchMock = vi.fn(
async (_url: string | URL, _init?: RequestInit) =>

View File

@@ -78,6 +78,17 @@ export function resolveOpenAITtsInstructions(
return model.includes("gpt-4o-mini-tts") ? next : undefined;
}
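// Drop keys that could pollute Object.prototype before user-supplied extraBody is merged into the request.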
function sanitizeExtraBodyRecord(value: Record<string, unknown>): Record<string, unknown> {
const sanitized: Record<string, unknown> = {};
for (const [key, entry] of Object.entries(value)) {
if (key === "__proto__" || key === "constructor" || key === "prototype") {
continue;
}
sanitized[key] = entry;
}
return sanitized;
}
export async function openaiTTS(params: {
text: string;
apiKey: string;
@@ -87,10 +98,21 @@ export async function openaiTTS(params: {
speed?: number;
instructions?: string;
responseFormat: "mp3" | "opus" | "pcm" | "wav";
extraBody?: Record<string, unknown>;
timeoutMs: number;
}): Promise<Buffer> {
const {
text,
apiKey,
baseUrl,
model,
voice,
speed,
instructions,
responseFormat,
extraBody,
timeoutMs,
} = params;
const effectiveInstructions = resolveOpenAITtsInstructions(model, instructions, baseUrl);
if (!isValidOpenAIModel(model, baseUrl)) {
@@ -120,6 +142,7 @@ export async function openaiTTS(params: {
response_format: responseFormat,
...(speed != null && { speed }),
...(effectiveInstructions != null && { instructions: effectiveInstructions }),
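// Spread last so sanitized extraBody keys override the generated fields above (e.g. a custom `speed`).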
...(extraBody == null ? {} : sanitizeExtraBodyRecord(extraBody)),
});
const requestUrl = `${baseUrl}/audio/speech`;
const debugProxyFetchPatchInstalled = isDebugProxyGlobalFetchPatchInstalled();

View File

@@ -16,6 +16,24 @@ describe("TtsConfigSchema openai speed and instructions", () => {
).not.toThrow();
});
it("accepts openai extraBody objects for compatible TTS endpoints", () => {
expect(() =>
TtsConfigSchema.parse({
providers: {
openai: {
baseUrl: "http://localhost:8880/v1",
model: "kokoro",
voice: "em_alex",
extraBody: {
lang: "e",
speed: 1.2,
},
},
},
}),
).not.toThrow();
});
it("rejects out-of-range openai speed", () => {
expect(() =>
TtsConfigSchema.parse({