mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-06 13:10:43 +00:00
feat: support openai tts extra body
This commit is contained in:
@@ -6,6 +6,7 @@ Docs: https://docs.openclaw.ai
|
||||
|
||||
### Changes
|
||||
|
||||
- Providers/OpenAI: add `extraBody`/`extra_body` passthrough for OpenAI-compatible TTS endpoints, so custom speech servers can receive fields such as `lang` in `/audio/speech` requests. Fixes #39900. Thanks @R3NK0R.
|
||||
- Dependencies: refresh workspace dependency pins, including TypeBox 1.1.37, AWS SDK 3.1041.0, Microsoft Teams 2.0.9, and Marked 18.0.3. Thanks @mariozechner, @aws, and @microsoft.
|
||||
|
||||
### Fixes
|
||||
|
||||
@@ -479,9 +479,12 @@ Legacy `plugins.entries.openai.config.personality` is still read as a compatibil
|
||||
| Format | `messages.tts.providers.openai.responseFormat` | `opus` for voice notes, `mp3` for files |
|
||||
| API key | `messages.tts.providers.openai.apiKey` | Falls back to `OPENAI_API_KEY` |
|
||||
| Base URL | `messages.tts.providers.openai.baseUrl` | `https://api.openai.com/v1` |
|
||||
| Extra body | `messages.tts.providers.openai.extraBody` / `extra_body` | (unset) |
|
||||
|
||||
Available models: `gpt-4o-mini-tts`, `tts-1`, `tts-1-hd`. Available voices: `alloy`, `ash`, `ballad`, `cedar`, `coral`, `echo`, `fable`, `juniper`, `marin`, `onyx`, `nova`, `sage`, `shimmer`, `verse`.
|
||||
|
||||
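`extraBody` is merged into the `/audio/speech` request JSON after OpenClaw's generated fields, so an extra key with the same name as a generated field (for example `speed`) replaces the generated value. Use it for OpenAI-compatible endpoints that require additional keys such as `lang`. The keys `__proto__`, `constructor`, and `prototype` are dropped before the request is sent.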
|
||||
|
||||
```json5
|
||||
{
|
||||
messages: {
|
||||
|
||||
@@ -892,6 +892,7 @@ OpenAI and ElevenLabs output formats are fixed per channel as listed above.
|
||||
<ParamField path="model" type="string">OpenAI TTS model id (e.g. `gpt-4o-mini-tts`).</ParamField>
|
||||
<ParamField path="voice" type="string">Voice name (e.g. `alloy`, `cedar`).</ParamField>
|
||||
<ParamField path="instructions" type="string">Explicit OpenAI `instructions` field. When set, persona prompt fields are **not** auto-mapped.</ParamField>
|
||||
<ParamField path="extraBody / extra_body" type="Record<string, unknown>">Extra JSON fields merged into `/audio/speech` request bodies after the generated OpenAI TTS fields, so an extra key with the same name as a generated field (for example `speed`) replaces it. Use this for OpenAI-compatible endpoints such as Kokoro that require provider-specific keys like `lang`. The keys `__proto__`, `constructor`, and `prototype` are dropped.</ParamField>
|
||||
<ParamField path="baseUrl" type="string">
|
||||
Override the OpenAI TTS endpoint. Resolution order: config → `OPENAI_TTS_BASE_URL` → `https://api.openai.com/v1`. Non-default values are treated as OpenAI-compatible TTS endpoints, so custom model and voice names are accepted.
|
||||
</ParamField>
|
||||
|
||||
@@ -16,6 +16,7 @@ vi.mock("openclaw/plugin-sdk/ssrf-runtime", () => ({
|
||||
}));
|
||||
|
||||
function isSpeechRequestBody(value: unknown): value is {
|
||||
[key: string]: unknown;
|
||||
model?: string;
|
||||
voice?: string;
|
||||
speed?: number;
|
||||
@@ -25,6 +26,7 @@ function isSpeechRequestBody(value: unknown): value is {
|
||||
}
|
||||
|
||||
function parseRequestBody(init: RequestInit | undefined): {
|
||||
[key: string]: unknown;
|
||||
model?: string;
|
||||
voice?: string;
|
||||
speed?: number;
|
||||
@@ -73,6 +75,9 @@ describe("buildOpenAISpeechProvider", () => {
|
||||
speed: 1.25,
|
||||
instructions: " Speak warmly ",
|
||||
responseFormat: " WAV ",
|
||||
extraBody: {
|
||||
lang: "en-US",
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
@@ -86,6 +91,9 @@ describe("buildOpenAISpeechProvider", () => {
|
||||
speed: 1.25,
|
||||
instructions: "Speak warmly",
|
||||
responseFormat: "wav",
|
||||
extraBody: {
|
||||
lang: "en-US",
|
||||
},
|
||||
});
|
||||
});
|
||||
|
||||
@@ -285,4 +293,39 @@ describe("buildOpenAISpeechProvider", () => {
|
||||
expect(result.fileExtension).toBe(".wav");
|
||||
expect(result.voiceCompatible).toBe(false);
|
||||
});
|
||||
|
||||
it("passes extra_body config through to OpenAI-compatible speech requests", async () => {
  // Fields the outgoing /audio/speech JSON must carry, including the
  // passthrough `lang` key supplied via the snake_case `extra_body` alias.
  const expectedRequestFields = {
    model: "custom-tts",
    voice: "custom-voice",
    lang: "en-US",
    response_format: "mp3",
  };

  const provider = buildOpenAISpeechProvider();
  const fetchMock = vi.fn(async (_url: string, init?: RequestInit) => {
    // Inspect the request at call time, then answer with a tiny audio payload.
    expect(parseRequestBody(init)).toMatchObject(expectedRequestFields);
    return new Response(new Uint8Array([1, 2, 3]), { status: 200 });
  });
  globalThis.fetch = fetchMock as unknown as typeof fetch;

  const { outputFormat } = await provider.synthesize({
    text: "hello",
    cfg: {} as never,
    providerConfig: {
      apiKey: "sk-test",
      baseUrl: "https://proxy.example.com/openai/v1",
      model: "custom-tts",
      voice: "custom-voice",
      responseFormat: "mp3",
      extra_body: {
        lang: "en-US",
      },
    },
    target: "audio-file",
    timeoutMs: 1_000,
  });

  expect(outputFormat).toBe("mp3");
  expect(fetchMock).toHaveBeenCalledTimes(1);
});
|
||||
});
|
||||
|
||||
@@ -37,6 +37,7 @@ type OpenAITtsProviderConfig = {
|
||||
speed?: number;
|
||||
instructions?: string;
|
||||
responseFormat?: OpenAiSpeechResponseFormat;
|
||||
extraBody?: Record<string, unknown>;
|
||||
};
|
||||
|
||||
type OpenAITtsProviderOverrides = {
|
||||
@@ -96,10 +97,19 @@ function responseFormatToFileExtension(
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Reads a raw `extraBody` / `extra_body` config value.
 *
 * @param value - Untyped config value to inspect.
 * @returns The record produced by `asObjectRecord` when it is truthy and has
 *   at least one own enumerable key; otherwise `undefined`, so callers can
 *   chain alternatives with `??`.
 */
function readExtraBody(value: unknown): Record<string, unknown> | undefined {
  // NOTE(review): `asObjectRecord` is defined outside this view; presumably it
  // yields a falsy value for non-object input — confirm against its definition.
  const record = asObjectRecord(value);
  if (!record) {
    return undefined;
  }
  const hasEntries = Object.keys(record).length > 0;
  return hasEntries ? record : undefined;
}
|
||||
|
||||
function normalizeOpenAIProviderConfig(
|
||||
rawConfig: Record<string, unknown>,
|
||||
): OpenAITtsProviderConfig {
|
||||
const raw = resolveOpenAIProviderConfigRecord(rawConfig);
|
||||
const extraBody = readExtraBody(raw?.extraBody) ?? readExtraBody(raw?.extra_body);
|
||||
return {
|
||||
apiKey: normalizeResolvedSecretInputString({
|
||||
value: raw?.apiKey,
|
||||
@@ -115,6 +125,7 @@ function normalizeOpenAIProviderConfig(
|
||||
speed: asFiniteNumber(raw?.speed),
|
||||
instructions: trimToUndefined(raw?.instructions),
|
||||
responseFormat: normalizeOpenAISpeechResponseFormat(raw?.responseFormat),
|
||||
extraBody,
|
||||
};
|
||||
}
|
||||
|
||||
@@ -129,6 +140,7 @@ function readOpenAIProviderConfig(config: SpeechProviderConfig): OpenAITtsProvid
|
||||
instructions: trimToUndefined(config.instructions) ?? normalized.instructions,
|
||||
responseFormat:
|
||||
normalizeOpenAISpeechResponseFormat(config.responseFormat) ?? normalized.responseFormat,
|
||||
extraBody: readExtraBody(config.extraBody) ?? readExtraBody(config.extra_body),
|
||||
};
|
||||
}
|
||||
|
||||
@@ -298,6 +310,7 @@ export function buildOpenAISpeechProvider(): SpeechProviderPlugin {
|
||||
speed: overrides.speed ?? config.speed,
|
||||
instructions: config.instructions,
|
||||
responseFormat,
|
||||
extraBody: config.extraBody,
|
||||
timeoutMs: req.timeoutMs,
|
||||
});
|
||||
return {
|
||||
@@ -325,6 +338,7 @@ export function buildOpenAISpeechProvider(): SpeechProviderPlugin {
|
||||
speed: overrides.speed ?? config.speed,
|
||||
instructions: config.instructions,
|
||||
responseFormat: outputFormat,
|
||||
extraBody: config.extraBody,
|
||||
timeoutMs: req.timeoutMs,
|
||||
});
|
||||
return { audioBuffer, outputFormat, sampleRate };
|
||||
|
||||
@@ -169,6 +169,47 @@ describe("openai tts", () => {
|
||||
expect(body.voice).toBe("custom-voice");
|
||||
});
|
||||
|
||||
it("merges sanitized extraBody fields into TTS requests", async () => {
  const fetchMock = vi.fn(async (_url: string | URL, _init?: RequestInit) => {
    return new Response(Buffer.from("audio-bytes"), { status: 200 });
  });
  globalThis.fetch = fetchMock as unknown as typeof fetch;

  // JSON.parse is used on purpose: it creates *own* `__proto__` /
  // `constructor` / `prototype` properties, which an object literal would not.
  const extraBody = JSON.parse(
    '{"lang":"e","speed":1.2,"__proto__":{"polluted":true},"constructor":"bad","prototype":"bad"}',
  ) as Record<string, unknown>;

  await openaiTTS({
    text: "hello",
    apiKey: "test-key",
    baseUrl: "https://tts.example.com/v1",
    model: "tts-1",
    voice: "custom-voice",
    speed: 1,
    responseFormat: "mp3",
    extraBody,
    timeoutMs: 5_000,
  });

  const requestInit = fetchMock.mock.calls[0]?.[1];
  if (typeof requestInit?.body !== "string") {
    throw new Error("expected JSON request body");
  }
  const sentBody = JSON.parse(requestInit.body) as Record<string, unknown>;

  // `speed` from extraBody (1.2) wins over the generated `speed` (1).
  expect(sentBody).toMatchObject({
    model: "tts-1",
    input: "hello",
    voice: "custom-voice",
    response_format: "mp3",
    lang: "e",
    speed: 1.2,
  });

  for (const blockedKey of ["__proto__", "constructor", "prototype"]) {
    expect(Object.hasOwn(sentBody, blockedKey)).toBe(false);
  }
  expect((Object.prototype as Record<string, unknown>).polluted).toBeUndefined();
});
|
||||
|
||||
it("omits instructions for unsupported models on the official OpenAI endpoint", async () => {
|
||||
const fetchMock = vi.fn(
|
||||
async (_url: string | URL, _init?: RequestInit) =>
|
||||
|
||||
@@ -78,6 +78,17 @@ export function resolveOpenAITtsInstructions(
|
||||
return model.includes("gpt-4o-mini-tts") ? next : undefined;
|
||||
}
|
||||
|
||||
/** Keys that are never forwarded from user-supplied `extraBody` config. */
const UNSAFE_EXTRA_BODY_KEYS: ReadonlySet<string> = new Set([
  "__proto__",
  "constructor",
  "prototype",
]);

/**
 * Sanitizes one value found inside an `extraBody` record.
 *
 * Arrays are mapped element by element and plain objects are sanitized
 * recursively. Every other value (primitives, `null`, class instances such as
 * `Date`) is returned untouched so its own JSON serialization still applies.
 */
function sanitizeExtraBodyValue(value: unknown): unknown {
  if (Array.isArray(value)) {
    return value.map((item) => sanitizeExtraBodyValue(item));
  }
  if (value !== null && typeof value === "object") {
    const proto: unknown = Object.getPrototypeOf(value);
    if (proto === Object.prototype || proto === null) {
      // Narrowed by the runtime checks above: a non-null plain object.
      return sanitizeExtraBodyRecord(value as Record<string, unknown>);
    }
  }
  return value;
}

/**
 * Returns a copy of `value` without the keys `__proto__`, `constructor`, and
 * `prototype`.
 *
 * The filter is applied at every depth (through nested plain objects and
 * arrays), not only to the top-level keys, so nested payloads cannot carry
 * those keys to the speech endpoint either. The input is never mutated.
 *
 * @param value - Extra request fields taken from provider config.
 * @returns A new record that is safe to spread into the request body.
 */
function sanitizeExtraBodyRecord(value: Record<string, unknown>): Record<string, unknown> {
  const sanitized: Record<string, unknown> = {};
  for (const [key, entry] of Object.entries(value)) {
    if (UNSAFE_EXTRA_BODY_KEYS.has(key)) {
      continue;
    }
    sanitized[key] = sanitizeExtraBodyValue(entry);
  }
  return sanitized;
}
|
||||
|
||||
export async function openaiTTS(params: {
|
||||
text: string;
|
||||
apiKey: string;
|
||||
@@ -87,10 +98,21 @@ export async function openaiTTS(params: {
|
||||
speed?: number;
|
||||
instructions?: string;
|
||||
responseFormat: "mp3" | "opus" | "pcm" | "wav";
|
||||
extraBody?: Record<string, unknown>;
|
||||
timeoutMs: number;
|
||||
}): Promise<Buffer> {
|
||||
const { text, apiKey, baseUrl, model, voice, speed, instructions, responseFormat, timeoutMs } =
|
||||
params;
|
||||
const {
|
||||
text,
|
||||
apiKey,
|
||||
baseUrl,
|
||||
model,
|
||||
voice,
|
||||
speed,
|
||||
instructions,
|
||||
responseFormat,
|
||||
extraBody,
|
||||
timeoutMs,
|
||||
} = params;
|
||||
const effectiveInstructions = resolveOpenAITtsInstructions(model, instructions, baseUrl);
|
||||
|
||||
if (!isValidOpenAIModel(model, baseUrl)) {
|
||||
@@ -120,6 +142,7 @@ export async function openaiTTS(params: {
|
||||
response_format: responseFormat,
|
||||
...(speed != null && { speed }),
|
||||
...(effectiveInstructions != null && { instructions: effectiveInstructions }),
|
||||
...(extraBody == null ? {} : sanitizeExtraBodyRecord(extraBody)),
|
||||
});
|
||||
const requestUrl = `${baseUrl}/audio/speech`;
|
||||
const debugProxyFetchPatchInstalled = isDebugProxyGlobalFetchPatchInstalled();
|
||||
|
||||
@@ -16,6 +16,24 @@ describe("TtsConfigSchema openai speed and instructions", () => {
|
||||
).not.toThrow();
|
||||
});
|
||||
|
||||
it("accepts openai extraBody objects for compatible TTS endpoints", () => {
  // Config for a local OpenAI-compatible speech server with passthrough fields.
  const ttsConfig = {
    providers: {
      openai: {
        baseUrl: "http://localhost:8880/v1",
        model: "kokoro",
        voice: "em_alex",
        extraBody: {
          lang: "e",
          speed: 1.2,
        },
      },
    },
  };

  const parseConfig = () => TtsConfigSchema.parse(ttsConfig);

  expect(parseConfig).not.toThrow();
});
|
||||
|
||||
it("rejects out-of-range openai speed", () => {
|
||||
expect(() =>
|
||||
TtsConfigSchema.parse({
|
||||
|
||||
Reference in New Issue
Block a user