diff --git a/CHANGELOG.md b/CHANGELOG.md index c56930d9189..72cfc2f94a0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -535,6 +535,7 @@ Docs: https://docs.openclaw.ai - Browser/config schema: accept `browser.profiles.*.driver: "openclaw"` while preserving legacy `"clawd"` compatibility in validated config. (#39374; based on #35621) Thanks @gambletan and @ingyukoh. - Memory flush/bootstrap file protection: restrict memory-flush runs to append-only `read`/`write` tools and route host-side memory appends through root-enforced safe file handles so flush turns cannot overwrite bootstrap files via `exec` or unsafe raw rewrites. (#38574) Thanks @frankekn. - Mattermost/DM media uploads: resolve bare 26-character Mattermost IDs user-first for direct messages so media sends no longer fail with `403 Forbidden` when targets are configured as unprefixed user IDs. (#29925) Thanks @teconomix. +- Voice-call/OpenAI TTS config parity: add missing `speed`, `instructions`, and `baseUrl` fields to the OpenAI TTS config schema and gate `instructions` to supported models so voice-call overrides validate and route cleanly through core TTS. (#39226) Thanks @ademczuk. 
## 2026.3.2 diff --git a/extensions/voice-call/openclaw.plugin.json b/extensions/voice-call/openclaw.plugin.json index d9a904c73eb..fef3ccc6ad9 100644 --- a/extensions/voice-call/openclaw.plugin.json +++ b/extensions/voice-call/openclaw.plugin.json @@ -522,11 +522,22 @@ "apiKey": { "type": "string" }, + "baseUrl": { + "type": "string" + }, "model": { "type": "string" }, "voice": { "type": "string" + }, + "speed": { + "type": "number", + "minimum": 0.25, + "maximum": 4.0 + }, + "instructions": { + "type": "string" } } }, diff --git a/extensions/voice-call/src/providers/tts-openai.ts b/extensions/voice-call/src/providers/tts-openai.ts index a27030b4578..0a7c74d90ac 100644 --- a/extensions/voice-call/src/providers/tts-openai.ts +++ b/extensions/voice-call/src/providers/tts-openai.ts @@ -1,3 +1,4 @@ +import { resolveOpenAITtsInstructions } from "openclaw/plugin-sdk/voice-call"; import { pcmToMulaw } from "../telephony-audio.js"; /** @@ -110,9 +111,11 @@ export class OpenAITTSProvider { speed: this.speed, }; - // Add instructions if using gpt-4o-mini-tts model - const effectiveInstructions = trimToUndefined(instructions) ?? this.instructions; - if (effectiveInstructions && this.model.includes("gpt-4o-mini-tts")) { + const effectiveInstructions = resolveOpenAITtsInstructions( + this.model, + trimToUndefined(instructions) ?? 
this.instructions, + ); + if (effectiveInstructions) { body.instructions = effectiveInstructions; } diff --git a/src/config/config.plugin-validation.test.ts b/src/config/config.plugin-validation.test.ts index 02eab6789ea..99438a13e16 100644 --- a/src/config/config.plugin-validation.test.ts +++ b/src/config/config.plugin-validation.test.ts @@ -279,6 +279,31 @@ describe("config plugin validation", () => { expect(res.ok).toBe(true); }); + it("accepts voice-call OpenAI TTS speed, instructions, and baseUrl config fields", async () => { + const res = validateInSuite({ + agents: { list: [{ id: "pi" }] }, + plugins: { + enabled: true, + load: { paths: [voiceCallSchemaPluginDir] }, + entries: { + "voice-call-schema-fixture": { + config: { + tts: { + openai: { + baseUrl: "http://localhost:8880/v1", + voice: "alloy", + speed: 1.5, + instructions: "Speak in a cheerful tone", + }, + }, + }, + }, + }, + }, + }); + expect(res.ok).toBe(true); + }); + it("accepts known plugin ids and valid channel/heartbeat enums", async () => { const res = validateInSuite({ agents: { diff --git a/src/config/types.tts.ts b/src/config/types.tts.ts index 3d898ff9c57..a6232f9de5a 100644 --- a/src/config/types.tts.ts +++ b/src/config/types.tts.ts @@ -61,6 +61,10 @@ export type TtsConfig = { baseUrl?: string; model?: string; voice?: string; + /** Playback speed (0.25–4.0, default 1.0). */ + speed?: number; + /** System-level instructions for the TTS model (gpt-4o-mini-tts only). */ + instructions?: string; }; /** Microsoft Edge (node-edge-tts) configuration. 
*/ edge?: { diff --git a/src/config/zod-schema.core.ts b/src/config/zod-schema.core.ts index 066a33f0f4f..305efab4b26 100644 --- a/src/config/zod-schema.core.ts +++ b/src/config/zod-schema.core.ts @@ -404,6 +404,8 @@ export const TtsConfigSchema = z baseUrl: z.string().optional(), model: z.string().optional(), voice: z.string().optional(), + speed: z.number().min(0.25).max(4).optional(), + instructions: z.string().optional(), }) .strict() .optional(), diff --git a/src/config/zod-schema.tts.test.ts b/src/config/zod-schema.tts.test.ts new file mode 100644 index 00000000000..70398e81054 --- /dev/null +++ b/src/config/zod-schema.tts.test.ts @@ -0,0 +1,36 @@ +import { describe, expect, it } from "vitest"; +import { TtsConfigSchema } from "./zod-schema.core.js"; + +describe("TtsConfigSchema openai speed and instructions", () => { + it("accepts speed and instructions in openai section", () => { + expect(() => + TtsConfigSchema.parse({ + openai: { + voice: "alloy", + speed: 1.5, + instructions: "Speak in a cheerful tone", + }, + }), + ).not.toThrow(); + }); + + it("rejects out-of-range openai speed", () => { + expect(() => + TtsConfigSchema.parse({ + openai: { + speed: 5.0, + }, + }), + ).toThrow(); + }); + + it("rejects openai speed below minimum", () => { + expect(() => + TtsConfigSchema.parse({ + openai: { + speed: 0.1, + }, + }), + ).toThrow(); + }); +}); diff --git a/src/plugin-sdk/voice-call.ts b/src/plugin-sdk/voice-call.ts index da8a1f12613..c50b979a145 100644 --- a/src/plugin-sdk/voice-call.ts +++ b/src/plugin-sdk/voice-call.ts @@ -7,6 +7,7 @@ export { TtsModeSchema, TtsProviderSchema, } from "../config/zod-schema.core.js"; +export { resolveOpenAITtsInstructions } from "../tts/tts-core.js"; export type { GatewayRequestHandlerOptions } from "../gateway/server-methods/types.js"; export { isRequestBodyLimitError, diff --git a/src/tts/tts-core.ts b/src/tts/tts-core.ts index 08f80c3d60c..279fc3cc1ed 100644 --- a/src/tts/tts-core.ts +++ b/src/tts/tts-core.ts @@ -43,6 
+43,11 @@ function normalizeOpenAITtsBaseUrl(baseUrl?: string): string { return trimmed.replace(/\/+$/, ""); } +function trimToUndefined(value?: string): string | undefined { + const trimmed = value?.trim(); + return trimmed ? trimmed : undefined; +} + function requireInRange(value: number, min: number, max: number, label: string): void { if (!Number.isFinite(value) || value < min || value > max) { throw new Error(`${label} must be between ${min} and ${max}`); @@ -383,6 +388,14 @@ export function isValidOpenAIModel(model: string, baseUrl?: string): boolean { return OPENAI_TTS_MODELS.includes(model as (typeof OPENAI_TTS_MODELS)[number]); } +export function resolveOpenAITtsInstructions( + model: string, + instructions?: string, +): string | undefined { + const next = trimToUndefined(instructions); + return next && model.includes("gpt-4o-mini-tts") ? next : undefined; +} + export function isValidOpenAIVoice(voice: string, baseUrl?: string): voice is OpenAiTtsVoice { // Allow any voice when using custom endpoint (e.g., Kokoro Chinese voices) if (isCustomOpenAIEndpoint(baseUrl)) { @@ -619,10 +632,14 @@ export async function openaiTTS(params: { baseUrl: string; model: string; voice: string; + speed?: number; + instructions?: string; responseFormat: "mp3" | "opus" | "pcm"; timeoutMs: number; }): Promise<Buffer> { - const { text, apiKey, baseUrl, model, voice, responseFormat, timeoutMs } = params; + const { text, apiKey, baseUrl, model, voice, speed, instructions, responseFormat, timeoutMs } = + params; + const effectiveInstructions = resolveOpenAITtsInstructions(model, instructions); if (!isValidOpenAIModel(model, baseUrl)) { throw new Error(`Invalid model: ${model}`); @@ -646,6 +663,8 @@ export async function openaiTTS(params: { input: text, voice, response_format: responseFormat, + ...(speed != null && { speed }), + ...(effectiveInstructions != null && { instructions: effectiveInstructions }), }), signal: controller.signal, }); diff --git a/src/tts/tts.test.ts
b/src/tts/tts.test.ts index f3b5d8ce0ee..642e403ec7b 100644 --- a/src/tts/tts.test.ts +++ b/src/tts/tts.test.ts @@ -57,6 +57,7 @@ const { OPENAI_TTS_MODELS, OPENAI_TTS_VOICES, parseTtsDirectives, + resolveOpenAITtsInstructions, resolveModelOverridePolicy, summarizeText, resolveOutputFormat, @@ -169,6 +170,20 @@ describe("tts", () => { }); }); + + describe("resolveOpenAITtsInstructions", () => { + it("keeps instructions only for gpt-4o-mini-tts variants", () => { + expect(resolveOpenAITtsInstructions("gpt-4o-mini-tts", " Speak warmly ")).toBe( + "Speak warmly", + ); + expect(resolveOpenAITtsInstructions("gpt-4o-mini-tts-2025-12-15", "Speak warmly")).toBe( + "Speak warmly", + ); + expect(resolveOpenAITtsInstructions("tts-1", "Speak warmly")).toBeUndefined(); + expect(resolveOpenAITtsInstructions("tts-1-hd", "Speak warmly")).toBeUndefined(); + expect(resolveOpenAITtsInstructions("gpt-4o-mini-tts", " ")).toBeUndefined(); + }); + }); + describe("resolveOutputFormat", () => { it("selects opus for voice-bubble channels (telegram/feishu/whatsapp) and mp3 for others", () => { const cases = [ @@ -557,6 +572,84 @@ describe("tts", () => { }); }); + + describe("textToSpeechTelephony – openai instructions", () => { + const withMockedTelephonyFetch = async ( + run: (fetchMock: ReturnType<typeof vi.fn>) => Promise<void>, + ) => { + const originalFetch = globalThis.fetch; + const fetchMock = vi.fn(async () => ({ + ok: true, + arrayBuffer: async () => new ArrayBuffer(2), + })); + globalThis.fetch = fetchMock as unknown as typeof fetch; + try { + await run(fetchMock); + } finally { + globalThis.fetch = originalFetch; + } + }; + + it("omits instructions for unsupported speech models", async () => { + const cfg: OpenClawConfig = { + messages: { + tts: { + provider: "openai", + openai: { + apiKey: "test-key", + model: "tts-1", + voice: "alloy", + instructions: "Speak warmly", + }, + }, + }, + }; + + await withMockedTelephonyFetch(async (fetchMock) => { + const result = await tts.textToSpeechTelephony({ + text:
"Hello there, friendly caller.", + cfg, + }); + + expect(result.success).toBe(true); + expect(fetchMock).toHaveBeenCalledTimes(1); + const [, init] = fetchMock.mock.calls[0] as [string, RequestInit]; + expect(typeof init.body).toBe("string"); + const body = JSON.parse(init.body as string) as Record; + expect(body.instructions).toBeUndefined(); + }); + }); + + it("includes instructions for gpt-4o-mini-tts", async () => { + const cfg: OpenClawConfig = { + messages: { + tts: { + provider: "openai", + openai: { + apiKey: "test-key", + model: "gpt-4o-mini-tts", + voice: "alloy", + instructions: "Speak warmly", + }, + }, + }, + }; + + await withMockedTelephonyFetch(async (fetchMock) => { + const result = await tts.textToSpeechTelephony({ + text: "Hello there, friendly caller.", + cfg, + }); + + expect(result.success).toBe(true); + expect(fetchMock).toHaveBeenCalledTimes(1); + const [, init] = fetchMock.mock.calls[0] as [string, RequestInit]; + expect(typeof init.body).toBe("string"); + const body = JSON.parse(init.body as string) as Record; + expect(body.instructions).toBe("Speak warmly"); + }); + }); + }); + describe("maybeApplyTtsToPayload", () => { const baseCfg: OpenClawConfig = { agents: { defaults: { model: { primary: "openai/gpt-4o-mini" } } }, diff --git a/src/tts/tts.ts b/src/tts/tts.ts index f76000029f6..5cd306f13a9 100644 --- a/src/tts/tts.ts +++ b/src/tts/tts.ts @@ -37,6 +37,7 @@ import { isValidVoiceId, OPENAI_TTS_MODELS, OPENAI_TTS_VOICES, + resolveOpenAITtsInstructions, openaiTTS, parseTtsDirectives, scheduleCleanup, @@ -117,6 +118,8 @@ export type ResolvedTtsConfig = { baseUrl: string; model: string; voice: string; + speed?: number; + instructions?: string; }; edge: { enabled: boolean; @@ -304,6 +307,8 @@ export function resolveTtsConfig(cfg: OpenClawConfig): ResolvedTtsConfig { ).replace(/\/+$/, ""), model: raw.openai?.model ?? DEFAULT_OPENAI_MODEL, voice: raw.openai?.voice ?? 
DEFAULT_OPENAI_VOICE, + speed: raw.openai?.speed, + instructions: raw.openai?.instructions?.trim() || undefined, }, edge: { enabled: raw.edge?.enabled ?? true, @@ -692,6 +697,8 @@ export async function textToSpeech(params: { baseUrl: config.openai.baseUrl, model: openaiModelOverride ?? config.openai.model, voice: openaiVoiceOverride ?? config.openai.voice, + speed: config.openai.speed, + instructions: config.openai.instructions, responseFormat: output.openai, timeoutMs: config.timeoutMs, }); @@ -789,6 +796,8 @@ export async function textToSpeechTelephony(params: { baseUrl: config.openai.baseUrl, model: config.openai.model, voice: config.openai.voice, + speed: config.openai.speed, + instructions: config.openai.instructions, responseFormat: output.format, timeoutMs: config.timeoutMs, }); @@ -961,6 +970,7 @@ export const _test = { isValidOpenAIModel, OPENAI_TTS_MODELS, OPENAI_TTS_VOICES, + resolveOpenAITtsInstructions, parseTtsDirectives, resolveModelOverridePolicy, summarizeText,