fix: honor telephony tts directives

This commit is contained in:
Peter Steinberger
2026-05-01 22:48:27 +01:00
parent 236bd42bb3
commit 4389ceedac
11 changed files with 397 additions and 33 deletions

View File

@@ -13,6 +13,7 @@ Docs: https://docs.openclaw.ai
- Doctor/WhatsApp: warn when Linux crontabs still run the legacy `ensure-whatsapp.sh` health check, which can misreport `Gateway inactive` when cron lacks the systemd user-bus environment. Fixes #60204. Thanks @mySebbe.
- Slack/setup: print the generated app manifest as plain JSON instead of embedding it inside the framed setup note, so it can be copied into Slack without deleting border characters. Fixes #65751. Thanks @theDanielJLewis.
- Channels/WhatsApp: route CLI logout through the live Gateway and stop runtime-backed listeners before channel removal, so removing a WhatsApp account does not leave the old socket replying until restart. Fixes #67746. Thanks @123Mismail.
- Voice Call/Twilio: honor TTS directive text and provider voice/model overrides during telephony synthesis, so `[[tts:...]]` tags are not spoken literally and voiceId overrides reach OpenAI/ElevenLabs calls. Fixes #58114. Thanks @legonhilltech-jpg.
- Agents/Codex: stop prompting message-tool-only source turns to finish with `NO_REPLY`, so quiet turns are represented by not calling the visible message tool instead of conflicting final-text instructions. Thanks @pashpashpash.
- Gateway/config: report failed backup restores as failed in logs and config observe audit records instead of marking them valid. (#70515) Thanks @davidangularme.
- Compaction: use the active session model fallback chain for implicit summarization failures without persisting fallback model selection, so Azure content-filter 400s can recover. Fixes #64960. (#74470) Thanks @jalehman and @OpenCodeEngineer.

View File

@@ -1,7 +1,39 @@
import { describe, expect, it } from "vitest";
import { afterEach, describe, expect, it, vi } from "vitest";
import { buildElevenLabsSpeechProvider, isValidVoiceId } from "./speech-provider.js";
vi.mock("openclaw/plugin-sdk/ssrf-runtime", () => ({
fetchWithSsrFGuard: async ({
url,
init,
}: {
url: string;
init?: RequestInit;
}): Promise<{ response: Response; release: () => Promise<void> }> => ({
response: await globalThis.fetch(url, init),
release: vi.fn(async () => {}),
}),
ssrfPolicyFromHttpBaseUrlAllowedHostname: () => undefined,
}));
/**
 * Extracts and validates the JSON request body captured by the fetch mock.
 * Throws when the body is not a string or does not parse to a plain object,
 * so test failures point at the malformed request rather than a later assert.
 */
function parseRequestBody(init: RequestInit | undefined): Record<string, unknown> {
  const rawBody = init?.body;
  if (typeof rawBody !== "string") {
    throw new Error("expected string request body");
  }
  const parsed: unknown = JSON.parse(rawBody);
  // A valid ElevenLabs payload is a plain JSON object — reject null, arrays, and primitives.
  const isPlainObject = parsed !== null && typeof parsed === "object" && !Array.isArray(parsed);
  if (!isPlainObject) {
    throw new Error("expected ElevenLabs request body");
  }
  return parsed as Record<string, unknown>;
}
describe("elevenlabs speech provider", () => {
const originalFetch = globalThis.fetch;
afterEach(() => {
globalThis.fetch = originalFetch;
vi.restoreAllMocks();
});
it("exposes the current ElevenLabs TTS model catalog", () => {
const provider = buildElevenLabsSpeechProvider();
@@ -32,4 +64,49 @@ describe("elevenlabs speech provider", () => {
expect(isValidVoiceId(testCase.value), testCase.value).toBe(testCase.expected);
}
});
it("applies provider overrides to telephony synthesis", async () => {
const provider = buildElevenLabsSpeechProvider();
const fetchMock = vi.fn(async (url: string, init?: RequestInit) => {
expect(url).toContain("/v1/text-to-speech/21m00Tcm4TlvDq8ikWAM");
expect(url).toContain("output_format=pcm_22050");
const body = parseRequestBody(init);
expect(body).toMatchObject({
text: "hello",
model_id: "eleven_v3",
seed: 123,
apply_text_normalization: "on",
language_code: "en",
voice_settings: expect.objectContaining({
speed: 1.2,
}),
});
return new Response(new Uint8Array([1, 2, 3]), { status: 200 });
});
globalThis.fetch = fetchMock as unknown as typeof fetch;
const result = await provider.synthesizeTelephony?.({
text: "hello",
cfg: {} as never,
providerConfig: {
apiKey: "xi-test",
voiceId: "pMsXgVXv3BLzUgSXRplE",
modelId: "eleven_multilingual_v2",
},
providerOverrides: {
voiceId: "21m00Tcm4TlvDq8ikWAM",
modelId: "eleven_v3",
seed: 123,
applyTextNormalization: "on",
languageCode: "en",
voiceSettings: {
speed: 1.2,
},
},
timeoutMs: 1_000,
});
expect(result?.outputFormat).toBe("pcm_22050");
expect(fetchMock).toHaveBeenCalledTimes(1);
});
});

View File

@@ -152,6 +152,31 @@ function mergeVoiceSettingsOverride(
};
}
/**
 * Merges directive-supplied voice-setting overrides onto the configured base
 * settings. Each override field is applied only when it survives coercion
 * (finite number / boolean), so malformed directive values fall back to the
 * base configuration instead of clobbering it.
 */
function resolveVoiceSettingsOverride(
  base: ElevenLabsProviderConfig["voiceSettings"],
  overrides: unknown,
): ElevenLabsProviderConfig["voiceSettings"] {
  const raw = asObject(overrides);
  // Coerce each candidate once; null/undefined means "not overridden".
  const stability = asFiniteNumber(raw?.stability);
  const similarityBoost = asFiniteNumber(raw?.similarityBoost);
  const style = asFiniteNumber(raw?.style);
  const useSpeakerBoost = asBoolean(raw?.useSpeakerBoost);
  const speed = asFiniteNumber(raw?.speed);
  return {
    ...base,
    ...(stability == null ? {} : { stability }),
    ...(similarityBoost == null ? {} : { similarityBoost }),
    ...(style == null ? {} : { style }),
    ...(useSpeakerBoost == null ? {} : { useSpeakerBoost }),
    ...(speed == null ? {} : { speed }),
  };
}
function parseDirectiveToken(ctx: SpeechDirectiveTokenParseContext) {
try {
switch (ctx.key) {
@@ -469,7 +494,6 @@ export function buildElevenLabsSpeechProvider(): SpeechProviderPlugin {
const outputFormat =
trimToUndefined(overrides.outputFormat) ??
(req.target === "voice-note" ? "opus_48000_64" : "mp3_44100_128");
const overrideVoiceSettings = asObject(overrides.voiceSettings);
const latencyTier = asFiniteNumber(overrides.latencyTier);
const audioBuffer = await elevenLabsTTS({
text: req.text,
@@ -487,24 +511,7 @@ export function buildElevenLabsSpeechProvider(): SpeechProviderPlugin {
| undefined) ?? config.applyTextNormalization,
languageCode: trimToUndefined(overrides.languageCode) ?? config.languageCode,
latencyTier,
voiceSettings: {
...config.voiceSettings,
...(asFiniteNumber(overrideVoiceSettings?.stability) == null
? {}
: { stability: asFiniteNumber(overrideVoiceSettings?.stability) }),
...(asFiniteNumber(overrideVoiceSettings?.similarityBoost) == null
? {}
: { similarityBoost: asFiniteNumber(overrideVoiceSettings?.similarityBoost) }),
...(asFiniteNumber(overrideVoiceSettings?.style) == null
? {}
: { style: asFiniteNumber(overrideVoiceSettings?.style) }),
...(asBoolean(overrideVoiceSettings?.useSpeakerBoost) == null
? {}
: { useSpeakerBoost: asBoolean(overrideVoiceSettings?.useSpeakerBoost) }),
...(asFiniteNumber(overrideVoiceSettings?.speed) == null
? {}
: { speed: asFiniteNumber(overrideVoiceSettings?.speed) }),
},
voiceSettings: resolveVoiceSettingsOverride(config.voiceSettings, overrides.voiceSettings),
timeoutMs: req.timeoutMs,
});
return {
@@ -516,6 +523,7 @@ export function buildElevenLabsSpeechProvider(): SpeechProviderPlugin {
},
synthesizeTelephony: async (req) => {
const config = readElevenLabsProviderConfig(req.providerConfig);
const overrides = req.providerOverrides ?? {};
const apiKey =
config.apiKey || resolveElevenLabsApiKeyWithProfileFallback() || process.env.XI_API_KEY;
if (!apiKey) {
@@ -527,13 +535,18 @@ export function buildElevenLabsSpeechProvider(): SpeechProviderPlugin {
text: req.text,
apiKey,
baseUrl: config.baseUrl,
voiceId: config.voiceId,
modelId: config.modelId,
voiceId: trimToUndefined(overrides.voiceId) ?? config.voiceId,
modelId: trimToUndefined(overrides.modelId) ?? config.modelId,
outputFormat,
seed: config.seed,
applyTextNormalization: config.applyTextNormalization,
languageCode: config.languageCode,
voiceSettings: config.voiceSettings,
seed: asFiniteNumber(overrides.seed) ?? config.seed,
applyTextNormalization:
(trimToUndefined(overrides.applyTextNormalization) as
| "auto"
| "on"
| "off"
| undefined) ?? config.applyTextNormalization,
languageCode: trimToUndefined(overrides.languageCode) ?? config.languageCode,
voiceSettings: resolveVoiceSettingsOverride(config.voiceSettings, overrides.voiceSettings),
timeoutMs: req.timeoutMs,
});
return { audioBuffer, outputFormat, sampleRate };

View File

@@ -15,11 +15,21 @@ vi.mock("openclaw/plugin-sdk/ssrf-runtime", () => ({
ssrfPolicyFromHttpBaseUrlAllowedHostname: () => undefined,
}));
function isSpeechRequestBody(value: unknown): value is { response_format?: string } {
function isSpeechRequestBody(value: unknown): value is {
model?: string;
voice?: string;
speed?: number;
response_format?: string;
} {
return Boolean(value) && typeof value === "object" && !Array.isArray(value);
}
function parseRequestBody(init: RequestInit | undefined): { response_format?: string } {
function parseRequestBody(init: RequestInit | undefined): {
model?: string;
voice?: string;
speed?: number;
response_format?: string;
} {
if (typeof init?.body !== "string") {
throw new Error("expected string request body");
}
@@ -218,6 +228,41 @@ describe("buildOpenAISpeechProvider", () => {
expect(result.voiceCompatible).toBe(false);
});
it("applies provider overrides to telephony synthesis", async () => {
const provider = buildOpenAISpeechProvider();
const fetchMock = vi.fn(async (_url: string, init?: RequestInit) => {
const body = parseRequestBody(init);
expect(body).toMatchObject({
model: "tts-1",
voice: "nova",
speed: 1.25,
response_format: "pcm",
});
return new Response(new Uint8Array([1, 2, 3]), { status: 200 });
});
globalThis.fetch = fetchMock as unknown as typeof fetch;
const result = await provider.synthesizeTelephony?.({
text: "hello",
cfg: {} as never,
providerConfig: {
apiKey: "sk-test",
model: "gpt-4o-mini-tts",
voice: "alloy",
speed: 1,
},
providerOverrides: {
model: "tts-1",
voice: "nova",
speed: 1.25,
},
timeoutMs: 1_000,
});
expect(result?.outputFormat).toBe("pcm");
expect(fetchMock).toHaveBeenCalledTimes(1);
});
it("honors explicit responseFormat overrides and clears voice-note compatibility when not opus", async () => {
const provider = buildOpenAISpeechProvider();
mockSpeechFetchExpectingFormat("wav");

View File

@@ -309,6 +309,7 @@ export function buildOpenAISpeechProvider(): SpeechProviderPlugin {
},
synthesizeTelephony: async (req) => {
const config = readOpenAIProviderConfig(req.providerConfig);
const overrides = readOpenAIOverrides(req.providerOverrides);
const apiKey = config.apiKey || process.env.OPENAI_API_KEY;
if (!apiKey) {
throw new Error("OpenAI API key missing");
@@ -319,9 +320,9 @@ export function buildOpenAISpeechProvider(): SpeechProviderPlugin {
text: req.text,
apiKey,
baseUrl: config.baseUrl,
model: config.model,
voice: config.voice,
speed: config.speed,
model: overrides.model ?? config.model,
voice: overrides.voice ?? config.voice,
speed: overrides.speed ?? config.speed,
instructions: config.instructions,
responseFormat: outputFormat,
timeoutMs: req.timeoutMs,

View File

@@ -10,6 +10,7 @@ import type {
SpeechProviderPlugin,
SpeechProviderPrepareSynthesisContext,
SpeechSynthesisRequest,
SpeechTelephonySynthesisRequest,
} from "openclaw/plugin-sdk/speech-core";
import { afterEach, describe, expect, it, vi } from "vitest";
@@ -542,6 +543,47 @@ describe("speech-core native voice-note routing", () => {
expect(result.attempts?.[0]).not.toHaveProperty("personaBinding");
});
it("passes directive overrides to telephony synthesis providers", async () => {
const synthesizeTelephony = vi.fn(async (_request: SpeechTelephonySynthesisRequest) => ({
audioBuffer: Buffer.from("voice"),
outputFormat: "pcm",
sampleRate: 24000,
}));
installSpeechProviders([
createMockSpeechProvider("mock", {
synthesizeTelephony,
}),
]);
const result = await textToSpeechTelephony({
text: "Use a directed telephony voice.",
cfg: {
messages: {
tts: {
enabled: true,
provider: "mock",
},
},
},
overrides: {
providerOverrides: {
mock: {
voice: "directed-voice",
},
},
},
});
expect(result.success).toBe(true);
expect(synthesizeTelephony).toHaveBeenCalledWith(
expect.objectContaining({
providerOverrides: {
voice: "directed-voice",
},
}),
);
});
it("uses provider defaults when fallback policy allows missing persona bindings", async () => {
await synthesizeSpeech({
text: "Use neutral provider defaults.",

View File

@@ -1318,11 +1318,13 @@ export async function textToSpeechTelephony(params: {
text: string;
cfg: OpenClawConfig;
prefsPath?: string;
overrides?: TtsDirectiveOverrides;
}): Promise<TtsTelephonyResult> {
const setup = resolveTtsRequestSetup({
text: params.text,
cfg: params.cfg,
prefsPath: params.prefsPath,
providerOverride: params.overrides?.provider,
});
if ("error" in setup) {
return { success: false, error: setup.error };
@@ -1371,6 +1373,7 @@ export async function textToSpeechTelephony(params: {
text: params.text,
cfg,
providerConfig: resolvedProvider.providerConfig,
providerOverrides: params.overrides?.providerOverrides?.[resolvedProvider.provider.id],
persona: resolvedProvider.synthesisPersona,
personaProviderConfig: resolvedProvider.personaProviderConfig,
target: "telephony",
@@ -1380,6 +1383,7 @@ export async function textToSpeechTelephony(params: {
text: prepared.text,
cfg,
providerConfig: prepared.providerConfig,
providerOverrides: prepared.providerOverrides,
timeoutMs: config.timeoutMs,
});
const latencyMs = Date.now() - providerStart;

View File

@@ -117,6 +117,53 @@ describe("createTelephonyTtsProvider deepMerge hardening", () => {
);
});
it("strips telephony TTS directive tags before synthesis", async () => {
let requestText: string | undefined;
const provider = createTelephonyTtsProvider({
coreConfig: createCoreConfig(),
runtime: {
textToSpeechTelephony: async ({ text }) => {
requestText = text;
return {
success: true,
audioBuffer: Buffer.alloc(2),
sampleRate: 8000,
};
},
},
});
await provider.synthesizeForTelephony("[[tts]]Hello caller[[/tts]]");
expect(requestText).toBe("Hello caller");
});
it("uses hidden telephony TTS directive text for synthesis", async () => {
let requestText: string | undefined;
let requestOverrides: unknown;
const provider = createTelephonyTtsProvider({
coreConfig: createCoreConfig(),
runtime: {
textToSpeechTelephony: async ({ text, overrides }) => {
requestText = text;
requestOverrides = overrides;
return {
success: true,
audioBuffer: Buffer.alloc(2),
sampleRate: 8000,
};
},
},
});
await provider.synthesizeForTelephony(
"Visible text [[tts:text]]Speak this instead[[/tts:text]]",
);
expect(requestText).toBe("Speak this instead");
expect(requestOverrides).toMatchObject({ ttsText: "Speak this instead" });
});
it("exposes configured timeoutMs as synthesisTimeoutMs", () => {
const provider = createTelephonyTtsProvider({
coreConfig: { messages: { tts: { provider: "openai", timeoutMs: 15000 } } },

View File

@@ -1,3 +1,9 @@
import {
parseTtsDirectives,
type SpeechModelOverridePolicy,
type SpeechProviderConfig,
type TtsDirectiveOverrides,
} from "openclaw/plugin-sdk/speech";
import type { VoiceCallTtsConfig } from "./config.js";
import type { CoreConfig } from "./core-bridge.js";
import { deepMergeDefined } from "./deep-merge.js";
@@ -8,6 +14,7 @@ export type TelephonyTtsRuntime = {
text: string;
cfg: CoreConfig;
prefsPath?: string;
overrides?: TtsDirectiveOverrides;
}) => Promise<{
success: boolean;
audioBuffer?: Buffer;
@@ -26,6 +33,17 @@ export type TelephonyTtsProvider = {
export const TELEPHONY_DEFAULT_TTS_TIMEOUT_MS = 8000;
type TelephonyModelOverrideConfig = {
enabled?: boolean;
allowText?: boolean;
allowProvider?: boolean;
allowVoice?: boolean;
allowModelId?: boolean;
allowVoiceSettings?: boolean;
allowNormalization?: boolean;
allowSeed?: boolean;
};
export function createTelephonyTtsProvider(params: {
coreConfig: CoreConfig;
ttsOverride?: VoiceCallTtsConfig;
@@ -36,15 +54,35 @@ export function createTelephonyTtsProvider(params: {
}): TelephonyTtsProvider {
const { coreConfig, ttsOverride, runtime, logger } = params;
const mergedConfig = applyTtsOverride(coreConfig, ttsOverride);
const ttsConfig = mergedConfig.messages?.tts;
const modelOverrides = resolveTelephonyModelOverridePolicy(
readTelephonyModelOverrides(ttsConfig),
);
const providerConfigs = collectTelephonyProviderConfigs(ttsConfig);
const activeProvider = normalizeProviderId(ttsConfig?.provider);
const synthesisTimeoutMs =
mergedConfig.messages?.tts?.timeoutMs ?? TELEPHONY_DEFAULT_TTS_TIMEOUT_MS;
return {
synthesisTimeoutMs,
synthesizeForTelephony: async (text: string) => {
const result = await runtime.textToSpeechTelephony({
text,
const directives = parseTtsDirectives(text, modelOverrides, {
cfg: mergedConfig,
providerConfigs,
preferredProviderId: activeProvider,
});
if (directives.warnings.length > 0) {
logger?.warn?.(
`[voice-call] Ignored telephony TTS directive overrides (${directives.warnings.join("; ")})`,
);
}
const cleanText = directives.hasDirective
? directives.ttsText?.trim() || directives.cleanedText.trim()
: text;
const result = await runtime.textToSpeechTelephony({
text: cleanText,
cfg: mergedConfig,
overrides: directives.overrides,
});
if (!result.success || !result.audioBuffer || !result.sampleRate) {
@@ -101,3 +139,97 @@ function mergeTtsConfig(
}
return deepMergeDefined(base, override) as VoiceCallTtsConfig;
}
/**
 * Translates the raw telephony `modelOverrides` config into a concrete
 * directive policy. When overrides are disabled everything is locked down;
 * when enabled, each capability defaults to allowed except `allowProvider`,
 * which must be opted into explicitly.
 */
function resolveTelephonyModelOverridePolicy(
  overrides: TelephonyModelOverrideConfig | undefined,
): SpeechModelOverridePolicy {
  // Absent config means "enabled with defaults".
  if (!(overrides?.enabled ?? true)) {
    return {
      enabled: false,
      allowText: false,
      allowProvider: false,
      allowVoice: false,
      allowModelId: false,
      allowVoiceSettings: false,
      allowNormalization: false,
      allowSeed: false,
    };
  }
  const flag = (value: boolean | undefined, fallback = true): boolean => value ?? fallback;
  return {
    enabled: true,
    allowText: flag(overrides?.allowText),
    // Provider switching is the riskiest override, so it is off unless opted in.
    allowProvider: flag(overrides?.allowProvider, false),
    allowVoice: flag(overrides?.allowVoice),
    allowModelId: flag(overrides?.allowModelId),
    allowVoiceSettings: flag(overrides?.allowVoiceSettings),
    allowNormalization: flag(overrides?.allowNormalization),
    allowSeed: flag(overrides?.allowSeed),
  };
}
/**
 * Reads the untyped `modelOverrides` property off the TTS config, returning it
 * only when it is a plain object (null, arrays, and primitives yield undefined).
 */
function readTelephonyModelOverrides(
  ttsConfig: VoiceCallTtsConfig | undefined,
): TelephonyModelOverrideConfig | undefined {
  const candidate = (ttsConfig as Record<string, unknown> | undefined)?.modelOverrides;
  if (!candidate || typeof candidate !== "object" || Array.isArray(candidate)) {
    return undefined;
  }
  return candidate as TelephonyModelOverrideConfig;
}
/**
 * Canonicalizes a provider id: trimmed and lowercased. Non-strings and
 * strings that are empty after trimming yield undefined.
 */
function normalizeProviderId(value: unknown): string | undefined {
  if (typeof value !== "string") {
    return undefined;
  }
  const normalized = value.trim().toLowerCase();
  return normalized.length > 0 ? normalized : undefined;
}
/**
 * Narrows an unknown config value to a provider config object,
 * substituting an empty config for anything that is not a plain object.
 */
function asProviderConfig(value: unknown): SpeechProviderConfig {
  const isPlainObject = Boolean(value) && typeof value === "object" && !Array.isArray(value);
  return isPlainObject ? (value as SpeechProviderConfig) : {};
}
/**
 * Builds a map of normalized provider id -> provider config from the TTS
 * config. Entries come from two places, in precedence order:
 *   1. the explicit `providers` map (always wins), and
 *   2. top-level object-valued keys on the config itself, excluding the
 *      reserved non-provider keys (these only fill gaps, via `??=`).
 */
function collectTelephonyProviderConfigs(
  ttsConfig: VoiceCallTtsConfig | undefined,
): Record<string, SpeechProviderConfig> {
  const entries: Record<string, SpeechProviderConfig> = {};
  if (!ttsConfig) {
    return entries;
  }
  const providersValue = ttsConfig.providers;
  const providersMap =
    providersValue && typeof providersValue === "object" && !Array.isArray(providersValue)
      ? (providersValue as Record<string, unknown>)
      : {};
  for (const [providerId, value] of Object.entries(providersMap)) {
    entries[normalizeProviderId(providerId) ?? providerId] = asProviderConfig(value);
  }
  // Keys on the TTS config that are settings, not inline provider configs.
  const reservedKeys = new Set([
    "auto",
    "enabled",
    "maxTextLength",
    "mode",
    "modelOverrides",
    "persona",
    "personas",
    "prefsPath",
    "provider",
    "providers",
    "summaryModel",
    "timeoutMs",
  ]);
  for (const [key, value] of Object.entries(ttsConfig as Record<string, unknown>)) {
    if (reservedKeys.has(key)) {
      continue;
    }
    if (value === null || typeof value !== "object" || Array.isArray(value)) {
      continue;
    }
    // Explicit `providers` entries take priority over inline top-level configs.
    entries[normalizeProviderId(key) ?? key] ??= asProviderConfig(value);
  }
  return entries;
}

View File

@@ -88,6 +88,7 @@ export type TtsTelephonyRequestParams = {
text: string;
cfg: OpenClawConfig;
prefsPath?: string;
overrides?: TtsDirectiveOverrides;
};
export type ListSpeechVoicesParams = {

View File

@@ -61,6 +61,7 @@ export type SpeechTelephonySynthesisRequest = {
text: string;
cfg: OpenClawConfig;
providerConfig: SpeechProviderConfig;
providerOverrides?: SpeechProviderOverrides;
timeoutMs: number;
};