refactor: plugin-own speech provider config

Peter Steinberger
2026-03-26 22:27:17 +00:00
parent 8eeb7f0829
commit 2c6d099b01
28 changed files with 1791 additions and 1229 deletions

View File

@@ -645,11 +645,10 @@ export class DiscordVoiceManager {
cfg: this.params.cfg,
override: this.params.discordConfig.voice?.tts,
});
const directive = parseTtsDirectives(
replyText,
ttsConfig.modelOverrides,
ttsConfig.openai.baseUrl,
);
const directive = parseTtsDirectives(replyText, ttsConfig.modelOverrides, {
cfg: ttsCfg,
providerConfigs: ttsConfig.providerConfigs,
});
const speakText = directive.overrides.ttsText ?? directive.cleanedText.trim();
if (!speakText) {
logVoiceVerbose(

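For context, the call-site change above replaces the OpenAI-specific base URL argument with a provider-agnostic context object. A rough sketch of the new shape, assuming a signature along these lines (the real parameter types live in the speech-core plugin SDK and are not shown in this hunk):

// Sketch only: the type and declaration below are illustrative assumptions;
// only the call shape and the fields read from `directive` appear in the hunk.
type TtsDirectiveParseContext = {
  cfg: unknown; // runtime config object passed through from the caller
  providerConfigs: Record<string, Record<string, unknown>>; // per-provider config blocks
};

// Old: parseTtsDirectives(replyText, modelOverrides, openaiBaseUrl)
// New: parseTtsDirectives(replyText, modelOverrides, context)
declare function parseTtsDirectives(
  text: string,
  modelOverrides: unknown,
  context: TtsDirectiveParseContext,
): { cleanedText: string; overrides: { ttsText?: string } };
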
View File

@@ -1,16 +1,309 @@
import type { SpeechProviderPlugin } from "openclaw/plugin-sdk/core";
import type { SpeechVoiceOption } from "openclaw/plugin-sdk/speech";
import { normalizeResolvedSecretInputString } from "openclaw/plugin-sdk/secret-input";
import type {
SpeechDirectiveTokenParseContext,
SpeechProviderConfig,
SpeechProviderOverrides,
SpeechProviderPlugin,
SpeechVoiceOption,
} from "openclaw/plugin-sdk/speech-core";
import {
normalizeApplyTextNormalization,
normalizeLanguageCode,
normalizeSeed,
requireInRange,
} from "openclaw/plugin-sdk/speech-core";
import { elevenLabsTTS } from "./tts.js";
const DEFAULT_ELEVENLABS_BASE_URL = "https://api.elevenlabs.io";
const DEFAULT_ELEVENLABS_VOICE_ID = "pMsXgVXv3BLzUgSXRplE";
const DEFAULT_ELEVENLABS_MODEL_ID = "eleven_multilingual_v2";
const DEFAULT_ELEVENLABS_VOICE_SETTINGS = {
stability: 0.5,
similarityBoost: 0.75,
style: 0.0,
useSpeakerBoost: true,
speed: 1.0,
};
const ELEVENLABS_TTS_MODELS = [
"eleven_multilingual_v2",
"eleven_turbo_v2_5",
"eleven_monolingual_v1",
] as const;
type ElevenLabsProviderConfig = {
apiKey?: string;
baseUrl: string;
voiceId: string;
modelId: string;
seed?: number;
applyTextNormalization?: "auto" | "on" | "off";
languageCode?: string;
voiceSettings: {
stability: number;
similarityBoost: number;
style: number;
useSpeakerBoost: boolean;
speed: number;
};
};
function trimToUndefined(value: unknown): string | undefined {
return typeof value === "string" && value.trim() ? value.trim() : undefined;
}
function asNumber(value: unknown): number | undefined {
return typeof value === "number" && Number.isFinite(value) ? value : undefined;
}
function asBoolean(value: unknown): boolean | undefined {
return typeof value === "boolean" ? value : undefined;
}
function asObject(value: unknown): Record<string, unknown> | undefined {
return typeof value === "object" && value !== null && !Array.isArray(value)
? (value as Record<string, unknown>)
: undefined;
}
function parseBooleanValue(value: string): boolean | undefined {
const normalized = value.trim().toLowerCase();
if (["true", "1", "yes", "on"].includes(normalized)) {
return true;
}
if (["false", "0", "no", "off"].includes(normalized)) {
return false;
}
return undefined;
}
function parseNumberValue(value: string): number | undefined {
const parsed = Number.parseFloat(value);
return Number.isFinite(parsed) ? parsed : undefined;
}
export function isValidVoiceId(voiceId: string): boolean {
return /^[a-zA-Z0-9]{10,40}$/.test(voiceId);
}
function normalizeElevenLabsBaseUrl(baseUrl: string | undefined): string {
const trimmed = baseUrl?.trim();
return trimmed?.replace(/\/+$/, "") || "https://api.elevenlabs.io";
return trimmed?.replace(/\/+$/, "") || DEFAULT_ELEVENLABS_BASE_URL;
}
function normalizeElevenLabsProviderConfig(
rawConfig: Record<string, unknown>,
): ElevenLabsProviderConfig {
const raw = asObject(rawConfig.elevenlabs);
const rawVoiceSettings = asObject(raw?.voiceSettings);
return {
apiKey: normalizeResolvedSecretInputString({
value: raw?.apiKey,
path: "messages.tts.elevenlabs.apiKey",
}),
baseUrl: normalizeElevenLabsBaseUrl(trimToUndefined(raw?.baseUrl)),
voiceId: trimToUndefined(raw?.voiceId) ?? DEFAULT_ELEVENLABS_VOICE_ID,
modelId: trimToUndefined(raw?.modelId) ?? DEFAULT_ELEVENLABS_MODEL_ID,
seed: asNumber(raw?.seed),
applyTextNormalization: trimToUndefined(raw?.applyTextNormalization) as
| "auto"
| "on"
| "off"
| undefined,
languageCode: trimToUndefined(raw?.languageCode),
voiceSettings: {
stability:
asNumber(rawVoiceSettings?.stability) ?? DEFAULT_ELEVENLABS_VOICE_SETTINGS.stability,
similarityBoost:
asNumber(rawVoiceSettings?.similarityBoost) ??
DEFAULT_ELEVENLABS_VOICE_SETTINGS.similarityBoost,
style: asNumber(rawVoiceSettings?.style) ?? DEFAULT_ELEVENLABS_VOICE_SETTINGS.style,
useSpeakerBoost:
asBoolean(rawVoiceSettings?.useSpeakerBoost) ??
DEFAULT_ELEVENLABS_VOICE_SETTINGS.useSpeakerBoost,
speed: asNumber(rawVoiceSettings?.speed) ?? DEFAULT_ELEVENLABS_VOICE_SETTINGS.speed,
},
};
}
function readElevenLabsProviderConfig(config: SpeechProviderConfig): ElevenLabsProviderConfig {
const defaults = normalizeElevenLabsProviderConfig({});
const voiceSettings = asObject(config.voiceSettings);
return {
apiKey: trimToUndefined(config.apiKey) ?? defaults.apiKey,
baseUrl: normalizeElevenLabsBaseUrl(trimToUndefined(config.baseUrl) ?? defaults.baseUrl),
voiceId: trimToUndefined(config.voiceId) ?? defaults.voiceId,
modelId: trimToUndefined(config.modelId) ?? defaults.modelId,
seed: asNumber(config.seed) ?? defaults.seed,
applyTextNormalization:
(trimToUndefined(config.applyTextNormalization) as "auto" | "on" | "off" | undefined) ??
defaults.applyTextNormalization,
languageCode: trimToUndefined(config.languageCode) ?? defaults.languageCode,
voiceSettings: {
stability: asNumber(voiceSettings?.stability) ?? defaults.voiceSettings.stability,
similarityBoost:
asNumber(voiceSettings?.similarityBoost) ?? defaults.voiceSettings.similarityBoost,
style: asNumber(voiceSettings?.style) ?? defaults.voiceSettings.style,
useSpeakerBoost:
asBoolean(voiceSettings?.useSpeakerBoost) ?? defaults.voiceSettings.useSpeakerBoost,
speed: asNumber(voiceSettings?.speed) ?? defaults.voiceSettings.speed,
},
};
}
function mergeVoiceSettingsOverride(
ctx: SpeechDirectiveTokenParseContext,
next: Record<string, unknown>,
): SpeechProviderOverrides {
return {
...(ctx.currentOverrides ?? {}),
voiceSettings: {
...(asObject(ctx.currentOverrides?.voiceSettings) ?? {}),
...next,
},
};
}
function parseDirectiveToken(ctx: SpeechDirectiveTokenParseContext) {
try {
switch (ctx.key) {
case "voiceid":
case "voice_id":
case "elevenlabs_voice":
case "elevenlabsvoice":
if (!ctx.policy.allowVoice) {
return { handled: true };
}
if (!isValidVoiceId(ctx.value)) {
return { handled: true, warnings: [`invalid ElevenLabs voiceId "${ctx.value}"`] };
}
return {
handled: true,
overrides: { ...(ctx.currentOverrides ?? {}), voiceId: ctx.value },
};
case "model":
case "modelid":
case "model_id":
case "elevenlabs_model":
case "elevenlabsmodel":
if (!ctx.policy.allowModelId) {
return { handled: true };
}
return {
handled: true,
overrides: { ...(ctx.currentOverrides ?? {}), modelId: ctx.value },
};
case "stability": {
if (!ctx.policy.allowVoiceSettings) {
return { handled: true };
}
const value = parseNumberValue(ctx.value);
if (value == null) {
return { handled: true, warnings: ["invalid stability value"] };
}
requireInRange(value, 0, 1, "stability");
return { handled: true, overrides: mergeVoiceSettingsOverride(ctx, { stability: value }) };
}
case "similarity":
case "similarityboost":
case "similarity_boost": {
if (!ctx.policy.allowVoiceSettings) {
return { handled: true };
}
const value = parseNumberValue(ctx.value);
if (value == null) {
return { handled: true, warnings: ["invalid similarityBoost value"] };
}
requireInRange(value, 0, 1, "similarityBoost");
return {
handled: true,
overrides: mergeVoiceSettingsOverride(ctx, { similarityBoost: value }),
};
}
case "style": {
if (!ctx.policy.allowVoiceSettings) {
return { handled: true };
}
const value = parseNumberValue(ctx.value);
if (value == null) {
return { handled: true, warnings: ["invalid style value"] };
}
requireInRange(value, 0, 1, "style");
return { handled: true, overrides: mergeVoiceSettingsOverride(ctx, { style: value }) };
}
case "speed": {
if (!ctx.policy.allowVoiceSettings) {
return { handled: true };
}
const value = parseNumberValue(ctx.value);
if (value == null) {
return { handled: true, warnings: ["invalid speed value"] };
}
requireInRange(value, 0.5, 2, "speed");
return { handled: true, overrides: mergeVoiceSettingsOverride(ctx, { speed: value }) };
}
case "speakerboost":
case "speaker_boost":
case "usespeakerboost":
case "use_speaker_boost": {
if (!ctx.policy.allowVoiceSettings) {
return { handled: true };
}
const value = parseBooleanValue(ctx.value);
if (value == null) {
return { handled: true, warnings: ["invalid useSpeakerBoost value"] };
}
return {
handled: true,
overrides: mergeVoiceSettingsOverride(ctx, { useSpeakerBoost: value }),
};
}
case "normalize":
case "applytextnormalization":
case "apply_text_normalization":
if (!ctx.policy.allowNormalization) {
return { handled: true };
}
return {
handled: true,
overrides: {
...(ctx.currentOverrides ?? {}),
applyTextNormalization: normalizeApplyTextNormalization(ctx.value),
},
};
case "language":
case "languagecode":
case "language_code":
if (!ctx.policy.allowNormalization) {
return { handled: true };
}
return {
handled: true,
overrides: {
...(ctx.currentOverrides ?? {}),
languageCode: normalizeLanguageCode(ctx.value),
},
};
case "seed":
if (!ctx.policy.allowSeed) {
return { handled: true };
}
return {
handled: true,
overrides: {
...(ctx.currentOverrides ?? {}),
seed: normalizeSeed(Number.parseInt(ctx.value, 10)),
},
};
default:
return { handled: false };
}
} catch (error) {
return {
handled: true,
warnings: [error instanceof Error ? error.message : String(error)],
};
}
}
export async function listElevenLabsVoices(params: {
@@ -49,49 +342,164 @@ export function buildElevenLabsSpeechProvider(): SpeechProviderPlugin {
return {
id: "elevenlabs",
label: "ElevenLabs",
autoSelectOrder: 20,
models: ELEVENLABS_TTS_MODELS,
resolveConfig: ({ rawConfig }) => normalizeElevenLabsProviderConfig(rawConfig),
parseDirectiveToken,
resolveTalkConfig: ({ baseTtsConfig, talkProviderConfig }) => {
const base = normalizeElevenLabsProviderConfig(baseTtsConfig);
const talkVoiceSettings = asObject(talkProviderConfig.voiceSettings);
return {
...base,
...(talkProviderConfig.apiKey === undefined
? {}
: {
apiKey: normalizeResolvedSecretInputString({
value: talkProviderConfig.apiKey,
path: "talk.providers.elevenlabs.apiKey",
}),
}),
...(trimToUndefined(talkProviderConfig.baseUrl) == null
? {}
: { baseUrl: normalizeElevenLabsBaseUrl(trimToUndefined(talkProviderConfig.baseUrl)) }),
...(trimToUndefined(talkProviderConfig.voiceId) == null
? {}
: { voiceId: trimToUndefined(talkProviderConfig.voiceId) }),
...(trimToUndefined(talkProviderConfig.modelId) == null
? {}
: { modelId: trimToUndefined(talkProviderConfig.modelId) }),
...(asNumber(talkProviderConfig.seed) == null
? {}
: { seed: asNumber(talkProviderConfig.seed) }),
...(trimToUndefined(talkProviderConfig.applyTextNormalization) == null
? {}
: {
applyTextNormalization: normalizeApplyTextNormalization(
trimToUndefined(talkProviderConfig.applyTextNormalization),
),
}),
...(trimToUndefined(talkProviderConfig.languageCode) == null
? {}
: {
languageCode: normalizeLanguageCode(trimToUndefined(talkProviderConfig.languageCode)),
}),
voiceSettings: {
...base.voiceSettings,
...(asNumber(talkVoiceSettings?.stability) == null
? {}
: { stability: asNumber(talkVoiceSettings?.stability) }),
...(asNumber(talkVoiceSettings?.similarityBoost) == null
? {}
: { similarityBoost: asNumber(talkVoiceSettings?.similarityBoost) }),
...(asNumber(talkVoiceSettings?.style) == null
? {}
: { style: asNumber(talkVoiceSettings?.style) }),
...(asBoolean(talkVoiceSettings?.useSpeakerBoost) == null
? {}
: { useSpeakerBoost: asBoolean(talkVoiceSettings?.useSpeakerBoost) }),
...(asNumber(talkVoiceSettings?.speed) == null
? {}
: { speed: asNumber(talkVoiceSettings?.speed) }),
},
};
},
resolveTalkOverrides: ({ params }) => {
const normalize = trimToUndefined(params.normalize);
const language = trimToUndefined(params.language)?.toLowerCase();
const voiceSettings = {
...(asNumber(params.speed) == null ? {} : { speed: asNumber(params.speed) }),
...(asNumber(params.stability) == null ? {} : { stability: asNumber(params.stability) }),
...(asNumber(params.similarity) == null
? {}
: { similarityBoost: asNumber(params.similarity) }),
...(asNumber(params.style) == null ? {} : { style: asNumber(params.style) }),
...(asBoolean(params.speakerBoost) == null
? {}
: { useSpeakerBoost: asBoolean(params.speakerBoost) }),
};
return {
...(trimToUndefined(params.voiceId) == null
? {}
: { voiceId: trimToUndefined(params.voiceId) }),
...(trimToUndefined(params.modelId) == null
? {}
: { modelId: trimToUndefined(params.modelId) }),
...(trimToUndefined(params.outputFormat) == null
? {}
: { outputFormat: trimToUndefined(params.outputFormat) }),
...(asNumber(params.seed) == null ? {} : { seed: asNumber(params.seed) }),
...(normalize == null
? {}
: { applyTextNormalization: normalizeApplyTextNormalization(normalize) }),
...(language == null ? {} : { languageCode: normalizeLanguageCode(language) }),
...(Object.keys(voiceSettings).length === 0 ? {} : { voiceSettings }),
};
},
listVoices: async (req) => {
const config = req.providerConfig
? readElevenLabsProviderConfig(req.providerConfig)
: undefined;
const apiKey =
req.apiKey ||
req.config?.elevenlabs.apiKey ||
process.env.ELEVENLABS_API_KEY ||
process.env.XI_API_KEY;
req.apiKey || config?.apiKey || process.env.ELEVENLABS_API_KEY || process.env.XI_API_KEY;
if (!apiKey) {
throw new Error("ElevenLabs API key missing");
}
return listElevenLabsVoices({
apiKey,
baseUrl: req.baseUrl ?? req.config?.elevenlabs.baseUrl,
baseUrl: req.baseUrl ?? config?.baseUrl,
});
},
isConfigured: ({ config }) =>
Boolean(config.elevenlabs.apiKey || process.env.ELEVENLABS_API_KEY || process.env.XI_API_KEY),
isConfigured: ({ providerConfig }) =>
Boolean(
readElevenLabsProviderConfig(providerConfig).apiKey ||
process.env.ELEVENLABS_API_KEY ||
process.env.XI_API_KEY,
),
synthesize: async (req) => {
const apiKey =
req.config.elevenlabs.apiKey || process.env.ELEVENLABS_API_KEY || process.env.XI_API_KEY;
const config = readElevenLabsProviderConfig(req.providerConfig);
const overrides = req.providerOverrides ?? {};
const apiKey = config.apiKey || process.env.ELEVENLABS_API_KEY || process.env.XI_API_KEY;
if (!apiKey) {
throw new Error("ElevenLabs API key missing");
}
const outputFormat =
req.overrides?.elevenlabs?.outputFormat ??
trimToUndefined(overrides.outputFormat) ??
(req.target === "voice-note" ? "opus_48000_64" : "mp3_44100_128");
const overrideVoiceSettings = asObject(overrides.voiceSettings);
const audioBuffer = await elevenLabsTTS({
text: req.text,
apiKey,
baseUrl: req.config.elevenlabs.baseUrl,
voiceId: req.overrides?.elevenlabs?.voiceId ?? req.config.elevenlabs.voiceId,
modelId: req.overrides?.elevenlabs?.modelId ?? req.config.elevenlabs.modelId,
baseUrl: config.baseUrl,
voiceId: trimToUndefined(overrides.voiceId) ?? config.voiceId,
modelId: trimToUndefined(overrides.modelId) ?? config.modelId,
outputFormat,
seed: req.overrides?.elevenlabs?.seed ?? req.config.elevenlabs.seed,
seed: asNumber(overrides.seed) ?? config.seed,
applyTextNormalization:
req.overrides?.elevenlabs?.applyTextNormalization ??
req.config.elevenlabs.applyTextNormalization,
languageCode: req.overrides?.elevenlabs?.languageCode ?? req.config.elevenlabs.languageCode,
(trimToUndefined(overrides.applyTextNormalization) as
| "auto"
| "on"
| "off"
| undefined) ?? config.applyTextNormalization,
languageCode: trimToUndefined(overrides.languageCode) ?? config.languageCode,
voiceSettings: {
...req.config.elevenlabs.voiceSettings,
...req.overrides?.elevenlabs?.voiceSettings,
...config.voiceSettings,
...(asNumber(overrideVoiceSettings?.stability) == null
? {}
: { stability: asNumber(overrideVoiceSettings?.stability) }),
...(asNumber(overrideVoiceSettings?.similarityBoost) == null
? {}
: { similarityBoost: asNumber(overrideVoiceSettings?.similarityBoost) }),
...(asNumber(overrideVoiceSettings?.style) == null
? {}
: { style: asNumber(overrideVoiceSettings?.style) }),
...(asBoolean(overrideVoiceSettings?.useSpeakerBoost) == null
? {}
: { useSpeakerBoost: asBoolean(overrideVoiceSettings?.useSpeakerBoost) }),
...(asNumber(overrideVoiceSettings?.speed) == null
? {}
: { speed: asNumber(overrideVoiceSettings?.speed) }),
},
timeoutMs: req.config.timeoutMs,
timeoutMs: req.timeoutMs,
});
return {
audioBuffer,
@@ -101,8 +509,8 @@ export function buildElevenLabsSpeechProvider(): SpeechProviderPlugin {
};
},
synthesizeTelephony: async (req) => {
const apiKey =
req.config.elevenlabs.apiKey || process.env.ELEVENLABS_API_KEY || process.env.XI_API_KEY;
const config = readElevenLabsProviderConfig(req.providerConfig);
const apiKey = config.apiKey || process.env.ELEVENLABS_API_KEY || process.env.XI_API_KEY;
if (!apiKey) {
throw new Error("ElevenLabs API key missing");
}
@@ -111,15 +519,15 @@ export function buildElevenLabsSpeechProvider(): SpeechProviderPlugin {
const audioBuffer = await elevenLabsTTS({
text: req.text,
apiKey,
baseUrl: req.config.elevenlabs.baseUrl,
voiceId: req.config.elevenlabs.voiceId,
modelId: req.config.elevenlabs.modelId,
baseUrl: config.baseUrl,
voiceId: config.voiceId,
modelId: config.modelId,
outputFormat,
seed: req.config.elevenlabs.seed,
applyTextNormalization: req.config.elevenlabs.applyTextNormalization,
languageCode: req.config.elevenlabs.languageCode,
voiceSettings: req.config.elevenlabs.voiceSettings,
timeoutMs: req.config.timeoutMs,
seed: config.seed,
applyTextNormalization: config.applyTextNormalization,
languageCode: config.languageCode,
voiceSettings: config.voiceSettings,
timeoutMs: req.timeoutMs,
});
return { audioBuffer, outputFormat, sampleRate };
},

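A quick sanity sketch of the normalization above (not part of the diff): this is what the per-provider resolution should yield for a partially specified messages.tts.elevenlabs block, assuming the module-local helper were exported for a test.

// Hypothetical check; normalizeElevenLabsProviderConfig is module-local in the
// diff, so calling it from outside the file is an assumption for illustration.
const resolved = normalizeElevenLabsProviderConfig({
  elevenlabs: {
    voiceId: "pMsXgVXv3BLzUgSXRplE",
    voiceSettings: { speed: 1.2 },
  },
});
// resolved.baseUrl       -> "https://api.elevenlabs.io" (default)
// resolved.modelId       -> "eleven_multilingual_v2" (default)
// resolved.voiceSettings -> { stability: 0.5, similarityBoost: 0.75, style: 0,
//                             useSpeakerBoost: true, speed: 1.2 }
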
View File

@@ -5,14 +5,33 @@ import {
TRUSTED_CLIENT_TOKEN,
generateSecMsGecToken,
} from "node-edge-tts/dist/drm.js";
import type { SpeechProviderPlugin } from "openclaw/plugin-sdk/core";
import { resolvePreferredOpenClawTmpDir } from "openclaw/plugin-sdk/llm-task";
import { isVoiceCompatibleAudio } from "openclaw/plugin-sdk/media-runtime";
import type { SpeechVoiceOption } from "openclaw/plugin-sdk/speech";
import type {
SpeechProviderConfig,
SpeechProviderPlugin,
SpeechVoiceOption,
} from "openclaw/plugin-sdk/speech-core";
import { edgeTTS, inferEdgeExtension } from "./tts.js";
const DEFAULT_EDGE_VOICE = "en-US-MichelleNeural";
const DEFAULT_EDGE_LANG = "en-US";
const DEFAULT_EDGE_OUTPUT_FORMAT = "audio-24khz-48kbitrate-mono-mp3";
type MicrosoftProviderConfig = {
enabled: boolean;
voice: string;
lang: string;
outputFormat: string;
outputFormatConfigured: boolean;
pitch?: string;
rate?: string;
volume?: string;
saveSubtitles: boolean;
proxy?: string;
timeoutMs?: number;
};
type MicrosoftVoiceListEntry = {
ShortName?: string;
FriendlyName?: string;
@@ -24,6 +43,64 @@ type MicrosoftVoiceListEntry = {
};
};
function trimToUndefined(value: unknown): string | undefined {
return typeof value === "string" && value.trim() ? value.trim() : undefined;
}
function asBoolean(value: unknown): boolean | undefined {
return typeof value === "boolean" ? value : undefined;
}
function asNumber(value: unknown): number | undefined {
return typeof value === "number" && Number.isFinite(value) ? value : undefined;
}
function asObject(value: unknown): Record<string, unknown> | undefined {
return typeof value === "object" && value !== null && !Array.isArray(value)
? (value as Record<string, unknown>)
: undefined;
}
function normalizeMicrosoftProviderConfig(
rawConfig: Record<string, unknown>,
): MicrosoftProviderConfig {
const rawEdge = asObject(rawConfig.edge);
const rawMicrosoft = asObject(rawConfig.microsoft);
const raw = { ...(rawEdge ?? {}), ...(rawMicrosoft ?? {}) };
const outputFormat = trimToUndefined(raw.outputFormat);
return {
enabled: asBoolean(raw.enabled) ?? true,
voice: trimToUndefined(raw.voice) ?? DEFAULT_EDGE_VOICE,
lang: trimToUndefined(raw.lang) ?? DEFAULT_EDGE_LANG,
outputFormat: outputFormat ?? DEFAULT_EDGE_OUTPUT_FORMAT,
outputFormatConfigured: Boolean(outputFormat),
pitch: trimToUndefined(raw.pitch),
rate: trimToUndefined(raw.rate),
volume: trimToUndefined(raw.volume),
saveSubtitles: asBoolean(raw.saveSubtitles) ?? false,
proxy: trimToUndefined(raw.proxy),
timeoutMs: asNumber(raw.timeoutMs),
};
}
function readMicrosoftProviderConfig(config: SpeechProviderConfig): MicrosoftProviderConfig {
const defaults = normalizeMicrosoftProviderConfig({});
return {
enabled: asBoolean(config.enabled) ?? defaults.enabled,
voice: trimToUndefined(config.voice) ?? defaults.voice,
lang: trimToUndefined(config.lang) ?? defaults.lang,
outputFormat: trimToUndefined(config.outputFormat) ?? defaults.outputFormat,
outputFormatConfigured:
asBoolean(config.outputFormatConfigured) ?? defaults.outputFormatConfigured,
pitch: trimToUndefined(config.pitch) ?? defaults.pitch,
rate: trimToUndefined(config.rate) ?? defaults.rate,
volume: trimToUndefined(config.volume) ?? defaults.volume,
saveSubtitles: asBoolean(config.saveSubtitles) ?? defaults.saveSubtitles,
proxy: trimToUndefined(config.proxy) ?? defaults.proxy,
timeoutMs: asNumber(config.timeoutMs) ?? defaults.timeoutMs,
};
}
function buildMicrosoftVoiceHeaders(): Record<string, string> {
const major = CHROMIUM_FULL_VERSION.split(".")[0] || "0";
return {
@@ -77,13 +154,57 @@ export function buildMicrosoftSpeechProvider(): SpeechProviderPlugin {
id: "microsoft",
label: "Microsoft",
aliases: ["edge"],
autoSelectOrder: 30,
resolveConfig: ({ rawConfig }) => normalizeMicrosoftProviderConfig(rawConfig),
resolveTalkConfig: ({ baseTtsConfig, talkProviderConfig }) => {
const base = normalizeMicrosoftProviderConfig(baseTtsConfig);
return {
...base,
enabled: true,
...(trimToUndefined(talkProviderConfig.voiceId) == null
? {}
: { voice: trimToUndefined(talkProviderConfig.voiceId) }),
...(trimToUndefined(talkProviderConfig.languageCode) == null
? {}
: { lang: trimToUndefined(talkProviderConfig.languageCode) }),
...(trimToUndefined(talkProviderConfig.outputFormat) == null
? {}
: { outputFormat: trimToUndefined(talkProviderConfig.outputFormat) }),
...(trimToUndefined(talkProviderConfig.pitch) == null
? {}
: { pitch: trimToUndefined(talkProviderConfig.pitch) }),
...(trimToUndefined(talkProviderConfig.rate) == null
? {}
: { rate: trimToUndefined(talkProviderConfig.rate) }),
...(trimToUndefined(talkProviderConfig.volume) == null
? {}
: { volume: trimToUndefined(talkProviderConfig.volume) }),
...(trimToUndefined(talkProviderConfig.proxy) == null
? {}
: { proxy: trimToUndefined(talkProviderConfig.proxy) }),
...(asNumber(talkProviderConfig.timeoutMs) == null
? {}
: { timeoutMs: asNumber(talkProviderConfig.timeoutMs) }),
};
},
resolveTalkOverrides: ({ params }) => ({
...(trimToUndefined(params.voiceId) == null
? {}
: { voice: trimToUndefined(params.voiceId) }),
...(trimToUndefined(params.outputFormat) == null
? {}
: { outputFormat: trimToUndefined(params.outputFormat) }),
}),
listVoices: async () => await listMicrosoftVoices(),
isConfigured: ({ config }) => config.edge.enabled,
isConfigured: ({ providerConfig }) => readMicrosoftProviderConfig(providerConfig).enabled,
synthesize: async (req) => {
const config = readMicrosoftProviderConfig(req.providerConfig);
const tempRoot = resolvePreferredOpenClawTmpDir();
mkdirSync(tempRoot, { recursive: true, mode: 0o700 });
const tempDir = mkdtempSync(path.join(tempRoot, "tts-microsoft-"));
let outputFormat = req.overrides?.microsoft?.outputFormat ?? req.config.edge.outputFormat;
const overrideVoice = trimToUndefined(req.providerOverrides?.voice);
let outputFormat =
trimToUndefined(req.providerOverrides?.outputFormat) ?? config.outputFormat;
const fallbackOutputFormat =
outputFormat !== DEFAULT_EDGE_OUTPUT_FORMAT ? DEFAULT_EDGE_OUTPUT_FORMAT : undefined;
@@ -95,11 +216,11 @@ export function buildMicrosoftSpeechProvider(): SpeechProviderPlugin {
text: req.text,
outputPath,
config: {
...req.config.edge,
voice: req.overrides?.microsoft?.voice ?? req.config.edge.voice,
...config,
voice: overrideVoice ?? config.voice,
outputFormat: format,
},
timeoutMs: req.config.timeoutMs,
timeoutMs: req.timeoutMs,
});
const audioBuffer = readFileSync(outputPath);
return {
@@ -112,9 +233,9 @@ export function buildMicrosoftSpeechProvider(): SpeechProviderPlugin {
try {
return await runEdge(outputFormat);
} catch (err) {
} catch (error) {
if (!fallbackOutputFormat || fallbackOutputFormat === outputFormat) {
throw err;
throw error;
}
outputFormat = fallbackOutputFormat;
return await runEdge(outputFormat);

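The merged edge/microsoft handling above keeps the legacy edge block working while letting a microsoft block win on conflicts, since it is spread last. A minimal sketch of the expected resolution, again assuming the local helper were reachable from a test:

// Hypothetical check against normalizeMicrosoftProviderConfig as written above.
const resolved = normalizeMicrosoftProviderConfig({
  edge: { voice: "en-US-AriaNeural", enabled: false },
  microsoft: { voice: "en-US-MichelleNeural" },
});
// resolved.voice                  -> "en-US-MichelleNeural" (microsoft wins over edge)
// resolved.enabled                -> false (legacy edge flag still honored)
// resolved.outputFormat           -> "audio-24khz-48kbitrate-mono-mp3" (default)
// resolved.outputFormatConfigured -> false (no explicit outputFormat provided)
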
View File

@@ -137,32 +137,14 @@ function createLiveTtsConfig(): ResolvedTtsConfig {
allowNormalization: true,
allowSeed: true,
},
elevenlabs: {
baseUrl: "https://api.elevenlabs.io",
voiceId: "",
modelId: "eleven_multilingual_v2",
voiceSettings: {
stability: 0.5,
similarityBoost: 0.75,
style: 0,
useSpeakerBoost: true,
speed: 1,
providerConfigs: {
openai: {
apiKey: OPENAI_API_KEY,
baseUrl: "https://api.openai.com/v1",
model: "gpt-4o-mini-tts",
voice: "alloy",
},
},
openai: {
apiKey: OPENAI_API_KEY,
baseUrl: "https://api.openai.com/v1",
model: "gpt-4o-mini-tts",
voice: "alloy",
},
edge: {
enabled: false,
voice: "en-US-AriaNeural",
lang: "en-US",
outputFormat: "audio-24khz-48kbitrate-mono-mp3",
outputFormatConfigured: false,
saveSubtitles: false,
},
maxTextLength: 4_000,
timeoutMs: 30_000,
};
@@ -358,8 +340,9 @@ describeLive("openai plugin live", () => {
const audioFile = await speechProvider.synthesize({
text: "OpenClaw integration test OK.",
cfg,
config: ttsConfig,
providerConfig: ttsConfig.providerConfigs.openai ?? {},
target: "audio-file",
timeoutMs: ttsConfig.timeoutMs,
});
expect(audioFile.outputFormat).toBe("mp3");
expect(audioFile.fileExtension).toBe(".mp3");
@@ -368,7 +351,8 @@ describeLive("openai plugin live", () => {
const telephony = await speechProvider.synthesizeTelephony?.({
text: "Telephony check OK.",
cfg,
config: ttsConfig,
providerConfig: ttsConfig.providerConfigs.openai ?? {},
timeoutMs: ttsConfig.timeoutMs,
});
expect(telephony?.outputFormat).toBe("pcm");
expect(telephony?.sampleRate).toBe(24_000);
@@ -386,8 +370,9 @@ describeLive("openai plugin live", () => {
const synthesized = await speechProvider.synthesize({
text: "OpenClaw integration test OK.",
cfg,
config: ttsConfig,
providerConfig: ttsConfig.providerConfigs.openai ?? {},
target: "audio-file",
timeoutMs: ttsConfig.timeoutMs,
});
const transcription = await mediaProvider.transcribeAudio?.({

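The test changes above capture the new call contract: the resolved TTS config no longer exposes provider blocks at the top level; each plugin receives its own slice as providerConfig, and timeoutMs is passed per call. A condensed sketch of the reshaped fixture (OPENAI_API_KEY and cfg come from the surrounding test setup and are assumptions here):

// Condensed, illustrative version of the fixture and call above.
const ttsConfig = {
  providerConfigs: {
    openai: {
      apiKey: OPENAI_API_KEY,
      baseUrl: "https://api.openai.com/v1",
      model: "gpt-4o-mini-tts",
      voice: "alloy",
    },
  },
  maxTextLength: 4_000,
  timeoutMs: 30_000,
};
await speechProvider.synthesize({
  text: "OpenClaw integration test OK.",
  cfg,
  providerConfig: ttsConfig.providerConfigs.openai ?? {},
  target: "audio-file",
  timeoutMs: ttsConfig.timeoutMs,
});
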
View File

@@ -1,16 +1,181 @@
import type { SpeechProviderPlugin } from "openclaw/plugin-sdk/core";
import { OPENAI_TTS_MODELS, OPENAI_TTS_VOICES, openaiTTS } from "./tts.js";
import { normalizeResolvedSecretInputString } from "openclaw/plugin-sdk/secret-input";
import type {
SpeechDirectiveTokenParseContext,
SpeechProviderConfig,
SpeechProviderOverrides,
SpeechProviderPlugin,
} from "openclaw/plugin-sdk/speech-core";
import {
DEFAULT_OPENAI_BASE_URL,
isValidOpenAIModel,
isValidOpenAIVoice,
normalizeOpenAITtsBaseUrl,
OPENAI_TTS_MODELS,
OPENAI_TTS_VOICES,
openaiTTS,
} from "./tts.js";
type OpenAITtsProviderConfig = {
apiKey?: string;
baseUrl: string;
model: string;
voice: string;
speed?: number;
instructions?: string;
};
type OpenAITtsProviderOverrides = {
model?: string;
voice?: string;
speed?: number;
};
function trimToUndefined(value: unknown): string | undefined {
return typeof value === "string" && value.trim() ? value.trim() : undefined;
}
function asNumber(value: unknown): number | undefined {
return typeof value === "number" && Number.isFinite(value) ? value : undefined;
}
function asObject(value: unknown): Record<string, unknown> | undefined {
return typeof value === "object" && value !== null && !Array.isArray(value)
? (value as Record<string, unknown>)
: undefined;
}
function normalizeOpenAIProviderConfig(
rawConfig: Record<string, unknown>,
): OpenAITtsProviderConfig {
const raw = asObject(rawConfig.openai);
return {
apiKey: normalizeResolvedSecretInputString({
value: raw?.apiKey,
path: "messages.tts.openai.apiKey",
}),
baseUrl: normalizeOpenAITtsBaseUrl(
trimToUndefined(raw?.baseUrl) ??
trimToUndefined(process.env.OPENAI_TTS_BASE_URL) ??
DEFAULT_OPENAI_BASE_URL,
),
model: trimToUndefined(raw?.model) ?? "gpt-4o-mini-tts",
voice: trimToUndefined(raw?.voice) ?? "coral",
speed: asNumber(raw?.speed),
instructions: trimToUndefined(raw?.instructions),
};
}
function readOpenAIProviderConfig(config: SpeechProviderConfig): OpenAITtsProviderConfig {
const normalized = normalizeOpenAIProviderConfig({});
return {
apiKey: trimToUndefined(config.apiKey) ?? normalized.apiKey,
baseUrl: trimToUndefined(config.baseUrl) ?? normalized.baseUrl,
model: trimToUndefined(config.model) ?? normalized.model,
voice: trimToUndefined(config.voice) ?? normalized.voice,
speed: asNumber(config.speed) ?? normalized.speed,
instructions: trimToUndefined(config.instructions) ?? normalized.instructions,
};
}
function readOpenAIOverrides(
overrides: SpeechProviderOverrides | undefined,
): OpenAITtsProviderOverrides {
if (!overrides) {
return {};
}
return {
model: trimToUndefined(overrides.model),
voice: trimToUndefined(overrides.voice),
speed: asNumber(overrides.speed),
};
}
function parseDirectiveToken(ctx: SpeechDirectiveTokenParseContext): {
handled: boolean;
overrides?: SpeechProviderOverrides;
warnings?: string[];
} {
const baseUrl = trimToUndefined(ctx.providerConfig?.baseUrl);
switch (ctx.key) {
case "voice":
case "openai_voice":
case "openaivoice":
if (!ctx.policy.allowVoice) {
return { handled: true };
}
if (!isValidOpenAIVoice(ctx.value, baseUrl)) {
return { handled: true, warnings: [`invalid OpenAI voice "${ctx.value}"`] };
}
return { handled: true, overrides: { voice: ctx.value } };
case "model":
case "openai_model":
case "openaimodel":
if (!ctx.policy.allowModelId) {
return { handled: true };
}
if (!isValidOpenAIModel(ctx.value, baseUrl)) {
return { handled: false };
}
return { handled: true, overrides: { model: ctx.value } };
default:
return { handled: false };
}
}
export function buildOpenAISpeechProvider(): SpeechProviderPlugin {
return {
id: "openai",
label: "OpenAI",
autoSelectOrder: 10,
models: OPENAI_TTS_MODELS,
voices: OPENAI_TTS_VOICES,
resolveConfig: ({ rawConfig }) => normalizeOpenAIProviderConfig(rawConfig),
parseDirectiveToken,
resolveTalkConfig: ({ baseTtsConfig, talkProviderConfig }) => {
const base = normalizeOpenAIProviderConfig(baseTtsConfig);
return {
...base,
...(talkProviderConfig.apiKey === undefined
? {}
: {
apiKey: normalizeResolvedSecretInputString({
value: talkProviderConfig.apiKey,
path: "talk.providers.openai.apiKey",
}),
}),
...(trimToUndefined(talkProviderConfig.baseUrl) == null
? {}
: { baseUrl: trimToUndefined(talkProviderConfig.baseUrl) }),
...(trimToUndefined(talkProviderConfig.modelId) == null
? {}
: { model: trimToUndefined(talkProviderConfig.modelId) }),
...(trimToUndefined(talkProviderConfig.voiceId) == null
? {}
: { voice: trimToUndefined(talkProviderConfig.voiceId) }),
...(asNumber(talkProviderConfig.speed) == null
? {}
: { speed: asNumber(talkProviderConfig.speed) }),
...(trimToUndefined(talkProviderConfig.instructions) == null
? {}
: { instructions: trimToUndefined(talkProviderConfig.instructions) }),
};
},
resolveTalkOverrides: ({ params }) => ({
...(trimToUndefined(params.voiceId) == null
? {}
: { voice: trimToUndefined(params.voiceId) }),
...(trimToUndefined(params.modelId) == null
? {}
: { model: trimToUndefined(params.modelId) }),
...(asNumber(params.speed) == null ? {} : { speed: asNumber(params.speed) }),
}),
listVoices: async () => OPENAI_TTS_VOICES.map((voice) => ({ id: voice, name: voice })),
isConfigured: ({ config }) => Boolean(config.openai.apiKey || process.env.OPENAI_API_KEY),
isConfigured: ({ providerConfig }) =>
Boolean(readOpenAIProviderConfig(providerConfig).apiKey || process.env.OPENAI_API_KEY),
synthesize: async (req) => {
const apiKey = req.config.openai.apiKey || process.env.OPENAI_API_KEY;
const config = readOpenAIProviderConfig(req.providerConfig);
const overrides = readOpenAIOverrides(req.providerOverrides);
const apiKey = config.apiKey || process.env.OPENAI_API_KEY;
if (!apiKey) {
throw new Error("OpenAI API key missing");
}
@@ -18,13 +183,13 @@ export function buildOpenAISpeechProvider(): SpeechProviderPlugin {
const audioBuffer = await openaiTTS({
text: req.text,
apiKey,
baseUrl: req.config.openai.baseUrl,
model: req.overrides?.openai?.model ?? req.config.openai.model,
voice: req.overrides?.openai?.voice ?? req.config.openai.voice,
speed: req.overrides?.openai?.speed ?? req.config.openai.speed,
instructions: req.config.openai.instructions,
baseUrl: config.baseUrl,
model: overrides.model ?? config.model,
voice: overrides.voice ?? config.voice,
speed: overrides.speed ?? config.speed,
instructions: config.instructions,
responseFormat,
timeoutMs: req.config.timeoutMs,
timeoutMs: req.timeoutMs,
});
return {
audioBuffer,
@@ -34,7 +199,8 @@ export function buildOpenAISpeechProvider(): SpeechProviderPlugin {
};
},
synthesizeTelephony: async (req) => {
const apiKey = req.config.openai.apiKey || process.env.OPENAI_API_KEY;
const config = readOpenAIProviderConfig(req.providerConfig);
const apiKey = config.apiKey || process.env.OPENAI_API_KEY;
if (!apiKey) {
throw new Error("OpenAI API key missing");
}
@@ -43,13 +209,13 @@ export function buildOpenAISpeechProvider(): SpeechProviderPlugin {
const audioBuffer = await openaiTTS({
text: req.text,
apiKey,
baseUrl: req.config.openai.baseUrl,
model: req.config.openai.model,
voice: req.config.openai.voice,
speed: req.config.openai.speed,
instructions: req.config.openai.instructions,
baseUrl: config.baseUrl,
model: config.model,
voice: config.voice,
speed: config.speed,
instructions: config.instructions,
responseFormat: outputFormat,
timeoutMs: req.config.timeoutMs,
timeoutMs: req.timeoutMs,
});
return { audioBuffer, outputFormat, sampleRate };
},

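For reference, a sketch of what the OpenAI parseDirectiveToken above returns for a typical directive token. The context object is abbreviated (a real SpeechDirectiveTokenParseContext carries more fields), so the cast is an illustration-only assumption:

// Illustrative only; parseDirectiveToken is module-local in the diff.
const result = parseDirectiveToken({
  key: "voice",
  value: "alloy",
  policy: { allowVoice: true, allowModelId: true },
  providerConfig: { baseUrl: "https://api.openai.com/v1" },
} as unknown as SpeechDirectiveTokenParseContext);
// result -> { handled: true, overrides: { voice: "alloy" } }
// An unknown voice against the default endpoint yields
// { handled: true, warnings: ['invalid OpenAI voice "..."'] }, and an unknown
// model returns { handled: false } so other parsers can try the token.
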
View File

@@ -1,4 +1,4 @@
const DEFAULT_OPENAI_BASE_URL = "https://api.openai.com/v1";
export const DEFAULT_OPENAI_BASE_URL = "https://api.openai.com/v1";
export const OPENAI_TTS_MODELS = ["gpt-4o-mini-tts", "tts-1", "tts-1-hd"] as const;
@@ -21,7 +21,7 @@ export const OPENAI_TTS_VOICES = [
type OpenAiTtsVoice = (typeof OPENAI_TTS_VOICES)[number];
function normalizeOpenAITtsBaseUrl(baseUrl?: string): string {
export function normalizeOpenAITtsBaseUrl(baseUrl?: string): string {
const trimmed = baseUrl?.trim();
if (!trimmed) {
return DEFAULT_OPENAI_BASE_URL;
@@ -36,21 +36,24 @@ function isCustomOpenAIEndpoint(baseUrl?: string): boolean {
return normalizeOpenAITtsBaseUrl(process.env.OPENAI_TTS_BASE_URL) !== DEFAULT_OPENAI_BASE_URL;
}
function isValidOpenAIModel(model: string, baseUrl?: string): boolean {
export function isValidOpenAIModel(model: string, baseUrl?: string): boolean {
if (isCustomOpenAIEndpoint(baseUrl)) {
return true;
}
return OPENAI_TTS_MODELS.includes(model as (typeof OPENAI_TTS_MODELS)[number]);
}
function isValidOpenAIVoice(voice: string, baseUrl?: string): voice is OpenAiTtsVoice {
export function isValidOpenAIVoice(voice: string, baseUrl?: string): voice is OpenAiTtsVoice {
if (isCustomOpenAIEndpoint(baseUrl)) {
return true;
}
return OPENAI_TTS_VOICES.includes(voice as OpenAiTtsVoice);
}
function resolveOpenAITtsInstructions(model: string, instructions?: string): string | undefined {
export function resolveOpenAITtsInstructions(
model: string,
instructions?: string,
): string | undefined {
const next = instructions?.trim();
return next && model.includes("gpt-4o-mini-tts") ? next : undefined;
}

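These helpers are now exported so the OpenAI speech plugin can import them alongside openaiTTS. A brief sketch of the behavior visible in the lines shown (env-variable behavior follows the branch shown in isCustomOpenAIEndpoint):

// Based only on the code above; results assume OPENAI_TTS_BASE_URL is unset.
normalizeOpenAITtsBaseUrl(undefined); // -> DEFAULT_OPENAI_BASE_URL ("https://api.openai.com/v1")
isValidOpenAIModel("tts-1");          // -> true (listed in OPENAI_TTS_MODELS)
// With OPENAI_TTS_BASE_URL pointing at a non-default endpoint, the allow-list
// check is skipped and any model or voice string is accepted.
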
View File

@@ -1,4 +1,3 @@
import { resolveOpenAITtsInstructions } from "../../api.js";
import { convertPcmToMulaw8k } from "../telephony-audio.js";
/**
@@ -72,6 +71,11 @@ function trimToUndefined(value: string | undefined): string | undefined {
return trimmed ? trimmed : undefined;
}
function resolveOpenAITtsInstructions(model: string, instructions?: string): string | undefined {
const next = trimToUndefined(instructions);
return next && model.includes("gpt-4o-mini-tts") ? next : undefined;
}
/**
* OpenAI TTS Provider for generating speech audio.
*/
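The hunk above inlines resolveOpenAITtsInstructions into this module instead of importing it from ../../api.js. Per the body shown, instructions are only forwarded for gpt-4o-mini-tts-style models; illustrative calls:

// Expected results from the function body above (illustrative values).
resolveOpenAITtsInstructions("gpt-4o-mini-tts", "  Speak warmly.  "); // -> "Speak warmly."
resolveOpenAITtsInstructions("tts-1-hd", "Speak warmly.");            // -> undefined (model ignores instructions)
resolveOpenAITtsInstructions("gpt-4o-mini-tts", "   ");               // -> undefined (blank after trim)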