mirror of
https://github.com/openclaw/openclaw.git
synced 2026-04-28 01:21:36 +00:00
feat(plugins): add speech provider registration
This commit is contained in:
14
extensions/elevenlabs/index.ts
Normal file
14
extensions/elevenlabs/index.ts
Normal file
@@ -0,0 +1,14 @@
|
||||
import { emptyPluginConfigSchema, type OpenClawPluginApi } from "openclaw/plugin-sdk/core";
|
||||
import { buildElevenLabsSpeechProvider } from "../../src/tts/providers/elevenlabs.js";
|
||||
|
||||
const elevenLabsPlugin = {
|
||||
id: "elevenlabs",
|
||||
name: "ElevenLabs Speech",
|
||||
description: "Bundled ElevenLabs speech provider",
|
||||
configSchema: emptyPluginConfigSchema(),
|
||||
register(api: OpenClawPluginApi) {
|
||||
api.registerSpeechProvider(buildElevenLabsSpeechProvider());
|
||||
},
|
||||
};
|
||||
|
||||
export default elevenLabsPlugin;
|
||||
8
extensions/elevenlabs/openclaw.plugin.json
Normal file
8
extensions/elevenlabs/openclaw.plugin.json
Normal file
@@ -0,0 +1,8 @@
|
||||
{
|
||||
"id": "elevenlabs",
|
||||
"configSchema": {
|
||||
"type": "object",
|
||||
"additionalProperties": false,
|
||||
"properties": {}
|
||||
}
|
||||
}
|
||||
12
extensions/elevenlabs/package.json
Normal file
12
extensions/elevenlabs/package.json
Normal file
@@ -0,0 +1,12 @@
|
||||
{
|
||||
"name": "@openclaw/elevenlabs-speech",
|
||||
"version": "2026.3.14",
|
||||
"private": true,
|
||||
"description": "OpenClaw ElevenLabs speech plugin",
|
||||
"type": "module",
|
||||
"openclaw": {
|
||||
"extensions": [
|
||||
"./index.ts"
|
||||
]
|
||||
}
|
||||
}
|
||||
@@ -44,6 +44,7 @@ function fakeApi(overrides: Partial<OpenClawPluginApi> = {}): OpenClawPluginApi
|
||||
registerCli() {},
|
||||
registerService() {},
|
||||
registerProvider() {},
|
||||
registerSpeechProvider() {},
|
||||
registerWebSearchProvider() {},
|
||||
registerInteractiveHandler() {},
|
||||
registerHook() {},
|
||||
|
||||
14
extensions/microsoft/index.ts
Normal file
14
extensions/microsoft/index.ts
Normal file
@@ -0,0 +1,14 @@
|
||||
import { emptyPluginConfigSchema, type OpenClawPluginApi } from "openclaw/plugin-sdk/core";
|
||||
import { buildMicrosoftSpeechProvider } from "../../src/tts/providers/microsoft.js";
|
||||
|
||||
const microsoftPlugin = {
|
||||
id: "microsoft",
|
||||
name: "Microsoft Speech",
|
||||
description: "Bundled Microsoft speech provider",
|
||||
configSchema: emptyPluginConfigSchema(),
|
||||
register(api: OpenClawPluginApi) {
|
||||
api.registerSpeechProvider(buildMicrosoftSpeechProvider());
|
||||
},
|
||||
};
|
||||
|
||||
export default microsoftPlugin;
|
||||
8
extensions/microsoft/openclaw.plugin.json
Normal file
8
extensions/microsoft/openclaw.plugin.json
Normal file
@@ -0,0 +1,8 @@
|
||||
{
|
||||
"id": "microsoft",
|
||||
"configSchema": {
|
||||
"type": "object",
|
||||
"additionalProperties": false,
|
||||
"properties": {}
|
||||
}
|
||||
}
|
||||
12
extensions/microsoft/package.json
Normal file
12
extensions/microsoft/package.json
Normal file
@@ -0,0 +1,12 @@
|
||||
{
|
||||
"name": "@openclaw/microsoft-speech",
|
||||
"version": "2026.3.14",
|
||||
"private": true,
|
||||
"description": "OpenClaw Microsoft speech plugin",
|
||||
"type": "module",
|
||||
"openclaw": {
|
||||
"extensions": [
|
||||
"./index.ts"
|
||||
]
|
||||
}
|
||||
}
|
||||
@@ -1,4 +1,5 @@
|
||||
import { emptyPluginConfigSchema, type OpenClawPluginApi } from "openclaw/plugin-sdk/core";
|
||||
import { buildOpenAISpeechProvider } from "../../src/tts/providers/openai.js";
|
||||
import { buildOpenAICodexProviderPlugin } from "./openai-codex-provider.js";
|
||||
import { buildOpenAIProvider } from "./openai-provider.js";
|
||||
|
||||
@@ -10,6 +11,7 @@ const openAIPlugin = {
|
||||
register(api: OpenClawPluginApi) {
|
||||
api.registerProvider(buildOpenAIProvider());
|
||||
api.registerProvider(buildOpenAICodexProviderPlugin());
|
||||
api.registerSpeechProvider(buildOpenAISpeechProvider());
|
||||
},
|
||||
};
|
||||
|
||||
|
||||
@@ -15,6 +15,7 @@ export function createTestPluginApi(api: TestPluginApiInput): OpenClawPluginApi
|
||||
registerCli() {},
|
||||
registerService() {},
|
||||
registerProvider() {},
|
||||
registerSpeechProvider() {},
|
||||
registerWebSearchProvider() {},
|
||||
registerInteractiveHandler() {},
|
||||
registerCommand() {},
|
||||
|
||||
@@ -80,7 +80,7 @@ const voiceCallConfigSchema = {
|
||||
"streaming.streamPath": { label: "Media Stream Path", advanced: true },
|
||||
"tts.provider": {
|
||||
label: "TTS Provider Override",
|
||||
help: "Deep-merges with messages.tts (Edge is ignored for calls).",
|
||||
help: "Deep-merges with messages.tts (Microsoft is ignored for calls).",
|
||||
advanced: true,
|
||||
},
|
||||
"tts.openai.model": { label: "OpenAI TTS Model", advanced: true },
|
||||
|
||||
@@ -101,7 +101,7 @@
|
||||
},
|
||||
"tts.provider": {
|
||||
"label": "TTS Provider Override",
|
||||
"help": "Deep-merges with messages.tts (Edge is ignored for calls).",
|
||||
"help": "Deep-merges with messages.tts (Microsoft is ignored for calls).",
|
||||
"advanced": true
|
||||
},
|
||||
"tts.openai.model": {
|
||||
@@ -420,8 +420,7 @@
|
||||
"enum": ["final", "all"]
|
||||
},
|
||||
"provider": {
|
||||
"type": "string",
|
||||
"enum": ["openai", "elevenlabs", "edge"]
|
||||
"type": "string"
|
||||
},
|
||||
"summaryModel": {
|
||||
"type": "string"
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
import { logVerbose } from "../../globals.js";
|
||||
import { listSpeechProviders, normalizeSpeechProviderId } from "../../tts/provider-registry.js";
|
||||
import {
|
||||
getLastTtsAttempt,
|
||||
getTtsMaxLength,
|
||||
@@ -54,7 +55,7 @@ function ttsUsage(): ReplyPayload {
|
||||
`• /tts summary [on|off] — View/change auto-summary\n` +
|
||||
`• /tts audio <text> — Generate audio from text\n\n` +
|
||||
`**Providers:**\n` +
|
||||
`• edge — Free, fast (default)\n` +
|
||||
`• microsoft — Microsoft Edge-backed speech (default fallback)\n` +
|
||||
`• openai — High quality (requires API key)\n` +
|
||||
`• elevenlabs — Premium voices (requires API key)\n\n` +
|
||||
`**Text Limit (default: 1500, max: 4096):**\n` +
|
||||
@@ -62,7 +63,7 @@ function ttsUsage(): ReplyPayload {
|
||||
`• Summary ON: AI summarizes, then generates audio\n` +
|
||||
`• Summary OFF: Truncates text, then generates audio\n\n` +
|
||||
`**Examples:**\n` +
|
||||
`/tts provider edge\n` +
|
||||
`/tts provider microsoft\n` +
|
||||
`/tts limit 2000\n` +
|
||||
`/tts audio Hello, this is a test!`,
|
||||
};
|
||||
@@ -161,7 +162,7 @@ export const handleTtsCommands: CommandHandler = async (params, allowTextCommand
|
||||
if (!args.trim()) {
|
||||
const hasOpenAI = Boolean(resolveTtsApiKey(config, "openai"));
|
||||
const hasElevenLabs = Boolean(resolveTtsApiKey(config, "elevenlabs"));
|
||||
const hasEdge = isTtsProviderConfigured(config, "edge");
|
||||
const hasMicrosoft = isTtsProviderConfigured(config, "microsoft", params.cfg);
|
||||
return {
|
||||
shouldContinue: false,
|
||||
reply: {
|
||||
@@ -170,21 +171,23 @@ export const handleTtsCommands: CommandHandler = async (params, allowTextCommand
|
||||
`Primary: ${currentProvider}\n` +
|
||||
`OpenAI key: ${hasOpenAI ? "✅" : "❌"}\n` +
|
||||
`ElevenLabs key: ${hasElevenLabs ? "✅" : "❌"}\n` +
|
||||
`Edge enabled: ${hasEdge ? "✅" : "❌"}\n` +
|
||||
`Usage: /tts provider openai | elevenlabs | edge`,
|
||||
`Microsoft enabled: ${hasMicrosoft ? "✅" : "❌"}\n` +
|
||||
`Usage: /tts provider openai | elevenlabs | microsoft`,
|
||||
},
|
||||
};
|
||||
}
|
||||
|
||||
const requested = args.trim().toLowerCase();
|
||||
if (requested !== "openai" && requested !== "elevenlabs" && requested !== "edge") {
|
||||
const knownProviders = new Set(listSpeechProviders(params.cfg).map((provider) => provider.id));
|
||||
if (requested !== "edge" && !knownProviders.has(requested)) {
|
||||
return { shouldContinue: false, reply: ttsUsage() };
|
||||
}
|
||||
|
||||
const nextProvider = normalizeSpeechProviderId(requested) ?? requested;
|
||||
setTtsProvider(prefsPath, requested);
|
||||
return {
|
||||
shouldContinue: false,
|
||||
reply: { text: `✅ TTS provider set to ${requested}.` },
|
||||
reply: { text: `✅ TTS provider set to ${nextProvider}.` },
|
||||
};
|
||||
}
|
||||
|
||||
@@ -249,7 +252,7 @@ export const handleTtsCommands: CommandHandler = async (params, allowTextCommand
|
||||
if (action === "status") {
|
||||
const enabled = isTtsEnabled(config, prefsPath);
|
||||
const provider = getTtsProvider(config, prefsPath);
|
||||
const hasKey = isTtsProviderConfigured(config, provider);
|
||||
const hasKey = isTtsProviderConfigured(config, provider, params.cfg);
|
||||
const maxLength = getTtsMaxLength(prefsPath);
|
||||
const summarize = isSummarizationEnabled(prefsPath);
|
||||
const last = getLastTtsAttempt();
|
||||
|
||||
@@ -91,6 +91,7 @@ const createRegistry = (channels: PluginRegistry["channels"]): PluginRegistry =>
|
||||
enabled: true,
|
||||
})),
|
||||
providers: [],
|
||||
speechProviders: [],
|
||||
webSearchProviders: [],
|
||||
gatewayHandlers: {},
|
||||
httpRoutes: [],
|
||||
|
||||
@@ -337,6 +337,7 @@ describe("ensureChannelSetupPluginInstalled", () => {
|
||||
hookNames: [],
|
||||
channelIds: [],
|
||||
providerIds: [],
|
||||
speechProviderIds: [],
|
||||
webSearchProviderIds: [],
|
||||
gatewayMethods: [],
|
||||
cliCommands: [],
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
import type { SecretInput } from "./types.secrets.js";
|
||||
|
||||
export type TtsProvider = "elevenlabs" | "openai" | "edge";
|
||||
export type TtsProvider = string;
|
||||
|
||||
export type TtsMode = "final" | "all";
|
||||
|
||||
@@ -66,9 +66,22 @@ export type TtsConfig = {
|
||||
/** System-level instructions for the TTS model (gpt-4o-mini-tts only). */
|
||||
instructions?: string;
|
||||
};
|
||||
/** Microsoft Edge (node-edge-tts) configuration. */
|
||||
/** Legacy alias for Microsoft speech configuration. */
|
||||
edge?: {
|
||||
/** Explicitly allow Edge TTS usage (no API key required). */
|
||||
/** Explicitly allow Microsoft speech usage (no API key required). */
|
||||
enabled?: boolean;
|
||||
voice?: string;
|
||||
lang?: string;
|
||||
outputFormat?: string;
|
||||
pitch?: string;
|
||||
rate?: string;
|
||||
volume?: string;
|
||||
saveSubtitles?: boolean;
|
||||
proxy?: string;
|
||||
timeoutMs?: number;
|
||||
};
|
||||
/** Preferred alias for Microsoft speech configuration. */
|
||||
microsoft?: {
|
||||
enabled?: boolean;
|
||||
voice?: string;
|
||||
lang?: string;
|
||||
|
||||
@@ -353,9 +353,24 @@ export const MarkdownConfigSchema = z
|
||||
.strict()
|
||||
.optional();
|
||||
|
||||
export const TtsProviderSchema = z.enum(["elevenlabs", "openai", "edge"]);
|
||||
export const TtsProviderSchema = z.string().min(1);
|
||||
export const TtsModeSchema = z.enum(["final", "all"]);
|
||||
export const TtsAutoSchema = z.enum(["off", "always", "inbound", "tagged"]);
|
||||
const TtsMicrosoftConfigSchema = z
|
||||
.object({
|
||||
enabled: z.boolean().optional(),
|
||||
voice: z.string().optional(),
|
||||
lang: z.string().optional(),
|
||||
outputFormat: z.string().optional(),
|
||||
pitch: z.string().optional(),
|
||||
rate: z.string().optional(),
|
||||
volume: z.string().optional(),
|
||||
saveSubtitles: z.boolean().optional(),
|
||||
proxy: z.string().optional(),
|
||||
timeoutMs: z.number().int().min(1000).max(120000).optional(),
|
||||
})
|
||||
.strict()
|
||||
.optional();
|
||||
export const TtsConfigSchema = z
|
||||
.object({
|
||||
auto: TtsAutoSchema.optional(),
|
||||
@@ -409,21 +424,8 @@ export const TtsConfigSchema = z
|
||||
})
|
||||
.strict()
|
||||
.optional(),
|
||||
edge: z
|
||||
.object({
|
||||
enabled: z.boolean().optional(),
|
||||
voice: z.string().optional(),
|
||||
lang: z.string().optional(),
|
||||
outputFormat: z.string().optional(),
|
||||
pitch: z.string().optional(),
|
||||
rate: z.string().optional(),
|
||||
volume: z.string().optional(),
|
||||
saveSubtitles: z.boolean().optional(),
|
||||
proxy: z.string().optional(),
|
||||
timeoutMs: z.number().int().min(1000).max(120000).optional(),
|
||||
})
|
||||
.strict()
|
||||
.optional(),
|
||||
edge: TtsMicrosoftConfigSchema,
|
||||
microsoft: TtsMicrosoftConfigSchema,
|
||||
prefsPath: z.string().optional(),
|
||||
maxTextLength: z.number().int().min(1).optional(),
|
||||
timeoutMs: z.number().int().min(1000).max(120000).optional(),
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
import { loadConfig } from "../../config/config.js";
|
||||
import { listSpeechProviders, normalizeSpeechProviderId } from "../../tts/provider-registry.js";
|
||||
import {
|
||||
OPENAI_TTS_MODELS,
|
||||
OPENAI_TTS_VOICES,
|
||||
@@ -26,9 +27,9 @@ export const ttsHandlers: GatewayRequestHandlers = {
|
||||
const prefsPath = resolveTtsPrefsPath(config);
|
||||
const provider = getTtsProvider(config, prefsPath);
|
||||
const autoMode = resolveTtsAutoMode({ config, prefsPath });
|
||||
const fallbackProviders = resolveTtsProviderOrder(provider)
|
||||
const fallbackProviders = resolveTtsProviderOrder(provider, cfg)
|
||||
.slice(1)
|
||||
.filter((candidate) => isTtsProviderConfigured(config, candidate));
|
||||
.filter((candidate) => isTtsProviderConfigured(config, candidate, cfg));
|
||||
respond(true, {
|
||||
enabled: isTtsEnabled(config, prefsPath),
|
||||
auto: autoMode,
|
||||
@@ -38,7 +39,7 @@ export const ttsHandlers: GatewayRequestHandlers = {
|
||||
prefsPath,
|
||||
hasOpenAIKey: Boolean(resolveTtsApiKey(config, "openai")),
|
||||
hasElevenLabsKey: Boolean(resolveTtsApiKey(config, "elevenlabs")),
|
||||
edgeEnabled: isTtsProviderConfigured(config, "edge"),
|
||||
microsoftEnabled: isTtsProviderConfigured(config, "microsoft", cfg),
|
||||
});
|
||||
} catch (err) {
|
||||
respond(false, undefined, errorShape(ErrorCodes.UNAVAILABLE, formatForLog(err)));
|
||||
@@ -99,20 +100,23 @@ export const ttsHandlers: GatewayRequestHandlers = {
|
||||
}
|
||||
},
|
||||
"tts.setProvider": async ({ params, respond }) => {
|
||||
const provider = typeof params.provider === "string" ? params.provider.trim() : "";
|
||||
if (provider !== "openai" && provider !== "elevenlabs" && provider !== "edge") {
|
||||
const provider = normalizeSpeechProviderId(
|
||||
typeof params.provider === "string" ? params.provider.trim() : "",
|
||||
);
|
||||
const cfg = loadConfig();
|
||||
const knownProviders = new Set(listSpeechProviders(cfg).map((entry) => entry.id));
|
||||
if (!provider || !knownProviders.has(provider)) {
|
||||
respond(
|
||||
false,
|
||||
undefined,
|
||||
errorShape(
|
||||
ErrorCodes.INVALID_REQUEST,
|
||||
"Invalid provider. Use openai, elevenlabs, or edge.",
|
||||
"Invalid provider. Use a registered TTS provider id such as openai, elevenlabs, or microsoft.",
|
||||
),
|
||||
);
|
||||
return;
|
||||
}
|
||||
try {
|
||||
const cfg = loadConfig();
|
||||
const config = resolveTtsConfig(cfg);
|
||||
const prefsPath = resolveTtsPrefsPath(config);
|
||||
setTtsProvider(prefsPath, provider);
|
||||
@@ -127,27 +131,19 @@ export const ttsHandlers: GatewayRequestHandlers = {
|
||||
const config = resolveTtsConfig(cfg);
|
||||
const prefsPath = resolveTtsPrefsPath(config);
|
||||
respond(true, {
|
||||
providers: [
|
||||
{
|
||||
id: "openai",
|
||||
name: "OpenAI",
|
||||
configured: Boolean(resolveTtsApiKey(config, "openai")),
|
||||
models: [...OPENAI_TTS_MODELS],
|
||||
voices: [...OPENAI_TTS_VOICES],
|
||||
},
|
||||
{
|
||||
id: "elevenlabs",
|
||||
name: "ElevenLabs",
|
||||
configured: Boolean(resolveTtsApiKey(config, "elevenlabs")),
|
||||
models: ["eleven_multilingual_v2", "eleven_turbo_v2_5", "eleven_monolingual_v1"],
|
||||
},
|
||||
{
|
||||
id: "edge",
|
||||
name: "Edge TTS",
|
||||
configured: isTtsProviderConfigured(config, "edge"),
|
||||
models: [],
|
||||
},
|
||||
],
|
||||
providers: listSpeechProviders(cfg).map((provider) => ({
|
||||
id: provider.id,
|
||||
name: provider.label,
|
||||
configured: provider.isConfigured({ cfg, config }),
|
||||
models:
|
||||
provider.id === "openai" && provider.models == null
|
||||
? [...OPENAI_TTS_MODELS]
|
||||
: [...(provider.models ?? [])],
|
||||
voices:
|
||||
provider.id === "openai" && provider.voices == null
|
||||
? [...OPENAI_TTS_VOICES]
|
||||
: [...(provider.voices ?? [])],
|
||||
})),
|
||||
active: getTtsProvider(config, prefsPath),
|
||||
});
|
||||
} catch (err) {
|
||||
|
||||
@@ -29,6 +29,7 @@ const createRegistry = (diagnostics: PluginDiagnostic[]): PluginRegistry => ({
|
||||
channelSetups: [],
|
||||
commands: [],
|
||||
providers: [],
|
||||
speechProviders: [],
|
||||
webSearchProviders: [],
|
||||
gatewayHandlers: {},
|
||||
httpRoutes: [],
|
||||
|
||||
@@ -1,25 +1,9 @@
|
||||
import { vi } from "vitest";
|
||||
import type { PluginRegistry } from "../plugins/registry.js";
|
||||
import { createEmptyPluginRegistry, type PluginRegistry } from "../plugins/registry.js";
|
||||
import { setActivePluginRegistry } from "../plugins/runtime.js";
|
||||
|
||||
export const registryState: { registry: PluginRegistry } = {
|
||||
registry: {
|
||||
plugins: [],
|
||||
tools: [],
|
||||
hooks: [],
|
||||
typedHooks: [],
|
||||
channels: [],
|
||||
channelSetups: [],
|
||||
providers: [],
|
||||
webSearchProviders: [],
|
||||
gatewayHandlers: {},
|
||||
httpHandlers: [],
|
||||
httpRoutes: [],
|
||||
cliRegistrars: [],
|
||||
services: [],
|
||||
commands: [],
|
||||
diagnostics: [],
|
||||
} as PluginRegistry,
|
||||
registry: createEmptyPluginRegistry(),
|
||||
};
|
||||
|
||||
export function setRegistry(registry: PluginRegistry) {
|
||||
|
||||
@@ -146,6 +146,7 @@ const createStubPluginRegistry = (): PluginRegistry => ({
|
||||
],
|
||||
channelSetups: [],
|
||||
providers: [],
|
||||
speechProviders: [],
|
||||
webSearchProviders: [],
|
||||
gatewayHandlers: {},
|
||||
httpRoutes: [],
|
||||
|
||||
@@ -21,6 +21,7 @@ export type {
|
||||
ProviderResolveDynamicModelContext,
|
||||
ProviderNormalizeResolvedModelContext,
|
||||
ProviderRuntimeModel,
|
||||
SpeechProviderPlugin,
|
||||
ProviderThinkingPolicyContext,
|
||||
ProviderWrapStreamFnContext,
|
||||
OpenClawPluginService,
|
||||
|
||||
@@ -140,6 +140,7 @@ export type {
|
||||
ProviderResolveDynamicModelContext,
|
||||
ProviderNormalizeResolvedModelContext,
|
||||
ProviderRuntimeModel,
|
||||
SpeechProviderPlugin,
|
||||
ProviderThinkingPolicyContext,
|
||||
ProviderWrapStreamFnContext,
|
||||
} from "../plugins/types.js";
|
||||
|
||||
@@ -494,6 +494,7 @@ function createPluginRecord(params: {
|
||||
hookNames: [],
|
||||
channelIds: [],
|
||||
providerIds: [],
|
||||
speechProviderIds: [],
|
||||
webSearchProviderIds: [],
|
||||
gatewayMethods: [],
|
||||
cliCommands: [],
|
||||
|
||||
@@ -46,6 +46,7 @@ import type {
|
||||
PluginHookName,
|
||||
PluginHookHandlerMap,
|
||||
PluginHookRegistration as TypedPluginHookRegistration,
|
||||
SpeechProviderPlugin,
|
||||
WebSearchProviderPlugin,
|
||||
} from "./types.js";
|
||||
|
||||
@@ -110,6 +111,14 @@ export type PluginWebSearchProviderRegistration = {
|
||||
rootDir?: string;
|
||||
};
|
||||
|
||||
export type PluginSpeechProviderRegistration = {
|
||||
pluginId: string;
|
||||
pluginName?: string;
|
||||
provider: SpeechProviderPlugin;
|
||||
source: string;
|
||||
rootDir?: string;
|
||||
};
|
||||
|
||||
export type PluginHookRegistration = {
|
||||
pluginId: string;
|
||||
entry: HookEntry;
|
||||
@@ -154,6 +163,7 @@ export type PluginRecord = {
|
||||
hookNames: string[];
|
||||
channelIds: string[];
|
||||
providerIds: string[];
|
||||
speechProviderIds: string[];
|
||||
webSearchProviderIds: string[];
|
||||
gatewayMethods: string[];
|
||||
cliCommands: string[];
|
||||
@@ -174,6 +184,7 @@ export type PluginRegistry = {
|
||||
channels: PluginChannelRegistration[];
|
||||
channelSetups: PluginChannelSetupRegistration[];
|
||||
providers: PluginProviderRegistration[];
|
||||
speechProviders: PluginSpeechProviderRegistration[];
|
||||
webSearchProviders: PluginWebSearchProviderRegistration[];
|
||||
gatewayHandlers: GatewayRequestHandlers;
|
||||
httpRoutes: PluginHttpRouteRegistration[];
|
||||
@@ -219,6 +230,7 @@ export function createEmptyPluginRegistry(): PluginRegistry {
|
||||
channels: [],
|
||||
channelSetups: [],
|
||||
providers: [],
|
||||
speechProviders: [],
|
||||
webSearchProviders: [],
|
||||
gatewayHandlers: {},
|
||||
httpRoutes: [],
|
||||
@@ -550,6 +562,37 @@ export function createPluginRegistry(registryParams: PluginRegistryParams) {
|
||||
});
|
||||
};
|
||||
|
||||
// Validates and records a plugin-contributed speech provider.
// Rejects (with an error diagnostic) providers whose id is blank or whose id
// collides with an already-registered speech provider; otherwise appends the
// provider to the shared registry and tracks its id on the plugin record.
const registerSpeechProvider = (record: PluginRecord, provider: SpeechProviderPlugin) => {
  const id = provider.id.trim();
  if (!id) {
    // A provider without an id cannot be looked up later — refuse it.
    pushDiagnostic({
      level: "error",
      pluginId: record.id,
      source: record.source,
      message: "speech provider registration missing id",
    });
    return;
  }
  // First registration wins; the duplicate is reported against the plugin
  // attempting the second registration.
  // NOTE(review): the comparison uses the trimmed id against the stored raw
  // provider.id — assumes registered ids are already trimmed; confirm.
  const existing = registry.speechProviders.find((entry) => entry.provider.id === id);
  if (existing) {
    pushDiagnostic({
      level: "error",
      pluginId: record.id,
      source: record.source,
      message: `speech provider already registered: ${id} (${existing.pluginId})`,
    });
    return;
  }
  record.speechProviderIds.push(id);
  registry.speechProviders.push({
    pluginId: record.id,
    pluginName: record.name,
    provider,
    source: record.source,
    rootDir: record.rootDir,
  });
};
|
||||
|
||||
const registerWebSearchProvider = (record: PluginRecord, provider: WebSearchProviderPlugin) => {
|
||||
const id = provider.id.trim();
|
||||
if (!id) {
|
||||
@@ -789,6 +832,10 @@ export function createPluginRegistry(registryParams: PluginRegistryParams) {
|
||||
registerChannel: (registration) => registerChannel(record, registration, registrationMode),
|
||||
registerProvider:
|
||||
registrationMode === "full" ? (provider) => registerProvider(record, provider) : () => {},
|
||||
registerSpeechProvider:
|
||||
registrationMode === "full"
|
||||
? (provider) => registerSpeechProvider(record, provider)
|
||||
: () => {},
|
||||
registerWebSearchProvider:
|
||||
registrationMode === "full"
|
||||
? (provider) => registerWebSearchProvider(record, provider)
|
||||
@@ -862,6 +909,7 @@ export function createPluginRegistry(registryParams: PluginRegistryParams) {
|
||||
registerTool,
|
||||
registerChannel,
|
||||
registerProvider,
|
||||
registerSpeechProvider,
|
||||
registerWebSearchProvider,
|
||||
registerGatewayMethod,
|
||||
registerCli,
|
||||
|
||||
@@ -27,6 +27,14 @@ import type { HookEntry } from "../hooks/types.js";
|
||||
import type { ProviderUsageSnapshot } from "../infra/provider-usage.types.js";
|
||||
import type { RuntimeEnv } from "../runtime.js";
|
||||
import type { RuntimeWebSearchMetadata } from "../secrets/runtime-web-tools.types.js";
|
||||
import type {
|
||||
SpeechProviderConfiguredContext,
|
||||
SpeechProviderId,
|
||||
SpeechSynthesisRequest,
|
||||
SpeechSynthesisResult,
|
||||
SpeechTelephonySynthesisRequest,
|
||||
SpeechTelephonySynthesisResult,
|
||||
} from "../tts/provider-types.js";
|
||||
import type { WizardPrompter } from "../wizard/prompts.js";
|
||||
import type { PluginRuntime } from "./runtime/types.js";
|
||||
|
||||
@@ -853,6 +861,23 @@ export type PluginWebSearchProviderEntry = WebSearchProviderPlugin & {
|
||||
pluginId: string;
|
||||
};
|
||||
|
||||
/**
 * Contract implemented by every speech (TTS) provider, whether built in or
 * contributed by a plugin.
 */
export type SpeechProviderPlugin = {
  /** Canonical provider id (e.g. "openai", "elevenlabs"). */
  id: SpeechProviderId;
  /** Human-readable display name. */
  label: string;
  /** Alternate ids that should resolve to this provider. */
  aliases?: string[];
  /** Model ids offered by the provider, when known. */
  models?: readonly string[];
  /** Voice ids offered by the provider, when known. */
  voices?: readonly string[];
  /** Reports whether the provider has the configuration/credentials it needs. */
  isConfigured: (ctx: SpeechProviderConfiguredContext) => boolean;
  /** Synthesizes speech for a chat-oriented target (audio file or voice note). */
  synthesize: (req: SpeechSynthesisRequest) => Promise<SpeechSynthesisResult>;
  /** Optional synthesis path for telephony; the result carries its sample rate. */
  synthesizeTelephony?: (
    req: SpeechTelephonySynthesisRequest,
  ) => Promise<SpeechTelephonySynthesisResult>;
};
|
||||
|
||||
export type PluginSpeechProviderEntry = SpeechProviderPlugin & {
|
||||
pluginId: string;
|
||||
};
|
||||
|
||||
export type OpenClawPluginGatewayMethod = {
|
||||
method: string;
|
||||
handler: GatewayRequestHandler;
|
||||
@@ -1211,6 +1236,7 @@ export type OpenClawPluginApi = {
|
||||
registerCli: (registrar: OpenClawPluginCliRegistrar, opts?: { commands?: string[] }) => void;
|
||||
registerService: (service: OpenClawPluginService) => void;
|
||||
registerProvider: (provider: ProviderPlugin) => void;
|
||||
registerSpeechProvider: (provider: SpeechProviderPlugin) => void;
|
||||
registerWebSearchProvider: (provider: WebSearchProviderPlugin) => void;
|
||||
registerInteractiveHandler: (registration: PluginInteractiveHandlerRegistration) => void;
|
||||
/**
|
||||
|
||||
@@ -26,6 +26,7 @@ export const createTestRegistry = (channels: TestChannelRegistration[] = []): Pl
|
||||
enabled: true,
|
||||
})),
|
||||
providers: [],
|
||||
speechProviders: [],
|
||||
webSearchProviders: [],
|
||||
gatewayHandlers: {},
|
||||
httpRoutes: [],
|
||||
|
||||
@@ -2,29 +2,36 @@ import type {
|
||||
AnyAgentTool,
|
||||
OpenClawPluginApi,
|
||||
ProviderPlugin,
|
||||
SpeechProviderPlugin,
|
||||
WebSearchProviderPlugin,
|
||||
} from "../plugins/types.js";
|
||||
|
||||
export type CapturedPluginRegistration = {
|
||||
api: OpenClawPluginApi;
|
||||
providers: ProviderPlugin[];
|
||||
speechProviders: SpeechProviderPlugin[];
|
||||
webSearchProviders: WebSearchProviderPlugin[];
|
||||
tools: AnyAgentTool[];
|
||||
};
|
||||
|
||||
export function createCapturedPluginRegistration(): CapturedPluginRegistration {
|
||||
const providers: ProviderPlugin[] = [];
|
||||
const speechProviders: SpeechProviderPlugin[] = [];
|
||||
const webSearchProviders: WebSearchProviderPlugin[] = [];
|
||||
const tools: AnyAgentTool[] = [];
|
||||
|
||||
return {
|
||||
providers,
|
||||
speechProviders,
|
||||
webSearchProviders,
|
||||
tools,
|
||||
api: {
|
||||
registerProvider(provider: ProviderPlugin) {
|
||||
providers.push(provider);
|
||||
},
|
||||
registerSpeechProvider(provider: SpeechProviderPlugin) {
|
||||
speechProviders.push(provider);
|
||||
},
|
||||
registerWebSearchProvider(provider: WebSearchProviderPlugin) {
|
||||
webSearchProviders.push(provider);
|
||||
},
|
||||
|
||||
84
src/tts/provider-registry.ts
Normal file
84
src/tts/provider-registry.ts
Normal file
@@ -0,0 +1,84 @@
|
||||
import type { OpenClawConfig } from "../config/config.js";
|
||||
import { loadOpenClawPlugins } from "../plugins/loader.js";
|
||||
import { getActivePluginRegistry } from "../plugins/runtime.js";
|
||||
import type { SpeechProviderPlugin } from "../plugins/types.js";
|
||||
import type { SpeechProviderId } from "./provider-types.js";
|
||||
import { buildElevenLabsSpeechProvider } from "./providers/elevenlabs.js";
|
||||
import { buildMicrosoftSpeechProvider } from "./providers/microsoft.js";
|
||||
import { buildOpenAISpeechProvider } from "./providers/openai.js";
|
||||
|
||||
// Speech providers bundled with the core build; these are registered before
// any plugin-contributed providers.
const BUILTIN_SPEECH_PROVIDERS: readonly SpeechProviderPlugin[] = [
  buildOpenAISpeechProvider(),
  buildElevenLabsSpeechProvider(),
  buildMicrosoftSpeechProvider(),
];
|
||||
|
||||
function trimToUndefined(value: string | undefined): string | undefined {
|
||||
const trimmed = value?.trim().toLowerCase();
|
||||
return trimmed ? trimmed : undefined;
|
||||
}
|
||||
|
||||
export function normalizeSpeechProviderId(
|
||||
providerId: string | undefined,
|
||||
): SpeechProviderId | undefined {
|
||||
const normalized = trimToUndefined(providerId);
|
||||
if (!normalized) {
|
||||
return undefined;
|
||||
}
|
||||
return normalized === "edge" ? "microsoft" : normalized;
|
||||
}
|
||||
|
||||
function resolveSpeechProviderPluginEntries(cfg?: OpenClawConfig): SpeechProviderPlugin[] {
|
||||
const active = getActivePluginRegistry();
|
||||
const registry =
|
||||
(active?.speechProviders?.length ?? 0) > 0 || !cfg
|
||||
? active
|
||||
: loadOpenClawPlugins({ config: cfg });
|
||||
return registry?.speechProviders?.map((entry) => entry.provider) ?? [];
|
||||
}
|
||||
|
||||
function buildProviderMaps(cfg?: OpenClawConfig): {
|
||||
canonical: Map<string, SpeechProviderPlugin>;
|
||||
aliases: Map<string, SpeechProviderPlugin>;
|
||||
} {
|
||||
const canonical = new Map<string, SpeechProviderPlugin>();
|
||||
const aliases = new Map<string, SpeechProviderPlugin>();
|
||||
const register = (provider: SpeechProviderPlugin) => {
|
||||
const id = normalizeSpeechProviderId(provider.id);
|
||||
if (!id) {
|
||||
return;
|
||||
}
|
||||
canonical.set(id, provider);
|
||||
aliases.set(id, provider);
|
||||
for (const alias of provider.aliases ?? []) {
|
||||
const normalizedAlias = normalizeSpeechProviderId(alias);
|
||||
if (normalizedAlias) {
|
||||
aliases.set(normalizedAlias, provider);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
for (const provider of BUILTIN_SPEECH_PROVIDERS) {
|
||||
register(provider);
|
||||
}
|
||||
for (const provider of resolveSpeechProviderPluginEntries(cfg)) {
|
||||
register(provider);
|
||||
}
|
||||
|
||||
return { canonical, aliases };
|
||||
}
|
||||
|
||||
export function listSpeechProviders(cfg?: OpenClawConfig): SpeechProviderPlugin[] {
|
||||
return [...buildProviderMaps(cfg).canonical.values()];
|
||||
}
|
||||
|
||||
export function getSpeechProvider(
|
||||
providerId: string | undefined,
|
||||
cfg?: OpenClawConfig,
|
||||
): SpeechProviderPlugin | undefined {
|
||||
const normalized = normalizeSpeechProviderId(providerId);
|
||||
if (!normalized) {
|
||||
return undefined;
|
||||
}
|
||||
return buildProviderMaps(cfg).aliases.get(normalized);
|
||||
}
|
||||
38
src/tts/provider-types.ts
Normal file
38
src/tts/provider-types.ts
Normal file
@@ -0,0 +1,38 @@
|
||||
import type { OpenClawConfig } from "../config/config.js";
|
||||
import type { ResolvedTtsConfig, TtsDirectiveOverrides } from "./tts.js";
|
||||
|
||||
/** Canonical speech provider identifier; an open string so plugins can add ids. */
export type SpeechProviderId = string;

/** What the synthesized audio is destined for: a plain file or a chat voice note. */
export type SpeechSynthesisTarget = "audio-file" | "voice-note";

/** Inputs available to a provider's `isConfigured` check. */
export type SpeechProviderConfiguredContext = {
  /** Full app config, when available. */
  cfg?: OpenClawConfig;
  /** Resolved TTS configuration. */
  config: ResolvedTtsConfig;
};

/** Request for chat-oriented speech synthesis. */
export type SpeechSynthesisRequest = {
  /** Text to synthesize. */
  text: string;
  cfg: OpenClawConfig;
  config: ResolvedTtsConfig;
  /** Desired output kind (drives format/extension choices). */
  target: SpeechSynthesisTarget;
  /** Per-request directive overrides layered over the static config. */
  overrides?: TtsDirectiveOverrides;
};

/** Result of chat-oriented speech synthesis. */
export type SpeechSynthesisResult = {
  /** Encoded audio bytes. */
  audioBuffer: Buffer;
  /** Provider-specific output format identifier. */
  outputFormat: string;
  /** File extension (including the dot) matching the output format. */
  fileExtension: string;
  /** Whether the audio can be sent as a voice note. */
  voiceCompatible: boolean;
};

/** Request for telephony speech synthesis (no per-request overrides). */
export type SpeechTelephonySynthesisRequest = {
  text: string;
  cfg: OpenClawConfig;
  config: ResolvedTtsConfig;
};

/** Result of telephony speech synthesis. */
export type SpeechTelephonySynthesisResult = {
  /** Encoded audio bytes. */
  audioBuffer: Buffer;
  /** Provider-specific output format identifier. */
  outputFormat: string;
  /** Sample rate of the returned audio, in Hz. */
  sampleRate: number;
};
|
||||
73
src/tts/providers/elevenlabs.ts
Normal file
73
src/tts/providers/elevenlabs.ts
Normal file
@@ -0,0 +1,73 @@
|
||||
import type { SpeechProviderPlugin } from "../../plugins/types.js";
|
||||
import { elevenLabsTTS } from "../tts-core.js";
|
||||
|
||||
// Model ids surfaced when listing this provider.
const ELEVENLABS_TTS_MODELS = [
  "eleven_multilingual_v2",
  "eleven_turbo_v2_5",
  "eleven_monolingual_v1",
] as const;

/**
 * Builds the ElevenLabs speech provider.
 *
 * The provider counts as configured when an API key is present either in
 * `config.elevenlabs.apiKey` or in the ELEVENLABS_API_KEY / XI_API_KEY
 * environment variables.
 */
export function buildElevenLabsSpeechProvider(): SpeechProviderPlugin {
  return {
    id: "elevenlabs",
    label: "ElevenLabs",
    models: ELEVENLABS_TTS_MODELS,
    isConfigured: ({ config }) =>
      Boolean(config.elevenlabs.apiKey || process.env.ELEVENLABS_API_KEY || process.env.XI_API_KEY),
    // Chat synthesis: opus for voice notes, mp3 for plain audio files.
    synthesize: async (req) => {
      const apiKey =
        req.config.elevenlabs.apiKey || process.env.ELEVENLABS_API_KEY || process.env.XI_API_KEY;
      if (!apiKey) {
        throw new Error("ElevenLabs API key missing");
      }
      const outputFormat = req.target === "voice-note" ? "opus_48000_64" : "mp3_44100_128";
      // Per-request directive overrides take precedence over static config.
      const audioBuffer = await elevenLabsTTS({
        text: req.text,
        apiKey,
        baseUrl: req.config.elevenlabs.baseUrl,
        voiceId: req.overrides?.elevenlabs?.voiceId ?? req.config.elevenlabs.voiceId,
        modelId: req.overrides?.elevenlabs?.modelId ?? req.config.elevenlabs.modelId,
        outputFormat,
        seed: req.overrides?.elevenlabs?.seed ?? req.config.elevenlabs.seed,
        applyTextNormalization:
          req.overrides?.elevenlabs?.applyTextNormalization ??
          req.config.elevenlabs.applyTextNormalization,
        languageCode: req.overrides?.elevenlabs?.languageCode ?? req.config.elevenlabs.languageCode,
        voiceSettings: {
          // Shallow merge: override keys replace individual config keys.
          ...req.config.elevenlabs.voiceSettings,
          ...req.overrides?.elevenlabs?.voiceSettings,
        },
        timeoutMs: req.config.timeoutMs,
      });
      return {
        audioBuffer,
        outputFormat,
        fileExtension: req.target === "voice-note" ? ".opus" : ".mp3",
        voiceCompatible: req.target === "voice-note",
      };
    },
    // Telephony synthesis: PCM at 22.05 kHz; directive overrides do not apply.
    synthesizeTelephony: async (req) => {
      const apiKey =
        req.config.elevenlabs.apiKey || process.env.ELEVENLABS_API_KEY || process.env.XI_API_KEY;
      if (!apiKey) {
        throw new Error("ElevenLabs API key missing");
      }
      const outputFormat = "pcm_22050";
      const sampleRate = 22_050;
      const audioBuffer = await elevenLabsTTS({
        text: req.text,
        apiKey,
        baseUrl: req.config.elevenlabs.baseUrl,
        voiceId: req.config.elevenlabs.voiceId,
        modelId: req.config.elevenlabs.modelId,
        outputFormat,
        seed: req.config.elevenlabs.seed,
        applyTextNormalization: req.config.elevenlabs.applyTextNormalization,
        languageCode: req.config.elevenlabs.languageCode,
        voiceSettings: req.config.elevenlabs.voiceSettings,
        timeoutMs: req.config.timeoutMs,
      });
      return { audioBuffer, outputFormat, sampleRate };
    },
  };
}
|
||||
60
src/tts/providers/microsoft.ts
Normal file
60
src/tts/providers/microsoft.ts
Normal file
@@ -0,0 +1,60 @@
|
||||
import { mkdirSync, mkdtempSync, readFileSync, rmSync } from "node:fs";
|
||||
import path from "node:path";
|
||||
import { resolvePreferredOpenClawTmpDir } from "../../infra/tmp-openclaw-dir.js";
|
||||
import { isVoiceCompatibleAudio } from "../../media/audio.js";
|
||||
import type { SpeechProviderPlugin } from "../../plugins/types.js";
|
||||
import { edgeTTS, inferEdgeExtension } from "../tts-core.js";
|
||||
|
||||
// Safe default Edge output format, also used as the retry fallback below.
const DEFAULT_EDGE_OUTPUT_FORMAT = "audio-24khz-48kbitrate-mono-mp3";

/**
 * Build the bundled Microsoft (Edge TTS) speech provider.
 *
 * Edge synthesis writes to disk, so each request runs inside a private
 * temp directory that is always removed afterwards. If the configured
 * output format fails, one retry is attempted with the default format.
 * No `synthesizeTelephony` — Edge is not offered for telephony.
 */
export function buildMicrosoftSpeechProvider(): SpeechProviderPlugin {
  return {
    id: "microsoft",
    label: "Microsoft",
    // "edge" is the legacy provider id kept for backward compatibility.
    aliases: ["edge"],
    // No API key needed; configured iff the edge block is enabled.
    isConfigured: ({ config }) => config.edge.enabled,
    synthesize: async (req) => {
      // Private temp dir (0o700) so the intermediate audio file is not shared.
      const tempRoot = resolvePreferredOpenClawTmpDir();
      mkdirSync(tempRoot, { recursive: true, mode: 0o700 });
      const tempDir = mkdtempSync(path.join(tempRoot, "tts-microsoft-"));
      let outputFormat = req.config.edge.outputFormat;
      // Only fall back when the configured format differs from the default;
      // otherwise a retry would just repeat the same failing call.
      const fallbackOutputFormat =
        outputFormat !== DEFAULT_EDGE_OUTPUT_FORMAT ? DEFAULT_EDGE_OUTPUT_FORMAT : undefined;

      try {
        // One synthesis attempt for a given format; reads the result back into memory.
        const runEdge = async (format: string) => {
          const fileExtension = inferEdgeExtension(format);
          const outputPath = path.join(tempDir, `speech${fileExtension}`);
          await edgeTTS({
            text: req.text,
            outputPath,
            config: {
              ...req.config.edge,
              outputFormat: format,
            },
            timeoutMs: req.config.timeoutMs,
          });
          const audioBuffer = readFileSync(outputPath);
          return {
            audioBuffer,
            outputFormat: format,
            fileExtension,
            // Voice-note compatibility is decided by the actual file type, not the target.
            voiceCompatible: isVoiceCompatibleAudio({ fileName: outputPath }),
          };
        };

        try {
          return await runEdge(outputFormat);
        } catch (err) {
          // No distinct fallback available: surface the original failure.
          if (!fallbackOutputFormat || fallbackOutputFormat === outputFormat) {
            throw err;
          }
          outputFormat = fallbackOutputFormat;
          return await runEdge(outputFormat);
        }
      } finally {
        // Always clean up the temp dir — the audio has been read into memory.
        rmSync(tempDir, { recursive: true, force: true });
      }
    },
  };
}
|
||||
56
src/tts/providers/openai.ts
Normal file
56
src/tts/providers/openai.ts
Normal file
@@ -0,0 +1,56 @@
|
||||
import type { SpeechProviderPlugin } from "../../plugins/types.js";
|
||||
import { OPENAI_TTS_MODELS, OPENAI_TTS_VOICES, openaiTTS } from "../tts-core.js";
|
||||
|
||||
export function buildOpenAISpeechProvider(): SpeechProviderPlugin {
|
||||
return {
|
||||
id: "openai",
|
||||
label: "OpenAI",
|
||||
models: OPENAI_TTS_MODELS,
|
||||
voices: OPENAI_TTS_VOICES,
|
||||
isConfigured: ({ config }) => Boolean(config.openai.apiKey || process.env.OPENAI_API_KEY),
|
||||
synthesize: async (req) => {
|
||||
const apiKey = req.config.openai.apiKey || process.env.OPENAI_API_KEY;
|
||||
if (!apiKey) {
|
||||
throw new Error("OpenAI API key missing");
|
||||
}
|
||||
const responseFormat = req.target === "voice-note" ? "opus" : "mp3";
|
||||
const audioBuffer = await openaiTTS({
|
||||
text: req.text,
|
||||
apiKey,
|
||||
baseUrl: req.config.openai.baseUrl,
|
||||
model: req.overrides?.openai?.model ?? req.config.openai.model,
|
||||
voice: req.overrides?.openai?.voice ?? req.config.openai.voice,
|
||||
speed: req.config.openai.speed,
|
||||
instructions: req.config.openai.instructions,
|
||||
responseFormat,
|
||||
timeoutMs: req.config.timeoutMs,
|
||||
});
|
||||
return {
|
||||
audioBuffer,
|
||||
outputFormat: responseFormat,
|
||||
fileExtension: responseFormat === "opus" ? ".opus" : ".mp3",
|
||||
voiceCompatible: req.target === "voice-note",
|
||||
};
|
||||
},
|
||||
synthesizeTelephony: async (req) => {
|
||||
const apiKey = req.config.openai.apiKey || process.env.OPENAI_API_KEY;
|
||||
if (!apiKey) {
|
||||
throw new Error("OpenAI API key missing");
|
||||
}
|
||||
const outputFormat = "pcm";
|
||||
const sampleRate = 24_000;
|
||||
const audioBuffer = await openaiTTS({
|
||||
text: req.text,
|
||||
apiKey,
|
||||
baseUrl: req.config.openai.baseUrl,
|
||||
model: req.config.openai.model,
|
||||
voice: req.config.openai.voice,
|
||||
speed: req.config.openai.speed,
|
||||
instructions: req.config.openai.instructions,
|
||||
responseFormat: outputFormat,
|
||||
timeoutMs: req.config.timeoutMs,
|
||||
});
|
||||
return { audioBuffer, outputFormat, sampleRate };
|
||||
},
|
||||
};
|
||||
}
|
||||
@@ -156,10 +156,13 @@ export function parseTtsDirectives(
|
||||
if (!policy.allowProvider) {
|
||||
break;
|
||||
}
|
||||
if (rawValue === "openai" || rawValue === "elevenlabs" || rawValue === "edge") {
|
||||
overrides.provider = rawValue;
|
||||
} else {
|
||||
warnings.push(`unsupported provider "${rawValue}"`);
|
||||
{
|
||||
const providerId = rawValue.trim().toLowerCase();
|
||||
if (providerId) {
|
||||
overrides.provider = providerId;
|
||||
} else {
|
||||
warnings.push("invalid provider id");
|
||||
}
|
||||
}
|
||||
break;
|
||||
case "voice":
|
||||
|
||||
@@ -311,7 +311,7 @@ describe("tts", () => {
|
||||
expect(result.overrides.elevenlabs?.voiceSettings?.speed).toBe(1.1);
|
||||
});
|
||||
|
||||
it("accepts edge as provider override", () => {
|
||||
it("accepts edge as a legacy microsoft provider override", () => {
|
||||
const policy = resolveModelOverridePolicy({ enabled: true, allowProvider: true });
|
||||
const input = "Hello [[tts:provider=edge]] world";
|
||||
const result = parseTtsDirectives(input, policy);
|
||||
@@ -524,8 +524,8 @@ describe("tts", () => {
|
||||
ELEVENLABS_API_KEY: undefined,
|
||||
XI_API_KEY: undefined,
|
||||
},
|
||||
prefsPath: "/tmp/tts-prefs-edge.json",
|
||||
expected: "edge",
|
||||
prefsPath: "/tmp/tts-prefs-microsoft.json",
|
||||
expected: "microsoft",
|
||||
},
|
||||
] as const;
|
||||
|
||||
@@ -539,6 +539,25 @@ describe("tts", () => {
|
||||
});
|
||||
});
|
||||
|
||||
describe("resolveTtsConfig provider normalization", () => {
|
||||
it("normalizes legacy edge provider ids to microsoft", () => {
|
||||
const config = resolveTtsConfig({
|
||||
agents: { defaults: { model: { primary: "openai/gpt-4o-mini" } } },
|
||||
messages: {
|
||||
tts: {
|
||||
provider: "edge",
|
||||
edge: {
|
||||
enabled: true,
|
||||
},
|
||||
},
|
||||
},
|
||||
});
|
||||
|
||||
expect(config.provider).toBe("microsoft");
|
||||
expect(getTtsProvider(config, "/tmp/tts-prefs-normalized.json")).toBe("microsoft");
|
||||
});
|
||||
});
|
||||
|
||||
describe("resolveTtsConfig – openai.baseUrl", () => {
|
||||
const baseCfg: OpenClawConfig = {
|
||||
agents: { defaults: { model: { primary: "openai/gpt-4o-mini" } } },
|
||||
|
||||
285
src/tts/tts.ts
285
src/tts/tts.ts
@@ -5,7 +5,6 @@ import {
|
||||
readFileSync,
|
||||
writeFileSync,
|
||||
mkdtempSync,
|
||||
rmSync,
|
||||
renameSync,
|
||||
unlinkSync,
|
||||
} from "node:fs";
|
||||
@@ -25,20 +24,20 @@ import type {
|
||||
import { logVerbose } from "../globals.js";
|
||||
import { resolvePreferredOpenClawTmpDir } from "../infra/tmp-openclaw-dir.js";
|
||||
import { stripMarkdown } from "../line/markdown-to-line.js";
|
||||
import { isVoiceCompatibleAudio } from "../media/audio.js";
|
||||
import { CONFIG_DIR, resolveUserPath } from "../utils.js";
|
||||
import {
|
||||
getSpeechProvider,
|
||||
listSpeechProviders,
|
||||
normalizeSpeechProviderId,
|
||||
} from "./provider-registry.js";
|
||||
import {
|
||||
DEFAULT_OPENAI_BASE_URL,
|
||||
edgeTTS,
|
||||
elevenLabsTTS,
|
||||
inferEdgeExtension,
|
||||
isValidOpenAIModel,
|
||||
isValidOpenAIVoice,
|
||||
isValidVoiceId,
|
||||
OPENAI_TTS_MODELS,
|
||||
OPENAI_TTS_VOICES,
|
||||
resolveOpenAITtsInstructions,
|
||||
openaiTTS,
|
||||
parseTtsDirectives,
|
||||
scheduleCleanup,
|
||||
summarizeText,
|
||||
@@ -83,11 +82,6 @@ const DEFAULT_OUTPUT = {
|
||||
voiceCompatible: false,
|
||||
};
|
||||
|
||||
const TELEPHONY_OUTPUT = {
|
||||
openai: { format: "pcm" as const, sampleRate: 24000 },
|
||||
elevenlabs: { format: "pcm_22050", sampleRate: 22050 },
|
||||
};
|
||||
|
||||
const TTS_AUTO_MODES = new Set<TtsAutoMode>(["off", "always", "inbound", "tagged"]);
|
||||
|
||||
export type ResolvedTtsConfig = {
|
||||
@@ -261,12 +255,13 @@ function resolveModelOverridePolicy(
|
||||
export function resolveTtsConfig(cfg: OpenClawConfig): ResolvedTtsConfig {
|
||||
const raw: TtsConfig = cfg.messages?.tts ?? {};
|
||||
const providerSource = raw.provider ? "config" : "default";
|
||||
const edgeOutputFormat = raw.edge?.outputFormat?.trim();
|
||||
const rawMicrosoft = { ...raw.edge, ...raw.microsoft };
|
||||
const edgeOutputFormat = rawMicrosoft.outputFormat?.trim();
|
||||
const auto = normalizeTtsAutoMode(raw.auto) ?? (raw.enabled ? "always" : "off");
|
||||
return {
|
||||
auto,
|
||||
mode: raw.mode ?? "final",
|
||||
provider: raw.provider ?? "edge",
|
||||
provider: normalizeSpeechProviderId(raw.provider) ?? "microsoft",
|
||||
providerSource,
|
||||
summaryModel: raw.summaryModel?.trim() || undefined,
|
||||
modelOverrides: resolveModelOverridePolicy(raw.modelOverrides),
|
||||
@@ -311,17 +306,17 @@ export function resolveTtsConfig(cfg: OpenClawConfig): ResolvedTtsConfig {
|
||||
instructions: raw.openai?.instructions?.trim() || undefined,
|
||||
},
|
||||
edge: {
|
||||
enabled: raw.edge?.enabled ?? true,
|
||||
voice: raw.edge?.voice?.trim() || DEFAULT_EDGE_VOICE,
|
||||
lang: raw.edge?.lang?.trim() || DEFAULT_EDGE_LANG,
|
||||
enabled: rawMicrosoft.enabled ?? true,
|
||||
voice: rawMicrosoft.voice?.trim() || DEFAULT_EDGE_VOICE,
|
||||
lang: rawMicrosoft.lang?.trim() || DEFAULT_EDGE_LANG,
|
||||
outputFormat: edgeOutputFormat || DEFAULT_EDGE_OUTPUT_FORMAT,
|
||||
outputFormatConfigured: Boolean(edgeOutputFormat),
|
||||
pitch: raw.edge?.pitch?.trim() || undefined,
|
||||
rate: raw.edge?.rate?.trim() || undefined,
|
||||
volume: raw.edge?.volume?.trim() || undefined,
|
||||
saveSubtitles: raw.edge?.saveSubtitles ?? false,
|
||||
proxy: raw.edge?.proxy?.trim() || undefined,
|
||||
timeoutMs: raw.edge?.timeoutMs,
|
||||
pitch: rawMicrosoft.pitch?.trim() || undefined,
|
||||
rate: rawMicrosoft.rate?.trim() || undefined,
|
||||
volume: rawMicrosoft.volume?.trim() || undefined,
|
||||
saveSubtitles: rawMicrosoft.saveSubtitles ?? false,
|
||||
proxy: rawMicrosoft.proxy?.trim() || undefined,
|
||||
timeoutMs: rawMicrosoft.timeoutMs,
|
||||
},
|
||||
prefsPath: raw.prefsPath,
|
||||
maxTextLength: raw.maxTextLength ?? DEFAULT_MAX_TEXT_LENGTH,
|
||||
@@ -448,11 +443,12 @@ export function setTtsEnabled(prefsPath: string, enabled: boolean): void {
|
||||
|
||||
export function getTtsProvider(config: ResolvedTtsConfig, prefsPath: string): TtsProvider {
|
||||
const prefs = readPrefs(prefsPath);
|
||||
if (prefs.tts?.provider) {
|
||||
return prefs.tts.provider;
|
||||
const prefsProvider = normalizeSpeechProviderId(prefs.tts?.provider);
|
||||
if (prefsProvider) {
|
||||
return prefsProvider;
|
||||
}
|
||||
if (config.providerSource === "config") {
|
||||
return config.provider;
|
||||
return normalizeSpeechProviderId(config.provider) ?? config.provider;
|
||||
}
|
||||
|
||||
if (resolveTtsApiKey(config, "openai")) {
|
||||
@@ -461,12 +457,12 @@ export function getTtsProvider(config: ResolvedTtsConfig, prefsPath: string): Tt
|
||||
if (resolveTtsApiKey(config, "elevenlabs")) {
|
||||
return "elevenlabs";
|
||||
}
|
||||
return "edge";
|
||||
return "microsoft";
|
||||
}
|
||||
|
||||
export function setTtsProvider(prefsPath: string, provider: TtsProvider): void {
|
||||
updatePrefs(prefsPath, (prefs) => {
|
||||
prefs.tts = { ...prefs.tts, provider };
|
||||
prefs.tts = { ...prefs.tts, provider: normalizeSpeechProviderId(provider) ?? provider };
|
||||
});
|
||||
}
|
||||
|
||||
@@ -522,26 +518,42 @@ export function resolveTtsApiKey(
|
||||
config: ResolvedTtsConfig,
|
||||
provider: TtsProvider,
|
||||
): string | undefined {
|
||||
if (provider === "elevenlabs") {
|
||||
const normalizedProvider = normalizeSpeechProviderId(provider);
|
||||
if (normalizedProvider === "elevenlabs") {
|
||||
return config.elevenlabs.apiKey || process.env.ELEVENLABS_API_KEY || process.env.XI_API_KEY;
|
||||
}
|
||||
if (provider === "openai") {
|
||||
if (normalizedProvider === "openai") {
|
||||
return config.openai.apiKey || process.env.OPENAI_API_KEY;
|
||||
}
|
||||
return undefined;
|
||||
}
|
||||
|
||||
export const TTS_PROVIDERS = ["openai", "elevenlabs", "edge"] as const;
|
||||
export const TTS_PROVIDERS = ["openai", "elevenlabs", "microsoft"] as const;
|
||||
|
||||
export function resolveTtsProviderOrder(primary: TtsProvider): TtsProvider[] {
|
||||
return [primary, ...TTS_PROVIDERS.filter((provider) => provider !== primary)];
|
||||
export function resolveTtsProviderOrder(primary: TtsProvider, cfg?: OpenClawConfig): TtsProvider[] {
|
||||
const normalizedPrimary = normalizeSpeechProviderId(primary) ?? primary;
|
||||
const ordered = new Set<TtsProvider>([normalizedPrimary]);
|
||||
for (const provider of TTS_PROVIDERS) {
|
||||
if (provider !== normalizedPrimary) {
|
||||
ordered.add(provider);
|
||||
}
|
||||
}
|
||||
for (const provider of listSpeechProviders(cfg)) {
|
||||
const normalized = normalizeSpeechProviderId(provider.id) ?? provider.id;
|
||||
if (normalized !== normalizedPrimary) {
|
||||
ordered.add(normalized);
|
||||
}
|
||||
}
|
||||
return [...ordered];
|
||||
}
|
||||
|
||||
export function isTtsProviderConfigured(config: ResolvedTtsConfig, provider: TtsProvider): boolean {
|
||||
if (provider === "edge") {
|
||||
return config.edge.enabled;
|
||||
}
|
||||
return Boolean(resolveTtsApiKey(config, provider));
|
||||
export function isTtsProviderConfigured(
|
||||
config: ResolvedTtsConfig,
|
||||
provider: TtsProvider,
|
||||
cfg?: OpenClawConfig,
|
||||
): boolean {
|
||||
const resolvedProvider = getSpeechProvider(provider, cfg);
|
||||
return resolvedProvider?.isConfigured({ cfg, config }) ?? false;
|
||||
}
|
||||
|
||||
function formatTtsProviderError(provider: TtsProvider, err: unknown): string {
|
||||
@@ -581,10 +593,10 @@ function resolveTtsRequestSetup(params: {
|
||||
}
|
||||
|
||||
const userProvider = getTtsProvider(config, prefsPath);
|
||||
const provider = params.providerOverride ?? userProvider;
|
||||
const provider = normalizeSpeechProviderId(params.providerOverride) ?? userProvider;
|
||||
return {
|
||||
config,
|
||||
providers: resolveTtsProviderOrder(provider),
|
||||
providers: resolveTtsProviderOrder(provider, params.cfg),
|
||||
};
|
||||
}
|
||||
|
||||
@@ -607,136 +619,36 @@ export async function textToSpeech(params: {
|
||||
|
||||
const { config, providers } = setup;
|
||||
const channelId = resolveChannelId(params.channel);
|
||||
const output = resolveOutputFormat(channelId);
|
||||
const target = channelId && VOICE_BUBBLE_CHANNELS.has(channelId) ? "voice-note" : "audio-file";
|
||||
|
||||
const errors: string[] = [];
|
||||
|
||||
for (const provider of providers) {
|
||||
const providerStart = Date.now();
|
||||
try {
|
||||
if (provider === "edge") {
|
||||
if (!config.edge.enabled) {
|
||||
errors.push("edge: disabled");
|
||||
continue;
|
||||
}
|
||||
|
||||
const tempRoot = resolvePreferredOpenClawTmpDir();
|
||||
mkdirSync(tempRoot, { recursive: true, mode: 0o700 });
|
||||
const tempDir = mkdtempSync(path.join(tempRoot, "tts-"));
|
||||
let edgeOutputFormat = resolveEdgeOutputFormat(config);
|
||||
const fallbackEdgeOutputFormat =
|
||||
edgeOutputFormat !== DEFAULT_EDGE_OUTPUT_FORMAT ? DEFAULT_EDGE_OUTPUT_FORMAT : undefined;
|
||||
|
||||
const attemptEdgeTts = async (outputFormat: string) => {
|
||||
const extension = inferEdgeExtension(outputFormat);
|
||||
const audioPath = path.join(tempDir, `voice-${Date.now()}${extension}`);
|
||||
await edgeTTS({
|
||||
text: params.text,
|
||||
outputPath: audioPath,
|
||||
config: {
|
||||
...config.edge,
|
||||
outputFormat,
|
||||
},
|
||||
timeoutMs: config.timeoutMs,
|
||||
});
|
||||
return { audioPath, outputFormat };
|
||||
};
|
||||
|
||||
let edgeResult: { audioPath: string; outputFormat: string };
|
||||
try {
|
||||
edgeResult = await attemptEdgeTts(edgeOutputFormat);
|
||||
} catch (err) {
|
||||
if (fallbackEdgeOutputFormat && fallbackEdgeOutputFormat !== edgeOutputFormat) {
|
||||
logVerbose(
|
||||
`TTS: Edge output ${edgeOutputFormat} failed; retrying with ${fallbackEdgeOutputFormat}.`,
|
||||
);
|
||||
edgeOutputFormat = fallbackEdgeOutputFormat;
|
||||
try {
|
||||
edgeResult = await attemptEdgeTts(edgeOutputFormat);
|
||||
} catch (fallbackErr) {
|
||||
try {
|
||||
rmSync(tempDir, { recursive: true, force: true });
|
||||
} catch {
|
||||
// ignore cleanup errors
|
||||
}
|
||||
throw fallbackErr;
|
||||
}
|
||||
} else {
|
||||
try {
|
||||
rmSync(tempDir, { recursive: true, force: true });
|
||||
} catch {
|
||||
// ignore cleanup errors
|
||||
}
|
||||
throw err;
|
||||
}
|
||||
}
|
||||
|
||||
scheduleCleanup(tempDir);
|
||||
const voiceCompatible = isVoiceCompatibleAudio({ fileName: edgeResult.audioPath });
|
||||
|
||||
return {
|
||||
success: true,
|
||||
audioPath: edgeResult.audioPath,
|
||||
latencyMs: Date.now() - providerStart,
|
||||
provider,
|
||||
outputFormat: edgeResult.outputFormat,
|
||||
voiceCompatible,
|
||||
};
|
||||
}
|
||||
|
||||
const apiKey = resolveTtsApiKey(config, provider);
|
||||
if (!apiKey) {
|
||||
errors.push(`${provider}: no API key`);
|
||||
const resolvedProvider = getSpeechProvider(provider, params.cfg);
|
||||
if (!resolvedProvider) {
|
||||
errors.push(`${provider}: no provider registered`);
|
||||
continue;
|
||||
}
|
||||
|
||||
let audioBuffer: Buffer;
|
||||
if (provider === "elevenlabs") {
|
||||
const voiceIdOverride = params.overrides?.elevenlabs?.voiceId;
|
||||
const modelIdOverride = params.overrides?.elevenlabs?.modelId;
|
||||
const voiceSettings = {
|
||||
...config.elevenlabs.voiceSettings,
|
||||
...params.overrides?.elevenlabs?.voiceSettings,
|
||||
};
|
||||
const seedOverride = params.overrides?.elevenlabs?.seed;
|
||||
const normalizationOverride = params.overrides?.elevenlabs?.applyTextNormalization;
|
||||
const languageOverride = params.overrides?.elevenlabs?.languageCode;
|
||||
audioBuffer = await elevenLabsTTS({
|
||||
text: params.text,
|
||||
apiKey,
|
||||
baseUrl: config.elevenlabs.baseUrl,
|
||||
voiceId: voiceIdOverride ?? config.elevenlabs.voiceId,
|
||||
modelId: modelIdOverride ?? config.elevenlabs.modelId,
|
||||
outputFormat: output.elevenlabs,
|
||||
seed: seedOverride ?? config.elevenlabs.seed,
|
||||
applyTextNormalization: normalizationOverride ?? config.elevenlabs.applyTextNormalization,
|
||||
languageCode: languageOverride ?? config.elevenlabs.languageCode,
|
||||
voiceSettings,
|
||||
timeoutMs: config.timeoutMs,
|
||||
});
|
||||
} else {
|
||||
const openaiModelOverride = params.overrides?.openai?.model;
|
||||
const openaiVoiceOverride = params.overrides?.openai?.voice;
|
||||
audioBuffer = await openaiTTS({
|
||||
text: params.text,
|
||||
apiKey,
|
||||
baseUrl: config.openai.baseUrl,
|
||||
model: openaiModelOverride ?? config.openai.model,
|
||||
voice: openaiVoiceOverride ?? config.openai.voice,
|
||||
speed: config.openai.speed,
|
||||
instructions: config.openai.instructions,
|
||||
responseFormat: output.openai,
|
||||
timeoutMs: config.timeoutMs,
|
||||
});
|
||||
if (!resolvedProvider.isConfigured({ cfg: params.cfg, config })) {
|
||||
errors.push(`${provider}: not configured`);
|
||||
continue;
|
||||
}
|
||||
|
||||
const synthesis = await resolvedProvider.synthesize({
|
||||
text: params.text,
|
||||
cfg: params.cfg,
|
||||
config,
|
||||
target,
|
||||
overrides: params.overrides,
|
||||
});
|
||||
const latencyMs = Date.now() - providerStart;
|
||||
|
||||
const tempRoot = resolvePreferredOpenClawTmpDir();
|
||||
mkdirSync(tempRoot, { recursive: true, mode: 0o700 });
|
||||
const tempDir = mkdtempSync(path.join(tempRoot, "tts-"));
|
||||
const audioPath = path.join(tempDir, `voice-${Date.now()}${output.extension}`);
|
||||
writeFileSync(audioPath, audioBuffer);
|
||||
const audioPath = path.join(tempDir, `voice-${Date.now()}${synthesis.fileExtension}`);
|
||||
writeFileSync(audioPath, synthesis.audioBuffer);
|
||||
scheduleCleanup(tempDir);
|
||||
|
||||
return {
|
||||
@@ -744,8 +656,8 @@ export async function textToSpeech(params: {
|
||||
audioPath,
|
||||
latencyMs,
|
||||
provider,
|
||||
outputFormat: provider === "openai" ? output.openai : output.elevenlabs,
|
||||
voiceCompatible: output.voiceCompatible,
|
||||
outputFormat: synthesis.outputFormat,
|
||||
voiceCompatible: synthesis.voiceCompatible,
|
||||
};
|
||||
} catch (err) {
|
||||
errors.push(formatTtsProviderError(provider, err));
|
||||
@@ -776,63 +688,32 @@ export async function textToSpeechTelephony(params: {
|
||||
for (const provider of providers) {
|
||||
const providerStart = Date.now();
|
||||
try {
|
||||
if (provider === "edge") {
|
||||
errors.push("edge: unsupported for telephony");
|
||||
const resolvedProvider = getSpeechProvider(provider, params.cfg);
|
||||
if (!resolvedProvider) {
|
||||
errors.push(`${provider}: no provider registered`);
|
||||
continue;
|
||||
}
|
||||
|
||||
const apiKey = resolveTtsApiKey(config, provider);
|
||||
if (!apiKey) {
|
||||
errors.push(`${provider}: no API key`);
|
||||
if (!resolvedProvider.isConfigured({ cfg: params.cfg, config })) {
|
||||
errors.push(`${provider}: not configured`);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (provider === "elevenlabs") {
|
||||
const output = TELEPHONY_OUTPUT.elevenlabs;
|
||||
const audioBuffer = await elevenLabsTTS({
|
||||
text: params.text,
|
||||
apiKey,
|
||||
baseUrl: config.elevenlabs.baseUrl,
|
||||
voiceId: config.elevenlabs.voiceId,
|
||||
modelId: config.elevenlabs.modelId,
|
||||
outputFormat: output.format,
|
||||
seed: config.elevenlabs.seed,
|
||||
applyTextNormalization: config.elevenlabs.applyTextNormalization,
|
||||
languageCode: config.elevenlabs.languageCode,
|
||||
voiceSettings: config.elevenlabs.voiceSettings,
|
||||
timeoutMs: config.timeoutMs,
|
||||
});
|
||||
|
||||
return {
|
||||
success: true,
|
||||
audioBuffer,
|
||||
latencyMs: Date.now() - providerStart,
|
||||
provider,
|
||||
outputFormat: output.format,
|
||||
sampleRate: output.sampleRate,
|
||||
};
|
||||
if (!resolvedProvider.synthesizeTelephony) {
|
||||
errors.push(`${provider}: unsupported for telephony`);
|
||||
continue;
|
||||
}
|
||||
|
||||
const output = TELEPHONY_OUTPUT.openai;
|
||||
const audioBuffer = await openaiTTS({
|
||||
const synthesis = await resolvedProvider.synthesizeTelephony({
|
||||
text: params.text,
|
||||
apiKey,
|
||||
baseUrl: config.openai.baseUrl,
|
||||
model: config.openai.model,
|
||||
voice: config.openai.voice,
|
||||
speed: config.openai.speed,
|
||||
instructions: config.openai.instructions,
|
||||
responseFormat: output.format,
|
||||
timeoutMs: config.timeoutMs,
|
||||
cfg: params.cfg,
|
||||
config,
|
||||
});
|
||||
|
||||
return {
|
||||
success: true,
|
||||
audioBuffer,
|
||||
audioBuffer: synthesis.audioBuffer,
|
||||
latencyMs: Date.now() - providerStart,
|
||||
provider,
|
||||
outputFormat: output.format,
|
||||
sampleRate: output.sampleRate,
|
||||
outputFormat: synthesis.outputFormat,
|
||||
sampleRate: synthesis.sampleRate,
|
||||
};
|
||||
} catch (err) {
|
||||
errors.push(formatTtsProviderError(provider, err));
|
||||
|
||||
Reference in New Issue
Block a user