refactor(tts): move speech providers into plugins

Vincent Koc
2026-03-22 17:46:48 -07:00
parent 1d08ad4bac
commit de6bf58e79
15 changed files with 448 additions and 128 deletions

View File

@@ -1,5 +1,5 @@
 import { definePluginEntry } from "openclaw/plugin-sdk/plugin-entry";
-import { buildElevenLabsSpeechProvider } from "openclaw/plugin-sdk/speech";
+import { buildElevenLabsSpeechProvider } from "./speech-provider.js";
 export default definePluginEntry({
   id: "elevenlabs",

View File

@@ -0,0 +1,126 @@
+import type { SpeechProviderPlugin } from "openclaw/plugin-sdk/core";
+import { elevenLabsTTS, type SpeechVoiceOption } from "openclaw/plugin-sdk/speech-core";
+
+const ELEVENLABS_TTS_MODELS = [
+  "eleven_multilingual_v2",
+  "eleven_turbo_v2_5",
+  "eleven_monolingual_v1",
+] as const;
+
+function normalizeElevenLabsBaseUrl(baseUrl: string | undefined): string {
+  const trimmed = baseUrl?.trim();
+  return trimmed?.replace(/\/+$/, "") || "https://api.elevenlabs.io";
+}
+
+export async function listElevenLabsVoices(params: {
+  apiKey: string;
+  baseUrl?: string;
+}): Promise<SpeechVoiceOption[]> {
+  const res = await fetch(`${normalizeElevenLabsBaseUrl(params.baseUrl)}/v1/voices`, {
+    headers: {
+      "xi-api-key": params.apiKey,
+    },
+  });
+  if (!res.ok) {
+    throw new Error(`ElevenLabs voices API error (${res.status})`);
+  }
+  const json = (await res.json()) as {
+    voices?: Array<{
+      voice_id?: string;
+      name?: string;
+      category?: string;
+      description?: string;
+    }>;
+  };
+  return Array.isArray(json.voices)
+    ? json.voices
+        .map((voice) => ({
+          id: voice.voice_id?.trim() ?? "",
+          name: voice.name?.trim() || undefined,
+          category: voice.category?.trim() || undefined,
+          description: voice.description?.trim() || undefined,
+        }))
+        .filter((voice) => voice.id.length > 0)
+    : [];
+}
+
+export function buildElevenLabsSpeechProvider(): SpeechProviderPlugin {
+  return {
+    id: "elevenlabs",
+    label: "ElevenLabs",
+    models: ELEVENLABS_TTS_MODELS,
+    listVoices: async (req) => {
+      const apiKey =
+        req.apiKey ||
+        req.config?.elevenlabs.apiKey ||
+        process.env.ELEVENLABS_API_KEY ||
+        process.env.XI_API_KEY;
+      if (!apiKey) {
+        throw new Error("ElevenLabs API key missing");
+      }
+      return listElevenLabsVoices({
+        apiKey,
+        baseUrl: req.baseUrl ?? req.config?.elevenlabs.baseUrl,
+      });
+    },
+    isConfigured: ({ config }) =>
+      Boolean(config.elevenlabs.apiKey || process.env.ELEVENLABS_API_KEY || process.env.XI_API_KEY),
+    synthesize: async (req) => {
+      const apiKey =
+        req.config.elevenlabs.apiKey || process.env.ELEVENLABS_API_KEY || process.env.XI_API_KEY;
+      if (!apiKey) {
+        throw new Error("ElevenLabs API key missing");
+      }
+      const outputFormat =
+        req.overrides?.elevenlabs?.outputFormat ??
+        (req.target === "voice-note" ? "opus_48000_64" : "mp3_44100_128");
+      const audioBuffer = await elevenLabsTTS({
+        text: req.text,
+        apiKey,
+        baseUrl: req.config.elevenlabs.baseUrl,
+        voiceId: req.overrides?.elevenlabs?.voiceId ?? req.config.elevenlabs.voiceId,
+        modelId: req.overrides?.elevenlabs?.modelId ?? req.config.elevenlabs.modelId,
+        outputFormat,
+        seed: req.overrides?.elevenlabs?.seed ?? req.config.elevenlabs.seed,
+        applyTextNormalization:
+          req.overrides?.elevenlabs?.applyTextNormalization ??
+          req.config.elevenlabs.applyTextNormalization,
+        languageCode: req.overrides?.elevenlabs?.languageCode ?? req.config.elevenlabs.languageCode,
+        voiceSettings: {
+          ...req.config.elevenlabs.voiceSettings,
+          ...req.overrides?.elevenlabs?.voiceSettings,
+        },
+        timeoutMs: req.config.timeoutMs,
+      });
+      return {
+        audioBuffer,
+        outputFormat,
+        fileExtension: req.target === "voice-note" ? ".opus" : ".mp3",
+        voiceCompatible: req.target === "voice-note",
+      };
+    },
+    synthesizeTelephony: async (req) => {
+      const apiKey =
+        req.config.elevenlabs.apiKey || process.env.ELEVENLABS_API_KEY || process.env.XI_API_KEY;
+      if (!apiKey) {
+        throw new Error("ElevenLabs API key missing");
+      }
+      const outputFormat = "pcm_22050";
+      const sampleRate = 22_050;
+      const audioBuffer = await elevenLabsTTS({
+        text: req.text,
+        apiKey,
+        baseUrl: req.config.elevenlabs.baseUrl,
+        voiceId: req.config.elevenlabs.voiceId,
+        modelId: req.config.elevenlabs.modelId,
+        outputFormat,
+        seed: req.config.elevenlabs.seed,
+        applyTextNormalization: req.config.elevenlabs.applyTextNormalization,
+        languageCode: req.config.elevenlabs.languageCode,
+        voiceSettings: req.config.elevenlabs.voiceSettings,
+        timeoutMs: req.config.timeoutMs,
+      });
+      return { audioBuffer, outputFormat, sampleRate };
+    },
+  };
+}
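For reference, the exported listElevenLabsVoices helper can be exercised on its own; a minimal sketch, assuming a key is present in ELEVENLABS_API_KEY (the default base URL applies when none is passed):

import { listElevenLabsVoices } from "./speech-provider.js";

const voices = await listElevenLabsVoices({
  apiKey: process.env.ELEVENLABS_API_KEY ?? "",
});
for (const voice of voices) {
  console.log(voice.id, voice.name ?? "(unnamed)");
}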

View File

@@ -1,5 +1,5 @@
 import { definePluginEntry } from "openclaw/plugin-sdk/plugin-entry";
-import { buildMicrosoftSpeechProvider } from "openclaw/plugin-sdk/speech";
+import { buildMicrosoftSpeechProvider } from "./speech-provider.js";
 export default definePluginEntry({
   id: "microsoft",

View File

@@ -4,6 +4,9 @@
"private": true,
"description": "OpenClaw Microsoft speech plugin",
"type": "module",
"dependencies": {
"node-edge-tts": "^1.2.10"
},
"openclaw": {
"extensions": [
"./index.ts"

View File

@@ -0,0 +1,43 @@
+import { afterEach, describe, expect, it, vi } from "vitest";
+import { listMicrosoftVoices } from "./speech-provider.js";
+
+const fetchMock = vi.fn<typeof fetch>();
+
+describe("listMicrosoftVoices", () => {
+  afterEach(() => {
+    fetchMock.mockReset();
+    vi.unstubAllGlobals();
+  });
+
+  it("maps Microsoft voices to the shared speech voice shape", async () => {
+    fetchMock.mockResolvedValueOnce({
+      ok: true,
+      json: async () => [
+        {
+          ShortName: "en-US-AvaMultilingualNeural",
+          FriendlyName: "Microsoft Ava",
+          Locale: "en-US",
+          Gender: "Female",
+          VoiceTag: {
+            ContentCategories: ["General"],
+            VoicePersonalities: ["Friendly", "Warm"],
+          },
+        },
+      ],
+    } as Response);
+    vi.stubGlobal("fetch", fetchMock);
+
+    await expect(listMicrosoftVoices()).resolves.toEqual([
+      {
+        id: "en-US-AvaMultilingualNeural",
+        name: "Microsoft Ava",
+        category: "General",
+        description: "Friendly, Warm",
+        locale: "en-US",
+        gender: "Female",
+        personalities: ["Friendly", "Warm"],
+      },
+    ]);
+  });
+});

View File

@@ -0,0 +1,130 @@
+import { mkdirSync, mkdtempSync, readFileSync, rmSync } from "node:fs";
+import path from "node:path";
+import {
+  CHROMIUM_FULL_VERSION,
+  TRUSTED_CLIENT_TOKEN,
+  generateSecMsGecToken,
+} from "node-edge-tts/dist/drm.js";
+import type { SpeechProviderPlugin } from "openclaw/plugin-sdk/core";
+import {
+  edgeTTS,
+  inferEdgeExtension,
+  isVoiceCompatibleAudio,
+  resolvePreferredOpenClawTmpDir,
+  type SpeechVoiceOption,
+} from "openclaw/plugin-sdk/speech-core";
+
+const DEFAULT_EDGE_OUTPUT_FORMAT = "audio-24khz-48kbitrate-mono-mp3";
+
+type MicrosoftVoiceListEntry = {
+  ShortName?: string;
+  FriendlyName?: string;
+  Locale?: string;
+  Gender?: string;
+  VoiceTag?: {
+    ContentCategories?: string[];
+    VoicePersonalities?: string[];
+  };
+};
+
+function buildMicrosoftVoiceHeaders(): Record<string, string> {
+  const major = CHROMIUM_FULL_VERSION.split(".")[0] || "0";
+  return {
+    Authority: "speech.platform.bing.com",
+    Origin: "chrome-extension://jdiccldimpdaibmpdkjnbmckianbfold",
+    Accept: "*/*",
+    "User-Agent":
+      `Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 ` +
+      `(KHTML, like Gecko) Chrome/${major}.0.0.0 Safari/537.36 Edg/${major}.0.0.0`,
+    "Sec-MS-GEC": generateSecMsGecToken(),
+    "Sec-MS-GEC-Version": `1-${CHROMIUM_FULL_VERSION}`,
+  };
+}
+
+function formatMicrosoftVoiceDescription(entry: MicrosoftVoiceListEntry): string | undefined {
+  const personalities = entry.VoiceTag?.VoicePersonalities?.filter(Boolean) ?? [];
+  return personalities.length > 0 ? personalities.join(", ") : undefined;
+}
+
+export async function listMicrosoftVoices(): Promise<SpeechVoiceOption[]> {
+  const response = await fetch(
+    "https://speech.platform.bing.com/consumer/speech/synthesize/readaloud/voices/list" +
+      `?trustedclienttoken=${TRUSTED_CLIENT_TOKEN}`,
+    {
+      headers: buildMicrosoftVoiceHeaders(),
+    },
+  );
+  if (!response.ok) {
+    throw new Error(`Microsoft voices API error (${response.status})`);
+  }
+  const voices = (await response.json()) as MicrosoftVoiceListEntry[];
+  return Array.isArray(voices)
+    ? voices
+        .map((voice) => ({
+          id: voice.ShortName?.trim() ?? "",
+          name: voice.FriendlyName?.trim() || voice.ShortName?.trim() || undefined,
+          category: voice.VoiceTag?.ContentCategories?.find((value) => value.trim().length > 0),
+          description: formatMicrosoftVoiceDescription(voice),
+          locale: voice.Locale?.trim() || undefined,
+          gender: voice.Gender?.trim() || undefined,
+          personalities: voice.VoiceTag?.VoicePersonalities?.filter(
+            (value): value is string => value.trim().length > 0,
+          ),
+        }))
+        .filter((voice) => voice.id.length > 0)
+    : [];
+}
+
+export function buildMicrosoftSpeechProvider(): SpeechProviderPlugin {
+  return {
+    id: "microsoft",
+    label: "Microsoft",
+    aliases: ["edge"],
+    listVoices: async () => await listMicrosoftVoices(),
+    isConfigured: ({ config }) => config.edge.enabled,
+    synthesize: async (req) => {
+      const tempRoot = resolvePreferredOpenClawTmpDir();
+      mkdirSync(tempRoot, { recursive: true, mode: 0o700 });
+      const tempDir = mkdtempSync(path.join(tempRoot, "tts-microsoft-"));
+      let outputFormat = req.overrides?.microsoft?.outputFormat ?? req.config.edge.outputFormat;
+      const fallbackOutputFormat =
+        outputFormat !== DEFAULT_EDGE_OUTPUT_FORMAT ? DEFAULT_EDGE_OUTPUT_FORMAT : undefined;
+      try {
+        const runEdge = async (format: string) => {
+          const fileExtension = inferEdgeExtension(format);
+          const outputPath = path.join(tempDir, `speech${fileExtension}`);
+          await edgeTTS({
+            text: req.text,
+            outputPath,
+            config: {
+              ...req.config.edge,
+              voice: req.overrides?.microsoft?.voice ?? req.config.edge.voice,
+              outputFormat: format,
+            },
+            timeoutMs: req.config.timeoutMs,
+          });
+          const audioBuffer = readFileSync(outputPath);
+          return {
+            audioBuffer,
+            outputFormat: format,
+            fileExtension,
+            voiceCompatible: isVoiceCompatibleAudio({ fileName: outputPath }),
+          };
+        };
+        try {
+          return await runEdge(outputFormat);
+        } catch (err) {
+          if (!fallbackOutputFormat || fallbackOutputFormat === outputFormat) {
+            throw err;
+          }
+          outputFormat = fallbackOutputFormat;
+          return await runEdge(outputFormat);
+        }
+      } finally {
+        rmSync(tempDir, { recursive: true, force: true });
+      }
+    },
+  };
+}
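A minimal sketch of driving the voice list above, which needs no API key (the endpoint is authenticated with the trusted client token); the locale field mapped above makes filtering straightforward:

import { listMicrosoftVoices } from "./speech-provider.js";

const voices = await listMicrosoftVoices();
const englishVoices = voices.filter((voice) => voice.locale?.startsWith("en-"));
console.log(englishVoices.map((voice) => voice.id));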

View File

@@ -1,9 +1,9 @@
 import { buildOpenAIImageGenerationProvider } from "openclaw/plugin-sdk/image-generation";
 import { definePluginEntry } from "openclaw/plugin-sdk/plugin-entry";
-import { buildOpenAISpeechProvider } from "openclaw/plugin-sdk/speech";
 import { openaiMediaUnderstandingProvider } from "./media-understanding-provider.js";
 import { buildOpenAICodexProviderPlugin } from "./openai-codex-provider.js";
 import { buildOpenAIProvider } from "./openai-provider.js";
+import { buildOpenAISpeechProvider } from "./speech-provider.js";
 export default definePluginEntry({
   id: "openai",

View File

@@ -0,0 +1,57 @@
+import type { SpeechProviderPlugin } from "openclaw/plugin-sdk/core";
+import { OPENAI_TTS_MODELS, OPENAI_TTS_VOICES, openaiTTS } from "openclaw/plugin-sdk/speech-core";
+
+export function buildOpenAISpeechProvider(): SpeechProviderPlugin {
+  return {
+    id: "openai",
+    label: "OpenAI",
+    models: OPENAI_TTS_MODELS,
+    voices: OPENAI_TTS_VOICES,
+    listVoices: async () => OPENAI_TTS_VOICES.map((voice) => ({ id: voice, name: voice })),
+    isConfigured: ({ config }) => Boolean(config.openai.apiKey || process.env.OPENAI_API_KEY),
+    synthesize: async (req) => {
+      const apiKey = req.config.openai.apiKey || process.env.OPENAI_API_KEY;
+      if (!apiKey) {
+        throw new Error("OpenAI API key missing");
+      }
+      const responseFormat = req.target === "voice-note" ? "opus" : "mp3";
+      const audioBuffer = await openaiTTS({
+        text: req.text,
+        apiKey,
+        baseUrl: req.config.openai.baseUrl,
+        model: req.overrides?.openai?.model ?? req.config.openai.model,
+        voice: req.overrides?.openai?.voice ?? req.config.openai.voice,
+        speed: req.overrides?.openai?.speed ?? req.config.openai.speed,
+        instructions: req.config.openai.instructions,
+        responseFormat,
+        timeoutMs: req.config.timeoutMs,
+      });
+      return {
+        audioBuffer,
+        outputFormat: responseFormat,
+        fileExtension: responseFormat === "opus" ? ".opus" : ".mp3",
+        voiceCompatible: req.target === "voice-note",
+      };
+    },
+    synthesizeTelephony: async (req) => {
+      const apiKey = req.config.openai.apiKey || process.env.OPENAI_API_KEY;
+      if (!apiKey) {
+        throw new Error("OpenAI API key missing");
+      }
+      const outputFormat = "pcm";
+      const sampleRate = 24_000;
+      const audioBuffer = await openaiTTS({
+        text: req.text,
+        apiKey,
+        baseUrl: req.config.openai.baseUrl,
+        model: req.config.openai.model,
+        voice: req.config.openai.voice,
+        speed: req.config.openai.speed,
+        instructions: req.config.openai.instructions,
+        responseFormat: outputFormat,
+        timeoutMs: req.config.timeoutMs,
+      });
+      return { audioBuffer, outputFormat, sampleRate };
+    },
+  };
+}
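Unlike the other two providers, the OpenAI voice catalog is a static constant, so enumerating voices needs neither an API key nor a network call; a minimal sketch:

import { OPENAI_TTS_VOICES } from "openclaw/plugin-sdk/speech-core";

// listVoices above simply maps this constant into { id, name } pairs.
console.log(OPENAI_TTS_VOICES);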

View File

@@ -165,6 +165,10 @@
"types": "./dist/plugin-sdk/speech-runtime.d.ts",
"default": "./dist/plugin-sdk/speech-runtime.js"
},
"./plugin-sdk/speech-core": {
"types": "./dist/plugin-sdk/speech-core.d.ts",
"default": "./dist/plugin-sdk/speech-core.js"
},
"./plugin-sdk/plugin-runtime": {
"types": "./dist/plugin-sdk/plugin-runtime.d.ts",
"default": "./dist/plugin-sdk/plugin-runtime.js"

View File

@@ -31,6 +31,7 @@
"text-runtime",
"agent-runtime",
"speech-runtime",
"speech-core",
"plugin-runtime",
"security-runtime",
"gateway-runtime",

View File

@@ -0,0 +1,17 @@
+// Shared speech-provider implementation helpers for bundled and third-party plugins.
+export type { SpeechProviderPlugin } from "../plugins/types.js";
+export type { SpeechVoiceOption } from "../tts/provider-types.js";
+
+export {
+  edgeTTS,
+  elevenLabsTTS,
+  inferEdgeExtension,
+  OPENAI_TTS_MODELS,
+  OPENAI_TTS_VOICES,
+  openaiTTS,
+  parseTtsDirectives,
+} from "../tts/tts-core.js";
+
+export { resolvePreferredOpenClawTmpDir } from "../infra/tmp-openclaw-dir.js";
+export { isVoiceCompatibleAudio } from "../media/audio.js";
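This barrel is what lets an out-of-tree plugin assemble its own provider; a minimal sketch with a hypothetical "acme" id, env var, and vendor call:

import type { SpeechProviderPlugin } from "openclaw/plugin-sdk/core";
import type { SpeechVoiceOption } from "openclaw/plugin-sdk/speech-core";

export function buildAcmeSpeechProvider(): SpeechProviderPlugin {
  return {
    id: "acme",
    label: "Acme TTS",
    isConfigured: () => Boolean(process.env.ACME_TTS_API_KEY),
    listVoices: async (): Promise<SpeechVoiceOption[]> => [
      { id: "acme-default", name: "Acme Default" },
    ],
    synthesize: async (req) => ({
      // Placeholder: a real provider would call the vendor API with req.text.
      audioBuffer: Buffer.from(req.text),
      outputFormat: "mp3",
      fileExtension: ".mp3",
      voiceCompatible: false,
    }),
  };
}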

View File

@@ -1,7 +1,7 @@
 // Public speech-provider builders for bundled or third-party plugins.
-export { buildElevenLabsSpeechProvider } from "../tts/providers/elevenlabs.js";
-export { buildMicrosoftSpeechProvider } from "../tts/providers/microsoft.js";
-export { buildOpenAISpeechProvider } from "../tts/providers/openai.js";
+export { buildElevenLabsSpeechProvider } from "../../extensions/elevenlabs/speech-provider.js";
+export { buildMicrosoftSpeechProvider } from "../../extensions/microsoft/speech-provider.js";
+export { buildOpenAISpeechProvider } from "../../extensions/openai/speech-provider.js";
 export { parseTtsDirectives } from "../tts/tts-core.js";
 export type { SpeechVoiceOption } from "../tts/provider-types.js";

View File

@@ -1,62 +1,84 @@
-import { afterEach, describe, expect, it, vi } from "vitest";
-import type { OpenClawConfig } from "../config/config.js";
-import { createEmptyPluginRegistry } from "../plugins/registry.js";
-import { resetPluginRuntimeStateForTest, setActivePluginRegistry } from "../plugins/runtime.js";
+import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
+const { loadOpenClawPluginsMock } = vi.hoisted(() => ({
+  loadOpenClawPluginsMock: vi.fn(() => createEmptyPluginRegistry()),
+}));
+import type { OpenClawConfig } from "../config/config.js";
+import { createEmptyPluginRegistry } from "../plugins/registry-empty.js";
+import { resetPluginRuntimeStateForTest, setActivePluginRegistry } from "../plugins/runtime.js";
+import type { SpeechProviderPlugin } from "../plugins/types.js";
+import { getSpeechProvider, listSpeechProviders, normalizeSpeechProviderId } from "./provider-registry.js";
-const loadOpenClawPluginsMock = vi.fn();
 vi.mock("../plugins/loader.js", () => ({
-  loadOpenClawPlugins: loadOpenClawPluginsMock,
+  loadOpenClawPlugins: (...args: Parameters<typeof loadOpenClawPluginsMock>) =>
+    loadOpenClawPluginsMock(...args),
 }));
-import { getSpeechProvider, listSpeechProviders } from "./provider-registry.js";
+function createSpeechProvider(id: string, aliases?: string[]): SpeechProviderPlugin {
+  return {
+    id,
+    ...(aliases ? { aliases } : {}),
+    isConfigured: () => true,
+    synthesize: async () => ({
+      audioBuffer: Buffer.from("audio"),
+      outputFormat: "mp3",
+      voiceCompatible: false,
+      fileExtension: ".mp3",
+    }),
+  };
+}
 describe("speech provider registry", () => {
-  afterEach(() => {
+  beforeEach(() => {
     resetPluginRuntimeStateForTest();
+    loadOpenClawPluginsMock.mockReset();
+    loadOpenClawPluginsMock.mockReturnValue(createEmptyPluginRegistry());
   });
+  afterEach(() => {
+    resetPluginRuntimeStateForTest();
+  });
-  it("does not load plugins for builtin provider lookup", () => {
     const provider = getSpeechProvider("openai", {} as OpenClawConfig);
+  it("uses active plugin speech providers without reloading plugins", () => {
+    setActivePluginRegistry({
+      ...createEmptyPluginRegistry(),
+      speechProviders: [
+        {
+          pluginId: "test-openai",
+          provider: createSpeechProvider("openai"),
+        },
+      ],
+    });
     expect(provider?.id).toBe("openai");
     expect(loadOpenClawPluginsMock).not.toHaveBeenCalled();
   });
   it("does not load plugins when listing without config", () => {
     const providers = listSpeechProviders();
-    expect(providers.map((provider) => provider.id)).toEqual(["openai", "elevenlabs", "microsoft"]);
+    expect(providers.map((provider) => provider.id)).toEqual(["openai"]);
     expect(loadOpenClawPluginsMock).not.toHaveBeenCalled();
   });
-  it("uses active plugin speech providers without loading from disk", () => {
-    const registry = createEmptyPluginRegistry();
-    registry.speechProviders.push({
-      pluginId: "custom-speech",
-      pluginName: "Custom Speech",
-      source: "test",
-      provider: {
-        id: "custom-speech",
-        label: "Custom Speech",
-        isConfigured: () => true,
-        synthesize: async () => ({
-          audioBuffer: Buffer.from("audio"),
-          outputFormat: "mp3",
-          fileExtension: ".mp3",
-          voiceCompatible: false,
-        }),
-      },
+  it("loads speech providers from plugins when config is provided", () => {
+    loadOpenClawPluginsMock.mockReturnValue({
+      ...createEmptyPluginRegistry(),
+      speechProviders: [
+        {
+          pluginId: "test-microsoft",
+          provider: createSpeechProvider("microsoft", ["edge"]),
+        },
+      ],
     });
-    setActivePluginRegistry(registry);
-    const provider = getSpeechProvider("custom-speech");
+    const cfg = {} as OpenClawConfig;
-    expect(provider?.id).toBe("custom-speech");
-    expect(loadOpenClawPluginsMock).not.toHaveBeenCalled();
+    expect(listSpeechProviders(cfg).map((provider) => provider.id)).toEqual(["microsoft"]);
+    expect(getSpeechProvider("edge", cfg)?.id).toBe("microsoft");
+    expect(loadOpenClawPluginsMock).toHaveBeenCalledWith({ config: cfg });
   });
+  it("returns no providers when neither plugins nor active registry provide speech support", () => {
+    expect(listSpeechProviders()).toEqual([]);
+    expect(getSpeechProvider("openai")).toBeUndefined();
+  });
+  it("normalizes the legacy edge alias to microsoft", () => {
+    expect(normalizeSpeechProviderId("edge")).toBe("microsoft");
+  });
 });

View File

@@ -3,15 +3,6 @@ import { loadOpenClawPlugins } from "../plugins/loader.js";
 import { getActivePluginRegistry } from "../plugins/runtime.js";
 import type { SpeechProviderPlugin } from "../plugins/types.js";
 import type { SpeechProviderId } from "./provider-types.js";
-import { buildElevenLabsSpeechProvider } from "./providers/elevenlabs.js";
-import { buildMicrosoftSpeechProvider } from "./providers/microsoft.js";
-import { buildOpenAISpeechProvider } from "./providers/openai.js";
-const BUILTIN_SPEECH_PROVIDER_BUILDERS = [
-  buildOpenAISpeechProvider,
-  buildElevenLabsSpeechProvider,
-  buildMicrosoftSpeechProvider,
-] as const satisfies readonly (() => SpeechProviderPlugin)[];
 function trimToUndefined(value: string | undefined): string | undefined {
   const trimmed = value?.trim().toLowerCase();
@@ -66,9 +57,6 @@ function buildProviderMaps(cfg?: OpenClawConfig): {
   const aliases = new Map<string, SpeechProviderPlugin>();
   const maps = { canonical, aliases };
-  for (const buildProvider of BUILTIN_SPEECH_PROVIDER_BUILDERS) {
-    registerSpeechProvider(maps, buildProvider());
-  }
   for (const provider of resolveSpeechProviderPluginEntries(cfg)) {
     registerSpeechProvider(maps, provider);
   }
@@ -88,10 +76,5 @@ export function getSpeechProvider(
   if (!normalized) {
     return undefined;
   }
-  const local = buildProviderMaps().aliases.get(normalized);
-  if (local || !cfg) {
-    return local;
-  }
   return buildProviderMaps(cfg).aliases.get(normalized);
 }
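With the builtin builders gone, resolution is registry-first; a sketch of the resulting behavior, matching the expectations in the updated test above:

import type { OpenClawConfig } from "../config/config.js";
import { getSpeechProvider, listSpeechProviders } from "./provider-registry.js";

declare const cfg: OpenClawConfig; // assume a loaded config

// Without a config, only providers already in the active plugin registry resolve.
const activeIds = listSpeechProviders().map((provider) => provider.id);

// With a config, plugins are loaded on demand, and the legacy "edge" alias
// still normalizes to the microsoft provider.
const provider = getSpeechProvider("edge", cfg);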

View File

@@ -1,66 +0,0 @@
-import { afterEach, describe, expect, it, vi } from "vitest";
-import { withFetchPreconnect } from "../../test-utils/fetch-mock.js";
-import { listMicrosoftVoices } from "./microsoft.js";
-
-describe("listMicrosoftVoices", () => {
-  const originalFetch = globalThis.fetch;
-
-  afterEach(() => {
-    globalThis.fetch = originalFetch;
-    vi.restoreAllMocks();
-  });
-
-  it("maps Microsoft voice metadata into speech voice options", async () => {
-    globalThis.fetch = withFetchPreconnect(
-      vi.fn().mockResolvedValue(
-        new Response(
-          JSON.stringify([
-            {
-              ShortName: "en-US-AvaNeural",
-              FriendlyName: "Microsoft Ava Online (Natural) - English (United States)",
-              Locale: "en-US",
-              Gender: "Female",
-              VoiceTag: {
-                ContentCategories: ["General"],
-                VoicePersonalities: ["Friendly", "Positive"],
-              },
-            },
-          ]),
-          { status: 200 },
-        ),
-      ),
-    );
-    const voices = await listMicrosoftVoices();
-    expect(voices).toEqual([
-      {
-        id: "en-US-AvaNeural",
-        name: "Microsoft Ava Online (Natural) - English (United States)",
-        category: "General",
-        description: "Friendly, Positive",
-        locale: "en-US",
-        gender: "Female",
-        personalities: ["Friendly", "Positive"],
-      },
-    ]);
-    expect(globalThis.fetch).toHaveBeenCalledWith(
-      expect.stringContaining("/voices/list?trustedclienttoken="),
-      expect.objectContaining({
-        headers: expect.objectContaining({
-          Origin: "chrome-extension://jdiccldimpdaibmpdkjnbmckianbfold",
-          "Sec-MS-GEC": expect.any(String),
-          "Sec-MS-GEC-Version": expect.stringContaining("1-"),
-        }),
-      }),
-    );
-  });
-
-  it("throws on Microsoft voice list failures", async () => {
-    globalThis.fetch = withFetchPreconnect(
-      vi.fn().mockResolvedValue(new Response("nope", { status: 503 })),
-    );
-    await expect(listMicrosoftVoices()).rejects.toThrow("Microsoft voices API error (503)");
-  });
-});