diff --git a/docs/.generated/plugin-sdk-api-baseline.json b/docs/.generated/plugin-sdk-api-baseline.json
index ff71a089b2f..220d24625e9 100644
--- a/docs/.generated/plugin-sdk-api-baseline.json
+++ b/docs/.generated/plugin-sdk-api-baseline.json
@@ -5,33 +5,6 @@
       "category": "legacy",
       "entrypoint": "index",
       "exports": [
-        {
-          "declaration": "export function buildFalImageGenerationProvider(): ImageGenerationProvider;",
-          "exportName": "buildFalImageGenerationProvider",
-          "kind": "function",
-          "source": {
-            "line": 190,
-            "path": "extensions/fal/image-generation-provider.ts"
-          }
-        },
-        {
-          "declaration": "export function buildGoogleImageGenerationProvider(): ImageGenerationProvider;",
-          "exportName": "buildGoogleImageGenerationProvider",
-          "kind": "function",
-          "source": {
-            "line": 98,
-            "path": "extensions/google/image-generation-provider.ts"
-          }
-        },
-        {
-          "declaration": "export function buildOpenAIImageGenerationProvider(): ImageGenerationProvider;",
-          "exportName": "buildOpenAIImageGenerationProvider",
-          "kind": "function",
-          "source": {
-            "line": 22,
-            "path": "extensions/openai/image-generation-provider.ts"
-          }
-        },
         {
           "declaration": "export function delegateCompactionToRuntime(params: { sessionId: string; sessionKey?: string | undefined; sessionFile: string; tokenBudget?: number | undefined; force?: boolean | undefined; currentTokenCount?: number | undefined; compactionTarget?: \"budget\" | ... 1 more ... | undefined; customInstructions?: string | undefined; runtimeContext?: ContextEngineRuntimeContext | undefined; }): Promise<...>;",
           "exportName": "delegateCompactionToRuntime",
@@ -923,7 +896,7 @@
           "exportName": "createMessageToolButtonsSchema",
           "kind": "function",
           "source": {
-            "line": 11,
+            "line": 12,
             "path": "src/plugin-sdk/channel-actions.ts"
           }
         },
@@ -932,7 +905,7 @@
           "exportName": "createMessageToolCardSchema",
           "kind": "function",
           "source": {
-            "line": 29,
+            "line": 30,
             "path": "src/plugin-sdk/channel-actions.ts"
           }
         },
@@ -954,6 +927,15 @@
             "path": "src/channels/plugins/actions/shared.ts"
           }
         },
+        {
+          "declaration": "export function optionalStringEnum<T extends readonly string[]>(values: T, options?: StringEnumOptions<T>): TOptional<TUnsafe<T[number]>>;",
+          "exportName": "optionalStringEnum",
+          "kind": "function",
+          "source": {
+            "line": 31,
+            "path": "src/agents/schema/typebox.ts"
+          }
+        },
         {
           "declaration": "export function resolveReactionMessageId(params: { args: Record<string, unknown>; toolContext?: ReactionToolContext | undefined; }): string | number | undefined;",
           "exportName": "resolveReactionMessageId",
@@ -962,6 +944,15 @@
             "line": 7,
             "path": "src/channels/plugins/actions/reaction-message-id.ts"
           }
+        },
+        {
+          "declaration": "export function stringEnum<T extends readonly string[]>(values: T, options?: StringEnumOptions<T>): TUnsafe<T[number]>;",
+          "exportName": "stringEnum",
+          "kind": "function",
+          "source": {
+            "line": 15,
+            "path": "src/agents/schema/typebox.ts"
+          }
         }
       ],
       "importSpecifier": "openclaw/plugin-sdk/channel-actions",
diff --git a/docs/.generated/plugin-sdk-api-baseline.jsonl b/docs/.generated/plugin-sdk-api-baseline.jsonl
index 677fdec8b6e..ab024a272a9 100644
--- a/docs/.generated/plugin-sdk-api-baseline.jsonl
+++ b/docs/.generated/plugin-sdk-api-baseline.jsonl
@@ -1,7 +1,4 @@
 {"category":"legacy","entrypoint":"index","importSpecifier":"openclaw/plugin-sdk","recordType":"module","sourceLine":1,"sourcePath":"src/plugin-sdk/index.ts"}
-{"declaration":"export function buildFalImageGenerationProvider(): ImageGenerationProvider;","entrypoint":"index","exportName":"buildFalImageGenerationProvider","importSpecifier":"openclaw/plugin-sdk","kind":"function","recordType":"export","sourceLine":190,"sourcePath":"extensions/fal/image-generation-provider.ts"}
-{"declaration":"export function buildGoogleImageGenerationProvider(): ImageGenerationProvider;","entrypoint":"index","exportName":"buildGoogleImageGenerationProvider","importSpecifier":"openclaw/plugin-sdk","kind":"function","recordType":"export","sourceLine":98,"sourcePath":"extensions/google/image-generation-provider.ts"}
-{"declaration":"export function buildOpenAIImageGenerationProvider(): ImageGenerationProvider;","entrypoint":"index","exportName":"buildOpenAIImageGenerationProvider","importSpecifier":"openclaw/plugin-sdk","kind":"function","recordType":"export","sourceLine":22,"sourcePath":"extensions/openai/image-generation-provider.ts"}
 {"declaration":"export function delegateCompactionToRuntime(params: { sessionId: string; sessionKey?: string | undefined; sessionFile: string; tokenBudget?: number | undefined; force?: boolean | undefined; currentTokenCount?: number | undefined; compactionTarget?: \"budget\" | ... 1 more ... | undefined; customInstructions?: string | undefined; runtimeContext?: ContextEngineRuntimeContext | undefined; }): Promise<...>;","entrypoint":"index","exportName":"delegateCompactionToRuntime","importSpecifier":"openclaw/plugin-sdk","kind":"function","recordType":"export","sourceLine":16,"sourcePath":"src/context-engine/delegate.ts"}
 {"declaration":"export function emptyPluginConfigSchema(): OpenClawPluginConfigSchema;","entrypoint":"index","exportName":"emptyPluginConfigSchema","importSpecifier":"openclaw/plugin-sdk","kind":"function","recordType":"export","sourceLine":13,"sourcePath":"src/plugins/config-schema.ts"}
 {"declaration":"export function onDiagnosticEvent(listener: (evt: DiagnosticEventPayload) => void): () => void;","entrypoint":"index","exportName":"onDiagnosticEvent","importSpecifier":"openclaw/plugin-sdk","kind":"function","recordType":"export","sourceLine":229,"sourcePath":"src/infra/diagnostic-events.ts"}
@@ -100,11 +97,13 @@
 {"declaration":"export type BasicAllowlistResolutionEntry = BasicAllowlistResolutionEntry;","entrypoint":"allow-from","exportName":"BasicAllowlistResolutionEntry","importSpecifier":"openclaw/plugin-sdk/allow-from","kind":"type","recordType":"export","sourceLine":129,"sourcePath":"src/plugin-sdk/allow-from.ts"}
 {"declaration":"export type CompiledAllowlist = CompiledAllowlist;","entrypoint":"allow-from","exportName":"CompiledAllowlist","importSpecifier":"openclaw/plugin-sdk/allow-from","kind":"type","recordType":"export","sourceLine":19,"sourcePath":"src/channels/allowlist-match.ts"}
 {"category":"channel","entrypoint":"channel-actions","importSpecifier":"openclaw/plugin-sdk/channel-actions","recordType":"module","sourceLine":1,"sourcePath":"src/plugin-sdk/channel-actions.ts"}
-{"declaration":"export function createMessageToolButtonsSchema(): TSchema;","entrypoint":"channel-actions","exportName":"createMessageToolButtonsSchema","importSpecifier":"openclaw/plugin-sdk/channel-actions","kind":"function","recordType":"export","sourceLine":11,"sourcePath":"src/plugin-sdk/channel-actions.ts"}
-{"declaration":"export function createMessageToolCardSchema(): TSchema;","entrypoint":"channel-actions","exportName":"createMessageToolCardSchema","importSpecifier":"openclaw/plugin-sdk/channel-actions","kind":"function","recordType":"export","sourceLine":29,"sourcePath":"src/plugin-sdk/channel-actions.ts"}
+{"declaration":"export function createMessageToolButtonsSchema(): TSchema;","entrypoint":"channel-actions","exportName":"createMessageToolButtonsSchema","importSpecifier":"openclaw/plugin-sdk/channel-actions","kind":"function","recordType":"export","sourceLine":12,"sourcePath":"src/plugin-sdk/channel-actions.ts"}
+{"declaration":"export function createMessageToolCardSchema(): TSchema;","entrypoint":"channel-actions","exportName":"createMessageToolCardSchema","importSpecifier":"openclaw/plugin-sdk/channel-actions","kind":"function","recordType":"export","sourceLine":30,"sourcePath":"src/plugin-sdk/channel-actions.ts"}
 {"declaration":"export function createUnionActionGate<TAccount, TKey extends string>(accounts: readonly TAccount[], createGate: (account: TAccount) => OptionalDefaultGate<TKey>): OptionalDefaultGate<TKey>;","entrypoint":"channel-actions","exportName":"createUnionActionGate","importSpecifier":"openclaw/plugin-sdk/channel-actions","kind":"function","recordType":"export","sourceLine":13,"sourcePath":"src/channels/plugins/actions/shared.ts"}
 {"declaration":"export function listTokenSourcedAccounts<TAccount extends TokenSourcedAccount>(accounts: readonly TAccount[]): TAccount[];","entrypoint":"channel-actions","exportName":"listTokenSourcedAccounts","importSpecifier":"openclaw/plugin-sdk/channel-actions","kind":"function","recordType":"export","sourceLine":7,"sourcePath":"src/channels/plugins/actions/shared.ts"}
+{"declaration":"export function optionalStringEnum<T extends readonly string[]>(values: T, options?: StringEnumOptions<T>): TOptional<TUnsafe<T[number]>>;","entrypoint":"channel-actions","exportName":"optionalStringEnum","importSpecifier":"openclaw/plugin-sdk/channel-actions","kind":"function","recordType":"export","sourceLine":31,"sourcePath":"src/agents/schema/typebox.ts"}
 {"declaration":"export function resolveReactionMessageId(params: { args: Record<string, unknown>; toolContext?: ReactionToolContext | undefined; }): string | number | undefined;","entrypoint":"channel-actions","exportName":"resolveReactionMessageId","importSpecifier":"openclaw/plugin-sdk/channel-actions","kind":"function","recordType":"export","sourceLine":7,"sourcePath":"src/channels/plugins/actions/reaction-message-id.ts"}
+{"declaration":"export function stringEnum<T extends readonly string[]>(values: T, options?: StringEnumOptions<T>): TUnsafe<T[number]>;","entrypoint":"channel-actions","exportName":"stringEnum","importSpecifier":"openclaw/plugin-sdk/channel-actions","kind":"function","recordType":"export","sourceLine":15,"sourcePath":"src/agents/schema/typebox.ts"}
 {"category":"channel","entrypoint":"channel-config-schema","importSpecifier":"openclaw/plugin-sdk/channel-config-schema","recordType":"module","sourceLine":1,"sourcePath":"src/plugin-sdk/channel-config-schema.ts"}
 {"declaration":"export function buildCatchallMultiAccountChannelSchema<T extends ExtendableZodObject>(accountSchema: T): T;","entrypoint":"channel-config-schema","exportName":"buildCatchallMultiAccountChannelSchema","importSpecifier":"openclaw/plugin-sdk/channel-config-schema","kind":"function","recordType":"export","sourceLine":26,"sourcePath":"src/channels/plugins/config-schema.ts"}
 {"declaration":"export function buildChannelConfigSchema(schema: ZodType<unknown, unknown, $ZodTypeInternals<unknown, unknown>>): ChannelConfigSchema;","entrypoint":"channel-config-schema","exportName":"buildChannelConfigSchema","importSpecifier":"openclaw/plugin-sdk/channel-config-schema","kind":"function","recordType":"export","sourceLine":35,"sourcePath":"src/channels/plugins/config-schema.ts"}
diff --git a/extensions/anthropic/openclaw.plugin.json b/extensions/anthropic/openclaw.plugin.json
index 1d58bde7188..106f832eef9 100644
--- a/extensions/anthropic/openclaw.plugin.json
+++ b/extensions/anthropic/openclaw.plugin.json
@@ -1,6 +1,7 @@
 {
   "id": "anthropic",
   "providers": ["anthropic"],
+  "mediaUnderstandingProviders": ["anthropic"],
   "cliBackends": ["claude-cli"],
   "providerAuthEnvVars": {
     "anthropic": ["ANTHROPIC_OAUTH_TOKEN", "ANTHROPIC_API_KEY"]
diff --git a/extensions/deepgram/audio.ts b/extensions/deepgram/audio.ts
index de97397f215..77146ecfa90 100644
--- a/extensions/deepgram/audio.ts
+++ b/extensions/deepgram/audio.ts
@@ -7,7 +7,7 @@ import {
   normalizeBaseUrl,
   postTranscriptionRequest,
   requireTranscriptionText,
-} from "openclaw/plugin-sdk/media-understanding";
+} from "openclaw/plugin-sdk/provider-http";
 
 export const DEFAULT_DEEPGRAM_AUDIO_BASE_URL = "https://api.deepgram.com/v1";
 export const DEFAULT_DEEPGRAM_AUDIO_MODEL = "nova-3";
diff --git a/extensions/deepgram/openclaw.plugin.json b/extensions/deepgram/openclaw.plugin.json
index 7d148b7c720..d522ec8be6a 100644
--- a/extensions/deepgram/openclaw.plugin.json
+++ b/extensions/deepgram/openclaw.plugin.json
@@ -1,5 +1,6 @@
 {
   "id": "deepgram",
+  "mediaUnderstandingProviders": ["deepgram"],
   "configSchema": {
     "type": "object",
     "additionalProperties": false,
diff --git a/extensions/elevenlabs/openclaw.plugin.json b/extensions/elevenlabs/openclaw.plugin.json
index 3015fa282a2..abffc3c4f49 100644
--- a/extensions/elevenlabs/openclaw.plugin.json
+++ b/extensions/elevenlabs/openclaw.plugin.json
@@ -1,5 +1,6 @@
 {
   "id": "elevenlabs",
+  "speechProviders": ["elevenlabs"],
   "configSchema": {
     "type": "object",
     "additionalProperties": false,
diff --git a/extensions/elevenlabs/speech-provider.ts b/extensions/elevenlabs/speech-provider.ts
index 1ef07597958..24e8298ad0d 100644
--- a/extensions/elevenlabs/speech-provider.ts
+++ b/extensions/elevenlabs/speech-provider.ts
@@ -1,5 +1,6 @@
 import type { SpeechProviderPlugin } from "openclaw/plugin-sdk/core";
-import { elevenLabsTTS, type SpeechVoiceOption } from "openclaw/plugin-sdk/speech";
+import type { SpeechVoiceOption } from "openclaw/plugin-sdk/speech";
+import { elevenLabsTTS } from "./tts.js";
 
 const ELEVENLABS_TTS_MODELS = [
   "eleven_multilingual_v2",
diff --git a/extensions/elevenlabs/tts.ts b/extensions/elevenlabs/tts.ts
new file mode 100644
index 00000000000..bebf1df9060
--- /dev/null
+++ b/extensions/elevenlabs/tts.ts
@@ -0,0 +1,150 @@
+// Base URL that normalizeElevenLabsBaseUrl falls back to when no (or a blank) base URL is given.
+const DEFAULT_ELEVENLABS_BASE_URL = "https://api.elevenlabs.io";
+
+/** Reports whether `voiceId` consists solely of 10 to 40 ASCII letters and digits. */
+function isValidVoiceId(voiceId: string): boolean {
+  const voiceIdPattern = /^[a-zA-Z0-9]{10,40}$/;
+  return voiceIdPattern.test(voiceId);
+}
+
+/**
+ * Produces the base URL to call: the trimmed input with any trailing slashes
+ * removed, or the default ElevenLabs URL when the input is missing or blank.
+ */
+function normalizeElevenLabsBaseUrl(baseUrl?: string): string {
+  const candidate = baseUrl?.trim();
+  return candidate ? candidate.replace(/\/+$/, "") : DEFAULT_ELEVENLABS_BASE_URL;
+}
+
+/**
+ * Lower-cases and validates an optional language code.
+ *
+ * @param code - Raw language code; surrounding whitespace is ignored.
+ * @returns The lower-cased two-letter code, or `undefined` when `code` is missing or blank.
+ * @throws Error when the trimmed, lower-cased value is not exactly two letters a-z.
+ */
+function normalizeLanguageCode(code?: string): string | undefined {
+  const raw = code?.trim();
+  if (raw === undefined || raw === "") {
+    return undefined;
+  }
+  const lowered = raw.toLowerCase();
+  if (/^[a-z]{2}$/.test(lowered)) {
+    return lowered;
+  }
+  throw new Error("languageCode must be a 2-letter ISO 639-1 code (e.g. en, de, fr)");
+}
+
+/**
+ * Canonicalizes an optional text-normalization mode.
+ *
+ * @param mode - Raw mode string; matched case-insensitively after trimming.
+ * @returns `"auto"`, `"on"` or `"off"`, or `undefined` when `mode` is missing or blank.
+ * @throws Error when a non-blank value is not one of the three accepted modes.
+ */
+function normalizeApplyTextNormalization(mode?: string): "auto" | "on" | "off" | undefined {
+  const raw = mode?.trim();
+  if (!raw) {
+    return undefined;
+  }
+  const lowered = raw.toLowerCase();
+  switch (lowered) {
+    case "auto":
+    case "on":
+    case "off":
+      return lowered;
+    default:
+      throw new Error("applyTextNormalization must be one of: auto, on, off");
+  }
+}
+
+/**
+ * Rounds an optional seed down to an integer and range-checks it.
+ *
+ * @param seed - Requested seed; `undefined`/`null` means "no seed".
+ * @returns The floored seed, or `undefined` when no seed was given.
+ * @throws Error when the floored value is not finite or lies outside 0..4294967295.
+ */
+function normalizeSeed(seed?: number): number | undefined {
+  if (seed == null) {
+    return undefined;
+  }
+  const maxSeed = 4_294_967_295;
+  const floored = Math.floor(seed);
+  const acceptable = Number.isFinite(floored) && floored >= 0 && floored <= maxSeed;
+  if (!acceptable) {
+    throw new Error("seed must be between 0 and 4294967295");
+  }
+  return floored;
+}
+
+/**
+ * Guard that throws unless `value` is a finite number within `[min, max]`.
+ *
+ * @param label - Name of the value, used as the prefix of the error message.
+ * @throws Error when `value` is not finite, is below `min`, or is above `max`.
+ */
+function requireInRange(value: number, min: number, max: number, label: string): void {
+  const finite = Number.isFinite(value);
+  const belowMin = value < min;
+  const aboveMax = value > max;
+  if (finite && !belowMin && !aboveMax) {
+    return;
+  }
+  throw new Error(`${label} must be between ${min} and ${max}`);
+}
+
+/**
+ * Range-checks the numeric voice settings, in the order stability,
+ * similarityBoost, style, speed; the first violation throws via `requireInRange`.
+ * `useSpeakerBoost` is not inspected.
+ */
+function assertElevenLabsVoiceSettings(settings: {
+  stability: number;
+  similarityBoost: number;
+  style: number;
+  useSpeakerBoost: boolean;
+  speed: number;
+}) {
+  // [field, min, max] — the field name doubles as the label in the error message.
+  const limits = [
+    ["stability", 0, 1],
+    ["similarityBoost", 0, 1],
+    ["style", 0, 1],
+    ["speed", 0.5, 2],
+  ] as const;
+  for (const [field, min, max] of limits) {
+    requireInRange(settings[field], min, max, field);
+  }
+}
+
+/**
+ * Requests synthesized speech from the ElevenLabs text-to-speech endpoint.
+ *
+ * All inputs are validated before any network activity. The request is a JSON
+ * POST to `{normalized baseUrl}/v1/text-to-speech/{voiceId}` and is aborted via
+ * an `AbortController` once `timeoutMs` elapses.
+ *
+ * @param params - Synthesis options; `outputFormat`, when non-empty, is sent as
+ *   the `output_format` query parameter.
+ * @returns The raw response body as a `Buffer`.
+ * @throws Error when `voiceId`, a voice setting, `languageCode`,
+ *   `applyTextNormalization` or `seed` fails validation, or when the response
+ *   status is not OK.
+ */
+export async function elevenLabsTTS(params: {
+  text: string;
+  apiKey: string;
+  baseUrl: string;
+  voiceId: string;
+  modelId: string;
+  outputFormat: string;
+  seed?: number;
+  applyTextNormalization?: "auto" | "on" | "off";
+  languageCode?: string;
+  voiceSettings: {
+    stability: number;
+    similarityBoost: number;
+    style: number;
+    useSpeakerBoost: boolean;
+    speed: number;
+  };
+  timeoutMs: number;
+}): Promise<Buffer> {
+  const { voiceId, voiceSettings } = params;
+
+  // Validation order matters: the first failing check decides which error is thrown.
+  if (!isValidVoiceId(voiceId)) {
+    throw new Error("Invalid voiceId format");
+  }
+  assertElevenLabsVoiceSettings(voiceSettings);
+  const languageCode = normalizeLanguageCode(params.languageCode);
+  const textNormalization = normalizeApplyTextNormalization(params.applyTextNormalization);
+  const seed = normalizeSeed(params.seed);
+
+  const abort = new AbortController();
+  const timer = setTimeout(() => abort.abort(), params.timeoutMs);
+
+  try {
+    const endpoint = new URL(
+      `${normalizeElevenLabsBaseUrl(params.baseUrl)}/v1/text-to-speech/${voiceId}`,
+    );
+    if (params.outputFormat) {
+      endpoint.searchParams.set("output_format", params.outputFormat);
+    }
+
+    // Fields left undefined are omitted from the serialized JSON.
+    const payload = {
+      text: params.text,
+      model_id: params.modelId,
+      seed,
+      apply_text_normalization: textNormalization,
+      language_code: languageCode,
+      voice_settings: {
+        stability: voiceSettings.stability,
+        similarity_boost: voiceSettings.similarityBoost,
+        style: voiceSettings.style,
+        use_speaker_boost: voiceSettings.useSpeakerBoost,
+        speed: voiceSettings.speed,
+      },
+    };
+
+    const response = await fetch(endpoint.toString(), {
+      method: "POST",
+      headers: {
+        "xi-api-key": params.apiKey,
+        "Content-Type": "application/json",
+        Accept: "audio/mpeg",
+      },
+      body: JSON.stringify(payload),
+      signal: abort.signal,
+    });
+
+    if (!response.ok) {
+      throw new Error(`ElevenLabs API error (${response.status})`);
+    }
+
+    const audio = await response.arrayBuffer();
+    return Buffer.from(audio);
+  } finally {
+    // Always cancel the timer so it cannot fire after the call has settled.
+    clearTimeout(timer);
+  }
+}
diff --git a/extensions/fal/openclaw.plugin.json b/extensions/fal/openclaw.plugin.json
index d7f7e12f677..99ac7d3d1f9 100644
--- a/extensions/fal/openclaw.plugin.json
+++ b/extensions/fal/openclaw.plugin.json
@@ -1,6 +1,7 @@
 {
   "id": "fal",
   "providers": ["fal"],
+  "imageGenerationProviders": ["fal"],
   "providerAuthEnvVars": {
     "fal": ["FAL_KEY"]
   },
diff --git a/extensions/google/image-generation-provider.ts b/extensions/google/image-generation-provider.ts
index ef72925b136..f138ff86be0 100644
--- a/extensions/google/image-generation-provider.ts
+++ b/extensions/google/image-generation-provider.ts
@@ -1,9 +1,4 @@
 import type { ImageGenerationProvider } from "openclaw/plugin-sdk/image-generation";
-import {
-  assertOkOrThrowHttpError,
-  normalizeBaseUrl,
-  postJsonRequest,
-} from "openclaw/plugin-sdk/media-understanding";
 import { resolveApiKeyForProvider } from "openclaw/plugin-sdk/provider-auth";
 import {
   DEFAULT_GOOGLE_API_BASE_URL,
@@ -11,6 +6,11 @@ import {
   normalizeGoogleModelId,
   parseGeminiAuth,
 } from "openclaw/plugin-sdk/provider-google";
+import {
+  assertOkOrThrowHttpError,
+  normalizeBaseUrl,
+  postJsonRequest,
+} from "openclaw/plugin-sdk/provider-http";
 
 const DEFAULT_GOOGLE_IMAGE_MODEL = "gemini-3.1-flash-image-preview";
 const DEFAULT_OUTPUT_MIME = "image/png";
diff --git a/extensions/google/media-understanding-provider.ts b/extensions/google/media-understanding-provider.ts
index c3734a8d12f..2766dab3207 100644
--- a/extensions/google/media-understanding-provider.ts
+++ b/extensions/google/media-understanding-provider.ts
@@ -1,15 +1,17 @@
 import {
-  assertOkOrThrowHttpError,
   describeImageWithModel,
   describeImagesWithModel,
-  normalizeBaseUrl,
-  postJsonRequest,
   type AudioTranscriptionRequest,
   type AudioTranscriptionResult,
   type MediaUnderstandingProvider,
   type VideoDescriptionRequest,
   type VideoDescriptionResult,
 } from "openclaw/plugin-sdk/media-understanding";
+import {
+  assertOkOrThrowHttpError,
+  normalizeBaseUrl,
+  postJsonRequest,
+} from "openclaw/plugin-sdk/provider-http";
 import {
   DEFAULT_GOOGLE_API_BASE_URL,
   normalizeGoogleApiBaseUrl,
diff --git a/extensions/google/openclaw.plugin.json b/extensions/google/openclaw.plugin.json
index 252891b2a52..576d4992fce 100644
--- a/extensions/google/openclaw.plugin.json
+++ b/extensions/google/openclaw.plugin.json
@@ -1,6 +1,8 @@
 {
   "id": "google",
   "providers": ["google", "google-gemini-cli"],
+  "mediaUnderstandingProviders": ["google"],
+  "imageGenerationProviders": ["google"],
   "cliBackends": ["google-gemini-cli"],
   "providerAuthEnvVars": {
     "google": ["GEMINI_API_KEY", "GOOGLE_API_KEY"]
diff --git a/extensions/groq/openclaw.plugin.json b/extensions/groq/openclaw.plugin.json
index 5ab0133764b..7da82942848 100644
--- a/extensions/groq/openclaw.plugin.json
+++ b/extensions/groq/openclaw.plugin.json
@@ -1,5 +1,6 @@
 {
   "id": "groq",
+  "mediaUnderstandingProviders": ["groq"],
   "configSchema": {
     "type": "object",
     "additionalProperties": false,
diff --git a/extensions/microsoft/openclaw.plugin.json b/extensions/microsoft/openclaw.plugin.json
index 85a130c463a..7ab6a523125 100644
--- a/extensions/microsoft/openclaw.plugin.json
+++ b/extensions/microsoft/openclaw.plugin.json
@@ -1,5 +1,6 @@
 {
   "id": "microsoft",
+  "speechProviders": ["microsoft"],
   "configSchema": {
     "type": "object",
     "additionalProperties": false,
diff --git a/extensions/microsoft/speech-provider.ts b/extensions/microsoft/speech-provider.ts
index 25997720670..3967a0c62d2 100644
--- a/extensions/microsoft/speech-provider.ts
+++ b/extensions/microsoft/speech-provider.ts
@@ -8,7 +8,8 @@ import {
 import type { SpeechProviderPlugin } from "openclaw/plugin-sdk/core";
 import { resolvePreferredOpenClawTmpDir } from "openclaw/plugin-sdk/llm-task";
 import { isVoiceCompatibleAudio } from "openclaw/plugin-sdk/media-runtime";
-import { edgeTTS, inferEdgeExtension, type SpeechVoiceOption } from "openclaw/plugin-sdk/speech";
+import type { SpeechVoiceOption } from "openclaw/plugin-sdk/speech";
+import { edgeTTS, inferEdgeExtension } from "./tts.js";
 
 const DEFAULT_EDGE_OUTPUT_FORMAT = "audio-24khz-48kbitrate-mono-mp3";
 
diff --git a/src/tts/edge-tts-validation.test.ts b/extensions/microsoft/tts.test.ts
similarity index 90%
rename from src/tts/edge-tts-validation.test.ts
rename to extensions/microsoft/tts.test.ts
index 85c93211efc..b6dd0db1474 100644
--- a/src/tts/edge-tts-validation.test.ts
+++ b/extensions/microsoft/tts.test.ts
@@ -3,7 +3,7 @@ import { tmpdir } from "node:os";
 import path from "node:path";
 import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
 
-let edgeTTS: typeof import("./tts-core.js").edgeTTS;
+let edgeTTS: typeof import("./tts.js").edgeTTS;
 
 let mockTtsPromise = vi.fn<(text: string, filePath: string) => Promise<void>>();
 
@@ -16,15 +16,13 @@ vi.mock("node-edge-tts", () => ({
 }));
 
 const baseEdgeConfig = {
-  enabled: true,
   voice: "en-US-MichelleNeural",
   lang: "en-US",
   outputFormat: "audio-24khz-48kbitrate-mono-mp3",
-  outputFormatConfigured: false,
   saveSubtitles: false,
 };
 
-describe("edgeTTS – empty audio validation", () => {
+describe("edgeTTS empty audio validation", () => {
   let tempDir: string | undefined;
 
   beforeEach(async () => {
@@ -36,7 +34,7 @@ describe("edgeTTS – empty audio validation", () => {
         }
       },
     }));
-    ({ edgeTTS } = await import("./tts-core.js"));
+    ({ edgeTTS } = await import("./tts.js"));
   });
 
   afterEach(() => {
diff --git a/extensions/microsoft/tts.ts b/extensions/microsoft/tts.ts
new file mode 100644
index 00000000000..4fd13e8b3a3
--- /dev/null
+++ b/extensions/microsoft/tts.ts
@@ -0,0 +1,55 @@
+import { statSync } from "node:fs";
+import { EdgeTTS } from "node-edge-tts";
+
+/**
+ * Picks a file extension for an Edge TTS output format name.
+ *
+ * The format is matched case-insensitively by substring; rules are tried in
+ * order (webm, ogg, opus, then wav/riff/pcm) and the first hit wins.
+ *
+ * @returns One of `.webm`, `.ogg`, `.opus`, `.wav`, or `.mp3` when nothing matches.
+ */
+export function inferEdgeExtension(outputFormat: string): string {
+  const format = outputFormat.toLowerCase();
+  const rules: ReadonlyArray<readonly [string, readonly string[]]> = [
+    [".webm", ["webm"]],
+    [".ogg", ["ogg"]],
+    [".opus", ["opus"]],
+    [".wav", ["wav", "riff", "pcm"]],
+  ];
+  for (const [extension, markers] of rules) {
+    if (markers.some((marker) => format.includes(marker))) {
+      return extension;
+    }
+  }
+  return ".mp3";
+}
+
+/**
+ * Synthesizes `text` to `outputPath` with `node-edge-tts` and rejects empty output.
+ *
+ * @param params - `config.timeoutMs`, when set, takes precedence over the
+ *   top-level `timeoutMs` as the timeout handed to the `EdgeTTS` client.
+ * @throws Error when the file written to `outputPath` has a size of 0 bytes.
+ */
+export async function edgeTTS(params: {
+  text: string;
+  outputPath: string;
+  config: {
+    voice: string;
+    lang: string;
+    outputFormat: string;
+    saveSubtitles: boolean;
+    proxy?: string;
+    rate?: string;
+    pitch?: string;
+    volume?: string;
+    timeoutMs?: number;
+  };
+  timeoutMs: number;
+}): Promise<void> {
+  const { config } = params;
+  const effectiveTimeout = config.timeoutMs ?? params.timeoutMs;
+
+  const synthesizer = new EdgeTTS({
+    voice: config.voice,
+    lang: config.lang,
+    outputFormat: config.outputFormat,
+    saveSubtitles: config.saveSubtitles,
+    proxy: config.proxy,
+    rate: config.rate,
+    pitch: config.pitch,
+    volume: config.volume,
+    timeout: effectiveTimeout,
+  });
+  await synthesizer.ttsPromise(params.text, params.outputPath);
+
+  // A zero-byte file means synthesis produced no audio even though the call resolved.
+  const written = statSync(params.outputPath);
+  if (written.size === 0) {
+    throw new Error("Edge TTS produced empty audio file");
+  }
+}
diff --git a/extensions/minimax/openclaw.plugin.json b/extensions/minimax/openclaw.plugin.json
index 60a77127713..381865d93ed 100644
--- a/extensions/minimax/openclaw.plugin.json
+++ b/extensions/minimax/openclaw.plugin.json
@@ -1,6 +1,8 @@
 {
   "id": "minimax",
   "providers": ["minimax", "minimax-portal"],
+  "mediaUnderstandingProviders": ["minimax", "minimax-portal"],
+  "imageGenerationProviders": ["minimax", "minimax-portal"],
   "providerAuthEnvVars": {
     "minimax": ["MINIMAX_API_KEY"],
     "minimax-portal": ["MINIMAX_OAUTH_TOKEN", "MINIMAX_API_KEY"]
diff --git a/extensions/mistral/openclaw.plugin.json b/extensions/mistral/openclaw.plugin.json
index 93f115cf719..ec142023431 100644
--- a/extensions/mistral/openclaw.plugin.json
+++ b/extensions/mistral/openclaw.plugin.json
@@ -1,6 +1,7 @@
 {
   "id": "mistral",
   "providers": ["mistral"],
+  "mediaUnderstandingProviders": ["mistral"],
   "providerAuthEnvVars": {
     "mistral": ["MISTRAL_API_KEY"]
   },
diff --git a/extensions/moonshot/media-understanding-provider.ts b/extensions/moonshot/media-understanding-provider.ts
index 6c652ae58d3..7d7ace86ea0 100644
--- a/extensions/moonshot/media-understanding-provider.ts
+++ b/extensions/moonshot/media-understanding-provider.ts
@@ -4,10 +4,12 @@ import {
   type MediaUnderstandingProvider,
   type VideoDescriptionRequest,
   type VideoDescriptionResult,
+} from "openclaw/plugin-sdk/media-understanding";
+import {
   assertOkOrThrowHttpError,
   normalizeBaseUrl,
   postJsonRequest,
-} from "openclaw/plugin-sdk/media-understanding";
+} from "openclaw/plugin-sdk/provider-http";
 
 export const DEFAULT_MOONSHOT_VIDEO_BASE_URL = "https://api.moonshot.ai/v1";
 const DEFAULT_MOONSHOT_VIDEO_MODEL = "kimi-k2.5";
diff --git a/extensions/moonshot/openclaw.plugin.json b/extensions/moonshot/openclaw.plugin.json
index a5756e05623..39f36e7ecaa 100644
--- a/extensions/moonshot/openclaw.plugin.json
+++ b/extensions/moonshot/openclaw.plugin.json
@@ -1,6 +1,7 @@
 {
   "id": "moonshot",
   "providers": ["moonshot"],
+  "mediaUnderstandingProviders": ["moonshot"],
   "providerAuthEnvVars": {
     "moonshot": ["MOONSHOT_API_KEY"]
   },
diff --git a/extensions/openai/openclaw.plugin.json b/extensions/openai/openclaw.plugin.json
index c082cdf93bd..68f3ba07670 100644
--- a/extensions/openai/openclaw.plugin.json
+++ b/extensions/openai/openclaw.plugin.json
@@ -1,6 +1,9 @@
 {
   "id": "openai",
   "providers": ["openai", "openai-codex"],
+  "speechProviders": ["openai"],
+  "mediaUnderstandingProviders": ["openai", "openai-codex"],
+  "imageGenerationProviders": ["openai"],
   "cliBackends": ["codex-cli"],
   "providerAuthEnvVars": {
     "openai": ["OPENAI_API_KEY"]
diff --git a/extensions/openai/speech-provider.ts b/extensions/openai/speech-provider.ts
index 0376d72f4f3..91b8008db9c 100644
--- a/extensions/openai/speech-provider.ts
+++ b/extensions/openai/speech-provider.ts
@@ -1,5 +1,5 @@
 import type { SpeechProviderPlugin } from "openclaw/plugin-sdk/core";
-import { OPENAI_TTS_MODELS, OPENAI_TTS_VOICES, openaiTTS } from "openclaw/plugin-sdk/speech";
+import { OPENAI_TTS_MODELS, OPENAI_TTS_VOICES, openaiTTS } from "./tts.js";
 
 export function buildOpenAISpeechProvider(): SpeechProviderPlugin {
   return {
diff --git a/extensions/openai/tts.ts b/extensions/openai/tts.ts
new file mode 100644
index 00000000000..52a288fef6f
--- /dev/null
+++ b/extensions/openai/tts.ts
@@ -0,0 +1,109 @@
+// Base URL treated as the stock OpenAI endpoint; any other normalized base URL counts as custom.
+const DEFAULT_OPENAI_BASE_URL = "https://api.openai.com/v1";
+
+// Model ids that isValidOpenAIModel accepts when the endpoint is not custom.
+export const OPENAI_TTS_MODELS = ["gpt-4o-mini-tts", "tts-1", "tts-1-hd"] as const;
+
+// Voice names that isValidOpenAIVoice accepts when the endpoint is not custom.
+export const OPENAI_TTS_VOICES = [
+  "alloy",
+  "ash",
+  "ballad",
+  "cedar",
+  "coral",
+  "echo",
+  "fable",
+  "juniper",
+  "marin",
+  "onyx",
+  "nova",
+  "sage",
+  "shimmer",
+  "verse",
+] as const;
+
+// Union of the literal voice names listed in OPENAI_TTS_VOICES.
+type OpenAiTtsVoice = (typeof OPENAI_TTS_VOICES)[number];
+
+/**
+ * Produces a canonical base URL: the trimmed input with trailing slashes
+ * removed, or the default OpenAI base URL when the input is missing or blank.
+ */
+function normalizeOpenAITtsBaseUrl(baseUrl?: string): string {
+  const candidate = baseUrl?.trim();
+  return candidate ? candidate.replace(/\/+$/, "") : DEFAULT_OPENAI_BASE_URL;
+}
+
+/**
+ * Reports whether the effective base URL differs from the default OpenAI one.
+ *
+ * An explicit `baseUrl` (anything other than `null`/`undefined`) is the value
+ * checked; only when it is absent is `process.env.OPENAI_TTS_BASE_URL` consulted.
+ */
+function isCustomOpenAIEndpoint(baseUrl?: string): boolean {
+  const candidate = baseUrl ?? process.env.OPENAI_TTS_BASE_URL;
+  return normalizeOpenAITtsBaseUrl(candidate) !== DEFAULT_OPENAI_BASE_URL;
+}
+
+/**
+ * Accepts any model for a custom endpoint; otherwise the model must be one of
+ * `OPENAI_TTS_MODELS`.
+ */
+function isValidOpenAIModel(model: string, baseUrl?: string): boolean {
+  const knownModels: readonly string[] = OPENAI_TTS_MODELS;
+  return isCustomOpenAIEndpoint(baseUrl) || knownModels.includes(model);
+}
+
+/**
+ * Accepts any voice for a custom endpoint; otherwise the voice must be one of
+ * `OPENAI_TTS_VOICES`.
+ *
+ * NOTE(review): for custom endpoints this returns `true` for arbitrary strings,
+ * so the `voice is OpenAiTtsVoice` narrowing is not a guarantee in that case.
+ */
+function isValidOpenAIVoice(voice: string, baseUrl?: string): voice is OpenAiTtsVoice {
+  const knownVoices: readonly string[] = OPENAI_TTS_VOICES;
+  return isCustomOpenAIEndpoint(baseUrl) || knownVoices.includes(voice);
+}
+
+/**
+ * Returns the trimmed instructions only when they are non-empty and the model
+ * name contains `gpt-4o-mini-tts`; otherwise `undefined`.
+ */
+function resolveOpenAITtsInstructions(model: string, instructions?: string): string | undefined {
+  const trimmed = instructions?.trim();
+  if (!trimmed) {
+    return undefined;
+  }
+  return model.includes("gpt-4o-mini-tts") ? trimmed : undefined;
+}
+
+/**
+ * Requests synthesized speech from an OpenAI-compatible `/audio/speech` endpoint.
+ *
+ * The model and voice are validated first (any value is accepted for a custom
+ * endpoint). The request is a JSON POST authenticated with a bearer token and
+ * is aborted via an `AbortController` once `timeoutMs` elapses.
+ *
+ * @param params - Synthesis options. `speed` is sent only when not nullish;
+ *   `instructions` is sent only when non-blank and the model name contains
+ *   `gpt-4o-mini-tts`.
+ * @returns The raw response body as a `Buffer`.
+ * @throws Error when the model or voice is rejected, or when the response
+ *   status is not OK.
+ */
+export async function openaiTTS(params: {
+  text: string;
+  apiKey: string;
+  baseUrl: string;
+  model: string;
+  voice: string;
+  speed?: number;
+  instructions?: string;
+  responseFormat: "mp3" | "opus" | "pcm";
+  timeoutMs: number;
+}): Promise<Buffer> {
+  const { text, apiKey, baseUrl, model, voice, speed, instructions, responseFormat, timeoutMs } =
+    params;
+  const effectiveInstructions = resolveOpenAITtsInstructions(model, instructions);
+
+  if (!isValidOpenAIModel(model, baseUrl)) {
+    throw new Error(`Invalid model: ${model}`);
+  }
+  if (!isValidOpenAIVoice(voice, baseUrl)) {
+    throw new Error(`Invalid voice: ${voice}`);
+  }
+
+  // Build the request URL from the same normalized base URL the validation
+  // above compares against. Using the raw value would turn a trailing slash
+  // into `.../v1//audio/speech`, and a blank value (validated as the default
+  // endpoint) into the relative URL `/audio/speech`.
+  const endpoint = `${normalizeOpenAITtsBaseUrl(baseUrl)}/audio/speech`;
+
+  const controller = new AbortController();
+  const timeout = setTimeout(() => controller.abort(), timeoutMs);
+
+  try {
+    const response = await fetch(endpoint, {
+      method: "POST",
+      headers: {
+        Authorization: `Bearer ${apiKey}`,
+        "Content-Type": "application/json",
+      },
+      body: JSON.stringify({
+        model,
+        input: text,
+        voice,
+        response_format: responseFormat,
+        // Optional fields are spread in only when present so they are absent
+        // from the payload rather than sent as null.
+        ...(speed != null && { speed }),
+        ...(effectiveInstructions != null && { instructions: effectiveInstructions }),
+      }),
+      signal: controller.signal,
+    });
+
+    if (!response.ok) {
+      throw new Error(`OpenAI TTS API error (${response.status})`);
+    }
+
+    return Buffer.from(await response.arrayBuffer());
+  } finally {
+    // Always cancel the timer so it cannot fire after the call has settled.
+    clearTimeout(timeout);
+  }
+}
diff --git a/extensions/zai/openclaw.plugin.json b/extensions/zai/openclaw.plugin.json
index 2a7e1c8b40a..0e998d152f7 100644
--- a/extensions/zai/openclaw.plugin.json
+++ b/extensions/zai/openclaw.plugin.json
@@ -1,6 +1,7 @@
 {
   "id": "zai",
   "providers": ["zai"],
+  "mediaUnderstandingProviders": ["zai"],
   "providerAuthEnvVars": {
     "zai": ["ZAI_API_KEY", "Z_AI_API_KEY"]
   },
diff --git a/package.json b/package.json
index 6f41b3544f0..006808c3926 100644
--- a/package.json
+++ b/package.json
@@ -453,6 +453,10 @@
       "types": "./dist/plugin-sdk/provider-env-vars.d.ts",
       "default": "./dist/plugin-sdk/provider-env-vars.js"
     },
+    "./plugin-sdk/provider-http": {
+      "types": "./dist/plugin-sdk/provider-http.d.ts",
+      "default": "./dist/plugin-sdk/provider-http.js"
+    },
     "./plugin-sdk/provider-google": {
       "types": "./dist/plugin-sdk/provider-google.d.ts",
       "default": "./dist/plugin-sdk/provider-google.js"
@@ -529,6 +533,10 @@
       "types": "./dist/plugin-sdk/telegram-core.d.ts",
       "default": "./dist/plugin-sdk/telegram-core.js"
     },
+    "./plugin-sdk/telegram-runtime": {
+      "types": "./dist/plugin-sdk/telegram-runtime.d.ts",
+      "default": "./dist/plugin-sdk/telegram-runtime.js"
+    },
     "./plugin-sdk/thread-ownership": {
       "types": "./dist/plugin-sdk/thread-ownership.d.ts",
       "default": "./dist/plugin-sdk/thread-ownership.js"
diff --git a/scripts/generate-bundled-plugin-metadata.mjs b/scripts/generate-bundled-plugin-metadata.mjs
index 4fea8dd62f8..161b6bade7e 100644
--- a/scripts/generate-bundled-plugin-metadata.mjs
+++ b/scripts/generate-bundled-plugin-metadata.mjs
@@ -103,6 +103,15 @@ function normalizePluginManifest(raw) {
     ...(normalizeStringList(raw.providers)
       ? { providers: normalizeStringList(raw.providers) }
       : {}),
+    ...(normalizeStringList(raw.speechProviders)
+      ? { speechProviders: normalizeStringList(raw.speechProviders) }
+      : {}),
+    ...(normalizeStringList(raw.mediaUnderstandingProviders)
+      ? { mediaUnderstandingProviders: normalizeStringList(raw.mediaUnderstandingProviders) }
+      : {}),
+    ...(normalizeStringList(raw.imageGenerationProviders)
+      ? { imageGenerationProviders: normalizeStringList(raw.imageGenerationProviders) }
+      : {}),
     ...(normalizeObject(raw.providerAuthEnvVars)
       ? { providerAuthEnvVars: raw.providerAuthEnvVars }
       : {}),
diff --git a/scripts/lib/plugin-sdk-entrypoints.json b/scripts/lib/plugin-sdk-entrypoints.json
index 8144c4a9701..f3de2a2ab96 100644
--- a/scripts/lib/plugin-sdk-entrypoints.json
+++ b/scripts/lib/plugin-sdk-entrypoints.json
@@ -103,6 +103,7 @@
   "provider-catalog",
   "provider-entry",
   "provider-env-vars",
+  "provider-http",
   "provider-google",
   "provider-models",
   "provider-onboard",
@@ -122,6 +123,7 @@
   "state-paths",
   "telegram",
   "telegram-core",
+  "telegram-runtime",
   "thread-ownership",
   "tlon",
   "tool-send",
diff --git a/src/media-understanding/apply.ts b/src/media-understanding/apply.ts
index 7721dae16b0..613f65d6658 100644
--- a/src/media-understanding/apply.ts
+++ b/src/media-understanding/apply.ts
@@ -461,7 +461,7 @@ export async function applyMediaUnderstanding(params: {
       .find((value) => value && value.trim()) ?? undefined;
 
   const attachments = normalizeMediaAttachments(ctx);
-  const providerRegistry = buildProviderRegistry(params.providers);
+  const providerRegistry = buildProviderRegistry(params.providers, cfg);
   const cache = createMediaAttachmentCache(attachments, {
     localPathRoots: resolveMediaAttachmentLocalRoots({ cfg, ctx }),
   });
diff --git a/src/media-understanding/audio-transcription-runner.ts b/src/media-understanding/audio-transcription-runner.ts
index 3ef2fdfa0fa..2cbc5f15563 100644
--- a/src/media-understanding/audio-transcription-runner.ts
+++ b/src/media-understanding/audio-transcription-runner.ts
@@ -23,7 +23,7 @@ export async function runAudioTranscription(params: {
     return { transcript: undefined, attachments };
   }
 
-  const providerRegistry = buildProviderRegistry(params.providers);
+  const providerRegistry = buildProviderRegistry(params.providers, params.cfg);
   const cache = createMediaAttachmentCache(
     attachments,
     params.localPathRoots ? { localPathRoots: params.localPathRoots } : undefined,
diff --git a/src/media-understanding/provider-registry.test.ts b/src/media-understanding/provider-registry.test.ts
index 46885c6d45d..84ef2df4928 100644
--- a/src/media-understanding/provider-registry.test.ts
+++ b/src/media-understanding/provider-registry.test.ts
@@ -11,15 +11,10 @@ describe("media-understanding provider registry", () => {
     setActivePluginRegistry(createEmptyPluginRegistry());
   });
 
-  it("keeps core-owned fallback providers registered by default", () => {
+  it("returns no providers by default when no active registry is present", () => {
     const registry = buildMediaUnderstandingRegistry();
-    const groqProvider = getMediaUnderstandingProvider("groq", registry);
-    const deepgramProvider = getMediaUnderstandingProvider("deepgram", registry);
-
-    expect(groqProvider?.id).toBe("groq");
-    expect(groqProvider?.capabilities).toEqual(["audio"]);
-    expect(deepgramProvider?.id).toBe("deepgram");
-    expect(deepgramProvider?.capabilities).toEqual(["audio"]);
+    expect(getMediaUnderstandingProvider("groq", registry)).toBeUndefined();
+    expect(getMediaUnderstandingProvider("deepgram", registry)).toBeUndefined();
   });
 
   it("merges plugin-registered media providers into the active registry", async () => {
diff --git a/src/media-understanding/provider-registry.ts b/src/media-understanding/provider-registry.ts
index 9441ccf5a7c..018d4edc58a 100644
--- a/src/media-understanding/provider-registry.ts
+++ b/src/media-understanding/provider-registry.ts
@@ -1,18 +1,9 @@
 import type { OpenClawConfig } from "../config/config.js";
-import {
-  deepgramMediaUnderstandingProvider,
-  groqMediaUnderstandingProvider,
-} from "../plugin-sdk/media-understanding.js";
 import { loadOpenClawPlugins } from "../plugins/loader.js";
 import { getActivePluginRegistry } from "../plugins/runtime.js";
 import { normalizeMediaProviderId } from "./provider-id.js";
 import type { MediaUnderstandingProvider } from "./types.js";
 
-const PROVIDERS: MediaUnderstandingProvider[] = [
-  groqMediaUnderstandingProvider,
-  deepgramMediaUnderstandingProvider,
-];
-
 function mergeProviderIntoRegistry(
   registry: Map<string, MediaUnderstandingProvider>,
   provider: MediaUnderstandingProvider,
@@ -36,12 +27,9 @@ export function buildMediaUnderstandingRegistry(
   cfg?: OpenClawConfig,
 ): Map<string, MediaUnderstandingProvider> {
   const registry = new Map<string, MediaUnderstandingProvider>();
-  for (const provider of PROVIDERS) {
-    mergeProviderIntoRegistry(registry, provider);
-  }
   const active = getActivePluginRegistry();
   const pluginRegistry =
-    (active?.mediaUnderstandingProviders?.length ?? 0) > 0
+    (active?.mediaUnderstandingProviders?.length ?? 0) > 0 || !cfg
       ? active
       : loadOpenClawPlugins({ config: cfg });
   for (const entry of pluginRegistry?.mediaUnderstandingProviders ?? []) {
diff --git a/src/media-understanding/runner.ts b/src/media-understanding/runner.ts
index cb4934a5e34..fa9a7379e23 100644
--- a/src/media-understanding/runner.ts
+++ b/src/media-understanding/runner.ts
@@ -494,7 +494,7 @@ export async function resolveAutoImageModel(params: {
   agentDir?: string;
   activeModel?: ActiveMediaModel;
 }): Promise<ActiveMediaModel | null> {
-  const providerRegistry = buildProviderRegistry();
+  const providerRegistry = buildProviderRegistry(undefined, params.cfg);
   const toActive = (entry: MediaUnderstandingModelConfig | null): ActiveMediaModel | null => {
     if (!entry || entry.type === "cli") {
       return null;
diff --git a/src/plugin-sdk/channel-actions.ts b/src/plugin-sdk/channel-actions.ts
index fe04c82a466..39d5b94ddaf 100644
--- a/src/plugin-sdk/channel-actions.ts
+++ b/src/plugin-sdk/channel-actions.ts
@@ -7,6 +7,7 @@ export { optionalStringEnum, stringEnum } from "../agents/schema/typebox.js";
 import { Type } from "@sinclair/typebox";
 import type { TSchema } from "@sinclair/typebox";
 import { stringEnum } from "../agents/schema/typebox.js";
+// NOTE: optionalStringEnum and stringEnum are already re-exported near the top of this module.
 
 /** Schema helper for channels that expose button rows on the shared `message` tool. */
 export function createMessageToolButtonsSchema(): TSchema {
diff --git a/src/plugin-sdk/image-generation.ts b/src/plugin-sdk/image-generation.ts
index 1dc0c04b403..0c37a9ece7f 100644
--- a/src/plugin-sdk/image-generation.ts
+++ b/src/plugin-sdk/image-generation.ts
@@ -8,7 +8,3 @@ export type {
   ImageGenerationResult,
   ImageGenerationSourceImage,
 } from "../image-generation/types.js";
-
-export { buildFalImageGenerationProvider } from "../../extensions/fal/image-generation-provider.js";
-export { buildGoogleImageGenerationProvider } from "../../extensions/google/image-generation-provider.js";
-export { buildOpenAIImageGenerationProvider } from "../../extensions/openai/image-generation-provider.js";
diff --git a/src/plugin-sdk/index.test.ts b/src/plugin-sdk/index.test.ts
index c801e43218e..74cee9af15a 100644
--- a/src/plugin-sdk/index.test.ts
+++ b/src/plugin-sdk/index.test.ts
@@ -89,9 +89,6 @@ describe("plugin-sdk exports", () => {
   it("keeps the root runtime surface intentionally small", async () => {
     const runtimeExports = await collectRuntimeExports(path.join(import.meta.dirname, "index.ts"));
     expect([...runtimeExports].toSorted()).toEqual([
-      "buildFalImageGenerationProvider",
-      "buildGoogleImageGenerationProvider",
-      "buildOpenAIImageGenerationProvider",
       "delegateCompactionToRuntime",
       "emptyPluginConfigSchema",
       "onDiagnosticEvent",
diff --git a/src/plugin-sdk/media-understanding.ts b/src/plugin-sdk/media-understanding.ts
index 986f47357d7..cd6401ad675 100644
--- a/src/plugin-sdk/media-understanding.ts
+++ b/src/plugin-sdk/media-understanding.ts
@@ -18,12 +18,3 @@ export {
   describeImagesWithModel,
 } from "../media-understanding/image-runtime.js";
 export { transcribeOpenAiCompatibleAudio } from "../media-understanding/openai-compatible-audio.js";
-export {
-  assertOkOrThrowHttpError,
-  normalizeBaseUrl,
-  postJsonRequest,
-  postTranscriptionRequest,
-  requireTranscriptionText,
-} from "../media-understanding/shared.js";
-export { deepgramMediaUnderstandingProvider } from "../../extensions/deepgram/media-understanding-provider.js";
-export { groqMediaUnderstandingProvider } from "../../extensions/groq/media-understanding-provider.js";
diff --git a/src/plugin-sdk/provider-http.ts b/src/plugin-sdk/provider-http.ts
new file mode 100644
index 00000000000..de59b4c029b
--- /dev/null
+++ b/src/plugin-sdk/provider-http.ts
@@ -0,0 +1,12 @@
+// Shared provider-facing HTTP helpers. Keep generic transport utilities here so
+// capability SDKs do not depend on each other.
+
+export {
+  assertOkOrThrowHttpError,
+  fetchWithTimeout,
+  fetchWithTimeoutGuarded,
+  normalizeBaseUrl,
+  postJsonRequest,
+  postTranscriptionRequest,
+  requireTranscriptionText,
+} from "../media-understanding/shared.js";
diff --git a/src/plugin-sdk/speech-core.ts b/src/plugin-sdk/speech-core.ts
index e4af7a69486..75f9100fbe7 100644
--- a/src/plugin-sdk/speech-core.ts
+++ b/src/plugin-sdk/speech-core.ts
@@ -3,15 +3,4 @@
 export type { SpeechProviderPlugin } from "../plugins/types.js";
 export type { SpeechVoiceOption } from "../tts/provider-types.js";
 
-export {
-  edgeTTS,
-  elevenLabsTTS,
-  inferEdgeExtension,
-  OPENAI_TTS_MODELS,
-  OPENAI_TTS_VOICES,
-  openaiTTS,
-  parseTtsDirectives,
-} from "../tts/tts-core.js";
-
-export { resolvePreferredOpenClawTmpDir } from "../infra/tmp-openclaw-dir.js";
-export { isVoiceCompatibleAudio } from "../media/audio.js";
+export { parseTtsDirectives } from "../tts/tts-core.js";
diff --git a/src/plugin-sdk/speech.ts b/src/plugin-sdk/speech.ts
index 3c14de0238d..a98e98103e0 100644
--- a/src/plugin-sdk/speech.ts
+++ b/src/plugin-sdk/speech.ts
@@ -1,9 +1,4 @@
-// Public speech-provider builders for bundled or third-party plugins.
+// Public speech helpers for bundled or third-party plugins.
 
-export { buildElevenLabsSpeechProvider } from "../../extensions/elevenlabs/speech-provider.js";
-export { buildMicrosoftSpeechProvider } from "../../extensions/microsoft/speech-provider.js";
-export { buildOpenAISpeechProvider } from "../../extensions/openai/speech-provider.js";
-export { edgeTTS, elevenLabsTTS, inferEdgeExtension, openaiTTS } from "../tts/tts-core.js";
-export { OPENAI_TTS_MODELS, OPENAI_TTS_VOICES } from "../tts/tts-core.js";
 export { parseTtsDirectives } from "../tts/tts-core.js";
 export type { SpeechVoiceOption } from "../tts/provider-types.js";
diff --git a/src/plugin-sdk/subpaths.test.ts b/src/plugin-sdk/subpaths.test.ts
index c6188cddbaf..251ddd4450f 100644
--- a/src/plugin-sdk/subpaths.test.ts
+++ b/src/plugin-sdk/subpaths.test.ts
@@ -544,6 +544,36 @@ describe("plugin-sdk subpath exports", () => {
       "buildOptionalSecretInputSchema",
       "normalizeSecretInputString",
     ]);
+    expectSourceMentions("provider-http", [
+      "assertOkOrThrowHttpError",
+      "normalizeBaseUrl",
+      "postJsonRequest",
+      "postTranscriptionRequest",
+      "requireTranscriptionText",
+    ]);
+    expectSourceOmits("speech", [
+      "buildElevenLabsSpeechProvider",
+      "buildMicrosoftSpeechProvider",
+      "buildOpenAISpeechProvider",
+      "edgeTTS",
+      "elevenLabsTTS",
+      "inferEdgeExtension",
+      "openaiTTS",
+      "OPENAI_TTS_MODELS",
+      "OPENAI_TTS_VOICES",
+    ]);
+    expectSourceOmits("media-understanding", [
+      "deepgramMediaUnderstandingProvider",
+      "groqMediaUnderstandingProvider",
+      "assertOkOrThrowHttpError",
+      "postJsonRequest",
+      "postTranscriptionRequest",
+    ]);
+    expectSourceOmits("image-generation", [
+      "buildFalImageGenerationProvider",
+      "buildGoogleImageGenerationProvider",
+      "buildOpenAIImageGenerationProvider",
+    ]);
     expectSourceOmits("config-runtime", [
       "hasConfiguredSecretInput",
       "normalizeResolvedSecretInputString",
diff --git a/src/plugins/bundled-plugin-metadata.generated.ts b/src/plugins/bundled-plugin-metadata.generated.ts
index 8b38bd799a5..317f9b767b2 100644
--- a/src/plugins/bundled-plugin-metadata.generated.ts
+++ b/src/plugins/bundled-plugin-metadata.generated.ts
@@ -169,6 +169,7 @@ export const GENERATED_BUNDLED_PLUGIN_METADATA = [
         properties: {},
       },
       providers: ["anthropic"],
+      mediaUnderstandingProviders: ["anthropic"],
       providerAuthEnvVars: {
         anthropic: ["ANTHROPIC_OAUTH_TOKEN", "ANTHROPIC_API_KEY"],
       },
@@ -488,6 +489,7 @@ export const GENERATED_BUNDLED_PLUGIN_METADATA = [
         additionalProperties: false,
         properties: {},
       },
+      mediaUnderstandingProviders: ["deepgram"],
     },
   },
   {
@@ -859,6 +861,7 @@ export const GENERATED_BUNDLED_PLUGIN_METADATA = [
         additionalProperties: false,
         properties: {},
       },
+      speechProviders: ["elevenlabs"],
     },
   },
   {
@@ -925,6 +928,7 @@ export const GENERATED_BUNDLED_PLUGIN_METADATA = [
         properties: {},
       },
       providers: ["fal"],
+      imageGenerationProviders: ["fal"],
       providerAuthEnvVars: {
         fal: ["FAL_KEY"],
       },
@@ -1114,6 +1118,8 @@ export const GENERATED_BUNDLED_PLUGIN_METADATA = [
         },
       },
       providers: ["google", "google-gemini-cli"],
+      mediaUnderstandingProviders: ["google"],
+      imageGenerationProviders: ["google"],
       providerAuthEnvVars: {
         google: ["GEMINI_API_KEY", "GOOGLE_API_KEY"],
       },
@@ -1221,6 +1227,7 @@ export const GENERATED_BUNDLED_PLUGIN_METADATA = [
         additionalProperties: false,
         properties: {},
       },
+      mediaUnderstandingProviders: ["groq"],
     },
   },
   {
@@ -1782,6 +1789,7 @@ export const GENERATED_BUNDLED_PLUGIN_METADATA = [
         additionalProperties: false,
         properties: {},
       },
+      speechProviders: ["microsoft"],
     },
   },
   {
@@ -1854,6 +1862,8 @@ export const GENERATED_BUNDLED_PLUGIN_METADATA = [
         properties: {},
       },
       providers: ["minimax", "minimax-portal"],
+      mediaUnderstandingProviders: ["minimax", "minimax-portal"],
+      imageGenerationProviders: ["minimax", "minimax-portal"],
       providerAuthEnvVars: {
         minimax: ["MINIMAX_API_KEY"],
         "minimax-portal": ["MINIMAX_OAUTH_TOKEN", "MINIMAX_API_KEY"],
@@ -1931,6 +1941,7 @@ export const GENERATED_BUNDLED_PLUGIN_METADATA = [
         properties: {},
       },
       providers: ["mistral"],
+      mediaUnderstandingProviders: ["mistral"],
       providerAuthEnvVars: {
         mistral: ["MISTRAL_API_KEY"],
       },
@@ -2072,6 +2083,7 @@ export const GENERATED_BUNDLED_PLUGIN_METADATA = [
         },
       },
       providers: ["moonshot"],
+      mediaUnderstandingProviders: ["moonshot"],
       providerAuthEnvVars: {
         moonshot: ["MOONSHOT_API_KEY"],
       },
@@ -2363,6 +2375,9 @@ export const GENERATED_BUNDLED_PLUGIN_METADATA = [
         properties: {},
       },
       providers: ["openai", "openai-codex"],
+      speechProviders: ["openai"],
+      mediaUnderstandingProviders: ["openai", "openai-codex"],
+      imageGenerationProviders: ["openai"],
       providerAuthEnvVars: {
         openai: ["OPENAI_API_KEY"],
       },
@@ -4101,6 +4116,7 @@ export const GENERATED_BUNDLED_PLUGIN_METADATA = [
         properties: {},
       },
       providers: ["zai"],
+      mediaUnderstandingProviders: ["zai"],
       providerAuthEnvVars: {
         zai: ["ZAI_API_KEY", "Z_AI_API_KEY"],
       },
diff --git a/src/plugins/contracts/registry.contract.test.ts b/src/plugins/contracts/registry.contract.test.ts
index 0260026cecf..7f15fe700e8 100644
--- a/src/plugins/contracts/registry.contract.test.ts
+++ b/src/plugins/contracts/registry.contract.test.ts
@@ -120,6 +120,53 @@ describe("plugin contract registry", () => {
     expect(providerContractPluginIds).toEqual(bundledProviderPluginIds);
   });
 
+  it("covers every bundled speech plugin discovered from manifests", () => {
+    const byLocale = (left: string, right: string) => left.localeCompare(right);
+
+    // Manifest view: bundled plugins whose manifest lists at least one speech provider.
+    const manifestIds = loadPluginManifestRegistry({})
+      .plugins.filter((plugin) => plugin.origin === "bundled")
+      .filter((plugin) => (plugin.speechProviders?.length ?? 0) > 0)
+      .map((plugin) => plugin.id)
+      .toSorted(byLocale);
+
+    // Contract view: distinct plugin ids present in the speech provider contract registry.
+    const contractIds = speechProviderContractRegistry.map((entry) => entry.pluginId);
+    expect([...new Set(contractIds)].toSorted(byLocale)).toEqual(manifestIds);
+  });
+
+  it("covers every bundled media-understanding plugin discovered from manifests", () => {
+    const byLocale = (left: string, right: string) => left.localeCompare(right);
+
+    // Manifest view: bundled plugins listing at least one media-understanding provider.
+    const manifestIds = loadPluginManifestRegistry({})
+      .plugins.filter((plugin) => plugin.origin === "bundled")
+      .filter((plugin) => (plugin.mediaUnderstandingProviders?.length ?? 0) > 0)
+      .map((plugin) => plugin.id)
+      .toSorted(byLocale);
+
+    // Contract view: distinct plugin ids present in the media-understanding contract registry.
+    const contractIds = mediaUnderstandingProviderContractRegistry.map((entry) => entry.pluginId);
+
+    expect([...new Set(contractIds)].toSorted(byLocale)).toEqual(manifestIds);
+  });
+
+  it("covers every bundled image-generation plugin discovered from manifests", () => {
+    const byLocale = (left: string, right: string) => left.localeCompare(right);
+
+    // Manifest view: bundled plugins listing at least one image-generation provider.
+    const manifestIds = loadPluginManifestRegistry({})
+      .plugins.filter((plugin) => plugin.origin === "bundled")
+      .filter((plugin) => (plugin.imageGenerationProviders?.length ?? 0) > 0)
+      .map((plugin) => plugin.id)
+      .toSorted(byLocale);
+
+    // Contract view: distinct plugin ids present in the image-generation contract registry.
+    const contractIds = imageGenerationProviderContractRegistry.map((entry) => entry.pluginId);
+
+    expect([...new Set(contractIds)].toSorted(byLocale)).toEqual(manifestIds);
+  });
+
   it("covers every bundled web search plugin from the shared resolver", () => {
     const bundledWebSearchPluginIds = resolveBundledWebSearchPluginIds({});
 
diff --git a/src/plugins/contracts/registry.ts b/src/plugins/contracts/registry.ts
index 1af765586f6..73578d401f2 100644
--- a/src/plugins/contracts/registry.ts
+++ b/src/plugins/contracts/registry.ts
@@ -39,6 +39,7 @@ import xiaomiPlugin from "../../../extensions/xiaomi/index.js";
 import zaiPlugin from "../../../extensions/zai/index.js";
 import { bundledWebSearchPluginRegistrations } from "../../bundled-web-search-registry.js";
 import { createCapturedPluginRegistration } from "../captured-registration.js";
+import { loadPluginManifestRegistry } from "../manifest-registry.js";
 import { resolvePluginProviders } from "../provider-auth-choice.runtime.js";
 import type {
   ImageGenerationProviderPlugin,
@@ -85,21 +86,6 @@ const bundledWebSearchPlugins: Array<RegistrablePlugin & { credentialValue: unkn
     ...plugin,
     credentialValue,
   }));
-const bundledSpeechPlugins: RegistrablePlugin[] = [elevenLabsPlugin, microsoftPlugin, openAIPlugin];
-
-const bundledMediaUnderstandingPlugins: RegistrablePlugin[] = [
-  anthropicPlugin,
-  deepgramPlugin,
-  googlePlugin,
-  groqPlugin,
-  minimaxPlugin,
-  mistralPlugin,
-  moonshotPlugin,
-  openAIPlugin,
-  zaiPlugin,
-];
-
-const bundledImageGenerationPlugins: RegistrablePlugin[] = [falPlugin, googlePlugin, openAIPlugin];
 
 function captureRegistrations(plugin: RegistrablePlugin) {
   const captured = createCapturedPluginRegistration();
@@ -390,6 +376,43 @@ const bundledProviderPlugins = dedupePlugins([
   zaiPlugin,
 ]);
 
+const bundledRegistrablePluginsById = new Map(
+  dedupePlugins([
+    ...bundledProviderPlugins,
+    elevenLabsPlugin,
+    microsoftPlugin,
+    deepgramPlugin,
+    groqPlugin,
+    ...bundledWebSearchPlugins,
+  ]).map((plugin) => [plugin.id, plugin]),
+);
+
+/** Sorted ids of bundled manifest plugins that list at least one provider under `capability`. */
+function resolveBundledCapabilityPluginIds(
+  capability: "speechProviders" | "mediaUnderstandingProviders" | "imageGenerationProviders",
+): string[] {
+  const hasAny = (providers?: string[]) => (providers?.length ?? 0) > 0;
+  return loadPluginManifestRegistry({})
+    .plugins.filter((plugin) => plugin.origin === "bundled" && hasAny(plugin[capability]))
+    .map((plugin) => plugin.id)
+    .toSorted((left, right) => left.localeCompare(right));
+}
+
+/** Resolves capability plugin ids to bundled plugin objects; ids with no match are dropped. */
+function resolveBundledCapabilityPlugins(
+  capability: "speechProviders" | "mediaUnderstandingProviders" | "imageGenerationProviders",
+): RegistrablePlugin[] {
+  const ids = resolveBundledCapabilityPluginIds(capability);
+  const plugins = ids.map((pluginId) => bundledRegistrablePluginsById.get(pluginId));
+  return plugins.flatMap((plugin) => (plugin ? [plugin] : []));
+}
+
+const bundledSpeechPlugins = resolveBundledCapabilityPlugins("speechProviders");
+const bundledMediaUnderstandingPlugins = resolveBundledCapabilityPlugins(
+  "mediaUnderstandingProviders",
+);
+const bundledImageGenerationPlugins = resolveBundledCapabilityPlugins("imageGenerationProviders");
+
 const bundledPluginRegistrationList = dedupePlugins([
   ...bundledSpeechPlugins,
   ...bundledMediaUnderstandingPlugins,
diff --git a/src/plugins/manifest-registry.ts b/src/plugins/manifest-registry.ts
index adf601db5ca..18f3c6a6427 100644
--- a/src/plugins/manifest-registry.ts
+++ b/src/plugins/manifest-registry.ts
@@ -45,6 +45,9 @@ export type PluginManifestRecord = {
   kind?: PluginKind;
   channels: string[];
   providers: string[];
+  speechProviders?: string[];
+  mediaUnderstandingProviders?: string[];
+  imageGenerationProviders?: string[];
   cliBackends: string[];
   providerAuthEnvVars?: Record<string, string[]>;
   providerAuthChoices?: PluginManifest["providerAuthChoices"];
@@ -171,6 +174,9 @@ function buildRecord(params: {
     kind: params.manifest.kind,
     channels: params.manifest.channels ?? [],
     providers: params.manifest.providers ?? [],
+    speechProviders: params.manifest.speechProviders ?? [],
+    mediaUnderstandingProviders: params.manifest.mediaUnderstandingProviders ?? [],
+    imageGenerationProviders: params.manifest.imageGenerationProviders ?? [],
     cliBackends: params.manifest.cliBackends ?? [],
     providerAuthEnvVars: params.manifest.providerAuthEnvVars,
     providerAuthChoices: params.manifest.providerAuthChoices,
@@ -226,6 +232,9 @@ function buildBundleRecord(params: {
     bundleCapabilities: params.manifest.capabilities,
     channels: [],
     providers: [],
+    speechProviders: [],
+    mediaUnderstandingProviders: [],
+    imageGenerationProviders: [],
     cliBackends: [],
     skills: params.manifest.skills ?? [],
     settingsFiles: params.manifest.settingsFiles ?? [],
diff --git a/src/plugins/manifest.ts b/src/plugins/manifest.ts
index b1a7d593b46..50ec2d0aca0 100644
--- a/src/plugins/manifest.ts
+++ b/src/plugins/manifest.ts
@@ -15,6 +15,9 @@ export type PluginManifest = {
   kind?: PluginKind;
   channels?: string[];
   providers?: string[];
+  speechProviders?: string[];
+  mediaUnderstandingProviders?: string[];
+  imageGenerationProviders?: string[];
   /** Cheap startup activation lookup for plugin-owned CLI inference backends. */
   cliBackends?: string[];
   /** Cheap provider-auth env lookup without booting plugin runtime. */
@@ -205,6 +208,9 @@ export function loadPluginManifest(
   const version = typeof raw.version === "string" ? raw.version.trim() : undefined;
   const channels = normalizeStringList(raw.channels);
   const providers = normalizeStringList(raw.providers);
+  const speechProviders = normalizeStringList(raw.speechProviders);
+  const mediaUnderstandingProviders = normalizeStringList(raw.mediaUnderstandingProviders);
+  const imageGenerationProviders = normalizeStringList(raw.imageGenerationProviders);
   const cliBackends = normalizeStringList(raw.cliBackends);
   const providerAuthEnvVars = normalizeStringListRecord(raw.providerAuthEnvVars);
   const providerAuthChoices = normalizeProviderAuthChoices(raw.providerAuthChoices);
@@ -224,6 +230,9 @@ export function loadPluginManifest(
       kind,
       channels,
       providers,
+      speechProviders,
+      mediaUnderstandingProviders,
+      imageGenerationProviders,
       cliBackends,
       providerAuthEnvVars,
       providerAuthChoices,
diff --git a/src/tts/provider-registry.test.ts b/src/tts/provider-registry.test.ts
index b0b6a08abf5..5a084466e55 100644
--- a/src/tts/provider-registry.test.ts
+++ b/src/tts/provider-registry.test.ts
@@ -58,7 +58,7 @@ describe("speech provider registry", () => {
 
     const providers = listSpeechProviders();
 
-    expect(providers.map((provider) => provider.id)).toEqual(["openai", "elevenlabs", "microsoft"]);
+    expect(providers.map((provider) => provider.id)).toEqual(["openai"]);
     expect(loadOpenClawPluginsMock).not.toHaveBeenCalled();
   });
 
@@ -76,22 +76,14 @@ describe("speech provider registry", () => {
 
     const cfg = {} as OpenClawConfig;
 
-    expect(listSpeechProviders(cfg).map((provider) => provider.id)).toEqual([
-      "openai",
-      "elevenlabs",
-      "microsoft",
-    ]);
+    expect(listSpeechProviders(cfg).map((provider) => provider.id)).toEqual(["microsoft"]);
     expect(getSpeechProvider("edge", cfg)?.id).toBe("microsoft");
     expect(loadOpenClawPluginsMock).toHaveBeenCalledWith({ config: cfg });
   });
 
-  it("returns builtin providers when neither plugins nor active registry provide speech support", () => {
-    expect(listSpeechProviders().map((provider) => provider.id)).toEqual([
-      "openai",
-      "elevenlabs",
-      "microsoft",
-    ]);
-    expect(getSpeechProvider("openai")?.id).toBe("openai");
+  it("returns no providers when neither plugins nor active registry provide speech support", () => {
+    expect(listSpeechProviders()).toEqual([]);
+    expect(getSpeechProvider("openai")).toBeUndefined();
   });
 
   it("normalizes the legacy edge alias to microsoft", () => {
diff --git a/src/tts/provider-registry.ts b/src/tts/provider-registry.ts
index 372473e0674..5fc6485e066 100644
--- a/src/tts/provider-registry.ts
+++ b/src/tts/provider-registry.ts
@@ -1,18 +1,9 @@
-import { buildElevenLabsSpeechProvider } from "../../extensions/elevenlabs/speech-provider.js";
-import { buildMicrosoftSpeechProvider } from "../../extensions/microsoft/speech-provider.js";
-import { buildOpenAISpeechProvider } from "../../extensions/openai/speech-provider.js";
 import type { OpenClawConfig } from "../config/config.js";
 import { loadOpenClawPlugins } from "../plugins/loader.js";
 import { getActivePluginRegistry } from "../plugins/runtime.js";
 import type { SpeechProviderPlugin } from "../plugins/types.js";
 import type { SpeechProviderId } from "./provider-types.js";
 
-const BUILTIN_SPEECH_PROVIDER_BUILDERS = [
-  buildOpenAISpeechProvider,
-  buildElevenLabsSpeechProvider,
-  buildMicrosoftSpeechProvider,
-] as const satisfies readonly (() => SpeechProviderPlugin)[];
-
 function trimToUndefined(value: string | undefined): string | undefined {
   const trimmed = value?.trim().toLowerCase();
   return trimmed ? trimmed : undefined;
@@ -58,9 +49,6 @@ function buildProviderMaps(cfg?: OpenClawConfig): {
     }
   };
 
-  for (const buildProvider of BUILTIN_SPEECH_PROVIDER_BUILDERS) {
-    register(buildProvider());
-  }
   for (const provider of resolveSpeechProviderPluginEntries(cfg)) {
     register(provider);
   }
diff --git a/src/tts/tts-core.ts b/src/tts/tts-core.ts
index f665b005a51..809c18c2d78 100644
--- a/src/tts/tts-core.ts
+++ b/src/tts/tts-core.ts
@@ -1,6 +1,5 @@
-import { rmSync, statSync } from "node:fs";
+import { rmSync } from "node:fs";
 import { completeSimple, type TextContent } from "@mariozechner/pi-ai";
-import { EdgeTTS } from "node-edge-tts";
 import { getApiKeyForModel, requireApiKey } from "../agents/model-auth.js";
 import {
   buildModelAliasIndex,
@@ -18,7 +17,6 @@ import type {
   TtsDirectiveParseResult,
 } from "./tts.js";
 
-const DEFAULT_ELEVENLABS_BASE_URL = "https://api.elevenlabs.io";
 export const DEFAULT_OPENAI_BASE_URL = "https://api.openai.com/v1";
 const TEMP_FILE_CLEANUP_DELAY_MS = 5 * 60 * 1000; // 5 minutes
 
@@ -26,14 +24,6 @@ export function isValidVoiceId(voiceId: string): boolean {
   return /^[a-zA-Z0-9]{10,40}$/.test(voiceId);
 }
 
-function normalizeElevenLabsBaseUrl(baseUrl: string): string {
-  const trimmed = baseUrl.trim();
-  if (!trimmed) {
-    return DEFAULT_ELEVENLABS_BASE_URL;
-  }
-  return trimmed.replace(/\/+$/, "");
-}
-
 function normalizeOpenAITtsBaseUrl(baseUrl?: string): string {
   const trimmed = baseUrl?.trim();
   if (!trimmed) {
@@ -53,13 +43,6 @@ function requireInRange(value: number, min: number, max: number, label: string):
   }
 }
 
-function assertElevenLabsVoiceSettings(settings: ResolvedTtsConfig["elevenlabs"]["voiceSettings"]) {
-  requireInRange(settings.stability, 0, 1, "stability");
-  requireInRange(settings.similarityBoost, 0, 1, "similarityBoost");
-  requireInRange(settings.style, 0, 1, "style");
-  requireInRange(settings.speed, 0.5, 2, "speed");
-}
-
 function normalizeLanguageCode(code?: string): string | undefined {
   const trimmed = code?.trim();
   if (!trimmed) {
@@ -538,177 +521,3 @@ export function scheduleCleanup(
   }, delayMs);
   timer.unref();
 }
-
-export async function elevenLabsTTS(params: {
-  text: string;
-  apiKey: string;
-  baseUrl: string;
-  voiceId: string;
-  modelId: string;
-  outputFormat: string;
-  seed?: number;
-  applyTextNormalization?: "auto" | "on" | "off";
-  languageCode?: string;
-  voiceSettings: ResolvedTtsConfig["elevenlabs"]["voiceSettings"];
-  timeoutMs: number;
-}): Promise<Buffer> {
-  const {
-    text,
-    apiKey,
-    baseUrl,
-    voiceId,
-    modelId,
-    outputFormat,
-    seed,
-    applyTextNormalization,
-    languageCode,
-    voiceSettings,
-    timeoutMs,
-  } = params;
-  if (!isValidVoiceId(voiceId)) {
-    throw new Error("Invalid voiceId format");
-  }
-  assertElevenLabsVoiceSettings(voiceSettings);
-  const normalizedLanguage = normalizeLanguageCode(languageCode);
-  const normalizedNormalization = normalizeApplyTextNormalization(applyTextNormalization);
-  const normalizedSeed = normalizeSeed(seed);
-
-  const controller = new AbortController();
-  const timeout = setTimeout(() => controller.abort(), timeoutMs);
-
-  try {
-    const url = new URL(`${normalizeElevenLabsBaseUrl(baseUrl)}/v1/text-to-speech/${voiceId}`);
-    if (outputFormat) {
-      url.searchParams.set("output_format", outputFormat);
-    }
-
-    const response = await fetch(url.toString(), {
-      method: "POST",
-      headers: {
-        "xi-api-key": apiKey,
-        "Content-Type": "application/json",
-        Accept: "audio/mpeg",
-      },
-      body: JSON.stringify({
-        text,
-        model_id: modelId,
-        seed: normalizedSeed,
-        apply_text_normalization: normalizedNormalization,
-        language_code: normalizedLanguage,
-        voice_settings: {
-          stability: voiceSettings.stability,
-          similarity_boost: voiceSettings.similarityBoost,
-          style: voiceSettings.style,
-          use_speaker_boost: voiceSettings.useSpeakerBoost,
-          speed: voiceSettings.speed,
-        },
-      }),
-      signal: controller.signal,
-    });
-
-    if (!response.ok) {
-      throw new Error(`ElevenLabs API error (${response.status})`);
-    }
-
-    return Buffer.from(await response.arrayBuffer());
-  } finally {
-    clearTimeout(timeout);
-  }
-}
-
-export async function openaiTTS(params: {
-  text: string;
-  apiKey: string;
-  baseUrl: string;
-  model: string;
-  voice: string;
-  speed?: number;
-  instructions?: string;
-  responseFormat: "mp3" | "opus" | "pcm";
-  timeoutMs: number;
-}): Promise<Buffer> {
-  const { text, apiKey, baseUrl, model, voice, speed, instructions, responseFormat, timeoutMs } =
-    params;
-  const effectiveInstructions = resolveOpenAITtsInstructions(model, instructions);
-
-  if (!isValidOpenAIModel(model, baseUrl)) {
-    throw new Error(`Invalid model: ${model}`);
-  }
-  if (!isValidOpenAIVoice(voice, baseUrl)) {
-    throw new Error(`Invalid voice: ${voice}`);
-  }
-
-  const controller = new AbortController();
-  const timeout = setTimeout(() => controller.abort(), timeoutMs);
-
-  try {
-    const response = await fetch(`${baseUrl}/audio/speech`, {
-      method: "POST",
-      headers: {
-        Authorization: `Bearer ${apiKey}`,
-        "Content-Type": "application/json",
-      },
-      body: JSON.stringify({
-        model,
-        input: text,
-        voice,
-        response_format: responseFormat,
-        ...(speed != null && { speed }),
-        ...(effectiveInstructions != null && { instructions: effectiveInstructions }),
-      }),
-      signal: controller.signal,
-    });
-
-    if (!response.ok) {
-      throw new Error(`OpenAI TTS API error (${response.status})`);
-    }
-
-    return Buffer.from(await response.arrayBuffer());
-  } finally {
-    clearTimeout(timeout);
-  }
-}
-
-export function inferEdgeExtension(outputFormat: string): string {
-  const normalized = outputFormat.toLowerCase();
-  if (normalized.includes("webm")) {
-    return ".webm";
-  }
-  if (normalized.includes("ogg")) {
-    return ".ogg";
-  }
-  if (normalized.includes("opus")) {
-    return ".opus";
-  }
-  if (normalized.includes("wav") || normalized.includes("riff") || normalized.includes("pcm")) {
-    return ".wav";
-  }
-  return ".mp3";
-}
-
-export async function edgeTTS(params: {
-  text: string;
-  outputPath: string;
-  config: ResolvedTtsConfig["edge"];
-  timeoutMs: number;
-}): Promise<void> {
-  const { text, outputPath, config, timeoutMs } = params;
-  const tts = new EdgeTTS({
-    voice: config.voice,
-    lang: config.lang,
-    outputFormat: config.outputFormat,
-    saveSubtitles: config.saveSubtitles,
-    proxy: config.proxy,
-    rate: config.rate,
-    pitch: config.pitch,
-    volume: config.volume,
-    timeout: config.timeoutMs ?? timeoutMs,
-  });
-  await tts.ttsPromise(text, outputPath);
-
-  const { size } = statSync(outputPath);
-
-  if (size === 0) {
-    throw new Error("Edge TTS produced empty audio file");
-  }
-}