refactor: clean plugin capability boundaries

This commit is contained in:
Peter Steinberger
2026-03-26 21:40:58 +00:00
parent d00dc5f46b
commit ce9dff1458
49 changed files with 572 additions and 342 deletions

View File

@@ -5,33 +5,6 @@
"category": "legacy",
"entrypoint": "index",
"exports": [
{
"declaration": "export function buildFalImageGenerationProvider(): ImageGenerationProvider;",
"exportName": "buildFalImageGenerationProvider",
"kind": "function",
"source": {
"line": 190,
"path": "extensions/fal/image-generation-provider.ts"
}
},
{
"declaration": "export function buildGoogleImageGenerationProvider(): ImageGenerationProvider;",
"exportName": "buildGoogleImageGenerationProvider",
"kind": "function",
"source": {
"line": 98,
"path": "extensions/google/image-generation-provider.ts"
}
},
{
"declaration": "export function buildOpenAIImageGenerationProvider(): ImageGenerationProvider;",
"exportName": "buildOpenAIImageGenerationProvider",
"kind": "function",
"source": {
"line": 22,
"path": "extensions/openai/image-generation-provider.ts"
}
},
{
"declaration": "export function delegateCompactionToRuntime(params: { sessionId: string; sessionKey?: string | undefined; sessionFile: string; tokenBudget?: number | undefined; force?: boolean | undefined; currentTokenCount?: number | undefined; compactionTarget?: \"budget\" | ... 1 more ... | undefined; customInstructions?: string | undefined; runtimeContext?: ContextEngineRuntimeContext | undefined; }): Promise<...>;",
"exportName": "delegateCompactionToRuntime",
@@ -923,7 +896,7 @@
"exportName": "createMessageToolButtonsSchema",
"kind": "function",
"source": {
"line": 11,
"line": 12,
"path": "src/plugin-sdk/channel-actions.ts"
}
},
@@ -932,7 +905,7 @@
"exportName": "createMessageToolCardSchema",
"kind": "function",
"source": {
"line": 29,
"line": 30,
"path": "src/plugin-sdk/channel-actions.ts"
}
},
@@ -954,6 +927,15 @@
"path": "src/channels/plugins/actions/shared.ts"
}
},
{
"declaration": "export function optionalStringEnum<T extends readonly string[]>(values: T, options?: StringEnumOptions<T>): TOptional<TUnsafe<T[number]>>;",
"exportName": "optionalStringEnum",
"kind": "function",
"source": {
"line": 31,
"path": "src/agents/schema/typebox.ts"
}
},
{
"declaration": "export function resolveReactionMessageId(params: { args: Record<string, unknown>; toolContext?: ReactionToolContext | undefined; }): string | number | undefined;",
"exportName": "resolveReactionMessageId",
@@ -962,6 +944,15 @@
"line": 7,
"path": "src/channels/plugins/actions/reaction-message-id.ts"
}
},
{
"declaration": "export function stringEnum<T extends readonly string[]>(values: T, options?: StringEnumOptions<T>): TUnsafe<T[number]>;",
"exportName": "stringEnum",
"kind": "function",
"source": {
"line": 15,
"path": "src/agents/schema/typebox.ts"
}
}
],
"importSpecifier": "openclaw/plugin-sdk/channel-actions",

View File

@@ -1,7 +1,4 @@
{"category":"legacy","entrypoint":"index","importSpecifier":"openclaw/plugin-sdk","recordType":"module","sourceLine":1,"sourcePath":"src/plugin-sdk/index.ts"}
{"declaration":"export function buildFalImageGenerationProvider(): ImageGenerationProvider;","entrypoint":"index","exportName":"buildFalImageGenerationProvider","importSpecifier":"openclaw/plugin-sdk","kind":"function","recordType":"export","sourceLine":190,"sourcePath":"extensions/fal/image-generation-provider.ts"}
{"declaration":"export function buildGoogleImageGenerationProvider(): ImageGenerationProvider;","entrypoint":"index","exportName":"buildGoogleImageGenerationProvider","importSpecifier":"openclaw/plugin-sdk","kind":"function","recordType":"export","sourceLine":98,"sourcePath":"extensions/google/image-generation-provider.ts"}
{"declaration":"export function buildOpenAIImageGenerationProvider(): ImageGenerationProvider;","entrypoint":"index","exportName":"buildOpenAIImageGenerationProvider","importSpecifier":"openclaw/plugin-sdk","kind":"function","recordType":"export","sourceLine":22,"sourcePath":"extensions/openai/image-generation-provider.ts"}
{"declaration":"export function delegateCompactionToRuntime(params: { sessionId: string; sessionKey?: string | undefined; sessionFile: string; tokenBudget?: number | undefined; force?: boolean | undefined; currentTokenCount?: number | undefined; compactionTarget?: \"budget\" | ... 1 more ... | undefined; customInstructions?: string | undefined; runtimeContext?: ContextEngineRuntimeContext | undefined; }): Promise<...>;","entrypoint":"index","exportName":"delegateCompactionToRuntime","importSpecifier":"openclaw/plugin-sdk","kind":"function","recordType":"export","sourceLine":16,"sourcePath":"src/context-engine/delegate.ts"}
{"declaration":"export function emptyPluginConfigSchema(): OpenClawPluginConfigSchema;","entrypoint":"index","exportName":"emptyPluginConfigSchema","importSpecifier":"openclaw/plugin-sdk","kind":"function","recordType":"export","sourceLine":13,"sourcePath":"src/plugins/config-schema.ts"}
{"declaration":"export function onDiagnosticEvent(listener: (evt: DiagnosticEventPayload) => void): () => void;","entrypoint":"index","exportName":"onDiagnosticEvent","importSpecifier":"openclaw/plugin-sdk","kind":"function","recordType":"export","sourceLine":229,"sourcePath":"src/infra/diagnostic-events.ts"}
@@ -100,11 +97,13 @@
{"declaration":"export type BasicAllowlistResolutionEntry = BasicAllowlistResolutionEntry;","entrypoint":"allow-from","exportName":"BasicAllowlistResolutionEntry","importSpecifier":"openclaw/plugin-sdk/allow-from","kind":"type","recordType":"export","sourceLine":129,"sourcePath":"src/plugin-sdk/allow-from.ts"}
{"declaration":"export type CompiledAllowlist = CompiledAllowlist;","entrypoint":"allow-from","exportName":"CompiledAllowlist","importSpecifier":"openclaw/plugin-sdk/allow-from","kind":"type","recordType":"export","sourceLine":19,"sourcePath":"src/channels/allowlist-match.ts"}
{"category":"channel","entrypoint":"channel-actions","importSpecifier":"openclaw/plugin-sdk/channel-actions","recordType":"module","sourceLine":1,"sourcePath":"src/plugin-sdk/channel-actions.ts"}
{"declaration":"export function createMessageToolButtonsSchema(): TSchema;","entrypoint":"channel-actions","exportName":"createMessageToolButtonsSchema","importSpecifier":"openclaw/plugin-sdk/channel-actions","kind":"function","recordType":"export","sourceLine":11,"sourcePath":"src/plugin-sdk/channel-actions.ts"}
{"declaration":"export function createMessageToolCardSchema(): TSchema;","entrypoint":"channel-actions","exportName":"createMessageToolCardSchema","importSpecifier":"openclaw/plugin-sdk/channel-actions","kind":"function","recordType":"export","sourceLine":29,"sourcePath":"src/plugin-sdk/channel-actions.ts"}
{"declaration":"export function createMessageToolButtonsSchema(): TSchema;","entrypoint":"channel-actions","exportName":"createMessageToolButtonsSchema","importSpecifier":"openclaw/plugin-sdk/channel-actions","kind":"function","recordType":"export","sourceLine":12,"sourcePath":"src/plugin-sdk/channel-actions.ts"}
{"declaration":"export function createMessageToolCardSchema(): TSchema;","entrypoint":"channel-actions","exportName":"createMessageToolCardSchema","importSpecifier":"openclaw/plugin-sdk/channel-actions","kind":"function","recordType":"export","sourceLine":30,"sourcePath":"src/plugin-sdk/channel-actions.ts"}
{"declaration":"export function createUnionActionGate<TAccount, TKey extends string>(accounts: readonly TAccount[], createGate: (account: TAccount) => OptionalDefaultGate<TKey>): OptionalDefaultGate<TKey>;","entrypoint":"channel-actions","exportName":"createUnionActionGate","importSpecifier":"openclaw/plugin-sdk/channel-actions","kind":"function","recordType":"export","sourceLine":13,"sourcePath":"src/channels/plugins/actions/shared.ts"}
{"declaration":"export function listTokenSourcedAccounts<TAccount extends TokenSourcedAccount>(accounts: readonly TAccount[]): TAccount[];","entrypoint":"channel-actions","exportName":"listTokenSourcedAccounts","importSpecifier":"openclaw/plugin-sdk/channel-actions","kind":"function","recordType":"export","sourceLine":7,"sourcePath":"src/channels/plugins/actions/shared.ts"}
{"declaration":"export function optionalStringEnum<T extends readonly string[]>(values: T, options?: StringEnumOptions<T>): TOptional<TUnsafe<T[number]>>;","entrypoint":"channel-actions","exportName":"optionalStringEnum","importSpecifier":"openclaw/plugin-sdk/channel-actions","kind":"function","recordType":"export","sourceLine":31,"sourcePath":"src/agents/schema/typebox.ts"}
{"declaration":"export function resolveReactionMessageId(params: { args: Record<string, unknown>; toolContext?: ReactionToolContext | undefined; }): string | number | undefined;","entrypoint":"channel-actions","exportName":"resolveReactionMessageId","importSpecifier":"openclaw/plugin-sdk/channel-actions","kind":"function","recordType":"export","sourceLine":7,"sourcePath":"src/channels/plugins/actions/reaction-message-id.ts"}
{"declaration":"export function stringEnum<T extends readonly string[]>(values: T, options?: StringEnumOptions<T>): TUnsafe<T[number]>;","entrypoint":"channel-actions","exportName":"stringEnum","importSpecifier":"openclaw/plugin-sdk/channel-actions","kind":"function","recordType":"export","sourceLine":15,"sourcePath":"src/agents/schema/typebox.ts"}
{"category":"channel","entrypoint":"channel-config-schema","importSpecifier":"openclaw/plugin-sdk/channel-config-schema","recordType":"module","sourceLine":1,"sourcePath":"src/plugin-sdk/channel-config-schema.ts"}
{"declaration":"export function buildCatchallMultiAccountChannelSchema<T extends ExtendableZodObject>(accountSchema: T): T;","entrypoint":"channel-config-schema","exportName":"buildCatchallMultiAccountChannelSchema","importSpecifier":"openclaw/plugin-sdk/channel-config-schema","kind":"function","recordType":"export","sourceLine":26,"sourcePath":"src/channels/plugins/config-schema.ts"}
{"declaration":"export function buildChannelConfigSchema(schema: ZodType<unknown, unknown, $ZodTypeInternals<unknown, unknown>>): ChannelConfigSchema;","entrypoint":"channel-config-schema","exportName":"buildChannelConfigSchema","importSpecifier":"openclaw/plugin-sdk/channel-config-schema","kind":"function","recordType":"export","sourceLine":35,"sourcePath":"src/channels/plugins/config-schema.ts"}

View File

@@ -1,6 +1,7 @@
{
"id": "anthropic",
"providers": ["anthropic"],
"mediaUnderstandingProviders": ["anthropic"],
"cliBackends": ["claude-cli"],
"providerAuthEnvVars": {
"anthropic": ["ANTHROPIC_OAUTH_TOKEN", "ANTHROPIC_API_KEY"]

View File

@@ -7,7 +7,7 @@ import {
normalizeBaseUrl,
postTranscriptionRequest,
requireTranscriptionText,
} from "openclaw/plugin-sdk/media-understanding";
} from "openclaw/plugin-sdk/provider-http";
export const DEFAULT_DEEPGRAM_AUDIO_BASE_URL = "https://api.deepgram.com/v1";
export const DEFAULT_DEEPGRAM_AUDIO_MODEL = "nova-3";

View File

@@ -1,5 +1,6 @@
{
"id": "deepgram",
"mediaUnderstandingProviders": ["deepgram"],
"configSchema": {
"type": "object",
"additionalProperties": false,

View File

@@ -1,5 +1,6 @@
{
"id": "elevenlabs",
"speechProviders": ["elevenlabs"],
"configSchema": {
"type": "object",
"additionalProperties": false,

View File

@@ -1,5 +1,6 @@
import type { SpeechProviderPlugin } from "openclaw/plugin-sdk/core";
import { elevenLabsTTS, type SpeechVoiceOption } from "openclaw/plugin-sdk/speech";
import type { SpeechVoiceOption } from "openclaw/plugin-sdk/speech";
import { elevenLabsTTS } from "./tts.js";
const ELEVENLABS_TTS_MODELS = [
"eleven_multilingual_v2",

View File

@@ -0,0 +1,150 @@
// Base endpoint used when the caller does not configure a custom URL.
const DEFAULT_ELEVENLABS_BASE_URL = "https://api.elevenlabs.io";

/** True when the id looks like an ElevenLabs voice id: 10-40 ASCII alphanumerics. */
function isValidVoiceId(voiceId: string): boolean {
  return /^[a-zA-Z0-9]{10,40}$/.test(voiceId);
}

/**
 * Normalizes a configured base URL: blank/undefined input falls back to the
 * default endpoint, and trailing slashes are stripped so path joining stays
 * predictable.
 */
function normalizeElevenLabsBaseUrl(baseUrl?: string): string {
  const candidate = baseUrl?.trim() ?? "";
  if (candidate.length === 0) {
    return DEFAULT_ELEVENLABS_BASE_URL;
  }
  return candidate.replace(/\/+$/, "");
}
/**
 * Validates and lowercases an optional ISO 639-1 language code.
 * Blank/undefined input yields undefined; anything that is not exactly two
 * ASCII letters after trimming is rejected.
 */
function normalizeLanguageCode(code?: string): string | undefined {
  const value = code?.trim().toLowerCase();
  if (!value) {
    return undefined;
  }
  if (!/^[a-z]{2}$/.test(value)) {
    throw new Error("languageCode must be a 2-letter ISO 639-1 code (e.g. en, de, fr)");
  }
  return value;
}
/**
 * Validates the optional text-normalization mode. Accepts "auto" | "on" | "off"
 * (case-insensitive, surrounding whitespace ignored); blank/undefined input
 * yields undefined; anything else throws.
 */
function normalizeApplyTextNormalization(mode?: string): "auto" | "on" | "off" | undefined {
  const value = mode?.trim().toLowerCase();
  if (!value) {
    return undefined;
  }
  switch (value) {
    case "auto":
    case "on":
    case "off":
      return value;
    default:
      throw new Error("applyTextNormalization must be one of: auto, on, off");
  }
}
/**
 * Floors an optional seed to an integer and validates it against the uint32
 * range [0, 4294967295]. null/undefined yields undefined; non-finite or
 * out-of-range input throws.
 */
function normalizeSeed(seed?: number): number | undefined {
  if (seed == null) {
    return undefined;
  }
  const floored = Math.floor(seed);
  const inRange = Number.isFinite(floored) && floored >= 0 && floored <= 4_294_967_295;
  if (!inRange) {
    throw new Error("seed must be between 0 and 4294967295");
  }
  return floored;
}
/** Throws unless `value` is a finite number within the inclusive range [min, max]. */
function requireInRange(value: number, min: number, max: number, label: string): void {
  const ok = Number.isFinite(value) && value >= min && value <= max;
  if (!ok) {
    throw new Error(`${label} must be between ${min} and ${max}`);
  }
}
/**
 * Validates ElevenLabs voice-setting ranges before a request is sent:
 * stability/similarityBoost/style must be in [0, 1] and speed in [0.5, 2].
 * useSpeakerBoost is a plain boolean and needs no range check.
 */
function assertElevenLabsVoiceSettings(settings: {
  stability: number;
  similarityBoost: number;
  style: number;
  useSpeakerBoost: boolean;
  speed: number;
}) {
  const checks: Array<[number, number, number, string]> = [
    [settings.stability, 0, 1, "stability"],
    [settings.similarityBoost, 0, 1, "similarityBoost"],
    [settings.style, 0, 1, "style"],
    [settings.speed, 0.5, 2, "speed"],
  ];
  for (const [value, min, max, label] of checks) {
    requireInRange(value, min, max, label);
  }
}
/**
 * Synthesizes speech via the ElevenLabs text-to-speech API and returns the
 * raw audio bytes.
 *
 * Validates the voice id and voice settings up front, normalizes the optional
 * language code / text-normalization mode / seed, and aborts the HTTP request
 * after `timeoutMs` milliseconds.
 *
 * @throws Error on an invalid voice id, out-of-range voice settings, invalid
 *         optional parameters, or a non-2xx API response.
 */
export async function elevenLabsTTS(params: {
  text: string;
  apiKey: string;
  baseUrl: string;
  voiceId: string;
  modelId: string;
  outputFormat: string;
  seed?: number;
  applyTextNormalization?: "auto" | "on" | "off";
  languageCode?: string;
  voiceSettings: {
    stability: number;
    similarityBoost: number;
    style: number;
    useSpeakerBoost: boolean;
    speed: number;
  };
  timeoutMs: number;
}): Promise<Buffer> {
  const {
    text,
    apiKey,
    baseUrl,
    voiceId,
    modelId,
    outputFormat,
    seed,
    applyTextNormalization,
    languageCode,
    voiceSettings,
    timeoutMs,
  } = params;
  // Reject suspicious ids early so arbitrary strings never reach the URL path below.
  if (!isValidVoiceId(voiceId)) {
    throw new Error("Invalid voiceId format");
  }
  assertElevenLabsVoiceSettings(voiceSettings);
  const normalizedLanguage = normalizeLanguageCode(languageCode);
  const normalizedNormalization = normalizeApplyTextNormalization(applyTextNormalization);
  const normalizedSeed = normalizeSeed(seed);
  // Abort the fetch if the API does not answer within timeoutMs.
  const controller = new AbortController();
  const timeout = setTimeout(() => controller.abort(), timeoutMs);
  try {
    const url = new URL(`${normalizeElevenLabsBaseUrl(baseUrl)}/v1/text-to-speech/${voiceId}`);
    if (outputFormat) {
      url.searchParams.set("output_format", outputFormat);
    }
    const response = await fetch(url.toString(), {
      method: "POST",
      headers: {
        "xi-api-key": apiKey,
        "Content-Type": "application/json",
        // NOTE(review): Accept is always audio/mpeg even for non-mp3 output
        // formats — presumably the API ignores it; confirm against the docs.
        Accept: "audio/mpeg",
      },
      body: JSON.stringify({
        text,
        model_id: modelId,
        // JSON.stringify drops undefined-valued fields, so unset optional
        // parameters are simply omitted from the request payload.
        seed: normalizedSeed,
        apply_text_normalization: normalizedNormalization,
        language_code: normalizedLanguage,
        voice_settings: {
          stability: voiceSettings.stability,
          similarity_boost: voiceSettings.similarityBoost,
          style: voiceSettings.style,
          use_speaker_boost: voiceSettings.useSpeakerBoost,
          speed: voiceSettings.speed,
        },
      }),
      signal: controller.signal,
    });
    if (!response.ok) {
      throw new Error(`ElevenLabs API error (${response.status})`);
    }
    return Buffer.from(await response.arrayBuffer());
  } finally {
    // Always clear the timer so the process can exit promptly.
    clearTimeout(timeout);
  }
}

View File

@@ -1,6 +1,7 @@
{
"id": "fal",
"providers": ["fal"],
"imageGenerationProviders": ["fal"],
"providerAuthEnvVars": {
"fal": ["FAL_KEY"]
},

View File

@@ -1,9 +1,4 @@
import type { ImageGenerationProvider } from "openclaw/plugin-sdk/image-generation";
import {
assertOkOrThrowHttpError,
normalizeBaseUrl,
postJsonRequest,
} from "openclaw/plugin-sdk/media-understanding";
import { resolveApiKeyForProvider } from "openclaw/plugin-sdk/provider-auth";
import {
DEFAULT_GOOGLE_API_BASE_URL,
@@ -11,6 +6,11 @@ import {
normalizeGoogleModelId,
parseGeminiAuth,
} from "openclaw/plugin-sdk/provider-google";
import {
assertOkOrThrowHttpError,
normalizeBaseUrl,
postJsonRequest,
} from "openclaw/plugin-sdk/provider-http";
const DEFAULT_GOOGLE_IMAGE_MODEL = "gemini-3.1-flash-image-preview";
const DEFAULT_OUTPUT_MIME = "image/png";

View File

@@ -1,15 +1,17 @@
import {
assertOkOrThrowHttpError,
describeImageWithModel,
describeImagesWithModel,
normalizeBaseUrl,
postJsonRequest,
type AudioTranscriptionRequest,
type AudioTranscriptionResult,
type MediaUnderstandingProvider,
type VideoDescriptionRequest,
type VideoDescriptionResult,
} from "openclaw/plugin-sdk/media-understanding";
import {
assertOkOrThrowHttpError,
normalizeBaseUrl,
postJsonRequest,
} from "openclaw/plugin-sdk/provider-http";
import {
DEFAULT_GOOGLE_API_BASE_URL,
normalizeGoogleApiBaseUrl,

View File

@@ -1,6 +1,8 @@
{
"id": "google",
"providers": ["google", "google-gemini-cli"],
"mediaUnderstandingProviders": ["google"],
"imageGenerationProviders": ["google"],
"cliBackends": ["google-gemini-cli"],
"providerAuthEnvVars": {
"google": ["GEMINI_API_KEY", "GOOGLE_API_KEY"]

View File

@@ -1,5 +1,6 @@
{
"id": "groq",
"mediaUnderstandingProviders": ["groq"],
"configSchema": {
"type": "object",
"additionalProperties": false,

View File

@@ -1,5 +1,6 @@
{
"id": "microsoft",
"speechProviders": ["microsoft"],
"configSchema": {
"type": "object",
"additionalProperties": false,

View File

@@ -8,7 +8,8 @@ import {
import type { SpeechProviderPlugin } from "openclaw/plugin-sdk/core";
import { resolvePreferredOpenClawTmpDir } from "openclaw/plugin-sdk/llm-task";
import { isVoiceCompatibleAudio } from "openclaw/plugin-sdk/media-runtime";
import { edgeTTS, inferEdgeExtension, type SpeechVoiceOption } from "openclaw/plugin-sdk/speech";
import type { SpeechVoiceOption } from "openclaw/plugin-sdk/speech";
import { edgeTTS, inferEdgeExtension } from "./tts.js";
const DEFAULT_EDGE_OUTPUT_FORMAT = "audio-24khz-48kbitrate-mono-mp3";

View File

@@ -3,7 +3,7 @@ import { tmpdir } from "node:os";
import path from "node:path";
import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
let edgeTTS: typeof import("./tts-core.js").edgeTTS;
let edgeTTS: typeof import("./tts.js").edgeTTS;
let mockTtsPromise = vi.fn<(text: string, filePath: string) => Promise<void>>();
@@ -16,15 +16,13 @@ vi.mock("node-edge-tts", () => ({
}));
const baseEdgeConfig = {
enabled: true,
voice: "en-US-MichelleNeural",
lang: "en-US",
outputFormat: "audio-24khz-48kbitrate-mono-mp3",
outputFormatConfigured: false,
saveSubtitles: false,
};
describe("edgeTTS empty audio validation", () => {
describe("edgeTTS empty audio validation", () => {
let tempDir: string | undefined;
beforeEach(async () => {
@@ -36,7 +34,7 @@ describe("edgeTTS empty audio validation", () => {
}
},
}));
({ edgeTTS } = await import("./tts-core.js"));
({ edgeTTS } = await import("./tts.js"));
});
afterEach(() => {

View File

@@ -0,0 +1,55 @@
import { statSync } from "node:fs";
import { EdgeTTS } from "node-edge-tts";
/**
 * Maps an Edge TTS output-format string (e.g. "audio-24khz-48kbitrate-mono-mp3")
 * to a file extension. Container hints are matched case-insensitively in
 * priority order; anything unrecognized defaults to ".mp3".
 */
export function inferEdgeExtension(outputFormat: string): string {
  const fmt = outputFormat.toLowerCase();
  // Order matters: first matching hint wins (mirrors the original checks).
  const table: Array<[string[], string]> = [
    [["webm"], ".webm"],
    [["ogg"], ".ogg"],
    [["opus"], ".opus"],
    [["wav", "riff", "pcm"], ".wav"],
  ];
  for (const [needles, ext] of table) {
    if (needles.some((needle) => fmt.includes(needle))) {
      return ext;
    }
  }
  return ".mp3";
}
/**
 * Generates speech with Microsoft Edge TTS (node-edge-tts) and writes the
 * audio to `outputPath`.
 *
 * A per-call `config.timeoutMs` takes precedence over the caller-level
 * `timeoutMs` fallback.
 *
 * @throws Error when the produced audio file is empty (zero bytes), which
 *         indicates synthesis silently failed.
 */
export async function edgeTTS(params: {
  text: string;
  outputPath: string;
  config: {
    voice: string;
    lang: string;
    outputFormat: string;
    saveSubtitles: boolean;
    proxy?: string;
    rate?: string;
    pitch?: string;
    volume?: string;
    timeoutMs?: number;
  };
  timeoutMs: number;
}): Promise<void> {
  const { text, outputPath, config, timeoutMs } = params;
  const tts = new EdgeTTS({
    voice: config.voice,
    lang: config.lang,
    outputFormat: config.outputFormat,
    saveSubtitles: config.saveSubtitles,
    proxy: config.proxy,
    rate: config.rate,
    pitch: config.pitch,
    volume: config.volume,
    // Prefer the per-config timeout; fall back to the caller-supplied one.
    timeout: config.timeoutMs ?? timeoutMs,
  });
  await tts.ttsPromise(text, outputPath);
  // Guard against silent failures: an empty file means no audio was produced.
  const { size } = statSync(outputPath);
  if (size === 0) {
    throw new Error("Edge TTS produced empty audio file");
  }
}

View File

@@ -1,6 +1,8 @@
{
"id": "minimax",
"providers": ["minimax", "minimax-portal"],
"mediaUnderstandingProviders": ["minimax", "minimax-portal"],
"imageGenerationProviders": ["minimax", "minimax-portal"],
"providerAuthEnvVars": {
"minimax": ["MINIMAX_API_KEY"],
"minimax-portal": ["MINIMAX_OAUTH_TOKEN", "MINIMAX_API_KEY"]

View File

@@ -1,6 +1,7 @@
{
"id": "mistral",
"providers": ["mistral"],
"mediaUnderstandingProviders": ["mistral"],
"providerAuthEnvVars": {
"mistral": ["MISTRAL_API_KEY"]
},

View File

@@ -4,10 +4,12 @@ import {
type MediaUnderstandingProvider,
type VideoDescriptionRequest,
type VideoDescriptionResult,
} from "openclaw/plugin-sdk/media-understanding";
import {
assertOkOrThrowHttpError,
normalizeBaseUrl,
postJsonRequest,
} from "openclaw/plugin-sdk/media-understanding";
} from "openclaw/plugin-sdk/provider-http";
export const DEFAULT_MOONSHOT_VIDEO_BASE_URL = "https://api.moonshot.ai/v1";
const DEFAULT_MOONSHOT_VIDEO_MODEL = "kimi-k2.5";

View File

@@ -1,6 +1,7 @@
{
"id": "moonshot",
"providers": ["moonshot"],
"mediaUnderstandingProviders": ["moonshot"],
"providerAuthEnvVars": {
"moonshot": ["MOONSHOT_API_KEY"]
},

View File

@@ -1,6 +1,9 @@
{
"id": "openai",
"providers": ["openai", "openai-codex"],
"speechProviders": ["openai"],
"mediaUnderstandingProviders": ["openai", "openai-codex"],
"imageGenerationProviders": ["openai"],
"cliBackends": ["codex-cli"],
"providerAuthEnvVars": {
"openai": ["OPENAI_API_KEY"]

View File

@@ -1,5 +1,5 @@
import type { SpeechProviderPlugin } from "openclaw/plugin-sdk/core";
import { OPENAI_TTS_MODELS, OPENAI_TTS_VOICES, openaiTTS } from "openclaw/plugin-sdk/speech";
import { OPENAI_TTS_MODELS, OPENAI_TTS_VOICES, openaiTTS } from "./tts.js";
export function buildOpenAISpeechProvider(): SpeechProviderPlugin {
return {

109
extensions/openai/tts.ts Normal file
View File

@@ -0,0 +1,109 @@
// Default public OpenAI API endpoint.
const DEFAULT_OPENAI_BASE_URL = "https://api.openai.com/v1";

/** Models accepted when targeting the official OpenAI endpoint. */
export const OPENAI_TTS_MODELS = ["gpt-4o-mini-tts", "tts-1", "tts-1-hd"] as const;

/** Voices accepted when targeting the official OpenAI endpoint. */
export const OPENAI_TTS_VOICES = [
  "alloy",
  "ash",
  "ballad",
  "cedar",
  "coral",
  "echo",
  "fable",
  "juniper",
  "marin",
  "onyx",
  "nova",
  "sage",
  "shimmer",
  "verse",
] as const;

type OpenAiTtsVoice = (typeof OPENAI_TTS_VOICES)[number];

/**
 * Normalizes a configured base URL: blank/undefined input falls back to the
 * default endpoint, and trailing slashes are stripped.
 */
function normalizeOpenAITtsBaseUrl(baseUrl?: string): string {
  const candidate = baseUrl?.trim() ?? "";
  return candidate.length === 0 ? DEFAULT_OPENAI_BASE_URL : candidate.replace(/\/+$/, "");
}
/**
 * True when TTS requests target something other than the official OpenAI
 * endpoint. An explicitly supplied baseUrl wins; otherwise the
 * OPENAI_TTS_BASE_URL environment variable is consulted.
 */
function isCustomOpenAIEndpoint(baseUrl?: string): boolean {
  const effective = baseUrl ?? process.env.OPENAI_TTS_BASE_URL;
  return normalizeOpenAITtsBaseUrl(effective) !== DEFAULT_OPENAI_BASE_URL;
}
/**
 * Accepts any model when a custom endpoint is in use; otherwise the model must
 * be one of the known OpenAI TTS models.
 */
function isValidOpenAIModel(model: string, baseUrl?: string): boolean {
  return (
    isCustomOpenAIEndpoint(baseUrl) ||
    OPENAI_TTS_MODELS.includes(model as (typeof OPENAI_TTS_MODELS)[number])
  );
}
/**
 * Type predicate: accepts any voice when a custom endpoint is in use;
 * otherwise the voice must be one of the known OpenAI TTS voices.
 */
function isValidOpenAIVoice(voice: string, baseUrl?: string): voice is OpenAiTtsVoice {
  if (isCustomOpenAIEndpoint(baseUrl)) {
    return true;
  }
  return (OPENAI_TTS_VOICES as readonly string[]).includes(voice);
}
/**
 * Instructions are only honored by gpt-4o-mini-tts models: returns the trimmed
 * instruction text for those models and undefined otherwise (including when
 * the trimmed text is empty).
 */
function resolveOpenAITtsInstructions(model: string, instructions?: string): string | undefined {
  const trimmed = instructions?.trim();
  if (!trimmed || !model.includes("gpt-4o-mini-tts")) {
    return undefined;
  }
  return trimmed;
}
export async function openaiTTS(params: {
text: string;
apiKey: string;
baseUrl: string;
model: string;
voice: string;
speed?: number;
instructions?: string;
responseFormat: "mp3" | "opus" | "pcm";
timeoutMs: number;
}): Promise<Buffer> {
const { text, apiKey, baseUrl, model, voice, speed, instructions, responseFormat, timeoutMs } =
params;
const effectiveInstructions = resolveOpenAITtsInstructions(model, instructions);
if (!isValidOpenAIModel(model, baseUrl)) {
throw new Error(`Invalid model: ${model}`);
}
if (!isValidOpenAIVoice(voice, baseUrl)) {
throw new Error(`Invalid voice: ${voice}`);
}
const controller = new AbortController();
const timeout = setTimeout(() => controller.abort(), timeoutMs);
try {
const response = await fetch(`${baseUrl}/audio/speech`, {
method: "POST",
headers: {
Authorization: `Bearer ${apiKey}`,
"Content-Type": "application/json",
},
body: JSON.stringify({
model,
input: text,
voice,
response_format: responseFormat,
...(speed != null && { speed }),
...(effectiveInstructions != null && { instructions: effectiveInstructions }),
}),
signal: controller.signal,
});
if (!response.ok) {
throw new Error(`OpenAI TTS API error (${response.status})`);
}
return Buffer.from(await response.arrayBuffer());
} finally {
clearTimeout(timeout);
}
}

View File

@@ -1,6 +1,7 @@
{
"id": "zai",
"providers": ["zai"],
"mediaUnderstandingProviders": ["zai"],
"providerAuthEnvVars": {
"zai": ["ZAI_API_KEY", "Z_AI_API_KEY"]
},

View File

@@ -453,6 +453,10 @@
"types": "./dist/plugin-sdk/provider-env-vars.d.ts",
"default": "./dist/plugin-sdk/provider-env-vars.js"
},
"./plugin-sdk/provider-http": {
"types": "./dist/plugin-sdk/provider-http.d.ts",
"default": "./dist/plugin-sdk/provider-http.js"
},
"./plugin-sdk/provider-google": {
"types": "./dist/plugin-sdk/provider-google.d.ts",
"default": "./dist/plugin-sdk/provider-google.js"
@@ -529,6 +533,10 @@
"types": "./dist/plugin-sdk/telegram-core.d.ts",
"default": "./dist/plugin-sdk/telegram-core.js"
},
"./plugin-sdk/telegram-runtime": {
"types": "./dist/plugin-sdk/telegram-runtime.d.ts",
"default": "./dist/plugin-sdk/telegram-runtime.js"
},
"./plugin-sdk/thread-ownership": {
"types": "./dist/plugin-sdk/thread-ownership.d.ts",
"default": "./dist/plugin-sdk/thread-ownership.js"

View File

@@ -103,6 +103,15 @@ function normalizePluginManifest(raw) {
...(normalizeStringList(raw.providers)
? { providers: normalizeStringList(raw.providers) }
: {}),
...(normalizeStringList(raw.speechProviders)
? { speechProviders: normalizeStringList(raw.speechProviders) }
: {}),
...(normalizeStringList(raw.mediaUnderstandingProviders)
? { mediaUnderstandingProviders: normalizeStringList(raw.mediaUnderstandingProviders) }
: {}),
...(normalizeStringList(raw.imageGenerationProviders)
? { imageGenerationProviders: normalizeStringList(raw.imageGenerationProviders) }
: {}),
...(normalizeObject(raw.providerAuthEnvVars)
? { providerAuthEnvVars: raw.providerAuthEnvVars }
: {}),

View File

@@ -103,6 +103,7 @@
"provider-catalog",
"provider-entry",
"provider-env-vars",
"provider-http",
"provider-google",
"provider-models",
"provider-onboard",
@@ -122,6 +123,7 @@
"state-paths",
"telegram",
"telegram-core",
"telegram-runtime",
"thread-ownership",
"tlon",
"tool-send",

View File

@@ -461,7 +461,7 @@ export async function applyMediaUnderstanding(params: {
.find((value) => value && value.trim()) ?? undefined;
const attachments = normalizeMediaAttachments(ctx);
const providerRegistry = buildProviderRegistry(params.providers);
const providerRegistry = buildProviderRegistry(params.providers, cfg);
const cache = createMediaAttachmentCache(attachments, {
localPathRoots: resolveMediaAttachmentLocalRoots({ cfg, ctx }),
});

View File

@@ -23,7 +23,7 @@ export async function runAudioTranscription(params: {
return { transcript: undefined, attachments };
}
const providerRegistry = buildProviderRegistry(params.providers);
const providerRegistry = buildProviderRegistry(params.providers, params.cfg);
const cache = createMediaAttachmentCache(
attachments,
params.localPathRoots ? { localPathRoots: params.localPathRoots } : undefined,

View File

@@ -11,15 +11,10 @@ describe("media-understanding provider registry", () => {
setActivePluginRegistry(createEmptyPluginRegistry());
});
it("keeps core-owned fallback providers registered by default", () => {
it("returns no providers by default when no active registry is present", () => {
const registry = buildMediaUnderstandingRegistry();
const groqProvider = getMediaUnderstandingProvider("groq", registry);
const deepgramProvider = getMediaUnderstandingProvider("deepgram", registry);
expect(groqProvider?.id).toBe("groq");
expect(groqProvider?.capabilities).toEqual(["audio"]);
expect(deepgramProvider?.id).toBe("deepgram");
expect(deepgramProvider?.capabilities).toEqual(["audio"]);
expect(getMediaUnderstandingProvider("groq", registry)).toBeUndefined();
expect(getMediaUnderstandingProvider("deepgram", registry)).toBeUndefined();
});
it("merges plugin-registered media providers into the active registry", async () => {

View File

@@ -1,18 +1,9 @@
import type { OpenClawConfig } from "../config/config.js";
import {
deepgramMediaUnderstandingProvider,
groqMediaUnderstandingProvider,
} from "../plugin-sdk/media-understanding.js";
import { loadOpenClawPlugins } from "../plugins/loader.js";
import { getActivePluginRegistry } from "../plugins/runtime.js";
import { normalizeMediaProviderId } from "./provider-id.js";
import type { MediaUnderstandingProvider } from "./types.js";
const PROVIDERS: MediaUnderstandingProvider[] = [
groqMediaUnderstandingProvider,
deepgramMediaUnderstandingProvider,
];
function mergeProviderIntoRegistry(
registry: Map<string, MediaUnderstandingProvider>,
provider: MediaUnderstandingProvider,
@@ -36,12 +27,9 @@ export function buildMediaUnderstandingRegistry(
cfg?: OpenClawConfig,
): Map<string, MediaUnderstandingProvider> {
const registry = new Map<string, MediaUnderstandingProvider>();
for (const provider of PROVIDERS) {
mergeProviderIntoRegistry(registry, provider);
}
const active = getActivePluginRegistry();
const pluginRegistry =
(active?.mediaUnderstandingProviders?.length ?? 0) > 0
(active?.mediaUnderstandingProviders?.length ?? 0) > 0 || !cfg
? active
: loadOpenClawPlugins({ config: cfg });
for (const entry of pluginRegistry?.mediaUnderstandingProviders ?? []) {

View File

@@ -494,7 +494,7 @@ export async function resolveAutoImageModel(params: {
agentDir?: string;
activeModel?: ActiveMediaModel;
}): Promise<ActiveMediaModel | null> {
const providerRegistry = buildProviderRegistry();
const providerRegistry = buildProviderRegistry(undefined, params.cfg);
const toActive = (entry: MediaUnderstandingModelConfig | null): ActiveMediaModel | null => {
if (!entry || entry.type === "cli") {
return null;

View File

@@ -7,6 +7,7 @@ export { optionalStringEnum, stringEnum } from "../agents/schema/typebox.js";
import { Type } from "@sinclair/typebox";
import type { TSchema } from "@sinclair/typebox";
import { stringEnum } from "../agents/schema/typebox.js";
export { optionalStringEnum, stringEnum } from "../agents/schema/typebox.js";
/** Schema helper for channels that expose button rows on the shared `message` tool. */
export function createMessageToolButtonsSchema(): TSchema {

View File

@@ -8,7 +8,3 @@ export type {
ImageGenerationResult,
ImageGenerationSourceImage,
} from "../image-generation/types.js";
export { buildFalImageGenerationProvider } from "../../extensions/fal/image-generation-provider.js";
export { buildGoogleImageGenerationProvider } from "../../extensions/google/image-generation-provider.js";
export { buildOpenAIImageGenerationProvider } from "../../extensions/openai/image-generation-provider.js";

View File

@@ -89,9 +89,6 @@ describe("plugin-sdk exports", () => {
it("keeps the root runtime surface intentionally small", async () => {
const runtimeExports = await collectRuntimeExports(path.join(import.meta.dirname, "index.ts"));
expect([...runtimeExports].toSorted()).toEqual([
"buildFalImageGenerationProvider",
"buildGoogleImageGenerationProvider",
"buildOpenAIImageGenerationProvider",
"delegateCompactionToRuntime",
"emptyPluginConfigSchema",
"onDiagnosticEvent",

View File

@@ -18,12 +18,3 @@ export {
describeImagesWithModel,
} from "../media-understanding/image-runtime.js";
export { transcribeOpenAiCompatibleAudio } from "../media-understanding/openai-compatible-audio.js";
export {
assertOkOrThrowHttpError,
normalizeBaseUrl,
postJsonRequest,
postTranscriptionRequest,
requireTranscriptionText,
} from "../media-understanding/shared.js";
export { deepgramMediaUnderstandingProvider } from "../../extensions/deepgram/media-understanding-provider.js";
export { groqMediaUnderstandingProvider } from "../../extensions/groq/media-understanding-provider.js";

View File

@@ -0,0 +1,12 @@
// Shared provider-facing HTTP helpers. Keep generic transport utilities here so
// capability SDKs do not depend on each other.
export {
assertOkOrThrowHttpError,
fetchWithTimeout,
fetchWithTimeoutGuarded,
normalizeBaseUrl,
postJsonRequest,
postTranscriptionRequest,
requireTranscriptionText,
} from "../media-understanding/shared.js";

View File

@@ -3,15 +3,4 @@
export type { SpeechProviderPlugin } from "../plugins/types.js";
export type { SpeechVoiceOption } from "../tts/provider-types.js";
export {
edgeTTS,
elevenLabsTTS,
inferEdgeExtension,
OPENAI_TTS_MODELS,
OPENAI_TTS_VOICES,
openaiTTS,
parseTtsDirectives,
} from "../tts/tts-core.js";
export { resolvePreferredOpenClawTmpDir } from "../infra/tmp-openclaw-dir.js";
export { isVoiceCompatibleAudio } from "../media/audio.js";
export { parseTtsDirectives } from "../tts/tts-core.js";

View File

@@ -1,9 +1,4 @@
// Public speech-provider builders for bundled or third-party plugins.
// Public speech helpers for bundled or third-party plugins.
export { buildElevenLabsSpeechProvider } from "../../extensions/elevenlabs/speech-provider.js";
export { buildMicrosoftSpeechProvider } from "../../extensions/microsoft/speech-provider.js";
export { buildOpenAISpeechProvider } from "../../extensions/openai/speech-provider.js";
export { edgeTTS, elevenLabsTTS, inferEdgeExtension, openaiTTS } from "../tts/tts-core.js";
export { OPENAI_TTS_MODELS, OPENAI_TTS_VOICES } from "../tts/tts-core.js";
export { parseTtsDirectives } from "../tts/tts-core.js";
export type { SpeechVoiceOption } from "../tts/provider-types.js";

View File

@@ -544,6 +544,36 @@ describe("plugin-sdk subpath exports", () => {
"buildOptionalSecretInputSchema",
"normalizeSecretInputString",
]);
expectSourceMentions("provider-http", [
"assertOkOrThrowHttpError",
"normalizeBaseUrl",
"postJsonRequest",
"postTranscriptionRequest",
"requireTranscriptionText",
]);
expectSourceOmits("speech", [
"buildElevenLabsSpeechProvider",
"buildMicrosoftSpeechProvider",
"buildOpenAISpeechProvider",
"edgeTTS",
"elevenLabsTTS",
"inferEdgeExtension",
"openaiTTS",
"OPENAI_TTS_MODELS",
"OPENAI_TTS_VOICES",
]);
expectSourceOmits("media-understanding", [
"deepgramMediaUnderstandingProvider",
"groqMediaUnderstandingProvider",
"assertOkOrThrowHttpError",
"postJsonRequest",
"postTranscriptionRequest",
]);
expectSourceOmits("image-generation", [
"buildFalImageGenerationProvider",
"buildGoogleImageGenerationProvider",
"buildOpenAIImageGenerationProvider",
]);
expectSourceOmits("config-runtime", [
"hasConfiguredSecretInput",
"normalizeResolvedSecretInputString",

View File

@@ -169,6 +169,7 @@ export const GENERATED_BUNDLED_PLUGIN_METADATA = [
properties: {},
},
providers: ["anthropic"],
mediaUnderstandingProviders: ["anthropic"],
providerAuthEnvVars: {
anthropic: ["ANTHROPIC_OAUTH_TOKEN", "ANTHROPIC_API_KEY"],
},
@@ -488,6 +489,7 @@ export const GENERATED_BUNDLED_PLUGIN_METADATA = [
additionalProperties: false,
properties: {},
},
mediaUnderstandingProviders: ["deepgram"],
},
},
{
@@ -859,6 +861,7 @@ export const GENERATED_BUNDLED_PLUGIN_METADATA = [
additionalProperties: false,
properties: {},
},
speechProviders: ["elevenlabs"],
},
},
{
@@ -925,6 +928,7 @@ export const GENERATED_BUNDLED_PLUGIN_METADATA = [
properties: {},
},
providers: ["fal"],
imageGenerationProviders: ["fal"],
providerAuthEnvVars: {
fal: ["FAL_KEY"],
},
@@ -1114,6 +1118,8 @@ export const GENERATED_BUNDLED_PLUGIN_METADATA = [
},
},
providers: ["google", "google-gemini-cli"],
mediaUnderstandingProviders: ["google"],
imageGenerationProviders: ["google"],
providerAuthEnvVars: {
google: ["GEMINI_API_KEY", "GOOGLE_API_KEY"],
},
@@ -1221,6 +1227,7 @@ export const GENERATED_BUNDLED_PLUGIN_METADATA = [
additionalProperties: false,
properties: {},
},
mediaUnderstandingProviders: ["groq"],
},
},
{
@@ -1782,6 +1789,7 @@ export const GENERATED_BUNDLED_PLUGIN_METADATA = [
additionalProperties: false,
properties: {},
},
speechProviders: ["microsoft"],
},
},
{
@@ -1854,6 +1862,8 @@ export const GENERATED_BUNDLED_PLUGIN_METADATA = [
properties: {},
},
providers: ["minimax", "minimax-portal"],
mediaUnderstandingProviders: ["minimax", "minimax-portal"],
imageGenerationProviders: ["minimax", "minimax-portal"],
providerAuthEnvVars: {
minimax: ["MINIMAX_API_KEY"],
"minimax-portal": ["MINIMAX_OAUTH_TOKEN", "MINIMAX_API_KEY"],
@@ -1931,6 +1941,7 @@ export const GENERATED_BUNDLED_PLUGIN_METADATA = [
properties: {},
},
providers: ["mistral"],
mediaUnderstandingProviders: ["mistral"],
providerAuthEnvVars: {
mistral: ["MISTRAL_API_KEY"],
},
@@ -2072,6 +2083,7 @@ export const GENERATED_BUNDLED_PLUGIN_METADATA = [
},
},
providers: ["moonshot"],
mediaUnderstandingProviders: ["moonshot"],
providerAuthEnvVars: {
moonshot: ["MOONSHOT_API_KEY"],
},
@@ -2363,6 +2375,9 @@ export const GENERATED_BUNDLED_PLUGIN_METADATA = [
properties: {},
},
providers: ["openai", "openai-codex"],
speechProviders: ["openai"],
mediaUnderstandingProviders: ["openai", "openai-codex"],
imageGenerationProviders: ["openai"],
providerAuthEnvVars: {
openai: ["OPENAI_API_KEY"],
},
@@ -4101,6 +4116,7 @@ export const GENERATED_BUNDLED_PLUGIN_METADATA = [
properties: {},
},
providers: ["zai"],
mediaUnderstandingProviders: ["zai"],
providerAuthEnvVars: {
zai: ["ZAI_API_KEY", "Z_AI_API_KEY"],
},

View File

@@ -120,6 +120,53 @@ describe("plugin contract registry", () => {
expect(providerContractPluginIds).toEqual(bundledProviderPluginIds);
});
// Each bundled capability contract registry must stay in lockstep with the
// plugin manifests: every bundled plugin that declares the capability in its
// manifest must appear in the contract registry, and vice versa. All three
// tests compare locale-sorted, deduplicated plugin-id lists.
it("covers every bundled speech plugin discovered from manifests", () => {
// Manifest side: bundled plugins that declare at least one speech provider.
const bundledSpeechPluginIds = loadPluginManifestRegistry({})
.plugins.filter(
(plugin) => plugin.origin === "bundled" && (plugin.speechProviders?.length ?? 0) > 0,
)
.map((plugin) => plugin.id)
.toSorted((left, right) => left.localeCompare(right));
// Contract side: dedupe via Set because a plugin may register several entries.
expect(
[...new Set(speechProviderContractRegistry.map((entry) => entry.pluginId))].toSorted(
(left, right) => left.localeCompare(right),
),
).toEqual(bundledSpeechPluginIds);
});
it("covers every bundled media-understanding plugin discovered from manifests", () => {
// Manifest side: bundled plugins declaring media-understanding providers.
const bundledMediaPluginIds = loadPluginManifestRegistry({})
.plugins.filter(
(plugin) =>
plugin.origin === "bundled" && (plugin.mediaUnderstandingProviders?.length ?? 0) > 0,
)
.map((plugin) => plugin.id)
.toSorted((left, right) => left.localeCompare(right));
expect(
[
...new Set(mediaUnderstandingProviderContractRegistry.map((entry) => entry.pluginId)),
].toSorted((left, right) => left.localeCompare(right)),
).toEqual(bundledMediaPluginIds);
});
it("covers every bundled image-generation plugin discovered from manifests", () => {
// Manifest side: bundled plugins declaring image-generation providers.
const bundledImagePluginIds = loadPluginManifestRegistry({})
.plugins.filter(
(plugin) =>
plugin.origin === "bundled" && (plugin.imageGenerationProviders?.length ?? 0) > 0,
)
.map((plugin) => plugin.id)
.toSorted((left, right) => left.localeCompare(right));
expect(
[...new Set(imageGenerationProviderContractRegistry.map((entry) => entry.pluginId))].toSorted(
(left, right) => left.localeCompare(right),
),
).toEqual(bundledImagePluginIds);
});
it("covers every bundled web search plugin from the shared resolver", () => {
const bundledWebSearchPluginIds = resolveBundledWebSearchPluginIds({});

View File

@@ -39,6 +39,7 @@ import xiaomiPlugin from "../../../extensions/xiaomi/index.js";
import zaiPlugin from "../../../extensions/zai/index.js";
import { bundledWebSearchPluginRegistrations } from "../../bundled-web-search-registry.js";
import { createCapturedPluginRegistration } from "../captured-registration.js";
import { loadPluginManifestRegistry } from "../manifest-registry.js";
import { resolvePluginProviders } from "../provider-auth-choice.runtime.js";
import type {
ImageGenerationProviderPlugin,
@@ -85,21 +86,6 @@ const bundledWebSearchPlugins: Array<RegistrablePlugin & { credentialValue: unkn
...plugin,
credentialValue,
}));
const bundledSpeechPlugins: RegistrablePlugin[] = [elevenLabsPlugin, microsoftPlugin, openAIPlugin];
const bundledMediaUnderstandingPlugins: RegistrablePlugin[] = [
anthropicPlugin,
deepgramPlugin,
googlePlugin,
groqPlugin,
minimaxPlugin,
mistralPlugin,
moonshotPlugin,
openAIPlugin,
zaiPlugin,
];
const bundledImageGenerationPlugins: RegistrablePlugin[] = [falPlugin, googlePlugin, openAIPlugin];
function captureRegistrations(plugin: RegistrablePlugin) {
const captured = createCapturedPluginRegistration();
@@ -390,6 +376,43 @@ const bundledProviderPlugins = dedupePlugins([
zaiPlugin,
]);
const bundledRegistrablePluginsById = new Map(
dedupePlugins([
...bundledProviderPlugins,
elevenLabsPlugin,
microsoftPlugin,
deepgramPlugin,
groqPlugin,
...bundledWebSearchPlugins,
]).map((plugin) => [plugin.id, plugin]),
);
function resolveBundledCapabilityPluginIds(
capability: "speechProviders" | "mediaUnderstandingProviders" | "imageGenerationProviders",
): string[] {
return loadPluginManifestRegistry({})
.plugins.filter(
(plugin) => plugin.origin === "bundled" && (plugin[capability]?.length ?? 0) > 0,
)
.map((plugin) => plugin.id)
.toSorted((left, right) => left.localeCompare(right));
}
/**
 * Resolve the registrable plugin objects for a capability by looking up each
 * manifest-declared plugin id. Ids with no matching registrable plugin are
 * silently skipped (e.g. manifests without a bundled runtime entry).
 */
function resolveBundledCapabilityPlugins(
  capability: "speechProviders" | "mediaUnderstandingProviders" | "imageGenerationProviders",
): RegistrablePlugin[] {
  const resolved: RegistrablePlugin[] = [];
  for (const pluginId of resolveBundledCapabilityPluginIds(capability)) {
    const plugin = bundledRegistrablePluginsById.get(pluginId);
    if (plugin) {
      resolved.push(plugin);
    }
  }
  return resolved;
}
const bundledSpeechPlugins = resolveBundledCapabilityPlugins("speechProviders");
const bundledMediaUnderstandingPlugins = resolveBundledCapabilityPlugins(
"mediaUnderstandingProviders",
);
const bundledImageGenerationPlugins = resolveBundledCapabilityPlugins("imageGenerationProviders");
const bundledPluginRegistrationList = dedupePlugins([
...bundledSpeechPlugins,
...bundledMediaUnderstandingPlugins,

View File

@@ -45,6 +45,9 @@ export type PluginManifestRecord = {
kind?: PluginKind;
channels: string[];
providers: string[];
speechProviders?: string[];
mediaUnderstandingProviders?: string[];
imageGenerationProviders?: string[];
cliBackends: string[];
providerAuthEnvVars?: Record<string, string[]>;
providerAuthChoices?: PluginManifest["providerAuthChoices"];
@@ -171,6 +174,9 @@ function buildRecord(params: {
kind: params.manifest.kind,
channels: params.manifest.channels ?? [],
providers: params.manifest.providers ?? [],
speechProviders: params.manifest.speechProviders ?? [],
mediaUnderstandingProviders: params.manifest.mediaUnderstandingProviders ?? [],
imageGenerationProviders: params.manifest.imageGenerationProviders ?? [],
cliBackends: params.manifest.cliBackends ?? [],
providerAuthEnvVars: params.manifest.providerAuthEnvVars,
providerAuthChoices: params.manifest.providerAuthChoices,
@@ -226,6 +232,9 @@ function buildBundleRecord(params: {
bundleCapabilities: params.manifest.capabilities,
channels: [],
providers: [],
speechProviders: [],
mediaUnderstandingProviders: [],
imageGenerationProviders: [],
cliBackends: [],
skills: params.manifest.skills ?? [],
settingsFiles: params.manifest.settingsFiles ?? [],

View File

@@ -15,6 +15,9 @@ export type PluginManifest = {
kind?: PluginKind;
channels?: string[];
providers?: string[];
speechProviders?: string[];
mediaUnderstandingProviders?: string[];
imageGenerationProviders?: string[];
/** Cheap startup activation lookup for plugin-owned CLI inference backends. */
cliBackends?: string[];
/** Cheap provider-auth env lookup without booting plugin runtime. */
@@ -205,6 +208,9 @@ export function loadPluginManifest(
const version = typeof raw.version === "string" ? raw.version.trim() : undefined;
const channels = normalizeStringList(raw.channels);
const providers = normalizeStringList(raw.providers);
const speechProviders = normalizeStringList(raw.speechProviders);
const mediaUnderstandingProviders = normalizeStringList(raw.mediaUnderstandingProviders);
const imageGenerationProviders = normalizeStringList(raw.imageGenerationProviders);
const cliBackends = normalizeStringList(raw.cliBackends);
const providerAuthEnvVars = normalizeStringListRecord(raw.providerAuthEnvVars);
const providerAuthChoices = normalizeProviderAuthChoices(raw.providerAuthChoices);
@@ -224,6 +230,9 @@ export function loadPluginManifest(
kind,
channels,
providers,
speechProviders,
mediaUnderstandingProviders,
imageGenerationProviders,
cliBackends,
providerAuthEnvVars,
providerAuthChoices,

View File

@@ -58,7 +58,7 @@ describe("speech provider registry", () => {
const providers = listSpeechProviders();
expect(providers.map((provider) => provider.id)).toEqual(["openai", "elevenlabs", "microsoft"]);
expect(providers.map((provider) => provider.id)).toEqual(["openai"]);
expect(loadOpenClawPluginsMock).not.toHaveBeenCalled();
});
@@ -76,22 +76,14 @@ describe("speech provider registry", () => {
const cfg = {} as OpenClawConfig;
expect(listSpeechProviders(cfg).map((provider) => provider.id)).toEqual([
"openai",
"elevenlabs",
"microsoft",
]);
expect(listSpeechProviders(cfg).map((provider) => provider.id)).toEqual(["microsoft"]);
expect(getSpeechProvider("edge", cfg)?.id).toBe("microsoft");
expect(loadOpenClawPluginsMock).toHaveBeenCalledWith({ config: cfg });
});
it("returns builtin providers when neither plugins nor active registry provide speech support", () => {
expect(listSpeechProviders().map((provider) => provider.id)).toEqual([
"openai",
"elevenlabs",
"microsoft",
]);
expect(getSpeechProvider("openai")?.id).toBe("openai");
it("returns no providers when neither plugins nor active registry provide speech support", () => {
expect(listSpeechProviders()).toEqual([]);
expect(getSpeechProvider("openai")).toBeUndefined();
});
it("normalizes the legacy edge alias to microsoft", () => {

View File

@@ -1,18 +1,9 @@
import { buildElevenLabsSpeechProvider } from "../../extensions/elevenlabs/speech-provider.js";
import { buildMicrosoftSpeechProvider } from "../../extensions/microsoft/speech-provider.js";
import { buildOpenAISpeechProvider } from "../../extensions/openai/speech-provider.js";
import type { OpenClawConfig } from "../config/config.js";
import { loadOpenClawPlugins } from "../plugins/loader.js";
import { getActivePluginRegistry } from "../plugins/runtime.js";
import type { SpeechProviderPlugin } from "../plugins/types.js";
import type { SpeechProviderId } from "./provider-types.js";
const BUILTIN_SPEECH_PROVIDER_BUILDERS = [
buildOpenAISpeechProvider,
buildElevenLabsSpeechProvider,
buildMicrosoftSpeechProvider,
] as const satisfies readonly (() => SpeechProviderPlugin)[];
function trimToUndefined(value: string | undefined): string | undefined {
const trimmed = value?.trim().toLowerCase();
return trimmed ? trimmed : undefined;
@@ -58,9 +49,6 @@ function buildProviderMaps(cfg?: OpenClawConfig): {
}
};
for (const buildProvider of BUILTIN_SPEECH_PROVIDER_BUILDERS) {
register(buildProvider());
}
for (const provider of resolveSpeechProviderPluginEntries(cfg)) {
register(provider);
}

View File

@@ -1,6 +1,5 @@
import { rmSync, statSync } from "node:fs";
import { rmSync } from "node:fs";
import { completeSimple, type TextContent } from "@mariozechner/pi-ai";
import { EdgeTTS } from "node-edge-tts";
import { getApiKeyForModel, requireApiKey } from "../agents/model-auth.js";
import {
buildModelAliasIndex,
@@ -18,7 +17,6 @@ import type {
TtsDirectiveParseResult,
} from "./tts.js";
const DEFAULT_ELEVENLABS_BASE_URL = "https://api.elevenlabs.io";
export const DEFAULT_OPENAI_BASE_URL = "https://api.openai.com/v1";
const TEMP_FILE_CLEANUP_DELAY_MS = 5 * 60 * 1000; // 5 minutes
@@ -26,14 +24,6 @@ export function isValidVoiceId(voiceId: string): boolean {
return /^[a-zA-Z0-9]{10,40}$/.test(voiceId);
}
/**
 * Normalize a user-supplied ElevenLabs base URL: fall back to the default
 * endpoint when blank, and strip any trailing slashes so path segments can be
 * appended safely.
 */
function normalizeElevenLabsBaseUrl(baseUrl: string): string {
  const candidate = baseUrl.trim();
  return candidate ? candidate.replace(/\/+$/, "") : DEFAULT_ELEVENLABS_BASE_URL;
}
function normalizeOpenAITtsBaseUrl(baseUrl?: string): string {
const trimmed = baseUrl?.trim();
if (!trimmed) {
@@ -53,13 +43,6 @@ function requireInRange(value: number, min: number, max: number, label: string):
}
}
/**
 * Validate ElevenLabs voice settings before sending them to the API.
 * Throws (via requireInRange) when any tunable falls outside its documented
 * range: stability/similarityBoost/style in [0, 1], speed in [0.5, 2].
 */
function assertElevenLabsVoiceSettings(settings: ResolvedTtsConfig["elevenlabs"]["voiceSettings"]) {
  const checks: ReadonlyArray<readonly [number, number, number, string]> = [
    [settings.stability, 0, 1, "stability"],
    [settings.similarityBoost, 0, 1, "similarityBoost"],
    [settings.style, 0, 1, "style"],
    [settings.speed, 0.5, 2, "speed"],
  ];
  for (const [value, min, max, label] of checks) {
    requireInRange(value, min, max, label);
  }
}
function normalizeLanguageCode(code?: string): string | undefined {
const trimmed = code?.trim();
if (!trimmed) {
@@ -538,177 +521,3 @@ export function scheduleCleanup(
}, delayMs);
timer.unref();
}
export async function elevenLabsTTS(params: {
text: string;
apiKey: string;
baseUrl: string;
voiceId: string;
modelId: string;
outputFormat: string;
seed?: number;
applyTextNormalization?: "auto" | "on" | "off";
languageCode?: string;
voiceSettings: ResolvedTtsConfig["elevenlabs"]["voiceSettings"];
timeoutMs: number;
}): Promise<Buffer> {
const {
text,
apiKey,
baseUrl,
voiceId,
modelId,
outputFormat,
seed,
applyTextNormalization,
languageCode,
voiceSettings,
timeoutMs,
} = params;
if (!isValidVoiceId(voiceId)) {
throw new Error("Invalid voiceId format");
}
assertElevenLabsVoiceSettings(voiceSettings);
const normalizedLanguage = normalizeLanguageCode(languageCode);
const normalizedNormalization = normalizeApplyTextNormalization(applyTextNormalization);
const normalizedSeed = normalizeSeed(seed);
const controller = new AbortController();
const timeout = setTimeout(() => controller.abort(), timeoutMs);
try {
const url = new URL(`${normalizeElevenLabsBaseUrl(baseUrl)}/v1/text-to-speech/${voiceId}`);
if (outputFormat) {
url.searchParams.set("output_format", outputFormat);
}
const response = await fetch(url.toString(), {
method: "POST",
headers: {
"xi-api-key": apiKey,
"Content-Type": "application/json",
Accept: "audio/mpeg",
},
body: JSON.stringify({
text,
model_id: modelId,
seed: normalizedSeed,
apply_text_normalization: normalizedNormalization,
language_code: normalizedLanguage,
voice_settings: {
stability: voiceSettings.stability,
similarity_boost: voiceSettings.similarityBoost,
style: voiceSettings.style,
use_speaker_boost: voiceSettings.useSpeakerBoost,
speed: voiceSettings.speed,
},
}),
signal: controller.signal,
});
if (!response.ok) {
throw new Error(`ElevenLabs API error (${response.status})`);
}
return Buffer.from(await response.arrayBuffer());
} finally {
clearTimeout(timeout);
}
}
/**
 * Synthesize speech through the OpenAI `/audio/speech` endpoint (or an
 * OpenAI-compatible server at `baseUrl`).
 *
 * Model/voice validity is checked against the configured base URL before the
 * request is sent; optional `speed` and resolved `instructions` are only
 * included in the payload when present. The request aborts after `timeoutMs`.
 *
 * @returns Audio payload in the requested `responseFormat`.
 * @throws Error on an invalid model/voice or a non-2xx API response.
 */
export async function openaiTTS(params: {
  text: string;
  apiKey: string;
  baseUrl: string;
  model: string;
  voice: string;
  speed?: number;
  instructions?: string;
  responseFormat: "mp3" | "opus" | "pcm";
  timeoutMs: number;
}): Promise<Buffer> {
  const effectiveInstructions = resolveOpenAITtsInstructions(params.model, params.instructions);
  if (!isValidOpenAIModel(params.model, params.baseUrl)) {
    throw new Error(`Invalid model: ${params.model}`);
  }
  if (!isValidOpenAIVoice(params.voice, params.baseUrl)) {
    throw new Error(`Invalid voice: ${params.voice}`);
  }
  // Abort the fetch if the API does not respond within the allotted time.
  const abort = new AbortController();
  const abortTimer = setTimeout(() => abort.abort(), params.timeoutMs);
  try {
    // Optional fields are appended after the required ones so the serialized
    // payload matches the original field order.
    const payload: Record<string, unknown> = {
      model: params.model,
      input: params.text,
      voice: params.voice,
      response_format: params.responseFormat,
    };
    if (params.speed != null) {
      payload.speed = params.speed;
    }
    if (effectiveInstructions != null) {
      payload.instructions = effectiveInstructions;
    }
    const response = await fetch(`${params.baseUrl}/audio/speech`, {
      method: "POST",
      headers: {
        Authorization: `Bearer ${params.apiKey}`,
        "Content-Type": "application/json",
      },
      body: JSON.stringify(payload),
      signal: abort.signal,
    });
    if (!response.ok) {
      throw new Error(`OpenAI TTS API error (${response.status})`);
    }
    return Buffer.from(await response.arrayBuffer());
  } finally {
    clearTimeout(abortTimer);
  }
}
/**
 * Infer a file extension from an Edge TTS output-format string.
 * Substring checks run in priority order (webm before ogg before opus, then
 * wav-family tokens); anything unrecognized falls back to ".mp3".
 */
export function inferEdgeExtension(outputFormat: string): string {
  const format = outputFormat.toLowerCase();
  const mappings: ReadonlyArray<readonly [readonly string[], string]> = [
    [["webm"], ".webm"],
    [["ogg"], ".ogg"],
    [["opus"], ".opus"],
    [["wav", "riff", "pcm"], ".wav"],
  ];
  for (const [tokens, extension] of mappings) {
    if (tokens.some((token) => format.includes(token))) {
      return extension;
    }
  }
  return ".mp3";
}
/**
 * Synthesize speech with Microsoft Edge TTS and write the audio to
 * `outputPath`. The per-config timeout wins over the caller-supplied
 * `timeoutMs` when both are present.
 *
 * @throws Error when synthesis completes but produces an empty file.
 */
export async function edgeTTS(params: {
  text: string;
  outputPath: string;
  config: ResolvedTtsConfig["edge"];
  timeoutMs: number;
}): Promise<void> {
  const { config } = params;
  const synthesizer = new EdgeTTS({
    voice: config.voice,
    lang: config.lang,
    outputFormat: config.outputFormat,
    saveSubtitles: config.saveSubtitles,
    proxy: config.proxy,
    rate: config.rate,
    pitch: config.pitch,
    volume: config.volume,
    timeout: config.timeoutMs ?? params.timeoutMs,
  });
  await synthesizer.ttsPromise(params.text, params.outputPath);
  // Guard against silent failures: the library can succeed while writing 0 bytes.
  if (statSync(params.outputPath).size === 0) {
    throw new Error("Edge TTS produced empty audio file");
  }
}