diff --git a/docs/.generated/plugin-sdk-api-baseline.json b/docs/.generated/plugin-sdk-api-baseline.json index ff71a089b2f..220d24625e9 100644 --- a/docs/.generated/plugin-sdk-api-baseline.json +++ b/docs/.generated/plugin-sdk-api-baseline.json @@ -5,33 +5,6 @@ "category": "legacy", "entrypoint": "index", "exports": [ - { - "declaration": "export function buildFalImageGenerationProvider(): ImageGenerationProvider;", - "exportName": "buildFalImageGenerationProvider", - "kind": "function", - "source": { - "line": 190, - "path": "extensions/fal/image-generation-provider.ts" - } - }, - { - "declaration": "export function buildGoogleImageGenerationProvider(): ImageGenerationProvider;", - "exportName": "buildGoogleImageGenerationProvider", - "kind": "function", - "source": { - "line": 98, - "path": "extensions/google/image-generation-provider.ts" - } - }, - { - "declaration": "export function buildOpenAIImageGenerationProvider(): ImageGenerationProvider;", - "exportName": "buildOpenAIImageGenerationProvider", - "kind": "function", - "source": { - "line": 22, - "path": "extensions/openai/image-generation-provider.ts" - } - }, { "declaration": "export function delegateCompactionToRuntime(params: { sessionId: string; sessionKey?: string | undefined; sessionFile: string; tokenBudget?: number | undefined; force?: boolean | undefined; currentTokenCount?: number | undefined; compactionTarget?: \"budget\" | ... 1 more ... | undefined; customInstructions?: string | undefined; runtimeContext?: ContextEngineRuntimeContext | undefined; }): Promise<...>;", "exportName": "delegateCompactionToRuntime", @@ -923,7 +896,7 @@ "exportName": "createMessageToolButtonsSchema", "kind": "function", "source": { - "line": 11, + "line": 12, "path": "src/plugin-sdk/channel-actions.ts" } }, @@ -932,7 +905,7 @@ "exportName": "createMessageToolCardSchema", "kind": "function", "source": { - "line": 29, + "line": 30, "path": "src/plugin-sdk/channel-actions.ts" } }, @@ -954,6 +927,15 @@ "path": "src/channels/plugins/actions/shared.ts" } }, + { + "declaration": "export function optionalStringEnum(values: T, options?: StringEnumOptions): TOptional>;", + "exportName": "optionalStringEnum", + "kind": "function", + "source": { + "line": 31, + "path": "src/agents/schema/typebox.ts" + } + }, { "declaration": "export function resolveReactionMessageId(params: { args: Record; toolContext?: ReactionToolContext | undefined; }): string | number | undefined;", "exportName": "resolveReactionMessageId", @@ -962,6 +944,15 @@ "line": 7, "path": "src/channels/plugins/actions/reaction-message-id.ts" } + }, + { + "declaration": "export function stringEnum(values: T, options?: StringEnumOptions): TUnsafe;", + "exportName": "stringEnum", + "kind": "function", + "source": { + "line": 15, + "path": "src/agents/schema/typebox.ts" + } } ], "importSpecifier": "openclaw/plugin-sdk/channel-actions", diff --git a/docs/.generated/plugin-sdk-api-baseline.jsonl b/docs/.generated/plugin-sdk-api-baseline.jsonl index 677fdec8b6e..ab024a272a9 100644 --- a/docs/.generated/plugin-sdk-api-baseline.jsonl +++ b/docs/.generated/plugin-sdk-api-baseline.jsonl @@ -1,7 +1,4 @@ {"category":"legacy","entrypoint":"index","importSpecifier":"openclaw/plugin-sdk","recordType":"module","sourceLine":1,"sourcePath":"src/plugin-sdk/index.ts"} -{"declaration":"export function buildFalImageGenerationProvider(): ImageGenerationProvider;","entrypoint":"index","exportName":"buildFalImageGenerationProvider","importSpecifier":"openclaw/plugin-sdk","kind":"function","recordType":"export","sourceLine":190,"sourcePath":"extensions/fal/image-generation-provider.ts"} -{"declaration":"export function buildGoogleImageGenerationProvider(): ImageGenerationProvider;","entrypoint":"index","exportName":"buildGoogleImageGenerationProvider","importSpecifier":"openclaw/plugin-sdk","kind":"function","recordType":"export","sourceLine":98,"sourcePath":"extensions/google/image-generation-provider.ts"} -{"declaration":"export function buildOpenAIImageGenerationProvider(): ImageGenerationProvider;","entrypoint":"index","exportName":"buildOpenAIImageGenerationProvider","importSpecifier":"openclaw/plugin-sdk","kind":"function","recordType":"export","sourceLine":22,"sourcePath":"extensions/openai/image-generation-provider.ts"} {"declaration":"export function delegateCompactionToRuntime(params: { sessionId: string; sessionKey?: string | undefined; sessionFile: string; tokenBudget?: number | undefined; force?: boolean | undefined; currentTokenCount?: number | undefined; compactionTarget?: \"budget\" | ... 1 more ... | undefined; customInstructions?: string | undefined; runtimeContext?: ContextEngineRuntimeContext | undefined; }): Promise<...>;","entrypoint":"index","exportName":"delegateCompactionToRuntime","importSpecifier":"openclaw/plugin-sdk","kind":"function","recordType":"export","sourceLine":16,"sourcePath":"src/context-engine/delegate.ts"} {"declaration":"export function emptyPluginConfigSchema(): OpenClawPluginConfigSchema;","entrypoint":"index","exportName":"emptyPluginConfigSchema","importSpecifier":"openclaw/plugin-sdk","kind":"function","recordType":"export","sourceLine":13,"sourcePath":"src/plugins/config-schema.ts"} {"declaration":"export function onDiagnosticEvent(listener: (evt: DiagnosticEventPayload) => void): () => void;","entrypoint":"index","exportName":"onDiagnosticEvent","importSpecifier":"openclaw/plugin-sdk","kind":"function","recordType":"export","sourceLine":229,"sourcePath":"src/infra/diagnostic-events.ts"} @@ -100,11 +97,13 @@ {"declaration":"export type BasicAllowlistResolutionEntry = BasicAllowlistResolutionEntry;","entrypoint":"allow-from","exportName":"BasicAllowlistResolutionEntry","importSpecifier":"openclaw/plugin-sdk/allow-from","kind":"type","recordType":"export","sourceLine":129,"sourcePath":"src/plugin-sdk/allow-from.ts"} {"declaration":"export type CompiledAllowlist = CompiledAllowlist;","entrypoint":"allow-from","exportName":"CompiledAllowlist","importSpecifier":"openclaw/plugin-sdk/allow-from","kind":"type","recordType":"export","sourceLine":19,"sourcePath":"src/channels/allowlist-match.ts"} {"category":"channel","entrypoint":"channel-actions","importSpecifier":"openclaw/plugin-sdk/channel-actions","recordType":"module","sourceLine":1,"sourcePath":"src/plugin-sdk/channel-actions.ts"} -{"declaration":"export function createMessageToolButtonsSchema(): TSchema;","entrypoint":"channel-actions","exportName":"createMessageToolButtonsSchema","importSpecifier":"openclaw/plugin-sdk/channel-actions","kind":"function","recordType":"export","sourceLine":11,"sourcePath":"src/plugin-sdk/channel-actions.ts"} -{"declaration":"export function createMessageToolCardSchema(): TSchema;","entrypoint":"channel-actions","exportName":"createMessageToolCardSchema","importSpecifier":"openclaw/plugin-sdk/channel-actions","kind":"function","recordType":"export","sourceLine":29,"sourcePath":"src/plugin-sdk/channel-actions.ts"} +{"declaration":"export function createMessageToolButtonsSchema(): TSchema;","entrypoint":"channel-actions","exportName":"createMessageToolButtonsSchema","importSpecifier":"openclaw/plugin-sdk/channel-actions","kind":"function","recordType":"export","sourceLine":12,"sourcePath":"src/plugin-sdk/channel-actions.ts"} +{"declaration":"export function createMessageToolCardSchema(): TSchema;","entrypoint":"channel-actions","exportName":"createMessageToolCardSchema","importSpecifier":"openclaw/plugin-sdk/channel-actions","kind":"function","recordType":"export","sourceLine":30,"sourcePath":"src/plugin-sdk/channel-actions.ts"} {"declaration":"export function createUnionActionGate(accounts: readonly TAccount[], createGate: (account: TAccount) => OptionalDefaultGate): OptionalDefaultGate;","entrypoint":"channel-actions","exportName":"createUnionActionGate","importSpecifier":"openclaw/plugin-sdk/channel-actions","kind":"function","recordType":"export","sourceLine":13,"sourcePath":"src/channels/plugins/actions/shared.ts"} {"declaration":"export function listTokenSourcedAccounts(accounts: readonly TAccount[]): TAccount[];","entrypoint":"channel-actions","exportName":"listTokenSourcedAccounts","importSpecifier":"openclaw/plugin-sdk/channel-actions","kind":"function","recordType":"export","sourceLine":7,"sourcePath":"src/channels/plugins/actions/shared.ts"} +{"declaration":"export function optionalStringEnum(values: T, options?: StringEnumOptions): TOptional>;","entrypoint":"channel-actions","exportName":"optionalStringEnum","importSpecifier":"openclaw/plugin-sdk/channel-actions","kind":"function","recordType":"export","sourceLine":31,"sourcePath":"src/agents/schema/typebox.ts"} {"declaration":"export function resolveReactionMessageId(params: { args: Record; toolContext?: ReactionToolContext | undefined; }): string | number | undefined;","entrypoint":"channel-actions","exportName":"resolveReactionMessageId","importSpecifier":"openclaw/plugin-sdk/channel-actions","kind":"function","recordType":"export","sourceLine":7,"sourcePath":"src/channels/plugins/actions/reaction-message-id.ts"} +{"declaration":"export function stringEnum(values: T, options?: StringEnumOptions): TUnsafe;","entrypoint":"channel-actions","exportName":"stringEnum","importSpecifier":"openclaw/plugin-sdk/channel-actions","kind":"function","recordType":"export","sourceLine":15,"sourcePath":"src/agents/schema/typebox.ts"} {"category":"channel","entrypoint":"channel-config-schema","importSpecifier":"openclaw/plugin-sdk/channel-config-schema","recordType":"module","sourceLine":1,"sourcePath":"src/plugin-sdk/channel-config-schema.ts"} {"declaration":"export function buildCatchallMultiAccountChannelSchema(accountSchema: T): T;","entrypoint":"channel-config-schema","exportName":"buildCatchallMultiAccountChannelSchema","importSpecifier":"openclaw/plugin-sdk/channel-config-schema","kind":"function","recordType":"export","sourceLine":26,"sourcePath":"src/channels/plugins/config-schema.ts"} {"declaration":"export function buildChannelConfigSchema(schema: ZodType>): ChannelConfigSchema;","entrypoint":"channel-config-schema","exportName":"buildChannelConfigSchema","importSpecifier":"openclaw/plugin-sdk/channel-config-schema","kind":"function","recordType":"export","sourceLine":35,"sourcePath":"src/channels/plugins/config-schema.ts"} diff --git a/extensions/anthropic/openclaw.plugin.json b/extensions/anthropic/openclaw.plugin.json index 1d58bde7188..106f832eef9 100644 --- a/extensions/anthropic/openclaw.plugin.json +++ b/extensions/anthropic/openclaw.plugin.json @@ -1,6 +1,7 @@ { "id": "anthropic", "providers": ["anthropic"], + "mediaUnderstandingProviders": ["anthropic"], "cliBackends": ["claude-cli"], "providerAuthEnvVars": { "anthropic": ["ANTHROPIC_OAUTH_TOKEN", "ANTHROPIC_API_KEY"] diff --git a/extensions/deepgram/audio.ts b/extensions/deepgram/audio.ts index de97397f215..77146ecfa90 100644 --- a/extensions/deepgram/audio.ts +++ b/extensions/deepgram/audio.ts @@ -7,7 +7,7 @@ import { normalizeBaseUrl, postTranscriptionRequest, requireTranscriptionText, -} from "openclaw/plugin-sdk/media-understanding"; +} from "openclaw/plugin-sdk/provider-http"; export const DEFAULT_DEEPGRAM_AUDIO_BASE_URL = "https://api.deepgram.com/v1"; export const DEFAULT_DEEPGRAM_AUDIO_MODEL = "nova-3"; diff --git a/extensions/deepgram/openclaw.plugin.json b/extensions/deepgram/openclaw.plugin.json index 7d148b7c720..d522ec8be6a 100644 --- a/extensions/deepgram/openclaw.plugin.json +++ b/extensions/deepgram/openclaw.plugin.json @@ -1,5 +1,6 @@ { "id": "deepgram", + "mediaUnderstandingProviders": ["deepgram"], "configSchema": { "type": "object", "additionalProperties": false, diff --git a/extensions/elevenlabs/openclaw.plugin.json b/extensions/elevenlabs/openclaw.plugin.json index 3015fa282a2..abffc3c4f49 100644 --- a/extensions/elevenlabs/openclaw.plugin.json +++ b/extensions/elevenlabs/openclaw.plugin.json @@ -1,5 +1,6 @@ { "id": "elevenlabs", + "speechProviders": ["elevenlabs"], "configSchema": { "type": "object", "additionalProperties": false, diff --git a/extensions/elevenlabs/speech-provider.ts b/extensions/elevenlabs/speech-provider.ts index 1ef07597958..24e8298ad0d 100644 --- a/extensions/elevenlabs/speech-provider.ts +++ b/extensions/elevenlabs/speech-provider.ts @@ -1,5 +1,6 @@ import type { SpeechProviderPlugin } from "openclaw/plugin-sdk/core"; -import { elevenLabsTTS, type SpeechVoiceOption } from "openclaw/plugin-sdk/speech"; +import type { SpeechVoiceOption } from "openclaw/plugin-sdk/speech"; +import { elevenLabsTTS } from "./tts.js"; const ELEVENLABS_TTS_MODELS = [ "eleven_multilingual_v2", diff --git a/extensions/elevenlabs/tts.ts b/extensions/elevenlabs/tts.ts new file mode 100644 index 00000000000..bebf1df9060 --- /dev/null +++ b/extensions/elevenlabs/tts.ts @@ -0,0 +1,150 @@ +const DEFAULT_ELEVENLABS_BASE_URL = "https://api.elevenlabs.io"; + +function isValidVoiceId(voiceId: string): boolean { + return /^[a-zA-Z0-9]{10,40}$/.test(voiceId); +} + +function normalizeElevenLabsBaseUrl(baseUrl?: string): string { + const trimmed = baseUrl?.trim(); + if (!trimmed) { + return DEFAULT_ELEVENLABS_BASE_URL; + } + return trimmed.replace(/\/+$/, ""); +} + +function normalizeLanguageCode(code?: string): string | undefined { + const trimmed = code?.trim(); + if (!trimmed) { + return undefined; + } + const normalized = trimmed.toLowerCase(); + if (!/^[a-z]{2}$/.test(normalized)) { + throw new Error("languageCode must be a 2-letter ISO 639-1 code (e.g. en, de, fr)"); + } + return normalized; +} + +function normalizeApplyTextNormalization(mode?: string): "auto" | "on" | "off" | undefined { + const trimmed = mode?.trim(); + if (!trimmed) { + return undefined; + } + const normalized = trimmed.toLowerCase(); + if (normalized === "auto" || normalized === "on" || normalized === "off") { + return normalized; + } + throw new Error("applyTextNormalization must be one of: auto, on, off"); +} + +function normalizeSeed(seed?: number): number | undefined { + if (seed == null) { + return undefined; + } + const next = Math.floor(seed); + if (!Number.isFinite(next) || next < 0 || next > 4_294_967_295) { + throw new Error("seed must be between 0 and 4294967295"); + } + return next; +} + +function requireInRange(value: number, min: number, max: number, label: string): void { + if (!Number.isFinite(value) || value < min || value > max) { + throw new Error(`${label} must be between ${min} and ${max}`); + } +} + +function assertElevenLabsVoiceSettings(settings: { + stability: number; + similarityBoost: number; + style: number; + useSpeakerBoost: boolean; + speed: number; +}) { + requireInRange(settings.stability, 0, 1, "stability"); + requireInRange(settings.similarityBoost, 0, 1, "similarityBoost"); + requireInRange(settings.style, 0, 1, "style"); + requireInRange(settings.speed, 0.5, 2, "speed"); +} + +export async function elevenLabsTTS(params: { + text: string; + apiKey: string; + baseUrl: string; + voiceId: string; + modelId: string; + outputFormat: string; + seed?: number; + applyTextNormalization?: "auto" | "on" | "off"; + languageCode?: string; + voiceSettings: { + stability: number; + similarityBoost: number; + style: number; + useSpeakerBoost: boolean; + speed: number; + }; + timeoutMs: number; +}): Promise { + const { + text, + apiKey, + baseUrl, + voiceId, + modelId, + outputFormat, + seed, + applyTextNormalization, + languageCode, + voiceSettings, + timeoutMs, + } = params; + if (!isValidVoiceId(voiceId)) { + throw new Error("Invalid voiceId format"); + } + assertElevenLabsVoiceSettings(voiceSettings); + const normalizedLanguage = normalizeLanguageCode(languageCode); + const normalizedNormalization = normalizeApplyTextNormalization(applyTextNormalization); + const normalizedSeed = normalizeSeed(seed); + + const controller = new AbortController(); + const timeout = setTimeout(() => controller.abort(), timeoutMs); + + try { + const url = new URL(`${normalizeElevenLabsBaseUrl(baseUrl)}/v1/text-to-speech/${voiceId}`); + if (outputFormat) { + url.searchParams.set("output_format", outputFormat); + } + + const response = await fetch(url.toString(), { + method: "POST", + headers: { + "xi-api-key": apiKey, + "Content-Type": "application/json", + Accept: "audio/mpeg", + }, + body: JSON.stringify({ + text, + model_id: modelId, + seed: normalizedSeed, + apply_text_normalization: normalizedNormalization, + language_code: normalizedLanguage, + voice_settings: { + stability: voiceSettings.stability, + similarity_boost: voiceSettings.similarityBoost, + style: voiceSettings.style, + use_speaker_boost: voiceSettings.useSpeakerBoost, + speed: voiceSettings.speed, + }, + }), + signal: controller.signal, + }); + + if (!response.ok) { + throw new Error(`ElevenLabs API error (${response.status})`); + } + + return Buffer.from(await response.arrayBuffer()); + } finally { + clearTimeout(timeout); + } +} diff --git a/extensions/fal/openclaw.plugin.json b/extensions/fal/openclaw.plugin.json index d7f7e12f677..99ac7d3d1f9 100644 --- a/extensions/fal/openclaw.plugin.json +++ b/extensions/fal/openclaw.plugin.json @@ -1,6 +1,7 @@ { "id": "fal", "providers": ["fal"], + "imageGenerationProviders": ["fal"], "providerAuthEnvVars": { "fal": ["FAL_KEY"] }, diff --git a/extensions/google/image-generation-provider.ts b/extensions/google/image-generation-provider.ts index ef72925b136..f138ff86be0 100644 --- a/extensions/google/image-generation-provider.ts +++ b/extensions/google/image-generation-provider.ts @@ -1,9 +1,4 @@ import type { ImageGenerationProvider } from "openclaw/plugin-sdk/image-generation"; -import { - assertOkOrThrowHttpError, - normalizeBaseUrl, - postJsonRequest, -} from "openclaw/plugin-sdk/media-understanding"; import { resolveApiKeyForProvider } from "openclaw/plugin-sdk/provider-auth"; import { DEFAULT_GOOGLE_API_BASE_URL, @@ -11,6 +6,11 @@ import { normalizeGoogleModelId, parseGeminiAuth, } from "openclaw/plugin-sdk/provider-google"; +import { + assertOkOrThrowHttpError, + normalizeBaseUrl, + postJsonRequest, +} from "openclaw/plugin-sdk/provider-http"; const DEFAULT_GOOGLE_IMAGE_MODEL = "gemini-3.1-flash-image-preview"; const DEFAULT_OUTPUT_MIME = "image/png"; diff --git a/extensions/google/media-understanding-provider.ts b/extensions/google/media-understanding-provider.ts index c3734a8d12f..2766dab3207 100644 --- a/extensions/google/media-understanding-provider.ts +++ b/extensions/google/media-understanding-provider.ts @@ -1,15 +1,17 @@ import { - assertOkOrThrowHttpError, describeImageWithModel, describeImagesWithModel, - normalizeBaseUrl, - postJsonRequest, type AudioTranscriptionRequest, type AudioTranscriptionResult, type MediaUnderstandingProvider, type VideoDescriptionRequest, type VideoDescriptionResult, } from "openclaw/plugin-sdk/media-understanding"; +import { + assertOkOrThrowHttpError, + normalizeBaseUrl, + postJsonRequest, +} from "openclaw/plugin-sdk/provider-http"; import { DEFAULT_GOOGLE_API_BASE_URL, normalizeGoogleApiBaseUrl, diff --git a/extensions/google/openclaw.plugin.json b/extensions/google/openclaw.plugin.json index 252891b2a52..576d4992fce 100644 --- a/extensions/google/openclaw.plugin.json +++ b/extensions/google/openclaw.plugin.json @@ -1,6 +1,8 @@ { "id": "google", "providers": ["google", "google-gemini-cli"], + "mediaUnderstandingProviders": ["google"], + "imageGenerationProviders": ["google"], "cliBackends": ["google-gemini-cli"], "providerAuthEnvVars": { "google": ["GEMINI_API_KEY", "GOOGLE_API_KEY"] diff --git a/extensions/groq/openclaw.plugin.json b/extensions/groq/openclaw.plugin.json index 5ab0133764b..7da82942848 100644 --- a/extensions/groq/openclaw.plugin.json +++ b/extensions/groq/openclaw.plugin.json @@ -1,5 +1,6 @@ { "id": "groq", + "mediaUnderstandingProviders": ["groq"], "configSchema": { "type": "object", "additionalProperties": false, diff --git a/extensions/microsoft/openclaw.plugin.json b/extensions/microsoft/openclaw.plugin.json index 85a130c463a..7ab6a523125 100644 --- a/extensions/microsoft/openclaw.plugin.json +++ b/extensions/microsoft/openclaw.plugin.json @@ -1,5 +1,6 @@ { "id": "microsoft", + "speechProviders": ["microsoft"], "configSchema": { "type": "object", "additionalProperties": false, diff --git a/extensions/microsoft/speech-provider.ts b/extensions/microsoft/speech-provider.ts index 25997720670..3967a0c62d2 100644 --- a/extensions/microsoft/speech-provider.ts +++ b/extensions/microsoft/speech-provider.ts @@ -8,7 +8,8 @@ import { import type { SpeechProviderPlugin } from "openclaw/plugin-sdk/core"; import { resolvePreferredOpenClawTmpDir } from "openclaw/plugin-sdk/llm-task"; import { isVoiceCompatibleAudio } from "openclaw/plugin-sdk/media-runtime"; -import { edgeTTS, inferEdgeExtension, type SpeechVoiceOption } from "openclaw/plugin-sdk/speech"; +import type { SpeechVoiceOption } from "openclaw/plugin-sdk/speech"; +import { edgeTTS, inferEdgeExtension } from "./tts.js"; const DEFAULT_EDGE_OUTPUT_FORMAT = "audio-24khz-48kbitrate-mono-mp3"; diff --git a/src/tts/edge-tts-validation.test.ts b/extensions/microsoft/tts.test.ts similarity index 90% rename from src/tts/edge-tts-validation.test.ts rename to extensions/microsoft/tts.test.ts index 85c93211efc..b6dd0db1474 100644 --- a/src/tts/edge-tts-validation.test.ts +++ b/extensions/microsoft/tts.test.ts @@ -3,7 +3,7 @@ import { tmpdir } from "node:os"; import path from "node:path"; import { afterEach, beforeEach, describe, expect, it, vi } from "vitest"; -let edgeTTS: typeof import("./tts-core.js").edgeTTS; +let edgeTTS: typeof import("./tts.js").edgeTTS; let mockTtsPromise = vi.fn<(text: string, filePath: string) => Promise>(); @@ -16,15 +16,13 @@ vi.mock("node-edge-tts", () => ({ })); const baseEdgeConfig = { - enabled: true, voice: "en-US-MichelleNeural", lang: "en-US", outputFormat: "audio-24khz-48kbitrate-mono-mp3", - outputFormatConfigured: false, saveSubtitles: false, }; -describe("edgeTTS – empty audio validation", () => { +describe("edgeTTS empty audio validation", () => { let tempDir: string | undefined; beforeEach(async () => { @@ -36,7 +34,7 @@ describe("edgeTTS – empty audio validation", () => { } }, })); - ({ edgeTTS } = await import("./tts-core.js")); + ({ edgeTTS } = await import("./tts.js")); }); afterEach(() => { diff --git a/extensions/microsoft/tts.ts b/extensions/microsoft/tts.ts new file mode 100644 index 00000000000..4fd13e8b3a3 --- /dev/null +++ b/extensions/microsoft/tts.ts @@ -0,0 +1,55 @@ +import { statSync } from "node:fs"; +import { EdgeTTS } from "node-edge-tts"; + +export function inferEdgeExtension(outputFormat: string): string { + const normalized = outputFormat.toLowerCase(); + if (normalized.includes("webm")) { + return ".webm"; + } + if (normalized.includes("ogg")) { + return ".ogg"; + } + if (normalized.includes("opus")) { + return ".opus"; + } + if (normalized.includes("wav") || normalized.includes("riff") || normalized.includes("pcm")) { + return ".wav"; + } + return ".mp3"; +} + +export async function edgeTTS(params: { + text: string; + outputPath: string; + config: { + voice: string; + lang: string; + outputFormat: string; + saveSubtitles: boolean; + proxy?: string; + rate?: string; + pitch?: string; + volume?: string; + timeoutMs?: number; + }; + timeoutMs: number; +}): Promise { + const { text, outputPath, config, timeoutMs } = params; + const tts = new EdgeTTS({ + voice: config.voice, + lang: config.lang, + outputFormat: config.outputFormat, + saveSubtitles: config.saveSubtitles, + proxy: config.proxy, + rate: config.rate, + pitch: config.pitch, + volume: config.volume, + timeout: config.timeoutMs ?? timeoutMs, + }); + await tts.ttsPromise(text, outputPath); + + const { size } = statSync(outputPath); + if (size === 0) { + throw new Error("Edge TTS produced empty audio file"); + } +} diff --git a/extensions/minimax/openclaw.plugin.json b/extensions/minimax/openclaw.plugin.json index 60a77127713..381865d93ed 100644 --- a/extensions/minimax/openclaw.plugin.json +++ b/extensions/minimax/openclaw.plugin.json @@ -1,6 +1,8 @@ { "id": "minimax", "providers": ["minimax", "minimax-portal"], + "mediaUnderstandingProviders": ["minimax", "minimax-portal"], + "imageGenerationProviders": ["minimax", "minimax-portal"], "providerAuthEnvVars": { "minimax": ["MINIMAX_API_KEY"], "minimax-portal": ["MINIMAX_OAUTH_TOKEN", "MINIMAX_API_KEY"] diff --git a/extensions/mistral/openclaw.plugin.json b/extensions/mistral/openclaw.plugin.json index 93f115cf719..ec142023431 100644 --- a/extensions/mistral/openclaw.plugin.json +++ b/extensions/mistral/openclaw.plugin.json @@ -1,6 +1,7 @@ { "id": "mistral", "providers": ["mistral"], + "mediaUnderstandingProviders": ["mistral"], "providerAuthEnvVars": { "mistral": ["MISTRAL_API_KEY"] }, diff --git a/extensions/moonshot/media-understanding-provider.ts b/extensions/moonshot/media-understanding-provider.ts index 6c652ae58d3..7d7ace86ea0 100644 --- a/extensions/moonshot/media-understanding-provider.ts +++ b/extensions/moonshot/media-understanding-provider.ts @@ -4,10 +4,12 @@ import { type MediaUnderstandingProvider, type VideoDescriptionRequest, type VideoDescriptionResult, +} from "openclaw/plugin-sdk/media-understanding"; +import { assertOkOrThrowHttpError, normalizeBaseUrl, postJsonRequest, -} from "openclaw/plugin-sdk/media-understanding"; +} from "openclaw/plugin-sdk/provider-http"; export const DEFAULT_MOONSHOT_VIDEO_BASE_URL = "https://api.moonshot.ai/v1"; const DEFAULT_MOONSHOT_VIDEO_MODEL = "kimi-k2.5"; diff --git a/extensions/moonshot/openclaw.plugin.json b/extensions/moonshot/openclaw.plugin.json index a5756e05623..39f36e7ecaa 100644 --- a/extensions/moonshot/openclaw.plugin.json +++ b/extensions/moonshot/openclaw.plugin.json @@ -1,6 +1,7 @@ { "id": "moonshot", "providers": ["moonshot"], + "mediaUnderstandingProviders": ["moonshot"], "providerAuthEnvVars": { "moonshot": ["MOONSHOT_API_KEY"] }, diff --git a/extensions/openai/openclaw.plugin.json b/extensions/openai/openclaw.plugin.json index c082cdf93bd..68f3ba07670 100644 --- a/extensions/openai/openclaw.plugin.json +++ b/extensions/openai/openclaw.plugin.json @@ -1,6 +1,9 @@ { "id": "openai", "providers": ["openai", "openai-codex"], + "speechProviders": ["openai"], + "mediaUnderstandingProviders": ["openai", "openai-codex"], + "imageGenerationProviders": ["openai"], "cliBackends": ["codex-cli"], "providerAuthEnvVars": { "openai": ["OPENAI_API_KEY"] diff --git a/extensions/openai/speech-provider.ts b/extensions/openai/speech-provider.ts index 0376d72f4f3..91b8008db9c 100644 --- a/extensions/openai/speech-provider.ts +++ b/extensions/openai/speech-provider.ts @@ -1,5 +1,5 @@ import type { SpeechProviderPlugin } from "openclaw/plugin-sdk/core"; -import { OPENAI_TTS_MODELS, OPENAI_TTS_VOICES, openaiTTS } from "openclaw/plugin-sdk/speech"; +import { OPENAI_TTS_MODELS, OPENAI_TTS_VOICES, openaiTTS } from "./tts.js"; export function buildOpenAISpeechProvider(): SpeechProviderPlugin { return { diff --git a/extensions/openai/tts.ts b/extensions/openai/tts.ts new file mode 100644 index 00000000000..52a288fef6f --- /dev/null +++ b/extensions/openai/tts.ts @@ -0,0 +1,109 @@ +const DEFAULT_OPENAI_BASE_URL = "https://api.openai.com/v1"; + +export const OPENAI_TTS_MODELS = ["gpt-4o-mini-tts", "tts-1", "tts-1-hd"] as const; + +export const OPENAI_TTS_VOICES = [ + "alloy", + "ash", + "ballad", + "cedar", + "coral", + "echo", + "fable", + "juniper", + "marin", + "onyx", + "nova", + "sage", + "shimmer", + "verse", +] as const; + +type OpenAiTtsVoice = (typeof OPENAI_TTS_VOICES)[number]; + +function normalizeOpenAITtsBaseUrl(baseUrl?: string): string { + const trimmed = baseUrl?.trim(); + if (!trimmed) { + return DEFAULT_OPENAI_BASE_URL; + } + return trimmed.replace(/\/+$/, ""); +} + +function isCustomOpenAIEndpoint(baseUrl?: string): boolean { + if (baseUrl != null) { + return normalizeOpenAITtsBaseUrl(baseUrl) !== DEFAULT_OPENAI_BASE_URL; + } + return normalizeOpenAITtsBaseUrl(process.env.OPENAI_TTS_BASE_URL) !== DEFAULT_OPENAI_BASE_URL; +} + +function isValidOpenAIModel(model: string, baseUrl?: string): boolean { + if (isCustomOpenAIEndpoint(baseUrl)) { + return true; + } + return OPENAI_TTS_MODELS.includes(model as (typeof OPENAI_TTS_MODELS)[number]); +} + +function isValidOpenAIVoice(voice: string, baseUrl?: string): voice is OpenAiTtsVoice { + if (isCustomOpenAIEndpoint(baseUrl)) { + return true; + } + return OPENAI_TTS_VOICES.includes(voice as OpenAiTtsVoice); +} + +function resolveOpenAITtsInstructions(model: string, instructions?: string): string | undefined { + const next = instructions?.trim(); + return next && model.includes("gpt-4o-mini-tts") ? next : undefined; +} + +export async function openaiTTS(params: { + text: string; + apiKey: string; + baseUrl: string; + model: string; + voice: string; + speed?: number; + instructions?: string; + responseFormat: "mp3" | "opus" | "pcm"; + timeoutMs: number; +}): Promise { + const { text, apiKey, baseUrl, model, voice, speed, instructions, responseFormat, timeoutMs } = + params; + const effectiveInstructions = resolveOpenAITtsInstructions(model, instructions); + + if (!isValidOpenAIModel(model, baseUrl)) { + throw new Error(`Invalid model: ${model}`); + } + if (!isValidOpenAIVoice(voice, baseUrl)) { + throw new Error(`Invalid voice: ${voice}`); + } + + const controller = new AbortController(); + const timeout = setTimeout(() => controller.abort(), timeoutMs); + + try { + const response = await fetch(`${baseUrl}/audio/speech`, { + method: "POST", + headers: { + Authorization: `Bearer ${apiKey}`, + "Content-Type": "application/json", + }, + body: JSON.stringify({ + model, + input: text, + voice, + response_format: responseFormat, + ...(speed != null && { speed }), + ...(effectiveInstructions != null && { instructions: effectiveInstructions }), + }), + signal: controller.signal, + }); + + if (!response.ok) { + throw new Error(`OpenAI TTS API error (${response.status})`); + } + + return Buffer.from(await response.arrayBuffer()); + } finally { + clearTimeout(timeout); + } +} diff --git a/extensions/zai/openclaw.plugin.json b/extensions/zai/openclaw.plugin.json index 2a7e1c8b40a..0e998d152f7 100644 --- a/extensions/zai/openclaw.plugin.json +++ b/extensions/zai/openclaw.plugin.json @@ -1,6 +1,7 @@ { "id": "zai", "providers": ["zai"], + "mediaUnderstandingProviders": ["zai"], "providerAuthEnvVars": { "zai": ["ZAI_API_KEY", "Z_AI_API_KEY"] }, diff --git a/package.json b/package.json index 6f41b3544f0..006808c3926 100644 --- a/package.json +++ b/package.json @@ -453,6 +453,10 @@ "types": "./dist/plugin-sdk/provider-env-vars.d.ts", "default": "./dist/plugin-sdk/provider-env-vars.js" }, + "./plugin-sdk/provider-http": { + "types": "./dist/plugin-sdk/provider-http.d.ts", + "default": "./dist/plugin-sdk/provider-http.js" + }, "./plugin-sdk/provider-google": { "types": "./dist/plugin-sdk/provider-google.d.ts", "default": "./dist/plugin-sdk/provider-google.js" @@ -529,6 +533,10 @@ "types": "./dist/plugin-sdk/telegram-core.d.ts", "default": "./dist/plugin-sdk/telegram-core.js" }, + "./plugin-sdk/telegram-runtime": { + "types": "./dist/plugin-sdk/telegram-runtime.d.ts", + "default": "./dist/plugin-sdk/telegram-runtime.js" + }, "./plugin-sdk/thread-ownership": { "types": "./dist/plugin-sdk/thread-ownership.d.ts", "default": "./dist/plugin-sdk/thread-ownership.js" diff --git a/scripts/generate-bundled-plugin-metadata.mjs b/scripts/generate-bundled-plugin-metadata.mjs index 4fea8dd62f8..161b6bade7e 100644 --- a/scripts/generate-bundled-plugin-metadata.mjs +++ b/scripts/generate-bundled-plugin-metadata.mjs @@ -103,6 +103,15 @@ function normalizePluginManifest(raw) { ...(normalizeStringList(raw.providers) ? { providers: normalizeStringList(raw.providers) } : {}), + ...(normalizeStringList(raw.speechProviders) + ? { speechProviders: normalizeStringList(raw.speechProviders) } + : {}), + ...(normalizeStringList(raw.mediaUnderstandingProviders) + ? { mediaUnderstandingProviders: normalizeStringList(raw.mediaUnderstandingProviders) } + : {}), + ...(normalizeStringList(raw.imageGenerationProviders) + ? { imageGenerationProviders: normalizeStringList(raw.imageGenerationProviders) } + : {}), ...(normalizeObject(raw.providerAuthEnvVars) ? { providerAuthEnvVars: raw.providerAuthEnvVars } : {}), diff --git a/scripts/lib/plugin-sdk-entrypoints.json b/scripts/lib/plugin-sdk-entrypoints.json index 8144c4a9701..f3de2a2ab96 100644 --- a/scripts/lib/plugin-sdk-entrypoints.json +++ b/scripts/lib/plugin-sdk-entrypoints.json @@ -103,6 +103,7 @@ "provider-catalog", "provider-entry", "provider-env-vars", + "provider-http", "provider-google", "provider-models", "provider-onboard", @@ -122,6 +123,7 @@ "state-paths", "telegram", "telegram-core", + "telegram-runtime", "thread-ownership", "tlon", "tool-send", diff --git a/src/media-understanding/apply.ts b/src/media-understanding/apply.ts index 7721dae16b0..613f65d6658 100644 --- a/src/media-understanding/apply.ts +++ b/src/media-understanding/apply.ts @@ -461,7 +461,7 @@ export async function applyMediaUnderstanding(params: { .find((value) => value && value.trim()) ?? undefined; const attachments = normalizeMediaAttachments(ctx); - const providerRegistry = buildProviderRegistry(params.providers); + const providerRegistry = buildProviderRegistry(params.providers, cfg); const cache = createMediaAttachmentCache(attachments, { localPathRoots: resolveMediaAttachmentLocalRoots({ cfg, ctx }), }); diff --git a/src/media-understanding/audio-transcription-runner.ts b/src/media-understanding/audio-transcription-runner.ts index 3ef2fdfa0fa..2cbc5f15563 100644 --- a/src/media-understanding/audio-transcription-runner.ts +++ b/src/media-understanding/audio-transcription-runner.ts @@ -23,7 +23,7 @@ export async function runAudioTranscription(params: { return { transcript: undefined, attachments }; } - const providerRegistry = buildProviderRegistry(params.providers); + const providerRegistry = buildProviderRegistry(params.providers, params.cfg); const cache = createMediaAttachmentCache( attachments, params.localPathRoots ? { localPathRoots: params.localPathRoots } : undefined, diff --git a/src/media-understanding/provider-registry.test.ts b/src/media-understanding/provider-registry.test.ts index 46885c6d45d..84ef2df4928 100644 --- a/src/media-understanding/provider-registry.test.ts +++ b/src/media-understanding/provider-registry.test.ts @@ -11,15 +11,10 @@ describe("media-understanding provider registry", () => { setActivePluginRegistry(createEmptyPluginRegistry()); }); - it("keeps core-owned fallback providers registered by default", () => { + it("returns no providers by default when no active registry is present", () => { const registry = buildMediaUnderstandingRegistry(); - const groqProvider = getMediaUnderstandingProvider("groq", registry); - const deepgramProvider = getMediaUnderstandingProvider("deepgram", registry); - - expect(groqProvider?.id).toBe("groq"); - expect(groqProvider?.capabilities).toEqual(["audio"]); - expect(deepgramProvider?.id).toBe("deepgram"); - expect(deepgramProvider?.capabilities).toEqual(["audio"]); + expect(getMediaUnderstandingProvider("groq", registry)).toBeUndefined(); + expect(getMediaUnderstandingProvider("deepgram", registry)).toBeUndefined(); }); it("merges plugin-registered media providers into the active registry", async () => { diff --git a/src/media-understanding/provider-registry.ts b/src/media-understanding/provider-registry.ts index 9441ccf5a7c..018d4edc58a 100644 --- a/src/media-understanding/provider-registry.ts +++ b/src/media-understanding/provider-registry.ts @@ -1,18 +1,9 @@ import type { OpenClawConfig } from "../config/config.js"; -import { - deepgramMediaUnderstandingProvider, - groqMediaUnderstandingProvider, -} from "../plugin-sdk/media-understanding.js"; import { loadOpenClawPlugins } from "../plugins/loader.js"; import { getActivePluginRegistry } from "../plugins/runtime.js"; import { normalizeMediaProviderId } from "./provider-id.js"; import type { MediaUnderstandingProvider } from "./types.js"; -const PROVIDERS: MediaUnderstandingProvider[] = [ - groqMediaUnderstandingProvider, - deepgramMediaUnderstandingProvider, -]; - function mergeProviderIntoRegistry( registry: Map, provider: MediaUnderstandingProvider, @@ -36,12 +27,9 @@ export function buildMediaUnderstandingRegistry( cfg?: OpenClawConfig, ): Map { const registry = new Map(); - for (const provider of PROVIDERS) { - mergeProviderIntoRegistry(registry, provider); - } const active = getActivePluginRegistry(); const pluginRegistry = - (active?.mediaUnderstandingProviders?.length ?? 0) > 0 + (active?.mediaUnderstandingProviders?.length ?? 0) > 0 || !cfg ? active : loadOpenClawPlugins({ config: cfg }); for (const entry of pluginRegistry?.mediaUnderstandingProviders ?? []) { diff --git a/src/media-understanding/runner.ts b/src/media-understanding/runner.ts index cb4934a5e34..fa9a7379e23 100644 --- a/src/media-understanding/runner.ts +++ b/src/media-understanding/runner.ts @@ -494,7 +494,7 @@ export async function resolveAutoImageModel(params: { agentDir?: string; activeModel?: ActiveMediaModel; }): Promise { - const providerRegistry = buildProviderRegistry(); + const providerRegistry = buildProviderRegistry(undefined, params.cfg); const toActive = (entry: MediaUnderstandingModelConfig | null): ActiveMediaModel | null => { if (!entry || entry.type === "cli") { return null; diff --git a/src/plugin-sdk/channel-actions.ts b/src/plugin-sdk/channel-actions.ts index fe04c82a466..39d5b94ddaf 100644 --- a/src/plugin-sdk/channel-actions.ts +++ b/src/plugin-sdk/channel-actions.ts @@ -7,6 +7,7 @@ export { optionalStringEnum, stringEnum } from "../agents/schema/typebox.js"; import { Type } from "@sinclair/typebox"; import type { TSchema } from "@sinclair/typebox"; import { stringEnum } from "../agents/schema/typebox.js"; +export { optionalStringEnum, stringEnum } from "../agents/schema/typebox.js"; /** Schema helper for channels that expose button rows on the shared `message` tool. */ export function createMessageToolButtonsSchema(): TSchema { diff --git a/src/plugin-sdk/image-generation.ts b/src/plugin-sdk/image-generation.ts index 1dc0c04b403..0c37a9ece7f 100644 --- a/src/plugin-sdk/image-generation.ts +++ b/src/plugin-sdk/image-generation.ts @@ -8,7 +8,3 @@ export type { ImageGenerationResult, ImageGenerationSourceImage, } from "../image-generation/types.js"; - -export { buildFalImageGenerationProvider } from "../../extensions/fal/image-generation-provider.js"; -export { buildGoogleImageGenerationProvider } from "../../extensions/google/image-generation-provider.js"; -export { buildOpenAIImageGenerationProvider } from "../../extensions/openai/image-generation-provider.js"; diff --git a/src/plugin-sdk/index.test.ts b/src/plugin-sdk/index.test.ts index c801e43218e..74cee9af15a 100644 --- a/src/plugin-sdk/index.test.ts +++ b/src/plugin-sdk/index.test.ts @@ -89,9 +89,6 @@ describe("plugin-sdk exports", () => { it("keeps the root runtime surface intentionally small", async () => { const runtimeExports = await collectRuntimeExports(path.join(import.meta.dirname, "index.ts")); expect([...runtimeExports].toSorted()).toEqual([ - "buildFalImageGenerationProvider", - "buildGoogleImageGenerationProvider", - "buildOpenAIImageGenerationProvider", "delegateCompactionToRuntime", "emptyPluginConfigSchema", "onDiagnosticEvent", diff --git a/src/plugin-sdk/media-understanding.ts b/src/plugin-sdk/media-understanding.ts index 986f47357d7..cd6401ad675 100644 --- a/src/plugin-sdk/media-understanding.ts +++ b/src/plugin-sdk/media-understanding.ts @@ -18,12 +18,3 @@ export { describeImagesWithModel, } from "../media-understanding/image-runtime.js"; export { transcribeOpenAiCompatibleAudio } from "../media-understanding/openai-compatible-audio.js"; -export { - assertOkOrThrowHttpError, - normalizeBaseUrl, - postJsonRequest, - postTranscriptionRequest, - requireTranscriptionText, -} from "../media-understanding/shared.js"; -export { deepgramMediaUnderstandingProvider } from "../../extensions/deepgram/media-understanding-provider.js"; -export { groqMediaUnderstandingProvider } from "../../extensions/groq/media-understanding-provider.js"; diff --git a/src/plugin-sdk/provider-http.ts b/src/plugin-sdk/provider-http.ts new file mode 100644 index 00000000000..de59b4c029b --- /dev/null +++ b/src/plugin-sdk/provider-http.ts @@ -0,0 +1,12 @@ +// Shared provider-facing HTTP helpers. Keep generic transport utilities here so +// capability SDKs do not depend on each other. + +export { + assertOkOrThrowHttpError, + fetchWithTimeout, + fetchWithTimeoutGuarded, + normalizeBaseUrl, + postJsonRequest, + postTranscriptionRequest, + requireTranscriptionText, +} from "../media-understanding/shared.js"; diff --git a/src/plugin-sdk/speech-core.ts b/src/plugin-sdk/speech-core.ts index e4af7a69486..75f9100fbe7 100644 --- a/src/plugin-sdk/speech-core.ts +++ b/src/plugin-sdk/speech-core.ts @@ -3,15 +3,4 @@ export type { SpeechProviderPlugin } from "../plugins/types.js"; export type { SpeechVoiceOption } from "../tts/provider-types.js"; -export { - edgeTTS, - elevenLabsTTS, - inferEdgeExtension, - OPENAI_TTS_MODELS, - OPENAI_TTS_VOICES, - openaiTTS, - parseTtsDirectives, -} from "../tts/tts-core.js"; - -export { resolvePreferredOpenClawTmpDir } from "../infra/tmp-openclaw-dir.js"; -export { isVoiceCompatibleAudio } from "../media/audio.js"; +export { parseTtsDirectives } from "../tts/tts-core.js"; diff --git a/src/plugin-sdk/speech.ts b/src/plugin-sdk/speech.ts index 3c14de0238d..a98e98103e0 100644 --- a/src/plugin-sdk/speech.ts +++ b/src/plugin-sdk/speech.ts @@ -1,9 +1,4 @@ -// Public speech-provider builders for bundled or third-party plugins. +// Public speech helpers for bundled or third-party plugins. -export { buildElevenLabsSpeechProvider } from "../../extensions/elevenlabs/speech-provider.js"; -export { buildMicrosoftSpeechProvider } from "../../extensions/microsoft/speech-provider.js"; -export { buildOpenAISpeechProvider } from "../../extensions/openai/speech-provider.js"; -export { edgeTTS, elevenLabsTTS, inferEdgeExtension, openaiTTS } from "../tts/tts-core.js"; -export { OPENAI_TTS_MODELS, OPENAI_TTS_VOICES } from "../tts/tts-core.js"; export { parseTtsDirectives } from "../tts/tts-core.js"; export type { SpeechVoiceOption } from "../tts/provider-types.js"; diff --git a/src/plugin-sdk/subpaths.test.ts b/src/plugin-sdk/subpaths.test.ts index c6188cddbaf..251ddd4450f 100644 --- a/src/plugin-sdk/subpaths.test.ts +++ b/src/plugin-sdk/subpaths.test.ts @@ -544,6 +544,36 @@ describe("plugin-sdk subpath exports", () => { "buildOptionalSecretInputSchema", "normalizeSecretInputString", ]); + expectSourceMentions("provider-http", [ + "assertOkOrThrowHttpError", + "normalizeBaseUrl", + "postJsonRequest", + "postTranscriptionRequest", + "requireTranscriptionText", + ]); + expectSourceOmits("speech", [ + "buildElevenLabsSpeechProvider", + "buildMicrosoftSpeechProvider", + "buildOpenAISpeechProvider", + "edgeTTS", + "elevenLabsTTS", + "inferEdgeExtension", + "openaiTTS", + "OPENAI_TTS_MODELS", + "OPENAI_TTS_VOICES", + ]); + expectSourceOmits("media-understanding", [ + "deepgramMediaUnderstandingProvider", + "groqMediaUnderstandingProvider", + "assertOkOrThrowHttpError", + "postJsonRequest", + "postTranscriptionRequest", + ]); + expectSourceOmits("image-generation", [ + "buildFalImageGenerationProvider", + "buildGoogleImageGenerationProvider", + "buildOpenAIImageGenerationProvider", + ]); expectSourceOmits("config-runtime", [ "hasConfiguredSecretInput", "normalizeResolvedSecretInputString", diff --git a/src/plugins/bundled-plugin-metadata.generated.ts b/src/plugins/bundled-plugin-metadata.generated.ts index 8b38bd799a5..317f9b767b2 100644 --- a/src/plugins/bundled-plugin-metadata.generated.ts +++ b/src/plugins/bundled-plugin-metadata.generated.ts @@ -169,6 +169,7 @@ export const GENERATED_BUNDLED_PLUGIN_METADATA = [ properties: {}, }, providers: ["anthropic"], + mediaUnderstandingProviders: ["anthropic"], providerAuthEnvVars: { anthropic: ["ANTHROPIC_OAUTH_TOKEN", "ANTHROPIC_API_KEY"], }, @@ -488,6 +489,7 @@ export const GENERATED_BUNDLED_PLUGIN_METADATA = [ additionalProperties: false, properties: {}, }, + mediaUnderstandingProviders: ["deepgram"], }, }, { @@ -859,6 +861,7 @@ export const GENERATED_BUNDLED_PLUGIN_METADATA = [ additionalProperties: false, properties: {}, }, + speechProviders: ["elevenlabs"], }, }, { @@ -925,6 +928,7 @@ export const GENERATED_BUNDLED_PLUGIN_METADATA = [ properties: {}, }, providers: ["fal"], + imageGenerationProviders: ["fal"], providerAuthEnvVars: { fal: ["FAL_KEY"], }, @@ -1114,6 +1118,8 @@ export const GENERATED_BUNDLED_PLUGIN_METADATA = [ }, }, providers: ["google", "google-gemini-cli"], + mediaUnderstandingProviders: ["google"], + imageGenerationProviders: ["google"], providerAuthEnvVars: { google: ["GEMINI_API_KEY", "GOOGLE_API_KEY"], }, @@ -1221,6 +1227,7 @@ export const GENERATED_BUNDLED_PLUGIN_METADATA = [ additionalProperties: false, properties: {}, }, + mediaUnderstandingProviders: ["groq"], }, }, { @@ -1782,6 +1789,7 @@ export const GENERATED_BUNDLED_PLUGIN_METADATA = [ additionalProperties: false, properties: {}, }, + speechProviders: ["microsoft"], }, }, { @@ -1854,6 +1862,8 @@ export const GENERATED_BUNDLED_PLUGIN_METADATA = [ properties: {}, }, providers: ["minimax", "minimax-portal"], + mediaUnderstandingProviders: ["minimax", "minimax-portal"], + imageGenerationProviders: ["minimax", "minimax-portal"], providerAuthEnvVars: { minimax: ["MINIMAX_API_KEY"], "minimax-portal": ["MINIMAX_OAUTH_TOKEN", "MINIMAX_API_KEY"], @@ -1931,6 +1941,7 @@ export const GENERATED_BUNDLED_PLUGIN_METADATA = [ properties: {}, }, providers: ["mistral"], + mediaUnderstandingProviders: ["mistral"], providerAuthEnvVars: { mistral: ["MISTRAL_API_KEY"], }, @@ -2072,6 +2083,7 @@ export const GENERATED_BUNDLED_PLUGIN_METADATA = [ }, }, providers: ["moonshot"], + mediaUnderstandingProviders: ["moonshot"], providerAuthEnvVars: { moonshot: ["MOONSHOT_API_KEY"], }, @@ -2363,6 +2375,9 @@ export const GENERATED_BUNDLED_PLUGIN_METADATA = [ properties: {}, }, providers: ["openai", "openai-codex"], + speechProviders: ["openai"], + mediaUnderstandingProviders: ["openai", "openai-codex"], + imageGenerationProviders: ["openai"], providerAuthEnvVars: { openai: ["OPENAI_API_KEY"], }, @@ -4101,6 +4116,7 @@ export const GENERATED_BUNDLED_PLUGIN_METADATA = [ properties: {}, }, providers: ["zai"], + mediaUnderstandingProviders: ["zai"], providerAuthEnvVars: { zai: ["ZAI_API_KEY", "Z_AI_API_KEY"], }, diff --git a/src/plugins/contracts/registry.contract.test.ts b/src/plugins/contracts/registry.contract.test.ts index 0260026cecf..7f15fe700e8 100644 --- a/src/plugins/contracts/registry.contract.test.ts +++ b/src/plugins/contracts/registry.contract.test.ts @@ -120,6 +120,53 @@ describe("plugin contract registry", () => { expect(providerContractPluginIds).toEqual(bundledProviderPluginIds); }); + it("covers every bundled speech plugin discovered from manifests", () => { + const bundledSpeechPluginIds = loadPluginManifestRegistry({}) + .plugins.filter( + (plugin) => plugin.origin === "bundled" && (plugin.speechProviders?.length ?? 0) > 0, + ) + .map((plugin) => plugin.id) + .toSorted((left, right) => left.localeCompare(right)); + + expect( + [...new Set(speechProviderContractRegistry.map((entry) => entry.pluginId))].toSorted( + (left, right) => left.localeCompare(right), + ), + ).toEqual(bundledSpeechPluginIds); + }); + + it("covers every bundled media-understanding plugin discovered from manifests", () => { + const bundledMediaPluginIds = loadPluginManifestRegistry({}) + .plugins.filter( + (plugin) => + plugin.origin === "bundled" && (plugin.mediaUnderstandingProviders?.length ?? 0) > 0, + ) + .map((plugin) => plugin.id) + .toSorted((left, right) => left.localeCompare(right)); + + expect( + [ + ...new Set(mediaUnderstandingProviderContractRegistry.map((entry) => entry.pluginId)), + ].toSorted((left, right) => left.localeCompare(right)), + ).toEqual(bundledMediaPluginIds); + }); + + it("covers every bundled image-generation plugin discovered from manifests", () => { + const bundledImagePluginIds = loadPluginManifestRegistry({}) + .plugins.filter( + (plugin) => + plugin.origin === "bundled" && (plugin.imageGenerationProviders?.length ?? 0) > 0, + ) + .map((plugin) => plugin.id) + .toSorted((left, right) => left.localeCompare(right)); + + expect( + [...new Set(imageGenerationProviderContractRegistry.map((entry) => entry.pluginId))].toSorted( + (left, right) => left.localeCompare(right), + ), + ).toEqual(bundledImagePluginIds); + }); + it("covers every bundled web search plugin from the shared resolver", () => { const bundledWebSearchPluginIds = resolveBundledWebSearchPluginIds({}); diff --git a/src/plugins/contracts/registry.ts b/src/plugins/contracts/registry.ts index 1af765586f6..73578d401f2 100644 --- a/src/plugins/contracts/registry.ts +++ b/src/plugins/contracts/registry.ts @@ -39,6 +39,7 @@ import xiaomiPlugin from "../../../extensions/xiaomi/index.js"; import zaiPlugin from "../../../extensions/zai/index.js"; import { bundledWebSearchPluginRegistrations } from "../../bundled-web-search-registry.js"; import { createCapturedPluginRegistration } from "../captured-registration.js"; +import { loadPluginManifestRegistry } from "../manifest-registry.js"; import { resolvePluginProviders } from "../provider-auth-choice.runtime.js"; import type { ImageGenerationProviderPlugin, @@ -85,21 +86,6 @@ const bundledWebSearchPlugins: Array [plugin.id, plugin]), +); + +function resolveBundledCapabilityPluginIds( + capability: "speechProviders" | "mediaUnderstandingProviders" | "imageGenerationProviders", +): string[] { + return loadPluginManifestRegistry({}) + .plugins.filter( + (plugin) => plugin.origin === "bundled" && (plugin[capability]?.length ?? 0) > 0, + ) + .map((plugin) => plugin.id) + .toSorted((left, right) => left.localeCompare(right)); +} + +function resolveBundledCapabilityPlugins( + capability: "speechProviders" | "mediaUnderstandingProviders" | "imageGenerationProviders", +): RegistrablePlugin[] { + return resolveBundledCapabilityPluginIds(capability).flatMap((pluginId) => { + const plugin = bundledRegistrablePluginsById.get(pluginId); + return plugin ? [plugin] : []; + }); +} + +const bundledSpeechPlugins = resolveBundledCapabilityPlugins("speechProviders"); +const bundledMediaUnderstandingPlugins = resolveBundledCapabilityPlugins( + "mediaUnderstandingProviders", +); +const bundledImageGenerationPlugins = resolveBundledCapabilityPlugins("imageGenerationProviders"); + const bundledPluginRegistrationList = dedupePlugins([ ...bundledSpeechPlugins, ...bundledMediaUnderstandingPlugins, diff --git a/src/plugins/manifest-registry.ts b/src/plugins/manifest-registry.ts index adf601db5ca..18f3c6a6427 100644 --- a/src/plugins/manifest-registry.ts +++ b/src/plugins/manifest-registry.ts @@ -45,6 +45,9 @@ export type PluginManifestRecord = { kind?: PluginKind; channels: string[]; providers: string[]; + speechProviders?: string[]; + mediaUnderstandingProviders?: string[]; + imageGenerationProviders?: string[]; cliBackends: string[]; providerAuthEnvVars?: Record; providerAuthChoices?: PluginManifest["providerAuthChoices"]; @@ -171,6 +174,9 @@ function buildRecord(params: { kind: params.manifest.kind, channels: params.manifest.channels ?? [], providers: params.manifest.providers ?? [], + speechProviders: params.manifest.speechProviders ?? [], + mediaUnderstandingProviders: params.manifest.mediaUnderstandingProviders ?? [], + imageGenerationProviders: params.manifest.imageGenerationProviders ?? [], cliBackends: params.manifest.cliBackends ?? [], providerAuthEnvVars: params.manifest.providerAuthEnvVars, providerAuthChoices: params.manifest.providerAuthChoices, @@ -226,6 +232,9 @@ function buildBundleRecord(params: { bundleCapabilities: params.manifest.capabilities, channels: [], providers: [], + speechProviders: [], + mediaUnderstandingProviders: [], + imageGenerationProviders: [], cliBackends: [], skills: params.manifest.skills ?? [], settingsFiles: params.manifest.settingsFiles ?? [], diff --git a/src/plugins/manifest.ts b/src/plugins/manifest.ts index b1a7d593b46..50ec2d0aca0 100644 --- a/src/plugins/manifest.ts +++ b/src/plugins/manifest.ts @@ -15,6 +15,9 @@ export type PluginManifest = { kind?: PluginKind; channels?: string[]; providers?: string[]; + speechProviders?: string[]; + mediaUnderstandingProviders?: string[]; + imageGenerationProviders?: string[]; /** Cheap startup activation lookup for plugin-owned CLI inference backends. */ cliBackends?: string[]; /** Cheap provider-auth env lookup without booting plugin runtime. */ @@ -205,6 +208,9 @@ export function loadPluginManifest( const version = typeof raw.version === "string" ? raw.version.trim() : undefined; const channels = normalizeStringList(raw.channels); const providers = normalizeStringList(raw.providers); + const speechProviders = normalizeStringList(raw.speechProviders); + const mediaUnderstandingProviders = normalizeStringList(raw.mediaUnderstandingProviders); + const imageGenerationProviders = normalizeStringList(raw.imageGenerationProviders); const cliBackends = normalizeStringList(raw.cliBackends); const providerAuthEnvVars = normalizeStringListRecord(raw.providerAuthEnvVars); const providerAuthChoices = normalizeProviderAuthChoices(raw.providerAuthChoices); @@ -224,6 +230,9 @@ export function loadPluginManifest( kind, channels, providers, + speechProviders, + mediaUnderstandingProviders, + imageGenerationProviders, cliBackends, providerAuthEnvVars, providerAuthChoices, diff --git a/src/tts/provider-registry.test.ts b/src/tts/provider-registry.test.ts index b0b6a08abf5..5a084466e55 100644 --- a/src/tts/provider-registry.test.ts +++ b/src/tts/provider-registry.test.ts @@ -58,7 +58,7 @@ describe("speech provider registry", () => { const providers = listSpeechProviders(); - expect(providers.map((provider) => provider.id)).toEqual(["openai", "elevenlabs", "microsoft"]); + expect(providers.map((provider) => provider.id)).toEqual(["openai"]); expect(loadOpenClawPluginsMock).not.toHaveBeenCalled(); }); @@ -76,22 +76,14 @@ describe("speech provider registry", () => { const cfg = {} as OpenClawConfig; - expect(listSpeechProviders(cfg).map((provider) => provider.id)).toEqual([ - "openai", - "elevenlabs", - "microsoft", - ]); + expect(listSpeechProviders(cfg).map((provider) => provider.id)).toEqual(["microsoft"]); expect(getSpeechProvider("edge", cfg)?.id).toBe("microsoft"); expect(loadOpenClawPluginsMock).toHaveBeenCalledWith({ config: cfg }); }); - it("returns builtin providers when neither plugins nor active registry provide speech support", () => { - expect(listSpeechProviders().map((provider) => provider.id)).toEqual([ - "openai", - "elevenlabs", - "microsoft", - ]); - expect(getSpeechProvider("openai")?.id).toBe("openai"); + it("returns no providers when neither plugins nor active registry provide speech support", () => { + expect(listSpeechProviders()).toEqual([]); + expect(getSpeechProvider("openai")).toBeUndefined(); }); it("normalizes the legacy edge alias to microsoft", () => { diff --git a/src/tts/provider-registry.ts b/src/tts/provider-registry.ts index 372473e0674..5fc6485e066 100644 --- a/src/tts/provider-registry.ts +++ b/src/tts/provider-registry.ts @@ -1,18 +1,9 @@ -import { buildElevenLabsSpeechProvider } from "../../extensions/elevenlabs/speech-provider.js"; -import { buildMicrosoftSpeechProvider } from "../../extensions/microsoft/speech-provider.js"; -import { buildOpenAISpeechProvider } from "../../extensions/openai/speech-provider.js"; import type { OpenClawConfig } from "../config/config.js"; import { loadOpenClawPlugins } from "../plugins/loader.js"; import { getActivePluginRegistry } from "../plugins/runtime.js"; import type { SpeechProviderPlugin } from "../plugins/types.js"; import type { SpeechProviderId } from "./provider-types.js"; -const BUILTIN_SPEECH_PROVIDER_BUILDERS = [ - buildOpenAISpeechProvider, - buildElevenLabsSpeechProvider, - buildMicrosoftSpeechProvider, -] as const satisfies readonly (() => SpeechProviderPlugin)[]; - function trimToUndefined(value: string | undefined): string | undefined { const trimmed = value?.trim().toLowerCase(); return trimmed ? trimmed : undefined; @@ -58,9 +49,6 @@ function buildProviderMaps(cfg?: OpenClawConfig): { } }; - for (const buildProvider of BUILTIN_SPEECH_PROVIDER_BUILDERS) { - register(buildProvider()); - } for (const provider of resolveSpeechProviderPluginEntries(cfg)) { register(provider); } diff --git a/src/tts/tts-core.ts b/src/tts/tts-core.ts index f665b005a51..809c18c2d78 100644 --- a/src/tts/tts-core.ts +++ b/src/tts/tts-core.ts @@ -1,6 +1,5 @@ -import { rmSync, statSync } from "node:fs"; +import { rmSync } from "node:fs"; import { completeSimple, type TextContent } from "@mariozechner/pi-ai"; -import { EdgeTTS } from "node-edge-tts"; import { getApiKeyForModel, requireApiKey } from "../agents/model-auth.js"; import { buildModelAliasIndex, @@ -18,7 +17,6 @@ import type { TtsDirectiveParseResult, } from "./tts.js"; -const DEFAULT_ELEVENLABS_BASE_URL = "https://api.elevenlabs.io"; export const DEFAULT_OPENAI_BASE_URL = "https://api.openai.com/v1"; const TEMP_FILE_CLEANUP_DELAY_MS = 5 * 60 * 1000; // 5 minutes @@ -26,14 +24,6 @@ export function isValidVoiceId(voiceId: string): boolean { return /^[a-zA-Z0-9]{10,40}$/.test(voiceId); } -function normalizeElevenLabsBaseUrl(baseUrl: string): string { - const trimmed = baseUrl.trim(); - if (!trimmed) { - return DEFAULT_ELEVENLABS_BASE_URL; - } - return trimmed.replace(/\/+$/, ""); -} - function normalizeOpenAITtsBaseUrl(baseUrl?: string): string { const trimmed = baseUrl?.trim(); if (!trimmed) { @@ -53,13 +43,6 @@ function requireInRange(value: number, min: number, max: number, label: string): } } -function assertElevenLabsVoiceSettings(settings: ResolvedTtsConfig["elevenlabs"]["voiceSettings"]) { - requireInRange(settings.stability, 0, 1, "stability"); - requireInRange(settings.similarityBoost, 0, 1, "similarityBoost"); - requireInRange(settings.style, 0, 1, "style"); - requireInRange(settings.speed, 0.5, 2, "speed"); -} - function normalizeLanguageCode(code?: string): string | undefined { const trimmed = code?.trim(); if (!trimmed) { @@ -538,177 +521,3 @@ export function scheduleCleanup( }, delayMs); timer.unref(); } - -export async function elevenLabsTTS(params: { - text: string; - apiKey: string; - baseUrl: string; - voiceId: string; - modelId: string; - outputFormat: string; - seed?: number; - applyTextNormalization?: "auto" | "on" | "off"; - languageCode?: string; - voiceSettings: ResolvedTtsConfig["elevenlabs"]["voiceSettings"]; - timeoutMs: number; -}): Promise { - const { - text, - apiKey, - baseUrl, - voiceId, - modelId, - outputFormat, - seed, - applyTextNormalization, - languageCode, - voiceSettings, - timeoutMs, - } = params; - if (!isValidVoiceId(voiceId)) { - throw new Error("Invalid voiceId format"); - } - assertElevenLabsVoiceSettings(voiceSettings); - const normalizedLanguage = normalizeLanguageCode(languageCode); - const normalizedNormalization = normalizeApplyTextNormalization(applyTextNormalization); - const normalizedSeed = normalizeSeed(seed); - - const controller = new AbortController(); - const timeout = setTimeout(() => controller.abort(), timeoutMs); - - try { - const url = new URL(`${normalizeElevenLabsBaseUrl(baseUrl)}/v1/text-to-speech/${voiceId}`); - if (outputFormat) { - url.searchParams.set("output_format", outputFormat); - } - - const response = await fetch(url.toString(), { - method: "POST", - headers: { - "xi-api-key": apiKey, - "Content-Type": "application/json", - Accept: "audio/mpeg", - }, - body: JSON.stringify({ - text, - model_id: modelId, - seed: normalizedSeed, - apply_text_normalization: normalizedNormalization, - language_code: normalizedLanguage, - voice_settings: { - stability: voiceSettings.stability, - similarity_boost: voiceSettings.similarityBoost, - style: voiceSettings.style, - use_speaker_boost: voiceSettings.useSpeakerBoost, - speed: voiceSettings.speed, - }, - }), - signal: controller.signal, - }); - - if (!response.ok) { - throw new Error(`ElevenLabs API error (${response.status})`); - } - - return Buffer.from(await response.arrayBuffer()); - } finally { - clearTimeout(timeout); - } -} - -export async function openaiTTS(params: { - text: string; - apiKey: string; - baseUrl: string; - model: string; - voice: string; - speed?: number; - instructions?: string; - responseFormat: "mp3" | "opus" | "pcm"; - timeoutMs: number; -}): Promise { - const { text, apiKey, baseUrl, model, voice, speed, instructions, responseFormat, timeoutMs } = - params; - const effectiveInstructions = resolveOpenAITtsInstructions(model, instructions); - - if (!isValidOpenAIModel(model, baseUrl)) { - throw new Error(`Invalid model: ${model}`); - } - if (!isValidOpenAIVoice(voice, baseUrl)) { - throw new Error(`Invalid voice: ${voice}`); - } - - const controller = new AbortController(); - const timeout = setTimeout(() => controller.abort(), timeoutMs); - - try { - const response = await fetch(`${baseUrl}/audio/speech`, { - method: "POST", - headers: { - Authorization: `Bearer ${apiKey}`, - "Content-Type": "application/json", - }, - body: JSON.stringify({ - model, - input: text, - voice, - response_format: responseFormat, - ...(speed != null && { speed }), - ...(effectiveInstructions != null && { instructions: effectiveInstructions }), - }), - signal: controller.signal, - }); - - if (!response.ok) { - throw new Error(`OpenAI TTS API error (${response.status})`); - } - - return Buffer.from(await response.arrayBuffer()); - } finally { - clearTimeout(timeout); - } -} - -export function inferEdgeExtension(outputFormat: string): string { - const normalized = outputFormat.toLowerCase(); - if (normalized.includes("webm")) { - return ".webm"; - } - if (normalized.includes("ogg")) { - return ".ogg"; - } - if (normalized.includes("opus")) { - return ".opus"; - } - if (normalized.includes("wav") || normalized.includes("riff") || normalized.includes("pcm")) { - return ".wav"; - } - return ".mp3"; -} - -export async function edgeTTS(params: { - text: string; - outputPath: string; - config: ResolvedTtsConfig["edge"]; - timeoutMs: number; -}): Promise { - const { text, outputPath, config, timeoutMs } = params; - const tts = new EdgeTTS({ - voice: config.voice, - lang: config.lang, - outputFormat: config.outputFormat, - saveSubtitles: config.saveSubtitles, - proxy: config.proxy, - rate: config.rate, - pitch: config.pitch, - volume: config.volume, - timeout: config.timeoutMs ?? timeoutMs, - }); - await tts.ttsPromise(text, outputPath); - - const { size } = statSync(outputPath); - - if (size === 0) { - throw new Error("Edge TTS produced empty audio file"); - } -}