refactor: clean plugin capability boundaries

This commit is contained in:
Peter Steinberger
2026-03-26 21:40:58 +00:00
parent d00dc5f46b
commit ce9dff1458
49 changed files with 572 additions and 342 deletions

View File

@@ -5,33 +5,6 @@
"category": "legacy",
"entrypoint": "index",
"exports": [
{
"declaration": "export function buildFalImageGenerationProvider(): ImageGenerationProvider;",
"exportName": "buildFalImageGenerationProvider",
"kind": "function",
"source": {
"line": 190,
"path": "extensions/fal/image-generation-provider.ts"
}
},
{
"declaration": "export function buildGoogleImageGenerationProvider(): ImageGenerationProvider;",
"exportName": "buildGoogleImageGenerationProvider",
"kind": "function",
"source": {
"line": 98,
"path": "extensions/google/image-generation-provider.ts"
}
},
{
"declaration": "export function buildOpenAIImageGenerationProvider(): ImageGenerationProvider;",
"exportName": "buildOpenAIImageGenerationProvider",
"kind": "function",
"source": {
"line": 22,
"path": "extensions/openai/image-generation-provider.ts"
}
},
{
"declaration": "export function delegateCompactionToRuntime(params: { sessionId: string; sessionKey?: string | undefined; sessionFile: string; tokenBudget?: number | undefined; force?: boolean | undefined; currentTokenCount?: number | undefined; compactionTarget?: \"budget\" | ... 1 more ... | undefined; customInstructions?: string | undefined; runtimeContext?: ContextEngineRuntimeContext | undefined; }): Promise<...>;",
"exportName": "delegateCompactionToRuntime",
@@ -923,7 +896,7 @@
"exportName": "createMessageToolButtonsSchema",
"kind": "function",
"source": {
"line": 11,
"line": 12,
"path": "src/plugin-sdk/channel-actions.ts"
}
},
@@ -932,7 +905,7 @@
"exportName": "createMessageToolCardSchema",
"kind": "function",
"source": {
"line": 29,
"line": 30,
"path": "src/plugin-sdk/channel-actions.ts"
}
},
@@ -954,6 +927,15 @@
"path": "src/channels/plugins/actions/shared.ts"
}
},
{
"declaration": "export function optionalStringEnum<T extends readonly string[]>(values: T, options?: StringEnumOptions<T>): TOptional<TUnsafe<T[number]>>;",
"exportName": "optionalStringEnum",
"kind": "function",
"source": {
"line": 31,
"path": "src/agents/schema/typebox.ts"
}
},
{
"declaration": "export function resolveReactionMessageId(params: { args: Record<string, unknown>; toolContext?: ReactionToolContext | undefined; }): string | number | undefined;",
"exportName": "resolveReactionMessageId",
@@ -962,6 +944,15 @@
"line": 7,
"path": "src/channels/plugins/actions/reaction-message-id.ts"
}
},
{
"declaration": "export function stringEnum<T extends readonly string[]>(values: T, options?: StringEnumOptions<T>): TUnsafe<T[number]>;",
"exportName": "stringEnum",
"kind": "function",
"source": {
"line": 15,
"path": "src/agents/schema/typebox.ts"
}
}
],
"importSpecifier": "openclaw/plugin-sdk/channel-actions",

View File

@@ -1,7 +1,4 @@
{"category":"legacy","entrypoint":"index","importSpecifier":"openclaw/plugin-sdk","recordType":"module","sourceLine":1,"sourcePath":"src/plugin-sdk/index.ts"}
{"declaration":"export function buildFalImageGenerationProvider(): ImageGenerationProvider;","entrypoint":"index","exportName":"buildFalImageGenerationProvider","importSpecifier":"openclaw/plugin-sdk","kind":"function","recordType":"export","sourceLine":190,"sourcePath":"extensions/fal/image-generation-provider.ts"}
{"declaration":"export function buildGoogleImageGenerationProvider(): ImageGenerationProvider;","entrypoint":"index","exportName":"buildGoogleImageGenerationProvider","importSpecifier":"openclaw/plugin-sdk","kind":"function","recordType":"export","sourceLine":98,"sourcePath":"extensions/google/image-generation-provider.ts"}
{"declaration":"export function buildOpenAIImageGenerationProvider(): ImageGenerationProvider;","entrypoint":"index","exportName":"buildOpenAIImageGenerationProvider","importSpecifier":"openclaw/plugin-sdk","kind":"function","recordType":"export","sourceLine":22,"sourcePath":"extensions/openai/image-generation-provider.ts"}
{"declaration":"export function delegateCompactionToRuntime(params: { sessionId: string; sessionKey?: string | undefined; sessionFile: string; tokenBudget?: number | undefined; force?: boolean | undefined; currentTokenCount?: number | undefined; compactionTarget?: \"budget\" | ... 1 more ... | undefined; customInstructions?: string | undefined; runtimeContext?: ContextEngineRuntimeContext | undefined; }): Promise<...>;","entrypoint":"index","exportName":"delegateCompactionToRuntime","importSpecifier":"openclaw/plugin-sdk","kind":"function","recordType":"export","sourceLine":16,"sourcePath":"src/context-engine/delegate.ts"}
{"declaration":"export function emptyPluginConfigSchema(): OpenClawPluginConfigSchema;","entrypoint":"index","exportName":"emptyPluginConfigSchema","importSpecifier":"openclaw/plugin-sdk","kind":"function","recordType":"export","sourceLine":13,"sourcePath":"src/plugins/config-schema.ts"}
{"declaration":"export function onDiagnosticEvent(listener: (evt: DiagnosticEventPayload) => void): () => void;","entrypoint":"index","exportName":"onDiagnosticEvent","importSpecifier":"openclaw/plugin-sdk","kind":"function","recordType":"export","sourceLine":229,"sourcePath":"src/infra/diagnostic-events.ts"}
@@ -100,11 +97,13 @@
{"declaration":"export type BasicAllowlistResolutionEntry = BasicAllowlistResolutionEntry;","entrypoint":"allow-from","exportName":"BasicAllowlistResolutionEntry","importSpecifier":"openclaw/plugin-sdk/allow-from","kind":"type","recordType":"export","sourceLine":129,"sourcePath":"src/plugin-sdk/allow-from.ts"}
{"declaration":"export type CompiledAllowlist = CompiledAllowlist;","entrypoint":"allow-from","exportName":"CompiledAllowlist","importSpecifier":"openclaw/plugin-sdk/allow-from","kind":"type","recordType":"export","sourceLine":19,"sourcePath":"src/channels/allowlist-match.ts"}
{"category":"channel","entrypoint":"channel-actions","importSpecifier":"openclaw/plugin-sdk/channel-actions","recordType":"module","sourceLine":1,"sourcePath":"src/plugin-sdk/channel-actions.ts"}
{"declaration":"export function createMessageToolButtonsSchema(): TSchema;","entrypoint":"channel-actions","exportName":"createMessageToolButtonsSchema","importSpecifier":"openclaw/plugin-sdk/channel-actions","kind":"function","recordType":"export","sourceLine":11,"sourcePath":"src/plugin-sdk/channel-actions.ts"}
{"declaration":"export function createMessageToolCardSchema(): TSchema;","entrypoint":"channel-actions","exportName":"createMessageToolCardSchema","importSpecifier":"openclaw/plugin-sdk/channel-actions","kind":"function","recordType":"export","sourceLine":29,"sourcePath":"src/plugin-sdk/channel-actions.ts"}
{"declaration":"export function createMessageToolButtonsSchema(): TSchema;","entrypoint":"channel-actions","exportName":"createMessageToolButtonsSchema","importSpecifier":"openclaw/plugin-sdk/channel-actions","kind":"function","recordType":"export","sourceLine":12,"sourcePath":"src/plugin-sdk/channel-actions.ts"}
{"declaration":"export function createMessageToolCardSchema(): TSchema;","entrypoint":"channel-actions","exportName":"createMessageToolCardSchema","importSpecifier":"openclaw/plugin-sdk/channel-actions","kind":"function","recordType":"export","sourceLine":30,"sourcePath":"src/plugin-sdk/channel-actions.ts"}
{"declaration":"export function createUnionActionGate<TAccount, TKey extends string>(accounts: readonly TAccount[], createGate: (account: TAccount) => OptionalDefaultGate<TKey>): OptionalDefaultGate<TKey>;","entrypoint":"channel-actions","exportName":"createUnionActionGate","importSpecifier":"openclaw/plugin-sdk/channel-actions","kind":"function","recordType":"export","sourceLine":13,"sourcePath":"src/channels/plugins/actions/shared.ts"}
{"declaration":"export function listTokenSourcedAccounts<TAccount extends TokenSourcedAccount>(accounts: readonly TAccount[]): TAccount[];","entrypoint":"channel-actions","exportName":"listTokenSourcedAccounts","importSpecifier":"openclaw/plugin-sdk/channel-actions","kind":"function","recordType":"export","sourceLine":7,"sourcePath":"src/channels/plugins/actions/shared.ts"}
{"declaration":"export function optionalStringEnum<T extends readonly string[]>(values: T, options?: StringEnumOptions<T>): TOptional<TUnsafe<T[number]>>;","entrypoint":"channel-actions","exportName":"optionalStringEnum","importSpecifier":"openclaw/plugin-sdk/channel-actions","kind":"function","recordType":"export","sourceLine":31,"sourcePath":"src/agents/schema/typebox.ts"}
{"declaration":"export function resolveReactionMessageId(params: { args: Record<string, unknown>; toolContext?: ReactionToolContext | undefined; }): string | number | undefined;","entrypoint":"channel-actions","exportName":"resolveReactionMessageId","importSpecifier":"openclaw/plugin-sdk/channel-actions","kind":"function","recordType":"export","sourceLine":7,"sourcePath":"src/channels/plugins/actions/reaction-message-id.ts"}
{"declaration":"export function stringEnum<T extends readonly string[]>(values: T, options?: StringEnumOptions<T>): TUnsafe<T[number]>;","entrypoint":"channel-actions","exportName":"stringEnum","importSpecifier":"openclaw/plugin-sdk/channel-actions","kind":"function","recordType":"export","sourceLine":15,"sourcePath":"src/agents/schema/typebox.ts"}
{"category":"channel","entrypoint":"channel-config-schema","importSpecifier":"openclaw/plugin-sdk/channel-config-schema","recordType":"module","sourceLine":1,"sourcePath":"src/plugin-sdk/channel-config-schema.ts"}
{"declaration":"export function buildCatchallMultiAccountChannelSchema<T extends ExtendableZodObject>(accountSchema: T): T;","entrypoint":"channel-config-schema","exportName":"buildCatchallMultiAccountChannelSchema","importSpecifier":"openclaw/plugin-sdk/channel-config-schema","kind":"function","recordType":"export","sourceLine":26,"sourcePath":"src/channels/plugins/config-schema.ts"}
{"declaration":"export function buildChannelConfigSchema(schema: ZodType<unknown, unknown, $ZodTypeInternals<unknown, unknown>>): ChannelConfigSchema;","entrypoint":"channel-config-schema","exportName":"buildChannelConfigSchema","importSpecifier":"openclaw/plugin-sdk/channel-config-schema","kind":"function","recordType":"export","sourceLine":35,"sourcePath":"src/channels/plugins/config-schema.ts"}

View File

@@ -1,6 +1,7 @@
{
"id": "anthropic",
"providers": ["anthropic"],
"mediaUnderstandingProviders": ["anthropic"],
"cliBackends": ["claude-cli"],
"providerAuthEnvVars": {
"anthropic": ["ANTHROPIC_OAUTH_TOKEN", "ANTHROPIC_API_KEY"]

View File

@@ -7,7 +7,7 @@ import {
normalizeBaseUrl,
postTranscriptionRequest,
requireTranscriptionText,
} from "openclaw/plugin-sdk/media-understanding";
} from "openclaw/plugin-sdk/provider-http";
export const DEFAULT_DEEPGRAM_AUDIO_BASE_URL = "https://api.deepgram.com/v1";
export const DEFAULT_DEEPGRAM_AUDIO_MODEL = "nova-3";

View File

@@ -1,5 +1,6 @@
{
"id": "deepgram",
"mediaUnderstandingProviders": ["deepgram"],
"configSchema": {
"type": "object",
"additionalProperties": false,

View File

@@ -1,5 +1,6 @@
{
"id": "elevenlabs",
"speechProviders": ["elevenlabs"],
"configSchema": {
"type": "object",
"additionalProperties": false,

View File

@@ -1,5 +1,6 @@
import type { SpeechProviderPlugin } from "openclaw/plugin-sdk/core";
import { elevenLabsTTS, type SpeechVoiceOption } from "openclaw/plugin-sdk/speech";
import type { SpeechVoiceOption } from "openclaw/plugin-sdk/speech";
import { elevenLabsTTS } from "./tts.js";
const ELEVENLABS_TTS_MODELS = [
"eleven_multilingual_v2",

View File

@@ -0,0 +1,150 @@
// Base endpoint used when the caller does not configure a custom URL.
const DEFAULT_ELEVENLABS_BASE_URL = "https://api.elevenlabs.io";

/** True when the id looks like an ElevenLabs voice id: 10-40 ASCII alphanumerics. */
function isValidVoiceId(voiceId: string): boolean {
  return /^[a-zA-Z0-9]{10,40}$/.test(voiceId);
}

/**
 * Normalizes a configured base URL: blank/undefined input falls back to the
 * default endpoint, and trailing slashes are stripped so path joining stays
 * predictable.
 */
function normalizeElevenLabsBaseUrl(baseUrl?: string): string {
  const candidate = baseUrl?.trim() ?? "";
  if (candidate.length === 0) {
    return DEFAULT_ELEVENLABS_BASE_URL;
  }
  return candidate.replace(/\/+$/, "");
}
/**
 * Validates and lowercases an optional ISO 639-1 language code.
 * Blank/undefined input yields undefined; anything that is not exactly two
 * ASCII letters after trimming is rejected.
 */
function normalizeLanguageCode(code?: string): string | undefined {
  const value = code?.trim().toLowerCase();
  if (!value) {
    return undefined;
  }
  if (!/^[a-z]{2}$/.test(value)) {
    throw new Error("languageCode must be a 2-letter ISO 639-1 code (e.g. en, de, fr)");
  }
  return value;
}
/**
 * Validates the optional text-normalization mode. Accepts "auto" | "on" | "off"
 * (case-insensitive, surrounding whitespace ignored); blank/undefined input
 * yields undefined; anything else throws.
 */
function normalizeApplyTextNormalization(mode?: string): "auto" | "on" | "off" | undefined {
  const value = mode?.trim().toLowerCase();
  if (!value) {
    return undefined;
  }
  switch (value) {
    case "auto":
    case "on":
    case "off":
      return value;
    default:
      throw new Error("applyTextNormalization must be one of: auto, on, off");
  }
}
/**
 * Floors an optional seed to an integer and validates it against the uint32
 * range [0, 4294967295]. null/undefined yields undefined; non-finite or
 * out-of-range input throws.
 */
function normalizeSeed(seed?: number): number | undefined {
  if (seed == null) {
    return undefined;
  }
  const floored = Math.floor(seed);
  const inRange = Number.isFinite(floored) && floored >= 0 && floored <= 4_294_967_295;
  if (!inRange) {
    throw new Error("seed must be between 0 and 4294967295");
  }
  return floored;
}
/** Throws unless `value` is a finite number within the inclusive range [min, max]. */
function requireInRange(value: number, min: number, max: number, label: string): void {
  const ok = Number.isFinite(value) && value >= min && value <= max;
  if (!ok) {
    throw new Error(`${label} must be between ${min} and ${max}`);
  }
}
/**
 * Validates ElevenLabs voice-setting ranges before a request is sent:
 * stability/similarityBoost/style must be in [0, 1] and speed in [0.5, 2].
 * useSpeakerBoost is a plain boolean and needs no range check.
 */
function assertElevenLabsVoiceSettings(settings: {
  stability: number;
  similarityBoost: number;
  style: number;
  useSpeakerBoost: boolean;
  speed: number;
}) {
  const checks: Array<[number, number, number, string]> = [
    [settings.stability, 0, 1, "stability"],
    [settings.similarityBoost, 0, 1, "similarityBoost"],
    [settings.style, 0, 1, "style"],
    [settings.speed, 0.5, 2, "speed"],
  ];
  for (const [value, min, max, label] of checks) {
    requireInRange(value, min, max, label);
  }
}
/**
 * Synthesizes speech via the ElevenLabs text-to-speech API and returns the
 * raw audio bytes.
 *
 * Validates the voice id and voice settings up front, normalizes the optional
 * language code / text-normalization mode / seed, and aborts the HTTP request
 * after `timeoutMs` milliseconds.
 *
 * @throws Error on an invalid voice id, out-of-range voice settings, invalid
 *         optional parameters, or a non-2xx API response.
 */
export async function elevenLabsTTS(params: {
  text: string;
  apiKey: string;
  baseUrl: string;
  voiceId: string;
  modelId: string;
  outputFormat: string;
  seed?: number;
  applyTextNormalization?: "auto" | "on" | "off";
  languageCode?: string;
  voiceSettings: {
    stability: number;
    similarityBoost: number;
    style: number;
    useSpeakerBoost: boolean;
    speed: number;
  };
  timeoutMs: number;
}): Promise<Buffer> {
  const {
    text,
    apiKey,
    baseUrl,
    voiceId,
    modelId,
    outputFormat,
    seed,
    applyTextNormalization,
    languageCode,
    voiceSettings,
    timeoutMs,
  } = params;
  // Reject suspicious ids early so arbitrary strings never reach the URL path below.
  if (!isValidVoiceId(voiceId)) {
    throw new Error("Invalid voiceId format");
  }
  assertElevenLabsVoiceSettings(voiceSettings);
  const normalizedLanguage = normalizeLanguageCode(languageCode);
  const normalizedNormalization = normalizeApplyTextNormalization(applyTextNormalization);
  const normalizedSeed = normalizeSeed(seed);
  // Abort the fetch if the API does not answer within timeoutMs.
  const controller = new AbortController();
  const timeout = setTimeout(() => controller.abort(), timeoutMs);
  try {
    const url = new URL(`${normalizeElevenLabsBaseUrl(baseUrl)}/v1/text-to-speech/${voiceId}`);
    if (outputFormat) {
      url.searchParams.set("output_format", outputFormat);
    }
    const response = await fetch(url.toString(), {
      method: "POST",
      headers: {
        "xi-api-key": apiKey,
        "Content-Type": "application/json",
        // NOTE(review): Accept is always audio/mpeg even for non-mp3 output
        // formats — presumably the API ignores it; confirm against the docs.
        Accept: "audio/mpeg",
      },
      body: JSON.stringify({
        text,
        model_id: modelId,
        // JSON.stringify drops undefined-valued fields, so unset optional
        // parameters are simply omitted from the request payload.
        seed: normalizedSeed,
        apply_text_normalization: normalizedNormalization,
        language_code: normalizedLanguage,
        voice_settings: {
          stability: voiceSettings.stability,
          similarity_boost: voiceSettings.similarityBoost,
          style: voiceSettings.style,
          use_speaker_boost: voiceSettings.useSpeakerBoost,
          speed: voiceSettings.speed,
        },
      }),
      signal: controller.signal,
    });
    if (!response.ok) {
      throw new Error(`ElevenLabs API error (${response.status})`);
    }
    return Buffer.from(await response.arrayBuffer());
  } finally {
    // Always clear the timer so the process can exit promptly.
    clearTimeout(timeout);
  }
}

View File

@@ -1,6 +1,7 @@
{
"id": "fal",
"providers": ["fal"],
"imageGenerationProviders": ["fal"],
"providerAuthEnvVars": {
"fal": ["FAL_KEY"]
},

View File

@@ -1,9 +1,4 @@
import type { ImageGenerationProvider } from "openclaw/plugin-sdk/image-generation";
import {
assertOkOrThrowHttpError,
normalizeBaseUrl,
postJsonRequest,
} from "openclaw/plugin-sdk/media-understanding";
import { resolveApiKeyForProvider } from "openclaw/plugin-sdk/provider-auth";
import {
DEFAULT_GOOGLE_API_BASE_URL,
@@ -11,6 +6,11 @@ import {
normalizeGoogleModelId,
parseGeminiAuth,
} from "openclaw/plugin-sdk/provider-google";
import {
assertOkOrThrowHttpError,
normalizeBaseUrl,
postJsonRequest,
} from "openclaw/plugin-sdk/provider-http";
const DEFAULT_GOOGLE_IMAGE_MODEL = "gemini-3.1-flash-image-preview";
const DEFAULT_OUTPUT_MIME = "image/png";

View File

@@ -1,15 +1,17 @@
import {
assertOkOrThrowHttpError,
describeImageWithModel,
describeImagesWithModel,
normalizeBaseUrl,
postJsonRequest,
type AudioTranscriptionRequest,
type AudioTranscriptionResult,
type MediaUnderstandingProvider,
type VideoDescriptionRequest,
type VideoDescriptionResult,
} from "openclaw/plugin-sdk/media-understanding";
import {
assertOkOrThrowHttpError,
normalizeBaseUrl,
postJsonRequest,
} from "openclaw/plugin-sdk/provider-http";
import {
DEFAULT_GOOGLE_API_BASE_URL,
normalizeGoogleApiBaseUrl,

View File

@@ -1,6 +1,8 @@
{
"id": "google",
"providers": ["google", "google-gemini-cli"],
"mediaUnderstandingProviders": ["google"],
"imageGenerationProviders": ["google"],
"cliBackends": ["google-gemini-cli"],
"providerAuthEnvVars": {
"google": ["GEMINI_API_KEY", "GOOGLE_API_KEY"]

View File

@@ -1,5 +1,6 @@
{
"id": "groq",
"mediaUnderstandingProviders": ["groq"],
"configSchema": {
"type": "object",
"additionalProperties": false,

View File

@@ -1,5 +1,6 @@
{
"id": "microsoft",
"speechProviders": ["microsoft"],
"configSchema": {
"type": "object",
"additionalProperties": false,

View File

@@ -8,7 +8,8 @@ import {
import type { SpeechProviderPlugin } from "openclaw/plugin-sdk/core";
import { resolvePreferredOpenClawTmpDir } from "openclaw/plugin-sdk/llm-task";
import { isVoiceCompatibleAudio } from "openclaw/plugin-sdk/media-runtime";
import { edgeTTS, inferEdgeExtension, type SpeechVoiceOption } from "openclaw/plugin-sdk/speech";
import type { SpeechVoiceOption } from "openclaw/plugin-sdk/speech";
import { edgeTTS, inferEdgeExtension } from "./tts.js";
const DEFAULT_EDGE_OUTPUT_FORMAT = "audio-24khz-48kbitrate-mono-mp3";

View File

@@ -3,7 +3,7 @@ import { tmpdir } from "node:os";
import path from "node:path";
import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
let edgeTTS: typeof import("./tts-core.js").edgeTTS;
let edgeTTS: typeof import("./tts.js").edgeTTS;
let mockTtsPromise = vi.fn<(text: string, filePath: string) => Promise<void>>();
@@ -16,15 +16,13 @@ vi.mock("node-edge-tts", () => ({
}));
const baseEdgeConfig = {
enabled: true,
voice: "en-US-MichelleNeural",
lang: "en-US",
outputFormat: "audio-24khz-48kbitrate-mono-mp3",
outputFormatConfigured: false,
saveSubtitles: false,
};
describe("edgeTTS empty audio validation", () => {
describe("edgeTTS empty audio validation", () => {
let tempDir: string | undefined;
beforeEach(async () => {
@@ -36,7 +34,7 @@ describe("edgeTTS empty audio validation", () => {
}
},
}));
({ edgeTTS } = await import("./tts-core.js"));
({ edgeTTS } = await import("./tts.js"));
});
afterEach(() => {

View File

@@ -0,0 +1,55 @@
import { statSync } from "node:fs";
import { EdgeTTS } from "node-edge-tts";
/**
 * Maps an Edge TTS output-format string (e.g. "audio-24khz-48kbitrate-mono-mp3")
 * to a file extension. Container hints are matched case-insensitively in
 * priority order; anything unrecognized defaults to ".mp3".
 */
export function inferEdgeExtension(outputFormat: string): string {
  const fmt = outputFormat.toLowerCase();
  // Order matters: first matching hint wins (mirrors the original checks).
  const table: Array<[string[], string]> = [
    [["webm"], ".webm"],
    [["ogg"], ".ogg"],
    [["opus"], ".opus"],
    [["wav", "riff", "pcm"], ".wav"],
  ];
  for (const [needles, ext] of table) {
    if (needles.some((needle) => fmt.includes(needle))) {
      return ext;
    }
  }
  return ".mp3";
}
/**
 * Generates speech with Microsoft Edge TTS (node-edge-tts) and writes the
 * audio to `outputPath`.
 *
 * A per-call `config.timeoutMs` takes precedence over the caller-level
 * `timeoutMs` fallback.
 *
 * @throws Error when the produced audio file is empty (zero bytes), which
 *         indicates synthesis silently failed.
 */
export async function edgeTTS(params: {
  text: string;
  outputPath: string;
  config: {
    voice: string;
    lang: string;
    outputFormat: string;
    saveSubtitles: boolean;
    proxy?: string;
    rate?: string;
    pitch?: string;
    volume?: string;
    timeoutMs?: number;
  };
  timeoutMs: number;
}): Promise<void> {
  const { text, outputPath, config, timeoutMs } = params;
  const tts = new EdgeTTS({
    voice: config.voice,
    lang: config.lang,
    outputFormat: config.outputFormat,
    saveSubtitles: config.saveSubtitles,
    proxy: config.proxy,
    rate: config.rate,
    pitch: config.pitch,
    volume: config.volume,
    // Prefer the per-config timeout; fall back to the caller-supplied one.
    timeout: config.timeoutMs ?? timeoutMs,
  });
  await tts.ttsPromise(text, outputPath);
  // Guard against silent failures: an empty file means no audio was produced.
  const { size } = statSync(outputPath);
  if (size === 0) {
    throw new Error("Edge TTS produced empty audio file");
  }
}

View File

@@ -1,6 +1,8 @@
{
"id": "minimax",
"providers": ["minimax", "minimax-portal"],
"mediaUnderstandingProviders": ["minimax", "minimax-portal"],
"imageGenerationProviders": ["minimax", "minimax-portal"],
"providerAuthEnvVars": {
"minimax": ["MINIMAX_API_KEY"],
"minimax-portal": ["MINIMAX_OAUTH_TOKEN", "MINIMAX_API_KEY"]

View File

@@ -1,6 +1,7 @@
{
"id": "mistral",
"providers": ["mistral"],
"mediaUnderstandingProviders": ["mistral"],
"providerAuthEnvVars": {
"mistral": ["MISTRAL_API_KEY"]
},

View File

@@ -4,10 +4,12 @@ import {
type MediaUnderstandingProvider,
type VideoDescriptionRequest,
type VideoDescriptionResult,
} from "openclaw/plugin-sdk/media-understanding";
import {
assertOkOrThrowHttpError,
normalizeBaseUrl,
postJsonRequest,
} from "openclaw/plugin-sdk/media-understanding";
} from "openclaw/plugin-sdk/provider-http";
export const DEFAULT_MOONSHOT_VIDEO_BASE_URL = "https://api.moonshot.ai/v1";
const DEFAULT_MOONSHOT_VIDEO_MODEL = "kimi-k2.5";

View File

@@ -1,6 +1,7 @@
{
"id": "moonshot",
"providers": ["moonshot"],
"mediaUnderstandingProviders": ["moonshot"],
"providerAuthEnvVars": {
"moonshot": ["MOONSHOT_API_KEY"]
},

View File

@@ -1,6 +1,9 @@
{
"id": "openai",
"providers": ["openai", "openai-codex"],
"speechProviders": ["openai"],
"mediaUnderstandingProviders": ["openai", "openai-codex"],
"imageGenerationProviders": ["openai"],
"cliBackends": ["codex-cli"],
"providerAuthEnvVars": {
"openai": ["OPENAI_API_KEY"]

View File

@@ -1,5 +1,5 @@
import type { SpeechProviderPlugin } from "openclaw/plugin-sdk/core";
import { OPENAI_TTS_MODELS, OPENAI_TTS_VOICES, openaiTTS } from "openclaw/plugin-sdk/speech";
import { OPENAI_TTS_MODELS, OPENAI_TTS_VOICES, openaiTTS } from "./tts.js";
export function buildOpenAISpeechProvider(): SpeechProviderPlugin {
return {

109
extensions/openai/tts.ts Normal file
View File

@@ -0,0 +1,109 @@
// Default public OpenAI API endpoint.
const DEFAULT_OPENAI_BASE_URL = "https://api.openai.com/v1";

/** Models accepted when targeting the official OpenAI endpoint. */
export const OPENAI_TTS_MODELS = ["gpt-4o-mini-tts", "tts-1", "tts-1-hd"] as const;

/** Voices accepted when targeting the official OpenAI endpoint. */
export const OPENAI_TTS_VOICES = [
  "alloy",
  "ash",
  "ballad",
  "cedar",
  "coral",
  "echo",
  "fable",
  "juniper",
  "marin",
  "onyx",
  "nova",
  "sage",
  "shimmer",
  "verse",
] as const;

type OpenAiTtsVoice = (typeof OPENAI_TTS_VOICES)[number];

/**
 * Normalizes a configured base URL: blank/undefined input falls back to the
 * default endpoint, and trailing slashes are stripped.
 */
function normalizeOpenAITtsBaseUrl(baseUrl?: string): string {
  const candidate = baseUrl?.trim() ?? "";
  return candidate.length === 0 ? DEFAULT_OPENAI_BASE_URL : candidate.replace(/\/+$/, "");
}
/**
 * True when TTS requests target something other than the official OpenAI
 * endpoint. An explicitly supplied baseUrl wins; otherwise the
 * OPENAI_TTS_BASE_URL environment variable is consulted.
 */
function isCustomOpenAIEndpoint(baseUrl?: string): boolean {
  const effective = baseUrl ?? process.env.OPENAI_TTS_BASE_URL;
  return normalizeOpenAITtsBaseUrl(effective) !== DEFAULT_OPENAI_BASE_URL;
}
/**
 * Accepts any model when a custom endpoint is in use; otherwise the model must
 * be one of the known OpenAI TTS models.
 */
function isValidOpenAIModel(model: string, baseUrl?: string): boolean {
  return (
    isCustomOpenAIEndpoint(baseUrl) ||
    OPENAI_TTS_MODELS.includes(model as (typeof OPENAI_TTS_MODELS)[number])
  );
}
/**
 * Type predicate: accepts any voice when a custom endpoint is in use;
 * otherwise the voice must be one of the known OpenAI TTS voices.
 */
function isValidOpenAIVoice(voice: string, baseUrl?: string): voice is OpenAiTtsVoice {
  if (isCustomOpenAIEndpoint(baseUrl)) {
    return true;
  }
  return (OPENAI_TTS_VOICES as readonly string[]).includes(voice);
}
/**
 * Instructions are only honored by gpt-4o-mini-tts models: returns the trimmed
 * instruction text for those models and undefined otherwise (including when
 * the trimmed text is empty).
 */
function resolveOpenAITtsInstructions(model: string, instructions?: string): string | undefined {
  const trimmed = instructions?.trim();
  if (!trimmed || !model.includes("gpt-4o-mini-tts")) {
    return undefined;
  }
  return trimmed;
}
export async function openaiTTS(params: {
text: string;
apiKey: string;
baseUrl: string;
model: string;
voice: string;
speed?: number;
instructions?: string;
responseFormat: "mp3" | "opus" | "pcm";
timeoutMs: number;
}): Promise<Buffer> {
const { text, apiKey, baseUrl, model, voice, speed, instructions, responseFormat, timeoutMs } =
params;
const effectiveInstructions = resolveOpenAITtsInstructions(model, instructions);
if (!isValidOpenAIModel(model, baseUrl)) {
throw new Error(`Invalid model: ${model}`);
}
if (!isValidOpenAIVoice(voice, baseUrl)) {
throw new Error(`Invalid voice: ${voice}`);
}
const controller = new AbortController();
const timeout = setTimeout(() => controller.abort(), timeoutMs);
try {
const response = await fetch(`${baseUrl}/audio/speech`, {
method: "POST",
headers: {
Authorization: `Bearer ${apiKey}`,
"Content-Type": "application/json",
},
body: JSON.stringify({
model,
input: text,
voice,
response_format: responseFormat,
...(speed != null && { speed }),
...(effectiveInstructions != null && { instructions: effectiveInstructions }),
}),
signal: controller.signal,
});
if (!response.ok) {
throw new Error(`OpenAI TTS API error (${response.status})`);
}
return Buffer.from(await response.arrayBuffer());
} finally {
clearTimeout(timeout);
}
}

View File

@@ -1,6 +1,7 @@
{
"id": "zai",
"providers": ["zai"],
"mediaUnderstandingProviders": ["zai"],
"providerAuthEnvVars": {
"zai": ["ZAI_API_KEY", "Z_AI_API_KEY"]
},

View File

@@ -453,6 +453,10 @@
"types": "./dist/plugin-sdk/provider-env-vars.d.ts",
"default": "./dist/plugin-sdk/provider-env-vars.js"
},
"./plugin-sdk/provider-http": {
"types": "./dist/plugin-sdk/provider-http.d.ts",
"default": "./dist/plugin-sdk/provider-http.js"
},
"./plugin-sdk/provider-google": {
"types": "./dist/plugin-sdk/provider-google.d.ts",
"default": "./dist/plugin-sdk/provider-google.js"
@@ -529,6 +533,10 @@
"types": "./dist/plugin-sdk/telegram-core.d.ts",
"default": "./dist/plugin-sdk/telegram-core.js"
},
"./plugin-sdk/telegram-runtime": {
"types": "./dist/plugin-sdk/telegram-runtime.d.ts",
"default": "./dist/plugin-sdk/telegram-runtime.js"
},
"./plugin-sdk/thread-ownership": {
"types": "./dist/plugin-sdk/thread-ownership.d.ts",
"default": "./dist/plugin-sdk/thread-ownership.js"

View File

@@ -103,6 +103,15 @@ function normalizePluginManifest(raw) {
...(normalizeStringList(raw.providers)
? { providers: normalizeStringList(raw.providers) }
: {}),
...(normalizeStringList(raw.speechProviders)
? { speechProviders: normalizeStringList(raw.speechProviders) }
: {}),
...(normalizeStringList(raw.mediaUnderstandingProviders)
? { mediaUnderstandingProviders: normalizeStringList(raw.mediaUnderstandingProviders) }
: {}),
...(normalizeStringList(raw.imageGenerationProviders)
? { imageGenerationProviders: normalizeStringList(raw.imageGenerationProviders) }
: {}),
...(normalizeObject(raw.providerAuthEnvVars)
? { providerAuthEnvVars: raw.providerAuthEnvVars }
: {}),

View File

@@ -103,6 +103,7 @@
"provider-catalog",
"provider-entry",
"provider-env-vars",
"provider-http",
"provider-google",
"provider-models",
"provider-onboard",
@@ -122,6 +123,7 @@
"state-paths",
"telegram",
"telegram-core",
"telegram-runtime",
"thread-ownership",
"tlon",
"tool-send",

View File

@@ -461,7 +461,7 @@ export async function applyMediaUnderstanding(params: {
.find((value) => value && value.trim()) ?? undefined;
const attachments = normalizeMediaAttachments(ctx);
const providerRegistry = buildProviderRegistry(params.providers);
const providerRegistry = buildProviderRegistry(params.providers, cfg);
const cache = createMediaAttachmentCache(attachments, {
localPathRoots: resolveMediaAttachmentLocalRoots({ cfg, ctx }),
});

View File

@@ -23,7 +23,7 @@ export async function runAudioTranscription(params: {
return { transcript: undefined, attachments };
}
const providerRegistry = buildProviderRegistry(params.providers);
const providerRegistry = buildProviderRegistry(params.providers, params.cfg);
const cache = createMediaAttachmentCache(
attachments,
params.localPathRoots ? { localPathRoots: params.localPathRoots } : undefined,

View File

@@ -11,15 +11,10 @@ describe("media-understanding provider registry", () => {
setActivePluginRegistry(createEmptyPluginRegistry());
});
it("keeps core-owned fallback providers registered by default", () => {
it("returns no providers by default when no active registry is present", () => {
const registry = buildMediaUnderstandingRegistry();
const groqProvider = getMediaUnderstandingProvider("groq", registry);
const deepgramProvider = getMediaUnderstandingProvider("deepgram", registry);
expect(groqProvider?.id).toBe("groq");
expect(groqProvider?.capabilities).toEqual(["audio"]);
expect(deepgramProvider?.id).toBe("deepgram");
expect(deepgramProvider?.capabilities).toEqual(["audio"]);
expect(getMediaUnderstandingProvider("groq", registry)).toBeUndefined();
expect(getMediaUnderstandingProvider("deepgram", registry)).toBeUndefined();
});
it("merges plugin-registered media providers into the active registry", async () => {

View File

@@ -1,18 +1,9 @@
import type { OpenClawConfig } from "../config/config.js";
import {
deepgramMediaUnderstandingProvider,
groqMediaUnderstandingProvider,
} from "../plugin-sdk/media-understanding.js";
import { loadOpenClawPlugins } from "../plugins/loader.js";
import { getActivePluginRegistry } from "../plugins/runtime.js";
import { normalizeMediaProviderId } from "./provider-id.js";
import type { MediaUnderstandingProvider } from "./types.js";
const PROVIDERS: MediaUnderstandingProvider[] = [
groqMediaUnderstandingProvider,
deepgramMediaUnderstandingProvider,
];
function mergeProviderIntoRegistry(
registry: Map<string, MediaUnderstandingProvider>,
provider: MediaUnderstandingProvider,
@@ -36,12 +27,9 @@ export function buildMediaUnderstandingRegistry(
cfg?: OpenClawConfig,
): Map<string, MediaUnderstandingProvider> {
const registry = new Map<string, MediaUnderstandingProvider>();
for (const provider of PROVIDERS) {
mergeProviderIntoRegistry(registry, provider);
}
const active = getActivePluginRegistry();
const pluginRegistry =
(active?.mediaUnderstandingProviders?.length ?? 0) > 0
(active?.mediaUnderstandingProviders?.length ?? 0) > 0 || !cfg
? active
: loadOpenClawPlugins({ config: cfg });
for (const entry of pluginRegistry?.mediaUnderstandingProviders ?? []) {

View File

@@ -494,7 +494,7 @@ export async function resolveAutoImageModel(params: {
agentDir?: string;
activeModel?: ActiveMediaModel;
}): Promise<ActiveMediaModel | null> {
const providerRegistry = buildProviderRegistry();
const providerRegistry = buildProviderRegistry(undefined, params.cfg);
const toActive = (entry: MediaUnderstandingModelConfig | null): ActiveMediaModel | null => {
if (!entry || entry.type === "cli") {
return null;

View File

@@ -7,6 +7,7 @@ export { optionalStringEnum, stringEnum } from "../agents/schema/typebox.js";
import { Type } from "@sinclair/typebox";
import type { TSchema } from "@sinclair/typebox";
import { stringEnum } from "../agents/schema/typebox.js";
export { optionalStringEnum, stringEnum } from "../agents/schema/typebox.js";
/** Schema helper for channels that expose button rows on the shared `message` tool. */
export function createMessageToolButtonsSchema(): TSchema {

View File

@@ -8,7 +8,3 @@ export type {
ImageGenerationResult,
ImageGenerationSourceImage,
} from "../image-generation/types.js";
export { buildFalImageGenerationProvider } from "../../extensions/fal/image-generation-provider.js";
export { buildGoogleImageGenerationProvider } from "../../extensions/google/image-generation-provider.js";
export { buildOpenAIImageGenerationProvider } from "../../extensions/openai/image-generation-provider.js";

View File

@@ -89,9 +89,6 @@ describe("plugin-sdk exports", () => {
it("keeps the root runtime surface intentionally small", async () => {
const runtimeExports = await collectRuntimeExports(path.join(import.meta.dirname, "index.ts"));
expect([...runtimeExports].toSorted()).toEqual([
"buildFalImageGenerationProvider",
"buildGoogleImageGenerationProvider",
"buildOpenAIImageGenerationProvider",
"delegateCompactionToRuntime",
"emptyPluginConfigSchema",
"onDiagnosticEvent",

View File

@@ -18,12 +18,3 @@ export {
describeImagesWithModel,
} from "../media-understanding/image-runtime.js";
export { transcribeOpenAiCompatibleAudio } from "../media-understanding/openai-compatible-audio.js";
export {
assertOkOrThrowHttpError,
normalizeBaseUrl,
postJsonRequest,
postTranscriptionRequest,
requireTranscriptionText,
} from "../media-understanding/shared.js";
export { deepgramMediaUnderstandingProvider } from "../../extensions/deepgram/media-understanding-provider.js";
export { groqMediaUnderstandingProvider } from "../../extensions/groq/media-understanding-provider.js";

View File

@@ -0,0 +1,12 @@
// Shared provider-facing HTTP helpers. Keep generic transport utilities here so
// capability SDKs do not depend on each other.
export {
assertOkOrThrowHttpError,
fetchWithTimeout,
fetchWithTimeoutGuarded,
normalizeBaseUrl,
postJsonRequest,
postTranscriptionRequest,
requireTranscriptionText,
} from "../media-understanding/shared.js";

View File

@@ -3,15 +3,4 @@
export type { SpeechProviderPlugin } from "../plugins/types.js";
export type { SpeechVoiceOption } from "../tts/provider-types.js";
export {
edgeTTS,
elevenLabsTTS,
inferEdgeExtension,
OPENAI_TTS_MODELS,
OPENAI_TTS_VOICES,
openaiTTS,
parseTtsDirectives,
} from "../tts/tts-core.js";
export { resolvePreferredOpenClawTmpDir } from "../infra/tmp-openclaw-dir.js";
export { isVoiceCompatibleAudio } from "../media/audio.js";
export { parseTtsDirectives } from "../tts/tts-core.js";

View File

@@ -1,9 +1,4 @@
// Public speech-provider builders for bundled or third-party plugins.
// Public speech helpers for bundled or third-party plugins.
export { buildElevenLabsSpeechProvider } from "../../extensions/elevenlabs/speech-provider.js";
export { buildMicrosoftSpeechProvider } from "../../extensions/microsoft/speech-provider.js";
export { buildOpenAISpeechProvider } from "../../extensions/openai/speech-provider.js";
export { edgeTTS, elevenLabsTTS, inferEdgeExtension, openaiTTS } from "../tts/tts-core.js";
export { OPENAI_TTS_MODELS, OPENAI_TTS_VOICES } from "../tts/tts-core.js";
export { parseTtsDirectives } from "../tts/tts-core.js";
export type { SpeechVoiceOption } from "../tts/provider-types.js";

View File

@@ -544,6 +544,36 @@ describe("plugin-sdk subpath exports", () => {
"buildOptionalSecretInputSchema",
"normalizeSecretInputString",
]);
expectSourceMentions("provider-http", [
"assertOkOrThrowHttpError",
"normalizeBaseUrl",
"postJsonRequest",
"postTranscriptionRequest",
"requireTranscriptionText",
]);
expectSourceOmits("speech", [
"buildElevenLabsSpeechProvider",
"buildMicrosoftSpeechProvider",
"buildOpenAISpeechProvider",
"edgeTTS",
"elevenLabsTTS",
"inferEdgeExtension",
"openaiTTS",
"OPENAI_TTS_MODELS",
"OPENAI_TTS_VOICES",
]);
expectSourceOmits("media-understanding", [
"deepgramMediaUnderstandingProvider",
"groqMediaUnderstandingProvider",
"assertOkOrThrowHttpError",
"postJsonRequest",
"postTranscriptionRequest",
]);
expectSourceOmits("image-generation", [
"buildFalImageGenerationProvider",
"buildGoogleImageGenerationProvider",
"buildOpenAIImageGenerationProvider",
]);
expectSourceOmits("config-runtime", [
"hasConfiguredSecretInput",
"normalizeResolvedSecretInputString",

View File

@@ -169,6 +169,7 @@ export const GENERATED_BUNDLED_PLUGIN_METADATA = [
properties: {},
},
providers: ["anthropic"],
mediaUnderstandingProviders: ["anthropic"],
providerAuthEnvVars: {
anthropic: ["ANTHROPIC_OAUTH_TOKEN", "ANTHROPIC_API_KEY"],
},
@@ -488,6 +489,7 @@ export const GENERATED_BUNDLED_PLUGIN_METADATA = [
additionalProperties: false,
properties: {},
},
mediaUnderstandingProviders: ["deepgram"],
},
},
{
@@ -859,6 +861,7 @@ export const GENERATED_BUNDLED_PLUGIN_METADATA = [
additionalProperties: false,
properties: {},
},
speechProviders: ["elevenlabs"],
},
},
{
@@ -925,6 +928,7 @@ export const GENERATED_BUNDLED_PLUGIN_METADATA = [
properties: {},
},
providers: ["fal"],
imageGenerationProviders: ["fal"],
providerAuthEnvVars: {
fal: ["FAL_KEY"],
},
@@ -1114,6 +1118,8 @@ export const GENERATED_BUNDLED_PLUGIN_METADATA = [
},
},
providers: ["google", "google-gemini-cli"],
mediaUnderstandingProviders: ["google"],
imageGenerationProviders: ["google"],
providerAuthEnvVars: {
google: ["GEMINI_API_KEY", "GOOGLE_API_KEY"],
},
@@ -1221,6 +1227,7 @@ export const GENERATED_BUNDLED_PLUGIN_METADATA = [
additionalProperties: false,
properties: {},
},
mediaUnderstandingProviders: ["groq"],
},
},
{
@@ -1782,6 +1789,7 @@ export const GENERATED_BUNDLED_PLUGIN_METADATA = [
additionalProperties: false,
properties: {},
},
speechProviders: ["microsoft"],
},
},
{
@@ -1854,6 +1862,8 @@ export const GENERATED_BUNDLED_PLUGIN_METADATA = [
properties: {},
},
providers: ["minimax", "minimax-portal"],
mediaUnderstandingProviders: ["minimax", "minimax-portal"],
imageGenerationProviders: ["minimax", "minimax-portal"],
providerAuthEnvVars: {
minimax: ["MINIMAX_API_KEY"],
"minimax-portal": ["MINIMAX_OAUTH_TOKEN", "MINIMAX_API_KEY"],
@@ -1931,6 +1941,7 @@ export const GENERATED_BUNDLED_PLUGIN_METADATA = [
properties: {},
},
providers: ["mistral"],
mediaUnderstandingProviders: ["mistral"],
providerAuthEnvVars: {
mistral: ["MISTRAL_API_KEY"],
},
@@ -2072,6 +2083,7 @@ export const GENERATED_BUNDLED_PLUGIN_METADATA = [
},
},
providers: ["moonshot"],
mediaUnderstandingProviders: ["moonshot"],
providerAuthEnvVars: {
moonshot: ["MOONSHOT_API_KEY"],
},
@@ -2363,6 +2375,9 @@ export const GENERATED_BUNDLED_PLUGIN_METADATA = [
properties: {},
},
providers: ["openai", "openai-codex"],
speechProviders: ["openai"],
mediaUnderstandingProviders: ["openai", "openai-codex"],
imageGenerationProviders: ["openai"],
providerAuthEnvVars: {
openai: ["OPENAI_API_KEY"],
},
@@ -4101,6 +4116,7 @@ export const GENERATED_BUNDLED_PLUGIN_METADATA = [
properties: {},
},
providers: ["zai"],
mediaUnderstandingProviders: ["zai"],
providerAuthEnvVars: {
zai: ["ZAI_API_KEY", "Z_AI_API_KEY"],
},

View File

@@ -120,6 +120,53 @@ describe("plugin contract registry", () => {
expect(providerContractPluginIds).toEqual(bundledProviderPluginIds);
});
// Each bundled capability contract registry must stay in lockstep with the
// plugin manifests: every bundled plugin that declares the capability in its
// manifest must appear in the contract registry, and vice versa. All three
// tests compare locale-sorted, deduplicated plugin-id lists.
it("covers every bundled speech plugin discovered from manifests", () => {
// Manifest side: bundled plugins that declare at least one speech provider.
const bundledSpeechPluginIds = loadPluginManifestRegistry({})
.plugins.filter(
(plugin) => plugin.origin === "bundled" && (plugin.speechProviders?.length ?? 0) > 0,
)
.map((plugin) => plugin.id)
.toSorted((left, right) => left.localeCompare(right));
// Contract side: dedupe via Set because a plugin may register several entries.
expect(
[...new Set(speechProviderContractRegistry.map((entry) => entry.pluginId))].toSorted(
(left, right) => left.localeCompare(right),
),
).toEqual(bundledSpeechPluginIds);
});
it("covers every bundled media-understanding plugin discovered from manifests", () => {
// Manifest side: bundled plugins declaring media-understanding providers.
const bundledMediaPluginIds = loadPluginManifestRegistry({})
.plugins.filter(
(plugin) =>
plugin.origin === "bundled" && (plugin.mediaUnderstandingProviders?.length ?? 0) > 0,
)
.map((plugin) => plugin.id)
.toSorted((left, right) => left.localeCompare(right));
expect(
[
...new Set(mediaUnderstandingProviderContractRegistry.map((entry) => entry.pluginId)),
].toSorted((left, right) => left.localeCompare(right)),
).toEqual(bundledMediaPluginIds);
});
it("covers every bundled image-generation plugin discovered from manifests", () => {
// Manifest side: bundled plugins declaring image-generation providers.
const bundledImagePluginIds = loadPluginManifestRegistry({})
.plugins.filter(
(plugin) =>
plugin.origin === "bundled" && (plugin.imageGenerationProviders?.length ?? 0) > 0,
)
.map((plugin) => plugin.id)
.toSorted((left, right) => left.localeCompare(right));
expect(
[...new Set(imageGenerationProviderContractRegistry.map((entry) => entry.pluginId))].toSorted(
(left, right) => left.localeCompare(right),
),
).toEqual(bundledImagePluginIds);
});
it("covers every bundled web search plugin from the shared resolver", () => {
const bundledWebSearchPluginIds = resolveBundledWebSearchPluginIds({});

View File

@@ -39,6 +39,7 @@ import xiaomiPlugin from "../../../extensions/xiaomi/index.js";
import zaiPlugin from "../../../extensions/zai/index.js";
import { bundledWebSearchPluginRegistrations } from "../../bundled-web-search-registry.js";
import { createCapturedPluginRegistration } from "../captured-registration.js";
import { loadPluginManifestRegistry } from "../manifest-registry.js";
import { resolvePluginProviders } from "../provider-auth-choice.runtime.js";
import type {
ImageGenerationProviderPlugin,
@@ -85,21 +86,6 @@ const bundledWebSearchPlugins: Array<RegistrablePlugin & { credentialValue: unkn
...plugin,
credentialValue,
}));
const bundledSpeechPlugins: RegistrablePlugin[] = [elevenLabsPlugin, microsoftPlugin, openAIPlugin];
const bundledMediaUnderstandingPlugins: RegistrablePlugin[] = [
anthropicPlugin,
deepgramPlugin,
googlePlugin,
groqPlugin,
minimaxPlugin,
mistralPlugin,
moonshotPlugin,
openAIPlugin,
zaiPlugin,
];
const bundledImageGenerationPlugins: RegistrablePlugin[] = [falPlugin, googlePlugin, openAIPlugin];
function captureRegistrations(plugin: RegistrablePlugin) {
const captured = createCapturedPluginRegistration();
@@ -390,6 +376,43 @@ const bundledProviderPlugins = dedupePlugins([
zaiPlugin,
]);
const bundledRegistrablePluginsById = new Map(
dedupePlugins([
...bundledProviderPlugins,
elevenLabsPlugin,
microsoftPlugin,
deepgramPlugin,
groqPlugin,
...bundledWebSearchPlugins,
]).map((plugin) => [plugin.id, plugin]),
);
function resolveBundledCapabilityPluginIds(
capability: "speechProviders" | "mediaUnderstandingProviders" | "imageGenerationProviders",
): string[] {
return loadPluginManifestRegistry({})
.plugins.filter(
(plugin) => plugin.origin === "bundled" && (plugin[capability]?.length ?? 0) > 0,
)
.map((plugin) => plugin.id)
.toSorted((left, right) => left.localeCompare(right));
}
/**
 * Resolve the registrable plugin objects for a capability by looking up each
 * manifest-declared plugin id. Ids with no matching registrable plugin are
 * silently skipped (e.g. manifests without a bundled runtime entry).
 */
function resolveBundledCapabilityPlugins(
  capability: "speechProviders" | "mediaUnderstandingProviders" | "imageGenerationProviders",
): RegistrablePlugin[] {
  const resolved: RegistrablePlugin[] = [];
  for (const pluginId of resolveBundledCapabilityPluginIds(capability)) {
    const plugin = bundledRegistrablePluginsById.get(pluginId);
    if (plugin) {
      resolved.push(plugin);
    }
  }
  return resolved;
}
const bundledSpeechPlugins = resolveBundledCapabilityPlugins("speechProviders");
const bundledMediaUnderstandingPlugins = resolveBundledCapabilityPlugins(
"mediaUnderstandingProviders",
);
const bundledImageGenerationPlugins = resolveBundledCapabilityPlugins("imageGenerationProviders");
const bundledPluginRegistrationList = dedupePlugins([
...bundledSpeechPlugins,
...bundledMediaUnderstandingPlugins,

View File

@@ -45,6 +45,9 @@ export type PluginManifestRecord = {
kind?: PluginKind;
channels: string[];
providers: string[];
speechProviders?: string[];
mediaUnderstandingProviders?: string[];
imageGenerationProviders?: string[];
cliBackends: string[];
providerAuthEnvVars?: Record<string, string[]>;
providerAuthChoices?: PluginManifest["providerAuthChoices"];
@@ -171,6 +174,9 @@ function buildRecord(params: {
kind: params.manifest.kind,
channels: params.manifest.channels ?? [],
providers: params.manifest.providers ?? [],
speechProviders: params.manifest.speechProviders ?? [],
mediaUnderstandingProviders: params.manifest.mediaUnderstandingProviders ?? [],
imageGenerationProviders: params.manifest.imageGenerationProviders ?? [],
cliBackends: params.manifest.cliBackends ?? [],
providerAuthEnvVars: params.manifest.providerAuthEnvVars,
providerAuthChoices: params.manifest.providerAuthChoices,
@@ -226,6 +232,9 @@ function buildBundleRecord(params: {
bundleCapabilities: params.manifest.capabilities,
channels: [],
providers: [],
speechProviders: [],
mediaUnderstandingProviders: [],
imageGenerationProviders: [],
cliBackends: [],
skills: params.manifest.skills ?? [],
settingsFiles: params.manifest.settingsFiles ?? [],

View File

@@ -15,6 +15,9 @@ export type PluginManifest = {
kind?: PluginKind;
channels?: string[];
providers?: string[];
speechProviders?: string[];
mediaUnderstandingProviders?: string[];
imageGenerationProviders?: string[];
/** Cheap startup activation lookup for plugin-owned CLI inference backends. */
cliBackends?: string[];
/** Cheap provider-auth env lookup without booting plugin runtime. */
@@ -205,6 +208,9 @@ export function loadPluginManifest(
const version = typeof raw.version === "string" ? raw.version.trim() : undefined;
const channels = normalizeStringList(raw.channels);
const providers = normalizeStringList(raw.providers);
const speechProviders = normalizeStringList(raw.speechProviders);
const mediaUnderstandingProviders = normalizeStringList(raw.mediaUnderstandingProviders);
const imageGenerationProviders = normalizeStringList(raw.imageGenerationProviders);
const cliBackends = normalizeStringList(raw.cliBackends);
const providerAuthEnvVars = normalizeStringListRecord(raw.providerAuthEnvVars);
const providerAuthChoices = normalizeProviderAuthChoices(raw.providerAuthChoices);
@@ -224,6 +230,9 @@ export function loadPluginManifest(
kind,
channels,
providers,
speechProviders,
mediaUnderstandingProviders,
imageGenerationProviders,
cliBackends,
providerAuthEnvVars,
providerAuthChoices,

View File

@@ -58,7 +58,7 @@ describe("speech provider registry", () => {
const providers = listSpeechProviders();
expect(providers.map((provider) => provider.id)).toEqual(["openai", "elevenlabs", "microsoft"]);
expect(providers.map((provider) => provider.id)).toEqual(["openai"]);
expect(loadOpenClawPluginsMock).not.toHaveBeenCalled();
});
@@ -76,22 +76,14 @@ describe("speech provider registry", () => {
const cfg = {} as OpenClawConfig;
expect(listSpeechProviders(cfg).map((provider) => provider.id)).toEqual([
"openai",
"elevenlabs",
"microsoft",
]);
expect(listSpeechProviders(cfg).map((provider) => provider.id)).toEqual(["microsoft"]);
expect(getSpeechProvider("edge", cfg)?.id).toBe("microsoft");
expect(loadOpenClawPluginsMock).toHaveBeenCalledWith({ config: cfg });
});
it("returns builtin providers when neither plugins nor active registry provide speech support", () => {
expect(listSpeechProviders().map((provider) => provider.id)).toEqual([
"openai",
"elevenlabs",
"microsoft",
]);
expect(getSpeechProvider("openai")?.id).toBe("openai");
it("returns no providers when neither plugins nor active registry provide speech support", () => {
expect(listSpeechProviders()).toEqual([]);
expect(getSpeechProvider("openai")).toBeUndefined();
});
it("normalizes the legacy edge alias to microsoft", () => {

View File

@@ -1,18 +1,9 @@
import { buildElevenLabsSpeechProvider } from "../../extensions/elevenlabs/speech-provider.js";
import { buildMicrosoftSpeechProvider } from "../../extensions/microsoft/speech-provider.js";
import { buildOpenAISpeechProvider } from "../../extensions/openai/speech-provider.js";
import type { OpenClawConfig } from "../config/config.js";
import { loadOpenClawPlugins } from "../plugins/loader.js";
import { getActivePluginRegistry } from "../plugins/runtime.js";
import type { SpeechProviderPlugin } from "../plugins/types.js";
import type { SpeechProviderId } from "./provider-types.js";
const BUILTIN_SPEECH_PROVIDER_BUILDERS = [
buildOpenAISpeechProvider,
buildElevenLabsSpeechProvider,
buildMicrosoftSpeechProvider,
] as const satisfies readonly (() => SpeechProviderPlugin)[];
function trimToUndefined(value: string | undefined): string | undefined {
const trimmed = value?.trim().toLowerCase();
return trimmed ? trimmed : undefined;
@@ -58,9 +49,6 @@ function buildProviderMaps(cfg?: OpenClawConfig): {
}
};
for (const buildProvider of BUILTIN_SPEECH_PROVIDER_BUILDERS) {
register(buildProvider());
}
for (const provider of resolveSpeechProviderPluginEntries(cfg)) {
register(provider);
}

View File

@@ -1,6 +1,5 @@
import { rmSync, statSync } from "node:fs";
import { rmSync } from "node:fs";
import { completeSimple, type TextContent } from "@mariozechner/pi-ai";
import { EdgeTTS } from "node-edge-tts";
import { getApiKeyForModel, requireApiKey } from "../agents/model-auth.js";
import {
buildModelAliasIndex,
@@ -18,7 +17,6 @@ import type {
TtsDirectiveParseResult,
} from "./tts.js";
const DEFAULT_ELEVENLABS_BASE_URL = "https://api.elevenlabs.io";
export const DEFAULT_OPENAI_BASE_URL = "https://api.openai.com/v1";
const TEMP_FILE_CLEANUP_DELAY_MS = 5 * 60 * 1000; // 5 minutes
@@ -26,14 +24,6 @@ export function isValidVoiceId(voiceId: string): boolean {
return /^[a-zA-Z0-9]{10,40}$/.test(voiceId);
}
/**
 * Normalize a user-supplied ElevenLabs base URL: fall back to the default
 * endpoint when blank, and strip any trailing slashes so path segments can be
 * appended safely.
 */
function normalizeElevenLabsBaseUrl(baseUrl: string): string {
  const candidate = baseUrl.trim();
  return candidate ? candidate.replace(/\/+$/, "") : DEFAULT_ELEVENLABS_BASE_URL;
}
function normalizeOpenAITtsBaseUrl(baseUrl?: string): string {
const trimmed = baseUrl?.trim();
if (!trimmed) {
@@ -53,13 +43,6 @@ function requireInRange(value: number, min: number, max: number, label: string):
}
}
/**
 * Validate ElevenLabs voice settings before sending them to the API.
 * Throws (via requireInRange) when any tunable falls outside its documented
 * range: stability/similarityBoost/style in [0, 1], speed in [0.5, 2].
 */
function assertElevenLabsVoiceSettings(settings: ResolvedTtsConfig["elevenlabs"]["voiceSettings"]) {
  const checks: ReadonlyArray<readonly [number, number, number, string]> = [
    [settings.stability, 0, 1, "stability"],
    [settings.similarityBoost, 0, 1, "similarityBoost"],
    [settings.style, 0, 1, "style"],
    [settings.speed, 0.5, 2, "speed"],
  ];
  for (const [value, min, max, label] of checks) {
    requireInRange(value, min, max, label);
  }
}
function normalizeLanguageCode(code?: string): string | undefined {
const trimmed = code?.trim();
if (!trimmed) {
@@ -538,177 +521,3 @@ export function scheduleCleanup(
}, delayMs);
timer.unref();
}
export async function elevenLabsTTS(params: {
text: string;
apiKey: string;
baseUrl: string;
voiceId: string;
modelId: string;
outputFormat: string;
seed?: number;
applyTextNormalization?: "auto" | "on" | "off";
languageCode?: string;
voiceSettings: ResolvedTtsConfig["elevenlabs"]["voiceSettings"];
timeoutMs: number;
}): Promise<Buffer> {
const {
text,
apiKey,
baseUrl,
voiceId,
modelId,
outputFormat,
seed,
applyTextNormalization,
languageCode,
voiceSettings,
timeoutMs,
} = params;
if (!isValidVoiceId(voiceId)) {
throw new Error("Invalid voiceId format");
}
assertElevenLabsVoiceSettings(voiceSettings);
const normalizedLanguage = normalizeLanguageCode(languageCode);
const normalizedNormalization = normalizeApplyTextNormalization(applyTextNormalization);
const normalizedSeed = normalizeSeed(seed);
const controller = new AbortController();
const timeout = setTimeout(() => controller.abort(), timeoutMs);
try {
const url = new URL(`${normalizeElevenLabsBaseUrl(baseUrl)}/v1/text-to-speech/${voiceId}`);
if (outputFormat) {
url.searchParams.set("output_format", outputFormat);
}
const response = await fetch(url.toString(), {
method: "POST",
headers: {
"xi-api-key": apiKey,
"Content-Type": "application/json",
Accept: "audio/mpeg",
},
body: JSON.stringify({
text,
model_id: modelId,
seed: normalizedSeed,
apply_text_normalization: normalizedNormalization,
language_code: normalizedLanguage,
voice_settings: {
stability: voiceSettings.stability,
similarity_boost: voiceSettings.similarityBoost,
style: voiceSettings.style,
use_speaker_boost: voiceSettings.useSpeakerBoost,
speed: voiceSettings.speed,
},
}),
signal: controller.signal,
});
if (!response.ok) {
throw new Error(`ElevenLabs API error (${response.status})`);
}
return Buffer.from(await response.arrayBuffer());
} finally {
clearTimeout(timeout);
}
}
/**
 * Synthesize speech through the OpenAI `/audio/speech` endpoint (or an
 * OpenAI-compatible server at `baseUrl`).
 *
 * Model/voice validity is checked against the configured base URL before the
 * request is sent; optional `speed` and resolved `instructions` are only
 * included in the payload when present. The request aborts after `timeoutMs`.
 *
 * @returns Audio payload in the requested `responseFormat`.
 * @throws Error on an invalid model/voice or a non-2xx API response.
 */
export async function openaiTTS(params: {
  text: string;
  apiKey: string;
  baseUrl: string;
  model: string;
  voice: string;
  speed?: number;
  instructions?: string;
  responseFormat: "mp3" | "opus" | "pcm";
  timeoutMs: number;
}): Promise<Buffer> {
  const effectiveInstructions = resolveOpenAITtsInstructions(params.model, params.instructions);
  if (!isValidOpenAIModel(params.model, params.baseUrl)) {
    throw new Error(`Invalid model: ${params.model}`);
  }
  if (!isValidOpenAIVoice(params.voice, params.baseUrl)) {
    throw new Error(`Invalid voice: ${params.voice}`);
  }
  // Abort the fetch if the API does not respond within the allotted time.
  const abort = new AbortController();
  const abortTimer = setTimeout(() => abort.abort(), params.timeoutMs);
  try {
    // Optional fields are appended after the required ones so the serialized
    // payload matches the original field order.
    const payload: Record<string, unknown> = {
      model: params.model,
      input: params.text,
      voice: params.voice,
      response_format: params.responseFormat,
    };
    if (params.speed != null) {
      payload.speed = params.speed;
    }
    if (effectiveInstructions != null) {
      payload.instructions = effectiveInstructions;
    }
    const response = await fetch(`${params.baseUrl}/audio/speech`, {
      method: "POST",
      headers: {
        Authorization: `Bearer ${params.apiKey}`,
        "Content-Type": "application/json",
      },
      body: JSON.stringify(payload),
      signal: abort.signal,
    });
    if (!response.ok) {
      throw new Error(`OpenAI TTS API error (${response.status})`);
    }
    return Buffer.from(await response.arrayBuffer());
  } finally {
    clearTimeout(abortTimer);
  }
}
/**
 * Infer a file extension from an Edge TTS output-format string.
 * Substring checks run in priority order (webm before ogg before opus, then
 * wav-family tokens); anything unrecognized falls back to ".mp3".
 */
export function inferEdgeExtension(outputFormat: string): string {
  const format = outputFormat.toLowerCase();
  const mappings: ReadonlyArray<readonly [readonly string[], string]> = [
    [["webm"], ".webm"],
    [["ogg"], ".ogg"],
    [["opus"], ".opus"],
    [["wav", "riff", "pcm"], ".wav"],
  ];
  for (const [tokens, extension] of mappings) {
    if (tokens.some((token) => format.includes(token))) {
      return extension;
    }
  }
  return ".mp3";
}
/**
 * Synthesize speech with Microsoft Edge TTS and write the audio to
 * `outputPath`. The per-config timeout wins over the caller-supplied
 * `timeoutMs` when both are present.
 *
 * @throws Error when synthesis completes but produces an empty file.
 */
export async function edgeTTS(params: {
  text: string;
  outputPath: string;
  config: ResolvedTtsConfig["edge"];
  timeoutMs: number;
}): Promise<void> {
  const { config } = params;
  const synthesizer = new EdgeTTS({
    voice: config.voice,
    lang: config.lang,
    outputFormat: config.outputFormat,
    saveSubtitles: config.saveSubtitles,
    proxy: config.proxy,
    rate: config.rate,
    pitch: config.pitch,
    volume: config.volume,
    timeout: config.timeoutMs ?? params.timeoutMs,
  });
  await synthesizer.ttsPromise(params.text, params.outputPath);
  // Guard against silent failures: the library can succeed while writing 0 bytes.
  if (statSync(params.outputPath).size === 0) {
    throw new Error("Edge TTS produced empty audio file");
  }
}