mirror of
https://github.com/openclaw/openclaw.git
synced 2026-04-28 01:21:36 +00:00
feat(plugins): add speech provider registration
This commit is contained in:
14
extensions/elevenlabs/index.ts
Normal file
14
extensions/elevenlabs/index.ts
Normal file
@@ -0,0 +1,14 @@
|
||||
import { emptyPluginConfigSchema, type OpenClawPluginApi } from "openclaw/plugin-sdk/core";
|
||||
import { buildElevenLabsSpeechProvider } from "../../src/tts/providers/elevenlabs.js";
|
||||
|
||||
const elevenLabsPlugin = {
|
||||
id: "elevenlabs",
|
||||
name: "ElevenLabs Speech",
|
||||
description: "Bundled ElevenLabs speech provider",
|
||||
configSchema: emptyPluginConfigSchema(),
|
||||
register(api: OpenClawPluginApi) {
|
||||
api.registerSpeechProvider(buildElevenLabsSpeechProvider());
|
||||
},
|
||||
};
|
||||
|
||||
export default elevenLabsPlugin;
|
||||
8
extensions/elevenlabs/openclaw.plugin.json
Normal file
8
extensions/elevenlabs/openclaw.plugin.json
Normal file
@@ -0,0 +1,8 @@
|
||||
{
|
||||
"id": "elevenlabs",
|
||||
"configSchema": {
|
||||
"type": "object",
|
||||
"additionalProperties": false,
|
||||
"properties": {}
|
||||
}
|
||||
}
|
||||
12
extensions/elevenlabs/package.json
Normal file
12
extensions/elevenlabs/package.json
Normal file
@@ -0,0 +1,12 @@
|
||||
{
|
||||
"name": "@openclaw/elevenlabs-speech",
|
||||
"version": "2026.3.14",
|
||||
"private": true,
|
||||
"description": "OpenClaw ElevenLabs speech plugin",
|
||||
"type": "module",
|
||||
"openclaw": {
|
||||
"extensions": [
|
||||
"./index.ts"
|
||||
]
|
||||
}
|
||||
}
|
||||
@@ -44,6 +44,7 @@ function fakeApi(overrides: Partial<OpenClawPluginApi> = {}): OpenClawPluginApi
|
||||
registerCli() {},
|
||||
registerService() {},
|
||||
registerProvider() {},
|
||||
registerSpeechProvider() {},
|
||||
registerWebSearchProvider() {},
|
||||
registerInteractiveHandler() {},
|
||||
registerHook() {},
|
||||
|
||||
14
extensions/microsoft/index.ts
Normal file
14
extensions/microsoft/index.ts
Normal file
@@ -0,0 +1,14 @@
|
||||
import { emptyPluginConfigSchema, type OpenClawPluginApi } from "openclaw/plugin-sdk/core";
|
||||
import { buildMicrosoftSpeechProvider } from "../../src/tts/providers/microsoft.js";
|
||||
|
||||
const microsoftPlugin = {
|
||||
id: "microsoft",
|
||||
name: "Microsoft Speech",
|
||||
description: "Bundled Microsoft speech provider",
|
||||
configSchema: emptyPluginConfigSchema(),
|
||||
register(api: OpenClawPluginApi) {
|
||||
api.registerSpeechProvider(buildMicrosoftSpeechProvider());
|
||||
},
|
||||
};
|
||||
|
||||
export default microsoftPlugin;
|
||||
8
extensions/microsoft/openclaw.plugin.json
Normal file
8
extensions/microsoft/openclaw.plugin.json
Normal file
@@ -0,0 +1,8 @@
|
||||
{
|
||||
"id": "microsoft",
|
||||
"configSchema": {
|
||||
"type": "object",
|
||||
"additionalProperties": false,
|
||||
"properties": {}
|
||||
}
|
||||
}
|
||||
12
extensions/microsoft/package.json
Normal file
12
extensions/microsoft/package.json
Normal file
@@ -0,0 +1,12 @@
|
||||
{
|
||||
"name": "@openclaw/microsoft-speech",
|
||||
"version": "2026.3.14",
|
||||
"private": true,
|
||||
"description": "OpenClaw Microsoft speech plugin",
|
||||
"type": "module",
|
||||
"openclaw": {
|
||||
"extensions": [
|
||||
"./index.ts"
|
||||
]
|
||||
}
|
||||
}
|
||||
@@ -1,4 +1,5 @@
|
||||
import { emptyPluginConfigSchema, type OpenClawPluginApi } from "openclaw/plugin-sdk/core";
|
||||
import { buildOpenAISpeechProvider } from "../../src/tts/providers/openai.js";
|
||||
import { buildOpenAICodexProviderPlugin } from "./openai-codex-provider.js";
|
||||
import { buildOpenAIProvider } from "./openai-provider.js";
|
||||
|
||||
@@ -10,6 +11,7 @@ const openAIPlugin = {
|
||||
register(api: OpenClawPluginApi) {
|
||||
api.registerProvider(buildOpenAIProvider());
|
||||
api.registerProvider(buildOpenAICodexProviderPlugin());
|
||||
api.registerSpeechProvider(buildOpenAISpeechProvider());
|
||||
},
|
||||
};
|
||||
|
||||
|
||||
@@ -15,6 +15,7 @@ export function createTestPluginApi(api: TestPluginApiInput): OpenClawPluginApi
|
||||
registerCli() {},
|
||||
registerService() {},
|
||||
registerProvider() {},
|
||||
registerSpeechProvider() {},
|
||||
registerWebSearchProvider() {},
|
||||
registerInteractiveHandler() {},
|
||||
registerCommand() {},
|
||||
|
||||
@@ -80,7 +80,7 @@ const voiceCallConfigSchema = {
|
||||
"streaming.streamPath": { label: "Media Stream Path", advanced: true },
|
||||
"tts.provider": {
|
||||
label: "TTS Provider Override",
|
||||
help: "Deep-merges with messages.tts (Edge is ignored for calls).",
|
||||
help: "Deep-merges with messages.tts (Microsoft is ignored for calls).",
|
||||
advanced: true,
|
||||
},
|
||||
"tts.openai.model": { label: "OpenAI TTS Model", advanced: true },
|
||||
|
||||
@@ -101,7 +101,7 @@
|
||||
},
|
||||
"tts.provider": {
|
||||
"label": "TTS Provider Override",
|
||||
"help": "Deep-merges with messages.tts (Edge is ignored for calls).",
|
||||
"help": "Deep-merges with messages.tts (Microsoft is ignored for calls).",
|
||||
"advanced": true
|
||||
},
|
||||
"tts.openai.model": {
|
||||
@@ -420,8 +420,7 @@
|
||||
"enum": ["final", "all"]
|
||||
},
|
||||
"provider": {
|
||||
"type": "string",
|
||||
"enum": ["openai", "elevenlabs", "edge"]
|
||||
"type": "string"
|
||||
},
|
||||
"summaryModel": {
|
||||
"type": "string"
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
import { logVerbose } from "../../globals.js";
|
||||
import { listSpeechProviders, normalizeSpeechProviderId } from "../../tts/provider-registry.js";
|
||||
import {
|
||||
getLastTtsAttempt,
|
||||
getTtsMaxLength,
|
||||
@@ -54,7 +55,7 @@ function ttsUsage(): ReplyPayload {
|
||||
`• /tts summary [on|off] — View/change auto-summary\n` +
|
||||
`• /tts audio <text> — Generate audio from text\n\n` +
|
||||
`**Providers:**\n` +
|
||||
`• edge — Free, fast (default)\n` +
|
||||
`• microsoft — Microsoft Edge-backed speech (default fallback)\n` +
|
||||
`• openai — High quality (requires API key)\n` +
|
||||
`• elevenlabs — Premium voices (requires API key)\n\n` +
|
||||
`**Text Limit (default: 1500, max: 4096):**\n` +
|
||||
@@ -62,7 +63,7 @@ function ttsUsage(): ReplyPayload {
|
||||
`• Summary ON: AI summarizes, then generates audio\n` +
|
||||
`• Summary OFF: Truncates text, then generates audio\n\n` +
|
||||
`**Examples:**\n` +
|
||||
`/tts provider edge\n` +
|
||||
`/tts provider microsoft\n` +
|
||||
`/tts limit 2000\n` +
|
||||
`/tts audio Hello, this is a test!`,
|
||||
};
|
||||
@@ -161,7 +162,7 @@ export const handleTtsCommands: CommandHandler = async (params, allowTextCommand
|
||||
if (!args.trim()) {
|
||||
const hasOpenAI = Boolean(resolveTtsApiKey(config, "openai"));
|
||||
const hasElevenLabs = Boolean(resolveTtsApiKey(config, "elevenlabs"));
|
||||
const hasEdge = isTtsProviderConfigured(config, "edge");
|
||||
const hasMicrosoft = isTtsProviderConfigured(config, "microsoft", params.cfg);
|
||||
return {
|
||||
shouldContinue: false,
|
||||
reply: {
|
||||
@@ -170,21 +171,23 @@ export const handleTtsCommands: CommandHandler = async (params, allowTextCommand
|
||||
`Primary: ${currentProvider}\n` +
|
||||
`OpenAI key: ${hasOpenAI ? "✅" : "❌"}\n` +
|
||||
`ElevenLabs key: ${hasElevenLabs ? "✅" : "❌"}\n` +
|
||||
`Edge enabled: ${hasEdge ? "✅" : "❌"}\n` +
|
||||
`Usage: /tts provider openai | elevenlabs | edge`,
|
||||
`Microsoft enabled: ${hasMicrosoft ? "✅" : "❌"}\n` +
|
||||
`Usage: /tts provider openai | elevenlabs | microsoft`,
|
||||
},
|
||||
};
|
||||
}
|
||||
|
||||
const requested = args.trim().toLowerCase();
|
||||
if (requested !== "openai" && requested !== "elevenlabs" && requested !== "edge") {
|
||||
const knownProviders = new Set(listSpeechProviders(params.cfg).map((provider) => provider.id));
|
||||
if (requested !== "edge" && !knownProviders.has(requested)) {
|
||||
return { shouldContinue: false, reply: ttsUsage() };
|
||||
}
|
||||
|
||||
const nextProvider = normalizeSpeechProviderId(requested) ?? requested;
|
||||
setTtsProvider(prefsPath, requested);
|
||||
return {
|
||||
shouldContinue: false,
|
||||
reply: { text: `✅ TTS provider set to ${requested}.` },
|
||||
reply: { text: `✅ TTS provider set to ${nextProvider}.` },
|
||||
};
|
||||
}
|
||||
|
||||
@@ -249,7 +252,7 @@ export const handleTtsCommands: CommandHandler = async (params, allowTextCommand
|
||||
if (action === "status") {
|
||||
const enabled = isTtsEnabled(config, prefsPath);
|
||||
const provider = getTtsProvider(config, prefsPath);
|
||||
const hasKey = isTtsProviderConfigured(config, provider);
|
||||
const hasKey = isTtsProviderConfigured(config, provider, params.cfg);
|
||||
const maxLength = getTtsMaxLength(prefsPath);
|
||||
const summarize = isSummarizationEnabled(prefsPath);
|
||||
const last = getLastTtsAttempt();
|
||||
|
||||
@@ -91,6 +91,7 @@ const createRegistry = (channels: PluginRegistry["channels"]): PluginRegistry =>
|
||||
enabled: true,
|
||||
})),
|
||||
providers: [],
|
||||
speechProviders: [],
|
||||
webSearchProviders: [],
|
||||
gatewayHandlers: {},
|
||||
httpRoutes: [],
|
||||
|
||||
@@ -337,6 +337,7 @@ describe("ensureChannelSetupPluginInstalled", () => {
|
||||
hookNames: [],
|
||||
channelIds: [],
|
||||
providerIds: [],
|
||||
speechProviderIds: [],
|
||||
webSearchProviderIds: [],
|
||||
gatewayMethods: [],
|
||||
cliCommands: [],
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
import type { SecretInput } from "./types.secrets.js";
|
||||
|
||||
export type TtsProvider = "elevenlabs" | "openai" | "edge";
|
||||
export type TtsProvider = string;
|
||||
|
||||
export type TtsMode = "final" | "all";
|
||||
|
||||
@@ -66,9 +66,22 @@ export type TtsConfig = {
|
||||
/** System-level instructions for the TTS model (gpt-4o-mini-tts only). */
|
||||
instructions?: string;
|
||||
};
|
||||
/** Microsoft Edge (node-edge-tts) configuration. */
|
||||
/** Legacy alias for Microsoft speech configuration. */
|
||||
edge?: {
|
||||
/** Explicitly allow Edge TTS usage (no API key required). */
|
||||
/** Explicitly allow Microsoft speech usage (no API key required). */
|
||||
enabled?: boolean;
|
||||
voice?: string;
|
||||
lang?: string;
|
||||
outputFormat?: string;
|
||||
pitch?: string;
|
||||
rate?: string;
|
||||
volume?: string;
|
||||
saveSubtitles?: boolean;
|
||||
proxy?: string;
|
||||
timeoutMs?: number;
|
||||
};
|
||||
/** Preferred alias for Microsoft speech configuration. */
|
||||
microsoft?: {
|
||||
enabled?: boolean;
|
||||
voice?: string;
|
||||
lang?: string;
|
||||
|
||||
@@ -353,9 +353,24 @@ export const MarkdownConfigSchema = z
|
||||
.strict()
|
||||
.optional();
|
||||
|
||||
export const TtsProviderSchema = z.enum(["elevenlabs", "openai", "edge"]);
|
||||
export const TtsProviderSchema = z.string().min(1);
|
||||
export const TtsModeSchema = z.enum(["final", "all"]);
|
||||
export const TtsAutoSchema = z.enum(["off", "always", "inbound", "tagged"]);
|
||||
const TtsMicrosoftConfigSchema = z
|
||||
.object({
|
||||
enabled: z.boolean().optional(),
|
||||
voice: z.string().optional(),
|
||||
lang: z.string().optional(),
|
||||
outputFormat: z.string().optional(),
|
||||
pitch: z.string().optional(),
|
||||
rate: z.string().optional(),
|
||||
volume: z.string().optional(),
|
||||
saveSubtitles: z.boolean().optional(),
|
||||
proxy: z.string().optional(),
|
||||
timeoutMs: z.number().int().min(1000).max(120000).optional(),
|
||||
})
|
||||
.strict()
|
||||
.optional();
|
||||
export const TtsConfigSchema = z
|
||||
.object({
|
||||
auto: TtsAutoSchema.optional(),
|
||||
@@ -409,21 +424,8 @@ export const TtsConfigSchema = z
|
||||
})
|
||||
.strict()
|
||||
.optional(),
|
||||
edge: z
|
||||
.object({
|
||||
enabled: z.boolean().optional(),
|
||||
voice: z.string().optional(),
|
||||
lang: z.string().optional(),
|
||||
outputFormat: z.string().optional(),
|
||||
pitch: z.string().optional(),
|
||||
rate: z.string().optional(),
|
||||
volume: z.string().optional(),
|
||||
saveSubtitles: z.boolean().optional(),
|
||||
proxy: z.string().optional(),
|
||||
timeoutMs: z.number().int().min(1000).max(120000).optional(),
|
||||
})
|
||||
.strict()
|
||||
.optional(),
|
||||
edge: TtsMicrosoftConfigSchema,
|
||||
microsoft: TtsMicrosoftConfigSchema,
|
||||
prefsPath: z.string().optional(),
|
||||
maxTextLength: z.number().int().min(1).optional(),
|
||||
timeoutMs: z.number().int().min(1000).max(120000).optional(),
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
import { loadConfig } from "../../config/config.js";
|
||||
import { listSpeechProviders, normalizeSpeechProviderId } from "../../tts/provider-registry.js";
|
||||
import {
|
||||
OPENAI_TTS_MODELS,
|
||||
OPENAI_TTS_VOICES,
|
||||
@@ -26,9 +27,9 @@ export const ttsHandlers: GatewayRequestHandlers = {
|
||||
const prefsPath = resolveTtsPrefsPath(config);
|
||||
const provider = getTtsProvider(config, prefsPath);
|
||||
const autoMode = resolveTtsAutoMode({ config, prefsPath });
|
||||
const fallbackProviders = resolveTtsProviderOrder(provider)
|
||||
const fallbackProviders = resolveTtsProviderOrder(provider, cfg)
|
||||
.slice(1)
|
||||
.filter((candidate) => isTtsProviderConfigured(config, candidate));
|
||||
.filter((candidate) => isTtsProviderConfigured(config, candidate, cfg));
|
||||
respond(true, {
|
||||
enabled: isTtsEnabled(config, prefsPath),
|
||||
auto: autoMode,
|
||||
@@ -38,7 +39,7 @@ export const ttsHandlers: GatewayRequestHandlers = {
|
||||
prefsPath,
|
||||
hasOpenAIKey: Boolean(resolveTtsApiKey(config, "openai")),
|
||||
hasElevenLabsKey: Boolean(resolveTtsApiKey(config, "elevenlabs")),
|
||||
edgeEnabled: isTtsProviderConfigured(config, "edge"),
|
||||
microsoftEnabled: isTtsProviderConfigured(config, "microsoft", cfg),
|
||||
});
|
||||
} catch (err) {
|
||||
respond(false, undefined, errorShape(ErrorCodes.UNAVAILABLE, formatForLog(err)));
|
||||
@@ -99,20 +100,23 @@ export const ttsHandlers: GatewayRequestHandlers = {
|
||||
}
|
||||
},
|
||||
"tts.setProvider": async ({ params, respond }) => {
|
||||
const provider = typeof params.provider === "string" ? params.provider.trim() : "";
|
||||
if (provider !== "openai" && provider !== "elevenlabs" && provider !== "edge") {
|
||||
const provider = normalizeSpeechProviderId(
|
||||
typeof params.provider === "string" ? params.provider.trim() : "",
|
||||
);
|
||||
const cfg = loadConfig();
|
||||
const knownProviders = new Set(listSpeechProviders(cfg).map((entry) => entry.id));
|
||||
if (!provider || !knownProviders.has(provider)) {
|
||||
respond(
|
||||
false,
|
||||
undefined,
|
||||
errorShape(
|
||||
ErrorCodes.INVALID_REQUEST,
|
||||
"Invalid provider. Use openai, elevenlabs, or edge.",
|
||||
"Invalid provider. Use a registered TTS provider id such as openai, elevenlabs, or microsoft.",
|
||||
),
|
||||
);
|
||||
return;
|
||||
}
|
||||
try {
|
||||
const cfg = loadConfig();
|
||||
const config = resolveTtsConfig(cfg);
|
||||
const prefsPath = resolveTtsPrefsPath(config);
|
||||
setTtsProvider(prefsPath, provider);
|
||||
@@ -127,27 +131,19 @@ export const ttsHandlers: GatewayRequestHandlers = {
|
||||
const config = resolveTtsConfig(cfg);
|
||||
const prefsPath = resolveTtsPrefsPath(config);
|
||||
respond(true, {
|
||||
providers: [
|
||||
{
|
||||
id: "openai",
|
||||
name: "OpenAI",
|
||||
configured: Boolean(resolveTtsApiKey(config, "openai")),
|
||||
models: [...OPENAI_TTS_MODELS],
|
||||
voices: [...OPENAI_TTS_VOICES],
|
||||
},
|
||||
{
|
||||
id: "elevenlabs",
|
||||
name: "ElevenLabs",
|
||||
configured: Boolean(resolveTtsApiKey(config, "elevenlabs")),
|
||||
models: ["eleven_multilingual_v2", "eleven_turbo_v2_5", "eleven_monolingual_v1"],
|
||||
},
|
||||
{
|
||||
id: "edge",
|
||||
name: "Edge TTS",
|
||||
configured: isTtsProviderConfigured(config, "edge"),
|
||||
models: [],
|
||||
},
|
||||
],
|
||||
providers: listSpeechProviders(cfg).map((provider) => ({
|
||||
id: provider.id,
|
||||
name: provider.label,
|
||||
configured: provider.isConfigured({ cfg, config }),
|
||||
models:
|
||||
provider.id === "openai" && provider.models == null
|
||||
? [...OPENAI_TTS_MODELS]
|
||||
: [...(provider.models ?? [])],
|
||||
voices:
|
||||
provider.id === "openai" && provider.voices == null
|
||||
? [...OPENAI_TTS_VOICES]
|
||||
: [...(provider.voices ?? [])],
|
||||
})),
|
||||
active: getTtsProvider(config, prefsPath),
|
||||
});
|
||||
} catch (err) {
|
||||
|
||||
@@ -29,6 +29,7 @@ const createRegistry = (diagnostics: PluginDiagnostic[]): PluginRegistry => ({
|
||||
channelSetups: [],
|
||||
commands: [],
|
||||
providers: [],
|
||||
speechProviders: [],
|
||||
webSearchProviders: [],
|
||||
gatewayHandlers: {},
|
||||
httpRoutes: [],
|
||||
|
||||
@@ -1,25 +1,9 @@
|
||||
import { vi } from "vitest";
|
||||
import type { PluginRegistry } from "../plugins/registry.js";
|
||||
import { createEmptyPluginRegistry, type PluginRegistry } from "../plugins/registry.js";
|
||||
import { setActivePluginRegistry } from "../plugins/runtime.js";
|
||||
|
||||
export const registryState: { registry: PluginRegistry } = {
|
||||
registry: {
|
||||
plugins: [],
|
||||
tools: [],
|
||||
hooks: [],
|
||||
typedHooks: [],
|
||||
channels: [],
|
||||
channelSetups: [],
|
||||
providers: [],
|
||||
webSearchProviders: [],
|
||||
gatewayHandlers: {},
|
||||
httpHandlers: [],
|
||||
httpRoutes: [],
|
||||
cliRegistrars: [],
|
||||
services: [],
|
||||
commands: [],
|
||||
diagnostics: [],
|
||||
} as PluginRegistry,
|
||||
registry: createEmptyPluginRegistry(),
|
||||
};
|
||||
|
||||
export function setRegistry(registry: PluginRegistry) {
|
||||
|
||||
@@ -146,6 +146,7 @@ const createStubPluginRegistry = (): PluginRegistry => ({
|
||||
],
|
||||
channelSetups: [],
|
||||
providers: [],
|
||||
speechProviders: [],
|
||||
webSearchProviders: [],
|
||||
gatewayHandlers: {},
|
||||
httpRoutes: [],
|
||||
|
||||
@@ -21,6 +21,7 @@ export type {
|
||||
ProviderResolveDynamicModelContext,
|
||||
ProviderNormalizeResolvedModelContext,
|
||||
ProviderRuntimeModel,
|
||||
SpeechProviderPlugin,
|
||||
ProviderThinkingPolicyContext,
|
||||
ProviderWrapStreamFnContext,
|
||||
OpenClawPluginService,
|
||||
|
||||
@@ -140,6 +140,7 @@ export type {
|
||||
ProviderResolveDynamicModelContext,
|
||||
ProviderNormalizeResolvedModelContext,
|
||||
ProviderRuntimeModel,
|
||||
SpeechProviderPlugin,
|
||||
ProviderThinkingPolicyContext,
|
||||
ProviderWrapStreamFnContext,
|
||||
} from "../plugins/types.js";
|
||||
|
||||
@@ -494,6 +494,7 @@ function createPluginRecord(params: {
|
||||
hookNames: [],
|
||||
channelIds: [],
|
||||
providerIds: [],
|
||||
speechProviderIds: [],
|
||||
webSearchProviderIds: [],
|
||||
gatewayMethods: [],
|
||||
cliCommands: [],
|
||||
|
||||
@@ -46,6 +46,7 @@ import type {
|
||||
PluginHookName,
|
||||
PluginHookHandlerMap,
|
||||
PluginHookRegistration as TypedPluginHookRegistration,
|
||||
SpeechProviderPlugin,
|
||||
WebSearchProviderPlugin,
|
||||
} from "./types.js";
|
||||
|
||||
@@ -110,6 +111,14 @@ export type PluginWebSearchProviderRegistration = {
|
||||
rootDir?: string;
|
||||
};
|
||||
|
||||
export type PluginSpeechProviderRegistration = {
|
||||
pluginId: string;
|
||||
pluginName?: string;
|
||||
provider: SpeechProviderPlugin;
|
||||
source: string;
|
||||
rootDir?: string;
|
||||
};
|
||||
|
||||
export type PluginHookRegistration = {
|
||||
pluginId: string;
|
||||
entry: HookEntry;
|
||||
@@ -154,6 +163,7 @@ export type PluginRecord = {
|
||||
hookNames: string[];
|
||||
channelIds: string[];
|
||||
providerIds: string[];
|
||||
speechProviderIds: string[];
|
||||
webSearchProviderIds: string[];
|
||||
gatewayMethods: string[];
|
||||
cliCommands: string[];
|
||||
@@ -174,6 +184,7 @@ export type PluginRegistry = {
|
||||
channels: PluginChannelRegistration[];
|
||||
channelSetups: PluginChannelSetupRegistration[];
|
||||
providers: PluginProviderRegistration[];
|
||||
speechProviders: PluginSpeechProviderRegistration[];
|
||||
webSearchProviders: PluginWebSearchProviderRegistration[];
|
||||
gatewayHandlers: GatewayRequestHandlers;
|
||||
httpRoutes: PluginHttpRouteRegistration[];
|
||||
@@ -219,6 +230,7 @@ export function createEmptyPluginRegistry(): PluginRegistry {
|
||||
channels: [],
|
||||
channelSetups: [],
|
||||
providers: [],
|
||||
speechProviders: [],
|
||||
webSearchProviders: [],
|
||||
gatewayHandlers: {},
|
||||
httpRoutes: [],
|
||||
@@ -550,6 +562,37 @@ export function createPluginRegistry(registryParams: PluginRegistryParams) {
|
||||
});
|
||||
};
|
||||
|
||||
// Validates and records a plugin-contributed speech provider.
// Rejects (with an error diagnostic) providers whose id is blank or whose id
// collides with an already-registered speech provider; otherwise appends the
// provider to the shared registry and tracks its id on the plugin record.
const registerSpeechProvider = (record: PluginRecord, provider: SpeechProviderPlugin) => {
  const id = provider.id.trim();
  if (!id) {
    // A provider without an id cannot be looked up later — refuse it.
    pushDiagnostic({
      level: "error",
      pluginId: record.id,
      source: record.source,
      message: "speech provider registration missing id",
    });
    return;
  }
  // First registration wins; the duplicate is reported against the plugin
  // attempting the second registration.
  // NOTE(review): the comparison uses the trimmed id against the stored raw
  // provider.id — assumes registered ids are already trimmed; confirm.
  const existing = registry.speechProviders.find((entry) => entry.provider.id === id);
  if (existing) {
    pushDiagnostic({
      level: "error",
      pluginId: record.id,
      source: record.source,
      message: `speech provider already registered: ${id} (${existing.pluginId})`,
    });
    return;
  }
  record.speechProviderIds.push(id);
  registry.speechProviders.push({
    pluginId: record.id,
    pluginName: record.name,
    provider,
    source: record.source,
    rootDir: record.rootDir,
  });
};
|
||||
|
||||
const registerWebSearchProvider = (record: PluginRecord, provider: WebSearchProviderPlugin) => {
|
||||
const id = provider.id.trim();
|
||||
if (!id) {
|
||||
@@ -789,6 +832,10 @@ export function createPluginRegistry(registryParams: PluginRegistryParams) {
|
||||
registerChannel: (registration) => registerChannel(record, registration, registrationMode),
|
||||
registerProvider:
|
||||
registrationMode === "full" ? (provider) => registerProvider(record, provider) : () => {},
|
||||
registerSpeechProvider:
|
||||
registrationMode === "full"
|
||||
? (provider) => registerSpeechProvider(record, provider)
|
||||
: () => {},
|
||||
registerWebSearchProvider:
|
||||
registrationMode === "full"
|
||||
? (provider) => registerWebSearchProvider(record, provider)
|
||||
@@ -862,6 +909,7 @@ export function createPluginRegistry(registryParams: PluginRegistryParams) {
|
||||
registerTool,
|
||||
registerChannel,
|
||||
registerProvider,
|
||||
registerSpeechProvider,
|
||||
registerWebSearchProvider,
|
||||
registerGatewayMethod,
|
||||
registerCli,
|
||||
|
||||
@@ -27,6 +27,14 @@ import type { HookEntry } from "../hooks/types.js";
|
||||
import type { ProviderUsageSnapshot } from "../infra/provider-usage.types.js";
|
||||
import type { RuntimeEnv } from "../runtime.js";
|
||||
import type { RuntimeWebSearchMetadata } from "../secrets/runtime-web-tools.types.js";
|
||||
import type {
|
||||
SpeechProviderConfiguredContext,
|
||||
SpeechProviderId,
|
||||
SpeechSynthesisRequest,
|
||||
SpeechSynthesisResult,
|
||||
SpeechTelephonySynthesisRequest,
|
||||
SpeechTelephonySynthesisResult,
|
||||
} from "../tts/provider-types.js";
|
||||
import type { WizardPrompter } from "../wizard/prompts.js";
|
||||
import type { PluginRuntime } from "./runtime/types.js";
|
||||
|
||||
@@ -853,6 +861,23 @@ export type PluginWebSearchProviderEntry = WebSearchProviderPlugin & {
|
||||
pluginId: string;
|
||||
};
|
||||
|
||||
/**
 * Contract implemented by every speech (TTS) provider, whether built in or
 * contributed by a plugin.
 */
export type SpeechProviderPlugin = {
  /** Canonical provider id (e.g. "openai", "elevenlabs"). */
  id: SpeechProviderId;
  /** Human-readable display name. */
  label: string;
  /** Alternate ids that should resolve to this provider. */
  aliases?: string[];
  /** Model ids offered by the provider, when known. */
  models?: readonly string[];
  /** Voice ids offered by the provider, when known. */
  voices?: readonly string[];
  /** Reports whether the provider has the configuration/credentials it needs. */
  isConfigured: (ctx: SpeechProviderConfiguredContext) => boolean;
  /** Synthesizes speech for a chat-oriented target (audio file or voice note). */
  synthesize: (req: SpeechSynthesisRequest) => Promise<SpeechSynthesisResult>;
  /** Optional synthesis path for telephony; the result carries its sample rate. */
  synthesizeTelephony?: (
    req: SpeechTelephonySynthesisRequest,
  ) => Promise<SpeechTelephonySynthesisResult>;
};
|
||||
|
||||
export type PluginSpeechProviderEntry = SpeechProviderPlugin & {
|
||||
pluginId: string;
|
||||
};
|
||||
|
||||
export type OpenClawPluginGatewayMethod = {
|
||||
method: string;
|
||||
handler: GatewayRequestHandler;
|
||||
@@ -1211,6 +1236,7 @@ export type OpenClawPluginApi = {
|
||||
registerCli: (registrar: OpenClawPluginCliRegistrar, opts?: { commands?: string[] }) => void;
|
||||
registerService: (service: OpenClawPluginService) => void;
|
||||
registerProvider: (provider: ProviderPlugin) => void;
|
||||
registerSpeechProvider: (provider: SpeechProviderPlugin) => void;
|
||||
registerWebSearchProvider: (provider: WebSearchProviderPlugin) => void;
|
||||
registerInteractiveHandler: (registration: PluginInteractiveHandlerRegistration) => void;
|
||||
/**
|
||||
|
||||
@@ -26,6 +26,7 @@ export const createTestRegistry = (channels: TestChannelRegistration[] = []): Pl
|
||||
enabled: true,
|
||||
})),
|
||||
providers: [],
|
||||
speechProviders: [],
|
||||
webSearchProviders: [],
|
||||
gatewayHandlers: {},
|
||||
httpRoutes: [],
|
||||
|
||||
@@ -2,29 +2,36 @@ import type {
|
||||
AnyAgentTool,
|
||||
OpenClawPluginApi,
|
||||
ProviderPlugin,
|
||||
SpeechProviderPlugin,
|
||||
WebSearchProviderPlugin,
|
||||
} from "../plugins/types.js";
|
||||
|
||||
export type CapturedPluginRegistration = {
|
||||
api: OpenClawPluginApi;
|
||||
providers: ProviderPlugin[];
|
||||
speechProviders: SpeechProviderPlugin[];
|
||||
webSearchProviders: WebSearchProviderPlugin[];
|
||||
tools: AnyAgentTool[];
|
||||
};
|
||||
|
||||
export function createCapturedPluginRegistration(): CapturedPluginRegistration {
|
||||
const providers: ProviderPlugin[] = [];
|
||||
const speechProviders: SpeechProviderPlugin[] = [];
|
||||
const webSearchProviders: WebSearchProviderPlugin[] = [];
|
||||
const tools: AnyAgentTool[] = [];
|
||||
|
||||
return {
|
||||
providers,
|
||||
speechProviders,
|
||||
webSearchProviders,
|
||||
tools,
|
||||
api: {
|
||||
registerProvider(provider: ProviderPlugin) {
|
||||
providers.push(provider);
|
||||
},
|
||||
registerSpeechProvider(provider: SpeechProviderPlugin) {
|
||||
speechProviders.push(provider);
|
||||
},
|
||||
registerWebSearchProvider(provider: WebSearchProviderPlugin) {
|
||||
webSearchProviders.push(provider);
|
||||
},
|
||||
|
||||
84
src/tts/provider-registry.ts
Normal file
84
src/tts/provider-registry.ts
Normal file
@@ -0,0 +1,84 @@
|
||||
import type { OpenClawConfig } from "../config/config.js";
|
||||
import { loadOpenClawPlugins } from "../plugins/loader.js";
|
||||
import { getActivePluginRegistry } from "../plugins/runtime.js";
|
||||
import type { SpeechProviderPlugin } from "../plugins/types.js";
|
||||
import type { SpeechProviderId } from "./provider-types.js";
|
||||
import { buildElevenLabsSpeechProvider } from "./providers/elevenlabs.js";
|
||||
import { buildMicrosoftSpeechProvider } from "./providers/microsoft.js";
|
||||
import { buildOpenAISpeechProvider } from "./providers/openai.js";
|
||||
|
||||
// Speech providers bundled with the core build; these are registered before
// any plugin-contributed providers.
const BUILTIN_SPEECH_PROVIDERS: readonly SpeechProviderPlugin[] = [
  buildOpenAISpeechProvider(),
  buildElevenLabsSpeechProvider(),
  buildMicrosoftSpeechProvider(),
];
|
||||
|
||||
function trimToUndefined(value: string | undefined): string | undefined {
|
||||
const trimmed = value?.trim().toLowerCase();
|
||||
return trimmed ? trimmed : undefined;
|
||||
}
|
||||
|
||||
export function normalizeSpeechProviderId(
|
||||
providerId: string | undefined,
|
||||
): SpeechProviderId | undefined {
|
||||
const normalized = trimToUndefined(providerId);
|
||||
if (!normalized) {
|
||||
return undefined;
|
||||
}
|
||||
return normalized === "edge" ? "microsoft" : normalized;
|
||||
}
|
||||
|
||||
function resolveSpeechProviderPluginEntries(cfg?: OpenClawConfig): SpeechProviderPlugin[] {
|
||||
const active = getActivePluginRegistry();
|
||||
const registry =
|
||||
(active?.speechProviders?.length ?? 0) > 0 || !cfg
|
||||
? active
|
||||
: loadOpenClawPlugins({ config: cfg });
|
||||
return registry?.speechProviders?.map((entry) => entry.provider) ?? [];
|
||||
}
|
||||
|
||||
function buildProviderMaps(cfg?: OpenClawConfig): {
|
||||
canonical: Map<string, SpeechProviderPlugin>;
|
||||
aliases: Map<string, SpeechProviderPlugin>;
|
||||
} {
|
||||
const canonical = new Map<string, SpeechProviderPlugin>();
|
||||
const aliases = new Map<string, SpeechProviderPlugin>();
|
||||
const register = (provider: SpeechProviderPlugin) => {
|
||||
const id = normalizeSpeechProviderId(provider.id);
|
||||
if (!id) {
|
||||
return;
|
||||
}
|
||||
canonical.set(id, provider);
|
||||
aliases.set(id, provider);
|
||||
for (const alias of provider.aliases ?? []) {
|
||||
const normalizedAlias = normalizeSpeechProviderId(alias);
|
||||
if (normalizedAlias) {
|
||||
aliases.set(normalizedAlias, provider);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
for (const provider of BUILTIN_SPEECH_PROVIDERS) {
|
||||
register(provider);
|
||||
}
|
||||
for (const provider of resolveSpeechProviderPluginEntries(cfg)) {
|
||||
register(provider);
|
||||
}
|
||||
|
||||
return { canonical, aliases };
|
||||
}
|
||||
|
||||
export function listSpeechProviders(cfg?: OpenClawConfig): SpeechProviderPlugin[] {
|
||||
return [...buildProviderMaps(cfg).canonical.values()];
|
||||
}
|
||||
|
||||
export function getSpeechProvider(
|
||||
providerId: string | undefined,
|
||||
cfg?: OpenClawConfig,
|
||||
): SpeechProviderPlugin | undefined {
|
||||
const normalized = normalizeSpeechProviderId(providerId);
|
||||
if (!normalized) {
|
||||
return undefined;
|
||||
}
|
||||
return buildProviderMaps(cfg).aliases.get(normalized);
|
||||
}
|
||||
38
src/tts/provider-types.ts
Normal file
38
src/tts/provider-types.ts
Normal file
@@ -0,0 +1,38 @@
|
||||
import type { OpenClawConfig } from "../config/config.js";
|
||||
import type { ResolvedTtsConfig, TtsDirectiveOverrides } from "./tts.js";
|
||||
|
||||
/** Canonical speech provider identifier; an open string so plugins can add ids. */
export type SpeechProviderId = string;

/** What the synthesized audio is destined for: a plain file or a chat voice note. */
export type SpeechSynthesisTarget = "audio-file" | "voice-note";

/** Inputs available to a provider's `isConfigured` check. */
export type SpeechProviderConfiguredContext = {
  /** Full app config, when available. */
  cfg?: OpenClawConfig;
  /** Resolved TTS configuration. */
  config: ResolvedTtsConfig;
};

/** Request for chat-oriented speech synthesis. */
export type SpeechSynthesisRequest = {
  /** Text to synthesize. */
  text: string;
  cfg: OpenClawConfig;
  config: ResolvedTtsConfig;
  /** Desired output kind (drives format/extension choices). */
  target: SpeechSynthesisTarget;
  /** Per-request directive overrides layered over the static config. */
  overrides?: TtsDirectiveOverrides;
};

/** Result of chat-oriented speech synthesis. */
export type SpeechSynthesisResult = {
  /** Encoded audio bytes. */
  audioBuffer: Buffer;
  /** Provider-specific output format identifier. */
  outputFormat: string;
  /** File extension (including the dot) matching the output format. */
  fileExtension: string;
  /** Whether the audio can be sent as a voice note. */
  voiceCompatible: boolean;
};

/** Request for telephony speech synthesis (no per-request overrides). */
export type SpeechTelephonySynthesisRequest = {
  text: string;
  cfg: OpenClawConfig;
  config: ResolvedTtsConfig;
};

/** Result of telephony speech synthesis. */
export type SpeechTelephonySynthesisResult = {
  /** Encoded audio bytes. */
  audioBuffer: Buffer;
  /** Provider-specific output format identifier. */
  outputFormat: string;
  /** Sample rate of the returned audio, in Hz. */
  sampleRate: number;
};
|
||||
73
src/tts/providers/elevenlabs.ts
Normal file
73
src/tts/providers/elevenlabs.ts
Normal file
@@ -0,0 +1,73 @@
|
||||
import type { SpeechProviderPlugin } from "../../plugins/types.js";
|
||||
import { elevenLabsTTS } from "../tts-core.js";
|
||||
|
||||
// Model ids surfaced when listing this provider.
const ELEVENLABS_TTS_MODELS = [
  "eleven_multilingual_v2",
  "eleven_turbo_v2_5",
  "eleven_monolingual_v1",
] as const;

/**
 * Builds the ElevenLabs speech provider.
 *
 * The provider counts as configured when an API key is present either in
 * `config.elevenlabs.apiKey` or in the ELEVENLABS_API_KEY / XI_API_KEY
 * environment variables.
 */
export function buildElevenLabsSpeechProvider(): SpeechProviderPlugin {
  return {
    id: "elevenlabs",
    label: "ElevenLabs",
    models: ELEVENLABS_TTS_MODELS,
    isConfigured: ({ config }) =>
      Boolean(config.elevenlabs.apiKey || process.env.ELEVENLABS_API_KEY || process.env.XI_API_KEY),
    // Chat synthesis: opus for voice notes, mp3 for plain audio files.
    synthesize: async (req) => {
      const apiKey =
        req.config.elevenlabs.apiKey || process.env.ELEVENLABS_API_KEY || process.env.XI_API_KEY;
      if (!apiKey) {
        throw new Error("ElevenLabs API key missing");
      }
      const outputFormat = req.target === "voice-note" ? "opus_48000_64" : "mp3_44100_128";
      // Per-request directive overrides take precedence over static config.
      const audioBuffer = await elevenLabsTTS({
        text: req.text,
        apiKey,
        baseUrl: req.config.elevenlabs.baseUrl,
        voiceId: req.overrides?.elevenlabs?.voiceId ?? req.config.elevenlabs.voiceId,
        modelId: req.overrides?.elevenlabs?.modelId ?? req.config.elevenlabs.modelId,
        outputFormat,
        seed: req.overrides?.elevenlabs?.seed ?? req.config.elevenlabs.seed,
        applyTextNormalization:
          req.overrides?.elevenlabs?.applyTextNormalization ??
          req.config.elevenlabs.applyTextNormalization,
        languageCode: req.overrides?.elevenlabs?.languageCode ?? req.config.elevenlabs.languageCode,
        voiceSettings: {
          // Shallow merge: override keys replace individual config keys.
          ...req.config.elevenlabs.voiceSettings,
          ...req.overrides?.elevenlabs?.voiceSettings,
        },
        timeoutMs: req.config.timeoutMs,
      });
      return {
        audioBuffer,
        outputFormat,
        fileExtension: req.target === "voice-note" ? ".opus" : ".mp3",
        voiceCompatible: req.target === "voice-note",
      };
    },
    // Telephony synthesis: PCM at 22.05 kHz; directive overrides do not apply.
    synthesizeTelephony: async (req) => {
      const apiKey =
        req.config.elevenlabs.apiKey || process.env.ELEVENLABS_API_KEY || process.env.XI_API_KEY;
      if (!apiKey) {
        throw new Error("ElevenLabs API key missing");
      }
      const outputFormat = "pcm_22050";
      const sampleRate = 22_050;
      const audioBuffer = await elevenLabsTTS({
        text: req.text,
        apiKey,
        baseUrl: req.config.elevenlabs.baseUrl,
        voiceId: req.config.elevenlabs.voiceId,
        modelId: req.config.elevenlabs.modelId,
        outputFormat,
        seed: req.config.elevenlabs.seed,
        applyTextNormalization: req.config.elevenlabs.applyTextNormalization,
        languageCode: req.config.elevenlabs.languageCode,
        voiceSettings: req.config.elevenlabs.voiceSettings,
        timeoutMs: req.config.timeoutMs,
      });
      return { audioBuffer, outputFormat, sampleRate };
    },
  };
}
|
||||
60
src/tts/providers/microsoft.ts
Normal file
60
src/tts/providers/microsoft.ts
Normal file
@@ -0,0 +1,60 @@
|
||||
import { mkdirSync, mkdtempSync, readFileSync, rmSync } from "node:fs";
|
||||
import path from "node:path";
|
||||
import { resolvePreferredOpenClawTmpDir } from "../../infra/tmp-openclaw-dir.js";
|
||||
import { isVoiceCompatibleAudio } from "../../media/audio.js";
|
||||
import type { SpeechProviderPlugin } from "../../plugins/types.js";
|
||||
import { edgeTTS, inferEdgeExtension } from "../tts-core.js";
|
||||
|
||||
// Safe default Edge output format, also used as the retry fallback below.
const DEFAULT_EDGE_OUTPUT_FORMAT = "audio-24khz-48kbitrate-mono-mp3";

/**
 * Build the bundled Microsoft (Edge TTS) speech provider.
 *
 * Edge synthesis writes to disk, so each request runs inside a private
 * temp directory that is always removed afterwards. If the configured
 * output format fails, one retry is attempted with the default format.
 * No `synthesizeTelephony` — Edge is not offered for telephony.
 */
export function buildMicrosoftSpeechProvider(): SpeechProviderPlugin {
  return {
    id: "microsoft",
    label: "Microsoft",
    // "edge" is the legacy provider id kept for backward compatibility.
    aliases: ["edge"],
    // No API key needed; configured iff the edge block is enabled.
    isConfigured: ({ config }) => config.edge.enabled,
    synthesize: async (req) => {
      // Private temp dir (0o700) so the intermediate audio file is not shared.
      const tempRoot = resolvePreferredOpenClawTmpDir();
      mkdirSync(tempRoot, { recursive: true, mode: 0o700 });
      const tempDir = mkdtempSync(path.join(tempRoot, "tts-microsoft-"));
      let outputFormat = req.config.edge.outputFormat;
      // Only fall back when the configured format differs from the default;
      // otherwise a retry would just repeat the same failing call.
      const fallbackOutputFormat =
        outputFormat !== DEFAULT_EDGE_OUTPUT_FORMAT ? DEFAULT_EDGE_OUTPUT_FORMAT : undefined;

      try {
        // One synthesis attempt for a given format; reads the result back into memory.
        const runEdge = async (format: string) => {
          const fileExtension = inferEdgeExtension(format);
          const outputPath = path.join(tempDir, `speech${fileExtension}`);
          await edgeTTS({
            text: req.text,
            outputPath,
            config: {
              ...req.config.edge,
              outputFormat: format,
            },
            timeoutMs: req.config.timeoutMs,
          });
          const audioBuffer = readFileSync(outputPath);
          return {
            audioBuffer,
            outputFormat: format,
            fileExtension,
            // Voice-note compatibility is decided by the actual file type, not the target.
            voiceCompatible: isVoiceCompatibleAudio({ fileName: outputPath }),
          };
        };

        try {
          return await runEdge(outputFormat);
        } catch (err) {
          // No distinct fallback available: surface the original failure.
          if (!fallbackOutputFormat || fallbackOutputFormat === outputFormat) {
            throw err;
          }
          outputFormat = fallbackOutputFormat;
          return await runEdge(outputFormat);
        }
      } finally {
        // Always clean up the temp dir — the audio has been read into memory.
        rmSync(tempDir, { recursive: true, force: true });
      }
    },
  };
}
|
||||
56
src/tts/providers/openai.ts
Normal file
56
src/tts/providers/openai.ts
Normal file
@@ -0,0 +1,56 @@
|
||||
import type { SpeechProviderPlugin } from "../../plugins/types.js";
|
||||
import { OPENAI_TTS_MODELS, OPENAI_TTS_VOICES, openaiTTS } from "../tts-core.js";
|
||||
|
||||
export function buildOpenAISpeechProvider(): SpeechProviderPlugin {
|
||||
return {
|
||||
id: "openai",
|
||||
label: "OpenAI",
|
||||
models: OPENAI_TTS_MODELS,
|
||||
voices: OPENAI_TTS_VOICES,
|
||||
isConfigured: ({ config }) => Boolean(config.openai.apiKey || process.env.OPENAI_API_KEY),
|
||||
synthesize: async (req) => {
|
||||
const apiKey = req.config.openai.apiKey || process.env.OPENAI_API_KEY;
|
||||
if (!apiKey) {
|
||||
throw new Error("OpenAI API key missing");
|
||||
}
|
||||
const responseFormat = req.target === "voice-note" ? "opus" : "mp3";
|
||||
const audioBuffer = await openaiTTS({
|
||||
text: req.text,
|
||||
apiKey,
|
||||
baseUrl: req.config.openai.baseUrl,
|
||||
model: req.overrides?.openai?.model ?? req.config.openai.model,
|
||||
voice: req.overrides?.openai?.voice ?? req.config.openai.voice,
|
||||
speed: req.config.openai.speed,
|
||||
instructions: req.config.openai.instructions,
|
||||
responseFormat,
|
||||
timeoutMs: req.config.timeoutMs,
|
||||
});
|
||||
return {
|
||||
audioBuffer,
|
||||
outputFormat: responseFormat,
|
||||
fileExtension: responseFormat === "opus" ? ".opus" : ".mp3",
|
||||
voiceCompatible: req.target === "voice-note",
|
||||
};
|
||||
},
|
||||
synthesizeTelephony: async (req) => {
|
||||
const apiKey = req.config.openai.apiKey || process.env.OPENAI_API_KEY;
|
||||
if (!apiKey) {
|
||||
throw new Error("OpenAI API key missing");
|
||||
}
|
||||
const outputFormat = "pcm";
|
||||
const sampleRate = 24_000;
|
||||
const audioBuffer = await openaiTTS({
|
||||
text: req.text,
|
||||
apiKey,
|
||||
baseUrl: req.config.openai.baseUrl,
|
||||
model: req.config.openai.model,
|
||||
voice: req.config.openai.voice,
|
||||
speed: req.config.openai.speed,
|
||||
instructions: req.config.openai.instructions,
|
||||
responseFormat: outputFormat,
|
||||
timeoutMs: req.config.timeoutMs,
|
||||
});
|
||||
return { audioBuffer, outputFormat, sampleRate };
|
||||
},
|
||||
};
|
||||
}
|
||||
@@ -156,10 +156,13 @@ export function parseTtsDirectives(
|
||||
if (!policy.allowProvider) {
|
||||
break;
|
||||
}
|
||||
if (rawValue === "openai" || rawValue === "elevenlabs" || rawValue === "edge") {
|
||||
overrides.provider = rawValue;
|
||||
} else {
|
||||
warnings.push(`unsupported provider "${rawValue}"`);
|
||||
{
|
||||
const providerId = rawValue.trim().toLowerCase();
|
||||
if (providerId) {
|
||||
overrides.provider = providerId;
|
||||
} else {
|
||||
warnings.push("invalid provider id");
|
||||
}
|
||||
}
|
||||
break;
|
||||
case "voice":
|
||||
|
||||
@@ -311,7 +311,7 @@ describe("tts", () => {
|
||||
expect(result.overrides.elevenlabs?.voiceSettings?.speed).toBe(1.1);
|
||||
});
|
||||
|
||||
it("accepts edge as provider override", () => {
|
||||
it("accepts edge as a legacy microsoft provider override", () => {
|
||||
const policy = resolveModelOverridePolicy({ enabled: true, allowProvider: true });
|
||||
const input = "Hello [[tts:provider=edge]] world";
|
||||
const result = parseTtsDirectives(input, policy);
|
||||
@@ -524,8 +524,8 @@ describe("tts", () => {
|
||||
ELEVENLABS_API_KEY: undefined,
|
||||
XI_API_KEY: undefined,
|
||||
},
|
||||
prefsPath: "/tmp/tts-prefs-edge.json",
|
||||
expected: "edge",
|
||||
prefsPath: "/tmp/tts-prefs-microsoft.json",
|
||||
expected: "microsoft",
|
||||
},
|
||||
] as const;
|
||||
|
||||
@@ -539,6 +539,25 @@ describe("tts", () => {
|
||||
});
|
||||
});
|
||||
|
||||
describe("resolveTtsConfig provider normalization", () => {
|
||||
it("normalizes legacy edge provider ids to microsoft", () => {
|
||||
const config = resolveTtsConfig({
|
||||
agents: { defaults: { model: { primary: "openai/gpt-4o-mini" } } },
|
||||
messages: {
|
||||
tts: {
|
||||
provider: "edge",
|
||||
edge: {
|
||||
enabled: true,
|
||||
},
|
||||
},
|
||||
},
|
||||
});
|
||||
|
||||
expect(config.provider).toBe("microsoft");
|
||||
expect(getTtsProvider(config, "/tmp/tts-prefs-normalized.json")).toBe("microsoft");
|
||||
});
|
||||
});
|
||||
|
||||
describe("resolveTtsConfig – openai.baseUrl", () => {
|
||||
const baseCfg: OpenClawConfig = {
|
||||
agents: { defaults: { model: { primary: "openai/gpt-4o-mini" } } },
|
||||
|
||||
285
src/tts/tts.ts
285
src/tts/tts.ts
@@ -5,7 +5,6 @@ import {
|
||||
readFileSync,
|
||||
writeFileSync,
|
||||
mkdtempSync,
|
||||
rmSync,
|
||||
renameSync,
|
||||
unlinkSync,
|
||||
} from "node:fs";
|
||||
@@ -25,20 +24,20 @@ import type {
|
||||
import { logVerbose } from "../globals.js";
|
||||
import { resolvePreferredOpenClawTmpDir } from "../infra/tmp-openclaw-dir.js";
|
||||
import { stripMarkdown } from "../line/markdown-to-line.js";
|
||||
import { isVoiceCompatibleAudio } from "../media/audio.js";
|
||||
import { CONFIG_DIR, resolveUserPath } from "../utils.js";
|
||||
import {
|
||||
getSpeechProvider,
|
||||
listSpeechProviders,
|
||||
normalizeSpeechProviderId,
|
||||
} from "./provider-registry.js";
|
||||
import {
|
||||
DEFAULT_OPENAI_BASE_URL,
|
||||
edgeTTS,
|
||||
elevenLabsTTS,
|
||||
inferEdgeExtension,
|
||||
isValidOpenAIModel,
|
||||
isValidOpenAIVoice,
|
||||
isValidVoiceId,
|
||||
OPENAI_TTS_MODELS,
|
||||
OPENAI_TTS_VOICES,
|
||||
resolveOpenAITtsInstructions,
|
||||
openaiTTS,
|
||||
parseTtsDirectives,
|
||||
scheduleCleanup,
|
||||
summarizeText,
|
||||
@@ -83,11 +82,6 @@ const DEFAULT_OUTPUT = {
|
||||
voiceCompatible: false,
|
||||
};
|
||||
|
||||
const TELEPHONY_OUTPUT = {
|
||||
openai: { format: "pcm" as const, sampleRate: 24000 },
|
||||
elevenlabs: { format: "pcm_22050", sampleRate: 22050 },
|
||||
};
|
||||
|
||||
const TTS_AUTO_MODES = new Set<TtsAutoMode>(["off", "always", "inbound", "tagged"]);
|
||||
|
||||
export type ResolvedTtsConfig = {
|
||||
@@ -261,12 +255,13 @@ function resolveModelOverridePolicy(
|
||||
export function resolveTtsConfig(cfg: OpenClawConfig): ResolvedTtsConfig {
|
||||
const raw: TtsConfig = cfg.messages?.tts ?? {};
|
||||
const providerSource = raw.provider ? "config" : "default";
|
||||
const edgeOutputFormat = raw.edge?.outputFormat?.trim();
|
||||
const rawMicrosoft = { ...raw.edge, ...raw.microsoft };
|
||||
const edgeOutputFormat = rawMicrosoft.outputFormat?.trim();
|
||||
const auto = normalizeTtsAutoMode(raw.auto) ?? (raw.enabled ? "always" : "off");
|
||||
return {
|
||||
auto,
|
||||
mode: raw.mode ?? "final",
|
||||
provider: raw.provider ?? "edge",
|
||||
provider: normalizeSpeechProviderId(raw.provider) ?? "microsoft",
|
||||
providerSource,
|
||||
summaryModel: raw.summaryModel?.trim() || undefined,
|
||||
modelOverrides: resolveModelOverridePolicy(raw.modelOverrides),
|
||||
@@ -311,17 +306,17 @@ export function resolveTtsConfig(cfg: OpenClawConfig): ResolvedTtsConfig {
|
||||
instructions: raw.openai?.instructions?.trim() || undefined,
|
||||
},
|
||||
edge: {
|
||||
enabled: raw.edge?.enabled ?? true,
|
||||
voice: raw.edge?.voice?.trim() || DEFAULT_EDGE_VOICE,
|
||||
lang: raw.edge?.lang?.trim() || DEFAULT_EDGE_LANG,
|
||||
enabled: rawMicrosoft.enabled ?? true,
|
||||
voice: rawMicrosoft.voice?.trim() || DEFAULT_EDGE_VOICE,
|
||||
lang: rawMicrosoft.lang?.trim() || DEFAULT_EDGE_LANG,
|
||||
outputFormat: edgeOutputFormat || DEFAULT_EDGE_OUTPUT_FORMAT,
|
||||
outputFormatConfigured: Boolean(edgeOutputFormat),
|
||||
pitch: raw.edge?.pitch?.trim() || undefined,
|
||||
rate: raw.edge?.rate?.trim() || undefined,
|
||||
volume: raw.edge?.volume?.trim() || undefined,
|
||||
saveSubtitles: raw.edge?.saveSubtitles ?? false,
|
||||
proxy: raw.edge?.proxy?.trim() || undefined,
|
||||
timeoutMs: raw.edge?.timeoutMs,
|
||||
pitch: rawMicrosoft.pitch?.trim() || undefined,
|
||||
rate: rawMicrosoft.rate?.trim() || undefined,
|
||||
volume: rawMicrosoft.volume?.trim() || undefined,
|
||||
saveSubtitles: rawMicrosoft.saveSubtitles ?? false,
|
||||
proxy: rawMicrosoft.proxy?.trim() || undefined,
|
||||
timeoutMs: rawMicrosoft.timeoutMs,
|
||||
},
|
||||
prefsPath: raw.prefsPath,
|
||||
maxTextLength: raw.maxTextLength ?? DEFAULT_MAX_TEXT_LENGTH,
|
||||
@@ -448,11 +443,12 @@ export function setTtsEnabled(prefsPath: string, enabled: boolean): void {
|
||||
|
||||
export function getTtsProvider(config: ResolvedTtsConfig, prefsPath: string): TtsProvider {
|
||||
const prefs = readPrefs(prefsPath);
|
||||
if (prefs.tts?.provider) {
|
||||
return prefs.tts.provider;
|
||||
const prefsProvider = normalizeSpeechProviderId(prefs.tts?.provider);
|
||||
if (prefsProvider) {
|
||||
return prefsProvider;
|
||||
}
|
||||
if (config.providerSource === "config") {
|
||||
return config.provider;
|
||||
return normalizeSpeechProviderId(config.provider) ?? config.provider;
|
||||
}
|
||||
|
||||
if (resolveTtsApiKey(config, "openai")) {
|
||||
@@ -461,12 +457,12 @@ export function getTtsProvider(config: ResolvedTtsConfig, prefsPath: string): Tt
|
||||
if (resolveTtsApiKey(config, "elevenlabs")) {
|
||||
return "elevenlabs";
|
||||
}
|
||||
return "edge";
|
||||
return "microsoft";
|
||||
}
|
||||
|
||||
export function setTtsProvider(prefsPath: string, provider: TtsProvider): void {
|
||||
updatePrefs(prefsPath, (prefs) => {
|
||||
prefs.tts = { ...prefs.tts, provider };
|
||||
prefs.tts = { ...prefs.tts, provider: normalizeSpeechProviderId(provider) ?? provider };
|
||||
});
|
||||
}
|
||||
|
||||
@@ -522,26 +518,42 @@ export function resolveTtsApiKey(
|
||||
config: ResolvedTtsConfig,
|
||||
provider: TtsProvider,
|
||||
): string | undefined {
|
||||
if (provider === "elevenlabs") {
|
||||
const normalizedProvider = normalizeSpeechProviderId(provider);
|
||||
if (normalizedProvider === "elevenlabs") {
|
||||
return config.elevenlabs.apiKey || process.env.ELEVENLABS_API_KEY || process.env.XI_API_KEY;
|
||||
}
|
||||
if (provider === "openai") {
|
||||
if (normalizedProvider === "openai") {
|
||||
return config.openai.apiKey || process.env.OPENAI_API_KEY;
|
||||
}
|
||||
return undefined;
|
||||
}
|
||||
|
||||
export const TTS_PROVIDERS = ["openai", "elevenlabs", "edge"] as const;
|
||||
export const TTS_PROVIDERS = ["openai", "elevenlabs", "microsoft"] as const;
|
||||
|
||||
export function resolveTtsProviderOrder(primary: TtsProvider): TtsProvider[] {
|
||||
return [primary, ...TTS_PROVIDERS.filter((provider) => provider !== primary)];
|
||||
export function resolveTtsProviderOrder(primary: TtsProvider, cfg?: OpenClawConfig): TtsProvider[] {
|
||||
const normalizedPrimary = normalizeSpeechProviderId(primary) ?? primary;
|
||||
const ordered = new Set<TtsProvider>([normalizedPrimary]);
|
||||
for (const provider of TTS_PROVIDERS) {
|
||||
if (provider !== normalizedPrimary) {
|
||||
ordered.add(provider);
|
||||
}
|
||||
}
|
||||
for (const provider of listSpeechProviders(cfg)) {
|
||||
const normalized = normalizeSpeechProviderId(provider.id) ?? provider.id;
|
||||
if (normalized !== normalizedPrimary) {
|
||||
ordered.add(normalized);
|
||||
}
|
||||
}
|
||||
return [...ordered];
|
||||
}
|
||||
|
||||
export function isTtsProviderConfigured(config: ResolvedTtsConfig, provider: TtsProvider): boolean {
|
||||
if (provider === "edge") {
|
||||
return config.edge.enabled;
|
||||
}
|
||||
return Boolean(resolveTtsApiKey(config, provider));
|
||||
export function isTtsProviderConfigured(
|
||||
config: ResolvedTtsConfig,
|
||||
provider: TtsProvider,
|
||||
cfg?: OpenClawConfig,
|
||||
): boolean {
|
||||
const resolvedProvider = getSpeechProvider(provider, cfg);
|
||||
return resolvedProvider?.isConfigured({ cfg, config }) ?? false;
|
||||
}
|
||||
|
||||
function formatTtsProviderError(provider: TtsProvider, err: unknown): string {
|
||||
@@ -581,10 +593,10 @@ function resolveTtsRequestSetup(params: {
|
||||
}
|
||||
|
||||
const userProvider = getTtsProvider(config, prefsPath);
|
||||
const provider = params.providerOverride ?? userProvider;
|
||||
const provider = normalizeSpeechProviderId(params.providerOverride) ?? userProvider;
|
||||
return {
|
||||
config,
|
||||
providers: resolveTtsProviderOrder(provider),
|
||||
providers: resolveTtsProviderOrder(provider, params.cfg),
|
||||
};
|
||||
}
|
||||
|
||||
@@ -607,136 +619,36 @@ export async function textToSpeech(params: {
|
||||
|
||||
const { config, providers } = setup;
|
||||
const channelId = resolveChannelId(params.channel);
|
||||
const output = resolveOutputFormat(channelId);
|
||||
const target = channelId && VOICE_BUBBLE_CHANNELS.has(channelId) ? "voice-note" : "audio-file";
|
||||
|
||||
const errors: string[] = [];
|
||||
|
||||
for (const provider of providers) {
|
||||
const providerStart = Date.now();
|
||||
try {
|
||||
if (provider === "edge") {
|
||||
if (!config.edge.enabled) {
|
||||
errors.push("edge: disabled");
|
||||
continue;
|
||||
}
|
||||
|
||||
const tempRoot = resolvePreferredOpenClawTmpDir();
|
||||
mkdirSync(tempRoot, { recursive: true, mode: 0o700 });
|
||||
const tempDir = mkdtempSync(path.join(tempRoot, "tts-"));
|
||||
let edgeOutputFormat = resolveEdgeOutputFormat(config);
|
||||
const fallbackEdgeOutputFormat =
|
||||
edgeOutputFormat !== DEFAULT_EDGE_OUTPUT_FORMAT ? DEFAULT_EDGE_OUTPUT_FORMAT : undefined;
|
||||
|
||||
const attemptEdgeTts = async (outputFormat: string) => {
|
||||
const extension = inferEdgeExtension(outputFormat);
|
||||
const audioPath = path.join(tempDir, `voice-${Date.now()}${extension}`);
|
||||
await edgeTTS({
|
||||
text: params.text,
|
||||
outputPath: audioPath,
|
||||
config: {
|
||||
...config.edge,
|
||||
outputFormat,
|
||||
},
|
||||
timeoutMs: config.timeoutMs,
|
||||
});
|
||||
return { audioPath, outputFormat };
|
||||
};
|
||||
|
||||
let edgeResult: { audioPath: string; outputFormat: string };
|
||||
try {
|
||||
edgeResult = await attemptEdgeTts(edgeOutputFormat);
|
||||
} catch (err) {
|
||||
if (fallbackEdgeOutputFormat && fallbackEdgeOutputFormat !== edgeOutputFormat) {
|
||||
logVerbose(
|
||||
`TTS: Edge output ${edgeOutputFormat} failed; retrying with ${fallbackEdgeOutputFormat}.`,
|
||||
);
|
||||
edgeOutputFormat = fallbackEdgeOutputFormat;
|
||||
try {
|
||||
edgeResult = await attemptEdgeTts(edgeOutputFormat);
|
||||
} catch (fallbackErr) {
|
||||
try {
|
||||
rmSync(tempDir, { recursive: true, force: true });
|
||||
} catch {
|
||||
// ignore cleanup errors
|
||||
}
|
||||
throw fallbackErr;
|
||||
}
|
||||
} else {
|
||||
try {
|
||||
rmSync(tempDir, { recursive: true, force: true });
|
||||
} catch {
|
||||
// ignore cleanup errors
|
||||
}
|
||||
throw err;
|
||||
}
|
||||
}
|
||||
|
||||
scheduleCleanup(tempDir);
|
||||
const voiceCompatible = isVoiceCompatibleAudio({ fileName: edgeResult.audioPath });
|
||||
|
||||
return {
|
||||
success: true,
|
||||
audioPath: edgeResult.audioPath,
|
||||
latencyMs: Date.now() - providerStart,
|
||||
provider,
|
||||
outputFormat: edgeResult.outputFormat,
|
||||
voiceCompatible,
|
||||
};
|
||||
}
|
||||
|
||||
const apiKey = resolveTtsApiKey(config, provider);
|
||||
if (!apiKey) {
|
||||
errors.push(`${provider}: no API key`);
|
||||
const resolvedProvider = getSpeechProvider(provider, params.cfg);
|
||||
if (!resolvedProvider) {
|
||||
errors.push(`${provider}: no provider registered`);
|
||||
continue;
|
||||
}
|
||||
|
||||
let audioBuffer: Buffer;
|
||||
if (provider === "elevenlabs") {
|
||||
const voiceIdOverride = params.overrides?.elevenlabs?.voiceId;
|
||||
const modelIdOverride = params.overrides?.elevenlabs?.modelId;
|
||||
const voiceSettings = {
|
||||
...config.elevenlabs.voiceSettings,
|
||||
...params.overrides?.elevenlabs?.voiceSettings,
|
||||
};
|
||||
const seedOverride = params.overrides?.elevenlabs?.seed;
|
||||
const normalizationOverride = params.overrides?.elevenlabs?.applyTextNormalization;
|
||||
const languageOverride = params.overrides?.elevenlabs?.languageCode;
|
||||
audioBuffer = await elevenLabsTTS({
|
||||
text: params.text,
|
||||
apiKey,
|
||||
baseUrl: config.elevenlabs.baseUrl,
|
||||
voiceId: voiceIdOverride ?? config.elevenlabs.voiceId,
|
||||
modelId: modelIdOverride ?? config.elevenlabs.modelId,
|
||||
outputFormat: output.elevenlabs,
|
||||
seed: seedOverride ?? config.elevenlabs.seed,
|
||||
applyTextNormalization: normalizationOverride ?? config.elevenlabs.applyTextNormalization,
|
||||
languageCode: languageOverride ?? config.elevenlabs.languageCode,
|
||||
voiceSettings,
|
||||
timeoutMs: config.timeoutMs,
|
||||
});
|
||||
} else {
|
||||
const openaiModelOverride = params.overrides?.openai?.model;
|
||||
const openaiVoiceOverride = params.overrides?.openai?.voice;
|
||||
audioBuffer = await openaiTTS({
|
||||
text: params.text,
|
||||
apiKey,
|
||||
baseUrl: config.openai.baseUrl,
|
||||
model: openaiModelOverride ?? config.openai.model,
|
||||
voice: openaiVoiceOverride ?? config.openai.voice,
|
||||
speed: config.openai.speed,
|
||||
instructions: config.openai.instructions,
|
||||
responseFormat: output.openai,
|
||||
timeoutMs: config.timeoutMs,
|
||||
});
|
||||
if (!resolvedProvider.isConfigured({ cfg: params.cfg, config })) {
|
||||
errors.push(`${provider}: not configured`);
|
||||
continue;
|
||||
}
|
||||
|
||||
const synthesis = await resolvedProvider.synthesize({
|
||||
text: params.text,
|
||||
cfg: params.cfg,
|
||||
config,
|
||||
target,
|
||||
overrides: params.overrides,
|
||||
});
|
||||
const latencyMs = Date.now() - providerStart;
|
||||
|
||||
const tempRoot = resolvePreferredOpenClawTmpDir();
|
||||
mkdirSync(tempRoot, { recursive: true, mode: 0o700 });
|
||||
const tempDir = mkdtempSync(path.join(tempRoot, "tts-"));
|
||||
const audioPath = path.join(tempDir, `voice-${Date.now()}${output.extension}`);
|
||||
writeFileSync(audioPath, audioBuffer);
|
||||
const audioPath = path.join(tempDir, `voice-${Date.now()}${synthesis.fileExtension}`);
|
||||
writeFileSync(audioPath, synthesis.audioBuffer);
|
||||
scheduleCleanup(tempDir);
|
||||
|
||||
return {
|
||||
@@ -744,8 +656,8 @@ export async function textToSpeech(params: {
|
||||
audioPath,
|
||||
latencyMs,
|
||||
provider,
|
||||
outputFormat: provider === "openai" ? output.openai : output.elevenlabs,
|
||||
voiceCompatible: output.voiceCompatible,
|
||||
outputFormat: synthesis.outputFormat,
|
||||
voiceCompatible: synthesis.voiceCompatible,
|
||||
};
|
||||
} catch (err) {
|
||||
errors.push(formatTtsProviderError(provider, err));
|
||||
@@ -776,63 +688,32 @@ export async function textToSpeechTelephony(params: {
|
||||
for (const provider of providers) {
|
||||
const providerStart = Date.now();
|
||||
try {
|
||||
if (provider === "edge") {
|
||||
errors.push("edge: unsupported for telephony");
|
||||
const resolvedProvider = getSpeechProvider(provider, params.cfg);
|
||||
if (!resolvedProvider) {
|
||||
errors.push(`${provider}: no provider registered`);
|
||||
continue;
|
||||
}
|
||||
|
||||
const apiKey = resolveTtsApiKey(config, provider);
|
||||
if (!apiKey) {
|
||||
errors.push(`${provider}: no API key`);
|
||||
if (!resolvedProvider.isConfigured({ cfg: params.cfg, config })) {
|
||||
errors.push(`${provider}: not configured`);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (provider === "elevenlabs") {
|
||||
const output = TELEPHONY_OUTPUT.elevenlabs;
|
||||
const audioBuffer = await elevenLabsTTS({
|
||||
text: params.text,
|
||||
apiKey,
|
||||
baseUrl: config.elevenlabs.baseUrl,
|
||||
voiceId: config.elevenlabs.voiceId,
|
||||
modelId: config.elevenlabs.modelId,
|
||||
outputFormat: output.format,
|
||||
seed: config.elevenlabs.seed,
|
||||
applyTextNormalization: config.elevenlabs.applyTextNormalization,
|
||||
languageCode: config.elevenlabs.languageCode,
|
||||
voiceSettings: config.elevenlabs.voiceSettings,
|
||||
timeoutMs: config.timeoutMs,
|
||||
});
|
||||
|
||||
return {
|
||||
success: true,
|
||||
audioBuffer,
|
||||
latencyMs: Date.now() - providerStart,
|
||||
provider,
|
||||
outputFormat: output.format,
|
||||
sampleRate: output.sampleRate,
|
||||
};
|
||||
if (!resolvedProvider.synthesizeTelephony) {
|
||||
errors.push(`${provider}: unsupported for telephony`);
|
||||
continue;
|
||||
}
|
||||
|
||||
const output = TELEPHONY_OUTPUT.openai;
|
||||
const audioBuffer = await openaiTTS({
|
||||
const synthesis = await resolvedProvider.synthesizeTelephony({
|
||||
text: params.text,
|
||||
apiKey,
|
||||
baseUrl: config.openai.baseUrl,
|
||||
model: config.openai.model,
|
||||
voice: config.openai.voice,
|
||||
speed: config.openai.speed,
|
||||
instructions: config.openai.instructions,
|
||||
responseFormat: output.format,
|
||||
timeoutMs: config.timeoutMs,
|
||||
cfg: params.cfg,
|
||||
config,
|
||||
});
|
||||
|
||||
return {
|
||||
success: true,
|
||||
audioBuffer,
|
||||
audioBuffer: synthesis.audioBuffer,
|
||||
latencyMs: Date.now() - providerStart,
|
||||
provider,
|
||||
outputFormat: output.format,
|
||||
sampleRate: output.sampleRate,
|
||||
outputFormat: synthesis.outputFormat,
|
||||
sampleRate: synthesis.sampleRate,
|
||||
};
|
||||
} catch (err) {
|
||||
errors.push(formatTtsProviderError(provider, err));
|
||||
|
||||
Reference in New Issue
Block a user