import { formatErrorMessage } from "openclaw/plugin-sdk/error-runtime"; import { normalizeResolvedSecretInputString } from "openclaw/plugin-sdk/secret-input"; import type { SpeechDirectiveTokenParseContext, SpeechProviderConfig, SpeechProviderOverrides, SpeechProviderPlugin, SpeechVoiceOption, } from "openclaw/plugin-sdk/speech"; import { asBoolean, asFiniteNumber, asObject, normalizeApplyTextNormalization, normalizeLanguageCode, normalizeSeed, requireInRange, trimToUndefined, } from "openclaw/plugin-sdk/speech"; import { resolveElevenLabsApiKeyWithProfileFallback } from "./config-api.js"; import { isValidElevenLabsVoiceId, normalizeElevenLabsBaseUrl } from "./shared.js"; import { elevenLabsTTS } from "./tts.js"; const DEFAULT_ELEVENLABS_VOICE_ID = "pMsXgVXv3BLzUgSXRplE"; const DEFAULT_ELEVENLABS_MODEL_ID = "eleven_multilingual_v2"; const DEFAULT_ELEVENLABS_VOICE_SETTINGS = { stability: 0.5, similarityBoost: 0.75, style: 0.0, useSpeakerBoost: true, speed: 1.0, }; const ELEVENLABS_TTS_MODELS = [ "eleven_multilingual_v2", "eleven_turbo_v2_5", "eleven_monolingual_v1", ] as const; type ElevenLabsProviderConfig = { apiKey?: string; baseUrl: string; voiceId: string; modelId: string; seed?: number; applyTextNormalization?: "auto" | "on" | "off"; languageCode?: string; voiceSettings: { stability: number; similarityBoost: number; style: number; useSpeakerBoost: boolean; speed: number; }; }; function parseBooleanValue(value: string): boolean | undefined { const normalized = value.trim().toLowerCase(); if (["true", "1", "yes", "on"].includes(normalized)) { return true; } if (["false", "0", "no", "off"].includes(normalized)) { return false; } return undefined; } function parseNumberValue(value: string): number | undefined { const parsed = Number.parseFloat(value); return Number.isFinite(parsed) ? parsed : undefined; } export const isValidVoiceId = isValidElevenLabsVoiceId; function normalizeElevenLabsProviderConfig( rawConfig: Record, ): ElevenLabsProviderConfig { const providers = asObject(rawConfig.providers); const raw = asObject(providers?.elevenlabs) ?? asObject(rawConfig.elevenlabs); const rawVoiceSettings = asObject(raw?.voiceSettings); return { apiKey: normalizeResolvedSecretInputString({ value: raw?.apiKey, path: "messages.tts.providers.elevenlabs.apiKey", }), baseUrl: normalizeElevenLabsBaseUrl(trimToUndefined(raw?.baseUrl)), voiceId: trimToUndefined(raw?.voiceId) ?? DEFAULT_ELEVENLABS_VOICE_ID, modelId: trimToUndefined(raw?.modelId) ?? DEFAULT_ELEVENLABS_MODEL_ID, seed: asFiniteNumber(raw?.seed), applyTextNormalization: trimToUndefined(raw?.applyTextNormalization) as | "auto" | "on" | "off" | undefined, languageCode: trimToUndefined(raw?.languageCode), voiceSettings: { stability: asFiniteNumber(rawVoiceSettings?.stability) ?? DEFAULT_ELEVENLABS_VOICE_SETTINGS.stability, similarityBoost: asFiniteNumber(rawVoiceSettings?.similarityBoost) ?? DEFAULT_ELEVENLABS_VOICE_SETTINGS.similarityBoost, style: asFiniteNumber(rawVoiceSettings?.style) ?? DEFAULT_ELEVENLABS_VOICE_SETTINGS.style, useSpeakerBoost: asBoolean(rawVoiceSettings?.useSpeakerBoost) ?? DEFAULT_ELEVENLABS_VOICE_SETTINGS.useSpeakerBoost, speed: asFiniteNumber(rawVoiceSettings?.speed) ?? DEFAULT_ELEVENLABS_VOICE_SETTINGS.speed, }, }; } function readElevenLabsProviderConfig(config: SpeechProviderConfig): ElevenLabsProviderConfig { const defaults = normalizeElevenLabsProviderConfig({}); const voiceSettings = asObject(config.voiceSettings); return { apiKey: trimToUndefined(config.apiKey) ?? defaults.apiKey, baseUrl: normalizeElevenLabsBaseUrl(trimToUndefined(config.baseUrl) ?? defaults.baseUrl), voiceId: trimToUndefined(config.voiceId) ?? defaults.voiceId, modelId: trimToUndefined(config.modelId) ?? defaults.modelId, seed: asFiniteNumber(config.seed) ?? defaults.seed, applyTextNormalization: (trimToUndefined(config.applyTextNormalization) as "auto" | "on" | "off" | undefined) ?? defaults.applyTextNormalization, languageCode: trimToUndefined(config.languageCode) ?? defaults.languageCode, voiceSettings: { stability: asFiniteNumber(voiceSettings?.stability) ?? defaults.voiceSettings.stability, similarityBoost: asFiniteNumber(voiceSettings?.similarityBoost) ?? defaults.voiceSettings.similarityBoost, style: asFiniteNumber(voiceSettings?.style) ?? defaults.voiceSettings.style, useSpeakerBoost: asBoolean(voiceSettings?.useSpeakerBoost) ?? defaults.voiceSettings.useSpeakerBoost, speed: asFiniteNumber(voiceSettings?.speed) ?? defaults.voiceSettings.speed, }, }; } function mergeVoiceSettingsOverride( ctx: SpeechDirectiveTokenParseContext, next: Record, ): SpeechProviderOverrides { return { ...ctx.currentOverrides, voiceSettings: { ...asObject(ctx.currentOverrides?.voiceSettings), ...next, }, }; } function parseDirectiveToken(ctx: SpeechDirectiveTokenParseContext) { try { switch (ctx.key) { case "voiceid": case "voice_id": case "elevenlabs_voice": case "elevenlabsvoice": if (!ctx.policy.allowVoice) { return { handled: true }; } if (!isValidElevenLabsVoiceId(ctx.value)) { return { handled: true, warnings: [`invalid ElevenLabs voiceId "${ctx.value}"`] }; } return { handled: true, overrides: { ...ctx.currentOverrides, voiceId: ctx.value }, }; case "model": case "modelid": case "model_id": case "elevenlabs_model": case "elevenlabsmodel": if (!ctx.policy.allowModelId) { return { handled: true }; } return { handled: true, overrides: { ...ctx.currentOverrides, modelId: ctx.value }, }; case "stability": { if (!ctx.policy.allowVoiceSettings) { return { handled: true }; } const value = parseNumberValue(ctx.value); if (value == null) { return { handled: true, warnings: ["invalid stability value"] }; } requireInRange(value, 0, 1, "stability"); return { handled: true, overrides: mergeVoiceSettingsOverride(ctx, { stability: value }) }; } case "similarity": case "similarityboost": case "similarity_boost": { if (!ctx.policy.allowVoiceSettings) { return { handled: true }; } const value = parseNumberValue(ctx.value); if (value == null) { return { handled: true, warnings: ["invalid similarityBoost value"] }; } requireInRange(value, 0, 1, "similarityBoost"); return { handled: true, overrides: mergeVoiceSettingsOverride(ctx, { similarityBoost: value }), }; } case "style": { if (!ctx.policy.allowVoiceSettings) { return { handled: true }; } const value = parseNumberValue(ctx.value); if (value == null) { return { handled: true, warnings: ["invalid style value"] }; } requireInRange(value, 0, 1, "style"); return { handled: true, overrides: mergeVoiceSettingsOverride(ctx, { style: value }) }; } case "speed": { if (!ctx.policy.allowVoiceSettings) { return { handled: true }; } const value = parseNumberValue(ctx.value); if (value == null) { return { handled: true, warnings: ["invalid speed value"] }; } requireInRange(value, 0.5, 2, "speed"); return { handled: true, overrides: mergeVoiceSettingsOverride(ctx, { speed: value }) }; } case "speakerboost": case "speaker_boost": case "usespeakerboost": case "use_speaker_boost": { if (!ctx.policy.allowVoiceSettings) { return { handled: true }; } const value = parseBooleanValue(ctx.value); if (value == null) { return { handled: true, warnings: ["invalid useSpeakerBoost value"] }; } return { handled: true, overrides: mergeVoiceSettingsOverride(ctx, { useSpeakerBoost: value }), }; } case "normalize": case "applytextnormalization": case "apply_text_normalization": if (!ctx.policy.allowNormalization) { return { handled: true }; } return { handled: true, overrides: { ...ctx.currentOverrides, applyTextNormalization: normalizeApplyTextNormalization(ctx.value), }, }; case "language": case "languagecode": case "language_code": if (!ctx.policy.allowNormalization) { return { handled: true }; } return { handled: true, overrides: { ...ctx.currentOverrides, languageCode: normalizeLanguageCode(ctx.value), }, }; case "seed": if (!ctx.policy.allowSeed) { return { handled: true }; } return { handled: true, overrides: { ...ctx.currentOverrides, seed: normalizeSeed(Number.parseInt(ctx.value, 10)), }, }; default: return { handled: false }; } } catch (error) { return { handled: true, warnings: [formatErrorMessage(error)], }; } } export async function listElevenLabsVoices(params: { apiKey: string; baseUrl?: string; }): Promise { const res = await fetch(`${normalizeElevenLabsBaseUrl(params.baseUrl)}/v1/voices`, { headers: { "xi-api-key": params.apiKey, }, }); if (!res.ok) { throw new Error(`ElevenLabs voices API error (${res.status})`); } const json = (await res.json()) as { voices?: Array<{ voice_id?: string; name?: string; category?: string; description?: string; }>; }; return Array.isArray(json.voices) ? json.voices .map((voice) => ({ id: voice.voice_id?.trim() ?? "", name: trimToUndefined(voice.name), category: trimToUndefined(voice.category), description: trimToUndefined(voice.description), })) .filter((voice) => voice.id.length > 0) : []; } export function buildElevenLabsSpeechProvider(): SpeechProviderPlugin { return { id: "elevenlabs", label: "ElevenLabs", autoSelectOrder: 20, models: ELEVENLABS_TTS_MODELS, resolveConfig: ({ rawConfig }) => normalizeElevenLabsProviderConfig(rawConfig), parseDirectiveToken, resolveTalkConfig: ({ baseTtsConfig, talkProviderConfig }) => { const base = normalizeElevenLabsProviderConfig(baseTtsConfig); const talkVoiceSettings = asObject(talkProviderConfig.voiceSettings); const resolvedTalkApiKey = talkProviderConfig.apiKey === undefined ? (resolveElevenLabsApiKeyWithProfileFallback() ?? undefined) : normalizeResolvedSecretInputString({ value: talkProviderConfig.apiKey, path: "talk.providers.elevenlabs.apiKey", }); return { ...base, ...(resolvedTalkApiKey === undefined ? {} : { apiKey: resolvedTalkApiKey }), ...(trimToUndefined(talkProviderConfig.baseUrl) == null ? {} : { baseUrl: normalizeElevenLabsBaseUrl(trimToUndefined(talkProviderConfig.baseUrl)) }), ...(trimToUndefined(talkProviderConfig.voiceId) == null ? {} : { voiceId: trimToUndefined(talkProviderConfig.voiceId) }), ...(trimToUndefined(talkProviderConfig.modelId) == null ? {} : { modelId: trimToUndefined(talkProviderConfig.modelId) }), ...(asFiniteNumber(talkProviderConfig.seed) == null ? {} : { seed: asFiniteNumber(talkProviderConfig.seed) }), ...(trimToUndefined(talkProviderConfig.applyTextNormalization) == null ? {} : { applyTextNormalization: normalizeApplyTextNormalization( trimToUndefined(talkProviderConfig.applyTextNormalization), ), }), ...(trimToUndefined(talkProviderConfig.languageCode) == null ? {} : { languageCode: normalizeLanguageCode(trimToUndefined(talkProviderConfig.languageCode)), }), voiceSettings: { ...base.voiceSettings, ...(asFiniteNumber(talkVoiceSettings?.stability) == null ? {} : { stability: asFiniteNumber(talkVoiceSettings?.stability) }), ...(asFiniteNumber(talkVoiceSettings?.similarityBoost) == null ? {} : { similarityBoost: asFiniteNumber(talkVoiceSettings?.similarityBoost) }), ...(asFiniteNumber(talkVoiceSettings?.style) == null ? {} : { style: asFiniteNumber(talkVoiceSettings?.style) }), ...(asBoolean(talkVoiceSettings?.useSpeakerBoost) == null ? {} : { useSpeakerBoost: asBoolean(talkVoiceSettings?.useSpeakerBoost) }), ...(asFiniteNumber(talkVoiceSettings?.speed) == null ? {} : { speed: asFiniteNumber(talkVoiceSettings?.speed) }), }, }; }, resolveTalkOverrides: ({ params }) => { const normalize = trimToUndefined(params.normalize); const language = trimToUndefined(params.language)?.toLowerCase(); const latencyTier = asFiniteNumber(params.latencyTier); const voiceSettings = { ...(asFiniteNumber(params.speed) == null ? {} : { speed: asFiniteNumber(params.speed) }), ...(asFiniteNumber(params.stability) == null ? {} : { stability: asFiniteNumber(params.stability) }), ...(asFiniteNumber(params.similarity) == null ? {} : { similarityBoost: asFiniteNumber(params.similarity) }), ...(asFiniteNumber(params.style) == null ? {} : { style: asFiniteNumber(params.style) }), ...(asBoolean(params.speakerBoost) == null ? {} : { useSpeakerBoost: asBoolean(params.speakerBoost) }), }; return { ...(trimToUndefined(params.voiceId) == null ? {} : { voiceId: trimToUndefined(params.voiceId) }), ...(trimToUndefined(params.modelId) == null ? {} : { modelId: trimToUndefined(params.modelId) }), ...(trimToUndefined(params.outputFormat) == null ? {} : { outputFormat: trimToUndefined(params.outputFormat) }), ...(asFiniteNumber(params.seed) == null ? {} : { seed: asFiniteNumber(params.seed) }), ...(normalize == null ? {} : { applyTextNormalization: normalizeApplyTextNormalization(normalize) }), ...(language == null ? {} : { languageCode: normalizeLanguageCode(language) }), ...(latencyTier == null ? {} : { latencyTier }), ...(Object.keys(voiceSettings).length === 0 ? {} : { voiceSettings }), }; }, listVoices: async (req) => { const config = req.providerConfig ? readElevenLabsProviderConfig(req.providerConfig) : undefined; const apiKey = req.apiKey || config?.apiKey || resolveElevenLabsApiKeyWithProfileFallback() || process.env.XI_API_KEY; if (!apiKey) { throw new Error("ElevenLabs API key missing"); } return listElevenLabsVoices({ apiKey, baseUrl: req.baseUrl ?? config?.baseUrl, }); }, isConfigured: ({ providerConfig }) => Boolean( readElevenLabsProviderConfig(providerConfig).apiKey || resolveElevenLabsApiKeyWithProfileFallback() || process.env.XI_API_KEY, ), synthesize: async (req) => { const config = readElevenLabsProviderConfig(req.providerConfig); const overrides = req.providerOverrides ?? {}; const apiKey = config.apiKey || resolveElevenLabsApiKeyWithProfileFallback() || process.env.XI_API_KEY; if (!apiKey) { throw new Error("ElevenLabs API key missing"); } const outputFormat = trimToUndefined(overrides.outputFormat) ?? (req.target === "voice-note" ? "opus_48000_64" : "mp3_44100_128"); const overrideVoiceSettings = asObject(overrides.voiceSettings); const latencyTier = asFiniteNumber(overrides.latencyTier); const audioBuffer = await elevenLabsTTS({ text: req.text, apiKey, baseUrl: config.baseUrl, voiceId: trimToUndefined(overrides.voiceId) ?? config.voiceId, modelId: trimToUndefined(overrides.modelId) ?? config.modelId, outputFormat, seed: asFiniteNumber(overrides.seed) ?? config.seed, applyTextNormalization: (trimToUndefined(overrides.applyTextNormalization) as | "auto" | "on" | "off" | undefined) ?? config.applyTextNormalization, languageCode: trimToUndefined(overrides.languageCode) ?? config.languageCode, latencyTier, voiceSettings: { ...config.voiceSettings, ...(asFiniteNumber(overrideVoiceSettings?.stability) == null ? {} : { stability: asFiniteNumber(overrideVoiceSettings?.stability) }), ...(asFiniteNumber(overrideVoiceSettings?.similarityBoost) == null ? {} : { similarityBoost: asFiniteNumber(overrideVoiceSettings?.similarityBoost) }), ...(asFiniteNumber(overrideVoiceSettings?.style) == null ? {} : { style: asFiniteNumber(overrideVoiceSettings?.style) }), ...(asBoolean(overrideVoiceSettings?.useSpeakerBoost) == null ? {} : { useSpeakerBoost: asBoolean(overrideVoiceSettings?.useSpeakerBoost) }), ...(asFiniteNumber(overrideVoiceSettings?.speed) == null ? {} : { speed: asFiniteNumber(overrideVoiceSettings?.speed) }), }, timeoutMs: req.timeoutMs, }); return { audioBuffer, outputFormat, fileExtension: req.target === "voice-note" ? ".opus" : ".mp3", voiceCompatible: req.target === "voice-note", }; }, synthesizeTelephony: async (req) => { const config = readElevenLabsProviderConfig(req.providerConfig); const apiKey = config.apiKey || resolveElevenLabsApiKeyWithProfileFallback() || process.env.XI_API_KEY; if (!apiKey) { throw new Error("ElevenLabs API key missing"); } const outputFormat = "pcm_22050"; const sampleRate = 22_050; const audioBuffer = await elevenLabsTTS({ text: req.text, apiKey, baseUrl: config.baseUrl, voiceId: config.voiceId, modelId: config.modelId, outputFormat, seed: config.seed, applyTextNormalization: config.applyTextNormalization, languageCode: config.languageCode, voiceSettings: config.voiceSettings, timeoutMs: req.timeoutMs, }); return { audioBuffer, outputFormat, sampleRate }; }, }; }