Files
openclaw/extensions/elevenlabs/speech-provider.ts
2026-03-29 21:20:46 +01:00

537 lines
19 KiB
TypeScript

import { normalizeResolvedSecretInputString } from "openclaw/plugin-sdk/secret-input";
import type {
SpeechDirectiveTokenParseContext,
SpeechProviderConfig,
SpeechProviderOverrides,
SpeechProviderPlugin,
SpeechVoiceOption,
} from "openclaw/plugin-sdk/speech";
import {
normalizeApplyTextNormalization,
normalizeLanguageCode,
normalizeSeed,
requireInRange,
} from "openclaw/plugin-sdk/speech";
import { elevenLabsTTS } from "./tts.js";
/** Base URL used whenever no non-empty base URL is supplied (see normalizeElevenLabsBaseUrl). */
const DEFAULT_ELEVENLABS_BASE_URL = "https://api.elevenlabs.io";
/** Voice ID used when the provider configuration does not name a voice. */
const DEFAULT_ELEVENLABS_VOICE_ID = "pMsXgVXv3BLzUgSXRplE";
/** Model ID used when the provider configuration does not name a model. */
const DEFAULT_ELEVENLABS_MODEL_ID = "eleven_multilingual_v2";
/** Per-field fallbacks applied when a voice setting is missing or has the wrong type. */
const DEFAULT_ELEVENLABS_VOICE_SETTINGS = {
stability: 0.5,
similarityBoost: 0.75,
style: 0.0,
useSpeakerBoost: true,
speed: 1.0,
};
/** Model IDs exposed through the plugin's `models` property. */
const ELEVENLABS_TTS_MODELS = [
"eleven_multilingual_v2",
"eleven_turbo_v2_5",
"eleven_monolingual_v1",
] as const;
/**
 * Fully resolved ElevenLabs provider configuration.
 *
 * Produced by normalizeElevenLabsProviderConfig / readElevenLabsProviderConfig,
 * which fill every non-optional field with a default when the raw input is
 * missing or has the wrong type.
 */
type ElevenLabsProviderConfig = {
// Optional: callers fall back to the ELEVENLABS_API_KEY / XI_API_KEY environment variables.
apiKey?: string;
// Stored without trailing slashes.
baseUrl: string;
voiceId: string;
modelId: string;
// Forwarded as-is to elevenLabsTTS when present.
seed?: number;
applyTextNormalization?: "auto" | "on" | "off";
languageCode?: string;
voiceSettings: {
stability: number;
similarityBoost: number;
style: number;
useSpeakerBoost: boolean;
speed: number;
};
};
/**
 * Returns the trimmed string, or `undefined` when `value` is not a string
 * or contains only whitespace.
 */
function trimToUndefined(value: unknown): string | undefined {
  if (typeof value !== "string") {
    return undefined;
  }
  const trimmed = value.trim();
  return trimmed.length > 0 ? trimmed : undefined;
}
/** Returns `value` when it is a finite number; otherwise `undefined` (NaN and ±Infinity are rejected). */
function asNumber(value: unknown): number | undefined {
  if (typeof value !== "number") {
    return undefined;
  }
  return Number.isFinite(value) ? value : undefined;
}
/** Returns `value` when it is a real boolean; any other type yields `undefined` (no truthiness coercion). */
function asBoolean(value: unknown): boolean | undefined {
  if (typeof value !== "boolean") {
    return undefined;
  }
  return value;
}
/**
 * Returns `value` typed as a string-keyed record when it is a non-null,
 * non-array object; otherwise `undefined`. The same reference is returned,
 * not a copy.
 */
function asObject(value: unknown): Record<string, unknown> | undefined {
  if (value === null || typeof value !== "object" || Array.isArray(value)) {
    return undefined;
  }
  return value as Record<string, unknown>;
}
/**
 * Parses a textual boolean, ignoring case and surrounding whitespace.
 *
 * @returns `true` for "true"/"1"/"yes"/"on", `false` for "false"/"0"/"no"/"off",
 *   and `undefined` for anything else.
 */
function parseBooleanValue(value: string): boolean | undefined {
  switch (value.trim().toLowerCase()) {
    case "true":
    case "1":
    case "yes":
    case "on":
      return true;
    case "false":
    case "0":
    case "no":
    case "off":
      return false;
    default:
      return undefined;
  }
}
/**
 * Parses a decimal number from a directive value.
 *
 * The whole (whitespace-trimmed) token must be a decimal literal: optional
 * sign, digits with an optional fraction, optional exponent. Unlike a bare
 * `Number.parseFloat`, input with trailing garbage such as "0.5abc" or "1,5"
 * is rejected instead of being silently truncated to its numeric prefix.
 *
 * @returns The finite parsed number, or `undefined` when the input is empty,
 *   malformed, or not finite.
 */
function parseNumberValue(value: string): number | undefined {
  const trimmed = value.trim();
  if (!/^[+-]?(?:\d+\.?\d*|\.\d+)(?:[eE][+-]?\d+)?$/.test(trimmed)) {
    return undefined;
  }
  const parsed = Number(trimmed);
  // A huge exponent (e.g. "1e999") passes the pattern but overflows to Infinity.
  return Number.isFinite(parsed) ? parsed : undefined;
}
/**
 * Checks whether `voiceId` has the shape accepted for an ElevenLabs voice ID:
 * 10 to 40 ASCII letters or digits and nothing else.
 */
export function isValidVoiceId(voiceId: string): boolean {
  const voiceIdPattern = /^[a-zA-Z0-9]{10,40}$/;
  return voiceIdPattern.test(voiceId);
}
/**
 * Normalizes a configured base URL: trims whitespace and strips trailing
 * slashes. Falls back to DEFAULT_ELEVENLABS_BASE_URL when the input is
 * missing or nothing is left after normalization.
 */
function normalizeElevenLabsBaseUrl(baseUrl: string | undefined): string {
  if (baseUrl == null) {
    return DEFAULT_ELEVENLABS_BASE_URL;
  }
  const withoutTrailingSlashes = baseUrl.trim().replace(/\/+$/, "");
  return withoutTrailingSlashes || DEFAULT_ELEVENLABS_BASE_URL;
}
/**
 * Resolves the ElevenLabs section of a raw TTS configuration into a complete
 * ElevenLabsProviderConfig.
 *
 * The section is read from `rawConfig.providers.elevenlabs`, falling back to
 * `rawConfig.elevenlabs`. Missing or wrongly typed fields take the module
 * defaults; `seed`, `applyTextNormalization` and `languageCode` stay
 * `undefined` when absent.
 *
 * @param rawConfig Untyped configuration object.
 * @returns The resolved configuration.
 */
function normalizeElevenLabsProviderConfig(
  rawConfig: Record<string, unknown>,
): ElevenLabsProviderConfig {
  const nestedSection = asObject(asObject(rawConfig.providers)?.elevenlabs);
  const section = nestedSection ?? asObject(rawConfig.elevenlabs);
  const settings = asObject(section?.voiceSettings);
  const fallback = DEFAULT_ELEVENLABS_VOICE_SETTINGS;

  const apiKey = normalizeResolvedSecretInputString({
    value: section?.apiKey,
    path: "messages.tts.providers.elevenlabs.apiKey",
  });
  // NOTE(review): the value is only trimmed, not checked against the allowed literals.
  const applyTextNormalization = trimToUndefined(section?.applyTextNormalization) as
    | "auto"
    | "on"
    | "off"
    | undefined;

  return {
    apiKey,
    baseUrl: normalizeElevenLabsBaseUrl(trimToUndefined(section?.baseUrl)),
    voiceId: trimToUndefined(section?.voiceId) ?? DEFAULT_ELEVENLABS_VOICE_ID,
    modelId: trimToUndefined(section?.modelId) ?? DEFAULT_ELEVENLABS_MODEL_ID,
    seed: asNumber(section?.seed),
    applyTextNormalization,
    languageCode: trimToUndefined(section?.languageCode),
    voiceSettings: {
      stability: asNumber(settings?.stability) ?? fallback.stability,
      similarityBoost: asNumber(settings?.similarityBoost) ?? fallback.similarityBoost,
      style: asNumber(settings?.style) ?? fallback.style,
      useSpeakerBoost: asBoolean(settings?.useSpeakerBoost) ?? fallback.useSpeakerBoost,
      speed: asNumber(settings?.speed) ?? fallback.speed,
    },
  };
}
/**
 * Re-reads an already resolved (but loosely typed) provider configuration,
 * validating each field again and substituting the module defaults for
 * anything missing or wrongly typed.
 *
 * @param config Provider configuration as handed to the plugin callbacks.
 * @returns A complete ElevenLabsProviderConfig.
 */
function readElevenLabsProviderConfig(config: SpeechProviderConfig): ElevenLabsProviderConfig {
  // Normalizing an empty object yields the pure default configuration.
  const fallback = normalizeElevenLabsProviderConfig({});
  const fallbackSettings = fallback.voiceSettings;
  const rawSettings = asObject(config.voiceSettings);
  const configuredNormalization = trimToUndefined(config.applyTextNormalization) as
    | "auto"
    | "on"
    | "off"
    | undefined;

  return {
    apiKey: trimToUndefined(config.apiKey) ?? fallback.apiKey,
    baseUrl: normalizeElevenLabsBaseUrl(trimToUndefined(config.baseUrl) ?? fallback.baseUrl),
    voiceId: trimToUndefined(config.voiceId) ?? fallback.voiceId,
    modelId: trimToUndefined(config.modelId) ?? fallback.modelId,
    seed: asNumber(config.seed) ?? fallback.seed,
    applyTextNormalization: configuredNormalization ?? fallback.applyTextNormalization,
    languageCode: trimToUndefined(config.languageCode) ?? fallback.languageCode,
    voiceSettings: {
      stability: asNumber(rawSettings?.stability) ?? fallbackSettings.stability,
      similarityBoost: asNumber(rawSettings?.similarityBoost) ?? fallbackSettings.similarityBoost,
      style: asNumber(rawSettings?.style) ?? fallbackSettings.style,
      useSpeakerBoost: asBoolean(rawSettings?.useSpeakerBoost) ?? fallbackSettings.useSpeakerBoost,
      speed: asNumber(rawSettings?.speed) ?? fallbackSettings.speed,
    },
  };
}
/**
 * Returns a copy of the current overrides in which `voiceSettings` is the
 * existing voice-settings override (if it is an object) with the entries of
 * `next` layered on top. Nothing is mutated.
 *
 * @param ctx Directive parse context supplying `currentOverrides`.
 * @param next Voice-setting entries to add or replace.
 */
function mergeVoiceSettingsOverride(
  ctx: SpeechDirectiveTokenParseContext,
  next: Record<string, unknown>,
): SpeechProviderOverrides {
  const inherited = ctx.currentOverrides ?? {};
  const voiceSettings = {
    ...(asObject(ctx.currentOverrides?.voiceSettings) ?? {}),
    ...next,
  };
  return { ...inherited, voiceSettings };
}
/**
 * Interprets one `key=value` speech directive for the ElevenLabs provider.
 *
 * Result shape:
 * - `{ handled: false }` when the key is not one this provider knows;
 * - `{ handled: true }` when the key is known but the matching policy flag
 *   disallows it (the directive is consumed without effect);
 * - `{ handled: true, warnings }` when the value is rejected;
 * - `{ handled: true, overrides }` with the updated overrides on success.
 *
 * Any exception raised while processing (presumably by requireInRange and
 * the normalize* helpers on bad input — confirm against the SDK) is caught
 * and reported as a warning rather than propagated.
 */
function parseDirectiveToken(ctx: SpeechDirectiveTokenParseContext) {
  try {
    const { key, policy, value: rawValue } = ctx;
    const inherited = ctx.currentOverrides ?? {};

    // Shared handling for the numeric voice settings; `field` doubles as the
    // override key, the range-check label and the name used in the warning.
    const numericVoiceSetting = (field: string, min: number, max: number) => {
      if (!policy.allowVoiceSettings) {
        return { handled: true };
      }
      const parsed = parseNumberValue(rawValue);
      if (parsed == null) {
        return { handled: true, warnings: [`invalid ${field} value`] };
      }
      requireInRange(parsed, min, max, field);
      return { handled: true, overrides: mergeVoiceSettingsOverride(ctx, { [field]: parsed }) };
    };

    switch (key) {
      case "voiceid":
      case "voice_id":
      case "elevenlabs_voice":
      case "elevenlabsvoice": {
        if (policy.allowVoice) {
          return isValidVoiceId(rawValue)
            ? { handled: true, overrides: { ...inherited, voiceId: rawValue } }
            : { handled: true, warnings: [`invalid ElevenLabs voiceId "${rawValue}"`] };
        }
        return { handled: true };
      }

      case "model":
      case "modelid":
      case "model_id":
      case "elevenlabs_model":
      case "elevenlabsmodel": {
        if (policy.allowModelId) {
          return { handled: true, overrides: { ...inherited, modelId: rawValue } };
        }
        return { handled: true };
      }

      case "stability":
        return numericVoiceSetting("stability", 0, 1);

      case "similarity":
      case "similarityboost":
      case "similarity_boost":
        return numericVoiceSetting("similarityBoost", 0, 1);

      case "style":
        return numericVoiceSetting("style", 0, 1);

      case "speed":
        return numericVoiceSetting("speed", 0.5, 2);

      case "speakerboost":
      case "speaker_boost":
      case "usespeakerboost":
      case "use_speaker_boost": {
        if (!policy.allowVoiceSettings) {
          return { handled: true };
        }
        const flag = parseBooleanValue(rawValue);
        return flag == null
          ? { handled: true, warnings: ["invalid useSpeakerBoost value"] }
          : { handled: true, overrides: mergeVoiceSettingsOverride(ctx, { useSpeakerBoost: flag }) };
      }

      case "normalize":
      case "applytextnormalization":
      case "apply_text_normalization": {
        if (policy.allowNormalization) {
          return {
            handled: true,
            overrides: {
              ...inherited,
              applyTextNormalization: normalizeApplyTextNormalization(rawValue),
            },
          };
        }
        return { handled: true };
      }

      case "language":
      case "languagecode":
      case "language_code": {
        // Language overrides are gated by the same policy flag as normalization.
        if (policy.allowNormalization) {
          return {
            handled: true,
            overrides: { ...inherited, languageCode: normalizeLanguageCode(rawValue) },
          };
        }
        return { handled: true };
      }

      case "seed": {
        if (policy.allowSeed) {
          return {
            handled: true,
            overrides: { ...inherited, seed: normalizeSeed(Number.parseInt(rawValue, 10)) },
          };
        }
        return { handled: true };
      }

      default:
        return { handled: false };
    }
  } catch (error) {
    const message = error instanceof Error ? error.message : String(error);
    return { handled: true, warnings: [message] };
  }
}
/**
 * Fetches the voices available to an API key from `GET {baseUrl}/v1/voices`.
 *
 * The response body is treated as untrusted: a body that is not an object,
 * a missing or non-array `voices` field, entries that are not objects, and
 * fields that are not strings are all tolerated instead of raising a
 * TypeError. Entries without a non-empty string `voice_id` are dropped.
 *
 * @param params.apiKey Sent in the `xi-api-key` request header.
 * @param params.baseUrl Optional API base URL; normalized, with the default used when absent.
 * @returns One option per usable voice, with whitespace-trimmed fields.
 * @throws Error when the HTTP response status is not OK.
 */
export async function listElevenLabsVoices(params: {
  apiKey: string;
  baseUrl?: string;
}): Promise<SpeechVoiceOption[]> {
  const res = await fetch(`${normalizeElevenLabsBaseUrl(params.baseUrl)}/v1/voices`, {
    headers: {
      "xi-api-key": params.apiKey,
    },
  });
  if (!res.ok) {
    throw new Error(`ElevenLabs voices API error (${res.status})`);
  }

  const json: unknown = await res.json();
  const voices =
    typeof json === "object" && json !== null ? (json as { voices?: unknown }).voices : undefined;
  if (!Array.isArray(voices)) {
    return [];
  }

  // Trimmed text for non-blank strings; undefined for everything else.
  const text = (value: unknown): string | undefined => {
    if (typeof value !== "string") {
      return undefined;
    }
    const trimmed = value.trim();
    return trimmed.length > 0 ? trimmed : undefined;
  };

  const options: SpeechVoiceOption[] = [];
  for (const entry of voices as unknown[]) {
    if (typeof entry !== "object" || entry === null) {
      continue;
    }
    const voice = entry as Record<string, unknown>;
    const id = text(voice.voice_id);
    if (id === undefined) {
      continue;
    }
    options.push({
      id,
      name: text(voice.name),
      category: text(voice.category),
      description: text(voice.description),
    });
  }
  return options;
}
/**
 * Builds the ElevenLabs speech provider plugin.
 *
 * The returned plugin resolves configuration, parses inline directives,
 * lists voices and synthesizes audio through elevenLabsTTS. In every code
 * path the API key comes from the explicit/configured value first and then
 * from the ELEVENLABS_API_KEY or XI_API_KEY environment variables.
 */
export function buildElevenLabsSpeechProvider(): SpeechProviderPlugin {
  // First non-empty explicit candidate, else the environment variables.
  const resolveApiKey = (
    ...candidates: Array<string | null | undefined>
  ): string | undefined =>
    candidates.find((candidate) => Boolean(candidate)) ||
    process.env.ELEVENLABS_API_KEY ||
    process.env.XI_API_KEY;

  // Extracts only the well-typed voice settings from `source`; invalid or
  // missing entries are omitted so they never clobber the values underneath.
  const pickVoiceSettings = (source: Record<string, unknown> | undefined) => {
    const stability = asNumber(source?.stability);
    const similarityBoost = asNumber(source?.similarityBoost);
    const style = asNumber(source?.style);
    const useSpeakerBoost = asBoolean(source?.useSpeakerBoost);
    const speed = asNumber(source?.speed);
    return {
      ...(stability == null ? {} : { stability }),
      ...(similarityBoost == null ? {} : { similarityBoost }),
      ...(style == null ? {} : { style }),
      ...(useSpeakerBoost == null ? {} : { useSpeakerBoost }),
      ...(speed == null ? {} : { speed }),
    };
  };

  // Codec prefix of an output format id, e.g. "opus" for "opus_48000_64".
  const codecOf = (outputFormat: string): string =>
    outputFormat.toLowerCase().split("_")[0] ?? "";

  // File extension matching the codec; `fallback` is used for unknown codecs.
  const extensionForCodec = (codec: string, fallback: string): string => {
    switch (codec) {
      case "mp3":
        return ".mp3";
      case "opus":
        return ".opus";
      case "pcm":
        return ".pcm";
      case "ulaw":
        return ".ulaw";
      case "alaw":
        return ".alaw";
      default:
        return fallback;
    }
  };

  return {
    id: "elevenlabs",
    label: "ElevenLabs",
    autoSelectOrder: 20,
    models: ELEVENLABS_TTS_MODELS,
    resolveConfig: ({ rawConfig }) => normalizeElevenLabsProviderConfig(rawConfig),
    parseDirectiveToken,

    // Layers the talk-mode provider settings over the base TTS configuration.
    resolveTalkConfig: ({ baseTtsConfig, talkProviderConfig }) => {
      const base = normalizeElevenLabsProviderConfig(baseTtsConfig);
      const baseUrl = trimToUndefined(talkProviderConfig.baseUrl);
      const voiceId = trimToUndefined(talkProviderConfig.voiceId);
      const modelId = trimToUndefined(talkProviderConfig.modelId);
      const seed = asNumber(talkProviderConfig.seed);
      const normalization = trimToUndefined(talkProviderConfig.applyTextNormalization);
      const languageCode = trimToUndefined(talkProviderConfig.languageCode);
      return {
        ...base,
        // Any explicitly present apiKey replaces the base one, even when it
        // normalizes to undefined.
        ...(talkProviderConfig.apiKey === undefined
          ? {}
          : {
              apiKey: normalizeResolvedSecretInputString({
                value: talkProviderConfig.apiKey,
                path: "talk.providers.elevenlabs.apiKey",
              }),
            }),
        ...(baseUrl == null ? {} : { baseUrl: normalizeElevenLabsBaseUrl(baseUrl) }),
        ...(voiceId == null ? {} : { voiceId }),
        ...(modelId == null ? {} : { modelId }),
        ...(seed == null ? {} : { seed }),
        ...(normalization == null
          ? {}
          : { applyTextNormalization: normalizeApplyTextNormalization(normalization) }),
        ...(languageCode == null ? {} : { languageCode: normalizeLanguageCode(languageCode) }),
        voiceSettings: {
          ...base.voiceSettings,
          ...pickVoiceSettings(asObject(talkProviderConfig.voiceSettings)),
        },
      };
    },

    // Translates talk-mode request parameters into provider overrides; only
    // parameters that are present and well-typed produce a key.
    resolveTalkOverrides: ({ params }) => {
      const voiceId = trimToUndefined(params.voiceId);
      const modelId = trimToUndefined(params.modelId);
      const outputFormat = trimToUndefined(params.outputFormat);
      const seed = asNumber(params.seed);
      const normalize = trimToUndefined(params.normalize);
      const language = trimToUndefined(params.language)?.toLowerCase();

      // Talk parameters use shorter names (`similarity`, `speakerBoost`) than
      // the voice-settings keys they map to.
      const speed = asNumber(params.speed);
      const stability = asNumber(params.stability);
      const similarityBoost = asNumber(params.similarity);
      const style = asNumber(params.style);
      const useSpeakerBoost = asBoolean(params.speakerBoost);
      const voiceSettings = {
        ...(speed == null ? {} : { speed }),
        ...(stability == null ? {} : { stability }),
        ...(similarityBoost == null ? {} : { similarityBoost }),
        ...(style == null ? {} : { style }),
        ...(useSpeakerBoost == null ? {} : { useSpeakerBoost }),
      };

      return {
        ...(voiceId == null ? {} : { voiceId }),
        ...(modelId == null ? {} : { modelId }),
        ...(outputFormat == null ? {} : { outputFormat }),
        ...(seed == null ? {} : { seed }),
        ...(normalize == null
          ? {}
          : { applyTextNormalization: normalizeApplyTextNormalization(normalize) }),
        ...(language == null ? {} : { languageCode: normalizeLanguageCode(language) }),
        ...(Object.keys(voiceSettings).length === 0 ? {} : { voiceSettings }),
      };
    },

    listVoices: async (req) => {
      const config = req.providerConfig
        ? readElevenLabsProviderConfig(req.providerConfig)
        : undefined;
      const apiKey = resolveApiKey(req.apiKey, config?.apiKey);
      if (!apiKey) {
        throw new Error("ElevenLabs API key missing");
      }
      return listElevenLabsVoices({
        apiKey,
        baseUrl: req.baseUrl ?? config?.baseUrl,
      });
    },

    isConfigured: ({ providerConfig }) =>
      Boolean(resolveApiKey(readElevenLabsProviderConfig(providerConfig).apiKey)),

    synthesize: async (req) => {
      const config = readElevenLabsProviderConfig(req.providerConfig);
      const overrides = req.providerOverrides ?? {};
      const apiKey = resolveApiKey(config.apiKey);
      if (!apiKey) {
        throw new Error("ElevenLabs API key missing");
      }
      const isVoiceNote = req.target === "voice-note";
      const outputFormat =
        trimToUndefined(overrides.outputFormat) ??
        (isVoiceNote ? "opus_48000_64" : "mp3_44100_128");
      const audioBuffer = await elevenLabsTTS({
        text: req.text,
        apiKey,
        baseUrl: config.baseUrl,
        voiceId: trimToUndefined(overrides.voiceId) ?? config.voiceId,
        modelId: trimToUndefined(overrides.modelId) ?? config.modelId,
        outputFormat,
        seed: asNumber(overrides.seed) ?? config.seed,
        applyTextNormalization:
          (trimToUndefined(overrides.applyTextNormalization) as
            | "auto"
            | "on"
            | "off"
            | undefined) ?? config.applyTextNormalization,
        languageCode: trimToUndefined(overrides.languageCode) ?? config.languageCode,
        voiceSettings: {
          ...config.voiceSettings,
          ...pickVoiceSettings(asObject(overrides.voiceSettings)),
        },
        timeoutMs: req.timeoutMs,
      });
      // Describe the audio that was actually requested: an outputFormat
      // override may differ from the target's default codec, so the extension
      // must follow the effective format rather than the target alone.
      const codec = codecOf(outputFormat);
      return {
        audioBuffer,
        outputFormat,
        fileExtension: extensionForCodec(codec, isVoiceNote ? ".opus" : ".mp3"),
        // NOTE(review): assumes voice-note delivery needs Opus audio, so a
        // non-Opus override is no longer flagged compatible — confirm with the
        // consumers of `voiceCompatible`.
        voiceCompatible: isVoiceNote && codec === "opus",
      };
    },

    // Telephony output is always raw PCM at 22.05 kHz; provider overrides are
    // not consulted on this path.
    synthesizeTelephony: async (req) => {
      const config = readElevenLabsProviderConfig(req.providerConfig);
      const apiKey = resolveApiKey(config.apiKey);
      if (!apiKey) {
        throw new Error("ElevenLabs API key missing");
      }
      const outputFormat = "pcm_22050";
      const sampleRate = 22_050;
      const audioBuffer = await elevenLabsTTS({
        text: req.text,
        apiKey,
        baseUrl: config.baseUrl,
        voiceId: config.voiceId,
        modelId: config.modelId,
        outputFormat,
        seed: config.seed,
        applyTextNormalization: config.applyTextNormalization,
        languageCode: config.languageCode,
        voiceSettings: config.voiceSettings,
        timeoutMs: req.timeoutMs,
      });
      return { audioBuffer, outputFormat, sampleRate };
    },
  };
}