Files
openclaw/extensions/elevenlabs/speech-provider.ts
2026-04-07 08:40:34 +01:00

528 lines
19 KiB
TypeScript

import { formatErrorMessage } from "openclaw/plugin-sdk/error-runtime";
import { normalizeResolvedSecretInputString } from "openclaw/plugin-sdk/secret-input";
import type {
SpeechDirectiveTokenParseContext,
SpeechProviderConfig,
SpeechProviderOverrides,
SpeechProviderPlugin,
SpeechVoiceOption,
} from "openclaw/plugin-sdk/speech";
import {
asBoolean,
asFiniteNumber,
asObject,
normalizeApplyTextNormalization,
normalizeLanguageCode,
normalizeSeed,
requireInRange,
trimToUndefined,
} from "openclaw/plugin-sdk/speech";
import { resolveElevenLabsApiKeyWithProfileFallback } from "./config-api.js";
import { isValidElevenLabsVoiceId, normalizeElevenLabsBaseUrl } from "./shared.js";
import { elevenLabsTTS } from "./tts.js";
const DEFAULT_ELEVENLABS_VOICE_ID = "pMsXgVXv3BLzUgSXRplE";
const DEFAULT_ELEVENLABS_MODEL_ID = "eleven_multilingual_v2";
const DEFAULT_ELEVENLABS_VOICE_SETTINGS = {
stability: 0.5,
similarityBoost: 0.75,
style: 0.0,
useSpeakerBoost: true,
speed: 1.0,
};
const ELEVENLABS_TTS_MODELS = [
"eleven_multilingual_v2",
"eleven_turbo_v2_5",
"eleven_monolingual_v1",
] as const;
type ElevenLabsProviderConfig = {
apiKey?: string;
baseUrl: string;
voiceId: string;
modelId: string;
seed?: number;
applyTextNormalization?: "auto" | "on" | "off";
languageCode?: string;
voiceSettings: {
stability: number;
similarityBoost: number;
style: number;
useSpeakerBoost: boolean;
speed: number;
};
};
function parseBooleanValue(value: string): boolean | undefined {
const normalized = value.trim().toLowerCase();
if (["true", "1", "yes", "on"].includes(normalized)) {
return true;
}
if (["false", "0", "no", "off"].includes(normalized)) {
return false;
}
return undefined;
}
function parseNumberValue(value: string): number | undefined {
const parsed = Number.parseFloat(value);
return Number.isFinite(parsed) ? parsed : undefined;
}
export const isValidVoiceId = isValidElevenLabsVoiceId;
function normalizeElevenLabsProviderConfig(
rawConfig: Record<string, unknown>,
): ElevenLabsProviderConfig {
const providers = asObject(rawConfig.providers);
const raw = asObject(providers?.elevenlabs) ?? asObject(rawConfig.elevenlabs);
const rawVoiceSettings = asObject(raw?.voiceSettings);
return {
apiKey: normalizeResolvedSecretInputString({
value: raw?.apiKey,
path: "messages.tts.providers.elevenlabs.apiKey",
}),
baseUrl: normalizeElevenLabsBaseUrl(trimToUndefined(raw?.baseUrl)),
voiceId: trimToUndefined(raw?.voiceId) ?? DEFAULT_ELEVENLABS_VOICE_ID,
modelId: trimToUndefined(raw?.modelId) ?? DEFAULT_ELEVENLABS_MODEL_ID,
seed: asFiniteNumber(raw?.seed),
applyTextNormalization: trimToUndefined(raw?.applyTextNormalization) as
| "auto"
| "on"
| "off"
| undefined,
languageCode: trimToUndefined(raw?.languageCode),
voiceSettings: {
stability:
asFiniteNumber(rawVoiceSettings?.stability) ?? DEFAULT_ELEVENLABS_VOICE_SETTINGS.stability,
similarityBoost:
asFiniteNumber(rawVoiceSettings?.similarityBoost) ??
DEFAULT_ELEVENLABS_VOICE_SETTINGS.similarityBoost,
style: asFiniteNumber(rawVoiceSettings?.style) ?? DEFAULT_ELEVENLABS_VOICE_SETTINGS.style,
useSpeakerBoost:
asBoolean(rawVoiceSettings?.useSpeakerBoost) ??
DEFAULT_ELEVENLABS_VOICE_SETTINGS.useSpeakerBoost,
speed: asFiniteNumber(rawVoiceSettings?.speed) ?? DEFAULT_ELEVENLABS_VOICE_SETTINGS.speed,
},
};
}
function readElevenLabsProviderConfig(config: SpeechProviderConfig): ElevenLabsProviderConfig {
const defaults = normalizeElevenLabsProviderConfig({});
const voiceSettings = asObject(config.voiceSettings);
return {
apiKey: trimToUndefined(config.apiKey) ?? defaults.apiKey,
baseUrl: normalizeElevenLabsBaseUrl(trimToUndefined(config.baseUrl) ?? defaults.baseUrl),
voiceId: trimToUndefined(config.voiceId) ?? defaults.voiceId,
modelId: trimToUndefined(config.modelId) ?? defaults.modelId,
seed: asFiniteNumber(config.seed) ?? defaults.seed,
applyTextNormalization:
(trimToUndefined(config.applyTextNormalization) as "auto" | "on" | "off" | undefined) ??
defaults.applyTextNormalization,
languageCode: trimToUndefined(config.languageCode) ?? defaults.languageCode,
voiceSettings: {
stability: asFiniteNumber(voiceSettings?.stability) ?? defaults.voiceSettings.stability,
similarityBoost:
asFiniteNumber(voiceSettings?.similarityBoost) ?? defaults.voiceSettings.similarityBoost,
style: asFiniteNumber(voiceSettings?.style) ?? defaults.voiceSettings.style,
useSpeakerBoost:
asBoolean(voiceSettings?.useSpeakerBoost) ?? defaults.voiceSettings.useSpeakerBoost,
speed: asFiniteNumber(voiceSettings?.speed) ?? defaults.voiceSettings.speed,
},
};
}
function mergeVoiceSettingsOverride(
ctx: SpeechDirectiveTokenParseContext,
next: Record<string, unknown>,
): SpeechProviderOverrides {
return {
...ctx.currentOverrides,
voiceSettings: {
...asObject(ctx.currentOverrides?.voiceSettings),
...next,
},
};
}
function parseDirectiveToken(ctx: SpeechDirectiveTokenParseContext) {
try {
switch (ctx.key) {
case "voiceid":
case "voice_id":
case "elevenlabs_voice":
case "elevenlabsvoice":
if (!ctx.policy.allowVoice) {
return { handled: true };
}
if (!isValidElevenLabsVoiceId(ctx.value)) {
return { handled: true, warnings: [`invalid ElevenLabs voiceId "${ctx.value}"`] };
}
return {
handled: true,
overrides: { ...ctx.currentOverrides, voiceId: ctx.value },
};
case "model":
case "modelid":
case "model_id":
case "elevenlabs_model":
case "elevenlabsmodel":
if (!ctx.policy.allowModelId) {
return { handled: true };
}
return {
handled: true,
overrides: { ...ctx.currentOverrides, modelId: ctx.value },
};
case "stability": {
if (!ctx.policy.allowVoiceSettings) {
return { handled: true };
}
const value = parseNumberValue(ctx.value);
if (value == null) {
return { handled: true, warnings: ["invalid stability value"] };
}
requireInRange(value, 0, 1, "stability");
return { handled: true, overrides: mergeVoiceSettingsOverride(ctx, { stability: value }) };
}
case "similarity":
case "similarityboost":
case "similarity_boost": {
if (!ctx.policy.allowVoiceSettings) {
return { handled: true };
}
const value = parseNumberValue(ctx.value);
if (value == null) {
return { handled: true, warnings: ["invalid similarityBoost value"] };
}
requireInRange(value, 0, 1, "similarityBoost");
return {
handled: true,
overrides: mergeVoiceSettingsOverride(ctx, { similarityBoost: value }),
};
}
case "style": {
if (!ctx.policy.allowVoiceSettings) {
return { handled: true };
}
const value = parseNumberValue(ctx.value);
if (value == null) {
return { handled: true, warnings: ["invalid style value"] };
}
requireInRange(value, 0, 1, "style");
return { handled: true, overrides: mergeVoiceSettingsOverride(ctx, { style: value }) };
}
case "speed": {
if (!ctx.policy.allowVoiceSettings) {
return { handled: true };
}
const value = parseNumberValue(ctx.value);
if (value == null) {
return { handled: true, warnings: ["invalid speed value"] };
}
requireInRange(value, 0.5, 2, "speed");
return { handled: true, overrides: mergeVoiceSettingsOverride(ctx, { speed: value }) };
}
case "speakerboost":
case "speaker_boost":
case "usespeakerboost":
case "use_speaker_boost": {
if (!ctx.policy.allowVoiceSettings) {
return { handled: true };
}
const value = parseBooleanValue(ctx.value);
if (value == null) {
return { handled: true, warnings: ["invalid useSpeakerBoost value"] };
}
return {
handled: true,
overrides: mergeVoiceSettingsOverride(ctx, { useSpeakerBoost: value }),
};
}
case "normalize":
case "applytextnormalization":
case "apply_text_normalization":
if (!ctx.policy.allowNormalization) {
return { handled: true };
}
return {
handled: true,
overrides: {
...ctx.currentOverrides,
applyTextNormalization: normalizeApplyTextNormalization(ctx.value),
},
};
case "language":
case "languagecode":
case "language_code":
if (!ctx.policy.allowNormalization) {
return { handled: true };
}
return {
handled: true,
overrides: {
...ctx.currentOverrides,
languageCode: normalizeLanguageCode(ctx.value),
},
};
case "seed":
if (!ctx.policy.allowSeed) {
return { handled: true };
}
return {
handled: true,
overrides: {
...ctx.currentOverrides,
seed: normalizeSeed(Number.parseInt(ctx.value, 10)),
},
};
default:
return { handled: false };
}
} catch (error) {
return {
handled: true,
warnings: [formatErrorMessage(error)],
};
}
}
export async function listElevenLabsVoices(params: {
apiKey: string;
baseUrl?: string;
}): Promise<SpeechVoiceOption[]> {
const res = await fetch(`${normalizeElevenLabsBaseUrl(params.baseUrl)}/v1/voices`, {
headers: {
"xi-api-key": params.apiKey,
},
});
if (!res.ok) {
throw new Error(`ElevenLabs voices API error (${res.status})`);
}
const json = (await res.json()) as {
voices?: Array<{
voice_id?: string;
name?: string;
category?: string;
description?: string;
}>;
};
return Array.isArray(json.voices)
? json.voices
.map((voice) => ({
id: voice.voice_id?.trim() ?? "",
name: trimToUndefined(voice.name),
category: trimToUndefined(voice.category),
description: trimToUndefined(voice.description),
}))
.filter((voice) => voice.id.length > 0)
: [];
}
export function buildElevenLabsSpeechProvider(): SpeechProviderPlugin {
return {
id: "elevenlabs",
label: "ElevenLabs",
autoSelectOrder: 20,
models: ELEVENLABS_TTS_MODELS,
resolveConfig: ({ rawConfig }) => normalizeElevenLabsProviderConfig(rawConfig),
parseDirectiveToken,
resolveTalkConfig: ({ baseTtsConfig, talkProviderConfig }) => {
const base = normalizeElevenLabsProviderConfig(baseTtsConfig);
const talkVoiceSettings = asObject(talkProviderConfig.voiceSettings);
const resolvedTalkApiKey =
talkProviderConfig.apiKey === undefined
? (resolveElevenLabsApiKeyWithProfileFallback() ?? undefined)
: normalizeResolvedSecretInputString({
value: talkProviderConfig.apiKey,
path: "talk.providers.elevenlabs.apiKey",
});
return {
...base,
...(resolvedTalkApiKey === undefined ? {} : { apiKey: resolvedTalkApiKey }),
...(trimToUndefined(talkProviderConfig.baseUrl) == null
? {}
: { baseUrl: normalizeElevenLabsBaseUrl(trimToUndefined(talkProviderConfig.baseUrl)) }),
...(trimToUndefined(talkProviderConfig.voiceId) == null
? {}
: { voiceId: trimToUndefined(talkProviderConfig.voiceId) }),
...(trimToUndefined(talkProviderConfig.modelId) == null
? {}
: { modelId: trimToUndefined(talkProviderConfig.modelId) }),
...(asFiniteNumber(talkProviderConfig.seed) == null
? {}
: { seed: asFiniteNumber(talkProviderConfig.seed) }),
...(trimToUndefined(talkProviderConfig.applyTextNormalization) == null
? {}
: {
applyTextNormalization: normalizeApplyTextNormalization(
trimToUndefined(talkProviderConfig.applyTextNormalization),
),
}),
...(trimToUndefined(talkProviderConfig.languageCode) == null
? {}
: {
languageCode: normalizeLanguageCode(trimToUndefined(talkProviderConfig.languageCode)),
}),
voiceSettings: {
...base.voiceSettings,
...(asFiniteNumber(talkVoiceSettings?.stability) == null
? {}
: { stability: asFiniteNumber(talkVoiceSettings?.stability) }),
...(asFiniteNumber(talkVoiceSettings?.similarityBoost) == null
? {}
: { similarityBoost: asFiniteNumber(talkVoiceSettings?.similarityBoost) }),
...(asFiniteNumber(talkVoiceSettings?.style) == null
? {}
: { style: asFiniteNumber(talkVoiceSettings?.style) }),
...(asBoolean(talkVoiceSettings?.useSpeakerBoost) == null
? {}
: { useSpeakerBoost: asBoolean(talkVoiceSettings?.useSpeakerBoost) }),
...(asFiniteNumber(talkVoiceSettings?.speed) == null
? {}
: { speed: asFiniteNumber(talkVoiceSettings?.speed) }),
},
};
},
resolveTalkOverrides: ({ params }) => {
const normalize = trimToUndefined(params.normalize);
const language = trimToUndefined(params.language)?.toLowerCase();
const latencyTier = asFiniteNumber(params.latencyTier);
const voiceSettings = {
...(asFiniteNumber(params.speed) == null ? {} : { speed: asFiniteNumber(params.speed) }),
...(asFiniteNumber(params.stability) == null
? {}
: { stability: asFiniteNumber(params.stability) }),
...(asFiniteNumber(params.similarity) == null
? {}
: { similarityBoost: asFiniteNumber(params.similarity) }),
...(asFiniteNumber(params.style) == null ? {} : { style: asFiniteNumber(params.style) }),
...(asBoolean(params.speakerBoost) == null
? {}
: { useSpeakerBoost: asBoolean(params.speakerBoost) }),
};
return {
...(trimToUndefined(params.voiceId) == null
? {}
: { voiceId: trimToUndefined(params.voiceId) }),
...(trimToUndefined(params.modelId) == null
? {}
: { modelId: trimToUndefined(params.modelId) }),
...(trimToUndefined(params.outputFormat) == null
? {}
: { outputFormat: trimToUndefined(params.outputFormat) }),
...(asFiniteNumber(params.seed) == null ? {} : { seed: asFiniteNumber(params.seed) }),
...(normalize == null
? {}
: { applyTextNormalization: normalizeApplyTextNormalization(normalize) }),
...(language == null ? {} : { languageCode: normalizeLanguageCode(language) }),
...(latencyTier == null ? {} : { latencyTier }),
...(Object.keys(voiceSettings).length === 0 ? {} : { voiceSettings }),
};
},
listVoices: async (req) => {
const config = req.providerConfig
? readElevenLabsProviderConfig(req.providerConfig)
: undefined;
const apiKey =
req.apiKey ||
config?.apiKey ||
resolveElevenLabsApiKeyWithProfileFallback() ||
process.env.XI_API_KEY;
if (!apiKey) {
throw new Error("ElevenLabs API key missing");
}
return listElevenLabsVoices({
apiKey,
baseUrl: req.baseUrl ?? config?.baseUrl,
});
},
isConfigured: ({ providerConfig }) =>
Boolean(
readElevenLabsProviderConfig(providerConfig).apiKey ||
resolveElevenLabsApiKeyWithProfileFallback() ||
process.env.XI_API_KEY,
),
synthesize: async (req) => {
const config = readElevenLabsProviderConfig(req.providerConfig);
const overrides = req.providerOverrides ?? {};
const apiKey =
config.apiKey || resolveElevenLabsApiKeyWithProfileFallback() || process.env.XI_API_KEY;
if (!apiKey) {
throw new Error("ElevenLabs API key missing");
}
const outputFormat =
trimToUndefined(overrides.outputFormat) ??
(req.target === "voice-note" ? "opus_48000_64" : "mp3_44100_128");
const overrideVoiceSettings = asObject(overrides.voiceSettings);
const latencyTier = asFiniteNumber(overrides.latencyTier);
const audioBuffer = await elevenLabsTTS({
text: req.text,
apiKey,
baseUrl: config.baseUrl,
voiceId: trimToUndefined(overrides.voiceId) ?? config.voiceId,
modelId: trimToUndefined(overrides.modelId) ?? config.modelId,
outputFormat,
seed: asFiniteNumber(overrides.seed) ?? config.seed,
applyTextNormalization:
(trimToUndefined(overrides.applyTextNormalization) as
| "auto"
| "on"
| "off"
| undefined) ?? config.applyTextNormalization,
languageCode: trimToUndefined(overrides.languageCode) ?? config.languageCode,
latencyTier,
voiceSettings: {
...config.voiceSettings,
...(asFiniteNumber(overrideVoiceSettings?.stability) == null
? {}
: { stability: asFiniteNumber(overrideVoiceSettings?.stability) }),
...(asFiniteNumber(overrideVoiceSettings?.similarityBoost) == null
? {}
: { similarityBoost: asFiniteNumber(overrideVoiceSettings?.similarityBoost) }),
...(asFiniteNumber(overrideVoiceSettings?.style) == null
? {}
: { style: asFiniteNumber(overrideVoiceSettings?.style) }),
...(asBoolean(overrideVoiceSettings?.useSpeakerBoost) == null
? {}
: { useSpeakerBoost: asBoolean(overrideVoiceSettings?.useSpeakerBoost) }),
...(asFiniteNumber(overrideVoiceSettings?.speed) == null
? {}
: { speed: asFiniteNumber(overrideVoiceSettings?.speed) }),
},
timeoutMs: req.timeoutMs,
});
return {
audioBuffer,
outputFormat,
fileExtension: req.target === "voice-note" ? ".opus" : ".mp3",
voiceCompatible: req.target === "voice-note",
};
},
synthesizeTelephony: async (req) => {
const config = readElevenLabsProviderConfig(req.providerConfig);
const apiKey =
config.apiKey || resolveElevenLabsApiKeyWithProfileFallback() || process.env.XI_API_KEY;
if (!apiKey) {
throw new Error("ElevenLabs API key missing");
}
const outputFormat = "pcm_22050";
const sampleRate = 22_050;
const audioBuffer = await elevenLabsTTS({
text: req.text,
apiKey,
baseUrl: config.baseUrl,
voiceId: config.voiceId,
modelId: config.modelId,
outputFormat,
seed: config.seed,
applyTextNormalization: config.applyTextNormalization,
languageCode: config.languageCode,
voiceSettings: config.voiceSettings,
timeoutMs: req.timeoutMs,
});
return { audioBuffer, outputFormat, sampleRate };
},
};
}