refactor: plugin-own speech provider config

Peter Steinberger
2026-03-26 22:27:17 +00:00
parent 8eeb7f0829
commit 2c6d099b01
28 changed files with 1791 additions and 1229 deletions

View File

@@ -645,11 +645,10 @@ export class DiscordVoiceManager {
cfg: this.params.cfg,
override: this.params.discordConfig.voice?.tts,
});
const directive = parseTtsDirectives(
replyText,
ttsConfig.modelOverrides,
ttsConfig.openai.baseUrl,
);
const directive = parseTtsDirectives(replyText, ttsConfig.modelOverrides, {
cfg: ttsCfg,
providerConfigs: ttsConfig.providerConfigs,
});
const speakText = directive.overrides.ttsText ?? directive.cleanedText.trim();
if (!speakText) {
logVoiceVerbose(

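For context, the call-site change above replaces the OpenAI-specific base URL argument with a provider-agnostic context object. A rough sketch of the new shape, assuming a signature along these lines (the real parameter types live in the speech-core plugin SDK and are not shown in this hunk):

// Sketch only: the type and declaration below are illustrative assumptions;
// only the call shape and the fields read from `directive` appear in the hunk.
type TtsDirectiveParseContext = {
  cfg: unknown; // runtime config object passed through from the caller
  providerConfigs: Record<string, Record<string, unknown>>; // per-provider config blocks
};

// Old: parseTtsDirectives(replyText, modelOverrides, openaiBaseUrl)
// New: parseTtsDirectives(replyText, modelOverrides, context)
declare function parseTtsDirectives(
  text: string,
  modelOverrides: unknown,
  context: TtsDirectiveParseContext,
): { cleanedText: string; overrides: { ttsText?: string } };
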
View File

@@ -1,16 +1,309 @@
import type { SpeechProviderPlugin } from "openclaw/plugin-sdk/core";
import type { SpeechVoiceOption } from "openclaw/plugin-sdk/speech";
import { normalizeResolvedSecretInputString } from "openclaw/plugin-sdk/secret-input";
import type {
SpeechDirectiveTokenParseContext,
SpeechProviderConfig,
SpeechProviderOverrides,
SpeechProviderPlugin,
SpeechVoiceOption,
} from "openclaw/plugin-sdk/speech-core";
import {
normalizeApplyTextNormalization,
normalizeLanguageCode,
normalizeSeed,
requireInRange,
} from "openclaw/plugin-sdk/speech-core";
import { elevenLabsTTS } from "./tts.js";
const DEFAULT_ELEVENLABS_BASE_URL = "https://api.elevenlabs.io";
const DEFAULT_ELEVENLABS_VOICE_ID = "pMsXgVXv3BLzUgSXRplE";
const DEFAULT_ELEVENLABS_MODEL_ID = "eleven_multilingual_v2";
const DEFAULT_ELEVENLABS_VOICE_SETTINGS = {
stability: 0.5,
similarityBoost: 0.75,
style: 0.0,
useSpeakerBoost: true,
speed: 1.0,
};
const ELEVENLABS_TTS_MODELS = [
"eleven_multilingual_v2",
"eleven_turbo_v2_5",
"eleven_monolingual_v1",
] as const;
type ElevenLabsProviderConfig = {
apiKey?: string;
baseUrl: string;
voiceId: string;
modelId: string;
seed?: number;
applyTextNormalization?: "auto" | "on" | "off";
languageCode?: string;
voiceSettings: {
stability: number;
similarityBoost: number;
style: number;
useSpeakerBoost: boolean;
speed: number;
};
};
function trimToUndefined(value: unknown): string | undefined {
return typeof value === "string" && value.trim() ? value.trim() : undefined;
}
function asNumber(value: unknown): number | undefined {
return typeof value === "number" && Number.isFinite(value) ? value : undefined;
}
function asBoolean(value: unknown): boolean | undefined {
return typeof value === "boolean" ? value : undefined;
}
function asObject(value: unknown): Record<string, unknown> | undefined {
return typeof value === "object" && value !== null && !Array.isArray(value)
? (value as Record<string, unknown>)
: undefined;
}
function parseBooleanValue(value: string): boolean | undefined {
const normalized = value.trim().toLowerCase();
if (["true", "1", "yes", "on"].includes(normalized)) {
return true;
}
if (["false", "0", "no", "off"].includes(normalized)) {
return false;
}
return undefined;
}
function parseNumberValue(value: string): number | undefined {
const parsed = Number.parseFloat(value);
return Number.isFinite(parsed) ? parsed : undefined;
}
export function isValidVoiceId(voiceId: string): boolean {
return /^[a-zA-Z0-9]{10,40}$/.test(voiceId);
}
function normalizeElevenLabsBaseUrl(baseUrl: string | undefined): string {
const trimmed = baseUrl?.trim();
return trimmed?.replace(/\/+$/, "") || "https://api.elevenlabs.io";
return trimmed?.replace(/\/+$/, "") || DEFAULT_ELEVENLABS_BASE_URL;
}
function normalizeElevenLabsProviderConfig(
rawConfig: Record<string, unknown>,
): ElevenLabsProviderConfig {
const raw = asObject(rawConfig.elevenlabs);
const rawVoiceSettings = asObject(raw?.voiceSettings);
return {
apiKey: normalizeResolvedSecretInputString({
value: raw?.apiKey,
path: "messages.tts.elevenlabs.apiKey",
}),
baseUrl: normalizeElevenLabsBaseUrl(trimToUndefined(raw?.baseUrl)),
voiceId: trimToUndefined(raw?.voiceId) ?? DEFAULT_ELEVENLABS_VOICE_ID,
modelId: trimToUndefined(raw?.modelId) ?? DEFAULT_ELEVENLABS_MODEL_ID,
seed: asNumber(raw?.seed),
applyTextNormalization: trimToUndefined(raw?.applyTextNormalization) as
| "auto"
| "on"
| "off"
| undefined,
languageCode: trimToUndefined(raw?.languageCode),
voiceSettings: {
stability:
asNumber(rawVoiceSettings?.stability) ?? DEFAULT_ELEVENLABS_VOICE_SETTINGS.stability,
similarityBoost:
asNumber(rawVoiceSettings?.similarityBoost) ??
DEFAULT_ELEVENLABS_VOICE_SETTINGS.similarityBoost,
style: asNumber(rawVoiceSettings?.style) ?? DEFAULT_ELEVENLABS_VOICE_SETTINGS.style,
useSpeakerBoost:
asBoolean(rawVoiceSettings?.useSpeakerBoost) ??
DEFAULT_ELEVENLABS_VOICE_SETTINGS.useSpeakerBoost,
speed: asNumber(rawVoiceSettings?.speed) ?? DEFAULT_ELEVENLABS_VOICE_SETTINGS.speed,
},
};
}
function readElevenLabsProviderConfig(config: SpeechProviderConfig): ElevenLabsProviderConfig {
const defaults = normalizeElevenLabsProviderConfig({});
const voiceSettings = asObject(config.voiceSettings);
return {
apiKey: trimToUndefined(config.apiKey) ?? defaults.apiKey,
baseUrl: normalizeElevenLabsBaseUrl(trimToUndefined(config.baseUrl) ?? defaults.baseUrl),
voiceId: trimToUndefined(config.voiceId) ?? defaults.voiceId,
modelId: trimToUndefined(config.modelId) ?? defaults.modelId,
seed: asNumber(config.seed) ?? defaults.seed,
applyTextNormalization:
(trimToUndefined(config.applyTextNormalization) as "auto" | "on" | "off" | undefined) ??
defaults.applyTextNormalization,
languageCode: trimToUndefined(config.languageCode) ?? defaults.languageCode,
voiceSettings: {
stability: asNumber(voiceSettings?.stability) ?? defaults.voiceSettings.stability,
similarityBoost:
asNumber(voiceSettings?.similarityBoost) ?? defaults.voiceSettings.similarityBoost,
style: asNumber(voiceSettings?.style) ?? defaults.voiceSettings.style,
useSpeakerBoost:
asBoolean(voiceSettings?.useSpeakerBoost) ?? defaults.voiceSettings.useSpeakerBoost,
speed: asNumber(voiceSettings?.speed) ?? defaults.voiceSettings.speed,
},
};
}
function mergeVoiceSettingsOverride(
ctx: SpeechDirectiveTokenParseContext,
next: Record<string, unknown>,
): SpeechProviderOverrides {
return {
...(ctx.currentOverrides ?? {}),
voiceSettings: {
...(asObject(ctx.currentOverrides?.voiceSettings) ?? {}),
...next,
},
};
}
function parseDirectiveToken(ctx: SpeechDirectiveTokenParseContext) {
try {
switch (ctx.key) {
case "voiceid":
case "voice_id":
case "elevenlabs_voice":
case "elevenlabsvoice":
if (!ctx.policy.allowVoice) {
return { handled: true };
}
if (!isValidVoiceId(ctx.value)) {
return { handled: true, warnings: [`invalid ElevenLabs voiceId "${ctx.value}"`] };
}
return {
handled: true,
overrides: { ...(ctx.currentOverrides ?? {}), voiceId: ctx.value },
};
case "model":
case "modelid":
case "model_id":
case "elevenlabs_model":
case "elevenlabsmodel":
if (!ctx.policy.allowModelId) {
return { handled: true };
}
return {
handled: true,
overrides: { ...(ctx.currentOverrides ?? {}), modelId: ctx.value },
};
case "stability": {
if (!ctx.policy.allowVoiceSettings) {
return { handled: true };
}
const value = parseNumberValue(ctx.value);
if (value == null) {
return { handled: true, warnings: ["invalid stability value"] };
}
requireInRange(value, 0, 1, "stability");
return { handled: true, overrides: mergeVoiceSettingsOverride(ctx, { stability: value }) };
}
case "similarity":
case "similarityboost":
case "similarity_boost": {
if (!ctx.policy.allowVoiceSettings) {
return { handled: true };
}
const value = parseNumberValue(ctx.value);
if (value == null) {
return { handled: true, warnings: ["invalid similarityBoost value"] };
}
requireInRange(value, 0, 1, "similarityBoost");
return {
handled: true,
overrides: mergeVoiceSettingsOverride(ctx, { similarityBoost: value }),
};
}
case "style": {
if (!ctx.policy.allowVoiceSettings) {
return { handled: true };
}
const value = parseNumberValue(ctx.value);
if (value == null) {
return { handled: true, warnings: ["invalid style value"] };
}
requireInRange(value, 0, 1, "style");
return { handled: true, overrides: mergeVoiceSettingsOverride(ctx, { style: value }) };
}
case "speed": {
if (!ctx.policy.allowVoiceSettings) {
return { handled: true };
}
const value = parseNumberValue(ctx.value);
if (value == null) {
return { handled: true, warnings: ["invalid speed value"] };
}
requireInRange(value, 0.5, 2, "speed");
return { handled: true, overrides: mergeVoiceSettingsOverride(ctx, { speed: value }) };
}
case "speakerboost":
case "speaker_boost":
case "usespeakerboost":
case "use_speaker_boost": {
if (!ctx.policy.allowVoiceSettings) {
return { handled: true };
}
const value = parseBooleanValue(ctx.value);
if (value == null) {
return { handled: true, warnings: ["invalid useSpeakerBoost value"] };
}
return {
handled: true,
overrides: mergeVoiceSettingsOverride(ctx, { useSpeakerBoost: value }),
};
}
case "normalize":
case "applytextnormalization":
case "apply_text_normalization":
if (!ctx.policy.allowNormalization) {
return { handled: true };
}
return {
handled: true,
overrides: {
...(ctx.currentOverrides ?? {}),
applyTextNormalization: normalizeApplyTextNormalization(ctx.value),
},
};
case "language":
case "languagecode":
case "language_code":
if (!ctx.policy.allowNormalization) {
return { handled: true };
}
return {
handled: true,
overrides: {
...(ctx.currentOverrides ?? {}),
languageCode: normalizeLanguageCode(ctx.value),
},
};
case "seed":
if (!ctx.policy.allowSeed) {
return { handled: true };
}
return {
handled: true,
overrides: {
...(ctx.currentOverrides ?? {}),
seed: normalizeSeed(Number.parseInt(ctx.value, 10)),
},
};
default:
return { handled: false };
}
} catch (error) {
return {
handled: true,
warnings: [error instanceof Error ? error.message : String(error)],
};
}
}
export async function listElevenLabsVoices(params: {
@@ -49,49 +342,164 @@ export function buildElevenLabsSpeechProvider(): SpeechProviderPlugin {
return {
id: "elevenlabs",
label: "ElevenLabs",
autoSelectOrder: 20,
models: ELEVENLABS_TTS_MODELS,
resolveConfig: ({ rawConfig }) => normalizeElevenLabsProviderConfig(rawConfig),
parseDirectiveToken,
resolveTalkConfig: ({ baseTtsConfig, talkProviderConfig }) => {
const base = normalizeElevenLabsProviderConfig(baseTtsConfig);
const talkVoiceSettings = asObject(talkProviderConfig.voiceSettings);
return {
...base,
...(talkProviderConfig.apiKey === undefined
? {}
: {
apiKey: normalizeResolvedSecretInputString({
value: talkProviderConfig.apiKey,
path: "talk.providers.elevenlabs.apiKey",
}),
}),
...(trimToUndefined(talkProviderConfig.baseUrl) == null
? {}
: { baseUrl: normalizeElevenLabsBaseUrl(trimToUndefined(talkProviderConfig.baseUrl)) }),
...(trimToUndefined(talkProviderConfig.voiceId) == null
? {}
: { voiceId: trimToUndefined(talkProviderConfig.voiceId) }),
...(trimToUndefined(talkProviderConfig.modelId) == null
? {}
: { modelId: trimToUndefined(talkProviderConfig.modelId) }),
...(asNumber(talkProviderConfig.seed) == null
? {}
: { seed: asNumber(talkProviderConfig.seed) }),
...(trimToUndefined(talkProviderConfig.applyTextNormalization) == null
? {}
: {
applyTextNormalization: normalizeApplyTextNormalization(
trimToUndefined(talkProviderConfig.applyTextNormalization),
),
}),
...(trimToUndefined(talkProviderConfig.languageCode) == null
? {}
: {
languageCode: normalizeLanguageCode(trimToUndefined(talkProviderConfig.languageCode)),
}),
voiceSettings: {
...base.voiceSettings,
...(asNumber(talkVoiceSettings?.stability) == null
? {}
: { stability: asNumber(talkVoiceSettings?.stability) }),
...(asNumber(talkVoiceSettings?.similarityBoost) == null
? {}
: { similarityBoost: asNumber(talkVoiceSettings?.similarityBoost) }),
...(asNumber(talkVoiceSettings?.style) == null
? {}
: { style: asNumber(talkVoiceSettings?.style) }),
...(asBoolean(talkVoiceSettings?.useSpeakerBoost) == null
? {}
: { useSpeakerBoost: asBoolean(talkVoiceSettings?.useSpeakerBoost) }),
...(asNumber(talkVoiceSettings?.speed) == null
? {}
: { speed: asNumber(talkVoiceSettings?.speed) }),
},
};
},
resolveTalkOverrides: ({ params }) => {
const normalize = trimToUndefined(params.normalize);
const language = trimToUndefined(params.language)?.toLowerCase();
const voiceSettings = {
...(asNumber(params.speed) == null ? {} : { speed: asNumber(params.speed) }),
...(asNumber(params.stability) == null ? {} : { stability: asNumber(params.stability) }),
...(asNumber(params.similarity) == null
? {}
: { similarityBoost: asNumber(params.similarity) }),
...(asNumber(params.style) == null ? {} : { style: asNumber(params.style) }),
...(asBoolean(params.speakerBoost) == null
? {}
: { useSpeakerBoost: asBoolean(params.speakerBoost) }),
};
return {
...(trimToUndefined(params.voiceId) == null
? {}
: { voiceId: trimToUndefined(params.voiceId) }),
...(trimToUndefined(params.modelId) == null
? {}
: { modelId: trimToUndefined(params.modelId) }),
...(trimToUndefined(params.outputFormat) == null
? {}
: { outputFormat: trimToUndefined(params.outputFormat) }),
...(asNumber(params.seed) == null ? {} : { seed: asNumber(params.seed) }),
...(normalize == null
? {}
: { applyTextNormalization: normalizeApplyTextNormalization(normalize) }),
...(language == null ? {} : { languageCode: normalizeLanguageCode(language) }),
...(Object.keys(voiceSettings).length === 0 ? {} : { voiceSettings }),
};
},
listVoices: async (req) => {
const config = req.providerConfig
? readElevenLabsProviderConfig(req.providerConfig)
: undefined;
const apiKey =
req.apiKey ||
req.config?.elevenlabs.apiKey ||
process.env.ELEVENLABS_API_KEY ||
process.env.XI_API_KEY;
req.apiKey || config?.apiKey || process.env.ELEVENLABS_API_KEY || process.env.XI_API_KEY;
if (!apiKey) {
throw new Error("ElevenLabs API key missing");
}
return listElevenLabsVoices({
apiKey,
baseUrl: req.baseUrl ?? req.config?.elevenlabs.baseUrl,
baseUrl: req.baseUrl ?? config?.baseUrl,
});
},
isConfigured: ({ config }) =>
Boolean(config.elevenlabs.apiKey || process.env.ELEVENLABS_API_KEY || process.env.XI_API_KEY),
isConfigured: ({ providerConfig }) =>
Boolean(
readElevenLabsProviderConfig(providerConfig).apiKey ||
process.env.ELEVENLABS_API_KEY ||
process.env.XI_API_KEY,
),
synthesize: async (req) => {
const apiKey =
req.config.elevenlabs.apiKey || process.env.ELEVENLABS_API_KEY || process.env.XI_API_KEY;
const config = readElevenLabsProviderConfig(req.providerConfig);
const overrides = req.providerOverrides ?? {};
const apiKey = config.apiKey || process.env.ELEVENLABS_API_KEY || process.env.XI_API_KEY;
if (!apiKey) {
throw new Error("ElevenLabs API key missing");
}
const outputFormat =
req.overrides?.elevenlabs?.outputFormat ??
trimToUndefined(overrides.outputFormat) ??
(req.target === "voice-note" ? "opus_48000_64" : "mp3_44100_128");
const overrideVoiceSettings = asObject(overrides.voiceSettings);
const audioBuffer = await elevenLabsTTS({
text: req.text,
apiKey,
baseUrl: req.config.elevenlabs.baseUrl,
voiceId: req.overrides?.elevenlabs?.voiceId ?? req.config.elevenlabs.voiceId,
modelId: req.overrides?.elevenlabs?.modelId ?? req.config.elevenlabs.modelId,
baseUrl: config.baseUrl,
voiceId: trimToUndefined(overrides.voiceId) ?? config.voiceId,
modelId: trimToUndefined(overrides.modelId) ?? config.modelId,
outputFormat,
seed: req.overrides?.elevenlabs?.seed ?? req.config.elevenlabs.seed,
seed: asNumber(overrides.seed) ?? config.seed,
applyTextNormalization:
req.overrides?.elevenlabs?.applyTextNormalization ??
req.config.elevenlabs.applyTextNormalization,
languageCode: req.overrides?.elevenlabs?.languageCode ?? req.config.elevenlabs.languageCode,
(trimToUndefined(overrides.applyTextNormalization) as
| "auto"
| "on"
| "off"
| undefined) ?? config.applyTextNormalization,
languageCode: trimToUndefined(overrides.languageCode) ?? config.languageCode,
voiceSettings: {
...req.config.elevenlabs.voiceSettings,
...req.overrides?.elevenlabs?.voiceSettings,
...config.voiceSettings,
...(asNumber(overrideVoiceSettings?.stability) == null
? {}
: { stability: asNumber(overrideVoiceSettings?.stability) }),
...(asNumber(overrideVoiceSettings?.similarityBoost) == null
? {}
: { similarityBoost: asNumber(overrideVoiceSettings?.similarityBoost) }),
...(asNumber(overrideVoiceSettings?.style) == null
? {}
: { style: asNumber(overrideVoiceSettings?.style) }),
...(asBoolean(overrideVoiceSettings?.useSpeakerBoost) == null
? {}
: { useSpeakerBoost: asBoolean(overrideVoiceSettings?.useSpeakerBoost) }),
...(asNumber(overrideVoiceSettings?.speed) == null
? {}
: { speed: asNumber(overrideVoiceSettings?.speed) }),
},
timeoutMs: req.config.timeoutMs,
timeoutMs: req.timeoutMs,
});
return {
audioBuffer,
@@ -101,8 +509,8 @@ export function buildElevenLabsSpeechProvider(): SpeechProviderPlugin {
};
},
synthesizeTelephony: async (req) => {
const apiKey =
req.config.elevenlabs.apiKey || process.env.ELEVENLABS_API_KEY || process.env.XI_API_KEY;
const config = readElevenLabsProviderConfig(req.providerConfig);
const apiKey = config.apiKey || process.env.ELEVENLABS_API_KEY || process.env.XI_API_KEY;
if (!apiKey) {
throw new Error("ElevenLabs API key missing");
}
@@ -111,15 +519,15 @@ export function buildElevenLabsSpeechProvider(): SpeechProviderPlugin {
const audioBuffer = await elevenLabsTTS({
text: req.text,
apiKey,
baseUrl: req.config.elevenlabs.baseUrl,
voiceId: req.config.elevenlabs.voiceId,
modelId: req.config.elevenlabs.modelId,
baseUrl: config.baseUrl,
voiceId: config.voiceId,
modelId: config.modelId,
outputFormat,
seed: req.config.elevenlabs.seed,
applyTextNormalization: req.config.elevenlabs.applyTextNormalization,
languageCode: req.config.elevenlabs.languageCode,
voiceSettings: req.config.elevenlabs.voiceSettings,
timeoutMs: req.config.timeoutMs,
seed: config.seed,
applyTextNormalization: config.applyTextNormalization,
languageCode: config.languageCode,
voiceSettings: config.voiceSettings,
timeoutMs: req.timeoutMs,
});
return { audioBuffer, outputFormat, sampleRate };
},

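A quick sanity sketch of the normalization above (not part of the diff): this is what the per-provider resolution should yield for a partially specified messages.tts.elevenlabs block, assuming the module-local helper were exported for a test.

// Hypothetical check; normalizeElevenLabsProviderConfig is module-local in the
// diff, so calling it from outside the file is an assumption for illustration.
const resolved = normalizeElevenLabsProviderConfig({
  elevenlabs: {
    voiceId: "pMsXgVXv3BLzUgSXRplE",
    voiceSettings: { speed: 1.2 },
  },
});
// resolved.baseUrl       -> "https://api.elevenlabs.io" (default)
// resolved.modelId       -> "eleven_multilingual_v2" (default)
// resolved.voiceSettings -> { stability: 0.5, similarityBoost: 0.75, style: 0,
//                             useSpeakerBoost: true, speed: 1.2 }
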
View File

@@ -5,14 +5,33 @@ import {
TRUSTED_CLIENT_TOKEN,
generateSecMsGecToken,
} from "node-edge-tts/dist/drm.js";
import type { SpeechProviderPlugin } from "openclaw/plugin-sdk/core";
import { resolvePreferredOpenClawTmpDir } from "openclaw/plugin-sdk/llm-task";
import { isVoiceCompatibleAudio } from "openclaw/plugin-sdk/media-runtime";
import type { SpeechVoiceOption } from "openclaw/plugin-sdk/speech";
import type {
SpeechProviderConfig,
SpeechProviderPlugin,
SpeechVoiceOption,
} from "openclaw/plugin-sdk/speech-core";
import { edgeTTS, inferEdgeExtension } from "./tts.js";
const DEFAULT_EDGE_VOICE = "en-US-MichelleNeural";
const DEFAULT_EDGE_LANG = "en-US";
const DEFAULT_EDGE_OUTPUT_FORMAT = "audio-24khz-48kbitrate-mono-mp3";
type MicrosoftProviderConfig = {
enabled: boolean;
voice: string;
lang: string;
outputFormat: string;
outputFormatConfigured: boolean;
pitch?: string;
rate?: string;
volume?: string;
saveSubtitles: boolean;
proxy?: string;
timeoutMs?: number;
};
type MicrosoftVoiceListEntry = {
ShortName?: string;
FriendlyName?: string;
@@ -24,6 +43,64 @@ type MicrosoftVoiceListEntry = {
};
};
function trimToUndefined(value: unknown): string | undefined {
return typeof value === "string" && value.trim() ? value.trim() : undefined;
}
function asBoolean(value: unknown): boolean | undefined {
return typeof value === "boolean" ? value : undefined;
}
function asNumber(value: unknown): number | undefined {
return typeof value === "number" && Number.isFinite(value) ? value : undefined;
}
function asObject(value: unknown): Record<string, unknown> | undefined {
return typeof value === "object" && value !== null && !Array.isArray(value)
? (value as Record<string, unknown>)
: undefined;
}
function normalizeMicrosoftProviderConfig(
rawConfig: Record<string, unknown>,
): MicrosoftProviderConfig {
const rawEdge = asObject(rawConfig.edge);
const rawMicrosoft = asObject(rawConfig.microsoft);
const raw = { ...(rawEdge ?? {}), ...(rawMicrosoft ?? {}) };
const outputFormat = trimToUndefined(raw.outputFormat);
return {
enabled: asBoolean(raw.enabled) ?? true,
voice: trimToUndefined(raw.voice) ?? DEFAULT_EDGE_VOICE,
lang: trimToUndefined(raw.lang) ?? DEFAULT_EDGE_LANG,
outputFormat: outputFormat ?? DEFAULT_EDGE_OUTPUT_FORMAT,
outputFormatConfigured: Boolean(outputFormat),
pitch: trimToUndefined(raw.pitch),
rate: trimToUndefined(raw.rate),
volume: trimToUndefined(raw.volume),
saveSubtitles: asBoolean(raw.saveSubtitles) ?? false,
proxy: trimToUndefined(raw.proxy),
timeoutMs: asNumber(raw.timeoutMs),
};
}
function readMicrosoftProviderConfig(config: SpeechProviderConfig): MicrosoftProviderConfig {
const defaults = normalizeMicrosoftProviderConfig({});
return {
enabled: asBoolean(config.enabled) ?? defaults.enabled,
voice: trimToUndefined(config.voice) ?? defaults.voice,
lang: trimToUndefined(config.lang) ?? defaults.lang,
outputFormat: trimToUndefined(config.outputFormat) ?? defaults.outputFormat,
outputFormatConfigured:
asBoolean(config.outputFormatConfigured) ?? defaults.outputFormatConfigured,
pitch: trimToUndefined(config.pitch) ?? defaults.pitch,
rate: trimToUndefined(config.rate) ?? defaults.rate,
volume: trimToUndefined(config.volume) ?? defaults.volume,
saveSubtitles: asBoolean(config.saveSubtitles) ?? defaults.saveSubtitles,
proxy: trimToUndefined(config.proxy) ?? defaults.proxy,
timeoutMs: asNumber(config.timeoutMs) ?? defaults.timeoutMs,
};
}
function buildMicrosoftVoiceHeaders(): Record<string, string> {
const major = CHROMIUM_FULL_VERSION.split(".")[0] || "0";
return {
@@ -77,13 +154,57 @@ export function buildMicrosoftSpeechProvider(): SpeechProviderPlugin {
id: "microsoft",
label: "Microsoft",
aliases: ["edge"],
autoSelectOrder: 30,
resolveConfig: ({ rawConfig }) => normalizeMicrosoftProviderConfig(rawConfig),
resolveTalkConfig: ({ baseTtsConfig, talkProviderConfig }) => {
const base = normalizeMicrosoftProviderConfig(baseTtsConfig);
return {
...base,
enabled: true,
...(trimToUndefined(talkProviderConfig.voiceId) == null
? {}
: { voice: trimToUndefined(talkProviderConfig.voiceId) }),
...(trimToUndefined(talkProviderConfig.languageCode) == null
? {}
: { lang: trimToUndefined(talkProviderConfig.languageCode) }),
...(trimToUndefined(talkProviderConfig.outputFormat) == null
? {}
: { outputFormat: trimToUndefined(talkProviderConfig.outputFormat) }),
...(trimToUndefined(talkProviderConfig.pitch) == null
? {}
: { pitch: trimToUndefined(talkProviderConfig.pitch) }),
...(trimToUndefined(talkProviderConfig.rate) == null
? {}
: { rate: trimToUndefined(talkProviderConfig.rate) }),
...(trimToUndefined(talkProviderConfig.volume) == null
? {}
: { volume: trimToUndefined(talkProviderConfig.volume) }),
...(trimToUndefined(talkProviderConfig.proxy) == null
? {}
: { proxy: trimToUndefined(talkProviderConfig.proxy) }),
...(asNumber(talkProviderConfig.timeoutMs) == null
? {}
: { timeoutMs: asNumber(talkProviderConfig.timeoutMs) }),
};
},
resolveTalkOverrides: ({ params }) => ({
...(trimToUndefined(params.voiceId) == null
? {}
: { voice: trimToUndefined(params.voiceId) }),
...(trimToUndefined(params.outputFormat) == null
? {}
: { outputFormat: trimToUndefined(params.outputFormat) }),
}),
listVoices: async () => await listMicrosoftVoices(),
isConfigured: ({ config }) => config.edge.enabled,
isConfigured: ({ providerConfig }) => readMicrosoftProviderConfig(providerConfig).enabled,
synthesize: async (req) => {
const config = readMicrosoftProviderConfig(req.providerConfig);
const tempRoot = resolvePreferredOpenClawTmpDir();
mkdirSync(tempRoot, { recursive: true, mode: 0o700 });
const tempDir = mkdtempSync(path.join(tempRoot, "tts-microsoft-"));
let outputFormat = req.overrides?.microsoft?.outputFormat ?? req.config.edge.outputFormat;
const overrideVoice = trimToUndefined(req.providerOverrides?.voice);
let outputFormat =
trimToUndefined(req.providerOverrides?.outputFormat) ?? config.outputFormat;
const fallbackOutputFormat =
outputFormat !== DEFAULT_EDGE_OUTPUT_FORMAT ? DEFAULT_EDGE_OUTPUT_FORMAT : undefined;
@@ -95,11 +216,11 @@ export function buildMicrosoftSpeechProvider(): SpeechProviderPlugin {
text: req.text,
outputPath,
config: {
...req.config.edge,
voice: req.overrides?.microsoft?.voice ?? req.config.edge.voice,
...config,
voice: overrideVoice ?? config.voice,
outputFormat: format,
},
timeoutMs: req.config.timeoutMs,
timeoutMs: req.timeoutMs,
});
const audioBuffer = readFileSync(outputPath);
return {
@@ -112,9 +233,9 @@ export function buildMicrosoftSpeechProvider(): SpeechProviderPlugin {
try {
return await runEdge(outputFormat);
} catch (err) {
} catch (error) {
if (!fallbackOutputFormat || fallbackOutputFormat === outputFormat) {
throw err;
throw error;
}
outputFormat = fallbackOutputFormat;
return await runEdge(outputFormat);

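The merged edge/microsoft handling above keeps the legacy edge block working while letting a microsoft block win on conflicts, since it is spread last. A minimal sketch of the expected resolution, again assuming the local helper were reachable from a test:

// Hypothetical check against normalizeMicrosoftProviderConfig as written above.
const resolved = normalizeMicrosoftProviderConfig({
  edge: { voice: "en-US-AriaNeural", enabled: false },
  microsoft: { voice: "en-US-MichelleNeural" },
});
// resolved.voice                  -> "en-US-MichelleNeural" (microsoft wins over edge)
// resolved.enabled                -> false (legacy edge flag still honored)
// resolved.outputFormat           -> "audio-24khz-48kbitrate-mono-mp3" (default)
// resolved.outputFormatConfigured -> false (no explicit outputFormat provided)
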
View File

@@ -137,32 +137,14 @@ function createLiveTtsConfig(): ResolvedTtsConfig {
allowNormalization: true,
allowSeed: true,
},
elevenlabs: {
baseUrl: "https://api.elevenlabs.io",
voiceId: "",
modelId: "eleven_multilingual_v2",
voiceSettings: {
stability: 0.5,
similarityBoost: 0.75,
style: 0,
useSpeakerBoost: true,
speed: 1,
providerConfigs: {
openai: {
apiKey: OPENAI_API_KEY,
baseUrl: "https://api.openai.com/v1",
model: "gpt-4o-mini-tts",
voice: "alloy",
},
},
openai: {
apiKey: OPENAI_API_KEY,
baseUrl: "https://api.openai.com/v1",
model: "gpt-4o-mini-tts",
voice: "alloy",
},
edge: {
enabled: false,
voice: "en-US-AriaNeural",
lang: "en-US",
outputFormat: "audio-24khz-48kbitrate-mono-mp3",
outputFormatConfigured: false,
saveSubtitles: false,
},
maxTextLength: 4_000,
timeoutMs: 30_000,
};
@@ -358,8 +340,9 @@ describeLive("openai plugin live", () => {
const audioFile = await speechProvider.synthesize({
text: "OpenClaw integration test OK.",
cfg,
config: ttsConfig,
providerConfig: ttsConfig.providerConfigs.openai ?? {},
target: "audio-file",
timeoutMs: ttsConfig.timeoutMs,
});
expect(audioFile.outputFormat).toBe("mp3");
expect(audioFile.fileExtension).toBe(".mp3");
@@ -368,7 +351,8 @@ describeLive("openai plugin live", () => {
const telephony = await speechProvider.synthesizeTelephony?.({
text: "Telephony check OK.",
cfg,
config: ttsConfig,
providerConfig: ttsConfig.providerConfigs.openai ?? {},
timeoutMs: ttsConfig.timeoutMs,
});
expect(telephony?.outputFormat).toBe("pcm");
expect(telephony?.sampleRate).toBe(24_000);
@@ -386,8 +370,9 @@ describeLive("openai plugin live", () => {
const synthesized = await speechProvider.synthesize({
text: "OpenClaw integration test OK.",
cfg,
config: ttsConfig,
providerConfig: ttsConfig.providerConfigs.openai ?? {},
target: "audio-file",
timeoutMs: ttsConfig.timeoutMs,
});
const transcription = await mediaProvider.transcribeAudio?.({

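The test changes above capture the new call contract: the resolved TTS config no longer exposes provider blocks at the top level; each plugin receives its own slice as providerConfig, and timeoutMs is passed per call. A condensed sketch of the reshaped fixture (OPENAI_API_KEY and cfg come from the surrounding test setup and are assumptions here):

// Condensed, illustrative version of the fixture and call above.
const ttsConfig = {
  providerConfigs: {
    openai: {
      apiKey: OPENAI_API_KEY,
      baseUrl: "https://api.openai.com/v1",
      model: "gpt-4o-mini-tts",
      voice: "alloy",
    },
  },
  maxTextLength: 4_000,
  timeoutMs: 30_000,
};
await speechProvider.synthesize({
  text: "OpenClaw integration test OK.",
  cfg,
  providerConfig: ttsConfig.providerConfigs.openai ?? {},
  target: "audio-file",
  timeoutMs: ttsConfig.timeoutMs,
});
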
View File

@@ -1,16 +1,181 @@
import type { SpeechProviderPlugin } from "openclaw/plugin-sdk/core";
import { OPENAI_TTS_MODELS, OPENAI_TTS_VOICES, openaiTTS } from "./tts.js";
import { normalizeResolvedSecretInputString } from "openclaw/plugin-sdk/secret-input";
import type {
SpeechDirectiveTokenParseContext,
SpeechProviderConfig,
SpeechProviderOverrides,
SpeechProviderPlugin,
} from "openclaw/plugin-sdk/speech-core";
import {
DEFAULT_OPENAI_BASE_URL,
isValidOpenAIModel,
isValidOpenAIVoice,
normalizeOpenAITtsBaseUrl,
OPENAI_TTS_MODELS,
OPENAI_TTS_VOICES,
openaiTTS,
} from "./tts.js";
type OpenAITtsProviderConfig = {
apiKey?: string;
baseUrl: string;
model: string;
voice: string;
speed?: number;
instructions?: string;
};
type OpenAITtsProviderOverrides = {
model?: string;
voice?: string;
speed?: number;
};
function trimToUndefined(value: unknown): string | undefined {
return typeof value === "string" && value.trim() ? value.trim() : undefined;
}
function asNumber(value: unknown): number | undefined {
return typeof value === "number" && Number.isFinite(value) ? value : undefined;
}
function asObject(value: unknown): Record<string, unknown> | undefined {
return typeof value === "object" && value !== null && !Array.isArray(value)
? (value as Record<string, unknown>)
: undefined;
}
function normalizeOpenAIProviderConfig(
rawConfig: Record<string, unknown>,
): OpenAITtsProviderConfig {
const raw = asObject(rawConfig.openai);
return {
apiKey: normalizeResolvedSecretInputString({
value: raw?.apiKey,
path: "messages.tts.openai.apiKey",
}),
baseUrl: normalizeOpenAITtsBaseUrl(
trimToUndefined(raw?.baseUrl) ??
trimToUndefined(process.env.OPENAI_TTS_BASE_URL) ??
DEFAULT_OPENAI_BASE_URL,
),
model: trimToUndefined(raw?.model) ?? "gpt-4o-mini-tts",
voice: trimToUndefined(raw?.voice) ?? "coral",
speed: asNumber(raw?.speed),
instructions: trimToUndefined(raw?.instructions),
};
}
function readOpenAIProviderConfig(config: SpeechProviderConfig): OpenAITtsProviderConfig {
const normalized = normalizeOpenAIProviderConfig({});
return {
apiKey: trimToUndefined(config.apiKey) ?? normalized.apiKey,
baseUrl: trimToUndefined(config.baseUrl) ?? normalized.baseUrl,
model: trimToUndefined(config.model) ?? normalized.model,
voice: trimToUndefined(config.voice) ?? normalized.voice,
speed: asNumber(config.speed) ?? normalized.speed,
instructions: trimToUndefined(config.instructions) ?? normalized.instructions,
};
}
function readOpenAIOverrides(
overrides: SpeechProviderOverrides | undefined,
): OpenAITtsProviderOverrides {
if (!overrides) {
return {};
}
return {
model: trimToUndefined(overrides.model),
voice: trimToUndefined(overrides.voice),
speed: asNumber(overrides.speed),
};
}
function parseDirectiveToken(ctx: SpeechDirectiveTokenParseContext): {
handled: boolean;
overrides?: SpeechProviderOverrides;
warnings?: string[];
} {
const baseUrl = trimToUndefined(ctx.providerConfig?.baseUrl);
switch (ctx.key) {
case "voice":
case "openai_voice":
case "openaivoice":
if (!ctx.policy.allowVoice) {
return { handled: true };
}
if (!isValidOpenAIVoice(ctx.value, baseUrl)) {
return { handled: true, warnings: [`invalid OpenAI voice "${ctx.value}"`] };
}
return { handled: true, overrides: { voice: ctx.value } };
case "model":
case "openai_model":
case "openaimodel":
if (!ctx.policy.allowModelId) {
return { handled: true };
}
if (!isValidOpenAIModel(ctx.value, baseUrl)) {
return { handled: false };
}
return { handled: true, overrides: { model: ctx.value } };
default:
return { handled: false };
}
}
export function buildOpenAISpeechProvider(): SpeechProviderPlugin {
return {
id: "openai",
label: "OpenAI",
autoSelectOrder: 10,
models: OPENAI_TTS_MODELS,
voices: OPENAI_TTS_VOICES,
resolveConfig: ({ rawConfig }) => normalizeOpenAIProviderConfig(rawConfig),
parseDirectiveToken,
resolveTalkConfig: ({ baseTtsConfig, talkProviderConfig }) => {
const base = normalizeOpenAIProviderConfig(baseTtsConfig);
return {
...base,
...(talkProviderConfig.apiKey === undefined
? {}
: {
apiKey: normalizeResolvedSecretInputString({
value: talkProviderConfig.apiKey,
path: "talk.providers.openai.apiKey",
}),
}),
...(trimToUndefined(talkProviderConfig.baseUrl) == null
? {}
: { baseUrl: trimToUndefined(talkProviderConfig.baseUrl) }),
...(trimToUndefined(talkProviderConfig.modelId) == null
? {}
: { model: trimToUndefined(talkProviderConfig.modelId) }),
...(trimToUndefined(talkProviderConfig.voiceId) == null
? {}
: { voice: trimToUndefined(talkProviderConfig.voiceId) }),
...(asNumber(talkProviderConfig.speed) == null
? {}
: { speed: asNumber(talkProviderConfig.speed) }),
...(trimToUndefined(talkProviderConfig.instructions) == null
? {}
: { instructions: trimToUndefined(talkProviderConfig.instructions) }),
};
},
resolveTalkOverrides: ({ params }) => ({
...(trimToUndefined(params.voiceId) == null
? {}
: { voice: trimToUndefined(params.voiceId) }),
...(trimToUndefined(params.modelId) == null
? {}
: { model: trimToUndefined(params.modelId) }),
...(asNumber(params.speed) == null ? {} : { speed: asNumber(params.speed) }),
}),
listVoices: async () => OPENAI_TTS_VOICES.map((voice) => ({ id: voice, name: voice })),
isConfigured: ({ config }) => Boolean(config.openai.apiKey || process.env.OPENAI_API_KEY),
isConfigured: ({ providerConfig }) =>
Boolean(readOpenAIProviderConfig(providerConfig).apiKey || process.env.OPENAI_API_KEY),
synthesize: async (req) => {
const apiKey = req.config.openai.apiKey || process.env.OPENAI_API_KEY;
const config = readOpenAIProviderConfig(req.providerConfig);
const overrides = readOpenAIOverrides(req.providerOverrides);
const apiKey = config.apiKey || process.env.OPENAI_API_KEY;
if (!apiKey) {
throw new Error("OpenAI API key missing");
}
@@ -18,13 +183,13 @@ export function buildOpenAISpeechProvider(): SpeechProviderPlugin {
const audioBuffer = await openaiTTS({
text: req.text,
apiKey,
baseUrl: req.config.openai.baseUrl,
model: req.overrides?.openai?.model ?? req.config.openai.model,
voice: req.overrides?.openai?.voice ?? req.config.openai.voice,
speed: req.overrides?.openai?.speed ?? req.config.openai.speed,
instructions: req.config.openai.instructions,
baseUrl: config.baseUrl,
model: overrides.model ?? config.model,
voice: overrides.voice ?? config.voice,
speed: overrides.speed ?? config.speed,
instructions: config.instructions,
responseFormat,
timeoutMs: req.config.timeoutMs,
timeoutMs: req.timeoutMs,
});
return {
audioBuffer,
@@ -34,7 +199,8 @@ export function buildOpenAISpeechProvider(): SpeechProviderPlugin {
};
},
synthesizeTelephony: async (req) => {
const apiKey = req.config.openai.apiKey || process.env.OPENAI_API_KEY;
const config = readOpenAIProviderConfig(req.providerConfig);
const apiKey = config.apiKey || process.env.OPENAI_API_KEY;
if (!apiKey) {
throw new Error("OpenAI API key missing");
}
@@ -43,13 +209,13 @@ export function buildOpenAISpeechProvider(): SpeechProviderPlugin {
const audioBuffer = await openaiTTS({
text: req.text,
apiKey,
baseUrl: req.config.openai.baseUrl,
model: req.config.openai.model,
voice: req.config.openai.voice,
speed: req.config.openai.speed,
instructions: req.config.openai.instructions,
baseUrl: config.baseUrl,
model: config.model,
voice: config.voice,
speed: config.speed,
instructions: config.instructions,
responseFormat: outputFormat,
timeoutMs: req.config.timeoutMs,
timeoutMs: req.timeoutMs,
});
return { audioBuffer, outputFormat, sampleRate };
},

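For reference, a sketch of what the OpenAI parseDirectiveToken above returns for a typical directive token. The context object is abbreviated (a real SpeechDirectiveTokenParseContext carries more fields), so the cast is an illustration-only assumption:

// Illustrative only; parseDirectiveToken is module-local in the diff.
const result = parseDirectiveToken({
  key: "voice",
  value: "alloy",
  policy: { allowVoice: true, allowModelId: true },
  providerConfig: { baseUrl: "https://api.openai.com/v1" },
} as unknown as SpeechDirectiveTokenParseContext);
// result -> { handled: true, overrides: { voice: "alloy" } }
// An unknown voice against the default endpoint yields
// { handled: true, warnings: ['invalid OpenAI voice "..."'] }, and an unknown
// model returns { handled: false } so other parsers can try the token.
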
View File

@@ -1,4 +1,4 @@
const DEFAULT_OPENAI_BASE_URL = "https://api.openai.com/v1";
export const DEFAULT_OPENAI_BASE_URL = "https://api.openai.com/v1";
export const OPENAI_TTS_MODELS = ["gpt-4o-mini-tts", "tts-1", "tts-1-hd"] as const;
@@ -21,7 +21,7 @@ export const OPENAI_TTS_VOICES = [
type OpenAiTtsVoice = (typeof OPENAI_TTS_VOICES)[number];
function normalizeOpenAITtsBaseUrl(baseUrl?: string): string {
export function normalizeOpenAITtsBaseUrl(baseUrl?: string): string {
const trimmed = baseUrl?.trim();
if (!trimmed) {
return DEFAULT_OPENAI_BASE_URL;
@@ -36,21 +36,24 @@ function isCustomOpenAIEndpoint(baseUrl?: string): boolean {
return normalizeOpenAITtsBaseUrl(process.env.OPENAI_TTS_BASE_URL) !== DEFAULT_OPENAI_BASE_URL;
}
function isValidOpenAIModel(model: string, baseUrl?: string): boolean {
export function isValidOpenAIModel(model: string, baseUrl?: string): boolean {
if (isCustomOpenAIEndpoint(baseUrl)) {
return true;
}
return OPENAI_TTS_MODELS.includes(model as (typeof OPENAI_TTS_MODELS)[number]);
}
function isValidOpenAIVoice(voice: string, baseUrl?: string): voice is OpenAiTtsVoice {
export function isValidOpenAIVoice(voice: string, baseUrl?: string): voice is OpenAiTtsVoice {
if (isCustomOpenAIEndpoint(baseUrl)) {
return true;
}
return OPENAI_TTS_VOICES.includes(voice as OpenAiTtsVoice);
}
function resolveOpenAITtsInstructions(model: string, instructions?: string): string | undefined {
export function resolveOpenAITtsInstructions(
model: string,
instructions?: string,
): string | undefined {
const next = instructions?.trim();
return next && model.includes("gpt-4o-mini-tts") ? next : undefined;
}

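These helpers are now exported so the OpenAI speech plugin can import them alongside openaiTTS. A brief sketch of the behavior visible in the lines shown (env-variable behavior follows the branch shown in isCustomOpenAIEndpoint):

// Based only on the code above; results assume OPENAI_TTS_BASE_URL is unset.
normalizeOpenAITtsBaseUrl(undefined); // -> DEFAULT_OPENAI_BASE_URL ("https://api.openai.com/v1")
isValidOpenAIModel("tts-1");          // -> true (listed in OPENAI_TTS_MODELS)
// With OPENAI_TTS_BASE_URL pointing at a non-default endpoint, the allow-list
// check is skipped and any model or voice string is accepted.
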
View File

@@ -1,4 +1,3 @@
import { resolveOpenAITtsInstructions } from "../../api.js";
import { convertPcmToMulaw8k } from "../telephony-audio.js";
/**
@@ -72,6 +71,11 @@ function trimToUndefined(value: string | undefined): string | undefined {
return trimmed ? trimmed : undefined;
}
function resolveOpenAITtsInstructions(model: string, instructions?: string): string | undefined {
const next = trimToUndefined(instructions);
return next && model.includes("gpt-4o-mini-tts") ? next : undefined;
}
/**
* OpenAI TTS Provider for generating speech audio.
*/
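The hunk above inlines resolveOpenAITtsInstructions into this module instead of importing it from ../../api.js. Per the body shown, instructions are only forwarded for gpt-4o-mini-tts-style models; illustrative calls:

// Expected results from the function body above (illustrative values).
resolveOpenAITtsInstructions("gpt-4o-mini-tts", "  Speak warmly.  "); // -> "Speak warmly."
resolveOpenAITtsInstructions("tts-1-hd", "Speak warmly.");            // -> undefined (model ignores instructions)
resolveOpenAITtsInstructions("gpt-4o-mini-tts", "   ");               // -> undefined (blank after trim)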