// Files
// openclaw/extensions/azure-speech/speech-provider.ts
// 2026-05-03 22:52:18 -07:00
//
// 307 lines
// 10 KiB
// TypeScript

import { normalizeResolvedSecretInputString } from "openclaw/plugin-sdk/secret-input";
import type {
SpeechDirectiveTokenParseContext,
SpeechProviderConfig,
SpeechProviderOverrides,
SpeechProviderPlugin,
} from "openclaw/plugin-sdk/speech-core";
import { asFiniteNumber, asObject, trimToUndefined } from "openclaw/plugin-sdk/speech-core";
import {
azureSpeechTTS,
DEFAULT_AZURE_SPEECH_AUDIO_FORMAT,
DEFAULT_AZURE_SPEECH_LANG,
DEFAULT_AZURE_SPEECH_TELEPHONY_FORMAT,
DEFAULT_AZURE_SPEECH_VOICE,
DEFAULT_AZURE_SPEECH_VOICE_NOTE_FORMAT,
inferAzureSpeechFileExtension,
isAzureSpeechVoiceCompatible,
listAzureSpeechVoices,
normalizeAzureSpeechBaseUrl,
} from "./tts.js";
/**
 * Resolved Azure Speech settings consumed by the provider callbacks in this file.
 *
 * `voice`, `lang`, `outputFormat` and `voiceNoteOutputFormat` are required
 * because the normalizers below always fill them with a fallback value; the
 * credential and connection fields remain optional.
 */
type AzureSpeechProviderConfig = {
/** API key taken from configuration; `resolveApiKey` falls back to the environment when it is absent. */
apiKey?: string;
/** Region taken from configuration, else from `AZURE_SPEECH_REGION` / `SPEECH_REGION`. */
region?: string;
/** Endpoint taken from configuration, else from `AZURE_SPEECH_ENDPOINT`. */
endpoint?: string;
/** Result of `normalizeAzureSpeechBaseUrl` applied to the base URL, endpoint and region. */
baseUrl?: string;
/** Voice passed to the synthesis call unless a per-request override replaces it. */
voice: string;
/** Language passed to the synthesis call unless a per-request override replaces it. */
lang: string;
/** Output format used for every synthesis target other than `"voice-note"`. */
outputFormat: string;
/** Output format used when the synthesis target is `"voice-note"`. */
voiceNoteOutputFormat: string;
/** When set, takes precedence over the timeout supplied with the request. */
timeoutMs?: number;
};
/**
 * Per-request overrides after trimming. A field that is present wins over the
 * corresponding `AzureSpeechProviderConfig` value during synthesis.
 */
type AzureSpeechProviderOverrides = {
/** Replaces the configured voice for this request. */
voice?: string;
/** Replaces the configured language for this request. */
lang?: string;
/** Replaces the configured output format; telephony synthesis does not consult it. */
outputFormat?: string;
};
/**
 * Looks up the Azure Speech API key in the process environment.
 *
 * Variables are consulted in priority order: `AZURE_SPEECH_KEY`, then
 * `AZURE_SPEECH_API_KEY`, then `SPEECH_KEY`. A variable is skipped when
 * `trimToUndefined` yields no value for it.
 *
 * @returns The first usable key, or the result for `SPEECH_KEY` when the
 *   earlier variables produce nothing.
 */
function readAzureSpeechEnvApiKey(): string | undefined {
  const preferredKey = trimToUndefined(process.env.AZURE_SPEECH_KEY);
  if (preferredKey != null) {
    return preferredKey;
  }

  const alternateKey = trimToUndefined(process.env.AZURE_SPEECH_API_KEY);
  if (alternateKey != null) {
    return alternateKey;
  }

  return trimToUndefined(process.env.SPEECH_KEY);
}
/**
 * Looks up the Azure Speech region in the process environment.
 *
 * `AZURE_SPEECH_REGION` is preferred; `SPEECH_REGION` is consulted only when
 * the former produces no value after `trimToUndefined`.
 */
function readAzureSpeechEnvRegion(): string | undefined {
  const azureScopedRegion = trimToUndefined(process.env.AZURE_SPEECH_REGION);
  if (azureScopedRegion != null) {
    return azureScopedRegion;
  }
  return trimToUndefined(process.env.SPEECH_REGION);
}
/**
 * Reads the Azure Speech endpoint from the `AZURE_SPEECH_ENDPOINT`
 * environment variable, passed through `trimToUndefined`.
 */
function readAzureSpeechEnvEndpoint(): string | undefined {
  const { AZURE_SPEECH_ENDPOINT: rawEndpoint } = process.env;
  return trimToUndefined(rawEndpoint);
}
/**
 * Finds the Azure Speech section inside a raw TTS configuration object.
 *
 * Lookup order (first hit accepted by `asObject` wins):
 *   1. `providers["azure-speech"]`
 *   2. `providers.azure`
 *   3. top-level `"azure-speech"`
 *   4. top-level `azure`
 *
 * @param rawConfig Raw configuration record to search.
 * @returns The matching section, or the `asObject` result for the last
 *   candidate when nothing earlier matched.
 */
function resolveAzureSpeechConfigRecord(
  rawConfig: Record<string, unknown>,
): Record<string, unknown> | undefined {
  const providerMap = asObject(rawConfig.providers);

  const nestedCanonical = asObject(providerMap?.["azure-speech"]);
  if (nestedCanonical != null) {
    return nestedCanonical;
  }

  const nestedAlias = asObject(providerMap?.azure);
  if (nestedAlias != null) {
    return nestedAlias;
  }

  const topLevelCanonical = asObject(rawConfig["azure-speech"]);
  if (topLevelCanonical != null) {
    return topLevelCanonical;
  }

  return asObject(rawConfig.azure);
}
/**
 * Builds a fully populated Azure Speech config from the raw TTS configuration.
 *
 * The Azure section is located with `resolveAzureSpeechConfigRecord`. Region
 * and endpoint fall back to their environment variables, and voice, language
 * and both output formats fall back to the module defaults. The API key is
 * taken from configuration only; environment lookup for the key happens later,
 * in `resolveApiKey`.
 *
 * Alias pairs (`voice`/`voiceId`, `lang`/`languageCode`) are trimmed
 * individually, so a blank or non-string primary field no longer hides a
 * usable alias.
 *
 * @param rawConfig Raw TTS configuration record; may be empty.
 * @returns The normalized provider configuration.
 */
function normalizeAzureSpeechProviderConfig(
  rawConfig: Record<string, unknown>,
): AzureSpeechProviderConfig {
  const raw = resolveAzureSpeechConfigRecord(rawConfig);
  const region = trimToUndefined(raw?.region) ?? readAzureSpeechEnvRegion();
  const endpoint = trimToUndefined(raw?.endpoint) ?? readAzureSpeechEnvEndpoint();
  const baseUrl = normalizeAzureSpeechBaseUrl({
    baseUrl: trimToUndefined(raw?.baseUrl),
    endpoint,
    region,
  });
  return {
    apiKey: normalizeResolvedSecretInputString({
      value: raw?.apiKey,
      path: "messages.tts.providers.azure-speech.apiKey",
    }),
    region,
    endpoint,
    baseUrl,
    // Trim each alias separately: `trimToUndefined(a ?? b)` would discard `b`
    // whenever `a` is present but blank.
    voice:
      trimToUndefined(raw?.voice) ?? trimToUndefined(raw?.voiceId) ?? DEFAULT_AZURE_SPEECH_VOICE,
    lang:
      trimToUndefined(raw?.lang) ??
      trimToUndefined(raw?.languageCode) ??
      DEFAULT_AZURE_SPEECH_LANG,
    outputFormat: trimToUndefined(raw?.outputFormat) ?? DEFAULT_AZURE_SPEECH_AUDIO_FORMAT,
    voiceNoteOutputFormat:
      trimToUndefined(raw?.voiceNoteOutputFormat) ?? DEFAULT_AZURE_SPEECH_VOICE_NOTE_FORMAT,
    timeoutMs: asFiniteNumber(raw?.timeoutMs),
  };
}
/**
 * Re-reads a provider config object, filling every gap from the defaults that
 * `normalizeAzureSpeechProviderConfig({})` produces (environment variables and
 * module constants).
 *
 * Alias pairs (`voice`/`voiceId`, `lang`/`languageCode`) are trimmed
 * individually, so a blank or non-string primary field no longer hides a
 * usable alias.
 *
 * @param config Provider configuration as handed to the plugin callbacks.
 * @returns A config in which voice, language and both output formats are set.
 */
function readAzureSpeechProviderConfig(config: SpeechProviderConfig): AzureSpeechProviderConfig {
  const defaults = normalizeAzureSpeechProviderConfig({});
  const region = trimToUndefined(config.region) ?? defaults.region;
  const endpoint = trimToUndefined(config.endpoint) ?? defaults.endpoint;
  // NOTE(review): `defaults.baseUrl` is derived from the environment region/endpoint.
  // Whether it can outrank a `config.region` supplied without `config.baseUrl`
  // depends on the precedence inside normalizeAzureSpeechBaseUrl — confirm.
  const baseUrl = normalizeAzureSpeechBaseUrl({
    baseUrl: trimToUndefined(config.baseUrl) ?? defaults.baseUrl,
    endpoint,
    region,
  });
  return {
    apiKey: trimToUndefined(config.apiKey) ?? defaults.apiKey,
    region,
    endpoint,
    baseUrl,
    // Trim each alias separately: `trimToUndefined(a ?? b)` would discard `b`
    // whenever `a` is present but blank.
    voice: trimToUndefined(config.voice) ?? trimToUndefined(config.voiceId) ?? defaults.voice,
    lang: trimToUndefined(config.lang) ?? trimToUndefined(config.languageCode) ?? defaults.lang,
    outputFormat: trimToUndefined(config.outputFormat) ?? defaults.outputFormat,
    voiceNoteOutputFormat:
      trimToUndefined(config.voiceNoteOutputFormat) ?? defaults.voiceNoteOutputFormat,
    timeoutMs: asFiniteNumber(config.timeoutMs) ?? defaults.timeoutMs,
  };
}
/**
 * Extracts the Azure-relevant per-request overrides.
 *
 * Alias pairs (`voice`/`voiceId`, `lang`/`languageCode`) are trimmed
 * individually, so a blank or non-string primary field no longer hides a
 * usable alias.
 *
 * @param overrides Overrides attached to the request, if any.
 * @returns An empty object when no overrides were supplied; otherwise an object
 *   with `voice`, `lang` and `outputFormat`, each possibly `undefined`.
 */
function readAzureSpeechOverrides(
  overrides: SpeechProviderOverrides | undefined,
): AzureSpeechProviderOverrides {
  if (!overrides) {
    return {};
  }
  return {
    // Trim each alias separately: `trimToUndefined(a ?? b)` would discard `b`
    // whenever `a` is present but blank.
    voice: trimToUndefined(overrides.voice) ?? trimToUndefined(overrides.voiceId),
    lang: trimToUndefined(overrides.lang) ?? trimToUndefined(overrides.languageCode),
    outputFormat: trimToUndefined(overrides.outputFormat),
  };
}
/** Directive keys that set the voice. */
const AZURE_VOICE_DIRECTIVE_KEYS: ReadonlySet<string> = new Set([
  "voice",
  "voiceid",
  "voice_id",
  "azure_voice",
  "azurevoice",
  "azure_speech_voice",
]);

/** Directive keys that set the language. */
const AZURE_LANG_DIRECTIVE_KEYS: ReadonlySet<string> = new Set([
  "lang",
  "language",
  "language_code",
  "languagecode",
  "azure_lang",
  "azure_language",
]);

/** Directive keys that set the output format. */
const AZURE_FORMAT_DIRECTIVE_KEYS: ReadonlySet<string> = new Set([
  "output_format",
  "outputformat",
  "azure_format",
  "azure_output_format",
]);

/**
 * Interprets a single speech directive token for the Azure provider.
 *
 * A recognised key is always reported as handled. The override is recorded
 * only when the policy permits it: voice keys require `policy.allowVoice`,
 * language and output-format keys require `policy.allowVoiceSettings`.
 *
 * @param ctx Parse context carrying the key, value, policy and the overrides
 *   accumulated so far.
 * @returns `{ handled: false }` for keys this provider does not recognise;
 *   otherwise `{ handled: true }`, with merged `overrides` when permitted.
 */
function parseDirectiveToken(ctx: SpeechDirectiveTokenParseContext): {
  handled: boolean;
  overrides?: SpeechProviderOverrides;
} {
  const directive = ctx.key;

  if (AZURE_VOICE_DIRECTIVE_KEYS.has(directive)) {
    return ctx.policy.allowVoice
      ? { handled: true, overrides: { ...ctx.currentOverrides, voice: ctx.value } }
      : { handled: true };
  }

  if (AZURE_LANG_DIRECTIVE_KEYS.has(directive)) {
    return ctx.policy.allowVoiceSettings
      ? { handled: true, overrides: { ...ctx.currentOverrides, lang: ctx.value } }
      : { handled: true };
  }

  if (AZURE_FORMAT_DIRECTIVE_KEYS.has(directive)) {
    return ctx.policy.allowVoiceSettings
      ? { handled: true, overrides: { ...ctx.currentOverrides, outputFormat: ctx.value } }
      : { handled: true };
  }

  return { handled: false };
}
/**
 * Picks the API key to use: the configured key when one is present (anything
 * other than `null`/`undefined`), otherwise the key from the environment.
 */
function resolveApiKey(config: AzureSpeechProviderConfig): string | undefined {
  const { apiKey: configuredKey } = config;
  if (configuredKey != null) {
    return configuredKey;
  }
  return readAzureSpeechEnvApiKey();
}
/**
 * Chooses the request timeout: the configured `timeoutMs` when set (including
 * `0`), otherwise the timeout supplied by the caller.
 */
function resolveTimeoutMs(config: AzureSpeechProviderConfig, timeoutMs: number): number {
  const { timeoutMs: configuredTimeout } = config;
  if (configuredTimeout == null) {
    return timeoutMs;
  }
  return configuredTimeout;
}
/**
 * Creates the Azure Speech text-to-speech provider plugin.
 *
 * The returned plugin is registered under the id `"azure-speech"` (alias
 * `"azure"`) and wires configuration resolution, directive parsing, voice
 * listing and synthesis to the helpers in this module and in `./tts.js`.
 *
 * @returns The plugin object.
 */
export function buildAzureSpeechProvider(): SpeechProviderPlugin {
  return {
    id: "azure-speech",
    label: "Azure Speech",
    aliases: ["azure"],
    autoSelectOrder: 30,
    resolveConfig: ({ rawConfig }) => normalizeAzureSpeechProviderConfig(rawConfig),
    parseDirectiveToken,
    // Layers talk-specific settings on top of the normalized base TTS config.
    // Only values that are actually present replace the base values.
    resolveTalkConfig: ({ baseTtsConfig, talkProviderConfig }) => {
      const base = normalizeAzureSpeechProviderConfig(baseTtsConfig);
      const apiKey =
        talkProviderConfig.apiKey === undefined
          ? undefined
          : normalizeResolvedSecretInputString({
              value: talkProviderConfig.apiKey,
              path: "talk.providers.azure-speech.apiKey",
            });
      const region = trimToUndefined(talkProviderConfig.region);
      // The talk `baseUrl` doubles as the endpoint when no `endpoint` is given.
      const endpoint = trimToUndefined(talkProviderConfig.endpoint ?? talkProviderConfig.baseUrl);
      const baseUrl = normalizeAzureSpeechBaseUrl({
        baseUrl: trimToUndefined(talkProviderConfig.baseUrl),
        endpoint,
        region: region ?? base.region,
      });
      const voice = trimToUndefined(talkProviderConfig.voiceId);
      const lang = trimToUndefined(talkProviderConfig.languageCode);
      const outputFormat = trimToUndefined(talkProviderConfig.outputFormat);
      return {
        ...base,
        ...(apiKey === undefined ? {} : { apiKey }),
        ...(region === undefined ? {} : { region }),
        ...(endpoint === undefined ? {} : { endpoint }),
        ...(baseUrl === undefined ? {} : { baseUrl }),
        ...(voice == null ? {} : { voice }),
        ...(lang == null ? {} : { lang }),
        ...(outputFormat == null ? {} : { outputFormat }),
      };
    },
    // Maps talk request parameters onto override fields, omitting absent ones.
    resolveTalkOverrides: ({ params }) => {
      const voice = trimToUndefined(params.voiceId);
      const lang = trimToUndefined(params.languageCode);
      const outputFormat = trimToUndefined(params.outputFormat);
      return {
        ...(voice == null ? {} : { voice }),
        ...(lang == null ? {} : { lang }),
        ...(outputFormat == null ? {} : { outputFormat }),
      };
    },
    // Lists voices. Throws when no API key can be resolved from the request,
    // the provider config, or the environment.
    listVoices: async (req) => {
      const config = req.providerConfig
        ? readAzureSpeechProviderConfig(req.providerConfig)
        : undefined;
      const apiKey = req.apiKey ?? (config ? resolveApiKey(config) : readAzureSpeechEnvApiKey());
      if (!apiKey) {
        throw new Error("Azure Speech API key missing");
      }
      return listAzureSpeechVoices({
        apiKey,
        baseUrl: req.baseUrl ?? config?.baseUrl,
        // Without a provider config, fall back to the environment for the
        // endpoint as well as the region, so an endpoint-only environment
        // setup is honoured here just as it is by isConfigured/synthesize.
        endpoint: config?.endpoint ?? readAzureSpeechEnvEndpoint(),
        region: config?.region ?? readAzureSpeechEnvRegion(),
        timeoutMs: config?.timeoutMs,
      });
    },
    // Configured means: an API key is resolvable AND at least one of base URL,
    // region or endpoint is known.
    isConfigured: ({ providerConfig }) => {
      const config = readAzureSpeechProviderConfig(providerConfig);
      return Boolean(resolveApiKey(config) && (config.baseUrl || config.region || config.endpoint));
    },
    // Synthesizes audio for the request. Throws when no API key is available.
    synthesize: async (req) => {
      const config = readAzureSpeechProviderConfig(req.providerConfig);
      const overrides = readAzureSpeechOverrides(req.providerOverrides);
      const apiKey = resolveApiKey(config);
      if (!apiKey) {
        throw new Error("Azure Speech API key missing");
      }
      // An explicit override wins; otherwise the format depends on the target.
      const outputFormat =
        overrides.outputFormat ??
        (req.target === "voice-note" ? config.voiceNoteOutputFormat : config.outputFormat);
      const audioBuffer = await azureSpeechTTS({
        text: req.text,
        apiKey,
        baseUrl: config.baseUrl,
        endpoint: config.endpoint,
        region: config.region,
        voice: overrides.voice ?? config.voice,
        lang: overrides.lang ?? config.lang,
        outputFormat,
        timeoutMs: resolveTimeoutMs(config, req.timeoutMs),
      });
      return {
        audioBuffer,
        outputFormat,
        fileExtension: inferAzureSpeechFileExtension(outputFormat),
        voiceCompatible: isAzureSpeechVoiceCompatible(outputFormat),
      };
    },
    // Synthesizes telephony audio. The output format is fixed to the telephony
    // default, so an `outputFormat` override is not consulted here. Throws when
    // no API key is available.
    synthesizeTelephony: async (req) => {
      const config = readAzureSpeechProviderConfig(req.providerConfig);
      const overrides = readAzureSpeechOverrides(req.providerOverrides);
      const apiKey = resolveApiKey(config);
      if (!apiKey) {
        throw new Error("Azure Speech API key missing");
      }
      // Reported alongside the fixed telephony format.
      const sampleRate = 8_000;
      const audioBuffer = await azureSpeechTTS({
        text: req.text,
        apiKey,
        baseUrl: config.baseUrl,
        endpoint: config.endpoint,
        region: config.region,
        voice: overrides.voice ?? config.voice,
        lang: overrides.lang ?? config.lang,
        outputFormat: DEFAULT_AZURE_SPEECH_TELEPHONY_FORMAT,
        timeoutMs: resolveTimeoutMs(config, req.timeoutMs),
      });
      return {
        audioBuffer,
        outputFormat: DEFAULT_AZURE_SPEECH_TELEPHONY_FORMAT,
        sampleRate,
      };
    },
  };
}