diff --git a/.github/labeler.yml b/.github/labeler.yml index b55c5fe3b21..045cb538252 100644 --- a/.github/labeler.yml +++ b/.github/labeler.yml @@ -3,6 +3,12 @@ - any-glob-to-any-file: - "extensions/bluebubbles/**" - "docs/channels/bluebubbles.md" +"plugin: azure-speech": + - changed-files: + - any-glob-to-any-file: + - "extensions/azure-speech/**" + - "docs/providers/azure-speech.md" + - "docs/tools/tts.md" "channel: discord": - changed-files: - any-glob-to-any-file: diff --git a/CHANGELOG.md b/CHANGELOG.md index fda98454798..2f46c5d0dce 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,9 @@ Docs: https://docs.openclaw.ai ### Changes +- Providers/Azure Speech: add Azure Speech as a bundled TTS provider with + Speech-resource auth, voice listing, SSML escaping, native Ogg/Opus + voice-note output, and telephony output. (#51776) Thanks @leonchui. - CLI/image generation: expose generic `--background` on `openclaw infer image generate` and `openclaw infer image edit`, keep `--openai-background` as an OpenAI alias, and let fal image generation honor diff --git a/docs/.i18n/glossary.zh-CN.json b/docs/.i18n/glossary.zh-CN.json index 6c12a9ae53a..3a16f990b14 100644 --- a/docs/.i18n/glossary.zh-CN.json +++ b/docs/.i18n/glossary.zh-CN.json @@ -11,6 +11,14 @@ "source": "OpenAI provider", "target": "OpenAI provider" }, + { + "source": "Azure Speech", + "target": "Azure Speech" + }, + { + "source": "Azure Speech provider", + "target": "Azure Speech provider" + }, { "source": "Status", "target": "Status" diff --git a/docs/docs.json b/docs/docs.json index 18cc6e8691d..9252d129954 100644 --- a/docs/docs.json +++ b/docs/docs.json @@ -1301,6 +1301,7 @@ "providers/bedrock-mantle", "providers/anthropic", "providers/arcee", + "providers/azure-speech", "providers/chutes", "providers/claude-max-api-proxy", "providers/cloudflare-ai-gateway", diff --git a/docs/providers/azure-speech.md b/docs/providers/azure-speech.md new file mode 100644 index 00000000000..04b300bc2e0 --- /dev/null +++ b/docs/providers/azure-speech.md @@ -0,0 +1,119 @@ +--- +summary: "Azure AI Speech text-to-speech for OpenClaw replies" +read_when: + - You want Azure Speech synthesis for outbound replies + - You need native Ogg Opus voice-note output from Azure Speech +title: "Azure Speech" +--- + +Azure Speech is an Azure AI Speech text-to-speech provider. In OpenClaw it +synthesizes outbound reply audio as MP3 by default, native Ogg/Opus for voice +notes, and 8 kHz mulaw audio for telephony channels such as Voice Call. + +OpenClaw uses the Azure Speech REST API directly with SSML and sends the +provider-owned output format through `X-Microsoft-OutputFormat`. + +| Detail | Value | +| ----------------------- | -------------------------------------------------------------------------------------------------------------- | +| Website | [Azure AI Speech](https://azure.microsoft.com/products/ai-services/ai-speech) | +| Docs | [Speech REST text-to-speech](https://learn.microsoft.com/azure/ai-services/speech-service/rest-text-to-speech) | +| Auth | `AZURE_SPEECH_KEY` plus `AZURE_SPEECH_REGION` | +| Default voice | `en-US-JennyNeural` | +| Default file output | `audio-24khz-48kbitrate-mono-mp3` | +| Default voice-note file | `ogg-24khz-16bit-mono-opus` | + +## Getting started + + + + In the Azure portal, create a Speech resource. Copy **KEY 1** from + Resource Management > Keys and Endpoint, and copy the resource location + such as `eastus`. 
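If you want to sanity-check the key and region before wiring them into OpenClaw, you can call the same voices endpoint the plugin uses. A minimal sketch (the key value is a placeholder for **KEY 1** from the portal):

```ts
// List voices to confirm the Speech key and region work before configuring OpenClaw.
const region = "eastus"; // your Speech resource location
const res = await fetch(
  `https://${region}.tts.speech.microsoft.com/cognitiveservices/voices/list`,
  { headers: { "Ocp-Apim-Subscription-Key": "KEY 1 from the portal" } },
);
console.log(res.status, ((await res.json()) as unknown[]).length); // expect 200 and 100+ voices
```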
+ + ``` + AZURE_SPEECH_KEY= + AZURE_SPEECH_REGION=eastus + ``` + + + + ```json5 + { + messages: { + tts: { + auto: "always", + provider: "azure-speech", + providers: { + "azure-speech": { + voice: "en-US-JennyNeural", + lang: "en-US", + }, + }, + }, + }, + } + ``` + + + Send a reply through any connected channel. OpenClaw synthesizes the audio + with Azure Speech and delivers MP3 for standard audio, or Ogg/Opus when + the channel expects a voice note. + + + +## Configuration options + +| Option | Path | Description | +| ----------------------- | ----------------------------------------------------------- | ----------------------------------------------------------------------------------------------------- | +| `apiKey` | `messages.tts.providers.azure-speech.apiKey` | Azure Speech resource key. Falls back to `AZURE_SPEECH_KEY`, `AZURE_SPEECH_API_KEY`, or `SPEECH_KEY`. | +| `region` | `messages.tts.providers.azure-speech.region` | Azure Speech resource region. Falls back to `AZURE_SPEECH_REGION` or `SPEECH_REGION`. | +| `endpoint` | `messages.tts.providers.azure-speech.endpoint` | Optional Azure Speech endpoint/base URL override. | +| `baseUrl` | `messages.tts.providers.azure-speech.baseUrl` | Optional Azure Speech base URL override. | +| `voice` | `messages.tts.providers.azure-speech.voice` | Azure voice ShortName (default `en-US-JennyNeural`). | +| `lang` | `messages.tts.providers.azure-speech.lang` | SSML language code (default `en-US`). | +| `outputFormat` | `messages.tts.providers.azure-speech.outputFormat` | Audio-file output format (default `audio-24khz-48kbitrate-mono-mp3`). | +| `voiceNoteOutputFormat` | `messages.tts.providers.azure-speech.voiceNoteOutputFormat` | Voice-note output format (default `ogg-24khz-16bit-mono-opus`). | + +## Notes + + + + Azure Speech uses a Speech resource key, not an Azure OpenAI key. The key + is sent as `Ocp-Apim-Subscription-Key`; OpenClaw derives + `https://.tts.speech.microsoft.com` from `region` unless you + provide `endpoint` or `baseUrl`. + + + Use the Azure Speech voice `ShortName` value, for example + `en-US-JennyNeural`. The bundled provider can list voices through the + same Speech resource and filters voices marked deprecated or retired. + + + Azure accepts output formats such as `audio-24khz-48kbitrate-mono-mp3`, + `ogg-24khz-16bit-mono-opus`, and `riff-24khz-16bit-mono-pcm`. OpenClaw + requests Ogg/Opus for `voice-note` targets so channels can send native + voice bubbles without an extra MP3 conversion. + + + `azure` is accepted as a provider alias for existing PRs and user config, + but new config should use `azure-speech` to avoid confusion with Azure + OpenAI model providers. + + + +## Related + + + + TTS overview, providers, and `messages.tts` config. + + + Full config reference including `messages.tts` settings. + + + All bundled OpenClaw providers. + + + Common issues and debugging steps. 
+ + diff --git a/docs/providers/index.md b/docs/providers/index.md index 918767f8d25..839315436b1 100644 --- a/docs/providers/index.md +++ b/docs/providers/index.md @@ -31,6 +31,7 @@ Looking for chat channel docs (WhatsApp/Telegram/Discord/Slack/Mattermost (plugi - [Amazon Bedrock Mantle](/providers/bedrock-mantle) - [Anthropic (API + Claude CLI)](/providers/anthropic) - [Arcee AI (Trinity models)](/providers/arcee) +- [Azure Speech](/providers/azure-speech) - [BytePlus (International)](/concepts/model-providers#byteplus-international) - [Chutes](/providers/chutes) - [Cloudflare AI Gateway](/providers/cloudflare-ai-gateway) diff --git a/docs/tools/tts.md b/docs/tools/tts.md index 0a8607c7f35..c8a8c20f98c 100644 --- a/docs/tools/tts.md +++ b/docs/tools/tts.md @@ -7,11 +7,12 @@ read_when: title: "Text-to-speech" --- -OpenClaw can convert outbound replies into audio using ElevenLabs, Google Gemini, Gradium, Inworld, Local CLI, Microsoft, MiniMax, OpenAI, Volcengine, Vydra, xAI, or Xiaomi MiMo. +OpenClaw can convert outbound replies into audio using Azure Speech, ElevenLabs, Google Gemini, Gradium, Inworld, Local CLI, Microsoft, MiniMax, OpenAI, Volcengine, Vydra, xAI, or Xiaomi MiMo. It works anywhere OpenClaw can send audio. ## Supported services +- **Azure Speech** (primary or fallback provider; uses the Azure AI Speech REST API) - **ElevenLabs** (primary or fallback provider) - **Google Gemini** (primary or fallback provider; uses Gemini API TTS) - **Gradium** (primary or fallback provider; supports voice-note and telephony output) @@ -40,8 +41,10 @@ or ElevenLabs. ## Optional keys -If you want ElevenLabs, Google Gemini, Gradium, Inworld, MiniMax, OpenAI, Volcengine, Vydra, xAI, or Xiaomi MiMo: +If you want Azure Speech, ElevenLabs, Google Gemini, Gradium, Inworld, MiniMax, OpenAI, Volcengine, Vydra, xAI, or Xiaomi MiMo: +- `AZURE_SPEECH_KEY` plus `AZURE_SPEECH_REGION` (also accepts + `AZURE_SPEECH_API_KEY`, `SPEECH_KEY`, and `SPEECH_REGION`) - `ELEVENLABS_API_KEY` (or `XI_API_KEY`) - `GEMINI_API_KEY` (or `GOOGLE_API_KEY`) - `GRADIUM_API_KEY` @@ -67,6 +70,8 @@ so that provider must also be authenticated if you enable summaries. - [OpenAI Text-to-Speech guide](https://platform.openai.com/docs/guides/text-to-speech) - [OpenAI Audio API reference](https://platform.openai.com/docs/api-reference/audio) +- [Azure Speech REST text-to-speech](https://learn.microsoft.com/azure/ai-services/speech-service/rest-text-to-speech) +- [Azure Speech provider](/providers/azure-speech) - [ElevenLabs Text to Speech](https://elevenlabs.io/docs/api-reference/text-to-speech) - [ElevenLabs Authentication](https://elevenlabs.io/docs/api-reference/authentication) - [Gradium](/providers/gradium) @@ -145,6 +150,36 @@ Full schema is in [Gateway configuration](/gateway/configuration). } ``` +### Azure Speech primary + +```json5 +{ + messages: { + tts: { + auto: "always", + provider: "azure-speech", + providers: { + "azure-speech": { + // apiKey falls back to AZURE_SPEECH_KEY. + // region falls back to AZURE_SPEECH_REGION. + voice: "en-US-JennyNeural", + lang: "en-US", + outputFormat: "audio-24khz-48kbitrate-mono-mp3", + voiceNoteOutputFormat: "ogg-24khz-16bit-mono-opus", + }, + }, + }, + }, +} +``` + +Azure Speech uses a Speech resource key, not an Azure OpenAI key. Resolution +order is `messages.tts.providers.azure-speech.apiKey` -> +`AZURE_SPEECH_KEY` -> `AZURE_SPEECH_API_KEY` -> `SPEECH_KEY`, plus +`messages.tts.providers.azure-speech.region` -> `AZURE_SPEECH_REGION` -> +`SPEECH_REGION` for the region. 
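If you already have config keyed under the legacy `azure` name, it resolves to the same provider; a minimal sketch, assuming the alias is also honored in the `provider` field:

```json5
{
  messages: {
    tts: {
      provider: "azure", // alias; resolves to azure-speech
      providers: {
        azure: { voice: "en-US-JennyNeural" },
      },
    },
  },
}
```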
New config should use `azure-speech`; `azure` +is accepted as a provider alias. + ### Microsoft primary (no API key) ```json5 @@ -495,7 +530,21 @@ Then run: - `maxTextLength`: hard cap for TTS input (chars). `/tts audio` fails if exceeded. - `timeoutMs`: request timeout (ms). - `prefsPath`: override the local prefs JSON path (provider/limit/summary). -- `apiKey` values fall back to env vars (`ELEVENLABS_API_KEY`/`XI_API_KEY`, `GEMINI_API_KEY`/`GOOGLE_API_KEY`, `GRADIUM_API_KEY`, `INWORLD_API_KEY`, `MINIMAX_API_KEY`, `OPENAI_API_KEY`, `VYDRA_API_KEY`, `XAI_API_KEY`, `XIAOMI_API_KEY`). Volcengine uses `appId`/`token` instead. +- `apiKey` values fall back to env vars (`AZURE_SPEECH_KEY`/`AZURE_SPEECH_API_KEY`/`SPEECH_KEY`, `ELEVENLABS_API_KEY`/`XI_API_KEY`, `GEMINI_API_KEY`/`GOOGLE_API_KEY`, `GRADIUM_API_KEY`, `INWORLD_API_KEY`, `MINIMAX_API_KEY`, `OPENAI_API_KEY`, `VYDRA_API_KEY`, `XAI_API_KEY`, `XIAOMI_API_KEY`). Volcengine uses `appId`/`token` instead. +- `providers.azure-speech.apiKey`: Azure Speech resource key (env: + `AZURE_SPEECH_KEY`, `AZURE_SPEECH_API_KEY`, or `SPEECH_KEY`). +- `providers.azure-speech.region`: Azure Speech region such as `eastus` (env: + `AZURE_SPEECH_REGION` or `SPEECH_REGION`). +- `providers.azure-speech.endpoint` / `providers.azure-speech.baseUrl`: optional + Azure Speech endpoint/base URL override. +- `providers.azure-speech.voice`: Azure voice ShortName (default + `en-US-JennyNeural`). +- `providers.azure-speech.lang`: SSML language code (default `en-US`). +- `providers.azure-speech.outputFormat`: Azure `X-Microsoft-OutputFormat` for + standard audio output (default `audio-24khz-48kbitrate-mono-mp3`). +- `providers.azure-speech.voiceNoteOutputFormat`: Azure + `X-Microsoft-OutputFormat` for voice-note output (default + `ogg-24khz-16bit-mono-opus`). - `providers.elevenlabs.baseUrl`: override ElevenLabs API base URL. - `providers.openai.baseUrl`: override the OpenAI TTS endpoint. - Resolution order: `messages.tts.providers.openai.baseUrl` -> `OPENAI_TTS_BASE_URL` -> `https://api.openai.com/v1` diff --git a/extensions/azure-speech/azure-speech.live.test.ts b/extensions/azure-speech/azure-speech.live.test.ts new file mode 100644 index 00000000000..19ecbcbef86 --- /dev/null +++ b/extensions/azure-speech/azure-speech.live.test.ts @@ -0,0 +1,94 @@ +import { describe, expect, it } from "vitest"; +import { isLiveTestEnabled } from "../../src/agents/live-test-helpers.js"; +import { + registerProviderPlugin, + requireRegisteredProvider, +} from "../../test/helpers/plugins/provider-registration.js"; +import plugin from "./index.js"; + +const AZURE_SPEECH_KEY = + process.env.AZURE_SPEECH_KEY?.trim() ?? + process.env.AZURE_SPEECH_API_KEY?.trim() ?? + process.env.SPEECH_KEY?.trim() ?? + ""; +const AZURE_SPEECH_REGION = + process.env.AZURE_SPEECH_REGION?.trim() ?? process.env.SPEECH_REGION?.trim() ?? ""; +const LIVE = isLiveTestEnabled() && AZURE_SPEECH_KEY.length > 0 && AZURE_SPEECH_REGION.length > 0; +const describeLive = LIVE ? 
describe : describe.skip; + +const registerAzureSpeechPlugin = () => + registerProviderPlugin({ + plugin, + id: "azure-speech", + name: "Azure Speech", + }); + +describeLive("azure speech plugin live", () => { + it("lists voices through the registered speech provider", async () => { + const { speechProviders } = await registerAzureSpeechPlugin(); + const provider = requireRegisteredProvider(speechProviders, "azure-speech"); + + const voices = await provider.listVoices?.({ + providerConfig: { + apiKey: AZURE_SPEECH_KEY, + region: AZURE_SPEECH_REGION, + }, + }); + + expect(voices?.length).toBeGreaterThan(100); + expect(voices).toEqual( + expect.arrayContaining([expect.objectContaining({ id: "en-US-JennyNeural" })]), + ); + }, 120_000); + + it("synthesizes MP3, native Ogg/Opus voice notes, and telephony audio", async () => { + const { speechProviders } = await registerAzureSpeechPlugin(); + const provider = requireRegisteredProvider(speechProviders, "azure-speech"); + const providerConfig = { + apiKey: AZURE_SPEECH_KEY, + region: AZURE_SPEECH_REGION, + voice: "en-US-JennyNeural", + lang: "en-US", + }; + + const audioFile = await provider.synthesize({ + text: "OpenClaw Azure Speech text to speech integration test OK.", + cfg: { plugins: { enabled: true } } as never, + providerConfig, + target: "audio-file", + timeoutMs: 90_000, + }); + + expect(audioFile.outputFormat).toBe("audio-24khz-48kbitrate-mono-mp3"); + expect(audioFile.fileExtension).toBe(".mp3"); + expect(audioFile.voiceCompatible).toBe(false); + expect(audioFile.audioBuffer.byteLength).toBeGreaterThan(512); + + const voiceNote = await provider.synthesize({ + text: "OpenClaw Azure Speech voice note integration test OK.", + cfg: { plugins: { enabled: true } } as never, + providerConfig, + target: "voice-note", + timeoutMs: 90_000, + }); + + expect(voiceNote.outputFormat).toBe("ogg-24khz-16bit-mono-opus"); + expect(voiceNote.fileExtension).toBe(".ogg"); + expect(voiceNote.voiceCompatible).toBe(true); + expect(voiceNote.audioBuffer.byteLength).toBeGreaterThan(128); + expect(voiceNote.audioBuffer.subarray(0, 4).toString("ascii")).toBe("OggS"); + + const telephony = await provider.synthesizeTelephony?.({ + text: "OpenClaw Azure Speech telephony check OK.", + cfg: { plugins: { enabled: true } } as never, + providerConfig, + timeoutMs: 90_000, + }); + if (!telephony) { + throw new Error("Azure Speech telephony synthesis did not return audio"); + } + expect(telephony.outputFormat).toBe("raw-8khz-8bit-mono-mulaw"); + expect(telephony.sampleRate).toBe(8_000); + expect(telephony.audioBuffer.byteLength).toBeGreaterThan(512); + }, 180_000); +}); diff --git a/extensions/azure-speech/index.ts b/extensions/azure-speech/index.ts new file mode 100644 index 00000000000..2564b70a36c --- /dev/null +++ b/extensions/azure-speech/index.ts @@ -0,0 +1,11 @@ +import { definePluginEntry } from "openclaw/plugin-sdk/plugin-entry"; +import { buildAzureSpeechProvider } from "./speech-provider.js"; + +export default definePluginEntry({ + id: "azure-speech", + name: "Azure Speech", + description: "Bundled Azure Speech provider", + register(api) { + api.registerSpeechProvider(buildAzureSpeechProvider()); + }, +}); diff --git a/extensions/azure-speech/openclaw.plugin.json b/extensions/azure-speech/openclaw.plugin.json new file mode 100644 index 00000000000..ffa9bae0235 --- /dev/null +++ b/extensions/azure-speech/openclaw.plugin.json @@ -0,0 +1,63 @@ +{ + "id": "azure-speech", + "enabledByDefault": true, + "name": "Azure Speech", + "description": "Azure AI Speech 
text-to-speech (MP3, native Ogg/Opus voice notes, PCM telephony).",
  "providerAuthEnvVars": {
    "azure-speech": [
      "AZURE_SPEECH_KEY",
      "AZURE_SPEECH_API_KEY",
      "SPEECH_KEY",
      "AZURE_SPEECH_REGION",
      "SPEECH_REGION"
    ],
    "azure": [
      "AZURE_SPEECH_KEY",
      "AZURE_SPEECH_API_KEY",
      "SPEECH_KEY",
      "AZURE_SPEECH_REGION",
      "SPEECH_REGION"
    ]
  },
  "contracts": {
    "speechProviders": ["azure-speech", "azure"]
  },
  "configSchema": {
    "type": "object",
    "additionalProperties": false,
    "properties": {
      "apiKey": {
        "type": "string",
        "description": "Azure Speech resource key. Falls back to AZURE_SPEECH_KEY, AZURE_SPEECH_API_KEY, or SPEECH_KEY."
      },
      "region": {
        "type": "string",
        "description": "Azure Speech resource region, for example eastus. Falls back to AZURE_SPEECH_REGION or SPEECH_REGION."
      },
      "endpoint": {
        "type": "string",
        "description": "Optional Azure Speech endpoint/base URL override."
      },
      "baseUrl": {
        "type": "string",
        "description": "Optional Azure Speech base URL override."
      },
      "voice": {
        "type": "string",
        "description": "Azure Speech voice ShortName (default en-US-JennyNeural)."
      },
      "lang": {
        "type": "string",
        "description": "SSML language code (default en-US)."
      },
      "outputFormat": {
        "type": "string",
        "description": "Azure Speech X-Microsoft-OutputFormat for audio-file output."
      },
      "voiceNoteOutputFormat": {
        "type": "string",
        "description": "Azure Speech X-Microsoft-OutputFormat for voice-note output."
      }
    }
  }
}
diff --git a/extensions/azure-speech/package.json b/extensions/azure-speech/package.json
new file mode 100644
index 00000000000..0b9fb1aea07
--- /dev/null
+++ b/extensions/azure-speech/package.json
@@ -0,0 +1,15 @@
{
  "name": "@openclaw/azure-speech",
  "version": "2026.4.16",
  "private": true,
  "description": "OpenClaw Azure Speech plugin",
  "type": "module",
  "devDependencies": {
    "@openclaw/plugin-sdk": "workspace:*"
  },
  "openclaw": {
    "extensions": [
      "./index.ts"
    ]
  }
}
diff --git a/extensions/azure-speech/speech-provider.test.ts b/extensions/azure-speech/speech-provider.test.ts
new file mode 100644
index 00000000000..40d32ec32e5
--- /dev/null
+++ b/extensions/azure-speech/speech-provider.test.ts
@@ -0,0 +1,194 @@
import { afterEach, describe, expect, it, vi } from "vitest";

const { azureSpeechTTSMock, listAzureSpeechVoicesMock } = vi.hoisted(() => ({
  azureSpeechTTSMock: vi.fn(async () => Buffer.from("audio-bytes")),
  listAzureSpeechVoicesMock: vi.fn(async () => [{ id: "en-US-JennyNeural", name: "Jenny" }]),
}));

vi.mock("./tts.js", async (importOriginal) => {
  const actual = await importOriginal<typeof import("./tts.js")>();
  return {
    ...actual,
    azureSpeechTTS: azureSpeechTTSMock,
    listAzureSpeechVoices: listAzureSpeechVoicesMock,
  };
});

import { buildAzureSpeechProvider } from "./speech-provider.js";

describe("buildAzureSpeechProvider", () => {
  const originalEnv = {
    AZURE_SPEECH_KEY: process.env.AZURE_SPEECH_KEY,
    AZURE_SPEECH_API_KEY: process.env.AZURE_SPEECH_API_KEY,
    AZURE_SPEECH_REGION: process.env.AZURE_SPEECH_REGION,
    AZURE_SPEECH_ENDPOINT: process.env.AZURE_SPEECH_ENDPOINT,
    SPEECH_KEY: process.env.SPEECH_KEY,
    SPEECH_REGION: process.env.SPEECH_REGION,
  };

  afterEach(() => {
    for (const [key, value] of Object.entries(originalEnv)) {
      if (value === undefined) {
        delete process.env[key];
      } else {
        process.env[key] = value;
      }
    }
    azureSpeechTTSMock.mockClear();
    listAzureSpeechVoicesMock.mockClear();
    vi.restoreAllMocks();
  });

  it("reports
configured only when key plus region or endpoint is available", () => { + const provider = buildAzureSpeechProvider(); + delete process.env.AZURE_SPEECH_KEY; + delete process.env.AZURE_SPEECH_API_KEY; + delete process.env.SPEECH_KEY; + delete process.env.AZURE_SPEECH_REGION; + delete process.env.SPEECH_REGION; + delete process.env.AZURE_SPEECH_ENDPOINT; + + expect(provider.isConfigured({ providerConfig: {}, timeoutMs: 30_000 })).toBe(false); + expect(provider.isConfigured({ providerConfig: { apiKey: "key" }, timeoutMs: 30_000 })).toBe( + false, + ); + expect( + provider.isConfigured({ + providerConfig: { apiKey: "key", region: "eastus" }, + timeoutMs: 30_000, + }), + ).toBe(true); + + process.env.AZURE_SPEECH_KEY = "env-key"; + process.env.AZURE_SPEECH_REGION = "eastus"; + expect(provider.isConfigured({ providerConfig: {}, timeoutMs: 30_000 })).toBe(true); + }); + + it("normalizes provider-owned config under canonical and alias keys", () => { + const provider = buildAzureSpeechProvider(); + const canonical = provider.resolveConfig?.({ + cfg: {} as never, + timeoutMs: 30_000, + rawConfig: { + providers: { + "azure-speech": { + apiKey: "key", + region: "eastus", + voice: "en-US-AriaNeural", + lang: "en-US", + }, + }, + }, + }); + const alias = provider.resolveConfig?.({ + cfg: {} as never, + timeoutMs: 30_000, + rawConfig: { + providers: { + azure: { + apiKey: "alias-key", + endpoint: "https://westus.tts.speech.microsoft.com/cognitiveservices/v1", + }, + }, + }, + }); + + expect(canonical).toEqual( + expect.objectContaining({ + apiKey: "key", + region: "eastus", + baseUrl: "https://eastus.tts.speech.microsoft.com", + voice: "en-US-AriaNeural", + }), + ); + expect(alias).toEqual( + expect.objectContaining({ + apiKey: "alias-key", + endpoint: "https://westus.tts.speech.microsoft.com/cognitiveservices/v1", + baseUrl: "https://westus.tts.speech.microsoft.com", + }), + ); + }); + + it("parses provider-specific TTS directives", () => { + const provider = buildAzureSpeechProvider(); + const policy = { + enabled: true, + allowText: true, + allowProvider: true, + allowVoice: true, + allowModelId: true, + allowVoiceSettings: true, + allowNormalization: true, + allowSeed: true, + }; + + expect(provider.parseDirectiveToken?.({ key: "azure_voice", value: "v", policy })).toEqual({ + handled: true, + overrides: { voice: "v" }, + }); + expect(provider.parseDirectiveToken?.({ key: "azure_lang", value: "en-US", policy })).toEqual({ + handled: true, + overrides: { lang: "en-US" }, + }); + expect( + provider.parseDirectiveToken?.({ key: "azure_output_format", value: "ogg", policy }), + ).toEqual({ + handled: true, + overrides: { outputFormat: "ogg" }, + }); + }); + + it("uses native Ogg/Opus for voice-note output", async () => { + const provider = buildAzureSpeechProvider(); + const result = await provider.synthesize({ + text: "hello", + cfg: {} as never, + providerConfig: { + apiKey: "key", + region: "eastus", + voice: "en-US-JennyNeural", + }, + providerOverrides: { + voice: "en-US-AriaNeural", + lang: "en-US", + }, + target: "voice-note", + timeoutMs: 30_000, + }); + + expect(azureSpeechTTSMock).toHaveBeenCalledWith({ + text: "hello", + apiKey: "key", + baseUrl: "https://eastus.tts.speech.microsoft.com", + endpoint: undefined, + region: "eastus", + voice: "en-US-AriaNeural", + lang: "en-US", + outputFormat: "ogg-24khz-16bit-mono-opus", + timeoutMs: 30_000, + }); + expect(result).toEqual({ + audioBuffer: Buffer.from("audio-bytes"), + outputFormat: "ogg-24khz-16bit-mono-opus", + fileExtension: ".ogg", + 
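      // voiceCompatible is true only for Ogg/Opus output, which channels can send as native voice notes.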
voiceCompatible: true,
    });
  });

  it("lists voices through config or explicit request auth", async () => {
    const provider = buildAzureSpeechProvider();
    const voices = await provider.listVoices?.({
      providerConfig: { apiKey: "key", region: "eastus" },
    });

    expect(voices).toEqual([{ id: "en-US-JennyNeural", name: "Jenny" }]);
    expect(listAzureSpeechVoicesMock).toHaveBeenCalledWith({
      apiKey: "key",
      baseUrl: "https://eastus.tts.speech.microsoft.com",
      endpoint: undefined,
      region: "eastus",
      timeoutMs: undefined,
    });
  });
});
diff --git a/extensions/azure-speech/speech-provider.ts b/extensions/azure-speech/speech-provider.ts
new file mode 100644
index 00000000000..22fcc637ea5
--- /dev/null
+++ b/extensions/azure-speech/speech-provider.ts
@@ -0,0 +1,305 @@
import { normalizeResolvedSecretInputString } from "openclaw/plugin-sdk/secret-input";
import type {
  SpeechDirectiveTokenParseContext,
  SpeechProviderConfig,
  SpeechProviderOverrides,
  SpeechProviderPlugin,
} from "openclaw/plugin-sdk/speech-core";
import { asFiniteNumber, asObject, trimToUndefined } from "openclaw/plugin-sdk/speech-core";
import {
  azureSpeechTTS,
  DEFAULT_AZURE_SPEECH_AUDIO_FORMAT,
  DEFAULT_AZURE_SPEECH_LANG,
  DEFAULT_AZURE_SPEECH_TELEPHONY_FORMAT,
  DEFAULT_AZURE_SPEECH_VOICE,
  DEFAULT_AZURE_SPEECH_VOICE_NOTE_FORMAT,
  inferAzureSpeechFileExtension,
  isAzureSpeechVoiceCompatible,
  listAzureSpeechVoices,
  normalizeAzureSpeechBaseUrl,
} from "./tts.js";

type AzureSpeechProviderConfig = {
  apiKey?: string;
  region?: string;
  endpoint?: string;
  baseUrl?: string;
  voice: string;
  lang: string;
  outputFormat: string;
  voiceNoteOutputFormat: string;
  timeoutMs?: number;
};

type AzureSpeechProviderOverrides = {
  voice?: string;
  lang?: string;
  outputFormat?: string;
};

function readAzureSpeechEnvApiKey(): string | undefined {
  return (
    trimToUndefined(process.env.AZURE_SPEECH_KEY) ??
    trimToUndefined(process.env.AZURE_SPEECH_API_KEY) ??
    trimToUndefined(process.env.SPEECH_KEY)
  );
}

function readAzureSpeechEnvRegion(): string | undefined {
  return (
    trimToUndefined(process.env.AZURE_SPEECH_REGION) ?? trimToUndefined(process.env.SPEECH_REGION)
  );
}

function readAzureSpeechEnvEndpoint(): string | undefined {
  return trimToUndefined(process.env.AZURE_SPEECH_ENDPOINT);
}

function resolveAzureSpeechConfigRecord(
  rawConfig: Record<string, unknown>,
): Record<string, unknown> | undefined {
  const providers = asObject(rawConfig.providers);
  return (
    asObject(providers?.["azure-speech"]) ??
    asObject(providers?.azure) ??
    asObject(rawConfig["azure-speech"]) ??
    asObject(rawConfig.azure)
  );
}

function normalizeAzureSpeechProviderConfig(
  rawConfig: Record<string, unknown>,
): AzureSpeechProviderConfig {
  const raw = resolveAzureSpeechConfigRecord(rawConfig);
  const region = trimToUndefined(raw?.region) ?? readAzureSpeechEnvRegion();
  const endpoint = trimToUndefined(raw?.endpoint) ?? readAzureSpeechEnvEndpoint();
  const baseUrl = normalizeAzureSpeechBaseUrl({
    baseUrl: trimToUndefined(raw?.baseUrl),
    endpoint,
    region,
  });
  return {
    apiKey: normalizeResolvedSecretInputString({
      value: raw?.apiKey,
      path: "messages.tts.providers.azure-speech.apiKey",
    }),
    region,
    endpoint,
    baseUrl,
    voice: trimToUndefined(raw?.voice ?? raw?.voiceId) ?? DEFAULT_AZURE_SPEECH_VOICE,
    lang: trimToUndefined(raw?.lang ?? raw?.languageCode) ?? DEFAULT_AZURE_SPEECH_LANG,
    outputFormat: trimToUndefined(raw?.outputFormat) ??
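      // MP3 is the default file output; voice notes use voiceNoteOutputFormat (Ogg/Opus) below.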
DEFAULT_AZURE_SPEECH_AUDIO_FORMAT, + voiceNoteOutputFormat: + trimToUndefined(raw?.voiceNoteOutputFormat) ?? DEFAULT_AZURE_SPEECH_VOICE_NOTE_FORMAT, + timeoutMs: asFiniteNumber(raw?.timeoutMs), + }; +} + +function readAzureSpeechProviderConfig(config: SpeechProviderConfig): AzureSpeechProviderConfig { + const defaults = normalizeAzureSpeechProviderConfig({}); + const region = trimToUndefined(config.region) ?? defaults.region; + const endpoint = trimToUndefined(config.endpoint) ?? defaults.endpoint; + const baseUrl = normalizeAzureSpeechBaseUrl({ + baseUrl: trimToUndefined(config.baseUrl) ?? defaults.baseUrl, + endpoint, + region, + }); + return { + apiKey: trimToUndefined(config.apiKey) ?? defaults.apiKey, + region, + endpoint, + baseUrl, + voice: trimToUndefined(config.voice ?? config.voiceId) ?? defaults.voice, + lang: trimToUndefined(config.lang ?? config.languageCode) ?? defaults.lang, + outputFormat: trimToUndefined(config.outputFormat) ?? defaults.outputFormat, + voiceNoteOutputFormat: + trimToUndefined(config.voiceNoteOutputFormat) ?? defaults.voiceNoteOutputFormat, + timeoutMs: asFiniteNumber(config.timeoutMs) ?? defaults.timeoutMs, + }; +} + +function readAzureSpeechOverrides( + overrides: SpeechProviderOverrides | undefined, +): AzureSpeechProviderOverrides { + if (!overrides) { + return {}; + } + return { + voice: trimToUndefined(overrides.voice ?? overrides.voiceId), + lang: trimToUndefined(overrides.lang ?? overrides.languageCode), + outputFormat: trimToUndefined(overrides.outputFormat), + }; +} + +function parseDirectiveToken(ctx: SpeechDirectiveTokenParseContext): { + handled: boolean; + overrides?: SpeechProviderOverrides; +} { + switch (ctx.key) { + case "voice": + case "voiceid": + case "voice_id": + case "azure_voice": + case "azurevoice": + case "azure_speech_voice": + if (!ctx.policy.allowVoice) { + return { handled: true }; + } + return { handled: true, overrides: { ...ctx.currentOverrides, voice: ctx.value } }; + case "lang": + case "language": + case "language_code": + case "languagecode": + case "azure_lang": + case "azure_language": + if (!ctx.policy.allowVoiceSettings) { + return { handled: true }; + } + return { handled: true, overrides: { ...ctx.currentOverrides, lang: ctx.value } }; + case "output_format": + case "outputformat": + case "azure_format": + case "azure_output_format": + if (!ctx.policy.allowVoiceSettings) { + return { handled: true }; + } + return { handled: true, overrides: { ...ctx.currentOverrides, outputFormat: ctx.value } }; + default: + return { handled: false }; + } +} + +function resolveApiKey(config: AzureSpeechProviderConfig): string | undefined { + return config.apiKey ?? readAzureSpeechEnvApiKey(); +} + +function resolveTimeoutMs(config: AzureSpeechProviderConfig, timeoutMs: number): number { + return config.timeoutMs ?? timeoutMs; +} + +export function buildAzureSpeechProvider(): SpeechProviderPlugin { + return { + id: "azure-speech", + label: "Azure Speech", + aliases: ["azure"], + autoSelectOrder: 30, + resolveConfig: ({ rawConfig }) => normalizeAzureSpeechProviderConfig(rawConfig), + parseDirectiveToken, + resolveTalkConfig: ({ baseTtsConfig, talkProviderConfig }) => { + const base = normalizeAzureSpeechProviderConfig(baseTtsConfig); + const apiKey = + talkProviderConfig.apiKey === undefined + ? 
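          // Talk config may omit the key; fall through so the base messages.tts key or env var applies.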
undefined + : normalizeResolvedSecretInputString({ + value: talkProviderConfig.apiKey, + path: "talk.providers.azure-speech.apiKey", + }); + const region = trimToUndefined(talkProviderConfig.region); + const endpoint = trimToUndefined(talkProviderConfig.endpoint ?? talkProviderConfig.baseUrl); + const baseUrl = normalizeAzureSpeechBaseUrl({ + baseUrl: trimToUndefined(talkProviderConfig.baseUrl), + endpoint, + region: region ?? base.region, + }); + return { + ...base, + ...(apiKey === undefined ? {} : { apiKey }), + ...(region === undefined ? {} : { region }), + ...(endpoint === undefined ? {} : { endpoint }), + ...(baseUrl === undefined ? {} : { baseUrl }), + ...(trimToUndefined(talkProviderConfig.voiceId) == null + ? {} + : { voice: trimToUndefined(talkProviderConfig.voiceId) }), + ...(trimToUndefined(talkProviderConfig.languageCode) == null + ? {} + : { lang: trimToUndefined(talkProviderConfig.languageCode) }), + ...(trimToUndefined(talkProviderConfig.outputFormat) == null + ? {} + : { outputFormat: trimToUndefined(talkProviderConfig.outputFormat) }), + }; + }, + resolveTalkOverrides: ({ params }) => ({ + ...(trimToUndefined(params.voiceId) == null + ? {} + : { voice: trimToUndefined(params.voiceId) }), + ...(trimToUndefined(params.languageCode) == null + ? {} + : { lang: trimToUndefined(params.languageCode) }), + ...(trimToUndefined(params.outputFormat) == null + ? {} + : { outputFormat: trimToUndefined(params.outputFormat) }), + }), + listVoices: async (req) => { + const config = req.providerConfig + ? readAzureSpeechProviderConfig(req.providerConfig) + : undefined; + const apiKey = req.apiKey ?? (config ? resolveApiKey(config) : readAzureSpeechEnvApiKey()); + if (!apiKey) { + throw new Error("Azure Speech API key missing"); + } + return listAzureSpeechVoices({ + apiKey, + baseUrl: req.baseUrl ?? config?.baseUrl, + endpoint: config?.endpoint, + region: config?.region ?? readAzureSpeechEnvRegion(), + timeoutMs: config?.timeoutMs, + }); + }, + isConfigured: ({ providerConfig }) => { + const config = readAzureSpeechProviderConfig(providerConfig); + return Boolean(resolveApiKey(config) && (config.baseUrl || config.region || config.endpoint)); + }, + synthesize: async (req) => { + const config = readAzureSpeechProviderConfig(req.providerConfig); + const overrides = readAzureSpeechOverrides(req.providerOverrides); + const apiKey = resolveApiKey(config); + if (!apiKey) { + throw new Error("Azure Speech API key missing"); + } + const outputFormat = + overrides.outputFormat ?? + (req.target === "voice-note" ? config.voiceNoteOutputFormat : config.outputFormat); + const audioBuffer = await azureSpeechTTS({ + text: req.text, + apiKey, + baseUrl: config.baseUrl, + endpoint: config.endpoint, + region: config.region, + voice: overrides.voice ?? config.voice, + lang: overrides.lang ?? 
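        // Per-message directive overrides (azure_voice / azure_lang / azure_output_format) win over provider config.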
config.lang,
        outputFormat,
        timeoutMs: resolveTimeoutMs(config, req.timeoutMs),
      });
      return {
        audioBuffer,
        outputFormat,
        fileExtension: inferAzureSpeechFileExtension(outputFormat),
        voiceCompatible: isAzureSpeechVoiceCompatible(outputFormat),
      };
    },
    synthesizeTelephony: async (req) => {
      const config = readAzureSpeechProviderConfig(req.providerConfig);
      const apiKey = resolveApiKey(config);
      if (!apiKey) {
        throw new Error("Azure Speech API key missing");
      }
      const sampleRate = 8_000;
      const audioBuffer = await azureSpeechTTS({
        text: req.text,
        apiKey,
        baseUrl: config.baseUrl,
        endpoint: config.endpoint,
        region: config.region,
        voice: config.voice,
        lang: config.lang,
        outputFormat: DEFAULT_AZURE_SPEECH_TELEPHONY_FORMAT,
        timeoutMs: resolveTimeoutMs(config, req.timeoutMs),
      });
      return {
        audioBuffer,
        outputFormat: DEFAULT_AZURE_SPEECH_TELEPHONY_FORMAT,
        sampleRate,
      };
    },
  };
}
diff --git a/extensions/azure-speech/tsconfig.json b/extensions/azure-speech/tsconfig.json
new file mode 100644
index 00000000000..b8a85a99ac3
--- /dev/null
+++ b/extensions/azure-speech/tsconfig.json
@@ -0,0 +1,16 @@
{
  "extends": "../tsconfig.package-boundary.base.json",
  "compilerOptions": {
    "rootDir": "."
  },
  "include": ["./*.ts", "./src/**/*.ts"],
  "exclude": [
    "./**/*.test.ts",
    "./dist/**",
    "./node_modules/**",
    "./src/test-support/**",
    "./src/**/*test-helpers.ts",
    "./src/**/*test-harness.ts",
    "./src/**/*test-support.ts"
  ]
}
diff --git a/extensions/azure-speech/tts.test.ts b/extensions/azure-speech/tts.test.ts
new file mode 100644
index 00000000000..1afdb574168
--- /dev/null
+++ b/extensions/azure-speech/tts.test.ts
@@ -0,0 +1,127 @@
import { installPinnedHostnameTestHooks } from "openclaw/plugin-sdk/testing";
import { afterEach, describe, expect, it, vi } from "vitest";
import {
  azureSpeechTTS,
  buildAzureSpeechSsml,
  inferAzureSpeechFileExtension,
  isAzureSpeechVoiceCompatible,
  listAzureSpeechVoices,
  normalizeAzureSpeechBaseUrl,
} from "./tts.js";

describe("azure speech tts", () => {
  installPinnedHostnameTestHooks();

  afterEach(() => {
    vi.unstubAllGlobals();
    vi.restoreAllMocks();
  });

  it("escapes SSML text and attributes", () => {
    expect(
      buildAzureSpeechSsml({
        text: `Tom & "Jerry" <tag>`,
        voice: `en-US-JennyNeural" xml:lang="evil`,
        lang: `en-US" bad="1`,
      }),
    ).toBe(
      `<speak version="1.0" xml:lang="en-US&quot; bad=&quot;1">` +
        `<voice name="en-US-JennyNeural&quot; xml:lang=&quot;evil">` +
        `Tom &amp; "Jerry" &lt;tag&gt;</voice></speak>`,
    );
  });

  it("normalizes region and endpoint routing", () => {
    expect(normalizeAzureSpeechBaseUrl({ region: "eastus" })).toBe(
      "https://eastus.tts.speech.microsoft.com",
    );
    expect(
      normalizeAzureSpeechBaseUrl({
        endpoint: "https://eastus.tts.speech.microsoft.com/cognitiveservices/v1/",
      }),
    ).toBe("https://eastus.tts.speech.microsoft.com");
    expect(normalizeAzureSpeechBaseUrl({ baseUrl: "https://custom.example.com/" })).toBe(
      "https://custom.example.com",
    );
  });

  it("maps Azure output formats to attachment metadata", () => {
    expect(inferAzureSpeechFileExtension("audio-24khz-48kbitrate-mono-mp3")).toBe(".mp3");
    expect(inferAzureSpeechFileExtension("ogg-24khz-16bit-mono-opus")).toBe(".ogg");
    expect(inferAzureSpeechFileExtension("riff-24khz-16bit-mono-pcm")).toBe(".wav");
    expect(inferAzureSpeechFileExtension("raw-8khz-8bit-mono-mulaw")).toBe(".pcm");
    expect(isAzureSpeechVoiceCompatible("ogg-24khz-16bit-mono-opus")).toBe(true);
    expect(isAzureSpeechVoiceCompatible("webm-24khz-16bit-mono-opus")).toBe(false);
  });
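  // The network tests below stub global fetch; nothing talks to a real Azure endpoint.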
  it("posts SSML to the region endpoint with Azure Speech headers", async () => {
    const fetchMock = vi.fn().mockResolvedValue(new Response(Buffer.from("mp3"), { status: 200 }));
    vi.stubGlobal("fetch", fetchMock);

    const result = await azureSpeechTTS({
      text: "hello",
      apiKey: "speech-key",
      region: "eastus",
      voice: "en-US-JennyNeural",
      lang: "en-US",
      outputFormat: "audio-24khz-48kbitrate-mono-mp3",
      timeoutMs: 1234,
    });

    expect(result).toEqual(Buffer.from("mp3"));
    expect(fetchMock).toHaveBeenCalledOnce();
    const [url, init] = fetchMock.mock.calls[0] as [string, RequestInit];
    expect(url).toBe("https://eastus.tts.speech.microsoft.com/cognitiveservices/v1");
    expect(init.method).toBe("POST");
    const headers = new Headers(init.headers);
    expect(headers.get("Ocp-Apim-Subscription-Key")).toBe("speech-key");
    expect(headers.get("Content-Type")).toBe("application/ssml+xml");
    expect(headers.get("X-Microsoft-OutputFormat")).toBe("audio-24khz-48kbitrate-mono-mp3");
    expect(init.body).toContain(`<voice name="en-US-JennyNeural">hello</voice>`);
    expect(init.signal).toBeInstanceOf(AbortSignal);
  });

  it("lists voices with timeout and filters deprecated entries", async () => {
    const fetchMock = vi.fn().mockResolvedValue(
      new Response(
        JSON.stringify([
          {
            ShortName: "en-US-JennyNeural",
            DisplayName: "Jenny",
            Locale: "en-US",
            Gender: "Female",
            Status: "GA",
            VoiceTag: { VoicePersonalities: ["Warm"] },
          },
          { ShortName: "en-US-OldNeural", DisplayName: "Old", Status: "Deprecated" },
          { ShortName: "en-US-RetiredNeural", DisplayName: "Retired", IsDeprecated: true },
        ]),
        { status: 200, headers: { "Content-Type": "application/json" } },
      ),
    );
    vi.stubGlobal("fetch", fetchMock);

    const voices = await listAzureSpeechVoices({
      apiKey: "speech-key",
      baseUrl: "https://custom.example.com",
      timeoutMs: 4321,
    });

    expect(fetchMock).toHaveBeenCalledOnce();
    const [url, init] = fetchMock.mock.calls[0] as [string, RequestInit];
    expect(url).toBe("https://custom.example.com/cognitiveservices/voices/list");
    expect(new Headers(init.headers).get("Ocp-Apim-Subscription-Key")).toBe("speech-key");
    expect(init.signal).toBeInstanceOf(AbortSignal);
    expect(voices).toEqual([
      {
        id: "en-US-JennyNeural",
        name: "Jenny",
        description: "Warm",
        locale: "en-US",
        gender: "Female",
        personalities: ["Warm"],
      },
    ]);
  });
});
diff --git a/extensions/azure-speech/tts.ts b/extensions/azure-speech/tts.ts
new file mode 100644
index 00000000000..5ac8df3a460
--- /dev/null
+++ b/extensions/azure-speech/tts.ts
@@ -0,0 +1,209 @@
import { assertOkOrThrowProviderError } from "openclaw/plugin-sdk/provider-http";
import type { SpeechVoiceOption } from "openclaw/plugin-sdk/speech-core";
import { trimToUndefined } from "openclaw/plugin-sdk/speech-core";
import {
  fetchWithSsrFGuard,
  ssrfPolicyFromHttpBaseUrlAllowedHostname,
} from "openclaw/plugin-sdk/ssrf-runtime";

export const DEFAULT_AZURE_SPEECH_VOICE = "en-US-JennyNeural";
export const DEFAULT_AZURE_SPEECH_LANG = "en-US";
export const DEFAULT_AZURE_SPEECH_AUDIO_FORMAT = "audio-24khz-48kbitrate-mono-mp3";
export const DEFAULT_AZURE_SPEECH_VOICE_NOTE_FORMAT = "ogg-24khz-16bit-mono-opus";
export const DEFAULT_AZURE_SPEECH_TELEPHONY_FORMAT = "raw-8khz-8bit-mono-mulaw";

export type AzureSpeechVoiceEntry = {
  ShortName?: string;
  DisplayName?: string;
  LocalName?: string;
  Locale?: string;
  Gender?: string;
  Status?: string;
  IsDeprecated?: boolean | string;
  VoiceTag?: {
    VoicePersonalities?: string[];
    TailoredScenarios?: string[];
  };
};
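// Regional endpoints follow https://<region>.tts.speech.microsoft.com; explicit endpoint/baseUrl values take precedence.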
export function normalizeAzureSpeechBaseUrl(params: {
  baseUrl?: string;
  endpoint?: string;
  region?: string;
}): string | undefined {
  const configured = trimToUndefined(params.baseUrl) ?? trimToUndefined(params.endpoint);
  if (configured) {
    return configured.replace(/\/+$/, "").replace(/\/cognitiveservices\/v1$/i, "");
  }
  const region = trimToUndefined(params.region);
  return region ? `https://${region}.tts.speech.microsoft.com` : undefined;
}

function azureSpeechUrl(params: {
  baseUrl?: string;
  endpoint?: string;
  region?: string;
  path: "/cognitiveservices/v1" | "/cognitiveservices/voices/list";
}): string {
  const baseUrl = normalizeAzureSpeechBaseUrl(params);
  if (!baseUrl) {
    throw new Error("Azure Speech region or endpoint missing");
  }
  return `${baseUrl}${params.path}`;
}

export function escapeXmlText(text: string): string {
  return text.replace(/&/g, "&amp;").replace(/</g, "&lt;").replace(/>/g, "&gt;");
}

export function escapeXmlAttr(value: string): string {
  return escapeXmlText(value).replace(/"/g, "&quot;").replace(/'/g, "&apos;");
}

export function buildAzureSpeechSsml(params: {
  text: string;
  voice: string;
  lang?: string;
}): string {
  const lang = trimToUndefined(params.lang) ?? DEFAULT_AZURE_SPEECH_LANG;
  return (
    `<speak version="1.0" xml:lang="${escapeXmlAttr(lang)}">` +
    `<voice name="${escapeXmlAttr(params.voice)}">${escapeXmlText(params.text)}</voice>` +
    `</speak>`
  );
}

export function inferAzureSpeechFileExtension(outputFormat: string): string {
  const normalized = outputFormat.toLowerCase();
  if (normalized.includes("mp3")) {
    return ".mp3";
  }
  if (normalized.startsWith("ogg-")) {
    return ".ogg";
  }
  if (normalized.startsWith("webm-")) {
    return ".webm";
  }
  if (normalized.startsWith("riff-")) {
    return ".wav";
  }
  if (normalized.startsWith("raw-")) {
    return ".pcm";
  }
  if (normalized.startsWith("amr-")) {
    return ".amr";
  }
  return ".audio";
}

export function isAzureSpeechVoiceCompatible(outputFormat: string): boolean {
  const normalized = outputFormat.toLowerCase();
  return normalized.startsWith("ogg-") && normalized.includes("opus");
}

function formatVoiceDescription(entry: AzureSpeechVoiceEntry): string | undefined {
  const parts = [
    ...(entry.VoiceTag?.TailoredScenarios ?? []),
    ...(entry.VoiceTag?.VoicePersonalities ?? []),
  ].filter((value) => trimToUndefined(value) !== undefined);
  return parts.length > 0 ? parts.join(", ") : undefined;
}

function isDeprecatedVoice(entry: AzureSpeechVoiceEntry): boolean {
  if (entry.IsDeprecated === true) {
    return true;
  }
  if (typeof entry.IsDeprecated === "string" && entry.IsDeprecated.toLowerCase() === "true") {
    return true;
  }
  const status = trimToUndefined(entry.Status)?.toLowerCase();
  return status === "deprecated" || status === "retired" || status === "disabled";
}
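// The voices list endpoint returns every voice on the resource; deprecated and retired entries are filtered out below.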
export async function listAzureSpeechVoices(params: {
  apiKey: string;
  baseUrl?: string;
  endpoint?: string;
  region?: string;
  timeoutMs?: number;
}): Promise<SpeechVoiceOption[]> {
  const url = azureSpeechUrl({ ...params, path: "/cognitiveservices/voices/list" });
  const { response, release } = await fetchWithSsrFGuard({
    url,
    init: {
      method: "GET",
      headers: {
        "Ocp-Apim-Subscription-Key": params.apiKey,
      },
    },
    timeoutMs: params.timeoutMs,
    policy: ssrfPolicyFromHttpBaseUrlAllowedHostname(url),
    auditContext: "azure-speech.voices",
  });

  try {
    await assertOkOrThrowProviderError(response, "Azure Speech voices API error");
    const voices = (await response.json()) as AzureSpeechVoiceEntry[];
    return Array.isArray(voices)
      ? voices
          .filter((voice) => !isDeprecatedVoice(voice))
          .map((voice) => ({
            id: trimToUndefined(voice.ShortName) ?? "",
            name: trimToUndefined(voice.DisplayName) ?? trimToUndefined(voice.LocalName),
            description: formatVoiceDescription(voice),
            locale: trimToUndefined(voice.Locale),
            gender: trimToUndefined(voice.Gender),
            personalities: voice.VoiceTag?.VoicePersonalities?.filter(
              (value): value is string => trimToUndefined(value) !== undefined,
            ),
          }))
          .filter((voice) => voice.id.length > 0)
      : [];
  } finally {
    await release();
  }
}

export async function azureSpeechTTS(params: {
  text: string;
  apiKey: string;
  baseUrl?: string;
  endpoint?: string;
  region?: string;
  voice?: string;
  lang?: string;
  outputFormat?: string;
  timeoutMs?: number;
}): Promise<Buffer> {
  const voice = trimToUndefined(params.voice) ?? DEFAULT_AZURE_SPEECH_VOICE;
  const outputFormat = trimToUndefined(params.outputFormat) ?? DEFAULT_AZURE_SPEECH_AUDIO_FORMAT;
  const url = azureSpeechUrl({ ...params, path: "/cognitiveservices/v1" });
  const { response, release } = await fetchWithSsrFGuard({
    url,
    init: {
      method: "POST",
      headers: {
        "Content-Type": "application/ssml+xml",
        "Ocp-Apim-Subscription-Key": params.apiKey,
        "X-Microsoft-OutputFormat": outputFormat,
        "User-Agent": "OpenClaw",
      },
      body: buildAzureSpeechSsml({
        text: params.text,
        voice,
        lang: params.lang,
      }),
    },
    timeoutMs: params.timeoutMs,
    policy: ssrfPolicyFromHttpBaseUrlAllowedHostname(url),
    auditContext: "azure-speech.tts",
  });

  try {
    await assertOkOrThrowProviderError(response, "Azure Speech TTS API error");
    return Buffer.from(await response.arrayBuffer());
  } finally {
    await release();
  }
}
diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml
index eb6c62ef9a3..43211fa7c88 100644
--- a/pnpm-lock.yaml
+++ b/pnpm-lock.yaml
@@ -292,6 +292,12 @@ importers:
       specifier: workspace:*
       version: link:../../packages/plugin-sdk

+  extensions/azure-speech:
+    devDependencies:
+      '@openclaw/plugin-sdk':
+        specifier: workspace:*
+        version: link:../../packages/plugin-sdk
+
   extensions/bluebubbles:
     devDependencies:
       '@openclaw/plugin-sdk':