From d7e2939791e1d435f86b92020dd64493cad091f6 Mon Sep 17 00:00:00 2001 From: Laurent Mazare Date: Fri, 24 Apr 2026 19:43:53 +0200 Subject: [PATCH] feat: add Gradium text-to-speech provider (#64958) Adds the Gradium bundled plugin with TTS and speech-provider registration, docs, label routing, and focused/live coverage. Also carries the current main lint cleanup needed for the rebased CI lane. Co-authored-by: laurent Co-authored-by: Claude Opus 4.6 (1M context) --- .github/labeler.yml | 4 + CHANGELOG.md | 1 + docs/docs.json | 1 + docs/providers/gradium.md | 66 +++++++++ docs/providers/index.md | 1 + docs/tools/media-overview.md | 5 +- docs/tools/tts.md | 40 ++++- extensions/gradium/gradium.live.test.ts | 42 ++++++ extensions/gradium/index.ts | 11 ++ extensions/gradium/openclaw.plugin.json | 14 ++ extensions/gradium/package.json | 15 ++ .../plugin-registration.contract.test.ts | 6 + extensions/gradium/shared.ts | 17 +++ extensions/gradium/speech-provider.test.ts | 131 +++++++++++++++++ extensions/gradium/speech-provider.ts | 116 +++++++++++++++ extensions/gradium/tsconfig.json | 16 ++ extensions/gradium/tts.test.ts | 137 ++++++++++++++++++ extensions/gradium/tts.ts | 86 +++++++++++ pnpm-lock.yaml | 6 + 19 files changed, 707 insertions(+), 8 deletions(-) create mode 100644 docs/providers/gradium.md create mode 100644 extensions/gradium/gradium.live.test.ts create mode 100644 extensions/gradium/index.ts create mode 100644 extensions/gradium/openclaw.plugin.json create mode 100644 extensions/gradium/package.json create mode 100644 extensions/gradium/plugin-registration.contract.test.ts create mode 100644 extensions/gradium/shared.ts create mode 100644 extensions/gradium/speech-provider.test.ts create mode 100644 extensions/gradium/speech-provider.ts create mode 100644 extensions/gradium/tsconfig.json create mode 100644 extensions/gradium/tts.test.ts create mode 100644 extensions/gradium/tts.ts diff --git a/.github/labeler.yml b/.github/labeler.yml index 
bce17938942..529087c931f 100644 --- a/.github/labeler.yml +++ b/.github/labeler.yml @@ -387,3 +387,7 @@ - changed-files: - any-glob-to-any-file: - "extensions/fal/**" +"extensions: gradium": + - changed-files: + - any-glob-to-any-file: + - "extensions/gradium/**" diff --git a/CHANGELOG.md b/CHANGELOG.md index 013818b038e..ca1c9842e6d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,7 @@ Docs: https://docs.openclaw.ai ### Changes +- Gradium: add a bundled text-to-speech provider with voice-note and telephony output support. (#64958) Thanks @LaurentMazare. - TUI/dependencies: remove direct `cli-highlight` usage from the OpenClaw TUI code-block renderer, keeping themed code coloring without the extra root dependency. Thanks @vincentkoc. - Diagnostics/OTEL: export run, model-call, and tool-execution diagnostic lifecycle events as OTEL spans without retaining live span state. Thanks @vincentkoc. - Plugins/activation: expose activation plan reasons and a richer plan API so callers can inspect why a plugin was selected while preserving existing id-list activation behavior. (#70943) Thanks @vincentkoc. diff --git a/docs/docs.json b/docs/docs.json index f5da4c35a00..e90ec56b2a5 100644 --- a/docs/docs.json +++ b/docs/docs.json @@ -1300,6 +1300,7 @@ "providers/github-copilot", "providers/glm", "providers/google", + "providers/gradium", "providers/groq", "providers/huggingface", "providers/inferrs", diff --git a/docs/providers/gradium.md b/docs/providers/gradium.md new file mode 100644 index 00000000000..527a053baa1 --- /dev/null +++ b/docs/providers/gradium.md @@ -0,0 +1,66 @@ +--- +summary: "Use Gradium text-to-speech in OpenClaw" +read_when: + - You want Gradium for text-to-speech + - You need Gradium API key or voice configuration +title: "Gradium" +--- + +# Gradium + +Gradium is a bundled text-to-speech provider for OpenClaw. It can generate normal audio replies, voice-note-compatible Opus output, and 8 kHz u-law audio for telephony surfaces. 
+ +## Setup + +Create a Gradium API key, then expose it to OpenClaw: + +```bash +export GRADIUM_API_KEY="gsk_..." +``` + +You can also store the key in config under `messages.tts.providers.gradium.apiKey`. + +## Config + +```json5 +{ + messages: { + tts: { + auto: "always", + provider: "gradium", + providers: { + gradium: { + voiceId: "YTpq7expH9539ERJ", + // apiKey: "${GRADIUM_API_KEY}", + // baseUrl: "https://api.gradium.ai", + }, + }, + }, + }, +} +``` + +## Voices + +| Name | Voice ID | +| --------- | ------------------ | +| Emma | `YTpq7expH9539ERJ` | +| Kent | `LFZvm12tW_z0xfGo` | +| Tiffany | `Eu9iL_CYe8N-Gkx_` | +| Christina | `2H4HY2CBNyJHBCrP` | +| Sydney | `jtEKaLYNn6iif5PR` | +| John | `KWJiFWu2O9nMPYcR` | +| Arthur | `3jUdJyOi9pgbxBTK` | + +Default voice: Emma. + +## Output + +- Audio-file replies use WAV. +- Voice-note replies use Opus and are marked voice-compatible. +- Telephony synthesis uses `ulaw_8000` at 8 kHz. + +## Related + +- [Text-to-Speech](/tools/tts) +- [Media Overview](/tools/media-overview) diff --git a/docs/providers/index.md b/docs/providers/index.md index 0bd49b1e0a2..5b3e59a711c 100644 --- a/docs/providers/index.md +++ b/docs/providers/index.md @@ -40,6 +40,7 @@ Looking for chat channel docs (WhatsApp/Telegram/Discord/Slack/Mattermost (plugi - [fal](/providers/fal) - [Fireworks](/providers/fireworks) - [GitHub Copilot](/providers/github-copilot) +- [Gradium](/providers/gradium) - [GLM models](/providers/glm) - [Google (Gemini)](/providers/google) - [Groq (LPU inference)](/providers/groq) diff --git a/docs/tools/media-overview.md b/docs/tools/media-overview.md index ffbb5784ecc..2550b2ac6ae 100644 --- a/docs/tools/media-overview.md +++ b/docs/tools/media-overview.md @@ -18,7 +18,7 @@ OpenClaw generates images, videos, and music, understands inbound media (images, | Image generation | `image_generate` | ComfyUI, fal, Google, MiniMax, OpenAI, Vydra, xAI | Creates or edits images from text prompts or references | | Video generation | 
`video_generate` | Alibaba, BytePlus, ComfyUI, fal, Google, MiniMax, OpenAI, Qwen, Runway, Together, Vydra, xAI | Creates videos from text, images, or existing videos | | Music generation | `music_generate` | ComfyUI, Google, MiniMax | Creates music or audio tracks from text prompts | -| Text-to-speech (TTS) | `tts` | ElevenLabs, Google, Microsoft, MiniMax, OpenAI, xAI | Converts outbound replies to spoken audio | +| Text-to-speech (TTS) | `tts` | ElevenLabs, Google, Gradium, Microsoft, MiniMax, OpenAI, Vydra, xAI | Converts outbound replies to spoken audio | | Media understanding | (automatic) | Any vision/audio-capable model provider, plus CLI fallbacks | Summarizes inbound images, audio, and video | ## Provider capability matrix @@ -34,6 +34,7 @@ This table shows which providers support which media capabilities across the pla | ElevenLabs | | | | Yes | Yes | | | | fal | Yes | Yes | | | | | | | Google | Yes | Yes | Yes | Yes | | Yes | Yes | +| Gradium | | | | Yes | | | | | Microsoft | | | | Yes | | | | | MiniMax | Yes | Yes | Yes | Yes | | | | | Mistral | | | | | Yes | | | @@ -41,7 +42,7 @@ This table shows which providers support which media capabilities across the pla | Qwen | | Yes | | | | | | | Runway | | Yes | | | | | | | Together | | Yes | | | | | | -| Vydra | Yes | Yes | | | | | | +| Vydra | Yes | Yes | | Yes | | | | | xAI | Yes | Yes | | Yes | Yes | | Yes | diff --git a/docs/tools/tts.md b/docs/tools/tts.md index 158b4e5fa45..d8d6bbdd1ce 100644 --- a/docs/tools/tts.md +++ b/docs/tools/tts.md @@ -7,16 +7,18 @@ read_when: title: "Text-to-speech" --- -OpenClaw can convert outbound replies into audio using ElevenLabs, Google Gemini, Microsoft, MiniMax, OpenAI, or xAI. +OpenClaw can convert outbound replies into audio using ElevenLabs, Google Gemini, Gradium, Microsoft, MiniMax, OpenAI, Vydra, or xAI. It works anywhere OpenClaw can send audio. 
## Supported services - **ElevenLabs** (primary or fallback provider) - **Google Gemini** (primary or fallback provider; uses Gemini API TTS) +- **Gradium** (primary or fallback provider; supports voice-note and telephony output) - **Microsoft** (primary or fallback provider; current bundled implementation uses `node-edge-tts`) - **MiniMax** (primary or fallback provider; uses the T2A v2 API) - **OpenAI** (primary or fallback provider; also used for summaries) +- **Vydra** (primary or fallback provider; shared image, video, and speech provider) - **xAI** (primary or fallback provider; uses the xAI TTS API) ### Microsoft speech notes @@ -34,12 +36,14 @@ or ElevenLabs. ## Optional keys -If you want OpenAI, ElevenLabs, Google Gemini, MiniMax, or xAI: +If you want OpenAI, ElevenLabs, Google Gemini, Gradium, MiniMax, Vydra, or xAI: - `ELEVENLABS_API_KEY` (or `XI_API_KEY`) - `GEMINI_API_KEY` (or `GOOGLE_API_KEY`) +- `GRADIUM_API_KEY` - `MINIMAX_API_KEY` - `OPENAI_API_KEY` +- `VYDRA_API_KEY` - `XAI_API_KEY` Microsoft speech does **not** require an API key. @@ -54,6 +58,7 @@ so that provider must also be authenticated if you enable summaries. - [OpenAI Audio API reference](https://platform.openai.com/docs/api-reference/audio) - [ElevenLabs Text to Speech](https://elevenlabs.io/docs/api-reference/text-to-speech) - [ElevenLabs Authentication](https://elevenlabs.io/docs/api-reference/authentication) +- [Gradium](/providers/gradium) - [MiniMax T2A v2 API](https://platform.minimaxi.com/document/T2A%20V2) - [node-edge-tts](https://github.com/SchneeHertz/node-edge-tts) - [Microsoft Speech output formats](https://learn.microsoft.com/azure/ai-services/speech-service/rest-text-to-speech#audio-outputs) @@ -226,6 +231,26 @@ Resolution order is `messages.tts.providers.xai.apiKey` -> `XAI_API_KEY`. Current live voices are `ara`, `eve`, `leo`, `rex`, `sal`, and `una`; `eve` is the default. `language` accepts a BCP-47 tag or `auto`. 
+### Gradium primary + +```json5 +{ + messages: { + tts: { + auto: "always", + provider: "gradium", + providers: { + gradium: { + apiKey: "gradium_api_key", + baseUrl: "https://api.gradium.ai", + voiceId: "YTpq7expH9539ERJ", + }, + }, + }, + }, +} +``` + ### Disable Microsoft speech ```json5 @@ -294,7 +319,7 @@ Then run: - `tagged` only sends audio when the reply includes `[[tts:key=value]]` directives or a `[[tts:text]]...[[/tts:text]]` block. - `enabled`: legacy toggle (doctor migrates this to `auto`). - `mode`: `"final"` (default) or `"all"` (includes tool/block replies). -- `provider`: speech provider id such as `"elevenlabs"`, `"google"`, `"microsoft"`, `"minimax"`, or `"openai"` (fallback is automatic). +- `provider`: speech provider id such as `"elevenlabs"`, `"google"`, `"gradium"`, `"microsoft"`, `"minimax"`, `"openai"`, `"vydra"`, or `"xai"` (fallback is automatic). - If `provider` is **unset**, OpenClaw uses the first configured speech provider in registry auto-select order. - Legacy `provider: "edge"` still works and is normalized to `microsoft`. - `summaryModel`: optional cheap model for auto-summary; defaults to `agents.defaults.model.primary`. @@ -306,7 +331,7 @@ Then run: - `maxTextLength`: hard cap for TTS input (chars). `/tts audio` fails if exceeded. - `timeoutMs`: request timeout (ms). - `prefsPath`: override the local prefs JSON path (provider/limit/summary). -- `apiKey` values fall back to env vars (`ELEVENLABS_API_KEY`/`XI_API_KEY`, `GEMINI_API_KEY`/`GOOGLE_API_KEY`, `MINIMAX_API_KEY`, `OPENAI_API_KEY`). +- `apiKey` values fall back to env vars (`ELEVENLABS_API_KEY`/`XI_API_KEY`, `GEMINI_API_KEY`/`GOOGLE_API_KEY`, `GRADIUM_API_KEY`, `MINIMAX_API_KEY`, `OPENAI_API_KEY`, `VYDRA_API_KEY`, `XAI_API_KEY`). - `providers.elevenlabs.baseUrl`: override ElevenLabs API base URL. - `providers.openai.baseUrl`: override the OpenAI TTS endpoint. 
- Resolution order: `messages.tts.providers.openai.baseUrl` -> `OPENAI_TTS_BASE_URL` -> `https://api.openai.com/v1` @@ -328,6 +353,8 @@ Then run: - `providers.google.voiceName`: Gemini prebuilt voice name (default `Kore`; `voice` is also accepted). - `providers.google.baseUrl`: override the Gemini API base URL. Only `https://generativelanguage.googleapis.com` is accepted. - If `messages.tts.providers.google.apiKey` is omitted, TTS can reuse `models.providers.google.apiKey` before env fallback. +- `providers.gradium.baseUrl`: override Gradium API base URL (default `https://api.gradium.ai`). +- `providers.gradium.voiceId`: Gradium voice identifier (default Emma, `YTpq7expH9539ERJ`). - `providers.xai.apiKey`: xAI TTS API key (env: `XAI_API_KEY`). - `providers.xai.baseUrl`: override the xAI TTS base URL (default `https://api.x.ai/v1`, env: `XAI_BASE_URL`). - `providers.xai.voiceId`: xAI voice id (default `eve`; current live voices: `ara`, `eve`, `leo`, `rex`, `sal`, `una`). @@ -368,8 +395,8 @@ Here you go. Available directive keys (when enabled): -- `provider` (registered speech provider id, for example `openai`, `elevenlabs`, `google`, `minimax`, or `microsoft`; requires `allowProvider: true`) -- `voice` (OpenAI voice), `voiceName` / `voice_name` / `google_voice` (Google voice), or `voiceId` (ElevenLabs / MiniMax / xAI) +- `provider` (registered speech provider id, for example `openai`, `elevenlabs`, `google`, `gradium`, `minimax`, `microsoft`, `vydra`, or `xai`; requires `allowProvider: true`) +- `voice` (OpenAI or Gradium voice), `voiceName` / `voice_name` / `google_voice` (Google voice), or `voiceId` (ElevenLabs / Gradium / MiniMax / xAI) - `model` (OpenAI TTS model, ElevenLabs model id, or MiniMax model) or `google_model` (Google TTS model) - `stability`, `similarityBoost`, `style`, `speed`, `useSpeakerBoost` - `vol` / `volume` (MiniMax volume, 0-10) @@ -431,6 +458,7 @@ These override `messages.tts.*` for that host. 
- 44.1kHz / 128kbps is the default balance for speech clarity. - **MiniMax**: MP3 (`speech-2.8-hd` model, 32kHz sample rate). Voice-note format not natively supported; use OpenAI or ElevenLabs for guaranteed Opus voice messages. - **Google Gemini**: Gemini API TTS returns raw 24kHz PCM. OpenClaw wraps it as WAV for audio attachments and returns PCM directly for Talk/telephony. Native Opus voice-note format is not supported by this path. +- **Gradium**: WAV for audio attachments, Opus for voice-note targets, and `ulaw_8000` at 8 kHz for telephony. - **xAI**: MP3 by default; `responseFormat` may be `mp3`, `wav`, `pcm`, `mulaw`, or `alaw`. OpenClaw uses xAI's batch REST TTS endpoint and returns a complete audio attachment; xAI's streaming TTS WebSocket is not used by this provider path. Native Opus voice-note format is not supported by this path. - **Microsoft**: uses `microsoft.outputFormat` (default `audio-24khz-48kbitrate-mono-mp3`). - The bundled transport accepts an `outputFormat`, but not all formats are available from the service. diff --git a/extensions/gradium/gradium.live.test.ts b/extensions/gradium/gradium.live.test.ts new file mode 100644 index 00000000000..159056443b4 --- /dev/null +++ b/extensions/gradium/gradium.live.test.ts @@ -0,0 +1,42 @@ +import { writeFileSync } from "node:fs"; +import { tmpdir } from "node:os"; +import { join } from "node:path"; +import { describe, expect, it } from "vitest"; +import { isLiveTestEnabled } from "../../src/agents/live-test-helpers.js"; +import { + registerProviderPlugin, + requireRegisteredProvider, +} from "../../test/helpers/plugins/provider-registration.js"; +import plugin from "./index.js"; + +const LIVE = isLiveTestEnabled(); +const GRADIUM_API_KEY = process.env.GRADIUM_API_KEY?.trim() ?? 
""; + +const registerGradiumPlugin = () => + registerProviderPlugin({ + plugin, + id: "gradium", + name: "Gradium Speech", + }); + +describe.skipIf(!LIVE || !GRADIUM_API_KEY)("gradium live", () => { + it("synthesizes speech through the registered provider", async () => { + const { speechProviders } = await registerGradiumPlugin(); + const provider = requireRegisteredProvider(speechProviders, "gradium"); + + const result = await provider.synthesize({ + text: "Hello, this is a test of Gradium text to speech.", + cfg: { plugins: { enabled: true } } as never, + providerConfig: { apiKey: GRADIUM_API_KEY }, + target: "audio-file", + timeoutMs: 45_000, + }); + + expect(result.outputFormat).toBe("wav"); + expect(result.audioBuffer.byteLength).toBeGreaterThan(512); + + const outPath = join(tmpdir(), "gradium-live-test.wav"); + writeFileSync(outPath, result.audioBuffer); + console.log(`Audio written to ${outPath}`); + }, 60_000); +}); diff --git a/extensions/gradium/index.ts b/extensions/gradium/index.ts new file mode 100644 index 00000000000..1b2a140af71 --- /dev/null +++ b/extensions/gradium/index.ts @@ -0,0 +1,11 @@ +import { definePluginEntry } from "openclaw/plugin-sdk/plugin-entry"; +import { buildGradiumSpeechProvider } from "./speech-provider.js"; + +export default definePluginEntry({ + id: "gradium", + name: "Gradium Speech", + description: "Bundled Gradium speech provider", + register(api) { + api.registerSpeechProvider(buildGradiumSpeechProvider()); + }, +}); diff --git a/extensions/gradium/openclaw.plugin.json b/extensions/gradium/openclaw.plugin.json new file mode 100644 index 00000000000..1c10f40637b --- /dev/null +++ b/extensions/gradium/openclaw.plugin.json @@ -0,0 +1,14 @@ +{ + "id": "gradium", + "providerAuthEnvVars": { + "gradium": ["GRADIUM_API_KEY"] + }, + "contracts": { + "speechProviders": ["gradium"] + }, + "configSchema": { + "type": "object", + "additionalProperties": false, + "properties": {} + } +} diff --git a/extensions/gradium/package.json 
b/extensions/gradium/package.json new file mode 100644 index 00000000000..c348715ac31 --- /dev/null +++ b/extensions/gradium/package.json @@ -0,0 +1,15 @@ +{ + "name": "@openclaw/gradium-speech", + "version": "2026.4.10", + "private": true, + "description": "OpenClaw Gradium speech plugin", + "type": "module", + "devDependencies": { + "@openclaw/plugin-sdk": "workspace:*" + }, + "openclaw": { + "extensions": [ + "./index.ts" + ] + } +} diff --git a/extensions/gradium/plugin-registration.contract.test.ts b/extensions/gradium/plugin-registration.contract.test.ts new file mode 100644 index 00000000000..ddcd9a8c217 --- /dev/null +++ b/extensions/gradium/plugin-registration.contract.test.ts @@ -0,0 +1,6 @@ +import { describePluginRegistrationContract } from "../../test/helpers/plugins/plugin-registration-contract.js"; + +describePluginRegistrationContract({ + pluginId: "gradium", + speechProviderIds: ["gradium"], +}); diff --git a/extensions/gradium/shared.ts b/extensions/gradium/shared.ts new file mode 100644 index 00000000000..f957990136b --- /dev/null +++ b/extensions/gradium/shared.ts @@ -0,0 +1,17 @@ +export const DEFAULT_GRADIUM_BASE_URL = "https://api.gradium.ai"; +export const DEFAULT_GRADIUM_VOICE_ID = "YTpq7expH9539ERJ"; + +export const GRADIUM_VOICES = [ + { id: "YTpq7expH9539ERJ", name: "Emma" }, + { id: "LFZvm12tW_z0xfGo", name: "Kent" }, + { id: "Eu9iL_CYe8N-Gkx_", name: "Tiffany" }, + { id: "2H4HY2CBNyJHBCrP", name: "Christina" }, + { id: "jtEKaLYNn6iif5PR", name: "Sydney" }, + { id: "KWJiFWu2O9nMPYcR", name: "John" }, + { id: "3jUdJyOi9pgbxBTK", name: "Arthur" }, +] as const; + +export function normalizeGradiumBaseUrl(baseUrl?: string): string { + const trimmed = baseUrl?.trim(); + return trimmed?.replace(/\/+$/, "") || DEFAULT_GRADIUM_BASE_URL; +} diff --git a/extensions/gradium/speech-provider.test.ts b/extensions/gradium/speech-provider.test.ts new file mode 100644 index 00000000000..5b88500c2f3 --- /dev/null +++ 
b/extensions/gradium/speech-provider.test.ts @@ -0,0 +1,131 @@ +import { installPinnedHostnameTestHooks } from "openclaw/plugin-sdk/testing"; +import { afterEach, describe, expect, it, vi } from "vitest"; +import { buildGradiumSpeechProvider } from "./speech-provider.js"; + +describe("gradium speech provider", () => { + installPinnedHostnameTestHooks(); + + const provider = buildGradiumSpeechProvider(); + + afterEach(() => { + vi.unstubAllGlobals(); + vi.restoreAllMocks(); + }); + + it("reports configured when GRADIUM_API_KEY is set", () => { + const original = process.env.GRADIUM_API_KEY; + try { + process.env.GRADIUM_API_KEY = "gsk_test"; + expect(provider.isConfigured({ providerConfig: {}, timeoutMs: 5_000 })).toBe(true); + } finally { + if (original === undefined) { + delete process.env.GRADIUM_API_KEY; + } else { + process.env.GRADIUM_API_KEY = original; + } + } + }); + + it("reports not configured when no key is available", () => { + const original = process.env.GRADIUM_API_KEY; + try { + delete process.env.GRADIUM_API_KEY; + expect(provider.isConfigured({ providerConfig: {}, timeoutMs: 5_000 })).toBe(false); + } finally { + if (original !== undefined) { + process.env.GRADIUM_API_KEY = original; + } + } + }); + + it("synthesizes audio via the Gradium TTS endpoint", async () => { + const audioData = Buffer.from("wav-audio-data"); + const fetchMock = vi.fn().mockResolvedValue(new Response(audioData, { status: 200 })); + vi.stubGlobal("fetch", fetchMock); + + const result = await provider.synthesize({ + text: "OpenClaw test", + cfg: {} as never, + providerConfig: { apiKey: "gsk_test123" }, + target: "audio-file", + timeoutMs: 30_000, + }); + + expect(fetchMock).toHaveBeenCalledOnce(); + const [url, init] = fetchMock.mock.calls[0] as [string, RequestInit]; + expect(url).toBe("https://api.gradium.ai/api/post/speech/tts"); + const headers = new Headers(init.headers); + expect(headers.get("x-api-key")).toBe("gsk_test123"); + expect(JSON.parse(init.body as 
string)).toEqual({ + text: "OpenClaw test", + voice_id: "YTpq7expH9539ERJ", + only_audio: true, + output_format: "wav", + json_config: '{"padding_bonus":0}', + }); + expect(result.outputFormat).toBe("wav"); + expect(result.fileExtension).toBe(".wav"); + expect(result.voiceCompatible).toBe(false); + expect(result.audioBuffer).toEqual(audioData); + }); + + it("uses opus and voiceCompatible for voice-note target", async () => { + const audioData = Buffer.from("opus-audio-data"); + const fetchMock = vi.fn().mockResolvedValue(new Response(audioData, { status: 200 })); + vi.stubGlobal("fetch", fetchMock); + + const result = await provider.synthesize({ + text: "Voice note test", + cfg: {} as never, + providerConfig: { apiKey: "gsk_test123" }, + target: "voice-note", + timeoutMs: 30_000, + }); + + const [, init] = fetchMock.mock.calls[0] as [string, RequestInit]; + expect(JSON.parse(init.body as string).output_format).toBe("opus"); + expect(result.outputFormat).toBe("opus"); + expect(result.fileExtension).toBe(".opus"); + expect(result.voiceCompatible).toBe(true); + expect(result.audioBuffer).toEqual(audioData); + }); + + it("uses ulaw_8000 for telephony synthesis", async () => { + const audioData = Buffer.from("ulaw-audio-data"); + const fetchMock = vi.fn().mockResolvedValue(new Response(audioData, { status: 200 })); + vi.stubGlobal("fetch", fetchMock); + + const result = await provider.synthesizeTelephony!({ + text: "Telephony test", + cfg: {} as never, + providerConfig: { apiKey: "gsk_test123" }, + timeoutMs: 30_000, + }); + + const [, init] = fetchMock.mock.calls[0] as [string, RequestInit]; + expect(JSON.parse(init.body as string).output_format).toBe("ulaw_8000"); + expect(result.outputFormat).toBe("ulaw_8000"); + expect(result.sampleRate).toBe(8_000); + expect(result.audioBuffer).toEqual(audioData); + }); + + it("throws when no API key is available", async () => { + const original = process.env.GRADIUM_API_KEY; + try { + delete process.env.GRADIUM_API_KEY; + await 
expect( + provider.synthesize({ + text: "test", + cfg: {} as never, + providerConfig: {}, + target: "audio-file", + timeoutMs: 5_000, + }), + ).rejects.toThrow("Gradium API key missing"); + } finally { + if (original !== undefined) { + process.env.GRADIUM_API_KEY = original; + } + } + }); +}); diff --git a/extensions/gradium/speech-provider.ts b/extensions/gradium/speech-provider.ts new file mode 100644 index 00000000000..877b5dbdaef --- /dev/null +++ b/extensions/gradium/speech-provider.ts @@ -0,0 +1,116 @@ +import { normalizeResolvedSecretInputString } from "openclaw/plugin-sdk/secret-input"; +import type { + SpeechDirectiveTokenParseContext, + SpeechProviderConfig, + SpeechProviderPlugin, +} from "openclaw/plugin-sdk/speech"; +import { asObject, trimToUndefined } from "openclaw/plugin-sdk/speech"; +import { DEFAULT_GRADIUM_VOICE_ID, GRADIUM_VOICES, normalizeGradiumBaseUrl } from "./shared.js"; +import { gradiumTTS } from "./tts.js"; + +type GradiumProviderConfig = { + apiKey?: string; + baseUrl: string; + voiceId: string; +}; + +function normalizeGradiumProviderConfig(rawConfig: Record<string, unknown>): GradiumProviderConfig { + const providers = asObject(rawConfig.providers); + const raw = asObject(providers?.gradium) ?? asObject(rawConfig.gradium); + return { + apiKey: normalizeResolvedSecretInputString({ + value: raw?.apiKey, + path: "messages.tts.providers.gradium.apiKey", + }), + baseUrl: normalizeGradiumBaseUrl(trimToUndefined(raw?.baseUrl)), + voiceId: trimToUndefined(raw?.voiceId) ?? DEFAULT_GRADIUM_VOICE_ID, + }; +} + +function readGradiumProviderConfig(config: SpeechProviderConfig): GradiumProviderConfig { + const defaults = normalizeGradiumProviderConfig({}); + return { + apiKey: trimToUndefined(config.apiKey) ?? defaults.apiKey, + baseUrl: normalizeGradiumBaseUrl(trimToUndefined(config.baseUrl) ?? defaults.baseUrl), + voiceId: trimToUndefined(config.voiceId) ?? 
defaults.voiceId, + }; +} + +function parseDirectiveToken(ctx: SpeechDirectiveTokenParseContext): { + handled: boolean; + overrides?: Record<string, unknown>; + warnings?: string[]; +} { + switch (ctx.key) { + case "voice": + case "voice_id": + case "voiceid": + case "gradium_voice": + case "gradiumvoice": + if (!ctx.policy.allowVoice) { + return { handled: true }; + } + return { + handled: true, + overrides: { ...ctx.currentOverrides, voiceId: ctx.value }, + }; + default: + return { handled: false }; + } +} + +export function buildGradiumSpeechProvider(): SpeechProviderPlugin { + return { + id: "gradium", + label: "Gradium", + autoSelectOrder: 30, + voices: GRADIUM_VOICES.map((v) => v.id), + resolveConfig: ({ rawConfig }) => normalizeGradiumProviderConfig(rawConfig), + parseDirectiveToken, + listVoices: async () => GRADIUM_VOICES.map((v) => ({ id: v.id, name: v.name })), + isConfigured: ({ providerConfig }) => + Boolean(readGradiumProviderConfig(providerConfig).apiKey || process.env.GRADIUM_API_KEY), + synthesize: async (req) => { + const config = readGradiumProviderConfig(req.providerConfig); + const overrides = req.providerOverrides ?? {}; + const apiKey = config.apiKey || process.env.GRADIUM_API_KEY; + if (!apiKey) { + throw new Error("Gradium API key missing"); + } + const wantsVoiceNote = req.target === "voice-note"; + const outputFormat = wantsVoiceNote ? "opus" : "wav"; + const audioBuffer = await gradiumTTS({ + text: req.text, + apiKey, + baseUrl: config.baseUrl, + voiceId: trimToUndefined(overrides.voiceId) ?? config.voiceId, + outputFormat, + timeoutMs: req.timeoutMs, + }); + return { + audioBuffer, + outputFormat, + fileExtension: wantsVoiceNote ? 
".opus" : ".wav", + voiceCompatible: wantsVoiceNote, + }; + }, + synthesizeTelephony: async (req) => { + const config = readGradiumProviderConfig(req.providerConfig); + const apiKey = config.apiKey || process.env.GRADIUM_API_KEY; + if (!apiKey) { + throw new Error("Gradium API key missing"); + } + const outputFormat = "ulaw_8000"; + const sampleRate = 8_000; + const audioBuffer = await gradiumTTS({ + text: req.text, + apiKey, + baseUrl: config.baseUrl, + voiceId: config.voiceId, + outputFormat, + timeoutMs: req.timeoutMs, + }); + return { audioBuffer, outputFormat, sampleRate }; + }, + }; +} diff --git a/extensions/gradium/tsconfig.json b/extensions/gradium/tsconfig.json new file mode 100644 index 00000000000..b8a85a99ac3 --- /dev/null +++ b/extensions/gradium/tsconfig.json @@ -0,0 +1,16 @@ +{ + "extends": "../tsconfig.package-boundary.base.json", + "compilerOptions": { + "rootDir": "." + }, + "include": ["./*.ts", "./src/**/*.ts"], + "exclude": [ + "./**/*.test.ts", + "./dist/**", + "./node_modules/**", + "./src/test-support/**", + "./src/**/*test-helpers.ts", + "./src/**/*test-harness.ts", + "./src/**/*test-support.ts" + ] +} diff --git a/extensions/gradium/tts.test.ts b/extensions/gradium/tts.test.ts new file mode 100644 index 00000000000..07e5627f436 --- /dev/null +++ b/extensions/gradium/tts.test.ts @@ -0,0 +1,137 @@ +import { installPinnedHostnameTestHooks } from "openclaw/plugin-sdk/testing"; +import { afterEach, describe, expect, it, vi } from "vitest"; +import { gradiumTTS } from "./tts.js"; + +describe("gradium tts diagnostics", () => { + installPinnedHostnameTestHooks(); + + function createStreamingErrorResponse(params: { + status: number; + chunkCount: number; + chunkSize: number; + byte: number; + }): { response: Response; getReadCount: () => number } { + let reads = 0; + const stream = new ReadableStream({ + pull(controller) { + if (reads >= params.chunkCount) { + controller.close(); + return; + } + reads += 1; + controller.enqueue(new 
Uint8Array(params.chunkSize).fill(params.byte)); + }, + }); + return { + response: new Response(stream, { status: params.status }), + getReadCount: () => reads, + }; + } + + afterEach(() => { + vi.unstubAllGlobals(); + vi.restoreAllMocks(); + }); + + it("includes parsed provider detail and request id for JSON API errors", async () => { + const fetchMock = vi.fn().mockResolvedValue( + new Response( + JSON.stringify({ + message: "Invalid API key", + }), + { + status: 401, + headers: { + "Content-Type": "application/json", + "x-request-id": "grad_req_123", + }, + }, + ), + ); + vi.stubGlobal("fetch", fetchMock); + + await expect( + gradiumTTS({ + text: "hello", + apiKey: "bad-key", + baseUrl: "https://api.gradium.ai", + voiceId: "YTpq7expH9539ERJ", + outputFormat: "wav", + timeoutMs: 5_000, + }), + ).rejects.toThrow("Gradium API error (401): Invalid API key [request_id=grad_req_123]"); + expect(fetchMock).toHaveBeenCalledOnce(); + }); + + it("falls back to raw body text when the error body is non-JSON", async () => { + vi.stubGlobal( + "fetch", + vi.fn().mockResolvedValue(new Response("service unavailable", { status: 503 })), + ); + + await expect( + gradiumTTS({ + text: "hello", + apiKey: "test-key", + baseUrl: "https://api.gradium.ai", + voiceId: "YTpq7expH9539ERJ", + outputFormat: "wav", + timeoutMs: 5_000, + }), + ).rejects.toThrow("Gradium API error (503): service unavailable"); + }); + + it("caps streamed non-JSON error reads instead of consuming full response bodies", async () => { + const streamed = createStreamingErrorResponse({ + status: 503, + chunkCount: 200, + chunkSize: 1024, + byte: 121, + }); + vi.stubGlobal("fetch", vi.fn().mockResolvedValue(streamed.response)); + + await expect( + gradiumTTS({ + text: "hello", + apiKey: "test-key", + baseUrl: "https://api.gradium.ai", + voiceId: "YTpq7expH9539ERJ", + outputFormat: "wav", + timeoutMs: 5_000, + }), + ).rejects.toThrow("Gradium API error (503)"); + + expect(streamed.getReadCount()).toBeLessThan(200); + 
}); + + it("sends the correct request payload", async () => { + const audioData = Buffer.from("fake-wav-data"); + const fetchMock = vi.fn().mockResolvedValue(new Response(audioData, { status: 200 })); + vi.stubGlobal("fetch", fetchMock); + + const result = await gradiumTTS({ + text: "Hello world", + apiKey: "gsk_test123", + baseUrl: "https://api.gradium.ai", + voiceId: "YTpq7expH9539ERJ", + outputFormat: "wav", + timeoutMs: 5_000, + }); + + expect(fetchMock).toHaveBeenCalledOnce(); + const [url, init] = fetchMock.mock.calls[0] as [string, RequestInit]; + expect(url).toBe("https://api.gradium.ai/api/post/speech/tts"); + expect(init.method).toBe("POST"); + const headers = new Headers(init.headers); + expect(headers.get("x-api-key")).toBe("gsk_test123"); + expect(headers.get("content-type")).toBe("application/json"); + expect(JSON.parse(init.body as string)).toEqual({ + text: "Hello world", + voice_id: "YTpq7expH9539ERJ", + only_audio: true, + output_format: "wav", + json_config: '{"padding_bonus":0}', + }); + expect(result).toEqual(audioData); + }); +}); diff --git a/extensions/gradium/tts.ts b/extensions/gradium/tts.ts new file mode 100644 index 00000000000..b795b521c08 --- /dev/null +++ b/extensions/gradium/tts.ts @@ -0,0 +1,86 @@ +import { + asObject, + readResponseTextLimited, + trimToUndefined, + truncateErrorDetail, +} from "openclaw/plugin-sdk/speech"; +import { fetchWithSsrFGuard } from "openclaw/plugin-sdk/ssrf-runtime"; +import { normalizeGradiumBaseUrl } from "./shared.js"; + +function formatGradiumErrorPayload(payload: unknown): string | undefined { + const root = asObject(payload); + if (!root) { + return undefined; + } + const message = + trimToUndefined(root.message) ?? trimToUndefined(root.error) ?? 
trimToUndefined(root.detail); + if (message) { + return truncateErrorDetail(message); + } + return undefined; +} + +async function extractGradiumErrorDetail(response: Response): Promise<string | undefined> { + const rawBody = trimToUndefined(await readResponseTextLimited(response)); + if (!rawBody) { + return undefined; + } + try { + return formatGradiumErrorPayload(JSON.parse(rawBody)) ?? truncateErrorDetail(rawBody); + } catch { + return truncateErrorDetail(rawBody); + } +} + +export async function gradiumTTS(params: { + text: string; + apiKey: string; + baseUrl: string; + voiceId: string; + outputFormat: "wav" | "opus" | "ulaw_8000" | "pcm" | "pcm_24000" | "alaw_8000"; + timeoutMs: number; +}): Promise<Buffer> { + const { text, apiKey, baseUrl, voiceId, outputFormat, timeoutMs } = params; + const normalizedBaseUrl = normalizeGradiumBaseUrl(baseUrl); + const url = `${normalizedBaseUrl}/api/post/speech/tts`; + const hostname = new URL(normalizedBaseUrl).hostname; + + const { response, release } = await fetchWithSsrFGuard({ + url, + init: { + method: "POST", + headers: { + "x-api-key": apiKey, + "Content-Type": "application/json", + }, + body: JSON.stringify({ + text, + voice_id: voiceId, + only_audio: true, + output_format: outputFormat, + json_config: JSON.stringify({ padding_bonus: 0 }), + }), + }, + timeoutMs, + policy: { hostnameAllowlist: [hostname] }, + auditContext: "gradium.tts", + }); + + try { + if (!response.ok) { + const detail = await extractGradiumErrorDetail(response); + const requestId = + trimToUndefined(response.headers.get("x-request-id")) ?? + trimToUndefined(response.headers.get("request-id")); + throw new Error( + `Gradium API error (${response.status})` + + (detail ? `: ${detail}` : "") + + (requestId ? 
` [request_id=${requestId}]` : ""), + ); + } + + return Buffer.from(await response.arrayBuffer()); + } finally { + await release(); + } +} diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 39a1143c614..962e5a01238 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -643,6 +643,12 @@ importers: specifier: workspace:* version: link:../.. + extensions/gradium: + devDependencies: + '@openclaw/plugin-sdk': + specifier: workspace:* + version: link:../../packages/plugin-sdk + extensions/groq: devDependencies: '@openclaw/plugin-sdk':