From b511250e5c004b786ce9a609bb19d11963a6010b Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Sat, 25 Apr 2026 12:12:11 +0100 Subject: [PATCH] feat(media): add voice conversion and speech plugins --- .github/labeler.yml | 10 + CHANGELOG.md | 4 + docs/channels/qqbot.md | 5 + docs/channels/whatsapp.md | 3 +- docs/nodes/audio.md | 19 +- docs/nodes/media-understanding.md | 4 +- docs/providers/index.md | 2 + docs/providers/senseaudio.md | 65 +++ docs/tools/media-overview.md | 53 ++- docs/tools/tts.md | 43 +- .../src/engine/gateway/inbound-context.ts | 1 + .../src/engine/gateway/inbound-pipeline.ts | 3 + .../engine/gateway/outbound-dispatch.test.ts | 202 ++++++++ .../src/engine/gateway/outbound-dispatch.ts | 30 +- .../src/engine/messaging/outbound-deliver.ts | 1 + .../src/engine/messaging/reply-dispatcher.ts | 21 +- extensions/senseaudio/index.ts | 11 + .../media-understanding-provider.test.ts | 136 ++++++ .../media-understanding-provider.ts | 25 + extensions/senseaudio/openclaw.plugin.json | 15 + extensions/senseaudio/package.json | 15 + extensions/senseaudio/test-api.ts | 1 + extensions/tts-local-cli/index.ts | 11 + extensions/tts-local-cli/openclaw.plugin.json | 12 + extensions/tts-local-cli/package.json | 15 + .../tts-local-cli/speech-provider.test.ts | 283 ++++++++++++ extensions/tts-local-cli/speech-provider.ts | 436 ++++++++++++++++++ .../src/auto-reply/deliver-reply.test.ts | 54 +++ .../whatsapp/src/auto-reply/deliver-reply.ts | 4 +- .../whatsapp/src/outbound-media-contract.ts | 100 +++- extensions/whatsapp/src/send.test.ts | 44 ++ extensions/whatsapp/src/send.ts | 4 +- pnpm-lock.yaml | 78 ++++ ...n-registration.senseaudio.contract.test.ts | 4 + ...egistration.tts-local-cli.contract.test.ts | 4 + src/tts/provider-registry.test.ts | 2 + .../plugin-registration-contract-cases.ts | 8 + 37 files changed, 1681 insertions(+), 47 deletions(-) create mode 100644 docs/providers/senseaudio.md create mode 100644 
extensions/qqbot/src/engine/gateway/outbound-dispatch.test.ts create mode 100644 extensions/senseaudio/index.ts create mode 100644 extensions/senseaudio/media-understanding-provider.test.ts create mode 100644 extensions/senseaudio/media-understanding-provider.ts create mode 100644 extensions/senseaudio/openclaw.plugin.json create mode 100644 extensions/senseaudio/package.json create mode 100644 extensions/senseaudio/test-api.ts create mode 100644 extensions/tts-local-cli/index.ts create mode 100644 extensions/tts-local-cli/openclaw.plugin.json create mode 100644 extensions/tts-local-cli/package.json create mode 100644 extensions/tts-local-cli/speech-provider.test.ts create mode 100644 extensions/tts-local-cli/speech-provider.ts create mode 100644 src/plugins/contracts/plugin-registration.senseaudio.contract.test.ts create mode 100644 src/plugins/contracts/plugin-registration.tts-local-cli.contract.test.ts diff --git a/.github/labeler.yml b/.github/labeler.yml index 529087c931f..ddfdc2c0d93 100644 --- a/.github/labeler.yml +++ b/.github/labeler.yml @@ -351,6 +351,11 @@ - changed-files: - any-glob-to-any-file: - "extensions/qianfan/**" +"extensions: senseaudio": + - changed-files: + - any-glob-to-any-file: + - "extensions/senseaudio/**" + - "docs/providers/senseaudio.md" "extensions: synthetic": - changed-files: - any-glob-to-any-file: @@ -367,6 +372,11 @@ - changed-files: - any-glob-to-any-file: - "extensions/together/**" +"extensions: tts-local-cli": + - changed-files: + - any-glob-to-any-file: + - "extensions/tts-local-cli/**" + - "docs/tools/tts.md" "extensions: venice": - changed-files: - any-glob-to-any-file: diff --git a/CHANGELOG.md b/CHANGELOG.md index 1e63a778bbb..370968bebc0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -16,6 +16,7 @@ Docs: https://docs.openclaw.ai - Memory-core/hybrid search: expose raw `vectorScore` and `textScore` alongside the combined `score` on hybrid memory search results, so callers can inspect vector-versus-text retrieval 
contribution before temporal decay or MMR reordering. Fixes #68166. (#68286) Thanks @ajfonthemove. - Providers/Xiaomi: add MiMo TTS as a bundled speech provider with MP3/WAV output and voice-note Opus transcoding. Fixes #52376. (#55614) Thanks @zoujiejun. - Providers/ElevenLabs: include `eleven_v3` in the bundled TTS model catalog so model selection surfaces can offer ElevenLabs v3. (#68321) Thanks @itsuzef. +- Providers/Local CLI TTS: add a bundled local command speech provider with file/stdout input, voice-note Opus conversion, and telephony PCM output. (#56239) Thanks @solar2ain. ### Fixes @@ -58,6 +59,9 @@ Docs: https://docs.openclaw.ai - GitHub Copilot: preserve encrypted Responses reasoning item IDs during replay so Copilot can validate encrypted reasoning payloads across requests. (#71448) Thanks @a410979729-sys. - Agents/replies: recover final-answer text when streamed assistant chunks contain only whitespace, preventing completed turns from surfacing as empty-payload errors. Fixes #71454. (#71467) Thanks @Sanjays2402. - Feishu/TTS: transcode voice-intent MP3 and other audio replies to Ogg/Opus before sending native Feishu audio bubbles, while keeping ordinary MP3 attachments as files. Fixes #61249 and #37868. Thanks @sg1416-zg and @ycjlb2023-peteryi. +- WhatsApp/TTS: transcode MP3/WebM audio, including Microsoft Edge TTS output, to Ogg/Opus before sending PTT voice notes. +- QQBot/TTS: honor plain `audioAsVoice` replies by synthesizing TTS to native QQ voice messages, and mark inbound voice-only messages as audio media without exposing raw voice paths to generic media context. +- Providers/SenseAudio: add bundled SenseAudio batch audio transcription through `tools.media.audio` with `SENSEAUDIO_API_KEY` auth. (#66943) Thanks @Fl0rencess720. - Providers/MiniMax: let TTS use MiniMax portal OAuth and Token Plan credentials before falling back to `MINIMAX_API_KEY`, and include current TTS HD model ids. Fixes #55017. Thanks @zx15210404690-hash. 
- Telegram/webhook: acknowledge validated webhook updates before running bot middleware, keeping slow agent turns from tripping Telegram delivery retries while preserving per-chat processing lanes. Fixes #71392. Thanks @joelforsberg46-source. - MCP: retire one-shot embedded bundled MCP runtimes at run end, skip bundle-MCP startup when a runtime tool allowlist cannot reach bundle-MCP tools, and add `mcp.sessionIdleTtlMs` idle eviction for leaked session runtimes. Fixes #71106, #71110, #70389, and #70808. diff --git a/docs/channels/qqbot.md b/docs/channels/qqbot.md index 018c4101b0e..f5cfbbf860a 100644 --- a/docs/channels/qqbot.md +++ b/docs/channels/qqbot.md @@ -147,6 +147,11 @@ STT and TTS support two-level configuration with priority fallback: Set `enabled: false` on either to disable. +Inbound QQ voice attachments are exposed to agents as audio media metadata while +keeping raw voice files out of generic `MediaPaths`. `[[audio_as_voice]]` plain +text replies synthesize TTS and send a native QQ voice message when TTS is +configured. 
+ Outbound audio upload/transcode behavior can also be tuned with `channels.qqbot.audioFormatPolicy`: diff --git a/docs/channels/whatsapp.md b/docs/channels/whatsapp.md index 6649d68f816..74dfd52677e 100644 --- a/docs/channels/whatsapp.md +++ b/docs/channels/whatsapp.md @@ -362,7 +362,8 @@ When the linked self number is also present in `allowFrom`, WhatsApp self-chat s - supports image, video, audio (PTT voice-note), and document payloads - reply payloads preserve `audioAsVoice`; WhatsApp sends audio media as Baileys PTT voice notes - - `audio/ogg` is rewritten to `audio/ogg; codecs=opus` for voice-note compatibility + - non-Ogg audio, including Microsoft Edge TTS MP3/WebM output, is transcoded to Ogg/Opus before PTT delivery + - native Ogg/Opus audio is sent with `audio/ogg; codecs=opus` for voice-note compatibility - animated GIF playback is supported via `gifPlayback: true` on video sends - captions are applied to the first media item when sending multi-media reply payloads - media source can be HTTP(S), `file://`, or local paths diff --git a/docs/nodes/audio.md b/docs/nodes/audio.md index ea656fc368e..a80db6fad4f 100644 --- a/docs/nodes/audio.md +++ b/docs/nodes/audio.md @@ -31,7 +31,7 @@ OpenClaw auto-detects in this order and stops at the first working option: 3. **Gemini CLI** (`gemini`) using `read_many_files` 4. **Provider auth** - Configured `models.providers.*` entries that support audio are tried first - - Bundled fallback order: OpenAI → Groq → Deepgram → Google → Mistral + - Bundled fallback order: OpenAI → Groq → xAI → Deepgram → Google → SenseAudio → ElevenLabs → Mistral To disable auto-detection, set `tools.media.audio.enabled: false`. To customize, set `tools.media.audio.models`. 
@@ -112,6 +112,21 @@ Note: Binary detection is best-effort across macOS/Linux/Windows; ensure the CLI } ``` +### Provider-only (SenseAudio) + +```json5 +{ + tools: { + media: { + audio: { + enabled: true, + models: [{ provider: "senseaudio", model: "senseaudio-asr-pro-1.5-260319" }], + }, + }, + }, +} +``` + ### Echo transcript to chat (opt-in) ```json5 @@ -136,6 +151,8 @@ Note: Binary detection is best-effort across macOS/Linux/Windows; ensure the CLI - Deepgram picks up `DEEPGRAM_API_KEY` when `provider: "deepgram"` is used. - Deepgram setup details: [Deepgram (audio transcription)](/providers/deepgram). - Mistral setup details: [Mistral](/providers/mistral). +- SenseAudio picks up `SENSEAUDIO_API_KEY` when `provider: "senseaudio"` is used. +- SenseAudio setup details: [SenseAudio](/providers/senseaudio). - Audio providers can override `baseUrl`, `headers`, and `providerOptions` via `tools.media.audio`. - Default size cap is 20MB (`tools.media.audio.maxBytes`). Oversize audio is skipped for that model and the next entry is tried. - Tiny/empty audio files below 1024 bytes are skipped before provider/CLI transcription. diff --git a/docs/nodes/media-understanding.md b/docs/nodes/media-understanding.md index 14a49c3ec81..f68cfa5c665 100644 --- a/docs/nodes/media-understanding.md +++ b/docs/nodes/media-understanding.md @@ -167,7 +167,7 @@ working option**: example through `agents.defaults.imageModel` or `openclaw infer image describe --model ollama/`. - Bundled fallback order: - - Audio: OpenAI → Groq → xAI → Deepgram → Google → Mistral + - Audio: OpenAI → Groq → xAI → Deepgram → Google → SenseAudio → ElevenLabs → Mistral - Image: OpenAI → Anthropic → Google → MiniMax → MiniMax Portal → Z.AI - Video: Google → Qwen → Moonshot @@ -228,7 +228,7 @@ If you omit `capabilities`, the entry is eligible for the list it appears in. 
| Capability | Provider integration | Notes | | ---------- | ---------------------------------------------------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | Image | OpenAI, OpenAI Codex OAuth, Codex app-server, OpenRouter, Anthropic, Google, MiniMax, Moonshot, Qwen, Z.AI, config providers | Vendor plugins register image support; `openai-codex/*` uses OAuth provider plumbing; `codex/*` uses a bounded Codex app-server turn; MiniMax and MiniMax OAuth both use `MiniMax-VL-01`; image-capable config providers auto-register. | -| Audio | OpenAI, Groq, Deepgram, Google, Mistral | Provider transcription (Whisper/Deepgram/Gemini/Voxtral). | +| Audio | OpenAI, Groq, xAI, Deepgram, Google, SenseAudio, ElevenLabs, Mistral | Provider transcription (Whisper/Groq/xAI/Deepgram/Gemini/SenseAudio/Scribe/Voxtral). | | Video | Google, Qwen, Moonshot | Provider video understanding via vendor plugins; Qwen video understanding uses the Standard DashScope endpoints. 
| MiniMax note: diff --git a/docs/providers/index.md b/docs/providers/index.md index 5b3e59a711c..918767f8d25 100644 --- a/docs/providers/index.md +++ b/docs/providers/index.md @@ -62,6 +62,7 @@ Looking for chat channel docs (WhatsApp/Telegram/Discord/Slack/Mattermost (plugi - [Qianfan](/providers/qianfan) - [Qwen Cloud](/providers/qwen) - [Runway](/providers/runway) +- [SenseAudio](/providers/senseaudio) - [SGLang (local models)](/providers/sglang) - [StepFun](/providers/stepfun) - [Synthetic](/providers/synthetic) @@ -89,6 +90,7 @@ Looking for chat channel docs (WhatsApp/Telegram/Discord/Slack/Mattermost (plugi - [ElevenLabs](/providers/elevenlabs#speech-to-text) - [Mistral](/providers/mistral#audio-transcription-voxtral) - [OpenAI](/providers/openai#speech-to-text) +- [SenseAudio](/providers/senseaudio) - [xAI](/providers/xai#speech-to-text) ## Community tools diff --git a/docs/providers/senseaudio.md b/docs/providers/senseaudio.md new file mode 100644 index 00000000000..24913485a4d --- /dev/null +++ b/docs/providers/senseaudio.md @@ -0,0 +1,65 @@ +--- +summary: "SenseAudio batch speech-to-text for inbound voice notes" +read_when: + - You want SenseAudio speech-to-text for audio attachments + - You need the SenseAudio API key env var or audio config path +title: "SenseAudio" +--- + +# SenseAudio + +SenseAudio can transcribe inbound audio/voice-note attachments through +OpenClaw's shared `tools.media.audio` pipeline. OpenClaw posts multipart audio +to the OpenAI-compatible transcription endpoint and injects the returned text +as `{{Transcript}}` plus an `[Audio]` block. 
+ +| Detail | Value | +| ------------- | ------------------------------------------------ | +| Website | [senseaudio.cn](https://senseaudio.cn) | +| Docs | [senseaudio.cn/docs](https://senseaudio.cn/docs) | +| Auth | `SENSEAUDIO_API_KEY` | +| Default model | `senseaudio-asr-pro-1.5-260319` | +| Default URL | `https://api.senseaudio.cn/v1` | + +## Getting Started + + + + ```bash + export SENSEAUDIO_API_KEY="..." + ``` + + + ```json5 + { + tools: { + media: { + audio: { + enabled: true, + models: [{ provider: "senseaudio", model: "senseaudio-asr-pro-1.5-260319" }], + }, + }, + }, + } + ``` + + + Send an audio message through any connected channel. OpenClaw uploads the + audio to SenseAudio and uses the transcript in the reply pipeline. + + + +## Options + +| Option | Path | Description | +| ---------- | ------------------------------------- | ----------------------------------- | +| `model` | `tools.media.audio.models[].model` | SenseAudio ASR model id | +| `language` | `tools.media.audio.models[].language` | Optional language hint | +| `prompt` | `tools.media.audio.prompt` | Optional transcription prompt | +| `baseUrl` | `tools.media.audio.baseUrl` or model | Override the OpenAI-compatible base | +| `headers` | `tools.media.audio.request.headers` | Extra request headers | + + +SenseAudio is batch STT only in OpenClaw. Voice Call realtime transcription +continues to use providers with streaming STT support. 
+ diff --git a/docs/tools/media-overview.md b/docs/tools/media-overview.md index 2550b2ac6ae..d79cc887cc7 100644 --- a/docs/tools/media-overview.md +++ b/docs/tools/media-overview.md @@ -18,32 +18,35 @@ OpenClaw generates images, videos, and music, understands inbound media (images, | Image generation | `image_generate` | ComfyUI, fal, Google, MiniMax, OpenAI, Vydra, xAI | Creates or edits images from text prompts or references | | Video generation | `video_generate` | Alibaba, BytePlus, ComfyUI, fal, Google, MiniMax, OpenAI, Qwen, Runway, Together, Vydra, xAI | Creates videos from text, images, or existing videos | | Music generation | `music_generate` | ComfyUI, Google, MiniMax | Creates music or audio tracks from text prompts | -| Text-to-speech (TTS) | `tts` | ElevenLabs, Google, Gradium, Microsoft, MiniMax, OpenAI, Vydra, xAI | Converts outbound replies to spoken audio | +| Text-to-speech (TTS) | `tts` | ElevenLabs, Google, Gradium, Local CLI, Microsoft, MiniMax, OpenAI, Vydra, xAI, Xiaomi MiMo | Converts outbound replies to spoken audio | | Media understanding | (automatic) | Any vision/audio-capable model provider, plus CLI fallbacks | Summarizes inbound images, audio, and video | ## Provider capability matrix This table shows which providers support which media capabilities across the platform. 
-| Provider | Image | Video | Music | TTS | STT / Transcription | Realtime Voice | Media Understanding | -| ---------- | ----- | ----- | ----- | --- | ------------------- | -------------- | ------------------- | -| Alibaba | | Yes | | | | | | -| BytePlus | | Yes | | | | | | -| ComfyUI | Yes | Yes | Yes | | | | | -| Deepgram | | | | | Yes | | | -| ElevenLabs | | | | Yes | Yes | | | -| fal | Yes | Yes | | | | | | -| Google | Yes | Yes | Yes | Yes | | Yes | Yes | -| Gradium | | | | Yes | | | | -| Microsoft | | | | Yes | | | | -| MiniMax | Yes | Yes | Yes | Yes | | | | -| Mistral | | | | | Yes | | | -| OpenAI | Yes | Yes | | Yes | Yes | Yes | Yes | -| Qwen | | Yes | | | | | | -| Runway | | Yes | | | | | | -| Together | | Yes | | | | | | -| Vydra | Yes | Yes | | Yes | | | | -| xAI | Yes | Yes | | Yes | Yes | | Yes | +| Provider | Image | Video | Music | TTS | STT / Transcription | Realtime Voice | Media Understanding | +| ----------- | ----- | ----- | ----- | --- | ------------------- | -------------- | ------------------- | +| Alibaba | | Yes | | | | | | +| BytePlus | | Yes | | | | | | +| ComfyUI | Yes | Yes | Yes | | | | | +| Deepgram | | | | | Yes | Yes | | +| ElevenLabs | | | | Yes | Yes | | | +| fal | Yes | Yes | | | | | | +| Google | Yes | Yes | Yes | Yes | | Yes | Yes | +| Gradium | | | | Yes | | | | +| Local CLI | | | | Yes | | | | +| Microsoft | | | | Yes | | | | +| MiniMax | Yes | Yes | Yes | Yes | | | | +| Mistral | | | | | Yes | | | +| OpenAI | Yes | Yes | | Yes | Yes | Yes | Yes | +| Qwen | | Yes | | | | | | +| Runway | | Yes | | | | | | +| SenseAudio | | | | | Yes | | | +| Together | | Yes | | | | | | +| Vydra | Yes | Yes | | Yes | | | | +| xAI | Yes | Yes | | Yes | Yes | | Yes | +| Xiaomi MiMo | Yes | | | Yes | | | Yes | Media understanding uses any vision-capable or audio-capable model registered in your provider config. 
The table above highlights providers with dedicated media-understanding support; most LLM providers with multimodal models (Anthropic, Google, OpenAI, etc.) can also understand inbound media when configured as the active reply model. @@ -53,11 +56,11 @@ Media understanding uses any vision-capable or audio-capable model registered in Video and music generation run as background tasks because provider processing typically takes 30 seconds to several minutes. When the agent calls `video_generate` or `music_generate`, OpenClaw submits the request to the provider, returns a task ID immediately, and tracks the job in the task ledger. The agent continues responding to other messages while the job runs. When the provider finishes, OpenClaw wakes the agent so it can post the finished media back into the original channel. Image generation and TTS are synchronous and complete inline with the reply. -Deepgram, ElevenLabs, Mistral, OpenAI, and xAI can all transcribe inbound -audio through the batch `tools.media.audio` path when configured. Deepgram, -ElevenLabs, Mistral, OpenAI, and xAI also register Voice Call streaming STT -providers, so live phone audio can be forwarded to the selected vendor -without waiting for a completed recording. +Deepgram, ElevenLabs, Mistral, OpenAI, SenseAudio, and xAI can all transcribe +inbound audio through the batch `tools.media.audio` path when configured. +Deepgram, ElevenLabs, Mistral, OpenAI, and xAI also register Voice Call +streaming STT providers, so live phone audio can be forwarded to the selected +vendor without waiting for a completed recording. Google maps to OpenClaw's image, video, music, batch TTS, backend realtime voice, and media-understanding surfaces. 
OpenAI maps to OpenClaw's image, diff --git a/docs/tools/tts.md b/docs/tools/tts.md index bd1f071d652..6012df221b3 100644 --- a/docs/tools/tts.md +++ b/docs/tools/tts.md @@ -7,7 +7,7 @@ read_when: title: "Text-to-speech" --- -OpenClaw can convert outbound replies into audio using ElevenLabs, Google Gemini, Gradium, Microsoft, MiniMax, OpenAI, Vydra, xAI, or Xiaomi MiMo. +OpenClaw can convert outbound replies into audio using ElevenLabs, Google Gemini, Gradium, Local CLI, Microsoft, MiniMax, OpenAI, Vydra, xAI, or Xiaomi MiMo. It works anywhere OpenClaw can send audio. ## Supported services @@ -15,6 +15,7 @@ It works anywhere OpenClaw can send audio. - **ElevenLabs** (primary or fallback provider) - **Google Gemini** (primary or fallback provider; uses Gemini API TTS) - **Gradium** (primary or fallback provider; supports voice-note and telephony output) +- **Local CLI** (primary or fallback provider; runs a configured local TTS command) - **Microsoft** (primary or fallback provider; current bundled implementation uses `node-edge-tts`) - **MiniMax** (primary or fallback provider; uses the T2A v2 API) - **OpenAI** (primary or fallback provider; also used for summaries) @@ -50,7 +51,7 @@ If you want OpenAI, ElevenLabs, Google Gemini, Gradium, MiniMax, Vydra, xAI, or - `XAI_API_KEY` - `XIAOMI_API_KEY` -Microsoft speech does **not** require an API key. +Local CLI and Microsoft speech do **not** require an API key. If multiple providers are configured, the selected provider is used first and the others are fallback options. Auto-summary uses the configured `summaryModel` (or `agents.defaults.model.primary`), @@ -297,6 +298,35 @@ OpenRouter model provider. Resolution order is `messages.tts.providers.openrouter.apiKey` -> `models.providers.openrouter.apiKey` -> `OPENROUTER_API_KEY`. 
+### Local CLI primary + +```json5 +{ + messages: { + tts: { + auto: "always", + provider: "tts-local-cli", + providers: { + "tts-local-cli": { + command: "say", + args: ["-o", "{{OutputPath}}", "{{Text}}"], + outputFormat: "wav", + timeoutMs: 120000, + }, + }, + }, + }, +} +``` + +Local CLI TTS runs the configured command on the gateway host. `{{Text}}`, +`{{OutputPath}}`, `{{OutputDir}}`, and `{{OutputBase}}` placeholders are +expanded in `args`; if no `{{Text}}` placeholder is present, OpenClaw writes the +spoken text to stdin. `outputFormat` accepts `mp3`, `opus`, or `wav`. +Voice-note targets are transcoded to Ogg/Opus and telephony output is +transcoded to raw 16 kHz mono PCM with `ffmpeg`. The legacy provider alias +`cli` still works, but new config should use `tts-local-cli`. + ### Gradium primary ```json5 @@ -417,6 +447,12 @@ Then run: - `providers.minimax.speed`: playback speed `0.5..2.0` (default 1.0). - `providers.minimax.vol`: volume `(0, 10]` (default 1.0; must be greater than 0). - `providers.minimax.pitch`: integer pitch shift `-12..12` (default 0). Fractional values are truncated before calling MiniMax T2A because the API rejects non-integer pitch values. +- `providers.tts-local-cli.command`: local executable or command string for CLI TTS. +- `providers.tts-local-cli.args`: command arguments; supports `{{Text}}`, `{{OutputPath}}`, `{{OutputDir}}`, and `{{OutputBase}}` placeholders. +- `providers.tts-local-cli.outputFormat`: expected CLI output format (`mp3`, `opus`, or `wav`; default `mp3` for audio attachments). +- `providers.tts-local-cli.timeoutMs`: command timeout in milliseconds (default `120000`). +- `providers.tts-local-cli.cwd`: optional command working directory. +- `providers.tts-local-cli.env`: optional string environment overrides for the command. - `providers.google.model`: Gemini TTS model (default `gemini-3.1-flash-tts-preview`). - `providers.google.voiceName`: Gemini prebuilt voice name (default `Kore`; `voice` is also accepted). 
- `providers.google.audioProfile`: natural-language style prompt prepended before the spoken text. @@ -545,6 +581,9 @@ These override `messages.tts.*` for that host. - 44.1kHz / 128kbps is the default balance for speech clarity. - **MiniMax**: MP3 (`speech-2.8-hd` model, 32kHz sample rate) for normal audio attachments. For voice-note targets such as Feishu and Telegram, OpenClaw transcodes the MiniMax MP3 to 48kHz Opus with `ffmpeg` before delivery. - **Xiaomi MiMo**: MP3 by default, or WAV when configured. For voice-note targets such as Feishu and Telegram, OpenClaw transcodes Xiaomi output to 48kHz Opus with `ffmpeg` before delivery. +- **Local CLI**: uses the configured `outputFormat`. Voice-note targets are + converted to Ogg/Opus and telephony output is converted to raw 16 kHz mono PCM + with `ffmpeg`. - **Google Gemini**: Gemini API TTS returns raw 24kHz PCM. OpenClaw wraps it as WAV for audio attachments and returns PCM directly for Talk/telephony. Native Opus voice-note format is not supported by this path. - **Gradium**: WAV for audio attachments, Opus for voice-note targets, and `ulaw_8000` at 8 kHz for telephony. - **xAI**: MP3 by default; `responseFormat` may be `mp3`, `wav`, `pcm`, `mulaw`, or `alaw`. OpenClaw uses xAI's batch REST TTS endpoint and returns a complete audio attachment; xAI's streaming TTS WebSocket is not used by this provider path. Native Opus voice-note format is not supported by this path. 
diff --git a/extensions/qqbot/src/engine/gateway/inbound-context.ts b/extensions/qqbot/src/engine/gateway/inbound-context.ts index 7e4358e6eb2..74f7c7f9e53 100644 --- a/extensions/qqbot/src/engine/gateway/inbound-context.ts +++ b/extensions/qqbot/src/engine/gateway/inbound-context.ts @@ -72,6 +72,7 @@ export interface InboundContext { uniqueVoicePaths: string[]; uniqueVoiceUrls: string[]; uniqueVoiceAsrReferTexts: string[]; + voiceMediaTypes: string[]; hasAsrReferFallback: boolean; voiceTranscriptSources: string[]; diff --git a/extensions/qqbot/src/engine/gateway/inbound-pipeline.ts b/extensions/qqbot/src/engine/gateway/inbound-pipeline.ts index b3c65d4599f..ef82dc34bb2 100644 --- a/extensions/qqbot/src/engine/gateway/inbound-pipeline.ts +++ b/extensions/qqbot/src/engine/gateway/inbound-pipeline.ts @@ -254,6 +254,7 @@ export async function buildInboundContext( localMediaTypes.push(t); } } + const voiceMediaTypes = [...uniqueVoicePaths, ...uniqueVoiceUrls].map(() => "audio/wav"); return { event, @@ -279,6 +280,7 @@ export async function buildInboundContext( uniqueVoicePaths, uniqueVoiceUrls, uniqueVoiceAsrReferTexts, + voiceMediaTypes, hasAsrReferFallback, voiceTranscriptSources, replyTo, @@ -342,6 +344,7 @@ function buildBlockedInboundContext(params: { uniqueVoicePaths: [], uniqueVoiceUrls: [], uniqueVoiceAsrReferTexts: [], + voiceMediaTypes: [], hasAsrReferFallback: false, voiceTranscriptSources: [], replyTo: undefined, diff --git a/extensions/qqbot/src/engine/gateway/outbound-dispatch.test.ts b/extensions/qqbot/src/engine/gateway/outbound-dispatch.test.ts new file mode 100644 index 00000000000..16f2e04899e --- /dev/null +++ b/extensions/qqbot/src/engine/gateway/outbound-dispatch.test.ts @@ -0,0 +1,202 @@ +import { describe, expect, it, vi, beforeEach } from "vitest"; +import type { InboundContext } from "./inbound-context.js"; +import { dispatchOutbound } from "./outbound-dispatch.js"; +import type { GatewayAccount, GatewayPluginRuntime } from "./types.js"; + 
+const sendVoiceMessageMock = vi.hoisted(() => + vi.fn(async () => ({ id: "voice-1", timestamp: "2026-04-25T00:00:00.000Z" })), +); +const sendTextMock = vi.hoisted(() => + vi.fn(async () => ({ id: "text-1", timestamp: "2026-04-25T00:00:00.000Z" })), +); +const audioFileToSilkBase64Mock = vi.hoisted(() => vi.fn(async () => "silk-base64")); + +vi.mock("../messaging/sender.js", () => ({ + accountToCreds: (account: GatewayAccount) => ({ + appId: account.appId, + clientSecret: account.clientSecret, + }), + buildDeliveryTarget: (target: { type: string; senderId: string; groupOpenid?: string }) => ({ + type: target.type === "group" ? "group" : target.type === "c2c" ? "c2c" : target.type, + id: target.type === "group" ? target.groupOpenid : target.senderId, + }), + initApiConfig: vi.fn(), + sendFileMessage: vi.fn(), + sendImage: vi.fn(), + sendText: sendTextMock, + sendVideoMessage: vi.fn(), + sendVoiceMessage: sendVoiceMessageMock, + withTokenRetry: async (_creds: unknown, fn: () => Promise) => await fn(), +})); + +vi.mock("../utils/audio.js", () => ({ + audioFileToSilkBase64: audioFileToSilkBase64Mock, +})); + +const account: GatewayAccount = { + accountId: "qq-main", + appId: "app", + clientSecret: "secret", + markdownSupport: false, + config: {}, +}; + +function makeInbound(overrides: Partial = {}): InboundContext { + return { + event: { + type: "c2c", + senderId: "user-openid", + messageId: "msg-1", + content: "voice", + timestamp: "2026-04-25T00:00:00.000Z", + }, + route: { sessionKey: "qqbot:c2c:user-openid", accountId: "qq-main" }, + isGroupChat: false, + peerId: "user-openid", + qualifiedTarget: "qqbot:c2c:user-openid", + fromAddress: "qqbot:c2c:user-openid", + parsedContent: "voice", + userContent: "voice", + quotePart: "", + dynamicCtx: "", + userMessage: "voice", + agentBody: "voice", + body: "voice", + systemPrompts: [], + attachments: { + attachmentInfo: "", + imageUrls: [], + imageMediaTypes: [], + voiceAttachmentPaths: [], + voiceAttachmentUrls: [], + 
voiceAsrReferTexts: [], + voiceTranscripts: [], + voiceTranscriptSources: [], + attachmentLocalPaths: [], + }, + localMediaPaths: [], + localMediaTypes: [], + remoteMediaUrls: [], + remoteMediaTypes: [], + uniqueVoicePaths: [], + uniqueVoiceUrls: [], + uniqueVoiceAsrReferTexts: [], + voiceMediaTypes: [], + hasAsrReferFallback: false, + voiceTranscriptSources: [], + commandAuthorized: false, + blocked: false, + typing: { keepAlive: null }, + ...overrides, + }; +} + +function makeRuntime(params: { + onFinalize?: (ctx: Record) => void; + onDeliver?: ( + deliver: ( + payload: { text?: string; audioAsVoice?: boolean }, + info: { kind: string }, + ) => Promise, + ) => Promise; +}): GatewayPluginRuntime { + return { + channel: { + activity: { record: vi.fn() }, + routing: { + resolveAgentRoute: vi.fn(() => ({ + sessionKey: "qqbot:c2c:user-openid", + accountId: "qq-main", + })), + }, + reply: { + dispatchReplyWithBufferedBlockDispatcher: vi.fn(async (rawParams: unknown) => { + const deliver = ( + rawParams as { + dispatcherOptions: { + deliver: ( + payload: { text?: string; audioAsVoice?: boolean }, + info: { kind: string }, + ) => Promise; + }; + } + ).dispatcherOptions.deliver; + await params.onDeliver?.(deliver); + }), + finalizeInboundContext: vi.fn((rawCtx: Record) => { + params.onFinalize?.(rawCtx); + return rawCtx; + }), + formatInboundEnvelope: vi.fn(() => "voice"), + resolveEffectiveMessagesConfig: vi.fn(() => ({})), + resolveEnvelopeFormatOptions: vi.fn(() => ({})), + }, + text: { + chunkMarkdownText: (text: string) => [text], + }, + }, + tts: { + textToSpeech: vi.fn(async () => ({ + success: true, + audioPath: "/tmp/openclaw-qqbot/tts.wav", + provider: "test-tts", + outputFormat: "wav", + })), + }, + }; +} + +describe("dispatchOutbound", () => { + beforeEach(() => { + vi.clearAllMocks(); + }); + + it("marks voice-only inbound as audio without adding voice paths to MediaPaths", async () => { + let finalized: Record | undefined; + const runtime = makeRuntime({ 
onFinalize: (ctx) => (finalized = ctx) }); + + await dispatchOutbound( + makeInbound({ + uniqueVoicePaths: ["/tmp/qqbot/voice.wav"], + voiceMediaTypes: ["audio/wav"], + }), + { runtime, cfg: {}, account }, + ); + + expect(finalized).toMatchObject({ + MediaType: "audio/wav", + MediaTypes: ["audio/wav"], + QQVoiceAttachmentPaths: ["/tmp/qqbot/voice.wav"], + }); + expect(finalized).not.toHaveProperty("MediaPath"); + expect(finalized).not.toHaveProperty("MediaPaths"); + }); + + it("synthesizes plain audioAsVoice text as a QQ voice reply", async () => { + const runtime = makeRuntime({ + onDeliver: async (deliver) => { + await deliver({ text: "read this aloud", audioAsVoice: true }, { kind: "block" }); + }, + }); + + await dispatchOutbound(makeInbound(), { runtime, cfg: {}, account }); + + expect(runtime.tts.textToSpeech).toHaveBeenCalledWith({ + text: "read this aloud", + cfg: {}, + channel: "qqbot", + }); + expect(audioFileToSilkBase64Mock).toHaveBeenCalledWith("/tmp/openclaw-qqbot/tts.wav"); + expect(sendVoiceMessageMock).toHaveBeenCalledWith( + { type: "c2c", id: "user-openid" }, + { appId: "app", clientSecret: "secret" }, + expect.objectContaining({ + filePath: "/tmp/openclaw-qqbot/tts.wav", + msgId: "msg-1", + ttsText: "read this aloud", + voiceBase64: "silk-base64", + }), + ); + expect(sendTextMock).not.toHaveBeenCalled(); + }); +}); diff --git a/extensions/qqbot/src/engine/gateway/outbound-dispatch.ts b/extensions/qqbot/src/engine/gateway/outbound-dispatch.ts index 4909de7cef8..9dc30c16a0d 100644 --- a/extensions/qqbot/src/engine/gateway/outbound-dispatch.ts +++ b/extensions/qqbot/src/engine/gateway/outbound-dispatch.ts @@ -24,6 +24,7 @@ import { } from "../messaging/outbound.js"; import { handleStructuredPayload, + sendTextAsVoiceReply, sendErrorToTarget, sendWithTokenRetry, type ReplyDispatcherDeps, @@ -53,6 +54,13 @@ export interface OutboundDispatchDeps { log?: EngineLogger; } +type ReplyDeliverPayload = { + text?: string; + mediaUrls?: string[]; + mediaUrl?: 
string; + audioAsVoice?: boolean; +}; + // ============ dispatchOutbound ============ /** @@ -185,10 +193,7 @@ export async function dispatchOutbound( cfg, dispatcherOptions: { responsePrefix: messagesConfig.responsePrefix, - deliver: async ( - payload: { text?: string; mediaUrls?: string[]; mediaUrl?: string }, - info: { kind: string }, - ) => { + deliver: async (payload: ReplyDeliverPayload, info: { kind: string }) => { hasResponse = true; // ---- Tool deliver ---- @@ -303,7 +308,16 @@ export async function dispatchOutbound( return; } - // 3. Plain text + images + // 3. Voice-intent plain text + if (payload.audioAsVoice === true && !payload.mediaUrl && !payload.mediaUrls?.length) { + const sentVoice = await sendTextAsVoiceReply(replyCtx, replyText, replyDeps); + if (sentVoice) { + recordOutbound(); + return; + } + } + + // 4. Plain text + images/media await sendPlainReply( payload, replyText, @@ -380,6 +394,12 @@ function buildCtxPayload(inbound: InboundContext, runtime: GatewayPluginRuntime) QQVoiceAsrReferTexts: inbound.uniqueVoiceAsrReferTexts, QQVoiceInputStrategy: "prefer_audio_stt_then_asr_fallback", CommandAuthorized: inbound.commandAuthorized, + ...(inbound.voiceMediaTypes.length > 0 + ? { + MediaTypes: inbound.voiceMediaTypes, + MediaType: inbound.voiceMediaTypes[0], + } + : {}), ...(inbound.localMediaPaths.length > 0 ? 
{ MediaPaths: inbound.localMediaPaths, diff --git a/extensions/qqbot/src/engine/messaging/outbound-deliver.ts b/extensions/qqbot/src/engine/messaging/outbound-deliver.ts index 92947165924..fbdf399c21b 100644 --- a/extensions/qqbot/src/engine/messaging/outbound-deliver.ts +++ b/extensions/qqbot/src/engine/messaging/outbound-deliver.ts @@ -462,6 +462,7 @@ export interface PlainReplyPayload { text?: string; mediaUrls?: string[]; mediaUrl?: string; + audioAsVoice?: boolean; } /** diff --git a/extensions/qqbot/src/engine/messaging/reply-dispatcher.ts b/extensions/qqbot/src/engine/messaging/reply-dispatcher.ts index 6e292b5c458..17af09a2653 100644 --- a/extensions/qqbot/src/engine/messaging/reply-dispatcher.ts +++ b/extensions/qqbot/src/engine/messaging/reply-dispatcher.ts @@ -380,16 +380,25 @@ async function handleAudioPayload( payload: MediaPayload, deps?: ReplyDispatcherDeps, ): Promise { + const ttsText = payload.caption || payload.path; + await sendTextAsVoiceReply(ctx, ttsText, deps); +} + +export async function sendTextAsVoiceReply( + ctx: ReplyContext, + text: string | undefined, + deps?: ReplyDispatcherDeps, +): Promise { const { target, account, cfg, log } = ctx; if (!deps) { log?.error(`TTS deps not provided, cannot handle audio payload`); - return; + return false; } try { - const ttsText = payload.caption || payload.path; + const ttsText = text; if (!ttsText?.trim()) { log?.error(`Voice missing text`); - return; + return false; } log?.debug?.(`TTS: "${ttsText.slice(0, 50)}..."`); @@ -400,7 +409,7 @@ async function handleAudioPayload( }); if (!ttsResult.success || !ttsResult.audioPath) { log?.error(`TTS failed: ${ttsResult.error ?? "unknown"}`); - return; + return false; } const providerLabel = ttsResult.provider ?? 
"unknown"; @@ -411,7 +420,7 @@ async function handleAudioPayload( const silkBase64 = await deps.tts.audioFileToSilkBase64(ttsResult.audioPath); if (!silkBase64) { log?.error(`Failed to convert TTS audio to SILK`); - return; + return false; } const silkPath = ttsResult.audioPath; @@ -439,8 +448,10 @@ async function handleAudioPayload( account.accountId, ); log?.debug?.(`Voice message sent`); + return true; } catch (err) { log?.error(`TTS/voice send failed: ${formatErrorMessage(err)}`); + return false; } } diff --git a/extensions/senseaudio/index.ts b/extensions/senseaudio/index.ts new file mode 100644 index 00000000000..8b6b32efe69 --- /dev/null +++ b/extensions/senseaudio/index.ts @@ -0,0 +1,11 @@ +import { definePluginEntry } from "openclaw/plugin-sdk/plugin-entry"; +import { senseaudioMediaUnderstandingProvider } from "./media-understanding-provider.js"; + +export default definePluginEntry({ + id: "senseaudio", + name: "SenseAudio", + description: "Bundled SenseAudio audio transcription provider", + register(api) { + api.registerMediaUnderstandingProvider(senseaudioMediaUnderstandingProvider); + }, +}); diff --git a/extensions/senseaudio/media-understanding-provider.test.ts b/extensions/senseaudio/media-understanding-provider.test.ts new file mode 100644 index 00000000000..0e175a53730 --- /dev/null +++ b/extensions/senseaudio/media-understanding-provider.test.ts @@ -0,0 +1,136 @@ +import { spawnSync } from "node:child_process"; +import { mkdtempSync, readFileSync, rmSync } from "node:fs"; +import os from "node:os"; +import path from "node:path"; +import { runFfmpeg } from "openclaw/plugin-sdk/media-runtime"; +import { describe, expect, it } from "vitest"; +import { + createAuthCaptureJsonFetch, + createRequestCaptureJsonFetch, + installPinnedHostnameTestHooks, +} from "../../src/media-understanding/audio.test-helpers.ts"; +import { transcribeSenseAudioAudio } from "./media-understanding-provider.js"; + +installPinnedHostnameTestHooks(); + 
+describe("transcribeSenseAudioAudio", () => { + it("uses SenseAudio base URL by default", async () => { + const { fetchFn, getRequest } = createRequestCaptureJsonFetch({ text: "ok" }); + + await transcribeSenseAudioAudio({ + buffer: Buffer.from("audio"), + fileName: "note.mp3", + apiKey: "test-key", + timeoutMs: 1000, + fetchFn, + }); + + expect(getRequest().url).toBe("https://api.senseaudio.cn/v1/audio/transcriptions"); + }); + + it("respects lowercase authorization header overrides", async () => { + const { fetchFn, getAuthHeader } = createAuthCaptureJsonFetch({ text: "ok" }); + + const result = await transcribeSenseAudioAudio({ + buffer: Buffer.from("audio"), + fileName: "note.mp3", + apiKey: "test-key", + timeoutMs: 1000, + headers: { authorization: "Bearer override" }, + fetchFn, + }); + + expect(getAuthHeader()).toBe("Bearer override"); + expect(result.text).toBe("ok"); + }); + + it("builds the expected request payload", async () => { + const { fetchFn, getRequest } = createRequestCaptureJsonFetch({ text: "hello" }); + + const result = await transcribeSenseAudioAudio({ + buffer: Buffer.from("audio-bytes"), + fileName: "voice.wav", + apiKey: "test-key", + timeoutMs: 1234, + baseUrl: "https://api.example.com/v1/", + model: " ", + language: " en ", + prompt: " hello ", + mime: "audio/wav", + headers: { "X-Custom": "1" }, + fetchFn, + }); + const { url: seenUrl, init: seenInit } = getRequest(); + + expect(result.model).toBe("senseaudio-asr-pro-1.5-260319"); + expect(result.text).toBe("hello"); + expect(seenUrl).toBe("https://api.example.com/v1/audio/transcriptions"); + expect(seenInit?.method).toBe("POST"); + expect(seenInit?.signal).toBeInstanceOf(AbortSignal); + + const headers = new Headers(seenInit?.headers); + expect(headers.get("authorization")).toBe("Bearer test-key"); + expect(headers.get("x-custom")).toBe("1"); + + const form = seenInit?.body as FormData; + expect(form).toBeInstanceOf(FormData); + 
expect(form.get("model")).toBe("senseaudio-asr-pro-1.5-260319"); + expect(form.get("language")).toBe("en"); + expect(form.get("prompt")).toBe("hello"); + const file = form.get("file") as Blob | { type?: string; name?: string } | null; + expect(file).not.toBeNull(); + if (file) { + expect(file.type).toBe("audio/wav"); + if ("name" in file && typeof file.name === "string") { + expect(file.name).toBe("voice.wav"); + } + } + }); + + it("throws when the provider response omits text", async () => { + const { fetchFn } = createRequestCaptureJsonFetch({}); + + await expect( + transcribeSenseAudioAudio({ + buffer: Buffer.from("audio-bytes"), + fileName: "voice.wav", + apiKey: "test-key", + timeoutMs: 1234, + fetchFn, + }), + ).rejects.toThrow("Audio transcription response missing text"); + }); + + it("can transcribe generated speech in live mode", async () => { + if (process.env.OPENCLAW_LIVE_TEST !== "1" || !process.env.SENSEAUDIO_API_KEY) { + return; + } + const say = spawnSync("sh", ["-lc", "command -v say"], { encoding: "utf8" }); + if (say.status !== 0) { + return; + } + + const tempDir = mkdtempSync(path.join(os.tmpdir(), "openclaw-senseaudio-live-")); + try { + const aiffPath = path.join(tempDir, "speech.aiff"); + const mp3Path = path.join(tempDir, "speech.mp3"); + const sayResult = spawnSync("say", ["-o", aiffPath, "open claw live transcription test"], { + encoding: "utf8", + }); + expect(sayResult.status).toBe(0); + await runFfmpeg(["-y", "-i", aiffPath, "-c:a", "libmp3lame", "-b:a", "96k", mp3Path]); + + const result = await transcribeSenseAudioAudio({ + buffer: readFileSync(mp3Path), + fileName: "speech.mp3", + mime: "audio/mpeg", + apiKey: process.env.SENSEAUDIO_API_KEY, + timeoutMs: 30_000, + }); + + expect(result.text.trim().length).toBeGreaterThan(0); + } finally { + rmSync(tempDir, { recursive: true, force: true }); + } + }); +}); diff --git a/extensions/senseaudio/media-understanding-provider.ts b/extensions/senseaudio/media-understanding-provider.ts new 
file mode 100644 index 00000000000..8525adf2bc5 --- /dev/null +++ b/extensions/senseaudio/media-understanding-provider.ts @@ -0,0 +1,25 @@ +import { + transcribeOpenAiCompatibleAudio, + type AudioTranscriptionRequest, + type MediaUnderstandingProvider, +} from "openclaw/plugin-sdk/media-understanding"; + +const DEFAULT_SENSEAUDIO_AUDIO_BASE_URL = "https://api.senseaudio.cn/v1"; +const DEFAULT_SENSEAUDIO_AUDIO_MODEL = "senseaudio-asr-pro-1.5-260319"; + +export async function transcribeSenseAudioAudio(params: AudioTranscriptionRequest) { + return await transcribeOpenAiCompatibleAudio({ + ...params, + provider: "senseaudio", + defaultBaseUrl: DEFAULT_SENSEAUDIO_AUDIO_BASE_URL, + defaultModel: DEFAULT_SENSEAUDIO_AUDIO_MODEL, + }); +} + +export const senseaudioMediaUnderstandingProvider: MediaUnderstandingProvider = { + id: "senseaudio", + capabilities: ["audio"], + defaultModels: { audio: DEFAULT_SENSEAUDIO_AUDIO_MODEL }, + autoPriority: { audio: 40 }, + transcribeAudio: transcribeSenseAudioAudio, +}; diff --git a/extensions/senseaudio/openclaw.plugin.json b/extensions/senseaudio/openclaw.plugin.json new file mode 100644 index 00000000000..c23b461697d --- /dev/null +++ b/extensions/senseaudio/openclaw.plugin.json @@ -0,0 +1,15 @@ +{ + "id": "senseaudio", + "enabledByDefault": true, + "providerAuthEnvVars": { + "senseaudio": ["SENSEAUDIO_API_KEY"] + }, + "contracts": { + "mediaUnderstandingProviders": ["senseaudio"] + }, + "configSchema": { + "type": "object", + "additionalProperties": false, + "properties": {} + } +} diff --git a/extensions/senseaudio/package.json b/extensions/senseaudio/package.json new file mode 100644 index 00000000000..ce2c00e4c72 --- /dev/null +++ b/extensions/senseaudio/package.json @@ -0,0 +1,15 @@ +{ + "name": "@openclaw/senseaudio-provider", + "version": "2026.4.25", + "private": true, + "description": "OpenClaw SenseAudio media-understanding provider", + "type": "module", + "devDependencies": { + "@openclaw/plugin-sdk": "workspace:*" + }, + 
"openclaw": { + "extensions": [ + "./index.ts" + ] + } +} diff --git a/extensions/senseaudio/test-api.ts b/extensions/senseaudio/test-api.ts new file mode 100644 index 00000000000..7df611fa2b4 --- /dev/null +++ b/extensions/senseaudio/test-api.ts @@ -0,0 +1 @@ +export { senseaudioMediaUnderstandingProvider } from "./media-understanding-provider.js"; diff --git a/extensions/tts-local-cli/index.ts b/extensions/tts-local-cli/index.ts new file mode 100644 index 00000000000..6b53e6af3fb --- /dev/null +++ b/extensions/tts-local-cli/index.ts @@ -0,0 +1,11 @@ +import { definePluginEntry } from "openclaw/plugin-sdk/plugin-entry"; +import { buildCliSpeechProvider } from "./speech-provider.js"; + +export default definePluginEntry({ + id: "tts-local-cli", + name: "Local CLI TTS", + description: "Bundled CLI speech provider for local TTS", + register(api) { + api.registerSpeechProvider(buildCliSpeechProvider()); + }, +}); diff --git a/extensions/tts-local-cli/openclaw.plugin.json b/extensions/tts-local-cli/openclaw.plugin.json new file mode 100644 index 00000000000..418a9c7fbe4 --- /dev/null +++ b/extensions/tts-local-cli/openclaw.plugin.json @@ -0,0 +1,12 @@ +{ + "id": "tts-local-cli", + "enabledByDefault": true, + "contracts": { + "speechProviders": ["tts-local-cli", "cli"] + }, + "configSchema": { + "type": "object", + "additionalProperties": false, + "properties": {} + } +} diff --git a/extensions/tts-local-cli/package.json b/extensions/tts-local-cli/package.json new file mode 100644 index 00000000000..cd1aab94d01 --- /dev/null +++ b/extensions/tts-local-cli/package.json @@ -0,0 +1,15 @@ +{ + "name": "@openclaw/tts-local-cli", + "version": "2026.4.25", + "private": true, + "description": "OpenClaw local CLI TTS plugin", + "type": "module", + "devDependencies": { + "@openclaw/plugin-sdk": "workspace:*" + }, + "openclaw": { + "extensions": [ + "./index.ts" + ] + } +} diff --git a/extensions/tts-local-cli/speech-provider.test.ts 
b/extensions/tts-local-cli/speech-provider.test.ts new file mode 100644 index 00000000000..2457e7fac14 --- /dev/null +++ b/extensions/tts-local-cli/speech-provider.test.ts @@ -0,0 +1,283 @@ +import { mkdtempSync, readFileSync, rmSync, writeFileSync } from "node:fs"; +import os from "node:os"; +import path from "node:path"; +import type { OpenClawConfig } from "openclaw/plugin-sdk/config-runtime"; +import type { SpeechProviderConfig, SpeechSynthesisRequest } from "openclaw/plugin-sdk/speech-core"; +import { afterEach, beforeEach, describe, expect, it, vi } from "vitest"; + +type SpeechSynthesisTarget = SpeechSynthesisRequest["target"]; + +const runFfmpegMock = vi.hoisted(() => vi.fn<(args: string[]) => Promise>()); + +vi.mock("openclaw/plugin-sdk/media-runtime", () => ({ + runFfmpeg: runFfmpegMock, +})); + +import { buildCliSpeechProvider } from "./speech-provider.js"; + +const TEST_CFG = {} as OpenClawConfig; + +function createCliFixture(): { dir: string; script: string } { + const dir = mkdtempSync(path.join(os.tmpdir(), "openclaw-cli-tts-test-")); + const script = path.join(dir, "write-audio.mjs"); + writeFileSync( + script, + ` +import { writeFileSync } from "node:fs"; + +const outIndex = process.argv.indexOf("--out"); +const outputPath = outIndex >= 0 ? process.argv[outIndex + 1] : ""; +const textIndex = process.argv.indexOf("--text"); +const textArg = textIndex >= 0 ? 
process.argv[textIndex + 1] : ""; +const stdin = await new Promise((resolve) => { + let data = ""; + process.stdin.setEncoding("utf8"); + process.stdin.on("data", (chunk) => { data += chunk; }); + process.stdin.on("end", () => resolve(data)); +}); +const payload = Buffer.from(JSON.stringify({ args: process.argv.slice(2), stdin, textArg })); +if (outputPath) { + writeFileSync(outputPath, payload); +} else { + process.stdout.write(payload); +} +`, + ); + return { dir, script }; +} + +function baseProviderConfig( + script: string, + overrides: SpeechProviderConfig = {}, +): SpeechProviderConfig { + return { + command: process.execPath, + args: [script], + timeoutMs: 1000, + ...overrides, + }; +} + +async function synthesize(params: { + providerConfig: SpeechProviderConfig; + text?: string; + target?: SpeechSynthesisTarget; +}) { + return await buildCliSpeechProvider().synthesize({ + text: params.text ?? "hello world", + cfg: TEST_CFG, + providerConfig: params.providerConfig, + providerOverrides: {}, + timeoutMs: 1000, + target: params.target ?? 
"audio-file", + }); +} + +describe("buildCliSpeechProvider", () => { + beforeEach(() => { + runFfmpegMock.mockImplementation(async (args) => { + const outputPath = args.at(-1); + if (typeof outputPath !== "string") { + throw new Error("missing ffmpeg output path"); + } + writeFileSync(outputPath, Buffer.from(`converted:${path.extname(outputPath)}`)); + }); + }); + + afterEach(() => { + vi.clearAllMocks(); + }); + + it("prefers canonical provider config over the cli alias", () => { + const provider = buildCliSpeechProvider(); + + expect( + provider.resolveConfig?.({ + cfg: TEST_CFG, + rawConfig: { + providers: { + cli: { command: "alias-command" }, + "tts-local-cli": { command: "canonical-command" }, + }, + }, + timeoutMs: 1000, + }), + ).toEqual({ command: "canonical-command" }); + }); + + it("passes text through stdin when args omit the text template", async () => { + const fixture = createCliFixture(); + try { + const result = await synthesize({ + providerConfig: baseProviderConfig(fixture.script, { + args: [fixture.script, "--out", "{{OutputPath}}"], + outputFormat: "mp3", + }), + text: "hello 😀 world", + }); + + expect(result).toMatchObject({ + outputFormat: "mp3", + fileExtension: ".mp3", + voiceCompatible: false, + }); + expect(JSON.parse(result.audioBuffer.toString("utf8"))).toMatchObject({ + stdin: "hello world", + textArg: "", + }); + expect(runFfmpegMock).not.toHaveBeenCalled(); + } finally { + rmSync(fixture.dir, { recursive: true, force: true }); + } + }); + + it("uses template args and stdout output when no output file is produced", async () => { + const fixture = createCliFixture(); + try { + const result = await synthesize({ + providerConfig: baseProviderConfig(fixture.script, { + args: [fixture.script, "--text", "{{Text}}"], + outputFormat: "wav", + }), + text: "spoken words", + }); + + expect(result).toMatchObject({ + outputFormat: "wav", + fileExtension: ".wav", + voiceCompatible: false, + }); + 
expect(JSON.parse(result.audioBuffer.toString("utf8"))).toMatchObject({ + stdin: "", + textArg: "spoken words", + }); + } finally { + rmSync(fixture.dir, { recursive: true, force: true }); + } + }); + + it("converts non-opus output for voice-note targets", async () => { + const fixture = createCliFixture(); + try { + const result = await synthesize({ + providerConfig: baseProviderConfig(fixture.script, { + args: [fixture.script, "--out", "{{OutputPath}}"], + outputFormat: "mp3", + }), + target: "voice-note", + }); + + expect(result).toEqual({ + audioBuffer: Buffer.from("converted:.opus"), + outputFormat: "opus", + fileExtension: ".ogg", + voiceCompatible: true, + }); + expect(runFfmpegMock).toHaveBeenCalledWith( + expect.arrayContaining(["-c:a", "libopus", "-b:a", "64k"]), + ); + } finally { + rmSync(fixture.dir, { recursive: true, force: true }); + } + }); + + it("converts stdout WAV to the requested audio-file format", async () => { + const fixture = createCliFixture(); + try { + const result = await synthesize({ + providerConfig: baseProviderConfig(fixture.script, { + args: [fixture.script, "--text", "{{Text}}"], + outputFormat: "mp3", + }), + }); + + expect(result).toEqual({ + audioBuffer: Buffer.from("converted:.mp3"), + outputFormat: "mp3", + fileExtension: ".mp3", + voiceCompatible: false, + }); + expect(runFfmpegMock).toHaveBeenCalledWith( + expect.arrayContaining(["-c:a", "libmp3lame", "-b:a", "128k"]), + ); + } finally { + rmSync(fixture.dir, { recursive: true, force: true }); + } + }); + + it("converts CLI output to raw telephony PCM", async () => { + const fixture = createCliFixture(); + try { + const result = await buildCliSpeechProvider().synthesizeTelephony?.({ + text: "phone reply", + cfg: TEST_CFG, + providerConfig: baseProviderConfig(fixture.script, { + args: [fixture.script, "--out", "{{OutputPath}}"], + outputFormat: "wav", + }), + timeoutMs: 1000, + }); + + expect(result).toEqual({ + audioBuffer: Buffer.from("converted:.pcm"), + outputFormat: 
"pcm", + sampleRate: 16000, + }); + expect(runFfmpegMock).toHaveBeenCalledWith( + expect.arrayContaining(["-ar", "16000", "-ac", "1", "-f", "s16le"]), + ); + } finally { + rmSync(fixture.dir, { recursive: true, force: true }); + } + }); + + it("can synthesize through a real local CLI fixture and ffmpeg", async () => { + if (process.env.OPENCLAW_LIVE_TEST !== "1") { + return; + } + const fixture = createCliFixture(); + const rawFfmpeg = await vi.importActual( + "openclaw/plugin-sdk/media-runtime", + ); + runFfmpegMock.mockImplementation(async (args) => { + await rawFfmpeg.runFfmpeg(args); + }); + try { + const wavPath = path.join(fixture.dir, "source.wav"); + await rawFfmpeg.runFfmpeg([ + "-y", + "-f", + "lavfi", + "-i", + "sine=frequency=660:duration=0.1", + "-c:a", + "pcm_s16le", + wavPath, + ]); + writeFileSync( + fixture.script, + ` +import { copyFileSync } from "node:fs"; +const outIndex = process.argv.indexOf("--out"); +copyFileSync(${JSON.stringify(wavPath)}, process.argv[outIndex + 1]); +`, + ); + + const result = await synthesize({ + providerConfig: baseProviderConfig(fixture.script, { + args: [fixture.script, "--out", "{{OutputPath}}"], + outputFormat: "wav", + }), + target: "voice-note", + }); + + expect(result.outputFormat).toBe("opus"); + expect(result.fileExtension).toBe(".ogg"); + expect(result.voiceCompatible).toBe(true); + expect(result.audioBuffer.byteLength).toBeGreaterThan(0); + expect(readFileSync(wavPath).byteLength).toBeGreaterThan(0); + } finally { + rmSync(fixture.dir, { recursive: true, force: true }); + } + }); +}); diff --git a/extensions/tts-local-cli/speech-provider.ts b/extensions/tts-local-cli/speech-provider.ts new file mode 100644 index 00000000000..6bbd3c99e1d --- /dev/null +++ b/extensions/tts-local-cli/speech-provider.ts @@ -0,0 +1,436 @@ +import { spawn } from "node:child_process"; +import { existsSync, mkdtempSync, readdirSync, readFileSync, rmSync, writeFileSync } from "node:fs"; +import path from "node:path"; +import { 
runFfmpeg } from "openclaw/plugin-sdk/media-runtime"; +import { createSubsystemLogger } from "openclaw/plugin-sdk/runtime-env"; +import type { + SpeechProviderConfig, + SpeechProviderPlugin, + SpeechSynthesisRequest, + SpeechTelephonySynthesisRequest, +} from "openclaw/plugin-sdk/speech-core"; +import { resolvePreferredOpenClawTmpDir } from "openclaw/plugin-sdk/temp-path"; + +const log = createSubsystemLogger("tts-local-cli"); + +const VALID_OUTPUT_FORMATS = ["mp3", "opus", "wav"] as const; +const AUDIO_EXTENSIONS = new Set([".wav", ".mp3", ".opus", ".ogg", ".m4a"]); +type OutputFormat = (typeof VALID_OUTPUT_FORMATS)[number]; + +type CliConfig = { + command: string; + args?: string[]; + outputFormat?: OutputFormat; + timeoutMs?: number; + cwd?: string; + env?: Record; +}; + +const DEFAULT_TIMEOUT_MS = 120_000; + +function asObject(value: unknown): Record | undefined { + return typeof value === "object" && value !== null && !Array.isArray(value) + ? (value as Record) + : undefined; +} + +function asStringArray(value: unknown): string[] | undefined { + return Array.isArray(value) && value.every((v) => typeof v === "string") ? value : undefined; +} + +function asRecord(value: unknown): Record | undefined { + const obj = asObject(value); + if (!obj) { + return undefined; + } + const result: Record = {}; + for (const [k, v] of Object.entries(obj)) { + if (typeof v === "string") { + result[k] = v; + } + } + return Object.keys(result).length > 0 ? result : undefined; +} + +function normalizeOutputFormat(value: unknown): OutputFormat { + if (typeof value !== "string") { + return "mp3"; + } + const lower = value.toLowerCase().trim(); + if (VALID_OUTPUT_FORMATS.includes(lower as OutputFormat)) { + return lower as OutputFormat; + } + return "mp3"; +} + +function resolveCliProviderConfig(rawConfig: Record): SpeechProviderConfig { + const providers = asObject(rawConfig.providers); + return asObject(providers?.["tts-local-cli"]) ?? asObject(providers?.cli) ?? 
{}; +} + +function getConfig(cfg: SpeechProviderConfig): CliConfig | null { + const command = typeof cfg.command === "string" ? cfg.command.trim() : ""; + if (!command) { + return null; + } + return { + command, + args: asStringArray(cfg.args), + outputFormat: normalizeOutputFormat(cfg.outputFormat), + timeoutMs: typeof cfg.timeoutMs === "number" ? cfg.timeoutMs : DEFAULT_TIMEOUT_MS, + cwd: typeof cfg.cwd === "string" ? cfg.cwd : undefined, + env: asRecord(cfg.env), + }; +} + +function stripEmojis(text: string): string { + return text + .replace(/[\p{Emoji_Presentation}\p{Extended_Pictographic}]/gu, " ") + .replace(/\s+/g, " ") + .trim(); +} + +function applyTemplate(str: string, ctx: Record): string { + return str.replace(/{{\s*(\w+)\s*}}/gi, (_, key) => { + const normalizedKey = key.charAt(0).toUpperCase() + key.slice(1).toLowerCase(); + return ctx[normalizedKey] ?? ctx[key] ?? ""; + }); +} + +function parseCommand(cmdStr: string): { cmd: string; initialArgs: string[] } { + const parts: string[] = []; + let current = ""; + let inQuote = false; + let quoteChar = ""; + + for (const char of cmdStr.trim()) { + if (inQuote) { + if (char === quoteChar) { + inQuote = false; + } else { + current += char; + } + } else if (char === '"' || char === "'") { + inQuote = true; + quoteChar = char; + } else if (char === " " || char === "\t") { + if (current) { + parts.push(current); + current = ""; + } + } else { + current += char; + } + } + if (current) { + parts.push(current); + } + return { cmd: parts[0] || "", initialArgs: parts.slice(1) }; +} + +function findAudioFile(dir: string, baseName: string): string | null { + const files = readdirSync(dir); + for (const file of files) { + const ext = path.extname(file).toLowerCase(); + if (AUDIO_EXTENSIONS.has(ext) && (file.startsWith(baseName) || file.includes(baseName))) { + return path.join(dir, file); + } + } + for (const file of files) { + const ext = path.extname(file).toLowerCase(); + if (AUDIO_EXTENSIONS.has(ext)) { + return 
path.join(dir, file); + } + } + return null; +} + +function detectFormat(filePath: string): "mp3" | "opus" | "wav" | null { + const ext = path.extname(filePath).toLowerCase(); + if (ext === ".opus" || ext === ".ogg") { + return "opus"; + } + if (ext === ".wav") { + return "wav"; + } + if (ext === ".mp3" || ext === ".m4a") { + return "mp3"; + } + return null; +} + +function getFileExt(format: string): string { + if (format === "opus") { + return ".opus"; + } + if (format === "wav") { + return ".wav"; + } + return ".mp3"; +} + +async function runCli(params: { + command: string; + args: string[]; + cwd?: string; + env?: Record; + timeoutMs: number; + text: string; + outputDir: string; + filePrefix: string; + outputFormat?: OutputFormat; +}): Promise<{ buffer: Buffer; actualFormat: "mp3" | "opus" | "wav"; audioPath?: string }> { + const cleanText = stripEmojis(params.text); + if (!cleanText) { + throw new Error("CLI TTS: text is empty after removing emojis"); + } + + const outputExt = getFileExt(params.outputFormat ?? "wav"); + const ctx: Record = { + Text: cleanText, + OutputPath: path.join(params.outputDir, `${params.filePrefix}${outputExt}`), + OutputDir: params.outputDir, + OutputBase: params.filePrefix, + }; + + const { cmd, initialArgs } = parseCommand(params.command); + if (!cmd) { + throw new Error("CLI TTS: invalid command"); + } + + const baseArgs = [...initialArgs, ...params.args]; + const args = baseArgs.map((a) => applyTemplate(a, ctx)); + + return new Promise((resolve, reject) => { + let timedOut = false; + const timer = setTimeout(() => { + timedOut = true; + proc.kill(); + // Escalate to SIGKILL if child ignores SIGTERM + setTimeout(() => proc.kill("SIGKILL"), 5000).unref(); + }, params.timeoutMs); + + const env = params.env ? 
{ ...process.env, ...params.env } : process.env; + const proc = spawn(cmd, args, { cwd: params.cwd, env, stdio: ["pipe", "pipe", "pipe"] }); + + const stdoutChunks: Buffer[] = []; + const stderrChunks: Buffer[] = []; + proc.stdout.on("data", (c) => stdoutChunks.push(c)); + proc.stderr.on("data", (c) => stderrChunks.push(c)); + + proc.on("error", (e) => { + clearTimeout(timer); + reject(new Error(`CLI TTS failed: ${e.message}`)); + }); + + proc.on("close", (code) => { + clearTimeout(timer); + if (timedOut) { + return reject(new Error(`CLI TTS timed out after ${params.timeoutMs}ms`)); + } + if (code !== 0) { + const stderr = Buffer.concat(stderrChunks).toString("utf8"); + return reject(new Error(`CLI TTS exit ${code}: ${stderr}`)); + } + + const audioFile = findAudioFile(params.outputDir, params.filePrefix); + if (audioFile) { + if (!existsSync(audioFile)) { + return reject(new Error(`CLI TTS: output file not found at ${audioFile}`)); + } + const format = detectFormat(audioFile); + if (!format) { + return reject(new Error(`CLI TTS: unknown format for ${audioFile}`)); + } + return resolve({ + buffer: readFileSync(audioFile), + actualFormat: format, + audioPath: audioFile, + }); + } + + const stdout = Buffer.concat(stdoutChunks); + if (stdout.length > 0) { + // Assume WAV for stdout output; could be MP3 but caller should convert if needed + return resolve({ buffer: stdout, actualFormat: "wav" }); + } + reject(new Error("CLI TTS produced no output")); + }); + + proc.stdin?.on("error", () => {}); // suppress EPIPE if child ignores stdin + if (!baseArgs.some((a) => /{{\s*text\s*}}/i.test(a))) { + proc.stdin?.write(cleanText); + } + proc.stdin?.end(); + }); +} + +async function convertAudio( + inputPath: string, + outputDir: string, + target: OutputFormat, +): Promise { + const outputPath = path.join(outputDir, `converted${getFileExt(target)}`); + const args = ["-y", "-i", inputPath]; + if (target === "opus") { + args.push("-c:a", "libopus", "-b:a", "64k", outputPath); + } 
else if (target === "wav") { + args.push("-c:a", "pcm_s16le", outputPath); + } else { + args.push("-c:a", "libmp3lame", "-b:a", "128k", outputPath); + } + await runFfmpeg(args); + return readFileSync(outputPath); +} + +async function convertToRawPcm(inputPath: string, outputDir: string): Promise { + // Output raw 16kHz mono 16-bit little-endian PCM (no WAV headers) + const outputPath = path.join(outputDir, "telephony.pcm"); + await runFfmpeg([ + "-y", + "-i", + inputPath, + "-c:a", + "pcm_s16le", + "-ar", + "16000", + "-ac", + "1", + "-f", + "s16le", + outputPath, + ]); + return readFileSync(outputPath); +} + +export function buildCliSpeechProvider(): SpeechProviderPlugin { + return { + id: "tts-local-cli", + aliases: ["cli"], + label: "Local CLI", + autoSelectOrder: 1000, + + resolveConfig(ctx): SpeechProviderConfig { + return resolveCliProviderConfig(ctx.rawConfig); + }, + + isConfigured(ctx): boolean { + return getConfig(ctx.providerConfig) !== null; + }, + + async synthesize(req: SpeechSynthesisRequest) { + const config = getConfig(req.providerConfig); + if (!config) { + throw new Error("CLI TTS not configured"); + } + + log.debug(`synthesize: text=${req.text.slice(0, 50)}...`); + + const tempDir = mkdtempSync(path.join(resolvePreferredOpenClawTmpDir(), "openclaw-cli-tts-")); + + try { + const result = await runCli({ + command: config.command, + args: config.args ?? [], + cwd: config.cwd, + env: config.env, + timeoutMs: config.timeoutMs ?? DEFAULT_TIMEOUT_MS, + text: req.text, + outputDir: tempDir, + filePrefix: "speech", + outputFormat: config.outputFormat, + }); + + log.debug(`synthesize: format=${result.actualFormat}, size=${result.buffer.length}`); + + let buffer: Buffer; + let format: OutputFormat; + + if (req.target === "voice-note") { + if (result.actualFormat !== "opus") { + const inputFile = + result.audioPath ?? 
path.join(tempDir, `input${getFileExt(result.actualFormat)}`); + if (!result.audioPath) { + writeFileSync(inputFile, result.buffer); + } + buffer = await convertAudio(inputFile, tempDir, "opus"); + format = "opus"; + } else { + buffer = result.buffer; + format = "opus"; + } + } else { + const desired = config.outputFormat ?? "mp3"; + if (result.actualFormat !== desired) { + const inputFile = + result.audioPath ?? path.join(tempDir, `input${getFileExt(result.actualFormat)}`); + if (!result.audioPath) { + writeFileSync(inputFile, result.buffer); + } + buffer = await convertAudio(inputFile, tempDir, desired); + format = desired; + } else { + buffer = result.buffer; + format = result.actualFormat; + } + } + + const fileExtension = format === "opus" ? ".ogg" : `.${format}`; + return { + audioBuffer: buffer, + outputFormat: format, + fileExtension, + voiceCompatible: req.target === "voice-note" && format === "opus", + }; + } finally { + try { + rmSync(tempDir, { recursive: true, force: true }); + } catch {} + } + }, + + async synthesizeTelephony(req: SpeechTelephonySynthesisRequest) { + const config = getConfig(req.providerConfig); + if (!config) { + throw new Error("CLI TTS not configured"); + } + + log.debug(`synthesizeTelephony: text=${req.text.slice(0, 50)}...`); + + const tempDir = mkdtempSync(path.join(resolvePreferredOpenClawTmpDir(), "openclaw-cli-tts-")); + + try { + const result = await runCli({ + command: config.command, + args: config.args ?? [], + cwd: config.cwd, + env: config.env, + timeoutMs: config.timeoutMs ?? DEFAULT_TIMEOUT_MS, + text: req.text, + outputDir: tempDir, + filePrefix: "telephony", + outputFormat: config.outputFormat, + }); + + const inputFile = + result.audioPath ?? 
path.join(tempDir, `input${getFileExt(result.actualFormat)}`); + if (!result.audioPath) { + writeFileSync(inputFile, result.buffer); + } + + // Convert to raw 16kHz mono PCM for telephony (no WAV headers) + const pcmBuffer = await convertToRawPcm(inputFile, tempDir); + + return { + audioBuffer: pcmBuffer, + outputFormat: "pcm", + sampleRate: 16000, + }; + } finally { + try { + rmSync(tempDir, { recursive: true, force: true }); + } catch {} + } + }, + }; +} diff --git a/extensions/whatsapp/src/auto-reply/deliver-reply.test.ts b/extensions/whatsapp/src/auto-reply/deliver-reply.test.ts index 36a13745206..9124ba1869e 100644 --- a/extensions/whatsapp/src/auto-reply/deliver-reply.test.ts +++ b/extensions/whatsapp/src/auto-reply/deliver-reply.test.ts @@ -1,3 +1,4 @@ +import fsSync from "node:fs"; import { logVerbose } from "openclaw/plugin-sdk/runtime-env"; import { sleep } from "openclaw/plugin-sdk/text-runtime"; import { beforeAll, describe, expect, it, vi } from "vitest"; @@ -5,6 +6,20 @@ import { loadWebMedia } from "../media.js"; import { cacheInboundMessageMeta } from "../quoted-message.js"; import type { WebInboundMsg } from "./types.js"; +const hoisted = vi.hoisted(() => ({ + runFfmpeg: vi.fn(), +})); + +vi.mock("openclaw/plugin-sdk/media-runtime", async () => { + const actual = await vi.importActual( + "openclaw/plugin-sdk/media-runtime", + ); + return { + ...actual, + runFfmpeg: hoisted.runFfmpeg, + }; +}); + vi.mock("openclaw/plugin-sdk/runtime-env", async () => { const actual = await vi.importActual( "openclaw/plugin-sdk/runtime-env", @@ -546,6 +561,45 @@ describe("deliverWebReply", () => { ); }); + it("transcodes mp3 audio media before sending a ptt voice note", async () => { + vi.clearAllMocks(); + hoisted.runFfmpeg.mockImplementation(async (args: string[]) => { + fsSync.writeFileSync(args.at(-1) ?? 
"", Buffer.from("opus-output")); + return ""; + }); + const msg = makeMsg(); + ( + loadWebMedia as unknown as { mockResolvedValueOnce: (v: unknown) => void } + ).mockResolvedValueOnce({ + buffer: Buffer.from("mp3"), + contentType: "audio/mpeg", + kind: "audio", + fileName: "voice.mp3", + }); + + await deliverWebReply({ + replyResult: { text: "cap", mediaUrl: "http://example.com/a.mp3" }, + msg, + maxMediaBytes: 1024 * 1024, + textLimit: 200, + replyLogger, + skipLog: true, + }); + + expect(hoisted.runFfmpeg).toHaveBeenCalledWith( + expect.arrayContaining(["-c:a", "libopus", "-ar", "48000", "-b:a", "64k"]), + ); + expect(msg.sendMedia).toHaveBeenCalledWith( + expect.objectContaining({ + audio: Buffer.from("opus-output"), + ptt: true, + mimetype: "audio/ogg; codecs=opus", + caption: "cap", + }), + undefined, + ); + }); + it("sends video media", async () => { const msg = makeMsg(); ( diff --git a/extensions/whatsapp/src/auto-reply/deliver-reply.ts b/extensions/whatsapp/src/auto-reply/deliver-reply.ts index 59d671ce183..cfd16737df4 100644 --- a/extensions/whatsapp/src/auto-reply/deliver-reply.ts +++ b/extensions/whatsapp/src/auto-reply/deliver-reply.ts @@ -8,9 +8,9 @@ import { import { logVerbose, shouldLogVerbose } from "openclaw/plugin-sdk/runtime-env"; import { loadWebMedia } from "../media.js"; import { - normalizeWhatsAppLoadedMedia, normalizeWhatsAppOutboundPayload, normalizeWhatsAppPayloadTextPreservingIndentation, + prepareWhatsAppOutboundMedia, sendWhatsAppOutboundWithRetry, } from "../outbound-media-contract.js"; import { buildQuotedMessageOptions, lookupInboundMessageMeta } from "../quoted-message.js"; @@ -120,7 +120,7 @@ export async function deliverWebReply(params: { mediaUrls: mediaList, caption: leadingCaption, send: async ({ mediaUrl, caption }) => { - const media = normalizeWhatsAppLoadedMedia( + const media = await prepareWhatsAppOutboundMedia( await loadWebMedia(mediaUrl, { maxBytes: maxMediaBytes, localRoots: params.mediaLocalRoots, diff --git 
a/extensions/whatsapp/src/outbound-media-contract.ts b/extensions/whatsapp/src/outbound-media-contract.ts
index 8ec45c7ec4b..e341e9d924a 100644
--- a/extensions/whatsapp/src/outbound-media-contract.ts
+++ b/extensions/whatsapp/src/outbound-media-contract.ts
@@ -1,4 +1,7 @@
+import fs from "node:fs/promises";
 import path from "node:path";
+import { MEDIA_FFMPEG_MAX_AUDIO_DURATION_SECS, runFfmpeg } from "openclaw/plugin-sdk/media-runtime";
+import { resolvePreferredOpenClawTmpDir } from "openclaw/plugin-sdk/temp-path";
 import { formatError } from "./session-errors.js";
 import { sleep } from "./text-runtime.js";
@@ -22,6 +25,11 @@ export type CanonicalWhatsAppLoadedMedia = {
   fileName?: string;
 };
 
+const WHATSAPP_VOICE_FILE_NAME = "voice.ogg";
+const WHATSAPP_VOICE_SAMPLE_RATE_HZ = 48_000;
+const WHATSAPP_VOICE_BITRATE = "64k";
+const WHATSAPP_VOICE_MIMETYPE = "audio/ogg; codecs=opus";
+
 export function normalizeWhatsAppPayloadText(text: string | undefined): string {
   return text?.trimStart() ?? "";
 }
@@ -75,8 +83,8 @@ export function normalizeWhatsAppLoadedMedia(
     ? media.kind
     : "document";
   const mimetype =
-    kind === "audio" && media.contentType === "audio/ogg"
-      ? "audio/ogg; codecs=opus"
+    kind === "audio" && isWhatsAppNativeVoiceAudio({ contentType: media.contentType, mediaUrl })
+      ? WHATSAPP_VOICE_MIMETYPE
       : (media.contentType ?? "application/octet-stream");
   const fileName =
     kind === "document"
@@ -90,6 +98,94 @@
   };
 }
 
+export async function prepareWhatsAppOutboundMedia(
+  media: WhatsAppLoadedMediaLike,
+  mediaUrl?: string,
+): Promise<CanonicalWhatsAppLoadedMedia> {
+  const normalized = normalizeWhatsAppLoadedMedia(media, mediaUrl);
+  if (normalized.kind !== "audio") {
+    return normalized;
+  }
+  if (
+    isWhatsAppNativeVoiceAudio({
+      contentType: media.contentType,
+      fileName: media.fileName,
+      mediaUrl,
+    })
+  ) {
+    return normalized;
+  }
+
+  const buffer = await transcodeToWhatsAppVoiceOpus({
+    buffer: media.buffer,
+    fileName: media.fileName ?? 
deriveWhatsAppDocumentFileName(mediaUrl) ?? "audio",
+  });
+  return {
+    buffer,
+    kind: "audio",
+    mimetype: WHATSAPP_VOICE_MIMETYPE,
+  };
+}
+
+function normalizeContentType(value: string | undefined): string {
+  return value?.split(";", 1)[0]?.trim().toLowerCase() ?? "";
+}
+
+function isWhatsAppNativeVoiceAudio(params: {
+  contentType?: string;
+  fileName?: string;
+  mediaUrl?: string;
+}): boolean {
+  const contentType = normalizeContentType(params.contentType);
+  if (contentType === "audio/ogg" || contentType === "audio/opus") {
+    return true;
+  }
+  const fileName = params.fileName ?? deriveWhatsAppDocumentFileName(params.mediaUrl) ?? "";
+  const ext = path.extname(fileName).toLowerCase();
+  return ext === ".ogg" || ext === ".opus";
+}
+
+async function transcodeToWhatsAppVoiceOpus(params: {
+  buffer: Buffer;
+  fileName: string;
+}): Promise<Buffer> {
+  const tempRoot = resolvePreferredOpenClawTmpDir();
+  await fs.mkdir(tempRoot, { recursive: true, mode: 0o700 });
+  const tempDir = await fs.mkdtemp(path.join(tempRoot, "whatsapp-voice-"));
+  try {
+    const ext = path.extname(params.fileName).toLowerCase();
+    const inputExt = ext && ext.length <= 12 ? 
ext : ".audio"; + const inputPath = path.join(tempDir, `input${inputExt}`); + const outputPath = path.join(tempDir, WHATSAPP_VOICE_FILE_NAME); + await fs.writeFile(inputPath, params.buffer, { mode: 0o600 }); + await runFfmpeg([ + "-hide_banner", + "-loglevel", + "error", + "-y", + "-i", + inputPath, + "-vn", + "-sn", + "-dn", + "-t", + String(MEDIA_FFMPEG_MAX_AUDIO_DURATION_SECS), + "-ar", + String(WHATSAPP_VOICE_SAMPLE_RATE_HZ), + "-ac", + "1", + "-c:a", + "libopus", + "-b:a", + WHATSAPP_VOICE_BITRATE, + outputPath, + ]); + return await fs.readFile(outputPath); + } finally { + await fs.rm(tempDir, { recursive: true, force: true }); + } +} + function deriveWhatsAppDocumentFileName(mediaUrl: string | undefined): string | undefined { if (!mediaUrl) { return undefined; diff --git a/extensions/whatsapp/src/send.test.ts b/extensions/whatsapp/src/send.test.ts index fcccf8ce55a..9d96bd9860d 100644 --- a/extensions/whatsapp/src/send.test.ts +++ b/extensions/whatsapp/src/send.test.ts @@ -10,6 +10,7 @@ import type { ActiveWebListener } from "./inbound/types.js"; const hoisted = vi.hoisted(() => ({ loadOutboundMediaFromUrl: vi.fn(), controllerListeners: new Map(), + runFfmpeg: vi.fn(), })); const loadWebMediaMock = vi.fn(); let sendMessageWhatsApp: typeof import("./send.js").sendMessageWhatsApp; @@ -49,6 +50,16 @@ vi.mock("./outbound-media.runtime.js", async () => { }; }); +vi.mock("openclaw/plugin-sdk/media-runtime", async () => { + const actual = await vi.importActual( + "openclaw/plugin-sdk/media-runtime", + ); + return { + ...actual, + runFfmpeg: hoisted.runFfmpeg, + }; +}); + vi.mock("./text-runtime.js", async () => { const actual = await vi.importActual("./text-runtime.js"); return { @@ -70,6 +81,10 @@ describe("web outbound", () => { beforeEach(() => { vi.clearAllMocks(); + hoisted.runFfmpeg.mockReset().mockImplementation(async (args: string[]) => { + fsSync.writeFileSync(args.at(-1) ?? 
"", Buffer.from("opus-output")); + return ""; + }); hoisted.loadOutboundMediaFromUrl.mockReset().mockImplementation( async ( mediaUrl: string, @@ -238,6 +253,35 @@ describe("web outbound", () => { ); }); + it.each([ + { name: "mp3", contentType: "audio/mpeg", fileName: "voice.mp3" }, + { name: "webm", contentType: "audio/webm", fileName: "voice.webm" }, + ])("transcodes $name audio to Ogg Opus before sending a PTT voice note", async (media) => { + const buf = Buffer.from(media.name); + loadWebMediaMock.mockResolvedValueOnce({ + buffer: buf, + contentType: media.contentType, + kind: "audio", + fileName: media.fileName, + }); + + await sendMessageWhatsApp("+1555", "voice note", { + verbose: false, + cfg: WHATSAPP_TEST_CFG, + mediaUrl: `/tmp/${media.fileName}`, + }); + + expect(hoisted.runFfmpeg).toHaveBeenCalledWith( + expect.arrayContaining(["-c:a", "libopus", "-ar", "48000", "-b:a", "64k"]), + ); + expect(sendMessage).toHaveBeenLastCalledWith( + "+1555", + "voice note", + Buffer.from("opus-output"), + "audio/ogg; codecs=opus", + ); + }); + it("maps video with caption", async () => { const buf = Buffer.from("video"); loadWebMediaMock.mockResolvedValueOnce({ diff --git a/extensions/whatsapp/src/send.ts b/extensions/whatsapp/src/send.ts index 6e756950f67..4a54c82a6b1 100644 --- a/extensions/whatsapp/src/send.ts +++ b/extensions/whatsapp/src/send.ts @@ -16,8 +16,8 @@ import { import { getRegisteredWhatsAppConnectionController } from "./connection-controller-registry.js"; import type { ActiveWebListener, ActiveWebSendOptions } from "./inbound/types.js"; import { - normalizeWhatsAppLoadedMedia, normalizeWhatsAppPayloadText, + prepareWhatsAppOutboundMedia, resolveWhatsAppOutboundMediaUrls, } from "./outbound-media-contract.js"; import { loadOutboundMediaFromUrl } from "./outbound-media.runtime.js"; @@ -116,7 +116,7 @@ export async function sendMessageWhatsApp( let mediaType: string | undefined; let documentFileName: string | undefined; if (primaryMediaUrl) { - const media 
= normalizeWhatsAppLoadedMedia( + const media = await prepareWhatsAppOutboundMedia( await loadOutboundMediaFromUrl(primaryMediaUrl, { maxBytes: resolveWhatsAppMediaMaxBytes(account), mediaAccess: options.mediaAccess, diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index ad00dd55e03..074e5e0c7b5 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -1131,6 +1131,12 @@ importers: specifier: workspace:* version: link:../../packages/plugin-sdk + extensions/senseaudio: + devDependencies: + '@openclaw/plugin-sdk': + specifier: workspace:* + version: link:../../packages/plugin-sdk + extensions/sglang: devDependencies: '@openclaw/plugin-sdk': @@ -1276,6 +1282,12 @@ importers: specifier: workspace:* version: link:../../packages/plugin-sdk + extensions/tts-local-cli: + devDependencies: + '@openclaw/plugin-sdk': + specifier: workspace:* + version: link:../../packages/plugin-sdk + extensions/twitch: dependencies: '@twurple/api': @@ -2245,89 +2257,105 @@ packages: resolution: {integrity: sha512-excjX8DfsIcJ10x1Kzr4RcWe1edC9PquDRRPx3YVCvQv+U5p7Yin2s32ftzikXojb1PIFc/9Mt28/y+iRklkrw==} cpu: [arm64] os: [linux] + libc: [glibc] '@img/sharp-libvips-linux-arm@1.2.4': resolution: {integrity: sha512-bFI7xcKFELdiNCVov8e44Ia4u2byA+l3XtsAj+Q8tfCwO6BQ8iDojYdvoPMqsKDkuoOo+X6HZA0s0q11ANMQ8A==} cpu: [arm] os: [linux] + libc: [glibc] '@img/sharp-libvips-linux-ppc64@1.2.4': resolution: {integrity: sha512-FMuvGijLDYG6lW+b/UvyilUWu5Ayu+3r2d1S8notiGCIyYU/76eig1UfMmkZ7vwgOrzKzlQbFSuQfgm7GYUPpA==} cpu: [ppc64] os: [linux] + libc: [glibc] '@img/sharp-libvips-linux-riscv64@1.2.4': resolution: {integrity: sha512-oVDbcR4zUC0ce82teubSm+x6ETixtKZBh/qbREIOcI3cULzDyb18Sr/Wcyx7NRQeQzOiHTNbZFF1UwPS2scyGA==} cpu: [riscv64] os: [linux] + libc: [glibc] '@img/sharp-libvips-linux-s390x@1.2.4': resolution: {integrity: sha512-qmp9VrzgPgMoGZyPvrQHqk02uyjA0/QrTO26Tqk6l4ZV0MPWIW6LTkqOIov+J1yEu7MbFQaDpwdwJKhbJvuRxQ==} cpu: [s390x] os: [linux] + libc: [glibc] '@img/sharp-libvips-linux-x64@1.2.4': resolution: {integrity: 
sha512-tJxiiLsmHc9Ax1bz3oaOYBURTXGIRDODBqhveVHonrHJ9/+k89qbLl0bcJns+e4t4rvaNBxaEZsFtSfAdquPrw==} cpu: [x64] os: [linux] + libc: [glibc] '@img/sharp-libvips-linuxmusl-arm64@1.2.4': resolution: {integrity: sha512-FVQHuwx1IIuNow9QAbYUzJ+En8KcVm9Lk5+uGUQJHaZmMECZmOlix9HnH7n1TRkXMS0pGxIJokIVB9SuqZGGXw==} cpu: [arm64] os: [linux] + libc: [musl] '@img/sharp-libvips-linuxmusl-x64@1.2.4': resolution: {integrity: sha512-+LpyBk7L44ZIXwz/VYfglaX/okxezESc6UxDSoyo2Ks6Jxc4Y7sGjpgU9s4PMgqgjj1gZCylTieNamqA1MF7Dg==} cpu: [x64] os: [linux] + libc: [musl] '@img/sharp-linux-arm64@0.34.5': resolution: {integrity: sha512-bKQzaJRY/bkPOXyKx5EVup7qkaojECG6NLYswgktOZjaXecSAeCWiZwwiFf3/Y+O1HrauiE3FVsGxFg8c24rZg==} engines: {node: ^18.17.0 || ^20.3.0 || >=21.0.0} cpu: [arm64] os: [linux] + libc: [glibc] '@img/sharp-linux-arm@0.34.5': resolution: {integrity: sha512-9dLqsvwtg1uuXBGZKsxem9595+ujv0sJ6Vi8wcTANSFpwV/GONat5eCkzQo/1O6zRIkh0m/8+5BjrRr7jDUSZw==} engines: {node: ^18.17.0 || ^20.3.0 || >=21.0.0} cpu: [arm] os: [linux] + libc: [glibc] '@img/sharp-linux-ppc64@0.34.5': resolution: {integrity: sha512-7zznwNaqW6YtsfrGGDA6BRkISKAAE1Jo0QdpNYXNMHu2+0dTrPflTLNkpc8l7MUP5M16ZJcUvysVWWrMefZquA==} engines: {node: ^18.17.0 || ^20.3.0 || >=21.0.0} cpu: [ppc64] os: [linux] + libc: [glibc] '@img/sharp-linux-riscv64@0.34.5': resolution: {integrity: sha512-51gJuLPTKa7piYPaVs8GmByo7/U7/7TZOq+cnXJIHZKavIRHAP77e3N2HEl3dgiqdD/w0yUfiJnII77PuDDFdw==} engines: {node: ^18.17.0 || ^20.3.0 || >=21.0.0} cpu: [riscv64] os: [linux] + libc: [glibc] '@img/sharp-linux-s390x@0.34.5': resolution: {integrity: sha512-nQtCk0PdKfho3eC5MrbQoigJ2gd1CgddUMkabUj+rBevs8tZ2cULOx46E7oyX+04WGfABgIwmMC0VqieTiR4jg==} engines: {node: ^18.17.0 || ^20.3.0 || >=21.0.0} cpu: [s390x] os: [linux] + libc: [glibc] '@img/sharp-linux-x64@0.34.5': resolution: {integrity: sha512-MEzd8HPKxVxVenwAa+JRPwEC7QFjoPWuS5NZnBt6B3pu7EG2Ge0id1oLHZpPJdn3OQK+BQDiw9zStiHBTJQQQQ==} engines: {node: ^18.17.0 || ^20.3.0 || >=21.0.0} cpu: [x64] os: [linux] + libc: 
[glibc] '@img/sharp-linuxmusl-arm64@0.34.5': resolution: {integrity: sha512-fprJR6GtRsMt6Kyfq44IsChVZeGN97gTD331weR1ex1c1rypDEABN6Tm2xa1wE6lYb5DdEnk03NZPqA7Id21yg==} engines: {node: ^18.17.0 || ^20.3.0 || >=21.0.0} cpu: [arm64] os: [linux] + libc: [musl] '@img/sharp-linuxmusl-x64@0.34.5': resolution: {integrity: sha512-Jg8wNT1MUzIvhBFxViqrEhWDGzqymo3sV7z7ZsaWbZNDLXRJZoRGrjulp60YYtV4wfY8VIKcWidjojlLcWrd8Q==} engines: {node: ^18.17.0 || ^20.3.0 || >=21.0.0} cpu: [x64] os: [linux] + libc: [musl] '@img/sharp-wasm32@0.34.5': resolution: {integrity: sha512-OdWTEiVkY2PHwqkbBI8frFxQQFekHaSSkUIJkwzclWZe64O1X4UlUjqqqLaPbUpMOQk6FBu/HtlGXNblIs0huw==} @@ -2519,24 +2547,28 @@ packages: engines: {node: '>= 18'} cpu: [arm64] os: [linux] + libc: [glibc] '@lancedb/lancedb-linux-arm64-musl@0.27.2': resolution: {integrity: sha512-bK5Mc50EvwGZaaiym5CoPu8Y4GNSyEEvTQ0dTC2AUIm83qdQu1rGw6kkYtc/rTH/hbvAvPQot4agHDZfMVxfYw==} engines: {node: '>= 18'} cpu: [arm64] os: [linux] + libc: [musl] '@lancedb/lancedb-linux-x64-gnu@0.27.2': resolution: {integrity: sha512-qe+ML0YmPru0o84f33RBHqoNk6zsHBjiXTLKsEBDiiFYKks/XMsrkKy9NQYcTxShBrg/nx/MLzCzd7dihqgNYw==} engines: {node: '>= 18'} cpu: [x64] os: [linux] + libc: [glibc] '@lancedb/lancedb-linux-x64-musl@0.27.2': resolution: {integrity: sha512-ZpX6Oxn06qvzAdm+D/gNb3SRp/A9lgRAPvPg6nnMmSQk5XamC/hbGO07uK1wwop7nlqXUH/thk4is2y2ieWdTw==} engines: {node: '>= 18'} cpu: [x64] os: [linux] + libc: [musl] '@lancedb/lancedb-win32-arm64-msvc@0.27.2': resolution: {integrity: sha512-4ffpFvh49MiUtkdFJOmBytXEbgUPXORphTOuExnJAgT1VAKwQcu4ZzdsgNoK6mumKBaU+pYQU/MedNkgTzx/Lw==} @@ -2632,30 +2664,35 @@ packages: engines: {node: '>= 10'} cpu: [arm64] os: [linux] + libc: [glibc] '@mariozechner/clipboard-linux-arm64-musl@0.3.3': resolution: {integrity: sha512-o1paj2+zmAQ/LaPS85XJCxhNowNQpxYM2cGY6pWvB5Kqmz6hZjl6CzDg5tbf1hZkn/Em6jpOaE2UtMxKdELBDA==} engines: {node: '>= 10'} cpu: [arm64] os: [linux] + libc: [musl] '@mariozechner/clipboard-linux-riscv64-gnu@0.3.3': resolution: 
{integrity: sha512-dkEhE4ekePJwMbBq9HP1//CFMNmDzA/iV9AXqBfvL5CWmmDIRXqh4A3YZt3tWO/HdMerX+xNCEiR7WiOsIG+UA==} engines: {node: '>= 10'} cpu: [riscv64] os: [linux] + libc: [glibc] '@mariozechner/clipboard-linux-x64-gnu@0.3.3': resolution: {integrity: sha512-lT2yANtTLlEtFBIH3uGoRa/CQas/eBoLNi3qr9axQFoRgF4RGPSJ66yHOSnMECBneTIb1Iqv3UxokTfX27CdoQ==} engines: {node: '>= 10'} cpu: [x64] os: [linux] + libc: [glibc] '@mariozechner/clipboard-linux-x64-musl@0.3.3': resolution: {integrity: sha512-saq/MCB0QHK/7ZZLjAZ0QkbY944dyjOsur8gneGCfMitt+GOiE1CU4OUipHC4b6x8UDY9bRLsR4aBaxu22OFPA==} engines: {node: '>= 10'} cpu: [x64] os: [linux] + libc: [musl] '@mariozechner/clipboard-win32-arm64-msvc@0.3.3': resolution: {integrity: sha512-cGuvSj0/2X2w983yEcKw+i+r1EBej6ZZIN+fXG3eY2G/HaIQpbXpLvMxKyZ9LKtbZx+Z6q/gELEoSBMLML6BaQ==} @@ -2772,30 +2809,35 @@ packages: engines: {node: '>= 10'} cpu: [arm64] os: [linux] + libc: [glibc] '@napi-rs/canvas-linux-arm64-musl@0.1.99': resolution: {integrity: sha512-Z+6nyLdJXWzLPVxi4H6g9TJop4DwN3KSgHWto5JCbZV5/uKoVqcSynPs0tGlUHOoWI8S8tEvJspz51GQkvr07w==} engines: {node: '>= 10'} cpu: [arm64] os: [linux] + libc: [musl] '@napi-rs/canvas-linux-riscv64-gnu@0.1.99': resolution: {integrity: sha512-jAnfOUv4IO1l8Levk5t85oVtEBOXLa07KnIUgWo1CDlPxiqpxS3uBfiE38Lvj/CQgHaNF6Nxk/SaemwLgsVJgw==} engines: {node: '>= 10'} cpu: [riscv64] os: [linux] + libc: [glibc] '@napi-rs/canvas-linux-x64-gnu@0.1.99': resolution: {integrity: sha512-mIkXw3fGmbYyFjSmfWEvty4jN+rwEOmv0+Dy9bRvvTzLYWCgm3RMgUEQVfAKFw96nIRFnyNZiK83KNQaVVFjng==} engines: {node: '>= 10'} cpu: [x64] os: [linux] + libc: [glibc] '@napi-rs/canvas-linux-x64-musl@0.1.99': resolution: {integrity: sha512-f3Uz2P0RgrtBHISxZqr6yiYXJlTDyCVBumDacxo+4AmSg7z0HiqYZKGWC/gszq3fbPhyQUya1W2AEteKxT9Y6A==} engines: {node: '>= 10'} cpu: [x64] os: [linux] + libc: [musl] '@napi-rs/canvas-win32-arm64-msvc@0.1.99': resolution: {integrity: sha512-XE6KUkfqRsCNejcoRMiMr3RaUeObxNf6y7dut3hrq2rn7PzfRTZgrjF1F/B2C7FcdgqY/vSHWpQeMuNz1vTNHg==} @@ -3074,48 
+3116,56 @@ packages: engines: {node: ^20.19.0 || >=22.12.0} cpu: [arm64] os: [linux] + libc: [glibc] '@oxfmt/binding-linux-arm64-musl@0.46.0': resolution: {integrity: sha512-aAUPBWJ1lGwwnxZUEDLJ94+Iy6MuwJwPxUgO4sCA5mEEyDk7b+cDQ+JpX1VR150Zoyd+D49gsrUzpUK5h587Eg==} engines: {node: ^20.19.0 || >=22.12.0} cpu: [arm64] os: [linux] + libc: [musl] '@oxfmt/binding-linux-ppc64-gnu@0.46.0': resolution: {integrity: sha512-ufBCJukyFX/UDrokP/r6BGDoTInnsDs7bxyzKAgMiZlt2Qu8GPJSJ6Zm6whIiJzKk0naxA8ilwmbO1LMw6Htxw==} engines: {node: ^20.19.0 || >=22.12.0} cpu: [ppc64] os: [linux] + libc: [glibc] '@oxfmt/binding-linux-riscv64-gnu@0.46.0': resolution: {integrity: sha512-eqtlC2YmPqjun76R1gVfGLuKWx7NuEnLEAudZ7n6ipSKbCZTqIKSs1b5Y8K/JHZsRpLkeSmAAjig5HOIg8fQzQ==} engines: {node: ^20.19.0 || >=22.12.0} cpu: [riscv64] os: [linux] + libc: [glibc] '@oxfmt/binding-linux-riscv64-musl@0.46.0': resolution: {integrity: sha512-yccVOO2nMXkQLGgy0He3EQEwKD7NF0zEk+/OWmroznkqXyJdN6bfK0LtNnr6/14Bh3FjpYq7bP33l/VloCnxpA==} engines: {node: ^20.19.0 || >=22.12.0} cpu: [riscv64] os: [linux] + libc: [musl] '@oxfmt/binding-linux-s390x-gnu@0.46.0': resolution: {integrity: sha512-aAf7fG23OQCey6VRPj9IeCraoYtpgtx0ZyJ1CXkPyT1wjzBE7c3xtuxHe/AdHaJfVVb/SXpSk8Gl1LzyQupSqw==} engines: {node: ^20.19.0 || >=22.12.0} cpu: [s390x] os: [linux] + libc: [glibc] '@oxfmt/binding-linux-x64-gnu@0.46.0': resolution: {integrity: sha512-q0JPsTMyJNjYrBvYFDz4WbVsafNZaPCZv4RnFypRotLqpKROtBZcEaXQW4eb9YmvLU3NckVemLJnzkSZSdmOxw==} engines: {node: ^20.19.0 || >=22.12.0} cpu: [x64] os: [linux] + libc: [glibc] '@oxfmt/binding-linux-x64-musl@0.46.0': resolution: {integrity: sha512-7LsLY9Cw57GPkhSR+duI3mt9baRczK/DtHYSldQ4BEU92da9igBQNl4z7Vq5U9NNPsh1FmpKvv1q9WDtiUQR1A==} engines: {node: ^20.19.0 || >=22.12.0} cpu: [x64] os: [linux] + libc: [musl] '@oxfmt/binding-openharmony-arm64@0.46.0': resolution: {integrity: sha512-lHiBOz8Duaku7JtRNLlps3j++eOaICPZSd8FCVmTDM4DFOPT71Bjn7g6iar1z7StXlKRweUKxWUs4sA+zWGDXg==} @@ -3218,48 +3268,56 @@ packages: 
engines: {node: ^20.19.0 || >=22.12.0} cpu: [arm64] os: [linux] + libc: [glibc] '@oxlint/binding-linux-arm64-musl@1.61.0': resolution: {integrity: sha512-bl1dQh8LnVqsj6oOQAcxwbuOmNJkwc4p6o//HTBZhNTzJy21TLDwAviMqUFNUxDHkPGpmdKTSN4tWTjLryP8xg==} engines: {node: ^20.19.0 || >=22.12.0} cpu: [arm64] os: [linux] + libc: [musl] '@oxlint/binding-linux-ppc64-gnu@1.61.0': resolution: {integrity: sha512-QoOX6KB2IiEpyOj/HKqaxi+NQHPnOgNgnr22n9N4ANJCzXkUlj1UmeAbFb4PpqdlHIzvGDM5xZ0OKtcLq9RhiQ==} engines: {node: ^20.19.0 || >=22.12.0} cpu: [ppc64] os: [linux] + libc: [glibc] '@oxlint/binding-linux-riscv64-gnu@1.61.0': resolution: {integrity: sha512-1TGcTerjY6p152wCof3oKElccq3xHljS/Mucp04gV/4ATpP6nO7YNnp7opEg6SHkv2a57/b4b8Ndm9znJ1/qAw==} engines: {node: ^20.19.0 || >=22.12.0} cpu: [riscv64] os: [linux] + libc: [glibc] '@oxlint/binding-linux-riscv64-musl@1.61.0': resolution: {integrity: sha512-65wXEmZIrX2ADwC8i/qFL4EWLSbeuBpAm3suuX1vu4IQkKd+wLT/HU/BOl84kp91u2SxPkPDyQgu4yrqp8vwVA==} engines: {node: ^20.19.0 || >=22.12.0} cpu: [riscv64] os: [linux] + libc: [musl] '@oxlint/binding-linux-s390x-gnu@1.61.0': resolution: {integrity: sha512-TVvhgMvor7Qa6COeXxCJ7ENOM+lcAOGsQ0iUdPSCv2hxb9qSHLQ4XF1h50S6RE1gBOJ0WV3rNukg4JJJP1LWRA==} engines: {node: ^20.19.0 || >=22.12.0} cpu: [s390x] os: [linux] + libc: [glibc] '@oxlint/binding-linux-x64-gnu@1.61.0': resolution: {integrity: sha512-SjpS5uYuFoDnDdZPwZE59ndF95AsY47R5MliuneTWR1pDm2CxGJaYXbKULI71t5TVfLQUWmrHEGRL9xvuq6dnA==} engines: {node: ^20.19.0 || >=22.12.0} cpu: [x64] os: [linux] + libc: [glibc] '@oxlint/binding-linux-x64-musl@1.61.0': resolution: {integrity: sha512-gGfAeGD4sNJGILZbc/yKcIimO9wQnPMoYp9swAaKeEtwsSQAbU+rsdQze5SBtIP6j0QDzeYd4XSSUCRCF+LIeQ==} engines: {node: ^20.19.0 || >=22.12.0} cpu: [x64] os: [linux] + libc: [musl] '@oxlint/binding-openharmony-arm64@1.61.0': resolution: {integrity: sha512-OlVT0LrG/ct33EVtWRyR+B/othwmDWeRxfi13wUdPeb3lAT5TgTcFDcfLfarZtzB4W1nWF/zICMgYdkggX2WmQ==} @@ -3403,72 +3461,84 @@ packages: engines: {node: 
^20.19.0 || >=22.12.0} cpu: [arm64] os: [linux] + libc: [glibc] '@rolldown/binding-linux-arm64-gnu@1.0.0-rc.17': resolution: {integrity: sha512-e6usGaHKW5BMNZOymS1UcEYGowQMWcgZ71Z17Sl/h2+ZziNJ1a9n3Zvcz6LdRyIW5572wBCTH/Z+bKuZouGk9Q==} engines: {node: ^20.19.0 || >=22.12.0} cpu: [arm64] os: [linux] + libc: [glibc] '@rolldown/binding-linux-arm64-musl@1.0.0-rc.16': resolution: {integrity: sha512-3fPzdREH806oRLxpTWW1Gt4tQHs0TitZFOECB2xzCFLPKnSOy90gwA7P29cksYilFO6XVRY1kzga0cL2nRjKPg==} engines: {node: ^20.19.0 || >=22.12.0} cpu: [arm64] os: [linux] + libc: [musl] '@rolldown/binding-linux-arm64-musl@1.0.0-rc.17': resolution: {integrity: sha512-b/CgbwAJpmrRLp02RPfhbudf5tZnN9nsPWK82znefso832etkem8H7FSZwxrOI9djcdTP7U6YfNhbRnh7djErg==} engines: {node: ^20.19.0 || >=22.12.0} cpu: [arm64] os: [linux] + libc: [musl] '@rolldown/binding-linux-ppc64-gnu@1.0.0-rc.16': resolution: {integrity: sha512-EKwI1tSrLs7YVw+JPJT/G2dJQ1jl9qlTTTEG0V2Ok/RdOenRfBw2PQdLPyjhIu58ocdBfP7vIRN/pvMsPxs/AQ==} engines: {node: ^20.19.0 || >=22.12.0} cpu: [ppc64] os: [linux] + libc: [glibc] '@rolldown/binding-linux-ppc64-gnu@1.0.0-rc.17': resolution: {integrity: sha512-4EII1iNGRUN5WwGbF/kOh/EIkoDN9HsupgLQoXfY+D1oyJm7/F4t5PYU5n8SWZgG0FEwakyM8pGgwcBYruGTlA==} engines: {node: ^20.19.0 || >=22.12.0} cpu: [ppc64] os: [linux] + libc: [glibc] '@rolldown/binding-linux-s390x-gnu@1.0.0-rc.16': resolution: {integrity: sha512-Uknladnb3Sxqu6SEcqBldQyJUpk8NleooZEc0MbRBJ4inEhRYWZX0NJu12vNf2mqAq7gsofAxHrGghiUYjhaLQ==} engines: {node: ^20.19.0 || >=22.12.0} cpu: [s390x] os: [linux] + libc: [glibc] '@rolldown/binding-linux-s390x-gnu@1.0.0-rc.17': resolution: {integrity: sha512-AH8oq3XqQo4IibpVXvPeLDI5pzkpYn0WiZAfT05kFzoJ6tQNzwRdDYQ45M8I/gslbodRZwW8uxLhbSBbkv96rA==} engines: {node: ^20.19.0 || >=22.12.0} cpu: [s390x] os: [linux] + libc: [glibc] '@rolldown/binding-linux-x64-gnu@1.0.0-rc.16': resolution: {integrity: sha512-FIb8+uG49sZBtLTn+zt1AJ20TqVcqWeSIyoVt0or7uAWesgKaHbiBh6OpA/k9v0LTt+PTrb1Lao133kP4uVxkg==} engines: {node: 
^20.19.0 || >=22.12.0} cpu: [x64] os: [linux] + libc: [glibc] '@rolldown/binding-linux-x64-gnu@1.0.0-rc.17': resolution: {integrity: sha512-cLnjV3xfo7KslbU41Z7z8BH/E1y5mzUYzAqih1d1MDaIGZRCMqTijqLv76/P7fyHuvUcfGsIpqCdddbxLLK9rA==} engines: {node: ^20.19.0 || >=22.12.0} cpu: [x64] os: [linux] + libc: [glibc] '@rolldown/binding-linux-x64-musl@1.0.0-rc.16': resolution: {integrity: sha512-RuERhF9/EgWxZEXYWCOaViUWHIboceK4/ivdtQ3R0T44NjLkIIlGIAVAuCddFxsZ7vnRHtNQUrt2vR2n2slB2w==} engines: {node: ^20.19.0 || >=22.12.0} cpu: [x64] os: [linux] + libc: [musl] '@rolldown/binding-linux-x64-musl@1.0.0-rc.17': resolution: {integrity: sha512-0phclDw1spsL7dUB37sIARuis2tAgomCJXAHZlpt8PXZ4Ba0dRP1e+66lsRqrfhISeN9bEGNjQs+T/Fbd7oYGw==} engines: {node: ^20.19.0 || >=22.12.0} cpu: [x64] os: [linux] + libc: [musl] '@rolldown/binding-openharmony-arm64@1.0.0-rc.16': resolution: {integrity: sha512-mXcXnvd9GpazCxeUCCnZ2+YF7nut+ZOEbE4GtaiPtyY6AkhZWbK70y1KK3j+RDhjVq5+U8FySkKRb/+w0EeUwA==} @@ -3837,24 +3907,28 @@ packages: engines: {node: '>= 10'} cpu: [arm64] os: [linux] + libc: [glibc] '@snazzah/davey-linux-arm64-musl@0.1.11': resolution: {integrity: sha512-e6pX6Hiabtz99q+H/YHNkm9JVlpqN8HGh0qPib8G2+UY4/SSH8WvqWipk3v581dMy2oyCHt7MOoY1aU1P1N/xA==} engines: {node: '>= 10'} cpu: [arm64] os: [linux] + libc: [musl] '@snazzah/davey-linux-x64-gnu@0.1.11': resolution: {integrity: sha512-TW5bSoqChOJMbvsDb4wAATYrxmAXuNnse7wFNVSAJUaZKSeRfZbu3UAiPWSNn7GwLwSfU6hg322KZUn8IWCuvg==} engines: {node: '>= 10'} cpu: [x64] os: [linux] + libc: [glibc] '@snazzah/davey-linux-x64-musl@0.1.11': resolution: {integrity: sha512-5j6Pmc+Wzv5lSxVP6quA7teYRJXibkZqQyYGfTDnTsUOO5dPpcojpqlXlkhyvsA1OAQTj4uxbOCciN3cVWwzug==} engines: {node: '>= 10'} cpu: [x64] os: [linux] + libc: [musl] '@snazzah/davey-wasm32-wasi@0.1.11': resolution: {integrity: sha512-rKOwZ/0J8lp+4VEyOdMDBRP9KR+PksZpa9V1Qn0veMzy4FqTVKthkxwGqewheFe0SFg9fdvt798l/PBFrfDeZw==} @@ -5629,24 +5703,28 @@ packages: engines: {node: '>= 12.0.0'} cpu: [arm64] os: [linux] + 
libc: [glibc] lightningcss-linux-arm64-musl@1.32.0: resolution: {integrity: sha512-UpQkoenr4UJEzgVIYpI80lDFvRmPVg6oqboNHfoH4CQIfNA+HOrZ7Mo7KZP02dC6LjghPQJeBsvXhJod/wnIBg==} engines: {node: '>= 12.0.0'} cpu: [arm64] os: [linux] + libc: [musl] lightningcss-linux-x64-gnu@1.32.0: resolution: {integrity: sha512-V7Qr52IhZmdKPVr+Vtw8o+WLsQJYCTd8loIfpDaMRWGUZfBOYEJeyJIkqGIDMZPwPx24pUMfwSxxI8phr/MbOA==} engines: {node: '>= 12.0.0'} cpu: [x64] os: [linux] + libc: [glibc] lightningcss-linux-x64-musl@1.32.0: resolution: {integrity: sha512-bYcLp+Vb0awsiXg/80uCRezCYHNg1/l3mt0gzHnWV9XP1W5sKa5/TCdGWaR/zBM2PeF/HbsQv/j2URNOiVuxWg==} engines: {node: '>= 12.0.0'} cpu: [x64] os: [linux] + libc: [musl] lightningcss-win32-arm64-msvc@1.32.0: resolution: {integrity: sha512-8SbC8BR40pS6baCM8sbtYDSwEVQd4JlFTOlaD3gWGHfThTcABnNDBda6eTZeqbofalIJhFx0qKzgHJmcPTnGdw==} diff --git a/src/plugins/contracts/plugin-registration.senseaudio.contract.test.ts b/src/plugins/contracts/plugin-registration.senseaudio.contract.test.ts new file mode 100644 index 00000000000..4acc9413601 --- /dev/null +++ b/src/plugins/contracts/plugin-registration.senseaudio.contract.test.ts @@ -0,0 +1,4 @@ +import { pluginRegistrationContractCases } from "../../../test/helpers/plugins/plugin-registration-contract-cases.js"; +import { describePluginRegistrationContract } from "../../../test/helpers/plugins/plugin-registration-contract.js"; + +describePluginRegistrationContract(pluginRegistrationContractCases.senseaudio); diff --git a/src/plugins/contracts/plugin-registration.tts-local-cli.contract.test.ts b/src/plugins/contracts/plugin-registration.tts-local-cli.contract.test.ts new file mode 100644 index 00000000000..1cf11ef933b --- /dev/null +++ b/src/plugins/contracts/plugin-registration.tts-local-cli.contract.test.ts @@ -0,0 +1,4 @@ +import { pluginRegistrationContractCases } from "../../../test/helpers/plugins/plugin-registration-contract-cases.js"; +import { describePluginRegistrationContract } from 
"../../../test/helpers/plugins/plugin-registration-contract.js"; + +describePluginRegistrationContract(pluginRegistrationContractCases["tts-local-cli"]); diff --git a/src/tts/provider-registry.test.ts b/src/tts/provider-registry.test.ts index 3ccba8c5760..02a0c982c66 100644 --- a/src/tts/provider-registry.test.ts +++ b/src/tts/provider-registry.test.ts @@ -9,6 +9,7 @@ const loadPluginManifestRegistryMock = vi.fn(() => ({ { id: "elevenlabs", origin: "bundled", contracts: { speechProviders: [{}] } }, { id: "microsoft", origin: "bundled", contracts: { speechProviders: [{}] } }, { id: "openai", origin: "bundled", contracts: { speechProviders: [{}] } }, + { id: "tts-local-cli", origin: "bundled", contracts: { speechProviders: [{}] } }, ], })); @@ -120,6 +121,7 @@ describe("speech provider registry", () => { elevenlabs: { enabled: true }, microsoft: { enabled: true }, openai: { enabled: true }, + "tts-local-cli": { enabled: true }, }, }, }, diff --git a/test/helpers/plugins/plugin-registration-contract-cases.ts b/test/helpers/plugins/plugin-registration-contract-cases.ts index b2bdcc325f9..819cac94a2a 100644 --- a/test/helpers/plugins/plugin-registration-contract-cases.ts +++ b/test/helpers/plugins/plugin-registration-contract-cases.ts @@ -122,11 +122,19 @@ export const pluginRegistrationContractCases = { pluginId: "perplexity", webSearchProviderIds: ["perplexity"], }, + senseaudio: { + pluginId: "senseaudio", + mediaUnderstandingProviderIds: ["senseaudio"], + }, tavily: { pluginId: "tavily", webSearchProviderIds: ["tavily"], toolNames: ["tavily_search", "tavily_extract"], }, + "tts-local-cli": { + pluginId: "tts-local-cli", + speechProviderIds: ["tts-local-cli", "cli"], + }, xai: { pluginId: "xai", providerIds: ["xai"],