From 2c3cf4f387451401fd9d296bf4009f585f5caa5a Mon Sep 17 00:00:00 2001 From: Jealous Date: Tue, 24 Mar 2026 15:53:01 +0800 Subject: [PATCH] chore(tts): rename VOICE_BUBBLE identifiers to OPUS and update docs --- docs/tools/tts.md | 18 +++++++++--------- docs/tts.md | 18 +++++++++--------- extensions/matrix/src/matrix/send/types.ts | 2 +- extensions/telegram/src/send.ts | 4 ++-- src/media/audio-tags.ts | 2 +- src/tts/tts.test.ts | 2 +- src/tts/tts.ts | 16 ++++++++-------- 7 files changed, 31 insertions(+), 31 deletions(-) diff --git a/docs/tools/tts.md b/docs/tools/tts.md index a527d49cc21..f87c47eb725 100644 --- a/docs/tools/tts.md +++ b/docs/tools/tts.md @@ -10,7 +10,7 @@ title: "Text-to-Speech" # Text-to-speech (TTS) OpenClaw can convert outbound replies into audio using ElevenLabs, Microsoft, or OpenAI. -It works anywhere OpenClaw can send audio; Telegram gets a round voice-note bubble. +It works anywhere OpenClaw can send audio. ## Supported services @@ -170,7 +170,7 @@ Full schema is in [Gateway configuration](/gateway/configuration). } ``` -### Only reply with audio after an inbound voice note +### Only reply with audio after an inbound voice message ```json5 { @@ -203,7 +203,7 @@ Then run: ### Notes on fields - `auto`: auto‑TTS mode (`off`, `always`, `inbound`, `tagged`). - - `inbound` only sends audio after an inbound voice note. + - `inbound` only sends audio after an inbound voice message. - `tagged` only sends audio when the reply includes `[[tts]]` tags. - `enabled`: legacy toggle (doctor migrates this to `auto`). - `mode`: `"final"` (default) or `"all"` (includes tool/block replies). @@ -319,18 +319,18 @@ These override `messages.tts.*` for that host. ## Output formats (fixed) -- **Telegram**: Opus voice note (`opus_48000_64` from ElevenLabs, `opus` from OpenAI). - - 48kHz / 64kbps is a good voice-note tradeoff and required for the round bubble. 
+- **Feishu / Matrix / Telegram / WhatsApp**: Opus voice message (`opus_48000_64` from ElevenLabs, `opus` from OpenAI). + - 48kHz / 64kbps is a good voice message tradeoff. - **Other channels**: MP3 (`mp3_44100_128` from ElevenLabs, `mp3` from OpenAI). - 44.1kHz / 128kbps is the default balance for speech clarity. - **Microsoft**: uses `microsoft.outputFormat` (default `audio-24khz-48kbitrate-mono-mp3`). - The bundled transport accepts an `outputFormat`, but not all formats are available from the service. - Output format values follow Microsoft Speech output formats (including Ogg/WebM Opus). - Telegram `sendVoice` accepts OGG/MP3/M4A; use OpenAI/ElevenLabs if you need - guaranteed Opus voice notes. citeturn1search1 + guaranteed Opus voice messages. - If the configured Microsoft output format fails, OpenClaw retries with MP3. -OpenAI/ElevenLabs formats are fixed; Telegram expects Opus for voice-note UX. +OpenAI/ElevenLabs output formats are fixed per channel (see above). ## Auto-TTS behavior @@ -391,8 +391,8 @@ Notes: ## Agent tool The `tts` tool converts text to speech and returns an audio attachment for -reply delivery. When the result is Telegram-compatible, OpenClaw marks it for -voice-bubble delivery. +reply delivery. When the channel is Feishu, Matrix, Telegram, or WhatsApp, +the audio is delivered as a voice message rather than a file attachment. ## Gateway RPC diff --git a/docs/tts.md b/docs/tts.md index 7409ce8b88e..ffd56f8b02e 100644 --- a/docs/tts.md +++ b/docs/tts.md @@ -10,7 +10,7 @@ title: "Text-to-Speech (legacy path)" # Text-to-speech (TTS) OpenClaw can convert outbound replies into audio using ElevenLabs, Microsoft, or OpenAI. -It works anywhere OpenClaw can send audio; Telegram gets a round voice-note bubble. +It works anywhere OpenClaw can send audio. ## Supported services @@ -170,7 +170,7 @@ Full schema is in [Gateway configuration](/gateway/configuration). 
} ``` -### Only reply with audio after an inbound voice note +### Only reply with audio after an inbound voice message ```json5 { @@ -203,7 +203,7 @@ Then run: ### Notes on fields - `auto`: auto‑TTS mode (`off`, `always`, `inbound`, `tagged`). - - `inbound` only sends audio after an inbound voice note. + - `inbound` only sends audio after an inbound voice message. - `tagged` only sends audio when the reply includes `[[tts]]` tags. - `enabled`: legacy toggle (doctor migrates this to `auto`). - `mode`: `"final"` (default) or `"all"` (includes tool/block replies). @@ -319,18 +319,18 @@ These override `messages.tts.*` for that host. ## Output formats (fixed) -- **Telegram**: Opus voice note (`opus_48000_64` from ElevenLabs, `opus` from OpenAI). - - 48kHz / 64kbps is a good voice-note tradeoff and required for the round bubble. +- **Feishu / Matrix / Telegram / WhatsApp**: Opus voice message (`opus_48000_64` from ElevenLabs, `opus` from OpenAI). + - 48kHz / 64kbps is a good voice message tradeoff. - **Other channels**: MP3 (`mp3_44100_128` from ElevenLabs, `mp3` from OpenAI). - 44.1kHz / 128kbps is the default balance for speech clarity. - **Microsoft**: uses `microsoft.outputFormat` (default `audio-24khz-48kbitrate-mono-mp3`). - The bundled transport accepts an `outputFormat`, but not all formats are available from the service. - Output format values follow Microsoft Speech output formats (including Ogg/WebM Opus). - Telegram `sendVoice` accepts OGG/MP3/M4A; use OpenAI/ElevenLabs if you need - guaranteed Opus voice notes. citeturn1search1 + guaranteed Opus voice messages. - If the configured Microsoft output format fails, OpenClaw retries with MP3. -OpenAI/ElevenLabs formats are fixed; Telegram expects Opus for voice-note UX. +OpenAI/ElevenLabs output formats are fixed per channel (see above). ## Auto-TTS behavior @@ -391,8 +391,8 @@ Notes: ## Agent tool The `tts` tool converts text to speech and returns an audio attachment for -reply delivery. 
When the result is Telegram-compatible, OpenClaw marks it for -voice-bubble delivery. +reply delivery. When the channel is Feishu, Matrix, Telegram, or WhatsApp and the audio is +voice-compatible, it is delivered as a voice message rather than a file attachment. ## Gateway RPC diff --git a/extensions/matrix/src/matrix/send/types.ts b/extensions/matrix/src/matrix/send/types.ts index 2d2d8bf3715..f3d40d92543 100644 --- a/extensions/matrix/src/matrix/send/types.ts +++ b/extensions/matrix/src/matrix/send/types.ts @@ -93,7 +93,7 @@ export type MatrixSendOpts = { replyToId?: string; threadId?: string | number | null; timeoutMs?: number; - /** Send audio as voice message (voice bubble) instead of audio file. Defaults to false. */ + /** Send audio as voice message instead of audio file. Defaults to false. */ audioAsVoice?: boolean; }; diff --git a/extensions/telegram/src/send.ts b/extensions/telegram/src/send.ts index 8cd429eb4cc..f68f72046e2 100644 --- a/extensions/telegram/src/send.ts +++ b/extensions/telegram/src/send.ts @@ -67,9 +67,9 @@ type TelegramSendOpts = { retry?: RetryConfig; textMode?: "markdown" | "html"; plainText?: string; - /** Send audio as voice message (voice bubble) instead of audio file. Defaults to false. */ + /** Send audio as voice message instead of audio file. Defaults to false. */ asVoice?: boolean; - /** Send video as video note (voice bubble) instead of regular video. Defaults to false. */ + /** Send video as video note instead of regular video. Defaults to false. */ asVideoNote?: boolean; /** Send message silently (no notification). Defaults to false. */ silent?: boolean; diff --git a/src/media/audio-tags.ts b/src/media/audio-tags.ts index 51591539ac7..5ecb1825df5 100644 --- a/src/media/audio-tags.ts +++ b/src/media/audio-tags.ts @@ -2,7 +2,7 @@ import { parseInlineDirectives } from "../utils/directive-tags.js"; /** * Extract audio mode tag from text. - * Supports [[audio_as_voice]] to send audio as voice bubble instead of file. 
+ * Supports [[audio_as_voice]] to send audio as voice message instead of file. * Default is file (preserves backward compatibility). */ export function parseAudioTag(text?: string): { diff --git a/src/tts/tts.test.ts b/src/tts/tts.test.ts index 18987eb87f5..cf91e1b66a2 100644 --- a/src/tts/tts.test.ts +++ b/src/tts/tts.test.ts @@ -231,7 +231,7 @@ describe("tts", () => { }); describe("resolveOutputFormat", () => { - it("selects opus for voice-bubble channels (telegram/feishu/whatsapp/matrix) and mp3 for others", () => { + it("selects opus for opus channels (telegram/feishu/whatsapp/matrix) and mp3 for others", () => { const cases = [ { channel: "telegram", diff --git a/src/tts/tts.ts b/src/tts/tts.ts index d602462d317..10663d66c76 100644 --- a/src/tts/tts.ts +++ b/src/tts/tts.ts @@ -70,10 +70,10 @@ const DEFAULT_ELEVENLABS_VOICE_SETTINGS = { speed: 1.0, }; -const TELEGRAM_OUTPUT = { +const OPUS_OUTPUT = { openai: "opus" as const, // ElevenLabs output formats use codec_sample_rate_bitrate naming. - // Opus @ 48kHz/64kbps is a good voice-note tradeoff for Telegram. + // Opus @ 48kHz/64kbps is a good voice message tradeoff. 
elevenlabs: "opus_48000_64", extension: ".opus", voiceCompatible: true, @@ -517,12 +517,12 @@ export function setLastTtsAttempt(entry: TtsStatusEntry | undefined): void { lastTtsAttempt = entry; } -/** Channels that require opus audio and support voice-bubble playback */ -const VOICE_BUBBLE_CHANNELS = new Set(["telegram", "feishu", "whatsapp", "matrix"]); +/** Channels that require opus audio */ +const OPUS_CHANNELS = new Set(["telegram", "feishu", "whatsapp", "matrix"]); function resolveOutputFormat(channelId?: string | null) { - if (channelId && VOICE_BUBBLE_CHANNELS.has(channelId)) { - return TELEGRAM_OUTPUT; + if (channelId && OPUS_CHANNELS.has(channelId)) { + return OPUS_OUTPUT; } return DEFAULT_OUTPUT; } @@ -696,7 +696,7 @@ export async function synthesizeSpeech(params: { const { config, providers } = setup; const channelId = resolveChannelId(params.channel); - const target = channelId && VOICE_BUBBLE_CHANNELS.has(channelId) ? "voice-note" : "audio-file"; + const target = channelId && OPUS_CHANNELS.has(channelId) ? "voice-note" : "audio-file"; const errors: string[] = []; @@ -948,7 +948,7 @@ export async function maybeApplyTtsToPayload(params: { const channelId = resolveChannelId(params.channel); const shouldVoice = - channelId !== null && VOICE_BUBBLE_CHANNELS.has(channelId) && result.voiceCompatible === true; + channelId !== null && OPUS_CHANNELS.has(channelId) && result.voiceCompatible === true; const finalPayload = { ...nextPayload, mediaUrl: result.audioPath,