chore(tts): rename VOICE_BUBBLE identifiers to OPUS and update docs

This commit is contained in:
Jealous
2026-03-24 15:53:01 +08:00
committed by Ayaan Zaidi
parent 46d3617d25
commit 2c3cf4f387
7 changed files with 31 additions and 31 deletions

View File

@@ -10,7 +10,7 @@ title: "Text-to-Speech"
# Text-to-speech (TTS)
OpenClaw can convert outbound replies into audio using ElevenLabs, Microsoft, or OpenAI.
It works anywhere OpenClaw can send audio; Telegram gets a round voice-note bubble.
It works anywhere OpenClaw can send audio.
## Supported services
@@ -170,7 +170,7 @@ Full schema is in [Gateway configuration](/gateway/configuration).
}
```
### Only reply with audio after an inbound voice note
### Only reply with audio after an inbound voice message
```json5
{
@@ -203,7 +203,7 @@ Then run:
### Notes on fields
- `auto`: autoTTS mode (`off`, `always`, `inbound`, `tagged`).
- `inbound` only sends audio after an inbound voice note.
- `inbound` only sends audio after an inbound voice message.
- `tagged` only sends audio when the reply includes `[[tts]]` tags.
- `enabled`: legacy toggle (doctor migrates this to `auto`).
- `mode`: `"final"` (default) or `"all"` (includes tool/block replies).
@@ -319,18 +319,18 @@ These override `messages.tts.*` for that host.
## Output formats (fixed)
- **Telegram**: Opus voice note (`opus_48000_64` from ElevenLabs, `opus` from OpenAI).
- 48kHz / 64kbps is a good voice-note tradeoff and required for the round bubble.
- **Feishu / Matrix / Telegram / WhatsApp**: Opus voice message (`opus_48000_64` from ElevenLabs, `opus` from OpenAI).
- 48kHz / 64kbps is a good voice message tradeoff.
- **Other channels**: MP3 (`mp3_44100_128` from ElevenLabs, `mp3` from OpenAI).
- 44.1kHz / 128kbps is the default balance for speech clarity.
- **Microsoft**: uses `microsoft.outputFormat` (default `audio-24khz-48kbitrate-mono-mp3`).
- The bundled transport accepts an `outputFormat`, but not all formats are available from the service.
- Output format values follow Microsoft Speech output formats (including Ogg/WebM Opus).
- Telegram `sendVoice` accepts OGG/MP3/M4A; use OpenAI/ElevenLabs if you need
guaranteed Opus voice notes.
guaranteed Opus voice messages.
- If the configured Microsoft output format fails, OpenClaw retries with MP3.
OpenAI/ElevenLabs formats are fixed; Telegram expects Opus for voice-note UX.
OpenAI/ElevenLabs output formats are fixed per channel (see above).
## Auto-TTS behavior
@@ -391,8 +391,8 @@ Notes:
## Agent tool
The `tts` tool converts text to speech and returns an audio attachment for
reply delivery. When the result is Telegram-compatible, OpenClaw marks it for
voice-bubble delivery.
reply delivery. When the channel is Feishu, Matrix, Telegram, or WhatsApp,
the audio is delivered as a voice message rather than a file attachment.
## Gateway RPC

View File

@@ -10,7 +10,7 @@ title: "Text-to-Speech (legacy path)"
# Text-to-speech (TTS)
OpenClaw can convert outbound replies into audio using ElevenLabs, Microsoft, or OpenAI.
It works anywhere OpenClaw can send audio; Telegram gets a round voice-note bubble.
It works anywhere OpenClaw can send audio.
## Supported services
@@ -170,7 +170,7 @@ Full schema is in [Gateway configuration](/gateway/configuration).
}
```
### Only reply with audio after an inbound voice note
### Only reply with audio after an inbound voice message
```json5
{
@@ -203,7 +203,7 @@ Then run:
### Notes on fields
- `auto`: autoTTS mode (`off`, `always`, `inbound`, `tagged`).
- `inbound` only sends audio after an inbound voice note.
- `inbound` only sends audio after an inbound voice message.
- `tagged` only sends audio when the reply includes `[[tts]]` tags.
- `enabled`: legacy toggle (doctor migrates this to `auto`).
- `mode`: `"final"` (default) or `"all"` (includes tool/block replies).
@@ -319,18 +319,18 @@ These override `messages.tts.*` for that host.
## Output formats (fixed)
- **Telegram**: Opus voice note (`opus_48000_64` from ElevenLabs, `opus` from OpenAI).
- 48kHz / 64kbps is a good voice-note tradeoff and required for the round bubble.
- **Feishu / Matrix / Telegram / WhatsApp**: Opus voice message (`opus_48000_64` from ElevenLabs, `opus` from OpenAI).
- 48kHz / 64kbps is a good voice message tradeoff.
- **Other channels**: MP3 (`mp3_44100_128` from ElevenLabs, `mp3` from OpenAI).
- 44.1kHz / 128kbps is the default balance for speech clarity.
- **Microsoft**: uses `microsoft.outputFormat` (default `audio-24khz-48kbitrate-mono-mp3`).
- The bundled transport accepts an `outputFormat`, but not all formats are available from the service.
- Output format values follow Microsoft Speech output formats (including Ogg/WebM Opus).
- Telegram `sendVoice` accepts OGG/MP3/M4A; use OpenAI/ElevenLabs if you need
guaranteed Opus voice notes.
guaranteed Opus voice messages.
- If the configured Microsoft output format fails, OpenClaw retries with MP3.
OpenAI/ElevenLabs formats are fixed; Telegram expects Opus for voice-note UX.
OpenAI/ElevenLabs output formats are fixed per channel (see above).
## Auto-TTS behavior
@@ -391,8 +391,8 @@ Notes:
## Agent tool
The `tts` tool converts text to speech and returns an audio attachment for
reply delivery. When the result is Telegram-compatible, OpenClaw marks it for
voice-bubble delivery.
reply delivery. When the channel is Feishu, Matrix, Telegram, or WhatsApp,
the audio is delivered as a voice message rather than a file attachment.
## Gateway RPC

View File

@@ -93,7 +93,7 @@ export type MatrixSendOpts = {
replyToId?: string;
threadId?: string | number | null;
timeoutMs?: number;
/** Send audio as voice message (voice bubble) instead of audio file. Defaults to false. */
/** Send audio as voice message instead of audio file. Defaults to false. */
audioAsVoice?: boolean;
};

View File

@@ -67,9 +67,9 @@ type TelegramSendOpts = {
retry?: RetryConfig;
textMode?: "markdown" | "html";
plainText?: string;
/** Send audio as voice message (voice bubble) instead of audio file. Defaults to false. */
/** Send audio as voice message instead of audio file. Defaults to false. */
asVoice?: boolean;
/** Send video as video note (voice bubble) instead of regular video. Defaults to false. */
/** Send video as video note instead of regular video. Defaults to false. */
asVideoNote?: boolean;
/** Send message silently (no notification). Defaults to false. */
silent?: boolean;

View File

@@ -2,7 +2,7 @@ import { parseInlineDirectives } from "../utils/directive-tags.js";
/**
* Extract audio mode tag from text.
* Supports [[audio_as_voice]] to send audio as voice bubble instead of file.
* Supports [[audio_as_voice]] to send audio as voice message instead of file.
* Default is file (preserves backward compatibility).
*/
export function parseAudioTag(text?: string): {

View File

@@ -231,7 +231,7 @@ describe("tts", () => {
});
describe("resolveOutputFormat", () => {
it("selects opus for voice-bubble channels (telegram/feishu/whatsapp/matrix) and mp3 for others", () => {
it("selects opus for opus channels (telegram/feishu/whatsapp/matrix) and mp3 for others", () => {
const cases = [
{
channel: "telegram",

View File

@@ -70,10 +70,10 @@ const DEFAULT_ELEVENLABS_VOICE_SETTINGS = {
speed: 1.0,
};
const TELEGRAM_OUTPUT = {
const OPUS_OUTPUT = {
openai: "opus" as const,
// ElevenLabs output formats use codec_sample_rate_bitrate naming.
// Opus @ 48kHz/64kbps is a good voice-note tradeoff for Telegram.
// Opus @ 48kHz/64kbps is a good voice message tradeoff.
elevenlabs: "opus_48000_64",
extension: ".opus",
voiceCompatible: true,
@@ -517,12 +517,12 @@ export function setLastTtsAttempt(entry: TtsStatusEntry | undefined): void {
lastTtsAttempt = entry;
}
/** Channels that require opus audio and support voice-bubble playback */
const VOICE_BUBBLE_CHANNELS = new Set(["telegram", "feishu", "whatsapp", "matrix"]);
/** Channels that require opus audio */
const OPUS_CHANNELS = new Set(["telegram", "feishu", "whatsapp", "matrix"]);
function resolveOutputFormat(channelId?: string | null) {
if (channelId && VOICE_BUBBLE_CHANNELS.has(channelId)) {
return TELEGRAM_OUTPUT;
if (channelId && OPUS_CHANNELS.has(channelId)) {
return OPUS_OUTPUT;
}
return DEFAULT_OUTPUT;
}
@@ -696,7 +696,7 @@ export async function synthesizeSpeech(params: {
const { config, providers } = setup;
const channelId = resolveChannelId(params.channel);
const target = channelId && VOICE_BUBBLE_CHANNELS.has(channelId) ? "voice-note" : "audio-file";
const target = channelId && OPUS_CHANNELS.has(channelId) ? "voice-note" : "audio-file";
const errors: string[] = [];
@@ -948,7 +948,7 @@ export async function maybeApplyTtsToPayload(params: {
const channelId = resolveChannelId(params.channel);
const shouldVoice =
channelId !== null && VOICE_BUBBLE_CHANNELS.has(channelId) && result.voiceCompatible === true;
channelId !== null && OPUS_CHANNELS.has(channelId) && result.voiceCompatible === true;
const finalPayload = {
...nextPayload,
mediaUrl: result.audioPath,