mirror of
https://github.com/openclaw/openclaw.git
synced 2026-03-26 01:11:37 +00:00
chore(tts): rename VOICE_BUBBLE identifiers to OPUS and update docs
This commit is contained in:
@@ -10,7 +10,7 @@ title: "Text-to-Speech"
|
||||
# Text-to-speech (TTS)
|
||||
|
||||
OpenClaw can convert outbound replies into audio using ElevenLabs, Microsoft, or OpenAI.
|
||||
It works anywhere OpenClaw can send audio; Telegram gets a round voice-note bubble.
|
||||
It works anywhere OpenClaw can send audio.
|
||||
|
||||
## Supported services
|
||||
|
||||
@@ -170,7 +170,7 @@ Full schema is in [Gateway configuration](/gateway/configuration).
|
||||
}
|
||||
```
|
||||
|
||||
### Only reply with audio after an inbound voice note
|
||||
### Only reply with audio after an inbound voice message
|
||||
|
||||
```json5
|
||||
{
|
||||
@@ -203,7 +203,7 @@ Then run:
|
||||
### Notes on fields
|
||||
|
||||
- `auto`: auto‑TTS mode (`off`, `always`, `inbound`, `tagged`).
|
||||
- `inbound` only sends audio after an inbound voice note.
|
||||
- `inbound` only sends audio after an inbound voice message.
|
||||
- `tagged` only sends audio when the reply includes `[[tts]]` tags.
|
||||
- `enabled`: legacy toggle (doctor migrates this to `auto`).
|
||||
- `mode`: `"final"` (default) or `"all"` (includes tool/block replies).
|
||||
@@ -319,18 +319,18 @@ These override `messages.tts.*` for that host.
|
||||
|
||||
## Output formats (fixed)
|
||||
|
||||
- **Telegram**: Opus voice note (`opus_48000_64` from ElevenLabs, `opus` from OpenAI).
|
||||
- 48kHz / 64kbps is a good voice-note tradeoff and required for the round bubble.
|
||||
- **Feishu / Matrix / Telegram / WhatsApp**: Opus voice message (`opus_48000_64` from ElevenLabs, `opus` from OpenAI).
|
||||
- 48kHz / 64kbps is a good voice message tradeoff.
|
||||
- **Other channels**: MP3 (`mp3_44100_128` from ElevenLabs, `mp3` from OpenAI).
|
||||
- 44.1kHz / 128kbps is the default balance for speech clarity.
|
||||
- **Microsoft**: uses `microsoft.outputFormat` (default `audio-24khz-48kbitrate-mono-mp3`).
|
||||
- The bundled transport accepts an `outputFormat`, but not all formats are available from the service.
|
||||
- Output format values follow Microsoft Speech output formats (including Ogg/WebM Opus).
|
||||
- Telegram `sendVoice` accepts OGG/MP3/M4A; use OpenAI/ElevenLabs if you need
|
||||
guaranteed Opus voice notes.
|
||||
guaranteed Opus voice messages.
|
||||
- If the configured Microsoft output format fails, OpenClaw retries with MP3.
|
||||
|
||||
OpenAI/ElevenLabs formats are fixed; Telegram expects Opus for voice-note UX.
|
||||
OpenAI/ElevenLabs output formats are fixed per channel (see above).
|
||||
|
||||
## Auto-TTS behavior
|
||||
|
||||
@@ -391,8 +391,8 @@ Notes:
|
||||
## Agent tool
|
||||
|
||||
The `tts` tool converts text to speech and returns an audio attachment for
|
||||
reply delivery. When the result is Telegram-compatible, OpenClaw marks it for
|
||||
voice-bubble delivery.
|
||||
reply delivery. When the channel is Feishu, Matrix, Telegram, or WhatsApp,
|
||||
the audio is delivered as a voice message rather than a file attachment.
|
||||
|
||||
## Gateway RPC
|
||||
|
||||
|
||||
18
docs/tts.md
18
docs/tts.md
@@ -10,7 +10,7 @@ title: "Text-to-Speech (legacy path)"
|
||||
# Text-to-speech (TTS)
|
||||
|
||||
OpenClaw can convert outbound replies into audio using ElevenLabs, Microsoft, or OpenAI.
|
||||
It works anywhere OpenClaw can send audio; Telegram gets a round voice-note bubble.
|
||||
It works anywhere OpenClaw can send audio.
|
||||
|
||||
## Supported services
|
||||
|
||||
@@ -170,7 +170,7 @@ Full schema is in [Gateway configuration](/gateway/configuration).
|
||||
}
|
||||
```
|
||||
|
||||
### Only reply with audio after an inbound voice note
|
||||
### Only reply with audio after an inbound voice message
|
||||
|
||||
```json5
|
||||
{
|
||||
@@ -203,7 +203,7 @@ Then run:
|
||||
### Notes on fields
|
||||
|
||||
- `auto`: auto‑TTS mode (`off`, `always`, `inbound`, `tagged`).
|
||||
- `inbound` only sends audio after an inbound voice note.
|
||||
- `inbound` only sends audio after an inbound voice message.
|
||||
- `tagged` only sends audio when the reply includes `[[tts]]` tags.
|
||||
- `enabled`: legacy toggle (doctor migrates this to `auto`).
|
||||
- `mode`: `"final"` (default) or `"all"` (includes tool/block replies).
|
||||
@@ -319,18 +319,18 @@ These override `messages.tts.*` for that host.
|
||||
|
||||
## Output formats (fixed)
|
||||
|
||||
- **Telegram**: Opus voice note (`opus_48000_64` from ElevenLabs, `opus` from OpenAI).
|
||||
- 48kHz / 64kbps is a good voice-note tradeoff and required for the round bubble.
|
||||
- **Feishu / Matrix / Telegram / WhatsApp**: Opus voice message (`opus_48000_64` from ElevenLabs, `opus` from OpenAI).
|
||||
- 48kHz / 64kbps is a good voice message tradeoff.
|
||||
- **Other channels**: MP3 (`mp3_44100_128` from ElevenLabs, `mp3` from OpenAI).
|
||||
- 44.1kHz / 128kbps is the default balance for speech clarity.
|
||||
- **Microsoft**: uses `microsoft.outputFormat` (default `audio-24khz-48kbitrate-mono-mp3`).
|
||||
- The bundled transport accepts an `outputFormat`, but not all formats are available from the service.
|
||||
- Output format values follow Microsoft Speech output formats (including Ogg/WebM Opus).
|
||||
- Telegram `sendVoice` accepts OGG/MP3/M4A; use OpenAI/ElevenLabs if you need
|
||||
guaranteed Opus voice notes.
|
||||
guaranteed Opus voice messages.
|
||||
- If the configured Microsoft output format fails, OpenClaw retries with MP3.
|
||||
|
||||
OpenAI/ElevenLabs formats are fixed; Telegram expects Opus for voice-note UX.
|
||||
OpenAI/ElevenLabs output formats are fixed per channel (see above).
|
||||
|
||||
## Auto-TTS behavior
|
||||
|
||||
@@ -391,8 +391,8 @@ Notes:
|
||||
## Agent tool
|
||||
|
||||
The `tts` tool converts text to speech and returns an audio attachment for
|
||||
reply delivery. When the result is Telegram-compatible, OpenClaw marks it for
|
||||
voice-bubble delivery.
|
||||
reply delivery. When the channel is Feishu, Matrix, Telegram, or WhatsApp,
|
||||
the audio is delivered as a voice message rather than a file attachment.
|
||||
|
||||
## Gateway RPC
|
||||
|
||||
|
||||
@@ -93,7 +93,7 @@ export type MatrixSendOpts = {
|
||||
replyToId?: string;
|
||||
threadId?: string | number | null;
|
||||
timeoutMs?: number;
|
||||
/** Send audio as voice message (voice bubble) instead of audio file. Defaults to false. */
|
||||
/** Send audio as voice message instead of audio file. Defaults to false. */
|
||||
audioAsVoice?: boolean;
|
||||
};
|
||||
|
||||
|
||||
@@ -67,9 +67,9 @@ type TelegramSendOpts = {
|
||||
retry?: RetryConfig;
|
||||
textMode?: "markdown" | "html";
|
||||
plainText?: string;
|
||||
/** Send audio as voice message (voice bubble) instead of audio file. Defaults to false. */
|
||||
/** Send audio as voice message instead of audio file. Defaults to false. */
|
||||
asVoice?: boolean;
|
||||
/** Send video as video note (voice bubble) instead of regular video. Defaults to false. */
|
||||
/** Send video as video note instead of regular video. Defaults to false. */
|
||||
asVideoNote?: boolean;
|
||||
/** Send message silently (no notification). Defaults to false. */
|
||||
silent?: boolean;
|
||||
|
||||
@@ -2,7 +2,7 @@ import { parseInlineDirectives } from "../utils/directive-tags.js";
|
||||
|
||||
/**
|
||||
* Extract audio mode tag from text.
|
||||
* Supports [[audio_as_voice]] to send audio as voice bubble instead of file.
|
||||
* Supports [[audio_as_voice]] to send audio as voice message instead of file.
|
||||
* Default is file (preserves backward compatibility).
|
||||
*/
|
||||
export function parseAudioTag(text?: string): {
|
||||
|
||||
@@ -231,7 +231,7 @@ describe("tts", () => {
|
||||
});
|
||||
|
||||
describe("resolveOutputFormat", () => {
|
||||
it("selects opus for voice-bubble channels (telegram/feishu/whatsapp/matrix) and mp3 for others", () => {
|
||||
it("selects opus for opus channels (telegram/feishu/whatsapp/matrix) and mp3 for others", () => {
|
||||
const cases = [
|
||||
{
|
||||
channel: "telegram",
|
||||
|
||||
@@ -70,10 +70,10 @@ const DEFAULT_ELEVENLABS_VOICE_SETTINGS = {
|
||||
speed: 1.0,
|
||||
};
|
||||
|
||||
const TELEGRAM_OUTPUT = {
|
||||
const OPUS_OUTPUT = {
|
||||
openai: "opus" as const,
|
||||
// ElevenLabs output formats use codec_sample_rate_bitrate naming.
|
||||
// Opus @ 48kHz/64kbps is a good voice-note tradeoff for Telegram.
|
||||
// Opus @ 48kHz/64kbps is a good voice message tradeoff.
|
||||
elevenlabs: "opus_48000_64",
|
||||
extension: ".opus",
|
||||
voiceCompatible: true,
|
||||
@@ -517,12 +517,12 @@ export function setLastTtsAttempt(entry: TtsStatusEntry | undefined): void {
|
||||
lastTtsAttempt = entry;
|
||||
}
|
||||
|
||||
/** Channels that require opus audio and support voice-bubble playback */
|
||||
const VOICE_BUBBLE_CHANNELS = new Set(["telegram", "feishu", "whatsapp", "matrix"]);
|
||||
/** Channels that require opus audio */
|
||||
const OPUS_CHANNELS = new Set(["telegram", "feishu", "whatsapp", "matrix"]);
|
||||
|
||||
function resolveOutputFormat(channelId?: string | null) {
|
||||
if (channelId && VOICE_BUBBLE_CHANNELS.has(channelId)) {
|
||||
return TELEGRAM_OUTPUT;
|
||||
if (channelId && OPUS_CHANNELS.has(channelId)) {
|
||||
return OPUS_OUTPUT;
|
||||
}
|
||||
return DEFAULT_OUTPUT;
|
||||
}
|
||||
@@ -696,7 +696,7 @@ export async function synthesizeSpeech(params: {
|
||||
|
||||
const { config, providers } = setup;
|
||||
const channelId = resolveChannelId(params.channel);
|
||||
const target = channelId && VOICE_BUBBLE_CHANNELS.has(channelId) ? "voice-note" : "audio-file";
|
||||
const target = channelId && OPUS_CHANNELS.has(channelId) ? "voice-note" : "audio-file";
|
||||
|
||||
const errors: string[] = [];
|
||||
|
||||
@@ -948,7 +948,7 @@ export async function maybeApplyTtsToPayload(params: {
|
||||
|
||||
const channelId = resolveChannelId(params.channel);
|
||||
const shouldVoice =
|
||||
channelId !== null && VOICE_BUBBLE_CHANNELS.has(channelId) && result.voiceCompatible === true;
|
||||
channelId !== null && OPUS_CHANNELS.has(channelId) && result.voiceCompatible === true;
|
||||
const finalPayload = {
|
||||
...nextPayload,
|
||||
mediaUrl: result.audioPath,
|
||||
|
||||
Reference in New Issue
Block a user