From 2c3cf4f387451401fd9d296bf4009f585f5caa5a Mon Sep 17 00:00:00 2001
From: Jealous <CooLanfei@163.com>
Date: Tue, 24 Mar 2026 15:53:01 +0800
Subject: [PATCH] chore(tts): rename VOICE_BUBBLE identifiers to OPUS and
 update docs

---
 docs/tools/tts.md                          | 18 +++++++++---------
 docs/tts.md                                | 18 +++++++++---------
 extensions/matrix/src/matrix/send/types.ts |  2 +-
 extensions/telegram/src/send.ts            |  4 ++--
 src/media/audio-tags.ts                    |  2 +-
 src/tts/tts.test.ts                        |  2 +-
 src/tts/tts.ts                             | 16 ++++++++--------
 7 files changed, 31 insertions(+), 31 deletions(-)

diff --git a/docs/tools/tts.md b/docs/tools/tts.md
index a527d49cc21..f87c47eb725 100644
--- a/docs/tools/tts.md
+++ b/docs/tools/tts.md
@@ -10,7 +10,7 @@ title: "Text-to-Speech"
 # Text-to-speech (TTS)
 
 OpenClaw can convert outbound replies into audio using ElevenLabs, Microsoft, or OpenAI.
-It works anywhere OpenClaw can send audio; Telegram gets a round voice-note bubble.
+It works anywhere OpenClaw can send audio.
 
 ## Supported services
 
@@ -170,7 +170,7 @@ Full schema is in [Gateway configuration](/gateway/configuration).
 }
 ```
 
-### Only reply with audio after an inbound voice note
+### Only reply with audio after an inbound voice message
 
 ```json5
 {
@@ -203,7 +203,7 @@ Then run:
 ### Notes on fields
 
 - `auto`: auto‑TTS mode (`off`, `always`, `inbound`, `tagged`).
-  - `inbound` only sends audio after an inbound voice note.
+  - `inbound` only sends audio after an inbound voice message.
   - `tagged` only sends audio when the reply includes `[[tts]]` tags.
 - `enabled`: legacy toggle (doctor migrates this to `auto`).
 - `mode`: `"final"` (default) or `"all"` (includes tool/block replies).
@@ -319,18 +319,18 @@ These override `messages.tts.*` for that host.
 
 ## Output formats (fixed)
 
-- **Telegram**: Opus voice note (`opus_48000_64` from ElevenLabs, `opus` from OpenAI).
-  - 48kHz / 64kbps is a good voice-note tradeoff and required for the round bubble.
+- **Feishu / Matrix / Telegram / WhatsApp**: Opus voice message (`opus_48000_64` from ElevenLabs, `opus` from OpenAI).
+  - 48kHz / 64kbps is a good tradeoff for voice messages.
 - **Other channels**: MP3 (`mp3_44100_128` from ElevenLabs, `mp3` from OpenAI).
   - 44.1kHz / 128kbps is the default balance for speech clarity.
 - **Microsoft**: uses `microsoft.outputFormat` (default `audio-24khz-48kbitrate-mono-mp3`).
   - The bundled transport accepts an `outputFormat`, but not all formats are available from the service.
   - Output format values follow Microsoft Speech output formats (including Ogg/WebM Opus).
   - Telegram `sendVoice` accepts OGG/MP3/M4A; use OpenAI/ElevenLabs if you need
-    guaranteed Opus voice notes. citeturn1search1
+    guaranteed Opus voice messages.
   - If the configured Microsoft output format fails, OpenClaw retries with MP3.
 
-OpenAI/ElevenLabs formats are fixed; Telegram expects Opus for voice-note UX.
+OpenAI/ElevenLabs output formats are fixed per channel (see above).
 
 ## Auto-TTS behavior
 
@@ -391,8 +391,8 @@ Notes:
 ## Agent tool
 
 The `tts` tool converts text to speech and returns an audio attachment for
-reply delivery. When the result is Telegram-compatible, OpenClaw marks it for
-voice-bubble delivery.
+reply delivery. On Feishu, Matrix, Telegram, and WhatsApp, voice-compatible audio
+is delivered as a voice message rather than a file attachment.
 
 ## Gateway RPC
 
diff --git a/docs/tts.md b/docs/tts.md
index 7409ce8b88e..ffd56f8b02e 100644
--- a/docs/tts.md
+++ b/docs/tts.md
@@ -10,7 +10,7 @@ title: "Text-to-Speech (legacy path)"
 # Text-to-speech (TTS)
 
 OpenClaw can convert outbound replies into audio using ElevenLabs, Microsoft, or OpenAI.
-It works anywhere OpenClaw can send audio; Telegram gets a round voice-note bubble.
+It works anywhere OpenClaw can send audio.
 
 ## Supported services
 
@@ -170,7 +170,7 @@ Full schema is in [Gateway configuration](/gateway/configuration).
 }
 ```
 
-### Only reply with audio after an inbound voice note
+### Only reply with audio after an inbound voice message
 
 ```json5
 {
@@ -203,7 +203,7 @@ Then run:
 ### Notes on fields
 
 - `auto`: auto‑TTS mode (`off`, `always`, `inbound`, `tagged`).
-  - `inbound` only sends audio after an inbound voice note.
+  - `inbound` only sends audio after an inbound voice message.
   - `tagged` only sends audio when the reply includes `[[tts]]` tags.
 - `enabled`: legacy toggle (doctor migrates this to `auto`).
 - `mode`: `"final"` (default) or `"all"` (includes tool/block replies).
@@ -319,18 +319,18 @@ These override `messages.tts.*` for that host.
 
 ## Output formats (fixed)
 
-- **Telegram**: Opus voice note (`opus_48000_64` from ElevenLabs, `opus` from OpenAI).
-  - 48kHz / 64kbps is a good voice-note tradeoff and required for the round bubble.
+- **Feishu / Matrix / Telegram / WhatsApp**: Opus voice message (`opus_48000_64` from ElevenLabs, `opus` from OpenAI).
+  - 48kHz / 64kbps is a good tradeoff for voice messages.
 - **Other channels**: MP3 (`mp3_44100_128` from ElevenLabs, `mp3` from OpenAI).
   - 44.1kHz / 128kbps is the default balance for speech clarity.
 - **Microsoft**: uses `microsoft.outputFormat` (default `audio-24khz-48kbitrate-mono-mp3`).
   - The bundled transport accepts an `outputFormat`, but not all formats are available from the service.
   - Output format values follow Microsoft Speech output formats (including Ogg/WebM Opus).
   - Telegram `sendVoice` accepts OGG/MP3/M4A; use OpenAI/ElevenLabs if you need
-    guaranteed Opus voice notes. citeturn1search1
+    guaranteed Opus voice messages.
   - If the configured Microsoft output format fails, OpenClaw retries with MP3.
 
-OpenAI/ElevenLabs formats are fixed; Telegram expects Opus for voice-note UX.
+OpenAI/ElevenLabs output formats are fixed per channel (see above).
 
 ## Auto-TTS behavior
 
@@ -391,8 +391,8 @@ Notes:
 ## Agent tool
 
 The `tts` tool converts text to speech and returns an audio attachment for
-reply delivery. When the result is Telegram-compatible, OpenClaw marks it for
-voice-bubble delivery.
+reply delivery. On Feishu, Matrix, Telegram, and WhatsApp, voice-compatible audio
+is delivered as a voice message rather than a file attachment.
 
 ## Gateway RPC
 
diff --git a/extensions/matrix/src/matrix/send/types.ts b/extensions/matrix/src/matrix/send/types.ts
index 2d2d8bf3715..f3d40d92543 100644
--- a/extensions/matrix/src/matrix/send/types.ts
+++ b/extensions/matrix/src/matrix/send/types.ts
@@ -93,7 +93,7 @@ export type MatrixSendOpts = {
   replyToId?: string;
   threadId?: string | number | null;
   timeoutMs?: number;
-  /** Send audio as voice message (voice bubble) instead of audio file. Defaults to false. */
+  /** Send audio as voice message instead of audio file. Defaults to false. */
   audioAsVoice?: boolean;
 };
 
diff --git a/extensions/telegram/src/send.ts b/extensions/telegram/src/send.ts
index 8cd429eb4cc..f68f72046e2 100644
--- a/extensions/telegram/src/send.ts
+++ b/extensions/telegram/src/send.ts
@@ -67,9 +67,9 @@ type TelegramSendOpts = {
   retry?: RetryConfig;
   textMode?: "markdown" | "html";
   plainText?: string;
-  /** Send audio as voice message (voice bubble) instead of audio file. Defaults to false. */
+  /** Send audio as voice message instead of audio file. Defaults to false. */
   asVoice?: boolean;
-  /** Send video as video note (voice bubble) instead of regular video. Defaults to false. */
+  /** Send video as video note instead of regular video. Defaults to false. */
   asVideoNote?: boolean;
   /** Send message silently (no notification). Defaults to false. */
   silent?: boolean;
diff --git a/src/media/audio-tags.ts b/src/media/audio-tags.ts
index 51591539ac7..5ecb1825df5 100644
--- a/src/media/audio-tags.ts
+++ b/src/media/audio-tags.ts
@@ -2,7 +2,7 @@ import { parseInlineDirectives } from "../utils/directive-tags.js";
 
 /**
  * Extract audio mode tag from text.
- * Supports [[audio_as_voice]] to send audio as voice bubble instead of file.
+ * Supports [[audio_as_voice]] to send audio as voice message instead of file.
  * Default is file (preserves backward compatibility).
  */
 export function parseAudioTag(text?: string): {
diff --git a/src/tts/tts.test.ts b/src/tts/tts.test.ts
index 18987eb87f5..cf91e1b66a2 100644
--- a/src/tts/tts.test.ts
+++ b/src/tts/tts.test.ts
@@ -231,7 +231,7 @@ describe("tts", () => {
   });
 
   describe("resolveOutputFormat", () => {
-    it("selects opus for voice-bubble channels (telegram/feishu/whatsapp/matrix) and mp3 for others", () => {
+    it("selects opus for opus channels (telegram/feishu/whatsapp/matrix) and mp3 for others", () => {
       const cases = [
         {
           channel: "telegram",
diff --git a/src/tts/tts.ts b/src/tts/tts.ts
index d602462d317..10663d66c76 100644
--- a/src/tts/tts.ts
+++ b/src/tts/tts.ts
@@ -70,10 +70,10 @@ const DEFAULT_ELEVENLABS_VOICE_SETTINGS = {
   speed: 1.0,
 };
 
-const TELEGRAM_OUTPUT = {
+const OPUS_OUTPUT = {
   openai: "opus" as const,
   // ElevenLabs output formats use codec_sample_rate_bitrate naming.
-  // Opus @ 48kHz/64kbps is a good voice-note tradeoff for Telegram.
+  // Opus @ 48kHz/64kbps is a good voice message tradeoff.
   elevenlabs: "opus_48000_64",
   extension: ".opus",
   voiceCompatible: true,
@@ -517,12 +517,12 @@ export function setLastTtsAttempt(entry: TtsStatusEntry | undefined): void {
   lastTtsAttempt = entry;
 }
 
-/** Channels that require opus audio and support voice-bubble playback */
-const VOICE_BUBBLE_CHANNELS = new Set(["telegram", "feishu", "whatsapp", "matrix"]);
+/** Channels that require Opus audio; voice-compatible TTS output for them is sent as a voice message. */
+const OPUS_CHANNELS = new Set(["telegram", "feishu", "whatsapp", "matrix"]);
 
 function resolveOutputFormat(channelId?: string | null) {
-  if (channelId && VOICE_BUBBLE_CHANNELS.has(channelId)) {
-    return TELEGRAM_OUTPUT;
+  if (channelId && OPUS_CHANNELS.has(channelId)) {
+    return OPUS_OUTPUT;
   }
   return DEFAULT_OUTPUT;
 }
@@ -696,7 +696,7 @@ export async function synthesizeSpeech(params: {
 
   const { config, providers } = setup;
   const channelId = resolveChannelId(params.channel);
-  const target = channelId && VOICE_BUBBLE_CHANNELS.has(channelId) ? "voice-note" : "audio-file";
+  const target = channelId && OPUS_CHANNELS.has(channelId) ? "voice-note" : "audio-file";
 
   const errors: string[] = [];
 
@@ -948,7 +948,7 @@ export async function maybeApplyTtsToPayload(params: {
 
     const channelId = resolveChannelId(params.channel);
     const shouldVoice =
-      channelId !== null && VOICE_BUBBLE_CHANNELS.has(channelId) && result.voiceCompatible === true;
+      channelId !== null && OPUS_CHANNELS.has(channelId) && result.voiceCompatible === true;
     const finalPayload = {
       ...nextPayload,
       mediaUrl: result.audioPath,