From 5f6adaf15788ba6b99a4a7186fa33fc05b3ebd0f Mon Sep 17 00:00:00 2001
From: Peter Steinberger <steipete@gmail.com>
Date: Sat, 2 May 2026 09:24:31 +0100
Subject: [PATCH] fix(tts): honor short tagged speech

---
 CHANGELOG.md                           |  1 +
 extensions/speech-core/src/tts.test.ts | 63 ++++++++++++++++++++++++++
 extensions/speech-core/src/tts.ts      | 10 ++--
 3 files changed, 71 insertions(+), 3 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index dfd2c10056b..65987e45ba6 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -30,6 +30,7 @@ Docs: https://docs.openclaw.ai
 
 ### Fixes
 
+- TTS: honor explicit short `[[tts:text]]...[[/tts:text]]` blocks while keeping untagged short auto-TTS suppressed, so tagged voice replies are synthesized instead of being dropped as empty voice-only payloads. Fixes #73758. Thanks @yfge.
 - Proxy/audio: convert standard `FormData` bodies before proxy-backed undici fetches, so audio transcription and multipart uploads no longer send `[object FormData]` when `HTTP_PROXY` or `HTTPS_PROXY` is configured. Fixes #48554. Thanks @dco5.
 - Gateway/diagnostics: include a bounded redacted startup error message in stability bundles, so crash-loop reports identify the failing plugin or contract without exposing secrets. Refs #75797. Thanks @ymebosma.
 - Gateway/pricing: abort in-flight model pricing catalog fetches when Gateway shutdown stops the refresh loop, and avoid post-stop cache writes or refresh timers. Fixes #72208. Thanks @rzcq.
diff --git a/extensions/speech-core/src/tts.test.ts b/extensions/speech-core/src/tts.test.ts
index 704d2056188..52714a41039 100644
--- a/extensions/speech-core/src/tts.test.ts
+++ b/extensions/speech-core/src/tts.test.ts
@@ -388,6 +388,69 @@ describe("speech-core native voice-note routing", () => {
     });
   });
 
+  it("synthesizes explicitly tagged short hidden TTS text", async () => {
+    const cfg = createTtsConfig("openclaw-speech-core-short-hidden-tts-test");
+    let mediaDir: string | undefined;
+    try {
+      const result = await maybeApplyTtsToPayload({
+        payload: {
+          text: "[[tts:text]]hello[[/tts:text]]",
+          audioAsVoice: true,
+        },
+        cfg,
+        channel: "telegram",
+        kind: "final",
+      });
+
+      expect(synthesizeMock).toHaveBeenCalledWith(expect.objectContaining({ text: "hello" }));
+      expect(result.mediaUrl).toMatch(/voice-\d+\.ogg$/);
+      expect(result.audioAsVoice).toBe(true);
+      expect(result.text).toBeUndefined();
+      mediaDir = result.mediaUrl ? path.dirname(result.mediaUrl) : undefined;
+    } finally {
+      if (mediaDir) {
+        rmSync(mediaDir, { recursive: true, force: true });
+      }
+    }
+  });
+
+  it("keeps skipping untagged short TTS text", async () => {
+    const cfg = createTtsConfig("openclaw-speech-core-short-plain-tts-test");
+    const result = await maybeApplyTtsToPayload({
+      payload: {
+        text: "hello",
+        audioAsVoice: true,
+      },
+      cfg,
+      channel: "telegram",
+      kind: "final",
+    });
+
+    expect(synthesizeMock).not.toHaveBeenCalled();
+    expect(result).toEqual({
+      text: "hello",
+      audioAsVoice: true,
+    });
+  });
+
+  it("keeps skipping explicit tagged TTS text that strips to empty markdown", async () => {
+    const cfg = createTtsConfig("openclaw-speech-core-empty-hidden-tts-test");
+    const result = await maybeApplyTtsToPayload({
+      payload: {
+        text: "[[tts:text]]***[[/tts:text]]",
+        audioAsVoice: true,
+      },
+      cfg,
+      channel: "telegram",
+      kind: "final",
+    });
+
+    expect(synthesizeMock).not.toHaveBeenCalled();
+    expect(result).toEqual({
+      audioAsVoice: true,
+    });
+  });
+
   it("selects persona preferred provider before config fallback", () => {
     const cfg: OpenClawConfig = {
       messages: {
diff --git a/extensions/speech-core/src/tts.ts b/extensions/speech-core/src/tts.ts
index 52404c2db48..9616334e607 100644
--- a/extensions/speech-core/src/tts.ts
+++ b/extensions/speech-core/src/tts.ts
@@ -1527,7 +1527,8 @@ export async function maybeApplyTtsToPayload(params: {
   const cleanedText = directives.cleanedText;
   const trimmedCleaned = cleanedText.trim();
   const visibleText = trimmedCleaned.length > 0 ? trimmedCleaned : "";
-  const ttsText = directives.ttsText?.trim() || visibleText;
+  const explicitTtsText = directives.ttsText?.trim() || "";
+  const ttsText = explicitTtsText || visibleText;
 
   const nextPayload =
     visibleText === text.trim()
@@ -1558,7 +1559,7 @@ export async function maybeApplyTtsToPayload(params: {
   if (text.includes("MEDIA:")) {
     return nextPayload;
   }
-  if (ttsText.trim().length < 10) {
+  if (!explicitTtsText && ttsText.trim().length < 10) {
     return nextPayload;
   }
 
@@ -1598,7 +1599,10 @@ export async function maybeApplyTtsToPayload(params: {
   }
 
   textForAudio = stripMarkdown(textForAudio).trim();
-  if (textForAudio.length < 10) {
+  if (!textForAudio) {
+    return nextPayload;
+  }
+  if (!explicitTtsText && textForAudio.length < 10) {
     return nextPayload;
   }