From 5f6adaf15788ba6b99a4a7186fa33fc05b3ebd0f Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Sat, 2 May 2026 09:24:31 +0100 Subject: [PATCH] fix(tts): honor short tagged speech --- CHANGELOG.md | 1 + extensions/speech-core/src/tts.test.ts | 63 ++++++++++++++++++++++++++ extensions/speech-core/src/tts.ts | 10 ++-- 3 files changed, 71 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index dfd2c10056b..65987e45ba6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -30,6 +30,7 @@ Docs: https://docs.openclaw.ai ### Fixes +- TTS: honor explicit short `[[tts:text]]...[[/tts:text]]` blocks while keeping untagged short auto-TTS suppressed, so tagged voice replies are synthesized instead of being dropped as empty voice-only payloads. Fixes #73758. Thanks @yfge. - Proxy/audio: convert standard `FormData` bodies before proxy-backed undici fetches, so audio transcription and multipart uploads no longer send `[object FormData]` when `HTTP_PROXY` or `HTTPS_PROXY` is configured. Fixes #48554. Thanks @dco5. - Gateway/diagnostics: include a bounded redacted startup error message in stability bundles, so crash-loop reports identify the failing plugin or contract without exposing secrets. Refs #75797. Thanks @ymebosma. - Gateway/pricing: abort in-flight model pricing catalog fetches when Gateway shutdown stops the refresh loop, and avoid post-stop cache writes or refresh timers. Fixes #72208. Thanks @rzcq. diff --git a/extensions/speech-core/src/tts.test.ts b/extensions/speech-core/src/tts.test.ts index 704d2056188..52714a41039 100644 --- a/extensions/speech-core/src/tts.test.ts +++ b/extensions/speech-core/src/tts.test.ts @@ -388,6 +388,69 @@ describe("speech-core native voice-note routing", () => { }); }); + it("synthesizes explicitly tagged short hidden TTS text", async () => { + const cfg = createTtsConfig("openclaw-speech-core-short-hidden-tts-test"); + let mediaDir: string | undefined; + try { + const result = await maybeApplyTtsToPayload({ + payload: { + text: "[[tts:text]]hello[[/tts:text]]", + audioAsVoice: true, + }, + cfg, + channel: "telegram", + kind: "final", + }); + + expect(synthesizeMock).toHaveBeenCalledWith(expect.objectContaining({ text: "hello" })); + expect(result.mediaUrl).toMatch(/voice-\d+\.ogg$/); + expect(result.audioAsVoice).toBe(true); + expect(result.text).toBeUndefined(); + mediaDir = result.mediaUrl ? path.dirname(result.mediaUrl) : undefined; + } finally { + if (mediaDir) { + rmSync(mediaDir, { recursive: true, force: true }); + } + } + }); + + it("keeps skipping untagged short TTS text", async () => { + const cfg = createTtsConfig("openclaw-speech-core-short-plain-tts-test"); + const result = await maybeApplyTtsToPayload({ + payload: { + text: "hello", + audioAsVoice: true, + }, + cfg, + channel: "telegram", + kind: "final", + }); + + expect(synthesizeMock).not.toHaveBeenCalled(); + expect(result).toEqual({ + text: "hello", + audioAsVoice: true, + }); + }); + + it("keeps skipping explicit tagged TTS text that strips to empty markdown", async () => { + const cfg = createTtsConfig("openclaw-speech-core-empty-hidden-tts-test"); + const result = await maybeApplyTtsToPayload({ + payload: { + text: "[[tts:text]]***[[/tts:text]]", + audioAsVoice: true, + }, + cfg, + channel: "telegram", + kind: "final", + }); + + expect(synthesizeMock).not.toHaveBeenCalled(); + expect(result).toEqual({ + audioAsVoice: true, + }); + }); + it("selects persona preferred provider before config fallback", () => { const cfg: OpenClawConfig = { messages: { diff --git a/extensions/speech-core/src/tts.ts b/extensions/speech-core/src/tts.ts index 52404c2db48..9616334e607 100644 --- a/extensions/speech-core/src/tts.ts +++ b/extensions/speech-core/src/tts.ts @@ -1527,7 +1527,8 @@ export async function maybeApplyTtsToPayload(params: { const cleanedText = directives.cleanedText; const trimmedCleaned = cleanedText.trim(); const visibleText = trimmedCleaned.length > 0 ? trimmedCleaned : ""; - const ttsText = directives.ttsText?.trim() || visibleText; + const explicitTtsText = directives.ttsText?.trim() || ""; + const ttsText = explicitTtsText || visibleText; const nextPayload = visibleText === text.trim() @@ -1558,7 +1559,7 @@ export async function maybeApplyTtsToPayload(params: { if (text.includes("MEDIA:")) { return nextPayload; } - if (ttsText.trim().length < 10) { + if (!explicitTtsText && ttsText.trim().length < 10) { return nextPayload; } @@ -1598,7 +1599,10 @@ export async function maybeApplyTtsToPayload(params: { } textForAudio = stripMarkdown(textForAudio).trim(); - if (textForAudio.length < 10) { + if (!textForAudio) { + return nextPayload; + } + if (!explicitTtsText && textForAudio.length < 10) { return nextPayload; }