From c02605253d70381a19a1849abca0ce1a4ae03be9 Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Sat, 2 May 2026 03:16:30 +0100 Subject: [PATCH] fix: require explicit TTS intent --- CHANGELOG.md | 1 + docs/tools/tts.md | 3 +++ src/agents/openclaw-tools.tts-config.test.ts | 21 ++++++++++++++++++++ src/agents/tools/tts-tool.test.ts | 8 ++++++++ src/agents/tools/tts-tool.ts | 4 +++- src/tts/tts-config.test.ts | 11 ++++++++++ 6 files changed, 47 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4123d1d9a9c..1ace21f40a2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,6 +19,7 @@ Docs: https://docs.openclaw.ai ### Fixes - TTS/Telegram: keep trusted local audio generated by the TTS tool queued for voice-note delivery even when the run-level built-in tool list omits the raw `tts` name. Fixes #74752. Thanks @Loveworld3033 and @andyliu. +- TTS: require explicit user or config audio intent for the agent speech tool so dashboard chats stay text unless audio is requested. Fixes #69777. Thanks @alexandre-leng. - Heartbeat: strip legacy `[TOOL_CALL]...[/TOOL_CALL]` and `[TOOL_RESULT]...[/TOOL_RESULT]` pseudo-call blocks from heartbeat replies before channel delivery. Fixes #54138. Thanks @Deniable9570. - macOS/Voice Wake: send wake-word and Push-to-Talk transcripts through the selected macOS session target instead of always falling back to main WebChat. Fixes #51040. Thanks @carl-jeffrolc. - Providers/xAI: give Grok `web_search` a 60s default timeout, harden malformed xAI Responses parsing, and return structured timeout errors instead of aborting the tool call. Fixes #58063 and #58733. Thanks @dnishimura, @marvcasasola-svg, and @Nanako0129. diff --git a/docs/tools/tts.md b/docs/tools/tts.md index 8231921767e..45c66429793 100644 --- a/docs/tools/tts.md +++ b/docs/tools/tts.md @@ -48,6 +48,9 @@ audio attachments everywhere else, and PCM/Ulaw streams for telephony and Talk. Auto-TTS is **off** by default. 
When `messages.tts.provider` is unset, OpenClaw picks the first configured provider in registry auto-select order. +The built-in `tts` agent tool is explicit-intent only: ordinary chat stays +text unless the user asks for audio, uses `/tts`, or enables Auto-TTS/directive +speech. ## Supported providers diff --git a/src/agents/openclaw-tools.tts-config.test.ts b/src/agents/openclaw-tools.tts-config.test.ts index f84a56f735d..a592fbedf6a 100644 --- a/src/agents/openclaw-tools.tts-config.test.ts +++ b/src/agents/openclaw-tools.tts-config.test.ts @@ -168,6 +168,27 @@ describe("createOpenClawTools TTS config wiring", () => { } }); + it("keeps direct TTS tool guidance explicit even when the tool is available", async () => { + const { __testing, createOpenClawTools } = await import("./openclaw-tools.js"); + __testing.setDepsForTest({ config: {} }); + + try { + const tool = createOpenClawTools({ + disableMessageTool: true, + disablePluginTools: true, + }).find((candidate) => candidate.name === "tts"); + + if (!tool) { + throw new Error("missing tts tool"); + } + + expect(tool.description).toContain("Use only for explicit audio intent"); + expect(tool.description).toContain("Never use for ordinary text replies"); + } finally { + __testing.setDepsForTest(); + } + }); + it("passes the resolved session agent id into the tts tool", async () => { const injectedConfig = { agents: { diff --git a/src/agents/tools/tts-tool.test.ts b/src/agents/tools/tts-tool.test.ts index c78cf85f701..b32e888151b 100644 --- a/src/agents/tools/tts-tool.test.ts +++ b/src/agents/tools/tts-tool.test.ts @@ -17,6 +17,14 @@ describe("createTtsTool", () => { expect(tool.description).toContain(SILENT_REPLY_TOKEN); }); + it("requires explicit user or config audio intent in guidance text", () => { + const tool = createTtsTool(); + + expect(tool.description).toContain("Use only for explicit audio intent"); + expect(tool.description).toContain("active TTS config"); + expect(tool.description).toContain("Never use 
for ordinary text replies"); + }); + it("stores audio delivery in details.media and preserves the spoken text in content", async () => { textToSpeechSpy.mockResolvedValue({ success: true, diff --git a/src/agents/tools/tts-tool.ts b/src/agents/tools/tts-tool.ts index 3b8386bb195..715dc5ab4e4 100644 --- a/src/agents/tools/tts-tool.ts +++ b/src/agents/tools/tts-tool.ts @@ -64,7 +64,9 @@ export function createTtsTool(opts?: { label: "TTS", name: "tts", displaySummary: "Convert text to speech and return audio.", - description: `Convert text to speech. Audio is delivered automatically from the tool result — reply with ${SILENT_REPLY_TOKEN} after a successful call to avoid duplicate messages.`, + description: + "Use only for explicit audio intent (audio, voice, speech, TTS) or active TTS config. Never use for ordinary text replies. " + + `Audio is delivered automatically from the tool result — reply with ${SILENT_REPLY_TOKEN} after a successful call to avoid duplicate messages.`, parameters: TtsToolSchema, execute: async (_toolCallId, args) => { const params = args as Record<string, unknown>; diff --git a/src/tts/tts-config.test.ts b/src/tts/tts-config.test.ts index f3a7d6b9d17..c935b1abd9f 100644 --- a/src/tts/tts-config.test.ts +++ b/src/tts/tts-config.test.ts @@ -46,6 +46,17 @@ describe("shouldAttemptTtsPayload", () => { expect(shouldAttemptTtsPayload({ cfg: {} as OpenClawConfig })).toBe(false); }); + it("does not infer automatic TTS from a dashboard text turn without opt-in state", () => { + expect( + shouldAttemptTtsPayload({ + cfg: {} as OpenClawConfig, + agentId: "main", + channelId: "webchat", + accountId: "dashboard", + }), + ).toBe(false); + }); + it("honors session auto state before prefs and config", () => { writeFileSync(prefsPath, JSON.stringify({ tts: { auto: "off" } })); const cfg = { messages: { tts: { auto: "off" } } } as OpenClawConfig;