diff --git a/CHANGELOG.md b/CHANGELOG.md index 3ac343e712b..5e812e51a22 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -65,6 +65,9 @@ Docs: https://docs.openclaw.ai ### Fixes +- TTS: strip model-emitted TTS directives from streamed block text before channel + delivery, including directives split across adjacent blocks, while preserving + the accumulated raw reply for final-mode synthesis. Fixes #38937. - ACP: send subagent and async-task completion wakes to external ACP harnesses as plain prompts instead of OpenClaw internal runtime-context envelopes, while keeping those envelopes out of ACP transcripts. diff --git a/docs/tools/tts.md b/docs/tools/tts.md index df0ea10079e..600ed89da15 100644 --- a/docs/tools/tts.md +++ b/docs/tools/tts.md @@ -680,6 +680,10 @@ for a single reply, plus an optional `[[tts:text]]...[[/tts:text]]` block to provide expressive tags (laughter, singing cues, etc) that should only appear in the audio. +Streaming block delivery strips these directives from visible text before the +channel sees them, even when a directive is split across adjacent blocks. Final +mode still parses the accumulated raw reply for TTS synthesis. + `provider=...` directives are ignored unless `modelOverrides.allowProvider: true`. 
Example reply payload: diff --git a/src/auto-reply/reply/dispatch-acp-delivery.test.ts b/src/auto-reply/reply/dispatch-acp-delivery.test.ts index 9fd472d853c..796bfb96018 100644 --- a/src/auto-reply/reply/dispatch-acp-delivery.test.ts +++ b/src/auto-reply/reply/dispatch-acp-delivery.test.ts @@ -196,6 +196,37 @@ describe("createAcpDispatchDeliveryCoordinator", () => { expect(coordinator.getRoutedCounts().block).toBe(0); }); + it("strips split TTS directives from visible ACP block delivery", async () => { + const dispatcher = createDispatcher(); + const coordinator = createAcpDispatchDeliveryCoordinator({ + cfg: createAcpTestConfig({ + messages: { tts: { enabled: true } }, + }), + ctx: buildTestCtx({ + Provider: "visiblechat", + Surface: "visiblechat", + SessionKey: "agent:codex-acp:session-1", + }), + dispatcher, + inboundAudio: false, + shouldRouteToOriginating: false, + }); + + await coordinator.deliver("block", { text: "Intro [[tts:te" }, { skipTts: true }); + await coordinator.deliver( + "block", + { text: "xt]]hidden[[/tts:text]] visible" }, + { skipTts: true }, + ); + + expect(dispatcher.sendBlockReply).toHaveBeenNthCalledWith(1, { text: "Intro " }); + expect(dispatcher.sendBlockReply).toHaveBeenNthCalledWith(2, { text: " visible" }); + expect(coordinator.getAccumulatedVisibleBlockText()).toBe("Intro \n visible"); + expect(coordinator.getAccumulatedBlockTtsText()).toBe( + "Intro [[tts:text]]hidden[[/tts:text]] visible", + ); + }); + it("prefers provider over surface when detecting direct channel visibility", async () => { const coordinator = createAcpDispatchDeliveryCoordinator({ cfg: createAcpTestConfig(), diff --git a/src/auto-reply/reply/dispatch-acp-delivery.ts b/src/auto-reply/reply/dispatch-acp-delivery.ts index a31ff162aee..1a114f44ad2 100644 --- a/src/auto-reply/reply/dispatch-acp-delivery.ts +++ b/src/auto-reply/reply/dispatch-acp-delivery.ts @@ -7,8 +7,9 @@ import { normalizeOptionalLowercaseString, normalizeOptionalString, } from 
"../../shared/string-coerce.js"; +import { createTtsDirectiveTextStreamCleaner } from "../../tts/directives.js"; import { resolveStatusTtsSnapshot } from "../../tts/status-config.js"; -import { resolveConfiguredTtsMode } from "../../tts/tts-config.js"; +import { resolveConfiguredTtsMode, shouldCleanTtsDirectiveText } from "../../tts/tts-config.js"; import type { FinalizedMsgContext } from "../templating.js"; import type { ReplyPayload } from "../types.js"; import type { ReplyDispatchKind, ReplyDispatcher } from "./reply-dispatcher.types.js"; @@ -127,6 +128,9 @@ async function maybeApplyAcpTts(params: { type AcpDispatchDeliveryState = { startedReplyLifecycle: boolean; accumulatedBlockText: string; + accumulatedVisibleBlockText: string; + accumulatedBlockTtsText: string; + cleanBlockTtsDirectiveText?: ReturnType; blockCount: number; deliveredFinalReply: boolean; deliveredVisibleText: boolean; @@ -146,6 +150,8 @@ export type AcpDispatchDeliveryCoordinator = { ) => Promise; getBlockCount: () => number; getAccumulatedBlockText: () => string; + getAccumulatedVisibleBlockText: () => string; + getAccumulatedBlockTtsText: () => string; settleVisibleText: () => Promise; hasDeliveredFinalReply: () => boolean; hasDeliveredVisibleText: () => boolean; @@ -172,6 +178,15 @@ export function createAcpDispatchDeliveryCoordinator(params: { const state: AcpDispatchDeliveryState = { startedReplyLifecycle: false, accumulatedBlockText: "", + accumulatedVisibleBlockText: "", + accumulatedBlockTtsText: "", + cleanBlockTtsDirectiveText: shouldCleanTtsDirectiveText({ + cfg: params.cfg, + ttsAuto: params.sessionTtsAuto, + agentId: params.agentId, + }) + ? 
createTtsDirectiveTextStreamCleaner() + : undefined, blockCount: 0, deliveredFinalReply: false, deliveredVisibleText: false, @@ -279,16 +294,37 @@ export function createAcpDispatchDeliveryCoordinator(params: { payload: ReplyPayload, meta?: AcpDispatchDeliveryMeta, ): Promise => { - if (kind === "block" && normalizeOptionalString(payload.text)) { + let visiblePayload = payload; + const rawBlockText = kind === "block" ? normalizeOptionalString(payload.text) : undefined; + if (rawBlockText) { + const joinsBufferedTtsDirective = + state.cleanBlockTtsDirectiveText?.hasBufferedDirectiveText() === true; if (state.accumulatedBlockText.length > 0) { state.accumulatedBlockText += "\n"; } - state.accumulatedBlockText += payload.text; + state.accumulatedBlockText += rawBlockText; + if (state.accumulatedBlockTtsText.length > 0 && !joinsBufferedTtsDirective) { + state.accumulatedBlockTtsText += "\n"; + } + state.accumulatedBlockTtsText += rawBlockText; state.blockCount += 1; + + if (state.cleanBlockTtsDirectiveText && !payload.isCompactionNotice) { + const text = state.cleanBlockTtsDirectiveText.push(rawBlockText); + visiblePayload = { ...payload, text: text.trim() ? 
text : undefined }; + } + if (visiblePayload.text) { + if (state.accumulatedVisibleBlockText.length > 0) { + state.accumulatedVisibleBlockText += "\n"; + } + state.accumulatedVisibleBlockText += visiblePayload.text; + } } - if (hasOutboundReplyContent(payload, { trimText: true })) { + if (hasOutboundReplyContent(visiblePayload, { trimText: true })) { await startReplyLifecycleOnce(); + } else { + return false; } if (params.suppressUserDelivery) { @@ -296,7 +332,7 @@ export function createAcpDispatchDeliveryCoordinator(params: { } const ttsPayload = await maybeApplyAcpTts({ - payload, + payload: visiblePayload, cfg: params.cfg, agentId: params.agentId, channel: params.ttsChannel, @@ -396,6 +432,8 @@ export function createAcpDispatchDeliveryCoordinator(params: { deliver, getBlockCount: () => state.blockCount, getAccumulatedBlockText: () => state.accumulatedBlockText, + getAccumulatedVisibleBlockText: () => state.accumulatedVisibleBlockText, + getAccumulatedBlockTtsText: () => state.accumulatedBlockTtsText, settleVisibleText: settleDirectVisibleText, hasDeliveredFinalReply: () => state.deliveredFinalReply, hasDeliveredVisibleText: () => state.deliveredVisibleText, diff --git a/src/auto-reply/reply/dispatch-acp.ts b/src/auto-reply/reply/dispatch-acp.ts index b8e7d4ff359..badaa93f5e0 100644 --- a/src/auto-reply/reply/dispatch-acp.ts +++ b/src/auto-reply/reply/dispatch-acp.ts @@ -197,8 +197,9 @@ async function finalizeAcpTurnOutput(params: { let queuedFinal = params.delivery.hasDeliveredVisibleText() && !params.delivery.hasFailedVisibleTextDelivery(); const ttsMode = resolveConfiguredTtsMode(params.cfg, params.agentId); - const accumulatedBlockText = params.delivery.getAccumulatedBlockText(); - const hasAccumulatedBlockText = accumulatedBlockText.trim().length > 0; + const accumulatedVisibleBlockText = params.delivery.getAccumulatedVisibleBlockText(); + const accumulatedBlockTtsText = params.delivery.getAccumulatedBlockTtsText(); + const hasAccumulatedBlockText = 
accumulatedBlockTtsText.trim().length > 0; const ttsStatus = resolveStatusTtsSnapshot({ cfg: params.cfg, sessionAuto: params.sessionTtsAuto, @@ -212,7 +213,7 @@ async function finalizeAcpTurnOutput(params: { try { const { maybeApplyTtsToPayload } = await loadDispatchAcpTtsRuntime(); const ttsSyntheticReply = await maybeApplyTtsToPayload({ - payload: { text: accumulatedBlockText }, + payload: { text: accumulatedBlockTtsText }, cfg: params.cfg, channel: params.ttsChannel, kind: "final", @@ -224,7 +225,7 @@ async function finalizeAcpTurnOutput(params: { const delivered = await params.delivery.deliver("final", { mediaUrl: ttsSyntheticReply.mediaUrl, audioAsVoice: ttsSyntheticReply.audioAsVoice, - spokenText: accumulatedBlockText, + spokenText: accumulatedBlockTtsText, }); queuedFinal = queuedFinal || delivered; finalMediaDelivered = delivered; @@ -238,14 +239,14 @@ async function finalizeAcpTurnOutput(params: { // to prove the final result was visible to the user. const shouldDeliverTextFallback = ttsMode !== "all" && - hasAccumulatedBlockText && + accumulatedVisibleBlockText.trim().length > 0 && !finalMediaDelivered && !params.delivery.hasDeliveredFinalReply() && (!params.delivery.hasDeliveredVisibleText() || params.delivery.hasFailedVisibleTextDelivery()); if (shouldDeliverTextFallback) { const delivered = await params.delivery.deliver( "final", - { text: accumulatedBlockText }, + { text: accumulatedVisibleBlockText }, { skipTts: true }, ); queuedFinal = queuedFinal || delivered; diff --git a/src/auto-reply/reply/dispatch-from-config.shared.test-harness.ts b/src/auto-reply/reply/dispatch-from-config.shared.test-harness.ts index f631f83216e..12d7478abbd 100644 --- a/src/auto-reply/reply/dispatch-from-config.shared.test-harness.ts +++ b/src/auto-reply/reply/dispatch-from-config.shared.test-harness.ts @@ -293,6 +293,7 @@ vi.mock("./dispatch-acp-session.runtime.js", () => ({ vi.mock("../../tts/tts-config.js", () => ({ normalizeTtsAutoMode: (value: unknown) => 
ttsMocks.normalizeTtsAutoMode(value), resolveConfiguredTtsMode: (cfg: OpenClawConfig) => ttsMocks.resolveTtsConfig(cfg).mode, + shouldCleanTtsDirectiveText: () => true, shouldAttemptTtsPayload: () => true, })); diff --git a/src/auto-reply/reply/dispatch-from-config.test.ts b/src/auto-reply/reply/dispatch-from-config.test.ts index f00e197d3db..0209b49753f 100644 --- a/src/auto-reply/reply/dispatch-from-config.test.ts +++ b/src/auto-reply/reply/dispatch-from-config.test.ts @@ -363,6 +363,7 @@ vi.mock("./dispatch-acp-session.runtime.js", () => ({ vi.mock("../../tts/tts-config.js", () => ({ normalizeTtsAutoMode: (value: unknown) => ttsMocks.normalizeTtsAutoMode(value), resolveConfiguredTtsMode: (cfg: OpenClawConfig) => ttsMocks.resolveTtsConfig(cfg).mode, + shouldCleanTtsDirectiveText: () => true, shouldAttemptTtsPayload: () => true, })); @@ -3331,6 +3332,45 @@ describe("dispatchReplyFromConfig", () => { expect(blockReplySentTexts).toContain("The answer is 42"); }); + it("strips split TTS directives from streamed block text before delivery", async () => { + setNoAbort(); + ttsMocks.state.synthesizeFinalAudio = true; + const dispatcher = createDispatcher(); + const ctx = buildTestCtx({ Provider: "whatsapp" }); + const blockReplySentTexts: string[] = []; + const replyResolver = async ( + _ctx: MsgContext, + opts?: GetReplyOptions, + ): Promise => { + await opts?.onBlockReply?.({ text: "Intro [[tts:te" }); + await opts?.onBlockReply?.({ text: "xt]]hidden[[/tts:text]] visible" }); + return undefined; + }; + (dispatcher.sendBlockReply as ReturnType).mockImplementation( + (payload: ReplyPayload) => { + if (payload.text) { + blockReplySentTexts.push(payload.text); + } + return true; + }, + ); + + await dispatchReplyFromConfig({ ctx, cfg: emptyConfig, dispatcher, replyResolver }); + + expect(blockReplySentTexts).toEqual(["Intro ", " visible"]); + expect(blockReplySentTexts.join("")).not.toContain("[[tts"); + expect(blockReplySentTexts.join("")).not.toContain("hidden"); + 
expect(ttsMocks.maybeApplyTtsToPayload).toHaveBeenCalledWith( + expect.objectContaining({ + kind: "final", + payload: { text: "Intro [[tts:text]]hidden[[/tts:text]] visible" }, + }), + ); + expect(dispatcher.sendFinalReply).toHaveBeenCalledWith( + expect.objectContaining({ mediaUrl: "https://example.com/tts-synth.opus" }), + ); + }); + it("forwards generated-media block replies in WhatsApp group sessions", async () => { setNoAbort(); const dispatcher = createDispatcher(); diff --git a/src/auto-reply/reply/dispatch-from-config.ts b/src/auto-reply/reply/dispatch-from-config.ts index 73d697b91e2..396e939ae3c 100644 --- a/src/auto-reply/reply/dispatch-from-config.ts +++ b/src/auto-reply/reply/dispatch-from-config.ts @@ -49,9 +49,11 @@ import { normalizeOptionalLowercaseString, normalizeOptionalString, } from "../../shared/string-coerce.js"; +import { createTtsDirectiveTextStreamCleaner } from "../../tts/directives.js"; import { normalizeTtsAutoMode, resolveConfiguredTtsMode, + shouldCleanTtsDirectiveText, shouldAttemptTtsPayload, } from "../../tts/tts-config.js"; import { INTERNAL_MESSAGE_CHANNEL, normalizeMessageChannel } from "../../utils/message-channel.js"; @@ -931,7 +933,15 @@ export async function dispatchReplyFromConfig( // When block streaming succeeds, there's no final reply, so we need to generate // TTS audio separately from the accumulated block content. let accumulatedBlockText = ""; + let accumulatedBlockTtsText = ""; let blockCount = 0; + const cleanBlockTtsDirectiveText = shouldCleanTtsDirectiveText({ + cfg, + ttsAuto: sessionTtsAuto, + agentId: sessionAgentId, + }) + ? createTtsDirectiveTextStreamCleaner() + : undefined; const resolveToolDeliveryPayload = (payload: ReplyPayload): ReplyPayload | null => { if ( @@ -1076,12 +1086,28 @@ export async function dispatchReplyFromConfig( // Exclude compaction status notices — they are informational UI // signals and must not be synthesised into the spoken reply. 
if (payload.text && !payload.isCompactionNotice) { + const joinsBufferedTtsDirective = + cleanBlockTtsDirectiveText?.hasBufferedDirectiveText() === true; if (accumulatedBlockText.length > 0) { accumulatedBlockText += "\n"; } accumulatedBlockText += payload.text; + if (accumulatedBlockTtsText.length > 0 && !joinsBufferedTtsDirective) { + accumulatedBlockTtsText += "\n"; + } + accumulatedBlockTtsText += payload.text; blockCount++; } + const visiblePayload = + payload.text && cleanBlockTtsDirectiveText && !payload.isCompactionNotice + ? (() => { + const text = cleanBlockTtsDirectiveText.push(payload.text); + return { ...payload, text: text.trim() ? text : undefined }; + })() + : payload; + if (!resolveSendableOutboundReplyParts(visiblePayload).hasContent) { + return; + } // Channels that keep a live draft preview may need to rotate their // preview state at the logical block boundary before queued block // delivery drains asynchronously through the dispatcher. @@ -1093,9 +1119,9 @@ export async function dispatchReplyFromConfig( assistantMessageIndex: payloadMetadata.assistantMessageIndex, } : context; - await params.replyOptions?.onBlockReplyQueued?.(payload, queuedContext); + await params.replyOptions?.onBlockReplyQueued?.(visiblePayload, queuedContext); const ttsPayload = await maybeApplyTtsToReplyPayload({ - payload, + payload: visiblePayload, cfg, channel: deliveryChannel, kind: "block", @@ -1180,11 +1206,11 @@ export async function dispatchReplyFromConfig( ttsMode === "final" && replies.length === 0 && blockCount > 0 && - accumulatedBlockText.trim() + accumulatedBlockTtsText.trim() ) { try { const ttsSyntheticReply = await maybeApplyTtsToReplyPayload({ - payload: { text: accumulatedBlockText }, + payload: { text: accumulatedBlockTtsText }, cfg, channel: deliveryChannel, kind: "final", @@ -1199,7 +1225,7 @@ export async function dispatchReplyFromConfig( const ttsOnlyPayload: ReplyPayload = { mediaUrl: ttsSyntheticReply.mediaUrl, audioAsVoice: 
ttsSyntheticReply.audioAsVoice, - spokenText: accumulatedBlockText, + spokenText: accumulatedBlockTtsText, }; const result = await routeReplyToOriginating(ttsOnlyPayload); if (result) { diff --git a/src/tts/directives.test.ts b/src/tts/directives.test.ts index f111f176397..79ba1836aec 100644 --- a/src/tts/directives.test.ts +++ b/src/tts/directives.test.ts @@ -1,6 +1,6 @@ import { describe, expect, it } from "vitest"; import type { SpeechProviderPlugin } from "../plugins/types.js"; -import { parseTtsDirectives } from "./directives.js"; +import { createTtsDirectiveTextStreamCleaner, parseTtsDirectives } from "./directives.js"; import type { SpeechDirectiveTokenParseContext, SpeechDirectiveTokenParseResult, @@ -218,3 +218,36 @@ describe("parseTtsDirectives provider-aware routing", () => { }); }); }); + +describe("createTtsDirectiveTextStreamCleaner", () => { + it("strips directive tags split across streamed chunks", () => { + const cleaner = createTtsDirectiveTextStreamCleaner(); + + expect(cleaner.push("Hello [[tts:voice=al")).toBe("Hello "); + expect(cleaner.push("loy]]world[[/tt")).toBe("world"); + expect(cleaner.push("s]]")).toBe(""); + expect(cleaner.flush()).toBe(""); + }); + + it("suppresses hidden tts text blocks while preserving normal text", () => { + const cleaner = createTtsDirectiveTextStreamCleaner(); + + expect(cleaner.push("Shown [[tts:text]]hid")).toBe("Shown "); + expect(cleaner.push("den[[/tts:text]] visible")).toBe(" visible"); + expect(cleaner.flush()).toBe(""); + }); + + it("keeps plain tts block contents visible", () => { + const cleaner = createTtsDirectiveTextStreamCleaner(); + + expect(cleaner.push("[[tts]]read")).toBe("read"); + expect(cleaner.push(" this[[/tts]] now")).toBe(" this now"); + }); + + it("preserves non-tts bracket markup and flushes incomplete literals", () => { + const cleaner = createTtsDirectiveTextStreamCleaner(); + + expect(cleaner.push("See [[note")).toBe("See "); + expect(cleaner.flush()).toBe("[[note"); + }); +}); diff 
/**
 * Incremental cleaner that removes TTS directive markup (`[[tts...]]` tags and
 * `[[tts:text]]...[[/tts:text]]` hidden blocks) from text that arrives in chunks.
 */
export type TtsDirectiveTextStreamCleaner = {
  /** Feeds the next chunk and returns the part that is safe to show right now. */
  push: (text: string) => string;
  /**
   * Releases text that was withheld because it might have been the start of a
   * tag. Returns an empty string while a hidden text block is still open.
   * Withheld text is only ever released by a later `push` or by `flush`, so
   * call this once the stream has ended.
   */
  flush: () => string;
  /** True while a partial tag is withheld or a hidden text block is open. */
  hasBufferedDirectiveText: () => boolean;
};

/** Removes all whitespace from a tag body and lowercases it for comparison. */
function normalizeTtsTagBody(body: string): string {
  // Stripping every whitespace run also covers leading/trailing whitespace.
  return body.replace(/\s+/g, "").toLowerCase();
}

/**
 * Classifies the text found between `[[` and `]]`.
 *
 * - `hidden-open` / `hidden-close`: the `tts:text` block delimiters.
 * - `tts`: any other `tts`, `tts:...`, `/tts` or `/tts:...` tag.
 * - `other`: bracket markup that is not a TTS directive.
 */
function classifyTtsTag(body: string): "hidden-open" | "hidden-close" | "tts" | "other" {
  const normalized = normalizeTtsTagBody(body);
  if (normalized === "tts:text") {
    return "hidden-open";
  }
  if (normalized === "/tts:text") {
    return "hidden-close";
  }
  if (
    normalized === "tts" ||
    normalized.startsWith("tts:") ||
    normalized === "/tts" ||
    normalized.startsWith("/tts:")
  ) {
    return "tts";
  }
  return "other";
}

/**
 * Creates a stateful cleaner for streamed reply text.
 *
 * Each `push` returns the visible portion of the text seen so far: TTS tags are
 * dropped, text inside a `[[tts:text]]` block is suppressed, and non-TTS
 * `[[...]]` markup is passed through unchanged. Anything that could still turn
 * into a tag once more text arrives is withheld, including a single trailing
 * `[`, so a directive split at any character boundary is still recognised.
 */
export function createTtsDirectiveTextStreamCleaner(): TtsDirectiveTextStreamCleaner {
  let pending = "";
  let insideHiddenTextBlock = false;

  return {
    push(text: string): string {
      const input = pending + text;
      pending = "";
      let output = "";
      let index = 0;

      while (index < input.length) {
        let tagStart = input.indexOf("[[", index);
        if (tagStart === -1) {
          // A lone trailing "[" may be the first half of "[[" that the next
          // chunk completes; withhold it instead of emitting or discarding it.
          const literalEnd = input.endsWith("[") ? input.length - 1 : input.length;
          if (!insideHiddenTextBlock) {
            output += input.slice(index, literalEnd);
          }
          pending = input.slice(literalEnd);
          break;
        }

        const tagEnd = input.indexOf("]]", tagStart + 2);
        if (tagEnd === -1) {
          // Unterminated tag: emit what precedes it and wait for more text.
          if (!insideHiddenTextBlock) {
            output += input.slice(index, tagStart);
          }
          pending = input.slice(tagStart);
          break;
        }

        // When another "[[" sits between the opener and this "]]", the tag is
        // the innermost pair; the earlier stray "[[" is plain text. Without
        // this, a stray opener would swallow a real directive into one
        // non-TTS tag and re-emit it verbatim.
        const innermostStart = input.lastIndexOf("[[", tagEnd - 2);
        if (innermostStart > tagStart) {
          tagStart = innermostStart;
        }

        if (!insideHiddenTextBlock) {
          output += input.slice(index, tagStart);
        }

        const tag = classifyTtsTag(input.slice(tagStart + 2, tagEnd));
        if (tag === "hidden-open") {
          insideHiddenTextBlock = true;
        } else if (tag === "hidden-close") {
          insideHiddenTextBlock = false;
        } else if (tag === "other" && !insideHiddenTextBlock) {
          // Not a TTS directive: keep the markup exactly as written.
          output += input.slice(tagStart, tagEnd + 2);
        }

        index = tagEnd + 2;
      }

      return output;
    },
    flush(): string {
      const tail = pending;
      pending = "";
      return insideHiddenTextBlock ? "" : tail;
    },
    hasBufferedDirectiveText(): boolean {
      return pending.length > 0 || insideHiddenTextBlock;
    },
  };
}