From 41fe304ce77ebc035b391be9c729307ce40c4d8c Mon Sep 17 00:00:00 2001 From: Gustavo Madeira Santana Date: Sun, 15 Mar 2026 19:37:15 +0000 Subject: [PATCH] TTS: extract payload planning --- src/extension-host/tts-payload.ts | 139 ++++++++++++++++++++++++++++++ src/tts/tts.ts | 116 ++++--------------------- 2 files changed, 157 insertions(+), 98 deletions(-) create mode 100644 src/extension-host/tts-payload.ts diff --git a/src/extension-host/tts-payload.ts b/src/extension-host/tts-payload.ts new file mode 100644 index 00000000000..0bce003071b --- /dev/null +++ b/src/extension-host/tts-payload.ts @@ -0,0 +1,139 @@ +import type { ReplyPayload } from "../auto-reply/types.js"; +import type { OpenClawConfig } from "../config/config.js"; +import { logVerbose } from "../globals.js"; +import { stripMarkdown } from "../line/markdown-to-line.js"; +import { parseTtsDirectives, summarizeText } from "../tts/tts-core.js"; +import type { ResolvedTtsConfig, TtsDirectiveOverrides } from "../tts/tts.js"; +import { + getExtensionHostTtsMaxLength, + isExtensionHostTtsSummarizationEnabled, + resolveExtensionHostTtsAutoMode, +} from "./tts-preferences.js"; + +export type ExtensionHostTtsPayloadPlan = + | { + kind: "skip"; + payload: ReplyPayload; + } + | { + kind: "ready"; + nextPayload: ReplyPayload; + textForAudio: string; + wasSummarized: boolean; + overrides: TtsDirectiveOverrides; + }; + +export async function resolveExtensionHostTtsPayloadPlan(params: { + payload: ReplyPayload; + cfg: OpenClawConfig; + config: ResolvedTtsConfig; + prefsPath: string; + kind?: "tool" | "block" | "final"; + inboundAudio?: boolean; + ttsAuto?: string; +}): Promise<ExtensionHostTtsPayloadPlan> { + const autoMode = resolveExtensionHostTtsAutoMode({ + config: params.config, + prefsPath: params.prefsPath, + sessionAuto: params.ttsAuto, + }); + if (autoMode === "off") { + return { kind: "skip", payload: params.payload }; + } + + const text = params.payload.text ?? 
""; + const directives = parseTtsDirectives( + text, + params.config.modelOverrides, + params.config.openai.baseUrl, + ); + if (directives.warnings.length > 0) { + logVerbose(`TTS: ignored directive overrides (${directives.warnings.join("; ")})`); + } + + const cleanedText = directives.cleanedText; + const trimmedCleaned = cleanedText.trim(); + const visibleText = trimmedCleaned.length > 0 ? trimmedCleaned : ""; + const ttsText = directives.ttsText?.trim() || visibleText; + + const nextPayload = + visibleText === text.trim() + ? params.payload + : { + ...params.payload, + text: visibleText.length > 0 ? visibleText : undefined, + }; + + if (autoMode === "tagged" && !directives.hasDirective) { + return { kind: "skip", payload: nextPayload }; + } + if (autoMode === "inbound" && params.inboundAudio !== true) { + return { kind: "skip", payload: nextPayload }; + } + + const mode = params.config.mode ?? "final"; + if (mode === "final" && params.kind && params.kind !== "final") { + return { kind: "skip", payload: nextPayload }; + } + + if (!ttsText.trim()) { + return { kind: "skip", payload: nextPayload }; + } + if (params.payload.mediaUrl || (params.payload.mediaUrls?.length ?? 
0) > 0) { + return { kind: "skip", payload: nextPayload }; + } + if (text.includes("MEDIA:")) { + return { kind: "skip", payload: nextPayload }; + } + if (ttsText.trim().length < 10) { + return { kind: "skip", payload: nextPayload }; + } + + const maxLength = getExtensionHostTtsMaxLength(params.prefsPath); + let textForAudio = ttsText.trim(); + let wasSummarized = false; + + if (textForAudio.length > maxLength) { + if (!isExtensionHostTtsSummarizationEnabled(params.prefsPath)) { + logVerbose( + `TTS: truncating long text (${textForAudio.length} > ${maxLength}), summarization disabled.`, + ); + textForAudio = `${textForAudio.slice(0, maxLength - 3)}...`; + } else { + try { + const summary = await summarizeText({ + text: textForAudio, + targetLength: maxLength, + cfg: params.cfg, + config: params.config, + timeoutMs: params.config.timeoutMs, + }); + textForAudio = summary.summary; + wasSummarized = true; + if (textForAudio.length > params.config.maxTextLength) { + logVerbose( + `TTS: summary exceeded hard limit (${textForAudio.length} > ${params.config.maxTextLength}); truncating.`, + ); + textForAudio = `${textForAudio.slice(0, params.config.maxTextLength - 3)}...`; + } + } catch (err) { + const error = err as Error; + logVerbose(`TTS: summarization failed, truncating instead: ${error.message}`); + textForAudio = `${textForAudio.slice(0, maxLength - 3)}...`; + } + } + } + + textForAudio = stripMarkdown(textForAudio).trim(); + if (textForAudio.length < 10) { + return { kind: "skip", payload: nextPayload }; + } + + return { + kind: "ready", + nextPayload, + textForAudio, + wasSummarized, + overrides: directives.overrides, + }; +} diff --git a/src/tts/tts.ts b/src/tts/tts.ts index 3cb8bb579cd..5a12e94880f 100644 --- a/src/tts/tts.ts +++ b/src/tts/tts.ts @@ -8,6 +8,7 @@ import type { TtsProvider, TtsModelOverrideConfig, } from "../config/types.tts.js"; +import { resolveExtensionHostTtsPayloadPlan } from "../extension-host/tts-payload.js"; import { 
getExtensionHostTtsMaxLength, isExtensionHostTtsEnabled, @@ -39,7 +40,6 @@ import { resolveExtensionHostTtsRequestSetup, } from "../extension-host/tts-runtime-setup.js"; import { logVerbose } from "../globals.js"; -import { stripMarkdown } from "../line/markdown-to-line.js"; import { DEFAULT_OPENAI_BASE_URL, isValidOpenAIModel, @@ -47,8 +47,8 @@ import { isValidVoiceId, OPENAI_TTS_MODELS, OPENAI_TTS_VOICES, - resolveOpenAITtsInstructions, parseTtsDirectives, + resolveOpenAITtsInstructions, summarizeText, } from "./tts-core.js"; export { OPENAI_TTS_MODELS, OPENAI_TTS_VOICES } from "./tts-core.js"; @@ -305,7 +305,7 @@ export function buildTtsSystemPromptHint(cfg: OpenClawConfig): string | undefine return undefined; } const maxLength = getExtensionHostTtsMaxLength(prefsPath); - const summarize = isExtensionHostTtsSummarizationEnabled(prefsPath) ? "on" : "off"; + const summarize = isSummarizationEnabled(prefsPath) ? "on" : "off"; const autoHint = autoMode === "inbound" ? "Only use TTS when the user's last message includes audio/voice." @@ -417,114 +417,34 @@ export async function maybeApplyTtsToPayload(params: { }): Promise { const config = resolveTtsConfig(params.cfg); const prefsPath = resolveTtsPrefsPath(config); - const autoMode = resolveTtsAutoMode({ + const plan = await resolveExtensionHostTtsPayloadPlan({ + payload: params.payload, + cfg: params.cfg, config, prefsPath, - sessionAuto: params.ttsAuto, + kind: params.kind, + inboundAudio: params.inboundAudio, + ttsAuto: params.ttsAuto, }); - if (autoMode === "off") { - return params.payload; - } - - const text = params.payload.text ?? ""; - const directives = parseTtsDirectives(text, config.modelOverrides, config.openai.baseUrl); - if (directives.warnings.length > 0) { - logVerbose(`TTS: ignored directive overrides (${directives.warnings.join("; ")})`); - } - - const cleanedText = directives.cleanedText; - const trimmedCleaned = cleanedText.trim(); - const visibleText = trimmedCleaned.length > 0 ? 
trimmedCleaned : ""; - const ttsText = directives.ttsText?.trim() || visibleText; - - const nextPayload = - visibleText === text.trim() - ? params.payload - : { - ...params.payload, - text: visibleText.length > 0 ? visibleText : undefined, - }; - - if (autoMode === "tagged" && !directives.hasDirective) { - return nextPayload; - } - if (autoMode === "inbound" && params.inboundAudio !== true) { - return nextPayload; - } - - const mode = config.mode ?? "final"; - if (mode === "final" && params.kind && params.kind !== "final") { - return nextPayload; - } - - if (!ttsText.trim()) { - return nextPayload; - } - if (params.payload.mediaUrl || (params.payload.mediaUrls?.length ?? 0) > 0) { - return nextPayload; - } - if (text.includes("MEDIA:")) { - return nextPayload; - } - if (ttsText.trim().length < 10) { - return nextPayload; - } - - const maxLength = getTtsMaxLength(prefsPath); - let textForAudio = ttsText.trim(); - let wasSummarized = false; - - if (textForAudio.length > maxLength) { - if (!isSummarizationEnabled(prefsPath)) { - logVerbose( - `TTS: truncating long text (${textForAudio.length} > ${maxLength}), summarization disabled.`, - ); - textForAudio = `${textForAudio.slice(0, maxLength - 3)}...`; - } else { - try { - const summary = await summarizeText({ - text: textForAudio, - targetLength: maxLength, - cfg: params.cfg, - config, - timeoutMs: config.timeoutMs, - }); - textForAudio = summary.summary; - wasSummarized = true; - if (textForAudio.length > config.maxTextLength) { - logVerbose( - `TTS: summary exceeded hard limit (${textForAudio.length} > ${config.maxTextLength}); truncating.`, - ); - textForAudio = `${textForAudio.slice(0, config.maxTextLength - 3)}...`; - } - } catch (err) { - const error = err as Error; - logVerbose(`TTS: summarization failed, truncating instead: ${error.message}`); - textForAudio = `${textForAudio.slice(0, maxLength - 3)}...`; - } - } - } - - textForAudio = stripMarkdown(textForAudio).trim(); // strip markdown for TTS (### → 
"hashtag" etc.) - if (textForAudio.length < 10) { - return nextPayload; + if (plan.kind === "skip") { + return plan.payload; } const ttsStart = Date.now(); const result = await textToSpeech({ - text: textForAudio, + text: plan.textForAudio, cfg: params.cfg, prefsPath, channel: params.channel, - overrides: directives.overrides, + overrides: plan.overrides, }); if (result.success && result.audioPath) { lastTtsAttempt = { timestamp: Date.now(), success: true, - textLength: text.length, - summarized: wasSummarized, + textLength: (params.payload.text ?? "").length, + summarized: plan.wasSummarized, provider: result.provider, latencyMs: result.latencyMs, }; @@ -532,7 +452,7 @@ export async function maybeApplyTtsToPayload(params: { const shouldVoice = isExtensionHostTtsVoiceBubbleChannel(params.channel) && result.voiceCompatible === true; const finalPayload = { - ...nextPayload, + ...plan.nextPayload, mediaUrl: result.audioPath, audioAsVoice: shouldVoice || params.payload.audioAsVoice, }; @@ -542,8 +462,8 @@ export async function maybeApplyTtsToPayload(params: { lastTtsAttempt = { timestamp: Date.now(), success: false, - textLength: text.length, - summarized: wasSummarized, + textLength: (params.payload.text ?? "").length, + summarized: plan.wasSummarized, error: result.error, };