TTS: extract payload planning

2026-03-25 00:42:24 +00:00 · 2026-03-15 19:37:15 +00:00
parent fb3021da66
commit 41fe304ce7
2 changed files with 157 additions and 98 deletions
--- a/src/extension-host/tts-payload.ts
+++ b/src/extension-host/tts-payload.ts
@@ -0,0 +1,139 @@
+import type { ReplyPayload } from "../auto-reply/types.js";
+import type { OpenClawConfig } from "../config/config.js";
+import { logVerbose } from "../globals.js";
+import { stripMarkdown } from "../line/markdown-to-line.js";
+import { parseTtsDirectives, summarizeText } from "../tts/tts-core.js";
+import type { ResolvedTtsConfig, TtsDirectiveOverrides } from "../tts/tts.js";
+import {
+  getExtensionHostTtsMaxLength,
+  isExtensionHostTtsSummarizationEnabled,
+  resolveExtensionHostTtsAutoMode,
+} from "./tts-preferences.js";
+
+/** No audio will be produced; `payload` is the reply to deliver. */
+type SkipPlan = { kind: "skip"; payload: ReplyPayload };
+/** Audio is wanted: `textForAudio` is the text to synthesize for `nextPayload`. */
+type ReadyPlan = {
+  kind: "ready";
+  nextPayload: ReplyPayload; // reply to deliver together with the synthesized audio
+  textForAudio: string; // trimmed, markdown-stripped text, possibly summarized or truncated
+  wasSummarized: boolean; // true once `summarizeText` has supplied the audio text
+  overrides: TtsDirectiveOverrides; // overrides taken from the parsed directives
+};
+/** Result of TTS payload planning, discriminated by `kind`. */
+export type ExtensionHostTtsPayloadPlan = SkipPlan | ReadyPlan;
+
+const MIN_TTS_TEXT_LENGTH = 10; // texts shorter than this are never synthesized
+
+/** Keeps the first `limit - 3` characters of `text` (never fewer than zero) and appends "...". */
+function truncateForTts(text: string, limit: number): string {
+  return `${text.slice(0, Math.max(0, limit - 3))}...`;
+}
+
+/**
+ * Decides whether a reply should be spoken and prepares the text to synthesize.
+ *
+ * Directives are parsed out of `payload.text`; unless auto mode is "off", the returned payload's
+ * text is replaced by the parser's `cleanedText` when that differs. Text above the preference max
+ * length is summarized (if enabled) or truncated, then markdown is stripped before a final check.
+ * @param params.kind - Reply kind; with mode "final" (the default) other kinds are skipped.
+ * @param params.inboundAudio - Must be `true` for "inbound" auto mode to produce audio.
+ * @param params.ttsAuto - Session-level auto setting passed to the auto-mode resolver.
+ * @returns A "skip" plan with the payload to send, or a "ready" plan with the audio text.
+ */
+export async function resolveExtensionHostTtsPayloadPlan(params: {
+  payload: ReplyPayload;
+  cfg: OpenClawConfig;
+  config: ResolvedTtsConfig;
+  prefsPath: string;
+  kind?: "tool" | "block" | "final";
+  inboundAudio?: boolean;
+  ttsAuto?: string;
+}): Promise<ExtensionHostTtsPayloadPlan> {
+  const { payload, cfg, config, prefsPath } = params;
+  const autoMode = resolveExtensionHostTtsAutoMode({
+    config,
+    prefsPath,
+    sessionAuto: params.ttsAuto,
+  });
+  if (autoMode === "off") {
+    // TTS is off: return the payload untouched, without parsing directives.
+    return { kind: "skip", payload };
+  }
+
+  const text = payload.text ?? "";
+  const directives = parseTtsDirectives(text, config.modelOverrides, config.openai.baseUrl);
+  if (directives.warnings.length > 0) {
+    logVerbose(`TTS: ignored directive overrides (${directives.warnings.join("; ")})`);
+  }
+
+  // Both strings are trimmed here, so later steps do not need to trim again.
+  const visibleText = directives.cleanedText.trim();
+  const ttsText = directives.ttsText?.trim() || visibleText;
+  // Reuse the original object when removing directives changed nothing visible.
+  const nextPayload =
+    visibleText === text.trim()
+      ? payload
+      : { ...payload, text: visibleText.length > 0 ? visibleText : undefined };
+  const skip: ExtensionHostTtsPayloadPlan = { kind: "skip", payload: nextPayload };
+
+  // Auto mode gate: "tagged" needs a directive, "inbound" needs inbound audio.
+  const autoBlocked =
+    (autoMode === "tagged" && !directives.hasDirective) ||
+    (autoMode === "inbound" && params.inboundAudio !== true);
+  // With mode "final" (the default), replies of any other kind are not spoken.
+  const kindBlocked =
+    (config.mode ?? "final") === "final" && Boolean(params.kind) && params.kind !== "final";
+  // Replies that already reference media do not get audio.
+  const hasMedia =
+    Boolean(payload.mediaUrl) || (payload.mediaUrls?.length ?? 0) > 0 || text.includes("MEDIA:");
+  if (autoBlocked || kindBlocked || hasMedia || ttsText.length < MIN_TTS_TEXT_LENGTH) {
+    return skip;
+  }
+
+  const maxLength = getExtensionHostTtsMaxLength(prefsPath);
+  let textForAudio = ttsText;
+  let wasSummarized = false;
+
+  if (textForAudio.length > maxLength) {
+    if (!isExtensionHostTtsSummarizationEnabled(prefsPath)) {
+      logVerbose(
+        `TTS: truncating long text (${textForAudio.length} > ${maxLength}), summarization disabled.`,
+      );
+      textForAudio = truncateForTts(textForAudio, maxLength);
+    } else {
+      try {
+        const { summary } = await summarizeText({
+          text: textForAudio,
+          targetLength: maxLength,
+          cfg,
+          config,
+          timeoutMs: config.timeoutMs,
+        });
+        textForAudio = summary;
+        wasSummarized = true;
+        // The summary can overshoot its target; enforce the configured hard cap.
+        if (textForAudio.length > config.maxTextLength) {
+          logVerbose(
+            `TTS: summary exceeded hard limit (${textForAudio.length} > ${config.maxTextLength}); truncating.`,
+          );
+          textForAudio = truncateForTts(textForAudio, config.maxTextLength);
+        }
+      } catch (err) {
+        // Thrown values need not be Errors; only read `.message` from real ones.
+        const reason = err instanceof Error ? err.message : String(err);
+        logVerbose(`TTS: summarization failed, truncating instead: ${reason}`);
+        textForAudio = truncateForTts(textForAudio, maxLength);
+      }
+    }
+  }
+
+  // Markdown would otherwise be read aloud; stripping can shrink the text below the minimum.
+  textForAudio = stripMarkdown(textForAudio).trim();
+  if (textForAudio.length < MIN_TTS_TEXT_LENGTH) {
+    return skip;
+  }
+
+  const { overrides } = directives;
+  return { kind: "ready", nextPayload, textForAudio, wasSummarized, overrides };
+}
--- a/src/tts/tts.ts
+++ b/src/tts/tts.ts
@@ -8,6 +8,7 @@ import type {
  TtsProvider,
  TtsModelOverrideConfig,
 } from "../config/types.tts.js";
+import { resolveExtensionHostTtsPayloadPlan } from "../extension-host/tts-payload.js";
 import {
  getExtensionHostTtsMaxLength,
  isExtensionHostTtsEnabled,
@@ -39,7 +40,6 @@ import {
  resolveExtensionHostTtsRequestSetup,
 } from "../extension-host/tts-runtime-setup.js";
 import { logVerbose } from "../globals.js";
-import { stripMarkdown } from "../line/markdown-to-line.js";
 import {
  DEFAULT_OPENAI_BASE_URL,
  isValidOpenAIModel,
@@ -47,8 +47,8 @@ import {
  isValidVoiceId,
  OPENAI_TTS_MODELS,
  OPENAI_TTS_VOICES,
-  resolveOpenAITtsInstructions,
  parseTtsDirectives,
+  resolveOpenAITtsInstructions,
  summarizeText,
 } from "./tts-core.js";
 export { OPENAI_TTS_MODELS, OPENAI_TTS_VOICES } from "./tts-core.js";
@@ -305,7 +305,7 @@ export function buildTtsSystemPromptHint(cfg: OpenClawConfig): string | undefine
    return undefined;
  }
  const maxLength = getExtensionHostTtsMaxLength(prefsPath);
-  const summarize = isExtensionHostTtsSummarizationEnabled(prefsPath) ? "on" : "off";
+  const summarize = isSummarizationEnabled(prefsPath) ? "on" : "off";
  const autoHint =
    autoMode === "inbound"
      ? "Only use TTS when the user's last message includes audio/voice."
@@ -417,114 +417,34 @@ export async function maybeApplyTtsToPayload(params: {
 }): Promise<ReplyPayload> {
  const config = resolveTtsConfig(params.cfg);
  const prefsPath = resolveTtsPrefsPath(config);
-  const autoMode = resolveTtsAutoMode({
+  const plan = await resolveExtensionHostTtsPayloadPlan({
+    payload: params.payload,
+    cfg: params.cfg,
    config,
    prefsPath,
-    sessionAuto: params.ttsAuto,
+    kind: params.kind,
+    inboundAudio: params.inboundAudio,
+    ttsAuto: params.ttsAuto,
  });
-  if (autoMode === "off") {
-    return params.payload;
-  }
-
-  const text = params.payload.text ?? "";
-  const directives = parseTtsDirectives(text, config.modelOverrides, config.openai.baseUrl);
-  if (directives.warnings.length > 0) {
-    logVerbose(`TTS: ignored directive overrides (${directives.warnings.join("; ")})`);
-  }
-
-  const cleanedText = directives.cleanedText;
-  const trimmedCleaned = cleanedText.trim();
-  const visibleText = trimmedCleaned.length > 0 ? trimmedCleaned : "";
-  const ttsText = directives.ttsText?.trim() || visibleText;
-
-  const nextPayload =
-    visibleText === text.trim()
-      ? params.payload
-      : {
-          ...params.payload,
-          text: visibleText.length > 0 ? visibleText : undefined,
-        };
-
-  if (autoMode === "tagged" && !directives.hasDirective) {
-    return nextPayload;
-  }
-  if (autoMode === "inbound" && params.inboundAudio !== true) {
-    return nextPayload;
-  }
-
-  const mode = config.mode ?? "final";
-  if (mode === "final" && params.kind && params.kind !== "final") {
-    return nextPayload;
-  }
-
-  if (!ttsText.trim()) {
-    return nextPayload;
-  }
-  if (params.payload.mediaUrl || (params.payload.mediaUrls?.length ?? 0) > 0) {
-    return nextPayload;
-  }
-  if (text.includes("MEDIA:")) {
-    return nextPayload;
-  }
-  if (ttsText.trim().length < 10) {
-    return nextPayload;
-  }
-
-  const maxLength = getTtsMaxLength(prefsPath);
-  let textForAudio = ttsText.trim();
-  let wasSummarized = false;
-
-  if (textForAudio.length > maxLength) {
-    if (!isSummarizationEnabled(prefsPath)) {
-      logVerbose(
-        `TTS: truncating long text (${textForAudio.length} > ${maxLength}), summarization disabled.`,
-      );
-      textForAudio = `${textForAudio.slice(0, maxLength - 3)}...`;
-    } else {
-      try {
-        const summary = await summarizeText({
-          text: textForAudio,
-          targetLength: maxLength,
-          cfg: params.cfg,
-          config,
-          timeoutMs: config.timeoutMs,
-        });
-        textForAudio = summary.summary;
-        wasSummarized = true;
-        if (textForAudio.length > config.maxTextLength) {
-          logVerbose(
-            `TTS: summary exceeded hard limit (${textForAudio.length} > ${config.maxTextLength}); truncating.`,
-          );
-          textForAudio = `${textForAudio.slice(0, config.maxTextLength - 3)}...`;
-        }
-      } catch (err) {
-        const error = err as Error;
-        logVerbose(`TTS: summarization failed, truncating instead: ${error.message}`);
-        textForAudio = `${textForAudio.slice(0, maxLength - 3)}...`;
-      }
-    }
-  }
-
-  textForAudio = stripMarkdown(textForAudio).trim(); // strip markdown for TTS (### → "hashtag" etc.)
-  if (textForAudio.length < 10) {
-    return nextPayload;
+  if (plan.kind === "skip") {
+    return plan.payload;
  }

  const ttsStart = Date.now();
  const result = await textToSpeech({
-    text: textForAudio,
+    text: plan.textForAudio,
    cfg: params.cfg,
    prefsPath,
    channel: params.channel,
-    overrides: directives.overrides,
+    overrides: plan.overrides,
  });

  if (result.success && result.audioPath) {
    lastTtsAttempt = {
      timestamp: Date.now(),
      success: true,
-      textLength: text.length,
-      summarized: wasSummarized,
+      textLength: (params.payload.text ?? "").length,
+      summarized: plan.wasSummarized,
      provider: result.provider,
      latencyMs: result.latencyMs,
    };
@@ -532,7 +452,7 @@ export async function maybeApplyTtsToPayload(params: {
    const shouldVoice =
      isExtensionHostTtsVoiceBubbleChannel(params.channel) && result.voiceCompatible === true;
    const finalPayload = {
-      ...nextPayload,
+      ...plan.nextPayload,
      mediaUrl: result.audioPath,
      audioAsVoice: shouldVoice || params.payload.audioAsVoice,
    };
@@ -542,8 +462,8 @@ export async function maybeApplyTtsToPayload(params: {
  lastTtsAttempt = {
    timestamp: Date.now(),
    success: false,
-    textLength: text.length,
-    summarized: wasSummarized,
+    textLength: (params.payload.text ?? "").length,
+    summarized: plan.wasSummarized,
    error: result.error,
  };