mirror of
https://github.com/openclaw/openclaw.git
synced 2026-03-25 00:42:24 +00:00
TTS: extract payload planning
This commit is contained in:
139
src/extension-host/tts-payload.ts
Normal file
139
src/extension-host/tts-payload.ts
Normal file
@@ -0,0 +1,139 @@
|
||||
import type { ReplyPayload } from "../auto-reply/types.js";
|
||||
import type { OpenClawConfig } from "../config/config.js";
|
||||
import { logVerbose } from "../globals.js";
|
||||
import { stripMarkdown } from "../line/markdown-to-line.js";
|
||||
import { parseTtsDirectives, summarizeText } from "../tts/tts-core.js";
|
||||
import type { ResolvedTtsConfig, TtsDirectiveOverrides } from "../tts/tts.js";
|
||||
import {
|
||||
getExtensionHostTtsMaxLength,
|
||||
isExtensionHostTtsSummarizationEnabled,
|
||||
resolveExtensionHostTtsAutoMode,
|
||||
} from "./tts-preferences.js";
|
||||
|
||||
/** Outcome when no audio should be synthesized; `payload` is what the caller should send as-is. */
type ExtensionHostTtsSkipPlan = {
  kind: "skip";
  payload: ReplyPayload;
};

/** Outcome when audio should be synthesized from `textForAudio`. */
type ExtensionHostTtsReadyPlan = {
  kind: "ready";
  /** Reply payload to attach the generated audio to. */
  nextPayload: ReplyPayload;
  /** Text to hand to the speech synthesizer. */
  textForAudio: string;
  /** Whether `textForAudio` came from a summarization pass. */
  wasSummarized: boolean;
  /** Directive overrides to forward to the synthesizer. */
  overrides: TtsDirectiveOverrides;
};

/**
 * Result of planning TTS for a reply payload, discriminated on `kind`:
 * either skip synthesis entirely or proceed with prepared text.
 */
export type ExtensionHostTtsPayloadPlan = ExtensionHostTtsSkipPlan | ExtensionHostTtsReadyPlan;
|
||||
|
||||
/**
 * Decides whether a reply payload should be voiced and, if so, prepares the
 * text that will be sent to speech synthesis.
 *
 * The plan is `skip` when any of the following holds:
 * - the resolved auto mode is "off" (the original payload is returned untouched);
 * - the auto mode is "tagged" and the text carries no TTS directive;
 * - the auto mode is "inbound" and `inboundAudio` is not `true`;
 * - the configured mode is "final" (the default) and `kind` is set to something else;
 * - the payload already has media (`mediaUrl`, `mediaUrls`, or a "MEDIA:" marker in the text);
 * - the speakable text is shorter than 10 characters, before or after markdown stripping.
 *
 * Otherwise the plan is `ready`. Text longer than the preferences' max length
 * is summarized when summarization is enabled, and truncated with a trailing
 * "..." when it is disabled or the summarization call throws.
 *
 * @param params.payload - Reply payload under consideration.
 * @param params.cfg - Application config, forwarded to the summarizer.
 * @param params.config - Resolved TTS config (mode, overrides, limits, timeout).
 * @param params.prefsPath - Preferences path used to look up auto mode, max length and summarization.
 * @param params.kind - Reply kind; only consulted when the configured mode is "final".
 * @param params.inboundAudio - Whether the inbound message included audio.
 * @param params.ttsAuto - Session-level auto-mode value passed to auto-mode resolution.
 * @returns The skip/ready plan. Except in the "off" case, the plan's payload has
 *   TTS directives removed from its visible text.
 */
export async function resolveExtensionHostTtsPayloadPlan(params: {
  payload: ReplyPayload;
  cfg: OpenClawConfig;
  config: ResolvedTtsConfig;
  prefsPath: string;
  kind?: "tool" | "block" | "final";
  inboundAudio?: boolean;
  ttsAuto?: string;
}): Promise<ExtensionHostTtsPayloadPlan> {
  const { payload, cfg, config, prefsPath } = params;

  const autoMode = resolveExtensionHostTtsAutoMode({
    config,
    prefsPath,
    sessionAuto: params.ttsAuto,
  });
  if (autoMode === "off") {
    return { kind: "skip", payload };
  }

  const text = payload.text ?? "";
  const directives = parseTtsDirectives(text, config.modelOverrides, config.openai.baseUrl);
  if (directives.warnings.length > 0) {
    logVerbose(`TTS: ignored directive overrides (${directives.warnings.join("; ")})`);
  }

  // Text shown to the user: the original text with directives removed.
  const visibleText = directives.cleanedText.trim();
  // Text to speak: an explicit directive-provided text wins over the visible text.
  // Both candidates are already trimmed, so no further trimming is needed below.
  const ttsText = directives.ttsText?.trim() || visibleText;

  // Reuse the original payload object when stripping directives changed nothing.
  const nextPayload: ReplyPayload =
    visibleText === text.trim()
      ? payload
      : { ...payload, text: visibleText.length > 0 ? visibleText : undefined };

  const skip = (): ExtensionHostTtsPayloadPlan => ({ kind: "skip", payload: nextPayload });

  if (autoMode === "tagged" && !directives.hasDirective) {
    return skip();
  }
  if (autoMode === "inbound" && params.inboundAudio !== true) {
    return skip();
  }

  const mode = config.mode ?? "final";
  if (mode === "final" && params.kind && params.kind !== "final") {
    return skip();
  }

  // Anything under this many characters (including empty text) is not worth voicing.
  const minSpeakableLength = 10;
  const hasMedia =
    Boolean(payload.mediaUrl) || (payload.mediaUrls?.length ?? 0) > 0 || text.includes("MEDIA:");
  if (hasMedia || ttsText.length < minSpeakableLength) {
    return skip();
  }

  // Cuts `value` so that, with the trailing ellipsis, it fits within `limit` characters.
  const truncate = (value: string, limit: number): string => `${value.slice(0, limit - 3)}...`;

  const maxLength = getExtensionHostTtsMaxLength(prefsPath);
  let textForAudio = ttsText;
  let wasSummarized = false;

  if (textForAudio.length > maxLength) {
    if (!isExtensionHostTtsSummarizationEnabled(prefsPath)) {
      logVerbose(
        `TTS: truncating long text (${textForAudio.length} > ${maxLength}), summarization disabled.`,
      );
      textForAudio = truncate(textForAudio, maxLength);
    } else {
      try {
        const summary = await summarizeText({
          text: textForAudio,
          targetLength: maxLength,
          cfg,
          config,
          timeoutMs: config.timeoutMs,
        });
        textForAudio = summary.summary;
        wasSummarized = true;
        // The summary is only truncated when it exceeds the configured hard limit.
        if (textForAudio.length > config.maxTextLength) {
          logVerbose(
            `TTS: summary exceeded hard limit (${textForAudio.length} > ${config.maxTextLength}); truncating.`,
          );
          textForAudio = truncate(textForAudio, config.maxTextLength);
        }
      } catch (err: unknown) {
        // Narrow instead of asserting: a non-Error throwable must not log "undefined".
        const message = err instanceof Error ? err.message : String(err);
        logVerbose(`TTS: summarization failed, truncating instead: ${message}`);
        textForAudio = truncate(textForAudio, maxLength);
      }
    }
  }

  // Strip markdown so markup is not read aloud (e.g. "###" spoken as "hashtag").
  textForAudio = stripMarkdown(textForAudio).trim();
  if (textForAudio.length < minSpeakableLength) {
    return skip();
  }

  return {
    kind: "ready",
    nextPayload,
    textForAudio,
    wasSummarized,
    overrides: directives.overrides,
  };
}
|
||||
116
src/tts/tts.ts
116
src/tts/tts.ts
@@ -8,6 +8,7 @@ import type {
|
||||
TtsProvider,
|
||||
TtsModelOverrideConfig,
|
||||
} from "../config/types.tts.js";
|
||||
import { resolveExtensionHostTtsPayloadPlan } from "../extension-host/tts-payload.js";
|
||||
import {
|
||||
getExtensionHostTtsMaxLength,
|
||||
isExtensionHostTtsEnabled,
|
||||
@@ -39,7 +40,6 @@ import {
|
||||
resolveExtensionHostTtsRequestSetup,
|
||||
} from "../extension-host/tts-runtime-setup.js";
|
||||
import { logVerbose } from "../globals.js";
|
||||
import { stripMarkdown } from "../line/markdown-to-line.js";
|
||||
import {
|
||||
DEFAULT_OPENAI_BASE_URL,
|
||||
isValidOpenAIModel,
|
||||
@@ -47,8 +47,8 @@ import {
|
||||
isValidVoiceId,
|
||||
OPENAI_TTS_MODELS,
|
||||
OPENAI_TTS_VOICES,
|
||||
resolveOpenAITtsInstructions,
|
||||
parseTtsDirectives,
|
||||
resolveOpenAITtsInstructions,
|
||||
summarizeText,
|
||||
} from "./tts-core.js";
|
||||
export { OPENAI_TTS_MODELS, OPENAI_TTS_VOICES } from "./tts-core.js";
|
||||
@@ -305,7 +305,7 @@ export function buildTtsSystemPromptHint(cfg: OpenClawConfig): string | undefine
|
||||
return undefined;
|
||||
}
|
||||
const maxLength = getExtensionHostTtsMaxLength(prefsPath);
|
||||
const summarize = isExtensionHostTtsSummarizationEnabled(prefsPath) ? "on" : "off";
|
||||
const summarize = isSummarizationEnabled(prefsPath) ? "on" : "off";
|
||||
const autoHint =
|
||||
autoMode === "inbound"
|
||||
? "Only use TTS when the user's last message includes audio/voice."
|
||||
@@ -417,114 +417,34 @@ export async function maybeApplyTtsToPayload(params: {
|
||||
}): Promise<ReplyPayload> {
|
||||
const config = resolveTtsConfig(params.cfg);
|
||||
const prefsPath = resolveTtsPrefsPath(config);
|
||||
const autoMode = resolveTtsAutoMode({
|
||||
const plan = await resolveExtensionHostTtsPayloadPlan({
|
||||
payload: params.payload,
|
||||
cfg: params.cfg,
|
||||
config,
|
||||
prefsPath,
|
||||
sessionAuto: params.ttsAuto,
|
||||
kind: params.kind,
|
||||
inboundAudio: params.inboundAudio,
|
||||
ttsAuto: params.ttsAuto,
|
||||
});
|
||||
if (autoMode === "off") {
|
||||
return params.payload;
|
||||
}
|
||||
|
||||
const text = params.payload.text ?? "";
|
||||
const directives = parseTtsDirectives(text, config.modelOverrides, config.openai.baseUrl);
|
||||
if (directives.warnings.length > 0) {
|
||||
logVerbose(`TTS: ignored directive overrides (${directives.warnings.join("; ")})`);
|
||||
}
|
||||
|
||||
const cleanedText = directives.cleanedText;
|
||||
const trimmedCleaned = cleanedText.trim();
|
||||
const visibleText = trimmedCleaned.length > 0 ? trimmedCleaned : "";
|
||||
const ttsText = directives.ttsText?.trim() || visibleText;
|
||||
|
||||
const nextPayload =
|
||||
visibleText === text.trim()
|
||||
? params.payload
|
||||
: {
|
||||
...params.payload,
|
||||
text: visibleText.length > 0 ? visibleText : undefined,
|
||||
};
|
||||
|
||||
if (autoMode === "tagged" && !directives.hasDirective) {
|
||||
return nextPayload;
|
||||
}
|
||||
if (autoMode === "inbound" && params.inboundAudio !== true) {
|
||||
return nextPayload;
|
||||
}
|
||||
|
||||
const mode = config.mode ?? "final";
|
||||
if (mode === "final" && params.kind && params.kind !== "final") {
|
||||
return nextPayload;
|
||||
}
|
||||
|
||||
if (!ttsText.trim()) {
|
||||
return nextPayload;
|
||||
}
|
||||
if (params.payload.mediaUrl || (params.payload.mediaUrls?.length ?? 0) > 0) {
|
||||
return nextPayload;
|
||||
}
|
||||
if (text.includes("MEDIA:")) {
|
||||
return nextPayload;
|
||||
}
|
||||
if (ttsText.trim().length < 10) {
|
||||
return nextPayload;
|
||||
}
|
||||
|
||||
const maxLength = getTtsMaxLength(prefsPath);
|
||||
let textForAudio = ttsText.trim();
|
||||
let wasSummarized = false;
|
||||
|
||||
if (textForAudio.length > maxLength) {
|
||||
if (!isSummarizationEnabled(prefsPath)) {
|
||||
logVerbose(
|
||||
`TTS: truncating long text (${textForAudio.length} > ${maxLength}), summarization disabled.`,
|
||||
);
|
||||
textForAudio = `${textForAudio.slice(0, maxLength - 3)}...`;
|
||||
} else {
|
||||
try {
|
||||
const summary = await summarizeText({
|
||||
text: textForAudio,
|
||||
targetLength: maxLength,
|
||||
cfg: params.cfg,
|
||||
config,
|
||||
timeoutMs: config.timeoutMs,
|
||||
});
|
||||
textForAudio = summary.summary;
|
||||
wasSummarized = true;
|
||||
if (textForAudio.length > config.maxTextLength) {
|
||||
logVerbose(
|
||||
`TTS: summary exceeded hard limit (${textForAudio.length} > ${config.maxTextLength}); truncating.`,
|
||||
);
|
||||
textForAudio = `${textForAudio.slice(0, config.maxTextLength - 3)}...`;
|
||||
}
|
||||
} catch (err) {
|
||||
const error = err as Error;
|
||||
logVerbose(`TTS: summarization failed, truncating instead: ${error.message}`);
|
||||
textForAudio = `${textForAudio.slice(0, maxLength - 3)}...`;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
textForAudio = stripMarkdown(textForAudio).trim(); // strip markdown for TTS (### → "hashtag" etc.)
|
||||
if (textForAudio.length < 10) {
|
||||
return nextPayload;
|
||||
if (plan.kind === "skip") {
|
||||
return plan.payload;
|
||||
}
|
||||
|
||||
const ttsStart = Date.now();
|
||||
const result = await textToSpeech({
|
||||
text: textForAudio,
|
||||
text: plan.textForAudio,
|
||||
cfg: params.cfg,
|
||||
prefsPath,
|
||||
channel: params.channel,
|
||||
overrides: directives.overrides,
|
||||
overrides: plan.overrides,
|
||||
});
|
||||
|
||||
if (result.success && result.audioPath) {
|
||||
lastTtsAttempt = {
|
||||
timestamp: Date.now(),
|
||||
success: true,
|
||||
textLength: text.length,
|
||||
summarized: wasSummarized,
|
||||
textLength: (params.payload.text ?? "").length,
|
||||
summarized: plan.wasSummarized,
|
||||
provider: result.provider,
|
||||
latencyMs: result.latencyMs,
|
||||
};
|
||||
@@ -532,7 +452,7 @@ export async function maybeApplyTtsToPayload(params: {
|
||||
const shouldVoice =
|
||||
isExtensionHostTtsVoiceBubbleChannel(params.channel) && result.voiceCompatible === true;
|
||||
const finalPayload = {
|
||||
...nextPayload,
|
||||
...plan.nextPayload,
|
||||
mediaUrl: result.audioPath,
|
||||
audioAsVoice: shouldVoice || params.payload.audioAsVoice,
|
||||
};
|
||||
@@ -542,8 +462,8 @@ export async function maybeApplyTtsToPayload(params: {
|
||||
lastTtsAttempt = {
|
||||
timestamp: Date.now(),
|
||||
success: false,
|
||||
textLength: text.length,
|
||||
summarized: wasSummarized,
|
||||
textLength: (params.payload.text ?? "").length,
|
||||
summarized: plan.wasSummarized,
|
||||
error: result.error,
|
||||
};
|
||||
|
||||
|
||||
Reference in New Issue
Block a user