Files
openclaw/extensions/speech-core/src/tts.ts
2026-03-28 12:56:27 +00:00

945 lines
27 KiB
TypeScript

import { randomBytes } from "node:crypto";
import {
existsSync,
mkdirSync,
readFileSync,
writeFileSync,
mkdtempSync,
renameSync,
unlinkSync,
} from "node:fs";
import path from "node:path";
import { normalizeChannelId, type ChannelId } from "openclaw/plugin-sdk/channel-runtime";
import type {
OpenClawConfig,
TtsAutoMode,
TtsConfig,
TtsMode,
TtsModelOverrideConfig,
TtsProvider,
} from "openclaw/plugin-sdk/config-runtime";
import { resolveSendableOutboundReplyParts } from "openclaw/plugin-sdk/reply-payload";
import type { ReplyPayload } from "openclaw/plugin-sdk/reply-runtime";
import { logVerbose } from "openclaw/plugin-sdk/runtime-env";
import { resolvePreferredOpenClawTmpDir } from "openclaw/plugin-sdk/sandbox";
import { CONFIG_DIR, resolveUserPath, stripMarkdown } from "openclaw/plugin-sdk/text-runtime";
import {
canonicalizeSpeechProviderId,
getSpeechProvider,
listSpeechProviders,
normalizeSpeechProviderId,
normalizeTtsAutoMode,
parseTtsDirectives,
scheduleCleanup,
summarizeText,
type SpeechModelOverridePolicy,
type SpeechProviderConfig,
type SpeechVoiceOption,
type TtsDirectiveOverrides,
type TtsDirectiveParseResult,
} from "../api.js";
export type { TtsDirectiveOverrides, TtsDirectiveParseResult };
const DEFAULT_TIMEOUT_MS = 30_000;
const DEFAULT_TTS_MAX_LENGTH = 1500;
const DEFAULT_TTS_SUMMARIZE = true;
const DEFAULT_MAX_TEXT_LENGTH = 4096;
export type ResolvedTtsConfig = {
auto: TtsAutoMode;
mode: TtsMode;
provider: TtsProvider;
providerSource: "config" | "default";
summaryModel?: string;
modelOverrides: ResolvedTtsModelOverrides;
providerConfigs: Record<string, SpeechProviderConfig>;
prefsPath?: string;
maxTextLength: number;
timeoutMs: number;
rawConfig?: TtsConfig;
sourceConfig?: OpenClawConfig;
};
type TtsUserPrefs = {
tts?: {
auto?: TtsAutoMode;
enabled?: boolean;
provider?: TtsProvider;
maxLength?: number;
summarize?: boolean;
};
};
export type ResolvedTtsModelOverrides = SpeechModelOverridePolicy;
export type TtsResult = {
success: boolean;
audioPath?: string;
error?: string;
latencyMs?: number;
provider?: string;
outputFormat?: string;
voiceCompatible?: boolean;
};
export type TtsSynthesisResult = {
success: boolean;
audioBuffer?: Buffer;
error?: string;
latencyMs?: number;
provider?: string;
outputFormat?: string;
voiceCompatible?: boolean;
fileExtension?: string;
};
export type TtsTelephonyResult = {
success: boolean;
audioBuffer?: Buffer;
error?: string;
latencyMs?: number;
provider?: string;
outputFormat?: string;
sampleRate?: number;
};
type TtsStatusEntry = {
timestamp: number;
success: boolean;
textLength: number;
summarized: boolean;
provider?: string;
latencyMs?: number;
error?: string;
};
let lastTtsAttempt: TtsStatusEntry | undefined;
function resolveConfiguredTtsAutoMode(raw: TtsConfig): TtsAutoMode {
return normalizeTtsAutoMode(raw.auto) ?? (raw.enabled ? "always" : "off");
}
function normalizeConfiguredSpeechProviderId(
providerId: string | undefined,
): TtsProvider | undefined {
const normalized = normalizeSpeechProviderId(providerId);
if (!normalized) {
return undefined;
}
return normalized === "edge" ? "microsoft" : normalized;
}
function resolveTtsPrefsPathValue(prefsPath: string | undefined): string {
if (prefsPath?.trim()) {
return resolveUserPath(prefsPath.trim());
}
const envPath = process.env.OPENCLAW_TTS_PREFS?.trim();
if (envPath) {
return resolveUserPath(envPath);
}
return path.join(CONFIG_DIR, "settings", "tts.json");
}
function resolveModelOverridePolicy(
overrides: TtsModelOverrideConfig | undefined,
): ResolvedTtsModelOverrides {
const enabled = overrides?.enabled ?? true;
if (!enabled) {
return {
enabled: false,
allowText: false,
allowProvider: false,
allowVoice: false,
allowModelId: false,
allowVoiceSettings: false,
allowNormalization: false,
allowSeed: false,
};
}
const allow = (value: boolean | undefined, defaultValue = true) => value ?? defaultValue;
return {
enabled: true,
allowText: allow(overrides?.allowText),
allowProvider: allow(overrides?.allowProvider, false),
allowVoice: allow(overrides?.allowVoice),
allowModelId: allow(overrides?.allowModelId),
allowVoiceSettings: allow(overrides?.allowVoiceSettings),
allowNormalization: allow(overrides?.allowNormalization),
allowSeed: allow(overrides?.allowSeed),
};
}
function sortSpeechProvidersForAutoSelection(cfg?: OpenClawConfig) {
return listSpeechProviders(cfg).toSorted((left, right) => {
const leftOrder = left.autoSelectOrder ?? Number.MAX_SAFE_INTEGER;
const rightOrder = right.autoSelectOrder ?? Number.MAX_SAFE_INTEGER;
if (leftOrder !== rightOrder) {
return leftOrder - rightOrder;
}
return left.id.localeCompare(right.id);
});
}
function resolveRegistryDefaultSpeechProviderId(cfg?: OpenClawConfig): TtsProvider {
return sortSpeechProvidersForAutoSelection(cfg)[0]?.id ?? "";
}
function asProviderConfig(value: unknown): SpeechProviderConfig {
return typeof value === "object" && value !== null && !Array.isArray(value)
? (value as SpeechProviderConfig)
: {};
}
function asProviderConfigMap(value: unknown): Record<string, unknown> {
return typeof value === "object" && value !== null && !Array.isArray(value)
? (value as Record<string, unknown>)
: {};
}
function resolveRawProviderConfig(
raw: TtsConfig | undefined,
providerId: string,
): SpeechProviderConfig {
if (!raw) {
return {};
}
const rawProviders = asProviderConfigMap(raw.providers);
const direct = rawProviders[providerId] ?? (raw as Record<string, unknown>)[providerId];
return asProviderConfig(direct);
}
function resolveLazyProviderConfig(
config: ResolvedTtsConfig,
providerId: string,
cfg?: OpenClawConfig,
): SpeechProviderConfig {
const canonical =
normalizeConfiguredSpeechProviderId(providerId) ?? providerId.trim().toLowerCase();
const existing = config.providerConfigs[canonical];
const effectiveCfg = cfg ?? config.sourceConfig;
if (existing && !effectiveCfg) {
return existing;
}
const rawConfig = resolveRawProviderConfig(config.rawConfig, canonical);
const resolvedProvider = getSpeechProvider(canonical, effectiveCfg);
const next =
effectiveCfg && resolvedProvider?.resolveConfig
? resolvedProvider.resolveConfig({
cfg: effectiveCfg,
rawConfig: {
...(config.rawConfig as Record<string, unknown> | undefined),
providers: asProviderConfigMap(config.rawConfig?.providers),
},
timeoutMs: config.timeoutMs,
})
: rawConfig;
config.providerConfigs[canonical] = next;
return next;
}
function collectDirectProviderConfigEntries(raw: TtsConfig): Record<string, SpeechProviderConfig> {
const entries: Record<string, SpeechProviderConfig> = {};
const rawProviders = asProviderConfigMap(raw.providers);
for (const [providerId, value] of Object.entries(rawProviders)) {
const normalized = normalizeConfiguredSpeechProviderId(providerId) ?? providerId;
entries[normalized] = asProviderConfig(value);
}
const reservedKeys = new Set([
"auto",
"enabled",
"maxTextLength",
"mode",
"modelOverrides",
"prefsPath",
"provider",
"providers",
"summaryModel",
"timeoutMs",
]);
for (const [key, value] of Object.entries(raw as Record<string, unknown>)) {
if (reservedKeys.has(key)) {
continue;
}
if (typeof value !== "object" || value === null || Array.isArray(value)) {
continue;
}
const normalized = normalizeConfiguredSpeechProviderId(key) ?? key;
entries[normalized] ??= asProviderConfig(value);
}
return entries;
}
export function getResolvedSpeechProviderConfig(
config: ResolvedTtsConfig,
providerId: string,
cfg?: OpenClawConfig,
): SpeechProviderConfig {
const canonical =
canonicalizeSpeechProviderId(providerId, cfg) ??
normalizeConfiguredSpeechProviderId(providerId) ??
providerId.trim().toLowerCase();
return resolveLazyProviderConfig(config, canonical, cfg);
}
export function resolveTtsConfig(cfg: OpenClawConfig): ResolvedTtsConfig {
const raw: TtsConfig = cfg.messages?.tts ?? {};
const providerSource = raw.provider ? "config" : "default";
const timeoutMs = raw.timeoutMs ?? DEFAULT_TIMEOUT_MS;
const auto = resolveConfiguredTtsAutoMode(raw);
return {
auto,
mode: raw.mode ?? "final",
provider:
normalizeConfiguredSpeechProviderId(raw.provider) ??
(providerSource === "config" ? raw.provider?.trim().toLowerCase() || "" : ""),
providerSource,
summaryModel: raw.summaryModel?.trim() || undefined,
modelOverrides: resolveModelOverridePolicy(raw.modelOverrides),
providerConfigs: collectDirectProviderConfigEntries(raw),
prefsPath: raw.prefsPath,
maxTextLength: raw.maxTextLength ?? DEFAULT_MAX_TEXT_LENGTH,
timeoutMs,
rawConfig: raw,
sourceConfig: cfg,
};
}
export function resolveTtsPrefsPath(config: ResolvedTtsConfig): string {
return resolveTtsPrefsPathValue(config.prefsPath);
}
function resolveTtsAutoModeFromPrefs(prefs: TtsUserPrefs): TtsAutoMode | undefined {
const auto = normalizeTtsAutoMode(prefs.tts?.auto);
if (auto) {
return auto;
}
if (typeof prefs.tts?.enabled === "boolean") {
return prefs.tts.enabled ? "always" : "off";
}
return undefined;
}
export function resolveTtsAutoMode(params: {
config: ResolvedTtsConfig;
prefsPath: string;
sessionAuto?: string;
}): TtsAutoMode {
const sessionAuto = normalizeTtsAutoMode(params.sessionAuto);
if (sessionAuto) {
return sessionAuto;
}
const prefsAuto = resolveTtsAutoModeFromPrefs(readPrefs(params.prefsPath));
if (prefsAuto) {
return prefsAuto;
}
return params.config.auto;
}
function resolveEffectiveTtsAutoState(params: { cfg: OpenClawConfig; sessionAuto?: string }): {
autoMode: TtsAutoMode;
prefsPath: string;
} {
const raw: TtsConfig = params.cfg.messages?.tts ?? {};
const prefsPath = resolveTtsPrefsPathValue(raw.prefsPath);
const sessionAuto = normalizeTtsAutoMode(params.sessionAuto);
if (sessionAuto) {
return { autoMode: sessionAuto, prefsPath };
}
const prefsAuto = resolveTtsAutoModeFromPrefs(readPrefs(prefsPath));
if (prefsAuto) {
return { autoMode: prefsAuto, prefsPath };
}
return {
autoMode: resolveConfiguredTtsAutoMode(raw),
prefsPath,
};
}
export function buildTtsSystemPromptHint(cfg: OpenClawConfig): string | undefined {
const { autoMode, prefsPath } = resolveEffectiveTtsAutoState({ cfg });
if (autoMode === "off") {
return undefined;
}
const config = resolveTtsConfig(cfg);
const maxLength = getTtsMaxLength(prefsPath);
const summarize = isSummarizationEnabled(prefsPath) ? "on" : "off";
const autoHint =
autoMode === "inbound"
? "Only use TTS when the user's last message includes audio/voice."
: autoMode === "tagged"
? "Only use TTS when you include [[tts]] or [[tts:text]] tags."
: undefined;
return [
"Voice (TTS) is enabled.",
autoHint,
`Keep spoken text ≤${maxLength} chars to avoid auto-summary (summary ${summarize}).`,
"Use [[tts:...]] and optional [[tts:text]]...[[/tts:text]] to control voice/expressiveness.",
]
.filter(Boolean)
.join("\n");
}
function readPrefs(prefsPath: string): TtsUserPrefs {
try {
if (!existsSync(prefsPath)) {
return {};
}
return JSON.parse(readFileSync(prefsPath, "utf8")) as TtsUserPrefs;
} catch {
return {};
}
}
function atomicWriteFileSync(filePath: string, content: string): void {
const tmpPath = `${filePath}.tmp.${Date.now()}.${randomBytes(8).toString("hex")}`;
writeFileSync(tmpPath, content, { mode: 0o600 });
try {
renameSync(tmpPath, filePath);
} catch (err) {
try {
unlinkSync(tmpPath);
} catch {
// ignore
}
throw err;
}
}
function updatePrefs(prefsPath: string, update: (prefs: TtsUserPrefs) => void): void {
const prefs = readPrefs(prefsPath);
update(prefs);
mkdirSync(path.dirname(prefsPath), { recursive: true });
atomicWriteFileSync(prefsPath, JSON.stringify(prefs, null, 2));
}
export function isTtsEnabled(
config: ResolvedTtsConfig,
prefsPath: string,
sessionAuto?: string,
): boolean {
return resolveTtsAutoMode({ config, prefsPath, sessionAuto }) !== "off";
}
export function setTtsAutoMode(prefsPath: string, mode: TtsAutoMode): void {
updatePrefs(prefsPath, (prefs) => {
const next = { ...prefs.tts };
delete next.enabled;
next.auto = mode;
prefs.tts = next;
});
}
export function setTtsEnabled(prefsPath: string, enabled: boolean): void {
setTtsAutoMode(prefsPath, enabled ? "always" : "off");
}
export function getTtsProvider(config: ResolvedTtsConfig, prefsPath: string): TtsProvider {
const prefs = readPrefs(prefsPath);
const prefsProvider =
canonicalizeSpeechProviderId(prefs.tts?.provider) ??
normalizeConfiguredSpeechProviderId(prefs.tts?.provider);
if (prefsProvider) {
return prefsProvider;
}
if (config.providerSource === "config") {
return normalizeConfiguredSpeechProviderId(config.provider) ?? config.provider;
}
for (const provider of sortSpeechProvidersForAutoSelection()) {
if (
provider.isConfigured({
providerConfig: config.providerConfigs[provider.id] ?? {},
timeoutMs: config.timeoutMs,
})
) {
return provider.id;
}
}
return config.provider;
}
export function setTtsProvider(prefsPath: string, provider: TtsProvider): void {
updatePrefs(prefsPath, (prefs) => {
prefs.tts = { ...prefs.tts, provider: canonicalizeSpeechProviderId(provider) ?? provider };
});
}
export function getTtsMaxLength(prefsPath: string): number {
const prefs = readPrefs(prefsPath);
return prefs.tts?.maxLength ?? DEFAULT_TTS_MAX_LENGTH;
}
export function setTtsMaxLength(prefsPath: string, maxLength: number): void {
updatePrefs(prefsPath, (prefs) => {
prefs.tts = { ...prefs.tts, maxLength };
});
}
export function isSummarizationEnabled(prefsPath: string): boolean {
const prefs = readPrefs(prefsPath);
return prefs.tts?.summarize ?? DEFAULT_TTS_SUMMARIZE;
}
export function setSummarizationEnabled(prefsPath: string, enabled: boolean): void {
updatePrefs(prefsPath, (prefs) => {
prefs.tts = { ...prefs.tts, summarize: enabled };
});
}
export function getLastTtsAttempt(): TtsStatusEntry | undefined {
return lastTtsAttempt;
}
export function setLastTtsAttempt(entry: TtsStatusEntry | undefined): void {
lastTtsAttempt = entry;
}
const OPUS_CHANNELS = new Set(["telegram", "feishu", "whatsapp", "matrix"]);
function resolveChannelId(channel: string | undefined): ChannelId | null {
return channel ? normalizeChannelId(channel) : null;
}
export function resolveTtsProviderOrder(primary: TtsProvider, cfg?: OpenClawConfig): TtsProvider[] {
const normalizedPrimary = canonicalizeSpeechProviderId(primary, cfg) ?? primary;
const ordered = new Set<TtsProvider>([normalizedPrimary]);
for (const provider of sortSpeechProvidersForAutoSelection(cfg)) {
const normalized = provider.id;
if (normalized !== normalizedPrimary) {
ordered.add(normalized);
}
}
return [...ordered];
}
export function isTtsProviderConfigured(
config: ResolvedTtsConfig,
provider: TtsProvider,
cfg?: OpenClawConfig,
): boolean {
const resolvedProvider = getSpeechProvider(provider, cfg);
if (!resolvedProvider) {
return false;
}
return (
resolvedProvider.isConfigured({
cfg,
providerConfig: getResolvedSpeechProviderConfig(config, resolvedProvider.id, cfg),
timeoutMs: config.timeoutMs,
}) ?? false
);
}
function formatTtsProviderError(provider: TtsProvider, err: unknown): string {
const error = err instanceof Error ? err : new Error(String(err));
if (error.name === "AbortError") {
return `${provider}: request timed out`;
}
return `${provider}: ${error.message}`;
}
function buildTtsFailureResult(errors: string[]): { success: false; error: string } {
return {
success: false,
error: `TTS conversion failed: ${errors.join("; ") || "no providers available"}`,
};
}
function resolveReadySpeechProvider(params: {
provider: TtsProvider;
cfg: OpenClawConfig;
config: ResolvedTtsConfig;
errors: string[];
requireTelephony?: boolean;
}): NonNullable<ReturnType<typeof getSpeechProvider>> | null {
const resolvedProvider = getSpeechProvider(params.provider, params.cfg);
if (!resolvedProvider) {
params.errors.push(`${params.provider}: no provider registered`);
return null;
}
const providerConfig = getResolvedSpeechProviderConfig(
params.config,
resolvedProvider.id,
params.cfg,
);
if (
!resolvedProvider.isConfigured({
cfg: params.cfg,
providerConfig,
timeoutMs: params.config.timeoutMs,
})
) {
params.errors.push(`${params.provider}: not configured`);
return null;
}
if (params.requireTelephony && !resolvedProvider.synthesizeTelephony) {
params.errors.push(`${params.provider}: unsupported for telephony`);
return null;
}
return resolvedProvider;
}
function resolveTtsRequestSetup(params: {
text: string;
cfg: OpenClawConfig;
prefsPath?: string;
providerOverride?: TtsProvider;
disableFallback?: boolean;
}):
| {
config: ResolvedTtsConfig;
providers: TtsProvider[];
}
| {
error: string;
} {
const config = resolveTtsConfig(params.cfg);
const prefsPath = params.prefsPath ?? resolveTtsPrefsPath(config);
if (params.text.length > config.maxTextLength) {
return {
error: `Text too long (${params.text.length} chars, max ${config.maxTextLength})`,
};
}
const userProvider = getTtsProvider(config, prefsPath);
const provider =
canonicalizeSpeechProviderId(params.providerOverride, params.cfg) ?? userProvider;
return {
config,
providers: params.disableFallback ? [provider] : resolveTtsProviderOrder(provider, params.cfg),
};
}
export async function textToSpeech(params: {
text: string;
cfg: OpenClawConfig;
prefsPath?: string;
channel?: string;
overrides?: TtsDirectiveOverrides;
disableFallback?: boolean;
}): Promise<TtsResult> {
const synthesis = await synthesizeSpeech(params);
if (!synthesis.success || !synthesis.audioBuffer || !synthesis.fileExtension) {
return buildTtsFailureResult([synthesis.error ?? "TTS conversion failed"]);
}
const tempRoot = resolvePreferredOpenClawTmpDir();
mkdirSync(tempRoot, { recursive: true, mode: 0o700 });
const tempDir = mkdtempSync(path.join(tempRoot, "tts-"));
const audioPath = path.join(tempDir, `voice-${Date.now()}${synthesis.fileExtension}`);
writeFileSync(audioPath, synthesis.audioBuffer);
scheduleCleanup(tempDir);
return {
success: true,
audioPath,
latencyMs: synthesis.latencyMs,
provider: synthesis.provider,
outputFormat: synthesis.outputFormat,
voiceCompatible: synthesis.voiceCompatible,
};
}
export async function synthesizeSpeech(params: {
text: string;
cfg: OpenClawConfig;
prefsPath?: string;
channel?: string;
overrides?: TtsDirectiveOverrides;
disableFallback?: boolean;
}): Promise<TtsSynthesisResult> {
const setup = resolveTtsRequestSetup({
text: params.text,
cfg: params.cfg,
prefsPath: params.prefsPath,
providerOverride: params.overrides?.provider,
disableFallback: params.disableFallback,
});
if ("error" in setup) {
return { success: false, error: setup.error };
}
const { config, providers } = setup;
const channelId = resolveChannelId(params.channel);
const target = channelId && OPUS_CHANNELS.has(channelId) ? "voice-note" : "audio-file";
const errors: string[] = [];
for (const provider of providers) {
const providerStart = Date.now();
try {
const resolvedProvider = resolveReadySpeechProvider({
provider,
cfg: params.cfg,
config,
errors,
});
if (!resolvedProvider) {
continue;
}
const synthesis = await resolvedProvider.synthesize({
text: params.text,
cfg: params.cfg,
providerConfig: getResolvedSpeechProviderConfig(config, resolvedProvider.id, params.cfg),
target,
providerOverrides: params.overrides?.providerOverrides?.[resolvedProvider.id],
timeoutMs: config.timeoutMs,
});
return {
success: true,
audioBuffer: synthesis.audioBuffer,
latencyMs: Date.now() - providerStart,
provider,
outputFormat: synthesis.outputFormat,
voiceCompatible: synthesis.voiceCompatible,
fileExtension: synthesis.fileExtension,
};
} catch (err) {
errors.push(formatTtsProviderError(provider, err));
}
}
return buildTtsFailureResult(errors);
}
export async function textToSpeechTelephony(params: {
text: string;
cfg: OpenClawConfig;
prefsPath?: string;
}): Promise<TtsTelephonyResult> {
const setup = resolveTtsRequestSetup({
text: params.text,
cfg: params.cfg,
prefsPath: params.prefsPath,
});
if ("error" in setup) {
return { success: false, error: setup.error };
}
const { config, providers } = setup;
const errors: string[] = [];
for (const provider of providers) {
const providerStart = Date.now();
try {
const resolvedProvider = resolveReadySpeechProvider({
provider,
cfg: params.cfg,
config,
errors,
requireTelephony: true,
});
if (!resolvedProvider?.synthesizeTelephony) {
continue;
}
const synthesis = await resolvedProvider.synthesizeTelephony({
text: params.text,
cfg: params.cfg,
providerConfig: getResolvedSpeechProviderConfig(config, resolvedProvider.id, params.cfg),
timeoutMs: config.timeoutMs,
});
return {
success: true,
audioBuffer: synthesis.audioBuffer,
latencyMs: Date.now() - providerStart,
provider,
outputFormat: synthesis.outputFormat,
sampleRate: synthesis.sampleRate,
};
} catch (err) {
errors.push(formatTtsProviderError(provider, err));
}
}
return buildTtsFailureResult(errors);
}
export async function listSpeechVoices(params: {
provider: string;
cfg?: OpenClawConfig;
config?: ResolvedTtsConfig;
apiKey?: string;
baseUrl?: string;
}): Promise<SpeechVoiceOption[]> {
const provider = canonicalizeSpeechProviderId(params.provider, params.cfg);
if (!provider) {
throw new Error("speech provider id is required");
}
const config = params.config ?? (params.cfg ? resolveTtsConfig(params.cfg) : undefined);
if (!config) {
throw new Error(`speech provider ${provider} requires cfg or resolved config`);
}
const resolvedProvider = getSpeechProvider(provider, params.cfg);
if (!resolvedProvider) {
throw new Error(`speech provider ${provider} is not registered`);
}
if (!resolvedProvider.listVoices) {
throw new Error(`speech provider ${provider} does not support voice listing`);
}
return await resolvedProvider.listVoices({
cfg: params.cfg,
providerConfig: getResolvedSpeechProviderConfig(config, resolvedProvider.id, params.cfg),
apiKey: params.apiKey,
baseUrl: params.baseUrl,
});
}
export async function maybeApplyTtsToPayload(params: {
payload: ReplyPayload;
cfg: OpenClawConfig;
channel?: string;
kind?: "tool" | "block" | "final";
inboundAudio?: boolean;
ttsAuto?: string;
}): Promise<ReplyPayload> {
if (params.payload.isCompactionNotice) {
return params.payload;
}
const { autoMode, prefsPath } = resolveEffectiveTtsAutoState({
cfg: params.cfg,
sessionAuto: params.ttsAuto,
});
if (autoMode === "off") {
return params.payload;
}
const config = resolveTtsConfig(params.cfg);
const reply = resolveSendableOutboundReplyParts(params.payload);
const text = reply.text;
const directives = parseTtsDirectives(text, config.modelOverrides, {
cfg: params.cfg,
providerConfigs: config.providerConfigs,
});
if (directives.warnings.length > 0) {
logVerbose(`TTS: ignored directive overrides (${directives.warnings.join("; ")})`);
}
const cleanedText = directives.cleanedText;
const trimmedCleaned = cleanedText.trim();
const visibleText = trimmedCleaned.length > 0 ? trimmedCleaned : "";
const ttsText = directives.ttsText?.trim() || visibleText;
const nextPayload =
visibleText === text.trim()
? params.payload
: {
...params.payload,
text: visibleText.length > 0 ? visibleText : undefined,
};
if (autoMode === "tagged" && !directives.hasDirective) {
return nextPayload;
}
if (autoMode === "inbound" && params.inboundAudio !== true) {
return nextPayload;
}
const mode = config.mode ?? "final";
if (mode === "final" && params.kind && params.kind !== "final") {
return nextPayload;
}
if (!ttsText.trim()) {
return nextPayload;
}
if (reply.hasMedia) {
return nextPayload;
}
if (text.includes("MEDIA:")) {
return nextPayload;
}
if (ttsText.trim().length < 10) {
return nextPayload;
}
const maxLength = getTtsMaxLength(prefsPath);
let textForAudio = ttsText.trim();
let wasSummarized = false;
if (textForAudio.length > maxLength) {
if (!isSummarizationEnabled(prefsPath)) {
logVerbose(
`TTS: truncating long text (${textForAudio.length} > ${maxLength}), summarization disabled.`,
);
textForAudio = `${textForAudio.slice(0, maxLength - 3)}...`;
} else {
try {
const summary = await summarizeText({
text: textForAudio,
targetLength: maxLength,
cfg: params.cfg,
config,
timeoutMs: config.timeoutMs,
});
textForAudio = summary.summary;
wasSummarized = true;
if (textForAudio.length > config.maxTextLength) {
logVerbose(
`TTS: summary exceeded hard limit (${textForAudio.length} > ${config.maxTextLength}); truncating.`,
);
textForAudio = `${textForAudio.slice(0, config.maxTextLength - 3)}...`;
}
} catch (err) {
const error = err as Error;
logVerbose(`TTS: summarization failed, truncating instead: ${error.message}`);
textForAudio = `${textForAudio.slice(0, maxLength - 3)}...`;
}
}
}
textForAudio = stripMarkdown(textForAudio).trim();
if (textForAudio.length < 10) {
return nextPayload;
}
const ttsStart = Date.now();
const result = await textToSpeech({
text: textForAudio,
cfg: params.cfg,
prefsPath,
channel: params.channel,
overrides: directives.overrides,
});
if (result.success && result.audioPath) {
lastTtsAttempt = {
timestamp: Date.now(),
success: true,
textLength: text.length,
summarized: wasSummarized,
provider: result.provider,
latencyMs: result.latencyMs,
};
const channelId = resolveChannelId(params.channel);
const shouldVoice =
channelId !== null && OPUS_CHANNELS.has(channelId) && result.voiceCompatible === true;
return {
...nextPayload,
mediaUrl: result.audioPath,
audioAsVoice: shouldVoice || params.payload.audioAsVoice,
};
}
lastTtsAttempt = {
timestamp: Date.now(),
success: false,
textLength: text.length,
summarized: wasSummarized,
error: result.error,
};
const latency = Date.now() - ttsStart;
logVerbose(`TTS: conversion failed after ${latency}ms (${result.error ?? "unknown"}).`);
return nextPayload;
}
export const _test = {
parseTtsDirectives,
resolveModelOverridePolicy,
summarizeText,
getResolvedSpeechProviderConfig,
};