feat(tts): add per-agent voice overrides

This commit is contained in:
Peter Steinberger
2026-04-26 02:45:45 +01:00
parent 1bc9bada65
commit 0ca952cdd5
31 changed files with 605 additions and 34 deletions

View File

@@ -49,7 +49,7 @@ vi.mock("../api.js", async () => {
};
});
const { _test, maybeApplyTtsToPayload } = await import("./tts.js");
const { _test, maybeApplyTtsToPayload, resolveTtsConfig } = await import("./tts.js");
const nativeVoiceNoteChannels = ["discord", "feishu", "matrix", "telegram", "whatsapp"] as const;
@@ -158,3 +158,82 @@ describe("speech-core native voice-note routing", () => {
});
});
});
describe("speech-core per-agent TTS config", () => {
it("deep-merges the active agent TTS override over messages.tts", () => {
const cfg = {
messages: {
tts: {
enabled: true,
provider: "openai",
providers: {
openai: {
apiKey: "${OPENAI_API_KEY}",
voice: "coral",
speed: 1,
},
},
},
},
agents: {
list: [
{
id: "reader",
tts: {
provider: "openai",
providers: {
openai: {
voice: "nova",
},
},
},
},
],
},
} satisfies OpenClawConfig;
const resolved = resolveTtsConfig(cfg, "reader");
expect(resolved.rawConfig).toMatchObject({
enabled: true,
provider: "openai",
providers: {
openai: {
apiKey: "${OPENAI_API_KEY}",
voice: "nova",
speed: 1,
},
},
});
});
it("ignores prototype-pollution keys in agent TTS overrides", () => {
const cfg = {
messages: {
tts: {
provider: "openai",
providers: {
openai: {
voice: "coral",
},
},
},
},
agents: {
list: [
{
id: "reader",
tts: JSON.parse(
'{"providers":{"openai":{"voice":"nova","__proto__":{"polluted":true}}}}',
),
},
],
},
} as OpenClawConfig;
const resolved = resolveTtsConfig(cfg, "reader");
expect(resolved.rawConfig?.providers?.openai).toEqual({ voice: "nova" });
expect(({} as Record<string, unknown>).polluted).toBeUndefined();
});
});

View File

@@ -62,6 +62,7 @@ const DEFAULT_TIMEOUT_MS = 30_000;
const DEFAULT_TTS_MAX_LENGTH = 1500;
const DEFAULT_TTS_SUMMARIZE = true;
const DEFAULT_MAX_TEXT_LENGTH = 4096;
const BLOCKED_MERGE_KEYS = new Set(["__proto__", "prototype", "constructor"]);
type TtsUserPrefs = {
tts?: {
@@ -240,6 +241,48 @@ function resolveRawProviderConfig(
return asProviderConfig(direct);
}
function isPlainObject(value: unknown): value is Record<string, unknown> {
return Boolean(value) && typeof value === "object" && !Array.isArray(value);
}
function deepMergeDefined(base: unknown, override: unknown): unknown {
if (!isPlainObject(base) || !isPlainObject(override)) {
return override === undefined ? base : override;
}
const result: Record<string, unknown> = { ...base };
for (const [key, value] of Object.entries(override)) {
if (BLOCKED_MERGE_KEYS.has(key) || value === undefined) {
continue;
}
const existing = result[key];
result[key] = key in result ? deepMergeDefined(existing, value) : value;
}
return result;
}
function normalizeAgentConfigId(value: string | undefined | null): string {
return normalizeLowercaseStringOrEmpty(value);
}
function resolveAgentTtsOverride(
cfg: OpenClawConfig,
agentId: string | undefined,
): TtsConfig | undefined {
if (!agentId || !Array.isArray(cfg.agents?.list)) {
return undefined;
}
const normalized = normalizeAgentConfigId(agentId);
const agent = cfg.agents.list.find((entry) => normalizeAgentConfigId(entry.id) === normalized);
return agent?.tts;
}
function resolveEffectiveTtsRawConfig(cfg: OpenClawConfig, agentId?: string): TtsConfig {
const base = cfg.messages?.tts ?? {};
const override = resolveAgentTtsOverride(cfg, agentId);
return deepMergeDefined(base, override ?? {}) as TtsConfig;
}
function resolveLazyProviderConfig(
config: ResolvedTtsConfig,
providerId: string,
@@ -313,8 +356,8 @@ export function getResolvedSpeechProviderConfig(
return resolveLazyProviderConfig(config, canonical, cfg);
}
export function resolveTtsConfig(cfg: OpenClawConfig): ResolvedTtsConfig {
const raw: TtsConfig = cfg.messages?.tts ?? {};
export function resolveTtsConfig(cfg: OpenClawConfig, agentId?: string): ResolvedTtsConfig {
const raw: TtsConfig = resolveEffectiveTtsRawConfig(cfg, agentId);
const providerSource = raw.provider ? "config" : "default";
const timeoutMs = raw.timeoutMs ?? DEFAULT_TIMEOUT_MS;
const auto = resolveConfiguredTtsAutoMode(raw);
@@ -367,11 +410,15 @@ export function resolveTtsAutoMode(params: {
return params.config.auto;
}
function resolveEffectiveTtsAutoState(params: { cfg: OpenClawConfig; sessionAuto?: string }): {
function resolveEffectiveTtsAutoState(params: {
cfg: OpenClawConfig;
sessionAuto?: string;
agentId?: string;
}): {
autoMode: TtsAutoMode;
prefsPath: string;
} {
const raw: TtsConfig = params.cfg.messages?.tts ?? {};
const raw: TtsConfig = resolveEffectiveTtsRawConfig(params.cfg, params.agentId);
const prefsPath = resolveTtsPrefsPathValue(raw.prefsPath);
const sessionAuto = normalizeTtsAutoMode(params.sessionAuto);
if (sessionAuto) {
@@ -387,12 +434,15 @@ function resolveEffectiveTtsAutoState(params: { cfg: OpenClawConfig; sessionAuto
};
}
export function buildTtsSystemPromptHint(cfg: OpenClawConfig): string | undefined {
const { autoMode, prefsPath } = resolveEffectiveTtsAutoState({ cfg });
export function buildTtsSystemPromptHint(
cfg: OpenClawConfig,
agentId?: string,
): string | undefined {
const { autoMode, prefsPath } = resolveEffectiveTtsAutoState({ cfg, agentId });
if (autoMode === "off") {
return undefined;
}
const _config = resolveTtsConfig(cfg);
const _config = resolveTtsConfig(cfg, agentId);
const maxLength = getTtsMaxLength(prefsPath);
const summarize = isSummarizationEnabled(prefsPath) ? "on" : "off";
const autoHint =
@@ -504,11 +554,12 @@ export function resolveExplicitTtsOverrides(params: {
provider?: string;
modelId?: string;
voiceId?: string;
agentId?: string;
}): TtsDirectiveOverrides {
const providerInput = params.provider?.trim();
const modelId = params.modelId?.trim();
const voiceId = params.voiceId?.trim();
const config = resolveTtsConfig(params.cfg);
const config = resolveTtsConfig(params.cfg, params.agentId);
const prefsPath = params.prefsPath ?? resolveTtsPrefsPath(config);
const selectedProvider =
canonicalizeSpeechProviderId(providerInput, params.cfg) ??
@@ -741,6 +792,7 @@ function resolveTtsRequestSetup(params: {
prefsPath?: string;
providerOverride?: TtsProvider;
disableFallback?: boolean;
agentId?: string;
}):
| {
config: ResolvedTtsConfig;
@@ -749,7 +801,7 @@ function resolveTtsRequestSetup(params: {
| {
error: string;
} {
const config = resolveTtsConfig(params.cfg);
const config = resolveTtsConfig(params.cfg, params.agentId);
const prefsPath = params.prefsPath ?? resolveTtsPrefsPath(config);
if (params.text.length > config.maxTextLength) {
return {
@@ -774,6 +826,7 @@ export async function textToSpeech(params: {
overrides?: TtsDirectiveOverrides;
disableFallback?: boolean;
timeoutMs?: number;
agentId?: string;
}): Promise<TtsResult> {
const synthesis = await synthesizeSpeech(params);
if (!synthesis.success || !synthesis.audioBuffer || !synthesis.fileExtension) {
@@ -819,6 +872,7 @@ export async function synthesizeSpeech(params: {
overrides?: TtsDirectiveOverrides;
disableFallback?: boolean;
timeoutMs?: number;
agentId?: string;
}): Promise<TtsSynthesisResult> {
const setup = resolveTtsRequestSetup({
text: params.text,
@@ -826,6 +880,7 @@ export async function synthesizeSpeech(params: {
prefsPath: params.prefsPath,
providerOverride: params.overrides?.provider,
disableFallback: params.disableFallback,
agentId: params.agentId,
});
if ("error" in setup) {
return { success: false, error: setup.error };
@@ -1064,6 +1119,7 @@ export async function maybeApplyTtsToPayload(params: {
kind?: "tool" | "block" | "final";
inboundAudio?: boolean;
ttsAuto?: string;
agentId?: string;
}): Promise<ReplyPayload> {
if (params.payload.isCompactionNotice) {
return params.payload;
@@ -1071,11 +1127,12 @@ export async function maybeApplyTtsToPayload(params: {
const { autoMode, prefsPath } = resolveEffectiveTtsAutoState({
cfg: params.cfg,
sessionAuto: params.ttsAuto,
agentId: params.agentId,
});
if (autoMode === "off") {
return params.payload;
}
const config = resolveTtsConfig(params.cfg);
const config = resolveTtsConfig(params.cfg, params.agentId);
const activeProvider = getTtsProvider(config, prefsPath);
const reply = resolveSendableOutboundReplyParts(params.payload);
@@ -1183,6 +1240,7 @@ export async function maybeApplyTtsToPayload(params: {
prefsPath,
channel: params.channel,
overrides: directives.overrides,
agentId: params.agentId,
});
if (result.success && result.audioPath) {